binary Pattern Matching in ES6 with (pattern, s) as strings - javascript

Given two strings pattern and s. The first string pattern contains only the
symbols 0 and 1, and the second string s contains only lowercase English
letters.
Let's say that pattern matches a substring s[l..r] of s if the following 3
conditions are met:
they have equal length;
for each 0 in pattern the corresponding letter in the substring is a vowel;
for each 1 in pattern the corresponding letter is a consonant.
the task is to calculate the number of substrings of s that match pattern.
Note: In this we define the vowels as a,e,i,o,u, and y. All other
letters are consonants.
I am not challenging anyone here, I have tried different ways but could not achieve. This question was asked in codesignal test assessment recently.

Here is my approach to tackle the problem.
replacing all 0 to a regex matching vowels and 1 to non-vowels from the pattern (after checking the inputs) and using that as regex (with overlapping) on s can help us with the requirements set.
/**
 * Collects every match of `re` in `input`, including overlapping ones.
 *
 * After each hit the scan resumes one position past the START of that hit
 * (rather than past its end), so /aa/ yields three matches in "aaaa".
 *
 * @param {string} input - Text to search.
 * @param {RegExp} re - Pattern to apply. It is cloned, never mutated.
 * @returns {string[]} The matched substrings, ordered by start index.
 */
function matchOverlap(input, re) {
  // Always scan with a private global copy: exec() on a non-global regex
  // would loop forever, and reusing the caller's /g regex would both start
  // from its stale lastIndex and leave that lastIndex modified.
  const flags = re.flags.includes('g') ? re.flags : `${re.flags}g`;
  const scanner = new RegExp(re.source, flags);
  const found = [];
  let m;
  while ((m = scanner.exec(input)) !== null) {
    // Rewind to just after the match start so overlapping hits are seen.
    scanner.lastIndex = m.index + 1;
    found.push(m[0]);
  }
  return found;
}
/**
 * Counts the substrings of `s` that fit the binary `pattern`, where `0`
 * stands for a vowel (a, e, i, o, u, y) and `1` for any other letter.
 *
 * The pattern is translated into a regular expression of character classes
 * and handed to matchOverlap so overlapping windows are counted as well.
 *
 * @param {string} pattern - String made of the characters 0 and 1.
 * @param {string} s - String made of lowercase letters a-z.
 * @returns {number} Number of matching substrings.
 * @throws {Error} If either argument contains a disallowed character.
 */
function algorithm(pattern, s) {
  const VOWELS = 'aeiouy';

  if (pattern.search(/[^01]/) !== -1) {
    throw new Error('only 0 and 1 allowed in pattern');
  }
  if (s.search(/[^a-z]/) !== -1) {
    throw new Error('only a-z allowed in s');
  }

  // One character class per pattern digit.
  const classForDigit = { 0: `[${VOWELS}]`, 1: `[^${VOWELS}]` };
  const regexSource = pattern
    .split('')
    .map((digit) => classForDigit[digit])
    .join('');
  const generatedRegex = new RegExp(regexSource, 'g');
  console.log("GENERATED REGEX:", generatedRegex);

  const matches = matchOverlap(s, generatedRegex);
  console.log("MATCHES:", matches);
  return matches.length;
}
console.log("FINAL RESULT: " + algorithm('101', 'wasistdas'))
// the following throws error as per the requirement
// console.log(algorithm('234234', 'sdfsdf'))
// console.log(algorithm('10101', 'ASDFDSFSD'))
The matchOverlap function used was taken from this answer

You could take check for length first and then check the test with a regular expression for consonants against the pattern and count.
/**
 * Compares `pattern` and `s` position by position and counts the positions
 * that agree: digit 1 paired with a non-vowel, digit 0 paired with a vowel
 * (a, e, i, o, u, y).
 *
 * @param {string} pattern - String of binary digits.
 * @param {string} s - String to compare against; must have the same length.
 * @returns {number|false} Number of agreeing positions, or `false` when the
 *   two strings differ in length.
 */
function getCount(pattern, s) {
  if (pattern.length !== s.length) {
    return false;
  }
  const consonant = /^[^aeiouy]$/;
  return pattern.split('').reduce((agreeing, digit, position) => {
    const consonantFlag = consonant.test(s[position]) ? 1 : 0;
    return consonantFlag === Number(digit) ? agreeing + 1 : agreeing;
  }, 0);
}
console.log(getCount('010', 'ama'));

you should convert the input string to binary format.
/**
 * Maps every character of `source` to a binary digit: '0' for a vowel
 * (a, e, i, o, u, y) and '1' for anything else.
 *
 * @param {string} source - Text to encode.
 * @returns {string} String of 0/1 digits with the same length as `source`.
 */
function convertToBinary(source) {
  const vowels = 'aeiouy';
  let binaryStr = '';
  // `let i` keeps the index local; the undeclared `i` used previously
  // leaked a global and throws in strict mode / ES modules.
  for (let i = 0; i < source.length; i++) {
    binaryStr += vowels.includes(source[i]) ? '0' : '1';
  }
  return binaryStr;
}
/**
 * Tells whether a binary-encoded window is exactly the pattern.
 *
 * @param {string} txt - Encoded window.
 * @param {string} pattern - Pattern to compare with.
 * @returns {boolean} True when both values are strictly equal.
 */
function isMatch(txt, pattern) {
  const identical = txt === pattern;
  return identical;
}
/**
 * Returns every substring of `source` whose vowel/consonant shape equals
 * the binary `pattern` (0 = vowel, 1 = consonant).
 *
 * Uses convertToBinary to encode `source` and isMatch to compare windows.
 *
 * @param {string} source - Text to scan.
 * @param {string} pattern - String of 0/1 digits.
 * @returns {string[]} Matching substrings in order of appearance
 *   (overlapping matches included).
 */
function findMatches(source, pattern) {
  const binaryString = convertToBinary(source);
  const patternLen = pattern.length;
  const result = [];
  // `<=` so the window that ends exactly at the end of the string is also
  // examined; the previous `<` bound silently skipped it.
  for (let i = 0; i + patternLen <= binaryString.length; i++) {
    if (isMatch(binaryString.slice(i, i + patternLen), pattern)) {
      result.push(source.slice(i, i + patternLen));
    }
  }
  return result;
}
var text = 'thisisaresultoffunction'
var pattern = '1011'
console.log(findMatches(text, pattern))
its result
[ "sult", "toff", "func" ]

This is a brute force C# version
// Counts the substrings of `s` that fit `pattern`: every '0' in the pattern
// must line up with a vowel (a, e, i, o, u, y) and every other pattern
// character with a non-vowel. Each start position is checked by brute force.
int binaryPatternMatching(string pattern, string s) {
    const string vowels = "aeiouy";
    int matches = 0;
    int lastStart = s.Length - pattern.Length;

    for (int start = 0; start <= lastStart; start++) {
        bool fits = true;
        // Stop scanning this window at the first position that disagrees.
        for (int offset = 0; fits && offset < pattern.Length; offset++) {
            bool isVowel = vowels.IndexOf(s[start + offset]) >= 0;
            bool wantsVowel = pattern[offset] == '0';
            fits = isVowel == wantsVowel;
        }
        if (fits) {
            matches++;
        }
    }
    return matches;
}
can be optimized

got it! also look at this question
the regex lib is more powerful than re
import regex as re
def pattern_finder(pattern,source):
    """Count the substrings of ``source`` that fit the binary ``pattern``.

    A ``0`` in ``pattern`` stands for a vowel (a, e, i, o, u, y); any other
    pattern character stands for a non-vowel.

    ``re`` here is the third-party ``regex`` module (imported as ``re``); its
    ``overlapped=True`` option is what lets overlapping matches be counted.

    :param pattern: string of ``0``/``1`` characters.
    :param source: text to search.
    :returns: number of (possibly overlapping) matching substrings.
    """
    # Single source of truth for the vowel set; the previous local list was
    # misspelled and never used while the class was hard-coded twice below.
    vowels = 'aeiouy'
    vowel_class = '[' + vowels + ']'
    consonant_class = '[^' + vowels + ']'
    # one character class per pattern digit
    reg_ex = "".join(vowel_class if num == '0' else consonant_class for num in pattern)
    # finding overlapped patterns
    matches = re.findall(reg_ex, source, overlapped=True)
    return len(matches)

Related

How to loop through and segment a string into an array using a RegExp?

I use the custom JS code step heavily in Zapier. When arrays are imported into this step, Zapier converts them into a literal string, i.e.:
['BigBoatBob, XL-1', 'LittleBoatMike, M-2', 'SunkBoatCheney, XS-9']
turns into:
'BigBoatBob, XL-1,LittleBoatMike, M-2,SunkBoatCheney, XS-9'
I've created a function to parse out the array items (accounting for textual commas) but it seems very, VERY sloppy. Anyone have any suggestions to refine / shorten/ make look more professional? Thanks for helping me to further my abilities :)
var array = splitArray('BigBoatBob, XL-1,LittleBoatMike, M-2,SunkBoatCheney, XS-9');
/**
 * Splits a flattened list back into its items. A comma separates two items
 * unless it is immediately followed by a space, in which case it is treated
 * as text belonging to the item.
 *
 * @param {string} x - The flattened list.
 * @returns {string[]} The items, each with one leading comma removed if
 *   the chunk starts with one.
 */
function splitArray(x) {
  // Chunk boundaries: start of string, every separator comma, end of string.
  const cuts = [0];
  for (let k = 0; k < x.length; k++) {
    if (x[k] === ',' && x[k + 1] !== ' ') {
      cuts.push(k);
    }
  }
  cuts.push(x.length);

  const items = [];
  for (let k = 0; k + 1 < cuts.length; k++) {
    const chunk = x.slice(cuts[k], cuts[k + 1]);
    items.push(chunk.startsWith(',') ? chunk.slice(1) : chunk);
  }
  return items;
}
console.log(array);
If you can rely on there being a space after each comma within the strings, and on there not being one after the commas between the strings, you can use split with the regular expression /,(?! )/ which says "a comma not followed by a space":
const str = 'BigBoatBob, XL-1,LittleBoatMike, M-2,SunkBoatCheney, XS-9';
const array = str.split(/,(?! )/);
console.log(array);
If you can't rely on that but you can rely on the format of the XL-1 and such, you can do it with an exec loop (or with an up-to-date JavaScript engine or a polyfill, with matchAll):
const str = 'BigBoatBob, XL-1,LittleBoatMike, M-2,SunkBoatCheney, XS-9';
const array = [];
const rex = /(.*?,\s*[A-Z]{1,2}-\d)\s*,?/g;
let match;
while ((match = rex.exec(str)) !== null) {
array.push(match[1]);
}
console.log(array);
The regular expression /(.*?,\s*[A-Z]{1,2}-\d)\s*,?/g means:
.*? any number of any character, non-greedy
, a comma
\s* zero or more whitespace characters
[A-Z]{1,2} one or two letters from the range A-Z
- a dash
\d a single digit (use \d+ if there can be more than one)
All of the above is in a capture group
,? an optional comma following it
I would use Array.reduce:
// Re-assemble the items of a comma-flattened list in which every item
// contains exactly one comma of its own: after split(','), even positions
// hold the first half of an item and odd positions hold the second half.
const s = 'BigBoatBob, XL-1,LittleBoatMike, M-2,SunkBoatCheney, XS-9';
const result = s.split(',').reduce((acc, curr, i) => {
  if (i % 2 === 0) {
    acc.push(curr);
  } else {
    // Put back the comma that split(',') removed; without it the item
    // came out as 'BigBoatBob XL-1'.
    acc[acc.length - 1] += `,${curr}`;
  }
  return acc;
}, []);
console.log(result);
Shorthand,
/**
 * Splits `str` on commas and glues the pieces back together in pairs, so a
 * list whose items each contain one comma is recovered item by item.
 *
 * @param {string} str - Comma-flattened list.
 * @returns {string[]} Pieces at even positions start a new item; pieces at
 *   odd positions are appended to the previous item after a ",".
 */
function splitIt(str) {
  const items = [];
  str.split(',').forEach((piece, position) => {
    if (position % 2 === 0) {
      items.push(piece);
    } else {
      items[items.length - 1] = items[items.length - 1] + "," + piece;
    }
  });
  return items;
}
// Example
let str = `BigBoatBob, XL-1,LittleBoatMike, M-2,SunkBoatCheney, XS-9`;
console.log(splitIt(str));

Regex for same word shows first false and then true [duplicate]

This question already has answers here:
Why does a RegExp with global flag give wrong results?
(7 answers)
Closed 6 years ago.
I have problem with testing string with regex.
After iterations "aab", "aab", "aba".. here comes the problem when testing string "baa" first time is ok, result is false because regex test is setup to check is there repeating letter inside string, but when testing again "baa" result is now true. Why is this happening?
Here is the code:
//Here are function for swaping letters
/**
 * Returns a copy of the string in which the UTF-16 units at `index` and
 * `index + 1` have traded places.
 *
 * @param {number} index - Position of the first of the two units.
 * @returns {string} The string with the two neighbours swapped.
 */
String.prototype.swapLetters = function (index) {
  const letters = this.split("");
  [letters[index], letters[index + 1]] = [letters[index + 1], letters[index]];
  return letters.join("");
};
/**
 * Applies str.length! successive adjacent-letter swaps to `str` and logs
 * each arrangement for which `reg.test` reports no repeated neighbouring
 * character. Requires String.prototype.swapLetters.
 *
 * Nothing is returned; `permutations` is incremented but never read.
 *
 * @param {string} str - Starting arrangement of letters.
 */
function permAlone(str) {
// Computes num! recursively; used as the number of loop iterations.
function returnFactorial(num){
if(num===0){
return 1;
} else {
return returnFactorial(num-1)*num;
}
}
var combs = returnFactorial(str.length);
// c is the swap position; it cycles through 0 .. str.length-2.
var c = 0;
var permutations = 0;
// NOTE: because of the `g` flag this one RegExp object keeps `lastIndex`
// between test() calls. After a successful test() the next call starts
// searching from that index instead of 0, so the same string can yield
// a different result the second time it is tested.
var reg = new RegExp(/(.)\1+/g);
for (var i = 0; i < combs; i++) {
if(c>=str.length-1){
c = 0;
}
str = str.swapLetters(c);
if(!reg.test(str)){
permutations++;
console.log(str);
}
c++;
}
}
permAlone('aab');
Firstly, the condition you have if(!reg.test(str)){} should not have a !(not) in it if you intend to identify a regex match. I am not sure if that is what you wanted, though.
Secondly, I removed the 'global' match flag 'g' so that the regex is basically "reset" to start matching from the beginning of the text before each execution. This is because a single RegExp object when executed multiple times, starts to match the text each time from the last match index. This can give you a detailed explanation for this. Why RegExp with global flag in Javascript give wrong results?.
Try this.
//Here are function for swaping letters
/**
 * Swaps the UTF-16 unit at `index` with its right-hand neighbour and
 * returns the resulting string; the receiver itself is not modified.
 *
 * @param {number} index - Position of the left unit of the pair.
 * @returns {string} New string with the pair exchanged.
 */
String.prototype.swapLetters = function (index) {
  const chars = this.split("");
  const left = chars[index];
  const right = chars[index + 1];
  chars[index] = right;
  chars[index + 1] = left;
  return chars.join("");
};
/**
 * Applies str.length! successive adjacent-letter swaps to `str`, logging
 * every arrangement visited and flagging those that contain a repeated
 * neighbouring word character. Requires String.prototype.swapLetters.
 *
 * @param {string} str - Starting arrangement of letters.
 * @returns {number} How many of the visited arrangements contained repeated
 *   neighbouring letters (previously this counter was computed and dropped).
 */
function permAlone(str) {
  // Number of iterations: the count of total combinations, str.length!.
  const factorial = (num) => (num === 0 ? 1 : factorial(num - 1) * num);
  const combs = factorial(str.length);

  // No `g` flag on purpose: a global regex remembers lastIndex between
  // test() calls and would give inconsistent answers across iterations.
  const reg = /(\w)\1+/;

  let current = str;
  let swapAt = 0;
  let permutations = 0;
  for (let i = 0; i < combs; i++) {
    if (swapAt >= current.length - 1) {
      swapAt = 0;
    }
    current = current.swapLetters(swapAt);
    console.log("case : " + current);
    if (reg.test(current)) {
      permutations++;
      console.log(current + " has repeated letters");
    }
    swapAt++;
  }
  return permutations;
}
permAlone('aab');

Reducing duplicate characters in a string to a given minimum

I was messing around with the first question here: Reduce duplicate characters to a desired minimum and am looking for more elegant answers than what I came up with. It passes the test but curious to see other solutions. The sample tests are:
reduceString('aaaabbbb', 2) 'aabb'
reduceString('xaaabbbb', 2) 'xaabb'
reduceString('aaaabbbb', 1) 'ab'
reduceString('aaxxxaabbbb', 2) 'aaxxaabb'
and my solution (that passes these tests):
/**
 * Shortens every run of identical consecutive characters in `str` so that
 * no run is longer than `amount`.
 *
 * @param {string} str - Input text.
 * @param {number} amount - Maximum run length to keep (expected >= 1).
 * @returns {string} The reduced string.
 */
// Declared explicitly: assigning to an undeclared identifier creates an
// implicit global and throws a ReferenceError in strict mode / ES modules.
var reduceString = function (str, amount) {
  let count = 0;
  let result = '';
  for (let i = 0; i < str.length; i++) {
    if (str[i] === str[i + 1]) {
      // Still inside a run: keep the character only while under the cap.
      count++;
      if (count < amount) {
        result += str[i];
      }
    } else {
      // Last character of a run is always kept and resets the counter.
      count = 0;
      result += str[i];
    }
  }
  return result;
};
Just use regular expressions.
/**
 * Caps runs of identical characters at `amount` by deleting every character
 * that is still followed by at least `amount` copies of itself.
 *
 * @param {string} str - Input text.
 * @param {number} amount - Maximum run length to keep.
 * @returns {string} The reduced string.
 */
var reduceString = function (str, amount) {
  // (.) captures a character; the lookahead requires `amount` more of it.
  const surplus = new RegExp(`(.)(?=\\1{${amount}})`, "g");
  return str.replace(surplus, "");
};
I guess my best solution would be like
var str = "axxxaabbbbcaaxxxaab";
/**
 * Caps every run of identical word characters in `s` at `n` characters.
 *
 * @param {string} s - Input text.
 * @param {number} n - Maximum run length to keep.
 * @returns {string} The reduced string.
 */
var redStr = (s, n) =>
  // A replacer function is needed so that runs SHORTER than n are left
  // alone; replacing with "$1".repeat(n) stretched e.g. "aa" to "aaa".
  s.replace(/(\w)\1+/g, (run, ch) => ch.repeat(Math.min(run.length, n)));
console.log(redStr(str, 2));
I tried to make it as short as possible:
/**
 * Caps every run of identical consecutive characters in `str` at `amount`.
 *
 * @param {string} str - Input text.
 * @param {number} amount - Maximum run length to keep.
 * @returns {string} The reduced string.
 */
// Declared explicitly: a bare assignment creates an implicit global and
// throws a ReferenceError in strict mode / ES modules.
var reduceString = function (str, amount) {
  let finalString = '';
  let previous = '';
  let counter = 0;
  str.split('').forEach((ch) => {
    // Position of `ch` inside its run, starting at 1.
    counter = ch === previous ? counter + 1 : 1;
    previous = ch;
    if (counter <= amount) {
      finalString += ch;
    }
  });
  return finalString;
};
You can use reg expression instead. tested in javascript.
how it works:
(.) //match any character
\1 //if it follow by the same character
+{2 //more than 1 times
/g //global
$1 //is 1 time by $1$1 is 2 times
reduceString('aaaabbbb', 2)
reduceString('xaaabbbb', 2)
reduceString('aaaabbbb', 1)
reduceString('aaxxxaabbbb', 2)
/**
 * Caps every run of identical characters in `txt` at `num` characters,
 * logs the result and returns it.
 *
 * @param {string} txt - Input text.
 * @param {number} num - Maximum run length to keep (non-negative integer).
 * @returns {string} The reduced string.
 */
function reduceString(txt, num)
{
  // (.)\1{num,} matches a character followed by at least `num` copies of
  // itself, i.e. a run longer than `num`. The bound used to be a fixed 2,
  // which only gave correct results when num happened to be 2.
  const longRun = new RegExp(`(.)\\1{${num},}`, 'g');
  const reduced = txt.replace(longRun, '$1'.repeat(num));
  console.log(reduced);
  return reduced;
}
With regex:
/**
 * Caps every run of identical characters in `str` at `amount`, handling one
 * distinct character at a time.
 *
 * @param {string} str - Input text.
 * @param {number} amount - Maximum run length to keep.
 * @returns {string} The reduced string.
 */
var reduceString = function (str, amount) {
  let reduced = str;
  for (const c of new Set(str)) {
    // Escape regex metacharacters so that e.g. "." or "*" in the input is
    // matched literally; the group keeps a surrogate pair quantified as one.
    const literal = c.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const rex = new RegExp(`(?:${literal}){${amount},}`, 'g');
    // Function replacer: a "$" in a replacement STRING would be treated as
    // a special pattern instead of literal text.
    reduced = reduced.replace(rex, () => string(c, amount));
  }
  return reduced;
};
/**
 * Builds a string consisting of `c` repeated `amount` times.
 *
 * @param {string} c - Text to repeat.
 * @param {number} amount - Number of repetitions.
 * @returns {string} The repeated text ('' when amount <= 0).
 */
var string = function (c, amount) {
  let s = "";
  for (let i = 0; i < amount; i++) {
    s += c;
  }
  return s;
};
The regex solutions above are much better, but here is my accepted solution with reduce:
make an array from string via spread operator
Check the previous item
find how many times char is repeated in result string
otherwise concat result string with the current char
Don`t forget to use the second argument as the initial value, and return for each cases
/**
 * Caps every run of identical consecutive characters in `str` at `amount`,
 * building the result with Array.prototype.reduce.
 *
 * @param {string} str - Input text.
 * @param {number} amount - Maximum run length to keep.
 * @returns {string} The reduced string.
 */
// Declared explicitly: a bare assignment creates an implicit global and
// throws a ReferenceError in strict mode / ES modules.
var reduceString = function (str, amount) {
  let previous;
  let runLength = 0;
  return [...str].reduce((res, cur) => {
    // Track the length of the CURRENT run only. Counting every earlier
    // occurrence of `cur` in the result let later runs grow past the cap
    // (e.g. 'aaxaaa' with amount 2 came back unchanged).
    runLength = cur === previous ? runLength + 1 : 1;
    previous = cur;
    return runLength <= amount ? res + cur : res;
  }, "");
};

Return the first word with the greatest number of repeated letters

This is a question from coderbyte’s easy set. Many people asked about it already, but I’m really curious about what’s wrong with my particular solution (I know it’s a pretty dumb and inefficient one..)
Original question:
Have the function LetterCountI(str) take the str parameter being passed and return the first word with the greatest number of repeated letters. For example: "Today, is the greatest day ever!" should return greatest because it has 2 e's (and 2 t's) and it comes before ever which also has 2 e's. If there are no words with repeating letters return -1. Words will be separated by spaces.
My solution works most of the time, but it seems the last word of the input isn’t taken into account by my code. For example, for “a bb ccc”, “bb” will be returned instead of “ccc”. The funny thing is that if the string contains only one word, the result is correct. For example, “ccc” returns “ccc”.
Please tell me where I was wrong. Thank you in advance!
/**
 * Intended to return the first word of `str` with the greatest number of
 * repeated letters, or "-1" when no word has one. Words are separated by
 * single spaces.
 *
 * As written, only the first word is ever examined (see the NOTE on the
 * `return` below).
 *
 * @param {string} str - Sentence to inspect.
 * @returns {string} The selected word, or the string "-1".
 */
function LetterCountI(str) {
// NOTE: the lower-cased copy is discarded; `str` itself is unchanged.
str.toLowerCase();
var arr = str.split(" ");
var count = 0;
var word = "-1";
for (var i = 0; i < arr.length; i++) {
for (var a = 0; a < arr[i].length; a++) {
// countNew = how many later characters of the word equal character a.
var countNew = 0;
for (var b = a + 1; b < arr[i].length; b++) {
if(arr[i][a] === arr[i][b])
countNew += 1;
}
// Strict '>' keeps the earliest word when several tie.
if (countNew > count) {
count = countNew;
word = arr[i];
}
}
// NOTE: this return sits inside the outer `for (i ...)` loop, so the
// function exits after the first word has been processed.
return word;
}
}
Please find below the workable version of your code:
/**
 * Finds the first word whose most-repeated letter repeats more often than
 * in any other word. Comparison is case-insensitive; words are separated
 * by single spaces.
 *
 * @param {string} str - Sentence to inspect.
 * @returns {string} The winning word in lower case, or "-1" when no word
 *   contains a repeated letter.
 */
function LetterCountI(str) {
  const words = str.toLowerCase().split(" ");
  let bestRepeats = 0;
  let bestWord = "-1";

  for (const candidate of words) {
    const letters = candidate.split("");
    letters.forEach((letter, position) => {
      // How many times this letter shows up again later in the word.
      const repeats = letters
        .slice(position + 1)
        .filter((other) => other === letter).length;
      // Strictly greater, so the earliest word wins a tie.
      if (repeats > bestRepeats) {
        bestRepeats = repeats;
        bestWord = candidate;
      }
    });
  }
  return bestWord;
}
Here is the Java code soln for your problem.
You have returned your answer incorrectly. You should have returned word/Answer/res out of "for loops".
Check my code here.
public static String StringChallenge( String str) {
String[] arr = str.split(" ");
int count = 0; String res = "-1";
for (int i = 0; i < arr.length ; i++) {
for (int j = 0; j < arr[i].length() ; j++) {
int counter = 0;
for (int k = j + 1; k < arr[i].length() ; k++) {
if(arr[i].charAt(j) === arr[i].charAt(k) )
counter ++;
}
if (counter > count) {
count = counter; res = arr[i];
}
}
return res;
}
}
I think the problem is that you're placing the return statement inside your outermost loop. It should be inside your inner loop.
So you have to place the return statement within the inner loop.
Correct use of return
if (countNew > count) {
count = countNew;
word = arr[i];
}
return word;
}
}
}
You need to move the return word; statement outside of the loop to fix your version.
I also put together another take on the algorithm that relies on a few built in javascript methods like Array.map and Math.max, just for reference. I ran a few tests and it seems to be a few milliseconds faster, but not by much.
/**
 * Finds the first word whose most frequent letter occurs more often than in
 * any other word. Comparison is case-insensitive; words are separated by
 * single spaces.
 *
 * @param {string} str - Sentence to inspect.
 * @returns {string} The winning word in lower case, or '-1' when no word
 *   contains a letter that occurs at least twice.
 */
function LetterCountI(str) {
  // Start at 1: a letter must occur at least twice to count as repeated.
  // Starting at 0 made any word win, even one with no repeats at all.
  let maxCount = 1;
  let word = '-1';

  // Split the string into words on spaces and tally letters per word.
  for (const currentWord of str.toLowerCase().split(' ')) {
    const tally = new Map();
    for (const letter of currentWord.split('')) {
      tally.set(letter, (tally.get(letter) || 0) + 1);
    }
    // Highest occurrence count in this word (0 for an empty word).
    const currentMaxRepeatedCount = Math.max(0, ...tally.values());
    // Strictly greater, so the earliest word wins a tie.
    if (currentMaxRepeatedCount > maxCount) {
      maxCount = currentMaxRepeatedCount;
      word = currentWord;
    }
  }
  return word;
}
Yet another solution in a more functional programming style:
JavaScript
/**
 * Returns the first word that contains the largest number of DISTINCT
 * letters occurring more than once. Case is left untouched; words are
 * separated by single spaces.
 *
 * @param {string} str - Sentence to inspect.
 * @returns {string|number} The winning word, or the number -1 when no word
 *   has a repeated letter.
 */
function LetterCountI(str) {
  let bestWord = -1;
  let bestCount = 0;

  for (const word of str.split(' ')) {
    // Map of letters to number of occurrences in the word.
    const occurrences = new Map();
    for (const letter of word.split('')) {
      occurrences.set(letter, occurrences.has(letter) ? occurrences.get(letter) + 1 : 1);
    }
    // Number of distinct letters seen more than once.
    let repeatedLetters = 0;
    for (const times of occurrences.values()) {
      if (times > 1) {
        repeatedLetters += 1;
      }
    }
    // Strictly greater, so the first word with the maximum is kept.
    if (repeatedLetters > bestCount) {
      bestCount = repeatedLetters;
      bestWord = word;
    }
  }
  return bestWord;
}
console.log(LetterCountI('Today, is the greatest day ever!')); // => greatest
Plunker
http://plnkr.co/edit/BRywasUkQ3KYdhRpBfU2?p=preview
I recommend use regular expression: /a+/g to find a list of letter with a key word a.
My example :
var str = aa yyyyy bb cccc cc dd bbb;
First, find the list of distinct letters:
>>> ["a", "y", "b", "c", "d"]
Use regular expression for each word in list of different word :
var word = lstDiffWord[1];
var
wordcount = str.match(new RegExp(word+'+','g'));
console.log(wordcount);
>>>>["yyyyy"]
Here is full example: http://jsfiddle.net/sxro0sLq/4/

JavaScript strings outside of the BMP

BMP being Basic Multilingual Plane
According to JavaScript: the Good Parts:
JavaScript was built at a time when Unicode was a 16-bit character set, so all characters in JavaScript are 16 bits wide.
This leads me to believe that JavaScript uses UCS-2 (not UTF-16!) and can only handle characters up to U+FFFF.
Further investigation confirms this:
> String.fromCharCode(0x20001);
The fromCharCode method seems to only use the lowest 16 bits when returning the Unicode character. Trying to get U+20001 (CJK unified ideograph 20001) instead returns U+0001.
Question: is it at all possible to handle post-BMP characters in JavaScript?
2011-07-31: slide twelve from Unicode Support Shootout: The Good, The Bad, & the (mostly) Ugly covers issues related to this quite well:
Depends what you mean by ‘support’. You can certainly put non-UCS-2 characters in a JS string using surrogates, and browsers will display them if they can.
But, each item in a JS string is a separate UTF-16 code unit. There is no language-level support for handling full characters: all the standard String members (length, split, slice etc) all deal with code units not characters, so will quite happily split surrogate pairs or hold invalid surrogate sequences.
If you want surrogate-aware methods, I'm afraid you're going to have to start writing them yourself! For example:
/**
 * Length of the string in code points: every well-formed surrogate pair
 * counts as one instead of two.
 *
 * @returns {number} UTF-16 length minus the number of surrogate pairs.
 */
String.prototype.getCodePointLength = function () {
  const pairs = this.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
  const pairCount = pairs === null ? 0 : pairs.length;
  return this.length - pairCount;
};
/**
 * Builds a string from code points, encoding every value of 0x10000 or
 * above as a high/low surrogate pair. Assigned unconditionally, so it
 * replaces any existing String.fromCodePoint.
 *
 * @param {...number} codePoints - Code points to encode, in order.
 * @returns {string} The resulting string.
 */
String.fromCodePoint = function (...codePoints) {
  const units = [];
  for (const point of codePoints) {
    const offset = point - 0x10000;
    if (offset >= 0) {
      units.push(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
    } else {
      units.push(point);
    }
  }
  return String.fromCharCode.apply(null, units);
};
I came to the same conclusion as bobince. If you want to work with strings containing unicode characters outside of the BMP, you have to reimplement javascript's String methods. This is because javascript counts characters as each 16-bit code value. Symbols outside of the BMP need two code values to be represented. You therefore run into a case where some symbols count as two characters and some count only as one.
I've reimplemented the following methods to treat each unicode code point as a single character: .length, .charCodeAt, .fromCharCode, .charAt, .indexOf, .lastIndexOf, .splice, and .split.
You can check it out on jsfiddle: http://jsfiddle.net/Y89Du/
Here's the code without comments. I tested it, but it may still have errors. Comments are welcome.
if (!String.prototype.ucLength) {
  /**
   * Length of the string in code points: each well-formed surrogate pair
   * counts once.
   *
   * @returns {number} Number of code points.
   */
  String.prototype.ucLength = function () {
    // this solution was adapted from
    // http://stackoverflow.com/questions/3744721/javascript-strings-outside-of-the-bmp
    // Collapse each surrogate pair to a single placeholder unit, then
    // measure what is left.
    const collapsed = this.replace(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g, '_');
    return collapsed.length;
  };
}
// Installed only when no String.prototype.codePointAt exists yet.
// NOTE(review): ucPos below is a position counted in code points. A
// built-in codePointAt, where present, is indexed by UTF-16 code unit
// instead, so callers get different indexing depending on whether this
// guard is taken. Confirm what callers expect.
if (!String.prototype.codePointAt) {
/**
 * Returns the code point found at code-point position `ucPos`.
 *
 * @param {number} ucPos - Zero-based position counted in code points;
 *   values for which isNaN() is true are treated as 0.
 * @returns {number|null} The code point, or null when `ucPos` is not
 *   reached before the end of the string.
 */
String.prototype.codePointAt = function (ucPos) {
if (isNaN(ucPos)){
ucPos = 0;
}
var str = String(this);
var codePoint = null;
var pairFound = false;
// ucIndex counts code points visited; i walks UTF-16 code units.
var ucIndex = -1;
var i = 0;
while (i < str.length){
ucIndex += 1;
var code = str.charCodeAt(i);
var next = str.charCodeAt(i + 1);
// True when units i and i+1 form a high+low surrogate pair.
pairFound = (0xD800 <= code && code <= 0xDBFF && 0xDC00 <= next && next <= 0xDFFF);
if (ucIndex == ucPos){
// Combine the pair into one code point, or use the lone unit as-is.
codePoint = pairFound ? ((code - 0xD800) * 0x400) + (next - 0xDC00) + 0x10000 : code;
break;
} else{
// Step over both units of a pair so it counts as one position.
i += pairFound ? 2 : 1;
}
}
return codePoint;
};
}
// Installed only when no String.fromCodePoint exists yet.
if (!String.fromCodePoint) {
  /**
   * Builds a string from the given code points; values above 0xFFFF are
   * encoded as a high/low surrogate pair.
   *
   * @returns {string} Concatenation of the encoded arguments.
   */
  String.fromCodePoint = function () {
    return Array.prototype.map.call(arguments, function (codePoint) {
      if (codePoint > 0xFFFF) {
        var offset = codePoint - 0x10000;
        return String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
      }
      return String.fromCharCode(codePoint);
    }).join("");
  };
}
if (!String.prototype.ucCharAt) {
  /**
   * Returns the symbol at code-point position `ucIndex`, treating each
   * surrogate pair as a single symbol.
   *
   * @param {number} ucIndex - Zero-based position counted in code points;
   *   values for which isNaN() is true are treated as 0.
   * @returns {string} The symbol, or "" when the position is out of range.
   */
  String.prototype.ucCharAt = function (ucIndex) {
    // Split by code point here instead of calling codePointAt(): the
    // built-in codePointAt is indexed by UTF-16 code unit, so passing it a
    // code-point position returned the wrong symbol (half a pair) for any
    // position after a character outside the BMP.
    const symbols = [...String(this)];
    const position = isNaN(ucIndex) ? 0 : Math.trunc(ucIndex);
    const symbol = symbols[position];
    return symbol === undefined ? "" : symbol;
  };
}
if (!String.prototype.ucIndexOf) {
  /**
   * Code-point-aware indexOf: finds the first occurrence of `searchStr` at
   * or after `ucStart`, with positions counted in code points.
   *
   * @param {*} searchStr - Value to look for (converted to a string).
   * @param {number} [ucStart=0] - Code-point position to start from;
   *   NaN-like and negative values are treated as 0.
   * @returns {number} Code-point position of the match, or -1. As with
   *   String.prototype.indexOf, an empty search string matches at the
   *   start position.
   */
  String.prototype.ucIndexOf = function (searchStr, ucStart) {
    // Split both strings into code points once and compare in place. This
    // avoids re-slicing the string for every candidate position and does
    // not depend on other helper methods being installed.
    const haystack = [...String(this)];
    const needle = [...String(searchStr)];
    const start = isNaN(ucStart) || ucStart < 0 ? 0 : Math.trunc(ucStart);

    for (let i = start; i + needle.length <= haystack.length; i++) {
      if (needle.every((symbol, offset) => haystack[i + offset] === symbol)) {
        return i;
      }
    }
    return -1;
  };
}
if (!String.prototype.ucLastIndexOf) {
  /**
   * Code-point-aware lastIndexOf: finds the last occurrence of `searchStr`
   * that starts at or before `ucStart`, with positions counted in code
   * points.
   *
   * @param {*} searchStr - Value to look for (converted to a string).
   * @param {number} [ucStart] - Highest code-point position at which a
   *   match may start; NaN-like or too-large values mean "last symbol".
   *   A negative value yields -1.
   * @returns {number} Code-point position of the match, or -1.
   */
  String.prototype.ucLastIndexOf = function (searchStr, ucStart) {
    // Split both strings into code points once and compare in place, so
    // the result does not depend on other helper methods being installed.
    const haystack = [...String(this)];
    const needle = [...String(searchStr)];
    const lastSymbol = haystack.length - 1;

    let start = isNaN(ucStart) || ucStart >= haystack.length ? lastSymbol : Math.trunc(ucStart);
    // A match cannot start where the needle would run past the end.
    start = Math.min(start, haystack.length - needle.length);

    for (let i = start; i >= 0; i--) {
      if (needle.every((symbol, offset) => haystack[i + offset] === symbol)) {
        return i;
      }
    }
    return -1;
  };
}
if (!String.prototype.ucSlice) {
  /**
   * Code-point-aware slice: like String.prototype.slice, but `ucStart` and
   * `ucStop` are counted in code points, so surrogate pairs are never cut
   * in half.
   *
   * @param {number} [ucStart=0] - First position to include; negative
   *   values count from the end.
   * @param {number} [ucStop] - Position to stop before; negative values
   *   count from the end. Defaults to the end of the string.
   * @returns {string} The selected symbols joined together.
   */
  String.prototype.ucSlice = function (ucStart, ucStop) {
    // Array.prototype.slice already implements the clamping, negative
    // index and default rules of slice(). The hand-written version
    // defaulted the stop to length - 1, which dropped the last symbol, and
    // did not clamp a stop beyond the end.
    return [...String(this)].slice(ucStart, ucStop).join("");
  };
}
if (!String.prototype.ucSplit) {
  /**
   * Code-point-aware split. With an empty-string delimiter the string is
   * broken into whole symbols (surrogate pairs stay together); any other
   * delimiter is passed to String.prototype.split unchanged.
   *
   * @param {string|RegExp} delimeter - Separator.
   * @param {number} [limit] - Maximum number of pieces to return.
   * @returns {string[]} The pieces.
   */
  String.prototype.ucSplit = function (delimeter, limit) {
    const str = String(this);
    if (delimeter === '') {
      const symbols = [...str];
      // Apply the limit only when one was given: slice(0, 0 + undefined)
      // is slice(0, NaN), which returned an empty array for every call
      // made without a limit.
      return limit === undefined ? symbols : symbols.slice(0, limit >>> 0);
    }
    return str.split(delimeter, limit);
  };
}
More recent JavaScript engines have String.fromCodePoint.
const ideograph = String.fromCodePoint( 0x20001 ); // outside the BMP
Also a code-point iterator, which gets you the code-point length.
/**
 * Counts the values produced by iterating `str`; for a string that is one
 * per code point.
 *
 * @param {string} str - Text to measure.
 * @returns {number} Number of iteration steps.
 */
function countCodePoints( str )
{
    let total = 0;
    for( const symbol of str ) total += 1;
    return total;
}
console.log( ideograph.length ); // gives '2'
console.log( countCodePoints(ideograph) ); // '1'
Yes, you can. Although support to non-BMP characters directly in source documents is optional according to the ECMAScript standard, modern browsers let you use them. Naturally, the document encoding must be properly declared, and for most practical purposes you would need to use the UTF-8 encoding. Moreover, you need an editor that can handle UTF-8, and you need some input method(s); see e.g. my Full Unicode Input utility.
Using suitable tools and settings, you can write var foo = '𠀁'.
The non-BMP characters will be internally represented as surrogate pairs, so each non-BMP character counts as 2 in the string length.
Using for (c of this) instruction, one can make various computations on a string that contains non-BMP characters. For instance, to compute the string length, and to get the nth character of the string:
/**
 * Length of the string as counted by its iterator: one per code point, so
 * a surrogate pair counts once.
 *
 * @returns {number} Number of symbols.
 */
String.prototype.magicLength = function()
{
    // Spreading walks the same iterator a for...of loop would.
    const symbols = [...this];
    return symbols.length;
}
/**
 * Returns the n-th symbol of the string, counting one per code point.
 *
 * @param {number} n - Zero-based position; compared with loose equality,
 *   so a numeric string such as "2" is accepted as well.
 * @returns {string} The symbol, or "" when no position equals `n`.
 */
String.prototype.magicCharAt = function(n)
{
    const symbols = [...this];
    // Loose == on purpose, to keep accepting numeric strings.
    const hit = symbols.findIndex((symbol, position) => position == n);
    return hit === -1 ? "" : symbols[hit] + "";
}
This old topic has now a simple solution in ES6:
Split characters into an array
simple version
[..."😴😄😃⛔🎠🚓🚇"] // ["😴", "😄", "😃", "⛔", "🎠", "🚓", "🚇"]
Then having each one separated you can handle them easily for most common cases.
Credit: DownGoat
Full solution
To handle composite emojis such as the one in the comment, one can search for the joiner character (ZERO WIDTH JOINER, U+200D, char code 8205) and merge the pieces it connects back together. Here is how:
/**
 * Splits a string into displayed symbols: one entry per code point, with
 * code points linked by a ZERO WIDTH JOINER (U+200D, char code 8205)
 * merged back into a single entry.
 *
 * @param {string} str - Text to split.
 * @returns {string[]} The symbols in order.
 */
function splitJoinedSymbols(str) {
  const ZERO_WIDTH_JOINER = 0x200D;
  const symbols = [...str];
  // Walk backwards so splice() never shifts entries that are still to be
  // visited. Index 0 is skipped because a joiner there has nothing on its
  // left to attach to. (The previous loop header had its condition and
  // update expressions swapped and used an undeclared `i`.)
  for (let i = symbols.length - 2; i > 0; i--) {
    if (symbols[i].charCodeAt(0) === ZERO_WIDTH_JOINER) {
      // Remove the joiner and the part after it, then glue both onto the
      // part before it. `next` may be missing after an earlier merge.
      const [joiner, next = ''] = symbols.splice(i, 2);
      symbols[i - 1] += joiner + next;
    }
  }
  return symbols;
}
let myStr = "👩‍👩‍👧‍👧😃𝌆"
let arr = splitJoinedSymbols(myStr)
console.log(arr.length) //3
Haven't found a case where this doesn't work. Comment if you do.
To conclude
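Char code 8205 (U+200D, ZERO WIDTH JOINER) is how Unicode glues several emoji into one displayed symbol; in a JavaScript string it is just an ordinary UTF-16 code unit sitting between the parts, which is why the parts have to be merged back together by hand.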

Categories

Resources