I am new to js.
can you tell me how to print like this * "a" -> "a1" * "aabbbaa" -> "a2b3a2"
i tried with hash map but test cases failing.
providing my code below.
i am not good in hash map.
can you tell me how to solve with hash map so that in future I can fix it my self.
not sure what data structure to use for this one.
providing my code below.
const _ = require("underscore");
/**
 * Run-length encodes a string: every run of identical consecutive characters
 * becomes the character followed by the length of that run.
 *
 * A per-character count map cannot solve this, because it merges runs that are
 * separated by other characters ("aabbbaa" would become "a4b3"). Only the
 * current run has to be tracked.
 *
 * @param {string} input - Text to encode.
 * @returns {string} Encoded text, e.g. "aabbbaa" -> "a2b3a2"; "" for "".
 */
const rle = (input) => {
  let encoded = "";
  let runStart = 0;
  for (let i = 1; i <= input.length; i++) {
    // A run ends at the end of the input or where the character changes.
    if (i === input.length || input[i] !== input[runStart]) {
      encoded += input[runStart] + (i - runStart);
      runStart = i;
    }
  }
  return encoded;
};
/**
 * Runs `rle` against a table of known input/output pairs.
 *
 * @returns {boolean} true if every case matches; otherwise false.
 */
const doTestsPass = () => {
  const VALID_COMBOS = {
    "aaa": "a3",
    "aaabbc": "a3b2c1",
    // The examples from the problem statement: runs are counted separately.
    "a": "a1",
    "aabbbaa": "a2b3a2",
  };
  let testPassed = true;
  for (const [key, value] of Object.entries(VALID_COMBOS)) {
    // Encode once and reuse the result for both the log line and the check.
    const actual = rle(key);
    console.log(key, actual);
    if (value !== actual) {
      testPassed = false;
    }
  }
  return testPassed;
};
/**
 * Main execution entry: run the self-test and report the outcome.
 */
console.log(doTestsPass() ? "All tests pass!" : "There are test failures.");
You could
match groups of characters,
get the character and the count and
join it to a string.
/**
 * Run-length encodes a string by matching each run of identical characters.
 *
 * @param {string} string - Text to encode.
 * @returns {string} Each run as character + run length; "" for "".
 */
function runLengthEncoding(string) {
  // match() returns null when nothing matches (empty input), so fall back to [].
  // [\s\S] matches every UTF-16 code unit, including the line terminators
  // that `.` would silently skip.
  const runs = string.match(/([\s\S])\1*/g) || [];
  return runs
    .map((run) => run[0] + run.length) // first character of the run + its length
    .join('');
}
console.log(['a', 'aaa', 'aaabbc'].map(runLengthEncoding));
This is a bit more understandable version which iterates the given string and count the characters. If a different character is found, the last character and count is added to the result string.
At the end, a check is made to avoid emitting a count for an empty string, and the count of the last character is added to the result.
/**
 * Run-length encodes a string with a single pass and a running counter.
 *
 * @param {string} string - Text to encode.
 * @returns {string} Each run as character + run length; "" for "".
 */
function runLengthEncoding(string) {
  let encoded = '';
  let current = string[0];
  let runLength = 0;
  let pos = 0;
  while (pos < string.length) {
    const next = string[pos];
    pos += 1;
    if (next === current) {
      runLength += 1;
    } else {
      // The run just ended: flush it and start counting the new character.
      encoded += current + runLength;
      current = next;
      runLength = 1;
    }
  }
  // runLength stays 0 only for empty input, where there is nothing to flush.
  return runLength ? encoded + (current + runLength) : encoded;
}
console.log(['', 'a', 'aaa', 'aaabbc'].map(runLengthEncoding));
You can reduce the array into a multidimensional array. map and join the array to convert to string.
/**
 * Run-length encodes a string by collecting [character, count] pairs.
 *
 * @param {string} input - Text to encode.
 * @returns {string} Each run as character + run length.
 */
const rle = (input) => {
  const runs = [];
  for (const ch of input.split("")) {
    const last = runs[runs.length - 1];
    if (last && last[0] === ch) {
      last[1] += 1; // still inside the current run
    } else {
      runs.push([ch, 1]); // a new run starts here
    }
  }
  return runs.map(([ch, count]) => ch + count).join('');
};
console.log(rle("a"));
console.log(rle("aabbbaa"));
console.log(rle("aaaaaa"));
Your function rle doesn't return a result.
Also note that counting characters in a map may pass the test cases you wrote, but not the examples mentioned in your question: for the string "aabbaa" it will produce "a4b2", not "a2b2a2", because separate runs of the same character are merged.
A simpler solution:
/**
 * Run-length encodes a string by jumping from the start of one run to the next.
 *
 * @param {string} str - Text to encode.
 * @returns {string} Each run as character + run length.
 */
function runLengthEncoding(str) {
  const pieces = [];
  let start = 0;
  while (start < str.length) {
    const ch = str[start];
    // Advance `end` to the first position that no longer continues the run.
    let end = start + 1;
    while (end < str.length && str[end] == ch) {
      end += 1;
    }
    pieces.push(ch + (end - start));
    start = end;
  }
  return pieces.join("");
}
console.log(runLengthEncoding("a"));
console.log(runLengthEncoding("aabbbaa"));
console.log(runLengthEncoding("aaaaaa"));
Let's say I have this 3 emojis in a string: πππͺ
There are not any spaces or any other character except emojis in the string.
How can I remove the last emoji in javascript?
The answer below doesn't use any special package and safely removes the last emoji
/**
 * Removes UTF-16 code units from the end of `str` until fancyCount() reports
 * exactly one less than it did for the original string.
 *
 * @param {string} str - Text to shorten.
 * @returns {string} The shortened text; "" if the target count is never reached.
 */
function safeEmojiBackspace(str) {
  const targetCount = fancyCount(str) - 1;
  let remaining = str;
  for (; remaining.length > 0; remaining = remaining.slice(0, -1)) {
    if (fancyCount(remaining) === targetCount) {
      break;
    }
  }
  return remaining;
}
/**
 * Estimates the number of visible symbols in `str`: code points (variation
 * selectors U+FE00..U+FE0F excluded) are counted per zero-width-joiner
 * separated segment, then the total is divided by the number of segments.
 *
 * @param {string} str - Text to measure.
 * @returns {number} The estimate; may be fractional when joiners are mixed
 *   with other text.
 */
function fancyCount(str) {
  const joiner = "\u{200D}";
  const segments = str.split(joiner);
  const total = segments.reduce((sum, segment) => {
    // Drop the variation selectors before counting code points.
    const visible = segment.replace(/[\ufe00-\ufe0f]/g, "");
    return sum + Array.from(visible).length;
  }, 0);
  // Assumes the joiners are used appropriately (one sequence per string).
  return total / segments.length;
}
Sample usage
let str = "somethingπππͺ";
str = safeEmojiBackspace(str);//"somethingππ"
You can do this. It will always remove the last emoji.
/**
 * Cuts the markup of the #emoji element at its last space character, which
 * removes the last space-separated item.
 */
function removeEmoji() {
  const target = document.getElementById('emoji');
  const markup = target.innerHTML;
  target.innerHTML = markup.substring(0, markup.lastIndexOf(" "));
}
<p id="emoji">
π π πͺ
</p>
<button onclick="removeEmoji()">Remove</button>
I hope this is what you want.
var emoString = "π π πͺ";
emoString = emoString.slice(0, -2);
However, this would work only if you have 3 emojis in total. Hence to achieve a generalised solution, you can use the underscore functions split() and javascript function join() :
var emoString = "π π πͺ";
// Drop the LAST space-separated item. (_.rest removes the FIRST element, which
// is the opposite of what is wanted here; the native slice needs no library.)
emoString = emoString.split(' ').slice(0, -1).join(' ');
Hope this will solve your issue.
Ok, here is how I solved it:
/**
 * Returns the surrogate-pair characters of `emojiStr` without the last one.
 * Characters that are not surrogate pairs are not part of the result.
 *
 * @param {string} emojiStr - Text consisting of astral-plane emojis.
 * @returns {string} All matched emojis except the last; "" when none match.
 */
function deleteEmoji(emojiStr) {
  // match() yields null when the input has no surrogate pair at all.
  const emojis = emojiStr.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g) || [];
  return emojis.slice(0, -1).join("");
}
let emojitext = "πππͺ";
console.log(deleteEmoji(emojitext));
I was actually surprised that Unicode, in this day and age, is still not fully supported in browsers. I assume a lot of this is down to Windows and its version of UTF-16.
The OP I believe has found his own solution to the original problem, but I thought there has to be a more generic solution to surrogate pair unicode characters.
Anyway, my solution is to convert the text into a UTF-32 array, which can then be manipulated much more easily, using slice etc.
After you have done what you want to the array, just convert back.
Below is an example.
Some of the code I got from -> Is it possible to convert a string containing "high" unicode chars to an array consisting of dec values derived from utf-32 ("real") codes?
and http://speakingjs.com/es5/ch24.html
/**
 * Converts a string into an array of Unicode code points (UTF-32 values).
 *
 * @param {string} str - Text to decode.
 * @returns {number[]} One entry per code point. An unpaired surrogate is
 *   returned as its own code unit value instead of being combined with
 *   whatever follows it.
 */
function decodeUnicode(str) {
  // The string iterator walks code points, pairing surrogates only when valid.
  return Array.from(str, (ch) => ch.codePointAt(0));
}
/**
 * Encodes one Unicode code point as a UTF-16 string.
 *
 * @param {number} codePoint - Code point in the range 0..0x10FFFF.
 * @returns {string} One code unit for BMP values, a surrogate pair otherwise.
 */
function toUTF16(codePoint) {
  const TEN_BITS = 0x3FF;
  // BMP code points must come back as a string too; returning the bare number
  // would make callers append its decimal digits.
  if (codePoint <= 0xFFFF) {
    return String.fromCharCode(codePoint);
  }
  const offset = codePoint - 0x10000;
  const leadingSurrogate = 0xD800 | (offset >> 10);
  const trailingSurrogate = 0xDC00 | (offset & TEN_BITS);
  return String.fromCharCode(leadingSurrogate, trailingSurrogate);
}
/**
 * Converts an array of Unicode code points back into a string.
 *
 * @param {number[]} data - Code points, e.g. as produced by decodeUnicode().
 * @returns {string} The corresponding text.
 * @throws {RangeError} If an entry is not a valid code point.
 */
function encodeUnicode(data) {
  // String.fromCodePoint always yields a string, for BMP and astral values
  // alike, so BMP entries are no longer appended as decimal digits.
  return data.map((codePoint) => String.fromCodePoint(codePoint)).join("");
}
var unicode = decodeUnicode("πππͺ");
for (let l = 0; l < unicode.length; l ++)
console.log(encodeUnicode(
unicode.slice(0, l ? -l : unicode.length)));
console.log("pick some random ones");
let str = "";
for (let l = 0; l < 20; l ++) {
let rnd = Math.trunc(Math.random()*unicode.length);
str += encodeUnicode(unicode.slice(rnd,rnd+1));
}
console.log(str);
What's the JavaScript equivalent to this C# Method:
var x = "|f|oo||";
var y = x.Trim('|'); // "f|oo"
C# trims the selected character only at the beginning and end of the string!
One line is enough:
// Sample input with pipes at both ends and one in the middle.
var x = '|f|oo||';
// Remove runs of pipes anchored at the start (^\|+) or at the end (\|+$);
// the g flag lets both alternatives be replaced in one call.
var y = x.replace(/^\|+|\|+$/g, '');
// Show the original and the trimmed string on separate lines.
document.write(x + '<br />' + y);
^ beginning of the string
\|+ pipe, one or more times
| or
\|+ pipe, one or more times
$ end of the string
A general solution:
/**
 * Removes every leading and trailing occurrence of the given character(s).
 *
 * @param {string} s - Text to trim.
 * @param {string} c - One or more characters to strip; each one is treated
 *   literally, never as regular-expression syntax.
 * @returns {string} The trimmed text.
 */
function trim (s, c) {
  // Escape everything that is special in a RegExp so that "]", "^", "\" and
  // "-" cannot break or alter the character class built below.
  const escaped = c.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&");
  return s.replace(new RegExp(
    "^[" + escaped + "]+|[" + escaped + "]+$", "g"
  ), "");
}
const chars = ".|]\\^";
for (const c of chars) {
  const s = c + "foo" + c + c + "oo" + c + c + c;
  console.log(s, "->", trim(s, c));
}
Parameter c is expected to be a character (a string of length 1).
As mentioned in the comments, it might be useful to support multiple characters, as it's quite common to trim multiple whitespace-like characters, for example. To do this, MightyPork suggests replacing the ifs with the following line of code:
c = c.replace(/[-/\\^$*+?.()|[\]{}]/g, '\\$&');
This part [-/\\^$*+?.()|[\]{}] is a set of special characters in regular expression syntax, and $& is a placeholder which stands for the matching character, meaning that the replace function escapes special characters. Try in your browser console:
> "{[hello]}".replace(/[-/\\^$*+?.()|[\]{}]/g, '\\$&')
"\{\[hello\]\}"
Update: Was curious around the performance of different solutions and so I've updated a basic benchmark here:
https://www.measurethat.net/Benchmarks/Show/12738/0/trimming-leadingtrailing-characters
Some interesting and unexpected results running under Chrome.
https://www.measurethat.net/Benchmarks/ShowResult/182877
+-----------------------------------+-----------------------+
| Test name | Executions per second |
+-----------------------------------+-----------------------+
| Index Version (Jason Larke) | 949979.7 Ops/sec |
| Substring Version (Pho3niX83) | 197548.9 Ops/sec |
| Regex Version (leaf) | 107357.2 Ops/sec |
| Boolean Filter Version (mbaer3000)| 94162.3 Ops/sec |
| Spread Version (Robin F.) | 4242.8 Ops/sec |
+-----------------------------------+-----------------------+
Please note; tests were carried out on only a single test string (with both leading and trailing characters that needed trimming). In addition, this benchmark only gives an indication of raw speed; other factors like memory usage are also important to consider.
If you're dealing with longer strings I believe this should outperform most of the other options by reducing the number of allocated strings to either zero or one:
/**
 * Strips every leading and trailing occurrence of `ch` using index scanning.
 *
 * @param {string} str - Text to trim.
 * @param {string} ch - Single character to strip.
 * @returns {string} `str` itself when nothing had to be removed, otherwise
 *   the trimmed substring.
 */
function trim(str, ch) {
  let first = 0;
  while (first < str.length && str[first] === ch) {
    first += 1;
  }
  let last = str.length;
  while (last > first && str[last - 1] === ch) {
    last -= 1;
  }
  const untouched = first === 0 && last === str.length;
  return untouched ? str : str.substring(first, last);
}
// Usage:
trim('|hello|world|', '|'); // => 'hello|world'
Or if you want to trim from a set of multiple characters:
/**
 * Strips leading and trailing characters that occur in `chars`.
 *
 * @param {string} str - Text to trim.
 * @param {string|string[]} chars - Characters to strip; anything that offers
 *   `indexOf` works.
 * @returns {string} `str` itself when nothing had to be removed, otherwise
 *   the trimmed substring.
 */
function trimAny(str, chars) {
  let first = 0;
  while (first < str.length && chars.indexOf(str[first]) !== -1) {
    first += 1;
  }
  let last = str.length;
  while (last > first && chars.indexOf(str[last - 1]) !== -1) {
    last -= 1;
  }
  const untouched = first === 0 && last === str.length;
  return untouched ? str : str.substring(first, last);
}
// Usage:
trimAny('|hello|world ', [ '|', ' ' ]); // => 'hello|world'
// because '.indexOf' is used, you could also pass a string for the 2nd parameter:
trimAny('|hello| world ', '| '); // => 'hello| world' (interior characters are kept)
EDIT: For fun, trim words (rather than individual characters)
/**
 * Tells whether `substr` occurs in `str` exactly at index `pos`.
 * Same answer as `str.indexOf(substr, pos) === pos`, but it stops at the
 * first mismatching character instead of searching the rest of the string.
 *
 * @param {string} str - Text to inspect.
 * @param {string} substr - Text expected at `pos`.
 * @param {number} pos - Index in `str` where the comparison starts.
 * @returns {boolean} true when every character of `substr` matches.
 */
function hasSubstringAt(str, substr, pos) {
  for (var offset = 0; offset < substr.length; offset += 1) {
    var at = pos + offset;
    if (at >= str.length || str[at] != substr[offset]) {
      return false;
    }
  }
  return true;
}
/**
 * Strips every leading and trailing repetition of `word` from `str`.
 *
 * @param {string} str - Text to trim.
 * @param {string} word - Word to strip; an empty word leaves `str` unchanged.
 * @returns {string} `str` itself when nothing had to be removed, otherwise
 *   the trimmed substring.
 */
function trimWord(str, word) {
  var len = word.length;
  // An empty word "matches" everywhere without advancing, which would loop
  // forever; there is nothing to trim in that case.
  if (len === 0) {
    return str;
  }
  var start = 0,
    end = str.length;
  // `end - start >= len` keeps the two scans from crossing each other.
  while (end - start >= len && str.startsWith(word, start)) {
    start += len;
  }
  while (end - start >= len && str.endsWith(word, end)) {
    end -= len;
  }
  return (start > 0 || end < str.length) ? str.substring(start, end) : str;
}
// Usage:
trimWord('blahrealmessageblah', 'blah');
If I understood well, you want to remove a specific character only if it is at the beginning or at the end of the string (ex: ||fo||oo|||| should become foo||oo). You can create an ad hoc function as follows:
/**
 * Removes every leading and trailing occurrence of one character.
 *
 * @param {string} string - Text to trim.
 * @param {string} charToRemove - The single character to strip. Any other
 *   length (including "") leaves the text unchanged.
 * @returns {string} The trimmed text.
 */
function trimChar(string, charToRemove) {
  var ch = String(charToRemove);
  // Comparing "" against charAt() of an empty string used to loop forever.
  if (ch.length !== 1) {
    return string;
  }
  // Scan indexes instead of allocating a new substring per removed character.
  var start = 0;
  var end = string.length;
  while (start < end && string.charAt(start) === ch) {
    start++;
  }
  while (end > start && string.charAt(end - 1) === ch) {
    end--;
  }
  return string.substring(start, end);
}
I tested this function with the code below:
// Sample input with pipes at both ends and one in the middle.
var str = "|f|oo||";
// Write the original and the trimmed text into the #original / #trimmed
// elements. `$` is not defined in this snippet (presumably jQuery; confirm).
$( "#original" ).html( "Original String: '" + str + "'" );
$( "#trimmed" ).html( "Trimmed: '" + trimChar(str, "|") + "'" );
You can use a regular expression such as:
// Sample input with pipes at both ends and one in the middle.
var x = "|f|oo||";
// Remove runs of pipes anchored at the start (^\|+) or at the end (\|+$).
var y = x.replace(/^\|+|\|+$/g, "");
alert(y); // f|oo
UPDATE:
Should you wish to generalize this into a function, you can do the following:
/**
 * Backslash-escapes every character that has a special meaning in a regular
 * expression, so the text can be embedded in a pattern literally.
 *
 * @param {string} strToEscape - Raw text.
 * @returns {string} Text that is safe to use inside a RegExp source.
 */
var escapeRegExp = function (strToEscape) {
  var specialCharacters = /[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g;
  // "$&" stands for the matched character itself.
  var escaped = strToEscape.replace(specialCharacters, "\\$&");
  return escaped;
};
/**
 * Removes leading and trailing occurrences of the given character(s).
 *
 * @param {string} origString - Text to trim.
 * @param {string} charToTrim - Character(s) to strip; escaped via
 *   escapeRegExp before being placed in a character class.
 * @returns {string} The trimmed text.
 */
var trimChar = function (origString, charToTrim) {
  var charClass = "[" + escapeRegExp(charToTrim) + "]";
  var edges = new RegExp("^" + charClass + "+|" + charClass + "+$", "g");
  return origString.replace(edges, "");
};
var x = "|f|oo||";
var y = trimChar(x, "|");
alert(y); // f|oo
A regex-less version which is easy on the eye:
/**
 * Regex-less trim: removes every leading and trailing occurrence of `chars`.
 *
 * @param {string} str - Text to trim.
 * @param {string} chars - Separator (one or more characters) to strip.
 * @returns {string} The trimmed text.
 */
const trim = (str, chars) => {
  const parts = str.split(chars);
  // Empty segments mark separators. Only the ones at the edges are dropped,
  // so repeated separators inside the text survive the round trip.
  let start = 0;
  let end = parts.length;
  while (start < end && parts[start] === "") {
    start += 1;
  }
  while (end > start && parts[end - 1] === "") {
    end -= 1;
  }
  return parts.slice(start, end).join(chars);
};
For use cases where we're certain that there's no repetition of the chars off the edges.
to keep this question up to date:
here is an approach i'd choose over the regex function using the ES6 spread operator.
/**
 * Removes every leading and trailing occurrence of `character`.
 *
 * @param {string} string - Text to trim.
 * @param {string} character - Character (one code point) to strip.
 * @returns {string} The trimmed text; "" when the text consists only of
 *   `character`.
 */
function trimByChar(string, character) {
  // Work on the code-point array throughout. Code-point indexes must not be
  // fed to substring(), which counts UTF-16 code units.
  const chars = [...string];
  let start = 0;
  let end = chars.length;
  while (start < end && chars[start] === character) {
    start += 1;
  }
  while (end > start && chars[end - 1] === character) {
    end -= 1;
  }
  return chars.slice(start, end).join('');
}
Improved version after #fabian 's comment (can handle strings containing the same character only)
/**
 * Removes every leading and trailing occurrence of `character`.
 *
 * @param {string} string - Text to trim.
 * @param {string} character - Character (one code point) to strip.
 * @returns {string} The trimmed text; "" when the text consists only of
 *   `character`.
 */
function trimByChar1(string, character) {
  const arr = Array.from(string);
  const first = arr.findIndex(char => char !== character);
  if (first === -1) {
    return '';
  }
  // Scan backwards instead of reversing the array in place.
  let last = arr.length - 1;
  while (arr[last] === character) {
    last -= 1;
  }
  // Slice the code-point array: `first`/`last` are code-point indexes and
  // would be wrong offsets for substring() when astral characters are present.
  return arr.slice(first, last + 1).join('');
}
This can trim several characters at a time:
/**
 * Strips leading and trailing characters that belong to the set `c`.
 *
 * @param {string} str - Text to trim.
 * @param {string} c - Body of a regular-expression character class; it is
 *   inserted verbatim, so "\", "]" and "^" must be escaped by the caller.
 * @returns {string} The trimmed text.
 */
function trimChars (str, c) {
  const charClass = `[${c}]`;
  const edges = new RegExp(`^${charClass}+|${charClass}+$`, "g");
  return str.replace(edges, "");
}
var x = "|f|oo||";
x = trimChars(x, '|'); // f|oo
var y = "..++|f|oo||++..";
y = trimChars(y, '|.+'); // f|oo
var z = "\\f|oo\\"; // \f|oo\
// For backslash, remember to double-escape:
z = trimChars(z, "\\\\"); // f|oo
For use in your own script and if you don't mind changing the prototype, this can be a convenient "hack":
/**
 * Strips leading and trailing characters that belong to the set `c`.
 *
 * @param {string} c - Body of a regular-expression character class, inserted
 *   verbatim (the caller escapes "\", "]" and "^").
 * @returns {string} The trimmed text.
 */
String.prototype.trimChars = function (c) {
  const charClass = `[${c}]`;
  const edges = new RegExp(`^${charClass}+|${charClass}+$`, "g");
  return this.replace(edges, "");
}
var x = "|f|oo||";
x = x.trimChars('|'); // f|oo
Since I use the trimChars function extensively in one of my scripts, I prefer this solution. But there are potential issues with modifying an object's prototype.
If you define these functions in your program, your strings will have an upgraded version of trim that can trim all given characters:
/**
 * Removes leading characters that belong to `charlist`.
 *
 * @param {string} [charlist] - Body of a regular-expression character class;
 *   defaults to whitespace.
 * @returns {string} The text without the leading characters.
 */
String.prototype.trimLeft = function(charlist) {
  if (charlist === undefined)
    // "\\s" is the two-character regex escape \s. A bare "\s" in a string
    // literal is just the letter "s", which trimmed the wrong characters.
    charlist = "\\s";
  return this.replace(new RegExp("^[" + charlist + "]+"), "");
};
/**
 * Removes leading and trailing characters that belong to `charlist`.
 *
 * @param {string} [charlist] - Body of a regular-expression character class;
 *   defaults to whitespace.
 * @returns {string} The trimmed text.
 */
String.prototype.trim = function(charlist) {
  return this.trimLeft(charlist).trimRight(charlist);
};
/**
 * Removes trailing characters that belong to `charlist`.
 *
 * @param {string} [charlist] - Body of a regular-expression character class;
 *   defaults to whitespace.
 * @returns {string} The text without the trailing characters.
 */
String.prototype.trimRight = function(charlist) {
  if (charlist === undefined)
    charlist = "\\s";
  return this.replace(new RegExp("[" + charlist + "]+$"), "");
};
// Sample input wrapped in "/" and "-" characters.
var withChars = "/-center-/"
// Needs a trim() that accepts a character list; the built-in
// String.prototype.trim ignores its argument and only strips whitespace.
var withoutChars = withChars.trim("/-")
document.write(withoutChars)
Source
https://www.sitepoint.com/trimming-strings-in-javascript/
/**
 * Two-pointer trim: removes every leading and trailing occurrence of `char`.
 *
 * @param {string} str - Text to trim.
 * @param {string} char - Single character to strip.
 * @returns {string} The trimmed text.
 */
const trim = (str, char) => {
  let head;
  for (head = 0; str[head] === char; head += 1) {
    // skip leading matches
  }
  let tail;
  for (tail = str.length - 1; str[tail] === char; tail -= 1) {
    // skip trailing matches
  }
  return str.slice(head, tail + 1);
}
console.log(trim('|f|oo|', '|')); // f|oo
Non-regex solution.
Two pointers: i (beginning) & j (end).
Only move pointers if they match char and stop when they don't.
Return remaining string.
I would suggest looking at lodash and how they implemented the trim function.
See Lodash Trim for the documentation and the source to see the exact code that does the trimming.
I know this does not provide an exact answer your question, but I think it's good to set a reference to a library on such a question since others might find it useful.
This one trims all leading and trailing delimeters
/**
 * Removes all leading and trailing delimiter characters.
 *
 * @param {string} str - Text to trim.
 * @param {string} delimiter - One or more characters to strip; each is
 *   compared literally.
 * @returns {string} The trimmed text; "" when `str` holds only delimiters.
 */
const trim = (str, delimiter) => {
  // Plain comparisons instead of a generated pattern: prefixing the delimiter
  // with a backslash turned letters such as "d", "s" or "w" into regex
  // classes, and an all-delimiter string was returned untouched.
  const isDelimiter = (ch) => delimiter.includes(ch);
  let start = 0;
  let stop = str.length;
  while (start < stop && isDelimiter(str[start])) {
    start += 1;
  }
  while (stop > start && isDelimiter(str[stop - 1])) {
    stop -= 1;
  }
  return str.substring(start, stop);
}
const test = '||2|aaaa12bb3ccc|||||';
console.log(trim(test, '|')); // 2|aaaa12bb3ccc
I like the solution from #Pho3niX83...
Let's extend it with "word" instead of "char"...
/**
 * Removes every leading and trailing repetition of `_word` from `_string`.
 *
 * @param {string} _string - Text to trim.
 * @param {string} _word - Word to strip.
 * @returns {string} The trimmed text.
 */
function trimWord(_string, _word) {
  var pieces = _string.split(_word);
  // Empty pieces at either edge correspond to occurrences of the word there.
  var first = 0;
  var last = pieces.length;
  while (first < last && pieces[first] === "") {
    first++;
  }
  while (last > first && pieces[last - 1] === "") {
    last--;
  }
  return pieces.slice(first, last).join(_word);
};
The best way to resolve this task is (similar with PHP trim function):
/**
 * Trims characters from both ends of a string, similar to PHP's trim().
 *
 * @param {string} str - Text to trim.
 * @param {string} [charlist='\\s'] - Body of a regular-expression character
 *   class listing the characters to strip; defaults to whitespace.
 * @returns {string} The trimmed text.
 */
function trim( str, charlist = '\\s' ) {
  // [\s\S] instead of "." so the kept middle part may span line breaks;
  // with "." a multi-line input never matched and came back untrimmed.
  var pattern = '^[' + charlist + ']*([\\s\\S]*?)[' + charlist + ']*$';
  return str.replace( new RegExp( pattern ) , '$1' )
}
// Clicking #run trims the #input value with the characters from #charlist
// and shows the outcome in #result.
document.getElementById( 'run' ).onclick = function() {
  var text = document.getElementById( 'input' ).value;
  var charlist = document.getElementById( 'charlist' ).value;
  document.getElementById( 'result' ).value = trim( text, charlist );
}
<div>
<label for="input">Text to trim:</label><br>
<input id="input" type="text" placeholder="Text to trim" value="dfstextfsd"><br>
<label for="charlist">Charlist:</label><br>
<input id="charlist" type="text" placeholder="Charlist" value="dfs"><br>
<label for="result">Result:</label><br>
<input id="result" type="text" placeholder="Result" disabled><br>
<button type="button" id="run">Trim it!</button>
</div>
P.S.: Why did I post my answer when most people had already done so? Because I found the same mistake in all of their answers: they all use the '+' quantifier instead of '*'. Trim must remove the characters IF THEY ARE AT THE START AND/OR END, and return the original string otherwise.
Another version to use regular expression.
No or(|) used and no global(g) used.
/**
 * Backslash-escapes the characters that are special in a regular expression.
 *
 * @param {string} s - Raw text.
 * @returns {string} Text that can be embedded in a RegExp source literally.
 */
function escapeRegexp(s) {
  const specialCharacters = /[-\/\\^$*+?.()|[\]{}]/g;
  // "$&" re-inserts the matched character after the added backslash.
  const escaped = s.replace(specialCharacters, '\\$&');
  return escaped;
}
/**
 * Removes leading and trailing characters contained in `find`.
 *
 * @param {string} value - Text to trim.
 * @param {string} find - Characters to strip; escaped with escapeRegexp, so
 *   each one is taken literally.
 * @returns {string} The trimmed text.
 */
function trimSpecific(value, find) {
  const find2 = escapeRegexp(find);
  // [\s\S] instead of "." so values containing line breaks still match;
  // otherwise they were returned untrimmed.
  return value.replace(new RegExp(`^[${find2}]*([\\s\\S]*?)[${find2}]*$`), '$1')
}
console.log(trimSpecific('"a"b"', '"') === 'a"b');
console.log(trimSpecific('""ab"""', '"') === 'ab');
console.log(trimSpecific('"', '"') === '');
console.log(trimSpecific('"a', '"') === 'a');
console.log(trimSpecific('a"', '"') === 'a');
console.log(trimSpecific('[a]', '[]') === 'a');
console.log(trimSpecific('{[a]}', '[{}]') === 'a');
expanding on #leaf 's answer, here's one that can take multiple characters:
/**
 * Removes leading and trailing characters contained in `t`.
 *
 * @param {string} s - Text to trim.
 * @param {string} t - Characters to strip; each is taken literally.
 * @returns {string} The trimmed text.
 */
var trim = function (s, t) {
  // Escape only what is special inside a character class. Prefixing every
  // character with a doubled backslash added a literal "\" to the class, so
  // backslashes were trimmed as well.
  var tr = t.replace(/[-\\^\]]/g, '\\$&')
  var sr = s.replace(new RegExp(`^[${tr}]+|[${tr}]+$`, 'g'), '')
  return sr
}
/**
 * Removes repetitions of `val` from the start and the end of `text`.
 *
 * @param {string} text - Text to trim.
 * @param {string} val - Regular-expression source, inserted verbatim. Special
 *   characters such as "|" must be escaped by the caller, and the "+"
 *   quantifier applies only to the last atom of `val`.
 * @returns {string} The trimmed text.
 */
function trim(text, val) {
  const edges = new RegExp(`^${val}+|${val}+$`, 'g');
  return text.replace(edges, '');
}
"|Howdy".replace(new RegExp("^\\|"),"");
(note the double escaping. \\ needed, to have an actually single slash in the string, that then leads to escaping of | in the regExp).
Only few characters need regExp-Escaping., among them the pipe operator.
// Characters that are stripped from both ends of the input.
const special = ':;"<>?/!`~##$%^&*()+=-_ '.split("");
/**
 * Removes the characters listed in `special` from both ends of `input`.
 *
 * @param {string} input - Text to trim.
 * @returns {string} The text without leading and trailing special characters.
 */
const trim = (input) => {
  const isSpecial = (ch) => special.includes(ch);
  let start = 0;
  let end = input.length;
  while (start < end && isSpecial(input[start])) {
    start += 1;
  }
  while (end > start && isSpecial(input[end - 1])) {
    end -= 1;
  }
  return input.slice(start, end);
};
alert(trim('##This is what I use$%'));
/**
 * Removes one leading occurrence of the character `n`.
 *
 * @param {string} n - Character to remove.
 * @returns {string} The text without that leading character, or the text
 *   unchanged when it does not start with `n`.
 */
String.prototype.TrimStart = function (n) {
  var str = String(this);
  // Always return a string. Previously a non-matching call fell through and
  // returned undefined.
  return str.charAt(0) === String(n) ? str.slice(1) : str;
};
/**
 * Removes one trailing occurrence of the character `n`.
 *
 * @param {string} n - Character to remove.
 * @returns {string} The text without that trailing character, or the text
 *   unchanged when it does not end with `n`.
 */
String.prototype.TrimEnd = function (n) {
  var str = String(this);
  return str.slice(-1) === String(n) ? str.slice(0, -1) : str;
};
To my knowledge, jQuery doesn't have a built-in function for what you are asking about.
With javascript however, you can just use replace to change the content of your string:
// The pipe must be escaped (a bare | is the alternation operator and matches
// the empty string), and the g flag is needed to replace every occurrence.
x.replace(/\|/g, "");
This will replace all occurrences of | with nothing.
try:
console.log(x.replace(/\|/g,''));
Try this method:
var a = "anan gΓΌzel mi?";
if (a.endsWith("?")) a = a.slice(0, -1);
document.body.innerHTML = a;
Is there any function to do the following?
var specialStr = 'ipsum ÑÑ éé lore';
var encodedStr = someFunction(specialStr);
// then encodedStr should be like 'ipsum \u00E1\u00E1 \u00E9\u00E9 lore'
I need to encode the characters that are out of ASCII range, and need to do it with that encoding. I don't know its name. Is it Unicode maybe?
This should do the trick:
/**
 * Left-pads a hexadecimal string with zeros to at least four characters.
 *
 * @param {string} string - Hex digits, e.g. "e1".
 * @returns {string} The padded text, e.g. "00e1"; longer inputs are returned
 *   unchanged instead of throwing.
 */
function padWithLeadingZeros(string) {
  // padStart never builds an array, so inputs longer than five characters
  // no longer hit `new Array(negative)` and its RangeError.
  return string.padStart(4, "0");
}
/**
 * Builds the \uXXXX escape sequence for a UTF-16 code unit.
 *
 * @param {number} charCode - Code unit value.
 * @returns {string} Backslash, "u" and the zero-padded hex digits.
 */
function unicodeCharEscape(charCode) {
  var hexDigits = charCode.toString(16);
  return "\\u" + padWithLeadingZeros(hexDigits);
}
/**
 * Replaces every code unit above 127 with its \uXXXX escape sequence.
 *
 * @param {string} string - Text to escape.
 * @returns {string} ASCII-only text.
 */
function unicodeEscape(string) {
  var escaped = "";
  for (var i = 0; i < string.length; i++) {
    var charCode = string.charCodeAt(i);
    escaped += charCode > 127 ? unicodeCharEscape(charCode) : string.charAt(i);
  }
  return escaped;
}
For example:
var specialStr = 'ipsum ÑÑ éé lore';
var encodedStr = unicodeEscape(specialStr);
assert.equal("ipsum \\u00e1\\u00e1 \\u00e9\\u00e9 lore", encodedStr);
If you need hex encoding rather than unicode then you can simplify #Domenic's answer to:
"aΓ€ΓΓ₯fu".replace(/./g, function(c){return c.charCodeAt(0)<128?c:"\\x"+c.charCodeAt(0).toString(16)})
returns: "a\xe4\xdf\xe5fu"
Just for information you can do as Domenic said or use the escape function but that will generate unicode with a different format (more browser friendly):
>>> escape("ÑéΓΓ³ΓΊ");
"%E1%E9%ED%F3%FA"
This works for me. Specifically when using the Dropbox REST API:
/**
 * Replaces every UTF-16 code unit above 127 with a \uXXXX escape sequence.
 *
 * @param value Text to encode.
 * @returns ASCII-only text.
 */
encodeNonAsciiCharacters(value: string) {
  let out = "";
  for (let i = 0; i < value.length; i++) {
    const code = value.charCodeAt(i);
    if (code <= 127) {
      out += value.charAt(i);
      continue;
    }
    // Code units above 127 have two to four hex digits; pad to four.
    out += "\\u" + code.toString(16).padStart(4, "0");
  }
  return out;
}
BMP being Basic Multilingual Plane
According to JavaScript: the Good Parts:
JavaScript was built at a time when Unicode was a 16-bit character set, so all characters in JavaScript are 16 bits wide.
This leads me to believe that JavaScript uses UCS-2 (not UTF-16!) and can only handle characters up to U+FFFF.
Further investigation confirms this:
> String.fromCharCode(0x20001);
The fromCharCode method seems to only use the lowest 16 bits when returning the Unicode character. Trying to get U+20001 (CJK unified ideograph 20001) instead returns U+0001.
Question: is it at all possible to handle post-BMP characters in JavaScript?
2011-07-31: slide twelve from Unicode Support Shootout: The Good, The Bad, & the (mostly) Ugly covers issues related to this quite well:
Depends what you mean by "support". You can certainly put non-UCS-2 characters in a JS string using surrogates, and browsers will display them if they can.
But, each item in a JS string is a separate UTF-16 code unit. There is no language-level support for handling full characters: all the standard String members (length, split, slice etc) all deal with code units not characters, so will quite happily split surrogate pairs or hold invalid surrogate sequences.
If you want surrogate-aware methods, I'm afraid you're going to have to start writing them yourself! For example:
/**
 * Counts code points: the UTF-16 length minus one for every surrogate pair.
 *
 * @returns {number} Number of code points in the string.
 */
String.prototype.getCodePointLength = function () {
  var pairs = this.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
  return this.length - (pairs ? pairs.length : 0);
};
// Install the fallback only where the engine lacks String.fromCodePoint.
// Overwriting the native method unconditionally would drop its argument
// validation (RangeError for invalid code points).
if (!String.fromCodePoint) {
  /**
   * Builds a string from code points, emitting surrogate pairs for values
   * above 0xFFFF.
   *
   * @param {...number} codePoints - Code points to convert.
   * @returns {string} The resulting text.
   */
  String.fromCodePoint = function () {
    var chars = Array.prototype.slice.call(arguments);
    // Walk backwards so splicing in a pair does not shift unvisited entries.
    for (var i = chars.length; i-- > 0;) {
      var n = chars[i] - 0x10000;
      if (n >= 0)
        chars.splice(i, 1, 0xD800 + (n >> 10), 0xDC00 + (n & 0x3FF));
    }
    return String.fromCharCode.apply(null, chars);
  };
}
I came to the same conclusion as bobince. If you want to work with strings containing unicode characters outside of the BMP, you have to reimplement javascript's String methods. This is because javascript counts characters as each 16-bit code value. Symbols outside of the BMP need two code values to be represented. You therefore run into a case where some symbols count as two characters and some count only as one.
I've reimplemented the following methods to treat each unicode code point as a single character: .length, .charCodeAt, .fromCharCode, .charAt, .indexOf, .lastIndexOf, .splice, and .split.
You can check it out on jsfiddle: http://jsfiddle.net/Y89Du/
Here's the code without comments. I tested it, but it may still have errors. Comments are welcome.
if (!String.prototype.ucLength) {
  /**
   * Counts code points: the UTF-16 length minus one per surrogate pair.
   * Approach taken from
   * http://stackoverflow.com/questions/3744721/javascript-strings-outside-of-the-bmp
   *
   * @returns {number} Number of code points in the string.
   */
  String.prototype.ucLength = function () {
    var pairs = this.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
    return this.length - (pairs ? pairs.length : 0);
  };
}
// Installed only when the engine does not already provide codePointAt.
// NOTE(review): this fallback interprets its argument as a CODE-POINT index
// (a surrogate pair counts as one position). The standard ES2015
// String.prototype.codePointAt takes a UTF-16 code-unit index, so callers get
// different results depending on which implementation is active. Confirm
// which semantics callers expect.
if (!String.prototype.codePointAt) {
/**
 * Returns the code point at code-point position `ucPos`.
 *
 * @param {number} ucPos - Zero-based position counted in code points;
 *   values for which isNaN() is true are treated as 0.
 * @returns {?number} The code point, or null when `ucPos` is never reached.
 */
String.prototype.codePointAt = function (ucPos) {
if (isNaN(ucPos)){
ucPos = 0;
}
var str = String(this);
var codePoint = null;
var pairFound = false;
// ucIndex counts code points, i counts UTF-16 code units.
var ucIndex = -1;
var i = 0;
while (i < str.length){
ucIndex += 1;
var code = str.charCodeAt(i);
var next = str.charCodeAt(i + 1);
// A high surrogate followed by a low surrogate forms one code point.
pairFound = (0xD800 <= code && code <= 0xDBFF && 0xDC00 <= next && next <= 0xDFFF);
if (ucIndex == ucPos){
// Combine the pair into a single value, or use the lone code unit as is.
codePoint = pairFound ? ((code - 0xD800) * 0x400) + (next - 0xDC00) + 0x10000 : code;
break;
} else{
// Skip both halves of a surrogate pair.
i += pairFound ? 2 : 1;
}
}
return codePoint;
};
}
if (!String.fromCodePoint) {
  /**
   * Fallback for String.fromCodePoint: converts each argument to one UTF-16
   * code unit, or to a surrogate pair when it is above 0xFFFF.
   *
   * @param {...number} codePoints - Code points to convert.
   * @returns {string} The resulting text.
   */
  String.fromCodePoint = function () {
    return Array.prototype.map.call(arguments, function (codePoint) {
      if (codePoint > 0xFFFF) {
        var offset = codePoint - 0x10000;
        return String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
      }
      return String.fromCharCode(codePoint);
    }).join("");
  };
}
if (!String.prototype.ucCharAt) {
  /**
   * Returns the character at code-point position `ucIndex`.
   *
   * @param {number} ucIndex - Zero-based position counted in code points;
   *   values for which isNaN() is true are treated as 0.
   * @returns {string} The character (one or two UTF-16 code units), or ""
   *   when the position is out of range.
   */
  String.prototype.ucCharAt = function (ucIndex) {
    // Index the code-point array directly. The native codePointAt() takes a
    // code-unit index, so going through it returned the wrong character for
    // any position after an astral character.
    var chars = Array.from(String(this));
    var ucChar = chars[isNaN(ucIndex) ? 0 : ucIndex];
    return ucChar === undefined ? "" : ucChar;
  };
}
if (!String.prototype.ucIndexOf) {
  /**
   * Finds the first occurrence of `searchStr`, with positions counted in
   * code points.
   *
   * @param {string} searchStr - Text to look for.
   * @param {number} [ucStart=0] - Code-point position to start from; NaN and
   *   negative values are treated as 0.
   * @returns {number} Code-point position of the match, or -1.
   */
  String.prototype.ucIndexOf = function (searchStr, ucStart) {
    var haystack = String(this);
    var total = haystack.ucLength();
    var needle = String(searchStr);
    var needleLength = needle.ucLength();
    var from = (isNaN(ucStart) || ucStart < 0) ? 0 : ucStart;
    for (var pos = from; pos < total; pos++) {
      if (haystack.ucSlice(pos, pos + needleLength) === needle) {
        return pos;
      }
    }
    return -1;
  };
}
if (!String.prototype.ucLastIndexOf) {
  /**
   * Finds the last occurrence of `searchStr`, with positions counted in
   * code points.
   *
   * @param {string} searchStr - Text to look for.
   * @param {number} [ucStart] - Code-point position to search backwards
   *   from; NaN and values past the end select the last position.
   * @returns {number} Code-point position of the match, or -1.
   */
  String.prototype.ucLastIndexOf = function (searchStr, ucStart) {
    var haystack = String(this);
    var total = haystack.ucLength();
    var from = (isNaN(ucStart) || ucStart >= total) ? total - 1 : ucStart;
    var needle = String(searchStr);
    var needleLength = needle.ucLength();
    for (var pos = from; pos >= 0; pos--) {
      if (haystack.ucSlice(pos, pos + needleLength) === needle) {
        return pos;
      }
    }
    return -1;
  };
}
if (!String.prototype.ucSlice) {
  /**
   * Code-point aware counterpart of String.prototype.slice.
   *
   * @param {number} [ucStart=0] - Start position in code points; negative
   *   values count from the end.
   * @param {number} [ucStop] - End position (exclusive) in code points;
   *   negative values count from the end; defaults to the end of the string.
   * @returns {string} The selected characters.
   */
  String.prototype.ucSlice = function (ucStart, ucStop) {
    // Array.prototype.slice already implements the required semantics:
    // negative offsets, clamping to the array bounds, and an omitted stop
    // meaning "through the last character". The previous default stop of
    // length - 1 silently dropped the final character.
    return Array.from(String(this)).slice(ucStart, ucStop).join("");
  };
}
if (!String.prototype.ucSplit) {
  /**
   * Code-point aware counterpart of String.prototype.split.
   *
   * @param {string|RegExp} delimeter - Separator; "" splits the text into
   *   individual code points instead of UTF-16 code units.
   * @param {number} [limit] - Maximum number of entries to return.
   * @returns {string[]} The pieces.
   */
  String.prototype.ucSplit = function (delimeter, limit) {
    var str = String(this);
    if (delimeter == '') {
      var ucChars = Array.from(str);
      // Without a limit every character is returned. `0 + undefined` is NaN,
      // which used to truncate the result to an empty array.
      return limit === undefined ? ucChars : ucChars.slice(0, limit >>> 0);
    }
    return str.split(delimeter, limit);
  };
}
More recent JavaScript engines have String.fromCodePoint.
const ideograph = String.fromCodePoint( 0x20001 ); // outside the BMP
Also a code-point iterator, which gets you the code-point length.
/**
 * Counts the code points of a string by consuming its iterator.
 *
 * @param {string} str - Text to measure.
 * @returns {number} Number of code points.
 */
function countCodePoints( str )
{
  // Spreading drives the same Symbol.iterator, one entry per code point.
  return [...str].length;
}
console.log( ideograph.length ); // gives '2'
console.log( countCodePoints(ideograph) ); // '1'
Yes, you can. Although support to non-BMP characters directly in source documents is optional according to the ECMAScript standard, modern browsers let you use them. Naturally, the document encoding must be properly declared, and for most practical purposes you would need to use the UTF-8 encoding. Moreover, you need an editor that can handle UTF-8, and you need some input method(s); see e.g. my Full Unicode Input utility.
Using suitable tools and settings, you can write var foo = 'π '.
The non-BMP characters will be internally represented as surrogate pairs, so each non-BMP character counts as 2 in the string length.
Using for (c of this) instruction, one can make various computations on a string that contains non-BMP characters. For instance, to compute the string length, and to get the nth character of the string:
/**
 * Counts the code points of the string (a surrogate pair counts once).
 *
 * @returns {number} Number of code points.
 */
String.prototype.magicLength = function()
{
  // Array.from walks the string iterator, which yields whole code points.
  return Array.from(this).length;
}
/**
 * Returns the n-th character, counting whole code points.
 *
 * @param {number} n - Zero-based position counted in code points.
 * @returns {string} The character, or "" when the position does not exist.
 */
String.prototype.magicCharAt = function(n)
{
  var chars = Array.from(this); // one entry per code point
  for (var k = 0; k < chars.length; k++)
  {
    if (k == n) return chars[k] + "";
  }
  return "";
}
This old topic has now a simple solution in ES6:
Split characters into an array
simple version
[..."π΄ππβπ ππ"] // ["π΄", "π", "π", "β", "π ", "π", "π"]
Then having each one separated you can handle them easily for most common cases.
Credit: DownGoat
Full solution
To handle special emojis such as the one in the comment, one can search for the joining character (char code 8205, i.e. U+200D ZERO WIDTH JOINER) and merge the pieces around it. Here is how:
// Sample: a family emoji (woman, ZWJ, woman, ZWJ, girl, ZWJ, girl) followed
// by two ordinary emojis. Written with escapes so it survives any encoding.
let myStr = "\u{1F469}\u200D\u{1F469}\u200D\u{1F467}\u200D\u{1F467}\u{1F600}\u{1F60E}"
let arr = [...myStr]
const ZERO_WIDTH_JOINER = 0x200D // 8205
// Walk backwards so that splicing never shifts entries still to be visited.
// A joiner at index 0 or at the last index has no neighbour on one side, so
// only the positions in between are inspected.
for (let i = arr.length - 2; i > 0; i--) {
  if (arr[i].charCodeAt(0) === ZERO_WIDTH_JOINER) { // special combination character
    arr[i - 1] += arr[i] + arr[i + 1]; // combine them back to a single emoji
    arr.splice(i, 2)
  }
}
console.log(arr.length) //3
Haven't found a case where this doesn't work. Comment if you do.
To conclude
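Char code 8205 is U+200D ZERO WIDTH JOINER, the Unicode character that glues several emojis into one displayed symbol. It is part of the text itself rather than something JavaScript adds; JavaScript merely exposes the string as UTF-16 code units, so such a sequence appears as several separate characters.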