JavaScript strings outside of the BMP - javascript

BMP being Basic Multilingual Plane
According to JavaScript: the Good Parts:
JavaScript was built at a time when Unicode was a 16-bit character set, so all characters in JavaScript are 16 bits wide.
This leads me to believe that JavaScript uses UCS-2 (not UTF-16!) and can only handle characters up to U+FFFF.
Further investigation confirms this:
> String.fromCharCode(0x20001);
The fromCharCode method seems to only use the lowest 16 bits when returning the Unicode character. Trying to get U+20001 (CJK unified ideograph 20001) instead returns U+0001.
Question: is it at all possible to handle post-BMP characters in JavaScript?
2011-07-31: slide twelve from Unicode Support Shootout: The Good, The Bad, & the (mostly) Ugly covers issues related to this quite well:

Depends what you mean by β€˜support’. You can certainly put non-UCS-2 characters in a JS string using surrogates, and browsers will display them if they can.
But, each item in a JS string is a separate UTF-16 code unit. There is no language-level support for handling full characters: all the standard String members (length, split, slice etc) all deal with code units not characters, so will quite happily split surrogate pairs or hold invalid surrogate sequences.
If you want surrogate-aware methods, I'm afraid you're going to have to start writing them yourself! For example:
String.prototype.getCodePointLength= function() {
return this.length-this.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g).length+1;
};
String.fromCodePoint= function() {
var chars= Array.prototype.slice.call(arguments);
for (var i= chars.length; i-->0;) {
var n = chars[i]-0x10000;
if (n>=0)
chars.splice(i, 1, 0xD800+(n>>10), 0xDC00+(n&0x3FF));
}
return String.fromCharCode.apply(null, chars);
};

I came to the same conclusion as bobince. If you want to work with strings containing unicode characters outside of the BMP, you have to reimplement javascript's String methods. This is because javascript counts characters as each 16-bit code value. Symbols outside of the BMP need two code values to be represented. You therefore run into a case where some symbols count as two characters and some count only as one.
I've reimplemented the following methods to treat each unicode code point as a single character: .length, .charCodeAt, .fromCharCode, .charAt, .indexOf, .lastIndexOf, .splice, and .split.
You can check it out on jsfiddle: http://jsfiddle.net/Y89Du/
Here's the code without comments. I tested it, but it may still have errors. Comments are welcome.
if (!String.prototype.ucLength) {
String.prototype.ucLength = function() {
// this solution was taken from
// http://stackoverflow.com/questions/3744721/javascript-strings-outside-of-the-bmp
return this.length - this.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g).length + 1;
};
}
if (!String.prototype.codePointAt) {
String.prototype.codePointAt = function (ucPos) {
if (isNaN(ucPos)){
ucPos = 0;
}
var str = String(this);
var codePoint = null;
var pairFound = false;
var ucIndex = -1;
var i = 0;
while (i < str.length){
ucIndex += 1;
var code = str.charCodeAt(i);
var next = str.charCodeAt(i + 1);
pairFound = (0xD800 <= code && code <= 0xDBFF && 0xDC00 <= next && next <= 0xDFFF);
if (ucIndex == ucPos){
codePoint = pairFound ? ((code - 0xD800) * 0x400) + (next - 0xDC00) + 0x10000 : code;
break;
} else{
i += pairFound ? 2 : 1;
}
}
return codePoint;
};
}
if (!String.fromCodePoint) {
String.fromCodePoint = function () {
var strChars = [], codePoint, offset, codeValues, i;
for (i = 0; i < arguments.length; ++i) {
codePoint = arguments[i];
offset = codePoint - 0x10000;
if (codePoint > 0xFFFF){
codeValues = [0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF)];
} else{
codeValues = [codePoint];
}
strChars.push(String.fromCharCode.apply(null, codeValues));
}
return strChars.join("");
};
}
if (!String.prototype.ucCharAt) {
String.prototype.ucCharAt = function (ucIndex) {
var str = String(this);
var codePoint = str.codePointAt(ucIndex);
var ucChar = String.fromCodePoint(codePoint);
return ucChar;
};
}
if (!String.prototype.ucIndexOf) {
String.prototype.ucIndexOf = function (searchStr, ucStart) {
if (isNaN(ucStart)){
ucStart = 0;
}
if (ucStart < 0){
ucStart = 0;
}
var str = String(this);
var strUCLength = str.ucLength();
searchStr = String(searchStr);
var ucSearchLength = searchStr.ucLength();
var i = ucStart;
while (i < strUCLength){
var ucSlice = str.ucSlice(i,i+ucSearchLength);
if (ucSlice == searchStr){
return i;
}
i++;
}
return -1;
};
}
if (!String.prototype.ucLastIndexOf) {
String.prototype.ucLastIndexOf = function (searchStr, ucStart) {
var str = String(this);
var strUCLength = str.ucLength();
if (isNaN(ucStart)){
ucStart = strUCLength - 1;
}
if (ucStart >= strUCLength){
ucStart = strUCLength - 1;
}
searchStr = String(searchStr);
var ucSearchLength = searchStr.ucLength();
var i = ucStart;
while (i >= 0){
var ucSlice = str.ucSlice(i,i+ucSearchLength);
if (ucSlice == searchStr){
return i;
}
i--;
}
return -1;
};
}
if (!String.prototype.ucSlice) {
String.prototype.ucSlice = function (ucStart, ucStop) {
var str = String(this);
var strUCLength = str.ucLength();
if (isNaN(ucStart)){
ucStart = 0;
}
if (ucStart < 0){
ucStart = strUCLength + ucStart;
if (ucStart < 0){ ucStart = 0;}
}
if (typeof(ucStop) == 'undefined'){
ucStop = strUCLength - 1;
}
if (ucStop < 0){
ucStop = strUCLength + ucStop;
if (ucStop < 0){ ucStop = 0;}
}
var ucChars = [];
var i = ucStart;
while (i < ucStop){
ucChars.push(str.ucCharAt(i));
i++;
}
return ucChars.join("");
};
}
if (!String.prototype.ucSplit) {
String.prototype.ucSplit = function (delimeter, limit) {
var str = String(this);
var strUCLength = str.ucLength();
var ucChars = [];
if (delimeter == ''){
for (var i = 0; i < strUCLength; i++){
ucChars.push(str.ucCharAt(i));
}
ucChars = ucChars.slice(0, 0 + limit);
} else{
ucChars = str.split(delimeter, limit);
}
return ucChars;
};
}

More recent JavaScript engines have String.fromCodePoint.
const ideograph = String.fromCodePoint( 0x20001 ); // outside the BMP
Also a code-point iterator, which gets you the code-point length.
function countCodePoints( str )
{
const i = str[Symbol.iterator]();
let count = 0;
while( !i.next().done ) ++count;
return count;
}
console.log( ideograph.length ); // gives '2'
console.log( countCodePoints(ideograph) ); // '1'

Yes, you can. Although support to non-BMP characters directly in source documents is optional according to the ECMAScript standard, modern browsers let you use them. Naturally, the document encoding must be properly declared, and for most practical purposes you would need to use the UTF-8 encoding. Moreover, you need an editor that can handle UTF-8, and you need some input method(s); see e.g. my Full Unicode Input utility.
Using suitable tools and settings, you can write var foo = '𠀁'.
The non-BMP characters will be internally represented as surrogate pairs, so each non-BMP character counts as 2 in the string length.

Using for (c of this) instruction, one can make various computations on a string that contains non-BMP characters. For instance, to compute the string length, and to get the nth character of the string:
String.prototype.magicLength = function()
{
var c, k;
k = 0;
for (c of this) // iterate each char of this
{
k++;
}
return k;
}
String.prototype.magicCharAt = function(n)
{
var c, k;
k = 0;
for (c of this) // iterate each char of this
{
if (k == n) return c + "";
k++;
}
return "";
}

This old topic has now a simple solution in ES6:
Split characters into an array
simple version
[..."πŸ˜΄πŸ˜„πŸ˜ƒβ›”πŸŽ πŸš“πŸš‡"] // ["😴", "πŸ˜„", "πŸ˜ƒ", "β›”", "🎠", "πŸš“", "πŸš‡"]
Then having each one separated you can handle them easily for most common cases.
Credit: DownGoat
Full solution
To overcome special emojis as the one in the comment, one can search for the connection charecter (char code 8205 in UTF-16) and make some modifications. Here is how:
let myStr = "πŸ‘©β€πŸ‘©β€πŸ‘§β€πŸ‘§πŸ˜ƒπŒ†"
let arr = [...myStr]
for (i = arr.length-1; i--; i>= 0) {
if (arr[i].charCodeAt(0) == 8205) { // special combination character
arr[i-1] += arr[i] + arr[i+1]; // combine them back to a single emoji
arr.splice(i, 2)
}
}
console.log(arr.length) //3
Haven't found a case where this doesn't work. Comment if you do.
To conclude
it seems that JS uses the 8205 char code to represent UCS-2 characters as a UTF-16 combinations.

Related

How can i make a loop that will show '-' mark x time iteration was? [duplicate]

In Perl I can repeat a character multiple times using the syntax:
$a = "a" x 10; // results in "aaaaaaaaaa"
Is there a simple way to accomplish this in Javascript? I can obviously use a function, but I was wondering if there was any built in approach, or some other clever technique.
These days, the repeat string method is implemented almost everywhere. (It is not in Internet Explorer.) So unless you need to support older browsers, you can simply write:
"a".repeat(10)
Before repeat, we used this hack:
Array(11).join("a") // create string with 10 a's: "aaaaaaaaaa"
(Note that an array of length 11 gets you only 10 "a"s, since Array.join puts the argument between the array elements.)
Simon also points out that according to this benchmark, it appears that it's faster in Safari and Chrome (but not Firefox) to repeat a character multiple times by simply appending using a for loop (although a bit less concise).
In a new ES6 harmony, you will have native way for doing this with repeat. Also ES6 right now only experimental, this feature is already available in Edge, FF, Chrome and Safari
"abc".repeat(3) // "abcabcabc"
And surely if repeat function is not available you can use old-good Array(n + 1).join("abc")
Convenient if you repeat yourself a lot:
String.prototype.repeat = String.prototype.repeat || function(n){
n= n || 1;
return Array(n+1).join(this);
}
alert( 'Are we there yet?\nNo.\n'.repeat(10) )
Array(10).fill('a').join('')
Although the most voted answer is a bit more compact, with this approach you don't have to add an extra array item.
An alternative is:
for(var word = ''; word.length < 10; word += 'a'){}
If you need to repeat multiple chars, multiply your conditional:
for(var word = ''; word.length < 10 * 3; word += 'foo'){}
NOTE: You do not have to overshoot by 1 as with word = Array(11).join('a')
The most performance-wice way is https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/repeat
Short version is below.
String.prototype.repeat = function(count) {
if (count < 1) return '';
var result = '', pattern = this.valueOf();
while (count > 1) {
if (count & 1) result += pattern;
count >>>= 1, pattern += pattern;
}
return result + pattern;
};
var a = "a";
console.debug(a.repeat(10));
Polyfill from Mozilla:
if (!String.prototype.repeat) {
String.prototype.repeat = function(count) {
'use strict';
if (this == null) {
throw new TypeError('can\'t convert ' + this + ' to object');
}
var str = '' + this;
count = +count;
if (count != count) {
count = 0;
}
if (count < 0) {
throw new RangeError('repeat count must be non-negative');
}
if (count == Infinity) {
throw new RangeError('repeat count must be less than infinity');
}
count = Math.floor(count);
if (str.length == 0 || count == 0) {
return '';
}
// Ensuring count is a 31-bit integer allows us to heavily optimize the
// main part. But anyway, most current (August 2014) browsers can't handle
// strings 1 << 28 chars or longer, so:
if (str.length * count >= 1 << 28) {
throw new RangeError('repeat count must not overflow maximum string size');
}
var rpt = '';
for (;;) {
if ((count & 1) == 1) {
rpt += str;
}
count >>>= 1;
if (count == 0) {
break;
}
str += str;
}
// Could we try:
// return Array(count + 1).join(this);
return rpt;
}
}
If you're not opposed to including a library in your project, lodash has a repeat function.
_.repeat('*', 3);
// β†’ '***
https://lodash.com/docs#repeat
For all browsers
The following function will perform a lot faster than the option suggested in the accepted answer:
var repeat = function(str, count) {
var array = [];
for(var i = 0; i < count;)
array[i++] = str;
return array.join('');
}
You'd use it like this :
var repeatedString = repeat("a", 10);
To compare the performance of this function with that of the option proposed in the accepted answer, see this Fiddle and this Fiddle for benchmarks.
For moderns browsers only
In modern browsers, you can now do this using String.prototype.repeat method:
var repeatedString = "a".repeat(10);
Read more about this method on MDN.
This option is even faster. Unfortunately, it doesn't work in any version of Internet explorer. The numbers in the table specify the first browser version that fully supports the method:
In ES2015/ES6 you can use "*".repeat(n)
So just add this to your projects, and your are good to go.
String.prototype.repeat = String.prototype.repeat ||
function(n) {
if (n < 0) throw new RangeError("invalid count value");
if (n == 0) return "";
return new Array(n + 1).join(this.toString())
};
String.repeat() is supported by 96.39% of browsers as of now.
function pad(text, maxLength){
return text + "0".repeat(maxLength - text.length);
}
console.log(pad('text', 7)); //text000
/**
* Repeat a string `n`-times (recursive)
* #param {String} s - The string you want to repeat.
* #param {Number} n - The times to repeat the string.
* #param {String} d - A delimiter between each string.
*/
var repeat = function (s, n, d) {
return --n ? s + (d || "") + repeat(s, n, d) : "" + s;
};
var foo = "foo";
console.log(
"%s\n%s\n%s\n%s",
repeat(foo), // "foo"
repeat(foo, 2), // "foofoo"
repeat(foo, "2"), // "foofoo"
repeat(foo, 2, "-") // "foo-foo"
);
Just for the fun of it, here is another way by using the toFixed(), used to format floating point numbers.
By doing
(0).toFixed(2)
(0).toFixed(3)
(0).toFixed(4)
we get
0.00
0.000
0.0000
If the first two characters 0. are deleted, we can use this repeating pattern to generate any repetition.
function repeat(str, nTimes) {
return (0).toFixed(nTimes).substr(2).replaceAll('0', str);
}
console.info(repeat('3', 5));
console.info(repeat('hello ', 4));
Another interesting way to quickly repeat n character is to use idea from quick exponentiation algorithm:
var repeatString = function(string, n) {
var result = '', i;
for (i = 1; i <= n; i *= 2) {
if ((n & i) === i) {
result += string;
}
string = string + string;
}
return result;
};
For repeat a value in my projects i use repeat
For example:
var n = 6;
for (i = 0; i < n; i++) {
console.log("#".repeat(i+1))
}
but be careful because this method has been added to the ECMAScript 6 specification.
function repeatString(n, string) {
var repeat = [];
repeat.length = n + 1;
return repeat.join(string);
}
repeatString(3,'x'); // => xxx
repeatString(10,'🌹'); // => "🌹🌹🌹🌹🌹🌹🌹🌹🌹🌹"
This is how you can call a function and get the result by the helps of Array() and join()
using Typescript and arrow fun
const repeatString = (str: string, num: number) => num > 0 ?
Array(num+1).join(str) : "";
console.log(repeatString("🌷",10))
//outputs: 🌷🌷🌷🌷🌷🌷🌷🌷🌷🌷
function repeatString(str, num) {
// Array(num+1) is the string you want to repeat and the times to repeat the string
return num > 0 ? Array(num+1).join(str) : "";
}
console.log(repeatString("a",10))
// outputs: aaaaaaaaaa
console.log(repeatString("🌷",10))
//outputs: 🌷🌷🌷🌷🌷🌷🌷🌷🌷🌷
Here is what I use:
function repeat(str, num) {
var holder = [];
for(var i=0; i<num; i++) {
holder.push(str);
}
return holder.join('');
}
I realize that it's not a popular task, what if you need to repeat your string not an integer number of times?
It's possible with repeat() and slice(), here's how:
String.prototype.fracRepeat = function(n){
if(n < 0) n = 0;
var n_int = ~~n; // amount of whole times to repeat
var n_frac = n - n_int; // amount of fraction times (e.g., 0.5)
var frac_length = ~~(n_frac * this.length); // length in characters of fraction part, floored
return this.repeat(n) + this.slice(0, frac_length);
}
And below a shortened version:
String.prototype.fracRepeat = function(n){
if(n < 0) n = 0;
return this.repeat(n) + this.slice(0, ~~((n - ~~n) * this.length));
}
var s = "abcd";
console.log(s.fracRepeat(2.5))
I'm going to expand on #bonbon's answer. His method is an easy way to "append N chars to an existing string", just in case anyone needs to do that. For example since "a google" is a 1 followed by 100 zeros.
for(var google = '1'; google.length < 1 + 100; google += '0'){}
document.getElementById('el').innerText = google;
<div>This is "a google":</div>
<div id="el"></div>
NOTE: You do have to add the length of the original string to the conditional.
Lodash offers a similar functionality as the Javascript repeat() function which is not available in all browers. It is called _.repeat and available since version 3.0.0:
_.repeat('a', 10);
var stringRepeat = function(string, val) {
var newString = [];
for(var i = 0; i < val; i++) {
newString.push(string);
}
return newString.join('');
}
var repeatedString = stringRepeat("a", 1);
Can be used as a one-liner too:
function repeat(str, len) {
while (str.length < len) str += str.substr(0, len-str.length);
return str;
}
In CoffeeScript:
( 'a' for dot in [0..10]).join('')
String.prototype.repeat = function (n) { n = Math.abs(n) || 1; return Array(n + 1).join(this || ''); };
// console.log("0".repeat(3) , "0".repeat(-3))
// return: "000" "000"

Any alternative way of using this .length & .split()?

I want to split lower, upper & also the value of textBox without using .split() and also I want
to find the length of the string without using .length. Can anybody solve my problem I am tried but
I cannot find the exact logic for this problem.
var lowercase = "abcdefghijklmnopqrstuvwxyz";
var uppercase = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
function Print() {
var input = document.getElementById('demo').value;
document.write(document.getElementById('demo1').innerHTML = toUpper(input));
}
function toUpper(input) {
var upperCase = uppercase.split(""); //other way to split uppercase
var lowerCase = lowercase.split(""); //other way to split lowercase
var inputText = input.split(""); //other way to split input
var newText = "";
var found;
for (var i = 0; i < inputText.length; i++) { //not using .length to other way to find the size of inputText
found = false;
for (var ctr = 0; ctr < lowerCase.length; ctr++) { //not using .length other way to find the size of lowerCase
if (inputText[i] == lowerCase[ctr]) {
found = true;
break;
}
}
if (found) { //true
newText = newText + upperCase[ctr];
} else {
newText = newText + inputText[i];
}
}
return newText;
}
You can count the length of a string using the array function reduce.
Reduce loops over all elements in an array and executes a function you give it to reduce it to one value, you can read more here.
To get reduce working on strings, you need to use Array.from, like this:
Array.from(lowerCase).reduce((sum, carry) => sum + 1, 0) // 26
Reduce accepts a starting argument, which we set to zero here.
This way you do not need to use the split or length functions.
You don't need to check if the input is in a string either, you can use charCodeAt() and fromCharCode().
If you take your input and loop through it using Array.from() then forEach, you can get something which looks like this:
function print() {
const input = document.querySelector('#input').value;
document.querySelector('#target').value = stringToUpper(input);
}
function stringToUpper(input) {
let output = "";
Array.from(input).forEach(char => output += charToUpper(char));
return output;
}
function charToUpper(char) {
let code = char.charCodeAt(0);
code >= 97 && code <= 122 ? code -= 32 : code;
return String.fromCharCode(code);
}
<div>
<input id="input" placeholder="enter text here">
</div>
<button onclick="print()">To Upper</button>
<div>
<input id="target">
</div>
The key line is where we take the output and add the char (as upper) to it:
output += charToUpper(char)
If you don't know about arrow functions, you can read more here
This line:
code >= 97 && code <= 122 ? code -= 32 : code;
is just checking if the char is lower case (number between 97 and 122) and if so, subtracting 32 to get it to upper case.
The reason it is subtract not add is in utf-16, the chars are laid out like this:
ABCDEFGHIJKLMNOPQRTUWXYZabcdefghijklmnopqrtuwxyz
See here for more
I don't know what you mean by "split the value of textBox", but one way to determine the length of a string without using .length would be to use a for...of loop and have a counter increment each time it runs to keep track of the number of characters in the string.
let string = 'boo'
let lengthCounter = 0
for (let char of string) {
lengthCounter++
}
//lengthCounter = 3
https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/for...of
You can define your own split and length functions:
function mySplit(a){
var counter = 0;
rslt = [];
var val = a[counter];
while(typeof val != "undefined"){
rslt.push(a[counter]);
counter ++;
val = a[counter];
}
return rslt;
}
function myLength(a){
var counter = 0;
var val = a[counter];
while(typeof val != "undefined"){
counter ++;
val = a[counter];
}
return counter;
}
Your function now should be like:
function toUpper(input) {
var upperCase = mySplit(uppercase);
var lowerCase = mySplit(lowercase);
var inputText = mySplit(input);
var newText = "";
var found;
for (var i = 0; i < myLength(inputText); i++) {
found = false;
for (var ctr = 0; ctr < myLength(lowerCase); ctr++) {
if (inputText[i] == lowerCase[ctr]) {
found = true;
break;
}
}
if (found) { //true
newText = newText + upperCase[ctr];
} else {
newText = newText + inputText[i];
}
}
return newText;
}
The simplest way would be to just use the build in function of javascript .toUpperCase() (see example 1). https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/toUpperCase
Else if you insist on using a for.loop you may do so aswell (see example two). You do not need the split() function since a string already is an arrayof characters. Also be aware that not all characters in the web have lowercase counterparts, so the logic itself is flawed.
//REM: This lines are not required.
/*
var lowercase = "abcdefghijklmnopqrstuvwxyz";
var uppercase = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
function Print() {
var input = document.getElementById('demo').value;
document.write(document.getElementById('demo1').innerHTML = toUpper(input));
}
*/
//REM: Version 1 (using string.toUpperCase())
(function toUpper1(input){
var tReturn = (input || '').toUpperCase();
console.log('toUpper1', tReturn);
return tReturn
}('abcDEFghiJKL'));
//REM: Version 2 (using your way)
(function toUpper2(input){
var tReturn = '';
if(input && input.length){
for(let i=0, j=input.length; i<j; i++){
tReturn += (input[i] === input[i].toLowerCase()) ? input[i].toUpperCase() : input[i]
}
};
console.log('toUpper2', tReturn);
return tReturn
}('abcDEFghiJKL'));

Join string based on startsWith() and endsWith()

I have string var str1 = 'foobarbaz' and var str2 = 'bazfoo'
I want to join them based on overlapping starting and ending characters. The result I am looking for is 'foobarbazfoo'.
I am currently doing it in a following way:
function merge(str1, str2) {
var size = Math.min(str1.length, str2.length);
index = 0;
for (var i = 0; i < size; i++) {
var ends = str1.substr(str1.length - i);
var starts = str2.substr(0, i);
if (ends === starts) {
index = i;
}
}
if (index === 0) {
throw 'Strings do not overlap';
} else {
return str1 + str2.substr(index, str2.length);
}
}
I wonder, if there is more elegant and efficient way of doing it ?
i think it would be a good idea to add the function to the String's prototype and using startsWith() and Conditional (ternary) Operator this what i could come up with :
String.prototype.merge = function(str) {
let match;
for (let i = this.length; i >= 0; i--)
(str.startsWith(this.slice(i))) && (match = this.slice(i));
return this.slice(0, this.indexOf(match)) + str.slice(str.indexOf(match), str.length)
}
let merged = 'foobarbaz'.merge('bazfoo')
console.log(merged);
in terms of speed, both methods are identical ( tested execution time with Performance.now() )
but less lines and a declarative rather than imperative code.
feel free to choose betwee slice and substring ( slice vs substring )

How can I remove the last emoji of a group of emojis in javascript?

Let's say I have this 3 emojis in a string: πŸ˜€πŸŽƒπŸ‘ͺ
There are not any spaces or any other character except emojis in the string.
How can I remove the last emoji in javascript?
The answer below doesn't use any special package and safely removes the last emoji
function safeEmojiBackspace(str)
{
let initialRealCount = fancyCount(str);
while(str.length > 0 && fancyCount(str) !== initialRealCount - 1)
{
str = str.substring(0,str.length - 1);
}
return str;
}
function fancyCount(str){
const joiner = "\u{200D}";
const split = str.split(joiner);
let count = 0;
for(const s of split){
//removing the variation selectors
const num = Array.from(s.split(/[\ufe00-\ufe0f]/).join("")).length;
count += num;
}
//assuming the joiners are used appropriately
return count / split.length;
}
Sample usage
let str = "somethingπŸ˜€πŸŽƒπŸ‘ͺ";
str = safeEmojiBackspace(str);//"somethingπŸ˜€πŸŽƒ"
You can do this. It will always remove the last emoji.
function removeEmoji() {
var emoStringArray = document.getElementById('emoji').innerHTML;
var lastIndex = emoStringArray.lastIndexOf(" ");
var stripedEmoStringArray = emoStringArray.substring(0, lastIndex);
document.getElementById('emoji').innerHTML = stripedEmoStringArray;
}
<p id="emoji">
πŸ˜€ πŸŽƒ πŸ‘ͺ
</p>
<button onclick="removeEmoji()">Remove</button>
I hope this is what you want.
var emoString = "πŸ˜€ πŸŽƒ πŸ‘ͺ";
emoString = emoString.slice(0, -2);
However, this would work only if you have 3 emojis in total. Hence to achieve a generalised solution, you can use the underscore functions split() and javascript function join() :
var emoString = "πŸ˜€ πŸŽƒ πŸ‘ͺ";
emoString = _.rest(emoString.split(' ')).join(' ')
Hope this will solve your issue.
Ok, here is how I solved it:
function deleteEmoji(emojiStr) {
let emojisArray = emojiStr.match(/([\uD800-\uDBFF][\uDC00-\uDFFF])/g);
emojisArray = emojisArray.splice(0, emojisArray.length - 1);
return emojisArray.join("");
}
let emojitext = "πŸ˜€πŸŽƒπŸ‘ͺ";
console.log(deleteEmoji(emojitext));
I was actually surprised that unicode in this day an age is still not fully supported in browsers. I assume a lot of this is down to windows and it's version of UTF-16.
The OP I believe has found his own solution to the original problem, but I thought there has to be a more generic solution to surrogate pair unicode characters.
Anyway, so my solution is convert the text into a UTF-32 array, these can then be manipulated must easier, using slice etc.
After you have done what you want to the array, just convert back.
Below is an example.
Some of the code I got from -> Is it possible to convert a string containing "high" unicode chars to an array consisting of dec values derived from utf-32 ("real") codes?
and http://speakingjs.com/es5/ch24.html
function decodeUnicode(str) {
const r = [];
let i = 0;
while(i < str.length) {
let chr = str.charCodeAt(i++);
if(chr >= 0xD800 && chr <= 0xDBFF) {
var low = str.charCodeAt(i++);
r.push(0x10000 +
((chr - 0xD800) << 10) | (low - 0xDC00));
} else {
r.push(chr);
}
}
return r;
}
function toUTF16(codePoint) {
const TEN_BITS = parseInt('1111111111', 2);
if (codePoint <= 0xFFFF) { return codePoint; }
codePoint -= 0x10000;
const leadingSurrogate = 0xD800 | (codePoint >> 10);
const trailingSurrogate = 0xDC00 | (codePoint & TEN_BITS);
return String.fromCharCode(leadingSurrogate) +
String.fromCharCode(trailingSurrogate);
}
function encodeUnicode(data) {
return data.reduce((a, v) => {
a += toUTF16(v);
return a;
},"");
}
var unicode = decodeUnicode("πŸ˜€πŸŽƒπŸ‘ͺ");
for (let l = 0; l < unicode.length; l ++)
console.log(encodeUnicode(
unicode.slice(0, l ? -l : unicode.length)));
console.log("pick some random ones");
let str = "";
for (let l = 0; l < 20; l ++) {
let rnd = Math.trunc(Math.random()*unicode.length);
str += encodeUnicode(unicode.slice(rnd,rnd+1));
}
console.log(str);

UTF-8 support for regular expression in Javascript

I am trying to create a Javascript function that would find all positions for a set of patterns inside a UTF-8 string. For example:
I have a string "detaj" (it's a transcription written with International Phonetic Alphabet symbols, so I need a full UTF-8 support).
And I have an array of patterns: ["(?!dʒ)d", "(?!tʃ)t"] (each string is also UTF-8 encoded).
I need to find the position of each pattern and obtain the following array:
[0] => [0, "(?!dΚ’)d"],
[1] => [2, "(?!tʃ)t"]
0 - is the position of the symbol "d", 2 - is the position of the symbol "t".
I started with this function:
https://stackoverflow.com/a/3410557/2006215
var str = "I learned to play the Ukulele in Lebanon."
var regex = /le/gi, result, indices = [];
while ( (result = regex.exec(str)) ) {
indices.push(result.index);
}
And I changed it to something like this:
function getAllIndicesOfArrayOfStringInsideStringRegex (sounds_regex_array, word_transcription) {
var allIndices = [];
for (var i = 0; i < sounds_regex_array.length; i++) {
var currentSoundRegex = sounds_regex_array[i];
// straightforward approach doesn't work:
//var pattern = new RegExp(currentSoundRegex, "g");
// hexEncode is taken from here - https://stackoverflow.com/a/21648161/2006215 - doesn't work:
//var pattern = new RegExp(currentSoundRegex.hexEncode, "g");
// I'm trying to use utf8.js plugin from here - https://github.com/mathiasbynens/utf8.js - doesn't work:
var pattern = new RegExp(utf8.encode(currentSoundRegex), "g");
var indices = getIndicesOfRegex (pattern, word_transcription);
for (var j = 0; j < indices.length; j++) {
allIndices.push([indices[j], currentSoundRegex ]);
}
}
return allIndices;
}
function getIndicesOfRegex (regex, str) {
var result, indices = [];
while (result = regex.exec(str)) {
indices.push(result.index);
}
return indices;
}
Anybody has any ideas?
UPDATE: I take both the transcription and the regex patterns from json file that I generate with PHP from UTF-8 strings. I am not sure how to call this, but it's not UTF-8. In any case it doesn't work with my Javascript function.
var questions = [{"word":"sorte","word_transcription":"s\u0254\u0281t","sounds_array":["d","t"],"sounds_regex_array":["(?!d\u0292)d","(?!t\u0283)t"]}];
I found where the problem was.
The error was triggered because I tried to execute lookbehind in Javascript, which is not supported.
The workaround for custom lookbehind functions is proposed here - http://blog.stevenlevithan.com/archives/javascript-regex-lookbehind
But finally I just did my own modifications of the code. The above functions require XRegExp library, which is pretty heavy.
My solution:
function getIndicesOfRegex (currentSoundRegex, pattern, str) {
var result, indices = [];
while (result = pattern.exec(str)) {
if ((currentSoundRegex === "Κ’") && (result.index > 0) && (str.substring(result.index-1, result.index) === "d")) { continue; }
if ((currentSoundRegex === "Κƒ") && (result.index > 0) && (str.substring(result.index-1, result.index) === "t")) { continue; }
indices.push(result.index);
}
return indices;
}
function getAllIndicesOfArrayOfStringInsideStringRegex (sounds_array, sounds_regex_array, word_transcription) {
var allIndices = [];
for (var i = 0; i < sounds_regex_array.length; i++) {
var currentSoundRegex = sounds_regex_array[i];
// lookbehind doesn't work in Javascript:
// possible workaround - http://blog.stevenlevithan.com/archives/javascript-regex-lookbehind
if (currentSoundRegex === "(?<!d)Κ’") {
currentSoundRegex = "Κ’";
}
if (currentSoundRegex === "(?<!t)Κƒ") {
currentSoundRegex = "Κƒ";
}
var pattern = new RegExp(currentSoundRegex, "g");
var indices = getIndicesOfRegex (currentSoundRegex, pattern, word_transcription);
var currentSound = sounds_array[i];
for (var j = 0; j < indices.length; j++) {
allIndices.push([indices[j], currentSound]);
}
}
return allIndices;
}

Categories

Resources