XOR of two hex strings in JavaScript - javascript

var hex1 = "B1C85C061C98E713DEF0E2EDDDDB432738674C9F8962F09B75E943D55F9FB39F";
var hex2 = "121B0D3327A21B8048FC7CA6FD07AACC0D8DF59B99DB098686696573E3686E6C";
var result = hex1 ^ hex2; //XOR the values
console.log(result); // outputs: 0 which does not sound good.
Any ideas how to perform XOR operations on hex values?

Bitwise operations in JavaScript only work on numeric values.
You should parseInt(hexString, 16) your hex string before. Specifically in your case this wouldn't work because your hex is too big for a number. You would have to create your own customized XOR function.
Take a look at this link: How to convert hex string into a bytes array, and a bytes array in the hex string?
The resulting bytearray will be ellegible for a manual XOR. Byte by byte. Maybe this will help: Java XOR over two arrays.

If you are on Nodejs, you could transform the hex strings to Buffers then use map to build the resulting string.
function xor(hex1, hex2) {
const buf1 = Buffer.from(hex1, 'hex');
const buf2 = Buffer.from(hex2, 'hex');
const bufResult = buf1.map((b, i) => b ^ buf2[i]);
return bufResult.toString('hex');
}

str = 'abc';
c = '';
key = 'K';
for(i=0; i<str.length; i++) {
c += String.fromCharCode(str[i].charCodeAt(0).toString(10) ^ key.charCodeAt(0).toString(10)); // XORing with letter 'K'
}
return c;
Output of string 'abc':
"*)("

You can use a function like this.
function xor(a, b) {
if (!Buffer.isBuffer(a)) a = new Buffer(a)
if (!Buffer.isBuffer(b)) b = new Buffer(b)
var res = []
if (a.length > b.length) {
for (var i = 0; i < b.length; i++) {
res.push(a[i] ^ b[i])
}
} else {
for (var i = 0; i < a.length; i++) {
res.push(a[i] ^ b[i])
}
}
return new Buffer(res);
}
Source: https://github.com/czzarr/node-bitwise-xor

Below is function that takes in 2 strings like "041234FFFFFFFFFF" and "0000000709000003" (a classic example of pin block and card block)
Expected result from the above 2 strings is "041234F8F6FFFFFC"
function bitwiseXorHexString(pinBlock1, pinBlock2) {
var result = ''
for (let index = 0; index < 16; index++) {
const temp = (parseInt(pinBlock1.charAt(index), 16) ^ parseInt(pinBlock2.charAt(index), 16)).toString(16).toUpperCase()
result += temp
}
return result
}
Note: This was made to xor 2 strings of fixed length 16. You may modify it as per your needs.

Related

How can I convert an ArrayBuffer to 11-Bits Values and Back again in Javascript

I need to convert a 256-bit ArrayBuffer into 24 11-bit values and then back again.
Is there a simple code snippet to handle this type of operation.
I have this version, to convert it to 24 11 bit values.
var newBuffer = new Uint8Array(data);
var result = [];
for (var i =0, l = 24;i<l;i++){
var index = parseInt((i*11.0)/8.0);
var subBuffer;
if (i==23){
var proxyBuffer = new Uint8Array(2);
proxyBuffer.set(newBuffer.slice(index,index+1));
subBuffer = proxyBuffer;
}else{
subBuffer = newBuffer.slice(index,index+2);
}
var value = new Uint16Array(subBuffer.buffer);
value = value >> (i*3)%8;
value = value % 2048;
result.push(value);
}
console.log(result);
Using bit operations can simplify the conversion process - using parseInt and decimal arithmetic is not an easy approach.
The concept code below uses plain arrays of octet and 11 bit values. While Uint8Array and Uint16Array types may be a better choice, creating typed arrays and/or converting an arrayBuffer to and from a suitable array type is not included.
function ui8To11( buffer8) {
var buffer11 = [];
var acc = 0;
var accBits = 0;
function add( octet) {
acc = (octet << accBits) | acc;
accBits += 8;
if( accBits >=11) {
buffer11.push( acc & 0x7ff);
acc >>= 11;
accBits -= 11;
}
}
function flush() {
if( accBits) {
buffer11.push( acc);
}
}
buffer8.forEach( add);
flush();
return buffer11;
}
function ui11To8( buffer11) {
var buffer8 = [];
var acc = 0;
var accBits = 0;
function add( ui11) {
acc = (ui11 << accBits) | acc;
accBits += 11;
while( accBits >= 8) {
buffer8.push( acc & 0xff);
acc >>= 8;
accBits -= 8;
}
}
function flush() {
if( accBits) {
buffer8.push( acc);
}
}
buffer11.forEach( add);
flush();
return buffer8;
}
var buffer8 = [1,2,3]; // 8 bit values, least significant octet at index 0
console.log("octets: ", buffer8);
var buffer11 = ui8To11( buffer8);
console.log("undectets: ", buffer11);
var reconstructed = ui11To8( buffer11)
console.log("convertedBack", reconstructed);
There is an assumption here that the input array is little-endian, as in each entry in the input array is more significant than the previous entry.
Conversion between 8 and 11 bit values and back again follows a similar pattern, but pushing bits from the accumulator to the output array requires a loop when converting from a higher number of bits to a lower.
The example takes 3 x 8 bit values (24 bits in total) and produces 3 x 11 bit values (33 bits in total). Converting back 33 bits to uint8 integers produces 5 x 8 bit values ( 40 bits). You may need to add code to limit the number of integers pushed into output arrays within conversion routines or truncate output arrays returned as required.
There is a library called Uint1Array which makes everything much easier.
var arr = new Uint1Array(buffer);
for (let i=0, l=arr.length; i<l; i+=11){
var zero = new Uint1Array(16);
for (let index =0, length = 11; index<length;index++){
zero[index]=arr[i+index];
}
let bit16 = new Uint16Array(zero.buffer)[0];
outPut.push(bit16);
}
console.log(outPut);
var bit256 = new Uint1Array(256);
for (let i=0, l=outPut.length;i<l;i++){
var hold = new Uint16Array(1);
hold[0]=outPut[i];
let bit16 = new Uint1Array(hold.buffer);
let bit11 = bit16.slice(0,11);
for (let i2=0, l2=11;i2<l2;i2++){
bit256[(i*11)+i2]=bit11[i2];
}
}

How can I remove the last emoji of a group of emojis in javascript?

Let's say I have this 3 emojis in a string: πŸ˜€πŸŽƒπŸ‘ͺ
There are not any spaces or any other character except emojis in the string.
How can I remove the last emoji in javascript?
The answer below doesn't use any special package and safely removes the last emoji
function safeEmojiBackspace(str)
{
let initialRealCount = fancyCount(str);
while(str.length > 0 && fancyCount(str) !== initialRealCount - 1)
{
str = str.substring(0,str.length - 1);
}
return str;
}
function fancyCount(str){
const joiner = "\u{200D}";
const split = str.split(joiner);
let count = 0;
for(const s of split){
//removing the variation selectors
const num = Array.from(s.split(/[\ufe00-\ufe0f]/).join("")).length;
count += num;
}
//assuming the joiners are used appropriately
return count / split.length;
}
Sample usage
let str = "somethingπŸ˜€πŸŽƒπŸ‘ͺ";
str = safeEmojiBackspace(str);//"somethingπŸ˜€πŸŽƒ"
You can do this. It will always remove the last emoji.
function removeEmoji() {
var emoStringArray = document.getElementById('emoji').innerHTML;
var lastIndex = emoStringArray.lastIndexOf(" ");
var stripedEmoStringArray = emoStringArray.substring(0, lastIndex);
document.getElementById('emoji').innerHTML = stripedEmoStringArray;
}
<p id="emoji">
πŸ˜€ πŸŽƒ πŸ‘ͺ
</p>
<button onclick="removeEmoji()">Remove</button>
I hope this is what you want.
var emoString = "πŸ˜€ πŸŽƒ πŸ‘ͺ";
emoString = emoString.slice(0, -2);
However, this would work only if you have 3 emojis in total. Hence to achieve a generalised solution, you can use the underscore functions split() and javascript function join() :
var emoString = "πŸ˜€ πŸŽƒ πŸ‘ͺ";
emoString = _.rest(emoString.split(' ')).join(' ')
Hope this will solve your issue.
Ok, here is how I solved it:
function deleteEmoji(emojiStr) {
let emojisArray = emojiStr.match(/([\uD800-\uDBFF][\uDC00-\uDFFF])/g);
emojisArray = emojisArray.splice(0, emojisArray.length - 1);
return emojisArray.join("");
}
let emojitext = "πŸ˜€πŸŽƒπŸ‘ͺ";
console.log(deleteEmoji(emojitext));
I was actually surprised that unicode in this day an age is still not fully supported in browsers. I assume a lot of this is down to windows and it's version of UTF-16.
The OP I believe has found his own solution to the original problem, but I thought there has to be a more generic solution to surrogate pair unicode characters.
Anyway, so my solution is convert the text into a UTF-32 array, these can then be manipulated must easier, using slice etc.
After you have done what you want to the array, just convert back.
Below is an example.
Some of the code I got from -> Is it possible to convert a string containing "high" unicode chars to an array consisting of dec values derived from utf-32 ("real") codes?
and http://speakingjs.com/es5/ch24.html
function decodeUnicode(str) {
const r = [];
let i = 0;
while(i < str.length) {
let chr = str.charCodeAt(i++);
if(chr >= 0xD800 && chr <= 0xDBFF) {
var low = str.charCodeAt(i++);
r.push(0x10000 +
((chr - 0xD800) << 10) | (low - 0xDC00));
} else {
r.push(chr);
}
}
return r;
}
function toUTF16(codePoint) {
const TEN_BITS = parseInt('1111111111', 2);
if (codePoint <= 0xFFFF) { return codePoint; }
codePoint -= 0x10000;
const leadingSurrogate = 0xD800 | (codePoint >> 10);
const trailingSurrogate = 0xDC00 | (codePoint & TEN_BITS);
return String.fromCharCode(leadingSurrogate) +
String.fromCharCode(trailingSurrogate);
}
function encodeUnicode(data) {
return data.reduce((a, v) => {
a += toUTF16(v);
return a;
},"");
}
var unicode = decodeUnicode("πŸ˜€πŸŽƒπŸ‘ͺ");
for (let l = 0; l < unicode.length; l ++)
console.log(encodeUnicode(
unicode.slice(0, l ? -l : unicode.length)));
console.log("pick some random ones");
let str = "";
for (let l = 0; l < 20; l ++) {
let rnd = Math.trunc(Math.random()*unicode.length);
str += encodeUnicode(unicode.slice(rnd,rnd+1));
}
console.log(str);

Longitudinal redundancy check in Javascript

I'm working with a system that integrates a Point of Sell (POS) device, I use chrome serial to scan ports and be able to read credit card data.
The problem I'm facing is that I need to concat the LRC from a string in this format:
STX = '\002' (2 HEX) (Start of text)
LLL = Length of data (doesn't include STX or ETX but command).
Command C50 {C = A message from PC to POS, 50 the actual code that "prints" a message on POS}
ETX = '\003' (3 HEX) (End of text)
LRC = Longitudinal Redundancy Check
A message example would be as follows:
'\002014C50HELLO WORLD\003'
Here we can see 002 as STX, 014 is the length from C50 to D, and 003 as ETX.
I found some algorithms in C# like this one or this one and even this one in Java, I even saw this question that was removed from SO on Google's cache, which actually asks the same as I but had no examples or answers.
I also made this Java algorithm:
private int calculateLRC(String str) {
int result = 0;
for (int i = 0; i < str.length(); i++) {
String char1 = str.substring(i, i + 1);
char[] char2 = char1.toCharArray();
int number = char2[0];
result = result ^ number;
}
return result;
}
and tried passing it to Javascript (where I have poor knowledge)
function calculateLRC2(str) {
var result = 0;
for (var i = 0; i < str.length; i++) {
var char1 = str.substring(i, i + 1);
//var char2[] = char1.join('');
var number = char1;
result = result ^ number;
}
return result.toString();
}
and after following the Wikipedia's pseudocode I tried doing this:
function calculateLRC(str) {
var buffer = convertStringToArrayBuffer(str);
var lrc;
for (var i = 0; i < str.length; i++) {
lrc = (lrc + buffer[i]) & 0xFF;
}
lrc = ((lrc ^ 0xFF) + 1) & 0xFF;
return lrc;
}
This is how I call the above method:
var finalMessage = '\002014C50HELLO WORLD\003'
var lrc = calculateLRC(finalMessage);
console.log('lrc: ' + lrc);
finalMessage = finalMessage.concat(lrc);
console.log('finalMessage: ' + finalMessage);
However after trying all these methods, I still can't send a message to POS correctly. I have 3 days now trying to fix this thing and can't do anything more unless I finish it.
Is there anyone that knows another way to calculate LRC or what am I doing wrong here? I need it to be with Javascritpt since POS comunicates with PC through NodeJS.
Oh btw the code from convertStringToArrayBuffer is on the chrome serial documentation which is this one:
var writeSerial=function(str) {
chrome.serial.send(connectionId, convertStringToArrayBuffer(str), onSend);
}
// Convert string to ArrayBuffer
var convertStringToArrayBuffer=function(str) {
var buf=new ArrayBuffer(str.length);
var bufView=new Uint8Array(buf);
for (var i=0; i<str.length; i++) {
bufView[i]=str.charCodeAt(i);
}
return buf;
}
Edit After testing I came with this algorithm which returns a 'z' (lower case) with the following input: \002007C50HOLA\003.
function calculateLRC (str) {
var bytes = [];
var lrc = 0;
for (var i = 0; i < str.length; i++) {
bytes.push(str.charCodeAt(i));
}
for (var i = 0; i < str.length; i++) {
lrc ^= bytes[i];
console.log('lrc: ' + lrc);
//console.log('lrcString: ' + String.fromCharCode(lrc));
}
console.log('bytes: ' + bytes);
return String.fromCharCode(lrc);
}
However with some longer inputs and specialy when trying to read card data, LRC becomes sometimes a Control Character which in my case that I use them on my String, might be a problem. Is there a way to force LRC to avoid those characters? Or maybe I'm doing it wrong and that's why I'm having those characters as output.
I solved LRC issue by calculating it with the following method, after reading #Jack A.'s answer and modifying it to this one:
function calculateLRC (str) {
var bytes = [];
var lrc = 0;
for (var i = 0; i < str.length; i++) {
bytes.push(str.charCodeAt(i));
}
for (var i = 0; i < str.length; i++) {
lrc ^= bytes[i];
}
return String.fromCharCode(lrc);
}
Explanation of what it does:
1st: it converts the string received to it's ASCII equivalent (charCodeAt()).
2nd: it calculates LRC by doing a XOR operation between last calculated LRC (0 on 1st iteration) and string's ASCII for each char.
3rd: it converts from ASCII to it's equivalent chat (fromCharCode()) and returns this char to main function (or whatever function called it).
Your pseudocode-based algorithm is using addition. For the XOR version, try this:
function calculateLRC(str) {
var buffer = convertStringToArrayBuffer(str);
var lrc = 0;
for (var i = 0; i < str.length; i++) {
lrc = (lrc ^ buffer[i]) & 0xFF;
}
return lrc;
}
I think your original attempt at the XOR version was failing because you needed to get the character code. The number variable still contained a string when you did result = result ^ number, so the results were probably not what you expected.
This is a SWAG since I don't have Node.JS installed at the moment so I can't verify it will work.
Another thing I would be concerned about is character encoding. JavaScript uses UTF-16 for text, so converting any non-ASCII characters to 8-bit bytes may give unexpected results.

How do I implement hex2bin()?

I need to communicate between Javascript and PHP (I use jQuery for AJAX), but the output of the PHP script may contain binary data. That's why I use bin2hex() and json_encode() on PHP side.
How do I convert the hexadecimal string in binary string, with JavaScript?
To answer your question:
function Hex2Bin(n){if(!checkHex(n))return 0;return parseInt(n,16).toString(2)}
Here are some further functions you may find useful for working with binary data:
//Useful Functions
function checkBin(n){return/^[01]{1,64}$/.test(n)}
function checkDec(n){return/^[0-9]{1,64}$/.test(n)}
function checkHex(n){return/^[0-9A-Fa-f]{1,64}$/.test(n)}
function pad(s,z){s=""+s;return s.length<z?pad("0"+s,z):s}
function unpad(s){s=""+s;return s.replace(/^0+/,'')}
//Decimal operations
function Dec2Bin(n){if(!checkDec(n)||n<0)return 0;return n.toString(2)}
function Dec2Hex(n){if(!checkDec(n)||n<0)return 0;return n.toString(16)}
//Binary Operations
function Bin2Dec(n){if(!checkBin(n))return 0;return parseInt(n,2).toString(10)}
function Bin2Hex(n){if(!checkBin(n))return 0;return parseInt(n,2).toString(16)}
//Hexadecimal Operations
function Hex2Bin(n){if(!checkHex(n))return 0;return parseInt(n,16).toString(2)}
function Hex2Dec(n){if(!checkHex(n))return 0;return parseInt(n,16).toString(10)}
JavaScript doesn't have support for binary data. Nevertheless you can emulate this with regular strings.
var hex = "375771", // ASCII HEX: 37="7", 57="W", 71="q"
bytes = [],
str;
for(var i=0; i< hex.length-1; i+=2){
bytes.push(parseInt(hex.substr(i, 2), 16));
}
str = String.fromCharCode.apply(String, bytes);
alert(str); // 7Wq
function hex2bin(hex)
{
var bytes = [], str;
for(var i=0; i< hex.length-1; i+=2)
bytes.push(parseInt(hex.substr(i, 2), 16));
return String.fromCharCode.apply(String, bytes);
}
thanks to Andris!
Other useful information about this topic (dex2bin,bin2dec) can be found here.
According to that, here is a bin2hex solution:
parseInt(1100,2).toString(16); //--> c
Although not an answer to the actual question, it is perhaps useful in this case to also know how to reverse the process:
function bin2hex (bin)
{
var i = 0, l = bin.length, chr, hex = ''
for (i; i < l; ++i)
{
chr = bin.charCodeAt(i).toString(16)
hex += chr.length < 2 ? '0' + chr : chr
}
return hex
}
As an example, using hex2bin on b637eb9146e84cb79f6d981ac9463de1 returns ¢7ëFèL·mÉF=Ñ, and then passing this to bin2hex returns b637eb9146e84cb79f6d981ac9463de1.
It might also be useful to prototype these functions to the String object:
String.prototype.hex2bin = function ()
{
var i = 0, l = this.length - 1, bytes = []
for (i; i < l; i += 2)
{
bytes.push(parseInt(this.substr(i, 2), 16))
}
return String.fromCharCode.apply(String, bytes)
}
String.prototype.bin2hex = function ()
{
var i = 0, l = this.length, chr, hex = ''
for (i; i < l; ++i)
{
chr = this.charCodeAt(i).toString(16)
hex += chr.length < 2 ? '0' + chr : chr
}
return hex
}
alert('b637eb9146e84cb79f6d981ac9463de1'.hex2bin().bin2hex())
All proposed solutions use String.fromCharCode, why not simply using unescape?
String.prototype.hex2bin = function()
{
var i = 0, len = this.length, result = "";
//Converting the hex string into an escaped string, so if the hex string is "a2b320", it will become "%a2%b3%20"
for(; i < len; i+=2)
result += '%' + this.substr(i, 2);
return unescape(result);
}
and then:
alert( "68656c6c6f".hex2bin() ); //shows "hello"
With reference to node.js ( not in browser ).
Basically it's all over-engineered and does not work well.
responses are out of alignment and though text-wise they are the same bit wise everything is all over the place :
curl http://phpimpl.domain.com/testhex.php | xxd
00000000: de56 a735 4739 c01d f2dc e14b ba30 8af0 .Q.%G9.....;.0..
curl http://nodejs.domain.com/ | xxd
00000000: c39e 56c2 a725 4739 c380 c3ad c3b1 c39c ..Q..%G9........
00000010: c3a1 37c2 6b30 c28f c3b0 ..;..0....
The proper way to implement this in node is :
function hex2bin(hex){
return new Buffer(hex,"hex");
}
curl http://nodejs.domain.com/ | xxd
00000000: de56 a735 4739 c01d f2dc e14b ba30 8af0 .Q.%G9.....;.0..
Hope this helps.
Here is an implementation of hex2bin in JS that takes a string and returns Uint8Array, works both in browsers and nodejs,
function hex2bin(hex) {
var length = hex.length / 2;
var result = new Uint8Array(length);
for (var i = 0; i < length; ++i) {
result[i] = parseInt(hex.slice(i * 2, i * 2 + 2), 16);
}
return result;
}
And its inverse,
function bin2hex(bin) {
return Array.from(bin).map(function (x) {
return x.toString(16).padStart(2, '0');
}).join('');
}
If someone needs the other direction (bin to hex), here is it:
function bin2hex(bin) {
return new Buffer(bin).toString("hex");
}
JavaScript does actually contain support for binary data. See Uint8Array.
Just read each byte from the array and convert it into hexadecimal.

JavaScript strings outside of the BMP

BMP being Basic Multilingual Plane
According to JavaScript: the Good Parts:
JavaScript was built at a time when Unicode was a 16-bit character set, so all characters in JavaScript are 16 bits wide.
This leads me to believe that JavaScript uses UCS-2 (not UTF-16!) and can only handle characters up to U+FFFF.
Further investigation confirms this:
> String.fromCharCode(0x20001);
The fromCharCode method seems to only use the lowest 16 bits when returning the Unicode character. Trying to get U+20001 (CJK unified ideograph 20001) instead returns U+0001.
Question: is it at all possible to handle post-BMP characters in JavaScript?
2011-07-31: slide twelve from Unicode Support Shootout: The Good, The Bad, & the (mostly) Ugly covers issues related to this quite well:
Depends what you mean by β€˜support’. You can certainly put non-UCS-2 characters in a JS string using surrogates, and browsers will display them if they can.
But, each item in a JS string is a separate UTF-16 code unit. There is no language-level support for handling full characters: all the standard String members (length, split, slice etc) all deal with code units not characters, so will quite happily split surrogate pairs or hold invalid surrogate sequences.
If you want surrogate-aware methods, I'm afraid you're going to have to start writing them yourself! For example:
String.prototype.getCodePointLength= function() {
return this.length-this.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g).length+1;
};
String.fromCodePoint= function() {
var chars= Array.prototype.slice.call(arguments);
for (var i= chars.length; i-->0;) {
var n = chars[i]-0x10000;
if (n>=0)
chars.splice(i, 1, 0xD800+(n>>10), 0xDC00+(n&0x3FF));
}
return String.fromCharCode.apply(null, chars);
};
I came to the same conclusion as bobince. If you want to work with strings containing unicode characters outside of the BMP, you have to reimplement javascript's String methods. This is because javascript counts characters as each 16-bit code value. Symbols outside of the BMP need two code values to be represented. You therefore run into a case where some symbols count as two characters and some count only as one.
I've reimplemented the following methods to treat each unicode code point as a single character: .length, .charCodeAt, .fromCharCode, .charAt, .indexOf, .lastIndexOf, .splice, and .split.
You can check it out on jsfiddle: http://jsfiddle.net/Y89Du/
Here's the code without comments. I tested it, but it may still have errors. Comments are welcome.
if (!String.prototype.ucLength) {
String.prototype.ucLength = function() {
// this solution was taken from
// http://stackoverflow.com/questions/3744721/javascript-strings-outside-of-the-bmp
return this.length - this.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g).length + 1;
};
}
if (!String.prototype.codePointAt) {
String.prototype.codePointAt = function (ucPos) {
if (isNaN(ucPos)){
ucPos = 0;
}
var str = String(this);
var codePoint = null;
var pairFound = false;
var ucIndex = -1;
var i = 0;
while (i < str.length){
ucIndex += 1;
var code = str.charCodeAt(i);
var next = str.charCodeAt(i + 1);
pairFound = (0xD800 <= code && code <= 0xDBFF && 0xDC00 <= next && next <= 0xDFFF);
if (ucIndex == ucPos){
codePoint = pairFound ? ((code - 0xD800) * 0x400) + (next - 0xDC00) + 0x10000 : code;
break;
} else{
i += pairFound ? 2 : 1;
}
}
return codePoint;
};
}
if (!String.fromCodePoint) {
String.fromCodePoint = function () {
var strChars = [], codePoint, offset, codeValues, i;
for (i = 0; i < arguments.length; ++i) {
codePoint = arguments[i];
offset = codePoint - 0x10000;
if (codePoint > 0xFFFF){
codeValues = [0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF)];
} else{
codeValues = [codePoint];
}
strChars.push(String.fromCharCode.apply(null, codeValues));
}
return strChars.join("");
};
}
if (!String.prototype.ucCharAt) {
String.prototype.ucCharAt = function (ucIndex) {
var str = String(this);
var codePoint = str.codePointAt(ucIndex);
var ucChar = String.fromCodePoint(codePoint);
return ucChar;
};
}
if (!String.prototype.ucIndexOf) {
String.prototype.ucIndexOf = function (searchStr, ucStart) {
if (isNaN(ucStart)){
ucStart = 0;
}
if (ucStart < 0){
ucStart = 0;
}
var str = String(this);
var strUCLength = str.ucLength();
searchStr = String(searchStr);
var ucSearchLength = searchStr.ucLength();
var i = ucStart;
while (i < strUCLength){
var ucSlice = str.ucSlice(i,i+ucSearchLength);
if (ucSlice == searchStr){
return i;
}
i++;
}
return -1;
};
}
if (!String.prototype.ucLastIndexOf) {
String.prototype.ucLastIndexOf = function (searchStr, ucStart) {
var str = String(this);
var strUCLength = str.ucLength();
if (isNaN(ucStart)){
ucStart = strUCLength - 1;
}
if (ucStart >= strUCLength){
ucStart = strUCLength - 1;
}
searchStr = String(searchStr);
var ucSearchLength = searchStr.ucLength();
var i = ucStart;
while (i >= 0){
var ucSlice = str.ucSlice(i,i+ucSearchLength);
if (ucSlice == searchStr){
return i;
}
i--;
}
return -1;
};
}
if (!String.prototype.ucSlice) {
String.prototype.ucSlice = function (ucStart, ucStop) {
var str = String(this);
var strUCLength = str.ucLength();
if (isNaN(ucStart)){
ucStart = 0;
}
if (ucStart < 0){
ucStart = strUCLength + ucStart;
if (ucStart < 0){ ucStart = 0;}
}
if (typeof(ucStop) == 'undefined'){
ucStop = strUCLength - 1;
}
if (ucStop < 0){
ucStop = strUCLength + ucStop;
if (ucStop < 0){ ucStop = 0;}
}
var ucChars = [];
var i = ucStart;
while (i < ucStop){
ucChars.push(str.ucCharAt(i));
i++;
}
return ucChars.join("");
};
}
if (!String.prototype.ucSplit) {
String.prototype.ucSplit = function (delimeter, limit) {
var str = String(this);
var strUCLength = str.ucLength();
var ucChars = [];
if (delimeter == ''){
for (var i = 0; i < strUCLength; i++){
ucChars.push(str.ucCharAt(i));
}
ucChars = ucChars.slice(0, 0 + limit);
} else{
ucChars = str.split(delimeter, limit);
}
return ucChars;
};
}
More recent JavaScript engines have String.fromCodePoint.
const ideograph = String.fromCodePoint( 0x20001 ); // outside the BMP
Also a code-point iterator, which gets you the code-point length.
function countCodePoints( str )
{
const i = str[Symbol.iterator]();
let count = 0;
while( !i.next().done ) ++count;
return count;
}
console.log( ideograph.length ); // gives '2'
console.log( countCodePoints(ideograph) ); // '1'
Yes, you can. Although support to non-BMP characters directly in source documents is optional according to the ECMAScript standard, modern browsers let you use them. Naturally, the document encoding must be properly declared, and for most practical purposes you would need to use the UTF-8 encoding. Moreover, you need an editor that can handle UTF-8, and you need some input method(s); see e.g. my Full Unicode Input utility.
Using suitable tools and settings, you can write var foo = '𠀁'.
The non-BMP characters will be internally represented as surrogate pairs, so each non-BMP character counts as 2 in the string length.
Using for (c of this) instruction, one can make various computations on a string that contains non-BMP characters. For instance, to compute the string length, and to get the nth character of the string:
String.prototype.magicLength = function()
{
var c, k;
k = 0;
for (c of this) // iterate each char of this
{
k++;
}
return k;
}
String.prototype.magicCharAt = function(n)
{
var c, k;
k = 0;
for (c of this) // iterate each char of this
{
if (k == n) return c + "";
k++;
}
return "";
}
This old topic has now a simple solution in ES6:
Split characters into an array
simple version
[..."πŸ˜΄πŸ˜„πŸ˜ƒβ›”πŸŽ πŸš“πŸš‡"] // ["😴", "πŸ˜„", "πŸ˜ƒ", "β›”", "🎠", "πŸš“", "πŸš‡"]
Then having each one separated you can handle them easily for most common cases.
Credit: DownGoat
Full solution
To overcome special emojis as the one in the comment, one can search for the connection charecter (char code 8205 in UTF-16) and make some modifications. Here is how:
let myStr = "πŸ‘©β€πŸ‘©β€πŸ‘§β€πŸ‘§πŸ˜ƒπŒ†"
let arr = [...myStr]
for (i = arr.length-1; i--; i>= 0) {
if (arr[i].charCodeAt(0) == 8205) { // special combination character
arr[i-1] += arr[i] + arr[i+1]; // combine them back to a single emoji
arr.splice(i, 2)
}
}
console.log(arr.length) //3
Haven't found a case where this doesn't work. Comment if you do.
To conclude
it seems that JS uses the 8205 char code to represent UCS-2 characters as a UTF-16 combinations.

Categories

Resources