Encoding a String to ISO-8859-1 - javascript

Is there a way that I can encode my string Olá to Ol&#225; in JavaScript? And do this for all accented characters.
My header looks like:
Content-Type: text/html;charset=iso-8859-1

If you already have a proper string, you can do it like this:
ECMAScript ≥ 6, with Emoji support
(see: https://medium.com/@giltayar/iterating-over-emoji-characters-the-es6-way-f06e4589516)
/**
 * Replaces every non-ASCII code point of `s` with a decimal HTML character
 * reference (`&#NNN;`) and leaves ASCII characters untouched.
 * The string is walked by code point, so an astral character such as an
 * emoji produces a single reference.
 *
 * @param {string} s - Text to escape.
 * @returns {string} The escaped text.
 */
function decimalEscape(s) {
  return [...s]
    .map((symbol) => {
      const codePoint = symbol.codePointAt(0);
      return codePoint > 127 ? '&#' + codePoint + ';' : symbol;
    })
    .join('');
}
ECMAScript ≤ 5, without Emoji support:
/**
 * ES5 variant: replaces every UTF-16 code unit above 127 with a decimal HTML
 * character reference (`&#NNN;`). Characters outside the BMP are emitted as
 * two references, one per surrogate.
 *
 * @param {string} s - Text to escape.
 * @returns {string} The escaped text.
 */
function decimalEscape(s) {
  var escaped = '';
  var total = s.length;
  var index = 0;
  var code;
  while (index < total) {
    code = s.charCodeAt(index);
    escaped += code > 127 ? '&#' + code + ';' : s.charAt(index);
    index += 1;
  }
  return escaped;
}
Usage:
decimalEscape("Olá"); // -> returns "Ol&#225;"
If you don't have a proper JavaScript string yet (just a bunch of bytes in some kind of buffer or if the string you have is already in the wrong encoding), you will have to fix the string first, of course.

Related

Equivalent of Java's getBytes in JavaScript for different encodings

I have a function in Java that I need to convert to JavaScript and that contains this line:
byte[] bytes = ttText.getBytes(Charset.forName("Cp1250"));
ttText is String. I need to do the same. I need to get the bytes of a string encoded in Cp1250 (windows-1250), modify those bytes and then convert it back to string. Is there a way how to do it in JavaScript?
I discovered for example TextEncoder and TextDecoder but the support for different encodings than UTF-8 was dropped some time ago.
var cp1250 = '€ ‚ „…†‡ ‰Š‹ŚŤŽŹ ‘’“”•–— ™š›śťžź ˇ˘Ł¤Ą¦§¨©Ş«¬­®Ż°±˛ł´µ¶·¸ąş»Ľ˝ľżŔÁÂĂÄĹĆÇČÉĘËĚÍÎĎĐŃŇÓÔŐÖ×ŘŮÚŰÜÝŢßŕáâăäĺćçčéęëěíîďđńňóôőö÷řůúűüýţ˙';
/**
 * Encodes a string into an array of windows-1250 byte values.
 *
 * ASCII code units (0-127) map to themselves. Other characters are looked up
 * in the `cp1250` table, whose index i corresponds to byte 128 + i. A
 * character missing from the table keeps its own code when it fits in a
 * byte, and is replaced by a space (32) otherwise.
 *
 * @param {string} text - Text to encode.
 * @returns {number[]} One byte value per UTF-16 code unit of `text`.
 */
function encodeCP1250(text) {
  var buf = [];
  for (var i = 0; i < text.length; i++) {
    var code = text.charCodeAt(i);
    // Only consult the table for non-ASCII input: the table uses plain
    // spaces as placeholders for unassigned bytes, so looking up ' ' would
    // wrongly hit the first placeholder instead of encoding to 32.
    if (code > 127) {
      var index = cp1250.indexOf(text[i]);
      if (index >= 0) {
        code = index + 128;
      }
    }
    buf.push(code > 255 ? 32 : code);
  }
  return buf;
}
/**
 * Decodes an array of windows-1250 byte values back into a string.
 * Bytes up to 127 are taken as ASCII; higher bytes are read from the
 * `cp1250` table at index (byte - 128).
 *
 * @param {ArrayLike<number>} buf - Byte values to decode.
 * @returns {string} The decoded text.
 */
function decodeCP1250(buf) {
  var decoded = '';
  var pos = 0;
  while (pos < buf.length) {
    var byte = buf[pos];
    if (byte > 127) {
      decoded += cp1250[byte - 128];
    } else {
      decoded += String.fromCharCode(byte);
    }
    pos += 1;
  }
  return decoded;
}
var buf = encodeCP1250('AÁÂĂÄ'); // [65, 193, 194, 195, 196]
var text = decodeCP1250(buf); // 'AÁÂĂÄ'
Upd: Chrome and Firefox have TextDecoder as experimental feature, but TextEncoder works only with UTF-8.
Try this.
https://mths.be/windows-1250
This looks promising. It provides support for both encoding and decoding.
All you need to do is add the library and use the methods.
var encodedData = windows1250.encode(text);

How to convert text to binary code in JavaScript?

I want JavaScript to translate text in a textarea into binary code.
For example, if a user types in "TEST" into the textarea, the value "01010100 01000101 01010011 01010100" should be returned.
I would like to avoid using a switch statement to assign each character a binary code value (e.g. case "T": return "01010100) or any other similar technique.
Here's a JSFiddle to show what I mean. Is this possible in native JavaScript?
What you should do is convert every char using charCodeAt function to get the Ascii Code in decimal. Then you can convert it to Binary value using toString(2):
/**
 * Reads the text of input #ti1 and writes its binary representation to
 * input #ti2: one zero-padded 8-bit group per UTF-16 code unit, separated by
 * single spaces (e.g. "TEST" -> "01010100 01000101 01010011 01010100").
 */
function convert() {
  var output = document.getElementById("ti2");
  var input = document.getElementById("ti1").value;
  var octets = [];
  for (var i = 0; i < input.length; i++) {
    // toString(2) drops leading zeros, so pad each group back to 8 bits.
    octets.push(input.charCodeAt(i).toString(2).padStart(8, "0"));
  }
  // join() avoids the trailing separator that string concatenation left.
  output.value = octets.join(" ");
}
<input id="ti1" value ="TEST"/>
<input id="ti2"/>
<button onClick="convert();">Convert!</button>
And here's a fiddle: http://jsfiddle.net/fA24Y/1/
This might be the simplest you can get:
/**
 * Converts text to space-separated binary groups, one per UTF-16 code unit,
 * each zero-padded to at least 8 bits.
 *
 * @param {string} string - Text to convert.
 * @returns {string} e.g. "TEST" -> "01010100 01000101 01010011 01010100".
 */
function text2Binary(string) {
  return string.split('').map(function (char) {
    // toString(2) drops leading zeros; restore them for a fixed-width group.
    return char.charCodeAt(0).toString(2).padStart(8, '0');
  }).join(' ');
}
traverse the string
convert every character to their char code
convert the char code to binary
push it into an array and add the left 0s
return a string separated by space
Code:
/**
 * Converts text to space-separated binary groups, one per UTF-16 code unit,
 * zero-padded to at least 8 bits.
 *
 * @param {string} text - Text to convert.
 * @returns {string} e.g. "!a" -> "00100001 01100001".
 */
function textToBin(text) {
  var length = text.length,
      output = [];
  for (var i = 0; i < length; i++) {
    var bin = text.charCodeAt(i).toString(2);
    // padStart never throws: groups wider than 8 bits (code units above 255)
    // are kept as-is instead of hitting a negative array length.
    output.push(bin.padStart(8, "0"));
  }
  return output.join(" ");
}
textToBin("!a") => "00100001 01100001"
Another way
/**
 * Converts text to space-separated binary groups, one per code point,
 * zero-padded to at least 8 bits.
 *
 * @param {string} text - Text to convert.
 * @returns {string} e.g. "!a" -> "00100001 01100001".
 */
function textToBin(text) {
  return Array.from(text)
    // padStart leaves groups wider than 8 bits untouched, whereas
    // '0'.repeat(8 - length) threw a RangeError for them.
    .map((char) => char.charCodeAt().toString(2).padStart(8, '0'))
    .join(' ');
}
Here's a pretty generic, native implementation, that I wrote some time ago,
// ABC - a generic, native JS (A)scii(B)inary(C)onverter.
// (c) 2013 Stephan Schmitz <eyecatchup#gmail.com>
// License: MIT, http://eyecatchup.mit-license.org
// URL: https://gist.github.com/eyecatchup/6742657
var ABC = {
  /**
   * Replaces every run of eight binary digits (with optional surrounding
   * whitespace) by the character with that code.
   */
  toAscii: function (bin) {
    var octetPattern = /\s*[01]{8}\s*/g;
    return bin.replace(octetPattern, function (octet) {
      var code = parseInt(octet, 2);
      return String.fromCharCode(code);
    });
  },
  /**
   * Replaces every character by its zero-padded binary code. A space follows
   * each group unless `spaceSeparatedOctets` is loosely equal to false.
   */
  toBinary: function (str, spaceSeparatedOctets) {
    return str.replace(/[\s\S]/g, function (ch) {
      var octet = ABC.zeroPad(ch.charCodeAt().toString(2));
      if (false == spaceSeparatedOctets) {
        return octet;
      }
      return octet + " ";
    });
  },
  /** Left-pads `num` with zeros up to eight characters. */
  zeroPad: function (num) {
    var digits = String(num);
    return "00000000".slice(digits.length) + num;
  }
};
and to be used as follows:
var binary1 = "01100110011001010110010101101100011010010110111001100111001000000110110001110101011000110110101101111001",
binary2 = "01100110 01100101 01100101 01101100 01101001 01101110 01100111 00100000 01101100 01110101 01100011 01101011 01111001",
binary1Ascii = ABC.toAscii(binary1),
binary2Ascii = ABC.toAscii(binary2);
console.log("Binary 1: " + binary1);
console.log("Binary 1 to ASCII: " + binary1Ascii);
console.log("Binary 2: " + binary2);
console.log("Binary 2 to ASCII: " + binary2Ascii);
console.log("Ascii to Binary: " + ABC.toBinary(binary1Ascii)); // default: space-separated octets
console.log("Ascii to Binary /wo spaces: " + ABC.toBinary(binary1Ascii, 0)); // 2nd parameter false to not space-separate octets
Source is on Github (gist): https://gist.github.com/eyecatchup/6742657
Hope it helps. Feel free to use for whatever you want (well, at least for whatever MIT permits).
// Prints "TEST" as space-separated, zero-padded 8-bit binary groups.
var PADDING = "00000000";
var string = "TEST";
var resultArray = [];
for (var i = 0; i < string.length; i++) {
  var compact = string.charCodeAt(i).toString(2);
  // Take the missing leading zeros from PADDING. Slicing `compact` itself
  // would prepend its own first digits, e.g. "11010100" for "T".
  var padded = PADDING.substring(0, PADDING.length - compact.length) + compact;
  resultArray.push(padded);
}
console.log(resultArray.join(" "));
The other answers will work for most cases. But it's worth noting that charCodeAt() and related methods return UTF-16 code units, not UTF-8 bytes, so characters outside the standard ASCII range do not produce the 8-bit groups you would get from the UTF-8 encoding of the text. Here's a workaround.
// UTF-8 to binary
/**
 * Encodes `s` as UTF-8 and returns the bytes as one string of bits,
 * eight per byte, with no separators.
 */
var utf8ToBin = function (s) {
  // encodeURIComponent + unescape yields one character per UTF-8 byte.
  var bytes = unescape(encodeURIComponent(s));
  var bits = [];
  for (var pos = 0; pos < bytes.length; pos += 1) {
    var octet = bytes.charCodeAt(pos).toString(2);
    while (octet.length % 8 != 0) {
      octet = '0' + octet;
    }
    bits.push(octet);
  }
  return bits.join('');
};
// Binary to UTF-8
/**
 * Inverse of the bit-string encoding: reads `s` eight bits at a time,
 * turns each group into a %XX escape and decodes the result as UTF-8.
 */
var binToUtf8 = function (s) {
  var escaped = '';
  for (var start = 0; start < s.length; start += 8) {
    var hex = parseInt(s.substr(start, 8), 2).toString(16);
    if (hex.length % 2 != 0) {
      hex = '0' + hex;
    }
    escaped += '%' + hex;
  }
  return decodeURIComponent(escaped);
};
The escape/unescape() functions are deprecated. If you need polyfills for them, you can check out the more comprehensive UTF-8 encoding example found here: http://jsfiddle.net/47zwb41o
Just a hint into the right direction
// Collects the zero-padded 8-bit binary form of every letter of `foo`.
var foo = "TEST",
    res = [ ];
foo.split('').forEach(function( letter ) {
  var digits = letter.charCodeAt( 0 ).toString( 2 );
  var missing = 8 - digits.length;
  var zeros = new Array( missing + 1 ).join( '0' );
  res.push( zeros + digits );
});
console.log( res );
8-bit characters with leading 0
'sometext'
.split('')
.map((char) => '00'.concat(char.charCodeAt(0).toString(2)).slice(-8))
.join(' ');
If you need 6 or 7 bit, just change .slice(-8)
Thank you Majid Laissi for your answer
I made 2 functions out from your code:
The goal was to implement conversion of a string to VARBINARY or BINARY and back.
/**
 * Converts a string to binary groups, one per UTF-16 code unit, each
 * followed by a single space. The input is truncated to `maxBytes`
 * characters first (255 for BINARY, 65535 for VARBINARY).
 */
const stringToBinary = function(string, maxBytes) {
  const clipped = string.length > maxBytes ? string.substring(0, maxBytes) : string;
  return clipped
    .split('')
    .map((unit) => unit.charCodeAt(0).toString(2) + ' ')
    .join('');
};
and the reverse conversion:
/**
 * Converts space-separated binary groups back into a string, one character
 * per group.
 *
 * @param {string} binary - Groups such as "1000001 1000010 ".
 * @returns {string} The decoded text; empty input yields "".
 */
const binaryToString = function(binary) {
  // A trailing (or doubled) separator produces empty tokens. Skip them,
  // otherwise parseInt('') gives NaN and a stray "\u0000" is appended.
  const arrayOfBytes = binary.split(' ').filter((token) => token !== '');
  let stringOutput = '';
  for (const token of arrayOfBytes) {
    stringOutput += String.fromCharCode(parseInt(token, 2));
  }
  return stringOutput;
};
and here is a working example: https://jsbin.com/futalidenu/edit?js,console
Provided you're working in node or a browser with BigInt support, this version cuts costs by saving the expensive string construction for the very end:
const zero = 0n
const shift = 8n
/**
 * Converts an ASCII/Latin-1 string to one string of bits, eight per
 * character, by accumulating the bytes in a BigInt and formatting once.
 *
 * @param {string} str - Text whose code units all fit in one byte.
 * @returns {string} `str.length * 8` binary digits; "" for empty input.
 */
function asciiToBinary (str) {
  const len = str.length
  // 0n formats as "0", so an empty string needs its own exit.
  if (len === 0) return ''
  let n = zero
  for (let i = 0; i < len; i++) {
    n = (n << shift) + BigInt(str.charCodeAt(i))
  }
  return n.toString(2).padStart(len * 8, '0')
}
It's about twice as fast as the other solutions mentioned here including this simple es6+ implementation:
/**
 * Concatenates the binary form of every code point of `s`, each
 * zero-padded to at least 8 digits.
 */
const toBinary = (s) => {
  let bits = '';
  for (const symbol of s) {
    bits += symbol.codePointAt().toString(2).padStart(8, 0);
  }
  return bits;
};
If you need to handle unicode characters, here's this guy:
const zero = 0n
const shift = 8n
const bigShift = 16n
const byte = 255n
/**
 * Converts a string to one string of bits: 8 bits for every UTF-16 code
 * unit up to 255 and 16 bits for every larger code unit (characters
 * outside the BMP therefore take two 16-bit groups, one per surrogate).
 *
 * @param {string} str - Text to convert.
 * @returns {string} The concatenated bit groups; "" for empty input.
 */
function unicodeToBinary (str) {
  const len = str.length
  let n = zero
  let width = 0
  for (let i = 0; i < len; i++) {
    // charCodeAt, not codePointAt: the loop index counts code units, and a
    // full astral code point would not fit in the 16 bits reserved for it.
    const bits = BigInt(str.charCodeAt(i))
    const isWide = bits > byte
    n = (n << (isWide ? bigShift : shift)) + bits
    width += isWide ? 16 : 8
  }
  // Pad to the exact accumulated width so leading zero bytes survive.
  return len === 0 ? '' : n.toString(2).padStart(width, '0')
}
this seems to be the simplified version
Array.from('abc').map((each)=>each.charCodeAt(0).toString(2)).join(" ")
This is as short as you can get. It's based on the top-rated answer but transformed to a reduce function.
"TEST".split("").reduce(function (a, b) { return a + b.charCodeAt(0).toString(2)}, "")
/**
 * Converts text to space-separated binary groups, one per UTF-16 code
 * unit, each zero-padded to at least 8 bits.
 *
 * @param {string} string - Text to convert.
 * @returns {string} e.g. "hi" -> "01101000 01101001".
 */
const textToBinary = (string) => {
  return string
    .split('')
    // toString(2) drops leading zeros; pad to a fixed 8-bit group.
    .map((char) => char.charCodeAt().toString(2).padStart(8, '0'))
    .join(' ');
}
console.log(textToBinary('hello world'))
/** Encodes `f` as UTF-8 and returns its bytes as a string of bits, 8 per byte. */
var UTF8ToBin = function (f) {
  var bytes = unescape(encodeURIComponent(f));
  var bits = "";
  for (var pos = 0; pos < bytes.length; pos++) {
    var octet = bytes.charCodeAt(pos).toString(2);
    while (octet.length % 8 != 0) {
      octet = "0" + octet;
    }
    bits += octet;
  }
  return bits;
};
/** Reads `f` eight bits at a time and decodes the bytes as UTF-8 text. */
var binToUTF8 = function (f) {
  var escaped = "";
  for (var start = 0; start < f.length; start += 8) {
    var hex = parseInt(f.substr(start, 8), 2).toString(16);
    escaped += "%" + (hex.length % 2 == 0 ? hex : "0" + hex);
  }
  return decodeURIComponent(escaped);
};
This is a small minified JavaScript Code to convert UTF8 to Binary and Vice versa.
This is a solution for UTF-8-based textual binary representation. It leverages TextEncoder, which encodes a string to its UTF-8 bytes.
This solution separates characters by spaces. The individual "byte-bits" of multi-byte characters are separated by a minus character (-).
// inspired by https://stackoverflow.com/a/40031979/923560
/**
 * Renders each code point of `inputString` as its UTF-8 bytes in binary.
 * Bytes of one character are joined with "-", characters with " ".
 */
function stringToUtf8BinaryRepresentation(inputString) {
  const encoder = new TextEncoder();
  const groups = [];
  for (const char of Array.from(inputString)) {
    const octets = [];
    for (const byte of encoder.encode(char)) {
      octets.push(byte.toString(2).padStart(8, '0'));
    }
    groups.push(octets.join('-'));
  }
  return groups.join(' ');
}
// ### example usage #########################
/** Logs a separator, the input and its UTF-8 binary representation. */
function print(inputString) {
  console.log("--------------");
  console.log(inputString);
  const representation = stringToUtf8BinaryRepresentation(inputString);
  console.log(representation);
}
// compare with https://en.wikipedia.org/wiki/UTF-8#Encoding
// compare with https://en.wikipedia.org/wiki/UTF-8#Codepage_layout
// compare with UTF-16, which JavaScript uses for strings: https://en.wikipedia.org/wiki/UTF-16#Examples
print("TEST");
print("hello world");
print("$");
print("£");
print("€");
print("한");
print("𐍈");
print("παράδειγμα");
print("🤡");
print("👨‍👩‍👧‍👦");
print("👩🏻‍🤝‍🧑🏿");
print("🇺🇦");
use the code: 'text'.split('').map(e=>{return e.charCodeAt(0).toString(2)}) e.g.-
const text='some text';
const output=text.split('').map(e=>{return e.charCodeAt(0).toString(2)})
Simple using Buffer
const text = "TEST";
[...Buffer.from(text).values()] // [ 84, 69, 83, 84 ]
.map(byte => byte.toString(2).padStart(8, 0)) // [ '01010100', '01000101', '01010011', '01010100' ]
.join(' ') // '01010100 01000101 01010011 01010100'
The shortest and simplest solution:
"x".charCodeAt().toString(2) // 1111000
String.charCodeAt() charCodeAt(0) returns unicode: "x".charCodeAt() // 120
Object.toString() charCodeAt().toString(2) converts unicode to binary.
For multiple string characters:
[..."Tesla"].map((i) => i.charCodeAt().toString(2)).join(" ");
// 1010100 1100101 1110011 1101100 1100001
Spread syntax (...)
[..."Tesla"] // ['T', 'e', 's', 'l', 'a']
Array.map()
[..."Tesla"].map((i) => i.charCodeAt()) // [84, 101, 115, 108, 97]
Array.join() Put a space " " after each element in the array map(i) and convert the array to string.
I'm pretty sure that you can do something like this:
Returns a STRING:
/**
 * Concatenates the binary form of every UTF-16 code unit of `str`.
 * Each group is zero-padded to at least 8 bits so the groups of an
 * ASCII/Latin-1 string can be told apart again when decoding.
 *
 * @param {string} str - Text to convert.
 * @returns {string} The concatenated bit groups.
 */
const toBinary = (str)=>{
  let r = []
  for (let i=0; i<str.length; i++) {
    r.push(str.charCodeAt(i).toString(2).padStart(8, "0"));
  }
  return r.join("");
}
Or, as an int:
/**
 * Builds the concatenated binary digits of every code unit of `str` and
 * parses that digit string as a number (the digits are read as decimal,
 * so the result is only exact for very short inputs).
 */
const toBinary = (str)=>{
  let digits = "";
  let index = 0;
  while (index < str.length) {
    digits += str.charCodeAt(index).toString(2);
    index += 1;
  }
  return parseInt(digits);
}

How do I implement hex2bin()?

I need to communicate between Javascript and PHP (I use jQuery for AJAX), but the output of the PHP script may contain binary data. That's why I use bin2hex() and json_encode() on PHP side.
How do I convert the hexadecimal string in binary string, with JavaScript?
To answer your question:
/**
 * Converts a hexadecimal string to its binary digit string.
 * Returns 0 when `n` is not 1-64 hexadecimal digits (per checkHex).
 */
function Hex2Bin(n) {
  if (!checkHex(n)) return 0;
  // BigInt keeps every bit; parseInt would round anything above 2^53.
  return BigInt("0x" + n).toString(2);
}
Here are some further functions you may find useful for working with binary data:
//Useful Functions
/** True when `n` consists of 1-64 binary digits. */
function checkBin(n) { return /^[01]{1,64}$/.test(n); }
/** True when `n` consists of 1-64 decimal digits. */
function checkDec(n) { return /^[0-9]{1,64}$/.test(n); }
/** True when `n` consists of 1-64 hexadecimal digits. */
function checkHex(n) { return /^[0-9A-Fa-f]{1,64}$/.test(n); }
/** Left-pads `s` with zeros up to `z` characters. */
function pad(s, z) { s = "" + s; return s.padStart(z, "0"); }
/** Strips all leading zeros from `s`. */
function unpad(s) { s = "" + s; return s.replace(/^0+/, ''); }

// All conversions go through BigInt so that inputs beyond 2^53 (the checks
// accept up to 64 digits) keep every digit, and so that string inputs are
// really converted (a string's own toString(radix) ignores the radix).
// Each function returns 0 when the input fails its check.

//Decimal operations
function Dec2Bin(n) { if (!checkDec(n) || n < 0) return 0; return BigInt(n).toString(2); }
function Dec2Hex(n) { if (!checkDec(n) || n < 0) return 0; return BigInt(n).toString(16); }
//Binary Operations
function Bin2Dec(n) { if (!checkBin(n)) return 0; return BigInt("0b" + n).toString(10); }
function Bin2Hex(n) { if (!checkBin(n)) return 0; return BigInt("0b" + n).toString(16); }
//Hexadecimal Operations
function Hex2Bin(n) { if (!checkHex(n)) return 0; return BigInt("0x" + n).toString(2); }
function Hex2Dec(n) { if (!checkHex(n)) return 0; return BigInt("0x" + n).toString(10); }
JavaScript doesn't have support for binary data. Nevertheless you can emulate this with regular strings.
var hex = "375771", // ASCII HEX: 37="7", 57="W", 71="q"
bytes = [],
str;
for(var i=0; i< hex.length-1; i+=2){
bytes.push(parseInt(hex.substr(i, 2), 16));
}
str = String.fromCharCode.apply(String, bytes);
alert(str); // 7Wq
/**
 * Converts a hexadecimal string to a "binary string": one character per
 * byte, with char code equal to the byte value. A trailing odd digit is
 * ignored.
 *
 * @param {string} hex - e.g. "375771".
 * @returns {string} e.g. "7Wq".
 */
function hex2bin(hex)
{
  var str = '';
  // Append character by character: String.fromCharCode.apply(null, bytes)
  // passes every byte as an argument and can exceed the engine's argument
  // limit on large inputs.
  for (var i = 0; i < hex.length - 1; i += 2)
    str += String.fromCharCode(parseInt(hex.substr(i, 2), 16));
  return str;
}
thanks to Andris!
Other useful information about this topic (dec2bin, bin2dec) can be found here.
According to that, here is a bin2hex solution:
parseInt(1100,2).toString(16); //--> c
Although not an answer to the actual question, it is perhaps useful in this case to also know how to reverse the process:
/**
 * Converts a "binary string" (one character per byte) to hexadecimal,
 * writing at least two digits per character.
 */
function bin2hex (bin)
{
  var pairs = []
  for (var pos = 0; pos < bin.length; pos++)
  {
    var digits = bin.charCodeAt(pos).toString(16)
    if (digits.length < 2) digits = '0' + digits
    pairs.push(digits)
  }
  return pairs.join('')
}
As an example, using hex2bin on b637eb9146e84cb79f6d981ac9463de1 returns ¶7ëFèL·mÉF=á, and then passing this to bin2hex returns b637eb9146e84cb79f6d981ac9463de1.
It might also be useful to prototype these functions to the String object:
/**
 * Interprets this string as hexadecimal and returns a "binary string" with
 * one character per byte. A trailing odd digit is ignored.
 */
String.prototype.hex2bin = function ()
{
  var i = 0, l = this.length - 1, str = ''
  // Build the result incrementally: spreading a large byte array into
  // String.fromCharCode via apply() can exceed the engine's argument limit.
  for (i; i < l; i += 2)
  {
    str += String.fromCharCode(parseInt(this.substr(i, 2), 16))
  }
  return str
}
/**
 * Returns the hexadecimal form of this "binary string", at least two
 * digits per character.
 */
String.prototype.bin2hex = function ()
{
  var pairs = []
  for (var pos = 0; pos < this.length; pos++)
  {
    var digits = this.charCodeAt(pos).toString(16)
    pairs.push(digits.length < 2 ? '0' + digits : digits)
  }
  return pairs.join('')
}
alert('b637eb9146e84cb79f6d981ac9463de1'.hex2bin().bin2hex())
All proposed solutions use String.fromCharCode, why not simply using unescape?
/**
 * Interprets this string as hexadecimal and decodes it with unescape():
 * "a2b320" is first turned into "%a2%b3%20" and then unescaped.
 */
String.prototype.hex2bin = function()
{
  var escaped = "";
  var pos = 0;
  while (pos < this.length) {
    escaped += '%' + this.substr(pos, 2);
    pos += 2;
  }
  return unescape(escaped);
}
and then:
alert( "68656c6c6f".hex2bin() ); //shows "hello"
With reference to node.js ( not in browser ).
Basically it's all over-engineered and does not work well.
responses are out of alignment and though text-wise they are the same bit wise everything is all over the place :
curl http://phpimpl.domain.com/testhex.php | xxd
00000000: de56 a735 4739 c01d f2dc e14b ba30 8af0 .Q.%G9.....;.0..
curl http://nodejs.domain.com/ | xxd
00000000: c39e 56c2 a725 4739 c380 c3ad c3b1 c39c ..Q..%G9........
00000010: c3a1 37c2 6b30 c28f c3b0 ..;..0....
The proper way to implement this in node is :
/**
 * Decodes a hexadecimal string into a Node.js Buffer of raw bytes.
 *
 * @param {string} hex - Hexadecimal text, two digits per byte.
 * @returns {Buffer} The decoded bytes.
 */
function hex2bin(hex){
  // Buffer.from replaces the deprecated `new Buffer(...)` constructor.
  return Buffer.from(hex, "hex");
}
curl http://nodejs.domain.com/ | xxd
00000000: de56 a735 4739 c01d f2dc e14b ba30 8af0 .Q.%G9.....;.0..
Hope this helps.
Here is an implementation of hex2bin in JS that takes a string and returns Uint8Array, works both in browsers and nodejs,
/**
 * Decodes a hexadecimal string into a Uint8Array, two digits per byte.
 */
function hex2bin(hex) {
  var bytes = new Uint8Array(hex.length / 2);
  for (var pos = 0, offset = 0; pos < bytes.length; pos++, offset += 2) {
    bytes[pos] = parseInt(hex.slice(offset, offset + 2), 16);
  }
  return bytes;
}
And its inverse,
/**
 * Encodes an iterable or array-like of byte values as a hexadecimal
 * string, two digits per byte.
 */
function bin2hex(bin) {
  var hex = '';
  Array.from(bin).forEach(function (value) {
    hex += value.toString(16).padStart(2, '0');
  });
  return hex;
}
If someone needs the other direction (bin to hex), here is it:
/**
 * Encodes binary data as a hexadecimal string using a Node.js Buffer.
 *
 * @param {string|Buffer|Uint8Array|number[]} bin - Data to encode; strings
 *   are encoded as UTF-8.
 * @returns {string} Two lowercase hex digits per byte.
 */
function bin2hex(bin) {
  // Buffer.from replaces the deprecated `new Buffer(...)`, which would
  // allocate uninitialized memory when handed a number.
  return Buffer.from(bin).toString("hex");
}
JavaScript does actually contain support for binary data. See Uint8Array.
Just read each byte from the array and convert it into hexadecimal.

Need to escape non-ASCII characters in JavaScript

Is there any function to do the following?
var specialStr = 'ipsum áá éé lore';
var encodedStr = someFunction(specialStr);
// then encodedStr should be like 'ipsum \u00E1\u00E1 \u00E9\u00E9 lore'
I need to encode the characters that are out of ASCII range, and need to do it with that encoding. I don't know its name. Is it Unicode maybe?
This should do the trick:
/** Left-pads a hex string with zeros up to four characters. */
function padWithLeadingZeros(string) {
  var zeros = new Array(5 - string.length).join("0");
  return zeros + string;
}
/** Formats a UTF-16 code unit as a `\uXXXX` escape sequence. */
function unicodeCharEscape(charCode) {
  var hex = charCode.toString(16);
  return "\\u" + padWithLeadingZeros(hex);
}
/**
 * Replaces every code unit above 127 in `string` with its `\uXXXX`
 * escape and leaves ASCII characters unchanged.
 */
function unicodeEscape(string) {
  var escaped = "";
  for (var i = 0; i < string.length; i++) {
    var charCode = string.charCodeAt(i);
    if (charCode > 127) {
      escaped += unicodeCharEscape(charCode);
    } else {
      escaped += string.charAt(i);
    }
  }
  return escaped;
}
For example:
var specialStr = 'ipsum áá éé lore';
var encodedStr = unicodeEscape(specialStr);
assert.equal("ipsum \\u00e1\\u00e1 \\u00e9\\u00e9 lore", encodedStr);
If you need hex encoding rather than unicode then you can simplify @Domenic's answer to:
"aäßåfu".replace(/./g, function(c){return c.charCodeAt(0)<128?c:"\\x"+c.charCodeAt(0).toString(16)})
returns: "a\xe4\xdf\xe5fu"
Just for information you can do as Domenic said or use the escape function but that will generate unicode with a different format (more browser friendly):
>>> escape("áéíóú");
"%E1%E9%ED%F3%FA"
This works for me. Specifically when using the Dropbox REST API:
/**
 * Replaces every UTF-16 code unit above 127 in `value` with a `\uXXXX`
 * escape (zero-padded to four hex digits); ASCII is copied unchanged.
 */
encodeNonAsciiCharacters(value: string) {
  let out = "";
  for (const unit of value.split("")) {
    const code = unit.charCodeAt(0);
    if (code > 127) {
      const hex = code.toString(16);
      // Codes above 127 have 2-4 hex digits; prepend the missing zeros.
      const zeros = hex.length < 4 ? "000".substring(hex.length - 1) : "";
      out += "\\u" + zeros + hex;
    } else {
      out += unit;
    }
  }
  return out;
}

JavaScript strings outside of the BMP

BMP being Basic Multilingual Plane
According to JavaScript: the Good Parts:
JavaScript was built at a time when Unicode was a 16-bit character set, so all characters in JavaScript are 16 bits wide.
This leads me to believe that JavaScript uses UCS-2 (not UTF-16!) and can only handle characters up to U+FFFF.
Further investigation confirms this:
> String.fromCharCode(0x20001);
The fromCharCode method seems to only use the lowest 16 bits when returning the Unicode character. Trying to get U+20001 (CJK unified ideograph 20001) instead returns U+0001.
Question: is it at all possible to handle post-BMP characters in JavaScript?
2011-07-31: slide twelve from Unicode Support Shootout: The Good, The Bad, & the (mostly) Ugly covers issues related to this quite well:
Depends what you mean by ‘support’. You can certainly put non-UCS-2 characters in a JS string using surrogates, and browsers will display them if they can.
But, each item in a JS string is a separate UTF-16 code unit. There is no language-level support for handling full characters: all the standard String members (length, split, slice etc) all deal with code units not characters, so will quite happily split surrogate pairs or hold invalid surrogate sequences.
If you want surrogate-aware methods, I'm afraid you're going to have to start writing them yourself! For example:
/**
 * Returns the length of this string in code points: the code-unit length
 * minus one for every well-formed surrogate pair.
 */
String.prototype.getCodePointLength= function() {
  var pieces = this.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
  var pairCount = pieces.length - 1;
  return this.length - pairCount;
};
/**
 * Builds a string from the given code points, expanding every value of
 * 0x10000 or more into a UTF-16 surrogate pair.
 */
String.fromCodePoint= function() {
  var units = [];
  for (var i = 0; i < arguments.length; i++) {
    var offset = arguments[i] - 0x10000;
    if (offset >= 0) {
      units.push(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
    } else {
      units.push(arguments[i]);
    }
  }
  return String.fromCharCode.apply(null, units);
};
I came to the same conclusion as bobince. If you want to work with strings containing unicode characters outside of the BMP, you have to reimplement javascript's String methods. This is because javascript counts characters as each 16-bit code value. Symbols outside of the BMP need two code values to be represented. You therefore run into a case where some symbols count as two characters and some count only as one.
I've reimplemented the following methods to treat each unicode code point as a single character: .length, .charCodeAt, .fromCharCode, .charAt, .indexOf, .lastIndexOf, .splice, and .split.
You can check it out on jsfiddle: http://jsfiddle.net/Y89Du/
Here's the code without comments. I tested it, but it may still have errors. Comments are welcome.
if (!String.prototype.ucLength) {
  /**
   * Length of this string in code points: the code-unit length minus one
   * for every well-formed surrogate pair.
   * Approach taken from
   * http://stackoverflow.com/questions/3744721/javascript-strings-outside-of-the-bmp
   */
  String.prototype.ucLength = function() {
    var pairCount = this.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g).length - 1;
    return this.length - pairCount;
  };
}
// Fallback for String.prototype.codePointAt, installed only when the engine
// does not already provide one.
// Returns the code point found at position ucPos, where positions are counted
// in code points (a surrogate pair advances the position by one), or null when
// ucPos is past the end of the string.
// NOTE(review): the native codePointAt is understood to index by UTF-16 code
// unit rather than by code point, so callers may see different index semantics
// depending on whether this fallback is installed — confirm.
if (!String.prototype.codePointAt) {
String.prototype.codePointAt = function (ucPos) {
// A missing or non-numeric position means the first code point.
if (isNaN(ucPos)){
ucPos = 0;
}
var str = String(this);
var codePoint = null;
var pairFound = false;
// ucIndex counts code points seen so far; i is the code-unit offset.
var ucIndex = -1;
var i = 0;
while (i < str.length){
ucIndex += 1;
var code = str.charCodeAt(i);
var next = str.charCodeAt(i + 1);
// True when code/next form a high surrogate followed by a low surrogate.
pairFound = (0xD800 <= code && code <= 0xDBFF && 0xDC00 <= next && next <= 0xDFFF);
if (ucIndex == ucPos){
// Combine the surrogate pair into one code point, or use the lone unit.
codePoint = pairFound ? ((code - 0xD800) * 0x400) + (next - 0xDC00) + 0x10000 : code;
break;
} else{
// Skip both halves of a pair so the next iteration starts on a new code point.
i += pairFound ? 2 : 1;
}
}
return codePoint;
};
}
if (!String.fromCodePoint) {
  /**
   * Fallback for String.fromCodePoint: builds a string from the given code
   * points, writing every value above 0xFFFF as a surrogate pair.
   */
  String.fromCodePoint = function () {
    var pieces = [];
    for (var i = 0; i < arguments.length; ++i) {
      var codePoint = arguments[i];
      var units;
      if (codePoint > 0xFFFF) {
        var offset = codePoint - 0x10000;
        units = [0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF)];
      } else {
        units = [codePoint];
      }
      pieces.push(String.fromCharCode.apply(null, units));
    }
    return pieces.join("");
  };
}
if (!String.prototype.ucCharAt) {
  /**
   * Returns the character at code-point position `ucIndex`, treating a
   * surrogate pair as one character.
   *
   * @param {number} ucIndex - Zero-based position counted in code points;
   *   a missing or non-numeric value means 0.
   * @returns {string} The character, or "" when the position is out of range.
   */
  String.prototype.ucCharAt = function (ucIndex) {
    var position = isNaN(ucIndex) ? 0 : Number(ucIndex);
    // Split by code point here instead of calling codePointAt(): the native
    // codePointAt takes a code-unit index, which is a different position as
    // soon as an earlier character is a surrogate pair.
    var ucChars = Array.from(String(this));
    var ucChar = ucChars[position];
    return ucChar === undefined ? "" : ucChar;
  };
}
if (!String.prototype.ucIndexOf) {
  /**
   * Like String.prototype.indexOf, but positions are counted in code
   * points, so a surrogate pair counts as one character.
   *
   * @param {*} searchStr - Value to look for (converted to a string).
   * @param {number} [ucStart=0] - Code-point position to start from;
   *   negative or non-numeric values mean 0.
   * @returns {number} Code-point position of the first match, or -1.
   */
  String.prototype.ucIndexOf = function (searchStr, ucStart) {
    if (isNaN(ucStart) || ucStart < 0){
      ucStart = 0;
    }
    // Work on code-point arrays directly; this keeps the method
    // self-contained and never reads past the end of the string.
    var haystack = Array.from(String(this));
    var needle = Array.from(String(searchStr));
    var lastStart = haystack.length - needle.length;
    for (var i = Math.trunc(ucStart); i <= lastStart; i++){
      var matched = 0;
      while (matched < needle.length && haystack[i + matched] === needle[matched]){
        matched++;
      }
      if (matched === needle.length){
        return i;
      }
    }
    return -1;
  };
}
if (!String.prototype.ucLastIndexOf) {
  /**
   * Like String.prototype.lastIndexOf, but positions are counted in code
   * points, so a surrogate pair counts as one character.
   *
   * @param {*} searchStr - Value to look for (converted to a string).
   * @param {number} [ucStart] - Highest code-point position at which a
   *   match may begin; defaults to the last position.
   * @returns {number} Code-point position of the last match, or -1.
   */
  String.prototype.ucLastIndexOf = function (searchStr, ucStart) {
    // Work on code-point arrays directly; this keeps the method
    // self-contained and never reads past the end of the string.
    var haystack = Array.from(String(this));
    var needle = Array.from(String(searchStr));
    if (isNaN(ucStart) || ucStart >= haystack.length){
      ucStart = haystack.length - 1;
    }
    // A match cannot begin later than length - needle.length.
    var first = Math.min(Math.trunc(ucStart), haystack.length - needle.length);
    for (var i = first; i >= 0; i--){
      var matched = 0;
      while (matched < needle.length && haystack[i + matched] === needle[matched]){
        matched++;
      }
      if (matched === needle.length){
        return i;
      }
    }
    return -1;
  };
}
if (!String.prototype.ucSlice) {
  /**
   * Like String.prototype.slice, but positions are counted in code points,
   * so a surrogate pair is never cut in half.
   *
   * @param {number} [ucStart=0] - First position to include; negative
   *   values count from the end.
   * @param {number} [ucStop] - Position to stop before; negative values
   *   count from the end. Defaults to the end of the string.
   * @returns {string} The selected characters.
   */
  String.prototype.ucSlice = function (ucStart, ucStop) {
    // Array.prototype.slice already implements the clamping rules needed
    // here (NaN start -> 0, negative offsets from the end, stop capped at
    // the length). An omitted stop means "to the end", so the last
    // character is no longer dropped.
    return Array.from(String(this)).slice(ucStart, ucStop).join("");
  };
}
if (!String.prototype.ucSplit) {
  /**
   * Like String.prototype.split, except that splitting on the empty string
   * yields whole code points instead of UTF-16 code units.
   *
   * @param {string|RegExp} delimeter - Separator; '' splits per character.
   * @param {number} [limit] - Maximum number of pieces to return.
   * @returns {string[]} The pieces.
   */
  String.prototype.ucSplit = function (delimeter, limit) {
    var str = String(this);
    if (delimeter == ''){
      var ucChars = Array.from(str);
      // Without a limit return everything; slice(0, 0 + undefined) would
      // evaluate to slice(0, NaN) and return an empty array.
      return limit === undefined ? ucChars : ucChars.slice(0, 0 + limit);
    }
    return str.split(delimeter, limit);
  };
}
More recent JavaScript engines have String.fromCodePoint.
const ideograph = String.fromCodePoint( 0x20001 ); // outside the BMP
Also a code-point iterator, which gets you the code-point length.
/**
 * Counts the code points of `str` by walking its string iterator, which
 * yields a surrogate pair as a single item.
 */
function countCodePoints( str )
{
  let total = 0;
  for( const _symbol of str ) total += 1;
  return total;
}
console.log( ideograph.length ); // gives '2'
console.log( countCodePoints(ideograph) ); // '1'
Yes, you can. Although support to non-BMP characters directly in source documents is optional according to the ECMAScript standard, modern browsers let you use them. Naturally, the document encoding must be properly declared, and for most practical purposes you would need to use the UTF-8 encoding. Moreover, you need an editor that can handle UTF-8, and you need some input method(s); see e.g. my Full Unicode Input utility.
Using suitable tools and settings, you can write var foo = '𠀁'.
The non-BMP characters will be internally represented as surrogate pairs, so each non-BMP character counts as 2 in the string length.
Using for (c of this) instruction, one can make various computations on a string that contains non-BMP characters. For instance, to compute the string length, and to get the nth character of the string:
/**
 * Returns the number of code points in this string (a surrogate pair
 * counts once).
 */
String.prototype.magicLength = function()
{
  var symbols = [...this]; // the string iterator yields whole code points
  return symbols.length;
}
/**
 * Returns the n-th code point of this string as a string, or "" when no
 * position loosely equals `n`.
 */
String.prototype.magicCharAt = function(n)
{
  var symbols = [...this]; // the string iterator yields whole code points
  for (var position = 0; position < symbols.length; position++)
  {
    if (position == n) return symbols[position] + "";
  }
  return "";
}
This old topic has now a simple solution in ES6:
Split characters into an array
simple version
[..."😴😄😃⛔🎠🚓🚇"] // ["😴", "😄", "😃", "⛔", "🎠", "🚓", "🚇"]
Then having each one separated you can handle them easily for most common cases.
Credit: DownGoat
Full solution
To handle special emojis such as the one in the comment, one can search for the joining character (the zero-width joiner, U+200D, decimal code 8205) and merge the pieces around it. Here is how:
// Splits a string into user-visible symbols: first by code point, then by
// gluing zero-width-joiner (ZWJ) sequences back into single emoji.
// The ZWJ characters are written as \u200D escapes so they are visible here.
let myStr = "👩\u200D👩\u200D👧\u200D👧😃𝌆"
let arr = [...myStr]
const ZWJ = 0x200D // decimal 8205
// Walk backwards so that splicing does not shift elements still to be
// visited. The first and last elements are skipped: a joiner there has no
// neighbour on one side to merge with.
for (let i = arr.length - 2; i > 0; i--) {
  if (arr[i].codePointAt(0) === ZWJ) { // special combination character
    arr[i - 1] += arr[i] + arr[i + 1]; // combine them back to a single emoji
    arr.splice(i, 2)
  }
}
console.log(arr.length) //3
Haven't found a case where this doesn't work. Comment if you do.
To conclude
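Code 8205 is the Unicode zero-width joiner (U+200D): it tells the renderer to display the emoji on either side of it as a single combined glyph. It is not specific to JavaScript, and it is separate from the surrogate pairs that UTF-16 uses to represent characters outside the BMP.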

Categories

Resources