Need to escape non-ASCII characters in JavaScript - javascript

Is there any function to do the following?
var specialStr = 'ipsum áá éé lore';
var encodedStr = someFunction(specialStr);
// then encodedStr should be like 'ipsum \u00E1\u00E1 \u00E9\u00E9 lore'
I need to encode the characters that are out of ASCII range, and need to do it with that encoding. I don't know its name. Is it Unicode maybe?

This should do the trick:
/**
 * Left-pads a string of hex digits with zeros to a minimum width of four,
 * the width used by a `\uXXXX` escape.
 *
 * Inputs that already have four or more characters are returned unchanged.
 *
 * @param {string} string - Hex digits to pad.
 * @returns {string} The input, zero-padded on the left to at least 4 characters.
 */
function padWithLeadingZeros(string) {
  return string.padStart(4, "0");
}
/**
 * Formats a character code as a backslash-u escape sequence.
 *
 * @param {number} charCode - Character code to format.
 * @returns {string} A backslash, the letter `u`, and the padded lowercase hex digits.
 */
function unicodeCharEscape(charCode) {
  const hexDigits = charCode.toString(16);
  return `\\u${padWithLeadingZeros(hexDigits)}`;
}
/**
 * Replaces every UTF-16 code unit above 127 with its backslash-u escape and
 * leaves ASCII code units untouched.
 *
 * @param {string} string - Text to escape.
 * @returns {string} The text with all non-ASCII code units escaped.
 */
function unicodeEscape(string) {
  let escaped = "";
  for (const unit of string.split("")) {
    const codeUnit = unit.charCodeAt(0);
    escaped += codeUnit <= 127 ? unit : unicodeCharEscape(codeUnit);
  }
  return escaped;
}
For example:
var specialStr = 'ipsum áá éé lore';
var encodedStr = unicodeEscape(specialStr);
assert.equal("ipsum \\u00e1\\u00e1 \\u00e9\\u00e9 lore", encodedStr);

If you need hex encoding rather than Unicode escapes, you can simplify @Domenic's answer to:
"aäßåfu".replace(/./g, function(c){return c.charCodeAt(0)<128?c:"\\x"+c.charCodeAt(0).toString(16)})
returns: "a\xe4\xdf\xe5fu"

Just for information you can do as Domenic said or use the escape function but that will generate unicode with a different format (more browser friendly):
>>> escape("áéíóú");
"%E1%E9%ED%F3%FA"

This works for me. Specifically when using the Dropbox REST API:
/**
 * Returns `value` with every UTF-16 code unit above 127 replaced by a
 * backslash-u escape (lowercase hex, zero-padded to four digits).
 * Code units up to 127 are copied through unchanged.
 */
encodeNonAsciiCharacters(value: string) {
  let out = "";
  for (let index = 0; index < value.length; index++) {
    const codeUnit = value.charCodeAt(index);
    if (codeUnit > 127) {
      out += "\\u" + codeUnit.toString(16).padStart(4, "0");
    } else {
      out += value.charAt(index);
    }
  }
  return out;
}

Related

Javascript - Search unicode string in unicode string

When I try to search for a Unicode string inside another Unicode string, I can't find a solution.
Ex: check if string 'vie' is contained in string 'Mr. ViỆt has a blue house'
So i try a hard way as below:
// Convert string to Unicode
/**
 * Spells out every UTF-16 code unit of a string as a backslash-u escape
 * with four uppercase hex digits.
 *
 * @param {string} theString - Text to convert.
 * @returns {string} Concatenated escapes, one per code unit ('' for an empty string).
 */
function toUnicode(theString) {
  return theString
    .split('')
    .map((unit) => '\\u' + unit.charCodeAt(0).toString(16).toUpperCase().padStart(4, '0'))
    .join('');
}
// Convert string to be Regex Unicode
/**
 * Builds a RegExp whose single character class contains every UTF-16 code
 * unit of the input, each written as a backslash-u escape.
 *
 * @param {string} theString - Characters to put into the class.
 * @returns {RegExp} A pattern matching any one of those code units.
 */
function toRegexUnicode(theString) {
  const escapes = [];
  for (const unit of theString.split('')) {
    const hex = unit.charCodeAt(0).toString(16).toUpperCase();
    escapes.push('\\u' + hex.padStart(4, '0'));
  }
  return new RegExp(`[${escapes.join('')}]`);
}
// Search
/**
 * Builds a character class from the original sentence, escapes the search
 * term with toUnicode, and logs whether any character of the escaped term
 * belongs to that class.
 */
function searchUnicode() {
  const strOriginal = "Mr. ViỆt has a blue house";
  const strSearch = toUnicode('vie');
  const regexUnicode = toRegexUnicode(strOriginal);
  console.log(regexUnicode.test(strSearch));
}
Test at: https://www.w3schools.com/code/tryit.asp?filename=FY3NGXMQRMLA
Are there any better ways?
First, your regular expression is wrong: remove the square brackets.
Second, you're creating your regex testing the wrong way around.
You're currently setting up your regex search using your full string.
You're also not converting your strOriginal to Unicode.
This means your searchUnicode function needs to appear as follows:
var strOriginal = "Mr. ViỆt has a blue house"
var strOriginalUnicode = toUnicode(strOriginal)
var strSearch = toUnicode('vie')
var regexUnicode = toRegexUnicode(strSearch)
var result = regexUnicode.test(strOriginalUnicode)
Next, we can simplify your toRegexUnicode function as such:
// Convert string to be Regex Unicode
// Convert string to be Regex Unicode
/**
 * Doubles every backslash in the input and compiles the result, so the
 * returned RegExp matches the backslashes of the input literally.
 *
 * @param {string} theString - Text that may contain backslashes.
 * @returns {RegExp} Pattern built from the backslash-doubled text.
 */
function toRegexUnicode(theString) {
  const literalSource = theString.split("\\").join("\\\\");
  return new RegExp(literalSource);
}
No need to reuse your conversion method. You will also note global replacements of all \ to become \\. That's because Regex considers a backslash as an escape character so we need to escape our escape character.
I try another way, just convert all string to ASCII then search:
/**
 * Replaces the lowercase accented Vietnamese letters listed below with
 * their plain ASCII base letter. Uppercase letters are not handled.
 *
 * @param {string} str - Text to fold.
 * @returns {string} The folded text, or '' if any replacement step throws
 *   (for example when `str` is not a string).
 */
function stringToASCII(str) {
  const foldingTable = [
    [/[àáảãạâầấẩẫậăằắẳẵặ]/g, 'a'],
    [/[èéẻẽẹêềếểễệ]/g, 'e'],
    [/[đ]/g, 'd'],
    [/[ìíỉĩị]/g, 'i'],
    [/[òóỏõọôồốổỗộơờớởỡợ]/g, 'o'],
    [/[ùúủũụưừứửữự]/g, 'u'],
    [/[ỳýỷỹỵ]/g, 'y'],
  ];
  try {
    let folded = str;
    for (const [accented, plain] of foldingTable) {
      folded = folded.replace(accented, plain);
    }
    return folded;
  } catch {
    return '';
  }
}
/**
 * Lowercases and folds both the sample sentence and the search term with
 * stringToASCII, then logs the two folded strings and whether the sentence
 * contains the term.
 */
function searchASCII() {
  const [strOriginalToASCII, strSearchToASCII] = ["Mr. ViỆt lê nguyễn thị tùng á à ạds", "vie"]
    .map((text) => stringToASCII(text.toLowerCase()));
  const result = strOriginalToASCII.includes(strSearchToASCII);
  // Results
  console.log('strOriginalToASCII: ', strOriginalToASCII);
  console.log('strSearchToASCII: ', strSearchToASCII);
  console.log('result: ', result);
}
Output:
strOriginalToASCII: mr. viet le nguyen thi tung a a ads
strSearchToASCII: vie
result: true
Test at: https://www.w3schools.com/code/tryit.asp?filename=FY3NGXMQRMLA

How can I remove the last emoji of a group of emojis in javascript?

Let's say I have this 3 emojis in a string: 😀🎃👪
There are not any spaces or any other character except emojis in the string.
How can I remove the last emoji in javascript?
The answer below doesn't use any special package and safely removes the last emoji
/**
 * Removes UTF-16 code units from the end of `str` one at a time until
 * fancyCount reports exactly one less than it did for the original string,
 * or until the string is empty.
 *
 * @param {string} str - Text to shorten.
 * @returns {string} The shortened text.
 */
function safeEmojiBackspace(str) {
  const targetCount = fancyCount(str) - 1;
  let remaining = str;
  while (remaining.length > 0 && fancyCount(remaining) !== targetCount) {
    remaining = remaining.slice(0, -1);
  }
  return remaining;
}
/**
 * Splits the string on zero-width joiners (U+200D), strips variation
 * selectors U+FE00..U+FE0F from each piece, sums the code-point counts of
 * the pieces, and divides that sum by the number of pieces.
 *
 * @param {string} str - Text to measure.
 * @returns {number} Total code points divided by the number of joiner-separated pieces.
 */
function fancyCount(str) {
  const pieces = str.split("\u{200D}");
  const totalCodePoints = pieces.reduce(
    // removing the variation selectors before counting
    (sum, piece) => sum + Array.from(piece.replace(/[\ufe00-\ufe0f]/g, "")).length,
    0
  );
  // assuming the joiners are used appropriately
  return totalCodePoints / pieces.length;
}
Sample usage
let str = "something😀🎃👪";
str = safeEmojiBackspace(str);//"something😀🎃"
You can do this. It will always remove the last emoji.
/**
 * Truncates the markup of the element with id "emoji" at its last space
 * character, which drops whatever follows that space.
 */
function removeEmoji() {
  const emojiElement = document.getElementById('emoji');
  const markup = emojiElement.innerHTML;
  const cutAt = markup.lastIndexOf(" ");
  emojiElement.innerHTML = markup.substring(0, cutAt);
}
<p id="emoji">
😀 🎃 👪
</p>
<button onclick="removeEmoji()">Remove</button>
I hope this is what you want.
var emoString = "😀 🎃 👪";
emoString = emoString.slice(0, -2);
However, this would work only if you have 3 emojis in total. Hence to achieve a generalised solution, you can use the underscore functions split() and javascript function join() :
// `_.initial` keeps every element except the last one, which removes the
// final emoji (`_.rest` would drop the first element instead).
var emoString = "😀 🎃 👪";
emoString = _.initial(emoString.split(' ')).join(' ');
Hope this will solve your issue.
Ok, here is how I solved it:
/**
 * Removes the last surrogate-pair character (e.g. an emoji) from a string.
 *
 * Only surrogate pairs are kept: any other characters in the input are
 * dropped from the result.
 *
 * @param {string} emojiStr - Text made of surrogate-pair characters.
 * @returns {string} All surrogate pairs except the last, joined together;
 *   '' when the input contains zero or one pair.
 */
function deleteEmoji(emojiStr) {
  // match() returns null when nothing matches, so fall back to an empty list.
  const emojis = emojiStr.match(/([\uD800-\uDBFF][\uDC00-\uDFFF])/g) ?? [];
  return emojis.slice(0, -1).join("");
}
let emojitext = "😀🎃👪";
console.log(deleteEmoji(emojitext));
I was actually surprised that Unicode, in this day and age, is still not fully supported in browsers. I assume a lot of this is down to Windows and its version of UTF-16.
The OP I believe has found his own solution to the original problem, but I thought there has to be a more generic solution to surrogate pair unicode characters.
Anyway, my solution is to convert the text into a UTF-32 array, which can then be manipulated much more easily, using slice etc.
After you have done what you want to the array, just convert back.
Below is an example.
Some of the code I got from -> Is it possible to convert a string containing "high" unicode chars to an array consisting of dec values derived from utf-32 ("real") codes?
and http://speakingjs.com/es5/ch24.html
/**
 * Decodes a JavaScript (UTF-16) string into an array of code points.
 *
 * A high surrogate immediately followed by a low surrogate is combined into
 * one supplementary code point. Any other code unit, including an unpaired
 * surrogate, is emitted as-is so no following character is swallowed.
 *
 * @param {string} str - Text to decode.
 * @returns {number[]} One number per code point (or per unpaired surrogate).
 */
function decodeUnicode(str) {
  const codePoints = [];
  let i = 0;
  while (i < str.length) {
    const unit = str.charCodeAt(i++);
    const isHighSurrogate = unit >= 0xD800 && unit <= 0xDBFF;
    if (isHighSurrogate && i < str.length) {
      const next = str.charCodeAt(i);
      if (next >= 0xDC00 && next <= 0xDFFF) {
        i++;
        codePoints.push(0x10000 + ((unit - 0xD800) << 10) + (next - 0xDC00));
        continue;
      }
    }
    codePoints.push(unit);
  }
  return codePoints;
}
/**
 * Encodes a single code point as a UTF-16 string.
 *
 * @param {number} codePoint - Code point to encode.
 * @returns {string} One code unit for values up to 0xFFFF, otherwise a
 *   leading/trailing surrogate pair.
 */
function toUTF16(codePoint) {
  const TEN_BITS = 0x3FF;
  // BMP code points map to a single code unit; return it as a string so
  // callers can concatenate the result.
  if (codePoint <= 0xFFFF) {
    return String.fromCharCode(codePoint);
  }
  const offset = codePoint - 0x10000;
  const leadingSurrogate = 0xD800 | (offset >> 10);
  const trailingSurrogate = 0xDC00 | (offset & TEN_BITS);
  return String.fromCharCode(leadingSurrogate, trailingSurrogate);
}
/**
 * Turns an array of code points back into a string.
 *
 * Uses the built-in String.fromCodePoint so that both BMP and supplementary
 * code points come out as characters.
 *
 * @param {number[]} data - Code points to encode.
 * @returns {string} The encoded text ('' for an empty array).
 * @throws {RangeError} If an element is not a valid code point.
 */
function encodeUnicode(data) {
  return data.reduce((text, codePoint) => text + String.fromCodePoint(codePoint), "");
}
var unicode = decodeUnicode("😀🎃👪");
for (let l = 0; l < unicode.length; l ++)
console.log(encodeUnicode(
unicode.slice(0, l ? -l : unicode.length)));
console.log("pick some random ones");
let str = "";
for (let l = 0; l < 20; l ++) {
let rnd = Math.trunc(Math.random()*unicode.length);
str += encodeUnicode(unicode.slice(rnd,rnd+1));
}
console.log(str);

Trim specific character from a string

What's the JavaScript equivalent to this C# Method:
var x = "|f|oo||";
var y = x.Trim('|'); // "f|oo"
C# trims the selected character only at the beginning and end of the string!
One line is enough:
var x = '|f|oo||';
var y = x.replace(/^\|+|\|+$/g, '');
document.write(x + '<br />' + y);
^ beginning of the string
\|+ pipe, one or more times
| or
\|+ pipe, one or more times
$ end of the string
A general solution:
/**
 * Strips runs of a single character from both ends of a string.
 *
 * @param {string} s - Text to trim.
 * @param {string} c - The character to remove; `]`, `^` and `\` are escaped
 *   before being placed in the character class.
 * @returns {string} `s` without leading/trailing runs of `c`.
 */
function trim (s, c) {
  const needsEscape = c === "]" || c === "^" || c === "\\";
  const member = needsEscape ? "\\" + c : c;
  const edgeRuns = new RegExp("^[" + member + "]+|[" + member + "]+$", "g");
  return s.replace(edgeRuns, "");
}
chars = ".|]\\^";
for (c of chars) {
s = c + "foo" + c + c + "oo" + c + c + c;
console.log(s, "->", trim(s, c));
}
Parameter c is expected to be a character (a string of length 1).
As mentioned in the comments, it might be useful to support multiple characters, since it is quite common to trim several whitespace-like characters at once, for example. To do this, MightyPork suggests replacing the ifs with the following line of code:
c = c.replace(/[-/\\^$*+?.()|[\]{}]/g, '\\$&');
This part [-/\\^$*+?.()|[\]{}] is a set of special characters in regular expression syntax, and $& is a placeholder which stands for the matching character, meaning that the replace function escapes special characters. Try in your browser console:
> "{[hello]}".replace(/[-/\\^$*+?.()|[\]{}]/g, '\\$&')
"\{\[hello\]\}"
Update: Was curious around the performance of different solutions and so I've updated a basic benchmark here:
https://www.measurethat.net/Benchmarks/Show/12738/0/trimming-leadingtrailing-characters
Some interesting and unexpected results running under Chrome.
https://www.measurethat.net/Benchmarks/ShowResult/182877
+-----------------------------------+-----------------------+
| Test name | Executions per second |
+-----------------------------------+-----------------------+
| Index Version (Jason Larke) | 949979.7 Ops/sec |
| Substring Version (Pho3niX83) | 197548.9 Ops/sec |
| Regex Version (leaf) | 107357.2 Ops/sec |
| Boolean Filter Version (mbaer3000)| 94162.3 Ops/sec |
| Spread Version (Robin F.) | 4242.8 Ops/sec |
+-----------------------------------+-----------------------+
Please note; tests were carried out on only a single test string (with both leading and trailing characters that needed trimming). In addition, this benchmark only gives an indication of raw speed; other factors like memory usage are also important to consider.
If you're dealing with longer strings I believe this should outperform most of the other options by reducing the number of allocated strings to either zero or one:
/**
 * Trims a single character from both ends of a string by moving two
 * indices inward.
 *
 * @param {string} str - Text to trim.
 * @param {string} ch - Character to strip.
 * @returns {string} The original string when nothing was trimmed, otherwise
 *   the remaining middle part.
 */
function trim(str, ch) {
  let first = 0;
  let last = str.length;
  for (; first < last && str[first] === ch; first += 1) {
    // skip leading matches
  }
  for (; last > first && str[last - 1] === ch; last -= 1) {
    // skip trailing matches
  }
  if (first === 0 && last === str.length) {
    return str;
  }
  return str.substring(first, last);
}
// Usage:
trim('|hello|world|', '|'); // => 'hello|world'
Or if you want to trim from a set of multiple characters:
/**
 * Trims, from both ends of a string, every character found in `chars`.
 *
 * @param {string} str - Text to trim.
 * @param {string[]|string} chars - Characters to strip; anything with an
 *   `indexOf` that accepts a one-character string works.
 * @returns {string} The original string when nothing was trimmed, otherwise
 *   the remaining middle part.
 */
function trimAny(str, chars) {
  const isTrimmable = (character) => chars.indexOf(character) >= 0;
  let first = 0;
  let last = str.length;
  while (first < last && isTrimmable(str[first])) {
    first += 1;
  }
  while (last > first && isTrimmable(str[last - 1])) {
    last -= 1;
  }
  const untouched = first === 0 && last === str.length;
  return untouched ? str : str.substring(first, last);
}
// Usage:
trimAny('|hello|world ', [ '|', ' ' ]); // => 'hello|world'
// because '.indexOf' is used, you could also pass a string for the 2nd parameter:
trimAny('|hello| world ', '| '); // => 'hello| world' (only the ends are trimmed; the inner space stays)
EDIT: For fun, trim words (rather than individual characters)
// Helper function to detect if a string contains another string
// at a specific position.
// Equivalent to using `str.indexOf(substr, pos) === pos` but *should* be more efficient on longer strings as it can exit early (needs benchmarks to back this up).
/**
 * Checks whether `substr` occurs in `str` starting exactly at index `pos`.
 *
 * @param {string} str - Text to look in.
 * @param {string} substr - Text to look for.
 * @param {number} pos - Index in `str` where the match must begin.
 * @returns {boolean} true when every character of `substr` matches
 *   (always true for an empty `substr`).
 */
function hasSubstringAt(str, substr, pos) {
  const limit = str.length;
  for (let offset = 0; offset < substr.length; offset += 1) {
    const at = pos + offset;
    // Running off the end of `str` or hitting a different character fails the match.
    if (at >= limit || str[at] != substr[offset]) {
      return false;
    }
  }
  return true;
}
/**
 * Trims whole occurrences of `word` from the start and end of `str`.
 *
 * @param {string} str - Text to trim.
 * @param {string} word - Word to strip; an empty word leaves `str` unchanged.
 * @returns {string} The original string when nothing was trimmed, otherwise
 *   the remaining middle part.
 */
function trimWord(str, word) {
  const len = word.length;
  // An empty word matches everywhere without advancing, so there is nothing to trim.
  if (len === 0) {
    return str;
  }
  let start = 0;
  let end = str.length;
  // `end - start >= len` keeps every probe inside the untrimmed window, so a
  // trailing match can never overlap the prefix that was already removed.
  while (end - start >= len && str.startsWith(word, start)) {
    start += len;
  }
  while (end - start >= len && str.startsWith(word, end - len)) {
    end -= len;
  }
  return (start > 0 || end < str.length) ? str.substring(start, end) : str;
}
// Usage:
trimWord('blahrealmessageblah', 'blah');
If I understood well, you want to remove a specific character only if it is at the beginning or at the end of the string (ex: ||fo||oo|||| should become foo||oo). You can create an ad hoc function as follows:
/**
 * Removes every leading and trailing occurrence of one character.
 *
 * Works with two indices and a single final `substring`, so it terminates
 * for every input (including empty strings) and does not copy the string
 * once per removed character.
 *
 * @param {string} string - Text to trim.
 * @param {string} charToRemove - The single character to strip; it is
 *   compared by its string form.
 * @returns {string} The trimmed text.
 */
function trimChar(string, charToRemove) {
  const target = String(charToRemove);
  let start = 0;
  let end = string.length;
  while (start < end && string.charAt(start) === target) {
    start++;
  }
  while (end > start && string.charAt(end - 1) === target) {
    end--;
  }
  return string.substring(start, end);
}
I tested this function with the code below:
var str = "|f|oo||";
$( "#original" ).html( "Original String: '" + str + "'" );
$( "#trimmed" ).html( "Trimmed: '" + trimChar(str, "|") + "'" );
You can use a regular expression such as:
var x = "|f|oo||";
var y = x.replace(/^\|+|\|+$/g, "");
alert(y); // f|oo
UPDATE:
Should you wish to generalize this into a function, you can do the following:
/**
 * Prefixes every regular-expression special character with a backslash so
 * the text can be embedded in a pattern literally.
 *
 * @param {string} strToEscape - Raw text.
 * @returns {string} The text with special characters escaped.
 */
var escapeRegExp = function(strToEscape) {
  const regexSpecials = /[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g;
  return strToEscape.replace(regexSpecials, (special) => "\\" + special);
};
/**
 * Strips leading and trailing runs of the given character(s), after
 * escaping them with escapeRegExp for use inside a character class.
 *
 * @param {string} origString - Text to trim.
 * @param {string} charToTrim - Character(s) to strip.
 * @returns {string} The trimmed text.
 */
var trimChar = function(origString, charToTrim) {
  const charClass = "[" + escapeRegExp(charToTrim) + "]";
  const edgeRuns = new RegExp("^" + charClass + "+|" + charClass + "+$", "g");
  return origString.replace(edgeRuns, "");
};
var x = "|f|oo||";
var y = trimChar(x, "|");
alert(y); // f|oo
A regex-less version which is easy on the eye:
// Splits on `chars`, drops every empty piece (also those in the middle),
// and joins the remaining pieces back together with `chars`.
const trim = (str, chars) => {
  const nonEmptyPieces = [];
  for (const piece of str.split(chars)) {
    if (piece) {
      nonEmptyPieces.push(piece);
    }
  }
  return nonEmptyPieces.join(chars);
};
For use cases where we're certain that there's no repetition of the chars off the edges.
to keep this question up to date:
here is an approach i'd choose over the regex function using the ES6 spread operator.
/**
 * Trims one character from both ends of a string.
 *
 * The string is handled as an array of code points throughout, so the
 * indices stay consistent even when `character` is an astral symbol such as
 * an emoji. A string made up only of `character` yields ''.
 *
 * @param {string} string - Text to trim.
 * @param {string} character - Character (one code point) to strip.
 * @returns {string} The trimmed text.
 */
function trimByChar(string, character) {
  const codePoints = [...string];
  let first = 0;
  let last = codePoints.length;
  while (first < last && codePoints[first] === character) {
    first += 1;
  }
  while (last > first && codePoints[last - 1] === character) {
    last -= 1;
  }
  return codePoints.slice(first, last).join('');
}
Improved version after #fabian 's comment (can handle strings containing the same character only)
/**
 * Trims one character from both ends of a string; a string consisting only
 * of that character (or an empty string) yields ''.
 *
 * The leading and trailing counts are taken over code points and then
 * applied with `substring`.
 *
 * @param {string} string - Text to trim.
 * @param {string} character - Character to strip.
 * @returns {string} The trimmed text.
 */
function trimByChar1(string, character) {
  const symbols = Array.from(string);
  const differs = (symbol) => symbol !== character;
  const leading = symbols.findIndex(differs);
  // No differing symbol at all: nothing survives the trim.
  if (leading === -1) {
    return '';
  }
  const trailing = [...symbols].reverse().findIndex(differs);
  return string.substring(leading, string.length - trailing);
}
This can trim several characters at a time:
/**
 * Strips leading and trailing runs of any character in `c`.
 *
 * `c` is inserted verbatim into a regex character class, so characters that
 * are special inside a class must already be escaped by the caller.
 *
 * @param {string} str - Text to trim.
 * @param {string} c - Character-class content.
 * @returns {string} The trimmed text.
 */
function trimChars (str, c) {
  const run = `[${c}]+`;
  return str.replace(new RegExp(`^${run}|${run}$`, "g"), "");
}
var x = "|f|oo||";
x = trimChars(x, '|'); // f|oo
var y = "..++|f|oo||++..";
y = trimChars(y, '|.+'); // f|oo
var z = "\\f|oo\\"; // \f|oo\
// For backslash, remember to double-escape:
z = trimChars(z, "\\\\"); // f|oo
For use in your own script and if you don't mind changing the prototype, this can be a convenient "hack":
/**
 * Adds `trimChars` to every string: strips leading and trailing runs of any
 * character in `c`. `c` goes verbatim into a regex character class, so
 * class-special characters must be pre-escaped by the caller.
 *
 * @param {string} c - Character-class content.
 * @returns {string} The trimmed text.
 */
String.prototype.trimChars = function (c) {
  const run = `[${c}]+`;
  const edgeRuns = new RegExp(`^${run}|${run}$`, "g");
  return this.replace(edgeRuns, "");
}
var x = "|f|oo||";
x = x.trimChars('|'); // f|oo
Since I use the trimChars function extensively in one of my scripts, I prefer this solution. But there are potential issues with modifying an object's prototype.
If you define these functions in your program, your strings will have an upgraded version of trim that can trim all given characters:
// NOTE: these assignments replace the built-in trim/trimLeft/trimRight for
// every string in the program. `charlist` is inserted verbatim into a regex
// character class, so class-special characters must be escaped by the caller.

/**
 * Strips leading characters that belong to `charlist`.
 *
 * @param {string} [charlist] - Character-class content; defaults to whitespace.
 * @returns {string} The text without the leading run.
 */
String.prototype.trimLeft = function(charlist) {
  if (charlist === undefined)
    // "\\s" (not "\s"): the backslash must survive into the RegExp source,
    // otherwise the class would contain the letter "s".
    charlist = "\\s";
  return this.replace(new RegExp("^[" + charlist + "]+"), "");
};
/**
 * Strips characters that belong to `charlist` from both ends.
 *
 * @param {string} [charlist] - Character-class content; defaults to whitespace.
 * @returns {string} The trimmed text.
 */
String.prototype.trim = function(charlist) {
  return this.trimLeft(charlist).trimRight(charlist);
};
/**
 * Strips trailing characters that belong to `charlist`.
 *
 * @param {string} [charlist] - Character-class content; defaults to whitespace.
 * @returns {string} The text without the trailing run.
 */
String.prototype.trimRight = function(charlist) {
  if (charlist === undefined)
    charlist = "\\s";
  return this.replace(new RegExp("[" + charlist + "]+$"), "");
};
var withChars = "/-center-/"
var withoutChars = withChars.trim("/-")
document.write(withoutChars)
Source
https://www.sitepoint.com/trimming-strings-in-javascript/
// Two-pointer trim: `head` walks forward and `tail` walks backward while they
// sit on `char`; the slice between them is returned.
const trim = (str, char) => {
  let head = 0;
  for (; str[head] === char; head += 1) {
    // advance past leading matches
  }
  let tail = str.length - 1;
  for (; str[tail] === char; tail -= 1) {
    // retreat past trailing matches
  }
  return str.slice(head, tail + 1);
}
console.log(trim('|f|oo|', '|')); // f|oo
Non-regex solution.
Two pointers: i (beginning) & j (end).
Only move pointers if they match char and stop when they don't.
Return remaining string.
I would suggest looking at lodash and how they implemented the trim function.
See Lodash Trim for the documentation and the source to see the exact code that does the trimming.
I know this does not provide an exact answer your question, but I think it's good to set a reference to a library on such a question since others might find it useful.
This one trims all leading and trailing delimeters
/**
 * Trims all leading and trailing delimiter characters from a string.
 *
 * The delimiter is compared literally (no regular expression), so letters
 * such as "d" or "s" are not mistaken for regex classes, and a string made
 * only of delimiters yields ''. A multi-character delimiter is treated as a
 * set of characters.
 *
 * @param {string} str - Text to trim.
 * @param {string} delimiter - Character(s) to strip from both ends.
 * @returns {string} The trimmed text.
 */
const trim = (str, delimiter) => {
  let start = 0;
  let stop = str.length;
  while (start < stop && delimiter.includes(str[start])) {
    start += 1;
  }
  while (stop > start && delimiter.includes(str[stop - 1])) {
    stop -= 1;
  }
  return str.substring(start, stop);
}
const test = '||2|aaaa12bb3ccc|||||';
console.log(trim(test, '|')); // 2|aaaa12bb3ccc
I like the solution from #Pho3niX83...
Let's extend it with "word" instead of "char"...
/**
 * Trims whole occurrences of `_word` from both ends of `_string` by
 * splitting on the word and discarding the empty pieces at either end.
 *
 * @param {string} _string - Text to trim.
 * @param {string} _word - Word to strip.
 * @returns {string} The remaining pieces re-joined with `_word`.
 */
function trimWord(_string, _word) {
  const pieces = _string.split(_word);
  let first = 0;
  let last = pieces.length;
  while (first < last && pieces[first] === "") {
    first += 1;
  }
  while (last > first && pieces[last - 1] === "") {
    last -= 1;
  }
  return pieces.slice(first, last).join(_word);
};
The best way to resolve this task is (similar with PHP trim function):
/**
 * Trims the characters in `charlist` from both ends of a string, similar to
 * PHP's trim().
 *
 * @param {string} str - Text to trim.
 * @param {string} [charlist] - Character-class content (inserted verbatim
 *   into the pattern); defaults to whitespace.
 * @returns {string} The trimmed text.
 */
function trim( str, charlist ) {
  const chars = typeof charlist === 'undefined' ? '\\s' : charlist;
  // [\s\S] rather than `.` so the kept middle part may contain line breaks;
  // with `.` a multi-line string would not match and come back untrimmed.
  const pattern = '^[' + chars + ']*([\\s\\S]*?)[' + chars + ']*$';
  return str.replace( new RegExp( pattern ), '$1' );
}
document.getElementById( 'run' ).onclick = function() {
document.getElementById( 'result' ).value =
trim( document.getElementById( 'input' ).value,
document.getElementById( 'charlist' ).value);
}
<div>
<label for="input">Text to trim:</label><br>
<input id="input" type="text" placeholder="Text to trim" value="dfstextfsd"><br>
<label for="charlist">Charlist:</label><br>
<input id="charlist" type="text" placeholder="Charlist" value="dfs"><br>
<label for="result">Result:</label><br>
<input id="result" type="text" placeholder="Result" disabled><br>
<button type="button" id="run">Trim it!</button>
</div>
P.S.: Why did I post my answer when most people had already posted theirs? Because I found what I consider a mistake in the other answers: they all use the '+' quantifier instead of '*'. trim must remove the characters if they are present at the start and/or the end, and return the original string otherwise.
Another version to use regular expression.
No or(|) used and no global(g) used.
/**
 * Backslash-escapes every regular-expression special character in `s`.
 *
 * @param {string} s - Raw text.
 * @returns {string} Text that is safe to embed in a pattern.
 */
function escapeRegexp(s) {
  const specials = /[-\/\\^$*+?.()|[\]{}]/g;
  return s.replace(specials, (special) => '\\' + special);
}
/**
 * Trims every character contained in `find` from both ends of `value`.
 *
 * @param {string} value - Text to trim.
 * @param {string} find - Characters to strip; escaped with escapeRegexp
 *   before being placed in a character class.
 * @returns {string} The trimmed text.
 */
function trimSpecific(value, find) {
  const charClass = `[${escapeRegexp(find)}]`;
  // [\s\S] rather than `.` so the kept middle part may span line breaks;
  // with `.` a multi-line value would not match and come back untrimmed.
  return value.replace(new RegExp(`^${charClass}*([\\s\\S]*?)${charClass}*$`), '$1')
}
console.log(trimSpecific('"a"b"', '"') === 'a"b');
console.log(trimSpecific('""ab"""', '"') === 'ab');
console.log(trimSpecific('"', '"') === '');
console.log(trimSpecific('"a', '"') === 'a');
console.log(trimSpecific('a"', '"') === 'a');
console.log(trimSpecific('[a]', '[]') === 'a');
console.log(trimSpecific('{[a]}', '[{}]') === 'a');
expanding on #leaf 's answer, here's one that can take multiple characters:
/**
 * Strips leading and trailing runs of any character contained in `t`.
 *
 * Only the characters that are special inside a regex character class
 * (`\`, `]`, `^`, `-`) are escaped, so letters keep their literal meaning
 * and no stray backslash ends up in the set of trimmed characters.
 *
 * @param {string} s - Text to trim.
 * @param {string} t - Characters to strip.
 * @returns {string} The trimmed text.
 */
var trim = function (s, t) {
  const charClass = t.replace(/[\\\]^-]/g, '\\$&');
  return s.replace(new RegExp(`^[${charClass}]+|[${charClass}]+$`, 'g'), '');
}
/**
 * Removes `val` from the start and end of `text`.
 *
 * `val` is interpolated into the pattern as regex source, so the caller
 * must escape regex-special characters; the `+` quantifier applies to the
 * last element of `val`.
 *
 * @param {string} text - Text to trim.
 * @param {string} val - Regex source for what should be stripped.
 * @returns {string} The trimmed text.
 */
function trim(text, val) {
  const edgePattern = new RegExp(`^${val}+|${val}+$`, 'g');
  return text.replace(edgePattern, '');
}
"|Howdy".replace(new RegExp("^\\|"),"");
(note the double escaping. \\ needed, to have an actually single slash in the string, that then leads to escaping of | in the regExp).
Only few characters need regExp-Escaping., among them the pipe operator.
// Characters that are stripped from both ends of the input.
const special = ':;"<>?/!`~##$%^&*()+=-_ '.split("");
/**
 * Strips every leading and trailing character that appears in `special`.
 *
 * @param {string} input - Text to trim.
 * @returns {string} The text without special characters at either end.
 */
const trim = (input) => {
  const units = input.split("");
  let first = 0;
  let last = units.length;
  while (first < last && special.includes(units[first])) {
    first += 1;
  }
  while (last > first && special.includes(units[last - 1])) {
    last -= 1;
  }
  return units.slice(first, last).join("");
};
alert(trim('##This is what I use$%'));
/**
 * Removes a single leading occurrence of `n`.
 *
 * @param {string} n - The one character to strip; compared by its string form.
 * @returns {string} The string without its first character when that
 *   character is `n`, otherwise the string unchanged.
 */
String.prototype.TrimStart = function (n) {
  const text = String(this);
  return text.charAt(0) === String(n) ? text.slice(1) : text;
};
/**
 * Removes a single trailing occurrence of `n`.
 *
 * @param {string} n - The one character to strip; compared by its string form.
 * @returns {string} The string without its last character when that
 *   character is `n`, otherwise the string unchanged.
 */
String.prototype.TrimEnd = function (n) {
  const text = String(this);
  return text.slice(-1) === String(n) ? text.slice(0, -1) : text;
};
To my knowledge, jQuery doesn't have a built-in function for what you are asking about.
With javascript however, you can just use replace to change the content of your string:
x.replace(/|/i, ""));
This will replace all occurences of | with nothing.
try:
console.log(x.replace(/\|/g,''));
Try this method:
var a = "anan güzel mi?";
if (a.endsWith("?")) a = a.slice(0, -1);
document.body.innerHTML = a;

How to convert text to binary code in JavaScript?

I want JavaScript to translate text in a textarea into binary code.
For example, if a user types in "TEST" into the textarea, the value "01010100 01000101 01010011 01010100" should be returned.
I would like to avoid using a switch statement to assign each character a binary code value (e.g. case "T": return "01010100) or any other similar technique.
Here's a JSFiddle to show what I mean. Is this possible in native JavaScript?
What you should do is convert every char using charCodeAt function to get the Ascii Code in decimal. Then you can convert it to Binary value using toString(2):
/**
 * Reads the text of input #ti1 and writes the binary form of each UTF-16
 * code unit, each followed by a space, into input #ti2.
 */
function convert() {
  const output = document.getElementById("ti2");
  const input = document.getElementById("ti1").value;
  let bits = "";
  for (const unit of input.split("")) {
    bits += unit.charCodeAt(0).toString(2) + " ";
  }
  output.value = bits;
}
<input id="ti1" value ="TEST"/>
<input id="ti2"/>
<button onClick="convert();">Convert!</button>
And here's a fiddle: http://jsfiddle.net/fA24Y/1/
This might be the simplest you can get:
/**
 * Converts each UTF-16 code unit of a string to its binary representation
 * (no zero padding) and separates the groups with single spaces.
 *
 * @param {string} string - Text to convert.
 * @returns {string} Space-separated binary groups.
 */
function text2Binary(string) {
  const groups = [];
  for (let index = 0; index < string.length; index += 1) {
    groups.push(string.charCodeAt(index).toString(2));
  }
  return groups.join(' ');
}
traverse the string
convert every character to their char code
convert the char code to binary
push it into an array and add the left 0s
return a string separated by space
Code:
/**
 * Converts each UTF-16 code unit of `text` to binary, left-padded with
 * zeros to at least 8 digits, and joins the groups with spaces.
 *
 * Code units that need more than 8 bits are emitted at their natural width.
 *
 * @param {string} text - Text to convert.
 * @returns {string} Space-separated binary groups.
 */
function textToBin(text) {
  const output = [];
  for (let i = 0; i < text.length; i++) {
    const bin = text.charCodeAt(i).toString(2);
    output.push(bin.padStart(8, "0"));
  }
  return output.join(" ");
}
textToBin("!a") => "00100001 01100001"
Another way
/**
 * Converts each code point of `text` to binary, left-padded with zeros to
 * at least 8 digits, and joins the groups with spaces.
 *
 * Code points that need more than 8 bits are emitted at their natural width.
 *
 * @param {string} text - Text to convert.
 * @returns {string} Space-separated binary groups, one per code point.
 */
function textToBin(text) {
  return Array
    .from(text, (char) => char.codePointAt(0).toString(2).padStart(8, '0'))
    .join(' ');
}
Here's a pretty generic, native implementation, that I wrote some time ago,
// ABC - a generic, native JS (A)scii(B)inary(C)onverter.
// (c) 2013 Stephan Schmitz <eyecatchup#gmail.com>
// License: MIT, http://eyecatchup.mit-license.org
// URL: https://gist.github.com/eyecatchup/6742657
var ABC = {
  /**
   * Replaces every run of 8 binary digits (with optional surrounding
   * whitespace) by the character with that code.
   */
  toAscii(bin) {
    const octetPattern = /\s*[01]{8}\s*/g;
    return bin.replace(octetPattern, (octet) => String.fromCharCode(parseInt(octet, 2)));
  },
  /**
   * Replaces every character by its zero-padded binary code. Each group is
   * followed by a space unless `spaceSeparatedOctets` is loosely equal to
   * false (for example `false` or `0`).
   */
  toBinary(str, spaceSeparatedOctets) {
    const separator = spaceSeparatedOctets == false ? "" : " ";
    return str.replace(/[\s\S]/g, (unit) => ABC.zeroPad(unit.charCodeAt().toString(2)) + separator);
  },
  /** Left-pads `num` with zeros to 8 characters; longer values are returned as-is. */
  zeroPad(num) {
    const digits = String(num);
    return "00000000".slice(digits.length) + num;
  },
};
and to be used as follows:
var binary1 = "01100110011001010110010101101100011010010110111001100111001000000110110001110101011000110110101101111001",
binary2 = "01100110 01100101 01100101 01101100 01101001 01101110 01100111 00100000 01101100 01110101 01100011 01101011 01111001",
binary1Ascii = ABC.toAscii(binary1),
binary2Ascii = ABC.toAscii(binary2);
console.log("Binary 1: " + binary1);
console.log("Binary 1 to ASCII: " + binary1Ascii);
console.log("Binary 2: " + binary2);
console.log("Binary 2 to ASCII: " + binary2Ascii);
console.log("Ascii to Binary: " + ABC.toBinary(binary1Ascii)); // default: space-separated octets
console.log("Ascii to Binary /wo spaces: " + ABC.toBinary(binary1Ascii, 0)); // 2nd parameter false to not space-separate octets
Source is on Github (gist): https://gist.github.com/eyecatchup/6742657
Hope it helps. Feel free to use for whatever you want (well, at least for whatever MIT permits).
// Prints the 8-bit binary form of every character of `string`, separated by spaces.
const PADDING = "00000000";
const string = "TEST";
const resultArray = [];
for (let i = 0; i < string.length; i++) {
  const compact = string.charCodeAt(i).toString(2);
  // Take the missing leading zeros from PADDING; the slice is empty when
  // `compact` already has 8 or more digits.
  const padded = PADDING.substring(0, PADDING.length - compact.length) + compact;
  resultArray.push(padded);
}
console.log(resultArray.join(" "));
The other answers will work for most cases. But it's worth noting that charCodeAt() and related methods work on UTF-16 code units, not UTF-8 bytes: they do not throw, but for characters outside the standard ASCII range the values they return are not the character's UTF-8 encoding (and may not fit in 8 bits). Here's a workaround.
// UTF-8 to binary
/**
 * Encodes a string as UTF-8 and returns the bytes as one continuous string
 * of binary digits, 8 digits per byte.
 *
 * @param {string} s - Text to encode.
 * @returns {string} Concatenated 8-bit groups.
 */
var utf8ToBin = function( s ){
  // unescape(encodeURIComponent(...)) yields one character per UTF-8 byte.
  const byteString = unescape( encodeURIComponent( s ) );
  let out = '';
  for ( const byteChar of byteString.split( '' ) ) {
    out += byteChar.charCodeAt( 0 ).toString( 2 ).padStart( 8, '0' );
  }
  return out;
};
// Binary to UTF-8
/**
 * Decodes a string of binary digits (8 per byte) as UTF-8 text by turning
 * each byte into a percent escape and decoding the result.
 *
 * @param {string} s - Concatenated 8-bit groups.
 * @returns {string} The decoded text.
 */
var binToUtf8 = function( s ){
  let percentEncoded = '';
  for ( let offset = 0; offset < s.length; offset += 8 ) {
    const hex = parseInt( s.slice( offset, offset + 8 ), 2 ).toString( 16 );
    percentEncoded += '%' + ( hex.length % 2 === 0 ? hex : '0' + hex );
  }
  return decodeURIComponent( percentEncoded );
};
The escape/unescape() functions are deprecated. If you need polyfills for them, you can check out the more comprehensive UTF-8 encoding example found here: http://jsfiddle.net/47zwb41o
Just a hint into the right direction
var foo = "TEST",
res = [ ];
foo.split('').forEach(function( letter ) {
var bin = letter.charCodeAt( 0 ).toString( 2 ),
padding = 8 - bin.length;
res.push( new Array( padding+1 ).join( '0' ) + bin );
});
console.log( res );
8-bit characters with leading 0
'sometext'
.split('')
.map((char) => '00'.concat(char.charCodeAt(0).toString(2)).slice(-8))
.join(' ');
If you need 6 or 7 bit, just change .slice(-8)
Thank you Majid Laissi for your answer
I made 2 functions out from your code:
The goal was to implement conversion of a string to VARBINARY or BINARY and back.
/**
 * Truncates `string` to at most `maxBytes` UTF-16 code units and returns
 * the unpadded binary form of each unit, each followed by a space.
 *
 * @param {string} string - Text to convert.
 * @param {number} maxBytes - Maximum number of code units to keep
 *   (for BINARY 255, for VARBINARY 65535).
 * @returns {string} Binary groups, each terminated by a space.
 */
const stringToBinary = function(string, maxBytes) {
  const clipped = string.length > maxBytes ? string.substring(0, maxBytes) : string;
  return clipped
    .split('')
    .map((unit) => unit.charCodeAt(0).toString(2) + ' ')
    .join('');
};
and the backward conversion:
/**
 * Converts space-separated binary groups back into a string, one character
 * per group.
 *
 * Empty groups (for example the one produced by a trailing space) are
 * skipped, so no NUL characters are added to the result.
 *
 * @param {string} binary - Space-separated binary numbers.
 * @returns {string} The decoded text.
 */
const binaryToString = function(binary) {
  return binary
    .split(' ')
    .filter((group) => group !== '')
    .map((group) => String.fromCharCode(parseInt(group, 2)))
    .join('');
};
and here is a working example: https://jsbin.com/futalidenu/edit?js,console
Provided you're working in node or a browser with BigInt support, this version cuts costs by saving the expensive string construction for the very end:
const zero = 0n
const shift = 8n
/**
 * Packs the character codes of `str` into one BigInt, 8 bits per character,
 * and renders it once as binary digits padded to 8 digits per character.
 *
 * @param {string} str - Text to convert.
 * @returns {string} Continuous binary digits.
 */
function asciiToBinary (str) {
  const charCount = str.length
  let packed = zero
  let index = 0
  while (index < charCount) {
    packed = (packed << shift) + BigInt(str.charCodeAt(index))
    index += 1
  }
  const digits = packed.toString(2)
  return digits.padStart(charCount * 8, 0)
}
It's about twice as fast as the other solutions mentioned here including this simple es6+ implementation:
// Concatenates the binary form of every code point of `s`, each left-padded
// with zeros to at least 8 digits.
const toBinary = s => {
  let bits = ''
  for (const symbol of s) {
    bits += symbol.codePointAt().toString(2).padStart(8, 0)
  }
  return bits
}
If you need to handle unicode characters, here's this guy:
const zero = 0n
const shift = 8n
const bigShift = 16n
const byte = 255n
/**
 * Packs the UTF-16 code units of `str` into one BigInt and renders it as
 * binary digits. Code units up to 255 take an 8-bit slot, all others a
 * 16-bit slot.
 *
 * Reading code units (instead of code points at code-unit indices) keeps
 * every value within its 16-bit slot, so surrogate pairs are neither counted
 * twice nor overflow into the neighbouring slot.
 *
 * @param {string} str - Text to convert.
 * @returns {string} Binary digits padded to the total slot width
 *   (at least 8 digits, also for an empty string).
 */
function unicodeToBinary (str) {
  const len = str.length
  let n = zero
  let width = 0
  for (let i = 0; i < len; i++) {
    const bits = BigInt(str.charCodeAt(i))
    const slot = bits > byte ? bigShift : shift
    n = (n << slot) + bits
    width += Number(slot)
  }
  // Padding to the summed slot width also preserves leading zero-valued characters.
  return n.toString(2).padStart(Math.max(width, 8), '0')
}
this seems to be the simplified version
Array.from('abc').map((each)=>each.charCodeAt(0).toString(2)).join(" ")
This is as short as you can get. It's based on the top-rated answer but transformed to a reduce function.
"TEST".split("").reduce(function (a, b) { return a + b.charCodeAt(0).toString(2)}, "")
// Converts every UTF-16 code unit of `string` to unpadded binary and joins
// the groups with single spaces.
const textToBinary = (string) => {
  const groups = [];
  for (const unit of string.split('')) {
    groups.push(unit.charCodeAt().toString(2));
  }
  return groups.join(' ');
}
console.log(textToBinary('hello world'))
/**
 * UTF8ToBin: encodes a string as UTF-8 and returns the bytes as one
 * continuous string of binary digits (8 per byte).
 * binToUTF8: decodes such a digit string back into text.
 */
var UTF8ToBin = function (f) {
    const byteString = unescape(encodeURIComponent(f));
    let bits = "";
    for (let index = 0; index < byteString.length; index++) {
      let octet = byteString.charCodeAt(index).toString(2);
      // Left-pad until the group length is a multiple of 8.
      while (octet.length % 8 != 0) {
        octet = "0" + octet;
      }
      bits += octet;
    }
    return bits;
  },
  binToUTF8 = function (f) {
    let percentEncoded = "";
    for (let offset = 0; offset < f.length; offset += 8) {
      const hex = parseInt(f.substr(offset, 8), 2).toString(16);
      percentEncoded += "%" + (hex.length % 2 == 0 ? hex : "0" + hex);
    }
    return decodeURIComponent(percentEncoded);
  };
This is a small minified JavaScript Code to convert UTF8 to Binary and Vice versa.
This is a solution for UTF-8-based textual binary representation. It leverages TextEncoder, which encodes a string to its UTF-8 bytes.
This solution separates characters by spaces. The individual "byte-bits" of multi-byte characters are separated by a minus character (-).
// inspired by https://stackoverflow.com/a/40031979/923560
/**
 * Renders the UTF-8 encoding of a string as text: each character becomes
 * its bytes in 8-digit binary joined by "-", and characters are separated
 * by spaces.
 *
 * @param {string} inputString - Text to render.
 * @returns {string} The binary representation.
 */
function stringToUtf8BinaryRepresentation(inputString) {
  const encoder = new TextEncoder();
  const characters = [];
  for (const char of inputString) {
    const byteGroups = Array.from(encoder.encode(char), (byte) => byte.toString(2).padStart(8, '0'));
    characters.push(byteGroups.join('-'));
  }
  return characters.join(' ');
}
// ### example usage #########################
/**
 * Logs a separator line, the input string, and its UTF-8 binary
 * representation as produced by stringToUtf8BinaryRepresentation.
 */
function print(inputString) {
  const lines = [
    "--------------",
    inputString,
    stringToUtf8BinaryRepresentation(inputString),
  ];
  for (const line of lines) {
    console.log(line);
  }
}
// compare with https://en.wikipedia.org/wiki/UTF-8#Encoding
// compare with https://en.wikipedia.org/wiki/UTF-8#Codepage_layout
// compare with UTF-16, which JavaScript uses for strings: https://en.wikipedia.org/wiki/UTF-16#Examples
print("TEST");
print("hello world");
print("$");
print("£");
print("€");
print("한");
print("𐍈");
print("παράδειγμα");
print("🤡");
print("👨‍👩‍👧‍👦");
print("👩🏻‍🤝‍🧑🏿");
print("🇺🇦");
use the code: 'text'.split('').map(e=>{return e.charCodeAt(0).toString(2)}) e.g.-
const text='some text';
const output=text.split('').map(e=>{return e.charCodeAt(0).toString(2)})
Simple using Buffer
const text = "TEST";
[...Buffer.from(text).values()] // [ 84, 69, 83, 84 ]
.map(byte => byte.toString(2).padStart(8, 0)) // [ '01010100', '01000101', '01010011', '01010100' ]
.join(' ') // '01010100 01000101 01010011 01010100'
The shortest and simplest solution:
"x".charCodeAt().toString(2) // 1111000
String.charCodeAt() charCodeAt(0) returns unicode: "x".charCodeAt() // 120
Object.toString() charCodeAt().toString(2) converts unicode to binary.
For multiple string characters:
[..."Tesla"].map((i) => i.charCodeAt().toString(2)).join(" ");
// 1010100 1100101 1110011 1101100 1100001
Spread syntax (...)
[..."Tesla"] // ['T', 'e', 's', 'l', 'a']
Array.map()
[..."Tesla"].map((i) => i.charCodeAt()) // [84, 101, 115, 108, 97]
Array.join() Put a space " " after each element in the array map(i) and convert the array to string.
I'm pretty sure that you can do something like this:
Returns a STRING:
/**
 * Concatenate the binary form of every UTF-16 code unit in a string.
 *
 * The per-unit digits are neither padded nor separated, so the boundaries
 * between code units cannot be recovered from the result.
 *
 * @param {string} str - Text to convert.
 * @returns {string} Binary digits of all code units, back to back.
 */
const toBinary = (str) => {
  let bits = "";
  for (let pos = 0; pos < str.length; pos += 1) {
    bits += str.charCodeAt(pos).toString(2);
  }
  return bits;
};
Or, as an int:
/**
 * Build the unpadded binary digits of every UTF-16 code unit, concatenate
 * them, and read the resulting digit string as a base-10 number.
 *
 * For example "x" (code unit 120, binary 1111000) yields the number 1111000.
 * An empty input yields NaN because there are no digits to parse.
 *
 * @param {string} str - Text to convert.
 * @returns {number} The digit string interpreted as a decimal number.
 */
const toBinary = (str) => {
  let digits = "";
  for (let pos = 0; pos < str.length; pos += 1) {
    digits += str.charCodeAt(pos).toString(2);
  }
  // The digits are only 0 and 1, so base 10 is what the radix-less call used.
  return parseInt(digits, 10);
};

JavaScript strings outside of the BMP

BMP being Basic Multilingual Plane
According to JavaScript: the Good Parts:
JavaScript was built at a time when Unicode was a 16-bit character set, so all characters in JavaScript are 16 bits wide.
This leads me to believe that JavaScript uses UCS-2 (not UTF-16!) and can only handle characters up to U+FFFF.
Further investigation confirms this:
> String.fromCharCode(0x20001);
The fromCharCode method seems to only use the lowest 16 bits when returning the Unicode character. Trying to get U+20001 (CJK unified ideograph 20001) instead returns U+0001.
Question: is it at all possible to handle post-BMP characters in JavaScript?
2011-07-31: slide twelve from Unicode Support Shootout: The Good, The Bad, & the (mostly) Ugly covers issues related to this quite well:
Depends what you mean by ‘support’. You can certainly put non-UCS-2 characters in a JS string using surrogates, and browsers will display them if they can.
But, each item in a JS string is a separate UTF-16 code unit. There is no language-level support for handling full characters: all the standard String members (length, split, slice etc) all deal with code units not characters, so will quite happily split surrogate pairs or hold invalid surrogate sequences.
If you want surrogate-aware methods, I'm afraid you're going to have to start writing them yourself! For example:
/**
 * Length of the string counted in code points: every well-formed surrogate
 * pair (high surrogate followed by low surrogate) counts once instead of twice.
 *
 * @returns {number} Number of code units minus the number of surrogate pairs.
 */
String.prototype.getCodePointLength = function () {
  var pairs = this.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
  var pairCount = pairs === null ? 0 : pairs.length;
  return this.length - pairCount;
};
// Install only as a fallback: engines that ship String.fromCodePoint keep
// their native (validating) implementation instead of being overwritten.
if (!String.fromCodePoint) {
  /**
   * Build a string from Unicode code points, emitting a surrogate pair for
   * every code point above U+FFFF.
   *
   * @param {...number} codePoints - Code points in the range 0..0x10FFFF.
   * @returns {string} The string made of the given code points, in order.
   * @throws {RangeError} If an argument is not an integer in 0..0x10FFFF.
   */
  String.fromCodePoint = function () {
    var units = [];
    for (var i = 0; i < arguments.length; i++) {
      var cp = Number(arguments[i]);
      if (!(cp >= 0 && cp <= 0x10FFFF) || Math.floor(cp) !== cp) {
        throw new RangeError("Invalid code point " + arguments[i]);
      }
      if (cp > 0xFFFF) {
        // Split the 20 bits above 0x10000 into high and low surrogates.
        var n = cp - 0x10000;
        units.push(0xD800 + (n >> 10), 0xDC00 + (n & 0x3FF));
      } else {
        units.push(cp);
      }
    }
    return String.fromCharCode.apply(null, units);
  };
}
I came to the same conclusion as bobince. If you want to work with strings containing unicode characters outside of the BMP, you have to reimplement javascript's String methods. This is because javascript counts characters as each 16-bit code value. Symbols outside of the BMP need two code values to be represented. You therefore run into a case where some symbols count as two characters and some count only as one.
I've reimplemented the following methods to treat each unicode code point as a single character: .length, .charCodeAt, .fromCharCode, .charAt, .indexOf, .lastIndexOf, .slice, and .split.
You can check it out on jsfiddle: http://jsfiddle.net/Y89Du/
Here's the code without comments. I tested it, but it may still have errors. Comments are welcome.
if (!String.prototype.ucLength) {
  /**
   * Length of the string in code points rather than UTF-16 code units.
   * Each high-surrogate/low-surrogate pair is counted as one character.
   *
   * @returns {number} Code-unit length minus the number of surrogate pairs.
   */
  String.prototype.ucLength = function () {
    // approach taken from
    // http://stackoverflow.com/questions/3744721/javascript-strings-outside-of-the-bmp
    var pairs = this.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
    var pairCount = pairs === null ? 0 : pairs.length;
    return this.length - pairCount;
  };
}
if (!String.prototype.codePointAt) {
  /**
   * Fallback codePointAt, installed only when the engine has none.
   *
   * The position counts code points: a surrogate pair occupies one position.
   * NOTE(review): the standard String.prototype.codePointAt takes a UTF-16
   * code-unit index instead, so the two differ after any astral character.
   *
   * @param {number} ucPos - Zero-based code point position; NaN means 0.
   * @returns {?number} The code point at that position, or null if the
   *   position is never reached.
   */
  String.prototype.codePointAt = function (ucPos) {
    var wanted = isNaN(ucPos) ? 0 : ucPos;
    var text = String(this);
    var unit = 0;
    for (var seen = 0; unit < text.length; seen++) {
      var lead = text.charCodeAt(unit);
      var trail = text.charCodeAt(unit + 1);
      var isPair = lead >= 0xD800 && lead <= 0xDBFF &&
        trail >= 0xDC00 && trail <= 0xDFFF;
      // Loose comparison kept on purpose: numeric strings match as well.
      if (seen == wanted) {
        return isPair
          ? ((lead - 0xD800) * 0x400) + (trail - 0xDC00) + 0x10000
          : lead;
      }
      unit += isPair ? 2 : 1;
    }
    return null;
  };
}
if (!String.fromCodePoint) {
  /**
   * Fallback fromCodePoint, installed only when the engine has none.
   * Arguments above 0xFFFF are emitted as a surrogate pair; anything else is
   * handed to String.fromCharCode unchanged. No range validation is done.
   *
   * @param {...number} codePoints - Code points to convert.
   * @returns {string} The concatenated characters.
   */
  String.fromCodePoint = function () {
    var out = "";
    for (var k = 0; k < arguments.length; k += 1) {
      var cp = arguments[k];
      var rest = cp - 0x10000;
      if (cp > 0xFFFF) {
        out += String.fromCharCode(0xD800 + (rest >> 10), 0xDC00 + (rest & 0x3FF));
      } else {
        out += String.fromCharCode(cp);
      }
    }
    return out;
  };
}
if (!String.prototype.ucCharAt) {
  /**
   * Return the character at a code point position, keeping surrogate pairs
   * together as one character.
   *
   * The string is walked directly instead of calling codePointAt: the native
   * codePointAt takes a UTF-16 code-unit index, so delegating to it returns
   * the wrong character after any astral symbol.
   *
   * @param {number} ucIndex - Zero-based code point position; NaN means 0.
   * @returns {string} The character at that position, or "" when the
   *   position is outside the string.
   */
  String.prototype.ucCharAt = function (ucIndex) {
    var str = String(this);
    var target = isNaN(ucIndex) ? 0 : Number(ucIndex);
    var unit = 0;
    for (var seen = 0; unit < str.length; seen++) {
      var code = str.charCodeAt(unit);
      var next = str.charCodeAt(unit + 1);
      var isPair = code >= 0xD800 && code <= 0xDBFF &&
        next >= 0xDC00 && next <= 0xDFFF;
      var width = isPair ? 2 : 1;
      if (seen === target) {
        return str.slice(unit, unit + width);
      }
      unit += width;
    }
    return "";
  };
}
if (!String.prototype.ucIndexOf) {
  /**
   * Find the first occurrence of a substring, with positions counted in
   * code points. Relies on the ucLength and ucSlice string methods.
   *
   * @param {string} searchStr - Text to look for (converted with String()).
   * @param {number} [ucStart] - Code point position to start from; NaN or a
   *   negative value means 0.
   * @returns {number} Position of the first match, or -1 if there is none.
   */
  String.prototype.ucIndexOf = function (searchStr, ucStart) {
    var from = (isNaN(ucStart) || ucStart < 0) ? 0 : ucStart;
    var haystack = String(this);
    var haystackLen = haystack.ucLength();
    var needle = String(searchStr);
    var needleLen = needle.ucLength();
    // Compare a needle-sized window at every candidate position.
    for (var pos = from; pos < haystackLen; pos++) {
      if (haystack.ucSlice(pos, pos + needleLen) === needle) {
        return pos;
      }
    }
    return -1;
  };
}
if (!String.prototype.ucLastIndexOf) {
  /**
   * Find the last occurrence of a substring, with positions counted in
   * code points. Relies on the ucLength and ucSlice string methods.
   *
   * @param {string} searchStr - Text to look for (converted with String()).
   * @param {number} [ucStart] - Code point position to start searching
   *   backwards from; NaN or a value past the end means the last position.
   * @returns {number} Position of the last match, or -1 if there is none.
   */
  String.prototype.ucLastIndexOf = function (searchStr, ucStart) {
    var haystack = String(this);
    var haystackLen = haystack.ucLength();
    var from = (isNaN(ucStart) || ucStart >= haystackLen)
      ? haystackLen - 1
      : ucStart;
    var needle = String(searchStr);
    var needleLen = needle.ucLength();
    // Walk backwards, comparing a needle-sized window at each position.
    for (var pos = from; pos >= 0; pos--) {
      if (haystack.ucSlice(pos, pos + needleLen) === needle) {
        return pos;
      }
    }
    return -1;
  };
}
if (!String.prototype.ucSlice) {
  /**
   * Code-point-aware counterpart of String.prototype.slice: positions count
   * whole characters, so a surrogate pair is never cut in half.
   *
   * @param {number} [ucStart] - Start position (inclusive). NaN means 0; a
   *   negative value counts back from the end.
   * @param {number} [ucStop] - Stop position (exclusive). Omitted means the
   *   end of the string; a negative value counts back from the end.
   * @returns {string} The selected characters, or "" for an empty range.
   */
  String.prototype.ucSlice = function (ucStart, ucStop) {
    // One array entry per code point: a surrogate pair, else a single unit.
    var ucChars = String(this).match(/[\uD800-\uDBFF][\uDC00-\uDFFF]|[\s\S]/g) || [];
    var count = ucChars.length;

    var start = isNaN(ucStart) ? 0 : Number(ucStart);
    if (start < 0) {
      start = Math.max(count + start, 0);
    }

    // The stop bound is exclusive, so the default must be the full length;
    // defaulting to length - 1 silently dropped the last character.
    var stop = (typeof ucStop === 'undefined') ? count : Number(ucStop);
    if (stop < 0) {
      stop = Math.max(count + stop, 0);
    }

    // Array slice clamps a stop beyond the end instead of reading past it.
    return ucChars.slice(start, stop).join("");
  };
}
if (!String.prototype.ucSplit) {
  /**
   * Split a string like String.prototype.split, except that an empty-string
   * delimiter yields whole code points (surrogate pairs stay together)
   * instead of individual UTF-16 code units.
   *
   * @param {string|RegExp} delimeter - Separator; '' splits by code point,
   *   anything else is passed to the native split.
   * @param {number} [limit] - Maximum number of pieces; omitted means all.
   * @returns {string[]} The pieces.
   */
  String.prototype.ucSplit = function (delimeter, limit) {
    var str = String(this);
    if (delimeter !== '') {
      return str.split(delimeter, limit);
    }
    // One array entry per code point: a surrogate pair, else a single unit.
    var ucChars = str.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]|[\s\S]/g) || [];
    // Only truncate when a limit was given: 0 + undefined is NaN, and
    // slice(0, NaN) returned an empty array for every call without a limit.
    if (typeof limit === 'undefined') {
      return ucChars;
    }
    // >>> 0 applies the same unsigned 32-bit coercion as the native split.
    return ucChars.slice(0, limit >>> 0);
  };
}
More recent JavaScript engines have String.fromCodePoint.
const ideograph = String.fromCodePoint( 0x20001 ); // outside the BMP
Also a code-point iterator, which gets you the code-point length.
/**
 * Count the items produced by the string's iterator, i.e. its code points
 * (a surrogate pair is yielded as a single item).
 *
 * @param {string} str - Text to measure.
 * @returns {number} Number of code points.
 */
function countCodePoints(str) {
  let total = 0;
  for (const _codePoint of str) {
    total += 1;
  }
  return total;
}
console.log( ideograph.length ); // gives '2'
console.log( countCodePoints(ideograph) ); // '1'
Yes, you can. Although support to non-BMP characters directly in source documents is optional according to the ECMAScript standard, modern browsers let you use them. Naturally, the document encoding must be properly declared, and for most practical purposes you would need to use the UTF-8 encoding. Moreover, you need an editor that can handle UTF-8, and you need some input method(s); see e.g. my Full Unicode Input utility.
Using suitable tools and settings, you can write var foo = '𠀁'.
The non-BMP characters will be internally represented as surrogate pairs, so each non-BMP character counts as 2 in the string length.
Using for (c of this) instruction, one can make various computations on a string that contains non-BMP characters. For instance, to compute the string length, and to get the nth character of the string:
/**
 * Length of the string in code points, obtained by stepping through the
 * string's iterator (which yields a surrogate pair as one item).
 *
 * @returns {number} Number of items the iterator produces.
 */
String.prototype.magicLength = function () {
  var walker = this[Symbol.iterator]();
  var total = 0;
  while (!walker.next().done) {
    total += 1;
  }
  return total;
};
/**
 * Return the n-th code point of the string as a string, counting a
 * surrogate pair as one character.
 *
 * @param {number} n - Zero-based code point position (compared loosely, so
 *   a numeric string also matches).
 * @returns {string} The character at that position, or "" if the position
 *   is never reached.
 */
String.prototype.magicCharAt = function (n) {
  var walker = this[Symbol.iterator]();
  var pos = 0;
  for (var step = walker.next(); !step.done; step = walker.next()) {
    if (pos == n) {
      return step.value + "";
    }
    pos += 1;
  }
  return "";
};
This old topic has now a simple solution in ES6:
Split characters into an array
simple version
[..."😴😄😃⛔🎠🚓🚇"] // ["😴", "😄", "😃", "⛔", "🎠", "🚓", "🚇"]
Then having each one separated you can handle them easily for most common cases.
Credit: DownGoat
Full solution
To overcome special emojis such as the one in the comment, one can search for the connecting character (the zero-width joiner, U+200D, char code 8205) and make some modifications. Here is how:
/**
 * Re-join code points that belong to one zero-width-joiner (ZWJ) sequence.
 *
 * Spreading a string yields one entry per code point, which breaks a joined
 * emoji into its parts plus the U+200D joiners between them. Whenever a
 * joiner has both a preceding entry and a following code point, the joiner
 * and that following code point are appended to the preceding entry.
 *
 * @param {string[]} codePoints - One string per code point, e.g. [...str].
 * @returns {string[]} A new array in which each joined sequence is a single
 *   entry; the input array is not modified.
 */
function mergeZwjSequences(codePoints) {
  const ZERO_WIDTH_JOINER = 0x200D; // char code 8205
  const merged = [];
  for (let i = 0; i < codePoints.length; i++) {
    const isJoiner = codePoints[i].charCodeAt(0) === ZERO_WIDTH_JOINER;
    // A joiner at the very start or end has nothing to connect; keep it as is.
    if (isJoiner && merged.length > 0 && i + 1 < codePoints.length) {
      merged[merged.length - 1] += codePoints[i] + codePoints[i + 1];
      i++; // the following code point has been consumed as well
    } else {
      merged.push(codePoints[i]);
    }
  }
  return merged;
}

let myStr = "👩‍👩‍👧‍👧😃𝌆"
let arr = mergeZwjSequences([...myStr]);
console.log(arr.length) //3
Haven't found a case where this doesn't work. Comment if you do.
To conclude
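It seems that char code 8205 (U+200D, the zero-width joiner) is what combines several emoji into a single displayed glyph. This is a Unicode feature and is separate from the UTF-16 surrogate pairs that JS uses to represent characters outside the BMP.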

Categories

Resources