Get the value of a byte with JavaScript

I'm not sure if the title makes any sense, but here is what I need.
This is my byte: ���������ˇ�����0F*E��ù� �
I have already managed to get the values of this byte with this php snippet:
<?php
$var = "���������ˇ�����0F*E��ù�";
for ($i = 0; $i < strlen($var); $i++)
{
    echo ord($var[$i])."<br/>";
}
?>
The result was:
0
0
0
0
0
0
0
0
2
2
0
255
0
0
0
0
0
2
48
70
1
42
69
0
0
1
157
0
But now I need to do the exact same thing without PHP, in JavaScript.
Any help would be appreciated.

If you want to get the numeric value of each character in a string in JavaScript, that can be done like so:
var someString = "blarg";
for (var i = 0; i < someString.length; i++) {
    var code = someString.charCodeAt(i); // numeric value of the character at index i
}
String.prototype.charCodeAt(index) returns the UTF-16 code unit value of the character at the specified index in the string. It does not behave like PHP or C, where ord returns the numeric value of a fixed 8-bit encoding (i.e. ASCII). Assuming your string is a human-readable string (as opposed to raw binary data), using charCodeAt is perfectly fine. If you're working with raw binary data, don't use a JavaScript string.
If your strings contain only characters with code points below 128, then charCodeAt behaves the same as ord in PHP or C's char type; however, the example you've provided contains non-ASCII characters, so Unicode's (sometimes complicated) rules will come into play.
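For example (a quick illustration of my own, not from the original answer): the character "é" is U+00E9, so charCodeAt reports a single value, while PHP's ord on a UTF-8 encoded string sees that same character as two separate bytes:
var s = "é";          // U+00E9
s.charCodeAt(0);      // 233 - one UTF-16 code unit
// In PHP (source saved as UTF-8), ord($s[0]) is 195 and ord($s[1]) is 169,
// because ord() works on individual bytes, not characters.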
See the documentation on charCodeAt here: https://developer.mozilla.org/en-US/docs/JavaScript/Reference/Global_Objects/String/charCodeAt

PHP strings are sequences of 8-bit bytes (0..255), while JavaScript strings are sequences of 16-bit UTF-16 code units (0..65535). Depending on your string, you may either split it into (16-bit) char codes or into individual bytes. If you know your string contains only 8-bit characters, you may ignore the "hiByte" (see below) to get the same results as in PHP.
function toByteVersionA(s) {
    var len = s.length;
    // one char code per character
    var charCodes = [];
    for (var i = 0; i < len; i++) {
        charCodes.push(s.charCodeAt(i).toString());
    }
    var charCodesString = charCodes.join(" ");
    return charCodesString;
}

function toByteVersionB(s) {
    var len = s.length;
    var bytes = [];
    for (var i = 0; i < len; i++) {
        var charCode = s.charCodeAt(i);
        var loByte = charCode & 255; // low 8 bits
        var hiByte = charCode >> 8;  // high 8 bits
        bytes.push(loByte.toString());
        bytes.push(hiByte.toString());
    }
    var bytesString = bytes.join(" ");
    return bytesString;
}

function toByteVersionC(s) {
    var len = s.length;
    var bytes = [];
    for (var i = 0; i < len; i++) {
        var charCode = s.charCodeAt(i);
        var loByte = charCode & 255; // low 8 bits only
        bytes.push(loByte.toString());
    }
    var bytesString = bytes.join(" ");
    return bytesString;
}
var myString = "abc"; // whatever your String is
var myBytes = toByteVersionA(myString); // whatever version you want
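For reference, here is a quick check of my own showing what the three versions return for the sample string "abc" (97, 98, 99 are the char codes of a, b and c):
toByteVersionA("abc"); // "97 98 99"       - one char code per character
toByteVersionB("abc"); // "97 0 98 0 99 0" - low byte, then high byte, per character
toByteVersionC("abc"); // "97 98 99"       - low byte only, matching PHP's ord for 8-bit strings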

Related

Converting UUID to bytes [duplicate]


How to convert big number to string value in JavaScript without using any external lib?

Convert the ASCII value sentence to its equivalent string
This is for writing a program similar to the one given above. I tried converting the input directly to a string so I could iterate over it.
What is happening is this: assume the input value is
var num = 23511011501782351112179911801562340161171141148;
When I convert this number to string
num.toString()
I'm getting the result like this:
"2.351101150178235e+46"
There are many similar questions on Stack Overflow, but I didn't see any proper answers.
Can someone help me with how to iterate over each digit in the input?
Thanks in advance.
If I understand you correctly, you can replace the code char ch = (char)num; with var ch = String.fromCharCode(num);. Like this:
var result = [];

function asciiToSentence(str, len)
{
    var num = 0;
    for (var i = 0; i < len; i++) {
        // Append the current digit
        num = num * 10 + (str[i] - '0');
        // If num is within the required range
        if (num >= 32 && num <= 122) {
            // Convert num to char
            var ch = String.fromCharCode(num);
            result.push(ch);
            // Reset num to 0
            num = 0;
        }
    }
}

var str = "7110110110711510211111471101101107115";
var len = str.length;
asciiToSentence(str, len);
console.log(result.join(''));
Reference: String.fromCharCode()
Explanation:
First, the function asciiToSentence takes 2 parameters, str and len (str is the input string and len is its length).
Next, we keep a running value num to build up the number hidden in the string, based on this table: ASCII Printable Characters
We parse the characters one by one, multiplying the running value by 10 and adding each digit. Then we check whether it falls between 32 and 122 (based on the numbers in the table above).
If the number is inside the range, we convert it to a character with the String.fromCharCode function and reset num. Otherwise, we continue the loop and keep building up num.
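One caveat worth adding (my note, not part of the original answer): the numeric literal in the question is far larger than Number.MAX_SAFE_INTEGER (2^53 - 1), so the digits are already lost the moment it is stored in a number variable; that is why num.toString() prints scientific notation. Keep the digits as a string from the start, as the usage above does:
var num = 23511011501782351112179911801562340161171141148; // precision is already lost here
num.toString(); // "2.351101150178235e+46"
var digits = "23511011501782351112179911801562340161171141148"; // keep it as a string instead
asciiToSentence(digits, digits.length); // every digit is preserved and can be iterated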

Longitudinal redundancy check in Javascript

I'm working with a system that integrates a Point of Sale (POS) device; I use Chrome's serial API to scan ports and read credit card data.
The problem I'm facing is that I need to concat the LRC from a string in this format:
STX = '\002' (2 HEX) (Start of text)
LLL = Length of data (doesn't include STX or ETX, but does include the command).
Command C50 {C = a message from PC to POS, 50 = the actual code that "prints" a message on the POS}
ETX = '\003' (3 HEX) (End of text)
LRC = Longitudinal Redundancy Check
A message example would be as follows:
'\002014C50HELLO WORLD\003'
Here we can see 002 as STX, 014 is the length from C50 to D, and 003 as ETX.
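To make the framing concrete, a small helper along these lines (my own sketch with a made-up name, not part of any POS vendor API) produces the example message above; the LRC still has to be appended afterwards:
function buildFrame(command, data) {
    var payload = command + data;                 // e.g. "C50" + "HELLO WORLD"
    var lll = ('000' + payload.length).slice(-3); // zero-padded length, e.g. "014"
    return '\u0002' + lll + payload + '\u0003';   // STX + LLL + payload + ETX
}
buildFrame('C50', 'HELLO WORLD'); // '\u0002014C50HELLO WORLD\u0003'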
I found some algorithms in C# like this one or this one and even this one in Java, I even saw this question that was removed from SO on Google's cache, which actually asks the same as I but had no examples or answers.
I also made this Java algorithm:
private int calculateLRC(String str) {
    int result = 0;
    for (int i = 0; i < str.length(); i++) {
        String char1 = str.substring(i, i + 1);
        char[] char2 = char1.toCharArray();
        int number = char2[0];
        result = result ^ number;
    }
    return result;
}
and tried porting it to JavaScript (where I have poor knowledge):
function calculateLRC2(str) {
    var result = 0;
    for (var i = 0; i < str.length; i++) {
        var char1 = str.substring(i, i + 1);
        //var char2[] = char1.join('');
        var number = char1;
        result = result ^ number;
    }
    return result.toString();
}
and after following Wikipedia's pseudocode I tried doing this:
function calculateLRC(str) {
    var buffer = convertStringToArrayBuffer(str);
    var lrc;
    for (var i = 0; i < str.length; i++) {
        lrc = (lrc + buffer[i]) & 0xFF;
    }
    lrc = ((lrc ^ 0xFF) + 1) & 0xFF;
    return lrc;
}
This is how I call the above method:
var finalMessage = '\002014C50HELLO WORLD\003'
var lrc = calculateLRC(finalMessage);
console.log('lrc: ' + lrc);
finalMessage = finalMessage.concat(lrc);
console.log('finalMessage: ' + finalMessage);
However, after trying all these methods, I still can't send a message to the POS correctly. I've spent three days trying to fix this and can't do anything more until I finish it.
Does anyone know another way to calculate the LRC, or can you tell me what I'm doing wrong here? It needs to be in JavaScript, since the POS communicates with the PC through Node.js.
Oh, by the way, the convertStringToArrayBuffer code is from the chrome.serial documentation, which is this one:
var writeSerial = function(str) {
    chrome.serial.send(connectionId, convertStringToArrayBuffer(str), onSend);
}
// Convert string to ArrayBuffer
var convertStringToArrayBuffer = function(str) {
    var buf = new ArrayBuffer(str.length);
    var bufView = new Uint8Array(buf);
    for (var i = 0; i < str.length; i++) {
        bufView[i] = str.charCodeAt(i);
    }
    return buf;
}
Edit: After testing, I came up with this algorithm, which returns a lowercase 'z' for the following input: \002007C50HOLA\003.
function calculateLRC(str) {
    var bytes = [];
    var lrc = 0;
    for (var i = 0; i < str.length; i++) {
        bytes.push(str.charCodeAt(i));
    }
    for (var i = 0; i < str.length; i++) {
        lrc ^= bytes[i];
        console.log('lrc: ' + lrc);
        //console.log('lrcString: ' + String.fromCharCode(lrc));
    }
    console.log('bytes: ' + bytes);
    return String.fromCharCode(lrc);
}
However, with some longer inputs, and especially when trying to read card data, the LRC sometimes comes out as a control character, which might be a problem in my case since I put it into my string. Is there a way to force the LRC to avoid those characters? Or maybe I'm doing it wrong, and that's why I'm getting those characters as output.
I solved the LRC issue by calculating it with the following method, after reading @Jack A.'s answer and modifying it to this one:
function calculateLRC(str) {
    var bytes = [];
    var lrc = 0;
    for (var i = 0; i < str.length; i++) {
        bytes.push(str.charCodeAt(i));
    }
    for (var i = 0; i < str.length; i++) {
        lrc ^= bytes[i];
    }
    return String.fromCharCode(lrc);
}
Explanation of what it does:
1st: it converts the received string to its character-code equivalent (charCodeAt()).
2nd: it calculates the LRC by XORing the previously calculated LRC (0 on the 1st iteration) with each character's code.
3rd: it converts the result back to its equivalent character (fromCharCode()) and returns that char to the calling function.
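Putting it together with the snippet from the question (my own usage note), the returned character is simply appended to the framed message before sending:
var finalMessage = '\u0002014C50HELLO WORLD\u0003';
var lrc = calculateLRC(finalMessage);    // one character whose code is the XOR of all char codes
finalMessage = finalMessage.concat(lrc); // frame now ends with ETX followed by the LRC byte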
Your pseudocode-based algorithm is using addition. For the XOR version, try this:
function calculateLRC(str) {
    // view the ArrayBuffer as bytes; indexing the raw ArrayBuffer would yield undefined
    var buffer = new Uint8Array(convertStringToArrayBuffer(str));
    var lrc = 0;
    for (var i = 0; i < str.length; i++) {
        lrc = (lrc ^ buffer[i]) & 0xFF;
    }
    return lrc;
}
I think your original attempt at the XOR version was failing because you needed to get the character code. The number variable still contained a string when you did result = result ^ number, so the results were probably not what you expected.
This is a SWAG since I don't have Node.JS installed at the moment so I can't verify it will work.
Another thing I would be concerned about is character encoding. JavaScript uses UTF-16 for text, so converting any non-ASCII characters to 8-bit bytes may give unexpected results.
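If non-ASCII data does show up, one way to stay byte-oriented (a sketch of mine that assumes a TextEncoder-capable environment, not something from the original answer) is to XOR over the UTF-8 bytes instead of UTF-16 code units:
function calculateLrcUtf8(str) {
    var bytes = new TextEncoder().encode(str); // Uint8Array of UTF-8 bytes
    var lrc = 0;
    for (var i = 0; i < bytes.length; i++) {
        lrc ^= bytes[i];
    }
    return lrc; // numeric value 0-255
}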

Word Array to String

How can I do this in JavaScript or jQuery?
Please suggest, in 2 steps:
1. Word array to single byte array.
2. Byte array to string.
Maybe this can help:
function hex2a(hex) {
    var str = '';
    for (var i = 0; i < hex.length; i += 2)
        str += String.fromCharCode(parseInt(hex.substr(i, 2), 16));
    return str;
}
What you are trying to achieve is already implemented in CryptoJS. From the documentation:
You can convert a WordArray object to other formats by explicitly calling the toString method and passing an encoder.
var hash = CryptoJS.SHA256("Message");
alert(hash.toString(CryptoJS.enc.Base64));
alert(hash.toString(CryptoJS.enc.Hex));
Honestly I have no idea why you want to implement that yourself... But if you absolutely need to do it "manually" in the 2 steps you mentioned, you could try something like this:
function wordToByteArray(wordArray) {
    var byteArray = [], word, i, j;
    for (i = 0; i < wordArray.length; ++i) {
        word = wordArray[i];
        for (j = 3; j >= 0; --j) {
            byteArray.push((word >> 8 * j) & 0xFF);
        }
    }
    return byteArray;
}

function byteArrayToString(byteArray) {
    var str = "", i;
    for (i = 0; i < byteArray.length; ++i) {
        str += escape(String.fromCharCode(byteArray[i]));
    }
    return str;
}

var hash = CryptoJS.SHA256("Message");
var byteArray = wordToByteArray(hash.words);
alert(byteArrayToString(byteArray));
The wordToByteArray function should work perfectly, but be aware that byteArrayToString will produce weird results in almost every case. I don't know much about encodings, but ASCII only covers 7 bits, so not every byte value maps to a printable ASCII character. That's why I added the escape call, so you can at least display all the strange characters you might get. ;)
I'd recommend you use the functions CryptoJS has already implemented, or just use the byte array (without converting it to a string) for your analysis.
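As a sanity check (my own usage note, assuming CryptoJS is loaded), the byte array produced by wordToByteArray lines up with the library's own hex encoder, and SHA-256 always yields 32 bytes (8 words of 4 bytes):
var hash = CryptoJS.SHA256("Message");
var byteArray = wordToByteArray(hash.words); // 32 entries
var hex = byteArray.map(function (b) {
    return ('0' + b.toString(16)).slice(-2);  // two hex digits per byte
}).join('');
hex === hash.toString(CryptoJS.enc.Hex);      // true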

How to convert a String to Bytearray

How can I convert a string into a byte array using JavaScript? The output should be equivalent to the C# code below.
UnicodeEncoding encoding = new UnicodeEncoding();
byte[] bytes = encoding.GetBytes(AnyString);
UnicodeEncoding is by default UTF-16 with little-endianness.
Edit: I have a requirement to match the byte array generated client-side with the one generated server-side using the above C# code.
Update 2018 - The easiest way in 2018 should be TextEncoder
let utf8Encode = new TextEncoder();
utf8Encode.encode("abc");
// Uint8Array [ 97, 98, 99 ]
Caveats - The returned element is a Uint8Array, and not all browsers support it.
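Another caveat for this particular question: TextEncoder always produces UTF-8, while C#'s UnicodeEncoding is UTF-16LE, so the two byte arrays will not match. A minimal sketch of my own that emits UTF-16 little-endian bytes instead:
function strToUtf16LeBytes(str) {
    var bytes = new Uint8Array(str.length * 2);
    for (var i = 0; i < str.length; i++) {
        var code = str.charCodeAt(i); // UTF-16 code unit, 0..0xFFFF
        bytes[i * 2] = code & 0xFF;   // low byte first (little-endian)
        bytes[i * 2 + 1] = code >> 8; // high byte second
    }
    return bytes;
}
strToUtf16LeBytes("Hello"); // Uint8Array [72, 0, 101, 0, 108, 0, 108, 0, 111, 0]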
If you are looking for a solution that works in node.js, you can use this:
var myBuffer = [];
var str = 'Stack Overflow';
var buffer = new Buffer(str, 'utf16le');
for (var i = 0; i < buffer.length; i++) {
myBuffer.push(buffer[i]);
}
console.log(myBuffer);
In C# running this
UnicodeEncoding encoding = new UnicodeEncoding();
byte[] bytes = encoding.GetBytes("Hello");
Will create an array with
72,0,101,0,108,0,108,0,111,0
For a character whose code is greater than 255, the second byte of the pair will be non-zero.
If you want very similar behavior in JavaScript you can do this (v2 is a bit more robust; the original version will only work for 0x00 ~ 0xff):
var str = "Hello竜";
var bytes = []; // char codes
var bytesv2 = []; // char codes
for (var i = 0; i < str.length; ++i) {
var code = str.charCodeAt(i);
bytes = bytes.concat([code]);
bytesv2 = bytesv2.concat([code & 0xff, code / 256 >>> 0]);
}
// 72, 101, 108, 108, 111, 31452
console.log('bytes', bytes.join(', '));
// 72, 0, 101, 0, 108, 0, 108, 0, 111, 0, 220, 122
console.log('bytesv2', bytesv2.join(', '));
I suppose C# and Java produce equal byte arrays. If you have non-ASCII characters, it's not enough to add an additional 0. My example contains a few special characters:
var str = "Hell ö € Ω 𝄞";
var bytes = [];
var charCode;
for (var i = 0; i < str.length; ++i)
{
charCode = str.charCodeAt(i);
bytes.push((charCode & 0xFF00) >> 8);
bytes.push(charCode & 0xFF);
}
alert(bytes.join(' '));
// 0 72 0 101 0 108 0 108 0 32 0 246 0 32 32 172 0 32 3 169 0 32 216 52 221 30
I don't know if C# places a BOM (Byte Order Mark), but when using UTF-16, Java's String.getBytes adds the following bytes: 254 255.
String s = "Hell ö € Ω ";
// now add a character outside the BMP (Basic Multilingual Plane)
// we take the violin-symbol (U+1D11E) MUSICAL SYMBOL G CLEF
s += new String(Character.toChars(0x1D11E));
// surrogate codepoints are: d834, dd1e, so one could also write "\ud834\udd1e"
byte[] bytes = s.getBytes("UTF-16");
for (byte aByte : bytes) {
System.out.print((0xFF & aByte) + " ");
}
// 254 255 0 72 0 101 0 108 0 108 0 32 0 246 0 32 32 172 0 32 3 169 0 32 216 52 221 30
Edit:
Added a special character (U+1D11E) MUSICAL SYMBOL G CLEF (outside the BMP, so it takes not 2 bytes in UTF-16, but 4).
Current JavaScript versions use "UCS-2" internally, so this symbol takes the space of 2 normal characters.
I'm not sure, but when using charCodeAt it seems we get exactly the surrogate code points also used in UTF-16, so non-BMP characters are handled correctly.
This problem is absolutely non-trivial. It might depend on the used JavaScript versions and engines. So if you want reliable solutions, you should have a look at:
https://github.com/koichik/node-codepoint/
http://mathiasbynens.be/notes/javascript-escapes
Mozilla Developer Network: charCodeAt
BigEndian vs. LittleEndian
UTF-16 Byte Array
JavaScript encodes strings as UTF-16, just like C#'s UnicodeEncoding, so creating a byte array is relatively straightforward.
JavaScript's charCodeAt() returns a 16-bit code unit (aka a 2-byte integer between 0 and 65535). You can split it into distinct bytes using the following:
function strToUtf16Bytes(str) {
    const bytes = [];
    for (let ii = 0; ii < str.length; ii++) {
        const code = str.charCodeAt(ii);   // 0x00-0xFFFF
        bytes.push(code & 255, code >> 8); // low byte, high byte
    }
    return bytes;
}
For example:
strToUtf16Bytes('🌵');
// [ 60, 216, 53, 223 ]
This works between C# and JavaScript because they both support UTF-16. However, if you want to get a UTF-8 byte array from JS, you must transcode the bytes.
UTF-8 Byte Array
The solution feels somewhat non-trivial, but I used the code below in production with great success (original source).
Also, for the interested reader, I published my unicode helpers that help me work with string lengths reported by other languages such as PHP.
/**
 * Convert a string to a unicode byte array
 * @param {string} str
 * @return {Array} of bytes
 */
export function strToUtf8Bytes(str) {
    const utf8 = [];
    for (let ii = 0; ii < str.length; ii++) {
        let charCode = str.charCodeAt(ii);
        if (charCode < 0x80) utf8.push(charCode);
        else if (charCode < 0x800) {
            utf8.push(0xc0 | (charCode >> 6), 0x80 | (charCode & 0x3f));
        } else if (charCode < 0xd800 || charCode >= 0xe000) {
            utf8.push(0xe0 | (charCode >> 12), 0x80 | ((charCode >> 6) & 0x3f), 0x80 | (charCode & 0x3f));
        } else {
            ii++;
            // Surrogate pair:
            // UTF-16 encodes 0x10000-0x10FFFF by subtracting 0x10000 and
            // splitting the 20 bits of 0x0-0xFFFFF into two halves
            charCode = 0x10000 + (((charCode & 0x3ff) << 10) | (str.charCodeAt(ii) & 0x3ff));
            utf8.push(
                0xf0 | (charCode >> 18),
                0x80 | ((charCode >> 12) & 0x3f),
                0x80 | ((charCode >> 6) & 0x3f),
                0x80 | (charCode & 0x3f),
            );
        }
    }
    return utf8;
}
Inspired by @hgoebl's answer. His code is for UTF-16 and I needed something for US-ASCII. So here's a more complete answer covering US-ASCII, UTF-16, and UTF-32.
/** @returns {Array} bytes of US-ASCII */
function stringToAsciiByteArray(str)
{
    var bytes = [];
    for (var i = 0; i < str.length; ++i)
    {
        var charCode = str.charCodeAt(i);
        if (charCode > 0xFF) // char > 1 byte since charCodeAt returns the UTF-16 value
        {
            throw new Error('Character ' + String.fromCharCode(charCode) + ' can\'t be represented by a US-ASCII byte.');
        }
        bytes.push(charCode);
    }
    return bytes;
}

/** @returns {Array} bytes of UTF-16 Big Endian without BOM */
function stringToUtf16ByteArray(str)
{
    var bytes = [];
    // currently the function returns without BOM. Uncomment the next line to change that.
    //bytes.push(254, 255); // Big Endian Byte Order Marks
    for (var i = 0; i < str.length; ++i)
    {
        var charCode = str.charCodeAt(i);
        // char > 2 bytes is impossible since charCodeAt can only return 2 bytes
        bytes.push((charCode & 0xFF00) >>> 8); // high byte (might be 0)
        bytes.push(charCode & 0xFF); // low byte
    }
    return bytes;
}

/** @returns {Array} bytes of UTF-32 Big Endian without BOM */
function stringToUtf32ByteArray(str)
{
    var bytes = [];
    // currently the function returns without BOM. Uncomment the next line to change that.
    //bytes.push(0, 0, 254, 255); // Big Endian Byte Order Marks
    for (var i = 0; i < str.length; ++i)
    {
        var charPoint = str.codePointAt(i);
        // char > 4 bytes is impossible since codePointAt can only return 4 bytes
        bytes.push((charPoint & 0xFF000000) >>> 24);
        bytes.push((charPoint & 0xFF0000) >>> 16);
        bytes.push((charPoint & 0xFF00) >>> 8);
        bytes.push(charPoint & 0xFF);
        if (charPoint > 0xFFFF) ++i; // skip the low surrogate that codePointAt already consumed
    }
    return bytes;
}
UTF-8 isn't included because I would have to write the encoding myself; UTF-8 and UTF-16 are variable-length encodings. UTF-8, UTF-16, and UTF-32 have a minimum number of bits, as their names indicate. If a UTF-32 character has a code point of 65, that means there are 3 leading zero bytes, but the same code in UTF-16 has only 1 leading zero byte. US-ASCII, on the other hand, fits each character in a single byte, so it can be translated to bytes directly.
String.prototype.charCodeAt returns at most 2 bytes and matches UTF-16 exactly. For UTF-32, String.prototype.codePointAt is needed, which came with ECMAScript 6 (Harmony). Because charCodeAt can return values for far more characters than US-ASCII can represent, the function stringToAsciiByteArray throws in such cases instead of splitting the character in half and taking either or both bytes.
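To illustrate the difference (a quick example of my own), for a character outside the BMP the two methods report different values, while .length still counts UTF-16 code units:
var clef = '\uD834\uDD1E'; // U+1D11E MUSICAL SYMBOL G CLEF
clef.length;               // 2      (two UTF-16 code units)
clef.charCodeAt(0);        // 55348  (0xD834, the high surrogate)
clef.codePointAt(0);       // 119070 (0x1D11E, the full code point)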
Note that this answer is non-trivial because character encoding is non-trivial. What kind of byte array you want depends on what character encoding you want those bytes to represent.
JavaScript has the option of internally using either UTF-16 or UCS-2, but since it has methods that act like UTF-16, I don't see why any browser would use UCS-2.
Also see: https://mathiasbynens.be/notes/javascript-encoding
Yes I know the question is 4 years old but I needed this answer for myself.
Since I cannot comment on the answer, I'd build on Jin Izzraeel's answer
var myBuffer = [];
var str = 'Stack Overflow';
var buffer = new Buffer(str, 'utf16le');
for (var i = 0; i < buffer.length; i++) {
myBuffer.push(buffer[i]);
}
console.log(myBuffer);
by saying that you could use this if you want to use a Node.js buffer in your browser.
https://github.com/feross/buffer
Therefore, Tom Stickel's objection is not valid, and the answer is indeed a valid answer.
String.prototype.encodeHex = function () {
    return this.split('').map(e => e.charCodeAt(0)); // array of char codes
};
Array.prototype.decodeHex = function () {
    // defined on Array so it can decode the array that encodeHex returns
    return this.map(e => String.fromCharCode(e)).join('');
};
The best solution I've come up with on the spot (though most likely crude) would be:
String.prototype.getBytes = function() {
    var bytes = [];
    for (var i = 0; i < this.length; i++) {
        var charCode = this.charCodeAt(i);
        // number of bytes needed for this char code (note: yields 0 bytes for codes 0 and 1)
        var cLen = Math.ceil(Math.log(charCode) / Math.log(256));
        for (var j = 0; j < cLen; j++) {
            bytes.push((charCode >> (j * 8)) & 0xFF); // shift right to extract each byte
        }
    }
    return bytes;
}
Though I notice this question has been here for over a year.
I know the question is almost 4 years old, but this is what worked smoothly for me:
String.prototype.encodeHex = function () {
    var bytes = [];
    for (var i = 0; i < this.length; ++i) {
        bytes.push(this.charCodeAt(i));
    }
    return bytes;
};

Array.prototype.decodeHex = function () {
    var str = [];
    var hex = this.toString().split(',');
    for (var i = 0; i < hex.length; i++) {
        str.push(String.fromCharCode(hex[i]));
    }
    return str.toString().replace(/,/g, "");
};
var str = "Hello World!";
var bytes = str.encodeHex();
alert('The Hexa Code is: '+bytes+' The original string is: '+bytes.decodeHex());
or, if you want to work with strings only, and no Array, you can use:
String.prototype.encodeHex = function () {
    var bytes = [];
    for (var i = 0; i < this.length; ++i) {
        bytes.push(this.charCodeAt(i));
    }
    return bytes.toString();
};

String.prototype.decodeHex = function () {
    var str = [];
    var hex = this.split(',');
    for (var i = 0; i < hex.length; i++) {
        str.push(String.fromCharCode(hex[i]));
    }
    return str.toString().replace(/,/g, "");
};
var str = "Hello World!";
var bytes = str.encodeHex();
alert('The Hexa Code is: '+bytes+' The original string is: '+bytes.decodeHex());
Here is the same function that @BrunoLM posted, converted to a String prototype function:
String.prototype.getBytes = function () {
    var bytes = [];
    for (var i = 0; i < this.length; ++i) {
        bytes.push(this.charCodeAt(i));
    }
    return bytes;
};
If you define the function as such, then you can call the .getBytes() method on any string:
var str = "Hello World!";
var bytes = str.getBytes();
You don't need underscore, just use built-in map:
var string = 'Hello World!';
document.write(string.split('').map(function(c) { return c.charCodeAt(); }));
