Convert String to Byte String in Javascript - javascript

In Python, I store bytes in a variable like this:
K = b"\x00" * 32
I was trying to write a JavaScript equivalent that produces a byte string, using the following code:
function toUTF8Array(str) {
  var utf8 = [];
  for (var i = 0; i < str.length; i++) {
    var charcode = str.charCodeAt(i);
    if (charcode < 0x80) utf8.push(charcode);
    else if (charcode < 0x800) {
      utf8.push(0xc0 | (charcode >> 6), 0x80 | (charcode & 0x3f));
    } else if (charcode < 0xd800 || charcode >= 0xe000) {
      utf8.push(
        0xe0 | (charcode >> 12),
        0x80 | ((charcode >> 6) & 0x3f),
        0x80 | (charcode & 0x3f)
      );
    }
    // surrogate pair
    else {
      i++;
      // UTF-16 encodes 0x10000-0x10FFFF by
      // subtracting 0x10000 and splitting the
      // 20 bits of 0x0-0xFFFFF into two halves
      charcode =
        0x10000 + (((charcode & 0x3ff) << 10) | (str.charCodeAt(i) & 0x3ff));
      utf8.push(
        0xf0 | (charcode >> 18),
        0x80 | ((charcode >> 12) & 0x3f),
        0x80 | ((charcode >> 6) & 0x3f),
        0x80 | (charcode & 0x3f)
      );
    }
  }
  return utf8;
}
But it generates a byte array as follows:
[
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0 ]
I don't want the output as a plain array of numbers. I need a bytes output like Python's K = b"\x00" * 32.
How can I achieve that?
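For reference, a minimal sketch (not from the original post), assuming the goal is the JavaScript analogue of Python's bytes: a typed array such as Uint8Array, which is zero-filled on construction.
const K = new Uint8Array(32);                 // 32 zero bytes, like b"\x00" * 32
console.log(K);                               // Uint8Array(32) [ 0, 0, 0, ... ]
// For arbitrary text, TextEncoder (where available) gives the UTF-8 bytes:
console.log(new TextEncoder().encode("abc")); // Uint8Array(3) [ 97, 98, 99 ]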

Related

JavaScript equivalent of Java's String.getBytes(StandardCharsets.UTF_8)

I have the following Java code:
String str = "\u00A0";
byte[] bytes = str.getBytes(StandardCharsets.UTF_8);
System.out.println(Arrays.toString(bytes));
This outputs the following byte array:
[-62, -96]
I am trying to get the same result in Javascript. I have tried the solution posted here:
https://stackoverflow.com/a/51904484/12177456
function strToUtf8Bytes(str) {
  const utf8 = [];
  for (let ii = 0; ii < str.length; ii++) {
    let charCode = str.charCodeAt(ii);
    if (charCode < 0x80) utf8.push(charCode);
    else if (charCode < 0x800) {
      utf8.push(0xc0 | (charCode >> 6), 0x80 | (charCode & 0x3f));
    } else if (charCode < 0xd800 || charCode >= 0xe000) {
      utf8.push(0xe0 | (charCode >> 12), 0x80 | ((charCode >> 6) & 0x3f), 0x80 | (charCode & 0x3f));
    } else {
      ii++;
      // Surrogate pair:
      // UTF-16 encodes 0x10000-0x10FFFF by subtracting 0x10000 and
      // splitting the 20 bits of 0x0-0xFFFFF into two halves
      charCode = 0x10000 + (((charCode & 0x3ff) << 10) | (str.charCodeAt(ii) & 0x3ff));
      utf8.push(
        0xf0 | (charCode >> 18),
        0x80 | ((charCode >> 12) & 0x3f),
        0x80 | ((charCode >> 6) & 0x3f),
        0x80 | (charCode & 0x3f),
      );
    }
  }
  return utf8;
}
console.log(strToUtf8Bytes("h\u00A0i"));
But this gives the following (an array of unsigned values, like a https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Uint8Array):
[194, 160]
This is a problem for me as I'm using the Graal JS engine and need to pass the array to a Java function that expects a byte[], so any value in the array > 127 will cause an error, as described here:
https://github.com/oracle/graal/issues/2118
Note I also tried the TextEncoder class instead of the strToUtf8Bytes function as described here:
java string.getBytes("UTF-8") javascript equivalent
but it gives the same result as above.
Is there something else I can try here so that I can get JavaScript to generate the same array as Java?
The result is the same in terms of bytes, JS just defaults to unsigned bytes.
U in Uint8Array stands for “unsigned”; the signed variant is called Int8Array.
The conversion is easy: just pass the result to the Int8Array constructor:
console.log(new Int8Array(new TextEncoder().encode("\u00a0"))); // Int8Array [ -62, -96 ]
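The same conversion also works for the hand-rolled encoder from the question (a quick sketch, not part of the original answer): wrapping its plain number array in an Int8Array reinterprets values above 127 as signed bytes.
console.log(new Int8Array(strToUtf8Bytes("h\u00A0i"))); // Int8Array [ 104, -62, -96, 105 ]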

How to convert a string to base64 encoding using byte array in JavaScript?

I have the .NET code below, which converts a string to Base64 encoding by first converting it to a byte array. I tried different answers on Stack Overflow to convert the string into a byte array and then used the btoa() function for Base64 encoding in JavaScript, but I'm not getting the exact encoded value, as shared below.
For string value,
BBFDC43D-4890-4558-BB89-50D802014A97
I need Base64 encoding as,
PcT9u5BIWEW7iVDYAgFKlw==
.NET code:
String str = "BBFDC43D-4890-4558-BB89-50D802014A97"
Guid guid = new Guid(str);
Console.WriteLine(guid); // bbfdc43d-4890-4558-bb89-50d802014a97
Byte[] bytes = guid.ToByteArray();
Console.WriteLine(bytes); // System.Byte[]
String s = Convert.ToBase64String(bytes, Base64FormattingOptions.InsertLineBreaks);
Console.WriteLine(s); // PcT9u5BIWEW7iVDYAgFKlw==
Currently, I have tried the code below, which is not producing the desired result:
function strToUtf8Bytes(str) {
  const utf8 = [];
  for (let ii = 0; ii < str.length; ii++) {
    let charCode = str.charCodeAt(ii);
    if (charCode < 0x80) utf8.push(charCode);
    else if (charCode < 0x800) {
      utf8.push(0xc0 | (charCode >> 6), 0x80 | (charCode & 0x3f));
    } else if (charCode < 0xd800 || charCode >= 0xe000) {
      utf8.push(0xe0 | (charCode >> 12), 0x80 | ((charCode >> 6) & 0x3f), 0x80 | (charCode & 0x3f));
    } else {
      ii++;
      // Surrogate pair:
      // UTF-16 encodes 0x10000-0x10FFFF by subtracting 0x10000 and
      // splitting the 20 bits of 0x0-0xFFFFF into two halves
      charCode = 0x10000 + (((charCode & 0x3ff) << 10) | (str.charCodeAt(ii) & 0x3ff));
      utf8.push(
        0xf0 | (charCode >> 18),
        0x80 | ((charCode >> 12) & 0x3f),
        0x80 | ((charCode >> 6) & 0x3f),
        0x80 | (charCode & 0x3f),
      );
    }
  }
  return utf8;
}
const str = "BBFDC43D-4890-4558-BB89-50D802014A97";
const strByteArr = strToUtf8Bytes(str);
const strBase64 = btoa(strByteArr);
// NjYsNjYsNzAsNjgsNjcsNTIsNTEsNjgsNDUsNTIsNTYsNTcsNDgsNDUsNTIsNTMsNTMsNTYsNDUsNjYsNjYsNTYsNTcsNDUsNTMsNDgsNjgsNTYsNDgsNTAsNDgsNDksNTIsNjUsNTcsNTU=
Your problem is caused by the following:
btoa() works on a string of ASCII/Latin-1 characters; when you pass it an array, the array is first coerced to its comma-separated string form, which is why your output decodes to "66,66,70,..."
guid.ToByteArray() does not use ASCII encoding; it returns the 16 raw bytes of the GUID
If you modify your C# code like this:
String str = "BBFDC43D-4890-4558-BB89-50D802014A97";
//Guid guid = new Guid(str);
//Console.WriteLine(guid);
// bbfdc43d-4890-4558-bb89-50d802014a97
//Byte[] bytes = guid.ToByteArray();
byte[] bytes = System.Text.Encoding.ASCII.GetBytes(str);
//Console.WriteLine(bytes); // System.Byte[]
String s = Convert.ToBase64String(bytes, Base64FormattingOptions.InsertLineBreaks);
Console.WriteLine(s);
You will get the following output:
QkJGREM0M0QtNDg5MC00NTU4LUJCODktNTBEODAyMDE0QTk3
Which will be the same string as the one returned from the btoa() function:
var rawString = "BBFDC43D-4890-4558-BB89-50D802014A97";
var b64encoded = btoa(rawString);
console.log(b64encoded);
Output:
QkJGREM0M0QtNDg5MC00NTU4LUJCODktNTBEODAyMDE0QTk3
UPDATE - Since you can't modify the C# code
You should adapt your Javascript code by combining Piotr's answer and this SO answer
function guidToBytes(guid) {
  var bytes = [];
  guid.split('-').map((number, index) => {
    var bytesInChar = index < 3 ? number.match(/.{1,2}/g).reverse() : number.match(/.{1,2}/g);
    bytesInChar.map((byte) => { bytes.push(parseInt(byte, 16)); });
  });
  return bytes;
}

function arrayBufferToBase64(buffer) {
  var binary = '';
  var bytes = new Uint8Array(buffer);
  var len = bytes.byteLength;
  for (var i = 0; i < len; i++) {
    binary += String.fromCharCode(bytes[i]);
  }
  return btoa(binary);
}
var str = "BBFDC43D-4890-4558-BB89-50D802014A97";
var guidBytes = guidToBytes(str);
var b64encoded = arrayBufferToBase64(guidBytes);
console.log(b64encoded);
Output:
PcT9u5BIWEW7iVDYAgFKlw==
The problem with your code is the representation of the GUID. In the C# code you are converting "BBFDC43D-4890-4558-BB89-50D802014A97" into a UUID, which is a 128-bit number. In the JavaScript code you are doing something else: you iterate through the string and calculate the byte array of the string itself. They are simply not equal.
Now you have two options:
Implement proper GUID conversion in JS (this may help: https://gist.github.com/daboxu/4f1dd0a254326ac2361f8e78f89e97ae)
In C#, calculate the byte array in the same way as in JS
Your string is a hexadecimal value, which you use to create a GUID. Then you convert the GUID into a byte array with:
Byte[] bytes = guid.ToByteArray();
The GUID is a 16-byte value which can be represented as a hexadecimal value. When you convert this GUID into a byte array, you will get the 16 bytes of the value, not the byte representation of the hexadecimal value.
In the provided JavaScript function you are doing something else: You are converting the string directly to a byte array.
In C# you do the equivalent with an Encoding:
String str = "BBFDC43D-4890-4558-BB89-50D802014A97";
Byte[] bytes = Encoding.UTF8.GetBytes(str);
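For completeness, a sketch (not part of the original answers) of the JavaScript counterpart of that C# snippet, reusing the arrayBufferToBase64 helper from above and assuming TextEncoder is available: UTF-8-encode the string, then Base64 the resulting bytes.
var strBytes = new TextEncoder().encode("BBFDC43D-4890-4558-BB89-50D802014A97");
console.log(arrayBufferToBase64(strBytes)); // QkJGREM0M0QtNDg5MC00NTU4LUJCODktNTBEODAyMDE0QTk3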

Meaning of this number function with different number values in if blocks

I'm wondering what this encode_number function means, with all of its different numbers. I know that 32768 is for 16 bits (Math.pow(2, 16) / 2), and that the last else block is (from the source code comments) a 32-bit number. But I don't understand where these numbers come from: 107, 139, 108, 1131, 247, 251, 28, 29. I would like to know what the meaning of this function is. From here.
function encode_number(v) {
  if (v >= -107 && v <= 107) {
    return [v + 139]
  } else if (v >= 108 && v <= 1131) {
    v = v - 108
    return [(v >> 8) + 247, v & 0xFF]
  } else if (v >= -1131 && v <= -108) {
    v = -v - 108
    return [(v >> 8) + 251, v & 0xFF]
  } else if (v >= -32768 && v <= 32767) {
    // encode_number16
    return [28, (v >> 8) & 0xFF, v & 0xFF]
  } else {
    // encode_number32
    return [29, (v >> 24) & 0xFF, (v >> 16) & 0xFF, (v >> 8) & 0xFF, v & 0xFF]
  }
}
Well, that was simple:
By using the blame view of that linked source code, you come across the commit that introduced these lines. The commit message was: "Add CFF table encoding."
By searching for CFF and 107, you find an explanation of that CFF dictionary format.
So, this function is used to encode numbers for the "Compact Font Format" (CFF), which is part of OpenType. But what that actually means in detail is beyond my knowledge ;)
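For reference, a few sample calls (not from the original answer; the byte values follow directly from the ranges in the function above):
encode_number(0);      // [139]                 single byte: v + 139 covers -107..107
encode_number(1000);   // [250, 124]            two bytes for 108..1131
encode_number(-200);   // [251, 92]             two bytes for -1131..-108
encode_number(5000);   // [28, 19, 136]         prefix 28: 16-bit big-endian value
encode_number(100000); // [29, 0, 1, 134, 160]  prefix 29: 32-bit big-endian value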

C# decimal to C++ float and javascript number

I have a TCP server written in C#. I have to write two clients (C++ and JavaScript). I can deserialize the decimal (16 bytes, 128 bits) in the C# client, but I can't deserialize it in the other languages.
The decimals are not too big, so I can use float or double.
When I serialize the decimal:
MemoryStream combinedMessage = new MemoryStream();
decimal d = 2135102.06m;
using (BinaryWriter writer = new BinaryWriter(combinedMessage, encoding))
{
    writer.Write(d);
}
byte[] message = combinedMessage.ToArray();
Serialized as:
62 232 185 12 0 0 0 0 0 0 0 0 0 0 2 0
How can I deserialize the decimal from the byte[] in C++ and JavaScript?
The first 12 bytes are a little-endian 96-bit integer, byte 13 and 14 are unused (for now), byte 15 contains the scale (power of 10 to divide by), and byte 16 contains the sign bit in the MSB (other bits unused). The main difficulty lies in accurate conversion -- even if the decimal is "not too big", converting it to a float or Number can be done in ways that lose more or less accuracy.
The following routine isn't necessarily the most accurate way to convert decimals, nor the fastest, but if you are not overly concerned with either accuracy or speed it'll get the job done, and it has the benefit of being easy to translate to most any C-like language. Here it is in JavaScript:
var b = [ 62, 232, 185, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0 ];
var d = 0.0;
for (var i = 11; i >= 0; --i) {
  var k = b[i];
  for (var j = 0; j != 8; ++j) {
    d *= 2;
    d += (k & 0x80) >> 7;
    k <<= 1;
  }
}
var scale = b[14];
d /= Math.pow(10, scale);
if (b[15] >= 0x80) d = -d;
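A quick check (not in the original answer): with the sample bytes above, the 96-bit integer comes out as 213510206 and the scale is 2, so:
console.log(d); // 2135102.06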
This is almost valid C# already; all you need to change is Math.Pow and byte[] b = { 62 ... }. For C (and by extension C++) the changes aren't much more complicated:
#include <math.h>
unsigned char b[] = { 62, 232, 185, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0 };
double d = 0.0;
for (int i = 11; i >= 0; --i) {
  unsigned char k = b[i];
  for (int j = 0; j != 8; ++j) {
    d *= 2;
    d += (k & 0x80) >> 7;
    k <<= 1;
  }
}
int scale = b[14];
d /= pow(10, scale);
if (b[15] >= 0x80) d = -d;

Javascript: unicode character to BYTE based hex escape sequence (NOT surrogates)

In JavaScript I am trying to turn Unicode characters into byte-based hex escape sequences that are compatible with C:
i.e. 😄
becomes: \xF0\x9F\x98\x84 (correct)
NOT JavaScript surrogates, not \uD83D\uDE04 (wrong)
I cannot figure out the mathematical relationship between the four bytes C wants and the two surrogates JavaScript uses. I suspect the algorithm is far more complex than my feeble attempts.
Thanks for any tips.
encodeURIComponent does this work:
var input = "\uD83D\uDE04";
var result = encodeURIComponent(input).replace(/%/g, "\\x"); // \xF0\x9F\x98\x84
Upd: Actually, C strings can contain digits and letters without escaping, but if you really need to escape them:
function escape(s, escapeEverything) {
  if (escapeEverything) {
    s = s.replace(/[\x10-\x7f]/g, function (s) {
      return "-x" + s.charCodeAt(0).toString(16).toUpperCase();
    });
  }
  s = encodeURIComponent(s).replace(/%/g, "\\x");
  if (escapeEverything) {
    s = s.replace(/\-/g, "\\");
  }
  return s;
}
Found a solution here: http://jonisalonen.com/2012/from-utf-16-to-utf-8-in-javascript/
I would have never figured out THAT math, wow.
Somewhat minified:
function UTF8seq(s) {
  var i, c, u = [];
  for (i = 0; i < s.length; i++) {
    c = s.charCodeAt(i);
    if (c < 0x80) { u.push(c); }
    else if (c < 0x800) { u.push(0xc0 | (c >> 6), 0x80 | (c & 0x3f)); }
    else if (c < 0xd800 || c >= 0xe000) { u.push(0xe0 | (c >> 12), 0x80 | ((c >> 6) & 0x3f), 0x80 | (c & 0x3f)); }
    else {
      i++;
      c = 0x10000 + (((c & 0x3ff) << 10) | (s.charCodeAt(i) & 0x3ff));
      u.push(0xf0 | (c >> 18), 0x80 | ((c >> 12) & 0x3f), 0x80 | ((c >> 6) & 0x3f), 0x80 | (c & 0x3f));
    }
  }
  // zero-pad single-digit bytes so e.g. 0x09 becomes "\x09" rather than "\x9"
  for (i = 0; i < u.length; i++) { u[i] = (u[i] < 16 ? '0' : '') + u[i].toString(16); }
  return '\\x' + u.join('\\x');
}
Your C code expects a UTF-8 string (the symbol is represented as 4 bytes). The JS representation you see is UTF-16, however (the symbol is represented as two uint16s, a surrogate pair).
You will first need to get the (Unicode) code point for your symbol (from the UTF-16 JS string), then build the UTF-8 representation for it from that.
Since ES6 you can use the codePointAt method for the first part, which I would recommend using as a shim even if not supported. I guess you don't want to decode surrogate pairs yourself :-)
For the rest, I don't think there's a library method, but you can write it yourself according to the spec:
function hex(x) {
  x = x.toString(16);
  return (x.length > 2 ? "\\u0000" : "\\x00").slice(0, -x.length) + x.toUpperCase();
}
var c = "😄";
console.log(c.length, hex(c.charCodeAt(0))+hex(c.charCodeAt(1))); // 2, "\uD83D\uDE04"
var cp = c.codePointAt(0);
var bytes = new Uint8Array(4);
bytes[3] = 0x80 | cp & 0x3F;
bytes[2] = 0x80 | (cp >>>= 6) & 0x3F;
bytes[1] = 0x80 | (cp >>>= 6) & 0x3F;
bytes[0] = 0xF0 | (cp >>>= 6) & 0x3F;
console.log(Array.prototype.map.call(bytes, hex).join("")); // "\xF0\x9F\x98\x84"
(tested in Chrome)
