Writing a Hexadecimal Escape Character Sequence using Variables

Writing a Hexadecimal Escape Character Sequence using Variables - javascript

Testing Hex Character Codes
Problem
What does a Vertical Tab or a Backspace character actually do? I want to find out.
My experiment is to find out exactly what happens when every hex character is put into a string. I thought the best way to do this would be to create a nested loop that goes through each of the 16 hexadecimal characters to build each possible 2-digit hex character code.
I soon discovered that you cannot use the \x escape character with interpolated variables, and so I expect what I have set out to do might be impossible.
// Experiment: assemble every two-digit hex code ("00" .. "FF") from the 16 hex
// digits and hand each one to printHexChar.
const hexCharacters = "0123456789ABCDEF";
let code = "";
let char1 = "";
let char2 = "";
// NOTE(review): charPos1 and charPos2 are never declared, so these loops assign
// to implicit globals (a ReferenceError in strict mode).
for (charPos1 = 0; charPos1 < hexCharacters.length; charPos1++) {
for (charPos2 = 0; charPos2 < hexCharacters.length; charPos2++) {
char1 = hexCharacters[charPos1];
char2 = hexCharacters[charPos2];
code = `${char1}${char2}`;
printHexChar(code);
}
}
// Tries to write "<p>Hex Code NN = <char></p>" for the given two-digit code.
function printHexChar(string) {
// \x must be followed by two literal hex digits in the source text. Here it is
// followed by "${", so the template literal is rejected by the parser.
let output = `<p>Hex Code ${string} = \x${string}</p>`; // THE PROBLEM IS CLEAR
document.write(output)
}
I know it will also probably fail once it gets past 7F or whichever is the last character in the set, but that's not the main issue here! :D
Potential solution
string.prototype.fromCharCode
This sort of string method approach would seem to be the answer, but it is meant for U-16 character codes, and that is not what I wanted to test. There doesn't seem to be an existing string method for hex codes. Probably because nobody would ever want one, but nevertheless it would be cool.
Conclusion
Is there any way to create an escape character sequence from assembled parts that will render not as plain text, but as a proper escape character sequence?
Apologies if this has been asked before in some form, but with my feeble understanding of things I just couldn't find an answer.

You can use String.fromCharCode with parseInt.
// parseInt(string, 16) reads the hex digits as a number; String.fromCharCode
// then returns the character with that code.
`<p>Hex Code ${string} = ${String.fromCharCode(parseInt(string, 16))}</p>`;
// Walk every two-digit hex code from "00" to "FF" and print the character it denotes.
const hexCharacters = "0123456789ABCDEF";

// The loop variables are declared (block-scoped): assigning to undeclared names
// would create implicit globals and throws a ReferenceError in strict mode and
// in ES modules.
for (const char1 of hexCharacters) {
  for (const char2 of hexCharacters) {
    printHexChar(`${char1}${char2}`);
  }
}

/**
 * Writes one paragraph showing a hex code next to the character it encodes.
 *
 * @param {string} string - Two hexadecimal digits, e.g. "1B".
 */
function printHexChar(string) {
  // An escape such as \x1B only exists in source text, so the character is
  // computed at run time from the numeric value of the hex digits instead.
  const output = `<p>Hex Code ${string} = ${String.fromCharCode(parseInt(string, 16))}</p>`;
  document.write(output);
}
eval works as well, though it should generally be avoided.
// eval parses the assembled text as JavaScript source, so '"\\x' + string + '"'
// becomes a string literal containing a real \x escape.
// NOTE(review): eval runs arbitrary code - never feed it untrusted input.
`<p>Hex Code ${string} = ${eval('"\\x'+string+'"')}</p>`

If you want to output \x literally, then in a string literal you need to escape the escape character, so `\\x`.
string.prototype.fromCharCode [...] is meant for U-16 character codes
JavaScript uses one character encoding. The following strings are all equal:
// Three spellings of the same one-character string (character code 27):
const a = String.fromCharCode(27); // from the numeric code
const b = "\x1B";                  // two-digit hex escape
const c = "\u001B";                // four-digit Unicode escape
// Both comparisons print true.
console.log(a === b, b === c);
If I understand correctly, you want to produce a string literal that shows \x escape sequences -- not the actual character:
// Build a string holding the 128 characters with codes 0..127
let chars = "";
for (let code = 0; code < 128; code++) {
  chars += String.fromCharCode(code);
}
// Spell every character as a literal \xNN sequence (two hex digits, zero-padded)
let escaped = [...chars]
  .map(ch => `\\x${ch.charCodeAt().toString(16).padStart(2, "0")}`)
  .join("");
console.log(escaped);
But you might also use JSON.stringify. Although it uses different escape sequences (\u instead of \x), and only for non-display characters, it will be the exact same string when evaluated. Here is a demo:
// Prepare string: the 128 characters with codes 0..127
const chars = String.fromCharCode(...Array(128).keys());
// Escape them: one \xNN sequence per character, wrapped in double quotes so
// that the result is the source text of a string literal
const hexEscapes = Array.from(chars, ch => `\\x${ch.charCodeAt().toString(16).padStart(2, "0")}`);
const escaped = '"' + hexEscapes.join("") + '"';
console.log(escaped);
// Or JSONify them
const json = JSON.stringify(chars);
console.log(json);
// Compare them, when evaluated (eval is used here only on the two literals
// built above, to show they denote the same string):
const sameWhenEvaluated = eval(escaped) === eval(json);
console.log(sameWhenEvaluated);
Finally, note that there is nothing special about hexadecimal: it is just a representation of an integer. In the end, it is the numerical value that is important, not the representation of it. It is that numerical value that corresponds to a character.
Addendum
If you prefer code that sticks to old-style JavaScript, here is something equivalent of the last code snippet:
// Prepare string: append the characters with codes 0..127 one by one
let chars = "";
for (let code = 0; code !== 128; code += 1) {
  chars = chars + String.fromCharCode(code);
}
// Escape the characters in this string: each becomes \x plus two hex digits
let escaped = "";
for (let pos = 0; pos < chars.length; pos += 1) {
  let hex = chars.charCodeAt(pos).toString(16);
  // Single-digit values need a leading zero to form a valid \xNN escape
  escaped += "\\x" + (hex.length === 1 ? "0" + hex : hex);
}
// Wrap in double quotes so the result is the source text of a string literal
escaped = '"' + escaped + '"';
console.log(escaped);
// Or JSONify them
let json = JSON.stringify(chars);
console.log(json);
// Compare them, when evaluated:
console.log(eval(escaped) === eval(json));

Related

Javascript hexadecimal to ASCII with latin extended symbols

I am getting a hexadecimal value of my string that looks like this:
String has letters with diacritics: č,š,ř, ...
Hexadecimal value of this string is:
0053007400720069006E006700200068006100730020006C0065007400740065007200730020007700690074006800200064006900610063007200690074006900630073003A0020010D002C00200161002C00200159002C0020002E002E002E
The problem is that when I try to convert this value back to ASCII, it converts č, š, ř, ... poorly and returns a little box with a question mark in it instead of these symbols.
My code for converting hex to ascii:
/**
 * Decodes the hex payload that follows the first "~" in the input.
 * Everything up to and including the "~" is kept as-is; the remainder is read
 * two hex digits at a time, each pair becoming one character code.
 *
 * @param {string} hexx - Optional "info~" prefix followed by hex digits.
 * @returns {string} The prefix followed by the decoded text.
 */
function convertHexadecimal(hexx){
  const tildeAt = hexx.indexOf("~");
  // With no "~" present tildeAt is -1, so the prefix is empty and the whole
  // input is treated as the payload.
  const prefix = hexx.substring(0, tildeAt + 1);
  const payload = hexx.substring(tildeAt + 1).toString();
  let decoded = '';
  let offset = 0;
  while (offset < payload.length) {
    const pair = payload.slice(offset, offset + 2);
    decoded += String.fromCharCode(parseInt(pair, 16));
    offset += 2;
  }
  console.log("Zpráva: " + decoded);
  return prefix + decoded;
}
Can somebody help me with this?

First an example solution:
let demoHex = `0053007400720069006E006700200068006100730020006C0065007400740065007200730020007700690074006800200064006900610063007200690074006900630073003A0020010D002C00200161002C00200159002C0020002E002E002E`;

/**
 * Decodes a string of hex digits in which every group of four digits is the
 * UTF-16 code of one character.
 *
 * @param {string} hex - Hex digits, four per character.
 * @returns {string} The decoded text.
 */
function hexToString(hex) {
  const decoded = [];
  let offset = 0;
  while (offset < hex.length) {
    // "0x" + four digits is parsed by Number as a hexadecimal literal
    const unit = Number("0x" + hex.substr(offset, 4));
    decoded.push(String.fromCharCode(unit));
    offset += 4;
  }
  return decoded.join("");
}
console.log("Decoded string: %s", hexToString(demoHex) );
What it's doing:
It's treating the hex characters as a sequence of 4 hexadecimal digits that provide the UTF-16 character code of a character.
It gets each set of 4 digits in a loop using String.prototype.substr. Note that MDN says .substr is deprecated, but this is not mentioned in the ECMAScript standard - rewrite it to use substring or something else as you wish.
Hex characters are prefixed with "0x" to make them a valid number representation in JavaScript and converted to a number object using Number. The number is then converted to a character string using the String.fromCharCode static method.
I guessed the format of the hex string by looking at it, which means a general purpose encoding routine to encode UTF16 characters (not code points) into hex could look like:
/**
 * Encodes a string as hex digits: every UTF-16 code unit becomes four
 * zero-padded hex digits.
 *
 * @param {string} str - Text to encode.
 * @returns {string} Four hex digits per code unit, concatenated.
 */
const hexEncodeUTF16 = (str) => {
  let encoded = '';
  for (let index = 0; index < str.length; index++) {
    encoded += str.charCodeAt(index).toString(16).padStart(4,'0');
  }
  return encoded;
};
console.log( hexEncodeUTF16( "String has letters with diacritics: č, š, ř, ..."));
I hope these examples show what needs doing - there are any number of ways to implement it in code.

Split and replace text by two rules (regex)

I am trying to split text by two rules:
Split by whitespace
Split words greater than 5 symbols into two separate words like (aaaaawww into aaaaa- and www)
I created a regex that can detect these rules (https://regex101.com/r/fyskB3/2) but can't understand how to make both rules work in text.split(/REGEX/).
Currently regex - (([\s]+)|(\w{5})(?=\w))
For example initial text is hello i am markopollo and result should look like ['hello', 'i', 'am', 'marko-', 'pollo']

It would probably be easier to use .match: match up to 5 characters that aren't whitespace:
// Collect every run of 1 to 5 consecutive non-space characters.
const str = 'wqerweirj ioqwejr qiwejrio jqoiwejr qwer qwer';
const pieces = str.match(/[^ ]{1,5}/g);
console.log(pieces);

My approach would be to process the string before splitting (I'm a big fan of RegEx):
1- Search and replace all the 5 consecutive non-last characters with \1-.
The pattern (\w{5}\B) will do the trick, \w{5} will match 5 exact characters and \B will match only if the last character is not the ending character of the word.
2- Split the string by spaces.
// Step 1: after every run of five word characters that is followed by another
// word character (\B), insert "- ". Step 2: split the result on spaces.
var text = "hello123467891234 i am markopollo";
var regex = /(\w{5}\B)/g;
var result = text.replace(regex, "$1- ").split(" ");
console.log(result)
Hope it helps!

Something like this should work:
// Split on whitespace, then break every word longer than CHUNK_SIZE into
// pieces; every piece except the last gets a trailing "-".
const str = "hello i am markopollo";
const words = str.split(/\s+/);
const CHUNK_SIZE=5;
const out = [];
words.forEach((word) => {
  if (word.length <= CHUNK_SIZE) {
    out.push(word);
    return;
  }
  const chunks = chunkSubstr(word, CHUNK_SIZE);
  const lastIndex = chunks.length - 1;
  chunks.forEach((chunk, index) => {
    out.push(index < lastIndex ? chunk + '-' : chunk);
  });
});
console.log(out);
// credit: https://stackoverflow.com/a/29202760/65387
/**
 * Cuts a string into consecutive pieces of `size` characters; the last piece
 * may be shorter.
 *
 * @param {string} str - Text to cut.
 * @param {number} size - Length of each piece.
 * @returns {string[]} The pieces, in order.
 */
function chunkSubstr(str, size) {
  const total = Math.ceil(str.length / size)
  const pieces = new Array(total)
  let offset = 0
  let index = 0
  while (index < total) {
    pieces[index] = str.substr(offset, size)
    index += 1
    offset += size
  }
  return pieces
}
i.e., first split the string into words on spaces, and then find words longer than 5 chars and 'chunk' them. I popped off the last chunk to avoid adding a - to it, but there might be a more efficient way if you patch chunkSubstr instead.
regex.split doesn't work so well because it will basically remove those items from the output. In your case, it appears you want to strip the whitespace but keep the words, so splitting on both won't work.

This uses the regular expression from @CertainPerformance's answer ([^\s]{1,5}), applies regex.exec, and finally loops over all matches to reach the goal.
Like below demo:
const str = 'wqerweirj ioqwejr qiwejrio jqoiwejr qwer qwer';
const regex1 = /[^ ]{1,5}/g;

/**
 * Collects every match of `regexExpress` in `targetString`. A match that is
 * exactly 5 characters long and is directly followed by a non-space character
 * gets a trailing "-" to mark that the word continues.
 *
 * @param {string} targetString - Text to cut into pieces.
 * @param {RegExp} regexExpress - Pattern matching one piece.
 * @returns {string[]} The pieces, in order of appearance.
 */
function customSplit(targetString, regexExpress) {
  // Work on a private global copy. This leaves the caller's regex untouched,
  // ignores any stale lastIndex on it, and avoids looping forever when the
  // caller passes a regex without the g flag.
  const flags = regexExpress.flags.includes('g')
    ? regexExpress.flags
    : `${regexExpress.flags}g`;
  const matcher = new RegExp(regexExpress.source, flags);

  const result = [];
  for (const match of targetString.matchAll(matcher)) {
    const piece = match[0];
    // Character right after the match; undefined at the end of the string
    const nextChar = targetString[match.index + piece.length];
    const continuesWord = piece.length === 5 && nextChar !== undefined && nextChar !== ' ';
    result.push(continuesWord ? `${piece}-` : piece);
  }
  return result;
}
console.log(customSplit(str, regex1));
console.log(customSplit('hello i am markopollo', regex1));

Javascript regex to add to every character of a string a backslash

So as the title says I'd like to add to every character of a string a backslash, whether the string has special characters or not. The string should not be considered 'safe'
eg:
let str = 'dj%^3&something';
// x and y are not defined here - they stand for the pattern and replacement
// this question is asking for.
str = str.replace(x, y);
// Desired result: every character of the string prefixed with a backslash.
// str = '\d\j\%\^\3\&\s\o\m\e\t\h\i\n\g'

You could capture every character in the string with (.) and use \\$1 as replacement, I'm not an expert but basically \\ will render to \ and $1 will render to whatever (.) captures.
HIH
EDIT
please refer to Wiktor Stribiżew's comment for an alternative which will require less coding. Changes as follows:
str = str.replace(/(.)/g, '\\$1'); for str = str.replace(/./g, '\\$&');
Also, for future reference, I strongly advise you to visit regexr.com when it comes to regular expressions; it has helped me a lot.
// (.) captures one character at a time; in the replacement, '\\$1' is a
// literal backslash followed by the captured character.
const everyChar = /(.)/g;
let str = 'dj%^3&something';
str = str.replace(everyChar, '\\$1');
console.log(str);

If you just want to display a string safely, you should just do:
// Insert the string as a text node, so it is added to the page as text
// content rather than as markup.
const str = 'dj%^3&something';
const dest = document.querySelector('.whatever');
dest.appendChild(document.createTextNode(str));
And then you are guaranteed that it will be treated as text, and won't be able to execute a script or anything.
For example: https://jsfiddle.net/s6udj03L/1/

You can split the string into an array, add a \ to each element of the array, then join the array back into the string that you wanted.
var str = 'dj%^3&something';
// One array element per character, each then prefixed with a backslash
var split = str.split("");
split = split.map(function (ch) {
  return '\\' + ch;
});
// Glue the prefixed characters back together into a single string
var joint = split.join('');
console.log(joint);

If you don't care about creating a new string and don't really have to use a regex, why not just iterate over the existing one and place a \ before each char. Notice that you have to write \\ to escape the first \.
To consider it safe, you have to encode it somehow. You could replace typical 'non-safe' characters like in the encode() function below. Notice how & gets replaced by &amp;
let str = 'dj%^3&something';
let out = "";
// Prefix every character with a backslash ("\\" is one literal backslash)
for (const ch of str) {
  out += "\\" + ch;
}
console.log(out);
console.log(encode(out));

/**
 * Replaces the characters that are special in HTML (&, <, > and ") with their
 * entities, so the result is displayed as text instead of parsed as markup.
 *
 * @param {*} string - Value to encode; converted with String() first.
 * @returns {string} The HTML-encoded text.
 */
function encode(string) {
  // "&" must be replaced first, otherwise the "&" of the entities produced by
  // the later replacements would be encoded a second time.
  return String(string)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');
}

Replace content present in the nested brackets

Input = ABCDEF ((3) abcdef),GHIJKLMN ((4)(5) Value),OPQRSTUVW((4(5)) Value (3))
Expected Output = ABCDEF,GHIJKLMN,OPQRSTUVW
Tried so far
// Attempt: remove optional spaces, a "(", any run of characters other than ")",
// and then zero or more ")". [^)]* stops at the first ")", so the pattern has
// no notion of nesting depth.
Output = Input.replace(/ *\([^)]*\)*/g, "");

Using a regex here probably won't work, or scale, because you expect nested parentheses in your input string. Regex works well when there is a known and fixed structure to the input. Instead, I would recommend that you approach this using a parser. In the code below, I iterate over the input string, one character at a time, and I use a counter to keep track of how many open parentheses there are. If we are inside a parenthesis term, then we don't record those characters. I also have one simple replacement at the end to remove whitespace, which is an additional step which your output implies, but you never explicitly mentioned.
var Input = "ABCDEF ((3) abcdef),GHIJKLMN ((4)(5) Value),OPQRSTUVW((4(5)) Value (3))";
var Output = "";
// Number of parentheses currently open at the scan position
var pCount = 0;
for (const ch of Input) {
  switch (ch) {
    case '(':
      pCount++;
      break;
    case ')':
      pCount--;
      break;
    default:
      // Keep a character only while no parenthesis is open
      if (pCount === 0) {
        Output += ch;
      }
  }
}
// Drop the spaces that were left in front of the removed groups
Output = Output.replace(/ /g,'');
console.log(Output);

If you need to remove nested parentheses, you may use a trick from Remove Nested Patterns with One Line of JavaScript.
var Input = "ABCDEF ((3) abcdef),GHIJKLMN ((4)(5) Value),OPQRSTUVW((4(5)) Value (3))";
var Output = Input;
// Each pass strips the innermost (...) groups together with any whitespace in
// front of them; repeat until a pass no longer changes the string.
let previous;
do {
  previous = Output;
  Output = Output.replace(/\s*\([^()]*\)/g, "");
} while (Output != previous);
console.log(Output);
Or, you could use a recursive function:
/**
 * Removes every parenthesized group, including nested ones, together with the
 * whitespace directly in front of it.
 *
 * @param {string} s - Input text.
 * @returns {string} The text with all parenthesized groups removed.
 */
function remove_nested_parens(s) {
  let current = s;
  for (;;) {
    // One pass removes only the innermost groups (those containing no parens)
    const stripped = current.replace(/\s*\([^()]*\)/g, "");
    if (stripped == current) {
      return current;
    }
    current = stripped;
  }
}
console.log(remove_nested_parens("ABCDEF ((3) abcdef),GHIJKLMN ((4)(5) Value),OPQRSTUVW((4(5)) Value (3))"));
Here, \s*\([^()]*\) matches 0+ whitespaces, (, 0+ chars other than ( and ) and then a ), and the replace operation is repeated until the string does not change.

Split string in javascript by lines, preserving newlines?

How would I split a javascript string such as foo\nbar\nbaz to an array of lines, while preserving the newlines? I'd like to get ['foo\n', 'bar\n', 'baz'] as output;
I'm aware there are numerous possible answers - I'm just curious to find a stylish one.
With perl I'd use a zero-width lookbehind assertion: split /(?<=\n)/, but they are not supported in javascript regexs.
PS. Extra points for handling different line endings (at least \r\n) and handling the missing last newline (as in my example).

You can perform a global match with this pattern: /[^\n]+(?:\r?\n|$)/g
It matches any non-newline character then matches an optional \r followed by \n, or the end of the string.
// One or more non-newline characters, followed by an optional \r and \n or by
// the end of the string
var linePattern = /[^\n]+(?:\r?\n|$)/g;
var input = "foo\r\n\nbar\nbaz";
var result = input.match(linePattern);
Result: ["foo\r\n", "bar\n", "baz"]

how about this?
"foo\nbar\nbaz".split(/^/m);
Result
["foo
", "bar
", "baz"]

The other answers and answers in comments are all flawed in different ways. I needed a function that works correctly on any string or file.
Here is a simple and correct answer:
/**
 * Splits a string into lines, keeping the "\n" at the end of each line.
 * A final line without a trailing "\n" is included only when it is not empty.
 *
 * @param {string} s - Text to split.
 * @returns {string[]} The lines; joined together they give back `s`.
 */
function split_lines(s) {
  // match() returns null when nothing matches (empty input); return [] instead
  // so that callers can always treat the result as an array.
  return s.match(/[^\n]*\n|[^\n]+/g) || [];
}
const input = "foo\r\n\nbar\n\r\nba\rz\r\r\r";
const a = split_lines(input);
Array(5) [ "foo\r\n", "\n", "bar\n", "\r\n", "ba\rz\r\r\r" ]
It effectively splits at each newline \n but includes the \n, and includes a final line without trailing \n if and only if it is not empty. It includes all input characters in the output. We don't need any special treatment for \r.
I've tested this on a large chunk of random data, it does preserve all input characters, and \n only occur at the end of the lines.
Here's a test script:
/**
 * Splits a string into lines, keeping the "\n" at the end of each line.
 * A final line without a trailing "\n" is included only when it is not empty.
 *
 * @param {string} s - Text to split.
 * @returns {string[]} The lines; joined together they give back `s`.
 */
function split_lines(s) {
  // match() returns null when nothing matches (empty input); return [] instead
  // so that callers can always treat the result as an array.
  return s.match(/[^\n]*\n|[^\n]+/g) || [];
}
/**
 * Generates a random string of `n` UTF-16 code units for testing.
 *
 * @param {number} n - Number of code units to generate.
 * @param {number} [ncharset=256] - Ordinary characters get codes in [0, ncharset).
 * @param {number} [nlprob=0.05] - Probability that a position is forced to "\n".
 * @param {number} [crprob=0.05] - Probability that a position is forced to "\r".
 * @returns {string} The generated string.
 */
function gen_random_string(n, ncharset=256, nlprob=0.05, crprob=0.05) {
  // Draws below this threshold become a forced "\n" or "\r"
  const specialProb = nlprob + crprob;
  let s = "";
  for (let i = 0; i < n; ++i) {
    const r = Math.random();
    if (r < nlprob) {
      s += "\n";
    } else if (r < specialProb) {
      s += "\r";
    } else {
      // Here r lies in [specialProb, 1). Shift and rescale it to [0, 1) before
      // scaling, so the code lies in [0, ncharset); without the shift the low
      // codes are never produced and the top of the range overshoots ncharset.
      const unit = (r - specialProb) / (1 - specialProb);
      s += String.fromCharCode(Math.floor(unit * ncharset));
    }
  }
  return s;
}
/**
 * Generates a random string with gen_random_string(...args), splits it with
 * split_lines, and logs whether the pieces join back into the original string
 * and whether "\n" only ever appears as the last character of a line.
 *
 * @param {...*} args - Forwarded unchanged to gen_random_string.
 */
function test(...args) {
  const text = gen_random_string(...args);
  console.log(`generated random string of length ${text.length} with args:`, ...args);
  const lines = split_lines(text);
  console.log(`split into ${lines.length} lines`);

  const roundTrips = text === lines.join('');
  console.log("split lines combine to give the original string?", roundTrips ? "OK" : "FAIL");

  const lastIndex = lines.length - 1;
  const linesWellFormed = lines.every((line, index) => {
    // Every line except the last one must end with "\n" ...
    const endsProperly = line.endsWith("\n") || index === lastIndex;
    // ... and no line may contain "\n" before its last character.
    const noInnerNewline = !line.slice(0, -1).includes("\n");
    return endsProperly && noInnerNewline;
  });
  console.log("tested each line other than the last ends with \\n");
  console.log("tested each line does not contain \\n before the last character");

  const ok = roundTrips && linesWellFormed;
  console.log("Final result", ok ? "OK" : "FAIL");
}
test(10000, 256);
test(10000, 65536);

I'd stay away from split with regular expressions since IE has a failed implementation of it. Use match instead.
"foo\nbar\nbaz".match(/^.*(\r?\n|$)/mg)
Result: ["foo\n", "bar\n", "baz"]

One simple but crude method would be first to replace each "\n" with 2 special characters. Split on the second one, and replace the first with "\n" after splitting. Not efficient and not elegant, but it definitely works.

Develop Reference

JavaScript is the programming language of the Web.

Writing a Hexadecimal Escape Character Sequence using Variables - javascript

Related

Javascript hexadecimal to ASCII with latin extended symbols

Split and replace text by two rules (regex)

Javascript regex to add to every character of a string a backslash

Replace content present in the nested brackets

Split string in javascript by lines, preserving newlines?

Categories

Resources