Split string in javascript by lines, preserving newlines? - javascript

How would I split a javascript string such as foo\nbar\nbaz to an array of lines, while preserving the newlines? I'd like to get ['foo\n', 'bar\n', 'baz'] as output;
I'm aware there are numerous possible answers - I'm just curious to find a stylish one.
With perl I'd use a zero-width lookbehind assertion: split /(?<=\n)/, but they are not supported in javascript regexs.
PS. Extra points for handling different line endings (at least \r\n) and handling the missing last newline (as in my example).

You can perform a global match with this pattern: /[^\n]+(?:\r?\n|$)/g
It matches any non-newline character then matches an optional \r followed by \n, or the end of the string.
var input = "foo\r\n\nbar\nbaz";
var result = input.match(/[^\n]+(?:\r?\n|$)/g);
Result: ["foo\r\n", "bar\n", "baz"]

how about this?
"foo\nbar\nbaz".split(/^/m);
Result
["foo
", "bar
", "baz"]

The other answers and answers in comments are all flawed in different ways. I needed a function that works correctly on any string or file.
Here is a simple and correct answer:
function split_lines(s) {
return s.match(/[^\n]*\n|[^\n]+/g);
}
input = "foo\r\n\nbar\n\r\nba\rz\r\r\r";
a = split_lines(input);
Array(5) [ "foo\r\n", "\n", "bar\n", "\r\n", "ba\rz\r\r\r" ]
It effectively splits at each newline \n but includes the \n, and includes a final line without trailing \n if and only if it is not empty. It includes all input characters in the output. We don't need any special treatment for \r.
I've tested this on a large chunk of random data, it does preserve all input characters, and \n only occur at the end of the lines.
Here's a test script:
function split_lines(s) {
return s.match(/[^\n]*\n|[^\n]+/g);
}
function gen_random_string(n, ncharset=256, nlprob=0.05, crprob=0.05) {
var s = "";
for (let i = 0; i < n; ++i) {
var r = Math.random();
if (r < nlprob)
s += "\n";
else if (r < nlprob + crprob)
s += "\r";
else {
var cc = Math.floor(r / (1 - nlprob - crprob) * ncharset);
var c = String.fromCharCode(cc);
s += c;
}
}
return s;
}
function test(...args) {
var s = gen_random_string(...args);
console.log(`generated random string of length ${s.length} with args:`, ...args);
var ok = true, ok1;
var a = split_lines(s);
console.log(`split into ${a.length} lines`);
ok1 = s === a.join('');
ok = ok && ok1;
console.log("split lines combine to give the original string?", ok1 ? "OK" : "FAIL");
for (var i = 0; i < a.length; ++i) {
var s1 = a[i];
ok1 = s1.endsWith("\n") || i == a.length-1;
ok = ok && ok1;
ok1 = !s1.slice(0, -1).includes("\n");
ok = ok && ok1;
}
console.log("tested each line other than the last ends with \\n");
console.log("tested each line does not contain \\n before the last character");
console.log("Final result", ok ? "OK" : "FAIL");
}
test(10000, 256);
test(10000, 65536);

I'd stay away from split with regular expressions since IE has a failed implementation of it. Use match instead.
"foo\nbar\nbaz".match(/^.*(\r?\n|$)/mg)
Result: ["foo\n", "bar\n", "baz"]

One simple but crude method would be first to replace "\n"s with a 2 special characters. Split on the second one, and replace the first with "\n" after splitting. Not efficient and not elegant, but definitely works.

Related

Writing a Hexadecimal Escape Character Sequence using Variables

Testing Hex Character Codes
Problem
What does a Vertical Tab or a Backspace character actually do? I want to find out.
My experiment is to find out exactly what happens when every hex character is put into a string. I thought the best way to do this would be to created a nested loop to go through each of the 16 hexadecimal characters to create each possible 2 digit hex character code.
I soon discovered that you cannot use the \x escape character with interpolated variables, and so I expect what I have set out to do might be impossible.
const hexCharacters = "0123456789ABCDEF";
let code = "";
let char1 = "";
let char2 = "";
for (charPos1 = 0; charPos1 < hexCharacters.length; charPos1++) {
for (charPos2 = 0; charPos2 < hexCharacters.length; charPos2++) {
char1 = hexCharacters[charPos1];
char2 = hexCharacters[charPos2];
code = `${char1}${char2}`;
printHexChar(code);
}
}
function printHexChar(string) {
let output = `<p>Hex Code ${string} = \x${string}</p>`; // THE PROBLEM IS CLEAR
document.write(output)
}
I know it will also probably fail once it gets past 7F or whichever is the last character in the set, but that's not the main issue here! :D
Potential solution
string.prototype.fromCharCode
This sort of string method approach would seem to be the answer, but it is meant for U-16 character codes, and that is not what I wanted to test. There doesn't seem to be an existing string method for hex codes. Probably because nobody would ever want one, but nevertheless it would be cool.
Conclusion
Is there any way to create an escape character sequence from assembled parts that will render not as plain text, but as a proper escape character sequence?
Apologies if this has been asked before in some form, but with my feeble understanding of things I just couldn't find an answer.
You can use String.fromCharCode with parseInt.
`<p>Hex Code ${string} = ${String.fromCharCode(parseInt(string, 16))}</p>`;
const hexCharacters = "0123456789ABCDEF";
let code = "";
let char1 = "";
let char2 = "";
for (charPos1 = 0; charPos1 < hexCharacters.length; charPos1++) {
for (charPos2 = 0; charPos2 < hexCharacters.length; charPos2++) {
char1 = hexCharacters[charPos1];
char2 = hexCharacters[charPos2];
code = `${char1}${char2}`;
printHexChar(code);
}
}
function printHexChar(string) {
let output = `<p>Hex Code ${string} = ${String.fromCharCode(parseInt(string, 16))}</p>`;
document.write(output)
}
eval works as well, though it should generally be avoided.
`<p>Hex Code ${string} = ${eval('"\\x'+string+'"')}</p>`
If you want to output \x literally, then in a string literal you need to escape the escape character, so `\\x`.
string.prototype.fromCharCode [...] is meant for U-16 character codes
JavaScript uses one character encoding. The following strings are all equal:
let a = String.fromCharCode(27);
let b = "\x1B";
let c = "\u001B";
console.log(a === b, b === c);
If I understand correctly, you want to produce a string literal that shows \x escape sequences -- not the actual character:
// Prepare string
let chars = Array.from({length: 128}, (_, i) => String.fromCharCode(i))
.join("");
// Escape them
let escaped = Array.from(chars, ch => `\\x${ch.charCodeAt().toString(16).padStart(2, "0")}`).join("");
console.log(escaped);
But you might also use JSON.stringify. Although it uses different escape sequences (\u instead of \x), and only for non-display characters, it will be the exact same string when evaluated. Here is a demo:
// Prepare string
let chars = Array.from({length: 128}, (_, i) => String.fromCharCode(i))
.join("");
// Escape them
let escaped = '"' + Array.from(chars, ch => `\\x${ch.charCodeAt().toString(16).padStart(2, "0")}`).join("") + '"';
console.log(escaped);
// Or JSONify them
let json = JSON.stringify(chars);
console.log(json);
// Compare them, when evaluated:
console.log(eval(escaped) === eval(json));
Finally, note that there is nothing special about hexadecimal: it is just a representation of an integer. In the end, it is the numerical value that is important, not the representation of it. It is that numerical value that corresponds to a character.
Addendum
If you prefer code that sticks to old-style JavaScript, here is something equivalent of the last code snippet:
// Prepare string
let chars = "";
for (let i = 0; i < 128; i++) {
chars += String.fromCharCode(i);
}
// Escape the characters in this string
let escaped = '"';
for (let i = 0; i < chars.length; i++) {
let ch = chars.charCodeAt(i);
let hex = ch.toString(16);
if (hex.length === 1) hex = "0" + hex;
escaped += "\\x" + hex;
}
escaped += '"';
console.log(escaped);
// Or JSONify them
let json = JSON.stringify(chars);
console.log(json);
// Compare them, when evaluated:
console.log(eval(escaped) === eval(json));

Replace content present in the nested brackets

Input = ABCDEF ((3) abcdef),GHIJKLMN ((4)(5) Value),OPQRSTUVW((4(5)) Value (3))
Expected Output = ABCDEF,GHIJKLMN,OPQRSTUVW
Tried so far
Output = Input.replace(/ *\([^)]*\)*/g, "");
Using a regex here probably won't work, or scale, because you expect nested parentheses in your input string. Regex works well when there is a known and fixed structure to the input. Instead, I would recommend that you approach this using a parser. In the code below, I iterate over the input string, one character at at time, and I use a counter to keep track of how many open parentheses there are. If we are inside a parenthesis term, then we don't record those characters. I also have one simple replacement at the end to remove whitespace, which is an additional step which your output implies, but you never explicitly mentioned.
var pCount = 0;
var Input = "ABCDEF ((3) abcdef),GHIJKLMN ((4)(5) Value),OPQRSTUVW((4(5)) Value (3))";
var Output = "";
for (var i=0; i < Input.length; i++) {
if (Input[i] === '(') {
pCount++;
}
else if (Input[i] === ')') {
pCount--;
}
else if (pCount == 0) {
Output += Input[i];
}
}
Output = Output.replace(/ /g,'');
console.log(Output);
If you need to remove nested parentheses, you may use a trick from Remove Nested Patterns with One Line of JavaScript.
var Input = "ABCDEF ((3) abcdef),GHIJKLMN ((4)(5) Value),OPQRSTUVW((4(5)) Value (3))";
var Output = Input;
while (Output != (Output = Output.replace(/\s*\([^()]*\)/g, "")));
console.log(Output);
Or, you could use a recursive function:
function remove_nested_parens(s) {
let new_s = s.replace(/\s*\([^()]*\)/g, "");
return new_s == s ? s : remove_nested_parens(new_s);
}
console.log(remove_nested_parens("ABCDEF ((3) abcdef),GHIJKLMN ((4)(5) Value),OPQRSTUVW((4(5)) Value (3))"));
Here, \s*\([^()]*\) matches 0+ whitespaces, (, 0+ chars other than ( and ) and then a ), and the replace operation is repeated until the string does not change.

How to remove string between two characters every time they occur [duplicate]

This question already has answers here:
Strip HTML from Text JavaScript
(44 answers)
removing html tags from string
(3 answers)
Closed 7 years ago.
I need to get rid of any text inside < and >, including the two delimiters themselves.
So for example, from string
<brev-y>th</brev-y><sw-ex>a</sw-ex><sl>t</sl>​
I would like to get this one
that
This is what i've tried so far:
var str = annotation.split(' ');
str.substring(str.lastIndexOf("<") + 1, str.lastIndexOf(">"))
But it doesn't work for every < and >.
I'd rather not use RegEx if possible, but I'm happy to hear if it's the only option.
You can simply use the replace method with /<[^>]*>/g.It matches < followed by [^>]* any amount of non> until > globally.
var str = '<brev-y>th</brev-y><sw-ex>a</sw-ex><sl>t</sl>';
str = str.replace(/<[^>]*>/g, "");
alert(str);
For string removal you can use RegExp, it is ok.
"<brev-y>th</brev-y><sw-ex>a</sw-ex><sl>t</sl>​".replace(/<\/?[^>]+>/g, "")
Since the text you want is always after a > character, you could split it at that point, and then the first character in each String of the array would be the character you need. For example:
String[] strings = stringName.split("<");
String word = "";
for(int i = 0; i < strings.length; i++) {
word += strings[i].charAt(0);
}
This is probably glitchy right now, but I think this would work. You don't need to actually remove the text between the "<>"- just get the character right after a '>'
Using a regular expression is not the only option, but it's a pretty good option.
You can easily parse the string to remove the tags, for example by using a state machine where the < and > characters turns on and off a state of ignoring characters. There are other methods of course, some shorter, some more efficient, but they will all be a few lines of code, while a regular expression solution is just a single replace.
Example:
function removeHtml1(str) {
return str.replace(/<[^>]*>/g, '');
}
function removeHtml2(str) {
var result = '';
var ignore = false;
for (var i = 0; i < str.length; i++) {
var c = str.charAt(i);
switch (c) {
case '<': ignore = true; break;
case '>': ignore = false; break;
default: if (!ignore) result += c;
}
}
return result;
}
var s = "<brev-y>th</brev-y><sw-ex>a</sw-ex><sl>t</sl>";
console.log(removeHtml1(s));
console.log(removeHtml2(s));
There are several ways to do this. Some are better than others. I haven't done one lately for these two specific characters, so I took a minute and wrote some code that may work. I will describe how it works. Create a function with a loop that copies an incoming string, character by character, to an outgoing string. Make the function a string type so it will return your modified string. Create the loop to scan from incoming from string[0] and while less than string.length(). Within the loop, add an if statement. When the if statement sees a "<" character in the incoming string it stops copying, but continues to look at every character in the incoming string until it sees the ">" character. When the ">" is found, it starts copying again. It's that simple.
The following code may need some refinement, but it should get you started on the method described above. It's not the fastest and not the most elegant but the basic idea is there. This did compile, and it ran correctly, here, with no errors. In my test program it produced the correct output. However, you may need to test it further in the context of your program.
string filter_on_brackets(string str1)
{
string str2 = "";
int copy_flag = 1;
for (size_t i = 0 ; i < str1.length();i++)
{
if(str1[i] == '<')
{
copy_flag = 0;
}
if(str1[i] == '>')
{
copy_flag = 2;
}
if(copy_flag == 1)
{
str2 += str1[i];
}
if(copy_flag == 2)
{
copy_flag = 1;
}
}
return str2;
}

Splitting string to array while ignoring content between apostrophes

I need something that takes a string, and divides it into an array.
I want to split it after every space, so that this -
"Hello everybody!" turns into ---> ["Hello", "Everybody!"]
However, I want it to ignore spaces inbetween apostrophes. So for examples -
"How 'are you' today?" turns into ---> ["How", "'are you'", "today?"]
Now I wrote the following code (which works), but something tells me that what I did is pretty much horrible and that it can be done with probably 50% less code.
I'm also pretty new to JS so I guess I still don't adhere to all the idioms of the language.
function getFixedArray(text) {
var textArray = text.split(' '); //Create an array from the string, splitting by spaces.
var finalArray = [];
var bFoundLeadingApostrophe = false;
var bFoundTrailingApostrophe = false;
var leadingRegExp = /^'/;
var trailingRegExp = /'$/;
var concatenatedString = "";
for (var i = 0; i < textArray.length; i++) {
var text = textArray[i];
//Found a leading apostrophe
if(leadingRegExp.test(text) && !bFoundLeadingApostrophe && !trailingRegExp.test(text)) {
concatenatedString =concatenatedString + text;
bFoundLeadingApostrophe = true;
}
//Found the trailing apostrophe
else if(trailingRegExp.test(text ) && !bFoundTrailingApostrophe) {
concatenatedString = concatenatedString + ' ' + text;
finalArray.push(concatenatedString);
concatenatedString = "";
bFoundLeadingApostrophe = false;
bFoundTrailingApostrophe = false;
}
//Found no trailing apostrophe even though the leading flag indicates true, so we want this string.
else if (bFoundLeadingApostrophe && !bFoundTrailingApostrophe) {
concatenatedString = concatenatedString + ' ' + text;
}
//Regular text
else {
finalArray.push(text);
}
}
return finalArray;
}
I would deeply appreciate it if somebody could go through this and teach me how this should be rewritten, in a more correct & efficient way (and perhaps a more "JS" way).
Thanks!
Edit -
Well I just found a few problems, some of which I fixed, and some I'm not sure how to handle without making this code too complex (for example the string "hello 'every body'!" doesn't split properly....)
You could try matching instead of splitting:
string.match(/(?:['"].+?['"])|\S+/g)
The above regex will match anything in between quotes (including the quotes), or anything that's not a space otherwise.
If you want to also match characters after the quotes, like ? and ! you can try:
/(?:['"].+?['"]\W?)|\S+/g
For "hello 'every body'!" it will give you this array:
["hello", "'every body'!"]
Note that \W matches space as well, if you want to match punctuation you could be explicit by using a character class in place of \W
[,.?!]
Or simply trim the strings after matching:
string.match(regex).map(function(x){return x.trim()})

Javascript Remove strings in beginning and end

base on the following string
...here..
..there...
.their.here.
How can i remove the . on the beginning and end of string like the trim that removes all spaces, using javascript
the output should be
here
there
their.here
These are the reasons why the RegEx for this task is /(^\.+|\.+$)/mg:
Inside /()/ is where you write the pattern of the substring you want to find in the string:
/(ol)/ This will find the substring ol in the string.
var x = "colt".replace(/(ol)/, 'a'); will give you x == "cat";
The ^\.+|\.+$ in /()/ is separated into 2 parts by the symbol | [means or]
^\.+ and \.+$
^\.+ means to find as many . as possible at the start.
^ means at the start; \ is to escape the character; adding + behind a character means to match any string containing one or more that character
\.+$ means to find as many . as possible at the end.
$ means at the end.
The m behind /()/ is used to specify that if the string has newline or carriage return characters, the ^ and $ operators will now match against a newline boundary, instead of a string boundary.
The g behind /()/ is used to perform a global match: so it find all matches rather than stopping after the first match.
To learn more about RegEx you can check out this guide.
Try to use the following regex
var text = '...here..\n..there...\n.their.here.';
var replaced = text.replace(/(^\.+|\.+$)/mg, '');
Here is working Demo
Use Regex /(^\.+|\.+$)/mg
^ represent at start
\.+ one or many full stops
$ represents at end
so:
var text = '...here..\n..there...\n.their.here.';
alert(text.replace(/(^\.+|\.+$)/mg, ''));
Here is an non regular expression answer which utilizes String.prototype
String.prototype.strim = function(needle){
var first_pos = 0;
var last_pos = this.length-1;
//find first non needle char position
for(var i = 0; i<this.length;i++){
if(this.charAt(i) !== needle){
first_pos = (i == 0? 0:i);
break;
}
}
//find last non needle char position
for(var i = this.length-1; i>0;i--){
if(this.charAt(i) !== needle){
last_pos = (i == this.length? this.length:i+1);
break;
}
}
return this.substring(first_pos,last_pos);
}
alert("...here..".strim('.'));
alert("..there...".strim('.'))
alert(".their.here.".strim('.'))
alert("hereagain..".strim('.'))
and see it working here : http://jsfiddle.net/cettox/VQPbp/
Slightly more code-golfy, if not readable, non-regexp prototype extension:
String.prototype.strim = function(needle) {
var out = this;
while (0 === out.indexOf(needle))
out = out.substr(needle.length);
while (out.length === out.lastIndexOf(needle) + needle.length)
out = out.slice(0,out.length-needle.length);
return out;
}
var spam = "this is a string that ends with thisthis";
alert("#" + spam.strim("this") + "#");
Fiddle-ige
Use RegEx with javaScript Replace
var res = s.replace(/(^\.+|\.+$)/mg, '');
We can use replace() method to remove the unwanted string in a string
Example:
var str = '<pre>I'm big fan of Stackoverflow</pre>'
str.replace(/<pre>/g, '').replace(/<\/pre>/g, '')
console.log(str)
output:
Check rules on RULES blotter

Categories

Resources