Split a CSV string by line skipping newlines contained between quotes

Split a CSV string by line skipping newlines contained between quotes - javascript

If the following regex can split a csv string by line.
var lines = csv.split(/\r|\r?\n/g);
How could this be adapted to skip newline chars that are contained within a CSV value (Ie between quotes/double-quotes)?
Example:
2,"Evans & Sutherland","230-132-111AA",,"Visual","P
CB",,1,"Offsite",
If you don't see it, here's a version with the newlines visible:
2,"Evans & Sutherland","230-132-111AA",,"Visual","P\r\nCB",,1,"Offsite",\r\n
The part I'm trying to skip over is the newline contained in the middle of the "PCB" entry.
Update:
I probably should've mentioned this before but this is a part of a dedicated CSV parsing library called jquery-csv. To provide a better context I have added the current parser implementation below.
Here's the code for validating and parsing an entry (ie one line):
/**
 * Validates and parses a single CSV entry (one line) into an array of values.
 *
 * @param {string} csv - The text of one CSV entry.
 * @param {Object} [meta] - Optional settings.
 * @param {string} [meta.separator] - Value separator; falls back to $.csvDefaults.separator.
 * @param {string} [meta.delimiter] - Text delimiter (quote char); falls back to $.csvDefaults.delimiter.
 * @returns {?Array<string>} The parsed values, or null when `csv` fails validation.
 */
$.csvEntry2Array = function(csv, meta) {
var meta = (meta !== undefined ? meta : {});
var separator = 'separator' in meta ? meta.separator : $.csvDefaults.separator;
var delimiter = 'delimiter' in meta ? meta.delimiter : $.csvDefaults.delimiter;
// build the CSV validator regex
// The literal below is a template: every capital "S" stands for the separator
// and every capital "D" for the delimiter; both are substituted textually.
// NOTE(review): replace(/S/g, ...) also rewrites the "S" of each "\S" escape in
// the template, and separator/delimiter are inserted without regex escaping
// (e.g. "|" or "." would change the pattern's meaning) - confirm this is intended.
var reValid = /^\s*(?:D[^D\\]*(?:\\[\S\s][^D\\]*)*D|[^SD\s\\]*(?:\s+[^SD\s\\]+)*)\s*(?:S\s*(?:D[^D\\]*(?:\\[\S\s][^D\\]*)*D|[^SD\s\\]*(?:\s+[^SD\s\\]+)*)\s*)*$/;
reValid = RegExp(reValid.source.replace(/S/g, separator));
reValid = RegExp(reValid.source.replace(/D/g, delimiter));
// build the CSV line parser regex
// Group 1 captures the inside of a delimited value, group 2 an undelimited value.
var reValue = /(?!\s*$)\s*(?:D([^D\\]*(?:\\[\S\s][^D\\]*)*)D|([^SD\s\\]*(?:\s+[^SD\s\\]+)*))\s*(?:S|$)/g;
reValue = RegExp(reValue.source.replace(/S/g, separator), 'g');
reValue = RegExp(reValue.source.replace(/D/g, delimiter), 'g');
// Return NULL if input string is not well formed CSV string.
if (!reValid.test(csv)) {
return null;
}
// "Walk" the string using replace with callback.
// replace() is used only to visit each match; its return value is discarded.
var output = [];
csv.replace(reValue, function(m0, m1, m2) {
// Remove backslash from any delimiters in the value
if (m1 !== undefined) {
var reDelimiterUnescape = /\\D/g;
reDelimiterUnescape = RegExp(reDelimiterUnescape.source.replace(/D/, delimiter), 'g');
output.push(m1.replace(reDelimiterUnescape, delimiter));
} else if (m2 !== undefined) {
output.push(m2);
}
return '';
});
// Handle special case of empty last value.
// A trailing separator (optionally followed by whitespace) means one more empty value.
var reEmptyLast = /S\s*$/;
reEmptyLast = RegExp(reEmptyLast.source.replace(/S/, separator));
if (reEmptyLast.test(csv)) {
output.push('');
}
return output;
};
Note: I haven't tested yet but I think I could probably incorporate the last match into the main split/callback.
This is the code that does the split-by-line part:
/**
 * Parses a multi-line CSV string into an array of parsed entries.
 *
 * The input is split on every line break (\r\n, \r or \n) without regard to
 * quoting, so a line break inside a delimited value still ends the entry.
 *
 * @param {string} csv - The CSV text.
 * @param {Object} [meta] - Optional settings.
 * @param {string} [meta.separator] - Value separator; falls back to $.csvDefaults.separator.
 * @param {string} [meta.delimiter] - Text delimiter; falls back to $.csvDefaults.delimiter.
 * @param {number} [meta.skip] - Number of leading lines to skip; falls back to $.csvDefaults.skip.
 * @returns {Array} One element per processed line: whatever $.csvEntry2Array returns for it.
 */
$.csv2Array = function(csv, meta) {
  var options = (meta !== undefined ? meta : {});
  var separator = 'separator' in options ? options.separator : $.csvDefaults.separator;
  var delimiter = 'delimiter' in options ? options.delimiter : $.csvDefaults.delimiter;
  var skip = 'skip' in options ? options.skip : $.csvDefaults.skip;
  // process by line
  var lines = csv.split(/\r\n|\r|\n/g);
  var output = [];
  // Index loop instead of for...in: for...in yields string keys and would also
  // visit enumerable properties added to Array.prototype.
  for (var i = 0; i < lines.length; i++) {
    if (i < skip) {
      continue;
    }
    // process each value
    var line = $.csvEntry2Array(lines[i], {
      delimiter: delimiter,
      separator: separator
    });
    output.push(line);
  }
  return output;
};
For a breakdown of how that regex works, take a look at this answer. Mine is a slightly adapted version. I consolidated the single and double quote matching to match just one text delimiter and made the delimiter/separators dynamic. It does a great job of validating entries, but the line-splitting solution I added on top is pretty frail and breaks on the edge case I described above.
I'm just looking for a solution that walks the string extracting valid entries (to pass on to the entry parser) or fails on bad data returning an error indicating the line the parsing failed on.
Update:
splitLines: function(csv, delimiter) {
var state = 0;
var value = "";
var line = "";
var lines = [];
function endOfRow() {
lines.push(value);
value = "";
state = 0;
};
csv.replace(/(\"|,|\n|\r|[^\",\r\n]+)/gm, function (m0){
switch (state) {
// the start of an entry
case 0:
if (m0 === "\"") {
state = 1;
} else if (m0 === "\n") {
endOfRow();
} else if (/^\r$/.test(m0)) {
// carriage returns are ignored
} else {
value += m0;
state = 3;
}
break;
// delimited input
case 1:
if (m0 === "\"") {
state = 2;
} else {
value += m0;
state = 1;
}
break;
// delimiter found in delimited input
case 2:
// is the delimiter escaped?
if (m0 === "\"" && value.substr(value.length - 1) === "\"") {
value += m0;
state = 1;
} else if (m0 === ",") {
value += m0;
state = 0;
} else if (m0 === "\n") {
endOfRow();
} else if (m0 === "\r") {
// Ignore
} else {
throw new Error("Illegal state");
}
break;
// un-delimited input
case 3:
if (m0 === ",") {
value += m0;
state = 0;
} else if (m0 === "\"") {
throw new Error("Unquoted delimiter found");
} else if (m0 === "\n") {
endOfRow();
} else if (m0 === "\r") {
// Ignore
} else {
throw new Error("Illegal data");
}
break;
default:
throw new Error("Unknown state");
}
return "";
});
if (state != 0) {
endOfRow();
}
return lines;
}
All it took is 4 states for a line splitter:
0: the start of an entry
1: the following is quoted
2: a second quote has been encountered
3: the following isn't quoted
It's almost a complete parser. For my use case, I just wanted a line splitter so I could provide a more granular approach to processing CSV data.
Note: Credit for this approach goes to another dev whom I won't name publicly without his permission. All I did was adapt it from a complete parser to a line-splitter.
Update:
Discovered a few broken edge cases in the previous lineSplitter implementation. The one provided should be fully RFC 4180 compliant.

As I have noted in a comment there is no complete solution just using single regex.
A novel method using several regexps by splitting on comma and joining back strings with embedded commas is described here:-
Personally I would use a simple finite state machine as described here
The state machine has more code, but the code is cleaner and its clear what each piece of code is doing. Longer term this will be much more reliable and maintainable.

It's not a good idea to use regex's to parse. Better to use it to detect the "bad" splits and then merge them back:
// Split on every line break first, then glue back the lines that were split
// in the middle of a quoted value.
var lines = csv.split(/\r?\n/g);
// Indexes (collected bottom-up) of lines that must be merged into the line above.
var bad = [];
// Walks from the last line upwards; the loop stops before index 0, so the
// first line is never inspected on its own.
for(var i=lines.length-1; i> 0; i--) {
// find all the unescaped quotes on the line:
// NOTE(review): the leading [^\\]? is optional, so a backslash-escaped quote
// is still matched and counted - confirm this is acceptable for the data.
var m = lines[i].match(/[^\\]?\"/g);
// if there are an odd number of them, this line, and the line after it is bad:
// The extra i-- skips the line above, which is treated as the other half of the pair.
if((m ? m.length : 0) % 2 == 1) { bad.push(i--); }
}
// starting at the bottom of the list, merge lines back, using \r\n
// Each splice replaces the pair (line above, bad line) with their concatenation.
for(var b=0,len=bad.length; b < len; b++) {
lines.splice(bad[b]-1, 2, lines[bad[b]-1]+"\r\n"+lines[bad[b]]);
}
(This answer is licensed under both CC0 and WTFPL.)

Be careful- That newline is PART of that value. It's not PCB, it's P\nCB.
However, why can't you just use string.split(',')? If need be, you can run through the list and cast to ints or remove the padded quotation marks.

Related

How do I mask an email address between the first and the last character before the # sign?

My goal is to edit the string (which has an email) to mask the first part, like say the email is johndoe#abc.com then I should output j*****e#abc.com.
/**
 * Attempts to mask the part of S before '#', keeping its first and last character.
 *
 * As written the masking has no effect: `first` is a string, and assigning to
 * `first[i]` does not modify it (string index properties are read-only; in
 * strict mode the assignment throws a TypeError instead of being ignored).
 *
 * @param {string} S - Text expected to contain a '#' separator.
 * @returns {string} The lower-cased input (unmasked, see above), or "" when S has no '#'.
 */
var maskPII = function(S) {
var ans = "";
if(S.includes("#")){
S = S.toLowerCase();
var parts = S.split("#");
var first = parts[0];
for(var i=0;i<parts[0].length;i++){
if(i!=0 && i!=parts[0].length - 1)
// No-op on a string primitive: strings are immutable.
first[i] = '*';
}
// Only parts[1] is appended, so anything after a second '#' is dropped.
ans = first +"#" +parts[1];
}else{
}
return ans;
};
However in my loop I can't change the characters to asterisks.
After execution I see value of first still same as parts[0] and has no asterisks, can some one explain why? Also, what would I need to do to modify the variable inside loop?

To answer your question... javascript allows you access values of a string using [] indexing.. but that is read only access... you cannot insert/replace values using that operator.
Ref: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String
When using bracket notation for character access,
attempting to delete or assign a value to these properties will not succeed.
The properties involved are neither writable nor configurable.
(See Object.defineProperty() for more information.)
You need to extract the values you want to keep from the existing string and build up a new string as noted in other answers...

Well, this's what you're looking for, and this will be the output j*****e#abc.com.
// Masks every character of the part before '#' except its first and last one.
var ans = "";
var S = "johndoe#abc.com"; //example
S = S.toLowerCase();
var parts = S.split("#");
var lastIndex = parts[0].length - 1;
// Strings are immutable, so the masked text is built as a new string.
var first = parts[0]
  .split("")
  .map(function (ch, idx) {
    return idx === 0 || idx === lastIndex ? ch : '*';
  })
  .join("");
ans = first + "#" + parts[1];
console.log(ans);

Here is the code with your approach:
/**
 * Masks the part of an address before the first '#', keeping only its first
 * and last character (e.g. "johndoe#abc.com" -> "j*****e#abc.com").
 *
 * The result is lower-cased. A local part of two characters or fewer has
 * nothing between its first and last character, so it is returned unmasked.
 *
 * @param {string} S - Address using '#' as the separator.
 * @returns {string} The masked, lower-cased address, or "" when S contains no '#'.
 */
var maskPII = function(S) {
  var lowered = S.toLowerCase();
  var sepIndex = lowered.indexOf("#");
  if (sepIndex === -1) {
    return "";
  }
  var local = lowered.slice(0, sepIndex);
  // Keep everything from the first '#' on, including any later '#'.
  var rest = lowered.slice(sepIndex);
  if (local.length <= 2) {
    return local + rest;
  }
  return local[0] + "*".repeat(local.length - 2) + local[local.length - 1] + rest;
};
But if i were you i would use:
var mail = "johndoe#abc.com";
// Number of characters strictly between the first and last one before '#'.
var hiddenCount = mail.split('#')[0].length - 2;
// The lookarounds keep the first character and the one right before '#'.
mail = mail.replace(/(?<=.)(.+?)(?=.#)/gi, '*'.repeat(hiddenCount));
console.log(mail);

You can use the bracket notation on a string (like an array) to get the character at a specific index, but you can't use this to change characters. So first[i] = '*' in your code wont do anything.
Strings in JavaScript are immutable. This means that if you want to change a string, a new string instance will be created. This also means that when you change a string in a for-loop, it can impact performance. (Although in this case the difference won't be noticeable.)
I would use this code:
/**
 * Replaces the characters between the first character and the one right
 * before '#' with asterisks.
 *
 * @param {string} str - Text to mask.
 * @returns {string} The masked text, or `str` unchanged when '#' is missing
 *   or sits at index 2 or lower.
 */
function maskPII(str) {
  const hashPos = str.indexOf('#');
  const hasMiddle = hashPos > 2;
  if (!hasMiddle) {
    return str;
  }
  const head = str[0];
  const stars = '*'.repeat(hashPos - 2);
  // From the last character before '#' to the end of the input.
  const tail = str.substring(hashPos - 1);
  return head + stars + tail;
}
const email = 'johndoe#abc.com';
console.log(email);
console.log(maskPII(email));
It will look for the index of the # sign. If the index is less or equal than 2, (when not found the index will be -1) it will return the original string.
Otherwise it will get the first character, calculate the amount of asterisks needed (index of the # sign -2) and repeat those and then add the rest of the original string.

JS (no lookback) regex for replacing :bound SQL vars without replacing 'colons:in:literals' ...?

Messing around with node-mysql, I wrote some code that lets me use PDO-style :bound values (plus ::bound field names), and rewrites the query with ? and ?? respectively where they are found, and builds a linear array of the values when I execute the statement. I did this because when I look at a SQL statement with a ton of ? ?? all over it and have to count the number of params in my execution, it makes my eyes bleed. I want to just assign a standard object at execution time.
The trouble is, after writing this (it works) I realized my regex for finding those colons in the statement had one tiny little problem, namely, it looks like this:
/.?:(\w+)/g
It picks up the first colon if needed and we take it from there. The problem is, it also picks up colons in literals within the query. So if for some reason you wanted a non-bound string as part of your insert/update, it would be replaced by this engine.
Is there any standard regex for picking up every global instance of the word ":param{#}" in the following statement, without picking up the word "Hello:world", in JS, without lookbacks?
INSERT INTO test VALUES(:param1, :param2, 'Hello:world', :param3);

You're often much better off writing a parser than using regular expressions. It's much more flexible, gives you better error reporting and allows you to handle current & future edge cases much more easily.
The string parsing deals with MySQL string literals syntax & escape sequences described here and just skips over them.
I'm not dealing with valid/invalid binding boundaries, but you could add that if you wanted. You could also remove error reporting such as underterminated string literals and just be forgiving.
The lookahead === ':' && peek() !== '=' condition is to ignore the := MySQL operator.
/**
 * Finds `:name` bindings in a SQL string, skipping anything inside single- or
 * double-quoted string literals and ignoring the `:=` operator.
 *
 * Inside a literal, a backslash skips the following character and a doubled
 * quote character is treated as an escaped quote.
 *
 * @param {string} sql - SQL text to scan.
 * @returns {Array<{start: number, end: number, name: string}>} Bindings in
 *   order of appearance; `start` is the index of the colon and `end` the
 *   index just past the name.
 * @throws {Error} On an unterminated string literal or a colon not followed
 *   by a word character.
 */
const parseBindings = (() => {
  const NAME_CHAR = /\w/;
  const QUOTES = new Set(["'", '"']);

  return function (sql) {
    const found = [];
    let pos = 0;

    // Advances `pos` past the string literal that opens at `pos`.
    const skipStringLiteral = () => {
      const openedAt = pos;
      const quote = sql[pos];
      pos += 1;
      while (sql[pos]) {
        const ch = sql[pos];
        if (ch === '\\') {
          // Backslash escape: skip it together with the escaped character.
          pos += 2;
          continue;
        }
        if (ch === quote) {
          pos += 1;
          // A second quote right after means an escaped quote, not the end.
          if (sql[pos] !== quote) return;
        }
        pos += 1;
      }
      throw new Error(`Underterminated string literal starting at index ${openedAt}.`);
    };

    // Records the binding whose colon is at `pos` and advances past its name.
    const readBinding = () => {
      const colonAt = pos;
      pos += 1;
      while (sql[pos] && NAME_CHAR.test(sql[pos])) {
        pos += 1;
      }
      const name = sql.slice(colonAt + 1, pos);
      if (!name.length) {
        throw new Error(`Invalid binding starting at index ${colonAt}.`);
      }
      found.push({ start: colonAt, end: pos, name: name });
    };

    while (sql[pos]) {
      const ch = sql[pos];
      if (QUOTES.has(ch)) {
        skipStringLiteral();
      } else if (ch === ':' && sql[pos + 1] !== '=') {
        readBinding();
      } else {
        pos += 1;
      }
    }
    return found;
  };
})();
/**
 * Replaces each `:name` binding found by parseBindings with the matching
 * entry of `values`. Bindings without an own entry in `values` are left as is.
 *
 * SECURITY NOTE: values are spliced into the SQL text verbatim, with no
 * quoting or escaping. Do not pass untrusted data; substitute placeholders
 * and let the database driver bind the values instead.
 *
 * @param {Object} values - Map from binding name to replacement text.
 * @param {string} sql - SQL text containing `:name` bindings.
 * @returns {string} The SQL text with the known bindings replaced.
 * @throws {Error} When `values` has a key that is not a binding in `sql`.
 */
function replaceNamedBindings(values, sql) {
  const bindings = parseBindings(sql);
  const bindingNames = new Set(bindings.map((b) => b.name));
  const unknownBinding = Object.keys(values).find((k) => !bindingNames.has(k));
  // Compare against undefined so that an unknown empty-string key is reported too.
  if (unknownBinding !== undefined) {
    throw new Error(`Couldn't find a binding named '${unknownBinding}'.`);
  }
  // Own-property check: `in` would also match inherited names such as
  // "toString" or "constructor" and splice a function's source into the SQL.
  const hasValue = (name) => Object.prototype.hasOwnProperty.call(values, name);
  let lastIndex = 0;
  let newSql = '';
  for (const binding of bindings) {
    if (hasValue(binding.name)) {
      newSql += sql.slice(lastIndex, binding.start) + values[binding.name];
      lastIndex = binding.end;
    }
  }
  newSql += sql.slice(lastIndex);
  return newSql;
}
// Demo input: three bindings plus a string literal that contains a colon.
const sql = `INSERT INTO test VALUES(:param1, :param2, 'Hello:world', :param3);`;
// Replaces all three bindings; the ':world' inside the literal is not a binding.
console.log(replaceNamedBindings({
param1: '(param1 value)',
param2: '(param2 value)',
param3: '(param3 value)'
}, sql));
// Shows the raw binding records found in the statement.
console.log(parseBindings(sql));
// Exercises quoted literals, doubled quotes, a backslash escape and the := operator.
console.log(parseBindings(`:pickup1 ":dontpickup1" ':dontpickup2' := """:dontpickup3" ''':dontpickup4' "\\":dontpickup5" :pickup2`));
// Throws: ':world' only occurs inside a string literal, so 'world' is not a known binding.
console.log(replaceNamedBindings({
world: '(world value)'
}, sql));

How do I parse JSON sprinkled unpredictably into a string?

Suppose that I've got a node.js application that receives input in a weird format: strings with JSON arbitrarily sprinkled into them, like so:
This is a string {"with":"json","in":"it"} followed by more text {"and":{"some":["more","json"]}} and more text
I have a couple guarantees about this input text:
The bits of literal text in between the JSON objects are always free from curly braces.
The top level JSON objects shoved into the text are always object literals, never arrays.
My goal is to split this into an array, with the literal text left alone and the JSON parsed out, like this:
[
"This is a string ",
{"with":"json","in":"it"},
" followed by more text ",
{"and":{"some":["more","json"]}},
" and more text"
]
So far I've written a naive solution that simply counts curly braces to decide where the JSON starts and stops. But this wouldn't work if the JSON contains strings with curly braces in them {"like":"this one } right here"}. I could try to get around that by doing similar quote counting math, but then I also have to account for escaped quotes. At that point it feels like I'm redoing way too much of JSON.parse's job. Is there a better way to solve this problem?

You can check if JSON.parse throws an error to determine if the chunk is a valid JSON object or not. If it throws an error then the unquoted } are unbalanced:
const tests = [
  '{"just":"json }}{}{}{{[]}}}}","x":[1,2,3]}',
  'Just a string',
  'This string has a tricky case: {"like":"this one } right here"}',
  'This string {} has a tiny JSON object in it.',
  '.{}.',
  'This is a string {"with":"json","in":"it"} followed by more text {"and":{"some":["more","json"]}} and more text',
];
tests.forEach( test => console.log( parse_json_interleaved_string( test ) ) );

/**
 * Splits a string into alternating plain-text chunks and JSON object chunks.
 *
 * Every '{' starts a candidate object. The candidate is extended to each
 * following '}' in turn until JSON.parse accepts it, so braces inside JSON
 * strings are handled by the JSON parser itself.
 *
 * @param {string} str - Text with JSON object literals embedded in it.
 * @returns {string[]} The chunks in input order. JSON chunks are returned as
 *   their source text, not parsed. Empty text chunks are omitted, except that
 *   an input with no '{' at all is returned as a single chunk.
 * @throws {Error} When a '{' is never completed into valid JSON.
 */
function parse_json_interleaved_string ( str ) {
  const chunks = [ ];
  // Index of the first character that has not been emitted yet.
  let text_start = 0;
  let json_index = str.indexOf( '{', text_start );
  while ( json_index !== -1 ) {
    // Push the plain string before the JSON
    if ( json_index > text_start )
      chunks.push( str.substring( text_start, json_index ) );
    // Find the end of the JSON
    let json_end_index = str.indexOf( '}', json_index + 1 );
    while ( true ) {
      // Checked before parsing: a -1 end index would otherwise make
      // substring() swap its arguments and test text before the '{'.
      if ( json_end_index === -1 )
        throw new Error( 'Unterminated JSON object in string' );
      try {
        JSON.parse( str.substring( json_index, json_end_index + 1 ) );
        break;
      } catch ( e ) {
        // Not valid yet: the '}' was inside a string or a nested object.
        json_end_index = str.indexOf( '}', json_end_index + 1 );
      }
    }
    // Push JSON
    chunks.push( str.substring( json_index, json_end_index + 1 ) );
    text_start = json_end_index + 1;
    // Resume right after the object so an adjacent '{' is not skipped.
    json_index = str.indexOf( '{', text_start );
  }
  // Push final plain string if any
  if ( chunks.length === 0 || text_start < str.length )
    chunks.push( str.substring( text_start ) );
  return chunks;
}

Here's a comparatively simple brute-force approach: split the whole input string on curly braces, then step through the array in order. Whenever you come across an open brace, find the longest chunk of the array from that starting point that successfully parses as JSON. Rinse and repeat.
This will not work if the input contains invalid JSON and/or unbalanced braces (see the last two test cases below.)
/**
 * Parses `input` as JSON.
 *
 * @param {string} input - Candidate JSON text.
 * @returns {*} The parsed value, or false when `input` is not valid JSON.
 */
const tryJSON = (input) => {
  let parsed = false;
  try {
    parsed = JSON.parse(input);
  } catch (e) {
    parsed = false;
  }
  return parsed;
};

/**
 * Splits `input` on curly braces and rebuilds it as a list of plain strings
 * and parsed JSON values. For every '{' it tries the spans ending at each
 * later '}', starting from the last one, and keeps the first span that parses.
 * A '{' for which no span parses is dropped. The result is also logged.
 *
 * @param {string} input - Text with JSON objects embedded in it.
 * @returns {Array} Strings and parsed objects in input order.
 */
const parse = (input) => {
  const output = [];
  const chunks = input.split(/([{}])/);
  let i = 0;
  while (i < chunks.length) {
    const piece = chunks[i];
    if (piece === '{') {
      // found some possible JSON; start at the last } and backtrack until it works.
      let j = chunks.lastIndexOf('}');
      while (j > i) {
        if (chunks[j] === '}') {
          const candidate = chunks.slice(i, j + 1).join("");
          const parsed = tryJSON(candidate);
          if (parsed) {
            // Keep the parsed value and resume after its closing brace;
            // moving i up to j also ends this inner loop.
            output.push(parsed);
            i = j;
          }
        }
        j -= 1;
      }
    } else if (piece) {
      // neither JSON nor empty
      output.push(piece);
    }
    i += 1;
  }
  console.log(output);
  return output;
};
parse(`{"foo": "bar"}`)
parse(`test{"foo": "b}ar{{[[[{}}}}{}{}}"}`)
parse(`this {"is": "a st}ri{ng"} with {"json": ["in", "i{t"]}`)
parse(`{}`)
parse(`this {"i{s": invalid}`)
parse(`So is {this: "one"}`)

I could try to get around that by doing similar quote counting math, but then I also have to account for escaped quotes. At that point it feels like I'm redoing way too much of JSON.parse's job. Is there a better way to solve this problem?
I don't think so. Your input is pretty far from JSON.
But accounting for all those things isn't that hard.
The following snippet should work:
/**
 * Splits a string into plain-text chunks and parsed JSON objects by tracking
 * brace depth, JSON string state and backslash escapes.
 *
 * Empty text chunks (before a leading object or between adjacent objects)
 * are omitted. A '}' that does not close an open object is kept as text.
 *
 * @param {string} str - Text with JSON object literals embedded in it.
 * @returns {Array} Strings and parsed objects in input order.
 * @throws {SyntaxError} From JSON.parse when a balanced brace span is not valid JSON.
 */
function construct(str) {
  const len = str.length;
  // Index of the first character that has not been emitted yet.
  let chunkStart = 0;
  let bracketLevel = 0;
  let inJsonString = false;
  let lastCharWasEscapeChar = false;
  const result = [];
  for (let i = 0; i < len; ++i) {
    const ch = str[i];
    if (bracketLevel !== 0 && !lastCharWasEscapeChar && ch === '"') {
      // Unescaped quote inside an object: toggles JSON string state.
      inJsonString = !inJsonString;
    } else if (!inJsonString && ch === '{') {
      if (bracketLevel === 0) {
        // Start of a top-level object: flush the text collected so far.
        if (i > chunkStart) {
          result.push(str.substring(chunkStart, i));
        }
        chunkStart = i;
      }
      ++bracketLevel;
    } else if (!inJsonString && ch === '}' && bracketLevel > 0) {
      --bracketLevel;
      if (bracketLevel === 0) {
        result.push(JSON.parse(str.substring(chunkStart, i + 1)));
        chunkStart = i + 1;
      }
    } else if (inJsonString && ch === '\\') {
      // Toggle so that an escaped backslash does not escape what follows it.
      lastCharWasEscapeChar = !lastCharWasEscapeChar;
    } else {
      lastCharWasEscapeChar = false;
    }
  }
  if (chunkStart < len) {
    result.push(str.substring(chunkStart, len));
  }
  return result;
}
// Sample text shown in the textarea when the page loads.
const standardText = 'This is a string {"with":"json","in":"it"} followed by more text {"and":{"some":["more","json"]}} and more text. {"foo": "bar}"}'
const inputTA = document.getElementById('input')
const outputDiv = document.getElementById('output')

// Re-parses the textarea content and shows the result as indented JSON.
function updateOutput() {
  const parsedChunks = construct(inputTA.value)
  const pretty = JSON.stringify(parsedChunks, null, 2)
  outputDiv.innerText = pretty
}

// Refresh on every edit, and once for the initial sample text.
inputTA.oninput = updateOutput
inputTA.value = standardText
updateOutput()
<textarea id="input" rows="5" cols="50"></textarea>
<pre id="output"></pre>

You can use RegExp /(\s(?=[{]))|\s(?=[\w\s]+[{])/ig to .split() space character followed by opening curly brace { or space character followed by one or more word or space characters followed by opening curly brace, .filter() to remove undefined values from resulting array, create a new array, then while the resulting split array has .length get the index where the value contains only space characters, .splice() the beginning of the matched array to the index plus 1, if array .length is 0 .push() empty string '' else space character ' ' with match .join()ed by space character ' ' .replace() last space character and .shift() matched array, which is JSON, then next element of the matched array.
const str = `This is a string {"with":"json","in":"it"} followed by more text {"and":{"some":["more","json"]}} and more text {"like":"this one } right here"}`;

/**
 * Splits `s` at whitespace that precedes a '{' (directly, or after a run of
 * word/space characters) and regroups the pieces into pairs of
 * [text before, piece starting at the brace].
 *
 * @param {string} s - Text with JSON objects embedded in it.
 * @returns {string[]} Alternating text and brace-led pieces, all as strings.
 */
const formatStringContainingJSON = (s) => {
  const splitter = /(\s(?=[{]))|\s(?=[\w\s]+[{])/ig;
  const isBlank = (piece) => /^\s+$/.test(piece);
  // filter(Boolean) removes the undefined entries produced by the
  // non-capturing alternative, as well as empty strings.
  const pending = s.split(splitter).filter(Boolean);
  const res = [];
  while (pending.length) {
    // The whitespace-only piece marks the end of the current text run.
    const blankAt = pending.findIndex(isBlank);
    const textParts = pending.splice(0, blankAt + 1);
    const prefix = res.length ? ' ' : '';
    const text = textParts.join(' ').replace(/\s$/, '');
    const braceLed = pending.shift();
    res.push(`${prefix}${text}`, `${braceLed}`);
  }
  return res;
};
let result = formatStringContainingJSON(str);
console.log(result);

Here is one approach that iterates char by char. First we create an array from the input and then use reduce() on it. When we detect an opening curly bracket { we push the current accumulated chunk onto an array of detected results, and then we set a flag on the accumulator object we are using in reduce. While this flag is set to true we try to parse the chunk as JSON, and only on success do we put the chunk representing the JSON onto the array of detected results and set the flag back to false.
The accumulator of the reduce() method will hold next data:
res: an array with detected results: strings or jsons.
chunk: a string representing the current accumulated chunk of chars.
isJson: a boolean indicating if the current chunk is json or not.
const input = 'This is a string {"with":"json", "in":"it"} followed by more text {"and":{"some":["more","json","data"]}} and more text';
// Scanner state: finished pieces, whether a JSON candidate is being
// collected, and the text accumulated for the current piece.
const scan = {res: [], isJson: false, chunk: ""};
for (const curr of Array.from(input)) {
  if (curr === "{") {
    // An opening brace outside JSON ends the text piece; inside JSON it is
    // simply part of the candidate.
    if (!scan.isJson) scan.res.push(scan.chunk);
    scan.chunk = scan.isJson ? scan.chunk + curr : curr;
    scan.isJson = true;
    continue;
  }
  scan.chunk += curr;
  if (!scan.isJson) continue;
  try {
    JSON.parse(scan.chunk);
  } catch (e) {
    // Candidate is not complete JSON yet; keep collecting characters.
    continue;
  }
  // If no error, we found a JSON.
  scan.res.push(scan.chunk);
  scan.chunk = "";
  scan.isJson = false;
}
let obj = scan;
// First stage done, lets debug obtained data.
obj.res.push(obj.chunk);
console.log(obj.res);
// Finally, we map the pieces.
let res = obj.res.map((piece) => (piece.match("{") ? JSON.parse(piece) : piece));
console.log(res);

Obligatory answer: this is an improper format (because of this complication, and the guarantee is a security hole if the parser is improperly designed); it should ideally be redesigned. (Sorry, it had to be said.)
Barring that, you can generate a parser using your favorite parser generator that outputs to javascript as a target language. It might even have a demo grammar for JSON.
However, the glaring security issue is incredibly scary (if any JSON gets past the 'guarantee', suddenly it's a vector). An array interspersed representation seems nicer, with the constraint that assert(text.length == markup.length+1):
'{
"text": ['Hello', 'this is red text!'],
"markup": [{"text":"everyone", "color":"red"}]
}'
or even nicer:
'[
{"type":"text", "text":"Hello"},
{"type":"markup", "text":"everyone", "color":"red"} # or ,"val":{"text":.., "color":..}}
{"type":"text", "text":"this is red text!"},
...
]'
Store compressed ideally. Unserialize without any worries with JSON.parse.

encodeURIComponent throws an exception

I am programmatically building a URI with the help of the encodeURIComponent function using user provided input. However, when the user enters invalid unicode characters (such as U+DFFF), the function throws an exception with the following message:
The URI to be encoded contains an invalid character
I looked this up on MSDN, but that didn't tell me anything I didn't already know.
To correct this error
Ensure the string to be encoded contains only valid Unicode sequences.
My question is, is there a way to sanitize the user provided input to remove all invalid Unicode sequences before I pass it on to the encodeURIComponent function?

Taking the programmatic approach to discover the answer, the only range that turned up any problems was \ud800-\udfff, the range for high and low surrogates:
// Probes every UTF-16 code unit with encodeURIComponent and builds the source
// of a character-class regex covering the ones that make it throw.
var regex = '/[';
// Bounds of the run of consecutive failing code units currently being collected.
var firstI = null;
var lastI = null;
for (var i = 0; i <= 65535; i++) {
  var encodable = true;
  try {
    encodeURIComponent(String.fromCharCode(i));
  }
  catch(e) {
    encodable = false;
  }
  if (encodable) {
    continue;
  }
  if (firstI === null) {
    // First failing code unit: open the first run.
    firstI = i;
    lastI = i;
    continue;
  }
  if (i === lastI + 1) {
    // Directly follows the current run: extend it.
    lastI++;
    continue;
  }
  // Gap found: write out the finished run and open a new one at i.
  if (firstI === lastI) {
    regex += '\\u' + firstI.toString(16);
  }
  else {
    regex += '\\u' + firstI.toString(16) + '-' + '\\u' + lastI.toString(16);
  }
  firstI = lastI = i;
}
// Write out the run that was still open when the loop ended.
if (firstI === lastI) {
  regex += '\\u' + firstI.toString(16);
}
else {
  regex += '\\u' + firstI.toString(16) + '-' + '\\u' + lastI.toString(16);
}
regex += ']/';
alert(regex); // /[\ud800-\udfff]/
I then confirmed this with a simpler example:
// Confirms that every code unit outside the surrogate range encodes cleanly.
for (var i = 0; i <= 65535; i++) {
  // Skip the surrogates with `continue`. Putting this test in the loop
  // condition would end the loop at 0xD800 and leave 0xE000-0xFFFF unchecked.
  if (i >= 0xD800 && i <= 0xDFFF) {
    continue;
  }
  try {
    encodeURIComponent(String.fromCharCode(i));
  }
  catch(e) {
    alert(e); // Doesn't alert
  }
}
alert('ok!');
And this fits with what MSDN says because indeed all those Unicode characters (even valid Unicode "non-characters") besides surrogates are all valid Unicode sequences.
You can indeed filter out high and low surrogates, but when used in a high-low pair, they become legitimate (as they are meant to be used in this way to allow for Unicode to expand (drastically) beyond its original maximum number of characters):
alert(encodeURIComponent('\uD800\uDC00')); // ok
alert(encodeURIComponent('\uD800')); // not ok
alert(encodeURIComponent('\uDC00')); // not ok either
So, if you want to take the easy route and block surrogates, it is just a matter of:
urlPart = urlPart.replace(/[\ud800-\udfff]/g, '');
If you want to strip out unmatched (invalid) surrogates while allowing surrogate pairs (which are legitimate sequences but the characters are rarely ever needed), you can do the following:
/**
 * Removes surrogate code units that are not part of a high-low pair.
 *
 * @param {string} str - Text that may contain lone surrogates.
 * @returns {string} The text with unpaired surrogates removed; valid pairs are kept.
 */
function stripUnmatchedSurrogates (str) {
  // Reverses the string by UTF-16 code unit.
  var reverse = function (text) {
    return text.split('').reverse().join('');
  };
  // Pass 1: drop high surrogates that are not followed by a low surrogate.
  var withoutLoneHighs = str.replace(/[\uD800-\uDBFF](?![\uDC00-\uDFFF])/g, '');
  // Pass 2: in the reversed text "not followed by a high surrogate" means
  // "not preceded by one", so a lookahead can do the job of a lookbehind.
  var cleanedReversed = reverse(withoutLoneHighs).replace(/[\uDC00-\uDFFF](?![\uD800-\uDBFF])/g, '');
  return reverse(cleanedReversed);
}
var urlPart = '\uD801 \uD801\uDC00 \uDC01'
alert(stripUnmatchedSurrogates(urlPart)); // Leaves one valid sequence (representing a single non-BMP character)
If JavaScript had negative lookbehind the function would be a lot less ugly...

JavaScript CSV Parser Library [closed]

Closed. This question is off-topic. It is not currently accepting answers.
Want to improve this question? Update the question so it's on-topic for Stack Overflow.
Closed 9 years ago.
Improve this question
Is there a decent CSV Parser library for JavaScript? I've used this and that solution so far. In the first solution a new line is never created as a new sub-array, also the code tells so and the second solution does not work on text files formatted in Windows with <CR><LF> , respectively \r\n
Is it sufficient to apply
text = text.replace("\r","");
to the Windows CSV files? This actually works, but I think it is a bit of a hack. Are there CSV parsers that are more widely used than a random blogger's solution?

Here's the 'easy' solution
csv.split(/\r\n|\r|\n/g)
It handles:
\n
\r
\r\n
\n\r
Unfortunately, it breaks on values that contain newline chars between delimiters.
For example, the following line entry...
"this is some","valid CSV data","with a \r\nnewline char"
Will break it because the '\r\n' will be mistakenly interpreted as the end of an entry.
For a complete solution, your best bet is to create a ND-FSM (Non-Deterministic Finite State Machine) lexer/parser. If you have ever heard of the Chomsky Hierarchy, CSV can be parsed as a Type III grammar. That means char-by-char or token-by-token processing with state tracking.
I have a fully RFC 4180 compliant client-side library available but somehow I attracted the attention of a delete-happy mod for external linking. There's a link in my profile if you're interested; otherwise, good luck.
I'll give you fair warning from experience, CSV looks deceptively easy on the surface. After studying tens/hundreds of implementations, I have only seen 3 javascript parsers that did a reasonable job of meeting the spec and none of them were completely RFC compliant. I managed to write one but only with the help of the community and lots and lots of pain.

If you're working in Node, there's an excellent CSV parser that can handle extremely large amounts of data (>GB files) and supports escape characters.
If you're working in browser JS, you could still extract the processing logic from the code so that it operates on a string (instead of a Node Stream).

Here is one way to do it:
// based on json_parse from JavaScript The Good Part by D. Crockford
/**
 * Parses one CSV record into an array of values.
 *
 * Unquoted values that start with a digit or '-' are converted to numbers
 * when they are numeric; everything else is returned as a string. Quoted
 * values may contain commas, line breaks and doubled quotes (""). The record
 * may end with "\n", "\r\n" or the end of the input.
 *
 * @param {string} _line - Text of a single CSV record.
 * @returns {Array<string|number>} The values of the record.
 * @throws {SyntaxError} With extra `at` (position) and `text` (input)
 *   properties when a separator is missing or text follows the record.
 */
var csv_parse = function () {
  var at,
    ch,
    text,
    // Throws a real Error (so a stack trace is available) carrying the same
    // name/message/at/text properties callers can inspect.
    error = function (m) {
      var err = new SyntaxError(m);
      err.at = at;
      err.text = text;
      throw err;
    },
    // Advances to the next character; with `c`, first checks the current one.
    next = function (c) {
      if (c && c !== ch) {
        error("Expected '" + c + "' instead of '" + ch + "'");
      }
      ch = text.charAt(at);
      at += 1;
      return ch;
    },
    //needed to handle "" which indicates escaped quote
    peek = function () {
      return text.charAt(at);
    },
    // Skips whitespace and control characters, but never the record-ending "\n".
    white = function () {
      while (ch && ch <= ' ' && ch !== '\n') {
        next();
      }
    },
    // if numeric, then return number
    number = function () {
      var string = word();
      var parsed = +string;
      return isNaN(parsed) ? string : parsed;
    },
    // Reads an unquoted value up to the next ',' or "\n" or the end of input.
    word = function () {
      var string = '';
      // `ch` is '' at the end of input; without this check the loop never ends.
      while (ch && ch !== ',' && ch !== '\n') {
        string += ch;
        next();
      }
      // Drop the CR of a CRLF line ending from the last value of the record.
      if (ch === '\n' && string.charAt(string.length - 1) === '\r') {
        string = string.slice(0, -1);
      }
      return string;
    },
    // the matching " is the end of word not ,
    // need to worry about "", which is escaped quote
    quoted = function () {
      var string = '';
      if (ch === '"') {
        while (next()) {
          if (ch === '"') {
            // need to know ending quote or escaped quote ("")
            if (peek() === '"') {
              // Escaped quote: step onto the second quote and keep one.
              next('"');
              string += ch;
            } else {
              next('"');
              return string;
            }
          } else {
            string += ch;
          }
        }
        // End of input before the closing quote: return what was read.
        return string;
      }
      error("Bad string");
    },
    value = function () {
      white();
      switch (ch) {
      case '-':
        return number();
      case '"':
        return quoted();
      default:
        return ch >= '0' && ch <= '9' ? number() : word();
      }
    },
    line = function () {
      var array = [];
      white();
      if (ch === '\n') {
        next('\n');
        return array;//empty []
      }
      if (!ch) {
        return array;// empty input
      }
      while (true) {
        array.push( value() );
        white();
        if (ch === '\n') {
          next('\n');
          return array;//got something
        }
        if (!ch) {
          return array;// record ended at the end of input
        }
        next(',');// not very liberal with delimiter
        white();
      }
    };
  return function (_line) {
    var result;
    text = _line;
    at = 0;
    ch = ' ';
    result = line();
    white();
    // Anything left after the first record is rejected.
    if (ch) {
      error("Syntax error");
    }
    return result;
  };
}();

My function is solid, just drop in and use, I hope it is of help to you.
csvToArray v1.3
A compact (508 bytes) but compliant function to convert a CSV string into a 2D array, conforming to the RFC4180 standard.
http://code.google.com/p/csv-to-array/
Common Usage: jQuery
$.ajax({
url: "test.csv",
dataType: 'text',
cache: false
}).done(function(csvAsString){
csvAsArray=csvAsString.csvToArray();
});
Common usage: Javascript
csvAsArray = csvAsString.csvToArray();
Override field separator
csvAsArray = csvAsString.csvToArray("|");
Override record separator
csvAsArray = csvAsString.csvToArray("", "#");
Override Skip Header
csvAsArray = csvAsString.csvToArray("", "", 1);
Override all
csvAsArray = csvAsString.csvToArray("|", "#", 1);

Develop Reference

JavaScript is the programming language of the Web.

Split a CSV string by line skipping newlines contained between quotes - javascript

Be careful- That newline is PART of that value. It's not PCB, it's P\nCB. However, why can't you just use string.split(',')? If need be, you can run through the list and cast to ints or remove the padded quotation marks.

Related

How do I mask an email address between the first and the last character before the # sign?

JS (no lookback) regex for replacing :bound SQL vars without replacing 'colons:in:literals' ...?

How do I parse JSON sprinkled unpredictably into a string?

encodeURIComponent throws an exception

JavaScript CSV Parser Library [closed]

Categories

Resources