Suppose that I've got a node.js application that receives input in a weird format: strings with JSON arbitrarily sprinkled into them, like so:
This is a string {"with":"json","in":"it"} followed by more text {"and":{"some":["more","json"]}} and more text
I have a couple guarantees about this input text:
The bits of literal text in between the JSON objects are always free from curly braces.
The top level JSON objects shoved into the text are always object literals, never arrays.
My goal is to split this into an array, with the literal text left alone and the JSON parsed out, like this:
[
"This is a string ",
{"with":"json","in":"it"},
" followed by more text ",
{"and":{"some":["more","json"]}},
" and more text"
]
So far I've written a naive solution that simply counts curly braces to decide where the JSON starts and stops. But this wouldn't work if the JSON contains strings with curly braces in them {"like":"this one } right here"}. I could try to get around that by doing similar quote counting math, but then I also have to account for escaped quotes. At that point it feels like I'm redoing way too much of JSON.parse's job. Is there a better way to solve this problem?
You can check if JSON.parse throws an error to determine if the chunk is a valid JSON object or not. If it throws an error then the unquoted } are unbalanced:
// Exercise the parser with pure JSON, plain text, JSON whose string values
// contain braces, and mixed text/JSON inputs.
const tests = [
  '{"just":"json }}{}{}{{[]}}}}","x":[1,2,3]}',
  'Just a string',
  'This string has a tricky case: {"like":"this one } right here"}',
  'This string {} has a tiny JSON object in it.',
  '.{}.',
  'This is a string {"with":"json","in":"it"} followed by more text {"and":{"some":["more","json"]}} and more text',
];
for (const test of tests) {
  console.log(parse_json_interleaved_string(test));
}
/**
 * Splits a string containing embedded JSON objects into an array of
 * alternating plain-text chunks and JSON chunks (the JSON is returned as the
 * raw substring, exactly as found). Candidate end positions are validated
 * with JSON.parse, so '}' characters inside JSON string values are handled.
 * Throws if a '{' is never closed by a parseable object.
 *
 * Fix over the original: `last_json_end_index` now records the index of the
 * closing '}' itself. The original stored `json_end_index + 1` and then
 * searched from `last_json_end_index + 1`, skipping one character — a '{'
 * immediately following a '}' (e.g. '{}{}') was missed and leaked into the
 * plain-text output.
 */
function parse_json_interleaved_string ( str ) {
  const chunks = [ ];
  // Index of the '}' that closed the previous JSON object; -1 = none yet.
  let last_json_end_index = -1;
  let json_index = str.indexOf( '{', last_json_end_index + 1 );
  for ( ; json_index !== -1; json_index = str.indexOf( '{', last_json_end_index + 1 ) ) {
    // Push the plain string before the JSON (skip when the objects are adjacent)
    if ( json_index !== last_json_end_index + 1 )
      chunks.push( str.substring( last_json_end_index + 1, json_index ) );
    // Find the end of the JSON: try each '}' in turn until the slice parses.
    let json_end_index = str.indexOf( '}', json_index + 1 );
    while ( true ) {
      try {
        JSON.parse( str.substring( json_index, json_end_index + 1 ) );
        break;
      } catch ( e ) {
        json_end_index = str.indexOf( '}', json_end_index + 1 );
        if ( json_end_index === -1 )
          throw new Error( 'Unterminated JSON object in string' );
      }
    }
    // Push JSON (as the raw substring, matching the original behavior)
    chunks.push( str.substring( json_index, json_end_index + 1 ) );
    last_json_end_index = json_end_index;
  }
  // Push final plain string if any
  if ( last_json_end_index === -1 )
    chunks.push( str );
  else if ( str.length !== last_json_end_index + 1 )
    chunks.push( str.substring( last_json_end_index + 1 ) );
  return chunks;
}
Here's a comparatively simple brute-force approach: split the whole input string on curly braces, then step through the array in order. Whenever you come across an open brace, find the longest chunk of the array from that starting point that successfully parses as JSON. Rinse and repeat.
This will not work if the input contains invalid JSON and/or unbalanced braces (see the last two test cases below.)
// Try to parse `input` as JSON; return the parsed value on success and
// `false` on failure. Callers only hand this brace-delimited chunks, which
// parse to objects (always truthy), so a `false` return is unambiguous here.
const tryJSON = (input) => {
  try {
    return JSON.parse(input);
  } catch (e) {
    return false;
  }
};
// Split the input on curly braces (keeping each brace as its own token),
// then walk the tokens. At every '{', find the longest token run that parses
// as JSON; all other non-empty tokens pass through as plain text.
const parse = (input) => {
  const output = [];
  const chunks = input.split(/([{}])/);
  for (let i = 0; i < chunks.length; i++) {
    if (chunks[i] !== '{') {
      // Not a JSON start: keep the token unless it's an empty split artifact.
      if (chunks[i]) output.push(chunks[i]);
      continue;
    }
    // Candidate JSON: start from the last '}' and backtrack until a joined
    // span parses successfully.
    for (let j = chunks.lastIndexOf('}'); j > i; j--) {
      if (chunks[j] !== '}') continue;
      const parsed = tryJSON(chunks.slice(i, j + 1).join(""));
      if (parsed) {
        // Success: keep the parsed value and jump past the consumed tokens.
        output.push(parsed);
        i = j;
      }
    }
  }
  console.log(output);
  return output;
};
// Demo runs — each call logs the chunk array produced by parse().
// Pure JSON input.
parse(`{"foo": "bar"}`)
// Braces inside a JSON string value must not fool the splitter.
parse(`test{"foo": "b}ar{{[[[{}}}}{}{}}"}`)
// Mixed text and JSON, with braces inside strings and inside arrays.
parse(`this {"is": "a st}ri{ng"} with {"json": ["in", "i{t"]}`)
// Smallest possible JSON object.
parse(`{}`)
// Invalid JSON / unbalanced quoting: the documented failure cases — the
// output for these two inputs is not meaningful.
parse(`this {"i{s": invalid}`)
parse(`So is {this: "one"}`)
I could try to get around that by doing similar quote counting math, but then I also have to account for escaped quotes. At that point it feels like I'm redoing way too much of JSON.parse's job. Is there a better way to solve this problem?
I don't think so. Your input is pretty far from JSON.
But accounting for all those things isn't that hard.
The following snippet should work:
/**
 * Splits `str` into an array of alternating plain-text chunks and *parsed*
 * JSON objects. Brace depth is tracked manually; quote and backslash
 * tracking ensures '{'/'}' inside JSON string values don't affect the depth.
 *
 * Fix over the original: empty text chunks are no longer emitted when the
 * string starts with '{' or when two JSON objects are adjacent (the original
 * unconditionally pushed the — possibly empty — preceding substring).
 */
function construct(str) {
  const len = str.length
  let lastSavedIndex = -1           // index of the last char already emitted
  let bracketLevel = 0              // current {...} nesting depth
  let inJsonString = false          // inside a double-quoted JSON string?
  let lastCharWasEscapeChar = false // previous char was an active backslash?
  let result = []
  for (let i = 0; i < len; ++i) {
    if (bracketLevel !== 0 && !lastCharWasEscapeChar && str[i] === '"') {
      inJsonString = !inJsonString
    }
    else if (!inJsonString && str[i] === '{') {
      if (bracketLevel === 0) {
        // Emit the plain text before this JSON object — but only if there
        // is any (skip empty chunks for leading/adjacent objects).
        if (i > lastSavedIndex + 1) {
          result.push(str.substring(lastSavedIndex + 1, i))
        }
        lastSavedIndex = i - 1
      }
      ++bracketLevel
    }
    else if (!inJsonString && str[i] === '}') {
      --bracketLevel
      if (bracketLevel === 0) {
        result.push(JSON.parse(str.substring(lastSavedIndex + 1, i + 1)))
        lastSavedIndex = i
      }
    }
    else if (inJsonString && str[i] === '\\') {
      // Toggle, so a doubled backslash ("\\\\") does not leave us escaped.
      lastCharWasEscapeChar = !lastCharWasEscapeChar
    }
    else {
      lastCharWasEscapeChar = false
    }
  }
  if (lastSavedIndex !== len - 1) {
    result.push(str.substring(lastSavedIndex + 1, len))
  }
  return result
}
// Demo wiring: re-run construct() over the textarea contents on every edit
// and pretty-print the resulting array into the output element.
const standardText = 'This is a string {"with":"json","in":"it"} followed by more text {"and":{"some":["more","json"]}} and more text. {"foo": "bar}"}'
const inputTA = document.getElementById('input')
const outputDiv = document.getElementById('output')
const updateOutput = () => {
  const parsed = construct(inputTA.value)
  outputDiv.innerText = JSON.stringify(parsed, null, 2)
}
inputTA.oninput = updateOutput
inputTA.value = standardText
updateOutput()
<textarea id="input" rows="5" cols="50"></textarea>
<pre id="output"></pre>
You can use the RegExp /(\s(?=[{]))|\s(?=[\w\s]+[{])/ig with .split() to split at a space character that is followed by an opening curly brace {, or at a space character followed by one or more word or space characters and then an opening curly brace. Then use .filter() to remove undefined values from the resulting array and create a new array. While the split array still has .length, find the index of the element containing only space characters, .splice() the beginning of the array up to that index plus 1, .join() the matched elements with a space character ' ' and .replace() the trailing space character — prefixing an empty string '' if the result array is empty, or a space character ' ' otherwise — then .push() that string followed by the .shift()ed next element of the array, which is the JSON.
// Sample input: text with several embedded JSON objects, including one whose
// string value contains an unbalanced '}'.
const str = `This is a string {"with":"json","in":"it"} followed by more text {"and":{"some":["more","json"]}} and more text {"like":"this one } right here"}`;
// Splits on whitespace that precedes a '{' (directly, or after a run of
// word/space characters), then re-assembles the text fragments and pulls out
// the JSON chunks with splice/shift.
// NOTE(review): relies on every JSON object being preceded by a space and on
// the lookahead regex matching the input's exact shape — verify edge cases
// such as JSON at the start of the string or adjacent objects.
const formatStringContainingJSON = s => {
const r = /(\s(?=[{]))|\s(?=[\w\s]+[{])/ig;
// Split and drop the empty/undefined entries produced by the capture groups.
const matches = s.split(r).filter(Boolean);
const res = [];
while (matches.length) {
// The whitespace-only element marks the boundary just before a JSON chunk.
const index = matches.findIndex(s => /^\s+$/.test(s));
// Take all text fragments up to (and including) that boundary...
const match = matches.splice(0, index + 1);
res.push(
// ...rejoin them (prefixing a space except for the very first piece)...
`${!res.length ? '' : ' '}${match.join(' ').replace(/\s$/, '')}`
// ...then the next element, which is the JSON chunk itself.
, `${matches.shift()}`
);
};
return res;
}
let result = formatStringContainingJSON(str);
console.log(result);
Here you have one approach that iterates char by char. First we create an array from the input and then use reduce() on it. When we detect an opening curly bracket { we push the current accumulated chunk onto an array of detected results, and then we set a flag on the accumulator object we are using in reduce. While this flag is set to true we try to parse a JSON object, and only on success do we put the chunk representing the JSON on the array of detected results and set the flag back to false.
The accumulator of the reduce() method will hold next data:
res: an array with detected results: strings or jsons.
chunk: a string representing the current accumulated chunk of chars.
isJson: a boolean indicating if the current chunk is json or not.
const input = 'This is a string {"with":"json", "in":"it"} followed by more text {"and":{"some":["more","json","data"]}} and more text';
// Scan char-by-char with reduce: accumulate plain text until a '{' is seen,
// then grow a candidate JSON chunk until JSON.parse accepts it.
// Accumulator fields: parts (detected strings/JSON chunks), inJson (are we
// inside a candidate JSON chunk?), buffer (chars accumulated so far).
let obj = Array.from(input).reduce((state, ch) =>
{
  if (ch === "{")
  {
    // Entering (or continuing) a JSON candidate; flush plain text first.
    if (!state.inJson) state.parts.push(state.buffer);
    state.buffer = state.inJson ? state.buffer + ch : ch;
    state.inJson = true;
  }
  else if (state.inJson)
  {
    state.buffer += ch;
    try
    {
      JSON.parse(state.buffer);
      // Parse succeeded: the candidate is a complete JSON object.
      state.parts.push(state.buffer);
      state.buffer = "";
      state.inJson = false;
    }
    catch (e) {/* Not valid JSON yet — keep accumulating. */}
  }
  else
  {
    state.buffer += ch;
  }
  return state;
}, {parts: [], inJson: false, buffer: ""})
// First stage done, lets debug obtained data.
obj.parts.push(obj.buffer);
console.log(obj.parts);
// Finally, map the pieces: JSON-looking chunks get parsed, text stays as-is.
let res = obj.parts.map(x => x.match("{") ? JSON.parse(x) : x);
console.log(res);
Obligatory answer: this is an improper format (because of this complication, and the guarantee is a security hole if the parser is improperly designed); it should ideally be redesigned. (Sorry, it had to be said.)
Barring that, you can generate a parser using your favorite parser generator that outputs to javascript as a target language. It might even have a demo grammar for JSON.
However, the glaring security issue is incredibly scary (if any JSON gets past the 'guarantee', suddenly it's a vector). An array interspersed representation seems nicer, with the constraint that assert(text.length == markup.length+1):
'{
"text": ['Hello', 'this is red text!'],
"markup": [{"text":"everyone", "color":"red"}]
}'
or even nicer:
'[
{"type":"text", "text":"Hello"},
{"type":"markup", "text":"everyone", "color":"red"} # or ,"val":{"text":.., "color":..}}
{"type":"text", "text":"this is red text!"},
...
]'
Store compressed ideally. Unserialize without any worries with JSON.parse.
Related
I have the following problem statement:
Write a function, uncompress, that takes in a string as an argument.
The input string will be formatted into multiple groups according to
the following pattern:
number + char
for example, '2c' or '3a'.
The function should return an uncompressed version of the string where
each 'char' of a group is repeated 'number' times consecutively. You
may assume that the input string is well-formed according to the
previously mentioned pattern.
test_00: uncompress("2c3a1t"); // -> 'ccaaat'
Here is my code which is using a stack. The problem is that it's only returning 'cc' and I can't figure out why. I've console logged what goes into the IF ELSE and I'm hitting both so I don't understand why nothing gets pushed to the stack.
Would really appreciate the help if someone can spot what I'm missing.
/**
 * Decompresses strings like "2c3a1t" -> "ccaaat": each group is a decimal
 * count followed by a single character, repeated that many times.
 *
 * Fixes over the original: the digit set now includes '1' and '0' (the
 * original's `nums = '23456789'` made '1' fall into the character branch,
 * corrupting the stack and yielding only 'cc'), and multi-digit counts such
 * as "27b" are accumulated instead of being pushed digit-by-digit.
 */
const uncompress = (s) => {
  const digits = '0123456789';
  const pieces = [];
  let count = ''; // accumulates the (possibly multi-digit) run length
  for (let char of s) {
    if (digits.includes(char)) {
      count += char;
    } else {
      pieces.push(char.repeat(Number(count)));
      count = '';
    }
  }
  return pieces.join('');
};
console.log(uncompress("2c3a1t")); // -> 'ccaaat'
Here's how I would do it:
Split the string up into pairs of numbers and chars:
str.match(/\d+[a-zA-Z]/g)
And reduce that array to a string, while taking each value from the array, getting the char from it (cv.match(/[a-zA-Z]/)[0]) and repeating it according to the number (.repeat(parseInt(cv)))
// Pair each run of digits with the letter that follows it via match(), then
// fold the pairs into the output, repeating each letter `count` times
// (parseInt reads the leading digits of the pair).
const uncompress = str => str.match(/\d+[a-zA-Z]/g).reduce((result, group) => {
  const letter = group.match(/[a-zA-Z]/)[0];
  return result + letter.repeat(parseInt(group));
}, "");
console.log(uncompress("2c3a1t"))
console.log(uncompress("27b1d8g"))
And just like that I was able to write the code which passed the test case:
// NOTE(review): this snippet is an excerpt — the enclosing function header
// (`const uncompress = (s) => {`) was omitted from the paste, so it is not
// runnable exactly as shown.
// Unlike the question's version, '1' is included here, which is why "1t" now
// works. NOTE(review): '0' is still missing, so a count containing a zero
// (e.g. "10a") would not be recognized — confirm inputs never contain 0.
const nums = '123456789';
const stack = [];
for (let char of s) {
if (nums.includes(char)) {
// Digits are pushed individually; consecutive digits pile up on the stack.
stack.push(Number(char));
} else {
// A letter: pop the pending digits back off to rebuild the count.
let num = '';
// NOTE(review): popping reverses the digit order — "27b" rebuilds the count
// as "72", not "27". Verify with multi-digit test cases.
while (nums.includes(stack[stack.length - 1])) {
num += stack.pop();
}
// String#repeat coerces the string count to a number.
stack.push(char.repeat(num));
};
};
return stack.join('');
};
I'm currently stuck trying to convert a string into JSON in JavaScript.
The string I'm getting from the server is:
"{knee=true, centered=true}"
The outcome I'm looking for is something like this:
{ knee: true, centered: true}
But since the string uses equals signs and the quotes are missing, JSON.parse isn't working. I don't know how to solve this; any help will be appreciated. Thank you!
The best I could do was this ... It returns value of object in strings though it seems to work perfect ! ( Actually this one challenged me so I had to do it ) :-)
let str = "{knee = true, centered = true}";
// Strip the braces, split into "key = value" pairs, and build the object.
// Values stay as strings (e.g. "true"), mirroring the original approach.
// Fixes over the original: keys and values are trimmed — the original
// produced keys like "knee " / " centered" and values like " true" — and the
// delete/Object.assign index-key juggling is gone.
const pairs = str.replaceAll('{', '').replaceAll('}', '').split(',');
const parsed = {};
for (const pair of pairs) {
  const [key, val] = pair.split('=');
  if (val === undefined) continue; // skip malformed pairs without '='
  parsed[key.trim()] = val.trim();
}
str = parsed;
console.log(str)
Assuming you don't have nested things or strings with commas or brackets in them, you could replace all { with {", = with ":, and , with , ":
const str = "{knee=true, centered=true}"
// Rewrite the not-quite-JSON into real JSON: open a quote after '{', turn
// '=' into '":' (closing the key's quote), and open a quote after each
// ", " so the next key is quoted too — then parse the result.
console.log(
  JSON.parse(
    str
      .replaceAll('{', '{"')
      .replaceAll('=', '":')
      .replaceAll(', ', ', "')
  )
)
Without more specifics it's impossible to verify how correct this is, but if I was to make some assumptions:
An object is a set of key/value pairs surrounded by { and }
Key/value pairs are separated by ,
Any arbitrary whitespace is allowed around key/value pairs
A key and value are separated by a =
Values can only hold the value true or false which should be translated to a JavaScript boolean
...then parsing can be done through some regular expressions and string manipulations.
// Matches a whole "{...}" block and captures the inner key/value text.
const objectRegExp = /^\{(.*)}$/;

/**
 * Parses a "not-JSON" object literal of the form "{key=true, key2=false}"
 * into a plain object with boolean values.
 * Throws if the input is not wrapped in braces, or if any value is not the
 * literal text "true" or "false".
 *
 * Fix over the original: "{}" (or a whitespace-only body) now returns {} —
 * previously splitting the empty body yielded [''], leaving valueStr
 * undefined and crashing on valueStr.trim().
 */
function parseNJson(str) { // notJSON
  const match = objectRegExp.exec(str);
  if (!match) {
    throw new Error('This is not NJson');
  }
  const [, keyValuesBlock] = match;
  if (keyValuesBlock.trim() === '') {
    return {}; // empty object: no key/value pairs to parse
  }
  const keyValueStatements = keyValuesBlock.split(',');
  const keyValues = keyValueStatements.map(statement => statement.split('='));
  return keyValues.reduce((result, [keyStr, valueStr]) => {
    const key = keyStr.trim();
    const trimmedValue = valueStr.trim();
    let value;
    if (trimmedValue === 'true') {
      value = true;
    } else if (trimmedValue === 'false') {
      value = false;
    } else {
      throw new Error(`Unsupported value ${trimmedValue}`);
    }
    return Object.assign(result, { [key]: value });
  }, {});
}
This will easily fall apart if any assumptions were incorrect, like "what if values can be strings? What if strings can be quoted with double quotes? What if they can also be surrounded by single quotes? What if numbers are supported? What if hexadecimal numbers are supported?"
If the data being sent on the server is a standard format, they should be able to tell you "this was formatted as X" so you can find a spec-compliant X parser. Or you could insist data is sent as JSON instead, since that's a super common exchange format. The best thing is that the server and client are using a common, well-defined message formatting spec so you don't accidentally break things whenever receiving or sending data that has characteristics you didn't account for.
Messing around with node-mysql, I wrote some code that lets me use PDO-style :bound values (plus ::bound field names), and rewrites the query with ? and ?? respectively where they are found, and builds a linear array of the values when I execute the statement. I did this because when I look at a SQL statement with a ton of ? ?? all over it and have to count the number of params in my execution, it makes my eyes bleed. I want to just assign a standard object at execution time.
The trouble is, after writing this (it works) I realized my regex for finding those colons in the statement had one tiny little problem, namely, it looks like this:
/.?:(\w+)/g
It picks up the first colon if needed and we take it from there. The problem is, it also picks up colons in literals within the query. So if for some reason you wanted a non-bound string as part of your insert/update, it would be replaced by this engine.
Is there any standard regex for picking up every global instance of the word ":param{#}" in the following statement, without picking up the word "Hello:world", in JS, without lookbacks?
INSERT INTO test VALUES(:param1, :param2, 'Hello:world', :param3);
You're often much better off writing a parser than using regular expressions. It's much more flexible, gives you better error reporting and allows you to handle current & future edge cases much more easily.
The string parsing deals with MySQL string literals syntax & escape sequences described here and just skips over them.
I'm not dealing with valid/invalid binding boundaries, but you could add that if you wanted. You could also remove error reporting such as unterminated string literals and just be forgiving.
The lookahead === ':' && peek() !== '=' condition is to ignore the := MySQL operator.
// Returns the {start, end, name} positions of every :name binding in `sql`,
// skipping bindings that appear inside string literals and ignoring the
// MySQL := operator. Wrapped in an IIFE so bindingCharRx is built only once.
const parseBindings = (() => {
const bindingCharRx = /\w/;
return function(sql) {
const bindings = [];
let i = 0,
lookahead = sql[i];
// Scan character by character, dispatching on what we see.
while (lookahead) {
if (isStringDelim(lookahead)) parseString();
else if (lookahead === ':' && peek() !== '=') parseBinding();
else consume();
}
return bindings;
// Skips over a 'single' or "double" quoted MySQL string literal, honoring
// backslash escapes and doubled-delimiter ('' / "") escapes.
function parseString() {
const start = i,
delim = lookahead;
consume();
while (lookahead) {
// A backslash escapes the next character — consume both.
if (lookahead === '\\') {
consume();
consume();
continue;
}
if (lookahead === delim) {
consume();
// A doubled delimiter is an escaped quote; anything else ends the literal.
if (lookahead !== delim) return;
}
consume();
}
// NOTE(review): "Underterminated" is a typo in the original message; left
// as-is here because it is a runtime string.
throw new Error(`Underterminated string literal starting at index ${start}.`);
}
function isStringDelim(char) {
return char === "'" || char === '"';
}
// Consumes a ':' plus the following run of word characters as a binding.
function parseBinding() {
const start = i;
consume();
while (lookahead && bindingCharRx.test(lookahead)) consume();
const name = sql.slice(start + 1, i);
if (!name.length) {
throw new Error(`Invalid binding starting at index ${start}.`);
}
bindings.push({
start,
end: i,
name: name
});
}
// Advances the scanner by one character.
function consume() {
lookahead = sql[++i]
}
// Peeks at the next character without consuming it.
function peek() {
return sql[i + 1]
}
}
})();
/**
 * Replaces each :name binding in `sql` with the corresponding entry from
 * `values`. Bindings without a supplied value are left untouched; a value
 * with no matching binding raises an error. String literals are respected
 * because the positions come from parseBindings().
 */
function replaceNamedBindings(values, sql) {
  const bindings = parseBindings(sql);
  const known = new Set(bindings.map((binding) => binding.name));
  for (const key of Object.keys(values)) {
    if (!known.has(key)) {
      throw new Error(`Couldn't find a binding named '${key}'.`);
    }
  }
  const pieces = [];
  let cursor = 0;
  for (const binding of bindings) {
    if (binding.name in values) {
      // Copy the untouched SQL up to the binding, then its replacement.
      pieces.push(sql.slice(cursor, binding.start), values[binding.name]);
      cursor = binding.end;
    }
  }
  pieces.push(sql.slice(cursor));
  return pieces.join('');
}
const sql = `INSERT INTO test VALUES(:param1, :param2, 'Hello:world', :param3);`;
// Replace all three named bindings; ':world' sits inside a string literal
// and is left untouched.
console.log(replaceNamedBindings({
param1: '(param1 value)',
param2: '(param2 value)',
param3: '(param3 value)'
}, sql));
// Show the raw binding metadata ({start, end, name} offsets).
console.log(parseBindings(sql));
// Bindings inside quoted literals and after ':=' are skipped.
console.log(parseBindings(`:pickup1 ":dontpickup1" ':dontpickup2' := """:dontpickup3" ''':dontpickup4' "\\":dontpickup5" :pickup2`));
//Will throw exception b/c :world is not a binding
console.log(replaceNamedBindings({
world: '(world value)'
}, sql));
I have an ASCII character string and I need to convert it to a normal string.
var asciiString = '84114117116104326510811997121115328710511011532'

/**
 * Decodes a run of concatenated decimal ASCII codes back into text.
 * Codes are variable width: anything starting with '1' is a 3-digit code
 * (100-127), otherwise it is a 2-digit code (32-99). That covers space (32),
 * 'A'-'Z' (65-90) and 'a'-'z' (97-122) — the ranges the original allowed.
 *
 * Fixes over the original: the original walked one digit at a time (so no
 * window ever matched the valid-code list) and ended with
 * `result.fromCharCodeAt(...)`, which is not a function and threw.
 */
function strFunc(str) {
  var result = ''
  for (var i = 0; i < str.length;) {
    // '1xx' codes are three digits wide; everything else in range is two.
    var numDigits = str[i] === '1' ? 3 : 2
    result += String.fromCharCode(parseInt(str.substr(i, numDigits), 10))
    i += numDigits
  }
  return result
}
console.log(strFunc(asciiString))
Why does it return an empty string? I need to split the string into either 2 digit or 3 digits and compare it with the array I built.
The string should be split as [84, 114, 117, 116, 104, 32, 65, ...] which translates to TRUTH A....
Please advise.
I'd do this way
const encodedString = '84114117116104326510811997121115328710511011532';
// Slice the digit stream into variable-width codes: a leading '1' marks a
// 3-digit code (100-127); anything else is a 2-digit code (32-99).
const codes = [];
let pos = 0;
while (pos < encodedString.length) {
  const width = encodedString[pos] === '1' ? 3 : 2;
  codes.push(encodedString.substr(pos, width));
  pos += width;
}
// String.fromCharCode coerces the numeric strings for us.
const str = String.fromCharCode(...codes);
console.log(`"${str}"`);
Some notes:
It assume values in the encoded string go from 32 to 127. There's no error checking
There's no reason to call parseInt as JavaScript will convert numbers strings to numbers so passing the numbers as strings to String.fromCharCode works.
As for why your code doesn't work, a couple of issues include
it's looping over every character, not every code.
It's looping over 8, 4, 1, 1, 4, ... instead of 84, 114, ...
This means neither test will pass since item will never be something found in validAscii which means result will have nothing pushed to it.
There's no function Array.fromCharCodeAt
result is an array and there is no such function as array.fromCharCodeAt. If result had the correct codes in it then you could use String.fromCharCode(...result)
When you're combining two elements of the string, you need to call parseInt() on the result of the concatenation, not concatenate the results of parseInt(). So it should be:
} else if (validAscii.includes(parseInt(item + strSplit[index+1]))){
And since ASCII values can be 3 digits, you need another else if that looks for item + strSplit[index+1] + strSplit[index+2].
Another problem is that you're pushing item onto the result string. But to get the corresponding character, you need to use String.fromCharCode() to convert the concatenated ASCII code to a character.
// Walk the digit characters; at each position try the single digit, then the
// 2-digit window, then the 3-digit window against the valid-ASCII list, and
// append the decoded character on a match.
// Fix: the original was missing a closing parenthesis on both
// String.fromCharCode(...) push calls, which is a syntax error.
strSplit.forEach((item, index) => {
  if (validAscii.includes(parseInt(item))) {
    result.push(String.fromCharCode(item))
  } else if (validAscii.includes(parseInt(item + strSplit[index + 1]))) {
    result.push(String.fromCharCode(parseInt(item + strSplit[index + 1])))
  } else if (validAscii.includes(parseInt(item + strSplit[index + 1] + strSplit[index + 2]))) {
    result.push(String.fromCharCode(parseInt(item + strSplit[index + 1] + strSplit[index + 2])))
  }
})
Note that using forEach like this is probably not a good idea. If there are overlapping items in the input that are both in validAscii, you'll add both of them to the result. E.g. if it contains 678 you'll match both 67 and 78, and add the corresponding characters to the result. Instead, you should use an ordinary `for` loop, and increment the index by the number of characters that you matched.
You want parseInt around the templated string, not the individual items. You were checking if the string is included in an array filled with numbers.
var asciiString = '84114117116104326510811997121115328710511011532'
// NOTE(review): as written this pushes the raw digit characters (`item`)
// rather than decoded letters, so the joined result is a string of digits,
// not the decoded text — a String.fromCharCode step appears to be missing.
// It also scans overlapping windows, so adjacent codes can double-match.
function strFunc(str) {
var result = []
var strSplit = str.split('');
// Valid codes: space (32), 'A'-'Z' (65-90), 'a'-'z' (97-122).
var validAscii = [32];
for(var i=65; i<=90; i++) {
validAscii.push(i);
}
for(var i=97; i<=122; i++) {
validAscii.push(i);
}
strSplit.forEach((item, index) => {
// A single digit (0-9) can never reach 32, so this branch never fires.
if(validAscii.includes(parseInt(item))) {
result.push(item)
} else if (validAscii.includes(parseInt(`${(strSplit[index])}${(strSplit[index + 1])}`))){
// Two-digit window matched — but only the first digit of it is kept.
result.push(item);
}
})
return result.join('');
}
console.log(strFunc(asciiString))
If the following regex can split a csv string by line.
var lines = csv.split(/\r|\r?\n/g);
How could this be adapted to skip newline chars that are contained within a CSV value (Ie between quotes/double-quotes)?
Example:
2,"Evans & Sutherland","230-132-111AA",,"Visual","P
CB",,1,"Offsite",
If you don't see it, here's a version with the newlines visible:
2,"Evans & Sutherland","230-132-111AA",,"Visual","P\r\nCB",,1,"Offsite",\r\n
The part I'm trying to skip over is the newline contained in the middle of the "PCB" entry.
Update:
I probably should've mentioned this before but this is a part of a dedicated CSV parsing library called jquery-csv. To provide a better context I have added the current parser implementation below.
Here's the code for validating and parsing an entry (ie one line):
// Validates one CSV line and splits it into an array of field values.
// Returns null when the line is not well-formed. The 'S' and 'D' letters in
// the template regexes below are placeholders that get substituted with the
// configured separator and delimiter characters.
// NOTE(review): substituting separator/delimiter straight into regex source
// assumes they are not regex metacharacters — verify for exotic configs.
$.csvEntry2Array = function(csv, meta) {
// Resolve options, falling back to the library-wide defaults.
var meta = (meta !== undefined ? meta : {});
var separator = 'separator' in meta ? meta.separator : $.csvDefaults.separator;
var delimiter = 'delimiter' in meta ? meta.delimiter : $.csvDefaults.delimiter;
// build the CSV validator regex
var reValid = /^\s*(?:D[^D\\]*(?:\\[\S\s][^D\\]*)*D|[^SD\s\\]*(?:\s+[^SD\s\\]+)*)\s*(?:S\s*(?:D[^D\\]*(?:\\[\S\s][^D\\]*)*D|[^SD\s\\]*(?:\s+[^SD\s\\]+)*)\s*)*$/;
reValid = RegExp(reValid.source.replace(/S/g, separator));
reValid = RegExp(reValid.source.replace(/D/g, delimiter));
// build the CSV line parser regex
var reValue = /(?!\s*$)\s*(?:D([^D\\]*(?:\\[\S\s][^D\\]*)*)D|([^SD\s\\]*(?:\s+[^SD\s\\]+)*))\s*(?:S|$)/g;
reValue = RegExp(reValue.source.replace(/S/g, separator), 'g');
reValue = RegExp(reValue.source.replace(/D/g, delimiter), 'g');
// Return NULL if input string is not well formed CSV string.
if (!reValid.test(csv)) {
return null;
}
// "Walk" the string using replace with callback.
// m1 = delimited (quoted) value, m2 = unquoted value.
var output = [];
csv.replace(reValue, function(m0, m1, m2) {
// Remove backslash from any delimiters in the value
if (m1 !== undefined) {
var reDelimiterUnescape = /\\D/g;
reDelimiterUnescape = RegExp(reDelimiterUnescape.source.replace(/D/, delimiter), 'g');
output.push(m1.replace(reDelimiterUnescape, delimiter));
} else if (m2 !== undefined) {
output.push(m2);
}
return '';
});
// Handle special case of empty last value.
// A trailing separator means the row ends with an empty field that the
// walker above does not emit.
var reEmptyLast = /S\s*$/;
reEmptyLast = RegExp(reEmptyLast.source.replace(/S/, separator));
if (reEmptyLast.test(csv)) {
output.push('');
}
return output;
};
Note: I haven't tested yet but I think I could probably incorporate the last match into the main split/callback.
This is the code that does the split-by-line part:
// Splits a CSV document into an array of row arrays.
// NOTE: splitting on newlines up front still breaks quoted values that
// contain embedded newlines (quoted multi-line fields need a real splitter).
$.csv2Array = function(csv, meta) {
  // Normalize options, falling back to library-wide defaults.
  var meta = (meta !== undefined ? meta : {});
  var separator = 'separator' in meta ? meta.separator : $.csvDefaults.separator;
  var delimiter = 'delimiter' in meta ? meta.delimiter : $.csvDefaults.delimiter;
  var skip = 'skip' in meta ? meta.skip : $.csvDefaults.skip;
  // process by line
  var lines = csv.split(/\r\n|\r|\n/g);
  var output = [];
  // Start directly at `skip` instead of the original for...in loop:
  // for...in yields string keys, making `i < skip` a string comparison (so
  // e.g. line 10 was wrongly skipped when skip was "2"), and it also visits
  // any enumerable properties added to Array.prototype.
  for (var i = skip || 0; i < lines.length; i++) {
    // process each value
    var line = $.csvEntry2Array(lines[i], {
      delimiter: delimiter,
      separator: separator
    });
    output.push(line);
  }
  return output;
};
For a breakdown on how that regex works take a look at this answer. Mine is a slightly adapted version. I consolidated the single and double quote matching to match just one text delimiter and made the delimiter/separators dynamic. It does a great job of validating entries but the line-splitting solution I added on top is pretty frail and breaks on the edge case I described above.
I'm just looking for a solution that walks the string extracting valid entries (to pass on to the entry parser) or fails on bad data returning an error indicating the line the parsing failed on.
Update:
// Object-literal method: splits a CSV document into physical rows while
// respecting quoted values. The regex tokenizes the input into quotes,
// commas, newlines, carriage returns, and runs of everything else; a small
// state machine accumulates the current row in `value` (commas included)
// and flushes it at each unquoted newline.
// NOTE(review): `line` and the `delimiter` parameter are unused.
// NOTE(review): opening quotes are never appended to `value`, so each
// returned row has its quote characters stripped; and the escaped-quote
// branch in state 2 tests `value`'s last character, which looks like it
// throws "Illegal state" on the first RFC 4180 "" escape (e.g. `a,"b""c"`)
// — verify against quoted-quote inputs.
splitLines: function(csv, delimiter) {
var state = 0;
var value = "";
var line = "";
var lines = [];
// Flush the accumulated row and reset for the next one.
function endOfRow() {
lines.push(value);
value = "";
state = 0;
};
csv.replace(/(\"|,|\n|\r|[^\",\r\n]+)/gm, function (m0){
switch (state) {
// the start of an entry
case 0:
if (m0 === "\"") {
state = 1;
} else if (m0 === "\n") {
endOfRow();
} else if (/^\r$/.test(m0)) {
// carriage returns are ignored
} else {
value += m0;
state = 3;
}
break;
// delimited input
case 1:
if (m0 === "\"") {
state = 2;
} else {
value += m0;
state = 1;
}
break;
// delimiter found in delimited input
case 2:
// is the delimiter escaped?
if (m0 === "\"" && value.substr(value.length - 1) === "\"") {
value += m0;
state = 1;
} else if (m0 === ",") {
value += m0;
state = 0;
} else if (m0 === "\n") {
endOfRow();
} else if (m0 === "\r") {
// Ignore
} else {
throw new Error("Illegal state");
}
break;
// un-delimited input
case 3:
if (m0 === ",") {
value += m0;
state = 0;
} else if (m0 === "\"") {
throw new Error("Unquoted delimiter found");
} else if (m0 === "\n") {
endOfRow();
} else if (m0 === "\r") {
// Ignore
} else {
throw new Error("Illegal data");
}
break;
default:
throw new Error("Unknown state");
}
return "";
});
// Flush a final row that wasn't newline-terminated.
if (state != 0) {
endOfRow();
}
return lines;
}
All it took is 4 states for a line splitter:
0: the start of an entry
1: the following is quoted
2: a second quote has been encountered
3: the following isn't quoted
It's almost a complete parser. For my use case, I just wanted a line splitter so I could provide a more granular approach to processing CSV data.
Note: Credit for this approach goes to another dev whom I won't name publicly without his permission. All I did was adapt it from a complete parser to a line-splitter.
Update:
Discovered a few broken edge cases in the previous lineSplitter implementation. The one provided should be fully RFC 4180 compliant.
As I have noted in a comment there is no complete solution just using single regex.
A novel method using several regexps by splitting on comma and joining back strings with embedded commas is described here:-
Personally I would use a simple finite state machine as described here
The state machine has more code, but the code is cleaner and its clear what each piece of code is doing. Longer term this will be much more reliable and maintainable.
It's not a good idea to use regex's to parse. Better to use it to detect the "bad" splits and then merge them back:
// Split on newlines first, then detect lines where the split was wrong (an
// odd number of quotes means a quoted value continued onto the next line)
// and merge those lines back together.
var lines = csv.split(/\r?\n/g);
var bad = [];
// Walk bottom-up so merges don't disturb the indexes still to be visited.
for(var i=lines.length-1; i> 0; i--) {
// find all the unescaped quotes on the line:
// NOTE(review): this treats backslash as the quote escape; RFC 4180 CSV
// escapes quotes by doubling ("") instead — confirm which convention the
// data actually uses.
var m = lines[i].match(/[^\\]?\"/g);
// if there are an odd number of them, this line, and the line after it is bad:
if((m ? m.length : 0) % 2 == 1) { bad.push(i--); }
}
// starting at the bottom of the list, merge lines back, using \r\n
// NOTE(review): each bad line is merged with exactly one predecessor, so a
// quoted value spanning three or more physical lines would need more passes.
for(var b=0,len=bad.length; b < len; b++) {
lines.splice(bad[b]-1, 2, lines[bad[b]-1]+"\r\n"+lines[bad[b]]);
}
(This answer is licensed under both CC0 and WTFPL.)
Be careful — that newline is PART of the value. It's not PCB, it's P\nCB.
However, why can't you just use string.split(',')? If need be, you can run through the list and cast to ints or remove the padded quotation marks.