Nodejs encoding issue - javascript

I'm trying to get data from a request, but the formatting or encoding isn't what I'm looking for.
I've tried to set the encoding using req.setEncoding('utf8')
The string I should be getting is:
import Graphics.Element exposing (..)
import Graphics.Collage exposing (..)
import Color exposing (..)
main : Element
main = collage 500 500 [filled orange (circle (1 + 49)]
What I am actually getting is: import+Graphics.Element+exposing+%28..%29%0D%0Aimport+Graphics.Collage+exposing+%28..%29%0D%0Aimport+Color+exposing+%28..%29%0D%0Amain+%3A+Element%0D%0Amain+%3D+collage+500+500+%5Bfilled+orange+%28circle+%281+%2B+49%29%5D
This is where I read the data and set the encoding:
/**
 * Registers the POST handler for '/elmsim.html' on the surrounding `app`.
 *
 * NOTE(review): a request stream may emit 'data' once per chunk, so
 * createDir is invoked for every chunk received, not once per request.
 */
function onPost () {
  app.post('/elmsim.html', function handleElmsimPost (req, res) {
    console.log('POST request received from elmsim')
    // Ask the stream to hand over strings instead of Buffers.
    req.setEncoding('ascii')
    req.on('data', function forwardChunk (data) {
      // Create new directory from the received body text
      createDir(data, res)
    })
  })
}
Any help would be great! Thanks

The string you are getting is an url encoded string.
Have you tried calling decodeURIComponent on the string?
decodeURIComponent( string )

Luca's answer is correct, but decodeURIComponent will not work for strings including a plus sign. You must split the string using '%2B' as a splitter (This represents a plus sign) and apply decodeURIComponent to each individual string. The strings can then be concatenated, and the plus signs can be added back.
This is my solution:
/**
 * Decodes a form-urlencoded (application/x-www-form-urlencoded) string.
 *
 * In that encoding a literal '+' stands for a space, while a real plus sign
 * is sent as '%2B' (or '%2b'). Turning every literal '+' into a space BEFORE
 * percent-decoding keeps encoded plus signs intact regardless of hex case,
 * so no splitting on '%2B' is required.
 *
 * @param {string} str - The url-encoded text, e.g. 'a+%2B+b'.
 * @returns {string} The decoded text, e.g. 'a + b'.
 * @throws {URIError} If str contains a malformed percent escape.
 */
function decodeWithPlus(str) {
  // Literal '+' characters can only mean "space" at this point, because
  // genuine plus signs are still percent-encoded.
  var withSpaces = str.replace(/\+/g, ' ')
  return decodeURIComponent(withSpaces)
}
/**
 * Returns a copy of `str` in which every character equal to `find` is
 * replaced by `replace`.
 *
 * The comparison is done one character at a time, so `find` is expected to
 * be a single character; a longer `find` never matches anything.
 *
 * @param {string} find - The single character to look for.
 * @param {string} replace - The text inserted in place of each match.
 * @param {string} str - The text to scan.
 * @returns {string} The text with all matches replaced.
 */
function replaceAll(find, replace, str) {
  var outs = ''
  // The index is declared locally: an undeclared counter would leak into the
  // global scope and throws a ReferenceError in strict mode / ES modules.
  for (var i = 0; i < str.length; i++) {
    if (str[i] === find) {
      outs += replace
    }
    else {
      outs += str[i]
    }
  }
  return outs
}

Related

How do I parse JSON sprinkled unpredictably into a string?

Suppose that I've got a node.js application that receives input in a weird format: strings with JSON arbitrarily sprinkled into them, like so:
This is a string {"with":"json","in":"it"} followed by more text {"and":{"some":["more","json"]}} and more text
I have a couple guarantees about this input text:
The bits of literal text in between the JSON objects are always free from curly braces.
The top level JSON objects shoved into the text are always object literals, never arrays.
My goal is to split this into an array, with the literal text left alone and the JSON parsed out, like this:
[
"This is a string ",
{"with":"json","in":"it"},
" followed by more text ",
{"and":{"some":["more","json"]}},
" and more text"
]
So far I've written a naive solution that simply counts curly braces to decide where the JSON starts and stops. But this wouldn't work if the JSON contains strings with curly braces in them {"like":"this one } right here"}. I could try to get around that by doing similar quote counting math, but then I also have to account for escaped quotes. At that point it feels like I'm redoing way too much of JSON.parse's job. Is there a better way to solve this problem?
You can check if JSON.parse throws an error to determine if the chunk is a valid JSON object or not. If it throws an error then the unquoted } are unbalanced:
// Sample inputs: pure JSON, pure text, braces hidden inside JSON strings,
// an empty object, and several objects interleaved with text.
const tests = [
    '{"just":"json }}{}{}{{[]}}}}","x":[1,2,3]}',
    'Just a string',
    'This string has a tricky case: {"like":"this one } right here"}',
    'This string {} has a tiny JSON object in it.',
    '.{}.',
    'This is a string {"with":"json","in":"it"} followed by more text {"and":{"some":["more","json"]}} and more text',
];
// Print the chunk list produced for every sample.
for ( const test of tests ) {
    console.log( parse_json_interleaved_string( test ) );
}
/**
 * Splits a string that has JSON object literals embedded in plain text into
 * an ordered list of chunks.
 *
 * Plain-text chunks and JSON chunks are both returned as substrings of the
 * input (the JSON is validated with JSON.parse but not converted). The end
 * of each JSON object is found by trying successive '}' positions until the
 * candidate substring parses.
 *
 * @param {string} str - Text with zero or more embedded JSON objects.
 * @returns {string[]} The chunks in input order. A string without any '{'
 *   is returned as a single chunk.
 * @throws {Error} If a '{' is never followed by a '}' that completes a
 *   valid JSON value.
 */
function parse_json_interleaved_string ( str ) {
    const chunks = [ ];
    // Index of the first character that has not been emitted yet.
    let cursor = 0;
    let json_index = str.indexOf( '{', cursor );
    while ( json_index !== -1 ) {
        // Push the plain string before the JSON (even a single character)
        if ( json_index > cursor )
            chunks.push( str.substring( cursor, json_index ) );
        // Find the end of the JSON
        let json_end_index = str.indexOf( '}', json_index + 1 );
        while ( true ) {
            // Checked before parsing: with -1 the substring call would swap
            // its arguments and test the text in front of the '{' instead.
            if ( json_end_index === -1 )
                throw new Error( 'Unterminated JSON object in string' );
            try {
                JSON.parse( str.substring( json_index, json_end_index + 1 ) );
                break;
            } catch ( e ) {
                // This '}' sat inside a string or a nested object; try the next one.
                json_end_index = str.indexOf( '}', json_end_index + 1 );
            }
        }
        // Push JSON
        chunks.push( str.substring( json_index, json_end_index + 1 ) );
        cursor = json_end_index + 1;
        // Resume exactly after the object so an adjacent '{' is not skipped.
        json_index = str.indexOf( '{', cursor );
    }
    // Push final plain string if any
    if ( cursor === 0 )
        chunks.push( str );
    else if ( cursor < str.length )
        chunks.push( str.substring( cursor ) );
    return chunks;
}
Here's a comparatively simple brute-force approach: split the whole input string on curly braces, then step through the array in order. Whenever you come across an open brace, find the longest chunk of the array from that starting point that successfully parses as JSON. Rinse and repeat.
This will not work if the input contains invalid JSON and/or unbalanced braces (see the last two test cases below.)
// Attempts to parse `input` as JSON. Returns the parsed value, or `false`
// when JSON.parse rejects the text.
const tryJSON = input => {
  let outcome = false;
  try {
    outcome = JSON.parse(input);
  } catch (e) {
    // Invalid JSON is an expected case here; keep the `false` marker.
  }
  return outcome;
}
// Splits `input` on curly braces and, for every '{', looks for the longest
// run of pieces ending in '}' that parses as JSON. Parsed objects and the
// remaining non-empty text pieces are collected in order, logged, and returned.
const parse = input => {
  const output = [];
  const chunks = input.split(/([{}])/);
  // The pieces never change, so the position of the last '}' is fixed.
  const lastClose = chunks.lastIndexOf('}');
  let i = 0;
  while (i < chunks.length) {
    const piece = chunks[i];
    if (piece === '{') {
      // found some possible JSON; start at the last } and backtrack until it works.
      for (let j = lastClose; j > i; j--) {
        if (chunks[j] !== '}') continue;
        const candidate = chunks.slice(i, j + 1).join("");
        const parsed = tryJSON(candidate);
        if (parsed) {
          // Keep the object and jump past everything it covered.
          output.push(parsed);
          i = j;
          break;
        }
      }
    } else if (piece) {
      // neither JSON nor empty
      output.push(piece);
    }
    i++;
  }
  console.log(output)
  return output
}
// Demo inputs; the last two contain invalid JSON on purpose.
for (const sample of [
  `{"foo": "bar"}`,
  `test{"foo": "b}ar{{[[[{}}}}{}{}}"}`,
  `this {"is": "a st}ri{ng"} with {"json": ["in", "i{t"]}`,
  `{}`,
  `this {"i{s": invalid}`,
  `So is {this: "one"}`,
]) {
  parse(sample)
}
I could try to get around that by doing similar quote counting math, but then I also have to account for escaped quotes. At that point it feels like I'm redoing way too much of JSON.parse's job. Is there a better way to solve this problem?
I don't think so. Your input is pretty far from JSON.
But accounting for all those things isn't that hard.
The following snippet should work:
/**
 * Walks `str` character by character and splits it into plain-text pieces
 * and parsed JSON objects.
 *
 * Brace depth is tracked outside of JSON strings only; inside a JSON string,
 * a backslash toggles an "escaped" flag so that an escaped quote does not end
 * the string. A text piece is pushed whenever a top-level '{' is met (it may
 * be the empty string), and the parsed object is pushed when depth returns
 * to zero.
 *
 * @param {string} str - Text with embedded JSON objects.
 * @returns {Array} Text pieces (strings) and parsed objects, in input order.
 */
function construct(str) {
  const total = str.length
  const pieces = []
  let consumedUpTo = -1 // index of the last character already emitted
  let depth = 0
  let insideString = false
  let escaped = false
  for (let pos = 0; pos < total; pos += 1) {
    const ch = str[pos]
    if (depth !== 0 && !escaped && ch === '"') {
      insideString = !insideString
    } else if (!insideString && ch === '{') {
      if (depth === 0) {
        // Emit the text in front of the object that starts here.
        pieces.push(str.slice(consumedUpTo + 1, pos))
        consumedUpTo = pos - 1
      }
      depth += 1
    } else if (!insideString && ch === '}') {
      depth -= 1
      if (depth === 0) {
        // The outermost object just closed: parse the whole span.
        pieces.push(JSON.parse(str.slice(consumedUpTo + 1, pos + 1)))
        consumedUpTo = pos
      }
    } else if (insideString && ch === '\\') {
      escaped = !escaped
    } else {
      escaped = false
    }
  }
  if (consumedUpTo !== total - 1) {
    // Trailing text after the last object.
    pieces.push(str.slice(consumedUpTo + 1, total))
  }
  return pieces
}
// Browser demo: shows construct()'s result for whatever is typed in the textarea.
const standardText = 'This is a string {"with":"json","in":"it"} followed by more text {"and":{"some":["more","json"]}} and more text. {"foo": "bar}"}'
const inputTA = document.getElementById('input')
const outputDiv = document.getElementById('output')
// Re-renders the pretty-printed result from the current textarea content.
function updateOutput() {
  const pieces = construct(inputTA.value)
  outputDiv.innerText = JSON.stringify(pieces, null, 2)
}
// Seed the textarea, re-render on every edit, and render once up front.
inputTA.value = standardText
inputTA.oninput = updateOutput
updateOutput()
<textarea id="input" rows="5" cols="50"></textarea>
<pre id="output"></pre>
You can use RegExp /(\s(?=[{]))|\s(?=[\w\s]+[{])/ig to .split() space character followed by opening curly brace { or space character followed by one or more word or space characters followed by opening curly brace, .filter() to remove undefined values from resulting array, create a new array, then while the resulting split array has .length get the index where the value contains only space characters, .splice() the beginning of the matched array to the index plus 1, if array .length is 0 .push() empty string '' else space character ' ' with match .join()ed by space character ' ' .replace() last space character and .shift() matched array, which is JSON, then next element of the matched array.
const str = `This is a string {"with":"json","in":"it"} followed by more text {"and":{"some":["more","json"]}} and more text {"like":"this one } right here"}`;
// Splits `s` on whitespace that leads up to an opening curly brace, then
// rebuilds the pieces as alternating entries: the text in front of a brace
// (re-joined with spaces) followed by the piece that comes after it.
const formatStringContainingJSON = s => {
  const separator = /(\s(?=[{]))|\s(?=[\w\s]+[{])/ig;
  const isBlank = piece => /^\s+$/.test(piece);
  // Unmatched capture groups show up as undefined in the split result; drop
  // them together with empty strings.
  const queue = s.split(separator).filter(Boolean);
  const res = [];
  while (queue.length) {
    // Everything up to and including the first whitespace-only piece is text.
    const textParts = queue.splice(0, queue.findIndex(isBlank) + 1);
    const lead = res.length ? ' ' : '';
    const text = textParts.join(' ').replace(/\s$/, '');
    // The piece right after the text run is taken as the JSON part.
    const jsonPart = queue.shift();
    res.push(`${lead}${text}`, `${jsonPart}`);
  }
  return res;
}
let result = formatStringContainingJSON(str);
console.log(result);
Here is one approach that iterates char by char. First we create an array from the input and then use reduce() on it. When we detect an opening curly bracket { we push the currently accumulated chunk onto an array of detected results, and then we set a flag on the accumulator object we are using in reduce(). While this flag is set to true we try to parse the chunk as JSON, and only on success do we put the chunk representing the JSON onto the array of detected results and set the flag back to false.
The accumulator of the reduce() method will hold next data:
res: an array with detected results: strings or jsons.
chunk: a string representing the current accumulated chunk of chars.
isJson: a boolean indicating if the current chunk is json or not.
const input = 'This is a string {"with":"json", "in":"it"} followed by more text {"and":{"some":["more","json","data"]}} and more text';
// Stage one: walk the input one character at a time. The state carries the
// finished pieces (res), the piece being built (chunk) and whether that piece
// is a JSON candidate (isJson).
let obj = Array.from(input).reduce((state, curr) =>
{
    const {res, isJson, chunk} = state;
    if (curr === "{")
    {
        // A brace outside of JSON closes the pending text piece and opens a
        // candidate; inside a candidate it is simply appended.
        if (!isJson) res.push(chunk);
        return {res, isJson: true, chunk: isJson ? chunk + curr : curr};
    }
    if (!isJson)
    {
        return {res, isJson, chunk: chunk + curr};
    }
    const candidate = chunk + curr;
    try
    {
        JSON.parse(candidate);
    }
    catch(e)
    {
        // Not complete yet: keep growing the candidate.
        return {res, isJson, chunk: candidate};
    }
    // If no error, we found a JSON.
    res.push(candidate);
    return {res, isJson: false, chunk: ""};
}, {res:[], isJson:false, chunk:""})
// First stage done, lets debug obtained data.
obj.res.push(obj.chunk);
console.log(obj.res);
// Finally, we map the pieces: anything containing a brace is parsed.
let res = obj.res.map(x => x.match("{") ? JSON.parse(x) : x);
console.log(res);
Obligatory answer: this is an improper format (because of this complication, and the guarantee is a security hole if the parser is improperly designed); it should ideally be redesigned. (Sorry, it had to be said.)
Barring that, you can generate a parser using your favorite parser generator that outputs to javascript as a target language. It might even have a demo grammar for JSON.
However, the glaring security issue is incredibly scary (if any JSON gets past the 'guarantee', suddenly it's a vector). An array interspersed representation seems nicer, with the constraint that assert(text.length == markup.length+1):
'{
"text": ['Hello', 'this is red text!'],
"markup": [{"text":"everyone", "color":"red"}]
}'
or even nicer:
'[
{"type":"text", "text":"Hello"},
{"type":"markup", "text":"everyone", "color":"red"} # or ,"val":{"text":.., "color":..}}
{"type":"text", "text":"this is red text!"},
...
]'
Store compressed ideally. Unserialize without any worries with JSON.parse.

Building JavaScript Array in C#, apostrophes changing

I have done this to build JavaScript Arrays from int, double and string lists.
/// <summary>
/// Renders a list of int, double or string values as a JavaScript array literal.
/// </summary>
/// <remarks>
/// Strings are wrapped in single quotes with backslashes, single quotes, line
/// breaks and "&lt;/" escaped, so the literal stays valid when embedded in a
/// script block. Numbers are formatted with the invariant culture so that the
/// decimal separator is always '.'. An empty list yields "[]".
/// </remarks>
/// <typeparam name="T">Element type; only int, double and string are supported.</typeparam>
/// <param name="cslist">The values to render.</param>
/// <returns>The array literal, e.g. <c>['a','b']</c> or <c>[1,2.5]</c>.</returns>
/// <exception cref="ArgumentException">T is not int, double or string.</exception>
public string listToJsArray<T>(List<T> cslist)
{
    bool isString = typeof(T) == typeof(string);
    if (!(isString
        || typeof(T) == typeof(int)
        || typeof(T) == typeof(double)))
    {
        throw (new ArgumentException(message: "Only int, double and string are supported"));
    }

    var items = new List<string>(cslist.Count);
    foreach (T item in cslist)
    {
        if (item == null)
        {
            // Only possible for string lists; emit a JavaScript null.
            items.Add("null");
            continue;
        }

        // The invariant culture avoids output such as "1,5" for doubles.
        string text = Convert.ToString(item, System.Globalization.CultureInfo.InvariantCulture);
        if (isString)
        {
            // The backslash must be escaped first so later escapes are not doubled.
            text = text
                .Replace("\\", "\\\\")
                .Replace("'", "\\'")
                .Replace("\r", "\\r")
                .Replace("\n", "\\n")
                .Replace("</", "<\\/");
            text = "'" + text + "'";
        }
        items.Add(text);
    }

    // Building from the joined items also closes the bracket for an empty list.
    return "[" + string.Join(",", items) + "]";
}
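My problem is that when a list of strings is passed, the apostrophes turn into the HTML entity &#39; in the rendered page.
For example, a list of {"1","2","3","4","5","6","7"} becomes this:
[&#39;1&#39;,&#39;2&#39;,&#39;3&#39;,&#39;4&#39;,&#39;5&#39;,&#39;6&#39;,&#39;7&#39;]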
What modification is needed in this function, to return a correct array in JavaScript?
None of the solutions solved the problem. With JsonConvert I get almost the same result. The problem is that the single or double quotes in the View editor do not have the same encoding as in the C# string.
I'm assuming that you are doing this to drop into a webpage somewhere, something like:
<script>
@{
// Razor code block: build the array literal on the server.
var output = listToJsArray(Model.SomeList);
}
// Html.Raw emits the literal without HTML-encoding its quotes.
var myArray = @Html.Raw(output);
// some Javascript using that array
</script>
Don't waste your time trying to do it yourself. It's a pain and you are reinventing the wheel. JSON is valid Javascript and a serialization of an array into JSON is absolutely identical to a Javascript array literal. So use Javascript. JSON.Net is really useful here:
<script>
@{
// Razor code block: let JSON.Net produce the array literal on the server.
var output = Newtonsoft.Json.JsonConvert.SerializeObject(Model.SomeList);
}
// Html.Raw emits the JSON without HTML-encoding its quotes.
var myArray = @Html.Raw(output);
// some Javascript using that array
</script>
The serializer will handle all the annoying escaping, special characters and edge cases for you.

encodeURIComponent throws an exception

I am programmatically building a URI with the help of the encodeURIComponent function using user provided input. However, when the user enters invalid unicode characters (such as U+DFFF), the function throws an exception with the following message:
The URI to be encoded contains an invalid character
I looked this up on MSDN, but that didn't tell me anything I didn't already know.
To correct this error
Ensure the string to be encoded contains only valid Unicode sequences.
My question is, is there a way to sanitize the user provided input to remove all invalid Unicode sequences before I pass it on to the encodeURIComponent function?
Taking the programmatic approach to discover the answer, the only range that turned up any problems was \ud800-\udfff, the range for high and low surrogates:
// Probes every UTF-16 code unit with encodeURIComponent and records the
// contiguous runs of code units for which it throws, rendered as a
// character-class regex source such as /[\ud800-\udfff]/.
var regex = '/[';
var firstI = null; // start of the run currently being collected
var lastI = null;  // end of that run
var i;
for (i = 0; i <= 65535; i++) {
  try {
    encodeURIComponent(String.fromCharCode(i));
  }
  catch(e) {
    if (firstI === null) {
      // First failing code unit: open a run.
      firstI = lastI = i;
    }
    else if (i === lastI + 1) {
      // Adjacent to the current run: extend it.
      lastI = i;
    }
    else {
      // Gap: write out the finished run, then start a new one here.
      regex += '\\u' + firstI.toString(16);
      if (firstI !== lastI) {
        regex += '-' + '\\u' + lastI.toString(16);
      }
      firstI = lastI = i;
    }
  }
}
// Write out the run that was still open when the scan ended.
regex += '\\u' + firstI.toString(16);
if (firstI !== lastI) {
  regex += '-' + '\\u' + lastI.toString(16);
}
regex += ']/';
alert(regex); // /[\ud800-\udfff]/
I then confirmed this with a simpler example:
// Confirms that encodeURIComponent accepts every single UTF-16 code unit
// outside the surrogate range.
for (var i = 0; i <= 65535; i++) {
  // Surrogates are skipped inside the body. Putting the exclusion in the
  // loop condition would end the loop at 0xD800 and leave 0xE000-0xFFFF
  // unchecked.
  if (i >= 0xD800 && i <= 0xDFFF) {
    continue;
  }
  try {
    encodeURIComponent(String.fromCharCode(i));
  }
  catch(e) {
    alert(e); // Doesn't alert
  }
}
alert('ok!');
And this fits with what MSDN says because indeed all those Unicode characters (even valid Unicode "non-characters") besides surrogates are all valid Unicode sequences.
You can indeed filter out high and low surrogates, but when used in a high-low pair, they become legitimate (as they are meant to be used in this way to allow for Unicode to expand (drastically) beyond its original maximum number of characters):
alert(encodeURIComponent('\uD800\uDC00')); // ok: a high surrogate followed by a low surrogate is a valid pair
alert(encodeURIComponent('\uD800')); // not ok: a lone high surrogate makes encodeURIComponent throw
alert(encodeURIComponent('\uDC00')); // not ok either: a lone low surrogate throws as well
So, if you want to take the easy route and block surrogates, it is just a matter of:
// Removes every surrogate code unit, so valid surrogate pairs are dropped too.
urlPart = urlPart.replace(/[\ud800-\udfff]/g, '');
If you want to strip out unmatched (invalid) surrogates while allowing surrogate pairs (which are legitimate sequences but the characters are rarely ever needed), you can do the following:
/**
 * Removes surrogate code units that are not part of a valid high+low pair,
 * leaving well-formed pairs (and all other characters) untouched.
 *
 * @param {string} str - Text that may contain lone surrogates.
 * @returns {string} The text without unmatched surrogates.
 */
function stripUnmatchedSurrogates (str) {
  // The first alternative consumes a complete pair, so the second one can
  // only ever match a surrogate that has no partner.
  return str.replace(/[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDFFF]/g, function (unit) {
    return unit.length === 2 ? unit : '';
  });
}
// Sample input: a lone high surrogate, one valid pair, and a lone low surrogate, separated by spaces.
var urlPart = '\uD801 \uD801\uDC00 \uDC01'
alert(stripUnmatchedSurrogates(urlPart)); // Leaves one valid sequence (representing a single non-BMP character)
If JavaScript had negative lookbehind the function would be a lot less ugly...

Split a CSV string by line skipping newlines contained between quotes

If the following regex can split a csv string by line.
// '\r\n' has to be tried first: with '\r' as the first alternative a Windows
// line ending is split twice and leaves an empty entry between the halves.
var lines = csv.split(/\r\n|\r|\n/g);
How could this be adapted to skip newline chars that are contained within a CSV value (Ie between quotes/double-quotes)?
Example:
2,"Evans & Sutherland","230-132-111AA",,"Visual","P
CB",,1,"Offsite",
If you don't see it, here's a version with the newlines visible:
2,"Evans & Sutherland","230-132-111AA",,"Visual","P\r\nCB",,1,"Offsite",\r\n
The part I'm trying to skip over is the newline contained in the middle of the "PCB" entry.
Update:
I probably should've mentioned this before but this is a part of a dedicated CSV parsing library called jquery-csv. To provide a better context I have added the current parser implementation below.
Here's the code for validating and parsing an entry (ie one line):
/**
 * Validates and parses a single CSV entry (one line) into an array of values.
 *
 * Values may be wrapped in the delimiter (quote) character; inside a wrapped
 * value a backslash escapes the next character, and an escaped delimiter is
 * unescaped in the result.
 *
 * @param {string} csv - One CSV entry.
 * @param {Object} [meta] - Optional settings.
 * @param {string} [meta.separator] - Value separator; defaults to $.csvDefaults.separator.
 * @param {string} [meta.delimiter] - Quote character; defaults to $.csvDefaults.delimiter.
 * @returns {?Array<string>} The values, or null if the entry is not well formed.
 */
$.csvEntry2Array = function(csv, meta) {
  var options = (meta !== undefined && meta !== null) ? meta : {};
  var separator = 'separator' in options ? options.separator : $.csvDefaults.separator;
  var delimiter = 'delimiter' in options ? options.delimiter : $.csvDefaults.delimiter;

  // Makes a literal safe to splice into a regex, inside or outside a character class.
  var escapeForRegExp = function(text) {
    return String(text).replace(/[\\^$.*+?()[\]{}|\/-]/g, '\\$&');
  };
  var separatorSource = escapeForRegExp(separator);
  var delimiterSource = escapeForRegExp(delimiter);

  // Builds a regex from a template in which a bare S stands for the separator
  // and a bare D for the delimiter. Escape sequences are consumed as a unit,
  // so the "S" of "\S" is left alone (replacing it would turn "[\S\s]" into
  // "[\,\s]" and break backslash escapes).
  var fromTemplate = function(template, flags) {
    var source = template.source.replace(/\\[\s\S]|[SD]/g, function(token) {
      if (token === 'S') {
        return separatorSource;
      }
      if (token === 'D') {
        return delimiterSource;
      }
      return token;
    });
    return new RegExp(source, flags);
  };

  // build the CSV validator regex
  var reValid = fromTemplate(/^\s*(?:D[^D\\]*(?:\\[\S\s][^D\\]*)*D|[^SD\s\\]*(?:\s+[^SD\s\\]+)*)\s*(?:S\s*(?:D[^D\\]*(?:\\[\S\s][^D\\]*)*D|[^SD\s\\]*(?:\s+[^SD\s\\]+)*)\s*)*$/);
  // build the CSV line parser regex
  var reValue = fromTemplate(/(?!\s*$)\s*(?:D([^D\\]*(?:\\[\S\s][^D\\]*)*)D|([^SD\s\\]*(?:\s+[^SD\s\\]+)*))\s*(?:S|$)/g, 'g');
  // matches a backslash-escaped delimiter inside a wrapped value
  var reDelimiterUnescape = fromTemplate(/\\D/g, 'g');

  // Return NULL if input string is not well formed CSV string.
  if (!reValid.test(csv)) {
    return null;
  }

  // "Walk" the string using replace with callback.
  var output = [];
  csv.replace(reValue, function(m0, m1, m2) {
    if (m1 !== undefined) {
      // Wrapped value: remove the backslash from any escaped delimiters. A
      // replacer function keeps "$" in the delimiter from being read as a pattern.
      output.push(m1.replace(reDelimiterUnescape, function() {
        return delimiter;
      }));
    } else if (m2 !== undefined) {
      output.push(m2);
    }
    return '';
  });

  // Handle special case of empty last value.
  if (fromTemplate(/S\s*$/).test(csv)) {
    output.push('');
  }
  return output;
};
Note: I haven't tested yet but I think I could probably incorporate the last match into the main split/callback.
This is the code that does the split-by-line part:
/**
 * Splits CSV text into lines and parses each line with $.csvEntry2Array.
 *
 * Lines are split on every line break, so a value that contains a line break
 * inside its delimiters is still cut in two.
 *
 * @param {string} csv - The CSV text.
 * @param {Object} [meta] - Optional settings.
 * @param {string} [meta.separator] - Value separator; defaults to $.csvDefaults.separator.
 * @param {string} [meta.delimiter] - Quote character; defaults to $.csvDefaults.delimiter.
 * @param {number} [meta.skip] - Number of leading lines to skip; defaults to $.csvDefaults.skip.
 * @returns {Array} One entry per processed line, each being whatever
 *   $.csvEntry2Array returned for that line.
 */
$.csv2Array = function(csv, meta) {
  var options = (meta !== undefined && meta !== null) ? meta : {};
  var separator = 'separator' in options ? options.separator : $.csvDefaults.separator;
  var delimiter = 'delimiter' in options ? options.delimiter : $.csvDefaults.delimiter;
  var skip = 'skip' in options ? options.skip : $.csvDefaults.skip;

  // process by line
  var lines = csv.split(/\r\n|\r|\n/g);
  var output = [];
  // An index loop is used instead of for...in, which would also visit any
  // enumerable property added to Array.prototype and yields string keys.
  for (var i = 0; i < lines.length; i++) {
    if (i < skip) {
      continue;
    }
    // process each value
    var line = $.csvEntry2Array(lines[i], {
      delimiter: delimiter,
      separator: separator
    });
    output.push(line);
  }
  return output;
};
For a breakdown of how that regex works take a look at this answer. Mine is a slightly adapted version. I consolidated the single and double quote matching to match just one text delimiter and made the delimiter/separators dynamic. It does a great job of validating entries but the line-splitting solution I added on top is pretty frail and breaks on the edge case I described above.
I'm just looking for a solution that walks the string extracting valid entries (to pass on to the entry parser) or fails on bad data returning an error indicating the line the parsing failed on.
Update:
splitLines: function(csv, delimiter) {
var state = 0;
var value = "";
var line = "";
var lines = [];
function endOfRow() {
lines.push(value);
value = "";
state = 0;
};
csv.replace(/(\"|,|\n|\r|[^\",\r\n]+)/gm, function (m0){
switch (state) {
// the start of an entry
case 0:
if (m0 === "\"") {
state = 1;
} else if (m0 === "\n") {
endOfRow();
} else if (/^\r$/.test(m0)) {
// carriage returns are ignored
} else {
value += m0;
state = 3;
}
break;
// delimited input
case 1:
if (m0 === "\"") {
state = 2;
} else {
value += m0;
state = 1;
}
break;
// delimiter found in delimited input
case 2:
// is the delimiter escaped?
if (m0 === "\"" && value.substr(value.length - 1) === "\"") {
value += m0;
state = 1;
} else if (m0 === ",") {
value += m0;
state = 0;
} else if (m0 === "\n") {
endOfRow();
} else if (m0 === "\r") {
// Ignore
} else {
throw new Error("Illegal state");
}
break;
// un-delimited input
case 3:
if (m0 === ",") {
value += m0;
state = 0;
} else if (m0 === "\"") {
throw new Error("Unquoted delimiter found");
} else if (m0 === "\n") {
endOfRow();
} else if (m0 === "\r") {
// Ignore
} else {
throw new Error("Illegal data");
}
break;
default:
throw new Error("Unknown state");
}
return "";
});
if (state != 0) {
endOfRow();
}
return lines;
}
All it took is 4 states for a line splitter:
0: the start of an entry
1: the following is quoted
2: a second quote has been encountered
3: the following isn't quoted
It's almost a complete parser. For my use case, I just wanted a line splitter so I could provide a more granular approach to processing CSV data.
Note: Credit for this approach goes to another dev whom I won't name publicly without his permission. All I did was adapt it from a complete parser to a line-splitter.
Update:
Discovered a few broken edge cases in the previous lineSplitter implementation. The one provided should be fully RFC 4180 compliant.
As I have noted in a comment there is no complete solution just using single regex.
A novel method using several regexps by splitting on comma and joining back strings with embedded commas is described here:-
Personally I would use a simple finite state machine as described here
The state machine has more code, but the code is cleaner and its clear what each piece of code is doing. Longer term this will be much more reliable and maintainable.
It's not a good idea to use regex's to parse. Better to use it to detect the "bad" splits and then merge them back:
// Split on every line break first, then glue back together the pieces that
// were cut inside a quoted value. A piece with an odd number of unescaped
// quotes opens (or closes) a quoted value, so pieces are accumulated until
// the running quote count is even again. This also covers values that span
// more than two lines.
var lines = [];
var pending = null; // record whose quoted value is still open, or null
csv.split(/\r?\n/g).forEach(function (piece) {
  // count the quotes that are not escaped with a backslash
  var quotes = 0;
  for (var c = 0; c < piece.length; c++) {
    if (piece.charAt(c) === '\\') {
      c++; // skip the escaped character
    } else if (piece.charAt(c) === '"') {
      quotes++;
    }
  }
  var odd = quotes % 2 === 1;
  if (pending === null) {
    if (odd) {
      pending = piece;
    } else {
      lines.push(piece);
    }
  } else {
    // merge lines back, using \r\n
    pending += "\r\n" + piece;
    if (odd) {
      lines.push(pending);
      pending = null;
    }
  }
});
// An unterminated quoted value is kept as a final record rather than dropped.
if (pending !== null) {
  lines.push(pending);
}
(This answer is licensed under both CC0 and WTFPL.)
Be careful- That newline is PART of that value. It's not PCB, it's P\nCB.
However, why can't you just use string.split(',')? If need be, you can run through the list and cast to ints or remove the padded quotation marks.

JavaScript CSV Parser Library [closed]

Closed. This question is off-topic. It is not currently accepting answers.
Want to improve this question? Update the question so it's on-topic for Stack Overflow.
Closed 9 years ago.
Improve this question
Is there a decent CSV Parser library for JavaScript? I've used this and that solution so far. In the first solution a new line is never created as a new sub-array, also the code tells so and the second solution does not work on text files formatted in Windows with <CR><LF> , respectively \r\n
Is it sufficient to apply
// A string pattern replaces only the first match; the global regex removes
// every carriage return.
text = text.replace(/\r/g, "");
to the Windows CSV files? This actually works, but I think this is a little bit quirky. Are there CSV parsers that are more widely used than a random blogger's solution?
Here's the 'easy' solution
// Splits on any line break; '\r\n' is listed first so it counts as a single break.
csv.split(/\r\n|\r|\n/g)
It handles:
\n
\r
\r\n
\n\r
Unfortunately, it breaks on values that contain newline chars between delimiters.
For example, the following line entry...
"this is some","valid CSV data","with a \r\nnewline char"
Will break it because the '\r\n' will be mistakenly interpreted as the end of an entry.
For a complete solution, your best bet is to create a ND-FSM (Non-Deterministic Finite State Machine) lexer/parser. If you have ever heard of the Chomsky Hierarchy, CSV can be parsed as a Type III grammar. That means char-by-char or token-by-token processing with state tracking.
I have a fully RFC 4180 compliant client-side library available but somehow I attracted the attention of a delete-happy mod for external linking. There's a link in my profile if you're interested; otherwise, good luck.
I'll give you fair warning from experience, CSV looks deceptively easy on the surface. After studying tens/hundreds of implementations, I have only seen 3 javascript parsers that did a reasonable job of meeting the spec and none of them were completely RFC compliant. I managed to write one but only with the help of the community and lots and lots of pain.
If you're working in Node, there's an excellent CSV parser that can handle extremely large amounts of data (>GB files) and supports escape characters.
If you're working in browser JS, you could still extract the processing logic from the code so that it operates on a string (instead of a Node Stream).
Here is one way to do it:
// based on json_parse from JavaScript The Good Part by D. Crockford
//
// csv_parse(line) parses ONE line of CSV text into an array of values.
//   - Values are separated by ','. The line may end with '\n', '\r\n' or
//     simply with the end of the input.
//   - Unquoted values that start with a digit or '-' and convert cleanly are
//     returned as numbers; everything else is returned as a string.
//   - Quoted values are wrapped in '"'; a doubled quote ("") inside them
//     stands for one literal quote.
//   - Throws a SyntaxError (with extra `at` and `text` properties) on
//     malformed input, including any text that follows the first line.
var csv_parse = function () {
    var at,     // index of the character after the current one
        ch,     // the current character ('' at end of input)
        text,   // the text being parsed
        error = function (m) {
            // A real Error object keeps the stack trace; name/message/at/text
            // are the properties callers can inspect.
            var e = new SyntaxError(m);
            e.at = at;
            e.text = text;
            throw e;
        },
        // Optionally checks that the current character is c, then advances.
        next = function (c) {
            if (c && c !== ch) {
                error("Expected '" + c + "' instead of '" + ch + "'");
            }
            ch = text.charAt(at);
            at += 1;
            return ch;
        },
        //needed to handle "" which indicates escaped quote
        peek = function () {
            return text.charAt(at);
        },
        // Skips whitespace, but never the line feed that ends the line.
        white = function () {
            while (ch && ch <= ' ' && ch !== '\n') {
                next();
            }
        },
        // if numeric, then return number
        number = function () {
            var number,
                string = word();
            number = +string;
            if (isNaN(number)) {
                return string;
            } else {
                return number;
            }
        },
        word = function () {
            var string = '';
            // `ch` is '' at end of input; without that check the loop would
            // never terminate for a line that lacks a trailing newline.
            while (ch && ch !== ',' && ch !== '\n') {
                string += ch;
                next();
            }
            // Drop the '\r' of a Windows line ending from the last value.
            return string.replace(/\r$/, '');
        },
        // the matching " is the end of word not ,
        // need to worry about "", which is escaped quote
        quoted = function () {
            var string = '';
            if (ch === '"') {
                while (next()) {
                    if (ch === '"') {
                        // need to know ending quote or escaped quote ("")
                        if (peek() === '"') {
                            next('"');
                            string += ch;
                        } else {
                            next('"');
                            return string;
                        }
                    } else {
                        string += ch;
                    }
                }
            }
            // Reached for a missing opening quote and for a value whose
            // closing quote never arrives.
            error("Bad string");
        },
        value = function () {
            white();
            switch (ch) {
            case '-':
                return number();
            case '"':
                return quoted();
            default:
                return ch >= '0' && ch <= '9' ? number() : word();
            }
        },
        line = function () {
            var array = [];
            white();
            while (ch && ch !== '\n') {
                array.push(value());
                white();
                if (ch === ',') {
                    next(',');
                    white();
                    if (!ch || ch === '\n') {
                        // a trailing separator means an empty last value
                        array.push('');
                    }
                } else if (ch && ch !== '\n') {
                    next(',');// not very liberal with delimiter: raises the error
                }
            }
            if (ch === '\n') {
                next('\n');
            }
            return array;
        };
    return function (_line) {
        var result;
        text = _line;
        at = 0;
        ch = ' ';
        result = line();
        white();
        if (ch) {
            error("Syntax error");
        }
        return result;
    };
}();
My function is solid, just drop in and use, I hope it is of help to you.
csvToArray v1.3
A compact (508 bytes) but compliant function to convert a CSV string into a 2D array, conforming to the RFC4180 standard.
http://code.google.com/p/csv-to-array/
Common Usage: jQuery
// Requests test.csv as plain text with caching disabled, then converts the
// response with csvToArray().
$.ajax({
url: "test.csv",
dataType: 'text',
cache: false
}).done(function(csvAsString){
// NOTE(review): csvAsArray is assigned without a declaration here, so it
// becomes an implicit global unless it is declared elsewhere.
csvAsArray=csvAsString.csvToArray();
});
Common usage: Javascript
csvAsArray = csvAsString.csvToArray();
Override field separator
csvAsArray = csvAsString.csvToArray("|");
Override record separator
csvAsArray = csvAsString.csvToArray("", "#");
Override Skip Header
csvAsArray = csvAsString.csvToArray("", "", 1);
Override all
csvAsArray = csvAsString.csvToArray("|", "#", 1);

Categories

Resources