Converting csv data into an array format - javascript

I am trying to form a wordCloud using jquery. I have a csv file to be read and use that data to form a wordCloud.
I have coloumns in my csv file as
text weight
Lorem 15
Ipsum 9
and so on
But the input data needs to be in the following format
var word_array = [
{text: "Lorem", weight: 15},
{text: "Ipsum", weight: 9},
{text: "Dolor", weight: 6},
{text: "Sit", weight: 7}
];
How should i convert my csv data into the above format to be able to form the word cloud.Please attach the code if possible. I am doing all this in my html page.Thank you.

Here is a possible solution in ES6:
const csv2json = (str, delimiter = ',') => {
const titles = str.slice(0, str.indexOf('\n')).split(delimiter);
const rows = str.slice(str.indexOf('\n') + 1).split('\n');
return rows.map(row => {
const values = row.split(delimiter);
return titles.reduce((object, curr, i) => (object[curr] = values[i], object), {})
});
};
let csv = "text weight\nLorem 15\nIpsum 9";
let word_array = csv2json(csv,' ');
console.log(word_array)

Here is a possible solution :
var csv = "";
csv += "text weight\n";
csv += "Lorem 15\n";
csv += "Ipsum 9";
var lines = csv.split("\n");
var titles = lines[0].split(" ");
var data = new Array(lines.length - 1);
for (var i = 1; i < lines.length; i++) {
data[i - 1] = {};
lines[i] = lines[i].split(" ");
for (var j = 0; j < titles.length; j++) {
data[i - 1][titles[j]] = lines[i][j];
}
}
console.log(data);

Here's some code I wrote for doing this:
const lineRegex = /((\\\n)|[^\n])+/g;
const datumRegex = /,?(("(\\"|.)+?")|([^",][^,]*))?/g;
const array2d: string[][] = rawCsvFile.match(lineRegex).map((row) =>
row.match(datumRegex).map((datum) => datum.replace(/^,?"?|"$/g, "").trim()),
);
The regexes above will handle newlines within datums as well as escaped quotes and quoted commas. And if you want to create an array of objects where each row is an object with the first rows values as prop names, you can use this.
console.log(array2d); // [[name, age], ["joe", 35], ["martha", 28]]
const out = [];
const propsRow = array2d[0];
array2d.forEach((row, i) => {
if (i === 0) { return; }
const addMe: any = {};
row.forEach((datum, j) => addMe[propsRow[j]] = datum);
out.push(addMe);
});
console.log(out); // [{name: "joe", age: 35}, {name: "martha", age: 28}]

You can get the data from CSV To Array by using below below code
/**
* Convert data in CSV (comma separated value) format to a javascript array.
*
* Values are separated by a comma, or by a custom one character delimeter.
* Rows are separated by a new-line character.
*
* Leading and trailing spaces and tabs are ignored.
* Values may optionally be enclosed by double quotes.
* Values containing a special character (comma's, double-quotes, or new-lines)
* must be enclosed by double-quotes.
* Embedded double-quotes must be represented by a pair of consecutive
* double-quotes.
*
* Example usage:
* var csv = '"x", "y", "z"\n12.3, 2.3, 8.7\n4.5, 1.2, -5.6\n';
* var array = csv2array(csv);
*
* Author: Jos de Jong, 2010
*
* #param {string} data The data in CSV format.
* #param {string} delimeter [optional] a custom delimeter. Comma ',' by default
* The Delimeter must be a single character.
* #return {Array} array A two dimensional array containing the data
* #throw {String} error The method throws an error when there is an
* error in the provided data.
*/
function csv2array(data, delimeter) {
// Retrieve the delimeter
if (delimeter == undefined)
delimeter = ',';
if (delimeter && delimeter.length > 1)
delimeter = ',';
// initialize variables
var newline = '\n';
var eof = '';
var i = 0;
var c = data.charAt(i);
var row = 0;
var col = 0;
var array = new Array();
while (c != eof) {
// skip whitespaces
while (c == ' ' || c == '\t' || c == '\r') {
c = data.charAt(++i); // read next char
}
// get value
var value = "";
if (c == '\"') {
// value enclosed by double-quotes
c = data.charAt(++i);
do {
if (c != '\"') {
// read a regular character and go to the next character
value += c;
c = data.charAt(++i);
}
if (c == '\"') {
// check for escaped double-quote
var cnext = data.charAt(i+1);
if (cnext == '\"') {
// this is an escaped double-quote.
// Add a double-quote to the value, and move two characters ahead.
value += '\"';
i += 2;
c = data.charAt(i);
}
}
}
while (c != eof && c != '\"');
if (c == eof) {
throw "Unexpected end of data, double-quote expected";
}
c = data.charAt(++i);
}
else {
// value without quotes
while (c != eof && c != delimeter && c!= newline && c != ' ' && c != '\t' && c != '\r') {
value += c;
c = data.charAt(++i);
}
}
// add the value to the array
if (array.length <= row)
array.push(new Array());
array[row].push(value);
// skip whitespaces
while (c == ' ' || c == '\t' || c == '\r') {
c = data.charAt(++i);
}
// go to the next row or column
if (c == delimeter) {
// to the next column
col++;
}
else if (c == newline) {
// to the next row
col = 0;
row++;
}
else if (c != eof) {
// unexpected character
throw "Delimiter expected after character " + i;
}
// go to the next character
c = data.charAt(++i);
}
return array;
}
Working Example Demo Link : http://www.speqmath.com/tutorials/csv2array/

Related

Improving combinations from abc[d[e,f],gh] pattern algorithm

I wrote an algorithm that is inadequate, namely because it does not handle [,abc] cases (see the string variations and conditions below), and would like to know how it can be improved so it covers those cases:
Given
Pattern, that describes strings variations: abc[de[f,g],hk], which gives
abcdef
abcdeg
abchk
Pattern consists of "arrays", that followed by strings: abc[...], and strings adj,kg,q
Another possible more complex example: utvk[fvu,gn[u,k,r],nl,q[t[ij,lo[z,x]],bm]].
Conditions
Strings itself can contain only letters and numbers. There couldn't be abc[h\,k,b] or abc[h\[k,b] that gives abch,k or abch[k.
"Arrays" always not empty, and has at least 2 elements.
There can be any order of "array", or "only string" value, i.e.: abc[a,b[c,d]] or abc[a[b,c],d]. The order is strict from left to right, there can not be from pattern abc[d,e] combinations eabc or dabc.
abc[d,e] doesn't gives abcde nor abced string, only abcd and abce.
Pattern always starts with string with array: something[...].
There can be string without array: abc[a,bc[d,f]], but array without string is not allowed: abc[a,[d,f]].
There can be an empty string, i.e.: a[,b], that gives a and ab
My solution
function getStrings(pat) {
if(pat.indexOf('[') == -1)
return pat;
String.prototype.insert = function(index, string) {
if (index > 0) {
return this.substring(0, index) + string + this.substr(index);
}
return string + this;
};
function getArray(str, start, isSource = false) {
if (start < 0) return null;
var n = 0;
var ret = "";
var i = start;
for (; i < str.length; i++) {
if (str[i] == "[") n++;
else if (str[i] == "]") n--;
if (n == 0) break;
}
var ret = {
str: "",
arr: "",
end: 0,
};
ret.arr = str.slice(start, i) + "]";
ret.end = i;
start--;
var end = start;
for (
;
start > 0 &&
str[start] != "," &&
str[start] != "]" &&
str[start] != "[";
start--
) {}
if(!isSource)
start++;
end++;
ret.str = str.slice(start, end);
return ret;
}
function getElement(source, start) {
var ret = [];
start++;
for (
;
start < source.length && source[start] != "," && source[start] != "]";
start++
)
ret[ret.length] = source[start];
return ret;
}
var source = getArray(pat, pat.indexOf("["), true); // parsing
var ar = source.arr;
source.arrs = getArrays(source); // parsing
source.source = true;
var fi = "";
var temp_out = [];
var tol = 0;
return getVariations(source); // getting variations of parsed
function getVariations(source) {
if (source.arrs == undefined) {
} else
for (var i = 0; i < source.arrs.length; i++) {
if (source.source) fi = source.str;
if (!source.arrs[i].arrs) {
temp_out[tol] = fi + source.arrs[i].str;
tol++;
} else {
var lafi = fi;
fi += source.arrs[i].str;
getVariations(source.arrs[i]);
if(i != source.arrs.length - 1)
fi = lafi;
}
if (source.source && i == source.arrs.length - 1) {
var temp = temp_out;
temp_out = [];
tol = 0;
return temp;
}
}
}
function getArrays(source) {
var la = 1;
var start = 0;
var arrs = [];
if (!source.arr) return;
while (start != -1) {
start = source.arr.indexOf("[", la);
var qstart = source.arr.indexOf(",", la);
if(source.arr[la] == ',')
qstart = source.arr.indexOf(",", la+1);
var pu = false;
if(qstart != la && qstart != -1 && qstart < start && start != -1)
{
pu = true;
var str = source.arr;
var buf = [];
qstart--;
var i = -1;
for(i = qstart; i > 0 && str[i] != '[' && str[i] != ','; i--)
{}
i++;
for(; i < str.length && str[i]!= ','; i++)
{
buf[buf.length] = str[i];
}
if(buf.length == 0)
{
la = start;
alert("1!")
}
else
{
buf = buf.join('');
arrs[arrs.length] = {str:buf};
la += buf.length+1;
}
}
else
if (start != -1) {
arrs[arrs.length] = getArray(source.arr, start);
la = arrs[arrs.length - 1].end + 1;
} else {
start = source.arr.indexOf(",", la);
if (start != -1) {
var ret = getElement(source.arr, start);
arrs[arrs.length] = ret;
la += ret.length;
}
}
}
for (var i = 0; i < arrs.length; i++)
if (typeof arrs[i] != "string" && arrs[i].arr) {
arrs[i].arrs = getArrays(arrs[i]);
var st = arrs[i].arr;
if (occ(arrs[i].arr, "[") == 1 && occ(arrs[i].arr, "]") == 1) {
st = st.replaceAll("[", '["');
st = st.replaceAll("]", '"]');
st = st.replaceAll(",", '","');
st = JSON.parse(st);
for (var j = 0; j < st.length; j++) st[j] = { str: st[j] };
arrs[i].arrs = st;
}
} else if (typeof arrs[i] == "string") {
arrs[i] = { str: arrs[i] };
}
RecursArrs(arrs);
return arrs;
}
function RecursArrs(arrs) {
for (var i = 0; i < arrs.length; i++) {
if (!arrs[i].source)
if (arrs[i].arr) {
delete arrs[i].arr;
delete arrs[i].end;
}
if (!arrs[i].str) {
try{
arrs[i] = { str: arrs[i].join("") };
}catch(er)
{
arrs[i] = {str:''};
}
if (i && arrs[i - 1].str == arrs[i].str) {
arrs.splice(i, 1);
i--;
}
} else if (arrs[i].arrs) RecursArrs(arrs[i].arrs);
}
}
function occ(string, word) {
return string.split(word).length - 1;
}
}
// getStrings('IE5E[COR[R[,G[A,E,I]],S,T,U,V,W,X,Y,Z],EORRG[I,M]]')
I would use a regular expression to break up the input into tokens. In this case I chose to take pairs of (letters, delimiter), where the delimiter is one of "[", "]", ",". The letters part could be empty.
Then I would use a recursive function like you did, but I went for a recursive generator function.
Here is the suggested implementation:
function* getStrings(pattern) {
const tokens = pattern.matchAll(/([^[\],]*)([[\],])?/g);
function* dfs(recur=false) {
let expectToken = true;
while (true) {
const [, token, delim] = tokens.next().value;
if (delim === "[") {
for (const deep of dfs(true)) yield token + deep;
} else {
if (token || expectToken) yield token;
if (delim === "]" && !recur) throw "Invalid pattern: too many `]`";
if (!delim && recur) throw "Invalid pattern: missing `]`";
if (delim !== ",") return;
}
expectToken = delim !== "["; // After [...] we don't expect a letter
}
}
yield* dfs();
}
const input = 'IE5E[COR[R[,G[A,E,I]],S,T,U,V,W,X,Y,Z],EORRG[I,M]]';
for (const s of getStrings(input))
console.log(s);
This implementation should match the patterns according to the given restrictions, but it will also allow the following:
An "array" can start without a prefix of letters. So [a,b] is allowed and will produce the same output as a,b.
An "array" may be followed immediately by letters or a new "array", but this will be interpreted as if they were separated by a comma. So x[a,b]c will be interpreted as x[a,b],c
An "array" can be empty. In that case the array is ignored. So x[] is the same as x.
There is some basic error checking: an error will be generated when the brackets are not balanced.
We can do this in an inside-out fashion. If we replace the innermost group (e.g. 'de[fg]' with its expansion, 'def,deg', and recur until there are no more groups remaining, we will have created a comma-separated list of final strings, which we can simply split apart and return.
const _expand = (
s,
match = s .match (/(.*?)(\w*)\[([^\[\]]+)\](.*)/),
[_, a, b, c, d] = match || []
) => match ? _expand (a + c .split (',') .map (x => b + x) .join (',') + d) : s
const expand = (s) => _expand (s) .split (',')
console .log (expand ('abc[de[f,g],hk]'))
console .log (expand ('utvk[fvu,gn[u,k,r],nl,q[t[ij,lo[z,x]],bm]]'))
.as-console-wrapper {max-height: 100% !important; top: 0}
Our main recursive function -- _expand -- uses a regular expression that extracts the first group, and breaks it into constituent parts, and puts it back together by mapping over the parts of the array. Then our public function, expand simply calls the recursive one and splits the result into an array.
For example, this is how the recursive calls would be handled for the string, 'utvk[fvu,gn[u,k,r],nl,q[t[ij,lo[z,x]],bm]]':
'utvk[fvu,gn[u,k,r],nl,q[t[ij,lo[z,x]],bm]]' //-->
// ^^^^^^^^^
'utvk[fvu,gnu,gnk,gnr,nl,q[t[ij,lo[z,x]],bm]]' //-->
// ^^^^^^^
'utvk[fvu,gnu,gnk,gnr,nl,q[t[ij,loz,lox],bm]]' //-->
// ^^^^^^^^^^^^^^^^^
'utvk[fvu,gnu,gnk,gnr,nl,q[tij,tloz,tlox,bm]]' //-->
// ^^^^^^^^^^^^^^^^^^^
'utvk[fvu,gnu,gnk,gnr,nl,qtij,qtloz,qtlox,qbm]' //-->
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
'utvkfvu,utvkgnu,utvkgnk,utvkgnr,utvknl,utvkqtij,utvkqtloz,utvkqtlox,utvkqbm'
Update: Regex explanation:
The regex used here can be broken down into six sections:
(.*?): captures (non-greedy) an initial set of characters, stored as a
(\w*): captures our letters before an opening brace, stored as b
\[: captures an opening brace ([)
([^\[\]]+): captures everything but braces ([ or ]), stored as c
\]: captures a closing brace (])
(.*): captures everything after the closing brace, stored as d
The point is for the group inside the braces to include no other braces. An example might look like this:
utvk[fvu,gn[u,k,r],nl,q[t[ij,lo[z,x]],bm]]
`----+---'\/|`-+-'|`----------+-----------'
\ | \ \ \__ \
| \ \_ \__ \____ \
a: (.*?) \_ \_ \ \ \
~~~~~ | \ \__ \ \
b: (\w*) | \ \ \
~~~~~ | \ \ \
[: \[ | \ \
~~ | \ \
c: ([^\[\]]+) \ \
~~~~~~~~~~ | |
]: \] |
~~ |
d: (.*)
~~~~
Vanialla solution without recursion:
const expander = /([^,[\]]*?)\[([^[\]]*?)]/;
const parse = (fields) => {
let result = fields;
while (result.match(expander)) {
result = result.replace(expander, (m, p1, p2) => p2.split(',').map((e) => `${p1}${e}`).join(','));
}
return result.split(',');
};
console.log(parse('abc[de[f,g],hk]'));
// => [ 'abcdef', 'abcdeg', 'abchk' ]
console.log(parse('utvk[fvu,gn[u,k,r],nl,q[t[ij,lo[z,x]],bm]]'));
// => [ 'utvkfvu', 'utvkgnu', 'utvkgnk', 'utvkgnr', 'utvknl', 'utvkqtij', 'utvkqtloz', 'utvkqtlox', 'utvkqbm' ]
.as-console-wrapper {max-height: 100% !important; top: 0}
Basically I just took the code from object-fields, which one could use as follows
// const objectFields = require('object-fields');
const parse = (input) => objectFields.split(input.replace(/\[/g, '(').replace(/]/g, ')')).map((e) => e.replace(/\./g, ''));
console.log(parse('abc[de[f,g],hk]'));
// => [ 'abcdef', 'abcdeg', 'abchk' ]
console.log(parse('utvk[fvu,gn[u,k,r],nl,q[t[ij,lo[z,x]],bm]]'));
// => [ 'utvkfvu', 'utvkgnu', 'utvkgnk', 'utvkgnr', 'utvknl', 'utvkqtij', 'utvkqtloz', 'utvkqtlox', 'utvkqbm' ]
.as-console-wrapper {max-height: 100% !important; top: 0}
<script src="https://bundle.run/object-fields#3.0.1"></script>
Disclaimer: I'm the author of object-fields
Let's describe an algorithm in words. Let's define word as a group of consecutive letters without a comma or bracket, which can also be an empty string. Then one way to think about this process is as a stack with two types of entries:
A word.
An opening bracket, [.
As we traverse the string,
(1) push words and opening brackets onto the stack, not commas.
(2a) when we reach a closing bracket, ], we start a list and keep popping the stack, adding words to that list until we pop an opening bracket from the stack. We then (2b) pop the next entry in the stack, which is the prefix for our current list, and (2c) push each entry from the list onto the stack with the prefix prepended.
Finally, return the stack.
Here's an implementation of the algorithm described above.
function f(s) {
if (s.length == 0) {
return [];
}
const stack = [""];
let i = 0;
while (i < s.length) {
if (s[i] == "[") {
i += 1;
stack.push("[", "");
} else if (s[i] == "]") {
i += 1;
const suffixes = [];
while (true) {
const word = stack.pop();
if (word == "[") {
const prefix = stack.pop();
for (let j = suffixes.length - 1; j >= 0; j--) {
stack.push(prefix + suffixes[j]);
}
break;
} else {
suffixes.push(word);
}
}
} else if (s[i] == ",") {
i += 1;
stack.push("");
} else {
stack[stack.length - 1] += s[i];
i += 1;
}
}
return stack;
}
// Output
var s = "a[bp,c[,d]],b[yx,]"
console.log(s);
for (const w of f(s)) {
console.log(w);
}
console.log("");
s = "abc[de[f,g],hk]"
console.log(s);
for (const w of f(s)) {
console.log(w);
}
Here is a recursion free solution using object-scan.
This solution is probably more of academic interest since it uses library internals and I wrote it to satisfy my curiosity whether it could be done this way. Also serves as a head scratcher for #ScottSauyet - payback for his answer which took me a while to figure out =)
Anyways, enjoy!
.as-console-wrapper {max-height: 100% !important; top: 0}
<script type="module">
import objectScan from 'https://cdn.jsdelivr.net/npm/object-scan#18.4.0/lib/index.min.js';
import { compile } from 'https://cdn.jsdelivr.net/npm/object-scan#18.4.0/lib/core/compiler.js';
const parse = (input) => {
const compiled = compile([input.replace(/\[/g, '.{').replace(/]/g, '}')], {});
return objectScan(['++{children[*]}.value'], {
filterFn: ({ parent }) => parent.children.length === 0,
rtn: ({ parents }) => parents.filter((e) => !Array.isArray(e)).map(({ value }) => value).reverse().slice(1).join('')
})(compiled);
};
console.log(parse('abc[de[f,g],hk]'));
// => [ 'abcdef', 'abcdeg', 'abchk' ]
console.log(parse('utvk[fvu,gn[u,k,r],nl,q[t[ij,lo[z,x]],bm]]'));
// => [ 'utvkfvu', 'utvkgnu', 'utvkgnk', 'utvkgnr', 'utvknl', 'utvkqtij', 'utvkqtloz', 'utvkqtlox', 'utvkqbm' ]
</script>
Disclaimer: I'm the author of object-scan

READ TEXT FILE AND CONVERT TO JSON IN JavaScript [duplicate]

Where could I find some JavaScript code to parse CSV data?
You can use the CSVToArray() function mentioned in this blog entry.
<script type="text/javascript">
// ref: http://stackoverflow.com/a/1293163/2343
// This will parse a delimited string into an array of
// arrays. The default delimiter is the comma, but this
// can be overriden in the second argument.
function CSVToArray( strData, strDelimiter ){
// Check to see if the delimiter is defined. If not,
// then default to comma.
strDelimiter = (strDelimiter || ",");
// Create a regular expression to parse the CSV values.
var objPattern = new RegExp(
(
// Delimiters.
"(\\" + strDelimiter + "|\\r?\\n|\\r|^)" +
// Quoted fields.
"(?:\"([^\"]*(?:\"\"[^\"]*)*)\"|" +
// Standard fields.
"([^\"\\" + strDelimiter + "\\r\\n]*))"
),
"gi"
);
// Create an array to hold our data. Give the array
// a default empty first row.
var arrData = [[]];
// Create an array to hold our individual pattern
// matching groups.
var arrMatches = null;
// Keep looping over the regular expression matches
// until we can no longer find a match.
while (arrMatches = objPattern.exec( strData )){
// Get the delimiter that was found.
var strMatchedDelimiter = arrMatches[ 1 ];
// Check to see if the given delimiter has a length
// (is not the start of string) and if it matches
// field delimiter. If id does not, then we know
// that this delimiter is a row delimiter.
if (
strMatchedDelimiter.length &&
strMatchedDelimiter !== strDelimiter
){
// Since we have reached a new row of data,
// add an empty row to our data array.
arrData.push( [] );
}
var strMatchedValue;
// Now that we have our delimiter out of the way,
// let's check to see which kind of value we
// captured (quoted or unquoted).
if (arrMatches[ 2 ]){
// We found a quoted value. When we capture
// this value, unescape any double quotes.
strMatchedValue = arrMatches[ 2 ].replace(
new RegExp( "\"\"", "g" ),
"\""
);
} else {
// We found a non-quoted value.
strMatchedValue = arrMatches[ 3 ];
}
// Now that we have our value string, let's add
// it to the data array.
arrData[ arrData.length - 1 ].push( strMatchedValue );
}
// Return the parsed data.
return( arrData );
}
</script>
jQuery-CSV
It's a jQuery plugin designed to work as an end-to-end solution for parsing CSV into JavaScript data. It handles every single edge case presented in RFC 4180, as well as some that pop up for Excel/Google spreadsheet exports (i.e., mostly involving null values) that the specification is missing.
Example:
track,artist,album,year
Dangerous,'Busta Rhymes','When Disaster Strikes',1997
// Calling this
music = $.csv.toArrays(csv)
// Outputs...
[
["track", "artist", "album", "year"],
["Dangerous", "Busta Rhymes", "When Disaster Strikes", "1997"]
]
console.log(music[1][2]) // Outputs: 'When Disaster Strikes'
Update:
Oh yeah, I should also probably mention that it's completely configurable.
music = $.csv.toArrays(csv, {
delimiter: "'", // Sets a custom value delimiter character
separator: ';', // Sets a custom field separator character
});
Update 2:
It now works with jQuery on Node.js too. So you have the option of doing either client-side or server-side parsing with the same library.
Update 3:
Since the Google Code shutdown, jquery-csv has been migrated to GitHub.
Disclaimer: I am also the author of jQuery-CSV.
Here's an extremely simple CSV parser that handles quoted fields with commas, new lines, and escaped double quotation marks. There's no splitting or regular expression. It scans the input string 1-2 characters at a time and builds an array.
Test it at http://jsfiddle.net/vHKYH/.
function parseCSV(str) {
var arr = [];
var quote = false; // 'true' means we're inside a quoted field
// Iterate over each character, keep track of current row and column (of the returned array)
for (var row = 0, col = 0, c = 0; c < str.length; c++) {
var cc = str[c], nc = str[c+1]; // Current character, next character
arr[row] = arr[row] || []; // Create a new row if necessary
arr[row][col] = arr[row][col] || ''; // Create a new column (start with empty string) if necessary
// If the current character is a quotation mark, and we're inside a
// quoted field, and the next character is also a quotation mark,
// add a quotation mark to the current column and skip the next character
if (cc == '"' && quote && nc == '"') { arr[row][col] += cc; ++c; continue; }
// If it's just one quotation mark, begin/end quoted field
if (cc == '"') { quote = !quote; continue; }
// If it's a comma and we're not in a quoted field, move on to the next column
if (cc == ',' && !quote) { ++col; continue; }
// If it's a newline (CRLF) and we're not in a quoted field, skip the next character
// and move on to the next row and move to column 0 of that new row
if (cc == '\r' && nc == '\n' && !quote) { ++row; col = 0; ++c; continue; }
// If it's a newline (LF or CR) and we're not in a quoted field,
// move on to the next row and move to column 0 of that new row
if (cc == '\n' && !quote) { ++row; col = 0; continue; }
if (cc == '\r' && !quote) { ++row; col = 0; continue; }
// Otherwise, append the current character to the current column
arr[row][col] += cc;
}
return arr;
}
I have an implementation as part of a spreadsheet project.
This code is not yet tested thoroughly, but anyone is welcome to use it.
As some of the answers noted though, your implementation can be much simpler if you actually have DSV or TSV file, as they disallow the use of the record and field separators in the values. CSV, on the other hand, can actually have commas and newlines inside a field, which breaks most regular expression and split-based approaches.
var CSV = {
parse: function(csv, reviver) {
reviver = reviver || function(r, c, v) { return v; };
var chars = csv.split(''), c = 0, cc = chars.length, start, end, table = [], row;
while (c < cc) {
table.push(row = []);
while (c < cc && '\r' !== chars[c] && '\n' !== chars[c]) {
start = end = c;
if ('"' === chars[c]){
start = end = ++c;
while (c < cc) {
if ('"' === chars[c]) {
if ('"' !== chars[c+1]) {
break;
}
else {
chars[++c] = ''; // unescape ""
}
}
end = ++c;
}
if ('"' === chars[c]) {
++c;
}
while (c < cc && '\r' !== chars[c] && '\n' !== chars[c] && ',' !== chars[c]) {
++c;
}
} else {
while (c < cc && '\r' !== chars[c] && '\n' !== chars[c] && ',' !== chars[c]) {
end = ++c;
}
}
row.push(reviver(table.length-1, row.length, chars.slice(start, end).join('')));
if (',' === chars[c]) {
++c;
}
}
if ('\r' === chars[c]) {
++c;
}
if ('\n' === chars[c]) {
++c;
}
}
return table;
},
stringify: function(table, replacer) {
replacer = replacer || function(r, c, v) { return v; };
var csv = '', c, cc, r, rr = table.length, cell;
for (r = 0; r < rr; ++r) {
if (r) {
csv += '\r\n';
}
for (c = 0, cc = table[r].length; c < cc; ++c) {
if (c) {
csv += ',';
}
cell = replacer(r, c, table[r][c]);
if (/[,\r\n"]/.test(cell)) {
cell = '"' + cell.replace(/"/g, '""') + '"';
}
csv += (cell || 0 === cell) ? cell : '';
}
}
return csv;
}
};
csvToArray v1.3
A compact (645 bytes), but compliant function to convert a CSV string into a 2D array, conforming to the RFC4180 standard.
https://code.google.com/archive/p/csv-to-array/downloads
Common Usage: jQuery
$.ajax({
url: "test.csv",
dataType: 'text',
cache: false
}).done(function(csvAsString){
csvAsArray=csvAsString.csvToArray();
});
Common usage: JavaScript
csvAsArray = csvAsString.csvToArray();
Override field separator
csvAsArray = csvAsString.csvToArray("|");
Override record separator
csvAsArray = csvAsString.csvToArray("", "#");
Override Skip Header
csvAsArray = csvAsString.csvToArray("", "", 1);
Override all
csvAsArray = csvAsString.csvToArray("|", "#", 1);
Here's my PEG(.js) grammar that seems to do ok at RFC 4180 (i.e. it handles the examples at http://en.wikipedia.org/wiki/Comma-separated_values):
start
= [\n\r]* first:line rest:([\n\r]+ data:line { return data; })* [\n\r]* { rest.unshift(first); return rest; }
line
= first:field rest:("," text:field { return text; })*
& { return !!first || rest.length; } // ignore blank lines
{ rest.unshift(first); return rest; }
field
= '"' text:char* '"' { return text.join(''); }
/ text:[^\n\r,]* { return text.join(''); }
char
= '"' '"' { return '"'; }
/ [^"]
Try it out at http://jsfiddle.net/knvzk/10 or http://pegjs.majda.cz/online. Download the generated parser at https://gist.github.com/3362830.
Here's another solution. This uses:
a coarse global regular expression for splitting the CSV string (which includes surrounding quotes and trailing commas)
fine-grained regular expression for cleaning up the surrounding quotes and trailing commas
also, has type correction differentiating strings, numbers, boolean values and null values
For the following input string:
"This is\, a value",Hello,4,-123,3.1415,'This is also\, possible',true,
The code outputs:
[
"This is, a value",
"Hello",
4,
-123,
3.1415,
"This is also, possible",
true,
null
]
Here's my implementation of parseCSVLine() in a runnable code snippet:
function parseCSVLine(text) {
return text.match( /\s*(\"[^"]*\"|'[^']*'|[^,]*)\s*(,|$)/g ).map( function (text) {
let m;
if (m = text.match(/^\s*,?$/)) return null; // null value
if (m = text.match(/^\s*\"([^"]*)\"\s*,?$/)) return m[1]; // Double Quoted Text
if (m = text.match(/^\s*'([^']*)'\s*,?$/)) return m[1]; // Single Quoted Text
if (m = text.match(/^\s*(true|false)\s*,?$/)) return m[1] === "true"; // Boolean
if (m = text.match(/^\s*((?:\+|\-)?\d+)\s*,?$/)) return parseInt(m[1]); // Integer Number
if (m = text.match(/^\s*((?:\+|\-)?\d*\.\d*)\s*,?$/)) return parseFloat(m[1]); // Floating Number
if (m = text.match(/^\s*(.*?)\s*,?$/)) return m[1]; // Unquoted Text
return text;
} );
}
let data = `"This is\, a value",Hello,4,-123,3.1415,'This is also\, possible',true,`;
let obj = parseCSVLine(data);
console.log( JSON.stringify( obj, undefined, 2 ) );
Here's my simple vanilla JavaScript code:
let a = 'one,two,"three, but with a comma",four,"five, with ""quotes"" in it.."'
console.log(splitQuotes(a))
function splitQuotes(line) {
if(line.indexOf('"') < 0)
return line.split(',')
let result = [], cell = '', quote = false;
for(let i = 0; i < line.length; i++) {
char = line[i]
if(char == '"' && line[i+1] == '"') {
cell += char
i++
} else if(char == '"') {
quote = !quote;
} else if(!quote && char == ',') {
result.push(cell)
cell = ''
} else {
cell += char
}
if ( i == line.length-1 && cell) {
result.push(cell)
}
}
return result
}
I'm not sure why I couldn't get Kirtan's example to work for me. It seemed to be failing on empty fields or maybe fields with trailing commas...
This one seems to handle both.
I did not write the parser code, just a wrapper around the parser function to make this work for a file. See attribution.
var Strings = {
/**
* Wrapped CSV line parser
* #param s String delimited CSV string
* #param sep Separator override
* #attribution: http://www.greywyvern.com/?post=258 (comments closed on blog :( )
*/
parseCSV : function(s,sep) {
// http://stackoverflow.com/questions/1155678/javascript-string-newline-character
var universalNewline = /\r\n|\r|\n/g;
var a = s.split(universalNewline);
for(var i in a){
for (var f = a[i].split(sep = sep || ","), x = f.length - 1, tl; x >= 0; x--) {
if (f[x].replace(/"\s+$/, '"').charAt(f[x].length - 1) == '"') {
if ((tl = f[x].replace(/^\s+"/, '"')).length > 1 && tl.charAt(0) == '"') {
f[x] = f[x].replace(/^\s*"|"\s*$/g, '').replace(/""/g, '"');
} else if (x) {
f.splice(x - 1, 2, [f[x - 1], f[x]].join(sep));
} else f = f.shift().split(sep).concat(f);
} else f[x].replace(/""/g, '"');
} a[i] = f;
}
return a;
}
}
Regular expressions to the rescue! These few lines of code handle properly quoted fields with embedded commas, quotes, and newlines based on the RFC 4180 standard.
function parseCsv(data, fieldSep, newLine) {
fieldSep = fieldSep || ',';
newLine = newLine || '\n';
var nSep = '\x1D';
var qSep = '\x1E';
var cSep = '\x1F';
var nSepRe = new RegExp(nSep, 'g');
var qSepRe = new RegExp(qSep, 'g');
var cSepRe = new RegExp(cSep, 'g');
var fieldRe = new RegExp('(?<=(^|[' + fieldSep + '\\n]))"(|[\\s\\S]+?(?<![^"]"))"(?=($|[' + fieldSep + '\\n]))', 'g');
var grid = [];
data.replace(/\r/g, '').replace(/\n+$/, '').replace(fieldRe, function(match, p1, p2) {
return p2.replace(/\n/g, nSep).replace(/""/g, qSep).replace(/,/g, cSep);
}).split(/\n/).forEach(function(line) {
var row = line.split(fieldSep).map(function(cell) {
return cell.replace(nSepRe, newLine).replace(qSepRe, '"').replace(cSepRe, ',');
});
grid.push(row);
});
return grid;
}
const csv = 'A1,B1,C1\n"A ""2""","B, 2","C\n2"';
const separator = ','; // field separator, default: ','
const newline = ' <br /> '; // newline representation in case a field contains newlines, default: '\n'
var grid = parseCsv(csv, separator, newline);
// expected: [ [ 'A1', 'B1', 'C1' ], [ 'A "2"', 'B, 2', 'C <br /> 2' ] ]
You don't need a parser-generator such as lex/yacc. The regular expression handles RFC 4180 properly thanks to positive lookbehind, negative lookbehind, and positive lookahead.
Clone/download code at https://github.com/peterthoeny/parse-csv-js
Just throwing this out there.. I recently ran into the need to parse CSV columns with Javascript, and I opted for my own simple solution. It works for my needs, and may help someone else.
const csvString = '"Some text, some text",,"",true,false,"more text","more,text, more, text ",true';
const parseCSV = text => {
const lines = text.split('\n');
const output = [];
lines.forEach(line => {
line = line.trim();
if (line.length === 0) return;
const skipIndexes = {};
const columns = line.split(',');
output.push(columns.reduce((result, item, index) => {
if (skipIndexes[index]) return result;
if (item.startsWith('"') && !item.endsWith('"')) {
while (!columns[index + 1].endsWith('"')) {
index++;
item += `,${columns[index]}`;
skipIndexes[index] = true;
}
index++;
skipIndexes[index] = true;
item += `,${columns[index]}`;
}
result.push(item);
return result;
}, []));
});
return output;
};
console.log(parseCSV(csvString));
Personally I like to use deno std library since most modules are officially compatible with the browser
The problem is that the std is in typescript but official solution might happen in the future https://github.com/denoland/deno_std/issues/641 https://github.com/denoland/dotland/issues/1728
For now there is an actively maintained on the fly transpiler https://bundle.deno.dev/
so you can use it simply like this
<script type="module">
import { parse } from "https://bundle.deno.dev/https://deno.land/std#0.126.0/encoding/csv.ts"
console.log(await parse("a,b,c\n1,2,3"))
</script>
I have constructed this JavaScript script to parse a CSV in string to array object. I find it better to break down the whole CSV into lines, fields and process them accordingly. I think that it will make it easy for you to change the code to suit your need.
//
//
// CSV to object
//
//
const new_line_char = '\n';
const field_separator_char = ',';
function parse_csv(csv_str) {
var result = [];
let line_end_index_moved = false;
let line_start_index = 0;
let line_end_index = 0;
let csr_index = 0;
let cursor_val = csv_str[csr_index];
let found_new_line_char = get_new_line_char(csv_str);
let in_quote = false;
// Handle \r\n
if (found_new_line_char == '\r\n') {
csv_str = csv_str.split(found_new_line_char).join(new_line_char);
}
// Handle the last character is not \n
if (csv_str[csv_str.length - 1] !== new_line_char) {
csv_str += new_line_char;
}
while (csr_index < csv_str.length) {
if (cursor_val === '"') {
in_quote = !in_quote;
} else if (cursor_val === new_line_char) {
if (in_quote === false) {
if (line_end_index_moved && (line_start_index <= line_end_index)) {
result.push(parse_csv_line(csv_str.substring(line_start_index, line_end_index)));
line_start_index = csr_index + 1;
} // Else: just ignore line_end_index has not moved or line has not been sliced for parsing the line
} // Else: just ignore because we are in a quote
}
csr_index++;
cursor_val = csv_str[csr_index];
line_end_index = csr_index;
line_end_index_moved = true;
}
// Handle \r\n
if (found_new_line_char == '\r\n') {
let new_result = [];
let curr_row;
for (var i = 0; i < result.length; i++) {
curr_row = [];
for (var j = 0; j < result[i].length; j++) {
curr_row.push(result[i][j].split(new_line_char).join('\r\n'));
}
new_result.push(curr_row);
}
result = new_result;
}
return result;
}
function parse_csv_line(csv_line_str) {
var result = [];
//let field_end_index_moved = false;
let field_start_index = 0;
let field_end_index = 0;
let csr_index = 0;
let cursor_val = csv_line_str[csr_index];
let in_quote = false;
// Pretend that the last char is the separator_char to complete the loop
csv_line_str += field_separator_char;
while (csr_index < csv_line_str.length) {
if (cursor_val === '"') {
in_quote = !in_quote;
} else if (cursor_val === field_separator_char) {
if (in_quote === false) {
if (field_start_index <= field_end_index) {
result.push(parse_csv_field(csv_line_str.substring(field_start_index, field_end_index)));
field_start_index = csr_index + 1;
} // Else: just ignore field_end_index has not moved or field has not been sliced for parsing the field
} // Else: just ignore because we are in quote
}
csr_index++;
cursor_val = csv_line_str[csr_index];
field_end_index = csr_index;
field_end_index_moved = true;
}
return result;
}
function parse_csv_field(csv_field_str) {
with_quote = (csv_field_str[0] === '"');
if (with_quote) {
csv_field_str = csv_field_str.substring(1, csv_field_str.length - 1); // remove the start and end quotes
csv_field_str = csv_field_str.split('""').join('"'); // handle double quotes
}
return csv_field_str;
}
// Initial method: check the first newline character only
function get_new_line_char(csv_str) {
if (csv_str.indexOf('\r\n') > -1) {
return '\r\n';
} else {
return '\n'
}
}
Just use .split(','):
var str = "How are you doing today?";
var n = str.split(" ");

Comma separated values except for those inside of double quotes [duplicate]

This question already has answers here:
Regex to match all instances not inside quotes
(4 answers)
Closed 7 years ago.
I have a service that formats strings in certain fields. Basically, when a user clicks out of the input box (on blur), the string is cleansed of illegal characters, and whitespace is replaced with commas. This is fine, but I would like to allow a user to add double quotes around grouped words. On blur, this should remove the quotes, but maintain the space in between the words, and then add a comma afterwards. I have tried everything but I can't get this to work. Here is how my service is currently set up:
angular.module('testApp')
.factory('formatStringService', [
function () {
return {
formatString: function (string) {
var styleStr = string;
if (styleStr === undefined) {
return;
}
styleStr = this.stringReplace(styleStr, '\n', ',');
styleStr = this.stringReplace(styleStr, '\t', ',');
styleStr = this.stringReplace(styleStr, ' ', ',');
styleStr = this.stringReplace(styleStr, ';', ',');
styleStr = this.newLine(styleStr);
styleStr = this.validated(styleStr);
for (var g = 0; g < 9; g++) {
styleStr = this.stringReplace(styleStr, ',,', ',');
}
if (styleStr.charAt(styleStr.length - 1) === ',') {
styleStr = styleStr.substr(0, (styleStr.length - 1));
}
if (styleStr.charAt(0) === '*') {
styleStr = styleStr.substr(1, (styleStr.length - 1));
}
if (styleStr.charAt(styleStr.length - 1) === '*') {
styleStr = styleStr.substr(0, (styleStr.length - 1));
}
return styleStr;
},
stringReplace: function (string, text, by) {
var strLength = string.length,
txtLength = text.length;
if ((strLength === 0) || (txtLength === 0)) {
return string;
}
var i = string.indexOf(text);
if ((!i) && (text !== string.substring(0, txtLength))) {
return string;
}
if (i === -1) {
return string;
}
var newstr = string.substring(0, i) + by;
if (i + txtLength < strLength) {
newstr += this.stringReplace(string.substring(i + txtLength, strLength), text, by);
}
return newstr;
},
validated: function (string) {
for (var i = 0, output = '', valid = '1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,~#+/\\*- '; i < string.length; i++) {
if (valid.indexOf(string.charAt(i)) !== -1) {
output += string.charAt(i);
}
}
return output;
},
newLine: function (string) {
for (var i = 0, output = ''; i < string.length; i++) {
if (string.charCodeAt(i) !== 10) {
output += string.charAt(i);
}
}
return output;
}
};
}
]);
Input string example: 1 2 3 "test test" 7 8
Should output: 1,2,3,test test,7,8
Here's a neat regex trick that you can use for this purpose:
var indices = [],
re = /"[^"]*"|( )/g,
str = '1 2 3 "test test" 7 8';
while ((match = re.exec(str)) !== null) {
if (match[1] !== undefined) indices.push(match.index);
}
var split = [], prevIndex = -1;
indices.forEach(function(index) {
split.push(str.slice(prevIndex + 1, index));
prevIndex = index;
});
document.getElementById('output').innerText = split.join('\n');
<pre id='output'></pre>
What we're doing here is matching on the regex /"[^"]*"|( )/—that is, either "stuff between quotes" or "a single space." So if we find a quote, we immediately start matching "stuff between quotes" (because regex is greedy), and hence any spaces between quotes are just gobbled up in that section of the regex.
Then we know that the ( ) will only be matched if we're not inside double quotes. So we stick the space into a capture group, and then for every match we can simply check whether the capture group exists.
Using a positive look ahead, something like this should do it
'1 2 3 "test test" 7 8'.match(/(".*?"|[^"\s]+)(?=\s|$)/g)
Reg Exp Visualizer

Get the line number of a string from body of text

Let's say I have a file with two lines and I get the indexOf a substring in the file. It returns with 18 the character that the substring is found at. How can I find the line with this information?
var file = [
'var foo = "hello"',
'console.log(foo)',
].join('\n')
var char = file.indexOf('console') // => 18
var line = lineOfChar(file, char) // => 2
A possible way to achieve this, is to find the the string like you did with:
var index = file.indexOf('console'); // => 18
Then use this index to make a substring containing everything before that index:
var tempString = str.substring(0, index);
And lastly we count the occurrences of \n:
var lineNumber = tempString.split('\n').length;
// You should do - 1 if you want your 'first' line to be 0
var assert = require('assert')
var file = [
'var alpha = "hello"',
'var beta = "hello"',
'var gamma = "hello"',
'var delta = "hello"',
'var episilon = "hello"'
].join('\n')
function getLine (body, charOrString) {
if (!body) return false
if (!charOrString) return false
var char = (typeof charOrString === 'string') ? body.indexOf(charOrString) : charOrString
var subBody = body.substring(0, char)
if (subBody === '') return false
var match = subBody.match(/\n/gi)
if (match) return match.length + 1
return 1
}
assert.equal(getLine(file, 'missing'), false)
assert.equal(getLine(file, 'alpha'), 1)
assert.equal(getLine(file, 'beta'), 2)
assert.equal(getLine(file, 'gamma'), 3)
assert.equal(getLine(file, 'delta'), 4)
assert.equal(getLine(file, 'episilon'), 5)
function lineOf(text, substring){
var line = 0, matchedChars = 0;
for (var i = 0; i < text.length; i++) {
text[i] === substring[matchedChars] ? matchedChars++ : matchedChars = 0;
if (matchedChars === substring.length){
return line;
}
if (text[i] === '\n'){
line++;
}
}
return -1;
}
Avoids iterating over the string twice, once to find the substring and again to find the newlines.

How can I put data from csv file to array using jquery? [duplicate]

Where could I find some JavaScript code to parse CSV data?
You can use the CSVToArray() function mentioned in this blog entry.
<script type="text/javascript">
// ref: http://stackoverflow.com/a/1293163/2343
// This will parse a delimited string into an array of
// arrays. The default delimiter is the comma, but this
// can be overriden in the second argument.
function CSVToArray( strData, strDelimiter ){
// Check to see if the delimiter is defined. If not,
// then default to comma.
strDelimiter = (strDelimiter || ",");
// Create a regular expression to parse the CSV values.
var objPattern = new RegExp(
(
// Delimiters.
"(\\" + strDelimiter + "|\\r?\\n|\\r|^)" +
// Quoted fields.
"(?:\"([^\"]*(?:\"\"[^\"]*)*)\"|" +
// Standard fields.
"([^\"\\" + strDelimiter + "\\r\\n]*))"
),
"gi"
);
// Create an array to hold our data. Give the array
// a default empty first row.
var arrData = [[]];
// Create an array to hold our individual pattern
// matching groups.
var arrMatches = null;
// Keep looping over the regular expression matches
// until we can no longer find a match.
while (arrMatches = objPattern.exec( strData )){
// Get the delimiter that was found.
var strMatchedDelimiter = arrMatches[ 1 ];
// Check to see if the given delimiter has a length
// (is not the start of string) and if it matches
// field delimiter. If id does not, then we know
// that this delimiter is a row delimiter.
if (
strMatchedDelimiter.length &&
strMatchedDelimiter !== strDelimiter
){
// Since we have reached a new row of data,
// add an empty row to our data array.
arrData.push( [] );
}
var strMatchedValue;
// Now that we have our delimiter out of the way,
// let's check to see which kind of value we
// captured (quoted or unquoted).
if (arrMatches[ 2 ]){
// We found a quoted value. When we capture
// this value, unescape any double quotes.
strMatchedValue = arrMatches[ 2 ].replace(
new RegExp( "\"\"", "g" ),
"\""
);
} else {
// We found a non-quoted value.
strMatchedValue = arrMatches[ 3 ];
}
// Now that we have our value string, let's add
// it to the data array.
arrData[ arrData.length - 1 ].push( strMatchedValue );
}
// Return the parsed data.
return( arrData );
}
</script>
jQuery-CSV
It's a jQuery plugin designed to work as an end-to-end solution for parsing CSV into JavaScript data. It handles every single edge case presented in RFC 4180, as well as some that pop up for Excel/Google spreadsheet exports (i.e., mostly involving null values) that the specification is missing.
Example:
track,artist,album,year
Dangerous,'Busta Rhymes','When Disaster Strikes',1997
// Calling this
music = $.csv.toArrays(csv)
// Outputs...
[
["track", "artist", "album", "year"],
["Dangerous", "Busta Rhymes", "When Disaster Strikes", "1997"]
]
console.log(music[1][2]) // Outputs: 'When Disaster Strikes'
Update:
Oh yeah, I should also probably mention that it's completely configurable.
music = $.csv.toArrays(csv, {
delimiter: "'", // Sets a custom value delimiter character
separator: ';', // Sets a custom field separator character
});
Update 2:
It now works with jQuery on Node.js too. So you have the option of doing either client-side or server-side parsing with the same library.
Update 3:
Since the Google Code shutdown, jquery-csv has been migrated to GitHub.
Disclaimer: I am also the author of jQuery-CSV.
Here's an extremely simple CSV parser that handles quoted fields with commas, new lines, and escaped double quotation marks. There's no splitting or regular expression. It scans the input string 1-2 characters at a time and builds an array.
Test it at http://jsfiddle.net/vHKYH/.
function parseCSV(str) {
var arr = [];
var quote = false; // 'true' means we're inside a quoted field
// Iterate over each character, keep track of current row and column (of the returned array)
for (var row = 0, col = 0, c = 0; c < str.length; c++) {
var cc = str[c], nc = str[c+1]; // Current character, next character
arr[row] = arr[row] || []; // Create a new row if necessary
arr[row][col] = arr[row][col] || ''; // Create a new column (start with empty string) if necessary
// If the current character is a quotation mark, and we're inside a
// quoted field, and the next character is also a quotation mark,
// add a quotation mark to the current column and skip the next character
if (cc == '"' && quote && nc == '"') { arr[row][col] += cc; ++c; continue; }
// If it's just one quotation mark, begin/end quoted field
if (cc == '"') { quote = !quote; continue; }
// If it's a comma and we're not in a quoted field, move on to the next column
if (cc == ',' && !quote) { ++col; continue; }
// If it's a newline (CRLF) and we're not in a quoted field, skip the next character
// and move on to the next row and move to column 0 of that new row
if (cc == '\r' && nc == '\n' && !quote) { ++row; col = 0; ++c; continue; }
// If it's a newline (LF or CR) and we're not in a quoted field,
// move on to the next row and move to column 0 of that new row
if (cc == '\n' && !quote) { ++row; col = 0; continue; }
if (cc == '\r' && !quote) { ++row; col = 0; continue; }
// Otherwise, append the current character to the current column
arr[row][col] += cc;
}
return arr;
}
I have an implementation as part of a spreadsheet project.
This code is not yet tested thoroughly, but anyone is welcome to use it.
As some of the answers noted though, your implementation can be much simpler if you actually have DSV or TSV file, as they disallow the use of the record and field separators in the values. CSV, on the other hand, can actually have commas and newlines inside a field, which breaks most regular expression and split-based approaches.
var CSV = {
parse: function(csv, reviver) {
reviver = reviver || function(r, c, v) { return v; };
var chars = csv.split(''), c = 0, cc = chars.length, start, end, table = [], row;
while (c < cc) {
table.push(row = []);
while (c < cc && '\r' !== chars[c] && '\n' !== chars[c]) {
start = end = c;
if ('"' === chars[c]){
start = end = ++c;
while (c < cc) {
if ('"' === chars[c]) {
if ('"' !== chars[c+1]) {
break;
}
else {
chars[++c] = ''; // unescape ""
}
}
end = ++c;
}
if ('"' === chars[c]) {
++c;
}
while (c < cc && '\r' !== chars[c] && '\n' !== chars[c] && ',' !== chars[c]) {
++c;
}
} else {
while (c < cc && '\r' !== chars[c] && '\n' !== chars[c] && ',' !== chars[c]) {
end = ++c;
}
}
row.push(reviver(table.length-1, row.length, chars.slice(start, end).join('')));
if (',' === chars[c]) {
++c;
}
}
if ('\r' === chars[c]) {
++c;
}
if ('\n' === chars[c]) {
++c;
}
}
return table;
},
stringify: function(table, replacer) {
replacer = replacer || function(r, c, v) { return v; };
var csv = '', c, cc, r, rr = table.length, cell;
for (r = 0; r < rr; ++r) {
if (r) {
csv += '\r\n';
}
for (c = 0, cc = table[r].length; c < cc; ++c) {
if (c) {
csv += ',';
}
cell = replacer(r, c, table[r][c]);
if (/[,\r\n"]/.test(cell)) {
cell = '"' + cell.replace(/"/g, '""') + '"';
}
csv += (cell || 0 === cell) ? cell : '';
}
}
return csv;
}
};
csvToArray v1.3
A compact (645 bytes), but compliant function to convert a CSV string into a 2D array, conforming to the RFC4180 standard.
https://code.google.com/archive/p/csv-to-array/downloads
Common Usage: jQuery
$.ajax({
url: "test.csv",
dataType: 'text',
cache: false
}).done(function(csvAsString){
csvAsArray=csvAsString.csvToArray();
});
Common usage: JavaScript
csvAsArray = csvAsString.csvToArray();
Override field separator
csvAsArray = csvAsString.csvToArray("|");
Override record separator
csvAsArray = csvAsString.csvToArray("", "#");
Override Skip Header
csvAsArray = csvAsString.csvToArray("", "", 1);
Override all
csvAsArray = csvAsString.csvToArray("|", "#", 1);
Here's my PEG(.js) grammar that seems to do ok at RFC 4180 (i.e. it handles the examples at http://en.wikipedia.org/wiki/Comma-separated_values):
start
= [\n\r]* first:line rest:([\n\r]+ data:line { return data; })* [\n\r]* { rest.unshift(first); return rest; }
line
= first:field rest:("," text:field { return text; })*
& { return !!first || rest.length; } // ignore blank lines
{ rest.unshift(first); return rest; }
field
= '"' text:char* '"' { return text.join(''); }
/ text:[^\n\r,]* { return text.join(''); }
char
= '"' '"' { return '"'; }
/ [^"]
Try it out at http://jsfiddle.net/knvzk/10 or http://pegjs.majda.cz/online. Download the generated parser at https://gist.github.com/3362830.
Here's another solution. This uses:
a coarse global regular expression for splitting the CSV string (which includes surrounding quotes and trailing commas)
fine-grained regular expression for cleaning up the surrounding quotes and trailing commas
also, has type correction differentiating strings, numbers, boolean values and null values
For the following input string:
"This is\, a value",Hello,4,-123,3.1415,'This is also\, possible',true,
The code outputs:
[
"This is, a value",
"Hello",
4,
-123,
3.1415,
"This is also, possible",
true,
null
]
Here's my implementation of parseCSVLine() in a runnable code snippet:
function parseCSVLine(text) {
return text.match( /\s*(\"[^"]*\"|'[^']*'|[^,]*)\s*(,|$)/g ).map( function (text) {
let m;
if (m = text.match(/^\s*,?$/)) return null; // null value
if (m = text.match(/^\s*\"([^"]*)\"\s*,?$/)) return m[1]; // Double Quoted Text
if (m = text.match(/^\s*'([^']*)'\s*,?$/)) return m[1]; // Single Quoted Text
if (m = text.match(/^\s*(true|false)\s*,?$/)) return m[1] === "true"; // Boolean
if (m = text.match(/^\s*((?:\+|\-)?\d+)\s*,?$/)) return parseInt(m[1]); // Integer Number
if (m = text.match(/^\s*((?:\+|\-)?\d*\.\d*)\s*,?$/)) return parseFloat(m[1]); // Floating Number
if (m = text.match(/^\s*(.*?)\s*,?$/)) return m[1]; // Unquoted Text
return text;
} );
}
let data = `"This is\, a value",Hello,4,-123,3.1415,'This is also\, possible',true,`;
let obj = parseCSVLine(data);
console.log( JSON.stringify( obj, undefined, 2 ) );
Here's my simple vanilla JavaScript code:
let a = 'one,two,"three, but with a comma",four,"five, with ""quotes"" in it.."'
console.log(splitQuotes(a))
function splitQuotes(line) {
if(line.indexOf('"') < 0)
return line.split(',')
let result = [], cell = '', quote = false;
for(let i = 0; i < line.length; i++) {
char = line[i]
if(char == '"' && line[i+1] == '"') {
cell += char
i++
} else if(char == '"') {
quote = !quote;
} else if(!quote && char == ',') {
result.push(cell)
cell = ''
} else {
cell += char
}
if ( i == line.length-1 && cell) {
result.push(cell)
}
}
return result
}
I'm not sure why I couldn't get Kirtan's example to work for me. It seemed to be failing on empty fields or maybe fields with trailing commas...
This one seems to handle both.
I did not write the parser code, just a wrapper around the parser function to make this work for a file. See attribution.
var Strings = {
/**
* Wrapped CSV line parser
* #param s String delimited CSV string
* #param sep Separator override
* #attribution: http://www.greywyvern.com/?post=258 (comments closed on blog :( )
*/
parseCSV : function(s,sep) {
// http://stackoverflow.com/questions/1155678/javascript-string-newline-character
var universalNewline = /\r\n|\r|\n/g;
var a = s.split(universalNewline);
for(var i in a){
for (var f = a[i].split(sep = sep || ","), x = f.length - 1, tl; x >= 0; x--) {
if (f[x].replace(/"\s+$/, '"').charAt(f[x].length - 1) == '"') {
if ((tl = f[x].replace(/^\s+"/, '"')).length > 1 && tl.charAt(0) == '"') {
f[x] = f[x].replace(/^\s*"|"\s*$/g, '').replace(/""/g, '"');
} else if (x) {
f.splice(x - 1, 2, [f[x - 1], f[x]].join(sep));
} else f = f.shift().split(sep).concat(f);
} else f[x].replace(/""/g, '"');
} a[i] = f;
}
return a;
}
}
Regular expressions to the rescue! These few lines of code handle properly quoted fields with embedded commas, quotes, and newlines based on the RFC 4180 standard.
function parseCsv(data, fieldSep, newLine) {
fieldSep = fieldSep || ',';
newLine = newLine || '\n';
var nSep = '\x1D';
var qSep = '\x1E';
var cSep = '\x1F';
var nSepRe = new RegExp(nSep, 'g');
var qSepRe = new RegExp(qSep, 'g');
var cSepRe = new RegExp(cSep, 'g');
var fieldRe = new RegExp('(?<=(^|[' + fieldSep + '\\n]))"(|[\\s\\S]+?(?<![^"]"))"(?=($|[' + fieldSep + '\\n]))', 'g');
var grid = [];
data.replace(/\r/g, '').replace(/\n+$/, '').replace(fieldRe, function(match, p1, p2) {
return p2.replace(/\n/g, nSep).replace(/""/g, qSep).replace(/,/g, cSep);
}).split(/\n/).forEach(function(line) {
var row = line.split(fieldSep).map(function(cell) {
return cell.replace(nSepRe, newLine).replace(qSepRe, '"').replace(cSepRe, ',');
});
grid.push(row);
});
return grid;
}
const csv = 'A1,B1,C1\n"A ""2""","B, 2","C\n2"';
const separator = ','; // field separator, default: ','
const newline = ' <br /> '; // newline representation in case a field contains newlines, default: '\n'
var grid = parseCsv(csv, separator, newline);
// expected: [ [ 'A1', 'B1', 'C1' ], [ 'A "2"', 'B, 2', 'C <br /> 2' ] ]
You don't need a parser-generator such as lex/yacc. The regular expression handles RFC 4180 properly thanks to positive lookbehind, negative lookbehind, and positive lookahead.
Clone/download code at https://github.com/peterthoeny/parse-csv-js
Just throwing this out there.. I recently ran into the need to parse CSV columns with Javascript, and I opted for my own simple solution. It works for my needs, and may help someone else.
const csvString = '"Some text, some text",,"",true,false,"more text","more,text, more, text ",true';
const parseCSV = text => {
const lines = text.split('\n');
const output = [];
lines.forEach(line => {
line = line.trim();
if (line.length === 0) return;
const skipIndexes = {};
const columns = line.split(',');
output.push(columns.reduce((result, item, index) => {
if (skipIndexes[index]) return result;
if (item.startsWith('"') && !item.endsWith('"')) {
while (!columns[index + 1].endsWith('"')) {
index++;
item += `,${columns[index]}`;
skipIndexes[index] = true;
}
index++;
skipIndexes[index] = true;
item += `,${columns[index]}`;
}
result.push(item);
return result;
}, []));
});
return output;
};
console.log(parseCSV(csvString));
Personally I like to use deno std library since most modules are officially compatible with the browser
The problem is that the std is in typescript but official solution might happen in the future https://github.com/denoland/deno_std/issues/641 https://github.com/denoland/dotland/issues/1728
For now there is an actively maintained on the fly transpiler https://bundle.deno.dev/
so you can use it simply like this
<script type="module">
import { parse } from "https://bundle.deno.dev/https://deno.land/std#0.126.0/encoding/csv.ts"
console.log(await parse("a,b,c\n1,2,3"))
</script>
I have constructed this JavaScript script to parse a CSV in string to array object. I find it better to break down the whole CSV into lines, fields and process them accordingly. I think that it will make it easy for you to change the code to suit your need.
//
//
// CSV to object
//
//
const new_line_char = '\n';
const field_separator_char = ',';
function parse_csv(csv_str) {
var result = [];
let line_end_index_moved = false;
let line_start_index = 0;
let line_end_index = 0;
let csr_index = 0;
let cursor_val = csv_str[csr_index];
let found_new_line_char = get_new_line_char(csv_str);
let in_quote = false;
// Handle \r\n
if (found_new_line_char == '\r\n') {
csv_str = csv_str.split(found_new_line_char).join(new_line_char);
}
// Handle the last character is not \n
if (csv_str[csv_str.length - 1] !== new_line_char) {
csv_str += new_line_char;
}
while (csr_index < csv_str.length) {
if (cursor_val === '"') {
in_quote = !in_quote;
} else if (cursor_val === new_line_char) {
if (in_quote === false) {
if (line_end_index_moved && (line_start_index <= line_end_index)) {
result.push(parse_csv_line(csv_str.substring(line_start_index, line_end_index)));
line_start_index = csr_index + 1;
} // Else: just ignore line_end_index has not moved or line has not been sliced for parsing the line
} // Else: just ignore because we are in a quote
}
csr_index++;
cursor_val = csv_str[csr_index];
line_end_index = csr_index;
line_end_index_moved = true;
}
// Handle \r\n
if (found_new_line_char == '\r\n') {
let new_result = [];
let curr_row;
for (var i = 0; i < result.length; i++) {
curr_row = [];
for (var j = 0; j < result[i].length; j++) {
curr_row.push(result[i][j].split(new_line_char).join('\r\n'));
}
new_result.push(curr_row);
}
result = new_result;
}
return result;
}
function parse_csv_line(csv_line_str) {
var result = [];
//let field_end_index_moved = false;
let field_start_index = 0;
let field_end_index = 0;
let csr_index = 0;
let cursor_val = csv_line_str[csr_index];
let in_quote = false;
// Pretend that the last char is the separator_char to complete the loop
csv_line_str += field_separator_char;
while (csr_index < csv_line_str.length) {
if (cursor_val === '"') {
in_quote = !in_quote;
} else if (cursor_val === field_separator_char) {
if (in_quote === false) {
if (field_start_index <= field_end_index) {
result.push(parse_csv_field(csv_line_str.substring(field_start_index, field_end_index)));
field_start_index = csr_index + 1;
} // Else: just ignore field_end_index has not moved or field has not been sliced for parsing the field
} // Else: just ignore because we are in quote
}
csr_index++;
cursor_val = csv_line_str[csr_index];
field_end_index = csr_index;
field_end_index_moved = true;
}
return result;
}
function parse_csv_field(csv_field_str) {
with_quote = (csv_field_str[0] === '"');
if (with_quote) {
csv_field_str = csv_field_str.substring(1, csv_field_str.length - 1); // remove the start and end quotes
csv_field_str = csv_field_str.split('""').join('"'); // handle double quotes
}
return csv_field_str;
}
// Initial method: check the first newline character only
function get_new_line_char(csv_str) {
if (csv_str.indexOf('\r\n') > -1) {
return '\r\n';
} else {
return '\n'
}
}
Just use .split(','):
var str = "How are you doing today?";
var n = str.split(" ");

Categories

Resources