Node.js: remove specific columns from CSV file

I have a CSV file that can contain around a million records. How can I remove columns whose names start with _ and generate a resulting CSV?
For the sake of simplicity, consider the CSV below:
Sr.No Col1 Col2 _Col3 Col4 _Col5
1 txt png 676766 win 8787
2 jpg pdf 565657 lin 8787
3 pdf jpg 786786 lin 9898
I would want the output to be:
Sr.No Col1 Col2 Col4
1 txt png win
2 jpg pdf lin
3 pdf jpg lin
Do I need to read the entire file to achieve this, or is there a better approach?
const csv = require('csv-parser');
const fs = require('fs');

fs.createReadStream('data.csv')
  .pipe(csv())
  .on('data', (row) => {
    // generate a new csv, removing the specific columns
  })
  .on('end', () => {
    console.log('CSV file successfully processed');
  });
Any help on how I can achieve this would be appreciated.
Thanks.

To anyone who stumbles on this post:
I was able to transform the CSVs with the code below, using the fs and csv modules. Note that the pipeline is a stream, not a promise, so completion is signalled by the finish/error events rather than await.
fs.createReadStream(m.path)
  .pipe(csv.parse({ delimiter: '\t', columns: true }))
  .pipe(csv.transform((input) => {
    delete input['_Col3'];
    console.log(input);
    return input;
  }))
  .pipe(csv.stringify({ header: true }))
  .pipe(fs.createWriteStream(transformedPath))
  .on('finish', () => {
    console.log('finish....');
  })
  .on('error', (err) => {
    console.log('error.....', err);
  });
Source: https://gist.github.com/donmccurdy/6cbcd8cee74301f92b4400b376efda1d
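Since the original question was about removing every column that starts with _, the same pipeline can be generalized by deleting keys dynamically instead of hardcoding _Col3. A sketch (file names are illustrative; set the delimiter to whatever your file uses):

const fs = require('fs');
const csv = require('csv');

fs.createReadStream('data.csv')
  .pipe(csv.parse({ columns: true }))
  .pipe(csv.transform((record) => {
    // drop every column whose header starts with an underscore
    for (const key of Object.keys(record)) {
      if (key.startsWith('_')) {
        delete record[key];
      }
    }
    return record;
  }))
  .pipe(csv.stringify({ header: true }))
  .pipe(fs.createWriteStream('transformed.csv'))
  .on('finish', () => console.log('done'))
  .on('error', (err) => console.error(err));

Because the rows stream through one at a time, the million-row file never has to be loaded into memory at once, which also answers the "do I need to read the entire file" part of the question.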

Actually, you can handle this by using two npm packages.
https://www.npmjs.com/package/csvtojson
to convert your CSV to JSON format, then use
https://www.npmjs.com/package/json2csv
to go back. With the second library, if you know the exact fields you want, you can pass options to select them specifically:
const { Parser } = require('json2csv');

const fields = ['field1', 'field2', 'field3'];
const opts = { fields };

try {
  const parser = new Parser(opts);
  const csv = parser.parse(myData);
  console.log(csv);
} catch (err) {
  console.error(err);
}
Or you can modify the JSON objects manually to drop those columns.
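For example, stripping the underscore-prefixed columns could look like this (a sketch; myData is assumed to be the array of row objects produced by csvtojson):

const { Parser } = require('json2csv');

// keep only the keys that do not start with an underscore
// (assumption: myData is the row array that csvtojson produced)
const cleaned = myData.map((row) =>
  Object.fromEntries(
    Object.entries(row).filter(([key]) => !key.startsWith('_'))
  )
);

const parser = new Parser(); // fields are inferred from the first row
console.log(parser.parse(cleaned));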

Try this with the csv lib:
const csv = require('csv');
const fs = require('fs');

const csvString = `col1,col2
value1,value2`;

csv.parse(csvString, { columns: true })
  .pipe(csv.transform(({ col1, col2 }) => ({ col1 }))) // remove col2
  .pipe(csv.stringify({ header: true }))
  .pipe(fs.createWriteStream('./file.csv'));

With this function I accomplished the column removal from a CSV string:
function removeCol(csv, col) {
  // note: naive comma split; this won't handle quoted fields that contain commas
  let lines = csv.split("\n");
  let headers = lines[0].split(",");
  // position of the column to remove (header comparison ignores surrounding whitespace)
  let index = headers.findIndex(h => h.trim() === col);
  if (index === -1) return csv; // column not found, leave the data untouched
  let newLines = [];
  lines.forEach((line) => {
    let fields = line.split(",");
    fields.splice(index, 1);
    newLines.push(fields);
  });
  let arrData = '';
  for (let i = 0; i < newLines.length; i++) {
    arrData += newLines[i].join(',') + '\n';
  }
  return arrData;
}
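It could be used like this (file names are illustrative):

const fs = require('fs');

const input = fs.readFileSync('data.csv', 'utf8');
const cleaned = removeCol(input, '_Col3');
fs.writeFileSync('data-clean.csv', cleaned);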

Related

csvtojson node.js (combine two codes)

How do I combine these two pieces of code, so that it doesn't just convert CSV to JSON (first code), but also saves the result as a JSON array in a separate file (second code)?
This (first) code converts a CSV file to a JSON array:
const fs = require("fs");
const csvtojson = require("csvtojson");

let fileReadStream = fs.createReadStream("myCsvFile.csv");
let invalidLineCount = 0;

csvtojson({ "delimiter": ";", "fork": true })
  .preFileLine((fileLineString, lineIdx) => {
    let invalidLinePattern = /^['"].*[^"'];/;
    if (invalidLinePattern.test(fileLineString)) {
      console.log(`Line #${lineIdx + 1} is invalid, skipping:`, fileLineString);
      fileLineString = "";
      invalidLineCount++;
    }
    return fileLineString;
  })
  .fromStream(fileReadStream)
  .subscribe((dataObj) => {
    console.log(dataObj);
    // I added the second code here, but it writes only the last object of the array (because of the loop?)
  });
and this (second) code saves the JSON array to an external file:
fs.writeFile('example.json', JSON.stringify(dataObj, null, 4), (err) => { if (err) throw err; });
The question is how to put the second code into the first (combine them)?
You can use the .on('done', (error) => { ... }) method (csvtojson). Push the data into a variable in the subscribe method, and write the data as JSON in .on('done') (tested successfully).
Check it out:
const fs = require("fs");
const csvtojson = require("csvtojson"); // same requires as in the question

let fileReadStream = fs.createReadStream("username-password.csv");
let invalidLineCount = 0;
let data = [];

csvtojson({ "delimiter": ";", "fork": true })
  .preFileLine((fileLineString, lineIdx) => {
    let invalidLinePattern = /^['"].*[^"'];/;
    if (invalidLinePattern.test(fileLineString)) {
      console.log(`Line #${lineIdx + 1} is invalid, skipping:`, fileLineString);
      fileLineString = "";
      invalidLineCount++;
    }
    return fileLineString;
  })
  .fromStream(fileReadStream)
  .subscribe((dataObj) => {
    // collect every row instead of writing inside the loop
    data.push(dataObj);
  })
  .on('done', (error) => {
    fs.writeFileSync('example.json', JSON.stringify(data, null, 4));
  });
Not sure if you are able to change the library, but I would definitely recommend Papaparse for this: https://www.npmjs.com/package/papaparse
Your code would then look something like this:
const fs = require('fs'), papa = require('papaparse');

var readFile = fs.createReadStream(file);
papa.parse(readFile, {
  complete: function (results, file) {
    fs.writeFile('example.json', JSON.stringify(results.data), function (err) {
      if (err) console.log(err);
      // callback etc.
    });
  }
});

Javascript to download and process GTFS zip file

I'm trying to download, unzip, and process a GTFS file in zip format. Downloading and unzipping are working, but I get an error message when I try to use the txt files with the gtfs-utils module in gtfsFunc(). The output is undefined. Delays are hardcoded just for testing purposes.
// requires for the modules used below (module names are an assumption:
// node-downloader-helper, adm-zip, gtfs-utils)
const { DownloaderHelper } = require('node-downloader-helper');
const AdmZip = require('adm-zip');
const readCsv = require('gtfs-utils/read-csv');
const readStops = require('gtfs-utils/read-stops');

const dl = new DownloaderHelper('http://www.bkk.hu/gtfs/budapest_gtfs.zip', __dirname);
dl.on('end', () => console.log('Download Completed'));
dl.start();

myVar = setTimeout(zipFunc, 30000);
function zipFunc() {
  console.log('Unzipping started...');
  var zip = new AdmZip("./budapest_gtfs.zip");
  var zipEntries = zip.getEntries();
  zip.extractAllTo("./gtfsdata/", true);
}

myVar = setTimeout(gtfsFunc, 40000);
function gtfsFunc() {
  console.log('Processing started...');
  const readFile = name => readCsv('./gtfsdata/' + name + '.txt');
  const filter = t => t.route_id === 'M4';
  readStops(readFile, filter)
    .then((stops) => {
      const someStopId = Object.keys(stops)[0];
      const someStop = stops[someStopId];
      console.log(someStop);
    });
}
As #ChickenSoups said, you are trying to filter the stops file by the route_id field, and that txt doesn't have this field.
The fields that stops has are:
stop_id, stop_name, stop_lat, stop_lon, stop_code, location_type, parent_station, wheelchair_boarding, stop_direction
Perhaps what you need is to read the Trips.txt file instead of Stops.txt, as that file has a route_id field.
You can accomplish this using the readTrips function:
const readTrips = require("gtfs-utils/read-trips");
And your gtfsFunc would be:
function gtfsFunc() {
  console.log("Processing started...");
  const readFile = (name) => {
    return readCsv("./gtfsdata/" + name + ".txt").on("error", console.error);
  };
  // 5200 is used because your Trips.txt contains route_ids with this value
  const filterTrips = (t) => t.route_id === "5200";
  readTrips(readFile, filterTrips).then((trips) => {
    console.log("filtered trips", trips);
    const someTripId = Object.keys(trips)[0];
    const someTrip = trips[someTripId];
    console.log("someTrip", someTrip);
  });
}
Or, if what you really want is to read Stops.txt, you just need to change your filter
const filter = t => t.route_id === 'M4'
to use some valid field, for example:
const filter = t => t.stop_name === 'M4'
Stop data doesn't have a route_id field.
You should try other data, such as Trips or Routes.
You can look at the first row in your data file to see which fields it has; a quick sketch for that follows below.
GTFS data structure here
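For example, printing the header row of one of the extracted files (a sketch; the path assumes the ./gtfsdata/ extract directory from the question):

const fs = require('fs');

// the first line of any GTFS txt file lists its column names
const header = fs.readFileSync('./gtfsdata/stops.txt', 'utf8').split('\n')[0];
console.log(header.split(','));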

How to populate new date column to existing CSV file

I am working with a CSV file (~20 MB). It has three columns, as shown below, and the file name encodes the start time, i.e. the timestamp of the first row (in this case the file name is 20200325131010000.csv).
x;y;z
3;-132;976
3;-131;978
3;-130;975
4;-132;975
5;-131;976
3;-132;975
The difference between the timestamps of consecutive rows is 20 ms. How can I efficiently populate the new timestamp column in the existing file? The final CSV file should look like this:
timestamp;x;y;z
20200325131010000;3;-132;976
20200325131010020;3;-131;978
20200325131010040;3;-130;975
20200325131010060;4;-132;975
20200325131010080;5;-131;976
20200325131010100;3;-132;975
So far I have tried the following code:
const csv = require('csv-parser');
const fs = require('fs');
var json2csv = require('json2csv').parse;

var dataArray = [];

fs.createReadStream('C:/Users/Downloads/20200325131010000.csv')
  .pipe(csv())
  .on('data', (row) => {
    row.timestamp = "20200325131010000";
    dataArray.push(row);
  })
  .on('end', () => {
    var result = json2csv({ data: dataArray, fields: Object.keys(dataArray[0]) });
    fs.writeFileSync("test.csv", result);
  });
The above code generates the following output (all timestamps are the same, which is not desirable):
timestamp;x;y;z
20200325131010000;3;-132;976
20200325131010000;3;-131;978
20200325131010000;3;-130;975
20200325131010000;4;-132;975
20200325131010000;5;-131;976
20200325131010000;3;-132;975
The problem with this code is that it adds the same timestamp (i.e. 20200325131010000) to all the rows. How can I correctly populate the timestamp column?
Kindly suggest an efficient way to achieve this.
I think this code should work and solve your problem.
const csv = require('csv-parser');
const fs = require('fs');
var json2csv = require('json2csv').parse;

const filePath = 'C:/Users/Downloads/20200325131010000.csv';
const fileName = (/^.+\/([^/]+)\.csv$/gi).exec(filePath)[1]; // extract file name from path
const initialTimestamp = parseInt(fileName); // convert string to number
let i = 0;
var dataArray = [];

fs.createReadStream(filePath)
  .pipe(csv())
  .on('data', (row) => {
    row.timestamp = (initialTimestamp + (i * 20)).toString(); // increase the number by 20 every time
    dataArray.push(row);
    i++;
  })
  .on('end', () => {
    var result = json2csv({ data: dataArray, fields: Object.keys(dataArray[0]) });
    fs.writeFileSync("test.csv", result);
  });
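One caveat worth noting: the file name is really a YYYYMMDDHHMMSSmmm datetime, so plain numeric addition only works while the added milliseconds never cross a minute boundary (e.g. 20200325135959980 + 20 gives 20200325136000000, not the correct 20200325140000000), and a 17-digit value also exceeds Number.MAX_SAFE_INTEGER, so numeric precision is not guaranteed in general. A rollover-safe sketch would go through a Date instead (the helper names below are mine, not from the answer):

// parse "YYYYMMDDHHMMSSmmm" into a UTC Date
function timestampToDate(ts) {
  return new Date(Date.UTC(
    +ts.slice(0, 4),       // year
    +ts.slice(4, 6) - 1,   // month (0-based in JS dates)
    +ts.slice(6, 8),       // day
    +ts.slice(8, 10),      // hours
    +ts.slice(10, 12),     // minutes
    +ts.slice(12, 14),     // seconds
    +ts.slice(14, 17)      // milliseconds
  ));
}

const pad = (n, width) => String(n).padStart(width, '0');

// format a Date back into the 17-digit string
function dateToTimestamp(d) {
  return pad(d.getUTCFullYear(), 4) + pad(d.getUTCMonth() + 1, 2) +
    pad(d.getUTCDate(), 2) + pad(d.getUTCHours(), 2) +
    pad(d.getUTCMinutes(), 2) + pad(d.getUTCSeconds(), 2) +
    pad(d.getUTCMilliseconds(), 3);
}

// in the 'data' handler, replace the numeric addition with:
//   const start = timestampToDate(fileName);
//   row.timestamp = dateToTimestamp(new Date(start.getTime() + i * 20));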

How do you insert line breaks inside Excel cell

I have to parse a CSV file of test cases exported from Zephyr for Jira. I need to format them in a way that I can import them into a new test management system. The new format requires test steps to be in one cell, separated by a line break, like so:
[image: cell formatting example]
Here is the script I have written to do the parsing. The steps and expected results are all added to the same cell, but without the line breaks.
const fs = require("fs");
const Papa = require("papaparse");
// ******************* user defined values ******************* //
// file to parse
const file = process.argv[2];
// ******************* parse csv ******************* //
const parsedTests = [];
// create file object
const fileObject = fs.readFileSync(file, "utf8");
// set config
const config = {
header: true,
complete: function(results) {
return results;
}
};
// parse and save results
const results = Papa.parse(fileObject, config);
const arr = Object.values(results.data);
function addTests(parsedTests, test) {
const found = parsedTests.some(el => el.ExecutionId === test.ExecutionId);
if (!found) {
parsedTests.push(test);
}
return parsedTests;
}
function addSteps(parstedTests, test, el) {
const found = parsedTests.some(el => el.ExecutionId === test.ExecutionId);
if (found && test.OrderId > 1) {
test.Step += `${test.Step}\n\n`;
test.ExpectedResults += `${test.ExpectedResults}\n\n`;
}
}
arr.forEach(el => {
addTests(parsedTests, el);
});
arr.forEach(el => {
addSteps(parsedTests, el);
});
console.log(parsedTests)
// write back to csv
fs.writeFileSync("./parsedtests.csv", Papa.unparse(parsedTests));

Read lines of a txt file and organize in a JSON file

I have a text file where each line is separated into 4 categories by colons, and I want to put this into a JSON file, where each category becomes the value of the corresponding name in the JSON file.
Example data.txt file:
Date1:cat1:dog1:bug1
Date2:cat2:dog2:bug2
Date3:cat3:dog3:bug3
Example JSON file:
{
  "Date1": {
    "cat": "cat1",
    "dog": "dog1",
    "bug": "bug1"
  },
  "Date2": {
    "cat": "cat2",
    "dog": "dog2",
    "bug": "bug2"
  },
  ...
}
I've never used JSON before, but I think that's how to format it. How would I split each line using the colons as markers for the next value, and store it in the JSON file under the correct name, using JavaScript and Node.js?
Use the csv package if you don't want to handle parsing the CSV file yourself.
const fs = require("fs");
const csv = require("csv");
const result = {};
const keys = ["cat", "dog", "bug"]
// Read data
const readStream = fs.createReadStream("yourfile.txt");
// Parser
const parser = csv.parse({ delimiter: ":" });
parser.on("data", (chunk) => {
result[chunk[0]] = {};
for(let i = 1; i < chunk.length; i ++) {
result[chunk[0]][keys[i - 1]] = chunk[i];
}
});
parser.on("end", () => {
console.log(result);
});
readStream.pipe(parser);
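To also write the result to a file, as the question asks, the end handler from the snippet above could be extended like this (the output file name is illustrative):

parser.on("end", () => {
  // persist the accumulated object as pretty-printed JSON
  fs.writeFileSync("data.json", JSON.stringify(result, null, 2));
});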
If your JSON has this defined structure you can go about it with the following code:
import * as fs from 'fs';

/* If you have a large file this is a bad idea; refer to reading from a stream
 * from zhangjinzhou's answer.
 */
const file = fs.readFileSync('path/to/data.txt', 'utf8');
const json = file.split(/\n|\r\n/).map(line => {
  const values = line.split(":");
  let obj = {};
  obj[values[0]] = {
    cat: values[1],
    dog: values[2],
    bug: values[3],
  };
  return obj;
}).reduce((acc, current) => Object.assign(acc, current), {});
Using RegExp and Array#forEach, convert the string to lines, then iterate over them and fill up the object with the corresponding data via:
const dataFileContent = `Date1:cat1:dog1:bug1
Date2:cat2:dog2:bug2
Date3:cat3:dog3:bug3`;

function processData(data) {
  // convert to lines
  const lines = data.match(/[^\r\n]+/g) || [];
  const object = {};
  // iterate over the lines
  lines.forEach(line => {
    const parts = line.split(':');
    const main = parts.shift();
    const pattern = /^(.*?)(\d+)$/;
    // create an object for each main part
    object[main] = {};
    // fill each main part with the sub parts
    parts.forEach(part => {
      const match = part.match(pattern);
      // guard before indexing; the original checked afterwards, when match could be null
      if (match) {
        const key = match[1];
        const value = match[2];
        object[main][key] = key + value;
      }
    });
  });
  return object;
}

const processedData = processData(dataFileContent);
console.log(processedData);
Then convert the processedData to JSON by using JSON.stringify and save it to a file via:
const fs = require('fs');

// ... processData as above ...

const json = JSON.stringify(processedData);
fs.writeFile('my_json_file.json', json, 'utf8', (err) => {
  if (err) throw err; // fs.writeFile requires a callback in current Node versions
});
For larger files, consider using Streams in Node.js as suggested by #zhangjinzhou.
