I have a function that scans a directory and creates a JSON file with the audio files metadata. I want it to check if the file already exists and only overwrite if there is any diference between the file that was created from the last time the script was run and the data from the the second time it runs.
This is my code:
var fs = require('fs');
var nodeID3 = require('node-id3');
var path = require('path');
var tracksPath = './public/tracks/';
var dataPath = './public/data/';
fs.readdir(tracksPath,function(err,files){
if(err) {
throw err;
}
//Read the tracks metadata
var tracksMetadata = [];
files.forEach(function(trackName){
var trackFile = nodeID3.read(tracksPath + trackName);
//If the track returns metadata push it to the array
if (trackFile.title && trackFile.artist){
var metadata = {
"filename" : trackName,
"title" : trackFile.title,
"artist" : trackFile.artist
};
tracksMetadata.push(metadata);
}
//If no metadata is found ignore and log it to the console
else if (trackName.charAt(0) != "."){
var filename = {
"filename" : trackName
};
tracksMetadata.push(filename);
console.log(trackName + " doesn't have metadata. Ignoring.");
}
if(fs.existsSync(dataPath + "metadata.json")){
fs.readFile(dataPath + "metadata.json",'utf8', function (err, data){
if (err) throw err;
console.log(JSON.parse(JSON.stringify(data)));
console.log(JSON.parse(JSON.stringify(tracksMetadata)));
console.log(Boolean(JSON.parse(JSON.stringify(data)) == JSON.parse(JSON.stringify(tracksMetadata))));
});
}
});
fs.writeFile(path.join(dataPath, 'metadata.json'),
JSON.stringify(tracksMetadata),'utf8', function(err){
if(err){
throw err;
}
console.log("Tracks Metadata JSON created succesfully");
});
});
Right now I'm only writing to the console a Boolean value that checks wether the data from the file and the data generated by the function are equal and so far I get false.
What should I do?
Related
Trying to create a new csv file in a directory.
I want to store the data of a variable inside that csv file:
handleRequest(req, res) {
var svcReq = req.body.svcReq;
var csvRecData = JSON.stringify(req.body);
console.log("DATA WE ARE GETIING IS: " + csvRecData);
if (svcReq == 'invDetails') {
var checking = fs.writeFile('../i1/csvData/myCsvFile.csv', csvRecData, function (err) {
if (err) throw err;
console.log("Saved! got the file");
console.log("Checking csvData:" + checking);
});
}
}
I don't see any errors in the console or terminal but the file is not generated. What is my issue?
The path in writeFile should be pointed correctly..you cannot simply use "../il/csv" from your current file.First check your current directory using path.
1)Install path npm module
2)
var path = require('path');
var fs = require('fs');
console.log(path.join(__dirname))
fs.writeFile((path.join(__dirname)+"/test123.csv"), "Sally Whittaker,2018,McCarren House,312,3.75!", function(err) {
if(err) {
return console.log(err);
}
console.log("The file was saved!");
});
I'm just finishing off this basic webscraper project for a tshirt website.
It enters through one hardcoded url, the home page. It will search for any product pages, and add them to an url. If it finds another link (remainder), it will scrape that again and find any more product pages. It adds the product pages to urlSet and will then scrape those again, grab the tshirt data (price, img, title) and then convert, then write them to a CSV file.
For some reason, this is not working on the second run through of the scrape with 'remainder'.
If I remove the second scrape of url, everything works out fine and the file gets written correctly. But if I want to get the other product pages, it seems to be failing somewhere.
Here is my code, i apologise for posting so much of it but I don't know how it will be understood properly without the right context, hopefully it's been commented okay:
//TASK: Create a command line application that goes to an ecommerce site to get the latest prices.
//Save the scraped data in a spreadsheet (CSV format).
'use strict';
//Modules being used:
var cheerio = require('cheerio');
var json2csv = require('json2csv');
var request = require('request');
var moment = require('moment');
var fs = require('fs');
//harcoded url
var url = 'http://shirts4mike.com/';
//url for tshirt pages
var urlSet = new Set();
var remainder;
var tshirtArray = [];
const requestPromise = function(url) {
return new Promise(function(resolve, reject) {
request(url, function(error, response, html) {
if(error)return reject(error);
if(!error && response.statusCode == 200){
return resolve(html);
}
});
});
}
// Go into webpage via url, load html and grab links shirt in url
function scrape (url) {
console.log("Currently scraping " + url)
return requestPromise(url)
.then(function(html) {
var $ = cheerio.load(html);
var links = [];
//get all the links
$('a[href*=shirt]').each(function(){
var a = $(this).attr('href');
//add into link array
links.push(url + a);
});
// return array of links
return links;
});
}
function nextStep (arrayOfLinks) {
var promiseArray = [];
console.log(arrayOfLinks);
for(var i = 0; i < arrayOfLinks.length; i++){
promiseArray.push(requestPromise(arrayOfLinks[i]));
}
//return both the html of pages and their urls
return Promise.all(promiseArray)
.then(function(arrayOfHtml){
return {arrayOfHtml: arrayOfHtml , arrayOfUrls: arrayOfLinks};
});
}
//go through the html of each url and add to urlSet if there is a checkout button
//add to remainder otherwise to rescrape
function lastStep (obj){
for(var i = 0; i < obj.arrayOfHtml.length; i++){
var $ = cheerio.load(obj.arrayOfHtml[i]);
//if page has a submit it must be a product page
if($('[type=submit]').length !== 0){
//add page to set
urlSet.add(obj.arrayOfUrls[i]);
console.log(obj.arrayOfUrls[i]);
} else if(remainder == undefined) {
//if not a product page, add it to remainder so it another scrape can be performed.
remainder = obj.arrayOfUrls[i];
console.log("The remainder is " + remainder)
}
}
//return remainder for second run-through of scrape
return remainder;
}
//iterate through urlSet (product pages and grab html)
function lastScraperPt1(){
//call lastScraper so we can grab data from the set (product pages)
//scrape set, product pages
var promiseArray = [];
for(var item of urlSet){
var url = item;
promiseArray.push(requestPromise(url));
}
return Promise.all(promiseArray)
.then(function(arrayOfHtml){
return arrayOfHtml;
});
}
//iterate over the html of the product pages and store data as objects
function lastScraperPt2(html){
for(var i = 0; i < html.length; i++){
var $ = cheerio.load(html[i]);
//grab data and store as variables
var price = $('.price').text();
var imgURL = $('.shirt-picture').find('img').attr('src');
var title = $('body').find('.shirt-details > h1').text().slice(4);
var tshirtObject = {};
//add values into tshirt object
tshirtObject.Title = title;
tshirtObject.Price = price;
tshirtObject.ImageURL = imgURL;
tshirtObject.URL = url;
tshirtObject.Date = moment().format('MMMM Do YYYY, h:mm:ss a');
//add the object into the array of tshirts
tshirtArray.push(tshirtObject);
}
convertJson2Csv();
}
//convert tshirt objects and save as CSV file
function convertJson2Csv(){
//The scraper should generate a folder called `data` if it doesn’t exist.
var dir ='./data';
if(!fs.existsSync(dir)){
fs.mkdirSync(dir);
}
var fields = ['Title', 'Price', 'ImageURL', 'URL', 'Date'];
//convert tshirt data into CSV and pass in fields
var csv = json2csv({ data: tshirtArray, fields: fields });
//Name of file will be the date
var fileDate = moment().format('MM-DD-YY');
var fileName = dir + '/' + fileDate + '.csv';
//Write file
fs.writeFile(fileName, csv, {overwrite: true}, function(err) {
console.log('file saved');
if (err) throw err;
});
}
scrape(url) //scrape from original entry point
.then(nextStep)
.then(lastStep)
.then(scrape) //scrape again but with remainder url
.then(nextStep)
.then(lastStep)
.then(lastScraperPt1)
.then(lastScraperPt2)
.catch(function(err) {
// handle any error from any request here
console.log(err);
});
I'm console logging the arrayOfLinks in nextStep so I can see that they are being grabbed properly, I just cannot work out why they aren't being passed through to 'lastStep' properly.
Currently scraping http://shirts4mike.com/
[ 'http://shirts4mike.com/shirts.php',
'http://shirts4mike.com/shirts.php',
'http://shirts4mike.com/shirt.php?id=108',
'http://shirts4mike.com/shirt.php?id=107',
'http://shirts4mike.com/shirt.php?id=106',
'http://shirts4mike.com/shirt.php?id=105' ]
The remainder is http://shirts4mike.com/shirts.php
http://shirts4mike.com/shirt.php?id=108
http://shirts4mike.com/shirt.php?id=107
http://shirts4mike.com/shirt.php?id=106
http://shirts4mike.com/shirt.php?id=105
Currently scraping http://shirts4mike.com/shirts.php
[ 'http://shirts4mike.com/shirts.phpshirts.php',
'http://shirts4mike.com/shirts.phpshirt.php?id=101',
'http://shirts4mike.com/shirts.phpshirt.php?id=102',
'http://shirts4mike.com/shirts.phpshirt.php?id=103',
'http://shirts4mike.com/shirts.phpshirt.php?id=104',
'http://shirts4mike.com/shirts.phpshirt.php?id=105',
'http://shirts4mike.com/shirts.phpshirt.php?id=106',
'http://shirts4mike.com/shirts.phpshirt.php?id=107',
'http://shirts4mike.com/shirts.phpshirt.php?id=108' ]
BUT if I choose to only call the first scrape and don't call the second, like this:
scrape(url) //scrape from original entry point
.then(nextStep)
.then(lastStep)
.then(lastScraperPt1)
.then(lastScraperPt2)
.catch(function(err) {
// handle any error from any request here
console.log(err);
});
... Then everything works. I just don't get to all the urls.
What is happening here and how can I fix it? Thank you guys
The issue is tshirtArray is not defined in convertJson2Csv(). At lastlastScraperPt2 pass tshirtArray to convertJsonCsv()
convertJson2Csv(tshirtArray)
at convertJson2Csv
function convertJson2Csv(tshirtArray) {
// do stuff
}
One problem seems to be in your lastStep. It looks like you mean for remainder to be another array of urls. Correct me if I'm wrong there. However, what's happing is that the first time the if($('[type=submit]').length !== 0) condition fails, you'll automatically go down to the next block, because remainder start undefined. Whatever the current url is, you assign that one to remainder. For the rest of the iterations of your for-loop, you will never again hit the condition where remainder == undefined. So if you will only ever end up with one url assigned to remainder, while any more that you were hoping to get will simply be passed over.
You might want to define remainder as remainder = [];. And then instead of saying else if (remainder == undefined), you would just say
} else {
remainder.push(obj.arrayOfUrls[i]);
}
However, then you're passing an array of urls to scrape when scrape is only expecting a single url. If this is what you want and I am right in assuming that you mean for remainder to be an array of urls, you could defined a new function as follows:
function scrapeRemainders(remainders) {
var promises = [];
remainder.forEach(function (url) {
promises.push(requestPromise(url));
});
return Promise.all(promises).then(function (results) {
_.flattenDeep(results);
})
}
Then instead of the second scrape in your promise chain, you would replace it with scrapeRemainders. Also, for you the _ in the previous function, you would need to npm install lodash and then var _ = require('lodash'). On a side note, lodash has nothing to do with promises, but it is a great tool for data manipulation. You should look into it when you have the chance.
Also, in lastScraperPt1, you can change
return Promise.all(promiseArray)
.then(function(arrayOfHtml){
return arrayOfHtml;
});
to
return Promise.all(promiseArray);
It does the same thing.
Hope this helps. If this does not answer your question, comment at me and I can change my answer accordingly.
All fixed, it was grabbing the wrong urls in scrape(). Though I only knew this after I logged the statusCodes to the console :
//TASK: Create a command line application that goes to an ecommerce site to get the latest prices.
//Save the scraped data in a spreadsheet (CSV format).
'use strict';
//Modules being used:
var cheerio = require('cheerio');
var json2csv = require('json2csv');
var request = require('request');
var moment = require('moment');
var fs = require('fs');
//harcoded url
var urlHome = 'http://shirts4mike.com/';
//url for tshirt pages
var urlSet = [];
var tshirtArray = [];
const requestPromise = function(url) {
return new Promise(function(resolve, reject) {
request(url, function(error, response, html) {
if(error) {
errorHandler(error);
return reject(error);
}
if(!error && response.statusCode == 200){
return resolve(html);
}
if(response.statusCode !== 200){
console.log("response code is " + response.statusCode);
}
return resolve("");
});
});
}
// Go into webpage via url, load html and grab links shirt in url
function scrape (url) {
console.log("Currently scraping " + url)
return requestPromise(url)
.then(function(html) {
var $ = cheerio.load(html);
var links = [];
var URL = 'http://shirts4mike.com/';
//get all the links
$('a[href*=shirt]').each(function(){
var a = $(this).attr('href');
//add into link array
links.push(URL + a);
});
// return array of links
return links;
});
}
function nextStep (arrayOfLinks) {
var promiseArray = [];
console.log(arrayOfLinks);
for(var i = 0; i < arrayOfLinks.length; i++){
promiseArray.push(requestPromise(arrayOfLinks[i]));
}
//return both the html of pages and their urls
return Promise.all(promiseArray)
.then(function(arrayOfHtml){
return {arrayOfHtml: arrayOfHtml , arrayOfUrls: arrayOfLinks};
});
}
//go through the html of each url and add to urlSet if there is a checkout button
//add to remainder otherwise to rescrape
function lastStep (obj){
for(var i = 0; i < obj.arrayOfHtml.length; i++){
var $ = cheerio.load(obj.arrayOfHtml[i]);
//if page has a submit it must be a product page
if($('[type=submit]').length !== 0){
//add page to set
urlSet.push(obj.arrayOfUrls[i]);
console.log(obj.arrayOfUrls[i]);
} else if(remainder == undefined) {
//if not a product page, add it to remainder so it another scrape can be performed.
var remainder = obj.arrayOfUrls[i];
console.log("The remainder is " + remainder)
}
}
//return remainder for second run-through of scrape
return remainder;
}
//iterate through urlSet (product pages and grab html)
function lastScraperPt1(){
//call lastScraper so we can grab data from the set (product pages)
//scrape set, product pages
var promiseArray = [];
for(var item of urlSet){
var url = item;
promiseArray.push(requestPromise(url));
}
return Promise.all(promiseArray)
.then(function(arrayOfHtml){
return arrayOfHtml;
});
}
//iterate over the html of the product pages and store data as objects
function lastScraperPt2(html){
for(var i = 0; i < html.length; i++){
var $ = cheerio.load(html[i]);
//grab data and store as variables
var price = $('.price').text();
var imgURL = $('.shirt-picture').find('img').attr('src');
var title = $('body').find('.shirt-details > h1').text().slice(4);
var tshirtObject = {};
//add values into tshirt object
tshirtObject.Title = title;
tshirtObject.Price = price;
tshirtObject.ImageURL = urlHome + imgURL;
tshirtObject.URL = urlSet[i];
tshirtObject.Date = moment().format('MMMM Do YYYY, h:mm:ss a');
//add the object into the array of tshirts
tshirtArray.push(tshirtObject);
}
return tshirtArray;
}
//conver tshirt objects and save as CSV file
function convertJson2Csv(tshirtArray){
//The scraper should generate a folder called `data` if it doesn’t exist.
var dir ='./data';
if(!fs.existsSync(dir)){
fs.mkdirSync(dir);
}
var fields = ['Title', 'Price', 'ImageURL', 'URL', 'Date'];
//convert tshirt data into CSV and pass in fields
var csv = json2csv({ data: tshirtArray, fields: fields });
//Name of file will be the date
var fileDate = moment().format('MM-DD-YY');
var fileName = dir + '/' + fileDate + '.csv';
//Write file
fs.writeFile(fileName, csv, {overwrite: true}, function(err) {
console.log('file saved');
if (err) errorHandler(err);
});
}
scrape(urlHome) //scrape from original entry point
.then(nextStep)
.then(lastStep)
.then(scrape)
.then(nextStep)
.then(lastStep)
.then(lastScraperPt1)
.then(lastScraperPt2)
.then(convertJson2Csv)
.catch(function(err) {
// handle any error from any request here
console.log(err);
});
//If the site is down, an error message describing the issue should appear in the console.
//This is to be tested by disabling wifi on your device.
//When an error occurs log it to a file scraper-error.log . It should append to the bottom of the file with a time stamp and error
var errorHandler = function (error) {
console.log(error.message);
console.log('The scraper could not not scrape data from ' + url + ' there is either a problem with your internet connection or the site may be down');
/**
* create new date for log file
*/
var loggerDate = new Date();
/**
* create message as a variable
*/
var errLog = '[' + loggerDate + '] ' + error.message + '\n';
/**
*when the error occurs, log that to the error logger file
*/
fs.appendFile('scraper-error.log', errLog, function (err) {
if (err) throw err;
console.log('There was an error. The error was logged to scraper-error.log');
});
};
I have a root directory say "A" inside this directory i am having lots of directories say "1","2","3","4","5"........ and in all these subdirectories i have single file called cucumber.json. All i want to do is read the cucumber.json file and get the accumulated result. How can i achieve this using node js.
In the below screen shot my root directory is "cucumber" and inside that i have lot of sub directories. All these sub directories contains a single file named cucumber.json.
Are there any dedicated node package which can make my work easy.
Let me know if any further info is required.
Hi there please try the following (javascript):
// Require filesystem package for IO operations
var fs = require('fs');
// Put the path you are looking for here
var path = "d:\\nodef";
//Call the function defined below
recursiveloop(path, function(err,result){
/* begin processing of each result */
// For each file in the array
for(i=0;i<result.length;i++)
{
//Write the name of the file
console.log('Processing: ' + result[i]);
//Read the file
fs.readFile(result[i], 'utf8', function(err, data){
//If there is an error notify to the console
if(err) console.log('Error: ' + err);
//Parse the json object
var obj = JSON.parse(data);
//Print out contents
console.log('Name: ' + obj.name);
console.log('Position: ' + obj.position);
})
}
});
// Asynchronous function to read folders and files recursively
function recursiveloop(dir, done)
{
var results = [];
fs.readdir(dir, function(err, list){
if (err) return done(err);
var i = 0;
(function next() {
var file = list[i++];
if (!file) return done(null, results);
file = dir + '/' + file;
fs.stat(file, function(err, stat) {
if (stat && stat.isDirectory()) {
recursiveloop(file, function(err, res) {
results = results.concat(res);
next();
});
} else {
results.push(file);
next();
}
});
})();
});
}
Trying to allow users to upload image files to the Node.js server in a MEAN Stack application. I am using ng-file-upload for the client side angular directive. That seems to be working good enough. I run into an error when I pass the image to the server.
I use an API route to handle the work on the server side. The server will be responsible for saving the file to disk with node-multiparty module. It seems to hit route but when it tries to emit a close event I get the error. throw new Error('"name" and "value" are required for setHeader().'
The file I want is in my temp folder but it doesn't get saved to the target directory on my server plus I get the header error after the file should have been saved. So I need to stop the error and save the file with fs.rename() to the target image directory.
Here is the code that is breaking.
file api.js
// router to save images
router.route('/img/upload')
.post(function (req, res) {
console.log("image upload hits the router")
var options = {};
var count = 0;
var form = new multiparty.Form(options);
//save file to disk
form.on('file', function (name, file) {
var uploadDirectory = 'img/user/profile/';
var oldPath = file.path;
var newPath = uploadDirectory + file.originalFilename;
fs.rename(oldPath, newPath, function (err) {
if (err) throw err;
console.log('renamed complete');
});
});
// Close emitted after form parsed
form.on('close', function () {
console.log('Upload completed!');
res.setHeader('text/plain'); // Here is the line that gives an error.
res.end('Received ' + count + ' files');
});
// Parse req
form.parse(req);
});
So this is what I got to work for me
The actual line that gave me an error was setHeaders. It appears I needed to put the name and value as strings separated by a comma. This works perfectly for me now. I hope it saves everyone time coding.
// post
.post(function (req, res) {
var options = {};
var count = 0;
var form = new multiparty.Form(options);
form.on('error', function (err) {
console.log('Error parsing form: ' + err.stack);
});
//save file to disk
form.on('file', function (name, file) {
var uploadDirectory = '/img/user/profile/';
var oldPath = file.path;
var newPath = uploadDirectory + file.originalFilename;
fs.rename(oldPath, newPath, function (err) {
if (err) throw err;
console.log('renamed complete');
});
});
// Close emitted after form parsed
form.on('close', function () {
console.log('Upload completed!');
res.setHeader('Content-Type', 'text/plain');
res.end('Received ' + count + ' files');
});
// Parse req
form.parse(req);
});
I want to download a zip file from the internet and unzip it in memory without saving to a temporary file. How can I do this?
Here is what I tried:
var url = 'http://bdn-ak.bloomberg.com/precanned/Comdty_Calendar_Spread_Option_20120428.txt.zip';
var request = require('request'), fs = require('fs'), zlib = require('zlib');
request.get(url, function(err, res, file) {
if(err) throw err;
zlib.unzip(file, function(err, txt) {
if(err) throw err;
console.log(txt.toString()); //outputs nothing
});
});
[EDIT]
As, suggested, I tried using the adm-zip library and I still cannot make this work:
var ZipEntry = require('adm-zip/zipEntry');
request.get(url, function(err, res, zipFile) {
if(err) throw err;
var zip = new ZipEntry();
zip.setCompressedData(new Buffer(zipFile.toString('utf-8')));
var text = zip.getData();
console.log(text.toString()); // fails
});
You need a library that can handle buffers. The latest version of adm-zip will do:
npm install adm-zip
My solution uses the http.get method, since it returns Buffer chunks.
Code:
var file_url = 'http://notepad-plus-plus.org/repository/7.x/7.6/npp.7.6.bin.x64.zip';
var AdmZip = require('adm-zip');
var http = require('http');
http.get(file_url, function(res) {
var data = [], dataLen = 0;
res.on('data', function(chunk) {
data.push(chunk);
dataLen += chunk.length;
}).on('end', function() {
var buf = Buffer.alloc(dataLen);
for (var i = 0, len = data.length, pos = 0; i < len; i++) {
data[i].copy(buf, pos);
pos += data[i].length;
}
var zip = new AdmZip(buf);
var zipEntries = zip.getEntries();
console.log(zipEntries.length)
for (var i = 0; i < zipEntries.length; i++) {
if (zipEntries[i].entryName.match(/readme/))
console.log(zip.readAsText(zipEntries[i]));
}
});
});
The idea is to create an array of buffers and concatenate them into a new one at the end. This is due to the fact that buffers cannot be resized.
Update
This is a simpler solution that uses the request module to obtain the response in a buffer, by setting encoding: null in the options. It also follows redirects and resolves http/https automatically.
var file_url = 'https://github.com/mihaifm/linq/releases/download/3.1.1/linq.js-3.1.1.zip';
var AdmZip = require('adm-zip');
var request = require('request');
request.get({url: file_url, encoding: null}, (err, res, body) => {
var zip = new AdmZip(body);
var zipEntries = zip.getEntries();
console.log(zipEntries.length);
zipEntries.forEach((entry) => {
if (entry.entryName.match(/readme/i))
console.log(zip.readAsText(entry));
});
});
The body of the response is a buffer that can be passed directly to AdmZip, simplifying the whole process.
Sadly you can't pipe the response stream into the unzip job as node zlib lib allows you to do, you have to cache and wait the end of the response. I suggest you to pipe the response to a fs stream in case of big files, otherwise you will full fill your memory in a blink!
I don't completely understand what you are trying to do, but imho this is the best approach. You should keep your data in memory only the time you really need it, and then stream to the csv parser.
If you want to keep all your data in memory you can replace the csv parser method fromPath with from that takes a buffer instead and in getData return directly unzipped
You can use the AMDZip (as #mihai said) instead of node-zip, just pay attention because AMDZip is not yet published in npm so you need:
$ npm install git://github.com/cthackers/adm-zip.git
N.B. Assumption: the zip file contains only one file
var request = require('request'),
fs = require('fs'),
csv = require('csv')
NodeZip = require('node-zip')
function getData(tmpFolder, url, callback) {
var tempZipFilePath = tmpFolder + new Date().getTime() + Math.random()
var tempZipFileStream = fs.createWriteStream(tempZipFilePath)
request.get({
url: url,
encoding: null
}).on('end', function() {
fs.readFile(tempZipFilePath, 'base64', function (err, zipContent) {
var zip = new NodeZip(zipContent, { base64: true })
Object.keys(zip.files).forEach(function (filename) {
var tempFilePath = tmpFolder + new Date().getTime() + Math.random()
var unzipped = zip.files[filename].data
fs.writeFile(tempFilePath, unzipped, function (err) {
callback(err, tempFilePath)
})
})
})
}).pipe(tempZipFileStream)
}
getData('/tmp/', 'http://bdn-ak.bloomberg.com/precanned/Comdty_Calendar_Spread_Option_20120428.txt.zip', function (err, path) {
if (err) {
return console.error('error: %s' + err.message)
}
var metadata = []
csv().fromPath(path, {
delimiter: '|',
columns: true
}).transform(function (data){
// do things with your data
if (data.NAME[0] === '#') {
metadata.push(data.NAME)
} else {
return data
}
}).on('data', function (data, index) {
console.log('#%d %s', index, JSON.stringify(data, null, ' '))
}).on('end',function (count) {
console.log('Metadata: %s', JSON.stringify(metadata, null, ' '))
console.log('Number of lines: %d', count)
}).on('error', function (error) {
console.error('csv parsing error: %s', error.message)
})
})
If you're under MacOS or Linux, you can use the unzip command to unzip from stdin.
In this example I'm reading the zip file from the filesystem into a Buffer object but it works
with a downloaded file as well:
// Get a Buffer with the zip content
var fs = require("fs")
, zip = fs.readFileSync(__dirname + "/test.zip");
// Now the actual unzipping:
var spawn = require('child_process').spawn
, fileToExtract = "test.js"
// -p tells unzip to extract to stdout
, unzip = spawn("unzip", ["-p", "/dev/stdin", fileToExtract ])
;
// Write the Buffer to stdin
unzip.stdin.write(zip);
// Handle errors
unzip.stderr.on('data', function (data) {
console.log("There has been an error: ", data.toString("utf-8"));
});
// Handle the unzipped stdout
unzip.stdout.on('data', function (data) {
console.log("Unzipped file: ", data.toString("utf-8"));
});
unzip.stdin.end();
Which is actually just the node version of:
cat test.zip | unzip -p /dev/stdin test.js
EDIT: It's worth noting that this will not work if the input zip is too big to be read in one chunk from stdin. If you need to read bigger files, and your zip file contains only one file, you can use funzip instead of unzip:
var unzip = spawn("funzip");
If your zip file contains multiple files (and the file you want isn't the first one) I'm afraid to say you're out of luck. Unzip needs to seek in the .zip file since zip files are just a container, and unzip may just unzip the last file in it. In that case you have to save the file temporarily (node-temp comes in handy).
Two days ago the module node-zip has been released, which is a wrapper for the JavaScript only version of Zip: JSZip.
var NodeZip = require('node-zip')
, zip = new NodeZip(zipBuffer.toString("base64"), { base64: true })
, unzipped = zip.files["your-text-file.txt"].data;