I want to download a zip file from a url and parse its contents in node. I do not want to save the file on disk. The zip file is a directory of csv files that I want to process. How can I approach this? The only package that has an option for unzipping from a URL is unzipper but it does not work for me. Every other package lets you unzip a file on disk by providing the path to the file but not a url.
I am downloading the file like so:
// Fetch the resource at `test`; `this.get` is defined outside this snippet, so the shape of `res` is not shown here.
const res = await this.get(test)
But what can I do now? There are packages like AdmZip that can extract zip files but need a path as a string to a file on disk. Is there a way I can pass/ stream my res object above to the below?
var AdmZip = require('adm-zip');

// Open an archive that already exists on disk.
var zip = new AdmZip("./my_file.zip");
var zipEntries = zip.getEntries(); // array of ZipEntry records

for (const entry of zipEntries) {
  // Print the entry's own summary of itself.
  console.log(entry.toString());

  // Dump the decoded contents of the one file we are interested in.
  if (entry.entryName === "my_file.txt") {
    const contents = entry.getData();
    console.log(contents.toString('utf8'));
  }
}
Here's a simple example of downloading a .zip file and unzipping using adm-zip. As #anonymouze points out, you can pass a buffer to the AdmZip constructor.
const axios = require("axios");
const AdmZip = require('adm-zip');
/**
 * Issue a GET request for `url` with axios and resolve with the response body.
 *
 * The request asks for `responseType: "arraybuffer"` so the body is delivered
 * as binary data rather than decoded text.
 *
 * @param {string} url - Address to download.
 * @returns {Promise<*>} The `data` property of the axios response.
 */
async function get(url) {
  const response = await axios({
    method: 'GET',
    url,
    responseType: "arraybuffer",
  });
  return response.data;
}
/**
 * Download the zip archive at `url`, open it in memory with AdmZip and log
 * the name, size and UTF-8 decoded contents of every entry.
 *
 * @param {string} url - Address of the zip archive.
 * @returns {Promise<void>}
 */
async function getAndUnZip(url) {
  const archive = new AdmZip(await get(url));

  archive.getEntries().forEach((entry) => {
    // getData() returns the entry's uncompressed bytes.
    const contents = entry.getData();
    console.log(`File: ${entry.entryName}, length (bytes): ${contents.length}, contents: ${contents.toString("utf-8")}`);
  });
}
// Run the example. The returned promise is given a rejection handler so a
// failed download or a corrupt archive is reported instead of surfacing as an
// unhandled promise rejection.
getAndUnZip('https://www.learningcontainer.com/wp-content/uploads/2020/05/sample-zip-file.zip').catch((err) => {
  console.error("Failed to download or unzip the archive:", err);
});
In this case I'm simply using axios to download a zip file buffer, then parsing it with AdmZip.
Each entry's data can be accessed with entry.getData(), which will return a buffer.
In this case we'll see an output something like this:
File: sample.txt, length (bytes): 282, contents: I would love to try or hear the sample audio your app can produce. I do...
Here's another example, this time using node-fetch:
const fetch = require('node-fetch');
const AdmZip = require('adm-zip');
/**
 * Download `url` with fetch and resolve with the response body as a Buffer.
 *
 * @param {string} url - Address to download.
 * @returns {Promise<Buffer>} The raw response bytes.
 * @throws {Error} When the server answers with a non-2xx status; without this
 *   check an HTML error page would be handed to the zip parser downstream.
 */
async function get(url) {
  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(`Download failed: ${res.status} ${res.statusText}`);
  }
  // arrayBuffer() is part of the standard fetch Response API, whereas
  // buffer() is a node-fetch-only extension.
  return Buffer.from(await res.arrayBuffer());
}
/**
 * Fetch a zip archive from `url`, parse it from memory with AdmZip and print
 * one summary line (name, byte length, UTF-8 contents) per entry.
 *
 * @param {string} url - Address of the zip archive.
 * @returns {Promise<void>}
 */
async function getAndUnZip(url) {
  const downloaded = await get(url);
  const entries = new AdmZip(downloaded).getEntries();

  for (const entry of entries) {
    const data = entry.getData();
    const summary = [
      `File: ${entry.entryName}`,
      `length (bytes): ${data.length}`,
      `contents: ${data.toString("utf-8")}`,
    ].join(", ");
    console.log(summary);
  }
}
// Run the example. The returned promise is given a rejection handler so a
// failed download or a corrupt archive is reported instead of surfacing as an
// unhandled promise rejection.
getAndUnZip('https://www.learningcontainer.com/wp-content/uploads/2020/05/sample-zip-file.zip').catch((err) => {
  console.error("Failed to download or unzip the archive:", err);
});
I want to rename a particular string in filenames. I am using glob and path to extract multiple file names from multiple locations. Now I just want to rename those files like abcd-change-name.js to abcd-name-changed.js
Here's what I have done so far
var glob = require("glob")
var path = require('path')
const fs = require('fs')
// Find every file matching *-change-name*.js below ./directory (relative to the current working directory).
glob(process.cwd() + "/directory/**/*-change-name*.js", {}, function (er,
files) {
// NOTE: `i` is never declared, so this loop creates an implicit global.
for(i=0; i<files.length; i++){
// f (the bare file name) is computed but never used below.
var f = path.basename(files[i])
var d = path.dirname(files[i])
// The destination is always "<dir>/name-changed.js", regardless of the original file name.
// NOTE(review): fs.renameSync is synchronous; the callback passed here is presumably ignored - confirm against the fs docs.
fs.renameSync(files[i] , d + '/name-changed.js', function (err) {
if (err) throw err;
console.log('renamed complete');
});
}
})
The code is changing all the files with the js extension to name-changed.js in their respective folders.
Your code has the line fs.renameSync(files[i], d + '/name-changed.js', ..., but this line renames files[i] to '[foldername]/name-changed.js'.
I would suggest having something like fs.renameSync(files[i], files[i].replace('change-name', 'name-changed'), ...
In other words, you have told fs to rename the file to have a filename of 'name-changed.js' but you want it to contain the original filename data but with 'change-name' replaced with 'name-changed'.
Here is a full code example based on your code.
var glob = require("glob")
var path = require('path')
const fs = require('fs')
// Rename every *-change-name*.js file below ./directory so that the
// "change-name" part of its file name becomes "name-changed".
glob(process.cwd() + "/directory/**/*-change-name*.js", {}, function (er, files) {
  // Surface glob failures instead of iterating over an undefined list.
  if (er) throw er;

  for (const file of files) {
    // Only rewrite the file name itself: running replace() on the whole path
    // would hit a directory called "...change-name..." first.
    const renamed = path.basename(file).replace('change-name', 'name-changed');

    // renameSync is synchronous and takes no callback; it throws on failure,
    // so reaching the next line means the rename succeeded.
    fs.renameSync(file, path.join(path.dirname(file), renamed));
    console.log('renamed complete');
  }
})
I have a directory full of txt files containing json content. I would like to read the whole directory and rename the files according to the json tag value label.
I know how to read a single file using the below code but how do you read a whole directory?
/**
 * Load `file` with XMLHttpRequest and show its text in an alert box.
 *
 * The request is opened with `false` as the third argument of open(), which
 * is XMLHttpRequest's async flag, so send() blocks until the response is in.
 *
 * @param {string} file - URL or path of the text file to load.
 */
function readTextFile(file) {
  const request = new XMLHttpRequest();
  request.open("GET", file, false);

  request.onreadystatechange = () => {
    // Only act once the request has completed (readyState 4).
    if (request.readyState !== 4) {
      return;
    }
    // Status 0 is accepted alongside 200 (presumably for local file loads).
    const succeeded = request.status === 200 || request.status == 0;
    if (!succeeded) {
      return;
    }
    alert(request.responseText);
  };

  request.send(null);
}
This code gives you a list of the files in a folder:
var fs = require('fs');
// Synchronously read the directory listing of /assets/photos/ into `files`.
var files = fs.readdirSync('/assets/photos/');
Then you can iterate over this list and run your code on each file.
Using the node filesystem (fs) module you can do what you want assuming it's all locally accessible and you have permissions. Here's a way it could work:
const fs = require("fs");
const path = require("path");

const dir = "/path/to/the/directory";

// Rename every JSON-bearing file in `dir` to "<label>.json", where <label> is
// the value of the file's top-level "label" property.

// get the directory contents
const files = fs.readdirSync(dir);
for (const file of files) {
  // readdirSync yields bare names, so every fs call below needs the
  // directory prepended; otherwise the name resolves against the cwd.
  const filePath = path.join(dir, file);

  // for each make sure it's a file (not a subdirectory)
  const stat = fs.statSync(filePath);
  if (stat.isFile()) {
    // read in the file and parse it as JSON
    const rawdata = fs.readFileSync(filePath);
    try {
      const json = JSON.parse(rawdata);
      if (json.label) {
        // build the new filename using the parsed 'label' value
        const newfile = path.join(dir, `${json.label}.json`);
        fs.renameSync(filePath, newfile);
      }
    }
    catch (err) {
      // Invalid JSON or a failed rename: report it and carry on with the next file.
      console.log(`Error working with ${file}. Err: ${err}`);
    }
  }
}
That's the idea. Additional error checking can be done for safety like making sure the new filename doesn't already exist.
Let's assume I have this (snippet):
// Collect the storage path of every uploaded file on the request.
var filesarray = [];
if (req.files) {
  req.files.forEach(({ path: filePath }) => {
    filesarray.push(filePath);
  });
}
And later i push it with mongoose:
// Insert one document holding the collected file paths.
// The callback has to be the second argument of create(); it was previously
// written after the call's closing parenthesis, with unbalanced braces, so
// the snippet did not parse.
DB.create({
  filepaths: filesarray,
}, function (err, res) {
  if (err) {
    throw err;
  } else {
    console.log("1 document inserted");
    DB.close()
  }
});
The result I receive is not really what I want, because in MongoDB I get a comma-separated list, like:
filepaths
/files/1540474914824.png,/files/1540474914828.png,files/1540474914831.png
I would like to have something like:
filepaths
filename -> filepath
filename -> filepath
filename -> filepath
I hope I have made the goal clear. I am sure there is an elegant way to reach it, so could someone please point me in the right direction?
Thanks,
Regards
First off, you can't use the same key more than once in an object, so {filepaths: {file: '1', file: '2'}} will not work. You will need a unique name per file/path:
// Build a lookup object that maps each uploaded file's name to its path.
var files = {};
if (req.files) {
  req.files.forEach(({ filename, path: filePath }) => {
    files[filename] = filePath;
  });
}
You could use map to get an array of objects, but this seems more cumbersome to me:
// Build an array of single-key objects, one { filename: path } per upload.
// `files` stays undefined when the request carries no files.
var files;
if (req.files) {
  files = req.files.map(({ filename, path: filePath }) => ({ [filename]: filePath }));
}
I want to download a zip file from the internet and unzip it in memory without saving to a temporary file. How can I do this?
Here is what I tried:
// Question's first attempt: download the archive with `request` and hand the body to zlib.unzip.
var url = 'http://bdn-ak.bloomberg.com/precanned/Comdty_Calendar_Spread_Option_20120428.txt.zip';
var request = require('request'), fs = require('fs'), zlib = require('zlib');
request.get(url, function(err, res, file) {
if(err) throw err;
// NOTE(review): no `encoding: null` option is passed, so `file` is presumably a decoded string rather than a Buffer - confirm against the request docs.
// NOTE(review): zlib.unzip presumably expects a gzip/deflate stream, not a .zip container - confirm against the zlib docs.
zlib.unzip(file, function(err, txt) {
if(err) throw err;
console.log(txt.toString()); //outputs nothing
});
});
[EDIT]
As suggested, I tried using the adm-zip library and I still cannot make this work:
// Question's second attempt: feed the downloaded body into a standalone adm-zip ZipEntry.
var ZipEntry = require('adm-zip/zipEntry');
request.get(url, function(err, res, zipFile) {
if(err) throw err;
var zip = new ZipEntry();
// The body is converted to a UTF-8 string and then wrapped in a new Buffer before being set as compressed data.
// NOTE(review): that string round-trip presumably corrupts the binary zip bytes - confirm.
zip.setCompressedData(new Buffer(zipFile.toString('utf-8')));
var text = zip.getData();
console.log(text.toString()); // fails
});
You need a library that can handle buffers. The latest version of adm-zip will do:
npm install adm-zip
My solution uses the http.get method, since it returns Buffer chunks.
Code:
var file_url = 'http://notepad-plus-plus.org/repository/7.x/7.6/npp.7.6.bin.x64.zip';
var AdmZip = require('adm-zip');
var http = require('http');

// Download the archive over plain http, collect the response chunks in
// memory, then open the joined buffer with AdmZip and print every entry whose
// name contains "readme".
http.get(file_url, function(res) {
  var chunks = [];

  res.on('data', function(chunk) {
    chunks.push(chunk);
  }).on('end', function() {
    // Buffers cannot grow in place, so the chunks are joined into one new
    // buffer once the whole response has arrived.
    var buf = Buffer.concat(chunks);

    var zip = new AdmZip(buf);
    var zipEntries = zip.getEntries();
    console.log(zipEntries.length)

    zipEntries.forEach(function(entry) {
      if (entry.entryName.match(/readme/))
        console.log(zip.readAsText(entry));
    });
  });
}).on('error', function(err) {
  // Connection-level failures are emitted on the request object; without a
  // listener they would crash the process as an unhandled 'error' event.
  console.error('Download failed: ' + err.message);
});
The idea is to create an array of buffers and concatenate them into a new one at the end. This is due to the fact that buffers cannot be resized.
Update
This is a simpler solution that uses the request module to obtain the response in a buffer, by setting encoding: null in the options. It also follows redirects and resolves http/https automatically.
var file_url = 'https://github.com/mihaifm/linq/releases/download/3.1.1/linq.js-3.1.1.zip';
var AdmZip = require('adm-zip');
var request = require('request');

// Download the archive into a Buffer (encoding: null keeps the body binary),
// open it with AdmZip and print every entry whose name contains "readme"
// (case-insensitive).
request.get({url: file_url, encoding: null}, (err, res, body) => {
  // On a failed request `body` is undefined; stop here rather than handing
  // that to AdmZip.
  if (err) throw err;

  var zip = new AdmZip(body);
  var zipEntries = zip.getEntries();
  console.log(zipEntries.length);

  zipEntries.forEach((entry) => {
    if (entry.entryName.match(/readme/i))
      console.log(zip.readAsText(entry));
  });
});
The body of the response is a buffer that can be passed directly to AdmZip, simplifying the whole process.
Sadly you can't pipe the response stream into the unzip job the way node's zlib lib allows you to; you have to cache the data and wait for the end of the response. I suggest you pipe the response to a fs stream in case of big files, otherwise you will fill up your memory in a blink!
I don't completely understand what you are trying to do, but imho this is the best approach. You should keep your data in memory only the time you really need it, and then stream to the csv parser.
If you want to keep all your data in memory, you can replace the csv parser method fromPath with from, which takes a buffer instead, and have getData return the unzipped data directly.
You can use AdmZip (as #mihai said) instead of node-zip; just pay attention, because AdmZip is not yet published on npm, so you need:
$ npm install git://github.com/cthackers/adm-zip.git
N.B. Assumption: the zip file contains only one file
// Dependencies for the download-and-parse example below.
// The comma after the csv line is required: without it the declaration list
// ends there and NodeZip is assigned as an implicit global.
var request = require('request'),
    fs = require('fs'),
    csv = require('csv'),
    NodeZip = require('node-zip')
/**
 * Download the zip at `url` into a temp file under `tmpFolder`, unzip it with
 * node-zip, write each contained file to its own temp file and report the
 * resulting path through `callback`.
 *
 * @param {string} tmpFolder - Prefix for temp file paths; it is concatenated
 *   as-is, so it must end with a path separator.
 * @param {string} url - Address of the zip archive.
 * @param {function(?Error, string=)} callback - Invoked once per extracted
 *   file with (err, tempFilePath), or with an error alone when the download,
 *   the temp-file write, the read-back or the unzip fails.
 */
function getData(tmpFolder, url, callback) {
  var tempZipFilePath = tmpFolder + new Date().getTime() + Math.random()
  var tempZipFileStream = fs.createWriteStream(tempZipFilePath)

  tempZipFileStream.on('error', callback)

  // Read the archive back only after the write stream reports 'finish'. The
  // request's own 'end' event can fire while data is still buffered in the
  // file stream, which would yield a truncated zip.
  tempZipFileStream.on('finish', function () {
    fs.readFile(tempZipFilePath, 'base64', function (err, zipContent) {
      if (err) return callback(err)

      var zip
      try {
        zip = new NodeZip(zipContent, { base64: true })
      } catch (zipErr) {
        // A corrupt or non-zip download is reported, not thrown from an I/O callback.
        return callback(zipErr)
      }

      Object.keys(zip.files).forEach(function (filename) {
        var tempFilePath = tmpFolder + new Date().getTime() + Math.random()
        var unzipped = zip.files[filename].data
        fs.writeFile(tempFilePath, unzipped, function (err) {
          callback(err, tempFilePath)
        })
      })
    })
  })

  request.get({
    url: url,
    encoding: null
  }).on('error', callback).pipe(tempZipFileStream)
}
// Download and unzip the archive, then parse the extracted file as
// pipe-delimited CSV. Rows whose NAME starts with '#' are collected as
// metadata instead of being emitted as data.
getData('/tmp/', 'http://bdn-ak.bloomberg.com/precanned/Comdty_Calendar_Spread_Option_20120428.txt.zip', function (err, path) {
  if (err) {
    // Pass the message as a format argument; concatenating it onto the
    // format string printed a literal "%s" in front of the message.
    return console.error('error: %s', err.message)
  }
  var metadata = []
  csv().fromPath(path, {
    delimiter: '|',
    columns: true
  }).transform(function (data){
    // do things with your data
    if (data.NAME[0] === '#') {
      // Returning nothing here keeps the row out of the 'data' events below.
      metadata.push(data.NAME)
    } else {
      return data
    }
  }).on('data', function (data, index) {
    console.log('#%d %s', index, JSON.stringify(data, null, ' '))
  }).on('end',function (count) {
    console.log('Metadata: %s', JSON.stringify(metadata, null, ' '))
    console.log('Number of lines: %d', count)
  }).on('error', function (error) {
    console.error('csv parsing error: %s', error.message)
  })
})
If you're under MacOS or Linux, you can use the unzip command to unzip from stdin.
In this example I'm reading the zip file from the filesystem into a Buffer object but it works
with a downloaded file as well:
// Get a Buffer with the zip content
// Load the archive into memory as a Buffer.
var fs = require("fs");
var zip = fs.readFileSync(__dirname + "/test.zip");

// Start the system `unzip` tool, reading the archive from stdin.
// -p tells unzip to extract to stdout.
var spawn = require('child_process').spawn;
var fileToExtract = "test.js";
var unzip = spawn("unzip", ["-p", "/dev/stdin", fileToExtract]);

// Anything unzip prints on stderr is reported as an error.
unzip.stderr.on('data', (data) => {
  console.log("There has been an error: ", data.toString("utf-8"));
});

// The extracted file arrives on stdout.
unzip.stdout.on('data', (data) => {
  console.log("Unzipped file: ", data.toString("utf-8"));
});

// Feed the Buffer to unzip and close stdin to signal the end of the input.
unzip.stdin.write(zip);
unzip.stdin.end();
Which is actually just the node version of:
cat test.zip | unzip -p /dev/stdin test.js
EDIT: It's worth noting that this will not work if the input zip is too big to be read in one chunk from stdin. If you need to read bigger files, and your zip file contains only one file, you can use funzip instead of unzip:
// Spawn funzip with no arguments, as a replacement for the unzip invocation.
var unzip = spawn("funzip");
If your zip file contains multiple files (and the file you want isn't the first one) I'm afraid to say you're out of luck. Unzip needs to seek in the .zip file since zip files are just a container, and unzip may just unzip the last file in it. In that case you have to save the file temporarily (node-temp comes in handy).
Two days ago the module node-zip has been released, which is a wrapper for the JavaScript only version of Zip: JSZip.
var NodeZip = require('node-zip');

// Hand the archive to node-zip as a base64 string, then read one entry's
// contents by its file name.
var encodedArchive = zipBuffer.toString("base64");
var zip = new NodeZip(encodedArchive, { base64: true });
var unzipped = zip.files["your-text-file.txt"].data;