My use case is this: I am looking to read a CSV file in Node and get only the headers. I don't want to write the results of a read stream to a file, but rather push the headers to an array once the file is read, so I can take that array and do something with it later on. Or, better yet, take the stream as it is being read, transform it, and send it to an array. The file path is a contrived value. I am stuck at this point, where the current output of dataFile is an empty array:
const fs = require('fs');
const parse = require('csv-parse');

const file = "my file path";
let dataFile = [];

rs = fs.createReadStream(file);

parser = parse({columns: true}, function(err, data){
  return getHeaders(data)
})

function getHeaders(file){
  return file.map(function(header){
    return dataFile.push(Object.keys(header))
  })
}
What do I need to do in order to get the result I need? I am expecting the headers to end up in an array.
Ok, so there are some confusing things in your code, and one mistake: you didn't actually call your code :)
First, a solution: add these lines after the parser:
rs.pipe(parser).on('end', function(){
console.log(dataFile);
});
And, like magic, dataFile is no longer empty.
You stream the file from disk, pass it to the parser, then at the end, call a callback.
Now for the confusing parts:
parser = parse({columns: true}, function(err, data){
  // You don't need to return anything from the callback; it gives the impression
  // that parser will be the result of getHeaders. It's not, it's a stream.
  return getHeaders(data)
})

function getHeaders(file){
  // Change map to forEach, with no return. map returns an array of the callback's
  // return values, so here you would build an array of the results of each push
  // (which is the new length of the array).
  return file.map(function(header){
    return dataFile.push(Object.keys(header))
  })
}
And finally:
Please either end your lines with ; or don't, but not a mix ;)
You should end up with something like this:
const fs = require('fs');
const parse = require('csv-parse');

const file = "./test.csv";
var dataFile = [];

rs = fs.createReadStream(file);

parser = parse({columns: true}, function(err, data){
  getHeaders(data);
});

rs.pipe(parser).on('end', function(){
  console.log(dataFile);
});

function getHeaders(file){
  file.forEach(function(header){
    dataFile.push(Object.keys(header));
  });
}
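As a side note, since columns: true gives every record the same keys, you don't have to push the keys of every row. A sketch of a simpler variant (same csv-parse usage as above) that takes Object.keys of just the first record:

const fs = require('fs');
const parse = require('csv-parse');

const file = "./test.csv";

const parser = parse({columns: true}, function(err, data){
  if (err) return console.error(err);
  // with columns: true every record shares the same keys,
  // so the first record is enough to recover the headers
  const headers = data.length ? Object.keys(data[0]) : [];
  console.log(headers);
});

fs.createReadStream(file).pipe(parser);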
I am writing code that generates a very large JSON object, saves it to a file, then loads the file and inserts the data into a Mongo collection. I want to pass a string from the command line when calling the script that I use to set the file name, as well as the collection name. I call it like so: node --max-old-space-size=8192 data_generator.js foo 1000000.
The code fails with error ENOENT: no such file or directory, open 'foo.json' on the third line of the function gen_collection() where I set the variable data. This error does not appear when a file foo.json already exists, even if it is empty. Before it fails, the code successfully creates a file foo.json but it contains only an empty array [].
The code fails with this same exact error when I include any reference to process.argv. This includes when I try to set any variable to a value from the process.argv array. The code works when I set the variables fname as const fname = "foo" and size as const size = 0. However, even if the only reference I have to process.argv is in a console.log, i.e. adding console.log(process.argv[2]) to main(), it fails with the exact same error as above.
Here is the code I am trying to run:
const { MongoClient } = require("mongodb");
const fs = require('fs');
const bjson = require('big-json');

async function main() {
  const uri = "my db uri";
  const client = new MongoClient(uri);
  const fname = process.argv[2];
  const size = parseInt(process.argv[3]);
  // const fname = 'small'
  // const size = 1
  try {
    await client.connect({ useUnifiedTopology: true });
    await write_data_to_disk(fname, size);
    await gen_collection(client, fname);
  } catch (e) {
    console.error(e);
  } finally {
    await client.close();
  }
};

// generate data as json array and write to local file
async function write_data_to_disk(fname, size) {
  let arr = [];
  for (let i = 0; i < size; i++) {
    let doc = gen_document();
    arr.push(doc);
  }
  const strStream = bjson.createStringifyStream({
    body: arr
  })
  let logger = fs.createWriteStream(`${fname}.json`);
  strStream.on('data', (d) => {
    logger.write(d);
  })
};

async function gen_collection(client, fname) {
  let db = client.db('test');
  let collection = db.collection(fname);
  let data = JSON.parse(fs.readFileSync(`${fname}.json`, 'utf8')); // ERROR APPEARS ON THIS LINE
  bulkUpdateOps = [];
  data.forEach((doc) => {
    bulkUpdateOps.push({"insertOne": {"document": doc}});
    if (bulkUpdateOps.length === 1000) {
      collection.bulkWrite(bulkUpdateOps);
      bulkUpdateOps = [];
    }
  })
  if (bulkUpdateOps.length > 0) {
    collection.bulkWrite(bulkUpdateOps);
  }
};

function gen_document() {
  // returns json object
};
You're doing
await write_data_to_disk(...)
but that function doesn't return a promise that is tied to when it's done. So you're trying to read the resulting file BEFORE it has been created (or before it has valid content in it), hence the ENOENT error: the file doesn't yet exist when you try to read it in the following function.
Write streams do not play nicely with promises unless you wrap them in your own promise that resolves when you are completely done writing to the stream and the file has been closed.
Also, you probably want to just .pipe() strStream into the logger write stream. That's much easier, and you can then monitor when that pipe() operation is done to resolve the promise you wrap around it.
You can promisify write_data_to_disk() like this:
// generate data as json array and write to local file
function write_data_to_disk(fname, size) {
  return new Promise((resolve, reject) => {
    const arr = [];
    for (let i = 0; i < size; i++) {
      let doc = gen_document();
      arr.push(doc);
    }
    const strStream = bjson.createStringifyStream({ body: arr });
    const dest = fs.createWriteStream(`${fname}.json`, {emitClose: true});
    // monitor for completion and errors
    dest.on('error', reject).on('close', resolve);
    strStream.on('error', reject);
    // pipe all the content from strStream to the dest writeStream
    strStream.pipe(dest);
  });
}
Since this returns a promise that is truly tied to when the write operation is done, you can then use await write_data_to_disk(...).
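As an aside, on newer Node versions (15+) where the promise-based stream helpers are available, the same wrapping can be done with the built-in pipeline from stream/promises. A sketch, not part of the original answer:

const fs = require('fs');
const bjson = require('big-json');
const { pipeline } = require('stream/promises');

// generate data as a json array and write it to a local file
async function write_data_to_disk(fname, size) {
  const arr = [];
  for (let i = 0; i < size; i++) {
    arr.push(gen_document());
  }
  const strStream = bjson.createStringifyStream({ body: arr });
  // resolves when everything has been written and the file is closed,
  // rejects if either stream errors
  await pipeline(strStream, fs.createWriteStream(`${fname}.json`));
}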
I have a weird problem. What I want to do is, when calling an API, make a request to the PostgreSQL db with a bulk insert. First, I execute the loop which reads a file and extracts data from it to form the values array. Then, I want to generate a bulk insert request with the pg-promise library. But when the code executes, it tries to generate the request before the loop even starts, and it throws an error about an empty array. What the?
async import(req, res) {
  var rd = readline.createInterface({
    input: fs.createReadStream('.../file.csv'),
    console: false
  });

  const createQuery = `INSERT INTO
    table('columns')
    VALUES ?`;

  const values = [];

  rd.on('line', function(line) {
    //stuff
    values.push({
      //stuff
    });
  });

  const cs = new pgp.helpers.ColumnSet(['columns'], {table: 'table'});
  const query = pgp.helpers.insert(values, cs);
}
I've removed the details, but I hope this provides enough info. I've tried putting console logs before, inside, and after the loop: first the stuff before and after the loop gets logged, then the error is thrown, and only then does the loop execute and log its stuff. Am I misunderstanding or missing something?
Just making the function async doesn't accomplish anything for you here by itself. You have to find out what asynchronous operations you can await in order to serialize them.
In the latest versions of node.js, you can use the for await...of construct with the asynchronous iterator of the readline interface to process all the lines.
async import(req, res) {
  var rd = readline.createInterface({
    input: fs.createReadStream('.../file.csv'),
    console: false
  });

  const createQuery = `INSERT INTO
    table('columns')
    VALUES ?`;

  const values = [];

  for await (const line of rd) {
    values.push({...})
  }

  const cs = new pgp.helpers.ColumnSet(['columns'], {table: 'table'});
  const query = pgp.helpers.insert(values, cs);
}
FYI, you can see an example of this in the readline doc.
You also don't need to use async at all as you can solve it by just putting your last two lines into an event listener for the close event:
import(req, res) {
  var rd = readline.createInterface({
    input: fs.createReadStream('.../file.csv'),
    console: false
  });

  const createQuery = `INSERT INTO
    table('columns')
    VALUES ?`;

  const values = [];

  rd.on('line', function(line) {
    //stuff
    values.push({
      //stuff
    });
  }).on('close', () => {
    // done with all line processing here
    const cs = new pgp.helpers.ColumnSet(['columns'], {table: 'table'});
    const query = pgp.helpers.insert(values, cs);
  });
}
When you call rd.on() you are just registering a callback to be called whenever the line event occurs. All your code is doing is registering that callback and then proceeding with the rest of your code, which then tries to insert the values into the database. You need to move the code that inserts into the database inside the callback of your rd.on(), after you have looped through all your values and pushed them into the array.
However, I'm not familiar with what the line event is in the case of a file. If it is truly line by line for the file, then you obviously can't bulk insert there. My suggestion at that point would be to move that step into its own async function and await the result of that function before doing the insert.
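A minimal sketch of that last suggestion, reusing the same rd and values from above: wrap the line processing in a Promise that resolves on the close event and await it before building the insert:

async import(req, res) {
  var rd = readline.createInterface({
    input: fs.createReadStream('.../file.csv'),
    console: false
  });

  const values = [];

  // resolve only once readline has emitted 'close', i.e. every line has been read
  await new Promise((resolve) => {
    rd.on('line', function(line) {
      //stuff
      values.push({
        //stuff
      });
    });
    rd.on('close', resolve);
  });

  const cs = new pgp.helpers.ColumnSet(['columns'], {table: 'table'});
  const query = pgp.helpers.insert(values, cs);
}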
My code is as follows:
var data_array = [];

readData('file.txt');
console.log(data_array[0]);
console.log(data_array[1]);
console.log(data_array.length.toString());
console.log(data_array[data_array.length-1]);

//reads file into array
function readData(filepath) {
  var fs = require('fs');
  if (fs.existsSync(filepath)) {
    var array = fs.readFileSync(filepath).toString().split("\n");
    var data_array = array.slice(7,array.length - 2);
  } else {
    process.exit();
  }
}
When I run this, I get the following:
undefined
undefined
0
undefined
Note that data_array is assigned within the if statement.
I think the array did not receive anything; that is why it prints nothing but undefined and its length is 0.
How can I make it execute these lines step by step, in the written order?
var data_array = [];
readData('file.txt');
console.log(data_array[0]);
console.log(data_array[1]);
....
Your code is failing because you are creating a local-scope variable that shadows the global one. To fix it, replace var data_array = array... with data_array = array... (a corrected version is sketched after the list below).
Also, keep in mind that you are using a few antipatterns:
First, don't check for file existence before reading it: it's possible for someone to delete the file between the check and the read. Instead, just read it and handle the exception.
Second, read the file with readFileSync(filepath, { encoding: 'utf8' }); this returns a string right away, so you won't need toString().
Third, array.slice() supports negative indexes (they count from the end of the array), so you can literally write array.slice(7, -2).
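Putting the scoping fix and those suggestions together, a corrected synchronous version could look like this (a sketch based on the question's code; the slice(7, -2) range is kept as-is):

var data_array = [];

readData('file.txt');
console.log(data_array[0]);
console.log(data_array[1]);
console.log(data_array.length.toString());
console.log(data_array[data_array.length - 1]);

// reads file into the outer data_array (no `var`, so the global is assigned)
function readData(filepath) {
  var fs = require('fs');
  try {
    // utf8 encoding returns a string directly, no toString() needed
    var array = fs.readFileSync(filepath, { encoding: 'utf8' }).split("\n");
    data_array = array.slice(7, -2); // negative index counts from the end
  } catch (e) {
    console.error(e);
    process.exit();
  }
}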
And in general, unless this is single-use throwaway code, I'd suggest using the asynchronous counterparts:
const Promise = require('bluebird');
const fs = Promise.promisifyAll(require('fs'));

// reads file into array
const readData = Promise.coroutine(function *(filepath) {
  try {
    const array = (yield fs.readFileAsync(filepath, { encoding: 'utf8' })).split("\n");
    const data_array = array.slice(7, -2);
    return data_array;
  } catch(e) {
    console.error(e);
    process.exit();
  }
});

Promise.coroutine(function *() {
  const data_array = yield readData('file.txt');
  console.log(data_array[0]);
  console.log(data_array[1]);
  console.log(data_array.length.toString());
  console.log(data_array[data_array.length-1]);
})();
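On current Node versions you can get the same behaviour without Bluebird, using fs.promises and async/await (a sketch, not part of the original answer):

const fs = require('fs').promises;

// reads file into array
async function readData(filepath) {
  try {
    const contents = await fs.readFile(filepath, { encoding: 'utf8' });
    return contents.split("\n").slice(7, -2);
  } catch (e) {
    console.error(e);
    process.exit();
  }
}

(async () => {
  const data_array = await readData('file.txt');
  console.log(data_array[0]);
  console.log(data_array[1]);
  console.log(data_array.length.toString());
  console.log(data_array[data_array.length - 1]);
})();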
var dir = require('node-dir');
var path = require('path');
var fs = require('fs');

var root_directory = "C:/ProgramData/XXX/YYY/Resources/LocalLibrary";

function getitems(res, fileHandler){
  dir.files(root_directory, function(err, files) {
    if (err) throw err;
    fileHandler(res, files);
  });
};

function fileHandler(res, files) {
  var finaldependency = [];
  files.forEach(function (fileName) {
    if (path.extname(fileName) === '.Item') {
      var singleObj = {};
      singleObj['key'] = fileName;
      var content = fs.readFileSync(path.resolve(fileName), "utf8");
      var json = JSON.parse(content);
      singleObj['value'] = json.References;
      finaldependency.push(singleObj);
    }
  });
  res.json(finaldependency);
}

module.exports = function(app) {
  // api ---------------------------------------------------------------------
  // get all items
  app.get('/api/item', function(req, res) {
    // use mongoose to get all items in the database
    getitems(res, fileHandler);
  });
};
Steps involved:
1. When a request is received, process all the files present in root_directory (it contains many nested subdirectories).
2. Get the list of all the files with the help of the node-dir module.
3. Pass the file list to an asynchronous fileHandler.
My question: I am doing a synchronous, blocking style of file handling (fs.readFileSync) and appending the data as key-value pairs to the outer-scope list finaldependency. Is this the right approach, or is there a more optimized way of doing this?
Your fileHandler function might take a long time, hence it can block the client request. Server performance in terms of request processing might get worse as client requests increase.
You can try executing the forEach loop in process.nextTick.
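For illustration, a sketch of what that suggestion could look like applied to the fileHandler above (a hypothetical adaptation; note that process.nextTick only defers the work, the loop still runs synchronously once it starts):

function fileHandler(res, files) {
  // defer the heavy synchronous work to the next tick of the event loop
  process.nextTick(function () {
    var finaldependency = [];
    files.forEach(function (fileName) {
      if (path.extname(fileName) === '.Item') {
        var content = fs.readFileSync(path.resolve(fileName), "utf8");
        var json = JSON.parse(content);
        finaldependency.push({ key: fileName, value: json.References });
      }
    });
    res.json(finaldependency);
  });
}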
I need to run two commands in series that need to read data from the same stream.
After piping a stream into another, the buffer is emptied, so I can't read data from that stream again; this doesn't work:
var spawn = require('child_process').spawn;
var fs = require('fs');
var request = require('request');

var inputStream = request('http://placehold.it/640x360');
var identify = spawn('identify', ['-']);

inputStream.pipe(identify.stdin);

var chunks = [];

identify.stdout.on('data', function(chunk) {
  chunks.push(chunk);
});

identify.stdout.on('end', function() {
  var size = getSize(Buffer.concat(chunks)); //width
  var convert = spawn('convert', ['-', '-scale', size * 0.5, 'png:-']);
  inputStream.pipe(convert.stdin);
  convert.stdout.pipe(fs.createWriteStream('half.png'));
});

function getSize(buffer){
  return parseInt(buffer.toString().split(' ')[2].split('x')[0]);
}
Request complains about this
Error: You cannot pipe after data has been emitted from the response.
and changing the inputStream to fs.createReadStream yields the same issue, of course.
I don't want to write into a file but reuse in some way the stream that request produces (or any other for that matter).
Is there a way to reuse a readable stream once it finishes piping?
What would be the best way to accomplish something like the above example?
You have to create a duplicate of the stream by piping it into two streams. You can create a simple duplicated stream with a PassThrough stream; it simply passes the input through to the output.
const spawn = require('child_process').spawn;
const PassThrough = require('stream').PassThrough;
const a = spawn('echo', ['hi user']);
const b = new PassThrough();
const c = new PassThrough();
a.stdout.pipe(b);
a.stdout.pipe(c);
let count = 0;
b.on('data', function (chunk) {
count += chunk.length;
});
b.on('end', function () {
console.log(count);
c.pipe(process.stdout);
});
Output:
8
hi user
The first answer only works if streams take roughly the same amount of time to process data. If one takes significantly longer, the faster one will request new data, consequently overwriting the data still being used by the slower one (I had this problem after trying to solve it using a duplicate stream).
The following pattern worked very well for me. It uses a library based on Stream2 streams, Streamz, and Promises to synchronize async streams via a callback. Using the familiar example from the first answer:
spawn = require('child_process').spawn;
pass = require('stream').PassThrough;
streamz = require('streamz').PassThrough;
var Promise = require('bluebird');

a = spawn('echo', ['hi user']);
b = new pass;
c = new pass;

a.stdout.pipe(streamz(combineStreamOperations));

function combineStreamOperations(data, next){
  Promise.join(b, c, function(b, c){ //perform n operations on the same data
    next(); //request more
  });
}

count = 0;
b.on('data', function(chunk) { count += chunk.length; });
b.on('end', function() { console.log(count); c.pipe(process.stdout); });
You can use this small npm package I created:
readable-stream-clone
With this, you can reuse readable streams as many times as you need.
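A usage sketch (the constructor name below is an assumption taken from the package's README at the time of writing; check the package documentation for the exact API):

const fs = require('fs');
// assumed export: a class that re-emits the source stream's data
const ReadableStreamClone = require('readable-stream-clone');

const readStream = fs.createReadStream('text.txt');

const clone1 = new ReadableStreamClone(readStream);
const clone2 = new ReadableStreamClone(readStream);

clone1.pipe(fs.createWriteStream('copy1.txt'));
clone2.pipe(fs.createWriteStream('copy2.txt'));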
For the general problem, the following code works fine:
var PassThrough = require('stream').PassThrough;

var a = new PassThrough();
var b1 = new PassThrough();
var b2 = new PassThrough();

a.pipe(b1);
a.pipe(b2);

b1.on('data', function(data) {
  console.log('b1:', data.toString());
});
b2.on('data', function(data) {
  console.log('b2:', data.toString());
});

a.write('text');
I have a different solution: write to two streams simultaneously. Naturally, the time to write will be the sum of the two times, but I use it to respond to a download request where I want to keep a copy of the downloaded file on my server (actually, I use an S3 backup, so I cache the most used files locally to avoid multiple file transfers).
/**
 * A utility class made to write to a file while answering a file download request
 */
class TwoOutputStreams {
  constructor(streamOne, streamTwo) {
    this.streamOne = streamOne
    this.streamTwo = streamTwo
  }

  setHeader(header, value) {
    if (this.streamOne.setHeader)
      this.streamOne.setHeader(header, value)
    if (this.streamTwo.setHeader)
      this.streamTwo.setHeader(header, value)
  }

  write(chunk) {
    this.streamOne.write(chunk)
    this.streamTwo.write(chunk)
  }

  end() {
    this.streamOne.end()
    this.streamTwo.end()
  }
}
You can then use this as a regular OutputStream
const twoStreamsOut = new TwoOutputStreams(fileOut, responseStream)
and pass it to your method as if it were a response or a fileOutputStream.
If you have async operations on the PassThrough streams, the answers posted here won't work.
A solution that works for async operations includes buffering the stream content and then creating streams from the buffered result.
To buffer the result you can use concat-stream
const Promise = require('bluebird');
const concat = require('concat-stream');

const getBuffer = function(stream){
  return new Promise(function(resolve, reject){
    var gotBuffer = function(buffer){
      resolve(buffer);
    }
    var concatStream = concat(gotBuffer);
    stream.on('error', reject);
    stream.pipe(concatStream);
  });
}
To create streams from the buffer you can use:
const { Readable } = require('stream');

const getBufferStream = function(buffer){
  const stream = new Readable();
  stream.push(buffer);
  stream.push(null);
  return Promise.resolve(stream);
}
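Putting the two helpers together, a usage sketch could look like this (inputStream stands for any readable stream, e.g. the request stream from the question):

const fs = require('fs');

getBuffer(inputStream)
  .then(function (buffer) {
    // create as many independent streams from the buffered content as needed
    return Promise.all([getBufferStream(buffer), getBufferStream(buffer)]);
  })
  .then(function (streams) {
    streams[0].pipe(process.stdout);
    streams[1].pipe(fs.createWriteStream('copy.bin'));
  });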
What about piping into two or more streams, but not at the same time?
For example:
var PassThrough = require('stream').PassThrough;

var mybinaryStream = stream.start(); // never-ending audio stream
var file1 = fs.createWriteStream('file1.wav', {encoding:'binary'});
var file2 = fs.createWriteStream('file2.wav', {encoding:'binary'});
var mypass = new PassThrough();

mybinaryStream.pipe(mypass);
mypass.pipe(file1);

setTimeout(function(){
  mypass.pipe(file2);
}, 2000);
The above code does not produce any errors, but file2 is empty.