I am trying to do the following:
Stream in a CSV file line by line.
Modify the data contained in each line.
Once all lines are streamed and processed, finish and move on to the next task.
The problem is that .on("end") fires before .on("data") finishes processing each line. How can I get .on("end") to fire after .on("data") has finished processing all the lines?
Below is a simple example of what I am talking about:
import fs from 'fs';
import parse from 'csv-parse';

var parser = parse({});
fs.createReadStream(this.upload.location)
  .pipe(parser)
  .on("data", line => {
    var num = Math.floor((Math.random() * 100) + 1);
    num = num % 3;
    num = num * 1000;
    setTimeout(() => {
      console.log('data process complete');
    }, num);
  })
  .on("end", () => {
    console.log('Done: parseFile');
    next(null);
  });
Thanks in advance.
I think the issue is the setTimeout (or any other async task) within the data event listener. end does fire after all of the data events, but the async task keeps logging messages even after the stream has emitted end.
If you take out the setTimeout you'll see that it logs all the messages in data before end. You can still perform async tasks, but a batch of them may still be running after the stream has ended.
This code helps explain what is going on:
const fs = require('fs')

const testFileName = 'testfile.txt'
fs.writeFileSync(testFileName, '123456789')

let count = 0
const readStream = fs.createReadStream(testFileName, {
  encoding: 'utf8',
  highWaterMark: 1 // low highWaterMark so we can have more chunks to observe
})

readStream.on('data', (data) => {
  console.log('+++++++++++processing sync+++++++++++++')
  console.log(data)
  console.log('+++++++++++end processing sync+++++++++++++')
  setTimeout(() => {
    console.log('-----------processing async-------------')
    console.log(data)
    console.log('-----------end processing async-------------')
  }, ++count * 1000)
})

readStream.on('end', () => {
  console.log('stream ended but still have async tasks doing their thing')
  fs.unlinkSync(testFileName)
})
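If the goal is for "end of stream" to also mean "every line has finished its async work", one option is to keep a promise per line and wait for all of them before moving on. A minimal sketch along the lines of the original question (processLine is a hypothetical stand-in for the per-line work; this.upload.location and next(null) come from the question's code, and the csv-parse import follows the question's style):

import fs from 'fs';
import parse from 'csv-parse';

const parser = parse({});
const pending = []; // one promise per line's async work

// hypothetical stand-in for whatever async work you do per line
const processLine = line =>
  new Promise(resolve => {
    const num = (Math.floor((Math.random() * 100) + 1) % 3) * 1000;
    setTimeout(() => {
      console.log('data process complete');
      resolve(line);
    }, num);
  });

fs.createReadStream(this.upload.location)
  .pipe(parser)
  .on('data', line => {
    pending.push(processLine(line)); // kick off the work and remember its promise
  })
  .on('end', async () => {
    await Promise.all(pending); // wait until every line's async task has finished
    console.log('Done: parseFile');
    next(null);
  });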
I have an open WebSocket connection and it's handing out events. All good, but once a new event arrives, I need to do a whole lot of things, and sometimes events arrive so quickly one after the other that there is no time to get the work done properly. I need some sort of queue inside this function that tells the events to take it easy, keep going at most one per second, and otherwise wait in the queue until their second elapses before continuing.
edit: No external libraries allowed, unfortunately.
ws = new WebSocket(`wss://hallo.com/ws/`);
ws.onmessage = readMessage;
async function readMessage(event) {
  print(event)
  //do important things
  //but not too frequently!
}
How do I do that?
I found this but it goes over my simple head:
"You can have a queue-like promise that keeps on accumulating promises to make sure they run sequentially:
let cur = Promise.resolve();

function enqueue(f) {
  cur = cur.then(f);
}

function someAsyncWork() {
  return new Promise(resolve => {
    setTimeout(() => {
      resolve('async work done');
    }, 5);
  });
}

async function msg() {
  const msg = await someAsyncWork();
  console.log(msg);
}

const main = async () => {
  web3.eth.subscribe('pendingTransactions').on("data", function (tx) {
    enqueue(async function () {
      console.log('1st print: ', tx);
      await msg();
      console.log('2nd print: ', tx);
    });
  })
}

main();
"
I'd honestly use something like lodash's throttle to do this. The following snippet should solve your problem.
ws = new WebSocket(`wss://hallo.com/ws/`);
ws.onmessage = _.throttle(readMessage, 1000);
async function readMessage(event) {
  print(event)
  //do important things
  //but not too frequently!
}
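Since the question rules out external libraries, the same idea can also be hand-rolled. A minimal sketch of a throttle, assuming it's acceptable to simply drop events that arrive inside the one-second window:

// runs fn at most once per `wait` ms; calls inside the window are dropped
function throttle(fn, wait) {
  let last = 0;
  return function (...args) {
    const now = Date.now();
    if (now - last >= wait) {
      last = now;
      return fn.apply(this, args);
    }
  };
}

ws.onmessage = throttle(readMessage, 1000);

Note that, unlike lodash's throttle (which also schedules a trailing call), this version silently discards events that arrive too quickly; if every message must eventually be processed, a real queue, as in the answer below, is the better fit.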
For achieving queuing, you can make use of setTimeout in simple/core JavaScript.
Whenever you receive a message from the websocket, put the message-processing function in a setTimeout. This ensures the message is processed not immediately as it's received, but with a delay, so in a way you can achieve queuing.
The problem with this is that it does not guarantee that messages are processed sequentially in the order they are received, if that is needed.
Also, setTimeout does not guarantee exactly when the callback will be triggered once the given time has elapsed, only that it won't run earlier.
And it may not reduce the load on your message processor in a high-volume situation: since messages are queued individually, two or more callbacks can become ready to run from setTimeout within the same time frame.
An ideal way to do this is to create a queue. At a high level, the code flow can look like this:
var queue = [];

function getFromQueue() {
  return queue.shift();
}

function insertQueue(msg) { // called whenever a new message arrives
  queue.push(msg);
  console.log("Queue state", queue);
}

// can be used if one does not want to wait for previous message processing to finish
// (function executorService() {
//   setTimeout(async () => {
//     const data = getFromQueue();
//     await processData(data);
//     executorService();
//   }, 1000)
// })()

(function executorService() {
  return new Promise((res, rej) => {
    setTimeout(async () => {
      const data = getFromQueue();
      console.log("Started processing", data)
      const resp = await processData(data); // waiting for async processing of the message to finish
      res(resp);
    }, 2000)
  }).then((data) => {
    console.log("Successfully processed event", data)
  }).catch((err) => {
    console.log(err)
  }).finally(() => {
    executorService();
  })
})()

// to simulate async processing of messages
function processData(data) {
  return new Promise((res, rej) => {
    setTimeout(async () => {
      console.log("Finished processing", data)
      res(data);
    }, 4000)
  })
}

// to simulate messages received by the web socket
var i = 0;
var insertRand = setInterval(function () {
  insertQueue(i); // this must be called when a web socket message is received
  i += 1;
}, 1000)
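For comparison, here is the same worker sketched with async/await instead of a Promise chained around setTimeout. It reuses getFromQueue() and processData() from above; the one-second pause between messages is an assumption taken from the question:

const delay = ms => new Promise(res => setTimeout(res, ms));

// sequential worker loop: finishes one message before taking the next
(async function executorLoop() {
  while (true) {
    const data = getFromQueue();
    if (data !== undefined) {
      console.log('Started processing', data);
      await processData(data);
      console.log('Successfully processed event', data);
    }
    await delay(1000); // wait at least a second between messages
  }
})();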
I am trying to read some large CSV files and process that data, and there is a rate limit on the processing, so I want to add a 1-minute delay between each request. I tried setTimeout, but it turns out there is a limit to the delay it accepts, and I get the following error. I am not sure of any other way to handle the situation; the CSV file has more than 1M records. Am I doing anything wrong here?
Error:
(node:41) TimeoutOverflowWarning: 2241362000 does not fit into a 32-bit signed integer.
Timeout duration was set to 1.
Sample code:
const Queue = require('bull');

const domainQueue = new Queue(config.api.crawlerQ, {
  redis: connectRedis(),
});

let ctr = 0;

function processCSV(name, fileName, options) {
  return new Promise((resolve, reject) => {
    console.log('process csv started', new Date());
    let filePath = config.api.basePath + fileName;
    stream = fs.createReadStream(filePath)
      .on('error', (error) => {
        // handle error
        console.log('error processing csv');
        reject(error);
      })
      .pipe(csv())
      .on('data', async (row) => {
        ctr++
        increment(row, ctr)
      })
      .on('end', () => {
        console.log('stream processCSV end', fileName, new Date());
        resolve(filePath);
      })
  });
}

async function increment(raw, counter) {
  setTimeout(async function () {
    console.log('say i am inside a function', counter, new Date());
    domainQueue.add(data, options); // Add jobs to queue - here I need a delay, say 1 minute;
                                    // adding jobs without a delay will hit the rate limit
  }, 60000 * counter);
}

function queueWorkerProcess(value) { // Process jobs in queue and save in a text file
  console.log('value', value, new Date());
  return new Promise(resolve => {
    resolve();
  });
}
Here's a general idea. You need to keep track of how many items are in flight (being processed), both to limit the amount of memory used and to control the load on whatever resource you're storing the results in.
When you hit some limit on how many are in flight, you pause the stream. When you get back below the limit, you resume the stream. You increment a counter on .add() and decrement it on the completed event to keep track of things. That's where you pause or resume the stream.
FYI, just inserting a setTimeout() somewhere won't help you. To get your memory usage under control, you have to pause the flow of data from the stream once you have too many items in process. Then, when the count gets back under the threshold, you can resume the stream.
Here's an outline of what that could look like:
const Queue = require('bull');

const domainQueue = new Queue(config.api.crawlerQ, {
  redis: connectRedis(),
});

// counter that keeps track of how many items are in the queue
let queueCntr = 0;

// you tune this constant up or down to manage memory usage or tweak performance
// this is what keeps you from having too many requests going at once
const queueMax = 20;

function processCSV(name, fileName, options) {
  return new Promise((resolve, reject) => {
    let paused = false;
    console.log('process csv started', new Date());
    const filePath = config.api.basePath + fileName;
    const stream = fs.createReadStream(filePath)
      .on('error', (error) => {
        // handle error
        console.log('error processing csv');
        domainQueue.off('completed', completed);
        reject(error);
      })
      .pipe(csv())
      .on('data', (row) => {
        // increment() adds the job and pauses the stream when too many are in flight
        increment(row);
      })
      .on('end', () => {
        console.log('stream processCSV end', fileName, new Date());
        domainQueue.off('completed', completed);
        resolve(filePath);
      });

    function completed() {
      --queueCntr;
      // see if the queue got small enough that we can now resume the stream
      if (paused && queueCntr < queueMax) {
        stream.resume();
        paused = false;
      }
    }

    domainQueue.on('completed', completed);

    function increment(row) {
      ++queueCntr;
      domainQueue.add(row, options);
      if (!paused && queueCntr > queueMax) {
        stream.pause();
        paused = true;
      }
    }
  });
}
And, if you're calling processCSV() multiple times with different files, you should sequence them so you don't call the 2nd one until the first one is done, don't call the 3rd one until the 2nd one is done and so on... You don't show that code so we can't make a specific suggestion on that.
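For example, a minimal way to sequence the calls, assuming you have an array of file names to work through (files, name, and options here are placeholders):

// run processCSV() for one file at a time, in order
async function processAllFiles(files, name, options) {
  for (const fileName of files) {
    await processCSV(name, fileName, options); // resolves when that file's stream ends
  }
}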
I am trying to track the progress of a pipe from a read stream to write stream so I can display the progress to the user.
My original idea was to track progress when the data event is emitted as shown here:
const fs = require('fs');

let final = fs.createWriteStream('output');
fs.createReadStream('file')
  .on('close', () => {
    console.log('done');
  })
  .on('error', (err) => {
    console.error(err);
  })
  .on('data', (data) => {
    console.log("data");
    /* Calculate progress */
  })
  .pipe(final);
However, I realized that just because data was read doesn't mean it was actually written. This can be seen if the pipe is removed, as the data event still emits.
How can I track write progress when piping with Node.js?
You can use a dummy Transform stream like this:
const stream = require('stream');
const fs = require('fs');

let totalBytes = 0;
stream.pipeline(
  fs.createReadStream(from_file),
  new stream.Transform({
    transform(chunk, encoding, callback) {
      totalBytes += chunk.length;
      console.log(totalBytes);
      this.push(chunk);
      callback();
    }
  }),
  fs.createWriteStream(to_file),
  err => {
    if (err)
      ...
  }
);
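If a percentage is more useful than a raw byte count, the same dummy Transform can be combined with the file size taken up front. A small sketch (from_file and to_file are placeholders as above):

const fs = require('fs');
const stream = require('stream');

const { size } = fs.statSync(from_file); // total bytes we expect to write
let totalBytes = 0;

stream.pipeline(
  fs.createReadStream(from_file),
  new stream.Transform({
    transform(chunk, encoding, callback) {
      totalBytes += chunk.length;
      console.log(`${(totalBytes / size * 100).toFixed(2)}%`);
      callback(null, chunk); // pass the chunk through unchanged
    }
  }),
  fs.createWriteStream(to_file),
  err => {
    if (err) console.error(err);
  }
);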
You can do the piping manually, and make use of the callback from writable.write()
callback: < function > Callback for when this chunk of data is flushed
const fs = require('fs');

let from_file = `<from_file>`;
let to_file = '<to_file>';

let from_stream = fs.createReadStream(from_file);
let to_stream = fs.createWriteStream(to_file);

// get total size of the file
let { size } = fs.statSync(from_file);

let written = 0;
from_stream.on('data', data => {
  // do the piping manually here.
  to_stream.write(data, () => {
    written += data.length;
    console.log(`written ${written} of ${size} bytes (${(written / size * 100).toFixed(2)}%)`);
  });
});
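One caveat about piping by hand: writable.write() returns false when the destination's buffer is full, and the snippet above keeps reading regardless, so a big file can pile up in memory. A sketch of the same loop that respects backpressure (reusing from_stream, to_stream, written, and size from above):

from_stream.on('data', data => {
  const ok = to_stream.write(data, () => {
    written += data.length;
    console.log(`written ${written} of ${size} bytes`);
  });
  if (!ok) {
    from_stream.pause(); // destination buffer is full: stop reading
    to_stream.once('drain', () => from_stream.resume()); // resume once it empties
  }
});

from_stream.on('end', () => to_stream.end()); // close the destination when done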
Somehow I remember this thread being about memory efficiency; anyway, I've rigged up a small script that's very memory-efficient and tracks progress very well. I tested it on a 230 MB file and the result speaks for itself: https://gist.github.com/J-Cake/78ce059972595823243526e022e327e4
The sample file I used was a bit weird, as the content-length header it reported was in fact off, but the program uses no more than 4.5 MiB of memory.
I periodically have to download and parse a bunch of JSON data, about 1,000 to 1,000,000 lines.
Each request has a chunk limit of 5000, so I would like to fire off a bunch of requests at a time, stream each output through its own Transform for filtering out the key/values, and then write to a combined stream that writes its output to the database.
But with every attempt it doesn't work, or it gives errors because too many event listeners are set. What seems correct, if I understand it right, is that the 'last pipe' is always the reference for the next one in the chain.
Here is some code (it has been changed a lot of times, so it may make little sense).
The question is: is it bad practice to join multiple streams into one? Google also doesn't show a whole lot about it.
Thanks!
brokerApi/getCandles.js
// The 'combined output' stream
let passStream = new Stream.PassThrough();

countChunks.forEach(chunk => {
  let arr = [];
  let leftOver = '';
  let startFound = false;
  let lastPiece = false;
  let firstByte = false;
  let now = Date.now();

  let transformStream = this._client
    // Returns PassThrough stream
    .getCandles(instrument, chunk.from, chunk.until, timeFrame, chunk.count)
    .on('error', err => console.error(err) || passStream.emit('error', err))
    .on('end', () => {
      if (++finished === countChunks.length)
        passStream.end();
    })
    .pipe(passStream);

  transformStream._transform = function (data, type, done) {
    /** Transform to typed array **/
    this.push(/** Transformed value **/)
  }
});
Extra - Other file that 'consumes' the stream (writes to DB)
DataLayer.js
brokerApi.getCandles(instrument, timeFrame, from, until, count)
  .on('data', async (buf: NodeBuffer) => {
    this._dataLayer.write(instrument, timeFrame, buf);
    if (from && until) {
      await this._mapper.update(instrument, timeFrame, from, until, buf.length / (10 * Float64Array.BYTES_PER_ELEMENT));
    } else {
      if (buf.length) {
        if (!from)
          from = buf.readDoubleLE(0);
        if (!until) {
          until = buf.readDoubleLE(buf.length - (10 * Float64Array.BYTES_PER_ELEMENT));
          console.log('UNTIL TUNIL', until);
        }
        if (from && until)
          await this._mapper.update(instrument, timeFrame, from, until, buf.length / (10 * Float64Array.BYTES_PER_ELEMENT));
      }
    }
  })
  .on('end', () => {
    winston.info(`Cache: Fetching ${instrument} took ${Date.now() - now} ms`);
    resolve()
  })
  .on('error', reject)
Check out the stream helpers from highlandjs, e.g. (untested, pseudo code):
function getCandle(candle) {...}
_(chunks).map(getCandle).parallel(5000).pipe(...)
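Without a library, the merge can also be done with plain Node streams. The trick is to tell each .pipe() not to end the shared PassThrough and to end it yourself once every source has finished. A rough sketch (sources stands in for the per-chunk streams returned by getCandles):

const { PassThrough } = require('stream');

// merge several readable streams into one combined output stream
function mergeStreams(sources) {
  const combined = new PassThrough();
  let remaining = sources.length;

  for (const source of sources) {
    source.on('error', err => combined.emit('error', err));
    source.on('end', () => {
      if (--remaining === 0) combined.end(); // end only after the last source ends
    });
    source.pipe(combined, { end: false }); // keep the shared stream open
  }

  return combined;
}

If the "too many listeners" warning still shows up with a large number of chunks, combined.setMaxListeners(0) raises the limit; the warning comes from each .pipe() attaching its own listeners to the shared destination.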
I am trying to figure out how to run async functions in parallel (in this case 10 at a time), based on a stream of data parsed from a website using the lapwinglabs/x-ray web scraper.
let pauser = new Rx.Subject()
let count = 0
let max = 10

// function that parses a single url to retrieve data
// returns an Observable
let parsing_each_link = url => {
  return Rx.Observable.create(
    observer => {
      xray(url, selector)((err, data) => {
        if (err) observer.onError(err)
        observer.onNext(data)
        observer.onCompleted()
      })
    })
}

// retrieve all the urls from a main page => node stream
let streamNode = xray(main_url, selector)
  .paginate(some_selector)
  .write()
  .pipe(JSONStream.parse('*'))

// convert node stream to RxJS
let streamRx = RxNode.fromStream(streamNode)
  .do(() => {
    if (count === max) {
      pauser.onNext(true)
      count = 0
    }
  })
  .do(() => count++)
  .buffer(pauser) // take only 10 urls at a time

streamRx.subscribe(
  ten_urls => {
    Rx.Observable.forkJoin(
      ten_urls.map(url => parsing_each_link(url))
    )
    .subscribe(
      x => console.log("Next : ", JSON.stringify(x, null, 4))
    )
  }
)
The Next callback with the last console.log is never called?!
Impossible to say for sure, but if you can make sure that ten_urls are emitted as expected, then the next step is to make sure that the observable parsing_each_link does complete, as forkJoin will wait for the last value of each of its source observables.
I could not see any call to observer.onComplete in your code.
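For what it's worth, parsing_each_link in the question keeps going after an error: it calls observer.onNext and observer.onCompleted even when it has already called onError. A sketch of the factory with an early return, so that forkJoin sees either a clean error or exactly one value followed by completion (RxJS 4 API, as in the question):

let parsing_each_link = url => {
  return Rx.Observable.create(observer => {
    xray(url, selector)((err, data) => {
      if (err) {
        observer.onError(err); // signal the error...
        return;                // ...and stop: don't emit after an error
      }
      observer.onNext(data);
      observer.onCompleted();
    });
  });
};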