I am trying to read some large CSV files and process the data, but there is a rate limit on the processing, so I want to add a one-minute delay between requests. I tried setTimeout, but it turns out there is a limit on the delay value, and I get the following error. I am not sure how else to handle this; the CSV file has more than 1M records. Am I doing anything wrong here?
Error:
(node:41) TimeoutOverflowWarning: 2241362000 does not fit into a 32-bit signed integer.
Timeout duration was set to 1.
Sample code:
const Queue = require('bull');

const domainQueue = new Queue(config.api.crawlerQ, {
    redis: connectRedis(),
});

let ctr = 0;

function processCSV(name, fileName, options) {
    return new Promise((resolve, reject) => {
        console.log('process csv started', new Date());
        let filePath = config.api.basePath + fileName;
        stream = fs.createReadStream(filePath)
            .on('error', (error) => {
                // handle error
                console.log('error processing csv');
                reject(error);
            })
            .pipe(csv())
            .on('data', async (row) => {
                ctr++;
                increment(row, ctr);
            })
            .on('end', () => {
                console.log('stream processCSV end', fileName, new Date());
                resolve(filePath);
            });
    });
}

async function increment(raw, counter) {
    setTimeout(async function () {
        console.log('say i am inside a function', counter, new Date());
        // add jobs to the queue - here I need a delay of about 1 minute;
        // adding jobs without a delay hits the rate limit
        domainQueue.add(raw, options);
    }, 60000 * counter);
}

function queueWorkerProcess(value) { // process jobs in the queue and save to a text file
    console.log('value', value, new Date());
    return new Promise(resolve => {
        resolve();
    });
}
Here's the general idea. You need to keep track of how many items are in flight (currently being processed), both to limit memory use and to control the load on whatever resource you're storing the results in.
When you hit some limit on how many are in flight, you pause the stream; when you drop back below the limit, you resume it. You increment a counter on .add() and decrement it on the completed event, and that's where you decide whether to pause or resume.
FYI, just inserting a setTimeout() somewhere won't help. To keep memory usage under control, you have to pause the flow of data from the stream once too many items are in process, and resume it when the count drops back under the threshold.
Here's an outline of what that could look like:
const Queue = require('bull');

const domainQueue = new Queue(config.api.crawlerQ, {
    redis: connectRedis(),
});

// counter that keeps track of how many items are in the queue
let queueCntr = 0;

// tune this constant up or down to manage memory usage or tweak performance;
// this is what keeps you from having too many requests going at once
const queueMax = 20;

function processCSV(name, fileName, options) {
    return new Promise((resolve, reject) => {
        let paused = false;
        console.log('process csv started', new Date());
        const filePath = config.api.basePath + fileName;
        const stream = fs.createReadStream(filePath)
            .on('error', (error) => {
                // handle error
                console.log('error processing csv');
                domainQueue.off('completed', completed);
                reject(error);
            })
            .pipe(csv())
            .on('data', (row) => {
                increment(row);
            })
            .on('end', () => {
                console.log('stream processCSV end', fileName, new Date());
                domainQueue.off('completed', completed);
                resolve(filePath);
            });

        // a queued job finished; see if the queue got small enough to resume the stream
        function completed() {
            --queueCntr;
            if (paused && queueCntr < queueMax) {
                stream.resume();
                paused = false;
            }
        }

        domainQueue.on('completed', completed);

        // add a row to the queue; pause the stream if too many jobs are in flight
        function increment(row) {
            ++queueCntr;
            domainQueue.add(row, options);
            if (!paused && queueCntr > queueMax) {
                stream.pause();
                paused = true;
            }
        }
    });
}
And, if you're calling processCSV() multiple times with different files, you should sequence them: don't call the 2nd one until the 1st is done, don't call the 3rd until the 2nd is done, and so on. You don't show that code, so we can't make a specific suggestion there, but a rough sketch is below.
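Assuming the file names are known up front (processAllFiles is a hypothetical wrapper), the sequencing could look like this:
// hypothetical wrapper: process files strictly one after another
async function processAllFiles(fileNames, options) {
    for (const fileName of fileNames) {
        // wait for the previous file to finish before starting the next one
        await processCSV('someName', fileName, options);
    }
}

processAllFiles(['a.csv', 'b.csv'], options).catch(console.error);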
I am trying to use a Twitter npm package to search for tweets in real time and like them. It streams the tweet data and then uses .post to create the likes.
It currently works, but I keep running into 429 Too Many Requests errors because of the API rate limit. I've been trying to get it to pause after each like, but nothing I've tried seems to work. At most it affects the loop before or after, but never in between the post/like actions.
Any ideas how to get it to delay after each post (like)? I've commented out some of the things I've already tried.
// Returns a Promise that resolves after "ms" milliseconds
const timer = ms => new Promise(res => setTimeout(res, ms));

const wait = (duration, ...args) => new Promise(resolve => {
    setTimeout(resolve, duration, ...args);
});

function LikeTweets() {
    client.stream('statuses/filter', { track: terms }, function (stream) {
        stream.on('data', async function (tweet) {
            // try {
            //     for (var i = 0; i < 3;) {
            v1Client.post('favorites/create', { id: tweet.id_str })
                .then(async (result) => {
                    console.log(result.text);
                    i++;
                    console.log(i);
                    await timer(10000);
                }).then(async (newresult) => {
                    console.log(newresult);
                    await timer(10000);
                }).catch(error => {
                    console.log(error);
                    return;
                });
            // await timer(3000); // then the created Promise can be awaited
            //     }
            // } catch (err) {
            //     console.log("or catching here?");
            //     setTimeout(function() {
            //         LikeTweets();
            //     }, 15000);
            // }
        });
    });
}

setTimeout(function () {
    LikeTweets();
}, 15000);
You make one request per invocation of the stream.on("data", ...) event handler, so if 100 data events arrive within a minute, you will make 100 requests within that minute, which exceeds the rate limit.
You must ensure that the sequence of requests made is slower than the sequence of incoming events. The following code illustrates how this decoupling of sequences can be achieved:
/* Make one request every 20 seconds. */
var requestQueue = [];
function processQueue() {
    var r = requestQueue.shift();
    if (r) v1Client.post("favorites/create", r.payload).then(r.resolve, r.reject);
    setTimeout(processQueue, 20000);
}
processQueue();

/* Use this function to schedule another request. */
function makeRequest(payload) {
    var r = {payload};
    requestQueue.push(r);
    return new Promise(function(resolve, reject) {
        r.resolve = resolve;
        r.reject = reject;
    });
}

stream.on("data", function(tweet) {
    makeRequest({id: tweet.id_str}).then(async (result) => {...});
});
The promise returned by makeRequest can take a while to resolve, so the code {...} may execute only after several seconds or even minutes. In other words, the code uses promises to stay within the strictures of the API rate limit.
This works only if, on average over the long run, the rate of incoming data events does not exceed the possible rate of outgoing requests, which is one every 20 seconds. There is no way around that without a mass-update API (which would not be in the interest of the Twitter community, I assume).
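For completeness, wiring this into the question's LikeTweets function could look roughly like this (the {...} body above is replaced with a simple log; client, terms, and tweet.id_str come from the question's code):
function LikeTweets() {
    client.stream('statuses/filter', { track: terms }, function (stream) {
        stream.on('data', function (tweet) {
            // enqueue the like; it is sent at most once every 20 seconds
            makeRequest({ id: tweet.id_str })
                .then(result => console.log(result.text))
                .catch(error => console.log(error));
        });
    });
}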
I have an open WebSocket connection and it's handing out events. All good, but once a new event arrives I need to do a whole lot of things, and sometimes events arrive so quickly one after the other that there is no time to get the work done properly. I need some sort of queue inside this function that tells the events to take it easy, processing at most one per second, and otherwise making them wait in a queue until the second elapses before continuing.
Edit: no external libraries allowed, unfortunately.
ws = new WebSocket(`wss://hallo.com/ws/`);
ws.onmessage = readMessage;

async function readMessage(event) {
    print(event);
    // do important things
    // but not too frequently!
}
How do I do that?
I found this but it goes over my simple head:
"You can have a queue-like promise that keeps on accumulating promises to make sure they run sequentially:
let cur = Promise.resolve();
function enqueue(f) {
    cur = cur.then(f);
}

function someAsyncWork() {
    return new Promise(resolve => {
        setTimeout(() => {
            resolve('async work done');
        }, 5);
    });
}

async function msg() {
    const msg = await someAsyncWork();
    console.log(msg);
}

const main = async () => {
    web3.eth.subscribe('pendingTransactions').on("data", function (tx) {
        enqueue(async function () {
            console.log('1st print: ', tx);
            await msg();
            console.log('2nd print: ', tx);
        });
    });
};

main();
"
I'd honestly use something like lodash's throttle to do this. The following snippet should solve your problem.
ws = new WebSocket(`wss://hallo.com/ws/`);
ws.onmessage = _.throttle(readMessage, 1000);

async function readMessage(event) {
    print(event);
    // do important things
    // but not too frequently!
}
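Since the question rules out external libraries, a minimal hand-rolled throttle along the same lines could look like the sketch below. Note that a throttle drops events that arrive during the cooldown rather than queuing them:
// simple throttle: invoke fn at most once per `limit` milliseconds,
// ignoring calls that arrive while the cooldown is active
function throttle(fn, limit) {
    let cooling = false;
    return function (...args) {
        if (cooling) return;
        cooling = true;
        setTimeout(() => { cooling = false; }, limit);
        return fn.apply(this, args);
    };
}

ws.onmessage = throttle(readMessage, 1000);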
To achieve queuing, you can make use of setTimeout in plain/core JavaScript.
Whenever you receive a message from the WebSocket, put the message-processing function inside a setTimeout. This ensures the message is processed not immediately as it is received but with a delay, so in a way you can achieve queuing.
The problem with this is that it does not guarantee the messages are processed sequentially in the order they are received, if that is needed.
By default, setTimeout in JavaScript does not guarantee exactly when the callback will run; it only guarantees that it will not run before the given time has elapsed.
Also, it may not reduce the load on your message processor in a high-volume situation, since each message gets its own setTimeout and two or more callbacks can become ready to run within the same time frame.
A better way is to create an actual queue. At a high level, the code flow can look like this:
var queue = [];

function getFromQueue() {
    return queue.shift();
}

function insertQueue(msg) { // called whenever a new message arrives
    queue.push(msg);
    console.log("Queue state", queue);
}

// can be used if one does not want to wait for previous message processing to finish
// (function executorService(){
//     setTimeout(async () => {
//         const data = getFromQueue();
//         await processData(data);
//         executorService();
//     }, 1000)
// })()

(function executorService() {
    return new Promise((res, rej) => {
        setTimeout(async () => {
            const data = getFromQueue();
            console.log("Started processing", data);
            const resp = await processData(data); // waiting for async processing of the message to finish
            res(resp);
        }, 2000);
    }).then((data) => {
        console.log("Successfully processed event", data);
    }).catch((err) => {
        console.log(err);
    }).finally(() => {
        executorService();
    });
})();

// to simulate async processing of messages
function processData(data) {
    return new Promise((res, rej) => {
        setTimeout(async () => {
            console.log("Finished processing", data);
            res(data);
        }, 4000);
    });
}

// to simulate messages received by the web socket
var i = 0;
var insertRand = setInterval(function () {
    insertQueue(i); // this must be called when a web socket message is received
    i += 1;
}, 1000);
I want to test how many requests I can do and get the total time elapsed. My Promise function:
async execQuery(response, query) {
    let request = new SQL.Request();
    return new Promise((resolve, reject) => {
        request.query(query, (error, result) => {
            if (error) {
                reject(error);
            } else {
                resolve(result);
            }
        });
    });
}
And my API:
app.get('/api/bookings/:uid', (req, res) => {
    let st = new stopwatch();
    let id = req.params.uid;
    let query = `SELECT * FROM booking.TransactionDetails WHERE UID='${id}'`;
    for (let i = 0; i < 10000; i++) {
        st.start();
        db.execQuery(res, query);
    }
});
I can't stop the for loop since it's async, and I also don't know how to stop executing the remaining calls after the first one rejects, so that I can get the counter and the elapsed time of all successful promises. How can I achieve that?
You can easily create a composable wrapper for this, or a subclass:
Inheritance:
class TimedPromise extends Promise {
    // make .then()/.catch() return plain Promises so the .then() call in the
    // constructor below does not recursively construct more TimedPromises
    static get [Symbol.species]() {
        return Promise;
    }
    constructor(executor) {
        super(executor);
        this.startTime = performance.now(); // or Date.now()
        let end = () => this.endTime = performance.now();
        this.then(end, end); // replace with finally when available
    }
    get time() {
        return this.endTime - this.startTime; // time in milliseconds it took
    }
}
Then you can use methods like:
TimedPromise.all(promises);
TimedPromise.race(promises);

var foo = new TimedPromise(resolve => setTimeout(resolve, 100));
let res = await foo;
console.log(foo.time); // how long foo took
Plus then chaining would work, async functions won't (since they always return native promises).
Composition:
function time(promise) {
    var startTime = performance.now(), endTime;
    let end = () => endTime = performance.now();
    promise.then(end, end); // replace with finally when appropriate
    return () => endTime - startTime;
}
Then usage is:
var foo = new Promise(resolve => setTimeout(resolve, 100));
var timed = time(foo);
await foo;
console.log(timed()); // how long foo took
This has the advantage of working everywhere, but the disadvantage that you have to manually time every promise. I prefer this approach for its explicitness and arguably nicer design.
As a caveat, since a rejection handler is attached, you have to be 100% sure you add your own .catch or then handler; otherwise the error will not be logged to the console.
Wouldn't this work in your promise?
var time = Date.now();
new Promise((resolve, reject) => {
    request.query(query, (error, result) => {
        if (error) {
            reject(error);
        } else {
            resolve(result);
        }
    });
}).then(function (r) {
    // code
    console.log('it took : ', Date.now() - time);
}).catch(function (e) {
    console.log('it took : ', Date.now() - time);
});
Or put the .then and .catch after your db.execQuery() call
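For example, a rough sketch based on the route from the question (how you respond to the client is up to you; the res.json call here is only illustrative):
app.get('/api/bookings/:uid', (req, res) => {
    const id = req.params.uid;
    const query = `SELECT * FROM booking.TransactionDetails WHERE UID='${id}'`;
    const started = Date.now();
    db.execQuery(res, query)
        .then(result => {
            console.log('query took:', Date.now() - started, 'ms');
            res.json(result); // illustrative; respond however you need
        })
        .catch(err => {
            console.log('query failed after:', Date.now() - started, 'ms');
            res.status(500).send(err.message);
        });
});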
You made two comments indicating you want to stop all ongoing queries when a promise fails, but you don't mention what SQL is or whether request.query is something you can cancel.
In your for loop you already ran all the request.query statements; if you want to run one query and then the next, you have to do request.query(query).then(() => request.query(query)).then(...), but it will take longer because you don't start them all at once.
Here is code that will tell you how long all the queries took, but I think you should tell us what SQL is so we can figure out how to set up connection pooling and caching (probably the biggest performance gain).
// removed the async: this function does not await anything,
// so there is no need for it to be async
// removed initializing request: you can reuse the one created in
// the run function; that may shave some time off the total runtime,
// but I'm not sure whether a request can share connections - in that case
// it's better to create a couple and pass them along as their
// connection becomes available (connection pooling)
const execQuery = (response, query, request) =>
    new Promise((resolve, reject) =>
        request.query(query, (error, result) =>
            error
                ? reject(error)
                : resolve(result)
        )
    );

// save failed queries and resolve them with a Fail object
const Fail = function (detail) { this.detail = detail; };

// let request = new SQL.Request();
const run = (numberOfTimes) => {
    const start = new Date().getTime();
    const request = new SQL.Request();
    Promise.all(
        // build an array of promises, one per query; failures resolve with a Fail
        // object instead of rejecting, so Promise.all never short-circuits
        Array.from({ length: numberOfTimes }, (_, i) => {
            const query = `SELECT * FROM booking.TransactionDetails WHERE UID='${i}'`;
            return execQuery(null, query, request) // the response argument is unused
                .then(
                    result => [result, query],
                    err => new Fail([err, query])
                );
        })
    ).then(
        results => {
            const totalRuntime = new Date().getTime() - start;
            const failed = results.filter(r => (r && r.constructor) === Fail);
            console.log(`Total runtime in ms:${totalRuntime}
Failed:${failed.length}
Succeeded:${results.length - failed.length}`);
        }
    );
};

// start the whole thing with:
run(10000);
I periodically have to download/parse a bunch of JSON data, about 1,000 to 1,000,000 lines.
Each request has a chunk limit of 5000, so I would like to fire off a bunch of requests at a time, stream each output through its own Transformer to filter out the key/values, and then write to a combined stream that writes its output to the database.
But every attempt either doesn't work or gives errors because too many event listeners are set, which seems expected if I understand correctly that the 'last pipe' is always the reference used next in the chain.
Here is some code (I changed it a lot of times, so it may make little sense).
The question is: is it bad practice to join multiple streams into one? Google also doesn't show a whole lot about it.
Thanks!
brokerApi/getCandles.js
// The 'combined output' stream
let passStream = new Stream.PassThrough();

countChunks.forEach(chunk => {
    let arr = [];
    let leftOver = '';
    let startFound = false;
    let lastPiece = false;
    let firstByte = false;
    let now = Date.now();

    let transformStream = this._client
        // Returns a PassThrough stream
        .getCandles(instrument, chunk.from, chunk.until, timeFrame, chunk.count)
        .on('error', err => console.error(err) || passStream.emit('error', err))
        .on('end', () => {
            if (++finished === countChunks.length)
                passStream.end();
        })
        .pipe(passStream);

    transformStream._transform = function (data, type, done) {
        /** Transform to typedArray **/
        this.push(/** Transformed value **/);
    };
});
Extra - Other file that 'consumes' the stream (writes to DB)
DataLayer.js
brokerApi.getCandles(instrument, timeFrame, from, until, count)
    .on('data', async (buf: NodeBuffer) => {
        this._dataLayer.write(instrument, timeFrame, buf);
        if (from && until) {
            await this._mapper.update(instrument, timeFrame, from, until, buf.length / (10 * Float64Array.BYTES_PER_ELEMENT));
        } else {
            if (buf.length) {
                if (!from)
                    from = buf.readDoubleLE(0);
                if (!until) {
                    until = buf.readDoubleLE(buf.length - (10 * Float64Array.BYTES_PER_ELEMENT));
                    console.log('UNTIL TUNIL', until);
                }
                if (from && until)
                    await this._mapper.update(instrument, timeFrame, from, until, buf.length / (10 * Float64Array.BYTES_PER_ELEMENT));
            }
        }
    })
    .on('end', () => {
        winston.info(`Cache: Fetching ${instrument} took ${Date.now() - now} ms`);
        resolve();
    })
    .on('error', reject);
Check out the stream helpers from highlandjs, e.g. (untested, pseudo code):
function getCandle(candle) {...}
_(chunks).map(getCandle).parallel(5000).pipe(...)
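A slightly fuller (still untested) sketch of the same idea, assuming each chunk request returns a Node readable stream, dbWriteStream is a hypothetical writable stream into your database, and client, instrument, timeFrame, and countChunks come from the question's code:
const _ = require('highland');

// hypothetical: one readable stream of candles per chunk
const getCandle = chunk =>
    client.getCandles(instrument, chunk.from, chunk.until, timeFrame, chunk.count);

_(countChunks)                         // wrap the array of chunks in a highland stream
    .map(chunk => _(getCandle(chunk))) // a stream of streams, one per request
    .parallel(5)                       // consume several at a time, preserving order
    .pipe(dbWriteStream);              // one combined output stream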
I am trying to do the following:
Stream a csv file in line by line.
Modify the data contained in each line.
Once all lines are streamed and processed, finish and move on to next task.
The problem is .on("end") fires before .on("data") finishes processing each line. How can I get .on("end") to fire after .on("data") has finished processing all the lines?
Below is a simple example of what I am talking about:
import fs from 'fs';
import parse from 'csv-parse';

var parser = parse({});

fs.createReadStream(this.upload.location)
    .pipe(parser)
    .on("data", line => {
        var num = Math.floor((Math.random() * 100) + 1);
        num = num % 3;
        num = num * 1000;
        setTimeout(() => {
            console.log('data process complete');
        }, num);
    })
    .on("end", () => {
        console.log('Done: parseFile');
        next(null);
    });
Thanks in advance.
I think the issue is the setTimeout (or any other async task) inside the data event listener. end does fire after data, but the async task causes messages to be logged even after the stream has fired end.
If you take out the setTimeout, you'll see that all the data messages are logged before end. You can still perform async tasks, but there can be a batch of them that runs after the stream has ended.
This code helps explain what is going on:
const fs = require('fs')

const testFileName = 'testfile.txt'
fs.writeFileSync(testFileName, '123456789')

let count = 0

const readStream = fs.createReadStream(testFileName, {
    encoding: 'utf8',
    highWaterMark: 1 // low highWaterMark so we can have more chunks to observe
})

readStream.on('data', (data) => {
    console.log('+++++++++++processing sync+++++++++++++')
    console.log(data)
    console.log('+++++++++++end processing sync+++++++++++++')
    setTimeout(() => {
        console.log('-----------processing async-------------')
        console.log(data)
        console.log('-----------end processing async-------------')
    }, ++count * 1000)
})

readStream.on('end', () => {
    console.log('stream ended but still have async tasks doing their thing')
    fs.unlinkSync(testFileName)
})
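If the end-of-stream step needs to wait for the per-line async work, one option (a sketch: processLine stands in for whatever async work you do per line, and the file path is a placeholder) is to collect the pending promises and await them in the end handler:
import fs from 'fs';
import parse from 'csv-parse';

// hypothetical async work for one line
const processLine = line =>
    new Promise(resolve => setTimeout(() => resolve(line), 100));

const pending = [];

fs.createReadStream('somefile.csv') // placeholder path
    .pipe(parse({}))
    .on('data', line => {
        // keep a handle on every in-flight task
        pending.push(processLine(line));
    })
    .on('end', async () => {
        // wait for all per-line work to finish before moving on
        await Promise.all(pending);
        console.log('Done: parseFile');
        // next(null); // from the question's code, if needed
    });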