fs.readFile is very slow, am I making too many requests? - javascript

node.js beginner here:
A node.js application scrapes an array of links (linkArray) from a list of ~30 urls.
Each domain/url has a corresponding (name).json file that is used to check whether the scraped links are new or not.
All pages are fetched, links are scraped into arrays, and then passed to:
function checkLinks(linkArray, name){
  console.log(name, "checkLinks");
  fs.readFile('json/'+name+'.json', 'utf8', function readFileCallback(err, data){
    if(err && err.errno != -4058) throw err;
    if(err && err.errno == -4058){
      console.log(name+'.json', " is NEW .json");
      compareAndAdd(linkArray, {linkArray: []}.linkArray, name);
    }
    else{
      //file EXISTS
      compareAndAdd(linkArray, JSON.parse(data).linkArray, name);
    }
  });
}
compareAndAdd() reads:
function compareAndAdd(arrNew, arrOld, name){
  console.log(name, "compareAndAdd()");
  if(!arrOld) var arrOld = [];
  if(!arrNew) var arrNew = [];
  //compare and remove dups
  function hasDup(value) {
    for (var i = 0; i < arrOld.length; i++)
      if(value.href == arrOld[i].href)
        if(value.text.length <= arrOld[i].text.length) return false;
    arrOld.push(value);
    return true;
  }
  var rArr = arrNew.filter(hasDup);
  //update existing array;
  if(rArr.length > 0){
    fs.writeFile('json/'+name+'.json', JSON.stringify({linkArray: arrOld}), function (err) {
      if (err) return console.log(err);
      console.log(" "+name+'.json UPDATED');
    });
  }
  else console.log(" "+name, "no changes, nothing to update");
  return;
}
checkLinks() is where the program hangs; it's unbelievably slow. I understand that fs.readFile is being hit multiple times a second, but fewer than 30 hits seems pretty trivial to me, given that this kind of function is meant to serve data to (potentially) millions of users. Am I expecting too much from fs.readFile, or (more likely) is there another component (like writeFile, or something else entirely) that's locking everything up?
supplemental:
using write/readFileSync creates a lot of problems: this program is inherently async because it begins with requests to external websites with widely varying response times, and reads/writes would frequently collide. The functions above ensure that writing to a given file only happens after it has been read (though it is very slow).
Also, this program does not exit on its own, and I do not know why.
edit
I've reworked the program to read first and then write synchronously at the end, and the process is down to ~12 seconds. Apparently fs.readFile was getting hung when called multiple times. I don't understand when/how to use asynchronous fs if multiple calls hang the function.

All async fs operations are executed inside the libuv thread pool, which has a default size of 4 (this can be changed by setting the UV_THREADPOOL_SIZE environment variable to something different). If all threads in the thread pool are busy, any further fs operations will be queued up.
I should also point out that fs is not the only module that uses the thread pool: dns.lookup() (the default hostname resolution method used internally by node), the async zlib methods, crypto.randomBytes(), and a couple of other things IIRC also use the libuv thread pool. This is just something to keep in mind.
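As a hedged illustration of raising that limit (the script name app.js and the size of 8 are just examples): libuv creates the thread pool lazily, so the variable must be set before the first operation that touches the pool, e.g. at the very top of the entry script or in the shell that launches node.
// app.js - set this before the first async fs/dns/crypto call,
// because libuv reads it when the pool is first created (default size: 4)
process.env.UV_THREADPOOL_SIZE = 8;

const fs = require('fs');
fs.readFile('json/example.json', 'utf8', function (err, data) {
  // with a larger pool, more readFile/stat/etc. calls can run concurrently
});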

If you read many files (checkLinks) in a loop, ALL the fs.readFile calls are issued first, and only AFTER that are the callbacks processed (callbacks run only once the main call stack is empty). This leads to a noticeable startup delay, but don't worry about that.
You mention that the program never exits. So keep a counter: increment it for each call to checkLinks and decrement it when a callback fires. Inside the callback, check the counter against 0 and then run your finalizing logic (I suspect this could be sending the response to the http request).
Actually, it doesn't matter much whether you use the async or the sync version; they will take roughly the same amount of time.
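A minimal sketch of that counter idea, assuming checkLinks is called once per scraped url (pending, urlList and allDone are placeholder names, not from the original code):
var pending = urlList.length;            // one outstanding checkLinks call per url

function checkLinks(linkArray, name){
  fs.readFile('json/'+name+'.json', 'utf8', function (err, data){
    // ... the existing compareAndAdd logic goes here ...
    pending--;
    if (pending === 0) allDone();        // the last callback has fired
  });
}

function allDone(){
  console.log('all files checked');      // finalizing logic (e.g. send the http response)
}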

Related

Synchronization in Javascript & Socket.io

I have a Node.js server running the following code, which uses the socket.io library:
var Common = require('./common');
var _ = require('lodash');

var store = (function() {
  var userMapByUserId = {};
  var tokenSet = {};

  var generateToken = function(userId) {
    if (_.has(userMapByUserId, userId)) {
      var token = '';
      do {
        token = Common.randomGenerator();
      } while (_.has(tokenSet, token));
      if (userMapByUserId[userId].tokens === undefined) {
        userMapByUserId[userId].tokens = {};
      }
      userMapByUserId[userId].tokens[token] = true;
    }
  };

  var deleteToken = function(userId, tokenValue) {
    if (_.has(userMapByUserId, userId)) {
      if (userMapByUserId[userId].tokens !== undefined) {
        userMapByUserId[userId].tokens = _.omit(userMapByUserId[userId].tokens, tokenValue);
      }
      if (_.has(tokenSet, tokenValue)) {
        tokenSet = _.omit(tokenSet, tokenValue);
      }
    }
  };

  return {
    generateToken: generateToken,
    deleteToken: deleteToken
  };
}());

module.exports = function(socket, io) {
  socket.on('generateToken', function(ownerUser) {
    store.generateToken(ownerUser.userId);
    io.sockets.emit('userList', {
      userMapByUserId: store.getUserList()
    });
  });

  socket.on('deleteToken', function(token) {
    store.deleteToken(token.userId, token.tokenValue);
    io.sockets.emit('userList', {
      userMapByUserId: store.getUserList()
    });
  });
};
So, basically we can have multiple clients sending requests to this server to add/remove tokens. Do I need to worry about synchronization and race conditions?
I don't see any concurrency issues in the code you've shown.
The Javascript in node.js is single threaded and works via an event queue. One thread of execution runs until it completes and then the JS engine fetches the next event waiting to run and runs that thread of execution until it completes. As such, there is no pre-emptive concurrency in Javascript like you might have to worry about in a language or situation using threads.
There are still some places in node.js where you can get opportunities for concurrency issues. This can happen if you are using asynchronous operations in your code (like async IO to a socket or disk). In that case, your thread of execution runs to completion, but your async operation is still running and has not finished yet and has not called its callback yet. At this point, some other event can get processed and other code can run. So, if your async callback refers to state that could get changed by other event handlers, then you could have concurrency issues.
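As a tiny illustration of that failure mode (the counter and file name below are made up for the example): two handlers each read shared state, wait on async I/O, and then write it back, so their updates can interleave and one increment is lost.
var fs = require('fs');
var counter = 0;                          // shared state

function handleRequest() {
  var seen = counter;                     // read before the async gap
  fs.readFile('stats.json', function () {
    counter = seen + 1;                   // write back after the async gap
  });
}

// if two requests arrive close together, both read the same `seen`
// value before either callback runs, and the final counter is off by one
handleRequest();
handleRequest();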
In the two socket event handlers you disclosed in your question, I don't see any asynchronous callbacks within the code running in the event handlers. If that's the case, then there are no opportunities for concurrency there. Even if there were concurrency opportunities, but the separate pieces of code weren't using/modifying the same underlying data, then you would still be OK.
To give you an example of something that can cause a concurrency issue, I had a node.js app running on a Raspberry Pi. Every 10 seconds it would read a couple of digital temperature sensors and record the data in an in-memory data structure. Every few hours, it would write the data out to disk. The process of writing the data out to disk was done with a series of async disk IO operations. Each async disk IO operation has a callback and the code continued only within that callback (technically I was using promises to manage the callbacks, but it's the same concept). This created a concurrency issue: if the timer that records new temperatures fired while I was in the middle of one of these async disk operations, the data structure could get changed in the middle of being written to disk, which could cause me problems.
It wasn't that the async operation would get interrupted in the middle of its disk write (node.js is not pre-emptive in that way), but because the whole operation of writing to disk consisted of many separate async file writes, other events could get processed between the separate async file writes. If those events could mess with your data, then it could create a problem.
My solution was to set a flag when I started writing the data to disk (then clear the flag when done) and if I wanted to add something to the data structure while that flag was set, then the new data would go into a queue and would be processed later so the core data structure was not changed while I was writing it to disk. I was able to implement this entirely in a few methods that were used for modifying the data so that the calling code didn't even need to know anything different was going on.
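A rough sketch of that flag-and-queue idea (the names, the data object, and the writeAllChunks helper are illustrative stand-ins, not the original Raspberry Pi code):
var data = { readings: [] };     // the in-memory structure being saved
var writingToDisk = false;       // set while the multi-step async save runs
var pendingReadings = [];        // readings that arrive mid-save wait here

function addReading(reading) {
  if (writingToDisk) {
    pendingReadings.push(reading);         // defer the mutation
  } else {
    data.readings.push(reading);           // safe to modify directly
  }
}

function saveToDisk(done) {
  writingToDisk = true;
  writeAllChunks(data, function (err) {    // stand-in for the series of async writes
    writingToDisk = false;
    // apply whatever arrived while the save was in progress
    pendingReadings.forEach(function (r) { data.readings.push(r); });
    pendingReadings = [];
    done(err);
  });
}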
Though this answer was written for Ajax in a browser, the event queue concept is the same in node.js so you may find it helpful. Note that node.js is called "event IO for V8 Javascript" for a reason (because it runs via an event queue).

Javascript (node.js) capped number of child processes

Hopefully I can describe what I'm looking for clearly enough. I'm working with Node and Python.
I'm trying to run a number of child processes (.py scripts, using child_process.exec()) in parallel, but no more than a specified number at a time (say, 2). I receive an unknown number of requests in batches (say this batch has 3 requests). I'd like to stop spawning processes until one of the current ones finishes.
for (var i = 0; i < requests.length; i++) {
  //code that would ideally block execution for a moment
  while (active_pids.length == max_threads){
    console.log("Waiting for more threads...");
    sleep(100)
    continue
  };
  //code that needs to run if threads are available
  active_pids.push(i);
  cp.exec('python python-test.py '+ requests[i], function(err, stdout){
    console.log("Data processed for: " + stdout);
    active_pids.shift();
    if (err != null){
      console.log(err);
    }
  });
}
I know that the while loop doesn't work; it was my first attempt.
I'm guessing there's a way to do this with
setTimeout(someSpawningFunction(){
  if (active_pids.length == max_threads){
    return
  } else {
    //spawn process?
  }
},100)
But I can't quite wrap my head around it.
Or maybe
waitpid(-1)
Inserted into the for loop above, in an if statement in place of the while loop? However, I can't get the waitpid() module to install at the moment.
And yes, I understand that blocking execution is considered very bad in JS, but in my case, I need it to happen. I'd rather avoid external cluster manager-type libraries if possible.
Thanks for any help.
EDIT/Partial Solution
An ugly hack would be to use the answer from this SO question (execSync()). But that would block the loop until the LAST child finished. That's my plan so far, but it's not ideal.
async.timesLimit from the async library is the perfect tool to use here. It allows you to asynchronously run a function n times, but run a maximum of k of those function calls in parallel at any given time.
async.timesLimit(requests.length, max_threads, function(i, next){
  cp.exec('python python-test.py '+ requests[i], function(err, stdout){
    console.log("Data processed for: " + stdout);
    if (err != null){
      console.log(err);
    }
    // this task is resolved
    next(null, stdout);
  });
}, function(err, stdoutArray) {
  // this runs after all processes have run; what's next?
});
Or, if you want errors to be fatal and stop the loop, call next(err, stdout).
You can just maintain a queue of external processes waiting to run and a counter for how many are currently running. The queue would simply contain an object for each process that had properties containing the data you need to know what process to run. You can just use an array of those objects for the queue.
Whenever you get a new request to run an external process, you add it to the queue and then you start up external processes increasing your counter each time you start one until your counter hits your max number.
Then, while monitoring those external processes, whenever one finishes, you decrement the counter and if your queue of tasks waiting to run is not empty, you start up another one and increase the counter again.
The async library has this type of functionality built in (running a specific number of operations at a time), though it isn't very difficult to implement yourself with a queue and a counter. The key is that you just have to hook into the completion event for your external process so you can maintain the counter and start up any new tasks that are waiting.
There is no reason to need to use synchronous or serial execution or to block in order to achieve your goal here.
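A hedged sketch of that queue-plus-counter approach for the python-test.py case above (queue, running and maxRunning are names invented for the example):
var cp = require('child_process');

var queue = [];            // requests waiting for a free slot
var running = 0;           // child processes currently alive
var maxRunning = 2;        // the cap

function enqueue(request) {
  queue.push(request);
  runNext();
}

function runNext() {
  if (running >= maxRunning || queue.length === 0) return;
  running++;
  var request = queue.shift();
  cp.exec('python python-test.py ' + request, function (err, stdout) {
    if (err) console.log(err);
    else console.log("Data processed for: " + stdout);
    running--;
    runNext();             // a slot just freed up, start the next queued task
  });
}

// usage: requests.forEach(enqueue);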

Node.js sync vs. async

I'm currently learning node.js and I see 2 examples for sync and async program (same one).
I do understand the concept of a callback, but I'm trying to understand the benefit of the second (async) example, as it seems that the two of them are doing the exact same thing despite this difference...
Can you please explain why the second example would be better?
I'd be happy to get an even wider explanation that would help me understand the concept.
Thank you!!
1st example:
var fs = require('fs');

function calculateByteSize() {
  var totalBytes = 0,
      i,
      filenames,
      stats;

  filenames = fs.readdirSync(".");
  for (i = 0; i < filenames.length; i++) {
    stats = fs.statSync("./" + filenames[i]);
    totalBytes += stats.size;
  }
  console.log(totalBytes);
}

calculateByteSize();
2nd example:
var fs = require('fs');

var count = 0,
    totalBytes = 0;

function calculateByteSize() {
  fs.readdir(".", function (err, filenames) {
    var i;
    count = filenames.length;
    for (i = 0; i < filenames.length; i++) {
      fs.stat("./" + filenames[i], function (err, stats) {
        totalBytes += stats.size;
        count--;
        if (count === 0) {
          console.log(totalBytes);
        }
      });
    }
  });
}

calculateByteSize();
Your first example is all blocking I/O. In other words, you would need to wait until the readdir operation is complete before looping through each file. Then you would need to block (wait) for each individual sync stat operation to run before moving on to the next file. No code could run after calculateByteSize() call until all operations are completed.
The async (second) example, on the other hand, is all non-blocking, using the callback pattern. Here, execution returns to just after the calculateByteSize() call as soon as fs.readdir is called (but before the callback is run). Once the readdir task is complete, it performs a callback to your anonymous function. There it loops through each of the files and again makes non-blocking calls to fs.stat.
The second is more advantageous. Suppose calls to readdir or stat take anywhere from 250ms to 750ms to complete (this is probably not the case); with the sync operations you would be waiting through each serial call. The async operations do not make you wait between calls. In other words, looping over the readdir files synchronously, you would have to wait for each stat operation to complete before starting the next; asynchronously, you would not have to wait between fs.stat calls.
In your first example, the node.js process, which is single-threaded, is blocking for the entire duration of your readdirSync, and can't do anything else except wait for the result to be returned. In the second example, the process can handle other tasks and the event loop will return it to the continuation of the callback when the result is available. So you can handle a much much higher total throughput by using asynchronous code -- the time spent waiting for the readdir in the first example is probably thousands of times as long as the time actually spend executing your code, so you're wasting 99.9% or more of your CPU time.
In your example the benefit of async programming is indeed not much visible. But suppose that your program needs to do other things as well. Remember that your JavaScript code is running in a single thread, so when you chose the synchronous implementation the program can't do anything else but waiting for the IO operation to finish. When you use async programming, your program can do other important tasks while the IO operation runs in the background (outside the JavaScript thread)
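As a rough modern illustration of the same point (this uses the promise-based fs API and async/await, which is not what the original examples used, but the idea is identical):
const fs = require('fs').promises;

async function calculateByteSize() {
  const filenames = await fs.readdir(".");
  // issue all stat calls at once; the event loop stays free while they run
  const stats = await Promise.all(filenames.map(name => fs.stat("./" + name)));
  const totalBytes = stats.reduce((sum, s) => sum + s.size, 0);
  console.log(totalBytes);
}

calculateByteSize();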
Can you please explain why the second example would be better? I'd be happy to get an even wider explanation that would help me understand the concept.
It's all about concurrency for network servers (thus the name "node"). If this were in a build script, the first, synchronous example would be "better" in that it is more straightforward. And given a single disk, there might not be much actual benefit to making it asynchronous.
In a network service, however, the synchronous version would block the entire process and defeat node's main design principle. Performance would degrade as the number of concurrent clients increased. The asynchronous example, on the other hand, would perform relatively well: while waiting for the relatively slow filesystem to come back with results, it can handle all the relatively fast CPU operations concurrently. The async version should basically be able to saturate your filesystem, and however much your filesystem can deliver, node will be able to get it out to clients at that rate.
Lots of good answers here, but be sure to also read the docs:
The synchronous versions will block the entire process until they complete--halting all connections.
There is a good overview of sync vs async in the documentation: http://nodejs.org/api/fs.html#fs_file_system

How exactly does the nodejs promise library work for multiple promises?

Recently I made a webscraper in nodejs using 'promise'. I created a Promise for each url I wanted to scrape and then used the all method:
var fetchUrlArray = [];
for(...){
  var mPromise = new Promise(function(resolve, reject){
    (http.get(...))()
  });
  fetchUrlArray.push(mPromise);
}
Promise.all(fetchUrlArray).then(...)
There were thousands of urls but only a few of them got timed out. I got the impression that it was handling 5 promises in parallel at a time.
My question is how exactly does promise.all() work. Does it:
Call each promise one by one, switching to the next one only after the previous one is resolved?
Or does it process the promises in batches of a few from the array?
Or does it fire all the promises at once?
What is the best way to solve this problem in nodejs? Because as it stands, I can solve this problem way faster in Java/C#.
What you pass Promise.all() is an array of promises. It knows absolutely nothing about what is behind those promises. All it knows is that those promises will get resolved or rejected sometime in the future and it will create a new master promise that follows the sum of all the promises you passed it. This is one of the nice things about promises. They are an abstraction that lets you coordinate any type of action (usually asynchronous) without regard for what type of action it is. As such, promises have literally nothing to do with the actual action. All they do is monitor the completion or error of the action and report that back to those agents following the promise. Other code actually runs the action.
In your particular case, you are immediately calling http.get() in a tight loop and your code (nothing to do with promises) is launching a zillion http.get() operations at once. Those will get fired as fast as the underlying transport can do them (likely subject to connection limits).
If you want them to be launched serially or in batches of say 10 at a time, then you have to code it that way yourself. Promises have nothing to do with that.
You could use promises to help you code them to launch serially or in batches, but either way it would take extra code of your own to make that happen.
The Async library is specifically built for running things in parallel, but with a maximum number in flight at any given time, because this is a common scheme where you either have connection limits on your end or you don't want to overwhelm the receiving server. You may be interested in its parallelLimit method, which lets you run a number of async operations in parallel with a cap on how many are in flight at once.
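If you would rather not pull in a library, one hedged way to cap concurrency yourself is to slice the url list into chunks and wait for each chunk before starting the next (fetchUrl and urlList are placeholders for whatever returns a promise per url):
async function fetchInBatches(urls, batchSize) {
  const results = [];
  for (let i = 0; i < urls.length; i += batchSize) {
    const batch = urls.slice(i, i + batchSize);
    // at most batchSize requests are in flight at any one time
    const batchResults = await Promise.all(batch.map(url => fetchUrl(url)));
    results.push(...batchResults);
  }
  return results;
}

// usage: fetchInBatches(urlList, 10).then(all => console.log(all.length));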
I would do it like this
Personally, I'm not a big fan of Promises. I think the API is extremely verbose and the resulting code is very hard to read. The method defined below results in very flat code and it's much easier to immediately understand what's going on. At least imo.
Here's a little thing I created for an answer to this question
// void asyncForEach(Array arr, Function iterator, Function callback)
//   * iterator(item, done) - done can be called with an err to shortcut to callback
//   * callback(done) - done receives an error if an iterator sent one
function asyncForEach(arr, iterator, callback) {
  // create a cloned queue of arr
  var queue = arr.slice(0);
  // create a recursive iterator
  function next(err) {
    // if there's an error, bubble up to callback
    if (err) return callback(err);
    // if the queue is empty, call the callback with no error
    if (queue.length === 0) return callback(null);
    // call the iterator with the next task
    // we pass `next` here so the task can let us know when to move on to the next task
    iterator(queue.shift(), next);
  }
  // start the loop
  next();
}
You can use it like this
var urls = [
  "http://example.com/cat",
  "http://example.com/hat",
  "http://example.com/wat"
];

function eachUrl(url, done){
  http.get(url, function(res) {
    // do something with res
    done();
  }).on("error", function(err) {
    done(err);
  });
}

function urlsDone(err) {
  if (err) throw err;
  console.log("done getting all urls");
}

asyncForEach(urls, eachUrl, urlsDone);
Benefits of this
no external dependencies or beta apis
reusable on any array you want to perform async tasks on
non-blocking, just as you've come to expect with node
could be easily adapted for parallel processing
by writing your own utility, you better understand how this kind of thing works
If you just want to grab a module to help you, look into async and the async.eachSeries method.
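For completeness, a minimal hedged sketch of that async.eachSeries suggestion, reusing the eachUrl and urlsDone functions from the example above (assumes the async package is installed):
var async = require("async");

// runs eachUrl for one url at a time and calls urlsDone when the list is exhausted
async.eachSeries(urls, eachUrl, urlsDone);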
First, a clarification: A promise does represent the future result of a computation, nothing else. It does not represent the task or computation itself, which means it cannot be "called" or "fired".
Your script does create all those thousands of promises immediately, and each of those creations does call http.get immediately. I would suspect that the http library (or something it depends on) has a connection pool with a limit of how many requests to make in parallel, and defers the rest implicitly.
Promise.all does not do any "processing" - it's not responsible for starting the tasks and resolving the passed promises. It only listens to them and checks whether they all are ready, and returns a promise for that eventual result.
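A tiny sketch of that distinction; setTimeout stands in for the real http.get work, and the logging only makes the ordering visible:
// the work starts here, as soon as each promise is created...
var p1 = new Promise(function (resolve) {
  console.log("request 1 started");
  setTimeout(resolve, 500);          // stand-in for http.get
});
var p2 = new Promise(function (resolve) {
  console.log("request 2 started");
  setTimeout(resolve, 500);
});

// ...Promise.all only waits for them; it does not start or throttle anything
Promise.all([p1, p2]).then(function () {
  console.log("both settled");
});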

In node.js, how to use child_process.exec so all can happen asynchronously?

I have a server built on node.js. Below is one of the request handler functions:
var exec = require("child_process").exec;

function doIt(response) {
  //some trivial and fast code - can be ignored

  exec(
    "sleep 10", //run OS' sleep command, sleep for 10 seconds
    //sleeping(10), //commented out. run a local function, defined below.
    function(error, stdout, stderr) {
      response.writeHead(200, {"Content-Type": "text/plain"});
      response.write(stdout);
      response.end();
    });

  //some trivial and fast code - can be ignored
}
Meanwhile, in the same module file there is a local function "sleeping" defined, which as its name indicates will sleep for 10 seconds.
function sleeping(sec) {
  var begin = new Date().getTime();
  while (new Date().getTime() < begin + sec*1000); //just loop till timeup.
}
Here come three questions --
As we know, node.js is single-threaded, asynchronous, and event-driven. Is it true that ALL functions with a callback argument are asynchronous? For example, if I have a function my_func(callback_func), which takes another function as an argument, are there any restrictions on callback_func, or anything else, needed to make my_func asynchronous?
So at least child_process.exec is asynchronous, with an anonymous callback function as its argument. Here I pass "sleep 10" as the first argument, to call the OS's sleep command and wait for 10 seconds. It won't block the whole node process, i.e. any other request sent to another request handler won't be blocked for 10 seconds by the "doIt" handler. However, if another request is immediately sent to the server and should be handled by the same "doIt" handler, will it have to wait until the previous "doIt" request ends?
If I use the sleeping(10) function call (commented out) to replace "sleep 10", I found it does block other requests until 10 seconds later. Could anyone explain the difference?
Thanks a bunch!
-- update per request --
One comment says this question seemed to be a duplicate of another one (How to promisify Node's child_process.exec and child_process.execFile functions with Bluebird?) that was asked one year after this one. Well, the two are quite different: this one asks about asynchrony in general, with a specific buggy case, while that one asks about the Promise object per se. Both the intent and the use cases differ.
(If by any chance they are similar, shouldn't the newer one be marked as a duplicate of the older one?)
First, you can promisify child_process.exec.
const util = require('util');
const exec = util.promisify(require('child_process').exec);

async function lsExample() {
  const { stdout, stderr } = await exec('ls');
  if (stderr) {
    // handle error
    console.log('stderr:', stderr);
  }
  console.log('stdout:', stdout);
}

lsExample();
As an async function, lsExample returns a promise.
Run all promises in parallel with Promise.all([]).
Promise.all([lsExample(), otherFunctionExample()]);
If you need to wait on the promises to finish in parallel, await them.
await Promise.all([aPromise(), bPromise()]);
If you need the values from those promises:
const [a, b] = await Promise.all([aPromise(), bPromise()]);
1) No. For example .forEach is synchronous:
var lst = [1, 2, 3];
console.log("start");
lst.forEach(function(el) {
  console.log(el);
});
console.log("end");
Whether a function is asynchronous or not depends purely on its implementation - there are no restrictions. You can't know a priori (you have to either test it, know how it is implemented, or read and trust the documentation). What's more, depending on its arguments a function can be asynchronous, synchronous, or both.
2) No. Each request will spawn a separate "sleep" process.
3) That's because your sleeping function is not a sleep at all: it busy-waits in a loop, checking the date over and over (and using 100% of the CPU). Since node.js runs your JavaScript on a single thread, and this loop is synchronous, it blocks the entire server. This is wrong, don't do this. Use setTimeout instead.
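A minimal sketch of that setTimeout alternative: instead of spinning, schedule the continuation and let the event loop handle other requests in the meantime.
function sleeping(sec, done) {
  // non-blocking: the callback fires after `sec` seconds,
  // and the server keeps serving other requests in between
  setTimeout(done, sec * 1000);
}

sleeping(10, function () {
  console.log("10 seconds later, without blocking the process");
});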
