Process 50k webpages at runtime (NodeJS) - javascript

I need to download ~50k webpages, get some data from them, and store it in a variable.
I wrap each request in a Promise and then Promise.all() them. I use the Request library.
Simplified code:
const request = require('request');
const urls = [url1, url2, ...];
const promises = [];
urls.forEach(url => {
  promises.push((resolve, reject) => {
    request(url, (error, response, body) => {
      if (error) { reject(error); return; }
      // do something with page
      resolve(someData);
    });
  });
});
Promise.all(promises.map(pr => new Promise(pr)))
  .then((someDataArray) => { /* process data */ });
But I receive an ENFILE error, which means too many files are open in the system (on my desktop the maximum number of open files is 2048).
I know that promise executors run on creation, but I can't solve this problem.
Maybe there is another approach to do this?
Thanks for any response.

What you want is to launch N requests, then start a new one whenever one finishes (whether it succeeded or not).
There are many libraries for that, but it's important to be able to implement this kind of limitation yourself:
const request = require('request');
const urls = [url1, url2, ...];
const MAX_QUERIES = 10;
let remaining = urls.length;

function startQuery(url) {
  if (!url) return;
  request(url, (error, response, body) => {
    if (error) {
      // handle error
    } else {
      // handle result
    }
    startQuery(urls.shift());
    if (--remaining === 0) allFinished();
  });
}

for (let i = 0; i < MAX_QUERIES; i++) startQuery(urls.shift());

function allFinished() {
  // all done
}

You can try async.forEachLimit, which lets you define a limit on the number of concurrent requests. It keeps at most that many requests in flight and starts a new one as soon as one of the running requests completes.
Install the package with npm install --save async
const async = require('async');
const request = require('request');

async.forEachLimit(urls, 50, function(url, callback) {
  // process the url with the request module, then signal completion
  request(url, function(error, response, body) {
    // ... do something with body ...
    callback(error);
  });
}, function(err) {
  if (err) return console.error(err);
  console.log("All urls are processed");
});
For further help, see https://caolan.github.io/async/docs.html

Others have said how to do the flow control using async or promises, and I won't repeat them. Personally, I prefer the async JS method, but that's just my preference.
There are two things they did not cover, however, which I think are just as important as flow control if you want your script to be performant and reliable.
1) Don't rely on the callbacks or promises to handle processing the files. All the examples provided so far do that. Instead, I would use the request streams API to treat each request as a readable stream and pipe it to a writable stream that processes it. The simplest example is to use fs to write the file to the file system. This makes much better use of your system resources, as each data chunk is written to storage as it comes in, rather than having to hold the whole file in memory. You can then call the callback or resolve the promise when the stream ends.
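For illustration, a minimal sketch of the streaming idea (the function name and the file path are invented for this example):
const fs = require('fs');
const request = require('request');

// Stream the response straight to disk instead of buffering it in memory.
// "downloads/page-1.html" is just a placeholder path for this sketch.
function fetchToFile(url, filePath) {
  return new Promise((resolve, reject) => {
    request(url)
      .on('error', reject)                    // network-level errors
      .pipe(fs.createWriteStream(filePath))   // each chunk is written as it arrives
      .on('error', reject)                    // file system errors
      .on('finish', resolve);                 // resolve once the file is fully written
  });
}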
2) You should not try to process an in-memory list of 50k URLs. If you do and you fail on, let's say, the 20,000th URL, you then have to figure out how to separate the done ones from the not-done ones and update your code or the JSON file you read them from. Instead, use a database (any will do) with a collection/table of URLs and metadata about them. When your program runs, query for the ones that don't have the attributes indicating they have been successfully fetched, and when a fetch succeeds or fails, record that in the same data structure so it can tell you why a URL failed or when it succeeded.
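A rough sketch of that bookkeeping, assuming MongoDB (the database, collection, and field names below are invented for the example, and the concurrency limit reuses async.eachLimit from the other answers):
const async = require('async');
const request = require('request');
const { MongoClient } = require('mongodb');

// All names here (database "scraper", collection "urls", field "fetched") are
// placeholders for this sketch; adapt them to your own schema.
MongoClient.connect('mongodb://localhost:27017', (err, client) => {
  if (err) throw err;
  const urls = client.db('scraper').collection('urls');

  // Only pick up URLs that have not been fetched successfully yet.
  urls.find({ fetched: { $ne: true } }).toArray((err, docs) => {
    if (err) throw err;

    async.eachLimit(docs, 10, (doc, done) => {
      request(doc.url, (error, response, body) => {
        const update = error
          ? { $set: { lastError: String(error), attemptedAt: new Date() } }
          : { $set: { fetched: true, statusCode: response.statusCode, fetchedAt: new Date() } };
        // Record the outcome so a crashed run can simply be restarted.
        urls.updateOne({ _id: doc._id }, update, () => done());
      });
    }, () => client.close());
  });
});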

Install the async package and use forEachLimit to limit the number of concurrent operations.
const request = require('request');
const async = require("async");

const urls = [];
for (var temp = 0; temp < 1024; temp++) {
  urls.push("http://www.google.com");
}

var i = 0;
async.forEachLimit(urls, 10, function(url, callback) {
  request(url, (error, response, body) => {
    if (error) {
      callback(error);
      return;
    }
    var somedata = null;
    console.log(++i);
    callback(null, somedata);
  });
}, function(err) {
  /* process data */
});

As said in the comments, you could use the async.js module:
const request = require('request');
const async = require('async');
var listOfUrls = [url1, url2, ...];
async.mapLimit(listOfUrls, 10, function(url, callback) {
  // iterator function
  request(url, function(error, response, body) {
    if (!error && response.statusCode == 200) {
      var dataFromPage = ""; // get data from the page
      callback(null, dataFromPage);
    } else {
      callback(error || response.statusCode);
    }
  });
}, function(err, results) {
  // completion function
  if (!err) {
    // process all results in the array here
    // do something with the data
    console.log(results);
  } else {
    // handle error here
    console.log(err);
  }
});
Here you will process 10 URLs at a time; once all URLs have been processed, the completion callback is called and you can work with your data.

Related

Too many simultaneous requests with NodeJS+request-promise

I have a NodeJS project with a BIG array (about 9000 elements) containing URLs. Those URLs are going to be requested using the request-promise package. However, 9000 concurrent GET requests to the same website from the same client are liked by neither the server nor the client, so I want to spread them out over time. I have looked around a bit and found Promise.map together with the {concurrency: int} option, which sounded like it would do what I want. But I cannot get it to work. My code looks like this:
const rp = require('request-promise');
var MongoClient = require('mongodb').MongoClient;
var URLarray = []; //This contains 9000 URLs
function getWebsite(url) {
  rp(url)
    .then(html => { /* Do some stuff */ })
    .catch(err => { console.log(err) });
}

MongoClient.connect('mongodb://localhost:27017/some-database', function (err, client) {
  Promise.map(URLArray, (url) => {
    db.collection("some-collection").findOne({URL: url}, (err, data) => {
      if (err) throw err;
      getWebsite(url, (result) => {
        if (result != null) {
          console.log(result);
        }
      });
    }, {concurrency: 1});
  });
I think I probably misunderstand how to deal with promises. In this scenario I would have thought that, with the concurrency option set to 1, each URL in the array would in turn be used in the database search and then passed as a parameter to getWebsite, whose result would be displayed in its callback function. THEN the next element in the array would be processed.
What actually happens is that a few (maybe 10) of the URLs are fetched correctly, then the server starts to respond sporadically with 500 internal server error. After a few seconds, my computer freezes and then restarts (which I guess is due to some kind of panic?).
How can I attack this problem?
If the problem is really about concurrency, you can divide the work into chunks and chain the chunks.
Let's start with a function that does a mongo lookup and a get....
// answer a promise that resolves to data from mongo and a get from the web
// for a given a url, return { mongoResult, webResult }
// (assuming this is what OP wants. the OP appears to discard the mongo result)
//
function lookupAndGet(url) {
  // use the promise-returning variant of findOne
  let result = {}
  return db.collection("some-collection").findOne({URL: url}).then(mongoData => {
    result.mongoData = mongoData
    return rp(url)
  }).then(webData => {
    result.webData = webData
    return result
  })
}
lodash and underscore both offer a chunk method that breaks an array into an array of smaller arrays. Write your own or use theirs.
const _ = require('lodash')
let chunks = _.chunk(URLArray, 5) // say 5 is a reasonable concurrency
Here's the point of the answer: make a chain of chunks so that only a chunk's worth of requests runs concurrently...
let chain = chunks.reduce((acc, chunk) => {
  // start this chunk only after the previous chunks have settled,
  // and accumulate each chunk's results
  return acc.then(results =>
    Promise.all(chunk.map(url => lookupAndGet(url)))
      .then(chunkResults => results.concat([chunkResults]))
  )
}, Promise.resolve([]))
Now execute the chain. The chunk promises will return chunk-sized arrays of results, so your reduced result will be an array of arrays. Fortunately, lodash and underscore both have a method to "flatten" the nested array.
// turn [ url, url, ...] into [ { mongoResult, webResult }, { mongoResult, webResult }, ...]
// running only 5 requests at a time
chain.then(result => {
  console.log(_.flatten(result))
})

Multiple Http requests called and stored in one array, but how to wait until all requests finish before working with array

I have an array of values I want to loop over. Each of these values will be used to make an HTTP request to a server. From the server I will receive a response for each request. I want to store all these responses in a single array and then do work on the array once ALL requests have finished. Due to the async nature of my code I am not sure how to make the application wait until all the requests have finished. What is happening is that I am making the requests, but the work I want to do with the array starts before ALL the requests have finished. How can I make this code "synchronous" in the sense that it waits until all requests have finished before starting to work with the listOfResponses array?
//import the require library to make http requests to a server
const request = require('request');
//values to be sent via a restful GET request
const list = [
  'value_one',
  'value_two'
];
//store responses from GET request
var listOfResponses = [];
//loop through the list
list.forEach(function(word) {
  //Make a REST GET call to a server
  var url = 'http://notsurehowtomakethisworksoiamaskingstackoverflow.com/api/words/' + word;
  request(url, {
    json: true
  }, (err, res, body) => {
    if (err) {
      return console.log(err);
    }
    //store the response from the server into our array
    listOfResponses.push(body.response);
  });
});
/* *******************************
HERE I WANT TO DO STUFF WITH listOfResponses ONCE ALL THE REQUESTS FINISH
********************************** */
Just map it to an array of promises:
const promises = list.map(word => new Promise((resolve, reject) => {
  var url = 'http://notsurehowtomakethisworksoiamaskingstackoverflow.com/api/words/' + word;
  request(url, {
    json: true
  }, (err, res) => {
    if (err) {
      return reject(err);
    }
    resolve(res.body);
  });
}));
Then you can get all the results using Promise.all:
Promise.all(promises).then(results => {
  //...
});
Simply check the responses each time a request ends:
//import the require library to make http requests to a server
const request = require('request');
//values to be sent via a restful GET request
const list = [
  'value_one',
  'value_two'
];
//store responses from GET request
var listOfResponses = [];
//loop through the list
list.forEach(function(word) {
  //Make a REST GET call to a server
  var url = 'http://notsurehowtomakethisworksoiamaskingstackoverflow.com/api/words/' + word;
  request(url, {
    json: true
  }, (err, res, body) => {
    if (err) {
      return console.log(err);
    }
    //store the response from the server into our array
    listOfResponses.push(body.response);
    check();
  });
});

// CHECK THE LIST
function check() {
  if (listOfResponses.length == list.length) {
    console.log("YAY! Here you have your responses", listOfResponses);
  }
}
This is an asynchronous scenario; one way to handle it is to recursively call a function that loops over your list of words, where each step of the recursion is driven by the server's response.
Another approach is using Promise.
Look this code snippet (Recursion approach):
//import the require library to make http requests to a server
const request = require('request');
//values to be sent via a restful GET request
const list = [
  'value_one',
  'value_two'
];
//store responses from GET request
var listOfResponses = [];
//loop recursively through the list
var loop = function(array, index, cb) {
  if (index === array.length) {
    cb();
    return;
  }
  //Make a REST GET call to a server
  var url = 'http://notsurehowtomakethisworksoiamaskingstackoverflow.com/api/words/' + array[index];
  request(url, {
    json: true
  }, (err, res, body) => {
    if (err) {
      return console.log(err);
    }
    //store the response from the server into our array
    listOfResponses.push(body.response);
    loop(array, index + 1, cb);
  });
};

loop(list, 0, function() {
  /* *******************************
  HERE I WANT TO DO STUFF WITH listOfResponses ONCE ALL THE REQUESTS FINISH
  ********************************** */
});
As you can see, the loop starts with a call to the loop function with index = 0, and every response calls the loop function again with an incremented index.
The recursion ends when index === list.length, at which point the callback is executed to continue the flow of your logic.

sails.js node.js Parse JSON on controller

In my controller, MapController, I have a function that parses remote JSON files and, through an if-else structure, pushes some values into an array called "parsewebservice". Apparently everything works fine, but console.log(parsewebservice); does not show the values that were pushed into the array; at that point it prints an empty array. When I put the console.log inside the forEach it does print values, but everything is cluttered and repeated, so that is not the right way.
I would like to know why the values pushed into "parsewebservice" are not there when I read the variable after it should have been populated, and what the correct way to do this is.
Here is my code below:
/**
 * MapController
 *
 * @description :: Server-side logic for managing Maps
 * @help        :: See http://sailsjs.org/#!/documentation/concepts/Controllers
 */
module.exports = {
  index: function(req, res, next) {
    Data.find(function foundData(err, datas) {
      if (err) return next(err);
      var parsewebservice = [];
      datas.forEach(function(data, index) {
        var req = require("request");
        var url = data.address + "?f=pjson";
        req(url, function(err, res, retorno) {
          if (err) {
            console.log(err);
          } else {
            var camadas = JSON.parse(retorno);
            if (camadas.mapName) {
              camadas.layers.forEach(function(campo, i) {
                if (campo.subLayerIds != null) {
                } else if (campo.subLayerIds == null) {
                  parsewebservice.push([i, "dynamicMapLayer", campo.name, data.address]);
                }
              });
            } else if (camadas.serviceDataType) {
              parsewebservice.push([null, "imageMapLayer", camadas.name, data.address]);
            } else if (camadas.type) {
              parsewebservice.push([null, "featureLayer", camadas.name, data.address]);
            }
          }
        });
      });
      console.log(parsewebservice);
    });
  },
};
My first comment has to be that you should not combine function(req, res) with var req = require('request')... you lose your access to the original req object!
So, you need to run a list of async tasks, and do something when they are all complete. That will never be entirely easy, and no matter what, you will have to get used to the idea that your code does not run from top to bottom as you've written it. Your console.log at the bottom runs before any of the callbacks (functions you pass in) you pass to your external requests.
The right way to do this is to use promises. It looks like you are using the request library, whose calls only accept callbacks and cannot be returned as promises. You can create your own promise wrapper for them (see the sketch after the example below), or use an alternative library (several are recommended on the request page).
I don't want to write a whole intro-to-promises right here, so what I will do is give you a less pretty, but maybe more understandable way to run some code at the completion of all your requests.
Data.find(function foundData(err, datas) {
  if (err) return next(err);
  var parsewebservice = [];
  // here we will write some code that we will run once per returned data
  var processResponse = function(resp) {
    parsewebservice.push(resp);
    if (parsewebservice.length >= datas.length) {
      // we are done, that was the final request
      console.log(parsewebservice);
      return res.send({data: parsewebservice}); // or whatever
    }
  };
  datas.forEach(function(data, index) {
    var request = require("request");
    var url = data.address + "?f=pjson";
    request(url, function(err, res, retorno) {
      // do some processing of retorno...
      // call our function to handle the result
      processResponse(retorno);
    });
  });
  console.log(parsewebservice); // still an empty array here
});
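If you do go the promise route, the wrapper mentioned above could look roughly like this (a minimal sketch; Data and next are the controller objects from the question, and the parsing step is elided):
const request = require('request');

// promise wrapper around the callback-based request function
function requestPromise(url) {
  return new Promise((resolve, reject) => {
    request(url, (err, response, body) => {
      if (err) return reject(err);
      resolve(body);
    });
  });
}

// the fan-out/fan-in then becomes a plain Promise.all
Data.find(function foundData(err, datas) {
  if (err) return next(err);
  Promise.all(datas.map(data => requestPromise(data.address + "?f=pjson")))
    .then(bodies => {
      // parse each body and build parsewebservice here
      console.log(bodies.length);
    })
    .catch(err => console.log(err));
});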
I solved the problem.
the "request" module is asynchronous so we need to wait for it to respond and then send the response to the view.
To do this we created a function called "foo" to contain the foreach and the request, we made a callback of that function and finally we made the response (res.view) within that function, so that the controller response would only be sent after the response of the "foo" function to the callback. So we were able to parse.json the data from the "data" collection using foreach and the "request" module and send the objects to the view.
Many thanks to all who have helped me, my sincere thanks.
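The shape of that solution is roughly as follows (a sketch reconstructed from the description above, not the exact code; datas and res come from the controller in the question, and the parsing body is elided):
// "foo" wraps the forEach + request calls and reports back through a callback
function foo(datas, done) {
  const request = require('request');
  const parsewebservice = [];
  let pending = datas.length;

  datas.forEach(function(data) {
    request(data.address + "?f=pjson", function(err, response, retorno) {
      if (!err) {
        // parse retorno and push the results onto parsewebservice, as in the question
      }
      if (--pending === 0) done(parsewebservice); // the last response has arrived
    });
  });
}

// inside the controller action:
foo(datas, function(parsewebservice) {
  res.view({ parsewebservice: parsewebservice }); // respond only after every request has finished
});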

How to return from a looped asynchronous function with callback in Node

I am trying to write a function that:
Takes an array of URLs
Gets files from URLs in parallel (order's irrelevant)
Processes each file
Returns an object with the processed files
Furthermore, I don't need for errors in #2 or #3 to affect the rest of the execution in my application in any way - the app could continue even if all the requests or processing failed.
I know how to fire all the requests in a loop, then once I have all the data, fire the callback to process the files, by using this insertCollection pattern.
However, this is not efficient, as I shouldn't need to wait for ALL files to download before attempting to process them - I would like to process them as each download finishes.
So far I have this code:
const request = require('request');
const urlArray = [urlA, urlB, urlC];
const results = {};
let count = 0;
let processedResult;
const makeRequests = function (urls, callback) {
  for (let url of urls) {
    request(url, function(error, response, body) {
      if (error) {
        callback(error);
        return;
      }
      processedResult = callback(null, body)
      if (processedResult) {
        console.log(processedResult); // prints correctly!
        return processedResult;
      }
    })
  }
};

const processResult = function(error, file) {
  if (error) {
    console.log(error);
    results.errors.push(error);
  }
  const processedFile = file + `<!-- Hello, Dolly! ${count}-->`;
  results.processedFiles.push(processedFile);
  if (++count === urlArray.length) {
    return results;
  }
};
const finalResult = makeRequests(urlArray, processResult);
console.log(finalResult); // undefined;
In the last call to processResult I manage to send a return, and makeRequests captures it, but I'm failing to rein it in in finalResult after that.
My questions are:
Why is this not working? I can print a well-formed processedResult on the last iteration of makeRequests, but somehow I cannot return it back to the caller (finalResult).
How can this be solved, ideally "by hand", without promises or the help of libraries like async?
The makeRequests function returns undefined to finalResult because the function itself runs synchronously: nothing pauses its execution, so it reaches the end before any request callback has fired, and because it has no top-level return statement it returns undefined by default.
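One way to restructure it by hand (no promises, no helper libraries) is to stop trying to return a value from makeRequests and instead pass in a completion callback that fires once every request has settled. A rough sketch, reusing the names from the question:
const request = require('request');

// pass a completion callback instead of trying to return a value
function makeRequests(urls, onDone) {
  const results = { processedFiles: [], errors: [] };
  let remaining = urls.length;
  if (remaining === 0) return onDone(results);

  urls.forEach((url, index) => {
    request(url, (error, response, body) => {
      if (error) {
        results.errors.push(error);            // a failed request does not stop the others
      } else {
        results.processedFiles.push(body + `<!-- Hello, Dolly! ${index} -->`);
      }
      if (--remaining === 0) onDone(results);  // last request settled: hand the results back
    });
  });
}

makeRequests([/* urlA, urlB, urlC */], (finalResult) => {
  console.log(finalResult); // available here, not as a return value
});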

NodeJS Variable Collision? Async / Sync with Request

Issue: the INFO_X data sometimes, and seemingly at random, becomes null.
Question:
Do the variables INFO_1, INFO_2 and INFO_3 overwrite each other, since Node.js runs requests asynchronously, unlike PHP which follows a step-by-step sequence?
I checked for nulls before making a request, and the debug output shows the values are NOT null before executing the 2nd request; yet, at random, some variables become null when the 2nd request is submitted.
I also checked that my source is definitely not returning any nulls.
Is the variable being overwritten before the 2nd request is sent? Please advise.
var request = require('request');
var urls = ['URL',
            'URL',
            'URL'];

urls.forEach(processUrl);

function processUrl(url) {
  request(url, function (error, response, body) {
    if (!error) {
      var obj = JSON.parse(body);
      for (var i = 0, len = obj['products'].length; i < len; ++i) {
        var data = obj['products'][i];
        var INFO_1 = data.INFO_1
        var INFO_2 = data.INFO_2
        var INFO_3 = data.INFO_3
        request("URL/POSTINFO?INFO_1=" + INFO_1 + "&INFO_2=" + INFO_2 + "&INFO_3=" + INFO_3 + "&seller_id=", function(error, response, body) {
          console.log(body);
        });
      }
    }
  });
}
Yes, that is the case, because the request function is asynchronous. I wouldn't call Node.js "faster" than PHP; it just runs requests asynchronously, while PHP is generally synchronous.
You could resolve the issue with promises, e.g. Promise.all([]) with an array of request promises (see https://developer.mozilla.org/en/docs/Web/JavaScript/Reference/Global_Objects/Promise/all), or, in your case, use the async library: https://github.com/caolan/async#forEach
Since you're using callbacks in the request function, your best option is to use async as provided in the link above. Here's an example code:
const async = require('async');

// demo stand-in for the real request module: calls back after 2 seconds
function request(url, cb) {
  setTimeout(() => {
    cb(url + ' accessed at ' + new Date());
  }, 2000);
}

var urls = ['URL1', 'URL2', 'URL3'];

async.each(urls, (item, done) => {
  console.log(item);
  request(item, (value) => {
    request(value, (newValue) => {
      console.log(newValue);
      done(); // tell async.each this item is finished
    });
  });
}, (err) => {
  console.log(err);
});
Here's a working example: https://plnkr.co/edit/Q5RAvKdaLxV9cUT4GP4w?p=preview
