Too many simultaneous requests with NodeJS + request-promise

I have a NodeJS project with a BIG array (about 9000 elements) containing URLs. Those URLs are going to be requested using the request-promise package. However, 9000 concurrent GET requests to the same website from the same client is liked by neither the server nor the client, so I want to spread them out over time. I have looked around a bit and found Bluebird's Promise.map together with the {concurrency: int} option, which sounded like it would do what I want. But I cannot get it to work. My code looks like this:
const rp = require('request-promise');
var MongoClient = require('mongodb').MongoClient;
var URLArray = []; // This contains 9000 URLs

function getWebsite(url) {
    rp(url)
        .then(html => { /* Do some stuff */ })
        .catch(err => { console.log(err) });
}

MongoClient.connect('mongodb://localhost:27017/some-database', function (err, client) {
    Promise.map(URLArray, (url) => {
        db.collection("some-collection").findOne({URL: url}, (err, data) => {
            if (err) throw err;
            getWebsite(url, (result) => {
                if (result != null) {
                    console.log(result);
                }
            });
        });
    }, {concurrency: 1});
});
I think I probably misunderstand how to deal with promises. In this scenario I would have thought that, with the concurrency option set to 1, each URL in the array would in turn be used in the database search and then passed as a parameter to getWebsite, whose result would be displayed in its callback function. THEN the next element in the array would be processed.
What actually happens is that a few (maybe 10) of the URLs are fetched correctly, then the server starts to respond sporadically with 500 internal server error. After a few seconds, my computer freezes and then restarts (which I guess is due to some kind of panic?).
How can I attack this problem?

If the problem is really about concurrency, you can divide the work into chunks and chain the chunks.
Let's start with a function that does a Mongo lookup and a GET...
// answer a promise that resolves to data from mongo and a get from the web
// for a given url, return { mongoData, webData }
// (assuming this is what OP wants. the OP appears to discard the mongo result)
//
function lookupAndGet(url) {
    // use the promise-returning variant of findOne
    let result = {}
    return db.collection("some-collection").findOne({URL: url}).then(mongoData => {
        result.mongoData = mongoData
        return rp(url)
    }).then(webData => {
        result.webData = webData
        return result
    })
}
lodash and underscore both offer a chunk method that breaks an array into an array of smaller arrays. Write your own (a hand-rolled version is sketched below) or use theirs.
const _ = require('lodash')
let chunks = _.chunk(URLArray, 5) // say 5 is a reasonable concurrency
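If you'd rather not depend on either library, chunk is only a few lines to write yourself (a minimal sketch):

// split an array into consecutive slices of at most `size` elements
function chunk(array, size) {
    const out = []
    for (let i = 0; i < array.length; i += size) {
        out.push(array.slice(i, i + size))
    }
    return out
}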
Here's the point of the answer: build a chain of chunks so that only one chunk's worth of requests runs concurrently...
let chain = chunks.reduce((acc, chunk) => {
    // wait for the previous chunks before starting this one;
    // building the Promise.all inside the .then callback is what defers the requests
    return acc.then(results =>
        Promise.all(chunk.map(url => lookupAndGet(url)))
            .then(chunkResults => results.concat([chunkResults]))
    )
}, Promise.resolve([]))
Now execute the chain. The chunk promises will return chunk-sized arrays of results, so your reduced result will be an array of arrays. Fortunately, lodash and underscore both have a method to "flatten" the nested array.
// turn [ url, url, ... ] into [ { mongoData, webData }, { mongoData, webData }, ... ]
// running only 5 requests at a time
chain.then(result => {
    console.log(_.flatten(result))
})
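For completeness, Bluebird's Promise.map (which the question was already trying to use) gives you the same throttling in a single call; a minimal sketch, reusing lookupAndGet from above:

const Bluebird = require('bluebird')

// run at most 5 lookups at any one time; resolves to a flat array of results
Bluebird.map(URLArray, url => lookupAndGet(url), { concurrency: 5 })
    .then(results => console.log(results))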

Related

Node.js - How to return callback with array from for loop with MySQL query?

I'm trying to get a list of virtual communities in my Node.js app and then return it with a callback function. When I call the getList() method with a callback, it returns an empty array.
const mysqli = require("../mysqli/connect");

class Communities {
    getList(callback) {
        var list = [];
        mysqli.query("SELECT * FROM communities", (err, communities) => {
            for (let i = 0; i < communities.length; i++) {
                mysqli.query("SELECT name FROM users WHERE id='" + communities[i].host + "'", (err, host) => {
                    list.push({
                        "id": communities[i].id,
                        "name": communities[i].name,
                        "hostID": communities[i].host,
                        "hostName": host[0].name,
                        "verified": communities[i].verified,
                        "people": communities[i].people
                    });
                });
            }
            callback(list);
        });
    }
}

new Communities().getList((list) => {
    console.log(list);
});
I need to make the for loop asynchronous and call the callback when the loop ends. Please let me know how to do this. Thanks.
Callbacks get really ugly if you have to combine multiple of them; that's why Promises were invented to simplify that. To use Promises in your case, you have to create a Promise first when querying the database¹:
const query = q => new Promise((resolve, reject) =>
    mysqli.query(q, (err, result) => err ? reject(err) : resolve(result))
);
Now doing multiple queries will return multiple promises, which can be combined into one single promise using Promise.all²:
async getList() {
    const communities = await query("SELECT * FROM communities");
    const result = await/*³*/ Promise.all(communities.map(async community => {
        const host = await query(`SELECT name FROM users WHERE id='${community.host}'`);/*⁴*/
        return {
            ...community,
            hostName: host[0].name,
        };
    }));
    return result;
}
Now you can easily get the result with:
new Communities().getList().then(list => {
    console.log(list);
});
Read on:
Working with Promises - Google Developers
Understanding async / await - Ponyfoo
Notes:
¹: If you do that more often, you should probably use a MySQL library that supports promises natively; that saves a lot of work (a sketch follows after these notes).
²: This way the requests run in parallel, which means it is way faster than doing one after another (which could be done using a for loop and awaiting inside of it).
³: That await is superfluous, but I prefer to keep it to mark it as an asynchronous action.
⁴: I guess that could also be done using one SQL query, so if it is too slow for your use case (which I doubt) you should optimize the query itself.
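Here is a minimal sketch of note ¹ using the mysql2 package's built-in promise API (the connection settings are placeholders):

const mysql = require('mysql2/promise');

async function getList() {
    // createConnection resolves to a connection with promise-returning methods
    const conn = await mysql.createConnection({ host: 'localhost', user: 'root', database: 'mydb' });
    // query() resolves to [rows, fields]
    const [communities] = await conn.query("SELECT * FROM communities");
    return Promise.all(communities.map(async community => {
        // ? placeholders also avoid the SQL injection risk of string concatenation
        const [host] = await conn.query("SELECT name FROM users WHERE id = ?", [community.host]);
        return { ...community, hostName: host[0].name };
    }));
}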

Sending API calls in batches

I'm currently trying to simulate half a million IoT devices pushing payloads to Azure IoT Hub using Node.js. Since Node is asynchronous in nature, it's flooding IoT Hub with data and I am getting network errors.
I also tried async/await method but that is taking a lot of time to push data to IoT Hub.
Is there a way to only run 100 calls in parallel, wait for all of them to complete and then run the next 100 in node?
Much appreciated!
Build your batches as a nested array of Promises, then use Promise.all
on each batch in a loop that awaits each Promise.all to resolve.
// This is a mock request function, could be a `request` call
// or a database query; whatever it is, it MUST return a Promise.
const sendRequest = () => {
    return new Promise((resolve) => {
        setTimeout(() => {
            console.log('request sent')
            resolve()
        }, 1000)
    })
}

// 5 batches * 2 requests = 10 requests.
const batches = Array(5).fill(Array(2).fill(sendRequest))

;(async function() {
    for (const batch of batches) {
        try {
            console.log('-- sending batch --')
            await Promise.all(batch.map(f => f()))
        } catch (err) {
            console.error(err)
        }
    }
})()
If you are using lodash you can make it a bit easier by using chunk, which will divide an array into chunks of a provided maximum size.
So in your case you can use it like this (say calls is an array of 550 request-making functions):
const batchCalls = _.chunk(calls, 100);

// (this loop must run inside an async function)
for (const batchCall of batchCalls) {
    await Promise.all(batchCall.map(call => call())) // makes up to a hundred calls in parallel
}
You can readily use Bluebird Promise's map with the concurrency option. This processes at most as many records as the concurrency setting allows before picking up the next batch.
example:
Promise.map(calls, call => call(), {concurrency: 100})
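A slightly fuller sketch, assuming calls is the same array of request-returning functions as above:

const Promise = require('bluebird');

// at most 100 requests in flight at any given time; resolves once all have finished
Promise.map(calls, call => call(), { concurrency: 100 })
    .then(results => console.log(`done: ${results.length} calls`))
    .catch(err => console.error(err));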
limited-request-queue could be used to queue the requests. There are options to set the maximum number of connections at any given time. Below is the code we used to send 5 requests every second; also, there will only be 5 requests in flight at any given time.
/*
Requests passed to Target App (5 requests per second)
Get the response for each request and pass the response to Source App
maxSockets: The maximum number of connections allowed at any given time. A value of 0 will prevent anything from going out. A value of Infinity will provide no concurrency limiting.
maxSocketsPerHost: The maximum number of connections per host allowed at any given time. A value of 0 will prevent anything from going out. A value of Infinity will provide no per-host concurrency limiting.
rateLimit: The number of milliseconds to wait before each batch of maxSocketsPerHost connections
*/
var RequestQueue = require('limited-request-queue');

var queue1 = new RequestQueue({'maxSockets': 5, 'maxSocketsPerHost': 5, 'rateLimit': 1000}, {
    item: function(input, done) {
        request(input.url, function(error, response) {
            input.res.send(response.body);
            done();
        });
    },
    end: function() {
        console.log("Queue 1 completed!");
    }
});

// To queue a request - a for loop could be used to send multiple requests
queue1.enqueue({'url': ''});
If I'm not mistaken, you can use the array of items with the Promise.all() method (or, in your case, .allSettled() to just see the results of each call) and then process each one inside it like this:
function chunk(items, size) {
    const chunks = [];
    items = [].concat(...items);
    while (items.length) { chunks.push(items.splice(0, size)); }
    return chunks;
}
async function ProcessDevice(device) {
    // do your work here
}
// splice your items into chunks of 100, then process each chunk
// catching the result of each ProcessDevice in the chunk.map
// the results of the chunk are passed into the .then( )
// and you have a .catch( ) in case there's an error anywhere in the items
// (the await makes each chunk finish before the next starts, so run this inside an async function)
const jobArray = chunk(items, 100);
for (let i = 0; i < jobArray.length; i++) {
    await Promise.allSettled(jobArray[i].map(ja => ProcessDevice(ja)))
        .then(function(results) { console.log("PromiseResults: " + results); })
        .catch((err) => { console.log("error: " + err); });
}

Axios.all in NodeJS failing with 404 error

I hope you can help me because it has been HOURS of trying to get this problem resolved. I've googled so much and tried all of the solutions I found, but I keep getting the same error.
I am trying to make an axios GET request to an API that is paginated at 1 result per page, loop through all of the results, and resolve the promises with the promise array.
I have verified that without the loop, just doing 1 request, everything works. I can successfully write to MongoDB using the MongoDB driver and it's fine. Once I bring the loop in, I cannot get the promises to resolve. I was able to console.log that the promise array does, indeed, have x number of pending promises in it.
const MongoClient = require('mongodb')
const axios = require('axios');

const url = 'https://catfact.ninja/fact'
let db = null;
let client = null;

// this one works great
const getMetaData = function () {
    let data = axios.get(url + "s")
        .then(response => {
            return response.data
        }).catch(error => console.log(error));
    return data;
}

// this one will not resolve
const dataArray = async function (total) {
    let baseUrl = url + "s/facts?page="
    let res = [];
    let promises = [];
    for (let page = 1; page <= total; page++) {
        promises.push(axios.get(baseUrl + page))
    }
    axios.all(promises).then(result => console.log(result))
    // originally i wanted to map the result to an array of json
    // objects, but if i could even get a console.log it would be
    // a win. spread operator not working, Promise.all not working
    // i must have tried 12 different stackoverflow responses from
    // other questions. until i can resolve the promises I can't do anything.
}

exports.connect = async function(url, done) {
    if (db) return done();
    // let data = await getMetaData()
    // let total = data['total']
    let arr = dataArray(5);
    // console.log("arr is " + arr)
    MongoClient.connect(url, {useNewUrlParser: true}, function (err, client) {
        if (err) return done(err);
        client = client;
        db = client.db('morefun');
        /*
        db.collection('catfacts').insertMany(dataArray, function(err, res) {
            if (err) throw err;
            console.log("Inserted: " + res.insertedCount);
        })*/
        done();
    });
}

exports.get = function() {
    return db;
}

// make sure this is correct
exports.close = function(done) {
    if (db) {
        client.close(function(err, result) {
            db = null;
            mode = null;
            done(err);
        });
    }
}
I need an array of JSON objects for the insertMany function to work. Please, someone help me. What am I doing wrong?
In the for loop, you are creating a URL like this: https://catfact.ninja/facts/facts?page=1 – this is incorrect, the correct URL should be https://catfact.ninja/facts?page=1 (with facts only once).
Also, the keyword async is not needed here, and you should return the result of axios.all.
A correct version of your code:
const dataArray = function (total) {
    let baseUrl = url + "s?page="
    let promises = [];
    for (let page = 1; page <= total; page++) {
        promises.push(axios.get(baseUrl + page))
    }
    return axios.all(promises).then(result => {
        console.log(result);
        return result;
    });
}
You can then get your data like this:
let arr = await dataArray(5);
Getting the actual data the way you want it
From your comments, I see that what you really want is to post-process the data obtained from the API to ultimately get one array that contains only the cat data.
You can do this by “massaging” the data with map and reduce, like this:
return axios
    .all(promises)
    .then(result => result.map(({ data }) => data.data).reduce((acc, curr) => acc.concat(curr), []));
Note: I've left out the console.log statement here for brevity.
The actual data is nested as a 'data' property inside an object that is itself the response's 'data' property, so the map call retrieves that.
That gives us an array of arrays, each containing cat-data objects; the reduce call flattens this into a single array of cat data.
We get a result that looks like this, which is hopefully what you want 😁:
[
    {
        "fact": "Cats see six times better in the dark and at night than humans.",
        "length": 63
    },
    {
        "fact": "The ability of a cat to find its way home is called “psi-traveling.” Experts think cats either use the angle of the sunlight to find their way or that cats have magnetized cells in their brains that act as compasses.",
        "length": 220
    },
    {
        "fact": "Cat's urine glows under a black light.",
        "length": 38
    },
    {
        "fact": "There are more than 500 million domestic cats in the world, with approximately 40 recognized breeds.",
        "length": 100
    },
    {
        "fact": "A tomcat (male cat) can begin mating when he is between 7 and 10 months old.",
        "length": 76
    }
]
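As a side note (not part of the original answer): on Node 11+ the built-in Array.prototype.flat does the same job as the map/reduce combination:

return axios
    .all(promises)
    .then(result => result.map(({ data }) => data.data).flat());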
Not sure if it's the answer, but I know I've run into issues when not using the exact syntax axios wants for axios.all:
axios.all([fetch1request(), fetch2request()])
    .then(axios.spread((fetch1, fetch2) => {
        /* whatever logic you need; at this point the requests are complete */
    }));
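For what it's worth, axios.all is just an alias for Promise.all, so plain Promise.all with array destructuring works just as well (a minimal sketch):

Promise.all([fetch1request(), fetch2request()])
    .then(([fetch1, fetch2]) => {
        // both responses are available here
    });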

How to detect when multiple asynchronous calls for multiple arrays are complete in Node.js

I am using the ssh2-sftp-client package to recursively read all the directories inside a given remote path.
Here is the code.
const argv = require('yargs').argv;
const Client = require('ssh2-sftp-client');
const server = new Client();

const auth = {
    host: '192.168.1.11',
    username: argv.u,
    password: argv.p
};
const serverRoot = '/sites/';
const siteName = 'webmaster.com';

// list of directories on the server will be pushed to this array
const serverPaths = [];

server.connect(auth).then(() => {
    console.log(`connected to ${auth.host} as ${auth.username}`);
}).catch((err) => {
    if (err) throw err;
});

server.list('/sites/').then((dirs) => {
    recursiveDirectorySearch(dirs, `${serverRoot}${siteName}/`);
})
.catch((err) => {
    if (err) throw err;
});
function recursiveDirectorySearch(dirs, prevPath) {
    let paths = dirs.filter((dir) => {
        // returns directories only
        return dir.type === 'd';
    });
    if (paths.length > 0) {
        paths.forEach((path) => {
            server
                .list(`${prevPath}${path.name}`)
                .then((dirs) => {
                    console.log(`${prevPath}${path.name}`);
                    recursiveDirectorySearch(dirs, `${prevPath}${path.name}`);
                    serverPaths.push(`${prevPath}${path.name}`);
                });
        });
    }
}
First, a connection is made to the server and everything under the '/sites/' directory is listed and passed to the 'recursiveDirectorySearch' function. This function receives as its first parameter an array of whatever was found under '/sites/', which is filtered so that it only contains directories. If one or more directories were found, a call to the server is made for each of them to retrieve everything under '/sites/' + 'name of the directory'. The same function is then called again with whatever the server returns, until no more directories are found.
Whenever a directory is found, its path string is pushed to the 'serverPaths' array. As far as I can tell, this search works and successfully pushes all the directory names to the array.
However, I can't think of a way to detect when this recursive search is complete so I can do something with the 'serverPaths' array.
I tried to take advantage of Promise.all(), but I don't know how to use it when the number of function calls to be made is unknown.
You're simply lacking a couple of returns; add a Promise.all and an Array#map and you're done.
Note: this isn't using Promise.all on serverPaths, but rather relying on the fact that returning a promise inside .then results in the promise returned by .then waiting for that returned promise (hmmm, that isn't very well explained, is it, but it's Promises 101 stuff really!).
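In other words (a minimal illustration, not part of the original answer):

// the promise returned by the first .then adopts the inner promise,
// so the second .then only runs once the inner promise has resolved
Promise.resolve()
    .then(() => new Promise(resolve => setTimeout(() => resolve('inner done'), 100)))
    .then(value => console.log(value)); // logs 'inner done' after ~100 ms

With that in mind, here is the corrected code: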
server.list('/sites/').then((dirs) => {
    // added a return here
    return recursiveDirectorySearch(dirs, `${serverRoot}${siteName}/`);
})
.then(() => {
    // everything is done at this point,
    // serverPaths should be complete
})
.catch((err) => {
    if (err) throw err;
});

function recursiveDirectorySearch(dirs, prevPath) {
    let paths = dirs.filter((dir) => {
        // returns directories only
        return dir.type === 'd';
    });
    // added a return, Promise.all and changed forEach to map
    return Promise.all(paths.map((path) => {
        // added a return here
        return server
            .list(`${prevPath}${path.name}`)
            .then((dirs) => {
                console.log(`${prevPath}${path.name}`);
                // swapped the next two lines
                serverPaths.push(`${prevPath}${path.name}`);
                // added a return here, push the path before
                return recursiveDirectorySearch(dirs, `${prevPath}${path.name}`);
            });
    }));
}
One of the main things that jumps out at me is your initial if statement: if (paths.length > 0) { run recursion }. This appears to work really well for the first call, because you know that the data coming back will be an array full of directories.
Your function, however, does not appear to have logic built out for an array with a length of 0. In that scenario you might still collect all of the directory names you are looking for, presented in the manner you are looking for, but it also means that your calls on the higher parts of the tree are never able to resolve.
Try adding logic to handle the case of an array with a length of zero: if (paths.length === 0) return; would be a hard break out of the recursive calls on the higher parts of the stack.

Is it possible to asynchronously collect items from a generator into an array?

I'm playing around with writing a web service using Node.js/Express which generates some objects based on templates and then returns the generated data. I'm using Bluebird promises to manage all the async logic. After stripping out all the unimportant stuff, my code looks something like this[1].
My problem is the core logic can block for several seconds if the requested number of output elements is large. Since I've been playing with ES6 for this project, my first thought was to factor out the element creation into a generator[2]. However, the only way I can find to get all the results from this generator is Array.from, which doesn't help with the blocking.
I've played around with .map, .all, .coroutine, and a couple of other things, in an attempt to asynchronously collect the results from the generator, but I haven't had any luck. Is there any nice way to do this with Bluebird? (Or perhaps a better way of doing it altogether?)
Native ES6 Promise.all can take an iterable and give back an array of values, but V8 doesn't support this yet. Also, in my experimentation with polyfills/Firefox, it seems to be synchronous.
This is a not-too-common operation, so I don't care much about absolute performance. I just want to avoid blocking the event queue, and I would prefer a nice, easy-to-read and maintainable solution.
[1]:
let Bluebird = require('bluebird');
let templates = ...; // logic to load data templates

function createRandomElementFromRandomTemplate(templates) {
    let el;
    // synchronous work that can take a couple of milliseconds...
    return el;
};

api.createRandomElements = function(req, res) {
    let numEls = req.params.numEls;
    Bluebird.resolve(templates)
        .then(templates => {
            let elements = [];
            // numEls could potentially be several thousand
            for (let i = 0; i < numEls; ++i) {
                elements.push(createRandomElementFromRandomTemplate(templates));
            }
            return elements;
        })
        .then(elements => {
            res.json(elements);
        })
        .error(err => {
            res.status(500).json(err);
        });
}
[2]:
function* generateRandomElementsFromRandomTemplate(templates, numEls) {
    for (let i = 0; i < numEls; ++i) {
        let el;
        // synchronous work that can take a couple of milliseconds...
        yield el;
    }
}

api.createRandomElements = function(req, res) {
    let numEls = req.params.numEls;
    Bluebird.resolve(templates)
        .then(templates => {
            // this still blocks
            return Array.from(generateRandomElementsFromRandomTemplate(templates, numEls));
        })
        .then(elements => {
            res.json(elements);
        })
        .error(err => {
            res.status(500).json(err);
        });
}
Here's a halfway-decent solution I found after looking more closely at Bluebird's .map() as Benjamin suggested. I still have the feeling I'm missing something, though.
The main reason I started with Bluebird was because of Mongoose, so I left a bit of that in for a more realistic sample.
let Bluebird = require('bluebird');
let mongoose = require('mongoose');
Bluebird.promisifyAll(mongoose);

const Template = mongoose.models.Template,
      UserPref = mongoose.models.UserPref;

// just a normal function that generates one element with a random choice of template
function createRandomElementFromRandomTemplate(templates, userPrefs) {
    let el;
    // synchronous work that can take a couple of milliseconds...
    return el;
}
api.generate = function(req, res) {
    let userId = req.params.userId;
    let numRecords = req.params.numRecords;
    let data;
    Bluebird.props({
        userprefs: UserPref.findOneAsync({userId: userId}),
        templates: Template.findAsync({})
    })
    .then(_data => {
        data = _data;
        // use a sparse array to convince .map() to loop the desired number of times
        return Array(numRecords);
    })
    .map(() => {
        // ignore the parameter map passes in - we're using the exact same data in each iteration
        // generate one item each time and let Bluebird collect them into an array
        // I think this could work just as easily with a coroutine
        return Bluebird.delay(createRandomElementFromRandomTemplate(data.templates, data.userprefs), 0);
    }, {concurrency: 5})
    .then(generated => {
        return Generated.createAsync(generated);
    })
    .then(results => {
        res.json(results);
    })
    .catch(err => {
        console.log(err);
        res.status(500);
    });
};
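An alternative that avoids the sparse-array trick entirely (a sketch, not from the original post): drain the generator one batch at a time and yield to the event loop with setImmediate between batches:

// collect up to batchSize items per event-loop turn, then let pending I/O run
function collectAsync(iterator, batchSize = 100) {
    return new Promise((resolve) => {
        const results = [];
        (function drain() {
            for (let i = 0; i < batchSize; i++) {
                const { value, done } = iterator.next();
                if (done) return resolve(results);
                results.push(value);
            }
            setImmediate(drain);
        })();
    });
}

api.createRandomElements could then simply await (or .then) collectAsync(generateRandomElementsFromRandomTemplate(templates, numEls)) and res.json the result.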
