NodeJS Variable Collision? Async / Sync with Request - javascript

Issue: the INFO_X data sometimes, seemingly at random, becomes null.
Question:
Do the variables INFO_1, INFO_2 and INFO_3 overwrite each other, since Node.js runs fast, unlike PHP, which follows a step-by-step sequence?
I checked for nulls before making a request, and the debug output shows the values are NOT null before executing the second request; yet at random, some variables become null when the second request is submitted.
I also verified that my source is definitely not returning any nulls.
Are the variables being overwritten before the second request is sent? Please advise.
var request = require('request');

var urls = ['URL',
            'URL',
            'URL'];

urls.forEach(processUrl);

function processUrl(url) {
    request(url, function (error, response, body) {
        if (!error) {
            var obj = JSON.parse(body);
            for (var i = 0, len = obj['products'].length; i < len; ++i) {
                var data = obj['products'][i];
                var INFO_1 = data.INFO_1;
                var INFO_2 = data.INFO_2;
                var INFO_3 = data.INFO_3;
                request("URL/POSTINFO?INFO_1=" + INFO_1 + "&INFO_2=" + INFO_2 + "&INFO_3=" + INFO_3 + "&seller_id=", function (error, response, body) {
                    console.log(body);
                });
            }
        }
    });
}

Yes, that is the case, because the request function is asynchronous. I wouldn't call Node.js "faster" than PHP; it just runs request methods asynchronously, while PHP is generally synchronous.
You could resolve the issue with promises, e.g. Promise.all() with an array of promises, one per request (see here: https://developer.mozilla.org/en/docs/Web/JavaScript/Reference/Global_Objects/Promise/all), or, in your case, use this library: https://github.com/caolan/async#forEach
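For illustration, here is a minimal sketch of that Promise-based approach (untested; it assumes the same request library and the urls array from the question):

const request = require('request');

function fetchUrl(url) {
    return new Promise(function (resolve, reject) {
        request(url, function (error, response, body) {
            if (error) return reject(error);
            resolve(body);
        });
    });
}

Promise.all(urls.map(fetchUrl))
    .then(function (bodies) {
        // every body is available here, in the same order as urls
        bodies.forEach(function (body) {
            var obj = JSON.parse(body);
            // ... post each product's INFO_1 / INFO_2 / INFO_3 as before
        });
    })
    .catch(function (err) {
        console.error(err);
    });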
Since you're already using callbacks in the request function, the async library linked above is also a good fit. Here's some example code:
var async = require('async');

// mock request function to simulate an asynchronous call
function request(url, cb) {
    setTimeout(() => {
        cb(url + ' accessed at ' + new Date());
    }, 2000);
}

var urls = ['URL1', 'URL2', 'URL3'];

async.each(urls, (item, done) => {
    console.log(item);
    request(item, (value) => {
        request(value, (newValue) => {
            console.log(newValue);
            done(); // signal async.each that this item is finished
        });
    });
}, (err) => {
    console.log(err);
});
Here's a working example: https://plnkr.co/edit/Q5RAvKdaLxV9cUT4GP4w?p=preview
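Applied to the code from the question, a minimal untested sketch might look like this (the URLs and field names are the question's placeholders); note how each POST signals completion before the final callback fires:

var async = require('async');
var request = require('request');

async.each(urls, function (url, done) {
    request(url, function (error, response, body) {
        if (error) return done(error);
        var products = JSON.parse(body).products;
        // queue one POST per product, then signal this url as done
        async.each(products, function (data, posted) {
            request("URL/POSTINFO?INFO_1=" + data.INFO_1 +
                    "&INFO_2=" + data.INFO_2 +
                    "&INFO_3=" + data.INFO_3 + "&seller_id=", function (err, res, postBody) {
                console.log(postBody);
                posted(err);
            });
        }, done);
    });
}, function (err) {
    if (err) console.error(err);
});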

Related

How to do sequential HTTP calls?

I have a couple of APIs I need to call to collect and merge information.
I make the first API call and, based on the result, I make several calls to the second one (in a loop).
Since HTTP requests are asynchronous, I'm losing the information: by the time the second step has finished, the server (Node.js) has already sent the response back to the client.
I've already tried to, somehow, use the callback functions. This managed to keep the response to the client waiting, but the information from the second call was still lost. I guess the variables are somehow not being synchronized.
I also did a quick test with async/await, but my JavaScript mojo was not enough to make it run without errors.
/* pseudo code */
function getData(var1, callback) {
    url = "http://test.server/bla?param=" + var1;
    request.get(url, function (error, response, body) {
        var results = [];
        for (var item of JSON.parse(body).entity.resultArray) {
            var o = {};
            o['data1'] = item.data1;
            o['data2'] = item.data2;
            o['data3'] = item.data3;
            getSecondStep(o, function (secondStepData) {
                //console.log("Callback object");
                //console.log(o);
                o['secondStepData'] = secondStepData;
            });
            results.push(o);
        }
        callback(results);
    });
}

function getSecondStep(object, callback) {
    url = "http://othertest.server/foobar?param=" + object.data1;
    request.get(url, function (error, response, body) {
        var results = [];
        if (response.statusCode == 200) {
            for (var item of JSON.parse(body).object.array) {
                var o = {};
                o['data4'] = item.data4;
                o['data5'] = item.data5;
                results.push(o);
            }
            callback(results);
        }
    });
}
What I would like is to be able to collect all the information into one JSON object to return it back to the client.
The client will then be responsible for rendering it in a nice way.
I recommend using the async / await pattern with the request-promise-native library.
This makes API calls really easy to make and the code is cleaner when using this pattern.
In the example below I'm just calling a httpbin API to generate a UUID but the principle applies for any API.
const rp = require('request-promise-native');

async function callAPIs() {
    let firstAPIResponse = await rp("https://httpbin.org/uuid", { json: true });
    console.log("First API response: ", firstAPIResponse);

    // Call several times; we can switch on the first API response if we like.
    const callCount = 3;
    let promiseList = [...Array(callCount).keys()].map(() => rp("https://httpbin.org/uuid", { json: true }));
    let secondAPIResponses = await Promise.all(promiseList);

    return { firstAPIResponse: firstAPIResponse, secondAPIResponses: secondAPIResponses };
}

async function testAPIs() {
    let combinedResponse = await callAPIs();
    console.log("Combined response: ", combinedResponse);
}

testAPIs();
In this simple example we get a combined response like so:
{
    firstAPIResponse: { uuid: '640858f8-2e69-4c2b-8f2e-da8c68795f21' },
    secondAPIResponses: [
        { uuid: '202f9618-f646-49a2-8d30-4fe153e3c78a' },
        { uuid: '381b57db-2b7f-424a-9899-7e2f543867a8' },
        { uuid: '50facc6e-1d7c-41c6-aa0e-095915ae3070' }
    ]
}
I suggest you move over to a library that supports promises (e.g. https://github.com/request/request-promise), as the code becomes much easier to deal with than with the callback method.
Your code would look something like:
function getData(var1) {
    var url = "http://test.server/bla?param=" + var1;
    return request.get({ url: url, json: true }).then(result1 => {
        var arr = result1.entity.resultArray;
        return Promise.all(arr.map(x =>
            request.get({ url: "http://othertest.server/foobar?param=" + x.data1, json: true })
        )).then(result2 => {
            return arr.map((x, i) => ({
                data1: x.data1,
                data2: x.data2,
                data3: x.data3,
                secondStepData: result2[i].object.array.map(y => ({ data4: y.data4, data5: y.data5 }))
            }));
        });
    });
}
And usage would be
getData("SomeVar1").then(result => ... );
The problem is that you are calling the callback while you still have async calls in flight. Several approaches are possible, such as using async/await, or Promises (which I would probably do in your case; a Promise-based sketch follows the pseudo code below).
Or you can, well, call the callback only when you have all the information available. Pseudo code follows:
function getData(var1, callback) {
    url = "http://test.server/bla?param=" + var1;
    request.get(url, function (error, response, body) {
        var results = [];
        var items = JSON.parse(body).entity.resultArray;
        var done = 0, max = items.length;
        for (const item of items) {
            // `const` keeps a separate `o` per iteration; with `var`,
            // every callback below would close over the same (last) object
            const o = {};
            o['data1'] = item.data1;
            o['data2'] = item.data2;
            o['data3'] = item.data3;
            getSecondStep(o, function (secondStepData) {
                //console.log("Callback object");
                //console.log(o);
                o['secondStepData'] = secondStepData;
                results.push(o);
                done += 1;
                if (done === max) callback(results);
            });
        }
    });
}
(note that since this is pseudo code, I am not checking for errors or handling a possibly empty result from request.get(...))
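For reference, here is the Promise-based variant mentioned above, as a minimal untested sketch that wraps the existing callback-style getSecondStep:

function getDataP(var1) {
    return new Promise(function (resolve, reject) {
        var url = "http://test.server/bla?param=" + var1;
        request.get(url, function (error, response, body) {
            if (error) return reject(error);
            var items = JSON.parse(body).entity.resultArray;
            // one promise per item; Promise.all preserves order
            Promise.all(items.map(function (item) {
                return new Promise(function (res) {
                    var o = { data1: item.data1, data2: item.data2, data3: item.data3 };
                    getSecondStep(o, function (secondStepData) {
                        o.secondStepData = secondStepData;
                        res(o);
                    });
                });
            })).then(resolve, reject);
        });
    });
}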
You need to call the callback of the first function only when all the second callback functions have been called. Try these changes:
function getData(var1, callback) {
    url = "http://test.server/bla?param=" + var1;
    request.get(url, function (error, response, body) {
        var results = [], count = 0;
        var arr = JSON.parse(body).entity.resultArray;
        for (let [index, value] of arr.entries()) {
            const o = {};
            o['data1'] = value.data1;
            o['data2'] = value.data2;
            o['data3'] = value.data3;
            getSecondStep(o, function (secondStepData) {
                //console.log("Callback object");
                //console.log(o);
                o['secondStepData'] = secondStepData;
                results[index] = o;
                count++;
                if (count === arr.length) {
                    callback(results);
                }
            });
        }
    });
}

Multiple Http requests called and stored in one array, but how to wait until all requests finish before working with array

I have an array of values I want to loop over. Each of these values will be used to make an HTTP request to a server, and for each request I will receive a response. I want to store all these responses in a single array and then do work on the array once ALL requests have finished. Due to the async nature of my code, the work I want to do with the array starts before ALL the requests have finished. How can I make this code "synchronous", in the sense that it waits until all requests have finished before starting the work with the listOfResponses array?
//import the request library to make http requests to a server
const request = require('request');

//values to be sent via a restful GET request
const list = [
    'value_one',
    'value_two'
];

//store responses from GET request
var listOfResponses = [];

//loop through the list
list.forEach(function (word) {
    //Make a rest GET call to a server
    var url = 'http://notsurehowtomakethisworksoiamaskingstackoverflow.com/api/words/' + word;
    request(url, {
        json: true
    }, (err, res, body) => {
        if (err) {
            return console.log(err);
        }
        //store the response from the server into our array
        listOfResponses.push(body.response);
    });
});

/* *******************************
HERE I WANT TO DO STUFF WITH listOfResponses ONCE ALL THE REQUESTS FINISH
********************************** */
Just map it to an array of promises:
const promises = list.map(word => new Promise((resolve, reject) => {
    var url = 'http://notsurehowtomakethisworksoiamaskingstackoverflow.com/api/words/' + word;
    request(url, {
        json: true
    }, (err, res) => {
        if (err) {
            return reject(err);
        }
        resolve(res.body);
    });
}));
Then you can get all the results using Promise.all:
Promise.all(promises).then(results => {
    //...
});
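Note that Promise.all rejects as soon as any single promise rejects, so in practice you would likely add a .catch to the chain as well, e.g.:

Promise.all(promises)
    .then(results => { /* ... */ })
    .catch(err => console.error(err));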
Simply check the responses each time a request ends:
//import the request library to make http requests to a server
const request = require('request');

//values to be sent via a restful GET request
const list = [
    'value_one',
    'value_two'
];

//store responses from GET request
var listOfResponses = [];

//loop through the list
list.forEach(function (word) {
    //Make a rest GET call to a server
    var url = 'http://notsurehowtomakethisworksoiamaskingstackoverflow.com/api/words/' + word;
    request(url, {
        json: true
    }, (err, res, body) => {
        if (err) {
            return console.log(err);
        }
        //store the response from the server into our array
        listOfResponses.push(body.response);
        check();
    });
});

// CHECK THE LIST
function check() {
    if (listOfResponses.length == list.length) {
        console.log("YAY! Here you have your responses", listOfResponses);
    }
}
This is an asynchronous scenario; one way to accomplish it is to recursively call a function that loops over your list of words, where the recursion advances on each response from your server.
Another approach is using Promises.
Look at this code snippet (recursion approach):
//import the request library to make http requests to a server
const request = require('request');

//values to be sent via a restful GET request
const list = [
    'value_one',
    'value_two'
];

//store responses from GET request
var listOfResponses = [];

//loop through the list recursively
var loop = function (array, index, cb) {
    if (index === array.length) {
        cb();
        return;
    }
    //Make a rest GET call to a server
    var url = 'http://notsurehowtomakethisworksoiamaskingstackoverflow.com/api/words/' + array[index];
    request(url, {
        json: true
    }, (err, res, body) => {
        if (err) {
            return console.log(err);
        }
        //store the response from the server into our array
        listOfResponses.push(body.response);
        loop(array, index + 1, cb);
    });
};

loop(list, 0, function () {
    /* *******************************
    HERE I WANT TO DO STUFF WITH listOfResponses ONCE ALL THE REQUESTS FINISH
    ********************************** */
});
As you can see, the loop starts with a call to the loop function with index = 0, and every response calls loop again with an incremented index.
The recursion ends when index === list.length, and the callback is then executed to continue the flow of your logic.

Process 50k webpages on runtime (NodeJS)

I need to download ~50k webpages, get some data from them, and put it into a variable.
I wrap each request into a Promise and then Promise.all() them. I use the Request library.
Simplified code:
const request = require('request');
const urls = [url1, url2, ...];
const promises = [];

urls.forEach(url => {
    promises.push((resolve, reject) => {
        request(url, (error, response, body) => {
            if (error) { reject(error); return; }
            // do something with page
            resolve(someData);
        });
    });
});

Promise.all(promises.map(pr => new Promise(pr)))
    .then((someDataArray) => { /* process data */ });
But I receive an ENFILE exception, which stands for too many open files in the system (on my desktop the maximum number of open files is 2048).
I know that Promises execute on creation, but I can't solve this problem.
Maybe there is another approach?
Thanks for any response.
What you want is to launch N requests, then start a new one whenever one finishes (be it successful or not).
There are many libraries for that, but it's important to be able to implement this kind of limitation yourself:
const request = require('request');
const urls = [url1, url2, ...];
const MAX_QUERIES = 10;
var remaining = urls.length;

function startQuery(url) {
    if (!url) return;
    request(url, (error, response, body) => {
        if (error) {
            // handle error
        } else {
            // handle result
        }
        startQuery(urls.shift());
        if (--remaining == 0) return allFinished();
    });
}

// prime the pool with MAX_QUERIES parallel requests
for (var i = 0; i < MAX_QUERIES; i++) startQuery(urls.shift());

function allFinished() {
    // all done
}
You can try async.forEachLimit, where you define a limit on the number of concurrent requests. It keeps up to that many requests in flight, starting the next one as soon as a running one completes.
Install the package using npm install --save async
async.forEachLimit(urls, 50, function (url, callback) {
    //process url using request module
    callback();
}, function (err) {
    if (err) return console.error(err);
    console.log("All urls are processed");
});
For further help, see: https://caolan.github.io/async/docs.html
Others have said how to do the flow control using async or promises, and I won't repeat them. Personally, I prefer the async JS method, but that's just my preference.
Two things they did not cover, however, which I think are as important as flow control if you want your script to be performant and reliable:
1) Don't rely on callbacks or promises to handle processing the files. All examples provided so far use those. Instead, I would use the request streams API to treat each request as a readable stream and pipe that stream to a writable that processes it. The simplest example is to use fs to write the file to the file system. This makes much better use of your system resources, since each data chunk is written to storage as it comes in, rather than holding the whole file in memory. You can then call the callback or resolve the promise when the stream ends (see the sketch after point 2).
2) You should not try to process an in-memory list of 50k URLs. If you do, and you fail on, let's say, the 20,000th URL, you then have to figure out how to sort the done ones from the not-done ones and update your code or the JSON file you read them from. Instead, use a database (any will do) with a collection/table of URLs and metadata about them. When your program runs, query for the ones that lack the attributes indicating a successful fetch; when a fetch succeeds or fails, record it in that same data structure, so it can tell you why a URL failed or when it succeeded.
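As a minimal sketch of point 1 (untested; it assumes the request library's documented stream interface, and the URL and output path are placeholders):

const fs = require('fs');
const request = require('request');

function fetchToFile(url, path, cb) {
    request(url)
        .on('error', cb)                      // network error
        .pipe(fs.createWriteStream(path))     // stream each chunk straight to disk
        .on('finish', () => cb(null, path))   // whole response written
        .on('error', cb);                     // file system error
}

fetchToFile('http://example.com', 'page.html', (err, path) => {
    if (err) return console.error(err);
    console.log('saved to', path);
});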
Install the async package and use forEachLimit to limit the number of operations:
const request = require('request');
const async = require("async");

const urls = [];
for (var temp = 0; temp < 1024; temp++) {
    urls.push("http://www.google.com");
}

var i = 0;
async.forEachLimit(urls, 10, function (url, callback) {
    request(url, (error, response, body) => {
        if (error) {
            callback(error);
            return;
        }
        var somedata = null;
        console.log(++i);
        callback(null, somedata);
    });
}, function (err) {
    /* process data */
});
As said in the comments, you could use the async.js module:
const request = require('request');
const async = require('async');

var listOfUrls = [url1, url2, ...];

async.mapLimit(listOfUrls, 10, function (url, callback) {
    // iterator function
    request(url, function (error, response, body) {
        if (!error && response.statusCode == 200) {
            var dataFromPage = ""; // get data from the page
            callback(null, dataFromPage);
        } else {
            callback(error || response.statusCode);
        }
    });
}, function (err, results) {
    // completion function
    if (!err) {
        // process all results in the array here
        // Do something with the data
        console.log(results);
    } else {
        // handle error here
        console.log(err);
    }
});
Here you process 10 URLs at a time; when all URLs have been processed, the completion callback is called, where you can work with your data.

Node.js web scraping with loop on array of URLs

I'm trying to build a little script to scrape some data. I have some basic knowledge of JavaScript; however, I'm kind of lost with all the async callback and promise stuff. Here is what I have now:
url = "http://Blablablabla.com";
var shares = function(req, res) {
request(url, function (error, response, body) {
if (!error) {
var $ = cheerio.load(body),
share = $(".theitemIwant").html();
return res.send(url + ":" + share);
} else {
console.log("We've encountered an error: " + error);
}
})
}
So everything is fine with this piece of code. What I would like to do is:
Use an array of URLs: var urls = [url1, url2, url3, etc...]
Store my scraped data into another array, something like this: data = [{url: url1, shares: share}, {url: url2, shares: share}, etc...]
I know I need to use something like data.push({ url: url, shares: share })
and I understand that I need to loop over my first URL array to push data into my second data array.
However, I'm kind of lost with the request method and the way I should deal with the async issue in my situation.
Thanks!
edit #1:
I tried this to use promises:
var url = "www.blablabla.com";
var geturl = request(url, function (error, response, body) {
    if (!error) { return $ = cheerio.load(body); }
    else { console.log("We've encountered an error: " + error); }
});

var shares = geturl.then(function () {
    return $(".nb-shares").html();
});
but got the following error: geturl.then is not a function
I think you should use async:
var request = require('request');
var cheerio = require('cheerio');
var async = require('async');

var urls = ["http://example.com", "http://example.com", "http://example.com"];
var data = [];

var calls = urls.map((url) => (cb) => {
    request(url, (error, response, body) => {
        if (error) {
            console.error("We've encountered an error:", error);
            return cb();
        }
        var $ = cheerio.load(body),
            share = $(".theitemIwant").html();
        data.push({ url, share });
        cb(); // signal this call as finished so async.parallel can complete
    });
});

async.parallel(calls, () => { /* YOUR CODE HERE */ });
You could do the same with promises, but I don't see why you would.
I took a stab at it. You need to install the q library and require it:
var Q = require('q');
//... wherever your function is

//start with an array of string urls
var urls = ["http://Blablablabla.com", '...', '...'];

//store results in this array in the form:
// {
//   url: url,
//   promise: <will be resolved when it's done>,
//   share: 'code that you wanted'
// }
var results = [];

//loop over each url and perform the request
urls.forEach(processUrl);

function processUrl(url) {
    //we use a deferred object so we can know when the request is done
    var deferred = Q.defer();
    //create a new result object and add it to results
    var result = {
        url: url,
        promise: deferred.promise
    };
    results.push(result);
    //perform the request
    request(url, function (error, response, body) {
        if (!error) {
            var $ = cheerio.load(body),
                share = $(".theitemIwant").html();
            //resolve the promise so we know this request is done.
            // no one is using the resolved value, but if they were they would get share
            deferred.resolve(share);
            //set the value we extracted on the result object
            result.share = share;
        } else {
            //request failed, reject the promise to abort the chain and fall into the "catch" block
            deferred.reject(error);
            console.log("We've encountered an error: " + error);
        }
    });
}

//results.map converts the array of result objects to just their promises
//Q.all takes in an array of promises;
//when they are all done it will call your then/catch block.
Q.all(results.map(function (i) { return i.promise; }))
    .then(sendResponse) //when all promises are done it calls this
    .catch(sendError);  //if any promise fails it calls this

function sendError(error) {
    res.status(500).json({ failed: error });
}

function sendResponse(data) { //data = resolved values from every promise
    //process results and convert to your response
    return res.send(results);
}
Here is another solution I like a lot:
const requestPromise = require('request-promise');
const Promise = require('bluebird');
const cheerio = require('cheerio');

const urls = ['http://google.be', 'http://biiinge.konbini.com/series/au-dela-des-murs-serie-herve-hadmar-marc-herpoux-critique/?src=konbini_home'];

Promise.map(urls, requestPromise)
    .map((htmlOnePage, index) => {
        const $ = cheerio.load(htmlOnePage);
        const share = $('.nb-shares').html();
        let shareTuple = {};
        shareTuple[urls[index]] = share;
        return shareTuple;
    })
    .then(console.log)
    .catch((e) => console.log('We encountered an error' + e));
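A side note on this approach: bluebird's Promise.map also accepts a concurrency option, which would let you cap the number of simultaneous requests if the URL list grows large, e.g.:

Promise.map(urls, requestPromise, { concurrency: 10 })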

variable not defined node.js

I'm trying to use Node.js to get a response from an API; I want to clean the API response and use the result.
To access the API I have the following code.
To store and use the result, I believe I need to store the JSON output globally.
However, I can't work out how to do this.
Example -
var request = require('request');

request({url: 'https://www.car2go.com/api/v2.1/vehicles?loc=wien&oauth_consumer_key=car2gowebsite&format=json', json: true}, function (err, res, json) {
    if (err) {
        throw err;
    }
    car2go = json.placemarks;
    for (i = 0; i < car2go.length; i++) {
        delete car2go[i].address;
        delete car2go[i].charging;
        delete car2go[i].exterior;
        delete car2go[i].interior;
        delete car2go[i].smartPhoneRequired;
        delete car2go[i].vin;
        car2go[i].vendor = 'car2go';
        car2go[i].city = 'wien';
        car2go[i].carmake = 'Smart';
        car2go[i].carmodel = 'Fortwo';
    }
    console.log(car2go);
});
This prints the desired result; however, I know that this only works because my variable is used within the function.
I want to access the variable outside of the function.
To test whether I could do this, I changed the code to -
var request = require('request');

request({url: 'https://www.car2go.com/api/v2.1/vehicles?loc=wien&oauth_consumer_key=car2gowebsite&format=json', json: true}, function (err, res, json) {
    if (err) {
        throw err;
    }
    car2go = json.placemarks;
    for (i = 0; i < car2go.length; i++) {
        delete car2go[i].address;
        delete car2go[i].charging;
        delete car2go[i].exterior;
        delete car2go[i].interior;
        delete car2go[i].smartPhoneRequired;
        delete car2go[i].vin;
        car2go[i].vendor = 'car2go';
        car2go[i].city = 'wien';
        car2go[i].carmake = 'Smart';
        car2go[i].carmodel = 'Fortwo';
    }
});
console.log(car2go);
But if I do this I get:
ReferenceError: car2go is not defined
I am running Node v0.12.2 on Mac OS X Yosemite (10.10.3).
Admittedly I am very new to Node, and I am more familiar with R, Python, and PL/SQL.
There is no way to get a reference to it outside of the callback function, because the console.log line runs before the callback function is invoked. The reason you have to pass a callback function into the request API is that the request library needs to invoke that function when it's done making the request. Meanwhile, your app moves on and does other things (such as running that console.log line) while it waits for the callback function to fire.
That said, there are a number of ways to deal with asynchronous code. My favorite way is with promises. I use a library called bluebird for handling promises.
var request = require('request');
var Promise = require('bluebird');
var requestP = Promise.promisify(request);
The call to Promise.promisify(request) returns a new function that doesn't take a callback function, but instead returns a promise.
requestP({ url: 'https://www.car2go.com/api/v2.1/vehicles?loc=wien&oauth_consumer_key=car2gowebsite&format=json', json: true })
    .spread(function (res, json) {
        var car2go = json.placemarks;
        for (i = 0; i < car2go.length; i++) {
            delete car2go[i].address;
            delete car2go[i].charging;
            delete car2go[i].exterior;
            delete car2go[i].interior;
            delete car2go[i].smartPhoneRequired;
            delete car2go[i].vin;
            car2go[i].vendor = 'car2go';
            car2go[i].city = 'wien';
            car2go[i].carmake = 'Smart';
            car2go[i].carmodel = 'Fortwo';
        }
        return car2go; // pass the cleaned array down the promise chain
    })
    .then(function (car2go) {
        console.log(car2go);
    })
    .catch(function (err) {
        console.error(err);
    });
Note: .spread is the same as .then, except that if the resolved value is an array (which it will be here, because the callback passed to the request library accepts two arguments, which bluebird translates into an array the promise resolves to), .spread will split the array back into multiple arguments passed into the function you give it.
Promise.resolve(['hi', 'there']).then(function (result) {
    console.log(result); // "['hi', 'there']"
});
Promise.resolve(['hi', 'there']).spread(function (str1, str2) {
    console.log(str1); // 'hi'
    console.log(str2); // 'there'
});
You're not going to be able to return that value all the way back out to the same context from which you began the asynchronous call, but you can at least write code that looks somewhat synchronous when using promises.
Without promises you'll be forced to call functions from within functions from within functions from within functions ;)
The response is asynchronous. That means the callback function gets called sometime LATER, so your console.log(car2go) executes BEFORE the callback has even been called.
The only place you can reliably use the response is inside the callback or in a function called from the callback. You cannot use it the way you are trying to. Using asynchronous responses in JavaScript requires programming in an asynchronous fashion, which means processing and using results IN the asynchronous callbacks only.
Here's where the console.log() should be:
var request = require('request');

request({url: 'https://www.car2go.com/api/v2.1/vehicles?loc=wien&oauth_consumer_key=car2gowebsite&format=json', json: true}, function (err, res, json) {
    if (err) {
        throw err;
    }
    car2go = json.placemarks;
    for (i = 0; i < car2go.length; i++) {
        delete car2go[i].address;
        delete car2go[i].charging;
        delete car2go[i].exterior;
        delete car2go[i].interior;
        delete car2go[i].smartPhoneRequired;
        delete car2go[i].vin;
        car2go[i].vendor = 'car2go';
        car2go[i].city = 'wien';
        car2go[i].carmake = 'Smart';
        car2go[i].carmodel = 'Fortwo';
    }
    // here is where the result is available
    console.log(car2go);
});
