This is my function:
function parseLinks(links, callback) {
    var products = [];
    for (var i = 0; i < links.length; i++) {
        request(links[i], function (error, response, body) {
            var product;
            if (!error && response.statusCode == 200) {
                var $ = cheerio.load(body);
                // title
                var title = $('h1').text();
                if (!title)
                    title = $('title').text();
                var description = $('meta[name="description"]').attr('content');
                product = new Product(links[0].trim(), title.trim(), description.trim());
                products.push(product);
            }
        });
    }
    callback(products); // the callback only does a console.log(products)
}
After that, I want to do a console.log(products) that displays all the products.
So I set up a callback attached to parseLinks and call it after the for loop. The problem is that in my for loop I call the asynchronous function request each time, so my callback is called before all the request calls have finished, and my console.log(products) prints an empty array.
Do you know how to fix that?
Thanks
You have to check whether all the asynchronous calls have finished. Create an inner function that calls the callback when all the asynchronous work is done:
function parseLinks(links, callback) {
    var products = [],
        numberOfItems = links.length; // number of links still to be parsed
    function checkIfDone() { // called each time a link has been parsed
        numberOfItems--; // decrement the counter of links left
        if (numberOfItems === 0) // if none are left (all links are parsed), call the callback with the resulting array
            callback(products);
    }
    for (var i = 0; i < links.length; i++) {
        request(links[i], function (error, response, body) {
            // ...
            checkIfDone(); // every time a link is parsed, call checkIfDone
        });
    }
}
You can embed the logic of checkIfDone directly inside the request callback; I used a separate function for clarity.
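For reference, a minimal sketch of that inlined version (same counter logic as above, just without the helper function):
for (var i = 0; i < links.length; i++) {
    request(links[i], function (error, response, body) {
        // ... build the product and push it into products ...
        numberOfItems--;             // one more link has finished
        if (numberOfItems === 0)     // all links are done
            callback(products);      // hand the complete array back
    });
}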
The best way is to use the async module.
var async = require("async");
function parseLinks(links, callback) {
var products = [];
async.forEach(links, function(link, done) {
request(link, function (error, response, body) {
var product;
if (!error && response.statusCode == 200) {
var $ = cheerio.load(body);
// title
var title = $('h1').text();
if (!title)
var title = $('title').text();
var description = $('meta[name="description"]').attr('content');
product = new Product(links[0].trim(), title.trim(), description.trim());
products.push(product);
}
done();
});
}, function() {
callback(products);
});
}
You can use async.each from the async module.
Simplified code:
function parseLinks(links, callback) {
    var products = [];
    async.each(links, function (link, requestCallback) {
        request(link, function (error, response, body) {
            // ... rest of your code, pushing into products ...
            requestCallback(); // this request has ended
        });
    }, function (err) {
        // all requests have ended!
        callback(products);
    });
}
I have a couple of APIs I need to call to collect and merge information.
I make the first API call and, based on the result, I make several calls to the second one (in a loop).
Since HTTP requests are asynchronous I'm losing the information. By the time the second step finishes, the server (Node.js) has already sent the response back to the client.
I've already tried to use the callback functions somehow. This managed to keep the response to the client waiting, but the information from the second call was still lost. I guess the variables are not being synchronized somehow.
I also did a quick test with async/await, but my JavaScript mojo was not enough to make it run without errors.
/* pseudo code */
function getData(var1, callback){
    url = "http://test.server/bla?param=" + var1;
    request.get(url, function (error, response, body){
        var results = [];
        for(var item of JSON.parse(body).entity.resultArray) {
            var o = {};
            o['data1'] = item.data1;
            o['data2'] = item.data2;
            o['data3'] = item.data3;
            getSecondStep(o, function(secondStepData){
                //console.log("Callback object");
                //console.log(o);
                o['secondStepData'] = secondStepData;
            });
            results.push(o);
        }
        callback(results);
    });
}

function getSecondStep(object, callback){
    url = "http://othertest.server/foobar?param=" + object.data1;
    request.get(url, function (error, response, body){
        var results = [];
        if(response.statusCode == 200){
            for(var item of JSON.parse(body).object.array) {
                var o = {};
                o['data4'] = item.data4;
                o['data5'] = item.data5;
                results.push(o);
            }
            callback(results);
        }
    });
}
What I would like is to be able to collect all the information into one JSON object to return it back to the client.
The client will then be responsible for rendering it in a nice way.
I recommend using the async/await pattern with the request-promise-native library.
This makes API calls really easy to make and the code is cleaner when using this pattern.
In the example below I'm just calling a httpbin API to generate a UUID but the principle applies for any API.
const rp = require('request-promise-native');

async function callAPIs() {
    let firstAPIResponse = await rp("https://httpbin.org/uuid", { json: true });
    console.log("First API response: ", firstAPIResponse);

    // Call several times; we can switch on the first API response if we like.
    const callCount = 3;
    let promiseList = [...Array(callCount).keys()].map(() => rp("https://httpbin.org/uuid", { json: true }));
    let secondAPIResponses = await Promise.all(promiseList);

    return { firstAPIResponse: firstAPIResponse, secondAPIResponses: secondAPIResponses };
}

async function testAPIs() {
    let combinedResponse = await callAPIs();
    console.log("Combined response: ", combinedResponse);
}

testAPIs();
In this simple example we get a combined response like so:
{
    firstAPIResponse: { uuid: '640858f8-2e69-4c2b-8f2e-da8c68795f21' },
    secondAPIResponses: [
        { uuid: '202f9618-f646-49a2-8d30-4fe153e3c78a' },
        { uuid: '381b57db-2b7f-424a-9899-7e2f543867a8' },
        { uuid: '50facc6e-1d7c-41c6-aa0e-095915ae3070' }
    ]
}
I suggest you move over to a library that supports promises (e.g. https://github.com/request/request-promise), as the code becomes much easier to deal with than the callback approach.
Your code would look something like:
function getData(var1) {
    var url = "http://test.server/bla?param=" + var1;
    return request.get(url).then(body => {
        var arr = JSON.parse(body).entity.resultArray;
        return Promise.all(arr.map(item =>
            request.get("http://othertest.server/foobar?param=" + item.data1)
                .then(secondBody => ({
                    data1: item.data1,
                    data2: item.data2,
                    data3: item.data3,
                    secondStepData: JSON.parse(secondBody).object.array
                        .map(x => ({ data4: x.data4, data5: x.data5 }))
                }))
        ));
    });
}
And usage would be
getData("SomeVar1").then(result => ... );
The problem is that you are calling the callback while you still have async calls going on. Several approaches are possible, such as using async/await, or switching to Promises (which I would probably do in your case).
Or you can, well, call the callback only when you have all the information available. Pseudo code follows:
function getData(var1, callback) {
    url = "http://test.server/bla?param=" + var1;
    request.get(url, function (error, response, body) {
        var results = [];
        var items = JSON.parse(body).entity.resultArray;
        var done = 0, max = items.length;
        for (let item of items) {
            let o = {};
            o['data1'] = item.data1;
            o['data2'] = item.data2;
            o['data3'] = item.data3;
            getSecondStep(o, function (secondStepData) {
                //console.log("Callback object");
                //console.log(o);
                o['secondStepData'] = secondStepData;
                results.push(o);
                done += 1;
                if (done === max) callback(results);
            });
        }
    });
}
(note that since this is pseudo code, I am not checking for errors or handling a possible empty result from request.get(...))
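Since the answer above mentions Promises as an alternative, here is a rough sketch of what that variant could look like, assuming a hypothetical getSecondStepPromise(item) helper that wraps getSecondStep in a Promise (the names are illustrative, not part of the original code):
function getDataPromise(var1) {
    return new Promise((resolve, reject) => {
        request.get("http://test.server/bla?param=" + var1, (error, response, body) => {
            if (error) return reject(error);
            var items = JSON.parse(body).entity.resultArray;
            // one promise per item, each resolved when its second call finishes
            Promise.all(items.map(item =>
                getSecondStepPromise(item).then(secondStepData => ({
                    data1: item.data1,
                    data2: item.data2,
                    data3: item.data3,
                    secondStepData: secondStepData
                }))
            )).then(resolve, reject);
        });
    });
}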
You need to call the callback of the first function only when all the second callbacks have been called. Try these changes:
function getData(var1, callback) {
    url = "http://test.server/bla?param=" + var1;
    request.get(url, function (error, response, body) {
        var results = [], count = 0;
        var arr = JSON.parse(body).entity.resultArray;
        for (let [index, value] of arr.entries()) {
            let o = {};
            o['data1'] = value.data1;
            o['data2'] = value.data2;
            o['data3'] = value.data3;
            getSecondStep(o, function (secondStepData) {
                //console.log("Callback object");
                //console.log(o);
                o['secondStepData'] = secondStepData;
                results[index] = o;
                count++;
                if (count === arr.length) {
                    callback(results);
                }
            });
        }
    });
}
I've got this function
function parseLink(link) {
    var product;
    request(link, function (error, response, body) {
        if (!error && response.statusCode == 200) {
            var $ = cheerio.load(body);
            // title
            var title = $('h1').text();
            if (!title)
                title = $('title').text();
            var description = $('meta[name="description"]').attr('content');
            product = new Product(link.trim(), title.trim(), description.trim());
        }
    });
    console.log(product);
    return product;
}
And I don't understand why, when I do console.log(product) outside of the request call, I get undefined, but inside it I can see my product.
I have learned a lot about scopes in JavaScript and I still don't understand this, because I defined product at the top of the function.
I need to return this variable to use it in another function; if I do the return inside request I of course get undefined, so I need to do it outside...
Thank you
JavaScript does not run code like C or PHP, where you can be sure that the next line only runs when the previous one has finished. In your case request is an asynchronous function, so the two lines
console.log(product);
return product;
almost always run before your request callback has fired. In that case you cannot just return a value from your parseLink function. You have two possibilities here:
use promises:
https://developer.mozilla.org/en/docs/Web/JavaScript/Reference/Global_Objects/Promise
use a callback:
like this:
function parseLink(link, callback) {
    var product;
    request(link, function (error, response, body) {
        if (!error && response.statusCode == 200) {
            var $ = cheerio.load(body);
            // title
            var title = $('h1').text();
            if (!title)
                title = $('title').text();
            var description = $('meta[name="description"]').attr('content');
            product = new Product(link.trim(), title.trim(), description.trim());
            callback(product);
        }
    });
}
and you run the code like
parseLink('http://...', function(product) { /* do something with the product */ });
PS: the use of callbacks is a lot easier in my opinion, but in some cases you need to separate the scope, for example if you run it in a for loop.
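A minimal sketch of that scope separation with var in a for loop (wrapping the call in an immediately invoked function so each iteration keeps its own link):
for (var i = 0; i < links.length; i++) {
    (function (link) {                      // capture the current link
        parseLink(link, function (product) {
            console.log(link, product);     // link refers to this iteration's value
        });
    })(links[i]);
}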
request is an asynchronous call, so its callback gets pushed onto the event queue and only runs once the current call stack has finished. console.log prints undefined because that is the default value of unassigned variables.
You must use callbacks or promises if you need to return the value from the asynchronous call. Here's an example using a Promise:
function parseLink(link) {
    return new Promise((resolve, reject) => {
        request(link, function (error, response, body) {
            if (error) return reject(error);
            if (response.statusCode !== 200) {
                return reject(new Error('Not OK'));
            }
            var $ = cheerio.load(body);
            var title = $('h1').text() || $('title').text();
            var description = $('meta[name="description"]').attr('content');
            var product = new Product(link.trim(), title.trim(), description.trim());
            resolve(product);
        });
    });
}

parseLink('http://example.com')
    .then(product => {
        console.log(product);
    })
    .catch(error => {
        console.error(error);
    });
in the "LearnYouNode" excercise i have following question to the task 9. In the hints, there is mentioned, that this excercise could also be done with the packages "asycn" or "after". I tried it with both and both failed :|
(i have already done the recommended solution with no additional package)
Here are my miserable attempts:
Shared code of both variants:
var http = require("http");
var bl = require("bl");
var after = require("after");
var async = require("async");
var results = [];
//getting the 3 arguments
var urls = []
for(var i = 2; i < process.argv.length; i++){
urls.push(process.argv[i]);
}
function printResults(){
for(var i = 0; i < results.length; i++){
console.log(results[i]);
}
}
"after" attempt:
// I understood it so that printResults is called after next has been called 3 times; I get no output at all
var next = after(3, printResults);
for (i = 0; i < urls.length; i++) {
    next(i);
}

var next = function (i) {
    http.get(urls[i], response => {
        response.setEncoding('utf8');
        var singleString = "";
        response.on("data", data => {
            singleString += data;
        }).on("end", function () {
            results.push(singleString);
        });
        response.on("error", err => {
            return console.log(err);
        });
    });
}
"asycn" attempt:
// I get output but in the wrong order
async.map(urls, function (url, printResults) {
    http.get(url, response => {
        response.setEncoding('utf8');
        var singleString = "";
        response.on("data", data => {
            singleString += data;
        }).on("end", function () {
            console.log(singleString);
        });
        response.on("error", err => {
            console.log(err);
        });
    });
}, function (err) {
    console.log(err);
});
I really donĀ“t get what i am doing wrong. Thank you very much for your help. Kind regards,
SirSandmann
It's all about syntax; you should read the docs more carefully.
For async:
var http = require('http');
var urls = process.argv.slice(2);
var async = require('async');

// the second parameter is a callback provided by async
// it allows async to know when your async task is done
async.map(urls, (url, next) => {
    http.get(url, response => {
        var str = '';
        response
            .on('data', data => str += data)
            .on('end', () => next(null, str)); // we use the callback here to give the data back
    });
}, (err, results) => {
    // the second callback is not an error handler
    // it's called when all async jobs are done
    // so the logging of the results goes here
    results.forEach(res => console.log(res));
});
For the after solution, you declared next() twice and you don't use the original next() in your http.get():
var http = require('http');
var urls = process.argv.slice(2);
var after = require('after');

var results = [];

function printResults() {
    results.forEach(res => console.log(res));
}

// when you call next(), it tells after 'hey, my async job is done'
var next = after(3, printResults);

urls.forEach((url, i) => {
    http.get(url, response => {
        var str = '';
        response
            .on('data', data => str += data)
            .on('end', () => {
                // after is very "dumb", you have to store your result yourself at the right index
                results[i] = str;
                // this is very important: 'hey, my async job is done'
                next();
            });
    });
});
In both solutions you may have noticed there is a callback (I called it next in both). In asynchronous JavaScript, a callback is the only way to signal that an asynchronous job is done. That's why it's important in both examples to call the provided callback.
I am attempting to build a web scraper using Node.js that searches a website's HTML for images, caches the image source URLs, then searches for the one with the largest file size.
The problem I am having is that deliverLargestImage() fires before the array of image source URLs has been looped through to get the file sizes. I am attempting to use both async.series and async.each to make this work properly.
How do I force deliverLargestImage() to wait until the async.each inside getFileSizes() is finished?
JS
var async, request, cheerio, gm;

async = require('async');
request = require('request');
cheerio = require('cheerio');
gm = require('gm').subClass({ imageMagick: true });

function imageScraper() {
    var imgSources, largestImage;

    imgSources = [];
    largestImage = {
        url: '',
        size: 0
    };

    async.series([
        function getImageUrls(callback) {
            request('http://www.example.com/', function (error, response, html) {
                if (!error && response.statusCode === 200) {
                    var $ = cheerio.load(html);
                    $('img').each(function (i, elem) {
                        if ($(this).attr('src').indexOf('http://') > -1) {
                            var src = $(this).attr('src');
                            imgSources.push(src);
                        }
                    });
                }
                callback();
            });
        },
        function getFileSizes(callback) {
            async.each(imgSources, function (img, _callback) {
                gm(img).filesize(function (err, value) {
                    checkSize(img, value);
                    _callback();
                });
            });
            callback();
        },
        function deliverLargestImage(callback) {
            callback();
            return largestImage;
        }
    ]);

    function checkSize(imgUrl, value) {
        var r, raw;
        if (value !== undefined) {
            r = /\d+/;
            raw = value.match(r)[0];
            if (raw >= largestImage.size) {
                largestImage.url = imgUrl;
                largestImage.size = raw;
            }
        }
    }
}
imageScraper();
Try moving the callback() here:
function getFileSizes(callback) {
    async.each(imgSources, function (img, _callback) {
        gm(img).filesize(function (err, value) {
            checkSize(img, value);
            _callback();
        });
    }, function (err) { callback(err); }); /* <-- put it here */
    /* callback(); <-- wrong here */
},
each accepts a callback as a third parameter that gets executed when the inner loop over each element is finished:
Arguments
arr - An array to iterate over.
iterator(item, callback) - A function to apply to each item in arr. The iterator is passed a callback(err) which must be called once it has completed. If no error has occurred, the callback should be run without arguments or with an explicit null argument.
callback(err) - A callback which is called when all iterator functions have finished, or an error occurs.
I want to call a function after an asynchronous for loop iterating through the values of a JavaScript object finishes executing. I have the following code:
for (course in courses) {
    var url = '...' + courses[course];
    request(url, (function (course) {
        return function (err, resp, body) {
            $ = cheerio.load(body);
            // Some code for which I use object values
        };
    })(course));
}
This can be done in vanilla JS, but I recommend the async module, which is the most popular library for handling async code in Node.js. For example, with async.each:
var async = require('async');

var courseIds = Object.keys(courses);

// Function for handling each course.
function perCourse(courseId, callback) {
    var course = courses[courseId];
    // do something with each course.
    callback();
}

async.each(courseIds, perCourse, function (err) {
    // Executed after each course has been processed.
});
If you want to use a result from each iteration, then async.map is similar, but passes an array of results to the second argument of the callback.
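A minimal sketch of that async.map variant, reusing the perCourse shape from above but passing a result to the callback (the result object here is purely illustrative):
function perCourseResult(courseId, callback) {
    var course = courses[courseId];
    // first argument is the error (null on success), second is this item's result
    callback(null, { id: courseId, name: course });
}

async.map(courseIds, perCourseResult, function (err, resultsArray) {
    // resultsArray holds one result per course, in the same order as courseIds
    console.log(resultsArray);
});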
If you prefer vanilla JS, then this will work in place of async.each:
function each(list, func, callback) {
    // Avoid emptying the original list.
    var listCopy = list.slice(0);

    // Consumes the list one element at a time from the left.
    // If you are concerned with the overhead of using shift,
    // you can accomplish the same with an iterator.
    function doOne(err) {
        if (err) {
            return callback(err);
        }
        if (listCopy.length === 0) {
            return callback();
        }
        var thisElem = listCopy.shift();
        func(thisElem, doOne);
    }

    doOne();
}
(taken from a gist I wrote a while back)
I strongly suggest that you use the async library, however. Asynchronous code is fiddly to write, and functions like async.auto are brilliant.
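As a rough sketch of async.auto (assuming async 2.x or later, where dependent tasks receive (results, callback); the task names here are made up for illustration):
var async = require('async');

async.auto({
    // runs immediately, no dependencies
    getCourseIds: function (callback) {
        callback(null, Object.keys(courses));
    },
    // runs only after getCourseIds has finished
    processCourses: ['getCourseIds', function (results, callback) {
        async.each(results.getCourseIds, perCourse, callback);
    }]
}, function (err, results) {
    // called when every task is done (or an error occurred)
    console.log(err || 'all courses processed');
});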
A possible simple JS solution would be to do something like this.
var courses = {
    lorum: 'fee',
    ipsum: 'fy',
    selum: 'foe'
};

var keys = Object.keys(courses);
var waiting = keys.length;

function completedAll() {
    console.log('completed all');
}

function callOnCourseComplete(course, func) {
    console.log('completed', course);
    waiting -= 1;
    if (!waiting) {
        func();
    }
}

var delay = 10000;

keys.forEach(function (course) {
    var url = '...' + courses[course];
    console.log('request', url);
    setTimeout((function (closureCourse) {
        return function ( /* err, resp, body */ ) {
            // Some code for which I use object values
            callOnCourseComplete(closureCourse, completedAll);
        };
    }(course)), (delay /= 2));
});
Update: Probably a better Javascript solution would be to use Promises
const courses = {
    lorum: 'fee',
    ipsum: 'fy',
    selum: 'foe',
};

function completedAll() {
    console.log('completed all');
}

function callOnCourseComplete(courseName) {
    console.log('completed', courseName);
}

let delay = 10000;

const arrayOfPromises = Object.keys(courses).map(courseName => (
    new Promise((resolve, reject) => {
        const url = `...${courses[courseName]}`;
        console.log('request', url);
        setTimeout((err, resp, body) => {
            if (err) {
                reject(err);
            }
            // Some code for which I use object values
            resolve(courseName);
        }, (delay /= 2));
    }))
    .then(callOnCourseComplete));

Promise.all(arrayOfPromises)
    .then(completedAll)
    .catch(console.error);