async.series and async.each not working as expected - javascript

I am attempting to build a web scraper using nodeJS that searches a website's HTML for images, caches the image source URLs, then searches for the one with largest size.
The problem I am having is deliverLargestImage() is firing before the array of image source URLs is looped through to get their file sizes. I am attempting to use both async.series and async.each to have this work properly.
How do I force deliverLargestImage() to wait until the async.each inside getFileSizes() is finished?
JS
var async, request, cheerio, gm;
async = require('async');
request = require('request');
cheerio = require('cheerio');
gm = require('gm').subClass({ imageMagick: true });
/**
 * Question's original scraper: collects absolute image URLs from a page, asks
 * gm for each image's file size, and tracks the largest one in `largestImage`.
 *
 * NOTE(review): kept as posted because it reproduces the problem being asked about.
 * The `callback()` in getFileSizes runs before any filesize result has arrived.
 */
function imageScraper () {
var imgSources, largestImage;
imgSources = [];
largestImage = {
url: '',
size: 0
};
async.series([
// Step 1: fetch the page and collect every <img> src containing 'http://'.
function getImageUrls (callback) {
request('http://www.example.com/', function (error, response, html) {
if (!error && response.statusCode === 200) {
var $ = cheerio.load(html);
$('img').each(function (i, elem) {
// NOTE(review): attr('src') is dereferenced directly; an <img> without src would presumably throw here - confirm.
if ( $(this).attr('src').indexOf('http://') > -1 ) {
var src = $(this).attr('src');
imgSources.push(src);
}
});
}
// Called with no arguments on both success and failure, so the series always continues.
callback();
});
},
// Step 2: request the file size of every collected URL.
function getFileSizes (callback) {
// No completion callback is passed to async.each here.
async.each(imgSources, function (img, _callback) {
gm(img).filesize(function (err, value) {
checkSize(img, value);
_callback();
});
});
// BUG (subject of the question): this runs immediately after the iteration is
// started, not after the filesize callbacks have finished.
callback();
},
// Step 3: intended to hand back the result.
function deliverLargestImage (callback) {
callback();
// This return value goes to async.series' internals, not to imageScraper's caller.
return largestImage;
}
]);
// Updates `largestImage` when `value` reports a size at least as large as the current one.
function checkSize (imgUrl, value) {
var r, raw;
if (value !== undefined) {
r = /\d+/;
// `raw` is the first run of digits in `value`, as a string.
raw = value.match(r)[0];
// NOTE(review): after the first assignment both operands are strings, so this
// comparison is lexicographic ('9' >= '10' is true).
if (raw >= largestImage.size) {
largestImage.url = imgUrl;
largestImage.size = raw;
}
}
}
}
// Kick off the scrape; nothing is done with a result here.
imageScraper();

Try moving the callback() here:
// Corrected series step (an element of the array passed to async.series, hence the
// trailing comma): the step's `callback` is now invoked from async.each's completion
// callback instead of right after the iteration is started.
function getFileSizes (callback) {
async.each(imgSources, function (img, _callback) {
gm(img).filesize(function (err, value) {
checkSize(img, value);
// `err` from filesize is not forwarded; _callback is always called without arguments.
_callback();
});
}, function(err){ callback(err); }); /* <-- put here */
/* callback(); <-- wrong here */
},
each accepts a callback as a third parameter that gets executed when the inner loop over each element is finished:
Arguments
arr - An array to iterate over.
iterator(item, callback) - A function to apply to each item in arr.
The iterator is passed a callback(err) which must be called once it has
completed. If no error has occurred, the callback should be run without
arguments or with an explicit null argument.
callback(err) - A callback which is called when all iterator functions
have finished, or an error occurs.

Related

Get the value from callback function and set to parent function

I am new to Node.js and JavaScript and am working on Node.js API code. I am using the Gandi API to check domain availability (a project-related requirement) and have created a GET request handler (checkDomainAvaliablity) like this.
// Question's first version of the route handler: it starts the Gandi
// 'domain.available' call but never touches `res`, so no response is sent from here.
// NOTE(review): `gandiApi`, `gandiapikey`, `domain` and `callback` are not defined in
// this snippet; presumably they live at module level - confirm.
exports.checkDomainAvaliablity = function (req, res) {
gandiApi.methodCall('domain.available', [gandiapikey, [domain]], callback)
};
And I have a callback function which takes 2 parameters (which I cannot change).
I am able to get the value successfully in my callback function.
Now I want to return "value" from the callback and set it on the "res" parameter of the checkDomainAvaliablity function (the parent function), something like res.json(task).
// (error, value) callback handed to gandiApi.methodCall.
// While value[domain] is 'pending' it re-issues the same call after 700 ms with itself
// as the callback; otherwise it only dumps the value to the console.
// NOTE(review): `error` is never inspected, and `res` is not in scope here, which is
// why the asker cannot send the response from this function.
var callback = function (error, value) {
console.dir(value)
if (value[domain] == 'pending') {
console.log('result is not yet ready')
setTimeout(function () {
gandiApi.methodCall('domain.available', [gandiapikey, [domain]],
callback)
}, 700)
}
else {
console.dir(value)
}
// I want to return "value" from here and set it on the "res" parameter of the checkDomainAvaliablity function (the parent function).
}
Note: Use of the callback function is necessary.
Thanks #trincot. Putting the callback function inside the parent function works fine.
exports.checkDomainAvaliablity = function (req, res) {
domain = req.params.domainAvaliablity
var callback = function (error, value) {
console.log("logging" + value + error)
if (value[domain] == 'pending') {
console.log('result is not yet ready')
setTimeout(function () {
gandiApi.methodCall('domain.available', [gandiapikey, [domain]],
callback)
}, 700)
}
else {
res.send(value);
console.dir(value)
}
}
gandiApi.methodCall('domain.available', [gandiapikey, [domain]], callback)
};

Know when my function is over

This is my functions
/**
 * Question's original version: requests every link, builds a Product from each
 * successful response, and passes the `products` array to `callback`.
 *
 * NOTE(review): kept as posted because it reproduces the problem being asked about.
 * `callback(products)` runs right after the loop has *started* the requests, so
 * `products` is still empty at that point.
 */
function parseLinks(links, callback) {
var products = [];
for (var i = 0; i < links.length; i++) {
request(links[i], function (error, response, body) {
var product;
if (!error && response.statusCode == 200) {
var $ = cheerio.load(body);
// title: prefer the <h1> text, fall back to <title>
var title = $('h1').text();
if (!title)
var title = $('title').text();
var description = $('meta[name="description"]').attr('content');
// NOTE(review): every product is built from links[0], not from the link that was requested.
product = new Product(links[0].trim(), title.trim(), description.trim());
products.push(product);
}
});
}
callback(products) // the callback only does a console.log(products)
}
After that, I want to do a console.log(products) that displays all the products.
So I set up a callback passed to parseLinks and call it after the for loop. The problem is that inside my for loop I call the asynchronous function request each time, so my callback is called before all of the request calls have finished, and my console.log(products) prints an empty array.
Do you know how to fix that?
Thanks
You have to check if all the asynchronous calls have finished. Create an inner function that calls callback when all asynchronous work is done:
/**
 * Requests every link and invokes `callback(products)` exactly once, after the
 * last request has completed.
 *
 * @param {string[]} links - URLs to request.
 * @param {function(Array)} callback - Receives the collected products.
 */
function parseLinks(links, callback) {
  var products = [],
      numberOfItems = links.length; // number of links still to be parsed

  // Called each time a link has been handled; fires the callback after the last one.
  function checkIfDone() {
    numberOfItems--;
    if (numberOfItems === 0) {
      callback(products);
    }
  }

  // With no links there is no request to trigger checkIfDone, so the callback
  // would otherwise never be called.
  if (numberOfItems === 0) {
    callback(products);
    return;
  }

  for (var i = 0; i < links.length; i++) {
    request(links[i], function (error, response, body) {
      // ...
      checkIfDone(); // every time a link is parsed, call checkIfDone
    });
  }
}
You can embed the logic of checkIfDone inside the function request directly. I used a separate function for clarity.
The best way is to use async.
var async = require("async");
/**
 * Requests every link via async.forEach, builds a Product from each successful
 * (HTTP 200) response, and passes all products to `callback` when every request
 * has finished.
 *
 * @param {string[]} links - URLs to request.
 * @param {function(Array)} callback - Receives the collected products.
 */
function parseLinks(links, callback) {
  var products = [];
  async.forEach(links, function (link, done) {
    request(link, function (error, response, body) {
      if (!error && response.statusCode === 200) {
        var $ = cheerio.load(body);
        // Prefer the <h1> text, fall back to <title>.
        var title = $('h1').text() || $('title').text();
        // A page without a description meta tag yields undefined; default to ''
        // so .trim() cannot throw and prevent done() from being called.
        var description = $('meta[name="description"]').attr('content') || '';
        // Use the link being processed, not links[0].
        products.push(new Product(link.trim(), title.trim(), description.trim()));
      }
      done();
    });
  }, function () {
    callback(products);
  });
}
You can use async.each from the async module.
Simplified code:
/**
 * Simplified async.each variant: requests each link and calls
 * `callback(products)` once every request has ended.
 *
 * @param {string[]} links - URLs to request.
 * @param {function(Array)} callback - Receives the collected products.
 */
function parseLinks(links, callback) {
  var products = [];
  async.each(links, function (link, requestCallback) {
    // async.each supplies the current item as `link`; there is no loop index here.
    request(link, function (error, response, body) {
      //... rest of your code
      requestCallback(); //Request has ended
    });
  }, function (err) {
    //All requests ended! Hand the results back to the caller.
    callback(products);
  });
}

Use callback with javascript

I am creating a project using JavaScript and Node.js. I am calling a callback from inside a for loop, depending on a condition, but I am unable to get it to work. My problem is that the callback is completed on the first iteration of the loop. Here is my code:
/**
 * Question's original migration job: reads topology rows, and for every stage
 * object whose endPointId passes isGuid() it POSTs the row's topology tag.
 *
 * NOTE(review): kept as posted because it reproduces the problem being asked about.
 * `callback` is invoked from every request's completion handler, so it fires as
 * soon as the first request finishes (and again for each later one).
 */
function tagEndpointNames(callback) {
var data = userGenerateToken();
var sql = "SELECT * FROM topology_data WHERE topology_coordinates !='' and topology_uuid is not null"
// NOTE(review): `err` from the query is not checked before iterating the rows.
var query = conn.query(sql, function(err, tagEndpointNames) {
for (var i = 0; i < tagEndpointNames.length; i++) {
// Payload sent for every matching stage object of this row.
var topologytagData = {
"topology_tag": tagEndpointNames[i].topology_uuid
}
var tpCooridinates = JSON.parse(tagEndpointNames[i].topology_coordinates);
for (var j = 0; j < tpCooridinates.stageObjects.length; j++) {
// Always true inside this loop body (the loop only runs when length > 0).
if (tpCooridinates.stageObjects.length) {
if (tpCooridinates.stageObjects[j].endPointId) {
if (isGuid(tpCooridinates.stageObjects[j].endPointId)) {
// Assigned but not used in the code shown.
var endPointUUID = tpCooridinates.stageObjects[j].endPointId;
var _ro = require('request');
// NOTE(review): self-assignment of a function-scoped var; `url` is undefined
// unless it was set elsewhere - the real endpoint appears to have been elided.
var url = url;
var _d = '';
_ro({
url: url,
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': 'Bearer ' + data['access_token']
},
json: topologytagData
}, function(_e, _r, _b) {
// NOTE(review): `_r.statusCode` is read before `_e` is checked.
if (_r.statusCode == 200 && !_e) {
callback()
//return;
} else {
callback()
console.log("andarss")
return;
}
})
}
}
}
}
}
})
}
Here is the function call:
// Runs the migration job; the callback logs a message, closes the server and then
// exits the process with code 0.
tagEndpointNames(function(){
console.log ('Server Closed during MIGRATION JOB 4');
server.close(function () {
process.exit(0);
});
})
When you run an asynchronous process with a callback inside a for loop, remember that the caller's callback will be fired by the first request that completes inside the loop. In your case the request library call is an asynchronous process inside a for loop, so you need to wait for the callbacks from all of the request calls before you invoke the caller's callback.
Please read:
How to write asynchronous functions for Node.js
Maybe it's time for you to start using Javascript Promise.
The async library for Node will help you for doing this kind of tasks.
Use async.waterfall. It runs an array of functions in series, each passing its results to the next in the array. However, if any of the functions passes an error to the callback, the next function is not executed and the main callback is immediately called with the error.
js
/**
 * Runs the three steps in series with async.waterfall and reports the outcome.
 *
 * @param {object} req - Request whose body seeds the first step.
 * @param {object} res - Unused in this example.
 */
var create = function (req, res) {
  async.waterfall([
    _function1(req),
    _function2,
    _function3
  ], function (error, success) {
    if (error) {
      // Return here so a failure does not also report 'Done!'.
      return alert('Something is wrong!');
    }
    return alert('Done!');
  });
};
/**
 * Builds the first waterfall task: a function that passes `req.body` to the
 * next step.
 *
 * @param {object} req - Request object captured by the returned task.
 * @returns {function(function)} Task taking the waterfall callback.
 */
function _function1 (req) {
  const firstTask = (callback) => {
    // No error; forward the request body as the first result.
    callback(null, req.body);
  };
  return firstTask;
}
/**
 * Second waterfall step. async.waterfall calls it as (previousResult, callback),
 * so it must invoke `callback` itself rather than return another function.
 *
 * @param {*} something - Result of the previous step.
 * @param {function(?Error, *)} callback - Waterfall continuation.
 */
function _function2 (something, callback) {
  // Block comment, so the closing brace is not swallowed by a line comment.
  var somethingelse = function () { /* do something here */ };
  // Pass null for "no error"; pass an Error instead to stop the waterfall.
  callback(null, somethingelse);
}
/**
 * Third waterfall step. async.waterfall calls it as (previousResult, callback),
 * so it must invoke `callback` itself rather than return another function.
 *
 * @param {*} something - Result of the previous step.
 * @param {function(?Error, *)} callback - Waterfall continuation.
 */
function _function3 (something, callback) {
  // Block comment, so the closing brace is not swallowed by a line comment.
  var somethingmore = function () { /* do something here */ };
  // Pass null for "no error"; pass an Error instead to stop the waterfall.
  callback(null, somethingmore);
}
Reference

Asynchronous Calls and Recursion with Node.js

I'm looking to execute a callback upon the full completion of a recursive function that can go on for an undetermined amount of time. I'm struggling with async issues and was hoping to get some help here. The code, using the request module, is as follows:
// Question's entry point: fetches the starting id, then hands it and the final
// callback to recurse().
// NOTE(review): `error` is not checked before JSON.parse(body).
var start = function(callback) {
request.get({
url: 'aaa.com'
}, function (error, response, body) {
var startingPlace = JSON.parse(body).id;
recurse(startingPlace, callback);
});
};
// Question's recursive crawler: requests 'bbb', then 'ccc', then one 'ddd' request
// per element of `stuff`, recursing on each resulting place.
// NOTE(review): kept as posted because it reproduces the problem being asked about.
// `callback()` at the bottom runs as soon as the first request has been *issued*,
// before any response arrives.
var recurse = function(startingPlace, callback) {
request.get({
url: 'bbb'
}, function(error, response, body) {
// store body somewhere outside these functions
// make second request
request.get({
url: 'ccc'
}, function(error, response, body) {
var anArray = JSON.parse(body).stuff;
if (anArray) {
anArray.forEach(function(thing) {
request.get({
url: 'ddd'
}, function(error, response, body) {
var nextPlace = JSON.parse(body).place;
// The nested call gets no callback, so it would call `undefined` at its end.
recurse(nextPlace);
});
})
}
});
});
callback();
};
// Starts the crawl; the callback is meant to run only after all recursion has finished.
start(function() {
// calls final function to print out results from storage that gets updated each recursive call
finalFunction();
});
It seems that once my code goes past the for loop in the nested requests, it continues out of the request and ends the initial function call while the recursive calls are still going on. I want it to not finish the highest-level iteration until all the nested recursive calls have completed (which I have no way of knowing how many there are).
Any help is GREATLY appreciated!
In your example you have no recursive calls. If I understand correctly you want to say that recurse(point, otherFunc); is the beginning of a recursive call.
Then just go back to the definition of the recursive call (which you have not shown in your post) and do this (add a third argument for a callback function to be called in the end of recursion; the caller will pass it as a parameter):
// Skeleton of a recursive function that takes a completion callback as its third
// argument and calls it when the terminating condition is met.
// `your_terminating_criterion` and `val` are placeholders to be supplied by the reader.
function recurse(startingPlace, otherFunc, callback_one) {
// code you may have ...
if (your_terminating_criterion === true) {
return callback_one(val); // where val is potentially some value you want to return (or a json object with results)
}
// more code you may have
}
Then in the original code that you posted, make this call instead (in the inner-most part):
// Call site: the third argument runs when the recursion reports completion, and only
// then is the outer `callback` invoked.
recurse(startingPlace, otherFunc, function (results) {
// results is now a variable with the data returned at the end of recursion
console.log ("Recursion finished with results " + results);
callback(); // the callback that you wanted to call right from the beginning
});
Just spend some time and try to understand my explanation. When you understand, then you will know node. This is the node philosophy in one post. I hope it is clear. Your very first example should look like this:
/**
 * Fetches the starting id from 'aaa.com', runs the recursion from it, and calls
 * `callback` once the recursion reports its results.
 *
 * @param {function()} callback - Invoked after the recursion has finished.
 */
var start = function (callback) {
  // Runs when recurse() signals completion.
  function onRecursionFinished(results) {
    console.log ("Recursion finished with results " + results);
    callback();
  }

  // Runs when the initial request returns.
  function onInitialResponse(error, response, body) {
    var parsed = JSON.parse(body);
    recurse(parsed.id, otherFunc, onRecursionFinished);
  }

  request.get({ url: 'aaa.com' }, onInitialResponse);
};
Below is only additional information in case you are interested. Otherwise you are set with the above.
Typically in node.js though, people return an error value as well, so that the caller knows if the function that was called has finished successfully. There is no big mystery here. Instead of returning just results people make a call of the form
return callback_one(null, val);
Then in the other function you can have:
// Error-first variant of the call site: the completion callback receives
// (recError, results) and the outer `callback` is invoked exactly once either way.
recurse(startingPlace, otherFunc, function (recError, results) {
  // Must match the parameter name; checking a differently spelled identifier
  // throws a ReferenceError.
  if (recError) {
    // treat the error from recursion
    return callback(); // important: use return, otherwise you will keep on executing whatever is there after the if part when the callback ends ;)
  }
  // No problems/errors
  console.log ("Recursion finished with results " + results);
  callback(); // writing down `return callback();` is not a bad habit when you want to stop execution there and actually call the callback()
});
Update with my suggestion
This is my suggestion for the recursive function, but before that, it looks like you need to define your own get:
/**
 * Performs one GET and reports the `place` field of the JSON response in
 * error-first style, as async.parallel expects.
 *
 * @param {object} a - Options passed straight to request.get.
 * @param {function(?Error, *=)} callback - Called once with (error) or (null, nextPlace).
 */
function myGet (a, callback) {
  request.get(a, function (error, response, body) {
    // Surface transport errors instead of trying to parse an absent body.
    if (error) {
      return callback(error);
    }
    var nextPlace;
    try {
      nextPlace = JSON.parse(body).place;
    } catch (parseError) {
      // Malformed JSON is reported through the callback, not thrown from the handler.
      return callback(parseError);
    }
    return callback(null, nextPlace); // null for no errors, and return the nextPlace to async
  });
}
/**
 * Suggested recursive crawler: requests 'bbb', then 'ccc'; if the 'ccc' response has
 * a `stuff` array it runs one myGet per element through async.parallel and recurses
 * on every result, otherwise it calls `callback2(null)`.
 *
 * NOTE(review): `callback2` is passed to every recursive call, so it is invoked once
 * per branch that ends without `stuff`, not once for the whole crawl.
 * NOTE(review): error1, error2 and error3 are never inspected.
 */
var recurse = function(startingPlace, callback2) {
request.get({
url: 'bbb'
}, function(error1, response1, body1) {
// store body somewhere outside these functions
// make second request
request.get({
url: 'ccc'
}, function(error2, response2, body2) {
var anArray = JSON.parse(body2).stuff;
if (anArray) {
// The function that you want to call for each element of the array is `get`.
// So, prepare these calls, but you also need to pass different arguments
// and this is where `bind` comes into the picture and the link that I gave earlier.
var theParallelCalls = [];
for (var i = 0; i < anArray.length; i++) {
theParallelCalls.push(myGet.bind(null, {url: 'ddd'})); // Here, during the execution, parallel will pass its own callback as the second argument of `myGet` (the first is pre-bound); this is why we have callback and callback2 in the code
}
// Now perform the parallel calls:
async.parallel(theParallelCalls, function (error3, results) {
// All the parallel calls have returned
for (var i = 0; i < results.length; i++) {
var nextPlace = results[i];
recurse(nextPlace, callback2);
}
});
} else {
// Terminating case for this branch of the recursion.
return callback2(null);
}
});
});
};
Note that I assume that the get request for 'bbb' is always followed by a get request for 'ccc'. In other words, you have not hidden a return point for the recursive calls where you have the comments.
Typically when you write a recursive function it will do something and then either call itself or return.
You need to define callback in the scope of the recursive function (i.e. recurse instead of start), and you need to call it at the point where you would normally return.
So, a hypothetical example would look something like:
/**
 * Hypothetical example: requests pages one after another and calls
 * `callback(page)` with the number of the last page.
 *
 * NOTE(review): the `data`/`success` option shape is the example's own
 * illustration, not necessarily the real `request` API - confirm before reuse.
 *
 * @param {function(number)} callback - Called once, at the end of the recursion.
 * @param {number} [page=1] - Page to request; callers normally omit it.
 */
function get_all_pages(callback, page) {
  page = page || 1;
  request.get({
    url: "http://example.com/getPage.php",
    // Request the current page; a constant would fetch page 1 on every recursion.
    data: { page_number: page },
    success: function (data) {
      if (data.is_last_page) {
        // We are at the end so we call the callback
        callback(page);
      } else {
        // We are not at the end so we recurse
        get_all_pages(callback, page + 1);
      }
    }
  });
}
// Final callback for the example: shows the value handed over by get_all_pages.
function show_page_count(data) {
alert(data);
}
// Start at the default first page; show_page_count runs when the last page is reached.
get_all_pages(show_page_count);
I think you might find caolan/async useful. Look especially into async.waterfall. It will allow you to pass results from a callback from another and when done, do something with the results.
Example:
// Template: step 1 fetches the starting id, step 2 receives it as `id`, and the final
// function receives either the first error or the last step's result.
async.waterfall([
function(cb) {
request.get({
url: 'aaa.com'
}, function(err, res, body) {
if(err) {
// Passing an error to cb skips the remaining steps.
return cb(err);
}
cb(null, JSON.parse(body).id);
});
},
function(id, cb) {
// do that otherFunc now
// ...
cb(); // remember to pass result here
}
], function (err, result) {
// do something with possible error and result now
});
If your recursive function is synchronous, just call the callback on the next line:
/**
 * Variant for a synchronous `recurse`: fetches the starting id, runs the recursion,
 * and calls `callback` on the very next line.
 *
 * @param {function()} callback - Invoked right after recurse() returns.
 */
var start = function (callback) {
  var options = { url: 'aaa.com' };

  function onResponse(error, response, body) {
    var parsed = JSON.parse(body);
    recurse(parsed.id, otherFunc);
    // Call output function AFTER recursion has completed
    callback();
  }

  request.get(options, onResponse);
};
Else you need to keep a reference to the callback in your recursive function.
Pass the callback as an argument to the function and call it whenever it is finished.
/**
 * Variant for an asynchronous `recurse`: fetches the starting id and hands
 * `callback` to recurse(), which is responsible for calling it when finished.
 *
 * @param {function()} callback - Forwarded to recurse() as its third argument.
 */
var start = function (callback) {
  var options = { url: 'aaa.com' };

  function onResponse(error, response, body) {
    var parsed = JSON.parse(body);
    recurse(parsed.id, otherFunc, callback);
  }

  request.get(options, onResponse);
};
Build your code from this example:
/**
 * One unit of work; reports completion through an error-first callback.
 * The name must be `update`, because that is what doUpdate() calls.
 *
 * NOTE(review): with this synchronous placeholder body, doUpdate -> update ->
 * updateDone -> doUpdate never unwinds; real work is expected to be asynchronous.
 *
 * @param {function(?Error)} callback - Called with null on success.
 */
var update = function (callback) {
  //Do stuff
  callback(null);
};

// Starts one update and routes its completion to updateDone.
function doUpdate() {
  update(updateDone);
}

// Completion handler: rethrows a failure, otherwise schedules the next update.
function updateDone(err) {
  if (err) {
    throw err;
  }
  doUpdate();
}
doUpdate();
With ES6, 'es6-deferred' and 'q', you could try the following:
var Q = require('q');
var Deferred = require('es6-deferred');
/**
 * Recursively processes `id` and everything it leads to; the returned promise
 * settles once all sub-processing has finished.
 *
 * NOTE(review): this module-level name shadows Node's global `process` in this file.
 *
 * @param {string} id - Identifier to process.
 * @returns {Promise} Resolves when every child has resolved, rejects on the first failure.
 */
const process = (id) => {
  var request = new Deferred();
  // Placeholder: replace [] with the child ids derived from `id`.
  const ids = []; // do something and get the data
  const subPromises = ids.map(childId => process(childId));
  Q.all(subPromises).then(function () {
    request.resolve();
  })
  .catch(error => {
    console.log(error);
    // Settle the deferred on failure too; otherwise callers wait forever.
    request.reject(error);
  });
  return request.promise
}
// Entry point: logs "done" once the promise returned for the root id resolves.
process("testId").then(() => {
console.log("done");
});

How to call a function after an asynchronous for loop of Object values finished executing

I want to call a function after an asynchronous for loop iterating through the values of a JavaScript object finishes executing. I have the following code:
// Question's loop: fires one request per key of `courses`. The immediately-invoked
// function captures the current key so each response handler sees its own `course`.
// Nothing here signals when the last response has been handled, which is what the
// question asks for.
// NOTE(review): `course` and `$` are assigned without a declaration in this snippet.
for (course in courses) {
var url = '...' + courses[course];
request(url, (function (course) {
return function (err, resp, body) {
$ = cheerio.load(body);
//Some code for which I use object values
};
})(course));
}
This can be done in vanilla JS, but I recommend the async module, which is the most popular library for handling async code in Node.js. For example, with async.each:
var async = require('async');
// async.each iterates arrays, so iterate over the object's keys.
var courseIds = Object.keys(courses);
// Function for handling each course.
// `callback` must be called once the work for this course is finished.
function perCourse(courseId, callback) {
var course = courses[courseId];
// do something with each course.
callback();
}
async.each(courseIds, perCourse, function (err) {
// Executed after each course has been processed.
});
If you want to use a result from each iteration, then async.map is similar, but passes an array of results to the second argument of the callback.
If you prefer vanilla JS, then this will work in place of async.each:
/**
 * Applies `func` to the elements of `list` one at a time, in order, and calls
 * `callback` when all are done or as soon as one reports an error.
 *
 * @param {Array} list - Items to process; never modified.
 * @param {function(*, function(*=))} func - Worker; calls its second argument
 *   when finished, with an error value to abort.
 * @param {function(*=)} callback - Called with the error, or with no arguments on success.
 */
function each(list, func, callback) {
  // Work on a snapshot so later changes to `list` do not affect the iteration.
  var snapshot = list.slice(0);
  var position = 0;

  // Advances to the next element; also serves as the worker's completion callback.
  function step(err) {
    if (err) {
      return callback(err);
    }
    if (position >= snapshot.length) {
      return callback();
    }
    var current = snapshot[position];
    position += 1;
    func(current, step);
  }

  step();
}
(taken from a gist I wrote a while back)
I strongly suggest that you use the async library however. Async is fiddly to write, and functions like async.auto are brilliant.
A possible simple JS solution would be to do something like this.
// Sample data: course key -> value appended to the request URL.
var courses = { lorum: 'fee', ipsum: 'fy', selum: 'foe' };

var keys = Object.keys(courses);

// Number of courses whose work has not completed yet.
var waiting = keys.length;

// Runs once, after the last course has completed.
function completedAll() {
  console.log('completed all');
}

// Records that `course` has completed and calls `func` when none are left.
function callOnCourseComplete(course, func) {
  console.log('completed', course);
  waiting = waiting - 1;
  if (waiting !== 0) {
    return;
  }
  func();
}
// Simulates one asynchronous request per course with setTimeout; the delay is
// halved for every course.
var delay = 10000;
keys.forEach(function (course) {
  var url = '...' + courses[course];
  console.log('request', url);
  delay = delay / 2;
  // forEach gives each iteration its own `course`, so the handler can use it directly.
  var onResponse = function (/* err, resp, body */) {
    // Some code for which I use object values
    callOnCourseComplete(course, completedAll);
  };
  setTimeout(onResponse, delay);
});
Update: Probably a better Javascript solution would be to use Promises
// Promise-based variant: one promise per course, with Promise.all signalling
// when every course has completed.
const courses = {
  lorum: 'fee',
  ipsum: 'fy',
  selum: 'foe',
};

// Runs once, after every course promise has resolved.
function completedAll() {
  console.log('completed all');
}

// Runs for each course as soon as its own promise resolves.
function callOnCourseComplete(courseName) {
  console.log('completed', courseName);
}

let delay = 10000;

const arrayOfPromises = Object.keys(courses).map(courseName => (
  new Promise((resolve, reject) => {
    const url = `...${courses[courseName]}`;
    console.log('request', url);
    // setTimeout stands in for the real request; it never passes arguments, so
    // `err` is only meaningful once a real (err, resp, body) callback is used.
    setTimeout((err, resp, body) => {
      if (err) {
        reject(err);
        // Stop here so the success-path code below does not run after a failure.
        return;
      }
      // Some code for which I use object values
      resolve(courseName);
    }, (delay /= 2));
  }))
  .then(callOnCourseComplete));

Promise.all(arrayOfPromises)
  .then(completedAll)
  .catch(console.error);

Categories

Resources