I've made a web scraper with cheerio and request, and I'm now trying to implement a loop over an array of URLs.
Unfortunately I'm doing something wrong with my calls and callbacks, but I cannot figure out what.
This is my code :
// Question code (kept as posted): issues one request per entry of `url`,
// collects scraped objects in `data`, then writes `data` to output.json.
// NOTE(review): `request` is asynchronous, so the file is written right after
// the loop, before any request callback has had a chance to fill `data`.
var getWebData = function(url) {
var i = 1;
var data = [];
// `c` is not declared with var, so it becomes an implicit global.
for (c = 0; c < url.length; c++) {
// The return value of request() (not the scraped result) is stored in data[i].
data[i] = request(url[c], function(err, resp, body) {
console.log('ok');
if (!err) {
console.log('there');
var $ = cheerio.load(body);
// The callback parameter `i` shadows the outer counter `i`.
$('.text').each(function(i, element) {
var jsObject = { name : "", description : "", price: "", categorie: "", pricePerKg: "", capacity: "", weight: "", scrapingDate : "", url: ""};
var name = 'TESTOK';
jsObject.name = name;
data.push(jsObject);
})
// This returns from the request callback only, not from getWebData.
return data;
}
console.log('but');
});
i++;
}
// `data` is stringified here and then stringified a second time below.
var json = JSON.stringify(data);
fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err) {
console.log('File successfully written!');
})
}
getWebData(url);
app.listen('8080');
Note that none of my debug prints are printed.
Does anyone know what's wrong in my code and how I can make it work?
`request` is asynchronous.
var json = JSON.stringify(data);
fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err) {
console.log('File successfully written!');
})
The code above runs before the requests started in the for loop have completed and populated the data array.
Execute this piece of code only once the loop has finished.
First run this command: npm install async --save
var async = require('async');
/**
 * Requests every URL in `url` one after another and, once all of them have
 * been processed, writes the collected `data` array to output.json.
 *
 * @param {string[]} url - URLs to request, in order.
 */
var getWebData = function(url) {
  var data = [];
  async.eachSeries(url, function(urlSingle, cb) {
    request(urlSingle, function(err, resp, body) {
      if (err) {
        // Skip a failing URL but keep the series going.
        console.log('Request failed for ' + urlSingle + ': ' + err);
        return cb();
      }
      // write your logic here and push data in to data object
      cb();
    });
  }, function() {
    // This runs when the loop is done.
    // Stringify exactly once: stringifying the JSON text again would write a
    // quoted, escaped string instead of the array itself.
    var json = JSON.stringify(data, null, 4);
    fs.writeFile('output.json', json, function(err) {
      if (err) {
        console.log('Could not write output.json: ' + err);
        return;
      }
      console.log('File successfully written!');
    });
  });
};
I have been reading Asif's answer and the comments. That implementation is correct, but you don't have to increment the c variable; also, if you initialise c = 0 beforehand, all the requests will go to url[0].
Note that async.eachSeries passes each element of the url array to the iterator as "urlSingle", so you should use
request(urlSingle, ...
or consider using async.eachOf, which also gives you the index of each element in the array.
Check the async documentation if in doubt: http://caolan.github.io/async/
for (c = 0; c < url.length; c++) {
……
}
you should change like this:
var async = require('async');

// Request every URL and collect the objects scraped from each page.
// `results` in the final callback holds one array of objects per URL, in the
// same order as `url`.
async.map(url,
  function(item, callback) {
    // `item` is the current URL handed over by async.map.
    request(item, function(err, resp, body) {
      console.log('ok');
      if (err) {
        console.log('but');
        // Report the failure so async.map can pass it to the final callback.
        return callback(err);
      }
      console.log('there');
      var $ = cheerio.load(body);
      var pageData = [];
      $('.text').each(function(index, element) {
        var jsObject = {
          name: "",
          description: "",
          price: "",
          categorie: "",
          pricePerKg: "",
          capacity: "",
          weight: "",
          scrapingDate: "",
          url: ""
        };
        jsObject.name = 'TESTOK';
        pageData.push(jsObject);
      });
      // Signal completion exactly once, after the whole page was processed.
      callback(null, pageData);
    });
  },
  function(err, results) {
    if (err) {
      console.log(err);
    }
  });
The work done inside the loop is a time-consuming operation, so you should use an asynchronous operation.
Related
I am having trouble implementing Q promises with a recursive DynamoDB call. I am new to Node.js and Q. Because DynamoDB limits how many results a single query returns, we need to run the query recursively to get all the required results.
Normally we use the query with a Q implementation, something like this:
/**
 * Runs one DynamoDB query against the 'mytable-index' index and exposes the
 * outcome as a Q promise.
 *
 * @returns {Promise} resolves with `data.Items` of the query response and
 *   rejects with the error reported by `dynamodb.query`.
 */
function getDBResults() {
  var deferred = Q.defer();
  var queryParams = {
    TableName: mytable,
    IndexName: 'mytable-index',
    KeyConditionExpression: 'id = :id',
    FilterExpression: 'deliveryTime between :startTime and :endTime',
    ExpressionAttributeValues: {
      ':startTime': { N: startTime.toString() },
      ":endTime": { N: endTime.toString() },
      ":id": { S: id.toString() }
    },
    Select: 'ALL_ATTRIBUTES',
    ScanIndexForward: false,
  };

  function onQueryDone(queryError, response) {
    // Guard clause: settle with the failure and stop.
    if (queryError) {
      console.log('Dynamo fail ' + queryError);
      deferred.reject(queryError);
      return;
    }
    console.log('DATA'+ response);
    deferred.resolve(response.Items);
  }

  dynamodb.query(queryParams, onQueryDone);
  return deferred.promise;
}
// getDBResults is a function: call it to obtain the promise, then attach the
// fulfilment and rejection handlers.
getDBResults().then(
  function(data) {
    // handle data
  },
  function(err) {
    // handle error
  }
);
Using a recursive query I can get the results, but I need those results to be used in another function. Because of Node.js's asynchronous nature, the next function call happens before the recursive query function has finished its job. I want to collect all the results from the recursive query function, hand them to a new function as a promise, and finally handle all the data.
The recursive query for DynamoDB looks like this:
// Question code (kept as posted): starts the paginated query.
// Builds the query parameters and issues the first dynamodb.query call with
// onQueryCallBack as its callback. Nothing is returned.
function getDBResults(){
//var q = Q.defer();
// `params` is assigned without var/let/const, so it is an implicit global;
// onQueryCallBack reads and updates this same object.
params = {
TableName: mytable,
IndexName: 'mytable-index',
KeyConditionExpression: 'id = :id',
FilterExpression: 'deliveryTime between :startTime and :endTime',
ExpressionAttributeValues: {
':startTime': {
N: startTime.toString()
},
":endTime": {
N: endTime.toString()
},
":id": {
S: id.toString()
}
},
Select: 'ALL_ATTRIBUTES',
ScanIndexForward: false,
};
dynamodb.query(params, onQueryCallBack);
}
/**
 * Callback for `dynamodb.query` that follows pagination.
 *
 * Appends the items of the received page to the outer `allResults` array and,
 * while the response carries a `LastEvaluatedKey`, re-issues the query from
 * that key with this same function as callback.
 *
 * @param {*} err - error reported by the query, if any.
 * @param {Object} data - query response; `Items` and `LastEvaluatedKey` are read.
 */
function onQueryCallBack(err, data) {
  if (err) {
    console.log('Dynamo fail ' + err);
    console.error("Could not query db" + err);
    return;
  }
  // Store this page before asking for the next one so that the results keep
  // page order no matter when the next callback fires.
  data.Items.forEach(function(item) {
    allResults.push(item);
  });
  if (typeof data.LastEvaluatedKey != "undefined") {
    console.log("query for more...");
    params.ExclusiveStartKey = data.LastEvaluatedKey;
    dynamodb.query(params, onQueryCallBack);
  }
}
Now I want that I can get the results as promise finally so I can handle them in the next function like this.
// getDBResults must be called to obtain the promise; `.then` lives on the
// returned promise, not on the function itself.
getDBResults().then(
  function(data) {
    // handle data
  },
  function(err) {
    // handle error
  }
);
Please help me on this, sorry if its a stupid question but recursive calls with promises have made a hurdle for me.
Thanks
First of all, keep the promisified function you already have. Use it as a building block for the recursive solution, instead of trying to alter it!
It might need two small adjustments though:
// Sketch of the promisified single-page query. The `^^^^` markers point at
// what differs from the earlier version, and the `…` line stands for the
// remaining query parameters, so this is not runnable as written.
// It takes the key to start from and resolves with the whole response object
// rather than only its Items.
function getDBResults(startKey){
// ^^^^^^^^
var q = Q.defer();
var params = {
ExclusiveStartKey: startKey,
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
… // rest as before
};
dynamodb.query(params, function(err, data) {
if (err) {
q.reject(err);
} else {
q.resolve(data);
// ^^^^ Not `data.Items`
}
});
return q.promise;
}
Now we can use that to trivially implement the recursive solution:
/**
 * Collects the items of every result page, starting at `key`.
 *
 * Fetches one page through getDBResults; while the page reports a
 * `LastEvaluatedKey`, recurses from that key and prepends the current page's
 * items to the items of the following pages.
 *
 * @param {*} [key] - start key of the first page to fetch (omit for the first call).
 * @returns {Promise<Array>} resolves with the concatenated items of all pages.
 */
function getRecursiveDBResults(key) {
  return getDBResults(key).then(function(data) {
    if (typeof data.LastEvaluatedKey == "undefined") {
      // Last page: nothing more to fetch.
      return data.Items;
    }
    return getRecursiveDBResults(data.LastEvaluatedKey).then(function(items) {
      return data.Items.concat(items);
    });
  });
}
Here is how i solve the problem, Thanks Bergi for your solution as well
/**
 * Queries DynamoDB page by page and resolves once every page was received.
 *
 * @returns {Promise<Array>} resolves with the items of all pages in page
 *   order; rejects with the first error reported by `dynamodb.query`.
 */
function getDBResults() {
  var q = Q.defer();
  var dynamodb = core.getDynamoDB();
  // Declared locally so concurrent calls do not share one parameter object.
  var params = {
    TableName: mytable,
    IndexName: 'mytable-index',
    KeyConditionExpression: 'id = :id',
    FilterExpression: 'deliveryTime between :startTime and :endTime',
    ExpressionAttributeValues: {
      ':startTime': {
        N: startTime.toString()
      },
      ":endTime": {
        N: endTime.toString()
      },
      ":id": {
        S: id.toString()
      }
    },
    Select: 'ALL_ATTRIBUTES',
    ScanIndexForward: false,
  };
  var results = [];
  var callback = function(err, data) {
    if (err) {
      console.log('Dynamo fail ' + err);
      q.reject(err);
      // Stop here: there is no usable `data` to read items from.
      return;
    }
    // Record this page before resolving or fetching the next one.
    data.Items.forEach(function(item) {
      results.push(item);
    });
    if (data.LastEvaluatedKey) {
      params.ExclusiveStartKey = data.LastEvaluatedKey;
      dynamodb.query(params, callback);
    } else {
      q.resolve(results);
    }
  };
  dynamodb.query(params, callback);
  return q.promise;
}
I'm moving on to the next step of my web scraper today!
I'm already looping over a URL array with async, and now I would like to loop again inside this callback and wait for that inner loop to finish before continuing with the outer one.
I cannot figure out how to use two callbacks.
This is my code :
// Question code (kept as posted): requests each URL in series, gathers the
// category links found on the page, and writes `data` to output.json once the
// series has finished.
var getWebData = function(url) {
var data = [];
async.eachSeries(url, function(urlSingle, cb) {
request(urlSingle, function(err, resp, body) {
if (!err) {
var $ = cheerio.load(body);
var categoriesURL = [];
// Outer loop over the menu entries; the inner loop collects link targets.
$('.ombre_menu li').each(function(i, element) {
$(this).find('.nav_sous-menu_bloc li a').each(function(i, element) {
categoriesURL.push('https://blabla' + $(this).attr('href'));
})
// I WANT TO LOOP on the categoriesURL array HERE
// One placeholder object is pushed per menu entry.
var jsObject = { name : "", description : "", price: "", categorie: "", liter: "", kilo: "", pricePer: "", quantity: "", capacity: "", promotion: "", scrapingDate : "", url: "" };
data.push(jsObject);
})
}
// Called whether or not the request failed, so the series always continues.
cb();
})
}, function() {
// this will run when the loop is done
// `data` is stringified here and then stringified a second time below.
var json = JSON.stringify(data);
fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err) {
console.log('File successfully written!');
});
});
}
getWebData(url);
app.listen('8080');
Does anyone know how can I do ?
Thanks
Made couple of changes in your code:
Used .mapSeries in place of .eachSeries. This way you get the results of the iterator function in the same order as the input array. That means a square function given the input [2,3] will always give you [4,9], never [9,4].
Broke code into functions so that each function does one specific task
Moved categoriesURL processing out of loop 1
Returning early. It improves code readability. if (err) return callback(err);
/**
 * Processes every URL in series with processUrl and writes the collected
 * results to output.json.
 *
 * @param {string[]} url - URLs to process, in order.
 */
function getWebData(url) {
  // Using .mapSeries in place of .eachSeries to get the results of the
  // iterator function back in input order.
  async.mapSeries(url, processUrl, function(err, results) {
    // This runs when the loop is done.
    if (err) {
      // Without complete results there is nothing meaningful to write.
      console.error('Error', err);
      return;
    }
    // Stringify exactly once: stringifying the JSON text again would write a
    // quoted, escaped string instead of the results themselves.
    var json = JSON.stringify(results, null, 4);
    fs.writeFile('output.json', json, function(writeErr) {
      if (writeErr) {
        console.error('Error', writeErr);
        return;
      }
      console.log('File successfully written!');
    });
  });
}
/**
 * Downloads one page, extracts its category links and processes each link in
 * series with processCategoryUrl.
 *
 * @param {string} url - page to download.
 * @param {Function} callback - called with (err) on failure, or with
 *   (null, results) where results come from processing the category links.
 */
function processUrl(url, callback) {
  request(url, function onResponse(requestError, response, html) {
    // Bail out early on a failed request.
    if (requestError) {
      return callback(requestError);
    }

    var $ = cheerio.load(html);
    var categoryLinks = [];

    // For every menu entry, collect the targets of its sub-menu anchors.
    // cheerio passes the current element as second argument (same as `this`).
    $('.ombre_menu li').each(function(menuIndex, menuItem) {
      var anchors = $(menuItem).find('.nav_sous-menu_bloc li a');
      anchors.each(function(anchorIndex, anchor) {
        categoryLinks.push('https://blablablac' + $(anchor).attr('href'));
      });
    });

    // mapSeries keeps the results in the same order as the links.
    async.mapSeries(categoryLinks, processCategoryUrl, function onCategoriesDone(categoryError, categoryResults) {
      if (categoryError) {
        return callback(categoryError);
      }
      return callback(null, categoryResults);
    });
  });
}
/**
 * Placeholder for the per-category work: currently does nothing with
 * `categoryUrl` and immediately invokes `callback` without arguments.
 *
 * @param {string} categoryUrl - category link to process (unused for now).
 * @param {Function} callback - completion callback; its return value is passed through.
 */
function processCategoryUrl(categoryUrl, callback) {
  // Process categoryUrl here, then report an error or the results.
  var outcome = callback();
  return outcome;
}
getWebData(url);
app.listen('8080');
You can use nested eachSeries. Like this:
/**
 * Requests each URL in series; for every page, collects its category links
 * and walks them with a nested series before moving on to the next URL.
 * When all URLs are done, writes `data` to output.json.
 *
 * @param {string[]} url - URLs to request, in order.
 */
var getWebData = function(url) {
  var data = [];
  async.eachSeries(url, function(urlSingle, cb) {
    request(urlSingle, function(err, resp, body) {
      if (err) {
        // Always signal completion, otherwise the outer series never ends.
        console.log('Request failed for ' + urlSingle + ': ' + err);
        return cb();
      }
      var $ = cheerio.load(body);
      var categoriesURL = [];
      // First gather every category link of the page...
      $('.ombre_menu li').each(function(i, element) {
        $(this).find('.nav_sous-menu_bloc li a').each(function(i, element) {
          categoriesURL.push('https://blablablac' + $(this).attr('href'));
        });
      });
      // ...then walk them once, so the outer callback fires exactly one time.
      async.eachSeries(categoriesURL, function(categoryUrl, cb2) {
        // Do whatever you want to do here
        cb2();
      }, function() {
        // Inspect an error argument here if the inner iterator reports one,
        // then continue with the next URL.
        cb();
      });
    });
  }, function() {
    // This runs when the loop is done.
    // Stringify exactly once so the file contains the array, not a quoted string.
    var json = JSON.stringify(data, null, 4);
    fs.writeFile('output.json', json, function(err) {
      if (err) {
        console.log('Could not write output.json: ' + err);
        return;
      }
      console.log('File successfully written!');
    });
  });
};
getWebData(url);
app.listen('8080');
I'm brand new to express.js and API calls, and I can't figure out why this is crashing my server? Essentially it will run the first time through and render the page, but then crash the server saying:
TypeError: Cannot read property 'length' of undefined
for (var i = 0; i < data.businesses.length; i++) {
relevant section of code:
// Question code (kept as posted): route that searches Yelp with the term,
// radius and coordinates from the URL and renders the 'selection' view with
// the name and image URL of every business found.
router.get('/:term/:radius/:lat/:lng', function (req, res) {
// A new Yelp client is created on every request.
var yelp = require("yelp").createClient({
consumer_key: "xxxx",
consumer_secret: "xxxx",
token: "xxxx",
token_secret: "xxxx"
});
yelp.search({
term: req.params.term,
radius_filter: req.params.radius,
ll: req.params.lat + ',' + req.params.lng
},
function (error, data) {
var businessesArr = [];
// NOTE(review): `typeof data` always yields a non-empty string, so this
// condition is always truthy and the else branch is never reached; when
// data.businesses is missing, the loop below throws.
if (typeof data) {
for (var i = 0; i < data.businesses.length; i++) {
businessesArr.push({
name: data.businesses[i].name,
image_url: data.businesses[i].image_url
});
}
res.render('selection', {
businesses: businessesArr
});
// console.log(data);
} else {
console.log(error);
}
});
});
This line:
if (typeof data) {
is always going to evaluate to true since it actually returns a string no matter what.
Replace it with something like:
if (data && data.businesses) {
my openstates.billDetail function has a call back function inside of it, and I'm getting an error: 'callback must be a function', but my callback appears to be a function!
// Question code (kept as posted): looks up a legislator by the first and last
// name encoded in the URL as "first_last", then searches bills and logs the
// detail of each bill found.
// NOTE(review): no response is ever sent back through `response`.
app.get('/search/:searchTerm', function(req, response) {
var nameArray = req.params.searchTerm.split('_');
var bills = []
var billIds = []
openstates.legSearch({
first_name: nameArray[0],
last_name: nameArray[1]
}, function(err, data) {
if (!err) {
// The legislator search result itself is not used by the bill search.
openstates.billSearch({
state: 'CA',
chamber: 'lower',
page: '1'
}, function(err, data) {
// `err` is not checked here before `data` is read.
for (var billIndex = 0; billIndex < data.length; billIndex++) {
billIds.push(data[billIndex].id)
}
// One detail request is started per collected bill id.
for (var billIdIndex = 0; billIdIndex < billIds.length; billIdIndex++) {
openstates.billDetail(billIds[billIdIndex], function(err, data) {
console.log(data);
})
}
})
}
})
})
Anyone have any thoughts on this? my other callbacks work fine...
I tested your code, and it works perfectly fine.
I added the following to test it:
// Stand-in for the openstates client: every method ignores its first argument
// and immediately calls back with no error and a fixed marker value.
var openstates = {
  legSearch: function(query, done) {
    done(null, 'aa');
  },
  billSearch: function(query, done) {
    done(null, 'bb');
  },
  billDetail: function(billId, done) {
    done(null, 'cc');
  }
};
Got no error, and the last function print 'cc' as expected.
BTW, you do not send any response back, I am not sure if this is on purpose.
I am using Cradle to store objects in CouchDB from my Node.js server. The objects contain functions....
// Question code (kept as posted): constructor that stores `a` and `b` and
// attaches `addparts` as an own property of every instance.
function AnObject(a, b){
this.a = a; this.b = b;
// Because addparts lives on the instance itself, it is part of the object
// that is handed to db.save below.
this.addparts = function(){return this.a + this.b;};}
var cradle = require('cradle');
var db = new(cradle.Connection)('http://localhost', 5984, {cache: true, raw: false}).database('myDB');
var myObject = new AnObject(1, 2);
console.log("addparts:" + myObject.addparts());
// Saves the object under the document id 'myObjectId'.
db.save('myObjectId', myObject);
This works fine and the document is stored but when I retrieve it, I can no longer call the function on the returned document...
// Question code (kept as posted): loads the stored document and tries to call
// addparts on it. `err` is not checked before the document is used.
db.get('myObjectId', function(err, myRetrievedObject){
console.log("addparts:" + myRetrievedObject.addparts());
});
This fails with a (Property is not a function) Error..
node cradle_test
cradle_test.js:21
console.log("addparts:" + myRetrievedObject.addparts());
^
TypeError: Property 'addparts' of object {"_id":"myObjectId","_rev":"2-83535db5101fedfe30a1548fa2641f29","a":1,"b":2,"addparts":"function (){return this.a + this.b;}"} is not a function
CouchDB stores JSON. Functions are not valid JSON. Functions are never stored in the database.
I recommend you move the functions out into a prototype.
/**
 * Plain data holder for two parts; behaviour lives on the prototype so that
 * instances carry only the data fields `a` and `b`.
 */
function AnObject(a, b) {
  this.a = a;
  this.b = b;
}

/** Returns `a + b` of the object it is called on. */
AnObject.prototype.addparts = function() {
  var left = this.a;
  var right = this.b;
  return left + right;
};
// Loads the stored document and copies its fields onto a fresh object whose
// prototype is AnObject.prototype, so the prototype methods can be called.
// `err` is not checked before the document is used.
db.get('myObjectId', function(err, myRetrievedObject){
var obj = Object.create(AnObject.prototype);
// for some value of extend ( https://github.com/Raynos/pd#pd.extend )
// NOTE(review): `extend` is not defined in this snippet; presumably it copies
// the properties of the second argument onto the first. Confirm against the
// helper actually used.
extend(obj, myRetrievedObject);
console.log("addparts:" + obj.addparts());
});
This way you're not saving functions, and you can still operate on your object using your methods. You just have to make sure that your retrieved object is made an instance of AnObject.
There is a way to store functions in CouchDB: as attachments.
Define your functions in a separate .js file
(for example, a set of functions you want to share across multiple servers or app instances).
/modules/db.js:
// Callback-style helpers around a nano database handle. Every helper reports
// its outcome through `cb` as a single object: {success: true, data: ...} on
// success, {success: false, error: <message>} on failure.
// NOTE(review): `util.extractDocs` and `util.extractDocsSearch` are used below
// but `util` is not defined in this snippet; it must be provided by the
// surrounding file.
var db = {};

// Inserts one document; on success `data` is the response body.
db.insert = function(nanoDb, object, cb) {
  nanoDb.insert(object, function(err, body, header) {
    if (!err) {
      cb.call(null, {success: true, data: body});
    } else {
      console.log('[insert] ', err.message);
      cb.call(null, {success: false, error: err.message});
    }
  });
};

// Inserts all documents of `array` in one bulk request; on success `data` is
// the response body.
db.bulkInsert = function(nanoDb, array, cb) {
  // structure for bulk insert
  var data = {
    docs: array
  };
  nanoDb.bulk(data, function(err, body, header) {
    if (!err) {
      cb.call(null, {success: true, data: body});
    } else {
      console.log('[bulkInsert] ', err.message);
      cb.call(null, {success: false, error: err.message});
    }
  });
};

// Deletes all documents of `array` in one bulk request by flagging each of
// them with `_deleted: true`. The documents passed in are modified in place.
db.bulkDelete = function(nanoDb, array, cb) {
  // forEach instead of `for (i in array)`: no leaked global counter and no
  // iteration over non-index properties.
  array.forEach(function(doc) {
    doc._deleted = true;
  });
  var data = {
    docs: array
  };
  nanoDb.bulk(data, function(err, body) {
    if (!err) {
      // NOTE(review): reports the submitted payload, not the response body,
      // unlike bulkInsert. Kept as is; confirm this is intended.
      cb.call(null, {success: true, data: data});
    } else {
      console.log('[bulkDelete] ', err.message);
      cb.call(null, {success: false, error: err.message});
    }
  });
};

// Queries a view; on success `data` is the result of util.extractDocs(body).
db.view = function(nanoDb, design, view, params, cb) {
  nanoDb.view(design, view, params, function(err, body) {
    if (!err) {
      var docs = util.extractDocs(body);
      cb.call(null, {success: true, data: docs});
    } else {
      console.log('[view] ', err.message);
      cb.call(null, {success: false, error: err.message});
    }
  });
};

// Queries a search index; on success `data` is the result of
// util.extractDocsSearch(body).
db.search = function(nanoDb, design, index, params, cb) {
  nanoDb.search(design, index, params, function(err, body) {
    if (!err) {
      var docs = util.extractDocsSearch(body);
      cb.call(null, {success: true, data: docs});
    } else {
      console.log('[search] ', err.message);
      cb.call(null, {success: false, error: err.message});
    }
  });
};

// Returns the feed object created by nanoDb.follow(params).
db.follow = function(nanoDb, params) {
  var feed = nanoDb.follow(params);
  return feed;
};

module.exports = db;
Use a CouchApp to deploy the functions as attachments (in a design doc):
//your couchapp
// Deployment script: uploads the .js files of ./modules/ as attachments of a
// design document.
var couchapp = require('couchapp');

// url to your database
var url = '...';

// Empty design doc (for attachments). Declared with var so that it does not
// become an implicit global (which would also fail in strict mode).
var ddoc = {
  _id: '_design/mods'
};

// folder containing .js files
couchapp.loadAttachments(ddoc, './modules/');

// this function uploads your attachments
couchapp.createApp(ddoc, url, function(app) {
  app.push(function() {
    // do something
  });
});
Now, get the functions wherever you need them:
//use mikaels request module if you like
// Fetches a .js attachment over HTTP and loads it as if it had been required.
// use mikeal's request module if you like
var request = require('request');

// Tell the app where to get your .js file. It is sometimes a good idea to
// persist these references in memory or even in your couchdb.
// The design document deployed above is '_design/mods', so the attachment
// lives under that id.
// NOTE(review): this is only the path part; prefix it with the URL of your
// database before use.
var fileUrl = '/_design/mods/db.js';

// set this variable in the proper scope
var db;

/**
 * Compiles `src` as a module and returns its exports.
 *
 * @param {string} src - JavaScript source of the module.
 * @param {string} file - file name to compile the source under.
 * @returns {*} whatever the source assigned to module.exports.
 */
var _require = function(src, file) {
  var m = new module.constructor();
  m.paths = module.paths;
  m._compile(src, file);
  return m.exports;
};

request({ url: fileUrl, json: true }, function (err, response, data) {
  if (!err && response.statusCode === 200) {
    // Turn the downloaded source (object w/functions) back into usable form.
    // A file name is passed because _compile expects one.
    db = _require(data, 'db.js');
  } else {
    console.log('[request]', err);
  }
});
Do stuff!
db.doSomething()