Select using recursivity to iterate node js - javascript

I'm iterating through a MySQL table with 20,000 rows. For each row I need to call a URL, get its content, and update the database. How do I wait for that whole process to finish before continuing the iteration? And how can I make it faster, for example by processing two or three rows at a time? Thanks.
// Selects every product whose description is still the "0" placeholder and
// processes the rows strictly one after another: the next row is only started
// from the callback of the previous doCall.
var query = connection.query('SELECT * from product where product.product_description = "0" ', function(err, rows, fields) {
  if (err) {
    console.log('Error while performing Query.');
    return;
  }

  // Handles rows[index]; stops at the end of the result set (an empty result
  // set therefore does nothing) or when doCall reports a falsy response.
  function processRow(index) {
    if (index >= rows.length) {
      return;
    }
    var row = rows[index];
    // NOTE(review): the original snippet used a different host for the first
    // row than for the following ones; that is preserved here — confirm
    // whether a single host was intended.
    var host = index === 0 ? 'http://example.com' : 'http://www.example2.com';
    var url = host + row.url.replace('../..', '');
    var kontador = index + 1;
    doCall(url, row.id, kontador, function(response) {
      console.log(response, kontador);
      if (response) {
        processRow(index + 1);
      }
    });
  }

  processRow(0);
});
/**
 * Fetches a page, takes the HTML of the first <p> inside the first #content
 * element and writes it to product.product_description for the given id.
 *
 * @param {string} urlToCall - URL of the page to request.
 * @param {number|string} id - Value matched against product.id in the UPDATE.
 * @param {number} kontador - Caller's counter; not used here, kept so the
 *   signature stays the same.
 * @param {function(boolean)} callback - Invoked once per call: true after a
 *   successful UPDATE, false when the request, the extraction or the UPDATE
 *   failed. A non-function value is ignored.
 */
function doCall(urlToCall, id, kontador, callback) {
  var done = typeof callback === 'function' ? callback : function() {};

  request({'url': urlToCall}, function(error, response, html) {
    if (error) {
      console.log('error request!');
      return done(false);
    }

    var $ = cheerio.load(html);
    // Only the first #content match is read, so the callback cannot fire
    // several times for a single page.
    var description = $('#content').first().find('p').html();
    if (description === null || description === undefined) {
      console.log('error content!');
      return done(false);
    }

    // Placeholders make the driver escape the scraped HTML; concatenating it
    // into the statement breaks on quotes and allows SQL injection.
    connection.query('UPDATE product SET product_description = ? WHERE id = ?', [description, id], function(err) {
      if (err) {
        console.log('error sql!');
        return done(false);
      }
      console.log('updated! ');
      done(true);
    });
  });
}

In order to orchestrate the async behavior of your application (what can be done in parallel, should there be throttling, ..) you should use an existing library like :
async - https://www.npmjs.com/package/async - if you prefer node.js callback style
bluebird - http://bluebirdjs.com/docs/getting-started.html - if you prefer promises
highland - https://www.npmjs.com/package/highland - somewhat hybrid + stream like
There are many other libraries that can help you build complex async call graphs.

Related

Express and Mongodb insert same data multiple times

I am quite new to Express and Mongodb. The project that I am working on requires me to:
Take an object that contains multiple url
Download the content of the url and save it to a cloud storage
Generate links for each of the file saved
Save these links into Mongodb as individual documents
The incoming object looks something like this:
{
"id" : 12345678,
"attachments" : [
{
"original_url" : "https://example.com/1.png",
},
{
"original_url" : "https://example.com/2.png",
},
{
"original_url" : "https://example.com/3.png",
}
]
}
the end goal is to have 3 separate document like this saved on mongodb:
{
"id" : 87654321,
"some_other_data": "etc",
"new_url" : "https://mycloudstorage.com/name_1.png"
}
I have a simple loop like this:
// Starts one download per attachment. Each completion callback writes the URL
// it receives onto new_data and passes new_data to save_new_url_to_mongodb.
for(var i = 0; i < original_data.attachments.length; i++){
var new_url = "https://example.com/" + i + ".png";
// NOTE(review): original_url is not assigned anywhere in this snippet — presumably it should be original_data.attachments[i].original_url; confirm.
// The callback's own new_url parameter shadows the loop variable of the same name.
download(original_url, new_url, function(new_url){
console.log(new_url)
// NOTE(review): new_data is not declared in this snippet; if it is one object shared by every iteration, each save is handed that same object — confirm.
new_data.new_url = new_url;
// Called without the second (cb) argument.
save_new_url_to_mongodb(new_data);
});
}
and the saving function looks like this:
/**
 * Inserts one document into the collection returned by getCollection.
 *
 * @param {Object} data - Document to insert.
 * @param {function(Error=, *=)} [cb] - Optional. Receives (err) on failure or
 *   (null, item) on success, where item is fromMongo(result.ops). When it is
 *   omitted, failures are written to console.error instead of raising a
 *   TypeError from calling an undefined callback.
 */
function save_new_url_to_mongodb (data, cb) {
  var done = typeof cb === 'function' ? cb : function (err) {
    if (err) {
      console.error(err);
    }
  };

  getCollection(collection, function (err, collection) {
    if (err) {
      return done(err);
    }
    collection.insert(data, {w: 1, ordered: false}, function (err, result) {
      if (err) {
        return done(err);
      }
      done(null, fromMongo(result.ops));
    });
  });
}
/**
 * Copies the resource at original_url into the bucket file named new_url.
 *
 * A HEAD request is made first; when it yields no response the error is
 * logged and nothing else happens. Otherwise the body is streamed into the
 * bucket file's write stream.
 *
 * @param {string} original_url - URL to download.
 * @param {string} new_url - Name passed to bucket.file().
 * @param {function(string)} callback - Called with new_url once the write
 *   stream emits 'finish'. It is not called when the transfer fails; failures
 *   are logged, as for the HEAD request.
 */
var download = function(original_url, new_url, callback){
  request.head(original_url, function(err, res, body){
    if (res === undefined) {
      console.log(err);
      return;
    }

    var localUrlStream = request(original_url);
    var remoteWriteStream = bucket.file(new_url).createWriteStream();

    // Without a listener an 'error' event on the source stream would be
    // thrown as an uncaught exception.
    localUrlStream.on('error', function (err) {
      console.log(err);
    });

    var stream = localUrlStream.pipe(remoteWriteStream);
    // The original handler called next(err), but no `next` exists in this scope.
    stream.on('error', function (err) {
      console.log(err);
    });
    stream.on('finish', function(){
      callback(new_url);
    });
  });
};
The downloading part is fine, I get 3 different image files in my cloud storage. The console.log also gives me 3 different new urls.
The problem is that the newly saved mongodb document all have the same new_url. And sometimes if there are more original_url in the original data, some of the new documents would fail to save.
Thanks a lot
It's a scoping issue in your assignment of new_url in the for loop. See here: JavaScript closure inside loops – simple practical example
A solution is to use Array.prototype.forEach, which inherently solves the scope issue since each iteration creates a closure for the callback:
// Uploads every attachment and saves one separate document per uploaded file.
// forEach passes (element, index): the element supplies the source URL and the
// index numbers the target file.
original_data.attachments.forEach(function (attachment, i) {
  var target_url = "https://example.com/" + i + ".png";
  download(attachment.original_url, target_url, function (new_url) {
    console.log(new_url)
    // A fresh copy per file: reusing one new_data object would hand the same
    // object to every save.
    var doc = Object.assign({}, new_data, { new_url: new_url });
    save_new_url_to_mongodb(doc, function (err) {
      if (err) {
        console.log(err);
      }
    });
  });
});

Javascript function doesn't return query result

I am trying to figure out why one of my queries won't return the value from a query...my code looks like this:
// Creates the client, starts connecting, and issues the count query once,
// outside of getNEO.
var client = new pg.Client(conString);
client.connect();
var query = client.query("SELECT count(*) as count FROM sat_scores")
// Don't use demo key in production. Get a key from https://api.nasa.gov/index.html#apply-for-an-api-key
// Appends the count from the query's events to a string and hands that string
// to `callback` when the query emits 'end'.
function getNEO(callback) {
var data = '';
// NOTE(review): this handler is registered for an event named 'rows'; the later revision of getNEO in this file listens for 'row' — confirm which event the query object emits.
query.on('rows', function(rows) {
console.log("Row count is: %s", rows[0].count)
data += rows[0].count;
});
query.on('end', function() {
callback(data);
});
}
with that, getNEO returns a blank...but if I set var data = '4', then getNEO returns 4....the query should return 128 but it just returns a blank...
First of all, getNEO() doesn't return anything - I'm operating on the assumption that you call getNEO() exactly once for your query, and pass in a callback to handle the data, and that callback is what's not getting the appropriate data?
My typical recommendation for troubleshooting things like this is to simplify your code, and try and get really close to any example code given (for instance):
var client = new pg.Client(conString);
// define your callback here, in theory
// The query is only issued from inside the connect callback, i.e. after
// connect has reported its outcome.
client.connect(function (err) {
  if (err) throw err;
  // The result handler is the second argument of client.query; the original
  // snippet closed the call before it, which does not parse.
  client.query("SELECT count(*) as count FROM sat_scores", function (err, result) {
    if (err) throw err;
    console.log(result.rows.length);
  });
});
... I'm doing a couple things here you'll want to note:
It looks like the client.connect() method is asynchronous - you can't just connect and then go run your query, you have to wait until the connection is completed, hence the callback. Looking through the code, it looks like it may emit a connect event when it's ready to send queries, so you don't have to use a callback on the connect() method directly.
I don't see a data event in the documentation for the query object nor do I see one in the code. You could use the row event, or you could use a callback directly on the query as in the example on the main page - that's what I've done here in the interest of simplicity.
I don't see the count property you're using, and row[0] is only going to be the first result - I think you want the length property on the whole rows array if you're looking for the number of rows returned.
I don't know if you have a good reason to use the getNEO() function as opposed to putting the code directly in procedurally, but I think you can get a closer approximation of what you're after like this:
var client = new pg.Client(conString);
// define your callback here, in theory
client.connect();

/**
 * Runs the count query once the client emits 'connect' and passes the
 * result's rowCount to `callback` when the query emits 'end'.
 *
 * NOTE(review): the query is only issued from the 'connect' handler, so this
 * appears to require that getNEO is called before that event fires — confirm.
 *
 * @param {function(number)} callback - Receives result.rowCount.
 */
function getNEO(callback) {
  client.on('connect', function () {
    // The original line carried an extra closing parenthesis and did not parse.
    var query = client.query("SELECT count(*) as count FROM sat_scores");
    query.on('end', function(result) {
      callback(result.rowCount);
    });
  });
}
... so, you can call your getNEO() function whenever you like, it'll appropriately wait for the connection to be completed, and then you can skip tracking each row as it comes; the end event receives the result object which will give you all the rows and the row count to do with what you wish.
so here is how I was able to resolve the issue....I moved the var query inside of the function
/**
 * Queries the average Math and Reading scores for one state and passes the
 * collected rows to `callback` as a string (each row JSON-serialised and
 * appended).
 *
 * @param {string} state - Value compared against the State column.
 * @param {function(string)} callback - Called once, when the query ends or
 *   fails, with whatever was collected up to that point ('' when nothing was).
 */
function getNEO(state, callback) {
  // NOTE(review): the connection string, including its credentials, is
  // hard-coded — consider loading it from configuration.
  var conString = "postgres://alexa:al#alexadb2.cgh3p2.us-east-1.redshift.amazonaws.com:5439/alexa";
  var client = new pg.Client(conString);
  client.connect();
  var data = '';
  var finished = false;

  // Closes the client (one is opened per call) and reports exactly once.
  function finish() {
    if (finished) {
      return;
    }
    finished = true;
    client.end();
    callback(data);
  }

  // $1 keeps `state` out of the SQL text, so quotes in it cannot alter the query.
  var query = client.query(
    "SELECT avg(Math) as math, avg(Reading) as reading FROM sat_scores WHERE State = $1",
    [state]
  );
  console.log("query is: %s", query);
  query.on('row', function(row) {
    console.log("Row cnt is: %s", row.math);
    console.log("row is: " + JSON.stringify(row));
    // Appending the object itself would only produce "[object Object]".
    data += JSON.stringify(row);
  });
  console.log("made it");
  query.on('error', function(err) {
    console.log(err);
    finish();
  });
  query.on('end', finish);
}

Array filled in the wrong order

I have a strange problem: when I push a result into my array, it does not end up at the right position (for example, the result that should be at index 1 ends up at index 3), and when I re-run my module the results change position randomly in the array.
// Requests one page per entry in `links`, cuts a substring out of each
// response body and pushes it onto an array; `callback` is invoked once per
// response (with an error string, or with the array as it stands so far).
var cote = function(links, callback) {
var http = require('http');
var bl = require('bl');
// NOTE(review): coteArgus is declared here but never used below; the code pushes to myresult instead, which is not declared in this snippet — confirm which array was intended.
var coteArgus = [];
// `i` is assigned without var/let, so it is not local to this function.
for (i = 0; i < links.length; i ++) {
http.get('http://www.website.com/' + links[i], function(response) {
response.pipe(bl(function(err, data) {
if (err) {
callback(err + " erreur");
return;
}
var data = data.toString()
// newcoteArgus is likewise assigned without a declaration.
newcoteArgus = data.substring(data.indexOf('<div class="tx12">') + 85, data.indexOf(';</span>') - 5);
// The push happens inside the response callback, so the array order follows the order in which these callbacks run.
myresult.push(newcoteArgus);
callback(myresult);
}));
});
}
};
exports.cote = cote;
The problem lies in the fact that although the for is synchronous the http.get and the pipe operation are not (I/O is async in nodejs) so the order of the array depends on which request and pipe finishes first which is unknown.
Try to avoid making async operations in a loop, instead use libraries like async for flow control.
I think this can be done in the right order, using async map
Here a sample with map and using request module.
// There's no need to make requires inside the function,
// is better just one time outside the function.
var request = require("request");
var async = require("async");
/**
 * Fetches http://www.website.com/<link> for every entry of `links` and
 * extracts one value per page.
 *
 * @param {string[]} links - Paths appended to the site root.
 * @param {function(Error=, string[]=)} callback - Receives (err) when a
 *   request fails, otherwise (null, results). async.map delivers the results
 *   in the same order as `links`, whichever request finishes first.
 */
var cote = function(links, callback) {
  async.map(links, function(link, nextLink) {
    request("http://www.website.com/" + link, function(err, response, body) {
      if (err) {
        // Reporting an error makes async.map go straight to the final callback.
        return nextLink(err);
      }
      // The marker uses double-quoted attribute quotes, the same markup the
      // question's code searches for; searching for class='tx12' would not
      // match it and indexOf would return -1.
      var newcoteArgus = body.substring(
        body.indexOf('<div class="tx12">') + 85,
        body.indexOf(";</span>") - 5
      );
      // Hand this page's value to async.map and move on.
      nextLink(null, newcoteArgus);
    });
  },
  function(err, results) {
    if (err) return callback(err);
    callback(null, results);
  });
};
exports.cote = cote;
One more thing, i'm not sure, really what you are doing in the part where you search html content in the responses but there's a really good library to work with JQuery selectors from server side maybe can be useful for you.
Here's how you should call the function
// Usage sample: the callback receives (err, data) and logs whichever one it got.
var thelinks = ["features", "how-it-works"];
cote(thelinks, function(err, data) {
  if (err) {
    console.log("Error: ", err);
  } else {
    console.log("data --> ", data);
  }
});

Recursive Fetch All Items In DynamoDB Query using Node JS

This is probably more of an JS/Async question than a DynamoDB specific question -
I want to fetch all the items in a table with a hash key in Amazon's DynamoDB. The table also has Range key in it.
I am using a NodeJS library which is a wrapper around AWS DynamoDB REST API. -
Node-DynamoDB
DynamoDB only returns 1 MB worth of results with each query. To fetch the remainder of the results, the response includes lastEvaluatedKey. We can include this in another query to fetch another 1 MB worth of results, and so on...
I am facing difficulty in writing a recursive async function which should hit the service sequentially till i can get all the results back. (table will never have more than 10 MB for my use case, no chance of a runaway query)
Some pseudo code for illustration:
// Illustrative sketch only: a single query whose callback branches on whether
// the result carries a lastEvaluatedKey. Neither branch issues a follow-up
// query or merges results yet.
ddb.query('products', primarykey, {}, function(err,result){
//check err
if(result && result.lastEvaluatedKey){
//run the query again
var tempSet = result.items;
//temporarily store result.items so we can continue and fetch remaining items.
}
else{
var finalSet = result.items;
//figure out how to merge with items that were fetched before.
}
});
/**
 * Collects the items of every page of a 'products' query for `primarykey`.
 *
 * @param {*} primarykey - Passed through to ddb.query as the key argument.
 * @param {function(Error=, Array=)} cb - Receives (err) from the first failing
 *   query, otherwise (null, items) once a page arrives without a
 *   lastEvaluatedKey.
 */
var getAll = function(primarykey, cb) {
  var collected = [];

  // Requests the page that starts after `startKey` (undefined for the first page).
  function fetchFrom(startKey) {
    var options = { exclusiveStartKey: startKey };
    ddb.query('products', primarykey, options, function(err, page) {
      if (err) {
        return cb(err);
      }
      if (page.items.length) {
        Array.prototype.push.apply(collected, page.items);
      }
      if (!page.lastEvaluatedKey) {
        cb(null, collected);
        return;
      }
      fetchFrom(page.lastEvaluatedKey);
    });
  }

  fetchFrom();
};
// Usage: logs the error slot and the collected items.
getAll(primarykey, function(error, items) {
  console.log(error, items);
});
After few cups of coffee, i wrote this recursive function..
Hope this helps others, If you see a bug , please edit it or leave a comment
/**
 * Constructor (call with `new`). Stores its arguments on the instance and
 * adds getItems(), which queries page after page and accumulates result.items
 * in this.items.
 *
 * this.callback is invoked with:
 *   (err, null)   when a query reports an error,
 *   (null, null)  when a query yields a falsy result,
 *   (null, items) when a page has no lastEvaluatedKey with a truthy `hash`,
 *                 or when maxItems is truthy and more than maxItems items
 *                 have been collected.
 */
var DynamoDbItemFetcher = function(table, hash, maxItems, callback) {
  var fetcher = this;
  fetcher.table = table;
  fetcher.startKey = null;
  fetcher.hash = hash;
  fetcher.maxItems = maxItems;
  fetcher.items = [];
  fetcher.callback = callback;

  fetcher.getItems = function() {
    // Only send a start key once a previous page has supplied one.
    var params = {};
    if (fetcher.startKey) {
      params.exclusiveStartKey = fetcher.startKey;
    }

    ddb.query(fetcher.table, fetcher.hash, params, function(queryErr, result) {
      if (queryErr) {
        return fetcher.callback(queryErr, null);
      }
      if (!result) {
        fetcher.callback(null, null);
        return;
      }

      fetcher.items = fetcher.items.concat(result.items);

      var lastKey = result.lastEvaluatedKey;
      var hasMore = lastKey && lastKey.hash;
      var overLimit = hasMore && fetcher.maxItems && fetcher.items.length > fetcher.maxItems;
      if (!hasMore || overLimit) {
        // Last page, or enough collected: return whatever is in store.
        fetcher.callback(null, fetcher.items);
        return;
      }

      fetcher.startKey = lastKey; // continue after this key
      fetcher.getItems(callback); // next page
    });
  };
};
Here's a variation using promises. I needed to get a list of table names, not scan items from a table, but similar concepts apply.
/**
 * Lists table names page by page through dynamodb.listTables.
 *
 * @param {string} [key] - Sent as ExclusiveStartTableName; omit for the first page.
 * @param {string[]} [prevTableNames] - Names gathered by earlier pages.
 * @returns {Promise<string[]>} Resolves with prevTableNames followed by the
 *   names of this and all later pages; rejects with the first listTables error.
 */
function getTableNames(key, prevTableNames) {
  const gathered = prevTableNames || [];

  return new Promise((resolve, reject) => {
    dynamodb.listTables({ ExclusiveStartTableName: key }, (err, response) => {
      if (err) {
        reject(err);
        return;
      }

      const tableNames = gathered.concat(response.TableNames);
      const nextKey = response.LastEvaluatedTableName;
      if (!nextKey) {
        resolve(tableNames);
        return;
      }

      // Another page exists: settle this promise with the outcome of the rest.
      getTableNames(nextKey, tableNames)
        .then(resolve)
        .catch(reject);
    });
  });
}

Calling a function from within a callback causes node app to hang

I have updated the post with the actual code.
The problem is that the node app hangs and does not exit unless I comment out the query in addArticle. I am wonder what I'm doing wrong here (in regards to the hanging problem).
/**
 * Inserts an article unless a row with a matching link already exists.
 *
 * @param {string} title
 * @param {string} text
 * @param {number} date - Stored as given.
 * @param {string} link - Used for the existence check (LIKE) and stored.
 * @param {function(Error=, boolean=)} [callback] - Optional. Receives (err) on
 *   a query error, otherwise (null, true) after an insert or (null, false)
 *   when the link was already present. Without a callback a query error is
 *   thrown.
 */
function addArticle(title, text, date, link, callback) {
  var done = typeof callback === 'function' ? callback : function(err) {
    if (err) throw err;
  };

  // Values go through `?` placeholders only. Escaping them with
  // connection.escape() first would escape them twice and store the quote
  // characters as part of the data.
  connection.query("SELECT * FROM articles WHERE link LIKE ?", [link], function(error, rows, fields) {
    if (error) {
      // rows is not usable when the SELECT failed.
      return done(error);
    }
    if (rows.length !== 0) {
      return done(null, false);
    }

    console.log("article not in database");
    var values = [title, text, date, link, '{}'];
    connection.query("INSERT INTO articles (title, text, date, link, topics) VALUES ?", [[values]], function(err) {
      if (err) {
        return done(err);
      }
      done(null, true);
    });
  });
}
/**
 * Loads the archive page, follows every relative link whose href contains
 * 'article', and hands each linked page's title, paragraph text, the current
 * timestamp and the full link to addArticle.
 */
function scrapeReuters() {
  var url = 'http://www.reuters.com/news/archive/technologyNews?date=10092013';
  request(url, function(err, resp, body) {
    if (err) {
      console.log(err);
      return;
    }

    // Kept local: assigning `$` without a declaration creates a global that
    // the article callbacks below would overwrite.
    var $ = cheerio.load(body);
    $('a').each(function(i, link) {
      var addr = $(link).attr('href');
      // Only relative hrefs that contain 'article' are followed.
      if (addr == null || addr.indexOf('article') === -1 || addr.indexOf('http') !== -1) {
        return;
      }

      var full_link = "http://www.reuters.com" + addr;
      var title = $(link).text();
      request(full_link, function(err, resp, body) {
        if (err) {
          console.log(err);
          return;
        }
        var article = cheerio.load(body);
        var para = article('p').text();
        addArticle(title, para, new Date().getTime(), full_link);
      });
    });
  });
}
You probably need to close the connection after all the queries have finished. You can try using the https://github.com/caolan/async library to run the queries in sequence and then in a master callback, close the connection.
It's a little tricky, but first you need to define an array of functions to execute. Then you run async.series(arrayOfFns, masterCallback). The master callback receives an error (from the first function that fails, if any) and the results (plural: one entry per function). In that master callback, terminate the MySQL connection and/or end the process.
To do this, I would rewrite the addArticle query to just return the query string. Then before your $(links).each loop, I would make an array called toInsert
In each loop I would say
// Queues one task per article. Each task runs a single query and reports
// (err) on failure or (null, true) on success through `callback`.
// `addArticle(...)` is a placeholder: the arguments are elided in this sketch,
// and per the accompanying text addArticle is expected to return the SQL string.
toInsert.push(function(callback) {
connection.query(addArticle(...),function(err) {
if(err) callback(err);
else callback(null,true);
});
});
Then after the loop run
// Runs the queued tasks one after another. async.series stops at the first
// task that reports an error and then calls the final callback with (err, results).
async.series(toInsert, function(err, results) {
  if (err) {
    console.log(err);
  }
  // mysql connections are closed with end(); its callback runs once the
  // connection has been shut down, so the process is only ended after that.
  connection.end(function(endErr) {
    if (endErr) {
      console.log(endErr);
    }
    process.exit(err || endErr ? 1 : 0);
  });
});

Categories

Resources