node js request promise, scraping - javascript

I'm scraping a website using the request library with Node.js. I have an array of URLs that I loop through, making a request on each. The problem is that the requests are asynchronous and I need to do something ONLY after all the requests have completed. Here is the code:
for (var i = 0; i < urls.length; i++) {
    request(urls[i], function (err, resp, body) {
        if (!err && resp.statusCode == 200) {
            var $ = cheerio.load(body);
            var string = $('.author .mini').text();
            var regExp = /(\+971|00971|05)\d{1,12}/g;
            if (string.match(regExp)) { mobilePhones.push(string.match(regExp)[0]); }
        }
    });
}
so when all the requests are done, I just want to console.log(mobilePhones);

This would be much easier to accomplish using Promise.all(). Note that the plain request library takes a callback and does not return a promise, so for this to work you need a promise-returning variant such as request-promise:
var myFunc = function (err, resp, body) {
    if (!err && resp.statusCode == 200) {
        var $ = cheerio.load(body);
        var string = $('.author .mini').text();
        var regExp = /(\+971|00971|05)\d{1,12}/g;
        if (string.match(regExp)) { mobilePhones.push(string.match(regExp)[0]); }
    }
};

var p1 = request(urls[0], myFunc);
var p2 = request(urls[1], myFunc);
// Do this for all of urls.length

Promise.all([p1, p2, p3, p4, p5]).then(() => {
    console.log(mobilePhones);
});
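Since the URLs are already in an array, a more direct sketch (assuming the plain request and cheerio modules from the question, with urls and mobilePhones in scope) wraps each request in a Promise and maps over the array:

// Sketch only: wrap each callback-style request in a Promise and wait for all of them.
var promises = urls.map(function (url) {
    return new Promise(function (resolve) {
        request(url, function (err, resp, body) {
            if (!err && resp.statusCode == 200) {
                var $ = cheerio.load(body);
                var string = $('.author .mini').text();
                var regExp = /(\+971|00971|05)\d{1,12}/g;
                if (string.match(regExp)) { mobilePhones.push(string.match(regExp)[0]); }
            }
            resolve(); // resolve even on error so Promise.all always settles
        });
    });
});

Promise.all(promises).then(function () {
    console.log(mobilePhones);
});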

Alternatively, without promises, you can keep a counter of outstanding requests and act once it reaches zero:
var count = urls.length;
for (var i = 0; i < urls.length; i++) {
    request(urls[i], function (err, resp, body) {
        if (!err && resp.statusCode == 200) {
            var $ = cheerio.load(body);
            var string = $('.author .mini').text();
            var regExp = /(\+971|00971|05)\d{1,12}/g;
            if (string.match(regExp)) { mobilePhones.push(string.match(regExp)[0]); }
        }
        count--;
        if (count === 0) doSomething(); // all requests are done
    });
}

Related

Javascript print all the status at a time using request node module

I'm new to JavaScript and promises. I have a requirement where I need to get the HTTP status of my URLs (I have 10 URLs) all at once (not sequentially).
So I wrote the code below:
var request = require('request');

var fun = function(i) {
    request('http://myapp' + i + '.com', function(error, response, body) {
        console.log(response && response.statusCode, i);
    });
};

for (i = 0; i < 10; i++) {
    fun(i);
}
but I'm getting status1, then status2, then status3, and so on. My requirement is to print all the statuses at one time.
Then I tried the code below:
var request = require('request');

var fun = function(myapp) {
    return new Promise(function(resolve, reject) {
        request('http://' + myapp + '.com', function(error, response, body) {
            resolve(response && response.statusCode);
        });
    });
};

for (i = 0; i < 10; i++) {
    fun('myapp' + i).then(function(val1) {
        console.log(val1);
    });
}
But still, I'm getting status1, then status2, then status3, and so on.
Any help is appreciated.
You could collect all the promises in an array, then use Promise.all on that to get an array of results:
const promises = [];
for (i = 0; i < 10; i++) {
    promises.push(fun('myapp' + i));
}
Promise.all(promises)
    .then(console.log, console.error);
Your code is correct. It executes asynchronously, in parallel.
However, you are confused by seeing the output printed in sequence (or what looks like a sequence). This is normal: whether the requests run in parallel or synchronously, the output (as coded) will be printed one line after another, though maybe not in the same order.
If you want to output everything at once when finished, do something like the following:
var request = require('request');

var finished = new Array(10);
var numFinished = 0;

var fun = function(i) {
    request('http://myapp' + i + '.com', function(error, response, body) {
        finished[i] = response && response.statusCode ? response.statusCode : 'No response';
        numFinished++;
    });
};

for (i = 0; i < 10; i++) {
    fun(i);
}

var timer = setInterval(function() {
    if (10 <= numFinished) {
        clearInterval(timer);
        console.log(finished.join(',')); // print all at once
    }
}, 500);
Or if you use promises you can do:
var request = require('request');

var fun = function(myapp) {
    return new Promise(function(resolve, reject) {
        request('http://' + myapp + '.com', function(error, response, body) {
            resolve(response && response.statusCode ? response.statusCode : 'No response');
        });
    });
};

var promises = new Array(10);
for (i = 0; i < 10; i++) {
    promises[i] = fun('myapp' + i);
}

Promise.all(promises).then(function(results) {
    console.log(results.join(',')); // print all at once
});

how to make async html parser in nodejs with promises?

Having fun with promises in JS and trying to craft a simple XPath website parser, but I am struggling with the logic for finishing the overall parsing process. My code is:
var request = require('request');
var xpath = require('xpath');
var dom = require('xmldom').DOMParser;

var olxMain = 'https://www.some.site/';
var xpathRoot = '//a[contains(@href, "https://www.some.site/mask/")]';
var linksXpath = '//a';

var allGlobalLinks = [];
var getLink = function (node) {
    for (key in node['attributes']) {
        if (node['attributes'][key]['name'] === 'href') {
            return node['attributes'][key]['value'];
        }
    }
};

var getData = function (url, xpathPattern) {
    return new Promise(function (resolve, reject) {
        console.log("Opening " + url);
        var processResponse = function (error, response, body) {
            var doc = new dom().parseFromString(body);
            var childNodes = xpath.select(xpathPattern, doc);
            var links = childNodes.map(function (n) {
                return getLink(n);
            });
            resolve(links);
        };
        request({url: url}, processResponse);
    });
};

var arrayUnique = function (x, i, a) {
    return a.indexOf(x) == i;
};
var main = function () {
    getData(olxMain, xpathRoot).then(function (links) {
        links = links.filter(arrayUnique);
        var maxThreads = 10, n = 0;
        var chunks = [];
        for (k in links) {
            var url = links[k];
            n++;
            if (n <= maxThreads)
                chunks.push(url);
            else {
                n = 0;
                // console.log(chunks);
                Promise.all(chunks.map(function (url) {
                    return getData(url, linksXpath);
                })).then(function (links) {
                    // add these links to global scope list here
                    console.log("Finished mappings iteration");
                });
                chunks = [];
            }
        }
    });
};

main();
So what I want is basically some kind of thread pool with promises. How do I manage these 10 promises so that when they are all finished, I spawn another 10, until the list is exhausted and all promises have finished?
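A minimal sketch of one way to do that, assuming the getData() helper and linksXpath from the code above and that links is a flat array of URLs: split the list into chunks of 10 and chain one Promise.all per chunk, so the next batch only starts after the previous one has settled. The processInBatches name is only illustrative.

// Sketch only: process `allLinks` in sequential batches of `batchSize` parallel requests.
function processInBatches(allLinks, batchSize) {
    var results = [];
    var chain = Promise.resolve();
    for (var i = 0; i < allLinks.length; i += batchSize) {
        (function (chunk) {
            chain = chain.then(function () {
                return Promise.all(chunk.map(function (url) {
                    return getData(url, linksXpath);
                })).then(function (linkLists) {
                    results = results.concat(linkLists); // collect each batch's links
                });
            });
        })(allLinks.slice(i, i + batchSize));
    }
    return chain.then(function () { return results; });
}

// Usage: processInBatches(links, 10).then(function (all) { console.log(all.length); });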

Chrome Extension: Data is not being inserted nor fetched

I am using WebSQL. I am trying to add data in an async block, which causes the data not to be inserted. The code is given below:
function fetchData() {
    var xhr = new XMLHttpRequest();
    xhr.open("GET", "http://localhost/x/fetch.php", true);
    xhr.onreadystatechange = function() {
        if (xhr.readyState == 4) {
            // JSON.parse does not evaluate the attacker's scripts.
            var resp = xhr.responseText;
            if (resp != null) {
                var json = JSON.parse(resp);
                console.log(resp);
                var data = json['data'];
                if (data != null) {
                    openDatabase('documents', '1.0', 'documents', 5*1024*1024, function (db) {
                        alert('Called'); // This is called after the below two calls.
                        insertRecord(db);
                        fetchRecord(db);
                    });
                    //var dbConnection = openDbConnect();
                    //createTable(dbConnection);
                    for (var a = 0; a <= data.length; a++) {
                        alert(data[a].title);
                    }
                }
            }
        }
    };
    xhr.send();
}
JSON Dump
{"data":[{"id":"1","title":"- Parts I & II”,”CODE”:”xxx”,”product_url":"http:\/\/www.example.com","image_url":"http:\/\/ecx.images-example.com\/images\/I\/61ujIIMyW7L.jpg","price":"$25.00"},{"id":"2","title”:”AJDJDDJDr”,”Code”:”XX”,”product_url":"http:\/\/www.example.com","image_url":"http:\/\/dc.images-example.com\/images\/I\/41jFVZL72YL.jpg","price":"$10.99"}]}
Try this ;)
The problem is in this loop condition:
for (var a = 0; a <= data.length; a++) {
                  ^
Here you are starting from 0 and looping up to and including data.length, but since array indexes start from 0, the last valid index is data.length - 1. So loop with a <= data.length - 1 OR a < data.length:
for(var a = 0; a < data.length; a++) {
OR
for(var a=0; a <= (data.length - 1); a++) {
Instead of a for loop, you can use for...in like this:
for(var index in data){
alert(data[index].title);
}
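One caveat with for...in on arrays: it iterates enumerable keys (including any inherited ones) rather than values, so for a plain array like data, Array.prototype.forEach is usually the safer choice. A minimal sketch assuming the same data array:

// Sketch only: forEach passes each element directly, so there are no index bounds to get wrong.
data.forEach(function (item) {
    alert(item.title);
});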

'Juggling Async' - Why does my solution not return anything at all?

After asking a question and getting a very helpful answer on what the 'Async Juggling' assignment in learnyounode was asking me to do, I set out to implement it myself.
The problem is, my setup isn't having any success! Even though I've referred to other solutions out there, my setup simply isn't returning any results when I do a learnyounode verify myscript.js.
GIST: jugglingAsync.js
var http = require('http');

var app = (function () {
    // Private variables...
    var responsesRemaining,
        urls = [],
        responses = [];

    var displayResponses = function() {
        for (var iterator in responses) {
            console.log(responses[iterator]);
        }
    };

    // Public scope...
    var pub = {};

    pub.main = function (args) {
        responsesRemaining = args.length - 2;

        // For every argument, push a URL and prep a response.
        for (var i = 2; i < args.length; i++) {
            urls.push(args[i]);
            responses.push('');
        }

        // For every URL, set off an async request.
        for (var iterator in urls) {
            var i = iterator;
            var url = urls[i];

            http.get(url, function(response) {
                response.setEncoding('utf8');
                response.on('data', function(data) {
                    if (response.headers.host == url)
                        responses[i] += data;
                });
                response.on('end', function() {
                    if (--responsesRemaining == 0)
                        displayResponses();
                });
            });
        }
    };

    return pub;
})();

app.main(process.argv);
Question: What am I doing wrong?
This line
for(var iterator in urls) {
doesn't do what you think it does. It actually loops over the properties of urls (see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/for...in). Instead, you have to do something like
for (var i = 0; i < urls.length; i++) {
    var url = urls[i];
    ...
}
or
urls.forEach(function(url, index) {
...
});
In addition to not properly looping through the arrays inside the app module, I was also not properly concatenating data returned from the response.on('data') event. Originally I was doing...
responses[index] += data;
Instead, the correct thing to do was:
responses[index] = responses[index] + data;
Changing that, as well as the things noted by @arghbleargh, got the 'Async Juggling' to fully verify!
I have tested my code and it all worked:
~ $ node juggling_async.js site1 site2 site3 site4 ...
The JS code is not limited to only three sites.
var http = require('http');

// Process all the site names from the arguments and store them in sites[].
// This way does not limit the count to only 3 sites.
var sites = [];
(function loadSites() {
    for (var i = 2, len = process.argv.length; i < len; ++i) {
        var site = process.argv[i];
        if (site.substr(0, 7) != 'http://') site = 'http://' + site;
        sites.push(site);
    }
})();

var home_pages = [];
var count = 0;

function httpGet(index) {
    var home_page = '';
    var site = sites[index];
    http.get(site, function(res) {
        res.setEncoding('utf8');
        res.on('data', function(data) {
            home_page += data;
        });
        res.on('end', function() {
            ++count;
            home_pages[index] = home_page;
            if (count == sites.length) {
                // Yahoo! We have reached the last one.
                for (var i = 0; i < sites.length; ++i) {
                    console.log('\n############ Site #' + (+i+1) + ': ' + sites[i]);
                    console.log(home_pages[i]);
                    console.log('============================================\n');
                }
            }
        });
    })
    .on('error', function(e) {
        console.log('Error at loop index ' + index + ': ' + e.message);
    });
}

for (var i = 0; i < sites.length; ++i) {
    httpGet(i);
}

Intermittent behavior in my AJAX, Greasemonkey script

I've a small Greasemonkey script that doesn't include any random part, but its results change with each page reload.
I'm a noob and I'm probably doing something wrong, but I don't know what. I hope you'll be able to help me.
The code is too large and too poorly written to be reproduced here, so I'll try to sum up my situation:
I have a list of links which have href=javascript:void(0) and onclick=f(link_id).
f(x) makes an XML HTTP request to the server, and returns the link address.
My script is meant to precompute f(x) and change the href value when the page loads.
I have a function wait() that waits for the page to load, then a function findLinks() that gets the nodes that are to be changed (with xpath).
Then a function sendRequest() that sends the xhr to the server. And, finally handleRequest() that asynchronously (r.onreadystatechange) retrieves the response, and sets the nodes previously found.
Do you see anything wrong with this idea?
Using a network analyzer, I can see that the request is always sent fine, and the response also.
Sometimes the href value is changed, but sometimes for some links it isn't and remains javascript:void(0).
I really don't see why it works only half the time...
function getUrlParameterFromString(urlString, name) {
    name = name.replace(/[\[]/, "\\\[").replace(/[\]]/, "\\\]");
    var regexS = "[\\?&]" + name + "=([^&#]*)";
    var regex = new RegExp(regexS);
    var results = regex.exec(urlString);
    if (results == null) {
        return "";
    } else {
        return results[1];
    }
}

function getUrlParameter(name) {
    return getUrlParameterFromString(window.location.href, name);
}

function wait() {
    var findPattern = "//a";
    var resultLinks = document.evaluate(findPattern, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
    if (resultLinks == null || resultLinks.snapshotLength == 0) {
        return setTimeout(_wait, 100);
    } else {
        for (var i = 0, len = resultLinks.snapshotLength; i < len; i++) {
            var node = resultLinks.snapshotItem(i);
            var s = node.getAttribute('onclick');
            var linkId = s.substring(2, s.length - 1); // f(x) -> x
            sendRequest(linkId, node);
        }
    }
}

function sendRequest(linkId, nodeToModify) {
    window.XMLHttpRequest ? r = new XMLHttpRequest : window.ActiveXObject && (r = new ActiveXObject("Microsoft.XMLHTTP"));
    if (r) {
        r.open("POST", "some_url", !0);
        r.onreadystatechange = function () {
            handleRequest(nodeToModify, linkId, r);
        };
        r.setRequestHeader("Content-type", "application/x-www-form-urlencoded");
        r.send(linkId);
    }
}

function handleRequest(nodeToModify, num, r) {
    if (r.readyState == 4) {
        if (r.status == 200) {
            console.log('handleRequest() used');
            var a = r.responseText;
            if (a == null || a.length < 10) {
                sendRequest(num, nodeToModify);
            } else {
                var url = unescape((getUrlParameterFromString(a, "url")).replace(/\+/g, " "));
                nodeToModify.setAttribute('href', url);
                nodeToModify.setAttribute('onclick', "");
            }
        } else {
            alert("An error occurred: " + r.statusText);
        }
    }
}

wait();
It looks like that script will change exactly 1 link. Look up "closures"; this loop:
for (var i = 0, len = resultLinks.snapshotLength; i < len; i++) {
    var node = resultLinks.snapshotItem(i);
    var s = node.getAttribute('onclick');
    var linkId = s.substring(2, s.length - 1); // f(x) -> x
    sendRequest(linkId, node);
}
needs a closure so that sendRequest() gets the correct values. Otherwise, only the last link will be modified.
Try:
for (var i = 0, len = resultLinks.snapshotLength; i < len; i++) {
    var node = resultLinks.snapshotItem(i);
    var s = node.getAttribute('onclick');
    var linkId = s.substring(2, s.length - 1); // f(x) -> x

    //-- Create a closure so that sendRequest gets the correct values.
    (function (linkId, node) {
        sendRequest(linkId, node);
    })(linkId, node);
}
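A variant of the same fix, assuming resultLinks and sendRequest() from the script above: moving the loop body into a named helper (processLink is a hypothetical name) gives each iteration its own scope without an inline IIFE.

// Sketch only: each call to processLink captures its own linkId and node.
function processLink(node) {
    var s = node.getAttribute('onclick');
    var linkId = s.substring(2, s.length - 1); // f(x) -> x
    sendRequest(linkId, node);
}

for (var i = 0, len = resultLinks.snapshotLength; i < len; i++) {
    processLink(resultLinks.snapshotItem(i));
}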
