I'm trying to scrape images off a page, but the page returns a placeholder src attribute if it isn't fully loaded (it takes about 0.5 seconds to fully load). How would I make request wait?
I tried doing:
function findCommonMovies(movie, callback){
    request('http://www.imdb.com/find?ref_=nv_sr_fn&q='+ movie +'&s=all', function (error, response, body) {
        if (error){
            return
        }else{
            var $ = cheerio.load(body);
            var title = $(".result_text").first().text().split("(")[0].split(" ").join('')
            var commonMovies = []
            // var endurl = $("a[name=tt] .result_text a").attr("href")
            var endurl = $('a[name=tt]').parent().parent().find(".findSection .findList .findResult .result_text a").attr("href");
            request('http://www.imdb.com' + endurl, function (err, response, body) {
                if (err){
                    console.log(err)
                }else{
                    setInterval(function(){var $ = cheerio.load(body)}, 2000)
                    $(".rec_page .rec_item a img").each(function(){
                        var title = $(this).attr("title")
                        var image = $(this).attr("src")
                        commonMovies.push({title: title, image: image})
                    });
                }
                callback(commonMovies)
            });
        }
    });
}

findCommonMovies("Gotham", function(common){
    console.log(common)
})
Cheerio is not a web browser; it's just an HTML parser. That means the JavaScript that makes the page's async requests is never executed.
So you can't do what you want unless you use something that acts as a web browser. Selenium, for example, adds an API on top of many web browsers.
You need to download the Selenium standalone server and keep it running for as long as you want to keep making requests to sites that load content asynchronously.
You are also going to need a client wrapper for the language you are using and a webdriver for the browser you want; the webdriver is what adds support for a particular browser.
I assume you are using Node.js or something similar based on JS, so here you go.
And be sure to check the API.
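To make this concrete, here is a rough sketch using the selenium-webdriver package for Node. This is my guess at the wiring, not a tested solution: the CSS selector is copied from the question, the function name scrapeRecommendedMovies is mine, and the 10-second/1-second waits are arbitrary assumptions.
// npm install selenium-webdriver (plus a browser driver such as chromedriver)
const { Builder, By, until } = require('selenium-webdriver');

async function scrapeRecommendedMovies(url) {
    const driver = await new Builder().forBrowser('chrome').build();
    try {
        await driver.get(url);
        // Wait until the recommendation images exist, then give the page's own
        // JavaScript a moment to swap out the placeholder src attributes.
        await driver.wait(until.elementLocated(By.css('.rec_page .rec_item a img')), 10000);
        await driver.sleep(1000);
        const imgs = await driver.findElements(By.css('.rec_page .rec_item a img'));
        const commonMovies = [];
        for (const img of imgs) {
            commonMovies.push({
                title: await img.getAttribute('title'),
                image: await img.getAttribute('src')
            });
        }
        return commonMovies;
    } finally {
        await driver.quit();
    }
}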
Hope to be of some help.
You could also check PhantomJS.
You can set a timeout:
var options = {
    url: 'http://www.imdb.com/find?ref_=nv_sr_fn&q='+ movie +'&s=all',
    timeout: 10000 // wait up to 10 seconds for a response (the value is in milliseconds)
}
request(options, function(err, response, body){
    if (err) {
        console.log(err);
    }
    //do what you want here
});
setTimeout(function, millisecondsToWait) will delay running the function by that many milliseconds:
setTimeout(function(){var $ = cheerio.load(body)}, 2000)
It looks to me like your callback is in the wrong place, and there should be no need for any timer: when request() calls its callback, the whole response is already available.
Here's the code with the callback in the right place, also changed to take an error argument so the caller can detect and propagate errors:
function findCommonMovies(movie, callback){
    request('http://www.imdb.com/find?ref_=nv_sr_fn&q='+ movie +'&s=all', function (error, response, body) {
        if (error) {
            callback(error);
            return;
        } else {
            var $ = cheerio.load(body);
            var title = $(".result_text").first().text().split("(")[0].split(" ").join('')
            var commonMovies = [];
            // var endurl = $("a[name=tt] .result_text a").attr("href")
            var endurl = $('a[name=tt]').parent().parent().find(".findSection .findList .findResult .result_text a").attr("href");
            request('http://www.imdb.com' + endurl, function (err, response, body) {
                if (err) {
                    console.log(err)
                    callback(err);
                } else {
                    var $ = cheerio.load(body);
                    $(".rec_page .rec_item a img").each(function(){
                        var title = $(this).attr("title");
                        var image = $(this).attr("src");
                        commonMovies.push({title, image});
                    });
                    callback(null, commonMovies);
                }
            });
        }
    });
}

findCommonMovies("Gotham", function(err, common) {
    if (err) {
        console.log(err);
    } else {
        console.log(common)
    }
});
Note: This will access ONLY the HTML markup served by the server for the URLs you request. If those pages have content that is inserted by browser Javascript, that content will not be present in what you get here, and no delay will make it appear. That's because cheerio does not run browser Javascript; it JUST parses the HTML that the server originally sends. To run browser Javascript, you need a more complete browser engine than cheerio provides, such as PhantomJS, which will actually run the page's Javascript.
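For completeness, here is a rough PhantomJS sketch of the same idea (run with phantomjs, not node). The URL is a hypothetical placeholder, the selector comes from the question, and the 2-second wait is an assumption.
// save as scrape.js and run: phantomjs scrape.js
var page = require('webpage').create();
var url = 'http://www.imdb.com/title/SOME_TITLE/'; // hypothetical placeholder URL

page.open(url, function (status) {
    if (status !== 'success') {
        console.log('failed to load ' + url);
        phantom.exit(1);
        return;
    }
    // Give the page's own JavaScript time to replace the placeholder images.
    setTimeout(function () {
        var commonMovies = page.evaluate(function () {
            var out = [];
            var imgs = document.querySelectorAll('.rec_page .rec_item a img');
            for (var i = 0; i < imgs.length; i++) {
                out.push({ title: imgs[i].getAttribute('title'), image: imgs[i].getAttribute('src') });
            }
            return out;
        });
        console.log(JSON.stringify(commonMovies));
        phantom.exit();
    }, 2000);
});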
Related
Below is my code for the /search page on my local web server. I want to be able to update the page and display the most recent search query. The problem is that reloading the page runs the portion that writes to a file, which inadvertently reloads the page, and so on, leading to a vicious cycle of writing and reloading that can quickly get out of hand. Can you help me find a way to avoid this?
app.get('/search/', async (req, res) => {
    //console.log(req.body)
    // console.log(req.body.query)
    var act = null;
    var intervalId = setInterval(async function() {
        if (req.body.query) {
            act = await searchTweets(req.body.query);
        } else {
            act = await searchTweets("CVE");
        }
        fs.writeFile("./search.html", `<!DOCTYPE html> <html> <body>\n<a href='http://localhost:${port}/index.html' class='button'>Return to homepage</a>\n` + act + '\n</body>\n</html>', function(err) {
            if (err) {
                return console.error(err);
            }
        });
    }, 5000);
    fs.readFile("./search.html", 'utf8', function(err, data) {
        res.send(data);
        // Display the file content
        console.log(data);
    });
});
P.S. To monitor the file for changes I am using livereload:
liverlserver.watch("../BasicServer");
I want to brute-force loop through pages, for example page1.com - page100.com, and get some element data from each. How can I do this? Preferably with JavaScript, if possible.
If you are using Node, take a look at crawler. It's like Scrapy for Python.
You could use it like below (from the docs):
var Crawler = require('crawler');
var c = new Crawler({
    rateLimit: 2000,
    maxConnections: 1,
    callback: function(error, res, done) {
        if (error) {
            console.log(error);
        } else {
            var $ = res.$;
            console.log($('title').text());
        }
        done();
    }
});
c.queue(['http://www.page1.com','http://www.page2.com'], ...)
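For the page1.com - page100.com case from the question, you could build the list in a loop and queue the whole array (a small sketch; the URL pattern is just the one given in the question):
var pages = [];
for (var i = 1; i <= 100; i++) {
    pages.push('http://www.page' + i + '.com');
}
c.queue(pages); // crawler will fetch them one at a time because of maxConnections: 1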
I need to download ~50k webpages, get some data from them, and put it into a variable.
I wrap each request in a Promise and then Promise.all() them. I use the Request library.
Simplified code:
const request = require('request');

const urls = [url1, url2, ...];
const promises = [];

urls.forEach(url => {
    promises.push((resolve, reject) => {
        request(url, (error, response, body) => {
            if (error) { reject(error); return; }
            // do something with page
            resolve(someData);
        });
    });
});

Promise.all(promises.map(pr => new Promise(pr)))
    .then((someDataArray) => { /* process data */ });
But I receive an ENFILE exception, which means too many open files in the system (on my desktop the maximum number of open files is 2048).
I know that Promises execute on creation, but I can't see how to solve this problem.
Maybe there is another approach to do this?
Thanks for any response.
What you want is to launch N requests then start a new one whenever one finishes (be it successful or not).
There are many libraries for that but it's important to be able to implement this kind of limitation yourself:
const request = require('request');

const urls = [url1, url2, ...];
const MAX_QUERIES = 10;
var remaining = urls.length;

function startQuery(url){
    if (!url) return;
    request(url, (error, response, body) => {
        if (error) {
            // handle error
        } else {
            // handle result
        }
        startQuery(urls.shift());
        if (--remaining == 0) return allFinished();
    });
}

for (var i = 0; i < MAX_QUERIES; i++) startQuery(urls.shift());

function allFinished(){
    // all done
}
You can try async.forEachLimit, where you define a limit on the number of concurrent requests. It never runs more than that many requests at the same time, starting the next one as soon as one of the running requests finishes.
Install the package using npm install --save async
async.forEachLimit(urls, 50, function(url, callback) {
    // process url using the request module
    callback();
}, function(err) {
    if (err) return console.error(err);
    console.log("All urls are processed");
});
For further help, look at: https://caolan.github.io/async/docs.html
Others have said how to do the flow control using async or promises, and I won't repeat them. Personally, I prefer the async JS method, but that's just my preference.
There are two things they did not cover, however, which I think are as important as flow control if you want your script to be performant and reliable.
1) Don't rely on the callbacks or promises to handle processing the files. All examples provided so far use those. Myself, I would make use of the request streams API instead, to treat the request as a readable stream and pipe that stream to a writable that processes it. The simplest example is to use fs to write the file to the file system. This makes much better use of your system resources, since it writes each data chunk to storage as it comes in rather than holding the whole file in memory. You can then call the callback or resolve the promise when the stream ends (see the sketch after this list).
2) You should not try to process an in-memory list of 50k URLs. If you do and you fail on, let's say, the 20,000th URL, then you have to figure out how to sort the done ones from the not-done ones and update your code or the JSON file you read them from. Instead, use a database (any will do) that has a collection/table/whatever of URLs and metadata about them. When your program runs, query for the ones that don't have the attributes indicating that they have been successfully fetched, and then when you successfully fetch them or the request fails, you can use that same data structure to give you some intelligence about why it failed or when it succeeded.
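To illustrate point 1, here is a minimal sketch (assuming the request module; downloadToFile is just an illustrative name) that streams one URL straight to disk and resolves a promise when the write stream has flushed everything:
const fs = require('fs');
const request = require('request');

// Stream the response to disk chunk by chunk instead of buffering it in memory.
function downloadToFile(url, targetPath) {
    return new Promise((resolve, reject) => {
        const file = fs.createWriteStream(targetPath);
        request(url)
            .on('error', reject)   // network-level errors
            .pipe(file);
        file.on('error', reject);  // file system errors
        file.on('finish', () => resolve(targetPath)); // all chunks written
    });
}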
Install the async package and use forEachLimit to limit the number of concurrent operations.
const request = require('request');

const urls = [];
for (var temp = 0; temp < 1024; temp++) {
    urls.push("http://www.google.com");
}

const async = require("async");
var i = 0;

async.forEachLimit(urls, 10, function(url, callback) {
    request(url, (error, response, body) => {
        if (error) {
            callback(error);
            return;
        }
        var somedata = null;
        console.log(++i);
        callback(null, somedata);
    });
}, function(err) {
    /* process data */
});
As said in the comments, you could use the async.js module:
const request = require('request');
const async = require('async');

var listOfUrls = [url1, url2, ...];

async.mapLimit(listOfUrls, 10, function(url, callback) {
    // iterator function
    request(url, function(error, response, body) {
        if (!error && response.statusCode == 200) {
            var dataFromPage = ""; // get data from the page
            callback(null, dataFromPage);
        } else {
            callback(error || response.statusCode);
        }
    });
}, function(err, results) {
    // completion function
    if (!err) {
        // process all results in the array here
        // Do something with the data
        resolve(results); // assumes this is wrapped in a Promise executor (see below)
    } else {
        // handle error here
        console.log(err);
    }
});
Here you will process 10 URLs at a time; when all URLs have been processed, the completion callback is called, and there you can process your data.
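Since resolve in the completion function above only exists if the whole thing is wrapped in a Promise, here is one way that wrapping could look (a sketch; fetchAll is an illustrative name, not from the original answer):
function fetchAll(listOfUrls) {
    return new Promise(function(resolve, reject) {
        async.mapLimit(listOfUrls, 10, function(url, callback) {
            request(url, function(error, response, body) {
                if (!error && response.statusCode == 200) {
                    callback(null, body); // or whatever data you extract from the page
                } else {
                    callback(error || response.statusCode);
                }
            });
        }, function(err, results) {
            if (err) reject(err);
            else resolve(results); // results is in the same order as listOfUrls
        });
    });
}

// fetchAll(listOfUrls).then(results => { /* process data */ }).catch(console.error);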
I have updated the post with the actual code.
The problem is that the node app hangs and does not exit unless I comment out the query in addArticle. I am wondering what I'm doing wrong here (in regard to the hanging problem).
function addArticle(title, text, date, link) {
    connection.query("SELECT * FROM articles WHERE link LIKE " + connection.escape(link), function(error, rows, fields) {
        if (rows.length == 0) {
            console.log("article not in database");
            console.log(connection.escape(title));
            var values = [connection.escape(title), connection.escape(text), date, connection.escape(link), '{}'];
            connection.query("INSERT INTO articles (title, text, date, link, topics) VALUES ?", [[values]], function(err) {
                if (err) throw err;
            });
        }
    });
}
function scrapeReuters() {
    var url = 'http://www.reuters.com/news/archive/technologyNews?date=10092013';
    request(url, function(err, resp, body){
        $ = cheerio.load(body);
        links = $('a');
        $(links).each(function(i, link){
            var addr = $(link).attr('href');
            if (addr != undefined && addr.indexOf('article') != -1 && addr.indexOf('http') == -1) {
                var full_link = "http://www.reuters.com" + addr;
                var title = $(link).text();
                request(full_link, function(err, resp, body){
                    $ = cheerio.load(body);
                    para = $('p').text();
                    addArticle(title, para, new Date().getTime(), full_link);
                });
            }
        });
    });
}
You probably need to close the connection after all the queries have finished. You can try using the https://github.com/caolan/async library to run the queries in sequence and then close the connection in a master callback.
It's a little tricky, but first you need to define an array of functions to execute. Then you run async.series(arrayOfFns, masterCallback). The master callback gets err and results (results is an array holding the result of every function). In that master callback, terminate the MySQL connection and/or end the process.
To do this, I would rewrite the addArticle query to just return the query string. Then, before your $(links).each loop, I would make an array called toInsert.
In each loop iteration I would do:
toInsert.push(function(callback) {
    connection.query(addArticle(...), function(err) {
        if (err) callback(err);
        else callback(null, true);
    });
});
Then after the loop, run:
async.series(toInsert, function(err, results) {
    connection.end();   // with the mysql module the call is end(), not close()
    process.exit();     // maybe, if needed?
});
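One extra note on the shutdown step: with the mysql module, connection.end() takes a callback, flushes any queued queries, and then closes the socket; once nothing else is pending, Node exits on its own, so process.exit() is usually unnecessary. A small sketch of the master callback with that in place (names as above):
async.series(toInsert, function(err, results) {
    if (err) console.error(err);
    connection.end(function(endErr) { // waits for queued queries, then closes
        if (endErr) console.error(endErr);
        // no process.exit() needed: node exits once the event loop is empty
    });
});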
I'm writing a downloader with node.js and the request module. Using the stream syntax I'm doing
var r = request(url).pipe(fs.createWriteStream(targetPath));
r.on('error', function(err) { console.log(err); });
r.on('finish', cb);
to download the file, save it and call the callback. However, in almost 50% of the cases the file is either empty or not created at all. No error event is emitted. It seems like the finish event is triggered even though the file wasn't (completely) written yet.
Context: The whole thing is wrapped into async.each calls.
Any clues? Thanks!
You need to close the file before accessing it:
var file = fs.createWriteStream(targetPath);
var r = request(url).pipe(file);
r.on('error', function(err) { console.log(err); });
r.on('finish', function() { file.close(cb) });
Incidentally, if the URL replies with an HTTP error (such as a 404 Not Found), that won't trigger the 'error' event, so you should probably check for that separately:
function handleFailure(err) { console.log(err); }

var file = fs.createWriteStream(targetPath);
// close is async, and you have to wait until close completes otherwise
// you'll (very rarely) not be able to access the file in the callback.
file.on('finish', function() { file.close(cb) });
file.on('error', handleFailure);

request(url)
    .on('response', function(response) {
        if (response.statusCode != 200) {
            console.log("oops, got a " + response.statusCode);
            this.abort(); // don't stream the error body into the file
        }
    })
    .on('error', handleFailure)
    .pipe(file);