Using Multiple page.open in Single Script - javascript

My goal is to execute PhantomJS by using:
// adding $op and $er for debugging purposes
exec('phantomjs script.js', $op, $er);
print_r($op);
echo $er;
Then, inside script.js, I plan to use multiple page.open() calls to capture screenshots of different pages, such as:
var url = 'some dynamic url goes here';
page = require('webpage').create();
page.open(url, function (status) {
    console.log('opening page 1');
    page.render('./slide1.png');
});

page = require('webpage').create();
page.open(url, function (status) {
    console.log('opening page 2');
    page.render('./slide2.png');
});

page = require('webpage').create();
page.open(url, function (status) {
    console.log('opening page 3');
    page.render('./slide3.png');
    phantom.exit(); //<-- Exiting phantomJS only after opening all 3 pages
});
On running exec, I get the following output on the page:
Array ( [0] => opening page 3 ) 0
As a result I only get the screenshot of the third page. I'm not sure why PhantomJS skips the first and second blocks of code (evident from the missing console.log() messages that those blocks were supposed to print) and only executes the third block.

The problem is that the second page.open is being invoked before the first one finishes, which can cause multiple problems. You want logic roughly like the following (assuming the filenames are given as command line arguments):
function handle_page(file){
    page.open(file, function(){
        ...
        page.evaluate(function(){
            ...do stuff...
        });
        page.render(...);
        setTimeout(next_page, 100);
    });
}
function next_page(){
    var file = args.shift();
    if(!file){ phantom.exit(0); return; }
    handle_page(file);
}
next_page();
Right, it's recursive. This ensures that the processing of the function passed to page.open finishes, with a little 100ms grace period, before you go to the next file.
By the way, you don't need to keep repeating
page = require('webpage').create();
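Putting the pieces together, a complete version might look roughly like this (a sketch, not the answer verbatim: it assumes the filenames or URLs arrive as command line arguments and that a 100ms grace period is enough):
var system = require('system');
var page = require('webpage').create();   // create the page once and reuse it
var args = system.args.slice(1);          // everything after the script name
var count = 0;

function handle_page(file) {
    page.open(file, function (status) {
        count++;
        page.render('./slide' + count + '.png');
        // small grace period before moving on to the next file
        setTimeout(next_page, 100);
    });
}

function next_page() {
    var file = args.shift();
    if (!file) {
        phantom.exit(0);
        return;
    }
    handle_page(file);
}

next_page();
Run it as, for example, phantomjs script.js http://example.com/a http://example.com/b.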

I've tried the accepted answer's suggestions, but they don't work (at least not for v2.1.1).
To be accurate, the accepted answer worked some of the time, but I still experienced sporadic failed page.open() calls, roughly 90% of the time on specific data sets.
The simplest answer I found is to instantiate a new page module for each url.
// first page
var urlA = "http://first/url"
var pageA = require('webpage').create()

pageA.open(urlA, function(status){
    if (status){
        setTimeout(openPageB, 100) // open second page call
    } else {
        phantom.exit(1)
    }
})

// second page
var urlB = "http://second/url"
var pageB = require('webpage').create()

function openPageB(){
    pageB.open(urlB, function(){
        // ...
        // ...
    })
}
The page module API documentation says the following about the close method:
close() {void}
Close the page and releases the memory heap associated with it. Do not use the page instance after calling this.
Due to some technical limitations, the web page object might not be completely garbage collected. This is often encountered when the same object is used over and over again. Calling this function may stop the increasing heap allocation.
Basically, after I tested the close() method I decided that reusing the same web page instance for different open() calls is too unreliable, and it needed to be said.
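If you do create a separate page per URL, it may also be worth closing each page as soon as you are done with it, so the heap it holds can be reclaimed before the next one opens. A sketch based on the snippet above (the render call is just an example of per-page work):
var pageA = require('webpage').create()
pageA.open(urlA, function(status){
    if (status){
        pageA.render('./pageA.png')   // example per-page work
        pageA.close()                 // release the memory held by this page
        setTimeout(openPageB, 100)    // then open the second page
    } else {
        phantom.exit(1)
    }
})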

You can use recursion:
var page = require('webpage').create();

// the urls to navigate to
var urls = [
    'http://phantomjs.org/',
    'https://twitter.com/sidanmor',
    'https://github.com/sidanmor'
];
var i = 0;

// the recursion function
var genericCallback = function () {
    return function (status) {
        console.log("URL: " + urls[i]);
        console.log("Status: " + status);

        // exit if there was a problem with the navigation
        if (!status || status === 'fail') phantom.exit();

        i++;
        if (status === "success") {
            //-- YOUR STUFF HERE ----------------------
            // do your stuff here... I'm taking a picture of the page
            page.render('example' + i + '.png');
            //-----------------------------------------

            if (i < urls.length) {
                // navigate to the next url; the callback is this function again (recursion)
                page.open(urls[i], genericCallback());
            } else {
                // there is no next url (urls[i] is undefined), so use a callback that just exits
                page.open(urls[i], function () {
                    phantom.exit();
                });
            }
        }
    };
};

// start from the first url
page.open(urls[i], genericCallback());

Using a queue of tasks, for example:
var page = require('webpage').create();

// Queue class helper
var Queue = function() {
    this._tasks = [];
};
Queue.prototype.add = function(fn, scope) {
    this._tasks.push({fn: fn, scope: scope});
    return this;
};
Queue.prototype.process = function() {
    var proxy, task, self = this;
    task = this._tasks.shift();
    if (!task) { return; }
    proxy = { end: function() { self.process(); } };
    task.fn.call(task.scope, proxy);
    return this;
};
Queue.prototype.clear = function() {
    this._tasks = [];
    return this;
};

// Init pages .....
var q = new Queue();
q.add(function(proxy) {
    page.open(url1, function() {
        // page.evaluate
        proxy.end();
    });
});
q.add(function(proxy) {
    page.open(url2, function() {
        // page.evaluate
        proxy.end();
    });
});
q.add(function(proxy) {
    page.open(urln, function() {
        // page.evaluate
        proxy.end();
    });
});
// .....
q.add(function(proxy) {
    phantom.exit();
    proxy.end();
});
q.process();
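If the URLs live in an array, the same queue can be filled in a loop rather than with one q.add() per page (a sketch reusing the Queue class above; the render call is just an example of per-page work):
var urls = [url1, url2, urln];
urls.forEach(function (u, idx) {
    q.add(function (proxy) {
        page.open(u, function () {
            page.render('shot' + idx + '.png');   // or page.evaluate(...)
            proxy.end();
        });
    });
});
q.add(function (proxy) {
    phantom.exit();   // last task: shut PhantomJS down
});
q.process();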
I hope this is useful, regards.

Related

Scraping dynamically rendered links from an infinite scrollbar in CasperJS

I'm trying to scrape the links on the left sidebar of this page using CasperJS.
The page has hundreds of links in the sidebar, but only loads 20 at a time when you scroll down. This code successfully grabs the first 20 (needs casperjs and phantomjs globally installed to run):
var casper = require('casper').create();

// helper function that gets all of the resume links on the page:
var getAllLinks = function() {
    var linksOnThisPage = []
    $("a[href^='/ResumeB']").each(function(index, linkDiv) {
        $linkDiv = $(linkDiv)
        linksOnThisPage.push('http://www.super-resume.com' + $linkDiv.attr('href'))
    });
    return linksOnThisPage
};

//start casper, go to page, run helper function:
casper.start('http://www.super-resume.com/ResumeBuilder.jtp?query=Database+Administrator', function() {
    allLinks = casper.evaluate(getAllLinks)
    console.log('number of links found:', allLinks.length);
});
casper.run();
I can make the page scroll down in the actual browser with this:
$('#search-left-inner').scrollTop(10000);
10000 is an arbitrarily big number; every time you run that code in the browser, it loads 20 more links. (Ideally I'd like to be able to grab all at once without having to keep reloading 20 at a time, but that's less pressing for now.)
If I put that line inside the getAllLinks function like so:
var getAllLinks = function() {
    $('#search-left-inner').scrollTop(10000);
    var linksOnThisPage = []
    //etc, etc,
it still only loads 20 links. Many similar posts discuss synchronicity issues, so I've tried to get it to wait for the sidebar to finish loading in a few ways, including this:
var getAllLinks = function() {
    casper.then(function () {
        $('#search-left-inner').scrollTop(100000);
    });
    casper.then(function () {
        var linksOnThisPage = []
        //etc. etc.
    }
but now for some reason it only finds one link instead of 20.
I presume that if you scroll, it doesn't immediately load the next items, because loading takes time. You need to wait a little after scrolling before you can attempt to scrape all of the elements again.
casper.start(url)
    .thenEvaluate(scroll)
    .wait(5000, function(){
        var links = this.evaluate(getAllLinks);
        this.echo(links.length);
    })
    .run();
If this produces more links, then you can try the next step: infinite scrolling until no new elements are loaded. This can be done with asynchronous recursion in CasperJS:
var linkCount = -1;

function getAllLinks() {
    var linksOnThisPage = []
    $("a[href^='/ResumeB']").each(function(index, linkDiv) {
        $linkDiv = $(linkDiv)
        linksOnThisPage.push('http://www.super-resume.com' + $linkDiv.attr('href'))
    });
    return linksOnThisPage
}

function scroll() {
    $('#search-left-inner').scrollTop(10000);
}

/**
 * Returns true if more elements were loaded than were there before
 */
function checkMore(){
    var newCount = this.evaluate(getAllLinks).length;
    if (linkCount === -1) {
        linkCount = newCount;
        return false;
    }
    if (newCount > linkCount) {
        linkCount = newCount;   // remember the new total so the next iteration waits again
        return true;
    }
    return false;
}

/**
 * Executes a single iteration step and waits for a change in numbers.
 * Terminates if there are no changes in 6 seconds.
 */
function step(){
    this.thenEvaluate(scroll)
        .waitFor(checkMore, step, function _onTimeout(){
            var links = this.evaluate(getAllLinks);
            this.echo("finished with " + links.length + " links\n\n" + links.join("\n"));
        }, 6000);
}

casper.start(url, step).run();
Keep in mind that it only makes sense to use jQuery in the DOM context (page context), which is inside of casper.evaluate(). I suggest that you also read the PhantomJS documentation for that function.
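For example, values have to be passed back out of the page context explicitly, since the two contexts don't share variables (a small sketch, assuming the page's own jQuery is available as in the question):
var count = casper.evaluate(function () {
    // this runs inside the page, where jQuery and the DOM are available
    return $("a[href^='/ResumeB']").length;
});
// back in the casper context, only the returned (serializable) value is visible
casper.echo('links currently in the DOM: ' + count);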

Using phantom.js to scrape data

Following on from this question, I am trying to scrape data using phantomjs, modifying a script from here:
My goal is to integrate a working function (see the 2nd code snippet) into the script below (1st code snippet). I have tried doing this but keep getting errors. Is there a way I can actually do the integration?
(Note: I'm using phantomjs because the site is an Angular app whose initial HTML doesn't contain any of the data I am looking for, so I need a headless web browser. I need to load the page in memory, wait for Angular to do its thing (a set delay of some sort), and then scrape the rendered DOM.)
The errors (and output) I get when I execute my script (phantomjs scraping.js) are as follows:
console> SPR-ERROR: 103 - Invalid published date console> v6
ReferenceError: Can't find variable: angular
http://stage.inc.com/js/Inc5000ListApp.js?UPDATE1:2
http://www.inc.com/inc5000/index.html:2485
console> SPR-ERROR:103 - Invalid published date (date)
====================================================
Step "0"
====================================================
console>Reached scrapeData
console>
It seems like it is connecting to the desired site. How do I modify the script below to fit the extraction code at the bottom of this question:
var page = new WebPage(),
    url = 'http://www.inc.com/inc5000/index.html',
    stepIndex = 0;

/**
 * From PhantomJS documentation:
 * This callback is invoked when there is a JavaScript console message. The callback may accept up to three arguments:
 * the string for the message, the line number, and the source identifier.
 */
page.onConsoleMessage = function (msg, line, source) {
    console.log('console> ' + msg);
};

/**
 * From PhantomJS documentation:
 * This callback is invoked when there is a JavaScript alert. The only argument passed to the callback is the string for the message.
 */
page.onAlert = function (msg) {
    console.log('alert!!> ' + msg);
};

// Callback is executed each time a page is loaded...
page.open(url, function (status) {
    if (status === 'success') {
        // State is initially empty. State is persisted between page loads and can be used for identifying which page we're on.
        console.log('============================================');
        console.log('Step "' + stepIndex + '"');
        console.log('============================================');

        // Inject jQuery for scraping (you need to save jquery-1.6.1.min.js in the same folder as this file)
        page.injectJs('jquery-1.6.1.min.js');

        // Our "event loop"
        if (!phantom.state) {
            //initialize();
            scrapeData();
        } else {
            phantom.state();
        }

        // Save screenshot for debugging purposes
        page.render("step" + stepIndex++ + ".png");
    }
});

function scrapeData() {
    page.evaluate(function () {
        console.log('Reached scrapeData');
        var DATA = [];
        $('tr.ng-scope').each(function () {
            var $tds = $(this).find('td');
            DATA.push({
                rank: $tds.eq(0).text(),
                company: $tds.eq(1).text(),
                growth: $tds.eq(2).text(),
                revenue: $tds.eq(3).text(),
                industry: $tds.eq(4).text()
            });
        });
        console.log(DATA);
    });
    phantom.state = parseResults;
    // scraping code here
}

// Step 1
function initialize() {
    page.evaluate(function () {
        console.log('Searching...');
    });
    // Phantom state doesn't change between page reloads
    // We use the state to store the search result handler, i.e. the next step
    phantom.state = parseResults;
}

// Step 2
function parseResults() {
    page.evaluate(function () {
        $('#search-result a').each(function (index, link) {
            console.log($(link).attr('href'));
        })
        console.log('Parsed results');
    });
    // If there was a 3rd step we could point to another function
    // but we would have to reload the page for the callback to be called again
    phantom.exit();
}
I know the code below works in the console, but how can I integrate it with the script above to successfully scrape data from multiple pages on the site:
request('http://www.inc.com/inc5000/index.html', function (error, response, html) {
    if (error || response.statusCode != 200) return;
    var $ = cheerio.load(html);
    var DATA = [];
    $('tr.ng-scope').each(function () {
        var $tds = $(this).find('td');
        DATA.push({
            rank: $tds.eq(0).text(),
            company: $tds.eq(1).text(),
            growth: $tds.eq(2).text(),
            revenue: $tds.eq(3).text(),
            industry: $tds.eq(4).text()
        });
    });
    console.log(DATA);
});
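For what it's worth, the same extraction logic can run inside page.evaluate() as long as the result is returned to the outer PhantomJS context; a rough sketch based on the snippets above (the 5-second delay for Angular to render is an arbitrary guess, not a tested value):
page.open(url, function (status) {
    if (status !== 'success') { phantom.exit(1); return; }
    page.injectJs('jquery-1.6.1.min.js');
    // give Angular time to render before scraping
    setTimeout(function () {
        var data = page.evaluate(function () {
            var rows = [];
            $('tr.ng-scope').each(function () {
                var $tds = $(this).find('td');
                rows.push({
                    rank: $tds.eq(0).text(),
                    company: $tds.eq(1).text(),
                    growth: $tds.eq(2).text(),
                    revenue: $tds.eq(3).text(),
                    industry: $tds.eq(4).text()
                });
            });
            return rows;   // returned to the PhantomJS context as plain data
        });
        console.log(JSON.stringify(data, null, 2));
        phantom.exit();
    }, 5000);
});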

WinJS: Loading data

I'm trying to develop my first Windows 8 Store app (HTML/JS). I am using the Grid App template, which I think suits my needs best.
This is my model:
I have three entities: 1. GalleryCategory 2. Gallery 3. GalleryItem.
A Gallery is linked to exactly one Category. A GalleryItem is linked to exactly one Gallery...so nothing fancy here...
I'm using the out-of-the-box data.js file to load all categories and all galleries on startup of the app. But when I open galleryDetail.html (which is supposed to show all the images of a particular gallery), I want to load that gallery's images only then (to avoid too much loading at the beginning).
And now I'm finally coming to the point that I do not understand:
How can I manage this? I mean:
WinJS.UI.Pages.define("/pages/galleryDetail/galleryDetail.html", {
    // This function is called whenever a user navigates to this page. It
    // populates the page elements with the app's data.
    ready: function (element, options) {
        var item = options && options.item ? Data.resolveItemReference(options.item) : Data.items.getAt(0);
        element.querySelector(".titlearea .pagetitle").textContent = item.group.title;
        element.querySelector("article .item-title").textContent = item.title;
        element.querySelector("article .item-subtitle").textContent = item.subtitle;
        element.querySelector("article .item-image").src = item.backgroundImage;
        element.querySelector("article .item-image").alt = item.subtitle;
        element.querySelector("article .item-content").innerHTML = item.content;
        element.querySelector(".content").focus();

        var galleryId = item.key;
        WinJS.xhr({ url: "http://someUrlToAnAspNetWebsite/Handlers/GalleryItemsHandler.ashx?galleryId=" + galleryId }).done(
            // Complete function
            function (response) {
                var items = JSON.parse(response.responseText);
                items.forEach(function (item) {
                    galleryItemsList.push(item);
                });
                dataList = new WinJS.Binding.List(galleryItemsList);
                var galleryItemsListView = document.getElementById('galleryItemsListView').winControl;
                galleryItemsList.itemDataSource = dataList.dataSource;
            },
            // Error function
            function (response) {
                // handle error here...
            },
            // Progress function
            function (response) {
                // progress implementation goes here...
            }
        );
    },
My problem is obvious: the ready function continues/ends before the data is retrieved, as the async call takes a while.
But I thought using the promise (.done()) would do this for me (synchronising the threads)? Or do I need to use the join() function? If so, where and how? Sorry for my issues with this...
Thanks for any help...
The ready function itself is an async function, so you only have to return a promise to tell its caller that it's not done until some promise is resolved. So you can fix your issue with seven keystrokes: just add return before the WinJS.xhr call.
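For illustration, a minimal sketch of the fix (hedged: it keeps the question's URL and element IDs, and uses then() rather than done(), since then() returns a promise that can be handed back to the caller):
WinJS.UI.Pages.define("/pages/galleryDetail/galleryDetail.html", {
    ready: function (element, options) {
        var item = options && options.item ? Data.resolveItemReference(options.item) : Data.items.getAt(0);
        // ... populate the title, subtitle, image, etc. exactly as in the question ...
        var galleryId = item.key;
        // returning the promise means the caller waits until the xhr has completed
        return WinJS.xhr({ url: "http://someUrlToAnAspNetWebsite/Handlers/GalleryItemsHandler.ashx?galleryId=" + galleryId })
            .then(function (response) {
                var items = JSON.parse(response.responseText);
                var dataList = new WinJS.Binding.List(items);
                element.querySelector("#galleryItemsListView").winControl.itemDataSource = dataList.dataSource;
            }, function (error) {
                // handle the error here...
            });
    }
});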

PhantomJS page fetching with nested loop to get new pages

I want to fetch a list online from a certain URL that is in JSON format and then use the DATA_ID from each item in that list to call a new URL. I'm new to PhantomJS and I can't figure out why nested loops inside page.open() act all weird. Also, the way phantom.exit() has to be used seems really awkward for what I want to achieve.
Here's my code:
console.log('Loading recipes');
console.log('===============================================================');

var page = require('webpage').create();
var url = 'http://www.hiddenurl.com/recipes/all';

page.open(url, function (status) {
    //Page is loaded!
    var js = page.evaluate(function () {
        return document.getElementsByTagName('pre')[0];
    });
    var recipes = JSON.parse(js.innerHTML).results;
    //console.log(recipes[0].name.replace('[s]', ''));

    for (i = 0; i < recipes.length; i++) {
        console.log(recipes[i].name.replace('[s]', ''));

        var craft_page = require('webpage').create();
        var craft_url = 'http://www.hiddenurl.com/recipe/' + recipes[i].data_id;

        craft_page.open(craft_url, function (craft_status) {
            //Page is loaded!
            var craft_js = craft_page.evaluate(function () {
                return document.getElementsByTagName('body')[0];
            });
            var craftp = craft_js.innerHTML;
            console.log('test');
        });

        if (i == 5) {
            console.log('===============================================================');
            phantom.exit();
            //break;
        }
    }
});
The thing that happens here is that this line:
console.log(recipes[i].name.replace('[s]', ''));
..prints the following:
===============================================================
Item from DATA_ID 1
Item from DATA_ID 2
Item from DATA_ID 3
Item from DATA_ID 4
Item from DATA_ID 5
..then it just prints the next:
===============================================================
..followed by:
'test'
'test'
'test'
'test'
'test'
Why is this not happening serially? The data from the inner page.open() calls gets heaped up and dumped at the end, even after phantom.exit() should actually already have been called.
Also, when I loop freely over a normal data set I get this error:
QEventDispatcherUNIXPrivate(): Unable to create thread pipe: Too many open files
2013-01-31T15:35:18 [FATAL] QEventDispatcherUNIXPrivate(): Can not continue without a thread pipe
Abort trap: 6
Is there any way I can set GLOBAL_PARAMETERS or direct the process in some way so I can handle hundreds of page requests?
Thanks in advance!
I've made a workaround with Python by calling PhantomJS separately through the shell, like this:
import os
import json
cmd = "./phantomjs fetch.js"
fin,fout = os.popen4(cmd)
result = fout.read()
recipes = json.loads(result)
print recipes['count']
This is not the actual solution for the PhantomJS issue, but it's a working solution and has fewer problems with memory and code structure.
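That said, the same serial/recursive pattern discussed in the main question above applies here too; a rough PhantomJS sketch (hedged: it assumes the same JSON endpoint and data_id field as in the question):
var page = require('webpage').create();
var listUrl = 'http://www.hiddenurl.com/recipes/all';

page.open(listUrl, function (status) {
    var recipes = JSON.parse(page.evaluate(function () {
        return document.getElementsByTagName('pre')[0].innerHTML;
    })).results;

    var i = 0;
    function nextRecipe() {
        if (i >= recipes.length) {
            phantom.exit();
            return;
        }
        var recipe = recipes[i++];
        var craftPage = require('webpage').create();
        craftPage.open('http://www.hiddenurl.com/recipe/' + recipe.data_id, function () {
            console.log(recipe.name.replace('[s]', ''));
            craftPage.close();   // release the page before opening the next one
            nextRecipe();        // only now start the next request
        });
    }
    nextRecipe();
});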

Problems making GET request from jQuery

I'm trying to make an HTTP GET request using the jQuery get() function, but I'm having some trouble.
Here's what my code looks like:
// get the links on the page
var pageLinks = $.find('#pageLinks');

// loop through each of the links
$(pageLinks).find('a').each(function(){
    if ($(this).attr('title') !== "Next Page"){
        // make a GET request to the URL of this link
        $.get($(this).attr("href"), function(data) {
            console.log("here");
            var temp = parse_page(data);
            // concatenate the return string with another
            bdy = bdy + String(temp);
            console.log("done");
        });
    }
});
There are multiple pages that I need to get data from. Since the get() function is asynchronous, I get the pages in a random order. Secondly, the concatenation does not work. Even though I get each of the pages, they're not put into bdy.
Can anyone suggest how I might deal with this?
Thanks a lot!!
Construct bdy after all the pages are retrieved, i.e. store the get results in a dictionary or array, wait for all the gets to finish, then assemble them in the correct order.
I tried this one and it works:
// get the links on the page
var pageLinks = $('a');
var bdy = '';   // start with an empty string so the concatenation below works

// loop through each of the links
$(pageLinks).each(function(){
    console.log(this);
    // make a GET request to the URL of this link
    $.get($(this).attr("href"), function(data) {
        // concatenate the return string with another
        bdy = bdy + data.toString();
        console.log(bdy);
    });
});
As an example of what @muratgu has said:
var results = [];
var count = 0;

function allDone() {
    var bdy = results.join("");
    // do stuff with bdy
}

// get the links on the page
var pageLinks = $.find('#pageLinks');

// filter the links so we're left with the links we want
var wantedLinks = $(pageLinks).find('a').filter(function (idx) {
    return $(this).attr('title') !== "Next Page";
});

// remember how many links we're working on
count = wantedLinks.length;

// loop through each of the links
wantedLinks.each(function (idx) {
    // make a GET request to the URL of this link
    $.get($(this).attr("href"), function (data) {
        console.log("here");
        var temp = parse_page(data);
        results[idx] = temp;
        // Decrement the count.
        count--;
        if (count === 0) {
            // All done.
            allDone();
        }
    });
});
You could go further and abstract this into a data type that can perform N async downloads, and then notify you when all are complete.
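For instance, jQuery's own Deferred helpers can do that counting for you (a sketch, assuming jQuery 1.5+ and at least two URLs; with a single URL, $.when resolves with that request's arguments directly rather than one triple per request):
function getAllPages(urls, onDone) {
    // start one GET per URL
    var requests = $.map(urls, function (url) { return $.get(url); });
    // $.when resolves once every request has finished
    $.when.apply($, requests).done(function () {
        var pages = [];
        // each argument is a [data, statusText, jqXHR] triple, in the same order as urls
        $.each(arguments, function (i, result) { pages.push(result[0]); });
        onDone(pages);
    });
}
// usage, reusing parse_page from the question:
// getAllPages(hrefs, function (pages) {
//     var bdy = $.map(pages, parse_page).join('');
// });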
I just found that there are modules that allow one to manage the control flow in JS. The ones I found are:
Async
Step
For help using the above modules, see my follow up question here.
