Following on from this question, I am trying to scrape data using PhantomJS, modifying a script from here:
My goal is to integrate a working function (the second code snippet) into the script in the first code snippet below. I have tried doing this but keep getting errors. How can I actually do the integration?
(Note: I'm using PhantomJS, i.e. a headless web browser, because the site is an Angular app whose initial HTML doesn't contain any of the data I am looking for. So I need to load the page in memory, wait for Angular to do its thing (a set delay of some sort), and then scrape the rendered DOM.)
The errors (and output) I get when I execute my script (phantomjs scraping.js) are as follows:
console> SPR-ERROR: 103 - Invalid published date
console> v6
ReferenceError: Can't find variable: angular
http://stage.inc.com/js/Inc5000ListApp.js?UPDATE1:2
http://www.inc.com/inc5000/index.html:2485
console> SPR-ERROR:103 - Invalid published date (date)
====================================================
Step "0"
====================================================
console> Reached scrapeData
console>
It seems like it is connecting to the desired site. How do I modify the script below to fit in the extraction code at the bottom of this question?
var page = new WebPage(),
    url = 'http://www.inc.com/inc5000/index.html',
    stepIndex = 0;

/**
 * From PhantomJS documentation:
 * This callback is invoked when there is a JavaScript console message. The callback may accept up to three arguments:
 * the string for the message, the line number, and the source identifier.
 */
page.onConsoleMessage = function (msg, line, source) {
    console.log('console> ' + msg);
};

/**
 * From PhantomJS documentation:
 * This callback is invoked when there is a JavaScript alert. The only argument passed to the callback is the string for the message.
 */
page.onAlert = function (msg) {
    console.log('alert!!> ' + msg);
};

// Callback is executed each time a page is loaded...
page.open(url, function (status) {
    if (status === 'success') {
        // State is initially empty. State is persisted between page loads and can be used for identifying which page we're on.
        console.log('============================================');
        console.log('Step "' + stepIndex + '"');
        console.log('============================================');
        // Inject jQuery for scraping (you need to save jquery-1.6.1.min.js in the same folder as this file)
        page.injectJs('jquery-1.6.1.min.js');
        // Our "event loop"
        if (!phantom.state) {
            //initialize();
            scrapeData();
        } else {
            phantom.state();
        }
        // Save screenshot for debugging purposes
        page.render("step" + stepIndex++ + ".png");
    }
});
function scrapeData() {
    page.evaluate(function () {
        console.log('Reached scrapeData');
        var DATA = [];
        $('tr.ng-scope').each(function () {
            var $tds = $(this).find('td');
            DATA.push({
                rank: $tds.eq(0).text(),
                company: $tds.eq(1).text(),
                growth: $tds.eq(2).text(),
                revenue: $tds.eq(3).text(),
                industry: $tds.eq(4).text()
            });
        });
        console.log(DATA);
    });
    phantom.state = parseResults;
    // scraping code here
}

// Step 1
function initialize() {
    page.evaluate(function () {
        console.log('Searching...');
    });
    // Phantom state doesn't change between page reloads
    // We use the state to store the search result handler, i.e. the next step
    phantom.state = parseResults;
}

// Step 2
function parseResults() {
    page.evaluate(function () {
        $('#search-result a').each(function (index, link) {
            console.log($(link).attr('href'));
        });
        console.log('Parsed results');
    });
    // If there was a 3rd step we could point to another function,
    // but we would have to reload the page for the callback to be called again
    phantom.exit();
}
I know the code below works in the console, but how can I integrate it with the script above to successfully scrape data from multiple pages on the site?
request('http://www.inc.com/inc5000/index.html', function (error, response, html) {
    if (error || response.statusCode != 200) return;
    var $ = cheerio.load(html);
    var DATA = [];
    $('tr.ng-scope').each(function () {
        var $tds = $(this).find('td');
        DATA.push({
            rank: $tds.eq(0).text(),
            company: $tds.eq(1).text(),
            growth: $tds.eq(2).text(),
            revenue: $tds.eq(3).text(),
            industry: $tds.eq(4).text()
        });
    });
    console.log(DATA);
});
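To make the goal concrete, this is roughly the integration I am imagining, with the extraction moved inside page.evaluate() and the result handed back to the outer context (only a sketch; the 5-second delay for Angular and the JSON.stringify logging are my own guesses):

function scrapeData() {
    // give Angular some time to render the table before scraping
    window.setTimeout(function () {
        var data = page.evaluate(function () {
            var DATA = [];
            $('tr.ng-scope').each(function () {
                var $tds = $(this).find('td');
                DATA.push({
                    rank: $tds.eq(0).text(),
                    company: $tds.eq(1).text(),
                    growth: $tds.eq(2).text(),
                    revenue: $tds.eq(3).text(),
                    industry: $tds.eq(4).text()
                });
            });
            return DATA; // arrays of plain objects can be passed out of the sandbox
        });
        console.log(JSON.stringify(data, null, 2));
        phantom.exit();
    }, 5000);
}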
Related
I'm trying to write a simple PhantomJS script where I find an element by ID and determine whether it is empty or not. I've tried a few suggested things such as .childNodes.length, .textContent, etc.
These either result in a null error:
TypeError: null is not an object (evaluating 'document.getElementById('idname').childNodes')
Or PhantomJS just crashes and refuses to check the links at all; this usually happens if I run my script twice in a row without much pause. Sometimes it will also just sit and do nothing.
I've written other scrapers that used getElementById in this way successfully, although there I was only checking whether the element existed (i.e. whether it was !== null). Checking manually, this element does exist in all the pages I'm checking; it's just that it sometimes has content and sometimes doesn't (it's a div). Anyway, here is my code:
var fs = require('fs');
var urls = fs.read('urls.txt').split('\n');

var page;
page = require('webpage').create();
console.log('The default user agent is ' + page.settings.userAgent);
page.settings.userAgent = 'SpecialAgent';

function check_link(url) {
    page = require('webpage').create();
    page.open(url, function (status) {
        if (status !== 'success') {
            console.log('Unable to access network');
        } else {
            var error = page.evaluate(function () {
                return document.getElementById('error-message');
            });
            console.log(error.childNodes.length);
            fs.write('results.csv', error.childNodes.length + ', ' + url + '\n', 'a');
            page.release();
            setTimeout(next_link, 1000);
        }
    });
}

function next_link() {
    var url = urls.shift();
    console.log(url);
    if (!urls) {
        phantom.exit(0);
    } else {
        check_link(url);
    }
}

next_link();
PhantomJS provides access to the sandboxed page context (DOM context) through page.evaluate() with the following note:
Note: The arguments and the return value to the evaluate function must be a simple primitive object. The rule of thumb: if it can be serialized via JSON, then it is fine.
Closures, functions, DOM nodes, etc. will not work!
So you cannot pass the DOM node out of the page context, but you can do everything you want with it in the page context and then pass out the result.
var errors = page.evaluate(function () {
    var e = document.getElementById('error-message');
    return (e && e.childNodes) ? e.childNodes.length : -1;
});
console.log(errors);
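Plugged back into the question's check_link(), the safe lookup would look roughly like this (a sketch; the CSV format and the 1-second delay are kept from the original script, and page.close() is used in place of the older page.release()):

function check_link(url) {
    var page = require('webpage').create();
    page.open(url, function (status) {
        if (status !== 'success') {
            console.log('Unable to access network');
        } else {
            // count the child nodes inside the page context and pass out only the number
            var errors = page.evaluate(function () {
                var e = document.getElementById('error-message');
                return (e && e.childNodes) ? e.childNodes.length : -1;
            });
            fs.write('results.csv', errors + ', ' + url + '\n', 'a');
        }
        page.close();
        // always move on to the next URL, even if this page failed to load
        setTimeout(next_link, 1000);
    });
}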
I'm using PhantomJS 2.0.0 on Mac OS X Yosemite:
$ phantomjs --version
2.0.0
My script, shown below, appears to hang at the line where $('h1').size() is called:
var system = require('system');

function usage() {
    console.log("usage: phantomjs " + system.args[0] + " <url>");
    phantom.exit(1);
}

console.log("system.args.length=" + system.args.length);
if (system.args.length != 2) {
    console.log("usage bad....");
    usage();
} else {
    var url = system.args[1];
    var page = require('webpage').create();
    console.log("Opening page: " + url);
    page.open(url, function (status) {
        if (status !== "success") {
            console.log("Unable to access network");
        } else {
            console.log("Setting timeout...");
            window.setTimeout(function () {
                page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.11.1/jquery.min.js", function () {
                    console.log("Searching for Seawolf Calendar...");
                    console.log("h1.size=" + $('h1').size());
                    console.log("Exiting with status 0...");
                    phantom.exit(0);
                });
            }, 5000);
        }
    });
}
The script is invoked from the command-line like this, for example:
phantomjs download-framed-content.js "http://www.sonoma.edu/calendar/groups/clubs.html"
with output like this:
system.args.length=2
Opening page: http://www.sonoma.edu/calendar/groups/clubs.html
Setting timeout...
Searching for Seawolf Calendar...
[Hung ...]
Why is the jQuery call hanging the script?
PhantomJS 2.0.0 doesn't show any errors for some reason (this is a known bug).
The error would be that $ is not a function. If jQuery is present in the page, then you can use it inside the page context (inside page.evaluate()), but it won't work outside of it.
You can only access the DOM/page context through page.evaluate():
console.log("h1.size=" + page.evaluate(function () {
    return $('h1').size();
}));
Note that you cannot use any outside variables inside of the page.evaluate(), because it is sandboxed. The documentation says:
Note: The arguments and the return value to the evaluate function must be a simple primitive object. The rule of thumb: if it can be serialized via JSON, then it is fine.
Closures, functions, DOM nodes, etc. will not work!
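If you need a value from the outer script inside the sandbox, recent PhantomJS versions (1.6 and later) also let you pass extra JSON-serializable arguments to page.evaluate(); a small sketch (the selector is just an example):

var selector = 'h1';
console.log(selector + ".size=" + page.evaluate(function (sel) {
    // sel arrives as a plain string copied into the page context
    return $(sel).size();
}, selector));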
I am writing a modest Firefox add-on and I have some problems getting the results used inside the "flow" of the add-on script.
I have the code that queries an SQLite database in a module, but I don't know how to create a callback inside of it so that the PageMod in the add-on script can use the result and pass it to the content script.
Basically here is what I have:
main.js:
var pageMod = require("sdk/page-mod");
var self = require("sdk/self");
var myDbScript = require('./myDbScript');

pageMod.PageMod({
    include: "*.example.com/*",
    contentScriptFile: [self.data.url('jquery-1.10.2.min.js'),
                        self.data.url('myContentScript.js')],
    onAttach: function(worker) {
        // Query the database on behalf of the content script
        worker.port.on('queryTheDB', function(message) {
            // Get the data from the DB (2 is some test value here)
            // Not working because the DB query is asynchronous
            var resultFromDB = myDbScript.getResult(2);
            // Send the result to the content script
            worker.port.emit('hereIsYourResult', resultFromDB);
        });
    }
});
myDBScript.js
// Get required components
var {components} = require("chrome");
components.utils.import("resource://gre/modules/FileUtils.jsm");
components.utils.import("resource://gre/modules/Services.jsm");

// Some code to get the DB

// Create statement to retrieve country based on the IP
var statement = dbConnection.createStatement("SELECT col1, col2 FROM table WHERE col1 = :given_col1");

function getResult(submittedValue) {
    // Bind parameters
    statement.params.given_col1 = submittedValue;
    // Execute
    statement.executeAsync({
        handleResult: function(aResultSet) {
            for (let row = aResultSet.getNextRow();
                 row;
                 row = aResultSet.getNextRow()) {
                var resultFromDB = row.getResultByName("col2");
            }
        },
        handleError: function(aError) {
            print("Error: " + aError.message);
            return 'error';
        },
        handleCompletion: function(aReason) {
            if (aReason != components.interfaces.mozIStorageStatementCallback.REASON_FINISHED) {
                print("Query canceled or aborted!");
                return 'canceledOrAborted';
            } else {
                // Sending the result to the add-on script so that it can
                // pass it to the content script
                notifyingTheAddonScript(resultFromDB);
            }
        }
    });
}

// Enable the use of the getResult function
exports.getResult = getResult;
The thing is that I don't see how to make the add-on script aware that the result is ready. Please bear with me, I am a noob at this...
Since I don't have the full source, I cannot test this, so you'll have to fix any errors I made yourself ;)
First, let's add a callback.
// @param {function(result, error)} callback
//        Called upon query completion.
//        If |error| is a string, then the query failed.
//        Else |result| will contain an array of values.
function getResult(submittedValue, callback) { // changed
    // Bind parameters
    statement.params.given_col1 = submittedValue;
    var rv = [], err = null; // added
    // Execute
    statement.executeAsync({
        handleResult: function(aResultSet) {
            for (let row = aResultSet.getNextRow();
                 row;
                 row = aResultSet.getNextRow()) {
                rv.push(row.getResultByName("col2")); // changed
            }
        },
        handleError: function(aError) {
            print("Error: " + aError.message);
            err = aError.message; // changed
        },
        handleCompletion: function(aReason) {
            if (aReason != components.interfaces.mozIStorageStatementCallback.REASON_FINISHED) {
                print("Query canceled or aborted!");
                err = err || 'canceled or aborted'; // changed
            }
            callback(err ? null : rv, err); // replaced
        }
    });
}
Let's now use this in the PageMod:
onAttach: function(worker) {
    // Query the database on behalf of the content script
    worker.port.on('queryTheDB', function(message) {
        // Get the data from the DB (2 is some test value here)
        myDbScript.getResult(2, function callback(result, error) {
            if (error) {
                worker.port.emit("hereIsYourError", error);
                return;
            }
            worker.port.emit("hereIsYourResult", result);
        });
    });
}
You might want to take some precautions not to fire multiple queries. While it would be OK to do so, it might hurt performance ;)
Since our callback already looks kinda like a promise, it might actually be a good idea to use promises, maybe even with the Sqlite.jsm module and some Task.jsm magic.
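For reference, a rough sketch of what that promise-based variant could look like with Sqlite.jsm and Task.jsm (untested; the column names come from the question, while the database path and the rest are assumptions):

var {Cu} = require("chrome");
Cu.import("resource://gre/modules/Sqlite.jsm");
Cu.import("resource://gre/modules/Task.jsm");

// Returns a promise resolving to an array of col2 values.
function getResult(submittedValue) {
    return Task.spawn(function* () {
        // the path is a placeholder; Sqlite.jsm resolves bare names against the profile directory
        var db = yield Sqlite.openConnection({ path: "myDatabase.sqlite" });
        try {
            var rows = yield db.execute(
                "SELECT col1, col2 FROM table WHERE col1 = :given_col1",
                { given_col1: submittedValue });
            return rows.map(function (row) { return row.getResultByName("col2"); });
        } finally {
            yield db.close();
        }
    });
}

exports.getResult = getResult;

In the PageMod this would then be consumed with getResult(2).then(...) instead of the explicit callback.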
My goal is to execute PhantomJS by using:
// adding $op and $er for debugging purposes
exec('phantomjs script.js', $op, $er);
print_r($op);
echo $er;
And then inside script.js, I plan to use multiple page.open() calls to capture screenshots of different pages, such as:
var url = 'some dynamic url goes here';
page = require('webpage').create();
page.open(url, function (status) {
    console.log('opening page 1');
    page.render('./slide1.png');
});

page = require('webpage').create();
page.open(url, function (status) {
    console.log('opening page 2');
    page.render('./slide2.png');
});

page = require('webpage').create();
page.open(url, function (status) {
    console.log('opening page 3');
    page.render('./slide3.png');
    phantom.exit(); //<-- Exiting PhantomJS only after opening all 3 pages
});
On running exec, I get the following output on page:
Array ( [0] => opening page 3 ) 0
As a result I only get the screenshot of the 3rd page. I'm not sure why PhantomJS is skipping the first and second blocks of code (evident from the missing console.log() messages that should have been printed by the 1st and 2nd blocks) and only executing the third block.
The problem is that the second page.open is being invoked before the first one finishes, which can cause multiple problems. You want logic roughly like the following (assuming the filenames are given as command line arguments):
// assuming the file names arrive as command line arguments (as noted above):
var args = require('system').args.slice(1);
var page = require('webpage').create();

function handle_page(file) {
    page.open(file, function () {
        ...
        page.evaluate(function () {
            ...do stuff...
        });
        page.render(...);
        setTimeout(next_page, 100);
    });
}

function next_page() {
    var file = args.shift();
    if (!file) { phantom.exit(0); }
    handle_page(file);
}

next_page();
Right, it's recursive. This ensures that the processing of the function passed to page.open finishes, with a little 100ms grace period, before you go to the next file.
By the way, you don't need to keep repeating
page = require('webpage').create();
I've tried the accepted answer's suggestions, but they don't work (at least not for v2.1.1).
To be accurate, the accepted answer worked some of the time, but I still experienced sporadic failures of page.open() calls, about 90% of the time on specific data sets.
The simplest answer I found is to instantiate a new page module for each URL.
// first page
var urlA = "http://first/url"
var pageA = require('webpage').create()

pageA.open(urlA, function(status){
    if (status){
        setTimeout(openPageB, 100) // open second page call
    } else{
        phantom.exit(1)
    }
})

// second page
var urlB = "http://second/url"
var pageB = require('webpage').create()

function openPageB(){
    pageB.open(urlB, function(){
        // ...
        // ...
    })
}
The page module API documentation says the following about the close method:
close() {void}
Close the page and releases the memory heap associated with it. Do not use the page instance after calling this.
Due to some technical limitations, the web page object might not be completely garbage collected. This is often encountered when the same object is used over and over again. Calling this function may stop the increasing heap allocation.
Basically, after testing the close() method I decided that using the same web page instance for different open() calls is too unreliable, and that needed to be said.
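If memory growth is a concern when many URLs are handled this way, each page instance can also be closed as soon as it has done its work; a small sketch along the lines of the snippet above (the render call is only a placeholder for whatever work the page does):

pageA.open(urlA, function(status){
    if (status){
        pageA.render('pageA.png') // placeholder work
        pageA.close()             // release the heap associated with this page
        setTimeout(openPageB, 100)
    } else{
        phantom.exit(1)
    }
})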
You can use recursion:
var page = require('webpage').create();

// the urls to navigate to
var urls = [
    'http://phantomjs.org/',
    'https://twitter.com/sidanmor',
    'https://github.com/sidanmor'
];
var i = 0;

// the recursion function
var genericCallback = function () {
    return function (status) {
        console.log("URL: " + urls[i]);
        console.log("Status: " + status);

        // exit if there was a problem with the navigation
        if (!status || status === 'fail') phantom.exit();

        i++;

        if (status === "success") {
            //-- YOUR STUFF HERE ----------------------
            // do your stuff here... I'm taking a picture of the page
            page.render('example' + i + '.png');
            //-----------------------------------------

            if (i < urls.length) {
                // navigate to the next url and the callback is this function (recursion)
                page.open(urls[i], genericCallback());
            } else {
                // try navigate to the next url (it is undefined because it is the last element) so the callback is exit
                page.open(urls[i], function () {
                    phantom.exit();
                });
            }
        }
    };
};

// start from the first url
page.open(urls[i], genericCallback());
Using queued processing, here is a sample:
var page = require('webpage').create();

// Queue Class Helper
var Queue = function () {
    this._tasks = [];
};

Queue.prototype.add = function (fn, scope) {
    this._tasks.push({fn: fn, scope: scope});
    return this;
};

Queue.prototype.process = function () {
    var proxy, task, self = this;
    task = this._tasks.shift();
    if (!task) { return; }
    proxy = {end: function () { self.process(); }};
    task.fn.call(task.scope, proxy);
    return this;
};

Queue.prototype.clear = function () {
    this._tasks = [];
    return this;
};

// Init pages .....
var q = new Queue();

q.add(function (proxy) {
    page.open(url1, function () {
        // page.evaluate
        proxy.end();
    });
});

q.add(function (proxy) {
    page.open(url2, function () {
        // page.evaluate
        proxy.end();
    });
});

q.add(function (proxy) {
    page.open(urln, function () {
        // page.evaluate
        proxy.end();
    });
});

// .....

q.add(function (proxy) {
    phantom.exit();
    proxy.end();
});

q.process();
I hope this is useful, regards.
I want to fetch a list online from a certain URL that is in JSON format and then use the DATA_ID from each item in that list to call a new URL. I'm new to PhantomJS and I can't figure out why nesting loops inside page.open() acts so strangely. The way phantom.exit() has to be used to do what I want to achieve also seems really odd.
Here's my code:
console.log('Loading recipes');
console.log('===============================================================');

var page = require('webpage').create();
var url = 'http://www.hiddenurl.com/recipes/all';

page.open(url, function (status) {
    //Page is loaded!
    var js = page.evaluate(function () {
        return document.getElementsByTagName('pre')[0];
    });
    var recipes = JSON.parse(js.innerHTML).results;
    //console.log(recipes[0].name.replace('[s]', ''));

    for (i = 0; i < recipes.length; i++) {
        console.log(recipes[i].name.replace('[s]', ''));

        var craft_page = require('webpage').create();
        var craft_url = 'http://www.hiddenurl.com/recipe/' + recipes[i].data_id;

        craft_page.open(craft_url, function (craft_status) {
            //Page is loaded!
            var craft_js = craft_page.evaluate(function () {
                return document.getElementsByTagName('body')[0];
            });
            var craftp = craft_js.innerHTML;
            console.log('test');
        });

        if (i == 5) {
            console.log('===============================================================');
            phantom.exit();
            //break;
        }
    }
});
The thing that happens here is that this line:
console.log(recipes[i].name.replace('[s]', ''));
..prints the following:
===============================================================
Item from DATA_ID 1
Item from DATA_ID 2
Item from DATA_ID 3
Item from DATA_ID 4
Item from DATA_ID 5
..then it just prints the next:
===============================================================
..followed by:
'test'
'test'
'test'
'test'
'test'
Why is this not happening serially? The data from the inner page.open() calls gets piled up and dumped at the end, even after phantom.exit() should already have been called.
Also, when I loop freely over a normal data set I get this error:
QEventDispatcherUNIXPrivate(): Unable to create thread pipe: Too many open files
2013-01-31T15:35:18 [FATAL] QEventDispatcherUNIXPrivate(): Can not continue without a thread pipe
Abort trap: 6
Is there any way I can set GLOBAL_PARAMETERS or direct the process in some way so I can handle hundreds of page requests?
Thanks in advance!
I've made a workaround with Python by calling PhantomJS separately through the shell, like this:
import os
import json
cmd = "./phantomjs fetch.js"
fin,fout = os.popen4(cmd)
result = fout.read()
recipes = json.loads(result)
print recipes['count']
Not the actual solution for the PhantomJS issue, but it's a working solution and has less problems with memory and code-structure.
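For completeness, a rough sketch of how the recipe pages could instead be fetched serially inside PhantomJS itself, reusing the recursive pattern from the answers above (untested; the URL scheme and the data_id / name fields are taken from the question, the rest is an assumption):

var page = require('webpage').create();
var url = 'http://www.hiddenurl.com/recipes/all';

page.open(url, function (status) {
    // grab the JSON body as a string instead of a DOM node
    var recipes = JSON.parse(page.evaluate(function () {
        return document.getElementsByTagName('pre')[0].innerHTML;
    })).results;

    var i = 0;
    function next_recipe() {
        if (i >= recipes.length) {
            phantom.exit();
            return;
        }
        var craft_page = require('webpage').create();
        craft_page.open('http://www.hiddenurl.com/recipe/' + recipes[i].data_id, function () {
            console.log(recipes[i].name.replace('[s]', ''));
            craft_page.close();
            i++;
            next_recipe(); // only move on once this page has finished
        });
    }
    next_recipe();
});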