evaluating only javascript elements within a page using phantomjs - javascript

I want to evaluate the javascript links on this website:
http://www.egypt.gov.eg
For example, I have this element:
<li class="language">
<a id="ctl00_btnLang" href="javascript:__doPostBack('ctl00$btnLang','')">English </a>
<img src="/CSS/images/langArwEn.gif" alt="Language Arrow" /></li>
So, I tried to use phantomjs to evaluate it, using this code:
var page = require('webpage').create();
var fs = require('fs');
var system = require('system');

// Load the page, dump its fully rendered DOM to out.htm, then exit.
page.open('http://www.egypt.gov.eg', function (status) {
    if (status !== 'success') {
        console.log('Unable to access network');
    } else {
        var p = page.evaluate(function () {
            // Bug fix: document.getElementsByID does not exist (it is
            // neither getElementById nor a real API). Grab the root
            // <html> element by tag name, as the working example later
            // in this file does.
            return document.getElementsByTagName('html')[0].innerHTML;
        });
        fs.write('out.htm', p, 'w');
        console.log(p);
    }
    phantom.exit();
});
but it does not evaluate the html, it just returns it as is. So is there a way to evaluate the whole page html using phantomjs, and is there a way to single out elements which have javascript to be evaluated one by one?

The first question is what do you mean by 'evaluate' the javascript links on the page... do you want PhantomJS to click on them?
Docs for page automation are here:
http://phantomjs.org/page-automation.html
To click on this specific link you could do:
// Open the page and simulate a click on the language-toggle link.
var page = require('webpage').create();
page.open('http://www.egypt.gov.eg', function (status) {
    if (status === 'success') {
        page.evaluate(function () {
            // Clicking triggers the link's javascript: __doPostBack handler.
            document.getElementById('ctl00_btnLang').click();
        });
    } else {
        console.log('Unable to access network');
    }
    phantom.exit();
});
...but this is going to send the browser to a new url and I'm not sure what happens to your PhantomJS page object in that case.
Do you want to click all the javascript links on that page and grab the contents of each new HTML page that is loaded?
It seems like that would be easier with CasperJS "CasperJS is a navigation scripting & testing utility for PhantomJS"
http://docs.casperjs.org/
Have a look at the example code here which scrapes and follows links from an array of starting urls:
https://github.com/n1k0/casperjs/blob/master/samples/dynamic.js

Related

Why PhantomJS not scraping the page it is redirected to?

I am scraping http://www.asx.com.au/asx/markets/optionPrices.do?by=underlyingCode&underlyingCode=XJO
It shows a blank white page at first, in that page there is some obfuscated JS code.
That code sends a POST request automatically, and then loads actual page.
I have this code to follow the redirected page, but its not working.
// Shared page handle; replaced on each caught redirect so only one
// webpage object is live at a time.
var page;
var myurl = "http://www.asx.com.au/asx/markets/optionPrices.do?by=underlyingCode&underlyingCode=XJO";
// Opens `url`; when a main-frame navigation to a different URL is
// requested, records it and recursively renders the new target.
var renderPage = function (url) {
page = require('webpage').create();
// NOTE(review): onNavigationRequested fires when navigation is
// *requested*, before the old load finishes; calling renderPage() here
// replaces `page` while the previous open() is still in flight —
// presumably why the recursive call never produces output. TODO confirm
// against PhantomJS's known redirect-handling limitations.
page.onNavigationRequested = function (url, type, willNavigate, main) {
if (main && url != myurl) {
myurl = url;
console.log("redirect caught")
// GUILTY CODE
renderPage(url);
}
};
page.open(url, function (status) {
if (status === "success") {
console.log("success")
// Screenshot of whatever state was reached, then exit with code 0.
page.render('yourscreenshot.png');
phantom.exit(0);
} else {
console.log("failed")
phantom.exit(1);
}
});
}
renderPage(myurl);
It only outputs
success
redirect caught
See my code, why GUILTY CODE part is not being executed ... Why renderPage(url) is not being called after redirect caught?
From my understanding phantomJS doesn't really handle redirects well. That may be your issue. You may want to test this in a different way. Or you can use another browser to perform these tests to confirm. Check out this git issue to see what I mean https://github.com/ariya/phantomjs/issues/10389.

PhantomJS page.injectJs doesn't work

I'm currently trying to write the page source code into a text file by a URL. Everything works well, but I want to additionally inject a JavaScript file. The problem is that the file does not include properly. Only the last pages that are loaded, but others are incomplete.
//phantomjs C:\PhantomJS\Script\test1.js
var fs = require('fs');
var numeroEpisode = 0;
var maxEpisode = 10;
var fichierLien = fs.read('C:\\PhantomJS\\Fichier\\lien.txt');
var ListeLien = fichierLien.split(/[\n]/);
var page = require('webpage').create();

// Opens the next URL from the list, includes the remote script, saves the
// resulting HTML, then reschedules itself until maxEpisode is exceeded.
function GetPage()
{
    if (numeroEpisode > maxEpisode)
    {
        phantom.exit();
        return; // bug fix: don't fall through to page.open after exiting
    }
    page.open(ListeLien[numeroEpisode], function(status)
    {
        if (status !== 'success')
        {
            console.log('Impossible de charger la page.');
        }
        else
        {
            console.log('URL: '+ListeLien[numeroEpisode]+'');
            // Bug fix: page.injectJs() only loads *local* files, so it
            // silently failed for a remote URL. page.includeJs() loads a
            // remote script, but asynchronously — so the page content is
            // written inside its callback, after the script has actually
            // been evaluated.
            page.includeJs('http://mylink.com', function() {
                var path = 'C:\\PhantomJS\\Fichier\\episode_'+numeroEpisode+'.html';
                fs.write(path, page.content, 'w');
                numeroEpisode++;
                setTimeout(GetPage, 15000); // run again in 15 seconds
            });
        }
    });
}
GetPage();
Don't mix up page.injectJs() and page.includeJs().
injectJs(filename): Loads a local JavaScript file into the page and evaluates it synchronously.
includeJs(url, callback): Loads a remote JavaScript file from the specified URL and evaluates it. Since it has to request a remote resource, this is done asynchronously. The passed callback is called as soon as the operation finished. If you don't use the callback, your code will most likely run before the remote JavaScript was included. Use that callback:
// Wait for the remote script to finish loading before saving the page:
// includeJs() is asynchronous, so the write and the rescheduling both
// happen inside its completion callback.
page.includeJs('http://mylink.com', function() {
var path = 'C:\\PhantomJS\\Fichier\\episode_'+numeroEpisode+'.html';
fs.write(path, page.content, 'w');
numeroEpisode++;
setTimeout(GetPage, 15000); // run again in 15 seconds
});
Since the JavaScript that you load changes something on the page, you probably need to load it after all the pages script have run. If this is a JavaScript heavy page, then you need to wait a little. You can wait a static amount of time:
// Give the page's own scripts a fixed head start before including ours.
setTimeout(function(){
page.includeJs('http://mylink.com', function() {
//...
});
}, 5000); // 5 seconds
or utilize waitFor to wait until an element appears that denotes that the page is completely loaded. This can be very tricky sometimes.
If you still want to use injectJs() instead of includeJs() (for example because of its synchronous nature), then you need to download the external JavaScript file to your machine and then you can use injectJs().

How can I execute a script after calling window.location.href?

I have a script that redirects the user to another page. I want to load some content into a div on the new page after the new page has fully loaded. How can I do this. The following doesn't work.
function goToPage() {
// Assigning location.href navigates away and unloads this document, so
// the .load() call below never affects the destination page — this is
// exactly the bug the question is about.
window.location.href = 'http://www.mypage.com/info';
$('.my_class').load('my/url/path/with/content/to/load');
}
The newly loaded page http://www.mypage.com/info contains the following div:
<div class="my_class"></div>
What am I doing wrong?
Redirect to the new page, but append a hash signal to the URL.
// Redirect to the target page, tagging the URL with a hash flag that the
// destination page can inspect on load.
function goToPage() {
    // Bug fix: the string literal was missing its closing quote.
    window.location.href = 'http://www.mypage.com/info#load-stuff';
}
Then on load of the target page, evaluate the URL checking for that hash signal.
// On the destination page: if the URL carries the #load-stuff flag,
// pull the extra content into the placeholder element.
function pageLoad() {
    if (window.location.hash !== "#load-stuff") {
        return;
    }
    $('.my_class').load('my/url/path/with/content/to/load');
}
If your application is using jQuery it'd look something like:
// jQuery flavour: run the same hash check once the DOM is ready.
$(function () {
if (window.location.hash === "#load-stuff") {
$('.my_class').load('my/url/path/with/content/to/load');
}
});
That's the rough idea at least.
As pointed out in the other answers, you won't be able to perform any script instructions from your original site. Instead of using PHP to create the content statically, you could also use HTML fragments as arguments, e.g. like this:
// in the original page:
function goToPage() {
    // Pass the content URL itself as the hash fragment.
    window.location.href = 'http://www.mypage.com/info#my/url/path/with/content/to/load';
}
// in http://www.mypage.com/info:
$( document ).ready(function () {
    // If a hash was forwarded, load the URL it encodes (minus the '#').
    if (window.location.hash)
        $('.my_class').load(window.location.hash.substring(1));
}); // bug fix: the ready() call was never closed (missing `);`)
An easy way to pass data to your page you are redirecting to would be to set some url parameters.
For example:
window.location.href = "http://youpage.com/?key=value"
When that page loads you could have a:
// On page load, read the forwarded parameter back out of the URL.
$(document).ready(function(){
var my_param = getUrlParameter('key');
if(my_param == "value"){
//do your stuff here
}
});
// Returns the value of the named query-string parameter, `true` if the
// parameter is present without a value, or `undefined` if it is absent.
var getUrlParameter = function getUrlParameter(sParam) {
    // Strip the leading '?' and decode the whole query string up front.
    var decodedQuery = decodeURIComponent(window.location.search.substring(1));
    var pairs = decodedQuery.split('&');
    for (var j = 0; j < pairs.length; j++) {
        var pieces = pairs[j].split('=');
        if (pieces[0] !== sParam) {
            continue;
        }
        // A bare parameter (no '=') reports as boolean true.
        return pieces[1] === undefined ? true : pieces[1];
    }
};
You should just run
$('.my_class').load('my/url/path/with/content/to/load');
on this page: http://www.mypage.com/info.
When you do window.location.href = 'http://www.mypage.com/info'; you're redirecting to another page. Nothing after that line will happen. You have to instead run the code after that line on the page that's loaded.
You can do this a few different ways. Try leveraging the localstorage API and passing info or content with a name and value pair (or a few of them) and unpack it on the receiving end.
On the page you're redirecting to, check for the localstorage key, and then load the contents of it (the aforementioned name and value pairs) into a div.
As an alternative, you can write one script file that you can deploy to several pages; do a check on window.location.href and conditionally load script accordingly. If you're on the redirected page, you can run whatever script you like. The nice part about doing it this way is that you're still working with one JS file - no need to fragment your code (assuming, of course, that the pages you're working with are all on the same site).
You don't need to do anything with php if you don't want to, or hashes... there's a few nifty tools that will do the trick if you can leverage HTML5 and its associated APIs.
// Navigate to the in-page route, then queue the follow-up work; with no
// delay argument, setTimeout runs the callback on the next timer tick.
window.location = '#/MyPage';
setTimeout(function() {
//MyCode To Run After PageLoad
});
You are redirecting the browser with window.location.href and I'm afraid as you are purely just changing the browser's location, you can't have any affect/input on the page you are moving to (unless you use query string parameters and then create content with something like PHP (myurl.php?newcontent=whatever) )
Once you redirect you can no longer execute scripts on that page, as the page is unloaded.
Try this,
redirect page:
// Redirect the browser to the info page; any follow-up work must live in
// that page's own scripts.
function goToPage() {
    // Bug fix: the URL string literal was missing its closing quote.
    window.location.href = 'http://www.mypage.com/info';
}
mypage.com/info:
js:
$('.my_class').load('my/url/path/with/content/to/load');
html:
<div class="my_class"></div>
I hope this helped you out, and let me know if you need further assistance!

Load a javascript/ajax call on click using phantomjs

I am trying to build a webscraper with which I can download the HTML source after information is received from a ajax call on click.
Simply speaking initially I download a the webpage and then on clicking the next button the page is loaded with a new set of images using a ajax call and I need to capture the html source after clicking next.
The next click source looks something like this
<a href="javascript:nextpage();">Next Page</a>
And on the same page is the javascript function nextpage which handles the ajax call.
Is there a way to do this using phantomjs? I am very new to phantomjs so let me know if anything is not clear.
Currently I am only able to load the contents from original webpage.
// Fetch the page and print the rendered markup of its root element.
var page = require('webpage').create();
page.open('somewebpage', function (status) {
    if (status === 'success') {
        var html = page.evaluate(function () {
            return document.getElementsByTagName('html')[0].innerHTML;
        });
        console.log(html);
    } else {
        console.log('Unable to access network');
    }
    phantom.exit();
});
Thanks
Try:
// Serialize the whole live document (including the doctype-less DOM state)
// to a string. Bug fix: in the original, `return` was followed by a line
// break, so automatic semicolon insertion made the callback return
// undefined; the expression must start on the same line as `return`.
var content = page.evaluate( function() {
    return (new XMLSerializer()).serializeToString( document );
} );

Navigating / scraping hashbang links with javascript (phantomjs)

I'm trying to download the HTML of a website that is almost entirely generated by JavaScript. So, I need to simulate browser access and have been playing around with PhantomJS. Problem is, the site uses hashbang URLs and I can't seem to get PhantomJS to process the hashbang -- it just keeps calling up the homepage.
The site is http://www.regulations.gov. The default takes you to #!home. I've tried using the following code (from here) to try and process different hashbangs.
// NOTE(review): this uses the long-deprecated phantom.state / phantom.args
// / phantom.open API from very early PhantomJS, where the same script runs
// twice: once before navigation (state empty) and once in the page context
// after it — TODO confirm against the PhantomJS version in use.
if (phantom.state.length === 0) {
// First pass: require a hash argument, remember when we started, navigate.
if (phantom.args.length === 0) {
console.log('Usage: loadreg_1.js <some hash>');
phantom.exit();
}
var address = 'http://www.regulations.gov/';
console.log(address);
phantom.state = Date.now().toString();
phantom.open(address);
} else {
// Second pass (inside the loaded page): apply the requested hash.
var hash = phantom.args[0];
document.location = hash;
console.log(document.location.hash);
// Elapsed ms since the first pass stamped phantom.state.
var elapsed = Date.now() - new Date().setTime(phantom.state);
if (phantom.loadStatus === 'success') {
// first_time is hoisted, so this guard runs the block exactly once.
if (!first_time) {
var first_time = true;
if (!document.addEventListener) {
console.log('Not SUPPORTED!');
}
phantom.render('result.png');
var markup = document.documentElement.innerHTML;
console.log(markup);
phantom.exit();
}
} else {
console.log('FAIL to load the address');
phantom.exit();
}
}
This code produces the correct hashbang (for instance, I can set the hash to '#!contactus') but it doesn't dynamically generate any different HTML--just the default page. It does, however, correctly output that hash when I call document.location.hash.
I've also tried to set the initial address to the hashbang, but then the script just hangs and doesn't do anything. For example, if I set the url to http://www.regulations.gov/#!searchResults;rpp=10;po=0 the script just hangs after printing the address to the terminal and nothing ever happens.
The issue here is that the content of the page loads asynchronously, but you're expecting it to be available as soon as the page is loaded.
In order to scrape a page that loads content asynchronously, you need to wait to scrape until the content you're interested in has been loaded. Depending on the page, there might be different ways of checking, but the easiest is just to check at regular intervals for something you expect to see, until you find it.
The trick here is figuring out what to look for - you need something that won't be present on the page until your desired content has been loaded. In this case, the easiest option I found for top-level pages is to manually input the H1 tags you expect to see on each page, keying them to the hash:
// Expected <h1> text for each hash route; used to detect when the
// asynchronously loaded content for that route has actually rendered.
var titleMap = {
'#!contactUs': 'Contact Us',
'#!aboutUs': 'About Us'
// etc for the other pages
};
Then in your success block, you can set a recurring timeout to look for the title you want in an h1 tag. When it shows up, you know you can render the page:
if (phantom.loadStatus === 'success') {
    // Poll every 300 ms until the expected <h1> for this hash route has
    // rendered, then screenshot the page and exit.
    var timeoutId = window.setInterval(function () {
        // check for title element you expect to see
        var h1s = document.querySelectorAll('h1');
        // Bug fix: querySelectorAll returns a NodeList, which is truthy
        // even when empty — test its length, not the list itself.
        if (h1s.length > 0) {
            // h1s is a node list, not an array, hence the
            // weird syntax here
            Array.prototype.forEach.call(h1s, function(h1) {
                if (h1.textContent.trim() === titleMap[hash]) {
                    // we found it!
                    console.log('Found H1: ' + h1.textContent.trim());
                    phantom.render('result.png');
                    console.log("Rendered image.");
                    // stop the cycle
                    window.clearInterval(timeoutId);
                    phantom.exit();
                }
            });
            console.log('Found H1 tags, but not ' + titleMap[hash]);
        } else {
            // Bug fix: this was outside the if, so it logged on every
            // tick even when H1 tags were present.
            console.log('No H1 tags found.');
        }
    }, 300);
}
The above code works for me. But it won't work if you need to scrape search results - you'll need to figure out an identifying element or bit of text that you can look for without having to know the title ahead of time.
Edit: Also, it looks like the newest version of PhantomJS now triggers an onResourceReceived event when it gets new data. I haven't looked into this, but you might be able to bind a listener to this event to achieve the same effect.

Categories

Resources