I am trying to build a webscraper with which I can download the HTML source after information is received from a ajax call on click.
Simply speaking initially I download a the webpage and then on clicking the next button the page is loaded with a new set of images using a ajax call and I need to capture the html source after clicking next.
The next click source looks something like this
Next Page
And on the same page is the javascript function nextpage which handles the ajax call.
Is there a way to do this using phantomjs? I am very new to phantomjs so let me know if anything is not clear.
Currently I am only able to load the contents from original webpage.
var page = require('webpage').create();
page.open('somewebpage', function (status) {
if (status !== 'success') {
console.log('Unable to access network');
} else {
var p = page.evaluate(function () {
return document.getElementsByTagName('html')[0].innerHTML
});
console.log(p);
}
phantom.exit();
});
Thanks
Try:
var content = page.evaluate( function() { return
(new XMLSerializer()).serializeToString( document ); } );
Related
I am scraping http://www.asx.com.au/asx/markets/optionPrices.do?by=underlyingCode&underlyingCode=XJO
It shows a blank white page at first, in that page there is some obfuscated JS code.
That code sends a POST request automatically, and then loads actual page.
I have this code to follow the redirected page, but its not working.
var page;
var myurl = "http://www.asx.com.au/asx/markets/optionPrices.do?by=underlyingCode&underlyingCode=XJO";
var renderPage = function (url) {
page = require('webpage').create();
page.onNavigationRequested = function (url, type, willNavigate, main) {
if (main && url != myurl) {
myurl = url;
console.log("redirect caught")
// GUILTY CODE
renderPage(url);
}
};
page.open(url, function (status) {
if (status === "success") {
console.log("success")
page.render('yourscreenshot.png');
phantom.exit(0);
} else {
console.log("failed")
phantom.exit(1);
}
});
}
renderPage(myurl);
It only outputs
success
redirect caught
See my code, why GUILTY CODE part is not being executed ... Why renderPage(url) is not being called after redirect caught?
From my understanding phantomJS doesn't really handle redirects well. That may be your issue. You may want to test this in a different way. Or you can use another browser to perform these tests to confirm. Check out this git issue to see what I mean https://github.com/ariya/phantomjs/issues/10389.
I'm currently trying to write the page source code into a text file by a URL. Everything works well, but I want to additionally inject a JavaScript file. The problem is that the file does not include properly. Only the last pages that are loaded, but others are incomplete.
//phantomjs C:\PhantomJS\Script\test1.js
var fs = require('fs');
var numeroEpisode = 0;
var maxEpisode = 10;
var fichierLien = fs.read('C:\\PhantomJS\\Fichier\\lien.txt');
var ListeLien = fichierLien.split(/[\n]/);
var page = require('webpage').create();
function GetPage()
{
if (numeroEpisode > maxEpisode)
{
phantom.exit();
}
page.open(ListeLien[numeroEpisode], function(status)
{
if(status !== 'success')
{
console.log('Impossible de charger la page.');
}
else
{
console.log('URL: '+ListeLien[numeroEpisode]+'');
page.injectJs('http://mylink.com', function() { });
var path = 'C:\\PhantomJS\\Fichier\\episode_'+numeroEpisode+'.html';
fs.write(path, page.content, 'w');
setTimeout(GetPage, 15000); // run again in 15 seconds
numeroEpisode++;
}
});
}
GetPage();
Don't mix up page.injectJs() and page.includeJs().
injectJs(filename): Loads a local JavaScript file into the page and evaluates it synchronously.
includeJs(url, callback): Loads a remote JavaScript file from the specified URL and evaluates it. Since it has to request a remote resource, this is done asynchronously. The passed callback is called as soon as the operation finished. If you don't use the callback, your code will most likely run before the remote JavaScript was included. Use that callback:
page.includeJs('http://mylink.com', function() {
var path = 'C:\\PhantomJS\\Fichier\\episode_'+numeroEpisode+'.html';
fs.write(path, page.content, 'w');
numeroEpisode++;
setTimeout(GetPage, 15000); // run again in 15 seconds
});
Since the JavaScript that you load changes something on the page, you probably need to load it after all the pages script have run. If this is a JavaScript heavy page, then you need to wait a little. You can wait a static amount of time:
setTimeout(function(){
page.includeJs('http://mylink.com', function() {
//...
});
}, 5000); // 5 seconds
or utilize waitFor to wait until an element appears that denotes that the page is completely loaded. This can be very tricky sometimes.
If you still want to use injectJs() instead of includeJs() (for example because of its synchronous nature), then you need to download the external JavaScript file to your machine and then you can use injectJs().
Below is my code but page load continuously,i want to load only once
window.onload = function () {
window.location.reload();
}
There are a few ways you could solve this, all of which require saving state across page loads. You could use cookies, localStorage, the location object itself, etc.
Here's a way that checks to see if there is a hash string 'reloaded' and, if not, adds it and reloads the page. Then, when it tries to execute again, the hash will be there and it will not reload:
if (location.hash.indexOf('reloaded') === -1) {
location.hash += 'reloaded';
location.reload();
}
$(document).ready(function(){
if(document.URL.indexOf("#")==-1){ //Check if the current URL contains '#'
url = document.URL+"#"; // use "#". Add hash to URL
location = "#";
location.reload(true); //Reload the page
}
});
Due to the if condition page will reload only once.
The other way to achieve this is :
(function()
{
if( window.localStorage )
{
if( !localStorage.getItem('firstLoad') )
{
localStorage['firstLoad'] = true;
window.location.reload();
}
else
localStorage.removeItem('firstLoad');
}
})();
window.onload = function ()
{
// for getting params value
function parse(val)
{
var result = "not found";
tmp = [];
location.search
.substr(1)
.split("&")
.forEach(function (item) {
tmp = item.split("=");
if (tmp[0] === val) result = decodeURIComponent(tmp[1]);
});
return result;
}
if(parse("load")!="once")
{
//sending parameter so next time it won't reload..
window.location.href += "?load=once";
window.location.reload();
}
}
By nature of visiting a page, It will only load once. You could change your code to prove this fact:
window.onload = function () {
alert("Loaded");
}
But, I would suggest the vapor.js route to detecting page load, that is, omit this onload call, because the lines of code in the onload function run after the page is loaded. I think you either don't know what your goal is or you have an entirely different problem you are trying to solve in a way that does not make sense
You built a loop,
site is loading
window.onload is triggered
reload is initiaded
site is (re-)loading
window.onload is triggered
reload is initiaded
.......
.......
Important fact for you to learn is that browsers run through your code from top to bottom and when you reload your page, the whole prozess repeats.
So every Time you reload, the window.onload event-listener is registered and calls the function attached to it, as soon as the window object is fully loaded.
There is no mechanism that tells the browser to stop.
if you would like run your Javascript code once the DOM is loaded, and you are looking for an browser independent solution i would recommend jQuery and its $( document ).ready() function.
with jQuery included to your Page:
$( document ).ready(function(){
//Code inside this function runs after your document is loaded
})
I want to evaluate the javascript links on this website:
http://www.egypt.gov.eg
For example, I have this element:
<li class="language">
<a id="ctl00_btnLang" href="javascript:__doPostBack('ctl00$btnLang','')">English </a>
<img src="/CSS/images/langArwEn.gif" alt="Language Arrow" /></li>
So, I tried to use phantomjs to evaluate it, using this code:
var page = require('webpage').create();
var fs = require('fs'),
system = require('system');
page.open('http://www.egypt.gov.eg', function (status) {
if (status !== 'success') {
console.log('Unable to access network');
} else {
var p = page.evaluate(function () {
return document.getElementsByID('html')[0].innerHTML
});
fs.write('out.htm', p, 'w');
console.log(p);
}
phantom.exit();
});
but it does not evaluate the html, it just returns it as is. So is there a way to evaluate the whole page html using phantomjs, and is there a way to single out elements which have javascript to be evaluated one by one?
The first question is what do you mean by 'evaluate' the javascript links on the page... do you want PhantomJS to click on them?
Docs for page automation are here:
http://phantomjs.org/page-automation.html
To click on this specific link you could do:
var page = require('webpage').create();
page.open('http://www.egypt.gov.eg', function (status) {
if (status !== 'success') {
console.log('Unable to access network');
} else {
var p = page.evaluate(function () {
document.getElementById('ctl00_btnLang').click()
});
}
phantom.exit();
});
...but this is going to send the browser to a new url and I'm not sure what happens to your PhantomJS page object in that case.
Do you want to click all the javascript links on that page and grab the contents of each new HTML page that is loaded?
It seems like that would be easier with CasperJS "CasperJS is a navigation scripting & testing utility for PhantomJS"
http://docs.casperjs.org/
Have a look at the example code here which scrapes and follows links from an array of starting urls:
https://github.com/n1k0/casperjs/blob/master/samples/dynamic.js
I've understood from the docs that closing chrome extension popups when losing focus has been a design choice.
I'm working on an extension where the user chooses to save elements from a webpage. As he interacts with the main webpage I would like the popup to get updated but that's obviously not possible.
What's the proper way of handling this situation? (this is my first chrome extension)
You can have a content script detect the "save" action. Let's suppose it's a specific DOM element you know for sure it's going to be in the specific main, or that you create by yourself.
content.js
//content script
document.onreadystatechange = function () {
if (document.readyState == "complete") {
// Grab the UI frmo the mainpage you want to append the save functionality
var someElementsYouWantToAppendASaveButtonTo = document.getElementsByTagName("...");
var len = someElementsYouWantToAppendASaveButtonTo.length;
for (var i = 0; i < len; i++) {
// Create a UI save button to provide a functionality
var theSaveButton = document.createElement("button");
theSaveButton.value = "Save to Chrome Extension";
// Send data to extension when clicked
theSaveButton.addEventListener("click", function() {
var dataToSentToExtension = {...} // Retrieve from the clicked element, or whatever you want to save
chrome.extension.sendMessage(dataToSentToExtension, function(response) {
if(response.success) console.log("Saved successfully");
else console.log("There was an error while saving")
});
}, false);
someElementsYouWantToAppendASaveButtonTo[i].appendChild(theSaveButton)
}
}
}
Then, on the background, you detect the response and set up the popup as you wish.
background.js
chrome.extension.onMessage.addListener(function(request, sender, sendResponse) {
if(request.dataToSave) {
chrome.storage.local.set(dataToSave, function() {...});
// You can then set upn the proper popup for the next click or even switch to it
switch(request.popupToDisplay) {
case "awesomeDisplay":
chrome.browserAction.setPopup({...})
break;
}
var responseFromExtension = {success: true}
} else {
var responseFromExtension = {error: true}
}
});
It seems you are looking to modify\update your popup.html page in accord to changes in a web page. If so, use content scripts and establish connection for single message communication with background page(Since focus is lost every time) and update popup.html indirectly.
References:
Content Scripts
Background Page
Message Passing
Reference for communication between popup and background page apart from these,
there are bunch of questions on these topics, they will get you started..