How to get HTML content using PhantomJS after X seconds? - javascript

How can I get the content of a website after 10 seconds using PhantomJS? On my website, for example, I have a script that calls setTimeout and then changes the DOM. I need to fetch the website's HTML with that change applied.
I can't find any working answers.

Try this code:
// Loads the page, waits 10 seconds so that scripts scheduled by the page
// (e.g. via setTimeout) can change the DOM, then saves the resulting HTML.
var page = require('webpage').create();
var fs = require('fs');
page.open('http://example.com', function(status) {
  console.log('Page load status: ' + status);
  if (status !== 'success') {
    // Nothing else ends the script on this path; without an explicit exit
    // PhantomJS would keep running after a failed load.
    phantom.exit(1);
    return;
  }
  setTimeout(function(){
    // page.content is read only after the delay, so it reflects the changed DOM.
    // Mode 'w' replaces the file; appending would stack the snapshots of
    // repeated runs into a single file.
    fs.write('example.html', page.content, 'w');
    console.log('Page saved');
    phantom.exit();
  }, 10000);
});

Related

Crawl webpages which are rendered via JavaScript. PhantomJS or any other tool?

I'm building a tool to crawl a page and store its html locally.
I also load that HTML on a webpage using an iframe, so I have to unbind and bind events on the crawled page.
I'm using PhantomJS to get the webpage data.
Web pages that render data via JavaScript are not feasible to crawl. Is there any way in PhantomJs to do so ?
Code to get web page data after page has loaded using PhantomJs is :
PHP code to shell execute phantomJs command
// Quote the URL before handing it to the shell: characters such as & ; or spaces
// in a URL would otherwise split or change the command line.
$shelldata = exec(PHATOM_JS_PATH."bin/phantomjs ".PHATOM_JS_PATH."/phantomcode.js ".escapeshellarg($WEB_URL)." > webpage.html 2>&1");
Sample 1
// Sample 1: print the page HTML from the load-finished handler.
var system = require('system');
var page = require('webpage').create();
var args = system.args;

// Prints the "Invalid" marker and stops PhantomJS.
function reportInvalid() {
  console.log("Invalid");
  phantom.exit();
}

// Prints the current page HTML and stops PhantomJS; the status is not inspected.
page.onLoadFinished = function (loadStatus) {
  console.log(page.content);
  phantom.exit();
};

if (args.length <= 1) {
  // No URL argument was passed on the command line.
  reportInvalid();
} else {
  page.open(args[1], function (openStatus) {
    // A successful open needs no action here; output happens in onLoadFinished.
    if (openStatus != "success") {
      reportInvalid();
    }
  });
}
Sample 2
// Sample 2: print the page HTML 200 ms after the open callback reports success.
var system = require('system');
var page = require('webpage').create();
var args = system.args;

var hasUrlArgument = args.length > 1;
if (!hasUrlArgument) {
  console.log("Invalid");
  phantom.exit();
} else {
  page.open(args[1], function (status) {
    if (status != "success") {
      console.log("Invalid");
      phantom.exit();
      return;
    }
    // Short pause after the load before the HTML is read and printed.
    setTimeout(function () {
      console.log(page.content);
      phantom.exit();
    }, 200);
  });
}
Instead of using setTimeout with 200 ms, I think it is better to inject a script that scrolls down the page you want to download; that is how you avoid problems with sites that use windowing.
Here is an example made with Puppeteer, written for Node:
const puppeteer = require("puppeteer");
const fs = require("fs");
const injectionPath = "scrollInjection.js";
/**
 * Asynchronously writes `content` to "./test/pageoutput" and logs the outcome:
 * the error object on failure, a confirmation message on success.
 * @param {string} content - text to store (the caller passes the page HTML).
 */
const writeContent = (content) => {
  const onWritten = (err) => {
    if (!err) {
      console.log("The file was saved!");
      return;
    }
    console.log(err);
  };
  fs.writeFile("./test/pageoutput", content, onWritten);
};
/**
 * Builds a promise that resolves (with no value) once `time` milliseconds
 * have elapsed on a setTimeout timer.
 * @param {number} time - delay in milliseconds.
 * @returns {Promise<void>}
 */
const delay = (time) =>
  new Promise((done) => {
    setTimeout(done, time);
  });
/**
 * Opens the target URL in a Puppeteer-controlled browser, injects the scrolling
 * script read from `injectionPath`, lets it scroll for the given time and then
 * hands the page HTML to `writeContent`.
 * The browser is closed even when one of the steps throws.
 */
let run = async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // wait for the page to be loaded
    await page.goto("URL TO GO TO");
    // inject the scrolling script; an injection failure is logged, not fatal
    await page
      .evaluate(fs.readFileSync(injectionPath, "utf8"))
      .catch(err => console.log(err));
    // let the injected script scroll down for x ms
    await delay("HOW MUCH TIME TO SCROLL DOWN THE PAGE ");
    // get page content
    const html = await page.content();
    writeContent(html);
  } finally {
    // Without this a failed navigation would leave the browser process
    // running and keep node from exiting.
    await browser.close();
  }
};
// Report a failed run instead of leaving an unhandled promise rejection.
run().catch(err => {
  console.log(err);
  process.exitCode = 1;
});
scrollInjection.js (the file that injectionPath points to)
// Scrolls the window to the current bottom of the document body.
function myTimer() {
  var pageBottom = document.body.scrollHeight;
  window.scrollTo(0, pageBottom);
}
// Repeat the scroll once per second; the interval handle is kept in `scroller`.
var scroller = setInterval(myTimer, 1000);

Having trouble downloading dynamic web content with PhantomJS

My goal is to download the dynamic web content of a website, so javascript is necessary to be executed on the received content. The code that I am currently using with PhantomJS 2.1 is the following:
// Question code: opens the page, injects jQuery into it and writes the HTML to test.html.
var page = require('webpage').create();
var fs = require('fs');
page.open('https://sports.bovada.lv/soccer/premier-league', function () {
// Once the open callback runs, load jQuery 1.6.1 from Google's CDN into the page.
page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function () {
page.evaluate(); // Edit: this line is removed
page.close();
});
});
// Load-finished handler: logs a message, saves the current HTML and exits with code 0.
// NOTE(review): page.content is read right when loading finishes, so markup that
// the page's own scripts add afterwards is presumably not captured - confirm.
page.onLoadFinished = function() {
console.log("Download finished");
fs.write('test.html', page.content, 'w');
phantom.exit(0);
};
The code is saving the received page as "test.html", but unfortunately it is not loading the full page content as it does with a web browser. I would appreciate if someone could help me out.
Website used for testing: https://sports.bovada.lv/soccer/premier-league
The issue could be that you're exiting too soon. Try delaying script termination:
// Load-finished handler: waits one second, then saves the HTML and exits.
page.onLoadFinished = function() {
  console.log("Download finished");
  setTimeout(function(){
    // Read page.content inside the timer. Reading it before the delay would
    // snapshot the DOM before the page's scripts had time to add the dynamic
    // content, and the delay would change nothing in the saved file.
    fs.write('test.html', page.content, 'w');
    phantom.exit(0);
  }, 1000);
};

How to open a specific URL in a session after login in PhantomJS

I'm logging in to Facebook with PhantomJS. Here is my code:
// Question code: log in to Facebook, screenshot the result, then open the profile page.
var page = require('webpage').create();
phantom.cookiesEnabled = true;
page.open("http://www.facebook.com/login.php", function(status) {
if (status === "success") {
// Runs inside the page: fill in the credentials and click the login button.
page.evaluate(function() {
document.getElementById("email").value = "email";
document.getElementById("pass").value = "pass";
document.getElementById("loginbutton").click();
});
// Wait 5 seconds, then save a screenshot and exit.
window.setTimeout(function() {
page.render("page.png");
phantom.exit();
}, 5000);
}
});
// NOTE(review): this second page.open is issued right after the first call returns,
// without waiting for the login callback above to run - likely the reason the
// profile request is not made from a logged-in session.
page.open("https://www.facebook.com/myprofil", function(status) {
window.setTimeout(function() {
page.render("profil.png");
phantom.exit();
}, 5000);
});
So far so good: the login process works fine, but I didn't find the right way to visit, for example, my profile URL in the same session. I tried to use page.open again, but I keep getting the login form for the page I requested, as if I had never logged in. What's the proper way to navigate within a session?
page.open is an asynchronous function. It takes some time to open a page. When it's done, the callback is called. If you're calling page.open twice in a row without waiting for the first call to be finished, you're essentially overwriting the first request, so only the second one will be executed.
You need to nest the calls:
// Logs in, screenshots the result, and only then opens the profile page from
// inside the first callback, so both requests run one after the other.
page.open("http://www.facebook.com/login.php", function(status) {
  if (status !== "success") {
    // No other code path ends the script when the login page fails to load.
    console.log("Unable to load the login page: " + status);
    phantom.exit(1);
    return;
  }
  // Runs inside the page: fill in the credentials and click the login button.
  page.evaluate(function() {
    document.getElementById("email").value = "email";
    document.getElementById("pass").value = "pass";
    document.getElementById("loginbutton").click();
  });
  // Give the login 5 seconds to complete before continuing.
  window.setTimeout(function() {
    page.render("page.png");
    page.open("https://www.facebook.com/myprofil", function(status) {
      // Give the profile page 5 seconds before the final screenshot.
      window.setTimeout(function() {
        page.render("profil.png");
        phantom.exit();
      }, 5000);
    });
  }, 5000);
});
Remember that PhantomJS exits as soon as phantom.exit() is called. So you should probably have a single phantom.exit() when your script is supposed to end.

Add/inject image to DOM via PhantomJS

I'm trying to dynamically add an image to the DOM of a loaded page, but it's not showing up when rendering the page.
In page.evaluate I modify the DOM like this (excerpt):
page.open(url, function(status) {
...
window.setTimeout(function () {
...
page.evaluate(function() {
...
var myimg = document.createElement("img");
myimg.setAttribute('src', 'http://www.foobar.com/fooimage.png');
myimg.setAttribute('height', '41px');
myimg.setAttribute('width', '80px');
outerdiv.appendChild(myimg); // outerdiv is visible in the rendered output
document.body.appendChild(outerdiv);
...
}
page.render
Debugging page.content shows that it's successfully added, but page.render does not show it (only the outerdiv it's appended to). Instead of using an external URL as src I also tried a base64-encoded string, with no luck. I also omitted the path and stored the file inside PhantomJS' include path. None of these three approaches seems to work.
I also have a window.timeout of 2000 so I don't think it's an issue of rendering the page before the PNG is loaded.
What would be the proper way to add the src? External URL, local file? Why isn't even adding a base64 encoded image working? Are any security limitations blocking what I'm trying to do? I'm running PhantomJS 1.9.0 btw.
I cannot reproduce your problem with this complete script.
// Renders example.com before and after an <img> element is appended to its body.
var page = require('webpage').create();
var url = "http://phantomjs.org/img/phantomjs-logo.png";

// Runs inside the page: appends an <img> whose src is the given address.
function appendImage(src) {
  var img = document.createElement("img");
  img.setAttribute('src', src);
  document.body.appendChild(img);
}

// Takes the second screenshot and ends the script.
function renderAndQuit() {
  page.render("with.png");
  phantom.exit();
}

page.open("http://example.com", function (status) {
  if (status !== 'success') {
    console.log('Unable to access network');
    phantom.exit();
    return;
  }
  page.render("without.png");
  page.evaluate(appendImage, url);
  // Wait 5 seconds after adding the image before rendering again.
  setTimeout(renderAndQuit, 5000);
});
I tried it with PhantomJS 1.9.0 and 1.9.8 with exactly the same result.
I came across the same problem. When I am using page.evaluate() to add images, they are not getting loaded, but content is getting changed.
If I do the same with page.content, images are getting loaded and working as expected.
// Demo: replace the whole document through page.content and render it from
// the load-finished handler.
var page = require('webpage').create();
// Logs a marker and renders the page to evaluateTesting.png when a load finishes.
page.onLoadFinished = function() {
console.log('');
console.log('load finished');
var render = page.render('evaluateTesting.png');
}
page.open('about:blank', function(status) {
console.log('');
console.log('open callback');
// the author observes that the page reloads after this assignment
page.content = '<!DOCTYPE html><head></head><body><img src="image path"></img></body>';
/*
* Alternative that was tried: per the author this does not cause a page reload,
* although the content does change
page.evaluate(function() {
console.log('');
console.log('evaluating');
var body = document.getElementsByTagName('body');
body = body[0];
body.innerHTML = '<img src="file:///home/ravitejay/projects/testappDup/sample.png"></img><br>';
})*/
// Print the HTML as it is right after the assignment above.
console.log('content : '+page.content);
})

On PhantomJS I can't include jQuery and without jQuery I can't post form data

I am having trouble running jQuery in PhantomJS. I have found this answer, which explains that no outer variables are available inside the evaluate function, but that question is about a node module, and in my example I only call console.log inside the evaluate function. I have posted this question on GitHub too.
Previously, for some pages, the following evaluate code didn't execute. Now that #b1f56gd4 has provided some help, it now prints messages; I can't execute it but now I can see this:
The page at https://login.yahoo.com/ ran insecure content from http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js.
I can't load jQuery from different domain and the --local-to-remote-url-access=true or --web-security=false options make no difference.
I will try load jQuery locally. Here is the code:
// Question code: open the Yahoo login page, load jQuery into it and fill in the form.
console.log('Loading a web page');
var url = 'https://login.yahoo.com/';
var page = require('webpage').create();
console.log('Setting error handling');
// Forward console messages from the page to the PhantomJS console.
page.onConsoleMessage = function (msg) {
console.log(msg);
};
// Print the error message and each trace entry (file and line), then exit.
page.onError = function (msg, trace) {
console.log(msg);
trace.forEach(function(item) {
console.log(' ', item.file, ':', item.line);
})
phantom.exit();
}
console.log('Error handling is set');
console.log('Opening page');
page.open(url, function (status) {
if (status != 'success') {
console.log('F-' + status);
} else {
console.log('S-' + status);
//-------------------------------------------------
var jsLoc = '';
jsLoc = 'jquery.min.js'; // to load local
//jsLoc = 'http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js'; // to load remote
// Fills in the login form inside the given page and renders it to ystsA.png.
var func = function(pg){
console.log('Function called');
console.log('Page evaluating');
console.log(pg);
pg.evaluate(function() {
console.log('Page evaluate started');
//---
var loginVar = 'ih5d4hf65465fd45h6#yahoo.com.br';
var pwdVar = 'itsmypass_445f4hd564hd56f46s';
//---
// NOTE(review): these lines assign a `value` property on the object returned
// by $(); jQuery's own setter is .val() - confirm the intent.
$("#login_form #username").value = loginVar;
$("#login_form #passwd").value = pwdVar;
//---
});
console.log('Rendering');
pg.render('ystsA.png');
console.log('Rendered');
}
// NOTE(review): this check runs in the PhantomJS script, not inside the page;
// nothing in this script defines jQuery, so this branch is presumably always taken.
if (typeof jQuery == 'undefined') {
console.log('JQuery Loading'); // <<<<==== Execute only until here
console.log('Source:['+jsLoc+']');
var rs = page.includeJs(jsLoc, function() // <<<<===== Fail here, jsLoc was changed to load locally and after tried remotely, i tried use page.injectJs but fail too
{
console.log('JQuery Loaded'); // <<<< ===== Never reach here, no matter if loading local or remote script in include above
func(page);
});
page.render('ystsB.png');
} else {
console.log('JQuery Already Loaded');
func(page);
page.render('ystsC.png');
}
//-------------------------------------------------
}
// NOTE(review): this runs as soon as the open callback reaches its end, while
// includeJs reports completion through its callback - the script has likely
// exited before 'JQuery Loaded' can be logged.
phantom.exit();
});
After reading #g4d564w56's answer I did everything without jQuery. Now I can fill in the text boxes, but I can't click the button to post the login form.
See the new code:
// Question code, second attempt: fill in and submit the Yahoo login form with plain DOM calls.
console.log('Loading a web page');
var url = 'https://login.yahoo.com/';
var page = require('webpage').create();
console.log('Setting error handling');
// Forward console messages from the page to the PhantomJS console.
page.onConsoleMessage = function (msg) {
console.log(msg);
};
// Print the error message and each trace entry (file and line), then exit.
page.onError = function (msg, trace) {
console.log(msg);
trace.forEach(function(item) {
console.log(' ', item.file, ':', item.line);
})
phantom.exit();
}
console.log('Error handling is set');
console.log('Opening page');
page.open(url, function (status) {
if (status != 'success') {
console.log('F-' + status);
} else {
console.log('S-' + status);
//-------------------------------------------------
// jsLoc is assigned but not used anywhere in this version.
var jsLoc = '';
jsLoc = 'jquery.min.js'; // to load local
//jsLoc = 'http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js'; // to load remote
// Fills in and submits the login form inside the given page, then renders it to ystsA.png.
var act01 = function(pg){
console.log('Function called');
console.log('Page evaluating');
console.log(pg);
pg.evaluate(function() {
// Shorthand for document.getElementById inside the page.
var getElmById = function(id){
return document.getElementById(id);
}
console.log('Page evaluate started');
//---
var loginVar = 'ih5d4hf65465fd45h6#yahoo.com.br';
var pwdVar = 'itsmypass_445f4hd564hd56f46s';
//---
getElmById("username").value = loginVar;
getElmById("passwd").value = pwdVar;
getElmById("login_form").submit(); /// <<<<==== now its dont work !!!
//---
});
console.log('Rendering');
pg.render('ystsA.png');
console.log('Rendered');
}
act01(page);
//-------------------------------------------------
}
// NOTE(review): the script exits right after act01 returns, so the page that the
// form submit navigates to presumably never gets a chance to load - confirm.
phantom.exit();
});
I know this question was already answered about a year ago, but the answer didn't really address the issue. The reason for the error below:
"The page at https://login.yahoo.com/ ran insecure content from
http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js."
Is that the login page is an https page and you're trying to load an http resource. If you change the url to https://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js this error will go away. Took a while to figure that out.
A working version using google search.
// Two-step Google search: the first finished load submits the query, the
// second one prints the result links. phantom.state records which step is next.
var page, doSearch, displayResults;
page = require('webpage').create();

// Step 1: type the query into the search box, submit the form, take a screenshot.
doSearch = function() {
  console.log('Searching...');
  page.evaluate(function() {
    $("input[name=q]").val('what is phantomjs');
    $("form").trigger('submit');
    return true;
  });
  page.render('phantomjs-searching.png');
};

// Step 2: log "<n>: <title>:  // <href>" for every result link, take a screenshot.
displayResults = function() {
  console.log('Results...');
  page.evaluate(function() {
    $('h3 a').each(function(i) {
      console.log([i + 1, $(this).text(), ' // ' + $(this).attr('href')].join(': '));
    });
    return true;
  });
  page.render('phantomjs-results.png');
};

// Runs after every finished load; injects jQuery and dispatches to the current step.
page.onLoadFinished = function(status) {
  if (status === 'success') {
    // jQuery is fetched over https: an http script included into an https page
    // is reported as insecure content.
    page.includeJs('https://ajax.googleapis.com/ajax/libs/jquery/1.10.2/jquery.min.js', function() {
      if (!phantom.state) {
        doSearch();
        phantom.state = 'results';
      } else {
        displayResults();
        phantom.exit();
      }
    });
  } else {
    console.log('Connection failed.');
    phantom.exit();
  }
};

// Forward console messages from the page to the PhantomJS console.
page.onConsoleMessage = function(msg) {
  console.log(msg);
};

page.open('http://google.com');
Try the next code from http://snippets.aktagon.com/snippets/534-How-to-scrape-web-pages-with-PhantomJS-and-jQuery. It loads a local copy of jQuery, but can also use the jQuery instance loaded by the requested page.
var page = new WebPage(),
url = 'http://localhost/a-search-form',
stepIndex = 0;
/**
* From PhantomJS documentation:
* This callback is invoked when there is a JavaScript console message on the web page.
* The callback may accept up to three arguments:
* the string for the message, the line number, and the source identifier.
*
* Only the message is used here; it is printed with a 'console> ' prefix.
*/
page.onConsoleMessage = function (msg, line, source) {
console.log('console> ' + msg);
};
/**
* From PhantomJS documentation:
* This callback is invoked when there is a JavaScript alert. The only argument passed to the callback is the string for the message.
*
* The message is printed with an 'alert!!> ' prefix.
*/
page.onAlert = function (msg) {
console.log('alert!!> ' + msg);
};
// Handler executed each time a page load finishes (the initial open and the
// reload caused by the form submit). onLoadFinished is used because the
// callback passed to page.open is not invoked again for later loads in newer
// PhantomJS releases, which would leave step 2 unreachable.
page.onLoadFinished = function (status) {
  if (status !== 'success') {
    // Nothing else ends the script when a load fails.
    console.log('Unable to load page, status: ' + status);
    phantom.exit(1);
    return;
  }
  // State is initially empty. State is persisted between page loads and can be used for identifying which page we're on.
  console.log('============================================');
  console.log('Step "' + stepIndex + '"');
  console.log('============================================');

  // Inject jQuery for scraping (you need to save jquery-1.6.1.min.js in the same folder as this file).
  // injectJs reports failure through its return value; the steps need `$`.
  if (!page.injectJs('jquery-1.6.1.min.js')) {
    console.log('Unable to inject jquery-1.6.1.min.js');
    phantom.exit(1);
    return;
  }

  // Our "event loop"
  if (!phantom.state) {
    initialize();
  } else {
    phantom.state();
  }

  // Save screenshot for debugging purposes
  page.render("step" + stepIndex++ + ".png");
};
page.open(url);
// Step 1: fill in and submit the search form, then register the next step.
function initialize() {
  // Runs inside the page.
  var submitSearch = function () {
    $('form#search input.query').val('Jebus saves');
    $('form#search').submit();
    console.log('Searching...');
  };
  page.evaluate(submitSearch);

  // phantom.state survives page reloads, so it is used to remember the
  // handler for the search results, i.e. the next step.
  var nextStep = parseResults;
  phantom.state = nextStep;
}
// Step 2: log the href of every search-result link, then end the script.
function parseResults() {
  // Runs inside the page.
  var logResultLinks = function () {
    var logHref = function (index, link) {
      var href = $(link).attr('href');
      console.log(href);
    };
    $('#search-result a').each(logHref);
    console.log('Parsed results');
  };
  page.evaluate(logResultLinks);

  // A 3rd step could be registered here instead, but the page would have to
  // load again for it to be called.
  phantom.exit();
}
There is a well-known bug where PhantomJS can't load jQuery, which makes it hard to post form data to a server, but you can still select elements using only querySelectorAll, as in this example: how to scrape links with phantomjs
#lmeurs' answer is very good but is not functional.
I used that answer to create something functional for you :) .
var page = new WebPage();
var url = 'http://br.search.yahoo.com';
var stepIndex = 0;
page.onConsoleMessage = function (msg, line, source) { console.log('console> ' + msg); };
page.onAlert = function (msg) { console.log('alert!!> ' + msg); };
// Renders the page to "step<stepIndex>.png", logging before and after.
function takeShot(){
  var shotFile = "step" + stepIndex + ".png";
  console.log("TakingShot");
  page.render(shotFile);
  console.log("ShotTake");
}
// Step 0: type the query into the form's text field and trigger its submit control.
function step0() {
  // Runs inside the page.
  var fillAndSubmit = function () {
    var query = 'its now sunday searching it';
    $("form [type='text']").val(query);
    $("form [type='submit']").submit();
  };
  console.log("step 00 enter");
  page.evaluate(fillAndSubmit);
  console.log("step 00 exit");
}
// Step 1: log the href of every search-result link, then end the script.
function step1() {
  // Runs inside the page.
  var logResultLinks = function () {
    var logHref = function (index, link) {
      var href = $(link).attr('href');
      console.log(href);
    };
    $('#search-result a').each(logHref);
  };
  console.log("step 01 enter");
  page.evaluate(logResultLinks);
  console.log("step 01 exit");
  phantom.exit();
}
// Step functions in execution order; exactly one step runs per finished page load.
var steps = [step0, step1];

// Runs after every finished load: step0 on the initial page, step1 on the page
// that the form submit navigates to. Driving the steps from load events
// replaces the former `while(true)` loop, which never terminated and ran
// step1 without waiting for the search results to load.
page.onLoadFinished = function (status) {
  if (status !== 'success') {
    // Nothing else ends the script when a load fails.
    console.log("Unable to load page, status: " + status);
    phantom.exit(1);
    return;
  }
  if (stepIndex >= steps.length) {
    // All steps have already run (step1 requested the exit); ignore late loads.
    return;
  }
  // Every freshly loaded page needs its own copy of jQuery for the steps' `$`.
  page.injectJs('jquery-1.6.1.min.js');
  console.log("Step["+stepIndex+"] starting on ["+new Date()+"]");
  steps[stepIndex]();
  takeShot();
  stepIndex++;
};

console.log("[- STARTING -]");
page.open(url);

Categories

Resources