I have a webpage I want to print.
for that I am using phantomjs.
In the following example you will notice a strange effect.
non of the rotated text has been rendered correctly
I cant seem to get my head around how this can happen,
chrome is rendering this perfectly and phantomjs-webpage userAgent is set to webkit.
This is the link I'm trying to capture:
http://www.facegift.co.il/canvas/print.aspx?userItemId=27477&Sc=974088&pagenum=3&width=1500&print=2
This is the image that's been rendered:
and This is my code example:
var page = require('webpage').create();
var args = require('system').args;
var isLoad = false;
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36';
page.viewportSize = { width: 500, height: 687.5 };
page.open('http://www.facegift.co.il/canvas/print.aspx?userItemId=27477&Sc=974088&pagenum=3&width=1500&print=2', function (status) {
var ue = page.evaluate(function () {
return window.navigator.appVersion;
});
console.log("status: " + status);
console.log("ue: " + ue);
if (status === "success") {
var timer = setInterval(function () {
var ps = page.evaluate(function () {
document.body.bgColor = 'white';
return document.getElementById("pageStatus").innerHTML;
});
if (!isLoad) {
console.log("wait...");
if (ps == "loaded") {
clearInterval(timer);
console.log("loaded");
isLoad = true;
page.render('new/exam.jpg');
phantom.exit();
}
}
}, 10);
}
});
Your help is highly appreciated.
Thank you.
Related
I try to use webserver talk to outside world and setInterval want its execution automatically every sometimes.
One of my casperjs setting is this.capture(x + 'apple.png');
I though it will show three images under my folder if setInterval run three times.
As the result i only save one image is 1apple.png.
Although i can see a lots of info on my terminal
I want to ask what step should i miss it ? Any help would be appreciated.
Thanks in advance.
Here is my code today.js:
var webserver = require('webserver');
var server = webserver.create();
var service = server.listen('8080', {
'keepAlive': true
}, function (request, response) {
response.statusCode = 200;
response.write('<html><body>What the hell~~</body></html>');
var casper = require("casper").create({
verbose: true,
logLevel: 'debug', // debug, info, warning, error
pageSettings: {
loadImages: false,
loadPlugins: false,
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.4'
}
});
var movieTitle = [];
var movieEnTitle = [];
var title = [];
var movieTime = [];
var movieVersion = [];
var city = '台南';
var latitude = 22.993089;
var longitude = 120.196876;
var theaterName = '今日戲院';
var data = {};
data.theater = [];
data.movie = [];
var x =1;
function getMovieTitle() {
var title = document.querySelectorAll('div.theaterlist_name a');
return Array.prototype.map.call(title, function (e) {
return e.innerText;
});
};
function getEnTitle() {
var title = document.querySelectorAll('div.en a');
return Array.prototype.map.call(title, function (e) {
return e.innerText;
});
};
function getMovieTime() {
var title = document.querySelectorAll('ul.theater_time');
return Array.prototype.map.call(title, function (e) {
return e.innerText;
});
};
function getMovieVersion() {
var version = document.querySelectorAll('div.tapR');
return Array.prototype.map.call(version, function (e) {
return e.innerText;
});
};
// 台南 今日戲院 from 奇摩
casper.start('https://tw.movies.yahoo.com/theater_result.html/id=67', function () {
this.echo(this.getTitle());
});
casper.then(function () {
this.echo('Image');
this.echo(x);
this.capture(x + 'apple.png');
x++;
});
casper.then(function () {
movieTitle = this.evaluate(getMovieTitle);
movieEnTitle = this.evaluate(getEnTitle);
movieTime = this.evaluate(getMovieTime);
movieVersion = this.evaluate(getMovieVersion);
});
casper.then(function () {
console.log('Print:\n');
this.echo(movieTitle.length + ' Movie title found :\n');
this.echo(movieTitle.join('\n'));
this.echo(movieEnTitle.length + ' Movie title found :\n');
this.echo(movieEnTitle.join('\n'));
this.echo(movieTime.length + ' Movie time found :\n');
this.echo(movieTime.join('\n'));
this.echo(movieVersion.length + ' Movie version found :\n');
this.echo(movieVersion.join('\n'));
this.echo(outPutJSON());
});
function outPutJSON() {
data.theater.push({
name: theaterName,
city: city,
latitude: latitude,
longitude: longitude
});
// 將中英文名字合併
for (var i = 0; i < movieTitle.length; i++) {
title.push(movieTitle[i] + movieEnTitle[i]);
}
for (var i = 0; i < movieTime.length; i++) {
var name = title[i];
var sourceTime = movieTime[i].match(/.{1,5}/g);
var times = [];
times.push(sourceTime);
var version = movieVersion[i];
data.movie.push({
name: name,
version: version,
time: times
});
}
return JSON.stringify(data);
}
// casper.run(function () {
// // this.echo('Done').exit();
// this.echo('Done');
// });
setInterval(function () {
casper.run(function () {
// this.echo('Done').exit();
this.echo('Done');
});
}, 2000);
response.write(outPutJSON());
response.close();
});
Here is my folder when i command this file , you can see only capture image once 1apple.png.
One way to achieve what you want would be have a cron job scrape the site at the desired frequency and put the results in a directory served by a web server. Below is a stand-alone script that will fetch the title and capture the image of a site once per hour. Modifying this.step is probably unwise (inspiration from this site:
https://github.com/yotsumoto/casperjs-goto )
var casper = require('casper').create();
var x = 0;
casper.start('http://localhost:8080', function() {
// change 'http://localhost:8080' to the site to be scraped
this.echo(this.getTitle());
this.capture(x++ + '.png');
this.wait(60 * 60 * 1000, function() {
// 60 minutes * 60 seconds * 1000 milliseconds
this.step = 0;
});
});
casper.run();
I am going to run phantomjs in nodejs child process (spawn) to render webpage to picture. but when i set page.setting.javascriptEnabled = true and render page over three times(first time and second time are right), it will throw error. and if set javascriptEnabled = false, it runs well.
this is log in iterm
code:
var webpage = require('webpage');
var write = require('system').stdout.write;
var configMod = require('./config');
var config = configMod.get();
var args = require('system').args;
var phantomId = args[1];
function doJob(job) {
var page = webpage.create();
var jobId = job.id;
var url = job.url;
var viewportSize = job.viewportSize || config.viewportSize;
var clipRect = job.clipRect || config.clipRect;
var zoomFactor = job.zoomFactor || config.zoomFactor;
var imagePath = job.imagePath;
viewportSize && (page.viewportSize = viewportSize);
clipRect && (page.clipRect = clipRect);
zoomFactor && (page.zoomFactor = zoomFactor);
page.settings = {
javascriptEnabled: true,
loadImages: true,
resourceTimeout: 3000,
userAgent: 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) PhantomJS/2.1.0'
};
page.open(url, function (status) {
write(url + ':' + status)
var data;
var fetchObj;
if (status === 'fail') {
data = {
JOBID: jobId,
url: url,
phantomId: phantomId,
status: false
};
// release the memory
page.close();
// send data to NodeJS
write('{{begin}}' + JSON.stringify(data) + '{{end}}');
} else if (status === 'success') {
page.render(imagePath, {quality: job.quality});
data = {
JOBID: jobId,
url: url,
phantomId: phantomId,
image: imagePath,
status: true
};
// release the memory
page.close();
// send data to NodeJS
write('{{begin}}' + JSON.stringify(data) + '{{end}}');
}
});
}
I'm struggling to get a casperjs to move on to the next page after it has recursively worked through the links on the page.
I can get it to take data from each page and move through the pages, or click on each link on a page, but I can't get it doing both.
var utils = require('utils');
var x = require('casper').selectXPath;
var casper = require('casper').create({
verbose: true,
logLevel: 'error',
waitTimeout: 10000,
pageSettings: {
loadImages: false,
loadPlugins: false,
userAgent: 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36'
}
});
var currentPage = 1;
var i = 0;
var links = [];
var link_titles = [];
var terminate = function() {
this.echo("Exiting..").exit();
};
function getSelectedPage() {
var el = document.querySelector('td.cur');
return parseInt(el.textContent);
}
function getPageLinks () {
var links = document.querySelectorAll('h3.r a');
return [].map.call(links, function(link) {
return link.getAttribute('href');
});
}
function getLinkData(link) {
this.thenOpen(link, function() {
var title = this.getTitle();
// Add the data from link
var data = {
title: title,
};
link_titles.push(data);
});
}
function loopThroughLinks() {
if( i < links.length) {
this.echo('[LINK #' + i + '] '+ links[i]);
getLinkData.call(this, links[i]);
i++;
this.run(loopThroughLinks);
} else {
utils.dump(link_titles);
}
}
function linkData(){
links = this.evaluate(getPageLinks);
this.run(loopThroughLinks);
}
var processPage = function() {
this.run(linkData);
//PROBLEM EXISTS BELOW HERE - IF YOU COMMENT OUT FROM HERE IT RUNS AS EXPECTED FOR THE FIRST PAGE
//WITH CODE BELOW INCLUDED, SKIPS this.run(linkData) AND JUST GOES THROUGH PAGES;
this.then(function(){
if (currentPage >= 3) {
return terminate.call(casper);
}
currentPage++;
this.echo("requesting next page: " + currentPage);
this.capture("google-results-p" + currentPage + ".png");
this.thenClick('a.pn span').then(function(){
this.waitFor(function(){
return currentPage === this.evaluate(getSelectedPage);
}, processPage, terminate);
});
}); //COMMENT OUT TO HERE FOR WORKING ONE PAGE VERSION
}
casper.start('https://www.google.co.uk/?gws_rd=ssl#q=casperjs');
casper.run(processPage);
Updated code to reflect multiple run calls. Now looping through first page corrrectly, but printing results from first page for all other pages??
var utils = require('utils');
var x = require('casper').selectXPath;
var casper = require('casper').create({
verbose: true,
logLevel: 'error',
waitTimeout: 10000,
pageSettings: {
loadImages: false,
loadPlugins: false,
userAgent: 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36'
}
});
var currentPage = 1;
var i = 0;
var links = [];
var link_titles = [];
var terminate = function() {
this.echo("Exiting..").exit();
};
function getSelectedPage() {
var el = document.querySelector('td.cur');
return parseInt(el.textContent);
}
function getPageLinks() {
var links = document.querySelectorAll("h3.r a");
return Array.prototype.map.call(links, function(e) {
try {
// google handles redirects hrefs to some script of theirs
return (/url\?q=(.*)&sa=U/).exec(e.getAttribute("href"))[1];
} catch (err) {
return e.getAttribute("href");
}
});
}
function getLinkData(link) {
this.thenOpen(link, function() {
//var title = this.fetchText('title');
var title = this.getTitle();
// Add the staff data from link
var data = {
title: title,
};
link_titles.push(data);
this.then(function(){ ///ADDED - BACK TO RIGHT PAGE FOR SELECTOR
this.back();
});
});
}
function loopThroughLinks() {
if( i < links.length) {
this.echo('[LINK #' + i + '] '+ links[i]);
getLinkData.call(this, links[i]);
i++;
this.then(loopThroughLinks);
} else {
utils.dump(link_titles);
}
}
function linkData(){
links = this.evaluate(getPageLinks);
this.then(loopThroughLinks);
}
var processPage = function() {
this.wait(2000, function(){
this.then(linkData);
});
this.wait(2000, function(){
this.then(function(){
if (currentPage >= 3) {
return terminate.call(casper);
}
this.echo("requesting next page: " + currentPage);
this.capture("google-results-p" + currentPage + ".png");
currentPage++;
this.thenClick('a.pn span').then(function(){
this.capture('google-results-2-p' + currentPage + '.png');
this.waitFor(function(){
return currentPage === this.evaluate(getSelectedPage);
}, processPage, terminate);
});
});
});
}
casper.start('https://www.google.co.uk/?gws_rd=ssl#q=casperjs');
casper.then(processPage);
casper.run();
You have to have only one casper.run() (and only one casper.start()) call. run() starts the CasperJS step queue and will finish execution if there are no further steps. The only call that needs to stay is casper.run(processPage);, but all other this.run(...) calls need to be changed to this.then(...).
I don't have any idea why scrollToBottom() is not working. I want to keep scrolling to bottom of the page if the current data (from evaluate) is greater than previous data.
PhantomJS 2.0.0
CasperJS 1.1.0-beta3
var casper = require('casper').create({
verbose: true,
logLevel: 'info',
pageSettings: {
loadImages: false,
loadPlugins: false,
userAgent: 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36'
}
});
casper.options.waitTimeout = 60000;
var utils = require('utils');
var hospitals = [];
var prevTotalHospitals = 0;
var currentTotalHospitals = 0;
casper.start('https://www.docdoc.com/').thenClick("#form-submit-btn").then(function() {
this.wait(5000);
this.scrollToBottom();
processPage();
});
casper.on("remote.message", function(msg){
this.echo("remote> " + msg);
});
casper.on('step.error', function(err) {
this.die("Step has failed: " + err);
})
casper.on("page.error", function(msg, trace) {
this.echo("Error: " + msg, "ERROR");
});
casper.on('complete.error', function(err) {
this.die("Complete callback has failed: " + err);
});
casper.run(function(){
utils.dump(hospitals);
});
function getCurrentTotalHospitals(){
var resultsNodeList = document.querySelectorAll("div.results-list div.result");
return resultsNodeList.length;
}
function getDetails(){
var details = [];
var resultsNodeList = document.querySelectorAll("div.results-list div.result");
console.log("resultsNodeList.length " + resultsNodeList.length);
for (var i = 0; i < resultsNodeList.length; i++) {
var detail = {
"name" : resultsNodeList[i].querySelector("h2.link").textContent.replace(/\n/g, ''),
"country" : resultsNodeList[i].querySelector("h3.country").textContent.replace(/\n/g, ''),
"specialities" : resultsNodeList[i].querySelector("div.specialities").textContent.replace(/\n/g, ''),
"language" : resultsNodeList[i].querySelector("div.language").textContent.replace(/\n/g, '')
};
details.push(detail);
};
return JSON.stringify(details);
}
function stopScript() {
utils.dump(hospitals);
console.log("Exiting..." + hospitals.length);
casper.exit();
};
function processPage() {
currentTotalHospitals = casper.evaluate(getCurrentTotalHospitals);
console.log(currentTotalHospitals + " <> " + prevTotalHospitals);
if (currentTotalHospitals > prevTotalHospitals) {
prevTotalHospitals = currentTotalHospitals;
hospitals = hospitals.concat(casper.evaluate(getDetails));
casper.scrollToBottom();
casper.wait(5000);
processPage();
} else {
stopScript();
}
}
evaluate() is a synchronous function. Since it is used synchronously inside of processPage(), processPage() is also synchronous at the beginning. Later you're using wait() which is asynchronous. The processPage() that comes after wait() is executed immediately.
You can use it in this way:
if (...) {
...
casper.scrollToBottom();
casper.wait(5000, processPage);
} else {...}
By the way, the same is true for the first wait(). It should be:
this.wait(5000, function(){
this.scrollToBottom();
processPage();
});
I'm using phantomjs to capture screen of my webpage. I have SVG elements on my page and sometimes it doesn't render those correctly.
var page = require('webpage').create();
var args = require('system').args;
var isLoad = false;
page.settings.userAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90";
var w = parseFloat(args[4]);
var h = parseFloat(args[5]);
page.viewportSize = { width: w, height: h };
page.open('http://www.facegift.co.il/canvas/?userItemId=' + args[1] + '&Sc=' + args[2] + '&print=' + args[6] + '&width=' + args[4] + '&pageNum=' + args[3] + '&ver=' + args[7], function(status) {
var ue = page.evaluate(function(){
return navigator.userAgent;
});
console.log("status: " + status);
if(status === "success") {
setInterval(function(){
var ps = page.evaluate(function() {
document.body.bgColor = 'white';
return document.getElementById("pageStatus").innerHTML;
});
if(!isLoad){
console.log("wait...");
if(ps == "loaded"){
console.log("loaded");
isLoad = true;
page.render('example1.jpg');
console.log("rendered");
phantom.exit();
}
}
}, 100);
}
});
here are examples of two results with the same request:
here is the link to the actual page I want to render:
http://www.facegift.co.il/canvas/?userItemId=17887&sc=987404&print=1&width=4000&pageNum=7
to call phantom I use:
phantomjs facegift.js 17887 987404 7 4000 2048.77 1 2335
ok, fixed. the problem was in my code.
thanks to #Paul LeBeau who answered my next related question.
SVG disappear when zoom in on chrome
phantomjs and chrome are great. sorry I thought something went wrong on their behalf.