Using the 'webpage' Phantom module in node.js - javascript

I am trying to wrap a PhantomJS script in a node.js process. The phantom script grabs a url from the arguments provided on the command line and outputs a pdf (much similar to the rasterize.js example included with the pahntom install).
The phantom script I have works fine, it's just my employer wants a node script if possible. No problem, I can use the node-phantom node module to wrap it.
But now I've hit a stumbling block, my phantom script has:
var page = require('webpage').create();
So, node.js is trying to find a module called 'webpage', the 'webpage' module is built into the phantom install so node can't find it. As far as I can tell, there is no npm module called 'webpage'.
'webpage' is used like this:
page.open(address, function (status) {
if (status !== 'success') {
// --- Error opening the webpage ---
console.log('Unable to load the address!');
} else {
// --- Keep Looping Until Render Completes ---
window.setTimeout(function () {
page.render(output);
phantom.exit();
}, 200);
}
});
where address is the url specified on the command line and output is another argument, the name and type of the file.
Can anyone help me out? This is quite an abstract one so I'm not expecting much if I'm honest, worth a try though.
Thanks.
EDIT - Approx 2hrs later
I now have this which throws out a PDF:
var phanty = require('node-phantom');
var system = require('system');
phanty.create(function(err,phantom) {
//var page = require('webpage').create();
var address;
var output;
var size;
if (system.args.length < 4 || system.args.length > 6) {
// --- Bad Input ---
console.log('Wrong usage, you need to specify the BLAH BLAH BLAH');
phantom.exit(1);
} else {
phantom.createPage(function(err,page){
// --- Set Variables, Web Address, Output ---
address = system.args[2];
output = system.args[3];
page.viewportSize = { width: 600, height: 600 };
// --- Set Variables, Web Address ---
if (system.args.length > 4 && system.args[3].substr(-4) === ".pdf") {
// --- PDF Specific ---
size = system.args[4].split('*');
page.paperSize = size.length === 2 ? { width: size[0], height: size[1], margin: '0px' }
: { format: system.args[4], orientation: 'portrait', margin: '1cm' };
}
// --- Zoom Factor (Should Never Be Set) ---
if (system.args.length > 5) {
page.zoomFactor = system.args[5];
} else {
page.zoomFactor = 1;
}
//----------------------------------------------------
page.open(address ,function(err,status){
if (status !== 'success') {
// --- Error opening the webpage ---
console.log('Unable to load the address!');
} else {
// --- Keep Looping Until Render Completes ---
process.nextTick(function () {
page.render(output);
phantom.exit();
}, 200);
}
});
});
}
});
But! It's not the right size! The page object created using the phantom 'webpage' create() function looks like this before it's passed the URL:
Whereas mine in my node script, looks like this:
Is it possible to hard code the properties to achieve A4 formatting? What properties am I missing?
I'm so close!

It should be something like:
var phantom=require('../node-phantom');
phantom.create(function(error,ph){
ph.createPage(function(err,page){
page.open(url ,function(err,status){
// do something
});
});
});
Your confusion here is because you want to reuse the same concepts and metaphors from your PhantomJS script. It does not work that way. I suggest that you spend some time studying the included tests of node-phantom, see https://github.com/alexscheelmeyer/node-phantom/tree/master/test.

Using https://github.com/sgentle/phantomjs-node I have made an A4 page in nodejs using phantom with the following code:
phantom.create(function(ph){
ph.createPage(function(page) {
page.set("paperSize", { format: "A4", orientation: 'portrait', margin: '1cm' });
page.open("http://www.google.com", function(status) {
page.render("google.pdf", function(){
console.log("page rendered");
ph.exit();
})
})
})
});
Side Note:
the page.set() function takes any variable that you would set in the rasterize.js example. See how paperSize is set above and compare it to the relevant lines in rasterize.js

Related

fabricjs or slimerjs, "all objects displayed" event?

I'm using slimerjs to render some html files. Each file contains a JSON string that gets loaded with a call to
fabricjsCanvas.loadFromJson(jsonString, fabricjsCanvas.renderAll.bind(fabricjsCanvas));
This is where I open my page
page.open(address, function (status) {
if (status !== 'success') {
console.log('Unable to load the page!');
phantom.exit(1);
} else {
page.render(output);
window.setTimeout(function () {
page.render(output);
phantom.exit();
}, 5000);
}
});
As you can see I had to set a timeout after which slimerjs closes the page saving what's on it. I really don't like this solution, coz I need to render multiple pages, some of them are very small, and could take less than 200 milliseconds, others are huge and could take more than 5000, so this is just bad for perfomances and isn't even a "safe solution" against page taking a long time to render. I tryid putting a console.log at the end of canvas.renderAll call and then add this piece of code to my slimerjs script
page.onConsoleMessage = function (msg) {
console.log(msg);
page.render(output);
phantom.exit();
};page.open(address, function (status) {
if (status !== 'success') {
console.log('Unable to load the address!');
phantom.exit(1);
}
});
I hoped that this would have helped but nothing really changed, the reanderAll finishes before all objects are displayed.
Is there some event I can catch, or something else I can do to prevent this?
You should use a callback page.onLoadFinished().
page.onLoadFinished = function(status, url, isFrame) {
console.log('Loading of '+url+' is a '+ status);
page.render();
};
This function run after full upload page
I managed to find a solution. First of all I changed my html template,since I only need to render per page i used a StaticCanvas instead of a normal canvas. Since a static canvas only has 2 frames to render(the top one and the secondary container, at least this is what I've learned in my experience) I added an event lister on after:render event, so after the second frame has been rendered I print a console message, at this point the page.onConsoleMessage gets called and closes the phantom process.
In this way I don't need to allow a standard amount of time that could be too much (loosing perfomances) or not enough (and the image would result blank).
this is the script in my html template
var framesRendered = 0;
var canvas = new fabric.StaticCanvas('canvas', {width: {{width}}, height: {{height}} });
canvas.setZoom({{{zoom}}});
canvas.on('after:render', function() {
if(framesRendered == 1)
console.log('render complete');
else framesRendered++;
});
canvas.loadFromJSON({{{data}}}, canvas.renderAll.bind(canvas), function (o, object) {
if (object.type === 'picturebox' && object.filters.length) {
object.applyFilters(function () {
canvas.renderAll();
});
}
});
and this is my slimerjs script
page.onConsoleMessage = function(){
page.render(output);
phantom.exit();
};
page.open(address, function (status) {
if (status !== 'success') {
console.log('Unable to load the address!');
phantom.exit(1);
}
});
I'll leave this one here in case someone needs it.

PhantomJS + jQuery -> Can't get image

PROBLEM: the function inside page.evaluate doesn't find any img (therefore, console.log(images.length) outputs 0); however, there are many images in the page, and some even have ids.
QUESTION: What's going on? Why $('img') doesn't find anything?
UPDATE 1: This is a <frame> problem. I had to switch to the frame in order to make the jQuery script correctly work.
DETAILS: I'm running a phantomjs script to access a webpage (link) and fetch all available images. It first saves a screenshot of the page just for comparison, and then it should through every <img> tag (using jQuery $('img')) and get the image dimensions and, using phantomjs's page.clipRect, it saves each image inside a folder.
var page = require('webpage').create();
var url = 'http://www.receita.fazenda.gov.br/pessoajuridica/cnpj/cnpjreva/cnpjreva_solicitacao.asp';
page.open(url, function (status) {
console.log("Status: " + status);
if (status === "success") {
page.render('example.png');
}
// Asynchronous call!
page.includeJs('http://ajax.googleapis.com/ajax/libs/jquery/2.1.3/jquery.min.js', function () {
console.log('\n Evaluate Page \n');
// Sandboxed
var images = page.evaluate(function () {
var images = [];
function getImgDimensions($i) {
return {
top: $i.offset().top,
left: $i.offset().left,
width: $i.width(),
height: $i.height(),
}
}
$('img').each(function () {
var img = getImgDimensions($(this));
images.push(img);
});
return images;
});
console.log(images.length);
images.forEach(function (imageObj, index, array) {
page.clipRect = imageObj;
page.render('images/' + index + '.png');
});
// Exit the session
phantom.exit();
});
});
I've looked at the site. The img that you want is inside of an iframe. You first need to switch to it.
Use for example:
page.switchToChildFrame(0);
to switch to the first child frame. Do this before you call page.includeJs().
If you want to do something in the parent page afterwards, you would have to change back with page.switchToParentFrame();.

How to avoid using a global variable, when closing an EventSource in internal links?

I have a web application, where some internal pages use an EventSource to receive live updates from the server.
The client code looks like this:
var LiveClient = (function() {
return {
live: function(i) {
var source = new EventSource("/stream/tick");
source.addEventListener('messages.keepalive', function(e) {
console.log("Client "+ i + ' received a message.');
});
}
};
})();
You can see a live demo on heroku: http://eventsourcetest.herokuapp.com/test/test/1. If you open the developer console, you will see a message printed every time an event is received.
The problem is that when visiting internal links, the EventSource remains open, causing messages to be printed even after the visitor moves from one page to another - so if you visit the three links on the top, you will get messages from three sources.
How can I close the previous connection after the user moves from one internal page to another?
A hacky workaround that I tried was to use a global variable for the EventSource object, like this:
var LiveClient = (function() {
return {
live_global: function(i) {
// We set source as global, otherwise we were left
// with sources remaining open after visiting internal
// pages
if (typeof source != "undefined" && source != null) {
if (source.OPEN) {
source.close();
console.log("Closed source");
}
}
source = new EventSource("/stream/tick");
source.addEventListener('messages.keepalive', function(e) {
console.log("Client "+ i + ' received a message.');
});
}
};
})();
Demo here: http://eventsourcetest.herokuapp.com/test/test_global/1, but I am looking for a solution that would avoid the use of a global variable if possible.
The HTML code that is generated is:
Page 1 |
Page 2 |
Page 3 |
<p>This is page 3</p>
<script>
$(function() {
LiveClient.live_global(3);
});
</script>
or with LiveClient.live_global(1); for the case with the global variable.
Try this. I haven't tested it. If it works, you might be able to replace LiveClient.source with this.source which is a lot cleaner imo.
var LiveClient = (function() {
return {
source: null,
live_global: function(i) {
// We set source as global, otherwise we were left
// with sources remaining open after visiting internal
// pages
if (typeof LiveClient.source != "undefined" && LiveClient.source != null) {
if (source.OPEN) {
source.close();
console.log("Closed source");
}
}
LiveClient.source = new EventSource("/stream/tick");
LiveClient.source.addEventListener('messages.keepalive', function(e) {
console.log("Client "+ i + ' received a message.');
});
}
};
})();

Testing Custom JS with Jasmine and Protractor

I'm trying to write Jasmine test code for a custom JS module(?) that cookies return visitors and redirects them to their cookie'd region.
The JS works fine, but it doesn't seem to be recognized by Jasmine / Protractor as those pages don't redirect as they should.
my spec.js file:
describe("cookies", function() {
var bridgepage = ('http://0.0.0.0:4567/');
var citypage = ('http://0.0.0.0:4567/nyc/');
// cookied visitors
it('should redirect to city pages for cookied visitors', function(){
browser.get(citypage);
browser.get(bridgepage)
browser.sleep(1000);
expect(browser.getCurrentUrl()).toEqual(citypage);
})
});
folder structure:
- source
-- javascripts
---- lib
-- all.js
-- cookies.js // file to test
- spec
-- javascripts
---- helpers
---- supoort
-- spec.js // specs
cookies.js file:
var cookies = (function() {
var utma = $.cookie("__utma");
var serviceArea = $.cookie('first_visit_service_area')
var isFirstVisit = function() {
if (utma == null) {
return false;
}
else {
var utmaArray = utma.split('.');
var firstVisitTime = utmaArray[2];
var currentVisitTime = utmaArray[4];
return (firstVisitTime == currentVisitTime);
}
}
[...]
// Set cookies according to (...)
// If the user has never visited the site, (...)
// If the user (...), (...)
if (utma == null) {
// user has apparently disabled cookies or blocked google analytics cookies
return;
}
if (window.location.pathname == '/' && serviceArea) {
redirectServiceArea();
}
if (isFirstVisit) {
setCookies();
}
}());
This my first attempt at writing JS tests, so any help would be much appreciated! :)

login to a webpage using phantomjs and Jquery

I am new to phantomjs, Java script and WebScraping in General. What I want to do is basic http authentication and then visit another URL to get some information. Here is what I have till now. Please tell me what I am doing wrong.
var page = require('webpage').create();
var system = require('system');
page.onConsoleMessage = function(msg) {
console.log(msg);
};
page.onAlert = function(msg) {
console.log('alert!!>' + msg);
};
page.settings.userName = "foo";
page.settings.password = "bar";
page.open("http://localhost/login", function(status) {
console.log(status);
var retval = page.evaluate(function() {
return "test";
});
console.log(retval);
page.open("http://localhost/ticket/" + system.args[1], function(status) {
if ( status === "success" ) {
page.injectJs("jquery.min.js");
var k = page.evaluate(function () {
var a = $("div.description > h3 + p");
if (a.length == 2) {
console.log(a.slice(-1).text())
}
else {
console.log(a.slice(-2).text())
}
//return document.getElementById('addfiles');
});
}
});
phantom.exit();
});
I am passing an argument to this file: a ticket number which gets appended to the 2nd URL.
I would recommend CasperJS highly for this.
CasperJS is an open source navigation scripting & testing utility written in Javascript and based on PhantomJS — the scriptable headless WebKit engine. It eases the process of defining a full navigation scenario and provides useful high-level functions, methods & syntactic sugar for doing common tasks such as:
defining & ordering browsing navigation steps
filling & submitting forms
clicking & following links
capturing screenshots of a page (or part of it)
testing remote DOM
logging events
downloading resources, including binary ones
writing functional test suites, saving results as JUnit XML
scraping Web contents
(from the CasperJS website)
I recently spent a day trying to get PhantomJS by itself to do things like fill out a log-in form and navigate to the next page.
CasperJS has a nice API purpose built for forms as well:
http://docs.casperjs.org/en/latest/modules/casper.html#fill
var casper = require('casper').create();
casper.start('http://some.tld/contact.form', function() {
this.fill('form#contact-form', {
'subject': 'I am watching you',
'content': 'So be careful.',
'civility': 'Mr',
'name': 'Chuck Norris',
'email': 'chuck#norris.com',
'cc': true,
'attachment': '/Users/chuck/roundhousekick.doc'
}, true);
});
casper.then(function() {
this.evaluateOrDie(function() {
return /message sent/.test(document.body.innerText);
}, 'sending message failed');
});
casper.run(function() {
this.echo('message sent').exit();
});

Categories

Resources