screenshot.js
// Load a single page, save a screenshot of it, and exit.
var page = require("webpage").create();
var homePage = "http://www.google.com/";

// Register the handler before calling page.open() so that it is already in
// place when the load completes.
page.onLoadFinished = function (status) {
    console.log("Status: " + status);
    console.log("Loaded: " + page.url);
    if (status !== "success") {
        // Nothing useful to render; report the failure through the exit code.
        console.log("Unable to load " + homePage);
        phantom.exit(1);
        return;
    }
    page.render("google.png");
    phantom.exit();
};

page.open(homePage);
Terminal:
bin/phantomjs screenshot.js
Question:
Is there any way to pass the URL (the value of `homePage` above) to PhantomJS from outside screenshot.js, so that it's not hard-coded inside the script?
Add the URL to the command line:
bin/phantomjs screenshot.js http://www.google.com/
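Inside the script, read it from `system.args` (index 0 is the script name, so the URL is `system.args[1]`). Here is an example from the docs: https://github.com/ariya/phantomjs/blob/master/examples/arguments.js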
// Print every command-line argument with its index, or a hint when the
// script was invoked without any extra arguments.
var system = require('system');
var argv = system.args;
var index;

if (argv.length !== 1) {
    for (index = 0; index < argv.length; index += 1) {
        console.log(index + ': ' + argv[index]);
    }
} else {
    // Only one entry is present, so no extra arguments were supplied.
    console.log('Try to pass some args when invoking this script!');
}

phantom.exit();
Related
I'm having a hard time converting this code so that it is usable in a Node server. The code is written to run in a PhantomJS process (i.e. $: phantomjs index.js), but I want to run it in a Node server using the package require("phantom"). However, I'm having trouble getting these two callbacks to work.
// Log a message whenever the page URL changes.
page.onUrlChanged = function onUrlChanged() {
    console.log("URL Changed");
};

// Log a message whenever a page load completes.
page.onLoadFinished = function onLoadFinished(status) {
    console.log("Load Finished");
};
Here is my pathetic attempt at trying to nodefy the whole situation.
// Attempt to port the PhantomJS login script to Node using the "phantom" package:
// create an instance, create a page, open the login URL and submit the form.
phantom.create(['--ignore-ssl-errors=yes','--load-images=no']).then(function(ph) {
console.log("here");
ph.createPage().then(function(page) {
// Log the URL of every resource the page requests.
page.property('onResourceRequested', function(requestData, networkRequest) {
console.log(requestData.url);
});
page.open('https://example.com/login').then(function(status) {
console.log(status);
if (status !== 'success') { console.log("failed connection")} else {
// Fill in the login form from inside the page.
page.evaluate(function() {
document.getElementById('email').value = "stuff";
document.getElementById('password').value = "things";
// NOTE(review): click() is invoked right here and its return value is what
// gets passed to setTimeout, so the click is not delayed by 5000 ms.
setTimeout(document.getElementsByTagName('button')[0].click(),5000);
console.log("login attempt");
// NOTE(review): this passes the document.URL string, not a function, to setTimeout.
setTimeout(document.URL, 2000);
});
// NOTE(review): these two handlers are assigned as plain properties, and only
// after the page has already been opened; onResourceRequested above is
// registered through page.property() instead - confirm which form the
// "phantom" package expects.
page.onLoadFinished = function(status){
console.log("Load Finished");
};
page.onUrlChanged = function(){
console.log("url changed");
};
}
});
});
});
Also the code works and gets the page and clicks the button, however the problem is after the phantom logs in, I need data from the next page which I was going to use the onUrlChanged and onLoadFinished to do.
page.onLoadFinished and page.onUrlChanged are callback functions that are executed after a page has been opened, so it makes sense to assign them before opening a URL.
It is also a useful habit to subscribe to console.log and error messages from a webpage.
// Log in to a site through the "phantom" Node package, with every page
// callback registered before the URL is opened.
var phantom = require('phantom');

phantom.create(['--ignore-ssl-errors=yes', '--load-images=no']).then(function (ph) {
    console.log("here");
    ph.createPage().then(function (page) {
        // Report JavaScript errors raised inside the page, with their stack trace.
        page.property('onError', function (msg, trace) {
            var msgStack = ['ERROR: ' + msg];
            if (trace && trace.length) {
                msgStack.push('TRACE:');
                trace.forEach(function (t) {
                    msgStack.push(' -> ' + t.file + ': ' + t.line + (t.function ? ' (in function "' + t.function + '")' : ''));
                });
            }
            console.error(msgStack.join('\n'));
        });

        // Relay console.log() calls made by the page itself.
        page.property('onConsoleMessage', function (msg, lineNum, sourceId) {
            console.log('CONSOLE: ' + msg + ' (from line #' + lineNum + ' in "' + sourceId + '")');
        });

        page.property('onResourceRequested', function (requestData, networkRequest) {
            console.log(requestData.url);
        });

        page.property('onLoadFinished', function (status) {
            console.log("Load Finished with status " + status);
        });

        page.property('onUrlChanged', function (targetUrl) {
            console.log("URL changed to: " + targetUrl);
        });

        page.open('https://example.com/login').then(function (status) {
            if (status !== 'success') {
                console.log("failed connection");
                // Shut the PhantomJS process down instead of leaving it running.
                ph.exit();
            } else {
                page.evaluate(function () {
                    document.getElementById('email').value = "email";
                    document.getElementById('password').value = "password";
                    // Pass a function so that the click really happens after the delay.
                    setTimeout(function () {
                        console.log("login attempt");
                        document.getElementsByTagName('button')[0].click();
                    }, 5000);
                });
            }
        });
    });
});
In trying to get a hang of node.js asynchronous coding style, I decided to write a program that would read a text file containing a bunch of URLS to download and download each file. I started out writing a function to download just one file (which works fine), but having trouble extending the logic to download multiple files.
Here's the code:
// Read a text file of URLs (one per line, path given as the first command-line
// argument) and queue every line that looks like a URL for download.
var http = require("http"),
    fs = require("fs"),
    input = process.argv[2],
    folder = "C:/Users/Wiz/Downloads/",
    regex = /(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?/,
    urls = null,
    url = "",
    filename = "";

fs.readFile(input, "utf8", function (e, data) {
    console.log("Reading file: " + input);
    if (e) {
        console.log("Got error:" + e.message);
        // data is undefined when the read failed, so there is nothing to queue.
        return;
    }
    // Accept both LF and CRLF line endings.
    urls = data.split(/\r?\n/);
    for (var i = urls.length; i--;) {
        url = urls[i].trim();
        if (!url.match(regex)) continue;
        // The destination file name is whatever follows the last "/" in the URL.
        filename = folder + url.substring(url.lastIndexOf('/') + 1);
        downloadQueue.addItem(url, filename);
    }
});
// Sequential download queue: only the item at the head of the queue is being
// downloaded; when it completes (or fails) it is removed and the next starts.
var downloadQueue = {
    queue: [],

    // Append a download; start it right away when the queue was empty.
    addItem: function (p_sSrc, p_sDest) {
        this.queue.push({
            src: p_sSrc,
            dest: p_sDest
        });
        if (this.queue.length === 1) {
            this.getNext();
        }
    },

    // Download the item at the head of the queue into its destination file.
    getNext: function () {
        var l_oItem = this.queue[0];
        var settled = false;

        // Called once per item, with an Error on failure or null on success.
        // Always advances the queue so that one bad URL cannot stall the rest.
        function finish(error) {
            if (settled) {
                return;
            }
            settled = true;
            if (error) {
                console.log("Error: " + error.message);
                // Remove any partial file; a missing file is not a problem here.
                fs.unlink(l_oItem.dest, function () {
                    downloadQueue.removeItem();
                });
                return;
            }
            console.log("Download complete.");
            downloadQueue.removeItem();
        }

        http.get(l_oItem.src, function (response) {
            console.log("Downloading: " + l_oItem.dest);
            var file = fs.createWriteStream(l_oItem.dest);
            // "finish" fires once everything has been flushed to the file;
            // pipe() ends the file stream itself when the response ends.
            file.on("finish", function () {
                finish(null);
            });
            file.on("error", finish);
            response.on("error", function (error) {
                file.destroy();
                finish(error);
            });
            response.pipe(file);
        }).on("error", finish);
    },

    // Drop the head of the queue and start the next download, if any.
    removeItem: function () {
        this.queue.splice(0, 1);
        if (this.queue.length != 0) {
            this.getNext();
        } else {
            console.log("All items downloaded");
        }
    }
};
How do I structure the code so that the completion of the first download can signal the initiation of the next one. Please note that this exercise is just for learning purposes, to understand how asynchronous coding works. In practice, I'm sure there are much better tools out there to download multiple files.
Try something simple first; it looks like you copied and pasted code without quite understanding what it does.
Write a simple loop that gets the URL and prints something.
var http = require('http');

// Request options shared by all queries. Declared with `var` under its own
// name so the built-in global `URL` is not overwritten.
var requestOptions = require('url').parse('http://www.timeapi.org/utc/now?format=%25F%20%25T%20-%20%25N');
requestOptions.headers = {'User-Agent': 'Hello World'};

// Start one query and log its progress; `i` identifies it in the output.
function launchQuery(i) {
    console.log('Query ' + i + ' started');
    var req = http.request(requestOptions, function (res) {
        console.log('Query ' + i + ' status: ' + res.statusCode + ' - ' + res.statusMessage);
        // A body can arrive in several chunks, so collect them and report
        // only once the response has really ended.
        var chunks = [];
        res.on('data', function (chunk) {
            chunks.push(chunk);
        });
        res.on('end', function () {
            console.log('Query ' + i + ' ended - ' + Buffer.concat(chunks));
        });
    });
    req.on('error', function (err) {
        console.log('Query ' + i + ' return error: ' + err.message);
    });
    req.end();
}

// launch 20 queries asynchronously
for (var i = 0; i < 20; i++) {
    launchQuery(i);
}
All the URLs are fetched asynchronously. You can observe that the responses do not arrive in order, but they are still processed correctly.
The difficulty with async code is not doing things in parallel: you write the code for a single task and execute it multiple times. It becomes complicated when you need, for instance, to wait for all tasks to finish before continuing. For that, have a look at promises.
Here is what I started out with. Figuring that each download was invoked asynchronously, they would all be independent of each other.
// Read a text file of URLs (path given as the first command-line argument)
// and start downloading all of them at once, each into its own file.
var http = require("http"),
    fs = require("fs"),
    input = process.argv[2],
    folder = "C:/Users/Wiz/Downloads/",
    regex = /(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?/,
    urls = null,
    url = "",
    filename = "";

// Download one URL into one file. Taking the destination as a parameter gives
// every request its own copy of it; reading the shared `filename` variable
// from inside the response callback would make every response use whatever
// name was computed last.
function download(src, dest) {
    http.get(src, function (response) {
        // pipe() ends the file stream when the response ends.
        response.pipe(fs.createWriteStream(dest));
    }).on("error", function (error) {
        console.log("Got error:" + error.message);
    });
}

fs.readFile(input, "utf8", function (e, data) {
    console.log("Reading file: " + input);
    if (e) {
        console.log("Got error:" + e.message);
        // data is undefined when the read failed.
        return;
    }
    // Accept both LF and CRLF line endings.
    urls = data.split(/\r?\n/);
    for (var i = urls.length; i--;) {
        url = urls[i].trim();
        if (!url.match(regex)) continue;
        filename = folder + url.substring(url.lastIndexOf('/') + 1);
        download(url, filename);
    }
});
I used this example to create a phantomjs code to login to website.
var page = require('webpage').create();

// Relay console output produced inside the page to the PhantomJS console.
function forwardConsole(msg, lineNum, sourceId) {
    console.log('CONSOLE: ' + msg + ' (from line #' + lineNum + ' in "' + sourceId + '")');
}

// Executed inside the page: fill in the login form and submit it.
function submitLoginForm() {
    console.log('hello');
    document.getElementById("email").value = "email";
    document.getElementById("pass").value = "password";
    document.getElementById("u_0_1").click();
    // page is redirecting.
}

// Executed after the fixed delay: take the screenshot and quit.
function captureAndExit() {
    page.evaluate(function () {
        console.log('haha');
    });
    page.render("page.png");
    phantom.exit();
}

page.open("http://www.facebook.com/login.php", function (status) {
    if (status !== "success") {
        return;
    }
    page.onConsoleMessage = forwardConsole;
    page.evaluate(submitLoginForm);
    setTimeout(captureAndExit, 5000);
});
From this link.
https://gist.github.com/ecin/2473860
But I want to open another link from a button or go directly on it. How can I do it?
Here is a simpler example. Doesn't work...
// Tries to screenshot two sites with a single page object.
var page = require('webpage').create();
// NOTE(review): this URL has no protocol prefix (e.g. "http://").
var url = "www.example.com";
page.open(url, function (status) {
setTimeout(function () {
page.evaluate(function () {
console.log('haha');
});
page.render("example.png");
phantom.exit();
}, 5000);
});
// NOTE(review): `url` is redeclared and the second page.open() below is issued
// on the same page object straight away, without waiting for the first open's
// callback; each callback also calls phantom.exit() on its own.
var url = "www.google.com";
page.open(url, function (status) {
setTimeout(function () {
page.evaluate(function () {
console.log('haha');
});
page.render("google.png");
phantom.exit();
}, 5000);
});
Very close, now combine your two snippets into one. page.open() is asynchronous which is why you need to open the next page only after the first one has finished:
// Log in on the first page, screenshot it, then open a second page with the
// same page object and screenshot that one as well.
var page = require('webpage').create();
var url = "http://www.example.com";

// Registered once, before any page is opened, so console.log() calls made
// inside page.evaluate() become visible.
page.onConsoleMessage = function (msg, lineNum, sourceId) {
    console.log('CONSOLE: ' + msg + ' (from line #' + lineNum + ' in "' + sourceId + '")');
};

page.open(url, function (status) {
    if (status !== "success") {
        console.log("Unable to load " + url);
        phantom.exit(1);
        return;
    }
    page.evaluate(function () {
        document.getElementById("email").value = "email";
        document.getElementById("pass").value = "password";
        document.getElementById("u_0_1").click();
        // page is redirecting.
    });
    // Give the redirect a fixed amount of time before taking the screenshot.
    setTimeout(function () {
        page.evaluate(function () {
            console.log('haha');
        });
        page.render("example.png");

        // The second page is opened only after the first one has been dealt with.
        var nextUrl = "http://www.google.com";
        page.open(nextUrl, function (status) {
            if (status !== "success") {
                console.log("Unable to load " + nextUrl);
                phantom.exit(1);
                return;
            }
            setTimeout(function () {
                page.evaluate(function () {
                    console.log('haha');
                });
                page.render("google.png");
                phantom.exit();
            }, 5000);
        });
    }, 5000);
});
To actually see the console.log() inside of page.evaluate() you will need to register to the page.onConsoleMessage event. There are more other events that are helpful when debugging.
Don't forget to add the protocol (http:// or file:///) to the URLs that you're opening. PhantomJS is a bit picky in that regard.
Instead of waiting a static amount of time (setTimeout()) for the next page to load after you perform some action, you should make use of the page.onLoadFinished event. This is rather cumbersome to get right for navigation-intensive scripts; use CasperJS for longer scripts.
Oftentimes Element.click() doesn't work. This question has many solutions for those cases.
I wrote a program that uses Google Translate (http://www.translate.google.com)
using PhantomJS.
However, I am unable to insert text into the textarea. I've searched a lot, but nothing proved useful. I am, however, able to print the result content.
Here's my code:
// Open Google Translate, inject jQuery, type text into the source textarea
// and print both the text that was typed and the content of the result box.
var page = require('webpage').create();
page.open("http://translate.google.com", function (status) {
    if (status !== "success") {
        console.log("Unable to load the page: " + status);
        phantom.exit(1);
        return;
    }
    console.log(status);
    page.includeJs("//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min.js", function () {
        var f = page.evaluate(function () {
            // The textarea is selected by id, hence the leading "#".
            $('#source').val("sbjbsdfsdfbbs");
            // Read the value back with .val(), the same accessor that set it.
            return $('#source').val();
        });
        console.log(f); //should print input text
        // NOTE(review): this reads the result box right away; whether the
        // page has produced a translation by then is not visible from here.
        var result = page.evaluate(function () {
            return $('#result_box').text();
        });
        console.log(result);
        phantom.exit();
    });
});
Try to attach your text and languages to the URL when opening the page:
'http://translate.google.com/#en/es/translate this'
// Pass the languages and the text to translate through the URL fragment.
var page = require('webpage').create();
var inputText = 'translate this';
var langFrom = 'en';
var langTo = 'es';
// Percent-encode the text so that spaces, "/" and other reserved characters
// in it cannot break the URL.
var pageURL = 'http://translate.google.com/#' + langFrom + '/' + langTo + '/' + encodeURIComponent(inputText);
page.open(pageURL, function (status) {
    // your code to get results here
});
Here you go:
translate.js:
#!/usr/bin/env phantomjs
var system = require('system');
// The text to translate comes from standard input.
var text = system.stdin.read();
var sourceLang = "en";
var targetLang = "pt_BR";
var url = "https://translate.google.com/#" + sourceLang + "/" + targetLang;
var page = require('webpage').create();
page.settings.userAgent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:13.0) Gecko/20100101 Firefox/13.0';

// Collect errors raised inside the page. They are only printed when the
// console.error line below is enabled.
page.onError = function (msg, trace) {
    var msgStack = ['ERROR: ' + msg];
    if (trace && trace.length) {
        msgStack.push('TRACE:');
        trace.forEach(function (t) {
            msgStack.push(' -> ' + t.file + ': ' + t.line + (t.function ? ' (in function "' + t.function + '")' : ''));
        });
    }
    // uncomment to log into the console
    // console.error(msgStack.join('\n'));
};

// Console messages are the channel between the page and this script: the
// sentinel message ends the run, anything else is written to stdout.
page.onConsoleMessage = function (msg) {
    // The sentinel has to match, character for character, what the in-page
    // script logs (including its "phanthom" spelling).
    if (msg === "phanthom.exit()") {
        phantom.exit();
        return;
    }
    system.stdout.write(msg);
    system.stdout.flush();
};
/*
 * Wrapper around WebPage.evaluate that makes it possible to hand extra
 * arguments to the function that runs inside the web page. Everything after
 * `func` is serialised with JSON.stringify and applied to `func` by a small
 * generated wrapper, whose source text is what gets passed to page.evaluate.
 * Returns whatever page.evaluate returns.
 *
 * Background (PhantomJS issue 132, comment #43):
 *
 * http://code.google.com/p/phantomjs/issues/detail?id=132
 */
function evaluate(page, func) {
    var extraArgs = Array.prototype.slice.call(arguments, 2);
    var serializedArgs = JSON.stringify(extraArgs);
    var wrapperSource = "function() { return (" + func.toString() + ").apply(this, " + serializedArgs + ");}";
    return page.evaluate(wrapperSource);
}
// Open the translation page, put the input text into the source box and poll
// until a translation shows up; the translation and then the exit sentinel
// are reported through console.log, which page.onConsoleMessage picks up.
page.open(url, function (status) {
    if (status !== 'success') {
        console.log('Unable to access network');
        // Without an explicit exit the script would keep running forever.
        phantom.exit(1);
        return;
    }
    evaluate(page, function (text) {
        var getResult = function () {
            var result_box = document.querySelector("#result_box");
            var input_box = document.querySelector("#source");
            if (input_box == null) {
                // The input box is not there yet; try again in a second.
                setTimeout(getResult, 1000);
                return;
            }
            input_box.value = text;
            if (result_box == null || result_box.innerText == "") {
                // No translation yet; try again in a second.
                setTimeout(getResult, 1000);
            } else {
                console.log(result_box.innerText);
                console.log("phanthom.exit()");
            }
        };
        getResult();
    }, text);
});
...
$ echo "phantomjs, is a fantastic tool" | phantomjs translate.js | iconv --from cp1252
PhantomJS, é uma ferramenta fantástica
I'm trying to scrape a number of pages that have a standard format. I've been able to use Phantomjs to successfully scrape a single page, but when I try to iterate over multiple ones, the asynchronous processing makes things hang up. What's the proper way to tell Casper/Phantom to wait?
// Scrape one EPA facility page with PhantomJS and append the extracted fields
// to lat_long.txt.
var page = require('webpage').create();
var fs = require('fs');
// Relay console output from the page to the PhantomJS console.
page.onConsoleMessage = function(msg) {
phantom.outputEncoding = "utf-8";
console.log(msg);
};
// this overwrites the previous output file
f = fs.open("lat_long.txt", "w");
f.write("--");
f.close();
// this is the unique identifier for the locations. For now, I just have three datapoints
var EPAID = ["KYD980501076","ME8170022018", "MEN000103584"];
/// this code will be used to loop through the different locations. For now, set to look at only one.
// NOTE(review): the loop bound is 1 and the URL below always uses EPAID[0],
// so only the first identifier is requested.
for (q= 0; q < 1; q++) {
var processing = false;
//we construct the target url
var url = "http://iaspub.epa.gov/enviro/efsystemquery.cerclis?fac_search=site_epa_id&fac_value=" + EPAID[0] + "&fac_search_type=Beginning+With&postal_code=&location_address=&add_search_type=Beginning+With&city_name=&county_name=&state_code=&program_search=1&report=2&page_no=1&output_sql_switch=TRUE&database_type=CERCLIS" ;
// NOTE(review): page.open() is called before onLoadFinished is assigned.
page.open(url);
page.onLoadFinished = function(status) {
if ( status === "success" ) {
// Inject jQuery into the loaded page, then extract the table cells with it.
page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
var str = page.evaluate(function() {
// Runs inside the page: collects two cell values per step for 10 steps
// into $value, then builds one line of text from selected entries.
$value = [];
$Object = $(".result tr");
for (i =0 ; i < 10; i++) {
$value.push($Object.find('td').html(),$Object.find('td').next().next().html() );
$Object = $Object.next();
}
$string = "{ EPAID: "+ $value[0] + ", " +
"Name: "+ $value[1] + ", " +
"City: "+ $value[4] + ", " +
"State: "+ $value[6] + ", " +
"ZipCode: "+ $value[8] + ", " +
"Latitude: "+ $value[14] + ", " +
"Longitude: "+ $value[16] + " }" ;
return $string;
});
// Append the extracted line to the output file.
f = fs.open("lat_long.txt", "a");
f.write(str);
f.close();
processing = true;
console.log("writing to file");
// NOTE(review): the script exits here, after the first record is written.
phantom.exit();
});
}
// right here it should delay until the previous page is completed
// while (!processing) {
// setTimeout(function(){ console.log("waiting....");},1000);
// }
};
}
// NOTE(review): this statement comes directly after the loop and does not
// wait for any onLoadFinished callback.
console.log("finished all pages");
If you switched to using casperJS, it is as simple as changing your page.open() into page.thenOpen(). (This CasperJS - How to open up all links in an array of links question looks very similar to yours?)
If you wanted to stick with PhantomJS you need to start the next page load in the onSuccess callback of the previous load. This is tedious, and needs care to avoid large memory usage. (I did it once or twice, but now simply use CasperJS.)
An alternative approach is to create the page object inside the loop. However, that does not quite answer your question, as the pages will then run in parallel. But you could use setTimeout to stagger each one to avoid a burst of activity if you have hundreds of URLs!
Here is the code that ultimately works (using the timeout approach since I wasn't able to get the success callback to work better).
With casperjs installed, I named this file "process.js" and was able to run it from the command line as "casperjs process.js"
var page = require('webpage').create();
var fs = require('fs');

// The output encoding only needs to be set once, not on every message.
phantom.outputEncoding = "utf-8";

// Relay console output from the page to the PhantomJS console.
page.onConsoleMessage = function (msg) {
    console.log(msg);
};

// this is the unique identifier for the locations.
var EPAID = ["NED981713837", /* ...remaining identifiers elided... */ "FLD049985302", "NJD986643153"];

// this overwrites the previous output file
var f = fs.open("lat_long.txt", "w");
f.write("-<>-");
f.close();

// Index of the next record to request.
var count = 0;
// Stop after the last identifier instead of at a hard-coded record count.
var target = EPAID.length;
// written[i] becomes true once record i has been appended to the output file.
var written = [];
// Request the next record and re-arm a 5 second timer for the one after it;
// once `count` is no longer below `target`, log and leave PhantomJS.
function yourFunction() {
    if (!(count < target)) {
        console.log("exiting");
        phantom.exit();
        return;
    }
    process(count);
    count++;
    setTimeout(yourFunction, 5000);
}
/**
 * Request the record for EPAID[counter] in the shared `page` object and, once
 * it has loaded, append the extracted fields to lat_long.txt.
 *
 * Each record is written at most once (tracked through `written`).
 *
 * @param {number} counter index into EPAID of the record to fetch
 */
function process(counter) {
    console.log("Beginning record #" + counter);
    //we construct the target url
    var url = "http://iaspub.epa.gov/enviro/efsystemquery.cerclis?fac_search=site_epa_id&fac_value=" + EPAID[counter] + "&fac_search_type=Beginning+With&postal_code=&location_address=&add_search_type=Beginning+With&city_name=&county_name=&state_code=&program_search=1&report=2&page_no=1&output_sql_switch=TRUE&database_type=CERCLIS";

    // Install the handler before starting the load it is meant to observe.
    page.onLoadFinished = function (status) {
        if (status !== "success") {
            console.log("Failed to load record #" + counter);
            return;
        }
        page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function () {
            var str = page.evaluate(function () {
                // Runs inside the page. Local variables are used so that
                // nothing leaks into the page's global scope.
                var values = [];
                var rows = $(".result tr");
                for (var i = 0; i < 10; i++) {
                    values.push(rows.find('td').html(), rows.find('td').next().next().html());
                    rows = rows.next();
                }
                return "{ \"EPAID\": \"" + values[0] + "\", " +
                    "\"Name\": \"" + values[1] + "\", " +
                    "\"City\": \"" + values[4] + "\", " +
                    "\"State\": \"" + values[6] + "\", " +
                    "\"ZipCode\": \"" + values[8] + "\", " +
                    "\"Latitude\": " + values[14] + ", " +
                    "\"Longitude\": " + values[16] + " },";
            });
            // Skip the write when this record has already been written.
            if (written[counter] === undefined) {
                var out = fs.open("lat_long.txt", "a");
                out.write(str);
                out.close();
                written[counter] = true;
                console.log("Writing to file #" + counter);
            }
        });
    };
    page.open(url);
}
// Entry point: announce the start and request the first record; yourFunction
// re-schedules itself for the following ones.
console.log("Start...");
yourFunction();