I am tring to scrape a few sites. Here is my code:
for (var i = 0; i < urls.length; i++) {
url = urls[i];
console.log("Start scraping: " + url);
page.open(url, function () {
waitFor(function() {
return page.evaluate(function() {
return document.getElementById("progressWrapper").childNodes.length == 1;
});
}, function() {
var price = page.evaluate(function() {
// do something
return price;
});
console.log(price);
result = url + " ; " + price;
output = output + "\r\n" + result;
});
});
}
fs.write('test.txt', output);
phantom.exit();
I want to scrape all sites in the array urls, extract some information and then write this information to a text file.
But there seems to be a problem with the for loop. When scraping only one site without using a loop, all works as I want. But with the loop, first nothing happens, then the line
console.log("Start scraping: " + url);
is shown, but one time too much.
If url = {a,b,c}, then phantomjs does:
Start scraping: a
Start scraping: b
Start scraping: c
Start scraping:
It seems that page.open isn't called at all.
I am newbie to JS so I am sorry for this stupid question.
PhantomJS is asynchronous. By calling page.open() multiple times using a loop, you essentially rush the execution of the callback. You're overwriting the current request before it is finished with a new request which is then again overwritten. You need to execute them one after the other, for example like this:
page.open(url, function () {
waitFor(function() {
// something
}, function() {
page.open(url, function () {
waitFor(function() {
// something
}, function() {
// and so on
});
});
});
});
But this is tedious. There are utilities that can help you with writing nicer code like async.js. You can install it in the directory of the phantomjs script through npm.
var async = require("async"); // install async through npm
var tests = urls.map(function(url){
return function(callback){
page.open(url, function () {
waitFor(function() {
// something
}, function() {
callback();
});
});
};
});
async.series(tests, function finish(){
fs.write('test.txt', output);
phantom.exit();
});
If you don't want any dependencies, then it is also easy to define your own recursive function (from here):
var urls = [/*....*/];
function handle_page(url){
page.open(url, function(){
waitFor(function() {
// something
}, function() {
next_page();
});
});
}
function next_page(){
var url = urls.shift();
if(!urls){
phantom.exit(0);
}
handle_page(url);
}
next_page();
Related
I have a function that looks like this:
const XML = '.xml code';
var array = [];
var i = 0; //counter
XMLExtract(XML, 'loc', false, (error, element) => {
if (error) {
throw new Error(error);
}
array[i] = element;
console.log(array[i]);
function_name(array[i]); //imported function from external .js
i++;
});
Basically I want to run function() to return the response that it gives and then run function() again with the new parameter. However, the above code doesn't work, it just overlaps.
I've also checked previous solutions: https://stackoverflow.com/a/5010339 but I think I don't really know how to implement it. Any suggestion?
UPDATE: external.js
module.exports = function() {
this.function_name = function() {
(async function() {
...
await instance.exit();
}());
};
};
This question already has an answer here:
PNG is not being rendered using PhantomJS with multiple attempts in a loop
(1 answer)
Closed 6 years ago.
well my code is something like this just a few lines
var a = "http://lnmtl.com/chapter/renegade-immortal-chapter-";
var b = 558;
var d = "rennegrade_ch";
var f = ".png";
var page = require('webpage').create();
var i = 0;
for (i = b; i < 560; i++) {
var c = a + i;
console.log(c);
page.open(c, function () {
var e = d + i + f;
console.log(e);
page.render(e);
});
}
phantom.exit();
the webpage can be rendered individually but once i put it inside for loop all it does is print the first console output properly but the second one it skips i guess its not entering the page.open function then for loop value increases then same thing happens again I have no idea why its not entering render function i tried to put var page = require('webpage').create();
inside for loop too but still no change
UPDATE: On another question stackoverflow.com/questions/31621577/png-is-not-being-rendered-using-phantomjs-with-multiple-attempts-in-a-loop?rq=1
it was pointed that this method wont work because of async nature of function but the example code provided in it isnt helpful enough can anyone example and i also tried set timeout as suggested in it still same thing happens so any other idea ?
Your phantom.exit() call kills the PhantomJS browser before you do any rendering. You have to wait for the rendering to end before you can exit(). You need to have some mechanism to say when the rendering is done. I'd suggest wrapping each of your renders in a Promise. Then using a Promise.all() to wait for all the render promises to resolve. After they resolve, exit PhantomJS.
Right now, you have what is below, which does not respect the asynchronous nature of page.open():
for (...) {
// I probably wont finish because phantom dies almost immediately
page.open(c, function () {
// I won't finish running since phantom dies
page.render(e);
});
}
// I'm going to kill the phantom almost immediately
phantom.exit();
You want something like the code below, which will wait for all the renders to finish. This will put renders of each of the sites we provide in a subdirectory "renders".
Note: You will need to install the es6-promise shim for this to work since PhantomJS does not yet support Promises. Thanks for the comment about that Artjon B
/*jslint node:true*/
/*globals phantom, sayHello*/
"use strict";
var Promise = require("es6-Promise").Promise;
// Array of URLs that we want to render
var urlArr = [
{
name: "google",
url: "http://www.google.com"
},
{
name: "yahoo",
url: "http://www.yahoo.com"
},
{
name: "bing",
url: "http://www.bing.com"
}
];
// Map URLs to promises
var proms = urlArr.map(function (url) {
// Return a promise for each URL
return new Promise(function (resolve, reject) {
// Make a page
var page = require("webpage").create();
// Open the URL
console.log("opening: " + url.name);
page.open(url.url, function () {
// Render the page
page.render("render/" + url.name + ".png");
console.log("done rendering: " + url.name);
// Say that we are done with rendering
resolve();
});
});
});
// Wait for all rendering to finish
Promise.all(proms).then(function () {
console.log("closing phantom");
// Exit phantom
phantom.exit();
});
For async request inside loop you should use asynchronous library so you can debug your code and don't get memory leak issue
async-js will be good in your case
npm install async
var async = require('async');
var a = "http://lnmtl.com/chapter/renegade-immortal-chapter-";
var b = 558;
var d = "rennegrade_ch";
var f = ".png";
var page = require('webpage').create();
var i = 0;
async.whilst(
function() {
return i < 560;
},
function(callback) {
i++;
var c = a + i;
console.log(c);
page.open(c, function() {
var e = d + i + f;
console.log(e);
page.render(e);
callback(null, i);
});
},
function(err, n) {
if(err) console.log(err);
phantom.exit();
});
so i use one js file to load multiple html and js files whenever they are needed. I have a working code for plenty modules. In the example below you can see the first two modules. All of them look exactly the same. Now i want to "outsource" recurring code into a function with parameters so that the code-amount overall gets minimized. Since i have never done something like this before i could need some help (i am learning js at the moment). I would realy appreciate some help.
//first module
if (moduleID === "placeone") {
var isLoaded = 0;
if (isLoaded) {
console.log("file already loaded");
returnValue = new PlaceOneModule(id, moduleInitialData);
}
$("#placeone").load("html/modules/PlaceOneModule.html", function (response, status, xhr) {
console.log("PlaceOneModule.html" + " " + status);
$.getScript("js/modules/PlaceOneModule.js").done(function () {
console.log("PlaceOneModule.js geladen");
isLoaded = 1;
returnValue = new PlaceOneModule(id, moduleInitialData);
}).fail(function () {
console.log("PlaceOneModule.js nicht geladen");
});
});
}
//second module
if (moduleID === "placetwo") {
var isLoaded = 0;
if (isLoaded) {
console.log("file already loaded");
returnValue = new PlaceTwoModule(id, moduleInitialData);
}
$("#placetwo").load("html/modules/PlaceTwoModule.html", function (response, status, xhr) {
console.log("PlaceTwoModule.html" + " " + status);
$.getScript("js/modules/PlaceTwoModule.js").done(function () {
console.log("PlaceTwoModule.js geladen");
isLoaded = 1;
returnValue = new PlaceTwoModule(id, moduleInitialData);
}).fail(function () {
console.log("PlaceTwoModule.js nicht geladen");
});
});
}
The question is rather complex to answer, as there are many things to account for.
var cache = {};
function module(name, target, done) {
if (!(name in cache)) {
return $(target).load('html/modules/' + name + '.html', function(response, status, xhr) {
console.log(name + '.html ' + status);
$.getScript('js/modules/' + name + '.js')
.done(function() {
console.log(name + '.js geladen');
cache[name] = window[name];
done(null, cache[name]);
})
.fail(function() {
var message = name + '.js nicht geladen';
cache[name] = function() {
console.error(message);
};
done(message);
});
});
}
setTimeout(function() {
done(null, cache[name]);
}, 0);
}
I'll try to explain my train of thought behind this:
var cache = {} - you will need something to keep track of each individual module
function module(name, target, done) {
name would be the base name of your module, e.g. PlaceTwoModule, this was already used consistently across the html and js files and the js function name
target would be the selector where the html file should be loaded
as one of the actions you take requires async operation, the entire functionality needs to become async, I introduce a callback (done) argument
if (!(name in cache)) - if the module is not yet cached, it requires some fetching, so the load is triggered first thing
once the load completes, it will fire the $.getScript
if the $.getScript works out, the name will be assumed to be in window and a reference is stored in the cache variable, after that, the done callback is invoked (with the function as second argument).
if the $.getScript didn't work out, we add a function to the cache, which does nothing more than telling you it will not work, after that, the done callback is invoked (with an error as first argument).
if the name did exist in the cache, we will be calling the done callback right after we exit the module function
So, how to use this?
It now boils down to calling the module function
module('#placeone', 'PlaceOneModule', function(error, PlaceModule) {
if (error) throw new Error(error);
var instance = new PlaceModule(id, initial);
// ...
}
I have used the common function(error, value) {..} signature for the callback function, which always has the error as first argument (allowing for other arguments to be added and made optional).
There are some caveats/assumptions worth mentioning:
I have not provided a failsafe for preventing multiple loads of the same module (so it is the same as in your example) if earlier calls to module are still loading
no matter what target you invoke module with, it will only load 'once' (well, see the previous line ;-) )
I assume the loaded modules are in the global (window) scope in order to keep the example simple, keep in mind to not 'pollute the global scope'
This has become a rather elaborate answer, I hope I explained every step involved sufficiently.
Something like this possibly:
var modules = [];
modules.push({
js: 'PlaceOneModule',
id: 'placeone'
});
modules.push({
js: 'PlaceTwoModule',
id: 'placetwo'
});
var module = modules.filter(function(m) {
return m.id === moduleID;
});
if (module) {
var isLoaded = 0;
if (!!window[module.js]) {
console.log("file already loaded");
returnValue = window[module.js];
}
$("#" + module.id).load("html/modules/" + module.js + ".html", function(response, status, xhr) {
$.getScript("js/modules/" + module.js + ".js").done(function() {
returnValue = new window[module.js](id, moduleInitialData);
}).fail(function() {
console.log("PlaceOneModule.js nicht geladen");
});
});
}
I am trying to follow some tutorials, a simple console log to get the title of the page, however returns undefined, any idea why is this happening?
var phantom = Meteor.npmRequire('phantom');
phantom.create(function(ph){
ph.createPage(function(page){
page.open('http://m.bing.com', function(status) {
var title = page.evaluate(function() {
return document.title;
});
console.log(title);
ph.exit();
});
});
});
It looks like you are using the phantomjs-node bridge between PhantomJS and node. Its API is different from PhantomJS in that each function or property that you call/set takes a callback and so does page.evaluate for when the result is ready.
The example in the README on GitHub directly shows your intended code:
phantom.create(function (ph) {
ph.createPage(function (page) {
page.open("http://www.google.com", function (status) {
console.log("opened google? ", status);
page.evaluate(function () { return document.title; }, function (result) {
console.log('Page title is ' + result);
ph.exit();
});
});
});
});
That's because page.evaluate is async. Maybe try wrapping it in a Future-ized function
var title = (function(page) {
// not sure about Meteor.npmRequire... so I'm using Npm.require instead
var Future = Npm.require('fibers/future');
var future = new Future();
page.evaluate(function() {
future.return(document.title);
});
return future.wait();
})(page);
console.log(title);
I'm currently trying to make a Casper module that does something using a casper module, and returns a variable from that, kinda like this:
var data = [];
exports.parsePage = function(argUrl) {
url = baseUrl = argUrl;
if (!url) {
casper.warn('No url passed, aborting.').exit();
}
casper.start('https://js-uri.googlecode.com/svn/trunk/lib/URI.js', function() {
var scriptCode = this.getPageContent() + '; return URI;';
window.URI = new Function(scriptCode)();
if (typeof window.URI === "function") {
this.echo('URI.js loaded');
} else {
this.warn('Could not setup URI.js').exit();
}
//process is a function that processes the page
}).run(process);
return data;
}
and my test looks like this:
var scanner = require('./pageParser');
console.log(JSON.stringify(scanner.parsePage('http://url.com')));
Is it possible to wait for casper to finish execution before returning data in the parsePage function?
You can use a wait-like function like in this example taken from phantomjs, but you are missing a fundamental concept of javascript: async and callbacks.
So, a possible solution is...
module pageParser.js:
function process(callback) {
//do something here
callback(data);
}
exports.parsePage = function(argUrl, callback) {
...
casper.start('https://js-uri.googlecode.com/svn/trunk/lib/URI.js', function() {
...
}).run(process(callback));
}
main script:
var scanner = require('./pageParser');
scanner.parsePage('http://url.com', function(data) {
console.log(JSON.stringify(data));
});