CasperJS evaluate() is not executing from within each() block - javascript

So I'm crawling a page, collecting links, then I would like to crawl those links to complete my dataset. Here's some code:
crawl.js:
var casper = require("casper").create({
waitTimeout: 3000,
pageSettings: {
userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:23.0) Gecko/20130404 Firefox/23.0"
},
clientScripts: ["includes/jquery.min.js"],
verbose: true
});
var followers = require('./followers');
var currentPage = 1;
var x = require('casper').selectXPath;
/**
 * Count the own enumerable properties of an object.
 *
 * Inherited properties are ignored. null/undefined yield 0 because
 * for...in simply does not iterate over them.
 *
 * @param {Object} obj - Object whose own keys are counted.
 * @returns {number} Number of own enumerable properties.
 */
Object.size = function(obj) {
    var size = 0, key;
    for (key in obj) {
        // Borrow hasOwnProperty from Object.prototype: objects created with
        // Object.create(null) have no such method, and ordinary objects may
        // shadow it with a data property of the same name.
        if (Object.prototype.hasOwnProperty.call(obj, key)) size++;
    }
    return size;
};
// Step callback (invoked with a Casper instance as `this`): screenshots the
// current listing page, merges its links into followers.followers keyed by
// link text, then clicks "Next" and schedules itself again once the URL has
// changed. Stops after page 5 or when no "Next" link exists.
var collectFollowers = function() {
    // `key` is declared here; the original `for (key in links)` created an
    // implicit global.
    var url, key;
    this.echo("capturing page " + currentPage);
    this.capture("wowhead-p" + currentPage + ".png");
    // don't go too far down the rabbit hole
    // NOTE(review): this check runs before the links of the current page are
    // collected, so the final page is captured but not harvested - confirm
    // that this is intended.
    if (currentPage >= 5 || !this.exists(x('//*[text()="Next ›"]'))) {
        processFollowers.call(casper);
        return terminate.call(casper);
    }
    currentPage++;
    this.echo("requesting next page: " + currentPage);
    url = this.getCurrentUrl();
    // Runs inside the page: build a plain { linkText: href } object so the
    // result can be handed back to this script.
    var links = this.evaluate(function() {
        var obj = {};
        $('.listview-cleartext').each(function() {
            obj[$(this).text()] = $(this).attr('href');
        });
        return obj;
    });
    for (key in links) {
        followers.followers[key] = links[key];
    }
    this.echo("Page links: " + Object.size(followers.followers));
    //this.emit('update.followers', links);
    this.thenClick(x('//*[text()="Next ›"]')).then(function() {
        // Wait until navigation actually happened; collectFollowers handles
        // the new page, processFollowers is the third (fallback) callback.
        this.waitFor(function() {
            return url !== this.getCurrentUrl();
        }, collectFollowers, processFollowers);
    });
};
// Opens every collected follower link and tries to read quest data from it.
// Expects a Casper instance as `this` (it is invoked via .call(casper) and as
// a wait callback elsewhere in this script).
var processFollowers = function() {
this.echo("Total followers:" + Object.size(followers.followers));
// Queue one thenOpen()/then() pair per collected link name.
this.each(Object.keys(followers.followers), function(casper, key) {
this.thenOpen('http://wowhead.com' + followers.followers[key]).then(function() {
this.echo("On http://wowhead.com" + followers.followers[key]);
// NOTE(review): the function passed to evaluate() runs in the sandboxed page
// context. There `this` is the page's window (so this.echo is not available),
// and `followers` / `key` from this script are not visible. Any result has to
// be returned from the callback and stored by the caller instead.
this.evaluate(function() {
this.echo("Inside the evaluate statement.");
if ($('a[href=#quests]').length) {
this.echo("Has quest!");
$('a[href=#quests]').click();
var questURL = $('#tab-quests').show().find('.listview-cleartext').attr('href');
var questName = $('#tab-quests').show().find('.listview-cleartext').text();
this.echo("Quest URL: " + questURL);
followers.followers[key] = {"name": key, "quest": {"url": questURL, "name": questName}};
} else {
this.echo("Does not have quest!");
}
});
});
});
}
var terminate = function() {
this.echo("Done.").exit();
}
casper.start("http://wowhead.com/followers=2");
casper.waitForSelector(x('//*[text()="Next ›"]'), collectFollowers, processFollowers);
casper.run();
followers.js:
var require = patchRequire(require);
var utils = require('utils');
var followers = {};
exports.followers = followers;
followers is used to store a global variable, an object that I continually build and update as I crawl pages. So I go through 3 pages of data, collect links successfully, then begin to process them. As it stands, CasperJS appears to open each page successfully, however the evaluate function is never called.
I was able to get this functionality to work within PhantomJS with some async logic, but switched to casper because it appeared as though this would be taken care of under the hood. I've tried various combinations of thenOpen(), then() and open(), thenOpen() without the then(), etc.. What am I messing up?

casper.evaluate() runs in the sandboxed page context, in the same way as the PhantomJS version (page.evaluate()). It has no access to variables defined outside.
this inside of evaluate() refers to window and not casper, and I doubt that there is such a function as window.echo(). If you want to receive console messages from the page context, you need to register to the remote.message event:
casper.on("remote.message", function(msg){
this.echo("remote: " + msg);
});
You have to explicitly pass the result out of the page context and add it there:
// Look up the quest link inside the page and hand a plain, serializable
// object (or null when there is no quest tab) back to the CasperJS context.
var result = this.evaluate(function() {
    console.log("Inside the evaluate statement.");
    if ($('a[href=#quests]').length) {
        console.log("Has quest!");
        $('a[href=#quests]').click();
        var questLink = $('#tab-quests').show().find('.listview-cleartext');
        var questURL = questLink.attr('href');
        var questName = questLink.text();
        console.log("Quest URL: " + questURL);
        // Only the return value crosses the page/script boundary.
        return {"url": questURL, "name": questName};
    } else {
        console.log("Does not have quest!");
        return null;
    }
});
// Back in the CasperJS context: `followers` and `key` are visible here.
if (result) {
    followers.followers[key] = {name: key, quest: result};
}

Related

How to setInterval correct in casperjs?

I am trying to use a webserver to talk to the outside world, and I want setInterval to run the scrape automatically at a regular interval.
One of my CasperJS steps is this.capture(x + 'apple.png');
I thought three images would show up in my folder if setInterval ran three times.
As a result, only one image is saved: 1apple.png.
I can, however, see a lot of info on my terminal.
What step am I missing? Any help would be appreciated.
Thanks in advance.
Here is my code today.js:
var webserver = require('webserver');
var server = webserver.create();
var service = server.listen('8080', {
'keepAlive': true
}, function (request, response) {
response.statusCode = 200;
response.write('<html><body>What the hell~~</body></html>');
var casper = require("casper").create({
verbose: true,
logLevel: 'debug', // debug, info, warning, error
pageSettings: {
loadImages: false,
loadPlugins: false,
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.4'
}
});
var movieTitle = [];
var movieEnTitle = [];
var title = [];
var movieTime = [];
var movieVersion = [];
var city = '台南';
var latitude = 22.993089;
var longitude = 120.196876;
var theaterName = '今日戲院';
var data = {};
data.theater = [];
data.movie = [];
var x =1;
// Page-context helper (handed to evaluate()): returns the innerText of every
// anchor matching 'div.theaterlist_name a' as a plain array of strings.
function getMovieTitle() {
    var anchors = document.querySelectorAll('div.theaterlist_name a');
    var names = [];
    for (var idx = 0; idx < anchors.length; idx++) {
        names.push(anchors[idx].innerText);
    }
    return names;
};
// Page-context helper (handed to evaluate()): returns the innerText of every
// anchor matching 'div.en a' as a plain array of strings.
function getEnTitle() {
    var anchors = document.querySelectorAll('div.en a');
    var names = [];
    for (var idx = 0; idx < anchors.length; idx++) {
        names.push(anchors[idx].innerText);
    }
    return names;
};
// Page-context helper (handed to evaluate()): returns the innerText of every
// 'ul.theater_time' list as a plain array of strings, one entry per list.
function getMovieTime() {
    var timeLists = document.querySelectorAll('ul.theater_time');
    var texts = [];
    for (var idx = 0; idx < timeLists.length; idx++) {
        texts.push(timeLists[idx].innerText);
    }
    return texts;
};
// Page-context helper (handed to evaluate()): returns the innerText of every
// 'div.tapR' element as a plain array of strings.
function getMovieVersion() {
    var badges = document.querySelectorAll('div.tapR');
    var labels = [];
    for (var idx = 0; idx < badges.length; idx++) {
        labels.push(badges[idx].innerText);
    }
    return labels;
};
// 台南 今日戲院 from 奇摩
casper.start('https://tw.movies.yahoo.com/theater_result.html/id=67', function () {
this.echo(this.getTitle());
});
casper.then(function () {
this.echo('Image');
this.echo(x);
this.capture(x + 'apple.png');
x++;
});
casper.then(function () {
movieTitle = this.evaluate(getMovieTitle);
movieEnTitle = this.evaluate(getEnTitle);
movieTime = this.evaluate(getMovieTime);
movieVersion = this.evaluate(getMovieVersion);
});
casper.then(function () {
console.log('Print:\n');
this.echo(movieTitle.length + ' Movie title found :\n');
this.echo(movieTitle.join('\n'));
this.echo(movieEnTitle.length + ' Movie title found :\n');
this.echo(movieEnTitle.join('\n'));
this.echo(movieTime.length + ' Movie time found :\n');
this.echo(movieTime.join('\n'));
this.echo(movieVersion.length + ' Movie version found :\n');
this.echo(movieVersion.join('\n'));
this.echo(outPutJSON());
});
/**
 * Rebuild the shared `data` object from the scraped arrays and serialize it.
 *
 * Reads theaterName, city, latitude, longitude, movieTitle, movieEnTitle,
 * movieTime and movieVersion from the enclosing scope; writes `data` and
 * `title`.
 *
 * @returns {string} JSON text of `data` ({ theater: [...], movie: [...] }).
 */
function outPutJSON() {
    // This function is called more than once per request. Empty the shared
    // arrays first, otherwise every call would append the theater and the
    // movies again and the output would contain duplicates.
    data.theater.length = 0;
    data.movie.length = 0;
    title.length = 0;

    data.theater.push({
        name: theaterName,
        city: city,
        latitude: latitude,
        longitude: longitude
    });
    // Merge the Chinese and English titles
    for (var i = 0; i < movieTitle.length; i++) {
        title.push(movieTitle[i] + movieEnTitle[i]);
    }
    for (var j = 0; j < movieTime.length; j++) {
        // Split the concatenated show-time text into chunks of up to five
        // characters. The chunk array is wrapped in one more array, as before.
        var sourceTime = movieTime[j].match(/.{1,5}/g);
        data.movie.push({
            name: title[j],
            version: movieVersion[j],
            time: [sourceTime]
        });
    }
    return JSON.stringify(data);
}
// casper.run(function () {
// // this.echo('Done').exit();
// this.echo('Done');
// });
setInterval(function () {
casper.run(function () {
// this.echo('Done').exit();
this.echo('Done');
});
}, 2000);
response.write(outPutJSON());
response.close();
});
Here is my folder when i command this file , you can see only capture image once 1apple.png.
One way to achieve what you want would be to have a cron job scrape the site at the desired frequency and put the results in a directory served by a web server. Below is a stand-alone script that will fetch the title and capture the image of a site once per hour. Modifying this.step is probably unwise (inspiration from this site:
https://github.com/yotsumoto/casperjs-goto )
var casper = require('casper').create();
var x = 0;
casper.start('http://localhost:8080', function() {
// change 'http://localhost:8080' to the site to be scraped
this.echo(this.getTitle());
this.capture(x++ + '.png');
this.wait(60 * 60 * 1000, function() {
// 60 minutes * 60 seconds * 1000 milliseconds
this.step = 0;
});
});
casper.run();

Updated scope variable not showed by view. Promises not working

I'm trying to retrieve data from my Firebase Database, and corresponding images from Firebase Storage. The problem is, my view does not want to update itself with the data.
If I try to simply fetch the data from my database, it works perfectly. Once I add functionality to fetch pictures (which takes slightly longer) it looks like my view simply looks immediately at the scope variable and does not wait for $scope.friendsinfo to update. I think I'm doing something wrong with my promises and should be using $q, but I have no idea how exactly. Can anyone tell me what the best way would be to go about this? Thanks a lot!
var friendsRef = firebase.database().ref('friendships/' + firebase.auth().currentUser.uid);
$scope.friends = $firebaseArray(friendsRef);
$scope.friendsinfo = [];
$scope.$watch('friends', function() {
var newfriends = $scope.friends;
var newfriendsinfo = [];
for(var i = 0; i < newfriends.length; i++){
var ref = firebase.database().ref('users/' + newfriends[i].$id);
var profilePicRef = firebase.storage().ref("profilepictures/" + newfriends[i].$id + "/profilepicture");
var picPromise = fetchPicture(profilePicRef);
var newfriendid = newfriends[i].$id;
var newfriendagreed = newfriends[i].agreed;
picPromise.then(function(data){
ref.once('value', function(snapshot){
newfriendsinfo.push({
id: newfriendid,
name: snapshot.val().name,
email: snapshot.val().email,
agreed: newfriendagreed,
profilepicture: data //This is the functionality that causes my view to not display the updated $scope.friendsinfo because it takes too long.
});
});
});
}
$scope.friendsinfo = newfriendsinfo;
alert($scope.friendsinfo.length);
}, true);
function fetchPicture(ref){
return ref.getDownloadURL().then(function(url) {
return url;
}).catch(function(error) {
alert("error");
});
}
I have not fully understood your code, but here is an example that should guide you on how to use promises with the resolve approach:
function asyncGreet(name) {
var deferred = $q.defer();
setTimeout(function() {
deferred.notify('About to greet ' + name + '.');
if (okToGreet(name)) {
deferred.resolve('Hello, ' + name + '!');
} else {
deferred.reject('Greeting ' + name + ' is not allowed.');
}
}, 1000);
return deferred.promise;
}
var promise = asyncGreet('Robin Hood');
promise.then(function(greeting) {
alert('Success: ' + greeting);
}, function(reason) {
alert('Failed: ' + reason);
}, function(update) {
alert('Got notification: ' + update);
});
If anyone ever needs the solution, here it is. Turns out the problem is mainly caused by waiting for the for loop to finish, for which each item in turn waits for another function to finish. This is how I was able to solve it. It's probably not optimal, but it'll do for now :)
var friendsRef = firebase.database().ref('friendships/' + firebase.auth().currentUser.uid);
$scope.friends = $firebaseArray(friendsRef);
$scope.friendsinfo = [];
$scope.$watch('friends', function() {
var newfriends = $scope.friends;
asyncUpdateFriendsInfo(newfriends).then(function(newlist){
$scope.friendsinfo = newlist;
});
}, true);
function fetchPicture(ref){
return ref.getDownloadURL().then(function(url) {
return url;
}).catch(function(error) {
alert("error");
});
}
/**
 * Load display info (name, email, picture URL) for every friendship entry.
 *
 * @param {Array} newfriends - Entries exposing `$id` and `agreed`.
 * @returns {Promise} Resolves to an array of
 *   { id, name, email, agreed, profilepicture } objects, in the same order
 *   as `newfriends`. Resolves to [] for an empty list.
 */
function asyncUpdateFriendsInfo(newfriends){
    var lookups = [];
    for (var i = 0; i < newfriends.length; i++) {
        // Each friend is handled in its own function call, so the async
        // callbacks capture that friend's values. With `var` declared in the
        // loop body, every callback saw the values of the last iteration.
        lookups.push(loadFriendInfo(newfriends[i]));
    }
    // $q.all settles once every lookup has finished. It also resolves for an
    // empty list, which the hand-rolled length comparison never did.
    return $q.all(lookups);
}

// Fetch the picture URL and the user record for a single friend and combine
// them into one plain object.
function loadFriendInfo(friend) {
    var userRef = firebase.database().ref('users/' + friend.$id);
    var pictureRef = firebase.storage().ref("profilepictures/" + friend.$id + "/profilepicture");
    return fetchPicture(pictureRef).then(function(pictureUrl){
        return userRef.once('value').then(function(snapshot){
            var user = snapshot.val();
            return {
                id: friend.$id,
                name: user.name,
                email: user.email,
                agreed: friend.agreed,
                profilepicture: pictureUrl
            };
        });
    });
}

How can I show with PhantomJS the url of the processed page in the generated PDF?

My goal was to generate a PDF from every page included in the sitemap of a website created with Rails. I'm using PhantomJS to get it. I'm quite new in this field, but I could do it, but when I was finished, I realized that it would be usable also to see at the beginning of every PDF the url of the page from which the PDF was generated, so I can browse quicker to the page (the site has over hundred pages).
Here is the Javascript:
// Render Sitemap to file
var RenderUrlsToFile, arrayOfUrls, system;
system = require("system");
/*
Render given urls
#param array of URLs to render
#param callbackPerUrl Function called after finishing each URL, including the last URL
#param callbackFinal Function called after finishing everything
*/
// Left-pads a URL index with zeros to three digits for use in file names.
// Indexes below 100 come back as strings ("007", "042"); anything else is
// returned unchanged.
var getFileNumber = function(urlIndex) {
    var padding = urlIndex < 10 ? "00" : (urlIndex < 100 ? "0" : null);
    return padding === null ? urlIndex : padding + urlIndex;
};
RenderUrlsToFile = function(urls, callbackPerUrl, callbackFinal) {
var getFilename, next, page, retrieve, urlIndex, webpage;
urlIndex = 0;
webpage = require("webpage");
page = null;
getFilename = function() {
return "rendermulti-" + getFileNumber(urlIndex) + ".pdf";
};
next = function(status, url, file) {
page.close();
callbackPerUrl(status, url, file);
return retrieve();
};
retrieve = function() {
var url;
if (urls.length > 0) {
url = urls.shift();
urlIndex++;
page = webpage.create();
page.viewportSize = {
width: 1920,
height: 1880
};
page.settings.userAgent = "Phantom.js bot";
return page.open(url, function(status) {
var file;
file = getFilename();
if (status === "success") {
return window.setTimeout((function() {
// !!!!!!!!!!!!! Doesn't work !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
page.evaluate(function() {
var x = document.getElementById("logoAndNavigation");
var newP = document.createElement("P")
var textnode = window.location.protocol + "//" + window.location.host + "/" + window.location.pathname;
newP.appendChild(textnode)
x.insertBefore(newP, x.childNodes[0]);
});
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
page.render("tempPdfs/" + file);
return next(status, url, file);
}), 200);
} else {
return next(status, url, file);
}
});
} else {
return callbackFinal();
}
};
return retrieve();
};
// This makes an array with all the urls inside the sitemap
var arrayOfUrls = [''];
var page = require('webpage').create();
page.open('http://localhost:3000/sitemap.xml', function() {
var content = page.content;
parser = new DOMParser();
xmlDoc = parser.parseFromString(content,'text/xml');
var loc = xmlDoc.getElementsByTagName('loc');
for(var i=0; i < loc.length; i++)
{
var url=loc[i].textContent;
arrayOfUrls.push(url);
}
});
RenderUrlsToFile(arrayOfUrls, (function(status, url, file) {
if (status !== "success") {
return console.log("Unable to render '" + url + "'");
} else {
return console.log("Rendered '" + url + "' at '" + file + "'");
}
}), function() {
return phantom.exit();
});
I tried to solve the issue with the urls, with the code framed with the comment
// !!!!!!!!!!!!! Doesn't work !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
I wanted to show the url inside an element of the page, that has the id #logoAndNavigation, but I get this error:
NOT_FOUND_ERR: DOM Exception 8: An attempt was made to reference a Node in a context where it does not exist.
If I use only a string like "hello" inside the variable textnode, it works, but not if I try to use the url of the page.
Can anyone please help me?
Thank you in advance!
appendChild expects a node not a string. You probably mean to use
// Show the page URL in a new paragraph at the top of #logoAndNavigation.
var x = document.getElementById("logoAndNavigation");
var newP = document.createElement("p"); // small p
// location.pathname already starts with "/", so no extra separator is added
// (an extra "/" would produce "http://host//path").
var pageUrl = window.location.protocol + "//" + window.location.host + window.location.pathname;
// appendChild needs a Node: wrap the string in a text node. This also keeps
// the URL as plain text instead of having it parsed as HTML.
newP.appendChild(document.createTextNode(pageUrl));
// firstChild is null for an empty container, in which case insertBefore appends.
x.insertBefore(newP, x.firstChild);
You can also use the example of printheaderfooter.js to add the URL directly to the header or footer.

JavaScript Module not incrementing number correctly

I have been trying to learn how to write modules in JavaScript. With this attempt I am trying to load 10 pictures from Flickr on page load, and then load 10 more pictures once the user scrolls to the bottom of the page. This is not firing consistently and I am not sure why.
I would like to load 10 pictures at page load, and then 10 additional pictures each time the user scrolls down to the bottom of the page.
I think the issue is with the curPage property that is called using this.settings.curPage
curPage is incremented in the jaxPhotos method using this.settings.curPage++
I am not sure but I think the issue is with either the jaxPhotos method or the scrollMorePics method.
Here's a fiddle with my module:http://jsfiddle.net/R3Bt7/
Here's my HTML:
<div class="flickrContainer" data-options='{"searchQuery" : "candy", "tagQuery" : "candy", "tagMode": "all", "picsPerPage" : "10", "curPage" : 1}'>
</div>
Here's my JS:
var FlickrModule = (function ($element) {
var flickrFeed = function () {
this.$element = $element;
this.init();
};
flickrFeed.prototype.init = function () {
this.setOptions()
.jaxPhotos(this.settings.curPage)
.onScrollHandler();
};
flickrFeed.prototype.setOptions = function () {
var options = this.$element.data().options;
var defaults = {
searchQuery : '',
tagQuery : '',
tagMode : '',
picsPerPage: '1',
curPage: 1
}
this.settings = $.extend({}, defaults, options);
return this;
};
/**
 * Request one page of Flickr search results and append the photos, preceded
 * by a "Page N" header, to document.body. Increments settings.curPage right
 * after the request is issued.
 *
 * @param {number} pageNumber - Results page to request; also shown in the header.
 * @returns {flickrFeed} This instance, for chaining.
 */
flickrFeed.prototype.jaxPhotos = function (pageNumber) {
    // ajax call to flickr json
    $.ajax({
        // The requested page is the `pageNumber` argument. The original read
        // `this.settings.currPage` (a misspelling of `curPage`), so every
        // request was sent with page=undefined.
        url: '//api.flickr.com/services/rest/?method=flickr.photos.search&api_key=xxxxxxxxxxxxxxxxxxxx&tags=' + this.settings.searchQuery + '&tag_mode=' + this.settings.tagMode + '&page=' + pageNumber + '&per_page=' + this.settings.picsPerPage + '&format=json&jsoncallback=?',
        dataType: 'jsonp',
        success: function (data) {
            // Nothing to render when the response carries no photo list.
            if (!data || !data.photos || !data.photos.photo) {
                return;
            }
            // start assembling some dom elements to wrap around each page
            var pageTxtWrap = document.createElement('div'),
                pageTxt = document.createElement('p');
            // Both properties are set, as in the original, so the header text
            // shows regardless of which one the browser supports.
            pageTxt.textContent = 'Page ' + pageNumber + ' - Scroll down for more pictures!';
            pageTxt.innerText = 'Page ' + pageNumber + ' - Scroll down for more pictures!';
            pageTxtWrap.className = 'pageTextWrap';
            pageTxtWrap.appendChild(pageTxt);
            // Use createDocumentFragment() as it is the fastest method of element creation
            var docFragPageHdr = document.createDocumentFragment();
            docFragPageHdr.appendChild(pageTxtWrap);
            document.body.appendChild(docFragPageHdr);
            // create variables for easier access to the JSON trees we're using
            // (declared locally; they used to leak as implicit globals)
            var flickr = data.photos,
                flickrLength = flickr.photo.length;
            // run through the JSON we just got and assemble the pictures
            for (var i = 0; i < flickrLength; i++) {
                var farmId = flickr.photo[i].farm,
                    serverId = flickr.photo[i].server,
                    photoId = flickr.photo[i].id,
                    secretId = flickr.photo[i].secret,
                    imgTitle = flickr.photo[i].title;
                var flickImg = document.createElement('img');
                flickImg.className = 'flickerImg';
                flickImg.id = 'flickImg'+i;
                flickImg.title = imgTitle;
                flickImg.src = 'http://farm' + farmId + '.staticflickr.com/' + serverId + '/' + photoId + '_' + secretId + '_m.jpg';
                var docFragFlickImg = document.createDocumentFragment();
                docFragFlickImg.appendChild(flickImg);
                document.body.appendChild(docFragFlickImg);
            }
        }
    });
    // increase curPage so we can go to the next page of pictures
    this.settings.curPage++;
    return this;
};
flickrFeed.prototype.onScrollHandler = function () {
$(document).on('scroll', this.scrollMorePics.bind(this));
return this;
};
flickrFeed.prototype.scrollMorePics = function(){
if ( $(window).scrollTop() + $(window).height() > $(document).height() - 50 ) {
console.log('Before ajax curPage = ', this.settings.curPage);
this.jaxPhotos(this.settings.curPage);
console.log('After ajax curPage = ', this.settings.curPage);
};
return this;
};
return flickrFeed;
}( $('.flickrContainer') ));
(function () {
var myModule = new FlickrModule();
})();
A small example on how you can access instance variables and methods based on your code:
var FlickrModule = (function ($) {
var flickrFeed = function ($element) {
this.$element = $element;
this.init();
};
flickrFeed.prototype.init = function(){
console.log('init', this.$element);
};
return flickrFeed;
})(jQuery);
$(function(){
var $container = $('.flickrContainer'),
fm = new FlickrModule($container);
});
http://jsfiddle.net/5nJqM/

how to scrape links with phantomjs

Can PhantomJS be used as an alternative to BeautifulSoup?
I am trying to search on Etsy and visit all the links in turn. In Python, I know how to do this (with BeautifulSoup) but today I want to see if I can do the same with PhantomJS. I'm not getting very far.
This script should search "hello kitty" on Etsy and return all of the products
<a class="listing-thumb" href=...></a> and print them in the console. Ideally I'd visit them later on and get the information I need. Right now it just freezes. Any ideas?
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';
page.open(url, function(status){
// list all the a.href links in the hello kitty etsy page
var link = page.evaluate(function() {
return document.querySelectorAll('a.listing-thumb');
});
for(var i = 0; i < link.length; i++){ console.log(link[i].href); }
phantom.exit();
});
I have toyed with using CasperJS, which may be better designed for this.
PhantomJS evaluate() cannot serialize and return complex objects like HTMLElements or NodeLists, so you have to map them to serializable things before:
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';
page.open(url, function(status) {
// list all the a.href links in the hello kitty etsy page
var links = page.evaluate(function() {
return [].map.call(document.querySelectorAll('a.listing-thumb'), function(link) {
return link.getAttribute('href');
});
});
console.log(links.join('\n'));
phantom.exit();
});
Note: here we use [].map.call() in order to treat a NodeList as a standard Array.
The only problem with your code is that you do not understand phantomjs scopes. You have phantom and page scopes. You tried to return JavaScript DOM object references (those can't be serialized) from page scope (page.evaluate runs in page scope) to phantom main scope. I think that is not possible. Here follows code that works:
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';
// for debug (to see if page returns status code 200)
page.onResourceReceived = function(response) {
if (response.url === url) {
console.log('Resorce: "' + response.url + '" status: ' + response.status);
if (response.status === 200) {
console.log(response.url);
for (var i = 0; i < response.headers.length; i++) {
console.log(response.headers[i].name + ': ' + response.headers[i].value);
}
}
}
};
page.onLoadFinished = function(status){
console.log('Status: ' + status);
console.log('Starting evaluate...');
var links = page.evaluate(function() {
var nodes = [],
matches = document.querySelectorAll("a.listing-thumb");
for(var i = 0; i < matches.length; ++i) {
nodes.push(matches[i].href);
}
return nodes;
});
console.log('Done evaluate... count: ' + links.length);
if (links && links.length > 0) {
for(var i = 0; i < links.length; ++i) {
console.log('(' + i + ') ' + links[i]);
}
} else {
console.log("No match found!");
}
phantom.exit(0);
};
page.open(url);
Here is some code I recently wrote that scrapes URLs using PhantomJS. If you provide only a URL, it will display all URLs on the page; if you supply an argument of class|id followed by a "class/id name", it will display only the URLs inside that class/id.
//////////////////////////////////////////////////////////
///// PhantomJS URL Scraper v.1.3 /////
//
// Copyrighted by +A.M.Danischewski 2016+ (c)
// This program may be reutilized without limits, provided this
// notice remain intact.
//
// Usage: phantomjs phantom_urls.js <URL> [["class"|"id"] [<query id/class name>]]
//
// Argument 1: URL -- "https://www.youtube.com/watch?v=8TniRMwL2Vg"
// Argument 2: "class" or "id"
// Argument 3: If Argument 2 was provided, "class name" or "id name"
//
// By default this program will display ALL urls from a user supplied URL.
// If a class name or id name is provided then only URL's from the class
// or id are displayed.
//
///////////////////////////////////
var page = require('webpage').create(),
system = require('system'),
address;
if (system.args.length === 1) {
console.log(' Usage: phantomjs phantom_urls.js <URL> [["class"|"id"] [<query id/class name>]]');
phantom.exit();
}
address = system.args[1];
querytype= system.args[2];
queryclass = system.args[3];
page.open(address, function(status) {
if (status !== 'success') {
console.log('Error loading address: '+address);
} else {
//console.log('Success! In loading address: '+address);
}
});
page.onConsoleMessage = function(msg) {
console.log(msg);
}
// Runs when the page has finished loading: collects link hrefs according to
// the command-line query (querytype / queryclass), prints them one per line
// and exits PhantomJS.
page.onLoadFinished = function(status) {
    var links;
    if (querytype === "class") {
        // The class name is passed to evaluate() as an argument instead of
        // being spliced into function source text, so quotes or other special
        // characters in it cannot break or alter the evaluated code.
        links = page.evaluate(function(className) {
            var listings = document.getElementsByClassName(className);
            // One newline-joined string of hrefs per matching element.
            return [].map.call(listings, function(el) {
                return [].map.call(el.querySelectorAll('a'), function(link) {
                    return link.getAttribute('href');
                }).join('\n');
            });
        }, queryclass);
    } else if (querytype === "id") {
        links = page.evaluate(function(id) {
            var el = document.getElementById(id);
            // No element with that id: report nothing instead of throwing.
            if (!el) {
                return [];
            }
            return [[].map.call(el.querySelectorAll('a'), function(link) {
                return link.getAttribute('href');
            }).join('\n')];
        }, queryclass);
    } else {
        links = page.evaluate(function() {
            return [].map.call(document.querySelectorAll('a'), function(link) {
                return link.getAttribute('href');
            });
        });
    }
    // evaluate() yields null when the page-side function fails. Joining the
    // array directly (rather than toString() plus replacing every comma)
    // keeps URLs that contain commas intact.
    console.log((links || []).join('\n'));
    phantom.exit();
};

Categories

Resources