how to scrape links with phantomjs - javascript

Can PhantomJS be used as an alternative to BeautifulSoup?
I am trying to search on Etsy and visit all of the result links in turn. In Python I know how to do this (with BeautifulSoup), but today I want to see if I can do the same with PhantomJS. I'm not getting very far.
This script should search for "hello kitty" on Etsy, find all of the product links
<a class="listing-thumb" href=...></a> and print them to the console. Ideally I'd visit them later on and get the information I need. Right now it just freezes. Any ideas?
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';

page.open(url, function(status){
    // list all the a.href links in the hello kitty etsy page
    var link = page.evaluate(function() {
        return document.querySelectorAll('a.listing-thumb');
    });
    for(var i = 0; i < link.length; i++){
        console.log(link[i].href);
    }
    phantom.exit();
});
I have toyed with using CasperJS, which may be better designed for this.

PhantomJS's evaluate() cannot serialize and return complex objects like HTMLElements or NodeLists, so you have to map them to serializable values first:
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';

page.open(url, function(status) {
    // list all the a.href links in the hello kitty etsy page
    var links = page.evaluate(function() {
        return [].map.call(document.querySelectorAll('a.listing-thumb'), function(link) {
            return link.getAttribute('href');
        });
    });
    console.log(links.join('\n'));
    phantom.exit();
});
Note: here we use [].map.call() in order to treat a NodeList as a standard Array.
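Since the question mentions CasperJS: it wraps this exact pattern in a built-in helper. A minimal sketch, assuming a CasperJS version that provides getElementsAttribute():
var casper = require('casper').create();

casper.start('http://www.etsy.com/search?q=hello%20kitty', function() {
    // getElementsAttribute() collects the given attribute from every
    // element matching the selector into an array of strings
    this.echo(this.getElementsAttribute('a.listing-thumb', 'href').join('\n'));
});

casper.run();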

The only problem with your code is that you do not understand PhantomJS scopes. There are two scopes: the phantom scope and the page scope. You tried to return JavaScript DOM object references (those can't be serialized) from the page scope (page.evaluate() runs in the page scope) to the main phantom scope. I don't think that is possible. Here is code that works:
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';

// for debugging (to see if the page returns status code 200)
page.onResourceReceived = function(response) {
    if (response.url === url) {
        console.log('Resource: "' + response.url + '" status: ' + response.status);
        if (response.status === 200) {
            console.log(response.url);
            for (var i = 0; i < response.headers.length; i++) {
                console.log(response.headers[i].name + ': ' + response.headers[i].value);
            }
        }
    }
};

page.onLoadFinished = function(status){
    console.log('Status: ' + status);
    console.log('Starting evaluate...');
    var links = page.evaluate(function() {
        var nodes = [],
            matches = document.querySelectorAll("a.listing-thumb");
        for(var i = 0; i < matches.length; ++i) {
            nodes.push(matches[i].href);
        }
        return nodes;
    });
    console.log('Done evaluate... count: ' + links.length);
    if (links && links.length > 0) {
        for(var i = 0; i < links.length; ++i) {
            console.log('(' + i + ') ' + links[i]);
        }
    } else {
        console.log("No match found!");
    }
    phantom.exit(0);
};

page.open(url);

Here is some code I recently wrote that scrapes URLs using PhantomJS. If you provide only a URL, it will display all of the URLs on the page; if you also supply an argument of "class" or "id" followed by a class/id name, it will display the URLs inside that class/id only.
//////////////////////////////////////////////////////////
/////           PhantomJS URL Scraper v.1.3           /////
//
// Copyrighted by +A.M.Danischewski 2016+ (c)
// This program may be reutilized without limits, provided this
// notice remains intact.
//
// Usage: phantomjs phantom_urls.js <URL> [["class"|"id"] [<query id/class name>]]
//
// Argument 1: URL -- "https://www.youtube.com/watch?v=8TniRMwL2Vg"
// Argument 2: "class" or "id"
// Argument 3: If Argument 2 was provided, "class name" or "id name"
//
// By default this program will display ALL URLs from a user-supplied URL.
// If a class name or id name is provided, then only URLs from that class
// or id are displayed.
//
///////////////////////////////////

var page = require('webpage').create(),
    system = require('system'),
    address, querytype, queryclass;

if (system.args.length === 1) {
    console.log(' Usage: phantomjs phantom_urls.js <URL> [["class"|"id"] [<query id/class name>]]');
    phantom.exit();
}

address = system.args[1];
querytype = system.args[2];
queryclass = system.args[3];

page.open(address, function(status) {
    if (status !== 'success') {
        console.log('Error loading address: ' + address);
    }
});

page.onConsoleMessage = function(msg) {
    console.log(msg);
};

page.onLoadFinished = function(status) {
    // The page-context functions are built as strings so the user-supplied
    // class/id name can be interpolated into them.
    var dynclass = "function() { var class_urls = []; var listings = document.getElementsByClassName('" + queryclass + "'); for (var i = 0; i < listings.length; i++) { var ellnks = [].map.call(listings[i].querySelectorAll('a'), function(link) { return link.getAttribute('href'); }); class_urls.push(ellnks.join('\\n')); } return class_urls; }";
    var dynid = "function() { var id_urls = []; var listing = document.getElementById('" + queryclass + "'); var ellnks = [].map.call(listing.querySelectorAll('a'), function(link) { return link.getAttribute('href'); }); id_urls.push(ellnks.join('\\n')); return id_urls; }";
    if (querytype === "class") {
        console.log(page.evaluate(dynclass).toString().replace(/,/g, "\n"));
    } else if (querytype === "id") {
        console.log(page.evaluate(dynid).toString().replace(/,/g, "\n"));
    } else {
        var links = page.evaluate(function() {
            return [].map.call(document.querySelectorAll('a'), function(link) {
                return link.getAttribute('href');
            });
        });
        console.log(links.join('\n'));
    }
    phantom.exit();
};

Related

nodeJS Crawler : unable to get the tagname associated with search word

I have created a crawler in NodeJS. I have a website, "http://www.google.com", for which I have written the crawler. The technologies used are NodeJS and cheerio.
A sample of what I have achieved so far: let's search google.com, where there is a button called "Google Search". My crawler can find the text "google search" on the page and report that it has found it.
Today it shows: text "google search" found on google.com
What I need the result to be: in addition to finding the text, it should also tell me the tag name, which in this case is a button.
Needed output: text "google search" found on google.com of "TAGNAME: BUTTON"
I tried using indexOf, but it isn't working. Please suggest how to do this.
Here is the code:
index.js
var request = require('request');
var cheerio = require('cheerio');
var URL = require('url-parse');

var START_URL = "https://www.mytravelexp.com/";
var SEARCH_WORD = "Pack your travel essentials";
var MAX_PAGES_TO_VISIT = 20;

var pagesVisited = {};
var numPagesVisited = 0;
var pagesToVisit = [];
var url = new URL(START_URL);
var baseUrl = url.protocol + "//" + url.hostname;

pagesToVisit.push(START_URL);
crawl();

function crawl() {
    if (numPagesVisited >= MAX_PAGES_TO_VISIT) {
        console.log("Reached max limit of number of pages to visit.");
        return;
    }
    var nextPage = pagesToVisit.pop();
    if (nextPage in pagesVisited) {
        // We've already visited this page, so move on to the next one
        crawl();
    } else {
        // New page we haven't visited
        visitPage(nextPage, crawl);
    }
}

function visitPage(url, callback) {
    // Add page to our set
    pagesVisited[url] = true;
    numPagesVisited++;
    // Make the request
    request(url, function(error, response, body) {
        console.log("***************************");
        console.log(" Visiting page: " + url + '\n');
        if (response.statusCode !== 200) {
            callback();
            return;
        }
        // Parse the document body
        var $ = cheerio.load(body);
        var isWordFound = searchForWord($, SEARCH_WORD);
        if (isWordFound) {
            console.log(' ' + SEARCH_WORD + ' found at page ' + url);
        }
        collectInternalLinks($);
        // In this short program, our callback is just calling crawl()
        callback();
    });
}

function searchForWord($, word) {
    var bodyText = $('html > body').html().toLowerCase();
    // includes() already returns a boolean, so there is
    // no need to compare against -1
    return bodyText.includes(word.toLowerCase());
}

function collectInternalLinks($) {
    var relativeLinks = $("a[href^='/']");
    relativeLinks.each(function() {
        pagesToVisit.push(baseUrl + $(this).attr('href'));
    });
    var absoluteLinks = $("a[href^='http']");
    absoluteLinks.each(function() {
        pagesToVisit.push($(this).attr('href'));
    });
}
if ($('123')[0].name === 'button') {
    console.log($('button').contents().first().text());
    console.log($('123').attr('name'));
}
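A hedged sketch of one way to get the tag name with cheerio (searchForWordWithTag() is a hypothetical helper, not part of the crawler above): walk the elements in the body, compare each element's own text with the search word, and report the tag name of the innermost match:
function searchForWordWithTag($, word) {
    var foundTag = null;
    $('body *').each(function() {
        // Compare only this element's own text (children removed),
        // so the innermost matching element wins
        var ownText = $(this).clone().children().remove().end().text().toLowerCase();
        if (ownText.indexOf(word.toLowerCase()) !== -1) {
            foundTag = this.name; // e.g. "button"
            return false; // stop iterating
        }
    });
    return foundTag;
}
Used in place of searchForWord(), this would allow output like: text "google search" found on google.com of "TAGNAME: BUTTON".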

Use the page title as the screenshot file name in PhantomJS

The following PhantomJS code can be used to obtain the page title (<title>) of a web page:
var page = require('webpage').create();
page.open(url, function(status) {
    var title = page.evaluate(function() {
        return document.title;
    });
    console.log('Page title is ' + title);
    phantom.exit();
});
The following PhantomJS code renders multiple URLs to PNG files.
// Render Multiple URLs to file
var RenderUrlsToFile, arrayOfUrls, system;
system = require("system");

/*
Render given urls
@param array of URLs to render
@param callbackPerUrl Function called after finishing each URL, including the last URL
@param callbackFinal Function called after finishing everything
*/
RenderUrlsToFile = function(urls, callbackPerUrl, callbackFinal) {
    var getFilename, next, page, retrieve, urlIndex, webpage;
    urlIndex = 0;
    webpage = require("webpage");
    page = null;
    getFilename = function() {
        return "rendermulti-" + urlIndex + ".png";
    };
    next = function(status, url, file) {
        page.close();
        callbackPerUrl(status, url, file);
        return retrieve();
    };
    retrieve = function() {
        var url;
        if (urls.length > 0) {
            url = urls.shift();
            urlIndex++;
            page = webpage.create();
            page.viewportSize = {
                width: 800,
                height: 600
            };
            page.settings.userAgent = "Phantom.js bot";
            return page.open("http://" + url, function(status) {
                var file;
                file = getFilename();
                if (status === "success") {
                    return window.setTimeout((function() {
                        page.render(file);
                        return next(status, url, file);
                    }), 200);
                } else {
                    return next(status, url, file);
                }
            });
        } else {
            return callbackFinal();
        }
    };
    return retrieve();
};

arrayOfUrls = null;
if (system.args.length > 1) {
    arrayOfUrls = Array.prototype.slice.call(system.args, 1);
} else {
    console.log("Usage: phantomjs render_multi_url.js [domain.name1, domain.name2, ...]");
    arrayOfUrls = ["www.google.com", "www.bbc.co.uk", "www.phantomjs.org"];
}

RenderUrlsToFile(arrayOfUrls, (function(status, url, file) {
    if (status !== "success") {
        return console.log("Unable to render '" + url + "'");
    } else {
        return console.log("Rendered '" + url + "' at '" + file + "'");
    }
}), function() {
    return phantom.exit();
});
The names of the rendered files are in the format "rendermulti-" + urlIndex + ".png", but I want them to be the page title + ".png". How can I modify the above code for my requirement?
Since page is shared within RenderUrlsToFile, you can easily change getFilename() in this way:
getFilename = function() {
    var title = page.evaluate(function() {
        return document.title;
    });
    return title + ".png";
};
You also don't need to access the page context (inside of page.evaluate()) to get the title. You can simply access page.title:
getFilename = function() {
    return page.title + ".png";
};
It may be the case that the title contains characters that cannot appear in a directory or file name. If it contains, for example, a/b, this would try to write the file b.png into the directory a, which of course doesn't exist.
Simply replace such characters:
return title.replace(/[\\\/:]/g, "_") + ".png";
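Putting both suggestions together, getFilename() might look like this (the fallback for untitled pages is my assumption, not part of the original answer):
getFilename = function() {
    // page.title is only meaningful once the page has loaded;
    // fall back to the old counter-based name for untitled pages
    var title = page.title || ("rendermulti-" + urlIndex);
    return title.replace(/[\\\/:]/g, "_") + ".png";
};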

CasperJS evaluate() is not executing from within each() block

So I'm crawling a page, collecting links, then I would like to crawl those links to complete my dataset. Here's some code:
crawl.js:
var casper = require("casper").create({
    waitTimeout: 3000,
    pageSettings: {
        userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:23.0) Gecko/20130404 Firefox/23.0"
    },
    clientScripts: ["includes/jquery.min.js"],
    verbose: true
});
var followers = require('./followers');
var currentPage = 1;
var x = require('casper').selectXPath;

Object.size = function(obj) {
    var size = 0, key;
    for (key in obj) {
        if (obj.hasOwnProperty(key)) size++;
    }
    return size;
};

var collectFollowers = function() {
    var url;
    this.echo("capturing page " + currentPage);
    this.capture("wowhead-p" + currentPage + ".png");
    // don't go too far down the rabbit hole
    if (currentPage >= 5 || !this.exists(x('//*[text()="Next ›"]'))) {
        processFollowers.call(casper);
        return terminate.call(casper);
    }
    currentPage++;
    this.echo("requesting next page: " + currentPage);
    url = this.getCurrentUrl();
    var links = this.evaluate(function() {
        var obj = {};
        $('.listview-cleartext').map(function(){
            obj[$(this).text()] = $(this).attr('href');
        });
        return obj;
    });
    for (var key in links) {
        followers.followers[key] = links[key];
    }
    this.echo("Page links: " + Object.size(followers.followers));
    //this.emit('update.followers', links);
    this.thenClick(x('//*[text()="Next ›"]')).then(function() {
        this.waitFor(function() {
            return url !== this.getCurrentUrl();
        }, collectFollowers, processFollowers);
    });
};

var processFollowers = function() {
    this.echo("Total followers:" + Object.size(followers.followers));
    this.each(Object.keys(followers.followers), function(casper, key) {
        this.thenOpen('http://wowhead.com' + followers.followers[key]).then(function() {
            this.echo("On http://wowhead.com" + followers.followers[key]);
            this.evaluate(function() {
                this.echo("Inside the evaluate statement.");
                if ($('a[href=#quests]').length) {
                    this.echo("Has quest!");
                    $('a[href=#quests]').click();
                    var questURL = $('#tab-quests').show().find('.listview-cleartext').attr('href');
                    var questName = $('#tab-quests').show().find('.listview-cleartext').text();
                    this.echo("Quest URL: " + questURL);
                    followers.followers[key] = {"name": key, "quest": {"url": questURL, "name": questName}};
                } else {
                    this.echo("Does not have quest!");
                }
            });
        });
    });
};

var terminate = function() {
    this.echo("Done.").exit();
};

casper.start("http://wowhead.com/followers=2");
casper.waitForSelector(x('//*[text()="Next ›"]'), collectFollowers, processFollowers);
casper.run();
followers.js:
var require = patchRequire(require);
var utils = require('utils');
var followers = {};
exports.followers = followers;
followers is used to store a global variable: an object that I continually build up and update as I crawl pages. I go through 3 pages of data and collect links successfully, then begin to process them. As it stands, CasperJS appears to open each page successfully; however, the evaluate function is never called.
I was able to get this functionality to work in PhantomJS with some async logic, but I switched to CasperJS because it appeared this would be taken care of under the hood. I've tried various combinations of thenOpen(), then() and open(), thenOpen() without the then(), etc. What am I messing up?
casper.evaluate() is sandboxed in the page context in the same way as the PhantomJS version (page.evaluate()). It has no access to variables defined outside of it.
this inside of evaluate() refers to window, not to casper, and I doubt there is such a function as window.echo(). If you want to receive console messages from the page context, you need to register for the remote.message event:
casper.on("remote.message", function(msg){
    this.echo("remote: " + msg);
});
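It can also help to surface errors thrown inside the page context; a small addition using CasperJS's page.error event:
casper.on("page.error", function(msg, trace) {
    // fired whenever a JavaScript error occurs in the page context
    this.echo("page error: " + msg, "ERROR");
});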
You have to explicitly pass the result out of the page context and add it to followers there:
var result = this.evaluate(function() {
    console.log("Inside the evaluate statement.");
    if ($('a[href=#quests]').length) {
        console.log("Has quest!");
        $('a[href=#quests]').click();
        var questURL = $('#tab-quests').show().find('.listview-cleartext').attr('href');
        var questName = $('#tab-quests').show().find('.listview-cleartext').text();
        console.log("Quest URL: " + questURL);
        return {"url": questURL, "name": questName};
    } else {
        console.log("Does not have quest!");
        return null;
    }
});
if (result) {
    followers.followers[key] = {name: key, quest: result};
}

How can I show with PhantomJS the url of the processed page in the generated PDF?

My goal was to generate a PDF from every page included in the sitemap of a website created with Rails, and I'm using PhantomJS to do it. I'm quite new to this field, but I got it working. When I was finished, though, I realized it would also be useful to see at the beginning of every PDF the URL of the page from which the PDF was generated, so I can browse to that page more quickly (the site has over a hundred pages).
Here is the Javascript:
// Render Sitemap to file
var RenderUrlsToFile, arrayOfUrls, system;
system = require("system");

/*
Render given urls
@param array of URLs to render
@param callbackPerUrl Function called after finishing each URL, including the last URL
@param callbackFinal Function called after finishing everything
*/
var getFileNumber = function(urlIndex) {
    if (urlIndex < 10) {
        return "00" + urlIndex;
    } else {
        if (urlIndex < 100) {
            return "0" + urlIndex;
        } else {
            return urlIndex;
        }
    }
};

RenderUrlsToFile = function(urls, callbackPerUrl, callbackFinal) {
    var getFilename, next, page, retrieve, urlIndex, webpage;
    urlIndex = 0;
    webpage = require("webpage");
    page = null;
    getFilename = function() {
        return "rendermulti-" + getFileNumber(urlIndex) + ".pdf";
    };
    next = function(status, url, file) {
        page.close();
        callbackPerUrl(status, url, file);
        return retrieve();
    };
    retrieve = function() {
        var url;
        if (urls.length > 0) {
            url = urls.shift();
            urlIndex++;
            page = webpage.create();
            page.viewportSize = {
                width: 1920,
                height: 1880
            };
            page.settings.userAgent = "Phantom.js bot";
            return page.open(url, function(status) {
                var file;
                file = getFilename();
                if (status === "success") {
                    return window.setTimeout((function() {
                        // !!!!!!!!!!!!! Doesn't work !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
                        page.evaluate(function() {
                            var x = document.getElementById("logoAndNavigation");
                            var newP = document.createElement("P");
                            var textnode = window.location.protocol + "//" + window.location.host + "/" + window.location.pathname;
                            newP.appendChild(textnode);
                            x.insertBefore(newP, x.childNodes[0]);
                        });
                        // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
                        page.render("tempPdfs/" + file);
                        return next(status, url, file);
                    }), 200);
                } else {
                    return next(status, url, file);
                }
            });
        } else {
            return callbackFinal();
        }
    };
    return retrieve();
};

// This makes an array with all the urls inside the sitemap
var arrayOfUrls = [''];
var page = require('webpage').create();
page.open('http://localhost:3000/sitemap.xml', function() {
    var content = page.content;
    parser = new DOMParser();
    xmlDoc = parser.parseFromString(content, 'text/xml');
    var loc = xmlDoc.getElementsByTagName('loc');
    for (var i = 0; i < loc.length; i++) {
        var url = loc[i].textContent;
        arrayOfUrls.push(url);
    }
});

RenderUrlsToFile(arrayOfUrls, (function(status, url, file) {
    if (status !== "success") {
        return console.log("Unable to render '" + url + "'");
    } else {
        return console.log("Rendered '" + url + "' at '" + file + "'");
    }
}), function() {
    return phantom.exit();
});
I tried to solve the issue with the code framed by the comment
// !!!!!!!!!!!!! Doesn't work !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
I wanted to show the URL inside the element of the page that has the id logoAndNavigation, but I get this error:
NOT_FOUND_ERR: DOM Exception 8: An attempt was made to reference a Node in a context where it does not exist.
If I use only a string like "hello" in the variable textnode, it works, but not if I try to use the URL of the page.
Can anyone please help me?
Thank you in advance!
appendChild expects a node, not a string. You probably meant to use:
var x = document.getElementById("logoAndNavigation");
var newP = document.createElement("p"); // small p
var textnode = window.location.protocol + "//" + window.location.host + "/" + window.location.pathname;
newP.innerHTML = textnode; // this
x.insertBefore(newP, x.childNodes[0]);
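Alternatively, since appendChild() accepts nodes, you can keep the original call and wrap the string in a text node:
var newP = document.createElement("p");
// createTextNode() turns the string into a node, which appendChild() accepts
newP.appendChild(document.createTextNode(textnode));
x.insertBefore(newP, x.childNodes[0]);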
You can also use the example of printheaderfooter.js to add the URL directly to the header or footer.
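For reference, that approach sets page.paperSize with a phantom.callback(); a minimal sketch modelled on the documented printheaderfooter.js example (the format and sizes here are placeholders):
page.paperSize = {
    format: "A4",
    margin: "1cm",
    header: {
        height: "1cm",
        contents: phantom.callback(function(pageNum, numPages) {
            // page.url holds the URL of the currently loaded page
            return "<small>" + page.url + " (" + pageNum + " / " + numPages + ")</small>";
        })
    }
};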

How to get all images from document and store to local

My goal is to get all images from a document, then download all images bigger than 150x150px to local storage.
I'm stuck on retrieving the files from the URLs I got in the previous steps. Here is the buggy line (full code at the end):
...
var copyResult = fs.copy(imagesURLs[i], destFile);
...
When I run it from the console, it just hangs on fs.copy(), without any errors.
As far as I can tell, fs.copy() doesn't work with remote URLs, even if you set all the proper args (--load-images=yes, --local-to-remote-url-access=yes). Am I right, or did I do something wrong with copy()? And is there any method to get files directly from WebKit's cache?
I'm on the latest PhantomJS version and an Ubuntu server.
I would appreciate any kind of help.
Full script code:
if (phantom.args.length < 1 || phantom.args.length > 2) {
    console.log('Usage: phantomjs ' + phantom.scriptName + ' <URL>');
    phantom.exit();
} else {
    var page = new WebPage(),
        address = phantom.args[0];
    page.viewportSize = { width: 1200, height: 4000 };
    page.open(address, function (status) {
        if (status === 'success') {
            var imagesURLs = page.evaluate(function () {
                var documentImages = [], imagesCount = document.images.length, index = 0;
                while (index < imagesCount) {
                    if ((document.images[index].width >= 150) && (document.images[index].height >= 150)) {
                        documentImages.push(document.images[index].src);
                    }
                    index++;
                }
                return documentImages;
            });
            var fs = require('fs');
            for (var i in imagesURLs) {
                var fileName = imagesURLs[i].replace(/^.*[\\\/]/, '');
                var destFile = '' + fs.workingDirectory + '/www/images/' + fileName;
                console.log(destFile);
                var copyResult = fs.copy(imagesURLs[i], destFile);
                console.log(copyResult);
            }
        } else {
            console.log('status: ' + status);
        }
        phantom.exit();
    });
}
Try this:
function SaveAs(imgURL) {
    var oPop = window.open(imgURL, "", "width=1, height=1, top=5000, left=5000");
    for (; oPop.document.readyState != "complete"; ) {
        if (oPop.document.readyState == "complete") break;
    }
    oPop.document.execCommand("SaveAs");
    oPop.close();
}
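Note that execCommand("SaveAs") is an old IE-only trick and won't run in PhantomJS. A commonly used PhantomJS workaround is to fetch each image with an XHR in the page context and write the bytes with fs.write() in binary mode; a minimal, untested sketch (saveImage is a hypothetical helper):
var fs = require('fs');

function saveImage(page, imgUrl, destFile) {
    // Fetch the image inside the page context as a raw byte string
    var content = page.evaluate(function (url) {
        var xhr = new XMLHttpRequest();
        xhr.open('GET', url, false); // synchronous, for simplicity
        // Treat the response as raw bytes instead of decoded text
        xhr.overrideMimeType('text/plain; charset=x-user-defined');
        xhr.send();
        return xhr.responseText;
    }, imgUrl);
    // Mask off the high bytes that the x-user-defined charset adds
    var bytes = '';
    for (var i = 0; i < content.length; i++) {
        bytes += String.fromCharCode(content.charCodeAt(i) & 0xff);
    }
    fs.write(destFile, bytes, 'wb');
}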
