With casper.on("resource.requested") we can capture the resource requests and perform checks on them.
On page load, we push all the network request URLs into an array and then traverse the array to count the calls to Google Analytics (i.e. __utm.gif).
// google analytics calls testing
casper.test.begin('Test Container Tags', function suite(test) {
    casper.start("http://www.viget.com/", function() {});

    var urls = [],
        links = [];

    casper.on('resource.requested', function(requestData, resource) {
        urls.push(decodeURI(requestData.url));
    });

    casper.then(function() {
        var found = 0;
        for (var i = 0; i < urls.length; i++) {
            if (urls[i].indexOf('__utm.gif') > -1) {
                found = found + 1;
            }
        }
        casper.echo('found ' + found);
        test.assert(found > 0, 'Page Load Test Complete');
    });

    // Emit "resource.requested" to capture the network request on link click
    casper.then(function(self) {
        var utils = require('utils');
        var x = require('casper').selectXPath;
        casper.click(x("//a[@data-type]"));
        casper.emit('resource.requested'); // this is the part that does not work
    });

    casper.run(function() {
        test.done();
    });
});
But now the next task is to verify the network resource requests on a hyperlink's click event. I tried to make that work with casper.emit("resource.requested"), but with no success.
I have already spent a full day looking for a workaround. Any feedback would be appreciated at this point.
You could use a casper.waitForResource() after the click and do your validation there. (Calling casper.emit('resource.requested') yourself just fires the registered event listeners with no request data; it does not capture anything.)
casper.test.begin('Test Container Tags', function suite(test) {
    casper.start("http://www.viget.com/", function() {});

    var urls = [],
        links = [];

    casper.on('resource.requested', function(requestData, resource) {
        urls.push(decodeURI(requestData.url));
    });

    casper.then(function() {
        var found = 0;
        for (var i = 0; i < urls.length; i++) {
            if (urls[i].indexOf('__utm.gif') > -1) {
                found = found + 1;
            }
        }
        casper.echo('found ' + found);
        test.assert(found > 0, 'Page Load Test Complete');
    });

    // click the link, then wait for the resulting network request
    casper.then(function(self) {
        var utils = require('utils');
        var x = require('casper').selectXPath;
        casper.click(x("//a[@data-type]"));
    });

    casper.waitForResource(function testResource(resource) {
        console.log('----->' + resource.url);
        // the test function must return true for the matching resource
        return resource.url.indexOf('__utm.gif') > -1;
    });

    casper.run(function() {
        test.done();
    });
});
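A stricter variant (a sketch; the __utm.gif pattern and the test object come from the code above) would match the beacon with a regex and assert explicitly, failing on timeout:

casper.waitForResource(/__utm\.gif/, function then() {
    test.assert(true, 'Analytics beacon requested after click');
}, function onTimeout() {
    test.fail('No analytics beacon requested within the timeout');
});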
I'm trying to make an extension that collects social network links from the page the user is on. When the user clicks the getLinks button, we get all the links and then, by checking conditions, pass them into the extension's blocks. I tried to use chrome.tabs.executeScript and get the links via urls = $$('a');, but it's not working.
$('#getLinks').click(function(e) {
    var allLinks = [];
    var i = 0;
    chrome.tabs.executeScript(null, {"code":
        "urls = $$('a'); for (url in urls) { allLinks[i] = urls[url].href; i++; }"
    }, function() {
        var vk;
        var facebook;
        var linkedin;
        for (var i = 0; i < allLinks.length; i++) {
            var profil = allLinks[i].href;
            if (profil.indexOf('vk.com') != -1) {
                vk = profil;
                $('#vk').text(vk);
            }
            if (profil.indexOf('facebook.com') != -1) {
                facebook = profil;
                $('#fb').text(facebook);
            }
            if (profil.indexOf('linkedin.com') != -1) {
                linkedin = profil;
                $('#linkin').text(linkedin);
            }
        }
    });
});
That's not how executeScript is used. That code cannot access the variables allLinks and i because it is executed in the page, not in your extension. But you can make use of the value returned by that code, as in this other SO question:
$('#getLinks').click(function(e) {
    // the last expression in the injected code is its result and must be JSON-serializable,
    // so collect the hrefs into a plain array instead of returning a NodeList
    chrome.tabs.executeScript(null, {"code":
        "Array.prototype.map.call(document.querySelectorAll('a'), function(a) { return a.href; });"
    }, function(results) {
        var allLinks = results[0];
        // use allLinks here
    });
});
So I finally found the answer to my own question and am posting the solution here:
$('#getUser').click(function(e) {
    chrome.tabs.executeScript(null, {code:
        'Array.from(document.getElementsByTagName("a")).map(a => a.innerHTML)'
    }, function(results) {
        var vk = [];
        var facebook = [];
        var linkedin = [];
        var allElements = results[0];
        for (var i = 0; i < allElements.length; i++) {
            if (allElements[i].indexOf("https://vk.com") !== -1) {
                vk.push(allElements[i]);
            }
            if (allElements[i].indexOf("https://facebook.com") !== -1) {
                facebook.push(allElements[i]);
            }
            if (allElements[i].indexOf("https://www.linkedin.com") !== -1) {
                linkedin.push(allElements[i]);
            }
        }
    });
});
All the links we find on the page are sorted into three arrays according to which social network they belong to.
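A more compact variant of the sorting step (a sketch under the same assumption as above, i.e. results[0] holds the collected strings) could use Array.prototype.filter:

var allElements = results[0];
// one pass per network; a match means the string contains that domain
var vk       = allElements.filter(function(s) { return s.indexOf("vk.com") !== -1; });
var facebook = allElements.filter(function(s) { return s.indexOf("facebook.com") !== -1; });
var linkedin = allElements.filter(function(s) { return s.indexOf("linkedin.com") !== -1; });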
The following code is a simple scraper written in CasperJS.
var casper = require('casper').create();
var url = casper.cli.get(0);
var page1 = casper.cli.get(1);
var page2 = casper.cli.get(2);
var proxy = casper.cli.get(3);

// rough URL pattern used to filter the collected links
var exp = /[-a-zA-Z0-9#:%_\+.~#?&//=]{2,256}\.[a-z]{2,4}\b(\/[-a-zA-Z0-9#:%_\+.~#?&//=]*)?/gi;
var regex = new RegExp(exp);
var baseUrl = url;
var nextBtn = "a.navigation-button.next";
var allLinks = [];

casper.start(baseUrl);
casper.waitForSelector(nextBtn, processPage);
casper.run();

function processPage() {
    for (var i = page1; i < page2; i++) {
        console.log(i);
        var pageData = this.evaluate(getPageData);
        allLinks = allLinks.concat(pageData);
        if (!this.exists(nextBtn)) {
            return;
        }
        this.thenClick(nextBtn).then(function() {
            this.echo(this.getCurrentUrl());
        });
    }
}

function getPageData() {
    var links = document.getElementsByClassName('pro-title');
    links = Array.prototype.map.call(links, function(link) {
        return link.getAttribute('href');
    });
    return links;
}
casper.then(function() {
    this.each(allLinks, function(self, link) {
        if (link.match(regex)) {
            self.thenOpen(link, function() {
                var jsonObj = {};
                jsonObj.title = this.fetchText('a.profile-full-name');
                jsonObj.services = this.getHTML('div.info-list-text span:nth-child(2) span');
                jsonObj.services = jsonObj.services.replace(/&/g, "and");
                jsonObj.location = this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span');
                jsonObj.description = this.getHTML('div.profile-about div:nth-child(1)');
                require('utils').dump(jsonObj);
            });
        }
    });
});
I am executing this script as follows,
casperjs scraping.js http://www.houzz.com/professionals/c/Chicago--IL/p/15 1 3
The first CLI argument is the starting URL. The second and third arguments are the starting and ending page numbers of the scrape.
I am able to extract data from the first page, but I don't understand why I am not able to extract data from any of the subsequent pages.
You cannot mix synchronous and asynchronous code like this in processPage. The loop is executed immediately, but the click and the loading of the next page happen asynchronously, so the evaluation of each page also has to be done asynchronously:
function processPage() {
    for (var i = page1; i < page2; i++) {
        this.then(function() {
            console.log(i);
            var pageData = this.evaluate(getPageData);
            allLinks = allLinks.concat(pageData);
            if (!this.exists(nextBtn)) {
                return;
            }
            this.thenClick(nextBtn).then(function() {
                this.echo(this.getCurrentUrl());
            });
        });
    }
}
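One caveat worth noting (not part of the original answer): var i is shared by every queued step, so each console.log(i) above will print the final loop value by the time the steps actually run. If the per-iteration value matters, it can be captured with an IIFE, roughly like this:

function processPage() {
    for (var i = page1; i < page2; i++) {
        (function(pageIndex) { // freeze the loop value for this step
            casper.then(function() {
                console.log(pageIndex);
                allLinks = allLinks.concat(this.evaluate(getPageData));
                if (this.exists(nextBtn)) {
                    this.thenClick(nextBtn);
                }
            });
        })(i);
    }
}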
After asking a question and getting a very helpful answer on what the 'Async Juggling' assignment in learnyounode was asking me to do, I set out to implement it myself.
The problem is, my setup isn't having any success! Even though I've referred to other solutions out there, it simply doesn't return any results when I run learnyounode verify myscript.js.
GIST: jugglingAsync.js
var http = require('http');

var app = (function () {
    // Private variables...
    var responsesRemaining,
        urls = [],
        responses = [];

    var displayResponses = function() {
        for (var iterator in responses) {
            console.log(responses[iterator]);
        }
    };

    // Public scope...
    var pub = {};

    pub.main = function (args) {
        responsesRemaining = args.length - 2;
        // For every argument, push a URL and prep a response.
        for (var i = 2; i < args.length; i++) {
            urls.push(args[i]);
            responses.push('');
        }
        // For every URL, set off an async request.
        for (var iterator in urls) {
            var i = iterator;
            var url = urls[i];
            http.get(url, function(response) {
                response.setEncoding('utf8');
                response.on('data', function(data) {
                    if (response.headers.host == url)
                        responses[i] += data;
                });
                response.on('end', function() {
                    if (--responsesRemaining == 0)
                        displayResponses();
                });
            });
        }
    };

    return pub;
})();

app.main(process.argv);
Question: What am I doing wrong?
This line
for(var iterator in urls) {
doesn't do what you think it does. It actually loops over the enumerable property keys of urls (see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/for...in), and because var is function-scoped, all of your request callbacks end up sharing the same i and url. Instead, you have to do something like
for(var i = 0; i < urls.length; i++) {
var url = urls[i];
...
}
or
urls.forEach(function(url, index) {
...
});
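Applied to the script above, the request loop might look like this (a sketch; the surrounding module stays the same, and the host-header check is dropped because each callback now closes over its own url and index):

urls.forEach(function(url, index) {
    http.get(url, function(response) {
        response.setEncoding('utf8');
        response.on('data', function(data) {
            responses[index] += data; // index is unique per callback
        });
        response.on('end', function() {
            if (--responsesRemaining === 0)
                displayResponses();
        });
    });
});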
In addition to not properly looping through the arrays inside the app module, I was also not properly concatenating data returned from the response.on('data') event. Originally I was doing
responses[index] += data;
Instead, I changed it to:
responses[index] = responses[index] + data;
(Strictly speaking, those two statements are equivalent in JavaScript; the fix that actually mattered was the loop change, which makes index refer to the right response.) Changing that, as well as the things noted by @arghbleargh, got the 'Async Juggling' to fully verify!
I have tested my code and it all worked:
~ $ node juggling_async.js site1 site2 site3 site4 ...
The JS code is not limited to only three sites.
var http = require('http');

// Process all the site names from the arguments and store them in sites[].
// This way the count is not limited to only 3 sites.
var sites = [];
(function loadSites() {
    for (var i = 2, len = process.argv.length; i < len; ++i) {
        var site = process.argv[i];
        if (site.substr(0, 7) != 'http://') site = 'http://' + site;
        sites.push(site);
    }
})();

var home_pages = [];
var count = 0;

function httpGet(index) {
    var home_page = '';
    var site = sites[index];
    http.get(site, function(res) {
        res.setEncoding('utf8');
        res.on('data', function(data) {
            home_page += data;
        });
        res.on('end', function() {
            ++count;
            home_pages[index] = home_page;
            if (count == sites.length) {
                // Yahoo! We have reached the last one.
                for (var i = 0; i < sites.length; ++i) {
                    console.log('\n############ Site #' + (+i + 1) + ': ' + sites[i]);
                    console.log(home_pages[i]);
                    console.log('============================================\n');
                }
            }
        });
    }).on('error', function(e) {
        console.log('Error at loop index ' + index + ': ' + e.message);
    });
}

for (var i = 0; i < sites.length; ++i) {
    httpGet(i);
}
I'm chaining getJSON requests with .when.
Code is similar to this:
$.when($.getJSON(url0), $.getJSON(url1), $.getJSON(url2)).done(function() {
    $.each(arguments, function(index, result) { …
How can I write this so that it proceeds whether the URL set contains only url0, or also url3, url4, or more?
I store the url vars in a file or in local storage.
var list = ['obj1', 'obj2', 'obj3', 'obj4', 'obj5'];
var callback = function() {
    console.log("done");
};
var requests = [];
for (var i = 0; i < list.length; i++) {
    requests.push($.ajax({
        url: 'url', // placeholder URL
        success: function() {
            console.log('suc');
        }
    }));
}
$.when.apply(undefined, requests).then(function() { callback(); });
If you give me some more details, I can load that into the array and show you how it works.
JSFiddle demo: http://jsfiddle.net/MBZEu/4/
or you can try
var urlArr = ['url1', 'url2', 'url3', 'url4', 'url5'];
var callback = function() {
    console.log("done");
};
var requests = [];
for (var i = 0; i < urlArr.length; i++) {
    requests.push($.getJSON(urlArr[i])); // one request per URL
}
$.when.apply(undefined, requests).then(function() { callback(); });
or use this to see what's going on with each request:
requests.push($.getJSON(urlArr[i], function(res){console.log(res)}));
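Since $.when passes one argument per deferred to its callback, the results can be walked with arguments, much like the $.each pattern in the question. A sketch (note the caveat that with a single request, $.when passes data, statusText and jqXHR through directly instead of one array per request):

$.when.apply($, requests).done(function() {
    $.each(arguments, function(index, result) {
        // with two or more requests, result is [data, statusText, jqXHR]
        console.log('response', index, result[0]);
    });
});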
Can PhantomJS be used as an alternative to BeautifulSoup?
I am trying to search on Etsy and visit all of the resulting links in turn. In Python I know how to do this (with BeautifulSoup), but today I want to see if I can do the same with PhantomJS. I'm not getting very far.
This script should search for "hello kitty" on Etsy, collect all of the product links
<a class="listing-thumb" href=...></a> and print them to the console. Ideally I'd visit them later on and get the information I need. Right now it just freezes. Any ideas?
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';
page.open(url, function(status) {
    // list all the a.href links in the hello kitty etsy page
    var link = page.evaluate(function() {
        return document.querySelectorAll('a.listing-thumb');
    });
    for (var i = 0; i < link.length; i++) {
        console.log(link[i].href);
    }
    phantom.exit();
});
I have toyed with using CasperJS, which may be better designed for this.
PhantomJS' evaluate() cannot serialize and return complex objects like HTMLElements or NodeLists, so you have to map them to serializable values first:
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';
page.open(url, function(status) {
    // list all the a.href links in the hello kitty etsy page
    var links = page.evaluate(function() {
        return [].map.call(document.querySelectorAll('a.listing-thumb'), function(link) {
            return link.getAttribute('href');
        });
    });
    console.log(links.join('\n'));
    phantom.exit();
});
Note: here we use [].map.call() in order to treat a NodeList as a standard Array.
The only problem with your code is the PhantomJS scoping: there is a phantom scope and a page scope. You tried to return DOM object references (which cannot be serialized) from the page scope (page.evaluate runs in the page scope) to the main phantom scope, and that is not possible. Here is code that works:
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';

// for debug (to see if the page returns status code 200)
page.onResourceReceived = function(response) {
    if (response.url === url) {
        console.log('Resource: "' + response.url + '" status: ' + response.status);
        if (response.status === 200) {
            console.log(response.url);
            for (var i = 0; i < response.headers.length; i++) {
                console.log(response.headers[i].name + ': ' + response.headers[i].value);
            }
        }
    }
};

page.onLoadFinished = function(status) {
    console.log('Status: ' + status);
    console.log('Starting evaluate...');
    var links = page.evaluate(function() {
        var nodes = [],
            matches = document.querySelectorAll("a.listing-thumb");
        for (var i = 0; i < matches.length; ++i) {
            nodes.push(matches[i].href);
        }
        return nodes;
    });
    console.log('Done evaluate... count: ' + links.length);
    if (links && links.length > 0) {
        for (var i = 0; i < links.length; ++i) {
            console.log('(' + i + ') ' + links[i]);
        }
    } else {
        console.log("No match found!");
    }
    phantom.exit(0);
};

page.open(url);
Here is some code I recently wrote that scrapes URLs using PhantomJS. If you provide only a URL, it will display all the URLs on the page; if you supply an argument of "class" or "id" followed by a class/id name, it will display only the URLs within that class/id.
//////////////////////////////////////////////////////////
/////           PhantomJS URL Scraper v.1.3           /////
//
// Copyrighted by +A.M.Danischewski 2016+ (c)
// This program may be reutilized without limits, provided this
// notice remains intact.
//
// Usage: phantomjs phantom_urls.js <URL> [["class"|"id"] [<query id/class name>]]
//
// Argument 1: URL -- "https://www.youtube.com/watch?v=8TniRMwL2Vg"
// Argument 2: "class" or "id"
// Argument 3: If Argument 2 was provided, "class name" or "id name"
//
// By default this program will display ALL urls from a user supplied URL.
// If a class name or id name is provided then only URLs from that class
// or id are displayed.
//
///////////////////////////////////

var page = require('webpage').create(),
    system = require('system'),
    address;

if (system.args.length === 1) {
    console.log(' Usage: phantomjs phantom_urls.js <URL> [["class"|"id"] [<query id/class name>]]');
    phantom.exit();
}

address = system.args[1];
var querytype = system.args[2];
var queryclass = system.args[3];

page.open(address, function(status) {
    if (status !== 'success') {
        console.log('Error loading address: ' + address);
    }
});

page.onConsoleMessage = function(msg) {
    console.log(msg);
};

page.onLoadFinished = function(status) {
    // these function-source strings are evaluated in the page context,
    // with the user-supplied class/id name spliced in beforehand
    var dynclass = "function() { window.class_urls = []; var listings = document.getElementsByClassName('" + queryclass + "'); for (var i = 0; i < listings.length; i++) { var ellnks = [].map.call(listings[i].querySelectorAll('a'), function(link) { return link.getAttribute('href'); }); window.class_urls.push(ellnks.join('\\n')); } return window.class_urls; }";
    var dynid = "function() { window.id_urls = []; var listing = document.getElementById('" + queryclass + "'); var ellnks = [].map.call(listing.querySelectorAll('a'), function(link) { return link.getAttribute('href'); }); window.id_urls.push(ellnks.join('\\n')); return window.id_urls; }";
    if (querytype === "class") {
        console.log(page.evaluate(dynclass).toString().replace(/,/g, "\n"));
    } else if (querytype === "id") {
        console.log(page.evaluate(dynid).toString().replace(/,/g, "\n"));
    } else {
        var links = page.evaluate(function() {
            return [].map.call(document.querySelectorAll('a'), function(link) {
                return link.getAttribute('href');
            });
        });
        console.log(links.join('\n'));
    }
    phantom.exit();
};
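For example, to list only the URLs inside a given class (the YouTube URL is the one from the header comment; the class name here is purely hypothetical):

phantomjs phantom_urls.js "https://www.youtube.com/watch?v=8TniRMwL2Vg" class "yt-lockup-title"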