jQuery indexOf error for xdomain.js cross-domain link script - javascript

I'm using a script to detect cross-domain links for google analytics cross-domain tracking. The original script (xdomain.js) was provided by the great folks at Luna Metrics. Here is the script with my modifications, hat-tip to educardocereto here on StackOverflow for the suggested changes to enable setAllowAnchor in the GATC (I've commented line 40 where the console error first points to):
var jQueryXD = jQuery.noConflict();
/* I added var because page loads 2 versions
of jquery - not the source of the problem.*/
/**
 * Walks every <a> element on the page and binds Google Analytics click
 * handlers to it: a 'Download' event for links whose href contains one of
 * `fileTypes`, cross-domain linking for links pointing at one of `domains`
 * (other than the current one), and an 'Outbound Link' event for any other
 * absolute link outside the current root domain.
 */
function listenToClicks()
{
var domains=["domain1.com", "domain2.com"];
var fileTypes=[".pdf"];
jQueryXD('a').each(function(index) {
var link = jQueryXD(this);
var href = link.attr('href');
jQueryXD.each(fileTypes, function(i) {
// NOTE(review): `link` is already a jQuery object and `href` is already cached,
// so re-wrapping and re-reading the attribute is redundant. When the attribute
// read yields undefined, the .indexOf call below throws a TypeError.
if(jQueryXD(link).attr('href').indexOf(this)!=-1){ //this is line 40
// `valid` is declared further down with `var` (hoisted to the top of this
// callback) and is unconditionally reset to false there, so this assignment
// has no lasting effect.
valid = false;
jQueryXD(link).bind('click', function(c) {
c.preventDefault();
_gat._getTrackerByName()._trackEvent('Download', 'Click - ' + jQueryXD(link).attr('href'));
// Navigation is delayed by 100 ms after the event is recorded. The redirect
// is passed to setTimeout as a string of code built from the href.
setTimeout('document.location = "' + jQueryXD(link).attr('href') + '"', 100);
});
}
});
var valid = false;
jQueryXD.each(domains, function(j) {
try
{
// Match links that point at a listed domain while the current page is NOT on that domain.
if((jQueryXD(link).attr('href').indexOf(this)!=-1)&&(window.location.href.indexOf(this)==-1)){
valid = true;
if (valid)
{
jQueryXD(link).bind('click', function(l) {
// Only intercept the click when the _gat object exists.
if(typeof(_gat)=="object"){
l.preventDefault();
if (jQueryXD(link).attr('target') != "_blank")
{ // _gaq.push(['_link',jQueryXD(link).attr('href')]);
_gaq.push(['_link',jQueryXD(link).attr('href'), true]); // mod
}
else
{
// For target="_blank" links the linker URL is built explicitly and opened in a new window.
var tracker = _gat._getTrackerByName();
//var fullUrl = tracker._getLinkerUrl(jQueryXD(link).attr('href'));
var fullUrl = tracker._getLinkerUrl(jQueryXD(link).attr('href'), true); //mod
window.open(fullUrl);
}
}
});
}
}
}
catch(e)
{
//Bad A tag
// Any exception raised while inspecting this link is silently discarded.
}
});
// Root domain = last two dot-separated labels of document.domain.
var rootDomain = document.domain.split(".")[document.domain.split(".").length - 2] + "." + document.domain.split(".")[document.domain.split(".").length - 1];
// Absolute links that leave the root domain and were not matched as cross-domain above.
if ( (href.match(/^http/)) && (href.indexOf(rootDomain) == -1) && !valid) {
jQueryXD(link).bind('click', function(d) {
d.preventDefault();
_gat._getTrackerByName()._trackEvent('Outbound Link', href);
setTimeout('document.location = "' + href + '"', 100);
});
}
});
}
// Bind the handlers once the DOM is ready.
jQueryXD(document).ready(function() {
listenToClicks();
});
The output from the Chrome javascript console:
Uncaught TypeError:
Cannot call method 'indexOf' of undefined xdomain-nfi-nfs-anchormod-noconflict.js:40
jQueryXD.each.valid xdomain-nfi-nfs-anchormod-noconflict.js:40
jQuery.extend.each jquery-1.2.6.min.js:21
(anonymous function) xdomain-nfi-nfs-anchormod-noconflict.js:39
jQuery.extend.each jquery-1.2.6.min.js:21
jQuery.fn.jQuery.each jquery-1.2.6.min.js:12
listenToClicks xdomain-nfi-nfs-anchormod-noconflict.js:35
(anonymous function) xdomain-nfi-nfs-anchormod-noconflict.js:100
jQuery.fn.extend.ready jquery-1.2.6.min.js:27
jQuery.extend.ready.jQuery.readyList jquery-1.2.6.min.js:27
jQuery.extend.each jquery-1.2.6.min.js:21
jQuery.extend.ready jquery-1.2.6.min.js:27
So at least it does not seem to be mixing up the two jQuery instances. I've also tried it with jQuery 1.7.1. I'm using 1.2.6 because the script seems to have been tested most on that version.

Here you cache both the jQuerified element and the href attr.
var link = jQueryXD(this);
var href = link.attr('href');
Then why do you later do this:
jQueryXD(link).attr('href').indexOf(this)
You could either call link.attr('href').indexOf(this) since link is already a jQuery object or you could use directly the href you cached and do this href.indexOf(this).
Still I think the error you see happens when a link doesn't have an href attribute. So you better check if the href is not undefined before continuing your logic.
I tested it on both jQuery 1.2.6 and 1.7. It seems to be working fine.
Here's the finished script.
var jQueryXD = jQuery.noConflict();
/* I added var because page loads 2 versions
of jquery - not the source of the problem.*/

/**
 * Binds Google Analytics (ga.js) click tracking to every link on the page.
 *
 * For each <a> that has an href:
 *  - href contains one of `fileTypes`  -> 'Download' event, then navigate;
 *  - href points at one of `domains` and the current page is on a different
 *    domain -> cross-domain linking (_link / _getLinkerUrl, anchor mode);
 *  - any other absolute link outside the current root domain
 *    -> 'Outbound Link' event, then navigate.
 *
 * Every handler leaves the click alone when ga.js (_gat) is not loaded, so a
 * missing or blocked analytics script never produces a dead link.
 */
function listenToClicks() {
    var domains = ["domain1.com", "domain2.com"];
    var fileTypes = [".pdf"];

    // Loop-invariant: last two labels of the host. Single-label hosts such as
    // "localhost" are used as-is instead of producing "undefined.localhost".
    var hostParts = document.domain.split(".");
    var rootDomain = hostParts.length < 2 ? document.domain : hostParts.slice(-2).join(".");

    function gaLoaded() {
        return typeof _gat == "object";
    }

    // Give the tracking request a moment to leave before navigating. A function
    // callback (not a code string) keeps hrefs containing quotes working.
    function navigateSoon(target) {
        setTimeout(function () {
            document.location = target;
        }, 100);
    }

    jQueryXD('a').each(function () {
        var link = jQueryXD(this);
        var href = link.attr('href');
        if (!href) {
            // This element doesn't have a href
            return true;
        }
        var isCrossDomain = false;

        jQueryXD.each(fileTypes, function (i, fileType) {
            if (href.indexOf(fileType) != -1) {
                link.bind('click', function (c) {
                    if (!gaLoaded()) {
                        return;
                    }
                    c.preventDefault();
                    _gat._getTrackerByName()._trackEvent('Download', 'Click - ' + href);
                    navigateSoon(href);
                });
            }
        });

        jQueryXD.each(domains, function (j, domain) {
            if ((href.indexOf(domain) != -1) && (window.location.href.indexOf(domain) == -1)) {
                isCrossDomain = true;
                link.bind('click', function (l) {
                    if (!gaLoaded()) {
                        return;
                    }
                    l.preventDefault();
                    if (link.attr('target') != "_blank") {
                        _gaq.push(['_link', href, true]); // mod
                    } else {
                        // New-window links need the linker URL built explicitly.
                        var tracker = _gat._getTrackerByName();
                        var fullUrl = tracker._getLinkerUrl(href, true); //mod
                        window.open(fullUrl);
                    }
                });
            }
        });

        if ((href.match(/^http/)) && (href.indexOf(rootDomain) == -1) && !isCrossDomain) {
            link.bind('click', function (d) {
                if (!gaLoaded()) {
                    return;
                }
                d.preventDefault();
                _gat._getTrackerByName()._trackEvent('Outbound Link', href);
                navigateSoon(href);
            });
        }
    });
}
jQueryXD(document).ready(function() {
    listenToClicks();
});
But you might be reinventing the wheel here. There are some better scripts out there to achieve the same thing. I think you might be interested in looking into GAS. It's a wrapper around ga.js that extends and add a bunch of stuff including crossDomain and downloadTracking.
Spoiler: I'm the main developer of GAS.
https://github.com/CardinalPath/gas

Related

CEFSharp RegisterExtension not working

I had a screen scraper app using CEFSharp that was working fine until I updated CEFSharp to the latest version. It appears that the way I was registering JavaScript extension functions no longer works. Here is my startup code:
[STAThread]
public static void Main()
{
    // Application entry point: registers the showModalDialog JavaScript
    // extension, initializes CEF, then runs the browser form. Any exception
    // raised along the way is shown to the user in a message box.
    try
    {
        var cefSettings = new CefSettings();
        var modalDialogExtension = new CefExtension("showModalDialog", Resources.showModalDialog);
        cefSettings.RegisterExtension(modalDialogExtension);

        //Perform dependency check to make sure all relevant resources are in our output directory.
        Cef.Initialize(cefSettings, performDependencyCheck: true, browserProcessHandler: null);
        ProcessCommandLine();

        var mainForm = new BrowserForm("https://www.google.com");
        Application.Run(mainForm);
    }
    catch (Exception error)
    {
        MessageBox.Show(error.ToString());
    }
}
If I comment out the settings.RegisterExtension line, it runs fine. It used to work. Here is the code for my extension:
(function () {
    /**
     * Resolves `href` against the current document and returns it as an
     * absolute URL (protocol, host, path, query and hash).
     */
    absolutePath = function (href) {
        var link = document.createElement("a");
        link.href = href;
        return (link.protocol + "//" + link.host + link.pathname + link.search + link.hash);
    }

    /**
     * Polyfill for the removed window.showModalDialog, built on <dialog> and
     * an <iframe>.
     *
     * @param {string} url  URL of the dialog content.
     * @param {*}      arg  Value exposed to the dialog as `dialogArguments`.
     * @param {string} opt  Options (dialogTop;dialogLeft;dialogWidth;dialogHeight)
     *                      or plain CSS for the dialog element.
     * @returns {Promise} When the calling function's source contains `yield`,
     *                    a Promise resolved with the dialog's `returnValue`.
     * @throws {string} Otherwise a string is thrown on purpose to stop the
     *                  caller while the dialog is open.
     */
    showModalDialog = function (url, arg, opt) {
        url = url || ''; //URL of a dialog
        arg = arg || null; //arguments to a dialog
        opt = opt || 'dialogWidth:300px;dialogHeight:200px'; //options: dialogTop;dialogLeft;dialogWidth;dialogHeight or CSS styles
        var caller = showModalDialog.caller.toString();
        var dialog = document.body.appendChild(document.createElement('dialog'));
        dialog.setAttribute('style', opt.replace(/dialog/gi, ''));
        // The close link must carry id="dialog-close": the click listener below
        // looks it up by that id and would fail on a null element otherwise.
        dialog.innerHTML = '<a href="#" id="dialog-close" style="position: absolute; top: 0; right: 4px; font-size: 20px; color: #000; text-decoration: none; outline: none;">&times;</a>' +
            '<iframe id="dialog-body" name="dialog-body" src="' + absolutePath(url) + '" style="border: 0; width: 100%; height: 100%;"></iframe>';
        document.getElementById('dialog-close').addEventListener('click', function (e) {
            e.preventDefault();
            dialog.close();
        });
        document.getElementById('dialog-body').addEventListener('load', function (e) {
            // Size the frame to its content and let the dialog page close itself.
            this.style.height = this.contentWindow.document.body.scrollHeight + 'px';
            this.style.width = this.contentWindow.document.body.scrollWidth + 'px';
            this.contentWindow.close = function () {
                dialog.close();
            };
            this.contentWindow.dialogArguments = arg;
            this.window = this.contentWindow;
        });
        dialog.showModal();

        // Reads the dialog's result and removes the dialog from the page.
        function finish() {
            var returnValue = document.getElementById('dialog-body').contentWindow.returnValue;
            document.body.removeChild(dialog);
            return returnValue;
        }

        //if using yield
        if (caller.indexOf('yield') >= 0) {
            return new Promise(function (resolve, reject) {
                dialog.addEventListener('close', function () {
                    resolve(finish());
                });
            });
        }
        //if using eval (re-running the caller's remaining statements is disabled,
        //so the dialog is only cleaned up on close)
        dialog.addEventListener('close', function () {
            finish();
        });
        throw 'Execution stopped until showModalDialog is closed';
    };
})();
Did something change about the syntax of extensions?
https://bugs.chromium.org/p/chromium/issues/detail?id=665391
It's a Chrome thing and it doesn't look like they are going to fix it.

JQuery History.js plugin not replacing state of one page in both HTML4 and HTML5 browsers

I am using JQuery History.js plugin to enable History API in HTML5 browsers and emulate in HTML4 browsers. I am using Ajaxify script to implement this plugin. I changed this script a little as shown:
var History, $, document;

/**
 * Copies the History.js object, jQuery and the document from `window` into
 * the script-level variables declared above.
 */
function PrepareVariables() {
    History = window.History;
    $ = window.jQuery;
    document = window.document;
}
/**
 * Sets up History.js based navigation: defines the `:internal` selector and
 * the `$.fn.ajaxify` helper, ajaxifies the links inside <body>, and on every
 * `statechange` fades the content out and requests the new content through
 * callAjax("GetContent", ...), handing the response to setContentData.
 */
function InitHistory() {
// Prepare Variables
var
/* Application Specific Variables */
//contentSelector = '#content,article:first,.article:first,.post:first',
contentSelector = '#navcontent';
// NOTE(review): the ';' above ends the `var` statement, so only contentSelector
// is declared locally. Every name in the comma expression below is assigned
// without `var`. setContentData reads $content, $body, $window,
// completedEventName and scrollOptions from outside this function, so it
// appears to depend on that - confirm before "fixing" the semicolon.
$content = $(contentSelector), //.filter(':first'),
//contentNode = $content.get(0),
$menu = $('#menu,#nav,nav:first,.nav:first').filter(':first'),
activeClass = 'active selected current youarehere',
activeSelector = '.active,.selected,.current,.youarehere',
menuChildrenSelector = '> li,> ul > li',
completedEventName = 'statechangecomplete',
/* Application Generic Variables */
$window = $(window),
$body = $(document.body),
rootUrl = History.getRootUrl(),
scrollOptions = {
duration: 800,
easing: 'swing'
};
// Ensure Content
// Fall back to <body> when the content selector matched nothing.
if ($content.length === 0) {
$content = $body;
}
// Internal Helper
// Custom ':internal' pseudo-selector: true for links whose href starts with
// rootUrl or contains no ':' at all.
$.expr[':'].internal = function (obj, index, meta, stack) {
// Prepare
var
$this = $(obj),
url = $this.attr('href') || '',
isInternalLink;
// Check link
isInternalLink = url.substring(0, rootUrl.length) === rootUrl || url.indexOf(':') === -1;
// Ignore or Keep
return isInternalLink;
};
// HTML Helper
// Rewrites document-level tags (html, head, body, title, meta, script) into
// <div class="document-..."> wrappers and strips the DOCTYPE.
// NOTE(review): documentHtml is not called anywhere in this function.
var documentHtml = function (html) {
// Prepare
var result = String(html)
.replace(/<\!DOCTYPE[^>]*>/i, '')
.replace(/<(html|head|body|title|meta|script)([\s\>])/gi, '<div class="document-$1"$2')
.replace(/<\/(html|head|body|title|meta|script)\>/gi, '</div>');
// Return
return $.trim(result);
};
// Ajaxify Helper
// Binds a click handler to every descendant <a data-isnav='0'> that pushes the
// link's href as a new History state instead of letting the browser navigate.
$.fn.ajaxify = function () {
// Prepare
var $this = $(this);
// Ajaxify
//$this.find('a:internal:not(.no-ajaxy)').click(function (event) {
$this.find("a[data-isnav='0']").click(function (event) {
// Prepare
var
$this = $(this),
url = $this.attr('href'),
title = ($this.attr('title') || null);
// Continue as normal for cmd clicks etc
if (event.which == 2 || event.metaKey) {
return true;
}
// Ajaxify this link
History.pushState(null, title, url);
event.preventDefault();
return false;
});
// Chain
return $this;
};
// Ajaxify our Internal Links
$body.ajaxify();
// Hook into State Changes
$window.bind('statechange', function () {
// Prepare Variables
// `url` and `relativeUrl` are local to this handler.
var
State = History.getState(),
url = State.url,
relativeUrl = url.replace(rootUrl, '');
// Start Fade Out
// Animating to opacity to 0 still keeps the element's height intact
// Which prevents that annoying pop bang issue when loading in new content
$content.animate({
opacity: 0
}, 800);
// Ajax Request the Traditional Page
// callAjax is defined outside this function; the two function arguments look
// like a before-send hook and a success callback - TODO confirm its signature.
callAjax("GetContent", {
URL: url /*typeOfHeader: contentType, argsdata: argdata*/
},
false,
function () {
var ops = $('#ops');
if (ops != null) ops.html('');
ShowProgress('');
//var now = (new Date()).getTime(); //Caching
//if (headerCache.exist(url)) {
// tDiff = now - headerCacheTime;
// if (tDiff < 3000) {
// setContentData(headerCache.get(url));
// return true;
// }
//}
},
function (d) {
//headerCache.set(url, d, null);
//cacheName = url;
HideProgress();
setContentData(d);
}, null);
// end ajax
}); // end onStateChange
}
// Bootstrap: resolve the shared variables, then start History handling once
// the DOM is ready - but only when History.js reports itself as enabled.
(function (window, undefined) {
    var startWhenReady = function () {
        InitHistory();
    };

    PrepareVariables();

    if (History.enabled) {
        $(startWhenReady);
        return;
    }
    return false;
})(window);
/**
 * Replaces the current History state with the page as it is now (title and
 * host-relative URL) and makes every <a data-isnav="0"> link push a new
 * History state instead of navigating.
 *
 * Safe to call repeatedly: the click handler is bound under the
 * 'click.updateHistory' namespace and any previous binding from this function
 * is removed first, so handlers do not stack up and push duplicate states.
 */
function UpdateHistory() {
    var title = (document.title.trim().length > 0 ? document.title : null);
    // Drop scheme and host; keep path, query string and hash.
    var url = window.location.href.replace(/^.*\/\/[^\/]+/, '');
    var History = window.History;
    History.replaceState(null, title, url);
    $('a[data-isnav="0"]')
        .unbind('click.updateHistory')
        // The handler must receive `event` itself; a global `event` object is
        // not available in every browser.
        .bind('click.updateHistory', function (event) {
            // Prepare
            var
                $this = $(this),
                url = $this.attr('href'),
                title = ($this.attr('title') || null);
            // Continue as normal for cmd clicks etc
            if (event.which == 2 || event.metaKey) {
                return true;
            }
            // Ajaxify this link
            History.pushState(null, title, url);
            event.preventDefault();
            return false;
        });
}
/**
 * Injects an Ajax response into the content area and notifies listeners and
 * trackers of the page change.
 *
 * @param {Object} d  Ajax response; `d.data` holds the HTML for the content area.
 * @returns {boolean|undefined} false when the response carried no content (the
 *          browser is then sent to the state's URL with a normal page load),
 *          otherwise undefined.
 *
 * Reads $content, $body, $window, completedEventName, scrollOptions and
 * initContent from the enclosing script.
 */
function setContentData(d) {
    var data = d.data;
    // The URL being shown is taken from the current History state; no `url` or
    // `relativeUrl` variable is visible from this function otherwise.
    var url = window.History.getState().url;
    var relativeUrl = url.replace(window.History.getRootUrl(), '');

    // Fetch the content
    var contentHtml = data;
    if (!contentHtml) {
        // Nothing usable came back: fall back to a traditional page load.
        document.location.href = url;
        return false;
    }

    // Update the content: finish the running fade-out first so it cannot
    // override the opacity restored here.
    $content.stop(true, true);
    $content.html(contentHtml).ajaxify().css('opacity', 1).show(); /* you could fade in here if you'd like */

    //Intialize other content
    initContent();

    // Complete the change
    if ($body.ScrollTo || false) {
        $body.ScrollTo(scrollOptions);
    } /* http://balupton.com/projects/jquery-scrollto */
    $window.trigger(completedEventName);

    // Inform Google Analytics of the change
    if (typeof window._gaq !== 'undefined') {
        window._gaq.push(['_trackPageview', relativeUrl]);
    }

    // Inform ReInvigorate of a state change
    if (typeof window.reinvigorate !== 'undefined' && typeof window.reinvigorate.ajax_track !== 'undefined') {
        window.reinvigorate.ajax_track(url);
        // ^ we use the full url here as that is what reinvigorate supports
    }
}
It is working fine and the content added on page using Ajax is added to previous state using UpdateHistory() function. On some pages the state is updated successfully but on one page it is not updating the content when the page is accessed for the second time. I searched SO for all the similar questions but unable to get any solution. First I thought the problem is with Internet Explorer but then I tried it on Firefox but it didn't work. Please tell me what can be the reason?
UPDATE
It's working for URLs like:
http://localhost:13956/AppStore/App/2012/Install
But not for:
http://localhost:13956/AppStore
It looks like the first page is not saved. Try calling UpdateHistory() or History.pushState(null, title, url) inside InitHistory().

How can I show with PhantomJS the url of the processed page in the generated PDF?

My goal was to generate a PDF from every page included in the sitemap of a website created with Rails. I'm using PhantomJS to get it. I'm quite new in this field, but I could do it, but when I was finished, I realized that it would be usable also to see at the beginning of every PDF the url of the page from which the PDF was generated, so I can browse quicker to the page (the site has over hundred pages).
Here is the Javascript:
// Render Sitemap to file
var RenderUrlsToFile, arrayOfUrls, system;
system = require("system");
/*
Render given urls
#param array of URLs to render
#param callbackPerUrl Function called after finishing each URL, including the last URL
#param callbackFinal Function called after finishing everything
*/
/**
 * Left-pads a URL index with zeros to three digits for use in file names.
 * Values below 100 come back as strings ("007", "042"); anything else is
 * returned unchanged.
 */
var getFileNumber = function(urlIndex) {
    var padding = urlIndex < 10 ? "00" : (urlIndex < 100 ? "0" : null);
    if (padding === null) {
        return urlIndex;
    }
    return padding + urlIndex;
};
/**
 * Renders each URL in `urls` to "tempPdfs/rendermulti-NNN.pdf", one after
 * another, consuming the array with shift().
 *
 * @param urls            array of URLs to render (emptied as it is processed)
 * @param callbackPerUrl  called as (status, url, file) after each URL
 * @param callbackFinal   called once no URLs are left
 */
RenderUrlsToFile = function(urls, callbackPerUrl, callbackFinal) {
var getFilename, next, page, retrieve, urlIndex, webpage;
urlIndex = 0;
webpage = require("webpage");
page = null;
// File name for the URL currently being processed (urlIndex is 1-based by then).
getFilename = function() {
return "rendermulti-" + getFileNumber(urlIndex) + ".pdf";
};
// Closes the current page, reports the result, then moves on to the next URL.
next = function(status, url, file) {
page.close();
callbackPerUrl(status, url, file);
return retrieve();
};
// Opens the next URL in a fresh page, or finishes when the list is empty.
retrieve = function() {
var url;
if (urls.length > 0) {
url = urls.shift();
urlIndex++;
page = webpage.create();
page.viewportSize = {
width: 1920,
height: 1880
};
page.settings.userAgent = "Phantom.js bot";
return page.open(url, function(status) {
var file;
file = getFilename();
if (status === "success") {
// Wait 200 ms after load before touching the DOM and rendering.
return window.setTimeout((function() {
// !!!!!!!!!!!!! Doesn't work !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
page.evaluate(function() {
var x = document.getElementById("logoAndNavigation");
var newP = document.createElement("P")
// NOTE(review): `textnode` is a plain string built by concatenation, not a DOM
// node, and it is what gets passed to appendChild on the next line. The "/"
// added before pathname also looks redundant if pathname already starts with
// one - confirm.
var textnode = window.location.protocol + "//" + window.location.host + "/" + window.location.pathname;
newP.appendChild(textnode)
x.insertBefore(newP, x.childNodes[0]);
});
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
page.render("tempPdfs/" + file);
return next(status, url, file);
}), 200);
} else {
// Load failed: nothing is rendered, but the callback still sees the status.
return next(status, url, file);
}
});
} else {
return callbackFinal();
}
};
return retrieve();
};
// This makes an array with all the urls inside the sitemap
// Load the sitemap, collect every <loc> URL from it, and only then start
// rendering. Rendering is started from inside the open callback because the
// URL list is not populated until the sitemap has finished loading.
var arrayOfUrls = [];
var page = require('webpage').create();
page.open('http://localhost:3000/sitemap.xml', function(status) {
    if (status !== "success") {
        console.log("Unable to load the sitemap");
        page.close();
        phantom.exit(1);
        return;
    }
    var xmlDoc = new DOMParser().parseFromString(page.content, 'text/xml');
    var loc = xmlDoc.getElementsByTagName('loc');
    for (var i = 0; i < loc.length; i++) {
        arrayOfUrls.push(loc[i].textContent);
    }
    // The sitemap page is no longer needed once the URLs are extracted.
    page.close();

    RenderUrlsToFile(arrayOfUrls, (function(status, url, file) {
        if (status !== "success") {
            return console.log("Unable to render '" + url + "'");
        } else {
            return console.log("Rendered '" + url + "' at '" + file + "'");
        }
    }), function() {
        return phantom.exit();
    });
});
I tried to solve the issue with the urls, with the code framed with the comment
// !!!!!!!!!!!!! Doesn't work !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
I wanted to show the url inside an element of the page, that has the id #logoAndNavigation, but I get this error:
NOT_FOUND_ERR: DOM Exception 8: An attempt was made to reference a Node in a context where it does not exist.
If I use only a string like "hello" inside the variable textnode, it works, but not if I try to use the url of the page.
Can anyone please help me?
Thank you in advance!
appendChild expects a node not a string. You probably mean to use
var x = document.getElementById("logoAndNavigation");
var newP = document.createElement("p"); // small p
// location.pathname already begins with "/", so no extra separator is added.
var pageUrl = window.location.protocol + "//" + window.location.host + window.location.pathname;
// appendChild needs a Node: wrap the string in a text node. This also keeps
// the URL from being parsed as markup.
newP.appendChild(document.createTextNode(pageUrl));
x.insertBefore(newP, x.childNodes[0]);
You can also use the example of printheaderfooter.js to add the URL directly to the header or footer.

Overriding XMLHttpRequest's open/send method

I am trying to catch every XHR request sent from Gmail and add a classification label to the body of the mail. I am not sending new XHR requests; my code works as a filter for all XHR requests by overriding XMLHttpRequest's open/send functions. I have been working on this issue since last week. Please help me.
/**
 * Injects a wrapper around XMLHttpRequest.prototype.open/send into the page.
 *
 * The wrapper remembers the URL passed to open(). When a "send mail" request
 * (URL contains "&act=sm", body contains "&bcc=") carries a
 * "selectedLable=<url-encoded JSON with a `tag` field>&" parameter, that
 * parameter is removed, the tag is prepended to the subject and body, and
 * "&acn=!<Tag>" is appended. Every other request is passed through untouched.
 *
 * The code is injected through a <script> element rather than a
 * "javascript:" URL with newlines stripped: collapsing the function source
 * onto one line lets any `//` comment inside it swallow the rest of the code.
 */
function sendEmail() {
    var overrideMethods = function () {
        var proto = window.XMLHttpRequest.prototype;
        /* Patching twice would make _open/_send point at the wrappers themselves
           and recurse without end. */
        if (proto._open || proto._send) {
            return;
        }

        proto._open = proto.open;
        proto.open = function (method, url, async, user, password) {
            this.openParams = {
                url: url
            };
            return proto._open.apply(this, arguments);
        };

        proto._send = proto.send;
        proto.send = function send() {
            var searchPattern = /(&selectedLable=|^selectedLable=)(.*?)&/;
            var args = Array.prototype.slice.call(arguments);
            var str = args[0];
            /* openParams is missing when open() ran before the patch was installed. */
            var url = (this.openParams && this.openParams.url) || "";

            if (args.length === 1 && typeof str === "string" &&
                    url.match(/&act\=sm/) && str.match(/&bcc\=/)) {
                var found = str.match(searchPattern);
                var tag = null;
                if (found) {
                    try {
                        tag = JSON.parse(decodeURIComponent(found[2])).tag;
                    } catch (e) {
                        /* Unreadable label: send the request unmodified instead of dropping it. */
                        tag = null;
                    }
                }
                if (typeof tag === "string" && tag.length > 0) {
                    /* Remove the label parameter but keep one "&" so the parameters on
                       either side of it are not fused together. */
                    str = str.replace(searchPattern, function (match, prefix) {
                        return prefix.charAt(0) === "&" ? "&" : "";
                    });
                    /* Modify the POST data to reflect the tag. Function replacers keep
                       "$" sequences in the tag from being interpreted. */
                    str = str.replace(/&subject=/, function () {
                        return "&subject=" + tag + ": ";
                    });
                    str = str.replace(/&body\=/, function () {
                        return "&body=<br>" + tag.toLowerCase() + "<br>";
                    });
                    /* Capitalize the tag. */
                    args[0] = str + "&acn=!" + tag.charAt(0).toUpperCase() + tag.slice(1).toLowerCase();
                }
            }
            return proto._send.apply(this, args);
        };
    };

    // Run the override in the page's own JavaScript context.
    var script = document.createElement('script');
    script.textContent = '(' + overrideMethods.toString() + ')();';
    var root = document.documentElement;
    root.appendChild(script);
    root.removeChild(script);
}
sendEmail();
I found this link on stack overflow :
Overriding XMLHttpRequest's send method
Please note my method is working fine for native chrome extension, but its not working for crossrider extension.

how to scrape links with phantomjs

Can PhantomJS be used as an alternative to BeautifulSoup?
I am trying to search on Etsy and visit all the links in term. In Python, I know how to do this (with BeautifulSoup) but today I want to see if I can do the same with PhantomJS. I'm not getting very far.
This script should search "hello kitty" on Etsy and return all of the products
<a class="listing-thumb" href=...></a> and print them in the console. Ideally I'd visit them later on and get the information I need. Right now it just freezes. Any ideas?
// Opens the Etsy search results for "hello kitty" and tries to print the href
// of every a.listing-thumb link, then exits PhantomJS.
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';
page.open(url, function(status){
// list all the a.href links in the hello kitty etsy page
// NOTE(review): the evaluate callback returns the NodeList itself. Presumably
// DOM nodes cannot be passed back out of page.evaluate as-is - confirm against
// the PhantomJS page.evaluate documentation. `status` is not checked either.
var link = page.evaluate(function() {
return document.querySelectorAll('a.listing-thumb');
});
for(var i = 0; i < link.length; i++){ console.log(link[i].href); }
phantom.exit();
});
I have toyed with using CasperJS, which may be better designed for this.
PhantomJS evaluate() cannot serialize and return complex objects like HTMLElements or NodeLists, so you have to map them to serializable things before:
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';

// Runs inside the loaded page: returns the href attribute of every
// a.listing-thumb element as a plain array of strings.
function collectListingHrefs() {
    var anchors = document.querySelectorAll('a.listing-thumb');
    return [].map.call(anchors, function(anchor) {
        return anchor.getAttribute('href');
    });
}

// Prints one href per line once the page has been opened, then exits.
function onPageOpened(status) {
    // list all the a.href links in the hello kitty etsy page
    var hrefs = page.evaluate(collectListingHrefs);
    console.log(hrefs.join('\n'));
    phantom.exit();
}

page.open(url, onPageOpened);
Note: here we use [].map.call() in order to treat a NodeList as a standard Array.
The only problem with your code is that you do not understand phantomjs scopes. You have phantom and page scopes. You tried to return JavaScript DOM object references (those can't be serialized) from page scope (page.evaluate runs in page scope) to phantom main scope. I think that is not possible. Here follows code that works:
var page = require('webpage').create();
var url = 'http://www.etsy.com/search?q=hello%20kitty';

// for debug (to see if page returns status code 200)
page.onResourceReceived = function(response) {
    if (response.url !== url) {
        return;
    }
    console.log('Resource: "' + response.url + '" status: ' + response.status);
    if (response.status === 200) {
        console.log(response.url);
        for (var i = 0; i < response.headers.length; i++) {
            console.log(response.headers[i].name + ': ' + response.headers[i].value);
        }
    }
};

// Once the page has loaded, collect the href of every a.listing-thumb link
// inside the page and print them from the PhantomJS side.
page.onLoadFinished = function(status){
    console.log('Status: ' + status);
    if (status !== 'success') {
        // Nothing to scrape on a failed load.
        console.log('Unable to load ' + url);
        phantom.exit(1);
        return;
    }
    console.log('Starting evaluate...');
    // Only plain values (here: strings) are returned from the page context.
    // Fall back to an empty list so the count below never reads .length of null.
    var links = page.evaluate(function() {
        var nodes = [],
            matches = document.querySelectorAll("a.listing-thumb");
        for(var i = 0; i < matches.length; ++i) {
            nodes.push(matches[i].href);
        }
        return nodes;
    }) || [];
    console.log('Done evaluate... count: ' + links.length);
    if (links.length > 0) {
        for(var i = 0; i < links.length; ++i) {
            console.log('(' + i + ') ' + links[i]);
        }
    } else {
        console.log("No match found!");
    }
    phantom.exit(0);
};
page.open(url);
Here is some code I recently wrote that scrapes urls using PhantomJs, if you provide only a URL it will display all URLS's on the page, if you supply an argument of class|id followed by a "class/id name" it will display the urls of the class/id only.
//////////////////////////////////////////////////////////
///// PhantomJS URL Scraper v.1.3 /////
//
// Copyrighted by +A.M.Danischewski 2016+ (c)
// This program may be reutilized without limits, provided this
// notice remain intact.
//
// Usage: phantomjs phantom_urls.js <URL> [["class"|"id"] [<query id/class name>]]
//
// Argument 1: URL -- "https://www.youtube.com/watch?v=8TniRMwL2Vg"
// Argument 2: "class" or "id"
// Argument 3: If Argument 2 was provided, "class name" or "id name"
//
// By default this program will display ALL urls from a user supplied URL.
// If a class name or id name is provided then only URL's from the class
// or id are displayed.
//
///////////////////////////////////
var page = require('webpage').create(),
    system = require('system'),
    address = system.args[1],
    querytype = system.args[2],
    queryclass = system.args[3];

/**
 * Runs inside the loaded page. Returns the href attribute of every <a>:
 *  - scopeType "class": anchors inside elements with class `scopeName`;
 *  - scopeType "id":    anchors inside the element with id `scopeName`;
 *  - anything else:     every anchor in the document.
 * The scope is received as arguments rather than spliced into source text,
 * so quotes in the name cannot break or alter the evaluated code.
 */
function collectHrefs(scopeType, scopeName) {
    var roots;
    if (scopeType === 'class') {
        roots = [].slice.call(document.getElementsByClassName(scopeName));
    } else if (scopeType === 'id') {
        var byId = document.getElementById(scopeName);
        roots = byId ? [byId] : [];
    } else {
        roots = [document];
    }
    var hrefs = [];
    roots.forEach(function(root) {
        [].forEach.call(root.querySelectorAll('a'), function(link) {
            hrefs.push(link.getAttribute('href'));
        });
    });
    return hrefs;
}

// Forward console output from the page to the PhantomJS console.
page.onConsoleMessage = function(msg) {
    console.log(msg);
};

if (system.args.length === 1) {
    console.log(' Usage: phantomjs phantom_urls.js <URL> [["class"|"id"] [<query id/class name>]]');
    phantom.exit();
} else {
    // phantom.exit() above does not stop the script immediately, so the page
    // is only opened in this branch.
    page.open(address, function(status) {
        if (status !== 'success') {
            console.log('Error loading address: '+address);
            phantom.exit(1);
            return;
        }
        var links = page.evaluate(collectHrefs, querytype, queryclass) || [];
        // One URL per line; joining directly keeps URLs that contain commas intact.
        console.log(links.join('\n'));
        phantom.exit();
    });
}

Categories

Resources