CasperJS - Scraper not navigating to the next page - javascript

The following code is a simple scraper written in CasperJS.
var casper = require('casper').create();
var url = casper.cli.get(0);
var page1 = casper.cli.get(1);
var page2 = casper.cli.get(2);
//console.log(page2);
var proxy = casper.cli.get(3);
//alert(page1);
var exp = /[-a-zA-Z0-9#:%_\+.~#?&//=]{2,256}\.[a-z]{2,4}\b(\/[-a-zA-Z0-9#:%_\+.~#?&//=]*)?/gi;
var regex = new RegExp(exp);
var baseUrl = url;
//console.log(baseUrl);
var nextBtn = "a.navigation-button.next";
var allLinks = [];
casper.start(baseUrl);
casper.waitForSelector(nextBtn, processPage);
casper.run();
function processPage() {
for (var i = page1; i < page2; i = i + 1) {
console.log(i);
var pageData = this.evaluate(getPageData);
allLinks = allLinks.concat(pageData);
if (!this.exists(nextBtn)) {
return;
};
this.thenClick(nextBtn).then(function() {
//this.echo(i);
this.echo(this.getCurrentUrl());
//this.wait(1000);
});
};
}
function getPageData(){
//return document.title;
var links = document.getElementsByClassName('pro-title');
links = Array.prototype.map.call(links,function(link){
return link.getAttribute('href');
});
return links;
};
casper.then(function(){
//require('utils').dump(allLinks);
this.each(allLinks,function(self,link){
if (link.match(regex)) {
self.thenOpen(link,function(a){
jsonObj = {};
jsonObj.title = this.fetchText('a.profile-full-name');
jsonObj.services = this.getHTML('div.info-list-text span:nth-child(2) span');
jsonObj.services = jsonObj.services.replace(/&/g,"and");
jsonObj.location = this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span');
//jsonObj.contact = this.fetchText('span.pro-contact-text');
jsonObj.description = this.getHTML('div.profile-about div:nth-child(1)');
//jsonObj.description.replace(/\s/g, '');
//require('utils').dump(jsonObj);
//jsonObj.description = jsonObj.description.replace(/[\t\n]/g,"");
//jsonObj = JSON.stringify(jsonObj, null, '\t');
//console.log(i);
require('utils').dump(jsonObj);
});
};
});
});
I am executing this script as follows,
casperjs scraping.js http://www.houzz.com/professionals/c/Chicago--IL/p/15 1 3
The first CLI argument is the starting URL. The second and third arguments are the starting and ending page numbers of the scrape.
I am able to extract data from the first page, but I don't understand why I am not able to extract data from any of the consequent pages.

You cannot mix synchronous and asynchronous code like this in processPage. The loop is immediately executed, but the click and the loading of the next page happens asynchronously. The evaluation of the page has to be done asynchronously:
function processPage() {
for (var i = page1; i < page2; i = i + 1) {
this.then(function(){
console.log(i);
var pageData = this.evaluate(getPageData);
allLinks = allLinks.concat(pageData);
if (!this.exists(nextBtn)) {
return;
}
this.thenClick(nextBtn).then(function() {
this.echo(this.getCurrentUrl());
});
});
};
}

Related

Check if series of images exists, stop when not found in Javascript

I'd like to show series of dynamically loaded images named in a structured way in my website. Images are located in a different domain which causes Same Origin Policy restriction. I cannot use $.ajax.
I build my url like with a counter such as www.example.com/Images/img-1.jpg, www.example.com/Images/img-2.jpg and it goes...
I've tried several other answers from similar posts but couldn't make them work. Either the loop runs forever or it never finds the images even though they exist in the path.
1st try:
ShowImages: function () {
var urlBase = 'http://www.example.com/Images/img-';
var i = 1;
while (true) {
var url = urlBase + i + '.jpg';
var imgDom = new Image();
imgDom.onload = function () {
alert('worked');
i++;
};
imgDom.onerror = function () {
return; // I want to either break the while loop, or return from ShowImages function
};
imgDom.src = url;
}
},
It never hits the .onerror.
2nd try:
ShowImages: function () {
var urlBase = 'http://www.example.com/Images/img-';
var i = 1;
while (true) {
var url = urlBase + i + '.jpg';
var imgDom = document.createElement("img");
$(imgDom).attr('src', url);
if (imgDom.complete) {
alert('worked');
i++;
} else {
break;
}
}
},
It never hits the .complete.
Trying with your first option (you basically need to chain them)
ShowImages(1);
ShowImages: function (counter) {
var urlBase = 'http://www.example.com/Images/img-';
var url = urlBase + counter + '.jpg';
var imgDom = new Image();
imgDom.onload = function () {
alert('worked');
ShowImages(counter+1);
};
imgDom.onerror = function () {
alert("all images done..");
return; // I want to either break the while loop, or return from ShowImages function
};
imgDom.src = url;
};

SlimerJS is not running my script

I am trying to run my CasperJS test using slimerJS, and it just do nothing, opened the FF browser and displayed the SlimerJS logo. If I am trying to run a simple script like loading of Google homepage or something it works fine. Attached the script below, can you please tell me what is wrong there? Thanks!
// Simple messaging flow
action_helpers_v2 = require('../v2_modules/action_helpers_v2.js');
helpers_v2 = require('../v2_modules/helpers_v2.js');
var staging = casper.cli.get('staging'),
unixTimeStamp = (Date.now()).toString().substring(4),
currentTime = new Date(),
currentHour = currentTime.getHours(),
timeout = 15000;
var genereateText = function genereateText() {
var text = "";
var possible = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
for( var i=0; i < 20; i++ ) {
text += possible.charAt(Math.floor(Math.random() * possible.length));
}
return text;
}
var mouse = require("mouse").create(casper);
var webPage = require('webpage');
var page = webPage.create();
var live = casper.cli.get("prod"),
staging = casper.cli.get("staging"),
localeTesting = casper.cli.get("localeTesting"),
buyerUserName = 'settings120',
buyerPassword = 123456,
orders_array = [],
numberOfTests = 0;
casper.test.begin("Test Name: simple messaging flow", function suite(test) {
casper.start('http://fiverr.com');
casper.then(function() {
if (staging) {
action_helpers_v2.enableStaging(staging);
}
});
casper.then(function() {
action_helpers_v2.navigateToHomepage();
});
casper.then(function() {
action_helpers_v2.login(buyerUserName,buyerPassword);
action_helpers_v2.navigateToConversation("yogev_a");
});
casper.wait(5000);
var message = genereateText();
casper.then(function() {
casper.evaluate(function(message) {
$('#message_body')[0].value = message;
}, message);
});
casper.then(function() {
casper.evaluate(function() {
$('.btn-send-message')[0].click()
});
});
casper.waitForText(message, function() {
casper.test.pass("Message was sent");
}, function() {
helpers_v2.printScreenAndMessage('message_error', 'message_error');
casper.test.fail("Message wasnt sent");
});
casper.run(function(){
test.done();
});
});

How to return array from JavaScript function that retrieves data from text file?

I am building a Windows 8 Store app with HTML/CSS/JavaScript. I am reading in data from a text file through a function, and then putting that data into an array. I am trying to return the array through the function, but it is not working. Any help would be greatly appreciated. I've attached my code snippet.
// Load user data
var DefineUserData = function LoadUserData() {
return Windows.Storage.ApplicationData.current.localFolder.getFileAsync(loadfile).done(function (UserFile) {
return Windows.Storage.FileIO.readTextAsync(UserFile).done(function (fileResult) {
var userdata = new Object();
var dataobject = {};
var innercount;
var outercount;
var fileResultByLines = fileResult.split("\n");
for (outercount = 0; outercount <= (fileResultByLines.length - 2) ; outercount++) {
var tempArray = fileResultByLines[outercount].split(",");
dataobject.metrictitle = tempArray[0];
dataobject.numinputs = tempArray[1];
dataobject.inputs = new Array();
for (innercount = 0; innercount <= parseInt(dataobject.numinputs) ; innercount++) {
dataobject.inputs[innercount] = tempArray[innercount + 2];
}
userdata[outercount] = dataobject;
}
return userdata;
});
},
function (errorResult) {
document.getElementById("resbutton1").innerText = errorResult;
})
}
Your DefineUserData function is returning a Promise, not a value. Additionally done functions don't return anything. Instead you'll need to use then functions instead of done functions in DefineUserData and then handle add a done function (or then) to the code that calls this function.
Also, You can make your promises easier to read, and easier to work with by chaining then functions instead of nesting them.
Currently on Win7 at the office so I can't test this, but try something similar to this pseudo-code. Note then functions instead of done. The last then returns your data. Sample snippet afterwards to illustrate calling this and handling the result.
// modified version of yours
var DefineUserData = function LoadUserData() {
return Windows.Storage.ApplicationData.current.localFolder
.getFileAsync(loadfile)
.then(function (UserFile) {
return Windows.Storage.FileIO.readTextAsync(UserFile);
}).then(function (fileResult) {
var userdata = new Object();
var dataobject = {};
var innercount;
var outercount;
var fileResultByLines = fileResult.split("\n");
for (outercount = 0; outercount <= (fileResultByLines.length - 2) ; outercount++) {
var tempArray = fileResultByLines[outercount].split(",");
dataobject.metrictitle = tempArray[0];
dataobject.numinputs = tempArray[1];
dataobject.inputs = new Array();
for (innercount = 0; innercount <= parseInt(dataobject.numinputs) ; innercount++) {
dataobject.inputs[innercount] = tempArray[innercount + 2];
}
userdata[outercount] = dataobject;
}
return userdata;
},
function (errorResult) {
document.getElementById("resbutton1").innerText = errorResult;
});
}
// some other code...
DefineUserData.done(function (userdata) {
// do something
});

omit certain pages from history cookie

I am using this script to store the user's history to a cookie for the last 10 pages accessed. So far I've got the script displaying the cookie data using the document.title, and url in a list.
My question is what would be the simplest way to add a page skip feature, that would let me omit certain pages from being added to the history cookie? Everything I've tried hasn't worked, as it's a little bit outside of my knowledge.
Thanks for your time and help.
JS:
(function($){
var history;
function getHistory() {
var tmp = $.cookie("history");
if (tmp===undefined || tmp===null) tmp = "";
if ($.trim(tmp)=="") tmp = [];
else tmp = tmp.split("||");
history = [];
$.each(tmp, function(){
var split = this.split("|");
history.push({
title: split[0],
url: split[1]
});
});
}
function saveHistory() {
var tmp = [];
$.each(history, function(){
tmp.push(this.title+"|"+this.url);
});
$.cookie("history",tmp.join("||"),{ expires: 60, path: "/" });
}
function addToHistory(title,url) {
var newHistory = []
$.each(history, function(){
if (this.url!=url) newHistory.push(this);
});
history = newHistory;
if (history.length>=10) {
history.shift();
}
history.push({
title: title,
url: url
});
saveHistory();
writeHistory();
}
function writeHistory() {
var list = $("<ul />");
$.each(history, function() {
var element = $("<li />");
var link = $("<a />");
link.attr("href",this.url);
link.text(this.title);
element.append(link);
list.append(element);
});
$("#history").empty().append(list);
}
$(document).ready(function(){
getHistory();
var url = document.location.href;
var split = url.split("#");
var title;
if (split.length > 1) {
title = $("#"+split[1]).text();
} else {
title = document.title;
}
if (title===undefined || title===null || $.trim(title)=="") title = url;
addToHistory(title,url);
url = split[0];
$("a[href^='#']").click(function(){
var link = $(this);
var href = link.attr("href");
var linkUrl = url+href;
var title = $(href).text();
if (title===undefined || title===null || $.trim(title)==="") title = linkUrl;
addToHistory(title,linkUrl);
});
});
})(jQuery);
HTML:
<div id="history"></div>
several ways you could approach this... You could keep an Array of urls not to save, or you could put something in the page that would let the script know not to save that page?...
function saveHistory(){
if ($('.no-save-history')) return false;
/*...*/
}
HTML:
< div id="history" class="no-save-history">

How do get param from a url

As seen below I'm trying to get #currentpage to pass client params
Can someone help out thanks.
$(document).ready(function() {
window.addEventListener("load", windowLoaded, false);
function windowLoaded() {
chrome.tabs.getSelected(null, function(tab) {
document.getElementById('currentpage').innerHTML = tab.url;
});
}
var url = $("currentpage");
// yes I relize this is the part not working.
var client = jQuery.param("currentpage");
var page = jQuery.param("currentpage");
var devurl = "http://#/?clientsNumber=" + client + "&pageName=" + page ;
});
This is a method to extract the params from a url
function getUrlParams(url) {
var paramMap = {};
var questionMark = url.indexOf('?');
if (questionMark == -1) {
return paramMap;
}
var parts = url.substring(questionMark + 1).split("&");
for (var i = 0; i < parts.length; i ++) {
var component = parts[i].split("=");
paramMap [decodeURIComponent(component[0])] = decodeURIComponent(component[1]);
}
return paramMap;
}
Here's how to use it in your code
var url = "?c=231171&p=home";
var params = getUrlParams(url);
var devurl = "http://site.com/?c=" + encodeURIComponent(params.c) + "&p=" + encodeURIComponent(params.p) + "&genphase2=true";
// devurl == "http://site.com/?c=231171&p=home&genphase2=true"
See it in action http://jsfiddle.net/mendesjuan/TCpsD/
Here's the code you posted with minimal changes to get it working, it also uses $.param as it's intended, that is to create a query string from a JS object, this works well since my suggested function returns an object from the url
$(document).ready(function() {
// This does not handle arrays because it's not part of the official specs
// PHP and some other server side languages support it but there's no official
// consensus
function getUrlParams(url) {
var paramMap = {};
var questionMark = url.indexOf('?');
if (questionMark == -1) {
return paramMap;
}
var parts = url.substring(questionMark + 1).split("&");
for (var i = 0; i < parts.length; i ++) {
var component = parts[i].split("=");
paramMap [decodeURIComponent(component[0])] = decodeURIComponent(component[1]);
}
return paramMap;
}
// no need for the extra load listener here, jquery.ready already puts
// your code in the onload
chrome.tabs.getSelected(null, function(tab) {
document.getElementById('currentpage').innerHTML = tab.url;
});
var url = $("currentpage");
var paramMap = getUrlParams(url);
// Add the genphase parameter to the param map
paramMap.genphase2 = true;
// Use jQuery.param to create the url to click on
var devurl = "http://site.com/?"+ jQuery.param(paramMap);
$('#mydev').click( function(){
window.open(devurl);
});
});

Categories

Resources