not able to scrape anything from lenskart website - javascript

I'm trying to scrape some product information from the site http://www.lenskart.com/eyeglasses/frame-style/half-rim.html using phantomjs. Below is the script I have used:
var page = require('webpage').create();
page.onConsoleMessage = function(msg){
console.log(msg);
};
page.open('http://www.lenskart.com/eyeglasses/frame-style/half-rim.html', function () {
console.log('1234');
// Checks for bottom div and scrolls down from time to time
window.setInterval(function() {
console.log('ddddddddddd');
// Checks if there is a div with class=".has-more-items"
// (not sure if this is the best way of doing it)
var count = page.evaluate(function() {
try{
return document.getElementsByClassName('top-to-bottom')[0].style.display;
}catch(e){return e.message;}
});
//count = 1
console.log(count);
console.log('1111 '+count);
if(count === 'block' || count ==='inline' ) { // Didn't find
console.log('count '+count);
//this.echo('wdc');
page.evaluate(function() {
// Scrolls to the bottom of page
window.document.body.scrollTop = document.body.scrollHeight;
});
}
else
{ // Found
//Do what you want
console.log('len123');
try {
var links = page.evaluate(function() {
return [].map.call(document.querySelectorAll('div.product-view-region a.product-image'), function(link) {
//console.log(link.getAttribute('href'));
return link.getAttribute('href');
});
});
} catch (e) {
console.log(e.message); return [];
}
//console.log(links.join(','));
//console.log(links);
phantom.exit();
}
}, 1000); // Number o ms to wait between scrolls
});
But I am continuosly getting this error:
TypeError: 'undefined' is not a function (evaluating 'this.each(function(value,i
ndex){results.push(iterator.call(context,value,index));})')
http://d37l6i9uhpr090.cloudfront.net/js/top_common.js:809 in collect
phantomjs://webpage.evaluate():6
phantomjs://webpage.evaluate():7
phantomjs://webpage.evaluate():7
null
I also tried to scrape some other information but got the same result. I don't understand where am I going wrong!

Related

CasperJS: WaitFor timeout function to do rescroll?

I met some problem when I use CasperJS to scrape a website. The website is dynamically loaded like Twitter, so I want to do infinite scroll,
and thanks to #Artjom B. I found you code to do this.
var tryAndScroll = function (casper) {
try {
casper.echo('SCROLL!!');
casper.scrollToBottom();
if (casper.exists('div.loading')) {
var curItems = casper.evaluate(getCurrentInfosNum);
casper.echo(curItems);
casper.waitFor(function check() {
return curItems != casper.evaluate(getCurrentInfosNum);
}, function then() {
casper.wait(800);
tryAndScroll(casper);
}, function onTimeout() {
casper.emit('scroll.timeout',curItems);
}, 15000);
} else {
casper.echo("No more items");
return true;
}
} catch (err) {
casper.echo(err);
}
} //casper.tryAndScroll
And now, I want to continue to scroll many times when the timeout function invoked so I create my own event listener,‘scroll.timeout’.
var SRCOLL_NUM = 0;
var PreOfLoaded = 0;
casper.on('scroll.timeout', function (NumOfLoaded) {
if (SRCOLL_NUM <= 4) {
if (PreOfLoaded == NumOfLoaded)
SRCOLL_NUM++;
this.echo("Scroll Timeout,reScroll");
PreOfLoaded = NumOfLoaded;
tryAndScroll(casper);
} else {
this.echo("Scroll Timeout,reScroll times maximum");
SRCOLL_NUM = 0;
PreOfLoaded = 0;
}
});
However, when scroll timeout occurred, it printed Scroll Timeout,reScroll on the console. Then it skips tryAndScroll() and go to the next step in the main function. I want to continue to next step after retry scroll many times. What should I do?
I found CasperJS author illustrate :Automatic retry when open fails
var casper = require('casper').create();
casper.tryOpen = function(url, then) {
return this.then(function() {
this.open(url);
this.waitFor(function testStatus() {
return this.getCurrentHTTPStatus === 200;
}, then, function onFail() {
console.log('failed, retrying');
this.tryOpen(url);
}, 2000);
});
};
casper.start().tryOpen('http://failing.url.com/foo.bar', function() {
this.echo('wow, it worked, wtf');
}).run();
unfortunately, it doesn't work for me.
Try this
return this.currentHTTPStatus === 200;
I tested with the newest version of casperjs 1.1.1, it's working fine

Scraping an infinite scroll page stops without scrolling

I am currently working with PhantomJS and CasperJS to scrape for links in a website. The site uses javascript to dynamically load results. The below snippet however is not getting me all the results the page contains. What I need is to scroll down to the bottom of the page, see if the spinner shows up (meaning there’s more content still to come), wait until the new content had loaded and then keep scrolling until no more new content was shown. Then store the links with class name .title in an array. Link to the webpage for scraping.
var casper = require('casper').create();
var urls = [];
function tryAndScroll(casper) {
casper.waitFor(function() {
this.page.scrollPosition = { top: this.page.scrollPosition["top"] + 4000, left: 0 };
return true;
}, function() {
var info = this.getElementInfo('.badge-post-grid-load-more');
if (info["visible"] == true) {
this.waitWhileVisible('.badge-post-grid-load-more', function () {
this.emit('results.loaded');
}, function () {
this.echo('next results not loaded');
}, 5000);
}
}, function() {
this.echo("Scrolling failed. Sorry.").exit();
}, 500);
}
casper.on('results.loaded', function () {
tryAndScroll(this);
});
casper.start('http://example.com/', function() {
this.waitUntilVisible('.title', function() {
tryAndScroll(this);
});
});
casper.then(function() {
casper.each(this.getElementsInfo('.title'), function(casper, element, j) {
var url = element["attributes"]["href"];
urls.push(url);
});
});
casper.run(function() {
this.echo(urls.length + ' links found:');
this.echo(urls.join('\n')).exit();
});
I've looked at the page. Your misconception is probably that you think the .badge-post-grid-load-more element vanishes as soon as the next elements are loaded. This is not the case. It doesn't change at all. You have to find another way to test whether new elements were put into the DOM.
You could for example retrieve the current number of elements and use waitFor to detect when the number changes.
function getNumberOfItems(casper) {
return casper.getElementsInfo(".listview .badge-grid-item").length;
}
function tryAndScroll(casper) {
casper.page.scrollPosition = { top: casper.page.scrollPosition["top"] + 4000, left: 0 };
var info = casper.getElementInfo('.badge-post-grid-load-more');
if (info.visible) {
var curItems = getNumberOfItems(casper);
casper.waitFor(function check(){
return curItems != getNumberOfItems(casper);
}, function then(){
tryAndScroll(this);
}, function onTimeout(){
this.echo("Timout reached");
}, 20000);
} else {
casper.echo("no more items");
}
}
I've also streamlined tryAndScroll a little. There were completely unnecessary functions: the first casper.waitFor wasn't waiting at all and because of that the onTimeout callback could never be invoked.

Scrape links, store in array and then run another process in CasperJS

I currently have two CasperJS scripts that I want to combine into one for usability purposes. test1.js scrapes a webpage for links ( <a> elements). All the resulting links scraped are stored in an array urls. Script test2.js takes a link and extracts youtube src link if present from iframe.
How can I gather all links (test1.js) and then visit each link to extract a youtube link (test2.js), finally store YouTube links in array and display result?
test1.js
var urls = [];
var casper = require('casper').create();
function getNumberOfItems(casper) {
return casper.getElementsInfo(".listview .badge-grid-item").length;
}
function tryAndScroll(casper) {
casper.page.scrollPosition = { top: casper.page.scrollPosition["top"] + 4000, left: 0 };
var info = casper.getElementInfo('.badge-post-grid-load-more');
if (info.visible) {
var curItems = getNumberOfItems(casper);
if( curItems <= 60 ) {
casper.waitFor(function check(){
return curItems != getNumberOfItems(casper);
}, function then(){
tryAndScroll(this);
}, function onTimeout(){
this.echo("Timout reached");
}, 20000);
}
} else {
casper.echo("no more items");
}
}
casper.start('http://example.com', function() {
tryAndScroll(this);
});
casper.then(function() {
casper.each(this.getElementsInfo('.title'), function(casper, element, j) {
var url = element["attributes"]["href"];
urls.push(url);
});
});
casper.run(function() {
this.echo(urls.join('\n')).exit();
this.echo(urls.length + ' links found');
});
test2.js (Currently only takes one url)
var casper = require('casper').create();
var yt_links = [];
casper.start('http://example.com', function() {
this.click('.responsivewrapper');
});
casper.then(function() {
casper.each(this.getElementsInfo('.badge-youtube-player'), function(casper, element, j) {
var url = element["attributes"]["src"];
yt_links.push(url);
});
});
casper.run(function() {
this.echo(yt_links.join('\n')).exit();
this.echo(yt_links.length + ' link(s) found');
});
start and run functions of CasperJS can only be used once, but there is also the thenOpen function to open a URL in a step. All then* and wait* functions are step functions. By calling them, you essentially schedule the steps that those functions represent. Furthermore, you can nest CasperJS steps. So that the steps further down in the script, but higher up in the tree will only be executed when all the nested steps are finished.
// last step of test1.js
casper.then(function() {
this.getElementsInfo('.title').forEach(function(element) {
// skip elements that don't have a href attribute...
if (!element.attributes.href) {
return;
}
// here come the contents of test2.js
casper.thenOpen(element.attributes.href, function() {
this.click('.responsivewrapper');
}).then(function(){
...
}).then(function(){
this.echo(yt_links.join('\n')).exit();
this.echo(yt_links.length + ' link(s) found');
});
});
});
I used the builder/promise pattern to make the code example a little shorter.

ngInfiniteScroll "load next page" function being called repeatedly

Using the "loading remote data" example from the ngInfiniteScroll website I have tried to implement infinite scrolling. My issue;
The function nextPage() gets called continuously until there are no more records left to load (controlled by an offset value in the SQL query).
I'd appreciate any input on this as I'm rather lost.
Thanks in advance.
HTML
<tbody>
<div id="infinite_scroll" infinite-scroll='visits.nextPage()' infinite-scroll-disabled='visits.busy' infinite-scroll-distance='1'>
<tr ng-repeat="visit in visits.items" ng-class="{active: visit.uuid == data.detailItem.uuid}" ng-click="openDetailItem(visit.uuid)">
<td>{{visit.details.name}}</td>
<td>{{visit.created_at}}</td>
</tr>
</div>
</tbody>
Javascript - AngularJs Factory
angular.module('app').factory('Visits', function(VisitResource) {
// new visits object
var Visits = function () {
this.items = [];
this.busy = false;
this.offset = 0;
};
Visits.prototype.nextPage = function () {
// busy - stop
if (this.busy == true) {
// DEBUG
console.log('busy test 1.1: ' + this.busy);
return;
} else {
// DEBUG
console.log('busy test 1.2: ' + this.busy);
}
// busy now
this.busy = true;
VisitResource.getVisitations({
limit: 500,
offset: this.offset
}, function (response) {
// stop loading if no data returned
if(response.data.length == 0) {
// DEBUG
console.log('busy test 2: ' + this.busy);
return;
} else {
// DEBUG
console.log('Payload: ' + response.data.length);
}
var _this = this;
angular.forEach(response.data, function (a_visit) {
_this.items.push(a_visit);
});
// set the last acquired record value
this.offset = this.items[this.items.length - 1].id;
// not busy
this.busy = false;
}.bind(this));
};
return Visits;
});
As it turns out you can't get vanilla nginfinitescroll to trigger when the container is scrolled as nginfinitescroll is looking at the height of the window.
Here is a link to the answer on SO:
angularjs infinite scroll in a container

Navigating to Blacklisted URL's and Canceling Them

I need to write a Firefox extension that creates a blacklist and whitelist of URL's, and checks to make sure the user wants to navigate to them whenever the user attempts to do so. I'm doing this using a main script and a content script that I attach to every page (using PageMod); I attached a listener using jQuery to every link (with the tag "a") which executes a function using window.onbeforeunload. I have two questions:
How would I prompt/ask the user if they actually did want to go to the site?
How would I stop the browser from navigating to the site if the user decided not to?
Right now my code passes messages between the two scripts in order to accomplish my goal; as far as I can tell, I can only use "document" in the content script, and save the blacklist/whitelist in the main script. I'm using simple-storage to save my lists, and the port module to pass messages between the scripts.
For question 1, I've attempted using confirm(message) to get a positive/negative response from the user, but the popup either doesn't show up or shows up for a split second then gets automatically answered with a negative response. When I look in my console's error messages, I see a "prompt aborted by user" error.
For question 2, I've already tried using event.preventDefault() by passing the click event to the function (this worked, I think). Is there a better way to do this? I've seen people using window.location = "", et cetera to do this.
Anyways, the code is below:
MAIN.JS
var ss = require("sdk/simple-storage");
exports.main = function() {
if (!ss.storage.blacklist) {
ss.storage.blacklist = [];}
if (!ss.storage.whitelist) {
ss.storage.whitelist = [];}
var data = require("sdk/self").data;
var pageMod = require("sdk/page-mod");
pageMod.PageMod({
include: "*",
contentScriptFile: [data.url("jquery-1.10.2.min.js"),data.url("secChk.js")],
onAttach: function(worker) {
function whiteCNTD(str) {
for (var index = 0; index < ss.storage.whitelist.length; index++) {
if (ss.storage.whitelist[index] == str) {
return index;
}
}
return -1;
}
function blackCNTD(str) {
for (var index = 0; index < ss.storage.blacklist.length; index++) {
if (ss.storage.blacklist[index] == str) {
return index;
}
}
return -1;
}
function checkLists(URL) {
if (whiteCNTD(URL) == -1) {
if (blackCNTD(URL) != -1) {
var bool = false;
worker.port.emit("navq", "Do you want to go to this link and add it to the whitelist?");
worker.port.on("yes", function() {
bool = true;
});
worker.port.on("no", function() {
bool = false;
});
if (bool == true) {
ss.storage.blacklist.splice(index, 1);
ss.storage.whitelist.push(URL);
return true;
}
else {
return false;
}
}
else {
var bool = false;
worker.port.emit("safeq", "Is this a safe site?");
worker.port.on("yes", function() {
bool = true;
});
worker.port.on("no", function() {
bool = false;
});
if (bool == true) {
ss.storage.whitelist.push(URL);
return true;
}
else {
ss.storage.blacklist.push(URL);
return false;
}
}
}
return true;
}
worker.port.on("newURL", function(URL) {
var s = "";
s = URL;
if (checkLists(s)) {
worker.port.emit("good", s);
} else if (!checkLists(s)) {
worker.port.emit("bad", s);
}
});
}
});
}
SECCHK.JS
//Check if the site is a bad site whenever a link is clicked
$("a").click(function(event) {
window.onbeforeunload = function() {
self.port.on("navq", function(message) {
var r = confirm("Do you want to go to this link and add it to the whitelist?");
if (r == true) {
self.port.emit("yes", message);
} else if (r == false) {
self.port.emit("no", message);
}
});
self.port.on("safeq", function(message) {
var r = confirm("Is this a safe site?");
if (r == true) {
self.port.emit("yes", temp);
} else if (r == false) {
self.port.emit("no", temp);
}
});
link = document.activeElement.href;
self.port.emit("newURL", link);
self.port.on("good", function(message) {
return true;
});
self.port.on("bad", function(message) {
return false;
});
}
});

Categories

Resources