Scraping an infinite scroll page stops without scrolling - javascript

I am currently working with PhantomJS and CasperJS to scrape for links in a website. The site uses javascript to dynamically load results. The below snippet however is not getting me all the results the page contains. What I need is to scroll down to the bottom of the page, see if the spinner shows up (meaning there’s more content still to come), wait until the new content had loaded and then keep scrolling until no more new content was shown. Then store the links with class name .title in an array. Link to the webpage for scraping.
var casper = require('casper').create();
var urls = [];
function tryAndScroll(casper) {
casper.waitFor(function() {
this.page.scrollPosition = { top: this.page.scrollPosition["top"] + 4000, left: 0 };
return true;
}, function() {
var info = this.getElementInfo('.badge-post-grid-load-more');
if (info["visible"] == true) {
this.waitWhileVisible('.badge-post-grid-load-more', function () {
this.emit('results.loaded');
}, function () {
this.echo('next results not loaded');
}, 5000);
}
}, function() {
this.echo("Scrolling failed. Sorry.").exit();
}, 500);
}
casper.on('results.loaded', function () {
tryAndScroll(this);
});
casper.start('http://example.com/', function() {
this.waitUntilVisible('.title', function() {
tryAndScroll(this);
});
});
casper.then(function() {
casper.each(this.getElementsInfo('.title'), function(casper, element, j) {
var url = element["attributes"]["href"];
urls.push(url);
});
});
casper.run(function() {
this.echo(urls.length + ' links found:');
this.echo(urls.join('\n')).exit();
});

I've looked at the page. Your misconception is probably that you think the .badge-post-grid-load-more element vanishes as soon as the next elements are loaded. This is not the case. It doesn't change at all. You have to find another way to test whether new elements were put into the DOM.
You could for example retrieve the current number of elements and use waitFor to detect when the number changes.
function getNumberOfItems(casper) {
return casper.getElementsInfo(".listview .badge-grid-item").length;
}
function tryAndScroll(casper) {
casper.page.scrollPosition = { top: casper.page.scrollPosition["top"] + 4000, left: 0 };
var info = casper.getElementInfo('.badge-post-grid-load-more');
if (info.visible) {
var curItems = getNumberOfItems(casper);
casper.waitFor(function check(){
return curItems != getNumberOfItems(casper);
}, function then(){
tryAndScroll(this);
}, function onTimeout(){
this.echo("Timout reached");
}, 20000);
} else {
casper.echo("no more items");
}
}
I've also streamlined tryAndScroll a little. There were completely unnecessary functions: the first casper.waitFor wasn't waiting at all and because of that the onTimeout callback could never be invoked.

Related

CasperJS: WaitFor timeout function to do rescroll?

I met some problem when I use CasperJS to scrape a website. The website is dynamically loaded like Twitter, so I want to do infinite scroll,
and thanks to #Artjom B. I found you code to do this.
var tryAndScroll = function (casper) {
try {
casper.echo('SCROLL!!');
casper.scrollToBottom();
if (casper.exists('div.loading')) {
var curItems = casper.evaluate(getCurrentInfosNum);
casper.echo(curItems);
casper.waitFor(function check() {
return curItems != casper.evaluate(getCurrentInfosNum);
}, function then() {
casper.wait(800);
tryAndScroll(casper);
}, function onTimeout() {
casper.emit('scroll.timeout',curItems);
}, 15000);
} else {
casper.echo("No more items");
return true;
}
} catch (err) {
casper.echo(err);
}
} //casper.tryAndScroll
And now, I want to continue to scroll many times when the timeout function invoked so I create my own event listener,‘scroll.timeout’.
var SRCOLL_NUM = 0;
var PreOfLoaded = 0;
casper.on('scroll.timeout', function (NumOfLoaded) {
if (SRCOLL_NUM <= 4) {
if (PreOfLoaded == NumOfLoaded)
SRCOLL_NUM++;
this.echo("Scroll Timeout,reScroll");
PreOfLoaded = NumOfLoaded;
tryAndScroll(casper);
} else {
this.echo("Scroll Timeout,reScroll times maximum");
SRCOLL_NUM = 0;
PreOfLoaded = 0;
}
});
However, when scroll timeout occurred, it printed Scroll Timeout,reScroll on the console. Then it skips tryAndScroll() and go to the next step in the main function. I want to continue to next step after retry scroll many times. What should I do?
I found CasperJS author illustrate :Automatic retry when open fails
var casper = require('casper').create();
casper.tryOpen = function(url, then) {
return this.then(function() {
this.open(url);
this.waitFor(function testStatus() {
return this.getCurrentHTTPStatus === 200;
}, then, function onFail() {
console.log('failed, retrying');
this.tryOpen(url);
}, 2000);
});
};
casper.start().tryOpen('http://failing.url.com/foo.bar', function() {
this.echo('wow, it worked, wtf');
}).run();
unfortunately, it doesn't work for me.
Try this
return this.currentHTTPStatus === 200;
I tested with the newest version of casperjs 1.1.1, it's working fine

Scrape links, store in array and then run another process in CasperJS

I currently have two CasperJS scripts that I want to combine into one for usability purposes. test1.js scrapes a webpage for links ( <a> elements). All the resulting links scraped are stored in an array urls. Script test2.js takes a link and extracts youtube src link if present from iframe.
How can I gather all links (test1.js) and then visit each link to extract a youtube link (test2.js), finally store YouTube links in array and display result?
test1.js
var urls = [];
var casper = require('casper').create();
function getNumberOfItems(casper) {
return casper.getElementsInfo(".listview .badge-grid-item").length;
}
function tryAndScroll(casper) {
casper.page.scrollPosition = { top: casper.page.scrollPosition["top"] + 4000, left: 0 };
var info = casper.getElementInfo('.badge-post-grid-load-more');
if (info.visible) {
var curItems = getNumberOfItems(casper);
if( curItems <= 60 ) {
casper.waitFor(function check(){
return curItems != getNumberOfItems(casper);
}, function then(){
tryAndScroll(this);
}, function onTimeout(){
this.echo("Timout reached");
}, 20000);
}
} else {
casper.echo("no more items");
}
}
casper.start('http://example.com', function() {
tryAndScroll(this);
});
casper.then(function() {
casper.each(this.getElementsInfo('.title'), function(casper, element, j) {
var url = element["attributes"]["href"];
urls.push(url);
});
});
casper.run(function() {
this.echo(urls.join('\n')).exit();
this.echo(urls.length + ' links found');
});
test2.js (Currently only takes one url)
var casper = require('casper').create();
var yt_links = [];
casper.start('http://example.com', function() {
this.click('.responsivewrapper');
});
casper.then(function() {
casper.each(this.getElementsInfo('.badge-youtube-player'), function(casper, element, j) {
var url = element["attributes"]["src"];
yt_links.push(url);
});
});
casper.run(function() {
this.echo(yt_links.join('\n')).exit();
this.echo(yt_links.length + ' link(s) found');
});
start and run functions of CasperJS can only be used once, but there is also the thenOpen function to open a URL in a step. All then* and wait* functions are step functions. By calling them, you essentially schedule the steps that those functions represent. Furthermore, you can nest CasperJS steps. So that the steps further down in the script, but higher up in the tree will only be executed when all the nested steps are finished.
// last step of test1.js
casper.then(function() {
this.getElementsInfo('.title').forEach(function(element) {
// skip elements that don't have a href attribute...
if (!element.attributes.href) {
return;
}
// here come the contents of test2.js
casper.thenOpen(element.attributes.href, function() {
this.click('.responsivewrapper');
}).then(function(){
...
}).then(function(){
this.echo(yt_links.join('\n')).exit();
this.echo(yt_links.length + ' link(s) found');
});
});
});
I used the builder/promise pattern to make the code example a little shorter.

History.js causes infinite loop after four pushStates

I'm learning History.js and I stumbled upon a problem. I use this function to load content into block:
loadPost: function(article) {
Functions.foldAllBlocks();
var url = article.attr('data-url'),
content = article.find('.entry'),
oldText = content.text(),
blockPosition = article.offset().top,
postTitle = article.find('h2').text();
if(article.hasClass('expand')) {
article.attr('data-entry',oldText);
$.get(url, function (data) {
article.addClass('collapse').removeClass('expand');
var entry = $(data).find('.entry > *');
content.html(entry);
Functions.orderBlocks();
Functions.maintainHistory(postTitle, url);
$('body, html').animate({
scrollTop: blockPosition
},200);
});
} else {
article.addClass('expand').removeClass('collapse');
content.text(article.attr('data-entry'));
Functions.orderBlocks();
Functions.maintainHistory(baseSitename, baseURL);
$('body, html').animate({
scrollTop: blockPosition
},500);
}
}
and this to maintain my history state:
maintainHistory: function(title, url){
(function(window, undefined) {
History.Adapter.bind(window, 'statechange', function() {
var State = History.getState(),
cleanUrl = State.cleanUrl;
if(cleanUrl === baseURL) {
Functions.foldAllBlocks();
} else {
var article = $('h2 a[href="'+cleanUrl+'"]').closest('article');
Functions.loadPost(article);
}
});
History.pushState({ page: title }, title, url);
})(window);
}
There two functions are the only one using History.js. My problem is - every time, after fourth click (and thus, forth loadPost and maintainHistory firing) browser is getting stuck between last two articles.
What can I do to prevent this from happening?
Your maintainHistory function could use a minor rewrite. The way you have it now, the History.Adaptor will get stacked up with "statechange" handlers. Binding your event should only take place once, not every time you call the method. Try this:
maintainHistory: (function(window, undefined) {
console.log("Binding 'statechange' event - this should only happen once");
History.Adapter.bind(window, 'statechange', function() {
console.log("window.statechange event fired!");
var State = History.getState(),
cleanUrl = State.cleanUrl;
if(cleanUrl === baseURL) {
Functions.foldAllBlocks();
} else {
var article = $('h2 a[href="'+cleanUrl+'"]').closest('article');
Functions.loadPost(article);
}
});
// This is the function that is bound to maintainHistory
// The wrapping function is self-executing, will only happen once, and
// creates a nice little closure for binding the "statechange" event
return function(title, url) {
console.log("maintainHistory was called with title", title, "and url", url);
History.pushState({ page: title }, title, url);
};
})(window),
nextPropertyInFunctions: function() { ...}

clearInterval not working as I expect it too

I made a demo which is here. All you have to do is start typing in the text field, make sure you have the console open. So as you type, you'll instantly see the OMG Saved, and the counter in the console will go nuts.
Now click the button, watching the console you should see something like 11 or some other value, but you'll also see the counter reset and continues going. I do not want this. I want the counter to stop, I have clicked a button and while the page hasn't refreshed, the counter should stop if I understand these docs on setInterval().
the app I am developing which uses code very similar to this, does not refresh as most single page apps don't. So it is imperative that I have control over this setInterval.
So my question is:
How do I reset the counter such that, until I type again in the input box OR if the input box element cannot be found the flash message does not show up, the interval is set back to 0.
update
The following is the JavaScript code, which is run on the link provided above.
var ObjectClass = {
initialize: function() {
$('#flash-message').hide();
},
syncSave: function() {
$('#content').keypress(function(){
SomeOtherClass.autoSave = setInterval( function(){
$('#flash-message').show();
$('#flash-message').delay(1000).fadeOut('slow');
}, 500);
});
},
listenForClick: function() {
$('#click-me').click(function() {
console.log(SomeOtherClass.autoSave);
clearInterval(SomeOtherClass.autoSave);
});
}
};
var SomeOtherClass = {
autoSave: null
};
ObjectClass.initialize();
ObjectClass.syncSave();
ObjectClass.listenForClick();
You have to put this
clearInterval(SomeOtherClass.autoSave);
before this line:
SomeOtherClass.autoSave = setInterval( function(){
So that you kill the previous interval and you ahve ONLY ONE interval at the same time
Your code will be:
var ObjectClass = {
initialize: function () {
$('#flash-message').hide();
},
syncSave: function () {
$('#content').keypress(function () {
clearInterval(SomeOtherClass.autoSave);
SomeOtherClass.autoSave = setInterval(function () {
$('#flash-message').show();
$('#flash-message').delay(1000).fadeOut('slow');
}, 500);
});
},
listenForClick: function () {
$('#click-me').click(function () {
console.log(SomeOtherClass.autoSave);
clearInterval(SomeOtherClass.autoSave);
});
}
};
var SomeOtherClass = {
autoSave: null
};
ObjectClass.initialize();
ObjectClass.syncSave();
ObjectClass.listenForClick();
What you need to do is use a timeout instead of an interval, like this:
var ObjectClass = {
initialize: function() {
$('#flash-message').hide();
},
syncSave: function() {
$('#content').keypress(function(){
SomeOtherClass.autoSave = setTimeout( function(){
$('#flash-message').show();
$('#flash-message').delay(1000).fadeOut('slow');
}, 500);
});
},
listenForClick: function() {
$('#click-me').click(function() {
console.log(SomeOtherClass.autoSave);
if(typeof SomeOtherClass.autoSave === 'number'){
clearTimeout(SomeOtherClass.autoSave);
SomeOtherClass.autoSave = 0;
}
});
}
};
var SomeOtherClass = {
autoSave: 0
};
ObjectClass.initialize();
ObjectClass.syncSave();
ObjectClass.listenForClick();

Wait until div is not visible to process next line

I need to write some code which is supposed to wait until a predefined div is no longer visible in order to process the next line. I plan on using jQuery( ":visible" ) for this, and was thinking I could have some type of while loop. Does anyone have a good suggestion on how to accomplish this task?
$( document ).ready(function() {
$(".scroller-right" ).mouseup(function( event ) {
alert('right');
pollVisibility();
});
});
function pollVisibility() {
if ($(".mstrWaitBox").attr("visibility")!== 'undefined') || $(".mstrWaitBox").attr("visibility") !== false) {
alert('inside else');
microstrategy.getViewerBone().commands.exec('refresh');
} else {
setTimeout(pollVisibility, 100);
}
}
$( document ).ready(function() {
$(".scroller-right" ).mouseup(function( event ) {
alert('right');
pollVisibility();
});
});
function pollVisibility() {
if (!$(".mstrWaitBox").is(":visible")) {
alert('inside if');
microstrategy.getViewerBone().commands.exec('refresh');
} else {
setTimeout(pollVisibility, 100);
}
}
div when not visible:
<div class=​"mstrWaitBox" id=​"divWaitBox" scriptclass=​"mstrDialogImpl" dg=​"1" ty=​"edt">​
</div>​
div when visible:
<div class=​"mstrWaitBox" id=​"divWaitBox" scriptclass=​"mstrDialogImpl" dg=​"1" ty=​"edt" visibility="visible">​
</div>​
You can use the setTimeout function to poll the display status of the div. This implementation checks to see if the div is invisible every 1/2 second, once the div is no longer visible, execute some code. In my example we show another div, but you could easily call a function or do whatever.
http://jsfiddle.net/vHmq6/1/
Script
$(function() {
setTimeout(function() {
$("#hideThis").hide();
}, 3000);
pollVisibility();
function pollVisibility() {
if (!$("#hideThis").is(":visible")) {
// call a function here, or do whatever now that the div is not visible
$("#thenShowThis").show();
} else {
setTimeout(pollVisibility, 500);
}
}
}
Html
<div id='hideThis' style="display:block">
The other thing happens when this is no longer visible in about 3s</div>
<div id='thenShowThis' style="display:none">Hi There</div>
If your code is running in a modern browser you could always use the MutationObserver object and fallback on polling with setInterval or setTimeout when it's not supported.
There seems to be a polyfill as well, however I have never tried it and it's the first time I have a look at the project.
FIDDLE
var div = document.getElementById('test'),
divDisplay = div.style.display,
observer = new MutationObserver(function () {
var currentDisplay = div.style.display;
if (divDisplay !== currentDisplay) {
console.log('new display is ' + (divDisplay = currentDisplay));
}
});
//observe changes
observer.observe(div, { attributes: true });
div.style.display = 'none';
setTimeout(function () {
div.style.display = 'block';
}, 500);
However an even better alternative in my opinion would be to add an interceptor to third-party function that's hiding the div, if possible.
E.g
var hideImportantElement = function () {
//hide logic
};
//intercept
hideImportantElement = (function (fn) {
return function () {
fn.apply(this, arguments);
console.log('element was hidden');
};
})(hideImportantElement);
I used this approach to wait for an element to disappear so I can execute the other functions after that.
Let's say doTheRestOfTheStuff(parameters) function should only be called after the element with ID the_Element_ID disappears, we can use,
var existCondition = setInterval(function() {
if ($('#the_Element_ID').length <= 0) {
console.log("Exists!");
clearInterval(existCondition);
doTheRestOfTheStuff(parameters);
}
}, 100); // check every 100ms

Categories

Resources