Scraping dynamically rendered links from an infinite scrollbar in CasperJS

Scraping dynamically rendered links from an infinite scrollbar in CasperJS - javascript

I'm trying to scrape the links on the left sidebar of this page using CasperJS.
The page has hundreds of links in the sidebar, but only loads 20 at a time when you scroll down. This code successfully grabs the first 20 (needs casperjs and phantomjs globally installed to run):
var casper = require('casper').create();
// helper function that gets all of the resume links on the page:
var getAllLinks = function() {
var linksOnThisPage = []
$("a[href^='/ResumeB']").each(function(index, linkDiv) {
$linkDiv = $(linkDiv)
linksOnThisPage.push('http://www.super-resume.com' + $linkDiv.attr('href'))
});
return linksOnThisPage
};
//start casper, go to page, run helper function:
casper.start('http://www.super-resume.com/ResumeBuilder.jtp?query=Database+Administrator', function() {
allLinks=casper.evaluate(getAllLinks)
console.log('number of links found:', allLinks.length);
});
casper.run();
I can make the page scroll down in the actual browser with this:
$('#search-left-inner').scrollTop(10000);
10000 is an arbitrarily big number; every time you run that code in the browser, it loads 20 more links. (Ideally I'd like to be able to grab all at once without having to keep reloading 20 at a time, but that's less pressing for now.)
If I put that line inside the getAllLinks function like so:
var getAllLinks = function() {
$('#search-left-inner').scrollTop(10000);
var linksOnThisPage = []
//etc, etc,
it still only loads 20 links. Many similar posts discuss synchronicity issues, so I've I've tried to get the it to wait for the sidebar to finish loading a few ways, including this:
var getAllLinks = function() {
casper.then(function () {
$('#search-left-inner').scrollTop(100000);
});
casper.then(function () {
var linksOnThisPage = []
//etc. etc.
}
but now for some reason it only finds one link instead of 20.

I presume that if you scroll, it doesn't immediately load the next items, because loading takes time. You need to wait a little after scrolling before you can attempt to scrape all of the elements again.
casper.start(url)
.thenEvaluate(scroll)
.wait(5000, function(){
var links = this.evaluate(getAllLinks);
this.echo(links.length);
})
.run();
If this produces more links, then you can try the next step and that is infinite scrolling until no new elements are loaded. This can be done with asynchronous recursion in CasperJS:
var linkCount = -1;
function getAllLinks() {
var linksOnThisPage = []
$("a[href^='/ResumeB']").each(function(index, linkDiv) {
$linkDiv = $(linkDiv)
linksOnThisPage.push('http://www.super-resume.com' + $linkDiv.attr('href'))
});
return linksOnThisPage
}
function scroll() {
$('#search-left-inner').scrollTop(10000);
}
/**
* Returns true if more elements were loaded that were before
*/
function checkMore(){
var newLinks = this.evaluate(getAllLinks);
var newCount = newLinks.length;
if (linkCount === -1) {
linkCount = newCount;
}
return linkCount < newCount
}
/**
* Executes the a single iteration step and waits for a change in numbers.
* Terminates if there are no changes in 6 seconds.
*/
function step(){
this.thenEvaluate(scroll)
.waitFor(check, step, function _onTimeout(){
var links = this.evaluate(getAllLinks);
this.echo("finished with " + links.length + " links\n\n"+links.join("\n"));
}, 6000);
}
casper.start(url, step).run();
Keep in mind that it makes only sense to use jQuery in the DOM context (page context) which is inside of casper.evaluate(). I suggest that you also read the PhantomJS documentation of that function.

Related

JavaScript works when setTimeout() is used, but it isn't working when document.eventListener('DOMContentLoaded', x) is used on a WordPress page. Why?

I have a few lines of JavaScript code that pick up heading texts from separate sections and place them into their respective input fields. They are also executed on single pages using wp_enqueue_script.
It works absolutely fine when setTimeout() is used:
function passengerElevator() {
var getProductName = document.querySelectorAll('[data-id="6657316"]');
getProductName.forEach(function(item) {
var productName = item.querySelector('.lift');
var inputArea = item.querySelector('input[name=product]');
inputArea.value = productName.innerText;
});
var getProductName = document.querySelectorAll('[data-id="e9c06d5"]');
getProductName.forEach(function(item) {
var productName = item.querySelector('.lift');
var inputArea = item.querySelector('input[name=product]');
inputArea.value = productName.innerText;
});
setTimeout(function() { passengerElevator() },3000);
However, there is problem of page size (some pages have more than 10 input fields) and I don't want to set an astronomically high ms to delay the script. So I decided to fire it on DOMContentLoaded:
document.addEventListener("DOMContentLoaded", passengerElevator);
function passengerElevator() {
var getProductName = document.querySelectorAll('[data-id="6657316"]');
getProductName.forEach(function(item) {
var productName = item.querySelector('.lift'); // heading text (ex:Panoramic Lift)
var inputArea = item.querySelector('input[name=product]');
inputArea.value = productName.innerText; //ouput here
});
var getProductName = document.querySelectorAll('[data-id="e9c06d5"]');
getProductName.forEach(function(item) {
var productName = item.querySelector('.lift'); // Heading text (ex:Home Lift)
var inputArea = item.querySelector('input[name=product]');
inputArea.value = productName.innerText; // Output here
});
}
As you may have already guessed, it is not working. Is my code too messy to be executed faster or is there any other problem I am missing?
I know similar questions have been asked previously, however, no existing answer I found was able to help me.

It seems like you try to loop through elements that are still not loaded. Perhaps they are being appended to the page via Ajax, so DOMContentLoaded can't help there.
You can create your own check for those elements using setInterval, so use something like this:
let dataIdCheck = setInterval(() => {
if (document.querySelectorAll('[data-id="6657316"]').length > 0 && document.querySelectorAll('[data-id="e9c06d5"]').length > 0) {
clearInterval(dataIdCheck);
// your code here
}
}, 500);
This code will run every 500 milliseconds and check if those two elements exists, using .length. Once they do exists, we stop the interval and run the code.
I also suggest to do console.log('in') to check that our interval stop running once the elements are found.

jQuery prevent reload if button clicked

I have a jQuery datatable that immediately loads ON READY. After that, the datatable is reloaded every 30 seconds. This feature is functioning properly.
I have added a search feature that automatically reloads the datatable with new search results. This part is also functioning properly.
The problem I am experiencing is when I am using the search feature, and the new search results are returned. After 30 seconds, the new results are cleared and the datatable reloads with all of the original records.
Here is what I am currently attempting:
$(document).ready(function()
{
var searchCriteria = "";
displayBookings(searchCriteria);
var idle = 0;
var idleInterval = setInterval(timer, 30000);
$(this).mousemove(function(e){idle = 0;});
$(this).keypress(function(e){idle = 0;});
function timer()
{
idle = idle + 1;
if(idle > 2)
{
displayBookings(searchCriteria);
console.log('table reloaded');
}
}
$('#searchPending').on('click', function()
{
var isPending = 'Y';
var searchCriteria = {
isPending: isPending
};
displayBookings(searchCriteria);
});
});
The function displayBookings() takes searchCriteria. If searchCriteria is blank, then a basic query is fired. Obviously is searchCriteria contains parameters, then the same query is fired with a WHERE clause attached. I did not disclose the code for displayBookings().
All I need to do is stop the 30 second interval if the #searchPending button is clicked.

Clear the interval so it will stop loading.
clearInterval(idleInterval)
specifically in your code:
$('#searchPending').on('click', function()
{
clearInterval(idleInterval)
var isPending = 'Y';
var searchCriteria = {
isPending: isPending
};
displayBookings(searchCriteria);
});

Rather than start and stop the timer interval, since you'll run into a bit of a race condition, you can just have the "refresh" (your "timer" function) refresh using the latest search criteria. To do this, just pass the same object into your displayBookings function. E.g.
const search = { criteria: "" };
$(...).click(() => {
search.criteria = 'change it...';
displayBookings(search.criteria);
});
setInterval(() => displayBookings(search.criteria), 30000);
This way, if a refresh happens, it will use the latest search.criteria. You can achieve the same result with minimal change in your code by simply removing the var from the second searchCriteria. Currently, without removing the var, your outer criteria is being "shadowed" by your inner.
I alluded to debouncing1 in one of my comments. I misread the code and debouncing is not what you want. Instead, you want to only "refresh" if there hasn't been any user activity within some threshold. Here's an alternative from the approach you used:
let lastInteraction = 0;
function interact() {
lastInteraction = Date.now();
}
$(this).mousemove(interact);
$(this).keypress(interact);
Then in your refresh function:
if (Date.now() - lastInteraction > threshold) { ...
Implementing both the central criteria and revised idle check:
$(document).ready(function() {
const idle = {
threshold: 1000,
lastInteraction: 0,
interact() {
idle.lastInteraction = Date.now();
},
isIdle() {
return Date.now() - idle.lastInteraction > idle.threshold;
}
};
const search = { criteria: "" };
$(this).mousemove(idle.interact);
$(this).keypress(idle.interact);
setInterval(() => {
if (idle.isIdle()) {
displayBookings(search.criteria);
}
}, 30000);
$('#searchPending').on('click', () => {
search.criteria = { isPending: 'Y' };
displayBookings(search.criteria);
});
displayBookings(search.criteria);
});
1 The Wikipedia article linked to discusses debouncing with a keyboard. It's the same concept. You'd use debouncing on your displayBookings function if you plan on having it execute live as the user is typing. This would prevent too many HTTP requests from happening in a short duration of time.

Check if Typekit has loaded with JavaScript

I'm currently building a layout that animates using jQuery, and I am finding out the width of a div using .width(). However, sometimes it is getting the .width() before TypeKit has been activated (thus giving an incorrect width).
Is there a way to check when TypeKit has loaded by using an if statement?

Yes, there is.
Instead of calling the usual try{Typekit.load();}catch(e){} in your head tag, you can use Typekit.load with callbacks (docs):
try {
Typekit.load({
loading: function() {
// JavaScript to execute when fonts start loading
},
active: function() {
// JavaScript to execute when fonts become active
// this is where you want to init your animation stuff
},
inactive: function() {
// JavaScript to execute when fonts become inactive
}
})
} catch(e) {}
I've literally just done this for my own project, where I don't have the ability to change that code. So If you're in the same situation, try this:
// configure these
var check_interval = 100; // how many ms to leave before checking again
var give_up_after_ms = 2000; // how many ms before we consider the page loaded anyway.
// caches etc
var count = 0;
var count_limit = give_up_after_ms / check_interval;
var html = $("html");
var font_loaded_check_interval;
var check_load_status = function(callback) {
if(html.hasClass("wf-active") || count >= count_limit) {
// fonts are loaded or give_up_after_ms was reached
if(font_loaded_check_interval) {
clearInterval(font_loaded_check_interval);
font_loaded_check_interval = null;
}
// call the callback
callback.call(this);
return true;
}
count++;
return false;
};
function doneCallback() {
// code to run when fonts are loaded or timeout reached
alert("Done");
}
// check on initial run of JS, and if not ready, start checking at regular intervals.
if( ! check_load_status(doneCallback)) {
font_loaded_check_interval = setInterval(function() {
check_load_status(doneCallback);
}, check_interval);
}

Using Multiple page.open in Single Script

My goal is to execute PhantomJS by using:
// adding $op and $er for debugging purposes
exec('phantomjs script.js', $op, $er);
print_r($op);
echo $er;
And then inside script.js, I plan to use multiple page.open() to capture screenshots of different pages such as:
var url = 'some dynamic url goes here';
page = require('webpage').create();
page.open(url, function (status) {
console.log('opening page 1');
page.render('./slide1.png');
});
page = require('webpage').create();
page.open(url, function (status) {
console.log('opening page 2');
page.render('./slide2.png');
});
page = require('webpage').create();
page.open(url, function (status) {
console.log('opening page 3');
page.render('./slide3.png');
phantom.exit(); //<-- Exiting phantomJS only after opening all 3 pages
});
On running exec, I get the following output on page:
Array ( [0] => opening page 3 ) 0
As a result I only get the screenshot of the 3rd page. I'm not sure why PhantomJS is skipping the first and second blocks of code (evident from the missing console.log() messages that were supposed to be output from 1st and 2nd block) and only executing the third block of code.

The problem is that the second page.open is being invoked before the first one finishes, which can cause multiple problems. You want logic roughly like the following (assuming the filenames are given as command line arguments):
function handle_page(file){
page.open(file,function(){
...
page.evaluate(function(){
...do stuff...
});
page.render(...);
setTimeout(next_page,100);
});
}
function next_page(){
var file=args.shift();
if(!file){phantom.exit(0);}
handle_page(file);
}
next_page();
Right, it's recursive. This ensures that the processing of the function passed to page.open finishes, with a little 100ms grace period, before you go to the next file.
By the way, you don't need to keep repeating
page = require('webpage').create();

I've tried the accepted answer suggestions, but it doesn't work (at least not for v2.1.1).
To be accurate the accepted answer worked some of the time, but I still experienced sporadic failed page.open() calls, about 90% of the time on specific data sets.
The simplest answer I found is to instantiate a new page module for each url.
// first page
var urlA = "http://first/url"
var pageA = require('webpage').create()
pageA.open(urlA, function(status){
if (status){
setTimeout(openPageB, 100) // open second page call
} else{
phantom.exit(1)
}
})
// second page
var urlB = "http://second/url"
var pageB = require('webpage').create()
function openPageB(){
pageB.open(urlB, function(){
// ...
// ...
})
}
The following from the page module api documentation on the close method says:
close() {void}
Close the page and releases the memory heap associated with it. Do not use the page instance after calling this.
Due to some technical limitations, the web page object might not be completely garbage collected. This is often encountered when the same object is used over and over again. Calling this function may stop the increasing heap allocation.
Basically after I tested the close() method I decided using the same web page instance for different open() calls is too unreliable and it needed to be said.

You can use recursion:
var page = require('webpage').create();
// the urls to navigate to
var urls = [
'http://phantomjs.org/',
'https://twitter.com/sidanmor',
'https://github.com/sidanmor'
];
var i = 0;
// the recursion function
var genericCallback = function () {
return function (status) {
console.log("URL: " + urls[i]);
console.log("Status: " + status);
// exit if there was a problem with the navigation
if (!status || status === 'fail') phantom.exit();
i++;
if (status === "success") {
//-- YOUR STUFF HERE ----------------------
// do your stuff here... I'm taking a picture of the page
page.render('example' + i + '.png');
//-----------------------------------------
if (i < urls.length) {
// navigate to the next url and the callback is this function (recursion)
page.open(urls[i], genericCallback());
} else {
// try navigate to the next url (it is undefined because it is the last element) so the callback is exit
page.open(urls[i], function () {
phantom.exit();
});
}
}
};
};
// start from the first url
page.open(urls[i], genericCallback());

Using Queued Processes, sample:
var page = require('webpage').create();
// Queue Class Helper
var Queue = function() {
this._tasks = [];
};
Queue.prototype.add = function(fn, scope) {
this._tasks.push({fn: fn,scope: scope});
return this;
};
Queue.prototype.process = function() {
var proxy, self = this;
task = this._tasks.shift();
if(!task) {return;}
proxy = {end: function() {self.process();}};
task.fn.call(task.scope, proxy);
return this;
};
Queue.prototype.clear = function() {
this._tasks = []; return this;
};
// Init pages .....
var q = new Queue();
q.add(function(proxy) {
page.open(url1, function() {
// page.evaluate
proxy.end();
});
});
q.add(function(proxy) {
page.open(url2, function() {
// page.evaluate
proxy.end();
});
});
q.add(function(proxy) {
page.open(urln, function() {
// page.evaluate
proxy.end();
});
});
// .....
q.add(function(proxy) {
phantom.exit()
proxy.end();
});
q.process();
I hope this is useful, regards.

JS difference between enter page and refresh page

Is it possible to detect situation when page is entered and when the same page is refreshed
if (entered) alert("hi");
if (refreshed) alert("you've refreshed");
Somehow there are some little differences between page rendering when entered and when refreshed and it would be much easier to detect the case than to debug it for me (if its even possible - maybe some browser optimization stuff is causing it).

This isn't an ideal solution, but if your page can load in under 5 seconds than this will work, and assuming you are not navigation to another page, then returning within 5 seconds.
window.onbeforeunload = function(){
window.sessionStorage.setItem('lastvisit', new Date().getTime());
}
var lastVisit = +window.sessionStorage.getItem('lastvisit');
var isRefresh = (new Date().getTime() - lastVisit) < 5000;
console.log(isRefresh);

There is no perfect way of tracking reloads verses new page loads but this solution works in most situations. Use sessionStorage in combination with an unload event:
(function (win) {
'use strict';
var reloaded = false,
ss = win.sessionStorage,
offset = 1000, // 1 second, may need tweaking if
// your page takes a long time to load/where
// this code is located in your page
now = function () {
return (new Date()).getTime();
},
lastUnload = ss.getItem('lastunload'),
loadStatus = document.getElementById('status');
// sessionStorage returns null if nothing was stored
if (lastUnload !== null) {
// sessionStorage returns a string, +lastUnload
// coerces the string held in lastUnload into an integer
// see https://developer.mozilla.org/en-US/docs/JavaScript/Reference/Operators/Arithmetic_Operators#-_.28Unary_Negation.29
if (+lastUnload + offset > now()) {
reloaded = true;
}
}
win.addEventListener('unload', function () {
ss.setItem('lastunload', now());
}, false);
if (lastUnload === null) {
loadStatus.innerHTML = 'First visit of session.';
} else if (reloaded) {
loadStatus.innerHTML = 'Page was reloaded.';
} else {
loadStatus.innerHTML = 'Navigated back to page after leaving';
}
}(window));
This code defines a page reload as returning to the page within 1 second of leaving it, so there could be false positives if someone leaves the page and immediately hits the back button but with normal browsing behavior that really shouldn't happen. You can modify the offset variable if you want to give more or less leeway, but 1 second seems to be a good default.
After developing this code I also found this similar answer.

If sessionStorage is available, you can use that.
if (!window.sessionStorage.getItem('visited')) {
//entered
window.sessionStorage.setItem('visited', true);
}
else {
//refreshed
}
More on sessionStorage

Develop Reference

JavaScript is the programming language of the Web.

Scraping dynamically rendered links from an infinite scrollbar in CasperJS - javascript

Related

JavaScript works when setTimeout() is used, but it isn't working when document.eventListener('DOMContentLoaded', x) is used on a WordPress page. Why?

jQuery prevent reload if button clicked

Check if Typekit has loaded with JavaScript

Using Multiple page.open in Single Script

JS difference between enter page and refresh page

Categories

Resources