Scrape chained selects with updated data using CasperJS - javascript

There are 2 selects with IDs. The 2nd select box data is linked based on what you select on the first select box. In other words, if you select "BMW" in the second select box should appear 316,318,320 ... you got the point.
The first select has listener
('#brand').change(function(){
call ajax and fill the data for the second select box
}
What I want to do at the end is to get all the options for models, for all the brands :-)
What I got so far is:
var casper = require('casper').create({
loadImages:false,
verbose: true,
logLevel: 'debug',
clientScripts: ["includes/jquery.min.js"]
});
casper.userAgent('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36');
casper.on('remote.message', function(msg) {
this.echo('remote message caught: ' + msg);
});
casper.on( 'page.error', function (msg, trace) {
this.echo( 'Error: ' + msg, 'ERROR' );
});
casper.start('http://www.mywebsite.kitchen/');
casper.then(function(){
this.evaluate(function(valueOptionSelect){
$('#brand').val(6).trigger('change');
},optionFirstSelect);
this.waitFor(function check() {
return this.evaluate(function(casper) {
var len = $('#model1 option').length;
console.log('length of options is ->', len);
return $('#model1 option').length > 1;
});
}, function then() {
//well i still haven't reach that point
}, function timeOut(){
casper.echo(arguments)
});
});
casper.run(function() {
//finish execution script
this.exit();
});
now what I get for console log is:
//EDIT - the length is 1 not 0
length of options is 1
When I execute $('#brand').val(6).trigger('change'); $('#model1 option').length in my browser console I get correct results.

Related

access the 'd' element from an SVG object with casperjs

I am a very beginer in javascript/phantomjs/casperjs (like only several days of learning) but I am stuck with this svg graph I am trying to scrap data from.
I am trying to access the d="M20,331.37,331.37,21.40...." element from an SVG object using a casperjs code, and write in the console and a txt file (or CSV). I try the following code:
var casper = require('casper').create({
pageSettings: {
loadImages: true,
loadPlugins: true,
userAgent: 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
}
});
//First step is to open baidu.html
casper.start().thenOpen("file:///baidu.html", function() {
console.log("Baidu website opened");
this.wait(6000);
});
casper.then(function() {
var dataFromGraph = this.getElementsAttribute(require('casper').selectXPath('//*[#id="trend"]/svg/path[6]'),"d");
this.echo(dataFromGraph);
require('fs').write("data_graph.txt", dataFromGraph,'w');
});
casper.run();
But nothing worked. I get NULL element or empty result.
This is all the other code I try:
var dataFromGraph = this.fetchText(require('casper').selectXPath('//*[#id="trend"]/svg/path[6]/d'));
var dataFromGraph = this.getElementsAttribute(require('casper').selectXPath('//*[#id="trend"]/svg/path[6]'),"d") //,"d")
var dataFromGraph = this.getElementInfo(require('casper').selectXPath('//*[#id="trend"]/svg/path[6]'))
var dataFromGraph = this.fetchText("#trend > svg > path");
I have the Xpath and the selector of the object but I am not sure how to acces it. Here is a picture of the element I want to scrap.
As the website I want to scrap need a password, this is the HTML file that I save from it https://ufile.io/5y9g2.
The element I want to scrap is the data behind the graph here.
Any help would be very appreciated.
I reworked your script a bit and now it works. Check the snippet below.
var fs = require('fs');
var casper = require('casper').create();
casper.start().thenOpen("http://localhost:8001/baidu.html", function() {
console.log("Baidu website opened");
});
casper.then(function() {
var graphData = this.evaluate(function() {
return document.querySelector('#trend > svg > path:nth-child(11)').getAttribute('d')
});
this.echo(graphData);
fs.write("data_graph.txt", graphData,'w');
});
casper.run();
Hope that helps!

How to click a button ajax via casperjs?

This is my code. It run ok, it access site, and fill zipcode ok. But i dont know why it cant click button "GO" . What wrong in my code ? Thank you
Site is: https://www.az.aaa.com/membership/gift-membership-form?promocode=zumz0
Thank you !
var casper = require("casper").create();
var mouse = require("mouse").create(casper);
var casper = require('casper').create({
verbose: true,
logLevel: 'debug', // debug, info, warning, error
clientScripts: ["includes/jquery.min.js"],
pageSettings:{
loadImages: true,
loadPlugins: false,
userAgent: 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
}
}
);
var fs = require('fs');
//var password = casper.cli.get(1);
function strip_tags(str) {
str = str.toString();
return str.replace(/<\/?[^>]+>/gi, '');
}
casper.start().thenOpen("https://www.az.aaa.com/membership/gift-membership-form?promocode=zumz0", function(){
console.log("1. Access Site");
});
casper.waitForSelector('#edit-zipcode', function(){
console.log("2. Box Zipcode Found ! ");
this.capture('site_box_zipcode.png');
});
casper.then(function(){
fs.write("debug_zipcode.html", this.getPageContent(), 'w');
});
casper.then(function(){
console.log("3. Filling Form Zipcode")
this.evaluate(function(){
$('#edit-zipcode').attr('value', '86322');
$('#edit-tqs1-submit').click();
//document.getElementById("edit-zipcode").value='86322';
//document.getElementById("edit-tqs1-submit").click();
});
//this.mouse.rightclick('button.col-xs-12 col-md-1 product-button js-product-button pull-right');
})
//
casper.then(function(){
this.wait(7000);//Wait a bit so page loads (there are a lot of ajax calls and that is why we are waiting 6 seconds)
this.capture('Afterpostzipcode.png');
console.log("4. Finish Capture Picture");
});
casper.then(function(){
fs.write("debug_fl.html", this.get, 'w');
});
casper.run();

CasperJS - NodeList.length return 0

I tried to extract data from some webpages using CasperJS, I have tried adding this.wait(5000) inside getDetails(), but I don't know why direktoriNodeList.length always return 0
PhantomJS : 2.0.0
CasperJS : 1.1.0-beta3
//casperjs --proxy=127.0.0.1:9050 --proxy-type=socks5 axa-mandiri.casper.js
var casper = require("casper").create({
verbose: true,
logLevel: "info",
pageSettings: {
loadImages: false, //The script is much faster when this field is set to false
loadPlugins: false,
userAgent: "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36"
}
});
var utils = require('utils');
var currentPage = 1;
var hospitals = [];
var url = 'https://www.axa-mandiri.co.id/direktori/rumah-sakit/';//Type your url
casper.start(url);//Start CasperJS
casper.waitForSelector('#main-direktori', processPage, stopScript);//Wait until content loads and then process the page
casper.run(function() {
utils.dump(hospitals);
this.exit();
});
function getDetails(){
/*
In this function you can put anything you want in order to extract your data from the website.
NOTE: This function is executed in page context, and will should be called as parameter to Casper's evaluate function.
*/
.
console.log("getDetails " + currentPage);
var details = [];
var direktoriNodeList = document.querySelectorAll("ul#main-direktori li.direktori-list");
console.log("direktoriNodeList.length " + direktoriNodeList.length);
utils.dump(direktoriNodeList);
for (var i = 0; i < direktoriNodeList.length; i++) {
console.log("querySelectorAll " + i);
var detail = {
name : direktoriNodeList[i].querySelector("div.details strong").textContent.replace(/\n/g, ''),
phone : direktoriNodeList[i].querySelector("div.details span:nth-child(1)").textContent.replace(/\n/g, ''),
map : direktoriNodeList[i].querySelector("div.map-details a.get-direction").getAttribute("href")
};
details.push(detail);
}
/*
[].forEach.call(document.querySelectorAll("ul#main-direktori li.direktori-list"), function(elem) {
console.log("querySelectorAll");
var detail = {
name : elem.querySelector("div.details strong").textContent.replace(/\n/g, ''),
phone : elem.querySelector("div.details span:nth-child(1)").textContent.replace(/\n/g, ''),
map : elem.querySelector("div.map-details a.get-direction").getAttribute("href")
};
details.push(detail);
});
*/
return JSON.stringify(details);
}
function stopScript() {
utils.dump(hospitals);
console.log("Exiting...");
this.exit();
};
function processPage() {
//your function which will do data scraping from the page. If you need to extract data from tables, from divs write your logic in this function
hospitals = hospitals.concat(this.evaluate(getDetails()));
//If there is no nextButton on the page, then exit a script because we hit the last page
if (this.exists("a.nextpostslink") == false) {
stopScript();
}
//Click on the next button
this.thenClick("a.nextpostslink").then(function() {
currentPage++;
this.waitForSelector("#main-direktori", processPage, stopScript);
});
};
casper.evaluate(fn, ...) expects a function, not an array. Change
hospitals = hospitals.concat(this.evaluate(getDetails()));
to
hospitals = hospitals.concat(this.evaluate(getDetails));
The problem here is that you're executing the function in the outer context instead of passing it into the page context. Don't forget to register to the "remote.message" event to see console.log() calls from the page context:
casper.on("remote.message", function(msg){
this.echo("remote> " + msg);
});

Select option from dropdown and submit request using nodejs

I am working on nodejs for scrapping a website and I am very new to nodejs.The website initial page is a popup in which one has to select option from selectbox and submit only then later pages can be browsed.this has to be done for first time and then it will be stored as cookie for later use.
I am able to get html page of popup but I am not able to select option from selectbox and submit request.
Here is my Code
var express = require('express');
var request=require('request');
var cheerio=require('cheerio');
var j = request.jar();
//var cookie = request.cookie();
j.setCookie("city_id=1; path=/; domain=.bigbasket.com", 'http://bigbasket.com/', function(error, cookie) {
//console.log("error"+error.message);
console.log("cookie "+cookie);
});
var app=express();
app.get('/', function(req, res){
console.log("hi");
var sessionVal = req.session;
request({uri:'http://bigbasket.com/',
headers:{'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36' ,
'content-type':'application/x-www-form-urlencoded; charset=UTF-8',
'connection':'keep-alive'},
jar:j},
function(err, response, body) {
// console.log("err "+err.message);
console.log("header"+JSON.stringify(response.headers));
console.log("status"+response.statusCode);
console.log("cookie "+response.cookie);
console.log(body);
var $=cheerio.load(body,{xmlMode: true});
console.log($);
var $selectBox= $('select').filter('.selectboxdiv');
console.log($selectBox.text());
response.end;
});
});
app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;
I am able to get select box options through below code :
var $selectBox= $('select').filter('.selectboxdiv');
console.log($selectBox.text());
But I am not able to select that option and submit.I have to select city from dropdown menu and submit request so that I could scrape data from upcoming webpages.
EDIT:
What do you see if you use 'req.body.NAME_OF_YOUR_DROPDOWN_HERE'? Maybe you can see the selected option then?

fastest way to scrape text node with casperjs

I have this structure and I need get text from plain text node like this
<strong><font color="#666666">Phones:</font></strong>
<br>
<br>
<img src="/image/fgh.jpg" title="Velcom" alt="Velcom" style="margin: 2 5 -3 5;">
"+375 29" //get this
<b>611 77 83</b> //and this
I try to use XPath copied from chrome console
casper.thenOpen('url', function() {
result = this.getElementInfo(x('//*[#id="main_content"]/table[2]/tbody/tr[17]/td/table/tbody/tr/td[1]/p[1]/text()[3]'));
});
casper.then(function() {
this.echo(result.text);
});
but it is not working. Also when I try result.data
console.log(this.getElementInfo(x('//*[#id="main_content"]/table[2]/tbody/tr[17]/td/table/tbody/tr/td[1]/p[1]/text()[3]')));
returns null, but this element exists in the page, I checked it out
Make sure you have included:
var x = require('casper').selectXPath;
If that is still not working the following will retrieve all text from page then you can parse. This is not recommended for performance but does work if you have anchor text to parse on. You will need to slightly modify.
var casper = require("casper").create ({
waitTimeout: 15000,
stepTimeout: 15000,
verbose: true,
viewportSize: {
width: 1400,
height: 768
},
onWaitTimeout: function() {
logConsole('Wait TimeOut Occured');
this.capture('xWait_timeout.png');
this.exit();
},
onStepTimeout: function() {
logConsole('Step TimeOut Occured');
this.capture('xStepTimeout.png');
this.exit();
}
});
casper.on('remote.message', function(msg) {
logConsole('***remote message caught***: ' + msg);
});
casper.userAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.4');
// vars
var gUrl = 'WebAddy'; //+++ Update URL
casper.start(gUrl, function() {
var tPlainText = this.evaluate(function() {
var bodyText = document.body;
var textContent = bodyText.textContent || bodyText.innerText;
var tCheck = textContent.indexOf("Phones:");
if (tCheck === -1) {
tPlainText = 'Phone Text Not Found';
return tPlainText;
} else {
// parse text
var tSplit = textContent.split('Phones:');
var tStr = (tSplit[1]) ? tSplit[1] : '';
var tPos1 = tStr.indexOf(''); //+++ insert text to stop parse
var tDesiredText = (tPos1 !== -1) ? tStr.substring(0, tPos1) : null;
return tDesiredText;
}
});
console.log(tPlainText);
});
casper.run();
An old question but I had the same issue. I need to get the following text, so here is how I did it.
__utils__.getElementByXPath("//bla...bla/following-sibling::node()").textContent;

Categories

Resources