fastest way to scrape text node with casperjs - javascript

I have this structure, and I need to get the text from the plain text nodes, like this:
<strong><font color="#666666">Phones:</font></strong>
<br>
<br>
<img src="/image/fgh.jpg" title="Velcom" alt="Velcom" style="margin: 2 5 -3 5;">
"+375 29" //get this
<b>611 77 83</b> //and this
I tried to use the XPath copied from the Chrome console:
// Shared by the two steps below: assigned in the thenOpen callback and
// echoed in the following step. Declared explicitly to avoid an implicit global.
var result;
casper.thenOpen('url', function() {
    // '@id' is the XPath attribute test; the '#id' form is not valid XPath.
    result = this.getElementInfo(x('//*[@id="main_content"]/table[2]/tbody/tr[17]/td/table/tbody/tr/td[1]/p[1]/text()[3]'));
});
casper.then(function() {
    this.echo(result.text);
});
but it is not working. Also when I try result.data
// '@id' is the XPath attribute test; the '#id' form is not valid XPath.
console.log(this.getElementInfo(x('//*[@id="main_content"]/table[2]/tbody/tr[17]/td/table/tbody/tr/td[1]/p[1]/text()[3]')));
returns null, even though this element exists on the page (I checked).

Make sure you have included:
var x = require('casper').selectXPath;
If that still does not work, the following script retrieves all the text from the page so that you can parse it yourself. This is not recommended for performance reasons, but it works if you have anchor text to parse on. You will need to modify it slightly.
/**
 * Prints a diagnostic line to the console.
 * The handlers below call logConsole, but the snippet never defined it;
 * without a definition each call would throw a ReferenceError before the
 * capture()/exit() calls that follow it could run.
 */
function logConsole(message) {
    console.log(message);
}

var casper = require("casper").create({
    waitTimeout: 15000,
    stepTimeout: 15000,
    verbose: true,
    viewportSize: {
        width: 1400,
        height: 768
    },
    // On a wait timeout: log it, save a screenshot and stop the run.
    onWaitTimeout: function() {
        logConsole('Wait TimeOut Occured');
        this.capture('xWait_timeout.png');
        this.exit();
    },
    // On a step timeout: log it, save a screenshot and stop the run.
    onStepTimeout: function() {
        logConsole('Step TimeOut Occured');
        this.capture('xStepTimeout.png');
        this.exit();
    }
});

// Forward console output produced inside the page to the script's console.
casper.on('remote.message', function(msg) {
    logConsole('***remote message caught***: ' + msg);
});

casper.userAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.4');

// vars
var gUrl = 'WebAddy'; //+++ Update URL

casper.start(gUrl, function() {
    // Runs inside the page: returns the text that follows "Phones:" in the
    // body text, up to the stop marker configured below.
    var tPlainText = this.evaluate(function() {
        var bodyText = document.body;
        var textContent = bodyText.textContent || bodyText.innerText;
        if (textContent.indexOf("Phones:") === -1) {
            // Return the message directly; the original assigned it to an
            // undeclared variable first, which created a global in the page.
            return 'Phone Text Not Found';
        }
        // parse text
        var tStr = textContent.split('Phones:')[1] || '';
        var tPos1 = tStr.indexOf(''); //+++ insert text to stop parse
        return (tPos1 !== -1) ? tStr.substring(0, tPos1) : null;
    });
    console.log(tPlainText);
});

casper.run();

An old question, but I had the same issue: I needed to get the text that follows an element. Here is how I did it:
__utils__.getElementByXPath("//bla...bla/following-sibling::node()").textContent;

Related

FB Share JavaScript "share_button is not defined"

Hello, I am trying to implement Facebook sharing in my JavaScript code, but I get this error. I tried several suggested solutions, but none of them worked in my implementation.
Could someone help me correct this error?
Thanks in advance. Any help is welcome.
Full code:
https://pastebin.com/kSgFDf0L
Error on console.
FB Share JavaScript "share_button is not defined"
// Page setup executed once the DOM is ready.
// NOTE(review): this snippet is truncated — the outer ready handler and the
// fancybox caption callback are never closed in the visible code.
$(document).ready(function() {
// Assigns the fbAsyncInit callback, which initialises FB with the app settings
// (presumably invoked by the Facebook SDK once it has loaded — confirm).
window.fbAsyncInit = function() {
FB.init({appId: 'xxxx', status: true, cookie: true,
xfbml: true});
};
// Create an async <script> tag for the Facebook SDK, using the current page's
// protocol, and append it to the #fb-root element.
(function() {
var e = document.createElement('script'); e.async = true;
e.src = document.location.protocol +
'//connect.facebook.net/en_US/all.js';
document.getElementById('fb-root').appendChild(e);
}());
// Nested ready handler: clicking #share_button opens an FB.ui 'feed' dialog
// with the hard-coded fields below instead of performing the default action.
$(document).ready(function(){
$('#share_button').click(function(e){
e.preventDefault();
FB.ui(
{
method: 'feed',
name: 'This is the content of the "name" field.',
link: 'http://www.groupstudy.in/articlePost.php?id=A_111213073144',
picture: 'http://www.groupstudy.in/img/logo3.jpeg',
caption: 'Top 3 reasons why you should care about your finance',
description: "What happens when you don't take care of your finances? Just look at our country -- you spend irresponsibly, get in debt up to your eyeballs, and stress about how you're going to make ends meet. The difference is that you don't have a glut of taxpayers…",
message: ""
});
});
});
// Clicking a .result element: read its data-linkId and href, pass both to
// increaseLinkClicks, and return false from the handler.
$(".result").on("click", function() {
var id = $(this).attr("data-linkId");
var url = $(this).attr("href");
if(!id) {
alert("data-linkId attribute not found");
}
// Called even when the alert above fired (id is then falsy).
increaseLinkClicks(id, url);
return false;
});
// Image grid: images are made visible when the "layoutComplete" event fires.
var grid = $(".imageResults");
grid.on("layoutComplete", function() {
$(".gridItem img").css("visibility", "visible");
});
grid.masonry({
itemSelector: ".gridItem",
columnWidth: 200,
gutter: 5,
isInitLayout: false
});
// Fancybox setup: the caption callback reads the element's data-caption and
// data-siteurl values and, for image items, appends extra lines to the caption.
$("[data-fancybox]").fancybox({
caption : function( instance, item ) {
var caption = $(this).data('caption') || '';
var siteUrl = $(this).data('siteurl') || '';
if ( item.type === 'image' ) {
caption = (caption.length ? caption + '<br />' : '')
+ 'View image<br>'
+ 'Visit page<br>'
+ 'Share';
}
I believe that declaring the variable will allow the function to execute; I just don't know whether it will behave the way you expect.
Well, I believe this works:
caption : function( instance, item ) {
var caption = $(this).data('caption') || '';
var siteUrl = $(this).data('siteurl') || '';
var share_button = $('#share_button') || '';

access the 'd' element from an SVG object with casperjs

I am a complete beginner in JavaScript/PhantomJS/CasperJS (only several days of learning), and I am stuck on this SVG graph that I am trying to scrape data from.
I am trying to access the d="M20,331.37,331.37,21.40...." attribute of an SVG object using CasperJS code, and to write it to the console and to a txt (or CSV) file. I tried the following code:
var casper = require('casper').create({
    pageSettings: {
        loadImages: true,
        loadPlugins: true,
        userAgent: 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
    }
});

//First step is to open baidu.html
casper.start().thenOpen("file:///baidu.html", function() {
    console.log("Baidu website opened");
    this.wait(6000);
});

// Second step: read the "d" attribute of the selected path, echo it and
// write it to data_graph.txt.
casper.then(function() {
    // '@id' is the XPath attribute test; the '#id' form is not valid XPath.
    var dataFromGraph = this.getElementsAttribute(require('casper').selectXPath('//*[@id="trend"]/svg/path[6]'),"d");
    this.echo(dataFromGraph);
    require('fs').write("data_graph.txt", dataFromGraph,'w');
});

casper.run();
But nothing worked: I get a null element or an empty result.
This is all the other code I tried:
// Alternative attempts (each was tried on its own). '@id' is the XPath
// attribute test; the '#id' form is not valid XPath.
var dataFromGraph = this.fetchText(require('casper').selectXPath('//*[@id="trend"]/svg/path[6]/d'));
var dataFromGraph = this.getElementsAttribute(require('casper').selectXPath('//*[@id="trend"]/svg/path[6]'),"d"); //,"d")
var dataFromGraph = this.getElementInfo(require('casper').selectXPath('//*[@id="trend"]/svg/path[6]'));
var dataFromGraph = this.fetchText("#trend > svg > path");
I have the XPath and the selector of the object, but I am not sure how to access it. Here is a picture of the element I want to scrape.
As the website I want to scrape requires a password, this is the HTML file that I saved from it: https://ufile.io/5y9g2.
The element I want to scrape is the data behind the graph here.
Any help would be very much appreciated.
I reworked your script a bit and now it works. Check the snippet below.
var fs = require('fs');
var casper = require('casper').create();

casper.start().thenOpen("http://localhost:8001/baidu.html", function() {
    console.log("Baidu website opened");
});

// Reads the "d" attribute of the selected <path> inside the page, echoes it
// and writes it to data_graph.txt.
casper.then(function() {
    var graphData = this.evaluate(function() {
        var path = document.querySelector('#trend > svg > path:nth-child(11)');
        // querySelector returns null when nothing matches; calling
        // getAttribute on it would throw inside the page.
        return path ? path.getAttribute('d') : null;
    });
    if (graphData == null) {
        // Nothing to write: the element or its "d" attribute is missing.
        this.echo("No matching path element (or no 'd' attribute) found; nothing written.");
        return;
    }
    this.echo(graphData);
    fs.write("data_graph.txt", graphData,'w');
});

casper.run();
Hope that helps!

Skip or ignore JavaScript error (on page third-party JS) in Phantom JS

Consider the code below:
// Ordered list of step functions; reassigned with the actual steps further down.
var steps = [];
// Index into `steps` of the next step to run.
var testindex = 0;
// Not referenced in the visible part of the script.
var testError = 0;
// Checked by the step runner: a step is only started while this is false.
var loadInProgress = false;
// Create the PhantomJS page object that the steps operate on.
var webPage = require('webpage');
var page = webPage.create();
// Page settings: user-agent string, JavaScript enabled, image loading disabled.
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36';
page.settings.javascriptEnabled = true;
page.settings.loadImages = false;
// Flags set on the global phantom object.
// NOTE(review): confirm that phantom.javascriptEnabled is a property PhantomJS actually honours.
phantom.cookiesEnabled = true;
phantom.javascriptEnabled = true;
steps = [
//Step 1 - Open site
function() {
console.log('Step 1 - Open UK site');
page.open("http://www.example.com/", function(status) {
if (status !== 'success') {
console.log("Adidas UK is loading for long");
}
});
},
function() {
console.log('Step 2 - Search for the product');
page.evaluate(function(login, password) {
bla bla bla ...
});
},
function() {
bla bla ...
},
many more functions ...
//Execute steps one by one
// Declared with var so the timer handle is not created as an implicit global.
var interval = setInterval(executeRequestsStepByStep, 50);
/**
 * Timer tick for the step runner.
 *
 * Starts the step at `testindex` when no load is in progress and that entry
 * is a function, then advances the index. Afterwards, if the entry at the
 * (possibly advanced) index is not a function, logs completion and calls
 * phantom.exit().
 */
function executeRequestsStepByStep() {
    var idle = (loadInProgress == false);
    if (idle && typeof steps[testindex] == "function") {
        //console.log("step " + (testindex + 1));
        steps[testindex]();
        testindex++;
    }

    // Re-read the slot: the index may have just moved past the last step.
    var nothingLeft = (typeof steps[testindex] != "function");
    if (nothingLeft) {
        console.log("Script is completed...");
        phantom.exit();
    }
}
// Error handler installed on the page: prints the message, followed by one
// line per stack frame when a trace is supplied, then exits with status 2.
page.onError = function(msg, trace) {
    var report = ['ERROR: ' + msg];
    if (trace && trace.length) {
        report.push('TRACE:');
        trace.forEach(function(frame) {
            var origin = '';
            if (frame.function) {
                origin = ' (in function "' + frame.function + '")';
            }
            report.push(' -> ' + frame.file + ': ' + frame.line + origin);
        });
    }
    console.error(report.join('\n'));
    phantom.exit(2); //this is needed in case any error related to phantom script occurs.
};
In the above code example, when I execute the script I get an error like the following in Step 1 and Step 5, which is caused by some JavaScript code executing on the page at some point.
ERROR: TypeError: undefined is not a constructor (evaluating 'sf_chat_Widget.loc
ale.includes("CH")')
TRACE:
-> https://e-com.secure.force.com/adidasContact/servlet/servlet.FileDownload?fi
le=01520000003gdry: 58 (in function "loadWidget")
-> http://www.adidas.co.uk/: 3367 (in function "onload")
phantomjs://code/UkScript.js:158 in onError
I wish to ignore or skip this error without halting the execution of the steps in the script, and to catch only errors caused by the PhantomJS script itself. That is not what happens at the moment: every error that occurs on the page is reported.
Is there already a way to skip or ignore specific on page error and
just catch errors happened in phantom js script?
P.S. I'm new to Stackoverflow when it comes to asking a question so if you find anything irrelevant or it is already been asked before please don't hesitate to fire.
Your help is much appreciated!

How to get JSON objects embedded in HTML page result of JS running by PhantomJS and pass them to java code?

I use the JS script described in this answer, but I don't want to save the resulting HTML page to a file. I want to extract the JSON objects from <div class="rg_meta"> and pass them to Java code.
While searching, I found that I could use "document", but I get an undefined error. I am a newbie in PhantomJS and in working with JSON in Java.
// PhantomJS script: opens a results page and keeps scrolling it so that more
// result tiles load.
// Arguments (system.args[1..3]): url, searchParameter, count — where count is
// the number of consecutive polls without a new tile after which the script exits.
var page = require('webpage').create();
var fs = require('fs');
var system = require('system');
var url = "";
var searchParameter = "";
var count=0;
if (system.args.length === 4) {
    url=system.args[1];
    searchParameter=system.args[2];
    count=system.args[3];
}
if(url==="" || searchParameter===""){
    phantom.exit();
}
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36';
page.zoomFactor = 0.1;
page.viewportSize = {
    width: 1920,
    height: 1080
};
// data-ri value of the last tile seen on the previous poll.
var divCount="-1";
var topPosition=0;
// Number of consecutive polls on which the last tile did not change.
var unchangedCounter=0;
page.open(url, function(status) {
    console.log("Status: " + status);
    if(status === "success") {
        // Poll every 500 ms: read the last tile, scroll one viewport height further.
        window.setInterval(function() {
            var newDivCount = page.evaluate(function() {
                var divs = document.querySelectorAll(".rg_di.rg_bx.rg_el.ivg-i");
                // No tiles yet: avoid dereferencing divs[-1].
                if (divs.length === 0) {
                    return null;
                }
                return divs[divs.length-1].getAttribute("data-ri");
            });
            topPosition = topPosition + 1080;
            page.scrollPosition = {
                top: topPosition,
                left: 0
            };
            if(newDivCount===divCount){
                // Nothing new loaded: try the #smb button.
                page.evaluate(function() {
                    // The DOM method is getElementsByClassName (plural);
                    // the misspelt singular form is undefined and throws when called.
                    var elems=document.getElementsByClassName("rg_meta");
                    console.log(elems.length);
                    var button = document.querySelector("#smb");
                    // querySelector yields null (not undefined) when nothing matches.
                    if (button !== null) {
                        button.click();
                        console.log('Clicked');
                        return true;
                    }
                    return false;
                });
                if(parseInt(unchangedCounter,10) === parseInt(count,10)){
                    /* var path = searchParameter+'.html';
                    fs.write('seedHtml/'+path, page.content, 'w');
                    console.log('printing html');*/
                    phantom.exit();
                }else{
                    unchangedCounter=unchangedCounter+1;
                }
            }else{
                unchangedCounter=0;
            }
            divCount = newDivCount;
        }, 500);
    }else{
        phantom.exit();
    }
});
HTML5 data Attributes
Fortunately, HTML5 introduces custom data attributes.
<div id="msglist" data-user="bob" data-list-size="5" data-maxage="180"></div>
Custom data attributes:
are strings — you can store anything which can be string encoded, such as JSON. Type conversion must be handled in JavaScript.
should only be used when no suitable HTML5 element or attribute exists.
JavaScript Parsing #1:
Every browser will let you fetch and modify data- attributes using the getAttribute and setAttribute methods, e.g.
var msglist = document.getElementById("msglist");
// getAttribute returns a string, so convert it before doing arithmetic;
// otherwise "5" + 3 concatenates to "53" instead of adding to 8.
var show = parseInt(msglist.getAttribute("data-list-size"), 10);
msglist.setAttribute("data-list-size", show+3);
It works, but should only be used as a fallback for older browsers.
JavaScript Parsing #2:
Since version 1.4.3, jQuery’s data() method has parsed HTML5 data attributes. You don’t need to specify the data- prefix so the equivalent code can be written:
// Select the #msglist element with jQuery.
var msglist = $("#msglist");
// Read the list-size value through data(); the "data-" prefix is omitted.
var show = msglist.data("list-size");
// Store the value increased by 3.
// NOTE(review): assumes data() yields a number here, so that + adds rather than concatenates — confirm.
msglist.data("list-size", show+3);
Hope it helps!!!

CasperJS - NodeList.length return 0

I am trying to extract data from some web pages using CasperJS. I have tried adding this.wait(5000) inside getDetails(), but I don't know why direktoriNodeList.length always returns 0.
PhantomJS : 2.0.0
CasperJS : 1.1.0-beta3
//casperjs --proxy=127.0.0.1:9050 --proxy-type=socks5 axa-mandiri.casper.js
// Create the CasperJS instance with info-level logging and the page settings below.
var casper = require("casper").create({
verbose: true,
logLevel: "info",
pageSettings: {
loadImages: false, //The script is much faster when this field is set to false
loadPlugins: false,
userAgent: "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36"
}
});
var utils = require('utils');
// Page counter, incremented by processPage after each click on the next link.
var currentPage = 1;
// Accumulates what processPage collects from every page.
var hospitals = [];
var url = 'https://www.axa-mandiri.co.id/direktori/rumah-sakit/';//Type your url
casper.start(url);//Start CasperJS
// processPage runs when #main-direktori appears; stopScript is the timeout callback.
casper.waitForSelector('#main-direktori', processPage, stopScript);//Wait until content loads and then process the page
// Start the run; when it completes, dump the collected data and exit.
casper.run(function() {
utils.dump(hospitals);
this.exit();
});
/**
 * Collects the name, phone and map link of every directory entry on the page.
 *
 * This function is meant to be passed to Casper's evaluate(), i.e. it runs in
 * the page context. It therefore uses browser globals only: script-side names
 * such as `utils` or `currentPage` are not defined there, and referencing them
 * aborts the function with a ReferenceError.
 *
 * @returns {string} JSON-encoded array of {name, phone, map} objects
 *                   ("[]" when no entry matches).
 */
function getDetails(){
    console.log("getDetails");
    var details = [];
    var direktoriNodeList = document.querySelectorAll("ul#main-direktori li.direktori-list");
    console.log("direktoriNodeList.length " + direktoriNodeList.length);
    for (var i = 0; i < direktoriNodeList.length; i++) {
        console.log("querySelectorAll " + i);
        var entry = direktoriNodeList[i];
        details.push({
            // Newlines are stripped from the text fields.
            name : entry.querySelector("div.details strong").textContent.replace(/\n/g, ''),
            phone : entry.querySelector("div.details span:nth-child(1)").textContent.replace(/\n/g, ''),
            map : entry.querySelector("div.map-details a.get-direction").getAttribute("href")
        });
    }
    return JSON.stringify(details);
}
/**
 * Dumps everything collected so far and terminates the run.
 *
 * Exits through the module-level `casper` instance instead of `this`, because
 * the function is used both as a waitForSelector timeout callback and as a
 * plain call from processPage, where `this` is not the Casper instance.
 */
function stopScript() {
    utils.dump(hospitals);
    console.log("Exiting...");
    casper.exit();
}
/**
 * Scrapes the current page, then either moves on to the next page or stops.
 *
 * Must be invoked with the Casper instance as `this` (it is passed as the
 * waitForSelector success callback).
 */
function processPage() {
    // Pass the function itself: evaluate() runs it inside the page. Calling
    // getDetails() here would execute it in the script context instead.
    var pageJson = this.evaluate(getDetails);
    if (pageJson) {
        // getDetails returns a JSON string; parse it so that `hospitals`
        // holds the entries themselves rather than one string per page.
        hospitals = hospitals.concat(JSON.parse(pageJson));
    }

    //If there is no nextButton on the page, then exit a script because we hit the last page
    if (!this.exists("a.nextpostslink")) {
        // Keep `this` bound for stopScript and do not fall through to the click.
        stopScript.call(this);
        return;
    }

    //Click on the next button
    this.thenClick("a.nextpostslink").then(function() {
        currentPage++;
        this.waitForSelector("#main-direktori", processPage, stopScript);
    });
}
casper.evaluate(fn, ...) expects a function, not the value that the function returns. Change
hospitals = hospitals.concat(this.evaluate(getDetails()));
to
hospitals = hospitals.concat(this.evaluate(getDetails));
The problem here is that you're executing the function in the outer context instead of passing it into the page context. Don't forget to register to the "remote.message" event to see console.log() calls from the page context:
// Echo every console message coming from the page context, prefixed with "remote> ".
casper.on("remote.message", function(pageMessage){
    var line = "remote> " + pageMessage;
    this.echo(line);
});

Categories

Resources