Problems with web scraping using PhantomJS? - javascript

I am trying to scrape HTML from a webpage using PhantomJS, but I am getting a strange error. When I run the script once, I get the correct response, but when I try again, I get no response.
It looks like it's loading forever. I don't want to call phantom.exit() because that stops the server. So, what could I be doing wrong?
var page = require('webpage').create(),
    server = require('webserver').create();

var service = server.listen(8003, function (request, response) {
    console.log('Request received at ' + new Date());
    // TODO: parse `request` and determine where to go
    page.open('https://www.sportpesa.co.ke/?sportId=1&league=76080&leagueName=la%20liga&top=0', function() {
        setTimeout(function() {
            var test = page.plainText;
            console.log(page.plainText);
            response.write(page.plainText);
            response.close();
            page.close();
        });
    });
});

I think your problem is that you are closing the page after the first request and then attempting to reuse it. Per the docs, you should not do this:
"Close the page and releases the memory heap associated with it. Do not use the page instance after calling this."
Try recreating the page object on each request, like so:
var webpage = require('webpage'),
    server = require('webserver').create();

var service = server.listen(8003, function (request, response) {
    console.log('Request received at ' + new Date());
    // create a fresh page for every request
    var page = webpage.create();
    // TODO: parse `request` and determine where to go
    page.open('https://www.sportpesa.co.ke/?sportId=1&league=76080&leagueName=la%20liga&top=0', function() {
        setTimeout(function() {
            var test = page.plainText;
            console.log(page.plainText);
            response.write(page.plainText);
            response.close();
            page.close();
        });
    });
});

Related

How to stream tweets from specific accounts

I'm using the twitter npm package in an attempt to stream tweets from specified accounts.
I'm having trouble navigating the Twitter API docs, so I'm a little confused.
I can hit the REST endpoint to get the specified user's tweets with:
var client = new Twitter({});
client.get('statuses/user_timeline', { screen_name: user }, function(error, tweets) {
    if (error) throw error;
    console.log(tweets);
});
How do I stream the tweets? Is it even possible? If not, how else could I accomplish this? I would like this to be as responsive and immediate as possible.
Figured it out...
var stream = client.stream('statuses/filter', { follow: userId });
stream.on('data', function(event) {
    console.log(event && event.text);
});
This client stream reader will display the tweets automatically as they are posted.
Unfortunately, the user's screen_name cannot be used here, only the numeric userId, so you'll have to find that out beforehand.
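If you only know the screen_name, one way to look up the numeric id first is the REST users/show endpoint. This is just a sketch using the same twitter client; the screen_name value here is the user variable from the question's snippet:
client.get('users/show', { screen_name: user }, function(error, profile) {
    if (error) throw error;
    var userId = profile.id_str; // use the string id to avoid precision issues
    var stream = client.stream('statuses/filter', { follow: userId });
    stream.on('data', function(event) {
        console.log(event && event.text);
    });
});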
I wrote the code below and was able to get the last 20 tweets of a user and save them in a file, log.txt.
var Twitter = require('twitter');
var fs = require("fs");

// Your keys go inside {}
var client = new Twitter({});

// These are the parameters for the API call
var params = {
    screen_name: 'damiengold', // use damiengold instead of @damiengold
    count: 20
};

// Perform the API call and return data
client.get('statuses/user_timeline.json', params, function(error, tweets, response) {
    var dataToFile;
    if (!error) {
        for (var i = 0; i < tweets.length; i++) {
            console.log(tweets[i].created_at + " " + tweets[i].text);
            console.log("-------------------------------");
            dataToFile = tweets[i].created_at + " " + tweets[i].text + "\n-----------\n";
            // This block of code will append tweets to the file called "log.txt".
            fs.appendFile("log.txt", dataToFile, function(err) {
                // If the code experiences any errors it will log the error to the console.
                if (err) {
                    return console.log(err);
                }
            });
        }
    }
    console.log("Last 20 tweets are now displayed on the screen and saved in log.txt.");
});

nodejs: node-http-proxy and harmon: rewriting the HTML response from the endpoint instead of the 302 redirected response

I'm using nodejs with node-http-proxy along with harmon. I am using harmon to rewrite the proxied response to include a javascript file and a css file. When I set the target of the proxy to be http://nodejs.org or anything other than localhost, I receive a 301 or 302 redirect. The script is rewriting the 301 response instead of the fully proxied response. How can I use harmon to rewrite the end response instead of the 302 response?
Here is the example of the script I am running from the harmon example folder:
var http = require('http');
var connect = require('connect');
var httpProxy = require('http-proxy');

var selects = [];
var simpleselect = {};

// <img id="logo" src="/images/logo.svg" alt="node.js">
simpleselect.query = 'img';
simpleselect.func = function (node) {
    // Create a read/write stream with the "outer" option
    // so we get the full tag and we can replace it
    var stm = node.createStream({ "outer": true });

    // variable to hold all the info from the data events
    var tag = '';

    // collect all the data in the stream
    stm.on('data', function (data) {
        tag += data;
    });

    // When the read side of the stream has ended..
    stm.on('end', function () {
        // Print out the tag; you can also parse it or regex it if you want
        process.stdout.write('tag: ' + tag + '\n');
        process.stdout.write('end: ' + node.name + '\n');

        // Now on the write side of the stream write some data using .end()
        // N.B. if end isn't called it will just hang.
        stm.end('<img id="logo" src="http://i.imgur.com/LKShxfc.gif" alt="node.js">');
    });
};
selects.push(simpleselect);

//
// Basic Connect App
//
var app = connect();

var proxy = httpProxy.createProxyServer({
    target: 'http://nodejs.org'
});

app.use(require('../')([], selects, true));
app.use(
    function (req, res) {
        proxy.web(req, res);
    }
);
The problem is that a lot of sites now redirect HTTP to HTTPS, and nodejs.org is one of those.
I have updated the sample https://github.com/No9/harmon/blob/master/examples/doge.js to show how http-proxy needs to be configured to deal with HTTPS.
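Roughly, that configuration looks like the sketch below; the option names are standard node-http-proxy options, but see the linked doge.js for the authoritative version:
var https = require('https');

var proxy = httpProxy.createProxyServer({
    target: 'https://nodejs.org',  // https target instead of http
    agent: https.globalAgent,      // use an HTTPS agent for the upstream connection
    changeOrigin: true             // rewrite the Host header to match the target
});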
If you still have problems with other arbitrary redirects please log an issue on harmon.
Thanks

How to stop CasperJS execution and let the user input some value and then continue to execute?

I'm using PhantomJS and CasperJS to automate some of my tasks. In one of the tasks, I need to manually provide captcha strings before I can actually work on the task. For this problem, what I can think of is to capture a screenshot of the web page, then manually check the captured image and save the captcha string into a text file. After that I can use the file system module in CasperJS to read that value and continue the process. I want to know the best way to do this kind of task.
Because of the structured manner/control flow of CasperJS compared to PhantomJS, such a task is not easy.
1. Pull approach (file polling)
Let's say there is a secondary program (type 1) which handles showing the CAPTCHA, receiving the input and writing a text file with the CAPTCHA input. All that CasperJS can handle is to write the CAPTCHA screenshot to disk and wait for the file with the "parsed" text.
var fs = require("fs"),
captchaFile = "cfile.png",
parsedFile = "pfile.txt";
casper.waitForCaptcha = function(captchaFile, parsedFile){
casper.then(function(){
this.captureSelector(captchaFile, "someSelectorOfTheCaptcha");
});
casper.waitFor(function check(){
return fs.exists(parsedFile);
}, function then(){
// do something on time
// check if correct...
if (!correct) {
fs.remove(captchaFile);
fs.remove(parsedFile);
this.waitForCaptcha(captchaFile, parsedFile);
// Problem: the secondary process needs to sense that a new CAPTCHA is presented
}
}, function onTimeout(){
// do something when failed
}, 60000); // 1min should suffice as a timeout
return this;
};
casper.start(url).waitForCaptcha(captchaFile, parsedFile).run();
This code assumes that you want to retry when the CAPTCHA was wrong, but not when the minute passed without the decoded file appearing. It is a pull approach: polling to see whether the files are there yet.
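For completeness, the secondary program (type 1) could be something as simple as the following plain Node.js sketch (hypothetical, not from the original answer): it polls for the screenshot, asks a human for the text on stdin, and writes the parsed file that the CasperJS side is waiting for.
var fs = require("fs");
var readline = require("readline");

var captchaFile = "cfile.png",
    parsedFile = "pfile.txt",
    asking = false;

setInterval(function () {
    // a new CAPTCHA is pending when the screenshot exists but no answer does yet
    if (!asking && fs.existsSync(captchaFile) && !fs.existsSync(parsedFile)) {
        asking = true;
        var rl = readline.createInterface({ input: process.stdin, output: process.stdout });
        rl.question("Open " + captchaFile + " and type the CAPTCHA text: ", function (answer) {
            fs.writeFileSync(parsedFile, answer.trim());
            rl.close();
            asking = false;
        });
    }
}, 2000);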
2. Push approach
A push process is also possible, where the secondary program (type 2) sends requests to the CasperJS process by utilizing the PhantomJS webserver module. Because there will be two concurrent control flows, the CasperJS part needs to wait for a long time, but as soon as a request with the decoded words is received, the waiting can be broken with unwait.
var server = require('webserver').create(),
    fs = require("fs"),
    captchaFile = "cfile.png";

function neverendingWait(){
    this.wait(5000, neverendingWait);
}

casper.checkCaptcha = function(captchaFile, phantomPort, secondaryPort){
    // here the CAPTCHA is saved to disk, but it can also be set directly if captured through casper.captureBase64
    this.captureSelector(captchaFile, "someSelectorOfTheCaptcha");

    // send a request to the secondary program from the page context
    // (secondaryPort must be passed in explicitly, because evaluate() runs in the sandboxed page context)
    this.evaluate(function(file, port){
        __utils__.sendAJAX("http://localhost:" + port + "/", "POST", {file: file}, true);
    }, captchaFile, secondaryPort);

    // start the server to receive solved CAPTCHAs
    server.listen(phantomPort, {
        'keepAlive': true
    }, function (request, response) {
        console.log('Request received at ' + new Date());
        if (request.post) { // is there a response?
            this.then(function(){
                // check if it is correct by reading request.post ...
                if (!correct){
                    response.statusCode = 404;
                    response.headers = {
                        'Cache': 'no-cache',
                        'Content-Type': 'text/plain;charset=utf-8'
                    };
                    response.close();
                    server.close();
                    this.checkCaptcha(captchaFile, phantomPort, secondaryPort);
                } else {
                    response.statusCode = 200;
                    response.headers = {
                        'Cache': 'no-cache',
                        'Content-Type': 'text/plain;charset=utf-8'
                    };
                    response.close();
                    server.close();
                    this.unwait(); // abort the neverendingWait
                }
            });
        } else {
            response.statusCode = 404;
            response.headers = {
                'Cache': 'no-cache',
                'Content-Type': 'text/plain;charset=utf-8'
            };
            response.close();
            server.close();
            this.checkCaptcha(captchaFile, phantomPort, secondaryPort);
        }
    });
    return this;
};

casper.start(url).then(function(){
    this.checkCaptcha(captchaFile, 8080, 8081);
}).then(neverendingWait).then(function(){
    // Do something here when the captcha is successful
}).run();
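Again for completeness, here is a hypothetical sketch of the secondary program (type 2) in plain Node.js: it listens on the secondary port (8081 above) for the CAPTCHA file name, asks a human for the text, and then POSTs the answer back to the PhantomJS webserver (8080 above).
var http = require("http");
var querystring = require("querystring");
var readline = require("readline");

http.createServer(function (req, res) {
    var body = "";
    req.on("data", function (chunk) { body += chunk; });
    req.on("end", function () {
        res.end("ok");
        var file = querystring.parse(body).file; // e.g. "cfile.png"
        var rl = readline.createInterface({ input: process.stdin, output: process.stdout });
        rl.question("Open " + file + " and type the CAPTCHA text: ", function (answer) {
            rl.close();
            // push the solved text to the CasperJS/PhantomJS server
            var post = querystring.stringify({ text: answer.trim() });
            var push = http.request({
                host: "localhost", port: 8080, method: "POST", path: "/",
                headers: {
                    "Content-Type": "application/x-www-form-urlencoded",
                    "Content-Length": Buffer.byteLength(post)
                }
            });
            push.end(post);
        });
    });
}).listen(8081);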

Accessing JSON data when PhantomJS onResourceReceived is triggered (ESPN fantasy football draft app)

So I'm trying to write a hook into ESPN fantasy football's HTML lite draft page to cross-reference player ranking lists (from a CSV file) and eliminate already-drafted players from the available pool. I've done this by hand in the past, but with a 16-team draft it's nearly impossible to keep up in the late rounds, since by then no one really knows who the players are.
I'm very much a Javascript and PhantomJS newbie, so please don't laugh.
At this point, I can see the page.onResourceReceived metadata in my console as the AJAX polls the PhantomJS instance. But I can't figure out how to access the data actually being received by the "browser". According to the "Preview" tab in Chrome's Network inspector, either a time-sync signal or the data for the player who was just drafted is being sent to the browser in JSON format.
Long story short, how do I get the actual JSON data when I receive the page.onResourceReceived metadata?
(P.S. I know I commented out phantom.exit(); that's to keep the script from terminating after the redirect and onLoad is complete--I need to keep it running to listen for the draft updates)
var draft = 'http://games.espn.go.com/ffl/htmldraft?leagueId=1246633&teamId=8&fromTeamId=8';
var draftURL = encodeURIComponent(draft);
var page = require('webpage').create(),
    server = 'https://r.espn.go.com/espn/memberservices/pc/login',
    data = 'SUBMIT=1&failedLocation=&aff_code=espn_fantgames&appRedirect=' + draftURL + '&cookieDomain=.go.com&multipleDomains=true&username=[redacted]&password=[redacted]&submit=Sign+In';

page.onResourceReceived = function (response) {
    console.log('Response (#' + response.id + ', stage "' + response.stage + '"): ' + JSON.stringify(response));
};

page.open(server, 'post', data, function (status) {
    if (status !== 'success') {
        console.log('Unable to post!');
    } else {
        page.render('example.png');
        //console.log(page.content)
    }
    //phantom.exit();
});
The following version of your script will just grab and return the entire contents of the URL you are accessing. I don't think you are really going to get useful JSON data, just an HTML page, unless I'm missing something. In my tests, all I get is HTML:
var draft = 'http://games.espn.go.com/ffl/htmldraft?leagueId=1246633&teamId=8&fromTeamId=8';
var draftURL = encodeURIComponent(draft);
var page = require('webpage').create(),
    server = 'https://r.espn.go.com/espn/memberservices/pc/login',
    data = 'SUBMIT=1&failedLocation=&aff_code=espn_fantgames&appRedirect=' + draftURL + '&cookieDomain=.go.com&multipleDomains=true&username=[redacted]&password=[redacted]&submit=Sign+In';

page.open(server, 'post', data, function (status) {
    if (status == 'success') {
        var delay, checker = function() {
            var html = page.evaluate(function () {
                return document.getElementsByTagName('html')[0].outerHTML;
            });
            if (html) {
                clearInterval(delay); // stop polling once the markup has been retrieved
                console.log(html);
                phantom.exit();
            }
        };
        delay = setInterval(checker, 100);
    } else {
        phantom.exit();
    }
});
Currently, PhantomJS doesn't include the response body in onResourceReceived events.
You could instead use SlimerJS, which mirrors the PhantomJS API but does allow you to access response.body (which should contain the JSON data). Example here:
http://darrendev.blogspot.jp/2013/11/saving-downloaded-files-in-slimerjs-and.html
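A minimal sketch of what that looks like in SlimerJS, assuming the captureContent option described in the linked post (only bodies whose content type matches the regexp are buffered into response.body):
var page = require('webpage').create();

// only keep response bodies whose Content-Type matches this regexp
page.captureContent = [/json/];

page.onResourceReceived = function (response) {
    if (response.stage === 'end' && response.body) {
        try {
            var data = JSON.parse(response.body);
            console.log('JSON from ' + response.url + ': ' + JSON.stringify(data));
        } catch (e) {
            // not JSON after all; ignore it
        }
    }
};

page.open('http://games.espn.go.com/ffl/htmldraft?leagueId=1246633&teamId=8&fromTeamId=8');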
Alternatively, you could write a Chrome extension and create a content script that grabs the data.
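A rough sketch of that content-script idea (hypothetical names throughout): content scripts run in an isolated world, so you inject a page-level script that wraps XMLHttpRequest and relays response bodies back via window.postMessage.
// injected into the page so it runs in the page's own context
var inject = function () {
    var origOpen = XMLHttpRequest.prototype.open;
    XMLHttpRequest.prototype.open = function () {
        this.addEventListener('load', function () {
            window.postMessage({ source: 'draft-sniffer', body: this.responseText }, '*');
        });
        return origOpen.apply(this, arguments);
    };
};

var script = document.createElement('script');
script.textContent = '(' + inject.toString() + ')();';
document.documentElement.appendChild(script);

// back in the content script: listen for the relayed bodies
window.addEventListener('message', function (event) {
    if (event.data && event.data.source === 'draft-sniffer') {
        console.log('captured response body:', event.data.body);
    }
});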

Node.js copy remote file to server

Right now I'm using this script in PHP. I pass it the image and size (large/medium/small), and if it's on my server it returns the link; otherwise it copies the image from a remote server and then returns the local link.
function getImage($img, $size) {
    if (@filesize("./images/".$size."/".$img.".jpg")) {
        return './images/'.$size.'/'.$img.'.jpg';
    } else {
        copy('http://www.othersite.com/images/'.$size.'/'.$img.'.jpg', './images/'.$size.'/'.$img.'.jpg');
        return './images/'.$size.'/'.$img.'.jpg';
    }
}
It works fine, but I'm trying to do the same thing in Node.js and I can't seem to figure it out. The fs module doesn't seem to be able to interact with any remote servers, so I'm wondering if I'm just messing something up, or if it can't be done natively and a module is required.
Does anyone know of a way to do this in Node.js?
You should check out http.Client and http.ClientResponse. Using those you can make a request to the remote server and write out the response to a local file using fs.WriteStream.
Something like this:
var http = require('http');
var fs = require('fs');

var google = http.createClient(80, 'www.google.com');
var request = google.request('GET', '/', {'host': 'www.google.com'});
request.end();

var out = fs.createWriteStream('out');

request.on('response', function (response) {
    response.setEncoding('utf8');
    response.on('data', function (chunk) {
        out.write(chunk);
    });
});
I haven't tested that, and I'm not sure it'll work out of the box. But I hope it'll guide you to what you need.
To give a more up-to-date version (the answer above is four years old, and http.createClient is now deprecated), here is a solution using the request module:
var fs = require('fs');
var request = require('request');

function getImage(img, size, filesize) {
    var imgPath = size + '/' + img + '.jpg';
    if (filesize) {
        return './images/' + imgPath;
    } else {
        request('http://www.othersite.com/images/' + imgPath).pipe(fs.createWriteStream('./images/' + imgPath));
        return './images/' + imgPath;
    }
}
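Note that the request package has itself since been deprecated; if you want to avoid extra dependencies entirely, a minimal sketch using only Node's built-in https and fs modules could look like this (the URL and paths are placeholders):
var https = require('https');
var fs = require('fs');

function downloadImage(url, dest, callback) {
    var file = fs.createWriteStream(dest);
    https.get(url, function (response) {
        response.pipe(file);
        file.on('finish', function () {
            file.close(callback); // close() is async; signal completion afterwards
        });
    }).on('error', function (err) {
        fs.unlink(dest, function () {}); // remove the partial file
        callback(err);
    });
}

downloadImage('https://www.othersite.com/images/large/foo.jpg', './images/large/foo.jpg', function (err) {
    if (err) console.log(err);
});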
If for some reason you can't use the remote user's password and need to use an identity key (RSA) for authentication, then programmatically executing scp with child_process works well:
const { exec } = require('child_process');

exec(`scp -i /path/to/key username@example.com:/remote/path/to/file /local/path`,
    (error, stdout, stderr) => {
        if (error) {
            console.log(`There was an error: ${error}`);
        }
        console.log(`The stdout is ${stdout}`);
        console.log(`The stderr is ${stderr}`);
    });
