nodejs web scraper for password protected website

I am trying to scrape a website using Node.js, and it works perfectly on sites that do not require any authentication. But whenever I try to scrape a site behind a form that requires a username and password, I only get back the HTML of the authentication page itself (that is, the HTML you would see by clicking 'view page source' on the login page). I am able to get the desired HTML using curl:
curl -d "username=myuser&password=mypw&submit=Login" URL
Here is my code...
var express = require('express');
var fs = require('fs'); // access to the file system
var request = require('request');
var cheerio = require('cheerio');
var app = express();

app.get('/scrape', function(req, res){
    var url = 'myURL';

    request(url, function(error, response, html){
        var json = { title: "", release: "", rating: "" };

        // check for errors
        if(!error){
            // Next, we'll utilize the cheerio library on the returned html,
            // which essentially gives us jQuery functionality
            var $ = cheerio.load(html);
            var title, release, rating;

            $('.span8 b').filter(function(){
                // Store the data we filter into a variable so we can easily see what's going on
                var data = $(this);
                title = data.first().text();
                release = data.text();
                json.title = title;
                json.release = release;
            });
        }
        else {
            console.log("Error occurred: " + error);
        }

        fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
            console.log('File successfully written! - Check your project directory for the output.json file');
        });

        res.send('Check your console!');
    });
});

app.listen('8081');
console.log('Magic happens on port 8081');

exports = module.exports = app;
I have tried the following...
var request = require('request').defaults({
    username: 'myuser',
    password: 'mypw'
});
This just returns the authentication page's HTML
request({
    form: { username: myuser, password: mypw, submit: Login },
    url: myURL
}, function(error, response, html){
    ...
});
This also just returns the authentication page's HTML
So my question is how do I achieve this using nodejs?

You shouldn't use .get but .post, and you need to put the POST params (username and password) in your call:
request.post({
    headers: { 'content-type': 'application/x-www-form-urlencoded' },
    url: url,
    body: "username=myuser&password=mypw&submit=Login"
}, function(error, response, html){
    // do your parsing...
    var $ = cheerio.load(html);
});
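Equivalently, request can URL-encode the fields for you via its form option, which also sets the content-type header automatically. A sketch of the same call:
request.post({
    url: url,
    form: { username: 'myuser', password: 'mypw', submit: 'Login' }
}, function(error, response, html){
    // request builds the urlencoded body and header from the form object
    var $ = cheerio.load(html);
});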

Related

node.js: The POST method in my REST API does not work

I started learning Node.js and Express.js, and I'm trying to create a simple API that lists data from a JSON file (using the GET method) and adds a new user using the POST method.
The GET method works fine, but the POST method does not.
When I request http://127.0.0.1:8080/listusers, the API sends all the users in a JSON file.
When I request http://127.0.0.1:8080/adduser, the API has to add the new user info and send the new data back to the browser.
NOTE: I read all the questions on Stack Overflow about this problem, but none of them helped me, so I have to ask again.
The problem is that when I request http://127.0.0.1:8080/adduser I get the following error:
Cannot GET /adduser
Here is the server.js:
var express = require('express');
var app = express();
var fs = require('fs');

var user = {
    "user4": {
        "name": "mounir",
        "password": "password4",
        "profession": "teacher",
        "id": 4
    }
};

app.post('/adduser', function (req, res) {
    // First read existing users.
    fs.readFile(__dirname + "/users.json", 'utf8', function (err, data) {
        data = JSON.parse(data);
        data["user4"] = user["user4"];
        console.log(data);
        res.end(JSON.stringify(data));
    });
});

app.get('/listusers', function (req, res) {
    fs.readFile(__dirname + "/users.json", 'utf8', function (err, data) {
        console.log(data);
        res.end(data);
    });
});

var server = app.listen(8080, function () {
    var host = server.address().address;
    var port = server.address().port;
    console.log("listening at http://%s:%s", "0.0.0.0", port);
});
The answer is in the error: Cannot GET /adduser. Keyword: GET! You are making a GET request (entering a URL in the browser's address bar issues a GET), but the route only accepts POST. Be sure you make a POST request, with a body and the appropriate headers, not a GET request. For instance, if you are using fetch:
const myInit = {
    method: 'POST',
    headers: myHeaders,
    body: {
        ...
    }
};

fetch("http://127.0.0.1:8080/adduser", myInit)
    .then(res => {
        ...
    });
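A related gap in server.js: the /adduser handler never reads the request body, so whatever the client posts is ignored in favor of the hard-coded user object. A minimal sketch of wiring that up, assuming the body-parser middleware and a JSON request body:
var bodyParser = require('body-parser');
app.use(bodyParser.json()); // parses JSON request bodies into req.body

app.post('/adduser', function (req, res) {
    fs.readFile(__dirname + "/users.json", 'utf8', function (err, data) {
        var users = JSON.parse(data);
        users["user4"] = req.body; // use the user the client actually sent
        res.end(JSON.stringify(users));
    });
});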

Node.js app with python script

I created a Python script that parses a website (IMDB) and organizes the result into a dataframe.
I also have a Node.js app that lets me find a variable (a movie ID based on the movie name, called pyvar in the code) that I want to include in the Python script. So how can I pass this variable from the JavaScript app into the Python script, run the script, and then send the result back to the Node.js app? (The result would be the dataframe converted to, let's say, JSON.)
Node.js app
var express = require("express");
var app = express();
var request = require("request");

app.set("view engine", "ejs");

app.get("/", function(req, res){
    res.render("search");
});

app.get("/results", function(req, res){
    var query = req.query.search;
    var url = "http://www.omdbapi.com/?s=" + query + "&apikey=thewdb";
    request(url, function(error, response, body){
        if(!error && response.statusCode == 200){
            var data = JSON.parse(body);
            res.render("results", {data: data});
            var pyvar = data["Search"][0]["imdbID"];
        }
    });
});

app.listen(process.env.PORT, process.env.IP, function(){
    console.log("Movie App has started!!!");
});
The Python script, in a nutshell, looks like this:
url = 'website.org/' + pyvar + '/blah'
parse(url)
return dataframe
After that I would send the dataframe in some form back to the Node.js app and display the results, or even better, let the user download the dataframe converted to xlsx, but that might be too complicated.
You can use child_process's spawn to execute your Python script, as Felix Kling suggests in his comment, and return its result to your Node.js app. Then you could use a package like node-xlsx to transform the data into an Excel file.
Something like this:
app.js
// ...
const { spawn } = require('child_process');
const xlsx = require('node-xlsx');
// ...

app.get("/results", (req, res) => {
    let query = req.query.search;
    let url = "http://www.omdbapi.com/?s=" + query + "&apikey=thewdb";

    request(url, (error, response, body) => {
        if (!error && response.statusCode == 200) {
            let data = JSON.parse(body);
            let pyvar = data["Search"][0]["imdbID"];

            // Call the python script
            let pythonScript = spawn('./script.py', [pyvar]);
            pythonScript.stdout.on('data', data => {
                // Transform the data into an xlsx sheet
                // (renamed from xlsx to avoid shadowing the imported module)
                let sheet = xlsx.build([{ name: "myXlsxSheet", data: data.toString() }]);

                // And send the file
                res.end(Buffer.from(sheet, 'binary'));
            });
        }
    });
});
// ...
script.py
#!/usr/bin/python
import sys
import pandas

pyvar = sys.argv[1]

# Here goes the script that parses the website
url = 'website.org/' + pyvar + '/blah'
data = parse(url)
print(pandas.DataFrame(data))
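Note that spawning ./script.py directly relies on the shebang line and the file's execute bit. If that fails, invoking the interpreter explicitly is a safe alternative, for example:
// Run the script through the interpreter instead of relying on +x.
let pythonScript = spawn('python', ['./script.py', pyvar]);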

REST API node.js

I am trying to retrieve data from a REST API on the server side (.js) and display it in my view (.jade).
I was able to get the data, but I was not able to send it to the view.
This is what my code looks like:
var BugData = 'initial data';
var https = require('https');

var optionsget = {
    rejectUnauthorized: false,
    host: 'My host', // here only the domain name (no http/https!)
    port: 443,
    path: 'Mypath', // the rest of the url, with parameters if needed
    method: 'GET' // do GET
};

console.info('Options prepared:');
console.info(optionsget);
console.info('Do the GET call');

// do the GET request
var reqGet = https.request(optionsget, function(res) {
    console.log("statusCode: ", res.statusCode);
    res.on('data', function(d) {
        console.info('GET result:\n');
        BugData = d;
        console.log('Show Data : ***** \n' + d);
    });
});

reqGet.end();
reqGet.on('error', function(e) {
    console.error(e);
});

res.render('index', { ab: BugData });
BugData (defined earlier) is the variable I am trying to send to the view, but for some reason it is empty and does not contain the variable d.
Does anyone know why, or how I can solve this?
Thanks
There is no need to write that long code. The root issue, by the way, is that res.render runs before the asynchronous GET has delivered any data, so BugData still holds its initial value at render time.
Keep it simple and follow these steps:
1) install request package:
npm install --save request
2) outside of the router, add:
var request = require('request');
process.env.NODE_TLS_REJECT_UNAUTHORIZED = 0;
3) use this code inside the router:
request.get({ url: 'https://my-host/Mypath' }, function(err, response, body) {
    var data = {};
    if (err) {
        console.error(err);
        data.err = err;
    }
    data.ab = body;
    console.log('Data: ', data);
    res.render('index', data);
});
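For reference, the original https.request version can also work if the render happens inside the response handlers: accumulate the chunks and render once on 'end'. A sketch of that fix, assuming the code runs inside an Express route handler whose response object is res:
var reqGet = https.request(optionsget, function(apiRes) {
    var body = '';
    apiRes.on('data', function(d) { body += d; }); // collect every chunk
    apiRes.on('end', function() {
        res.render('index', { ab: body }); // render only when the data is complete
    });
});
reqGet.on('error', function(e) { console.error(e); });
reqGet.end();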

HTTP request from within Express/Node.js

I'm trying to set up an Express service for a program that I'm writing that contacts an external API and then returns the results so they can be stored in a Mongo database I have set up.
This seems like it should be fairly straightforward, but I'm new to Node.js/Express and I'm getting a "Can't set headers after they are sent" error.
I'm getting the data that I want from the external API, but how do I send that data properly back to my Angular app.js so it can update my table?
addSelected() is the function I'm calling in my app.js to kick off the process. The data prints part of the way through the full response but then cuts off and gives me the "Can't set headers after they are sent" error. From what I understand, this comes from sending the response and then trying to modify the response header after the fact, but I'm unsure of a workaround, or whether I'm just formatting everything wrong, as this is my first swing at the MEAN stack in general.
I know the problem is on the line res.send(data) in server.js, but I don't know how to correctly format the response.
My code:
server.js
// server.js
// setup ==============================
var express = require('express');
var request = require('request');
var app = express();
var mongoose = require('mongoose');
var https = require('https');

// config ============================
app.use(express.static(__dirname + '/public/'));
console.log("running PipeHelper");
mongoose.connect('mongoedit');

var Schema = mongoose.Schema;
var opSchema = new Schema({
    title: String,
    description: String,
    company: String,
    post_date: String,
    close_date: String,
    contact: String,
    location: String,
    url: String,
    notice_type: String
});
var item = mongoose.model('item', opSchema);

// routes ===========================
// returns full database
app.get('/api/db', function(req, res){
    item.find({}, function(err, items){
        if (err) res.send(err);
        res.json(items);
    });
});

// searches FBO for opportunities to add to database
app.get('/api/search:FBO_key', function(req, res){
    var data;
    console.log("2");
    var baseURL = "api.data.gov";
    var params = "/gsa/fbopen/v0/opps?q=" + req.params.FBO_key;
    params += "&api_key=" + "keyyyy";
    params += "&all";
    params += "&start=0";
    params += "&p=1";
    params += "&limit=10";
    url = baseURL + params;

    var options = {
        port: 443,
        host: 'api.data.gov',
        path: params,
        method: 'GET'
    };

    // get FBO data
    var request = https.request(options, function(response){
        console.log("4");
        response.on('data', function (chunk){
            // response data to send back to app.js
            data += chunk.toString();
            res.send(data);
        });
    });
    console.log("3");
    request.end();
    request.on('error', function(e){
        console.error(e);
    });
});

app.get('/', function(req, res){
    res.sendfile('./public/index.html');
});

app.listen(8000);
app.js
var app = angular.module("pipeHelper", ['smart-table']);

app.controller('mainCtrl', [
    '$scope', '$http', function($scope, $http){
        $scope.selected = [];
        $scope.displayData = [];
        $scope.items = [];
        $scope.FBOsearch;

        // populates table on startup with whole DB
        $http.get('./api/db')
            .success(function(data){
                $scope.items = data;
                $scope.displayData = [].concat($scope.items);
            })
            .error(function(data){
                console.log('Error: ' + data);
            });

        $scope.addSelected = function(){
            // search FBO, add opportunities, update table
            console.log("1");
            $http.get('./api/search' + 'NGA')
                .success(function(data){
                    console.log("5");
                    console.log(data);
                    $scope.items = data;
                    $scope.displayData = [].concat($scope.items);
                })
                .error(function(data){
                    console.log('Error: ' + data);
                });
        };

        $scope.isSelected = function(item){
            // if it's selected, remove it; if it's unselected, add it
            if ($scope.selected.indexOf(item) == -1){
                $scope.selected.push(item);
            }
            else {
                $scope.selected.splice($scope.selected.indexOf(item), 1);
            }
            console.log($scope.selected);
            // temp placeholder function. Eventually add to array of selected
            // objects for placement in Pipeliner/deletion
        };
    }]);
Solved the issue. I was unaware that response.on('data') gets called multiple times, so my res.send(data) was being called multiple times with incomplete data, causing the crash with that error. I added the following to the request function:
response.on('end', function(){
    res.send(data);
});
Basically, when the external API data is finished coming in, send it with Express. Learn by doing, I suppose. Hope this helps someone eventually.
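One more detail worth fixing: data is declared in server.js without an initial value, so the first data += chunk.toString() concatenates onto the string "undefined". Initialize it as an empty string:
var data = ''; // start empty so += doesn't begin with "undefined"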
I can't leave a comment, so I will just make it an answer.
I would recommend installing node-inspector (npm install -g node-inspector), then running your app with node-debug server.js. This will spawn a browser instance with dev tools attached and allow you to debug your Node.js code. Very useful.
The error you are seeing is most likely related to request.end(), if I were to guess. After .end() is called, you can no longer modify the header content. I doubt it would make a difference, but try putting the request.end() after the request.on('error') call.
EDIT: 10/15/15
I would highly recommend installing VS Code. It has a built-in debugger for Node apps.

How can I scrape sites that require authentication using node.js?

I've come across many tutorials explaining how to scrape public websites that don't require authentication/login, using node.js.
Can somebody explain how to scrape sites that require login using node.js?
Use Mikeal's request library; you need to enable cookie support like this:
var request = request.defaults({jar: true})
First, you should create a username on that site (manually) and pass the username and password as params when making the POST request to the site. After that the server will respond with a cookie, which request will remember, so you will be able to access the pages that require you to be logged into that site.
Note: this approach doesn't work if something like reCAPTCHA is used on the login page.
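Putting those pieces together, here is a minimal sketch, assuming a cookie-based session; the login URL and form field names are hypothetical, so adjust them for the real site:
var request = require('request').defaults({ jar: true }); // enable the cookie jar
var cheerio = require('cheerio');

request.post({
    url: 'https://example.com/login', // hypothetical login endpoint
    form: { username: 'myuser', password: 'mypw' }
}, function(error, response, body) {
    // The jar now holds the session cookie, so this GET is authenticated.
    request('https://example.com/members-only', function(error, response, html) {
        var $ = cheerio.load(html);
        console.log($('title').text());
    });
});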
I've been working with Node.js scrapers for more than 2 years now, and I can tell you that the best choice when dealing with logins and authentication is NOT to use direct requests.
That is because you just waste time building manual requests, and it is way slower.
Instead, use a high-level browser that you control via an API, like Puppeteer or NightmareJS.
I have a good starter and in-depth guide on how to start scraping with Puppeteer; I'm sure it will help!
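To give a flavor of the approach, here is a minimal Puppeteer login sketch. The URL and selectors are assumptions; inspect the real login form to find yours:
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto('https://example.com/login'); // hypothetical login page

    // Hypothetical selectors; replace them with the site's real ones.
    await page.type('#username', 'myuser');
    await page.type('#password', 'mypw');
    await Promise.all([
        page.waitForNavigation(), // wait for the post-login redirect
        page.click('button[type=submit]'),
    ]);

    const html = await page.content(); // HTML of the logged-in page
    console.log(html);
    await browser.close();
})();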
Or use superagent:
var superagent = require('superagent')
var agent = superagent.agent();
agent is then a persistent browser that handles getting and setting cookies, referers, etc. Just use agent.get() and agent.post() as normal.
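For example, a sketch assuming a form-based login at a hypothetical URL:
var superagent = require('superagent');
var agent = superagent.agent(); // persists cookies across requests

agent
    .post('https://example.com/login') // hypothetical login URL
    .type('form') // send as application/x-www-form-urlencoded
    .send({ username: 'myuser', password: 'mypw' })
    .then(function () {
        // The agent now carries the session cookie.
        return agent.get('https://example.com/protected');
    })
    .then(function (res) {
        console.log(res.text);
    });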
You can also scrape data from sites whose authentication involves a CSRF token.
Use a cookie jar for the requests, like this:
var j = request.jar(); // set request's jar for session and cookie persistence
request = request.defaults({ jar: j }); // here we are setting the default cookies of request
Here is a small example to elaborate further:
var express = require('express');
var bodyParser = require('body-parser');
var querystring = require('querystring');
var request = require('request'); // npm request package, used to send GET and POST requests to a url
const cheerio = require('cheerio'); // npm package used for scraping content from third-party sites
var cookieParser = require('cookie-parser');
var http = require('http');

var app = express();
app.use(cookieParser());

var _csrf; // variable to store the _csrf value to be used later
app.use(bodyParser.json());
var html = '';

var j = request.jar(); // set request's jar for session and cookie persistence
request = request.defaults({ jar: j }); // here we are setting the default cookies of request

//___________________API CALL TO VERIFY THE GMS NUMBER_______________________
app.get('/check', function(req, response) {
    var schemeId = null;
    if (req.query.schemeId) {
        schemeId = req.query.schemeId;
        console.log(schemeId);
    } else {
        response.send('false');
        response.end();
    }

    getCsrfValue(function(err, res) {
        if (!err) {
            _csrf = res;
            console.log(_csrf);
            request.post({
                headers: {
                    'Authorization': '',
                    'Content-Type': 'application/x-www-form-urlencoded',
                },
                uri: 'https://www.xyz.site',
                body: "schemeId=" + schemeId + "&_csrf=" + _csrf
            }, function(err, res, body) {
                if (err) {
                    console.log(err);
                } else {
                    console.log("body of post: " + res.body);
                    const $ = cheerio.load(body.toString());
                    var txt = $('.schemeCheckResult').text();
                    console.log(txt);
                    if (txt) {
                        response.send('true');
                    } else {
                        response.send('false');
                    }
                    html += body;
                }
            });
        } else {
            response.send(err);
        }
    });
});

//______________FUNCTION TO SCRAPE THE CSRF TOKEN FROM THE SITE____________
function getCsrfValue(callback) {
    request.get({
        headers: {
            'Authorization': '',
            'Content-Type': 'application/x-www-form-urlencoded',
        },
        uri: 'https://www.xyz.site'
    }, function(err, res, body) {
        if (err) {
            return callback(err);
        } else {
            const $ = cheerio.load(body.toString());
            var txt = $('input[name=_csrf]').val();
            _csrf = txt;
            return callback(null, _csrf);
        }
    });
}

module.exports = app;
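Since this file only exports the app, a small launcher is needed to actually run it. A sketch, with the filename and port as assumptions:
// run.js: hypothetical launcher for the exported Express app
var app = require('./app');
app.listen(3000, function() {
    console.log('CSRF scraper listening on port 3000');
});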
