How can I scrape sites that require authentication using node.js? - javascript

I've come across many tutorials explaining how to scrape public websites that don't require authentication/login, using node.js.
Can somebody explain how to scrape sites that require login using node.js?

Use Mikeal's Request library, you need to enable cookies support like this:
var request = request.defaults({jar: true})
So you first should create a username on that site (manually) and pass the username and the password as params when making the POST request to that site. After that the server will respond with a cookie which Request will remember, so you will be able to access the pages that require you to be logged into that site.
Note: this approach doesn't work if something like reCaptcha is used on the login page.

I've been working with NodeJs Scrapers for more than 2 years now
I can tell you that the best choice when dealing with logins and authentication is to NOT use direct request
That is because you just waste time on building manual requests and it is way slower,
Instead, use a high lever browser that you control via an API like Puppeteer or NightmareJs
I have a good starter and in-depth guide on How to start scraping with Puppeteer, I'm sure it will help!

Or using superagent:
var superagent = require('superagent')
var agent = superagent.agent();
agent is then a persistent browser, which will handle getting and setting cookies, referers, etc. Just agent.get, agent.post() as normal.

You can scrape the data from sites that require authentication like csrf token.
Using the cookies for each request like this:
var j = request.jar(); // this is to set the jar of request for session and cookie persistence
request = request.defaults({ jar: j }); //here we are setting the default cookies of request
Here is small code to elaborate it further:
var express = require('express');
var bodyParser = require('body-parser');
var querystring = require('querystring');
var request = require('request'); //npm request package to send a get and post request to a url
const cheerio = require('cheerio'); //npm package used for scraping content from third party sites
var cookieParser = require('cookie-parser')
var http = require('http');
var app = express();
app.use(cookieParser());
var _csrf; //variable to store the _csrf value to be used later
app.use(bodyParser.json());
var html = '';
var j = request.jar(); // this is to set the jar of request for session and cookie persistence
request = request.defaults({ jar: j }); //here we are setting the default cookies of request
//___________________API CALL TO VERIFY THE GMS NUMBER_______________________
app.get('/check', function(req, response) {
var schemeId = null;
if (req.query.schemeId) {
schemeId = req.query.schemeId;
console.log(schemeId);
} else {
response.send('false');
response.end();
}
getCsrfValue(function(err, res) {
if (!err) {
_csrf = res;
console.log(_csrf);
request.post({
headers: {
'Authorization': '',
'Content-Type': 'application/x-www-form-urlencoded',
},
uri: 'https://www.xyz.site',
body: "schemeId=" + schemeId + "&_csrf=" + _csrf
}, function(err, res, body) {
if (err) {
console.log(err);
} else {
console.log("body of post: " + res.body);
const $ = cheerio.load(body.toString());
var txt = $('.schemeCheckResult').text();
console.log(txt);
if (txt) {
response.send('true');
} else {
response.send('false');
}
html += body;
}
});
} else {
response.send(err);
}
})
});
//______________FUNCTION TO SCRAPE THE CSRF TOKEN FROM THE SITE____________
function getCsrfValue(callback) {
request.get({
headers: {
'Authorization': '',
'Content-Type': 'application/x-www-form-urlencoded',
},
uri: 'https://www.xyz.site'
}, function(err, res, body) {
if (err) {
return callback(err);
} else {
const $ = cheerio.load(body.toString());
var txt = $('input[name=_csrf]').val();
_csrf = txt;
return callback(null, _csrf);
}
});
}
module.exports = app;

Related

Check if header value match

Is it possible to check the header value in Node.js? I would like to create a route that can be accessed only if the user supplies a header and its value matches what is coded. For example, suppose that route expect a header like AccessKey: 12345 so it checks if there is such a header containing such value and if it doesn't match it throws an error. I tried to use something like res.hasHeader() like this:
app.route('/rest/api/here').get((req, res) => {
if (res.hasHeader('AccessKey', '12345')){
res.send('test')
} else {
res.send('Header value doesn\'t match')
}
})
but it only checks if the header exists itself and doesn't check if the value match. Application is mostly for educational purposes so this approach is acceptable, if possible.
I would recommend using a library to parse the result, here is a complete example with comments to explain the parts. As the #Heretic Monkey pointed, out the token is on the request object, but thats not the approach i would use.
// this is standard set of imports in app generated by
// express --no-view
// from package npm i -g express-generator
var express = require('express');
var path = require('path');
var cookieParser = require('cookie-parser');
var logger = require('morgan');
// token 'parsing' library
var bearer = require('express-bearer-token');
// more boilerplate
var app = express();
app.use(logger('dev'));
app.use(express.json());
app.use(express.urlencoded({ extended: false }));
app.use(cookieParser());
app.use(express.static(path.join(__dirname, 'public')));
// look for the key in headers: { Authorization: AccessKey <your key> }
// this library also has options for query, body, etc...
// https://www.npmjs.com/package/express-bearer-token
app.use(bearer({ headerKey: 'AccessKey' }));
// if present and what you wanted, proceed, else, fail
var protect = (req, res, next) => (
(req.token && req.token === '12345')
? next()
: next(new Error('bad token'))
);
// example protected (can protect a whole router with router.use(protect))
app.get('/protected', protect, (r, s) => s.json({ data: 'api' }));
// example not protected
app.get('/example', (r, s) => s.json({ not: 'protected' }));
// make sure to status 500 to make axios client throw
app.use((error, r, s, n) => s
.status(500)
.json({ error: (error + '') }));
// run the server
var server = app.listen(3000);
// client code (axios works in browser same exact api)
var axios = require('axios');
// wait until server started
setTimeout(async function() {
// you will get status 500 without key on protected route
try {
await axios.get('http://localhost:3000/protected');
console.log('nope, wont see me print')
} catch (e) {
console.log('error for protected no token:', e.response.data.error);
}
// non protected works as expected
var example = await axios.get('http://localhost:3000/example');
console.log('got example data fine: ', example.data);
// for protected, need to supply header
var protected = await axios({
method: 'get',
url: 'http://localhost:3000/protected',
headers: { Authorization: 'AccessKey 12345' }
});
console.log('got protected data fine w/tok: ', protected.data);
// wait for server to shut down then exit the program
server.close(() => console.log('bye'))
}, 500);

HTTPS Redirect in next js

I'm trying to do a http to https redirect using next,
so if the user enters the site at http://www.example.com redirect to https://www.example.com
I'm using GraphQL Yoga on the server side, so not sure how I could accomplish this in my index file on the server side.
I've tried using the meta tag and changing the protocol in the window object but no luck with doing so in server-side rendering.
Is there any way I can accomplish this redirect on the client side using next js or on the server side?
const cookieParser = require('cookie-parser')
const jwt = require('jsonwebtoken')
require('dotenv').config({path: '.env'})
const createServer = require('./createServer')
const db = require('./db')
const sslRedirect = require('heroku-ssl-redirect');
const server = createServer()
//Express middleware to handle cookies
server.express.use(cookieParser())
//decode JWT
server.express.use((req, res, next) => {
const { token } = req.cookies;
if (token) {
const { userId } = jwt.verify(token, process.env.APP_SECRET);
req.userId = userId;
}
next()
})
//Populates user on request
server.express.use(async (req, res, next) => {
if(!req.userId) return next()
const user = await db.query.user({
where: {id: req.userId}
}, `{id, permissions, email, name}`)
req.user = user
next()
})
//start
server.start({
cors: {
credentials: true,
origin: process.env.FRONTEND_URL
},
}, starting => {
console.log(`server is running on port ${starting.port}`)
})
What I have done in the past is started both a HTTP and a HTTPS server with express.
The HTTPS is the server with all the routes\API's configured.
The HTTP server simply to redirects all GET requests to HTTPS. See the following code which could be used setup the HTTP server to do the redirect.
let httpRedirectServer = express();
// set up a route to redirect http to https
httpRedirectServer.get('*', (request, response) => {
response.redirect('https://' + request.headers.host + request.url);
});
httpRedirectServer.listen(80);
httpRedirectServer.on('listening', () => {
console.log("Listening to redirect http to https");
});
Alternatively on the client side a quick fix is to redirect in javascript by running something like.
// Check the URL starts with 'http://xxxxx' protocol, if it does then redirect to 'https://xxxxx' url of same resource
var httpTokens = /^http:\/\/(.*)$/.exec(window.location.href);
if(httpTokens) {
window.location.replace('https://' + httpTokens[1]);
}

Node proxy server modify response after query in database

I am troubling with nodejs proxy server modified(write) response.
I want to achieve auto login for one site via node proxy server and for that i have to query in database then i can modified response but it seems req ended before req.write and getting Error: write after end
Below is my implementation so far.
var express = require('express');
var proxy = require('http-proxy-middleware');
var options = {
target: 'http://example.com/', // target host
changeOrigin: true,
onProxyReq: function onProxyReq(proxyReq, req, res) {
var _write = res.write;
var body = "";
proxyReq.on('data', function(data) {
data = data.toString('utf-8');
body += data;
});
res.write = function (data) {
try{
//I have database query here instead of setTimeout
setTimeout(function(){
/* Modified response here and write */
_write.call(res, data); //can't write because req already end
},3000);
} catch (err) {
console.log('err',err);
}
}
}
}
// create the proxy (without context)
var exampleProxy = proxy(options);
// mount `exampleProxy` in web server
var app = express();
app.use('/', exampleProxy);
app.listen(8080);
Can anyone guide me how to achieve this ?

How to use a node.js back-end server to interact with the front-end and a mongolab database?

I am building a website with a simple jquery/html/css front-end and a node.js server back-end. If my front-end has a function to request a user's information from the server like so:
function requestUser(email, password) {
xmlhttp = new XMLHttpRequest();
xmlhttp.open("GET", "http://localhost:8888/getUser/" + email + "/" + password, true);
xmlhttp.onreadystatechange = function() {
if (xmlhttp.readyState == 4 && xmlhttp.status == 200) {
console.log(xmlhttp.responseText);
}
}
xmlhttp.send();
}
and my node server looks like this:
var http = require("http"),
mongojs = require("mongojs"),
fs = require("fs"),
url = require("url");
express = require("express")
var server = http.createServer(requestHandler);
server.listen(8888);
var uri = "mongodb://<dbuser>:<dbpassword>#ds036698.mongolab.com:36698/alirodatabase";
var db = mongojs(uri, ["Papers", "Users"]);
console.log("node server running back end of app");
function requestHandler(request, response) {
//request for user is .../getUser/<username>/<password>
var path = url.parse(request.url).pathname;
var details = path.split('/');
if(details.indexOf("getUser") != -1) {
console.log("recieved request for user");
var user = db.Users.find({"email": details[details.indexOf("getUser") + 1],
"password": details[details.indexOf("getUser") + 2]});
user = user.toArray[0];
response.writeHead(200, {"Content-Type": "text/json"});
response.write(JSON.stringify(user));
}
else {
fs.readFile("./index.html", function(err, file) {
if(err) {
return
}
response.writeHead(200, {"Content-Type": "text/html"});
response.end(file, "utf-8");
});
}
}
why isn't it working? I get a 'mixed content' and/or 'corss-origin' error from firefox when I try to request from the server. How can I have the node server running in the same domain as the rest of the site to avoid these errors?
is really hard to read your code, I understand what you are trying to do, but let me suggest first a better structure easier to read, understand and implement more routes for your server, please check here:
var express = require('express'),
cors = require('cors'),
app = express();
app.use(cors());
app.get('/getUser/:user/:passwd', function(req, res, next) {
// Perform all mongo operations here using req.params.user and req.params.passwd
// and in the callback send a response like the object below
res.json({
msg: 'This is CORS-enabled for all origins!',
user: req.params.user,
passwd: req.params.passwd
});
});
app.listen(8888, function() {
console.log('CORS-enabled web server listening on port 8888');
});
Also the lack of CORS support (https://developer.mozilla.org/en-US/docs/Web/HTTP/Access_control_CORS) as you will need this in your use case for if you are planning to host serve static files consuming this service hosted in a different server, so lets use this module: https://www.npmjs.com/package/cors and it will allow express to process a request from anywhere.

nodejs web scraper for password protected website

I am trying to scrape a website using nodejs and it works perfectly on sites that do not require any authentication. But whenever I try to scrape a site with a form that requires username and password I only get the HTML from the authentication page (that is, if you would click 'view page source' on the authentication page it self, that is the HTML I get). I am able to get the desired HTML using curl
curl -d "username=myuser&password=mypw&submit=Login" URL
Here is my code...
var express = require('express');
var fs = require('fs'); //access to file system
var request = require('request');
var cheerio = require('cheerio');
var app = express();
app.get('/scrape', function(req, res){
url = 'myURL'
request(url, function(error, response, html){
// check errors
if(!error){
// Next, we'll utilize the cheerio library on the returned html which will essentially give us jQuery functionality
var $ = cheerio.load(html);
var title, release, rating;
var json = { title : "", release : "", rating : ""};
$('.span8 b').filter(function(){
// Let's store the data we filter into a variable so we can easily see what's going on.
var data = $(this);
title = data.first().text();
release = data.text();
json.title = title;
json.release = release;
})
}
else{
console.log("Error occurred: " + error);
}
fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
console.log('File successfully written! - Check your project directory for the output.json file');
})
res.send('Check your console!')
})
})
app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;
I have tried the following...
var request = require('request',
username:'myuser',
password:'mypw');
This just returns the authentication page's HTML
request({form: {username:myuser, password:mypw, submit:Login}, url: myURL}, function(error, response, html){
...
...
...
}
This also just returns the authentication page's HTML
So my question is how do I achieve this using nodejs?
you shouldn't use .get but .post and put the post param (username and password) in your call
request.post({
headers: {'content-type' : 'application/x-www-form-urlencoded'},
url: url,
body: "username=myuser&password=mypw&submit=Login"
}, function(error, response, html){
//do your parsing...
var $ = cheerio.load(html)
});

Categories

Resources