Scraping content from Google search results with request in Node.js

For my Node.js app I need to get the first page of Google search results, but from the .com domain, because I need the "People also search for" knowledge graph info, which only shows up on google.com.
I figured I can use the request and cheerio modules to scrape content from Google's search results page, but when I try to access the URL I need, i.e. https://www.google.com/search?gws_rd=ssl&site=&source=hp&q=google&oq=google, Google automatically redirects me to the .de domain (as I'm based in Germany).
I tried first loading the http://www.google.com/ncr URL, which switches off the country-specific redirect in browsers, but it didn't work...
Does anybody know what I could do differently to make it work?
Here's my code... Thank you!
var request = require("request");
var cheerio = require("cheerio");

function dataCookieToString(dataCookie) {
    var t = "";
    for (var x = 0; x < dataCookie.length; x++) {
        t += ((t != "") ? "; " : "") + dataCookie[x].key + "=" + dataCookie[x].value;
    }
    return t;
}

function mkdataCookie(cookie) {
    var t, j;
    cookie = cookie.toString().replace(/,([^ ])/g, ",[12],$1").split(",[12],");
    for (var x = 0; x < cookie.length; x++) {
        cookie[x] = cookie[x].split("; ");
        j = cookie[x][0].split("=");
        t = {
            key: j[0],
            value: j[1]
        };
        for (var i = 1; i < cookie[x].length; i++) {
            j = cookie[x][i].split("=");
            t[j[0]] = j[1];
        }
        cookie[x] = t;
    }
    return cookie;
}

var dataCookie = mkdataCookie('MC_STORE_ID=66860; expires=' + new Date(new Date().getTime() + 86409000));

request({
    uri: "https://www.google.com/ncr",
    headers: {
        'User-Agent': 'Mozilla/5.0',
        "Cookie": dataCookieToString(dataCookie)
    }
}, function(error, response, body) {
    request({
        uri: "https://www.google.com/search?gws_rd=ssl&site=&source=hp&q=google&oq=google",
        headers: {
            'User-Agent': 'Mozilla/5.0'
        }
    }, function(error, response, body) {
        console.log(body);
        var $ = cheerio.load(body);
        $(".kno-fb-ctx").each(function() {
            var link = $(this);
            var text = link.text();
            console.log(text);
        });
    });
});

Here's the solution: it's much easier than I thought.
However, I still have a problem: the body I get does not contain the content that only shows up when JavaScript is enabled.
Does anybody know how to modify the code below so that the JavaScript-rendered content is also included in the body?
var request = require('request');
var cheerio = require("cheerio");

request = request.defaults({jar: true});

var options = {
    url: 'http://www.google.com/ncr',
    headers: {
        'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; rv:1.9.2.16) Gecko/20110319 Firefox/3.6.16'
    }
};

request(options, function () {
    request('https://www.google.com/search?gws_rd=ssl&site=&source=hp&q=google&oq=google', function (error, response, body) {
        var $ = cheerio.load(body);
        $("li").each(function() {
            var link = $(this);
            var text = link.text();
            console.log(text);
        });
    });
});
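
request and cheerio only ever see the static HTML, so anything Google injects with client-side JavaScript will be missing from body. One common workaround is a headless browser. The following is only a minimal sketch, assuming puppeteer is installed (it is not part of the original code) and reusing the .kno-fb-ctx selector from the question:

var puppeteer = require('puppeteer'); // headless Chromium; assumed to be installed separately
var cheerio = require('cheerio');

(async function () {
    var browser = await puppeteer.launch();
    var page = await browser.newPage();
    // Load the same search URL; the page runs its JavaScript before we read it.
    await page.goto('https://www.google.com/search?gws_rd=ssl&q=google', { waitUntil: 'networkidle2' });
    var html = await page.content(); // fully rendered HTML, not just the static response
    await browser.close();

    var $ = cheerio.load(html);
    $('.kno-fb-ctx').each(function () {
        console.log($(this).text());
    });
})();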

Related

Write final JSON to a file from repeated requests to a REST API

I am trying to build a file of JSON data from repeated calls to a REST API. The final file to be written is the sum of the data received from all the calls. At present the file is written with the contents of the first call, then overwritten by the contents of the first + second call (see console output below the code).
As I have to make many calls, once the code is working I would like to write the file only after the requests have finished and the JSON string has been built. Does anyone know how I would go about doing this? Maybe with a callback(?), which I still don't have the hang of, that runs once the requests have finished or the JSON string has finished being built.
"use strict";
const fs = require('fs');
const request = require('request');
var parse = require('csv-parse');
const path = "../path tocsv.csv";
const pathJSON = "../pathtoJSON.json";
var shapes = "https://url";
var options = {
url: '',
method: 'GET',
accept: "application/json",
json: true,
};
var csvData = [];
var jsonData = "[";
fs.createReadStream(path)
.pipe(parse({delimiter: ','}))
.on('data', function(data) {
csvData.push(data[1]);
})
.on('end',function() {
var start = Date.now();
var records = csvData.length //2212 objects
console.log(records);
var dataLength = 2 //set low at moment
for (var i = 0; i < dataLength; i += 1) {
var url = shapes + csvData[i];
options.url = url; //set url query
request(options, function(error, response, body) {
var time = Date.now() - start;
var s = JSON.stringify(body.response);
console.log( '\n' + (Buffer.byteLength(s)/1000).toFixed(2)+
" kilobytes downloaded in: " + (time/1000) + " sec");
console.log(i)
buildJSON(s);
});
}
function buildJSON(s) {
var newStr = s.substring(1, s .length-1);
jsonData += newStr + ',';
writeFile(jsonData);
}
function writeFile(jsonData) {
fs.writeFile(pathJSON, jsonData, function(err) {
if (err) {
return console.log(err);
} else {
console.log("file complete")
}
});
}
});
128.13 kilobytes downloaded in: 2.796 sec
2
file complete
256.21 kilobytes downloaded in: 3.167 sec
2
file complete
Perhaps writing to the file after all requests are complete will help. In the current code, the writeFile function is called each time a request completes, which overwrites the file each time.
A quick way to fix this is to count requests (and failures) and write the file only after all the requests are complete.
"use strict";
const fs = require('fs');
const request = require('request');
var parse = require('csv-parse');
const path = "../path tocsv.csv";
const pathJSON = "../pathtoJSON.json";
var shapes = "https://url";
var options = {
url: '',
method: 'GET',
accept: "application/json",
json: true,
};
var csvData = [];
var jsonData = "[";
fs.createReadStream(path)
.pipe(parse({
delimiter: ','
}))
.on('data', function (data) {
csvData.push(data[1]);
})
.on('end', function () {
var start = Date.now();
var records = csvData.length //2212 objects
console.log(records);
var dataLength = 2 //set low at moment
var jsonsDownloaded = 0; // Counter to track complete JSON requests
var jsonsFailed = 0; // Counter to handle failed JSON requests
for (var i = 0; i < dataLength; i += 1) {
var url = shapes + csvData[i];
options.url = url; //set url query
request(options, function (error, response, body) {
if(error){
jsonsFailed++;
writeFile(jsonData);
return;
}
jsonsDownloaded++;
var time = Date.now() - start;
var s = JSON.stringify(body.response);
console.log('\n' + (Buffer.byteLength(s) / 1000).toFixed(2) +
" kilobytes downloaded in: " + (time / 1000) + " sec");
console.log(i)
buildJSON(s);
});
}
function buildJSON(s) {
var newStr = s.substring(1, s.length - 1);
jsonData += newStr + ',';
writeFile(jsonData);
}
function writeFile(jsonData) {
if(dataLength - (jsonsDownloaded + jsonsFailed) > 0){
return;
}
fs.writeFile(pathJSON, jsonData, function (err) {
if (err) {
return console.log(err);
} else {
console.log("file complete")
}
});
}
});
Note:
Firing requests in quick succession (e.g. 2000 requests in a for loop) does not work well in my experience; try batching them. Also, doing it this way does not guarantee order (if that is important in your use case).
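If you do need to make all of the calls, one way to batch them is to process the URL list in fixed-size chunks and only start the next chunk once the current one has finished. The sketch below builds on the same request module; the chunk size of 20 and the commented-out usage are assumptions for illustration, not part of the original code:

// Sketch: fire requests in batches of `batchSize`, then report once at the end.
function requestInBatches(urls, batchSize, done) {
    var results = [];

    function runBatch(startIndex) {
        if (startIndex >= urls.length) {
            return done(null, results); // every batch finished
        }
        var batch = urls.slice(startIndex, startIndex + batchSize);
        var pending = batch.length;

        batch.forEach(function (url) {
            request({ url: url, json: true }, function (error, response, body) {
                results.push(error ? { url: url, error: String(error) } : body);
                if (--pending === 0) {
                    runBatch(startIndex + batchSize); // start the next batch
                }
            });
        });
    }

    runBatch(0);
}

// Hypothetical usage: build the URLs from csvData, then write the file once.
// requestInBatches(csvData.map(function (c) { return shapes + c; }), 20, function (err, results) {
//     fs.writeFile(pathJSON, JSON.stringify(results), function (e) { if (e) console.log(e); });
// });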
An alternative would be to open your file in append mode. You can do this by passing an extra options object with the flag set to 'a' in your fs.writeFile call.
fs.writeFile(pathJSON, jsonData, {
    flag: 'a'
}, function (err) {
    if (err) {
        return console.log(err);
    }
});
References:
fs.writeFile Docs
File system flags

Node JS Http.get file from aspx

I'm attempting to download a PDF document from any of the 'download' buttons on this website using Node's Http module. How can I download the PDF document without downloading the aspx file instead, which is what is happening with my code? For some reason, my code downloads an aspx file that says 'Error Message - File does not exist or you do not have permission to view this file', even though I can easily download the file from my web browser. Here is my code:
var pdf_text = require("pdf-text");
var request = require("request");
var http = require("http");
var fs = require("fs");
var cheerio = require("cheerio");
var urllib = require("url");

var path = "final.pdf";
var url = "http://www2.nationalgrid.com/UK/Industry-information/System-charges/Electricity-transmission/Assistance-for-areas-with-high-distribution-costs/";
var links = [];

request(url, function(error, response, html) {
    if (!error && response.statusCode == 200) {
        var $ = cheerio.load(html);
        $(".txtLnk").each(function() {
            links.push("http://www2.nationalgrid.com" + $(this).attr("href"));
        });

        var file = fs.createWriteStream(urllib.parse(links[1]).pathname.split('/').pop());
        var options = {
            host: urllib.parse(links[1]).host,
            port: 80,
            path: urllib.parse(links[1]).pathname,
            headers: {
                "User-Agent": "Mozilla/5.0 (X11; Linux i686; rv:43.0) Gecko/201001101 Firefox/43.0"
            }
        };

        http.get(options, function(res) {
            res.on('data', function(data) {
                file.write(data);
            }).on('end', function() {
                file.end();
            });
        });

        console.log(links);
    }
});

function data_from_pdf(pdf) {
    pdf_text("pdf/" + pdf, function(err, chunks) {
        var data = chunks.join("").substring(chunks.join("").search("(p/kWh)") + 6, chunks.join("").search("(p/kWh)") + 21);
        var date = data.substring(0, data.indexOf("/") + 3);
        var rate = data.substring(data.indexOf("/") + 3);
        var json_data = "{" + "\n\tname: " + "final.pdf" + ",\n\tdate: " + date + ",\n\trate: " + rate + "\n}";
        return json_data;
    });
}
Turns out, if I just replace "options" with the base URL, it works. Strange. Problem solved. :)
Try this:
var request = require("request");
var fs = require("fs");
var cheerio = require("cheerio");
var path = "./final.pdf";
var url = "http://www2.nationalgrid.com/UK/Industry-information/System-charges/Electricity-transmission/Assistance-for-areas-with-high-distribution-costs/";
var links = [];
request(url, function(error, response, html) {
if(!error && response.statusCode == 200) {
var $ = cheerio.load(html);
$(".txtLnk").each(function() {
links.push("http://www2.nationalgrid.com" + $(this).attr("href"));
});
var r = request(links[0]);
r.on('response', function (res) {
console.log(res.headers);
res.pipe(fs.createWriteStream(path));
});
}
});
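
If the server still answers with an error page instead of the PDF, it can help to inspect the response before piping it to disk. This is just a sketch layered on the answer above; the status and content-type checks are assumptions about what the server returns, not something stated in the original answer:

var r = request(links[0]);
r.on('response', function (res) {
    // Only save the body if it actually looks like a PDF.
    if (res.statusCode !== 200 || (res.headers['content-type'] || '').indexOf('pdf') === -1) {
        console.log('Unexpected response:', res.statusCode, res.headers['content-type']);
        return;
    }
    res.pipe(fs.createWriteStream(path))
        .on('finish', function () {
            console.log('Saved PDF to ' + path);
        });
});
r.on('error', function (err) {
    console.log('Download failed: ' + err.message);
});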

Web Crawler - returning an array to be used in the next function

The first function I can confirm works correctly.
I'd like to return an array to the variable AtoZLinks so it can be used in the later function. I will make requests to each URL in the array and extract more information from within those links.
Many thanks in advance. I've been working on this as a project for some days; I'm a beginner in jQuery, web crawling, JS, Node.js and Express.js, and I've thrown myself in the deep end for work.
var express = require('express');
var request = require('request');
var cheerio = require('cheerio');
var router = express.Router();

var fullUrl;
fullUrl = [];

var AtoZLinks = function() {
    var url = 'http://example1.com';
    request(url, function(error, response, html) {
        if (!error && response.statusCode === 200) {
            var $ = cheerio.load(html);
            var fullUrl = [];
            var places = "Places";
            $('.clear a').each(function() {
                var link = $(this);
                link.each(function(index) {
                    var href = link.attr('href');
                    if (href.match(places)) {
                        // The urls from fullUrl here to be returned to parent variable.
                        fullUrl[index] = url + href; // Think something is wrong here... I've also tried "fullUrl.push(url + href);"
                        console.log(fullUrl); // This prints out all urls correctly
                    }
                });
            });
            for (var i = 0; i < fullUrl.length; i++) {
                console.log(fullUrl[i]);
            } // This code only prints out the last url stored (So I'm thinking the urls are being stored incorrectly...)
        }
    });
};

/* GET crawler page. */
router.get('/crawler', function(req, res, next) {
    AtoZLinks();
    next();
}, function(req, res) {
});

module.exports = router;

// Feel free to ignore the following work I've done or..
// Your support with the following function will be a bonus!
// I need to use the links in the previous array variable in the following
// function to extract further urls within those urls that I will work with.
var url = AtoZLinks;
request(AtoZLinks, function(error, response, html) {
    if (!error && response.statusCode === 200) {
        var $ = cheerio.load(html);
        // This selector is the code needed to extract the links from within the
        // links in the AtoZLinks array.
        $('div.fclist-section.clear.list-four').each(function() {
            $(this).find('a').each(function() {
                var link = $(this);
                var href = link.attr('href');
                fullUrl = url + href;
                console.log(fullUrl);
            });
        });
    }
});
Do you mean something like this?
var arrURLs;
arrURLs = [
    'www.ask.com',
    'www.google.com',
    'www.bing.com',
    'www.yahoo.com'
];

var AtoZLinks = function(theURLs) {
    for (var i = 0; i < theURLs.length; i++) {
        var url = theURLs[i];
        request(url, function(error, response, html) {
            if (!error && response.statusCode === 200) {
                var $ = cheerio.load(html);
                var fullUrl = [];
                var places = "Places";
                $('.clear a').each(function() {
                    var link = $(this);
                    link.each(function(index) {
                        var href = link.attr('href');
                        //absolute match
                        if (href === url) {
                            //true
                        } else {
                            //false
                        }
                        //href contains url
                        if (href.indexOf(url) > -1) {
                            //true
                        } else {
                            //false
                        }
                        if (href.match(places)) {
                            // The urls from fullUrl here to be returned to parent variable.
                            fullUrl.push(url + href);
                            console.log(JSON.stringify(fullUrl));
                        }
                    });
                });
            }
        });
    }
};

AtoZLinks(arrURLs);
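
Since request is asynchronous, AtoZLinks cannot simply return the array; it has to hand the array back once the response has arrived, for example through a callback. Below is a minimal sketch of that pattern built from the question's own code; the callback parameter and the second-stage handler are illustrative assumptions, not part of the original:

var AtoZLinks = function(callback) {
    var url = 'http://example1.com';
    request(url, function(error, response, html) {
        if (error || response.statusCode !== 200) {
            return callback(error || new Error('Bad status: ' + response.statusCode));
        }
        var $ = cheerio.load(html);
        var fullUrl = [];
        $('.clear a').each(function() {
            var href = $(this).attr('href');
            if (href && href.match("Places")) {
                fullUrl.push(url + href);
            }
        });
        callback(null, fullUrl); // hand the finished array to the caller
    });
};

// Usage: the second stage only runs once the array actually exists.
AtoZLinks(function(err, links) {
    if (err) { return console.log(err); }
    links.forEach(function(link) {
        request(link, function(error, response, html) {
            // ...extract the further urls from each page here...
        });
    });
});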

Memory leak in a simple API web service in Node.js

var url = require('url');
var http = require('http');
var downloader = require("./downloader");

http.createServer(onRequest).listen(8080);

function onRequest(request, response) {
    if (request.method == 'POST')
        handlePost(request, response);
    else
        handleGet(request, response);
}

function handlePost(request, response) {
    var data = '';
    request.on('data', function(chunk) {
        data += chunk.toString();
    });
    request.on('end', downloadTrainStatus);

    function downloadTrainStatus() {
        var downloadPromise = downloader.download(data);
        downloadPromise.then(function (responses) {
            var total = responses.length;
            var result = [];
            for (var i = 0; i < total; i++)
                result.push(JSON.parse(responses[i][1]));
            response.writeHead(200);
            response.write(JSON.stringify(result));
            response.end();
        }, function (err) {
            console.log(err);
            var result = { status: "error" };
            response.writeHead(200);
            response.write(JSON.stringify(result));
            response.end();
        })
    }
}

function handleGet(request, response) {
    console.log("GET request");
    response.writeHead(200);
    response.write("Get request works !!");
    response.end();
}
The above file, server.js, simply starts the web server. It uses the file downloader.js included below. downloader.js just makes 10 or 20 parallel web requests to a URL that returns a JSON response. After running this process for half a day, the memory usage of the process shoots up to 1.5 GB. Is there a memory leak in this code?
var Promise = require('bluebird')
var request = Promise.promisify(require('request'))

function download(json) {
    var requests = []
    var data = JSON.parse(json);
    for (var i = 0; i < data.total; i++) {
        var stationCode = stations[i].StationCode;
        var journeyDay = stations[i].JourneyDayCode;
        requests.push(downloadStatus());
    }
    return Promise.all(requests);
}

function downloadStatus() {
    var url = "http://google.com";
    var headers = {'User-Agent': 'Apache-HttpClient/UNAVAILABLE (java 1.4)'};
    var options = {
        url: url,
        headers: headers,
        timeout: 15 * 1000
    }
    return request(options);
}

module.exports.download = download;
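
No answer is recorded here, but one common way to keep memory bounded in this kind of downloader is to cap how many requests are in flight at once instead of starting them all and waiting on Promise.all. The sketch below uses bluebird's Promise.map with its concurrency option; the limit of 5 is an assumption, and whether unbounded concurrency is actually the cause of the growth seen above is not established:

var Promise = require('bluebird');
// multiArgs makes the promisified request resolve to [response, body] under bluebird 3.
var request = Promise.promisify(require('request'), { multiArgs: true });

function downloadLimited(urls) {
    // Run at most 5 requests at a time; the rest wait their turn.
    return Promise.map(urls, function (url) {
        return request({ url: url, timeout: 15 * 1000 });
    }, { concurrency: 5 });
}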

'Juggling Async' - Why does my solution not return anything at all?

After asking a question and getting a very helpful answer on what the 'Async Juggling' assignment in learnyounode was asking me to do, I set out to implement it myself.
The problem is, my setup isn't having any success! Even though I've referred to other solutions out there, it simply isn't returning any results when I run learnyounode verify myscript.js.
GIST: jugglingAsync.js
var http = require('http');

var app = (function () {
    // Private variables...
    var responsesRemaining,
        urls = [],
        responses = [];

    var displayResponses = function() {
        for (var iterator in responses) {
            console.log(responses[iterator]);
        }
    };

    // Public scope...
    var pub = {};

    pub.main = function (args) {
        responsesRemaining = args.length - 2;

        // For every argument, push a URL and prep a response.
        for (var i = 2; i < args.length; i++) {
            urls.push(args[i]);
            responses.push('');
        }

        // For every URL, set off an async request.
        for (var iterator in urls) {
            var i = iterator;
            var url = urls[i];
            http.get(url, function(response) {
                response.setEncoding('utf8');
                response.on('data', function(data) {
                    if (response.headers.host == url)
                        responses[i] += data;
                });
                response.on('end', function() {
                    if (--responsesRemaining == 0)
                        displayResponses();
                });
            });
        }
    };

    return pub;
})();

app.main(process.argv);
Question: What am I doing wrong?
This line
for(var iterator in urls) {
doesn't do what you think it does. It actually loops over the properties of urls (see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/for...in). Instead, you have to do something like
for (var i = 0; i < urls.length; i++) {
    var url = urls[i];
    ...
}
or
urls.forEach(function(url, index) {
    ...
});
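Applied to the request loop in the original pub.main, that might look like the following sketch (reusing the responses, responsesRemaining and displayResponses already defined above); it also drops the response.headers.host check, since forEach already ties each callback to its own url and index:

urls.forEach(function(url, index) {
    http.get(url, function(response) {
        response.setEncoding('utf8');
        response.on('data', function(data) {
            responses[index] += data;
        });
        response.on('end', function() {
            if (--responsesRemaining == 0)
                displayResponses();
        });
    });
});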
In addition to not properly looping through the arrays inside the app module, I was also not properly concatenating data returned from the response.on('data') event. Originally I was doing...
responses[index] += data;
Instead, the correct thing to do was:
responses[index] = responses[index] + data;
Changing that, as well as the things noted by @arghbleargh, got the 'Async Juggling' to fully verify!
I have tested my code and it all worked:
~ $ node juggling_async.js site1 site2 site3 site4 ...
The JS code is not limited to only three sites.
var http = require('http');

// Process all the site names from the arguments and store them in sites[].
// This way does not limit the count to only 3 sites.
var sites = [];
(function loadSites() {
    for (var i = 2, len = process.argv.length; i < len; ++i) {
        var site = process.argv[i];
        if (site.substr(0, 7) != 'http://') site = 'http://' + site;
        sites.push(site);
    }
})();

var home_pages = [];
var count = 0;

function httpGet(index) {
    var home_page = '';
    var site = sites[index];
    http.get(site, function(res) {
        res.setEncoding('utf8');
        res.on('data', function(data) {
            home_page += data;
        });
        res.on('end', function() {
            ++count;
            home_pages[index] = home_page;
            if (count == sites.length) {
                // Yahoo! We have reached the last one.
                for (var i = 0; i < sites.length; ++i) {
                    console.log('\n############ Site #' + (+i + 1) + ': ' + sites[i]);
                    console.log(home_pages[i]);
                    console.log('============================================\n');
                }
            }
        });
    })
    .on('error', function(e) {
        console.log('Error at loop index ' + index + ': ' + e.message);
    });
}

for (var i = 0; i < sites.length; ++i) {
    httpGet(i);
}
