I'm attempting to download a PDF document from one of the 'download' buttons on this website using Node's http module. Instead of the PDF, my code downloads an .aspx file that says 'Error Message - File does not exist or you do not have permission to view this file', even though I can download the file from my web browser without any trouble. How can I download the PDF document itself? Here is my code:
var pdf_text = require("pdf-text");
var request = require("request");
var http = require("http");
var fs = require("fs");
var cheerio = require("cheerio");
var urllib = require("url");
var path = "final.pdf";
var url = "http://www2.nationalgrid.com/UK/Industry-information/System-charges/Electricity-transmission/Assistance-for-areas-with-high-distribution-costs/";
var links = [];
request(url, function(error, response, html) {
    if (!error && response.statusCode == 200) {
        var $ = cheerio.load(html);
        $(".txtLnk").each(function() {
            links.push("http://www2.nationalgrid.com" + $(this).attr("href"));
        });
        var file = fs.createWriteStream(urllib.parse(links[1]).pathname.split('/').pop());
        var options = {
            host: urllib.parse(links[1]).host,
            port: 80,
            path: urllib.parse(links[1]).pathname,
            headers: {
                "User-Agent": "Mozilla/5.0 (X11; Linux i686; rv:43.0) Gecko/201001101 Firefox/43.0"
            }
        };
        http.get(options, function(res) {
            res.on('data', function(data) {
                file.write(data);
            }).on('end', function() {
                file.end();
            });
        });
        console.log(links);
    }
});
function data_from_pdf(pdf) {
    pdf_text("pdf/" + pdf, function(err, chunks) {
        var data = chunks.join("").substring(chunks.join("").search("(p/kWh)") + 6, chunks.join("").search("(p/kWh)") + 21);
        var date = data.substring(0, data.indexOf("/") + 3);
        var rate = data.substring(data.indexOf("/") + 3);
        var json_data = "{" + "\n\tname: " + "final.pdf" + ",\n\tdate: " + date + ",\n\trate: " + rate + "\n}";
        return json_data;
    });
}
Turns out, if I just replace the options object with the plain URL, it works. Strange. Problem solved. :)
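For reference, a minimal sketch of that change, reusing the file stream and links array from the code above:

// Passing the URL string directly to http.get instead of the options object
http.get(links[1], function(res) {
    res.pipe(file); // stream the response straight into the write stream
});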
Try this:
var request = require("request");
var fs = require("fs");
var cheerio = require("cheerio");
var path = "./final.pdf";
var url = "http://www2.nationalgrid.com/UK/Industry-information/System-charges/Electricity-transmission/Assistance-for-areas-with-high-distribution-costs/";
var links = [];
request(url, function(error, response, html) {
    if (!error && response.statusCode == 200) {
        var $ = cheerio.load(html);
        $(".txtLnk").each(function() {
            links.push("http://www2.nationalgrid.com" + $(this).attr("href"));
        });
        var r = request(links[0]);
        r.on('response', function(res) {
            console.log(res.headers);
            res.pipe(fs.createWriteStream(path));
        });
    }
});
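A likely cause of the original failure, though it isn't confirmed in the thread: the legacy url.parse() exposes both pathname (the path alone) and path (the path plus query string), and the original options object used pathname, so any query parameters on the download link were silently dropped. If an options object is preferred over the plain URL, a sketch along these lines should preserve the query string (assuming links[1] is the full download URL):

var parsed = urllib.parse(links[1]);
var options = {
    host: parsed.host,
    port: 80,
    path: parsed.path, // 'path' includes the query string; 'pathname' does not
    headers: {
        "User-Agent": "Mozilla/5.0 (X11; Linux i686; rv:43.0) Gecko/20100101 Firefox/43.0"
    }
};
http.get(options, function(res) {
    res.pipe(fs.createWriteStream("final.pdf"));
});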
When I run the code below in the console, like myFunc.myfunc("SomeText"), it creates the bucket, but when I call the same function from a button it doesn't create anything. Sometimes it does, but I have to wait around 20-30 minutes before it works again.
I am using Node.js on the client side, Browserify to make require() work on the client side, and cors-anywhere for the CORS policy.
JS File-
// Load the SDK and UUID
function uploadToS3(dataForBody) {
    var AWS = require('aws-sdk'); // Browserify will be used and bundled in uploadDocsBundle
    var uuid = require('uuid');
    AWS.config.accessKeyId = 'my access key id';
    AWS.config.secretAccessKey = 'my secret access id';
    AWS.config.region = 'eu-west-1';
    // Create an S3 client
    var s3 = new AWS.S3();
    // Create a bucket and upload something into it
    var bucketName = 'node-sdk-sample-' + uuid.v4();
    var keyName = 'hello_world.txt';
    s3.createBucket({ Bucket: bucketName }, function() {
        var params = {
            Bucket: bucketName, Key: keyName, Body: dataForBody
        };
        s3.putObject(params, function(err, data) {
            if (err)
                console.log(err);
            else
                console.log("Successfully uploaded data to " + bucketName + "/" + keyName);
        });
    });
}

(function() {
    var cors_api_host = 'cors-anywhere.herokuapp.com';
    var cors_api_url = 'https://' + cors_api_host + '/';
    var slice = [].slice;
    var origin = window.location.protocol + '//' + window.location.host;
    var open = XMLHttpRequest.prototype.open;
    XMLHttpRequest.prototype.open = function() {
        var args = slice.call(arguments);
        var targetOrigin = /^https?:\/\/([^\/]+)/i.exec(args[1]);
        if (targetOrigin && targetOrigin[0].toLowerCase() !== origin &&
            targetOrigin[1] !== cors_api_host) {
            args[1] = cors_api_url + args[1];
        }
        return open.apply(this, args);
    };
})();
======================================================
HTML File
<button type="submit" onClick="myFunc.myfunc('hello')" class="btn btn-primary">Submit</button>
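One thing worth checking, though it isn't confirmed in the question: a button with type="submit" inside a form triggers a form submission and page navigation, which can tear the page down before the asynchronous S3 calls complete. A minimal sketch of the usual workaround, keeping everything else the same:

<!-- type="button" avoids the implicit form submission and navigation -->
<button type="button" onClick="myFunc.myfunc('hello')" class="btn btn-primary">Submit</button>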
Edit2:-
=======
// Load the SDK and UUID
module.exports = { myfunc: uploadToS3 };

function uploadToS3(dataForBody) {
    var AWS = require('aws-sdk'); // Browserify will be used and bundled in uploadDocsBundle
    var uuid = require('uuid');
    AWS.config.accessKeyId = 'AccessKey';
    AWS.config.secretAccessKey = 'SecretAccessID';
    AWS.config.region = 'eu-west-1';
    // Create a bucket and upload something into it
    var bucketName = 'node-sdk-sample-' + uuid.v4();
    var keyName = 'hello_world2.txt';
    var bucketPromise = new AWS.S3().createBucket({ Bucket: bucketName }).promise();
    // Handle promise fulfilled/rejected states
    bucketPromise.then(
        function(data) {
            // Create params for putObject call
            var objectParams = { Bucket: bucketName, Key: keyName, Body: 'Hello World!' };
            // Create object upload promise
            var uploadPromise = new AWS.S3().putObject(objectParams).promise();
            uploadPromise.then(
                function(data) {
                    console.log("Successfully uploaded data to " + bucketName + "/" + keyName);
                });
        }).catch(
        function(err) {
            console.error(err, err.stack);
        });
}

(function() {
    var cors_api_host = 'cors-anywhere.herokuapp.com';
    var cors_api_url = 'https://' + cors_api_host + '/';
    var slice = [].slice;
    var origin = window.location.protocol + '//' + window.location.host;
    var open = XMLHttpRequest.prototype.open;
    XMLHttpRequest.prototype.open = function() {
        var args = slice.call(arguments);
        var targetOrigin = /^https?:\/\/([^\/]+)/i.exec(args[1]);
        if (targetOrigin && targetOrigin[0].toLowerCase() !== origin &&
            targetOrigin[1] !== cors_api_host) {
            args[1] = cors_api_url + args[1];
        }
        return open.apply(this, args);
    };
})();
I am trying to build a file of JSON data from repeated calls to a REST API. The final file to be written is the sum of the data received from all the calls. At present the file is written with the contents of the first call, then overwritten by the contents of the first + second call (see console output below the code).
As I have to make many calls, once the code is working I would like to write the file only once the requests have finished and the JSON string has been built. Does anyone know how I would go about doing this? Maybe with a callback (which I still don't have the hang of) that runs once the requests have finished or the JSON string has finished being built.
"use strict";
const fs = require('fs');
const request = require('request');
var parse = require('csv-parse');
const path = "../path tocsv.csv";
const pathJSON = "../pathtoJSON.json";
var shapes = "https://url";
var options = {
    url: '',
    method: 'GET',
    accept: "application/json",
    json: true,
};
var csvData = [];
var jsonData = "[";

fs.createReadStream(path)
    .pipe(parse({delimiter: ','}))
    .on('data', function(data) {
        csvData.push(data[1]);
    })
    .on('end', function() {
        var start = Date.now();
        var records = csvData.length; // 2212 objects
        console.log(records);
        var dataLength = 2; // set low at the moment
        for (var i = 0; i < dataLength; i += 1) {
            var url = shapes + csvData[i];
            options.url = url; // set url query
            request(options, function(error, response, body) {
                var time = Date.now() - start;
                var s = JSON.stringify(body.response);
                console.log('\n' + (Buffer.byteLength(s)/1000).toFixed(2) +
                    " kilobytes downloaded in: " + (time/1000) + " sec");
                console.log(i);
                buildJSON(s);
            });
        }
        function buildJSON(s) {
            var newStr = s.substring(1, s.length - 1);
            jsonData += newStr + ',';
            writeFile(jsonData);
        }
        function writeFile(jsonData) {
            fs.writeFile(pathJSON, jsonData, function(err) {
                if (err) {
                    return console.log(err);
                } else {
                    console.log("file complete");
                }
            });
        }
    });
128.13 kilobytes downloaded in: 2.796 sec
2
file complete
256.21 kilobytes downloaded in: 3.167 sec
2
file complete
Perhaps writing to the file after all requests are complete will help. In the current code, the writeFile function is called each time a request is completed (which overwrites the file each time)
A quick way to fix this is to count requests (and failures) and write to file only after all the requests are complete.
"use strict";
const fs = require('fs');
const request = require('request');
var parse = require('csv-parse');
const path = "../path tocsv.csv";
const pathJSON = "../pathtoJSON.json";
var shapes = "https://url";
var options = {
    url: '',
    method: 'GET',
    accept: "application/json",
    json: true,
};
var csvData = [];
var jsonData = "[";

fs.createReadStream(path)
    .pipe(parse({
        delimiter: ','
    }))
    .on('data', function (data) {
        csvData.push(data[1]);
    })
    .on('end', function () {
        var start = Date.now();
        var records = csvData.length; // 2212 objects
        console.log(records);
        var dataLength = 2; // set low at the moment
        var jsonsDownloaded = 0; // Counter to track complete JSON requests
        var jsonsFailed = 0; // Counter to handle failed JSON requests

        for (var i = 0; i < dataLength; i += 1) {
            var url = shapes + csvData[i];
            options.url = url; // set url query
            request(options, function (error, response, body) {
                if (error) {
                    jsonsFailed++;
                    writeFile(jsonData);
                    return;
                }
                jsonsDownloaded++;
                var time = Date.now() - start;
                var s = JSON.stringify(body.response);
                console.log('\n' + (Buffer.byteLength(s) / 1000).toFixed(2) +
                    " kilobytes downloaded in: " + (time / 1000) + " sec");
                console.log(i);
                buildJSON(s);
            });
        }

        function buildJSON(s) {
            var newStr = s.substring(1, s.length - 1);
            jsonData += newStr + ',';
            writeFile(jsonData);
        }

        function writeFile(jsonData) {
            // Only write once every request has either succeeded or failed
            if (dataLength - (jsonsDownloaded + jsonsFailed) > 0) {
                return;
            }
            fs.writeFile(pathJSON, jsonData, function (err) {
                if (err) {
                    return console.log(err);
                } else {
                    console.log("file complete");
                }
            });
        }
    });
Note:
Firing requests in quick succession like this (2000 requests in a for loop) does not work well in my experience; try batching them. Also, doing it this way does not guarantee order, if that is important in your use case; one way to batch and keep order is sketched below.
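A rough sketch of one way to batch the calls and keep the results in order, assuming the same request module and the csvData/shapes/pathJSON variables from above; the fetchOne/fetchAll helpers and the batch size of 50 are illustrative, not part of the original code:

// Wrap one request in a Promise so a batch can be awaited together
function fetchOne(url) {
    return new Promise(function (resolve, reject) {
        request({ url: url, method: 'GET', json: true }, function (error, response, body) {
            if (error) return reject(error);
            resolve(JSON.stringify(body.response));
        });
    });
}

// Fire the URLs batchSize at a time; Promise.all keeps each batch in input order
function fetchAll(urls, batchSize, done) {
    var results = [];
    var index = 0;
    function nextBatch() {
        if (index >= urls.length) return done(results);
        var batch = urls.slice(index, index + batchSize).map(fetchOne);
        index += batchSize;
        Promise.all(batch).then(function (chunk) {
            results = results.concat(chunk);
            nextBatch();
        }).catch(function (err) {
            console.log(err); // a failed batch is logged and skipped
            nextBatch();
        });
    }
    nextBatch();
}

// Usage: build and write the full JSON once every batch has finished
fetchAll(csvData.map(function (c) { return shapes + c; }), 50, function (results) {
    fs.writeFile(pathJSON, '[' + results.join(',') + ']', function (err) {
        if (err) return console.log(err);
        console.log("file complete");
    });
});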
An alternative would be to open your file in append mode, by passing an options object with flag set to 'a' in your fs.writeFile call. In that case, append only the newly built chunk rather than the accumulated jsonData string, or earlier data will be repeated in the file:
fs.writeFile(pathJSON, newStr + ',', {
    flag: 'a'
}, function (err) {
    if (err) {
        return console.log(err);
    }
});
References:
fs.writeFile Docs
File system flags
I'm very new to JavaScript and I just want to log into a website from a Node.js request. This website needs information from the first visit in order to log in.
Here is my code.
var request = require('request');
var cheerio = require('cheerio');
var loginLink = 'link';
var loginJar = request.jar();
var ltValue = '';
var executionValue = '';

request.get({url: loginLink, jar: loginJar}, function(err, httpResponse, html) {
    var dat = cheerio.load(html);
    var arr = dat('input[name="lt"]');
    ltValue = arr.attr('value');
    arr = dat('input[name="execution"]');
    executionValue = arr.attr('value');
    /* Post body */
    var loginBody = 'username=' + usn + '&password=' + pwd + '&lt=' + ltValue + '&execution=' + executionValue;
    request.post({url: loginLink, jar: loginJar, method: 'post', json: true, body: loginBody}, function(err, res, b) {
        if (b.indexOf('errors') != -1)
            console.log("Success");
        else console.log("Fail");
    });
});
I have tried the same thing in C# and it works correctly, but my Node.js code always returns fail. I have tried many times but I couldn't get it working. Please help me with this problem.
byte[] binData = Encoding.ASCII.GetBytes(loginBody);
string loginFile = "loginInfo.txt";
HttpWebRequest request = (HttpWebRequest)WebRequest.Create("link");
request.Method = "POST";
request.ContentType = "application/x-www-form-urlencoded";
request.ContentLength = binData.Length;
request.CookieContainer = cookieContainer;
using (Stream stream = request.GetRequestStream())
{
    stream.Write(binData, 0, binData.Length);
}
WebResponse response = request.GetResponse();
using (StreamReader reader = new StreamReader(response.GetResponseStream()))
{
    File.WriteAllText(loginFile, reader.ReadToEnd());
}
string loginData = userID + " " + password;
File.WriteAllText("login.txt", loginData);
I can confirm the first function works correctly. I'd like to return an array to the variable AtoZLinks so it can be used in the later function, where I will make requests to each URL in the array and extract more information from within those links.
Many thanks in advance. I've been working on this as a project for some days; I'm a beginner in jQuery, web crawling, JS, Node.js and Express.js, and have thrown myself in at the deep end for work.
var express = require('express');
var request = require('request');
var cheerio = require('cheerio');
var router = express.Router();

var fullUrl;
fullUrl = [];

var AtoZLinks = function() {
    var url = 'http://example1.com';
    request(url, function(error, response, html) {
        if (!error && response.statusCode === 200) {
            var $ = cheerio.load(html);
            var fullUrl = [];
            var places = "Places";
            $('.clear a').each(function() {
                var link = $(this);
                link.each(function(index) {
                    var href = link.attr('href');
                    if (href.match(places)) {
                        // The urls from fullUrl here to be returned to parent variable.
                        fullUrl[index] = url + href; // Think something is wrong here... I've also tried "fullUrl.push(url + href);"
                        console.log(fullUrl); // This prints out all urls correctly
                    }
                });
            });
            for (var i = 0; i < fullUrl.length; i++) {
                console.log(fullUrl[i]);
            } // This code only prints out the last url stored (So I'm thinking the urls are being stored incorrectly...)
        }
    });
};

/* GET crawler page. */
router.get('/crawler', function(req, res, next) {
    AtoZLinks();
    next();
}, function(req, res) {
});

module.exports = router;

// Feel free to ignore the following work I've done or..
// Your support with the following function will be a bonus!
// I need to use the links in the previous array variable in the following
// function to extract further urls within those urls that I will work with.
var url = AtoZLinks;
request(AtoZLinks, function(error, response, html) {
    if (!error && response.statusCode === 200) {
        var $ = cheerio.load(html);
        // This selector is the code needed to extract the links from within the
        // links in the AtoZLinks array.
        $('div.fclist-section.clear.list-four').each(function() {
            $(this).find('a').each(function() {
                var link = $(this);
                var href = link.attr('href');
                fullUrl = url + href;
                console.log(fullUrl);
            });
        });
    }
});
Do you mean something like this?
var arrURLs;
arrURLs = [
    'www.ask.com',
    'www.google.com',
    'www.bing.com',
    'www.yahoo.com'
];

var AtoZLinks = function(theURLs) {
    for (var i = 0; i < theURLs.length; i++) {
        var url = theURLs[i];
        request(url, function(error, response, html) {
            if (!error && response.statusCode === 200) {
                var $ = cheerio.load(html);
                var fullUrl = [];
                var places = "Places";
                $('.clear a').each(function() {
                    var link = $(this);
                    link.each(function(index) {
                        var href = link.attr('href');
                        // absolute match
                        if (href === url) {
                            // true
                        } else {
                            // false
                        }
                        // href contains url
                        if (href.indexOf(url) > -1) {
                            // true
                        } else {
                            // false
                        }
                        if (href.match(places)) {
                            // The urls from fullUrl here to be returned to parent variable.
                            fullUrl.push(url + href);
                            console.log(JSON.stringify(fullUrl));
                        }
                    });
                });
            }
        });
    }
};

// Call after the function expression has been assigned
AtoZLinks(arrURLs);
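Because request is asynchronous, the collected array can't simply be returned from AtoZLinks; one common pattern is to hand it to the next step through a callback once every page has responded. A sketch under that assumption, reusing request and cheerio from above (the done callback and the pending counter are illustrative additions):

var AtoZLinksAsync = function(theURLs, done) {
    var collected = [];
    var pending = theURLs.length; // requests still in flight
    theURLs.forEach(function(url) {
        request(url, function(error, response, html) {
            if (!error && response.statusCode === 200) {
                var $ = cheerio.load(html);
                $('.clear a').each(function() {
                    var href = $(this).attr('href');
                    if (href && href.match("Places")) {
                        collected.push(url + href);
                    }
                });
            }
            pending -= 1;
            if (pending === 0) {
                done(collected); // every response is in; hand the array on
            }
        });
    });
};

// Usage: the later function receives the finished array
AtoZLinksAsync(arrURLs, function(allLinks) {
    console.log(allLinks);
});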
For my Node.js app I need to get the first page of Google search results, but from the .com domain, because I need the "People also search for" knowledge graph info, which only shows up on Google.com.
I figured I could use the request and cheerio modules to scrape content from Google's search results page, but when I try to access the URL I need, i.e. https://www.google.com/search?gws_rd=ssl&site=&source=hp&q=google&oq=google, Google automatically redirects me to the .de domain (as I'm based in Germany).
I tried first loading the http://www.google.com/ncr URL, which switches off the country-specific redirect in browsers, but it didn't work...
Does anybody know what I could do differently to make it work?
Here's my code... Thank you!
var request = require("request");
var cheerio = require("cheerio");
function dataCookieToString(dataCookie) {
    var t = "";
    for (var x = 0; x < dataCookie.length; x++) {
        t += ((t != "") ? "; " : "") + dataCookie[x].key + "=" + dataCookie[x].value;
    }
    return t;
}

function mkdataCookie(cookie) {
    var t, j;
    cookie = cookie.toString().replace(/,([^ ])/g, ",[12],$1").split(",[12],");
    for (var x = 0; x < cookie.length; x++) {
        cookie[x] = cookie[x].split("; ");
        j = cookie[x][0].split("=");
        t = {
            key: j[0],
            value: j[1]
        };
        for (var i = 1; i < cookie[x].length; i++) {
            j = cookie[x][i].split("=");
            t[j[0]] = j[1];
        }
        cookie[x] = t;
    }
    return cookie;
}

var dataCookie = mkdataCookie('MC_STORE_ID=66860; expires=' + new Date(new Date().getTime() + 86409000));

request({
    uri: "https://www.google.com/ncr",
    headers: {
        'User-Agent': 'Mozilla/5.0',
        "Cookie": dataCookieToString(dataCookie)
    }
}, function(error, response, body) {
    request({
        uri: "https://www.google.com/search?gws_rd=ssl&site=&source=hp&q=google&oq=google",
        headers: {
            'User-Agent': 'Mozilla/5.0'
        }
    }, function(error, response, body) {
        console.log(body);
        var $ = cheerio.load(body);
        $(".kno-fb-ctx").each(function() {
            var link = $(this);
            var text = link.text();
            console.log(text);
        });
    });
});
Here's the solution: it's much easier than I thought.
However, I still have a problem: the body I get does not contain the content that only shows up when JavaScript is enabled.
Does anybody know how to modify the code below so that it also includes JavaScript-rendered content in the body?
var request = require('request');
var cheerio = require("cheerio");
request = request.defaults({jar: true});
var options = {
    url: 'http://www.google.com/ncr',
    headers: {
        'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; rv:1.9.2.16) Gecko/20110319 Firefox/3.6.16'
    }
};

request(options, function() {
    request('https://www.google.com/search?gws_rd=ssl&site=&source=hp&q=google&oq=google', function(error, response, body) {
        var $ = cheerio.load(body);
        $("li").each(function() {
            var link = $(this);
            var text = link.text();
            console.log(text);
        });
    });
});
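Content that only appears with JavaScript enabled is rendered in the browser, so a plain HTTP response will never include it; a headless browser is the usual way around that. A minimal sketch using Puppeteer, offered as a suggestion rather than part of the original setup:

// Render the page in headless Chromium so client-side scripts run,
// then hand the finished HTML to cheerio as before
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');

(async function() {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto('https://www.google.com/search?gws_rd=ssl&site=&source=hp&q=google&oq=google');
    const body = await page.content(); // full HTML after scripts have run
    await browser.close();

    const $ = cheerio.load(body);
    $("li").each(function() {
        console.log($(this).text());
    });
})();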