Scraping JavaScript-generated website with Node.js [duplicate] - javascript

This question already has answers here:
How can I scrape pages with dynamic content using node.js?
(5 answers)
Closed last month.
When I parse a static HTML page, my Node.js app works well. However, when the URL points to a JavaScript-generated page, the app doesn't work. How can I scrape a JavaScript-generated web page?
My app.js
var express = require('express'),
    fs = require('fs'),
    request = require('request'),
    cheerio = require('cheerio'),
    app = express();

app.get('/scrape', function (req, res) {
    url = 'http://www.apache.org/';
    request(url, function (error, response, html) {
        if (!error) {
            var $ = cheerio.load(html);
            var title, release, rating;
            var json = { title: "" };
            $('body').filter(function () {
                var data = $(this);
                title = data.find('.panel-title').text();
                json.title = title;
            })
        }
        fs.writeFile('output.json', JSON.stringify(json, null, 4), function (err) {
            console.log('File successfully written! - Check your project directory for the output.json file');
        });
        // Finally, we'll just send out a message to the browser reminding you that this app does not have a UI.
        res.send('Check your console!');
    });
});

app.listen('8081');
console.log('Magic happens on port 8081');

exports = module.exports = app;

Cheerio won't execute the JavaScript on the page, since it is only made for parsing plain HTML.
I'd suggest a different approach using something like PhantomJS: http://phantomjs.org/
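With PhantomJS the page's own JavaScript runs before you read the DOM. A minimal sketch of a standalone PhantomJS script, assuming the same .panel-title selector from the question (run it with phantomjs scrape.js; the output format is only an example):

// scrape.js
var page = require('webpage').create();

page.open('http://www.apache.org/', function (status) {
    if (status !== 'success') {
        console.log('Failed to load the page');
        phantom.exit(1);
        return;
    }
    // Runs inside the page context, after the page's own JavaScript has executed.
    var title = page.evaluate(function () {
        var el = document.querySelector('.panel-title');
        return el ? el.textContent.trim() : null;
    });
    console.log(JSON.stringify({ title: title }, null, 4));
    phantom.exit();
});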

Related

create downloadable file for mobile and desktop

I want to create a vCard on the frontend and make it downloadable (by clicking a button) in a ReactJS project. I use a Node.js module called vcards-js that generates a string with the content of the desired vCard (v3.0). What I am struggling with is making it downloadable (as a .vcf file).
here is my code
const vcard = require('vcards-js'); // the vcards-js module mentioned above

const { fname, lname, phoneNumber, email, organisation, title } = req.body;

const mycard = vcard();
mycard.firstName = fname;
mycard.lastName = lname;
mycard.phoneNumber = phoneNumber;
mycard.email = email;
mycard.organisation = organisation;
mycard.title = title;
// enter code here
console.log(mycard.getFormattedString(), "utf-8");
After console.log() I get the vCard-formatted data, but how do I make it downloadable?
Please write out the steps.
I think your question is answered here.
To translate that answer to your example, it would be something like this:
var textFile = null,
    makeTextFile = function (text) {
        var data = new Blob([text], { type: 'text/plain' });
        // If we are replacing a previously generated file we need to
        // manually revoke the object URL to avoid memory leaks.
        if (textFile !== null) {
            window.URL.revokeObjectURL(textFile);
        }
        textFile = window.URL.createObjectURL(data);
        // returns a URL you can use as a href
        return textFile;
    };

// -------------

var create = document.getElementById('create');

create.addEventListener('click', function () {
    var link = document.createElement('a');
    link.setAttribute('download', 'XXXXX.vcf' /* set your vcf name here */);
    link.href = makeTextFile(
        mycard.getFormattedString()
    );
    document.body.appendChild(link);

    // wait for the link to be added to the document
    window.requestAnimationFrame(function () {
        var event = new MouseEvent('click');
        link.dispatchEvent(event);
        document.body.removeChild(link);
    });
}, false);
I have not used vcards-js myself, but from reading the README on GitHub (https://github.com/enesser/vCards-js#on-the-web) it looks like you only need to redirect the user to the URL of the endpoint where you generate the vCard. Something like this:
Node.js:
var express = require('express');
var router = express.Router();

module.exports = function (app) {
    app.use('/vcard', router);
};

router.get('/', function (req, res, next) {
    var vCardsJS = require('vcards-js');
    var vCard = vCardsJS();
    // Create your vcard here
    res.send(vCard.getFormattedString());
});

Front end:
window.location = "/vcard";
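If you want the browser to treat the response as a file download instead of rendering it, the route can also set a vCard content type and a Content-Disposition header. A minimal sketch of that variant, assuming the same Express route as above (the filename is just an example):

router.get('/', function (req, res, next) {
    var vCardsJS = require('vcards-js');
    var vCard = vCardsJS();
    // Create your vcard here
    // Standard HTTP headers telling the browser to save the response as a .vcf file.
    res.set('Content-Type', 'text/vcard; charset=utf-8');
    res.set('Content-Disposition', 'attachment; filename="contact.vcf"');
    res.send(vCard.getFormattedString());
});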

not able to fetch text data from web url using javascript

I need to extract text data from a web URL (http://www.africau.edu/images/default/sample.pdf).
I used two node modules.
1) crawler-request
it('Read Pdf Data using crawler', function () {
    const crawler = require('crawler-request');

    function response_text_size(response) {
        response["size"] = response.text.length;
        return response;
    }

    crawler("http://www.africau.edu/images/default/sample.pdf", response_text_size).then(function (response) {
        // handle response
        console.log("Reponse =" + response.size);
    });
});
When this runs, nothing is printed to the console.
2) pfd2json/pdfparser
it('Read Data from url', function () {
    var request = require('request');
    var pdf = require('pfd2json/pdfparser');
    var fs = require('fs');

    var pdfUrl = "http://www.africau.edu/images/default/sample.pdf";
    let databuffer = fs.readFileSync(pdfUrl);

    pdf(databuffer).then(function (data) {
        var arr: Array<String> = data.text;
        var n = arr.includes('Thursday 02 May');
        console.log("Print Array " + n);
    });
});
Failed: ENOENT: no such file or directory, open 'http://www.africau.edu/images/default/sample.pdf'
I am able to extract the data from a local path, but not from a URL.
The issue here is that you are using the fs module (File System) to read a file on a remote server.
You also mistyped the pdf2json module, which should be giving you an error as well.
You did require the request module, which makes it possible to access that remote file. Here's one way to do this:
it('Read Data from url', function () {
    var request = require('request');
    var PDFParser = require('pdf2json');

    var pdfUrl = 'http://unec.edu.az/application/uploads/2014/12/pdf-sample.pdf';
    var pdfParser = new PDFParser(this, 1);

    // executed if the parser fails for any reason
    pdfParser.on("pdfParser_dataError", errData => console.error(errData.parserError));

    // executed when the parser has finished
    pdfParser.on("pdfParser_dataReady", pdfData => console.log(pdfParser.getRawTextContent()));

    // request the pdf's file content, then call the pdf parser on the retrieved buffer
    request({ url: pdfUrl, encoding: null }, (error, response, body) => pdfParser.parseBuffer(body));
});
This makes it possible to load the remote .pdf file in your program.
I'd recommend looking at the pdf2json documentation if you want to do more. This simply outputs the textual content of the .pdf file once the parser has finished reading the data.
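If, as in the original test, you only want to know whether the text contains a particular string, that check can go inside the dataReady handler. A small sketch reusing the parser from above (the string is the one from the question):

pdfParser.on("pdfParser_dataReady", function (pdfData) {
    var text = pdfParser.getRawTextContent();
    // Mirrors the includes() check from the original test, but on the raw text string.
    console.log("Contains date: " + text.includes('Thursday 02 May'));
});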

How to save the downloadable file link in zombie.js

I am scraping a website using node.js and zombie.js. I am facing an issue where the page has an anchor which holds the link to download a PDF file.
If I click it using the browser.clickLink() function, the result that I get in the console is beyond my understanding. Is there a way to save this PDF file and keep its link, like in PHP? I want to save it for further processing. Here is my test JS code:
var http = require('http');
var browser = require('zombie');
var assert = require('assert');

const hostname = '127.0.0.1';
const port = 3000;

const server = http.createServer((req, res) => {
    res.statusCode = 200;
    //res.setHeader('Content-Type', 'text/plain');
    //res.end('Hello World\n');
});

server.listen(port, hostname, () => {
    console.log(`Server running at http://${hostname}:${port}/`);
});

var url = 'http://localhost/Node/HM_LandRegistry/downloadPdf.html';

browser.visit(url, function (error, browser) {
    //browser.dump();
    //console.log('browser.text (".link")', browser.text(".link"));
    browser.clickLink("a.link");
    browser.wait().then(function () {
        console.log(browser.text());
        browser.dump();
    });
});
Here is something I found on Google Groups. It solved my problem.
function getLinks(browser) {
    var links = browser.querySelectorAll('.link');
    return Array.prototype.map.call(links, function (e) {
        return e.getAttribute('href'); // returns an array. Use .toString() to get string only
    });
}
Save the link
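To actually save the file behind the link, one option is to fetch it directly with the request module used in the other answers on this page. A minimal sketch, assuming the extracted href is an absolute URL pointing at the PDF:

var fs = require('fs');
var request = require('request');

var links = getLinks(browser);

// Fetch the first link as raw bytes (encoding: null) and write it to disk.
request({ url: links[0], encoding: null }, function (error, response, body) {
    if (error) return console.error(error);
    fs.writeFileSync('downloaded.pdf', body);
    console.log('Saved downloaded.pdf');
});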

Issue when doing web scraper

I am scraping the webpage https://www.g2a.com/rising-storm-2-vietnam-steam-cd-key-global.html
I need to get the title from the table data.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();

app.get('/scrape', function (req, res) {
    url = 'https://www.g2a.com/rising-storm-2-vietnam-steam-cd-key-global.html';
    request(url, function (error, response, body) {
        if (!error) {
            var $ = cheerio.load(body);
            var arr = [];
            var title = $('.mp-user-rating tr').each(function () {
                var tableData = $('.marketplace-name > .mp-rating-popup');
                arr.push({ 'title': tableData.text() });
            });
        }
        res.send('Check your console!')
    });
})

app.listen('8081');
console.log('Magic happens on port 8081');

exports = module.exports = app;
Here the data is in the third column, but I am not able to get the .mp-user-rating tr data that I expected.
The image shows the structure of the table
Any help would be appreciated.
So, I went to the page and ran this in the console.
var arr = [];
var title = jQuery('.mp-user-rating tr').each(function (i, element) {
    var tableData = jQuery(element).find('.mp-rating-popup');
    arr.push({ 'title': tableData.text() });
});
console.log(arr);
The array consists of 8 objects that each have the titles within them.
UPDATE:
I pulled in the HTML information using your code. I think the issue is that the HTML is loaded asynchronously by the website; as a result, pulling the HTML only retrieves the static markup. You will need to use PhantomJS or Chrome's headless browser to load the website, let the asynchronous information load, and then grab the HTML.
See here for some good docs on PhantomJS: https://github.com/Medium/phantomjs
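If you would rather use headless Chrome, a rough sketch with Puppeteer looks like this. The selectors are the ones from the question, so treat it as an outline rather than code tested against that site:

const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    // networkidle2 waits until the page's asynchronous requests have mostly settled.
    await page.goto('https://www.g2a.com/rising-storm-2-vietnam-steam-cd-key-global.html', { waitUntil: 'networkidle2' });

    // Runs in the page context, after the site's JavaScript has rendered the table.
    const titles = await page.evaluate(() =>
        Array.from(document.querySelectorAll('.mp-user-rating tr .mp-rating-popup'))
            .map(el => el.textContent.trim())
    );

    console.log(titles);
    await browser.close();
})();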

nodejs: node-http-proxy and harmon: rewriting the html response from the end point instead of the 302 redirected response.

I'm using nodejs with node-http-proxy along with harmon. I am using harmon to rewrite the proxied response to include a javascript file and a css file. When I set the target of the proxy to be http://nodejs.org or anything other than localhost, I receive a 301 or 302 redirect. The script is rewriting the 301 response instead of the fully proxied response. How can I use harmon to rewrite the end response instead of the 302 response?
Here is the example of the script I am running from the harmon example folder:
var http = require('http');
var connect = require('connect');
var httpProxy = require('http-proxy');

var selects = [];
var simpleselect = {};

//<img id="logo" src="/images/logo.svg" alt="node.js">
simpleselect.query = 'img';
simpleselect.func = function (node) {
    //Create a read/write stream with the outer option
    //so we get the full tag and we can replace it
    var stm = node.createStream({ "outer": true });

    //variable to hold all the info from the data events
    var tag = '';

    //collect all the data in the stream
    stm.on('data', function (data) {
        tag += data;
    });

    //When the read side of the stream has ended..
    stm.on('end', function () {
        //Print out the tag; you can also parse it or regex it if you want
        process.stdout.write('tag: ' + tag + '\n');
        process.stdout.write('end: ' + node.name + '\n');

        //Now on the write side of the stream write some data using .end()
        //N.B. if end isn't called it will just hang.
        stm.end('<img id="logo" src="http://i.imgur.com/LKShxfc.gif" alt="node.js">');
    });
}

selects.push(simpleselect);

//
// Basic Connect App
//
var app = connect();

var proxy = httpProxy.createProxyServer({
    target: 'http://nodejs.org'
})

app.use(require('../')([], selects, true));
app.use(
    function (req, res) {
        proxy.web(req, res);
    }
);
The problem is that a lot of sites are now redirecting HTTP to HTTPS.
nodejs.org is one of those.
I have updated the sample https://github.com/No9/harmon/blob/master/examples/doge.js to show how the http-proxy needs to be configured to deal with HTTPS.
If you still have problems with other arbitrary redirects, please log an issue on harmon.
Thanks
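For reference, the essential change is to proxy straight to the HTTPS origin so the upstream never answers with the http-to-https redirect in the first place. A minimal sketch using http-proxy's documented options (the actual doge.js example may differ in detail):

var httpProxy = require('http-proxy');

// Target the HTTPS origin directly so nodejs.org no longer replies with a 301/302.
var proxy = httpProxy.createProxyServer({
    target: 'https://nodejs.org',
    changeOrigin: true, // rewrite the Host header to match the target
    secure: true        // verify the upstream TLS certificate
});

app.use(function (req, res) {
    proxy.web(req, res);
});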
