Issue when doing web scraper - javascript

I am scraping the webpage https://www.g2a.com/rising-storm-2-vietnam-steam-cd-key-global.html
I need to get the title from the table data.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
app.get('/scrape', function(req, res) {
url = 'https://www.g2a.com/rising-storm-2-vietnam-steam-cd-key-global.html';
request(url, function(error, response, body) {
if (!error) {
var $ = cheerio.load(body);
var arr = [];
var title = $('.mp-user-rating tr').each(function() {
var tableData = $('.marketplace-name > .mp-rating-popup');
arr.push({ 'title': tableData.text() });
});
}
res.send('Check your console!')
});
})
app.listen('8081');
console.log('Magic happens on port 8081');
exports = module.exports = app;
Here the data is in third column and cannot able to get .mp-user-rating tr data what is expected.
The image shows the structure of the table
Any help would be appreciated.

So, I went to the page and ran this in the console.
var arr = [];
var title = jQuery('.mp-user-rating tr').each(function(i, element) {
var tableData = jQuery(element).find('.mp-rating-popup');
arr.push({ 'title': tableData.text() });
});
console.log(arr);
The array consists of 8 objects that each have the titles within them.
UPDATE:
I pulled in the html information using your code. I think the issue is, the html is loaded asynchronously by the website, as a result, pulling the html will only retrieve the static markup. You will need to use PhantomJS or chrome's headless browser in order to load the website and allow the asyncronous information to load, then you can grab the html.
See here for some good docs on PhantomJS: https://github.com/Medium/phantomjs

Related

How to send data between Node js and javascript in an html file

I am using an eye tracker and I want to create a website that displays this data in real time. I have the eye tracker notifying a Node js server and it provides data really consistently but when I used socket.io to send the data over it was buffering really slowly. I want a way to receive this data in a script in my index.html from the Node js server in real time or as close as possible. Any suggestions?
I have found my solution in socket.io-streams. Here's what I did:
in the app.js:
var ss = require('socket.io-stream');
var Readable = require('stream').Readable;
const io = require('socket.io')(http);
io.of('/data').on('connection', socket => {
var eyeTracker = ...
var listener = {
...
onGazeData:function(gazeData){ //trigger for recieving a gaze location
var s = new Readable()
s._read = function() {};
var stream = ss.createStream();
toSend = gazeData.x + "," + gazeData.y
s.push(toSend);
s.pipe(stream);
ss(socket).emit('gaze',stream);
}
}
eyeTracker.setListener(listener);
});
app.get('/', function (req, res) {
res.sendFile(path.join(__dirname, 'index.html'));
}
and in the index.js (linked to index.html) I put:
var ss = require('socket.io-stream');
$(function(){
var socket = io.connect('http://localhost:3000/data'); //or where ever you are running
socket.on('connect', function() {
ss(socket).on('gaze', function(stream) {
stream.on('data', function(data) {
//do what you want with data
})
})
});
//other parts of script outside of socket
});
This solution was able to keep up with the data being streamed.

How can I render a static HTML file with Handlebars on a Nodejs server?

I have come across plenty of resources online for this but haven't been able to find one that is straight forward enough for me to understand.
At the moment, I have multiple massive <script> tags in an HTML document that has handlebars content. The server sends this HTML document to the client where the client then renders the page with data from an AJAX call. I'd like to move this entire process server-side so that all the server has to do is send a static file and re-render the page when data is updated. Data changes a few times per day - which is why it isn't hard coded in and I would like to run the handlebars compiler on the HTML document when data is updated.
Is it possible to simply put the HTML document with handlebars templating in <script> tags through a function to generate a new HTML file with data filled in?
Here is the code I have within my app.js file that is runned the Node server that does not do what I want it to:
function registerHelpers(callback){
Handlebars.registerHelper('equal', function(lvalue, rvalue, options) {
if (arguments.length < 3)
throw new Error("Handlebars Helper equal needs 2 parameters");
if( lvalue!=rvalue ) {
return options.inverse(this);
} else {
return options.fn(this);
}
});
Handlebars.registerHelper('trim', function(text) {
text = text.replace(/ /g, '');
return new Handlebars.SafeString(text);
});
callback();
}
function buildHomePage() {
var source = require(__dirname + '/public/home.handlebars');
var template = Handlebars.precompile(source);
var collection = db.get('datalist'); //Monk call to MongoDB
collection.find({}, function (err, docs){
var result = template(docs);
console.log(result)
var fs = require('fs');
fs.writeFile("test.html", result, function(err) {
if(err) {
console.log(err);
}
});
});
};
registerHelpers(buildHomePage);
The following can render handlebars to static html. Run node example.js. You may need to run npm install --save handlebars prior.
var fs = require('fs');
var Handlebars = require('handlebars');
function render(filename, data)
{
var source = fs.readFileSync(filename,'utf8').toString();
var template = Handlebars.compile(source);
var output = template(data);
return output;
}
var data = JSON.parse(fs.readFileSync("./data/strings.json", 'utf8'));
var result = render('./templates/somefile.html', data);
console.log(result);
If your handlebars templates are simple, with only string replacement, you can do this with underscore.js. Assume this example is named 'generate.js'
var fs = require('fs');
var _ = require('underscore');
_.templateSettings.interpolate = /\{\{(.+?)\}\}/g;
function render(filename, data)
{
var source = fs.readFileSync(filename,'utf8').toString();
var compiled = _.template(source);
return compiled(data);
}
var data = JSON.parse(fs.readFileSync("./data/strings.json", 'utf8'));
var result = render('./templates/somefile.html', data);
console.log(result);
Then run node generate.js to output the rendered template to the console. You may need to do npm install --save underscore prior.

How take the JSON in Node.JS

I have this in Node.JS file.
var express = require('express');
var app = express();
var http = require('http').Server(app);
var cfenv = require("cfenv");
var appEnv = cfenv.getAppEnv();
http.listen(appEnv.port, appEnv.bind);
var PersonalityInsightsV2 = require('watson-developer-cloud/personality-insights/v2');
var personality_insights = new PersonalityInsightsV2({
username: '<YOUR-USERNAME>',
password: '<YOUR-PASSWORD>'
});
personality_insights.profile({
text: "<YOUR-100-UNIQUE-WORDS>",
language: 'en' },
function (err, response) {
if (err)
console.log('error:', err);
else
console.log(JSON.stringify(response, null, 2));
});
I am sending an API call but as you can see, it shows me the result in JSON in the console.
How can I make this result in JSON that shows me in the console, show it to me in an HTML?
Thank you very much!
I supose that the problem is in console.log(JSON.stringify(res,null, 2));, but, I don't know how put this in HTML.
You can't just turn JSON into HTML. JSON is a data format. HTML is a markup language. You'll manually have to create some HTML with the way you want it, and then drop in values from the JSON.
For example, you could do something like this:
else {
const html =
`<!DOCTYPE html>
<body>
<p>${response.name}</p>
`;
console.log(html);
}
That would give you some HTML like:
<!DOCTYPE html>
<body>
<p>Bob</p>
assuming response has a value of name.
It sounds like you're wanting to view the JSON on an HTML page in a browser. Something like this should help. It will start your Express server listening on whatever port you specified using appEnv.port, and will serve up myJson (which will then be assigned in your code)
var express = require('express');
var app = express();
var http = require('http').Server(app);
var cfenv = require("cfenv");
var appEnv = cfenv.getAppEnv();
var myJson;
// respond with JSON when a GET request is made to the index
app.get('/', function (req, res) {
res.send(myJson)
})
app.listen(appEnv.port);
var PersonalityInsightsV2 = require('watson-developer-cloud/personality-insights/v2');
var personality_insights = new PersonalityInsightsV2({
username: '<YOUR-USERNAME>',
password: '<YOUR-PASSWORD>'
});
personality_insights.profile({
text: "<YOUR-100-UNIQUE-WORDS>",
language: 'en' },
function (err, response) {
if (err)
console.log('error:', err);
else
myJson = JSON.stringify(response, null, 2);
});
To try this, you would open your browser to "http://localhost:appEnv.port/" (where appEnv.port is the port you chose). You should see your JSON output

Scraping JavaScript-generated website with Node.js [duplicate]

This question already has answers here:
How can I scrape pages with dynamic content using node.js?
(5 answers)
Closed last month.
When I parse a static html page, my node.js app works well. However, when the url is a JavaScript-generated page, the app doesn't work. How can I scrape a JavaScript-generated web page?
My app.js
var express = require('express'),
fs = require('fs'),
request = require('request'),
cheerio = require('cheerio'),
app = express();
app.get('/scrape', function( req, res ) {
url = 'http://www.apache.org/';
request( url, function( error, response, html ) {
if( !error ) {
var $ = cheerio.load(html);
var title, release, rating;
var json = { title : "" };
$('body').filter(function() {
var data = $(this);
title = data.find('.panel-title').text();
json.title = title;
})
}
fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err) {
console.log( 'File successfully written! - Check your project directory for the output.json file' );
});
// Finally, we'll just send out a message to the browser reminding you that this app does not have a UI.
res.send( 'Check your console!' );
});
});
app.listen('8081');
console.log('Magic happens on port 8081');
exports = module.exports = app;
Cheerio won't execute the javascript on the page as it's just made for parsing plain HTML.
I'd suggest a different approach using something like PhantomJS: http://phantomjs.org/

Node.js - Looping through array of URLS one at a time

I am a beginner at node js and I'm trying to write a web scraping script. I got permission from the site admin to scrape their products if I make less then 15 requests a minute. When I started out it used to request all the URLs at once but after some tooling around, I was able to go through each item in the array, but the script doesn't stop when there is no more items in the array? I'm not really happy with my result and feel like there is a better way to do this.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
var async = require('async');
app.get('/scrape', function(req, res){
productListing = ['ohio-precious-metals-1-ounce-silver-bar','morgan-1-ounce-silver-bar']
var i = 0;
async.eachLimit(productListing, 1, function (product, callback) {
var getProducts = function () {
var url = 'http://cbmint.com/' + productListing[i];
request(url, function(error, response, html) {
if(!error){
var $ = cheerio.load(html);
var title;
var json = { title : ""};
$('.product-name').filter(function(){
var data = $(this);
title = data.children().children().first().text();
json.title = title;
})
}
var theTime = new Date().getTime();
console.log(i);
console.log(json.title);
console.log(theTime);
i++;
});
}
setInterval(getProducts,10000);
})
res.send('Check your console!')
})
app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;
You aren't calling callback inside the iterator function. Take a look at the docs for eachLimit.

Categories

Resources