I am a beginner at node js and I'm trying to write a web scraping script. I got permission from the site admin to scrape their products if I make less then 15 requests a minute. When I started out it used to request all the URLs at once but after some tooling around, I was able to go through each item in the array, but the script doesn't stop when there is no more items in the array? I'm not really happy with my result and feel like there is a better way to do this.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
var async = require('async');
app.get('/scrape', function(req, res){
productListing = ['ohio-precious-metals-1-ounce-silver-bar','morgan-1-ounce-silver-bar']
var i = 0;
async.eachLimit(productListing, 1, function (product, callback) {
var getProducts = function () {
var url = 'http://cbmint.com/' + productListing[i];
request(url, function(error, response, html) {
if(!error){
var $ = cheerio.load(html);
var title;
var json = { title : ""};
$('.product-name').filter(function(){
var data = $(this);
title = data.children().children().first().text();
json.title = title;
})
}
var theTime = new Date().getTime();
console.log(i);
console.log(json.title);
console.log(theTime);
i++;
});
}
setInterval(getProducts,10000);
})
res.send('Check your console!')
})
app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;
You aren't calling callback inside the iterator function. Take a look at the docs for eachLimit.
Related
I am scraping the webpage https://www.g2a.com/rising-storm-2-vietnam-steam-cd-key-global.html
I need to get the title from the table data.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
app.get('/scrape', function(req, res) {
url = 'https://www.g2a.com/rising-storm-2-vietnam-steam-cd-key-global.html';
request(url, function(error, response, body) {
if (!error) {
var $ = cheerio.load(body);
var arr = [];
var title = $('.mp-user-rating tr').each(function() {
var tableData = $('.marketplace-name > .mp-rating-popup');
arr.push({ 'title': tableData.text() });
});
}
res.send('Check your console!')
});
})
app.listen('8081');
console.log('Magic happens on port 8081');
exports = module.exports = app;
Here the data is in third column and cannot able to get .mp-user-rating tr data what is expected.
The image shows the structure of the table
Any help would be appreciated.
So, I went to the page and ran this in the console.
var arr = [];
var title = jQuery('.mp-user-rating tr').each(function(i, element) {
var tableData = jQuery(element).find('.mp-rating-popup');
arr.push({ 'title': tableData.text() });
});
console.log(arr);
The array consists of 8 objects that each have the titles within them.
UPDATE:
I pulled in the html information using your code. I think the issue is, the html is loaded asynchronously by the website, as a result, pulling the html will only retrieve the static markup. You will need to use PhantomJS or chrome's headless browser in order to load the website and allow the asyncronous information to load, then you can grab the html.
See here for some good docs on PhantomJS: https://github.com/Medium/phantomjs
Send a parameter(URL) from another script through recursion to this script.
var express = require('express');
var request = require('request');
var cheerio = require('cheerio');
var mongodb = require('mongodb');
var app = express();
var MongoClient = mongodb.MongoClient;
// Connection URL. This is where your mongodb server is running.
var murl = 'mongodb://localhost:27017/getData';
url = '';
app.get('/getData', function(req, res){
firstCall(req,res)
//console.log("cookie",req.cookies);
})
var firstCall = function(req, res, data){
console.log("URL: ", url);
res.send('Check your console!');
}
app.listen('3000')
console.log('Magic happens on port 3000');
module.exports = function(app) {
app.get('/getData', function() {});
};
I want this code to act as backbone or logic board. And some other file should be able to trigger this logic board file by adding the URL to this file.
Like we pass parameters to function to call. How do I do it here.
fist of all im not shure if the following is a non-blocking problem?
im getting started with https://github.com/sahat/hackathon-starter
currently i try to read all files out of a folder and later process all files...
i used EventEmitter to kind of manage the workflow.
i want to clear all arrays if the URL is refeshed or loaded new, but somehow if i reaload the URL there seems to be something inside the arrays which cases multiple outputs of the same data?
at the moment i just would be happy to have a correct console.log output.
/**
* GET /
* Home page.
*/
var fs = require('fs');
//XML
var jsxml = require("node-jsxml");
var Namespace = jsxml.Namespace,
QName = jsxml.QName,
XML = jsxml.XML,
XMLList = jsxml.XMLList;
//EventEmitter
var EventEmitter=require('events').EventEmitter;
var dateinamenEE=new EventEmitter();
var dateiinhaltEE=new EventEmitter();
var dateinamen = [];
var dateiinhalt = [];
exports.index = function(req, res) {
fs.readdir('./data', function (err, files) {
if (!err) {
files.forEach(function(value) {
dateinamen.push(value);
});
dateinamenEE.emit('dateinamen_ready');
} else {
throw err;
}
});
dateinamenEE.on('dateinamen_ready',function(){
dateinamen.forEach(function(value) {
var buf = fs.readFileSync('./data/'+value, "utf8");
var xml = new XML(buf);
var list = xml.descendants("suggestion");
var ergebnis = "";
var basiswort = "";
var buchstabe = "";
var obj = null;
list.each(function(item, index){
ergebnis = item.attribute('data').toString()
//basiswort = value.replace("%2B", " ");
//basiswort = basiswort.replace(".xml", "");
//var pieces = buchstabe.split(" ");
obj = {k: basiswort, b: buchstabe, e: ergebnis};
dateiinhalt.push(obj);
});
});
dateiinhaltEE.emit('dateiinhalt_ready');
});
dateiinhaltEE.on('dateiinhalt_ready',function(){
//console.log(dateiinhalt);
console.log("dateinamen:" + dateinamen.length);
console.log("dateiinhalt:" + dateiinhalt.length);
});
res.render('home', {
title: 'Home'
});
};
If if log the length of the 2 arrays the output on the second reload shows. First time loading the url:
Express server listening on port 3000 in development mode
dateinamen:2
dateiinhalt:20
Second time / refreshing the url:
GET / 200 898.198 ms - -
GET /fonts/fontawesome-webfont.woff2?v=4.3.0 304 12.991 ms - -
GET /favicon.ico 200 4.516 ms - -
dateinamen:4
dateiinhalt:60
dateinamen:4
dateiinhalt:60
dateinamen:4
dateiinhalt:100
dateinamen:4
dateiinhalt:100
GET / 200 139.259 ms - -
What causes the code to extend the arrays while reloading the page?
The non-blocking problem is due do your for(...) loops.
Changing them by : array.forEach(function(elem, index){});
EDIT
The arrays should be initialized inside the index function :
exports.index = function(req, res) {
var dateinamen = [];
var dateiinhalt = [];
...
Also, I'm not sure you need the use of EventEmitter.
Something like
`
fs.readdir('./data', function (err, files) {
if (!err) {
files.forEach(function(file){
var buf = fs.readFileSync('./data/'+file, "utf8");
var xml = new XML(buf);
var list = xml.descendants("suggestion");
var ergebnis = null;
var obj = null;
list.each(function(item, index){
ergebnis = item.attribute('data').toString();
obj = {k: file, v: ergebnis};
dateiinhalt.push(obj);
});
});
console.log(dateiinhalt);
} else {
throw err;
}
});
`
could do the job no?
(I wanted to say this as a comment, but I'm still missing reputation)
My controller is a soap client as shown below
var _ = require('lodash'),
memoize = require('memoize'),
soap = require('soap'),
http = require('http');
var wsdlUrl = 'http://www.proxixnetwork.com/gsert/PxPointGeocode.asmx?WSDL';
var geocode = function(req,res){
var sessionId = null;
soap.createClient(wsdlUrl, function(err,client){
var args = {"username":'user123', "password":'password123'};
client.PxPointGeocode.PxPointGeocodeSoap.Authenticate(args,function(err,result){
res.jsonp(result.AuthenticateResult.SessionID);
})
});
}
exports.authenticate = geocode;
This soap service provides a session id that will be used in subsequent requests. Hence, I wanted to use 'memoize' to cache the method.
I defined a method that wraps around the soap call and 'memoize'ing it but the problem is that the call to soapClient is not happening.
I do not know how to make the call from router wait for the soap call
Note: Also tried async library's waterfall but did not work.
var _ = require('lodash'),
memoize = require('memoize'),
soap = require('soap'),
http = require('http'),
wsdlUrl = 'http://www.proxixnetwork.com/gsert/PxPointGeocode.asmx?WSDL';
var getSession = function () {
var args = {"username": 'user123', "password": 'password123'};
var sessionId = null;
soap.createClient(wsdlUrl, function (err, client) {
console.log('Inside proxix client'); //not printing
client.PxPointGeocode.PxPointGeocodeSoap.Authenticate(args, function (err, result) {
sessionId= result.AuthenticateResult.SessionID;
//if I use res.jsonp() - the call could be made
})
});
return sessionId;
};
var cached = memoize(getSession);
var geocode = function (req, res) {
var sesssionObj = cached();
res.jsonp(sessionObj);
}
exports.authenticate = geocode;
The two issues I'm seeing are :
memoize is a caching mechanism but is in no way going to change the ansynchronous nature of your function. Returning sessionId will not work because it is not set before it gets to that line. You need to use a callback.
You have not specified which arguments you want memoize to use for the caching, nor where you are getting those values. I'm going to assume for this example that it's username, password and wsdlUrl. and that they are set directly in req.
.
var _ = require('lodash'),
memoize = require('memoize'),
soap = require('soap'),
http = require('http');
var getSessionId = function(username,password,wsdlUrl,callback){
soap.createClient(wsdlUrl, function(err,client){
var args = {"username":username, "password":password};
client.PxPointGeocode.PxPointGeocodeSoap.Authenticate(args,function(err,result){
if(err){
return callback(err);
}
callback(null,result.AuthenticateResult.SessionID);
})
});
});
var getSessionIdCached = memoize(getSessionId,{async:true});
var geocode = function(req,res){
getSessionIdCached(req.username,req.password,req.wsdlUrl,function(err,sessionId){
if(err){
//do some error handling, and probably return
}
res.jsonp(sessionId);
});
});
exports.authenticate = geocode;
I have 3 node js files :
mysqlconnection.js to store the database connection properties:
var mysql = require('mysql');
var cjson = require('cjson');
var yaml_config = require('node-yaml-config');
// project files
var config = yaml_config.load(__dirname + '/billingv2.yaml');
exports.execute = function(callback){
var connection = mysql.createConnection(
{
host : config.host,
user : config.user,
password : config.password,
database : config.database,
}
);
connection.connect();
return callback(null,connection);
}
subscriptionRestService.js to handle the REST api calls:
var express = require('express');
var app = express();
app.use(express.bodyParser());
var fs = require('fs');
// Project files
var mysql = require('./mysqlRestService.js');
// Get Resource Subscription data by Resourceuri
app.post('/pricingdetails', function(req, res) {
var workload = req.body;
if(workload.elements && workload.elements.length > 0)
{
var arr = [];
for(var index in workload.elements)
{
arr[index] = workload.elements[index].uri;
}
var resourceIdentifiers = arr.join(',');
}
console.log(resourceIdentifiers);
mysql.getPricingDetail(function(resourceIdentifiers,callback){
});
});
mysqlRestService.js to handle mysql queries/stored procedures:
// packages
var mysql = require('mysql');
var cjson = require('cjson');
var fs = require('fs');
var yaml_config = require('node-yaml-config');
// project files
var dbconnection = require('./mysqlconnection');
exports.getPricingDetail = function (resourceIdentifiers,callback){
console.log('entered into mysql function');
console.log(resourceIdentifiers);
var pricingDetail = {};
dbconnection.execute(function(err,response){
if(err){
throw err;
}
else
{
var selectqueryString = "call SP_ExposePricingDetailforUI('" + resourceIdentifiers + "')";
response.query(selectqueryString, function(err,pricingDetail){
if(err) {
throw err;
}
else
{
console.log(pricingDetail);
pricingDetail = pricingDetail;
}
});
}
});
//console.log('printing pricing details');
//console.log(pricingDetail);
};
problems faced
Unable to send the variable resourceIdentifiers from subscriptionRestService to mysqlRestService.js
Unable to return the pricingdetail from mysqlRestService.js to calling function in subscriptionRestService.
Any guidance greatly appreciated.
Unable to send the variable resourceIdentifiers from subscriptionRestService to mysqlRestService.js
Well, you didn't send it. It currently is a parameter of your callback function in the invocation, not an argument for the parameter of getPricingDetails. Use
mysql.getPricingDetail(resourceIdentifiers, function callback(result){
// use result here
});
Unable to return the pricingdetail from mysqlRestService.js to calling function in subscriptionRestService.
I've got no idea what pricingDetail = pricingDetail; was supposed to do. You have to call (invoke) back the callback here! Use
callback(pricingDetail);