how to parse links by class name using node.js? - javascript

This script fetches every link on Google's first results page:
var http = require('http');
var qs = require('querystring');
var request = require("request");
var cheerio = require('cheerio');
// Start an HTTP server on 127.0.0.1:1337; no request listener is passed to createServer().
http.createServer().listen(1337, "127.0.0.1");
/**
 * Collect the href of every hyperlink in an HTML document.
 *
 * @param {string} body - HTML markup to parse with cheerio.
 * @param {function(string): void} callback - Receives all hrefs, each one
 *   preceded by a newline ('' when the document has no links).
 */
function parsehl(body, callback) {
  // Declared locally: the original leaked `$` and `links` as implicit globals.
  const $ = cheerio.load(body);
  let result = '';
  const links = $('a'); //jquery get all hyperlinks ???????
  links.each(function (i, link) {
    const href = $(link).attr('href');
    // Anchors without an href would otherwise append the text "undefined".
    if (href !== undefined) {
      result += '\n' + href;
    }
  });
  callback(result);
}
// Fetch the search page and print every link that parsehl extracts from it;
// a request error is printed instead.
request({ uri: 'http://www.google.com/search?q=rio&start=00' }, function (error, response, body) {
  console.log('url requested ');
  if (error) {
    console.log(error);
    return;
  }
  parsehl(body, function (result) {
    console.log(result);
  });
});
This server gets the links by tag name a.
The links we need have the class name l (it looks like the digit 1, but it is a lowercase L, as in "little").
How can we get the links here by class name?
We maybe need just one line where we put the questionmarks.

Have you tried:
links = $('a.l');
?

Related

How to add into array the url that contains specific String?

I am trying to get all values of element
var request = require('request');
var cheerio = require('cheerio');
var url = "https://www.mismarcadores.com";
// Download the page and collect the href of every anchor into addUrl.
request(url, function(err, resp, body) {
  if (err) throw err;
  var $ = cheerio.load(body);
  var addUrl = [];
  $('a').each(function (i, element) {
    var a = $(this);
    var href = a.attr('href');
    addUrl.push(href);
  });
  // `array` was never declared; the collected links live in addUrl.
  console.log(addUrl[0]);
});
I have this code that adds the links to an array called addUrl. This works perfectly, but now I am looking for a way to add a URL to this array only if its href contains the word 'baloncesto'.
Good example : https://www.mismarcadores.com/baloncesto/alemania/
This URL , is good but
Bad example : https://www.mismarcadores.com/golf/
This is wrong.
I am developing this with Node.js, but this part is plain JavaScript and I don't know how to do it.
Could anyone help to me?
Please try like this:
var request = require('request');
var cheerio = require('cheerio');
var url = "https://www.mismarcadores.com";
var filterStr = 'baloncesto';
// Download the page and keep only the links whose href contains filterStr.
request(url, function(err, resp, body) {
  if (err) throw err;
  var $ = cheerio.load(body);
  var addUrl = [];
  $('a').each(function (i, element) {
    var href = $(this).attr('href');
    // attr() yields undefined for an anchor without an href; calling
    // includes() on that would throw a TypeError.
    if (href && href.includes(filterStr)) addUrl.push(href);
  });
  console.log(addUrl);
});
Okay, so what you're looking for is a pattern match - if href contains the string baloncesto. I suspect that this SO thread will be helpful to you - you're basically looking to nest this line -
addUrl.push(href);
- in an if statement, like
if(href.includes('baloncesto')) {
addUrl.push(href);
}
...but definitely look at that other answer for reference, in case you're using a version of JS that's older and not compatible with ES6.
you can try this
// String.prototype.contains is not part of standard JavaScript; the
// standardized method is includes(). The href guard skips anchors that
// have no href attribute.
if (href && href.includes('baloncesto')) {
  addUrl.push(href);
}
Try this
var request = require('request');
var cheerio = require('cheerio');
var url = "https://www.mismarcadores.com";
// Download the page and keep the links whose decoded href contains 'baloncesto'.
request(url, function(err, resp, body) {
  if (err) throw err;
  var $ = cheerio.load(body);
  var addUrl = [];
  $('a').each(function (i, element) {
    var a = $(this);
    var href = a.attr('href');
    if (!href) return; // anchor without an href: nothing to test
    var decoded;
    try {
      decoded = decodeURIComponent(href);
    } catch (e) {
      // decodeURIComponent throws on malformed escapes; test the raw value instead.
      decoded = href;
    }
    // includes() replaces contains(), which is not a standard String method.
    if (decoded.includes('baloncesto')) {
      addUrl.push(href);
    } else {
      //do something else
    }
  });
  // `array` was never declared; the collected links live in addUrl.
  console.log(addUrl[0]);
});

Issue when doing web scraper

I am scraping the webpage https://www.g2a.com/rising-storm-2-vietnam-steam-cd-key-global.html
I need to get the title from the table data.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
// GET /scrape: download the product page, collect the text of each table
// row's .mp-rating-popup element and print the result to the console.
app.get('/scrape', function(req, res) {
  // Declared locally; the original assignment created an implicit global.
  var url = 'https://www.g2a.com/rising-storm-2-vietnam-steam-cd-key-global.html';
  request(url, function(error, response, body) {
    if (error) {
      // Previously a failed request was ignored without any trace.
      console.log(error);
    } else {
      var $ = cheerio.load(body);
      var arr = [];
      $('.mp-user-rating tr').each(function(i, row) {
        // Search inside the current row; a document-wide selector would push
        // the same combined text once per row.
        var tableData = $(row).find('.marketplace-name > .mp-rating-popup');
        arr.push({ 'title': tableData.text() });
      });
      // The response tells the caller to check the console, so print the data.
      console.log(arr);
    }
    res.send('Check your console!')
  });
})
app.listen('8081');
console.log('Magic happens on port 8081');
exports = module.exports = app;
The data is in the third column, and I am unable to get the expected data from the .mp-user-rating tr rows.
The image shows the structure of the table
Any help would be appreciated.
So, I went to the page and ran this in the console.
// Browser-console version: one { title } entry per row of the rating table.
var arr = [];
var title = jQuery('.mp-user-rating tr').each(function(index, row) {
  var popupText = jQuery(row).find('.mp-rating-popup').text();
  arr.push({ 'title': popupText });
});
console.log(arr);
The array consists of 8 objects that each have the titles within them.
UPDATE:
I pulled in the HTML using your code. I think the issue is that the website loads this content asynchronously, so fetching the HTML only retrieves the static markup. You will need to use PhantomJS or Chrome's headless browser to load the website and let the asynchronous content load; then you can grab the HTML.
See here for some good docs on PhantomJS: https://github.com/Medium/phantomjs

Is Promise.all not working on the second time through? Why not?

I'm just finishing off this basic webscraper project for a tshirt website.
It enters through one hardcoded url, the home page. It will search for any product pages, and add them to an url. If it finds another link (remainder), it will scrape that again and find any more product pages. It adds the product pages to urlSet and will then scrape those again, grab the tshirt data (price, img, title) and then convert, then write them to a CSV file.
For some reason, this is not working on the second run through of the scrape with 'remainder'.
If I remove the second scrape of url, everything works out fine and the file gets written correctly. But if I want to get the other product pages, it seems to be failing somewhere.
Here is my code, i apologise for posting so much of it but I don't know how it will be understood properly without the right context, hopefully it's been commented okay:
//TASK: Create a command line application that goes to an ecommerce site to get the latest prices.
//Save the scraped data in a spreadsheet (CSV format).
'use strict';
//Modules being used:
var cheerio = require('cheerio');
var json2csv = require('json2csv');
var request = require('request');
var moment = require('moment');
var fs = require('fs');
//harcoded url
var url = 'http://shirts4mike.com/';
//url for tshirt pages
var urlSet = new Set();
var remainder;
var tshirtArray = [];
/**
 * Promise wrapper around request().
 *
 * @param {string} url - Address to fetch.
 * @returns {Promise<string>} Resolves with the response body on HTTP 200;
 *   rejects on a transport error or on any other status code.
 */
const requestPromise = function(url) {
  return new Promise(function(resolve, reject) {
    request(url, function(error, response, html) {
      if (error) return reject(error);
      if (response.statusCode !== 200) {
        // Previously this case settled nothing, so a Promise.all() over a
        // list containing one bad url waited forever.
        return reject(new Error('Request for ' + url + ' failed with status ' + response.statusCode));
      }
      return resolve(html);
    });
  });
}
// Go into webpage via url, load html and grab links shirt in url
// Fetch `url`, then resolve with one entry per anchor whose href contains
// "shirt": the page url with that href appended.
// NOTE(review): this is plain string concatenation, so when `url` is not the
// site root the entries look like '.../shirts.phpshirt.php?id=101'.
function scrape (url) {
  console.log("Currently scraping " + url)
  return requestPromise(url).then(function (html) {
    var $ = cheerio.load(html);
    var found = [];
    $('a[href*=shirt]').each(function (index, anchor) {
      found.push(url + $(anchor).attr('href'));
    });
    return found;
  });
}
// Request every link in parallel; resolves with the fetched pages together
// with the urls they came from (same order in both arrays).
function nextStep (arrayOfLinks) {
  console.log(arrayOfLinks);
  var pending = arrayOfLinks.map(function (link) {
    return requestPromise(link);
  });
  return Promise.all(pending).then(function (pages) {
    return { arrayOfHtml: pages, arrayOfUrls: arrayOfLinks };
  });
}
//go through the html of each url and add to urlSet if there is a checkout button
//add to remainder otherwise to rescrape
// Classify each fetched page: a page containing a [type=submit] element is
// recorded in urlSet; the first page without one becomes `remainder`, unless
// the module-level `remainder` has already been set. Returns `remainder`.
function lastStep (obj){
  var pages = obj.arrayOfHtml;
  for (var index = 0; index < pages.length; index += 1) {
    var pageUrl = obj.arrayOfUrls[index];
    var $ = cheerio.load(pages[index]);
    var hasSubmit = $('[type=submit]').length !== 0;
    if (hasSubmit) {
      urlSet.add(pageUrl);
      console.log(pageUrl);
      continue;
    }
    if (remainder == undefined) {
      remainder = pageUrl;
      console.log("The remainder is " + remainder)
    }
  }
  return remainder;
}
//iterate through urlSet (product pages and grab html)
// Request every url stored in urlSet; resolves with the fetched pages in the
// set's iteration order.
function lastScraperPt1(){
  var pending = [];
  urlSet.forEach(function (productUrl) {
    pending.push(requestPromise(productUrl));
  });
  return Promise.all(pending).then(function (pages) {
    return pages;
  });
}
//iterate over the html of the product pages and store data as objects
/**
 * Build one t-shirt record per product page, append the records to
 * tshirtArray and then write the CSV via convertJson2Csv().
 *
 * @param {string[]} html - Product page markup, in urlSet iteration order
 *   (as produced by lastScraperPt1).
 */
function lastScraperPt2(html){
  // html[i] was fetched from the i-th url in urlSet, so index the set's
  // contents to give each row its own page address. The original stored the
  // module-level `url` (the home page) in every row.
  var productUrls = Array.from(urlSet);
  for(var i = 0; i < html.length; i++){
    var $ = cheerio.load(html[i]);
    //grab data and store as variables
    var price = $('.price').text();
    var imgURL = $('.shirt-picture').find('img').attr('src');
    // slice(4) drops the first four characters of the heading text.
    var title = $('body').find('.shirt-details > h1').text().slice(4);
    var tshirtObject = {};
    //add values into tshirt object
    tshirtObject.Title = title;
    tshirtObject.Price = price;
    tshirtObject.ImageURL = imgURL;
    tshirtObject.URL = productUrls[i];
    tshirtObject.Date = moment().format('MMMM Do YYYY, h:mm:ss a');
    //add the object into the array of tshirts
    tshirtArray.push(tshirtObject);
  }
  convertJson2Csv();
}
//convert tshirt objects and save as CSV file
// Convert tshirtArray to CSV and write it to ./data/<MM-DD-YY>.csv,
// creating the data directory first when it does not exist.
function convertJson2Csv(){
  //The scraper should generate a folder called `data` if it doesn’t exist.
  var dir ='./data';
  if(!fs.existsSync(dir)){
    fs.mkdirSync(dir);
  }
  var fields = ['Title', 'Price', 'ImageURL', 'URL', 'Date'];
  //convert tshirt data into CSV and pass in fields
  var csv = json2csv({ data: tshirtArray, fields: fields });
  //Name of file will be the date
  var fileDate = moment().format('MM-DD-YY');
  var fileName = dir + '/' + fileDate + '.csv';
  // fs.writeFile replaces an existing file by default; `overwrite` is not one
  // of its options, so the options object has been dropped.
  fs.writeFile(fileName, csv, function(err) {
    // Check the error first so 'file saved' is only printed on success.
    if (err) throw err;
    console.log('file saved');
  });
}
// Pipeline: two link-discovery passes (scrape -> nextStep -> lastStep), then
// fetch the product pages and build the rows.
scrape(url) //scrape from original entry point
.then(nextStep)
.then(lastStep)
.then(scrape) //scrape again but with remainder url
.then(nextStep)
.then(lastStep)
// lastStep's return value is passed along here, but lastScraperPt1 declares no parameters.
.then(lastScraperPt1)
.then(lastScraperPt2)
.catch(function(err) {
// handle any error from any request here
console.log(err);
});
I'm console logging the arrayOfLinks in nextStep so I can see that they are being grabbed properly, I just cannot work out why they aren't being passed through to 'lastStep' properly.
Currently scraping http://shirts4mike.com/
[ 'http://shirts4mike.com/shirts.php',
'http://shirts4mike.com/shirts.php',
'http://shirts4mike.com/shirt.php?id=108',
'http://shirts4mike.com/shirt.php?id=107',
'http://shirts4mike.com/shirt.php?id=106',
'http://shirts4mike.com/shirt.php?id=105' ]
The remainder is http://shirts4mike.com/shirts.php
http://shirts4mike.com/shirt.php?id=108
http://shirts4mike.com/shirt.php?id=107
http://shirts4mike.com/shirt.php?id=106
http://shirts4mike.com/shirt.php?id=105
Currently scraping http://shirts4mike.com/shirts.php
[ 'http://shirts4mike.com/shirts.phpshirts.php',
'http://shirts4mike.com/shirts.phpshirt.php?id=101',
'http://shirts4mike.com/shirts.phpshirt.php?id=102',
'http://shirts4mike.com/shirts.phpshirt.php?id=103',
'http://shirts4mike.com/shirts.phpshirt.php?id=104',
'http://shirts4mike.com/shirts.phpshirt.php?id=105',
'http://shirts4mike.com/shirts.phpshirt.php?id=106',
'http://shirts4mike.com/shirts.phpshirt.php?id=107',
'http://shirts4mike.com/shirts.phpshirt.php?id=108' ]
BUT if I choose to only call the first scrape and don't call the second, like this:
// Shortened pipeline: a single discovery pass, with no re-scrape of the remainder url.
scrape(url) //scrape from original entry point
.then(nextStep)
.then(lastStep)
.then(lastScraperPt1)
.then(lastScraperPt2)
.catch(function(err) {
// handle any error from any request here
console.log(err);
});
... Then everything works. I just don't get to all the urls.
What is happening here and how can I fix it? Thank you guys
The issue is that tshirtArray is not defined in convertJson2Csv(). In lastScraperPt2, pass tshirtArray to convertJson2Csv():
convertJson2Csv(tshirtArray)
at convertJson2Csv
// Signature sketch: tshirtArray arrives as an argument.
function convertJson2Csv(tshirtArray) {
// do stuff
}
One problem seems to be in your lastStep. It looks like you mean for remainder to be another array of urls. Correct me if I'm wrong there. However, what's happening is that the first time the if($('[type=submit]').length !== 0) condition fails, you'll automatically go down to the next block, because remainder starts out undefined. Whatever the current url is, you assign that one to remainder. For the rest of the iterations of your for-loop, you will never again hit the condition where remainder == undefined. So you will only ever end up with one url assigned to remainder, while any others that you were hoping to get will simply be passed over.
You might want to define remainder as remainder = [];. And then instead of saying else if (remainder == undefined), you would just say
} else {
remainder.push(obj.arrayOfUrls[i]);
}
However, you would then be passing an array of urls to scrape, while scrape expects only a single url. If this is what you want, and I am right in assuming that you mean for remainder to be an array of urls, you could define a new function as follows:
/**
 * Request every remainder url in parallel.
 *
 * @param {string[]} remainders - Urls to fetch.
 * @returns {Promise<Array>} Resolves with the flattened list of results.
 */
function scrapeRemainders(remainders) {
  var promises = [];
  // Iterate the parameter; the original read an undeclared `remainder`.
  remainders.forEach(function (url) {
    promises.push(requestPromise(url));
  });
  return Promise.all(promises).then(function (results) {
    // Return the flattened array; without `return` the chain received undefined.
    return _.flattenDeep(results);
  })
}
Then, instead of the second scrape in your promise chain, you would use scrapeRemainders. Also, for the _ used in the previous function, you would need to npm install lodash and then add var _ = require('lodash'). On a side note, lodash has nothing to do with promises, but it is a great tool for data manipulation. You should look into it when you have the chance.
Also, in lastScraperPt1, you can change
return Promise.all(promiseArray)
.then(function(arrayOfHtml){
return arrayOfHtml;
});
to
return Promise.all(promiseArray);
It does the same thing.
Hope this helps. If this does not answer your question, comment at me and I can change my answer accordingly.
All fixed, it was grabbing the wrong urls in scrape(). Though I only knew this after I logged the statusCodes to the console :
//TASK: Create a command line application that goes to an ecommerce site to get the latest prices.
//Save the scraped data in a spreadsheet (CSV format).
'use strict';
//Modules being used:
var cheerio = require('cheerio');
var json2csv = require('json2csv');
var request = require('request');
var moment = require('moment');
var fs = require('fs');
//harcoded url
var urlHome = 'http://shirts4mike.com/';
//url for tshirt pages
var urlSet = [];
var tshirtArray = [];
// Promise wrapper around request(): rejects (after calling errorHandler) on a
// transport error, resolves with the body on status 200, and for any other
// status logs the code and resolves with an empty string.
const requestPromise = function(url) {
  return new Promise(function(resolve, reject) {
    request(url, function(error, response, html) {
      if (error) {
        errorHandler(error);
        reject(error);
        return;
      }
      var status = response.statusCode;
      if (status == 200) {
        resolve(html);
        return;
      }
      console.log("response code is " + status);
      resolve("");
    });
  });
}
// Go into webpage via url, load html and grab links shirt in url
/**
 * Fetch a page and list the absolute address of every link whose href
 * contains "shirt".
 *
 * @param {string} url - Absolute address of the page to scan.
 * @returns {Promise<string[]>} Absolute urls, in document order.
 */
function scrape (url) {
  console.log("Currently scraping " + url)
  return requestPromise(url)
  .then(function(html) {
    var $ = cheerio.load(html);
    var links = [];
    //get all the links
    $('a[href*=shirt]').each(function(i, anchor){
      var href = $(anchor).attr('href');
      // Resolve against the page that was fetched rather than gluing strings
      // onto a hard-coded base: this also handles root-relative and absolute
      // hrefs, and no longer shadows the global URL constructor.
      links.push(new URL(href, url).href);
    });
    // return array of links
    return links;
  });
}
// Fetch all links concurrently and pair the resulting pages with their urls.
function nextStep (arrayOfLinks) {
  console.log(arrayOfLinks);
  var requests = [];
  arrayOfLinks.forEach(function (link) {
    requests.push(requestPromise(link));
  });
  return Promise.all(requests).then(function (htmlList) {
    var result = {};
    result.arrayOfHtml = htmlList;
    result.arrayOfUrls = arrayOfLinks;
    return result;
  });
}
//go through the html of each url and add to urlSet if there is a checkout button
//add to remainder otherwise to rescrape
/**
 * Sort fetched pages into product pages and a page to re-scrape.
 *
 * A page containing a [type=submit] element is treated as a product page and
 * its url is appended to urlSet (once). The first page without one is
 * returned as the remainder.
 *
 * @param {{arrayOfHtml: string[], arrayOfUrls: string[]}} obj - Pages and
 *   their urls, index-aligned.
 * @returns {string|undefined} Url to scrape again, or undefined if every
 *   page was a product page.
 */
function lastStep (obj){
  var remainder;
  for(var i = 0; i < obj.arrayOfHtml.length; i++){
    var pageUrl = obj.arrayOfUrls[i];
    var $ = cheerio.load(obj.arrayOfHtml[i]);
    //if page has a submit it must be a product page
    if($('[type=submit]').length !== 0){
      // urlSet is a plain array in this script, so a product that is linked
      // from both scraped pages would otherwise be stored (and later
      // exported) twice.
      if(urlSet.indexOf(pageUrl) === -1){
        urlSet.push(pageUrl);
        console.log(pageUrl);
      }
    } else if(remainder === undefined) {
      //if not a product page, add it to remainder so it another scrape can be performed.
      remainder = pageUrl;
      console.log("The remainder is " + remainder)
    }
  }
  //return remainder for second run-through of scrape
  return remainder;
}
//iterate through urlSet (product pages and grab html)
// Request each url collected in urlSet and resolve with the pages, in the
// same order as urlSet.
function lastScraperPt1(){
  var inFlight = [];
  urlSet.forEach(function (productUrl) {
    inFlight.push(requestPromise(productUrl));
  });
  return Promise.all(inFlight).then(function (pages) {
    return pages;
  });
}
//iterate over the html of the product pages and store data as objects
// Turn each product page into a t-shirt record (Title, Price, ImageURL, URL,
// Date), append the records to tshirtArray and return that array.
// html[i] is paired with urlSet[i].
function lastScraperPt2(html){
  for (var index = 0; index < html.length; index += 1) {
    var $ = cheerio.load(html[index]);
    var priceText = $('.price').text();
    var imageSrc = $('.shirt-picture').find('img').attr('src');
    // slice(4) drops the first four characters of the heading text.
    var heading = $('body').find('.shirt-details > h1').text().slice(4);
    tshirtArray.push({
      Title: heading,
      Price: priceText,
      ImageURL: urlHome + imageSrc,
      URL: urlSet[index],
      Date: moment().format('MMMM Do YYYY, h:mm:ss a')
    });
  }
  return tshirtArray;
}
//conver tshirt objects and save as CSV file
/**
 * Convert the t-shirt records to CSV and write them to ./data/<MM-DD-YY>.csv,
 * creating the data directory first when it does not exist.
 *
 * @param {Object[]} tshirtArray - Records with Title, Price, ImageURL, URL
 *   and Date properties.
 */
function convertJson2Csv(tshirtArray){
  //The scraper should generate a folder called `data` if it doesn’t exist.
  var dir ='./data';
  if(!fs.existsSync(dir)){
    fs.mkdirSync(dir);
  }
  var fields = ['Title', 'Price', 'ImageURL', 'URL', 'Date'];
  //convert tshirt data into CSV and pass in fields
  var csv = json2csv({ data: tshirtArray, fields: fields });
  //Name of file will be the date
  var fileDate = moment().format('MM-DD-YY');
  var fileName = dir + '/' + fileDate + '.csv';
  // fs.writeFile replaces an existing file by default; `overwrite` is not one
  // of its options, so the options object has been dropped.
  fs.writeFile(fileName, csv, function(err) {
    // Report a failure instead of announcing 'file saved' first.
    if (err) {
      errorHandler(err);
      return;
    }
    console.log('file saved');
  });
}
// Pipeline: two link-discovery passes (scrape -> nextStep -> lastStep), then
// fetch the product pages, build the rows and write the CSV.
scrape(urlHome) //scrape from original entry point
.then(nextStep)
.then(lastStep)
.then(scrape)
.then(nextStep)
.then(lastStep)
// lastStep's return value is passed along here, but lastScraperPt1 declares no parameters.
.then(lastScraperPt1)
.then(lastScraperPt2)
.then(convertJson2Csv)
.catch(function(err) {
// handle any error from any request here
console.log(err);
});
//If the site is down, an error message describing the issue should appear in the console.
//This is to be tested by disabling wifi on your device.
//When an error occurs log it to a file scraper-error.log . It should append to the bottom of the file with a time stamp and error
/**
 * Print an error to the console and append a timestamped line for it to
 * scraper-error.log.
 *
 * @param {Error} error - The failure to report; only `message` is used.
 */
var errorHandler = function (error) {
  console.log(error.message);
  // `url` is not defined at module level in this script (the site address is
  // urlHome), so the original line threw a ReferenceError under 'use strict'.
  console.log('The scraper could not scrape data from ' + urlHome + ' there is either a problem with your internet connection or the site may be down');
  /**
   * create new date for log file
   */
  var loggerDate = new Date();
  /**
   * create message as a variable
   */
  var errLog = '[' + loggerDate + '] ' + error.message + '\n';
  /**
   *when the error occurs, log that to the error logger file
   */
  fs.appendFile('scraper-error.log', errLog, function (err) {
    if (err) throw err;
    console.log('There was an error. The error was logged to scraper-error.log');
  });
};

Node.js - Looping through array of URLS one at a time

I am a beginner at Node.js and I'm trying to write a web scraping script. I got permission from the site admin to scrape their products as long as I make fewer than 15 requests a minute. When I started out, the script requested all the URLs at once; after some tinkering I was able to go through the items in the array one at a time, but the script doesn't stop when there are no more items in the array. I'm not really happy with my result and feel like there is a better way to do this.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
var async = require('async');
// GET /scrape: starts timed requests for the product pages and answers the
// HTTP request immediately; scraped titles are printed to the console.
app.get('/scrape', function(req, res){
// Assigned without var/let/const.
productListing = ['ohio-precious-metals-1-ounce-silver-bar','morgan-1-ounce-silver-bar']
// Shared counter: selects which productListing entry the next request uses.
var i = 0;
// Concurrency limit of 1. Neither `product` nor `callback` is used inside
// this iterator function.
async.eachLimit(productListing, 1, function (product, callback) {
var getProducts = function () {
// The url is built from productListing[i], not from `product`.
var url = 'http://cbmint.com/' + productListing[i];
request(url, function(error, response, html) {
if(!error){
var $ = cheerio.load(html);
var title;
var json = { title : ""};
$('.product-name').filter(function(){
var data = $(this);
title = data.children().children().first().text();
json.title = title;
})
}
var theTime = new Date().getTime();
console.log(i);
// `json` is only assigned inside the if(!error) branch above, so when
// `error` is set it is still undefined at this point.
console.log(json.title);
console.log(theTime);
i++;
});
}
// Runs getProducts every 10000 ms; nothing in this handler clears the interval.
setInterval(getProducts,10000);
})
res.send('Check your console!')
})
app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;
You aren't calling callback inside the iterator function. Take a look at the docs for eachLimit.

How to find all the links in a webpage that has a specific file extension?

Is it possible to find a href in a website that has a certain file extension. for example it would print http://www.test.com/something.mp3 http://www.test.com/somelinktoamuscifile.mp3 http://www.test.com/music.mp3.
It would show all of links, with a file extension of .mp3 for example.
would you do
var extension = ".mp3"
// checker is url with the extension appended to it.
var checker = url + extension
// NOTE(review): for a string url, url + ".mp3" is always longer than url, so
// this comparison cannot succeed as written.
if(url == checker){console.log(url);}
So you want to extract all links that contain a certain string from any given url?
Maybe this script will help you:
var request = require('request');
var cheerio = require('cheerio');
var url = "http://www.stackoverflow.com";
var toFind = "delete" //use file extension or whatever you want to find
// Download the page and print every href that contains toFind.
request(url, function(err, resp, body) {
  if (err) throw err;
  var $ = cheerio.load(body);
  $('a').each(function (index, anchor) {
    var target = $(anchor).attr('href');
    // Skip anchors with no (or an empty) href and hrefs without a match.
    if (!target || target.indexOf(toFind) === -1) {
      return;
    }
    console.log(target);
  });
})
Output:
$ node scraping.js
http://ux.stackexchange.com/questions/49991/should-yes-delete-it-be-red-or-green
Just change the content of url and toFind. There is a good tutorial on web scraping here and here. Of course this can be done in a lot of different programming languages. I merely used javascript because you tagged it that way.
Here is a native javascript solution that works in current browsers (IE8+, Chrome, Firefox) without jQuery.
// Log every anchor element in the document whose href ends with `extension`.
function getLinksWithExtension(extension) {
  var selector = 'a[href$="' + extension + '"]';
  var matches = document.querySelectorAll(selector);
  var index = 0;
  while (index < matches.length) {
    console.log(matches[index]);
    index += 1;
  }
}
I think it goes like this:
var mp3_extension = '.mp3';
var url_string = url.split('.');
// split('.') removes the dots, so the last piece is 'mp3'; put the dot back
// before comparing, otherwise the check can never match '.mp3'.
var url_extension = '.' + url_string[url_string.length-1];
if(url_extension === mp3_extension){
  //go go go!!!
}

Categories

Resources