I am trying to get the href values of all link elements on a page.
var request = require('request');
var cheerio = require('cheerio');
var url = "https://www.mismarcadores.com";

// Download the page at `url` and collect the href attribute of every <a> element.
request(url, function (err, resp, body) {
  if (err) throw err;
  var $ = cheerio.load(body);
  var addUrl = [];
  $('a').each(function (i, element) {
    var href = $(element).attr('href');
    addUrl.push(href);
  });
  // Log the first collected link. The array is named `addUrl`; no variable
  // called `array` exists, so referencing it would throw a ReferenceError.
  console.log(addUrl[0]);
});
I have this code that adds the links to an array called addUrl. This works perfectly, but now I am looking for a way to add a URL to the array only if its href contains the word 'baloncesto'.
Good example : https://www.mismarcadores.com/baloncesto/alemania/
This URL , is good but
Bad example : https://www.mismarcadores.com/golf/
This is wrong.
I am developing this using NodeJS, but this is just plain JavaScript and I don't know how to do it.
Could anyone help me?
Please try like this:
var request = require('request');
var cheerio = require('cheerio');
var url = "https://www.mismarcadores.com";
var filterStr = 'baloncesto';

// Download the page at `url` and collect every link whose href contains `filterStr`.
request(url, function (err, resp, body) {
  if (err) throw err;
  var $ = cheerio.load(body);
  var addUrl = [];
  $('a').each(function (i, element) {
    var href = $(element).attr('href');
    // attr() returns undefined for anchors that have no href attribute;
    // calling .includes on undefined would throw, so skip those anchors.
    if (typeof href === 'string' && href.includes(filterStr)) {
      addUrl.push(href);
    }
  });
  console.log(addUrl);
});
Okay, so what you're looking for is a pattern match - if href contains the string baloncesto. I suspect that this SO thread will be helpful to you - you're basically looking to nest this line -
addUrl.push(href);
- in an if statement, like
// Only keep the link when an href is present and contains 'baloncesto'.
// (href is undefined for anchors without an href attribute.)
if (typeof href === 'string' && href.includes('baloncesto')) {
  addUrl.push(href);
}
...but definitely look at that other answer for reference, in case you're using a version of JS that's older and not compatible with ES6.
you can try this
// Standard JavaScript strings have no contains() method; includes() is the
// substring test. Anchors without an href attribute yield undefined, so
// check the type before calling it.
if (typeof href === 'string' && href.includes('baloncesto')) {
  addUrl.push(href);
}
Try this
var request = require('request');
var cheerio = require('cheerio');
var url = "https://www.mismarcadores.com";

// Download the page at `url` and collect every link whose (percent-decoded)
// href contains the word 'baloncesto'.
request(url, function (err, resp, body) {
  if (err) throw err;
  var $ = cheerio.load(body);
  var addUrl = [];
  $('a').each(function (i, element) {
    var href = $(element).attr('href');
    // Anchors without an href attribute yield undefined; nothing to test.
    if (typeof href !== 'string') return;

    var decoded;
    try {
      decoded = decodeURIComponent(href);
    } catch (e) {
      // decodeURIComponent throws URIError on malformed percent escapes;
      // fall back to matching against the raw href instead of aborting.
      decoded = href;
    }

    // Strings have includes(), not contains().
    if (decoded.includes('baloncesto')) {
      addUrl.push(href);
    } else {
      //do something else
    }
  });
  // The collected links live in `addUrl`; there is no variable named `array`.
  console.log(addUrl[0]);
});
Related
I am trying to develop a web scraper using Cheerio to parse an HTML table and output the results in a CSV file, unfortunately the code won't return anything. My code is the following:
var request = require("request");
var cheerio = require("cheerio");
var fs = require("fs");
var url = "https://stat.epa.gov.tw/";

// Download the page, collect the concatenated <td> text of each grandchild of
// `table.tableAQI`, and write the non-empty results to output2.csv.
request(url, function (err, response, html) {
  if (err) {
    // Report the failure instead of silently doing nothing.
    console.log(err);
    return;
  }
  var $ = cheerio.load(html);
  var allitems = $('table.tableAQI').children().children();
  var items = [];
  allitems.each(function (index, element) {
    // Use the element handed to the callback rather than re-running the whole
    // selector chain and picking eq(index) on every iteration.
    var result = $(element).find("td").text();
    if (result !== "") {
      items.push(result);
    }
  });
  // NOTE(review): the file is named .csv but the content written is JSON.
  fs.writeFile("output2.csv", JSON.stringify(items, null, 4), function (err) {
    if (err) {
      console.log(err);
    } else {
      console.log("Data has been written");
    }
  });
  // fs.writeFile is asynchronous, so this may print before the message above.
  console.log(items);
});
The website provides 2 tables I tried the same code to parse the other table ('table.aqicolor') and it worked just fine so I am not really sure what is wrong.
I appreciate your support.
I am trying to scrape some menu information from the following restaurant: Red Cow url: "http://redcowmn.com/50th-street-menu/"
I want the menu item name and the price; for now I am troubleshooting just getting the name information. Can someone help me troubleshoot my code? Thanks.
I have tried the following code:
var request = require("request");
var cheerio = require("cheerio");
var fs = require("fs");
var url = "http://redcowmn.com/50th-street-menu/";

// Download the menu page, look up menu item titles inside the block with id
// "block-yui_3_17_2_9_1423750151911_47600", and write the non-empty results
// to output1.txt as JSON.
request(url, function (err, response, html) {
  if (err) {
    // Report the failure instead of silently doing nothing.
    console.log(err);
    return;
  }
  var $ = cheerio.load(html);
  var allItems = $("#block-yui_3_17_2_9_1423750151911_47600").children();
  var items = [];
  allItems.each(function (index) {
    // NOTE(review): this lookup does not depend on `index`; it always inspects
    // the same node (first child -> sixth child), so every iteration yields the
    // same text. It presumably should be derived from the current element.
    // NOTE(review): "#menu-item-title" is an id selector; if the page marks
    // titles with a class, this would need to be ".menu-item-title" - confirm
    // against the page markup.
    var result = $("#block-yui_3_17_2_9_1423750151911_47600").children().eq(0).children().eq(5).find("#menu-item-title").text();
    if (result !== "") {
      items.push(result);
    }
  });
  fs.writeFile("output1.txt", JSON.stringify(items, null, 4), function (err) {
    if (err) {
      console.log(err);
    } else {
      console.log("Data has been added to a file.");
    }
  });
  // fs.writeFile is asynchronous, so this may print before the message above.
  console.log(items);
});
I am trying to get the names of the matches in the "Starting soon" section of this website: https://favbet.com/en/bets/
This is my code:
// `matches` is part of the same `var` declaration list, so it is a declared
// variable rather than an implicit global.
var request = require('request'),
  cheerio = require('cheerio'),
  matches = [];

// Download the page and collect the text of every `li.col20` under #startsoon.
request('https://favbet.com/en/bets/', function (err, resp, body) {
  if (!err && resp.statusCode === 200) {
    var $ = cheerio.load(body);
    $('#startsoon').find('li.col20').each(function (i, element) {
      // The callback receives a raw element, which has no text() method;
      // wrap it with $() to get the cheerio API.
      var match = $(element).text();
      matches.push(match);
    });
    console.log(matches);
  }
});
console.log prints an empty array. I don't quite understand how the deep nesting works in this case.
I assign an empty array to the global variable artistURLs. I then push strings (the local variable artistURL) into the artistURLs array within the Cheerio .each() iterator method.
var request = require('request'),
  cheerio = require('cheerio'),
  url = 'http://www.primarytalent.com/',
  artistURLs = [];

// Download the roster page and build an absolute URL for every roster link.
request(url, function (err, resp, body) {
  if (err)
    throw err;
  // Declared locally so `$` does not leak as an implicit global.
  var $ = cheerio.load(body);
  $('#rosterlists div li a').each(function (i, element) {
    // Wrap the element with $() so attr() is available regardless of whether
    // the callback is handed a raw element or a wrapped one.
    var href = $(element).attr('href');
    // Anchors without an href attribute yield undefined; skip them.
    if (typeof href !== 'string') return;
    // Drop the href's first character before appending it to `url`.
    var urlCap = href.slice(1);
    var artistURL = url.concat(urlCap);
    artistURLs.push(artistURL);
  });
  console.log(artistURLs);
});
I know that artistURL is successfully being pushed into artistURLs because
console.log(artistURLs);
will display the populated array in my terminal. The problem is if I try running console.log(artistURLs); outside of the callback function in the global scope. For example
var request = require('request'),
  cheerio = require('cheerio'),
  url = 'http://www.primarytalent.com/',
  artistURLs = [];

// Download the roster page and build an absolute URL for every roster link.
request(url, function (err, resp, body) {
  if (err)
    throw err;
  // Declared locally so `$` does not leak as an implicit global.
  var $ = cheerio.load(body);
  $('#rosterlists div li a').each(function (i, element) {
    // Wrap the element with $() so attr() is available regardless of whether
    // the callback is handed a raw element or a wrapped one.
    var href = $(element).attr('href');
    // Anchors without an href attribute yield undefined; skip them.
    if (typeof href !== 'string') return;
    var urlCap = href.slice(1);
    var artistURL = url.concat(urlCap);
    artistURLs.push(artistURL);
  });
});
// This statement runs right after request() is called, before the callback
// above has been invoked, so artistURLs is still empty at this point.
console.log(artistURLs);
So you can see that I have moved console.log(artistURLs); outside of request(). For some reason, trying to access artistURLs in the global scope returns an empty array, as if all the processing that happened within `request()` never even happened.
Why is this happening, and how can I ensure that all the URLs that are pushed into artistURLs remain in artistURLs?
Thanks
The request() module is asynchronous, so when console.log() is placed outside the callback it executes before the HTTP call completes. For example, take this code:
var request = require('request');
var cheerio = require('cheerio');
var url = 'http://www.primarytalent.com/';
var artistURLs = [];

// Demonstrates the ordering: the callback below only runs once the HTTP
// response has arrived.
request(url, function (err, resp, body) {
  if (err)
    throw err;
  // Declared locally so `$` does not leak as an implicit global.
  var $ = cheerio.load(body);
  $('#rosterlists div li a').each(function (i, element) {
    // Wrap the element with $() so attr() is available regardless of whether
    // the callback is handed a raw element or a wrapped one.
    var href = $(element).attr('href');
    // Anchors without an href attribute yield undefined; skip them.
    if (typeof href !== 'string') return;
    var urlCap = href.slice(1);
    var artistURL = url.concat(urlCap);
    artistURLs.push(artistURL);
    console.log('push');
  });
});
// Executes before the callback above, so it prints the still-empty array first.
console.log(artistURLs);
You will see this result:
[]
push
push
...
push
push
push
To prevent this from happening, only use the variable artistURLs inside of the HTTP request callback.
This server gets all links of google's first site:
var http = require('http');
var qs = require('querystring');
var request = require("request");
var cheerio = require('cheerio');
// Start an HTTP server listening on 127.0.0.1:1337. No request handler is
// passed to createServer() here.
http.createServer().listen(1337, "127.0.0.1");
/**
 * Extract the href of every <a> element in an HTML document.
 *
 * @param {string} body - HTML source to parse.
 * @param {function(string): void} callback - Called once with all hrefs
 *   concatenated, each one preceded by a newline character.
 */
function parsehl(body, callback) {
  // Declared locally so `$` and `links` do not leak as implicit globals
  // (assigning to an undeclared name throws in strict mode / ES modules).
  var $ = cheerio.load(body);
  var links = $('a'); // jQuery-style selector: get all hyperlinks ???????
  var hyperlinks = [];
  // `links` is already a selection, so it can be iterated directly.
  links.each(function (i, link) {
    hyperlinks.push('\n' + $(link).attr('href'));
  });
  callback(hyperlinks.join(''));
}
// Fetch the search results page and print every link found in it.
request({ uri: 'http://www.google.com/search?q=rio&start=00' }, function (error, response, body) {
  console.log('url requested ');

  // Bail out early on a failed request; otherwise hand the HTML to parsehl.
  if (error) {
    console.log(error);
    return;
  }

  parsehl(body, function (hrefs) {
    console.log(hrefs);
  });
});
This server gets the links by tag name a.
The links we need have the class name l (it looks like the number 1 but is the letter l, as in "little").
How can we get the links here by class name?
We probably only need to change the one line where we put the question marks.
Have you tried:
// CSS selector `a.l`: only <a> elements that have the class `l`.
links = $('a.l');
?