Javascript: Parsing HTML table with Cheerio - javascript

I am trying to develop a web scraper using Cheerio to parse an HTML table and output the results to a CSV file. Unfortunately, the code doesn't return anything. My code is the following:
// Fetch the page, collect the <td> text of each row of table.tableAQI,
// write the collected strings to output2.csv as JSON and log them.
var request = require("request");
var cheerio = require("cheerio");
var fs = require("fs");
var url = "https://stat.epa.gov.tw/";

request(url, function (err, response, html) {
  // Report a failed request instead of silently doing nothing.
  if (err) {
    console.log(err);
    return;
  }

  var $ = cheerio.load(html);
  var items = [];

  // Visit the grandchildren of the table and reuse the element handed to the
  // callback, rather than re-running the full selector once per row.
  $("table.tableAQI").children().children().each(function (index, row) {
    var result = $(row).find("td").text();
    // Skip rows that contain no <td> text at all.
    if (result !== "") {
      items.push(result);
    }
  });

  fs.writeFile("output2.csv", JSON.stringify(items, null, 4), function (err) {
    if (err) {
      console.log(err);
    } else {
      console.log("Data has been written");
    }
  });
  console.log(items);
});
The website provides two tables. I tried the same code to parse the other table ('table.aqicolor') and it worked just fine, so I am not sure what is wrong.
I appreciate your support.

Related

Write CSV file column title in javascript

This is my code. I am trying to output my data as CSV, and that part works, but I can't add the column titles: the CSV file is saved, but the column titles are missing. Can anyone help with this?
// Stamp every record with the current wall-clock time (H:M:S, not padded)
// and save the rows to tempN.csv as double-quoted, comma-separated values.
var d = new Date();
var t = [d.getHours(), d.getMinutes(), d.getSeconds()].join(":");
var data = dataAWS;

// One `time","value` fragment per record; the opening and closing quotes are
// supplied by the join separator and by the wrapping below.
var rows = data.map(function (record) {
  return t + '","' + record.payload.test_data;
});
var result = rows.join('"\n"');
var csvData = '"' + result + '"';

var fs = require('fs');
fs.writeFile("tempN.csv", csvData, 'utf8', function (writeError) {
  if (!writeError) {
    console.log("CSV file has been saved.");
    return;
  }
  console.log("An error occured while saving CSV.");
  return console.log(writeError);
});
Without knowing more I would suggest something like:
// Build the header line on its own: the header has no `payload` property, so
// it must not go through the same map() as the data rows, and `data` itself
// is left unmodified.
// Assumes column_headers is an array of title strings — TODO confirm.
var headerLine = column_headers.join('","');
// Rows are joined with '"\n"' so that wrapping the whole result in a single
// pair of double quotes closes and reopens the quoting on every line.
var result = [headerLine].concat(data.map(function(val) {
  return t+'","'+val.payload.test_data;
})).join('"\n"');
etc, etc

How to add into array the url that contains specific String?

I am trying to get the href values of all the link (a) elements on a page:
// Fetch the page and collect the href attribute of every <a> element.
var request = require('request');
var cheerio = require('cheerio');
var url = "https://www.mismarcadores.com";

request(url, function(err, resp, body) {
  if (err) throw err;
  var $ = cheerio.load(body);
  var addUrl = [];
  $('a').each(function (i, element) {
    var a = $(this);
    var href = a.attr('href');
    addUrl.push(href);
  });
  // Log from the array that was actually filled; the name `array` was never
  // declared and raised a ReferenceError.
  console.log(addUrl[0]);
});
This code adds the links to an array called addUrl, and that works perfectly, but now I want to add a URL to the array only if its href contains the word 'baloncesto'.
Good example : https://www.mismarcadores.com/baloncesto/alemania/
This URL , is good but
Bad example : https://www.mismarcadores.com/golf/
This is wrong.
I am developing this in Node.js, but this part is just plain JavaScript and I don't know how to do it.
Could anyone help me?
Please try like this:
// Fetch the page and collect only the links whose href contains filterStr.
var request = require('request');
var cheerio = require('cheerio');
var url = "https://www.mismarcadores.com";
var filterStr = 'baloncesto';

request(url, function(err, resp, body) {
  if (err) throw err;
  var $ = cheerio.load(body);
  var addUrl = [];
  $('a').each(function (i, element) {
    var href = $(this).attr('href');
    // attr() yields undefined for an <a> without an href attribute; guard so
    // that calling includes() on it does not throw.
    if (href && href.includes(filterStr)) {
      addUrl.push(href);
    }
  });
  console.log(addUrl);
});
Okay, so what you're looking for is a pattern match - if href contains the string baloncesto. I suspect that this SO thread will be helpful to you - you're basically looking to nest this line -
addUrl.push(href);
- in an if statement, like
// Anchors without an href attribute give undefined, so check before matching.
if (href && href.includes('baloncesto')) {
  addUrl.push(href);
}
...but definitely look at that other answer for reference, in case you're using a version of JS that's older and not compatible with ES6.
you can try this
// Standard strings have no contains() method; includes() is the substring
// test. The href check skips anchors that have no href attribute.
if (href && href.includes('baloncesto')) {
  addUrl.push(href);
}
Try this
// Fetch the page and collect the links whose percent-decoded href contains
// 'baloncesto'.
var request = require('request');
var cheerio = require('cheerio');
var url = "https://www.mismarcadores.com";

request(url, function(err, resp, body) {
  if (err) throw err;
  var $ = cheerio.load(body);
  var addUrl = [];
  $('a').each(function (i, element) {
    var href = $(this).attr('href');
    // Anchors without an href attribute have nothing to match.
    if (!href) {
      return;
    }
    var decoded;
    try {
      decoded = decodeURIComponent(href);
    } catch (e) {
      // decodeURIComponent throws URIError on malformed escapes; fall back to
      // matching against the raw attribute value.
      decoded = href;
    }
    // includes() is the standard substring test; strings have no contains().
    if (decoded.includes('baloncesto')) {
      addUrl.push(href);
    }
  });
  // Log from the array that was filled; `array` was never declared.
  console.log(addUrl[0]);
});

Trying to scrape restaurant menu data using node.js / cheerio - but returning blank array

I am trying to scrape some menu information from the following restaurant: Red Cow url: "http://redcowmn.com/50th-street-menu/"
I want the menu item name and the price; for now I am troubleshooting just getting the name. Can someone help me troubleshoot my code? Thanks.
I have tried the following code:
// Fetch the menu page, collect the title text found under each child of the
// menu block, write the list to output1.txt as JSON and log it.
var request = require("request");
var cheerio = require("cheerio");
var fs = require("fs");
var url = "http://redcowmn.com/50th-street-menu/";

request(url, function(err, response, html) {
  // Report a failed request instead of silently doing nothing.
  if (err) {
    console.log(err);
    return;
  }

  var $ = cheerio.load(html);
  var items = [];

  // Search inside the element handed to each iteration. The previous lookup
  // ignored the loop position and read the same fixed node every time.
  // NOTE(review): "#menu-item-title" is an id selector; if the page marks
  // titles with a class, this should be ".menu-item-title" — confirm against
  // the actual markup.
  $("#block-yui_3_17_2_9_1423750151911_47600").children().each(function(index, element) {
    var result = $(element).find("#menu-item-title").text();
    if (result !== "") {
      items.push(result);
    }
  });

  fs.writeFile("output1.txt", JSON.stringify(items, null, 4), function(err) {
    if (err) {
      console.log(err);
    } else {
      console.log("Data has been added to a file.");
    }
  });
  console.log(items);
});

Issue when doing web scraper

I am scraping the webpage https://www.g2a.com/rising-storm-2-vietnam-steam-cd-key-global.html
I need to get the title from the table data.
// Small express app: GET /scrape fetches the product page, collects the
// .mp-rating-popup text of every row in .mp-user-rating and logs the result.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();

app.get('/scrape', function(req, res) {
  // Declared locally; assigning without `var` created an implicit global.
  var url = 'https://www.g2a.com/rising-storm-2-vietnam-steam-cd-key-global.html';
  request(url, function(error, response, body) {
    if (error) {
      // Surface the failure on the console, where the response points the user.
      console.log(error);
    } else {
      var $ = cheerio.load(body);
      var arr = [];
      $('.mp-user-rating tr').each(function(i, element) {
        // Look inside the current row only; a document-wide selector returns
        // the same combined text for every row.
        var tableData = $(element).find('.mp-rating-popup');
        arr.push({ 'title': tableData.text() });
      });
      // The response tells the user to check the console, so print the result.
      console.log(arr);
    }
    res.send('Check your console!');
  });
});

app.listen('8081');
console.log('Magic happens on port 8081');
exports = module.exports = app;
Here the data is in the third column, and I am not able to get the expected data from .mp-user-rating tr.
The image shows the structure of the table
Any help would be appreciated.
So, I went to the page and ran this in the console.
// Collect the .mp-rating-popup text of every row of .mp-user-rating.
var arr = [];
// The return value of each() is not needed, so it is no longer stored in an
// unused `title` variable (which would leak a global in the browser console).
jQuery('.mp-user-rating tr').each(function(i, element) {
  var tableData = jQuery(element).find('.mp-rating-popup');
  arr.push({ 'title': tableData.text() });
});
console.log(arr);
The array consists of 8 objects that each have the titles within them.
UPDATE:
I pulled in the HTML using your code. I think the issue is that the HTML is loaded asynchronously by the website, so fetching the page only retrieves the static markup. You will need to use PhantomJS or Chrome's headless browser to load the website and allow the asynchronous content to load; then you can grab the HTML.
See here for some good docs on PhantomJS: https://github.com/Medium/phantomjs

How to dump tabular data from a webpage made of no-js class?

I am trying to scrape through a web-page which uses no-js html class.
I've come up with code to scrape.
Now that webpage has a table always and I want that full table in an excel file.
That means scrape through the webpage and dump the table into a file.
How do I do that?
Here's the code so far.
var http = require("http");
/**
 * Download the body of `url` over HTTP and hand it to `callback` as a string.
 *
 * @param {string} url - Address passed to http.get.
 * @param {function(?string): void} callback - Called with the full response
 *   body once the response ends, or with null if the request emits an error.
 */
function download(url, callback) {
  http.get(url, function(res) {
    var data = "";
    // Decode as UTF-8 at the stream level so that a multi-byte character
    // split across two chunks is reassembled, instead of being corrupted by
    // converting each Buffer chunk to a string separately.
    res.setEncoding("utf8");
    res.on("data", function(chunk) {
      data += chunk;
    });
    res.on("end", function() {
      callback(data);
    });
  }).on("error", function() {
    callback(null);
  });
}
// Fetch the page and print its markup, or "error" when nothing came back.
var url = "http://kayak.com";
download(url, function(markup) {
  if (!markup) {
    console.log("error");
    return;
  }
  console.log(markup);
});
You could use the request module to get the page markup and then parse it with cheerio.
Cheerio provides a lightweight jQuery implementation that can be used on the server:
https://github.com/MatthewMueller/cheerio
Request provides a simplified http client:
https://github.com/mikeal/request
// Fetch the page and print every table row as a comma-separated line of its
// trimmed <td> texts.
var request = require('request');
var cheerio = require('cheerio');
var url = 'http://kayak.com';

request(url, function(err, res, body){
  // Without a response there is no markup to parse.
  if (err) {
    console.log(err);
    return;
  }
  // Declared locally; assigning without `var` created an implicit global.
  var $ = cheerio.load(body);
  var $rows = $('table tr').toArray();
  // forEach rather than map: the loop is run only for its logging side effect.
  $rows.forEach(function(row){
    var cells = $(row).find('td').toArray();
    console.log(cells.map(function(cell){
      return $(cell).text().trim();
    }).join(', '));
  });
});

Categories

Resources