I am attempting to scrape an entire table that contains specified text, but I'm having trouble navigating through the cheerio object it returns. Do I keep adding .children for each level, or is there a more elegant solution?
const test = 'https://www.sec.gov/Archives/edgar/data/24741/000002474118000053/glw-20180930x10q.htm'
const request = require('request');
const cheerio = require('cheerio');

request(test, (error, res, html) => {
  if (!error && res.statusCode == 200) {
    // console.log(html)
    const $ = cheerio.load(html)
    const table = $("tbody:contains('Cash and cash equivalents')")
    // console.log(table[0])
    for (var i in table[0].children) {
      for (var j in table[0].children[i].children) {
        console.log(table[0].children[i].children[j].children)
      }
    }
  }
})
I am hoping to get just the text of the table.
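For reference, a flatter approach is to stay inside cheerio's API instead of walking raw .children arrays: select the rows with .find('tr') and read each cell with .text(). A minimal sketch against the same page (just one possible way, taking the first matching tbody):

const request = require('request');
const cheerio = require('cheerio');

const url = 'https://www.sec.gov/Archives/edgar/data/24741/000002474118000053/glw-20180930x10q.htm';

request(url, (error, res, html) => {
  if (!error && res.statusCode === 200) {
    const $ = cheerio.load(html);
    // Take the first tbody containing the target text, then walk its rows.
    const rows = $("tbody:contains('Cash and cash equivalents')").first().find('tr');
    rows.each((i, row) => {
      // .map + .get() turns the cells into a plain array of trimmed strings.
      const cells = $(row).find('td').map((j, td) => $(td).text().trim()).get();
      console.log(cells);
    });
  }
});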
I'm trying to scrape data from this website (https://www.brvm.org/fr/cours-actions/0) using Node.js in Visual Studio Code.
With the code below I can get the tr elements, but I want to make a loop that pushes every child td of each tr so I can build an HTML table.
Here is my code:
import fetch from "node-fetch";
import cheerio from "cheerio";

const scrapedData = [];

async function fetchData(url) {
  const response = await fetch(url);
  const data = await response.text();
  /* console.log(data); */
  getData(data);
}

fetchData("https://www.brvm.org/fr/cours-actions/0");

function getData(html) {
  const $ = cheerio.load(html);
  $("#block-system-main > div > table > tbody", html).each(function () {
    console.log($(this).text());
  });
}
How can I create the loop?
You can loop over the rows, then use .find on each row to grab its cells:
import cheerio from "cheerio"; // 1.0.0-rc.12

const parseHTML = html => {
  const $ = cheerio.load(html);
  const sel = "#block-system-main > div > table > tbody tr";
  return [...$(sel)].map(e =>
    [...$(e).find("td")].map(e => $(e).text())
  );
};

(async () => {
  const url = "https://www.brvm.org/fr/cours-actions/0";
  const response = await fetch(url); // global fetch (Node 18+), or import node-fetch
  const result = parseHTML(await response.text());
  console.table(result);
})();
See also Scraping all rows from table using cheerio.
I am using Cheerio for web scraping; I used bs4 (BeautifulSoup) earlier.
I want to scrape this website: https://rera.kerala.gov.in/rera_project_details. In Python, to scrape a table we can use find_all("tr")[0] to get the first <tr>.
How do I do the same in Cheerio?
Here is my code:
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');

const url = "https://rera.kerala.gov.in/rera_project_details";
const arr = [];

request({ method: "GET", url }, function (err, res, body) {
  if (res.statusCode == 200) {
    let $ = cheerio.load(body);
    const getID = $("#block-zircon-content");
    const tbody = getID.find('tbody');
    tbody.each((i, el) => {
      const ff = $(el).find("tr");
      console.log(ff.html()); // returns the first tr's inner HTML
      // how do I get the second tr, so I can get its td elements and index into them?
    });
  }
});
If I loop over it, it returns all the tr elements. Now, how do I index into each td so that in the last column of the table I can get the link to the PDF?
Edit
I have gotten this far, but how do I get the list of td elements inside a tr?
const getID = $(".views-table");
const getBody = getID.find("tbody");
const gettr = getBody.find("tr");
const getfirsttr = $.html(gettr[0]); // gives me the first tr, as an HTML string
const getfirsttd = getfirsttr.find("td"); // does not work: getfirsttr is a string, not a cheerio object
To answer the index question:
$('tr').eq(n)
will give you the nth tr as a cheerio object, and
$('tr')[n]
will give you the nth tr as a raw DOM node (the parser's element object, not a cheerio object).
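A quick sketch of the difference, using throwaway markup (hypothetical, just for illustration):

const cheerio = require('cheerio');
const $ = cheerio.load('<table><tr><td>a</td></tr><tr><td>b</td></tr></table>');

const second = $('tr').eq(1);           // cheerio object: .find(), .text(), etc. work
console.log(second.find('td').text());  // "b"

const raw = $('tr')[1];                 // raw DOM node: plain properties only
console.log(raw.tagName);               // "tr"
console.log($(raw).text());             // wrap it back in $() to use cheerio methods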
You should be able to use a selector that gives you all the elements from the required table. Once you have the elements, you can access their properties, children, etc.
const request = require('request');
const cheerio = require('cheerio');

const url = "https://rera.kerala.gov.in/rera_project_details";

request({ method: "GET", url }, function (err, res, body) {
  if (res.statusCode == 200) {
    let $ = cheerio.load(body);
    // Get all td elements from the table.
    let tdElements = $("#block-zircon-content tbody tr td").map((i, el) => {
      return el;
    }).toArray();
    console.log(`<td> list: Found ${tdElements.length} elements..`);
    console.log("tdElements[0]:", tdElements[0]);
    console.log("tdElements[0]: (html)", $.html(tdElements[0]));
  }
});
To simply find all td elements in the table using .find() we can try:
const trElements = $("#block-zircon-content tbody").find("tr");
const tdElements = trElements.find("td").toArray();
console.log(`first td:`, tdElements[0]);
All right, after doing the research and with Terry's help above, I have understood how it works.
All cheerio functions operate on cheerio selections (wrapped elements), not on plain text.
Below is my code, in case any other beginner using cheerio gets stuck:
var request = require('request');
var cheerio = require('cheerio');

const url = "https://rera.kerala.gov.in/rera_project_details";
const arr = [];

request({ method: "GET", url }, function (err, res, body) {
  if (res.statusCode == 200) {
    let $ = cheerio.load(body);
    // These are cheerio selections; all cheerio methods work on them.
    const getID = $(".views-table");
    const getBody = getID.find("tbody");
    const gettr = getBody.find("tr");
    gettr.each((index, element) => {
      // element is a raw node; wrap it with $() to use cheerio methods on it.
      var std = $(element).find("td");
      let number = $(std[0]).contents().text();
      let reraNumber = $(std[1]).contents().text();
      let name = $(std[2]).contents().text().trim();
      arr.push({ number, reraNumber, name });
      // Note: $.html(el) returns the element's outer HTML (including its own tag),
      // while $(el).html() returns only its inner HTML.
    });
    console.log(arr);
  }
});
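As a side note on the $.html vs .html distinction mentioned in the comments above, a throwaway example (hypothetical markup, not from the scraped site):

const cheerio = require('cheerio');
const $ = cheerio.load('<div id="x"><span>hi</span></div>');
const el = $('#x');

console.log($.html(el)); // '<div id="x"><span>hi</span></div>' (outer HTML, including the tag)
console.log(el.html());  // '<span>hi</span>' (inner HTML only)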
I have a CSV file that can contain around a million records. How can I remove the columns whose names start with _ and generate a resulting CSV?
For the sake of simplicity, consider the CSV below:
Sr.No  Col1  Col2  _Col3   Col4  _Col5
1      txt   png   676766  win   8787
2      jpg   pdf   565657  lin   8787
3      pdf   jpg   786786  lin   9898
I would want the output to be:
Sr.No  Col1  Col2  Col4
1      txt   png   win
2      jpg   pdf   lin
3      pdf   jpg   lin
Do I need to read the entire file to achieve this, or is there a better approach?
const csv = require('csv-parser');
const fs = require('fs');

fs.createReadStream('data.csv')
  .pipe(csv())
  .on('data', (row) => {
    // generate a new csv, removing the specific columns
  })
  .on('end', () => {
    console.log('CSV file successfully processed');
  });
Any help on how I can achieve this would be appreciated. Thanks.
To anyone who stumbles on this post: I was able to transform the CSVs using the code below, with the fs and csv modules.
const fs = require('fs');
const csv = require('csv');

// m.path and transformedPath come from the surrounding code.
await fs.createReadStream(m.path)
  .pipe(csv.parse({ delimiter: '\t', columns: true }))
  .pipe(csv.transform((input) => {
    delete input['_Col3'];
    console.log(input);
    return input;
  }))
  .pipe(csv.stringify({ header: true }))
  .pipe(fs.createWriteStream(transformedPath))
  .on('finish', () => {
    console.log('finish....');
  }).on('error', () => {
    console.log('error.....');
  });
Source: https://gist.github.com/donmccurdy/6cbcd8cee74301f92b4400b376efda1d
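The snippet above removes a single hard-coded column. Since the goal was to drop every column whose name starts with _, a variant of the same csv transform can iterate over each record's keys. A rough sketch, with hypothetical data.csv and output.csv paths:

const fs = require('fs');
const csv = require('csv');

fs.createReadStream('data.csv') // hypothetical input path
  .pipe(csv.parse({ columns: true }))
  .pipe(csv.transform((record) => {
    // Drop every field whose header starts with "_".
    for (const key of Object.keys(record)) {
      if (key.startsWith('_')) delete record[key];
    }
    return record;
  }))
  .pipe(csv.stringify({ header: true }))
  .pipe(fs.createWriteStream('output.csv')); // hypothetical output path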
Actually, you can handle that with two npm packages:
https://www.npmjs.com/package/csvtojson
to convert your CSV to JSON format, and then
https://www.npmjs.com/package/json2csv
to convert it back. With the second library, if you know exactly which fields you want, you can pass options to select only those fields:
const { Parser } = require('json2csv');

const fields = ['field1', 'field2', 'field3'];
const opts = { fields };

try {
  const parser = new Parser(opts);
  const csv = parser.parse(myData);
  console.log(csv);
} catch (err) {
  console.error(err);
}
Or you can modify the JSON objects manually to drop those columns.
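Putting the two packages together, a rough sketch of the full round trip (assuming the input file is data.csv, every row shares the same columns, and the file fits in memory):

const csvtojson = require('csvtojson');
const { Parser } = require('json2csv');
const fs = require('fs');

(async () => {
  const rows = await csvtojson().fromFile('data.csv');
  // Keep only the fields whose names do not start with "_".
  const fields = Object.keys(rows[0]).filter((f) => !f.startsWith('_'));
  const csv = new Parser({ fields }).parse(rows);
  fs.writeFileSync('output.csv', csv);
})();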
Try this with the csv library:
const csv = require('csv');
const fs = require('fs');

const csvString = `col1,col2
value1,value2`;

csv.parse(csvString, { columns: true })
  .pipe(csv.transform(({ col1, col2 }) => ({ col1 }))) // remove col2
  .pipe(csv.stringify({ header: true }))
  .pipe(fs.createWriteStream('./file.csv'));
With this function I accomplished the column removal from a CSV:
function removeCol(csv, col) {
  // Naive approach: split on "," — assumes no quoted fields containing commas.
  const lines = csv.split("\n");
  const headers = lines[0].split(",");
  const index = headers.findIndex((h) => h.trim() === col);
  if (index === -1) return csv; // column not found; return the input unchanged
  return lines
    .map((line) => {
      const fields = line.split(",");
      fields.splice(index, 1);
      return fields.join(",");
    })
    .join("\n") + "\n";
}
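Example usage against a comma-separated version of the sample data from the question (the function splits on commas):

const input = "Sr.No,Col1,Col2,_Col3\n1,txt,png,676766\n2,jpg,pdf,565657";
console.log(removeCol(input, "_Col3"));
// Sr.No,Col1,Col2
// 1,txt,png
// 2,jpg,pdf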
I'm trying to build an API grabber that requests JSON.
Here is some code:
const request = require('request');

function getPosts(tag, url, callback) {
  let newUrl = `${url}?tag=${tag}`;
  request.get(newUrl, { json: true }, (err, res, body) => {
    callback(body.posts);
  });
}
Array builder function:
function buildPostsCollection(options, url, callback) {
  let tagsFinished = 0;
  const permPostsArray = [];
  if (!options || !url) {
    return "Error: No options or URL given";
  }
  for (let i = 0; i < options.tags.length; i++) {
    getPosts(options.tags[i], url, (posts) => {
      tagsFinished++;
      console.log(tagsFinished);
      permPostsArray.push(posts);
      if (tagsFinished == options.tags.length) {
        callback(permPostsArray);
      }
    });
  }
}
If I refresh the page quickly enough, it alternates between the different tag results. It's supposed to combine both requests into one array.
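For reference, a minimal sketch of the same aggregation using Promises (assuming the request package and the getPosts shape from the question), which avoids the hand-rolled counter:

const request = require('request');

// Promisified version of getPosts from the question.
const getPostsAsync = (tag, url) =>
  new Promise((resolve, reject) => {
    request.get(`${url}?tag=${tag}`, { json: true }, (err, res, body) => {
      if (err) return reject(err);
      resolve(body.posts);
    });
  });

// Fetch all tags in parallel and resolve once every request has finished.
function buildPostsCollection(options, url) {
  if (!options || !url) return Promise.reject(new Error("No options or URL given"));
  return Promise.all(options.tags.map((tag) => getPostsAsync(tag, url)));
}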
In my server.js, I am trying to loop through an array of different URLs and use each URL in a request inside my app.get handler.
Here is my code:
let articleUrlArray = [
  'https://techcrunch.com/2018/05/19/shared-housing-startups-are-taking-off/',
  'https://techcrunch.com/2018/05/19/shared-housing-startups-are-taking-off/',
  'https://techcrunch.com/2018/05/19/my-data-request-lists-guides-to-get-data-about-you/',
  'https://techcrunch.com/2018/05/19/siempos-new-app-will-break-your-smartphone-addiction/',
  'https://techcrunch.com/2018/05/19/la-belle-vie-wants-to-compete-with-amazon-prime-now-in-paris/',
  'https://techcrunch.com/2018/05/19/apple-started-paying-15-billion-european-tax-fine/',
  'https://techcrunch.com/2018/05/19/original-content-dear-white-people/',
  'https://techcrunch.com/2018/05/19/meet-the-judges-for-the-tc-startup-battlefield-europe-at-vivatech/',
  'https://techcrunch.com/2018/05/18/nasas-newest-planet-hunting-satellite-takes-a-stellar-first-test-image/',
  'https://techcrunch.com/video-article/turning-your-toys-into-robots-with-circuit-cubes/',
  'https://techcrunch.com/2018/05/18/does-googles-duplex-violate-two-party-consent-laws/'
];
for (var i = 0; i < articleUrlArray.length - 1; i++) {
  app.get('/news/news-desc', function (req, res) {
    var data = '';
    var techCrunchNewsItems = [];
    request(articleUrlArray[i], function (err, response, html) {
      var $ = cheerio.load(html);
      if ($('.article-content').children('p').eq(0).text().split(' ').length > 50) {
        techCrunchNewsItems.push({
          bodyOne: $('.article-content').children('p').eq(0).text()
        });
      } else {
        techCrunchNewsItems.push({
          bodyOne: $('.article-content').children('p').eq(0).text(),
          bodyTwo: $('.article-content').children('p').eq(1).text()
        });
      }
      data = techCrunchNewsItems;
      res.send(JSON.stringify(data));
    });
  });
}
As you can see in my code, I have an array called "articleUrlArray" and a for loop that iterates over it to get each article URL, which is then passed to the request function to fetch that page's body content.
No matter what happens, I always get "only" the body content for the last URL. It never gets the body content for the other URLs in "articleUrlArray".
What am I doing wrong?
[Screenshot omitted: the output after running Hugo Nasciutti's solution below.]
const articleUrlArray = [
  'https://techcrunch.com/2018/05/19/shared-housing-startups-are-taking-off/',
  'https://techcrunch.com/2018/05/19/shared-housing-startups-are-taking-off/',
  'https://techcrunch.com/2018/05/19/my-data-request-lists-guides-to-get-data-about-you/',
  'https://techcrunch.com/2018/05/19/siempos-new-app-will-break-your-smartphone-addiction/',
  'https://techcrunch.com/2018/05/19/la-belle-vie-wants-to-compete-with-amazon-prime-now-in-paris/',
  'https://techcrunch.com/2018/05/19/apple-started-paying-15-billion-european-tax-fine/',
  'https://techcrunch.com/2018/05/19/original-content-dear-white-people/',
  'https://techcrunch.com/2018/05/19/meet-the-judges-for-the-tc-startup-battlefield-europe-at-vivatech/',
  'https://techcrunch.com/2018/05/18/nasas-newest-planet-hunting-satellite-takes-a-stellar-first-test-image/',
  'https://techcrunch.com/video-article/turning-your-toys-into-robots-with-circuit-cubes/',
  'https://techcrunch.com/2018/05/18/does-googles-duplex-violate-two-party-consent-laws/'
];

const checkBody = res => (err, response, html) => {
  const $ = cheerio.load(html);
  const articleContent = $('.article-content').children('p');
  const bodyOne = articleContent.eq(0).text();
  const bodyTwo = articleContent.eq(1).text();
  const isExtensive = bodyOne.split(' ').length > 50;
  res(isExtensive ? { bodyOne } : { bodyOne, bodyTwo });
};

const getArticle = article => new Promise(res => request(article, checkBody(res)));

app.get('/news/news-desc', (req, res) => {
  Promise.all(articleUrlArray.map(getArticle)).then(data => res.send(JSON.stringify(data)));
});
What is really going on here is that articleUrlArray.map(getArticle) builds an array of Promises, and once all of them resolve, the request is answered with the stringified array of results. I took the liberty of using arrow functions and constants. (The original code only ever showed the last URL's content because the loop registered many handlers for the same route, and the var-scoped i had already reached its final value by the time any request callback ran.)