How to perform indexing in Cheerio for web scraping - javascript
I am using Cheerio for web scraping; I have used bs4 (BeautifulSoup) before.
I want to scrape the table on https://rera.kerala.gov.in/rera_project_details. In Python, to scrape a table we can use find_all("tr")[0] to get the first <tr>.
How do I do the same in Cheerio?
Below is my code:
// Asker's attempt: download the page, load it into Cheerio and print rows of each <tbody>.
var request = require('request');
var cheerio = require('cheerio');
// NOTE(review): fs and arr are declared but never used in this snippet.
var fs = require('fs');
const url = "https://rera.kerala.gov.in/rera_project_details";
const arr = [];
request({method:"GET",url}, function(err, res, body){
// NOTE(review): err is not checked before res.statusCode is read.
if (res.statusCode==200){
let $ = cheerio.load(body);
// Narrow down from the content block to every <tbody> inside it.
const getID = $("#block-zircon-content");
const tbody = getID.find('tbody');
tbody.each((i, el)=>{
// el is the current <tbody> node; wrap it in $() to search for its rows.
const ff = $(el).find("tr");
console.log(ff.html());// this returns only the first tr
// How do I get the second tr, so that I can read its td elements and index into those as well?
})
}}
)
If I loop over it, it returns every tr. How do I then index into the td elements of each row, so that I can get the PDF link from the last column of the table?
Edit
I have got this far, but I still don't know how to get the list of td elements in a tr:
// Narrow the selection step by step: table -> tbody -> rows.
const getID = $(".views-table");
const getBody = getID.find("tbody");
const gettr = getBody.find("tr");
const getfirsttr = $.html(gettr[0]);// gives me the first tr, rendered as HTML by $.html()
const getfirsttd = getfirsttr.find("td")// does not work: .find() is called on the output of $.html(), not on a Cheerio selection
To answer the index question:
$('tr').eq(n)
will give you the nth tr (counting from zero) as a Cheerio object, and
$('tr')[n]
will give it as a plain parse5 node (the raw DOM element, without Cheerio's methods).
You should be able to use a selector that will give you all the elements from the required table. Once you have the elements you can access their properties, children etc.
const url = "https://rera.kerala.gov.in/rera_project_details";

// Fetch the page and list every <td> of the table inside #block-zircon-content.
request({ method: "GET", url }, function (err, res, body) {
  // `res` is undefined when the request itself fails, so check `err` first.
  if (err) {
    console.error("Request failed:", err);
    return;
  }
  if (res.statusCode !== 200) {
    console.error(`Unexpected status code: ${res.statusCode}`);
    return;
  }

  const $ = cheerio.load(body);
  // Get all td elements from the table; .toArray() unwraps the selection
  // into a plain array of nodes, so no intermediate .map() is needed.
  const tdElements = $("#block-zircon-content tbody tr td").toArray();
  console.log(`<td> list: Found ${tdElements.length} elements..`);
  console.log("tdElements[0]:", tdElements[0]);
  console.log("tdElements[0]: (html)", $.html(tdElements[0]));
});
To simply find all td elements in the table using .find() we can try:
// Walk down with .find(): table body -> rows -> cells, then unwrap the
// selection into a plain array of nodes.
const tableBody = $("#block-zircon-content tbody");
const trElements = tableBody.find("tr");
const tdElements = trElements.find("td").toArray();
const [firstCell] = tdElements;
console.log(`first td:`, firstCell);
All right, after doing some research and with the help above from Terry, I have understood how it works.
All Cheerio functions work on selections (Cheerio objects), not on HTML text.
Below is my code, in case any other beginner like me is using Cheerio and is stuck:
const request = require('request');
const cheerio = require('cheerio');
const fs = require('fs');

const url = "https://rera.kerala.gov.in/rera_project_details";
// One entry per table row, filled in once the page has been parsed.
const arr = [];

request({ method: "GET", url }, function (err, res, body) {
  // `res` is undefined when the request itself fails, so check `err` first.
  if (err) {
    console.error("Request failed:", err);
    return;
  }
  if (res.statusCode !== 200) {
    console.error(`Unexpected status code: ${res.statusCode}`);
    return;
  }

  const $ = cheerio.load(body);
  // $(...) returns a Cheerio selection; .find() narrows it down step by step.
  const rows = $(".views-table").find("tbody").find("tr");

  // Note: $.html(node) returns the node's outer HTML (its own tag included),
  // while $(node).html() returns only its inner HTML.
  rows.each((index, element) => {
    // `element` is a raw node: its children are available, but none of the
    // Cheerio methods are, so wrap it in $() before calling .find() on it.
    const cells = $(element).find("td");
    // .eq(n) keeps the Cheerio wrapper, so .text() can be called directly.
    arr.push({
      number: cells.eq(0).text(),
      ReraNumbers: cells.eq(1).text(),
      name: cells.eq(2).text().trim(),
    });
  });

  // Show the collected rows instead of discarding the extracted values.
  console.log(arr);
});
Related
Parse table from HTML with cheerio in node
I'm trying to scrape data from this website (https://www.brvm.org/fr/cours-actions/0) using nodejs in visual studio code. With the code I can get the tr elements but I want to make a loop which will push every children td of a tr to create a table in html. Below is my code import fetch from "node-fetch"; import cheerio from "cheerio"; const scrapedData = []; async function fetchData(url) { const response = await fetch(url); const data = await response.text(); /* console.log(data); */ getData(data); } fetchData("https://www.brvm.org/fr/cours-actions/0"); function getData(html) { const $ = cheerio.load(html); $("#block-system-main > div > table > tbody", html).each(function () { console.log($(this).text()); }); } How can I create the loop?
You can loop over the rows, then use .find on each row to grab its cells: import cheerio from "cheerio"; // 1.0.0-rc.12 const parseHTML = html => { const $ = cheerio.load(html); const sel = "#block-system-main > div > table > tbody tr"; return [...$(sel)].map(e => [...$(e).find("td")].map(e => $(e).text()) ); }; (async () => { const url = "https://www.brvm.org/fr/cours-actions/0"; const response = await fetch(url); const result = parseHTML(await response.text()); console.table(result); })(); See also Scraping all rows from table using cheerio.
How to append the child to the parent without using the ID element?
I used an ID to add a new element to divTwo. I want to know how to add the p tag inside divOne and divTwo without referring to their identifier. please help. // create Tag const createElement = (elementName) => document.createElement(elementName); const appendTo = (idElement, element) => document.getElementById(idElement).append(element); const setAttribute = (eleName, attribute, valueAttribute) => eleName.setAttribute(attribute, valueAttribute); const setText = (id, text) => document.getElementById(id).innerHTML = text; // Tag HTML // div one const divOne = createElement("div"); const appendDivOne = appendTo("demo", divOne) const setIDOne = setAttribute(divOne, "id", "divOne"); // div two const divTwo = createElement("div"); const appendDivTwo = appendTo("demo", divTwo) const setIDTwo = setAttribute(divTwo, "id", "divTwo"); // child div two const divTwoChild = createElement("p"); const appendDivTwoChild = appendTo("divTwo", divTwoChild); const setIDChildeTwo = setAttribute(divTwoChild, "id", "ChildeTwo"); const text = setText("ChildeTwo", "childe two"); <div id="demo"></div>
// It seems you are trying to append to div1 which, according to your code,
// will be the first element in the #demo div. If you want to append a P tag,
// note that appendTo() looks its first argument up with getElementById(), so
// it only accepts an id string. With an element in hand, call .append() on it
// directly:
const divOneChild = createElement("p");
document.getElementById("demo").firstElementChild.append(divOneChild);
You can access your elements directly after creation... for example when using const divTwoChild = createElement("p");, you can use divTwoChild.append() ... there is also a function called insertAdjacentHTML(), where you can add a html code directly on given position, read about in in here. Examples below (last 3 lines): // create Tag const createElement = (elementName) => document.createElement(elementName); const appendTo = (idElement, element) => document.getElementById(idElement).append(element); const setAttribute = (eleName, attribute, valueAttribute) => eleName.setAttribute(attribute, valueAttribute); const setText = (id, text) => document.getElementById(id).innerHTML = text; // Tag HTML // div one const divOne = createElement("div"); const appendDivOne = appendTo("demo", divOne) const setIDOne = setAttribute(divOne, "id", "divOne"); // div two const divTwo = createElement("div"); const appendDivTwo = appendTo("demo", divTwo) const setIDTwo = setAttribute(divTwo, "id", "divTwo"); // child div two const divTwoChild = createElement("p"); const appendDivTwoChild = appendTo("divTwo", divTwoChild); const setIDChildeTwo = setAttribute(divTwoChild, "id", "ChildeTwo"); divTwoChild.append("childe two"); // <-- here divOne.append('I am div one!'); // <-- or here divTwo.insertAdjacentHTML('beforeend', '<p>I am a new p in div 2!</p>'); // <-- or here <div id="demo"></div>
Cheerio Get Image Src
I'm trying to get the image from a table and push its link into an array. When I run the code, img is undefined. How can I fix this problem? $("body > form > table:nth-child(5) > tbody > tr").each((index, element) => { if (index === 0) return true; const tds = $(element).find("td"); const img = $(tds[0]).attr('src'); // undefined const flight = $(tds[1]).text(); //working const origin = $(tds[2]).text(); //working const time = $(tds[3]).text(); //working const estimted = $(tds[4]).text(); //working const status = $(tds[5]).text(); //working console.log(img); const tableRow = { flight, origin, time, estimted, status }; });
$(tds[0]).attr('src') will be undefined because td's aren't images. Maybe you wanted: $(tds[0]).find('img').attr('src')
JS - scraping table that contains specified row
I am attempting to scrape the entire table that contains specified text but having issues navigating through the cheerio object it returns, do I continue adding .children for each iteration or is there a more elegant solution to this? const test = 'https://www.sec.gov/Archives/edgar/data/24741/000002474118000053/glw-20180930x10q.htm' const request = require('request'); const cheerio = require('cheerio'); request(test, (error, res, html) => { if (!error && res.statusCode == 200) { // console.log(html) const $ = cheerio.load(html) const table = $("tbody:contains('Cash and cash equivalents')") // console.log(table[0]) for (var i in table[0].children){ for (var j in table[0].children[i].children) { console.log(table[0].children[i].children[j].children) } } } }) I am hoping to get to just the text of the table
how to select innerHTML from an elementHandle in puppeteer
Using the node puppeteer module, how do I continue with this code to get the innerContent here? const els = Promise.all(await page.$$(selector)).then(results => { results.map(async el => { const tr = await el.$('tr') //How do I convert this element handle to get its innerText content? }) })
// Like this: getProperty() returns a Promise for a handle, so it has to be
// awaited before jsonValue() (itself asynchronous) can be called on it.
const textValue = await (await tr.getProperty('innerText')).jsonValue();