I'm trying to scrape data from this website (https://www.brvm.org/fr/cours-actions/0) using nodejs in visual studio code.
With the code below I can get the tr elements, but I want to write a loop that pushes every child td of each tr, so that I can create a table in HTML.
Below is my code
import fetch from "node-fetch";
import cheerio from "cheerio";
const scrapedData = [];
/**
 * Downloads the page at `url` and hands its body text to `getData`.
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<void>} Settles once `getData` has been called with the body text.
 */
async function fetchData(url) {
  const page = await fetch(url);
  const markup = await page.text();
  getData(markup);
}
fetchData("https://www.brvm.org/fr/cours-actions/0");
/**
 * Parses `html` with cheerio and logs the text content of every element
 * matched by the `#block-system-main > div > table > tbody` selector.
 *
 * @param {string} html - Markup of the downloaded page.
 */
function getData(html) {
  const $ = cheerio.load(html);
  const bodies = $("#block-system-main > div > table > tbody", html);
  bodies.each((_index, node) => {
    console.log($(node).text());
  });
}
How can I create the loop?
You can loop over the rows, then use .find on each row to grab its cells:
import cheerio from "cheerio"; // 1.0.0-rc.12
/**
 * Extracts the table rows from `html` as an array of arrays of cell text.
 *
 * @param {string} html - Markup containing the `#block-system-main` table.
 * @returns {string[][]} One inner array per `tr`, holding the text of each of its `td` cells.
 */
const parseHTML = html => {
  const $ = cheerio.load(html);
  const rows = $("#block-system-main > div > table > tbody tr").toArray();
  // Text of every td found under a single row element.
  const cellTexts = row =>
    $(row)
      .find("td")
      .toArray()
      .map(cell => $(cell).text());
  return rows.map(cellTexts);
};
// Entry point: download the quotes page, parse its table and print the rows.
(async () => {
  const url = "https://www.brvm.org/fr/cours-actions/0";
  const response = await fetch(url);
  // fetch only rejects on network failures; without this check an HTTP error
  // page would be parsed and silently printed as an empty table.
  if (!response.ok) {
    throw new Error(`Request to ${url} failed with status ${response.status}`);
  }
  const result = parseHTML(await response.text());
  console.table(result);
})().catch(err => {
  // Report the failure and signal it through the exit code instead of
  // leaving the promise rejection unhandled.
  console.error(err);
  process.exitCode = 1;
});
See also Scraping all rows from table using cheerio.
Related
I am trying to pass information from an API call into an HTML table which I can then also use for graphs. A picture of the array is attached to show the data structure. I keep receiving an error that column_names is not iterable but I have not been able to work it out after hours of searching. I think it has to do with the names in my array but can't find a solution. I'm new to this and I feel like the answer is painfully simple so any help or explanation of my error would be appreciated.
Array Format
/**
 * Fetches JSON from `url` and renders it into `table`.
 *
 * The `column_names` and `data` properties are read from the top level of the
 * parsed response: each entry of `column_names` becomes a `th` in a single
 * header row, and each entry of `data` becomes a `tr` with one `td` per value.
 *
 * @param {string} url - Address passed to `fetch`.
 * @param {object} table - Object whose `thead` and `tbody` are looked up with `querySelector`.
 * @returns {Promise<void>}
 */
async function loadintotable(url, table) {
  const head = table.querySelector('thead');
  const body = table.querySelector('tbody');

  const reply = await fetch(url);
  const { column_names, data } = await reply.json();

  // Start from an empty header row and an empty body.
  head.innerHTML = '<tr></tr>';
  body.innerHTML = '';

  for (const headerText of column_names) {
    const th = Object.assign(document.createElement('th'), { textContent: headerText });
    head.querySelector('tr').appendChild(th);
  }

  for (const row of data) {
    const tr = document.createElement('tr');
    for (const cellText of row) {
      const td = Object.assign(document.createElement('td'), { textContent: cellText });
      tr.appendChild(td);
    }
    body.appendChild(tr);
  }
}
Your api response is of format
{
dataset: {
column_names: [],
data: []
}
}
So, in order to access column_names and data you have to
const json = await response.json();
const { column_names, data } = json.dataset;
Or in one line
const { column_names, data } = (await response.json()).dataset;
Notice the .dataset at the end of the line
I am using Cheerio for web scraping, I have used bs4 earlier.
I want to scrape this website: https://rera.kerala.gov.in/rera_project_details. In Python, to scrape a table we can use findall("tr")[0] to get the first <tr>.
But how to perform same in Cheerio?
Below is my code:
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
// Page whose project table is being scraped.
const url = "https://rera.kerala.gov.in/rera_project_details";
const arr = [];
// Download the page; the body is parsed only when the status code is 200.
request({method:"GET",url}, function(err, res, body){
// NOTE(review): `err` is not checked before `res.statusCode` is read.
if (res.statusCode==200){
let $ = cheerio.load(body);
const getID = $("#block-zircon-content");
const tbody = getID.find('tbody');
tbody.each((i, el)=>{
// Every tr found under this tbody.
const ff = $(el).find("tr");
console.log(ff.html());// logs the HTML of the first tr only
// Open question: how to get the second tr, so that its td cells can be read and indexed?
})
}}
)
If I loop over it, it returns every tr. How do I then index into each td, so that I can get the PDF link from the last column of the table?
Edit
I have reached till here but how to get list of td elements in tr:
// Narrow down from the element(s) with class views-table to the tbody and its rows.
const getID = $(".views-table");
const getBody = getID.find("tbody");
const gettr = getBody.find("tr");
// The first row, passed through $.html.
const getfirsttr = $.html(gettr[0]);// gives the first tr
// NOTE(review): presumably fails because $.html returns an HTML string, which has no .find — confirm.
const getfirsttd = getfirsttr.find("td")// does not work
To answer the index question:
$('tr').eq(n)
will give you the nth tr as a cheerio object. and
$('tr')[n]
will give it as a parse5 object
You should be able to use a selector that will give you all the elements from the required table. Once you have the elements you can access their properties, children etc.
const url = "https://rera.kerala.gov.in/rera_project_details";
// Download the page and list every <td> of the project table.
request({ method: "GET", url }, (err, res, body) => {
  // When the request itself fails, `res` is undefined, so `err` has to be
  // checked before `res.statusCode` is read.
  if (err) {
    console.error(`Request to ${url} failed:`, err);
    return;
  }
  if (res.statusCode !== 200) {
    console.error(`Unexpected status ${res.statusCode} for ${url}`);
    return;
  }

  const $ = cheerio.load(body);
  // Get all td elements from the table as a plain array of element nodes.
  const tdElements = $("#block-zircon-content tbody tr td").toArray();
  console.log(`<td> list: Found ${tdElements.length} elements..`);
  console.log("tdElements[0]:", tdElements[0]);
  console.log("tdElements[0]: (html)", $.html(tdElements[0]));
});
To simply find all td elements in the table using .find() we can try:
// Walk down from the table body to its rows, then from the rows to every cell.
const tableBody = $("#block-zircon-content tbody");
const trElements = tableBody.find("tr");
const tdElements = trElements.find("td").toArray();
console.log("first td:", tdElements[0]);
All right, after doing some research and with the help from Terry above, I now understand how it works:
all cheerio functions work on selections (what a selector returns), not on plain HTML text.
Below is my code, in case any other beginner like me is using cheerio and gets stuck.
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
// const { get } = require('request');
// const { EACCES } = require('constants');
const url = "https://rera.kerala.gov.in/rera_project_details";
// Filled with one record per table row once the response has been parsed.
const arr = [];
request({ method: "GET", url }, (err, res, body) => {
  // When the request itself fails, `res` is undefined, so `err` has to be
  // checked before `res.statusCode` is read.
  if (err) {
    console.error(`Request to ${url} failed:`, err);
    return;
  }
  if (res.statusCode !== 200) {
    console.error(`Unexpected status ${res.statusCode} for ${url}`);
    return;
  }

  const $ = cheerio.load(body);
  // `$(...)` returns a cheerio selection; traversal methods such as `.find`
  // are available on selections, not on HTML strings.
  const rows = $(".views-table").find("tbody").find("tr");

  rows.each((index, element) => {
    // `element` is a raw node; wrapping it with `$` gives a selection again.
    const cells = $(element).find("td");
    // Store the first three columns instead of discarding them each iteration.
    arr.push({
      number: cells.eq(0).text(),
      reraNumber: cells.eq(1).text(),
      name: cells.eq(2).text().trim(),
    });
  });
});
I have a problem with this piece of code.
I import input data from a file formatted like so and store it in const input:
aabcccccaaa
aaccb
shsudbud
There are no spaces or any other whitespace characters except for the '\n' newline.
I get inputs in this way: (LiveServer inside VS Code)
/**
 * Loads `./inputs.txt` and resolves with its contents as text.
 *
 * @returns {Promise<string>} Body of the fetched file.
 */
const getData = async () => {
  const reply = await fetch("./inputs.txt");
  return reply.text();
};
Then I call:
/**
 * Reads the input via `getData`, splits it on "\n" and logs each row as an
 * array of its characters.
 *
 * Only "\n" is used as the separator, so any other character at the end of a
 * line (for example "\r") stays part of that row.
 *
 * @returns {Promise<void>}
 */
const myFunc = async () => {
  const input = await getData();
  for (const line of input.split("\n")) {
    console.log([...line]);
  }
};
After logging the first and second rows to the console, it seems like there is a "" (empty string) attached to the end of each of them. The third row is fine, so I guess it's somehow connected with the newline character.
I have also tried creating charArr by doing:
const charArr = Array.from(row);
Or
const charArr = row.split("");
But the outcome was the same.
Later I found this topic: Remove empty elements from an array in Javascript
So I tried:
const charArr = [...row].filter(Boolean);
But the "" is still at the end of charArr created from 1st and 2nd row.
// Same processing as above, but with the input inlined as a template literal.
const input = `aabcccccaaa
aaccb
shsudbud`;
const rows = input.split("\n");
for (const line of rows) {
  // Log the characters of each row as an array.
  console.log(Array.from(line));
}
In this snippet everything works fine. So here is where my questions start:
Why does .filter() method not work properly in this case?
Could this problem be browser-specific?
Thanks in advance.
I have a webpage, where I want to hover over all anchor tags and get the styles computed for that tag. This function which I wrote doesn't seem to work as it gives me original style of the anchor and not the hover styles.
Please help.
// Collect the computed background colour of every <a> element on the page.
let data = await page.evaluate(() => {
let elements = document.getElementsByTagName('a');
// NOTE(review): assigned without let/const/var.
properties = []
for (var element of elements){
// Gives the element focus; no pointer hover is performed here.
element.focus();
// The colour value is round-tripped through JSON before being stored.
properties.push(JSON.parse(JSON.stringify(window.getComputedStyle(element, null)["backgroundColor"])));
}
return properties;
});
https://developer.mozilla.org/en-US/docs/Web/API/Window/getComputedStyle
try document.getComputedStyle(element, ':hover')
First of all, you should convert results from document.getElementsByTagName to normal array
const elements = [...document.getElementsByTagName('textarea')];
Next to get element property use this syntax:
window.getComputedStyle(element).getPropertyValue("background-color")
Finally, this is a fully working example:
const puppeteer = require('puppeteer');
// Entry point: open the demo page, focus every textarea and print the
// background colour computed for each one while it has focus.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://css-tricks.com/almanac/selectors/f/focus/');
    const data = await page.evaluate(() => {
      const elements = document.getElementsByTagName('textarea');
      return [...elements].map(element => {
        element.focus();
        return window.getComputedStyle(element).getPropertyValue("background-color");
      });
    });
    console.log(data);
  } finally {
    // Always shut the browser down; otherwise a failed navigation or
    // evaluation leaves the launched browser running.
    await browser.close();
  }
})().catch(err => {
  // Report the failure and signal it through the exit code instead of
  // leaving the promise rejection unhandled.
  console.error(err);
  process.exitCode = 1;
});
You can use page.$$() to obtain an ElementHandle array of textarea elements.
Then, you can use the elementHandle.hover() to hover over each element and then page.evaluate() to obtain the computed background color to push to your data array:
// Hover each textarea in turn and record the background colour computed for it.
const elements = await page.$$('textarea');
const data = [];
for (const handle of elements) {
  await handle.hover();
  const colour = await page.evaluate(
    element => window.getComputedStyle(element).backgroundColor,
    handle,
  );
  data.push(colour);
}
console.log(data);
Using the node puppeteer module, how do I continue with this code to get the innerContent here?
// Resolve every element handle matching `selector`, then look up a <tr> inside each one.
const els = Promise.all(await page.$$(selector)).then(results => {
// NOTE(review): the mapped promises are neither returned nor awaited, so `els` does not carry the rows.
results.map(async el => {
const tr = await el.$('tr')
// Open question: how to read the innerText content of this element handle?
})
})
Like this
// getProperty() resolves to a handle and jsonValue() resolves to the value,
// so both promises must be awaited to end up with the actual text.
const textValue = await (await tr.getProperty('innerText')).jsonValue();