Select the second table row of a table using Puppeteer - JavaScript

I'm working on a crawler using Node.js and Puppeteer. My goal is to get the data from two columns in a table (date and description). The code works fine until the block that gets the data from the columns...
Full code below, including the URL for the page I'm crawling:
const fs = require('fs');
const puppeteer = require('puppeteer');

const urlConsulta = "http://www.tre-pr.jus.br/";
const numeroProcessoSeq = "000000889";
const numeroProcessoAno = "2014";
const numeroProcessoDigito = "6160047";

var wait = ms => new Promise((r, j) => setTimeout(r, ms));

void (async () => {
    try {
        const browser = await puppeteer.launch({
            headless: false
        });
        const page = await browser.newPage();
        await page.goto(urlConsulta);
        await page.select('#acao', 'pesquisarNumUnico');
        await page.evaluate((numeroProcessoSeq, numeroProcessoAno, numeroProcessoDigito) => {
            document.getElementById('numUnicoSequencial').value = numeroProcessoSeq;
            document.getElementById('numUnicoAno').value = numeroProcessoAno;
            document.getElementById('numUnicoOrigem').value = numeroProcessoDigito;
        }, numeroProcessoSeq, numeroProcessoAno, numeroProcessoDigito);
        await page.$eval('form[action*="http://www.tre-pr.jus.br/##processrequest"]', form => form.submit());
        await page.waitForNavigation();
        var frame = await page.frames().find(f => f.name() === 'ifr_servicos');
        await frame.click('a[href*="ExibirDadosProcesso"]');
        await page.frames().find(f => f.name() === 'ifr_servicos');
        await wait(10000);
        await frame.click('[name*="todos"]');
        await frame.$eval('[name*="ExibirPartesProcessoZona"]', form => form.submit());
        await wait(10000);
        let string = await buscaFases(frame);
        fs.writeFile("teste.txt", string, function(err) {
            if (err) {
                return console.log(err);
            }
            console.log("The file was saved!");
        });
        console.log(string);
        await wait(10000);
        await browser.close();
    } catch (error) {
        console.log(error);
    }
})();

async function buscaFases(frame) {
    return await frame.evaluate(() => {
        let div = document.querySelector('div[id*="conteudo"]');
        let rowns = Array.from(div.children[4].children[0].children);
        let movimentosInfo = rowns.map(row => {
            let data = row.querySelector("tr td:first-child").textContent;
            let descricao = row.querySelector("tr td:first-child + td").textContent;
            return { data, descricao };
        });
        return JSON.stringify(movimentosInfo);
    });
}
The specific lines that get the data:
let data = row.querySelector("tr td:first-child").textContent;
let descricao = row.querySelector("tr td:first-child + td").textContent;

The problem is that not all tr elements have the child elements you are expecting. This is probably because of a td tag with a colspan (for example, a separator or header row). So you should first filter those elements out of your array.
Code
Change the lines around your map function, beginning from let movimentosInfo = ..., to this:
let movimentosInfo = rowns.filter(row => {
    return row.querySelector("tr td:first-child") && row.querySelector("tr td:first-child + td");
}).map(row => {
    let data = row.querySelector("tr td:first-child").textContent;
    let descricao = row.querySelector("tr td:first-child + td").textContent;
    return { data, descricao };
});
This adds a filter step that tests whether the desired elements exist before their content is mapped.
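Alternatively, since the rows come from a table, each entry in rowns should be an HTMLTableRowElement, which exposes its cells directly. A minimal sketch of the same filter using that API (assuming the children really are tr elements):

let movimentosInfo = rowns
    .filter(row => row.cells && row.cells.length >= 2) // skip colspan/header rows
    .map(row => ({
        data: row.cells[0].textContent,
        descricao: row.cells[1].textContent,
    }));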

Related

Puppeteer - how to not repeat if statements to check for missing selectors

I've managed to get Puppeteer working to scrape data off a number of different web pages. However, I'm repeating the same if statement for each bit of data. I'm really new to JavaScript, so I'm pretty sure I'm overlooking something simple that would avoid the repetition.
I've searched online quite a bit and tried a few different things, but I can't get it working.
For the code example below, I've taken out a lot of the different query selectors so it's easier to read, but in the actual code there are 12 of them, all with exactly the same code except for the querySelector.
const puppeteer = require('puppeteer');
// This gets the url's I want to scrape stored on another file
let urls = require('./assets/links.js').urls;

(async () => {
  // Initiate the browser
  const browser = await puppeteer.launch();
  // Create a new page with the default browser context
  const page = await browser.newPage();

  for (let i = 0; i < urls.length; i++) {
    // Go to the target website
    await page.goto(urls[i]);
    let url = urls[i];
    const title = await page.evaluate(() => {
      let element = document.querySelector('h1');
      if (element) {
        return element.innerText;
      }
      return null;
    });
    const reviews = await page.evaluate(() => {
      let element = document.querySelector('.example-class');
      if (element) {
        return element.innerText;
      }
      return null;
    });
    const description = await page.evaluate(() => {
      let element = document.querySelector('#example-id');
      if (element) {
        return element.innerText;
      }
      return null;
    });
    console.log({ url, title, reviews, description });
  }
  // Closes the browser and all of its pages
  await browser.close();
})();
I've tried creating a function but it wouldn't let me use it with await.
You can write a simple function that grabs text from a selector and returns null if the element doesn't exist:
const puppeteer = require("puppeteer"); // ^19.0.0
const {urls} = require("./assets/links.js");

let browser;
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  const text = sel =>
    page.$eval(sel, el => el.textContent).catch(() => null);

  for (const url of urls) {
    await page.goto(url);
    console.log({
      url,
      title: await text("h1"),
      reviews: await text(".example-class"),
      description: await text("#example-id"),
    });
  }
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
If there are 12 of them, you might want to move the selectors into an object and loop over its entries:
const puppeteer = require("puppeteer"); // ^19.0.0
const {urls} = require("./assets/links.js");

let browser;
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  const text = sel =>
    page.$eval(sel, el => el.textContent).catch(() => null);
  const selectors = {
    title: "h1",
    reviews: ".example-class",
    description: "#example-id",
    // ...
  };

  for (const url of urls) {
    await page.goto(url);
    const textProms = Object.entries(selectors).map(
      async ([k, sel]) => [k, await text(sel)]
    );
    console.log({
      url,
      ...Object.fromEntries(await Promise.all(textProms)),
    });
  }
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
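Note that mapping the entries to an array of promises lets the $eval calls for one URL run concurrently; since they are read-only DOM queries against the same loaded page, this is safe and a bit faster than awaiting each selector in turn.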

Web scraping using Puppeteer returns undefined during AtCoder contest

I made a web scraper for parsing test cases of an AtCoder contest. It works well if the contest has already finished, but gives an error for an ongoing contest. The error arises when accessing the rows of the HTML table element. I am positive that the table exists, but for some reason the script returns undefined for an ongoing contest.
Error:
Error: Evaluation failed: TypeError: Cannot read properties of undefined (reading 'rows')
at pptr://__puppeteer_evaluation_script__:3:32
at ExecutionContext._ExecutionContext_evaluate (/mnt/d/c++/node_modules/puppeteer-core/lib/cjs/puppeteer/common/ExecutionContext.js:229:15)
at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
at async ExecutionContext.evaluate (/mnt/d/c++/node_modules/puppeteer-core/lib/cjs/puppeteer/common/ExecutionContext.js:107:16)
at async scrapeSite (/mnt/d/c++/codeforces/atcoder.js:57:33)
Here is my scraper, atcoder.js:
const puppeteer = require("puppeteer");
const fs = require("fs");

const contest_id = process.argv[2];

async function scrapeProblem(problem_letter) {
  const url = `https://atcoder.jp/contests/${contest_id}/tasks/${contest_id}_${problem_letter.toLowerCase()}`;
  console.log(url);
  try {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: "networkidle0" });
    const samples_scraped = await page.evaluate(() => {
      const samples = document.querySelectorAll("pre");
      const scraped = Array.from(samples).filter((child) => {
        return child.id !== "";
      });
      let num_scraped = scraped.length;
      // The elements were repeated twice, so remove the extra elements
      for (let i = 0; i < num_scraped / 2; i++) scraped.pop();
      return scraped.map((ele) => ele.innerText);
      // return Array.from(samples).map((child) => child.innerText);
    });
    let id = 1;
    // Now we need to store the samples in text format
    samples_scraped.map((ele, idx) => {
      if (idx % 2 == 0) {
        // Input
        fs.writeFile(`${problem_letter}-${id}.in`, ele, (err) => {
          if (err) throw err;
        });
      } else {
        // Output
        fs.writeFile(`${problem_letter}-${id}.out`, ele, (err) => {
          if (err) throw err;
        });
        id++;
      }
      return ele;
    });
    await browser.close();
  } catch (e) {
    console.log(e);
  }
}

async function scrapeSite() {
  const url = `https://atcoder.jp/contests/${contest_id}/tasks`;
  console.log(url);
  try {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: "networkidle0" });
    // Returns all the problem letters
    const problem_letters = await page.evaluate(() => {
      const table = document.querySelectorAll("table")[0];
      const rows = table.rows.length;
      const letters = [];
      for (let i = 1; i < rows; i++) {
        letters.push(table.rows[i].cells[0].innerText);
      }
      return letters;
    });
    console.log(problem_letters);
    for (const problem_letter of problem_letters) {
      scrapeProblem(problem_letter);
    }
    await browser.close();
  } catch (e) {
    console.log(e);
  }
}

scrapeSite();
The scrapeProblem(problem_letter) function is a helper that scrapes the test cases for the given problem letter and stores them on the user's file system using the fs module.
The scrapeSite() function first parses the contest home page for the number of problems and the letter associated with each one. It then calls the scrapeProblem(problem_letter) helper to parse the corresponding problem page for test cases.
To run the script: node atcoder.js abc280
Update: I tried it in a new contest and again got the same error. This time I took a screenshot using Puppeteer and found the problem: I get "permission denied" if I try to access the site without logging in during an ongoing contest.
The site requires us to log in before we can see the problem statements of an ongoing contest, so I added a function that first logs in to the site and then proceeds to parse the test cases.
Updated code:
const puppeteer = require("puppeteer");
const fs = require("fs");
require("dotenv").config();

const contest_id = process.argv[2];

async function login(browser, page) {
  const url = `https://atcoder.jp/login?continue=https%3A%2F%2Fatcoder.jp%2F`;
  console.log("Logging in..", url);
  try {
    await page.goto(url, { waitUntil: "networkidle0" });
    await page.type("#username", process.env.USERNAME);
    await page.type("#password", process.env.PASSWORD);
    await page.click("#submit");
  } catch (e) {
    console.log("Login failed...");
    console.log(e);
  }
}

async function scrapeProblem(browser, Problem) {
  const url = Problem.Url;
  console.log(url);
  try {
    // const browser = await puppeteer.launch();
    const page = await browser.newPage();
    // await login(browser, page);
    await page.goto(url, { waitUntil: "networkidle0" });
    const samples_scraped = await page.evaluate(() => {
      const samples = document.querySelectorAll("pre");
      const scraped = Array.from(samples).filter((child) => {
        return child.id !== "";
      });
      let num_scraped = scraped.length;
      // The elements were repeated twice, so remove the extra elements
      for (let i = 0; i < num_scraped / 2; i++) scraped.pop();
      return scraped.map((ele) => ele.innerText);
    });
    let id = 1;
    // Now we need to store the samples in text format
    samples_scraped.map((ele, idx) => {
      if (idx % 2 == 0) {
        // Input
        fs.writeFile(`${Problem.Problem_letter}-${id}.in`, ele, (err) => {
          if (err) throw err;
        });
      } else {
        // Output
        fs.writeFile(`${Problem.Problem_letter}-${id}.out`, ele, (err) => {
          if (err) throw err;
        });
        id++;
      }
      return ele;
    });
    // await browser.close();
  } catch (e) {
    console.log(e);
  }
}

async function scrapeSite() {
  const url = `https://atcoder.jp/contests/${contest_id}/tasks`;
  console.log(url);
  try {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await login(browser, page);
    await page.goto(url, { waitUntil: "networkidle0" });
    // await page.screenshot({ path: "./screenshot.png", fullPage: true });
    // Returns all the problem letters and their URLs
    const problems = await page.evaluate(() => {
      const table = document.querySelectorAll("table")[0];
      const rows = table.rows.length;
      const letters = [];
      for (let i = 1; i < rows; i++) {
        letters.push({
          Problem_letter: table.rows[i].cells[0].innerText,
          Url: table.rows[i].cells[0].firstChild.href,
        });
      }
      return letters;
    });
    console.log(problems);
    const promises = [];
    for (const problem of problems) {
      promises.push(scrapeProblem(browser, problem));
    }
    await Promise.all(promises); // All the promises must be resolved before closing the browser
    await browser.close();
  } catch (e) {
    console.log(e);
  }
}

scrapeSite();
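One caveat in login() above: page.click("#submit") resolves once the click is dispatched, not when the post-login navigation finishes, so the session cookie may not be fully in place when scrapeSite() navigates to the tasks page. A common Puppeteer pattern (a sketch, not verified against AtCoder) is to pair the click with waitForNavigation:

async function login(browser, page) {
  const url = `https://atcoder.jp/login?continue=https%3A%2F%2Fatcoder.jp%2F`;
  await page.goto(url, { waitUntil: "networkidle0" });
  await page.type("#username", process.env.USERNAME);
  await page.type("#password", process.env.PASSWORD);
  // Await the click and the resulting navigation together so the
  // login response (and its session cookie) is processed first.
  await Promise.all([
    page.waitForNavigation({ waitUntil: "networkidle0" }),
    page.click("#submit"),
  ]);
}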

Unable to implement any logic to scrape content from innermost pages using puppeteer

I've created a script using Puppeteer to scrape the links of different authors from a webpage, traversing multiple pages by triggering a click on the next-page button. The script appears to work the right way.
Although the content of this site is static, I intentionally used Puppeteer in the following script just to learn how to parse content from inner pages.
Given that I wish to go one layer deep and scrape the description from those pages, how can I achieve that?
const puppeteer = require('puppeteer');

function run(pagesToScrape) {
  return new Promise(async (resolve, reject) => {
    try {
      if (!pagesToScrape) {
        pagesToScrape = 1;
      }
      const browser = await puppeteer.launch({ headless: false });
      const [page] = await browser.pages();
      await page.goto("https://quotes.toscrape.com/");
      let currentPage = 1;
      let urls = [];
      while (currentPage <= pagesToScrape) {
        let newUrls = await page.evaluate(() => {
          let results = [];
          let items = document.querySelectorAll('[class="quote"]');
          items.forEach((item) => {
            results.push({
              authorUrl: 'https://quotes.toscrape.com' + item.querySelector("small.author + a").getAttribute('href'),
              title: item.querySelector("span.text").innerText
            });
          });
          return results;
        });
        urls = urls.concat(newUrls);
        if (currentPage < pagesToScrape) {
          await Promise.all([
            await page.waitForSelector('li.next > a'),
            await page.click('li.next > a'),
            await page.waitForSelector('[class="quote"]')
          ]);
        }
        currentPage++;
      }
      browser.close();
      return resolve(urls);
    } catch (e) {
      return reject(e);
    }
  });
}

run(3).then(console.log).catch(console.error);
I would go this way:
const puppeteer = require('puppeteer');

let browser;

(async function main() {
  browser = await puppeteer.launch({ headless: false, defaultViewport: null });
  const [pageQuotes] = await browser.pages();
  const pageAbout = await browser.newPage();
  await pageQuotes.bringToFront(); // Otherwise, click on the next page link does not work.

  const pagesToScrape = 3;
  await pageQuotes.goto('https://quotes.toscrape.com/');
  let currentPage = 1;
  const data = { quotes: {}, abouts: {} };
  const visitedAbouts = new Set();

  while (currentPage <= pagesToScrape) {
    await pageQuotes.waitForSelector('.quote');

    const { quotes, aboutURLs } = await pageQuotes.evaluate(() => ({
      quotes: Array.from(
        document.querySelectorAll('.quote'),
        quote => [quote.querySelector('small.author').innerText, quote.innerText],
      ),
      aboutURLs: Array.from(
        document.querySelectorAll('.quote small.author + a[href]'),
        quote => quote.href,
      ),
    }));

    for (const [author, quote] of quotes) {
      if (data.quotes[author] === undefined) data.quotes[author] = [];
      data.quotes[author].push(quote);
    }

    for (const aboutURL of aboutURLs) {
      if (!visitedAbouts.has(aboutURL)) {
        visitedAbouts.add(aboutURL);
        await pageAbout.goto(aboutURL);
        await pageAbout.waitForSelector('div.author-details');
        const { title, about } = await pageAbout.evaluate(() => ({
          title: document.querySelector('div.author-details h3.author-title').innerText,
          about: document.querySelector('div.author-details').innerText,
        }));
        data.abouts[title] = about;
      }
    }

    if (currentPage < pagesToScrape) {
      const nextLink = await pageQuotes.waitForSelector('li.next > a');
      await Promise.all([
        nextLink.click(),
        pageQuotes.waitForNavigation(),
      ]);
    }
    currentPage++;
  }

  console.log(JSON.stringify(data, null, ' '));
})().catch(console.error).finally(async () => { if (browser) await browser.close(); });
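The design choice here is to keep two tabs open: pageQuotes holds the paginated list so its state survives the whole loop, while pageAbout is reused for every author page; the visitedAbouts set avoids re-fetching an author who is linked from several quotes.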

Puppeteer: setDefaultNavigationTimeout to 0 still times out

Every time I run this script, it times out.
Does setDefaultNavigationTimeout actually prevent timeouts?
There are about 26 URLs I'm going through, and each page has a large number of images. I can't imagine Puppeteer can't handle these pages just because of heavy images?
const url = 'test.com';
const jsonReturn = [];

async function runScraper() {
  const browser = await puppeteer.launch(prodConfig);
  const page = await browser.newPage({
    timeout: 0
  });
  page.setDefaultNavigationTimeout(0);
  await page.goto(url, { waitUntil: 'domcontentloaded' });
  await page.waitForSelector('.featured-shows-featured-show');

  let featuredShowsURLs = await page.$$eval('.featured-shows-featured-show > a', (links) => {
    return links.map(link => {
      return link.href;
    });
  });
  featuredShowsURLs = _.uniq(featuredShowsURLs);

  for (const featuredShowsURL of featuredShowsURLs) {
    const page = await browser.newPage({
      timeout: 0
    });
    try {
      await page.goto(featuredShowsURL);
      await page.waitForSelector('.show-title');
    } catch (e) {
      featuredShowsURL;
      debugger;
    }

    const showTitle = await findAndReturnSelectorText('.show-title', page);
    const showDates = await findAndReturnSelectorText('.show-dates', page);
    const showLocation = await findAndReturnSelectorText('.show-location', page);
    const showGallery = await findAndReturnSelectorText('.entity-link', page);
    const showDetail = await findAndReturnSelectorText('.show-press-release', page);

    const newItem = {
      showTitle,
      showDates,
      showLocation,
      showGallery,
      showDetail,
    };

    const id = hash(newItem);
    jsonReturn.push({
      ...newItem,
      id
    });
  }

  await browser.close();
}

runScraper();
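Two details of Puppeteer's timeout API may explain this. setDefaultNavigationTimeout(0) only affects navigation calls such as page.goto and page.waitForNavigation; page.waitForSelector uses the separate default timeout (30 seconds unless changed with page.setDefaultTimeout), so those calls can still time out. Also, browser.newPage() accepts no options, so passing { timeout: 0 } there has no effect. A sketch of the adjustments:

const page = await browser.newPage(); // newPage() takes no timeout option
page.setDefaultNavigationTimeout(0);  // disables the timeout for navigations
page.setDefaultTimeout(0);            // disables the timeout for other waits, e.g. waitForSelector

// ...or override it for a single call:
await page.waitForSelector('.show-title', { timeout: 0 });

Since these are per-page settings, the same lines would be needed on each page created inside the loop.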

How to get an attribute value from an HTML <a> node?

I am trying to scrape the website below, and I am not getting the value of the 'data-link' attribute.
http://www.apptrace.com/itunes/charts/FRA/topfreeapplications/36/2018-12-27
Could someone help me?
//attempt #1 (error)
const puppeteer = require('puppeteer')

let scrape = async () => {
  const browser = await puppeteer.launch({headless: true})
  const page = await browser.newPage()
  await page.goto('http://www.apptrace.com/itunes/charts/USA/topfreeapplications/36')
  await page.waitFor(1000)

  const countryCharts = await page.evaluate(() => {
    const abbrAppsCountry = []
    document.getElementById('#current_storefront_list')
      .getAttribute('li > a[data-link]')
      .forEach(app => abbrAppsCountry.push(app.value))
    return abbrAppsCountry
  })

  browser.close()
  return countryCharts
}

scrape().then((value) => {
  console.log(value)
})
//attempt #2 (array of nulls)
const puppeteer = require('puppeteer')

let scrape = async () => {
  const browser = await puppeteer.launch({headless: true})
  const page = await browser.newPage()
  await page.goto('http://www.apptrace.com/itunes/charts/USA/topfreeapplications/36')
  await page.waitFor(1000)

  const countryCharts = await page.evaluate(() => {
    const abbrAppsCountry = []
    document.querySelectorAll('#current_storefront_list > li > a[data-link]')
      .forEach(app => abbrAppsCountry.push(app.value))
    return abbrAppsCountry
  })

  browser.close()
  return countryCharts
}

scrape().then((value) => {
  console.log(value)
})
I would like to get the abbreviations of the country names.
You can use dataset or getAttribute APIs:
document.querySelectorAll('#current_storefront_list > li > a')
  .forEach(app => abbrAppsCountry.push(app.dataset.link))
Or:
document.querySelectorAll('#current_storefront_list > li > a')
  .forEach(app => abbrAppsCountry.push(app.getAttribute('data-link')))
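Wired into the scraper from attempt #2, that looks like this minimal sketch (same URL and selector as above, with page.$$eval running the callback over all matches in the page context, and waitForSelector in place of the fixed one-second delay):

const puppeteer = require('puppeteer')

let scrape = async () => {
  const browser = await puppeteer.launch({headless: true})
  const page = await browser.newPage()
  await page.goto('http://www.apptrace.com/itunes/charts/USA/topfreeapplications/36')
  await page.waitForSelector('#current_storefront_list > li > a')
  // Collect each anchor's data-link attribute via its dataset
  const countryCharts = await page.$$eval(
    '#current_storefront_list > li > a',
    links => links.map(link => link.dataset.link)
  )
  await browser.close()
  return countryCharts
}

scrape().then(console.log)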
