Puppeteer not able to fetch data for some reason - javascript

I have created a react app that gives the following output as HTML:
import './App.css';
import Home from './Home';
function App() {
return (
<div className="Apps">
<strong>"root1"</strong>
<div className="content">
<p>"root"</p>
</div>
</div>
);
}
export default App;
My goal here is to basically retrieve "root1" from the <strong> tag under the Apps class. I have written the following code but it doesn't retrieve the data:
const puppeteer = require("puppeteer")
const fs = require("fs/promises")
async function start() {
const browser = await puppeteer.launch()
const page = await browser.newPage()
await page.goto("https://.herokuapp.com/")
const names = await page.evaluate(() => {
return Array.from(document.querySelectorAll(".Apps strong"), els => els.map(el => el.textContent))
})
await fs.writeFile("names.txt", names.join("\r\n"))
await browser.close()
}
start()
When I run the script, a new text file is created but it is empty instead of showing "root1". Can someone help me? Thanks for your time!

The correct answer was given to me by #ggorlen, it should be like this:
const puppeteer = require("puppeteer")
const fs = require("fs/promises")
async function start() {
const browser = await puppeteer.launch()
const page = await browser.newPage()
await page.goto("url")
const names = await page.evaluate(() => {
return Array.from(document.querySelectorAll(".Apps strong"), el => el.textContent)
})
await fs.writeFile("names.txt", names.join("\r\n"))
await browser.close()
}
start()
Thanks!

Related

I want to get the urls of each home from the attribute content

const puppeteer = require("puppeteer");
const cheerio = require("cheerio");
const url = "https://www.airbnb.co.in/s/Haridwar--Uttarakhand/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&price_filter_input_type=0&price_filter_num_nights=5&l2_property_type_ids%5B%5D=1&search_type=autocomplete_click&query=Haridwar%2C%20Uttarakhand&place_id=ChIJyVfuuA5HCTkR8_VApnaRRE4&date_picker_type=calendar&source=structured_search_input_header";
async function scrapHomesPage(url)
{
try
{
const browser = await puppeteer.launch({headless:false});
const page = await browser.newPage();
await page.goto(url);
const html = await page.evaluate(()=> document.body.innerHTML);
const $ = cheerio.load(html);
const homes = $('[itemprop="url"]').map((i, element) => $(element).attr("content")).get();
console.log(homes);
}
catch(err)
{
console.error(err);
}
}
scrapHomesPage("https://www.airbnb.co.in/s/Haridwar--Uttarakhand/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&price_filter_input_type=0&price_filter_num_nights=5&l2_property_type_ids%5B%5D=1&search_type=autocomplete_click&query=Haridwar%2C%20Uttarakhand&place_id=ChIJyVfuuA5HCTkR8_VApnaRRE4&date_picker_type=calendar&source=structured_search_input_header");
I tried to add everything I could to wait for the page to load all the contents. I tried wait for selectors etc. I am always getting an empty array instead I should get an array with all the links of each home listed on the Airbnb site for that particular location.
I don't see any reason to use Cheerio here. It's just another layer of indirection to get the data you want, involving an extra dependency, a whole second parse of the page and the potential for bugs when the page goes out of sync with the HTML snapshot you've created. If you do need to use it, you can use page.content() instead of page.evaluate(() => document.body.innerHTML).
As for the main problem, you appear to be missing a call to page.waitForSelector:
const puppeteer = require("puppeteer"); // ^19.0.0
const url = "your url";
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.goto(url, {waitUntil: "domcontentloaded"});
await page.waitForSelector('[itemprop="url"]');
const content = await page.$$eval(
'[itemprop="url"]',
els => els.map(el => el.getAttribute("content"))
);
console.log(content);
})()
.catch(err => console.error(err))
.finally(() => browser?.close());

Having issues using puppeteer on youtube

I'm trying to write a scraper using puppeteer that logs each superchat(and super thanks eventually) posted in a youtube videos live stream.
Here is my current code:
const puppeteer = require('puppeteer')
const fs = require("fs/promises")
async function start() {
const browser = await puppeteer.launch()
const page = await browser.newPage()
await page.goto("https://www.youtube.com/watch?v=1ykh6t3mec0")
const superchats = await page.evaluate(() => {
return Array.from(document.querySelectorAll('#purchase-amount-chip')).map(x => x.textContent)
})
await fs.writeFile("superchats.txt", superchats.join('\r\n'))
await browser.close()
}
start()
This seems to work on other websites but youtube is very odd. Not sure what the issue is to be honest.

When I try to web scrape a dynamic website, i get back an empty array

I am trying to web scrape a dynamic website with puppeteer, using this code:
const puppeteer = require('puppeteer');
async function getTokoPedia(){
const browser = await puppeteer.launch({ headless: false }); // for test disable the headlels mode,
const page = await browser.newPage();
await page.setViewport({ width: 1000, height: 926 });
await page.goto("https://store.401games.ca/collections/pokemon-singles",{waitUntil: 'networkidle2'});
console.log("start evaluate javascript")
var productNames = await page.evaluate(()=>{
var div = document.querySelectorAll('.info-container');
console.log(div) // console.log inside evaluate, will show on browser console not on node console
var productnames = []
div.forEach(element => {
var price = element.querySelector(' .fs-result-page-3sdl0h')
if(price != null){
productnames.push(price.innerText);
}
});
return productnames
})
console.log(productNames)
browser.close()
}
getTokoPedia();
However, upon running it, I get back an empty array. How can I fix this?
Two problems:
The elements you want are in a shadow root, so you have to pierce the root as described in Puppeteer not giving accurate HTML code for page with shadow roots.
The cards lazy-load, so you'd have to scroll down to be able to populate their data into the DOM.
But there's an easier way to get the initial set of data, which is in the static HTML as a JSON blob in var meta = {"products":...};. You can scrape it with a regex, as described in this tutorial.
Here's an example showing both approaches:
const puppeteer = require("puppeteer"); // ^14.1.1
let browser;
(async () => {
browser = await puppeteer.launch({headless: true});
const [page] = await browser.pages();
const url = "https://store.401games.ca/collections/pokemon-singles";
await page.goto(url, {waitUntil: "domcontentloaded"});
// here's the hard way for illustration:
const el = await page.waitForSelector("#fast-simon-serp-app");
await page.waitForFunction(({shadowRoot}) =>
shadowRoot.querySelector(".product-card .title")
, {}, el);
const items = await el.evaluate(({shadowRoot}) =>
[...shadowRoot.querySelectorAll(".product-card")]
.map(e => ({
title: e.querySelector(".title")?.textContent,
price: e.querySelector(".price")?.textContent,
}))
);
console.log(items); // just the first 6 or so
// TODO scroll the page to get the rest;
// I didn't bother implementing that...
// ...or do it the easy way:
const html = await page.content();
const pat = /^[\t ]*var meta = ({"products":[^\n]+);$/m;
const data = JSON.parse(html.match(pat)[1]);
console.log(JSON.stringify(data, null, 2));
})()
.catch(err => console.error(err))
.finally(() => browser?.close())
;
At this point, since we're not dealing with anything but the static HTML, you can dump Puppeteer and use axios or fetch to get the data more efficiently:
const axios = require("axios");
axios.get("https://store.401games.ca/collections/pokemon-singles")
.then(({data: body}) => {
const pat = /^[\t ]*var meta = ({"products":[^\n]+);$/m;
const data = JSON.parse(body.match(pat)[1]);
console.log(JSON.stringify(data, null, 2));
})
.catch(err => console.error(err))
;
Now, the data.products array contains 50 but the UI shows 26466 results. If you want more than those initial items from the static HTML's var meta, which appears to be the same on all 1000+ pages, I suggest using the API. A URL looks like https://ultimate-dot-acp-magento.appspot.com/categories_navigation?request_source=v-next&src=v-next&UUID=d3cae9c0-9d9b-4fe3-ad81-873270df14b5&uuid=d3cae9c0-9d9b-4fe3-ad81-873270df14b5&store_id=17041809&cdn_cache_key=1654217982&api_type=json&category_id=269055623355&facets_required=1&products_per_page=5000&page_num=1&with_product_attributes=true. You can see there are ids and keys that probably protect against usage by parties other than the site, but I didn't see any change other than cdn_cache_key after a few tries. I'm not sure how long a URL is valid, but while it is, you can set products_per_page=1000 for example, then move page_num=1 forward 27 times or so. This gets you all of the data while avoiding all of the difficulties of scraping from the page itself.
Here's a pessimistic approach that uses Puppeteer to get an up-to-date URL, in case a URL goes stale:
const axios = require("axios");
const puppeteer = require("puppeteer"); // ^14.1.1
let browser;
(async () => {
browser = await puppeteer.launch({headless: true});
const [page] = await browser.pages();
const url = "https://store.401games.ca/collections/pokemon-singles";
const reqP = page.waitForRequest(res =>
res.url()
.startsWith("https://ultimate-dot-acp-magento.appspot.com/categories_navigation")
);
await page.goto(url, {waitUntil: "domcontentloaded"});
const req = await reqP;
const apiUrl = req
.url()
.replace(/(?<=products_per_page=)(\d+)/, 1000);
const {data} = await axios.get(apiUrl);
console.log(JSON.stringify(data, null, 2));
})()
.catch(err => console.error(err))
.finally(() => browser?.close())
;
And tossing in the loop:
const axios = require("axios");
const fs = require("fs").promises;
const puppeteer = require("puppeteer"); // ^14.1.1
let browser;
(async () => {
browser = await puppeteer.launch({headless: true});
const [page] = await browser.pages();
const url = "https://store.401games.ca/collections/pokemon-singles";
const reqP = page.waitForRequest(res =>
res.url()
.startsWith("https://ultimate-dot-acp-magento.appspot.com/categories_navigation")
);
await page.goto(url, {waitUntil: "domcontentloaded"});
const req = await reqP;
const apiUrl = req
.url()
.replace(/(?<=products_per_page=)(\d+)/, 1000);
const items = [];
for (let i = 1;; i++) {
const pageUrl = apiUrl.replace(/(?<=page_num=)(\d+)/, i);
const response = await axios.get(pageUrl);
if (response.status !== 200 ||
items.length >= response.data.total_results) {
break;
}
items.push(...response.data.items);
}
await fs.writeFile("data.json", JSON.stringify(items));
console.log(items.slice(0, 10));
console.log(items.length);
})()
.catch(err => console.error(err))
.finally(() => browser?.close())
;
This hammers the site, pulling a ton of data in a short amount of time, so consider this script for educational purposes, or modify it to throttle your requests way back.

Exporting result from async function in separate js file, importing result in another javascript

Trying to build a small scraper. To reuse functionality I thought 'Page Object Models' would come in handy.
In main.js I require multiple small scripts, in the example below there is only one model (GooglePage).
The scripts work. But I would like to know how to pass a value from the google.js script back to the main script.
I want to use the value of the 'pageCountClean' variable in the main.js script to use in the rest of the application.
Have been searching for information about passing values and functions between scripts. For exporting values from pageconstructors, for promise await export function.
But I am lost. Do I have to use Promises?, is the current way of require/importing and exporting enough to create the relationship between the scripts?
Any pointers are welcome.
//////////// main.js
const { chromium } = require('playwright');
const { GooglePage } = require('./models/Google');
(async () => {
const browser = await chromium.launch({ headless: true, slowMo: 250 });
const context = await browser.newContext();
const GoogleUrl80 = https://www.google.nl/search?q=site%3Anu.nl;
// Cookie consent:
console.log('Cookie consent - start');
const page80 = await browser.newPage();
await page80.goto('https://google.nl');
await page80.waitForTimeout(1000);
await page80.keyboard.press('Tab');
await page80.keyboard.press('Tab');
await page80.keyboard.press('Enter');
console.log('Cookie Consent - done');
// Number of urls in google.nl (using google.js)
await page80.goto(GoogleUrl80, {waitUntil: 'networkidle'});
const googlePage80 = new GooglePage(page80);
await googlePage80.scrapeGoogle();
// Want to console.log 'pageCountClean' here.
await browser.close()
})()
//////////// Google.js
class GooglePage {
constructor(page) {
this.page = page;
}
async scrapeGoogle() {
const GoogleXpath = '//div[#id="result-stats"]';
const pageCount = await this.page.$eval(GoogleXpath, (el) => el.innerText);
const pageCountClean = pageCount.split(" ")[1];
console.log(pageCountClean);
}
}
module.exports = { GooglePage };
You can just return pageCountClean from your async function and await it in your main.js file:
in Google.js:
async scrapeGoogle() {
const GoogleXpath = '//div[#id="result-stats"]';
const pageCount = await this.page.$eval(GoogleXpath, (el) => el.innerText);
const pageCountClean = pageCount.split(" ")[1];
console.log(pageCountClean);
return pageCountClean;
}
in main.js:
const googlePage80 = new GooglePage(page80);
const result = await googlePage80.scrapeGoogle();
console.log(result);

Can't get url from list of images using pupetter

I'm trying to make a scraper using puppeteer using node and everything seems to work fine. I want to get an array of objects looking like this:
[{
title,
price,
link,
image,
}]
and the following code accomplish it, I got lucky and there was a data attribute with the image src on the page and was able to get it like this:
img: item.querySelector('.imagebox').dataset.imgsrc,.
Nevertheless I would like to know why this code fails when I want to get the src like this
image: item.querySelector('img').src,
here is the code I use and the url for the website I'm trying to scrape.
import puppeteer from 'puppeteer'
async function getHTML(url) {
const browser = await puppeteer.launch()
const page = await browser.newPage()
await page.goto(url)
const listItem = await page.evaluate(() =>
[...document.querySelectorAll('.aditem')].map(item => ({
title: item.querySelector('.text-module-begin').textContent.trim(),
price: item.querySelector('.aditem-details strong').textContent.trim(),
link: item.querySelector('.ellipsis').href,
img: item.querySelector('.imagebox').dataset.imgsrc,
image: item.querySelector('img').src,
}))
)
console.log(listItem)
await browser.close()
}
const searchArea = `s-kreuzberg`
const searchParam = `bike`
const url = `https://www.ebay-kleinanzeigen.de/${searchArea}/seite:1/${searchParam}/k0l3375r5`
async function go() {
await getHTML(url)
}
go()
thanks in advance for any help 👍🏽
Images of the page are lazy-loaded as soon as they are scrolled into view. So we need to scroll to them and to wait a bit.
Even then some images are not added to the DOM due to some reason, so we need to add a check for these cases.
You can try something like this:
import puppeteer from 'puppeteer'
async function getHTML(url) {
const browser = await puppeteer.launch()
const page = await browser.newPage()
await page.goto(url)
const listItem = await page.evaluate(async () => {
function delay(ms) {
return new Promise((resolve) => { setTimeout(resolve, ms) })
}
const items = [...document.querySelectorAll('.aditem')]
for (const item of items) {
item.scrollIntoView()
await delay(100)
}
return items.map(item => ({
title: item.querySelector('.text-module-begin').textContent.trim(),
price: item.querySelector('.aditem-details strong').textContent.trim(),
link: item.querySelector('.ellipsis').href,
img: item.querySelector('.imagebox').dataset.imgsrc,
image: item.querySelector('img')? item.querySelector('img').src : null,
}));
}
)
console.log(listItem)
await browser.close()
}
const searchArea = `s-kreuzberg`
const searchParam = `bike`
const url = `https://www.ebay-kleinanzeigen.de/${searchArea}/seite:1/${searchParam}/k0l3375r5`
async function go() {
await getHTML(url)
}
go()

Categories

Resources