How to reload a page until it changes using Puppeteer - javascript

Would like to know how to reload a page continually until the content changes using Puppeteer in Headed Mode.

Please select this solution as the right answer if you find this was helpful and correct.
const puppeteer = require('puppeteer')
let contentHTML = ''
let reloadedHTML = ''
;(async () => {
const browser = await puppeteer.launch({ headless: false })
const page = (await browser.pages())[0]
const firstLoad = await page.goto(url)
contentHTML = await firstLoad.text()
do {
let secondLoad = await page.reload()
reloadedHTML = await secondLoad.text()
} while (reloadedHTML === contentHTML)
})()

Related

I want to get the urls of each home from the attribute content

const puppeteer = require("puppeteer");
const cheerio = require("cheerio");
const url = "https://www.airbnb.co.in/s/Haridwar--Uttarakhand/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&price_filter_input_type=0&price_filter_num_nights=5&l2_property_type_ids%5B%5D=1&search_type=autocomplete_click&query=Haridwar%2C%20Uttarakhand&place_id=ChIJyVfuuA5HCTkR8_VApnaRRE4&date_picker_type=calendar&source=structured_search_input_header";
async function scrapHomesPage(url)
{
try
{
const browser = await puppeteer.launch({headless:false});
const page = await browser.newPage();
await page.goto(url);
const html = await page.evaluate(()=> document.body.innerHTML);
const $ = cheerio.load(html);
const homes = $('[itemprop="url"]').map((i, element) => $(element).attr("content")).get();
console.log(homes);
}
catch(err)
{
console.error(err);
}
}
scrapHomesPage("https://www.airbnb.co.in/s/Haridwar--Uttarakhand/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&price_filter_input_type=0&price_filter_num_nights=5&l2_property_type_ids%5B%5D=1&search_type=autocomplete_click&query=Haridwar%2C%20Uttarakhand&place_id=ChIJyVfuuA5HCTkR8_VApnaRRE4&date_picker_type=calendar&source=structured_search_input_header");
I tried to add everything I could to wait for the page to load all the contents. I tried wait for selectors etc. I am always getting an empty array instead I should get an array with all the links of each home listed on the Airbnb site for that particular location.
I don't see any reason to use Cheerio here. It's just another layer of indirection to get the data you want, involving an extra dependency, a whole second parse of the page and the potential for bugs when the page goes out of sync with the HTML snapshot you've created. If you do need to use it, you can use page.content() instead of page.evaluate(() => document.body.innerHTML).
As for the main problem, you appear to be missing a call to page.waitForSelector:
const puppeteer = require("puppeteer"); // ^19.0.0
const url = "your url";
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.goto(url, {waitUntil: "domcontentloaded"});
await page.waitForSelector('[itemprop="url"]');
const content = await page.$$eval(
'[itemprop="url"]',
els => els.map(el => el.getAttribute("content"))
);
console.log(content);
})()
.catch(err => console.error(err))
.finally(() => browser?.close());

Why puppeteer doesnt screenshot the whole element?

I am trying to take a screenshot of an svg element, but am only getting a partial picture. What am I doing wrong ?
'use strict';
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch();
try {
const page = await browser.newPage();
await page.goto('https://jisho.org/search/%E5%AE%B6%20%23kanji');
const selector = '#result_area > div > div:nth-child(2) > div.small-12.large-10.columns > div > div > div > svg';
await page.waitForSelector(selector);
const element = await page.$(selector);
await element.screenshot({
path: 'example.png',
});
} catch (e) {
console.log(e)
} finally {
await browser.close();
}
})();
If it were me, I'd just save the SVG directly rather than converting it to a PNG. This preserves the scalability and raw data without quality loss and you can always convert to PNG later.
But if you really want a PNG only and you want a pure Puppeteer solution, the issue is that overflow-x: hidden CSS is on the parent container of the SVG, and the overall page layout makes it fussy to screenshot.
So I'd just rip out all of the page HTML except for the one element you're interested in. This makes it much easier to capture.
const puppeteer = require("puppeteer"); // ^16.2.0
let browser;
(async () => {
browser = await puppeteer.launch({headless: true});
const [page] = await browser.pages();
const url = "https://jisho.org/search/%E5%AE%B6%20%23kanji";
await page.goto(url, {waitUntil: "domcontentloaded"});
const sel = ".stroke_order_diagram--outer_container svg";
const el = await page.waitForSelector(sel);
const svg = await el.evaluateHandle(el => {
document.body.innerHTML = el.outerHTML;
return document.querySelector("svg");
});
await svg.screenshot({path: "example.png"});
})()
.catch(err => console.error(err))
.finally(() => browser?.close())
;

How to get the first link under a ul tag using Puppeteer?

I am trying to get the link of the latest house posting in a real estate website.
This is the code I have written til now
const puppeteer = require("puppeteer");
const link =
"https://www.daft.ie/property-for-rent/dublin-4-dublin?radius=5000&numBeds_from=2&numBeds_to=3&sort=publishDateDesc";
(async () => {
const browser = await puppeteer.launch({
headless: false,
defaultViewport: null,
});
const page = await browser.newPage();
await page.goto(link);
const elements = await page.$x("//button[normalize-space()='Accept All']");
await elements[0].click();
// const handle = await page.waitForXPath("//ul[#data-testid='results']");
// const yourHref = await page.evaluate(
// (anchor) => anchor.getAttribute("href"),
// handle
// );
const hrefs1 = await page.evaluate(() =>
Array.from(document.querySelectorAll("a[href]"), (a) =>
a.getAttribute("href")
)
);
console.log(hrefs1);
await browser.close();
})();
However, this code is to get all the href links on the target page.
HTML code of the page:
It is easier to read the code from the picture than if I paste the code, thats why I attached an image.
As you can see under ul tag with data-testid=results there are many li tags inside which there is a a href, I wish to extract the link from this and that too only the top most li link as it will newest house posting.
How can I do this?
Expected output - I just want the first link under li tag. In the picture above, the output would be
/for-rent/house-glencloy-road-whitehall-dublin-9/4072150
Following up on the comment chain, the selector '[data-testid="results"] a[href]' should give the first result href.
const puppeteer = require("puppeteer"); // ^16.2.0
let browser;
(async () => {
browser = await puppeteer.launch({headless: false});
const [page] = await browser.pages();
const url =
"https://www.daft.ie/property-for-rent/dublin-4-dublin?radius=5000&numBeds_from=2&numBeds_to=3&sort=publishDateDesc";
await page.goto(url, {waitUntil: "domcontentloaded"});
const xp = "//button[normalize-space()='Accept All']";
const cookiesBtn = await page.waitForXPath(xp);
await cookiesBtn.click();
const el = await page.waitForSelector('[data-testid="results"] a[href]');
console.log(await el.evaluate(el => el.getAttribute("href")));
})()
.catch(err => console.error(err))
.finally(() => browser?.close())
;
If you want all of the result hrefs, try:
const allHrefs = await page.$$eval(
'[data-testid="results"] a[href]',
els => els.map(e => e.getAttribute("href"))
);
Note that the data is available statically, so you could just use fetch (native on Node 18+) and Cheerio which is faster and probably more reliable, assuming there's no detection issues (and you could add a user-agent and take other counter-measures if there are):
const cheerio = require("cheerio"); // 1.0.0-rc.12
const url = "https://www.daft.ie/property-for-rent/dublin-4-dublin?radius=5000&numBeds_from=2&numBeds_to=3&sort=publishDateDesc";
fetch(url).then(res => res.text()).then(html => {
const $ = cheerio.load(html);
const sel = '[data-testid="results"] a[href]';
console.log($(sel).attr("href"));
// or all:
console.log([...$(sel)].map(e => e.attribs.href));
});
On my slow machine this took 3.5 seconds versus 30 seconds for headful Puppeteer and 15-20 seconds for headless Puppeteer depending on cache warmth.
Or, if you are using Puppeteer for whatever reason, you could block all the requests, JS and images to speed things up dramatically. Your default await page.goto(link); waits for the load event, which is content you may not need.

When I try to web scrape a dynamic website, i get back an empty array

I am trying to web scrape a dynamic website with puppeteer, using this code:
const puppeteer = require('puppeteer');
async function getTokoPedia(){
const browser = await puppeteer.launch({ headless: false }); // for test disable the headlels mode,
const page = await browser.newPage();
await page.setViewport({ width: 1000, height: 926 });
await page.goto("https://store.401games.ca/collections/pokemon-singles",{waitUntil: 'networkidle2'});
console.log("start evaluate javascript")
var productNames = await page.evaluate(()=>{
var div = document.querySelectorAll('.info-container');
console.log(div) // console.log inside evaluate, will show on browser console not on node console
var productnames = []
div.forEach(element => {
var price = element.querySelector(' .fs-result-page-3sdl0h')
if(price != null){
productnames.push(price.innerText);
}
});
return productnames
})
console.log(productNames)
browser.close()
}
getTokoPedia();
However, upon running it, I get back an empty array. How can I fix this?
Two problems:
The elements you want are in a shadow root, so you have to pierce the root as described in Puppeteer not giving accurate HTML code for page with shadow roots.
The cards lazy-load, so you'd have to scroll down to be able to populate their data into the DOM.
But there's an easier way to get the initial set of data, which is in the static HTML as a JSON blob in var meta = {"products":...};. You can scrape it with a regex, as described in this tutorial.
Here's an example showing both approaches:
const puppeteer = require("puppeteer"); // ^14.1.1
let browser;
(async () => {
browser = await puppeteer.launch({headless: true});
const [page] = await browser.pages();
const url = "https://store.401games.ca/collections/pokemon-singles";
await page.goto(url, {waitUntil: "domcontentloaded"});
// here's the hard way for illustration:
const el = await page.waitForSelector("#fast-simon-serp-app");
await page.waitForFunction(({shadowRoot}) =>
shadowRoot.querySelector(".product-card .title")
, {}, el);
const items = await el.evaluate(({shadowRoot}) =>
[...shadowRoot.querySelectorAll(".product-card")]
.map(e => ({
title: e.querySelector(".title")?.textContent,
price: e.querySelector(".price")?.textContent,
}))
);
console.log(items); // just the first 6 or so
// TODO scroll the page to get the rest;
// I didn't bother implementing that...
// ...or do it the easy way:
const html = await page.content();
const pat = /^[\t ]*var meta = ({"products":[^\n]+);$/m;
const data = JSON.parse(html.match(pat)[1]);
console.log(JSON.stringify(data, null, 2));
})()
.catch(err => console.error(err))
.finally(() => browser?.close())
;
At this point, since we're not dealing with anything but the static HTML, you can dump Puppeteer and use axios or fetch to get the data more efficiently:
const axios = require("axios");
axios.get("https://store.401games.ca/collections/pokemon-singles")
.then(({data: body}) => {
const pat = /^[\t ]*var meta = ({"products":[^\n]+);$/m;
const data = JSON.parse(body.match(pat)[1]);
console.log(JSON.stringify(data, null, 2));
})
.catch(err => console.error(err))
;
Now, the data.products array contains 50 but the UI shows 26466 results. If you want more than those initial items from the static HTML's var meta, which appears to be the same on all 1000+ pages, I suggest using the API. A URL looks like https://ultimate-dot-acp-magento.appspot.com/categories_navigation?request_source=v-next&src=v-next&UUID=d3cae9c0-9d9b-4fe3-ad81-873270df14b5&uuid=d3cae9c0-9d9b-4fe3-ad81-873270df14b5&store_id=17041809&cdn_cache_key=1654217982&api_type=json&category_id=269055623355&facets_required=1&products_per_page=5000&page_num=1&with_product_attributes=true. You can see there are ids and keys that probably protect against usage by parties other than the site, but I didn't see any change other than cdn_cache_key after a few tries. I'm not sure how long a URL is valid, but while it is, you can set products_per_page=1000 for example, then move page_num=1 forward 27 times or so. This gets you all of the data while avoiding all of the difficulties of scraping from the page itself.
Here's a pessimistic approach that uses Puppeteer to get an up-to-date URL, in case a URL goes stale:
const axios = require("axios");
const puppeteer = require("puppeteer"); // ^14.1.1
let browser;
(async () => {
browser = await puppeteer.launch({headless: true});
const [page] = await browser.pages();
const url = "https://store.401games.ca/collections/pokemon-singles";
const reqP = page.waitForRequest(res =>
res.url()
.startsWith("https://ultimate-dot-acp-magento.appspot.com/categories_navigation")
);
await page.goto(url, {waitUntil: "domcontentloaded"});
const req = await reqP;
const apiUrl = req
.url()
.replace(/(?<=products_per_page=)(\d+)/, 1000);
const {data} = await axios.get(apiUrl);
console.log(JSON.stringify(data, null, 2));
})()
.catch(err => console.error(err))
.finally(() => browser?.close())
;
And tossing in the loop:
const axios = require("axios");
const fs = require("fs").promises;
const puppeteer = require("puppeteer"); // ^14.1.1
let browser;
(async () => {
browser = await puppeteer.launch({headless: true});
const [page] = await browser.pages();
const url = "https://store.401games.ca/collections/pokemon-singles";
const reqP = page.waitForRequest(res =>
res.url()
.startsWith("https://ultimate-dot-acp-magento.appspot.com/categories_navigation")
);
await page.goto(url, {waitUntil: "domcontentloaded"});
const req = await reqP;
const apiUrl = req
.url()
.replace(/(?<=products_per_page=)(\d+)/, 1000);
const items = [];
for (let i = 1;; i++) {
const pageUrl = apiUrl.replace(/(?<=page_num=)(\d+)/, i);
const response = await axios.get(pageUrl);
if (response.status !== 200 ||
items.length >= response.data.total_results) {
break;
}
items.push(...response.data.items);
}
await fs.writeFile("data.json", JSON.stringify(items));
console.log(items.slice(0, 10));
console.log(items.length);
})()
.catch(err => console.error(err))
.finally(() => browser?.close())
;
This hammers the site, pulling a ton of data in a short amount of time, so consider this script for educational purposes, or modify it to throttle your requests way back.

Get command line flags of launched Puppeteer instance programmatically

I'm launching a Puppeteer instance that I would like to get some info of which flags this instance was launched with. For example, the --user-data-dir flag since sometimes I would like to use the same Puppeteer profile that would store cookies and login info.
Is there a way to fetch the values visible at chrome://version programmatically?
const puppeteer = require('puppeteer');
(async () => {
const browserURL = 'http://127.0.0.1:9222';
browser = await puppeteer.connect({browserURL,defaultViewport : null });
page = await browser.newPage();
})();
Try this:
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch();
console.log(browser.process().spawnargs);
await browser.close();
})();
UPD. For connected browser:
await page.goto('chrome://version');
const tableCell = await page.waitForSelector('#command_line');
const commandLine = await page.evaluate(element => element.innerText, tableCell);
console.log(commandLine);
Puppeteer has browser.version() function which return the same information.
let details = browser.version()
or
let details = page.browser.version()
You can check more information here : https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#browserversion

Categories

Resources