const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  const url = 'https://www.booking.com/searchresults.en-us.html?label=gen173nr-1FCAEoggI46AdIM1gEaLUBiAEBmAExuAEXyAEP2AEB6AEB-AECiAIBqAIDuAL_6vKbBsACAdICJDA0MjU3ZGNjLTJlOWEtNDMyYi1hNDQ2LTg0MmIwMjczYjMzYtgCBeACAQ&sid=5727a604ee8453f41f52e9d9b2d7a5c8&aid=304142&ss=Mingora&ssne=Mingora&ssne_untouched=Mingora&efdco=1&lang=en-us&sb=1&src_elem=sb&src=searchresults&dest_id=-2769132&dest_type=city&checkin=2022-11-27&checkout=2022-11-30&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure';
  await page.goto(url);
  await page.screenshot({path: 'example.png'});
  const selector = "#search_results_table div > h3 > a";
  const titles = await page.evaluate(() =>
    Array.from(document.querySelectorAll("h3.a4225678b2 a.e13098a59f"))
      .map((name) => name.textContent)
  );
  console.log(titles.length);
  console.log(titles);
  await browser.close();
})();
Here is the simple code I want to use to scrape the names of hotels in a particular location. The query works fine when I run it in the Chrome DevTools console, but when I run it in Node.js with Puppeteer it returns an empty array. What am I doing wrong?
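A likely cause (an educated guess, not verified against the live page) is that the result cards are rendered by JavaScript after the initial load, so the evaluate call runs before any matching elements exist; the obfuscated class names like a4225678b2 can also change between site builds. A minimal sketch of waiting for the elements first, assuming the same selector is still valid:

// Sketch: wait until the hotel-title links exist before reading them.
// Assumes "h3.a4225678b2 a.e13098a59f" still matches the live page.
await page.goto(url, { waitUntil: 'networkidle2' });
await page.waitForSelector('h3.a4225678b2 a.e13098a59f');
const titles = await page.$$eval('h3.a4225678b2 a.e13098a59f',
  (links) => links.map((a) => a.textContent.trim())
);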
I am trying to get the link of the latest house posting on a real estate website.
This is the code I have written so far:
const puppeteer = require("puppeteer");

const link =
  "https://www.daft.ie/property-for-rent/dublin-4-dublin?radius=5000&numBeds_from=2&numBeds_to=3&sort=publishDateDesc";

(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    defaultViewport: null,
  });
  const page = await browser.newPage();
  await page.goto(link);
  const elements = await page.$x("//button[normalize-space()='Accept All']");
  await elements[0].click();
  // const handle = await page.waitForXPath("//ul[@data-testid='results']");
  // const yourHref = await page.evaluate(
  //   (anchor) => anchor.getAttribute("href"),
  //   handle
  // );
  const hrefs1 = await page.evaluate(() =>
    Array.from(document.querySelectorAll("a[href]"), (a) =>
      a.getAttribute("href")
    )
  );
  console.log(hrefs1);
  await browser.close();
})();
However, this code gets all the href links on the target page.
HTML code of the page:
It is easier to read the code from the picture than if I pasted it here, that's why I attached an image.
As you can see, under the ul tag with data-testid="results" there are many li tags, each containing an a href. I want to extract the link from the topmost li only, as that will be the newest house posting.
How can I do this?
Expected output: I just want the first link under the li tag. In the picture above, the output would be
/for-rent/house-glencloy-road-whitehall-dublin-9/4072150
Following up on the comment chain, the selector '[data-testid="results"] a[href]' should give the first result href.
const puppeteer = require("puppeteer"); // ^16.2.0

let browser;
(async () => {
  browser = await puppeteer.launch({headless: false});
  const [page] = await browser.pages();
  const url =
    "https://www.daft.ie/property-for-rent/dublin-4-dublin?radius=5000&numBeds_from=2&numBeds_to=3&sort=publishDateDesc";
  await page.goto(url, {waitUntil: "domcontentloaded"});

  // Dismiss the cookie banner before touching the results.
  const xp = "//button[normalize-space()='Accept All']";
  const cookiesBtn = await page.waitForXPath(xp);
  await cookiesBtn.click();

  // Wait for the first result link and read its href.
  const el = await page.waitForSelector('[data-testid="results"] a[href]');
  console.log(await el.evaluate(el => el.getAttribute("href")));
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
If you want all of the result hrefs, try:
const allHrefs = await page.$$eval(
  '[data-testid="results"] a[href]',
  els => els.map(e => e.getAttribute("href"))
);
Note that the data is available statically, so you could just use fetch (native in Node 18+) and Cheerio, which is faster and probably more reliable, assuming there are no detection issues (if there are, you could add a user-agent and take other countermeasures):
const cheerio = require("cheerio"); // 1.0.0-rc.12

const url = "https://www.daft.ie/property-for-rent/dublin-4-dublin?radius=5000&numBeds_from=2&numBeds_to=3&sort=publishDateDesc";

fetch(url).then(res => res.text()).then(html => {
  const $ = cheerio.load(html);
  const sel = '[data-testid="results"] a[href]';
  console.log($(sel).attr("href"));

  // or all:
  console.log([...$(sel)].map(e => e.attribs.href));
});
On my slow machine this took 3.5 seconds versus 30 seconds for headful Puppeteer and 15-20 seconds for headless Puppeteer depending on cache warmth.
Or, if you are using Puppeteer for whatever reason, you could block all the unnecessary requests (scripts, images, stylesheets) to speed things up dramatically. Also, the default await page.goto(link); waits for the load event, which includes content you may not need.
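For example, a minimal request-interception sketch; the list of blocked resource types is a reasonable starting point rather than anything from the original post:

// Block heavy resources; since the data is in the static HTML,
// the document itself is all we need here.
await page.setRequestInterception(true);
page.on("request", (req) => {
  const blocked = ["image", "stylesheet", "font", "script", "media"];
  if (blocked.includes(req.resourceType())) {
    req.abort();
  } else {
    req.continue();
  }
});
await page.goto(url, {waitUntil: "domcontentloaded"});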
I'm trying to write a scraper using Puppeteer that logs each superchat (and eventually each super thanks) posted in a YouTube video's live stream.
Here is my current code:
const puppeteer = require('puppeteer')
const fs = require("fs/promises")

async function start() {
  const browser = await puppeteer.launch()
  const page = await browser.newPage()
  await page.goto("https://www.youtube.com/watch?v=1ykh6t3mec0")

  const superchats = await page.evaluate(() => {
    return Array.from(document.querySelectorAll('#purchase-amount-chip')).map(x => x.textContent)
  })

  await fs.writeFile("superchats.txt", superchats.join('\r\n'))
  await browser.close()
}

start()
This seems to work on other websites, but YouTube is very odd. Not sure what the issue is, to be honest.
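One thing worth checking (an educated guess, not verified against that exact video): YouTube renders the live chat inside an iframe, so a selector run against the top-level document won't see it. A sketch that evaluates inside the chat frame, assuming the frame URL contains live_chat and the #purchase-amount-chip selector is otherwise correct:

// Sketch: find the live-chat iframe and query inside it instead of the main page.
await page.goto("https://www.youtube.com/watch?v=1ykh6t3mec0", { waitUntil: "networkidle2" });
const chatFrame = page.frames().find((f) => f.url().includes("live_chat"));
if (chatFrame) {
  const superchats = await chatFrame.evaluate(() =>
    Array.from(document.querySelectorAll("#purchase-amount-chip")).map((x) => x.textContent)
  );
  console.log(superchats);
}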
Whenever I try to run the code I get the error "TypeError: browser.newPage is not a function". Here is my code:
const {chromium} = require('playwright');

(async () => {
  const browser = chromium.launch({
    headless: false
  });
  const page = await browser.newPage();
  await page.goto('https://www.google.com');
  await browser.close();
})();
If I change the code to the following, it runs, but the URL never opens; the browser just opens and closes.
const playwright = require('playwright');

(async () => {
  for (const browserType of ['webkit']) {
    const browser = await playwright[browserType].launch({
      headless: false
    });
    const context = await browser.newContext();
    const page = await context.newPage('https://www.google.com');
    await browser.close();
  }
})();
I am using Playwright version 1.20.0 and Node.js version v16.14.0.
There are two separate problems here. In your first snippet, chromium.launch() is not awaited, so browser is a Promise and browser.newPage is not a function. In your second snippet, newPage() does not take a URL; you have to navigate with page.goto. Try this:
const context = await browser.newContext();
const page = await context.newPage();
await page.goto('https://www.google.com');
You might want to check out the documentation: https://playwright.dev/docs/1.19/intro#first-test
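Putting both fixes together, a complete version of your first snippet might look like this (note the await on launch):

const {chromium} = require('playwright');

(async () => {
  const browser = await chromium.launch({ headless: false }); // await the launch
  const context = await browser.newContext();
  const page = await context.newPage();
  await page.goto('https://www.google.com');
  await browser.close();
})();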
I would like to know how to reload a page continually until the content changes, using Puppeteer in headed mode.
const puppeteer = require('puppeteer')

// Placeholder: set this to the page you want to watch.
const url = 'https://example.com'

let contentHTML = ''
let reloadedHTML = ''

;(async () => {
  const browser = await puppeteer.launch({ headless: false })
  const page = (await browser.pages())[0]

  // Capture the HTML of the first load as the baseline.
  const firstLoad = await page.goto(url)
  contentHTML = await firstLoad.text()

  // Keep reloading until the response body differs from the baseline.
  do {
    let secondLoad = await page.reload()
    reloadedHTML = await secondLoad.text()
  } while (reloadedHTML === contentHTML)

  await browser.close()
})()
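Note that this loop reloads as fast as the server responds, which can hammer the site. A polling delay between reloads is usually kinder; a minimal sketch (the 5-second interval is arbitrary):

// Inside the do...while loop, pause before the next reload.
await new Promise((resolve) => setTimeout(resolve, 5000))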