const puppeteer = require('puppeteer');
// Launches a browser with Puppeteer, opens a Booking.com search-results URL,
// saves a screenshot, and logs the count and text of the anchors matched by the
// hotel-title selector.
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
const url= 'https://www.booking.com/searchresults.en-us.html?label=gen173nr-1FCAEoggI46AdIM1gEaLUBiAEBmAExuAEXyAEP2AEB6AEB-AECiAIBqAIDuAL_6vKbBsACAdICJDA0MjU3ZGNjLTJlOWEtNDMyYi1hNDQ2LTg0MmIwMjczYjMzYtgCBeACAQ&sid=5727a604ee8453f41f52e9d9b2d7a5c8&aid=304142&ss=Mingora&ssne=Mingora&ssne_untouched=Mingora&efdco=1&lang=en-us&sb=1&src_elem=sb&src=searchresults&dest_id=-2769132&dest_type=city&checkin=2022-11-27&checkout=2022-11-30&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure'
await page.goto(url);
// Writes an image of the page as loaded so far to the path 'example.png'.
await page.screenshot({path: 'example.png'});
// NOTE(review): `selector` is declared but never used in this snippet; the
// query below uses a different, class-based selector.
const selector = "#search_results_table div > h3 > a";
// The callback runs inside the page: it collects the text content of every
// anchor matching the selector and returns the strings to Node.
// NOTE(review): nothing waits for these elements before querying them —
// presumably they are not in the DOM yet when this runs, which would explain
// the empty array; confirm by waiting for the selector first.
const titles = await page.evaluate(() =>
Array.from(document.querySelectorAll("h3.a4225678b2 a.e13098a59f"))
.map((name) => name.textContent)
);
console.log(titles.length);
console.log(titles);
await browser.close();
})();
Here is my simple code that I want to use to scrape names of the hotels in a particular location. The query works fine when I run it in the console in the Google Chrome browser but when I run it in Node.js with Puppeteer, it returns an empty array. What am I doing wrong?
Related
const puppeteer = require("puppeteer");
const cheerio = require("cheerio");
// Airbnb search-results URL.
// NOTE(review): this constant is not referenced below — the call at the bottom
// passes the same URL again as a string literal.
const url = "https://www.airbnb.co.in/s/Haridwar--Uttarakhand/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&price_filter_input_type=0&price_filter_num_nights=5&l2_property_type_ids%5B%5D=1&search_type=autocomplete_click&query=Haridwar%2C%20Uttarakhand&place_id=ChIJyVfuuA5HCTkR8_VApnaRRE4&date_picker_type=calendar&source=structured_search_input_header";
/**
 * Opens `url` in a visible (non-headless) browser, snapshots the body HTML,
 * parses it with Cheerio, and logs the `content` attribute of every element
 * that has itemprop="url".
 *
 * Errors are caught and logged with console.error rather than rethrown.
 * The browser is never closed inside this function.
 *
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<void>}
 */
async function scrapHomesPage(url)
{
try
{
const browser = await puppeteer.launch({headless:false});
const page = await browser.newPage();
await page.goto(url);
// Takes a one-off HTML snapshot of the body as it is right after navigation.
// NOTE(review): nothing waits for the listing elements first — presumably
// they are rendered later by client-side script; confirm.
const html = await page.evaluate(()=> document.body.innerHTML);
const $ = cheerio.load(html);
// .get() converts the Cheerio collection returned by .map() into a plain array.
const homes = $('[itemprop="url"]').map((i, element) => $(element).attr("content")).get();
console.log(homes);
}
catch(err)
{
console.error(err);
}
}
// Starts the scrape; the returned promise is not awaited here.
scrapHomesPage("https://www.airbnb.co.in/s/Haridwar--Uttarakhand/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&price_filter_input_type=0&price_filter_num_nights=5&l2_property_type_ids%5B%5D=1&search_type=autocomplete_click&query=Haridwar%2C%20Uttarakhand&place_id=ChIJyVfuuA5HCTkR8_VApnaRRE4&date_picker_type=calendar&source=structured_search_input_header");
I tried everything I could to wait for the page to load all of its contents, including waiting for selectors. I always get an empty array, whereas I should get an array with the links of each home listed on the Airbnb site for that particular location.
I don't see any reason to use Cheerio here. It's just another layer of indirection to get the data you want, involving an extra dependency, a whole second parse of the page and the potential for bugs when the page goes out of sync with the HTML snapshot you've created. If you do need to use it, you can use page.content() instead of page.evaluate(() => document.body.innerHTML).
As for the main problem, you appear to be missing a call to page.waitForSelector:
const puppeteer = require("puppeteer"); // ^19.0.0
const url = "your url";
let browser;

// Opens the page, waits until at least one [itemprop="url"] element exists,
// then logs the `content` attribute of every such element. Any failure is
// logged, and the browser (if it was launched) is always closed afterwards.
(async () => {
  const itemSelector = '[itemprop="url"]';
  try {
    browser = await puppeteer.launch();
    // Reuse the tab the browser starts with instead of opening a new one.
    const openPages = await browser.pages();
    const page = openPages[0];
    await page.goto(url, {waitUntil: "domcontentloaded"});
    await page.waitForSelector(itemSelector);
    const content = await page.$$eval(itemSelector, (nodes) =>
      nodes.map((node) => node.getAttribute("content"))
    );
    console.log(content);
  } catch (err) {
    console.error(err);
  } finally {
    // `browser` is still undefined if launch() itself failed.
    await browser?.close();
  }
})();
I am trying to get the link of the latest house posting in a real estate website.
This is the code I have written so far:
const puppeteer = require("puppeteer");
// daft.ie rental search URL; the query string asks for sort=publishDateDesc.
const link =
"https://www.daft.ie/property-for-rent/dublin-4-dublin?radius=5000&numBeds_from=2&numBeds_to=3&sort=publishDateDesc";
// Opens the search page in a visible browser, clicks the "Accept All" button,
// then logs the href attribute of every anchor on the page.
(async () => {
const browser = await puppeteer.launch({
headless: false,
defaultViewport: null,
});
const page = await browser.newPage();
await page.goto(link);
// Looks up the button by its trimmed text using an XPath expression.
// NOTE(review): if the lookup matches nothing, elements[0] is undefined and
// the click below throws a TypeError.
const elements = await page.$x("//button[normalize-space()='Accept All']");
await elements[0].click();
// Disabled attempt at reading a single href from the results list.
// NOTE(review): `[#data-testid=...]` does not look like a valid XPath
// attribute test (XPath uses `@`) — confirm before re-enabling.
// const handle = await page.waitForXPath("//ul[#data-testid='results']");
// const yourHref = await page.evaluate(
// (anchor) => anchor.getAttribute("href"),
// handle
// );
// Runs in the page: maps every <a> that has an href attribute to that
// attribute's value, so this collects all links rather than only the first
// search result.
const hrefs1 = await page.evaluate(() =>
Array.from(document.querySelectorAll("a[href]"), (a) =>
a.getAttribute("href")
)
);
console.log(hrefs1);
await browser.close();
})();
However, this code is to get all the href links on the target page.
HTML code of the page:
It is easier to read the code from the picture than if I paste it, which is why I attached an image.
As you can see, under the ul tag with data-testid=results there are many li tags, each containing an a tag with an href. I want to extract only the link from the topmost li, as it will be the newest house posting.
How can I do this?
Expected output - I just want the first link under li tag. In the picture above, the output would be
/for-rent/house-glencloy-road-whitehall-dublin-9/4072150
Following up on the comment chain, the selector '[data-testid="results"] a[href]' should give the first result href.
const puppeteer = require("puppeteer"); // ^16.2.0
let browser;

/**
 * Opens the daft.ie search page in a visible browser, accepts the cookie
 * prompt, waits for the first link inside the results list, and logs its
 * href attribute.
 *
 * @returns {Promise<void>}
 */
async function logFirstResultHref() {
  const listingsUrl =
    "https://www.daft.ie/property-for-rent/dublin-4-dublin?radius=5000&numBeds_from=2&numBeds_to=3&sort=publishDateDesc";
  const acceptAllXPath = "//button[normalize-space()='Accept All']";
  const firstResultSelector = '[data-testid="results"] a[href]';

  browser = await puppeteer.launch({headless: false});
  // Reuse the tab the browser starts with instead of opening a new one.
  const openPages = await browser.pages();
  const page = openPages[0];
  await page.goto(listingsUrl, {waitUntil: "domcontentloaded"});

  // Wait for the cookie button to appear before clicking it.
  const acceptButton = await page.waitForXPath(acceptAllXPath);
  await acceptButton.click();

  // waitForSelector resolves with the first matching element.
  const firstLink = await page.waitForSelector(firstResultSelector);
  const href = await firstLink.evaluate((anchor) => anchor.getAttribute("href"));
  console.log(href);
}

logFirstResultHref()
  .catch((err) => console.error(err))
  // `browser` is still undefined if launch() itself failed.
  .finally(() => browser?.close());
If you want all of the result hrefs, try:
// Collects the href attribute of every link inside the results list.
const allHrefs = await page.$$eval('[data-testid="results"] a[href]', (anchors) => {
  const hrefs = [];
  for (const anchor of anchors) {
    hrefs.push(anchor.getAttribute("href"));
  }
  return hrefs;
});
Note that the data is available statically, so you could just use fetch (native on Node 18+) and Cheerio, which is faster and probably more reliable, assuming there are no detection issues (and you could add a user-agent and take other countermeasures if there are):
const cheerio = require("cheerio"); // 1.0.0-rc.12
const url = "https://www.daft.ie/property-for-rent/dublin-4-dublin?radius=5000&numBeds_from=2&numBeds_to=3&sort=publishDateDesc";

// Downloads the search page as static HTML, parses it with Cheerio, and logs
// the href of the first result link followed by the hrefs of all result links.
fetch(url)
  .then(res => {
    // fetch only rejects on network failure; without this check an HTTP error
    // page (e.g. 403/500) would be parsed as if it were the results page.
    if (!res.ok) {
      throw new Error(`Request failed: ${res.status} ${res.statusText}`);
    }
    return res.text();
  })
  .then(html => {
    const $ = cheerio.load(html);
    const sel = '[data-testid="results"] a[href]';
    // .attr() reads the attribute from the first matched element only.
    console.log($(sel).attr("href"));
    // or all:
    console.log([...$(sel)].map(e => e.attribs.href));
  })
  // Report network, HTTP and parsing failures instead of leaving the
  // rejection unhandled.
  .catch(err => console.error(err));
On my slow machine this took 3.5 seconds versus 30 seconds for headful Puppeteer and 15-20 seconds for headless Puppeteer depending on cache warmth.
Or, if you are using Puppeteer for whatever reason, you could block all the requests, JS and images to speed things up dramatically. Your default await page.goto(link); waits for the load event, which is content you may not need.
I'm trying to write a scraper using Puppeteer that logs each Super Chat (and, eventually, Super Thanks) posted in a YouTube video's live stream.
Here is my current code:
const puppeteer = require('puppeteer')
const fs = require("fs/promises")
/**
 * Opens the YouTube watch page, reads the text of every element matching
 * '#purchase-amount-chip', and writes those strings to superchats.txt, one
 * per line (CRLF-separated).
 *
 * @returns {Promise<void>}
 */
async function start() {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto("https://www.youtube.com/watch?v=1ykh6t3mec0");

  // Runs inside the page and returns plain strings to Node.
  const superchats = await page.evaluate(() => {
    const chips = document.querySelectorAll('#purchase-amount-chip');
    return Array.from(chips, (chip) => chip.textContent);
  });

  const fileBody = superchats.join('\r\n');
  await fs.writeFile("superchats.txt", fileBody);
  await browser.close();
}

start()
This seems to work on other websites but youtube is very odd. Not sure what the issue is to be honest.
Whenever I try to run the code, I get the error "TypeError: browser.newPage is not a function". Here is my code:
const {chromium} = require('playwright');
// Launches Chromium with a visible window through Playwright, opens a page,
// navigates to Google, and closes the browser.
(async() => {
// NOTE(review): the result of launch() is not awaited here, unlike every
// other call in this function — presumably launch() returns a Promise, so
// `browser` would be that Promise rather than a browser object, which would
// explain the reported "browser.newPage is not a function"; confirm against
// the Playwright docs.
const browser = chromium.launch({
headless:false
});
const page = await browser.newPage();
await page.goto('https://www.google.com');
await browser.close();
})();
If I change the code to the following, it runs, but the URL I provide is never opened; the browser just opens and closes.
const playwright = require('playwright');
// For each listed browser type (only 'webkit' here): launches it with a
// visible window, creates a context and a page, then closes the browser.
(async () => {
for (const browserType of ['webkit']) {
const browser = await playwright[browserType].launch({
headless:false
});
const context = await browser.newContext();
// NOTE(review): the URL is passed to newPage() and no page.goto() call
// follows — presumably newPage() does not navigate, so nothing is loaded
// before the browser is closed on the next line; confirm against the
// Playwright docs.
const page = await context.newPage('https://www.google.com');
await browser.close();
}
})();
I am using Playwright version 1.20.0
node.js version is v16.14.0
Try this
// Create the page first, then navigate it explicitly with goto().
const targetUrl = 'https://www.google.com';
const context = await browser.newContext();
const page = await context.newPage();
await page.goto(targetUrl);
You might want to check out the documentation: https://playwright.dev/docs/1.19/intro#first-test
I'm launching a Puppeteer instance and would like to find out which flags it was launched with — for example, the --user-data-dir flag, since I sometimes want to reuse the same Puppeteer profile that stores cookies and login info.
Is there a way to fetch the values visible at chrome://version programmatically?
const puppeteer = require('puppeteer');
// Connects Puppeteer to an already-running browser reachable at `browserURL`
// and opens a new page in it.
(async () => {
  const browserURL = 'http://127.0.0.1:9222';
  // Declared with const so no implicit globals are created; assigning to an
  // undeclared name throws a ReferenceError in strict mode and ES modules.
  const browser = await puppeteer.connect({browserURL, defaultViewport: null});
  const page = await browser.newPage();
})()
  // Report connection failures instead of leaving the rejection unhandled.
  .catch(err => console.error(err));
Try this:
const puppeteer = require('puppeteer');
// Launches a browser, logs the argument list of the child process Puppeteer
// spawned for it, then closes the browser.
(async function printSpawnArgs() {
  const browser = await puppeteer.launch();
  const childProcess = browser.process();
  console.log(childProcess.spawnargs);
  await browser.close();
})();
UPD. For connected browser:
// Open the browser's internal version page and log the text of the element
// with id "command_line".
const versionPageUrl = 'chrome://version';
await page.goto(versionPageUrl);
const tableCell = await page.waitForSelector('#command_line');
const readInnerText = (element) => element.innerText;
const commandLine = await page.evaluate(readInnerText, tableCell);
console.log(commandLine);
Puppeteer also has a browser.version() function, which returns the browser name and version (one of the values shown at chrome://version).
// browser.version() returns a Promise, so await it to get the version string.
let details = await browser.version();
or
// page.browser is a method: call it to get the Browser, then await version().
let details = await page.browser().version();
You can check more information here : https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#browserversion