How to call Page.navigate and trigger associated Page.loadEventFired? - javascript

I am using Chrome stable 60 (https://chromedevtools.github.io/devtools-protocol/1-2/Page/) in headless mode. I need to be able to do this:
Navigate to page 1
Take screenshot 1
Navigate to page 2 (after page 1 is done)
Take screenshot 2
However, I can't seem to call Page.navigate twice, because Page.loadEventFired will only pick up on the latest navigation.
I don't want to use Canary because it's so unstable (screenshot doesn't even work right), so I think Target isn't an option (even if it could be).
What is the best way to do URL navigation in a serial fashion like that?
I looked at https://github.com/LucianoGanga/simple-headless-chrome to see how they do it (await mainTab.goTo) but I can't figure it out yet.

The link here https://github.com/cyrus-and/chrome-remote-interface/issues/92 gave me some ideas: give each URL its own tab and client, so each Page.loadEventFired is scoped to a single tab:
const fs = require('fs');
const CDP = require('chrome-remote-interface');

function loadForScrot(url) {
  return new Promise(async (fulfill, reject) => {
    const tab = await CDP.New();
    const client = await CDP({tab});
    const {Page} = client;
    Page.loadEventFired(() => {
      fulfill({client, tab});
    });
    await Page.enable();
    await Page.navigate({url});
  });
}

async function process(urls) {
  try {
    const handlers = await Promise.all(urls.map(loadForScrot));
    for (const {client, tab} of handlers) {
      const {Page} = client;
      await CDP.Activate({id: tab.id});
      const filename = `/tmp/scrot_${tab.id}.png`;
      const result = await Page.captureScreenshot();
      const image = Buffer.from(result.data, 'base64');
      fs.writeFileSync(filename, image);
      console.log(filename);
      await client.close();
    }
  } catch (err) {
    console.error(err);
  }
}

process(['http://example.com',
         'http://example.com',
         'http://example.com',
         'http://example.com',
         'http://example.com',
         'http://example.com',
         'http://example.com',
         'http://example.com']);
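For strictly serial navigation you don't need multiple tabs at all: chrome-remote-interface lets you await an event by calling the event method without a callback, which returns a promise for its next occurrence. A minimal sketch (the URLs and file names are placeholders):

const fs = require('fs');
const CDP = require('chrome-remote-interface');

(async () => {
  const client = await CDP();
  const {Page} = client;
  await Page.enable();
  let count = 0;
  for (const url of ['http://example.com', 'http://example.org']) {
    const loaded = Page.loadEventFired(); // promise for the *next* load event
    await Page.navigate({url});
    await loaded;                         // navigation is done; safe to screenshot
    const {data} = await Page.captureScreenshot();
    fs.writeFileSync(`/tmp/serial_${count++}.png`, Buffer.from(data, 'base64'));
  }
  await client.close();
})().catch(console.error);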

Check out the new library from the Google team: Puppeteer.
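With Puppeteer, the serial flow is only a few lines, since page.goto resolves after the page's load event by default. A minimal sketch:

const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('http://example.com');        // resolves once the page has loaded
  await page.screenshot({path: 'screenshot1.png'});
  await page.goto('http://example.org');        // second navigation starts only after the first is done
  await page.screenshot({path: 'screenshot2.png'});
  await browser.close();
})();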

How to select specific button in puppeteer

So I'm building a program that scrapes Poshmark webpages and extracts the username of each seller on the page!
I want it to go through every page using the 'next' button, but there are 6 buttons, all with the same class name...
Here's the link: https://poshmark.com/category/Men-Jackets_&_Coats?sort_by=like_count&all_size=true&my_size=false
(In my Google Chrome this page has an infinite scroll (hence the scrollToBottom async function I started writing), but I realized that inside Puppeteer's Chrome it has 'next page' buttons.)
The window displays pages 1-5 and then the 'next page' button.
The problem is that all of the buttons share the same HTML class name, so I'm confused about how to differentiate between them.
const e = require('express');
const puppeteer = require('puppeteer');

const url = "https://poshmark.com/category/Men-Jackets_&_Coats?sort_by=like_count&all_size=true&my_size=false";
let usernames = [];

const initItemArea = async (page) => {
  const itemArea = await page.evaluate(() => {
    return Array.from(document.querySelectorAll('.tc--g.m--l--1.ellipses')).map(x => x.textContent);
  });
}

const pushToArray = async (itemArea, page) => {
  itemArea.forEach(function (element) {
    //console.log('username: ', $(element).text());
    usernames.push(element);
  });
};

const scrollToBottom = async (itemArea, page) => {
  while (true) {
    previousHeight = await page.evaluate('document.body.scrollHeight');
    await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
    await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`);
    await new Promise((resolve) => setTimeout(resolve, 1000));
    await page.screenshot({path: "ss.png"})
  }
};

const gotoNextPage = async (page) => {
  await page.waitForSelector(".button.btn.btn--pagination");
  const nextButton = await page.evaluate((page) => {
    document.querySelector(".button.btn.btn--pagination")
  });
  await page.click(nextButton);
  console.log('Next Page Loading')
};

async function main() {
  const client = await puppeteer.launch({
    headless: false,
    executablePath: "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
  });
  const page = await client.newPage();
  await page.goto(url);
  await page.waitForSelector(".tc--g.m--l--1.ellipses");
  const itemArea = await page.evaluate(() => {
    return Array.from(document.querySelectorAll('.tc--g.m--l--1.ellipses')).map(x => x.textContent);
  });
  gotoNextPage(page)
};

main();
Currently, my gotoNextPage function doesn't even find the button, so I thought I'd entered the selector wrong...
Then when I went to find the selector, I realized all the buttons have the same one anyway...
My HTML knowledge is basically nonexistent, but I want to finish this project out. All help is very much appreciated.
Bonus: my initItemArea function doesn't work when I call it as a function like that, so I hardcoded it into main()...
I'll be diving deep into this problem later on, as I've seen it before, but any quick answers / directions would be awesome.
Thanks a lot.
You can try selecting the buttons using their position on the page.
For example, you can select the first button using the following CSS selector:
.button.btn.btn--pagination:nth-child(1)
And the second button:
.button.btn.btn--pagination:nth-child(2)
Got the idea? :)
You can refactor your gotoNextPage function to use this approach; consider this example:
const gotoNextPage = async (page, buttonIndex) => {
  // Select the button using its position on the page
  const selector = `.button.btn.btn--pagination:nth-child(${buttonIndex})`;
  await page.waitForSelector(selector);
  // page.click expects a selector string; DOM nodes returned from
  // page.evaluate don't survive serialization, so click by selector instead
  await page.click(selector);
  console.log("Next Page Loading");
};
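One caveat (not from the original answer): :nth-child counts every sibling element, not just the pagination buttons, so the index can shift if the markup changes. If that bites, here is a sketch that uses Puppeteer's XPath helper to pick the "Next" button by its visible text; whether the control is actually a button element is an assumption:

// Hedged alternative: locate the "Next" button by its text
// (page.$x is available in Puppeteer 19, which this thread uses)
const gotoNextPage = async (page) => {
  const [nextButton] = await page.$x('//button[contains(., "Next")]');
  if (nextButton) {
    await nextButton.click();
    console.log("Next Page Loading");
  }
};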
Whenever you're messing with buttons and scroll, it's a good idea to think about where the data is coming from. It's usually being delivered to the front-end via a JSON API, so you might as well try to hit that API directly rather than mess with the DOM.
const url = maxId => `https://poshmark.com/vm-rest/channel_groups/category/channels/category/collections/post?request={%22filters%22:{%22department%22:%22Men%22,%22category_v2%22:%22Jackets_%26_Coats%22,%22inventory_status%22:[%22available%22]},%22sort_by%22:%22like_count%22,%22facets%22:[%22color%22,%22brand%22,%22size%22],%22experience%22:%22all%22,%22sizeSystem%22:%22us%22,%22max_id%22:%22${maxId}%22,%22count%22:%2248%22}&summarize=true&pm_version=226.1.0`;

(async () => {
  const usernames = [];
  for (let maxId = 1; maxId < 5 /* for testing */; maxId++) {
    const response = await fetch(url(maxId)); // Node 18+, or install node-fetch
    if (!response.ok) {
      throw Error(response.statusText);
    }
    const payload = await response.json();
    if (payload.error) {
      break;
    }
    usernames.push(...payload.data.map(e => e.creator_username));
  }
  console.log(usernames.slice(0, 10));
  console.log("usernames.length", usernames.length);
})()
  .catch(err => console.error(err));
The response blob has a ton of additional data.
I would add a significant delay between requests if I were to use code like this to avoid rate limiting/blocking.
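For example, a tiny delay helper (a sketch; the 2-second figure is arbitrary):

// Hypothetical helper: pause between requests to reduce the chance of rate limiting
const sleep = ms => new Promise(resolve => setTimeout(resolve, ms));

// ...then, inside the loop after each request:
await sleep(2000);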
If you're set on Puppeteer, something like this should work as well, although it's slower and I didn't have time to run it to the end of the 5k (or more?) users:
const puppeteer = require("puppeteer"); // ^19.1.0
const url = "Your URL";
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.goto(url, {waitUntil: "domcontentloaded"});
const usernames = [];
const sel = ".tc--g.m--l--1.ellipses";
for (;;) {
try {
await page.waitForSelector(sel);
const users = await page.$$eval(sel, els => {
const text = els.map(e => e.textContent);
els.forEach(el => el.remove());
return text;
});
console.log(users); // optional for debugging
usernames.push(...users);
await page.$$eval(
".btn--pagination",
els => els.find(el => el.textContent.includes("Next")).click()
);
}
catch (err) {
break;
}
}
console.log(usernames);
console.log(usernames.length);
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
I don't think navigations are triggered by the "Next" button, so my strategy for detecting when a page transition has occurred involves destroying the current set of elements after scraping the usernames, then waiting until the next batch shows up. This may seem inelegant, but it's easy to implement and seems reliable, not making assumptions about the usernames themselves.
It's also possible to use Puppeteer and make or intercept API requests, armed with a fresh cookie. This is sort of halfway between the two extremes shown above. For example:
const puppeteer = require("puppeteer");
const url = "Your URL";
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.goto(url, {waitUntil: "domcontentloaded"});
const usernames = await page.evaluate(async () => {
const url = maxId => `https://poshmark.com/vm-rest/channel_groups/category/channels/category/collections/post?request={%22filters%22:{%22department%22:%22Men%22,%22category_v2%22:%22Jackets_%26_Coats%22,%22inventory_status%22:[%22available%22]},%22sort_by%22:%22like_count%22,%22facets%22:[%22color%22,%22brand%22,%22size%22],%22experience%22:%22all%22,%22sizeSystem%22:%22us%22,%22max_id%22:%22${maxId}%22,%22count%22:%2248%22}&summarize=true&pm_version=226.1.0`;
const usernames = [];
try {
for (let maxId = 1; maxId < 5 /* for testing */; maxId++) {
const response = await fetch(url(maxId)); // node 18 or install node-fetch
if (!response.ok) {
throw Error(response.statusText);
break;
}
const json = await response.json();
if (json.error) {
break;
}
usernames.push(...json.data.map(e => e.creator_username));
}
}
catch (err) {
console.error(err);
}
return usernames;
});
console.log(usernames);
console.log("usernames.length", usernames.length);
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
The above code limits itself to 4 requests to keep it simple and easy to validate.
Blocking images and other unnecessary resources can help speed up the Puppeteer versions; that's left as an exercise (or just use the direct fetch version shown at the top).
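For reference, a minimal sketch of resource blocking via Puppeteer's request interception (which resource types this page can spare is an assumption; adjust the list as needed):

await page.setRequestInterception(true);
page.on("request", req => {
  // Abort requests for resources the scraper doesn't need
  if (["image", "stylesheet", "font", "media"].includes(req.resourceType())) {
    req.abort();
  } else {
    req.continue();
  }
});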

Download website locally without Javascript using puppeteer

I am trying to download a website as a static site, I mean without JS, only HTML & CSS.
I've tried many approaches, yet some issues are still present regarding CSS and images.
A snippet:
const puppeteer = require('puppeteer');
const {URL} = require('url');
const fse = require('fs-extra');
const path = require('path');

(async (urlToFetch) => {
  const browser = await puppeteer.launch({
    headless: true,
    slowMo: 100
  });
  const page = await browser.newPage();

  await page.setRequestInterception(true);
  page.on("request", request => {
    if (request.resourceType() === "script") {
      request.abort()
    } else {
      request.continue()
    }
  })

  page.on('response', async (response) => {
    const url = new URL(response.url());
    let filePath = path.resolve(`./output${url.pathname}`);
    if (path.extname(url.pathname).trim() === '') {
      filePath = `${filePath}/index.html`;
    }
    await fse.outputFile(filePath, await response.buffer());
    console.log(`File ${filePath} is written successfully`);
  });

  await page.goto(urlToFetch, {
    waitUntil: 'networkidle2'
  })

  setTimeout(async () => {
    await browser.close();
  }, 60000 * 4)
})('https://stackoverflow.com/');
I've tried using:
content = await page.content();
fs.writeFileSync('index.html', content, { encoding: 'utf-8' });
I've also downloaded it using a CDPSession, and I've tried website-scraper.
So what is the best approach to arrive at a solution where I provide a website link and it gets downloaded as a static website?
Try using this: https://www.npmjs.com/package/website-scraper
It will save the website into a local directory.
Have you tried something like wget or curl?
wget -p https://stackoverflow.com/questions/67559777/download-website-locally-without-javascript-using-puppeteer
That should do the trick.
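If plain -p leaves the local copy with broken asset links, a commonly used flag combination (a suggestion beyond the original answer) also rewrites links for offline viewing:

# -p  download page requisites (CSS, images, etc.)
# -k  convert links so the local copy works offline
# -E  save HTML files with an .html extension
wget -p -k -E https://stackoverflow.com/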

Exporting result from async function in separate js file, importing result in another javascript

Trying to build a small scraper. To reuse functionality I thought 'Page Object Models' would come in handy.
In main.js I require multiple small scripts; in the example below there is only one model (GooglePage).
The scripts work, but I would like to know how to pass a value from the google.js script back to the main script.
I want to use the value of the 'pageCountClean' variable in the main.js script in the rest of the application.
I have been searching for information about passing values and functions between scripts: exporting values from page constructors, exporting functions with promise/await.
But I am lost. Do I have to use Promises? Is the current way of requiring/importing and exporting enough to create the relationship between the scripts?
Any pointers are welcome.
//////////// main.js
const { chromium } = require('playwright');
const { GooglePage } = require('./models/Google');

(async () => {
  const browser = await chromium.launch({ headless: true, slowMo: 250 });
  const context = await browser.newContext();
  const GoogleUrl80 = 'https://www.google.nl/search?q=site%3Anu.nl'; // quotes added; they were missing
  // Cookie consent:
  console.log('Cookie consent - start');
  const page80 = await browser.newPage();
  await page80.goto('https://google.nl');
  await page80.waitForTimeout(1000);
  await page80.keyboard.press('Tab');
  await page80.keyboard.press('Tab');
  await page80.keyboard.press('Enter');
  console.log('Cookie Consent - done');
  // Number of urls in google.nl (using google.js)
  await page80.goto(GoogleUrl80, {waitUntil: 'networkidle'});
  const googlePage80 = new GooglePage(page80);
  await googlePage80.scrapeGoogle();
  // Want to console.log 'pageCountClean' here.
  await browser.close()
})()
//////////// Google.js
class GooglePage {
  constructor(page) {
    this.page = page;
  }

  async scrapeGoogle() {
    const GoogleXpath = '//div[@id="result-stats"]'; // note: @id, not #id
    const pageCount = await this.page.$eval(GoogleXpath, (el) => el.innerText);
    const pageCountClean = pageCount.split(" ")[1];
    console.log(pageCountClean);
  }
}

module.exports = { GooglePage };
You can just return pageCountClean from your async function and await it in your main.js file:
In Google.js:
async scrapeGoogle() {
  const GoogleXpath = '//div[@id="result-stats"]';
  const pageCount = await this.page.$eval(GoogleXpath, (el) => el.innerText);
  const pageCountClean = pageCount.split(" ")[1];
  console.log(pageCountClean);
  return pageCountClean;
}
In main.js:
const googlePage80 = new GooglePage(page80);
const result = await googlePage80.scrapeGoogle();
console.log(result);

Node.js: share connection object throughout the application

I am having issues with implementing generic-pool using Puppeteer. Below is the relevant part of my code.
UPDATE
Thanks @Jacob for the help; I am clearer now about the concept and how it works, and the code is also more readable. I am still having an issue where a generic pool is created on every request. How do I ensure that the same generic pool is used every time, instead of creating a new one?
browser-pool.js
const genericPool = require('generic-pool');
const puppeteer = require('puppeteer');

class BrowserPool {
  static async getPool() {
    const browserParams = process.env.NODE_ENV == 'development' ? {
      headless: false,
      devtools: false,
      executablePath: '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'
    } : {
      headless: true,
      devtools: false,
      executablePath: 'google-chrome-unstable',
      args: ['--no-sandbox', '--disable-dev-shm-usage']
    };
    const factory = {
      create: function() {
        return puppeteer.launch(browserParams);
      },
      destroy: function(instance) {
        console.log('closing browser in here.....');
        instance.close();
      }
    };
    const opts = {
      max: 5
    };
    this.myBrowserPool = genericPool.createPool(factory, opts);
  }

  static async returnPool() {
    if (this.myBrowserPool == "") {
      getPool();
    }
    return this.myBrowserPool.acquire();
  }
}

BrowserPool.myBrowserPool = null;
module.exports = BrowserPool;
process-export.js
const BrowserPool = require('./browser-pool');

async function performExport(params){
  const myPool = BrowserPool.getPool();
  const resp = BrowserPool.myBrowserPool.acquire().then(async function(client){
    try {
      const url = config.get('url');
      const page = await client.newPage();
      await page.goto(url, {waitUntil: ['networkidle2', 'domcontentloaded']});
      let gotoUrl = `${url}/dashboards/${exportParams.dashboardId}?csv_export_id=${exportParams.csvExportId}`;
      //more processing
      await page.goto(gotoUrl, {waitUntil: 'networkidle2' })
      await myPool().myBrowserPool.release(client);
      return Data;
    } catch(err) {
      try {
        const l = await BrowserPool.myBrowserPool.destroy(client);
      } catch(e) {
      }
      return err;
    }
  }).catch(function(err) {
    return err;
  });
  return resp;
}

module.exports.performExport = performExport;
My understanding is that:
1) When the application starts I can spin up, for example, 2 Chromium instances, and then whenever I want to visit a page I can use either of the two connections; the browsers are essentially kept open, and we improve performance since browser startup can take time. Is this correct?
2) Where do I place the acquire() code? I understand this should be in app.js, so we acquire the instances right when the app boots, but my Puppeteer code is in a different file. How do I pass the browser reference into the file which has my Puppeteer code?
When I use the above code, a new browser instance spins up every time; the max property is not respected, and it opens as many instances as are requested.
My apologies if it's something very trivial and I might not have understood the concept fully. Any help in clarifying this would be really helpful.
When using a pool, you'll need to use .acquire() to obtain an object, and then .release() when you're done, so the object is returned to the pool and made available to something else. Without using .release(), you might as well have no pool at all. I like to use this helper pattern with pools:
class BrowserPool {
  // ...

  static async withBrowser(fn) {
    const pool = BrowserPool.myBrowserPool;
    const browser = await pool.acquire();
    try {
      await fn(browser);
    } finally {
      pool.release(browser);
    }
  }
}
This can be used like this anywhere in your code:
await BrowserPool.withBrowser(async browser => {
  await browser.doSomeThing();
  await browser.doSomeThingElse();
});
The key is that the finally clause ensures that, whether your tasks complete or throw an error, you'll cleanly release the browser back to the pool every time.
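As for the pool being created on every request: create it once at module load and export that single instance; Node's module cache then hands every require() the same pool. A minimal sketch (not from the original answer; the factory details are assumed):

// browser-pool.js (sketch): the pool is built exactly once, when this module
// is first required; later require() calls get the cached module, and
// therefore the same pool instance.
const genericPool = require('generic-pool');
const puppeteer = require('puppeteer');

const factory = {
  create: () => puppeteer.launch(),
  destroy: browser => browser.close(),
};

module.exports = genericPool.createPool(factory, { max: 5 });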
It sounds like you might have the concept of the max option backwards as well and are expecting the browser instances to be spawned up to max. Rather, max means "only create up to max number of resources." If you try to acquire a sixth resource without anything having been released, for example, the acquire(...) call will block until one item is returned to the pool.
The min option, on the other hand, means "keep at least this many items on hand at all times", which you can use to pre-allocate resources. If you want 5 items to be created in advance, set min to 5. If you want 5 items and only five items to be created, set both min and max to 5.
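In generic-pool's options, pre-allocating looks like this (a small sketch):

const opts = {
  min: 5, // keep at least five browsers on hand, created in advance
  max: 5, // never create more than five
};
const pool = genericPool.createPool(factory, opts);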
Update:
I notice in your original code that you destroy in case of error and release when there isn't an error. I'd still prefer a wrapper function like mine to centralize all the resource acquiring/releasing logic (the single-responsibility approach). Here's how it could be updated to automatically destroy on errors instead:
class BrowserPool {
  // ...

  static async withBrowser(fn) {
    const pool = BrowserPool.myBrowserPool;
    const browser = await pool.acquire();
    try {
      await fn(browser);
      pool.release(browser);
    } catch (err) {
      await pool.destroy(browser);
      throw err;
    }
  }
}
Addendum
Figuring out what's going on in your code will be easier if you embrace async functions instead of mixing async/await with Promise callback style. Here's how it can be rewritten:
async function performExport(params){
  const myPool = BrowserPool.myBrowserPool;
  const client = await myPool.acquire();
  try {
    const url = config.get('url');
    const page = await client.newPage();
    await page.goto(url, {waitUntil: ['networkidle2', 'domcontentloaded']});
    let gotoUrl = `${url}/dashboards/${exportParams.dashboardId}?csv_export_id=${exportParams.csvExportId}`;
    //more processing
    await page.goto(gotoUrl, {waitUntil: 'networkidle2' })
    await myPool.release(client);
    return Data;
  } catch(err) {
    try {
      const l = await myPool.destroy(client);
    } catch(e) {
    }
    return err; // Are you sure you want to do this? Would suggest throw err.
  }
}
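Combined with the withBrowser helper, performExport shrinks further. A sketch, assuming withBrowser is adjusted to return fn's result (i.e. return await fn(browser)):

// Sketch only: relies on a withBrowser that propagates the callback's
// return value; Data and config are placeholders from the original code.
async function performExport(params) {
  return BrowserPool.withBrowser(async client => {
    const page = await client.newPage();
    await page.goto(config.get('url'), {waitUntil: 'networkidle2'});
    // ...more processing, as in the original...
    return Data;
  });
}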

Set localstorage items before page loads in puppeteer?

We have some routing logic that kicks you to the homepage if you don't have a JWT_TOKEN set... I want to set this before the page loads / before the JS is invoked.
How do I do this?
You have to register the localStorage item like this:
await page.evaluate(() => {
  localStorage.setItem('token', 'example-token');
});
You should do it after page.goto — the browser must have a URL to register the local storage item on. After this, enter the same page once again; this time the token should be there before the page is loaded.
Here is a fully working example:
const puppeteer = require('puppeteer');
const http = require('http');

const html = `
<html>
  <body>
    <div id="element"></div>
    <script>
      document.getElementById('element').innerHTML =
        localStorage.getItem('token') ? 'signed' : 'not signed';
    </script>
  </body>
</html>`;

http
  .createServer((req, res) => {
    res.writeHead(200, { 'Content-Type': 'text/html' });
    res.write(html);
    res.end();
  })
  .listen(8080);

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();

  await page.goto('http://localhost:8080/');
  await page.evaluate(() => {
    localStorage.setItem('token', 'example-token');
  });
  await page.goto('http://localhost:8080/');

  const text = await page.evaluate(
    () => document.querySelector('#element').textContent
  );
  console.log(text);

  await browser.close();
  process.exit(0);
})();
There's some discussion about this in Puppeteer's GitHub issues.
You can load a page on the domain, set your localStorage, then go to the actual page you want to load with localStorage ready. You can also intercept the first url load to return instantly instead of actually load the page, potentially saving a lot of time.
const doSomePuppeteerThings = async () => {
  const url = 'http://example.com/';
  const browser = await puppeteer.launch();
  const localStorage = { storageKey: 'storageValue' };

  await setDomainLocalStorage(browser, url, localStorage);

  const page = await browser.newPage();
  // do your actual puppeteer things now
};

const setDomainLocalStorage = async (browser, url, values) => {
  const page = await browser.newPage();
  await page.setRequestInterception(true);
  page.on('request', r => {
    r.respond({
      status: 200,
      contentType: 'text/plain',
      body: 'tweak me.',
    });
  });
  await page.goto(url);
  await page.evaluate(values => {
    for (const key in values) {
      localStorage.setItem(key, values[key]);
    }
  }, values);
  await page.close();
};
In 2021 it works with the following code:
// store the token in localStorage
await page.evaluateOnNewDocument(
  token => {
    localStorage.clear();
    localStorage.setItem('token', token);
  }, 'eyJh...9_8cw');

// open the url
await page.goto('http://localhost:3000/Admin', { waitUntil: 'load' });
Unfortunately, the following snippet from the first answer does not work for me; it produces errors:
await page.evaluate(() => {
  localStorage.setItem('token', 'example-token'); // does not work, produces errors :(
});
Without requiring a double goto, this would work:
const browser = await puppeteer.launch();

browser.on('targetchanged', async (target) => {
  const targetPage = await target.page();
  const client = await targetPage.target().createCDPSession();
  await client.send('Runtime.evaluate', {
    expression: `localStorage.setItem('hello', 'world')`,
  });
});

// newPage, goto, etc...
Adapted from the Lighthouse docs for Puppeteer, which do something similar: https://github.com/GoogleChrome/lighthouse/blob/master/docs/puppeteer.md
Try an additional script tag. Example:
Say you have a main.js script that houses your routing logic, and a setJWT.js script that houses your token logic.
Then within the HTML that loads these scripts, order them this way:
<script src='setJWT.js'></script>
<script src='main.js'></script>
This is only good for the initial load of the page.
Most routing libraries, however, have an event hook system that you can hook into before a route renders. I would store the setJWT logic somewhere in that callback.
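For instance, with Vue Router (one library that exposes such hooks), a global beforeEach guard can seed the token before any route renders. A sketch; the storage key and the fetchTokenSomehow helper are placeholders:

// Hypothetical guard: runs before every route renders
router.beforeEach((to, from, next) => {
  if (!localStorage.getItem('JWT_TOKEN')) {
    localStorage.setItem('JWT_TOKEN', fetchTokenSomehow()); // placeholder helper
  }
  next();
});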
