HTML element not selecting in Puppeteer - javascript

So I have an HTML excerpt from a webpage as follows:
<li class="PaEvOc tv5olb wbTnP gws-horizon-textlists__li-ed">
//random div/element stuff inside here
</li>
<li class ="PaEvOc tv5olb gws-horizon-textlists__li-ed">
//random div/element stuff inside here as well
</li>
Not sure how to properly copy HTML but if you look at "events near location" on Google Chrome, I'm looking at these and trying to scrape the data from them:
https://i.stack.imgur.com/fv4a4.png
To start, I'm just trying to figure out how to properly select these elements in Puppeteer:
// Reproduction from the question: open the Google events search page and look
// up the `.PaEvOc` list items.
// NOTE(review): `puppeteer` is not defined in this excerpt; presumably it is
// required elsewhere — confirm.
(async () => {
const browser = await puppeteer.launch({ args: [
'--no-sandbox'
]});
const page = await browser.newPage();
// One-shot listener: only the first 'load' event is logged.
page.once('load', () => console.log('Page loaded!'));
await page.goto('https://www.google.com/search?q=events+near+poughkeepsie+today&client=safari&rls=en&uact=5&ibp=htl;events&rciv=evn&sa=X&fpstate=tldetail');
console.log('Hit wait for selector')
// Nothing below runs until this wait settles; if it rejects, the remaining logs are skipped.
const test = await page.waitForSelector(".PaEvOc");
console.log('finished waiting for selector');
// Second lookup of the same selector, this time without waiting.
const seeMoreEventsButton = await page.$(".PaEvOc");
// Both values are concatenated into strings, so only their string form is printed.
console.log('seeMoreEventsButton is ' + seeMoreEventsButton);
console.log('test is ' + test);
// NOTE(review): the browser is never closed in this snippet.
})();
What exactly is the problem here? Any and all help much appreciated, thank you!

I suggest reading this: https://intoli.com/blog/not-possible-to-block-chrome-headless/
Basically, websites are detecting that you are scraping, but you can work around it.
Here is what I did to make your console logs print something useful
const puppeteer = require('puppeteer');
/**
 * Opens the Google events search page with a desktop Chrome user agent and
 * logs the element handles found for the `.PaEvOc` list items.
 *
 * The browser is always closed, and any failure is reported on stderr instead
 * of surfacing as an unhandled rejection.
 */
(async () => {
  // Overrides the page's user agent; must run before the first navigation.
  const preparePageForTests = async (page) => {
    // The two halves are joined with a space so the product tokens stay
    // separated ("... x86_64) AppleWebKit/..."); without it the string is malformed.
    const userAgent =
      'Mozilla/5.0 (X11; Linux x86_64) ' +
      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.39 Safari/537.36';
    await page.setUserAgent(userAgent);
  };

  const browser = await puppeteer.launch({ args: ['--no-sandbox'] });
  try {
    const page = await browser.newPage();
    await preparePageForTests(page);
    page.once('load', () => console.log('Page loaded!'));
    await page.goto('https://www.google.com/search?q=events+near+poughkeepsie+today&client=safari&rls=en&uact=5&ibp=htl;events&rciv=evn&sa=X&fpstate=tldetail');
    console.log('Hit wait for selector');
    const test = await page.waitForSelector('.PaEvOc');
    console.log('finished waiting for selector');
    const seeMoreEventsButton = await page.$('.PaEvOc');
    console.log('seeMoreEventsButton is ' + seeMoreEventsButton);
    console.log('test is ' + test);
  } finally {
    // Release the Chromium process on success and on failure alike.
    await browser.close();
  }
})().catch((err) => console.error(err));

Related

How to select specific button in puppeteer

So I'm building a program that scrapes Poshmark webpages and extracts the usernames of each seller on the page!
I want it to go through every page using the 'next' button, but there are 6 buttons that all share the same class name...
Heres the link: https://poshmark.com/category/Men-Jackets_&_Coats?sort_by=like_count&all_size=true&my_size=false
(In my google chrome this page has an infinite scroll (hence the scrollToBottom async function i started writing) but i realized inside puppeteer's chrome it has 'next page' buttons.)
The window displays page 1-5 and then the 'next page' button.
The problem is that all of the buttons share the same html class name, so I'm confused on how to differentiate.
const e = require('express');
const puppeteer = require('puppeteer');
// Category listing page that main() navigates to.
const url = "https://poshmark.com/category/Men-Jackets_&_Coats?sort_by=like_count&all_size=true&my_size=false";
// Module-level accumulator that pushToArray appends to.
let usernames = [];
// Collects the text content of every '.tc--g.m--l--1.ellipses' element on the page.
// NOTE(review): the collected array is stored in a local and never returned,
// so callers of initItemArea receive undefined.
const initItemArea = async (page) => {
const itemArea = await page.evaluate(() => {
return Array.from(document.querySelectorAll('.tc--g.m--l--1.ellipses')).map(x => x.textContent);
});
}
/**
 * Appends every entry of `itemArea`, in order, to the module-level
 * `usernames` list.
 *
 * @param {Array<*>} itemArea - Values to append.
 * @param {*} page - Not used by this function.
 * @returns {Promise<void>}
 */
const pushToArray = async (itemArea, page) => {
  itemArea.forEach((username) => {
    usernames.push(username);
  });
};
/**
 * Repeatedly scrolls the page to the bottom, waits for the document to grow,
 * pauses for one second and saves a screenshot to "ss.png".
 *
 * The loop has no exit condition of its own: it ends only when one of the
 * awaited page calls rejects (for example when `waitForFunction` gives up
 * because the height stopped increasing), and that rejection propagates to
 * the caller.
 *
 * @param {*} itemArea - Not used by this function.
 * @param {object} page - Object exposing `evaluate`, `waitForFunction` and `screenshot`.
 * @returns {Promise<never>} Never fulfils; rejects with the first page error.
 */
const scrollToBottom = async (itemArea, page) => {
  while (true) {
    // Declared locally: assigning it undeclared created an implicit global
    // and throws a ReferenceError in strict-mode / ES-module code.
    const previousHeight = await page.evaluate('document.body.scrollHeight');
    await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
    await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`);
    await new Promise((resolve) => setTimeout(resolve, 1000));
    await page.screenshot({ path: "ss.png" });
  }
};
// Waits for a pagination button to exist and then attempts to click it.
// NOTE(review): the evaluate callback below has no `return`, so `nextButton`
// is undefined and page.click() is not handed a selector.
const gotoNextPage = async (page) => {
await page.waitForSelector(".button.btn.btn--pagination");
// The callback's `page` parameter shadows the outer `page`; no argument is passed for it.
const nextButton = await page.evaluate((page) => {
document.querySelector(".button.btn.btn--pagination")
});
await page.click(nextButton);
console.log('Next Page Loading')
};
/**
 * Entry point: opens the category page in a visible Chrome window, reads the
 * seller names shown on it and then asks for the next page.
 *
 * @returns {Promise<void>} Rejects if launching, navigating or paging fails.
 */
async function main() {
  const client = await puppeteer.launch({
    headless: false,
    executablePath: "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
  });
  const page = await client.newPage();
  await page.goto(url);
  await page.waitForSelector(".tc--g.m--l--1.ellipses");
  // Seller names on the current page; collected here but not consumed yet.
  const itemArea = await page.evaluate(() => {
    return Array.from(document.querySelectorAll('.tc--g.m--l--1.ellipses')).map(x => x.textContent);
  });
  // Awaited so a failure while paging rejects main() instead of being lost
  // as a floating promise.
  await gotoNextPage(page);
}

// Report failures instead of leaving an unhandled rejection.
main().catch((err) => console.error(err));
Currently, my gotoNextPage function doesnt even find the button, so i thought i'd entered the selector wrong...
Then when I went to find the selector, I realized all buttons have the same one anyway...
My html knowledge is basically nonexistent, but I want to finish this project out. All help is very appreciated.
Bonus: my initPageArea function doesn't work when I call as a function like that, so I hardcoded it into main()...
I'll be diving deep into this problem later on, as I've seen it before, but any quick answers / direction would be awesome.
Thanks a lot.
You can try selecting the buttons by their position among their sibling elements (this is what `:nth-child` matches on).
For example, you can select the first button using the following CSS selector:
.button.btn.btn--pagination:nth-child(1)
to select the second button:
.button.btn.btn--pagination:nth-child(2)
Got the idea? :)
you can refactor your gotoNextPage function to use this approach, consider this example:
/**
 * Clicks the pagination button found at a given position.
 *
 * DOM nodes cannot be returned from `page.evaluate` (they are not
 * serializable) and `page.click` expects a selector string, so the selector
 * itself is built here and handed to `page.click`.
 *
 * @param {object} page - Puppeteer page (uses `waitForSelector` and `click`).
 * @param {number} buttonIndex - 1-based `:nth-child` index, i.e. the button's
 *   position among its parent's children.
 * @returns {Promise<void>}
 */
const gotoNextPage = async (page, buttonIndex) => {
  const paginationSelector = ".button.btn.btn--pagination";
  await page.waitForSelector(paginationSelector);
  // Select the button using its position among its siblings
  const nextButtonSelector = `${paginationSelector}:nth-child(${buttonIndex})`;
  // Click on the button
  await page.click(nextButtonSelector);
  console.log("Next Page Loading");
};
Whenever you're messing with buttons and scroll, it's a good idea to think about where the data is coming from. It's usually being delivered to the front-end via a JSON API, so you might as well try to hit that API directly rather than mess with the DOM.
// Builds the collection endpoint URL for a given `max_id` value.
const url = (maxId) => `https://poshmark.com/vm-rest/channel_groups/category/channels/category/collections/post?request={%22filters%22:{%22department%22:%22Men%22,%22category_v2%22:%22Jackets_%26_Coats%22,%22inventory_status%22:[%22available%22]},%22sort_by%22:%22like_count%22,%22facets%22:[%22color%22,%22brand%22,%22size%22],%22experience%22:%22all%22,%22sizeSystem%22:%22us%22,%22max_id%22:%22${maxId}%22,%22count%22:%2248%22}&summarize=true&pm_version=226.1.0`;

// Requests max_id 1 through 4 in sequence, collects each post's
// creator_username, then prints the first ten names and the total count.
(async () => {
  const collected = [];
  let maxId = 1;
  while (maxId < 5 /* for testing */) {
    const res = await fetch(url(maxId)); // Node 18 or install node-fetch
    if (!res.ok) {
      throw Error(res.statusText);
    }
    const body = await res.json();
    if (body.error) {
      // The API reported an error payload: stop paging and print what we have.
      break;
    }
    const names = body.data.map((post) => post.creator_username);
    collected.push(...names);
    maxId += 1;
  }
  console.log(collected.slice(0, 10));
  console.log("usernames.length", collected.length);
})()
  .catch((err) => console.error(err));
The response blob has a ton of additional data.
I would add a significant delay between requests if I were to use code like this to avoid rate limiting/blocking.
If you're set on Puppeteer, something like this should work as well, although it's slower and I didn't have time to run to the end of the 5k (or more?) users:
const puppeteer = require("puppeteer"); // ^19.1.0
const url = "Your URL";
let browser;

// Scrapes the usernames on each page, removes the scraped elements, clicks
// "Next" and repeats until any step throws; then prints everything collected.
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  await page.goto(url, {waitUntil: "domcontentloaded"});

  const sel = ".tc--g.m--l--1.ellipses";
  const usernames = [];

  // Reads the text of every matched element, then deletes those elements so
  // the next waitForSelector only resolves once a fresh batch has rendered.
  const takeCurrentBatch = () =>
    page.$$eval(sel, (nodes) => {
      const texts = nodes.map((node) => node.textContent);
      for (const node of nodes) {
        node.remove();
      }
      return texts;
    });

  // Clicks the pagination button whose label contains "Next"; throws inside
  // the page when no such button exists.
  const clickNext = () =>
    page.$$eval(".btn--pagination", (buttons) =>
      buttons.find((button) => button.textContent.includes("Next")).click()
    );

  let keepGoing = true;
  while (keepGoing) {
    try {
      await page.waitForSelector(sel);
      const users = await takeCurrentBatch();
      console.log(users); // optional for debugging
      usernames.push(...users);
      await clickNext();
    } catch (err) {
      // Any failure (wait timed out, no "Next" button, ...) ends the crawl.
      keepGoing = false;
    }
  }

  console.log(usernames);
  console.log(usernames.length);
})()
  .catch((err) => console.error(err))
  .finally(() => browser?.close());
I don't think navigations are triggered by the "Next" button, so my strategy for detecting when a page transition has occurred involves destroying the current set of elements after scraping the usernames, then waiting until the next batch shows up. This may seem inelegant, but it's easy to implement and seems reliable, not making assumptions about the usernames themselves.
It's also possible to use Puppeteer and make or intercept API requests, armed with a fresh cookie. This is sort of halfway between the two extremes shown above. For example:
const puppeteer = require("puppeteer");
const url = "Your URL";
let browser;

// Navigates to `url` first so the API requests below are issued from inside
// the page, then collects creator usernames for max_id 1 through 4.
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  await page.goto(url, {waitUntil: "domcontentloaded"});
  const usernames = await page.evaluate(async () => {
    // Everything in this callback executes in the browser, not in Node.
    const url = maxId => `https://poshmark.com/vm-rest/channel_groups/category/channels/category/collections/post?request={%22filters%22:{%22department%22:%22Men%22,%22category_v2%22:%22Jackets_%26_Coats%22,%22inventory_status%22:[%22available%22]},%22sort_by%22:%22like_count%22,%22facets%22:[%22color%22,%22brand%22,%22size%22],%22experience%22:%22all%22,%22sizeSystem%22:%22us%22,%22max_id%22:%22${maxId}%22,%22count%22:%2248%22}&summarize=true&pm_version=226.1.0`;
    const usernames = [];
    try {
      for (let maxId = 1; maxId < 5 /* for testing */; maxId++) {
        const response = await fetch(url(maxId)); // the browser's own fetch
        if (!response.ok) {
          // Caught below: whatever was collected so far is still returned.
          throw Error(response.statusText);
        }
        const json = await response.json();
        if (json.error) {
          break;
        }
        usernames.push(...json.data.map(e => e.creator_username));
      }
    }
    catch (err) {
      // Logged to the page's console, not to the Node process.
      console.error(err);
    }
    return usernames;
  });
  console.log(usernames);
  console.log("usernames.length", usernames.length);
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
The above code limits to 4 requests to keep it simple and easy to validate.
Blocking images and other unnecessary resources can help speed the Puppeteer versions up, left as an exercise (or just use the direct fetch version shown at top).

Need to select very specific element using querySelector without returning undefined

I'm scraping a site for data using Puppeteer and need to get a really specific piece of data from the site, I'm trying to use querySelector to get the classname of where the data is but its proven rather difficult because there are 22 other elements that use the exact classname(the classname is FormData), out of the 22 its the 18th and I've been trying to select it and print it out but to no avail, I always get the same error or something along the lines.
Code
// MODULES
const puppeteer = require("puppeteer");
// URL of the EDGAR full-text search page the data is scraped from
const URL = "https://www.sec.gov/edgar/search/#/category=form-cat2";
// Reproduction from the question: read the first entity name, the first filing
// date and the 18th `.FormData` element, then print two of them.
(async () => {
try {
const chromeBrowser = await puppeteer.launch({ headless: true });
const page = await chromeBrowser.newPage();
// timeout: 0 disables the navigation timeout.
await page.goto(URL, {timeout: 0});
const getInfo = await page.evaluate(() => {
const secTableEN = document.querySelector(".table td.entity-name");
const secTableFiled = document.querySelector(".table td.filed");
// NOTE(review): querySelector returns a single element (or null), not a list,
// so indexing it with [17] yields undefined; reading .innerText below then throws.
const secTableLinkPrice = document.querySelector('.FormData')[17];
return {
secTableEN: secTableEN.innerText,
secTableFiled: secTableFiled.innerText,
secTableLinkPrice: secTableLinkPrice.innerText,
};
});
// secTableFiled is returned above but not printed here.
console.log(
"Name: " + getInfo.secTableEN, '\n' +
"Amount Purchased: " + getInfo.secTableLinkPrice, '\n'
);
await page.close();
await chromeBrowser.close();
} catch (e) {
// NOTE(review): when an error lands here the browser has not been closed.
console.error(e)
}
})();
The error I always get is: Error: Evaluation failed: TypeError: Cannot read properties of undefined (reading 'innerText'). It only happens when I try to return secTableLinkPrice.innerText; the other two work fine on their own. What can I do?
Apparently the price you want from the top result is in a popup, so you need to click on one of the .preview-file links to make that popup appear. Only then can you select .FormData from the iframe modal.
const puppeteer = require("puppeteer"); // ^19.1.0
const url = "<YOUR URL>";
let browser;

// Opens the first filing preview, switches into the preview iframe and prints
// the text of the cell that contains the "$" marker.
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  const ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36";
  await page.setUserAgent(ua);
  await page.goto(url, {waitUntil: "domcontentloaded"});

  // The price only exists inside the popup, so open it first.
  const previewLink = await page.waitForSelector(".filetype .preview-file");
  await previewLink.click();

  const previewIframe = await page.waitForSelector("#ipreviewer");
  const frame = await previewIframe.contentFrame();
  await frame.waitForSelector(".FormText");

  const price = await frame.$$eval(".FormText", (cells) => {
    // Locate the cell whose trimmed text is exactly "$", then read its parent.
    const dollarCell = cells.find((cell) => cell.textContent.trim() === "$");
    return dollarCell.parentNode.textContent.trim();
  });
  console.log(price);
})()
  .catch((err) => console.error(err))
  .finally(() => browser?.close());
Now, the popup triggers a network request to an XML file (which appears to be HTML), so it might be easiest to just download that, since it probably has all of the data you want. In the code below, I'm actually parsing and traversing that HTML with Puppeteer, so it looks like more work, but perhaps you could just save this file to disk, depending on your needs:
// ... same as above ...
let browser;

// Captures the ".xml" response triggered by opening a filing preview, swaps
// the page body for that markup and prints the text of the "$" cell's parent.
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  const ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36";
  await page.setUserAgent(ua);
  await page.goto(url, {waitUntil: "domcontentloaded"});

  // Start listening before the click so the response cannot be missed.
  const isXmlSuccess = (res) => res.status() === 200 && res.url().endsWith(".xml");
  const xmlResponsePromise = page.waitForResponse(isXmlSuccess);

  const previewLink = await page.waitForSelector(".filetype .preview-file");
  await previewLink.click();

  const xmlResponse = await xmlResponsePromise;
  const html = await xmlResponse.text();
  await page.evaluate((markup) => document.body.outerHTML = markup, html);

  const price = await page.$$eval(".FormText", (cells) => {
    const dollarCell = cells.find((cell) => cell.textContent.trim() === "$");
    return dollarCell.parentNode.textContent.trim();
  });
  console.log(price);
})()
  .catch((err) => console.error(err))
  .finally(() => browser?.close());
Finally, some documents don't have a price, so the above code only works on the "4 (Insider trading report)". Furthermore, I haven't validated that all of these "type 4" reports are exactly the same. You'll probably want to handle this in your code and proceed carefully.

puppeteer being redirected when browser is not

Attempting to test page https://publicindex.sccourts.org/anderson/publicindex/
When navigating with standard browser to the page, the navigation ends at the requested page (https://publicindex.sccourts.org/anderson/publicindex/) with the page displaying an "accept" button.
However, when testing with puppeteer in headless mode, the request is redirected to https://publicindex.sccourts.org.
I have a rough idea of what is occurring, but cannot seem to prevent the redirection to https://publicindex.sccourts.org when the page is requested using puppeteer.
here is what I believe is occuring with the user controlled browser:
request for page is sent. (assuming first visit)
the response is pure JS,
The js code specifies to:
copy the initial page request headers
add a specific header, and re-request the same page (xhr)
copies a url from one of the response headers and replaces the location
(or)
checks the page history,
adds the url from the response to page to history,
opens a new window,
writes the xhr response to the new page
closes the new window
adds an event listener for a function in the returned xhr request
fires the event
With puppeteer I have tried tracing the js, recording har, monitoring cookies, watching the request chain, intercepting page requests and adjusting headers,watching history....etc. I'm stumped.
Here is the most basic version of the puppeteer script:
// Minimal reproduction from the question: load the court index page in
// headless Chrome, record a HAR file and print the resulting HTML.
function run () {
let url = 'https://publicindex.sccourts.org/anderson/publicindex/';
const puppeteer = require('puppeteer');
const PuppeteerHar = require('puppeteer-har');
puppeteer.launch({headless: true}).then(async browser => {
const page = await browser.newPage();
await page.setJavaScriptEnabled(true);
await page.setViewport({width: 1920, height: 1280});
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36');
// HAR capture brackets the navigation: start before goto, stop after it.
const har = new PuppeteerHar(page);
await har.start({path: 'results.har'});
// `response` is assigned but not read afterwards.
const response = await page.goto(url);
// NOTE(review): this waits for a further navigation after goto() has already
// resolved; presumably it is meant to catch the script-driven reload — confirm.
await page.waitForNavigation();
await har.stop();
let bodyHTML = await page.content();
console.log(bodyHTML);
// NOTE(review): the browser is never closed and the promise chain has no rejection handler.
});
};
run();
why can I not get puppeteer to simply replicate the process that is being executed by js when I am navigating to the page in chrome, and end navigation on the "accept" page?
here is a version with more verbose logging:
// Verbose variant of the reproduction: same navigation, plus frame/request
// logging, request interception, HAR capture, tracing and JS coverage.
function run () {
let url = 'https://publicindex.sccourts.org/anderson/publicindex/';
const puppeteer = require('puppeteer');
const PuppeteerHar = require('puppeteer-har');
puppeteer.launch().then(async browser => {
const page = await browser.newPage();
await page.setJavaScriptEnabled(true);
await page.setViewport({width:1920,height:1280});
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36');
// With interception on, every request must be continued or aborted by the 'request' handler below.
await page.setRequestInterception(true);
page.on('frameattached', frame =>{ console.log('frame attached ');});
page.on('framedetached', frame =>{ console.log('frame detached ');});
page.on('framenavigated', frame =>{ console.log('frame navigated '); });
page.on('requestfailed', req =>{ console.log('request failed ');});
// Logs 'frame finished' although the event is 'requestfinished'.
page.on('requestfinished', req =>{ console.log('frame finished '); console.log(req.url())});
// Number of intercepted requests handled so far.
let count = 0;
// Not read anywhere; the handler below declares its own `headers` that shadows this one.
let headers = '';
page.on('request', interceptedRequest => {
console.log('requesting ' + count + 'times');
console.log('request for ' + interceptedRequest.url());
console.log(interceptedRequest);
// Once three requests have been let through, every later request is aborted.
if (count>2) {
interceptedRequest.abort();
return;
}
if (interceptedRequest.url() == url) {
count++;
// Extra headers are added only when the very first intercepted request is for `url`.
if (count == 1) {
const headers = interceptedRequest.headers();
headers['authority'] = 'publicindex.sccourts.org';
headers['sec-fetch-dest'] = 'empty';
headers['sec-fetch-mode'] = 'cors';
headers['sec-fetch-site'] = 'same-origin';
headers['upgrade-insecure-requests'] = '1';
interceptedRequest.continue({headers});
return;
} else {
interceptedRequest.continue();
return;
}
}
// Any other URL: count it and let it through unchanged.
count++;
interceptedRequest.continue();
return;
});
const har = new PuppeteerHar(page);
await har.start({ path: 'results.har' });
await page.tracing.start({path: 'trace.json'});
await Promise.all([page.coverage.startJSCoverage({reportAnonymousScripts : true})]);
const response = await page.goto(url);
const session = await page.target().createCDPSession();
await session.send('Page.enable');
await session.send('Page.setWebLifecycleState', {state: 'active'});
// Promise.all over a one-element array, so jsCoverage is a one-element array holding the coverage result.
const jsCoverage = await Promise.all([page.coverage.stopJSCoverage()]);
console.log(jsCoverage);
const chain = response.request().redirectChain();
// String concatenation: only the chain's string form is printed.
console.log(chain + "\n\n");
// NOTE(review): waits for another navigation after goto() already resolved — confirm this is intended.
await page.waitForNavigation();
await har.stop();
let bodyHTML = await page.content();
console.log(bodyHTML);
// NOTE(review): tracing is started above but never stopped in this snippet, and the browser is never closed.
});
};
run();
I don't have a full resolution but I know where the redirection is happening.
I tested your script locally with below:
const puppeteer = require('puppeteer');
const PuppeteerHar = require('puppeteer-har');
/**
 * Diagnostic run: opens the court index page in a visible browser with
 * DevTools, logs every request URL and every response status/headers, and
 * records a HAR file to results.har.
 */
function run() {
  const url = 'https://publicindex.sccourts.org/anderson/publicindex/';
  const launchOptions = {headless: false, devtools: true};

  // Log each outgoing request, then let it proceed untouched.
  const logAndContinue = (request) => {
    console.log('GOT NEW REQUEST', request.url());
    request.continue();
  };

  // Log status code and headers of each response.
  const logResponse = (response) => {
    console.log('GOT NEW RESPONSE', response.status(), response.headers());
  };

  puppeteer.launch(launchOptions).then(async (browser) => {
    const page = await browser.newPage();

    await page.setRequestInterception(true);
    page.on('request', logAndContinue);
    page.on('response', logResponse);

    await page.setJavaScriptEnabled(true);
    await page.setViewport({width: 1920, height: 1280});
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36');

    const har = new PuppeteerHar(page);
    await har.start({path: 'results.har'});
    await page.goto(url);
    await page.waitForNavigation();
    await har.stop();

    // The markup is fetched but intentionally not printed in this variant.
    await page.content();
  });
}

run();
I edited three parts:
Removed headless mode and open the devtools automatically
Intercept all network requests (that I audited)
Hoisted the require calls to the top of the file, because nesting them inside the function hurts my eyes; they are conventionally placed at the top level.
It turns out the page https://publicindex.sccourts.org/anderson/publicindex/ makes a request to https://publicindex.sccourts.org/
However this request returns a 302 Redirect to https://www.sccourts.org/caseSearch/ location, so the browser acts accordingly
I would try to investigate this weird request if it is legit or not and why it redirects on chrome puppeteer
This post might help, there could be something related on chromium being seen as insecure
I also tried to pass args: ['--disable-web-security', '--allow-running-insecure-content'] to launch() object parameter, but without results
Please let us know how it goes! Har has been fun to discover!

Scraping of website through Puppeteer returns undefined

I was trying to scrape the Myntra website. The link is here
I used Puppeteer and Node JS to scrape it. It was working fine and currently I get an error
Error: Evaluation failed: TypeError: Cannot read property 'textContent' of null
at __puppeteer_evaluation_script__:2:55
The function returns an empty object. I have attached my code below.
const puppeteer = require('puppeteer');
// Reproduction from the question: open a Myntra product page and log what
// page.evaluate returns for the `.pdp-price` element.
(async () => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('https://www.myntra.com/jeans/only/only-women-black-skinny-fit-mid-rise-low-distress-stretchable-cropped-jeans/10973332/buy');
// NOTE(review): the callback returns the element itself rather than its text;
// the `.textContent` access named in the reported error does not appear in this snippet.
const body = await page.evaluate( () => {
return document.querySelector('.pdp-price') ;
});
console.log(body);
await browser.close();
} catch (error) {
// NOTE(review): when an error lands here the browser has not been closed.
console.log(error);
}
})();
It seems that this site is blocking requests for which HeadlessChrome is specified in the user-agent, so I changed the user-agent and now everything works as you need. Try this code:
const puppeteer = require('puppeteer');
/**
 * Opens the Myntra product page with a desktop Chrome `user-agent` header and
 * logs the text of the `.pdp-price` element.
 *
 * Errors are logged rather than thrown, and the browser is closed whether or
 * not the scrape succeeds.
 */
(async () => {
  try {
    const browser = await puppeteer.launch();
    try {
      const page = await browser.newPage();
      await page.setExtraHTTPHeaders({
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
      });
      await page.goto('https://www.myntra.com/jeans/only/only-women-black-skinny-fit-mid-rise-low-distress-stretchable-cropped-jeans/10973332/buy');
      const body = await page.evaluate(() => {
        return document.querySelector('.pdp-price').textContent;
      });
      console.log(body);
    } finally {
      // Previously skipped whenever goto/evaluate threw, which left the
      // Chromium process running and kept Node from exiting.
      await browser.close();
    }
  } catch (error) {
    console.log(error);
  }
})();
Something is trying to call .textContent on something that's null. I don't see it in your example, but this is what would happen if code like querySelector('.pdp-price') doesn't find anything - maybe because the page hasn't fully loaded yet or the selector doesn't match anything.
You can pass other options to page.goto to make it wait for longer, which could let things load.

Pausing a loop when the queue is 'full'

I'm building a web crawler with Puppeteer.
The loop passes to the function (loadPage (URL)) a valid url, but would like to pause the loop when N pages are being handled by crawling.
I thought about adding a timeout based on the average time Puppeteer takes to run, but I do not think that is a real solution. I am open to any discussion.
Thank you.
--- Editing for future reference ---
const puppeteer = require('puppeteer');
const stores = require('./data.json').stores;
// Upper bound on pages loaded at the same time; also the size of the page pool.
const MAX_CONCURRENT_TASKS = 5;
// Not referenced by start() or loadPage() in this snippet.
let TOTAL_PAGES = 0;

/**
 * Crawls every active store in batches of at most MAX_CONCURRENT_TASKS,
 * reusing a fixed pool of pages. The next batch starts only after every page
 * of the current batch has finished loading.
 *
 * Errors are logged, not rethrown, and the browser is closed even when a
 * batch fails.
 *
 * @returns {Promise<void>}
 */
const start = async () => {
  //#TODO Create a separate log routine
  console.log('Total de Lojas', stores.length);
  const activatedStores = stores.filter((store) => store.active);
  //#TODO Create a separate log routine
  console.log('Lojas ativas', activatedStores.length);
  try {
    const browser = await puppeteer.launch({
      headless: false // for debugging
    });
    try {
      const pagePool = await Promise.all(Array.from(
        new Array(MAX_CONCURRENT_TASKS),
        () => browser.newPage()
      ));
      while (activatedStores.length !== 0) {
        const batch = activatedStores.splice(0, MAX_CONCURRENT_TASKS);
        // Logged after taking the batch so the number is what really remains
        // (the old "length - MAX" arithmetic went negative on the last batch).
        //#TODO Create a separate log routine
        console.log(`Stores left: ${activatedStores.length}!`);
        await Promise.all(
          batch.map((store, i) => loadPage(store.siteMap, pagePool[i], store))
        );
      }
    } finally {
      // Close the browser on failure too, so no Chromium process is leaked.
      await browser.close();
    }
  } catch (error) {
    //#TODO create function to generate error logs
    console.error(error);
  }
};
/**
 * Loads a URL in the given page and logs the location the page ended up on.
 *
 * @param {string} url - a valid url
 * @param {object} page - puppeteer page, as returned by browser.newPage()
 * @param {Object} store - the settings of this store (not used here yet)
 * @returns {Promise<void>}
 */
const loadPage = async (url, page, store) => {
  const opts = {
    timeout: 0, // no navigation timeout
    waitUntil: 'domcontentloaded'
  };
  // Awaited so the user agent is guaranteed to be in place before the
  // navigation request goes out, and so a failure is not silently dropped.
  await page.setUserAgent('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36');
  await page.goto(url, opts);
  //#TODO Create a separate log routine
  console.log(await page.evaluate(() => document.location.href));
};
// Kick off the crawl.
start()
Without code, it's hard to be sure what exactly you need. Maybe this example can give you some hints.
'use strict';
const puppeteer = require('puppeteer');
/**
 * Demo of batched crawling: processes 20 example URLs, at most
 * `numberOfConcurrentTasks` at a time, reusing a fixed pool of pages.
 * The loop pauses on each batch until all of its pages are done.
 *
 * Errors are logged, and the browser is closed even when a batch fails.
 */
(async function main() {
  try {
    const urls = Array.from(
      new Array(20),
      (_, i) => `https://example.org/?foo=${i}`
    );
    const numberOfConcurrentTasks = 3;

    const browser = await puppeteer.launch();
    try {
      const pagePool = await Promise.all(Array.from(
        new Array(numberOfConcurrentTasks),
        () => browser.newPage()
      ));

      while (urls.length !== 0) {
        console.log(`URLs left: ${urls.length}.`);
        // splice() removes the batch from the queue; page i handles URL i.
        await Promise.all(
          urls.splice(0, numberOfConcurrentTasks)
            .map((url, i) => processDoc(url, pagePool[i]))
        );
      }
    } finally {
      // Previously skipped when any task rejected, leaving Chromium running.
      await browser.close();
    }
  } catch (err) {
    console.error(err);
  }
})();
/**
 * Navigates `page` to `url` and logs the location reported by the document.
 *
 * @param {string} url - Address to open.
 * @param {object} page - Object exposing `goto` and `evaluate`.
 * @returns {Promise<void>}
 */
async function processDoc(url, page) {
  await page.goto(url);
  const currentHref = await page.evaluate(() => document.location.href);
  console.log(currentHref);
}
I am not able to provide a code sample here, but you should definitely look into iterators and generators. Generators work on the principle of a non-blocking pause, which lets you do some computation, stop to run other logic, and then return to your computation.

Categories

Resources