Stop the loop after the n-th element - puppeteer - javascript

How can I stop the loop after the fifth link?
This is my code:
await page.goto('https://old.reddit.com', {"waitUntil": "networkidle0"});
const a_elems = await page.$$('.thumbnail');
for (var i = 0; i < a_elems.length; i++) {
  const elem = a_elems[i];
  const href = await page.evaluate(e => e.href, elem);
  const newPage = await browser.newPage();
  await newPage.goto(href, {"waitUntil": "networkidle0"});
}
I tried nth-child but it doesn't work.
Do you have any ideas?
Thank you!

Simply add the condition in your for loop:
for (var i = 0; i < a_elems.length && i < 5; i++) {
  const elem = a_elems[i];
  const href = await page.evaluate(e => e.href, elem);
  const newPage = await browser.newPage();
  await newPage.goto(href, {"waitUntil": "networkidle0"});
}
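An equivalent sketch, if you prefer to keep the loop condition simple, is to slice the handle array first (this reuses the a_elems and browser variables from the question; page.$$ returns a plain array, so slice works on it):
for (const elem of a_elems.slice(0, 5)) {
  const href = await page.evaluate(e => e.href, elem);
  const newPage = await browser.newPage();
  await newPage.goto(href, {"waitUntil": "networkidle0"});
}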

Related

I can't get puppeteer-core to use the search parameters in the url to get a full list of items from a website by changing the "pageSize" parameter

I am trying to get a list of items from a website with puppeteer-core.
Here is the code that should print 774 in the console but only returns 24.
const puppeteer = require('puppeteer-core');
const jsdom = require("jsdom");
const { JSDOM } = jsdom;

async function test() {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.setJavaScriptEnabled(true)
  await page.setDefaultNavigationTimeout(0);
  await page.goto("https://www.liveauctioneers.com/catalog/256884_8-27-22-cameo-cut-glass-art-glass-and-more/?page=1&pageSize=1000", {waitUntil: "networkidle2"});
  let pageContent = await page.content()
  let dom = new JSDOM(pageContent)
  let div = dom.window.document.querySelectorAll("div")
  await div.forEach(element => {
    if (element.id == "content") {
      dom = new JSDOM(element.innerHTML)
      div = dom.window.document.querySelectorAll("div")
      div.forEach(element => {
        if (element.className == "App__StyledApp-sc-eiwfgw-0 cHSpyq") {
          dom = new JSDOM(element.innerHTML)
          div = dom.window.document.querySelectorAll("div")
          div.forEach(element => {
            if (element.id == "main") {
              dom = new JSDOM(element.innerHTML)
              div = dom.window.document.querySelectorAll("div")
              dom = new JSDOM(div[0].innerHTML)
              div = dom.window.document.querySelectorAll("div")
              div.forEach(element => {
                if (element.className == "CatalogPageItems__StyledContainer-sc-y0p083-0 bLuQEb") {
                  dom = new JSDOM(element.innerHTML)
                  div = dom.window.document.querySelectorAll("div")
                  dom = new JSDOM(div[1].innerHTML)
                  div = dom.window.document.querySelectorAll("div")
                  dom = new JSDOM(div[0].innerHTML)
                  div = dom.window.document.querySelectorAll("div")
                  console.log(div.length)
                }
              });
            }
          });
        }
      });
    }
  })
  await browser.close();
}
test()
For me this code returns 24 instead of 774. If I load the url "https://www.liveauctioneers.com/catalog/256884_8-27-22-cameo-cut-glass-art-glass-and-more/" into my browser the pageSize defaults to 24.
I tried to simplify the code to find the number of items. I looked only at how many children the grid has and ended up with this one simple line that gives me the correct result:
...
let pageContent = await page.content();
let dom = new JSDOM(pageContent);
let count = dom.window.document.querySelector(".Grid__ItemGridContainer-sc-1fy9g29-0").childElementCount;
console.log(count);
await browser.close();
...
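An even shorter sketch does the counting inside the page and skips JSDOM entirely (assuming the same grid class name as above, which may change whenever the site regenerates its styled-components hashes):
let count = await page.$eval(".Grid__ItemGridContainer-sc-1fy9g29-0", el => el.childElementCount);
console.log(count);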

Nodejs/Puppeteer How to iteratively scrape website without restarting another puppeteer instance?

I am trying to web scrape with puppeteer. Currently the following code works, but I think there are optimizations to be made, one of which is to use only one puppeteer instance. But I don't really know how to do that. Can anyone help me?
This is the working but slow original:
const puppeteer = require('puppeteer');

async function scrapeProduct(url) {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto(url);
  const [el] = await page.$x('xpath of element');
  const txt = await el.getProperty('textContent');
  const rawTxt = await txt.jsonValue();
  browser.close();
  return rawTxt;
}

async function something() {
  var some_varible = some_array.length; // the length of some array
  process.setMaxListeners(some_varible);
  for (var i = 0; i < some_varible; i++) {
    var target = some_array[i].Name;
    var url = 'somewebsite' + target;
    console.log(target + ": " + await scrapeProduct(url));
  }
}
something();
This is my pathetic attempt at not using multiple instances of puppeteer: (Does not work)
const puppeteer = require('puppeteer');

async function scrapeProduct(url, page) {
  await page.goto(url);
  const [el] = await page.$x('xpath of element');
  const txt = await el.getProperty('textContent');
  const rawTxt = await txt.jsonValue();
  return rawTxt;
}

async function something() {
  var some_varible = some_array.length; // the length of some array
  process.setMaxListeners(some_varible);
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  for (var i = 0; i < some_varible; i++) {
    var target = some_array[i].Name;
    var url = 'somewebsite' + target;
    console.log(target + ": " + await scrapeProduct(url, page));
  }
  browser.close();
}
something();
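A minimal sketch of the single-instance pattern the question is asking about: launch the browser once, pass it into the scraping function, and open a fresh tab per URL. The placeholders (some_array, the XPath string, 'somewebsite') are carried over from the question, so this is a sketch rather than runnable as-is:
const puppeteer = require('puppeteer');

async function scrapeProduct(browser, url) {
  const page = await browser.newPage();   // a fresh tab, but in the same browser
  try {
    await page.goto(url);
    const [el] = await page.$x('xpath of element'); // placeholder from the question
    const txt = await el.getProperty('textContent');
    return await txt.jsonValue();
  } finally {
    await page.close();                    // close the tab, keep the browser
  }
}

async function something() {
  const browser = await puppeteer.launch(); // launched exactly once
  try {
    for (const item of some_array) {        // some_array is a placeholder from the question
      const url = 'somewebsite' + item.Name;
      console.log(item.Name + ": " + await scrapeProduct(browser, url));
    }
  } finally {
    await browser.close();                  // close once, even if a page errors
  }
}
something();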

async/await, not waiting for the variable to populate before printing

async function filterusers(users) {
  let usersfiltered = []
  for (var i = 0; i < users.length; i++) {
    let userref = db.collection('usernames').doc(users[i]);
    let getDoc = userref.get()
      .then(doc => {
        if (doc.exists) {
          usersfiltered.push(users[i])
        }
      })
  }
  return await usersfiltered;
}
filterusers(users).then(console.log);
I am looking to wait for the filtered result, but it always prints a blank array, i.e. it prints before the result has been populated.
async function filterusers(users) {
  let usersfiltered = []
  // You should use let or const instead of var.
  for (let i = 0; i < users.length; i++) {
    // I believe getting doc needs await.
    let userref = await db.collection('usernames').doc(users[i]);
    await userref.get()
      .then(doc => {
        if (doc.exists) {
          usersfiltered.push(users[i])
        }
      })
  }
  return usersfiltered;
}
filterusers(users).then(console.log);
First, you have to turn the array of users into an array of Promises (asynchronous operations) by using Array.map:
const checkUserPromises = users.map((user) => {
  const userref = db.collection("usernames").doc(user);
  return userref.get().then((doc) => {
    if (doc.exists) {
      return user;
    } else {
      return null;
    }
  });
});
Then, you need to await these promises with Promise.all:
const checkedUsers = await Promise.all(checkUserPromises);
Lastly, you may want to filter out the users that do not exist:
const existingUsers = checkedUsers.filter((user) => user !== null);
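Put together, a sketch of the whole function along these lines (assuming the same Firestore-style db object as in the question):
async function filterusers(users) {
  const checkUserPromises = users.map((user) => {
    const userref = db.collection("usernames").doc(user);
    return userref.get().then((doc) => (doc.exists ? user : null));
  });
  // All the reads run in parallel instead of one per loop iteration.
  const checkedUsers = await Promise.all(checkUserPromises);
  return checkedUsers.filter((user) => user !== null);
}
filterusers(users).then(console.log);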
await should be used with the Promise:
async function filterusers(users) {
  let usersfiltered = [];
  for (let i = 0; i < users.length; i++) {
    let userref = db.collection("usernames").doc(users[i]);
    let getDoc = await userref.get();
    if (getDoc.exists) {
      usersfiltered.push(users[i]);
    }
  }
  return usersfiltered;
}
filterusers(users).then((filterdUsers) => console.log(filterdUsers));
Firstly, do not mix async/await with .then inside the function.
Secondly, use the ES6 for...of loop to make the code work properly with async/await.
async function filterusers(users) {
  let usersfiltered = [];
  for (const user of users) {
    let userref = db.collection('usernames').doc(user);
    const doc = await userref.get();
    if (doc.exists) {
      usersfiltered.push(user);
    }
  }
  return usersfiltered;
}
filterusers(users).then(console.log);
Solved it myself by moving the await to before userref.get()
async function filterusers(users) {
  let usersfiltered = []
  for (var i = 0; i < users.length; i++) {
    let userref = db.collection('usernames').doc(users[i]);
    let getDoc = await userref.get()
      .then(doc => {
        if (doc.exists) {
          usersfiltered.push(users[i])
        }
      })
  }
  return usersfiltered;
}
filterusers(users).then(console.log);

Puppeteer doesn't display results from first cycle of loops

I have this piece of code which loops through one page with 3 frames, collects data from them and puts it together. The problem is that the code displays incomplete results for the first 2 loops; after that everything is fine, or randomly I get an error like "Execution context was destroyed, most likely because of a navigation".
Please excuse my bad code, but I have only 2 months of experience coding in JavaScript.
const puppeteer = require('puppeteer');
const elementsToClickSelector = 'body > form > font > select option';
const allLineIds = 'body > form > font > select > option';
const timpSosiri = 'body > b > font';

function run() {
  return new Promise(async (resolve, reject) => {
    try {
      const browser = await puppeteer.launch({
        headless: false
      });
      const page = await browser.newPage();
      await page.goto('');
      const frame = page.frames().find(f => f.name() === 'stanga');
      const cframe = page.frames().find(f => f.name() === 'centru');
      const dframe = page.frames().find(f => f.name() === 'dreapta');
      // get all station name to be clicked
      let elementsToClick = await frame.$$(elementsToClickSelector);
      console.log(`Elements to click: ${elementsToClick.length}`);
      if (page.frames().find(f => f.name().includes('stanga'))) {
        console.info('Frame was in the DOM and in the frames list')
      } else {
        console.error('Frame was in the DOM but not in the frames list')
      }
      let test = [];
      for (let i = 0, length = elementsToClick.length; i < length; i++) {
        const item = await frame.evaluateHandle((i) => {
          return document.querySelectorAll('option')[i];
        }, i);
        await frame.waitFor(1000);
        const statieNume = await (await elementsToClick[i].getProperty('innerText')).jsonValue();
        console.log(statieNume);
        await item.click();
        // get all linie ids to be clicked
        let idLine = await cframe.$$(allLineIds);
        for (let j = 0, length1 = idLine.length; j < length1; j++) {
          const lineItem = await cframe.evaluateHandle((j) => {
            return document.querySelectorAll('option')[j];
          }, j);
          const linie = await (await idLine[j].getProperty('innerText')).jsonValue();
          console.log(linie);
          lineItem.click();
          cframe.waitForSelector('body > form > font > select option');
          let timp = await dframe.$$(timpSosiri);
          for (let k = 0, lengthk = timp.length; k < lengthk; k++) {
            const sosiri = await dframe.evaluateHandle((k) => {
              return document.querySelectorAll('b')[k];
            }, k);
            dframe.waitForSelector('body > b > font');
            sosiri.click();
            const timpLinie = await (await timp[k].getProperty('innerHTML')).jsonValue();
            console.log(timpLinie);
            test.push({
              statie: statieNume,
              linie: linie,
              timpi: timpLinie
            });
          }
        }
      }
      browser.close();
      return resolve(JSON.stringify(test));
    } catch (e) {
      return reject(e);
    }
  })
}
run().then(console.log).catch(console.error);
Output
Mures
A Saguna
[1] S5
[0] E3
[0] S5
A.Guttenbrun_1
[1] E8
Sosire1: 17:31<br> Sosire2: 17:38
[0] 21
Sosire1: 17:31<br> Sosire2: 17:38
A.Guttenbrun_2
[0] S10
Sosire1: 17:26<br> Sosire2: 17:55
[1] Tv5
Sosire1: 17:26<br> Sosire2: 17:55
The first entry, Mures, doesn't display lines or times, and A Saguna doesn't display times.
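One thing worth checking (an assumption, not a verified fix): several promises in the inner loops are never awaited (lineItem.click(), cframe.waitForSelector(...), sosiri.click(), dframe.waitForSelector(...)), so the right-hand frame can be read before it has re-rendered, which would match both the incomplete first cycles and the occasional "Execution context was destroyed" error. A sketch of that part of the inner loop with the calls awaited:
await lineItem.click();
await cframe.waitForSelector('body > form > font > select option');
// give the right-hand frame ("dreapta") a chance to update before reading it
await dframe.waitForSelector('body > b > font');
let timp = await dframe.$$(timpSosiri);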

Looping through a set of urls in Puppeteer

How would I scrape content from multiple urls using Puppeteer?
I've created a loop, but I'm only seeing the results for the first url.
I suspect it's something to do with where I declare the results variable, but I've had no luck trying. Does anyone know how to do this?
const puppeteer = require('puppeteer');

function run() {
  return new Promise(async (resolve, reject) => {
    try {
      const browser = await puppeteer.launch();
      const page = await browser.newPage();
      const urls = ["https://www.marksandspencer.com/high-neck-long-sleeve-blouse/p/p60260040?image=SD_01_T43_5168_HD_X_EC_90&color=LIGHTDENIM&prevPage=plp", "https://www.marksandspencer.com/pure-cotton-printed-short-sleeve-t-shirt/p/p60263529?image=SD_01_T41_8030Z_Z4_X_EC_90&color=WHITEMIX&prevPage=plp"];
      for (let i = 0; i < urls.length; i++) {
        const url = urls[i];
        await page.goto(url);
        let products = await page.evaluate(() => {
          let product = document.querySelector('h1[itemprop=name]').innerText;
          let results = [];
          let items = document.querySelectorAll('[data-ttip-id=sizeGridTooltip] tbody tr td label');
          items.forEach((element) => {
            let size = element.getAttribute('for');
            let stockLevel = "";
            let nearest_td = element.closest('td');
            if (nearest_td.classList.contains('low-stock')) {
              stockLevel = "Low stock"
            } else if (nearest_td.classList.contains('out-of-stock')) {
              stockLevel = "Out of stock"
            } else {
              stockLevel = "In stock"
            }
            results.push({
              product: product,
              size: size,
              stock: stockLevel
            })
          });
          return results
        })
        browser.close();
        return resolve(products);
      }
    } catch (e) {
      return reject(e);
    }
  })
}
run().then(console.log).catch(console.error);
These lines are inside your for loop:
browser.close();
return resolve(products);
So as part of the first iteration, you close the browser and return the function. You should move this out of your for loop and store products inside an array like this:
const urls = /* ... */;
const productsList = [];
for (let i = 0; i < urls.length; i++) {
  const url = urls[i];
  await page.goto(url);
  let products = await page.evaluate(/* ... */);
  productsList.push(products);
}
browser.close();
return resolve(productsList); // resolve with an array containing the aggregated products
In case you are looking for a more elegant solution (for crawling the pages in parallel), you might want to have a look at the library puppeteer-cluster (disclaimer: I'm the author).
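For reference, a minimal sketch of what that looks like with puppeteer-cluster (the extraction logic inside the task is elided here, just as in the snippet above):
const { Cluster } = require('puppeteer-cluster');

(async () => {
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_CONTEXT, // isolated browser context per task
    maxConcurrency: 2,                        // crawl two URLs in parallel
  });

  const productsList = [];
  await cluster.task(async ({ page, data: url }) => {
    await page.goto(url);
    let products = await page.evaluate(/* ... */);
    productsList.push(products);
  });

  urls.forEach(url => cluster.queue(url));
  await cluster.idle();  // wait until every queued URL has been processed
  await cluster.close();
  console.log(productsList);
})();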
