Looping through a set of urls in Puppeteer - javascript

How would I scrape content from multiple urls using Puppeteer?
I've created a loop, but I'm only seeing the results for the first url.
I suspect it's something to do with where I declare the results variable, but I've had no luck trying. Does anyone know how to do this?
const puppeteer = require('puppeteer');
// Scrapes the product name and per-size stock level from each product URL.
function run() {
// NOTE(review): wrapping an async callback in `new Promise` is the explicit
// promise-construction anti-pattern; an async function already returns one.
return new Promise(async (resolve, reject) => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
const urls = ["https://www.marksandspencer.com/high-neck-long-sleeve-blouse/p/p60260040?image=SD_01_T43_5168_HD_X_EC_90&color=LIGHTDENIM&prevPage=plp", "https://www.marksandspencer.com/pure-cotton-printed-short-sleeve-t-shirt/p/p60263529?image=SD_01_T41_8030Z_Z4_X_EC_90&color=WHITEMIX&prevPage=plp"];
for (let i = 0; i < urls.length; i++) {
const url = urls[i];
await page.goto(url);
// Runs in the browser context; collects {product, size, stock} per size label.
let products = await page.evaluate(() => {
let product = document.querySelector('h1[itemprop=name]').innerText;
let results = [];
let items = document.querySelectorAll('[data-ttip-id=sizeGridTooltip] tbody tr td label');
items.forEach((element) => {
let size = element.getAttribute('for');
let stockLevel = "";
let nearest_td = element.closest('td');
// Stock level is encoded as a CSS class on the enclosing <td>.
if (nearest_td.classList.contains('low-stock')) {
stockLevel = "Low stock"
} else if (nearest_td.classList.contains('out-of-stock')) {
stockLevel = "Out of stock"
} else {
stockLevel = "In stock"
}
results.push({
product: product,
size: size,
stock: stockLevel
})
});
return results
})
// BUG (the subject of the question): these two lines execute inside the
// first loop iteration, so the browser is closed and the promise resolved
// before any of the remaining URLs are visited.
browser.close();
return resolve(products);
}
} catch (e) {
return reject(e);
}
})
}
run().then(console.log).catch(console.error);

These lines are inside your for loop:
browser.close();
return resolve(products);
So as part of the first iteration, you close the browser and return the function. You should move this out of your for loop and store products inside an array like this:
const urls = /* ... */;
const productsList = []; // aggregates one result set per URL
for (let i = 0; i < urls.length; i++) {
const url = urls[i];
await page.goto(url);
let products = await page.evaluate(/* ... */);
productsList.push(products);
}
// Close and resolve only after every URL has been visited.
browser.close();
return resolve(productsList); // resolve with an array containing the aggregated products
In case you are looking for a more elegant solution (for crawling the pages in parallel), you might want to have a look at the library puppeteer-cluster (disclaimer: I'm the author).

Related

Puppeteer: Save data from a for loop in database

I'm web-scraping a site and managed to extract data in the for loop.
However, I don't know how can I save it to my MongoDB database as I'm receiving an error ReferenceError: nameElement is not defined .
How can I save the results from my for loop as an object to my database?
const kclResults = [];
// Scrapes contact details from a list page and tries to persist them.
async function scrapeInfiniteScrollItems(
page,
scrollDelay = 10000
) {
try {
const html = await page.content();
const $ = cheerio.load(html);
// NOTE(review): the callback passed to page.evaluate runs in the browser
// context, where the Node-side cheerio `$` is not defined — see the
// answer's PS below.
await page.evaluate(() => {
let elements = $("[role='listitem']")
.find("._2DX0iPG8PDF3Si_o5PlzIj")
.toArray();
// `i` is assigned without let/const/var — an implicit global.
for (i = 0; i < elements.length; i++) {
$(elements[i]).click();
const nameElement = $("[data-log-name='PersonName']").text();
const emailElement = $("[data-log-name='Email']").text();
const allElements = $("[aria-label='Contact information']").text();
const officeLocation = $("[data-log-name='OfficeLocation']").text();
const position = $("[data-log-name='Company']").text();
const jobTitle = $("[data-log-name='JobTitle']").text();
const departament = $("[data-log-name='Department']").text();
console.log(
`email: ${emailElement} name: ${nameElement} allElements: ${allElements} \n office location: ${officeLocation} \n position: ${position} \n jobTitle: ${jobTitle} \n departament: ${departament}`
);
}
});
// BUG (the subject of the question): nameElement and friends were declared
// inside the evaluate callback's loop scope; they do not exist here, hence
// "ReferenceError: nameElement is not defined".
let kclResult = new KingsDB({
nameElement,
emailElement,
allElements,
officeLocation,
position,
jobTitle,
departament,
});
kclResults.push(kclResult);
console.log(kclResults);
kclResult.save();
return kclResults;
} catch (error) {
console.log(error);
}
}
You are declaring nameElement (and other variables) in for loop scope and trying to access it outside that scope.
Just create an array of "elements" and iterate over it when you're writing it to your DB. This code below should work:
const kclResults = [];
// Scrapes contact details for every list item on the page and persists each
// record via the KingsDB model. Returns the array of saved model instances.
async function scrapeInfiniteScrollItems(
  page,
  scrollDelay = 10000
) {
  try {
    const html = await page.content();
    const $ = cheerio.load(html);
    // The evaluate callback runs in the browser context and must return
    // plain, serializable data; collect it into resultArr.
    const resultArr = await page.evaluate(() => {
      let elements = $("[role='listitem']")
        .find("._2DX0iPG8PDF3Si_o5PlzIj")
        .toArray();
      const resultArr = [];
      // Fixed: `i` was assigned without a declaration (implicit global).
      for (let i = 0; i < elements.length; i++) {
        $(elements[i]).click();
        const nameElement = $("[data-log-name='PersonName']").text();
        const emailElement = $("[data-log-name='Email']").text();
        const allElements = $("[aria-label='Contact information']").text();
        const officeLocation = $("[data-log-name='OfficeLocation']").text();
        const position = $("[data-log-name='Company']").text();
        const jobTitle = $("[data-log-name='JobTitle']").text();
        const departament = $("[data-log-name='Department']").text();
        resultArr.push({
          nameElement,
          emailElement,
          allElements,
          officeLocation,
          position,
          jobTitle,
          departament
        });
        console.log(
          `email: ${emailElement} name: ${nameElement} allElements: ${allElements} \n office location: ${officeLocation} \n position: ${position} \n jobTitle: ${jobTitle} \n departament: ${departament}`
        );
      }
      return resultArr;
    });
    // Shadows the module-level kclResults, as in the original.
    const kclResults = [];
    for (const result of resultArr) {
      // `result` already has exactly the fields KingsDB expects; no need to
      // destructure and re-assemble it.
      const kclResult = new KingsDB(result);
      kclResults.push(kclResult);
      // Fixed: the original ALSO pushed kclResult.save() — a pending promise —
      // as a duplicate entry, and never awaited it, so save errors escaped the
      // try/catch. Await the save instead.
      await kclResult.save();
    }
    return kclResults;
  } catch (error) {
    console.log(error);
  }
}
PS: The function passed to pageEvaluate runs in browser context and thus doesn't have access to your node variables, until they are explicitly passed as an argument.

async/await, not waiting for the variable to populate before printing

async function filterusers(users){
let usersfiltered=[]
for(var i = 0;i < users.length; i++){
let userref = db.collection('usernames').doc(users[i]);
// BUG (the subject of the question): the promise returned by get() is
// never awaited, so the function returns before any `doc => push` runs.
let getDoc = userref.get()
.then(doc => {
if (doc.exists) {
usersfiltered.push(users[i])
}
})
}
// `await` on a plain array is a no-op; the array is still empty here.
return await usersfiltered;
}
filterusers(users).then(console.log);
I am looking to wait for the filtered result, but it always prints a blank array, i.e. it prints before the result is returned.
// Returns the subset of `users` whose username document exists, checking
// each document sequentially.
async function filterusers(users) {
  const usersfiltered = [];
  for (let i = 0; i < users.length; i++) {
    // doc() is synchronous; only get() needs to be awaited.
    const userref = db.collection('usernames').doc(users[i]);
    // Fixed: await the result directly instead of mixing await with .then().
    const doc = await userref.get();
    if (doc.exists) {
      usersfiltered.push(users[i]);
    }
  }
  return usersfiltered;
}
filterusers(users).then(console.log);
First, you have to turn the array of users into an array of Promises ( asynchronous operations) by using Array.map:
// Map each user to a promise that resolves to the user (if the doc exists)
// or null, so all lookups can run in parallel.
const checkUserPromises = users.map((user) => {
const userref = db.collection("usernames").doc(user);
return userref.get().then((doc) => {
if (doc.exists) {
return user;
} else {
return null;
}
});
});
Then, you need to await these promises with Promises.all:
const checkedUsers = await Promise.all(checkUserPromises);
Lastly, you may want to filter out the users that are not existing:
const existingUsers = checkedUsers.filter((user) => user !== null);
await should be used with a Promise
// Sequentially checks each username document and keeps the names that exist.
async function filterusers(users) {
  const existing = [];
  for (const username of users) {
    const snapshot = await db.collection("usernames").doc(username).get();
    if (snapshot.exists) {
      existing.push(username);
    }
  }
  return existing;
}
// Fixed: `filterusers` is async and returns a promise, so the original
// logged `Promise { <pending> }` instead of the array. Log the resolved value.
filterusers(users).then((filteredUsers) => console.log(filteredUsers));
Firstly, do not mix async/await with .then.
Secondly, use new ES6 for loops, to make the code work properly with async/await.
// Returns the subset of `users` whose username document exists.
async function filterusers(users) {
  const usersfiltered = [];
  // Fixed: the original contained a stray `filterusers(users).then(console.log);`
  // line here, inside the function body — unconditional infinite recursion.
  for (const user of users) {
    const userref = db.collection('usernames').doc(user);
    const doc = await userref.get();
    if (doc.exists) {
      usersfiltered.push(user);
    }
  }
  // `usersfiltered` is a plain array; `return await` on a non-promise was a no-op.
  return usersfiltered;
}
Solved it myself by moving the await to before userref.get()
// Keeps only the users whose username document exists, awaiting each
// Firestore read before moving on to the next one.
async function filterusers(users) {
  const usersfiltered = [];
  for (const user of users) {
    const doc = await db.collection('usernames').doc(user).get();
    if (doc.exists) {
      usersfiltered.push(user);
    }
  }
  return usersfiltered;
}
filterusers(users).then(console.log);

Pagination in Zapier

I am trying following code to get all records from a paginated API in Zapier.
const limitPerPage = 20;
const apiUrl = "https://myurl.com/data";
var lastCursor = null;
var output = null;
// Fetches a single page of results for the given cursor.
const getContent = async function (cursor) {
let actualUrl = apiUrl + `?cursor=${cursor}&limit=${limitPerPage}`;
// NOTE(review): `resp.json` is a method that is never called here, and
// `apiResults` is never returned — this function resolves to undefined.
var apiResults = await fetch(actualUrl)
.then(resp => {
return resp.json;
});
}
// Recursively follows metadata.cursor until the API reports no next page.
const getEntireContentList = async function (cursor) {
const results = await getContent(cursor);
console.log("Retreiving data from API for cursor : " + cursor);
if (results.metadata.cursor !== "") {
return results.concat(await getEntireContentList(results.metadata.cursor));
} else {
return results;
}
};
// NOTE(review): Zapier inspects `output` as soon as the top-level code
// returns; this IIFE finishes later, which is why the
// "You did not define output!" error fires.
(async() => {
const entireList = await getEntireContentList();
console.log(entireList);
output = entireList;
callback(null, entireList);
})();
I get error as
You did not define output! Try output = {id: 1, hello: await Promise.resolve("world")};
How can I fix this?
Your problem is that though you're awaiting in that function, the top-level carries on and execution ends before your code has had a chance to run.
The good news is, Zapier wraps your code in an async function already, so you can use await at the top level (per these docs).
Try this instead:
const limitPerPage = 20;
const apiUrl = "https://myurl.com/data";
let lastCursor = null;
// var output = null; // zapier does this for you already

// Fetches one page of results from the API for the given cursor.
const getContent = async function (cursor) {
  const actualUrl = apiUrl + `?cursor=${cursor}&limit=${limitPerPage}`;
  const rawResponse = await fetch(actualUrl);
  // Fixed: the original returned `resp.json` — `resp` was undefined (the
  // variable is named rawResponse) and `json` is an (async) method that was
  // never invoked.
  return rawResponse.json();
};
// Recursively collects every page by following `metadata.cursor` until the
// API returns an empty cursor, concatenating the pages in order.
const getEntireContentList = async function (cursor) {
  const results = await getContent(cursor);
  console.log("Retreiving data from API for cursor : " + cursor);
  if (results.metadata.cursor !== "") {
    // Fixed: the original recursed into a non-existent `getEntireUserList`,
    // which would throw a ReferenceError on any multi-page response.
    return results.concat(await getEntireContentList(results.metadata.cursor));
  } else {
    return results;
  }
};
// Zapier wraps this code in an async function, so a top-level `return` with
// `await` is valid here (it would be a syntax error in a plain script).
return {
results: await getEntireContentList()
}
I noticed this is a recursive approach. That's fine, but remember that you've got limited execution time. You also might hit memory limits (depending on how many objects you're returning), so keep an eye on that.

Puppeteer doesn't display results from first cycle of loops

I have this piece of code which loops through one page with 3 frames, collects data from them, and puts it together. The problem is that the code displays incomplete results for the first 2 loops; after that everything is fine, or randomly I get an error like Execution context was destroyed, most likely because of a navigation.
Please excuse my bad code but I have only 2 months on coding in javaScript
const puppeteer = require('puppeteer');
const elementsToClickSelector = 'body > form > font > select option';
const allLineIds = 'body > form > font > select > option';
const timpSosiri = 'body > b > font';
// Iterates stations (left frame) -> lines (center frame) -> arrival times
// (right frame) and collects {statie, linie, timpi} records.
function run () {
return new Promise(async (resolve, reject) => {
try {
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
// URL removed by the poster.
await page.goto('');
const frame = page.frames().find(f => f.name() === 'stanga');
const cframe = page.frames().find(f => f.name() === 'centru');
const dframe = page.frames().find(f => f.name() === 'dreapta');
// get all station name to be clicked
let elementsToClick = await frame.$$(elementsToClickSelector);
console.log(`Elements to click: ${elementsToClick.length}`);
if (page.frames().find(f => f.name().includes('stanga'))) {
console.info('Frame was in the DOM and in the frames list')
} else {
console.error('Frame was in the DOM but not in the frames list')
}
let test =[];
for (let i =0, length = elementsToClick.length; i< length; i++){
const item = await frame.evaluateHandle((i) =>{
return document.querySelectorAll('option')[i];
},i);
await frame.waitFor(1000);
const statieNume = await (await elementsToClick[i].getProperty('innerText')).jsonValue();
console.log(statieNume);
await item.click();
// get all linie ids to be clicked
let idLine = await cframe.$$(allLineIds);
for(let j = 0, length1 = idLine.length; j<length1; j++){
const lineItem = await cframe.evaluateHandle((j) =>{
return document.querySelectorAll('option')[j];
}, j);
const linie = await (await idLine[j].getProperty('innerText')).jsonValue();
console.log(linie);
// NOTE(review): click() and waitForSelector() both return promises that
// are not awaited here; the reads below race against the navigation they
// trigger — consistent with the incomplete first-iteration output and the
// "Execution context was destroyed" error described in the question.
lineItem.click();
cframe.waitForSelector('body > form > font > select option');
let timp = await dframe.$$(timpSosiri);
for( let k = 0, lengthk = timp.length; k < lengthk; k++){
const sosiri = await dframe.evaluateHandle((k) =>{
return document.querySelectorAll('b')[k];
},k);
// NOTE(review): same un-awaited promises as above.
dframe.waitForSelector('body > b > font');
sosiri.click();
const timpLinie = await (await timp[k].getProperty('innerHTML')).jsonValue();
console.log(timpLinie);
test.push({
statie:statieNume,
linie: linie,
timpi: timpLinie
});
}
}
}
browser.close();
return resolve(JSON.stringify(test));
} catch (e) {
return reject(e);
}
})
}
run().then(console.log).catch(console.error);
Output
Mures
A Saguna
[1] S5
[0] E3
[0] S5
A.Guttenbrun_1
[1] E8
Sosire1: 17:31<br> Sosire2: 17:38
[0] 21
Sosire1: 17:31<br> Sosire2: 17:38
A.Guttenbrun_2
[0] S10
Sosire1: 17:26<br> Sosire2: 17:55
[1] Tv5
Sosire1: 17:26<br> Sosire2: 17:55
First, Mures doesn't display lines and times, and A Saguna doesn't display times.

Scraping IMDb episodes using Cheerio.js - only first page of TV episodes is returned

Working on scraping TV episodes from IMDb (Breaking Bad in the example below). The problem is when implementing the for loop, only the first iteration of j is returned.
My assumption is the return statement is exiting the loop but I'm unsure how to fix the problem.
const fetch = require('node-fetch');
const cheerio = require('cheerio');
const searchUrl = 'https://www.imdb.com/find?s=tt&ttype=tv&ref_=fn_tv&q=';
const movieUrl = 'https://www.imdb.com/title/';
// Fetches every episode of every season for a hard-coded IMDb title.
async function getEpisodes(searchTerm) {
//const imdbID = await getID(searchTerm);
//const numSeasons = await getSeasons(imdbID);
const imdbID = 'tt0903747';
const numSeasons = 5;
const episodes = [];
for (let j = 1; j <= numSeasons; j++) {
// BUG (the subject of the question): `return` exits getEpisodes during
// the first loop iteration, so only season 1 is ever fetched.
return fetch(`${movieUrl}${imdbID}/episodes?season=${j}`)
.then(response => response.text())
.then(body => {
const $ = cheerio.load(body);
$('div[itemProp="episodes"]').each(function (i, element) {
const airdate = $(element).find('.airdate').text().trim();
const episodeTitle = $(element).find('a[itemProp="name"]').text().trim();
const votes = $(element).find('.ipl-rating-star__total-votes').text().trim().match(/\(([^)]+)\)/)[1];
const rating = $(element).find('.ipl-rating-star ').find('.ipl-rating-star__rating').text().trim().slice(0, 3);
// `episode` is assigned without const/let — an implicit global.
episode = {
season: j,
episodeTitle,
airdate,
votes,
rating
};
episodes.push(episode);
});
return episodes; //Only season 1 is returned.
});
}
}
Let's rewrite the function using async await style. This way we make sure we fire fetch numSeasons times, await all of them, and process them one by one.
// Parses one season page's HTML into a list of episode records
// ({season, episodeTitle, airdate, votes, rating}).
async function processResponse(response, season) {
  const body = await response.text();
  const $ = cheerio.load(body);
  const episodes = [];
  $('div[itemProp="episodes"]').each(function (i, element) {
    const airdate = $(element).find('.airdate').text().trim();
    const episodeTitle = $(element).find('a[itemProp="name"]').text().trim();
    // Vote count appears as "(1,234)"; extract the inner text.
    const votes = $(element).find('.ipl-rating-star__total-votes').text().trim().match(/\(([^)]+)\)/)[1];
    const rating = $(element).find('.ipl-rating-star ').find('.ipl-rating-star__rating').text().trim().slice(0, 3);
    // Fixed: `episode` was assigned without const/let — an implicit global
    // (a ReferenceError in strict mode / ES modules).
    const episode = {
      season,
      episodeTitle,
      airdate,
      votes,
      rating
    };
    episodes.push(episode);
  });
  return episodes;
}
// Fetches all seasons in parallel, then parses each response into episode
// records and returns them as one flat array, in season order.
async function getEpisodes(searchTerm) {
  //const imdbID = await getID(searchTerm);
  //const numSeasons = await getSeasons(imdbID);
  const imdbID = 'tt0903747';
  const numSeasons = 5;
  // Kick off all season requests at once.
  const promises = [];
  for (let j = 1; j <= numSeasons; j++) {
    promises.push(fetch(`${movieUrl}${imdbID}/episodes?season=${j}`));
  }
  const responses = await Promise.all(promises);
  // Fixed: the original used `await` inside a non-async reduce callback,
  // which is a SyntaxError (and called a misnamed getEntireUserList-style
  // helper). Process all responses in parallel and flatten instead.
  const perSeason = await Promise.all(
    responses.map((response, index) => processResponse(response, index + 1))
  );
  return perSeason.flat();
}

Categories

Resources