Puppeteer doesn't display results from first cycle of loops - javascript

I have this piece of code which loops through one page with 3 frames and collect data from them an put them together.The problem is that the code is display incomplete results for first 2 loops , after that everything is fine, or randomly i got an error like Execution context was destroyed, most likely because of a navigation.
Please excuse my bad code but I have only 2 months on coding in javaScript
const puppeteer = require('puppeteer');
const elementsToClickSelector = 'body > form > font > select option';
const allLineIds = 'body > form > font > select > option';
const timpSosiri = 'body > b > font';
function run () {
return new Promise(async (resolve, reject) => {
try {
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
await page.goto('');
const frame = page.frames().find(f => f.name() === 'stanga');
const cframe = page.frames().find(f => f.name() === 'centru');
const dframe = page.frames().find(f => f.name() === 'dreapta');
// get all station name to be clicked
let elementsToClick = await frame.$$(elementsToClickSelector);
console.log(`Elements to click: ${elementsToClick.length}`);
if (page.frames().find(f => f.name().includes('stanga'))) {
console.info('Frame was in the DOM and in the frames list')
} else {
console.error('Frame was in the DOM but not in the frames list')
}
let test =[];
for (let i =0, length = elementsToClick.length; i< length; i++){
const item = await frame.evaluateHandle((i) =>{
return document.querySelectorAll('option')[i];
},i);
await frame.waitFor(1000);
const statieNume = await (await elementsToClick[i].getProperty('innerText')).jsonValue();
console.log(statieNume);
await item.click();
// get all linie ids to be clicked
let idLine = await cframe.$$(allLineIds);
for(let j = 0, length1 = idLine.length; j<length1; j++){
const lineItem = await cframe.evaluateHandle((j) =>{
return document.querySelectorAll('option')[j];
}, j);
const linie = await (await idLine[j].getProperty('innerText')).jsonValue();
console.log(linie);
lineItem.click();
cframe.waitForSelector('body > form > font > select option');
let timp = await dframe.$$(timpSosiri);
for( let k = 0, lengthk = timp.length; k < lengthk; k++){
const sosiri = await dframe.evaluateHandle((k) =>{
return document.querySelectorAll('b')[k];
},k);
dframe.waitForSelector('body > b > font');
sosiri.click();
const timpLinie = await (await timp[k].getProperty('innerHTML')).jsonValue();
console.log(timpLinie);
test.push({
statie:statieNume,
linie: linie,
timpi: timpLinie
});
}
}
}
browser.close();
return resolve(JSON.stringify(test));
} catch (e) {
return reject(e);
}
})
}
run().then(console.log).catch(console.error);
Output
Mures
A Saguna
[1] S5
[0] E3
[0] S5
A.Guttenbrun_1
[1] E8
Sosire1: 17:31<br> Sosire2: 17:38
[0] 21
Sosire1: 17:31<br> Sosire2: 17:38
A.Guttenbrun_2
[0] S10
Sosire1: 17:26<br> Sosire2: 17:55
[1] Tv5
Sosire1: 17:26<br> Sosire2: 17:55
First Mures doesn't display lines and time and A Saguna dosen't disply time.

Related

Calling an asynchronous function, main thread wont stop and then there is result with undefined

I am trying to improve my skills with async, await. So I am trying to make an app that collects the prices of different flights in different periods and then it decides in which period the plane ticket is cheapest for personal use.
const puppeteerExtra = require("puppeteer-extra");
const pluginStealth = require("puppeteer-extra-plugin-stealth");
puppeteerExtra.use(pluginStealth());
const PCR = require("puppeteer-chromium-resolver");
const howLongStart = 7;
const howLongEnd = 8;
const fromDate = new Date("2023-07-15");
const toDate = new Date("2023-08-31");
const airport = "PDL";
let tickets = [];
for (let i = 0; i < howLongEnd - howLongStart; i++) {
let howLong = howLongStart + i;
let tempFromDate = new Date("2023-07-15");
let tempFromD = new Date("2023-07-15");
let tempToDate = addDays(tempFromD, howLong);
async function ticketFirstMethod() {
const ticketFirst = await searchFlight(airport, tempFromDate, tempToDate);
tickets.push(ticketFirst);
}
ticketFirstMethod();
while (addDays(tempToDate, 1) <= toDate) {
tempFromDate = addDays(tempFromDate, 1);
tempToDate = addDays(tempToDate, 1);
async function ticketMethod() {
let ticket = await searchFlight(airport, tempFromDate, tempToDate);
tickets.push(ticket);
}
ticketMethod();
}
}
let lowestTicket;
let lowest = Number.POSITIVE_INFINITY;
let highest = Number.NEGATIVE_INFINITY;
let tmp;
for (let i = tickets.length - 1; i >= 0; i--) {
tmp = tickets[i][0];
if (tmp < lowest) {
lowest = tmp;
lowestTicket = tickets[i];
}
if (tmp > highest) highest = tmp;
}
console.log(lowestTicket);
function addDays(date, days) {
date.setDate(date.getDate() + days);
return date;
}
async function searchFlight(airport, tempFromDate, tempToDate) {
const stats = await PCR();
const browser = await puppeteerExtra.launch({
executablePath: stats.executablePath,
headless: false,
});
const page = await browser.newPage();
await page.goto(
"https://www.pelikan.cz/cs/letenky/T:1,P:4000E_0_0,CDF:PRGMUCFRATXLVIE,CDT:C" +
airport +
",R:1,DD:" +
tempFromDate.getFullYear +
"_" +
tempFromDate.getMonth +
"_" +
tempFromDate.getDay +
",DR:" +
tempToDate.getFullYear +
"_" +
tempToDate.getMonth +
"_" +
tempToDate.getDay +
"/",
{ waitUntil: "networkidle2", timeout: 0 }
);
const cheapestPrice = await page.waitForSelector(
"#flight-10000 > div:nth-child(1) > flights-flight:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > div:nth-child(1) > div:nth-child(3)"
);
const price = await page.evaluate((el) => el.textContent, cheapestPrice);
const priceOnly = price.replace(/\D/g, "");
const ticket = [priceOnly, page.url()];
await browser.close();
return ticket;
}
I have tried to put here an example of the code.
Can anyone please help me?
EXPECTED
Firstly I choose a period from when to when it should be searching for the ticket. Then I call searchFlight with this period of time to search for the ticket. The main thread will wait for the function to be processed and then the ticket is pushed to tickets.
BEHAVIOUR
The main thread will not wait and it continous so there is undefined ticket pushed to tickets.
I was trying to use the then method on the line where I am calling searchFlight function. In then method I put tickets.push(ticket). But that didn't work.
I was trying to search for fix but because I dont understand await, async that much I could not fix my code.
First off, remove the (async () => { .... }() wrapper. That's superfluous and getting in the way. The parent function is already async so the wrapper is not needed.
Then, searchFlight is async so you need to await its result where you are calling it. And, you'll need to make it's parent function async so you can use that await.
const ticket = await searchFlight(airport, tempFromDate, tempToDate);
Then, you have to actually return a result from inside of searchFlight. Right now, you have no return result at the top level of that function.
I would suggest you do that by not mixing await and .then(). Just use await like this:
async function searchFlight(airport, tempFromDate, tempToDate){
const stats = await PCR();
const browser = await puppeteerExtra.launch({
executablePath: stats.executablePath,
headless: false
});
const page = await browser.newPage()
await page.goto("...", {waitUntil: "networkidle2", timeout: 0})
const cheapestPrice = await page.waitForSelector('...');
const price = await page.evaluate(el => el.textContent, cheapestPrice);
const priceOnly = price.replace(/\D/g, "");
const ticket = [priceOnly, page.url()];
await browser.close()
return ticket;
}
And, please eliminate any use of var. One should only be using const or let in modern Javascript.

Promise.all returns data in a random order

I was having problems with data fetching taking a long time, because i was fetching data in a loop like this:
const req = await fetch(
"https://www.supplier.co.uk/newarrivals/?setPerPage=25&search_direction=asc&pageID=" +
page
);
const html = await req.text();
const $ = cheerio.load(html);
let newProducts = [];
for (let i = 1; i < 26; i++) {
let pageSrc = $(
`#product_listing > tbody > #_${i} > td:nth-child(2) > a`
).attr("href");
pageSrc = "https://www.supplier.co.uk" + pageSrc;
const req2 = await fetch(pageSrc);
const html2 = await req2.text();
const $2 = cheerio.load(html2);
let imageSrc = $2(
"#product-main-image .main-image-inner:first-child img"
).attr("src");
const name = $2("#product-details dd:nth-child(2)")
.text();
const brand = $2("#product-details dd:nth-child(4)")
.text();
const price = $2("#product-details dd:nth-child(6)")
.text();
newProducts.push({
name,
imageSrc,
brand,
price,
pageSrc,
});
}
return newProducts;
With the help of people on here, i managed to change it so it will fetch all of the URLs in parallel using Promise.all() like this:
const pageSrcs = [];
for (let i = 1; i < 26; i++) {
let pageSrc = $(
`#product_listing > tbody > #_${i} > td:nth-child(2) > a`
).attr("href");
pageSrc = "https://www.supplier.co.uk" + pageSrc;
pageSrcs.push(pageSrc);
}
await Promise.all(
pageSrcs.map((pageSrc) =>
fetch(pageSrc)
.then((res) => res.text())
.then((html2) => {
const $2 = cheerio.load(html2);
let imageSrc = $2(
"#product-main-image .main-image-inner:first-child img"
).attr("src");
const brand = $2("#product-details dd:nth-child(4)").text();
//etc more selectors & functions
newProducts.push({
name,
imageSrc,
brand,
price,
pageSrc,
stock,
arrival,
type,
});
})
)
);
return newProducts;
But now the data gets returned in a random order, obviously since the requests are parallel. Is there a simple way to order them?
edit: added rest of code
You should use the return value from Promise.all, this will be in order. Currently you use the individual return values before they are collected by Promise.all
const texts = await Promise.all(
pageSrcs.map((pageSrc) => fetch(pageSrc).then((res) => res.text()))
)
for (const html of texts) {
const $2 = cheerio.load(html);
let imageSrc = $2("#product-main-image .main-image-inner:first-child img").attr("src");
const brand = $2("#product-details dd:nth-child(4)").text();
//etc...
}

Adapt web-scraper JavaScript code for Puppeteer

I have minimal coding knowledge and I'm trying to adapt some tutorials without success.
The JavaScript code I wish to adapt (script A) is pasted into the Chrome developer console and successfully pulls the data I need. This JavaScript snippet identifies the largest price graphic in an e-commerce site.
A second tutorial (script B) is run from the shell and calls the Puppeteer library. This script pulls some hotel booking data and runs successfully.
I wish to adapt script A to run from the shell using the Puppeteer library.
This is Script A -
let elements = [
...document.querySelectorAll(' body *')
]
function createRecordFromElement(element) {
const text = element.textContent.trim()
var record = {}
const bBox = element.getBoundingClientRect()
if(text.length <= 30 && !(bBox.x == 0 && bBox.y == 0)) {
record['fontSize'] = parseInt(getComputedStyle(element)['fontSize']) }
record['y'] = bBox.y
record['x'] = bBox.x
record['text'] = text
return record
}
let records = elements.map(createRecordFromElement)
function canBePrice(record) {
if( record['y'] > 600 ||
record['fontSize'] == undefined ||
!record['text'].match(/(^(US ){0,1}(rs\.|Rs\.|RS\.|\$|₹|INR|USD|CAD|C\$){0,1}(\s){0,1}[\d,]+(\.\d+){0,1}(\s){0,1}(AED){0,1}$)/)
)
return false
else return true
}
let possiblePriceRecords = records.filter(canBePrice)
let priceRecordsSortedByFontSize = possiblePriceRecords.sort(function(a, b) {
if (a['fontSize'] == b['fontSize']) return a['y'] > b['y']
return a['fontSize'] < b['fontSize']
})
console.log(priceRecordsSortedByFontSize[0]['text']);console.log(priceRecordsSortedByFontSize[1]['text']);
This is Script B -
const puppeteer = require('puppeteer');
let bookingUrl = 'insert booking URL';
(async () => {
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
await page.setViewport({ width: 1920, height: 926 });
await page.goto(bookingUrl);
// get hotel details
let hotelData = await page.evaluate(() => {
let hotels = [];
// get the hotel elements
let hotelsElms = document.querySelectorAll('div.sr_property_block[data-hotelid]');
// get the hotel data
hotelsElms.forEach((hotelelement) => {
let hotelJson = {};
try {
hotelJson.name = hotelelement.querySelector('span.sr-hotel__name').innerText;
hotelJson.reviews = hotelelement.querySelector('span.review-score-widget__subtext').innerText;
hotelJson.rating = hotelelement.querySelector('span.review-score-badge').innerText;
if(hotelelement.querySelector('strong.price')){
hotelJson.price = hotelelement.querySelector('strong.price').innerText;
}
}
catch (exception){
}
hotels.push(hotelJson);
});
return hotels;
});
console.dir(hotelData);
})();
I've had various attempts at adapting Script A into the format of Script B. Various and many different errors have been thrown. Without coding knowledge, I'm not getting anywhere.
Here's one of many variations I've tried, called Script C -
const puppeteer = require('puppeteer-core');
let bookingUrl = 'https://shop.coles.com.au/a/dianella/product/moccona-coffee-capsules-espresso-7';
(async () => {
const browser = await puppeteer.launch({
executablePath: '/usr/bin/chromium-browser',
headless: true
});
const page = await browser.newPage();
await page.setViewport({ width: 1920, height: 926 });
await page.goto(bookingUrl);
// get hotel details
let hotelData = await page.evaluate(() => {
let hotels = [];
// get the hotel elements
let elements = [
...document.querySelectorAll(' body *')
]
function createRecordFromElement(element) {
const text = element.textContent.trim()
var record = {}
const bBox = element.getBoundingClientRect()
if(text.length <= 30 && !(bBox.x == 0 && bBox.y == 0)) {
record['fontSize'] = parseInt(getComputedStyle(element)['fontSize']) }
record['y'] = bBox.y
record['x'] = bBox.x
record['text'] = text
return record
}
let records = elements.map(createRecordFromElement)
function canBePrice(record) {
if( record['y'] > 600 ||
record['fontSize'] == undefined ||
!record['text'].match(/(^(US ){0,1}(rs\.|Rs\.|RS\.|\$|₹|INR|USD|CAD|C\$){0,1}(\s){0,1}[\d,]+(\.\d+){0,1}(\s){0,1}(AED){0,1}$)/)
)
return false
else return true
}
let possiblePriceRecords = records.filter(canBePrice)
let priceRecordsSortedByFontSize = possiblePriceRecords.sort(function(a, b) {
if (a['fontSize'] == b['fontSize']) return a['y'] > b['y']
return a['fontSize'] < b['fontSize']
})
console.log(priceRecordsSortedByFontSize[0]['text']);
})();
Here's the links to the tutorials for info -
https://www.scrapehero.com/how-to-scrape-prices-from-any-ecommerce-website/
https://www.scrapehero.com/how-to-build-a-web-scraper-using-puppeteer-and-node-js/
Is there anything obviously wrong in Script C?
After reading through script C, it appears that you have not made any mistakes, rather the website you are attempting to access has decided to block scraper bots.
A quick host lookup on the domain shows that they are using security service section.io to block scraper bots on their website. See:
shop.coles.com.au is an alias for shop.coles.com.au.c.section.io.
shop.coles.com.au.c.section.io is an alias for shop.coles.com.au.x.section.io

Looping through a set of urls in Puppeteer

How would I scrape content from multiple urls using Puppeteer?
I've created a loop, but I'm only seeing the results for the first url.
I suspect it's something to do with where I declare the results variable, but I've had no luck trying, does anyone know how to do this?
const puppeteer = require('puppeteer');
function run() {
return new Promise(async (resolve, reject) => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
const urls = ["https://www.marksandspencer.com/high-neck-long-sleeve-blouse/p/p60260040?image=SD_01_T43_5168_HD_X_EC_90&color=LIGHTDENIM&prevPage=plp", "https://www.marksandspencer.com/pure-cotton-printed-short-sleeve-t-shirt/p/p60263529?image=SD_01_T41_8030Z_Z4_X_EC_90&color=WHITEMIX&prevPage=plp"];
for (let i = 0; i < urls.length; i++) {
const url = urls[i];
await page.goto(url);
let products = await page.evaluate(() => {
let product = document.querySelector('h1[itemprop=name]').innerText;
let results = [];
let items = document.querySelectorAll('[data-ttip-id=sizeGridTooltip] tbody tr td label');
items.forEach((element) => {
let size = element.getAttribute('for');
let stockLevel = "";
let nearest_td = element.closest('td');
if (nearest_td.classList.contains('low-stock')) {
stockLevel = "Low stock"
} else if (nearest_td.classList.contains('out-of-stock')) {
stockLevel = "Out of stock"
} else {
stockLevel = "In stock"
}
results.push({
product: product,
size: size,
stock: stockLevel
})
});
return results
})
browser.close();
return resolve(products);
}
} catch (e) {
return reject(e);
}
})
}
run().then(console.log).catch(console.error);
These lines are inside your for loop:
browser.close();
return resolve(products);
So as part of the first iteration, you close the browser and return the function. You should move this out of your for loop and store products inside an array like this:
const urls = /* ... */;
const productsList = [];
for (let i = 0; i < urls.length; i++) {
const url = urls[i];
await page.goto(url);
let products = await page.evaluate(/* ... */);
productsList.push(products);
}
browser.close();
return resolve(productsList); // resolve with an array containing the aggregated products
In case you are looking for a more elegant solution (for crawling the pages in parallel), you might want to have a look at the library puppeteer-cluster (disclaimer: I'm the author).

Scraping IMDb episodes using Cheerio.js - only first page of TV episodes is returned

Working on scraping TV episodes from IMDb (Breaking Bad in the example below). The problem is when implementing the for loop, only the first iteration of j is returned.
My assumption is the return statement is exiting the loop but I'm unsure how to fix the problem.
const fetch = require('node-fetch');
const cheerio = require('cheerio');
const searchUrl = 'https://www.imdb.com/find?s=tt&ttype=tv&ref_=fn_tv&q=';
const movieUrl = 'https://www.imdb.com/title/';
async function getEpisodes(searchTerm) {
//const imdbID = await getID(searchTerm);
//const numSeasons = await getSeasons(imdbID);
const imdbID = 'tt0903747';
const numSeasons = 5;
const episodes = [];
for (let j = 1; j <= numSeasons; j++) {
return fetch(`${movieUrl}${imdbID}/episodes?season=${j}`)
.then(response => response.text())
.then(body => {
const $ = cheerio.load(body);
$('div[itemProp="episodes"]').each(function (i, element) {
const airdate = $(element).find('.airdate').text().trim();
const episodeTitle = $(element).find('a[itemProp="name"]').text().trim();
const votes = $(element).find('.ipl-rating-star__total-votes').text().trim().match(/\(([^)]+)\)/)[1];
const rating = $(element).find('.ipl-rating-star ').find('.ipl-rating-star__rating').text().trim().slice(0, 3);
episode = {
season: j,
episodeTitle,
airdate,
votes,
rating
};
episodes.push(episode);
});
return episodes; //Only season 1 is returned.
});
}
}
Let's rewrite the function using async await style. This way we make sure we fire fetch numSeasons times, await all of them, and process them one by one.
async function processResponse(response, season) {
const body = await response.text();
const $ = cheerio.load(body);
let episodes = [];
$('div[itemProp="episodes"]').each(function (i, element) {
const airdate = $(element).find('.airdate').text().trim();
const episodeTitle = $(element).find('a[itemProp="name"]').text().trim();
const votes = $(element).find('.ipl-rating-star__total-votes').text().trim().match(/\(([^)]+)\)/)[1];
const rating = $(element).find('.ipl-rating-star ').find('.ipl-rating-star__rating').text().trim().slice(0, 3);
episode = {
season,
episodeTitle,
airdate,
votes,
rating
};
episodes.push(episode);
});
return episodes;
}
async function getEpisodes(searchTerm) {
//const imdbID = await getID(searchTerm);
//const numSeasons = await getSeasons(imdbID);
const imdbID = 'tt0903747';
const numSeasons = 5;
let promises = [];
for (let j = 1; j <= numSeasons; j++) {
promises.push(fetch(`${movieUrl}${imdbID}/episodes?season=${j}`));
}
const responses = await Promise.all(promises);
return responses.reduce((accumulator, response, index) => {
return accumulator.concat(await processResponse(response, index + 1));
}, []);
}

Categories

Resources