Why page.$$eval is working too many times? - javascript

I want to build a simple bot with puppeteeer.
I used page.$$eval then I tried to fetch data from table(10 page) and mapped that data.
However I can fetch data very well on the other hand the code is working 10 times per page. I mean every row fetched 10 times.
Here is my code snippet:
const tablolariCek = async (url, sayfaSayisi) => {
const browser = await puppeteer.launch({ headless: false });
let page = await browser.newPage();
await page.goto(url);
await page.waitForSelector('#mydata_next');
let okulUni = [];
for (let index = 0; index <= sayfaSayisi; index++) {
let okullar = await page.$$eval(
'#mydata > tbody > [role="row"]',
(uniler) =>
uniler.map((okul) => {
//Here is working 10 times per page.
let uni = {};
uni.okulkodu = okul.querySelector('a').innerText.trim();
const fontVeriler = okul.querySelectorAll('font');
const strongVeriler = okul.querySelectorAll('strong');
for (let index = 0; index < strongVeriler.length; index++) {
if (index == 0) {
uni.uniadi = strongVeriler[index].innerText.trim();
} else if (index == 1) {
uni.bolumadi = strongVeriler[index].innerText.trim();
}
}
for (let index = 0; index < fontVeriler.length; index++) {
if (index == 1) {
uni.bolumadi += ' ' + fontVeriler[index].innerText.trim();
} else if (index == 10) {
uni.siralama2019 = fontVeriler[index].innerText.trim();
} else if (index == 14) {
uni.puan2019 = fontVeriler[index].innerText.trim();
}
}
return uni;
})
);
await page.click('#mydata_next');
okullar.forEach((okul) => {
okulUni.push(okul);
});
}
browser.close();
return okulUni;
};
Here it is what am I trying to fetch
<table id="mydata">
<tbody>
<tr role="row" class="odd">//this line
</tbody>
I can't find solution.
I found a solution by changing this line.
const browser = await puppeteer.launch({ headless: false });
const browser = await puppeteer.launch({ headless: false,slowMo: 150 });
I think due to the speed The code can't fetch table that exactly right. Everything works fine now. Thank you for the answers.

I found a solution by changing this line.
const browser = await puppeteer.launch({ headless: false });
const browser = await puppeteer.launch({ headless: false,slowMo: 150 });
I think due to the speed The code can't fetch table that exactly right. Everything works fine now. Thank you for the answers.

Related

Calling an asynchronous function, main thread wont stop and then there is result with undefined

I am trying to improve my skills with async, await. So I am trying to make an app that collects the prices of different flights in different periods and then it decides in which period the plane ticket is cheapest for personal use.
const puppeteerExtra = require("puppeteer-extra");
const pluginStealth = require("puppeteer-extra-plugin-stealth");
puppeteerExtra.use(pluginStealth());
const PCR = require("puppeteer-chromium-resolver");
const howLongStart = 7;
const howLongEnd = 8;
const fromDate = new Date("2023-07-15");
const toDate = new Date("2023-08-31");
const airport = "PDL";
let tickets = [];
for (let i = 0; i < howLongEnd - howLongStart; i++) {
let howLong = howLongStart + i;
let tempFromDate = new Date("2023-07-15");
let tempFromD = new Date("2023-07-15");
let tempToDate = addDays(tempFromD, howLong);
async function ticketFirstMethod() {
const ticketFirst = await searchFlight(airport, tempFromDate, tempToDate);
tickets.push(ticketFirst);
}
ticketFirstMethod();
while (addDays(tempToDate, 1) <= toDate) {
tempFromDate = addDays(tempFromDate, 1);
tempToDate = addDays(tempToDate, 1);
async function ticketMethod() {
let ticket = await searchFlight(airport, tempFromDate, tempToDate);
tickets.push(ticket);
}
ticketMethod();
}
}
let lowestTicket;
let lowest = Number.POSITIVE_INFINITY;
let highest = Number.NEGATIVE_INFINITY;
let tmp;
for (let i = tickets.length - 1; i >= 0; i--) {
tmp = tickets[i][0];
if (tmp < lowest) {
lowest = tmp;
lowestTicket = tickets[i];
}
if (tmp > highest) highest = tmp;
}
console.log(lowestTicket);
function addDays(date, days) {
date.setDate(date.getDate() + days);
return date;
}
async function searchFlight(airport, tempFromDate, tempToDate) {
const stats = await PCR();
const browser = await puppeteerExtra.launch({
executablePath: stats.executablePath,
headless: false,
});
const page = await browser.newPage();
await page.goto(
"https://www.pelikan.cz/cs/letenky/T:1,P:4000E_0_0,CDF:PRGMUCFRATXLVIE,CDT:C" +
airport +
",R:1,DD:" +
tempFromDate.getFullYear +
"_" +
tempFromDate.getMonth +
"_" +
tempFromDate.getDay +
",DR:" +
tempToDate.getFullYear +
"_" +
tempToDate.getMonth +
"_" +
tempToDate.getDay +
"/",
{ waitUntil: "networkidle2", timeout: 0 }
);
const cheapestPrice = await page.waitForSelector(
"#flight-10000 > div:nth-child(1) > flights-flight:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > div:nth-child(1) > div:nth-child(3)"
);
const price = await page.evaluate((el) => el.textContent, cheapestPrice);
const priceOnly = price.replace(/\D/g, "");
const ticket = [priceOnly, page.url()];
await browser.close();
return ticket;
}
I have tried to put here an example of the code.
Can anyone please help me?
EXPECTED
Firstly I choose a period from when to when it should be searching for the ticket. Then I call searchFlight with this period of time to search for the ticket. The main thread will wait for the function to be processed and then the ticket is pushed to tickets.
BEHAVIOUR
The main thread will not wait and it continous so there is undefined ticket pushed to tickets.
I was trying to use the then method on the line where I am calling searchFlight function. In then method I put tickets.push(ticket). But that didn't work.
I was trying to search for fix but because I dont understand await, async that much I could not fix my code.
First off, remove the (async () => { .... }() wrapper. That's superfluous and getting in the way. The parent function is already async so the wrapper is not needed.
Then, searchFlight is async so you need to await its result where you are calling it. And, you'll need to make it's parent function async so you can use that await.
const ticket = await searchFlight(airport, tempFromDate, tempToDate);
Then, you have to actually return a result from inside of searchFlight. Right now, you have no return result at the top level of that function.
I would suggest you do that by not mixing await and .then(). Just use await like this:
async function searchFlight(airport, tempFromDate, tempToDate){
const stats = await PCR();
const browser = await puppeteerExtra.launch({
executablePath: stats.executablePath,
headless: false
});
const page = await browser.newPage()
await page.goto("...", {waitUntil: "networkidle2", timeout: 0})
const cheapestPrice = await page.waitForSelector('...');
const price = await page.evaluate(el => el.textContent, cheapestPrice);
const priceOnly = price.replace(/\D/g, "");
const ticket = [priceOnly, page.url()];
await browser.close()
return ticket;
}
And, please eliminate any use of var. One should only be using const or let in modern Javascript.

Login > Go to Specific Page > Search > Grab Specific Content From Page Using Node JS and Puppeteer

I want to scrape some data from login protected page using Node Js and Puppeteer. When I try to scrape data, the shows page is not reachable. I don't have any idea about why I am getting output like this. Here is my code.
I am new to Puppeteer and Node Js, its been 8 hours I am trying to modify that code, but not working.
async function EmailLookup(MlsNumber)
{
resp = new { Success = false };
(page = await singletonBrowser.Instance.newPage())
{
await page.setRequestInterception(true);
page.Request += (sender, e) =>
{
if (e.Request.resourceType === resourceType.document || e.Request.resourceType === resourceType.Script || e.Request.resourceType === resourceType.Xhr)
e.Request.Continue();
else
e.Request.Abort();
};
await page.async("https://example.com/idp/login");
if (!page.Url.Contains("layouts"))
{
await page.waitForSelector("#username");
await page.waitForSelector("#password");
await page.waitForSelector("#loginbtn");
await page.async("#username", "xxxxx");
await page.async("#password", "xxxxx");
await page.async("#password", "\r");
await page.waitForNavigation(new navigationOptions { Timeout = 0 });
}
await page.goto("https://example.com/launch?layoutid=61&appid=433");
await page.waitForSelector("#ctl02_m_ucSpeedBar_m_tbSpeedBar");
await page.async("#ctl02_m_ucSpeedBar_m_tbSpeedBar", MlsNumber);
await page.async("#ctl02_m_ucSpeedBar_m_tbSpeedBar", "\r");
await page.waitForNavigation(new navigationOptions { Timeout = 0 });
var MLSLink = await page.waitForXPath("//a[text()='" + MlsNumber + "']");
if (MLSLink != null)
{
await MLSLink.click();
await page.waitForNavigation(new navigationOptions{ Timeout = 0 });
var Content = await page.get();
htmldoc = new htmldoc();
htmldoc.load(Content);
var parcelNode = htmldoc.document.selectSingleNode("//a[contains(#href,'AssessorParcelDetail')]");
var emailNode = htmldoc.document.selectSingleNode("//a[contains(#href,'mailto:')]");
if (emailNode != null && parcelNode != null)
{
resp.Success = true;
resp.Email = emailNode.innerText;
resp.Parcel = parcelNode.innerText;
}
}
return json(resp);
}
}

Adapt web-scraper JavaScript code for Puppeteer

I have minimal coding knowledge and I'm trying to adapt some tutorials without success.
The JavaScript code I wish to adapt (script A) is pasted into the Chrome developer console and successfully pulls the data I need. This JavaScript snippet identifies the largest price graphic in an e-commerce site.
A second tutorial (script B) is run from the shell and calls the Puppeteer library. This script pulls some hotel booking data and runs successfully.
I wish to adapt script A to run from the shell using the Puppeteer library.
This is Script A -
let elements = [
...document.querySelectorAll(' body *')
]
function createRecordFromElement(element) {
const text = element.textContent.trim()
var record = {}
const bBox = element.getBoundingClientRect()
if(text.length <= 30 && !(bBox.x == 0 && bBox.y == 0)) {
record['fontSize'] = parseInt(getComputedStyle(element)['fontSize']) }
record['y'] = bBox.y
record['x'] = bBox.x
record['text'] = text
return record
}
let records = elements.map(createRecordFromElement)
function canBePrice(record) {
if( record['y'] > 600 ||
record['fontSize'] == undefined ||
!record['text'].match(/(^(US ){0,1}(rs\.|Rs\.|RS\.|\$|₹|INR|USD|CAD|C\$){0,1}(\s){0,1}[\d,]+(\.\d+){0,1}(\s){0,1}(AED){0,1}$)/)
)
return false
else return true
}
let possiblePriceRecords = records.filter(canBePrice)
let priceRecordsSortedByFontSize = possiblePriceRecords.sort(function(a, b) {
if (a['fontSize'] == b['fontSize']) return a['y'] > b['y']
return a['fontSize'] < b['fontSize']
})
console.log(priceRecordsSortedByFontSize[0]['text']);console.log(priceRecordsSortedByFontSize[1]['text']);
This is Script B -
const puppeteer = require('puppeteer');
let bookingUrl = 'insert booking URL';
(async () => {
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
await page.setViewport({ width: 1920, height: 926 });
await page.goto(bookingUrl);
// get hotel details
let hotelData = await page.evaluate(() => {
let hotels = [];
// get the hotel elements
let hotelsElms = document.querySelectorAll('div.sr_property_block[data-hotelid]');
// get the hotel data
hotelsElms.forEach((hotelelement) => {
let hotelJson = {};
try {
hotelJson.name = hotelelement.querySelector('span.sr-hotel__name').innerText;
hotelJson.reviews = hotelelement.querySelector('span.review-score-widget__subtext').innerText;
hotelJson.rating = hotelelement.querySelector('span.review-score-badge').innerText;
if(hotelelement.querySelector('strong.price')){
hotelJson.price = hotelelement.querySelector('strong.price').innerText;
}
}
catch (exception){
}
hotels.push(hotelJson);
});
return hotels;
});
console.dir(hotelData);
})();
I've had various attempts at adapting Script A into the format of Script B. Various and many different errors have been thrown. Without coding knowledge, I'm not getting anywhere.
Here's one of many variations I've tried, called Script C -
const puppeteer = require('puppeteer-core');
let bookingUrl = 'https://shop.coles.com.au/a/dianella/product/moccona-coffee-capsules-espresso-7';
(async () => {
const browser = await puppeteer.launch({
executablePath: '/usr/bin/chromium-browser',
headless: true
});
const page = await browser.newPage();
await page.setViewport({ width: 1920, height: 926 });
await page.goto(bookingUrl);
// get hotel details
let hotelData = await page.evaluate(() => {
let hotels = [];
// get the hotel elements
let elements = [
...document.querySelectorAll(' body *')
]
function createRecordFromElement(element) {
const text = element.textContent.trim()
var record = {}
const bBox = element.getBoundingClientRect()
if(text.length <= 30 && !(bBox.x == 0 && bBox.y == 0)) {
record['fontSize'] = parseInt(getComputedStyle(element)['fontSize']) }
record['y'] = bBox.y
record['x'] = bBox.x
record['text'] = text
return record
}
let records = elements.map(createRecordFromElement)
function canBePrice(record) {
if( record['y'] > 600 ||
record['fontSize'] == undefined ||
!record['text'].match(/(^(US ){0,1}(rs\.|Rs\.|RS\.|\$|₹|INR|USD|CAD|C\$){0,1}(\s){0,1}[\d,]+(\.\d+){0,1}(\s){0,1}(AED){0,1}$)/)
)
return false
else return true
}
let possiblePriceRecords = records.filter(canBePrice)
let priceRecordsSortedByFontSize = possiblePriceRecords.sort(function(a, b) {
if (a['fontSize'] == b['fontSize']) return a['y'] > b['y']
return a['fontSize'] < b['fontSize']
})
console.log(priceRecordsSortedByFontSize[0]['text']);
})();
Here's the links to the tutorials for info -
https://www.scrapehero.com/how-to-scrape-prices-from-any-ecommerce-website/
https://www.scrapehero.com/how-to-build-a-web-scraper-using-puppeteer-and-node-js/
Is there anything obviously wrong in Script C?
After reading through script C, it appears that you have not made any mistakes, rather the website you are attempting to access has decided to block scraper bots.
A quick host lookup on the domain shows that they are using security service section.io to block scraper bots on their website. See:
shop.coles.com.au is an alias for shop.coles.com.au.c.section.io.
shop.coles.com.au.c.section.io is an alias for shop.coles.com.au.x.section.io

Puppeteer doesn't display results from first cycle of loops

I have this piece of code which loops through one page with 3 frames and collect data from them an put them together.The problem is that the code is display incomplete results for first 2 loops , after that everything is fine, or randomly i got an error like Execution context was destroyed, most likely because of a navigation.
Please excuse my bad code but I have only 2 months on coding in javaScript
const puppeteer = require('puppeteer');
const elementsToClickSelector = 'body > form > font > select option';
const allLineIds = 'body > form > font > select > option';
const timpSosiri = 'body > b > font';
function run () {
return new Promise(async (resolve, reject) => {
try {
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
await page.goto('');
const frame = page.frames().find(f => f.name() === 'stanga');
const cframe = page.frames().find(f => f.name() === 'centru');
const dframe = page.frames().find(f => f.name() === 'dreapta');
// get all station name to be clicked
let elementsToClick = await frame.$$(elementsToClickSelector);
console.log(`Elements to click: ${elementsToClick.length}`);
if (page.frames().find(f => f.name().includes('stanga'))) {
console.info('Frame was in the DOM and in the frames list')
} else {
console.error('Frame was in the DOM but not in the frames list')
}
let test =[];
for (let i =0, length = elementsToClick.length; i< length; i++){
const item = await frame.evaluateHandle((i) =>{
return document.querySelectorAll('option')[i];
},i);
await frame.waitFor(1000);
const statieNume = await (await elementsToClick[i].getProperty('innerText')).jsonValue();
console.log(statieNume);
await item.click();
// get all linie ids to be clicked
let idLine = await cframe.$$(allLineIds);
for(let j = 0, length1 = idLine.length; j<length1; j++){
const lineItem = await cframe.evaluateHandle((j) =>{
return document.querySelectorAll('option')[j];
}, j);
const linie = await (await idLine[j].getProperty('innerText')).jsonValue();
console.log(linie);
lineItem.click();
cframe.waitForSelector('body > form > font > select option');
let timp = await dframe.$$(timpSosiri);
for( let k = 0, lengthk = timp.length; k < lengthk; k++){
const sosiri = await dframe.evaluateHandle((k) =>{
return document.querySelectorAll('b')[k];
},k);
dframe.waitForSelector('body > b > font');
sosiri.click();
const timpLinie = await (await timp[k].getProperty('innerHTML')).jsonValue();
console.log(timpLinie);
test.push({
statie:statieNume,
linie: linie,
timpi: timpLinie
});
}
}
}
browser.close();
return resolve(JSON.stringify(test));
} catch (e) {
return reject(e);
}
})
}
run().then(console.log).catch(console.error);
Output
Mures
A Saguna
[1] S5
[0] E3
[0] S5
A.Guttenbrun_1
[1] E8
Sosire1: 17:31<br> Sosire2: 17:38
[0] 21
Sosire1: 17:31<br> Sosire2: 17:38
A.Guttenbrun_2
[0] S10
Sosire1: 17:26<br> Sosire2: 17:55
[1] Tv5
Sosire1: 17:26<br> Sosire2: 17:55
First Mures doesn't display lines and time and A Saguna dosen't disply time.

Looping through a set of urls in Puppeteer

How would I scrape content from multiple urls using Puppeteer?
I've created a loop, but I'm only seeing the results for the first url.
I suspect it's something to do with where I declare the results variable, but I've had no luck trying, does anyone know how to do this?
const puppeteer = require('puppeteer');
function run() {
return new Promise(async (resolve, reject) => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
const urls = ["https://www.marksandspencer.com/high-neck-long-sleeve-blouse/p/p60260040?image=SD_01_T43_5168_HD_X_EC_90&color=LIGHTDENIM&prevPage=plp", "https://www.marksandspencer.com/pure-cotton-printed-short-sleeve-t-shirt/p/p60263529?image=SD_01_T41_8030Z_Z4_X_EC_90&color=WHITEMIX&prevPage=plp"];
for (let i = 0; i < urls.length; i++) {
const url = urls[i];
await page.goto(url);
let products = await page.evaluate(() => {
let product = document.querySelector('h1[itemprop=name]').innerText;
let results = [];
let items = document.querySelectorAll('[data-ttip-id=sizeGridTooltip] tbody tr td label');
items.forEach((element) => {
let size = element.getAttribute('for');
let stockLevel = "";
let nearest_td = element.closest('td');
if (nearest_td.classList.contains('low-stock')) {
stockLevel = "Low stock"
} else if (nearest_td.classList.contains('out-of-stock')) {
stockLevel = "Out of stock"
} else {
stockLevel = "In stock"
}
results.push({
product: product,
size: size,
stock: stockLevel
})
});
return results
})
browser.close();
return resolve(products);
}
} catch (e) {
return reject(e);
}
})
}
run().then(console.log).catch(console.error);
These lines are inside your for loop:
browser.close();
return resolve(products);
So as part of the first iteration, you close the browser and return the function. You should move this out of your for loop and store products inside an array like this:
const urls = /* ... */;
const productsList = [];
for (let i = 0; i < urls.length; i++) {
const url = urls[i];
await page.goto(url);
let products = await page.evaluate(/* ... */);
productsList.push(products);
}
browser.close();
return resolve(productsList); // resolve with an array containing the aggregated products
In case you are looking for a more elegant solution (for crawling the pages in parallel), you might want to have a look at the library puppeteer-cluster (disclaimer: I'm the author).

Categories

Resources