I am trying to web scrape with puppetter, currently the following code works but I think there can be optimizations made one of which I think is to only use one puppetter instance. But I don't really know how to do that. Can anyone help me?
This is the working but slow original:
const puppeteer = require('puppeteer');
async function scrapeProduct(url) {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(url);
const [el] = await page.$x('xpath of element');
const txt = await el.getProperty('textContent');
const rawTxt = await txt.jsonValue();
browser.close();
return rawTxt;
}
async function something() {
var some_varible = the length of some array;
process.setMaxListeners(some_varible);
for (var i = 0; i < some_varible; i++) {
var target = some_array[i].Name
var url = 'somewebsite' + target;
console.log(target + ": " + await scrapeProduct(url));
}
}
something();
This is my pathetic attempt at not using multiple instances of puppeteer: (Does not work)
const puppeteer = require('puppeteer');
async function scrapeProduct(url, page) {
await page.goto(url);
const [el] = await page.$x('xpath of element');
const txt = await el.getProperty('textContent');
const rawTxt = await txt.jsonValue();
return rawTxt;
}
async function something() {
var some_varible = the length of some array;
process.setMaxListeners(some_varible);
const browser = await puppeteer.launch();
const page = await browser.newPage();
for (var i = 0; i < some_varible; i++) {
var target = some_array[i].Name
var url = 'somewebsite' + target;
console.log(target + ": " + await scrapeProduct(url, page));
}
browser.close();
}
something();
Related
I am trying to improve my skills with async, await. So I am trying to make an app that collects the prices of different flights in different periods and then it decides in which period the plane ticket is cheapest for personal use.
const puppeteerExtra = require("puppeteer-extra");
const pluginStealth = require("puppeteer-extra-plugin-stealth");
puppeteerExtra.use(pluginStealth());
const PCR = require("puppeteer-chromium-resolver");
const howLongStart = 7;
const howLongEnd = 8;
const fromDate = new Date("2023-07-15");
const toDate = new Date("2023-08-31");
const airport = "PDL";
let tickets = [];
for (let i = 0; i < howLongEnd - howLongStart; i++) {
let howLong = howLongStart + i;
let tempFromDate = new Date("2023-07-15");
let tempFromD = new Date("2023-07-15");
let tempToDate = addDays(tempFromD, howLong);
async function ticketFirstMethod() {
const ticketFirst = await searchFlight(airport, tempFromDate, tempToDate);
tickets.push(ticketFirst);
}
ticketFirstMethod();
while (addDays(tempToDate, 1) <= toDate) {
tempFromDate = addDays(tempFromDate, 1);
tempToDate = addDays(tempToDate, 1);
async function ticketMethod() {
let ticket = await searchFlight(airport, tempFromDate, tempToDate);
tickets.push(ticket);
}
ticketMethod();
}
}
let lowestTicket;
let lowest = Number.POSITIVE_INFINITY;
let highest = Number.NEGATIVE_INFINITY;
let tmp;
for (let i = tickets.length - 1; i >= 0; i--) {
tmp = tickets[i][0];
if (tmp < lowest) {
lowest = tmp;
lowestTicket = tickets[i];
}
if (tmp > highest) highest = tmp;
}
console.log(lowestTicket);
function addDays(date, days) {
date.setDate(date.getDate() + days);
return date;
}
async function searchFlight(airport, tempFromDate, tempToDate) {
const stats = await PCR();
const browser = await puppeteerExtra.launch({
executablePath: stats.executablePath,
headless: false,
});
const page = await browser.newPage();
await page.goto(
"https://www.pelikan.cz/cs/letenky/T:1,P:4000E_0_0,CDF:PRGMUCFRATXLVIE,CDT:C" +
airport +
",R:1,DD:" +
tempFromDate.getFullYear +
"_" +
tempFromDate.getMonth +
"_" +
tempFromDate.getDay +
",DR:" +
tempToDate.getFullYear +
"_" +
tempToDate.getMonth +
"_" +
tempToDate.getDay +
"/",
{ waitUntil: "networkidle2", timeout: 0 }
);
const cheapestPrice = await page.waitForSelector(
"#flight-10000 > div:nth-child(1) > flights-flight:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > div:nth-child(1) > div:nth-child(3)"
);
const price = await page.evaluate((el) => el.textContent, cheapestPrice);
const priceOnly = price.replace(/\D/g, "");
const ticket = [priceOnly, page.url()];
await browser.close();
return ticket;
}
I have tried to put here an example of the code.
Can anyone please help me?
EXPECTED
Firstly I choose a period from when to when it should be searching for the ticket. Then I call searchFlight with this period of time to search for the ticket. The main thread will wait for the function to be processed and then the ticket is pushed to tickets.
BEHAVIOUR
The main thread will not wait and it continous so there is undefined ticket pushed to tickets.
I was trying to use the then method on the line where I am calling searchFlight function. In then method I put tickets.push(ticket). But that didn't work.
I was trying to search for fix but because I dont understand await, async that much I could not fix my code.
First off, remove the (async () => { .... }() wrapper. That's superfluous and getting in the way. The parent function is already async so the wrapper is not needed.
Then, searchFlight is async so you need to await its result where you are calling it. And, you'll need to make it's parent function async so you can use that await.
const ticket = await searchFlight(airport, tempFromDate, tempToDate);
Then, you have to actually return a result from inside of searchFlight. Right now, you have no return result at the top level of that function.
I would suggest you do that by not mixing await and .then(). Just use await like this:
async function searchFlight(airport, tempFromDate, tempToDate){
const stats = await PCR();
const browser = await puppeteerExtra.launch({
executablePath: stats.executablePath,
headless: false
});
const page = await browser.newPage()
await page.goto("...", {waitUntil: "networkidle2", timeout: 0})
const cheapestPrice = await page.waitForSelector('...');
const price = await page.evaluate(el => el.textContent, cheapestPrice);
const priceOnly = price.replace(/\D/g, "");
const ticket = [priceOnly, page.url()];
await browser.close()
return ticket;
}
And, please eliminate any use of var. One should only be using const or let in modern Javascript.
I am trying to get a list of items from a website with puppeteer-core.
Here is the code that should print 774 in the console but only returns 24.
const puppeteer = require('puppeteer-core');
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
async function test() {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.setJavaScriptEnabled(true)
await page.setDefaultNavigationTimeout(0);
await page.goto("https://www.liveauctioneers.com/catalog/256884_8-27-22-cameo-cut-glass-art-glass-and-more/?page=1&pageSize=1000", {waitUntil: "networkidle2"});
let pageContent = await page.content()
let dom = new JSDOM(pageContent)
let div = dom.window.document.querySelectorAll("div")
await div.forEach(element => {
if (element.id == "content") {
dom = new JSDOM(element.innerHTML)
div = dom.window.document.querySelectorAll("div")
div.forEach(element => {
if (element.className == "App__StyledApp-sc-eiwfgw-0 cHSpyq") {
dom = new JSDOM(element.innerHTML)
div = dom.window.document.querySelectorAll("div")
div.forEach(element => {
if (element.id == "main") {
dom = new JSDOM(element.innerHTML)
div = dom.window.document.querySelectorAll("div")
dom = new JSDOM(div[0].innerHTML)
div = dom.window.document.querySelectorAll("div")
div.forEach(element => {
if (element.className == "CatalogPageItems__StyledContainer-sc-y0p083-0 bLuQEb") {
dom = new JSDOM(element.innerHTML)
div = dom.window.document.querySelectorAll("div")
dom = new JSDOM(div[1].innerHTML)
div = dom.window.document.querySelectorAll("div")
dom = new JSDOM(div[0].innerHTML)
div = dom.window.document.querySelectorAll("div")
console.log(div.length)
}
});
}
});
}
});
}
})
await browser.close();
}
test()
For me this code returns 24 instead of 774. If I load the url "https://www.liveauctioneers.com/catalog/256884_8-27-22-cameo-cut-glass-art-glass-and-more/" into my browser the pageSize defaults to 24.
I tried to simplify the code to find the number of items. I looked only at how many children the grid has and ended up with this one simple line that gives me the correct result:
...
let pageContent = await page.content();
let dom = new JSDOM(pageContent);
let count = dom.window.document.querySelector(".Grid__ItemGridContainer-sc-1fy9g29-0").childElementCount;
console.log(count);
await browser.close();
...
How can I stop the loop after the fifth link?
This is my code
await page.goto('https://old.reddit.com',{"waitUntil" : "networkidle0"});
const a_elems = await page.$$('.thumbnail');
for (var i=0; i<a_elems.length; i++) {
const elem = a_elems[i];
const href = await page.evaluate(e => e.href, elem);
const newPage = await browser.newPage();
await newPage.goto(href,{"waitUntil" : "networkidle0"});
}
I tried nth-child but it doesn't work.
Do you have any ideas?
Thank you!
by simply add the condition in your for loop
for (var i=0; i<a_elems.length && i<5; i++) {
const elem = a_elems[i];
const href = await page.evaluate(e => e.href, elem);
const newPage = await browser.newPage();
await newPage.goto(href,{"waitUntil" : "networkidle0"});
}
I am trying to add and remove event handler for onSelectionChanged.
If I defined the functions like below with export async function
export async function EnableCellHighlight(): Promise<void> {
console.log(1);
let _this: any = this;
await Excel.run(async function(context){
let workbook = context.workbook;
workbook.onSelectionChanged.add(CellHighlightHandler);
await context.sync();
});
}
export async function CellHighlightHandler(event: Excel.SelectionChangedEventArgs): Promise<void>{
await Excel.run(async (context) => {
let workbook = context.workbook;
let sheets = workbook.worksheets;
sheets.load("items");
await context.sync();
// clear previous meekou conditional format
await clearMeekouFormat();
// add new conditional format
let selection = workbook.getSelectedRange();
selection.load("rowIndex,columnIndex");
await context.sync();
let rowConditionalFormat = selection.getEntireRow().conditionalFormats.add(Excel.ConditionalFormatType.custom);
rowConditionalFormat.custom.format.fill.color = "red";
rowConditionalFormat.custom.rule.formula = `=ROW()= + ${(selection.rowIndex + 1)} + N("${AppConsts.Meekou}")`;
let columnConditionalFormat = selection.getEntireColumn().conditionalFormats.add(Excel.ConditionalFormatType.custom);
columnConditionalFormat.custom.format.fill.color = "red";
columnConditionalFormat.custom.rule.formula = `=Column()= + ${(selection.columnIndex + 1)} + N("${AppConsts.Meekou}")`;
await context.sync();
});
}
export async function clearMeekouFormat(): Promise<void> {
await Excel.run(async function (context) {
let workbook = context.workbook;
let worksheets = workbook.worksheets;
worksheets.load("items/name");
await context.sync();
for (let i = 0; i < worksheets.items.length; i++) {
let worksheet = worksheets.items[i];
let conditionalFormats = worksheet.getRange().conditionalFormats;
conditionalFormats.load("items/type");
await context.sync();
let total = conditionalFormats.items.length;
for(let j = total-1; j >=0; j --){
let conditionalFormat = conditionalFormats.items[j];
if(conditionalFormat.type == Excel.ConditionalFormatType.custom){
conditionalFormat.load("custom/rule/formula");
await context.sync();
if (conditionalFormat.custom.rule.formula.includes(AppConsts.Meekou)) {
conditionalFormat.delete();
}
}
}
}
await context.sync();
});
}
It will works correctly.
But, if I move above code to a class like below:
export class ExcelService {
private cellHighlightHandler = this.CellHighlightHandler;
//#region excel events
async enableCellHighlight(): Promise<void> {
console.log(1);
let _this: any = this;
await Excel.run(async function(context){
let workbook = context.workbook;
workbook.onSelectionChanged.add(_this.cellHighlightHandler);
await context.sync();
});
}
async disableCellHightlight(): Promise<void> {
let _this: any = this;
await this.clearMeekouFormat();
await Excel.run(async function(context){
let workbook = context.workbook;
workbook.onSelectionChanged.remove(_this.cellHighlightHandler);
await context.sync();
});
}
async CellHighlightHandler(event: Excel.SelectionChangedEventArgs): Promise<void>{
let _this: ExcelService = this;
await Excel.run(async (context) => {
let workbook = context.workbook;
let sheets = workbook.worksheets;
sheets.load("items");
await context.sync().catch(e => console.log(e));
// clear previous meekou conditional format
await _this.clearMeekouFormat().catch(e => console.log(e));
// add new conditional format
let selection = workbook.getSelectedRange();
selection.load("rowIndex,columnIndex");
await context.sync();
let rowConditionalFormat = selection.getEntireRow().conditionalFormats.add(Excel.ConditionalFormatType.custom);
rowConditionalFormat.custom.format.fill.color = "red";
rowConditionalFormat.custom.rule.formula = `=ROW()= + ${(selection.rowIndex + 1)} + N("${AppConsts.Meekou}")`;
let columnConditionalFormat = selection.getEntireColumn().conditionalFormats.add(Excel.ConditionalFormatType.custom);
columnConditionalFormat.custom.format.fill.color = "red";
columnConditionalFormat.custom.rule.formula = `=Column()= + ${(selection.columnIndex + 1)} + N("${AppConsts.Meekou}")`;
await context.sync();
});
}
/**
* clear all meekou conditional format
*/
async clearMeekouFormat(): Promise<void> {
await Excel.run(async function (context) {
let workbook = context.workbook;
let worksheets = workbook.worksheets;
worksheets.load("items/name");
await context.sync();
for (let i = 0; i < worksheets.items.length; i++) {
let worksheet = worksheets.items[i];
let conditionalFormats = worksheet.getRange().conditionalFormats;
conditionalFormats.load("items/type");
await context.sync();
let total = conditionalFormats.items.length;
for(let j = total-1; j >=0; j --){
let conditionalFormat = conditionalFormats.items[j];
if(conditionalFormat.type == Excel.ConditionalFormatType.custom){
conditionalFormat.load("custom/rule/formula");
await context.sync();
if (conditionalFormat.custom.rule.formula.includes(AppConsts.Meekou)) {
conditionalFormat.delete();
}
}
}
}
await context.sync();
});
}
//#endregion
}
The code will run stop await _this.clearMeekouFormat().catch(e => console.log(e));, but it did not throw any error
Do you have tried to remove the "catch(...)", like what you called in the first sample code "await _this.clearMeekouFormat();", Function "clearMeekouFormat(...)" actually does not have return expected promise?
How would I scrape content from multiple urls using Puppeteer?
I've created a loop, but I'm only seeing the results for the first url.
I suspect it's something to do with where I declare the results variable, but I've had no luck trying, does anyone know how to do this?
const puppeteer = require('puppeteer');
function run() {
return new Promise(async (resolve, reject) => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
const urls = ["https://www.marksandspencer.com/high-neck-long-sleeve-blouse/p/p60260040?image=SD_01_T43_5168_HD_X_EC_90&color=LIGHTDENIM&prevPage=plp", "https://www.marksandspencer.com/pure-cotton-printed-short-sleeve-t-shirt/p/p60263529?image=SD_01_T41_8030Z_Z4_X_EC_90&color=WHITEMIX&prevPage=plp"];
for (let i = 0; i < urls.length; i++) {
const url = urls[i];
await page.goto(url);
let products = await page.evaluate(() => {
let product = document.querySelector('h1[itemprop=name]').innerText;
let results = [];
let items = document.querySelectorAll('[data-ttip-id=sizeGridTooltip] tbody tr td label');
items.forEach((element) => {
let size = element.getAttribute('for');
let stockLevel = "";
let nearest_td = element.closest('td');
if (nearest_td.classList.contains('low-stock')) {
stockLevel = "Low stock"
} else if (nearest_td.classList.contains('out-of-stock')) {
stockLevel = "Out of stock"
} else {
stockLevel = "In stock"
}
results.push({
product: product,
size: size,
stock: stockLevel
})
});
return results
})
browser.close();
return resolve(products);
}
} catch (e) {
return reject(e);
}
})
}
run().then(console.log).catch(console.error);
These lines are inside your for loop:
browser.close();
return resolve(products);
So as part of the first iteration, you close the browser and return the function. You should move this out of your for loop and store products inside an array like this:
const urls = /* ... */;
const productsList = [];
for (let i = 0; i < urls.length; i++) {
const url = urls[i];
await page.goto(url);
let products = await page.evaluate(/* ... */);
productsList.push(products);
}
browser.close();
return resolve(productsList); // resolve with an array containing the aggregated products
In case you are looking for a more elegant solution (for crawling the pages in parallel), you might want to have a look at the library puppeteer-cluster (disclaimer: I'm the author).