Puppeteer: setDefaultNavigationTimeout to 0 still times out

Puppeteer: setDefaultNavigationTimeout to 0 still times out - javascript

Every time I run this script is times-out.
Does setDefaultNavigationTimeout actually prevent time outs?
There is about 26 URLs I'm going through, and each page has a large amount of images. Can't imagine Puppeteer can't handle these pages just because of heavy images?
const url = 'test.com';
const jsonReturn = [];
async function runScraper() {
const browser = await puppeteer.launch(prodConfig);
const page = await browser.newPage({
timeout: 0
});
page.setDefaultNavigationTimeout(0);
await page.goto(url, { waitUntil: 'domcontentloaded' });
await page.waitForSelector('.featured-shows-featured-show');
let featuredShowsURLs = await page.$$eval('.featured-shows-featured-show > a', (links) => {
return links.map(link => {
return link.href;
});
});
featuredShowsURLs = _.uniq(featuredShowsURLs)
for (const featuredShowsURL of featuredShowsURLs) {
const page = await browser.newPage({
timeout: 0
});
try {
await page.goto(featuredShowsURL);
await page.waitForSelector('.show-title');
} catch (e) {
featuredShowsURL;
debugger;
}
const showTitle = await findAndReturnSelectorText('.show-title', page);
const showDates = await findAndReturnSelectorText('.show-dates', page);
const showLocation = await findAndReturnSelectorText('.show-location', page);
const showGallery = await findAndReturnSelectorText('.entity-link', page);
const showDetail = await findAndReturnSelectorText('.show-press-release', page);
const newItem = {
showTitle,
showDates,
showLocation,
showGallery,
showDetail,
};
const id = hash(newItem);
jsonReturn.push({
...newItem,
id
});
}
await browser.close();
}
runScraper();

Related

Puppeteer timeouts in headless mode

I need to go to the UPS site, type in a tracking number, then get the data about that tracking number. I turned headless to false then the timeouts disappeared, turn it to true, and it will timeout or wait forever. I tried a number of different methods (any I could find), to mess around with the promises, but nothing helped.
const puppeteer = require('puppeteer');
var fs = require("fs").promises;
var Page = {};
var Browser = {};
async function printPage(){
console.log("printPage()");
await fs.writeFile("pagecontent_afterClick.txt", await Page.content());
const count = await Page.$$eval('#st_App_PkgStsMonthNum', divs => divs.length);
console.log(count);
const deliveredOn = await Page.$eval('#st_App_PkgStsMonthNum', el => el.textContent);
const deliveredToAddress = await Page.$eval('#stApp_txtAddress', el => el.textContent);
const deliveredToCountry = await Page.$eval('#stApp_txtCountry', el => el.textContent);
const deliveredBy = await Page.$eval('#stApp_valReceivedBy', el => el.textContent);
console.log("browser close");
await Browser.close();
}
async function start(){
console.log("start");
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
Page = page
Browser = browser
const navigationPromise = page.waitForNavigation({waitUntil: 'load'})
await navigationPromise;
await fs.writeFile("pagecontent_B4.txt", await page.content());
await page.type("#stApp_trackingNumber", "1Z0F1R740320306827");
const searchValue = await page.$eval('#stApp_trackingNumber', el => el.value);
console.log(searchValue);
const count = await page.$$eval('button#stApp_btnTrack.ups-cta.ups-cta_primary', divs => divs.length);
console.log(count);
try {
console.log("end waiting");
await Promise.all([
page.click('#stApp_btnTrack'),
page.waitForNavigation({waitUntil: 'networkidle2'})
]);
printPage();
} catch (e) {
if (e instanceof puppeteer.errors.TimeoutError) {
console.log("timeout");
printPage();
}
}
}
start();

puppeteer waitForSelector when there are multiple elements with same class and more than one can be visible

What i'm trying to accomplish is to save complete document with all the comments expanded.
Unfortunately there are multiple selectors with same class and most of them are hidden and what i believe puppeteer does it takes first found selector and waits until it's visible which never happens.
Url: https://www.discoverpermaculture.com/permaculture-masterclass-video-1
const puppeteer = require('puppeteer');
const isElementVisible = async (page, cssSelector) => {
let visible = true;
await page
.waitForSelector(cssSelector, { visible: true, timeout: 2000 })
.catch(() => {
visible = false;
});
if(visible)console.log('Selector '+cssSelector+'visible!');
return visible;
};
async function run () {
let browser = await puppeteer.launch({headless: true, defaultViewport: null, args: ['--window-size=1920,10000',],});
const page = await browser.newPage();
const fs = require('fs');
await page.goto('https://www.discoverpermaculture.com/permaculture-masterclass-video-1');
await page.waitForTimeout(4000)
const elementHandle = await page.waitForSelector('iframe');
const frame = await elementHandle.contentFrame();
//loading all the comments (works because there's only one 'a.load-more__button' element a time)
const selectorForLoadMoreButton = 'a.load-more__button';
let loadMoreVisible = await isElementVisible(frame, selectorForLoadMoreButton);
while (loadMoreVisible) {
console.log('Loading comments');
await frame
.click(selectorForLoadMoreButton)
.catch(() => {});
loadMoreVisible = await isElementVisible(frame, selectorForLoadMoreButton);
}
//expanding comments doesn't work because each comment have a.see-more but some are hidden
const selectorForSeeMoreButton = 'a.see-more';
let seeMoreVisible = await isElementVisible(frame, selectorForSeeMoreButton);
while (seeMoreVisible) {
console.log('Expanding comments');
await frame
.click(selectorForSeeMoreButton)
.catch(() => {});
seeMoreVisible = await isElementVisible(frame, selectorForSeeMoreButton);
}
const cdp = await page.target().createCDPSession();
const { data } = await cdp.send('Page.captureSnapshot', { format: 'mhtml' });
fs.writeFileSync('out.mhtml', data);
browser.close();
}
run();
Any ideas how to handle this?

It turned out that each comment have 'a.see-more' element but if it's not a long one it also have '.hidden' class. Had to update this piece of code to search for all the 'a.see-more' elements but without '.hidden' class.
const selectorForSeeMoreButton = 'a.see-more:not(.hidden)';
let seeMoreVisible = await isElementVisible(frame, selectorForSeeMoreButton);
while (seeMoreVisible) {
console.log('Expanding comments');
await frame
.click(selectorForSeeMoreButton)
.catch(() => {});
seeMoreVisible = await isElementVisible(frame, electorForSeeMoreButton);
}

Unable to implement any logic to scrape content from innermost pages using puppeteer

I've created a script using puppeteer to scrape the links of different authors from a webpage traversing multiple pages triggering click on the next page button. The script appears to be working in the right way.
Although the content of this site is static, I intentionally used puppeteer within the following script only to learn as to how I can parse content from inner pages.
Given that I wish to go one layer deep to scrape description from such pages. How can I achieve that?
const puppeteer = require('puppeteer');
function run (pagesToScrape) {
return new Promise(async (resolve, reject) => {
try {
if (!pagesToScrape) {
pagesToScrape = 1;
}
const browser = await puppeteer.launch({headless:false});
const [page] = await browser.pages();
await page.goto("https://quotes.toscrape.com/");
let currentPage = 1;
let urls = [];
while (currentPage <= pagesToScrape) {
let newUrls = await page.evaluate(() => {
let results = [];
let items = document.querySelectorAll('[class="quote"]');
items.forEach((item) => {
results.push({
authorUrl: 'https://quotes.toscrape.com' + item.querySelector("small.author + a").getAttribute('href'),
title: item.querySelector("span.text").innerText
});
});
return results;
});
urls = urls.concat(newUrls);
if (currentPage < pagesToScrape) {
await Promise.all([
await page.waitForSelector('li.next > a'),
await page.click('li.next > a'),
await page.waitForSelector('[class="quote"]')
])
}
currentPage++;
}
browser.close();
return resolve(urls);
} catch (e) {
return reject(e);
}
})
}
run(3).then(console.log).catch(console.error);

I would go this way:
const puppeteer = require('puppeteer');
let browser;
(async function main() {
browser = await puppeteer.launch({ headless: false, defaultViewport: null });
const [pageQuotes] = await browser.pages();
const pageAbout = await browser.newPage();
await pageQuotes.bringToFront(); // Otherwise, click on the next page link does not work.
const pagesToScrape = 3;
await pageQuotes.goto('https://quotes.toscrape.com/');
let currentPage = 1;
const data = { quotes: {}, abouts: {} };
const visitedAbouts = new Set();
while (currentPage <= pagesToScrape) {
await pageQuotes.waitForSelector('.quote');
const { quotes, aboutURLs } = await pageQuotes.evaluate(() => ({
quotes: Array.from(
document.querySelectorAll('.quote'),
quote => [quote.querySelector('small.author').innerText, quote.innerText],
),
aboutURLs: Array.from(
document.querySelectorAll('.quote small.author + a[href]'),
quote => quote.href,
),
}));
for (const [author, quote] of quotes) {
if (data.quotes[author] === undefined) data.quotes[author] = [];
data.quotes[author].push(quote);
}
for (const aboutURL of aboutURLs) {
if (!visitedAbouts.has(aboutURL)) {
visitedAbouts.add(aboutURL);
await pageAbout.goto(aboutURL);
await pageAbout.waitForSelector('div.author-details');
const { title, about } = await pageAbout.evaluate(() => ({
title: document.querySelector('div.author-details h3.author-title').innerText,
about: document.querySelector('div.author-details').innerText,
}));
data.abouts[title] = about;
}
}
if (currentPage < pagesToScrape) {
const nextLink = await pageQuotes.waitForSelector('li.next > a');
await Promise.all([
nextLink.click(),
pageQuotes.waitForNavigation(),
]);
}
currentPage++;
}
console.log(JSON.stringify(data, null, ' '));
})().catch(console.error).finally(async () => { if (browser) await browser.close(); });

Clicking on internal javascript links and returning urls using puppeteer

My goal is to click on each link (called a footnote) on this page and then return the footnote link, text, and then all of the URLs that appear in the sidebar. I'm stuck on accessing the sidebar values when they appear and after a few weeks of failure, I'm looking for some pointers on what I'm doing wrong (very new to both javascript and puppeteer).
const puppeteer = require('puppeteer');
const url = 'https://www.churchofjesuschrist.org/study/scriptures/bofm/1-ne/11?lang=eng';
const selector = '.study-note-ref';
(async function () {
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
await page.goto(url);
const footnotes = await page.$$eval(selector, nodes => {
return nodes.map(node => {
const ref = node.href.replace('https://www.churchofjesuschrist.org', '');
const txt = node.text;
return {
ref,
txt
};
});
});
for (const a of footnotes) {
page.click(a.ref);
const links = await page.$$eval('.scripture-ref', nodes => {
return nodes.map(node => {
return node.href
})
})
}
console.log(footnotes);
console.log(links);
// const fs = require('fs');
// fs.writeFile('./footnotes.json', JSON.stringify(footnotes), err => err ? console.log(err) : null);
await browser.close();
})();

Maybe something like this:
const puppeteer = require('puppeteer');
const url = 'https://www.churchofjesuschrist.org/study/scriptures/bofm/1-ne/11?lang=eng';
const selector = '.study-note-ref';
(async function main() {
const browser = await puppeteer.launch({ headless: true });
const [page] = await browser.pages();
await page.goto(url);
const data = {};
for (const footnote of await page.$$(selector)) {
const [href, text] = await page.evaluate(
(a) => {
a.click();
return [a.getAttribute('href').replace('/#note', ''), a.innerText.slice(1)];
},
footnote
);
data[href] = { text };
const header = await page.waitForXPath(`//aside/div/header/span[text()="${href} ${text}"]`);
data[href].links = await page.evaluate(
(span) => {
const aside = span.closest('aside');
return [...aside.querySelectorAll('a[href]')].map(
a => ({ [a.innerText]: a.href })
);
},
header
);
console.log(`Done: ${href} ${text}`);
}
console.log(JSON.stringify(data, null, 2));
await browser.close();
})();
Part of the output:
{
"1a": {
"text": "pondering",
"links": [
{
"D&C 76:19": "https://www.churchofjesuschrist.org/study/scriptures/dc-testament/dc/76.19?lang=eng#p19"
},
{
"TG Meditation": "https://www.churchofjesuschrist.org/study/scriptures/tg/meditation?lang=eng"
},
{
"Doctrine and Covenants 76:19": "https://www.churchofjesuschrist.org/study/scriptures/dc-testament/dc/76.19?lang=eng#p19#19"
},
{
"Meditation, Meditate": "https://www.churchofjesuschrist.org/study/scriptures/tg/meditation?lang=eng"
}
]
},
}

Use array of keywords and loop through script in Playwright

So, I am trying to scrape a couple of searchengines with a couple of search phrases with Playwright.
Running the script with one query is working.
Working:
const { chromium } = require('playwright');
(async () => {
const browser = await chromium.launch({ headless: false, slowMo: 250 });
const context = await browser.newContext()
const page = await context.newPage();
const keyWord = ('Arsenal');
await page.goto('https://duckduckgo.com/');
await page.fill('//input[#name="q"]',keyWord);
await page.keyboard.press('Enter');
const getOne = (' (//h2[#class="result__title"])[9] ');
await page.waitForSelector(getOne)
const pushOne = await page.$(getOne);
const One = await pushOne.evaluate(element => element.innerText);
console.log(One);
await page.goto('https://yandex.com/');
await page.fill('//input[#aria-label="Request"]', keyWord);
await page.keyboard.press('Enter');
const getTwo = (' //li[#data-first-snippet] //div[#class="organic__url-text"] ');
await page.waitForSelector(getTwo)
const pushTwo = await page.$(getTwo);
const Two = await pushTwo.evaluate(element => element.innerText);
console.log(Two);
await browser.close()
})()
But when I use an array with phrases (keyWordlist) I fail to get the script running.
Have searched around for using Array with 'For' and 'Foreach' loops, but haven't been able to fix it.
I want to run the different keywords through the different searchengines and list the results.
For 3 keywords in two searchengines that would get 6 results.
const { chromium } = require('playwright');
(async () => {
const browser = await chromium.launch({ headless: false, slowMo: 250 });
const context = await browser.newContext()
const page = await context.newPage();
let kewWordlist = ['Arsenal', 'Liverpool', 'Ajax']
for (var i=0; i<=kewWordlist.length; i++) {
// for (const i in kewWordlist){
async () => {
const keyWord = kewWordlist[i];
await page.goto('https://duckduckgo.com/');
await page.fill('//input[#name="q"]',keyWord);
// await page.fill('//input[#name="q"]',[i]);
// await page.fill('//input[#name="q"]',`${keyWord}`);
await page.keyboard.press('Enter');
const getOne = (' (//h2[#class="result__title"])[9] ');
await page.waitForSelector(getOne)
const pushOne = await page.$(getOne);
const One = await pushOne.evaluate(element => element.innerText);
console.log(One);
// await page.goto('https://yandex.com/');
// await page.fill('//input[#aria-label="Request"]', keyWord);
// await page.keyboard.press('Enter');
// const getTwo = (' //li[#data-first-snippet] //div[#class="organic__url-text"] ');
// await page.waitForSelector(getTwo)
// const pushTwo = await page.$(getTwo);
// const Two = await pushTwo.evaluate(element => element.innerText);
// console.log(Two);
}}
await browser.close()
})()
If anyone has some pointers on how to solve this, much obliged.

maybe the result selectors needs some tweaking but I think this is what you were looking for:
test.only('search search engines', async({page, context}) => {
const search = [
{
name: 'yandex',
url: 'https://yandex.com/',
elementFill: '//input[#aria-label="Request"]',
elementResult: '//li[#data-first-snippet] //div[#class="organic__url-text"]'
},
{
name: 'google',
url: 'https://www.google.nl',
elementFill: '//input[#name="q"]',
elementResult: '(//h2[#class="result__title"])[9]'
},
{
name: '',
url: 'https://duckduckgo.com/',
elementFill: '//input[#name="q"]',
elementResult: '(//h2[#class="result__title"])[9]'
}
]
const kewWordlist = ['Arsenal', 'Liverpool', 'Ajax']
for (let i = 0; i < search.length; i++) {
const searchName = search[i].name
const searchResult = search[i].elementResult
const searchFill = search[i].elementFill
const searchPage = await context.newPage()
await searchPage.waitForLoadState()
await searchPage.goto(`${search[i].url}`)
for (let i = 0; i < kewWordlist.length; i++) {
await searchPage.fill(searchFill,kewWordlist[i])
await searchPage.keyboard.press('Enter')
await searchPage.waitForSelector(searchResult)
const result = await page.$(searchResult)
console.log(`${searchName}: ${result} `)
}
}
})

The reason your loop isn't working is that you have an async function inside of it that you never call. There are a few ways you could go about this:
You could take your first version, have it accept a word to search, and run that over each element of the array:
const searchOneKeyword = async (keyWord) => {
const browser = await chromium.launch({ headless: false, slowMo: 250 });
const context = await browser.newContext()
const page = await context.newPage();
// rest of code
}
const kewWordList = ['Arsenal', 'Liverpool', 'Ajax']
keyWordList.forEach((k) => {
searchOneKeyword(k)
})
Or if you'd like to keep the same browser instance, you can do it in a loop in the function:
const search = async (words) => {
const browser = await chromium.launch({ headless: false, slowMo: 250 });
const context = await browser.newContext()
const page = await context.newPage();
for (const keyWord of words) {
await page.goto('https://duckduckgo.com/');
await page.fill('//input[#name="q"]',keyWord);
await page.keyboard.press('Enter');
const getOne = (' (//h2[#class="result__title"])[9] ');
await page.waitForSelector(getOne)
const pushOne = await page.$(getOne);
const One = await pushOne.evaluate(element => element.innerText);
console.log(One);
// etc.
}
await browser.close()
}
search(keyWordList)
In both of those cases, you're logging, but never returning anything, so if you need that data in another function afterwards, you'd have to change that. Example:
const search = async (words) => {
const browser = await chromium.launch({ headless: false, slowMo: 250 });
const context = await browser.newContext()
const page = await context.newPage();
const results = await Promise.all(words.map((keyWord) => {
await page.goto('https://duckduckgo.com/');
await page.fill('//input[#name="q"]',keyWord);
await page.keyboard.press('Enter');
// etc.
return [ One, Two ]
}))
await browser.close()
return results
}
search(keyWordList).then((results) => { console.log(results.flat()) })

I have spent a couple of hours trying to get the script working based on your suggestions. No result unfortunately. I get errors like 'await is only valid in async function' and 'Unreachable code detected'. Searched for other examples, for some inspiration, but none found. If you or someone else has a suggestion, please share! This is code I have now:
const { chromium } = require('playwright');
let keyWordList = ['Arsenal', 'Liverpool', 'Ajax']
const search = async function words() {
const browser = await chromium.launch({ headless: false, slowMo: 250 });
const context = await browser.newContext()
const page = await context.newPage();
}
const results = await Promise.all(words.map(keyWord))
//DUCKDUCKGO
await page.goto('https://duckduckgo.com/');
await page.fill('//input[#name="q"]',keyWord);
await page.keyboard.press('Enter');
const getOne = (' (//h2[#class="result__title"])[9] ');
await page.waitForSelector(getOne)
const pushOne = await page.$(getOne);
const One = await pushOne.evaluate(element => element.innerText);
//YANDEX
await page.goto('https://yandex.com/');
await page.fill('//input[#aria-label="Request"]', keyWord);
await page.keyboard.press('Enter');
const getTwo = (' //li[#data-first-snippet] //div[#class="organic__url-text"] ');
await page.waitForSelector(getTwo)
const pushTwo = await page.$(getTwo);
const Two = await pushTwo.evaluate(element => element.innerText);
console.log(Two);
return [ One , Two ]
return results
search(keyWordList).then((results) => { console.log(results.flat())
await browser.close();
})

Develop Reference

JavaScript is the programming language of the Web.

Puppeteer: setDefaultNavigationTimeout to 0 still times out - javascript

Related

Puppeteer timeouts in headless mode

puppeteer waitForSelector when there are multiple elements with same class and more than one can be visible

Unable to implement any logic to scrape content from innermost pages using puppeteer

Clicking on internal javascript links and returning urls using puppeteer

Use array of keywords and loop through script in Playwright

Categories

Resources