I am scrolling to the bottom of a YouTube page and scrolling part works fine but the problem is once I reach the bottom of the site when I try to close the browser I get error ProtocolError: Protocol error (Runtime.callFunctionOn): Target closed.. Why is this happening and how can I fix it. Thanks in advance.
let clientHeightArr = []
let clientHeightArrTracker = []
const scrapeInfiniteScrollItems = async(browser, page) => {
var infiniteScrollTrackerInterval = setInterval(async() => {
clientHeightArrTracker.push(clientHeightArr.length)
if (clientHeightArrTracker.some((e, i, arr) => arr.indexOf(e) !== i) == true) {
clearInterval(infiniteScrollTrackerInterval)
console.log('Bottom is reached')
//causes error "ProtocolError: Protocol error (Runtime.callFunctionOn): Target closed."
await browser.close()
}
}, 2000)
while (true) {
const previousHeight = await page.evaluate(
"document.querySelector('ytd-app').scrollHeight"
);
await page.evaluate(() => {
const youtubeScrollHeight =
document.querySelector("ytd-app").scrollHeight;
window.scrollTo(0, youtubeScrollHeight);
});
await page.waitForFunction(
`document.querySelector('ytd-app').scrollHeight > ${previousHeight}`, {
timeout: 0
},
);
const clientHeight = await page.$$eval("ytd-app", el => el.map(x => x.clientHeight));
clientHeightArr.push(clientHeight[0])
await page.waitForTimeout(1000)
}
};
(async() => {
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
await page.goto('https://www.youtube.com/c/mkbhd/videos', {
waitUntil: 'networkidle2',
});
await scrapeInfiniteScrollItems(browser, page)
})();
Related
I need to go to the UPS site, type in a tracking number, then get the data about that tracking number. I turned headless to false then the timeouts disappeared, turn it to true, and it will timeout or wait forever. I tried a number of different methods (any I could find), to mess around with the promises, but nothing helped.
const puppeteer = require('puppeteer');
var fs = require("fs").promises;
var Page = {};
var Browser = {};
async function printPage(){
console.log("printPage()");
await fs.writeFile("pagecontent_afterClick.txt", await Page.content());
const count = await Page.$$eval('#st_App_PkgStsMonthNum', divs => divs.length);
console.log(count);
const deliveredOn = await Page.$eval('#st_App_PkgStsMonthNum', el => el.textContent);
const deliveredToAddress = await Page.$eval('#stApp_txtAddress', el => el.textContent);
const deliveredToCountry = await Page.$eval('#stApp_txtCountry', el => el.textContent);
const deliveredBy = await Page.$eval('#stApp_valReceivedBy', el => el.textContent);
console.log("browser close");
await Browser.close();
}
async function start(){
console.log("start");
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
Page = page
Browser = browser
const navigationPromise = page.waitForNavigation({waitUntil: 'load'})
await navigationPromise;
await fs.writeFile("pagecontent_B4.txt", await page.content());
await page.type("#stApp_trackingNumber", "1Z0F1R740320306827");
const searchValue = await page.$eval('#stApp_trackingNumber', el => el.value);
console.log(searchValue);
const count = await page.$$eval('button#stApp_btnTrack.ups-cta.ups-cta_primary', divs => divs.length);
console.log(count);
try {
console.log("end waiting");
await Promise.all([
page.click('#stApp_btnTrack'),
page.waitForNavigation({waitUntil: 'networkidle2'})
]);
printPage();
} catch (e) {
if (e instanceof puppeteer.errors.TimeoutError) {
console.log("timeout");
printPage();
}
}
}
start();
What i'm trying to accomplish is to save complete document with all the comments expanded.
Unfortunately there are multiple selectors with same class and most of them are hidden and what i believe puppeteer does it takes first found selector and waits until it's visible which never happens.
Url: https://www.discoverpermaculture.com/permaculture-masterclass-video-1
const puppeteer = require('puppeteer');
const isElementVisible = async (page, cssSelector) => {
let visible = true;
await page
.waitForSelector(cssSelector, { visible: true, timeout: 2000 })
.catch(() => {
visible = false;
});
if(visible)console.log('Selector '+cssSelector+'visible!');
return visible;
};
async function run () {
let browser = await puppeteer.launch({headless: true, defaultViewport: null, args: ['--window-size=1920,10000',],});
const page = await browser.newPage();
const fs = require('fs');
await page.goto('https://www.discoverpermaculture.com/permaculture-masterclass-video-1');
await page.waitForTimeout(4000)
const elementHandle = await page.waitForSelector('iframe');
const frame = await elementHandle.contentFrame();
//loading all the comments (works because there's only one 'a.load-more__button' element a time)
const selectorForLoadMoreButton = 'a.load-more__button';
let loadMoreVisible = await isElementVisible(frame, selectorForLoadMoreButton);
while (loadMoreVisible) {
console.log('Loading comments');
await frame
.click(selectorForLoadMoreButton)
.catch(() => {});
loadMoreVisible = await isElementVisible(frame, selectorForLoadMoreButton);
}
//expanding comments doesn't work because each comment have a.see-more but some are hidden
const selectorForSeeMoreButton = 'a.see-more';
let seeMoreVisible = await isElementVisible(frame, selectorForSeeMoreButton);
while (seeMoreVisible) {
console.log('Expanding comments');
await frame
.click(selectorForSeeMoreButton)
.catch(() => {});
seeMoreVisible = await isElementVisible(frame, selectorForSeeMoreButton);
}
const cdp = await page.target().createCDPSession();
const { data } = await cdp.send('Page.captureSnapshot', { format: 'mhtml' });
fs.writeFileSync('out.mhtml', data);
browser.close();
}
run();
Any ideas how to handle this?
It turned out that each comment have 'a.see-more' element but if it's not a long one it also have '.hidden' class. Had to update this piece of code to search for all the 'a.see-more' elements but without '.hidden' class.
const selectorForSeeMoreButton = 'a.see-more:not(.hidden)';
let seeMoreVisible = await isElementVisible(frame, selectorForSeeMoreButton);
while (seeMoreVisible) {
console.log('Expanding comments');
await frame
.click(selectorForSeeMoreButton)
.catch(() => {});
seeMoreVisible = await isElementVisible(frame, electorForSeeMoreButton);
}
I've created a script using puppeteer to scrape the links of different authors from a webpage traversing multiple pages triggering click on the next page button. The script appears to be working in the right way.
Although the content of this site is static, I intentionally used puppeteer within the following script only to learn as to how I can parse content from inner pages.
Given that I wish to go one layer deep to scrape description from such pages. How can I achieve that?
const puppeteer = require('puppeteer');
function run (pagesToScrape) {
return new Promise(async (resolve, reject) => {
try {
if (!pagesToScrape) {
pagesToScrape = 1;
}
const browser = await puppeteer.launch({headless:false});
const [page] = await browser.pages();
await page.goto("https://quotes.toscrape.com/");
let currentPage = 1;
let urls = [];
while (currentPage <= pagesToScrape) {
let newUrls = await page.evaluate(() => {
let results = [];
let items = document.querySelectorAll('[class="quote"]');
items.forEach((item) => {
results.push({
authorUrl: 'https://quotes.toscrape.com' + item.querySelector("small.author + a").getAttribute('href'),
title: item.querySelector("span.text").innerText
});
});
return results;
});
urls = urls.concat(newUrls);
if (currentPage < pagesToScrape) {
await Promise.all([
await page.waitForSelector('li.next > a'),
await page.click('li.next > a'),
await page.waitForSelector('[class="quote"]')
])
}
currentPage++;
}
browser.close();
return resolve(urls);
} catch (e) {
return reject(e);
}
})
}
run(3).then(console.log).catch(console.error);
I would go this way:
const puppeteer = require('puppeteer');
let browser;
(async function main() {
browser = await puppeteer.launch({ headless: false, defaultViewport: null });
const [pageQuotes] = await browser.pages();
const pageAbout = await browser.newPage();
await pageQuotes.bringToFront(); // Otherwise, click on the next page link does not work.
const pagesToScrape = 3;
await pageQuotes.goto('https://quotes.toscrape.com/');
let currentPage = 1;
const data = { quotes: {}, abouts: {} };
const visitedAbouts = new Set();
while (currentPage <= pagesToScrape) {
await pageQuotes.waitForSelector('.quote');
const { quotes, aboutURLs } = await pageQuotes.evaluate(() => ({
quotes: Array.from(
document.querySelectorAll('.quote'),
quote => [quote.querySelector('small.author').innerText, quote.innerText],
),
aboutURLs: Array.from(
document.querySelectorAll('.quote small.author + a[href]'),
quote => quote.href,
),
}));
for (const [author, quote] of quotes) {
if (data.quotes[author] === undefined) data.quotes[author] = [];
data.quotes[author].push(quote);
}
for (const aboutURL of aboutURLs) {
if (!visitedAbouts.has(aboutURL)) {
visitedAbouts.add(aboutURL);
await pageAbout.goto(aboutURL);
await pageAbout.waitForSelector('div.author-details');
const { title, about } = await pageAbout.evaluate(() => ({
title: document.querySelector('div.author-details h3.author-title').innerText,
about: document.querySelector('div.author-details').innerText,
}));
data.abouts[title] = about;
}
}
if (currentPage < pagesToScrape) {
const nextLink = await pageQuotes.waitForSelector('li.next > a');
await Promise.all([
nextLink.click(),
pageQuotes.waitForNavigation(),
]);
}
currentPage++;
}
console.log(JSON.stringify(data, null, ' '));
})().catch(console.error).finally(async () => { if (browser) await browser.close(); });
My goal is to click on each link (called a footnote) on this page and then return the footnote link, text, and then all of the URLs that appear in the sidebar. I'm stuck on accessing the sidebar values when they appear and after a few weeks of failure, I'm looking for some pointers on what I'm doing wrong (very new to both javascript and puppeteer).
const puppeteer = require('puppeteer');
const url = 'https://www.churchofjesuschrist.org/study/scriptures/bofm/1-ne/11?lang=eng';
const selector = '.study-note-ref';
(async function () {
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
await page.goto(url);
const footnotes = await page.$$eval(selector, nodes => {
return nodes.map(node => {
const ref = node.href.replace('https://www.churchofjesuschrist.org', '');
const txt = node.text;
return {
ref,
txt
};
});
});
for (const a of footnotes) {
page.click(a.ref);
const links = await page.$$eval('.scripture-ref', nodes => {
return nodes.map(node => {
return node.href
})
})
}
console.log(footnotes);
console.log(links);
// const fs = require('fs');
// fs.writeFile('./footnotes.json', JSON.stringify(footnotes), err => err ? console.log(err) : null);
await browser.close();
})();
Maybe something like this:
const puppeteer = require('puppeteer');
const url = 'https://www.churchofjesuschrist.org/study/scriptures/bofm/1-ne/11?lang=eng';
const selector = '.study-note-ref';
(async function main() {
const browser = await puppeteer.launch({ headless: true });
const [page] = await browser.pages();
await page.goto(url);
const data = {};
for (const footnote of await page.$$(selector)) {
const [href, text] = await page.evaluate(
(a) => {
a.click();
return [a.getAttribute('href').replace('/#note', ''), a.innerText.slice(1)];
},
footnote
);
data[href] = { text };
const header = await page.waitForXPath(`//aside/div/header/span[text()="${href} ${text}"]`);
data[href].links = await page.evaluate(
(span) => {
const aside = span.closest('aside');
return [...aside.querySelectorAll('a[href]')].map(
a => ({ [a.innerText]: a.href })
);
},
header
);
console.log(`Done: ${href} ${text}`);
}
console.log(JSON.stringify(data, null, 2));
await browser.close();
})();
Part of the output:
{
"1a": {
"text": "pondering",
"links": [
{
"D&C 76:19": "https://www.churchofjesuschrist.org/study/scriptures/dc-testament/dc/76.19?lang=eng#p19"
},
{
"TGĀ Meditation": "https://www.churchofjesuschrist.org/study/scriptures/tg/meditation?lang=eng"
},
{
"Doctrine and Covenants 76:19": "https://www.churchofjesuschrist.org/study/scriptures/dc-testament/dc/76.19?lang=eng#p19#19"
},
{
"Meditation, Meditate": "https://www.churchofjesuschrist.org/study/scriptures/tg/meditation?lang=eng"
}
]
},
}
Every time I run this script is times-out.
Does setDefaultNavigationTimeout actually prevent time outs?
There is about 26 URLs I'm going through, and each page has a large amount of images. Can't imagine Puppeteer can't handle these pages just because of heavy images?
const url = 'test.com';
const jsonReturn = [];
async function runScraper() {
const browser = await puppeteer.launch(prodConfig);
const page = await browser.newPage({
timeout: 0
});
page.setDefaultNavigationTimeout(0);
await page.goto(url, { waitUntil: 'domcontentloaded' });
await page.waitForSelector('.featured-shows-featured-show');
let featuredShowsURLs = await page.$$eval('.featured-shows-featured-show > a', (links) => {
return links.map(link => {
return link.href;
});
});
featuredShowsURLs = _.uniq(featuredShowsURLs)
for (const featuredShowsURL of featuredShowsURLs) {
const page = await browser.newPage({
timeout: 0
});
try {
await page.goto(featuredShowsURL);
await page.waitForSelector('.show-title');
} catch (e) {
featuredShowsURL;
debugger;
}
const showTitle = await findAndReturnSelectorText('.show-title', page);
const showDates = await findAndReturnSelectorText('.show-dates', page);
const showLocation = await findAndReturnSelectorText('.show-location', page);
const showGallery = await findAndReturnSelectorText('.entity-link', page);
const showDetail = await findAndReturnSelectorText('.show-press-release', page);
const newItem = {
showTitle,
showDates,
showLocation,
showGallery,
showDetail,
};
const id = hash(newItem);
jsonReturn.push({
...newItem,
id
});
}
await browser.close();
}
runScraper();