I have a two step program :
Get a list of href from a page
Loop infinitely on each page of this list, get an element and display it in console
I try to use function with Puppeteer-Cluter but it doesn't work properly.
const { Cluster } = require('puppeteer-cluster');
const fs = require("fs");
const { addExtra } = require("puppeteer-extra");
const vanillaPuppeteer = require("puppeteer");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
var moment = require('moment');
var regexTemps = /(\d+)\s(\w+)$/;
const urlsToCheck = [];
TZ = 'Europe/Paris'
process.env.TZ = 'Europe/Paris'
(async () => {
const puppeteer = addExtra(vanillaPuppeteer);
puppeteer.use(StealthPlugin());
const cluster = await Cluster.launch({
puppeteer,
puppeteerOptions: {
headless: false,
args: ['--no-sandbox'],
},
maxConcurrency: 10,
concurrency: Cluster.CONCURRENCY_CONTEXT,
monitor: false,
skipDuplicateUrls: true,
timeout:30000,
retryLimit:10,
})
cluster.on('taskerror', (err, data, willRetry) => {
if (willRetry) {
console.warn(`Encountered an error while crawling ${data}. ${err.message}\nThis job will be retried`);
} else {
console.error(`Failed to crawl ${data}: ${err.message}`);
}
});
const getElementOnPage = async ({ page, data: url }) => {
console.log('=> Go to URL : ',url);
await page.goto(url);
while (true) {
console.log('=> Reload URL : ',page.url())
await page.reload();
await page.waitForTimeout(1000);
let allNews = await page.$$("article.news"); // [] if nothing
let firstNews = allNews[0];
await page.waitForTimeout(1000);
let info = await firstNews.$eval('.info span', s => s.textContent.trim());
console.log(new Date(), 'info : ',info);
}
};
const getListOfPagesToExplore = async ({ page, data: url }) => {
console.log(new Date(), 'Get the list of deal pages to explore');
await page.goto(url, {waitUntil: 'domcontentloaded'});
await page.waitForTimeout(500);
const hrefsToVisit = await page.$x('//a');
let idxTab=0;
for( let hrefToVisit of hrefsToVisit ) {
var link = await page.evaluate(el => el.getAttribute("href"), hrefToVisit);
console.log(new Date(), 'adding link to list : ', link);
idxTab++;
urlsToCheck.push(link);
}
};
cluster.queue('https://www.apagewithsomelinks.com', getListOfPagesToExplore);
await cluster.idle();
await cluster.close();
console.log(urlsToCheck);
//Display correctly several link in an array
for( let url of urlsToCheck ) {
console.log('Push in queue : ',url);
cluster.queue(url, getElementOnPage);
}
await cluster.idle();
await cluster.close();
})();
When I launch it, it retrieve the links to scrap.
It display "Push in queue : ..." for each URL.
But then, the method getElementOnPage is launched only for the first URL, and runs infinitely like asked. But why the other URL are note launched ??
Before that, I don't use function, I used one unique task with :
await cluster.task(async ({ page, data: url }) => {
But how combine function and this thing ?
Related
I am trying to scrape the YouTube headline and link from a channel using Puppeteer. While executing the program, I am facing the Evaluation Error as following:
Error: Evaluation failed: TypeError: Cannot read properties of null (reading 'innerText')
at pptr://__puppeteer_evaluation_script__:10:65
at ExecutionContext._ExecutionContext_evaluate (E:\somoy\node_modules\puppeteer-core\lib\cjs\puppeteer\common\ExecutionContext.js:229:15)
at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
at async ExecutionContext.evaluate (E:\somoy\node_modules\puppeteer-core\lib\cjs\puppeteer\common\ExecutionContext.js:107:16)
at async initiate (E:\somoy\appNew.js:45:20)
at async E:\somoy\appNew.js:155:9
async function initiate() {
const browser = await puppeteer.launch({ headless: false, defaultViewport: null, userDataDir: './userdata', executablePath: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe' });
const page = await browser.newPage();
page.setDefaultNavigationTimeout(0)
await page.goto('https://www.youtube.com/#ProthomAlo/videos', { waitUntil: 'networkidle2' });
await delay(5000);
if (!fs.existsSync('storeLink.txt')) {
//create new file if not exist
fs.writeFileSync("storeLink.txt", '');
}
articleLinkarr = (fs.readFileSync('storeLink.txt', { encoding: 'utf8' })).split('\n')
let articles = await page.evaluate(async (articleLinkarr) => {
//console.log('Hello1')
let arrObj = [];
articles = document.querySelectorAll('.style-scope.ytd-rich-grid-media');
for (let i = 0; i < articles.length; i++) {
//for (let i = 0; i < 20; i++) {
//const category = document.querySelector('.print-entity-section-wrapper.F93gk').innerText
//const headline = articles[i].querySelector('div > h3').innerText
const headline = articles[i].querySelector('h3').innerText
const link = 'https://www.youtube.com' + articles[i].querySelector('a').getAttribute('href')
// if (!(link.includes('video') || link.includes('fun') || link.includes('photo'))) {
// if (!articleLinkarr.includes(link)) {
arrObj.push({ articleHeadline: headline, articleLink: link })
// }
// }
};
return arrObj;
}, articleLinkarr)
}
Puppeteer doesn't seem necessary here if you just want the initial set of titles. There's a JSON blob in the static HTML which has the title list, so you can make a simple HTTP request to the URL and pull the blob out with an HTML parser, then walk the object structure.
const cheerio = require("cheerio"); // 1.0.0-rc.12
const url = "Your URL";
fetch(url) // Node 18 or install node-fetch
.then(res => {
if (!res.ok) {
throw Error(res.statusText);
}
return res.text();
})
.then(html => {
const $ = cheerio.load(html);
const script = $(
[...$("script")].find(e =>
$(e).text().startsWith("var ytInitialData = {")
)
)
.text()
.slice(20, -1);
const data = JSON.parse(script);
const titles = [];
const {contents} =
data.contents.twoColumnBrowseResultsRenderer.tabs[1].tabRenderer
.content.richGridRenderer;
for (const c of contents) {
if (!c.richItemRenderer) {
continue;
}
const title =
c.richItemRenderer.content.videoRenderer.title.runs[0].text;
const url =
c.richItemRenderer.content.videoRenderer.navigationEndpoint
.commandMetadata.webCommandMetadata.url;
titles.push({title, url});
}
console.log(titles);
})
.catch(err => console.error(err));
If you do want to use Puppeteer, you can select these titles and URLs with:
const puppeteer = require("puppeteer"); // ^19.0.0
const url = "Your URL";
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.goto(url, {waitUntil: "domcontentloaded"});
await page.waitForSelector("#video-title-link");
const titles = await page.$$eval("#video-title-link", els =>
els.map(e => ({title: e.textContent, url: e.href}))
.filter(e => e.url)
);
console.log(titles);
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
For some reason, the ids aren't unique.
Although this is less code, this approach is much slower than fetch (~10x slower on my machine), although you can speed it up a bit by blocking irrelevant resources.
As an aside, always use const in front of your variables to avoid making them global.
page.setDefaultNavigationTimeout(0) is generally not a great pattern--this could hang forever. I'd set this to 3 or 4 minutes at most. If nav is taking that long, something is wrong and you should get that logged so you can take a look at it.
I am trying to run this code with multiple address ips but I think I put the proxy code in the wrong place can someone help, the proxy dashboard shows that the code uses the proxy but when he opened the browser the address IP doesn't change is still my local IP.the code need to run multiple browsers each browser with a unique different IP.
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
const { Cluster } = require("puppeteer-cluster");
const util = require('util');
const fs = require('fs');
const readFile = util.promisify(fs.readFile);
const langBtns = require('./langBtns');
async function sleep(ms) {
return new Promise((resolve) => setTimeout(resolve, ms));
}
let choices = Object.values(langBtns);
let randObj;
let buttons;
let lang;
let button;
let goodBtnSelector;
let cluster;
const start = async () => {
cluster = await Cluster.launch({
concurrency: Cluster.CONCURRENCY_CONTEXT,
maxConcurrency: 4, // provide the puppeteer-core library
puppeteer,
puppeteerOptions: {
headless: false,
executablePath:
"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
slowMo: 10,
defaultViewport: null,
args: [
"--no-sandbox",
"--disable-setuid-sandbox",
"--disable-dev-shm-usage",
"--disable-accelerated-2d-canvas",
"--no-first-run",
"--no-zygote",
"--disable-gpu",
"--disable-notifications",
"--proxy-server=megaproxy.rotating.proxyrack.net:1200"
],
},
// and provide executable path (in this case for a Chrome installation in Ubuntu)
});
await cluster.task(async ({ page, data: lang}) => {
await Promise.all([
await visitWebsite(page, lang),
]);
});
//add number of bots, here 5
for (let index = 0; index < 1000000; index++) {
//Choosing an object (lang+buttons) from the JSON file
randObj = choices[Math.floor(Math.random()*choices.length)];
//Storing the buttons of the choosing language in the array buttons
buttons = randObj.buttons;
button = buttons[Math.floor(Math.random()*buttons.length)];
lang = randObj.language;
goodBtnSelector = randObj.goodButtonSelector;
}
await cluster.idle();
await cluster.close();
};
const visitWebsite = async (page, lang) => {
await cluster.queue(lang);
// add proxy credentials
await page.authenticate({
username: 'XXXXXXXX',
password: 'XXXXXXXXXXX'
});
console.log(lang);
try{
await page.goto(`https://whatismyipaddress.com/`, {
waitUntil: "load",
timeout: 60000
});
await sleep(200 + Math.floor(Math.random() * 50));
let scriptFilePath = 'search.js';
let testScript = (await readFile(`${scriptFilePath}`));
await Promise.all([
eval('(async () => {' + testScript.toString() + '})')()
]);
}catch (e) {
console.log(e.toString());
}
};
start();
I had the same issue with puppeteer cluster v0.22.0
I solved the issue by changing concurrency Option to Cluster.CONCURRENCY_PAGE
Eg:
this.cluster = await Cluster.launch({
puppeteer,
puppeteerOptions: {
headless: true,
ignoreHTTPSErrors: true,
args,
ignoreDefaultArgs: ['--mute-audio', '--hide-scrollbars'],
},
concurrency: Cluster.CONCURRENCY_PAGE,
maxConcurrency: 12,
timeout: 900000,
retryLimit: 4,
});
What i'm trying to accomplish is to save complete document with all the comments expanded.
Unfortunately there are multiple selectors with same class and most of them are hidden and what i believe puppeteer does it takes first found selector and waits until it's visible which never happens.
Url: https://www.discoverpermaculture.com/permaculture-masterclass-video-1
const puppeteer = require('puppeteer');
const isElementVisible = async (page, cssSelector) => {
let visible = true;
await page
.waitForSelector(cssSelector, { visible: true, timeout: 2000 })
.catch(() => {
visible = false;
});
if(visible)console.log('Selector '+cssSelector+'visible!');
return visible;
};
async function run () {
let browser = await puppeteer.launch({headless: true, defaultViewport: null, args: ['--window-size=1920,10000',],});
const page = await browser.newPage();
const fs = require('fs');
await page.goto('https://www.discoverpermaculture.com/permaculture-masterclass-video-1');
await page.waitForTimeout(4000)
const elementHandle = await page.waitForSelector('iframe');
const frame = await elementHandle.contentFrame();
//loading all the comments (works because there's only one 'a.load-more__button' element a time)
const selectorForLoadMoreButton = 'a.load-more__button';
let loadMoreVisible = await isElementVisible(frame, selectorForLoadMoreButton);
while (loadMoreVisible) {
console.log('Loading comments');
await frame
.click(selectorForLoadMoreButton)
.catch(() => {});
loadMoreVisible = await isElementVisible(frame, selectorForLoadMoreButton);
}
//expanding comments doesn't work because each comment have a.see-more but some are hidden
const selectorForSeeMoreButton = 'a.see-more';
let seeMoreVisible = await isElementVisible(frame, selectorForSeeMoreButton);
while (seeMoreVisible) {
console.log('Expanding comments');
await frame
.click(selectorForSeeMoreButton)
.catch(() => {});
seeMoreVisible = await isElementVisible(frame, selectorForSeeMoreButton);
}
const cdp = await page.target().createCDPSession();
const { data } = await cdp.send('Page.captureSnapshot', { format: 'mhtml' });
fs.writeFileSync('out.mhtml', data);
browser.close();
}
run();
Any ideas how to handle this?
It turned out that each comment have 'a.see-more' element but if it's not a long one it also have '.hidden' class. Had to update this piece of code to search for all the 'a.see-more' elements but without '.hidden' class.
const selectorForSeeMoreButton = 'a.see-more:not(.hidden)';
let seeMoreVisible = await isElementVisible(frame, selectorForSeeMoreButton);
while (seeMoreVisible) {
console.log('Expanding comments');
await frame
.click(selectorForSeeMoreButton)
.catch(() => {});
seeMoreVisible = await isElementVisible(frame, electorForSeeMoreButton);
}
I am trying to get the puppeteer web scraping to scrape clutch.co using the below scripts but not working properly. I'm developing a simple web scraper for clutch.co. I want to extract contacts and company names for my lead generation project. In fact, to achieve this goal would improve my understanding of javascript.
Brower.js
const puppeteer = require("puppeteer");
async function startBrowser() {
let browser;
try {
console.log("Opening the browser......");
browser = await puppeteer.launch({
headless: false,
args: ["--disable-setuid-sandbox"],
ignoreHTTPSErrors: true,
});
} catch (err) {
console.log("Could not create a browser instance => : ", err);
}
return browser;
}
module.exports = {
startBrowser,
};
Pagecontroller.js
const pageScraper = require("./pageScraper");
async function scrapeAll(browserInstance) {
let browser;
try {
browser = await browserInstance;
await pageScraper.scraper(browser);
} catch (err) {
console.log("Could not resolve the browser instance => ", err);
}
}
module.exports = (browserInstance) => scrapeAll(browserInstance);
Pagescraper.js
const scraperObject = {
url: "https://clutch.co/sitemap",
async scraper(browser) {
let page = await browser.newPage();
console.log(`Navigating to ${this.url}...`);
await page.setDefaultNavigationTimeout(0);
await page.goto(this.url);
// Wait for the required DOM to be rendered
await page.waitForSelector(".container");
// Get the link to all Categories
let urls = await page.$$eval(".sitemap-menu > li", (links) => {
// Extract the links from the data
links = links.map((el) => el.querySelector("div > a").href);
return links;
});
// Loop through each of those links, open a new page instance and get the relevant data from them
let pagePromise = (link) =>
new Promise(async (resolve, reject) => {
let dataObj = {};
let newPage = await browser.newPage();
await page.setDefaultNavigationTimeout(0);
await newPage.goto(link);
dataObj["companyName"] = await newPage.$eval(
"h3 > a",
(text) => text.textContent
);
dataObj["tagLine"] = await newPage.$eval(
".tagline",
(text) => text.textContent
);
resolve(dataObj);
await newPage.close();
});
for (link in urls) {
let currentPageData = await pagePromise(urls[link]);
// scrapedData.push(currentPageData);
console.log(currentPageData);
}
},
};
module.exports = scraperObject;
index.js
const browserObject = require("./browser");
const scraperController = require("./pageController");
//Start the browser and create a browser instance
let browserInstance = browserObject.startBrowser();
// Pass the browser instance to the scraper controller
scraperController(browserInstance);
I can't get the script to load and scrape the profile pages. it keeps giving me errors.
Every time I run this script is times-out.
Does setDefaultNavigationTimeout actually prevent time outs?
There is about 26 URLs I'm going through, and each page has a large amount of images. Can't imagine Puppeteer can't handle these pages just because of heavy images?
const url = 'test.com';
const jsonReturn = [];
async function runScraper() {
const browser = await puppeteer.launch(prodConfig);
const page = await browser.newPage({
timeout: 0
});
page.setDefaultNavigationTimeout(0);
await page.goto(url, { waitUntil: 'domcontentloaded' });
await page.waitForSelector('.featured-shows-featured-show');
let featuredShowsURLs = await page.$$eval('.featured-shows-featured-show > a', (links) => {
return links.map(link => {
return link.href;
});
});
featuredShowsURLs = _.uniq(featuredShowsURLs)
for (const featuredShowsURL of featuredShowsURLs) {
const page = await browser.newPage({
timeout: 0
});
try {
await page.goto(featuredShowsURL);
await page.waitForSelector('.show-title');
} catch (e) {
featuredShowsURL;
debugger;
}
const showTitle = await findAndReturnSelectorText('.show-title', page);
const showDates = await findAndReturnSelectorText('.show-dates', page);
const showLocation = await findAndReturnSelectorText('.show-location', page);
const showGallery = await findAndReturnSelectorText('.entity-link', page);
const showDetail = await findAndReturnSelectorText('.show-press-release', page);
const newItem = {
showTitle,
showDates,
showLocation,
showGallery,
showDetail,
};
const id = hash(newItem);
jsonReturn.push({
...newItem,
id
});
}
await browser.close();
}
runScraper();