puppeteer-cluster, different data to the same url

puppeteer-cluster, different data to the same url - javascript

Below is an example. I want to type different search inputs (firstWord + scndWord), taken from an array of objects, into separate Google pages at the same time, so the number of pages opened depends dynamically on the array length:
1st page: open Google, then type 'red flower'
2nd page: open Google, but type 'gaming pc'
const { Cluster } = require('puppeteer-cluster');
// Launches a puppeteer-cluster and runs the same Google task once per entry of `arr`.
(async () => {
const cluster = await Cluster.launch({
concurrency: Cluster.CONCURRENCY_PAGE,
maxConcurrency: 10,
puppeteerOptions: {
headless: false,
},
});
// Log every task error reported by the cluster, prefixed with a timestamp.
cluster.on('taskerror', (err, url) => {
console.error((new Date()).toJSON() + ` Error crawling ${url}: ${err.message}`);
});
// Search terms: each entry is meant to be typed as firstWord + scndWord.
const arr = [
{firstWord: 'gaming ',scndWord: 'pc'},
{firstWord: 'red ', scndWord: 'flower'}
]
// The task receives the array index as `data` (bound to `index`), but the body
// never reads `arr[index]`: every page types the hard-coded string 'flower'.
await cluster.task(async ({ page, data: index }) => {
await page.goto('https://www.google.com/');
await page.focus('body > div.L3eUgb > div.o3j99.ikrT4e.om7nvf > form > div:nth-child(1) > div.A8SBwf > div.RNNXgb > div > div.a4bIc > input')
await page.keyboard.type('flower');
// NOTE(review): nothing above submits the form (no Enter key press or click),
// so it is unclear what navigation this wait is expected to observe.
await page.waitForNavigation()
// All tasks use the same output path.
await page.screenshot({ path: 'screenshot.png', fullPage: true })
});
// Run the task once per array index; the values returned by cluster.execute are not used.
for (let index = 0; index <=arr.length -1 ; index++) {
cluster.execute(index);}
I'm confused about how to do that; I would be thankful for any help.

i got it
const { Cluster } = require('puppeteer-cluster');
/**
 * Opens one Google tab per entry of `searches`, types that entry's words into
 * the search box, submits the query and saves a screenshot of the result page.
 * The cluster is shut down once every queued search has finished.
 */
(async () => {
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_PAGE,
    maxConcurrency: 10,
    puppeteerOptions: {
      headless: false,
    },
  });

  // Jobs are added with cluster.queue() below so that failures are reported
  // here instead of becoming unhandled promise rejections. `data` is the
  // search object that was queued.
  cluster.on('taskerror', (err, data) => {
    console.error(`${new Date().toJSON()} Error crawling ${JSON.stringify(data)}: ${err.message}`);
  });

  const searches = [
    { firstWord: 'gaming ', scndWord: 'pc' },
    { firstWord: 'red ', scndWord: 'flower' },
  ];

  try {
    await cluster.task(async ({ page, data: { firstWord, scndWord } }) => {
      const query = firstWord + scndWord;
      // Select the search box by its form field name rather than by a chain of
      // generated class names, which changes whenever the page is rebuilt.
      const searchBox = '[name="q"]';

      await page.goto('https://www.google.com/');
      await page.waitForSelector(searchBox);
      await page.focus(searchBox);
      await page.keyboard.type(query);

      // Typing alone never navigates: submit with Enter, and start waiting for
      // the navigation before pressing the key so the event cannot be missed.
      await Promise.all([page.waitForNavigation(), page.keyboard.press('Enter')]);

      // One file per query; a shared file name would be overwritten by every task.
      const slug = query.trim().replace(/[^a-z0-9]+/gi, '-');
      await page.screenshot({ path: `screenshot-${slug}.png`, fullPage: true });
    });

    for (const search of searches) {
      cluster.queue(search);
    }

    // Wait until every queued search has been processed.
    await cluster.idle();
  } finally {
    // Always release the browser, even when setting up the task failed.
    await cluster.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

Related

ProtocolError: Protocol error (Runtime.callFunctionOn): Target closed

I am scrolling to the bottom of a YouTube page. The scrolling part works fine, but once I reach the bottom of the site and try to close the browser, I get the error "ProtocolError: Protocol error (Runtime.callFunctionOn): Target closed.". Why is this happening, and how can I fix it? Thanks in advance.
// Heights recorded by the scroll loop: one entry is pushed per completed scroll step.
let clientHeightArr = []
// Snapshots of clientHeightArr.length, taken every 2 s by the interval below.
let clientHeightArrTracker = []
/**
 * Keeps scrolling `page` to the bottom of the `ytd-app` element while a
 * 2-second interval watches for progress and closes `browser` when the number
 * of recorded heights stops growing.
 *
 * @param browser Puppeteer browser; closed from inside the interval callback.
 * @param page    Puppeteer page that is scrolled.
 */
const scrapeInfiniteScrollItems = async(browser, page) => {
var infiniteScrollTrackerInterval = setInterval(async() => {
clientHeightArrTracker.push(clientHeightArr.length)
// True as soon as any length value occurs twice in the tracker, i.e. two
// ticks saw the same number of recorded heights (no new scroll step finished).
if (clientHeightArrTracker.some((e, i, arr) => arr.indexOf(e) !== i) == true) {
clearInterval(infiniteScrollTrackerInterval)
console.log('Bottom is reached')
//causes error "ProtocolError: Protocol error (Runtime.callFunctionOn): Target closed."
// NOTE(review): the `while (true)` loop below has no exit condition and is
// not told about this close, so it is presumably still inside (or about to
// make) a page.evaluate / page.waitForFunction call when the browser goes
// away - likely the source of the "Target closed" error. Confirm.
await browser.close()
}
}, 2000)
// Scroll loop: never breaks or returns, so this async function never resolves normally.
while (true) {
// Page height before scrolling; used below to detect that new content was appended.
const previousHeight = await page.evaluate(
"document.querySelector('ytd-app').scrollHeight"
);
await page.evaluate(() => {
const youtubeScrollHeight =
document.querySelector("ytd-app").scrollHeight;
window.scrollTo(0, youtubeScrollHeight);
});
// Wait, with the timeout disabled (timeout: 0), until the page has grown.
// At the bottom of the page the height no longer grows, so this wait stays pending.
await page.waitForFunction(
`document.querySelector('ytd-app').scrollHeight > ${previousHeight}`, {
timeout: 0
},
);
const clientHeight = await page.$$eval("ytd-app", el => el.map(x => x.clientHeight));
clientHeightArr.push(clientHeight[0])
await page.waitForTimeout(1000)
}
};
// Entry point: open the channel's videos page and start scrolling.
// NOTE(review): `puppeteer` is not required anywhere in this snippet.
(async() => {
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
await page.goto('https://www.youtube.com/c/mkbhd/videos', {
waitUntil: 'networkidle2',
});
await scrapeInfiniteScrollItems(browser, page)
})();

How handle multiple functions in puppeteer-cluster?

I have a two-step program:
1. Get a list of hrefs from a page.
2. Loop infinitely on each page of this list, get an element and display it in the console.
I tried to use functions with puppeteer-cluster, but it doesn't work properly.
const { Cluster } = require('puppeteer-cluster');
const fs = require("fs");
const { addExtra } = require("puppeteer-extra");
const vanillaPuppeteer = require("puppeteer");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
var moment = require('moment');
// Not referenced anywhere else in this snippet.
var regexTemps = /(\d+)\s(\w+)$/;
// Filled by getListOfPagesToExplore, then used to queue the second batch of jobs.
const urlsToCheck = [];
// NOTE(review): `TZ` is assigned without any visible declaration (implicit global).
// NOTE(review): the `process.env.TZ` line below has no terminating semicolon and
// the next line starts with `(`, so as written the two lines parse as a single
// expression that calls the string 'Europe/Paris' as a function.
TZ = 'Europe/Paris'
process.env.TZ = 'Europe/Paris'
(async () => {
// Wrap vanilla puppeteer with puppeteer-extra and enable the stealth plugin.
const puppeteer = addExtra(vanillaPuppeteer);
puppeteer.use(StealthPlugin());
const cluster = await Cluster.launch({
puppeteer,
puppeteerOptions: {
headless: false,
args: ['--no-sandbox'],
},
maxConcurrency: 10,
concurrency: Cluster.CONCURRENCY_CONTEXT,
monitor: false,
skipDuplicateUrls: true,
timeout:30000,
retryLimit:10,
})
// Report task failures; the message says whether the job will be retried.
cluster.on('taskerror', (err, data, willRetry) => {
if (willRetry) {
console.warn(`Encountered an error while crawling ${data}. ${err.message}\nThis job will be retried`);
} else {
console.error(`Failed to crawl ${data}: ${err.message}`);
}
});
/**
 * Task: open `url`, then reload it forever and log the text of the first
 * news item's '.info span' after each reload.
 * The loop has no exit, so this task never completes on its own.
 */
const getElementOnPage = async ({ page, data: url }) => {
console.log('=> Go to URL : ',url);
await page.goto(url);
while (true) {
console.log('=> Reload URL : ',page.url())
await page.reload();
await page.waitForTimeout(1000);
let allNews = await page.$$("article.news"); // [] if nothing
// undefined when the page has no article.news; the $eval call below would then throw.
let firstNews = allNews[0];
await page.waitForTimeout(1000);
let info = await firstNews.$eval('.info span', s => s.textContent.trim());
console.log(new Date(), 'info : ',info);
}
};
/**
 * Task: open `url` and append the href attribute of every <a> element on the
 * page to the shared `urlsToCheck` array.
 */
const getListOfPagesToExplore = async ({ page, data: url }) => {
console.log(new Date(), 'Get the list of deal pages to explore');
await page.goto(url, {waitUntil: 'domcontentloaded'});
await page.waitForTimeout(500);
const hrefsToVisit = await page.$x('//a');
// Incremented in the loop below but never read.
let idxTab=0;
for( let hrefToVisit of hrefsToVisit ) {
// Raw attribute value, pushed as-is (it is not resolved against the page URL here).
var link = await page.evaluate(el => el.getAttribute("href"), hrefToVisit);
console.log(new Date(), 'adding link to list : ', link);
idxTab++;
urlsToCheck.push(link);
}
};
// Step 1: collect the links with a per-job task function.
cluster.queue('https://www.apagewithsomelinks.com', getListOfPagesToExplore);
await cluster.idle();
// NOTE(review): the cluster is closed here, before the second batch of jobs
// is queued on the same `cluster` object below.
await cluster.close();
console.log(urlsToCheck);
// At this point urlsToCheck correctly contains several links.
// Step 2: queue one never-ending getElementOnPage job per collected link.
for( let url of urlsToCheck ) {
console.log('Push in queue : ',url);
cluster.queue(url, getElementOnPage);
}
await cluster.idle();
await cluster.close();
})();
When I launch it, it retrieves the links to scrape.
It displays "Push in queue : ..." for each URL.
But then the method getElementOnPage is launched only for the first URL, and runs infinitely as requested. Why are the other URLs not launched?
Before that, I did not use functions; I used a single task with:
await cluster.task(async ({ page, data: url }) => {
But how can I combine functions with this approach?

Infinite loop with try catch puppeteer

I wrote this code using Puppeteer to save the URLs of Bing images. I have an array of product names, and I search for each one inside a loop wrapped in a try/catch to get these URLs. I have one problem: when a product is not found (the search returns no result), I want the loop to jump to the next item in the array. Instead, it does nothing and Puppeteer closes the browser by entering the finally block of the try/catch, without reporting any error. My logic seems right; can anyone help me with this? I've tried everything.
index.js
const fs = require("fs");
const puppeteer = require("puppeteer-core");
// For each product in `arr`, searches Bing, opens the Images tab, reads the
// thumbnail URL of the first result and rewrites results.json.
(async () => {
const browser = await puppeteer.launch({
executablePath:
"C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe",
headless: true,
timeout: 0,
});
const page = await browser.newPage();
await page.setDefaultNavigationTimeout(0);
// Products for which the image lookup chain below completed.
let results = new Array();
const arr = [
{
ID: 6011,
Nome: "BRA MASCARA BIGODE Q2321(KRIAT 13726)",
Imagens: "",
},
{
ID: 6012,
Nome: "BRA MASCARA CAVEIRA ASSUSTADORA 1UN",
Imagens: "",
},
{
ID: 6013,
Nome: "BRA MASCARA CAVEIRA PLASTICA 1UN",
Imagens: "",
}
];
// Warm-up search used to dismiss the "#bnp_btn_accept" banner once.
await page.goto(`https://www.bing.com/search?q=Hello world`);
await page.waitForSelector("#bnp_btn_accept", { visible: true });
await page.evaluate(() => {
const btn = document.getElementById("bnp_btn_accept");
if (btn) {
btn.click();
}
});
// Index of the product being processed; it is advanced only inside the
// .then/.catch handlers of the evaluate chain and in the outer catch.
let cont = 0;
try {
for (cont; cont < arr.length; ) {
// Declared but never used.
let image;
await page.goto(`https://www.bing.com/search?q=${arr[cont].Nome}`);
await page.waitForTimeout(3000);
await page.waitForSelector("#b-scopeListItem-images", { visible: true });
await page.evaluate(() => {
const imageBtn = document.getElementById("b-scopeListItem-images");
imageBtn?.children[0]?.click();
});
await page.waitForTimeout(3000);
// This await has no handler of its own: if it rejects (for example because
// no ".iusc" element becomes visible), the exception leaves the `for` loop
// and goes to the outer catch, which does not resume the loop; `finally`
// then closes the browser.
await page.waitForSelector(".iusc", { visible: true });
await page
.evaluate(() => {
// The first image's metadata is parsed before the "no result" marker is checked.
const firstImage = JSON.parse(
document.getElementsByClassName("iusc")[0].getAttribute("m")
).turl;
const semResultado = document.getElementById("dg_nr");
if (semResultado) {
// This callback is passed to page.evaluate; nothing is returned on this
// branch, so `res` below is undefined.
console.log("Não tem resultado");
} else {
return firstImage;
}
})
.then(async (res) => {
arr[cont].Imagens = await res;
results.push(arr[cont]);
// Rewrite the whole results file after every product.
var stream = fs.createWriteStream("results.json");
stream.once("open", function (fd) {
stream.write(JSON.stringify(results));
stream.end();
});
console.log("Produto " + cont + " adicionado no Arquivo");
cont++;
})
.catch((err) => {
// Only failures of the evaluate/then chain above are handled here.
console.log("O Produto " + cont + " deu algum erro: " + err);
cont++;
});
}
} catch (err) {
// Reached after the loop has already been exited; the error is not logged.
cont++;
} finally {
// Final write of whatever was collected, then shut the browser down.
var stream = fs.createWriteStream("results.json");
stream.once("open", function (fd) {
stream.write(JSON.stringify(results));
stream.end();
});
await browser.close();
}
})();
Fixed:
const fs = require("fs");
const puppeteer = require("puppeteer-core");
/**
 * For each product in `arr`, searches Bing, opens the Images tab, stores the
 * thumbnail URL of the first image in `Imagens` and writes the collected
 * products to results.json.
 *
 * A product whose lookup fails or has no image result is logged and skipped;
 * the loop then continues with the next product. The browser is always closed.
 */
(async () => {
  const browser = await puppeteer.launch({
    executablePath:
      "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe",
    headless: true,
    timeout: 0,
  });
  const results = [];

  // Synchronous write: each call fully replaces the file before the next one
  // starts, so writes cannot overlap and the file is complete before exit.
  const saveResults = () => {
    fs.writeFileSync("results.json", JSON.stringify(results));
  };

  /**
   * Looks up one product on Bing and returns the thumbnail URL of the first
   * image result, or null when the page reports no result.
   * Rejects when an expected element does not show up in time.
   */
  const findFirstImageUrl = async (page, productName) => {
    // Encode the name: it may contain spaces, parentheses, '&', '#', ...
    await page.goto(
      `https://www.bing.com/search?q=${encodeURIComponent(productName)}`
    );
    await page.waitForTimeout(3000);
    await page.waitForSelector("#b-scopeListItem-images", { visible: true });
    await page.evaluate(() => {
      const imageBtn = document.getElementById("b-scopeListItem-images");
      imageBtn?.children[0]?.click();
    });
    await page.waitForTimeout(3000);
    await page.waitForSelector(".iusc", { visible: true });
    return page.evaluate(() => {
      // Check the "no result" marker before touching the first image.
      if (document.getElementById("dg_nr")) {
        return null;
      }
      const first = document.getElementsByClassName("iusc")[0];
      if (!first) {
        return null;
      }
      return JSON.parse(first.getAttribute("m")).turl ?? null;
    });
  };

  try {
    const page = await browser.newPage();
    page.setDefaultNavigationTimeout(0);
    const arr = [
      {
        ID: 6011,
        Nome: "BRA MASCARA BIGODE Q2321(KRIAT 13726)",
        Imagens: "",
      },
      {
        ID: 6012,
        Nome: "BRA MASCARA CAVEIRA ASSUSTADORA 1UN",
        Imagens: "",
      },
      {
        ID: 6013,
        Nome: "BRA MASCARA CAVEIRA PLASTICA 1UN",
        Imagens: "",
      },
    ];

    // Warm-up search used to dismiss the "#bnp_btn_accept" banner once.
    await page.goto(`https://www.bing.com/search?q=Hello world`);
    await page.waitForSelector("#bnp_btn_accept", { visible: true });
    await page.evaluate(() => {
      const btn = document.getElementById("bnp_btn_accept");
      if (btn) {
        btn.click();
      }
    });

    // The index advances exactly once per product, whatever the outcome, so a
    // failed lookup can neither skip the following product nor attach a result
    // to the wrong one.
    for (const [cont, product] of arr.entries()) {
      try {
        const imageUrl = await findFirstImageUrl(page, product.Nome);
        if (imageUrl === null) {
          console.log("O Produto " + cont + " deu algum erro: Não tem resultado");
          continue;
        }
        product.Imagens = imageUrl;
        results.push(product);
        saveResults();
        console.log("Produto " + cont + " adicionado no Arquivo");
      } catch (err) {
        console.log("O Produto " + cont + " deu algum erro: " + err);
      }
    }
  } finally {
    saveResults();
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

Unable to implement any logic to scrape content from innermost pages using puppeteer

I've created a script using puppeteer to scrape the links of different authors from a webpage traversing multiple pages triggering click on the next page button. The script appears to be working in the right way.
Although the content of this site is static, I intentionally used puppeteer within the following script only to learn as to how I can parse content from inner pages.
Given that I wish to go one layer deep to scrape description from such pages. How can I achieve that?
const puppeteer = require('puppeteer');
/**
 * Scrapes quotes from https://quotes.toscrape.com/, following the "next" link
 * from page to page.
 *
 * @param {number} [pagesToScrape] Number of listing pages to read; any falsy
 *   value (undefined, 0, null) means 1. Scraping stops early when a page has
 *   no "next" link.
 * @returns {Promise<Array<{authorUrl: string, title: string}>>} One entry per
 *   quote: the absolute URL of the author's page and the quote text.
 *   Rejects with the underlying error; the browser is closed in every case.
 */
async function run(pagesToScrape) {
  const pageCount = pagesToScrape || 1;
  const browser = await puppeteer.launch({ headless: false });
  try {
    const [page] = await browser.pages();
    await page.goto("https://quotes.toscrape.com/");

    const urls = [];
    for (let currentPage = 1; currentPage <= pageCount; currentPage++) {
      const newUrls = await page.evaluate(() =>
        Array.from(document.querySelectorAll('[class="quote"]'), (item) => ({
          authorUrl:
            'https://quotes.toscrape.com' +
            item.querySelector("small.author + a").getAttribute('href'),
          title: item.querySelector("span.text").innerText,
        }))
      );
      urls.push(...newUrls);

      if (currentPage < pageCount) {
        const nextLink = await page.$('li.next > a');
        if (!nextLink) {
          // Last listing page reached before pageCount: nothing more to read.
          break;
        }
        // Start waiting for the navigation before clicking so the event cannot
        // be missed; only then look for quotes, which belong to the new page.
        await Promise.all([page.waitForNavigation(), nextLink.click()]);
        await page.waitForSelector('[class="quote"]');
      }
    }
    return urls;
  } finally {
    await browser.close();
  }
}
run(3).then(console.log).catch(console.error);

I would go this way:
const puppeteer = require('puppeteer');
// Shared with the final cleanup handler so the browser can be closed even when
// scraping fails part-way through.
let browser;

/**
 * Walks the first listing pages of quotes.toscrape.com in one tab and, in a
 * second tab, visits each author's "about" page once. Prints the collected
 * quotes (grouped by author) and author descriptions as JSON.
 */
(async function main() {
  browser = await puppeteer.launch({ headless: false, defaultViewport: null });
  const [listingTab] = await browser.pages();
  const authorTab = await browser.newPage();
  await listingTab.bringToFront(); // Otherwise, click on the next page link does not work.

  const pagesToScrape = 3;
  await listingTab.goto('https://quotes.toscrape.com/');

  const data = { quotes: {}, abouts: {} };
  const seenAuthorPages = new Set();

  for (let currentPage = 1; currentPage <= pagesToScrape; currentPage++) {
    await listingTab.waitForSelector('.quote');

    // Read author/quote pairs and author-page links in a single round trip.
    const { quotes, aboutURLs } = await listingTab.evaluate(() => {
      const quoteNodes = [...document.querySelectorAll('.quote')];
      const linkNodes = [...document.querySelectorAll('.quote small.author + a[href]')];
      return {
        quotes: quoteNodes.map(node => [node.querySelector('small.author').innerText, node.innerText]),
        aboutURLs: linkNodes.map(link => link.href),
      };
    });

    // Group the quote texts under their author.
    for (const [author, text] of quotes) {
      const bucket = data.quotes[author];
      if (bucket === undefined) {
        data.quotes[author] = [text];
      } else {
        bucket.push(text);
      }
    }

    // Visit every author page that has not been seen yet.
    for (const aboutURL of aboutURLs) {
      if (seenAuthorPages.has(aboutURL)) continue;
      seenAuthorPages.add(aboutURL);

      await authorTab.goto(aboutURL);
      await authorTab.waitForSelector('div.author-details');
      const details = await authorTab.evaluate(() => {
        const box = 'div.author-details';
        return {
          title: document.querySelector(`${box} h3.author-title`).innerText,
          about: document.querySelector(box).innerText,
        };
      });
      data.abouts[details.title] = details.about;
    }

    // Move to the next listing page unless this was the last one requested.
    if (currentPage < pagesToScrape) {
      const nextLink = await listingTab.waitForSelector('li.next > a');
      await Promise.all([
        nextLink.click(),
        listingTab.waitForNavigation(),
      ]);
    }
  }

  console.log(JSON.stringify(data, null, ' '));
})()
  .catch(console.error)
  .finally(async () => {
    if (browser) await browser.close();
  });

Puppeteer how to read and use data from a text file

Hi I'm working on a script to learn node JS, and I'm stuck at this point :
I would like to read a text file with my email: password, and with that I would like to open as many tabs as many email:password I have and use each email: password to connect to the website
const puppeteer = require("puppeteer");
const lineReader = require("line-reader");
// Reads "email:password" lines from c.txt and tries to log in to a website
// with the values held in `data`.
(async () => {
// Overwritten by every line read below, so it never holds more than one line's parts.
let data = [];
// Never filled; the Promise.all at the end therefore has nothing to wait for.
const promises = []
// Counts the lines seen by the eachLine callback.
let int = 0;
// NOTE(review): nothing here waits for eachLine to finish; assuming it
// delivers lines asynchronously (confirm against the line-reader docs), the
// console.log(int) and the login loop below run before any line has been read.
lineReader.eachLine("c.txt", function(line) {
int++;
data = line.split(":")
console.log(data);
});
console.log(int);
// Fixed at 2 iterations; not derived from the number of lines in the file.
for (let i = 0; i < 2; i++) {
// A whole new browser (not a tab) is launched on every iteration and is never closed.
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
// Created but never awaited.
const navigationPromise = page.waitForNavigation();
// NOTE(review): the URL has no "//" after "https:".
await page.goto("https:www.site.com/en/launch/");
await page.setViewport({ width: 1920, height: 1080 });
// Open the login form.
await page.waitForSelector(
".d-sm-h > .bg-white > .right-nav > .member-nav-item > .join-log-in"
);
await page.click(
".d-sm-h > .bg-white > .right-nav > .member-nav-item > .join-log-in"
);
const emailInput = '[placeholder="Adresse e-mail"][autocomplete="email"]';
await page.waitForSelector(emailInput, { timeout: 0 });
await page.focus(emailInput);
// Both iterations read the same shared `data`; `i` is not used to pick a credential.
await page.keyboard.type(data[0]);
const passwordInput =
'[placeholder="Mot de passe"][autocomplete="current-password"]';
await page.waitForSelector(passwordInput, { timeout: 0 });
await page.focus(passwordInput);
await page.keyboard.type(data[1]);
await page.click(
".site-unite-submit-button.loginSubmit.site-unite-component"
);
}
await Promise.all(promises)
})();
But this is what I get when I console.log my data:
[ 'email#gmail.com:tesssst' ]
[ 'email2#gmail.com:tesssst' ]
I would like to know and understand how I can use each email:password pair from my txt file to open tabs and use that data to log in to the website.
Thank you

Develop Reference

JavaScript is the programming language of the Web.

puppeteer-cluster, different data to the same url - javascript

Related

ProtocolError: Protocol error (Runtime.callFunctionOn): Target closed

How handle multiple functions in puppeteer-cluster?

Infinite loop with try catch puppeteer

Unable to implement any logic to scrape content from innermost pages using puppeteer

Puppeteer how to read and use data from a text file

Categories

Resources