I have this code, made with Puppeteer, that saves the URLs of Bing image results. I have an array of product names, and inside a loop with a try/catch I search for each one to collect its URL. The problem: when a search finds no results for a product, I want to skip to the next item in the array, but instead the script does nothing, enters the finally of the try/catch, and Puppeteer closes the browser without reporting any error. My logic seems right to me. Can anyone help with this? I've tried everything.
index.js
const fs = require("fs");
const puppeteer = require("puppeteer-core");

(async () => {
  const browser = await puppeteer.launch({
    executablePath:
      "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe",
    headless: true,
    timeout: 0,
  });
  const page = await browser.newPage();
  await page.setDefaultNavigationTimeout(0);

  let results = new Array();
  const arr = [
    {
      ID: 6011,
      Nome: "BRA MASCARA BIGODE Q2321(KRIAT 13726)",
      Imagens: "",
    },
    {
      ID: 6012,
      Nome: "BRA MASCARA CAVEIRA ASSUSTADORA 1UN",
      Imagens: "",
    },
    {
      ID: 6013,
      Nome: "BRA MASCARA CAVEIRA PLASTICA 1UN",
      Imagens: "",
    }
  ];

  await page.goto(`https://www.bing.com/search?q=Hello world`);
  await page.waitForSelector("#bnp_btn_accept", { visible: true });
  await page.evaluate(() => {
    const btn = document.getElementById("bnp_btn_accept");
    if (btn) {
      btn.click();
    }
  });

  let cont = 0;
  try {
    for (cont; cont < arr.length; ) {
      let image;
      await page.goto(`https://www.bing.com/search?q=${arr[cont].Nome}`);
      await page.waitForTimeout(3000);
      await page.waitForSelector("#b-scopeListItem-images", { visible: true });
      await page.evaluate(() => {
        const imageBtn = document.getElementById("b-scopeListItem-images");
        imageBtn?.children[0]?.click();
      });
      await page.waitForTimeout(3000);
      await page.waitForSelector(".iusc", { visible: true });
      await page
        .evaluate(() => {
          const firstImage = JSON.parse(
            document.getElementsByClassName("iusc")[0].getAttribute("m")
          ).turl;
          const semResultado = document.getElementById("dg_nr");
          if (semResultado) {
            console.log("Não tem resultado");
          } else {
            return firstImage;
          }
        })
        .then(async (res) => {
          arr[cont].Imagens = await res;
          results.push(arr[cont]);
          var stream = fs.createWriteStream("results.json");
          stream.once("open", function (fd) {
            stream.write(JSON.stringify(results));
            stream.end();
          });
          console.log("Produto " + cont + " adicionado no Arquivo");
          cont++;
        })
        .catch((err) => {
          console.log("O Produto " + cont + " deu algum erro: " + err);
          cont++;
        });
    }
  } catch (err) {
    cont++;
  } finally {
    var stream = fs.createWriteStream("results.json");
    stream.once("open", function (fd) {
      stream.write(JSON.stringify(results));
      stream.end();
    });
    await browser.close();
  }
})();
Fixed. The culprit is page.waitForSelector(".iusc", { visible: true }): when a search has no results, the selector never appears, the promise rejects, and the exception lands in the outer try/catch, which ends the for loop and falls through to finally, closing the browser without reporting anything. Handling the rejection inside the loop lets it skip that product and keep going:
const fs = require("fs");
const puppeteer = require("puppeteer-core");

(async () => {
  const browser = await puppeteer.launch({
    executablePath:
      "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe",
    headless: true,
    timeout: 0,
  });
  const page = await browser.newPage();
  await page.setDefaultNavigationTimeout(0);

  let results = new Array();
  const arr = [
    {
      ID: 6011,
      Nome: "BRA MASCARA BIGODE Q2321(KRIAT 13726)",
      Imagens: "",
    },
    {
      ID: 6012,
      Nome: "BRA MASCARA CAVEIRA ASSUSTADORA 1UN",
      Imagens: "",
    },
    {
      ID: 6013,
      Nome: "BRA MASCARA CAVEIRA PLASTICA 1UN",
      Imagens: "",
    }
  ];

  await page.goto(`https://www.bing.com/search?q=Hello world`);
  await page.waitForSelector("#bnp_btn_accept", { visible: true });
  await page.evaluate(() => {
    const btn = document.getElementById("bnp_btn_accept");
    if (btn) {
      btn.click();
    }
  });

  let cont = 0;
  try {
    for (cont; cont < arr.length; ) {
      await page.goto(`https://www.bing.com/search?q=${arr[cont].Nome}`);
      await page.waitForTimeout(3000);
      await page.waitForSelector("#b-scopeListItem-images", { visible: true });
      await page.evaluate(() => {
        const imageBtn = document.getElementById("b-scopeListItem-images");
        imageBtn?.children[0]?.click();
      });
      await page.waitForTimeout(3000);

      // Swallow the rejection when no image results appear, and skip to the
      // next product instead of letting the exception end the whole loop.
      const found = await page
        .waitForSelector(".iusc", { visible: true })
        .then(() => true)
        .catch(() => false);
      if (!found) {
        console.log("O Produto " + cont + " não retornou resultados");
        cont++;
        continue;
      }

      await page
        .evaluate(() => {
          const firstImage = JSON.parse(
            document.getElementsByClassName("iusc")[0].getAttribute("m")
          ).turl;
          const semResultado = document.getElementById("dg_nr");
          if (semResultado) {
            console.log("Não tem resultado");
          } else {
            return firstImage;
          }
        })
        .then((res) => {
          arr[cont].Imagens = res;
          results.push(arr[cont]);
          var stream = fs.createWriteStream("results.json");
          stream.once("open", function (fd) {
            stream.write(JSON.stringify(results));
            stream.end();
          });
          console.log("Produto " + cont + " adicionado no Arquivo");
          cont++;
        })
        .catch((err) => {
          console.log("O Produto " + cont + " deu algum erro: " + err);
          cont++;
        });
    }
  } finally {
    var stream = fs.createWriteStream("results.json");
    stream.once("open", function (fd) {
      stream.write(JSON.stringify(results));
      stream.end();
    });
    await browser.close();
  }
})();
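As a side note, a shape that avoids incrementing cont inside callbacks altogether is to give each product its own try/catch, so any failure only skips that one product. A minimal sketch of just the loop, reusing the page, arr, and results set up above (the 5-second timeout is an arbitrary choice):

for (let cont = 0; cont < arr.length; cont++) {
  try {
    await page.goto(`https://www.bing.com/search?q=${arr[cont].Nome}`);
    await page.waitForSelector("#b-scopeListItem-images", { visible: true });
    await page.evaluate(() => {
      document.getElementById("b-scopeListItem-images")?.children[0]?.click();
    });
    // Rejects after 5s when no image results render; handled in the catch.
    await page.waitForSelector(".iusc", { visible: true, timeout: 5000 });
    arr[cont].Imagens = await page.evaluate(
      () =>
        JSON.parse(
          document.getElementsByClassName("iusc")[0].getAttribute("m")
        ).turl
    );
    results.push(arr[cont]);
    console.log("Produto " + cont + " adicionado");
  } catch (err) {
    // No results, navigation error, whatever: log it and move on.
    console.log("Produto " + cont + " pulado: " + err.message);
  }
}
fs.writeFileSync("results.json", JSON.stringify(results));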
I have this bunch of code that runs infinitely, as can be seen in the terminal screenshot. How can I stop it? The delete function is assigned to a button, so I don't know why it runs even though I never clicked that button.
// server.js
app.get("/getMaintenanceTypes", (req, res) => {
  pool.query(`Select * from maintenancetypes`, (err, result) => {
    if (!err) {
      console.log("maintanencetypes", result.rows);
      res.status(200).send(result.rows);
    } else {
      res.status(404).json(err.message);
      console.log("maintanencetypes", err.message);
    }
  });
});

// main.js
const getMainTypes = async () => {
  const response = await axios.get(`${serverBaseUrl}/getMaintenanceTypes`);
  //console.log("response", response.data);
  const response2 = [];
  for (var i = 0; i < response.data.length; i++) {
    let entry = {
      id: null,
      title: "",
      description: "",
    };
    entry.id = response.data[i].id;
    entry.title = response.data[i].title;
    entry.description = response.data[i].description;
    response2.push(entry);
    //console.log("entry", entry);
  }
  console.log("RESPONSE2", response2);
  return response2;
};

async function deleteAllMainTypes() {
  try {
    const resp = await axios.delete(`${serverBaseUrl}/deleteAllMainTypes`);
  } catch (error) {
    console.log(error);
  }
  const maintenanceList = await getMainTypes();
  //console.log("main list", maintenanceList);
  setMaintenanceList(maintenanceList);
}

const deleteHandler = () => {
  deleteAllMainTypes();
};

<Button onClick={deleteHandler}>Delete All</Button>
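For reference on handlers that seem to fire without a click, assuming React function components (setMaintenanceList looks like a useState setter): the two usual suspects are invoking the handler during render, and fetching in the component body. A sketch of both; since the JSX above already passes the reference correctly, the second is the likelier culprit here:

// Anti-pattern: deleteHandler() is CALLED during render; setMaintenanceList
// then schedules a re-render, which calls it again: an infinite request loop.
<Button onClick={deleteHandler()}>Delete All</Button>

// Correct (as posted above): pass the reference; it runs only on a click.
<Button onClick={deleteHandler}>Delete All</Button>

// A fetch placed directly in the component body also runs on every render.
// Load-on-mount work belongs in an effect with an empty dependency array:
useEffect(() => {
  getMainTypes().then(setMaintenanceList);
}, []);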
I've created a script using puppeteer that scrapes the links of different authors from a webpage, traversing multiple pages by triggering a click on the next-page button. The script appears to be working correctly.
Although the content of this site is static, I intentionally used puppeteer in the following script just to learn how to parse content from inner pages.
Given that, I wish to go one layer deeper and scrape the description from those pages. How can I achieve that?
const puppeteer = require('puppeteer');

function run(pagesToScrape) {
  return new Promise(async (resolve, reject) => {
    try {
      if (!pagesToScrape) {
        pagesToScrape = 1;
      }
      const browser = await puppeteer.launch({ headless: false });
      const [page] = await browser.pages();
      await page.goto("https://quotes.toscrape.com/");
      let currentPage = 1;
      let urls = [];
      while (currentPage <= pagesToScrape) {
        let newUrls = await page.evaluate(() => {
          let results = [];
          let items = document.querySelectorAll('[class="quote"]');
          items.forEach((item) => {
            results.push({
              authorUrl: 'https://quotes.toscrape.com' + item.querySelector("small.author + a").getAttribute('href'),
              title: item.querySelector("span.text").innerText
            });
          });
          return results;
        });
        urls = urls.concat(newUrls);
        if (currentPage < pagesToScrape) {
          await Promise.all([
            await page.waitForSelector('li.next > a'),
            await page.click('li.next > a'),
            await page.waitForSelector('[class="quote"]')
          ]);
        }
        currentPage++;
      }
      browser.close();
      return resolve(urls);
    } catch (e) {
      return reject(e);
    }
  });
}

run(3).then(console.log).catch(console.error);
I would go this way:
const puppeteer = require('puppeteer');

let browser;

(async function main() {
  browser = await puppeteer.launch({ headless: false, defaultViewport: null });
  const [pageQuotes] = await browser.pages();
  const pageAbout = await browser.newPage();
  await pageQuotes.bringToFront(); // Otherwise, click on the next page link does not work.

  const pagesToScrape = 3;
  await pageQuotes.goto('https://quotes.toscrape.com/');
  let currentPage = 1;
  const data = { quotes: {}, abouts: {} };
  const visitedAbouts = new Set();

  while (currentPage <= pagesToScrape) {
    await pageQuotes.waitForSelector('.quote');

    const { quotes, aboutURLs } = await pageQuotes.evaluate(() => ({
      quotes: Array.from(
        document.querySelectorAll('.quote'),
        quote => [quote.querySelector('small.author').innerText, quote.innerText],
      ),
      aboutURLs: Array.from(
        document.querySelectorAll('.quote small.author + a[href]'),
        quote => quote.href,
      ),
    }));

    for (const [author, quote] of quotes) {
      if (data.quotes[author] === undefined) data.quotes[author] = [];
      data.quotes[author].push(quote);
    }

    for (const aboutURL of aboutURLs) {
      if (!visitedAbouts.has(aboutURL)) {
        visitedAbouts.add(aboutURL);
        await pageAbout.goto(aboutURL);
        await pageAbout.waitForSelector('div.author-details');
        const { title, about } = await pageAbout.evaluate(() => ({
          title: document.querySelector('div.author-details h3.author-title').innerText,
          about: document.querySelector('div.author-details').innerText,
        }));
        data.abouts[title] = about;
      }
    }

    if (currentPage < pagesToScrape) {
      const nextLink = await pageQuotes.waitForSelector('li.next > a');
      await Promise.all([
        nextLink.click(),
        pageQuotes.waitForNavigation(),
      ]);
    }
    currentPage++;
  }

  console.log(JSON.stringify(data, null, ' '));
})().catch(console.error).finally(async () => { if (browser) await browser.close(); });
I would like to gather all events and the JavaScript code, with its dependencies, for a particular form field (or for all form fields), including frameworks. I have tried puppeteer and CDP to get the events for a field and then gather the JavaScript. I was able to get the event details successfully, but I'm not sure how to walk the traces from each event to collect the JavaScript code it uses. Quick help appreciated.
The following code has been used to collect the events.
listener = await windowHandle._client.send('DOMDebugger.getEventListeners', {
  objectId: submitElementHandle._remoteObject.objectId
});
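Note that _client and _remoteObject are private Puppeteer internals that can change between releases. The same data can be fetched through an explicit CDP session, roughly like this (#myField is a placeholder selector):

const cdp = await page.target().createCDPSession();

// Resolve the element to a protocol objectId without private properties.
const { result } = await cdp.send('Runtime.evaluate', {
  expression: 'document.querySelector("#myField")',
});

const { listeners } = await cdp.send('DOMDebugger.getEventListeners', {
  objectId: result.objectId,
});
console.log(listeners);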
Another variant, more hacky and less reliable: hook EventTarget.prototype.addEventListener before any page script runs. It only sees listeners registered through addEventListener after the hook is installed (inline on* handlers are picked up by a separate property walk), and it does not track removeEventListener, but it yields the listener functions themselves.
import puppeteer from 'puppeteer';

const browser = await puppeteer.launch(/* { headless: false, defaultViewport: null } */);
try {
  const [page] = await browser.pages();

  // Hook on setting listeners.
  await page.evaluateOnNewDocument(() => {
    window._puppeteerListenersMap = new Map();
    const _puppeteerOldAddEventListener = EventTarget.prototype.addEventListener;
    EventTarget.prototype.addEventListener = function newAddEventListener(...args) {
      const element = this;
      const [eventName, eventListener] = args;
      if (!_puppeteerListenersMap.get(element)) {
        _puppeteerListenersMap.set(element, Object.create(null));
      }
      const allListeners = _puppeteerListenersMap.get(element);
      allListeners[eventName] ??= [];
      allListeners[eventName].push(eventListener);
      _puppeteerOldAddEventListener.call(this, ...args);
    };
  });

  await page.goto('https://example.org/');

  // Test setting listeners.
  await page.evaluate(() => {
    document.body.addEventListener('click', function click() {
      console.log('addEventListener click');
    });
    document.body.addEventListener('dblclick', function dblclick() {
      console.log('addEventListener dblclick');
    });
    document.body.onclick = function onclick() { console.log('onclick'); };
    document.body.ondblclick = function ondblclick() { console.log('ondblclick'); };
  });

  // Test getting listeners.
  const data = await page.evaluate(() => {
    const element = document.body;
    const elementListeners = Object.create(null);
    const allListeners = _puppeteerListenersMap.get(element);
    if (allListeners) {
      for (const [eventName, eventListeners] of Object.entries(allListeners)) {
        elementListeners[eventName] = eventListeners.map(
          eventListener => eventListener.toString()
        );
      }
    }
    for (const name in element) {
      if (name.startsWith('on') &&
        element[name] !== null &&
        typeof element[name] === 'function'
      ) {
        elementListeners[name] = element[name].toString();
      }
    }
    return elementListeners;
  });
  console.log(JSON.stringify(data, null, ' '));
} catch (err) { console.error(err); } finally { await browser.close(); }
Output:
{
  "click": [
    "function click() {\n console.log('addEventListener click');\n }"
  ],
  "dblclick": [
    "function dblclick() {\n console.log('addEventListener dblclick');\n }"
  ],
  "onclick": "function onclick() { console.log('onclick'); }",
  "ondblclick": "function ondblclick() { console.log('ondblclick'); }"
}
UPD: testing with the page from CodePen:
// Test getting listeners.
const data = await page.evaluate(() => {
  const element = document.querySelector('#signup_v1-email');
  element.value = 'foo'; // Input invalid email.
  const elementListeners = Object.create(null);
  const allListeners = _puppeteerListenersMap.get(element);
  if (allListeners) {
    for (const [eventName, eventListeners] of Object.entries(allListeners)) {
      elementListeners[eventName] = eventListeners.map(
        eventListener => (
          eventListener.call(element, new Event(eventName)), // 'not a valid email' in Browser console.
          eventListener.toString()
        )
      );
    }
  }
  for (const name in element) {
    if (name.startsWith('on') &&
      element[name] !== null &&
      typeof element[name] === 'function'
    ) {
      elementListeners[name] = element[name].toString();
    }
  }
  return elementListeners;
});
This is all I can get from the Chrome DevTools Protocol documentation:
import puppeteer from 'puppeteer';

const browser = await puppeteer.launch();
try {
  const [page] = await browser.pages();
  const cdp = await page.target().createCDPSession();

  // Enable the debugger before navigating, so that Debugger.scriptParsed
  // events are received for the page's scripts.
  await cdp.send('Debugger.enable');
  const scriptIdToUrlMap = new Map();
  cdp.on('Debugger.scriptParsed', ({ scriptId, url }) => {
    scriptIdToUrlMap.set(scriptId, url);
  });

  await page.goto('https://chromedevtools.github.io/devtools-protocol/tot/DOMDebugger/');
  await page.waitForSelector('body > main');

  const { objectId } = (await cdp.send('Runtime.evaluate', {
    expression: 'document.querySelector("body > main")',
  })).result;
  const { listeners } = await cdp.send('DOMDebugger.getEventListeners', { objectId });

  for (const listener of listeners) {
    console.log('Listener details:', listener, '\n');
    console.log('Script URL:', scriptIdToUrlMap.get(listener.scriptId), '\n');
    const { scriptSource } = await cdp.send('Debugger.getScriptSource', {
      scriptId: listener.scriptId,
    });
    console.log(
      'Script source start:',
      scriptSource.split('\n')[listener.lineNumber].slice(listener.columnNumber),
      '...\n',
    );
  }
} catch (err) { console.error(err); } finally { await browser.close(); }
Current output:
Listener details: {
  type: 'click',
  useCapture: false,
  passive: false,
  once: false,
  scriptId: '4',
  lineNumber: 181,
  columnNumber: 649
}
Script URL: https://chromedevtools.github.io/devtools-protocol/scripts/index.js
Script source start: (){I.classList.contains("shown")&&(I.classList.remove("shown"),P.focus())}document.addEventListener("keydown",e=>{e.metaKey||e.ctrlKey||e.altKey||(e.keyCode>=65&&e.keyCode<=90&&document.querySelector("cr-search-control").inputElement.focus(),"Escape"===e.key&&I.classList.contains("shown")&&I.classList.remove("shown"))}),P.addEventListener("click",e=>{e.stopPropagation(),I.addEventListener("transitionend",()=>{O.focus()},{once:!0}),I.classList.add("shown")}),B.addEventListener("click",W),O.addEventListener("click",W); ...
My goal is to click on each link (called a footnote) on this page, return the footnote link and text, and then all of the URLs that appear in the sidebar. I'm stuck on accessing the sidebar values when they appear; after a few weeks of failure, I'm looking for pointers on what I'm doing wrong (I'm very new to both JavaScript and Puppeteer).
const puppeteer = require('puppeteer');

const url = 'https://www.churchofjesuschrist.org/study/scriptures/bofm/1-ne/11?lang=eng';
const selector = '.study-note-ref';

(async function () {
  const browser = await puppeteer.launch({ headless: true });
  const page = await browser.newPage();
  await page.goto(url);

  const footnotes = await page.$$eval(selector, nodes => {
    return nodes.map(node => {
      const ref = node.href.replace('https://www.churchofjesuschrist.org', '');
      const txt = node.text;
      return {
        ref,
        txt
      };
    });
  });

  for (const a of footnotes) {
    page.click(a.ref);
    const links = await page.$$eval('.scripture-ref', nodes => {
      return nodes.map(node => {
        return node.href;
      });
    });
  }

  console.log(footnotes);
  console.log(links);

  // const fs = require('fs');
  // fs.writeFile('./footnotes.json', JSON.stringify(footnotes), err => err ? console.log(err) : null);

  await browser.close();
})();
Maybe something like this:
const puppeteer = require('puppeteer');

const url = 'https://www.churchofjesuschrist.org/study/scriptures/bofm/1-ne/11?lang=eng';
const selector = '.study-note-ref';

(async function main() {
  const browser = await puppeteer.launch({ headless: true });
  const [page] = await browser.pages();
  await page.goto(url);

  const data = {};
  for (const footnote of await page.$$(selector)) {
    const [href, text] = await page.evaluate(
      (a) => {
        a.click();
        return [a.getAttribute('href').replace('/#note', ''), a.innerText.slice(1)];
      },
      footnote
    );
    data[href] = { text };

    const header = await page.waitForXPath(`//aside/div/header/span[text()="${href} ${text}"]`);
    data[href].links = await page.evaluate(
      (span) => {
        const aside = span.closest('aside');
        return [...aside.querySelectorAll('a[href]')].map(
          a => ({ [a.innerText]: a.href })
        );
      },
      header
    );
    console.log(`Done: ${href} ${text}`);
  }

  console.log(JSON.stringify(data, null, 2));
  await browser.close();
})();
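The waitForXPath call is doing the synchronization work here: it waits for a sidebar header whose text matches the footnote just clicked (`${href} ${text}`), which guarantees the links scraped from the aside belong to that footnote rather than to a previously opened one.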
Part of the output:
{
  "1a": {
    "text": "pondering",
    "links": [
      {
        "D&C 76:19": "https://www.churchofjesuschrist.org/study/scriptures/dc-testament/dc/76.19?lang=eng#p19"
      },
      {
        "TG Meditation": "https://www.churchofjesuschrist.org/study/scriptures/tg/meditation?lang=eng"
      },
      {
        "Doctrine and Covenants 76:19": "https://www.churchofjesuschrist.org/study/scriptures/dc-testament/dc/76.19?lang=eng#p19#19"
      },
      {
        "Meditation, Meditate": "https://www.churchofjesuschrist.org/study/scriptures/tg/meditation?lang=eng"
      }
    ]
  },
}
Every time I run this script, it times out.
Does setDefaultNavigationTimeout actually prevent timeouts?
There are about 26 URLs I'm going through, and each page has a large number of images. I can't imagine Puppeteer can't handle these pages just because of heavy images?
const url = 'test.com';
const jsonReturn = [];

async function runScraper() {
  const browser = await puppeteer.launch(prodConfig);
  const page = await browser.newPage({
    timeout: 0
  });
  page.setDefaultNavigationTimeout(0);

  await page.goto(url, { waitUntil: 'domcontentloaded' });
  await page.waitForSelector('.featured-shows-featured-show');

  let featuredShowsURLs = await page.$$eval('.featured-shows-featured-show > a', (links) => {
    return links.map(link => {
      return link.href;
    });
  });
  featuredShowsURLs = _.uniq(featuredShowsURLs);

  for (const featuredShowsURL of featuredShowsURLs) {
    const page = await browser.newPage({
      timeout: 0
    });
    try {
      await page.goto(featuredShowsURL);
      await page.waitForSelector('.show-title');
    } catch (e) {
      featuredShowsURL;
      debugger;
    }

    const showTitle = await findAndReturnSelectorText('.show-title', page);
    const showDates = await findAndReturnSelectorText('.show-dates', page);
    const showLocation = await findAndReturnSelectorText('.show-location', page);
    const showGallery = await findAndReturnSelectorText('.entity-link', page);
    const showDetail = await findAndReturnSelectorText('.show-press-release', page);

    const newItem = {
      showTitle,
      showDates,
      showLocation,
      showGallery,
      showDetail,
    };

    const id = hash(newItem);
    jsonReturn.push({
      ...newItem,
      id
    });
  }

  await browser.close();
}

runScraper();
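As for the question itself: setDefaultNavigationTimeout only covers navigation calls such as page.goto and page.waitForNavigation. page.waitForSelector has its own default timeout of 30 seconds, governed by page.setDefaultTimeout or a per-call option, and that is likely what keeps firing here. Also, browser.newPage() accepts no options, so the { timeout: 0 } passed to it is silently ignored. A sketch of the relevant settings:

const page = await browser.newPage();  // newPage() takes no options object
page.setDefaultNavigationTimeout(0);   // goto / waitForNavigation: no timeout
page.setDefaultTimeout(0);             // waitForSelector and friends: no timeout

await page.goto(url, { waitUntil: 'domcontentloaded' });

// Or override a single call instead of disabling timeouts globally:
await page.waitForSelector('.show-title', { timeout: 120000 });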