I am having trouble recording an element on a page after checking if it exists. The block of code I'm referring to is under the "// phone" comment.
This code loops through each section (section of sections) on the page and records "company" and "phone". "Phone" may not be present in some sections, so I figured I'd check whether it exists with an if statement. Instead, this throws: Error: failed to find element matching selector ".mn-contact-phone". How do I solve this?
(async () => {
  try {
    const browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();
    // loop through pages
    for (let pg = 1; pg < 5; pg++) {
      await page.goto("webpage");
      // record number of sections
      const sections = await page.$$("#mn-members-listings > div");
      // loop through each section
      for (const section of sections) {
        // company
        let company = await section.$eval(
          "div.mn-searchlisting-title",
          comp => comp.innerText
        );
        // phone --> THIS IF/ELSE THROWS AN ERROR
        if (section.$(".mn-contact-phone").length > 0) {
          let phone = await section.$eval(".mn-contact-phone", phn => phn.innerText);
        } else {
          let phone = "";
        }
        console.log(`Company = ${company} | Phone = ${phone}`);
      }
    }
    await browser.close();
  } catch (error) {
    console.log(`Our error is = ${error}`);
  }
})();
From the Puppeteer docs:
The method runs document.querySelector within the page. If no element
matches the selector, the return value resolves to null.
So there are two problems:
1) null doesn't have a length.
2) ElementHandle.$ returns a promise, and a promise doesn't have a length either.
Change the condition to:
if (await section.$(".mn-contact-phone"))
or, if there can be multiple elements:
if ((await section.$$(".mn-contact-phone")).length > 0)
Note the extra parentheses around the await: without them, .length is read off the pending promise rather than the resolved array.
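There is a second bug hiding in the snippet: phone is declared with let inside the if/else blocks, so it is out of scope by the time console.log runs. A minimal sketch of a corrected loop body, reusing the question's selectors:

// phone: declare it outside the blocks so console.log can see it
let phone = "";
if (await section.$(".mn-contact-phone")) {
  phone = await section.$eval(".mn-contact-phone", phn => phn.innerText);
}
console.log(`Company = ${company} | Phone = ${phone}`);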
I am having an issue retrieving an iframe within an iframe, and I am not exactly sure how to get the result I am looking for. I am able to retrieve the parent iframe just fine, but when I try to choose the child, the click method does not find its selector. How would I go about solving this? I am quite new to Puppeteer.
// Get Email
await click(contentBuilderFrame, getEmail)
await page.waitFor(200);
await click(contentBuilderFrame, '#index-entry > div > div > div.active > div > div > div.contentproperties-header > div.contentproperties-buttons > div > div > div > a');
await page.waitFor(200);
const container = await findFrame(page, "EmailAppContainer");
await page.waitFor(200);
// Will not find it
const child = await findFrame(container, "cloud/tools/SSO.aspx?legacy=1&env=default&appId=6DF293A1-4FDC-41CD-AA08-89EFC793C33C&deepLink=%2f%23%2femails");
// Timeout because it can't find the selector due to not finding the frame
await click(child, '#sendflow-step-content')
findFrame Method:
async function findFrame(page, urlPart) {
  const mainFrame = page.mainFrame();
  const contentFrames = await getContentFrames(mainFrame);
  let frame = null;
  for (const contentFrame of contentFrames) {
    if (contentFrame.url().includes(urlPart)) {
      frame = contentFrame;
    }
  }
  return frame;
}
getContentFrames:
async function getContentFrames(frame) {
  const contentFrames = [];
  const iframes = await frame.$$("iframe");
  for (const iframe of iframes) {
    const contentFrame = await iframe.contentFrame();
    if (contentFrame != null) {
      contentFrames.push(contentFrame);
      let subContentFrames = await getContentFrames(contentFrame);
      if (subContentFrames.length > 0) {
        contentFrames.push(...subContentFrames);
      }
    }
  }
  //console.log(contentFrames);
  return contentFrames;
}
My understanding is that getContentFrames collects all the frames (I made sure that is the case by logging its output), but I'm unsure how to proceed from there.
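One possibility worth ruling out, purely as an assumption: the child frame may not have attached yet when findFrame runs. A minimal sketch that retries the lookup, reusing the findFrame helper above (the retry count and delay are arbitrary):

async function waitForFrame(page, urlPart, retries = 20, delayMs = 250) {
  for (let attempt = 0; attempt < retries; attempt++) {
    const frame = await findFrame(page, urlPart);
    if (frame) return frame;
    // frame not attached yet; pause and try again
    await new Promise(resolve => setTimeout(resolve, delayMs));
  }
  return null;
}

Since getContentFrames already recurses into nested frames, the search can start from the top-level page, e.g. const child = await waitForFrame(page, "SSO.aspx");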
I am very new to Puppeteer. I started yesterday, and I'm trying to make a program that flips through URLs that store player IDs incrementally, one after the other, and saves each player's stats using neDB. There are thousands of links to flip through, and I have found that if I use a for loop my computer basically crashes because 1,000 Chromiums try to open all at the same time. Is there a better, or proper, way to do this? Any advice would be appreciated.
const puppeteer = require('puppeteer');
const Datastore = require('nedb');
const database = new Datastore('database.db');
database.loadDatabase();

async function scrapeProduct(url) {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto(url);
  let attributes = [];
  // Getting player's name
  const [name] = await page.$x('//*[@id="ctl00_ctl00_ctl00_Main_Main_name"]');
  const txt = await name.getProperty('innerText');
  const playerName = await txt.jsonValue();
  attributes.push(playerName);
  // Getting all 12 individual stats of the player
  for (let i = 1; i < 13; i++) {
    let vLink = '//*[@id="ctl00_ctl00_ctl00_Main_Main_SectionTabBox"]/div/div/div/div[1]/table/tbody/tr[' + i + ']/td[2]';
    const [e1] = await page.$x(vLink);
    const val = await e1.getProperty('innerText');
    const skillVal = await val.jsonValue();
    attributes.push(skillVal);
  }
  // creating a player object to store the data how I want
  // (I know this is probably ugly code and could be done in a much better way)
  let player = {
    Name: attributes[0],
    Athleticism: attributes[1],
    Speed: attributes[2],
    Durability: attributes[3],
    Work_Ethic: attributes[4],
    Stamina: attributes[5],
    Strength: attributes[6],
    Blocking: attributes[7],
    Tackling: attributes[8],
    Hands: attributes[9],
    Game_Instinct: attributes[10],
    Elusiveness: attributes[11],
    Technique: attributes[12],
  };
  database.insert(player);
  await browser.close();
}

// For loop to loop through 1000 player links... Url.com is swapped in here
// because the actual url is ridiculously long and not important.
for (let i = 0; i <= 1000; i++) {
  let link = 'https://url.com/?id=' + i + '&section=Ratings';
  scrapeProduct(link);
  console.log("Player #" + i + " scraped");
}
The easiest tweak would be to wait for each link to finish before starting the next:
(async () => {
  for (let i = 0; i <= 1000; i++) {
    let link = 'https://url.com/?id=' + i + '&section=Ratings';
    await scrapeProduct(link);
    console.log("Player #" + i + " scraped");
  }
})();
You could also run only as many scrapes concurrently as your computer can handle. This will require more resources, but will let the process finish faster. Figure out the limit you want, then do something like:
let i = 0;
const getNextLink = () => {
  if (i > 1000) return;
  let link = 'https://url.com/?id=' + i + '&section=Ratings';
  i++;
  return scrapeProduct(link)
    .then(getNextLink)
    .catch(handleErrors); // handleErrors = your own error handler
};

Promise.all(Array.from(
  { length: 4 }, // allow 4 to run concurrently
  getNextLink
))
.then(() => {
  // all done
});
The above allows for 4 calls of scrapeProduct to be active at any one time - change the number as needed.
If you think the slowdown comes from reopening/closing the browser on each run, move browser to the global scope and initialize it to null. Then create an init function with something like:
let browser = null; // global scope, as described above

async function init() {
  if (!browser)
    browser = await puppeteer.launch();
}
Allow pages to be passed to your scrapeProduct function, so async function scrapeProduct(url) becomes async function scrapeProduct(url, page). Replace await browser.close() with await page.close(). Now your loop will look like this:
// For loop to loop through 1000 player links... Url.com is swapped in here
// because the actual url is ridiculously long and not important.
await init();
for (let i = 0; i <= 1000; i++) {
  let link = 'https://url.com/?id=' + i + '&section=Ratings';
  let page = await browser.newPage();
  scrapeProduct(link, page);
  console.log("Player #" + i + " scraped");
}
await browser.close();
If you wanted to limit the number of pages the browser runs concurrently, you could create a function to do that:
async function getTotalPages() {
  const allPages = await browser.pages();
  return allPages.length;
}

async function newPage() {
  const MAX_PAGES = 5;
  await new Promise(resolve => {
    // check once a second how many pages are open
    const interval = setInterval(async () => {
      let totalPages = await getTotalPages();
      if (totalPages < MAX_PAGES) {
        clearInterval(interval);
        resolve();
      }
    }, 1000);
  });
  return await browser.newPage();
}
If you did this, in your loop you'd replace let page = await browser.newPage() with let page = await newPage().
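Putting those pieces together, a minimal sketch of the revised scrapeProduct (the scraping body stays exactly as in the question):

async function scrapeProduct(url, page) {
  await page.goto(url);
  // ...same name/stats scraping and database.insert(player) as above...
  await page.close(); // close the tab; the shared browser stays open
}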
I recently made a quick web scraper using Puppeteer, since it targets a JS website, and I want it to send the output I get in my console to Discord. The thing is, I always get e.g. "price not defined" when the script tries to send the webhook to Discord. Where should I put my const embed for it to work properly? Here is my code; thank you all for your help in advance.
const puppeteer = require('puppeteer-extra');
// add stealth plugin and use defaults (all evasion techniques)
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const { Webhook, MessageBuilder } = require('discord-webhook-node');
const hook = new Webhook("https://discordapp.com/api/webhooks/733332015654371361/9VGAVW-BNlf3G4j3L6GhAIDni17yNIVf9gfmf_TNTQafP40LqYvRwhaYZzL_b58kpkkl");
const url = "https://www.asos.com/fr/nike/nike-air-max-270-baskets-triple-noir-ah8050-005/prd/12490103?clr=noir-triple&colourwayid=16391201&SearchQuery=nike air max 270";

puppeteer.use(StealthPlugin());

async function ConfigureBrowser() {
  const browser = await puppeteer.launch({ headless: true }); // for testing, disable headless mode
  const page = await browser.newPage();
  await page.setViewport({ width: 1000, height: 926 });
  await page.goto(url, { waitUntil: 'networkidle2' });
  return page;
}
async function Scrape(page) {
  // await page.reload();
  console.log("start evaluate javascript");
  /** @type {string[]} */
  var productINFO = await page.evaluate(() => {
    var div = document.querySelectorAll('.core-product-container');
    console.log(div); // console.log inside evaluate shows on the browser console, not the node console
    const productnames = [];
    div.forEach(element => {
      var titleelem = element.querySelector('#aside-content > div.product-hero > h1');
      if (titleelem != null) {
        productnames.push(titleelem.textContent.trim());
      }
    });
    const productprice = [];
    div.forEach(element => {
      var price = element.querySelector('[class="current-price"]');
      if (price != null) {
        productprice.push(price.textContent.trim());
      }
    });
    const productsizes = [];
    div.forEach(element => {
      var sizes = element.querySelector('[data-id="sizeSelect"]');
      if (sizes != null) {
        productsizes.push(sizes.textContent.trim());
      }
    });
    return [productnames, productprice, productsizes];
  });
  return productINFO;
  // NOTE: nothing below this return ever runs
  // const embed = new MessageBuilder()
  //   .setTitle(productnames)
  //   .setURL(url)
  //   .addField('Prix', productprice, true)
  //   .addField('sizes', productsizes, true)
  //   .setColor(8008905)
  //   // .setThumbnail({image})
  //   .setDescription('Checked')
  //   //.setImage(image)
  //   .setFooter('', 'https://cdn.discordapp.com/attachments/720763827658162260/730786942316183603/image0.jpg')
  // hook.send(embed);
  discoord(productINFO);
  console.log(productINFO);
  // browser.close()
}
async function Monitor() {
  let page = await ConfigureBrowser();
  await Scrape(page);
  // console.log(productINFO);
}

Monitor();
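One way to make this work, sketched under the assumption that the commented-out MessageBuilder block reflects the intended embed: the names productnames, productprice and productsizes only exist inside page.evaluate, which is why they come back undefined where the webhook is sent. Since Scrape returns [productnames, productprice, productsizes], destructure that return value in Monitor and build the embed there:

async function Monitor() {
  const page = await ConfigureBrowser();
  const [productnames, productprice, productsizes] = await Scrape(page);
  const embed = new MessageBuilder()
    .setTitle(productnames.join(', ')) // fields must be strings, so join the arrays
    .setURL(url)
    .addField('Prix', productprice.join(', '), true)
    .addField('sizes', productsizes.join(', '), true)
    .setColor(8008905)
    .setDescription('Checked');
  await hook.send(embed);
}
Monitor();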
I'm trying to scrape an old website built with tr, br and iframe tags. Everything was going well until I tried to extract data from an iframe (see the iFrameScraping setTimeout below), but the clicking is too fast for the content to show before I scrape it. Would anyone have an idea of how to click, wait for the content to show, scrape it, then continue?
const newResult = await page.evaluate(async (resultLength) => {
  const elements = document.getElementsByClassName('class');
  for (i = 0; i < resultLength; i++) {
    const companyArray = elements[i].innerHTML.split('<br>');
    let companyStreet,
        companyPostalCode;
    // Get company name
    const memberNumber = elements[i].getElementsByTagName('a')[0].getAttribute('href').match(/[0-9]{1,5}/)[0];
    const companyName = await companyArray[0].replace(/<a[^>]*><span[^>]*><\/span>/, '').replace(/<\/a>/, '');
    const companyNumber = await companyArray[0].match(/[0-9]{6,8}/) ? companyArray[0].match(/[0-9]{6,8}/)[0] : '';
    // Get town name
    const companyTown = await companyArray[1].replace('"', '');
    // Get region name
    const companyRegion = await companyArray[2].replace(/<span[^>]*>Some text:<\/span>/, '');
    // Get phone number
    const telNumber = await elements[i].innerHTML.substring(elements[i].innerHTML.lastIndexOf('</span>')).replace('</span>', '').replace('<br>', '');
    const iFrameScraping = await setTimeout(async ({elements, i}) => {
      elements[i].getElementsByTagName('a')[0].click();
      const iFrameContent = await document.getElementById('some-id').contentWindow.document.getElementById('lblAdresse').innerHTML.split('<br>');
      companyStreet = iFrameContent[0].replace('"', '');
      companyPostalCode = iFrameContent[2].replace('"', '');
    }, 2000, {elements, i});
    console.log(companyStreet, companyPostalCode);
  }
}, pageSearchResults.length);
I fixed my issues after a while, so I'll share my solution.
I had to stop getting all the data with a loop inside evaluate, because it goes too fast and creates a race condition. Instead I used a combination of page.$$ coupled with a for…of loop. Note that an ES6 forEach causes a race condition as well, since Puppeteer does not wait for the callbacks to finish before continuing its execution.
Here is the example from my updated code:
const companies = await page.$$('.repmbr_result_item');
const companiesLinks = await page.$$('.repmbr_result_item a');

for (const company of companies) {
  const companyEl = await page.evaluate(el => el.innerHTML, company);
  const companyElArray = companyEl.split('<br>');
  // ...extract the fields from companyElArray as before...
}
How can I check if an element exists in Selenium?
I have tried:
browser.driver.findElements(by.id('my-id'))
but it does not seem to work.
Use isElementPresent
browser.driver.isElementPresent(by.id('my-id'))
or isPresent
element(by.id('my-id')).isPresent()
The problem is that looking for an element that does not exist throws an exception. You are on the right track with findElements, as it will not throw an error if it cannot find the element. The problem is that you're left with a list of the elements found, but you never check that the list contains at least one element (i.e. that it found one).
public boolean exists(By by) {
  return !driver.findElements(by).isEmpty();
}
This function returns true if the element exists, false otherwise. It's clear from this method how you could modify it to suit your needs.
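The same idea in the JavaScript bindings the question uses; a sketch, assuming Protractor's browser.driver and by globals are in scope:

// resolves to true if at least one matching element is found, without throwing
async function exists(locator) {
  const found = await browser.driver.findElements(locator);
  return found.length > 0;
}

// usage: if (await exists(by.id('my-id'))) { ... }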
webdriver.js example
In this example I am waiting for a modal to exist, because it's a modal that fades in. See the line that reads:
const doesModalFadeInExist = await this.driver.executeScript('return document.getElementsByClassName("modal fade in")');
You can put this line of code in a wait that waits for the element array's length to be >= 1; that way you know the element is there after checking inside the wait. We can also check opacity at the same time: once the array length is >= 1 and opacity === '1', we exit the wait and continue the test.
async waitForModalFadeIn() {
  try {
    let isVisible;
    let opacityReturned;
    let isModalFadeInFound;
    const dialog = await this.driver.wait(until.elementLocated(this.baseselector.MODAL_FADE_IN));
    await this.driver.wait(async () => {
      const doesModalFadeInExist = await this.driver.executeScript('return document.getElementsByClassName("modal fade in")');
      isModalFadeInFound = doesModalFadeInExist;
      const bool = await this.isDisplayed(this.baseselector.MODAL_FADE_IN);
      isVisible = bool;
      const opacity = await dialog.getCssValue('opacity');
      opacityReturned = opacity;
      return isVisible === true && isModalFadeInFound.length > 0 && opacity === '1';
    }, 4000);
    return opacityReturned === '1' && isVisible === true && isModalFadeInFound.length > 0;
  } catch (err) {
    console.log(`Function waitForModalFadeIn: Failed to open modal${err}`);
    return null;
  }
}
Example Test
it('Trigger "unmetered" modals in the technical spec section', async () => {
  await wHost.visit(page = '/web-hosting');
  const table = await driver.wait(until.elementLocated(wHost.selector.COMPETITOR_TABLE), this.pageLoadTimeOut);
  await driver.executeScript('arguments[0].scrollIntoView()', table);
  const modals = await table.findElements(wHost.selector.UNLIMITED_MODAL);
  for (let i = 0; i < modals.length; i++) {
    await modals[i].click();
    assert.isTrue(await common.waitForModalFadeIn());
    assert.isTrue(await common.closeModal());
  }
});
This is what you are looking for. This function takes an id as a parameter; if the element is found, it resolves to a JSON object with status = true and the element you were looking for.
async function ElementExistsID(element) {
  try {
    let ElementResult = await driver.findElement(webdriver.By.id(element));
    return { "element": ElementResult, "status": true };
  } catch (error) {
    log.warn(error);
    return { "status": false };
  }
}
You can use it like this
let ElementIamLookingFor = await ElementExistsID(IDOfTheElement);
if (ElementIamLookingFor.status) {
  console.log("Element exists");
} else {
  throw new Error("Element not found");
}
After many attempts, the following worked for me:
function doIfPresent(elementId, callback, next) {
  var elem = by.id(elementId);
  browser.driver.isElementPresent(elem).then(function (present) {
    if (present) {
      callback(element(elem)); // run the caller's action on the element
    }
    next();
  });
}
e.g.
doIfPresent('my-button', function (element) { element.click(); }, function () {
  // your code continues here
});
I don't get why we'd need to use futures here, though.
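The futures (promises) are there because isElementPresent has to round-trip to the browser, so its result is only available asynchronously. With async/await the same logic reads more directly. A sketch, assuming Protractor's browser.driver, by and element globals:

async function clickIfPresent(elementId) {
  const elem = by.id(elementId);
  // isElementPresent resolves asynchronously, hence the await
  if (await browser.driver.isElementPresent(elem)) {
    await element(elem).click();
  }
  // your code continues here
}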