If I go to https://investor.vanguard.com/mutual-funds/profile/VMMXX and execute document.querySelector("[data-ng-if='productSummaryTitle']").innerText from console, I get what I am expecting: Product summary.
But when I try to do the same with puppeteer, I get UnhandledPromiseRejectionWarning: Error: Evaluation failed: TypeError: Cannot read property 'innerText' of null at __puppeteer_evaluation_script__:3:83. What am I missing?
// Repro script from the question: loads the fund profile page and reads the
// text of the element matching [data-ng-if='productSummaryTitle'].
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch({ headless: false })
const page = await browser.newPage()
await page.goto('https://investor.vanguard.com/mutual-funds/profile/VMMXX')
// The callback runs in the page once goto() has resolved. If no element
// matches the selector at that moment, querySelector returns null and the
// .innerText access throws the TypeError quoted in the question.
const result = await page.evaluate(() => {
let myText = document.querySelector("[data-ng-if='productSummaryTitle']").innerText
return {
myText
}
})
console.log(result)
// NOTE(review): the value returned by close() is not awaited here.
browser.close()
})()
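The element is most likely rendered by client-side script after the initial page load, so it may not exist yet when page.evaluate() runs. You could wait for that selector first: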
// Wait until the element exists in the DOM before reading it. The selector
// itself contains single quotes, so the string literal is delimited with
// double quotes (nesting the same quote character ends the string early).
const element = await page.waitForSelector("[data-ng-if='productSummaryTitle']");
// Read the rendered text of the matched element inside the page.
const text = await element.evaluate((el) => el.innerText);
const puppeteer = require('puppeteer');

// Opens the fund profile page, waits for the product-summary title element to
// appear, and logs its text.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://investor.vanguard.com/mutual-funds/profile/VMMXX');
    // waitForSelector resolves with the element handle once it is in the DOM.
    const element = await page.waitForSelector("[data-ng-if='productSummaryTitle']");
    const text = await element.evaluate((el) => el.innerText);
    console.log(text);
  } finally {
    // Close the browser even when navigation or the wait fails, so no
    // Chromium process is left behind.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
Related
I need to go to the UPS site, type in a tracking number, then get the data about that tracking number. When I set headless to false the timeouts disappear; when I set it to true, the script either times out or waits forever. I tried a number of different methods (any I could find) to rearrange the promises, but nothing helped.
const puppeteer = require('puppeteer');
var fs = require("fs").promises;
var Page = {};
var Browser = {};
// Dumps the current page HTML to pagecontent_afterClick.txt, logs how many
// elements match #st_App_PkgStsMonthNum, reads four fields from the page and
// then closes the browser. Works on the module-level Page and Browser
// variables rather than on parameters.
async function printPage(){
console.log("printPage()");
await fs.writeFile("pagecontent_afterClick.txt", await Page.content());
const count = await Page.$$eval('#st_App_PkgStsMonthNum', divs => divs.length);
console.log(count);
// The four values below are read but neither logged nor returned in this block.
const deliveredOn = await Page.$eval('#st_App_PkgStsMonthNum', el => el.textContent);
const deliveredToAddress = await Page.$eval('#stApp_txtAddress', el => el.textContent);
const deliveredToCountry = await Page.$eval('#stApp_txtCountry', el => el.textContent);
const deliveredBy = await Page.$eval('#stApp_valReceivedBy', el => el.textContent);
console.log("browser close");
await Browser.close();
}
// Launches a non-headless browser, types a tracking number into
// #stApp_trackingNumber, clicks #stApp_btnTrack and then calls printPage().
async function start(){
console.log("start");
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
// Publish the page and browser through the module-level variables that
// printPage() reads.
Page = page
Browser = browser
// NOTE(review): no page.goto() call is visible before this wait; presumably
// the navigation line is missing from this excerpt — confirm.
const navigationPromise = page.waitForNavigation({waitUntil: 'load'})
await navigationPromise;
await fs.writeFile("pagecontent_B4.txt", await page.content());
await page.type("#stApp_trackingNumber", "1Z0F1R740320306827");
// Read the value back and log it to confirm what was typed.
const searchValue = await page.$eval('#stApp_trackingNumber', el => el.value);
console.log(searchValue);
// Logs how many elements match the track-button selector.
const count = await page.$$eval('button#stApp_btnTrack.ups-cta.ups-cta_primary', divs => divs.length);
console.log(count);
try {
console.log("end waiting");
// Start the click and the navigation wait together.
await Promise.all([
page.click('#stApp_btnTrack'),
page.waitForNavigation({waitUntil: 'networkidle2'})
]);
// NOTE(review): printPage() is not awaited, so a rejection from it is not
// caught by this try/catch.
printPage();
} catch (e) {
// Only timeouts are handled; any other error is dropped without a log.
if (e instanceof puppeteer.errors.TimeoutError) {
console.log("timeout");
printPage();
}
}
}
start();
I got the following error in a JavaScript program with puppeteer: UnhandledPromiseRejectionWarning: TypeError: Cannot read property 'getProperty' of undefined
I've checked several times but the element exists and can be found through the XPath. Here's my code
const puppeteer = require('puppeteer');
/**
 * Question code: logs in on `url` with the given credentials and tries to
 * print the text of a table cell on the page shown after login.
 *
 * Only the XPath attribute syntax has been repaired here ("[@id=...]"; the
 * "[#id=...]" form is not valid XPath). The failure the question asks about
 * is deliberately left in place and annotated below.
 *
 * @param {string} url - Address of the login page.
 * @param {string} username - Text typed into the username field.
 * @param {string} password - Text typed into the password field.
 */
async function scraping(url, username, password) {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto(url, { waitUntil: 'networkidle0' });
  const [loginTitle] = await page.$x('/html/body/div[1]/div[2]/div/div/div[1]/h3');
  const loginTitleTxt = await loginTitle.getProperty('textContent')
  const loginTitleRawTxt = await loginTitleTxt.jsonValue();
  const [usernameInputField] = await page.$x('//*[@id="form-username"]/input');
  // NOTE(review): neither type() call is awaited, so the click below can run
  // before the typing has finished.
  usernameInputField.type(username);
  const [passwordInputField] = await page.$x('//*[@id="form-password"]/input');
  passwordInputField.type(password);
  // Click the submit button from inside the page.
  await page.evaluate(() => {
    document.evaluate('/html/body/div[1]/div[2]/div/div/div[2]/form/div[3]/div[1]/button', document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue.click();
  });
  // Here is the error
  // NOTE(review): when the XPath matches nothing, the destructuring leaves
  // planTitle undefined and the next line throws the reported "Cannot read
  // property 'getProperty' of undefined". Presumably the click starts a
  // navigation that has not finished when this query runs — confirm, and wait
  // for the navigation or for the element before querying.
  const [planTitle] = await page.$x('/html/body/center[1]/p[1]/table/tbody/tr[2]/td');
  const planTitleTxt = await planTitle.getProperty('textContent')
  const planTitleRawTxt = await planTitleTxt.jsonValue();
  console.log(planTitleRawTxt);
  browser.close();
}
scraping('https://hereisttheurl.net/abc.html', 'aa.bb', '123');
I was trying to scrape a thumbnail image from YouTube with its XPath, but I am getting undefined for the src. I can't figure out what is causing this. I already tried using both the XPath and the full XPath, but that didn't help. Any help is appreciated. Thanks in advance.
const puppeteer = require('puppeteer');
/**
 * Question code: opens `url` and tries to read the `src` of the first
 * thumbnail found by XPath.
 *
 * Only the XPath attribute syntax has been repaired ("[@id=...]"; "[#id=...]"
 * is not valid XPath). The XPath still stops at the <ytd-thumbnail> element
 * rather than at an <img>, which is the behaviour the question is about
 * (the reported result is `undefined`).
 *
 * @param {string} url - Page to open.
 * @returns {Promise<{thumbnailURL1: *}>} Object holding whatever `src` resolved to.
 */
async function scrapeChannel1(url) {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  // timeout: 0 disables the navigation timeout.
  await page.goto(url, {
    timeout: 0
  });
  const [el2] = await page.$x('//*[@id="dismissible"]/ytd-thumbnail');
  const src1 = await el2.getProperty('src');
  const thumbnailURL1 = await src1.jsonValue();
  browser.close();
  console.log({
    thumbnailURL1
  })
  return {
    thumbnailURL1
  }
}
scrapeChannel1('https://www.youtube.com/')
The <img> you are looking for is placed a bit deeper in the DOM, at '//*[@id="dismissible"]/ytd-thumbnail/a/yt-img-shadow/img' (so you should add /a/yt-img-shadow/img at the end of your XPath expression).
Note, you have more powerful tools in puppeteer than .getProperty('src') to retrieve DOM element properties.
E.g. page.$eval:
// CSS path from the thumbnail element down to its inner image node.
const selector = 'ytd-thumbnail > a > yt-img-shadow > #img';
// $eval runs the callback in the page against the first element that matches.
const imageSrc = await page.$eval(selector, (img) => img.src);
// returns: https://i.ytimg.com/vi/{youtube_id}/hqdefault.jpg...
Or if you want all images use page.$$eval:
// $$eval hands every matching element to the callback as one array.
const imageSrcs = await page.$$eval(selector, (imgs) => imgs.map((img) => img.src));
If you want to get images src from YouTube, you need to scroll video thumbnails into view like in the code below (also check it on the online IDE):
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
const mainPageUrl = "https://www.youtube.com";
/**
 * Scrolls every element matching `scrollElements` into view, one at a time,
 * and repeats for elements that appear after scrolling, until the match count
 * stops growing or more than `maxElements` elements have been scrolled.
 *
 * @param {object} page - Puppeteer page; only `page.evaluate` is used.
 * @param {string} scrollElements - CSS selector of the elements to scroll to.
 * @param {object} [options]
 * @param {number} [options.stepDelayMs=200] - Pause before each scroll step.
 * @param {number} [options.settleDelayMs=5000] - Pause after each pass, giving
 *   the page time to append more elements.
 * @param {number} [options.maxElements=100] - Stop once more than this many
 *   elements have been scrolled; pass `Infinity` to scroll until the page
 *   stops growing.
 * @returns {Promise<void>}
 */
async function scrollPage(
  page,
  scrollElements,
  { stepDelayMs = 200, settleDelayMs = 5000, maxElements = 100 } = {}
) {
  // page.waitForTimeout() is no longer available in current Puppeteer
  // releases, so pause with a plain timer instead.
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  const countMatches = () =>
    page.evaluate((selector) => document.querySelectorAll(selector).length, scrollElements);

  // Index of the next element to scroll to; kept across passes so elements
  // that were already visited are not scrolled again.
  let currentElement = 0;
  while (true) {
    const elementsLength = await countMatches();
    for (; currentElement < elementsLength; currentElement++) {
      await sleep(stepDelayMs);
      await page.evaluate(
        (index, selector) => {
          // The node may have been removed since it was counted.
          document.querySelectorAll(selector)[index]?.scrollIntoView();
        },
        currentElement,
        scrollElements
      );
    }
    await sleep(settleDelayMs);
    const newElementsLength = await countMatches();
    if (newElementsLength === elementsLength || currentElement > maxElements) break;
  }
}
/**
 * Opens the YouTube main page, scrolls the video thumbnails into view so
 * their images get a `src`, and collects those `src` values.
 *
 * @returns {Promise<string[]>} The non-empty `src` attributes of the
 *   thumbnail images.
 */
async function getThumbnails() {
  const browser = await puppeteer.launch({
    headless: false,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });
  try {
    const page = await browser.newPage();
    page.setDefaultNavigationTimeout(60000);
    await page.goto(mainPageUrl);
    await page.waitForSelector("#contents");
    const scrollElements = "a#thumbnail";
    await scrollPage(page, scrollElements);
    // Fixed pause before reading the images. A plain timer is used because
    // page.waitForTimeout() is no longer available in current Puppeteer.
    await new Promise((resolve) => setTimeout(resolve, 10000));
    // Images without a src attribute yield null/empty and are dropped.
    return await page.$$eval("a#thumbnail #img", (els) =>
      els.map((el) => el.getAttribute("src")).filter(Boolean)
    );
  } finally {
    // Close the browser even when a step above fails.
    await browser.close();
  }
}
getThumbnails().then(console.log).catch(console.error);
Output
[
"https://i.ytimg.com/vi/02oeySm1CJA/hq720.jpg?sqp=-oaymwEcCNAFEJQDSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBmrYMHESpY_f1oTNx00iuR3tNeCQ",
"https://i.ytimg.com/vi/RMo2haIPYBM/hq720_live.jpg?sqp=CNifxJcG-oaymwEcCNAFEJQDSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBw4ogzR0709SqbttRdEzfL-aTdgQ",
"https://i.ytimg.com/vi/qJFFp_ta1Zk/hqdefault.jpg?sqp=-oaymwEcCOADEI4CSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBJ-44OFgBUuVUYWBVh3Yi3hQgwIg",
"https://i.ytimg.com/vi/OZoTjoN-Sn0/hqdefault.jpg?sqp=-oaymwEcCOADEI4CSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLCOeGTCnlT4U0wV1SNclkmFUEHLaA",
"https://i.ytimg.com/vi/L8cH2gI67uk/hqdefault.jpg?sqp=-oaymwEcCOADEI4CSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLAuvZ3khIjpvAVTGjmR9FDxQrPIgQ",
"https://i.ytimg.com/vi/6rUyVKyJnGY/hq720.jpg?sqp=-oaymwEcCNAFEJQDSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLCifsTG4MlA3mf8CcJDkfKdWaZkaA",
"https://i.ytimg.com/vi/xpaURivPZFk/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLA5oFDDsVzbV3tUqyfogfuf3LPahQ",
"https://i.ytimg.com/vi/MsR76PyVdUs/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLAEBYGNvif-7LWx2mqW4G9o-OUhEQ",
"https://i.ytimg.com/vi/liasQRRVt5w/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLAUcMpyKY0GhmNAHHtP_cDkAp18DQ",
"https://i.ytimg.com/vi/Dr5IqlTLMDM/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLBOSUi6mgjdD5a-Jx8Ns24SlexB1g",
"https://i.ytimg.com/vi/E8kit8xJKdI/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLDDStn95G7ei5DTusGXE4RimzdLUw",
"https://i.ytimg.com/vi/SqEaahOmLHU/hq720_2.jpg?sqp=-oaymwEdCM0CENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLBDcWLCklNxEAuT1ZvSTKrIplGOag",
...and other results
]
You can read more about scraping YouTube search from my blog post Web scraping YouTube search video results with Nodejs.
So, I am trying to scrape a couple of search engines with a couple of search phrases using Playwright.
Running the script with one query is working.
Working:
const { chromium } = require('playwright');

// Searches DuckDuckGo and Yandex for a single keyword and logs one result
// text from each engine.
(async () => {
  const browser = await chromium.launch({ headless: false, slowMo: 250 });
  try {
    const context = await browser.newContext();
    const page = await context.newPage();
    const keyWord = 'Arsenal';

    // XPath attribute tests are written with "@" ([@name="q"]).
    await page.goto('https://duckduckgo.com/');
    await page.fill('//input[@name="q"]', keyWord);
    await page.keyboard.press('Enter');
    const getOne = '(//h2[@class="result__title"])[9]';
    // waitForSelector resolves with the element handle, so no second lookup
    // is needed.
    const pushOne = await page.waitForSelector(getOne);
    const One = await pushOne.evaluate((element) => element.innerText);
    console.log(One);

    await page.goto('https://yandex.com/');
    await page.fill('//input[@aria-label="Request"]', keyWord);
    await page.keyboard.press('Enter');
    const getTwo = '//li[@data-first-snippet]//div[@class="organic__url-text"]';
    const pushTwo = await page.waitForSelector(getTwo);
    const Two = await pushTwo.evaluate((element) => element.innerText);
    console.log(Two);
  } finally {
    // Close the browser even if a selector never appears.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
But when I use an array with phrases (keyWordlist) I fail to get the script running.
Have searched around for using Array with 'For' and 'Foreach' loops, but haven't been able to fix it.
I want to run the different keywords through the different search engines and list the results.
For 3 keywords in two search engines, that would give 6 results.
const { chromium } = require('playwright');
// Non-working attempt from the question. Only the XPath attribute syntax has
// been repaired ("[@name=...]"; "[#name=...]" is not valid XPath). The loop
// structure is left exactly as asked about.
(async () => {
  const browser = await chromium.launch({ headless: false, slowMo: 250 });
  const context = await browser.newContext()
  const page = await context.newPage();
  let kewWordlist = ['Arsenal', 'Liverpool', 'Ajax']
  // NOTE: "<=" also visits index kewWordlist.length, one past the last keyword.
  for (var i=0; i<=kewWordlist.length; i++) {
    // for (const i in kewWordlist){
    // NOTE: this arrow function is only created, never called, so none of the
    // statements inside it run.
    async () => {
      const keyWord = kewWordlist[i];
      await page.goto('https://duckduckgo.com/');
      await page.fill('//input[@name="q"]',keyWord);
      // await page.fill('//input[@name="q"]',[i]);
      // await page.fill('//input[@name="q"]',`${keyWord}`);
      await page.keyboard.press('Enter');
      const getOne = (' (//h2[@class="result__title"])[9] ');
      await page.waitForSelector(getOne)
      const pushOne = await page.$(getOne);
      const One = await pushOne.evaluate(element => element.innerText);
      console.log(One);
      // await page.goto('https://yandex.com/');
      // await page.fill('//input[@aria-label="Request"]', keyWord);
      // await page.keyboard.press('Enter');
      // const getTwo = (' //li[@data-first-snippet] //div[@class="organic__url-text"] ');
      // await page.waitForSelector(getTwo)
      // const pushTwo = await page.$(getTwo);
      // const Two = await pushTwo.evaluate(element => element.innerText);
      // console.log(Two);
    }}
  await browser.close()
})()
If anyone has some pointers on how to solve this, much obliged.
Maybe the result selectors need some tweaking, but I think this is what you were looking for:
// Runs every keyword through every search engine and logs the text of one
// result per engine/keyword pair.
test.only('search search engines', async ({ context }) => {
  // One entry per engine: where to go, which input to fill, which result to read.
  const search = [
    {
      name: 'yandex',
      url: 'https://yandex.com/',
      elementFill: '//input[@aria-label="Request"]',
      elementResult: '//li[@data-first-snippet]//div[@class="organic__url-text"]',
    },
    {
      name: 'google',
      url: 'https://www.google.nl',
      elementFill: '//input[@name="q"]',
      // TODO: this is the DuckDuckGo result selector; adjust it to Google's markup.
      elementResult: '(//h2[@class="result__title"])[9]',
    },
    {
      name: 'duckduckgo',
      url: 'https://duckduckgo.com/',
      elementFill: '//input[@name="q"]',
      elementResult: '(//h2[@class="result__title"])[9]',
    },
  ];
  const keyWordList = ['Arsenal', 'Liverpool', 'Ajax'];

  for (const engine of search) {
    const searchPage = await context.newPage();
    for (const keyWord of keyWordList) {
      // Start every search from the engine's home page so the same input
      // selector applies to each keyword.
      await searchPage.goto(engine.url);
      await searchPage.fill(engine.elementFill, keyWord);
      await searchPage.keyboard.press('Enter');
      // Query the page that ran the search and read the element's text;
      // interpolating the handle itself would not print the result.
      const resultHandle = await searchPage.waitForSelector(engine.elementResult);
      const result = await resultHandle.innerText();
      console.log(`${engine.name}: ${result} `);
    }
    await searchPage.close();
  }
});
The reason your loop isn't working is that you have an async function inside of it that you never call. There are a few ways you could go about this:
You could take your first version, have it accept a word to search, and run that over each element of the array:
/**
 * Runs the single-keyword search in its own browser instance.
 *
 * @param {string} keyWord - Phrase to search for.
 * @returns {Promise<void>}
 */
const searchOneKeyword = async (keyWord) => {
  const browser = await chromium.launch({ headless: false, slowMo: 250 });
  try {
    const context = await browser.newContext();
    const page = await context.newPage();
    // rest of code (the finally block below closes the browser)
  } finally {
    await browser.close();
  }
};

// The list must be declared under the same name it is used with below.
const keyWordList = ['Arsenal', 'Liverpool', 'Ajax'];

// forEach() ignores the promise an async callback returns, so iterate with
// for...of and await each search: one browser at a time, with failures
// reported instead of becoming unhandled rejections.
(async () => {
  for (const k of keyWordList) {
    await searchOneKeyword(k);
  }
})().catch(console.error);
Or if you'd like to keep the same browser instance, you can do it in a loop in the function:
/**
 * Searches DuckDuckGo for each word in turn, reusing one browser and one
 * page, and logs one result text per word.
 *
 * @param {string[]} words - Phrases to search for.
 * @returns {Promise<void>}
 */
const search = async (words) => {
  const browser = await chromium.launch({ headless: false, slowMo: 250 });
  try {
    const context = await browser.newContext();
    const page = await context.newPage();
    for (const keyWord of words) {
      await page.goto('https://duckduckgo.com/');
      // XPath attribute tests are written with "@".
      await page.fill('//input[@name="q"]', keyWord);
      await page.keyboard.press('Enter');
      const getOne = '(//h2[@class="result__title"])[9]';
      // waitForSelector resolves with the element handle.
      const pushOne = await page.waitForSelector(getOne);
      const One = await pushOne.evaluate((element) => element.innerText);
      console.log(One);
      // etc.
    }
  } finally {
    // Close the browser even if one of the searches fails.
    await browser.close();
  }
};
search(keyWordList).catch(console.error);
In both of those cases, you're logging, but never returning anything, so if you need that data in another function afterwards, you'd have to change that. Example:
/**
 * Searches DuckDuckGo and Yandex for each word and returns the result texts.
 *
 * @param {string[]} words - Phrases to search for.
 * @returns {Promise<string[][]>} One `[duckDuckGoText, yandexText]` pair per word.
 */
const search = async (words) => {
  const browser = await chromium.launch({ headless: false, slowMo: 250 });
  try {
    const context = await browser.newContext();
    const page = await context.newPage();
    const results = [];
    // A single page can only be in one place at a time, so the words are
    // processed one after another rather than through Promise.all.
    for (const keyWord of words) {
      await page.goto('https://duckduckgo.com/');
      await page.fill('//input[@name="q"]', keyWord);
      await page.keyboard.press('Enter');
      const pushOne = await page.waitForSelector('(//h2[@class="result__title"])[9]');
      const One = await pushOne.evaluate((element) => element.innerText);

      await page.goto('https://yandex.com/');
      await page.fill('//input[@aria-label="Request"]', keyWord);
      await page.keyboard.press('Enter');
      const pushTwo = await page.waitForSelector(
        '//li[@data-first-snippet]//div[@class="organic__url-text"]'
      );
      const Two = await pushTwo.evaluate((element) => element.innerText);

      results.push([One, Two]);
    }
    return results;
  } finally {
    // Runs after the return value is computed, and also on failure.
    await browser.close();
  }
};
search(keyWordList)
  .then((results) => { console.log(results.flat()); })
  .catch(console.error);
I have spent a couple of hours trying to get the script working based on your suggestions. No result unfortunately. I get errors like 'await is only valid in async function' and 'Unreachable code detected'. Searched for other examples, for some inspiration, but none found. If you or someone else has a suggestion, please share! This is code I have now:
const { chromium } = require('playwright');
let keyWordList = ['Arsenal', 'Liverpool', 'Ajax']
const search = async function words() {
const browser = await chromium.launch({ headless: false, slowMo: 250 });
const context = await browser.newContext()
const page = await context.newPage();
}
const results = await Promise.all(words.map(keyWord))
//DUCKDUCKGO
await page.goto('https://duckduckgo.com/');
await page.fill('//input[#name="q"]',keyWord);
await page.keyboard.press('Enter');
const getOne = (' (//h2[#class="result__title"])[9] ');
await page.waitForSelector(getOne)
const pushOne = await page.$(getOne);
const One = await pushOne.evaluate(element => element.innerText);
//YANDEX
await page.goto('https://yandex.com/');
await page.fill('//input[#aria-label="Request"]', keyWord);
await page.keyboard.press('Enter');
const getTwo = (' //li[#data-first-snippet] //div[#class="organic__url-text"] ');
await page.waitForSelector(getTwo)
const pushTwo = await page.$(getTwo);
const Two = await pushTwo.evaluate(element => element.innerText);
console.log(Two);
return [ One , Two ]
return results
search(keyWordList).then((results) => { console.log(results.flat())
await browser.close();
})
I am trying to create a UI in Electron for a scraper written with Puppeteer.
Every time I use page.evaluate() it returns an empty object ([object Object]);
here is an example:
const puppeteer = require('puppeteer');
// Question code: launches Chrome, opens google.com and passes an evaluate()
// call a selector of 'div', returning whatever that call resolves to.
const scrape = async () => {
const browser = await puppeteer.launch({
executablePath: '/usr/bin/google-chrome',
headless: true,
});
const page = await browser.newPage();
await page.goto("https://google.com/", {
waitUntil: 'networkidle2',
timeout: 90000
});
// NOTE: the first argument is a template-literal string holding the source of
// an arrow function, not a function value. This is the call the question is
// about.
const length = await page.evaluate(`selector => {
return Array.from(document.querySelectorAll(selector)).length;
}`, 'div');
await page.close();
await browser.close();
return length;
}
// On button click, run the scraper and write its result into #par.
document.querySelector("button").addEventListener("click", async function() {
const divs_len = await scrape();
const par = document.querySelector('#par');
par.innerText = divs_len;
});
// par shows [object Object]
EDIT
I have used the following resource to fix the sample code:
https://github.com/puppeteer/puppeteer/issues/4221#issuecomment-478780545
And here is the working version:
const puppeteer = require('puppeteer');
/**
 * Launches Chrome, opens google.com and counts the <div> elements on the page.
 *
 * The counting function is sent to the page as a string of code that calls it
 * immediately, following the workaround referenced above.
 *
 * @returns {Promise<number>} Number of <div> elements found.
 */
const scrape = async () => {
  const browser = await puppeteer.launch({
    executablePath: '/usr/bin/google-chrome',
    headless: true,
  });
  try {
    const page = await browser.newPage();
    await page.goto("https://google.com/", {
      waitUntil: 'networkidle2',
      timeout: 90000
    });
    const functionToBeEvaluated = selector => {
      return Array.from(document.querySelectorAll(selector)).length;
    };
    // Builds the same code string as before: the function source wrapped in
    // parentheses and invoked with "div".
    return await page.evaluate(`(${functionToBeEvaluated.toString()})("div");`);
  } finally {
    // Closing the browser also closes its pages; doing it in finally keeps
    // Chrome from being left running when navigation or evaluation fails.
    await browser.close();
  }
};
// When the button is clicked, run the scraper and show its result in #par.
const onScrapeClick = async () => {
  const divCount = await scrape();
  // Look up the output element only after the scrape has finished.
  document.querySelector('#par').innerText = divCount;
};
document.querySelector("button").addEventListener("click", onScrapeClick);
page.evaluate() accepts either a function, which is called in the page with any extra arguments, or a string of code, which is executed as-is. If you pass the source of a function expression as a string, that string is only evaluated — the function is never called — so page.evaluate() returns the function itself, and because functions are not serializable it arrives as an empty object. Pass the function directly instead:
// Hand evaluate() a real function so it is called in the page with 'div' as
// its argument and the numeric result is sent back.
const countMatches = (selector) => Array.from(document.querySelectorAll(selector)).length;
const length = await page.evaluate(countMatches, 'div');