This is the test I wrote. My problem is that, although I tried to make it as robust as possible, it sometimes passes and sometimes fails for reasons like the one shown below.
Maybe I am missing something; I'd appreciate it if one of you could help me make the tests more robust.
Note: I removed sensitive information from the code.
// Project helpers; only getStatement is used by the test shown here.
const { getStatement, portalLogin, getDeletableList } = require("./helper.js");
// Login credentials (redacted). The test uses these values and only falls back
// to process.env.USERNAME / process.env.PASSWORD when a field is undefined.
const user = { userName: "*****#*****.de", passWord: "*******" };
const puppeteer = require("puppeteer");
// Passed to puppeteer.launch() as the `headless` option.
const headlessVal = false;
// First URL the test opens, before the login step.
const webSiteUrlLogin = "https://... .de/dashboard";
// Profile page on which the personal title is edited.
const webSiteUrlMe = "https://... .de/me";
// Selectors for elements addressed by class
const change_username_button = ".change-username-button";
const save_user_changes_btn = ".save-user-changes-btn";
// Selectors for elements addressed by ID
const user_personal_title = "#user_personal_title";
// [happy path] 1 user changes #user_personal_title
test("[happy path] 1 user changes #user_personal_title", async () => {
//////// SETUP & LOGIN START /////////
const browser = await puppeteer.launch({
headless: headlessVal,
args: ["--window-size=1920,1080"],
defaultViewport: null,
});
// new browser window opens
const page = await browser.newPage();
// open specified url and waits for it to be loaded
await page.goto(webSiteUrlLogin, {
//wait until there are no more conections that traffic
waitUntil: "networkidle0",
});
await page.waitForNavigation({
waitUntil: "networkidle2",
});
const user_local = {
userName:
user.userName !== undefined ? user.userName : process.env.USERNAME,
passWord:
user.passWord !== undefined ? user.passWord : process.env.PASSWORD,
};
await page.waitForTimeout(500);
await page.evaluate((user_local) => {
<FUNCTION LOGIN>
}, user_local);
await page.waitForTimeout(600);
//////// SETUP & LOGIN DONE /////////
//////// TEST START /////////
getStatement({ msg: "[START]: 1.1 title: '' >'Prof'.", runlocally: true });
await page.waitForTimeout(650);
await page.goto(webSiteUrlMe, {
waitUntil: "networkidle0",
});
// 1 change #user_personal_title from '' >> 'Prof.'
console.log("1 change #user_personal_title from '' >> 'Prof.'");
await page.waitForTimeout(650);
await page.evaluate((user_personal_title) => {
document.querySelector(user_personal_title).value = "";
}, user_personal_title);
await page.type(user_personal_title, "Prof.");
let user_personal_title_val = await page.$eval(
user_personal_title,
(el) => el.value
);
expect(user_personal_title_val).toBe("Prof.");
await page.click(save_user_changes_btn);
await page.waitForNavigation({
//wait until there are no more conections that have traffic
waitUntil: "networkidle0",
});
user_personal_title_val = await page.$eval(
user_personal_title,
(el) => el.value
);
expect(user_personal_title_val).toBe("Prof.");
await page.waitForTimeout(650);
await page.evaluate(() => {
document.querySelector("#user_personal_title").value = "";
});
await page.click(save_user_changes_btn);
await page.waitForNavigation({
//wait until there are no more conections that have traffic
waitUntil: "networkidle0",
});
user_personal_title_val = await page.$eval(
user_personal_title,
(el) => el.value
);
expect(user_personal_title_val).toBe("");
////// TEST DONE //////////
await page.click(".sign-out");
await page.removeAllListeners();
await page.close();
await browser.close();
getStatement({ msg: "[ENDING]: 1.1 ", runlocally: true });
}, 12000);
Failing:
Evaluation failed: TypeError: Cannot set property 'value' of null
at __puppeteer_evaluation_script__:2:55
at ExecutionContext._evaluateInternal (node_modules/puppeteer/lib/cjs/puppeteer/common/ExecutionContext.js:217:19)
at ExecutionContext.evaluate (node_modules/puppeteer/lib/cjs/puppeteer/common/ExecutionContext.js:106:16)
at Object.<anonymous> (joyce_PORTAL/end-to-end/me.test.js:168:3)
The element you are selecting with document.querySelector(user_personal_title) isn't available on the page yet. You need to wait for that element to be present, using page.waitForSelector(selector), before you run the page.evaluate() code that touches it.
A simple proof of concept is as follows:
// Promise style: run the DOM code only after the element has appeared.
page.waitForSelector(user_personal_title)
  .then(() => {
    // evaluate your document.querySelector(user_personal_title).value code here
  });
//or simply
await page.waitForSelector(user_personal_title);
Related
I'm trying to scrape a page of card items. I'd like to extract the titles, prices, image sources and other properties from these cards. However, when I scrape with Puppeteer and Cheerio, some of the data is missing. See the image below:
How can I make sure all of the data comes through?
This is my code:
// Scrapes the wallapop search page with Puppeteer and parses the resulting
// HTML with Cheerio. The excerpt is truncated by its author (see the trailing
// "// ..."); puppeteer2, USER_AGENT, cheerio, uuid and items are defined
// outside of it.
(async () => {
try {
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer2.use(StealthPlugin());
const browser = await puppeteer2.launch({
executablePath: "/usr/bin/chromium-browser",
headless: true,
args: [
"--no-sandbox",
"--disable-setuid-sandbox",
"--user-agent=" + USER_AGENT + "",
],
});
// NOTE(review): ignoreHTTPSErrors is normally a launch()/connect() option;
// confirm that newPage() actually honours it.
const page = await browser.newPage({ignoreHTTPSErrors: true});
// A timeout of 0 disables the navigation timeout, so a stalled navigation
// will wait forever.
await page.setDefaultNavigationTimeout(0);
await page.goto("https://es.wallapop.com/search?keywords=", {
waitUntil: "networkidle0",
});
// Fixed 30 s pause, presumably to give dynamically loaded cards time to render.
await page.waitForTimeout(30000);
// Snapshot of the body's HTML at this moment, handed to Cheerio below.
const body = await page.evaluate(() => {
return document.querySelector("body").innerHTML;
});
var $ = cheerio.load(body);
// NOTE(review): this is a descendant selector - it iterates over every
// .ng-star-inserted element inside a card, not over the cards themselves.
// Verify that this is the intended granularity.
const pageItems = $(".ItemCardList__item .ng-star-inserted")
.toArray()
.map((item) => {
const $item = $(item);
return {
// id: $item.attr('data-adid'), c10420p([^i]*)\/
id: uuid.v4(),
title: $item.find(".ItemCard__info").text(),
link: "https://es.wallapop.com/item/",
image: $item.find(".w-100").attr("src"),
// Strip every run of non-word characters and underscores from the price text.
price: $item
.find(".ItemCard__price")
.text()
.replace(/[_\W]+/g, ""),
empresa: "wallapop",
};
});
// Append this page's items to the previously collected ones.
const allItems = items.concat(pageItems);
console.log(
pageItems.length,
"items retrieved",
allItems.length,
"acumulat ed",
);
// ...
I didn't bother testing with Cheerio, but this might be a good example of the "using a separate HTML parser with Puppeteer" antipattern.
Using plain Puppeteer works fine for me:
const puppeteer = require("puppeteer"); // ^19.6.3
const url = "<Your URL>";
let browser;

// Launches a browser, reads title, image source and price from every item
// card on the page, prints them, and always closes the browser afterwards.
(async () => {
  try {
    browser = await puppeteer.launch();
    const openPages = await browser.pages();
    const page = openPages[0];
    await page.goto(url, {waitUntil: "domcontentloaded"});
    // The cards are rendered dynamically, so wait for the first one.
    await page.waitForSelector(".ItemCard__info");
    const items = await page.$$eval(".ItemCardList__item", (cards) => {
      // Runs in the page: must not reference anything from the Node scope.
      const trimmedText = (card, selector) =>
        card.querySelector(selector).textContent.trim();
      return cards.map((card) => ({
        title: trimmedText(card, ".ItemCard__info"),
        img: card.querySelector("img").getAttribute("src"),
        price: trimmedText(card, ".ItemCard__price"),
      }));
    });
    console.log(items);
    console.log(items.length); // => 40
  } catch (err) {
    console.error(err);
  } finally {
    await browser?.close();
  }
})();
Other remarks:
Watch out for await page.setDefaultNavigationTimeout(0); which can hang your process indefinitely. If a navigation doesn't resolve in a few minutes, something has gone wrong and it's appropriate to throw and log diagnostics so the maintainer can look at the situation. Or at least programmatically re-try the operation.
page.waitForTimeout() is poor practice and rightfully deprecated, but can be useful for checking for dynamic loads, as you're probably attempting to do here.
Instead of
// Reads the current inner HTML of <body> from inside the page.
const body = await page.evaluate(
  () => document.querySelector('body').innerHTML,
);
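use const body = await page.content(); (note that page.content() returns the serialized HTML of the whole document, not just the inner HTML of the body).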
What I'm trying to accomplish is to save the complete document with all the comments expanded.
Unfortunately, there are multiple elements matching the same selector, and most of them are hidden. I believe Puppeteer takes the first matching element and waits until it becomes visible, which never happens.
Url: https://www.discoverpermaculture.com/permaculture-masterclass-video-1
const puppeteer = require('puppeteer');
/**
 * Reports whether an element matching `cssSelector` becomes visible within
 * two seconds.
 *
 * @param {object} page - Puppeteer Page or Frame (anything with waitForSelector).
 * @param {string} cssSelector - Selector to wait for.
 * @returns {Promise<boolean>} true if the wait succeeded, false if it rejected
 *   for any reason (typically the 2 s timeout).
 */
const isElementVisible = async (page, cssSelector) => {
  const becameVisible = await page
    .waitForSelector(cssSelector, { visible: true, timeout: 2000 })
    .then(
      () => true,
      () => false,
    );
  if (becameVisible) {
    console.log(`Selector ${cssSelector}visible!`);
  }
  return becameVisible;
};
/**
 * Clicks the element matching `selector` inside `frame` for as long as one
 * is visible, logging `message` before every click.
 */
async function clickWhileVisible (frame, selector, message) {
  while (await isElementVisible(frame, selector)) {
    console.log(message);
    // Best effort: the element may disappear between the visibility check
    // and the click; the next visibility check decides whether to continue.
    await frame
      .click(selector)
      .catch(() => {});
  }
}

/**
 * Opens the masterclass page, loads and expands all comments inside the
 * page's iframe and saves the result as `out.mhtml`.
 *
 * The browser is closed even when a step fails.
 */
async function run () {
  const fs = require('fs');
  const browser = await puppeteer.launch({headless: true, defaultViewport: null, args: ['--window-size=1920,10000',],});
  try {
    const page = await browser.newPage();
    await page.goto('https://www.discoverpermaculture.com/permaculture-masterclass-video-1');
    await page.waitForTimeout(4000);
    const elementHandle = await page.waitForSelector('iframe');
    const frame = await elementHandle.contentFrame();

    //loading all the comments (works because there's only one 'a.load-more__button' element a time)
    await clickWhileVisible(frame, 'a.load-more__button', 'Loading comments');

    //every comment has an 'a.see-more' link, but the ones that need no
    //expanding carry the '.hidden' class; only the remaining ones are targeted
    await clickWhileVisible(frame, 'a.see-more:not(.hidden)', 'Expanding comments');

    const cdp = await page.target().createCDPSession();
    const { data } = await cdp.send('Page.captureSnapshot', { format: 'mhtml' });
    fs.writeFileSync('out.mhtml', data);
  } finally {
    await browser.close();
  }
}

run().catch((err) => {
  // Surface failures instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
Any ideas how to handle this?
It turned out that every comment has an 'a.see-more' element, but if the comment is not a long one, that element also has the '.hidden' class. I had to update this piece of code to match only the 'a.see-more' elements that do not have the '.hidden' class.
// Match only "see more" links that are not marked .hidden, so the visibility
// wait is not stuck on a link that will never be shown.
const selectorForSeeMoreButton = 'a.see-more:not(.hidden)';
let seeMoreVisible = await isElementVisible(frame, selectorForSeeMoreButton);
while (seeMoreVisible) {
  console.log('Expanding comments');
  // Best effort: the link may disappear between the check and the click.
  await frame
    .click(selectorForSeeMoreButton)
    .catch(() => {});
  seeMoreVisible = await isElementVisible(frame, selectorForSeeMoreButton);
}
This is the code I have given:
const puppeteer = require('puppeteer');
// Logs in, opens the row with the given data-row-id and prints the page to
// Z1.pdf. The browser is closed even when a step fails.
(async () => {
  const baseUrl = 'example.com';
  const rowSelector = `[data-row-id="111-222-333-444"]`;

  // Resolves after `time` milliseconds.
  const delay = (time) =>
    new Promise((resolve) => {
      setTimeout(resolve, time);
    });

  const browser = await puppeteer.launch({ headless: true});
  try {
    const context = await browser.createIncognitoBrowserContext();
    const page = await context.newPage();

    await page.goto(baseUrl);
    await page.waitForSelector('[name="loginfmt"]');
    await page.type('[name="loginfmt"]', 'abc#amazon.com');
    await page.click('[type="submit"]');
    // The pause only takes effect when the returned promise is awaited.
    await delay(1000);
    await page.authenticate({ username: `abc#amazon.com`, password: `abc#123` });
    // The row is rendered after login, so wait for it before clicking;
    // otherwise click() fails with "No node found for selector".
    await page.waitForSelector(rowSelector);
    await page.click(rowSelector);
    await delay(1000);
    await page.pdf(
      {
        path: "Z1.pdf",
        printBackground: true,
        width: '1300px',
        height: '14200px'
      }
    );
  } finally {
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
When I inspected the page with document.querySelectorAll('[data-row-id="111-222-333-444"]'), it returned a NodeList containing the details I have been looking for.
However, when I tried the same via Puppeteer, it terminated with the error "Error: No node found for selector: [data-row-id="111-222-333-444"]".
I also tried using:
const button = await page.$('[data-row-id="111-222-333-444"]');
button.click();
I have been trying to fix this for a long time. It would be really helpful if I could get a fix for this.
Try using this code:
const puppeteer = require('puppeteer');
// Logs in, waits for the target row to be rendered, clicks it and prints the
// page to Z1.pdf. The browser is closed even when a step fails.
(async () => {
  const baseUrl = 'example.com';
  const rowSelector = `[data-row-id="111-222-333-444"]`;
  const browser = await puppeteer.launch({ headless: true});
  try {
    const context = await browser.createIncognitoBrowserContext();
    const page = await context.newPage();
    await page.goto(baseUrl);
    await page.waitForSelector('[name="loginfmt"]');
    await page.type('[name="loginfmt"]', 'abc#amazon.com');
    await page.click('[type="submit"]');
    await page.waitForTimeout(1000);
    await page.authenticate({ username: `abc#amazon.com`, password: `abc#123` });
    // Wait for the row itself rather than sleeping for a fixed time: this
    // continues as soon as the row exists and fails clearly if it never does.
    await page.waitForSelector(rowSelector);
    await page.click(rowSelector);
    await page.waitForTimeout(1000);
    await page.pdf(
      {
        path: "Z1.pdf",
        printBackground: true,
        width: '1300px',
        height: '14200px'
      }
    );
  } finally {
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
I'm new to Puppeteer, so I don't know much about it. This is my code so far, and everything in it works.
However, I want it to click the login button on the page after it has put the text in the fields, and I cannot figure out how to do that for the life of me. I've tried many different things and none of them work. Any help with this would be awesome.
Just in case you need it, this is the page:
https://server.nitrado.net/usa/rent-gameserver
// Opens the Nitrado rent page, locates the OAuth login iframe and fills in
// the username and password fields inside it.
// NOTE(review): page.waitFor() is deprecated in newer Puppeteer releases;
// it is kept because the Puppeteer version in use is not visible here.
(async () => {
  console.log('launch browser');
  const browser = await pup.launch({headless: false});
  try {
    console.log('new page');
    const page = await browser.newPage();
    console.log('goto');
    await page.setViewport({ width: 1920, height: 1080 });
    await page.goto('https://server.nitrado.net/usa/rent-gameserver', { waitUntil: "networkidle2", timeout: 60000 });
    await page.waitFor(5000);
    console.log('extract login iframe');
    const loginFrame = page.frames().find((f) => f.url().includes("oauth.nitrado.net"));
    if (!loginFrame) {
      // Fail with a clear message instead of a TypeError on `undefined`.
      throw new Error('login iframe (oauth.nitrado.net) not found');
    }
    await page.waitFor(5000);
    console.log('evaluate iframe');
    await loginFrame.evaluate(() => {
      document.getElementById('username').value = 'test';
      document.getElementById('password').value = '12345';
    });
    // Leaves the browser open for five minutes before shutting down.
    await page.waitFor(300000);
    console.log('done');
  } finally {
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
I tried a workaround that navigates directly to the frame's URL. I'm not sure if this will help, but here is the code (main.js):
const pup = require('puppeteer');
/**
 * Logs in to Nitrado by loading the OAuth login iframe's URL as a top-level
 * page and typing the credentials into the form there.
 *
 * The browser is closed whether the flow succeeds or throws.
 *
 * @returns {Promise<void>} Resolves once the browser has been closed; rejects
 *   with the original error if any step fails.
 */
const mainFunc = async function () {
  const browser = await pup.launch({ headless: false });
  try {
    const page = await browser.newPage();
    //SELECTORS:
    const userInputSel = '#username';
    const passInputSel = '#password';
    const loginBtnSel = '#auth_login_ws_header > form > button';
    const myUser = "myusername"; //PUT YOUR USERNAME HERE!!!
    const myPass = "MyPaSsWoRd123"; //PUT YOUR PASSWORD HERE!!!
    await page.goto('https://server.nitrado.net/usa/rent-gameserver',
      { waitUntil: "networkidle2", timeout: 60000 });
    await page.waitFor(5000);
    console.log('extract login iframe');
    const loginFrame = page.frames().find((f) => f.url().includes("oauth.nitrado.net"));
    if (!loginFrame) {
      // Fail with a clear message instead of a TypeError on `undefined`.
      throw new Error('login iframe (oauth.nitrado.net) not found');
    }
    const loginUrl = loginFrame.url();
    console.log(loginUrl);
    // Load the login form as a top-level page so it can be driven through
    // page.type() / page.click() instead of assigning `.value` in the frame.
    await page.goto(loginUrl,
      { waitUntil: "networkidle2", timeout: 60000 });
    await page.waitFor(5000);
    console.log('evaluate iframe');
    await page.waitForSelector(userInputSel);
    await page.type(userInputSel, myUser);
    await page.waitForSelector(passInputSel);
    await page.type(passInputSel, myPass);
    await page.waitForSelector(loginBtnSel);
    await page.click(loginBtnSel);
    // Leaves the browser open for five minutes before shutting down.
    await page.waitFor(300000);
    console.log('done');
  } finally {
    await browser.close();
  }
};

mainFunc().catch((e) => {
  console.error(e);
  process.exitCode = 1;
});
This is a running version for you to test. Just type "node main". (of course you need puppeteer installed (npm i puppeteer))...
I am trying to create a UI in Electron for a scraper written with Puppeteer.
Every time I use page.evaluate(), it returns an empty object, which is displayed as [object Object].
here is an example:
const puppeteer = require('puppeteer');
// Launches Chrome, opens google.com and returns whatever page.evaluate()
// yields for the page function below.
const scrape = async () => {
const browser = await puppeteer.launch({
executablePath: '/usr/bin/google-chrome',
headless: true,
});
const page = await browser.newPage();
await page.goto("https://google.com/", {
waitUntil: 'networkidle2',
timeout: 90000
});
// The page function is passed as a STRING (template literal). A string is
// evaluated as plain code, so the result is the arrow function itself, not
// its return value; the function is never called with 'div'. A function is
// not serializable, which is why an empty object comes back.
const length = await page.evaluate(`selector => {
return Array.from(document.querySelectorAll(selector)).length;
}`, 'div');
await page.close();
await browser.close();
return length;
}
// On button click, run the scraper and show its result in #par.
document
  .querySelector("button")
  .addEventListener("click", async () => {
    const scraped = await scrape();
    document.querySelector('#par').innerText = scraped;
  });
// par shows [object Object]
EDIT
I have used the following resource to fix the sample code:
https://github.com/puppeteer/puppeteer/issues/4221#issuecomment-478780545
And here is the working version:
const puppeteer = require('puppeteer');
// Launches Chrome, opens google.com and returns the number of <div> elements
// on the page.
const scrape = async () => {
const browser = await puppeteer.launch({
executablePath: '/usr/bin/google-chrome',
headless: true,
});
const page = await browser.newPage();
await page.goto("https://google.com/", {
waitUntil: 'networkidle2',
timeout: 90000
});
// Page function; its source text is sent to the browser below, so it must
// not reference anything from the surrounding Node scope.
const functionToBeEvaluated = selector => {
return Array.from(document.querySelectorAll(selector)).length;
}
// Build the string '(<function source>)("div");' so that the evaluated code
// is a CALL of the function; its return value is what evaluate() resolves to.
const result = await page.evaluate('(' + functionToBeEvaluated.toString() + ')("div");');
await page.close();
await browser.close();
return result;
}
// On button click, run the scraper and show its result in #par.
document
  .querySelector("button")
  .addEventListener("click", async () => {
    const scraped = await scrape();
    document.querySelector('#par').innerText = scraped;
  });
As the argument to page.evaluate(), you can pass either a function, which will be called in the page, or a string containing code that will be executed directly. If you pass a function expression as a string, the string is merely evaluated, so page.evaluate() returns a reference to that function instead of calling it; because functions are not serializable, the reference arrives as an empty object. Try this:
// Pass a real function (not a string) so that it is called in the page with
// 'div' as its argument and its return value is sent back.
const length = await page.evaluate(
  (selector) => [...document.querySelectorAll(selector)].length,
  'div',
);