How to access global variables in puppeteer - javascript

In this example code, when page.evaluate reads the array contactList and the counter j, it reports that both are not defined. What is wrong?
const { join } = require('path');
const puppeteer = require('puppeteer');
// Launches a visible browser, opens a page, waits 10 seconds, then calls
// page.evaluate once per entry of contactList.
// NOTE(review): as pasted, neither the page.evaluate( call nor the async
// wrapper is closed, so this snippet is not syntactically complete.
(async () => {
// 1. Launch the browser
const browser = await puppeteer.launch({
"args": [
'--remote-debugging-port=9222'
],
"defaultViewport": {
"height": 1080,
"width": 1920
},
"headless": false
});
// 2. Open a new page
const page = await browser.newPage();
// 3. Navigate to URL
await page.goto('https://');
// Fixed 10 second pause before continuing.
await new Promise(r => setTimeout(r, 10000));
console.log('Ready');
var contactList = ['cesar','gab','777','81411579','34353'];
var fLen = contactList.length;
var j = 0;
// `i` is assigned here without any declaration.
for (i = 0; i < fLen; i++) {
// The callback below is evaluated in the page context, so the `j` and
// `contactList` declared above are not visible inside it.
await page.evaluate(() => {
// `contact_name` is accepted but never read; the lookup result is assigned
// to an undeclared `search`.
function searchContact(contact_name = "") {
//search = document.querySelector('#side > div._1Ra05 > div > label > div > div._1awRl.copyable-text.selectable-text');
search = document.querySelector('#side > div._1Ra05 > div > label > div > div._1awRl.copyable-text.selectable-text');
}
j++;
searchContact(contactList[j]);
}
}

Take a look at the Puppeteer documentation for page.evaluate(pageFunction[,...args]). It states:
pageFunction <function|string> Function to be evaluated in the page context
Note (my bold) "evaluated in the page context". The variables j and contactList do not exist in the context of the page.
Thankfully however, Puppeteer has a way of calling server side code from the context of the page with page.exposeFunction(name, puppeteerFunction)
The method adds a function called name on the page's window object. When called, the function executes puppeteerFunction in node.js and returns a Promise which resolves to the return value of puppeteerFunction.
For your use case, it would look something like the following:
const puppeteer = require('puppeteer');
/**
 * Toy demo of page.exposeFunction: two Node-side functions are exposed to the
 * page, page code calls them, and the Node-side `contacts` array is printed
 * afterwards to show that it was updated from inside the page.
 */
(async function()
{
  const browser = await puppeteer.launch({
    headless: false,
    args: [
      "--no-sandbox", // I needed these args for it to run on my machine, you probably don't need them.
      "--disable-setuid-sandbox"
    ]
  });
  try {
    const page = await browser.newPage();
    // Lives in Node; the page can only reach it through the exposed functions.
    const contacts = ["Charlie", "Carl", "Dennis", "Conrad"];
    await page.exposeFunction("getContacts", function()
    {
      return contacts;
    });
    await page.exposeFunction("addContact", function(contact)
    {
      contacts.push(contact);
    });
    await page.evaluate(async function()
    {
      // Exposed functions return Promises when called from the page.
      await addContact("Henry");
      await addContact("Olav");
      const pageContacts = await getContacts();
      for (const contact of pageContacts) {
        const div = document.createElement("div");
        // Plain text, so textContent is enough and avoids parsing it as HTML.
        div.textContent = contact;
        document.body.appendChild(div);
      }
    });
    console.log("Contacts after evaluating page function: ", contacts.join(", "));
  } finally {
    // Close the browser on success and on failure so the process can exit.
    await browser.close();
  }
})().catch(console.error);
Note that this is a toy example, although a complete and runnable one. You should be able to figure out the rest from this. The code you posted in your example in the OP does not make much sense (i.e. the endlessly recursive function searchContact()) so you will just have to adapt this to your use case.

Related

function in page.evaluate() is not working/being executed - Puppeteer

My Code below tries to collect a bunch of hyper links that come under the class name ".jss2". However, I do not think the function within my page.evaluate() is working. When I run the code, the link_list const doesn't get displayed.
I ran the document.querySelectorAll on the Chrome console and that was perfectly fine - really having a hard time with this.
// Opens `url` in a visible browser and logs the href of every element that
// matches ".jss2".
async function testing() {
const browser = await puppeteer.launch({headless:false});
const page = await browser.newPage();
await page.setViewport({width: 1200, height: 800});
await page.goto(url);
// NOTE(review): this call goes through `this.page`, not the local `page`
// declared above - confirm that `this` is bound to an object with a `page`.
const link_list = await this.page.evaluate(() => {
// Runs inside the page: collect the matching elements and map them to hrefs.
let elements = Array.from(document.querySelectorAll(".jss2"));
let links = elements.map(element => {
return element.href;
});
return (links);
});
console.log(link_list);
}
// Map every element matching the selector to its href in a single $$eval call on `page`.
const link_list = await page.$$eval('.classname', links => links.map(link => link.href));
Found the answer here: PUPPETEER - unable to extract elements on certain websites using page.evaluate(() => document.querySelectorAll())

NodeJs Pagination,recursive promise problem

I am scraping multiple pages with cheerio and axios in node.js
I am having a hard time with Promises, can someone help me return the JSON if I hit the last page? Thanks!
// Fetches `url`, updates the page counters, and schedules itself for the next
// page. Only the invocation that finds indexPage > pageNum returns the parsed JSON.
// `pageNum`, `indexPage`, `text` and `baseUrl` are not declared in this snippet;
// presumably they live in an enclosing scope.
const getWebsiteContent = async (url) => {
await axios.get(url).then(res => {
const $ = cheerio.load(res.data)
pageNum = getTotalpages($); // Get the pagination
console.log(url);
//Some scraping here
})
indexPage++; // Increment to the next page
const nextPageLink = baseUrl + '&page=' + indexPage; // get next page
if (indexPage > pageNum) {
// Drop the last character of `text`, close the JSON by hand, then parse it.
var editedText = text.slice(0, text.length - 1);
editedText += ']}';
editedText = JSON.parse(editedText); // I want to return this and use elsewhere
return editedText;
}
// The recursive call is neither awaited nor returned, so its result never
// reaches the caller of this invocation.
setTimeout(async () => {
getWebsiteContent(nextPageLink); // Call itself
}, 1000);
}
var myJSON= await getWebsiteContent(baseUrl); // something like this
I would write getPages as an async generator -
/**
 * Lazily walk a paginated listing, yielding one parsed document per page.
 *
 * The first page is fetched to learn the total page count and is yielded
 * straight away; each remaining page is fetched after a one second pause.
 *
 * @param {string} href - URL of the listing; its page number is set via setPage.
 * @param {number} [initPage=0] - page number to start from.
 * @yields {{page: number, dom: *}} the page number and its loaded cheerio document.
 */
async function* getPages (href, initPage = 0) {
  const first = await axios.get(setPage(href, initPage))
  const $ = cheerio.load(first.data)
  const pages = getTotalpages($)
  yield { page: initPage, dom: $ }
  // Start after initPage: it was already fetched and yielded above, so
  // restarting at initPage would request and yield the first page twice.
  for (let p = initPage + 1; p < pages; p++) {
    await sleep(1000)
    const r = await axios.get(setPage(href, p))
    yield { page: p, dom: cheerio.load(r.data) }
  }
}
This depends on the helper setPage, which manipulates the href page number using the url module; this is much safer than cobbling together strings by hand -
/**
 * Return `href` with its `page` query parameter set to `page`.
 * Other query parameters are kept; an existing `page` value is replaced.
 */
function setPage (href, page) {
  const target = new URL(href)
  const query = target.searchParams
  query.set("page", page)
  // A URL's href is the same serialization that toString() produces.
  return target.href
}
And another helper, sleep, which prevents the mixing of setTimeout with async-based code. This allows us to easily pause between pages -
/**
 * Resolve (with no value) after roughly `ms` milliseconds, so async code can
 * pause with `await sleep(ms)` instead of nesting work inside setTimeout.
 */
async function sleep (ms) {
  const pause = new Promise((resolve) => {
    setTimeout(resolve, ms)
  })
  return pause
}
Finally we write scrape, which is a simple wrapper around getPages. This allows us to reuse the getPages function to scrape various elements as needed. A benefit of using this approach is that the caller can determine what happens with each page. Below we push to a result array, but as another example we could write each page to disk using the fs module. Obviously this is for you to decide -
/**
 * Visit every page produced by getPages(href), log progress, and collect
 * whatever getSomeData extracts from each page's document.
 *
 * @param {string} href - URL handed to getPages.
 * @returns {Promise<Array>} one extracted entry per page, in page order.
 */
async function scrape (href) {
  const collected = []
  for await (const entry of getPages(href)) {
    // status message
    console.log("scraped page", entry.page)
    const extracted = getSomeData(entry.dom)
    collected.push(extracted)
  }
  return collected
}
scrape(myUrl).then(console.log, console.error)
You shouldn't be using then with your async / await code.
pagination should look something like this:
// Fetch the first page, then keep following its rel=next link until a page
// no longer has one.
let response = await axios.get(url)
let $ = cheerio.load(response.data)
// do some scraping
for (;;) {
  // Reuse `url` for the next page's address; stop when there is none.
  url = $('[rel=next]').attr('href')
  if (!url) break
  response = await axios.get(url)
  $ = cheerio.load(response.data)
  // do more scraping
}

How to get input element with puppeteer, when the page load all elements inside frameset tag

I am trying to get all input element in this website:
http://rwis.mdt.mt.gov/scanweb/swframe.asp?Pageid=SfHistoryTable&Units=English&Groupid=269000&Siteid=269003&Senid=0&DisplayClass=NonJava&SenType=All&CD=7%2F1%2F2020+10%3A41%3A50+AM
Here is what the element source of the page looks like.
here is my code:
const puppeteer = require("puppeteer");
/**
 * Open the page in a headless browser and count the <input> elements found in
 * its top-level document.
 *
 * @returns {Promise<number>} the number of matching elements.
 * Rejects with the underlying error if launching, navigating or evaluating fails.
 */
async function run() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(
      "http://rwis.mdt.mt.gov/scanweb/swframe.asp?Pageid=SfHistoryTable&Units=English&Groupid=269000&Siteid=269003&Senid=0&DisplayClass=NonJava&SenType=All&CD=7%2F1%2F2020+10%3A41%3A50+AM"
    );
    // Evaluated against the top-level document only.
    return await page.evaluate(() => document.querySelectorAll("input").length);
  } finally {
    // Close the browser even when navigation or evaluation throws.
    await browser.close();
  }
}
run().then(console.log).catch(console.error);
Right now my output is 0, but when I run document.querySelectorAll("input").length in the console, it gives me 8.
It seems like everything is loaded in the frameset tag, this might be the issue, could anyone have any idea how to solve this issue?
You have to get the frame element, from there you can get the frame itself so you can call evaluate inside that frame:
// The inputs live inside a child frame, so locate the <frame> element first
// and evaluate inside that frame rather than on the top-level page.
const elementHandle = await page.$('frame[name=SWContent]');
if (!elementHandle) {
  throw new Error('frame[name=SWContent] was not found on the page');
}
const frame = await elementHandle.contentFrame();
if (!frame) {
  throw new Error('frame[name=SWContent] has no content frame');
}
// Count the <input> elements in the frame's own document.
let urls = await frame.evaluate(() => document.querySelectorAll("input").length);

How to use date.js through Puppeteer exposeFunction?

I am using puppeteer 1.19.0 and date.js 0.3.3 for this example
const puppeteer = require('puppeteer');
const date = require('date.js');
// Opens the product page, exposes date.js's `date` to the page as `formatDate`,
// and returns the question text and time of the first matched list item.
let scrape = async () => {
const browser = await puppeteer.launch({headless: false});
const page = await browser.newPage();
// Makes the Node-side date() callable from page code as window.formatDate.
await page.exposeFunction('formatDate', (text) =>
date(text));
await page.goto('https://www.daraz.com.bd/products/the-vip-suction-mobile-phone-stand-pocket-size-i113492895-s1030756115.html');
await page.waitFor(1000);
const result = await page.evaluate(() => {
let elements = document.querySelectorAll('#module_product_qna > div.pdp-mod-qna > div:nth-child(2) > ul > li')
for (var element of elements)
{
let question = element.querySelector('div:first-of-type > div.qna-content').innerText;
let qtime = element.querySelector('div:first-of-type > div.qna-meta').innerText;
// Keep only the text that follows "- " in the meta line.
let q = qtime.match(/- (.+)/);
// NOTE(review): formatDate was installed with exposeFunction; its result is
// used here without being awaited.
qtime = formatDate(q[1]);
// Returning inside the loop means only the first element is reported.
return {
question,
qtime
}
}});
browser.close();
return result;
};
scrape().then((value) => {
console.log(value);
});
You can see I am trying to use date function of date.js library to parse relative date through puppeteer exposeFunction but date function is not working inside page context. Any suggestions what I am doing wrong?
I appreciate your replies!
From puppeteer docs:
The method adds a function called name on the page's window object.
When called, the function executes puppeteerFunction in node.js and
returns a Promise which resolves to the return value of
puppeteerFunction.
Try this:
// The page callback is async so that the exposed formatDate can be awaited.
const result = await page.evaluate(async() => {
let elements = document.querySelectorAll('#module_product_qna > div.pdp-mod-qna > div:nth-child(2) > ul > li')
for (var element of elements)
{
let question = element.querySelector('div:first-of-type > div.qna-content').innerText;
let qtime = element.querySelector('div:first-of-type > div.qna-meta').innerText;
// Keep only the text that follows "- " in the meta line.
let q = qtime.match(/- (.+)/);
// Functions added with exposeFunction return a Promise when called from the page.
qtime = await window.formatDate(q[1]);
// Returning inside the loop means only the first matched <li> is reported.
return {
question,
qtime
}
}});

How to execute something similar to a goto statement in node js or how to create and call a function within an asynchronous function?

I am running an automated test through puppeteer that fills up a form and checks for captcha as well. If the captcha is incorrect, it refreshes to a new image but then I need to process the whole image again and reach the function which was used earlier to process it.
// Single pass with no retry: load the page, take a screenshot, run
// tesseract on 'new.png', fill in and submit the form, then press Escape if the
// "#msgboxclose" element shows up.
(async function example() {
const browser = await puppeteer.launch({headless: false})
const page = await browser.newPage()
/*-----------NEED TO COME BACK HERE-----------*/
const tessProcess = utils.promisify(tesseract.process);
await page.setViewport(viewPort)
await page.goto('http://www.example.com')
// presumably `options` makes the screenshot land in 'new.png' - confirm against its definition
await page.screenshot(options)
const text = await tessProcess('new.png');
console.log(text.trim());
// The recognised text goes into the captcha field.
await page.$eval('input[id=userEnteredCaptcha]', (el, value) => el.value = value, text.trim())
await page.$eval('input[id=companyID]', el => el.value = 'val');
const submitBtn = await page.$('[id="data"]');
await submitBtn.click();
try {
// `x` is function-scoped (var); it stays undefined when waitFor throws.
var x = await page.waitFor("#msgboxclose");
console.log("Captcha error")
}
catch (e) {
console.error('No Error');
}
if(x){
await page.keyboard.press('Escape');
/*---------GO FROM HERE--------*/
}
})()
I want to sort of create a loop so that the image can be processed again whenever the captcha is wrong
Declare a boolean variable that indicates whether you need to try again or not, and put the repeated functionality inside a while loop that checks that variable. If the x condition at the end of the loop is not fulfilled, set tryAgain to false, so that no further iterations occur:
/**
 * Fill in and submit the form, repeating the whole attempt for as long as the
 * "#msgboxclose" element appears after submitting (treated as a captcha error).
 */
(async function example() {
  const browser = await puppeteer.launch({headless: false})
  const page = await browser.newPage()
  // Loop-invariant: build the promisified tesseract helper once.
  const tessProcess = utils.promisify(tesseract.process);
  let tryAgain = true; // <--------------------------
  while (tryAgain) { // <--------------------------
    /*-----------NEED TO COME BACK HERE-----------*/
    await page.setViewport(viewPort)
    await page.goto('http://www.example.com')
    await page.screenshot(options)
    const text = await tessProcess('new.png');
    console.log(text.trim());
    await page.$eval('input[id=userEnteredCaptcha]', (el, value) => el.value = value, text.trim())
    await page.$eval('input[id=companyID]', el => el.value = 'val');
    const submitBtn = await page.$('[id="data"]');
    await submitBtn.click();
    // Reset on every attempt. A function-scoped `var x` would keep the handle
    // from an earlier failed attempt when waitFor throws, and the loop would
    // then never terminate.
    let x = null;
    try {
      x = await page.waitFor("#msgboxclose");
      console.log("Captcha error")
    }
    catch (e) {
      // waitFor threw, so the error box did not appear for this attempt.
      console.error('No Error');
    }
    if (x) {
      await page.keyboard.press('Escape');
      /*---------GO FROM HERE--------*/
    } else {
      tryAgain = false; // <--------------------------
    }
  }
})().catch(console.error)

Categories

Resources