I am using puppeteer 1.19.0 and date.js 0.3.3 for this example
const puppeteer = require('puppeteer');
const date = require('date.js');
// Opens the product page, reads a Q&A list item and returns its question text
// together with the date parsed from the item's meta line.
let scrape = async () => {
const browser = await puppeteer.launch({headless: false});
const page = await browser.newPage();
// Makes the Node-side date.js parser callable from the page as formatDate.
await page.exposeFunction('formatDate', (text) =>
date(text));
await page.goto('https://www.daraz.com.bd/products/the-vip-suction-mobile-phone-stand-pocket-size-i113492895-s1030756115.html');
// Fixed pause before the Q&A module is queried.
await page.waitFor(1000);
const result = await page.evaluate(() => {
let elements = document.querySelectorAll('#module_product_qna > div.pdp-mod-qna > div:nth-child(2) > ul > li')
for (var element of elements)
{
let question = element.querySelector('div:first-of-type > div.qna-content').innerText;
let qtime = element.querySelector('div:first-of-type > div.qna-meta').innerText;
// Keep only the text that follows "- " in the meta line.
let q = qtime.match(/- (.+)/);
// NOTE(review): a function added with exposeFunction returns a Promise when
// called in the page; this call is not awaited, so qtime holds the Promise
// rather than the parsed date.
qtime = formatDate(q[1]);
// Returning inside the loop means only the first list item is reported.
return {
question,
qtime
}
}});
// The close call is not awaited.
browser.close();
return result;
};
scrape().then((value) => {
console.log(value);
});
As you can see, I am trying to use the date function of the date.js library to parse a relative date through Puppeteer's exposeFunction, but the date function is not working inside the page context. Any suggestions as to what I am doing wrong?
I appreciate your replies!
From puppeteer docs:
The method adds a function called name on the page's window object.
When called, the function executes puppeteerFunction in node.js and
returns a Promise which resolves to the return value of
puppeteerFunction.
Try this:
// Runs in the page: takes the first Q&A list item and returns its question
// text plus the date parsed from its meta line by the exposed formatDate.
const result = await page.evaluate(async () => {
  const entries = document.querySelectorAll('#module_product_qna > div.pdp-mod-qna > div:nth-child(2) > ul > li');
  for (const entry of entries) {
    const question = entry.querySelector('div:first-of-type > div.qna-content').innerText;
    const metaText = entry.querySelector('div:first-of-type > div.qna-meta').innerText;
    // Keep only the text that follows "- " in the meta line.
    const dateMatch = metaText.match(/- (.+)/);
    // formatDate lives in Node, so the page-side call has to be awaited.
    const qtime = await window.formatDate(dateMatch[1]);
    // Returning inside the loop means only the first entry is reported.
    return { question, qtime };
  }
});
Related
In this example code, when it reads the array contactList and j it says both are not defined, what is wrong?
const { join } = require('path');
const puppeteer = require('puppeteer');
// Launches a visible browser, opens a page, waits ten seconds and then tries
// to run a contact search in the page once per entry of contactList.
// NOTE(review): the snippet is incomplete as posted — the page.evaluate call
// and the outer async function are never closed.
(async () => {
// 1. Launch the browser
const browser = await puppeteer.launch({
"args": [
'--remote-debugging-port=9222'
],
"defaultViewport": {
"height": 1080,
"width": 1920
},
"headless": false
});
// 2. Open a new page
const page = await browser.newPage();
// 3. Navigate to URL
await page.goto('https://');
// Fixed 10 second pause before continuing.
await new Promise(r => setTimeout(r, 10000));
console.log('Ready');
var contactList = ['cesar','gab','777','81411579','34353'];
var fLen = contactList.length;
var j = 0;
// i is assigned here without a declaration.
for (i = 0; i < fLen; i++) {
// The callback below is evaluated in the page context; j and contactList are
// declared on the Node side and are not passed in as arguments.
await page.evaluate(() => {
// Stores the matching search box element in `search`; contact_name is not used.
function searchContact(contact_name = "") {
//search = document.querySelector('#side > div._1Ra05 > div > label > div > div._1awRl.copyable-text.selectable-text');
search = document.querySelector('#side > div._1Ra05 > div > label > div > div._1awRl.copyable-text.selectable-text');
}
j++;
searchContact(contactList[j]);
}
}
Take a look at the Puppeteer documentation for page.evaluate(pageFunction[,...args]). It states:
pageFunction <function|string> Function to be evaluated in the page context
Note (my bold) "evaluated in the page context". The variables j and contactList do not exist in the context of the page.
Thankfully however, Puppeteer has a way of calling server side code from the context of the page with page.exposeFunction(name, puppeteerFunction)
The method adds a function called name on the page's window object. When called, the function executes puppeteerFunction in node.js and returns a Promise which resolves to the return value of puppeteerFunction.
For your use case, it would look something like the following:
const puppeteer = require('puppeteer');
// Toy example: shares a Node-side contact list with the page through two
// exposed functions, lets the page add entries and render all of them, then
// prints the resulting list from Node.
(async () => {
  const launchOptions = {
    headless: false,
    // The sandbox flags were needed on the author's machine; you probably don't need them.
    args: ["--no-sandbox", "--disable-setuid-sandbox"]
  };
  const browser = await puppeteer.launch(launchOptions);
  const page = await browser.newPage();
  const contacts = ["Charlie", "Carl", "Dennis", "Conrad"];

  // Both bridge functions run in Node and operate on the `contacts` array above.
  await page.exposeFunction("getContacts", () => contacts);
  await page.exposeFunction("addContact", (contact) => {
    contacts.push(contact);
  });

  await page.evaluate(async () => {
    await addContact("Henry");
    await addContact("Olav");
    // What the page receives from getContacts() is held in a page-side variable.
    const pageContacts = await getContacts();
    for (const name of pageContacts) {
      const row = document.createElement("div");
      row.innerHTML = name;
      document.body.appendChild(row);
    }
  });

  console.log("Contacts after evaluating page function: ", contacts.join(", "));
})()
Note that this is a toy example, although a complete and runnable one. You should be able to figure out the rest from this. The code you posted in your example in the OP does not make much sense (i.e. the endlessly recursive function searchContact()), so you will just have to adapt this to your use case.
I'm working on a web scraper in Javascript using puppeteer and whenever I try to log the text content of an element it says "Promise { Pending }". I've looked at other answers and none of them worked
// Select the element with this id on the page.
const element = await page.$("#ctl00_ContentPlaceHolder1_NameLinkButton");
// Read the element's textContent inside the page; the call is awaited, so
// text is the resolved value rather than a pending Promise.
const text = await page.evaluate(element => element.textContent, element);
console.log(text);
Your answer is correct, but I think you forgot to add await before page.evaluate().
There are three ways to do that.
First way: just like what you are doing. I don't prefer it, though, because
you don't need to call page.evaluate() to get .textContent.
const puppeteer = require('puppeteer');
// Way 1: get a handle to the element with page.$ and read its textContent
// through page.evaluate. Logs the text, or 'Not Found' when the id is absent.
puppeteer.launch().then(async browser => {
  try {
    const elementId = 'container';
    const page = await browser.newPage();
    await page.goto('https://metwally.me');
    const element = await page.$(`#${elementId}`);
    if (element) {
      const text = await page.evaluate(element => element.textContent, element);
      console.log(text);
    } else {
      // handle not exists id
      console.log('Not Found');
    }
  } finally {
    // Always close the browser, even after a failure, so it is not left running.
    await browser.close();
  }
}).catch((err) => {
  // Report launch/navigation errors instead of leaving the rejection unhandled.
  console.error(err);
  process.exitCode = 1;
});
Second way: call page.evaluate() and use the JavaScript DOM API to get textContent, e.g. document.getElementById(elementId).textContent.
const puppeteer = require('puppeteer');
// Way 2: look the element up inside the page with document.getElementById and
// return its textContent, or null when no element has that id.
puppeteer.launch().then(async browser => {
  try {
    const elementId = 'container';
    const page = await browser.newPage();
    await page.goto('https://metwally.me');
    const text = await page.evaluate(
      elementId => {
        const element = document.getElementById(elementId);
        return element ? element.textContent : null;
      }, elementId);
    if (text !== null) {
      console.log(text);
    } else {
      // handle not exists id
      console.log('Not Found');
    }
  } finally {
    // Always close the browser, even after a failure, so it is not left running.
    await browser.close();
  }
}).catch((err) => {
  // Report launch/navigation errors instead of leaving the rejection unhandled.
  console.error(err);
  process.exitCode = 1;
});
Third way: select the element with a Puppeteer selector, get its textContent property using await element.getProperty('textContent'), and then read the value from textContent._remoteObject.value.
const puppeteer = require('puppeteer');
// Way 3: get a handle to the element with page.$, fetch its textContent
// property as a handle, and convert that handle to a plain value.
puppeteer.launch().then(async browser => {
  try {
    const elementId = 'container';
    const page = await browser.newPage();
    await page.goto('https://metwally.me');
    const element = await page.$(`#${elementId}`);
    if (element) {
      const textContent = await element.getProperty('textContent');
      // jsonValue() is the public way to read the handle's value; it replaces
      // reaching into the underscore-prefixed textContent._remoteObject.value.
      const text = await textContent.jsonValue();
      console.log(text);
    } else {
      // handle not exists id
      console.log('Not Found');
    }
  } finally {
    // Always close the browser, even after a failure, so it is not left running.
    await browser.close();
  }
}).catch((err) => {
  // Report launch/navigation errors instead of leaving the rejection unhandled.
  console.error(err);
  process.exitCode = 1;
});
NOTE: All these examples work successfully on my machine.
os ubuntu 20.04
nodejs v10.19.0
puppeteer v1.19.0
References
Puppeteer page.$
Document.getElementById()
Node.textContent
I am trying to get all input element in this website:
http://rwis.mdt.mt.gov/scanweb/swframe.asp?Pageid=SfHistoryTable&Units=English&Groupid=269000&Siteid=269003&Senid=0&DisplayClass=NonJava&SenType=All&CD=7%2F1%2F2020+10%3A41%3A50+AM
Here is element source page looks like.
here is my code:
const puppeteer = require("puppeteer");
/**
 * Opens the RWIS history page and counts the <input> elements found by
 * page.evaluate (i.e. by document.querySelectorAll run on the page).
 *
 * @returns {Promise<number>} Resolves to the number of matching elements;
 *   rejects with whatever error launching, navigating or evaluating raised.
 */
async function run() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(
      "http://rwis.mdt.mt.gov/scanweb/swframe.asp?Pageid=SfHistoryTable&Units=English&Groupid=269000&Siteid=269003&Senid=0&DisplayClass=NonJava&SenType=All&CD=7%2F1%2F2020+10%3A41%3A50+AM"
    );
    return await page.evaluate(() => document.querySelectorAll("input").length);
  } finally {
    // Close the browser on success and on failure so it is never left running.
    await browser.close();
  }
}
run().then(console.log).catch(console.error);
Right now my output is 0, but when I run document.querySelectorAll("input").length in the console, it gives me 8.
It seems like everything is loaded inside the frameset tag, which might be the issue. Does anyone have any idea how to solve this?
You have to get the frame element, from there you can get the frame itself so you can call evaluate inside that frame:
// Resolve the SWContent frame element to its frame, then count the <input>
// elements inside that frame.
const elementHandle = await page.$('frame[name=SWContent]');
const frame = await elementHandle.contentFrame();
let urls = await frame.evaluate(() => document.querySelectorAll("input").length);
Requirement is to create folder structure from an array in a SharePoint library using JavaScript. Below is the desired structure:
var ary = [A,B,C]
A -> Fldr1 -> Fldr2 -> File
B -> Fldr1 -> Fldr2 -> File
C -> Fldr1 -> Fldr2 -> File
Currently it creates folders A, B and C in the library, but the inner structure is created for C only.
So result am getting is :
A
B
C -> Fldr1 -> Fldr2 -> File
Below code works perfect when only one item in array, but fails when multiple items.
Here I check if folder exists, then check for 2nd level, if doesn't exist create first and so on for remaining structure.
// For each selected country: check the country folder, then the folder, then
// the sub-folder, and create the file (creating the sub-folder first when it
// is missing). Nothing is created when the country or the folder is missing.
// NOTE(review): flag, flagFldr and flagSub are not declared in this function;
// presumably the check* helpers set them — confirm. If they are shared, the
// concurrently running callbacks below can overwrite each other's values.
async function processArray(selectedCountries) {
// map() returns an array of promises; awaiting the array itself does not wait
// for those promises to settle.
await selectedCountries.map(async (item) => {
let promiseCntry = await checkCntryFolder(item);
if(flag){ //if cntry exists
let promiseChckfolder = await checkFolder(tdmrkNm);
if(flagFldr)//if folder exists
{
let promiseChkSubFolder = await checkSubFolder(appStatus);
if(flagSub)//if sub -folder exists
{
let createFile = await CreateFileInSideFolder();
}
else
{
let promiseCreateSub = await createSubFolder(appStatus);
let createFile = await CreateFileInSideFolder();
}
}
}
});
}
// NOTE(review): the closing brace below has no matching opener in this snippet.
}
Stop using deferreds and stop using the done method. Use proper promises with then instead.
Assuming this is jQuery, where those objects with done methods also have then methods, you can (and should) just use async/await directly:
/**
 * Runs the country -> folder -> sub-folder checks for every item in parallel
 * and creates the file only when all three checks report a truthy result.
 *
 * @param {Array} array Items handed one by one to checkCntryFolder.
 * @returns {Promise<Array<undefined>>} Settles once every item has been processed.
 */
async function callAry(array) {
  const processItem = async (item) => {
    // Stop at the first check that does not pass.
    if (!(await checkCntryFolder(item))) return;
    if (!(await checkFolder(nm))) return;
    if (!(await checkSubFolder(Status))) return;
    await CreateFileInSideFolder();
    console.log('file done');
  };
  return Promise.all(array.map((item) => processItem(item)));
}
ok, so finally I have got it working:
I was not passing proper arguments to the methods.
Below is the working code:
/**
 * For every selected country, makes sure the country folder, the folder and
 * the sub-folder exist (creating whichever is missing, in that order) and
 * then creates the file inside the sub-folder.
 *
 * Every branch of the earlier nested if/else version performed this same
 * check-then-create sequence, so it is written once here.
 *
 * Reads tdmrkNm, appStatus, flag, flagFldr and flagSub from the enclosing scope.
 * NOTE(review): the flags are presumably set by the check* helpers — confirm.
 * If they are shared variables, items processed in parallel can overwrite each
 * other's flags; having the helpers return the boolean would avoid that.
 *
 * @param {Array} selectedCountries Items passed to the folder helpers.
 * @returns {Promise<Array<undefined>>} Settles once every item has been processed.
 */
async function processArray(selectedCountries) {
  return Promise.all(selectedCountries.map(async (item) => {
    await checkCntryFolder(item);
    if (!flag) {
      // country folder doesn't exist yet
      await createCntryFolder(item);
    }

    await checkFolder(tdmrkNm, item);
    if (!flagFldr) {
      // folder doesn't exist yet
      await createFolder(tdmrkNm, item);
    }

    await checkSubFolder(appStatus, tdmrkNm, item);
    if (!flagSub) {
      // sub-folder doesn't exist yet
      await createSubFolder(appStatus, tdmrkNm, item);
    }

    await CreateFileInSideFolder(appStatus, tdmrkNm, item);
  }));
}
I have a function which takes in a tag name and text as input and returns all the elements made of the given tag containing the given text as output (I get an array of all the elements having the matching text).
I will be using this function across multiple functions so I thought I could save it in another file and import the function into all the other files that I may need but I am unable to transfer the element. I am using puppeteer to open the browser and get my required documents.
The code I am importing:
commonFunctions.js:
module.exports = {
matchTagAndTextContents: async function matchTagAndTextContents(page, selector, text) {
const ele = await page.evaluate((selector,text) => {
function matchTagAndText(sel, txt) {
var elements = document.querySelectorAll(selector);
return Array.prototype.filter.call(elements, function(element){
return RegExp(text).test(element.textContent);
});
}
const matchedElements = matchTagAndText(selector,text);
return matchedElements;
},selector,text);
return ele;
}
}
Another file where I try to use the imported function:
foo.js:
const commonFunctions = require('./commonFunctions');
const puppeteer = require('puppeteer');
// Launches a browser, opens google.com, collects the <h1> elements whose text
// matches 'Google' through the shared helper and then tries to click each one.
let browser = null;
// This await sits outside the async function below.
browser = await puppeteer.launch({args: ['--no-sandbox', '--disable-setuid-sandbox']});
(async () => {
let page = await browser.newPage();
await page.goto("https://www.google.com");
let elem = null;
// The helper's result is copied into elem by the then() callback; the await
// makes sure that has happened before the next statement runs.
await commonFunctions.matchTagAndTextContents(page,'h1','Google').then( res => {
elem = res;
});
// NOTE(review): elem is handed back into the page as an evaluate argument;
// confirm its entries are still DOM elements at this point — the reported
// "el.click is not a function" suggests they are not.
await page.evaluate((elem) => {
elem.forEach( el => {
el.click();
})
},elem);
})();
Here inside foo.js I keep getting el.click() is not a function, but if I implement the forEach inside the commonFunctions.js like:
// Variant placed inside commonFunctions.js: click every matched element where
// matchedElements is produced, i.e. inside the page.evaluate callback.
matchedElements.forEach( el => {
el.click();
});
It works and the element gets clicked. What am I doing wrong?
That's because elem is null in your execution and res is assigned to elem in the evaluated scope.
Try changing
// Original form: elem is filled in from a then() callback on the awaited call.
let elem = null;
await commonFunctions.matchTagAndTextContents(page,'h1','Google').then( res => {
elem = res;
});
// elem is passed to the page as an argument and each entry is clicked there.
await page.evaluate((elem) => {
elem.forEach( el => {
el.click();
})
},elem);
with
// Suggested form: assign the awaited result directly instead of using then().
var elem = null;
elem = await commonFunctions.matchTagAndTextContents(page,'h1','Google');
// NOTE(review): elem still travels out of one page.evaluate call and back into
// another; confirm its entries are real DOM elements with a click() method here.
await page.evaluate((elem) => {
elem.forEach( el => {
el.click();
})
},elem);