Puppeteer & cycling a process through multiple users - javascript

I'm trying to scrape information from a webpage behind a login wall for two users. As it stands, I've managed to get the code to do what I want for the first user i.e. go to webpage, login, gather the links associated with properties in a saved list, use that list to gather more details and log them to console.
The challenge I have now is getting the code to loop this round the second user without having to dupe the code. How would you suggest I go about it?
Secondly I need to make the array for each user, declared as uniquePropertyLinks in the below, accessible outside of the function userProcess.
How can I produce a new array for each user?
How can I access the array outside the function?
Here is the code:
const puppeteer = require('puppeteer');
//Code to locate text and enable it to be clicked
const escapeXpathString = str => {
const splitedQuotes = str.replace(/'/g, `', "'", '`);
return `concat('${splitedQuotes}', '')`;
};
const clickByText = async (page, text) => {
const escapedText = escapeXpathString(text);
const linkHandlers = await page.$x(`//a[contains(text(), ${escapedText})]`);
if (linkHandlers.length > 0) {
await linkHandlers[0].click();
} else {
throw new Error(`Link not found: ${text}`);
}
};
//User credentials
const userAEmail = 'abc#hotmail.com';
const userAPassword = '123';
const userBEmail = 'def#hotmail.com';
const userBPassword = '456';
//Logout
const LogOut = async (page) => {
await page.goto('https://www.website.com');
await clickByText(page, 'Log out');
await page.waitForNavigation({waitUntil: 'load'});
console.log('Signed out');
};
///////////////////////////
//SCRAPE PROCESS
async function userProcess() {
try {
const browser = await puppeteer.launch({ headless : false });
const page = await browser.newPage();
page.setUserAgent('BLAHBLAHBLAH');
//Go to Website saved list
await page.goto('https://www.website.com/shortlist.html', {waitUntil: 'networkidle2'});
console.log('Page loaded');
//User A log in
await page.type('input[name=email]', userAEmail, {delay: 10});
await page.type('input[name=password]', userAPassword, {delay: 10});
await page.click('.mrm-button',{delay: 10});
await page.waitForNavigation({waitUntil: 'load'})
console.log('Signed in');
//Wait for website saved list to load
const propertyList = await page.$$('.title');
console.log(propertyList.length);
//Collecting links from saved list and de-duping into an array
const propertyLinks = await page.evaluate(() => Array.from(document.querySelectorAll('.sc-jbKcbu'), e => e.href));
let uniquePropertyLinks = [...new Set(propertyLinks)];
console.log(uniquePropertyLinks);
//Sign out
LogOut(page);
} catch (err) {
console.log('Our error - ', err.message);
}
};
userProcess();

Let's see some of the things you might need to complete your task. I think it's better to take time and develop the skills yourself, but I can perhaps point out a few key things.
You use:
const userAEmail = 'abc#hotmail.com';
const userAPassword = '123';
const userBEmail = 'def#hotmail.com';
const userBPassword = '456';
but then you're talking about looping. With such a data structure, it will be difficult to loop these two users. I recommend putting it into an object like so:
const users = {
a: {
email: 'abc#hotmail.com',
password: '123',
},
b: {
email: 'def#hotmail.com',
password: '456',
},
};
then you can easily look with for example for .. in:
for (const user in users) {
console.log(users[user]);
}
or with .forEach():
Object.values(users).forEach(user => {
console.log(user);
});
need to make the array for each user, declared as uniquePropertyLinks in the below, accessible outside of the function userProcess.
Then declare the array outside of the funtion:
let uniquePropertyLinks = [];
async function userProcess() {
// you can access uniquePropertyLinks here
}
// and you can access uniquePropertyLinks here as well
How can I produce a new array for each user? How can I access the array outside the function?
Again, it'd be better to choose a differen data structure, let's day an object with keys that would represent each user and values would be arrays. It'd look like so:
let uniquePropertyLinks = {};
uniquePropertyLinks.a = [];
uniquePropertyLinks.b = [];
which looks like this:
{ a: [], b: [] }
so you can save whatever values for user a into uniquePropertyLinks.a array and whatever values you need into uniquePropertyLinks.b array:
uniquePropertyLinks.a.push('new_value_for_a_user');
similarly for user b.
Now you should have all the bits you need in order to go back to your code and make the necessary changes.

For those looking for the results of pavelsaman's advice below is the updated code:
const puppeteer = require('puppeteer');
//Object storing user credentials
let userAEmail = 'abc';
let userAPassword = '123';
let userBEmail = 'def';
let userBPassword = '456';
const users = {
userA: {
email: userAEmail,
password: userAPassword,
},
userB: {
email: userBEmail,
password: userBPassword,
},
};
//Object storing users saved lists as arrays
const usersPropertyLinks = {};
usersPropertyLinks.userA = [];
usersPropertyLinks.userB = [];
//Function to retrieve users saved list of properties
async function retrieveUserSavedList(users, usersPropertyLinks) {
try {
//Load broswer
const browser = await puppeteer.launch({ headless : true });
const page = await browser.newPage();
page.setUserAgent('BLAHHBLAHHBLAHH');
for (const user in users) {
//Go to saved list
await page.goto('https://www.website.co.uk/user/shortlist.html', {waitUntil: 'networkidle2'});
await page.waitForSelector('.mrm-button');
//User log in
await page.type('input[name=email]', users[user].email, {delay: 10});
await page.type('input[name=password]', users[user].password, {delay: 10});
await page.click('.mrm-button',{delay: 10});
await page.waitForNavigation({waitUntil: 'load'})
console.log('Success: ' + users[user].email + ' logged in');
//Collecting saved property links and de-duping into an array
const propertyLinks = await page.evaluate(() => Array.from(document.querySelectorAll('.sc-jbKcbu'), e => e.href));
//Add saved property links to an array for each user
if (users[user].email === userAEmail ) {
usersPropertyLinks.userA.push(...new Set(propertyLinks));
} else if (users[user].email === userBEmail ) {
usersPropertyLinks.userB.push(...new Set(propertyLinks));
} else {
console.log('problem saving links to user array');
};
//Sign out
await page.click('.sc-kAzzGY',{delay: 10});
await page.waitForNavigation({waitUntil: 'load'});
console.log('Success: ' + users[user].email + ' logged out');
};
browser.close();
} catch (err) {
console.log('Error retrieve user saved list - ', err.message);
}
};
//Run the code
retrieveUserSavedList(users, usersPropertyLinks);

Related

How to select specific button in puppeteer

So I'm building a program that scrapes Poshmark webpages and extracts the usernames of each seller on the page!
I want it to go through every page using the 'next' button, but theres 6 buttons all with the same class name...
Heres the link: https://poshmark.com/category/Men-Jackets_&_Coats?sort_by=like_count&all_size=true&my_size=false
(In my google chrome this page has an infinite scroll (hence the scrollToBottom async function i started writing) but i realized inside puppeteer's chrome it has 'next page' buttons.)
The window displays page 1-5 and then the 'next page' button.
The problem is that all of the buttons share the same html class name, so I'm confused on how to differentiate.
const e = require('express');
const puppeteer = require('puppeteer');
const url = "https://poshmark.com/category/Men-Jackets_&_Coats?sort_by=like_count&all_size=true&my_size=false";
let usernames = [];
const initItemArea = async (page) => {
const itemArea = await page.evaluate(() => {
return Array.from(document.querySelectorAll('.tc--g.m--l--1.ellipses')).map(x => x.textContent);
});
}
const pushToArray = async (itemArea, page) => {
itemArea.forEach(function (element) {
//console.log('username: ', $(element).text());
usernames.push(element);
});
};
const scrollToBottom = async (itemArea, page) => {
while (true) {
previousHeight = await page.evaluate('document.body.scrollHeight');
await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`);
await new Promise((resolve) => setTimeout(resolve, 1000));
await page.screenshot({path : "ss.png"})
}
};
const gotoNextPage = async (page) => {
await page.waitForSelector(".button.btn.btn--pagination");
const nextButton = await page.evaluate((page) => {
document.querySelector(".button.btn.btn--pagination")
});
await page.click(nextButton);
console.log('Next Page Loading')
};
async function main() {
const client = await puppeteer.launch({
headless: false,
executablePath: "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
});
const page = await client.newPage();
await page.goto(url);
await page.waitForSelector(".tc--g.m--l--1.ellipses");
const itemArea = await page.evaluate(() => {
return Array.from(document.querySelectorAll('.tc--g.m--l--1.ellipses')).map(x => x.textContent);
});
gotoNextPage(page)
};
main();
Currently, my gotoNextPage function doesnt even find the button, so i thought i'd entered the selector wrong...
Then when I went to find the selector, I realized all buttons have the same one anyway...
My html knowledge is basically nonexistent, but I want to finish this project out. All help is very appreciated.
Bonus: my initPageArea function doesn't work when I call as a function like that, so I hardcoded it into main()...
I'll be diving deep into this problem later on, as I've seen it before, but any quick answers / direction would be awesome.
Thanks a lot.
you can try selecting the buttons using their position in the page.
For example, you can select the first button using the following CSS selector:
.button.btn.btn--pagination:nth-child(1)
to select the second button:
.button.btn.btn--pagination:nth-child(2)
Got the idea? :)
you can refactor your gotoNextPage function to use this approach, consider this example:
const gotoNextPage = async (page, buttonIndex) => {
await page.waitForSelector(".button.btn.btn--pagination");
// Select the button using its position in the page
const nextButton = await page.evaluate((buttonIndex) => {
return document.querySelector(`.button.btn.btn--pagination:nth-child(${buttonIndex})`);
}, buttonIndex);
// Click on the button
await page.click(nextButton);
console.log("Next Page Loading");
};
Whenever you're messing with buttons and scroll, it's a good idea to think about where the data is coming from. It's usually being delivered to the front-end via a JSON API, so you might as well try to hit that API directly rather than mess with the DOM.
const url = maxId => `https://poshmark.com/vm-rest/channel_groups/category/channels/category/collections/post?request={%22filters%22:{%22department%22:%22Men%22,%22category_v2%22:%22Jackets_%26_Coats%22,%22inventory_status%22:[%22available%22]},%22sort_by%22:%22like_count%22,%22facets%22:[%22color%22,%22brand%22,%22size%22],%22experience%22:%22all%22,%22sizeSystem%22:%22us%22,%22max_id%22:%22${maxId}%22,%22count%22:%2248%22}&summarize=true&pm_version=226.1.0`;
(async () => {
const usernames = [];
for (let maxId = 1; maxId < 5 /* for testing */; maxId++) {
const response = await fetch(url(maxId)); // Node 18 or install node-fetch
if (!response.ok) {
throw Error(response.statusText);
}
const payload = await response.json();
if (payload.error) {
break;
}
usernames.push(...payload.data.map(e => e.creator_username));
}
console.log(usernames.slice(0, 10));
console.log("usernames.length", usernames.length);
})()
.catch(err => console.error(err));
The response blob has a ton of additional data.
I would add a significant delay between requests if I were to use code like this to avoid rate limiting/blocking.
If you're set on Puppeteer, something like this should work as well, although it's slower and I didn't have time to run to the end of the 5k (or more?) users:
const puppeteer = require("puppeteer"); // ^19.1.0
const url = "Your URL";
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.goto(url, {waitUntil: "domcontentloaded"});
const usernames = [];
const sel = ".tc--g.m--l--1.ellipses";
for (;;) {
try {
await page.waitForSelector(sel);
const users = await page.$$eval(sel, els => {
const text = els.map(e => e.textContent);
els.forEach(el => el.remove());
return text;
});
console.log(users); // optional for debugging
usernames.push(...users);
await page.$$eval(
".btn--pagination",
els => els.find(el => el.textContent.includes("Next")).click()
);
}
catch (err) {
break;
}
}
console.log(usernames);
console.log(usernames.length);
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
I don't think navigations are triggered by the "Next" button, so my strategy for detecting when a page transition has occurred involves destroying the current set of elements after scraping the usernames, then waiting until the next batch shows up. This may seem inelegant, but it's easy to implement and seems reliable, not making assumptions about the usernames themselves.
It's also possible to use Puppeteer and make or intercept API requests, armed with a fresh cookie. This is sort of halfway between the two extremes shown above. For example:
const puppeteer = require("puppeteer");
const url = "Your URL";
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.goto(url, {waitUntil: "domcontentloaded"});
const usernames = await page.evaluate(async () => {
const url = maxId => `https://poshmark.com/vm-rest/channel_groups/category/channels/category/collections/post?request={%22filters%22:{%22department%22:%22Men%22,%22category_v2%22:%22Jackets_%26_Coats%22,%22inventory_status%22:[%22available%22]},%22sort_by%22:%22like_count%22,%22facets%22:[%22color%22,%22brand%22,%22size%22],%22experience%22:%22all%22,%22sizeSystem%22:%22us%22,%22max_id%22:%22${maxId}%22,%22count%22:%2248%22}&summarize=true&pm_version=226.1.0`;
const usernames = [];
try {
for (let maxId = 1; maxId < 5 /* for testing */; maxId++) {
const response = await fetch(url(maxId)); // node 18 or install node-fetch
if (!response.ok) {
throw Error(response.statusText);
break;
}
const json = await response.json();
if (json.error) {
break;
}
usernames.push(...json.data.map(e => e.creator_username));
}
}
catch (err) {
console.error(err);
}
return usernames;
});
console.log(usernames);
console.log("usernames.length", usernames.length);
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
The above code limits to 4 requests to keep it simple and easy to validate.
Blocking images and other unnecessary resources can help speed the Puppeteer versions up, left as an exercise (or just use the direct fetch version shown at top).

Web-scraping Company data with puppeteer

I am trying to get the company data from this website called similar web but upon making a lot of requests it recognizes my script as a bot so is there any way to bypass this check? or suggest any website to scrap data easily, we can't use LinkedIn by the way.
const puppeteer = require("puppeteer");
const searchCompany = "zoominfo.com";
const Link = `https://www.similarweb.com/website/${searchCompany}/#overview`;
// console.log(companyPage);
let page;
(async function () {
try {
let browserOpen = await puppeteer.launch({
headless: false,
// dumpio: true,
// args: ["--start-maximized"],
defaultViewport: null,
});
let newTab = await browserOpen.newPage();
await newTab.goto(Link);
await newTab.screenshot({ path: "sc.png" });
await newTab.waitForSelector(".data-company-info__row");
let ans = await newTab.evaluate(() => {
let name = document.querySelectorAll(".data-company-info__row")[0]
.textContent;
let location = document.querySelectorAll(".data-company-info__row")[3]
.textContent;
let industry = document.querySelectorAll(".data-company-info__row")[5]
.textContent;
// console.log(ans);
return { name, location, industry };
});
console.log(ans);
await browserOpen.close();
} catch (err) {
console.log(err);
}
})();
Just out of curiosity - what do you use similarweb data for?
You can try using https://github.com/bda-research/node-crawler that has delays and max connections params

How to get links from multiple pages in a single array

I have a working code that successfully obtains all product links from multiple pages that have at least a 20% discount. The only problem is that it returns links in the arrays for each page separately. However, I would like it to return links for all pages in a single array and then transfer them to another function. I tried to create a string var all_links = [] and push all the links from each page into it and then return them like return all_links, as I know from a simpler example. However, I have not been successful in this case because I have no experience with coding. I started learning the basics three weeks ago. I would be very grateful if you could help me with the whole code as I don't have the necessary prior knowledge.
const puppeteer = require('puppeteer')
const minDiscount = 20;
async function getLinks() {
const browser = await puppeteer.launch({
headless: false,
defaultViewport: null,
});
const page = await browser.newPage();
const url = 'https://www.mytoys.de/spielzeug-spiele/holz/';
await page.goto(url);
// getting all the products, this will return an array of ElementHandle
while(await page.$(".pager__link--next")){
await page.waitForSelector(".pager__link--next")
await page.waitForTimeout(1000);
await page.click('.pager__link--next')
await page.waitForTimeout(1500);
const products = await page.$$('.prod-grid.js-prod-grid .prod-grid__item.js-prod-grid_item');
const proms = await Promise.allSettled(
products.map(async (prod) => {
// searching for a discount on each product
const disc = await prod.$$eval(
'.prod-grid.js-prod-grid .prod-flag.prod-flag-sale',
(discount) =>
discount.map((discItem) =>
discItem.innerText.replace(/[^0-9.]/g, '').replace(/\D+/g,'0')
)
);
// if it has a discount
if (disc.length > 0) {
// we parse the discount to Integer type to compare it to minDiscount
const discountInt = parseInt(disc[0], 10);
if (discountInt >= minDiscount) {
// we get the link of the product
const link = await prod.$$eval('.prod-grid.js-prod-grid .prod-tile__link.js-prodlink', (allAs) => allAs.map((a) => a.href));
if (link.length > 0) {
// push an object containing the discount and the link of the product
return link[0];
}
}
}
return null;
})
);
const bulkArray = proms.map((item) => {
if (item.status === 'fulfilled') return item.value;
});
const endArray = bulkArray.filter(item => item !== null);
console.log(endArray);
}
}
getLinks();
An example of the result I am currently obtaining
[
'https://www.mytoys.de/erzi-kinderwurst-sortiment-spiellebensmittel-6749036.html',
'https://www.mytoys.de/chr-tanner-spiellebensmittel-wurststaender-1031946.html',
'https://www.mytoys.de/hape-xylophon-und-hammerspiel-2503719.html',
'https://www.mytoys.de/erzi-kinderparty-spiellebensmittel-6749035.html',
]
[
'https://www.mytoys.de/brio-holzeisenbahnset-landleben-5501952.html',
'https://www.mytoys.de/brio-brio-33277-bahn-ir-reisezug-set-4592516.html',
'https://www.mytoys.de/brio-parkhaus-strassen-schienen-3175226.html',
'https://www.mytoys.de/mytoys-steckwuerfel-12-tlg-11389814.html',
'https://www.mytoys.de/brio-schienen-und-weichensortiment-1758325.html',
]
[
'https://www.mytoys.de/hape-grosser-baukran-4141517.html',
'https://www.mytoys.de/noris-mein-buntes-tuermchenspiel-3421170.html',
'https://www.mytoys.de/goki-ziehtier-schaf-suse-2488933.html',
'https://www.mytoys.de/eichhorn-colorsoundzug-mit-licht-1521635.html',
]
An example of the result you would like to obtain
[
'https://www.mytoys.de/erzi-kinderwurst-sortiment-spiellebensmittel-6749036.html',
'https://www.mytoys.de/chr-tanner-spiellebensmittel-wurststaender-1031946.html',
'https://www.mytoys.de/hape-xylophon-und-hammerspiel-2503719.html',
'https://www.mytoys.de/erzi-kinderparty-spiellebensmittel-6749035.html',
'https://www.mytoys.de/brio-holzeisenbahnset-landleben-5501952.html',
'https://www.mytoys.de/brio-brio-33277-bahn-ir-reisezug-set-4592516.html',
'https://www.mytoys.de/brio-parkhaus-strassen-schienen-3175226.html',
'https://www.mytoys.de/mytoys-steckwuerfel-12-tlg-11389814.html',
'https://www.mytoys.de/brio-schienen-und-weichensortiment-1758325.html',
'https://www.mytoys.de/hape-grosser-baukran-4141517.html',
'https://www.mytoys.de/noris-mein-buntes-tuermchenspiel-3421170.html',
'https://www.mytoys.de/goki-ziehtier-schaf-suse-2488933.html',
'https://www.mytoys.de/eichhorn-colorsoundzug-mit-licht-1521635.html',
]
Declare new variable for links collecting before your loop:
const allLinks = []; // <--
while(await page.$(".pager__link--next")){ ... }
Push all links into it:
...
const endArray = bulkArray.filter(item => item !== null);
console.log(endArray);
allLinks.push(endArray); // <--
Return / log result after loop execution:
async function getLinks() {
...
return allLinks.flat(); // <--
}
console.log(await getLinks()) // result array
Refs: Array.prototype.flat()

Exporting result from async function in separate js file, importing result in another javascript

Trying to build a small scraper. To reuse functionality I thought 'Page Object Models' would come in handy.
In main.js I require multiple small scripts, in the example below there is only one model (GooglePage).
The scripts work. But I would like to know how to pass a value from the google.js script back to the main script.
I want to use the value of the 'pageCountClean' variable in the main.js script to use in the rest of the application.
Have been searching for information about passing values and functions between scripts. For exporting values from pageconstructors, for promise await export function.
But I am lost. Do I have to use Promises?, is the current way of require/importing and exporting enough to create the relationship between the scripts?
Any pointers are welcome.
//////////// main.js
const { chromium } = require('playwright');
const { GooglePage } = require('./models/Google');
(async () => {
const browser = await chromium.launch({ headless: true, slowMo: 250 });
const context = await browser.newContext();
const GoogleUrl80 = https://www.google.nl/search?q=site%3Anu.nl;
// Cookie consent:
console.log('Cookie consent - start');
const page80 = await browser.newPage();
await page80.goto('https://google.nl');
await page80.waitForTimeout(1000);
await page80.keyboard.press('Tab');
await page80.keyboard.press('Tab');
await page80.keyboard.press('Enter');
console.log('Cookie Consent - done');
// Number of urls in google.nl (using google.js)
await page80.goto(GoogleUrl80, {waitUntil: 'networkidle'});
const googlePage80 = new GooglePage(page80);
await googlePage80.scrapeGoogle();
// Want to console.log 'pageCountClean' here.
await browser.close()
})()
//////////// Google.js
class GooglePage {
constructor(page) {
this.page = page;
}
async scrapeGoogle() {
const GoogleXpath = '//div[#id="result-stats"]';
const pageCount = await this.page.$eval(GoogleXpath, (el) => el.innerText);
const pageCountClean = pageCount.split(" ")[1];
console.log(pageCountClean);
}
}
module.exports = { GooglePage };
You can just return pageCountClean from your async function and await it in your main.js file:
in Google.js:
async scrapeGoogle() {
const GoogleXpath = '//div[#id="result-stats"]';
const pageCount = await this.page.$eval(GoogleXpath, (el) => el.innerText);
const pageCountClean = pageCount.split(" ")[1];
console.log(pageCountClean);
return pageCountClean;
}
in main.js:
const googlePage80 = new GooglePage(page80);
const result = await googlePage80.scrapeGoogle();
console.log(result);

Iterate through all documents in a MongoDB collection and saving the data in array

I thought this would be a straightforward task but what I am trying to do is go through all the documents (users) in my collection using a cursor and saving some data into a JS array or set about a specific field of the user or all of the data on that user. I am able to see each document printed on the console, but after looping through all the documents, my array is still empty when printed. Where am I going wrong and if there is an alternative approach please let me know.
const mongoose = require('mongoose');
let Users = require('./models/user.model');
const uri = process.env.ATLAS_URI;
mongoose.connect(uri, { useNewUrlParser: true, useCreateIndex:true, useUnifiedTopology: true});
const connection = mongoose.connection;
connection.once('open', () => {
console.log("MongoDB connection success");
})
let arr = [];
async function getRecords() {
let cursor = Users.find({}).cursor();
for (let doc = await cursor.next(); doc != null; doc = await cursor.next()) {
arr.push(doc); //does not work :(
console.log(doc); //this works
}
}
getRecords();
console.log("ARRAY:",arr); //prints []
Suggestion
Your code should read as such
let arr = [];
// This is an async function (returns a promise)
async function getRecords() {
let docs = await Users.find({}).lean();
arr = docs.filter((doc) => doc !== null); // As an example, however, enter appropriate condition for filter
return arr;
}
// Call the async funtion
getRecords().then(docs => {
console.log("ARRAY:", arr);
});

Categories

Resources