How to get links from multiple pages in a single array - javascript

I have a working code that successfully obtains all product links from multiple pages that have at least a 20% discount. The only problem is that it returns links in the arrays for each page separately. However, I would like it to return links for all pages in a single array and then transfer them to another function. I tried to create a string var all_links = [] and push all the links from each page into it and then return them like return all_links, as I know from a simpler example. However, I have not been successful in this case because I have no experience with coding. I started learning the basics three weeks ago. I would be very grateful if you could help me with the whole code as I don't have the necessary prior knowledge.
const puppeteer = require('puppeteer')
const minDiscount = 20;
/**
 * Opens the mytoys.de wooden-toys listing in a visible browser and keeps
 * clicking the "next" pager link for as long as one is present. After each
 * click it looks at every product tile, keeps the first link of the tiles
 * whose sale flag parses to at least `minDiscount`, and logs those links as
 * one array per visited page.
 *
 * Nothing is returned; the only output is one console.log per page.
 */
async function getLinks() {
  const launchOptions = { headless: false, defaultViewport: null };
  const browser = await puppeteer.launch(launchOptions);
  const page = await browser.newPage();
  await page.goto('https://www.mytoys.de/spielzeug-spiele/holz/');

  const nextSelector = '.pager__link--next';

  // Resolves to the tile's first product link when its sale flag reaches the
  // threshold, and to null in every other case.
  const linkIfDiscounted = async (tile) => {
    const flags = await tile.$$eval(
      '.prod-grid.js-prod-grid .prod-flag.prod-flag-sale',
      (nodes) =>
        nodes.map((node) =>
          node.innerText.replace(/[^0-9.]/g, '').replace(/\D+/g,'0')
        )
    );
    if (flags.length === 0) {
      return null;
    }
    // The flag text is compared as an integer against minDiscount.
    const percent = parseInt(flags[0], 10);
    if (!(percent >= minDiscount)) {
      return null;
    }
    const hrefs = await tile.$$eval(
      '.prod-grid.js-prod-grid .prod-tile__link.js-prodlink',
      (anchors) => anchors.map((anchor) => anchor.href)
    );
    return hrefs.length > 0 ? hrefs[0] : null;
  };

  while (await page.$(nextSelector)) {
    await page.waitForSelector(nextSelector);
    await page.waitForTimeout(1000);
    await page.click(nextSelector);
    await page.waitForTimeout(1500);

    // One ElementHandle per product tile on the page that is now showing.
    const tiles = await page.$$('.prod-grid.js-prod-grid .prod-grid__item.js-prod-grid_item');
    const outcomes = await Promise.allSettled(tiles.map((tile) => linkIfDiscounted(tile)));

    const pageLinks = [];
    for (const outcome of outcomes) {
      // Rejected lookups contribute undefined, which is not filtered out below.
      const value = outcome.status === 'fulfilled' ? outcome.value : undefined;
      if (value !== null) {
        pageLinks.push(value);
      }
    }
    console.log(pageLinks);
  }
}
getLinks();
An example of the result I am currently obtaining
[
'https://www.mytoys.de/erzi-kinderwurst-sortiment-spiellebensmittel-6749036.html',
'https://www.mytoys.de/chr-tanner-spiellebensmittel-wurststaender-1031946.html',
'https://www.mytoys.de/hape-xylophon-und-hammerspiel-2503719.html',
'https://www.mytoys.de/erzi-kinderparty-spiellebensmittel-6749035.html',
]
[
'https://www.mytoys.de/brio-holzeisenbahnset-landleben-5501952.html',
'https://www.mytoys.de/brio-brio-33277-bahn-ir-reisezug-set-4592516.html',
'https://www.mytoys.de/brio-parkhaus-strassen-schienen-3175226.html',
'https://www.mytoys.de/mytoys-steckwuerfel-12-tlg-11389814.html',
'https://www.mytoys.de/brio-schienen-und-weichensortiment-1758325.html',
]
[
'https://www.mytoys.de/hape-grosser-baukran-4141517.html',
'https://www.mytoys.de/noris-mein-buntes-tuermchenspiel-3421170.html',
'https://www.mytoys.de/goki-ziehtier-schaf-suse-2488933.html',
'https://www.mytoys.de/eichhorn-colorsoundzug-mit-licht-1521635.html',
]
An example of the result you would like to obtain
[
'https://www.mytoys.de/erzi-kinderwurst-sortiment-spiellebensmittel-6749036.html',
'https://www.mytoys.de/chr-tanner-spiellebensmittel-wurststaender-1031946.html',
'https://www.mytoys.de/hape-xylophon-und-hammerspiel-2503719.html',
'https://www.mytoys.de/erzi-kinderparty-spiellebensmittel-6749035.html',
'https://www.mytoys.de/brio-holzeisenbahnset-landleben-5501952.html',
'https://www.mytoys.de/brio-brio-33277-bahn-ir-reisezug-set-4592516.html',
'https://www.mytoys.de/brio-parkhaus-strassen-schienen-3175226.html',
'https://www.mytoys.de/mytoys-steckwuerfel-12-tlg-11389814.html',
'https://www.mytoys.de/brio-schienen-und-weichensortiment-1758325.html',
'https://www.mytoys.de/hape-grosser-baukran-4141517.html',
'https://www.mytoys.de/noris-mein-buntes-tuermchenspiel-3421170.html',
'https://www.mytoys.de/goki-ziehtier-schaf-suse-2488933.html',
'https://www.mytoys.de/eichhorn-colorsoundzug-mit-licht-1521635.html',
]

Declare new variable for links collecting before your loop:
const allLinks = []; // <--
while(await page.$(".pager__link--next")){ ... }
Push all links into it:
...
const endArray = bulkArray.filter(item => item !== null);
console.log(endArray);
allLinks.push(endArray); // <--
Return / log result after loop execution:
async function getLinks() {
...
return allLinks.flat(); // <--
}
console.log(await getLinks()) // result array
Refs: Array.prototype.flat()

Related

How to select specific button in puppeteer

So I'm building a program that scrapes Poshmark webpages and extracts the usernames of each seller on the page!
I want it to go through every page using the 'next' button, but theres 6 buttons all with the same class name...
Heres the link: https://poshmark.com/category/Men-Jackets_&_Coats?sort_by=like_count&all_size=true&my_size=false
(In my google chrome this page has an infinite scroll (hence the scrollToBottom async function i started writing) but i realized inside puppeteer's chrome it has 'next page' buttons.)
The window displays page 1-5 and then the 'next page' button.
The problem is that all of the buttons share the same html class name, so I'm confused on how to differentiate.
const e = require('express');
const puppeteer = require('puppeteer');
const url = "https://poshmark.com/category/Men-Jackets_&_Coats?sort_by=like_count&all_size=true&my_size=false";
let usernames = [];
// Reads the text content of every '.tc--g.m--l--1.ellipses' element on the page.
// NOTE(review): the collected array is stored in the local `itemArea` but is
// never returned, so `await initItemArea(page)` resolves to undefined.
const initItemArea = async (page) => {
const itemArea = await page.evaluate(() => {
return Array.from(document.querySelectorAll('.tc--g.m--l--1.ellipses')).map(x => x.textContent);
});
}
/**
 * Appends every entry of `itemArea`, in order, to the module-level
 * `usernames` array.
 *
 * @param {Array} itemArea - Values to append.
 * @param {*} page - Accepted but not used by this function.
 * @returns {Promise<void>}
 */
const pushToArray = async (itemArea, page) => {
  itemArea.forEach((name) => {
    usernames.push(name);
  });
};
const scrollToBottom = async (itemArea, page) => {
while (true) {
previousHeight = await page.evaluate('document.body.scrollHeight');
await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`);
await new Promise((resolve) => setTimeout(resolve, 1000));
await page.screenshot({path : "ss.png"})
}
};
// Waits for a pagination button to exist, then tries to click one.
// NOTE(review): the page.evaluate callback below has no `return`, so
// `nextButton` is undefined, and page.click is then called with that value
// instead of a selector string -- presumably why the button is never clicked;
// confirm against the Puppeteer page.click documentation.
const gotoNextPage = async (page) => {
await page.waitForSelector(".button.btn.btn--pagination");
const nextButton = await page.evaluate((page) => {
// querySelector yields only the first element matching this shared class list.
document.querySelector(".button.btn.btn--pagination")
});
await page.click(nextButton);
console.log('Next Page Loading')
};
async function main() {
const client = await puppeteer.launch({
headless: false,
executablePath: "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
});
const page = await client.newPage();
await page.goto(url);
await page.waitForSelector(".tc--g.m--l--1.ellipses");
const itemArea = await page.evaluate(() => {
return Array.from(document.querySelectorAll('.tc--g.m--l--1.ellipses')).map(x => x.textContent);
});
gotoNextPage(page)
};
main();
Currently, my gotoNextPage function doesnt even find the button, so i thought i'd entered the selector wrong...
Then when I went to find the selector, I realized all buttons have the same one anyway...
My html knowledge is basically nonexistent, but I want to finish this project out. All help is very appreciated.
Bonus: my initItemArea function doesn't work when I call it as a function like that, so I hardcoded it into main()...
I'll be diving deep into this problem later on, as I've seen it before, but any quick answers / direction would be awesome.
Thanks a lot.
you can try selecting the buttons using their position in the page.
For example, you can select the first button using the following CSS selector:
.button.btn.btn--pagination:nth-child(1)
to select the second button:
.button.btn.btn--pagination:nth-child(2)
Got the idea? :)
you can refactor your gotoNextPage function to use this approach, consider this example:
/**
 * Clicks one pagination button, chosen by its position among its siblings.
 *
 * @param {object} page - Puppeteer page showing the pagination buttons.
 * @param {number} buttonIndex - 1-based position used in `:nth-child()`.
 * @returns {Promise<void>} Resolves once the click has been dispatched.
 */
const gotoNextPage = async (page, buttonIndex) => {
  // Select the button using its position in the page
  const selector = `.button.btn.btn--pagination:nth-child(${buttonIndex})`;
  await page.waitForSelector(selector);
  // page.click takes a selector string. A DOM element returned from
  // page.evaluate is not usable as a click target on the Node side, so the
  // selector itself is passed here.
  await page.click(selector);
  console.log("Next Page Loading");
};
Whenever you're messing with buttons and scroll, it's a good idea to think about where the data is coming from. It's usually being delivered to the front-end via a JSON API, so you might as well try to hit that API directly rather than mess with the DOM.
const url = maxId => `https://poshmark.com/vm-rest/channel_groups/category/channels/category/collections/post?request={%22filters%22:{%22department%22:%22Men%22,%22category_v2%22:%22Jackets_%26_Coats%22,%22inventory_status%22:[%22available%22]},%22sort_by%22:%22like_count%22,%22facets%22:[%22color%22,%22brand%22,%22size%22],%22experience%22:%22all%22,%22sizeSystem%22:%22us%22,%22max_id%22:%22${maxId}%22,%22count%22:%2248%22}&summarize=true&pm_version=226.1.0`;
(async () => {
const usernames = [];
for (let maxId = 1; maxId < 5 /* for testing */; maxId++) {
const response = await fetch(url(maxId)); // Node 18 or install node-fetch
if (!response.ok) {
throw Error(response.statusText);
}
const payload = await response.json();
if (payload.error) {
break;
}
usernames.push(...payload.data.map(e => e.creator_username));
}
console.log(usernames.slice(0, 10));
console.log("usernames.length", usernames.length);
})()
.catch(err => console.error(err));
The response blob has a ton of additional data.
I would add a significant delay between requests if I were to use code like this to avoid rate limiting/blocking.
If you're set on Puppeteer, something like this should work as well, although it's slower and I didn't have time to run to the end of the 5k (or more?) users:
const puppeteer = require("puppeteer"); // ^19.1.0
const url = "Your URL";
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.goto(url, {waitUntil: "domcontentloaded"});
const usernames = [];
const sel = ".tc--g.m--l--1.ellipses";
for (;;) {
try {
await page.waitForSelector(sel);
const users = await page.$$eval(sel, els => {
const text = els.map(e => e.textContent);
els.forEach(el => el.remove());
return text;
});
console.log(users); // optional for debugging
usernames.push(...users);
await page.$$eval(
".btn--pagination",
els => els.find(el => el.textContent.includes("Next")).click()
);
}
catch (err) {
break;
}
}
console.log(usernames);
console.log(usernames.length);
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
I don't think navigations are triggered by the "Next" button, so my strategy for detecting when a page transition has occurred involves destroying the current set of elements after scraping the usernames, then waiting until the next batch shows up. This may seem inelegant, but it's easy to implement and seems reliable, not making assumptions about the usernames themselves.
It's also possible to use Puppeteer and make or intercept API requests, armed with a fresh cookie. This is sort of halfway between the two extremes shown above. For example:
const puppeteer = require("puppeteer");
const url = "Your URL";
let browser;
// Loads the listing page once, then calls the JSON API from inside that page
// and prints the collected usernames. The browser is closed in all cases.
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  await page.goto(url, {waitUntil: "domcontentloaded"});
  const usernames = await page.evaluate(async () => {
    // Everything in this callback runs inside the browser page, so `fetch`
    // here is the browser's own implementation, not Node's.
    const apiUrl = maxId => `https://poshmark.com/vm-rest/channel_groups/category/channels/category/collections/post?request={%22filters%22:{%22department%22:%22Men%22,%22category_v2%22:%22Jackets_%26_Coats%22,%22inventory_status%22:[%22available%22]},%22sort_by%22:%22like_count%22,%22facets%22:[%22color%22,%22brand%22,%22size%22],%22experience%22:%22all%22,%22sizeSystem%22:%22us%22,%22max_id%22:%22${maxId}%22,%22count%22:%2248%22}&summarize=true&pm_version=226.1.0`;
    const collected = [];
    try {
      for (let maxId = 1; maxId < 5 /* for testing */; maxId++) {
        const response = await fetch(apiUrl(maxId));
        if (!response.ok) {
          throw new Error(response.statusText);
        }
        const json = await response.json();
        // An error payload ends the paging.
        if (json.error) {
          break;
        }
        collected.push(...json.data.map(e => e.creator_username));
      }
    }
    catch (err) {
      // Logged to the page's console; names gathered so far are still returned.
      console.error(err);
    }
    return collected;
  });
  console.log(usernames);
  console.log("usernames.length", usernames.length);
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
The above code limits to 4 requests to keep it simple and easy to validate.
Blocking images and other unnecessary resources can help speed the Puppeteer versions up, left as an exercise (or just use the direct fetch version shown at top).

When I try to web scrape a dynamic website, i get back an empty array

I am trying to web scrape a dynamic website with puppeteer, using this code:
const puppeteer = require('puppeteer');
async function getTokoPedia(){
const browser = await puppeteer.launch({ headless: false }); // for test disable the headlels mode,
const page = await browser.newPage();
await page.setViewport({ width: 1000, height: 926 });
await page.goto("https://store.401games.ca/collections/pokemon-singles",{waitUntil: 'networkidle2'});
console.log("start evaluate javascript")
var productNames = await page.evaluate(()=>{
var div = document.querySelectorAll('.info-container');
console.log(div) // console.log inside evaluate, will show on browser console not on node console
var productnames = []
div.forEach(element => {
var price = element.querySelector(' .fs-result-page-3sdl0h')
if(price != null){
productnames.push(price.innerText);
}
});
return productnames
})
console.log(productNames)
browser.close()
}
getTokoPedia();
However, upon running it, I get back an empty array. How can I fix this?
Two problems:
The elements you want are in a shadow root, so you have to pierce the root as described in Puppeteer not giving accurate HTML code for page with shadow roots.
The cards lazy-load, so you'd have to scroll down to be able to populate their data into the DOM.
But there's an easier way to get the initial set of data, which is in the static HTML as a JSON blob in var meta = {"products":...};. You can scrape it with a regex, as described in this tutorial.
Here's an example showing both approaches:
const puppeteer = require("puppeteer"); // ^14.1.1
let browser;
(async () => {
browser = await puppeteer.launch({headless: true});
const [page] = await browser.pages();
const url = "https://store.401games.ca/collections/pokemon-singles";
await page.goto(url, {waitUntil: "domcontentloaded"});
// here's the hard way for illustration:
const el = await page.waitForSelector("#fast-simon-serp-app");
await page.waitForFunction(({shadowRoot}) =>
shadowRoot.querySelector(".product-card .title")
, {}, el);
const items = await el.evaluate(({shadowRoot}) =>
[...shadowRoot.querySelectorAll(".product-card")]
.map(e => ({
title: e.querySelector(".title")?.textContent,
price: e.querySelector(".price")?.textContent,
}))
);
console.log(items); // just the first 6 or so
// TODO scroll the page to get the rest;
// I didn't bother implementing that...
// ...or do it the easy way:
const html = await page.content();
const pat = /^[\t ]*var meta = ({"products":[^\n]+);$/m;
const data = JSON.parse(html.match(pat)[1]);
console.log(JSON.stringify(data, null, 2));
})()
.catch(err => console.error(err))
.finally(() => browser?.close())
;
At this point, since we're not dealing with anything but the static HTML, you can dump Puppeteer and use axios or fetch to get the data more efficiently:
const axios = require("axios");
axios.get("https://store.401games.ca/collections/pokemon-singles")
.then(({data: body}) => {
const pat = /^[\t ]*var meta = ({"products":[^\n]+);$/m;
const data = JSON.parse(body.match(pat)[1]);
console.log(JSON.stringify(data, null, 2));
})
.catch(err => console.error(err))
;
Now, the data.products array contains 50 but the UI shows 26466 results. If you want more than those initial items from the static HTML's var meta, which appears to be the same on all 1000+ pages, I suggest using the API. A URL looks like https://ultimate-dot-acp-magento.appspot.com/categories_navigation?request_source=v-next&src=v-next&UUID=d3cae9c0-9d9b-4fe3-ad81-873270df14b5&uuid=d3cae9c0-9d9b-4fe3-ad81-873270df14b5&store_id=17041809&cdn_cache_key=1654217982&api_type=json&category_id=269055623355&facets_required=1&products_per_page=5000&page_num=1&with_product_attributes=true. You can see there are ids and keys that probably protect against usage by parties other than the site, but I didn't see any change other than cdn_cache_key after a few tries. I'm not sure how long a URL is valid, but while it is, you can set products_per_page=1000 for example, then move page_num=1 forward 27 times or so. This gets you all of the data while avoiding all of the difficulties of scraping from the page itself.
Here's a pessimistic approach that uses Puppeteer to get an up-to-date URL, in case a URL goes stale:
const axios = require("axios");
const puppeteer = require("puppeteer"); // ^14.1.1
let browser;
(async () => {
browser = await puppeteer.launch({headless: true});
const [page] = await browser.pages();
const url = "https://store.401games.ca/collections/pokemon-singles";
const reqP = page.waitForRequest(res =>
res.url()
.startsWith("https://ultimate-dot-acp-magento.appspot.com/categories_navigation")
);
await page.goto(url, {waitUntil: "domcontentloaded"});
const req = await reqP;
const apiUrl = req
.url()
.replace(/(?<=products_per_page=)(\d+)/, 1000);
const {data} = await axios.get(apiUrl);
console.log(JSON.stringify(data, null, 2));
})()
.catch(err => console.error(err))
.finally(() => browser?.close())
;
And tossing in the loop:
const axios = require("axios");
const fs = require("fs").promises;
const puppeteer = require("puppeteer"); // ^14.1.1
let browser;
// Captures the API request the page itself makes, then replays that URL page
// by page with axios and writes every collected item to data.json.
(async () => {
  browser = await puppeteer.launch({headless: true});
  const [page] = await browser.pages();
  const url = "https://store.401games.ca/collections/pokemon-singles";
  // Start listening before navigating so the request cannot be missed.
  const reqP = page.waitForRequest(request =>
    request.url()
      .startsWith("https://ultimate-dot-acp-magento.appspot.com/categories_navigation")
  );
  await page.goto(url, {waitUntil: "domcontentloaded"});
  const req = await reqP;
  const apiUrl = req
    .url()
    .replace(/(?<=products_per_page=)(\d+)/, "1000");
  const items = [];
  for (let i = 1;; i++) {
    const pageUrl = apiUrl.replace(/(?<=page_num=)(\d+)/, String(i));
    const response = await axios.get(pageUrl);
    const pageItems = response.data.items ?? [];
    // Stop on an empty page as well: if total_results is absent, the
    // comparison below is always false and the loop would never end.
    if (response.status !== 200 ||
        pageItems.length === 0 ||
        items.length >= response.data.total_results) {
      break;
    }
    items.push(...pageItems);
  }
  await fs.writeFile("data.json", JSON.stringify(items));
  console.log(items.slice(0, 10));
  console.log(items.length);
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close())
;
This hammers the site, pulling a ton of data in a short amount of time, so consider this script for educational purposes, or modify it to throttle your requests way back.

Puppeteer & cycling a process through multiple users

I'm trying to scrape information from a webpage behind a login wall for two users. As it stands, I've managed to get the code to do what I want for the first user i.e. go to webpage, login, gather the links associated with properties in a saved list, use that list to gather more details and log them to console.
The challenge I have now is getting the code to loop this round the second user without having to dupe the code. How would you suggest I go about it?
Secondly I need to make the array for each user, declared as uniquePropertyLinks in the below, accessible outside of the function userProcess.
How can I produce a new array for each user?
How can I access the array outside the function?
Here is the code:
const puppeteer = require('puppeteer');
//Code to locate text and enable it to be clicked
/**
 * Turns `str` into an XPath `concat(...)` expression. Each single quote in the
 * input is emitted as its own double-quoted argument, so the text can be used
 * inside an XPath expression regardless of the quotes it contains.
 *
 * @param {string} str - Raw text.
 * @returns {string} An XPath expression that evaluates to `str`.
 */
const escapeXpathString = str => {
  const quoteSafe = str.split("'").join(`', "'", '`);
  return `concat('${quoteSafe}', '')`;
};
/**
 * Clicks the first `<a>` element whose text contains `text`.
 *
 * @param {object} page - Page object providing `$x` for XPath queries.
 * @param {string} text - Text the link must contain.
 * @returns {Promise<void>}
 * @throws {Error} "Link not found: <text>" when no anchor matches.
 */
const clickByText = async (page, text) => {
  const xpath = `//a[contains(text(), ${escapeXpathString(text)})]`;
  const matches = await page.$x(xpath);
  if (!(matches.length > 0)) {
    throw new Error(`Link not found: ${text}`);
  }
  await matches[0].click();
};
//User credentials
const userAEmail = 'abc#hotmail.com';
const userAPassword = '123';
const userBEmail = 'def#hotmail.com';
const userBPassword = '456';
//Logout
const LogOut = async (page) => {
await page.goto('https://www.website.com');
await clickByText(page, 'Log out');
await page.waitForNavigation({waitUntil: 'load'});
console.log('Signed out');
};
///////////////////////////
//SCRAPE PROCESS
async function userProcess() {
try {
const browser = await puppeteer.launch({ headless : false });
const page = await browser.newPage();
page.setUserAgent('BLAHBLAHBLAH');
//Go to Website saved list
await page.goto('https://www.website.com/shortlist.html', {waitUntil: 'networkidle2'});
console.log('Page loaded');
//User A log in
await page.type('input[name=email]', userAEmail, {delay: 10});
await page.type('input[name=password]', userAPassword, {delay: 10});
await page.click('.mrm-button',{delay: 10});
await page.waitForNavigation({waitUntil: 'load'})
console.log('Signed in');
//Wait for website saved list to load
const propertyList = await page.$$('.title');
console.log(propertyList.length);
//Collecting links from saved list and de-duping into an array
const propertyLinks = await page.evaluate(() => Array.from(document.querySelectorAll('.sc-jbKcbu'), e => e.href));
let uniquePropertyLinks = [...new Set(propertyLinks)];
console.log(uniquePropertyLinks);
//Sign out
LogOut(page);
} catch (err) {
console.log('Our error - ', err.message);
}
};
userProcess();
Let's see some of the things you might need to complete your task. I think it's better to take time and develop the skills yourself, but I can perhaps point out a few key things.
You use:
const userAEmail = 'abc#hotmail.com';
const userAPassword = '123';
const userBEmail = 'def#hotmail.com';
const userBPassword = '456';
but then you're talking about looping. With such a data structure, it will be difficult to loop these two users. I recommend putting it into an object like so:
const users = {
a: {
email: 'abc#hotmail.com',
password: '123',
},
b: {
email: 'def#hotmail.com',
password: '456',
},
};
then you can easily loop over the users, for example with for .. in:
for (const user in users) {
console.log(users[user]);
}
or with .forEach():
Object.values(users).forEach(user => {
console.log(user);
});
need to make the array for each user, declared as uniquePropertyLinks in the below, accessible outside of the function userProcess.
Then declare the array outside of the function:
let uniquePropertyLinks = [];
async function userProcess() {
// you can access uniquePropertyLinks here
}
// and you can access uniquePropertyLinks here as well
How can I produce a new array for each user? How can I access the array outside the function?
Again, it'd be better to choose a different data structure, let's say an object whose keys represent the users and whose values are arrays. It'd look like so:
let uniquePropertyLinks = {};
uniquePropertyLinks.a = [];
uniquePropertyLinks.b = [];
which looks like this:
{ a: [], b: [] }
so you can save whatever values for user a into uniquePropertyLinks.a array and whatever values you need into uniquePropertyLinks.b array:
uniquePropertyLinks.a.push('new_value_for_a_user');
similarly for user b.
Now you should have all the bits you need in order to go back to your code and make the necessary changes.
For those looking for the results of pavelsaman's advice below is the updated code:
const puppeteer = require('puppeteer');
//Object storing user credentials
let userAEmail = 'abc';
let userAPassword = '123';
let userBEmail = 'def';
let userBPassword = '456';
const users = {
userA: {
email: userAEmail,
password: userAPassword,
},
userB: {
email: userBEmail,
password: userBPassword,
},
};
//Object storing users saved lists as arrays
const usersPropertyLinks = {};
usersPropertyLinks.userA = [];
usersPropertyLinks.userB = [];
//Function to retrieve users saved list of properties
/**
 * Logs in as each user in turn, collects that user's saved property links,
 * and logs the user out again, reusing one browser page for all users.
 *
 * @param {Object<string, {email: string, password: string}>} users
 *   Credentials, keyed by a user name.
 * @param {Object<string, string[]>} usersPropertyLinks
 *   Filled in place: each user's de-duplicated links are appended to the array
 *   stored under the same key as in `users` (created when missing).
 * @returns {Promise<void>} Scraping errors are caught and logged, not rethrown.
 */
async function retrieveUserSavedList(users, usersPropertyLinks) {
  let browser;
  try {
    //Load browser
    browser = await puppeteer.launch({ headless : true });
    const page = await browser.newPage();
    await page.setUserAgent('BLAHHBLAHHBLAHH');
    for (const [userKey, { email, password }] of Object.entries(users)) {
      //Go to saved list
      await page.goto('https://www.website.co.uk/user/shortlist.html', {waitUntil: 'networkidle2'});
      await page.waitForSelector('.mrm-button');
      //User log in
      await page.type('input[name=email]', email, {delay: 10});
      await page.type('input[name=password]', password, {delay: 10});
      // Begin waiting for the navigation before clicking, so a fast
      // navigation cannot finish before the wait has started.
      await Promise.all([
        page.waitForNavigation({waitUntil: 'load'}),
        page.click('.mrm-button', {delay: 10}),
      ]);
      console.log('Success: ' + email + ' logged in');
      //Collecting saved property links and de-duping into an array
      const propertyLinks = await page.evaluate(() => Array.from(document.querySelectorAll('.sc-jbKcbu'), e => e.href));
      //Add saved property links to the array stored under this user's key
      if (!Array.isArray(usersPropertyLinks[userKey])) {
        usersPropertyLinks[userKey] = [];
      }
      usersPropertyLinks[userKey].push(...new Set(propertyLinks));
      //Sign out
      await Promise.all([
        page.waitForNavigation({waitUntil: 'load'}),
        page.click('.sc-kAzzGY', {delay: 10}),
      ]);
      console.log('Success: ' + email + ' logged out');
    }
  } catch (err) {
    console.log('Error retrieve user saved list - ', err.message);
  } finally {
    // Close the browser even when a step above failed, so no Chromium
    // process is left running.
    if (browser) {
      await browser.close();
    }
  }
}
//Run the code
retrieveUserSavedList(users, usersPropertyLinks);

Exporting result from async function in separate js file, importing result in another javascript

Trying to build a small scraper. To reuse functionality I thought 'Page Object Models' would come in handy.
In main.js I require multiple small scripts, in the example below there is only one model (GooglePage).
The scripts work. But I would like to know how to pass a value from the google.js script back to the main script.
I want to use the value of the 'pageCountClean' variable in the main.js script to use in the rest of the application.
Have been searching for information about passing values and functions between scripts. For exporting values from pageconstructors, for promise await export function.
But I am lost. Do I have to use Promises?, is the current way of require/importing and exporting enough to create the relationship between the scripts?
Any pointers are welcome.
//////////// main.js
const { chromium } = require('playwright');
const { GooglePage } = require('./models/Google');
// Entry point: steps through what the log messages call the cookie consent on
// google.nl using the keyboard, opens the results for site:nu.nl, and runs the
// GooglePage scraper on that page.
(async () => {
  const browser = await chromium.launch({ headless: true, slowMo: 250 });
  const context = await browser.newContext();
  // The URL must be a quoted string; left unquoted it is a syntax error.
  const GoogleUrl80 = 'https://www.google.nl/search?q=site%3Anu.nl';
  // Cookie consent:
  console.log('Cookie consent - start');
  const page80 = await browser.newPage();
  await page80.goto('https://google.nl');
  await page80.waitForTimeout(1000);
  await page80.keyboard.press('Tab');
  await page80.keyboard.press('Tab');
  await page80.keyboard.press('Enter');
  console.log('Cookie Consent - done');
  // Number of urls in google.nl (using google.js)
  await page80.goto(GoogleUrl80, {waitUntil: 'networkidle'});
  const googlePage80 = new GooglePage(page80);
  await googlePage80.scrapeGoogle();
  // Want to console.log 'pageCountClean' here.
  await browser.close()
})()
//////////// Google.js
/** Page object wrapping a page that shows Google search results. */
class GooglePage {
  /**
   * @param {object} page - Page instance whose `$eval` is used for scraping.
   */
  constructor(page) {
    this.page = page;
  }

  /**
   * Reads the text of the result-stats element and logs its second
   * space-separated word.
   *
   * @returns {Promise<void>} The value is only logged, not returned.
   */
  async scrapeGoogle() {
    // XPath attribute tests use `@`; `[#id=...]` is not valid XPath.
    const GoogleXpath = '//div[@id="result-stats"]';
    const pageCount = await this.page.$eval(GoogleXpath, (el) => el.innerText);
    const pageCountClean = pageCount.split(" ")[1];
    console.log(pageCountClean);
  }
}
module.exports = { GooglePage };
You can just return pageCountClean from your async function and await it in your main.js file:
in Google.js:
async scrapeGoogle() {
const GoogleXpath = '//div[#id="result-stats"]';
const pageCount = await this.page.$eval(GoogleXpath, (el) => el.innerText);
const pageCountClean = pageCount.split(" ")[1];
console.log(pageCountClean);
return pageCountClean;
}
in main.js:
const googlePage80 = new GooglePage(page80);
const result = await googlePage80.scrapeGoogle();
console.log(result);

Scraper (puppeteer) doesn't map over my array - JavaScript / React

I have written a web scraper with puppeteer. It screens jobs from a job portal. I can screen title, position and image.
The created array from my scraper looks like this:
[{
"id": "2018-12-03T14:12:03Z",
"position": "Frontend Entwickler React (w/m)",
"company": "Muster AG",
"image": "https://www.stepstone.de/upload_de/logo/blabla.gif",
"date": "2018-12-03T14:12:03Z",
"href": "https://www.stepstone.de/stellenangebote--Frontend-Entwickler"
}]
Here is the code of my scraper.js:
const fs = require('fs')
const path = require('path')
const puppeteer = require('puppeteer')
;(async () => {
const browser = await puppeteer.launch()
const page = await browser.newPage()
await page.goto(
'https://www.stepstone.de/5/ergebnisliste.html?stf=freeText&ns=1&qs=%5B%7B%22id%22%3A%22231794%22%2C%22description%22%3A%22Frontend-Entwickler%2Fin%22%2C%22type%22%3A%22jd%22%7D%2C%7B%22id%22%3A%22300000115%22%2C%22description%22%3A%22Deutschland%22%2C%22type%22%3A%22geocity%22%7D%5D&companyID=0&cityID=300000115&sourceOfTheSearchField=homepagemex%3Ageneral&searchOrigin=Homepage_top-search&ke=Frontend-Entwickler%2Fin&ws=Deutschland&ra=30'
)
const stepstone = await page.evaluate(() => {
return Array.from(document.querySelectorAll('.job-element'), card => {
const id = card.querySelector('time').getAttribute('datetime')
const href = card
.querySelector('.job-element__body > a')
.getAttribute('href')
const position = card
.querySelector('.job-element__body__title')
.textContent.trim()
.replace(/^(.{45}[^\s]*).*/, '$1')
const company = card
.querySelector('.job-element__body__company')
.textContent.trim()
.replace(/^(.{20}[^\s]*).*/, '$1')
const image_element = card.querySelector('.job-element__logo img')
const image = image_element.dataset.src
? `https://www.stepstone.de${image_element.dataset.src}`
: image_element.src
const date = card.querySelector('time').getAttribute('datetime')
return {
id,
position,
company,
image,
date,
href
}
})
})
fs.writeFile(
path.join(__dirname, 'src/stepstone.json'),
JSON.stringify(stepstone),
err => {
if (err) {
console.error(err)
} else {
console.log('Great, it worked!')
}
}
)
await browser.close()
})()
My Approach: After scraping the title, position, etc. I also want to include the job details. So I told my scraper to go to the href link of each job item in the array where this information is stored.
And from that link grab the job details classes just like above. So I tried to map over the above array and tell the scraper to grab the items from each href link, like this:
// For every scraped job, opens its href in a new tab and builds a copy of the
// job object with an extra `details` field.
// NOTE(review): the array of promises produced by map() is neither assigned
// nor awaited in this snippet, so the objects returned below are discarded
// here and `stepstone` itself is left unchanged.
stepstone.map(async stone => {
const page = await browser.newPage()
await page.goto(stone.href)
const details = await page.evaluate(() => {
// NOTE(review): 'card__body' has no leading '.', so it is matched as a tag
// name rather than a class. This also returns a DOM element, not its text --
// confirm how page.evaluate serializes that back to Node.
return document.querySelector('card__body')
})
return {
...stone,
details
}
})
My Problem:
However, the JSON file does not update with the "details" key (which shall hold information from 'card__body').
Any suggestions?
Thx!

Categories

Resources