How to parse images posted by user in google reviews? - javascript

I am working on a project to scrape Google Maps reviews, but I got stuck when trying to parse the images posted by users. I tried this method, which only gives me the first image posted by each user:
// Stores one value per review element in the outer `images` array, keyed by
// the review's index.
// NOTE: .attr("style") is called on the whole matched set, which only reads
// the attribute of the first matched element, so each review yields a single
// style string even when several image elements match.
$('.gws-localreviews__google-review').each((i,el) => {
images[i] = $(el)
.find(".EDblX .JrO5Xe").attr("style")
})
I am scraping google reviews by this URL: https://www.google.com/async/reviewDialog?hl=en&async=feature_id:0x47e66e2964e34e2d:0x8ddca9ee380ef7e0,next_page_token:,sort_by:,start_index:,associated_topic:,_fmt:pc
Here is my response:
[
'background-image:url(https://lh5.googleusercontent.com/p/AF1QipPgClEw3JwTLJOuf-DqC2xtZRodoavkpYVFBYqu=w100-h100-p-n-k-no)',
'background-image:url(https://lh5.googleusercontent.com/p/AF1QipOs9TSNoyYmW1GL4SH9PlkAihvWsUbMTn-8O2Sj=w100-h100-p-n-k-no)',
'background-image:url(https://lh5.googleusercontent.com/p/AF1QipMBRGdJb3zL1rME20osajG-bosdIV8U82VTYS1n=w100-h100-p-n-k-no)',
'background-image:url(https://lh5.googleusercontent.com/p/AF1QipOBuGDXFDhJP69LNo6yI9cZWcjSVHpVfPBNoKyL=w100-h100-p-n-k-no)',
'background-image:url(https://lh5.googleusercontent.com/p/AF1QipP7wBZt8Kilm8VF75T8amjMrZ7ZkOpmtb0nHChF=w100-h100-p-n-k-no)',
'background-image:url(https://lh5.googleusercontent.com/p/AF1QipOixJabuSd4mSHnveU5JSQ1ZszHJ6Hn-pkeosiY=w100-h100-p-n-k-no)',
'background-image:url(https://lh5.googleusercontent.com/p/AF1QipPyOXO1vnyTXVnlkPJNLlnoHYHEna36vYnrqwE=w100-h100-p-n-k-no)',
'background-image:url(https://lh5.googleusercontent.com/p/AF1QipPReBboes7S7lNklRT21pwn096JUQVJbTX3VRRA=w100-h100-p-n-k-no)',
'background-image:url(https://lh5.googleusercontent.com/p/AF1QipNPLvARJu1vDk03r_y4fp8f7aDDvzRX-7yJklW8=w100-h100-p-n-k-no)',
'background-image:url(https://lh5.googleusercontent.com/p/AF1QipMW3jp20hjKwuvhogH9ZC8IeH8QhQTUESH_ycNX=w100-h100-p-n-k-no)'
]
But what I want is one object containing the first user's images, then a second object containing the second user's images, and so on.
I want the results like this:
[
{
"All image's links posted by user 1"
},
{
"All images links posted by user 2"
}
{
"All images links posted by user 3"
},
{
"All images links posted by user 4"
},
{
........
}
]

So I found the answer, it was a bit tricky but yeah it will be solved like this :
// For every review, collect the URLs of all images the reviewer attached and
// store them as an array in the outer `images` array at the review's index.
$(".gws-localreviews__google-review").each((i, el) => {
  images[i] = $(el)
    .find(".EDblX .JrO5Xe")
    .toArray()
    // Wrap each raw element explicitly; handing `$` straight to map() would
    // also pass it the index and the array as extra arguments.
    .map((node) => $(node).attr("style"))
    // The style reads "background-image:url(<link>)": keep only the text
    // between that 21-character prefix and the last ")".
    .map((style) => style.substring(21, style.lastIndexOf(")")));
});
It will return the response like this:
[
[
All images posted by user 1,
],
[
All images posted by user 2,
],
[.........
]

I don't see your full code, but as an alternative, to scrape all Google Maps Users Reviews images, you can use some browser automation, e.g. Puppeteer. In the code below I show you how you can do this (also check it on the online IDE):
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
const placeUrl =
"https://www.google.com/maps/place/Starbucks/data=!4m7!3m6!1s0x549069a98254bd17:0xb2f64f75b3edf4c3!8m2!3d47.5319688!4d-122.1942498!16s%2Fg%2F1tdfmzpb!19sChIJF71UgqlpkFQRw_Tts3VP9rI?authuser=0&hl=en&rclk=1";
// Keeps scrolling the element matched by `scrollContainer` to its bottom
// until its scrollHeight stops changing between two consecutive checks.
// `page` must provide evaluate() and waitForTimeout().
async function scrollPage(page, scrollContainer) {
  const heightExpression = `document.querySelector("${scrollContainer}").scrollHeight`;
  const scrollExpression = `document.querySelector("${scrollContainer}").scrollTo(0, document.querySelector("${scrollContainer}").scrollHeight)`;

  let previousHeight = await page.evaluate(heightExpression);
  for (;;) {
    await page.evaluate(scrollExpression);
    // Pause before measuring again so newly requested content can be added.
    await page.waitForTimeout(2000);
    const currentHeight = await page.evaluate(heightExpression);
    if (currentHeight === previousHeight) {
      return;
    }
    previousHeight = currentHeight;
  }
}
// Runs inside the page: for every ".jftiEf" review element, builds an object
// with a single key "All image's links posted by user N" (N is the 1-based
// position of the review). The value is the list of background-image URLs of
// the review's ".KtCyie button" elements, or undefined when it has none.
async function getReviewsFromPage(page) {
  return page.evaluate(() => {
    const reviewCards = Array.from(document.querySelectorAll(".jftiEf"));
    return reviewCards.map((card, index) => {
      const photoButtons = Array.from(card.querySelectorAll(".KtCyie button"));
      let links;
      if (photoButtons.length) {
        // backgroundImage has the form url("<link>"); slice(5, -2) drops the
        // leading `url("` and the trailing `")`.
        links = photoButtons.map((button) => getComputedStyle(button).backgroundImage.slice(5, -2));
      }
      return { [`All image's links posted by user ${index + 1}`]: links };
    });
  });
}
// Opens `placeUrl` in a fresh browser, opens the reviews panel, scrolls it
// until no more reviews load and resolves with the per-review image objects
// that are not empty. The browser is closed even when a step fails.
async function getLocalPlaceReviews() {
  const browser = await puppeteer.launch({
    headless: false,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });
  try {
    const page = await browser.newPage();
    await page.setDefaultNavigationTimeout(60000);
    await page.goto(placeUrl);
    await page.click(".HHrUdb");
    await page.waitForTimeout(2000);
    await page.waitForSelector(".jftiEf");
    await scrollPage(page, ".DxyBCb");
    const rawReviews = await getReviewsFromPage(page);
    // Keep only objects that still have a key. NOTE(review): presumably the
    // entries whose value was undefined arrive here as {} because evaluate()
    // results are serialized - confirm against the Puppeteer version in use.
    return rawReviews.filter((el) => Object.keys(el).length);
  } finally {
    // Always release the browser process, also on navigation/selector errors.
    await browser.close();
  }
}
getLocalPlaceReviews().then(console.log);
Output
[
{
"All image's links posted by user 3":[
"https://lh5.googleusercontent.com/p/AF1QipNw_1HUpO-tbdPg0wS_hlMGnzhRFrDt8-CGE7yf=w600-h450-p-k-no"
]
},
{
"All image's links posted by user 5":[
"https://lh5.googleusercontent.com/p/AF1QipNIUP-aOWRElmfVOjnf5lJJYFiLKBaSx7MSkhg8=w300-h450-p-k-no",
"https://lh5.googleusercontent.com/p/AF1QipPcTFJIW9JAZxZ0PU0WC2U5rPnESv7OnrnSANwV=w300-h225-p-k-no",
"https://lh5.googleusercontent.com/p/AF1QipN_LkT7MCwx-oaf1yXkMnc_D-gm6HrWa7Kqoep8=w300-h225-p-k-no"
]
},
{
"All image's links posted by user 6":[
"https://lh5.googleusercontent.com/p/AF1QipPrI2xvgjFNh2vxFmBxRJBYvw553mORZdRZYwdZ=w300-h450-p-k-no",
"https://lh5.googleusercontent.com/p/AF1QipPVZ4YJqXjLvL-XTFBpB0oo4lVaBdrAGv2Ohyux=w300-h450-p-k-no"
]
},
{
"All image's links posted by user 9":[
"https://lh5.googleusercontent.com/p/AF1QipNTPaghKZbvv5aouQjzCq7no46UiiCa8IbsNmCZ=w600-h450-p-k-no"
]
}
...and other reviews
]
You can read more about scraping Google Maps Users Reviews from my blog post Web Scraping Google Maps Reviews with Nodejs.

Related

How to select specific button in puppeteer

So I'm building a program that scrapes Poshmark webpages and extracts the usernames of each seller on the page!
I want it to go through every page using the 'next' button, but theres 6 buttons all with the same class name...
Heres the link: https://poshmark.com/category/Men-Jackets_&_Coats?sort_by=like_count&all_size=true&my_size=false
(In my google chrome this page has an infinite scroll (hence the scrollToBottom async function i started writing) but i realized inside puppeteer's chrome it has 'next page' buttons.)
The window displays page 1-5 and then the 'next page' button.
The problem is that all of the buttons share the same html class name, so I'm confused on how to differentiate.
const e = require('express');
const puppeteer = require('puppeteer');
const url = "https://poshmark.com/category/Men-Jackets_&_Coats?sort_by=like_count&all_size=true&my_size=false";
let usernames = [];
// Reads the text content of every '.tc--g.m--l--1.ellipses' element on the page.
// NOTE(review): the collected array is kept in the local `itemArea` but never
// returned, so a caller awaiting initItemArea() receives undefined.
const initItemArea = async (page) => {
const itemArea = await page.evaluate(() => {
return Array.from(document.querySelectorAll('.tc--g.m--l--1.ellipses')).map(x => x.textContent);
});
}
// Appends every entry of `itemArea` to the module-level `usernames` array.
// The `page` parameter is accepted but not used.
const pushToArray = async (itemArea, page) => {
itemArea.forEach(function (element) {
//console.log('username: ', $(element).text());
usernames.push(element);
});
};
// Repeatedly scrolls the window to the bottom, waits until the document has
// grown, pauses one second and writes a screenshot to "ss.png".
// NOTE(review): the loop has no exit condition; it only ends when one of the
// awaited calls rejects (presumably waitForFunction timing out once the page
// stops growing).
// NOTE(review): `previousHeight` is assigned without being declared in this
// function, and the `itemArea` parameter is unused.
const scrollToBottom = async (itemArea, page) => {
while (true) {
previousHeight = await page.evaluate('document.body.scrollHeight');
await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`);
await new Promise((resolve) => setTimeout(resolve, 1000));
await page.screenshot({path : "ss.png"})
}
};
// Waits for a pagination button to exist and then tries to click it.
// NOTE(review): the evaluate callback has no `return`, so `nextButton` is
// undefined; page.click() is then called with that value instead of a
// selector string.
// NOTE(review): querySelector only ever matches the first
// ".button.btn.btn--pagination" element in the document.
const gotoNextPage = async (page) => {
await page.waitForSelector(".button.btn.btn--pagination");
const nextButton = await page.evaluate((page) => {
document.querySelector(".button.btn.btn--pagination")
});
await page.click(nextButton);
console.log('Next Page Loading')
};
// Launches the locally installed Chrome, opens `url`, waits for the seller
// name elements and reads their text, then asks for the next page.
// NOTE(review): gotoNextPage(page) is not awaited, so a rejection from it is
// not handled here; `itemArea` is read but not used afterwards, and the
// browser is never closed.
async function main() {
const client = await puppeteer.launch({
headless: false,
executablePath: "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
});
const page = await client.newPage();
await page.goto(url);
await page.waitForSelector(".tc--g.m--l--1.ellipses");
const itemArea = await page.evaluate(() => {
return Array.from(document.querySelectorAll('.tc--g.m--l--1.ellipses')).map(x => x.textContent);
});
gotoNextPage(page)
};
main();
Currently, my gotoNextPage function doesnt even find the button, so i thought i'd entered the selector wrong...
Then when I went to find the selector, I realized all buttons have the same one anyway...
My html knowledge is basically nonexistent, but I want to finish this project out. All help is very appreciated.
Bonus: my initItemArea function doesn't work when I call it as a function like that, so I hardcoded it into main()...
I'll be diving deep into this problem later on, as I've seen it before, but any quick answers / direction would be awesome.
Thanks a lot.
you can try selecting the buttons using their position in the page.
For example, you can select the first button using the following CSS selector:
.button.btn.btn--pagination:nth-child(1)
to select the second button:
.button.btn.btn--pagination:nth-child(2)
Got the idea? :)
you can refactor your gotoNextPage function to use this approach, consider this example:
// Clicks the pagination button found at the given position.
//   page        - Puppeteer page (needs waitForSelector and click)
//   buttonIndex - 1-based position used in the :nth-child() selector
const gotoNextPage = async (page, buttonIndex) => {
  // page.click() takes a selector string. A DOM node returned from
  // page.evaluate() cannot be passed to it, so build the selector here and
  // let Puppeteer resolve it.
  const selector = `.button.btn.btn--pagination:nth-child(${buttonIndex})`;
  await page.waitForSelector(selector);
  // Click on the button
  await page.click(selector);
  console.log("Next Page Loading");
};
Whenever you're messing with buttons and scroll, it's a good idea to think about where the data is coming from. It's usually being delivered to the front-end via a JSON API, so you might as well try to hit that API directly rather than mess with the DOM.
// Builds the JSON endpoint URL for one batch of listings; `maxId` is
// interpolated into the request's "max_id" field.
const url = maxId => `https://poshmark.com/vm-rest/channel_groups/category/channels/category/collections/post?request={%22filters%22:{%22department%22:%22Men%22,%22category_v2%22:%22Jackets_%26_Coats%22,%22inventory_status%22:[%22available%22]},%22sort_by%22:%22like_count%22,%22facets%22:[%22color%22,%22brand%22,%22size%22],%22experience%22:%22all%22,%22sizeSystem%22:%22us%22,%22max_id%22:%22${maxId}%22,%22count%22:%2248%22}&summarize=true&pm_version=226.1.0`;

// Requests batches 1..4 one after another, gathers each item's
// creator_username and prints the first ten names plus the total count.
(async () => {
  const usernames = [];
  let maxId = 1;
  while (maxId < 5 /* for testing */) {
    const response = await fetch(url(maxId)); // Node 18 or install node-fetch
    if (!response.ok) {
      throw Error(response.statusText);
    }
    const payload = await response.json();
    // The API reports an error in the payload; stop paging when it does.
    if (payload.error) {
      break;
    }
    for (const entry of payload.data) {
      usernames.push(entry.creator_username);
    }
    maxId += 1;
  }
  console.log(usernames.slice(0, 10));
  console.log("usernames.length", usernames.length);
})()
  .catch((err) => console.error(err));
The response blob has a ton of additional data.
I would add a significant delay between requests if I were to use code like this to avoid rate limiting/blocking.
If you're set on Puppeteer, something like this should work as well, although it's slower and I didn't have time to run to the end of the 5k (or more?) users:
const puppeteer = require("puppeteer"); // ^19.1.0
const url = "Your URL";
let browser;
// Scrapes the seller names page by page: read the names currently in the DOM,
// remove those elements, click the "Next" pagination button and wait for a
// fresh batch of name elements to appear.
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.goto(url, {waitUntil: "domcontentloaded"});
const usernames = [];
const sel = ".tc--g.m--l--1.ellipses";
for (;;) {
try {
// After the previous batch was removed, this resolves only once new name
// elements have been rendered.
await page.waitForSelector(sel);
const users = await page.$$eval(sel, els => {
const text = els.map(e => e.textContent);
// Remove the scraped elements so the next waitForSelector cannot match them.
els.forEach(el => el.remove());
return text;
});
console.log(users); // optional for debugging
usernames.push(...users);
// find() yields undefined when no button contains "Next"; the resulting
// error lands in the catch below.
await page.$$eval(
".btn--pagination",
els => els.find(el => el.textContent.includes("Next")).click()
);
}
catch (err) {
// Any failure (no "Next" button, selector wait rejecting) ends the paging
// loop; the error itself is intentionally not reported.
break;
}
}
console.log(usernames);
console.log(usernames.length);
})()
.catch(err => console.error(err))
// Close the browser whether or not the run succeeded.
.finally(() => browser?.close());
I don't think navigations are triggered by the "Next" button, so my strategy for detecting when a page transition has occurred involves destroying the current set of elements after scraping the usernames, then waiting until the next batch shows up. This may seem inelegant, but it's easy to implement and seems reliable, not making assumptions about the usernames themselves.
It's also possible to use Puppeteer and make or intercept API requests, armed with a fresh cookie. This is sort of halfway between the two extremes shown above. For example:
const puppeteer = require("puppeteer");
const url = "Your URL";
let browser;
// Opens the listing page first and then calls the JSON API from inside that
// page, collecting each item's creator_username for batches 1..4.
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  await page.goto(url, {waitUntil: "domcontentloaded"});
  const usernames = await page.evaluate(async () => {
    const url = maxId => `https://poshmark.com/vm-rest/channel_groups/category/channels/category/collections/post?request={%22filters%22:{%22department%22:%22Men%22,%22category_v2%22:%22Jackets_%26_Coats%22,%22inventory_status%22:[%22available%22]},%22sort_by%22:%22like_count%22,%22facets%22:[%22color%22,%22brand%22,%22size%22],%22experience%22:%22all%22,%22sizeSystem%22:%22us%22,%22max_id%22:%22${maxId}%22,%22count%22:%2248%22}&summarize=true&pm_version=226.1.0`;
    const usernames = [];
    try {
      for (let maxId = 1; maxId < 5 /* for testing */; maxId++) {
        // This callback runs in the browser, so this is the page's own fetch.
        const response = await fetch(url(maxId));
        if (!response.ok) {
          // Throwing already leaves the loop; the catch below logs the error
          // and the names gathered so far are still returned.
          throw Error(response.statusText);
        }
        const json = await response.json();
        if (json.error) {
          break;
        }
        usernames.push(...json.data.map(e => e.creator_username));
      }
    }
    catch (err) {
      // Logged to the browser console, not to the Node process.
      console.error(err);
    }
    return usernames;
  });
  console.log(usernames);
  console.log("usernames.length", usernames.length);
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
The above code limits to 4 requests to keep it simple and easy to validate.
Blocking images and other unnecessary resources can help speed the Puppeteer versions up, left as an exercise (or just use the direct fetch version shown at top).

Why will puppeteer not click on the video

I am currently writing a simple program that grabs the name of a song from my Discord bot, finds the video and passes it to a function to convert it to mp3. My problem is that Puppeteer doesn't click on the video and instead just returns the search page link.
Here is my code to grab the link and pass it through download:
// Strips the play command (`commands.play`, defined outside this function)
// from the message, types the remainder into the YouTube search box, presses
// Enter, clicks 'yt-interaction#extended' and logs the page URL.
// NOTE(review): page.keyboard.press('Enter') is not awaited and nothing waits
// for the results to load, so the click and page.url() run immediately after
// the key press is issued.
async function findSongName(stringWithName){
let stringName = stringWithName.replace(commands.play, '')
const link = 'https://www.youtube.com';
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(link)
await page.type('ytd-searchbox#search.style-scope.ytd-masthead', stringName);
page.keyboard.press('Enter');
await page.click('yt-interaction#extended');
console.log(page.url())
await browser.close()
}
It sounds like you want to get the title and URL of the top result for a YT search. For starters, you don't need to start at the YT homepage. Just navigate to https://www.youtube.com/results?search_query=${yourQuery} to speed things up and reduce complexity.
Next, if you view the page source of /results, there's a large (~1 MB) global data structure called ytInitialData that has all of the relevant results in it (along with a lot of other irrelevant stuff, admittedly). Theoretically, you could grab the page with Axios, parse out ytInitialData with Cheerio, grab your data using plain array/object JS and skip Puppeteer entirely.
Of course, using the YT search API is the most reliable and proper way.
Since you're using Puppeteer, though, the data can be pulled out of the "#items a#video-title" elements as follows:
const puppeteer = require("puppeteer");
// Opens the YouTube results page for `searchQuery` and resolves with one
// { title, href } object per "a#video-title" element found on it.
const searchYT = async (page, searchQuery) => {
  const resultSelector = "a#video-title";
  await page.goto(`https://www.youtube.com/results?search_query=${encodeURIComponent(searchQuery)}`);
  await page.waitForSelector(resultSelector);
  // The callback runs in the page, so it must not reference outer variables.
  return page.$$eval(resultSelector, (anchors) => {
    const results = [];
    for (const anchor of anchors) {
      results.push({ title: anchor.textContent.trim(), href: anchor.href });
    }
    return results;
  });
};
// Kept outside the async function so the final handler can close the browser
// even when the run fails.
let browser;
(async () => {
  browser = await puppeteer.launch({headless: true});
  const openPages = await browser.pages();
  const page = openPages[0];
  await page.setRequestInterception(true);
  // Abort image requests and let every other request through.
  page.on("request", (request) => {
    if (request.resourceType() === "image") {
      request.abort();
    } else {
      request.continue();
    }
  });
  console.log(await searchYT(page, "stack overflow"));
})()
  .catch((err) => console.error(err))
  .finally(() => browser?.close());
Output (for the search term "stack overflow"):
[
{
title: 'Stack Overflow is full of idiots.',
href: 'https://www.youtube.com/watch?v=I_ZK0t9-llo'
},
{
title: "How To Use Stack Overflow (no, ForrestKnight, it's not full of idiots)",
href: 'https://www.youtube.com/watch?v=sMIslcynm0Q'
},
{
title: 'How to use Stack Overflow as a Beginner ?',
href: 'https://www.youtube.com/watch?v=Vt-Wf7d0CFo'
},
{
title: 'How Microsoft Uses Stack Overflow for Teams',
href: 'https://www.youtube.com/watch?v=mhh0aK6yJgA'
},
// ...
]
Since you only want the first result, then it's here, but if you want more than the initial batch, either work through ytInitialData as described above or scroll the page down with Puppeteer.
Now that you have a video URL that you want to make into an mp3, I'd recommend youtube-dl. There are Node wrappers you can install to access its API easily, such as node-youtube-dl which was the first result when I searched and I've never used before.

How to get links from multiple pages in a single array

I have working code that successfully obtains all product links from multiple pages that have at least a 20% discount. The only problem is that it returns the links in a separate array for each page. However, I would like it to return the links for all pages in a single array and then pass them to another function. I tried to create an array, var all_links = [], push all the links from each page into it and then return them with return all_links, as I know from a simpler example. However, I have not been successful in this case because I have no experience with coding; I started learning the basics three weeks ago. I would be very grateful if you could help me with the whole code, as I don't have the necessary prior knowledge.
const puppeteer = require('puppeteer')
const minDiscount = 20;
// Walks through the category pages via the "next" pager link and, for every
// page, logs an array with the links of products whose sale flag shows a
// discount of at least `minDiscount` percent.
// NOTE(review): each page's array is only logged; nothing is accumulated or
// returned, and the browser is never closed.
// NOTE(review): the pager link is clicked before the products are read, so
// the products of the initially loaded page are not collected.
async function getLinks() {
const browser = await puppeteer.launch({
headless: false,
defaultViewport: null,
});
const page = await browser.newPage();
const url = 'https://www.mytoys.de/spielzeug-spiele/holz/';
await page.goto(url);
// keep going as long as the page still has a "next" pager link
while(await page.$(".pager__link--next")){
await page.waitForSelector(".pager__link--next")
await page.waitForTimeout(1000);
await page.click('.pager__link--next')
await page.waitForTimeout(1500);
// getting all the products, this will return an array of ElementHandle
const products = await page.$$('.prod-grid.js-prod-grid .prod-grid__item.js-prod-grid_item');
const proms = await Promise.allSettled(
products.map(async (prod) => {
// searching for a discount on each product
const disc = await prod.$$eval(
'.prod-grid.js-prod-grid .prod-flag.prod-flag-sale',
(discount) =>
discount.map((discItem) =>
discItem.innerText.replace(/[^0-9.]/g, '').replace(/\D+/g,'0')
)
);
// if it has a discount
if (disc.length > 0) {
// we parse the discount to Integer type to compare it to minDiscount
const discountInt = parseInt(disc[0], 10);
if (discountInt >= minDiscount) {
// we get the link of the product
const link = await prod.$$eval('.prod-grid.js-prod-grid .prod-tile__link.js-prodlink', (allAs) => allAs.map((a) => a.href));
if (link.length > 0) {
// return the link of the product
return link[0];
}
}
}
return null;
})
);
// rejected entries map to undefined here, which the null filter below keeps
const bulkArray = proms.map((item) => {
if (item.status === 'fulfilled') return item.value;
});
const endArray = bulkArray.filter(item => item !== null);
console.log(endArray);
}
}
getLinks();
An example of the result I am currently obtaining
[
'https://www.mytoys.de/erzi-kinderwurst-sortiment-spiellebensmittel-6749036.html',
'https://www.mytoys.de/chr-tanner-spiellebensmittel-wurststaender-1031946.html',
'https://www.mytoys.de/hape-xylophon-und-hammerspiel-2503719.html',
'https://www.mytoys.de/erzi-kinderparty-spiellebensmittel-6749035.html',
]
[
'https://www.mytoys.de/brio-holzeisenbahnset-landleben-5501952.html',
'https://www.mytoys.de/brio-brio-33277-bahn-ir-reisezug-set-4592516.html',
'https://www.mytoys.de/brio-parkhaus-strassen-schienen-3175226.html',
'https://www.mytoys.de/mytoys-steckwuerfel-12-tlg-11389814.html',
'https://www.mytoys.de/brio-schienen-und-weichensortiment-1758325.html',
]
[
'https://www.mytoys.de/hape-grosser-baukran-4141517.html',
'https://www.mytoys.de/noris-mein-buntes-tuermchenspiel-3421170.html',
'https://www.mytoys.de/goki-ziehtier-schaf-suse-2488933.html',
'https://www.mytoys.de/eichhorn-colorsoundzug-mit-licht-1521635.html',
]
An example of the result I would like to obtain
[
'https://www.mytoys.de/erzi-kinderwurst-sortiment-spiellebensmittel-6749036.html',
'https://www.mytoys.de/chr-tanner-spiellebensmittel-wurststaender-1031946.html',
'https://www.mytoys.de/hape-xylophon-und-hammerspiel-2503719.html',
'https://www.mytoys.de/erzi-kinderparty-spiellebensmittel-6749035.html',
'https://www.mytoys.de/brio-holzeisenbahnset-landleben-5501952.html',
'https://www.mytoys.de/brio-brio-33277-bahn-ir-reisezug-set-4592516.html',
'https://www.mytoys.de/brio-parkhaus-strassen-schienen-3175226.html',
'https://www.mytoys.de/mytoys-steckwuerfel-12-tlg-11389814.html',
'https://www.mytoys.de/brio-schienen-und-weichensortiment-1758325.html',
'https://www.mytoys.de/hape-grosser-baukran-4141517.html',
'https://www.mytoys.de/noris-mein-buntes-tuermchenspiel-3421170.html',
'https://www.mytoys.de/goki-ziehtier-schaf-suse-2488933.html',
'https://www.mytoys.de/eichhorn-colorsoundzug-mit-licht-1521635.html',
]
Declare new variable for links collecting before your loop:
const allLinks = []; // <--
while(await page.$(".pager__link--next")){ ... }
Push all links into it:
...
const endArray = bulkArray.filter(item => item !== null);
console.log(endArray);
allLinks.push(endArray); // <--
Return / log result after loop execution:
async function getLinks() {
...
return allLinks.flat(); // <--
}
console.log(await getLinks()) // result array
Refs: Array.prototype.flat()

Puppeteer & cycling a process through multiple users

I'm trying to scrape information from a webpage behind a login wall for two users. As it stands, I've managed to get the code to do what I want for the first user i.e. go to webpage, login, gather the links associated with properties in a saved list, use that list to gather more details and log them to console.
The challenge I have now is getting the code to loop this round the second user without having to dupe the code. How would you suggest I go about it?
Secondly I need to make the array for each user, declared as uniquePropertyLinks in the below, accessible outside of the function userProcess.
How can I produce a new array for each user?
How can I access the array outside the function?
Here is the code:
const puppeteer = require('puppeteer');
//Code to locate text and enable it to be clicked
// Turns `str` into an XPath concat(...) expression so that text containing
// single quotes can be embedded in a query: every ' becomes its own
// double-quoted argument between the single-quoted pieces.
const escapeXpathString = str => {
  const pieces = str.split("'");
  const joined = pieces.join(`', "'", '`);
  return "concat('" + joined + "', '')";
};
// Clicks the first <a> element whose text contains `text`.
// Throws Error("Link not found: <text>") when the XPath query matches nothing.
const clickByText = async (page, text) => {
  const query = `//a[contains(text(), ${escapeXpathString(text)})]`;
  const matches = await page.$x(query);
  if (matches.length === 0) {
    throw new Error(`Link not found: ${text}`);
  }
  await matches[0].click();
};
//User credentials
const userAEmail = 'abc#hotmail.com';
const userAPassword = '123';
const userBEmail = 'def#hotmail.com';
const userBPassword = '456';
//Logout
// Signs the current user out: opens the site's start page, clicks the link
// whose text contains 'Log out', waits for the resulting navigation to reach
// the load event and logs 'Signed out'.
// Rejects if clickByText cannot find the link.
const LogOut = async (page) => {
await page.goto('https://www.website.com');
await clickByText(page, 'Log out');
await page.waitForNavigation({waitUntil: 'load'});
console.log('Signed out');
};
///////////////////////////
//SCRAPE PROCESS
// Logs user A in on the shortlist page, collects the de-duplicated links of
// the saved properties, logs them and signs out again.
// NOTE(review): `uniquePropertyLinks` is local to the try block, so it is not
// reachable from outside this function.
// NOTE(review): LogOut(page) is not awaited, so a failure inside it is not
// caught by the catch below, and the browser is never closed.
async function userProcess() {
try {
const browser = await puppeteer.launch({ headless : false });
const page = await browser.newPage();
page.setUserAgent('BLAHBLAHBLAH');
//Go to Website saved list
await page.goto('https://www.website.com/shortlist.html', {waitUntil: 'networkidle2'});
console.log('Page loaded');
//User A log in
await page.type('input[name=email]', userAEmail, {delay: 10});
await page.type('input[name=password]', userAPassword, {delay: 10});
await page.click('.mrm-button',{delay: 10});
await page.waitForNavigation({waitUntil: 'load'})
console.log('Signed in');
//Count the '.title' elements of the saved list
const propertyList = await page.$$('.title');
console.log(propertyList.length);
//Collecting links from saved list and de-duping into an array
const propertyLinks = await page.evaluate(() => Array.from(document.querySelectorAll('.sc-jbKcbu'), e => e.href));
let uniquePropertyLinks = [...new Set(propertyLinks)];
console.log(uniquePropertyLinks);
//Sign out
LogOut(page);
} catch (err) {
console.log('Our error - ', err.message);
}
};
userProcess();
Let's see some of the things you might need to complete your task. I think it's better to take time and develop the skills yourself, but I can perhaps point out a few key things.
You use:
const userAEmail = 'abc#hotmail.com';
const userAPassword = '123';
const userBEmail = 'def#hotmail.com';
const userBPassword = '456';
but then you're talking about looping. With such a data structure, it will be difficult to loop these two users. I recommend putting it into an object like so:
const users = {
a: {
email: 'abc#hotmail.com',
password: '123',
},
b: {
email: 'def#hotmail.com',
password: '456',
},
};
then you can easily loop over it with, for example, for...in:
for (const user in users) {
console.log(users[user]);
}
or with .forEach():
Object.values(users).forEach(user => {
console.log(user);
});
need to make the array for each user, declared as uniquePropertyLinks in the below, accessible outside of the function userProcess.
Then declare the array outside of the function:
let uniquePropertyLinks = [];
async function userProcess() {
// you can access uniquePropertyLinks here
}
// and you can access uniquePropertyLinks here as well
How can I produce a new array for each user? How can I access the array outside the function?
Again, it'd be better to choose a different data structure, let's say an object whose keys represent each user and whose values are arrays. It'd look like so:
let uniquePropertyLinks = {};
uniquePropertyLinks.a = [];
uniquePropertyLinks.b = [];
which looks like this:
{ a: [], b: [] }
so you can save whatever values for user a into uniquePropertyLinks.a array and whatever values you need into uniquePropertyLinks.b array:
uniquePropertyLinks.a.push('new_value_for_a_user');
similarly for user b.
Now you should have all the bits you need in order to go back to your code and make the necessary changes.
For those looking for the results of pavelsaman's advice, below is the updated code:
const puppeteer = require('puppeteer');
//Object storing user credentials
let userAEmail = 'abc';
let userAPassword = '123';
let userBEmail = 'def';
let userBPassword = '456';
const users = {
userA: {
email: userAEmail,
password: userAPassword,
},
userB: {
email: userBEmail,
password: userBPassword,
},
};
//Object storing users saved lists as arrays
const usersPropertyLinks = {};
usersPropertyLinks.userA = [];
usersPropertyLinks.userB = [];
//Function to retrieve users saved list of properties
// Logs each user in turn into the shortlist page, collects the de-duplicated
// links of that user's saved properties and logs the user out again.
//   users              - object mapping a user key to { email, password }
//   usersPropertyLinks - object that receives, under the same user key, an
//                        array of links (the array is created when missing)
// Errors are logged, not rethrown; the browser is closed in every case.
async function retrieveUserSavedList(users, usersPropertyLinks) {
  let browser;
  try {
    //Load browser
    browser = await puppeteer.launch({ headless : true });
    const page = await browser.newPage();
    await page.setUserAgent('BLAHHBLAHHBLAHH');
    for (const [userKey, credentials] of Object.entries(users)) {
      //Go to saved list
      await page.goto('https://www.website.co.uk/user/shortlist.html', {waitUntil: 'networkidle2'});
      await page.waitForSelector('.mrm-button');
      //User log in
      await page.type('input[name=email]', credentials.email, {delay: 10});
      await page.type('input[name=password]', credentials.password, {delay: 10});
      await page.click('.mrm-button',{delay: 10});
      await page.waitForNavigation({waitUntil: 'load'})
      console.log('Success: ' + credentials.email + ' logged in');
      //Collecting saved property links
      const propertyLinks = await page.evaluate(() => Array.from(document.querySelectorAll('.sc-jbKcbu'), e => e.href));
      //Store the de-duplicated links under the user's own key, so the
      //function works for any number of users without per-user branches
      if (!Array.isArray(usersPropertyLinks[userKey])) {
        usersPropertyLinks[userKey] = [];
      }
      usersPropertyLinks[userKey].push(...new Set(propertyLinks));
      //Sign out
      await page.click('.sc-kAzzGY',{delay: 10});
      await page.waitForNavigation({waitUntil: 'load'});
      console.log('Success: ' + credentials.email + ' logged out');
    }
  } catch (err) {
    console.log('Error retrieve user saved list - ', err.message);
  } finally {
    //Release the browser even when a step above failed
    if (browser) {
      await browser.close();
    }
  }
}
//Run the code
retrieveUserSavedList(users, usersPropertyLinks);

Scraper (puppeteer) doesn't map over my array - JavaScript / React

I have written a web scraper with puppeteer. It screens jobs from a job portal. I can screen title, position and image.
The created array from my scraper looks like this:
[{
"id": "2018-12-03T14:12:03Z",
"position": "Frontend Entwickler React (w/m)",
"company": "Muster AG",
"image": "https://www.stepstone.de/upload_de/logo/blabla.gif",
"date": "2018-12-03T14:12:03Z",
"href": "https://www.stepstone.de/stellenangebote--Frontend-Entwickler"
}]
Here is the code of my scraper.js:
const fs = require('fs')
const path = require('path')
const puppeteer = require('puppeteer')
// Opens the stepstone result list, turns every '.job-element' card into a
// plain object and writes the resulting array to src/stepstone.json.
;(async () => {
const browser = await puppeteer.launch()
const page = await browser.newPage()
await page.goto(
'https://www.stepstone.de/5/ergebnisliste.html?stf=freeText&ns=1&qs=%5B%7B%22id%22%3A%22231794%22%2C%22description%22%3A%22Frontend-Entwickler%2Fin%22%2C%22type%22%3A%22jd%22%7D%2C%7B%22id%22%3A%22300000115%22%2C%22description%22%3A%22Deutschland%22%2C%22type%22%3A%22geocity%22%7D%5D&companyID=0&cityID=300000115&sourceOfTheSearchField=homepagemex%3Ageneral&searchOrigin=Homepage_top-search&ke=Frontend-Entwickler%2Fin&ws=Deutschland&ra=30'
)
// Runs in the page. The querySelector results are used without null checks,
// so a card missing one of these elements makes evaluate() reject.
const stepstone = await page.evaluate(() => {
return Array.from(document.querySelectorAll('.job-element'), card => {
// `id` and `date` are both read from the <time> element's datetime attribute
const id = card.querySelector('time').getAttribute('datetime')
const href = card
.querySelector('.job-element__body > a')
.getAttribute('href')
// keep the first 45 characters and then the rest of the word in progress
const position = card
.querySelector('.job-element__body__title')
.textContent.trim()
.replace(/^(.{45}[^\s]*).*/, '$1')
// same truncation with a 20 character limit
const company = card
.querySelector('.job-element__body__company')
.textContent.trim()
.replace(/^(.{20}[^\s]*).*/, '$1')
const image_element = card.querySelector('.job-element__logo img')
// use data-src (prefixed with the site origin) when present, otherwise src
const image = image_element.dataset.src
? `https://www.stepstone.de${image_element.dataset.src}`
: image_element.src
const date = card.querySelector('time').getAttribute('datetime')
return {
id,
position,
company,
image,
date,
href
}
})
})
// Callback-style write: it is not awaited, so browser.close() below is called
// without waiting for the file to be written.
fs.writeFile(
path.join(__dirname, 'src/stepstone.json'),
JSON.stringify(stepstone),
err => {
if (err) {
console.error(err)
} else {
console.log('Great, it worked!')
}
}
)
await browser.close()
})()
My Approach: After scraping the title, position, etc. I also want to include the job details. So I told my scraper to go to the href link of each job item in the array where this information is stored.
And from that link grab the job details classes just like above. So I tried to map over the above array and tell the scraper to grab the items from each href link, like this:
// For every scraped job, opens its href in a new tab and tries to read the
// job details, returning a copy of the job object extended with `details`.
// NOTE(review): the array of promises produced by map() is neither awaited
// nor assigned, so the extended objects are never used by anything.
// NOTE(review): 'card__body' has no leading "." or "#", so it is a tag-name
// selector; the callback also returns a DOM element from evaluate() rather
// than a plain value such as its text.
stepstone.map(async stone => {
const page = await browser.newPage()
await page.goto(stone.href)
const details = await page.evaluate(() => {
return document.querySelector('card__body')
})
return {
...stone,
details
}
})
My Problem:
However, the JSON file does not update with the "details" key (which shall hold information from 'card__body').
Any suggestions?
Thx!

Categories

Resources