Getting all elements on the page with javascript - javascript

I wrote a web page crawler that successfully crawls a web page and grabs the data of one job posting using puppeteer. I'm now trying to get all the elements on the page with the class of .opportunity and then pass it into a function that will get the data for that specific job posting. However the list of getElementsByClassName is returning an empty object?
const puppeteer = require('puppeteer');
const fs = require('fs');
/**
 * Collects the fields of one job posting into an `opportunity` record,
 * appends that record to `opportunities`, logs the whole list and closes
 * the browser.
 *
 * NOTE(review): `page`, `opportunities` and `browser` are neither parameters
 * nor locals of this function. In this snippet they are only declared inside
 * the async IIFE that calls crawlOpo, so they are not in scope here.
 *
 * @param {*} opo - one entry of the list built by the caller (presumably a
 *   `.opportunity` element; TODO confirm what actually arrives here).
 */
async function crawlOpo(opo) {
// Record skeleton; every field starts as an empty string.
const opportunity = {
title: '',
desc: '',
category: '',
reqName: '',
hours: '',
postingDate: '',
locationName: '',
address: ''
};
// Every lookup below has the same shape: query one selector under `opo`,
// return its innerText, and return the caught error object instead of a
// string if anything throws.
// NOTE(review): the callbacks read `opo` through their closure; it is not
// passed to page.evaluate as an argument. Presumably the callback runs in
// the page context where `opo` does not exist; confirm against the
// Puppeteer documentation.
const title = await page.evaluate(() => {
try {
return opo.querySelector('.row .col-lg-20 h3 a').innerText
} catch(err) {
return err
}
});
const desc = await page.evaluate(() => {
try {
return opo.querySelector('.hidden-xs.paragraph').innerText
} catch(err) {
return err
}
});
// category, reqName and hours use the same selector path; reqName and hours
// differ only by the :nth-of-type(2) / :nth-of-type(3) step on `.col-md-8`.
const category = await page.evaluate(() => {
try {
return opo.querySelector('.row.paragraph .col-sm-18 .row .col-md-8 .label-with-icon span').innerText
} catch(err) {
return err
}
});
const reqName = await page.evaluate(() => {
try {
return opo.querySelector('.row.paragraph .col-sm-18 .row .col-md-8:nth-of-type(2) .label-with-icon span').innerText
} catch(err) {
return err
}
});
const hours = await page.evaluate(() => {
try {
return opo.querySelector('.row.paragraph .col-sm-18 .row .col-md-8:nth-of-type(3) .label-with-icon span').innerText
} catch(err) {
return err
}
});
const postingDate = await page.evaluate(() => {
try {
return opo.querySelector('.row .col-lg-4 h3 small').innerText
} catch(err) {
return err
}
});
const locationName = await page.evaluate(() => {
try {
return opo.querySelector('.row.paragraph:nth-of-type(2) .col-lg-20 div div candidate-physical-location address span:nth-of-type(2) span').innerText
} catch(err) {
return err
}
});
// Copy the collected values onto the record. `address` is never assigned
// and therefore keeps its initial empty string.
opportunity.title = title;
opportunity.desc = desc;
opportunity.category = category;
opportunity.reqName = reqName;
opportunity.hours = hours;
opportunity.postingDate = postingDate;
opportunity.locationName = locationName;
opportunities.push(opportunity)
console.log(opportunities);
// NOTE(review): the browser is closed at the end of every call, i.e. after
// the first posting when this function is invoked once per posting.
browser.close();
}
// Entry point: launches a browser, opens the job board URL, collects the
// elements with class `opportunity` and hands each one to crawlOpo.
// Anything thrown inside the try block is logged with console.error.
(async () => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
// Declared here, so only code inside this arrow function can see
// `browser`, `page` and `opportunities`.
const opportunities = [];
let url = "https://recruiting2.ultipro.com/PUB1004PSCU/JobBoard/d433f5c3-37c8-4bcf-a3af-248a707c7d31/?q=&o=postedDateDesc"
await page.goto(url, {timeout: 0, waitUntil: 'networkidle0'});
// NOTE(review): the callback returns an HTMLCollection of DOM nodes.
// Presumably that value does not survive the transfer from the page back to
// Node, which would match the "empty object" described in the question;
// confirm against the Puppeteer page.evaluate documentation.
const oportunitiesDOM = await page.evaluate(() => {
return document.getElementsByClassName('opportunity');
});
// NOTE(review): `await` is used inside a callback that is not declared
// `async`, which is a syntax error as written.
oportunitiesDOM.forEach(opo => {
await crawlOpo(opo)
});
} catch (err) {
console.error(err)
}
})()
The logic here is that it runs an async arrow function that will launch a browser -> load the page -> evaluate the page -> grab all the elements with the class .opportunity -> loop over the list and pass each opportunity into the crawlOpo function, which grabs the specific data needed for that opportunity and then adds that object to an array.

In the argument of document.getElementsByClassName('.opportunity'), you have the CSS selector '.opportunity'.
The getElementsByClassName method takes a name of a class as an argument, not a CSS selector.
Most likely it should be corrected to document.getElementsByClassName('opportunity').

Related

Async function, do something after the map function is finished

/**
 * Looks up the account id and the match list for `summoner_name`, then
 * schedules one timer per match that fetches the match and folds its stats
 * into `summoner`.
 *
 * The function itself does not wait for those timers: the returned promise
 * resolves (with undefined) as soon as they have been scheduled.
 * Errors thrown while awaiting the id or the match list are logged.
 *
 * @param {*} summoner_name - value passed to fetchAccountID and stored as the
 *   single element of `summoner.name`.
 */
async function testing(summoner_name) {
try {
// One function-scoped variable shared by every timer callback below.
var match;
let summoner = {
name: [summoner_name],
};
const id = await fetchAccountID(summoner_name);
const matchList = await fetchMatches(id);
// map is used only for iteration here; its callback returns nothing and the
// resulting array is discarded.
Object.keys(matchList.matches).map((key, i) => {
// Delay grows by 100 ms per index, so the callbacks start staggered.
// NOTE: these callbacks run after the surrounding try block has finished,
// so the catch below does not see errors thrown inside them.
setTimeout(async function () {
match = await fetchMatch(matchList.matches[key].gameId);
summoner = await getMatchStats(
match,
matchList.matches[key].champion,
summoner
);
}, i * 100);
});
} catch (error) {
console.log(error);
}
}
I would like to do something after the map function is done iterating over all the keys, how can I achieve that?
Do you mean this?
/**
 * Processes every match of a summoner strictly one after another.
 *
 * Resolves the account id, loads the match list, then for each match key
 * fetches the match and feeds it, together with the champion and the
 * running `summoner` value, into getMatchStats. Each step is awaited before
 * the next one starts. Any thrown error is logged and swallowed.
 *
 * @param {*} summoner_name - passed to fetchAccountID and used as the single
 *   element of the initial `summoner.name` array.
 * @returns {Promise<void>} settles once every match has been processed.
 */
async function testing(summoner_name) {
  try {
    const accountId = await fetchAccountID(summoner_name);
    const { matches } = await fetchMatches(accountId);

    // Accumulator handed to, and replaced by, every getMatchStats call.
    let summoner = { name: [summoner_name] };

    // Sequential on purpose: each iteration awaits before the next begins.
    for (const matchKey of Object.keys(matches)) {
      const matchDetails = await fetchMatch(matches[matchKey].gameId);
      summoner = await getMatchStats(
        matchDetails,
        matches[matchKey].champion,
        summoner
      );
    }
  } catch (error) {
    console.log(error);
  }
}

Creating a continuously increasing list using promise-pool and puppeteer

I need to create scraping tool using puppeteer however I have some issues adding items to the queue
What I got
const PromisePool = require("#supercharge/promise-pool");
const puppeteer = require("puppeteer");
// Start value taken from the first command-line argument; it becomes the
// only element of the work list.
const domain = process.argv[2];
let list = [];
list[0] = domain;
/**
 * Runs the promise pool over `list` with a concurrency of 2. For every item
 * it reads all anchor hrefs from `page` and records the ones that contain
 * `domain`.
 *
 * There is no return statement, so the promise resolves with undefined;
 * the destructured `results` and `errors` are not used.
 *
 * NOTE(review): `page` and `browser` are not declared anywhere in this
 * snippet, and the `webpage` argument of the process callback is never used.
 */
const run = async () => {
const { results, errors } = await PromisePool.for(list)
.withConcurrency(2)
.process(async (webpage) => {
// Assigned without a declaration, i.e. not a local of this callback.
links = [];
// Resolves with one single-element array per anchor: [[href], [href], ...].
const getData = async () => {
return await page.evaluate(async () => {
return await new Promise((resolve) => {
resolve(Array.from(document.querySelectorAll("a")).map((anchor) => [anchor.href]));
});
});
};
links = await getData();
// for...in walks the indices of `links`; String() turns each
// single-element array into its href.
for (var link in links) {
var new_url = String(links[link]);
// Drop the fragment part of the URL.
new_url = new_url.split("#")[0];
console.log("new url: " + new_url);
if (new_url.includes(domain)) {
// `in` checks for a property key on the array, which matches how URLs are
// stored two lines below.
if (new_url in list) {
console.log("Url already exists: " + new_url);
continue;
}
// Stored as a named property on the array, not as a new element, so
// list.length does not change.
// NOTE(review): presumably the pool only iterates array elements and never
// sees these entries; confirm.
list[new_url] = new_url;
} else {
console.log("Url is external: " + new_url);
}
}
browser.close();
});
};
/**
 * Thin wrapper around run(): waits for it and passes its result through.
 *
 * @returns {Promise<*>} whatever run() resolves with.
 */
const mainFunction = async () => {
  return await run();
};
// Script entry point: prints whatever mainFunction resolves with, then the
// current contents of `list`.
(async () => {
console.log(await mainFunction());
console.log(list);
})();
The problem is inside
links = [];
const getData = async () => {
return await page.evaluate(async () => {
return await new Promise((resolve) => {
resolve(Array.from(document.querySelectorAll("a")).map((anchor) => [anchor.href]));
});
});
};
links = await getData();
page.evaluate is async and the script doesn't wait for it to return, so links is never updated for the next PromisePool process.
I need a way to wait for response to return and then continue rest of the script to process.
You could use page.$$eval to retrieve the same links with a single await.
page.$$eval(selector, pageFunction[, ...args])
It is basically what you are trying to achieve as the $$eval method "runs Array.from(document.querySelectorAll(selector)) within the page [context] and passes it as the first argument to pageFunction." (docs)
E.g.:
const links = await page.$$eval('a', anchors => anchors.map(el => el.href));

get the return value from async function without calling it again

Here is the code:
/**
 * Starts a recording through audioRecorderPlayer, registers a record-back
 * listener that mirrors the recorder position into the `audioProp` state,
 * and resolves with the value returned by startRecorder.
 *
 * If anything inside throws, the error is logged and the function resolves
 * with undefined.
 */
const onStartRecord = async() => {
try {
// Target path handed to startRecorder: an .m4a file under the `ios` key and
// an .mp4 file under the `android` key.
// NOTE(review): `filenameGenerator` is interpolated as a value, not called;
// confirm it is a string rather than a function.
const path = Platform.select({
ios: `file:///audio/${filenameGenerator}.m4a`,
android: `file:///audio/${filenameGenerator}.mp4`,
});
// Recorder options passed as the second argument of startRecorder.
const audioSet: AudioSet = {
AudioEncoderAndroid: AudioEncoderAndroidType.AAC,
AudioSourceAndroid: AudioSourceAndroidType.MIC,
AVEncoderAudioQualityKeyIOS: AVEncoderAudioQualityIOSType.high,
AVNumberOfChannelsKeyIOS: 2,
AVFormatIDKeyIOS: AVEncodingOption.aac,
};
console.log('audioSet', audioSet);
const uri = await audioRecorderPlayer.startRecorder(path, audioSet);
// On every record-back event, copy the previous state and overwrite
// recordSecs with the raw position and recordTime with its formatted form.
audioRecorderPlayer.addRecordBackListener((e: any) => {
setAudioProp(audioProp => {
return { ...audioProp,
recordSecs: e.current_position,
recordTime: audioRecorderPlayer.mmssss(Math.floor(e.current_position)),
}
});
});
console.log(`uri: ${uri}`);
return uri
} catch (err) {
console.log(err);
return;
}
};
/**
 * Calls onStartRecord and resolves with whatever it resolves with.
 * Every call therefore triggers onStartRecord again.
 *
 * @returns {Promise<*>} the value produced by onStartRecord.
 */
const audioPath = async () => {
  return await onStartRecord();
};
/**
 * Fetches the audio path through audioPath() and logs it.
 *
 * @returns {Promise<void>}
 */
const onSubmit = async () => {
  const recordedPath = await audioPath();
  console.log("this is the audiopath", recordedPath);
};
};
I can get what I want when I trigger the onSubmit function, but the problem is, it also trigger the onStartRecord function again which will cause error in my case, I just want to get the uri generated when the onStartRecord resolved, but I don't want to trigger it again, so what can I do if I need to use the onSubmit function and get the value from onStartRecord? thx !
Instead of returning uri, onStartRecord should assign it to a global variable.
Then audioPath() can return that variable.
// Value resolved by the most recent successful startRecorder call; stays
// undefined until a recording has been started.
let savedAudioPath;

/**
 * Starts a recording, wires the record-back listener into the `audioProp`
 * state and remembers the value resolved by startRecorder in
 * `savedAudioPath` instead of returning it.
 *
 * Resolves with undefined in every case; a thrown error is logged.
 */
const onStartRecord = async () => {
  try {
    const targetPath = Platform.select({
      ios: `file:///audio/${filenameGenerator}.m4a`,
      android: `file:///audio/${filenameGenerator}.mp4`,
    });

    const audioSet: AudioSet = {
      AudioEncoderAndroid: AudioEncoderAndroidType.AAC,
      AudioSourceAndroid: AudioSourceAndroidType.MIC,
      AVEncoderAudioQualityKeyIOS: AVEncoderAudioQualityIOSType.high,
      AVNumberOfChannelsKeyIOS: 2,
      AVFormatIDKeyIOS: AVEncodingOption.aac,
    };
    console.log('audioSet', audioSet);

    // Mirrors the recorder position into state on every record-back event.
    const handleRecordBack = (e: any) => {
      setAudioProp((previous) => ({
        ...previous,
        recordSecs: e.current_position,
        recordTime: audioRecorderPlayer.mmssss(Math.floor(e.current_position)),
      }));
    };

    const recordingUri = await audioRecorderPlayer.startRecorder(targetPath, audioSet);
    audioRecorderPlayer.addRecordBackListener(handleRecordBack);

    console.log(`uri: ${recordingUri}`);
    savedAudioPath = recordingUri;
  } catch (err) {
    console.log(err);
  }
};

/**
 * Resolves with the remembered recording value without starting a new
 * recording.
 */
const audioPath = async () => {
  return savedAudioPath;
};

How to return an element that isn't in the page source using Puppeteer

I'm trying to return some information from a page using the following code to select a page element and return some values within it:
const puppeteer = require('puppeteer');
/**
 * Collects job cards from the Google Careers results listing.
 *
 * Walks `numberOfPages` result pages. On each page it waits for the
 * `a.gc-card` anchors, reads their text and href, and, unless it is on the
 * last requested page, clicks the next-page link. The browser is closed
 * whether the run succeeds or fails.
 *
 * @param {number} [numberOfPages] - how many result pages to read; any falsy
 *   value (undefined, 0, null) is treated as 1.
 * @returns {Promise<Array<{jobTitle: string, url: string}>>} the cards of all
 *   visited pages, in visiting order. Rejects with the underlying error if
 *   any step fails.
 */
async function run(numberOfPages) {
  const pageCount = numberOfPages || 1;

  // All four classes sit on the same anchor, so they are chained without
  // spaces; a space would mean "descendant element".
  const nextPageSelector =
    'a.gc-link.gc-link--on-grey.gc-action-group__item.gc-h-larger-tap-target';

  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();

    // NOTE: only requests whose resource type is 'document' are let through;
    // every other request (scripts included) is aborted.
    await page.setRequestInterception(true);
    page.on('request', (request) => {
      if (request.resourceType() === 'document') {
        request.continue();
      } else {
        request.abort();
      }
    });

    await page.goto('https://careers.google.com/jobs/results/');

    let urls = [];
    for (let currentPage = 1; currentPage <= pageCount; currentPage += 1) {
      await page.waitForSelector('a.gc-card');
      const newUrls = await page.evaluate(() =>
        Array.from(document.querySelectorAll('a.gc-card'), (item) => ({
          jobTitle: item.innerText,
          url: item.getAttribute('href'),
        }))
      );
      urls = urls.concat(newUrls);

      if (currentPage < pageCount) {
        // Strictly sequential: wait for the link, click it, then wait for the
        // link to be present again on the following page.
        await page.waitForSelector(nextPageSelector);
        await page.click(nextPageSelector);
        await page.waitForSelector(nextPageSelector);
      }
      await page.waitFor(500);
    }
    return urls;
  } finally {
    // Runs on success and on failure, so the browser is never left open.
    await browser.close();
  }
}
// Start a run with numberOfPages = 1; print the resolved value, or the
// rejection reason via console.error.
run(1).then(console.log).catch(console.error);
I can see using inspect in dev tools that the class gc-card is present in the dom when the page is loaded but for some reason await page.waitForSelector('a.gc-card'); times out every time I run the code. Not totally sure the reason for this, but think it could be something to do with the majority of the page body of the page being loaded through a script.
The desired outcome is to return an array with all the job titles and URLs on the page.
Your request event handler is aborting all the JavaScript files the site needs to run.
page.on('request', (request) => {
if (request.resourceType() === 'document') {
request.continue();
} else {
request.abort();
}
});
Instead of allowing only document requests, take the opposite approach: abort only the requests you are sure you won't need.

How to get the result of async / await function?

I would like to return an object from the async / await function A to pass it to another function.
Currently what I get as a result is Promise{ <pending> } or undefined.
function A:
/**
 * For every entry of `data`, parses the feed at `rssLink.rss` and builds an
 * `emailContent` object from it.
 *
 * NOTE: the iteration uses forEach, which ignores the values returned by its
 * callback and itself evaluates to undefined, so parseRss returns undefined.
 */
const parseRss = data => data.forEach(rssLink => {
// Resolves with the built emailContent; on a thrown error the error is
// logged and the result is undefined.
const getFeed = async () => {
try {
const feed = await rssParser.parseURL(rssLink.rss);
const emailContent = {
emailBody: {
title: feed.title,
content: []
}
}
// NOTE(review): `feedObj` is not declared anywhere in this snippet; the
// object built above is named `emailContent`.
feed.items.forEach(item => {
feedObj.emailBody.content.push(`${item.title} : ${item.link}`)
});
return emailContent;
} catch (e) {
console.error(e);
}
};
// Wraps the getFeed call in a second async function and returns its promise
// to forEach, which discards it.
return (async () => {
return await getFeed();
})();
});
Function B:
try {
const data = await getDataWithRss();
const emailData = await parseRss([{rss:'http://reddit.com/.rss'}]); // emailData is undefined
return formatEmail(emailData);
} catch (error) {
console.log(error);
}
How do I return emailContent from function A to use it in function B?
Thanks!
Since you made getFeed async, there is no need for another async wrapper. You are looping over the links, so return an array of promises. Then, at the call site, use Promise.all to resolve them, since there could be multiple URLs to fetch.
/**
 * Starts one feed download per entry of `data`.
 *
 * @param {Array<{rss: string}>} data - entries whose `rss` property is the
 *   feed URL handed to rssParser.parseURL.
 * @returns {Array<Promise<{emailBody: {title: *, content: string[]}}|undefined>>}
 *   one promise per entry, in input order. A promise resolves with undefined
 *   when its feed could not be processed; the error is logged in that case.
 *   Pass the array to Promise.all to wait for every feed.
 */
const parseRss = (data) =>
  data.map((rssLink) => {
    const getFeed = async () => {
      try {
        const feed = await rssParser.parseURL(rssLink.rss);
        const emailContent = {
          emailBody: {
            title: feed.title,
            content: [],
          },
        };
        // Fill the object that is actually returned below.
        feed.items.forEach((item) => {
          emailContent.emailBody.content.push(`${item.title} : ${item.link}`);
        });
        return emailContent;
      } catch (e) {
        // Best effort: a failing feed is reported and yields undefined so the
        // remaining feeds are not affected.
        console.error(e);
        return undefined;
      }
    };
    return getFeed();
  });
try {
const data = await getDataWithRss();
const emailData = await Promise.all(parseRss([{rss:'http://reddit.com/.rss'}])); // emailData is an array with one entry per feed
return formatEmail(emailData);
} catch (error) {
console.log(error);
}
await will not work inside a forEach loop. Use a for...of loop instead.
actually, getFeed() is not necessary inside inner scope, you can use async in map callback:
/**
 * Starts one feed download per entry of `data`, using an async map callback.
 *
 * @param {Array<{rss: string}>} data - entries whose `rss` property is the
 *   feed URL handed to rssParser.parseURL.
 * @returns {Array<Promise<{emailBody: {title: *, content: string[]}}>>} one
 *   promise per entry, in input order. A promise rejects if its feed cannot
 *   be parsed; nothing is caught here.
 */
const parseRss = data => data.map(async rssLink => {
  const feed = await rssParser.parseURL(rssLink.rss);
  const emailContent = {
    emailBody: {
      title: feed.title,
      content: []
    }
  };
  // Fill the object that is actually returned below.
  feed.items.forEach(item => {
    emailContent.emailBody.content.push(`${item.title} : ${item.link}`);
  });
  return emailContent;
});

Categories

Resources