Looping through multiple links properly - javascript

I am very new to Puppeteer. I started yesterday, and I'm trying to make a program that flips through a URL that incrementally stores player IDs one after the other and saves each player's stats using neDB. There are thousands of links to flip through, and I have found that if I use a for loop my computer basically crashes because 1,000 Chromiums all try to open at the same time. Is there a better, or proper, way to do this? Any advice would be appreciated.
const puppeteer = require('puppeteer');
const Datastore = require('nedb');
const database = new Datastore('database.db');
database.loadDatabase();

async function scrapeProduct(url) {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto(url);
  let attributes = [];
  // Getting the player's name
  const [name] = await page.$x('//*[@id="ctl00_ctl00_ctl00_Main_Main_name"]');
  const txt = await name.getProperty('innerText');
  const playerName = await txt.jsonValue();
  attributes.push(playerName);
  // Getting all 12 individual stats of the player
  for (let i = 1; i < 13; i++) {
    let vLink = '//*[@id="ctl00_ctl00_ctl00_Main_Main_SectionTabBox"]/div/div/div/div[1]/table/tbody/tr[' + i + ']/td[2]';
    const [e1] = await page.$x(vLink);
    const val = await e1.getProperty('innerText');
    const skillVal = await val.jsonValue();
    attributes.push(skillVal);
  }
  // Creating a player object to store the data how I want (I know this is probably ugly code and could be done in a much better way)
  let player = {
    Name: attributes[0],
    Athleticism: attributes[1],
    Speed: attributes[2],
    Durability: attributes[3],
    Work_Ethic: attributes[4],
    Stamina: attributes[5],
    Strength: attributes[6],
    Blocking: attributes[7],
    Tackling: attributes[8],
    Hands: attributes[9],
    Game_Instinct: attributes[10],
    Elusiveness: attributes[11],
    Technique: attributes[12],
  };
  database.insert(player);
  await browser.close();
}

// For loop to loop through 1000 player links... Url.com is swapped in here because the actual url is ridiculously long and not important.
for (let i = 0; i <= 1000; i++) {
  let link = 'https://url.com/?id=' + i + '&section=Ratings';
  scrapeProduct(link);
  console.log("Player #" + i + " scraped");
}

The easiest tweak would be to wait for each link to finish before starting the next:
(async () => {
  for (let i = 0; i <= 1000; i++) {
    let link = 'https://url.com/?id=' + i + '&section=Ratings';
    await scrapeProduct(link);
    console.log("Player #" + i + " scraped");
  }
})();
You could also allow only as many browsers open at once as your computer can handle. This requires more resources, but lets the process finish faster. Figure out the limit you want, then do something like:
let i = 0;
const getNextLink = () => {
  if (i > 1000) return;
  let link = 'https://url.com/?id=' + i + '&section=Ratings';
  i++;
  return scrapeProduct(link)
    .then(getNextLink)
    .catch(handleErrors); // substitute your own error handler here
};

Promise.all(Array.from(
  { length: 4 }, // allow 4 to run concurrently
  getNextLink
))
  .then(() => {
    // all done
  });
The above allows for 4 calls of scrapeProduct to be active at any one time - change the number as needed.
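If you'd rather not hand-roll the pool, the same pattern can be expressed with the p-limit package. A minimal sketch, assuming p-limit@3 (the last major version that supports require()):
const pLimit = require('p-limit');
const limit = pLimit(4); // at most 4 scrapes in flight at once

const jobs = [];
for (let i = 0; i <= 1000; i++) {
  const link = 'https://url.com/?id=' + i + '&section=Ratings';
  jobs.push(limit(() => scrapeProduct(link))); // queued until a slot frees up
}
Promise.all(jobs).then(() => console.log('all done'));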

If you think that the issue with speed is reopening/closing the browser on each run, move browser to the global scope and initialize it to null. Then create an init function with something like:
let browser = null;

async function init() {
  if (!browser) browser = await puppeteer.launch();
}
Allow a page to be passed to your scrapeProduct function: async function scrapeProduct(url) becomes async function scrapeProduct(url, page). Replace await browser.close() with await page.close(). Now your loop will look like this:
// For loop to loop through 1000 player links... Url.com is swapped in here because the actual url is ridiculously long and not important.
(async () => {
  await init();
  const jobs = [];
  for (let i = 0; i <= 1000; i++) {
    let link = 'https://url.com/?id=' + i + '&section=Ratings';
    let page = await browser.newPage();
    jobs.push(scrapeProduct(link, page));
    console.log("Player #" + i + " queued");
  }
  await Promise.all(jobs); // wait for every scrape to finish before closing the browser
  await browser.close();
})();
If you wanted to limit the number of pages the browser runs concurrently, you could create a function to do that:
async function getTotalPages() {
  const allPages = await browser.pages();
  return allPages.length;
}

async function newPage() {
  const MAX_PAGES = 5;
  await new Promise(resolve => {
    // check once a second how many pages are open
    const interval = setInterval(async () => {
      let totalPages = await getTotalPages();
      if (totalPages < MAX_PAGES) {
        clearInterval(interval);
        resolve();
      }
    }, 1000);
  });
  return await browser.newPage();
}
If you did this, in your loop you'd replace let page = await browser.newPage() with let page = await newPage().
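Putting the pieces together, the loop might then look like this (a sketch under the same assumptions as above):
(async () => {
  await init();
  const jobs = [];
  for (let i = 0; i <= 1000; i++) {
    const link = 'https://url.com/?id=' + i + '&section=Ratings';
    const page = await newPage(); // blocks until fewer than MAX_PAGES are open
    jobs.push(scrapeProduct(link, page));
  }
  await Promise.all(jobs);
  await browser.close();
})();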

Related

Puppeteer blocking variables inside functions

I recently made a quick web scraper using puppeteer, as it targets a JS website, and I want it to send the output I get in my console to Discord. The thing is that I always get errors like "price not defined" when the script tries to send the webhook to Discord. Where should I put my const embed in order for it to work properly? Thank you all for your help in advance; here is my code.
const puppeteer = require('puppeteer-extra');
// add stealth plugin and use defaults (all evasion techniques)
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const { Webhook, MessageBuilder } = require('discord-webhook-node');
const hook = new Webhook("https://discordapp.com/api/webhooks/733332015654371361/9VGAVW-BNlf3G4j3L6GhAIDni17yNIVf9gfmf_TNTQafP40LqYvRwhaYZzL_b58kpkkl");
const url = "https://www.asos.com/fr/nike/nike-air-max-270-baskets-triple-noir-ah8050-005/prd/12490103?clr=noir-triple&colourwayid=16391201&SearchQuery=nike air max 270";

puppeteer.use(StealthPlugin());

async function ConfigureBrowser() {
  const browser = await puppeteer.launch({ headless: true }); // for testing, disable headless mode
  const page = await browser.newPage();
  await page.setViewport({ width: 1000, height: 926 });
  await page.goto(url, { waitUntil: 'networkidle2' });
  return page;
}

async function Scrape(page) {
  // await page.reload();
  console.log("start evaluate javascript");
  /** @type {string[][]} */
  var productINFO = await page.evaluate(() => {
    var div = document.querySelectorAll('.core-product-container');
    console.log(div); // console.log inside evaluate shows on the browser console, not the node console
    const productnames = [];
    div.forEach(element => {
      var titleelem = element.querySelector('#aside-content > div.product-hero > h1');
      if (titleelem != null) {
        productnames.push(titleelem.textContent.trim());
      }
    });
    const productprice = [];
    div.forEach(element => {
      var price = element.querySelector('[class="current-price"]');
      if (price != null) {
        productprice.push(price.textContent.trim());
      }
    });
    const productsizes = [];
    div.forEach(element => {
      var sizes = element.querySelector('[data-id="sizeSelect"]');
      if (sizes != null) {
        productsizes.push(sizes.textContent.trim());
      }
    });
    return [productnames, productprice, productsizes];
  });
  return productINFO;
  // NOTE: everything below this return is unreachable as written
  // const embed = new MessageBuilder()
  //   .setTitle(productnames)
  //   .setURL(url)
  //   .addField('Prix', productprice, true)
  //   .addField('sizes', productsizes, true)
  //   .setColor(8008905)
  //   // .setThumbnail({image})
  //   .setDescription('Checked')
  //   //.setImage(image)
  //   .setFooter('', 'https://cdn.discordapp.com/attachments/720763827658162260/730786942316183603/image0.jpg')
  // hook.send(embed);
  discoord(productINFO);
  console.log(productINFO);
  // browser.close()
}

async function Monitor() {
  let page = await ConfigureBrowser();
  await Scrape(page);
  // console.log(productINFO);
}

Monitor();
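A minimal sketch of one way to wire this up, assuming the [productnames, productprice, productsizes] shape returned by Scrape above: construct the embed only after the scraped data has resolved, for example inside Monitor:
async function Monitor() {
  const page = await ConfigureBrowser();
  const [productnames, productprice, productsizes] = await Scrape(page);
  // The values exist here, so "price not defined" can no longer happen
  const embed = new MessageBuilder()
    .setTitle(productnames[0])
    .setURL(url)
    .addField('Prix', productprice[0], true)
    .addField('sizes', productsizes[0], true)
    .setColor(8008905)
    .setDescription('Checked');
  await hook.send(embed);
}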

How do you call a asynchronous function using setInterval?

I get a random word and then use the word to generate a GIF.
My code here runs only once. I want it to generate another word and get another image without refreshing the browser.
So I have used setInterval(), passing in the function that gets the image using fetch().
const section = document.getElementById('main');
const text = document.querySelector('.word');
let wordurl = 'https://random-word-api.herokuapp.com/word?number=1&swear=0';
let giphyapikey = '*****************';

//Setinterval
setInterval(wordgif(), 5000);

//make WordGIF call
function wordgif() {
  wordGIF().then(results => {
    text.innerHTML = results.word;
    section.innerHTML = `<img src=${results.imgurl}>`;
  }).catch(err => console.error(err))
}

//Async/await
async function wordGIF() {
  let fetchword = await fetch(wordurl);
  let word = await fetchword.json();
  console.log(word)
  let fetchgif = await fetch(`http://api.giphy.com/v1/gifs/search?q=${word}&api_key=${giphyapikey}&limit=1`);
  let gif = await fetchgif.json();
  console.log(gif)
  let imgurl = gif.data[0].images['fixed_height_small'].url;
  return {
    word: word,
    imgurl: imgurl
  }
}
As far as I understand, shouldn't
setInterval(wordgif(), 5000);
be called every 5 seconds, generating a new word and image? How do you use setInterval with an asynchronous function?
setInterval(wordgif(), 5000);
This code will call wordgif, then pass the result of that function to setInterval. It is equivalent to:
const wordgifResult = wordgif();
setInterval(wordgifResult, 5000);
Since wordgif doesn't return a value, calling setInterval has no real effect.
If you want setInterval to call wordgif, then you need only pass a reference to the function as the argument:
setInterval(wordgif, 5000);
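One caveat with setInterval and async work: if a fetch takes longer than the interval, runs can overlap. A sketch of an alternative that reschedules only after the previous run settles, reusing the asker's wordGIF:
// Chain setTimeout so a slow fetch never overlaps the next run
async function loop() {
  try {
    const results = await wordGIF();
    text.innerHTML = results.word;
    section.innerHTML = `<img src=${results.imgurl}>`;
  } catch (err) {
    console.error(err);
  } finally {
    setTimeout(loop, 5000); // schedule the next run only after this one finishes
  }
}
loop();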
I've updated your code a little bit:
You should clear any existing interval before setting a new one.
You don't need to return anything from the async function; just do what you want to do inside it.
You must check that a GIF is actually available before rendering it.
const section = document.getElementById('main');
const text = document.querySelector('.word');
let wordurl = 'https://random-word-api.herokuapp.com/word?number=1&swear=0';
let giphyapikey = '62urPH2PxR2otT2FjFFGNlvpXmnvRVfF';

wordGIF(); // can load the first gif before the interval starts

//Setinterval
let interval;
if (interval) clearInterval(interval);
interval = setInterval(wordGIF, 5000);

//Async/await
async function wordGIF() {
  let fetchword = await fetch(wordurl);
  let word = await fetchword.json();
  let fetchgif = await fetch(`https://api.giphy.com/v1/gifs/search?q=${word}&api_key=${giphyapikey}&limit=1`);
  let gif = await fetchgif.json();
  console.log('Gif available: ' + (gif && Object.keys(gif.data).length > 0));
  if (gif && Object.keys(gif.data).length > 0) {
    let imgurl = gif.data[0].images['fixed_height_small'].url;
    text.innerHTML = word;
    section.innerHTML = `<img src=${imgurl}>`;
  }
}
.as-console-wrapper {
max-height: 20px !important;
}
<div id="main"></div>
<div class="word"></div>

forEach with single document firebase queries in client side?

I have a different document ID for every loop iteration. When I query inside the forEach loop, the query works, but the object is never pushed into the array:
function getAllDonations() {
  donations = [];
  const user_session_data = sessionStorage.getItem('LoginInfo');
  const parse_user_login_data = JSON.parse(user_session_data);
  let TABLE_NAME = "donation_favourites";
  let get_requests_qry = App.db.collection(TABLE_NAME);
  get_requests_qry.where('user_id', '==', parse_user_login_data.user_id).get().then(snapshot => {
    let changes = snapshot.docChanges();
    changes.forEach(change => {
      var one_item = change.doc.data();
      let TABLE_NAME1 = "donation_requests";
      let get_requests_qry1 = App.db.collection(TABLE_NAME1);
      get_requests_qry1.doc(one_item.donationId).get().then(snapshot => {
        donations.push(snapshot.data())
      });
    });
    console.log("checking the data", donations.length) // this length is not coming
  });
}
If you want to read the documents one by one, use a for loop; that's fine for a small number of iterations but not recommended for a large one. If you want to read them in parallel, don't rely on forEach, which won't wait for the inner queries to finish. You can do it with async/await and Promise.all instead:
await Promise.all(changes.map(async (change) => {
  var one_item = change.doc.data();
  let TABLE_NAME1 = "donation_requests";
  let get_requests_qry1 = App.db.collection(TABLE_NAME1);
  var snapshot1 = await get_requests_qry1.doc(one_item.donationId).get();
  donations.push(snapshot1.data());
}));
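For context, a minimal sketch of how the original function might be rewired with that fragment (same collection names as above; the .then callback is replaced with await so the final log runs only after every push):
async function getAllDonations() {
  const donations = [];
  const parse_user_login_data = JSON.parse(sessionStorage.getItem('LoginInfo'));
  const snapshot = await App.db.collection("donation_favourites")
    .where('user_id', '==', parse_user_login_data.user_id).get();
  const changes = snapshot.docChanges();
  await Promise.all(changes.map(async (change) => {
    const one_item = change.doc.data();
    const reqSnap = await App.db.collection("donation_requests").doc(one_item.donationId).get();
    donations.push(reqSnap.data());
  }));
  console.log("checking the data", donations.length); // now reflects every fetched doc
  return donations;
}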

Waiting for an iframe to be opened and scraped is too slow to scrape js

I'm trying to scrape an old website built with tr, br and iframe tags. Everything was going well until I wanted to extract data from an iframe (see the iFrameScraping setTimeout below), but the clicking is too fast for the data to be available. Would anyone have an idea of how to click, wait for the content to show and be scraped, then continue?
const newResult = await page.evaluate(async (resultLength) => {
  const elements = document.getElementsByClassName('class');
  for (i = 0; i < resultLength; i++) {
    const companyArray = elements[i].innerHTML.split('<br>');
    let companyStreet,
      companyPostalCode;
    // Get company name
    const memberNumber = elements[i].getElementsByTagName('a')[0].getAttribute('href').match(/[0-9]{1,5}/)[0];
    const companyName = await companyArray[0].replace(/<a[^>]*><span[^>]*><\/span>/, '').replace(/<\/a>/, '');
    const companyNumber = await companyArray[0].match(/[0-9]{6,8}/) ? companyArray[0].match(/[0-9]{6,8}/)[0] : '';
    // Get town name
    const companyTown = await companyArray[1].replace('"', '');
    // Get region name
    const companyRegion = await companyArray[2].replace(/<span[^>]*>Some text:<\/span>/, '');
    // Get phone number
    const telNumber = await elements[i].innerHTML.substring(elements[i].innerHTML.lastIndexOf('</span>')).replace('</span>', '').replace('<br>', '');
    const iFrameScraping = await setTimeout(async ({ elements, i }) => {
      elements[i].getElementsByTagName('a')[0].click();
      const iFrameContent = await document.getElementById('some-id').contentWindow.document.getElementById('lblAdresse').innerHTML.split('<br>');
      companyStreet = iFrameContent[0].replace('"', '');
      companyPostalCode = iFrameContent[2].replace('"', '');
    }, 2000, { elements, i });
    console.log(companyStreet, companyPostalCode)
  }
}, pageSearchResults.length);
I fixed my issue after a while, so I'll share my solution.
I had to stop getting all the data with a loop inside evaluate, because it goes too fast and creates a race condition. Instead I used a combination of page.$$ coupled with a for…of loop. Note that forEach from ES6 causes a race condition as well, since puppeteer does not wait for the callbacks to finish before continuing its execution.
Here is the example from my updated code:
const companies = await page.$$('.repmbr_result_item');
const companiesLinks = await page.$$('.repmbr_result_item a');

for (const company of companies) {
  const companyEl = await page.evaluate(el => el.innerHTML, company);
  const companyElArray = companyEl.split('<br>');
  // ...
}
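For the original question of waiting on the iframe itself, a sketch of a timeout-free approach using Puppeteer's frame API instead of setTimeout (the selectors are taken from the question's code and may need adjusting):
// Click the link, then wait for the iframe's field instead of sleeping a fixed 2s
const frameHandle = await page.$('#some-id');  // the iframe element from the question
await page.click('.class a');                  // triggers the iframe to load
const frame = await frameHandle.contentFrame();
await frame.waitForSelector('#lblAdresse');    // resolves once the address is rendered
const iFrameContent = await frame.$eval('#lblAdresse', el => el.innerHTML.split('<br>'));
const companyStreet = iFrameContent[0].replace('"', '');
const companyPostalCode = iFrameContent[2].replace('"', '');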

await in nested for ... of loop

async traverse(url) {
  const ts = new TournamentScraper()
  const ms = new MatchScraper()
  const results = []
  const tournaments = await ts.run(url)
  for (let href of tournaments.map(t => t.href)) {
    let matches = await ms.run(href)
    let pages = ms.getPages()
    let seasons = ms.getSeasons()
    //console.log(pages)
    //console.log(seasons)
    results.push(matches)
    for (let href of pages) {
      //console.log(href)
      matches = await ms.run(href)
      //console.log(matches)
      results.push(matches)
    }
  }
  return results
}
TournamentScraper returns an array of objects, which typically look like this:
{ name: 'Foo', href: 'www.example.org/tournaments/foo/' }
The link points to the first page of the tournament's most recent season. This page contains the links to the other seasons and a paginator (if any).
MatchScraper's run returns some data and sets the instance's dom property. getPages() and getSeasons() consume this property and each returns an array of links.
The problem is that results contains only the first batch of matches. I can see the 2nd page's matches in the console log, but they are not in the results array when traverse returns.
I found this rule, which advises against await inside a for loop. The problem is that I have to wait for ms.run(href), because it sets dom, and getPages() and getSeasons() need it to be set before they can extract the links.
I think this should work. It utilizes Promise.all rather than nested loops:
const run = href => ms.run(href);

async function getMatches(href) {
  const out = [];
  const matches = await run(href);
  const pages = ms.getPages();
  out.push(matches);
  if (pages.length) {
    const pageResults = await Promise.all(pages.map(href => run(href)));
    out.push(...pageResults);
  }
  return out;
}

async function traverse(url) {
  const ts = new TournamentScraper();
  const ms = new MatchScraper();
  const tournaments = await ts.run(url);
  const matches = await Promise.all(tournaments.map(t => getMatches(t.href)));
  return matches.reduce((a, b) => {
    a.push(...b);
    return a;
  }, []);
}
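As a side note, the final reduce just flattens one level of nesting; on Node 11+ it could equivalently be written as matches.flat().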
