Puppeteer blocking variables inside functions - javascript

I recently made a quick web scraper using puppeteer as it targets a JS website and want it to send the output that i get inside my console into discord. The thing is that I always get e.g price not defined or so when the script tries to send the web hook onto discord. Thank you all for your help in advance here is my code if someone can help me out please. I mean where should I put my const embed in order for it to work properly.
const puppeteer = require('puppeteer-extra');
// add stealth plugin and use defaults (all evasion techniques)
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const { Webhook, MessageBuilder } = require('discord-webhook-node');
const hook = new Webhook("https://discordapp.com/api/webhooks/733332015654371361/9VGAVW-BNlf3G4j3L6GhAIDni17yNIVf9gfmf_TNTQafP40LqYvRwhaYZzL_b58kpkkl");
const url = "https://www.asos.com/fr/nike/nike-air-max-270-baskets-triple-noir-ah8050-005/prd/12490103?clr=noir-triple&colourwayid=16391201&SearchQuery=nike air max 270";
puppeteer.use(StealthPlugin());
async function ConfigureBrowser(){
const browser = await puppeteer.launch({ headless: true }); // for test disable the headlels mode,
const page = await browser.newPage();
await page.setViewport({ width: 1000, height: 926 });
await page.goto(url,{waitUntil: 'networkidle2'})
return page;
};
async function Scrape(page) {
// await page.reload();
console.log("start evaluate javascript")
/** #type {string[]} */
var productINFO = await page.evaluate(()=>{
var div = document.querySelectorAll('.core-product-container');
console.log(div) // console.log inside evaluate, will show on browser console not on node console
const productnames = []
div.forEach(element => {
var titleelem = element.querySelector('#aside-content > div.product-hero > h1');
if(titleelem != null){
productnames.push(titleelem.textContent.trim());
} //#aside-content > div.product-hero > h1
});
const productprice = []
div.forEach(element => {
var price = element.querySelector('[class="current-price"]');
if(price != null){
productprice.push(price.textContent.trim());
}
});
const productsizes = []
div.forEach(element => {
var sizes = element.querySelector('[data-id="sizeSelect"]');
if(sizes != null){
productsizes.push(sizes.textContent.trim());
}
// productsizes.forEach()
})
return [productnames, productprice, productsizes]
})
return productINFO;
// const embed = new MessageBuilder()
// .setTitle(productnames)
// .setURL(url)
// .addField('Prix', productprice, true)
// .addField('sizes', productsizes, true)
// .setColor(8008905)
// // .setThumbnail({image})
// .setDescription('Checked')
// //.setImage(image)
// .setFooter('', 'https://cdn.discordapp.com/attachments/720763827658162260/730786942316183603/image0.jpg')
// hook.send(embed);
discoord(productINFO);
console.log(productINFO);
//browser.close()
} ;
async function Monitor() {
let page = await ConfigureBrowser();
await Scrape(page);
// console.log(productINFO);
}
Monitor();

Related

How can I create a script to fill fields automatically?

I want to create a script with JavaScript that allows me to automatically go to a website and automatically fill in the fields on that website. When all the fields have been filled in automatically, I want the submit button to be clicked and the process to start over. I define the fields by their unique id. I now have the problem that when I run the script that I don't get to the website automatically. Can anyone help me how I could improve my script so that I can automate this process?
Here is my code:
function goingToWebsite(){
window.location.href = 'https://example.com/';
}
function fillForm(){
document.getElementById('gender').value = "Herr"
document.getElementById('lastname').value = "Example";
document.getElementById('firstname').value = "Example";
document.getElementById('address').value = "Example";
document.getElementById('postcode').value = "Example";
document.getElementById('city').value = "Example";
document.getElementById('mobile').value = "Example";
document.getElementById('email').value = "Example";
document.getElementById('country_code').value = "Example";
document.getElementById('birthday').value = "Example";
document.getElementById('profession').value = "Example";
document.getElementById('hobbies').value = "Example";
document.getElementById('skills').value = "Example";
document.getElementById('wish').value = "Example";
element = document.getElementById('agb-checkbox');
element.checked = true;
}
function submitForm(){
document.getElementsByName("JETZT ANMELDEN").click();
}
function main(){
for (let i=0; i<99; i++){
goingToWebsite();
fillForm();
submitForm();
setTimeout(timeout, 3000);
}
}
Thanks a lot
I have it now like this:
const puppeteer = require('puppeteer');
async function main(){
const browser = await puppeteer.launch({
headless: false
});
for (let i=0; i<3; i++){
const page = await browser.newPage();
await page.goto('https://beispiel.de/');
await page.type('#gender', 'Herr');
await page.type('#lastname', 'Beispiel');
await page.type('#firstname', 'Beispiel');
await page.type('#address', 'Beispiel 29');
await page.type('#postcode', 'Beispiel');
await page.type('#city', 'Beispiel');
await page.type('#mobile', '0041797766666');
await page.type('#email', 'Beispiel#outlook.com');
await page.type('#country_code', 'Deutschland');
await page.type('#birthday', '28.07.1995');
await page.type('#profession', 'Beispiel');
await page.type('#hobbies', 'Beispiel');
await page.type('#skills', 'Beispiel');
await page.type('#wish', 'Beispiel');
await (await page.waitForSelector('#agb-checkbox')).click();
const searchBtn = await page.$x("//button[#class='form-btn']");
searchBtn[0].click();
await page.waitForTimeout(5000);
await browser.close();
}
}
main();
When the first loop is finished it doesn't reload again a new page what could be the reason?
You cannot do this with client-side javascript.You should use server-side language for this.After location change, you cannot manage the current site.
If you are familiar with node js you can use puppeteer.

How/When to remove child elements to clear search result?

Trying to clear my search result after I submit a new API call. Tried implementing gallery.remove(galleryItems); at different points but to no avail.
A bit disappointed I couldn't figure it out but happy I was able to get a few async functions going. Anyway, here's the code:
'use strict';
const form = document.querySelector('#searchForm');
const gallery = document.querySelector('.flexbox-container');
const galleryItems = document.getElementsByClassName('flexbox-item');
form.addEventListener('submit', async (e) => {
e.preventDefault();
const userSearch = form.elements.query.value; // grab user input
const res = await getRequest(userSearch); // async func that returns a fully parsed Promise
tvShowMatches(res.data); // looks for matches, creates and appends name + image;
form.elements.query.value = '';
});
const getRequest = async (search) => {
const config = { params: { q: search } };
const res = await axios.get('http://api.tvmaze.com/search/shows', config);
return res;
};
const tvShowMatches = async (shows) => {
for (let result of shows) {
if (result.show.image) {
// new div w/ flexbox-item class + append to gallery
const tvShowMatch = document.createElement('DIV')
tvShowMatch.classList.add('flexbox-item');
gallery.append(tvShowMatch);
// create, fill & append tvShowName to tvShowMatch
const tvShowName = document.createElement('P');
tvShowName.textContent = result.show.name;
tvShowMatch.append(tvShowName);
// create, fill & append tvShowImg to tvShowMatch
const tvShowImg = document.createElement('IMG');
tvShowImg.src = result.show.image.medium;
tvShowMatch.append(tvShowImg);
}
}
};
Thanks
Instead of gallery.remove(galleryItems); consider resetting gallery.innerHTML to an empty string whenever a submit event occurs
Like this:
form.addEventListener('submit', async (e) => {
e.preventDefault();
gallery.innerHTML = ''; // Reset here
const userSearch = form.elements.query.value; // grab user input
const res = await getRequest(userSearch); // async func that returns a fully parsed Promise
tvShowMatches(res.data); // looks for matches, creates and appends name + image;
form.elements.query.value = '';
});
I believe this will do it.. you were close.
const galleryItems = document.getElementsByClassName('flexbox-item');
// to remove
galleryItems.forEach(elem => elem.remove() );

TypeError: Cannot read property 'getProperty' of undefined? Node.js, Puppeteer

When trying to get the text of the 'name' element for my scraper. I try to grab it with the full Xpath and get the error 'TypeError: Cannot read property 'getProperty' of undefined' I tried just using the regular Xpath but that said name: 'skip navigation' why is get property coming back as undefined? it only happens when trying to get the channel title, it works when getting the profile image.
scaper.js
const puppeteer = require('puppeteer');
async function scrapeChannel(url) {
const browser = await puppeteer.launch()
const page = await browser.newPage();
await page.goto(url);
// const xpath_expression = '/html/body/ytd-app/div/ytd-page-manager/ytd-browse[2]/div[3]/ytd-c4-tabbed-header-renderer/tp-yt-app-header-layout/div/tp-yt-app-header/div[2]/div[2]/div/div[1]/div/div[1]/ytd-channel-name/div/div/yt-formatted-string';
// await page.waitForXPath(xpath_expression);
const [el] = await page.$x('/html/body/ytd-app/div/ytd-page-manager/ytd-browse[2]/div[3]/ytd-c4-tabbed-header-renderer/tp-yt-app-header-layout/div/tp-yt-app-header/div[2]/div[2]/div/div[1]/div/div[1]/ytd-channel-name/div/div/yt-formatted-string');
const text = await el.getProperty('textContent');
const name = await text.jsonValue();
const [el2] = await page.$x('//*[#id="img"]');
const src = await el2.getProperty('src');
const avatarURL = await src.jsonValue();
browser.close();
console.log({name, avatarURL});
return { name, avatarURL}
}
}
scrapeChannel('https://www.youtube.com/channel/UC8butISFwT-Wl7EV0hUK0BQ')
index.html
function newEl(type, attrs = {}) {
const el = document.createElement(type);
for (let attr in attrs) {
const value = attrs[attr];
if (attr == "innerText") el.innerText = value;
else el.setAttribute(attr, value);
}
return el;
}
It seems you may have a typo in XPath. When I try your XPath in the browser console, it returns no elements. However, with this one change, it returns an element:
$x('/html/body/ytd-app/div/ytd-page-manager/ytd-browse[1]/div[3]/ytd-c4-tabbed-header-renderer/tp-yt-app-header-layout/div/tp-yt-app-header/div[2]/div[2]/div/div[1]/div/div[1]/ytd-channel-name/div/div/yt-formatted-string')
.......................................................^: 1 instead of 2

Looping through multiple links properly

I am very new to puppeteer. I started yesterday and I'm trying to make a program that flips through a url that incrementally stores player id's one after the other and saves the player stats using neDB. There are thousands of links to flip through and I have found that if i use a for loop my computer basically crashes because 1,000 Chromiums try to open all at the same time. Is there a better way, or proper way to do this? Any advice would be appreciated.
const puppeteer = require('puppeteer');
const Datastore = require('nedb');
const database = new Datastore('database.db');
database.loadDatabase();
async function scrapeProduct(url){
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(url);
let attributes = [];
//Getting player's name
const [name] = await page.$x('//*[#id="ctl00_ctl00_ctl00_Main_Main_name"]');
const txt = await name.getProperty('innerText');
const playerName = await txt.jsonValue();
attributes.push(playerName);
//Getting all 12 individual stats of the player
for(let i = 1; i < 13; i++){
let vLink = '//*[#id="ctl00_ctl00_ctl00_Main_Main_SectionTabBox"]/div/div/div/div[1]/table/tbody/tr['+i+']/td[2]';
const [e1] = await page.$x(vLink);
const val = await e1.getProperty('innerText');
const skillVal = await val.jsonValue();
attributes.push(skillVal);
}
//creating a player object to store the data how i want (i know this is probably ugly code and could be done in a much better way)
let player = {
Name: attributes[0],
Athleticism: attributes[1],
Speed: attributes[2],
Durability: attributes[3],
Work_Ethic: attributes[4],
Stamina: attributes[5],
Strength: attributes[6],
Blocking: attributes[7],
Tackling: attributes[8],
Hands: attributes[9],
Game_Instinct: attributes[10],
Elusiveness: attributes[11],
Technique: attributes[12],
};
database.insert(player);
await browser.close();
}
//For loop to loop through 1000 player links... Url.com is swapped in here because the actual url is ridiculously long and not important.
for(let i = 0; i <= 1000; i++){
let link = 'https://url.com/?id='+i+'&section=Ratings';
scrapeProduct(link);
console.log("Player #" + i + " scrapped");
}
The easiest tweak would be to wait for each link to finish before starting the next:
(async () => {
for(let i = 0; i <= 1000; i++){
let link = 'https://url.com/?id='+i+'&section=Ratings';
await scrapeProduct(link);
console.log("Player #" + i + " scrapped");
}
})();
You could also allow only enough open as your computer can handle. This will require more resources, but will allow the process to finish faster. Figure out the limit you want, then do something like:
let i = 0;
const getNextLink = () => {
if (i > 1000) return;
let link = 'https://url.com/?id='+i+'&section=Ratings';
i++;
return scrapeProduct(link)
.then(getNextLink)
.catch(handleErrors);
};
Promise.all(Array.from(
{ length: 4 }, // allow 4 to run concurrently
getNextLink
))
.then(() => {
// all done
});
The above allows for 4 calls of scrapeProduct to be active at any one time - change the number as needed.
If you think that the issue with speed is reopening/closing the browser with each run, move browser to the global scope and initialize it to null. Then create a init function with something like:
async function init(){
if(!browser)
browser = await puppeteer.launch()
}
Allow pages to be passed to your scrapeProduct function. async function scrapeProduct(url) becomes async function scrapeProduct(url,page). Replace await browser.close() with await page.close(). Now your loop will look like this:
//For loop to loop through 1000 player links... Url.com is swapped in here because the actual url is ridiculously long and not important.
await init();
for(let i = 0; i <= 1000; i++){
let link = 'https://url.com/?id='+i+'&section=Ratings';
let page = await browser.newPage()
scrapeProduct(link,page);
console.log("Player #" + i + " scrapped");
}
await browser.close()
If you wanted to limit number of pages the browser will concurrently run you could create a function to do that:
async function getTotalPages(){
const allPages = await browser.pages()
return allPages.length
}
async function newPage(){
const MAX_PAGES = 5
await new Promise(resolve=>{
// check once a second to check on pages open
const interval = setInterval(async ()=>{
let totalPages = await getTotalPages()
if(totalPages< MAX_PAGES){
clearInterval(interval)
resolve()
}
},1000)
})
return await browser.newPage()
}
If you did this, in your loop you'd replace let page = await browser.newPage with let page = await newPage()

How to check if element exists inside async / await with Puppeteer

I am having trouble recording an element on a page after checking if it exists. The block of code I'm referring to is under the "// phone" comment.
This code loops through each section (section of sections) on the page and records "company" and "phone." "Phone" may not be present in some sections so I figured I'd pass it through an if statement to check if it exists. This creates an error = "Error: failed to find element matching selector ".mn-contact-phone"" How do I solve this?
(async () => {
try {
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
// loop through pages
for (let pg = 1; pg < 5; pg++) {
await page.goto("webpage");
// record number of sections
const sections = await page.$$("#mn-members-listings > div");
// loop through each section
for (const section of sections) {
// company
let company = await section.$eval(
"div.mn-searchlisting-title",
comp => comp.innerText
);
// phone --> THIS IF/ELSE THROWS AN ERROR
if (section.$(".mn-contact-phone").length > 0) {
let phone = await section.$eval(".mn-contact-phone", phn => phn.innerText);
} else {
let phone = "";
}
console.log(`Company = ${company} | Phone = ${phone}`);
}
}
await browser.close();
} catch (error) {
console.log(`Our error is = ${error}`);
}
})();
From puppeteer docs:
The method runs document.querySelector within the page. If no element
matches the selector, the return value resolves to null.
1) null doesn't have length.
2) ElementHandle.$ returns a promise.
change the condition to:
if (await section.$(".mn-contact-phone"))
or if there are multiple elements:
if (await section.$$(".mn-contact-phone").length > 0)

Categories

Resources