amazon scrapping puppeteer click next page problem - javascript

Trying to scrap amazon products variation in a single page.
PS C:\xampp\htdocs\js> node varyant
5
SONY PlayStation 5 HD Kamera | Fiyat = 1.172,48TL
SONY PlayStation 5 Dualsense Sarj İstasyonu | Fiyat = 729,00TL
SONY PlayStation 5 HD Kamera | Fiyat = 1.172,48TL
SONY PlayStation 5 Konsol | Fiyat = Ürün Yok!
SONY PlayStation 5 PULSE 3D Kablosuz Kulaklık | Fiyat = 1.997,00TL
Thats the result
const puppeteer = require("puppeteer");
async function varyant() {
const browser = await puppeteer.launch({
headless: false,
slowMo: 20,
defaultViewport: null,
});
const page = await browser.newPage();
await page.goto(
"https://www.amazon.com.tr/Sony-SONY-PlayStation-5-Konsol/dp/B08MLPLQHV",
{
waitUntil: "networkidle2",
timeout: 0,
}
);
await page.waitForSelector('[id="sp-cc-rejectall-link"]');
await page.click('[id="sp-cc-rejectall-link"]');
if (await page.$('[id="twisterContainer"]')) {
////varyant varsa
await page.content();
const liste = await page.$$(
'[id="twisterContainer"] > div > form > div >ul>li'
);
console.log(liste.length);
for await (const list of liste) {
await list.click();
await page.waitForSelector(".swatchSelect");
await page.waitForTimeout(3000);
const urun = await page.$eval(
"#productTitle",
(elHandles) => elHandles.outerText
);
const element2 = await page.$(
'[id="corePrice_feature_div"] > div > span > span'
);
if (element2 !== null) {
var varyantfiyat = await element2.evaluate((el) => el.textContent);
} else {
var varyantfiyat = "Ürün Yok!";
//continue;
}
console.log(`${urun} | Fiyat = ${varyantfiyat}`);
}
} else {
const element3 = await page.$(
'[id="corePrice_feature_div"] > div > span > span'
);
var varyantfiyat1 = await element3.evaluate((el) => el.textContent);
console.log(varyantfiyat1);
}
await page.waitForTimeout(3000);
await browser.close();
}
5 different products in webpage, but as seen in the result click event not happens correctly and same product comes.Where is my mistake.This script sometimes works perfect sometimes duplicate results.

Related

Puppeteer timeouts in headless mode

I need to go to the UPS site, type in a tracking number, then get the data about that tracking number. I turned headless to false then the timeouts disappeared, turn it to true, and it will timeout or wait forever. I tried a number of different methods (any I could find), to mess around with the promises, but nothing helped.
const puppeteer = require('puppeteer');
var fs = require("fs").promises;
var Page = {};
var Browser = {};
async function printPage(){
console.log("printPage()");
await fs.writeFile("pagecontent_afterClick.txt", await Page.content());
const count = await Page.$$eval('#st_App_PkgStsMonthNum', divs => divs.length);
console.log(count);
const deliveredOn = await Page.$eval('#st_App_PkgStsMonthNum', el => el.textContent);
const deliveredToAddress = await Page.$eval('#stApp_txtAddress', el => el.textContent);
const deliveredToCountry = await Page.$eval('#stApp_txtCountry', el => el.textContent);
const deliveredBy = await Page.$eval('#stApp_valReceivedBy', el => el.textContent);
console.log("browser close");
await Browser.close();
}
async function start(){
console.log("start");
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
Page = page
Browser = browser
const navigationPromise = page.waitForNavigation({waitUntil: 'load'})
await navigationPromise;
await fs.writeFile("pagecontent_B4.txt", await page.content());
await page.type("#stApp_trackingNumber", "1Z0F1R740320306827");
const searchValue = await page.$eval('#stApp_trackingNumber', el => el.value);
console.log(searchValue);
const count = await page.$$eval('button#stApp_btnTrack.ups-cta.ups-cta_primary', divs => divs.length);
console.log(count);
try {
console.log("end waiting");
await Promise.all([
page.click('#stApp_btnTrack'),
page.waitForNavigation({waitUntil: 'networkidle2'})
]);
printPage();
} catch (e) {
if (e instanceof puppeteer.errors.TimeoutError) {
console.log("timeout");
printPage();
}
}
}
start();

puppeteer returning undefined when trying to scrape img src

I was trying to scrape a thumbnail image from youtube with its XPath but I am getting undefined for the src. I can't figure out what is causing this? I already tried using both the XPath and full XPath but that didn't help. Any help is appreciated. Thanks in advance.
const puppeteer = require('puppeteer');
async function scrapeChannel1(url) {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(url, {
timeout: 0
});
const [el2] = await page.$x('//*[#id="dismissible"]/ytd-thumbnail');
const src1 = await el2.getProperty('src');
const thumbnailURL1 = await src1.jsonValue();
browser.close();
console.log({
thumbnailURL1
})
return {
thumbnailURL1
}
}
scrapeChannel1('https://www.youtube.com/')
The <img> you are looking for is placed a bit deeper in the DOM at: '//*[#id="dismissible"]/ytd-thumbnail/a/yt-img-shadow/img' (so you should add: /a/yt-img-shadow/img at the end of your XPath expression).
Note, you have more powerful tools in puppeteer than .getProperty('src') to retrieve DOM element properties.
E.g. page.$eval:
const selector = 'ytd-thumbnail > a > yt-img-shadow > #img'
const imageSrc = await page.$eval(selector, el => el.src)
// returns: https://i.ytimg.com/vi/{youtube_id}/hqdefault.jpg...
Or if you want all images use page.$$eval:
const imageSrcs = await page.$$eval(selector, elems => elems.map(el => el.src))
If you want to get images src from YouTube, you need to scroll video thumbnails into view like in the code below (also check it on the online IDE):
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
const mainPageUrl = "https://www.youtube.com";
async function scrollPage(page, scrollElements) {
let currentElement = 0;
while (true) {
let elementsLength = await page.evaluate((scrollElements) => {
return document.querySelectorAll(scrollElements).length;
}, scrollElements);
for (; currentElement < elementsLength; currentElement++) {
await page.waitForTimeout(200);
await page.evaluate(
(currentElement, scrollElements) => {
document.querySelectorAll(scrollElements)[currentElement].scrollIntoView();
},
currentElement,
scrollElements
);
}
await page.waitForTimeout(5000);
let newElementsLength = await page.evaluate((scrollElements) => {
return document.querySelectorAll(scrollElements).length;
}, scrollElements);
if (newElementsLength === elementsLength || currentElement > 100) break; // if you want to get all elements (or some other number of elements) change number to 'Infinity' (or some other number)
}
}
async function getThumbnails() {
const browser = await puppeteer.launch({
headless: false,
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const page = await browser.newPage();
await page.setDefaultNavigationTimeout(60000);
await page.goto(mainPageUrl);
await page.waitForSelector("#contents");
const scrollElements = "a#thumbnail";
await scrollPage(page, scrollElements);
await page.waitForTimeout(10000);
const urls = await page.$$eval("a#thumbnail #img", (els) => els.map(el => el.getAttribute('src')).filter(el => el));
await browser.close();
return urls;
}
getThumbnails().then(console.log);
Output
[
"https://i.ytimg.com/vi/02oeySm1CJA/hq720.jpg?sqp=-oaymwEcCNAFEJQDSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBmrYMHESpY_f1oTNx00iuR3tNeCQ",
"https://i.ytimg.com/vi/RMo2haIPYBM/hq720_live.jpg?sqp=CNifxJcG-oaymwEcCNAFEJQDSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBw4ogzR0709SqbttRdEzfL-aTdgQ",
"https://i.ytimg.com/vi/qJFFp_ta1Zk/hqdefault.jpg?sqp=-oaymwEcCOADEI4CSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBJ-44OFgBUuVUYWBVh3Yi3hQgwIg",
"https://i.ytimg.com/vi/OZoTjoN-Sn0/hqdefault.jpg?sqp=-oaymwEcCOADEI4CSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLCOeGTCnlT4U0wV1SNclkmFUEHLaA",
"https://i.ytimg.com/vi/L8cH2gI67uk/hqdefault.jpg?sqp=-oaymwEcCOADEI4CSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLAuvZ3khIjpvAVTGjmR9FDxQrPIgQ",
"https://i.ytimg.com/vi/6rUyVKyJnGY/hq720.jpg?sqp=-oaymwEcCNAFEJQDSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLCifsTG4MlA3mf8CcJDkfKdWaZkaA",
"https://i.ytimg.com/vi/xpaURivPZFk/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLA5oFDDsVzbV3tUqyfogfuf3LPahQ",
"https://i.ytimg.com/vi/MsR76PyVdUs/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLAEBYGNvif-7LWx2mqW4G9o-OUhEQ",
"https://i.ytimg.com/vi/liasQRRVt5w/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLAUcMpyKY0GhmNAHHtP_cDkAp18DQ",
"https://i.ytimg.com/vi/Dr5IqlTLMDM/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLBOSUi6mgjdD5a-Jx8Ns24SlexB1g",
"https://i.ytimg.com/vi/E8kit8xJKdI/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLDDStn95G7ei5DTusGXE4RimzdLUw",
"https://i.ytimg.com/vi/SqEaahOmLHU/hq720_2.jpg?sqp=-oaymwEdCM0CENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLBDcWLCklNxEAuT1ZvSTKrIplGOag",
...and other results
]
You can read more about scraping YouTube search from my blog post Web scraping YouTube search video results with Nodejs.

Use array of keywords and loop through script in Playwright

So, I am trying to scrape a couple of searchengines with a couple of search phrases with Playwright.
Running the script with one query is working.
Working:
const { chromium } = require('playwright');
(async () => {
const browser = await chromium.launch({ headless: false, slowMo: 250 });
const context = await browser.newContext()
const page = await context.newPage();
const keyWord = ('Arsenal');
await page.goto('https://duckduckgo.com/');
await page.fill('//input[#name="q"]',keyWord);
await page.keyboard.press('Enter');
const getOne = (' (//h2[#class="result__title"])[9] ');
await page.waitForSelector(getOne)
const pushOne = await page.$(getOne);
const One = await pushOne.evaluate(element => element.innerText);
console.log(One);
await page.goto('https://yandex.com/');
await page.fill('//input[#aria-label="Request"]', keyWord);
await page.keyboard.press('Enter');
const getTwo = (' //li[#data-first-snippet] //div[#class="organic__url-text"] ');
await page.waitForSelector(getTwo)
const pushTwo = await page.$(getTwo);
const Two = await pushTwo.evaluate(element => element.innerText);
console.log(Two);
await browser.close()
})()
But when I use an array with phrases (keyWordlist) I fail to get the script running.
Have searched around for using Array with 'For' and 'Foreach' loops, but haven't been able to fix it.
I want to run the different keywords through the different searchengines and list the results.
For 3 keywords in two searchengines that would get 6 results.
const { chromium } = require('playwright');
(async () => {
const browser = await chromium.launch({ headless: false, slowMo: 250 });
const context = await browser.newContext()
const page = await context.newPage();
let kewWordlist = ['Arsenal', 'Liverpool', 'Ajax']
for (var i=0; i<=kewWordlist.length; i++) {
// for (const i in kewWordlist){
async () => {
const keyWord = kewWordlist[i];
await page.goto('https://duckduckgo.com/');
await page.fill('//input[#name="q"]',keyWord);
// await page.fill('//input[#name="q"]',[i]);
// await page.fill('//input[#name="q"]',`${keyWord}`);
await page.keyboard.press('Enter');
const getOne = (' (//h2[#class="result__title"])[9] ');
await page.waitForSelector(getOne)
const pushOne = await page.$(getOne);
const One = await pushOne.evaluate(element => element.innerText);
console.log(One);
// await page.goto('https://yandex.com/');
// await page.fill('//input[#aria-label="Request"]', keyWord);
// await page.keyboard.press('Enter');
// const getTwo = (' //li[#data-first-snippet] //div[#class="organic__url-text"] ');
// await page.waitForSelector(getTwo)
// const pushTwo = await page.$(getTwo);
// const Two = await pushTwo.evaluate(element => element.innerText);
// console.log(Two);
}}
await browser.close()
})()
If anyone has some pointers on how to solve this, much obliged.
maybe the result selectors needs some tweaking but I think this is what you were looking for:
test.only('search search engines', async({page, context}) => {
const search = [
{
name: 'yandex',
url: 'https://yandex.com/',
elementFill: '//input[#aria-label="Request"]',
elementResult: '//li[#data-first-snippet] //div[#class="organic__url-text"]'
},
{
name: 'google',
url: 'https://www.google.nl',
elementFill: '//input[#name="q"]',
elementResult: '(//h2[#class="result__title"])[9]'
},
{
name: '',
url: 'https://duckduckgo.com/',
elementFill: '//input[#name="q"]',
elementResult: '(//h2[#class="result__title"])[9]'
}
]
const kewWordlist = ['Arsenal', 'Liverpool', 'Ajax']
for (let i = 0; i < search.length; i++) {
const searchName = search[i].name
const searchResult = search[i].elementResult
const searchFill = search[i].elementFill
const searchPage = await context.newPage()
await searchPage.waitForLoadState()
await searchPage.goto(`${search[i].url}`)
for (let i = 0; i < kewWordlist.length; i++) {
await searchPage.fill(searchFill,kewWordlist[i])
await searchPage.keyboard.press('Enter')
await searchPage.waitForSelector(searchResult)
const result = await page.$(searchResult)
console.log(`${searchName}: ${result} `)
}
}
})
The reason your loop isn't working is that you have an async function inside of it that you never call. There are a few ways you could go about this:
You could take your first version, have it accept a word to search, and run that over each element of the array:
const searchOneKeyword = async (keyWord) => {
const browser = await chromium.launch({ headless: false, slowMo: 250 });
const context = await browser.newContext()
const page = await context.newPage();
// rest of code
}
const kewWordList = ['Arsenal', 'Liverpool', 'Ajax']
keyWordList.forEach((k) => {
searchOneKeyword(k)
})
Or if you'd like to keep the same browser instance, you can do it in a loop in the function:
const search = async (words) => {
const browser = await chromium.launch({ headless: false, slowMo: 250 });
const context = await browser.newContext()
const page = await context.newPage();
for (const keyWord of words) {
await page.goto('https://duckduckgo.com/');
await page.fill('//input[#name="q"]',keyWord);
await page.keyboard.press('Enter');
const getOne = (' (//h2[#class="result__title"])[9] ');
await page.waitForSelector(getOne)
const pushOne = await page.$(getOne);
const One = await pushOne.evaluate(element => element.innerText);
console.log(One);
// etc.
}
await browser.close()
}
search(keyWordList)
In both of those cases, you're logging, but never returning anything, so if you need that data in another function afterwards, you'd have to change that. Example:
const search = async (words) => {
const browser = await chromium.launch({ headless: false, slowMo: 250 });
const context = await browser.newContext()
const page = await context.newPage();
const results = await Promise.all(words.map((keyWord) => {
await page.goto('https://duckduckgo.com/');
await page.fill('//input[#name="q"]',keyWord);
await page.keyboard.press('Enter');
// etc.
return [ One, Two ]
}))
await browser.close()
return results
}
search(keyWordList).then((results) => { console.log(results.flat()) })
I have spent a couple of hours trying to get the script working based on your suggestions. No result unfortunately. I get errors like 'await is only valid in async function' and 'Unreachable code detected'. Searched for other examples, for some inspiration, but none found. If you or someone else has a suggestion, please share! This is code I have now:
const { chromium } = require('playwright');
let keyWordList = ['Arsenal', 'Liverpool', 'Ajax']
const search = async function words() {
const browser = await chromium.launch({ headless: false, slowMo: 250 });
const context = await browser.newContext()
const page = await context.newPage();
}
const results = await Promise.all(words.map(keyWord))
//DUCKDUCKGO
await page.goto('https://duckduckgo.com/');
await page.fill('//input[#name="q"]',keyWord);
await page.keyboard.press('Enter');
const getOne = (' (//h2[#class="result__title"])[9] ');
await page.waitForSelector(getOne)
const pushOne = await page.$(getOne);
const One = await pushOne.evaluate(element => element.innerText);
//YANDEX
await page.goto('https://yandex.com/');
await page.fill('//input[#aria-label="Request"]', keyWord);
await page.keyboard.press('Enter');
const getTwo = (' //li[#data-first-snippet] //div[#class="organic__url-text"] ');
await page.waitForSelector(getTwo)
const pushTwo = await page.$(getTwo);
const Two = await pushTwo.evaluate(element => element.innerText);
console.log(Two);
return [ One , Two ]
return results
search(keyWordList).then((results) => { console.log(results.flat())
await browser.close();
})

Puppeteer eval issues

I am having issues trying to get this to work.
I am needing to select the calendar heading "May 2020", but am not having any luck, could I get someone to look at what I am doing wrong?
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('https://www.stayz.com.au/holiday-rental/p9177051?noDates=true',{waitUntil: 'domcontentloaded'});
const headingTxt = await page.evaluate(() =>
document.querySelector('#rates-availability > div > div > section > div > div.inline-calendar > div > div.cal-controls__calendar-parent--middle-multi > div > div:nth-child(2) > h4 > span').innerText);
console.log('');
console.log('========[ output ]======== ', headingTxt);
console.log('');
await page.close();
await browser.close();
})();
This selector needs scrolling and wide viewport to appear:
'use strict';
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch({ headless: false, args: ['--start-maximized'] }); //
const page = await browser.newPage();
await page.setViewport({ width: 1280, height: 850 });
await page.goto('https://www.stayz.com.au/holiday-rental/p9177051?noDates=true', {waitUntil: 'domcontentloaded'});
const selector = '#rates-availability > div > div > section > div > div.inline-calendar > div > div.cal-controls__calendar-parent--middle-multi > div > div:nth-child(2) > h4 > span';
while ((await page.$(selector)) === null) {
await page.evaluate(() => { window.scrollBy(0, window.innerHeight); });
await page.waitFor(1000);
}
const headingTxt = await page.evaluate(
selector => document.querySelector(selector).innerText,
selector
);
console.log('');
console.log('========[ output ]======== ', headingTxt);
console.log('');
await page.close();
await browser.close();
})();

Puppeteer how to read and use data from a text file

Hi I'm working on a script to learn node JS, and I'm stuck at this point :
I would like to read a text file with my email: password, and with that I would like to open as many tabs as many email:password I have and use each email: password to connect to the website
const puppeteer = require("puppeteer");
const lineReader = require("line-reader");
(async () => {
let data = [];
const promises = []
let int = 0;
lineReader.eachLine("c.txt", function(line) {
int++;
data = line.split(":")
console.log(data);
});
console.log(int);
for (let i = 0; i < 2; i++) {
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
const navigationPromise = page.waitForNavigation();
await page.goto("https:www.site.com/en/launch/");
await page.setViewport({ width: 1920, height: 1080 });
await page.waitForSelector(
".d-sm-h > .bg-white > .right-nav > .member-nav-item > .join-log-in"
);
await page.click(
".d-sm-h > .bg-white > .right-nav > .member-nav-item > .join-log-in"
);
const emailInput = '[placeholder="Adresse e-mail"][autocomplete="email"]';
await page.waitForSelector(emailInput, { timeout: 0 });
await page.focus(emailInput);
await page.keyboard.type(data[0]);
const passwordInput =
'[placeholder="Mot de passe"][autocomplete="current-password"]';
await page.waitForSelector(passwordInput, { timeout: 0 });
await page.focus(passwordInput);
await page.keyboard.type(data[1]);
await page.click(
".site-unite-submit-button.loginSubmit.site-unite-component"
);
}
await Promise.all(promises)
})();
but what I have when I consol log my data :
[ 'email#gmail.com:tesssst' ]
[ 'email2#gmail.com:tesssst' ]
I would like to know and understand how can I use each email:password that i have on my txt file to open tabs and use my data to login to the website
Thank you

Categories

Resources