Trying to scrap amazon products variation in a single page.
PS C:\xampp\htdocs\js> node varyant
5
SONY PlayStation 5 HD Kamera | Fiyat = 1.172,48TL
SONY PlayStation 5 Dualsense Sarj İstasyonu | Fiyat = 729,00TL
SONY PlayStation 5 HD Kamera | Fiyat = 1.172,48TL
SONY PlayStation 5 Konsol | Fiyat = Ürün Yok!
SONY PlayStation 5 PULSE 3D Kablosuz Kulaklık | Fiyat = 1.997,00TL
Thats the result
const puppeteer = require("puppeteer");
async function varyant() {
const browser = await puppeteer.launch({
headless: false,
slowMo: 20,
defaultViewport: null,
});
const page = await browser.newPage();
await page.goto(
"https://www.amazon.com.tr/Sony-SONY-PlayStation-5-Konsol/dp/B08MLPLQHV",
{
waitUntil: "networkidle2",
timeout: 0,
}
);
await page.waitForSelector('[id="sp-cc-rejectall-link"]');
await page.click('[id="sp-cc-rejectall-link"]');
if (await page.$('[id="twisterContainer"]')) {
////varyant varsa
await page.content();
const liste = await page.$$(
'[id="twisterContainer"] > div > form > div >ul>li'
);
console.log(liste.length);
for await (const list of liste) {
await list.click();
await page.waitForSelector(".swatchSelect");
await page.waitForTimeout(3000);
const urun = await page.$eval(
"#productTitle",
(elHandles) => elHandles.outerText
);
const element2 = await page.$(
'[id="corePrice_feature_div"] > div > span > span'
);
if (element2 !== null) {
var varyantfiyat = await element2.evaluate((el) => el.textContent);
} else {
var varyantfiyat = "Ürün Yok!";
//continue;
}
console.log(`${urun} | Fiyat = ${varyantfiyat}`);
}
} else {
const element3 = await page.$(
'[id="corePrice_feature_div"] > div > span > span'
);
var varyantfiyat1 = await element3.evaluate((el) => el.textContent);
console.log(varyantfiyat1);
}
await page.waitForTimeout(3000);
await browser.close();
}
5 different products in webpage, but as seen in the result click event not happens correctly and same product comes.Where is my mistake.This script sometimes works perfect sometimes duplicate results.
Related
I need to go to the UPS site, type in a tracking number, then get the data about that tracking number. I turned headless to false then the timeouts disappeared, turn it to true, and it will timeout or wait forever. I tried a number of different methods (any I could find), to mess around with the promises, but nothing helped.
const puppeteer = require('puppeteer');
var fs = require("fs").promises;
var Page = {};
var Browser = {};
async function printPage(){
console.log("printPage()");
await fs.writeFile("pagecontent_afterClick.txt", await Page.content());
const count = await Page.$$eval('#st_App_PkgStsMonthNum', divs => divs.length);
console.log(count);
const deliveredOn = await Page.$eval('#st_App_PkgStsMonthNum', el => el.textContent);
const deliveredToAddress = await Page.$eval('#stApp_txtAddress', el => el.textContent);
const deliveredToCountry = await Page.$eval('#stApp_txtCountry', el => el.textContent);
const deliveredBy = await Page.$eval('#stApp_valReceivedBy', el => el.textContent);
console.log("browser close");
await Browser.close();
}
async function start(){
console.log("start");
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
Page = page
Browser = browser
const navigationPromise = page.waitForNavigation({waitUntil: 'load'})
await navigationPromise;
await fs.writeFile("pagecontent_B4.txt", await page.content());
await page.type("#stApp_trackingNumber", "1Z0F1R740320306827");
const searchValue = await page.$eval('#stApp_trackingNumber', el => el.value);
console.log(searchValue);
const count = await page.$$eval('button#stApp_btnTrack.ups-cta.ups-cta_primary', divs => divs.length);
console.log(count);
try {
console.log("end waiting");
await Promise.all([
page.click('#stApp_btnTrack'),
page.waitForNavigation({waitUntil: 'networkidle2'})
]);
printPage();
} catch (e) {
if (e instanceof puppeteer.errors.TimeoutError) {
console.log("timeout");
printPage();
}
}
}
start();
I was trying to scrape a thumbnail image from youtube with its XPath but I am getting undefined for the src. I can't figure out what is causing this? I already tried using both the XPath and full XPath but that didn't help. Any help is appreciated. Thanks in advance.
const puppeteer = require('puppeteer');
async function scrapeChannel1(url) {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(url, {
timeout: 0
});
const [el2] = await page.$x('//*[#id="dismissible"]/ytd-thumbnail');
const src1 = await el2.getProperty('src');
const thumbnailURL1 = await src1.jsonValue();
browser.close();
console.log({
thumbnailURL1
})
return {
thumbnailURL1
}
}
scrapeChannel1('https://www.youtube.com/')
The <img> you are looking for is placed a bit deeper in the DOM at: '//*[#id="dismissible"]/ytd-thumbnail/a/yt-img-shadow/img' (so you should add: /a/yt-img-shadow/img at the end of your XPath expression).
Note, you have more powerful tools in puppeteer than .getProperty('src') to retrieve DOM element properties.
E.g. page.$eval:
const selector = 'ytd-thumbnail > a > yt-img-shadow > #img'
const imageSrc = await page.$eval(selector, el => el.src)
// returns: https://i.ytimg.com/vi/{youtube_id}/hqdefault.jpg...
Or if you want all images use page.$$eval:
const imageSrcs = await page.$$eval(selector, elems => elems.map(el => el.src))
If you want to get images src from YouTube, you need to scroll video thumbnails into view like in the code below (also check it on the online IDE):
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
const mainPageUrl = "https://www.youtube.com";
async function scrollPage(page, scrollElements) {
let currentElement = 0;
while (true) {
let elementsLength = await page.evaluate((scrollElements) => {
return document.querySelectorAll(scrollElements).length;
}, scrollElements);
for (; currentElement < elementsLength; currentElement++) {
await page.waitForTimeout(200);
await page.evaluate(
(currentElement, scrollElements) => {
document.querySelectorAll(scrollElements)[currentElement].scrollIntoView();
},
currentElement,
scrollElements
);
}
await page.waitForTimeout(5000);
let newElementsLength = await page.evaluate((scrollElements) => {
return document.querySelectorAll(scrollElements).length;
}, scrollElements);
if (newElementsLength === elementsLength || currentElement > 100) break; // if you want to get all elements (or some other number of elements) change number to 'Infinity' (or some other number)
}
}
async function getThumbnails() {
const browser = await puppeteer.launch({
headless: false,
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const page = await browser.newPage();
await page.setDefaultNavigationTimeout(60000);
await page.goto(mainPageUrl);
await page.waitForSelector("#contents");
const scrollElements = "a#thumbnail";
await scrollPage(page, scrollElements);
await page.waitForTimeout(10000);
const urls = await page.$$eval("a#thumbnail #img", (els) => els.map(el => el.getAttribute('src')).filter(el => el));
await browser.close();
return urls;
}
getThumbnails().then(console.log);
Output
[
"https://i.ytimg.com/vi/02oeySm1CJA/hq720.jpg?sqp=-oaymwEcCNAFEJQDSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBmrYMHESpY_f1oTNx00iuR3tNeCQ",
"https://i.ytimg.com/vi/RMo2haIPYBM/hq720_live.jpg?sqp=CNifxJcG-oaymwEcCNAFEJQDSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBw4ogzR0709SqbttRdEzfL-aTdgQ",
"https://i.ytimg.com/vi/qJFFp_ta1Zk/hqdefault.jpg?sqp=-oaymwEcCOADEI4CSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBJ-44OFgBUuVUYWBVh3Yi3hQgwIg",
"https://i.ytimg.com/vi/OZoTjoN-Sn0/hqdefault.jpg?sqp=-oaymwEcCOADEI4CSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLCOeGTCnlT4U0wV1SNclkmFUEHLaA",
"https://i.ytimg.com/vi/L8cH2gI67uk/hqdefault.jpg?sqp=-oaymwEcCOADEI4CSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLAuvZ3khIjpvAVTGjmR9FDxQrPIgQ",
"https://i.ytimg.com/vi/6rUyVKyJnGY/hq720.jpg?sqp=-oaymwEcCNAFEJQDSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLCifsTG4MlA3mf8CcJDkfKdWaZkaA",
"https://i.ytimg.com/vi/xpaURivPZFk/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLA5oFDDsVzbV3tUqyfogfuf3LPahQ",
"https://i.ytimg.com/vi/MsR76PyVdUs/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLAEBYGNvif-7LWx2mqW4G9o-OUhEQ",
"https://i.ytimg.com/vi/liasQRRVt5w/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLAUcMpyKY0GhmNAHHtP_cDkAp18DQ",
"https://i.ytimg.com/vi/Dr5IqlTLMDM/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLBOSUi6mgjdD5a-Jx8Ns24SlexB1g",
"https://i.ytimg.com/vi/E8kit8xJKdI/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLDDStn95G7ei5DTusGXE4RimzdLUw",
"https://i.ytimg.com/vi/SqEaahOmLHU/hq720_2.jpg?sqp=-oaymwEdCM0CENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLBDcWLCklNxEAuT1ZvSTKrIplGOag",
...and other results
]
You can read more about scraping YouTube search from my blog post Web scraping YouTube search video results with Nodejs.
So, I am trying to scrape a couple of searchengines with a couple of search phrases with Playwright.
Running the script with one query is working.
Working:
const { chromium } = require('playwright');
(async () => {
const browser = await chromium.launch({ headless: false, slowMo: 250 });
const context = await browser.newContext()
const page = await context.newPage();
const keyWord = ('Arsenal');
await page.goto('https://duckduckgo.com/');
await page.fill('//input[#name="q"]',keyWord);
await page.keyboard.press('Enter');
const getOne = (' (//h2[#class="result__title"])[9] ');
await page.waitForSelector(getOne)
const pushOne = await page.$(getOne);
const One = await pushOne.evaluate(element => element.innerText);
console.log(One);
await page.goto('https://yandex.com/');
await page.fill('//input[#aria-label="Request"]', keyWord);
await page.keyboard.press('Enter');
const getTwo = (' //li[#data-first-snippet] //div[#class="organic__url-text"] ');
await page.waitForSelector(getTwo)
const pushTwo = await page.$(getTwo);
const Two = await pushTwo.evaluate(element => element.innerText);
console.log(Two);
await browser.close()
})()
But when I use an array with phrases (keyWordlist) I fail to get the script running.
Have searched around for using Array with 'For' and 'Foreach' loops, but haven't been able to fix it.
I want to run the different keywords through the different searchengines and list the results.
For 3 keywords in two searchengines that would get 6 results.
const { chromium } = require('playwright');
(async () => {
const browser = await chromium.launch({ headless: false, slowMo: 250 });
const context = await browser.newContext()
const page = await context.newPage();
let kewWordlist = ['Arsenal', 'Liverpool', 'Ajax']
for (var i=0; i<=kewWordlist.length; i++) {
// for (const i in kewWordlist){
async () => {
const keyWord = kewWordlist[i];
await page.goto('https://duckduckgo.com/');
await page.fill('//input[#name="q"]',keyWord);
// await page.fill('//input[#name="q"]',[i]);
// await page.fill('//input[#name="q"]',`${keyWord}`);
await page.keyboard.press('Enter');
const getOne = (' (//h2[#class="result__title"])[9] ');
await page.waitForSelector(getOne)
const pushOne = await page.$(getOne);
const One = await pushOne.evaluate(element => element.innerText);
console.log(One);
// await page.goto('https://yandex.com/');
// await page.fill('//input[#aria-label="Request"]', keyWord);
// await page.keyboard.press('Enter');
// const getTwo = (' //li[#data-first-snippet] //div[#class="organic__url-text"] ');
// await page.waitForSelector(getTwo)
// const pushTwo = await page.$(getTwo);
// const Two = await pushTwo.evaluate(element => element.innerText);
// console.log(Two);
}}
await browser.close()
})()
If anyone has some pointers on how to solve this, much obliged.
maybe the result selectors needs some tweaking but I think this is what you were looking for:
test.only('search search engines', async({page, context}) => {
const search = [
{
name: 'yandex',
url: 'https://yandex.com/',
elementFill: '//input[#aria-label="Request"]',
elementResult: '//li[#data-first-snippet] //div[#class="organic__url-text"]'
},
{
name: 'google',
url: 'https://www.google.nl',
elementFill: '//input[#name="q"]',
elementResult: '(//h2[#class="result__title"])[9]'
},
{
name: '',
url: 'https://duckduckgo.com/',
elementFill: '//input[#name="q"]',
elementResult: '(//h2[#class="result__title"])[9]'
}
]
const kewWordlist = ['Arsenal', 'Liverpool', 'Ajax']
for (let i = 0; i < search.length; i++) {
const searchName = search[i].name
const searchResult = search[i].elementResult
const searchFill = search[i].elementFill
const searchPage = await context.newPage()
await searchPage.waitForLoadState()
await searchPage.goto(`${search[i].url}`)
for (let i = 0; i < kewWordlist.length; i++) {
await searchPage.fill(searchFill,kewWordlist[i])
await searchPage.keyboard.press('Enter')
await searchPage.waitForSelector(searchResult)
const result = await page.$(searchResult)
console.log(`${searchName}: ${result} `)
}
}
})
The reason your loop isn't working is that you have an async function inside of it that you never call. There are a few ways you could go about this:
You could take your first version, have it accept a word to search, and run that over each element of the array:
const searchOneKeyword = async (keyWord) => {
const browser = await chromium.launch({ headless: false, slowMo: 250 });
const context = await browser.newContext()
const page = await context.newPage();
// rest of code
}
const kewWordList = ['Arsenal', 'Liverpool', 'Ajax']
keyWordList.forEach((k) => {
searchOneKeyword(k)
})
Or if you'd like to keep the same browser instance, you can do it in a loop in the function:
const search = async (words) => {
const browser = await chromium.launch({ headless: false, slowMo: 250 });
const context = await browser.newContext()
const page = await context.newPage();
for (const keyWord of words) {
await page.goto('https://duckduckgo.com/');
await page.fill('//input[#name="q"]',keyWord);
await page.keyboard.press('Enter');
const getOne = (' (//h2[#class="result__title"])[9] ');
await page.waitForSelector(getOne)
const pushOne = await page.$(getOne);
const One = await pushOne.evaluate(element => element.innerText);
console.log(One);
// etc.
}
await browser.close()
}
search(keyWordList)
In both of those cases, you're logging, but never returning anything, so if you need that data in another function afterwards, you'd have to change that. Example:
const search = async (words) => {
const browser = await chromium.launch({ headless: false, slowMo: 250 });
const context = await browser.newContext()
const page = await context.newPage();
const results = await Promise.all(words.map((keyWord) => {
await page.goto('https://duckduckgo.com/');
await page.fill('//input[#name="q"]',keyWord);
await page.keyboard.press('Enter');
// etc.
return [ One, Two ]
}))
await browser.close()
return results
}
search(keyWordList).then((results) => { console.log(results.flat()) })
I have spent a couple of hours trying to get the script working based on your suggestions. No result unfortunately. I get errors like 'await is only valid in async function' and 'Unreachable code detected'. Searched for other examples, for some inspiration, but none found. If you or someone else has a suggestion, please share! This is code I have now:
const { chromium } = require('playwright');
let keyWordList = ['Arsenal', 'Liverpool', 'Ajax']
const search = async function words() {
const browser = await chromium.launch({ headless: false, slowMo: 250 });
const context = await browser.newContext()
const page = await context.newPage();
}
const results = await Promise.all(words.map(keyWord))
//DUCKDUCKGO
await page.goto('https://duckduckgo.com/');
await page.fill('//input[#name="q"]',keyWord);
await page.keyboard.press('Enter');
const getOne = (' (//h2[#class="result__title"])[9] ');
await page.waitForSelector(getOne)
const pushOne = await page.$(getOne);
const One = await pushOne.evaluate(element => element.innerText);
//YANDEX
await page.goto('https://yandex.com/');
await page.fill('//input[#aria-label="Request"]', keyWord);
await page.keyboard.press('Enter');
const getTwo = (' //li[#data-first-snippet] //div[#class="organic__url-text"] ');
await page.waitForSelector(getTwo)
const pushTwo = await page.$(getTwo);
const Two = await pushTwo.evaluate(element => element.innerText);
console.log(Two);
return [ One , Two ]
return results
search(keyWordList).then((results) => { console.log(results.flat())
await browser.close();
})
I am having issues trying to get this to work.
I am needing to select the calendar heading "May 2020", but am not having any luck, could I get someone to look at what I am doing wrong?
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('https://www.stayz.com.au/holiday-rental/p9177051?noDates=true',{waitUntil: 'domcontentloaded'});
const headingTxt = await page.evaluate(() =>
document.querySelector('#rates-availability > div > div > section > div > div.inline-calendar > div > div.cal-controls__calendar-parent--middle-multi > div > div:nth-child(2) > h4 > span').innerText);
console.log('');
console.log('========[ output ]======== ', headingTxt);
console.log('');
await page.close();
await browser.close();
})();
This selector needs scrolling and wide viewport to appear:
'use strict';
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch({ headless: false, args: ['--start-maximized'] }); //
const page = await browser.newPage();
await page.setViewport({ width: 1280, height: 850 });
await page.goto('https://www.stayz.com.au/holiday-rental/p9177051?noDates=true', {waitUntil: 'domcontentloaded'});
const selector = '#rates-availability > div > div > section > div > div.inline-calendar > div > div.cal-controls__calendar-parent--middle-multi > div > div:nth-child(2) > h4 > span';
while ((await page.$(selector)) === null) {
await page.evaluate(() => { window.scrollBy(0, window.innerHeight); });
await page.waitFor(1000);
}
const headingTxt = await page.evaluate(
selector => document.querySelector(selector).innerText,
selector
);
console.log('');
console.log('========[ output ]======== ', headingTxt);
console.log('');
await page.close();
await browser.close();
})();
Hi I'm working on a script to learn node JS, and I'm stuck at this point :
I would like to read a text file with my email: password, and with that I would like to open as many tabs as many email:password I have and use each email: password to connect to the website
const puppeteer = require("puppeteer");
const lineReader = require("line-reader");
(async () => {
let data = [];
const promises = []
let int = 0;
lineReader.eachLine("c.txt", function(line) {
int++;
data = line.split(":")
console.log(data);
});
console.log(int);
for (let i = 0; i < 2; i++) {
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
const navigationPromise = page.waitForNavigation();
await page.goto("https:www.site.com/en/launch/");
await page.setViewport({ width: 1920, height: 1080 });
await page.waitForSelector(
".d-sm-h > .bg-white > .right-nav > .member-nav-item > .join-log-in"
);
await page.click(
".d-sm-h > .bg-white > .right-nav > .member-nav-item > .join-log-in"
);
const emailInput = '[placeholder="Adresse e-mail"][autocomplete="email"]';
await page.waitForSelector(emailInput, { timeout: 0 });
await page.focus(emailInput);
await page.keyboard.type(data[0]);
const passwordInput =
'[placeholder="Mot de passe"][autocomplete="current-password"]';
await page.waitForSelector(passwordInput, { timeout: 0 });
await page.focus(passwordInput);
await page.keyboard.type(data[1]);
await page.click(
".site-unite-submit-button.loginSubmit.site-unite-component"
);
}
await Promise.all(promises)
})();
but what I have when I consol log my data :
[ 'email#gmail.com:tesssst' ]
[ 'email2#gmail.com:tesssst' ]
I would like to know and understand how can I use each email:password that i have on my txt file to open tabs and use my data to login to the website
Thank you