Iterate over elements in Nodejs - javascript

I have a webpage, where I want to hover over all anchor tags and get the styles computed for that tag. This function which I wrote doesn't seem to work as it gives me original style of the anchor and not the hover styles.
Please help.
// Runs inside the browser page (via page.evaluate) and collects the computed
// background colour of every <a> element; the resulting array becomes `data`.
let data = await page.evaluate(() => {
let elements = document.getElementsByTagName('a');
// NOTE: `properties` is assigned without let/const/var, so it is created as an
// implicit global in the page context (and would throw in strict mode).
properties = []
for (var element of elements){
// focus() gives the element keyboard focus; it does not move the mouse pointer,
// so the style read on the next line is not the element's :hover style.
element.focus();
// The value read here is a string, so the JSON.stringify/JSON.parse round trip
// yields the same string back.
properties.push(JSON.parse(JSON.stringify(window.getComputedStyle(element, null)["backgroundColor"])));
}
return properties;
});

https://developer.mozilla.org/en-US/docs/Web/API/Window/getComputedStyle
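Try window.getComputedStyle(element, ':hover') (getComputedStyle is a method of window, not document). Be aware that the second argument is documented for pseudo-elements such as '::before', so it may not reflect the :hover pseudo-class.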

First of all, you should convert results from document.getElementsByTagName to normal array
const elements = [...document.getElementsByTagName('textarea')];
Next to get element property use this syntax:
window.getComputedStyle(element).getPropertyValue("background-color")
Finally, this is a fully working example:
const puppeteer = require('puppeteer');

/**
 * Opens the css-tricks :focus demo page, focuses every <textarea> in turn and
 * logs the computed background colour each one has while focused.
 *
 * The browser is closed in a `finally` block so that a failed navigation or
 * evaluation does not leave a browser process running, and any error is
 * reported instead of becoming an unhandled promise rejection.
 */
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://css-tricks.com/almanac/selectors/f/focus/');
    const data = await page.evaluate(() => {
      const elements = document.getElementsByTagName('textarea');
      // Spread the live HTMLCollection into a real array so .map() is available.
      return [...elements].map((element) => {
        element.focus();
        return window.getComputedStyle(element).getPropertyValue("background-color");
      });
    });
    console.log(data);
  } finally {
    await browser.close();
  }
})().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});

You can use page.$$() to obtain an ElementHandle array of textarea elements.
Then, you can use the elementHandle.hover() to hover over each element and then page.evaluate() to obtain the computed background color to push to your data array:
// Hover each <textarea> through its ElementHandle, then read the computed
// background colour of that same element from inside the page.
const elements = await page.$$( 'textarea' );
const data = [];
for ( const handle of elements )
{
  await handle.hover();
  const backgroundColor = await page.evaluate(
    node => window.getComputedStyle( node ).backgroundColor,
    handle
  );
  data.push( backgroundColor );
}
console.log( data );

Related

TypeError: Cannot read property 'getProperty' of undefined? Node.js, Puppeteer

I'm trying to get the text of the 'name' element for my scraper. When I grab it with the full XPath, I get the error 'TypeError: Cannot read property 'getProperty' of undefined'. I also tried using the regular (relative) XPath, but that returned name: 'skip navigation'. Why is the element I call getProperty on coming back as undefined? It only happens when trying to get the channel title; it works when getting the profile image.
scraper.js
const puppeteer = require('puppeteer');

/**
 * Evaluates an XPath on the page and returns one property of the first match.
 *
 * @param {object} page - Puppeteer page to query.
 * @param {string} xpath - XPath expression passed to page.$x().
 * @param {string} property - Name of the DOM property to read.
 * @returns {Promise<*>} The JSON value of the requested property.
 * @throws {Error} If the XPath matches no element.
 */
async function readXPathProperty(page, xpath, property) {
  // page.$x() resolves to an array; destructuring an empty array yields
  // undefined, which is what produced "Cannot read property 'getProperty' of
  // undefined". Fail with a message that names the XPath instead.
  const [el] = await page.$x(xpath);
  if (!el) {
    throw new Error(`No element matched XPath: ${xpath}`);
  }
  const handle = await el.getProperty(property);
  return handle.jsonValue();
}

/**
 * Scrapes the channel title text and avatar image URL from a channel page.
 *
 * @param {string} url - URL of the channel page to open.
 * @returns {Promise<{name: *, avatarURL: *}>} The title element's textContent
 *   and the `src` of the element with id "img".
 * @throws {Error} If either XPath matches no element.
 */
async function scrapeChannel(url) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url);

    // If the element may be rendered late, `await page.waitForXPath(xpath)`
    // before querying it.
    const name = await readXPathProperty(
      page,
      '/html/body/ytd-app/div/ytd-page-manager/ytd-browse[2]/div[3]/ytd-c4-tabbed-header-renderer/tp-yt-app-header-layout/div/tp-yt-app-header/div[2]/div[2]/div/div[1]/div/div[1]/ytd-channel-name/div/div/yt-formatted-string',
      'textContent'
    );
    // XPath attribute tests use `@id`; `#id` is not valid XPath syntax.
    const avatarURL = await readXPathProperty(page, '//*[@id="img"]', 'src');

    console.log({ name, avatarURL });
    return { name, avatarURL };
  } finally {
    // Always release the browser, even when scraping fails.
    await browser.close();
  }
}

scrapeChannel('https://www.youtube.com/channel/UC8butISFwT-Wl7EV0hUK0BQ').catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
index.html
/**
 * Creates a DOM element and applies the given attributes to it.
 *
 * @param {string} type - Tag name passed to document.createElement().
 * @param {Object} [attrs={}] - Own enumerable entries to apply. The key
 *   "innerText" is assigned as a property; every other key is set with
 *   setAttribute().
 * @returns {*} The newly created element.
 */
function newEl(type, attrs = {}) {
  const el = document.createElement(type);
  // Object.entries only visits own properties, so keys inherited through the
  // prototype chain are not turned into attributes. `|| {}` keeps a null
  // argument harmless.
  for (const [attr, value] of Object.entries(attrs || {})) {
    if (attr === "innerText") {
      el.innerText = value;
    } else {
      el.setAttribute(attr, value);
    }
  }
  return el;
}
It seems you may have a typo in XPath. When I try your XPath in the browser console, it returns no elements. However, with this one change, it returns an element:
$x('/html/body/ytd-app/div/ytd-page-manager/ytd-browse[1]/div[3]/ytd-c4-tabbed-header-renderer/tp-yt-app-header-layout/div/tp-yt-app-header/div[2]/div[2]/div/div[1]/div/div[1]/ytd-channel-name/div/div/yt-formatted-string')
.......................................................^: 1 instead of 2

How to find document.activeElement in Puppeteer

I want to autofill a form with Puppeteer.
I fill out the first input, then click on a button that creates a new input field, which receives focus.
How can I select this input? Can I use document.activeElement, and if so, how?
// Clicks the "new activity" button once it exists, then tries to type into the
// input that is focused afterwards.
let newActivity = 'button.new_activity'
await page.waitForSelector(newActivity)
await page.click(newActivity)
// find active/focused input
// NOTE: `focusedInput` is not defined anywhere in this snippet; it stands for
// the selector of the newly focused input that the question asks how to obtain.
await page.type(focusedInput, 'message')
You can use evaluateHandle to get the element handle, and then call the type function on that element.
// Get a handle to whichever element currently has focus in the page and type
// straight into it.
const focusedHandle = await page.evaluateHandle(() => document.activeElement);
await focusedHandle.type('message');
/**
 * Searches a tree depth-first, in pre-order, for the first node whose
 * `focused` property is truthy.
 *
 * @param {object} node - Root of the tree; children are read from `children`.
 * @returns {object|undefined} The first focused node, or undefined if none.
 */
function findFocusedNode(node) {
  // Explicit stack; children are pushed in reverse so they are visited in
  // their original left-to-right order.
  const pending = [node];
  while (pending.length > 0) {
    const current = pending.pop();
    if (current.focused) {
      return current;
    }
    const kids = [...(current.children || [])];
    for (let idx = kids.length - 1; idx >= 0; idx -= 1) {
      pending.push(kids[idx]);
    }
  }
  return undefined;
}
// Take an accessibility snapshot of the page and look up the focused node in it.
const focusedNode = findFocusedNode(await page.accessibility.snapshot());
console.log('focusedNode', focusedNode);
https://github.com/puppeteer/puppeteer/blob/master/docs/api.md#class-accessibility

Waiting for an iframe to be opened and scraped is too slow to scrape js

I'm trying to scrape an old website built with tr, br and iframe elements. Everything was going well until I needed to extract data from an iframe (see the iFrameScraping setTimeout below), but the clicking happens too fast for me to be able to get the data. Would anyone have an idea of how to click, wait for the content to show and be scraped, then continue?
// Runs inside the page: for the first `resultLength` elements with class
// 'class', splits each element's innerHTML on '<br>' and extracts company
// fields, then clicks the element's first link and tries to read the address
// from an iframe. The callback has no return statement, so nothing is handed
// back to `newResult`.
const newResult = await page.evaluate(async(resultLength) => {
const elements = document.getElementsByClassName('class');
// NOTE: `i` is never declared, so it becomes an implicit global in the page.
for(i = 0; i < resultLength; i++) {
const companyArray = elements[i].innerHTML.split('<br>');
// Assigned later, inside the setTimeout callback.
let companyStreet,
companyPostalCode;
// Get company name
// NOTE: the `await`s below are applied to plain strings/arrays, not promises,
// so they do not wait for anything.
const memberNumber = elements[i].getElementsByTagName('a')[0].getAttribute('href').match(/[0-9]{1,5}/)[0];
const companyName = await companyArray[0].replace(/<a[^>]*><span[^>]*><\/span>/, '').replace(/<\/a>/, '');
const companyNumber = await companyArray[0].match(/[0-9]{6,8}/) ? companyArray[0].match(/[0-9]{6,8}/)[0] : '';
// Get town name
const companyTown = await companyArray[1].replace('"', '');
// Get region name
const companyRegion = await companyArray[2].replace(/<span[^>]*>Some text:<\/span>/, '');
// Get phone number
const telNumber = await elements[i].innerHTML.substring(elements[i].innerHTML.lastIndexOf('</span>')).replace('</span>', '').replace('<br>', '');
// NOTE: setTimeout returns a timer id immediately, not a promise, so this
// `await` does not pause for the 2000 ms delay. The loop moves on and the
// callback (click + iframe read) runs later.
const iFrameScraping = await setTimeout(async({elements, i}) => {
elements[i].getElementsByTagName('a')[0].click();
const iFrameContent = await document.getElementById('some-id').contentWindow.document.getElementById('lblAdresse').innerHTML.split('<br>');
companyStreet = iFrameContent[0].replace('"', '');
companyPostalCode = iFrameContent[2].replace('"', '');
}, 2000, {elements, i});
// Runs before the timeout callback has assigned the two variables, so both
// are still undefined here.
console.log(companyStreet, companyPostalCode)
};
}, pageSearchResults.length);
I fixed my issues after a while, so I'll share my solution.
I had to stop collecting all the data in a loop inside evaluate, because it runs too fast and creates a race condition. Instead I used a combination of page.$$ and a for…of loop. Note that forEach loops cause a race condition as well, since execution does not wait for their async callbacks to finish before continuing.
Here is the example from my updated code:
const companies = await page.$$('.repmbr_result_item');
const companiesLinks = await page.$$('.repmbr_result_item a');
for(company of companies) {
const companyEl = await page.evaluate(el => el.innerHTML, company)
const companyElArray = companyEl.split('<br>');

how to select innerHTML from an elementHandle in puppeteer

Using the node puppeteer module, how do I continue with this code to get the innerContent here?
// Looks up every element matching `selector`, then, for each resulting handle,
// looks up its first <tr> descendant.
// NOTE: the .then() callback has no return statement and the array produced by
// .map() is discarded, so `els` is a promise that resolves to undefined.
const els = Promise.all(await page.$$(selector)).then(results => {
results.map(async el => {
// el.$('tr') resolves to a handle for the first matching <tr>.
const tr = await el.$('tr')
//How do I convert this element handle to get its innerText content?
})
})
Like this
// getProperty() and jsonValue() both return promises, so each must be awaited:
// the first resolves to a handle for the property, the second to its plain value.
const innerTextHandle = await tr.getProperty('innerText');
const textValue = await innerTextHandle.jsonValue();

await in nested for ... of loop

async traverse(url) {
const ts = new TournamentScraper()
const ms = new MatchScraper()
const results = []
const tournaments = await ts.run(url)
for(let href of tournaments.map(t => t.href)){
let matches = await ms.run(href)
let pages = ms.getPages()
let seasons = ms.getSeasons()
//console.log(pages)
//console.log(seasons)
results.push(matches)
for(let href of pages) {
//console.log(href)
matches = await ms.run(href)
//console.log(matches)
results.push(matches)
}
}
return results
}
TournamentScraper returns an array of objects, which typically looks like this:
{name: 'Foo', href: 'www.example.org/tournaments/foo/'}
The link points to the tournament's last season's first page. This page contains the links to the other seasons and a paginator (if any).
MatchScraper's run returns some data, and sets the instance's dom property. getPages() and getSeasons() consumes this property and each returns an array of links.
The problem is that results contains only the first batch of matches. I can see the second page's matches in the console log, but they are not in the results array when traverse returns.
I found this rule, which warns against using await inside a for loop. The problem is that I have to wait for ms.run(href), because it sets dom, and getPages() and getSeasons() need it to be set in order to extract the needed links.
I think this should work. It uses Promise.all rather than for loops:
/**
 * Scrapes a single URL with a MatchScraper.
 *
 * @param {string} href - URL to scrape.
 * @param {MatchScraper} [scraper] - Scraper to use; a fresh one is created when
 *   omitted so that concurrent calls never share a scraper's `dom` state.
 * @returns {Promise<*>} Whatever MatchScraper#run resolves to.
 */
const run = (href, scraper = new MatchScraper()) => scraper.run(href);

/**
 * Collects the matches of a tournament's first page and of all its paginated
 * pages.
 *
 * @param {string} href - URL of the tournament's first page.
 * @returns {Promise<Array>} First-page matches followed by one entry per page.
 */
async function getMatches(href) {
  // A dedicated scraper per tournament: getPages() reads state that run() set
  // on this instance, so it must not be shared with other in-flight requests.
  const scraper = new MatchScraper();
  const matches = await run(href, scraper);
  const pages = scraper.getPages();
  // The page requests are independent of each other, so run them in parallel,
  // each on its own scraper.
  const pageResults = await Promise.all(pages.map((pageHref) => run(pageHref)));
  return [matches, ...pageResults];
}

/**
 * Scrapes every tournament found at `url` and returns all match batches.
 *
 * @param {string} url - Start URL handed to TournamentScraper#run.
 * @returns {Promise<Array>} Match batches of all tournaments, in tournament order.
 */
async function traverse(url) {
  const ts = new TournamentScraper();
  const tournaments = await ts.run(url);
  const perTournament = await Promise.all(tournaments.map((t) => getMatches(t.href)));
  // Flatten one level: [[batch, batch], [batch]] -> [batch, batch, batch].
  return perTournament.flat();
}

Categories

Resources