Scraping dynamic pages with node.js and headless browser - javascript

I'm trying to scrape data from a page that loads its content dynamically. For this I'm using the headless browser Puppeteer.
Puppeteer can be seen as the headlessBrowserClient in the code.
The main challenge is to gracefully close the browser as soon as the needed data has been received. But if you close it before evaluateCustomCode has finished executing, the progress of evaluateCustomCode is lost.
evaluateCustomCode is a function that can be called as if we run it in the Chrome Dev tools.
To have control over the network requests and async flow of puppeteer API - I use async generator that encapsulates all the logic described above.
The problem is that I feel that the code smells, but I can't see any better solution.
Ideas ?
/**
 * Factory that builds a page-content client on top of the injected
 * headlessBrowserClient (only its launch() method is used here).
 * Returns an object exposing getPageContent.
 */
module.exports = function buildClient (headlessBrowserClient) {
// Loads pageUrl, runs evaluateCustomCode through page.evaluate and returns the result.
// Returns undefined when the generator's first step produces no page.
const getPageContent = async (pageUrl, evaluateCustomCode) => {
const request = sendRequest(pageUrl)
// First step: sendRequest launches the browser, navigates and yields the page.
const { value: page } = await request.next()
if (page) {
const pageContent = await page.evaluate(evaluateCustomCode)
// Resume the generator so it reaches its finally block and calls browser.close().
// NOTE(review): the returned promise is not awaited, and this line is never
// reached if page.evaluate rejects, so the browser may stay open — confirm.
request.next()
return pageContent
}
}
// Async generator: yields the loaded page, then (once resumed) closes the browser.
async function * sendRequest (url) {
const browser = await headlessBrowserClient.launch()
const page = await browser.newPage()
// Request context handed to APIError when navigation fails.
const state = {
req: { url },
}
try {
await page.goto(url)
yield page
} catch (error) {
// NOTE(review): APIError is not defined in this snippet — presumably imported elsewhere.
throw new APIError(error, state)
} finally {
// Yielding inside finally suspends the generator once more after browser.close() is called.
// NOTE(review): when page.goto fails, this yield presumably runs before the APIError
// propagates, so the first next() would not reject — verify.
yield browser.close()
}
}
return {
getPageContent,
}
}

You can use waitForFunction or waitFor and evaluate with Promise.all. No matter how dynamic the website is, you are waiting for something to be true at end and close the browser when that happens.
Since I do not have access to your dynamic url, I am going to use some random variables and delays as example. It will resolve once the variable returns truthy.
// Resolves once the predicate, evaluated inside the page, returns a truthy value.
await page.waitForFunction(() => !!someVariableThatShouldBeTrue);
What if your dynamic page instead creates an element matching some selector after you evaluate the code? In that case,
await page.waitFor('someSelector')
Now back to your customCode, let me rename that for you a bit,
await page.evaluate(customCode)
Where customCode is something that will set a variable someVariableThatShouldBeTrue to true somewhere. Honestly it can be anything, a request, a string or anything. The possibilities are endless.
You can return a promise from the function passed to page.evaluate; recent Chromium versions support this very well. So the following will work too: it resolves once the function/data has loaded. Make sure customCode is an async function or returns a promise.
const pageContent = await page.evaluate(CustomCode);
Alright, now we have all required pieces. I modified the code a bit so it doesn't smell to me :D ,
module.exports = function buildClient(headlessBrowserClient) {
return {
getPageContent: async (url, CustomCode) => {
const state = {
req: { url },
};
// so that we can call them on "finally" block
let browser, page;
try {
// launch browser
browser = await headlessBrowserClient.launch()
page = await browser.newPage()
await page.goto(url)
// evaluate and wait for something to happen
// first element returns the pageContent, but whole will resolve if both ends truthy
const [pageContent] = await Promise.all([
await page.evaluate(CustomCode),
await page.waitForFunction((() => !!someVariableThatShouldBeTrue))
])
// Or, You realize you can put a promise inside page.evaluate, recent chromium supports them very well
// const pageContent = await page.evaluate(CustomCode)
return pageContent;
} catch (error) {
throw new APIError(error, state)
} finally {
// NOTE: Maybe we can move them on a different function
await page.close()
await browser.close()
}
}
}
}
You can change and tweak it more as you wish. I did not test the final code (since I don't have APIError, evaluateCustomCode etc) but it should work.
It doesn't have all those generators and stuff like that. Promises, That's how you can deal with dynamic pages :D .
PS: IMO, Such questions are more befitting for the code review.

Related

Function does not pass object to the constant

I'm new to JavaScript, so maybe it's a dumb mistake. I'm trying to pass the values of the object that I get in this web-scraping function to the constant, but I'm not succeeding. Every time I try to print the menu it prints "undefined".
`
const puppeteer = require("puppeteer");
/**
 * Launches a headless browser, opens the restaurant page and reads the day label
 * plus three table rows from the DOM.
 * Being an async function, it returns a Promise that resolves to
 * { breakfast, lunch, dinner } strings.
 */
async function getMenu() {
console.log("Opening the browser...");
const browser = await puppeteer.launch({
headless: true
});
const page = await browser.newPage();
await page.goto('https://pra.ufpr.br/ru/ru-centro-politecnico/', {waitUntil: 'domcontentloaded'});
console.log("Content loaded...");
// Read the day label and the table rows via page.evaluate
const fullMenu = await page.evaluate(() => {
return {
day: document.querySelector('#conteudo div:nth-child(3) p strong').innerText,
breakfastFood: document.querySelector('tbody tr:nth-child(2)').innerText,
lunchFood: document.querySelector('tbody tr:nth-child(4)').innerText,
dinnerFood: document.querySelector('tbody tr:nth-child(6)').innerText
};
});
await browser.close();
// Prefix each meal text with the day label and a heading.
return {
breakfast: fullMenu.day + "\nCafé da Manhã:\n" + fullMenu.breakfastFood,
lunch: fullMenu.day + "\nAlmoço:\n" + fullMenu.lunchFood,
dinner: fullMenu.day + "\nJantar:\n" + fullMenu.dinnerFood
};
};
// NOTE: getMenu is async and is not awaited here, so `menu` holds a Promise;
// a Promise has no `breakfast` property, hence the logged undefined.
const menu = getMenu();
console.log(menu.breakfast);
`
I've tried to pass these values ​​in several ways to a variable but I'm not succeeding. I also accept other methods of passing these strings, I'm doing it this way because it's the simplest I could think of.
Your getMenu() is an async function.
In your last bit of code, can you change it to,
// Run inside an immediately-invoked async function so that `await` is allowed.
(async function main() {
  const resolvedMenu = await getMenu();
  const breakfastText = resolvedMenu.breakfast;
  console.log(breakfastText);
}());
credit to this post.
I have no access to the package that you imported. You may try changing the last part of your code to:
// `await` is only valid inside an async function (or at the top level of an ES module).
// The question's script is CommonJS (it uses require), so the call is wrapped.
(async () => {
  const menu = await getMenu();
  // Guards against getMenu() resolving to nothing; the waiting itself is done by `await`.
  if (menu) {
    console.log(menu.breakfast);
  }
})().catch((error) => {
  // Surface launch/navigation failures instead of leaving an unhandled rejection.
  console.error(error);
});
Explanation
getMenu() and await getMenu() are different things in JS. getMenu() evaluates to a Promise object, not to the string / number / object that the function eventually returns. await getMenu() pauses the surrounding async function until that Promise settles (other code can run in the meantime) and then evaluates to the resolved value.
In the original code there is no await, so console.log(menu.breakfast) runs immediately, while menu is still a Promise object. A Promise has no breakfast property, so you get undefined.
Note that the if (menu) {...} statement does not do any waiting itself — the await does. The check only skips the console.log() when getMenu() resolves to a falsy value such as undefined, which is a useful guard when you log an async/await return value.

Javascript await reading text file

This is probably dead simple, but I can't quite figure it out. I simply want to read the contents of a text file into a variable. What I have is:
/**
 * Fetches `path` and resolves with the response body as text.
 * Errors are logged with console.error and not rethrown, so in that case the
 * returned Promise resolves to undefined.
 */
async function load(path) {
try {
const response = await fetch(path);
const text = await response.text();
return text;
} catch (err) {
console.error(err);
}
}
// NOTE: load is async and is not awaited here, so source_text holds the Promise
// itself, not the file contents.
var source_text = load(source_text_path);
console.log(source_text);
To my mind, this should work, but only the pending promise is returned and not the text, though I thought it was awaiting properly.
You need to wait for the load method.
var source_text = await load(source_text_path);
OR
load(source_text_path).then(e => console.log(e))
The function is indeed awaiting as it should, but since it's async, it will always return a Promise<thing> when you return thing.
Which means you should either await it elsewhere:
var source_text = await load(source_text_path);
or then it:
load(source_text_path).then((source_text) => {
console.log(source_text);
})

How can I click on all links matching a selector with Playwright?

I'm using Playwright to scrape some data. How do I click on all links on the page matching a selector?
const { firefox } = require('playwright');
// Launches a visible Firefox, opens Google, then tries to click every <a> element
// from inside a single page.$$eval call.
(async () => {
const browser = await firefox.launch({headless: false, slowMo: 50});
const page = await browser.newPage();
await page.goto('https://www.google.com');
// NOTE(review): the result of page.pause() is not awaited here.
page.pause(); // allow user to manually search for something
// NOTE(review): the callback is handed to page.$$eval; presumably it runs inside the
// browser page, where the outer `page` variable and the Node console are not
// available — confirm against the Playwright docs.
const wut = await page.$$eval('a', links => {
// The async callbacks passed to forEach are started but their promises are not awaited.
links.forEach(async (link) => {
link.click(); // maybe works?
console.log('whoopee'); // doesn't print anything
page.goBack(); // crashes
});
// NOTE(review): presumably DOM elements cannot be serialized back to Node, which
// would explain the undefined result — verify.
return links;
});
console.log(`wut? ${wut}`); // prints 'wut? undefined'
await browser.close();
})();
Some issues:
console.log inside the $$eval doesn't do anything.
page.goBack() and page.pause() inside the eval cause a crash.
The return value of $$eval is undefined (if I comment out page.goBack() so I get a return value at all). If I return links.length instead of links, it's correct (i.e. it's a positive integer). Huh?
I get similar results with:
const links = await page.locator('a');
await links.evaluateAll(...)
Clearly I don't know what I'm doing. What's the correct code to achieve something like this?
(X-Y problem alert: I don't actually care if I do this with $$eval, Playwright, or frankly even Javascript; all I really want to do is make this work in any language or tool).
// Opens every visible link in a new tab, logs that tab's title and URL, then closes it.
// NOTE(review): `launch` is a helper defined outside this snippet; it is expected to
// resolve to an object carrying a browser `context`.
const { context } = await launch({ slowMo: 250 });
const page = await context.newPage();
await page.goto('https://stackoverflow.com/questions/70702820/how-can-i-click-on-all-links-matching-a-selector-with-playwright');
const links = page.locator('a:visible');
const linksCount = await links.count();
for (let i = 0; i < linksCount; i++) {
  // Closing a tab may leave focus elsewhere, so refocus the source page before each click.
  await page.bringToFront();
  try {
    // Start listening for the new tab before clicking so the 'page' event is not missed.
    const [newPage] = await Promise.all([
      context.waitForEvent('page', { timeout: 5000 }),
      links.nth(i).click({ modifiers: ['Control', 'Shift'] }),
    ]);
    try {
      await newPage.waitForLoadState();
      console.log('Title:', await newPage.title());
      // Report the URL of the tab that was just opened, not of the source page.
      console.log('URL: ', newPage.url());
    } finally {
      // Always close the tab, even if loading or reading the title failed.
      await newPage.close();
    }
  } catch {
    // Best effort: some anchors are not real links and never open a page.
    continue;
  }
}
There's a number of ways you could do this, but I like this approach the most. Clicking a link, waiting for the page to load, and then going back to the previous page has a lot of problems with it - most importantly is that for many pages the links might change every time the page loads. Ctrl+shift+clicking opens in a new tab, which you can access using the Promise.all pattern and catching the 'page' event.
I only tried this on this page, so I'm sure there are tons of other problems that may arise. But for this page in particular, using 'a:visible' was necessary to prevent getting stuck on hidden links. The whole clicking operation is wrapped in a try/catch because some of the links aren't real links and don't open a new page.
Depending on your use case, it may be easiest just to grab all the hrefs from each link:
// Collect the href attribute of every visible link, one locator at a time, in order.
const visibleLinks = page.locator('a:visible');
const total = await visibleLinks.count();
const hrefs = [];
for (const index of Array(total).keys()) {
  const href = await visibleLinks.nth(index).getAttribute('href');
  hrefs.push(href);
}
console.log(hrefs);
Try this approach. I will use TypeScript.
// Wait up to 10 s for the selector to match, then click each matching element in turn.
await page.waitForSelector(selector, { timeout: 10000 });
const matches = await page.$$(selector);
for (let i = 0; i < matches.length; i += 1) {
  await matches[i].click({ timeout: 8000 });
  //your additional code
}
See more on https://youtu.be/54OwsiRa_eE?t=488

How to avoid an infinite loop in JavaScript

I have a Selenium webdriverIO V5 framework. The issue I am facing here is, the below code works fine on Mac OS, but it does not work correctly on the Windows OS. In the Windows OS it gets stuck with an infinite loop issue.
The below code functionality is: Merge yaml files (which contains locators) and return the value of the locator by passing the key:
const glob = require('glob');
const yamlMerge = require('yaml-merge');
const sleep = require('system-sleep');
let xpath;
// Page object that looks up a locator by key from the merged YAML locator files.
class Page {
// Returns the entry stored under elementId in the merged locator object.
getElements(elementId) {
// Starts the glob lookup, then blocks until the callback has filled in `xpath`.
function objectCollector() {
glob('tests/wdio/locators/*.yml', function (er, files) {
if (er) throw er;
// Writes to the module-level `xpath` variable.
xpath = yamlMerge.mergeFiles(files);
});
// Polls every 10 ms until the callback above has assigned `xpath`.
// NOTE(review): if that callback never runs or throws, `xpath` stays undefined
// and this loop never ends — presumably the hang described for Windows; verify.
do {
sleep(10);
} while (xpath === undefined);
return xpath;
}
objectCollector();
return xpath[elementId];
}
}
module.exports = new Page();
Since you are waiting on the results of a callback, I would recommend returning a new Promise from your getElements function and resolve() the value you receive inside the callback. Then when you call getElements, you will need to resolve that Promise or use the await notation. The function will stop at that point and wait until the Promise resolves, but the event loop will still continue. See some documentation for more information.
I'll write an example below of what your code might look like using a Promise, but when you call getElements, you will need to put the keyword await before it. If you want to avoid that, you could resolve the Promise from objectCollector while you're in getElements and remove the async keyword from its definition, but you really should not get in the way of asynchronous JavaScript. Also, you can probably shorten the code a bit because objectCollector looks like an unnecessary function in this example:
const glob = require('glob')
const yamlMerge = require('yaml-merge')
const sleep = require('system-sleep')
let xpath
// Page object that resolves locators by key from the merged YAML locator files.
class Page {
  /**
   * Merges every YAML file under tests/wdio/locators and returns the entry stored
   * under `elementId`.
   *
   * @param {string} elementId - key of the locator to look up.
   * @returns {Promise<*>} the merged value for that key, or undefined when the key is absent.
   * @throws rejects with the glob error, or with whatever yamlMerge.mergeFiles throws.
   */
  async getElements(elementId) {
    // glob is callback-based, so adapt it to a Promise that can be awaited.
    const locators = await new Promise((resolve, reject) => {
      glob('tests/wdio/locators/*.yml', (er, files) => {
        if (er) {
          // Return so the failed lookup is never passed on to mergeFiles.
          reject(er);
          return;
        }
        try {
          resolve(yamlMerge.mergeFiles(files));
        } catch (mergeError) {
          // A throw inside this callback would otherwise escape the promise.
          reject(mergeError);
        }
      });
    });
    return locators[elementId];
  }
}
module.exports = new Page();

Puppeteer Async Await Loop in NodeJS

I am trying to make a script that :
Grabs all urls from a sitemap
Takes a screenshot of it with puppeteer
I am currently trying to understand how to code asynchronously but I still have troubles with finding the right coding pattern for this problem.
Here is the code I currently have :
// const spider = require('./spider');
// NOTE: this shadows the built-in Promise with bluebird's implementation for the whole file.
const Promise = require('bluebird');
const puppeteer = require('puppeteer');
const SpiderConstructor = require('sitemapper');
// Fetches the sitemap URLs, then screenshots them through Promise.each.
async function crawl(url, timeout) {
const results = await spider(url, timeout);
await Promise.each(results, async (result, index) => {
await screen(result, index);
});
}
// Launches a browser for one URL, takes a screenshot and closes the browser.
async function screen(result, index) {
const browser = await puppeteer.launch();
console.log('doing', index);
const page = await browser.newPage();
await page.goto(result);
// NOTE(review): `await` binds only to the 'screenshots/' string literal, and
// page.title() is concatenated without being awaited; it presumably returns a
// Promise, which would put "[object Promise]" into every file name — confirm.
const path = await 'screenshots/' + index + page.title() + '.png';
await page.screenshot({path});
// NOTE(review): browser.close() is not awaited, and it is skipped entirely if any
// step above throws.
browser.close();
}
// Builds a sitemapper instance for `url` and returns the list of sites it fetches.
async function spider(url, timeout) {
// NOTE(review): `await` on a constructor call is presumably a no-op here — confirm.
const spider = await new SpiderConstructor({
url: url,
timeout: timeout
});
const data = await spider.fetch();
console.log(data.sites.length);
return data.sites;
};
// Entry point: crawl the sitemap and log any rejection.
crawl('https://www.google.com/sitemap.xml', 15000)
.catch(err => {
console.error(err);
});
I am having the following problems :
The length of the results array is not constant; it varies every time I launch the script. I guess this is because it is not fully resolved when I display it, but I thought the whole point of await was that we are guaranteed the promise is resolved by the next line.
The actual screenshotting part of the script doesn't work half the time, and I am pretty sure I have unresolved promises, but I have no idea of the right pattern for looping over an async function. Right now it seems to take one screenshot after the other (linear and incremental), but I get a lot of duplicates.
Any help is appreciated. Thank you for your time

Categories

Resources