puppeteer being redirected when browser is not - javascript

Attempting to test page https://publicindex.sccourts.org/anderson/publicindex/
When navigating with standard browser to the page, the navigation ends at the requested page (https://publicindex.sccourts.org/anderson/publicindex/) with the page displaying an "accept" button.
However, when testing with puppeteer in headless mode, the request is redirected to https://publicindex.sccourts.org.
I have a rough idea of what is occurring, but cannot seem to prevent the redirection to https://publicindex.sccourts.org when the page is requested using puppeteer.
Here is what I believe is occurring with the user-controlled browser:
request for page is sent. (assuming first visit)
the response is pure JS,
The js code specifies to:
copy the initial page request headers
add a specific header, and re-request the same page (xhr)
copies a url from one of the response headers and replaces the location
(or)
checks the page history,
adds the url from the response to page to history,
opens a new window,
writes the xhr response to the new page
closes the new window
adds an event listener for a function in the returned xhr request
fires the event
With puppeteer I have tried tracing the JS, recording a HAR, monitoring cookies, watching the request chain, intercepting page requests and adjusting headers, watching history, etc. I'm stumped.
Here is the most basic version of the puppeteer script:
/**
 * Opens the court index page in headless Chrome, records the network
 * traffic to `results.har`, and prints the HTML of whatever page the
 * browser ends up on.
 */
function run() {
  const targetUrl = 'https://publicindex.sccourts.org/anderson/publicindex/';
  // Dependencies are loaded lazily, inside the function body.
  const puppeteer = require('puppeteer');
  const PuppeteerHar = require('puppeteer-har');
  const desktopChromeUA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36';

  // Drives one browser session from page creation to dumping the HTML.
  const captureSession = async (browser) => {
    const tab = await browser.newPage();
    await tab.setJavaScriptEnabled(true);
    await tab.setViewport({ width: 1920, height: 1280 });
    await tab.setUserAgent(desktopChromeUA);

    const recorder = new PuppeteerHar(tab);
    await recorder.start({ path: 'results.har' });

    // Load the page, then block until the next navigation completes.
    await tab.goto(targetUrl);
    await tab.waitForNavigation();
    await recorder.stop();

    const markup = await tab.content();
    console.log(markup);
  };

  puppeteer.launch({ headless: true }).then(captureSession);
}
run();
why can I not get puppeteer to simply replicate the process that is being executed by js when I am navigating to the page in chrome, and end navigation on the "accept" page?
here is a version with more verbose logging:
// Verbose diagnostic variant: intercepts every request, logs frame and
// request lifecycle events, records a HAR, a trace and JS coverage, then
// prints the redirect chain of the main response and the final page HTML.
function run () {
let url = 'https://publicindex.sccourts.org/anderson/publicindex/';
const puppeteer = require('puppeteer');
const PuppeteerHar = require('puppeteer-har');
puppeteer.launch().then(async browser => {
const page = await browser.newPage();
await page.setJavaScriptEnabled(true);
await page.setViewport({width:1920,height:1280});
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36');
// With interception enabled, every request must be continued or aborted
// by the 'request' handler registered below.
await page.setRequestInterception(true);
page.on('frameattached', frame =>{ console.log('frame attached ');});
page.on('framedetached', frame =>{ console.log('frame detached ');});
page.on('framenavigated', frame =>{ console.log('frame navigated '); });
page.on('requestfailed', req =>{ console.log('request failed ');});
// NOTE(review): the message says 'frame finished' but the event is 'requestfinished'.
page.on('requestfinished', req =>{ console.log('frame finished '); console.log(req.url())});
// Number of intercepted requests seen so far (incremented on every path below).
let count = 0;
// NOTE(review): this outer `headers` is never read; the `const headers`
// inside the handler shadows it.
let headers = '';
page.on('request', interceptedRequest => {
console.log('requesting ' + count + 'times');
console.log('request for ' + interceptedRequest.url());
console.log(interceptedRequest);
// After three requests have been let through, abort every later one.
if (count>2) {
interceptedRequest.abort();
return;
}
// Requests for the exact target URL get special handling.
if (interceptedRequest.url() == url) {
count++;
// True only when the target-URL request is the first request intercepted:
// resend it with additional headers layered on top of the original ones.
if (count == 1) {
const headers = interceptedRequest.headers();
headers['authority'] = 'publicindex.sccourts.org';
headers['sec-fetch-dest'] = 'empty';
headers['sec-fetch-mode'] = 'cors';
headers['sec-fetch-site'] = 'same-origin';
headers['upgrade-insecure-requests'] = '1';
interceptedRequest.continue({headers});
return;
} else {
interceptedRequest.continue();
return;
}
}
// Any other URL: count it and let it through unchanged.
count++;
interceptedRequest.continue();
return;
});
const har = new PuppeteerHar(page);
await har.start({ path: 'results.har' });
await page.tracing.start({path: 'trace.json'});
await Promise.all([page.coverage.startJSCoverage({reportAnonymousScripts : true})]);
const response = await page.goto(url);
// Raw DevTools-protocol session, created after the initial navigation.
const session = await page.target().createCDPSession();
await session.send('Page.enable');
await session.send('Page.setWebLifecycleState', {state: 'active'});
// Promise.all over a single promise: jsCoverage is a one-element array
// wrapping the coverage report.
const jsCoverage = await Promise.all([page.coverage.stopJSCoverage()]);
console.log(jsCoverage);
const chain = response.request().redirectChain();
// String concatenation coerces the chain array to text before logging.
console.log(chain + "\n\n");
await page.waitForNavigation();
await har.stop();
let bodyHTML = await page.content();
console.log(bodyHTML);
});
};
run();

I don't have a full resolution but I know where the redirection is happening.
I tested your script locally with below:
const puppeteer = require('puppeteer');
const PuppeteerHar = require('puppeteer-har');
/**
 * Opens the target page in a visible Chrome window with DevTools open and
 * logs every request URL and every response status/headers, so the source
 * of the redirect can be spotted. Also records a HAR to `results.har`.
 */
function run() {
  const targetUrl = 'https://publicindex.sccourts.org/anderson/publicindex/';
  const desktopChromeUA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36';

  // Log each outgoing request, then let it proceed untouched.
  const logAndContinue = (outgoing) => {
    console.log('GOT NEW REQUEST', outgoing.url());
    outgoing.continue();
  };

  // Log status code and headers of each response.
  const logResponse = (incoming) => {
    console.log('GOT NEW RESPONSE', incoming.status(), incoming.headers());
  };

  const inspectTraffic = async (browser) => {
    const tab = await browser.newPage();
    await tab.setRequestInterception(true);
    tab.on('request', logAndContinue);
    tab.on('response', logResponse);

    await tab.setJavaScriptEnabled(true);
    await tab.setViewport({ width: 1920, height: 1280 });
    await tab.setUserAgent(desktopChromeUA);

    const recorder = new PuppeteerHar(tab);
    await recorder.start({ path: 'results.har' });
    await tab.goto(targetUrl);
    await tab.waitForNavigation();
    await recorder.stop();

    // The HTML is fetched but not used further.
    await tab.content();
  };

  puppeteer.launch({ headless: false, devtools: true }).then(inspectTraffic);
}
run();
I edited three parts:
Removed headless mode and open the devtools automatically
Intercept all network requests (that I audited)
Hoisted the require imports to the top of the file, because nesting them inside the function hurts my eyes; I always see them called at module level, without nesting.
Turns out the page https://publicindex.sccourts.org/anderson/publicindex/ makes a request to https://publicindex.sccourts.org/
However this request returns a 302 Redirect to https://www.sccourts.org/caseSearch/ location, so the browser acts accordingly
I would try to investigate this weird request if it is legit or not and why it redirects on chrome puppeteer
This post might help, there could be something related on chromium being seen as insecure
I also tried to pass args: ['--disable-web-security', '--allow-running-insecure-content'] to launch() object parameter, but without results
Please let us know how it goes! Har has been fun to discover!

Related

Need to select very specific element using querySelector without returning undefined

I'm scraping a site for data using Puppeteer and need to get a really specific piece of data from the site. I'm trying to use querySelector with the class name of the element that holds the data, but it has proven rather difficult because there are 22 elements that use the exact same class name (the class name is FormData). The one I need is the 18th of the 22, and I've been trying to select it and print it out, but to no avail: I always get the same error, or something along those lines.
Code
// MODULES
const puppeteer = require("puppeteer");
// Url where we get and scrape the data from
const URL = "https://www.sec.gov/edgar/search/#/category=form-cat2";
// Launches headless Chrome, opens the EDGAR search page, reads three
// values out of the DOM and prints two of them.
(async () => {
try {
const chromeBrowser = await puppeteer.launch({ headless: true });
const page = await chromeBrowser.newPage();
// NOTE(review): timeout 0 presumably disables the navigation timeout - confirm against the Puppeteer docs.
await page.goto(URL, {timeout: 0});
// The callback below runs inside the page, not in Node.
const getInfo = await page.evaluate(() => {
const secTableEN = document.querySelector(".table td.entity-name");
const secTableFiled = document.querySelector(".table td.filed");
// querySelector yields at most one element, not a list, so the [17]
// index does not pick the 18th match; it reads property "17" of that
// single element, which is where the reported `undefined` comes from.
const secTableLinkPrice = document.querySelector('.FormData')[17];
return {
secTableEN: secTableEN.innerText,
secTableFiled: secTableFiled.innerText,
secTableLinkPrice: secTableLinkPrice.innerText,
};
});
// secTableFiled is collected above but not printed here.
console.log(
"Name: " + getInfo.secTableEN, '\n' +
"Amount Purchased: " + getInfo.secTableLinkPrice, '\n'
);
await page.close();
await chromeBrowser.close();
} catch (e) {
// Errors are only logged; the browser is not closed on this path.
console.error(e)
}
})();
The error I'm always getting is: "Error: Evaluation failed: TypeError: Cannot read properties of undefined (reading 'innerText')". It only happens when I try returning secTableLinkPrice.innerText; the other two on their own always work fine. What can I do?
Apparently the price you want from the top result is in a popup, so you need to click on one of the .preview-file links to make that popup appear. Only then can you select .FormData from the iframe modal.
const puppeteer = require("puppeteer"); // ^19.1.0
const url = "<YOUR URL>";
let browser;

/**
 * Opens the search page, clicks the first preview link so the preview
 * iframe appears, then prints the text of the parent of the ".FormText"
 * cell whose trimmed text is exactly "$".
 */
async function scrapePreviewPrice() {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  await page.setUserAgent(
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
  );
  await page.goto(url, { waitUntil: "domcontentloaded" });

  // Clicking the preview link is what makes the modal iframe show up.
  const previewLink = await page.waitForSelector(".filetype .preview-file");
  await previewLink.click();

  const iframeHandle = await page.waitForSelector("#ipreviewer");
  const frame = await iframeHandle.contentFrame();
  await frame.waitForSelector(".FormText");

  const price = await frame.$$eval(".FormText", (cells) => {
    const dollarCell = cells.find((cell) => cell.textContent.trim() === "$");
    return dollarCell.parentNode.textContent.trim();
  });
  console.log(price);
}

scrapePreviewPrice()
  .catch((err) => console.error(err))
  .finally(() => browser?.close());
Now, the popup triggers a network request to an XML file (which appears to be HTML), so it might be easiest to just download that, since it probably has all of the data you want. In the code below, I'm actually parsing and traversing that HTML with Puppeteer, so it looks like more work, but perhaps you could just save this file to disk, depending on your needs:
// ... same as above ...
let browser;

/**
 * Same goal as the iframe variant, but reads the document straight from
 * the network: waits for the ".xml" response triggered by clicking the
 * preview link, injects its text into the page body, and extracts the
 * price from there.
 */
async function scrapePriceFromXml() {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  await page.setUserAgent(
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
  );
  await page.goto(url, { waitUntil: "domcontentloaded" });

  // Start listening before the click so the response cannot be missed.
  const isXmlDocument = (res) => res.status() === 200 && res.url().endsWith(".xml");
  const pendingXml = page.waitForResponse(isXmlDocument);

  const previewLink = await page.waitForSelector(".filetype .preview-file");
  await previewLink.click();

  const xmlResponse = await pendingXml;
  const markup = await xmlResponse.text();

  // Replace the page body with the downloaded markup so it can be queried.
  await page.evaluate((content) => (document.body.outerHTML = content), markup);

  const price = await page.$$eval(".FormText", (cells) => {
    const dollarCell = cells.find((cell) => cell.textContent.trim() === "$");
    return dollarCell.parentNode.textContent.trim();
  });
  console.log(price);
}

scrapePriceFromXml()
  .catch((err) => console.error(err))
  .finally(() => browser?.close());
Finally, some documents don't have a price, so the above code only works on the "4 (Insider trading report)". Furthermore, I haven't validated that all of these "type 4" reports are exactly the same. You'll probably want to handle this in your code and proceed carefully.

How to select dropdown menu items on webpage with Puppeteer

I am trying to learn JavaScript and trying to select the sort by drop down menu and click on one of its four items on this website: https://www.centris.ca/en/properties~for-sale?view=Thumbnail
But I keep getting no node found for selector.
This is my code:
const puppeteer = require('puppeteer')
// Opens centris.ca in a visible browser, clicks the search icon, tries to
// choose option "3" of the sort control, and resolves with the page object.
function run () {
// An async executor wrapped in `new Promise`: failures are forwarded to
// reject() by the try/catch below.
return new Promise(async (resolve, reject) => {
try {
const browser = await puppeteer.launch({headless: false})
const page = await browser.newPage()
await page.goto("https://www.centris.ca/")
// Click is dispatched from inside the page on the first matching icon.
await page.$eval('i.far.fa-search', el => el.click())
// NOTE(review): page.select targets <select> elements; the selector here
// assumes #selectSortById is a real <select> - verify against the page markup.
await page.select("select#selectSortById", "3")
//browser.close()
return resolve(page)
} catch (e) {
return reject(e)
}
})
}
// Logs the resolved page object, or the error if any step failed.
run().then(console.log).catch(console.error)
The code you're using is for manipulating HTML <select> and <option> elements. But the structure you're looking at are just some <a>s in a <div> that are styled and have JS that makes them behave like a dropdown, but aren't really dropdowns as far as Puppeteer is concerned, likely explaining why your code doesn't work.
I'd just select the <a> (acting like an "option") you want, then click it (using the native click to avoid visibility weirdness). I tossed in a waitForFunction to detect when the filter has actually been applied, but that might not be what you want to do next. Even so, it helps verify that this works before dumping the screenshot.
I also set a user agent so that headless mode works, if desired.
const puppeteer = require("puppeteer"); // ^19.1.0
let browser;

// In-page condition: the number of "new" banners equals the number of
// listing thumbnails. This assumes the whole first page is new listings
// and that these class names exist.
const ALL_LISTINGS_ARE_NEW = `
document.querySelectorAll(".banner.new-property").length ===
document.querySelectorAll(".property-thumbnail-item").length
`;

/**
 * Opens the for-sale listings, picks sort option "3" by clicking its
 * element natively inside the page, waits until the filter appears to be
 * applied, and saves a screenshot to result.png.
 */
async function sortByRecent() {
  browser = await puppeteer.launch({ headless: true });
  const [page] = await browser.pages();
  await page.setUserAgent(
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
  );
  await page.goto("https://www.centris.ca/en/properties~for-sale");

  const recentOption = await page.waitForSelector('#selectSortById [data-option-value="3"]');
  // Native click from page context, to sidestep visibility checks.
  await recentOption.evaluate((node) => node.click());

  await page.waitForFunction(ALL_LISTINGS_ARE_NEW);
  await page.screenshot({ path: "result.png" });
}

sortByRecent()
  .catch((err) => console.error(err))
  .finally(() => browser?.close());
Unrelated to the scraping task, but worth a peek to improve your JS code: What is the explicit promise construction antipattern and how do I avoid it?

HTML element not selecting in Puppeteer

So I have an HTML excerpt from a webpage as follows:
<li class="PaEvOc tv5olb wbTnP gws-horizon-textlists__li-ed">
//random div/element stuff inside here
</li>
<li class ="PaEvOc tv5olb gws-horizon-textlists__li-ed">
//random div/element stuff inside here as well
</li>
Not sure how to properly copy HTML but if you look at "events near location" on Google Chrome, I'm looking at these and trying to scrape the data from them:
https://i.stack.imgur.com/fv4a4.png
To start, I'm just trying to figure out how to properly select these elements in Puppeteer:
/**
 * Loads the Google events search page, waits for an element with class
 * "PaEvOc", and logs the handles returned by waitForSelector and $.
 */
async function probeEventCards() {
  const launchOptions = { args: ['--no-sandbox'] };
  const browser = await puppeteer.launch(launchOptions);
  const page = await browser.newPage();

  page.once('load', () => console.log('Page loaded!'));
  await page.goto('https://www.google.com/search?q=events+near+poughkeepsie+today&client=safari&rls=en&uact=5&ibp=htl;events&rciv=evn&sa=X&fpstate=tldetail');

  console.log('Hit wait for selector');
  const test = await page.waitForSelector(".PaEvOc");
  console.log('finished waiting for selector');

  const seeMoreEventsButton = await page.$(".PaEvOc");
  console.log('seeMoreEventsButton is ' + seeMoreEventsButton);
  console.log('test is ' + test);
}

probeEventCards();
What exactly is the problem here? Any and all help much appreciated, thank you!
I suggest reading this: https://intoli.com/blog/not-possible-to-block-chrome-headless/
Basically, websites are detecting that you are scraping, but you can work around it.
Here is what I did to make your console logs print something useful
const puppeteer = require('puppeteer');
/**
 * Loads the Google events search page with a regular desktop Chrome user
 * agent instead of the default headless one, waits for an element with
 * class "PaEvOc", and logs the resulting handles.
 */
(async () => {
  /**
   * Applies the desktop user agent to a page before it navigates.
   * @param {object} page - Puppeteer page to configure.
   */
  const preparePageForTests = async (page) => {
    // The trailing space matters: without it the two halves fuse into
    // "x86_64)AppleWebKit", which is not a well-formed Chrome UA string.
    const userAgent = 'Mozilla/5.0 (X11; Linux x86_64) ' +
      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.39 Safari/537.36';
    await page.setUserAgent(userAgent);
  };

  const browser = await puppeteer.launch({ args: [
    '--no-sandbox'
  ]});
  try {
    const page = await browser.newPage();
    await preparePageForTests(page);
    page.once('load', () => console.log('Page loaded!'));
    await page.goto('https://www.google.com/search?q=events+near+poughkeepsie+today&client=safari&rls=en&uact=5&ibp=htl;events&rciv=evn&sa=X&fpstate=tldetail');
    console.log('Hit wait for selector');
    const test = await page.waitForSelector(".PaEvOc");
    console.log('finished waiting for selector');
    const seeMoreEventsButton = await page.$(".PaEvOc");
    console.log('seeMoreEventsButton is ' + seeMoreEventsButton);
    console.log('test is ' + test);
  } finally {
    // Always shut Chromium down so the Node process can exit.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

How do I evaluate an Xpath of an XpathResult?

I am wanting to scrape data from online stores such as Argos. Argos has cards for each of their products and so I would like to first get a list of all of the product card nodes and then access specific Xpaths on each of these nodes.
Using the below code I am able to get a list of all of the unique product cards on the page, however when I try to look at the 'name' Xpath on each of these products they all return the exact same node (the 'name' node of the very first product). What am I doing wrong?
// Loads the Argos clearance listing, waits for the product cards to
// exist, then walks every card inside the page and evaluates a "name"
// XPath with the card as context node.
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36');
await page.goto("https://www.argos.co.uk/clearance/toys/c:30299/clearance:true/opt/page:1");
// XPath attribute tests use "@name"; "#name" is not valid XPath syntax.
await page.waitForXPath("//div[@data-test='component-product-card']")
let p = await page.evaluate( () => {
  const prods = document.evaluate("//div[@data-test='component-product-card']", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null)
  // Declared locally so the loop variable is not an implicit global.
  let prod;
  while(prod = prods.iterateNext()){
    // NOTE(review): a path starting with "//" is resolved from the document
    // root even though `prod` is passed as the context node, so every
    // iteration yields the first product's name (the behavior the question
    // describes). A card-relative lookup would start with ".//".
    let name = document.evaluate("//a[@itemprop='name']", prod, null, XPathResult.STRING_TYPE, null)
  }
})

Scraping of website through Puppeteer returns undefined

I was trying to scrape the Myntra website. The link is here
I used Puppeteer and Node JS to scrape it. It was working fine and currently I get an error
Error: Evaluation failed: TypeError: Cannot read property 'textContent' of null
at __puppeteer_evaluation_script__:2:55
The function returns an empty object. I have attached my code below.
const puppeteer = require('puppeteer');
/**
 * Opens the Myntra product page and logs whatever page.evaluate hands
 * back for the ".pdp-price" element. Note that the in-page callback
 * returns the element itself, not its text.
 */
const logPriceElement = async () => {
  try {
    const chrome = await puppeteer.launch();
    const tab = await chrome.newPage();
    await tab.goto('https://www.myntra.com/jeans/only/only-women-black-skinny-fit-mid-rise-low-distress-stretchable-cropped-jeans/10973332/buy');

    const priceNode = await tab.evaluate(() => document.querySelector('.pdp-price'));
    console.log(priceNode);

    await chrome.close();
  } catch (failure) {
    // Failures are only logged.
    console.log(failure);
  }
};

logPriceElement();
It seems that this site is blocking requests for which HeadlessChrome is specified in the user-agent, so I changed the user-agent and now everything works as you need. Try this code:
const puppeteer = require('puppeteer');
/**
 * Opens the Myntra product page while sending a regular desktop Chrome
 * `user-agent` header, then logs the text of the ".pdp-price" element.
 * Errors are logged rather than rethrown; the browser is closed on both
 * the success and the failure path.
 */
(async () => {
  let browser;
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    // Override the user-agent request header so the site does not see the
    // default headless one.
    await page.setExtraHTTPHeaders({
      'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
    });
    await page.goto('https://www.myntra.com/jeans/only/only-women-black-skinny-fit-mid-rise-low-distress-stretchable-cropped-jeans/10973332/buy');
    // Runs in the page; throws if the selector matches nothing.
    const body = await page.evaluate(() => {
      return document.querySelector('.pdp-price').textContent;
    });
    console.log(body);
  } catch (error) {
    console.log(error);
  } finally {
    // Previously close() ran only on success, leaving Chromium running
    // (and the Node process alive) whenever goto/evaluate threw.
    await browser?.close();
  }
})();
Something is trying to call .textContent on something that's null. I don't see it in your example, but this is what would happen if code like querySelector('.pdp-price') doesn't find anything - maybe because the page hasn't fully loaded yet or the selector doesn't match anything.
You can pass other options to page.goto to make it wait for longer, which could let things load.

Categories

Resources