I want to scrape some data from a login-protected page using Node.js and Puppeteer. When I try to scrape the data, the page shows as not reachable. I have no idea why I am getting this output. Here is my code.
I am new to Puppeteer and Node.js; I've been trying to modify this code for 8 hours, but it's still not working.
// Note: singletonBrowser.Instance is assumed to be a shared Puppeteer browser instance.
async function EmailLookup(MlsNumber) {
  const resp = { Success: false };
  const page = await singletonBrowser.Instance.newPage();
  try {
    // Only allow document, script and XHR requests; abort everything else.
    await page.setRequestInterception(true);
    page.on('request', request => {
      const type = request.resourceType();
      if (type === 'document' || type === 'script' || type === 'xhr') {
        request.continue();
      } else {
        request.abort();
      }
    });
    await page.goto('https://example.com/idp/login');
    if (!page.url().includes('layouts')) {
      await page.waitForSelector('#username');
      await page.waitForSelector('#password');
      await page.waitForSelector('#loginbtn');
      await page.type('#username', 'xxxxx');
      await page.type('#password', 'xxxxx');
      await page.keyboard.press('Enter');
      await page.waitForNavigation({ timeout: 0 });
    }
    await page.goto('https://example.com/launch?layoutid=61&appid=433');
    await page.waitForSelector('#ctl02_m_ucSpeedBar_m_tbSpeedBar');
    await page.type('#ctl02_m_ucSpeedBar_m_tbSpeedBar', MlsNumber);
    await page.keyboard.press('Enter');
    await page.waitForNavigation({ timeout: 0 });
    const MLSLink = await page.waitForXPath(`//a[text()='${MlsNumber}']`);
    if (MLSLink != null) {
      await MLSLink.click();
      await page.waitForNavigation({ timeout: 0 });
      const [parcelNode] = await page.$x("//a[contains(@href,'AssessorParcelDetail')]");
      const [emailNode] = await page.$x("//a[contains(@href,'mailto:')]");
      if (emailNode != null && parcelNode != null) {
        resp.Success = true;
        resp.Email = await page.evaluate(el => el.innerText, emailNode);
        resp.Parcel = await page.evaluate(el => el.innerText, parcelNode);
      }
    }
    return resp;
  } finally {
    await page.close();
  }
}
I'm trying to solve a captcha in an iframe. Am I accessing the iframe correctly with the code below? I'm able to click the button in the iframe but not move the mouse. I'm getting the x, y, width and height positions, but the positions may be wrong since the element is in an iframe. If I grab the iframe URL, visit it directly and solve the captcha, then I'm able to move the mouse and solve it.
Thanks
const puppeteer = require('puppeteer')
const fs = require('fs').promises
const Jimp = require('jimp')
const pixelmatch = require('pixelmatch')
const { cv } = require('opencv-wasm')
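// findPuzzlePosition: extracts the puzzle-piece canvas from the page, thresholds it with OpenCV and returns the centroid (x, y) of its contour.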
async function findPuzzlePosition (page) {
let images = await page.$$eval('.geetest_canvas_img canvas', canvases => canvases.map(canvas => canvas.toDataURL().replace(/^data:image\/png;base64,/, '')))
await fs.writeFile(`./puzzle.png`, images[1], 'base64')
let srcPuzzleImage = await Jimp.read('./puzzle.png')
let srcPuzzle = cv.matFromImageData(srcPuzzleImage.bitmap)
let dstPuzzle = new cv.Mat()
cv.cvtColor(srcPuzzle, srcPuzzle, cv.COLOR_BGR2GRAY)
cv.threshold(srcPuzzle, dstPuzzle, 127, 255, cv.THRESH_BINARY)
let kernel = cv.Mat.ones(5, 5, cv.CV_8UC1)
let anchor = new cv.Point(-1, -1)
cv.dilate(dstPuzzle, dstPuzzle, kernel, anchor, 1)
cv.erode(dstPuzzle, dstPuzzle, kernel, anchor, 1)
let contours = new cv.MatVector()
let hierarchy = new cv.Mat()
cv.findContours(dstPuzzle, contours, hierarchy, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
let contour = contours.get(0)
let moment = cv.moments(contour)
return [Math.floor(moment.m10 / moment.m00), Math.floor(moment.m01 / moment.m00)]
}
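// findDiffPosition: locates the gap in the background image by thresholding diff.png (produced by saveDiffImage) and taking the centroid of its first external contour.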
async function findDiffPosition (page) {
await page.waitFor(100)
let srcImage = await Jimp.read('./diff.png')
let src = cv.matFromImageData(srcImage.bitmap)
let dst = new cv.Mat()
let kernel = cv.Mat.ones(5, 5, cv.CV_8UC1)
let anchor = new cv.Point(-1, -1)
cv.threshold(src, dst, 127, 255, cv.THRESH_BINARY)
cv.erode(dst, dst, kernel, anchor, 1)
cv.dilate(dst, dst, kernel, anchor, 1)
cv.erode(dst, dst, kernel, anchor, 1)
cv.dilate(dst, dst, kernel, anchor, 1)
cv.cvtColor(dst, dst, cv.COLOR_BGR2GRAY)
cv.threshold(dst, dst, 150, 255, cv.THRESH_BINARY_INV)
let contours = new cv.MatVector()
let hierarchy = new cv.Mat()
cv.findContours(dst, contours, hierarchy, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
let contour = contours.get(0)
let moment = cv.moments(contour)
return [Math.floor(moment.m10 / moment.m00), Math.floor(moment.m01 / moment.m00)]
}
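// saveSliderCaptchaImages: clicks the verify button, waits for the geetest canvases to render, then saves the puzzle background (with the gap) and the full original background as PNGs.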
async function saveSliderCaptchaImages(page) {
await page.waitForSelector('[aria-label="Click to verify"]')
await page.waitFor(3000)
await page.click('[aria-label="Click to verify"]')
await page.waitForSelector('.geetest_canvas_img canvas', { visible: true })
await page.waitFor(1000)
let images = await page.$$eval('.geetest_canvas_img canvas', canvases => {
return canvases.map(canvas => canvas.toDataURL().replace(/^data:image\/png;base64,/, ''))
})
await fs.writeFile(`./captcha.png`, images[0], 'base64')
await fs.writeFile(`./original.png`, images[2], 'base64')
}
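// saveDiffImage: pixel-diffs the original and captcha backgrounds with pixelmatch so the gap region stands out, and writes the result to diff.png.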
async function saveDiffImage() {
const originalImage = await Jimp.read('./original.png')
const captchaImage = await Jimp.read('./captcha.png')
const { width, height } = originalImage.bitmap
const diffImage = new Jimp(width, height)
const diffOptions = { includeAA: true, threshold: 0.2 }
pixelmatch(originalImage.bitmap.data, captchaImage.bitmap.data, diffImage.bitmap.data, width, height, diffOptions)
diffImage.write('./diff.png')
}
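// solveCaptcha: captures the images, computes the gap position, then drags the slider handle towards it with page.mouse.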
async function solveCaptcha (page) {
console.log(page)
await saveSliderCaptchaImages(page)
await saveDiffImage()
let [cx, cy] = await findDiffPosition(page)
const sliderHandle = await page.$('.geetest_slider_button')
const handle = await sliderHandle.boundingBox()
await page.waitFor(5000)
console.log(handle)
let xPosition = handle.x + handle.width / 2
let yPosition = handle.y + handle.height / 2
await page.mouse.move(xPosition, yPosition)
await page.mouse.down()
xPosition = handle.x + cx - handle.width / 2
yPosition = handle.y + handle.height / 3
await page.mouse.move(xPosition, yPosition, { steps: 25 })
await page.waitFor(100)
let [cxPuzzle, cyPuzzle] = await findPuzzlePosition(page)
xPosition = xPosition + cx - cxPuzzle
yPosition = handle.y + handle.height / 2
await page.mouse.move(xPosition, yPosition, { steps: 5 })
await page.mouse.up()
await page.waitFor(3000)
// success!
await fs.unlink('./original.png')
await fs.unlink('./captcha.png')
await fs.unlink('./diff.png')
await fs.unlink('./puzzle.png')
}
async function start(){
const browser = await puppeteer.launch({
headless: false,
defaultViewport: { width: 1366, height: 768 },
args: [ '--proxy-server=x.x.x.x:xxx', '--disable-web-security',
'--disable-features=IsolateOrigins,site-per-process'],
})
const page = await browser.newPage()
await page.authenticate({
username: 'xxx',
password: 'xxx',
});
await page.goto('https://someurlwithcaptchainiframe.com', { waitUntil: 'networkidle2' })
await page.waitFor(1000)
await page.content();
let innerText
try {
innerText = await page.evaluate(() => {
return JSON.parse(document.querySelector("body").innerText);
});
}
catch(err) {
console.log('BLOCKED, we want to solve captcha here..')
await page.waitFor(9000)
//frame = await page.mainFrame().childFrames()
//solveCaptcha(frame)
let captchaFrame // this will be populated later by our identified frame
for (const frame of page.mainFrame().childFrames()){
// Here you can use a few identifying methods like url(), name(), title()
if (frame.url().includes('geo')){
console.log('we found the captcha iframe')
captchaFrame = frame
console.log('Frame URL: ' + captchaFrame.url())
await solveCaptcha(captchaFrame)
// we keep a reference to this frame in captchaFrame so we can use it later
}
}
console.log('HERE..')
await page.waitFor(90000)
// solveCaptcha(frame)
}
//console.log("innerText now contains the JSON");
//console.log(innerText);
}
start()
You must use the .mouse object from the page instance, not from the iframe, like this:
if(await page.$('div[id="px-captcha"] > div > iframe[style*="khtml-user-select: none"]') != null){
const frameHandle = await page.$('div[id="px-captcha"] > div > iframe[style*="khtml-user-select: none"]')
const captcha_frame = await frameHandle.contentFrame()
if(await captcha_frame.$('div[aria-label*="Human Challenge"]') != null){
const example = await captcha_frame.$('div[aria-label*="Human Challenge"]')
const box = await example.boundingBox()
await page.mouse.move(box.x + box.width / 2, box.y + box.height / 2)
await page.mouse.down()
await page.waitForTimeout(5000)
await page.mouse.up()
}
}
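Applied to the geetest example above, that means querying the slider inside the captcha frame but driving the drag with page.mouse. A minimal sketch reusing the .geetest_slider_button selector from the question (the 150 px drag distance is just a placeholder, not the computed gap offset):
async function dragSliderInFrame (page, frame) {
  // The slider element lives inside the iframe, so query it via the frame...
  const sliderHandle = await frame.$('.geetest_slider_button')
  // ...but boundingBox() coordinates are typically already relative to the main viewport,
  // so they can be fed straight into page.mouse.
  const handle = await sliderHandle.boundingBox()
  await page.mouse.move(handle.x + handle.width / 2, handle.y + handle.height / 2)
  await page.mouse.down()
  await page.mouse.move(handle.x + 150, handle.y + handle.height / 2, { steps: 25 }) // 150 px is a placeholder offset
  await page.mouse.up()
}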
I'm creating a script to capture screenshots of web pages with Puppeteer, and I don't understand why, on this site https://www.105.net/, the screenshot I save is the mobile version. Do you have any suggestions?
With headless: false I only get screenshots of the desktop version, while with headless: true I get the mobile version.
I would always like to save the desktop version.
const puppeteer = require('puppeteer');
//Device width and height - Desktop/Tablet/Mobile
const device_width = 1920;
const device_height = 1080;
//Chrome executable path
var systemPath = '';
if (process.platform == 'darwin'){
console.log('Opening Chrome for macOS');
systemPath = '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome';
}else if(process.platform == 'win32' && process.arch == 'x64'){
//process.platform is 'win32' on all Windows versions; use process.arch to tell 64-bit from 32-bit
console.log('Opening Chrome for Windows 64-bit');
systemPath = 'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe';
}else {
console.log('Opening Chrome for Windows 32-bit');
systemPath = 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe';
}
//Configure the classes/ids of the network's cookie buttons
const AME = 'button.amecp_button-accetto.iubenda-cs-close-btn';
var BtnCookie = [
AME
];
//Wait until the web page has rendered
const waitTillHTMLRendered = async (page, timeout = 30000) => {
const checkDurationMsecs = 1000;
const maxChecks = timeout / checkDurationMsecs;
let lastHTMLSize = 0;
let checkCounts = 1;
let countStableSizeIterations = 0;
const minStableSizeIterations = 3;
while(checkCounts++ <= maxChecks){
let html = await page.content();
let currentHTMLSize = html.length;
let bodyHTMLSize = await page.evaluate(() => document.body.innerHTML.length);
console.log('last: ', lastHTMLSize, ' <> curr: ', currentHTMLSize, " body html size: ", bodyHTMLSize);
if(lastHTMLSize != 0 && currentHTMLSize == lastHTMLSize)
countStableSizeIterations++;
else
countStableSizeIterations = 0; //reset the counter
if(countStableSizeIterations >= minStableSizeIterations) {
console.log("Pagina renderizzata completamente...");
break;
}
lastHTMLSize = currentHTMLSize;
await page.waitFor(checkDurationMsecs);
}
};
(async function () {
//Read the URL list file
var fs = require('fs');
var urlArray = fs.readFileSync('url-list.js').toString().split("\n");
//Loop over all the URLs
for(var i = 0; i < urlArray.length; i++){
//Skip lines in the URL list that don't contain a URL
//if (urlArray[i] != ''){
if (urlArray[i].indexOf("http") != -1){
//Launch Puppeteer
const browser = await puppeteer.launch({
headless: true,
executablePath: systemPath,
args: ['--disable-gpu','--disable-extensions'],
defaultViewport: null});
//Open the browser, disable the cache and set the page dimensions
const page = await browser.newPage();
const currentAgent = await page.evaluate('navigator.userAgent');
console.log(currentAgent);
//await page.setUserAgent(currentAgent);
await page.setCacheEnabled(false);
await page.setViewport({width: device_width, height: device_height});
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36');
//Log which URL is being processed
console.log(' ');
console.log('Opening URL > '+urlArray[i]);
console.log(' ');
await page.goto(urlArray[i],{waitUntil:'networkidle2'});
//+'?bypassprivacy'
await page.waitFor(20000);
//Find the cookie button class/id on the page
var cookieFound = false;
for(var n = 0; n < BtnCookie.length; n++){
if (await page.$(BtnCookie[n]) !== null ) {
await page.click(BtnCookie[n]);
console.log('Cookie banner bypassed... OK!');
cookieFound = true;
break;
}
}
if (!cookieFound) {
console.log('Cookie banner not found!');
} //END - Find the cookie button class/id on the page
await waitTillHTMLRendered(page);
await page.waitFor(20000);
//Configure per-site CSS overrides:
const urlCheck = new URL(urlArray[i]);
if (urlCheck.hostname == 'www.grazia.it'){
//console.log(urlCheck.hostname); // Logs: 'www.sitename.it'
/* Hide all the articles except the first to avoid infinite scroll */
await page.addStyleTag({content: '#container-articles article.article-shadow {display:none;} #container-articles article:first-of-type {display:block !important;}'})
}
if (urlCheck.hostname == 'blog.giallozafferano.it'){
/* Disable the popups on Altervista blogs, if present */
await page.addStyleTag({content: '#av-chatbot-banner, .av-push-notifications {display:none !important;}'})
}
if (urlCheck.hostname == 'www.smartworld.it'){
/* Hide all the articles except the first to avoid infinite scroll */
await page.addStyleTag({content: '#content article {display:none;} #content article:first-of-type {display:block !important;}'})
}
if (urlCheck.hostname == 'www.pianetadonna.it' || urlCheck.hostname == 'www.pianetamamma.it'){
/* Hide the special scroll element to avoid infinite scroll */
await page.addStyleTag({content: '.foglia-speciale-scroll {display:none !important;}'})
}
//Scroll the whole page to load its content
await autoScroll(page);
async function autoScroll(page){
await page.evaluate(async () => {
await new Promise((resolve, reject) => {
var totalHeight = 0;
var distance = 100;
var timer = setInterval(() => {
var scrollHeight = document.body.scrollHeight;
window.scrollBy(0, distance);
totalHeight += distance;
if(totalHeight >= scrollHeight){
clearInterval(timer);
resolve();
}
}, 300);
});
});
}
// Go back to the top of the page
await page.evaluate(_ => {window.scrollTo(0, 0);});
await page.waitFor(20000);
//Clean the URL before saving the file
var str = urlArray[i];
str = str.replace(/[^\w]+/ig,'-');
var convertiUrl = str;
//Save the screenshot
await page.screenshot({path: './screenshot/'+convertiUrl+i+'.jpg', fullPage: true});
console.log('Screenshot saved successfully! :)');
//Done
await page.close();
await browser.close();
} //end if (urlArray[i].indexOf("http") != -1)
}//end loop
})(); //end script
Doesn't setUserAgent change based on your operating system?
I don't understand why the page rendering function crashes if I add the setUserAgent call.
Add the following line
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36');
to send a header indicating that you're using a desktop browser.
Also, don't forget to await the page.setViewport statement, and it should work.
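For context: by default, headless Chrome typically reports a user agent containing "HeadlessChrome", which some responsive sites may use to serve a reduced or mobile layout, so overriding the user agent as above is usually what makes the difference. You can check what the browser is reporting with something like:
const defaultUA = await browser.userAgent(); // typically contains "HeadlessChrome" when headless: true
console.log(defaultUA);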
Here's a sample snippet to get you started:
const puppeteer = require('puppeteer')
const PAGE_URL = 'https://www.105.net/';
(async function () {
const browser = await puppeteer.launch({
headless: true,
executablePath: '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
args: ['--disable-gpu','--disable-extensions']});
const device_width = 1920;
const device_height = 1080;
const page = await browser.newPage();
await page.setCacheEnabled(false);
await page.setViewport({width: device_width, height: device_height})
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36');
await page.goto(PAGE_URL, {
timeout: 60000
})
await page.screenshot({
path: './screenshot.jpg'
})
await page.close()
await browser.close()
})()
I want to build a simple bot with Puppeteer.
I used page.$$eval and tried to fetch data from a table (10 pages) and map that data.
I can fetch the data well enough; however, the code runs 10 times per page, meaning every row is fetched 10 times.
Here is my code snippet:
const tablolariCek = async (url, sayfaSayisi) => {
const browser = await puppeteer.launch({ headless: false });
let page = await browser.newPage();
await page.goto(url);
await page.waitForSelector('#mydata_next');
let okulUni = [];
for (let index = 0; index <= sayfaSayisi; index++) {
let okullar = await page.$$eval(
'#mydata > tbody > [role="row"]',
(uniler) =>
uniler.map((okul) => {
// This is the part that runs 10 times per page.
let uni = {};
uni.okulkodu = okul.querySelector('a').innerText.trim();
const fontVeriler = okul.querySelectorAll('font');
const strongVeriler = okul.querySelectorAll('strong');
for (let index = 0; index < strongVeriler.length; index++) {
if (index == 0) {
uni.uniadi = strongVeriler[index].innerText.trim();
} else if (index == 1) {
uni.bolumadi = strongVeriler[index].innerText.trim();
}
}
for (let index = 0; index < fontVeriler.length; index++) {
if (index == 1) {
uni.bolumadi += ' ' + fontVeriler[index].innerText.trim();
} else if (index == 10) {
uni.siralama2019 = fontVeriler[index].innerText.trim();
} else if (index == 14) {
uni.puan2019 = fontVeriler[index].innerText.trim();
}
}
return uni;
})
);
await page.click('#mydata_next');
okullar.forEach((okul) => {
okulUni.push(okul);
});
}
browser.close();
return okulUni;
};
Here is what I am trying to fetch:
<table id="mydata">
<tbody>
<tr role="row" class="odd"> <!-- this line -->
</tbody>
</table>
I can't find a solution.
I found a solution by changing this line:
const browser = await puppeteer.launch({ headless: false });
to:
const browser = await puppeteer.launch({ headless: false, slowMo: 150 });
I think the code couldn't fetch the table correctly because of the speed. Everything works fine now. Thank you for the answers.
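If the root cause is indeed timing (reading the table before the next page has rendered), an alternative to slowMo would be to wait for the first row to change after clicking #mydata_next. A rough sketch reusing the selectors from the question, untested against the actual site:
// Capture the first row's text, click "next", then wait until that text changes.
const firstRowText = await page.$eval('#mydata > tbody > [role="row"]', row => row.innerText);
await page.click('#mydata_next');
await page.waitForFunction(
  prev => {
    const row = document.querySelector('#mydata > tbody > [role="row"]');
    return row && row.innerText !== prev;
  },
  {},
  firstRowText
);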
I have minimal coding knowledge and I'm trying to adapt some tutorials without success.
The JavaScript code I wish to adapt (Script A) is pasted into the Chrome developer console and successfully pulls the data I need. The snippet identifies the most prominently displayed (largest-font) price text on an e-commerce page.
A second tutorial (script B) is run from the shell and calls the Puppeteer library. This script pulls some hotel booking data and runs successfully.
I wish to adapt script A to run from the shell using the Puppeteer library.
This is Script A -
let elements = [
...document.querySelectorAll(' body *')
]
function createRecordFromElement(element) {
const text = element.textContent.trim()
var record = {}
const bBox = element.getBoundingClientRect()
if(text.length <= 30 && !(bBox.x == 0 && bBox.y == 0)) {
record['fontSize'] = parseInt(getComputedStyle(element)['fontSize']) }
record['y'] = bBox.y
record['x'] = bBox.x
record['text'] = text
return record
}
let records = elements.map(createRecordFromElement)
function canBePrice(record) {
if( record['y'] > 600 ||
record['fontSize'] == undefined ||
!record['text'].match(/(^(US ){0,1}(rs\.|Rs\.|RS\.|\$|₹|INR|USD|CAD|C\$){0,1}(\s){0,1}[\d,]+(\.\d+){0,1}(\s){0,1}(AED){0,1}$)/)
)
return false
else return true
}
let possiblePriceRecords = records.filter(canBePrice)
let priceRecordsSortedByFontSize = possiblePriceRecords.sort(function(a, b) {
if (a['fontSize'] == b['fontSize']) return a['y'] > b['y']
return a['fontSize'] < b['fontSize']
})
console.log(priceRecordsSortedByFontSize[0]['text']);
console.log(priceRecordsSortedByFontSize[1]['text']);
This is Script B -
const puppeteer = require('puppeteer');
let bookingUrl = 'insert booking URL';
(async () => {
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
await page.setViewport({ width: 1920, height: 926 });
await page.goto(bookingUrl);
// get hotel details
let hotelData = await page.evaluate(() => {
let hotels = [];
// get the hotel elements
let hotelsElms = document.querySelectorAll('div.sr_property_block[data-hotelid]');
// get the hotel data
hotelsElms.forEach((hotelelement) => {
let hotelJson = {};
try {
hotelJson.name = hotelelement.querySelector('span.sr-hotel__name').innerText;
hotelJson.reviews = hotelelement.querySelector('span.review-score-widget__subtext').innerText;
hotelJson.rating = hotelelement.querySelector('span.review-score-badge').innerText;
if(hotelelement.querySelector('strong.price')){
hotelJson.price = hotelelement.querySelector('strong.price').innerText;
}
}
catch (exception){
}
hotels.push(hotelJson);
});
return hotels;
});
console.dir(hotelData);
})();
I've made various attempts at adapting Script A into the format of Script B, and many different errors have been thrown. Without coding knowledge, I'm not getting anywhere.
Here's one of many variations I've tried, called Script C -
const puppeteer = require('puppeteer-core');
let bookingUrl = 'https://shop.coles.com.au/a/dianella/product/moccona-coffee-capsules-espresso-7';
(async () => {
const browser = await puppeteer.launch({
executablePath: '/usr/bin/chromium-browser',
headless: true
});
const page = await browser.newPage();
await page.setViewport({ width: 1920, height: 926 });
await page.goto(bookingUrl);
// get hotel details
let hotelData = await page.evaluate(() => {
let hotels = [];
// get the hotel elements
let elements = [
...document.querySelectorAll(' body *')
]
function createRecordFromElement(element) {
const text = element.textContent.trim()
var record = {}
const bBox = element.getBoundingClientRect()
if(text.length <= 30 && !(bBox.x == 0 && bBox.y == 0)) {
record['fontSize'] = parseInt(getComputedStyle(element)['fontSize']) }
record['y'] = bBox.y
record['x'] = bBox.x
record['text'] = text
return record
}
let records = elements.map(createRecordFromElement)
function canBePrice(record) {
if( record['y'] > 600 ||
record['fontSize'] == undefined ||
!record['text'].match(/(^(US ){0,1}(rs\.|Rs\.|RS\.|\$|₹|INR|USD|CAD|C\$){0,1}(\s){0,1}[\d,]+(\.\d+){0,1}(\s){0,1}(AED){0,1}$)/)
)
return false
else return true
}
let possiblePriceRecords = records.filter(canBePrice)
let priceRecordsSortedByFontSize = possiblePriceRecords.sort(function(a, b) {
if (a['fontSize'] == b['fontSize']) return a['y'] > b['y']
return a['fontSize'] < b['fontSize']
})
console.log(priceRecordsSortedByFontSize[0]['text']);
})();
Here are the links to the tutorials for info -
https://www.scrapehero.com/how-to-scrape-prices-from-any-ecommerce-website/
https://www.scrapehero.com/how-to-build-a-web-scraper-using-puppeteer-and-node-js/
Is there anything obviously wrong in Script C?
After reading through Script C, it appears that you have not made any mistakes; rather, the website you are attempting to access is blocking scraper bots.
A quick host lookup on the domain shows that they are using the security service section.io, which blocks scraper bots on their website. See:
shop.coles.com.au is an alias for shop.coles.com.au.c.section.io.
shop.coles.com.au.c.section.io is an alias for shop.coles.com.au.x.section.io
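One small thing worth noting for when the blocking is resolved: the final console.log in Script C runs inside page.evaluate, so its output goes to the headless browser's console rather than your terminal, and the browser is never closed. A minimal adjustment, sketched here with a placeholder return value, would be to return the result from evaluate and log it in Node:
const result = await page.evaluate(() => {
  // ... run the Script A logic here, then return instead of console.log:
  return document.title; // placeholder for priceRecordsSortedByFontSize[0]['text']
});
console.log(result); // prints in the terminal, not the browser console
await browser.close();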