I want to scrape some data from login protected page using Node Js and Puppeteer. When I try to scrape data, the shows page is not reachable. I don't have any idea about why I am getting output like this. Here is my code.
I am new to Puppeteer and Node Js, its been 8 hours I am trying to modify that code, but not working.
async function EmailLookup(MlsNumber)
{
resp = new { Success = false };
(page = await singletonBrowser.Instance.newPage())
{
await page.setRequestInterception(true);
page.Request += (sender, e) =>
{
if (e.Request.resourceType === resourceType.document || e.Request.resourceType === resourceType.Script || e.Request.resourceType === resourceType.Xhr)
e.Request.Continue();
else
e.Request.Abort();
};
await page.async("https://example.com/idp/login");
if (!page.Url.Contains("layouts"))
{
await page.waitForSelector("#username");
await page.waitForSelector("#password");
await page.waitForSelector("#loginbtn");
await page.async("#username", "xxxxx");
await page.async("#password", "xxxxx");
await page.async("#password", "\r");
await page.waitForNavigation(new navigationOptions { Timeout = 0 });
}
await page.goto("https://example.com/launch?layoutid=61&appid=433");
await page.waitForSelector("#ctl02_m_ucSpeedBar_m_tbSpeedBar");
await page.async("#ctl02_m_ucSpeedBar_m_tbSpeedBar", MlsNumber);
await page.async("#ctl02_m_ucSpeedBar_m_tbSpeedBar", "\r");
await page.waitForNavigation(new navigationOptions { Timeout = 0 });
var MLSLink = await page.waitForXPath("//a[text()='" + MlsNumber + "']");
if (MLSLink != null)
{
await MLSLink.click();
await page.waitForNavigation(new navigationOptions{ Timeout = 0 });
var Content = await page.get();
htmldoc = new htmldoc();
htmldoc.load(Content);
var parcelNode = htmldoc.document.selectSingleNode("//a[contains(#href,'AssessorParcelDetail')]");
var emailNode = htmldoc.document.selectSingleNode("//a[contains(#href,'mailto:')]");
if (emailNode != null && parcelNode != null)
{
resp.Success = true;
resp.Email = emailNode.innerText;
resp.Parcel = parcelNode.innerText;
}
}
return json(resp);
}
}
I am trying to web scrape with puppetter, currently the following code works but I think there can be optimizations made one of which I think is to only use one puppetter instance. But I don't really know how to do that. Can anyone help me?
This is the working but slow original:
const puppeteer = require('puppeteer');
async function scrapeProduct(url) {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(url);
const [el] = await page.$x('xpath of element');
const txt = await el.getProperty('textContent');
const rawTxt = await txt.jsonValue();
browser.close();
return rawTxt;
}
async function something() {
var some_varible = the length of some array;
process.setMaxListeners(some_varible);
for (var i = 0; i < some_varible; i++) {
var target = some_array[i].Name
var url = 'somewebsite' + target;
console.log(target + ": " + await scrapeProduct(url));
}
}
something();
This is my pathetic attempt at not using multiple instances of puppeteer: (Does not work)
const puppeteer = require('puppeteer');
async function scrapeProduct(url, page) {
await page.goto(url);
const [el] = await page.$x('xpath of element');
const txt = await el.getProperty('textContent');
const rawTxt = await txt.jsonValue();
return rawTxt;
}
async function something() {
var some_varible = the length of some array;
process.setMaxListeners(some_varible);
const browser = await puppeteer.launch();
const page = await browser.newPage();
for (var i = 0; i < some_varible; i++) {
var target = some_array[i].Name
var url = 'somewebsite' + target;
console.log(target + ": " + await scrapeProduct(url, page));
}
browser.close();
}
something();
I'm trying to solve a captcha in an iframe. Am I accessing iframe correctly with the codes below? I'm able to click the button in the iframe but not move the mouse. I'm getting x,y, width, height positions but the positions may be wrong since it's in an iframe. If I grab the iframe URL, visit it and solve the captcha then I'm able to move the mouse and solve the captcha.
Thanks
const puppeteer = require('puppeteer')
const fs = require('fs').promises
const Jimp = require('jimp')
const pixelmatch = require('pixelmatch')
const { cv } = require('opencv-wasm')
async function findPuzzlePosition (page) {
let images = await page.$$eval('.geetest_canvas_img canvas', canvases => canvases.map(canvas => canvas.toDataURL().replace(/^data:image\/png;base64,/, '')))
await fs.writeFile(`./puzzle.png`, images[1], 'base64')
let srcPuzzleImage = await Jimp.read('./puzzle.png')
let srcPuzzle = cv.matFromImageData(srcPuzzleImage.bitmap)
let dstPuzzle = new cv.Mat()
cv.cvtColor(srcPuzzle, srcPuzzle, cv.COLOR_BGR2GRAY)
cv.threshold(srcPuzzle, dstPuzzle, 127, 255, cv.THRESH_BINARY)
let kernel = cv.Mat.ones(5, 5, cv.CV_8UC1)
let anchor = new cv.Point(-1, -1)
cv.dilate(dstPuzzle, dstPuzzle, kernel, anchor, 1)
cv.erode(dstPuzzle, dstPuzzle, kernel, anchor, 1)
let contours = new cv.MatVector()
let hierarchy = new cv.Mat()
cv.findContours(dstPuzzle, contours, hierarchy, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
let contour = contours.get(0)
let moment = cv.moments(contour)
return [Math.floor(moment.m10 / moment.m00), Math.floor(moment.m01 / moment.m00)]
}
async function findDiffPosition (page) {
await page.waitFor(100)
let srcImage = await Jimp.read('./diff.png')
let src = cv.matFromImageData(srcImage.bitmap)
let dst = new cv.Mat()
let kernel = cv.Mat.ones(5, 5, cv.CV_8UC1)
let anchor = new cv.Point(-1, -1)
cv.threshold(src, dst, 127, 255, cv.THRESH_BINARY)
cv.erode(dst, dst, kernel, anchor, 1)
cv.dilate(dst, dst, kernel, anchor, 1)
cv.erode(dst, dst, kernel, anchor, 1)
cv.dilate(dst, dst, kernel, anchor, 1)
cv.cvtColor(dst, dst, cv.COLOR_BGR2GRAY)
cv.threshold(dst, dst, 150, 255, cv.THRESH_BINARY_INV)
let contours = new cv.MatVector()
let hierarchy = new cv.Mat()
cv.findContours(dst, contours, hierarchy, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
let contour = contours.get(0)
let moment = cv.moments(contour)
return [Math.floor(moment.m10 / moment.m00), Math.floor(moment.m01 / moment.m00)]
}
async function saveSliderCaptchaImages(page) {
await page.waitForSelector('[aria-label="Click to verify"]')
await page.waitFor(3000)
await page.click('[aria-label="Click to verify"]')
await page.waitForSelector('.geetest_canvas_img canvas', { visible: true })
await page.waitFor(1000)
let images = await page.$$eval('.geetest_canvas_img canvas', canvases => {
return canvases.map(canvas => canvas.toDataURL().replace(/^data:image\/png;base64,/, ''))
})
await fs.writeFile(`./captcha.png`, images[0], 'base64')
await fs.writeFile(`./original.png`, images[2], 'base64')
}
async function saveDiffImage() {
const originalImage = await Jimp.read('./original.png')
const captchaImage = await Jimp.read('./captcha.png')
const { width, height } = originalImage.bitmap
const diffImage = new Jimp(width, height)
const diffOptions = { includeAA: true, threshold: 0.2 }
pixelmatch(originalImage.bitmap.data, captchaImage.bitmap.data, diffImage.bitmap.data, width, height, diffOptions)
diffImage.write('./diff.png')
}
async function solveCaptcha (page) {
console.log(page)
await saveSliderCaptchaImages(page)
await saveDiffImage()
let [cx, cy] = await findDiffPosition(page)
const sliderHandle = await page.$('.geetest_slider_button')
const handle = await sliderHandle.boundingBox()
await page.waitFor(5000)
console.log(handle)
let xPosition = handle.x + handle.width / 2
let yPosition = handle.y + handle.height / 2
await page.mouse.move(xPosition, yPosition)
await page.mouse.down()
xPosition = handle.x + cx - handle.width / 2
yPosition = handle.y + handle.height / 3
await page.mouse.move(xPosition, yPosition, { steps: 25 })
await page.waitFor(100)
let [cxPuzzle, cyPuzzle] = await findPuzzlePosition(page)
xPosition = xPosition + cx - cxPuzzle
yPosition = handle.y + handle.height / 2
await page.mouse.move(xPosition, yPosition, { steps: 5 })
await page.mouse.up()
await page.waitFor(3000)
// success!
await fs.unlink('./original.png')
await fs.unlink('./captcha.png')
await fs.unlink('./diff.png')
await fs.unlink('./puzzle.png')
}
async function start(){
const browser = await puppeteer.launch({
headless: false,
defaultViewport: { width: 1366, height: 768 },
args: [ '--proxy-server=x.x.x.x:xxx', '--disable-web-security',
'--disable-features=IsolateOrigins,site-per-process'],
})
const page = await browser.newPage()
await page.authenticate({
username: 'xxx',
password: 'xxx',
});
await page.goto('https://someurlwithcaptchainiframe.com', { waitUntil: 'networkidle2' })
await page.waitFor(1000)
await page.content();
try {
innerText = await page.evaluate(() => {
return JSON.parse(document.querySelector("body").innerText);
});
}
catch(err) {
console.log('BLOCKED, we want to solve captcha here..')
await page.waitFor(9000)
//frame = await page.mainFrame().childFrames()
//solveCaptcha(frame)
let captchaFrame // this will be populated later by our identified frame
for (const frame of page.mainFrame().childFrames()){
// Here you can use few identifying methods like url(),name(),title()
if (frame.url().includes('geo')){
console.log('we found the captcha iframe')
captchaFrame = frame
console.log('Frame URL: '+captchaFrame._url)
await solveCaptcha(captchaFrame)
// we assign this frame to myFrame to use it later
}
}
console.log('HERE..')
await page.waitFor(90000)
// solveCaptcha(frame)
}
//console.log("innerText now contains the JSON");
//console.log(innerText);
}
start()
You must use ".mouse" object from page instance, not from iframe, like this
if(await page.$('div[id="px-captcha"] > div > iframe[style*="khtml-user-select: none"]') != null){
const frameHandle = await page.$('div[id="px-captcha"] > div > iframe[style*="khtml-user-select: none"]')
const captcha_frame = await frameHandle.contentFrame()
if(await captcha_frame.$('div[aria-label*="Human Challenge"]') != null){
const example = await captcha_frame.$('div[aria-label*="Human Challenge"]')
const box = await example.boundingBox()
await page.mouse.move(box.x + box.width / 2, box.y + box.height / 2)
await page.mouse.down()
await page.waitForTimeout(5000)
await page.mouse.up()
}
}
I'm creating a script to take screenshots of Web pages with the puppeteer, I don't understand why on this site the image is saved with a width greater than that which I have set, 1920px.
If I have the fixed width of the browser, why does the screenshot come out with a greater width?
I would like to save the screenshot with a fixed width of 1920px and height based on the total content of the page.
The width of the saved image should be as wide as the width of the browser, why doesn't this happen?
const puppeteer = require('puppeteer');
const os = require('os');
const username = require('username');
//I identify the operating system and the architect of the CPU to run the Google Chrome Patch
var architetturaCPU = os.arch();
var sistemaOperativo = os.type();
console.log('System OS: '+sistemaOperativo+' '+architetturaCPU);
//console.log(os.platform());
// Device width and height
const device_width = 1920;
const device_height = 1080;
//Patch di Chrome
var systemPath = '';
if (sistemaOperativo == 'Darwin'){
console.log('Chrome for MacOS');
var systemPath = '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome';
} else
if(sistemaOperativo == 'Windows_NT' && architetturaCPU == 'x64'){
console.log('Chrome for Windows 64bit');
var systemPath = 'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe';
}else
if(sistemaOperativo == 'Windows_NT' && architetturaCPU == 'x32'){
console.log('Chrome for Windows 32bit');
var systemPath = 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe';
}else
if(sistemaOperativo == 'Windows_NT' && architetturaCPU == 'ia32'){
console.log('Chrome for Windows 32bit');
var systemPath = 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe';
}
//I build an Array and insert all the buttons for the consent of the Cookies for the Network sites
const btncookie1 = 'button.cs-close-btn';
const btncookie2 = 'button.cs-accept-btn.cs-btn-primary';
var BtnCookie = [
btncookie1,
btncookie2
];
(async function () {
//I read the url file
var fs = require('fs');
var urlArray = fs.readFileSync('url-list.js').toString().split("\n").filter(a => a);
//Launch Puppeteer
const browser = await puppeteer.launch({
headless: true,
executablePath: systemPath,
args: ['--disable-dev-shm-usage','--no-sandbox','--window-size=1920,1080'],
defaultViewport: null
});
//Loop through all the url-list.js URL
var contaUrl = 0;
for(var i = 0; i < urlArray.length; i++){
//Check if empty spaces are present in the url file list
if (urlArray[i].indexOf("http") != '-1'){
//I open the boswser, delete the cache and set the page size
const page = await browser.newPage();
const client = await page.target().createCDPSession();
await client.send('Network.clearBrowserCookies');
await client.send('Network.clearBrowserCache');
await page.setCacheEnabled(false);
await page.setViewport({width: device_width, height: device_height});
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36');
//Tell me which URL you are working on
console.log(' ');
console.log('\x1b[33m%s','Open URL > '+urlArray[i],'\x1b[0m');
console.log(' ');
await page.goto(urlArray[i],{waitUntil:'networkidle2'});
await page.waitFor(20000);
//Find the class / id of the button on the page to accept cookies
var contaNumeroValoriBtnCookie = BtnCookie.length;
for(var n = 0; n <= BtnCookie.length; n++){
if (await page.$(BtnCookie[n]) !== null ) {
console.log(BtnCookie[n]);
const navigationPromise = page.waitForSelector(BtnCookie[n]);
await page.click(BtnCookie[n]);
await navigationPromise;
console.log('\x1b[32m%s', 'Bypass Cookie... OK!','\x1b[0m');
break;
}else if (n == contaNumeroValoriBtnCookie) {
console.log('\x1b[31m%s', 'Cookie not found!','\x1b[0m');
}else {
//console.log('I'm looking for the cookie...');
}
} //end - Find the class / id of the button on the page to accept cookies
//Scroll the entire page to load the content
await autoScroll(page);
async function autoScroll(page){
await page.evaluate(async () => {
await new Promise((resolve, reject) => {
var totalHeight = 0;
var distance = 100;
var timer = setInterval(() => {
var scrollHeight = document.body.scrollHeight;
window.scrollBy(0, distance);
totalHeight += distance;
if(totalHeight >= scrollHeight){
clearInterval(timer);
resolve();
}
}, 300);
});
});
}
//Go back to the top of the page
await page.evaluate(_ => {window.scrollTo(0, 0);});
await page.waitFor(10000);
//I clean up the URL before saving the file
var str = urlArray[i];
str = str.replace(/[^\w]+/ig,'-');
var convertiUrl = str;
//SAVE screenshot
await page.screenshot({path: './screenshot/'+convertiUrl+i+'.jpg', fullPage: true});
await page.waitFor(5000);
await page.close();
}//end if (urlArray[i].indexOf("http") != '-1'){
}//end loop
browser.close();
console.log(' ');
console.log('\x1b[32m%s', contaUrl+' all screenshot saved :)','\x1b[0m');
console.log(' ');
})(); //end script
Try to add these line to resize viewport after the page.goto method:
...
await page.goto(urlArray[i],{timeout: 0, waitUntil:'networkidle2'});
await page.waitFor(20000);
await page.setViewport({
width: 1920,
height: 1080,
});
//Find the class / id of the button on the page to accept cookies
var contaNumeroValoriBtnCookie = BtnCookie.length;
...
if (!("indexedDB" in window)) {
console.warn("IndexedDB not supported");
return;
}
log("db created ",data.product_detail)
const dbName = "gt_form";
const storeName = "product_store";
const version = 1; //versions start at 1
const db = await openDB(dbName, version, {
upgrade(db, oldVersion, newVersion, transaction) {
const store = db.createObjectStore(storeName);
}
});
const tx = await db.transaction(storeName, "readwrite");
const store = await tx.objectStore(storeName);
its working fine on mozila and chrome but don't know why rest of the browser is not supporting ven chrome incognito mode also