I've been making incremental progress, but I'm fairly stumped at this point.
This is the site I'm trying to download from: https://www.transtats.bts.gov/OT_Delay/OT_DelayCause1.asp
The reason I'm using Puppeteer is that I can't find a supported API to get this data (if there is one, I'm happy to try it).
The link is "Download Raw Data".
My script runs to the end, but doesn't seem to actually download any files. I tried installing puppeteer-extra and setting the downloads path:
const puppeteer = require("puppeteer-extra");
const { executablePath } = require("puppeteer");
...
const dir = "/home/ubuntu/AirlineStatsFetcher/downloads";
console.log("dir to set for downloads", dir);
puppeteer.use(
  require("puppeteer-extra-plugin-user-preferences")({
    userPrefs: {
      download: {
        prompt_for_download: false,
        open_pdf_in_system_reader: true,
        default_directory: dir,
      },
      plugins: {
        always_open_pdf_externally: true,
      },
    },
  })
);
const browser = await puppeteer.launch({
  headless: true,
  slowMo: 100,
  executablePath: executablePath(),
});
...
// Doesn't seem to work
await page.waitForSelector('table > tbody > tr > .finePrint:nth-child(3) > a:nth-child(2)');
console.log('Clicking on link to download CSV');
await page.click('table > tbody > tr > .finePrint:nth-child(3) > a:nth-child(2)');
After a while I figured why not try to build the full URL and then do a GET request, but then I ran into other problems (UNABLE_TO_VERIFY_LEAF_SIGNATURE). Before going down this route further (which feels a little hacky), I wanted to ask for advice here.
Is there something I'm missing in terms of configuration to get it to download?
Downloading files with Puppeteer seems to be a moving target, by the way, and is not well supported today. For now (Puppeteer 19.2.2) I would go with https.get instead.
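For completeness, the CDP-based download configuration that often gets suggested is sketched below; it has been unreliable across Puppeteer versions, which is why the full example that follows sticks with https.get:

// Sketch only: route Chromium's downloads to a directory over CDP.
// Page.setDownloadBehavior is deprecated but still commonly used.
const client = await page.target().createCDPSession();
await client.send("Page.setDownloadBehavior", {
  behavior: "allow",
  downloadPath: "/home/ubuntu/AirlineStatsFetcher/downloads",
});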
"use strict";
const fs = require("fs");
const https = require("https");
// Not sure why puppeteer-extra is used... maybe https://stackoverflow.com/a/73869616/1258111 solves the need in future.
const puppeteer = require("puppeteer-extra");
const { executablePath } = require("puppeteer");
(async () => {
puppeteer.use(
require("puppeteer-extra-plugin-user-preferences")({
userPrefs: {
download: {
prompt_for_download: false,
open_pdf_in_system_reader: false,
},
plugins: {
always_open_pdf_externally: false,
},
},
})
);
const browser = await puppeteer.launch({
headless: true,
slowMo: 100,
executablePath: executablePath(),
});
const page = await browser.newPage();
await page.goto(
"https://www.transtats.bts.gov/OT_Delay/OT_DelayCause1.asp ",
{
waitUntil: "networkidle2",
}
);
const handle = await page.$(
"table > tbody > tr > .finePrint:nth-child(3) > a:nth-child(2)"
);
const relativeZipUrl = await page.evaluate(
(anchor) => anchor.getAttribute("href"),
handle
);
const url = "https://www.transtats.bts.gov/OT_Delay/".concat(relativeZipUrl);
const encodedUrl = encodeURI(url);
//Don't use in production
https.globalAgent.options.rejectUnauthorized = false;
https.get(encodedUrl, (res) => {
const path = `${__dirname}/download.zip`;
const fileStream = fs.createWriteStream(path);
res.pipe(fileStream);
fileStream.on("finish", () => {
fileStream.close();
console.log("Download Completed");
});
});
await browser.close();
})();
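If you would rather not disable certificate verification (the rejectUnauthorized workaround above), a sketch of one alternative is to pass the site's CA chain to https.get yourself; the file name bts-ca-chain.pem here is hypothetical, you would need to export the actual chain (for example from your browser):

// Sketch: trust an exported CA chain instead of disabling verification.
// "bts-ca-chain.pem" is a hypothetical file you export yourself.
const ca = fs.readFileSync("bts-ca-chain.pem");
https.get(encodedUrl, { ca }, (res) => {
  // ...same piping logic as above...
});

Node also honors the NODE_EXTRA_CA_CERTS environment variable, which achieves the same thing without code changes.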
I am trying to maximize the browser window using Playwright. I tried the code below, but the browser is not maximized to full size.
hooks.cjs file:
const playwright = require('playwright');
const { BeforeAll, Before, After, AfterAll, Status } = require('@cucumber/cucumber');
// Launch options.
const options = {
slowMo: 100,
ignoreHTTPSErrors: true,
};
// Create a global browser for the test session.
BeforeAll(async () => {
console.log('Launch Browser');
global.browser = await playwright['chromium'].launch({
headless: false,
args: ['--window-size=1920,1040'], // also tried --start-maximized and defaultViewport: null
});
//global.context = await playwright['chromium'].launch({ args: ['--start-maximized'] });
})
// Create a new browser context for each test.
Before(async () => {
console.log('Create a new Context and Page')
global.context = await global.browser.newContext()
global.page = await global.context.newPage()
})
// Close the page and context after each test.
After(async () => {
console.log('Close Context and Page');
await global.page.close();
await global.context.close();
})
// Take Screenshot if Scenario Fails
After(async function (scenario) {
if (scenario.result.status === Status.FAILED) {
const buffer = await global.page.screenshot({ path: `reports/${scenario.pickle.name}.png`, fullPage: true })
this.attach(buffer, 'image/png');
}
})
// Close Browser
AfterAll(async () => {
console.log('close Browser')
await global.browser.close()
});
I'm running with npm: npm run test -- --tags "@Begin"
I have tried many ways, but I cannot launch the browser maximized. How can I do this?
If you want to start your browser maximized, use the --start-maximized flag when launching the browser, and disable the fixed viewport when creating a context or page. Example:
global.browser = await playwright['chromium'].launch({
headless: false,
args: ['--start-maximized'],
});
global.context = await global.browser.newContext({ viewport: null });
In Python, use the equivalent no_viewport argument and pass it as True when creating a context.
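Applied to the hooks.cjs from the question, a minimal sketch of the change would be:

// Launch maximized and let pages take the real window size.
BeforeAll(async () => {
  global.browser = await playwright['chromium'].launch({
    headless: false,
    args: ['--start-maximized'],
  });
});

Before(async () => {
  // viewport: null disables Playwright's fixed default viewport.
  global.context = await global.browser.newContext({ viewport: null });
  global.page = await global.context.newPage();
});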
I am trying to get company data from this website called SimilarWeb, but after making a lot of requests it recognizes my script as a bot. Is there any way to bypass this check? Or can you suggest another website where the data is easy to scrape? We can't use LinkedIn, by the way.
const puppeteer = require("puppeteer");
const searchCompany = "zoominfo.com";
const Link = `https://www.similarweb.com/website/${searchCompany}/#overview`;
// console.log(companyPage);
let page;
(async function () {
try {
let browserOpen = await puppeteer.launch({
headless: false,
// dumpio: true,
// args: ["--start-maximized"],
defaultViewport: null,
});
let newTab = await browserOpen.newPage();
await newTab.goto(Link);
await newTab.screenshot({ path: "sc.png" });
await newTab.waitForSelector(".data-company-info__row");
let ans = await newTab.evaluate(() => {
let name = document.querySelectorAll(".data-company-info__row")[0]
.textContent;
let location = document.querySelectorAll(".data-company-info__row")[3]
.textContent;
let industry = document.querySelectorAll(".data-company-info__row")[5]
.textContent;
// console.log(ans);
return { name, location, industry };
});
console.log(ans);
await browserOpen.close();
} catch (err) {
console.log(err);
}
})();
Just out of curiosity - what do you use similarweb data for?
You can try using https://github.com/bda-research/node-crawler, which has delay and max-connections parameters.
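A minimal sketch of that, assuming the rateLimit and maxConnections options from the node-crawler README (note that it fetches raw HTML and does not execute JavaScript):

const Crawler = require('crawler');

const c = new Crawler({
  maxConnections: 1, // the docs force this to 1 when rateLimit is set
  rateLimit: 2000,   // wait ~2s between requests to look less bot-like
  callback: (error, res, done) => {
    if (error) {
      console.error(error);
    } else {
      // res.$ is a cheerio handle over the response body
      console.log(res.$('title').text());
    }
    done();
  },
});

c.queue('https://www.similarweb.com/website/zoominfo.com/#overview');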
I am trying to download a website as a static copy, I mean without JS, only HTML & CSS.
I've tried many approaches, yet some issues are still present regarding CSS and images.
A snippet:
const puppeteer = require('puppeteer');
const {URL} = require('url');
const fse = require('fs-extra');
const path = require('path');
(async (urlToFetch) => {
  const browser = await puppeteer.launch({
    headless: true,
    slowMo: 100
  });
  const page = await browser.newPage();

  // Drop scripts so the saved copy is JS-free.
  await page.setRequestInterception(true);
  page.on("request", request => {
    if (request.resourceType() === "script") {
      request.abort()
    } else {
      request.continue()
    }
  })

  // Save every response (HTML, CSS, images) under ./output.
  page.on('response', async (response) => {
    // Redirect responses have no body; response.buffer() throws for them.
    if (response.status() >= 300 && response.status() < 400) {
      return;
    }
    const url = new URL(response.url());
    let filePath = path.resolve(`./output${url.pathname}`);
    if (path.extname(url.pathname).trim() === '') {
      filePath = `${filePath}/index.html`;
    }
    await fse.outputFile(filePath, await response.buffer());
    console.log(`File ${filePath} is written successfully`);
  });

  await page.goto(urlToFetch, {
    waitUntil: 'networkidle2'
  })

  setTimeout(async () => {
    await browser.close();
  }, 60000 * 4)
})('https://stackoverflow.com/');
I've tried using
content = await page.content();
fs.writeFileSync('index.html', content, { encoding: 'utf-8' });
as well as downloading it using CDPSession.
I've also tried website-scraper.
So what is the best approach to get to a solution where I provide a website link and it downloads it as a static website?
Try using https://www.npmjs.com/package/website-scraper
It will save the website into a local directory.
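A minimal sketch of its usage, based on the options in the package README (recent major versions are ESM-only, so depending on your version you may need import instead of require):

const scrape = require('website-scraper');

(async () => {
  await scrape({
    urls: ['https://stackoverflow.com/'],
    directory: './output-static', // must not exist yet; the library creates it
  });
})();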
Have you tried something like wget or curl?
wget -p https://stackoverflow.com/questions/67559777/download-website-locally-without-javascript-using-puppeteer
Should do the trick
I have a list of URLs that need to be scraped from a website that uses React; for this reason I am using Puppeteer.
I do not want to be blocked by anti-bot servers; for this reason I have added puppeteer-extra-plugin-stealth.
I want to prevent ads from loading on the pages, so I am blocking ads using puppeteer-extra-plugin-adblocker.
I also want to prevent my IP address from being blacklisted, so I have used TOR nodes to get different IP addresses.
Below is a simplified version of my code, and the setup works (TOR_port and webUrl are assigned dynamically, but to simplify my question I have assigned them as variables).
There is a problem though:
const puppeteer = require('puppeteer-extra');
const _StealthPlugin = require('puppeteer-extra-plugin-stealth');
const _AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');
puppeteer.use(_StealthPlugin());
puppeteer.use(_AdblockerPlugin());
var TOR_port = 13931;
var webUrl ='https://www.zillow.com/homedetails/2861-Bass-Haven-Ln-Saint-Augustine-FL-32092/47739703_zpid/';
const browser = await puppeteer.launch({
dumpio: false,
headless: false,
args: [
`--proxy-server=socks5://127.0.0.1:${TOR_port}`,
`--no-sandbox`,
],
ignoreHTTPSErrors: true,
});
try {
const page = await browser.newPage();
await page.setViewport({ width: 1280, height: 720 });
await page.goto(webUrl, {
waitUntil: 'load',
timeout: 30000,
});
page
.waitForSelector('.price')
.then(async () => {
console.log('The price is available');
await browser.close();
})
.catch(() => {
// close this since it is clearly not a zillow website
throw new Error('This is not the zillow website');
});
} catch (e) {
await browser.close();
}
The above setup works but is very unreliable, and I recently learnt about Puppeteer-Cluster. I need it to help me manage crawling multiple pages and to track my scraping tasks.
So, my question is: how do I implement Puppeteer-Cluster with the above set-up? I am aware of an example (https://github.com/thomasdondorf/puppeteer-cluster/blob/master/examples/different-puppeteer-library.js) offered by the library to show how you can implement plugins, but it is so bare that I didn't quite understand it.
How do I implement Puppeteer-Cluster with the above TOR, AdBlocker, and Stealth configurations?
You can just hand over your puppeteer instance like the following:
const puppeteer = require('puppeteer-extra');
const _StealthPlugin = require('puppeteer-extra-plugin-stealth');
const _AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');
puppeteer.use(_StealthPlugin());
puppeteer.use(_AdblockerPlugin());
const { Cluster } = require('puppeteer-cluster');
const cluster = await Cluster.launch({
puppeteer,
});
Src: https://github.com/thomasdondorf/puppeteer-cluster#clusterlaunchoptions
You can just add the plugins with puppeteer.use()
You have to use puppeteer-extra.
const { addExtra } = require("puppeteer-extra");
const vanillaPuppeteer = require("puppeteer");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
const RecaptchaPlugin = require("puppeteer-extra-plugin-recaptcha");
const { Cluster } = require("puppeteer-cluster");
(async () => {
const puppeteer = addExtra(vanillaPuppeteer);
puppeteer.use(StealthPlugin());
puppeteer.use(RecaptchaPlugin());
// Do stuff
})();
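Putting it together with the TOR and launch arguments from the question, a rough sketch (untested; names taken from your snippet) could look like this:

const { addExtra } = require('puppeteer-extra');
const vanillaPuppeteer = require('puppeteer');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');
const { Cluster } = require('puppeteer-cluster');

(async () => {
  const puppeteer = addExtra(vanillaPuppeteer);
  puppeteer.use(StealthPlugin());
  puppeteer.use(AdblockerPlugin());

  const TOR_port = 13931;
  const cluster = await Cluster.launch({
    puppeteer, // hand the plugin-wrapped instance to the cluster
    maxConcurrency: 2,
    puppeteerOptions: {
      headless: false,
      args: [`--proxy-server=socks5://127.0.0.1:${TOR_port}`, `--no-sandbox`],
      ignoreHTTPSErrors: true,
    },
  });

  // Each queued URL runs through this task on a pooled page.
  await cluster.task(async ({ page, data: url }) => {
    await page.goto(url, { waitUntil: 'load', timeout: 30000 });
    await page.waitForSelector('.price');
    console.log('The price is available');
  });

  cluster.queue('https://www.zillow.com/homedetails/2861-Bass-Haven-Ln-Saint-Augustine-FL-32092/47739703_zpid/');
  await cluster.idle();
  await cluster.close();
})();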
I am currently trying to find the number of pages in a single PDF, and the total size of the PDF file created by Puppeteer's page.pdf, as per a requirement.
Here's what I did:
try {
const generatedPdfFilePath = `${directory}/feedback-${requestId}.pdf`;
const htmlFilePath = `${directory}/report-${requestId}.html`;
const htmlTemplate =
fs.readFileSync(path.join(process.cwd(), '/data/feedback-template.hbs'), 'utf-8');
const template = handlebars.compile(htmlTemplate);
const htmlFile = minify(template(data), {
collapseWhitespace: true,
});
fs.writeFileSync(htmlFilePath, htmlFile);
const options = {
format: 'A4',
printBackground: true,
path: generatedPdfFilePath,
};
const browser = await puppeteer.launch({
args: ['--no-sandbox'],
headless: true,
});
const page = await browser.newPage();
await page.goto(`file://${htmlFilePath}`, {
waitUntil: 'networkidle0',
timeout: 300000,
});
await page.pdf(options);
// Do something here to find number of pages in this pdf
await browser.close();
resolve({ file: generatedPdfFilePath });
} catch (error) {
console.log(error);
reject(error);
}
So far, what I have done is created an HTML template for the PDF, then used Puppeteer (headless Chrome for Node.js) to generate the required PDF of the page. But now I'm sort of stuck, because I want to know how many pages are actually in this PDF file, or in other words what the size of the PDF is, which I need in further calculations. I have only mentioned the relevant code here for ease.
Also, I'm pretty new to Puppeteer. Can someone explain how I can get the details of this PDF? I have been searching for quite some time now with no luck. Puppeteer's documentation isn't helping either; there are no details on why we do what we do. All I get is the details on the pdf options (docs).
Any help would be much appreciated.
You can use the pdf-parse node module, like this:
const fs = require('fs');
const pdf = require('pdf-parse');
let dataBuffer = fs.readFileSync('path to PDF file...');
pdf(dataBuffer).then(function(data) {
// number of pages
console.log(data.numpages);
});
Your code would become something like:
const pdf = require('pdf-parse');
try {
const generatedPdfFilePath = `${directory}/feedback-${requestId}.pdf`;
const htmlFilePath = `${directory}/report-${requestId}.html`;
const htmlTemplate =
fs.readFileSync(path.join(process.cwd(), '/data/feedback-template.hbs'), 'utf-8');
const template = handlebars.compile(htmlTemplate);
const htmlFile = minify(template(data), {
collapseWhitespace: true,
});
fs.writeFileSync(htmlFilePath, htmlFile);
const options = {
format: 'A4',
printBackground: true,
path: generatedPdfFilePath,
};
const browser = await puppeteer.launch({
args: ['--no-sandbox'],
headless: true,
});
const page = await browser.newPage();
await page.goto(`file://${htmlFilePath}`, {
waitUntil: 'networkidle0',
timeout: 300000,
});
await page.pdf(options);
// Count the pages in the generated PDF
let dataBuffer = fs.readFileSync(generatedPdfFilePath);
const pdfInfo = await pdf(dataBuffer);
const numPages = pdfInfo.numpages;
await browser.close();
resolve({ file: generatedPdfFilePath });
} catch (error) {
console.log(error);
reject(error);
}
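For the "total size" part of the question, a minimal sketch using Node's built-in fs module (no extra dependency needed):

// Size in bytes of the PDF that page.pdf() wrote to disk.
const { size } = fs.statSync(generatedPdfFilePath);
console.log(`PDF size: ${size} bytes`);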