How to download file with direct link Puppeteer? - javascript

I need to download a image with puppeteer. the problem here, buffer return by goto method. i think it return sequence as the image load. so writeFile method only get the last buffer. Is there other promise method to tackle sequence buffer?
const puppeteer = require('puppeteer-core');
const fs = require('fs').promises;
(async () => {
const options = {
product: 'chrome',
headless: true,
pipe: true,
executablePath: 'chrome.exe'
};
const browser = await puppeteer.launch(options);
const page = await browser.newPage();
const response = await page.goto('https://static.wikia.nocookie.net/naruto/images/d/dd/Naruto_Uzumaki%21%21.png/revision/latest?cb=20161013233552');
// save buffer to file
await fs.writeFile('file.jpg', await response.buffer());
browser.close();
})();

Download with puppeteer is an hell...
for simple downloading in a folder on you pc:
await page._client.send('Page.setDownloadBehavior', {behavior: 'allow',
downloadPath: 'C:\folder...'}); // this set the destionation of file dowloaded by pup.
// then page.goto ....
This is a full-example: you can use to dowload any kind of resources, also if a login is required :-)
const puppeteer = require('puppeteer');
const fs = require('fs').promises;
(async () => {
let browser = await puppeteer.launch();
let page = await browser.newPage();
await page.goto('https://static.wikia.nocookie.net')
// ...eventually login
let fn = async (URI) => {
// fetch the page from inside the html page
const res = await fetch(URI, {'credentials': 'same-origin'});
// attachment; filename="...."; size="1234"
let str = res.headers.get('Content-Disposition');
const regex = /filename="([^"]*)".*size="([^"]*)"/gm;
let m = regex.exec(str);
let filename = m?m[1]:null;
let size = m?m[2]:null;
let blob = await res.blob()
let bufferArray = await blob.arrayBuffer();
var base64String = btoa([].reduce.call(new Uint8Array(bufferArray),function(p,c){return p+String.fromCharCode(c)},''))
return {base64String, size, filename};
}
let x = await page.evaluate(fn, 'https://static.wikia.nocookie.net/naruto/images/d/dd/Naruto_Uzumaki%21%21.png/revision/latest?cb=20161013233552')
// x.base64String <- buffer, you can write
// x.filename <- name of file downloaded
// x.size <- size
// console.log(x.blob)
await fs.writeFile('message1.png', x.base64String, {encoding: "base64"})
console.log('ok!')
})().then();

Related

I want to get the urls of each home from the attribute content

const puppeteer = require("puppeteer");
const cheerio = require("cheerio");
const url = "https://www.airbnb.co.in/s/Haridwar--Uttarakhand/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&price_filter_input_type=0&price_filter_num_nights=5&l2_property_type_ids%5B%5D=1&search_type=autocomplete_click&query=Haridwar%2C%20Uttarakhand&place_id=ChIJyVfuuA5HCTkR8_VApnaRRE4&date_picker_type=calendar&source=structured_search_input_header";
async function scrapHomesPage(url)
{
try
{
const browser = await puppeteer.launch({headless:false});
const page = await browser.newPage();
await page.goto(url);
const html = await page.evaluate(()=> document.body.innerHTML);
const $ = cheerio.load(html);
const homes = $('[itemprop="url"]').map((i, element) => $(element).attr("content")).get();
console.log(homes);
}
catch(err)
{
console.error(err);
}
}
scrapHomesPage("https://www.airbnb.co.in/s/Haridwar--Uttarakhand/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&price_filter_input_type=0&price_filter_num_nights=5&l2_property_type_ids%5B%5D=1&search_type=autocomplete_click&query=Haridwar%2C%20Uttarakhand&place_id=ChIJyVfuuA5HCTkR8_VApnaRRE4&date_picker_type=calendar&source=structured_search_input_header");
I tried to add everything I could to wait for the page to load all the contents. I tried wait for selectors etc. I am always getting an empty array instead I should get an array with all the links of each home listed on the Airbnb site for that particular location.
I don't see any reason to use Cheerio here. It's just another layer of indirection to get the data you want, involving an extra dependency, a whole second parse of the page and the potential for bugs when the page goes out of sync with the HTML snapshot you've created. If you do need to use it, you can use page.content() instead of page.evaluate(() => document.body.innerHTML).
As for the main problem, you appear to be missing a call to page.waitForSelector:
const puppeteer = require("puppeteer"); // ^19.0.0
const url = "your url";
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.goto(url, {waitUntil: "domcontentloaded"});
await page.waitForSelector('[itemprop="url"]');
const content = await page.$$eval(
'[itemprop="url"]',
els => els.map(el => el.getAttribute("content"))
);
console.log(content);
})()
.catch(err => console.error(err))
.finally(() => browser?.close());

How to upload a random file from Directory then delete it Puppeteer?

So I have a directory with images: C:\Users\username\Desktop\mpimages
I need 1 random image to be uploaded then deleted from the folder.
This is the class which is clicked which brings up the file browser window to upload file
< div class="om3e55n1" >
I've found this solution online but it's not working:
fs.readdirSync is not a function.
const rnd = (arr) => arr[Math.floor(Math.random() * arr.length)];
const rndFile = () => `${dir}/${rnd(fs.readdirSync(dir))}`;
(async () => {
const browser = await puppeteer.launch({headless: false});
const page = await browser.newPage();
await page.goto("YOUR/UPLOAD/URL")
const file = rndFile();
const elementHandle = await page.$("input[type=file]");
await elementHandle.uploadFile(file);
await page.click("input[type=submit]")
fs.unlinkSync(file)
})();

How to get the first link under a ul tag using Puppeteer?

I am trying to get the link of the latest house posting in a real estate website.
This is the code I have written til now
const puppeteer = require("puppeteer");
const link =
"https://www.daft.ie/property-for-rent/dublin-4-dublin?radius=5000&numBeds_from=2&numBeds_to=3&sort=publishDateDesc";
(async () => {
const browser = await puppeteer.launch({
headless: false,
defaultViewport: null,
});
const page = await browser.newPage();
await page.goto(link);
const elements = await page.$x("//button[normalize-space()='Accept All']");
await elements[0].click();
// const handle = await page.waitForXPath("//ul[#data-testid='results']");
// const yourHref = await page.evaluate(
// (anchor) => anchor.getAttribute("href"),
// handle
// );
const hrefs1 = await page.evaluate(() =>
Array.from(document.querySelectorAll("a[href]"), (a) =>
a.getAttribute("href")
)
);
console.log(hrefs1);
await browser.close();
})();
However, this code is to get all the href links on the target page.
HTML code of the page:
It is easier to read the code from the picture than if I paste the code, thats why I attached an image.
As you can see under ul tag with data-testid=results there are many li tags inside which there is a a href, I wish to extract the link from this and that too only the top most li link as it will newest house posting.
How can I do this?
Expected output - I just want the first link under li tag. In the picture above, the output would be
/for-rent/house-glencloy-road-whitehall-dublin-9/4072150
Following up on the comment chain, the selector '[data-testid="results"] a[href]' should give the first result href.
const puppeteer = require("puppeteer"); // ^16.2.0
let browser;
(async () => {
browser = await puppeteer.launch({headless: false});
const [page] = await browser.pages();
const url =
"https://www.daft.ie/property-for-rent/dublin-4-dublin?radius=5000&numBeds_from=2&numBeds_to=3&sort=publishDateDesc";
await page.goto(url, {waitUntil: "domcontentloaded"});
const xp = "//button[normalize-space()='Accept All']";
const cookiesBtn = await page.waitForXPath(xp);
await cookiesBtn.click();
const el = await page.waitForSelector('[data-testid="results"] a[href]');
console.log(await el.evaluate(el => el.getAttribute("href")));
})()
.catch(err => console.error(err))
.finally(() => browser?.close())
;
If you want all of the result hrefs, try:
const allHrefs = await page.$$eval(
'[data-testid="results"] a[href]',
els => els.map(e => e.getAttribute("href"))
);
Note that the data is available statically, so you could just use fetch (native on Node 18+) and Cheerio which is faster and probably more reliable, assuming there's no detection issues (and you could add a user-agent and take other counter-measures if there are):
const cheerio = require("cheerio"); // 1.0.0-rc.12
const url = "https://www.daft.ie/property-for-rent/dublin-4-dublin?radius=5000&numBeds_from=2&numBeds_to=3&sort=publishDateDesc";
fetch(url).then(res => res.text()).then(html => {
const $ = cheerio.load(html);
const sel = '[data-testid="results"] a[href]';
console.log($(sel).attr("href"));
// or all:
console.log([...$(sel)].map(e => e.attribs.href));
});
On my slow machine this took 3.5 seconds versus 30 seconds for headful Puppeteer and 15-20 seconds for headless Puppeteer depending on cache warmth.
Or, if you are using Puppeteer for whatever reason, you could block all the requests, JS and images to speed things up dramatically. Your default await page.goto(link); waits for the load event, which is content you may not need.

When I try to web scrape a dynamic website, i get back an empty array

I am trying to web scrape a dynamic website with puppeteer, using this code:
const puppeteer = require('puppeteer');
async function getTokoPedia(){
const browser = await puppeteer.launch({ headless: false }); // for test disable the headlels mode,
const page = await browser.newPage();
await page.setViewport({ width: 1000, height: 926 });
await page.goto("https://store.401games.ca/collections/pokemon-singles",{waitUntil: 'networkidle2'});
console.log("start evaluate javascript")
var productNames = await page.evaluate(()=>{
var div = document.querySelectorAll('.info-container');
console.log(div) // console.log inside evaluate, will show on browser console not on node console
var productnames = []
div.forEach(element => {
var price = element.querySelector(' .fs-result-page-3sdl0h')
if(price != null){
productnames.push(price.innerText);
}
});
return productnames
})
console.log(productNames)
browser.close()
}
getTokoPedia();
However, upon running it, I get back an empty array. How can I fix this?
Two problems:
The elements you want are in a shadow root, so you have to pierce the root as described in Puppeteer not giving accurate HTML code for page with shadow roots.
The cards lazy-load, so you'd have to scroll down to be able to populate their data into the DOM.
But there's an easier way to get the initial set of data, which is in the static HTML as a JSON blob in var meta = {"products":...};. You can scrape it with a regex, as described in this tutorial.
Here's an example showing both approaches:
const puppeteer = require("puppeteer"); // ^14.1.1
let browser;
(async () => {
browser = await puppeteer.launch({headless: true});
const [page] = await browser.pages();
const url = "https://store.401games.ca/collections/pokemon-singles";
await page.goto(url, {waitUntil: "domcontentloaded"});
// here's the hard way for illustration:
const el = await page.waitForSelector("#fast-simon-serp-app");
await page.waitForFunction(({shadowRoot}) =>
shadowRoot.querySelector(".product-card .title")
, {}, el);
const items = await el.evaluate(({shadowRoot}) =>
[...shadowRoot.querySelectorAll(".product-card")]
.map(e => ({
title: e.querySelector(".title")?.textContent,
price: e.querySelector(".price")?.textContent,
}))
);
console.log(items); // just the first 6 or so
// TODO scroll the page to get the rest;
// I didn't bother implementing that...
// ...or do it the easy way:
const html = await page.content();
const pat = /^[\t ]*var meta = ({"products":[^\n]+);$/m;
const data = JSON.parse(html.match(pat)[1]);
console.log(JSON.stringify(data, null, 2));
})()
.catch(err => console.error(err))
.finally(() => browser?.close())
;
At this point, since we're not dealing with anything but the static HTML, you can dump Puppeteer and use axios or fetch to get the data more efficiently:
const axios = require("axios");
axios.get("https://store.401games.ca/collections/pokemon-singles")
.then(({data: body}) => {
const pat = /^[\t ]*var meta = ({"products":[^\n]+);$/m;
const data = JSON.parse(body.match(pat)[1]);
console.log(JSON.stringify(data, null, 2));
})
.catch(err => console.error(err))
;
Now, the data.products array contains 50 but the UI shows 26466 results. If you want more than those initial items from the static HTML's var meta, which appears to be the same on all 1000+ pages, I suggest using the API. A URL looks like https://ultimate-dot-acp-magento.appspot.com/categories_navigation?request_source=v-next&src=v-next&UUID=d3cae9c0-9d9b-4fe3-ad81-873270df14b5&uuid=d3cae9c0-9d9b-4fe3-ad81-873270df14b5&store_id=17041809&cdn_cache_key=1654217982&api_type=json&category_id=269055623355&facets_required=1&products_per_page=5000&page_num=1&with_product_attributes=true. You can see there are ids and keys that probably protect against usage by parties other than the site, but I didn't see any change other than cdn_cache_key after a few tries. I'm not sure how long a URL is valid, but while it is, you can set products_per_page=1000 for example, then move page_num=1 forward 27 times or so. This gets you all of the data while avoiding all of the difficulties of scraping from the page itself.
Here's a pessimistic approach that uses Puppeteer to get an up-to-date URL, in case a URL goes stale:
const axios = require("axios");
const puppeteer = require("puppeteer"); // ^14.1.1
let browser;
(async () => {
browser = await puppeteer.launch({headless: true});
const [page] = await browser.pages();
const url = "https://store.401games.ca/collections/pokemon-singles";
const reqP = page.waitForRequest(res =>
res.url()
.startsWith("https://ultimate-dot-acp-magento.appspot.com/categories_navigation")
);
await page.goto(url, {waitUntil: "domcontentloaded"});
const req = await reqP;
const apiUrl = req
.url()
.replace(/(?<=products_per_page=)(\d+)/, 1000);
const {data} = await axios.get(apiUrl);
console.log(JSON.stringify(data, null, 2));
})()
.catch(err => console.error(err))
.finally(() => browser?.close())
;
And tossing in the loop:
const axios = require("axios");
const fs = require("fs").promises;
const puppeteer = require("puppeteer"); // ^14.1.1
let browser;
(async () => {
browser = await puppeteer.launch({headless: true});
const [page] = await browser.pages();
const url = "https://store.401games.ca/collections/pokemon-singles";
const reqP = page.waitForRequest(res =>
res.url()
.startsWith("https://ultimate-dot-acp-magento.appspot.com/categories_navigation")
);
await page.goto(url, {waitUntil: "domcontentloaded"});
const req = await reqP;
const apiUrl = req
.url()
.replace(/(?<=products_per_page=)(\d+)/, 1000);
const items = [];
for (let i = 1;; i++) {
const pageUrl = apiUrl.replace(/(?<=page_num=)(\d+)/, i);
const response = await axios.get(pageUrl);
if (response.status !== 200 ||
items.length >= response.data.total_results) {
break;
}
items.push(...response.data.items);
}
await fs.writeFile("data.json", JSON.stringify(items));
console.log(items.slice(0, 10));
console.log(items.length);
})()
.catch(err => console.error(err))
.finally(() => browser?.close())
;
This hammers the site, pulling a ton of data in a short amount of time, so consider this script for educational purposes, or modify it to throttle your requests way back.

Exporting result from async function in separate js file, importing result in another javascript

Trying to build a small scraper. To reuse functionality I thought 'Page Object Models' would come in handy.
In main.js I require multiple small scripts, in the example below there is only one model (GooglePage).
The scripts work. But I would like to know how to pass a value from the google.js script back to the main script.
I want to use the value of the 'pageCountClean' variable in the main.js script to use in the rest of the application.
Have been searching for information about passing values and functions between scripts. For exporting values from pageconstructors, for promise await export function.
But I am lost. Do I have to use Promises?, is the current way of require/importing and exporting enough to create the relationship between the scripts?
Any pointers are welcome.
//////////// main.js
const { chromium } = require('playwright');
const { GooglePage } = require('./models/Google');
(async () => {
const browser = await chromium.launch({ headless: true, slowMo: 250 });
const context = await browser.newContext();
const GoogleUrl80 = https://www.google.nl/search?q=site%3Anu.nl;
// Cookie consent:
console.log('Cookie consent - start');
const page80 = await browser.newPage();
await page80.goto('https://google.nl');
await page80.waitForTimeout(1000);
await page80.keyboard.press('Tab');
await page80.keyboard.press('Tab');
await page80.keyboard.press('Enter');
console.log('Cookie Consent - done');
// Number of urls in google.nl (using google.js)
await page80.goto(GoogleUrl80, {waitUntil: 'networkidle'});
const googlePage80 = new GooglePage(page80);
await googlePage80.scrapeGoogle();
// Want to console.log 'pageCountClean' here.
await browser.close()
})()
//////////// Google.js
class GooglePage {
constructor(page) {
this.page = page;
}
async scrapeGoogle() {
const GoogleXpath = '//div[#id="result-stats"]';
const pageCount = await this.page.$eval(GoogleXpath, (el) => el.innerText);
const pageCountClean = pageCount.split(" ")[1];
console.log(pageCountClean);
}
}
module.exports = { GooglePage };
You can just return pageCountClean from your async function and await it in your main.js file:
in Google.js:
async scrapeGoogle() {
const GoogleXpath = '//div[#id="result-stats"]';
const pageCount = await this.page.$eval(GoogleXpath, (el) => el.innerText);
const pageCountClean = pageCount.split(" ")[1];
console.log(pageCountClean);
return pageCountClean;
}
in main.js:
const googlePage80 = new GooglePage(page80);
const result = await googlePage80.scrapeGoogle();
console.log(result);

Categories

Resources