Puppeteer web scraping is not loading profile for scraping - javascript

I am trying to scrape clutch.co with Puppeteer using the scripts below, but they are not working properly. I'm developing a simple web scraper for clutch.co because I want to extract contacts and company names for my lead generation project. Achieving this goal would also improve my understanding of JavaScript.
Browser.js
const puppeteer = require("puppeteer");
/**
 * Launches a visible (non-headless) Chromium through Puppeteer.
 *
 * @returns {Promise<object|undefined>} The browser instance, or undefined
 *   when the launch fails (the failure is logged, not rethrown).
 */
async function startBrowser() {
  const launchOptions = {
    headless: false,
    args: ["--disable-setuid-sandbox"],
    ignoreHTTPSErrors: true,
  };
  try {
    console.log("Opening the browser......");
    return await puppeteer.launch(launchOptions);
  } catch (launchError) {
    console.log("Could not create a browser instance => : ", launchError);
    return undefined;
  }
}
module.exports = {
startBrowser,
};
Pagecontroller.js
const pageScraper = require("./pageScraper");
/**
 * Waits for the (possibly still pending) browser instance and hands it to
 * the page scraper. Any failure along the way is logged and swallowed.
 *
 * @param {Promise<object>|object} browserInstance Browser or a promise for one.
 * @returns {Promise<void>}
 */
async function scrapeAll(browserInstance) {
  try {
    const resolvedBrowser = await browserInstance;
    await pageScraper.scraper(resolvedBrowser);
  } catch (failure) {
    console.log("Could not resolve the browser instance => ", failure);
  }
}
module.exports = (browserInstance) => scrapeAll(browserInstance);
Pagescraper.js
/**
 * Scraper for the clutch.co sitemap: collects the category links from the
 * sitemap page, then opens each link in its own tab and logs the company
 * name and tagline found there.
 */
const scraperObject = {
  url: "https://clutch.co/sitemap",

  /**
   * @param {object} browser Puppeteer browser instance.
   * @returns {Promise<void>} Resolves once every link has been visited.
   * @throws Rejects when navigation fails or a selector is missing on a page,
   *   so the caller can log the failure instead of waiting forever.
   */
  async scraper(browser) {
    const page = await browser.newPage();
    console.log(`Navigating to ${this.url}...`);
    page.setDefaultNavigationTimeout(0);
    await page.goto(this.url);
    // Wait for the required DOM to be rendered
    await page.waitForSelector(".container");
    // Get the link to all categories; items without an anchor are skipped
    // rather than crashing the in-page callback on a null element.
    const urls = await page.$$eval(".sitemap-menu > li", (items) =>
      items
        .map((el) => el.querySelector("div > a"))
        .filter((anchor) => anchor !== null)
        .map((anchor) => anchor.href)
    );

    // Visit one link in a fresh tab and return the scraped fields.
    // A plain async function (instead of `new Promise(async ...)`) lets
    // rejections reach the caller; the tab is closed on success and failure.
    const scrapePage = async (link) => {
      const newPage = await browser.newPage();
      try {
        // The timeout must be set on the tab that navigates, not the sitemap tab.
        newPage.setDefaultNavigationTimeout(0);
        await newPage.goto(link);
        return {
          companyName: await newPage.$eval("h3 > a", (el) => el.textContent),
          tagLine: await newPage.$eval(".tagline", (el) => el.textContent),
        };
      } finally {
        await newPage.close();
      }
    };

    // Sequential on purpose: one extra tab at a time.
    for (const link of urls) {
      const currentPageData = await scrapePage(link);
      console.log(currentPageData);
    }
  },
};
module.exports = scraperObject;
index.js
const browserObject = require("./browser");
const scraperController = require("./pageController");
// Kick off the browser launch; startBrowser() returns a promise for the instance.
const pendingBrowser = browserObject.startBrowser();
// Hand the pending instance to the scraper controller.
scraperController(pendingBrowser);
I can't get the script to load and scrape the profile pages. It keeps giving me errors.

Related

Puppeteer - how to not repeat if statements to check for missing selectors

I've managed to get Puppeteer working to scrape data off a number of different web pages. However, I'm repeating the same if statement for each bit of data - I'm really new to Javascript so I'm pretty sure I'm overlooking something simple so it's not repeated.
I've searched online quite a bit & tried a few different things but can't get it working.
For the code example below, I've taken out a lot of the different query selectors so it's easier to read and just an example, but in the actual code there's 12 of them, all with exactly the same code except the querySelector.
const puppeteer = require('puppeteer');
// This gets the url's I want to scrape stored on another file
let urls = require('./assets/links.js').urls;
// Visits every entry of `urls` with one shared page and logs the text of
// three selectors, using null for any selector that matches nothing.
(async () => {
  // Start the browser and open a page in its default context
  const browser = await puppeteer.launch();
  const page = await browser.newPage();

  for (const url of urls) {
    // Load the target website
    await page.goto(url);

    const title = await page.evaluate(() => {
      const heading = document.querySelector('h1');
      return heading ? heading.innerText : null;
    });
    const reviews = await page.evaluate(() => {
      const reviewNode = document.querySelector('.example-class');
      return reviewNode ? reviewNode.innerText : null;
    });
    const description = await page.evaluate(() => {
      const descriptionNode = document.querySelector('#example-id');
      return descriptionNode ? descriptionNode.innerText : null;
    });

    console.log({ url, title, reviews, description });
  }

  // Shuts down the browser together with all of its pages
  await browser.close();
})();
I've tried creating a function but it wouldn't let me use it with await.
You can write a simple function that grabs text from a selector and returns null if the element doesn't exist:
const puppeteer = require("puppeteer"); // ^19.0.0
const {urls} = require("./assets/links.js");
// Kept outside the async function so the final cleanup step can reach it.
let browser;

/**
 * Opens each URL on the browser's first tab and logs the text of three
 * selectors; a selector whose lookup fails is reported as null.
 */
const logPageTexts = async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();

  // Text content of the first match, or null when $eval rejects.
  const text = async (sel) => {
    try {
      return await page.$eval(sel, (el) => el.textContent);
    } catch {
      return null;
    }
  };

  for (const url of urls) {
    await page.goto(url);
    const title = await text("h1");
    const reviews = await text(".example-class");
    const description = await text("#example-id");
    console.log({ url, title, reviews, description });
  }
};

logPageTexts()
  .catch((err) => console.error(err))
  .finally(() => browser?.close());
If there's 12 of them, you might want to add an array and a loop:
const puppeteer = require("puppeteer"); // ^19.0.0
const {urls} = require("./assets/links.js");
// Kept outside the async function so the final cleanup step can reach it.
let browser;

/**
 * Resolves every selector in `selectors` through `text` and returns an
 * object with the same keys.
 *
 * @param {(sel: string) => Promise<string|null>} text Looks up one selector.
 * @param {Record<string, string>} selectors Map of output key -> CSS selector.
 * @returns {Promise<Record<string, string|null>>}
 */
async function scrapeSelectors(text, selectors) {
  const pairs = await Promise.all(
    // map() passes (entry, index); the entry itself must be destructured
    // to get the key and the selector.
    Object.entries(selectors).map(async ([key, sel]) => [key, await text(sel)])
  );
  return Object.fromEntries(pairs);
}

(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  // Text content of the first match, or null when the lookup fails.
  const text = sel =>
    page.$eval(sel, el => el.textContent).catch(() => null);
  const selectors = {
    title: "h1",
    reviews: ".example-class",
    description: "#example-id",
    // ...
  };
  for (const url of urls) {
    await page.goto(url);
    console.log({
      url,
      ...(await scrapeSelectors(text, selectors)),
    });
  }
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());

Web Scraping using Puppeteer returns undefined during atcoder contest

I made a web scraper for parsing the test cases of an AtCoder contest. It works well if the contest is already finished but gives an error for an ongoing contest. The error arises when accessing the rows of the table HTML element. I am positive that the table exists, but for some reason the script returns undefined for an ongoing contest.
Error:
Error: Evaluation failed: TypeError: Cannot read properties of undefined (reading 'rows')
at pptr://__puppeteer_evaluation_script__:3:32
at ExecutionContext._ExecutionContext_evaluate (/mnt/d/c++/node_modules/puppeteer-core/lib/cjs/puppeteer/common/ExecutionContext.js:229:15)
at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
at async ExecutionContext.evaluate (/mnt/d/c++/node_modules/puppeteer-core/lib/cjs/puppeteer/common/ExecutionContext.js:107:16)
at async scrapeSite (/mnt/d/c++/codeforces/atcoder.js:57:33)
Here is my Scrapper: atcoder.js:
const puppeteer = require("puppeteer");
const fs = require("fs");
const contest_id = process.argv[2];
/**
 * Opens the task page for one problem in its own browser, pulls the sample
 * texts out of the page and writes them to `<letter>-<n>.in` / `.out` files.
 * Errors raised before the writes are logged, not rethrown.
 *
 * @param {string} problem_letter Problem letter as shown in the task table.
 * @returns {Promise<void>}
 */
async function scrapeProblem(problem_letter) {
  const url = `https://atcoder.jp/contests/${contest_id}/tasks/${contest_id}_${problem_letter.toLowerCase()}`;
  console.log(url);
  try {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: "networkidle0" });

    const samples_scraped = await page.evaluate(() => {
      const withId = [...document.querySelectorAll("pre")].filter(
        (pre) => pre.id !== ""
      );
      // The elements appear twice, so only the leading half is kept
      const keep = Math.floor(withId.length / 2);
      return withId.slice(0, keep).map((pre) => pre.innerText);
    });

    // Samples alternate input, output, input, output, ...
    const rethrow = (err) => {
      if (err) throw err;
    };
    samples_scraped.forEach((sample, idx) => {
      const id = Math.floor(idx / 2) + 1;
      if (idx % 2 == 0) {
        fs.writeFile(`${problem_letter}-${id}.in`, sample, rethrow);
      } else {
        fs.writeFile(`${problem_letter}-${id}.out`, sample, rethrow);
      }
    });

    await browser.close();
  } catch (e) {
    console.log(e);
  }
}
/**
 * Reads the problem letters from the contest's task table and starts
 * scrapeProblem() for each of them. Failures are logged, not rethrown.
 *
 * @returns {Promise<void>} Resolves once every problem scrape has settled
 *   and the browser used for the task list has been closed.
 */
async function scrapeSite() {
  const url = `https://atcoder.jp/contests/${contest_id}/tasks`;
  console.log(url);
  let browser;
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: "networkidle0" });
    // Returns all the problem letters (row 0 is skipped)
    const problem_letters = await page.evaluate(() => {
      const table = document.querySelectorAll("table")[0];
      const rows = table.rows.length;
      const letters = [];
      for (let i = 1; i < rows; i++) {
        letters.push(table.rows[i].cells[0].innerText);
      }
      return letters;
    });
    console.log(problem_letters);
    // Declared callback parameter instead of an undeclared loop variable;
    // the scrapes still run concurrently but are no longer left floating.
    await Promise.all(
      problem_letters.map((problem_letter) => scrapeProblem(problem_letter))
    );
  } catch (e) {
    console.log(e);
  } finally {
    // Close the browser even when the page evaluation throws.
    if (browser) {
      await browser.close().catch((closeError) => console.log(closeError));
    }
  }
}
scrapeSite();
The scrapeProblem(problem_letter) is a helper function to scrape the test cases for the given problem letter. It then stores the test cases to the user's file system using fs module.
The scrapeSite() function first parses the homepage for the number of problems and the problem letter associated with each problem. It then calls the scrapeProblem(problem_letter) helper function to parse the required web site for test cases.
To run the script: node atcoder.js abc280
Update: I tried it in a new contest and got the same error again. This time I took a screenshot using Puppeteer and found the problem: I get "permission denied" if I try to access the site without logging in during an ongoing contest.
The problem was the site requires us to login and only then we can see the problem statements of an ongoing contest. So I added a function which will first login to the site and then it will proceed to parse the test cases.
Updated code:
const puppeteer = require("puppeteer");
const fs = require("fs");
require('dotenv').config();
const contest_id = process.argv[2];
/**
 * Logs in to AtCoder on the given page using the USERNAME and PASSWORD
 * environment variables. Failures are logged, not rethrown.
 *
 * @param {object} browser Unused; kept so existing callers keep working.
 * @param {object} page Puppeteer page to log in with.
 * @returns {Promise<void>} Resolves after the post-submit navigation settles.
 */
async function login(browser, page) {
  const url = `https://atcoder.jp/login?continue=https%3A%2F%2Fatcoder.jp%2F`;
  console.log("Logging in..", url);
  try {
    const { USERNAME, PASSWORD } = process.env;
    if (!USERNAME || !PASSWORD) {
      throw new Error("USERNAME and PASSWORD environment variables must be set");
    }
    await page.goto(url, { waitUntil: "networkidle0" });
    await page.type("#username", USERNAME);
    await page.type("#password", PASSWORD);
    // Start waiting before clicking so the navigation triggered by the
    // submit cannot be missed, and do not return until it has finished.
    await Promise.all([
      page.waitForNavigation({ waitUntil: "networkidle0" }),
      page.click("#submit"),
    ]);
  } catch (e) {
    console.log("Login failed...");
    console.log(e);
  }
}
/**
 * Opens one problem page in a new tab of the shared (already logged-in)
 * browser, extracts the sample texts and writes them to
 * `<letter>-<n>.in` / `<letter>-<n>.out`. Failures are logged, not rethrown.
 *
 * @param {object} browser Shared Puppeteer browser instance.
 * @param {{Problem_letter: string, Url: string}} Problem Problem descriptor.
 * @returns {Promise<void>} Resolves after all sample files have been written.
 */
async function scrapeProblem(browser, Problem) {
  const url = Problem.Url;
  console.log(url);
  let page;
  try {
    page = await browser.newPage();
    await page.goto(url, { waitUntil: "networkidle0" });
    const samples_scraped = await page.evaluate(() => {
      const samples = document.querySelectorAll("pre");
      const scraped = Array.from(samples).filter((child) => {
        return child.id !== "";
      });
      let num_scraped = scraped.length;
      // The elements were repeated twice, so remove the extra elements
      for (let i = 0; i < num_scraped / 2; i++) scraped.pop();
      return scraped.map((ele) => ele.innerText);
    });
    // Samples alternate input, output, input, output, ...
    // Promise-based writes are awaited so a failed write is caught below
    // instead of being thrown from a detached callback.
    const writes = samples_scraped.map((sample, idx) => {
      const id = Math.floor(idx / 2) + 1;
      const extension = idx % 2 === 0 ? "in" : "out";
      return fs.promises.writeFile(
        `${Problem.Problem_letter}-${id}.${extension}`,
        sample
      );
    });
    await Promise.all(writes);
  } catch (e) {
    console.log(e);
  } finally {
    // Release the tab; the browser itself belongs to the caller.
    if (page) {
      await page.close().catch((closeError) => console.log(closeError));
    }
  }
}
/**
 * Logs in, reads the task table of the contest and scrapes every listed
 * problem in a tab of the same browser. Failures are logged, not rethrown.
 *
 * @returns {Promise<void>} Resolves after all problem scrapes have settled
 *   and the browser has been closed.
 */
async function scrapeSite() {
  const url = `https://atcoder.jp/contests/${contest_id}/tasks`;
  console.log(url);
  let browser;
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    await login(browser, page);
    await page.goto(url, { waitUntil: "networkidle0" });
    // Returns the letter and task URL of every problem (row 0 is skipped)
    const problems = await page.evaluate(() => {
      const table = document.querySelectorAll("table")[0];
      const rows = table.rows.length;
      const letters = [];
      for (let i = 1; i < rows; i++) {
        letters.push({Problem_letter: table.rows[i].cells[0].innerText, Url: table.rows[i].cells[0].firstChild.href });
      }
      return letters;
    });
    console.log(problems);
    // All scrapes must settle before the shared browser is closed.
    await Promise.all(
      problems.map((problem) => scrapeProblem(browser, problem))
    );
  } catch (e) {
    console.log(e);
  } finally {
    // Close the browser even when login or the table lookup fails.
    if (browser) {
      await browser.close().catch((closeError) => console.log(closeError));
    }
  }
}
scrapeSite();

Puppeteer Node Js YouTube data scraping error "Evaluation Failed"

I am trying to scrape the YouTube headline and link from a channel using Puppeteer. While executing the program, I am facing the Evaluation Error as following:
Error: Evaluation failed: TypeError: Cannot read properties of null (reading 'innerText')
at pptr://__puppeteer_evaluation_script__:10:65
at ExecutionContext._ExecutionContext_evaluate (E:\somoy\node_modules\puppeteer-core\lib\cjs\puppeteer\common\ExecutionContext.js:229:15)
at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
at async ExecutionContext.evaluate (E:\somoy\node_modules\puppeteer-core\lib\cjs\puppeteer\common\ExecutionContext.js:107:16)
at async initiate (E:\somoy\appNew.js:45:20)
at async E:\somoy\appNew.js:155:9
// Launches a visible Chrome with a persistent profile directory, opens the
// channel's videos page and builds a list of { articleHeadline, articleLink }
// objects from the grid items found in the page.
// Relies on `puppeteer`, `fs` and `delay`, none of which are defined in this
// snippet (presumably imported/declared elsewhere; `delay` looks like a sleep
// helper — confirm).
async function initiate() {
const browser = await puppeteer.launch({ headless: false, defaultViewport: null, userDataDir: './userdata', executablePath: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe' });
const page = await browser.newPage();
// NOTE(review): 0 presumably disables the navigation timeout — confirm.
page.setDefaultNavigationTimeout(0)
// NOTE(review): '#ProthomAlo' may originally have been a channel handle
// ('@ProthomAlo'); verify the URL.
await page.goto('https://www.youtube.com/#ProthomAlo/videos', { waitUntil: 'networkidle2' });
await delay(5000);
if (!fs.existsSync('storeLink.txt')) {
//create new file if not exist
fs.writeFileSync("storeLink.txt", '');
}
// One stored link per line. `articleLinkarr` is assigned without a
// declaration, so it is not a local variable of this function.
articleLinkarr = (fs.readFileSync('storeLink.txt', { encoding: 'utf8' })).split('\n')
// The callback below runs inside the page; `articleLinkarr` is passed in as
// an argument but is only referenced by commented-out code.
let articles = await page.evaluate(async (articleLinkarr) => {
//console.log('Hello1')
let arrObj = [];
// Assigned without a declaration inside the page context.
articles = document.querySelectorAll('.style-scope.ytd-rich-grid-media');
for (let i = 0; i < articles.length; i++) {
//for (let i = 0; i < 20; i++) {
//const category = document.querySelector('.print-entity-section-wrapper.F93gk').innerText
//const headline = articles[i].querySelector('div > h3').innerText
// querySelector returns null when the item has no matching descendant; this
// is the only active `.innerText` read here, so it is the likely source of
// the "reading 'innerText'" TypeError quoted in the question.
const headline = articles[i].querySelector('h3').innerText
const link = 'https://www.youtube.com' + articles[i].querySelector('a').getAttribute('href')
// if (!(link.includes('video') || link.includes('fun') || link.includes('photo'))) {
// if (!articleLinkarr.includes(link)) {
arrObj.push({ articleHeadline: headline, articleLink: link })
// }
// }
};
return arrObj;
}, articleLinkarr)
// The function ends here: the collected `articles` value is neither returned
// nor used, and the browser is not closed.
}
Puppeteer doesn't seem necessary here if you just want the initial set of titles. There's a JSON blob in the static HTML which has the title list, so you can make a simple HTTP request to the URL and pull the blob out with an HTML parser, then walk the object structure.
const cheerio = require("cheerio"); // 1.0.0-rc.12
const url = "Your URL";

// Rejects non-2xx responses, otherwise yields the body as text.
const readBody = (res) => {
  if (!res.ok) {
    throw Error(res.statusText);
  }
  return res.text();
};

// Pulls the `ytInitialData` JSON blob out of the page's <script> tags and
// returns a {title, url} object for every rich grid item that is a video.
const listVideos = (html) => {
  const $ = cheerio.load(html);
  const blobScript = [...$("script")].find((e) =>
    $(e).text().startsWith("var ytInitialData = {")
  );
  // slice(20, -1) strips the leading "var ytInitialData = " and the last character
  const data = JSON.parse($(blobScript).text().slice(20, -1));
  const {contents} =
    data.contents.twoColumnBrowseResultsRenderer.tabs[1].tabRenderer
      .content.richGridRenderer;
  return contents
    .filter((c) => c.richItemRenderer)
    .map((c) => {
      const {videoRenderer} = c.richItemRenderer.content;
      return {
        title: videoRenderer.title.runs[0].text,
        url: videoRenderer.navigationEndpoint.commandMetadata
          .webCommandMetadata.url,
      };
    });
};

fetch(url) // Node 18 or install node-fetch
  .then(readBody)
  .then((html) => {
    console.log(listVideos(html));
  })
  .catch((err) => console.error(err));
If you do want to use Puppeteer, you can select these titles and URLs with:
const puppeteer = require("puppeteer"); // ^19.0.0
const url = "Your URL";
// Kept outside the async function so the final cleanup step can reach it.
let browser;

/**
 * Loads `url`, waits for the first "#video-title-link" element and logs a
 * {title, url} object for every such element that has a non-empty href.
 */
const listVideoLinks = async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  await page.goto(url, {waitUntil: "domcontentloaded"});
  await page.waitForSelector("#video-title-link");
  const titles = await page.$$eval("#video-title-link", (els) => {
    const found = [];
    for (const e of els) {
      if (e.href) {
        found.push({title: e.textContent, url: e.href});
      }
    }
    return found;
  });
  console.log(titles);
};

listVideoLinks()
  .catch((err) => console.error(err))
  .finally(() => browser?.close());
For some reason, the ids aren't unique.
Although this is less code, this approach is much slower than fetch (~10x slower on my machine), although you can speed it up a bit by blocking irrelevant resources.
As an aside, always use const in front of your variables to avoid making them global.
page.setDefaultNavigationTimeout(0) is generally not a great pattern--this could hang forever. I'd set this to 3 or 4 minutes at most. If nav is taking that long, something is wrong and you should get that logged so you can take a look at it.

How handle multiple functions in puppeteer-cluster?

I have a two step program :
Get a list of href from a page
Loop infinitely on each page of this list, get an element and display it in console
I tried to use functions with puppeteer-cluster, but it doesn't work properly.
const { Cluster } = require('puppeteer-cluster');
const fs = require("fs");
const { addExtra } = require("puppeteer-extra");
const vanillaPuppeteer = require("puppeteer");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
var moment = require('moment');
// Matches a trailing "<digits><one whitespace><word>" pair and captures both parts.
var regexTemps = /(\d+)\s(\w+)$/;
// Filled with the hrefs collected from the listing page.
const urlsToCheck = [];
// Explicit property on the global object instead of an undeclared
// assignment, which throws in strict mode / ES modules.
globalThis.TZ = 'Europe/Paris';
// The trailing semicolon matters: without it the "(async () => ...)" that
// follows is parsed as a call on this string and throws a TypeError.
process.env.TZ = 'Europe/Paris';
// Two-phase crawl driven by puppeteer-cluster:
//   1. getListOfPagesToExplore collects every <a> href of a listing page
//      into `urlsToCheck`;
//   2. each collected URL is queued with getElementOnPage, which reloads the
//      page in an endless loop and logs the first article's info text.
(async () => {
// Wrap plain Puppeteer so the stealth plugin can be attached.
const puppeteer = addExtra(vanillaPuppeteer);
puppeteer.use(StealthPlugin());
const cluster = await Cluster.launch({
puppeteer,
puppeteerOptions: {
headless: false,
args: ['--no-sandbox'],
},
maxConcurrency: 10,
concurrency: Cluster.CONCURRENCY_CONTEXT,
monitor: false,
skipDuplicateUrls: true,
// NOTE(review): presumably a per-task limit in milliseconds; the endless
// loop in getElementOnPage never finishes on its own — confirm how the two
// interact.
timeout:30000,
retryLimit:10,
})
// Log task failures, distinguishing jobs that will be retried from final ones.
cluster.on('taskerror', (err, data, willRetry) => {
if (willRetry) {
console.warn(`Encountered an error while crawling ${data}. ${err.message}\nThis job will be retried`);
} else {
console.error(`Failed to crawl ${data}: ${err.message}`);
}
});
// Task: open `url`, then reload forever, logging the trimmed text of
// '.info span' inside the first 'article.news' element after each reload.
const getElementOnPage = async ({ page, data: url }) => {
console.log('=> Go to URL : ',url);
await page.goto(url);
// No exit condition: this task never completes by itself.
while (true) {
console.log('=> Reload URL : ',page.url())
await page.reload();
await page.waitForTimeout(1000);
let allNews = await page.$$("article.news"); // [] if nothing
// undefined when the page has no 'article.news'; the $eval call below
// would then throw.
let firstNews = allNews[0];
await page.waitForTimeout(1000);
let info = await firstNews.$eval('.info span', s => s.textContent.trim());
console.log(new Date(), 'info : ',info);
}
};
// Task: open the listing page and push the href attribute of every <a>
// element into the shared `urlsToCheck` array.
const getListOfPagesToExplore = async ({ page, data: url }) => {
console.log(new Date(), 'Get the list of deal pages to explore');
await page.goto(url, {waitUntil: 'domcontentloaded'});
await page.waitForTimeout(500);
const hrefsToVisit = await page.$x('//a');
// Incremented per link but not read anywhere in this snippet.
let idxTab=0;
for( let hrefToVisit of hrefsToVisit ) {
// Raw attribute value: may be relative, or null when the anchor has no href.
var link = await page.evaluate(el => el.getAttribute("href"), hrefToVisit);
console.log(new Date(), 'adding link to list : ', link);
idxTab++;
urlsToCheck.push(link);
}
};
// Phase 1: collect the links and wait until the queue is empty.
cluster.queue('https://www.apagewithsomelinks.com', getListOfPagesToExplore);
await cluster.idle();
// NOTE(review): the cluster is closed here, yet more jobs are queued on the
// same cluster below — confirm whether that is intended.
await cluster.close();
console.log(urlsToCheck);
//Display correctly several link in an array
// Phase 2: queue the endless reload task for every collected URL.
for( let url of urlsToCheck ) {
console.log('Push in queue : ',url);
cluster.queue(url, getElementOnPage);
}
await cluster.idle();
await cluster.close();
})();
When I launch it, it retrieve the links to scrap.
It display "Push in queue : ..." for each URL.
But then the method getElementOnPage is launched only for the first URL, and runs infinitely as requested. Why are the other URLs not launched?
Before that, I don't use function, I used one unique task with :
await cluster.task(async ({ page, data: url }) => {
But how combine function and this thing ?

How can I download images on a page using puppeteer?

I'm new to web scraping and want to download all images on a webpage using puppeteer:
const puppeteer = require('puppeteer');
/**
 * Opens a visible browser and loads the Tumblr page. Nothing is returned
 * yet, so the promise resolves with undefined.
 */
let scrape = async function () {
  const browser = await puppeteer.launch({headless: false});
  const page = await browser.newPage();
  // Actual scraping goes here: right click and save images
  await page.goto('https://memeculture69.tumblr.com/');
};

// Print whatever scrape() resolved with once it is done.
const logValue = (value) => {
  console.log(value); // Success!
};
scrape().then(logValue);
I have looked at the API docs but could not figure out how to achieve this, so I would appreciate your help.
If you want to skip the manual dom traversal you can write the images to disk directly from the page response.
Example:
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
/**
 * Writes the body of one image response into `directory`.
 *
 * @param {object} response Puppeteer HTTPResponse.
 * @param {string} directory Folder the file is written to.
 * @returns {Promise<string|null>} Path of the written file, or null when the
 *   response is not an http(s) image or its URL has no file name.
 */
async function saveImageResponse(response, directory) {
  if (response.request().resourceType() !== 'image') {
    return null;
  }
  let location;
  try {
    location = new URL(response.url());
  } catch {
    return null;
  }
  // data:/blob: URLs carry no usable file name.
  if (location.protocol !== 'http:' && location.protocol !== 'https:') {
    return null;
  }
  // Use the path only, so query strings do not end up in the file name.
  const fileName = path.basename(location.pathname);
  if (!fileName) {
    return null;
  }
  const filePath = path.resolve(directory, fileName);
  await fs.promises.writeFile(filePath, await response.buffer());
  return filePath;
}

(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // Track every write so the browser is not closed while response bodies
    // are still being read and saved.
    const pendingWrites = [];
    page.on('response', (response) => {
      pendingWrites.push(
        saveImageResponse(response, __dirname).catch((err) => {
          console.error(`Could not save ${response.url()}:`, err);
        })
      );
    });
    await page.goto('https://memeculture69.tumblr.com/');
    await Promise.all(pendingWrites);
  } finally {
    await browser.close();
  }
})().catch((err) => console.error(err));
See the documentation for page.on and for the HTTPResponse object that you get from page.on('response', ...).
Here is another example. It goes to a generic search in google and downloads the google image at the top left.
const puppeteer = require('puppeteer');
const fs = require('fs');
/**
 * Opens a Google search in a visible browser, reads the `src` of the logo
 * image, navigates to that image and saves its bytes to a PNG file.
 */
async function run() {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();
  await page.setViewport({ width: 1200, height: 1200 });
  await page.goto('https://www.google.com/search?q=.net+core&rlz=1C1GGRV_enUS785US785&oq=.net+core&aqs=chrome..69i57j69i60l3j69i65j69i60.999j0j7&sourceid=chrome&ie=UTF-8');

  const IMAGE_SELECTOR = '#tsf > div:nth-child(2) > div > div.logo > a > img';
  // The first "/" of the src is removed so it can be appended to the origin below.
  const imageHref = await page.evaluate(
    (sel) => document.querySelector(sel).getAttribute('src').replace('/', ''),
    IMAGE_SELECTOR
  );

  const imageUrl = "https://www.google.com/" + imageHref;
  console.log(imageUrl);

  // Navigating to the image yields a response whose body is the image itself.
  const viewSource = await page.goto(imageUrl);
  const imageBytes = await viewSource.buffer();
  fs.writeFile(".googles-20th-birthday-us-5142672481189888-s.png", imageBytes, (err) => {
    if (err) {
      return console.log(err);
    }
    console.log("The file was saved!");
  });

  browser.close();
}
run();
If you have a list of images you want to download, you could change the selector programmatically as needed and go down the list, downloading the images one at a time.
You can use the following to scrape an array of all the src attributes of all images on the page:
// Runs inside the page: builds an array holding the `src` of every entry in document.images.
const images = await page.evaluate(() => Array.from(document.images, e => e.src));
Then you can use the Node File System Module and HTTP or HTTPS Module to download each image.
Complete Example:
'use strict';
const fs = require('fs');
const https = require('https');
const puppeteer = require('puppeteer');
/* ============================================================
Promise-Based Download Function
============================================================ */
/**
 * Downloads `url` over HTTPS into the file `destination`.
 *
 * @param {string} url HTTPS URL to fetch.
 * @param {string} destination Path of the file to write.
 * @returns {Promise<true>} Resolves with true once the file is written and closed.
 * @throws Rejects with an Error when the request or the write fails; any
 *   partially written file is removed first.
 */
const download = (url, destination) => new Promise((resolve, reject) => {
  // Remove a partial file (a missing file is fine), then reject with a real
  // Error. fs.unlink requires a callback; omitting it throws a TypeError.
  const fail = (error) => {
    const reason = error instanceof Error ? error : new Error(String(error));
    fs.unlink(destination, () => reject(reason));
  };

  https.get(url, (response) => {
    // The file is created only once a response arrives, so an invalid URL
    // or a refused connection leaves nothing behind.
    const file = fs.createWriteStream(destination);
    file.on('error', fail);
    response.on('error', fail);
    file.on('finish', () => {
      // Resolve only after the descriptor has actually been closed.
      file.close((closeError) => (closeError ? fail(closeError) : resolve(true)));
    });
    response.pipe(file);
  }).on('error', fail);
});
/* ============================================================
Download All Images
============================================================ */
// Loads the page, collects the `src` of every image and downloads them one
// after another as image-<index>.png, logging the outcome of each.
(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('https://www.example.com/');

  const images = await page.evaluate(() => Array.from(document.images, e => e.src));

  for (const [i, imageUrl] of images.entries()) {
    const result = await download(imageUrl, `image-${i}.png`);
    if (result !== true) {
      console.log('Error:', imageUrl, 'was not downloaded.');
      console.error(result);
      continue;
    }
    console.log('Success:', imageUrl, 'has been downloaded successfully.');
  }

  await browser.close();
})();
The logic is simple, I think. You just need to write a function that takes the URL of an image and saves it to your directory. Puppeteer will just scrape the image URL and pass it to the downloader function. Here is an example:
const puppeteer = require('puppeteer');
const fs = require('fs');
const request = require('request');
// This is main download function which takes the url of your image
/**
 * Downloads `uri` into `filename` using the `request` library.
 *
 * @param {string} uri URL of the image.
 * @param {string} filename Path of the file to write.
 * @returns {Promise<void>} Resolves when the write stream has closed.
 * @throws Rejects when the HEAD request, the download stream or the file
 *   write reports an error (previously these were ignored and the promise
 *   could stay pending forever).
 */
function download(uri, filename) {
  return new Promise((resolve, reject) => {
    request.head(uri, (err) => {
      if (err) {
        reject(err);
        return;
      }
      const sink = fs.createWriteStream(filename);
      sink.on('error', reject);
      request(uri).on('error', reject).pipe(sink).on('close', resolve);
    });
  });
}
/**
 * Opens the page, reads the URL of the first `img.image` element and saves
 * that image as image.png through download().
 *
 * @returns {Promise<void>}
 * @throws Rejects when the image never appears or the download fails; the
 *   browser is closed in either case.
 */
let main = async () => {
  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage();
    await page.goto('https://memeculture69.tumblr.com/');
    // Wait for the element itself rather than for a fixed delay.
    await page.waitForSelector('img.image');
    // evaluate() can only return serializable values, so return the URL
    // string (`src`), not the DOM element.
    const imageUrl = await page.evaluate(
      () => document.querySelector('img.image').src
    );
    // Now pass the image url to the downloader function.
    await download(imageUrl, 'image.png');
  } finally {
    await browser.close();
  }
};
main();
This code saves all images found on the page into images folder
// Saves every response whose URL ends in .jpg/.png/.svg/.gif into the
// "images" folder, named after the last segment of the URL path.
page.on('response', async (response) => {
  const matches = /.*\.(jpg|png|svg|gif)$/.exec(response.url());
  if (!matches) {
    return;
  }
  try {
    // The full URL contains "://" and "/" and cannot be used as a file name;
    // the last path segment already carries the extension.
    const fileName = new URL(response.url()).pathname.split('/').pop();
    if (!fileName) {
      return;
    }
    const buffer = await response.buffer();
    fs.mkdirSync('images', { recursive: true });
    fs.writeFileSync(`images/${fileName}`, buffer);
  } catch (err) {
    // A failed save must not surface as an unhandled rejection in the listener.
    console.error(`Could not save ${response.url()}:`, err);
  }
});
For image download by its selector I did the following:
Obtained uri for the image using selector
Passed uri to the download function
const puppeteer = require('puppeteer');
const fs = require('fs');
var request = require('request');
//download function
/**
 * Downloads `uri` into `filename` using the `request` library.
 *
 * @param {string} uri URL of the image.
 * @param {string} filename Path of the file to write.
 * @param {Function} callback Called without arguments when the write stream
 *   closes, or with the error when the initial HEAD request fails.
 */
var download = function (uri, filename, callback) {
  request.head(uri, function (err, res) {
    // `res` is undefined when the HEAD request fails; report the error
    // instead of crashing on `res.headers`.
    if (err) {
      callback(err);
      return;
    }
    console.log('content-type:', res.headers['content-type']);
    console.log('content-length:', res.headers['content-length']);
    request(uri).pipe(fs.createWriteStream(filename)).on('close', callback);
  });
};
// Launches a headless browser, reads the `src` of the element with id
// "imageId" and passes it to download(), which saves it as myImage.png.
(async () => {
const browser = await puppeteer.launch({
headless: true,
args: ['--no-sandbox', '--disable-setuid-sandbox'], //for no sandbox
});
const page = await browser.newPage();
await page.goto('http://example.com');// your url here
// Runs inside the page; throws if no element matches '#imageId'
// (querySelector would return null).
let imageLink = await page.evaluate(() => {
const image = document.querySelector('#imageId');
return image.src;
})
// download() as defined above does not return a promise, so this `await`
// does not wait for the file; completion is signalled by the callback.
await download(imageLink, 'myImage.png', function () {
console.log('done');
});
// The "..." below is a placeholder for further code, not valid JavaScript.
...
})();
Resource: Downloading images with node.js
It is possible to get all the images without visiting each url independently. You need to listen to all the requests to the server:
// Turn on request interception, then let every intercepted request proceed.
await page.setRequestInterception(true);
await page.on('request', (request) => {
  request.continue();
});
// Read the body of each response as it arrives.
await page.on('response', async (response) => {
  // Filter those responses that are interesting
  const data = await response.buffer();
  // data contains the img information
});
You can also filter based on the request type.
// Resource types that are aborted; 'image' is deliberately left out.
const blocked_resources = ['stylesheet', /*'image',*/ 'media', 'font'];

/**
 * Request handler: aborts requests whose resource type is in
 * `blocked_resources` and lets every other request continue.
 *
 * @param {object} request Intercepted request.
 */
const _handleRequest = (request) => {
  if (blocked_resources.includes(request.resourceType())) {
    request.abort();
  } else {
    request.continue();
  }
};
const puppeteer = require("puppeteer")
const fs = require("fs/promises")
// add the url of website below which you want to scrape
const yourURL = "example.com";

/**
 * Opens `yourURL`, collects the `src` of every <img> and saves each image
 * under the last segment of its URL by navigating the same page to it.
 */
async function scrapeIt() {
  // start the browser and open a new page in it
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  // load the webpage of the provided url
  await page.goto(yourURL);

  const photos = await page.$$eval("img", (imgs) =>
    Array.from(imgs, (img) => img.src)
  );

  for (const photo of photos) {
    // navigating to the image returns a response holding its bytes
    const imagepage = await page.goto(photo);
    const segments = photo.split("/");
    const fileName = segments[segments.length - 1];
    await fs.writeFile(fileName, await imagepage.buffer());
  }

  await browser.close();
}
scrapeIt()
Download google images with 100% quality based on your search query using puppeteer in nodejs.
It is a straightforward approach.
Open google images.
Search for images using keyword.
Click the images one by one to open it's right preview panel.
Store all the links.
Download the images.
Note: If you download the images without previewing , you will lose quality.
const request = require('request');
const cheerio = require('cheerio');
const fs = require('fs');
var puppeteer = require('puppeteer');
const readline = require("readline-sync");
const path = require('path');
const axios = require('axios').default;
/**
 * Streams a remote file to disk with axios.
 *
 * @param {string} fileUrl Absolute URL of the image or video to download.
 * @param {string} localFilePath Path of the file to write on this machine.
 * @returns {Promise<void>} Resolves after the file has been fully written.
 * @throws {Error} When the request or the write fails; the original error
 *   is available as `cause`.
 */
const downloadFile = async (fileUrl, localFilePath) => {
  try {
    const response = await axios({
      method: 'GET',
      url: fileUrl,
      responseType: 'stream',
    });
    // Wait for the stream to finish so callers that await this function
    // really get a complete file, and so stream errors are reported.
    await new Promise((resolve, reject) => {
      const w = fs.createWriteStream(localFilePath);
      w.on('finish', resolve);
      w.on('error', reject);
      response.data.on('error', reject);
      response.data.pipe(w);
    });
    console.log('Successfully downloaded file!');
  } catch (err) {
    // Keep the original error (stack, code) instead of stringifying it.
    throw new Error(`Failed to download ${fileUrl}: ${err.message}`, { cause: err });
  }
};
// Base of the Google Images search URL; the query is appended as "q=...".
const Google_Image = 'https://www.google.com/search?site=&tbm=isch&source=hp&biw=1873&bih=990&';
// Search keywords
let data = 'Ramayana HD Images Good Quality wallpaper';
let search_url = `${Google_Image}q=${data}`;
// Filled by main() with the src of each previewed image.
var imagelinkslist = [];
/**
 * Opens the image search, clicks result thumbnails 1 through 19 one by one
 * and, after a pause, records the `src` of the image in the preview panel
 * into `imagelinkslist`.
 */
let main = async () => {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();
  await page.goto(search_url);

  // XPath of the image in the right-hand preview panel.
  // An alternative that was tried: '//*[@id="Sva75c"]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div[2]/div/a/img'
  const previewimagexpath = '/html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div[2]/div/a/img';
  // XPath of the n-th thumbnail in the results grid.
  const thumbnailXPath = (position) =>
    '/html/body/div[2]/c-wiz/div[3]/div[1]/div/div/div/div[1]/div[1]/span/div[1]/div[1]/div[' + position + ']/a[1]/div[1]/img';

  for (let position = 1; position < 20; position++) {
    const [thumbnail] = await page.$x(thumbnailXPath(position));
    await thumbnail.click();
    await page.waitForTimeout(3000);
    const [preview] = await page.$x(previewimagexpath);
    const srcHandle = await preview.getProperty('src');
    imagelinkslist.push(srcHandle._remoteObject.value);
  }

  await browser.close();
};
// Once the links are collected, start a download for every link that
// contains "https", saving it as ./images/image<position>.png.
main().then(() => {
  console.log('Got Image links');
  imagelinkslist.forEach((url, index) => {
    const path = `./images/image${index+1}.png`;
    if (!url.includes('https')) {
      return;
    }
    downloadFile(url, path);
  });
});

Categories

Resources