I'm new to web scraping and want to download all images on a webpage using puppeteer:
const puppeteer = require('puppeteer');
let scrape = async () => {
// Actual Scraping goes Here...
const browser = await puppeteer.launch({headless: false});
const page = await browser.newPage();
await page.goto('https://memeculture69.tumblr.com/');
// Right click and save images
};
scrape().then((value) => {
console.log(value); // Success!
});
I have looked at the API docs but could not figure out how to achieve this, so I'd appreciate your help.
If you want to skip the manual DOM traversal, you can write the images to disk directly from the page response.
Example:
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();

  // Register the handler before navigating so no image responses are missed.
  page.on('response', async (response) => {
    const url = response.url();
    if (response.request().resourceType() === 'image') {
      response.buffer().then((file) => {
        const fileName = url.split('/').pop();
        const filePath = path.resolve(__dirname, fileName);
        const writeStream = fs.createWriteStream(filePath);
        writeStream.write(file);
        writeStream.end(); // close the stream once the buffer is written
      });
    }
  });

  await page.goto('https://memeculture69.tumblr.com/');
  await browser.close();
})();
See the documentation for page.on and for the HTTPResponse object that you get from page.on('response', ...).
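Note that page.goto resolves when the page has loaded, but some image responses can still be buffering at that point, so calling browser.close() right away may cut them off. A minimal sketch of one way to guard against that (same page as above; the pendingWrites array is my own addition, used to await every save before closing):
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  const pendingWrites = []; // promises for images still being saved

  page.on('response', (response) => {
    if (response.request().resourceType() !== 'image') return;
    const fileName = response.url().split('/').pop().split('?')[0] || 'image';
    const filePath = path.resolve(__dirname, fileName);
    pendingWrites.push(
      response.buffer()
        .then((buffer) => fs.promises.writeFile(filePath, buffer))
        .catch(() => {}) // e.g. redirect responses have no buffer; skip them
    );
  });

  await page.goto('https://memeculture69.tumblr.com/'); // URL from the question
  await Promise.all(pendingWrites); // wait for every image to reach disk
  await browser.close();
})();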
Here is another example. It goes to a generic Google search and downloads the Google logo at the top left.
const puppeteer = require('puppeteer');
const fs = require('fs');
async function run() {
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
await page.setViewport({ width: 1200, height: 1200 });
await page.goto('https://www.google.com/search?q=.net+core&rlz=1C1GGRV_enUS785US785&oq=.net+core&aqs=chrome..69i57j69i60l3j69i65j69i60.999j0j7&sourceid=chrome&ie=UTF-8');
const IMAGE_SELECTOR = '#tsf > div:nth-child(2) > div > div.logo > a > img';
let imageHref = await page.evaluate((sel) => {
return document.querySelector(sel).getAttribute('src').replace('/', '');
}, IMAGE_SELECTOR);
console.log("https://www.google.com/" + imageHref);
var viewSource = await page.goto("https://www.google.com/" + imageHref);
fs.writeFile(".googles-20th-birthday-us-5142672481189888-s.png", await viewSource.buffer(), function (err) {
if (err) {
return console.log(err);
}
console.log("The file was saved!");
});
browser.close();
}
run();
If you have a list of images you want to download, you could change the selector programmatically and go down the list, downloading the images one at a time.
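For example, a minimal sketch of that idea, assuming a hypothetical array of CSS selectors and a placeholder URL, and reusing the same navigate-to-the-image approach as above:
const puppeteer = require('puppeteer');
const fs = require('fs');

async function run() {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('https://www.example.com/'); // placeholder URL

  // Hypothetical list of selectors for the images you care about.
  const selectors = ['#logo img', '.gallery img:nth-child(1)', '.gallery img:nth-child(2)'];

  for (let i = 0; i < selectors.length; i++) {
    const src = await page.evaluate(
      (sel) => document.querySelector(sel)?.src ?? null,
      selectors[i]
    );
    if (!src) continue; // selector not found on this page

    // Navigate to the image itself and save the response body.
    const imageResponse = await page.goto(src);
    fs.writeFileSync(`image-${i}.png`, await imageResponse.buffer());
  }

  await browser.close();
}

run();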
You can use the following to scrape an array of all the src attributes of all images on the page:
const images = await page.evaluate(() => Array.from(document.images, e => e.src));
Then you can use the Node File System Module and HTTP or HTTPS Module to download each image.
Complete Example:
'use strict';
const fs = require('fs');
const https = require('https');
const puppeteer = require('puppeteer');
/* ============================================================
Promise-Based Download Function
============================================================ */
const download = (url, destination) => new Promise((resolve, reject) => {
  const file = fs.createWriteStream(destination);

  https.get(url, response => {
    response.pipe(file);
    file.on('finish', () => {
      // Close the stream first, then resolve
      file.close(() => resolve(true));
    });
  }).on('error', error => {
    // Remove the partial file; the callback is required in modern Node
    fs.unlink(destination, () => {});
    reject(error.message);
  });
});
/* ============================================================
Download All Images
============================================================ */
(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  let result;

  await page.goto('https://www.example.com/');
  const images = await page.evaluate(() => Array.from(document.images, e => e.src));

  for (let i = 0; i < images.length; i++) {
    try {
      // download() rejects on a network error, so catch the rejection
      // instead of expecting a falsy return value.
      result = await download(images[i], `image-${i}.png`);
      if (result === true) {
        console.log('Success:', images[i], 'has been downloaded successfully.');
      }
    } catch (error) {
      console.log('Error:', images[i], 'was not downloaded.');
      console.error(error);
    }
  }

  await browser.close();
})();
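One caveat: the download helper above uses https.get, so it assumes every image URL is served over HTTPS. A minimal sketch of picking the client module by protocol, in case some images are plain HTTP (the clientFor helper is my own name, not part of any library):
const http = require('http');
const https = require('https');
const fs = require('fs');

// Pick the client module that matches the URL's protocol.
const clientFor = (url) => (new URL(url).protocol === 'http:' ? http : https);

const download = (url, destination) => new Promise((resolve, reject) => {
  const file = fs.createWriteStream(destination);
  clientFor(url).get(url, response => {
    response.pipe(file);
    file.on('finish', () => {
      file.close(() => resolve(true));
    });
  }).on('error', error => {
    fs.unlink(destination, () => {});
    reject(error.message);
  });
});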
The logic is simple, I think. You just need a function that takes the URL of an image and saves it to your directory. Puppeteer scrapes the image URL and passes it to the downloader function. Here is an example:
const puppeteer = require('puppeteer');
const fs = require('fs');
const request = require('request');
// This is main download function which takes the url of your image
function download(uri, filename) {
return new Promise((resolve, reject) => {
request.head(uri, function (err, res, body) {
request(uri).pipe(fs.createWriteStream(filename)).on('close', resolve);
});
});
}
let main = async () => {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();
  await page.goto('https://memeculture69.tumblr.com/');
  await page.waitForTimeout(1000); // page.waitFor is deprecated in newer Puppeteer versions

  // Get the image URL from the selector. Return the src string itself;
  // a DOM element cannot be returned from page.evaluate.
  const imageUrl = await page.evaluate(
    () => document.querySelector('img.image').src
  );

  // Now simply pass the image URL
  // to the downloader function to download the image.
  await download(imageUrl, 'image.png');

  await browser.close();
};

main();
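Note that the request package used above is deprecated and no longer maintained. A minimal sketch of the same download helper written against Node's built-in https module instead, under the assumption that the image URL is served over HTTPS:
const https = require('https');
const fs = require('fs');

// Same idea as above, but without the deprecated request package.
function download(uri, filename) {
  return new Promise((resolve, reject) => {
    https.get(uri, (res) => {
      const file = fs.createWriteStream(filename);
      res.pipe(file);
      file.on('finish', () => file.close(() => resolve(filename)));
      file.on('error', reject);
    }).on('error', reject);
  });
}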
This code saves all images found on the page into an images folder (the folder must already exist):
page.on('response', async (response) => {
  const url = response.url();
  const matches = /.*\.(jpg|png|svg|gif)$/.exec(url);
  if (matches && (matches.length === 2)) {
    const buffer = await response.buffer();
    // Use only the last path segment as the file name; the full URL
    // contains slashes and cannot be written to disk directly.
    const fileName = url.split('/').pop();
    fs.writeFileSync(`images/${fileName}`, buffer);
  }
});
To download an image by its selector, I did the following:
Obtained the URI of the image using the selector
Passed the URI to the download function
const puppeteer = require('puppeteer');
const fs = require('fs');
var request = require('request');
//download function
var download = function (uri, filename, callback) {
request.head(uri, function (err, res, body) {
console.log('content-type:', res.headers['content-type']);
console.log('content-length:', res.headers['content-length']);
request(uri).pipe(fs.createWriteStream(filename)).on('close', callback);
});
};
(async () => {
const browser = await puppeteer.launch({
headless: true,
args: ['--no-sandbox', '--disable-setuid-sandbox'], //for no sandbox
});
const page = await browser.newPage();
await page.goto('http://example.com');// your url here
let imageLink = await page.evaluate(() => {
const image = document.querySelector('#imageId');
return image.src;
})
await download(imageLink, 'myImage.png', function () {
console.log('done');
});
...
})();
Resource: Downloading images with node.js
It is possible to get all the images without visiting each URL independently. You need to listen to all the requests to the server:
await page.setRequestInterception(true)

// page.on registers a listener synchronously, so it does not need await
page.on('request', function (request) {
  request.continue()
})

page.on('response', async function (response) {
  // Filter those responses that are interesting
  const data = await response.buffer()
  // data contains the img information
})
You can also filter based on the request type.
const blocked_resources = [
'stylesheet',
/*'image',*/
'media',
'font'
];
const _handleRequest = request => {
const type = request.resourceType();
if (blocked_resources.some(r => type === r)) {
request.abort();
return;
}
request.continue();
return;
}
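The handler above isn't wired up on its own. A minimal sketch of how it would typically be registered, assuming the blocked_resources array and _handleRequest function from the previous snippet are in scope, and that images are still captured via a response listener as shown earlier:
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();

  // Interception must be enabled before the handler can abort or continue requests.
  await page.setRequestInterception(true);
  page.on('request', _handleRequest);

  // 'image' is not in blocked_resources, so image responses still arrive here.
  page.on('response', async (response) => {
    if (response.request().resourceType() === 'image') {
      const data = await response.buffer();
      // data holds the image bytes; write it to disk as in the earlier answers
    }
  });

  await page.goto('https://www.example.com/'); // placeholder URL
  await browser.close();
})();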
const puppeteer = require("puppeteer")
const fs = require("fs/promises")
// Add the URL of the website you want to scrape below
// (it must include the protocol, otherwise page.goto will throw)
const yourURL = "https://example.com"

async function scrapeIt() {
  // Launch the browser
  const browser = await puppeteer.launch()
  // Open a new page in the browser
  const page = await browser.newPage()
  // Navigate to the provided URL
  await page.goto(yourURL)

  // Collect the src of every <img> on the page
  const photos = await page.$$eval("img", (imgs) => {
    return imgs.map((x) => x.src)
  })

  for (const photo of photos) {
    const imagepage = await page.goto(photo)
    // Strip any query string so the file name is valid on disk
    await fs.writeFile(photo.split("/").pop().split("?")[0], await imagepage.buffer())
}
await browser.close()
}
scrapeIt()
Download Google images in 100% quality based on your search query using Puppeteer in Node.js.
It is a straightforward approach:
Open Google Images.
Search for images using a keyword.
Click the images one by one to open the preview panel on the right.
Store all the links.
Download the images.
Note: if you download the images without previewing them, you will lose quality.
const fs = require('fs');
const puppeteer = require('puppeteer');
const axios = require('axios').default;
// fileUrl: the absolute url of the image or video you want to download
// downloadFolder: the path of the downloaded file on your machine
const downloadFile = async (fileUrl,localFilePath) => {
try {
const response = await axios({
method: 'GET',
url: fileUrl,
responseType: 'stream',
});
const w = response.data.pipe(fs.createWriteStream(localFilePath));
w.on('finish', () => {
console.log('Successfully downloaded file!');
});
} catch (err) {
throw new Error(err);
}
};
const Google_Image = 'https://www.google.com/search?site=&tbm=isch&source=hp&biw=1873&bih=990&'
let data = 'Ramayana HD Images Good Quality wallpaper'
let search_url = Google_Image + 'q=' + data;
var imagelinkslist =[];
let main = async () => {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
let result;
await page.goto(search_url);
// /html/body/div[2]/c-wiz/div[3]/div[1]/div/div/div/div[1]/div[1]/span/div[1]/div[1]/div[1]/a[1]/div[1]/img
// /html/body/div[2]/c-wiz/div[3]/div[1]/div/div/div/div[1]/div[1]/span/div[1]/div[1]/div[2]/a[1]/div[1]/img
let previewimagexpath = '/html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div[2]/div/a/img'
// previewimagexpath = '//*[@id="Sva75c"]/div/div/div[3]/div[2]/c-wiz/div/div[1]/div[1]/div[2]/div/a/img'
for(let i=1;i<20;i++)
{
let imagexpath = '/html/body/div[2]/c-wiz/div[3]/div[1]/div/div/div/div[1]/div[1]/span/div[1]/div[1]/div['+i+']/a[1]/div[1]/img'
const elements = await page.$x(imagexpath)
await elements[0].click();
await page.waitForTimeout(3000);
const image = await page.$x(previewimagexpath);
let d = await image[0].getProperty('src')
//console.log(d._remoteObject.value);
imagelinkslist.push(d._remoteObject.value)
}
await browser.close();
};
main().then(()=>{
console.log('Got Image links');
imagelinkslist.map((el,index)=>{
let url = el;
//console.log(url);
const path = `./images/image${index+1}.png`;
if(url.includes('https'))
downloadFile(url , path);
})
// console.log(imagelinkslist)
});
I've managed to get Puppeteer working to scrape data from a number of different web pages. However, I'm repeating the same if statement for each bit of data. I'm really new to JavaScript, so I'm pretty sure I'm overlooking something simple that would avoid the repetition.
I've searched online quite a bit and tried a few different things but can't get it working.
In the code example below, I've taken out a lot of the different query selectors so it's easier to read, but in the actual code there are 12 of them, all with exactly the same code except the querySelector.
const puppeteer = require('puppeteer');
// This gets the url's I want to scrape stored on another file
let urls = require('./assets/links.js').urls;
(async () => {
// Initiate the browser
const browser = await puppeteer.launch();
// Create a new page with the default browser context
const page = await browser.newPage();
for (let i = 0; i < urls.length; i++) {
// Go to the target website
await page.goto(urls[i]);
let url = urls[i];
const title = await page.evaluate(() => {
let element = document.querySelector('h1')
if (element) {
return element.innerText
} return null;
})
const reviews = await page.evaluate(() => {
let element = document.querySelector('.example-class')
if (element) {
return element.innerText
} return null;
})
const description = await page.evaluate(() => {
let element = document.querySelector('#example-id')
if (element) {
return element.innerText
} return null;
})
console.log({ url, title, reviews, description });
}
// Closes the browser and all of its pages
await browser.close();
})();
I've tried creating a function but it wouldn't let me use it with await.
You can write a simple function that grabs text from a selector and returns null if the element doesn't exist:
const puppeteer = require("puppeteer"); // ^19.0.0
const {urls} = require("./assets/links.js");
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
const text = sel =>
page.$eval(sel, el => el.textContent).catch(() => null);
for (const url of urls) {
await page.goto(url);
console.log({
url,
title: await text("h1"),
reviews: await text(".example-class"),
description: await text("#example-id"),
});
}
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
If there are 12 of them, you might want to add an array and a loop:
const puppeteer = require("puppeteer"); // ^19.0.0
const {urls} = require("./assets/links.js");
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
const text = sel =>
page.$eval(sel, el => el.textContent).catch(() => null);
const selectors = {
title: "h1",
reviews: ".example-class",
description: "#example-id",
// ...
};
for (const url of urls) {
await page.goto(url);
const textProms = Object.entries(selectors).map(
  // each entry is a [key, selector] pair, so destructure it
  async ([k, sel]) => [k, await text(sel)]
);
console.log({
url,
...Object.fromEntries(await Promise.all(textProms)),
});
}
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
I am trying to scrape YouTube headlines and links from a channel using Puppeteer. While executing the program, I am facing the following evaluation error:
Error: Evaluation failed: TypeError: Cannot read properties of null (reading 'innerText')
at pptr://__puppeteer_evaluation_script__:10:65
at ExecutionContext._ExecutionContext_evaluate (E:\somoy\node_modules\puppeteer-core\lib\cjs\puppeteer\common\ExecutionContext.js:229:15)
at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
at async ExecutionContext.evaluate (E:\somoy\node_modules\puppeteer-core\lib\cjs\puppeteer\common\ExecutionContext.js:107:16)
at async initiate (E:\somoy\appNew.js:45:20)
at async E:\somoy\appNew.js:155:9
async function initiate() {
const browser = await puppeteer.launch({ headless: false, defaultViewport: null, userDataDir: './userdata', executablePath: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe' });
const page = await browser.newPage();
page.setDefaultNavigationTimeout(0)
await page.goto('https://www.youtube.com/@ProthomAlo/videos', { waitUntil: 'networkidle2' });
await delay(5000);
if (!fs.existsSync('storeLink.txt')) {
//create new file if not exist
fs.writeFileSync("storeLink.txt", '');
}
articleLinkarr = (fs.readFileSync('storeLink.txt', { encoding: 'utf8' })).split('\n')
let articles = await page.evaluate(async (articleLinkarr) => {
//console.log('Hello1')
let arrObj = [];
articles = document.querySelectorAll('.style-scope.ytd-rich-grid-media');
for (let i = 0; i < articles.length; i++) {
//for (let i = 0; i < 20; i++) {
//const category = document.querySelector('.print-entity-section-wrapper.F93gk').innerText
//const headline = articles[i].querySelector('div > h3').innerText
const headline = articles[i].querySelector('h3').innerText
const link = 'https://www.youtube.com' + articles[i].querySelector('a').getAttribute('href')
// if (!(link.includes('video') || link.includes('fun') || link.includes('photo'))) {
// if (!articleLinkarr.includes(link)) {
arrObj.push({ articleHeadline: headline, articleLink: link })
// }
// }
};
return arrObj;
}, articleLinkarr)
}
Puppeteer doesn't seem necessary here if you just want the initial set of titles. There's a JSON blob in the static HTML which has the title list, so you can make a simple HTTP request to the URL and pull the blob out with an HTML parser, then walk the object structure.
const cheerio = require("cheerio"); // 1.0.0-rc.12
const url = "Your URL";
fetch(url) // Node 18 or install node-fetch
.then(res => {
if (!res.ok) {
throw Error(res.statusText);
}
return res.text();
})
.then(html => {
const $ = cheerio.load(html);
const script = $(
[...$("script")].find(e =>
$(e).text().startsWith("var ytInitialData = {")
)
)
.text()
.slice(20, -1);
const data = JSON.parse(script);
const titles = [];
const {contents} =
data.contents.twoColumnBrowseResultsRenderer.tabs[1].tabRenderer
.content.richGridRenderer;
for (const c of contents) {
if (!c.richItemRenderer) {
continue;
}
const title =
c.richItemRenderer.content.videoRenderer.title.runs[0].text;
const url =
c.richItemRenderer.content.videoRenderer.navigationEndpoint
.commandMetadata.webCommandMetadata.url;
titles.push({title, url});
}
console.log(titles);
})
.catch(err => console.error(err));
If you do want to use Puppeteer, you can select these titles and URLs with:
const puppeteer = require("puppeteer"); // ^19.0.0
const url = "Your URL";
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.goto(url, {waitUntil: "domcontentloaded"});
await page.waitForSelector("#video-title-link");
const titles = await page.$$eval("#video-title-link", els =>
els.map(e => ({title: e.textContent, url: e.href}))
.filter(e => e.url)
);
console.log(titles);
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
For some reason, the ids aren't unique.
Although this is less code, this approach is much slower than fetch (~10x slower on my machine). You can speed it up a bit by blocking irrelevant resources, as shown in the sketch below.
As an aside, always use const in front of your variables to avoid making them global.
page.setDefaultNavigationTimeout(0) is generally not a great pattern--this could hang forever. I'd set this to 3 or 4 minutes at most. If nav is taking that long, something is wrong and you should get that logged so you can take a look at it.
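For reference, a minimal sketch of both suggestions together (blocking irrelevant resources and using a bounded navigation timeout), reusing the same #video-title-link selector as above; the 3-minute cap is an arbitrary example value:
const puppeteer = require("puppeteer"); // ^19.0.0

const url = "Your URL";

let browser;
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();

  // Cap navigation at 3 minutes instead of disabling the timeout entirely.
  page.setDefaultNavigationTimeout(3 * 60 * 1000);

  // Block resources that aren't needed to read titles and links.
  await page.setRequestInterception(true);
  page.on("request", req => {
    const type = req.resourceType();
    if (["image", "media", "font", "stylesheet"].includes(type)) {
      req.abort();
    } else {
      req.continue();
    }
  });

  await page.goto(url, {waitUntil: "domcontentloaded"});
  await page.waitForSelector("#video-title-link");
  const titles = await page.$$eval("#video-title-link", els =>
    els.map(e => ({title: e.textContent, url: e.href}))
      .filter(e => e.url)
  );
  console.log(titles);
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());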
What I'm trying to accomplish is to save the complete document with all the comments expanded.
Unfortunately there are multiple selectors with the same class and most of them are hidden. I believe Puppeteer takes the first matching selector and waits until it's visible, which never happens.
Url: https://www.discoverpermaculture.com/permaculture-masterclass-video-1
const puppeteer = require('puppeteer');
const isElementVisible = async (page, cssSelector) => {
let visible = true;
await page
.waitForSelector(cssSelector, { visible: true, timeout: 2000 })
.catch(() => {
visible = false;
});
if (visible) console.log('Selector ' + cssSelector + ' visible!');
return visible;
};
async function run () {
let browser = await puppeteer.launch({headless: true, defaultViewport: null, args: ['--window-size=1920,10000',],});
const page = await browser.newPage();
const fs = require('fs');
await page.goto('https://www.discoverpermaculture.com/permaculture-masterclass-video-1');
await page.waitForTimeout(4000)
const elementHandle = await page.waitForSelector('iframe');
const frame = await elementHandle.contentFrame();
//loading all the comments (works because there's only one 'a.load-more__button' element a time)
const selectorForLoadMoreButton = 'a.load-more__button';
let loadMoreVisible = await isElementVisible(frame, selectorForLoadMoreButton);
while (loadMoreVisible) {
console.log('Loading comments');
await frame
.click(selectorForLoadMoreButton)
.catch(() => {});
loadMoreVisible = await isElementVisible(frame, selectorForLoadMoreButton);
}
//expanding comments doesn't work because each comment have a.see-more but some are hidden
const selectorForSeeMoreButton = 'a.see-more';
let seeMoreVisible = await isElementVisible(frame, selectorForSeeMoreButton);
while (seeMoreVisible) {
console.log('Expanding comments');
await frame
.click(selectorForSeeMoreButton)
.catch(() => {});
seeMoreVisible = await isElementVisible(frame, selectorForSeeMoreButton);
}
const cdp = await page.target().createCDPSession();
const { data } = await cdp.send('Page.captureSnapshot', { format: 'mhtml' });
fs.writeFileSync('out.mhtml', data);
browser.close();
}
run();
Any ideas how to handle this?
It turned out that each comment has an 'a.see-more' element, but if the comment isn't a long one it also has a '.hidden' class. I had to update this piece of code to search for all the 'a.see-more' elements without the '.hidden' class.
const selectorForSeeMoreButton = 'a.see-more:not(.hidden)';
let seeMoreVisible = await isElementVisible(frame, selectorForSeeMoreButton);
while (seeMoreVisible) {
console.log('Expanding comments');
await frame
.click(selectorForSeeMoreButton)
.catch(() => {});
seeMoreVisible = await isElementVisible(frame, selectorForSeeMoreButton);
}
I was trying to scrape a thumbnail image from YouTube with its XPath, but I am getting undefined for the src. I can't figure out what is causing this. I already tried using both the XPath and the full XPath, but that didn't help. Any help is appreciated. Thanks in advance.
const puppeteer = require('puppeteer');
async function scrapeChannel1(url) {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(url, {
timeout: 0
});
const [el2] = await page.$x('//*[@id="dismissible"]/ytd-thumbnail');
const src1 = await el2.getProperty('src');
const thumbnailURL1 = await src1.jsonValue();
browser.close();
console.log({
thumbnailURL1
})
return {
thumbnailURL1
}
}
scrapeChannel1('https://www.youtube.com/')
The <img> you are looking for is placed a bit deeper in the DOM at: '//*[@id="dismissible"]/ytd-thumbnail/a/yt-img-shadow/img' (so you should add: /a/yt-img-shadow/img at the end of your XPath expression).
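Applied to the code in your question, that would look like this (only the XPath changes):
const [el2] = await page.$x('//*[@id="dismissible"]/ytd-thumbnail/a/yt-img-shadow/img');
const src1 = await el2.getProperty('src');
const thumbnailURL1 = await src1.jsonValue();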
Note, you have more powerful tools in puppeteer than .getProperty('src') to retrieve DOM element properties.
E.g. page.$eval:
const selector = 'ytd-thumbnail > a > yt-img-shadow > #img'
const imageSrc = await page.$eval(selector, el => el.src)
// returns: https://i.ytimg.com/vi/{youtube_id}/hqdefault.jpg...
Or if you want all images use page.$$eval:
const imageSrcs = await page.$$eval(selector, elems => elems.map(el => el.src))
If you want to get image srcs from YouTube, you need to scroll the video thumbnails into view, as in the code below:
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
const mainPageUrl = "https://www.youtube.com";
async function scrollPage(page, scrollElements) {
let currentElement = 0;
while (true) {
let elementsLength = await page.evaluate((scrollElements) => {
return document.querySelectorAll(scrollElements).length;
}, scrollElements);
for (; currentElement < elementsLength; currentElement++) {
await page.waitForTimeout(200);
await page.evaluate(
(currentElement, scrollElements) => {
document.querySelectorAll(scrollElements)[currentElement].scrollIntoView();
},
currentElement,
scrollElements
);
}
await page.waitForTimeout(5000);
let newElementsLength = await page.evaluate((scrollElements) => {
return document.querySelectorAll(scrollElements).length;
}, scrollElements);
if (newElementsLength === elementsLength || currentElement > 100) break; // if you want to get all elements (or some other number of elements) change number to 'Infinity' (or some other number)
}
}
async function getThumbnails() {
const browser = await puppeteer.launch({
headless: false,
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const page = await browser.newPage();
await page.setDefaultNavigationTimeout(60000);
await page.goto(mainPageUrl);
await page.waitForSelector("#contents");
const scrollElements = "a#thumbnail";
await scrollPage(page, scrollElements);
await page.waitForTimeout(10000);
const urls = await page.$$eval("a#thumbnail #img", (els) => els.map(el => el.getAttribute('src')).filter(el => el));
await browser.close();
return urls;
}
getThumbnails().then(console.log);
Output
[
"https://i.ytimg.com/vi/02oeySm1CJA/hq720.jpg?sqp=-oaymwEcCNAFEJQDSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBmrYMHESpY_f1oTNx00iuR3tNeCQ",
"https://i.ytimg.com/vi/RMo2haIPYBM/hq720_live.jpg?sqp=CNifxJcG-oaymwEcCNAFEJQDSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBw4ogzR0709SqbttRdEzfL-aTdgQ",
"https://i.ytimg.com/vi/qJFFp_ta1Zk/hqdefault.jpg?sqp=-oaymwEcCOADEI4CSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLBJ-44OFgBUuVUYWBVh3Yi3hQgwIg",
"https://i.ytimg.com/vi/OZoTjoN-Sn0/hqdefault.jpg?sqp=-oaymwEcCOADEI4CSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLCOeGTCnlT4U0wV1SNclkmFUEHLaA",
"https://i.ytimg.com/vi/L8cH2gI67uk/hqdefault.jpg?sqp=-oaymwEcCOADEI4CSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLAuvZ3khIjpvAVTGjmR9FDxQrPIgQ",
"https://i.ytimg.com/vi/6rUyVKyJnGY/hq720.jpg?sqp=-oaymwEcCNAFEJQDSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLCifsTG4MlA3mf8CcJDkfKdWaZkaA",
"https://i.ytimg.com/vi/xpaURivPZFk/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLA5oFDDsVzbV3tUqyfogfuf3LPahQ",
"https://i.ytimg.com/vi/MsR76PyVdUs/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLAEBYGNvif-7LWx2mqW4G9o-OUhEQ",
"https://i.ytimg.com/vi/liasQRRVt5w/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLAUcMpyKY0GhmNAHHtP_cDkAp18DQ",
"https://i.ytimg.com/vi/Dr5IqlTLMDM/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLBOSUi6mgjdD5a-Jx8Ns24SlexB1g",
"https://i.ytimg.com/vi/E8kit8xJKdI/hq720_2.jpg?sqp=-oaymwEdCJYDENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLDDStn95G7ei5DTusGXE4RimzdLUw",
"https://i.ytimg.com/vi/SqEaahOmLHU/hq720_2.jpg?sqp=-oaymwEdCM0CENAFSFXyq4qpAw8IARUAAIhCcAHAAQbQAQE=&rs=AOn4CLBDcWLCklNxEAuT1ZvSTKrIplGOag",
...and other results
]
You can read more about scraping YouTube search from my blog post Web scraping YouTube search video results with Nodejs.
I am trying to scrape clutch.co with Puppeteer using the scripts below, but it is not working properly. I'm developing a simple web scraper for clutch.co: I want to extract contacts and company names for my lead generation project. Achieving this goal would also improve my understanding of JavaScript.
browser.js
const puppeteer = require("puppeteer");
async function startBrowser() {
let browser;
try {
console.log("Opening the browser......");
browser = await puppeteer.launch({
headless: false,
args: ["--disable-setuid-sandbox"],
ignoreHTTPSErrors: true,
});
} catch (err) {
console.log("Could not create a browser instance => : ", err);
}
return browser;
}
module.exports = {
startBrowser,
};
pageController.js
const pageScraper = require("./pageScraper");
async function scrapeAll(browserInstance) {
let browser;
try {
browser = await browserInstance;
await pageScraper.scraper(browser);
} catch (err) {
console.log("Could not resolve the browser instance => ", err);
}
}
module.exports = (browserInstance) => scrapeAll(browserInstance);
pageScraper.js
const scraperObject = {
url: "https://clutch.co/sitemap",
async scraper(browser) {
let page = await browser.newPage();
console.log(`Navigating to ${this.url}...`);
await page.setDefaultNavigationTimeout(0);
await page.goto(this.url);
// Wait for the required DOM to be rendered
await page.waitForSelector(".container");
// Get the link to all Categories
let urls = await page.$$eval(".sitemap-menu > li", (links) => {
// Extract the links from the data
links = links.map((el) => el.querySelector("div > a").href);
return links;
});
// Loop through each of those links, open a new page instance and get the relevant data from them
let pagePromise = (link) =>
new Promise(async (resolve, reject) => {
let dataObj = {};
let newPage = await browser.newPage();
await page.setDefaultNavigationTimeout(0);
await newPage.goto(link);
dataObj["companyName"] = await newPage.$eval(
"h3 > a",
(text) => text.textContent
);
dataObj["tagLine"] = await newPage.$eval(
".tagline",
(text) => text.textContent
);
resolve(dataObj);
await newPage.close();
});
for (link in urls) {
let currentPageData = await pagePromise(urls[link]);
// scrapedData.push(currentPageData);
console.log(currentPageData);
}
},
};
module.exports = scraperObject;
index.js
const browserObject = require("./browser");
const scraperController = require("./pageController");
//Start the browser and create a browser instance
let browserInstance = browserObject.startBrowser();
// Pass the browser instance to the scraper controller
scraperController(browserInstance);
I can't get the script to load and scrape the profile pages. It keeps giving me errors.