Two async/await functions are running in parallel instead of sequentially - javascript

I have two async functions.
Each function launches a headless browser.
The scrapeHeaders() function scrapes data and saves it to a JSON file.
The scrapeData() function reads the JSON file written by scrapeHeaders(), scrapes data, and saves it to a JSON file.
I am calling both functions in the main function and expect them to run in sequential order, but when I run main() both functions run in parallel and launch two headless browsers. Here is the code.
async function main() {
// extract headers.
console.log("scraping headers ... ");
await scrapeHeaders();
// scrape data.
console.log("scraping data ... ");
await scrapeData();
}
In my previous implementation I returned data from scrapeHeaders() and passed it to scrapeData(), and then the logic worked as expected.
I've read that async/await code runs sequentially.
But I think the engine is treating both functions as independent, which is why it runs them in parallel. How do I tell the engine to wait until the first function has finished completely?
What is another way to solve the problem without passing data from the first function to the second?
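For reference, my understanding is that await does serialize the calls, as long as each function only resolves after all of its own work has been awaited; if anything inside scrapeHeaders() is fired without await, or main() is invoked more than once, the second browser can appear early. A minimal, self-contained sketch (hypothetical stepOne/stepTwo stand-ins) that demonstrates the expected ordering:
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

async function stepOne() {
  console.log("step one: start");
  await sleep(2000);
  console.log("step one: done");
}

async function stepTwo() {
  console.log("step two: start");
  await sleep(2000);
  console.log("step two: done");
}

async function main() {
  await stepOne();
  // stepTwo() is not even called until stepOne()'s promise has settled
  await stepTwo();
}

main();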
Code of the scrapeHeaders() method:
export async function scrapeHeaders() {
const url = 'url';
const browser = await puppeteer.launch({
headless: false,
defaultViewport: null,
});
const page = await browser.newPage();
// change user agent.
await changeUserAgent(page);
await page.setCookie({
name: "flmdat",
value:
"value",
domain: "www.something.com",
path: "/",
});
// block resources.
// await blockResources(page);
await page.goto(url, { timeout: 0, waitUntil: "networkidle2" });
// getting number of projects available on site.
const someData = await page.evaluate(() => {
//browser logic
});
if (someData === 0) {
console.log("No data available");
return [];
}
const possiblePages = Math.ceil(someData / 25);
const maxPages = 400;
const pages = possiblePages > maxPages ? maxPages : possiblePages;
console.log("pages to scrape: ", pages);
let totalHeaders: Header[] = [];
await waitForTimeout(10000);
for (let i = 0; i < pages; i++) {
console.log(`going to page ${i + 1}`);
if (i !== 0) {
// gets next url if not first page
const nextUrl = `${url}?&page=${i + 1}`;
try {
await page.goto(nextUrl, {
timeout: 60000, // wait for 1 minute
waitUntil: "networkidle2",
});
} catch (err) {
handleError(err, `error while going to ${nextUrl}`);
}
}
const rawHeaders = await page.evaluate(() => {
// browser logic
return headers
});
// filtering out undefined entries before writing headers to file.
const headers = rawHeaders.filter(
(header) => header !== undefined
) as Header[];
// saving headers of each page to total headers array.
totalHeaders.push(...headers);
// saving headers scraped at the moment to file.
fs.writeFileSync(
"headers.json",
JSON.stringify(totalHeaders)
);
console.log(`got ${headers.length} projects info from page ${i + 1}`);
// to not burden server.
}
console.log(`scraped ${totalHeaders.length} headers`);
await browser.close();
}
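Note that changeUserAgent(), handleError() and waitForTimeout() are local helpers that aren't shown here; assuming waitForTimeout() is just a plain sleep, it would look something like this sketch:
// assumed helper: a setTimeout-based sleep, awaited above as `await waitForTimeout(10000);`
function waitForTimeout(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
It could also be awaited at the end of each loop iteration, to back the "to not burden server" comment with an actual delay.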
Code of the scrapeData() method:
export async function scrapeData() {
const browser = await puppeteer.launch({
headless: false,
defaultViewport: null,
});
const page = await browser.newPage();
// changing user agent
await changeUserAgent(page);
await page.setCookie({
name: "name",
value:
"value",
domain: "www.something.com",
path: "/",
});
let headers: Header[] = [];
try {
headers = JSON.parse(
fs.readFileSync("headers.json", "utf-8")
);
} catch (err) {
throw new Error("error while reading headers file in Posts");
}
console.log("total headers to scrape: ", headers.length);
let posts: Info[] = [];
for (let i = 0; i < headers.length; i++) {
// scrape data from post
try {
const post: Info | null = await scrapePost(
headers[i],
page
);
// if post is not available.
if (post === null) {
console.log(`post with ${headers[i].url} is not available. skipping it...`);
continue;
}
if (post.url) {
// scrape data from company page
const info: postPage = await Info(
post.url,
page
);
posts.push({ ...post, ...info });
} else {
posts.push(post);
}
} catch (err) {
handleError(err, `error while scraping post page url:${headers[i].url}`);
}
if ((i + 1) % 10 === 0) {
// saving data in file after every 10 posts.
fs.writeFileSync("posts.json", JSON.stringify(posts));
console.log(`Scraped ${i + 1} posts`);
}
}
fs.writeFileSync("posts.json", JSON.stringify(posts));
console.log(`scraped ${posts.length} posts`);
await page.close();
await browser.close();
}
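One way to keep the two steps decoupled (no data passed between them) while still making the ordering explicit is to have main() verify that headers.json actually exists before the second step starts. A minimal sketch of that guard, assuming both functions are imported from their own modules (the paths are placeholders):
import fs from "fs";
import { scrapeHeaders } from "./scrapeHeaders";
import { scrapeData } from "./scrapeData";

async function main() {
  console.log("scraping headers ... ");
  await scrapeHeaders();
  // guard: refuse to start step 2 until step 1's output file is on disk
  if (!fs.existsSync("headers.json")) {
    throw new Error("headers.json was not written; aborting before scrapeData()");
  }
  console.log("scraping data ... ");
  await scrapeData();
}

main().catch((err) => console.error(err));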

Related

Investigating an issue with my Uniswap tokens scraper that errors out after 7 requests to the Graph API

I'm making a scraper that will grab every Uniswap pair and save it to an array using the Graph API.
My problem occurs when I make my 7th request to the API.
Initially, I thought I was being rate limited because I was fetching 1000 tokens at a time, but after adding a 10 second wait between calls and decreasing the fetched tokens from 1000 to 10, it still stops on the 7th loop.
The script works perfectly until this point.
const axios = require('axios');
const fs = require('fs');
async function getTokens(skip) {
try {
const query = `
query tokens($skip: Int!) {
tokens(first: 10, skip: $skip) {
id
name
symbol
}
}
`;
const variables = {
skip: skip
};
const headers = {
"Content-Type": "application/json"
};
const { data } = await axios.post("https://api.thegraph.com/subgraphs/name/uniswap/uniswap-v3", {
query,
variables
}, {
headers
});
return data.data.tokens;
} catch (err) {
console.error(err);
return []
}
}
async function saveTokens(tokens) {
try {
await fs.promises.writeFile("uniTokens.json", JSON.stringify(tokens), { flag: "w" });
} catch (err) {
console.error(err);
}
}
async function main() {
let skip = 0;
let tokens = [];
const retrievedIds = new Set();
while (true) {
const newTokens = await getTokens(skip);
if (newTokens.length === 0) {
console.log("Reached end of tokens, finishing up...");
break;
}
// Only save tokens that haven't been retrieved before
const newIds = new Set(newTokens.map(token => token.id));
newIds.forEach(id => {
if (!retrievedIds.has(id)) {
tokens.push(newTokens.find(token => token.id === id));
retrievedIds.add(id);
}
});
console.log(`Retrieved ${tokens.length} tokens`);
await saveTokens(tokens);
skip += 1000;
// delay the next request by 10 seconds
//await new Promise(resolve => setTimeout(resolve, 10000));
}
}
main();
This is the error that it produces:
TypeError: Cannot read properties of undefined (reading 'tokens')
at getTokens (/root/unipairs/uni:31:26)
at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
at async main (/root/unipairs/uni:52:27)
Reached end of tokens, finishing up...
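The TypeError means data.data was undefined, which is what you get when the endpoint answers with a GraphQL errors payload instead of data. Since skip grows by 1000 per loop, the 7th request is the first one with skip = 6000; if the endpoint caps skip (The Graph's hosted service has historically capped it around 5000), that would explain why it is exactly the 7th call that fails. A hedged sketch of a more defensive getTokens() that surfaces the real error instead of crashing on .tokens:
const axios = require("axios");

async function getTokens(skip) {
  const query = `
    query tokens($skip: Int!) {
      tokens(first: 10, skip: $skip) {
        id
        name
        symbol
      }
    }
  `;
  const { data } = await axios.post(
    "https://api.thegraph.com/subgraphs/name/uniswap/uniswap-v3",
    { query, variables: { skip } },
    { headers: { "Content-Type": "application/json" } }
  );
  // Many GraphQL servers return HTTP 200 with an `errors` array, so axios
  // will not throw on its own; check for it explicitly.
  if (data.errors) {
    throw new Error(`GraphQL error: ${JSON.stringify(data.errors)}`);
  }
  return data.data.tokens;
}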

Web scraping using Puppeteer returns undefined during AtCoder contest

I made a web scraper for parsing test cases from AtCoder contests. It works well if the contest has already finished but gives an error for an ongoing contest. The error arises when accessing the rows of the table HTML element. I am positive that the table exists, but for some reason the script returns undefined for an ongoing contest.
Error:
Error: Evaluation failed: TypeError: Cannot read properties of undefined (reading 'rows')
at pptr://__puppeteer_evaluation_script__:3:32
at ExecutionContext._ExecutionContext_evaluate (/mnt/d/c++/node_modules/puppeteer-core/lib/cjs/puppeteer/common/ExecutionContext.js:229:15)
at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
at async ExecutionContext.evaluate (/mnt/d/c++/node_modules/puppeteer-core/lib/cjs/puppeteer/common/ExecutionContext.js:107:16)
at async scrapeSite (/mnt/d/c++/codeforces/atcoder.js:57:33)
Here is my scraper, atcoder.js:
const puppeteer = require("puppeteer");
const fs = require("fs");
const contest_id = process.argv[2];
async function scrapeProblem(problem_letter) {
const url = `https://atcoder.jp/contests/${contest_id}/tasks/${contest_id}_${problem_letter.toLowerCase()}`;
console.log(url);
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(url, { waitUntil: "networkidle0" });
const samples_scraped = await page.evaluate(() => {
const samples = document.querySelectorAll("pre");
const scraped = Array.from(samples).filter((child) => {
return child.id !== "";
});
let num_scraped = scraped.length;
// The elements were repeated twice, so remove the extra elements
for (let i = 0; i < num_scraped / 2; i++) scraped.pop();
return scraped.map((ele) => ele.innerText);
// return Array.from(samples).map((child) => child.innerText);
});
let id = 1;
// Now we need to store the samples in text format
samples_scraped.map((ele, idx) => {
if (idx % 2 == 0) {
// Input
fs.writeFile(`${problem_letter}-${id}.in`, ele, (err) => {
if (err) throw err;
});
} else {
// Output
fs.writeFile(`${problem_letter}-${id}.out`, ele, (err) => {
if (err) throw err;
});
id++;
}
return ele;
});
await browser.close();
} catch (e) {
console.log(e);
}
}
async function scrapeSite() {
const url = `https://atcoder.jp/contests/${contest_id}/tasks`;
console.log(url);
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(url, { waitUntil: "networkidle0" });
// Returns all the problem letters
const problem_letters = await page.evaluate(() => {
const table = document.querySelectorAll("table")[0];
const rows = table.rows.length;
const letters = [];
for (let i = 1; i < rows; i++) {
letters.push(table.rows[i].cells[0].innerText);
}
return letters;
});
console.log(problem_letters);
for (problem_letter of problem_letters) {
scrapeProblem(problem_letter);
}
await browser.close();
} catch (e) {
console.log(e);
}
}
scrapeSite();
scrapeProblem(problem_letter) is a helper function that scrapes the test cases for the given problem letter and stores them on the user's file system using the fs module.
The scrapeSite() function first parses the contest's tasks page for the number of problems and the letter associated with each problem, then calls the scrapeProblem(problem_letter) helper to collect that problem's test cases.
To run the script: node scrapper.js abc280
Update: I tried it in a new contest and got the same error again. This time I took a screenshot using Puppeteer and found the problem: I get permission denied when I try to access the site for an ongoing contest without logging in.
The site requires a login before the problem statements of an ongoing contest become visible, so I added a function that logs in first and then proceeds to parse the test cases.
Updated code:
const puppeteer = require("puppeteer");
const fs = require("fs");
require('dotenv').config();
const contest_id = process.argv[2];
async function login(browser, page) {
const url = `https://atcoder.jp/login?continue=https%3A%2F%2Fatcoder.jp%2F`;
console.log("Logging in..", url);
try {
await page.goto(url, { waitUntil: "networkidle0" });
await page.type('#username', process.env.USERNAME);
await page.type("#password", process.env.PASSWORD);
await page.click("#submit");
} catch (e) {
console.log("Login failed...");
console.log(e);
}
}
async function scrapeProblem(browser, Problem) {
const url = Problem.Url;
console.log(url);
try {
// const browser = await puppeteer.launch();
const page = await browser.newPage();
// await login(browser, page);
await page.goto(url, { waitUntil: "networkidle0" });
const samples_scraped = await page.evaluate(() => {
const samples = document.querySelectorAll("pre");
const scraped = Array.from(samples).filter((child) => {
return child.id !== "";
});
let num_scraped = scraped.length;
// The elements were repeated twice, so remove the extra elements
for (let i = 0; i < num_scraped / 2; i++) scraped.pop();
return scraped.map((ele) => ele.innerText);
// return Array.from(samples).map((child) => child.innerText);
});
let id = 1;
// Now we need to store the samples in text format
samples_scraped.map((ele, idx) => {
if (idx % 2 == 0) {
// Input
fs.writeFile(`${Problem.Problem_letter}-${id}.in`, ele, (err) => {
if (err) throw err;
});
} else {
// Output
fs.writeFile(`${Problem.Problem_letter}-${id}.out`, ele, (err) => {
if (err) throw err;
});
id++;
}
return ele;
});
// await browser.close();
} catch (e) {
console.log(e);
}
}
async function scrapeSite() {
const url = `https://atcoder.jp/contests/${contest_id}/tasks`;
console.log(url);
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await login(browser, page);
await page.goto(url, { waitUntil: "networkidle0" });
// await page.screenshot({ path: "./screenshot.png", fullPage: true});
// Returns all the problem letters
const problems = await page.evaluate(() => {
const table = document.querySelectorAll("table")[0];
const rows = table.rows.length;
const letters = [];
for (let i = 1; i < rows; i++) {
letters.push({Problem_letter: table.rows[i].cells[0].innerText, Url: table.rows[i].cells[0].firstChild.href });
}
return letters;
});
console.log(problems);
const promises = []
for (problem of problems) {
promises.push(scrapeProblem(browser, problem));
}
await Promise.all(promises); // All the promises must be resolved before closing the browser
await browser.close();
} catch (e) {
console.log(e);
}
}
scrapeSite();
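One caveat about the updated code: login() clicks #submit but never waits for the resulting navigation, so the page.goto() that follows can race the login round trip and miss the session cookie. A minimal hedged adjustment, assuming the same selectors and credentials in .env:
async function login(browser, page) {
  const url = `https://atcoder.jp/login?continue=https%3A%2F%2Fatcoder.jp%2F`;
  console.log("Logging in..", url);
  try {
    await page.goto(url, { waitUntil: "networkidle0" });
    await page.type("#username", process.env.USERNAME);
    await page.type("#password", process.env.PASSWORD);
    // Resolve only after the post-login navigation completes, so the
    // session cookie is in place before any other page is opened.
    await Promise.all([
      page.waitForNavigation({ waitUntil: "networkidle0" }),
      page.click("#submit"),
    ]);
  } catch (e) {
    console.log("Login failed...");
    console.log(e);
  }
}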

Puppeteer Node Js YouTube data scraping error "Evaluation Failed"

I am trying to scrape YouTube headlines and links from a channel using Puppeteer. While executing the program, I get the following evaluation error:
Error: Evaluation failed: TypeError: Cannot read properties of null (reading 'innerText')
at pptr://__puppeteer_evaluation_script__:10:65
at ExecutionContext._ExecutionContext_evaluate (E:\somoy\node_modules\puppeteer-core\lib\cjs\puppeteer\common\ExecutionContext.js:229:15)
at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
at async ExecutionContext.evaluate (E:\somoy\node_modules\puppeteer-core\lib\cjs\puppeteer\common\ExecutionContext.js:107:16)
at async initiate (E:\somoy\appNew.js:45:20)
at async E:\somoy\appNew.js:155:9
async function initiate() {
const browser = await puppeteer.launch({ headless: false, defaultViewport: null, userDataDir: './userdata', executablePath: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe' });
const page = await browser.newPage();
page.setDefaultNavigationTimeout(0)
await page.goto('https://www.youtube.com/#ProthomAlo/videos', { waitUntil: 'networkidle2' });
await delay(5000);
if (!fs.existsSync('storeLink.txt')) {
//create new file if not exist
fs.writeFileSync("storeLink.txt", '');
}
articleLinkarr = (fs.readFileSync('storeLink.txt', { encoding: 'utf8' })).split('\n')
let articles = await page.evaluate(async (articleLinkarr) => {
//console.log('Hello1')
let arrObj = [];
articles = document.querySelectorAll('.style-scope.ytd-rich-grid-media');
for (let i = 0; i < articles.length; i++) {
//for (let i = 0; i < 20; i++) {
//const category = document.querySelector('.print-entity-section-wrapper.F93gk').innerText
//const headline = articles[i].querySelector('div > h3').innerText
const headline = articles[i].querySelector('h3').innerText
const link = 'https://www.youtube.com' + articles[i].querySelector('a').getAttribute('href')
// if (!(link.includes('video') || link.includes('fun') || link.includes('photo'))) {
// if (!articleLinkarr.includes(link)) {
arrObj.push({ articleHeadline: headline, articleLink: link })
// }
// }
};
return arrObj;
}, articleLinkarr)
}
Puppeteer doesn't seem necessary here if you just want the initial set of titles. There's a JSON blob in the static HTML which has the title list, so you can make a simple HTTP request to the URL and pull the blob out with an HTML parser, then walk the object structure.
const cheerio = require("cheerio"); // 1.0.0-rc.12
const url = "Your URL";
fetch(url) // Node 18 or install node-fetch
.then(res => {
if (!res.ok) {
throw Error(res.statusText);
}
return res.text();
})
.then(html => {
const $ = cheerio.load(html);
const script = $(
[...$("script")].find(e =>
$(e).text().startsWith("var ytInitialData = {")
)
)
.text()
.slice(20, -1);
const data = JSON.parse(script);
const titles = [];
const {contents} =
data.contents.twoColumnBrowseResultsRenderer.tabs[1].tabRenderer
.content.richGridRenderer;
for (const c of contents) {
if (!c.richItemRenderer) {
continue;
}
const title =
c.richItemRenderer.content.videoRenderer.title.runs[0].text;
const url =
c.richItemRenderer.content.videoRenderer.navigationEndpoint
.commandMetadata.webCommandMetadata.url;
titles.push({title, url});
}
console.log(titles);
})
.catch(err => console.error(err));
If you do want to use Puppeteer, you can select these titles and URLs with:
const puppeteer = require("puppeteer"); // ^19.0.0
const url = "Your URL";
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.goto(url, {waitUntil: "domcontentloaded"});
await page.waitForSelector("#video-title-link");
const titles = await page.$$eval("#video-title-link", els =>
els.map(e => ({title: e.textContent, url: e.href}))
.filter(e => e.url)
);
console.log(titles);
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
For some reason, the ids aren't unique.
This is less code, but the approach is much slower than fetch (~10x slower on my machine), although you can speed it up a bit by blocking irrelevant resources, as sketched below.
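A hedged sketch of what blocking irrelevant resources could look like with Puppeteer's request interception; which resource types are safe to drop is a judgment call, so test against your page:
const puppeteer = require("puppeteer"); // ^19.0.0

const url = "Your URL";

let browser;
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  // Drop heavy resources the title scrape doesn't need
  await page.setRequestInterception(true);
  page.on("request", (req) => {
    const blocked = ["image", "media", "font"];
    if (blocked.includes(req.resourceType())) {
      req.abort();
    } else {
      req.continue();
    }
  });
  await page.goto(url, { waitUntil: "domcontentloaded" });
  await page.waitForSelector("#video-title-link");
  const titles = await page.$$eval("#video-title-link", (els) =>
    els.map((e) => ({ title: e.textContent, url: e.href })).filter((e) => e.url)
  );
  console.log(titles);
})()
  .catch((err) => console.error(err))
  .finally(() => browser?.close());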
As an aside, always use const in front of your variables to avoid making them global.
page.setDefaultNavigationTimeout(0) is generally not a great pattern--this could hang forever. I'd set this to 3 or 4 minutes at most. If nav is taking that long, something is wrong and you should get that logged so you can take a look at it.

How to handle multiple functions in puppeteer-cluster?

I have a two-step program:
Get a list of hrefs from a page.
Loop infinitely over each page in that list, get an element, and display it in the console.
I tried to use task functions with puppeteer-cluster, but it doesn't work properly.
const { Cluster } = require('puppeteer-cluster');
const fs = require("fs");
const { addExtra } = require("puppeteer-extra");
const vanillaPuppeteer = require("puppeteer");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
var moment = require('moment');
var regexTemps = /(\d+)\s(\w+)$/;
const urlsToCheck = [];
TZ = 'Europe/Paris'
process.env.TZ = 'Europe/Paris';
(async () => {
const puppeteer = addExtra(vanillaPuppeteer);
puppeteer.use(StealthPlugin());
const cluster = await Cluster.launch({
puppeteer,
puppeteerOptions: {
headless: false,
args: ['--no-sandbox'],
},
maxConcurrency: 10,
concurrency: Cluster.CONCURRENCY_CONTEXT,
monitor: false,
skipDuplicateUrls: true,
timeout:30000,
retryLimit:10,
})
cluster.on('taskerror', (err, data, willRetry) => {
if (willRetry) {
console.warn(`Encountered an error while crawling ${data}. ${err.message}\nThis job will be retried`);
} else {
console.error(`Failed to crawl ${data}: ${err.message}`);
}
});
const getElementOnPage = async ({ page, data: url }) => {
console.log('=> Go to URL : ',url);
await page.goto(url);
while (true) {
console.log('=> Reload URL : ',page.url())
await page.reload();
await page.waitForTimeout(1000);
let allNews = await page.$$("article.news"); // [] if nothing
let firstNews = allNews[0];
await page.waitForTimeout(1000);
let info = await firstNews.$eval('.info span', s => s.textContent.trim());
console.log(new Date(), 'info : ',info);
}
};
const getListOfPagesToExplore = async ({ page, data: url }) => {
console.log(new Date(), 'Get the list of deal pages to explore');
await page.goto(url, {waitUntil: 'domcontentloaded'});
await page.waitForTimeout(500);
const hrefsToVisit = await page.$x('//a');
let idxTab=0;
for( let hrefToVisit of hrefsToVisit ) {
var link = await page.evaluate(el => el.getAttribute("href"), hrefToVisit);
console.log(new Date(), 'adding link to list : ', link);
idxTab++;
urlsToCheck.push(link);
}
};
cluster.queue('https://www.apagewithsomelinks.com', getListOfPagesToExplore);
await cluster.idle();
await cluster.close();
console.log(urlsToCheck);
//Display correctly several link in an array
for( let url of urlsToCheck ) {
console.log('Push in queue : ',url);
cluster.queue(url, getElementOnPage);
}
await cluster.idle();
await cluster.close();
})();
When I launch it, it retrieves the links to scrape.
It displays "Push in queue : ..." for each URL.
But then the getElementOnPage method is launched only for the first URL, where it loops forever as intended. Why are the other URLs never launched?
Before that, I didn't use separate functions; I used one single task registered with:
await cluster.task(async ({ page, data: url }) => {
But how do I combine named task functions with this?
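For reference, puppeteer-cluster lets you register a default handler with cluster.task() and also accepts a per-job function as the second argument to cluster.queue(); the usual pattern is to queue everything and call idle()/close() exactly once at the end (the snippet above calls cluster.close() before the second batch is queued, which is worth double-checking). A minimal sketch of that shape, with placeholder task bodies:
const { Cluster } = require('puppeteer-cluster');

(async () => {
  const cluster = await Cluster.launch({
    puppeteerOptions: { headless: false },
    maxConcurrency: 10,
    concurrency: Cluster.CONCURRENCY_CONTEXT,
  });

  const urlsToCheck = [];

  // Named task functions can be passed directly as the second argument to queue().
  const getListOfPagesToExplore = async ({ page, data: url }) => {
    await page.goto(url, { waitUntil: 'domcontentloaded' });
    const links = await page.$$eval('a', (els) => els.map((el) => el.href));
    urlsToCheck.push(...links);
  };

  const getElementOnPage = async ({ page, data: url }) => {
    await page.goto(url);
    // ... per-page work ...
  };

  cluster.queue('https://www.apagewithsomelinks.com', getListOfPagesToExplore);
  await cluster.idle(); // wait for the link-collection job only; do not close yet

  for (const url of urlsToCheck) {
    cluster.queue(url, getElementOnPage);
  }

  await cluster.idle();
  await cluster.close(); // close once, after everything has been queued and finished
})();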

Puppeteer web scraping is not loading profile for scraping

I am trying to scrape clutch.co with Puppeteer using the scripts below, but it is not working properly. I'm developing a simple web scraper for clutch.co to extract contacts and company names for my lead-generation project; working toward this goal is also improving my understanding of JavaScript.
browser.js
const puppeteer = require("puppeteer");
async function startBrowser() {
let browser;
try {
console.log("Opening the browser......");
browser = await puppeteer.launch({
headless: false,
args: ["--disable-setuid-sandbox"],
ignoreHTTPSErrors: true,
});
} catch (err) {
console.log("Could not create a browser instance => : ", err);
}
return browser;
}
module.exports = {
startBrowser,
};
pageController.js
const pageScraper = require("./pageScraper");
async function scrapeAll(browserInstance) {
let browser;
try {
browser = await browserInstance;
await pageScraper.scraper(browser);
} catch (err) {
console.log("Could not resolve the browser instance => ", err);
}
}
module.exports = (browserInstance) => scrapeAll(browserInstance);
pageScraper.js
const scraperObject = {
url: "https://clutch.co/sitemap",
async scraper(browser) {
let page = await browser.newPage();
console.log(`Navigating to ${this.url}...`);
await page.setDefaultNavigationTimeout(0);
await page.goto(this.url);
// Wait for the required DOM to be rendered
await page.waitForSelector(".container");
// Get the link to all Categories
let urls = await page.$$eval(".sitemap-menu > li", (links) => {
// Extract the links from the data
links = links.map((el) => el.querySelector("div > a").href);
return links;
});
// Loop through each of those links, open a new page instance and get the relevant data from them
let pagePromise = (link) =>
new Promise(async (resolve, reject) => {
let dataObj = {};
let newPage = await browser.newPage();
await page.setDefaultNavigationTimeout(0);
await newPage.goto(link);
dataObj["companyName"] = await newPage.$eval(
"h3 > a",
(text) => text.textContent
);
dataObj["tagLine"] = await newPage.$eval(
".tagline",
(text) => text.textContent
);
resolve(dataObj);
await newPage.close();
});
for (link in urls) {
let currentPageData = await pagePromise(urls[link]);
// scrapedData.push(currentPageData);
console.log(currentPageData);
}
},
};
module.exports = scraperObject;
index.js
const browserObject = require("./browser");
const scraperController = require("./pageController");
//Start the browser and create a browser instance
let browserInstance = browserObject.startBrowser();
// Pass the browser instance to the scraper controller
scraperController(browserInstance);
I can't get the script to load and scrape the profile pages; it keeps giving me errors.
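A couple of things stand out in pageScraper.js: wrapping an async function in a Promise constructor swallows rejections, setDefaultNavigationTimeout is called on page instead of newPage, and for...in iterates object keys rather than values. A hedged sketch of the per-link scrape as a plain async function (scrapeCompanyPage is a hypothetical name; the selectors are kept from the original and may still need adjusting for clutch.co):
// hypothetical rewrite of pagePromise as a plain async function
const scrapeCompanyPage = async (browser, link) => {
  const newPage = await browser.newPage();
  try {
    newPage.setDefaultNavigationTimeout(120000); // bounded timeout instead of 0
    await newPage.goto(link, { waitUntil: "domcontentloaded" });
    await newPage.waitForSelector("h3 > a");
    const companyName = await newPage.$eval("h3 > a", (el) => el.textContent.trim());
    const tagLine = await newPage.$eval(".tagline", (el) => el.textContent.trim());
    return { companyName, tagLine };
  } finally {
    await newPage.close(); // always close the tab, even when a selector is missing
  }
};

// usage inside scraper(browser), after `urls` has been collected:
for (const link of urls) {
  console.log(await scrapeCompanyPage(browser, link));
}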
