async method always returning undefined - javascript

I can't solve this problem, so I'm asking here:
This is the async function, and as you can see it returns an array. But the caller receives undefined.
async function scrape(pageURL) {
    var dealArray = [];
    try {
        const browser = await puppeteer.launch({ headless: true });
        const page = await browser.newPage();
        await page.goto(pageURL);
        await page.waitForSelector('div.s-item-container');
        const dealsElements = await page.$$('div.s-item-container');
        for (const deal of dealsElements) {
            let dealTitleElement = await deal.$('div.s-item-container a.s-access-detail-page');
            let dealTitleValue = await (await dealTitleElement.getProperty('title')).jsonValue();
            let dealPriceElement = await deal.$('div.s-item-container span.a-color-price');
            let dealPriceValue = await (await dealPriceElement.getProperty('textContent')).jsonValue();
            let dealReviewsElement = await deal.$('div.s-item-container .a-icon-star');
            let dealLinkValue = await (await dealTitleElement.getProperty('href')).jsonValue() + '&tag=dragonstv-21';
            let dealReviewsClass = await (await dealReviewsElement.getProperty('className')).jsonValue();
            let dealReviewsValue;
            if (dealReviewsClass) {
                let starValue = dealReviewsClass.substring(26);
                if (starValue.indexOf('-') === -1) {
                    dealReviewsValue = starValue;
                } else {
                    let stars = starValue.replace('-', '.');
                    dealReviewsValue = stars;
                }
            }
            dealArray.push({
                "title": dealTitleValue,
                "price": dealPriceValue,
                "reviews": dealReviewsValue + "/5.0",
                "link": dealLinkValue,
                "store": "Amazon",
            });
        }
        return Promise.resolve(dealArray);
    } catch (e) {
        console.error('Error: ' + e);
    }
}
And here is how I'm calling it:
scrape('working link').then((data) => {
    console.log(data); // result: undefined
});
It works only if I declare the array outside the function and have the function mutate it instead of returning anything.

As written, your function must return an array (empty or otherwise). If it's returning undefined, then you're generating an exception and should see one in the console, via your catch statement. If you're not seeing it, you might try removing the try/catch and see what exception bubbles up.
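To make that failure mode concrete: because the catch block above only logs and neither returns nor rethrows, the function's promise resolves with undefined whenever anything inside the try throws. A minimal sketch (scraping body elided) of rethrowing so the caller sees the actual error instead of undefined:

async function scrape(pageURL) {
    const dealArray = [];
    try {
        // ... scraping logic as above ...
        return dealArray; // an async function wraps this in a resolved promise automatically
    } catch (e) {
        console.error('Error: ' + e);
        throw e; // rethrow so the caller's promise is rejected instead of resolving with undefined
    }
}

scrape('working link')
    .then((data) => console.log(data))
    .catch((err) => console.error('scrape failed:', err));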

I've actually figured the problem out. The request was returning a string, so I had to use JSON.parse(request) to get an object I can work with.
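For anyone hitting the same thing, a minimal sketch of that fix (request here is a hypothetical variable holding the raw response body as a string):

const request = '[{"title":"Example deal","price":"9.99"}]'; // raw string body
const deals = JSON.parse(request); // now a real array of objects
console.log(deals[0].title); // "Example deal"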

Related

Web Scraping using Puppeteer returns undefined during atcoder contest

I made a web scraper for parsing the test cases of an AtCoder contest. It works well if the contest has already finished but gives an error for an ongoing contest. The error arises when accessing the rows of the table HTML element. I am positive that the table exists, but for some reason the script returns undefined for an ongoing contest.
Error:
Error: Evaluation failed: TypeError: Cannot read properties of undefined (reading 'rows')
at pptr://__puppeteer_evaluation_script__:3:32
at ExecutionContext._ExecutionContext_evaluate (/mnt/d/c++/node_modules/puppeteer-core/lib/cjs/puppeteer/common/ExecutionContext.js:229:15)
at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
at async ExecutionContext.evaluate (/mnt/d/c++/node_modules/puppeteer-core/lib/cjs/puppeteer/common/ExecutionContext.js:107:16)
at async scrapeSite (/mnt/d/c++/codeforces/atcoder.js:57:33)
Here is my scraper, atcoder.js:
const puppeteer = require("puppeteer");
const fs = require("fs");
const contest_id = process.argv[2];
async function scrapeProblem(problem_letter) {
const url = `https://atcoder.jp/contests/${contest_id}/tasks/${contest_id}_${problem_letter.toLowerCase()}`;
console.log(url);
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(url, { waitUntil: "networkidle0" });
const samples_scraped = await page.evaluate(() => {
const samples = document.querySelectorAll("pre");
const scraped = Array.from(samples).filter((child) => {
return child.id !== "";
});
let num_scraped = scraped.length;
// The elements were repeated twice, so remove the extra elements
for (let i = 0; i < num_scraped / 2; i++) scraped.pop();
return scraped.map((ele) => ele.innerText);
// return Array.from(samples).map((child) => child.innerText);
});
let id = 1;
// Now we need to store the samples in text format
samples_scraped.map((ele, idx) => {
if (idx % 2 == 0) {
// Input
fs.writeFile(`${problem_letter}-${id}.in`, ele, (err) => {
if (err) throw err;
});
} else {
// Output
fs.writeFile(`${problem_letter}-${id}.out`, ele, (err) => {
if (err) throw err;
});
id++;
}
return ele;
});
await browser.close();
} catch (e) {
console.log(e);
}
}
async function scrapeSite() {
const url = `https://atcoder.jp/contests/${contest_id}/tasks`;
console.log(url);
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(url, { waitUntil: "networkidle0" });
// Returns all the problem letters
const problem_letters = await page.evaluate(() => {
const table = document.querySelectorAll("table")[0];
const rows = table.rows.length;
const letters = [];
for (let i = 1; i < rows; i++) {
letters.push(table.rows[i].cells[0].innerText);
}
return letters;
});
console.log(problem_letters);
for (problem_letter of problem_letters) {
scrapeProblem(problem_letter);
}
await browser.close();
} catch (e) {
console.log(e);
}
}
scrapeSite();
The scrapeProblem(problem_letter) function is a helper that scrapes the test cases for the given problem letter and stores them on the user's file system using the fs module.
The scrapeSite() function first parses the contest's task page for the number of problems and the problem letter associated with each problem. It then calls the scrapeProblem(problem_letter) helper for each one to scrape its test cases.
To run the script: node atcoder.js abc280
Update: I tried it in a new contest and again got the same error. This time I took a screenshot using Puppeteer and found the problem: I get permission denied if I try to access the site without logging in during an ongoing contest.
The site requires you to log in before you can see the problem statements of an ongoing contest, so I added a function that first logs in to the site and then proceeds to parse the test cases.
Updated code:
const puppeteer = require("puppeteer");
const fs = require("fs");
require('dotenv').config();
const contest_id = process.argv[2];
async function login(browser, page) {
const url = `https://atcoder.jp/login?continue=https%3A%2F%2Fatcoder.jp%2F`;
console.log("Logging in..", url);
try {
await page.goto(url, { waitUntil: "networkidle0" });
await page.type('#username', process.env.USERNAME);
await page.type("#password", process.env.PASSWORD);
await page.click("#submit");
} catch (e) {
console.log("Login failed...");
console.log(e);
}
}
async function scrapeProblem(browser, Problem) {
const url = Problem.Url;
console.log(url);
try {
// const browser = await puppeteer.launch();
const page = await browser.newPage();
// await login(browser, page);
await page.goto(url, { waitUntil: "networkidle0" });
const samples_scraped = await page.evaluate(() => {
const samples = document.querySelectorAll("pre");
const scraped = Array.from(samples).filter((child) => {
return child.id !== "";
});
let num_scraped = scraped.length;
// The elements were repeated twice, so remove the extra elements
for (let i = 0; i < num_scraped / 2; i++) scraped.pop();
return scraped.map((ele) => ele.innerText);
// return Array.from(samples).map((child) => child.innerText);
});
let id = 1;
// Now we need to store the samples in text format
samples_scraped.map((ele, idx) => {
if (idx % 2 == 0) {
// Input
fs.writeFile(`${Problem.Problem_letter}-${id}.in`, ele, (err) => {
if (err) throw err;
});
} else {
// Output
fs.writeFile(`${Problem.Problem_letter}-${id}.out`, ele, (err) => {
if (err) throw err;
});
id++;
}
return ele;
});
// await browser.close();
} catch (e) {
console.log(e);
}
}
async function scrapeSite() {
const url = `https://atcoder.jp/contests/${contest_id}/tasks`;
console.log(url);
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await login(browser, page);
await page.goto(url, { waitUntil: "networkidle0" });
// await page.screenshot({ path: "./screenshot.png", fullPage: true});
// Returns all the problem letters
const problems = await page.evaluate(() => {
const table = document.querySelectorAll("table")[0];
const rows = table.rows.length;
const letters = [];
for (let i = 1; i < rows; i++) {
letters.push({Problem_letter: table.rows[i].cells[0].innerText, Url: table.rows[i].cells[0].firstChild.href });
}
return letters;
});
console.log(problems);
const promises = []
for (problem of problems) {
promises.push(scrapeProblem(browser, problem));
}
await Promise.all(promises); // All the promises must be resolved before closing the browser
await browser.close();
} catch (e) {
console.log(e);
}
}
scrapeSite();
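One detail worth noting in login() above: page.click() resolves once the click is dispatched, not after the resulting navigation completes. The subsequent page.goto() calls generally paper over this, but if the login redirect ever races them, a common Puppeteer pattern is to wait for the navigation explicitly. A minimal sketch of that variation (not something the updated code strictly needs):

async function login(page) {
    const url = `https://atcoder.jp/login?continue=https%3A%2F%2Fatcoder.jp%2F`;
    await page.goto(url, { waitUntil: "networkidle0" });
    await page.type('#username', process.env.USERNAME);
    await page.type("#password", process.env.PASSWORD);
    // Start waiting for the post-login navigation before clicking, so the redirect isn't missed.
    await Promise.all([
        page.waitForNavigation({ waitUntil: "networkidle0" }),
        page.click("#submit"),
    ]);
}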

How to read a lot of documents (1M+) from a collection in Cloud Firestore?

The code below fails with error 9 FAILED_PRECONDITION: The requested snapshot version is too old.
const ref = db.collection('Collection');
const snapshot = await ref.get();
snapshot.forEach((doc, index) => {
    // ...use data
});
Getting all documents from one collection in Firestore
EDIT:
getData();

async function getData(doc) {
    let snapshot = await global.db.collection('Collection').orderBy().startAfter(doc).limit(5000).get();
    const last = snapshot.docs[snapshot.docs.length - 1];
    snapshot.forEach((doc, index) => {
        // ...use data
    });
    if (snapshot.docs.length < 5000) {
        return;
    } else {
        getData(last);
    }
}
EDIT 2 (works, a bit slow, reads sequentially, 5000 docs at a time):
let snapshot = null;
let totalIndex = 0;
await getData();

async function getData(doc) {
    if (!doc) {
        const first = await global.db.collection("collection").doc("docID");
        snapshot = await global.db.collection('collection').orderBy(admin.firestore.FieldPath.documentId()).startAt(first).limit(5000).get();
    } else {
        snapshot = await global.db.collection('Prompts').orderBy(admin.firestore.FieldPath.documentId()).startAfter(doc).limit(5000).get();
    }
    const last = snapshot.docs[snapshot.docs.length - 1];
    snapshot.forEach((doc, index) => {
        console.log(totalIndex++);
        // ...use data
    });
    if (snapshot.docs.length < 5000) {
        return;
    } else {
        getData(last);
    }
}
Since you're initially calling getData() without any arguments, doc is undefined in that function body, and calling startAfter(undefined) is not valid.
What you'll want to do is add the startAfter clause only when you have a document to start after, something like:
async function getData(doc) {
    let query = global.db.collection('Collection').orderBy();
    if (doc) {
        query = query.startAfter(doc);
    }
    let snapshot = await query.limit(5000).get();
    ...
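Putting that together with the recursive approach from the question (and EDIT 2's idea of ordering by document ID so the cursor is stable), a minimal sketch of paginating through the whole collection in 5000-document batches. The collection name 'Collection' and the processDoc callback are placeholders, and this assumes the Admin SDK runs with default credentials:

const admin = require('firebase-admin');
admin.initializeApp(); // assumes default credentials

async function readAll(lastDoc) {
    let query = admin.firestore().collection('Collection')
        .orderBy(admin.firestore.FieldPath.documentId())
        .limit(5000);
    if (lastDoc) {
        query = query.startAfter(lastDoc); // only add the cursor when there is one
    }
    const snapshot = await query.get();
    snapshot.forEach((doc) => {
        processDoc(doc.data()); // placeholder for whatever you do with each document
    });
    // A short batch means we've reached the end; otherwise recurse with the last doc as the cursor.
    if (snapshot.docs.length === 5000) {
        await readAll(snapshot.docs[snapshot.docs.length - 1]);
    }
}

readAll().catch(console.error);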

Two async/await functions are running in parallel instead of sequentially

I have two async functions. Each function launches a headless browser.
The scrapeHeaders() function scrapes data and saves it to a JSON file.
The scrapeData() function reads from the JSON file written by scrapeHeaders(), scrapes more data, and saves it to another JSON file.
I am calling both functions in the main function. I expect them to run sequentially, but when I run the main function both functions run in parallel and launch two headless browsers. Here is the code.
async function main() {
    // extract headers.
    console.log("scraping headers ... ");
    await scrapeHeaders();
    // scrape data.
    console.log("scraping data ... ");
    await scrapeData();
}
In my previous implementation, I was returning data from scrapeHeaders() and passing it to scrapeData(), and then the logic worked as expected.
I've read that async/await code runs sequentially. But I think the engine is treating both functions as independent and that's why it is running them in parallel. How do I tell the engine to wait until the first function has finished completely?
Alternatively, what's another way to solve the problem without passing data from the first function to the second?
Code of the scrapeHeaders() method:
export async function scrapeHeaders() {
    const url = 'url';
    const browser = await puppeteer.launch({
        headless: false,
        defaultViewport: null,
    });
    const page = await browser.newPage();
    // change user agent.
    await changeUserAgent(page);
    await page.setCookie({
        name: "flmdat",
        value: "value",
        domain: "www.something.com",
        path: "/",
    });
    // block resources.
    // await blockResources(page);
    await page.goto(url, { timeout: 0, waitUntil: "networkidle2" });
    // getting number of projects available on site.
    const someData = await page.evaluate(() => {
        // browser logic
    });
    if (someData === 0) {
        console.log("No data available");
        return [];
    }
    const possiblePages = Math.ceil(someData / 25);
    const maxPages = 400;
    const pages = possiblePages > maxPages ? maxPages : possiblePages;
    console.log("pages to scrape: ", pages);
    let totalHeaders: Header[] = [];
    await waitForTimeout(10000);
    for (let i = 0; i < pages; i++) {
        console.log(`going to page ${i + 1}`);
        if (i !== 0) {
            // gets next url if not first page
            const nextUrl = `${url}?&page=${i + 1}`;
            try {
                await page.goto(nextUrl, {
                    timeout: 60000, // wait for 1 minute
                    waitUntil: "networkidle2",
                });
            } catch (err) {
                handleError(err, `error while going to ${nextUrl}`);
            }
        }
        const rawHeaders = await page.evaluate(() => {
            // browser logic
            return headers;
        });
        // writing headers to file.
        const headers = rawHeaders.filter(
            (header) => header !== undefined
        ) as Header[];
        // saving headers of each page to total headers array.
        totalHeaders.push(...headers);
        // saving headers scraped at the moment to file.
        fs.writeFileSync(
            "headers.json",
            JSON.stringify(totalHeaders)
        );
        console.log(`got ${headers.length} projects info from page ${i + 1}`);
        // to not burden server.
    }
    console.log(`scraped ${totalHeaders.length} headers`);
    await browser.close();
}
Code of the scrapeData() method:
export async function scrapeData() {
    const browser = await puppeteer.launch({
        headless: false,
        defaultViewport: null,
    });
    const page = await browser.newPage();
    // changing user agent
    await changeUserAgent(page);
    await page.setCookie({
        name: "name",
        value: "value",
        domain: "www.something.com",
        path: "/",
    });
    let headers: Header[] = [];
    try {
        headers = JSON.parse(
            fs.readFileSync("headers.json", "utf-8")
        );
    } catch (err) {
        throw new Error("error while reading headers file in Posts");
    }
    console.log("total headers to scrape: ", headers.length);
    let posts: Info[] = [];
    for (let i = 0; i < headers.length; i++) {
        // scrape data from post
        try {
            const post: Info | null = await scrapePost(
                headers[i],
                page
            );
            // if post is not available.
            if (post === null) {
                console.log(`post with ${headers[i].url} is not available. skipping it...`);
                continue;
            }
            if (post.url) {
                // scrape data from company page
                const info: postPage = await Info(
                    post.url,
                    page
                );
                posts.push({ ...post, ...info });
            } else {
                posts.push(post);
            }
        } catch (err) {
            handleError(err, `error while scraping post page url:${headers[i].url}`);
        }
        if ((i + 1) % 10 === 0) {
            // saving data in file after every 10 posts.
            fs.writeFileSync("posts.json", JSON.stringify(posts));
            console.log(`Scraped ${i + 1} posts`);
        }
    }
    fs.writeFileSync("posts.json", JSON.stringify(posts));
    console.log(`scraped ${posts.length} posts`);
    await page.close();
    await browser.close();
}
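For reference on the question's premise: await in main() does force the two calls to run one after another, so seeing two browsers at once usually means some promise is being started without being awaited (for example, main() being invoked more than once, or work inside one of the functions being kicked off without await). A minimal, self-contained sketch of the difference, independent of the real scraping code:

function step(name, ms) {
    return new Promise((resolve) => {
        console.log(`${name} started`);
        setTimeout(() => {
            console.log(`${name} finished`);
            resolve(name);
        }, ms);
    });
}

async function sequential() {
    await step("scrapeHeaders", 500); // finishes before the next line runs
    await step("scrapeData", 500);
}

async function parallel() {
    const a = step("scrapeHeaders", 500); // not awaited yet: both start immediately
    const b = step("scrapeData", 500);
    await Promise.all([a, b]);
}

sequential().then(() => parallel());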

Loop through pages and return product links in single array

I have code that I obtained with some help, and it can be found at this link HERE. It works well in principle, but the problem is that it returns the links as a separate array for each page. I want it to return the links for all pages together in a single array. I think I need to create an array like var all_links = [], push all the links from each page into it with all_links.push(...links), and then return it with return all_links. The problem is that I don't know exactly how to do that in this case. I am a pure beginner with no prior knowledge of coding.
const puppeteer = require('puppeteer');

const minDiscount = 20;

async function getLinks() {
    const browser = await puppeteer.launch({
        headless: false,
        defaultViewport: null,
    });
    const page = await browser.newPage();
    const url = 'https://www.mytoys.de/spielzeug-spiele/holz/';
    await page.goto(url);
    // getting all the products, this will return an array of ElementHandle
    while (await page.$(".pager__link--next")) {
        await page.waitForSelector(".pager__link--next");
        await page.waitForTimeout(1000);
        await page.click('.pager__link--next');
        await page.waitForTimeout(1500);
        const products = await page.$$('.prod-grid.js-prod-grid .prod-grid__item.js-prod-grid_item');
        const proms = await Promise.allSettled(
            products.map(async (prod) => {
                // searching for a discount on each product
                const disc = await prod.$$eval(
                    '.prod-grid.js-prod-grid .prod-flag.prod-flag-sale',
                    (discount) =>
                        discount.map((discItem) =>
                            discItem.innerText.replace(/[^0-9.]/g, '').replace(/\D+/g, '0')
                        )
                );
                // if it has a discount
                if (disc.length > 0) {
                    // we parse the discount to Integer type to compare it to minDiscount
                    const discountInt = parseInt(disc[0], 10);
                    if (discountInt >= minDiscount) {
                        // we get the link of the product
                        const link = await prod.$$eval('.prod-grid.js-prod-grid .prod-tile__link.js-prodlink', (allAs) => allAs.map((a) => a.href));
                        if (link.length > 0) {
                            // push an object containing the discount and the link of the product
                            return link[0];
                        }
                    }
                }
                return null;
            })
        );
        const bulkArray = proms.map((item) => {
            if (item.status === 'fulfilled') return item.value;
        });
        const endArray = bulkArray.filter(item => item !== null);
        console.log(endArray);
    }
}

getLinks();
Please help and be merciful as I am just starting to learn the basics
Here is a cleanup to get you started:
const products = await page.$$eval('.prod-tile', divs => divs.map(div => {
    return {
        url: div.querySelector('a')?.href,
        discount: div.querySelector('.prod-flag-sale')?.innerText,
    };
}));
At this point just follow the next links and do the same thing for each page.
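A minimal sketch of how the single-array accumulation could look, building on that cleanup (the selectors and the minDiscount threshold are carried over from the question; treat it as an untested starting point rather than a finished implementation):

const puppeteer = require('puppeteer');

const minDiscount = 20;

async function getLinks() {
    const browser = await puppeteer.launch({ headless: false, defaultViewport: null });
    const page = await browser.newPage();
    await page.goto('https://www.mytoys.de/spielzeug-spiele/holz/');

    const allLinks = []; // collects links from every page
    while (true) {
        // scrape the current page
        const products = await page.$$eval('.prod-tile', divs => divs.map(div => ({
            url: div.querySelector('a')?.href,
            discount: parseInt(div.querySelector('.prod-flag-sale')?.innerText.replace(/[^0-9]/g, '') || '0', 10),
        })));
        // keep only the links of sufficiently discounted products
        allLinks.push(...products.filter(p => p.url && p.discount >= minDiscount).map(p => p.url));

        // go to the next page, or stop if there is none
        const next = await page.$('.pager__link--next');
        if (!next) break;
        await next.click();
        await page.waitForTimeout(1500);
    }
    await browser.close();
    return allLinks; // one array with the links from all pages
}

getLinks().then(links => console.log(links));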

Is this Transaction valid?

I am wondering if this transaction is even valid and actually ensures that the quantity is the most up to date.
async function deductQuantity(orders: [Order]): Promise<boolean> {
    try {
        let document = await admin.firestore().collection("MenuItems")
        orders.forEach(async (order) => {
            let itemDoc = (await document.where(`item.catalogType`, "==", order.catalogType).where(`item.id`, "==", order.item.id))
            let get = await itemDoc.get()
            get.forEach(async a => {
                const pp = document.doc(a.id)
                await admin.firestore().runTransaction(async (t) => {
                    const mostRecentDoc = await t.get(pp)
                    const data = await mostRecentDoc.data()
                    if (data == undefined) {
                        return
                    }
                    const newQuantity = data.item.quantity - order.quantity
                    await t.update(pp, { [`item.quantity`]: newQuantity })
                })
            })
        })
        return true
    } catch (error) {
        console.log("dum: " + error)
        return false
    }
}
The part where I do let get = await itemDoc.get() and then get.forEach is kind of unnecessary, because I know it will only return one document matching the query fields, but I need to forEach it in order to get the child components. Anyway, is it a valid transaction?
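One thing worth spelling out: forEach does not wait for async callbacks, so as written deductQuantity can return true before any transaction has run, and a failure inside a callback is not caught by the surrounding try/catch. A minimal sketch of the same flow with the asynchronous work actually awaited (type annotations dropped, field names taken from the question; treat it as an outline of the control flow, not a verdict on whether the transaction itself is sufficient):

async function deductQuantity(orders) {
    try {
        const menuItems = admin.firestore().collection("MenuItems");
        // Await the per-order work so the function only resolves after every transaction finishes.
        for (const order of orders) {
            const matches = await menuItems
                .where("item.catalogType", "==", order.catalogType)
                .where("item.id", "==", order.item.id)
                .get();
            for (const docSnap of matches.docs) {
                const ref = menuItems.doc(docSnap.id);
                await admin.firestore().runTransaction(async (t) => {
                    const mostRecent = await t.get(ref);
                    const data = mostRecent.data();
                    if (data === undefined) return;
                    t.update(ref, { "item.quantity": data.item.quantity - order.quantity });
                });
            }
        }
        return true;
    } catch (error) {
        console.log("dum: " + error);
        return false;
    }
}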
