I need to create a scraping tool using Puppeteer; however, I have some issues adding items to the queue.
Here is what I have so far:
const PromisePool = require("#supercharge/promise-pool");
const puppeteer = require("puppeteer");
const domain = process.argv[2];
let list = [];
list[0] = domain;
// Processes every element of `list` through a PromisePool with a concurrency of 2.
// For each task it collects the href of every <a> on the page and records the ones
// whose URL contains `domain` on `list`.
// NOTE(review): `results` and `errors` are destructured but never returned, so the
// promise returned by run() resolves to undefined.
const run = async () => {
const { results, errors } = await PromisePool.for(list)
.withConcurrency(2)
.process(async (webpage) => {
// NOTE(review): `webpage` is not used below, and `page` / `browser` are not defined
// anywhere in this snippet — presumably a Puppeteer browser and page should be
// opened for `webpage` here; confirm.
// `links` is assigned without a declaration (implicit global).
links = [];
// Runs inside the page context: wraps each anchor's href in a one-element array.
const getData = async () => {
return await page.evaluate(async () => {
return await new Promise((resolve) => {
resolve(Array.from(document.querySelectorAll("a")).map((anchor) => [anchor.href]));
});
});
};
links = await getData();
// for...in walks the index keys of `links`; String() turns each one-element array
// into its href string.
for (var link in links) {
var new_url = String(links[link]);
// Drop the fragment so the same page with different anchors counts once.
new_url = new_url.split("#")[0];
console.log("new url: " + new_url);
if (new_url.includes(domain)) {
// `in` tests property keys of the array, which matches the string-keyed
// assignment below but not the index-stored first entry (list[0]).
if (new_url in list) {
console.log("Url already exists: " + new_url);
continue;
}
// NOTE(review): this adds a string-keyed property to the array; it does not
// append an element, so list.length is unchanged.
list[new_url] = new_url;
} else {
console.log("Url is external: " + new_url);
}
}
// NOTE(review): close() is not awaited.
browser.close();
});
};
const mainFunction = async () => {
const result = await run();
return result;
};
(async () => {
console.log(await mainFunction());
console.log(list);
})();
The problem is inside
links = [];
const getData = async () => {
return await page.evaluate(async () => {
return await new Promise((resolve) => {
resolve(Array.from(document.querySelectorAll("a")).map((anchor) => [anchor.href]));
});
});
};
links = await getData();
page.evaluate is async and does not seem to wait for its return value, so links is never updated for the next PromisePool process.
I need a way to wait for response to return and then continue rest of the script to process.
You could use page.$$eval to retrieve the same links with a single await.
page.$$eval(selector, pageFunction[, ...args])
It is basically what you are trying to achieve as the $$eval method "runs Array.from(document.querySelectorAll(selector)) within the page [context] and passes it as the first argument to pageFunction." (docs)
E.g.:
const links = await page.$$eval('a', anchors => anchors.map(el => el.href));
Related
The Problem is with the uplines.push.
I always get an empty uplines array so the last part of the code doesn't run. The promises resolve later and I get the correct data. May I know how to go about doing it the correct way?
// Intended to collect the user document for every entry of sale.rens plus each
// user's chain of uplines (followed through immediateUplineFid) and return them.
// As written, it returns before any lookup has finished — see the notes below.
const getAllUplines = async () => {
// Assigned without a declaration inside this function, so this writes to a
// variable from an outer scope (or an implicit global).
uplines = [];
// Loads users/{userFid}; when the document exists, appends it (with its id) to
// `uplines` and continues with the user's immediate upline.
// Resolves to null when the document does not exist, otherwise to undefined.
const findUser = async (userFid) => {
const userDoc = await firestore.collection("users").doc(userFid).get();
if (userDoc.exists) {
const user = { ...userDoc.data(), id: userDoc.id };
console.log(user);
uplines.push(user);
if (user.immediateUplineFid) {
// The recursive call is not awaited, so the caller does not wait for the upline chain.
findUser(user.immediateUplineFid); //self looping
}
} else {
console.log("No User Found");
return null;
}
};
// forEach ignores the promises returned by its async callback, and findUser is not
// awaited inside it, so execution continues immediately.
sale.rens.forEach(async (ren) => {
findUser(ren.userFid);
});
// Runs before any findUser call has got past its first await, so `uplines` is still empty here.
console.log(uplines);
return uplines;
};
let uplines = await getAllUplines();
console.log(uplines);
uplines = uplines.filter(
(v, i) => uplines.findIndex((index) => index === v) === i
); //remove duplicates
uplines.forEach((user) => {
if (user.chatId) {
sendTelegramMessage(user.chatId, saleToDisplay, currentUser.displayName);
console.log("Telegram Message Sent to " + user.displayName);
} else {
console.log(user.displayName + " has no chatId");
}
});
There are a few things that you have missed out while implementing the async call, which are explained in the inline comments in the code snippet.
A short explanation for what happened in your code is that in the line sale.rens.forEach you are passing an async function in the argument, which does not make any difference to the function forEach, it will execute it without waiting for it to complete.
Therefore in my answer I am using Promise.all to wait for all the async function calls to complete before returning the result.
// Wrapped in an immediately invoked async function because top-level await is not supported here
(async () => {
  // Stand-in for a real asynchronous lookup: resolves (with no value) after one second
  const mockGetData = () =>
    new Promise((done) => {
      setTimeout(done, 1000);
    });

  const sale = {
    rens: [{ userFid: 1 }, { userFid: 2 }, { userFid: 3 }],
  };

  const getAllUplines = async () => {
    const uplines = [];

    const findUser = async (userFid) => {
      // Simulated asynchronous call; its result is not needed for the demo
      await mockGetData();
      console.log("User data received");
      uplines.push(`User ${userFid}`);
    };

    // findUser is async and therefore returns a promise. The mapping callback does not
    // need to be async itself; it only has to hand each promise back so they can all
    // be awaited together below.
    const pending = sale.rens.map((ren) => findUser(ren.userFid));
    await Promise.all(pending);

    return uplines;
  };

  let uplines = await getAllUplines();
  console.log(uplines);
})();
In order to get the results of getAllUplines() properly, you need to add await to all async functions called in getAllUplines().
/**
 * Collects the user document for every entry of `sale.rens`, together with each
 * user's chain of uplines (followed through `immediateUplineFid`).
 *
 * Every lookup is awaited before the function returns: the recursive call is
 * awaited inside `findUser`, and the top-level lookups are gathered with
 * `Promise.all`. `Array.prototype.forEach` cannot be used for this because it
 * ignores the promises returned by an async callback.
 *
 * Reads `firestore` and `sale` from the enclosing scope.
 *
 * @returns {Promise<Array<object>>} the users found, each with its document `id`;
 *   duplicates are not removed.
 */
const getAllUplines = async () => {
  // Local to this call, so it can neither leak into nor clash with an outer `uplines`.
  const uplines = [];

  // Loads users/{userFid}; resolves to null when the document does not exist.
  const findUser = async (userFid) => {
    const userDoc = await firestore.collection("users").doc(userFid).get();
    if (!userDoc.exists) {
      console.log("No User Found");
      return null;
    }
    const user = { ...userDoc.data(), id: userDoc.id };
    console.log(user);
    uplines.push(user);
    if (user.immediateUplineFid) {
      // Awaited so the caller only continues once the whole upline chain is collected.
      await findUser(user.immediateUplineFid);
    }
  };

  // map() keeps the promise of every lookup; Promise.all waits for all of them.
  await Promise.all(sale.rens.map((ren) => findUser(ren.userFid)));

  console.log(uplines);
  return uplines;
};
I have a Firebase Realtime Database function running. The problem is that the forEach loop executes after the return statement, return ref.child("/leaderboard").set(updates);. I know I have to do something with Promise(), but I am not sure how. Any ideas?
const functions = require('firebase-functions');
const admin = require('firebase-admin');
admin.initializeApp();
// Checks the weekly scores and creates a leaderboard entry of the top 3
// Triggered on every write to /challenges/weekly/{weeklyId}/scores/{userId}.
exports.insertLeaderboard = functions.database.ref('/challenges/weekly/{weeklyId}/scores/{userId}')
.onWrite(async (change) => {
// Two levels above the written node, i.e. the {weeklyId} node that holds "scores".
const ref = change.after.ref.parent.parent; // reference to the parent
// The last three children of "scores" when ordered by their 'score' child.
const leaderboardItems = ref.child("scores").orderByChild('score').limitToLast(3);
const snapshot = await leaderboardItems.once('value');
// Value of the three score entries, keyed by child key; each gets a "name" added below.
var updates = snapshot.val();
// NOTE(review): forEach does not wait for the async callback, so the final write
// below can run before any player profile has been read.
snapshot.forEach(async element => {
const playerRef = admin.database().ref("players/" + element.key + "/playerProfile");
await playerRef.once('value', (result) => {
// NOTE(review): `result.exists` is read, not called; if `exists` is a method (as on
// a Firebase DataSnapshot) this condition is always truthy — confirm.
if (result.exists) {
console.log("Found Element:" + result.key);
updates[element.key]["name"] = result.child("DisplayName").val();
} else {
console.log("NOT Found Element:" + element.key);
}
});
});
console.log("Doing Final Write");
// Stores the collected entries under the "leaderboard" child of `ref`.
return ref.child("/leaderboard").set(updates);
});
Yes I solved this by following the steps in this answer by Bergi
/**
 * Logs the UTF-8 contents of every path returned by getFilePaths().
 * All reads are started at once and the function resolves after every file
 * has been read and logged.
 */
async function printFiles () {
  // Reads one file and logs its contents.
  const printOne = async (path) => {
    const text = await fs.readFile(path, 'utf8');
    console.log(text);
  };

  const paths = await getFilePaths();
  const pending = paths.map((path) => printOne(path));
  await Promise.all(pending);
}
Like this!
var data_snap_arr = [];
snapshot.forEach(child_Snapshot => {
var stuff = child_Snapshot.val();
stuff.key = child_Snapshot.key;
data_snap_arr.push(stuff);
});
await Promise.all(data_snap_arr.map(handleSnapshot)).then(result => {
const finalData = {};
for (var i = 0; i < result.length; i++) {
// process data here - data returned as array object
}
return ref.child("/leaderboard").set(finalData);
});
I have the following function
// Intended to return an object mapping each key of createIdentifiers(symbol, false)
// to the parsed closing price of the corresponding ticker.
// NOTE(review): none of the promises below is returned or awaited, so `quotes` is
// returned while it is still empty and is only filled in later by the callbacks.
const getQuotes = symbol => {
let quotes = {};
// The constructed client is used as a thenable that resolves to the API client.
new DeltaRestClient(api_key, api_secret).then(client => {
const linkOptions = createIdentifiers(symbol, false);
// entry[0] is the key under which the quote is stored; entry[1] is the ticker symbol requested.
Object.entries(linkOptions).forEach(entry => {
client.apis.Products.getTicker({ symbol: entry[1] }).then(response => {
// response.data is converted to a string and parsed as JSON; the quote is result.close.
const ticker = JSON.parse(response.data.toString());
quotes[entry[0]] = parseFloat(ticker.result.close);
});
});
});
return quotes;
};
which I call in
const start = async () => {
const quotes = await getQuotes("ABCD");
console.log(quotes);
};
But because of the asynchronicity, getQuotes returns before all the .then are resolved and an empty object is returned.
How can I change this such that the value is only returned if all .then are resolved?
You have to await in the function you are calling too:
/**
 * Fetches the closing price for every identifier derived from `symbol`.
 *
 * The tickers are requested one after another; each response body is parsed as
 * JSON and its `result.close` is stored as a float under the identifier's key.
 *
 * @param symbol base symbol passed to createIdentifiers(symbol, false)
 * @returns {Promise<object>} map of identifier key to closing price
 */
const getQuotes = async symbol => {
  const client = await new DeltaRestClient(api_key, api_secret);
  const identifiers = createIdentifiers(symbol, false);
  const quotes = {};

  for (const [key, tickerSymbol] of Object.entries(identifiers)) {
    const { data } = await client.apis.Products.getTicker({ symbol: tickerSymbol });
    const { result } = JSON.parse(data.toString());
    quotes[key] = parseFloat(result.close);
  }

  return quotes;
};
And call it accordingly:
const start = async () => {
const quotes = await getQuotes("ABCD");
console.log(quotes);
};
Generally, mixing async/await with Promise.then/.catch leads to confusing code that is easily misunderstood.
And if you like, you can increase readability by destructuring the entry element:
for (const [key, symbol] of Object.entries(linkOptions)) {
const response = await client.apis.Products.getTicker({ symbol, });
const ticker = JSON.parse(response.data.toString());
quotes[key] = parseFloat(ticker.result.close);
}
I am trying to load data from firebase by calling a function in which it filters data and returns them.
When I call this function in my main function, it returns "undefined". I know the data is there (console.log(postsArray)) prints the data but I guess the return executes before data is loaded.
What am I doing wrong?
calling_Function_in_Main = async () => {
const data = await FirebaseData ();
console.log(data);
};
FirebaseData is the function that I call in my main function to load data and to return them
let postsArrays=[];
const FirebaseData = async () => {
const getViewableLink = async (link) => { //some function };
const loadData = async () => {
const database = firebase.database();
const data = database.ref();
const loadProfile = data
.child('Posts')
.orderByChild('Active')
.equalTo(true)
.once('value', function gotData(data) {
Object.values(readInfo).forEach(async (element) => {
element.Option1Link = await getViewableLink(
preLink + element.Option1Link,
);
postsArray.push(element);
}
});
})
.catch((error) => {
console.log(error);
}
})
.then((postsArray) => {
console.log(postsArray);
return postsArray;
});
};
await loadData();
};
export default FirebaseSwipeData;
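You can't use forEach with async/await because forEach does not wait for the promises returned by its callback: it calls the callback for every element right away and ignores the results, so the code after the loop runs before the awaited work has finished.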
you have 2 ways to fix this:
1- Reading in sequence: you can use for...of loop
for(const element of Object.values(readInfo)) {
element.Option1Link = await getViewableLink(
preLink + element.Option1Link,
);
postsArray.push(element);
}
2- Reading in parallel: you can use Promise.all
await Promise.all(Object.values(readInfo).map(async (element) => {
element.Option1Link = await getViewableLink(
preLink + element.Option1Link,
);
postsArray.push(element);
}));
Hope that solves the problem for you.
I am trying to run the node js Lighthouse function serially (one at a time) with an array of URLs. My problem is that whenever I loop through the array, Lighthouse runs all the URLs at once, which I imagine is problematic if you have a very large array of URLs.
The code:
for(let url of urls) {
function launchChromeAndRunLighthouse(url, opts, config = null) {
return chromeLauncher.launch({chromeFlags: opts.chromeFlags}).then(chrome => {
opts.port = chrome.port;
return lighthouse(url, opts, config).then(results => {
return chrome.kill().then(() => results.lhr)
});
});
}
}
launchChromeAndRunLighthouse('https://example.com', opts).then(results => {
// Use results!
});
Please help! And thank you for your time!
Your answer is correct but it can be improved. Since you have access to async and await, you should fully utilize it to make your code cleaner:
/**
 * Launches Chrome, runs a Lighthouse audit of `url` against it and returns the
 * Lighthouse result object (`lhr`).
 *
 * Chrome is killed in a `finally` block, so the browser process is also shut
 * down when the audit rejects instead of being left running.
 *
 * @param {string} url page to audit
 * @param {object} opts Lighthouse options; `opts.chromeFlags` is passed to
 *   chrome-launcher and `opts.port` is set to the launched Chrome's port
 * @param {object|null} [config=null] Lighthouse config
 * @returns {Promise<object>} the `lhr` of the audit
 */
async function launchChromeAndRunLighthouse (url, opts, config = null) {
  const chrome = await chromeLauncher.launch({chromeFlags: opts.chromeFlags});
  try {
    // Lighthouse connects to the launched instance through this port.
    opts.port = chrome.port;
    const { lhr } = await lighthouse(url, opts, config);
    return lhr;
  } finally {
    await chrome.kill();
  }
}
async function launchAudit (urls) {
for (const url of urls) {
const results = await launchChromeAndRunLighthouse(url, opts);
// Use results!
};
}
launchAudit(urls);
I believe I figured it out. What I did is below. Please continue to send feedback if you think this is wrong.
/**
 * Launches Chrome, runs a Lighthouse audit of `url` against it and resolves
 * with the Lighthouse result object (`lhr`).
 *
 * Chrome is killed in `.finally`, so the browser process is shut down whether
 * the audit succeeds or rejects; the returned promise settles only after
 * `chrome.kill()` has finished.
 *
 * @param {string} url page to audit
 * @param {object} opts Lighthouse options; `opts.chromeFlags` is passed to
 *   chrome-launcher and `opts.port` is set to the launched Chrome's port
 * @param {object|null} [config=null] Lighthouse config
 * @returns {Promise<object>} the `lhr` of the audit
 */
function launchChromeAndRunLighthouse(url, opts, config = null) {
  return chromeLauncher.launch({chromeFlags: opts.chromeFlags}).then(chrome => {
    // Lighthouse connects to the launched instance through this port.
    opts.port = chrome.port;
    return lighthouse(url, opts, config)
      .then(results => results.lhr)
      .finally(() => chrome.kill());
  });
};
async function launchAudit(urls) {
for (let url of urls) {
await launchChromeAndRunLighthouse(url, opts).then(results => {
// Use results!
});
};
};
launchAudit(urls);
A variation on Patrick Roberts' answer (which should be the accepted answer).
I was wondering if it was necessary to kill chrome every iteration.
const lighthouse = require('lighthouse');
const chromeLauncher = require('chrome-launcher');
/**
 * Launches a single Chrome instance and audits every site in `sites` with
 * Lighthouse, one after another, reusing that instance for all of them.
 *
 * Chrome is killed in a `finally` block once all audits are done or as soon as
 * one of them fails. A failing audit rejects the returned promise instead of
 * leaving it pending forever.
 *
 * @param {Iterable<string>} sites URLs to audit, in order
 * @param {object} opts Lighthouse options; `opts.chromeFlags` is passed to
 *   chrome-launcher and `opts.port` is set to the launched Chrome's port
 * @param {object|null} [config=null] Lighthouse config
 * @returns {Promise<Array<object>>} the `lhr` of each audit, in the order of `sites`
 */
async function launchChromeAndRunLighthouse(sites, opts, config = null) {
  const chrome = await chromeLauncher.launch({chromeFlags: opts.chromeFlags});
  try {
    // Lighthouse connects to the launched instance through this port.
    opts.port = chrome.port;
    const siteResults = [];
    // Awaiting inside for...of runs the audits strictly one at a time.
    for (const site of sites) {
      const results = await lighthouse(site, opts, config);
      siteResults.push(results.lhr);
    }
    return siteResults;
  } finally {
    await chrome.kill();
  }
}
const opts = {
chromeFlags: ['--show-paint-rects'],
onlyCategories: ['performance']
};
const sites = ['https://www.example.com', 'https://www.test.com']
launchChromeAndRunLighthouse(sites, opts).then(results => {
// Use results!
console.log(results);
});
To run your code as a test, we'll use async/await inside an IIFE.
Then we'll create a function that collects all our requests in an array of unresolved promises, so that we can pass it to Promise.all().
You need to rewrite the code to something like this:
// Runs a Lighthouse audit for every URL at the same time and logs each result.
// `urls`, `opts`, `chromeLauncher` and `lighthouse` are expected to be defined by the
// surrounding script, as in the question.
(async () => {
  // Launches a dedicated Chrome for one URL, audits it and resolves with the `lhr`.
  const launchChromeAndRunLighthouse = async (url, opts, config = null) => {
    const chrome = await chromeLauncher.launch({chromeFlags: opts.chromeFlags});
    try {
      // Each run gets its own copy of the options: the audits run concurrently, so
      // writing the port onto the shared `opts` object would let one run overwrite
      // another run's port.
      const results = await lighthouse(url, { ...opts, port: chrome.port }, config);
      return results.lhr;
    } finally {
      // kill() belongs to the launched Chrome, not to the Lighthouse result.
      await chrome.kill();
    }
  };

  // Start every audit immediately and keep the unresolved promises ...
  const promisesToExecute = urls.map((url) => launchChromeAndRunLighthouse(url, opts));
  // ... then wait for all of them; the results come back in the order of `urls`.
  const results = await Promise.all(promisesToExecute);
  for (const lhr of results) {
    // here you can access each result's lhr
    console.log(lhr);
  }
})();
Please note, this code wasn't tested, so there might be problems with kill() on result. But, the main goal is to answer your question and explain how to execute promises.
Also, if you don't want to execute all promises at the same time, you could use Promise.waterfall with some npm package, like this