const puppeteer = require('puppeteer');
// Question exhibit: logs in and fills out a schedule form in a visible browser window.
// HOME_PAGE, SCHEDULE_PAGE, WAIT_SEC, the credential/date constants and the `$…`
// selector variables are not declared in this snippet — presumably defined elsewhere.
const init = async () => {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
// login
// Loads the home page, types the three credentials, submits, waits, then opens the schedule page.
let login = async () => {
console.log('login init');
await page.goto(HOME_PAGE);
await page.type($clientID, CLIENT_ID);
await page.type($userName, USER_NAME);
await page.type($password, PASSWORD);
await page.click($submitBtn);
await page.waitFor(WAIT_SEC);
await page.goto(SCHEDULE_PAGE);
console.log('login end');
}
// look for schedule
// Waits 3000 ms, triple-clicks the start-date input, presses Backspace and types START_DATE.
let setStartDate = async () => {
console.log('start init');
await page.waitFor(3000);
// NOTE(review): this click uses a hard-coded selector while the type() below uses $startDate —
// presumably the same element; confirm.
await page.click('#selfsched_startDate_dtInput', { clickCount: 3 });
await page.keyboard.press('Backspace');
await page.type($startDate, START_DATE);
console.log('start end');
}
// Same replace-the-value sequence for the end date, followed by Enter.
let setEndDate = async () => {
console.log('end init');
await page.click($endDate, { clickCount: 3 });
await page.keyboard.press('Backspace');
await page.type($endDate, END_DATE);
await page.keyboard.press('Enter');
console.log('end end');
}
// Waits WAIT_SEC and clicks the confirm button.
let confirmSchedule = async () => {
console.log('confirm init');
await page.waitFor(WAIT_SEC);
await page.click($confirmBtn);
console.log('confirm end');
}
// NOTE: every entry below is a call, so all four steps start immediately and run
// concurrently against the same `page`; the array holds four already-running promises.
let steps = [
login(),
setStartDate(),
setEndDate(),
confirmSchedule()
];
// Promise.all only waits for the promises to settle; it does not run them one after another.
await Promise.all(steps);
console.log('im finishing');
// The result of close() is not awaited here.
browser.close();
}
init()
.then(values => {
console.log('success');
})
// NOTE(review): the rejection is discarded, so any failure inside init() produces no output.
.catch(err => {
});
Whenever my code gets to the setStartDate function, nothing happens. I've added console.log messages, but they're not coming out in sequential order as I thought they would. I thought Promise.all() waits for everything in order... Also, my knowledge of async / promises / await is not the greatest :) Thanks for the help.
Order of console logs I'm getting:
login init
start init
end init
confirm init
login end
I thought Promise.all() waits for everything in order
This is basically the opposite of what Promise.all does:
There is no implied ordering in the execution of the array of Promises given. On some computers, they may be executed in parallel, or in some sense concurrently, while on others they may be executed serially. For this reason, there must be no dependency in any Promise on the order of execution of the Promises.
https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Promise/all
You should just await your functions in order:
// Run the steps one at a time: each call is made only after the previous step has finished.
await login();
await setStartDate();
await setEndDate();
await confirmSchedule();
Related
I am trying to scrape multiple URL one by one, then repeat the scrape after one minute.
But I keep getting two errors and was hoping for some help.
I got an error saying:
functions declared within loops referencing an outer scoped variable may lead to confusing semantics
And I get this error when I run the function / code:
TimeoutError: Navigation timeout of 30000 ms exceeded.
My code:
const puppeteer = require("puppeteer");
const urls = [
'https://www.youtube.com/watch?v=cw9FIeHbdB8',
'https://www.youtube.com/watch?v=imy1px59abE',
'https://www.youtube.com/watch?v=dQw4w9WgXcQ'
];
// Question exhibit: launches a headless browser, visits every entry of `urls` in one page,
// logs four pieces of text per page, closes the browser and reschedules itself.
const scrape = async() => {
let browser, page;
try {
browser = await puppeteer.launch({ headless: true });
page = await browser.newPage();
for (let i = 0; i < urls.length; i++) {
const url = urls[i];
await page.goto(`${url}`);
// NOTE(review): goto() above has already been awaited when this wait starts; if no further
// navigation happens, this is the likely source of the "Navigation timeout of 30000 ms
// exceeded" error quoted in the question — confirm.
await page.waitForNavigation({ waitUntil: 'networkidle2' });
await page.waitForSelector('.view-count', { visible: true, timeout: 60000 });
const data = await page.evaluate(() => { // functions declared within loops referencing an outer scoped on this line.
return [
JSON.stringify(document.querySelector('#text > a').innerText),
JSON.stringify(document.querySelector('#container > h1').innerText),
JSON.stringify(document.querySelector('.view-count').innerText),
JSON.stringify(document.querySelector('#owner-sub-count').innerText)
];
});
// Each element was stringified inside evaluate() and is parsed back here.
const [channel, title, views, subs] = [JSON.parse(data[0]), JSON.parse(data[1]), JSON.parse(data[2]), JSON.parse(data[3])];
console.log({ channel, title, views, subs });
}
} catch(err) {
// Any failure is logged; the finally block below still runs.
console.log(err);
} finally {
if (browser) {
await browser.close();
}
// setTimeout returns a timer handle rather than a promise, so this `await` does not pause;
// the call only schedules another scrape() run in 60 s.
await setTimeout(scrape, 60000); // repeat after one minute after all urls have been scrape.
}
};
scrape();
I would really appreciate any help I could get.
I'd suggest a design like this:
const puppeteer = require("puppeteer");
/**
 * Pause an async function for roughly `ms` milliseconds.
 *
 * The delay must be the second argument of `setTimeout`; passing it to the
 * `Promise` constructor (as before) is silently ignored and resolves at once.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
/**
 * Open `url` in a fresh page and collect the `innerText` of each selector.
 *
 * @param {object} browser - Puppeteer browser used to open the page.
 * @param {string} url - Address to navigate to.
 * @param {Array<{name: string, sel: string}>} textSelectors - Result key and CSS selector pairs.
 * @returns {Promise<Object<string, string>>} Map of `name` to the matched element's text.
 * @throws Rejects when navigation fails or any selector does not appear in time.
 */
const scrapeTextSelectors = async (browser, url, textSelectors) => {
  let page;

  try {
    page = await browser.newPage();
    page.setDefaultNavigationTimeout(50 * 1000);

    // Await the navigation so a failure rejects this call instead of becoming an
    // unhandled rejection. "domcontentloaded" returns as soon as the document is
    // parsed; the per-selector waits below cover anything rendered later.
    await page.goto(url, {waitUntil: "domcontentloaded"});

    // Wait for all selectors concurrently; each entry resolves to a [name, text] pair.
    const dataPromises = textSelectors.map(async ({name, sel}) => {
      await page.waitForSelector(sel);
      return [name, await page.$eval(sel, (el) => el.innerText)];
    });

    return Object.fromEntries(await Promise.all(dataPromises));
  } finally {
    // Await the close so the page is really gone before the caller continues.
    await page?.close();
  }
};
// Pages to scrape on every pass of the main loop.
const urls = [
"https://www.youtube.com/watch?v=cw9FIeHbdB8",
"https://www.youtube.com/watch?v=imy1px59abE",
"https://www.youtube.com/watch?v=dQw4w9WgXcQ",
];
// One entry per value to extract: `name` becomes the key in the result object,
// `sel` is the CSS selector whose innerText is read.
const textSelectors = [
{name: "channel", sel: "#text > a"},
{name: "title", sel: "#container > h1"},
{name: "views", sel: ".view-count"},
{name: "subs", sel: "#owner-sub-count"},
];
// Declared outside the async function so the cleanup handler at the bottom can reach it.
let browser;

(async function scrapeForever() {
  browser = await puppeteer.launch({headless: true});

  // Scrape all URLs concurrently, print the settled results, pause a minute, repeat.
  while (true) {
    const pending = urls.map((url) => scrapeTextSelectors(browser, url, textSelectors));
    const data = await Promise.allSettled(pending);
    console.log(data);
    await sleep(60 * 1000);
  }
})()
  .catch((err) => console.error(err))
  .finally(() => browser?.close());
A few remarks:
This runs in parallel on the 3 URLs using Promise.allSettled. If you have more URLs, you'll want a task queue, or to process the URLs sequentially with a for...of loop, so you don't outstrip the system's resources. See this answer for elaboration.
I use waitForSelector on each and every selector rather than just '.view-count' so you won't miss anything.
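page.setDefaultNavigationTimeout(50 * 1000); gives you an adjustable 50-second timeout on navigation operations such as page.goto. It does not affect waitForSelector; use page.setDefaultTimeout if you want to change the timeout for those waits as well.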
Moving the loops to sleep and step over the URLs into the caller gives cleaner, more flexible code. Generally, if a function can operate on a single element rather than a collection, it should.
Error handling is improved; Promise.allSettled lets the caller control what to do if any requests fail. You might want to filter and/or map the data response to remove the statuses: data.map(({value}) => value).
Generally, return instead of console.log data to keep functions flexible. The caller can console.log in the format they desire, if they desire.
There's no need to do anything special in page.goto(url) because we're awaiting selectors on the very next line. "networkidle2" just slows things down, waiting for network requests that might not impact the selectors we're interested in.
JSON.stringify/JSON.parse is already called by Puppeteer on the return value of evaluate so you can skip it in most cases.
Generally, don't do anything but cleanup in finally blocks. await setTimeout(scrape, 60000) is misplaced.
This works. Putting the for loop in a Promise and passing waitUntil: "networkidle2" as an option to page.goto() resolves your problem. You don't need to launch a new browser each time, so it should be created outside of the for loop.
const puppeteer = require("puppeteer");
const urls = [
"https://www.youtube.com/watch?v=cw9FIeHbdB8",
"https://www.youtube.com/watch?v=imy1px59abE",
"https://www.youtube.com/watch?v=dQw4w9WgXcQ",
];
/**
 * Visit every entry of `urls` one after another in a single page and log the
 * channel, title, view count and subscriber count found on each.
 *
 * The returned promise settles only after all pages were processed and the
 * browser was closed. Any failure is logged (as before) and the browser is
 * closed on both success and failure.
 *
 * @returns {Promise<void>}
 */
const scrape = async () => {
  const browser = await puppeteer.launch({ headless: true });

  try {
    const page = await browser.newPage();

    // `const` keeps the loop variable local; the bare `url` used before is an
    // implicit global and a ReferenceError in strict mode / ES modules.
    for (const url of urls) {
      // your timeout
      // NOTE(review): page.waitForTimeout was removed in recent Puppeteer releases — confirm the installed version.
      await page.waitForTimeout(6 * 1000);
      await page.goto(url, {
        waitUntil: "networkidle2",
        timeout: 60 * 1000,
      });
      // waitUntil is a navigation option and is not meaningful here, so only the timeout is passed.
      await page.waitForSelector(".view-count", { timeout: 60 * 1000 });

      // Puppeteer serializes the returned object itself; no manual JSON round trip is needed.
      const data = await page.evaluate(() => ({
        channel: document.querySelector("#text > a").innerText,
        title: document.querySelector("#container > h1").innerText,
        views: document.querySelector(".view-count").innerText,
        subs: document.querySelector("#owner-sub-count").innerText,
      }));
      console.log(data);
    }
  } catch (reason) {
    console.log(reason);
  } finally {
    await browser.close();
  }
};
// Report a rejected run (e.g. the browser failing to launch) instead of leaving the promise floating.
scrape().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
# Update
As per ggorlen's suggestion, the refactored code below should solve your problem. The comments in the code explain the purpose of each line.
const puppeteer = require("puppeteer");
const urls = [
"https://www.youtube.com/watch?v=cw9FIeHbdB8",
"https://www.youtube.com/watch?v=imy1px59abE",
"https://www.youtube.com/watch?v=dQw4w9WgXcQ",
];
/**
 * Visit every entry of `urls` in its own page, one after another, logging the
 * scraped data ('response') or the evaluation failure ('error') for each.
 *
 * Navigation and selector failures still reject the returned promise, but the
 * current page and the browser are now always closed first.
 *
 * @returns {Promise<void>}
 */
const scrape = async () => {
  // generate a headless browser instance
  const browser = await puppeteer.launch({ headless: true });

  try {
    // used .entries to get the index and value
    for (const [index, url] of urls.entries()) {
      // generating a new page for each of the content
      const page = await browser.newPage();

      try {
        // your 60 timeout from 2nd index
        // NOTE(review): page.waitForTimeout was removed in recent Puppeteer releases — confirm the installed version.
        if (index > 0) await page.waitForTimeout(60 * 1000);

        // wait for the page response to available with 60 seconds timeout (error throw)
        await page.goto(url, {
          waitUntil: "networkidle2",
          timeout: 60 * 1000,
        });

        // wait for .view-count section to be available
        await page.waitForSelector(".view-count");

        try {
          // don't need json stringify or parse as puppeteer does so
          const data = await page.evaluate(() => ({
            channel: document.querySelector("#text > a").innerText,
            title: document.querySelector("#container > h1").innerText,
            views: document.querySelector(".view-count").innerText,
            subs: document.querySelector("#owner-sub-count").innerText,
          }));
          // your scrapped success data
          console.log('response', data);
        } catch (reason) {
          // your scrapping error reason
          console.log('error', reason);
        }
      } finally {
        // close your current page, also when navigation or the selector wait failed
        await page.close();
      }
    }
  } finally {
    // after looping through (or failing) finally close the browser
    await browser.close();
  }
};
// scrape() rejects when navigation or a selector wait fails; report that instead of leaving the promise floating.
scrape().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
I have a function that looks like this:
// Question exhibit: creates records, runs four report functions together, saves, then creates a CSV.
// `req`, `res` and `done` are accepted but not used in this body.
async function sync(req, res, done){
await createRecords().then(async ()=> {
// The four calls are started together; Promise.all waits for all of them.
await Promise.all(
[
quantityReport(),
listings(),
productIdentifiers(),
productChildren()
])
}).then(async ()=>{
await saveAll()
// `await createCSV` evaluates the identifier itself (it is not called here); the resulting
// value is what gets registered as the final then() callback.
} ).then(await createCSV);
}
module.exports = sync
I am calling it like this inside a switch:
// const saveRecords = require('../scripts/saveRecords.js') <- for reference
// NOTE: `saveRecords` is awaited here without being called (no parentheses), so the
// exported function is never invoked by this statement.
await saveRecords;
My problem is that the program continues before saveRecords finishes and I cannot figure out why.
All of the functions in Promise.all are asynchronous functions.
If I call sync() directly in saveRecords.js it works fine.
Thanks.
edit
createCSV also works fine in other locations in the program. It's imported to this file like this:
const {createCSV, uploadReports} = require('../scripts/createCSV.js')
//in createCSV.js
module.exports = createCSV;
I'd refactor your function as follows (by the way, sync doesn't sound like a great name for your function; pick something more descriptive).
/**
 * Run the full pipeline: create records, build the four reports concurrently,
 * then save everything and write the CSV.
 *
 * Errors propagate unchanged. The previous `catch (err) { throw new Error(err) }`
 * stringified the error and discarded its type and stack. The `if (_res)` guard
 * was dropped because Promise.all always fulfils with an array, which is truthy.
 *
 * @param {*} req - Unused; kept so existing callers keep working.
 * @param {*} res - Unused; kept so existing callers keep working.
 * @param {*} done - Unused; kept so existing callers keep working.
 * @returns {Promise<void>} Settles once the CSV has been created.
 */
async function sync(req, res, done) {
  await createRecords();

  // The reports are started together and awaited as a group.
  await Promise.all([
    quantityReport(),
    listings(),
    productIdentifiers(),
    productChildren(),
  ]);

  await saveAll();
  await createCSV();
}
module.exports = sync
As I mentioned in the comments, mixing async/await with then from the Promise API (see also fetch) is an odd thing to do. Use one or the other. But the key issue is that you're not calling the sync function: you need await sync().
Here's a quick example of simply using async/await.
/**
 * Simulate an asynchronous call that eventually yields `n`.
 *
 * @param {*} n - Value the returned promise fulfils with.
 * @param {number} [delayMs=1000] - How long to wait before fulfilling, in milliseconds.
 * @returns {Promise<*>} Fulfils with `n` after `delayMs`.
 */
function mockCall(n, delayMs = 1000) {
  return new Promise((resolve) => {
    setTimeout(() => resolve(n), delayMs);
  });
}
/**
 * Demo: one call, then four calls in parallel, then two more calls in sequence.
 * Logs the collected results as a single array.
 */
async function sync() {
  const results = [];

  results.push(await mockCall(1));
  // The four calls in the middle are started together and awaited as a group.
  results.push(await Promise.all([2, 3, 4, 5].map((n) => mockCall(n))));
  results.push(await mockCall(6));
  results.push(await mockCall(7));

  console.log(results);
}

// Entry point: nothing after the await runs until sync() has completely finished.
(async () => {
  await sync();
  console.log('Let the processing resume!');
})();
I tested iterations with Puppeteer in a small case. I have already read that a common reason for Puppeteer disconnections is that the Node script doesn't wait for the Puppeteer actions to finish. So I converted all functions in my snippet into async functions, but it didn't help.
If the small case with six iterations works, I will implement it in my current project with about 50 iterations.
'use strict';
const puppeteer = require('puppeteer');
const arrIDs = [8322072, 1016816, 9312604, 1727088, 9312599, 8477729];
// Question exhibit: launches one browser and, for every ID, opens a page, reads the
// table body text, closes the browser and logs the text.
const call = async () => {
await puppeteer.launch().then(async (browser) => {
// forEach does not wait for the promises returned by its async callback, so all
// iterations are started right away and share the same `browser`.
arrIDs.forEach(async (id, index, arr) => {
await browser.newPage().then(async (page) => {
await page.goto(`http://somelink.com/${id}`).then(async () => {
await page.$eval('div.info > table > tbody', async (heading) => {
return heading.innerText;
}).then(async (result) => {
// NOTE: this closes the shared browser from inside a single iteration, while the
// other iterations may still be using it.
await browser.close();
console.log(result);
});
});
});
});
});
};
// The promise returned by call() is neither awaited nor given a rejection handler.
call();
forEach executes synchronously: it does not wait for the promises returned by an async callback, so all iterations start at once. Replace forEach with a simple for loop.
// Visit the IDs one after another in a single page. `browser` is the instance
// returned by puppeteer.launch() in the surrounding code.
const arrIDs = [8322072, 1016816, 9312604, 1727088, 9312599, 8477729];

try {
  const page = await browser.newPage();

  for (const id of arrIDs) {
    await page.goto(`http://somelink.com/${id}`);

    try {
      const result = await page.$eval('div.info > table > tbody', (heading) => heading.innerText);
      console.log(result);
    } catch (err) {
      // A missing table should not stop the remaining IDs, but it should be visible.
      console.error(`Could not read the table for ${id}:`, err);
    }
  }
} finally {
  // Close the browser even when a navigation fails part-way through.
  await browser.close();
}
The way you've formatted and nested everything looks like an incarnation of callback hell.
Here's my suggestion. It's not working as-is, but the structure is going to work better for async/await:
const puppeteer = require("puppeteer");
const chromium_path_706915 =
"706915/chrome.exe";
/**
 * Visit the page of every ID in `arrIDs`, one after another, and log the text
 * of each matching table body.
 *
 * @returns {Promise<void>} Settles once every ID has been processed.
 */
async function Run() {
  /**
   * Launch a browser, scrape one URL, log the result and close the browser.
   * @param {string} url - Page to open.
   */
  async function Navigate(url) {
    const browser = await puppeteer.launch({
      executablePath: chromium_path_706915,
      args: ["--auto-open-devtools-for-tabs"],
      headless: false
    });

    try {
      const page = await browser.newPage();
      await page.goto(url);
      const result = await page.$$eval("div.info > table > tbody", (bodies) =>
        bodies.map((body) => ({
          etd: body.innerText.trim()
        }))
      );
      console.log(result);
    } finally {
      // Close the browser even when navigation or evaluation fails.
      await browser.close();
    }
  }

  // for...of waits for each navigation to finish before starting the next one;
  // forEach would not.
  for (const id of arrIDs) {
    await Navigate(`http://somelink.com/${id}`);
  }
}

// The function is declared as `Run`; calling `run()` is a ReferenceError.
Run().catch((err) => console.error(err));
On top of the other answers, I want to point out that async and forEach loops don't exactly play as expected. One possible solution is having a custom implementation that supports this:
Utility function:
/**
 * Sequential, promise-aware counterpart of `Array.prototype.forEach`: the
 * callback for one element is awaited before the next element is visited.
 *
 * @param array - Items to visit, in index order.
 * @param callback - Invoked with (item, index, array); may return a promise.
 * @returns A promise that settles after the last callback has finished, or
 *          rejects with the first error a callback throws.
 */
async function asyncForEach<T>(
  array: T[],
  callback: (item: T, index: number, array: T[]) => unknown,
): Promise<void> {
  for (let index = 0; index < array.length; index++) {
    await callback(array[index], index, array);
  }
}
Example usage:
// Example: prints 1, 2, 3 in order (each after its own wait), then 'Done'.
async function start() {
  const printAfterWait = async (num) => {
    await waitFor(50);
    console.log(num);
  };

  await asyncForEach([1, 2, 3], printAfterWait);
  console.log('Done');
}

start();
Going through this article by Sebastien Chopin can help make it a bit more clear as to why async/await and forEach act unexpectedly. Here it is as a gist.
I have two blocks of code. First is using async await
// Question exhibit (first variant). Written in method shorthand, so it presumably
// belongs to a class or object literal that is not shown. `(...)` and `{...}` are
// placeholders left by the author.
async sendEmailNotifications() {
try {
const users = await User.find(...)
// One async task per user; each task waits for one sub-task per entry of user.appId.
const promises = users.map(async(user) => {
const _promises = user.appId.map(async(app) => {
// The two counts are awaited one after the other.
const todayVisitorsCount = await Session.count({...})
const yesterdayVisitorsCount = await UserSession.count({...})
const emailObj = {
todayVisitorsCount,
yesterdayVisitorsCount
}
// The awaited value is stored but not read afterwards.
const sendNotification = await emailService.analyticsNotification(emailObj)
})
await Promise.all(_promises)
})
// NOTE: the array of per-user promises is returned without being awaited, so a
// rejection inside them is not caught by the catch block below.
return promises
} catch (err) {
// The error is returned as an ordinary value rather than rethrown.
return err
}
}
(await sendEmailNotifications())
And then I have using Promise.all
// Question exhibit (second variant). As written, `await` appears inside functions
// that are not marked `async` (the method itself and the users.map callback).
sendEmailNotifications() {
const users = await User.find(...)
const promises = users.map((user) => {
const allPromises = []
// For every app, two count queries are started without waiting:
// UserSession.count first, then Session.count.
user.appId.map((app) => {
allPromises.push(UserSession.count({...}))
allPromises.push(Session.count({...}))
})
// All queued counts are awaited together.
const data = await Promise.all(allPromises)
// Only the first two results are read, whatever the number of apps.
const emailObj = {
todayVisitorsCount: data[0],
yesterdayVisitorsCount: data[1]
}
const sendNotification = await emailService.analyticsNotification(emailObj)
})
return promises
}
// NOTE(review): `sendNotification` is only declared inside the map callback above;
// presumably `sendEmailNotifications()` was meant here — confirm.
sendNotification.then((data) => console.log(data))
Now I need to know which piece of code will execute faster. One runs in series (async/await) and one in parallel (Promise.all). Which has better performance?
In the first code, you have two separate await statements:
const todayVisitorsCount = await Session.count({...})
const yesterdayVisitorsCount = await UserSession.count({...})
whereas in the second, you only have one, before a Promise.all:
const data = await Promise.all(allPromises)
In the first code, the second Promise will only initialize after the first Promise has finished, resulting in a longer time required before the script ends. For example:
// Returns a promise that resolves after one second.
const fn = () =>
  new Promise((resolve) => {
    setTimeout(resolve, 1000);
  });

console.log('start');

// Sequential: the second fn() is not even called until the first has resolved.
const runSequentially = async () => {
  await fn();
  await fn();
  console.log('two awaits done');
};

// Parallel: both fn() calls are made up front and awaited as a group.
const runInParallel = async () => {
  await Promise.all([fn(), fn()]);
  console.log('Promise.all done');
};

runSequentially();
runInParallel();
The version without Promise.all pauses the function when the first call of fn() is made, and waits for the Promise returned by fn() to resolve (1000 ms) before proceeding to the next line. The next line calls fn() again, and the await waits for it to complete (1000 more ms).
In contrast, the Promise.all version calls both fn()s immediately - both Promises are initialized, and the await that pauses the function is waiting for both Promises to complete. There's no down time between the initialization of the first Promise and the initialization of the second Promise.
So, the Promise.all version will run significantly more quickly than the version with two awaits. Using Promise.all will be preferable unless the first Promise (UserSession.count) must be completed before the second Promise (Session.count) starts.
With destructuring and without unnecessary variables, this is how I would clean up your Promise.all code; you might consider it a bit more readable:
async sendEmailNotifications() {
const users = await User.find();
return users.map(async (user) => {
const [todayVisitorsCount, yesterdayVisitorsCount] = await Promise.all([
UserSession.count(),
Session.count()
]);
await emailService.analyticsNotification({ todayVisitorsCount, yesterdayVisitorsCount });
});
}
I wonder why my second console.log() doesn't log anything to the console...
// Question exhibit: a test whose body is wrapped in an immediately invoked async function.
describe('Puppeteer', () => {
// The `it` callback itself is synchronous and returns nothing: the promise created by
// the async IIFE below is not returned, so the test runner is given nothing to wait on.
it('Does not log', () => {
(async () => {
console.log('This logs'); // <- works
const browser = await puppeteer.launch({
headless: true,
args: [
'--incognito'
]
});
await console.log('This does not log'); // <- Does not work
console.log('This does not log too'); // <- This neither
const page = await browser.newPage();
await page.goto('....');
....
expect(a < b)
.toEqual(true);
// The result of close() is not awaited.
browser.close();
})();
});
});
Is there any reason why that does not log?
Solution: This does not work because the async block is invoked immediately and its promise is discarded, so the test finishes without waiting for it. Make sure to pass the async function itself to it() rather than a self-executing one.
An example of a self-executing function is (()=>{})(). This prevents the test from resolving properly.
Here is the cleaned up code:
const puppeteer = require('puppeteer');
const assert = require('assert');
// The async callback is handed to it() directly, so the test runner receives its
// promise and waits for it to settle before finishing the test.
describe('Puppeteer', () => {
  it('Does log', async () => { // <== PASS THE FUNCTION HERE
    const browser = await puppeteer.launch({args: ['--incognito']});

    try {
      console.log('This logs now');

      const page = await browser.newPage();
      await page.goto('https://example.org');
      const title = await page.title();

      assert.equal(title, 'Example Domain');
      console.log('This logs too');
    } finally {
      // Close the browser even when the assertion fails, so no Chromium process is left behind.
      await browser.close();
    }
  });
});
Result:
The question uses Jest instead of Mocha. The code for Jest is almost the same, except for the following line:
// assert.equal(title, 'Example Domain');
expect(title).toEqual('Example Domain');
Result:
Optionally if you want to stack the logs together, you can pass --verbose=false when running jest.