NodeJS pagination, recursive promise problem - javascript

I am scraping multiple pages with cheerio and axios in Node.js.
I am having a hard time with Promises: can someone help me return the JSON once I hit the last page? Thanks!
const getWebsiteContent = async (url) => {
    await axios.get(url).then(res => {
        const $ = cheerio.load(res.data)
        pageNum = getTotalpages($); // Get the pagination
        console.log(url);
        // Some scraping here
    })
    indexPage++; // Increment to the next page
    const nextPageLink = baseUrl + '&page=' + indexPage; // get next page
    if (indexPage > pageNum) {
        var editedText = text.slice(0, text.length - 1);
        editedText += ']}';
        editedText = JSON.parse(editedText); // I want to return this and use elsewhere
        return editedText;
    }
    setTimeout(async () => {
        getWebsiteContent(nextPageLink); // Call itself
    }, 1000);
}
var myJSON = await getWebsiteContent(baseUrl); // something like this

I would write getPages as an async generator -
async function* getPages (href, initPage = 0) {
    const res = await axios.get(setPage(href, initPage))
    const $ = cheerio.load(res.data)
    const pages = getTotalpages($)
    yield { page: initPage, dom: $ }
    for (let p = initPage + 1; p < pages; p++) { // start after the page we already fetched
        await sleep(1000)
        const r = await axios.get(setPage(href, p))
        yield { page: p, dom: cheerio.load(r.data) }
    }
}
This depends on a helper, setPage, that manipulates the href page number using the url module, which is much safer than cobbling strings together by hand -
function setPage (href, page) {
    const u = new URL(href)
    u.searchParams.set("page", page)
    return u.toString()
}
And another helper, sleep, which avoids mixing setTimeout directly into async-based code. This allows us to easily pause between pages -
async function sleep (ms) {
    return new Promise(r => setTimeout(r, ms))
}
Finally we write scrape, which is a simple wrapper around getPages. This allows us to reuse the getPages function to scrape various elements as needed. A benefit of this approach is that the caller can determine what happens with each page. Below we push to a result array, but as another example we could write each page to disk using the fs module. Obviously this is for you to decide -
async function scrape (href) {
    const result = []
    for await (const { page, dom } of getPages(href)) {
        console.log("scraped page", page) // some status message
        result.push(getSomeData(dom)) // get something from each page
    }
    return result
}
scrape(myUrl).then(console.log, console.error)
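As another example of that flexibility, here is a minimal sketch of a consumer that writes each page's HTML to disk instead of collecting results in memory (the file naming is illustrative):

const fs = require("fs/promises")

// Alternative consumer: persist each page rather than building an array.
async function scrapeToDisk (href) {
    for await (const { page, dom } of getPages(href)) {
        await fs.writeFile(`page-${page}.html`, dom.html()) // dom is the cheerio instance for that page
        console.log("saved page", page)
    }
}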

You shouldn't be mixing .then() with your async/await code.
Pagination should look something like this:
let response = await axios.get(url)
let $ = cheerio.load(response.data)
// do some scraping
while (url = $('[rel=next]').attr('href')) {
    response = await axios.get(url)
    $ = cheerio.load(response.data)
    // do more scraping
}
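A minimal self-contained version of that loop, assuming axios and cheerio are installed and that each page links to the next one with rel=next, might look like this:

const axios = require('axios')
const cheerio = require('cheerio')

// Follow rel=next links until the site stops providing one.
async function scrapeAll (startUrl) {
    let url = startUrl
    const results = []
    while (url) {
        const response = await axios.get(url)
        const $ = cheerio.load(response.data)
        results.push($('title').text()) // placeholder: scrape whatever you actually need here
        url = $('[rel=next]').attr('href') // undefined on the last page, which ends the loop
        // note: if the href is relative, resolve it against the current page URL first
    }
    return results
}

scrapeAll(baseUrl).then(console.log, console.error)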

Related

Cannot get querySelectorAll to work with puppeteer (returns undefined)

I'm trying to practice some web scraping with prices from a supermarket, using Node.js and Puppeteer. I can navigate through the website at the beginning, accepting cookies and clicking a "load more" button. But when I try to read the divs containing the products with querySelectorAll, I get stuck: it returns undefined even though I wait for a specific div to be present. What am I missing?
The problem is at the end of the code block.
const { product } = require("puppeteer");

const scraperObjectAll = {
    url: 'https://www.bilkatogo.dk/s/?query=',
    async scraper(browser) {
        let page = await browser.newPage();
        console.log(`Navigating to ${this.url}`);
        await page.goto(this.url);

        // accept cookies
        await page.evaluate(_ => {
            CookieInformation.submitAllCategories();
        });

        var productsRead = 0;
        var productsTotal = Number.MAX_VALUE;

        while (productsRead < 100) {
            // Wait for the required DOM to be rendered
            await page.waitForSelector('button.btn.btn-dark.border-radius.my-3');
            // Click button to read more products
            await page.evaluate(_ => {
                document.querySelector("button.btn.btn-dark.border-radius.my-3").click()
            });
            // Wait for it to load the new products
            await page.waitForSelector('div.col-10.col-sm-4.col-lg-2.text-center.mt-4.text-secondary');
            // Get number of products read and total
            const loadProducts = await page.evaluate(_ => {
                let p = document.querySelector("div.col-10.col-sm-4.col-lg-2").innerText.replace("INDLÆS FLERE", "").replace("Du har set ", "").replace(" ", "").replace(/(\r\n|\n|\r)/gm, "").split("af ");
                return p;
            });
            console.log("Products (read/total): " + loadProducts);
            productsRead = loadProducts[0];
            productsTotal = loadProducts[1];

            // Now waiting for a div element
            await page.waitForSelector('div[data-productid]');
            const getProducts = await page.evaluate(_ => {
                return document.querySelectorAll('div');
            });

            // PROBLEM HERE!
            // Cannot convert undefined or null to object
            console.log("LENGTH: " + Array.from(getProducts).length);
        }
    }
}
The callback passed to page.evaluate runs in the emulated page context, not in the standard scope of the Node script. Values can't be passed between the page and the Node script without careful consideration: most importantly, if something isn't serializable (convertible to plain JSON), it can't be transferred.
querySelectorAll returns a NodeList, and NodeLists only exist on the front-end, not on the back-end. Similarly, NodeLists contain HTMLElements, which also only exist on the front-end.
Put all the logic that uses data that exists only on the front-end inside the .evaluate callback, for example:
const numberOfDivs = await page.evaluate(_ => {
    return document.querySelectorAll('div').length;
});
or
const firstDivText = await page.evaluate(_ => {
    return document.querySelector('div').textContent;
});
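If you need data about every product, map the NodeList to plain, serializable values inside the callback before returning it. A sketch, assuming each product div carries the data-productid attribute used in the selector above:

const products = await page.evaluate(_ => {
    // Convert the NodeList to an array of plain objects so it survives serialization.
    return Array.from(document.querySelectorAll('div[data-productid]')).map(el => ({
        id: el.getAttribute('data-productid'),
        text: el.innerText
    }));
});
console.log("LENGTH: " + products.length);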

How to wait for all the code in a promise to finish before resolving it? (but a little more complex)

Sorry for the very confusing question. I have this code that gets information from a website without any node modules or libraries. It is a list of users separated into different pages, using ?page= at the end of the URL. I have managed to iterate through the pages and split up the raw HTML just right. However, my promise resolves before all the data is collected. How can I wait for everything to finish before I resolve the promise? I have tried countless solutions, but none seem to work. Please don't ask me to use a node package, as my goal is to not use one :) A friend helped with the regex and splitting it up. Here is the code I am using:
function getData() {
    return new Promise((resolve, reject) => {
        let final = [] // the array of users returned in the end
        const https = require("https"), url = "https://buildtheearth.net/buildteams/121/members";
        https.get(url + "?page=1", request => { // initial request, gets the number of user pages.
            let rawList = '';
            request.setEncoding("utf8"),
            request.on("data", data => { rawList += data }),
            request.on("end", () => {
                if (request = (request = (request = rawList.substring(rawList.indexOf('<div class="pagination">'))).substring(0, request.indexOf("</div>"))).match(/<a(.+)>(.+)<\/a>/g)) {
                    for (let t = parseInt(request[request.length - 1].match(/(\d+)(?!.*\d)/g)), a = 1; a < t + 1; a++) { // iterates through member pages
                        https.get(url + "?page=" + a, request2 => { // https request for each page of members
                            let rawList2 = '';
                            request2.setEncoding('utf8'),
                            request2.on("data", data => { rawList2 += data }),
                            request2.on("end", () => {
                                let i = rawList2.match(/<td>(.+)<\/td>/g); // finds table in HTML
                                if (i)
                                    for (var t = 1; t < i.length; t += 3) // iterates through rows in table
                                        console.log(i[t].replace(/<td>/g, "").replace(/<\/td>/g, "")), /* logs element to the console (for testing) */
                                        final.push(i[t].replace(/<td>/g, "").replace(/<\/td>/g, "")); // pushes element to the array that is resolved in the end
                            })
                        })
                    }
                }
                resolve(final) // resolves promise returning final array, but resolves before elements are added with code above
            })
        })
    })
}
If this helps, here is the website I am trying to get info from.
I am still a little new to JS so if you could help, I would really appreciate it :)
I ended up turning each action into an async function with a try/catch block and then chained the functions together with .then(). For the base (getting data from a website) I took inspiration from an article on Medium. Here is the site I am pulling data from, and here is the function to get data from a website:
const http = require('http');
const https = require('https');

const getData = async (url) => {
    const lib = url.startsWith('https://') ? https : http;
    return new Promise((resolve, reject) => {
        const req = lib.get(url, res => {
            if (res.statusCode < 200 || res.statusCode >= 300) {
                return reject(new Error(`Status Code: ${res.statusCode}`));
            }
            const data = [];
            res.on('data', chunk => data.push(chunk));
            res.on('end', () => resolve(Buffer.concat(data).toString()));
        });
        req.on('error', reject);
        req.end();
    });
};
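Used on its own, it resolves to the raw HTML string, for example (the URL is illustrative):

getData('https://example.com/')
    .then(html => console.log(html.length))
    .catch(console.error);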
and then I got the number of pages (which can be accessed by appending ?page=<page number> to the end of the url) with this function:
const pages = async () => {
    try {
        let html = await getData('https://buildtheearth.net/buildteams/121/members');
        let pages = (html = (html = html.substring(html.indexOf('<div class="pagination">'))).substring(0, html.indexOf("</div>"))).match(/<a(.+)>(.+)<\/a>/g);
        let pageCount = parseInt(pages[pages.length - 1].match(/(\d+)(?!.*\d)/g));
        return pageCount;
    } catch (error) {
        console.error(error);
    }
}
and then I used the page count to iterate through the pages and add the HTML of each to an array with this function:
const getPages = async pageCount => {
    let returns = []
    try {
        for (let page = 1; page <= pageCount; page++) {
            try {
                let pageData = await getData('https://buildtheearth.net/buildteams/121/members?page=' + page)
                returns.push(pageData)
            } catch (error) {
                return error
            }
        }
    } catch (error) {
        return error
    } finally {
        return returns
    }
}
and then I iterated through the array of HTML strings for each page, and extracted the data I needed from each with this function, which returns the list of members I need:
const iteratePages = async pages => {
    if (!Array.isArray(pages)) return
    try {
        let returns = []
        pages.forEach(page => {
            let list = page.match(/<td>(.+)<\/td>/g);
            if (list)
                for (var element = 1; element < list.length; element += 3)
                    returns.push(list[element].replace(/<td>/g, "").replace(/<\/td>/g, ""));
        })
        return returns
    } catch (error) {
        return error
    }
}
And then it was a matter of chaining each together to get the array I needed:
pages()
    .then(pageCount => getPages(pageCount))
    .then(pages => iteratePages(pages))
    .then(finalList => {
        console.log(finalList);
        console.log(finalList.length);
    })
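The same flow reads a little more directly with async/await (a sketch using the functions above):

(async () => {
    const pageCount = await pages()
    const pageHtml = await getPages(pageCount)
    const finalList = await iteratePages(pageHtml)
    console.log(finalList)
    console.log(finalList.length)
})()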

Code won't execute in function containing return?

I've run into some issues with my scoping; I'm new to JavaScript, so my apologies for my inexperience. The issue I'm having is that I don't understand why the console.log inside the curly braces of the function groupA(thename) won't output anything. Visual Studio greys the code out. I think it may be due to the return fetch(`${url}${thename}`), because the code won't grey out when I remove that return. Why does the return statement do this?
My goal is to correctly chain each function together so the data pushed into birds_array can be returned via the groupA function. How would I go about correctly formatting these functions to make this possible?
const birds_array = []
let groupAname = "Old World sparrow"

groupA(groupAname);

function groupA(thename) {
    return fetch(`${url}${thename}`)
        .then(response => response.text())
        .then(body => {
            const $ = cheerio.load(body);
            $('.llgymd').each((index, element) => {
                const $element = $(element);
                const names = $element.text();
                groupA_array[index] = names;
            });
            groupA_loop()
            function groupA_loop() {
                for (j = 0; j < groupA_array.length; j++) {
                    if (!birds_array.includes(groupA_array[j])) {
                        birds_array.push(groupA_array[j]);
                    }
                    // code functions here
                    console.log(birds_array);
                }
                // code functions here
                console.log(birds_array);
            }
            // code functions here
            console.log(birds_array);
        });
    // code does not function here
    console.log(birds_array);
}
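The console.log after return fetch(...) is greyed out because a return statement ends the function, so nothing placed after it ever runs. To get the scraped names back out, return birds_array from the end of the promise chain and await the call. A minimal sketch, assuming url, cheerio, and fetch are already set up as in the question:

function groupA(thename) {
    return fetch(`${url}${thename}`)
        .then(response => response.text())
        .then(body => {
            const $ = cheerio.load(body);
            const birds_array = [];
            $('.llgymd').each((index, element) => {
                const name = $(element).text();
                if (!birds_array.includes(name)) {
                    birds_array.push(name);
                }
            });
            return birds_array; // this becomes the resolved value of the returned promise
        });
}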
// Create an async function so you can await the async code; otherwise you get an error.
async function main() {
    let groupAname = "Old World sparrow";
    // Await the promise to be resolved before continuing.
    const birds_array = await groupA(groupAname);
    // Result is ready, print it.
    console.log(birds_array)
}

main()
Sample:
const delayData = (num) => {
    return new Promise((r) => {
        setTimeout(() => {
            r(num);
        }, 1000);
    });
};

async function main() {
    const num = await delayData(1);
    console.log(num); // 1
    const num2 = await delayData(2);
    console.log(num2); // 2
}

main();

// Using then
delayData(1).then(num => {
    console.log(num) // 1
})
delayData(2).then(num => {
    console.log(num) // 2
})

Promise condition equal

I want to wait in my code until two values are the same. For this I use
await new Promise((resolve, reject) => {
    if (curCount == maxTests) resolve;
});
But I think this is only called once. How can I make it so that the promise resolves once both values are the same? And how do I avoid it never resolving at all?
UPDATE:
Some people requested the function that is causing trouble. Here is the whole function, without its sub-functions. The function fills the queue so the tests run in sequence. The problem is that req.body.selection.forEach returns immediately, but I want to wait until the whole queue is finished. So my idea was to add a promise at the end and wait until current and max are the same.
router.post('/imgtest', async (req, res) => {
    console.log('Start Image Test');
    // The request arrives here.
    req.setTimeout(5000000); // roughly 83 minutes
    process.setMaxListeners(0);
    io = req.app.get('socketio');

    // Calculate the max amount of tests
    const maxTests = req.body.selection.length * req.body.servers.length;
    var curCount = 0;

    // RETURN IF THIS IS READY. CURRENTLY IT RETURNS IMMEDIATELY
    req.body.selection.forEach(async function(entry) {
        // First fetch the domain from the DB
        var dbUrl = await getUrl(entry);
        console.log('tapsi');

        var bildFormat = '';
        var arrQuestionmark = dbUrl.split('?');
        if (arrQuestionmark.length == 2) {
            if (arrQuestionmark[1].includes('&')) {
                var arrAnd = arrQuestionmark[1].split('&');
                arrAnd.forEach(function(entry) {
                    if (entry.includes('format=')) {
                        var arrFormat = entry.split('=');
                        bildFormat = arrFormat[1];
                    }
                });
            }
        }

        var masterName = uuidv1();
        const orgpath = path.resolve(__basedir, 'tests/downloads', masterName + '-.' + bildFormat);

        // Download the master image
        (async () => {
            await queue.add(() => downloadImage(dbUrl, 'c11', req.body.domain, bildFormat, orgpath));
        })();

        req.body.servers.forEach(async function(xserver) {
            var fileName = masterName + '-' + xserver + '.' + bildFormat;
            const dpath = path.resolve(__basedir, 'tests/downloads', fileName);

            (async () => {
                await queue.add(() => downloadImage(dbUrl, xserver, req.body.domain, bildFormat, dpath));
                // console.log('Done ' + entry);
            })();

            (async () => {
                await queue.add(async () => startCompare(orgpath, dpath, 'null:').then(function(result) {
                    console.log(result);
                    curCount++;
                    messageIO(curCount, maxTests);
                }));
                // console.log('done compare ' + entry);
                // fs.unlinkSync(dpath);
            })();
        });
    });

    console.log('Need to wait');
    res.sendStatus(200);
});
You're correct in assuming that it will only be called once. A way around that is to create a loop via setInterval inside the function, doing a regular check, then resolving and clearing the interval once the condition is true.
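A minimal sketch of that idea (the interval length is arbitrary):

function waitUntilEqual(getCurrent, getMax, intervalMs = 100) {
    return new Promise(resolve => {
        const timer = setInterval(() => {
            if (getCurrent() === getMax()) { // re-check on every tick
                clearInterval(timer);        // stop polling
                resolve();
            }
        }, intervalMs);
    });
}

// usage: await waitUntilEqual(() => curCount, () => maxTests);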
Not too sure what you're trying to achieve, but one thing is certain: Array.prototype.forEach() will not await, even if its callback is async, and performing a test inside a new Promise(...) constructor won't help.
Good news though: for loops will await.
Here's the code with for loops instead of .forEach() (twice), unnecessary stuff removed, and otherwise tidied up.
First a suggestion ...
// https://nodejs.org/api/querystring.html
// The querystring module provides utilities for parsing and formatting URL query strings.
const querystring = require('querystring');
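For example, querystring.parse works on just the query portion of a URL (the part after the ?), so the format parameter can be pulled out like this (the URL is illustrative):

const querystring = require('querystring');

const dbUrl = 'https://example.com/image.jpg?width=200&format=webp'; // illustrative
const query = dbUrl.split('?')[1] || '';             // take only the query string
const bildFormat = querystring.parse(query).format;  // 'webp'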
... then:
router.post('/imgtest', async (req, res) => {
    req.setTimeout(5000000); // roughly 83 minutes
    process.setMaxListeners(0);
    // io = req.app.get('socketio'); // not used

    const masterName = uuidv1(); // moved from inner loop

    for (let i = 0; i < req.body.selection.length; i++) {
        let entry = req.body.selection[i];
        let dbUrl = await getUrl(entry);
        let bildFormat = querystring.parse(dbUrl.split('?')[1] || '').format;
        let orgpath = path.resolve(__basedir, 'tests/downloads', `${masterName}-.${bildFormat}`);

        await queue.add(() => downloadImage(dbUrl, 'c11', req.body.domain, bildFormat, orgpath));

        for (let j = 0; j < req.body.servers.length; j++) {
            let xserver = req.body.servers[j];
            let dpath = path.resolve(__basedir, 'tests/downloads', `${masterName}-${xserver}.${bildFormat}`);

            await queue.add(() => downloadImage(dbUrl, xserver, req.body.domain, bildFormat, dpath));
            await queue.add(async () => startCompare(orgpath, dpath, 'null:')); // probably safe to remove `async`
            // fs.unlinkSync(dpath); // ???
        }
    }

    res.sendStatus(200);
});
This should get you started. I expect that you still have some way to go. At the very least you need to add a try/catch structure and be prepared to send error/status back to the client.
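As a rough sketch of what that error handling might look like (the status codes are only a suggestion):

router.post('/imgtest', async (req, res) => {
    try {
        // ... the loops from above ...
        res.sendStatus(200);
    } catch (err) {
        console.error(err);
        res.status(500).send(err.message); // report the failure to the client
    }
});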

Unable to receive proper data from the promise function

I am trying to scrape a Wikipedia page to fetch a list of airlines by first scraping the main page and then going to each individual airline page to get its website URL. I have divided the code into two functions: one to scrape the main page and build a new URL, and a second to scrape the airline page at that URL and get the website name. I have used the request-promise module for getting the HTML and then cheerio to parse the data.
export async function getAirlinesWebsites(req, res) {
    let response = await request(options_mainpage);
    console.log(`Data`);
    let $ = cheerio.load(response);
    console.log('Response got');
    $('tr').each((i, e) => {
        let children = '';
        console.log('inside function ', i);
        if ($(e).children('td').children('a').attr('class') !== 'new') {
            children = $(e).children('td').children('a').attr('href');
            let wiki_url = 'https://en.wikipedia.org' + children;
            console.log(`wiki_url = ${wiki_url}`);
            let airline_url = getAirlineUrl(wiki_url);
            console.log(`airline_url = ${airline_url}`);
        }
    })
}
And then the getAirlineUrl() function will parse another page based on the provided url.
async function getAirlineUrl(url) {
    const wiki_child_options = {
        url: url,
        headers: headers
    }
    let child_response = await request(wiki_child_options);
    let $ = cheerio.load(child_response);
    let answer = $('.infobox.vcard').children('tbody').children('tr').children('td').children('span.url').text();
    return answer;
}
However when I console log the answer variable in the parent function, I get a [object Promise] value instead of a String. How do I resolve this issue?
An async function returns a promise. Because of that, you need to use then to get the resolved value, or use await.
This should work if the other parts of your code are OK.
export async function getAirlinesWebsites(req, res) {
    let response = await request(options_mainpage);
    console.log(`Data`);
    let $ = cheerio.load(response);
    console.log("Response got");
    $("tr").each(async (i, e) => {
        let children = "";
        console.log("inside function ", i);
        if ($(e).children("td").children("a").attr("class") !== "new") {
            children = $(e).children("td").children("a").attr("href");
            let wiki_url = "https://en.wikipedia.org" + children;
            console.log(`wiki_url = ${wiki_url}`);
            let airline_url = await getAirlineUrl(wiki_url);
            console.log(`airline_url = ${airline_url}`);
        }
    });
}
Since your getAirlineUrl function returns a promise, you need to await that promise. You can't have await nested inside of the .each callback because the callback is not an async function, and even if it were, .each would not wait for it. The best fix is to avoid using .each and just use a loop.
export async function getAirlinesWebsites(req, res) {
    let response = await request(options_mainpage);
    console.log(`Data`);
    let $ = cheerio.load(response);
    console.log('Response got');
    for (const [i, e] of Array.from($('tr')).entries()) {
        let children = '';
        console.log('inside function ', i);
        if ($(e).children('td').children('a').attr('class') !== 'new') {
            children = $(e).children('td').children('a').attr('href');
            let wiki_url = 'https://en.wikipedia.org' + children;
            console.log(`wiki_url = ${wiki_url}`);
            let airline_url = await getAirlineUrl(wiki_url);
            console.log(`airline_url = ${airline_url}`);
        }
    }
}
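If the pages don't need to be fetched one at a time, a variation is to collect the URLs first and then fetch the airline pages concurrently with Promise.all. A sketch, reusing getAirlineUrl from above:

export async function getAirlinesWebsites(req, res) {
    const response = await request(options_mainpage);
    const $ = cheerio.load(response);

    // Collect the wiki URLs first, then fetch all airline pages in parallel.
    const wikiUrls = Array.from($('tr'))
        .filter(e => $(e).children('td').children('a').attr('class') !== 'new')
        .map(e => 'https://en.wikipedia.org' + $(e).children('td').children('a').attr('href'));

    const airlineUrls = await Promise.all(wikiUrls.map(url => getAirlineUrl(url)));
    console.log(airlineUrls);
}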
