Web scraping pagination in a single URL (cheerio and axios) - javascript

Newbie here. I'm working on a web scraping project and would like some guidance on web scraping pagination techniques. I'm scraping this site: https://www.imoney.my/unit-trust-investments. As you can see, I want to retrieve a different "Total return" percentage depending on the number of years selected. Right now I'm using cheerio and axios.
const http = require("http");
const axios = require("axios");
const cheerio = require("cheerio");

http
  .createServer(async function (_, res) {
    try {
      const response = await axios.get(
        "https://www.imoney.my/unit-trust-investments"
      );
      const $ = cheerio.load(response.data);
      const funds = [];
      $("[class='list-item']").each((_i, row) => {
        const $row = $(row);
        const fund = $row.find("[class*='product-title']").find("a").text();
        const price = $row
          .find("[class*='is-narrow product-profit']")
          .find("b")
          .text();
        const risk = $row
          .find("[class*='product-title']")
          .find("[class*='font-xsm extra-info']")
          .text()
          .replace("/10", "");
        const totalreturn = $row
          .find("[class*='product-return']")
          .find("[class='font-lg']")
          .find("b")
          .text()
          .replace("%", "");
        funds.push({ fund, price, risk, totalreturn });
      });
      res.statusCode = 200;
      res.write(JSON.stringify(funds, null, 4));
    } catch (err) {
      res.statusCode = 400;
      res.write("Unable to process request.");
    }
    res.end();
  })
  .listen(8080);
Do note: the URL does not change when a different year is selected; only the value for total return changes.

This happens because the page uses JavaScript to generate the content, so the static HTML that axios fetches never reflects the selected year. In this case you need something like Puppeteer, which drives a real browser:
const puppeteer = require("puppeteer");

const availableFunds = "10000";
const years = 2; // 3 for 0.5 years; 2 for 1 year; 1 for 2 years; 0 for 3 years.

async function start() {
  const browser = await puppeteer.launch({
    headless: false,
  });
  const page = await browser.newPage();
  await page.goto("https://www.imoney.my/unit-trust-investments");
  await page.waitForSelector(".product-item");

  // Clear the default amount and type in our own.
  await page.focus("#amount");
  for (let i = 0; i < 5; i++) {
    await page.keyboard.press("Backspace");
  }
  await page.type("#amount", availableFunds);

  // Pick the investment period via the dropdown's keyboard controls.
  await page.click("#tenure");
  for (let i = 0; i < years; i++) {
    await page.keyboard.press("ArrowUp");
  }
  await page.keyboard.press("Enter");

  const funds = await page.evaluate(() => {
    const funds = [];
    Array.from(document.querySelectorAll(".product-item")).forEach((el) => {
      const fund = el.querySelector(".title")?.textContent.trim();
      const price = el.querySelector(".investmentReturnValue")?.textContent.trim();
      const risk = el.querySelector(".col-title .info-desc dd")?.textContent.trim();
      const totalreturn = el
        .querySelector(".col-rate.text-left .info-desc .ir-value")
        ?.textContent.trim();
      if (fund && price && risk && totalreturn) {
        funds.push({ fund, price, risk, totalreturn });
      }
    });
    return funds;
  });
  console.log(funds);
  await browser.close();
}

start();
Output:

[
  {
    fund: 'Aberdeen Standard Islamic World Equity Fund - Class A',
    price: 'RM 12,651.20',
    risk: 'Medium\n 7/10',
    totalreturn: '26.51'
  },
  {
    fund: 'Affin Hwang Select Balanced Fund',
    price: 'RM 10,355.52',
    risk: 'Medium\n 5/10',
    totalreturn: '3.56'
  },
  ... and others
]

Related

Puppeteer Node Js YouTube data scraping error "Evaluation Failed"

I am trying to scrape the YouTube headline and link from a channel using Puppeteer. While executing the program, I am getting the following evaluation error:
Error: Evaluation failed: TypeError: Cannot read properties of null (reading 'innerText')
at pptr://__puppeteer_evaluation_script__:10:65
at ExecutionContext._ExecutionContext_evaluate (E:\somoy\node_modules\puppeteer-core\lib\cjs\puppeteer\common\ExecutionContext.js:229:15)
at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
at async ExecutionContext.evaluate (E:\somoy\node_modules\puppeteer-core\lib\cjs\puppeteer\common\ExecutionContext.js:107:16)
at async initiate (E:\somoy\appNew.js:45:20)
at async E:\somoy\appNew.js:155:9
async function initiate() {
  const browser = await puppeteer.launch({
    headless: false,
    defaultViewport: null,
    userDataDir: './userdata',
    executablePath: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe'
  });
  const page = await browser.newPage();
  page.setDefaultNavigationTimeout(0);
  await page.goto('https://www.youtube.com/#ProthomAlo/videos', { waitUntil: 'networkidle2' });
  await delay(5000);

  if (!fs.existsSync('storeLink.txt')) {
    // create a new file if it does not exist
    fs.writeFileSync('storeLink.txt', '');
  }
  articleLinkarr = (fs.readFileSync('storeLink.txt', { encoding: 'utf8' })).split('\n');

  let articles = await page.evaluate(async (articleLinkarr) => {
    let arrObj = [];
    articles = document.querySelectorAll('.style-scope.ytd-rich-grid-media');
    for (let i = 0; i < articles.length; i++) {
      // const category = document.querySelector('.print-entity-section-wrapper.F93gk').innerText
      // const headline = articles[i].querySelector('div > h3').innerText
      const headline = articles[i].querySelector('h3').innerText;
      const link = 'https://www.youtube.com' + articles[i].querySelector('a').getAttribute('href');
      // if (!(link.includes('video') || link.includes('fun') || link.includes('photo'))) {
      //   if (!articleLinkarr.includes(link)) {
      arrObj.push({ articleHeadline: headline, articleLink: link });
      //   }
      // }
    }
    return arrObj;
  }, articleLinkarr);
}
Puppeteer doesn't seem necessary here if you just want the initial set of titles. There's a JSON blob in the static HTML which has the title list, so you can make a simple HTTP request to the URL and pull the blob out with an HTML parser, then walk the object structure.
const cheerio = require("cheerio"); // 1.0.0-rc.12

const url = "Your URL";

fetch(url) // Node 18 or install node-fetch
  .then(res => {
    if (!res.ok) {
      throw Error(res.statusText);
    }
    return res.text();
  })
  .then(html => {
    const $ = cheerio.load(html);
    const script = $(
      [...$("script")].find(e =>
        $(e).text().startsWith("var ytInitialData = {")
      )
    )
      .text()
      .slice(20, -1);
    const data = JSON.parse(script);
    const titles = [];
    const {contents} =
      data.contents.twoColumnBrowseResultsRenderer.tabs[1].tabRenderer
        .content.richGridRenderer;
    for (const c of contents) {
      if (!c.richItemRenderer) {
        continue;
      }
      const title =
        c.richItemRenderer.content.videoRenderer.title.runs[0].text;
      const url =
        c.richItemRenderer.content.videoRenderer.navigationEndpoint
          .commandMetadata.webCommandMetadata.url;
      titles.push({title, url});
    }
    console.log(titles);
  })
  .catch(err => console.error(err));
If you do want to use Puppeteer, you can select these titles and URLs with:
const puppeteer = require("puppeteer"); // ^19.0.0

const url = "Your URL";

let browser;
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  await page.goto(url, {waitUntil: "domcontentloaded"});
  await page.waitForSelector("#video-title-link");
  const titles = await page.$$eval("#video-title-link", els =>
    els.map(e => ({title: e.textContent, url: e.href}))
      .filter(e => e.url)
  );
  console.log(titles);
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
For some reason, the ids aren't unique.
Although this approach is less code, it is much slower than fetch (~10x slower on my machine). You can speed it up a bit by blocking irrelevant resources.
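A minimal sketch of that blocking, using Puppeteer's request interception API; which resource types to block is a judgment call, and this needs to be set up before page.goto():

await page.setRequestInterception(true);
page.on("request", req => {
  // Abort requests for resources that don't affect the text content.
  if (["image", "font", "stylesheet", "media"].includes(req.resourceType())) {
    req.abort();
  } else {
    req.continue();
  }
});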
As an aside, always use const in front of your variables to avoid making them global.
page.setDefaultNavigationTimeout(0) is generally not a great pattern; it can hang forever. I'd set this to 3 or 4 minutes at most. If navigation is taking that long, something is wrong, and you should get that logged so you can take a look at it.
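For example (the 4-minute figure is just a suggested ceiling):

// Fail loudly after 4 minutes instead of hanging forever.
page.setDefaultNavigationTimeout(4 * 60 * 1000);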

How to handle multiple functions in puppeteer-cluster?

I have a two-step program:
Get a list of hrefs from a page.
Loop infinitely over each page in this list, get an element, and display it in the console.
I tried to use separate task functions with puppeteer-cluster, but it doesn't work properly.
const { Cluster } = require('puppeteer-cluster');
const fs = require("fs");
const { addExtra } = require("puppeteer-extra");
const vanillaPuppeteer = require("puppeteer");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
var moment = require('moment');

var regexTemps = /(\d+)\s(\w+)$/;
const urlsToCheck = [];
TZ = 'Europe/Paris';
process.env.TZ = 'Europe/Paris';

(async () => {
  const puppeteer = addExtra(vanillaPuppeteer);
  puppeteer.use(StealthPlugin());

  const cluster = await Cluster.launch({
    puppeteer,
    puppeteerOptions: {
      headless: false,
      args: ['--no-sandbox'],
    },
    maxConcurrency: 10,
    concurrency: Cluster.CONCURRENCY_CONTEXT,
    monitor: false,
    skipDuplicateUrls: true,
    timeout: 30000,
    retryLimit: 10,
  });

  cluster.on('taskerror', (err, data, willRetry) => {
    if (willRetry) {
      console.warn(`Encountered an error while crawling ${data}. ${err.message}\nThis job will be retried`);
    } else {
      console.error(`Failed to crawl ${data}: ${err.message}`);
    }
  });

  const getElementOnPage = async ({ page, data: url }) => {
    console.log('=> Go to URL : ', url);
    await page.goto(url);
    while (true) {
      console.log('=> Reload URL : ', page.url());
      await page.reload();
      await page.waitForTimeout(1000);
      let allNews = await page.$$("article.news"); // [] if nothing
      let firstNews = allNews[0];
      await page.waitForTimeout(1000);
      let info = await firstNews.$eval('.info span', s => s.textContent.trim());
      console.log(new Date(), 'info : ', info);
    }
  };

  const getListOfPagesToExplore = async ({ page, data: url }) => {
    console.log(new Date(), 'Get the list of deal pages to explore');
    await page.goto(url, { waitUntil: 'domcontentloaded' });
    await page.waitForTimeout(500);
    const hrefsToVisit = await page.$x('//a');
    let idxTab = 0;
    for (let hrefToVisit of hrefsToVisit) {
      var link = await page.evaluate(el => el.getAttribute("href"), hrefToVisit);
      console.log(new Date(), 'adding link to list : ', link);
      idxTab++;
      urlsToCheck.push(link);
    }
  };

  cluster.queue('https://www.apagewithsomelinks.com', getListOfPagesToExplore);
  await cluster.idle();
  await cluster.close();

  console.log(urlsToCheck); // correctly displays several links in an array

  for (let url of urlsToCheck) {
    console.log('Push in queue : ', url);
    cluster.queue(url, getElementOnPage);
  }
  await cluster.idle();
  await cluster.close();
})();
When I launch it, it retrieves the links to scrape and displays "Push in queue : ..." for each URL. But then getElementOnPage is launched only for the first URL, where it runs infinitely as intended. Why aren't the other URLs launched?
Before this, I didn't use separate functions; I had one single task declared with:
await cluster.task(async ({ page, data: url }) => {
But how do I combine separate functions with that approach?
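For reference, a minimal sketch of combining the two styles, assuming puppeteer-cluster's documented cluster.task() and cluster.queue(data, taskFunction) overloads (the URLs are placeholders):

const { Cluster } = require('puppeteer-cluster');

(async () => {
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_CONTEXT,
    maxConcurrency: 10,
  });

  // Default task: runs for anything queued without an explicit function.
  await cluster.task(async ({ page, data: url }) => {
    await page.goto(url);
    // ... default scraping logic ...
  });

  // Named task: passed explicitly when queueing specific URLs.
  const getElementOnPage = async ({ page, data: url }) => {
    await page.goto(url);
    // ... page-specific logic ...
  };

  cluster.queue('https://example.com/list');                   // default task
  cluster.queue('https://example.com/item', getElementOnPage); // named task

  // Close the cluster only once, after all work has been queued;
  // anything queued after close() will never run.
  await cluster.idle();
  await cluster.close();
})();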

Estimate gas prices with Ethers JS

I am trying to make a simple JS bot that checks every block for ETH (or the chain's main token) and sends it to another wallet.
I have a working bot:
const { ethers } = require('ethers');

const provider = new ethers.providers.JsonRpcProvider("");
const addressReceiver = '';
const privateKeys = [""];

const bot = async () => {
  provider.on('block', async () => {
    console.log('Listening to new block, waiting ;)');
    for (let i = 0; i < privateKeys.length; i++) {
      const _target = new ethers.Wallet(privateKeys[i]);
      const target = _target.connect(provider);
      const balance = await provider.getBalance(target.address);
      const txBuffer = ethers.utils.parseEther('0.005');
      if (balance.sub(txBuffer) > 0) {
        console.log("New Account with Eth!");
        const amount = balance.sub(txBuffer);
        try {
          await target.sendTransaction({
            to: addressReceiver,
            value: amount
          });
          console.log(`Success! transferred -->${ethers.utils.formatEther(balance)}`);
        } catch (e) {
          console.log(`error: ${e}`);
        }
      }
    }
  });
};

bot();
But this has a fixed transaction buffer that ends up leaving some ETH in the wallet after the bot runs. I want to estimate the fees and then subtract those fees from the total taken out. Something like this:
const { ethers } = require('ethers');

const provider = new ethers.providers.JsonRpcProvider("");
const addressReceiver = '';
const privateKeys = [""];

const bot = async () => {
  provider.on('block', async () => {
    console.log('Listening to new block, waiting ;)');
    for (let i = 0; i < privateKeys.length; i++) {
      const _target = new ethers.Wallet(privateKeys[i]);
      const target = _target.connect(provider);
      const balance = await provider.getBalance(target.address);
      const gasLimit = await provider.estimateGas({
        to: addressReceiver,
        value: await provider.getBalance(target.address),
        gasLimit: 21000,
        gasPrice: ethers.utils.parseUnits('10', 'gwei'),
        nonce: await provider.getTransactionCount(privateKeys[i])
      });
      if (balance.sub(gasLimit) > 0) {
        console.log("New Account with Eth!");
        const amount = balance.sub(gasLimit);
        try {
          await target.sendTransaction({
            to: addressReceiver,
            value: amount
          });
          console.log(`Success! transferred -->${ethers.utils.formatEther(balance)}`);
        } catch (e) {
          console.log(`error: ${e}`);
        }
      }
    }
  });
};

bot();
But this throws an ENS name not configured error.
A few issues here:
"ENS name not configured" is probably because addressReceiver was an ENS name (something ending in .eth) rather than an address (starting with 0x). Use an address.
amount = balance.sub(gasLimit) is not right: gasLimit is an amount of gas units, not a value in ETH. You need to multiply it by the fee per gas to get ETH. The easiest way to know the exact fee in advance is to set tx.maxFeePerGas and tx.maxPriorityFeePerGas to the same value; that will cause you to overpay most of the time, but the fee becomes predictable. The new code will then be amount = balance.sub(tx.maxFeePerGas.mul(gasLimit)).
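A minimal sketch of that calculation, assuming ethers v5 and an EIP-1559 chain where provider.getFeeData() returns a non-null maxFeePerGas (balance, target, and addressReceiver come from the question's loop):

// Inside the per-wallet loop from the question:
const feeData = await provider.getFeeData();   // BigNumbers, in wei
const maxFeePerGas = feeData.maxFeePerGas;
const gasLimit = ethers.BigNumber.from(21000); // plain ETH transfer
const fee = maxFeePerGas.mul(gasLimit);        // worst-case fee in wei
const amount = balance.sub(fee);

if (amount.gt(0)) {
  await target.sendTransaction({
    to: addressReceiver, // must be a 0x... address, not an ENS name
    value: amount,
    gasLimit,
    maxFeePerGas,
    // Same value for both caps => the fee paid is exactly maxFeePerGas * gasLimit.
    maxPriorityFeePerGas: maxFeePerGas,
  });
}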

How to run a function which calls axios every 30 seconds

I'm creating a web scraper using Node, cheerio, and axios (async/await) to call the website. I want the function to run every 30 seconds. I tried using setTimeout and setInterval but did not get the expected result; instead I got a heap out of memory error. I want to run the mvcAppointmentSearch function in the while loop every 30 seconds. The code follows; I'm also attaching the CodePen link for better readability.
Code pen link
const express = require('express');
const request = require('request-promise');
const cheerio = require('cheerio');
const axios = require('axios');
const cssSelect = require('css-select');
const open = require('open');

// const mvcUrl = 'https://telegov.njportal.com/njmvc/AppointmentWizard/17/';
const mvcUrl = 'https://telegov.njportal.com/njmvc/AppointmentWizard/14/';
const mvcLocation = ['Edison', 'Rahway', 'SouthPlainfield'];
// const mvcLocationNumber = ['240', '252', '239'];
const mvcLocationNumber = ['163'];
const requiredMonths = ['September', 'October'];

const callUrl = async (url, locationNumberIndex) => {
  try {
    const response = await axios.get(url);
    // console.log('call url', response.data);
    getData(response.data, locationNumberIndex);
  } catch (err) {
    console.log(err);
  }
};

const mvcAppointmentSearch = () => {
  for (let i = 0; i < mvcLocationNumber.length; i++) {
    const currentUrl = mvcUrl + mvcLocationNumber[i];
    console.log(mvcLocationNumber[i]);
    callUrl(currentUrl, i);
  }
};

const getData = (html, locationNumberIndex) => {
  let data = [];
  let $ = cheerio.load(html);
  console.log('datais ', $);

  $.prototype.exists = function (selector) {
    return this.find(selector).length > 0;
  };

  const checkerLength = $('div').exists('.alert-danger');
  console.log(checkerLength);

  if (checkerLength) {
    console.log(
      `No appointment available in ${mvcLocation[locationNumberIndex]}`
    );
  } else {
    const dateString = $('.control-label').text();
    const availableMonth = dateString.trim().split(' ')[7];
    const exactDateAvailability = dateString.slice(24, -1);
    console.log(availableMonth);
    if (requiredMonths.includes(availableMonth)) {
      console.log('Hurray there is an appointment available');
      const message = `Appointment available for the location ${mvcLocation[locationNumberIndex]} on ${exactDateAvailability}`;
      open(`${mvcUrl + mvcLocationNumber[locationNumberIndex]}`);
      console.log(message);
    } else {
      console.log('required Month is not available still searching');
    }
  }
};

while (true) {
  try {
    // want to run the following function every 30 seconds
    mvcAppointmentSearch();
  } catch (err) {
    console.log(`Error has Occured ${err}`);
  }
}
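For comparison, a minimal sketch of the scheduling with setInterval, which yields to the event loop between runs instead of spinning (the 30-second figure matches the question):

// Run the search once every 30 seconds. The synchronous while (true) loop
// above never yields, so pending axios promises pile up and exhaust the heap.
setInterval(() => {
  try {
    mvcAppointmentSearch();
  } catch (err) {
    console.log(`Error has occurred: ${err}`);
  }
}, 30 * 1000);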

How to combine two json response and send it as a whole?

I am using the Zomato API, which sends only 20 names in one API call, and I want at least 60 responses, so I thought of calling the same API three times and combining all the responses together.
app.get('/locations/:query', async (req, res) => {
  const query = req.params.query;
  const data = await zomato.cities({ q: query, count: 1 });
  const cityId = await (data[0].id);
  const restaurants1 = await zomato.search({ entity_id: cityId, entity_type: 'city', start: 0, count: 20, sort: 'rating', order: 'desc' });
  const restaurants2 = await zomato.search({ entity_id: cityId, entity_type: 'city', start: 20, count: 40, sort: 'rating', order: 'desc' });
  const restaurants = Object.assign(restaurants1, ...restaurants2);
  res.send(restaurants);
});
As of now I have only tried going up to 40, but even this does not work. If there were another constant restaurants3 with start 40 and end 60, how would I merge these three and send them back?
I haven't worked with the Zomato API before, but according to their API documentation, you should be able to do this:
app.get('/locations/:query', async (req, res) => {
  const query = req.params.query;
  const data = await zomato.cities({ q: query, count: 1 });
  const cityId = data[0].id;
  const result = [];
  const nrOfRequests = 2; // change this accordingly to increase the number of requests
  let currCount = 0;
  const nrOfEntries = 20;
  for (let i = 0; i < nrOfRequests; i++) {
    const response = await zomato.search({ entity_id: cityId, entity_type: 'city', start: currCount, count: nrOfEntries, sort: 'rating', order: 'desc' });
    result.push(...response.restaurants);
    currCount += nrOfEntries;
  }
  res.send(result);
});
Basically, you should be able to loop for the desired number of requests by updating the start parameter on each iteration and then storing the resulting restaurants in result.
You shouldn't be spreading restaurants2 in your Object.assign(). Use one of the following:
const restaurants = Object.assign(restaurants1, restaurants2);
// or
const restaurants = { ...restaurants1, ...restaurants2 };
