Loop async scraping - javascript

I want to make my personal scraper from wikipedia links I have stored in array but have some problem with async javascript, guess I just don't realize whole async concept enough.
So basically, on every iteration I want my axios.get link to be next thing from data array, and go through whole scraping process, and when it ends it take next link in array and repeat whole process until it loops all array links stated.
I know how to do it in a synchronous way, but this async way is just another universe for me.
const cheerio = require('cheerio');
const axios = require('axios');
var fs = require('fs');
const data = ["a","b","c","d","e"];
for(let i = 0; i < data.length; i++){
let link = data[i];
axios.get(link).then((res) => {
let $ = cheerio.load(res.data);
$('div.class.xyz').each(element => {
let post = $(element).text();
console.log(post);
let input = '{' + JSON.stringify(post) + '},' + '\n\n';
fs.appendFileSync('info.json', input);
// console.log('Saved!');
// console.log(index);
})
})
}

Try this:
const cheerio = require('cheerio');
const axios = require('axios');
const fs = require('fs');
(async () => {
const data = ["a","b","c","d","e"];
for(let i = 0; i < data.length; i++){
let link = data[i];
const res = await axios.get(link);
let $ = cheerio.load(res.data);
$('div.class.xyz').each(element => {
let post = $(element).text();
console.log(post);
let input = '{' + JSON.stringify(post) + '},' + '\n\n';
await fs.appendFile('info.json', input);
// console.log('Saved!');
// console.log(index);
});
}
})();

Related

How to write array data to json?

How to write the data that the loop passes to the array "eventsPolygon", to json. This array returns 4 args. With this method, I get the error "TypeError [ERR_INVALID_CALLBACK]: Callback must be a function. Received undefined"
async function main() {
console.log("Start checking rewards")
const currentBlockNumberPolygon = await maticProvider.getBlockNumber() - 1
const currentBlockNumberBsc = await bscProvider.getBlockNumber() - 1
const oldestBlockNumberPolygon = 22939848
const oldestBlockNumberBsc = 13763979
const eventFilterPolygon = Missions.filters.RewardToPay()
const eventFilterBsc = Rewards.filters.RewardPayed()
let eventsPolygon = []
let eventsBsc = []
for (let i = oldestBlockNumberPolygon; i < currentBlockNumberPolygon - 10000; i += 10000) {
const eventsLoop = await Missions.queryFilter(eventFilterPolygon, i, i + 10000)
eventsPolygon = eventsPolygon.concat(eventsLoop)
const jsonData = JSON.stringify(eventsPolygon);
fs.writeFile('eventsBsc.json', jsonData.finished)
console.log(i)
}
//for(let i = oldestBlockNumberBsc; i < currentBlockNumberBsc-10000; i+=10000) {
//const eventsLoop = await Rewards.queryFilter(eventFilterBsc, i, i+10000)
// eventsBsc = eventsBsc.concat(eventsLoop)
//console.log(i)
//}
console.log('a')
}
JSON.stringify returns a string representation of your JSON. So you cannot do this:
fs.writeFile('eventsBsc.json', jsonData.finished)
Simply write jsonData to the file:
await fs.writeFile('eventsBsc.json', jsonData);
Be aware that this function is async. You need to await it

Trying to use replace() method on files by passing the string not working - JS

I'm trying to replace a string, but when i try to do it on a file, it does not work. Or only the last file will work.
Main Code (inside a class method):
const funcs = fs.readdirSync(path.join(__dirname, "../funcs")).filter(file => file.endsWith('.js'));
this.functions = this.code.split("$");
const functions = this.functions
for (const func of funcs) {
for (let x = functions.length - 1; x > 0; x--) {
let i = 0;
const res = await require(`../funcs/${func}`)(client, this.code.toString(), this._author);
console.log(x);
console.log(res);
console.log(functions);
return res;
i++;
}
}
ping.js:
const ping = (client, code, author) => {
const result = code.split("$[ping]").join(client.ws.ping)
return result;
}
module.exports = ping;
messageAuthorTag.js:
const Util = require('../utils/util');
const messageAuthorTag = (client, code, author) => {
if (code === null) return;
const res = code.split("$[message.author.tag]").join(`${author.tag}`);
return res;
}
module.exports = messageAuthorTag;
Using it : "ping: $[ping] author tag: $[message.author.tag]"
Output:
ping: $[ping] author tag: Example#0001
Your issue is that you return res in the middle of a for loop. This means it won't do anything else in the function and will skip the rest of the loops (like running the other functions!)
As an additional note, you could also remove the let i = 0 and i++ since that doesn't seem to be used for anything.

Why is my for loop not working how I expect to? Run function twice - JavaScript

So guys, I've got scraping function, where I create object of scraped data. Code of scraper is:
const axios = require('axios');
const cheerio = require('cheerio');
const db = require('../config/db.config');
const Article = db.article;
const prices = new Array();
const ids = new Array();
const descs = new Array();
const links = new Array();
for (p = 1; p < 3; p++) {
function again() {
const url = `https://www.olx.ba/pretraga?vrsta=samoprodaja&kategorija=23&sort_order=desc&kanton=9&sacijenom=sacijenom&stranica=${p}`;
axios
.get(url)
.then((response) => {
let $ = cheerio.load(response.data);
$('div[class="naslov"] > a').each((i, el) => {
const id = $(el).attr('href'); // ID, description and link are in the same div class
const desc = id;
const link = id;
descs.push(desc.substring(36)); //Retriving description with substring and push into array
ids.push(id.substring(27, 35)); //Retriving id with substring and push into array
links.push(link); //Retriving link and push into array
for (var i = 0; i < descs.length; i++) {
descs[i] = descs[i].replace('/', '').replace('-', ' ');
}
});
$('div[class="datum"] > span').each((i, el) => {
$('span[class="prekrizenacijena"]').remove();
const price = $(el).text();
prices.push(price); //Retriving price and push into array
});
for (var i = prices.length - 1; i >= 0; i--) {
if (prices[i] === 'PO DOGOVORU') {
prices.splice(i, 1);
}
}
async function asy() {
const sqm = new Array();
for (k = 0; k < links.length; k++) {
const res = await axios
.get(`${links[k]}`)
.then((result) => {
let $ = cheerio.load(result.data);
const pr = $('div[class="df2 "]').first().text();
sqm.push(pr);
for (var i = 0; i < sqm.length; i++) {
sqm[i] = sqm[i].replace('m2', '');
}
})
.catch((err) => {
//handle error
console.log(err);
});
}
const object = ids.map((element, index) => {
const ppm2 =
parseFloat(
prices[index].replace(/\.| ?€$/g, '').replace(',', '.')
) / parseFloat(sqm[index]);
const ppm2final = Math.round(ppm2);
return {
id: element,
price: prices[index],
descr: descs[index],
link: links[index],
sqm: sqm[index],
ppm2: ppm2final + ' KM',
};
});
console.log(object);
console.log(Object.keys(object).length);
/*const ins = await Article.bulkCreate(object)
.then(console.log('Data added to DB'))
.catch((err) => console.log(err));*/
}
asy();
})
.catch((e) => {
console.log(e);
});
}
again();
}
Now when I delete first for lop and function again() and instead of ${p} in url insert eg. 1,2,3 etc. it's working perfect - sqm is fetched for correct link.
Now the problem:
I want to run this url multiple times because ${p} is number of page on that url. Now first problem I got:
sqm isn't correct - sqm data is thrown all over the object and isn't correct for that link.(it's correct when I don't use ${p}
First time i get sqm data(but not correct for that link), when function needs to get ran second time (for second page - to ${p}=2) - sqm isn't fetched at all (it throws NaN).
Also I've got console.log(Object.keys(object).length); where I expect first time to be 30, then after is runned second time to I get 60.(each page contains 30 articles), but I get 60, then again 60.
I've tried with many things: async functions, putting axios to await etc. but nothing really work - sometimes I get only 30 articles, sometimes 60 but with incorrect values.

Can't assign value to a variable from array

I'm creating a simple project in JavaScript and can't seem to figure out why this is not working for me. I created a function that fetches an API to get a random country name, and then push that country name in to an empty array, but I can't seem to figure out how to assign value to my variable from that array, probably missing something easy here.
getRandomWord();
getRandomWord();
getRandomWord();
getRandomWord();
const words = [];
let selectedWord = words[Math.floor(Math.random() * words.length)];
console.log(words);
console.log(selectedWord);
// Fetch some random words
async function getRandomWord() {
const res = await fetch('https://randomuser.me/api');
const data = await res.json();
const randomWord = data.results[0].location.country;
words.push(randomWord);
}
I need to assign one random country name from words array to selectedWord but it throws undefined all the time, although I see 4 different country names in words array at locations from 0 to 3. Can someone explain this to me or maybe have even a better approach for this. Thanks !
Ok , so i changed code a bit.
async function getRandomWord() {
try {
const res = await fetch('https://randomuser.me/api');
const data = await res.json();
const randomWord = data.results[0].location.country;
return randomWord.toUpperCase();
// words.push(randomWord.toUpperCase());
} catch (e) {
console.log(
'There has been a problem with your fetch operation: ' + e.message
);
}
}
And then just called
(async () => {
selectedWord = await getRandomWord();
displayWord();
})();
Can't believe i did this :D Thanks.
The fetches have not resolved and words[] has not been populated when selectedWords = executes. Using Promise.all and then:
const p = Promise.all([
getRandomWord(),
getRandomWord(),
getRandomWord(),
getRandomWord()
]);
const words = [];
p.then(()=>{
const selectedWord = words[Math.floor(Math.random() * words.length)];
console.log(words);
console.log(selectedWord);
});
// Fetch some random words
async function getRandomWord() {
const res = await fetch('https://randomuser.me/api');
const data = await res.json();
const randomWord = data.results[0].location.country;
words.push(randomWord);
}
Note: You will want to use Promise.allSettled if you don't want the fail fast behavior, where any failure will cause it to return immediately. Also notice I'm using Promise.all so that it will run the fetches concurrently instead of waiting on them sequentially.
async/await instead of then
(async()=>{
let words = [];
for(var i = 0; i < 4; i++)
words.push(getRandomWord());
words = await Promise.all(words);
const selectedWord = words[Math.floor(Math.random() * words.length)];
console.log(words);
console.log(selectedWord);
})()
// Fetch some random words
async function getRandomWord() {
const res = await fetch('https://randomuser.me/api');
const data = await res.json();
const randomWord = data.results[0].location.country;
return randomWord;
}

how to display responses from a looped scrape (cheerio)

I am scraping this site to collect all rows with the year 2013, but there are 7 pages and I have my request in a loop. How can I display the results after all 7 responses have been received? If I simply try to console.log the rowTrack array, it displays empty because of the async nature of the code. Ideally I want to run the requests in order of the loop so that the results of the first page are the first elements of the array etc..
var request = require("request"),
cheerio = require("cheerio"),
rowTrack = [];
for (var i = 1; i <= 7; i++) {
var url = "http://www.boxofficemojo.com/alltime/world/?pagenum=" + i + "&p=.htm";
request(url, function(error, response, body) {
if (!error) {
var $ = cheerio.load(body),
rows = $('table table tr');
rows.each(function(j, element) {
var select = $(element.children).text().split('\r\n')
select.shift();
select.pop();
if (select[select.length - 1] == "2013") {
rowTrack.push(select);
}
});
}
});}
How can I display the results?
The site you're scraping has changed a bit since the question was asked. The table is still there, but the URL and pagination are a bit different.
JS has moved on to promises and the requests package is deprecated. Nowadays, with promises, you'd do:
const cheerio = require("cheerio"); // ^1.0.0-rc.12
const baseUrl =
"https://www.boxofficemojo.com/chart/top_lifetime_gross/?area=XWW";
(async () => {
const results = [];
for (let i = 0; i < 6; i++) {
const response = await fetch(`${baseUrl}&offset=${i * 100}`);
const $ = cheerio.load(await response.text());
results.push(...[...$("tr")]
.map(e => [...$(e).find("td")].map(e => $(e).text()))
.filter(e => e.at(-1) === "2013")
);
}
console.log(results);
})();
The above code runs in series, but you can parallelize it with Promise.all:
const cheerio = require("cheerio");
const baseUrl =
"https://www.boxofficemojo.com/chart/top_lifetime_gross/?area=XWW";
(async () => {
const results = await Promise.all(
[...Array(6)].map(async (_, i) => {
const response = await fetch(`${baseUrl}&offset=${i * 100}`);
const $ = cheerio.load(await response.text());
return [...$("tr")]
.map(e => [...$(e).find("td")].map(e => $(e).text()))
.filter(e => e.at(-1) === "2013");
})
);
console.log(results.flat());
})();
Node 18 has native fetch, but if you're stuck with a legacy situation without promises, you can store each result in an array and use a counter to determine how many requests have completed. When the last request resolves, trigger the next stage of processing.
const cheerio = require("cheerio");
const request = require("request"); // ^2.88.2
const getRankings = done => {
const results = [];
const total = 6;
let completed = 0;
const baseUrl =
"https://www.boxofficemojo.com/chart/top_lifetime_gross/?area=XWW";
for (let i = 0; i < total; i++) {
request(`${baseUrl}&offset=${i * 100}`, function (error, response, body) {
if (error) {
console.error(err);
}
const $ = cheerio.load(body);
results[i] = [...$("tr")]
.map(e => [...$(e).find("td")].map(e => $(e).text()))
.filter(e => e.at(-1) === "2013");
if (++completed === total) {
done(results.flat());
}
});
}
};
getRankings(results => {
console.log(results);
});
The above code runs all of the requests in parallel. To do the requests sequentially, you can chain the callbacks:
const cheerio = require("cheerio");
const request = require("request");
const getRankings = (done, results=[], total=6, i=0) => {
const baseUrl =
"https://www.boxofficemojo.com/chart/top_lifetime_gross/?area=XWW";
request(`${baseUrl}&offset=${i * 100}`, (error, response, body) => {
if (error) {
console.error(err);
}
const $ = cheerio.load(body);
results[i] = [...$("tr")]
.map(e => [...$(e).find("td")].map(e => $(e).text()))
.filter(e => e.at(-1) === "2013");
if (i + 1 === total) {
done(results.flat());
}
else {
getRankings(done, results, total, i + 1);
}
});
}
getRankings(results => {
console.log(results);
});
Error handling on failed requests is left as an exercise. I haven't bothered adapting modern JS idioms like .at(-1), .flat() and so forth to work on older Node versions. Cheerio's .toArray() can be used instead of spreads, .at(-1) can be recreated with roughly const last = a => a[a.length-1]; and .flat() can be [].concat(...results).

Categories

Resources