how to display responses from a looped scrape (cheerio) - javascript

I am scraping this site to collect all rows with the year 2013, but there are 7 pages and I have my request in a loop. How can I display the results after all 7 responses have been received? If I simply try to console.log the rowTrack array, it displays empty because of the async nature of the code. Ideally I want to run the requests in the order of the loop, so that the results of the first page are the first elements of the array, etc.
var request = require("request"),
  cheerio = require("cheerio"),
  rowTrack = [];

for (var i = 1; i <= 7; i++) {
  var url = "http://www.boxofficemojo.com/alltime/world/?pagenum=" + i + "&p=.htm";
  request(url, function (error, response, body) {
    if (!error) {
      var $ = cheerio.load(body),
        rows = $('table table tr');
      rows.each(function (j, element) {
        var select = $(element.children).text().split('\r\n');
        select.shift();
        select.pop();
        if (select[select.length - 1] == "2013") {
          rowTrack.push(select);
        }
      });
    }
  });
}
How can I display the results?

The site you're scraping has changed a bit since the question was asked. The table is still there, but the URL and pagination are a bit different.
JS has moved on to promises and the request package is deprecated. Nowadays, with promises, you'd do:
const cheerio = require("cheerio"); // ^1.0.0-rc.12

const baseUrl =
  "https://www.boxofficemojo.com/chart/top_lifetime_gross/?area=XWW";

(async () => {
  const results = [];

  for (let i = 0; i < 6; i++) {
    const response = await fetch(`${baseUrl}&offset=${i * 100}`);
    const $ = cheerio.load(await response.text());
    results.push(...[...$("tr")]
      .map(e => [...$(e).find("td")].map(e => $(e).text()))
      .filter(e => e.at(-1) === "2013")
    );
  }

  console.log(results);
})();
The above code runs in series, but you can parallelize it with Promise.all:
const cheerio = require("cheerio");

const baseUrl =
  "https://www.boxofficemojo.com/chart/top_lifetime_gross/?area=XWW";

(async () => {
  const results = await Promise.all(
    [...Array(6)].map(async (_, i) => {
      const response = await fetch(`${baseUrl}&offset=${i * 100}`);
      const $ = cheerio.load(await response.text());
      return [...$("tr")]
        .map(e => [...$(e).find("td")].map(e => $(e).text()))
        .filter(e => e.at(-1) === "2013");
    })
  );

  console.log(results.flat());
})();
Node 18 has native fetch, but if you're stuck with a legacy situation without promises, you can store each result in an array and use a counter to determine how many requests have completed. When the last request resolves, trigger the next stage of processing.
const cheerio = require("cheerio");
const request = require("request"); // ^2.88.2

const getRankings = done => {
  const results = [];
  const total = 6;
  let completed = 0;
  const baseUrl =
    "https://www.boxofficemojo.com/chart/top_lifetime_gross/?area=XWW";

  for (let i = 0; i < total; i++) {
    request(`${baseUrl}&offset=${i * 100}`, function (error, response, body) {
      if (error) {
        console.error(error);
      }

      const $ = cheerio.load(body);
      results[i] = [...$("tr")]
        .map(e => [...$(e).find("td")].map(e => $(e).text()))
        .filter(e => e.at(-1) === "2013");

      if (++completed === total) {
        done(results.flat());
      }
    });
  }
};

getRankings(results => {
  console.log(results);
});
The above code runs all of the requests in parallel. To do the requests sequentially, you can chain the callbacks:
const cheerio = require("cheerio");
const request = require("request");

const getRankings = (done, results = [], total = 6, i = 0) => {
  const baseUrl =
    "https://www.boxofficemojo.com/chart/top_lifetime_gross/?area=XWW";

  request(`${baseUrl}&offset=${i * 100}`, (error, response, body) => {
    if (error) {
      console.error(error);
    }

    const $ = cheerio.load(body);
    results[i] = [...$("tr")]
      .map(e => [...$(e).find("td")].map(e => $(e).text()))
      .filter(e => e.at(-1) === "2013");

    if (i + 1 === total) {
      done(results.flat());
    }
    else {
      getRankings(done, results, total, i + 1);
    }
  });
};

getRankings(results => {
  console.log(results);
});
Error handling on failed requests is left as an exercise. I haven't bothered adapting modern JS idioms like .at(-1), .flat() and so forth to work on older Node versions. Cheerio's .toArray() can be used instead of spreads, .at(-1) can be recreated with roughly const last = a => a[a.length-1]; and .flat() can be [].concat(...results).
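For example, a rough sketch of what those substitutions might look like on an older Node version (the last/flatten helper names are just for illustration, not part of the original answer):

// Legacy-friendly stand-ins for the modern idioms used above,
// assuming $ = cheerio.load(body) as in the snippets above.
var last = function (a) { return a[a.length - 1]; };                      // instead of a.at(-1)
var flatten = function (arrays) { return [].concat.apply([], arrays); }; // instead of arrays.flat()

var rows = $("tr").toArray(); // instead of [...$("tr")]
var matches = rows
  .map(function (row) {
    return $(row).find("td").toArray().map(function (td) { return $(td).text(); });
  })
  .filter(function (cells) { return last(cells) === "2013"; });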

Related

iterating Javascript array and delete based on condition

I want to iterate through an array of words, look up the definition and delete the word if no definition is found.
My code looks as follows:
var words = ["word1", "word2", "word3",]
function Meaning(words){
const getMeaning = async () => {
const response = await fetch(`https://api.dictionaryapi.dev/api/v2/entries/en/${words}`)
const myJson = await response.json()
for(i = 0; i < words.length; ++i) {
if(!response[i]){
myJson.splice(i,1)
console.log(myJson)
}
}}
This is not really doing anything atm. Where am I going wrong?
Edit, to add context:
I tried it like this as well:
for(i = 0; i < words.length; ++i)
fetch(`https://api.dictionaryapi.dev/api/v2/entries/en/${words[i]}`).then((response) => {
if (response === 404) {
let response = words
words[i].splice(i,1)
console.log(response)
}
throw new Error('Something went wrong');
})
.then((responseJson) => {
let response = words
response[i].splice(i,1)
})
.catch((error) => {
console.log(error)
});
I can print out the 404 error when it finds no definition, but I can't remove it from the words array
After a quick look at the API, it appears to handle only single words, so the caller needs to make the requests one at a time. Here's how to do it:
const baseUrl = 'https://api.dictionaryapi.dev/api/v2/entries/en/';

// one word lookup. resolve to an array of definitions
async function lookupWord(word) {
  const res = await fetch(baseUrl + word);
  return res.json();
}

// resolve to a bool, true if the word is in the corpus
async function spellCheck(word) {
  const defArray = await lookupWord(word);
  return Array.isArray(defArray) && defArray.length > 0;
}

// create a spellCheck promise for every word and resolve with the results
// note, this mutates the array and resolves to undefined
async function spellCheckWords(array) {
  const checks = await Promise.all(array.map(spellCheck));
  for (let i = array.length - 1; i >= 0; i--) {
    if (!checks[i]) array.splice(i, 1);
  }
}

// test it (a little)
let array = ['hello', 'whereforeartthou', 'coffee'];
spellCheckWords(array).then(() => {
  console.log(array)
})
Try this code; you need to check every single element of the array against the response:
var words = ["word1", "word2", "word3"];

function Meaning(words) {
  const getMeaning = async () => {
    const response = await fetch(`https://api.dictionaryapi.dev/api/v2/entries/en/${words}`)
    const myJson = await response.json()
    let result = [];
    myJson.forEach(element => {
      if (words.includes(element)) {
        result.push(element)
      }
    });
    return result;
  }
  return getMeaning();
}

Why is my for loop not working how I expect to? Run function twice - JavaScript

So guys, I've got a scraping function where I create an object of scraped data. The scraper code is:
const axios = require('axios');
const cheerio = require('cheerio');
const db = require('../config/db.config');
const Article = db.article;
const prices = new Array();
const ids = new Array();
const descs = new Array();
const links = new Array();

for (p = 1; p < 3; p++) {
  function again() {
    const url = `https://www.olx.ba/pretraga?vrsta=samoprodaja&kategorija=23&sort_order=desc&kanton=9&sacijenom=sacijenom&stranica=${p}`;
    axios
      .get(url)
      .then((response) => {
        let $ = cheerio.load(response.data);
        $('div[class="naslov"] > a').each((i, el) => {
          const id = $(el).attr('href'); // ID, description and link are in the same div class
          const desc = id;
          const link = id;
          descs.push(desc.substring(36)); //Retriving description with substring and push into array
          ids.push(id.substring(27, 35)); //Retriving id with substring and push into array
          links.push(link); //Retriving link and push into array
          for (var i = 0; i < descs.length; i++) {
            descs[i] = descs[i].replace('/', '').replace('-', ' ');
          }
        });
        $('div[class="datum"] > span').each((i, el) => {
          $('span[class="prekrizenacijena"]').remove();
          const price = $(el).text();
          prices.push(price); //Retriving price and push into array
        });
        for (var i = prices.length - 1; i >= 0; i--) {
          if (prices[i] === 'PO DOGOVORU') {
            prices.splice(i, 1);
          }
        }
        async function asy() {
          const sqm = new Array();
          for (k = 0; k < links.length; k++) {
            const res = await axios
              .get(`${links[k]}`)
              .then((result) => {
                let $ = cheerio.load(result.data);
                const pr = $('div[class="df2 "]').first().text();
                sqm.push(pr);
                for (var i = 0; i < sqm.length; i++) {
                  sqm[i] = sqm[i].replace('m2', '');
                }
              })
              .catch((err) => {
                //handle error
                console.log(err);
              });
          }
          const object = ids.map((element, index) => {
            const ppm2 =
              parseFloat(
                prices[index].replace(/\.| ?€$/g, '').replace(',', '.')
              ) / parseFloat(sqm[index]);
            const ppm2final = Math.round(ppm2);
            return {
              id: element,
              price: prices[index],
              descr: descs[index],
              link: links[index],
              sqm: sqm[index],
              ppm2: ppm2final + ' KM',
            };
          });
          console.log(object);
          console.log(Object.keys(object).length);
          /*const ins = await Article.bulkCreate(object)
            .then(console.log('Data added to DB'))
            .catch((err) => console.log(err));*/
        }
        asy();
      })
      .catch((e) => {
        console.log(e);
      });
  }
  again();
}
Now, when I delete the first for loop and the again() function and insert e.g. 1, 2, 3 in the URL instead of ${p}, it works perfectly - sqm is fetched for the correct link.
Now the problem:
I want to run this URL multiple times because ${p} is the page number in that URL. The problems I get:
sqm isn't correct - the sqm data is scattered all over the object and doesn't match its link (it's correct when I don't use ${p}).
The first time I get sqm data (but not the correct data for that link); when the function runs a second time (for the second page, ${p} = 2), sqm isn't fetched at all (it gives NaN).
I also have console.log(Object.keys(object).length); where I expect 30 the first time and 60 after the second run (each page contains 30 articles), but I get 60, then 60 again.
I've tried many things: async functions, awaiting axios, etc., but nothing really works - sometimes I get only 30 articles, sometimes 60 but with incorrect values.
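For what it's worth, a minimal sketch of one way to keep the pages aligned (the selectors and substring offsets are taken from the question above; the restructuring itself is an assumption, and only the listing part is shown): keep the arrays local to a single async function and await each page in order, so nothing is shared between runs.

const axios = require('axios');
const cheerio = require('cheerio');

// Sketch only: one per-run local array instead of module-level arrays,
// and pages awaited sequentially so page 2 can't interleave with page 1.
async function scrapePages() {
  const articles = [];
  for (let p = 1; p < 3; p++) {
    const url = `https://www.olx.ba/pretraga?vrsta=samoprodaja&kategorija=23&sort_order=desc&kanton=9&sacijenom=sacijenom&stranica=${p}`;
    const { data } = await axios.get(url);
    const $ = cheerio.load(data);
    $('div[class="naslov"] > a').each((i, el) => {
      const href = $(el).attr('href');
      articles.push({
        id: href.substring(27, 35),
        descr: href.substring(36).replace('/', '').replace('-', ' '),
        link: href,
      });
    });
  }
  return articles;
}

scrapePages().then((list) => console.log(list.length, 'articles'));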

Loop async scraping

I want to make a personal scraper from Wikipedia links I have stored in an array, but I have some problems with async JavaScript; I guess I just don't grasp the whole async concept well enough.
Basically, on every iteration I want axios.get to use the next link from the data array, go through the whole scraping process, and when that finishes, take the next link in the array and repeat the whole process until it has looped through all the links.
I know how to do it in a synchronous way, but this async way is just another universe for me.
const cheerio = require('cheerio');
const axios = require('axios');
var fs = require('fs');

const data = ["a","b","c","d","e"];

for (let i = 0; i < data.length; i++) {
  let link = data[i];
  axios.get(link).then((res) => {
    let $ = cheerio.load(res.data);
    $('div.class.xyz').each(element => {
      let post = $(element).text();
      console.log(post);
      let input = '{' + JSON.stringify(post) + '},' + '\n\n';
      fs.appendFileSync('info.json', input);
      // console.log('Saved!');
      // console.log(index);
    })
  })
}
Try this:
const cheerio = require('cheerio');
const axios = require('axios');
const fs = require('fs');

(async () => {
  const data = ["a", "b", "c", "d", "e"];
  for (let i = 0; i < data.length; i++) {
    let link = data[i];
    const res = await axios.get(link);
    let $ = cheerio.load(res.data);
    // cheerio's .each passes (index, element)
    $('div.class.xyz').each((index, element) => {
      let post = $(element).text();
      console.log(post);
      let input = '{' + JSON.stringify(post) + '},' + '\n\n';
      // synchronous write, since await can't be used inside this non-async callback
      fs.appendFileSync('info.json', input);
      // console.log('Saved!');
      // console.log(index);
    });
  }
})();

execute axios request multiple times only get one output

i have the following code:
for (var i = 0; i < subscriptions.length; i++) {
  axios.get(`SOME_URL/${subscriptions[i]}`, config1)
    .then((result) => {
      return result.data.subscriptionId
    })
    .then((result) => {
      axios.get(`SOME_URL/${result}/devices`, config2)
        .then((data) => {
          activationCodeAndDevice[data.data.devices[0].id] = result
          return activationCodeAndDevice
        })
        .then((result) => {
          console.log(result);
        })
        .catch((err) => {
          console.log(err);
        })
    })
    .catch((err) => {
      console.log(err);
    })
}
Now console.log(result) will print as many times as the for loop runs. What is your suggestion for only printing the result once, when all the Axios requests are done executing?
Now, I did build something like this, but with 3 different requests:
var request1 = axios.get('request1')
var request2 = axios.get('request2')
var request3 = axios.get('request3')
Promise.allSettled([request1, request2, request3])
.then((values) => {
//do something
})
.catch((err) => {
//do something
})
I have no idea how to do this for a request that I have to perform multiple times, depending on an array of values, while only getting the output once. Of course, I could write it to a file and just look at the file once the data is written, but I want to have it on the console.
Thanks.
Do something like:
const promises = subscriptions.map(subscription => axios.get(`SOME_URL/${subscription}`, config1))
Promise.allSettled(promises).then((values) => {
...
})
The map converts the subscriptions into an array of promises, and Promise.allSettled waits for all of them to settle.
If you really insist on using a for loop, you can also do the following:
let promises = [];
for (let i = 0; i < subscriptions.length; i++) {
  promises.push(axios.get(`SOME_URL/${subscriptions[i]}`, config1));
}
Promise.allSettled(promises).then((values) => {
  ...
})
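Either way, keep in mind that Promise.allSettled never rejects; each entry in values is an object with a status plus a value or reason, so a defensive sketch of reading the responses (the handling here is illustrative) would be:

Promise.allSettled(promises).then((values) => {
  const subscriptionIds = values
    .filter((v) => v.status === 'fulfilled')  // skip any failed requests
    .map((v) => v.value.data.subscriptionId); // axios puts the response body on .data
  console.log(subscriptionIds);
});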
I managed to solve it like this:
const getSubscriptionResponse = subscriptions.map(subscription => axios.get(`SOME_URL/${subscription}`, config1))

Promise.allSettled(getSubscriptionResponse).then((values) => {
  var subscriptionsValue = []
  for (let i = 0; i < values.length; i++) {
    subscriptionsValue.push(values[i].value.data.subscriptionId);
  }
  const getDeviceResponse = subscriptionsValue.map(sub => axios.get(`SOME_URL/${sub}/devices`, config2))
  Promise.allSettled(getDeviceResponse).then((res) => {
    var finalResults = {}
    for (let j = 0; j < res.length; j++) {
      finalResults[res[j].value.data.devices[0].id] = subscriptionsValue[j]
    }
    console.log(finalResults);
  })
})
This will only print finalResults once:
{
A:A,
B:B,
C:C,
.
.
.
}
It looks a bit messy with all the for loops, but I actually need to get the data from the array of responses.
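As a rough sketch only (same placeholder URLs and configs as above, and assuming it runs inside an async function), the index loops could also be replaced with maps over the settled results:

// Illustrative rewrite of the same two-stage flow, not a drop-in replacement.
const subscriptionIds = (await Promise.allSettled(
  subscriptions.map((s) => axios.get(`SOME_URL/${s}`, config1))
)).map((r) => r.value.data.subscriptionId);

const deviceIds = (await Promise.allSettled(
  subscriptionIds.map((id) => axios.get(`SOME_URL/${id}/devices`, config2))
)).map((r) => r.value.data.devices[0].id);

// pair each device id with its subscription id
const finalResults = Object.fromEntries(
  deviceIds.map((deviceId, i) => [deviceId, subscriptionIds[i]])
);
console.log(finalResults);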

Run concurrent HTTP requests in an async function

I am working on a project that needs an async function that's roughly equivalent to the following
async function task(url) {
  var r1 = await fetch(url).then(resp => resp.text());
  var r2 = await fetch(url + "/" + r1).then(resp => resp.json());
  //r2 is an array of urls
  var total = 0;
  for (var u of r2) {
    tmp = await fetch(u).then(resp => resp.text());
    total += parseInt(tmp)
  }
  return total
}
The issue is that there are hundreds of elements in r2, and each element is a URL. If I do it sequentially, this function will take a loooong time to complete. I would like to run 10 URLs concurrently (the number could be adjusted), and I wonder how I would rewrite the async function.
Chunk the initial array into pieces of 10, then wait for each chunk to complete with Promise.all before starting the next one:
async function getTotal(subArr) {
  const resps = await Promise.all(subArr.map(url =>
    fetch(url).then(resp => resp.json())
  ));
  return resps.reduce((a, b) => a + b);
}

async function task(url) {
  const r1 = await fetch(url).then(resp => resp.text());
  const r2 = await fetch(url + "/" + r1).then(resp => resp.json());
  const chunks = [];
  const { length } = r2;
  for (let i = 0; i < length; i += 10) {
    chunks.push(r2.slice(i, i + 10));
  }
  let total = 0;
  for (const subArr of chunks) {
    total += await getTotal(subArr);
  }
  return total;
}
Here's some code I created years ago that allows you to create a "parallel" queue
const makeQueue = length => {
  length = (isNaN(length) || length < 1) ? 1 : length;
  const q = Array.from({ length }, () => Promise.resolve());
  let index = 0;
  const add = cb => {
    index = (index + 1) % length;
    return (q[index] = q[index].then(() => cb()));
  };
  return add;
};
This will allow up to 10 simultaneous requests (or whatever you pass in as the argument)
In your code, I guess you could use it like
async function task(url) {
  const q = makeQueue(10); // 10 requests at a time
  var r1 = await fetch(url).then(resp => resp.text());
  var r2 = await fetch(url + "/" + r1).then(resp => resp.json());
  return Promise.all(r2.map(u => q(() => fetch(u).then(resp => resp.text()))))
    .then(v => v.map(s => parseInt(s, 10)).reduce((a, b) => a + b));
}
The return can also be:
return Promise.all(r2.map(u => q(() => fetch(u).then(resp => resp.text()).then(parseInt)))).then(v => v.reduce((a, b) => a+b));
Broken down, that is equivalent to:
const fetch1 = u => fetch(u).then(resp => resp.text()).then(parseInt);
const promises = r2.map(u => q(() => fetch1(u)));
return Promise.all(promises).then(v => v.reduce((a, b) => a+b));
The benefit of this method is that there should be 10 requests "on the go" for as much of the time as possible.
Note, browsers tend to limit the number of simultaneous requests per host, so you may not see any improvement with queue size greater than 6 (I think that's the most common limit)
I appreciate all the good answers here! I studied them and came up with the following solution, which I think is slightly simpler (for many of us beginners) :-)
This solution doesn't divide all the url-fetching jobs up front, because it's uncertain how much time each fetch will take.
Instead, it makes each worker go through all the urls; if a url is already assigned to another worker, it just moves on to the next one.
var tasks;
var total = 0;
var gId = 0;
var workerId = 0;

manager(4);

async function manager(numOfWorkers) {
  var workers = [];
  tasks = r2.map(function (u) { return { id: gId++, assigned: -1, url: u }; });
  for (var i = 0; i < numOfWorkers; i++) { workers.push(worker()); }
  await Promise.all(workers);
  console.log(total);
}

async function worker() {
  var wid = workerId; workerId++;
  var tmp;
  for (var u of tasks) {
    if (u.assigned == -1) {
      u.assigned = wid;
      console.log("unit " + u.id + " assigned to " + wid);
      tmp = await fetch(u.url).then(r => r.text());
      total += parseInt(tmp);
    }
  }
}
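For context, a rough sketch of how the same idea could be packaged so the asker's task() can call it (the parameterized signature here is an assumption, not part of the snippet above):

// Hypothetical packaging: workers pull from a shared task list until it's drained.
async function sumWithWorkers(urls, numOfWorkers = 4) {
  const tasks = urls.map((u, id) => ({ id, assigned: -1, url: u }));
  let total = 0;

  async function worker(wid) {
    for (const t of tasks) {
      if (t.assigned === -1) {
        t.assigned = wid; // claim synchronously before awaiting, so no other worker takes it
        const tmp = await fetch(t.url).then((r) => r.text());
        total += parseInt(tmp, 10);
      }
    }
  }

  await Promise.all(Array.from({ length: numOfWorkers }, (_, i) => worker(i)));
  return total;
}

// inside the asker's task(): const total = await sumWithWorkers(r2, 4);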
In short, ditch the await. By using await, you are literally telling it to wait here until it is done with this one thing.
If you want to parallelize them, make use of Promise.all(). Any async function returns a Promise which can still be used like a normal Promise. Promise.all() accepts an array of Promise objects, and will call then() once all of those requests are done, giving you an array of the results from each.
You could do something like this:
const urls = [/* bunch of URLs */];

Promise.all(
  urls.map(url =>
    fetch(url).then(res => res.text())
  )
).then(results => /* do something with results */)
In this case, results will be an array of the results from your various requests, in the same order as they were passed in.
Now, if you want to be able to have a specific number of them running at a time, you'd want to change it up a bit and have some limits on what's going on.
I usually use a technique which just uses a simple counter to keep track of how many are active, and then fires off more when they are done.
You can do something like this:
// dummy fetch for example purposes, resolves between .2 and 3 seconds
const fakeFetch = url => new Promise(resolve => setTimeout(() => resolve(url), Math.random() * 2800 + 200));

const inputUrls = ['a', 'b', 'c', 'd', 'e', 'f', 'g'];
const limit = 2; // this sets the limit of how many can run at once, set to 10 to run 10 concurrently
const delay = 100; // delay in ms between each batch starting

function fetchAll(urls) {
  let active = 0;
  let queue = urls.slice(0); // clone urls

  // inner function so urls and results can be shared with all calls
  function fetchAllInner() {
    if (active < limit && queue.length) {
      const count = Math.min(limit - active, queue.length);
      const urlsThisBatch = queue.slice(0, count);
      queue = queue.slice(count); // remaining

      return Promise.all(
        urlsThisBatch.map(url => {
          active++; // increment active
          console.log('start', url);
          return fakeFetch(url)
            .then(r => {
              console.log('done', url);
              active--; // decrement active
              return new Promise(resolve => // new Promise to promisify setTimeout
                setTimeout(() =>
                  resolve(fetchAllInner() // kicks off run again when one finishes
                    .then(fetchR => [].concat(r, fetchR)) // combine them
                  ), delay
                )
              );
            })
        })
      ).then(r => r.reduce((a, u) => [].concat(u, a), [])); // flatten from Promise.all()
    }

    return Promise.resolve([]); // final resolve
  }

  return fetchAllInner();
}

fetchAll(inputUrls)
  .then(results => console.log('all done', results));
In a nutshell, what this is doing is it'll create a Promise.all() for a batch (however many we can start up until we hit our limit). Then, when one finishes, it'll set a timeout to start up another batch by recursively calling the same function. It's wrapped in another function simply to avoid having to have some variables be global.
This also has an added delay if you want, so you can throttle how many requests you'll make and not hammer the system too bad. If you don't want to use a delay, you can just set it to 0 or remove the new Promise(resolve => setTimeout bit.
The above version is a bit verbose to make it easier to understand. Here is a more "production-ready" version (be sure to switch fakeFetch to fetch and handle calling res.text())
const fakeFetch = url => new Promise(resolve => setTimeout(() => resolve(url), Math.random() * 2800 + 200));

function fetchAll(urls, limit = 10, delay = 200) {
  let active = 0;
  const queue = urls.slice(0); // clone urls so the caller's array isn't consumed

  function fetchAllInner() {
    if (active >= limit || !queue.length) {
      return Promise.resolve([]);
    }
    const count = Math.min(limit - active, queue.length);
    active = limit;
    return Promise.all(
      queue.splice(0, count)
        .map(url => fakeFetch(url)
          .then(r => {
            active--;
            return new Promise(resolve =>
              setTimeout(() => resolve(
                fetchAllInner().then(fetchR => [].concat(r, fetchR))
              ), delay)
            );
          })
        )
    ).then(r =>
      r.reduce((a, u) => [].concat(u, a), []));
  }

  return fetchAllInner();
}

console.log('give it a few seconds');
fetchAll(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
  .then(r => console.log('all done', r))
