Making a web crawler loop over pages - JavaScript

I tried to give my web crawler a loop so it crawls the pages from 1 to around 500, but the result doesn't include any of the data I targeted; it only returns an empty array.
This code uses cheerio and axios with jQuery-style selectors, in JavaScript.
const axios = require("axios");
const cheerio = require("cheerio");
const log = console.log;

const getHtml = async () => {
  var i = 0;
  while (i < 493) {
    try {
      return await axios.get("https://playentry.org/ds#!/qna?sort=created&rows=20&page=" + i);
    } catch (error) {
      console.error(error);
    }
  }
};

getHtml()
  .then(html => {
    let ulList = [];
    const $ = cheerio.load(html.data);
    const $bodyList = $("div.discussContentWrapper div.discussListWrapper table.discussList").children("tr.discussRow");

    $bodyList.each(function (i, elem) {
      ulList[i] = {
        title: $(this).find('td.discussTitle div.discussTitleWrapper'),
        writer: $(this).find('td.discussTitle td.discussViewCount'),
        viewcount: $(this).find('td.discussTitle td.discussViewCount'),
        likecount: $(this).find('td.discussTitle div.discussLikeCount'),
        date: $(this).find('td.discussTitle td.discussDate'),
      };
    });

    const data = ulList.filter(n => n.title);
    return data;
  })
  .then(res => log(res));
The output is `[]` or `[ [] ]`, with no real data.
Thanks for your help in advance.
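Two things stand out in the loop above: i is never incremented, and the return exits getHtml() on the very first request, so only page 0 is ever fetched. Also, everything after # in a URL is a fragment that axios never sends to the server, and the stored fields are cheerio objects rather than text. A minimal sketch of a reworked crawler, assuming the list exists in the server-rendered HTML and the query string really belongs outside the fragment (the URL shape here is a guess):

const axios = require("axios");
const cheerio = require("cheerio");

const crawlAll = async () => {
  const results = [];
  for (let page = 1; page <= 493; page++) {
    try {
      // assumed URL shape: the query string moved out of the "#!" fragment
      const res = await axios.get("https://playentry.org/ds?sort=created&rows=20&page=" + page);
      const $ = cheerio.load(res.data);
      $("table.discussList tr.discussRow").each(function () {
        results.push({
          // .text() extracts the string instead of storing a cheerio object
          title: $(this).find("div.discussTitleWrapper").text().trim(),
          viewcount: $(this).find("td.discussViewCount").text().trim(),
          date: $(this).find("td.discussDate").text().trim(),
        });
      });
    } catch (error) {
      console.error(error);
    }
  }
  return results;
};

crawlAll().then(console.log);

If the site builds its list with client-side JavaScript, the HTML axios receives will not contain the rows at all, and a headless browser such as Puppeteer would be needed instead.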

Related

Building an Object from fetch statement

I have some code that, when I console.log it, looks like the image below:
The code I am running is as follows:
onClick={() => {
  const stream = fetch(
    'https://lichess.org/api/games/user/neio',
    { headers: { Accept: 'application/x-ndjson' } }
  );
  const onMessage = obj => {
    console.log('test', obj);
  };
  const onComplete = () =>
    console.log('The stream has completed');
  stream.then(readStream(onMessage)).then(onComplete);
}}
export const readStream = processLine => response => {
  const stream = response.body.getReader();
  const matcher = /\r?\n/;
  const decoder = new TextDecoder();
  let buf = '';
  const loop = () =>
    stream.read().then(({ done, value }) => {
      if (done) {
        if (buf.length > 0) processLine(JSON.parse(buf));
      } else {
        const chunk = decoder.decode(value, { stream: true });
        buf += chunk;
        const parts = buf.split(matcher);
        buf = parts.pop();
        for (const i of parts) processLine(JSON.parse(i));
        return loop();
      }
    });
  return loop();
};

export default readStream;
What I am trying to do is build a parent object that contains all these individual rows of data.
I'm new to promises and fetch, so currently I have no idea how to build this parent object that contains each individual row.
Any suggestions?
Can't you have a global array and add items to it, like:
var arrCollection = [];
...
const onMessage = obj => {
  arrCollection.push(obj);
};
You can then wrap those items in an object like:
var objCollection = { items: arrCollection };
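If you'd rather avoid a mutable global and know exactly when the parent object is complete, one option is to collect rows locally and resolve after the stream finishes. A minimal sketch reusing the readStream helper above (collectRows and the { items } shape are made up for illustration):

const collectRows = url =>
  fetch(url, { headers: { Accept: 'application/x-ndjson' } }).then(response => {
    const rows = [];
    // readStream invokes the callback once per parsed NDJSON line and
    // returns a promise that settles when the reader reports done
    return readStream(obj => rows.push(obj))(response).then(() => ({ items: rows }));
  });

// the parent object is only available after the whole stream has been read
collectRows('https://lichess.org/api/games/user/neio')
  .then(parent => console.log(parent.items.length, 'rows loaded'));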

How to run a function which calls axios every 30 seconds

I'm creating a web scraper using Node, cheerio, and axios (async/await) to call the website, and I want the function to run every 30 seconds. I tried using setTimeout and setInterval but did not get the expected result; instead I got a heap out of memory error. I want to run the mvcAppointmentSearch function in the while loop every 30 seconds. The code follows.
const express = require('express');
const request = require('request-promise');
const cheerio = require('cheerio');
const axios = require('axios');
const cssSelect = require('css-select');
const open = require('open');

// const mvcUrl = 'https://telegov.njportal.com/njmvc/AppointmentWizard/17/';
const mvcUrl = 'https://telegov.njportal.com/njmvc/AppointmentWizard/14/';
const mvcLocation = ['Edison', 'Rahway', 'SouthPlainfield'];
// const mvcLocationNumber = ['240', '252', '239'];
const mvcLocationNumber = ['163'];
const requiredMonths = ['September', 'October'];

const callUrl = async (url, locationNumberIndex) => {
  try {
    const response = await axios.get(url);
    //console.log('call url', response.data);
    getData(response.data, locationNumberIndex);
  } catch (err) {
    console.log(err);
  }
};

const mvcAppointmentSearch = () => {
  for (let i = 0; i < mvcLocationNumber.length; i++) {
    const currentUrl = mvcUrl + mvcLocationNumber[i];
    console.log(mvcLocationNumber[i]);
    callUrl(currentUrl, i);
  }
};

const getData = (html, locationNumberIndex) => {
  let data = [];
  let $ = cheerio.load(html);
  console.log('datais ', $);
  $.prototype.exists = function (selector) {
    return this.find(selector).length > 0;
  };
  const checkerLength = $('div').exists('.alert-danger');
  console.log(checkerLength);
  if (checkerLength) {
    console.log(`No appointment available in ${mvcLocation[locationNumberIndex]}`);
  } else {
    const dateString = $('.control-label').text();
    const availableMonth = dateString.trim().split(' ')[7];
    const exactDateAvailability = dateString.slice(24, -1);
    console.log(availableMonth);
    if (requiredMonths.includes(availableMonth)) {
      console.log('Hurray there is an appointment available');
      const message = `Appointment available for the location ${mvcLocation[locationNumberIndex]} on ${exactDateAvailability}`;
      open(`${mvcUrl + mvcLocationNumber[locationNumberIndex]}`);
      console.log(message);
    } else {
      console.log('required Month is not available still searching');
    }
  }
};

while (true) {
  try {
    // mvcAppointmentSearch();
    // want to run the following function for every 30 seconds.
    mvcAppointmentSearch();
  } catch (err) {
    console.log(`Error has Occured ${err}`);
  }
}
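The bare while (true) spins synchronously forever: the callbacks queued by axios never get a chance to run, and the pending work piles up until the heap is exhausted. A minimal sketch of one common fix, pausing 30 seconds between runs inside an async loop (setInterval(mvcAppointmentSearch, 30000) would also work):

const sleep = ms => new Promise(resolve => setTimeout(resolve, ms));

const run = async () => {
  while (true) {
    try {
      mvcAppointmentSearch();
    } catch (err) {
      console.log(`Error has occurred ${err}`);
    }
    await sleep(30 * 1000); // yield to the event loop for 30 seconds
  }
};

run();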

Appending data from an API into the DOM

const url = `https://catfact.ninja/fact?max_length=140`;

const getFact = () => {
  return fetch('https://catfact.ninja/fact?max_length=140')
    .then(res => res.json())
}

const createFactDiv = (fact) => {
  const factContainer = document.createElement('div')
  const setup = document.createElement('p')
  setup.innerText = fact.fact
  factContainer.append(setup)
  return factContainer
}

const appendFact = (factDiv) => {
  const factContainer = document.getElementById('factContainer')
  factContainer.append(FactDiv)
}

document.addEventListener('DOMContentLoaded', () => {
})

getFact().then((fact) => {
  const FactDiv = createFactDiv(fact)
  append.fact(FactDiv)
})
I have tried several things; I'm fairly new to JS and it is tricky. I am trying to create an app that displays cat facts. I was seeing the div with the fact inside correctly in the DOM elements via console.log, but now I don't see it and I keep getting:
Uncaught (in promise) ReferenceError: append is not defined
Any idea what to do? Much appreciated!
Yep, it's our old friend spelling errors! Here is some working code:
const url = `https://catfact.ninja/fact?max_length=140`;

const getFact = () => {
  return fetch('https://catfact.ninja/fact?max_length=140')
    .then(res => res.json());
}

const createFactDiv = (fact) => {
  const factContainer = document.createElement('div');
  const setup = document.createElement('p');
  setup.innerText = fact.fact;
  factContainer.append(setup);
  return factContainer;
}

const appendFact = (factDiv) => {
  const factContainer = document.getElementById('factContainer');
  factContainer.append(factDiv);
}

// This is unused
/*
document.addEventListener('DOMContentLoaded', () => {
})
*/

getFact().then((fact) => {
  const factDiv = createFactDiv(fact);
  appendFact(factDiv);
})
This is why it's important to use camelCase consistently:
In appendFact() you took a factDiv parameter but then tried to use FactDiv, which doesn't exist in that function.
As noted by Robin, you typed append.fact(FactDiv) instead of appendFact(FactDiv); this should also be refactored to appendFact(factDiv) to stick with camelCase.
Also watch your spacing, and I like to end my lines with semicolons!

How to make a promise wait for all objects to be complete and then push to an array?

The getURL() function creates an array of scraped URLs from the original URL. getSubURL() then loops through that array and scrapes all of those pages' URLs. Currently, this code outputs just fine to the console, but I don't know how to wait for my data to resolve so I can push all the gathered data into a single array. When I try to return sites and then push to an array, it only pushes the last value. I believe it's a Promise.all(map) situation, but I don't know how to write one correctly without getting an error. Ideally, my completed scrape could be called from another function. Please take a look if you can.
const cheerio = require('cheerio');
const axios = require('axios');
let URL = 'https://toscrape.com';

const getURLS = async () => {
  try {
    const res = await axios.get(URL);
    const data = res.data;
    const $ = cheerio.load(data);
    const urlQueue = [];
    $("a[href^='http']").each((i, elem) => {
      const link = $(elem).attr('href');
      if (urlQueue.indexOf(link) === -1) {
        urlQueue.push(link);
      }
    });
    return urlQueue;
  } catch (err) {
    console.log(`Error fetching and parsing data: `, err);
  }
};

const getSubURLs = async () => {
  let urls = await getURLS();
  try {
    // loop through each url in array
    for (const url of urls) {
      // fetch all html from the current url
      const res = await axios.get(url);
      const data = res.data;
      const $ = cheerio.load(data);
      // create object and push that url into that object
      let sites = {};
      sites.url = url;
      let links = [];
      // scrape all links and save in links array
      $("a[href^='/']").each((i, elem) => {
        const link = $(elem).attr('href');
        if (links.indexOf(link) === -1) {
          links.push(link);
        }
        // save scraped data in object
        sites.links = links;
      });
      // logs objects like { url: 'url', links: [link1, link2, link3] }
      console.log(sites);
    }
  } catch (err) {
    console.log(`Error fetching and parsing data: `, err);
  }
};
I don't think this is a Promise-related issue at heart.
You'll need to collect your sites into an array that is initialized outside the loop. Then, when getSubURLs() resolves, it will resolve to your array:
const getSubURLs = async () => {
  let urls = await getURLS();
  let siteList = [];
  try {
    for (const url of urls) {
      // :
      // :
      // :
      siteList.push(sites);
    }
  } catch (err) {
    console.log(`Error fetching and parsing data: `, err);
  }
  return siteList; // array of objects
};

getSubURLs().then(console.log);
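If you also want the Promise.all(map) shape the question mentions, fetching the pages in parallel instead of one at a time, a rough sketch might look like the following (note that a single failed request will reject the whole batch):

const getSubURLs = async () => {
  const urls = await getURLS();
  // map every URL to a promise resolving to { url, links }, then await all
  return Promise.all(
    urls.map(async url => {
      const res = await axios.get(url);
      const $ = cheerio.load(res.data);
      const links = [];
      $("a[href^='/']").each((i, elem) => {
        const link = $(elem).attr('href');
        if (links.indexOf(link) === -1) links.push(link);
      });
      return { url, links };
    })
  );
};

getSubURLs().then(console.log);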

Node - wait for map to finish before continuing

I have a file in my Node app that is supposed to fetch data about every League champion from the official website using cheerio, and it all works until I add the data to my array to return it as JSON: the write function runs before the map finishes, so I just create a JSON file with an empty array in it:
const request = require('request');
const cheerio = require('cheerio');
const fs = require('fs');

const champions = fs.readFileSync('champions.json');
const championsObj = JSON.parse(champions);
let champsList = [];

championsObj.map(champ => {
  request(champ.href, (err, res, html) => {
    if (!err && res.statusCode == 200) {
      const $ = cheerio.load(html);
      const champName = $('.style__Title-sc-14gxj1e-3 span').text();
      let skins = [];
      const skinsList = $('.style__CarouselItemText-sc-1tlyqoa-16').each(
        (i, el) => {
          const skinName = $(el).text();
          skins.push = skinName;
        }
      );
      const champion = {
        champName,
        skins
      };
      console.log(champion);
      champsList.push = champion;
    }
  });
});

const jsonContent = JSON.stringify(champsList);
fs.writeFile('champions2.json', jsonContent, 'utf8', function (err) {
  if (err) {
    console.log(err);
  }
});
I'm not a Node expert, but I tried using a Promise and it didn't work; maybe I used it wrong.
UPDATE #1: using axios
championsObj.map(async champ => {
  const html = await axios.get(champ.href);
  const $ = await cheerio.load(html);
  const champName = $('.style__Title-sc-14gxj1e-3 span').text();
  let skins = [];
  const skinsList = $('.style__CarouselItemText-sc-1tlyqoa-16').each(
    (i, el) => {
      const skinName = $(el).text();
      skins.push = skinName;
    }
  );
  const champion = {
    champName,
    skins
  };
  console.log(champion);
  champsList.push = champion;
});
You can use await Promise.all(<array>.map(async () => { ... })); it does not require any additional dependencies. However, you have no guarantees about the order of the asynchronous iterations: all iterations start in the right order, but there are no guarantees about when each one finishes.
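Applied to the code above, that suggestion might look like this sketch (sticking with axios, and replacing the skins.push = ... assignments with actual push calls):

const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs').promises;

(async () => {
  const championsObj = JSON.parse(await fs.readFile('champions.json', 'utf8'));
  // Promise.all resolves only after every mapped request has finished,
  // so the file is written with the full list
  const champsList = await Promise.all(
    championsObj.map(async champ => {
      const res = await axios.get(champ.href);
      const $ = cheerio.load(res.data); // cheerio.load is synchronous; no await needed
      const champName = $('.style__Title-sc-14gxj1e-3 span').text();
      const skins = [];
      $('.style__CarouselItemText-sc-1tlyqoa-16').each((i, el) => {
        skins.push($(el).text()); // push is a method call, not an assignment
      });
      return { champName, skins };
    })
  );
  await fs.writeFile('champions2.json', JSON.stringify(champsList));
})();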
Your problem here is that Array#map doesn't wait for asynchronous functions such as the request calls to finish before moving on. I recommend p-map with got. To ensure perfect execution order, I also recommend reading and writing the file asynchronously.
const got = require('got');
const pMap = require('p-map');
const cheerio = require('cheerio');
const fs = require('fs').promises;

(async () => {
  const champions = JSON.parse(await fs.readFile('champions.json', 'utf8'));
  let champsList = await pMap(champions, async champ => {
    const { body } = await got(champ.href);
    const $ = cheerio.load(body);
    const champName = $('.style__Title-sc-14gxj1e-3 span').text();
    let skins = [];
    $('.style__CarouselItemText-sc-1tlyqoa-16').each((_, el) => {
      const skinName = $(el).text();
      skins.push(skinName);
    });
    const champion = {
      champName,
      skins
    };
    console.log(champion);
    return champion;
  });
  await fs.writeFile('champions2.json', JSON.stringify(champsList));
})();
