Running a request inside of a request (Node & Cheerio)

I am building a web scraper that uses node, request, and cheerio.
The function I have written takes a number as a parameter that gets used as an index in a for loop, and each number in the loop corresponds to a URL on a website that contains an index of blog posts. From there the function returns the title of every blog post on each URL and the href contained in each post title; then another request is run, using the href of the corresponding post title as an input value, which returns every href contained on each individual post's page. So my output in the terminal should be formatted like this:
Title: Some Blog Post Title 1
Link: Some Blog Post Link 1
Blog Post Links: List of Links on Blogs Page 1
Title: Some Blog Post Title 2
Link: Some Blog Post Link 2
Blog Post Links: List of Links on Blogs Page 2
But instead it is coming out like this:
Title: Some Blog Post Title 1
Link: Some Blog Post Link 1
Title: Some Blog Post Title 2
Link: Some Blog Post Link 2
Giant list of blog post links
So my code is functional in that it retrieves all of the correct information, but not in the right format. The current output isn't helpful for me because I need to be able to tell which links correspond to each page, rather than getting one giant list of links.
I have researched my problem and I'm pretty sure this is happening because of the asynchronous nature of the code. My function is very similar to the question posed here, but mine differs in that a second request is run using the output from the first request as an input, in addition to the loop.
So my question is: how do I restructure my code so that the output is returned in the desired order?
function scrapeUrls(num) {
  for (var i = 1; i <= num; i++) {
    request(`https://www.website.com/blog?page=${i}`, (error, response, html) => {
      if (!error && response.statusCode == 200) {
        const $ = cheerio.load(html);
        $('.group-right').each((i, el) => {
          const articleTitle = $(el)
            .find('h2')
            .text();
          const articleLink = $(el)
            .find('a')
            .attr('href');
          console.log(`Title: ${articleTitle}\nLink: ${articleLink}\nBlog Post Links:`);
          request(`https://www.website.com/${articleLink}`, (error, response, html) => {
            if (!error && response.statusCode == 200) {
              const $ = cheerio.load(html);
              $('.main-container').each((i) => {
                const links = $('a');
                $(links).each(function (i, link) {
                  console.log($(link).text() + ':\n ' + $(link).attr('href'));
                });
              });
            }
          });
        });
      }
    });
  }
}

As @Dshiz pointed out, you need to await the promise if you want to keep the order. I suggest you use node-fetch instead of request, which returns an actual promise to be awaited:
let cheerio = require('cheerio');
let fetch = require('node-fetch');

function getArticlesLinks(html) {
  const $ = cheerio.load(html);
  let articles = [];
  $(".group-right").each((i, el) => {
    const articleTitle = $(el).find("h2").text();
    const articleLink = $(el).find("a").attr("href");
    console.log(`Title: ${articleTitle}\nLink: ${articleLink}\nBlog Post Links:`);
    articles.push(articleLink);
  });
  return articles;
}

function getLinks(html) {
  const $ = cheerio.load(html);
  $(".main-container").each((i) => {
    const links = $("a");
    $(links).each(function (i, link) {
      console.log($(link).text() + ":\n " + $(link).attr("href"));
    });
  });
}

async function scrapeUrls(num) {
  for (var i = 1; i <= num; i++) {
    // Fetch the page
    let pageResponse = await fetch(`https://www.website.com/blog?page=${i}`);
    if (pageResponse.ok) {
      let pageHtml = await pageResponse.text();
      // ^ HERE
      // Extract articles' links
      let articles = getArticlesLinks(pageHtml);
      // For each article, fetch and extract links
      for (let a of articles) {
        let articleResponse = await fetch(`https://www.website.com/${a}`);
        if (articleResponse.ok) {
          let articleHtml = await articleResponse.text();
          // ^ HERE
          getLinks(articleHtml);
        }
      }
    }
  }
}

scrapeUrls(4)
  .then(() => console.log('done'))
  .catch(console.error);
Here I turned the scrapeUrls function into an async function so that I can await each fetch inside the loops.
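If fetching every article strictly one after another turns out to be slow, a variation (a sketch of my own, not part of the original answer) is to fetch one page's articles concurrently with Promise.all and only print once the results are collected; Promise.all resolves to its results in input order, so the output stays ordered:
// A sketch of a concurrent variant: fetch all article pages of one index
// page in parallel, then print the collected results in their input order.
// Reuses the getLinks helper and the website.com URLs from above.
async function scrapeArticlesConcurrently(articles) {
  const pages = await Promise.all(
    articles.map(async (a) => {
      const res = await fetch(`https://www.website.com/${a}`);
      return res.ok ? res.text() : null;
    })
  );
  for (const html of pages) {
    if (html) getLinks(html);
  }
}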

Related

Insert new key value pair inside an array of objects, but value is created by axios.get

So I've been working on a scraper. Everything was fine until I tried scraping data for each individual link.
To explain: I've got a scraper that scrapes data about apartments. The first URL is the page where the articles are located (approx. 29-30 should be fetched). That page doesn't contain information about square meters, so I need to run another scrape for each link that was scraped and get the square meters from there.
Here is the code that I have:
const axios = require('axios');
const cheerio = require('cheerio');

const url = `https://www.olx.ba/pretraga?vrsta=samoprodaja&kategorija=23&sort_order=desc&kanton=9&sacijenom=sacijenom&stranica=2`;

axios.get(url).then((response) => {
  const articles = [];
  const $ = cheerio.load(response.data);
  $('div[id="rezultatipretrage"] > div')
    .not('div[class="listitem artikal obicniArtikal i index"]')
    .not('div[class="obicniArtikal"]')
    .each((index, element) => {
      $('span[class="prekrizenacijena"]').remove();
      const getLink = $(element).find('div[class="naslov"] > a').attr('href');
      const getDescription = $(element)
        .find('div[class="naslov"] > a > p')
        .text();
      const getPrice = $(element)
        .find('div[class="datum"] > span')
        .text()
        .replace(/\.| ?KM$/g, '')
        .replace(' ', '');
      const getPicture = $(element)
        .find('div[class="slika"] > img')
        .attr('src');
      articles[index] = {
        id: getLink.substring(27, 35),
        link: getLink,
        description: getDescription,
        price: getPrice,
        picture: getPicture,
      };
    });
  articles.map((item, index) => {
    axios.get(item.link).then((response) => {
      const $ = cheerio.load(response.data);
      const sqa = $('div[class="df2 "]').first().text();
    });
  });
  console.log(articles);
});
Now the first part of the code works as it should; I've been struggling with this second part.
I'm mapping over articles because, for each link, I need to load it into the axios function and get the data about square meters.
So my desired output would be the updated articles: the old objects and key values inside them, but with a key sqm whose value is the scraped square meters.
Any ideas on how to achieve this?
Thanks!
You could simply add the information about the square meters to the current article/item, something like:
const articlePromises = Promise.all(articles.map((item) => {
  return axios.get(item.link).then((response) => {
    const $ = cheerio.load(response.data);
    const sqa = $('div[class="df2 "]').first().text();
    item.sqm = sqa;
  });
}));

articlePromises.then(() => {
  console.log(articles);
});
Note that you need to wait for all of the mapped promises to resolve before you log the resulting articles.
Also note that using async/await you could rewrite your code to be a bit cleaner; see https://javascript.info/async-await.
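For illustration, a minimal sketch of that async/await rewrite (enrichArticles is a hypothetical helper name, not from the original code):
// A sketch of the same enrichment with async/await; "enrichArticles" is a
// hypothetical name, not part of the original code.
async function enrichArticles(articles) {
  await Promise.all(articles.map(async (item) => {
    const response = await axios.get(item.link);
    const $ = cheerio.load(response.data);
    item.sqm = $('div[class="df2 "]').first().text();
  }));
  return articles;
}

// Usage inside the existing .then() chain:
// enrichArticles(articles).then((result) => console.log(result));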

How do I process multiple string node command line arguments?

I am developing a web scraper and have successfully been able to retrieve information from the site.
The only issue is that I need help passing multiple command line arguments so that more data can be returned; in my case, I want to pass in multiple music artists and return what was found on the website for each of them.
I have tried using a for loop over the number of command line arguments, and then a forEach loop.
var request = require('request');
var cheerio = require('cheerio');
const args = process.argv.slice(2);

// request function that uses three parameters where we check for the error and response type
request('https://www.billboard.com/charts/rap-song', function (error, response, html) {
  if (!error && response.statusCode === 200) {
    var $ = cheerio.load(html);
    var results = $('div').find('.chart-list-item__artist:contains("' + artist + '")').each(function () {
      // console.log($(this).text());
    }).text();
  }
});
Maybe it's because the :contains operator can only process one command line argument? Or it only gets the first artist specified.
Expected output: node artists.js "Post Malone" "Lil Baby"
----------------
Post Malone & Swae Lee
Post Malone
Lil Baby
Yo Gotti Featuring Lil Baby
Actual Output: node artists.js "Post Malone" "Lil Baby"
--------------
Post Malone & Swae Lee
Post Malone
From what I can see, you should iterate over the artists and check for a match for each of them, instead of iterating the elements after matching just one artist.
I think something like this, based on what you have:
var request = require('request');
var cheerio = require('cheerio');
const args = process.argv.slice(2);

// request function that uses three parameters where we check for the error and response type
request('https://www.billboard.com/charts/rap-song', function (error, response, html) {
  if (!error && response.statusCode === 200) {
    var $ = cheerio.load(html);
    var results = [];
    args.forEach(artist => {
      $('div').find('.chart-list-item__artist:contains("' + artist + '")').each(function () {
        results.push($(this).text()); // push the current element's text into results
      });
    });
    console.log(results.join('\n'));
  }
});
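One caveat (my note, not from the original answer): cheerio's :contains is case-sensitive. If that matters, here is a sketch of a case-insensitive variant using .filter():
// A sketch of a case-insensitive match; assumes the same
// .chart-list-item__artist selector and results array as above.
args.forEach(artist => {
  $('.chart-list-item__artist')
    .filter(function () {
      return $(this).text().toLowerCase().includes(artist.toLowerCase());
    })
    .each(function () {
      results.push($(this).text());
    });
});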

Node.js Scraping Data Click Event

I have a repetitive task that I have to do at regular intervals. Basically, I need to visit a website, get some values from different tables, then write them to a spreadsheet. Using these values, I make some calculations, prepare a report, etc.
I would like to create a helper bot, because this is a straightforward task.
I can already get the information by opening up the console (while I am on the related page) and fetching the data easily with the DOM or jQuery.
I would like to take it a step further and create an application in Node.js (without entering the related website myself, I will send my bot to the related page to do the same actions that I do in the console).
I started to write something with cheerio. However, at some point my bot needs to click a button (in order to change the table). I searched but couldn't find a way.
My question is: is it possible to click a button on the server side (to change the table) and fetch data from that table?
If you know a better way to create this kind of bot, please make a suggestion.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();

app.get('/scrape', (req, res) => {
  url = 'http://www.imdb.com/title/tt1229340/';
  request(url, function (error, response, html) {
    if (!error) {
      var $ = cheerio.load(html);
      var title, release;
      var json = { title: "", release: "" };
      $('.header').filter(() => {
        var data = $(this);
        title = data.children().first().text();
        release = data.children().last().children().text();
        json.title = title;
        json.release = release;
      });
      // This is not possible
      $("#target").click(function () {
        alert("Handler for .click() called.");
      });
    }
    fs.writeFile('output.json', JSON.stringify(json, null, 4), (err) => {
      console.log('File successfully written!');
    });
    res.send('Check your console!');
  });
});

app.listen('8080');
Edit: the answer to this question is "use Zombie".
Now I have another question related to this one.
I am trying to learn and use Zombie. I could:
connect to the website
go to the necessary table
print all tds to the console
However, with this method I only get a really messed-up string. (All tds were printed without any whitespace, with no chance to clean it up; basically I want to put all tds in an array. How can I do that?)
browser.visit(url, () => {
  var result = browser.text('table > tbody.bodyName td');
  console.log(result);
});
I'd suggest you try using a headless browser such as PhantomJS or Zombie for this purpose. What you're trying to do above is assign a click handler to an element in Cheerio; this won't work!
You should be able to click a button based on the element selector in Zombie.js.
There's a browser.pressButton command in Zombie.js for this purpose.
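For instance, a minimal sketch of pressButton (the URL and the "Search" button label here are hypothetical placeholders, not taken from the question):
// A minimal sketch using browser.pressButton; the URL and "Search" label
// are hypothetical placeholders.
const Browser = require('zombie');
const browser = new Browser();

browser.visit('http://example.com/search-form')
  .then(() => browser.pressButton('Search')) // presses the button and waits for the resulting events
  .then(() => console.log(browser.text('table td'))) // read the table shown after the click
  .catch(console.error);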
Here's some sample code using Zombie.js, in this case clicking a link:
const Browser = require('zombie');
const url = 'http://www.imdb.com/title/tt1229340/';

let browser = new Browser();
browser.visit(url).then(() => {
  console.log(`Visited ${url}..`);
  browser.clickLink("FULL CAST AND CREW").then(() => {
    console.log('Clicked link..');
    browser.dump();
  });
}).catch(error => {
  console.error(`Error occurred visiting ${url}`);
});
As for the next part of the question, we can select elements using zombie.js and get an array of their text content:
const Browser = require('zombie');
const url = 'http://www.imdb.com/title/tt1229340/';

let browser = new Browser();
browser.visit(url).then(() => {
  console.log(`Visited ${url}..`);
  var result = browser.queryAll('.cast_list td');
  var cellTextArray = result.map(r => r.textContent.trim())
    .filter(text => text && (text || '').length > 3);
  console.log(cellTextArray);
}).catch(error => {
  console.error(`Error occurred visiting ${url}`);
});

Node.js Multi-page Crawler

I am trying to crawl a website's pages.
Here is my sample code; I used Stack Overflow just as a test, I don't actually want to crawl Stack Overflow.
In this code I want to get every link on a page and push it into an array, and after that go to every link and search for "node" (it's just a test).
var request = require('request');
var cheerio = require('cheerio');

var pages = 20;
var counter = 1;

while (counter <= pages) {
  var siteUrl = "http://stackoverflow.com/unanswered/tagged/?page=" + counter + "&tab=votes";
  var queue = [];
  request(siteUrl, function (error, response, html) {
    if (!error) {
      var $ = cheerio.load(html);
      // Extract all links on the page
      links = $('a');
      $(links).each(function (i, link) {
        queue.push("http://stackoverflow.com" + $(link).attr('href'));
      });
    }
    // Search for Node.js on every question.
    queue.each(function (i, linkItem) {
      request(linkItem, function (error, response, html) {
        var page = cheerio.load(html);
        var ser = page.match(/node/i);
        if (ser & ser.lenght > 0) {
          console.log(page);
        }
      });
    });
  });
  counter++;
}
When I run this code, it only shows the first page's links and then shows me the error each has no method.
I would be happy if you could tell me where I am wrong, or whether my code is even the right solution.
First of all, your mixing of async and sync code does not work very well.
The main problem is that the queue variable you are trying to iterate over has no each method. You can use lodash for that, or just replace the function call with a simple for loop:
for (let i = 0; i < queue.length; i++) {
  // let/const keep each callback's item intact (var would leak across iterations)
  const item = queue[i];
  request(item, function (error, response, html) {
    // search the raw HTML; cheerio's load() result has no match method
    var ser = html.match(/node/i);
    if (ser && ser.length > 0) {
      console.log(item);
    }
  });
}
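For reference, a sketch of the lodash alternative mentioned above (lodash's forEach passes each item to the callback, which sidesteps the scoping issue entirely):
// A sketch of the lodash variant mentioned above
const _ = require('lodash');
_.forEach(queue, function (linkItem) {
  request(linkItem, function (error, response, html) {
    if (!error && html.match(/node/i)) {
      console.log(linkItem);
    }
  });
});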
Besides, I wrote a tutorial on doing exactly what you are trying to do.

In Node.js, a loop calling a function defined in another JS file is making mistakes

I have a server (made with Express in Node.js) that gets notifications of RSS feeds, gets data from their entries (title, date, link) and then "does something" with the data by calling a function defined in another JS file ("article_filter_toDB.js"). The code on the server-side is:
// parts omitted
var article_filter_toDB = require('./article_filter_toDB.js');
// parts omitted
client.on('notification', function (notification) {
  // gets notifications of RSS feeds
  entries = notification.entries;
  for (index = 0; index < entries.length; ++index) {
    title = entries[index].title;
    date = entries[index].published;
    link = entries[index].link.href;
    // gets data from the entry of the feed
    miniwords = 1000;
    // a variable that I set
    article_filter_toDB(link, title, miniwords);
    // "does something" by calling a function defined in another JS file ("article_filter_toDB.js")
  }
});
// parts omitted
What the function article_filter_toDB does is get the content of the article at the link from the RSS feed (using Request), parse the HTML code to count the words of the article, and, if this length is above miniwords (here 1000), save the data relating to the article (title, link, date...) to a database (MongoDB, via Mongoose).
Sometimes it works well. But sometimes it computes a length equal to 1 (that is, it is unable to actually count the words), even though if I run the function article_filter_toDB separately (that is, the separate JS file, applied to the same link, title, and miniwords that I copy to it), it correctly counts the words.
Do you know what I'm doing wrong? Thanks!
To be more complete, here is the code of the "article_filter_toDB.js" file:
// parts omitted
article_filter_toDB = function (link, title, miniwords) {
  Article.findOne({
    title: title
  }, 'title', function (err, articles) {
    if (err) return console.error(err);
    if (articles == null) {
      // ...if an article with this title is not already present in my database...
      // parts omitted here, that set the variable "balise" depending on the link
      request(link, function (err, resp, body) {
        $ = cheerio.load(body);
        texte = $(balise).text();
        content = texte.split(" ");
        length = content.length;
        // ...let's count its words with Request and Cheerio...
        if ((length > miniwords)) {
          var newArticle = new Article({
            site: url.parse(link).hostname.replace(/^www\./, ''),
            date: date,
            link: link,
            title: title,
            length: length,
          });
          newArticle.save(function (err, newArticle) {
            if (err) return console.error(err);
          });
          // if the article's length is more than the number given by "miniwords", let's save its data in my database
        }
      });
    }
  });
};
module.exports = article_filter_toDB;
// exportation of the function to use it elsewhere
This is how you properly call functions from another file in Node.js:
// otherfile.js
// ========
module.exports = {
  article_filter_toDB: function (link, title, miniwords) {
    // do stuff here
  },
};
Then in your code:
var otherfile = require('./otherfile');
...
otherfile.article_filter_toDB(link, title, miniwords);
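A side note of my own, not from the answer above: in the server loop, title, date, and link are assigned without var/let/const, so they become shared globals; by the time the asynchronous request inside article_filter_toDB runs, they may already hold the next entry's values, which would explain why the word count only fails intermittently. A minimal sketch of the scoped version (note that article_filter_toDB reads date as a leaked global, so it would also need to receive date as a parameter):
// A sketch (my suggestion, not from the original answer): block-scoped
// declarations keep each iteration's values intact for the asynchronous
// work started inside the loop.
client.on('notification', function (notification) {
  const entries = notification.entries;
  for (let index = 0; index < entries.length; ++index) {
    const title = entries[index].title;
    const date = entries[index].published;
    const link = entries[index].link.href;
    const miniwords = 1000;
    article_filter_toDB(link, title, miniwords); // date would need to be passed too
  }
});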
