I've created a script using axios and cheerio that fetches shop names and their associated links from Yellow Pages, then uses those links to scrape the phone number and email from each shop's inner page. The script is working fine.
What I wish to do now is follow the next-page link so the script keeps grabbing content from subsequent pages as well. I just can't figure out how to apply the logic of parsing and following next pages within the getLinks() function.
This is what I'm trying at the moment:
const axios = require('axios');
const cheerio = require('cheerio');

const startUrl = 'https://www.yellowpages.com/search?search_terms=Pizza&geo_location_terms=San+Francisco%2C+CA';
const host = 'https://www.yellowpages.com';

const getLinks = async (url, host, callback) => {
    const { data } = await axios.get(url);
    const $ = cheerio.load(data);
    $('[class="result"] a.business-name').each(function () {
        let items = $(this).find('span').text();
        let links = host + $(this).attr("href");
        return callback(items, links);
    });
}

const fetchContent = async (shopName, shopLink, callback) => {
    const { data } = await axios.get(shopLink);
    const $ = cheerio.load(data);
    let phone = $('.contact > p.phone').eq(0).text();
    let email = $('.business-card-footer > a.email-business').eq(0).attr("href");
    return callback(shopName, shopLink, phone, email);
}

async function scrapeData() {
    getLinks(startUrl, host, function (itemName, link) {
        fetchContent(itemName, link, function (shopName, shopLink, phone, email) {
            console.log({ shopName, shopLink, phone, email });
        });
    });
}

scrapeData();
Next page links are often in [rel=next] so it's usually something like this:
async function get(url) {
    const { data } = await axios.get(url);
    const $ = cheerio.load(data);
    return $;
}

async function run() {
    let url = 'https://www.yellowpages.com/search?search_terms=Pizza&geo_location_terms=San+Francisco%2C+CA';
    let $ = await get(url);
    // doSomething($)
    let href = $('[rel=next]').attr('href');
    while (href) {
        url = new URL(href, url).href;
        $ = await get(url);
        // doSomething($)
        href = $('[rel=next]').attr('href');
    }
}
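Folding that loop into the question's own code might look like the following; this is a minimal sketch that assumes the question's selectors and fetchContent stay as they are, and that collects each page's shop links before visiting them:

const scrapeData = async () => {
    let url = startUrl;
    while (url) {
        const { data } = await axios.get(url);
        const $ = cheerio.load(data);
        // collect this page's shops first, then visit their inner pages
        const shops = $('[class="result"] a.business-name').map(function () {
            return { name: $(this).find('span').text(), link: host + $(this).attr('href') };
        }).get();
        for (const { name, link } of shops) {
            await fetchContent(name, link, function (shopName, shopLink, phone, email) {
                console.log({ shopName, shopLink, phone, email });
            });
        }
        // follow the pagination link, if there is one
        const next = $('[rel=next]').attr('href');
        url = next ? new URL(next, url).href : null;
    }
}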
I am fairly new to JavaScript and I understand that it executes asynchronously. I tried using the callback method to fetch the secret values and then execute the next block of code, but it is not waiting.
This is the function that fetches the Key Vault secret values:
function getsecret_values(client, secret_name, callback) {
    let val = []
    for (let i = 0; i < secret_name.length; i++) {
        client.getSecret(secret_name[i]).then((latestSecret) => {
            val[i] = latestSecret.value;
        })
    }
    callback(val)
}
I am calling the getsecret_values function from the main block:
let vaultName = result.database;
const url = `https://${vaultName}.vault.azure.net`;
const credential = new ClientSecretCredential(result.host, result.user, result.password);
const client = new SecretClient(url, credential);
let secret_values = []
getsecret_values(client, secrets, function(result) {
    secret_values = result
    console.log(secret_values)
});
console.log(secret_values)
// next code block
Both console.log calls print an empty array.
I want my code to wait until the secret values are fetched and put into the secret_values array, and only then proceed to the next block of code. How do I achieve this?
The easiest way is to use the async/await pattern, which uses promises under the hood. Trying not to change your code much:
async function getsecret_values(client, secret_name) {
    let val = []
    for (let i = 0; i < secret_name.length; i++) {
        const latestSecret = await client.getSecret(secret_name[i])
        val[i] = latestSecret.value;
    }
    return val
}
In your main block:
getsecret_values(client, secrets).then(function(result) {
    secret_values = result
    console.log(secret_values)
});
console.log(secret_values) // will still be an empty array as the then callback has not been executed yet
My approach would be:
async function getsecret_values(client, secret_name) {
    let val = []
    for (let i = 0; i < secret_name.length; i++) {
        const latestSecret = await client.getSecret(secret_name[i])
        val[i] = latestSecret.value;
    }
    return val
}

// main:
async function main() {
    let vaultName = result.database;
    const url = `https://${vaultName}.vault.azure.net`;
    const credential = new ClientSecretCredential(result.host, result.user, result.password);
    const client = new SecretClient(url, credential);
    const secret_values = await getsecret_values(client, secrets)
    console.log(secret_values)
}

main()
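Since the secrets don't depend on each other, another option is to start all the requests at once and wait for them together with Promise.all; a minimal sketch, assuming client.getSecret returns a promise as above:

async function getsecret_values(client, secret_names) {
    // start every request immediately, then wait for all of them
    const secrets = await Promise.all(
        secret_names.map(name => client.getSecret(name))
    );
    return secrets.map(secret => secret.value);
}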
I have tried async/await and using promises, but I cannot get this code to execute in order.
The code iterates through a folder of documents and parses each one before saving it to an array, then saves the array to a .json file.
However, the code continues past the loop before the conversions finish, which means it writes an empty file because the parsing has not completed.
Turning it into an async function and awaiting does not solve the issue, nor does returning a promise and then using .then() to execute the final code. It still runs straight away.
const fs = require('fs');
const cheerio = require('cheerio');
const mammoth = require("mammoth");

const articleFolder = './Articles/';
var allArticles = [];

const extractDocuments = async () => {
    let files = fs.readdirSync(articleFolder);
    for (const file of files) {
        await convertToHTML(file);
    }
    completedExtraction();
}

async function convertToHTML(filename) {
    var filepath = articleFolder + filename;
    mammoth.convertToHtml({path: filepath})
        .then(function(result) {
            let html = result.value; // The generated HTML
            let messages = result.messages; // Any messages, such as warnings during conversion
            updateArticles(filename, html);
        })
        .done();
}

function updateArticles(filename, html) {
    var article = {
        file: filename,
        content: parseHTML(html)
    }
    allArticles.push(article);
}

function parseHTML(html) {
    let $ = cheerio.load(html);
    let title = $('h3').first().text();
    let date = $('h3:eq(1)').text();
    $('h3').slice(0, 2).remove()
    let content = $('body').html();
    let parsedArticle = {
        title: title,
        date: date,
        content: content
    }
    return parsedArticle;
}

function completedExtraction() {
    fs.writeFile('./articles.json', JSON.stringify(allArticles), (err) => {
        if (err) throw err;
        console.log('File Written.');
    });
    console.log('Finished.');
}

extractDocuments();
To solve this with map, I would do something similar to:
const extractDocuments = async () => {
    let files = fs.readdirSync(articleFolder);
    const articlePromises = files.map(async file => {
        const html = await convertToHTML(file)
        return {
            filename: file,
            html: html
        }
    })
    allArticles = await Promise.all(articlePromises)
    completedExtraction();
}
async function convertToHTML(filename) {
    var filepath = articleFolder + filename;
    return mammoth.convertToHtml({path: filepath})
        .then(function(result) {
            let html = result.value; // The generated HTML
            let messages = result.messages; // Any messages, such as warnings during conversion
            return html
        })
        // no .done() here: it returns undefined, which would lose the html we just returned
}
So, to wrap up: extractDocuments uses a map to iterate and create the articles, and convertToHTML only returns the generated HTML and nothing more. We no longer need updateArticles, since its work is now handled inside extractDocuments.
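If you still want the parsed article structure from the question rather than the raw HTML, the same map can call the question's parseHTML helper directly; a small sketch:

const articlePromises = files.map(async file => {
    const html = await convertToHTML(file)
    return {
        file: file,
        content: parseHTML(html) // reuse the question's parseHTML helper
    }
})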
Hope this helps a bit and points you in the right direction.
I am scraping multiple pages with cheerio and axios in Node.js.
I am having a hard time with promises. Can someone help me return the JSON once I hit the last page? Thanks!
const getWebsiteContent = async (url) => {
    await axios.get(url).then(res => {
        const $ = cheerio.load(res.data)
        pageNum = getTotalpages($); // Get the pagination
        console.log(url);
        // Some scraping here
    })
    indexPage++; // Increment to the next page
    const nextPageLink = baseUrl + '&page=' + indexPage; // get next page
    if (indexPage > pageNum) {
        var editedText = text.slice(0, text.length - 1);
        editedText += ']}';
        editedText = JSON.parse(editedText); // I want to return this and use elsewhere
        return editedText;
    }
    setTimeout(async () => {
        getWebsiteContent(nextPageLink); // Call itself
    }, 1000);
}

var myJSON = await getWebsiteContent(baseUrl); // something like this
I would write getPages as an async generator -
async function* getPages (href, initPage = 0) {
    const res = await axios.get(setPage(href, initPage))
    const $ = cheerio.load(res.data)
    const pages = getTotalpages($)
    yield { page: initPage, dom: $ }
    for (let p = initPage + 1; p < pages; p++) { // start after initPage: it was already yielded above
        await sleep(1000)
        const r = await axios.get(setPage(href, p))
        yield { page: p, dom: cheerio.load(r.data) }
    }
}
This depends on a helper, setPage, that manipulates the href's page number using the built-in URL class, which is much safer than cobbling strings together by hand -
function setPage (href, page) {
    const u = new URL(href)
    u.searchParams.set("page", page)
    return u.toString()
}
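For example, setPage('https://example.com/search?q=pizza', 2) returns 'https://example.com/search?q=pizza&page=2', and if a page parameter is already present, searchParams.set overwrites it rather than appending a duplicate.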
And another helper, sleep, which avoids mixing setTimeout with async-based code. This allows us to easily pause between pages -
async function sleep (ms) {
    return new Promise(r => setTimeout(r, ms))
}
Finally, we write scrape, which is a simple wrapper around getPages. This allows us to reuse the getPages function to scrape various elements as needed. A benefit of this approach is that the caller decides what happens with each page. Below we push to a result array, but as another example we could write each page to disk using the fs module. Obviously, this is for you to decide -
async function scrape (href) {
    const result = []
    for await (const { page, dom } of getPages(href)) {
        console.log("scraped page", page) // some status message
        result.push(getSomeData(dom)) // get something from each page
    }
    return result
}

scrape(myUrl).then(console.log, console.error)
You shouldn't be using then with your async/await code.
Pagination should look something like this:
let response = await axios.get(url)
let $ = cheerio.load(response.data)
// do some scraping
while (url = $('[rel=next]').attr('href')) {
    response = await axios.get(url)
    $ = cheerio.load(response.data)
    // do more scraping
}
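One caveat: this assumes the [rel=next] href is absolute. If the site emits relative links, resolve them against the current page first, as the earlier new URL(href, url).href example does.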
I have a function that retrieves the data from a document correctly. However, only one image has the URL as its field already; the other image only has a Firebase image reference. Before I proceed to another function, I need to wait until the download URL has been fetched. I've attempted it below without much luck, and I'm not entirely sure I've put the async in the right place either.
getPhoto(user_id: string) {
    this._subscription = this._activatedRoute.params.pipe(
        switchMap(params => {
            return this.service.getPhoto(params);
        })
    ).subscribe(async (result) => {
        const imageOne = result.imageOne;
        // Need to await the download URL here
        const imageTwo = this.blah(result.imageTwoRef)
        this.otherFunction(imageOne, imageTwo)
    });
}

blah(reference) {
    var storage = firebase.storage();
    var imageTwo = reference;
    var imagePathRef = storage.ref().child(imageTwo);
    imagePathRef.getDownloadURL().then((url) => {
        return url;
    });
}
The async keyword only works on functions, and it makes the function return a promise. So your usage is correct in that instance.
You can use await only inside an async function and in front of a promise. It will pause execution until the promise resolves.
I think you are almost done. Try it like this and let me know:
getPhoto(user_id: string) {
    this._subscription = this._activatedRoute.params.pipe(
        switchMap(params => {
            return this.service.getPhoto(params);
        })
    ).subscribe(async (result) => {
        const imageOne = result.imageOne;
        // Await the download URL here
        const imageTwo = await this.blah(result.imageTwoRef)
        this.otherFunction(imageOne, imageTwo);
    });
}

async blah(reference) {
    var storage = firebase.storage();
    var imageTwo = reference;
    var imagePathRef = storage.ref().child(imageTwo);
    const url = await imagePathRef.getDownloadURL();
    return url;
}
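The key change is that blah now actually returns the URL: in the original version, the value was returned from inside the then callback but blah itself returned nothing, so the caller received undefined instead of a promise for the URL.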
I am trying to scrape a Wikipedia page to fetch a list of airlines: first scraping the main page, then going to each individual airline's page to get its website URL. I have divided the code into two functions, one to scrape the main page and build a new URL, and a second to scrape the page at that URL and get the website name from it. I have used the request-promise module to get the HTML and then cheerio to parse the data.
export async function getAirlinesWebsites(req, res) {
    let response = await request(options_mainpage);
    console.log(`Data`);
    let $ = cheerio.load(response);
    console.log('Response got');
    $('tr').each((i, e) => {
        let children = '';
        console.log('inside function ', i);
        if ($(e).children('td').children('a').attr('class') !== 'new') {
            children = $(e).children('td').children('a').attr('href');
            let wiki_url = 'https://en.wikipedia.org' + children;
            console.log(`wiki_url = ${wiki_url}`);
            let airline_url = getAirlineUrl(wiki_url);
            console.log(`airline_url = ${airline_url}`);
        }
    })
}
And then the getAirlineUrl() function parses another page based on the provided URL:
async function getAirlineUrl(url) {
    const wiki_child_options = {
        url: url,
        headers: headers
    }
    let child_response = await request(wiki_child_options);
    let $ = cheerio.load(child_response);
    let answer = $('.infobox.vcard').children('tbody').children('tr').children('td').children('span.url').text();
    return answer;
}
However, when I console.log the answer variable in the parent function, I get [object Promise] instead of a string. How do I resolve this issue?
Async functions return promises. Because of that, you need to use then to get the resolved response, or use await.
This should work if the other parts of your code are OK:
export async function getAirlinesWebsites(req, res) {
    let response = await request(options_mainpage);
    console.log(`Data`);
    let $ = cheerio.load(response);
    console.log("Response got");
    $("tr").each(async (i, e) => {
        let children = "";
        console.log("inside function ", i);
        if ($(e).children("td").children("a").attr("class") !== "new") {
            children = $(e).children("td").children("a").attr("href");
            let wiki_url = "https://en.wikipedia.org" + children;
            console.log(`wiki_url = ${wiki_url}`);
            let airline_url = await getAirlineUrl(wiki_url);
            console.log(`airline_url = ${airline_url}`);
        }
    });
}
Since your getAirlineUrl function returns a promise, you need to await that promise. You can't have await nested inside the .each callback, because the callback is not an async function, and even if it were, .each would not wait for it. The best fix is to avoid .each and just use a loop.
export async function getAirlinesWebsites(req, res) {
    let response = await request(options_mainpage);
    console.log(`Data`);
    let $ = cheerio.load(response);
    console.log('Response got');
    for (const [i, e] of Array.from($('tr')).entries()) {
        let children = '';
        console.log('inside function ', i);
        if ($(e).children('td').children('a').attr('class') !== 'new') {
            children = $(e).children('td').children('a').attr('href');
            let wiki_url = 'https://en.wikipedia.org' + children;
            console.log(`wiki_url = ${wiki_url}`);
            let airline_url = await getAirlineUrl(wiki_url);
            console.log(`airline_url = ${airline_url}`);
        }
    }
}
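If the one-page-at-a-time awaits prove too slow, the loop body could instead be turned into an array of promises and awaited together; a sketch under the same assumptions (request-promise, the question's selectors, and getAirlineUrl returning a promise), which fires all the detail-page requests at once:

export async function getAirlinesWebsites(req, res) {
    let response = await request(options_mainpage);
    let $ = cheerio.load(response);
    // gather every qualifying row's wiki URL first
    const wiki_urls = Array.from($('tr'))
        .map(e => $(e).children('td').children('a'))
        .filter(a => a.attr('class') !== 'new' && a.attr('href'))
        .map(a => 'https://en.wikipedia.org' + a.attr('href'));
    // then resolve all the detail pages in parallel
    const airline_urls = await Promise.all(wiki_urls.map(getAirlineUrl));
    console.log(airline_urls);
}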