Nodejs Scraper isn't moving to next page(s) - javascript

Hey guys this is a follow on from my other question, i have created a Nodejs Scraper that doesnt seem to want to go through the pages, it stays on the first. my source code is below
const rp = require('request-promise');
const request = require('request');
const otcsv = require('objects-to-csv');
const cheerio = require('cheerio');
//URL To scrape
const baseURL = 'xxx';
const searchURL = 'xxxx';
//scrape info
const getCompanies = async () => {
// Pagination test
for (let index = 1; index <= 20; index = index + 1) {
const html = await rp.get(baseURL + searchURL + index);
const $ = await cheerio.load(html);
console.log("Loading Pages....");
console.log("At page number " + index);
// end pagination test
//const htmls = await rp(baseURL + searchURL);
const businessMap = cheerio('a.business-name', html).map(async (i, e) => {
const link = baseURL + e.attribs.href;
const innerHtml = await rp(link);
const emailAddress = cheerio('a.email-business', innerHtml).prop('href');
const name = e.children[0].data || cheerio('h1', innerHtml).text();
const phone = cheerio('p.phone', innerHtml).text();
return {
// link,
name,
emailAddress: emailAddress ? emailAddress.replace('mailto:', '') : '',
phone,
}
}).get();
return Promise.all(businessMap);
}
};
console.log("Finished Scraping.... Now Saving!")
//save to CSV
getCompanies()
.then(result => {
const transformed = new otcsv(result);
return transformed.toDisk('./output.csv');
})
.then(() => console.log('Scrape Complete :D '));
As you can see I have tried a few different ways to make this happen so any help will be gratefully appreciated.

Related

how to use trim in fetch url

how to retrieve the verif code, here I try to do the next regex using trim but an error message appears "TypeError: Cannot read properties of undefined (reading 'trim')"
and I just want to fetch the verification code, like in the image
my code
const checkInboxUrl = 'https://getnada.com/api/v1/inboxes/';
const getMessageUrl = 'https://getnada.com/api/v1/messages/html/';
const refreshMailboxUrl = 'https://getnada.com/api/v1/u/';
/* eslint-disable no-unused-vars */
class Getnada {
constructor() {
this.email = '';
this.verificationCode = '';
}
async getEmail(email = 'urmxhbwrz#getnada.com') {
this.email = email;
return this;
}
async getMailbox(pattern, sleepTime = 5000) {
await sleep(sleepTime);
const timestamp = Math.floor(new Date().getTime() / 1000);
const refreshMailboxResponse = await fetch(refreshMailboxUrl + this.email + '/' + timestamp);
const checkInboxResponse = await fetch(checkInboxUrl + this.email);
const checkInboxJson = await checkInboxResponse.json();
const getMessageResponse = await fetch(getMessageUrl + checkInboxJson.msgs[0].uid);
const readInbox = await getMessageResponse.text();
const regex = new RegExp(pattern);
const verificationCodeMatch = regex.exec(readInbox);
this.verificationCode = verificationCodeMatch[1].trim();
console.log(verificationCodeMatch)
return this;
}
}
const getnada = new Getnada();
async function main() {
console.log((await getnada.getEmail()))
console.log((await getnada.getMailbox()))
}
main();
https://getnada.com/api/v1/messages/html/8lra5CwOQcHvja3mpQZgO7G5RPTS3W
To retrieve the verification code, you can try to change this lines :
const regex = new RegExp(pattern);
const verificationCodeMatch = regex.exec(readInbox);
this.verificationCode = verificationCodeMatch[1].trim();
to :
const verificationCodeMatch = pattern.exec(readInbox);
this.verificationCode = verificationCodeMatch[0].trim();
And change this line too :
console.log((await getnada.getMailbox()))
to :
console.log((await getnada.getMailbox(/\b\d{6,6}\b/)));
This regex /\b\d{6,6}\b/ will filter out strings containing exactly 6 digits of numbers which is the verification code.

not able to kill process in node.js

I am working on a node script where I will be opening and closing the browser using child process but its opening the browser but not able to close it on pressing Ctrl + C.
Here's my code:
const { kill } = require('process');
var start = (process.platform == 'darwin'? 'open': process.platform == 'win32'? 'start': 'xdg-open');
//open browser
var url = 'https://www.google.com';
var cmd = start + ' ' + url;
var child = require('child_process').exec(cmd, function(error, stdout, stderr) {
if (error) {
console.log('exec error: ' + error);
}
});
process.on('SIGINT', function() {
kill(child.pid, 'SIGINT');
fs.writeFile(./scraper.js)
process.exit();
});
scarper.js
const cheerio = require('cheerio');
const request = require('request-promise');
const fs = require('fs');
const link="https://www.imdb.com/chart/top/?ref_=nv_mv_250";
(async()=>{
const html = await request(link);
const $ = cheerio.load(html);
const table = $('.lister-list');
const rows = table.find('tr');
const data = [];
for(let i=0;i<rows.length;i++){
const row = rows.eq(i);
const cols = row.find('td');
const rank = i+1;
const title = cols.eq(1).find('a').text().trim();
const year = cols.eq(1).find('span').text().trim();
const rating = cols.eq(2).text().trim();
const votes = cols.eq(4).text();
const image = cols.eq(1).find('img').attr('src');
const movie = {
rank: rank,
title: title,
year: year,
rating: rating,
votes: votes,
image: image
};
data.push(movie);
}
console.log(data);
fs.writeFile('./data.json',JSON.stringify(data));
})();
I am working on a node.js app where I am using socket.io to send data to multiple clients but the socket is only able to send data to one client i.e if I open my webpage in two tabs its not working in both the tabs but when I open just 1 tab of webpage it is able to transmit the data.I dont know why? Can someone help,Here's my code:
try to use
setTimeout((function() {
return process.abort();
}), 5000);
or try
setTimeout((function() {
return process.kill(process.pid);
}), 5000);

How can I get the data from an api request from the then() method, so I can process the data outside of the function?

How can I get the data from an api request from the then() method, so I can process the data outside
of the function. Can I store the data in a state object for example and how to do that.
// openweathermap async await api request
async function getData(){
try{
const url = `http://api.openweathermap.org/data/2.5/forecast?
q=${city}&units=metric&appid=${key}`;
const data = await axios(url)
console.log(data)
return data;
}catch(error){
temperature.insertAdjacentHTML('afterend', '<div id="error">Oooops
something went wrong!</div>');
}
}
getData().then( data => {
const temp = data.data.list[0].main.temp;
temperature.textContent = temp.toFixed(1) + ' \xB0' + "C ";
const temp2 = data.data.list[1].main.temp;
temperature2.textContent = temp2.toFixed(1) + ' \xB0' + "C ";
const forecast = data.data.list[0].weather[0].description;
foreCast.textContent = forecast.toLowerCase();
const forecast2 = data.data.list[1].weather[0].description;
foreCast2.textContent = forecast2.toLowerCase();
const icon = data.data.list[0].weather[0].icon
weatherIcon.insertAdjacentHTML('afterbegin', `<img
src="http://openweathermap.org/img/w/${icon}.png">`)
const icon2 = data.data.list[1].weather[0].icon
weatherIcon2.insertAdjacentHTML('afterbegin', `<img
src="http://openweathermap.org/img/w/${icon2}.png">`)
day.textContent = newTime;
day2.textContent = newTime2;
})
Should axios(url) be axios.get(url)?
Then you can handle the data inside the callback function:
getData().then(data => console.log(data))
Get data is already an async function which means it returns a Promise. Just assign variable data by calling getData function. This should be done in an async function.
(async () => {
const data = await getData();
const temp = data.data.list[0].main.temp;
temperature.textContent = temp.toFixed(1) + ' \xB0' + "C ";
const temp2 = data.data.list[1].main.temp;
temperature2.textContent = temp2.toFixed(1) + ' \xB0' + "C ";
const forecast = data.data.list[0].weather[0].description;
foreCast.textContent = forecast.toLowerCase();
const forecast2 = data.data.list[1].weather[0].description;
foreCast2.textContent = forecast2.toLowerCase();
const icon = data.data.list[0].weather[0].icon
weatherIcon.insertAdjacentHTML('afterbegin', `<img src="http://openweathermap.org/img/w/${icon}.png">`)
const icon2 = data.data.list[1].weather[0].icon
weatherIcon2.insertAdjacentHTML('afterbegin', `<img src="http://openweathermap.org/img/w/${icon2}.png">`)
day.textContent = newTime;
day2.textContent = newTime2;
})();
Found a solution: Create a class with the getData() method, create a function to control the data and store it in the state object.
// Create class
class Weather {
constructor(){
}
async getData(){
try{
const url = `http://api.openweathermap.org/data/2.5/forecast?
q=${city}&units=metric&appid=${key}`;
const data = await axios(url)
this.data = data
}catch(error){
temperature.insertAdjacentHTML('afterend', '<div id="error">Oooops something
went wrong!</div>');
}
}
}
// Create state object, declare function to get data and store data in state object
const state = {}
const controlData = async () => {
state.data = new Weather()
await state.data.getData()
const temp = state.data.data.data.list[0].main.temp;
temperature.textContent = temp.toFixed(1) + ' \xB0' + "C ";
const temp2 = state.data.data.data.list[1].main.temp;
temperature2.textContent = temp2.toFixed(1) + ' \xB0' + "C ";
const forecast = state.data.data.data.list[0].weather[0].description;
foreCast.textContent = forecast.toLowerCase();
const forecast2 = state.data.data.data.list[1].weather[0].description;
foreCast2.textContent = forecast2.toLowerCase();
const icon = state.data.data.data.list[0].weather[0].icon
weatherIcon.insertAdjacentHTML('afterbegin', `<img
src="http://openweathermap.org/img/w/${icon}.png">`)
const icon2 = state.data.data.data.list[1].weather[0].icon
weatherIcon2.insertAdjacentHTML('afterbegin', `<img
src="http://openweathermap.org/img/w/${icon2}.png">`)
day.textContent = newTime;
day2.textContent = newTime2;
let sRise = convertTime(state.data.data.data.city.sunrise);
let sSet = convertTime(state.data.data.data.city.sunset);
sunRise.textContent = `Sunrise ${sRise}`;
sunSet.textContent = `Sunset ${sSet}`;
}
controlData()

cant pass data from textbox inside a loop

so im getting my data from Darksky api, i have elements generated with loops, and i update the text content of those via loop of the api data array. whenever i search, everthing else static changes values, not the one inside the loop
function search(ele) {
if(event.key === 'Enter') {
var url3 = 'https://geocode.xyz/' + ele.value +'?json=1' //this is where i convert the long,lat to city name
fetch(url3)
.then(z => z.json())
.then(z => {
locres = (z.latt+','+z.longt)
render()
renderLoop()
})
}
}
function renderLoop(){
var proxyUrl = 'https://cors-anywhere.herokuapp.com/';
var url1 = 'https://api.darksky.net/forecast/c34e122a56ae30a3090687878bce72c3/' + locres + '?units=auto' //i have to use proxy because of CORS
fetch(proxyUrl + url1)
.then(x => x.json())
.then(x => {
var skycons = new Skycons({"color": "white"});
skycons.set("icon0", Skycons = x.currently.icon);
skycons.play();
console.log(x.daily.data)
for(i=0;i<8;i++){
console.log(x.daily.data[i].time)
console.log(x.daily.data[i].summary)
const divs = document.createElement('div')
divs.className = ('week-day-container')
const divsholdr = document.querySelector('.week-stage')
const canv = document.createElement('canvas')
canv.id = ('icons'+(i+1))
canv.height = 100
canv.width = 70
divs.appendChild(canv)
divsholdr.appendChild(divs)
const dates = document.createElement('p')
dates.textContent = x.daily.data[i].time
divs.appendChild(dates)
const temp = document.createElement('p')
temp.textContent = 'High: '+ x.daily.data[i].temperatureHigh + 'Low: ' + x.daily.data[i].temperatureLow
divs.appendChild(temp)
const summ = document.createElement('p')
summ.textContent = x.daily.data[i].summary
divs.appendChild(summ)
}
for(y=0;y<8;y++){
skycons.set('icons'+(y+1), Skycons = x.daily.data[y].icon)
}
})
}

Scraping IMDb episodes using Cheerio.js - only first page of TV episodes is returned

Working on scraping TV episodes from IMDb (Breaking Bad in the example below). The problem is when implementing the for loop, only the first iteration of j is returned.
My assumption is the return statement is exiting the loop but I'm unsure how to fix the problem.
const fetch = require('node-fetch');
const cheerio = require('cheerio');
const searchUrl = 'https://www.imdb.com/find?s=tt&ttype=tv&ref_=fn_tv&q=';
const movieUrl = 'https://www.imdb.com/title/';
async function getEpisodes(searchTerm) {
//const imdbID = await getID(searchTerm);
//const numSeasons = await getSeasons(imdbID);
const imdbID = 'tt0903747';
const numSeasons = 5;
const episodes = [];
for (let j = 1; j <= numSeasons; j++) {
return fetch(`${movieUrl}${imdbID}/episodes?season=${j}`)
.then(response => response.text())
.then(body => {
const $ = cheerio.load(body);
$('div[itemProp="episodes"]').each(function (i, element) {
const airdate = $(element).find('.airdate').text().trim();
const episodeTitle = $(element).find('a[itemProp="name"]').text().trim();
const votes = $(element).find('.ipl-rating-star__total-votes').text().trim().match(/\(([^)]+)\)/)[1];
const rating = $(element).find('.ipl-rating-star ').find('.ipl-rating-star__rating').text().trim().slice(0, 3);
episode = {
season: j,
episodeTitle,
airdate,
votes,
rating
};
episodes.push(episode);
});
return episodes; //Only season 1 is returned.
});
}
}
Let's rewrite the function using async await style. This way we make sure we fire fetch numSeasons times, await all of them, and process them one by one.
async function processResponse(response, season) {
const body = await response.text();
const $ = cheerio.load(body);
let episodes = [];
$('div[itemProp="episodes"]').each(function (i, element) {
const airdate = $(element).find('.airdate').text().trim();
const episodeTitle = $(element).find('a[itemProp="name"]').text().trim();
const votes = $(element).find('.ipl-rating-star__total-votes').text().trim().match(/\(([^)]+)\)/)[1];
const rating = $(element).find('.ipl-rating-star ').find('.ipl-rating-star__rating').text().trim().slice(0, 3);
episode = {
season,
episodeTitle,
airdate,
votes,
rating
};
episodes.push(episode);
});
return episodes;
}
async function getEpisodes(searchTerm) {
//const imdbID = await getID(searchTerm);
//const numSeasons = await getSeasons(imdbID);
const imdbID = 'tt0903747';
const numSeasons = 5;
let promises = [];
for (let j = 1; j <= numSeasons; j++) {
promises.push(fetch(`${movieUrl}${imdbID}/episodes?season=${j}`));
}
const responses = await Promise.all(promises);
return responses.reduce((accumulator, response, index) => {
return accumulator.concat(await processResponse(response, index + 1));
}, []);
}

Categories

Resources