Scrape JS site with NodeJS, Nightmare and Cheerio - javascript

I am trying to scrape a website to get the scores of each team. I am running into an issue where my script is returning null content. I cannot see where I am going wrong and looking for some help.
JS:
const Nightmare = require('nightmare')
const cheerio = require('cheerio');
const fs = require('fs');
const nightmare = Nightmare({ show: true })
const url = 'https://www.mscl.org/live/scorecard/ed7941919f69b0e11e800fef/mHcehsPR9S86T3zQv';
nightmare
.goto(url)
.wait('body')
.wait('div#summaryTab.tab-pane.fade.in.table-responsive.borderless.active')
.evaluate(() => document.querySelector('div.col-md-6').innerHTML)
.end()
.then(response => {
console.log(getData(response));
}).catch(err => {
console.log(err);
});
let getData = html => {
data = [];
const $ = cheerio.load(html);
$('div').each((i, elem) => {
if(i === 0 ){
console.log($(elem).find('nth-child(1)').html());
}
});
return data;
}
The html I am scraping is here.
https://pastebin.com/R6syWDwD
The line where the scores are: 30 and 32
<div class="col-md-6">
<b>40 Overs Match</b><br>
<b>MVCC Combined</b> won the toss and chose Batting<br>
<b>Umpires: </b>No umpires were selected<br>
<b>Date: </b> 3/24/2021, 5:00:00 PM<br>
<b>Ground: </b>Acton Field 1<br>
<b>Result: TBD</b><br>
<b>MoM: </b> <br>
<hr>
<p><b>MVCC COMBINED XI - 147/10</b> (<b>O:</b> 12.5 | <b>RR:</b> 11.45)</p>
<p><b>MVCC United XI - 23/1</b> (<b>O:</b> 2.0 | <b>RR:</b> 11.50)</p>
<hr>
</div>
When I run this it returns nothing. No errors are being displayed either. What am i missing?

The jQuery Docs for nth-child says
jQuery's implementation of :nth- selectors is strictly derived from the CSS specification
So you propably have to provide an element to your nth-child(1)-pseudo-selector, to tell jQuery from which element it should select the nth-child of. try this:
console.log($(elem).find('b:nth-child(1)').html());
alternatively, just try to prefix nth-child(1) with a colon -> :nth-child(1)
Edit:
I just realized you are using innerHTML on your selected div, which actually returns the contents of the div without the wrapping div itself. But in getData you try to select the div with $('div') which then is actually not found.

Related

JavaScript stops functioning after adding type=module to src tag in HTML (Using Flask)

What I thought would be the easiest part of my project has turned into a Herculean effort. All I wanted to do was get data from a JSON file to then display on my website. Prior to using a JSON file, I hard coded some data to test my filter/search functionality, all of which I wrote in JavaScript. The code worked perfectly, so I decided to move the data to a JSON file as I am expecting to have a lot more data in the future and can't have it hardcoded. However, I have been unable to get data from the JSON file successfully. I tried using require('./data.json'), but apparently I can't just use require like that. I then tried importing the file, which only works if I go back to the html and add type="module" to the src tag. This then allows all of the data to display on the webpage, however, the function that allows me to filter by category no longer works. When I click on the buttons, I get no response. I used Inspect to get the console to find the error, and the output is:
Uncaught ReferenceError: filterProject is not defined
The search functionality still works, and I suspect this is because that code isn't inside a function. Thus, I don't know why filterProject is supposedly not defined when the other JS code works. Here is all of my code:
import projects from './data.json' assert { type: "json" };
const path = "http://localhost/static/images/";
//ADDING THE HTML, IGNORE
for(let i of projects){
let card = document.createElement("div");
card.classList.add("card", i["category"], "hide");
let imgContainer = document.createElement("div");
imgContainer.classList.add("image-container");
let imageOne = document.createElement("img");
imageOne.setAttribute("src", path.concat(i["imageOne"]));
imgContainer.appendChild(imageOne);
card.appendChild(imgContainer);
let container = document.createElement("div");
container.classList.add("container");
let name = document.createElement("h3");
name.classList.add("project-name");
name.innerText = i["projectName"].toUpperCase();
container.appendChild(name);
let student = document.createElement("h4");
student.classList.add("student-name");
student.innerText = i["studentName"].toUpperCase() + " mentored by " + i["mentor"].toUpperCase();
container.appendChild(student);
let category = document.createElement("h6");
category.innerText = i["category"].toUpperCase().replace("_", " ");
container.appendChild(category);
card.appendChild(container);
document.getElementById("projects").appendChild(card);
}
//FILTERING (DOESNT WORK)
function filterProject(value){
let buttons = document.querySelectorAll(".button-value");
buttons.forEach((button) => {
if(value.toUpperCase() == button.innerText.toUpperCase()){
button.classList.add("active");
}else{
button.classList.remove("active");
}
});
let elements = document.querySelectorAll(".card");
elements.forEach((element) => {
if(value == "all"){
element.classList.remove("hide");
}
else{
//having a space messes it up, make it _
if(element.classList.contains(value.replace(" ", "_"))){
element.classList.remove("hide");
}
else{
element.classList.add("hide");
}
}
});
}
//SEARCH (WORKS)
document.getElementById("search").addEventListener
("click", () => {
let searchInput = document.getElementById("search-input").value;
let elements = document.querySelectorAll(".student-name");
let cards = document.querySelectorAll(".card");
elements.forEach((element, index) =>{
if(element.innerText.includes(searchInput.toUpperCase())){
cards[index].classList.remove("hide");
}
else{
cards[index].classList.add("hide");
}
});
});
//INTIAL STATE
window.onload = () =>{
filterProject("all");
};
Here is the HTML just in case as well:
<div class ="wrapper">
<div id="search-container">
<input type="search" id="search-input" placeholder="Search student name here..."/>
<button id = "search">Search</button>
</div>
<div id ="buttons">
<button class = "button-value" onclick="filterProject('all')">All</button>
<button class = "button-value" onclick="filterProject('Creative Project')">Creative Project</button>
<button class = "button-value" onclick="filterProject('Developing Voice')">Developing Voice</button>
<button class = "button-value" onclick="filterProject('Interdisciplinary Fusion')">Interdisciplinary Fusion</button>
<button class = "button-value" onclick="filterProject('Personal Writing')">Personal Writing</button>
<button class = "button-value" onclick="filterProject('Curriculum Designer')">Curriculum Designer</button>
<button class = "button-value" onclick="filterProject('Internship')">Internship</button>
</div>
<div id = projects></div>
</div>
<script type = "module" src = "{{ url_for('static',filename='javascript/script.js') }}"></script>
If it matters, I am using Flask as my web framework. I'm not sure if that has any impact on anything, but it has created some obstacles when I've tried to create a live server to solve this issue. Thanks in advance for any replies!
What you're looking for is how to load json files locally.
One solution is
Start a local server e.g. http://localhost:8080
Then use fetch() to retrieve the json file
For example, if your data.json file was located within the same folder where you have your html file and where you started your server, then your code could be something like
fetch("http://localhost:8080/data.json")
.then((response) => {
return response.json();
})
.then((data) => {
// Add code to process your data
})

Restcountries API - getting names of currencies dynamically into HTML through Javascript

I am new to Javascript and I've been learning how to import a country's attributes into an HTML element. Some of you might recognize this code, it's from a tutorial, which is now outdated. I've been searching around for an updated solution, but couldn't find any.
First I have the function to fetch the data:
const getCountryData = function (country) {
fetch(`https://restcountries.com/v3.1/name/${country}`)
.then(response => response.json())
.then(data => renderCountry(data[0]));
};
Then I call that function, supplying a country getCountryData('czechia') to infuse it into an element like this:
const renderCountry = function(data, className = '') {
const html = `
<article class="country ${className}">
<img class="country__img" src="${data.flags.svg}" />
<div class="country__data">
<h3 class="country__name">${data.name.common}</h3>
<h4 class="country__region">${data.region}</h4>
<p class="country__row">${(+data.population / 1000000).toFixed(1)} people</p>
<p class="country__row">${data.fifa}</p>
</div>
</article>
`
countriesContainer.insertAdjacentHTML
('beforeend', html);
countriesContainer.style.opacity = 1;
}
This works fine, but the issue is that at the end of the HTML, where I input {data.fifa} I want to have the name of the country's main currency instead. Unfortunately, the data is structured in a way, that in order to have the currency's name displayed, I first have to call it's short name, as shown below:
"currencies": {
"CZK": {
"name": "Czech koruna",
"symbol": "Kč"
}
},
If I call the {data.currencies} into the string, I'm just gonna get an empty object back. If I call it as {currencies.CZK.name}, it works, but the issue is that if I call Sweden, for example, it won't display anything, because then it'd need to be {currencies.SEK.name}. How do I get around this? How can I can call a currency's name without having to incorporate CZK, SEK, USD, EUR etc. into the variable?
Any help is appreciated.
You can transform that object into an array:
const currencyArray = Object.values(data.currencies)
console.log(currencyArray[0].name)
If the country has many currencies, just change the index from 0 to 1, 2, ...

Axios & Cheerio - can't select more than one table row

Have just been playing around with axios & cheerio. I was attempting to scrape table data from the world rugby rankings website. I would like to return the top 10 rows of the table.
https://www.world.rugby/tournaments/rankings/mru
Presently I can only retrieve the first row of the table and I can't figure out why.
const axios = require('axios')
const cheerio = require('cheerio')
async function getWorldRankings() {
try {
const siteUrl = 'https://www.world.rugby/tournaments/rankings/mru'
const { data } = await axios({
method: "GET",
url: siteUrl,
})
const $ = cheerio.load(data)
const elemSelector = 'body > section > div.pageContent.flex-content > div:nth-child(2) > div.column.large-8 > div > section > section > div.fullRankingsContainer.large-7.columns > div > div > table > tbody > tr'
$(elemSelector).each((parentIndex, parentElem) => {
$(parentElem).children().each((childIndex, childElem) => {
console.log($(childElem).text());
})
})
} catch (err) {
console.error(err);
}
}
getWorldRankings()
Result:
>node index.jsx
Position
Teams
Points
For full context and credit I was playing around with this guide:
https://www.youtube.com/watch?v=5YCuUCRS_Ks (I'm using the same code just different url's and css selectors - and I can retrieve table rows as intended with his example coinmarketcap.com and many other websites).
For the world rugby rankings site - even though the html is available in dev tools is the data being injected in some way that makes it unselectable? (I have no idea what I am talking about just throwing out a guess).
Thanks for any help.
node v16.4.2
"axios": "^0.22.0",
"cheerio": "^1.0.0-rc.10",
The data for that table is being loaded later with AJAX and is not initially loaded with the page, you therefore cannot select it with Cheerio. The good news is you don't even need Cheerio for this. If you take a look at the network requests tab in your browser's development tools, you'll see the AJAX request being made uses the following URL to load JSON formatted data --the data you want-- into the page:
https://cmsapi.pulselive.com/rugby/rankings/mru?language=en&client=pulse

Get text of a tag in different format in cheerio

I'm using cheerio npm package to extract the datas from the websites for my node app
Below is the snippet of the code from my app:
const response = await axios.get(
"https://cors-anywhere.herokuapp.com/https://example.com/search/?q=" +
param
);
console.log("Response got");
const $ = cheerio.load(response.data);
const chunk1 = $("h1.text-primary a ").text();
console.log(chunk1);
With the above code I'm targetting the text from the following tag:
<h1 class="text-primary font-weight-bold media-heading h4" itemprop="title">
<a href="/teacher-107/" class="no-uline" title="Teacher">
Teacher
</a>
<meta itemprop="employmentType" content="Full Time">
</h1>
The issue I'm having is there are 4 tag block like this, cheerio does gets the text of the tag but gives text from all 4 tag as a single string like: Teacher Plumber Police Army but would like to have it in array, object or separate string.
How do I do that ??
Ok, I figured it out myself
let jobTitles=[];
$('h1.text-primary ').find('a').each(function(index, element) {
jobTitles.push($(element).text());
index
});

collections keeps retrieving me "undefined"

I just cant seem to get the snapshot docs to show me the data from the server.
i have checked the collection. it is called "creaciones" without uppercase. I have 1 document and I have files written already. I've made no spelling mistakes whatsoever. I made this work before and now i cant.
db.collection('usuarios').get().then(function(querySnapshot) {
querySnapshot.forEach(function(doc){
console.log(doc.data);
});
setupGrilla(snapshot.docs);
});
//Setup grilla
const setupGrilla = (data) => {
let html = '';
data.forEach(doc => {
const grilla = doc.data();
const creacion = `
<div>
<img src='jpg/${grilla.tipoCreacion}.png' alt='tipoCreacion'>
<h2>${grilla.nombreCreacion}</h2>
<img src='Imagenes/${grilla.nombreFoto}' alt='nombrefoto' class='imagen'>
<span>piezas: ${grilla.piezas}</span>
<span class='separador'></span>
<span>tiempo: ${grilla.tiempo} minutos</span>
<p>padre: ${grilla.ayuda} </p>
<p class='puntos'>Puntos: ${grilla.puntos} </p>
</div>
`;
html += creacion;
});
}
//get Data
db.collection('creaciones').get().then(snapshot => {
setupGrilla(snapshot.docs);
console.log(snapshot.docs);
});
I expect it to show fetch the database data.
db.collection('usuarios').get().then(function(querySnapshot) {
querySnapshot.forEach(function(doc){
console.log(doc.data);
});
setupGrilla(snapshot.docs);
});
That code is just what I have tried before. No need to look into that because I don't have it written at the moment.
You are calling setupGrilla with a snapshot.docs argument, but snapshot is never defined.
Try querySnapshot.docs instead, or rename querySnapshot in snapshot.
You are also passing the wrong argument to your method
db.collection('usuarios').get().then(function(snapshot) {
snapshot.forEach(function(doc){
console.log(doc.data);
});
setupGrilla(snapshot); // <-- Here
});

Categories

Resources