Scraping links from website using Node.js, request, and cheerio? - javascript

I'm trying to scrape links on my school's course schedule website using Node.js, request, and cheerio. However, my code is not reaching all subject links.
Link to course schedule website here.
Below is my code:
const express = require('express');
const request = require('request');
const cheerio = require('cheerio');

const app = express();

/**
 * GET /subjects
 *
 * Downloads the course schedule page, logs every link found inside a
 * `.courseList_section` element as "text --> href", and returns the same
 * list to the caller as a JSON array of `{ text, link }` objects.
 */
app.get('/subjects', function(req, res) {
  const scheduleUrl = 'http://courseschedules.njit.edu/index.aspx?semester=2016s';

  request(scheduleUrl, function(error, response, body) {
    if (error) {
      console.log('There was an error!');
      // Always answer the client; otherwise the HTTP request hangs until it times out.
      res.status(502).send('There was an error!');
      return;
    }

    const $ = cheerio.load(body);
    const links = [];
    $('.courseList_section a').each(function() {
      const text = $(this).text();
      const link = $(this).attr('href');
      console.log(text + ' --> ' + link);
      links.push({ text: text, link: link });
    });

    res.json(links);
  });
});

app.listen('8080');
console.log('Magic happens on port 8080!');
My output can be found here.
As you can see from my output, some links are missing. More specifically, the links from sections 'A', 'I (Continued)', and 'R (Continued)' are missing. These are also the first sections of each column.
Each section is contained in its own div with class name 'courseList_section' so I don't understand why '.courseList_section a' doesn't loop through all links. Am I missing something obvious? Any and all insight is very appreciated.
Thank you in advance!

The problem isn't your code; it's the site you're trying to parse. Its HTML is invalid. You're trying to parse everything inside .courseList_section, but the markup looks like this:
<span> <!-- Opening tag -->
<div class='courseList_section'>
<a href='index.aspx?semester=2016s&ƒ=ACC '>ACC - Accounting/Essex CC</a>
</span> <!-- Invalid closing tag for the first span, meaning that .courseList_section will be closed instead -->
<!-- Suddenly this link is outside the .courseList_section tag, meaning that it will be ignored by cheerio -->
<a href='index.aspx?semester=2016s&subjectID=ACCT'>ACCT - Accounting</a>
<!-- and so on -->
The solution: fetch all links and ignore those that aren't related to any course.
const request = require('request');
const cheerio = require('cheerio');

const SCHEDULE_URL = 'http://courseschedules.njit.edu/index.aspx?semester=2016s';

// Fetch the schedule page and print every course link as "text --> href".
request(SCHEDULE_URL, function(error, response, body) {
  if (error) {
    // Include the underlying error so the failure can actually be diagnosed.
    return console.error('There was an error!', error);
  }

  const $ = cheerio.load(body);

  // The page's markup is broken, so select every anchor on the page and
  // keep only those whose href carries a subjectID parameter.
  $('a').each(function() {
    const text = $(this).text();
    const link = $(this).attr('href');
    if (link && link.includes('subjectID')) {
      console.log(text + ' --> ' + link);
    }
  });
});
Next time, try looking directly at the HTML and see if it looks okay. If it looks like ****, pass it through an HTML beautifier and inspect it again. Not even the beautifier could handle this markup, which indicated that something was wrong with the tags.

Related

Is there way to have Node redirect to a new page with JSON?

I'm trying to allow users to type a search query into a box on an index.html page and have it send them to a results.html page with results displayed.
Currently I have this working only on an index.html page where the old HTML is removed and replaced with the search results.
//Pertinent Node code:
// Root route: send visitors to the static search page.
app.get('/', function(req, res) {
  res.header("Access-Control-Allow-Origin", "*");
  res.redirect('index.html');
});

// A search box in index.html calls the /search below
/**
 * GET /search
 *
 * Runs the SQL query and replies with `{"result": <rows>}` serialized as JSON.
 * On a database error it replies with HTTP 500 instead of throwing.
 */
app.get('/search', function(req, res) {
  res.header("Access-Control-Allow-Origin", "*");
  // NOTE(review): read but not used by the placeholder query below; the client
  // code shown with this snippet sends `search`, not `item_id` — confirm.
  const item_id = req.query.item_id;
  const sql_query = "a SQL query";

  connection.query(sql_query, function(err, result, fields) {
    if (err) {
      // Throwing from this asynchronous callback would take the whole server
      // down; report the failure to this one client instead.
      console.error(err);
      res.status(500).send(JSON.stringify({ error: "Query failed" }));
      return;
    }
    console.log("Sent JSON to client");
    res.send(JSON.stringify({ result: result }));
  });
});
//Client-side:
/**
 * Reads the search box, strips angle brackets from the term, requests the
 * matching results from the server, and hands the parsed JSON (plus the
 * original, unmodified term) to displaySearchResults.
 */
function get() {
  var search = document.getElementById("search").value;
  // Remove every "<" and ">"; replace() with a string pattern would only
  // drop the first occurrence of each.
  var validated = search.replace(/[<>]/g, "");
  // Percent-encode the term so characters such as "&", "#" or spaces
  // cannot corrupt the query string.
  var url = host + "/search?search=" + encodeURIComponent(validated);
  fetch(url, {method : 'GET'})
    .then(checkStatus)
    .then(function(responseText) {
      var json = JSON.parse(responseText);
      displaySearchResults(json, search);
    })
    .catch(function(error) {
      // Surface failures instead of silently swallowing them.
      console.error(error);
    });
}
What I'm trying to do is similar to what you might see on Google where clicking search send you to a new page with a url like: "https://www.google.com/searchresults?query=bicycles" that displays a different HTML page from index.html (such as results.html) with the JSON from index.html being moved to and processed in results.html.
How can I make this happen?
Adding some details to what Brad kindly mentioned in comments.. The full solution is:
<form action="search.html" method="GET" name="search">
Then in the Javascript file for search.html, access the form data with:
// window.location.search is the raw query string of the current URL, including the leading "?"
var search = window.location.search;
If you need to get the "?search=" out of the data, instead use:
// Parse the query string and read the decoded value of the "search" field.
// Unlike stripping "?search=" by hand, this decodes "+"/"%20" etc. and still
// works when other parameters are present. Falls back to "" when absent.
var search = new URLSearchParams(window.location.search).get("search") || "";

jQuery get method not working properly with two parameters passed to request

I am trying to send firstname and lastname to an API request using the jQuery get method. It works fine with only one parameter, i.e. with only the first name, but does not work when I add lastname to the request.
It works fine if url ="http://localhost:5000/name?firstname="+h1;
but does not work if url = "http://localhost:5000/name?firstname="+h1+"&lastname="+h2; In later case, the desired output is displayed for a sec and then disappears and url changes to "http://localhost:5000/?", the function is called from "http://localhost:5000/"
Here is my javascript code:
<script type="text/javascript">
// Once the DOM is ready, make the submit button request the full name from the API.
$(document).ready(function(){
  // Writes the server's reply into the result container.
  function showResult(data){
    $('.result').html(data);
  }

  $("#submitButton").click(function(e){
    var h1 = $("#handle1").val();
    var h2 = $("#handle2").val();
    var base = "http://localhost:5000/name?firstname=";
    var u = base + h1 + "&lastname=" + h2;
    // With only ?firstname=<h1> the request works, though lastname is shown as undefined in the output
    alert(u);
    $.get(u, showResult);
  });
});
</script>
and here is my Express API code:
var express = require('express');
var path = require("path");

var app = express();
const PORT = process.env.PORT || 5000

// Start listening and announce the port once the server is up.
app.listen(PORT, function(){
  console.log(`server running on port ${PORT}`);
})

// GET /name?firstname=..&lastname=.. -> "Full Name: <firstname> <lastname>"
app.get('/name', function(req, res){
  var query = req.query;
  res.send(`Full Name: ${query.firstname} ${query.lastname}`);
});
Your Ajax $.get request should look like the code snippet below. If the served HTML is also running via http://localhost:5000/, then you can omit the host entirely and use a relative URL.
// Pass the parameters as a data object so jQuery builds and encodes the
// query string; 'name' is resolved relative to the current page.
var nameParams = { firstname: h1, lastname: h2 };
$.get('name', nameParams, function(data) {
  $('.result').html(data);
});
The name in $.get corresponds to the route /name

Ajax request to server from html page doesn't work

I made a server on NodeJs using module Express. Now I want to implement a request from html page with $.ajax by clicking a button. I want to get data from server in json format or in text format, it doesnt matter, but it doesn't work. Why?
And plus why does ajax request reload the html page while it shouldn't?
Server part:
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var url = require("url");

var app = express();

/**
 * GET /scrape
 *
 * Downloads the water-level page and replies with
 * `{ time, waterlevel }`, taken from the first timestamp cell and the first
 * value cell of #waterleveltable. Replies with HTTP 502 if the download fails.
 */
app.get('/scrape', function (req, res) {
  console.log("Someone made request");
  // Keep the target in its own local; assigning to `url` would overwrite the
  // `url` module required above.
  var targetUrl = 'http://spun.fkpkzs.ru/Level/Gorny';

  request(targetUrl, function (error, response, html) {
    if (error) {
      // Without a reply here the browser's request would never complete.
      console.error(error);
      res.status(502).send({ error: "Could not fetch water level page" });
      return;
    }

    console.log("Inside request");
    var $ = cheerio.load(html);
    var json = {
      time: $("#waterleveltable td.timestampvalue").first().text(),
      waterlevel: $("#waterleveltable td.value").first().text()
    };
    res.send(json);
    console.log(json);
  });
});

app.listen('8081')
console.log('Server started on port 8081');
exports = module.exports = app;
This is my HTML page that makes the request:
<form>
<!-- button for sending a request to server-->
<button id="button12">Scrape water height</button>
</form>
<div id="response21">
Print
<!-- div for displaying the response from server -->
</div>
<p id="p1">___!</p>
<script>
// When the DOM is ready, hook the scrape button up to the /scrape endpoint.
$(document).ready(function () {
  // Logs the reply and appends both fields to the response container.
  function showReading(data) {
    console.log("Data recieved" + data);
    var target = $("#response21");
    target.append("Time: " + data.time);
    target.append("Waterlevel: " + data.waterlevel);
  }

  $("#button12").click(function () {
    console.log("Get sent.")
    // Json request
    $.get("http://localhost:8081/scrape", showReading, "json");
  });
});
Because of the fact that your button is inside a form, the default action of clicking the button will be to load a new page. This is what causes the reload of your page.
The simplest thing you can do is return false at the end of the click handler callback, which prevents the page from reloading.

obtain "server side" variable for use in html/javascript function

I'm trying to develop a language web application. It will scrape data from various websites and ask the participant numerous questions etc. I have created the file that scrapes the web page but I'm having difficulty getting the scraped variables from the node.js file. below are some extracts from the node js file:
/**
 * Picks a random word from the list, downloads its Collins Dictionary page,
 * and logs the word followed by the text of each `div.homograph-entry`
 * element found on that page.
 */
var pword = function() {
  var request = require("request");
  var cheerio = require("cheerio");
  var words = ['abash', 'aberrant' /* , ...rest of the word list */];
  // Derive the index from the list itself so it can never point past the end.
  var aa = Math.floor(Math.random() * words.length);
  var A = words[aa];
  var urlcollinsdictionary = "http://www.collinsdictionary.com/dictionary/english/";
  var Newurldictionary = urlcollinsdictionary + A;
  request({
    uri: Newurldictionary,
  }, function(error, response, body) {
    if (error) {
      // There is no usable body when the request fails; report it and stop.
      console.error(error);
      return;
    }
    var $ = cheerio.load(body);
    $('div.homograph-entry').each(function() {
      var link = $(this);
      var text1 = link.text();
      console.log(A);
      console.log(text1);
    });
  });
}
My node js code works fine. My problem arises when I try to use its "text1" variable or others of its kind in my HTML/javascript coding. I've tried "getelementbyid", "variable exports/imports", even the "%%" method, but still no luck. As you've probably guessed, I'm new to programming. Please, please, please help me by making alterations/additions to the HTML code beneath to enable access to the text1 variable and others like it.
<html>
  <head>
    <title>language game</title>
  </head>
  <body>
    <!-- Visible content lives in <body>; <head> may only hold metadata such as <title> -->
    language game
    <br>
    <input type="button" onclick="word()" value="Professional Vocab">
    <script>
      // Click handler for the button above: shows the value of text1.
      function word(){
        alert(text1);
      }
    </script>
  </body>
</html>
Thanks in advance...
You can add similar code in your node js file to respond to api calls
var express = require("express");
var app = express();

// GET /sendVar replies with the scraped text.
// Express route paths must start with "/", otherwise the route never matches.
// NOTE(review): text1 must be visible in this scope (for example stored in an
// outer variable that the scraping callback assigns) — confirm in your file.
app.get("/sendVar", function(request, response){
  response.send(text1);
});
you will need to do npm install express in your command line
In your front-end side you can use jquery and make an AJAX call inside function word()
/**
 * Requests the scraped value from the server's sendVar endpoint and shows
 * whatever comes back in an alert box.
 */
function word() {
  var showValue = function(value) {
    alert(value);
  };
  $.get("url/sendVar", showValue);
}

Update part of a page in jade?

I'm trying to make a search and send back the result to the same site, I've got the search to work but I can't get the result to be sent back. I want the start site to render the information without page reload.
How can I send the search result back to the index.jade page without having to update it?
/**
 * Builds one search-result link per item and sends the resulting HTML
 * fragment to the client.
 *
 * @param {?Array<Object>} items - records to list; each is read for `_id`
 *   (used in the share link) and `document` (used as the link content).
 *   May be null/undefined when there is nothing to show.
 * @param {Object} res - response object; its `send` method receives the HTML.
 */
function pagelist(items, res) {
  var html = '';
  if (items != null) {
    items.forEach(function(item) {
      // Each result is a complete, properly closed <a> element with a quoted href.
      html += '<a href="share/' + item._id + '" class="searchElement">';
      html += item.document;
      html += '</a>';
    });
  }
  // Always respond, even with an empty fragment, so the request never hangs.
  res.send(html);
}
index.jade
extends layout
//- Jade nesting is defined by indentation: `child` is declared inside `content`
//- so that templates extending this one can fill it in.
block content
  block child
child.jade
extends index
//- The block's content must be indented beneath it; !{...} outputs the value unescaped.
block child
  !{document}
You can do it the following way:
First, you could update your index.jade like this:
extends layout
//- #content is the container whose innerHTML the client-side script replaces;
//- Jade nesting is defined by indentation.
block content
  #content
    block child
And then, there should be some sort of function you call to get your results. I'll call it getResults.
In the callback of that function you can now do the following:
// When the results arrive, swap them into the #content container.
getResults(function(results){
  var container = document.getElementById("content");
  container.innerHTML = results;
});
I hope that helps.
UPDATE
I'll give you a complete example:
server.js
var express = require("express");
// Running counter so that every generated result gets a fresh number.
var i = 0;

/**
 * Produces three dummy result <div>s, numbered from the running counter,
 * and passes the concatenated HTML string to the callback.
 */
function getResults(cb){
  var html = "";
  for (var n = 0; n < 3; n++) {
    html += "<div>Result " + i + "</div>";
    i += 1;
  }
  cb(html);
}
var app = express();
app.set("view engine","jade");

// Full page: render the "page" view with an initial batch of results.
app.get("/",function(req,res){
  getResults(function(initialResults){
    var locals = {results:initialResults};
    res.render("page",locals);
  });
});

// Fragment endpoint: replies with just the results HTML.
app.get("/results",function(req,res){
  getResults(function(fragment){
    var headers = {"Content-Type":"text/html"};
    res.writeHead(200,"OK",headers);
    res.end(fragment);
  });
});

app.listen(80);
views/page.jade
doctype html
html
  head
    //- `script.` starts a plain-text block: everything indented beneath it is emitted as JavaScript.
    script.
      // Fetch a fresh batch of results and swap it into #content without reloading the page.
      function update(){
        var req = new XMLHttpRequest();
        req.open("GET","/results");
        req.onreadystatechange = function(){
          // readyState 4 means the response has been fully received
          if(req.readyState === 4){
            document.getElementById("content").innerHTML = req.responseText;
          }
        }
        req.send();
      }
  body
    //- `!=` inserts the server-rendered results unescaped, since they are already HTML
    #content!= results
    input(type="button",value="Update",onclick="update()")
Run it with node server.js and visit localhost. You should learn from it how it's done ;)

Categories

Resources