I'm trying to scrape from yelp and have attached the code below. I have problem in storing the data into array.
Here is my code:
...
var id, title, link, neighborhood, address, phone = [];
router.get('/', function(req, res, next) {
var cheerio = require('cheerio');
while (scrapepage) {
var options = {
uri: 'https://www.yelp.co.uk/search?find_desc='+find+'&find_loc='+city+''+'&start='+page,
transform: function (body) {
return cheerio.load(body);
}
};
page += 10;
rp(options)
.then(function ($) {
var json = { id: "", title : "", link : "", neighborhood : "", address : "", phone : ""};
$('.biz-name span').filter(function(){
var data = $(this).text();
console.log(data);
//title.push(data);
title_count++;
});
...
res.send('Check your console!')
})
.catch(function (err) {
// Crawling failed or Cheerio choked...
});
}
});
So whenever I try to push the data to array, it just does not work, keeps waiting forever. If I remove the push, it consoles all the data.
I also tried with each instead of filter, but no luck. Also tried to put manually into array index, still did not work. May I know what am I doing wrong in the code?
UPDATE
I have added this at the top of the page.
var id, title, link, neighborhood, address, phone = [];
I'd have to ask where is title initialized? I see the declaration but nothing that tells the system to initialize title as an array.
try
...
router.get('/', function(req, res, next) {
var cheerio = require('cheerio');
while (scrapepage) {
var options = {
uri: 'https://www.yelp.co.uk/search?find_desc='+find+'&find_loc='+city+''+'&start='+page,
transform: function (body) {
return cheerio.load(body);
}
};
page += 10;
rp(options)
.then(function ($) {
var title = [],
release, rating;
var json = { id: "", title : "", link : "", neighborhood : "", address : "", phone : ""};
$('.biz-name span').filter(function(){
var data = $(this).text();
console.log(data);
title.push(data);
title_count++;
});
...
res.send('Check your console!')
})
.catch(function (err) {
// Crawling failed or Cheerio choked...
});
}
});
without the initialization the system has to go through a process of determining type and compatibility of parameters to be sure it can give you as close to what you've asked for. Sometimes Explicitly defining a variable can speed this process up.
also with this you shouldn't need to use title_count as title.length would have the count of elements.
push will not work until assign title as an array type
then(function ($) {
var title=[];
var release, rating;
var json = { id: "", title : "", link : "", neighborhood : "", address : "", phone : ""};
$('.biz-name span').filter(function(){
var data = $(this).text();
console.log(data);
title.push(data);
title_count++;
});
...
res.send('Check your console!')
})
Related
I need to reset a json file to an original state after a button is clicked. Currently, I am modeling with router, and I need help to extend my existing code.
This is a code snippet I wrote on server.js file, the one that I run "nodemon" to start the server.
var messages = [{index:0, rating:0}, {index:1, rating:0}]
app.get('/votes', (req, res) =>{
res.send( messages )
})
app.post('/votes', (req, res) =>{
votes.push(votes)
res.sendStatus(200)
})
so my initial state on file 'votes' (json format) is:
[{"index":0,"rating":0}, {"index":1, "rating":0}]
After some user actions, I'll add some data to this json file using this code:
<body>
// some logic here
<script>
$(() => {
$("#submit").click(()=>{
var message = [ { index:1, rating: $("#input").val()},
{ index:2, rating: $("#input2").val()}]
postMessage(message)
})
})
function postMessage(message) {
$.post('http://localhost:8080/votes', message)
}
</script>
</body>
and then I have the following in my json file
[{"index":0,"rating":0}, {"index":1, "rating":0}, {"index":1, "rating":1}, {"index":2, "rating":3}]
QUESTION: How do I reset the json file (not json variable) into the initial state with a button click for a new transaction?
I am just doing prototyping, so a quick and dirty way may work.
I'd recommand build or use some sort of user verification, and for each user have a copy of the initial data just for him. Keep in mind that this data will never garbage collected so you will have to manage deletion by yourself. I used basic IP that is given by express but that is not a good practice.
Use npm-memorystore which will give your some memory management.
If you want to identify users you can use express-session, express-jwt
var messages = [{
index: 0,
rating: 0
}, {
index: 1,
rating: 0
}];
var usersMessages = {};
app.get('/votes', (req, res) => {
var userIp = req.ip;
usersMessages[userIp] = [].concat(messages);
res.send(usersMessages[userIp]);
});
app.post('/votes', (req, res) => {
var userIp = req.ip;
var userVotes = usersMessages[userIp];
if (!userVotes)
usersMessages[userIp] = [].concat(messages);
usersMessages[userIp].push(req.body.votes);
res.sendStatus(200)
});
Take a look this:
$(() => {
let message = [];
$("#submit").click(() => {
//fill with values
message = [
{ index: 1, rating: $("#input").val() },
{ index: 2, rating: $("#input2").val() }
];
postMessage(message); //send post
message = []; //reset array
});
function postMessage(message) {
$.post("http://localhost:8080/votes", message);
}
});
Hope this helps. =D
What you need to do is, just remove all of the object from array.
You can do that as follow.
var messages = [{"index":0,"rating":0}, {"index":1, "rating":0}, {"index":1, "rating":1}, {"index":2, "rating":3}];
messages.length = 1;
console.log(messages);
Another way.
var messages = [{"index":0,"rating":0}, {"index":1, "rating":0}, {"index":1, "rating":1}, {"index":2, "rating":3}];
messages = messages.slice(0,1);
console.log(messages);
What above code do is, it will just set array back to it first value.
Goal
Capture each event sent through a webhook and turn it into a Slack post. Events include new blog posts, questions, discussions, wiki page, etc. (qualified as contents) and comments (qualified as comments) posted in an online community. Sometimes multiple events are sent in the webhook at once.
Attempted method
This simple JavaScript Azure Function is intended to
Receive one or more webhook events sent in a JSON array
Filter objects qualified as contents from those qualified as comments
Send an API request for each content and/or comment object (both have their own URL endpoint)
Parse each object returned (contents and comments return a similar but different hierarchy of keys)
Assemble the values into JSON objects (one per event, regardless of whether it is a content or comment) and send to Slack
Results
The following code worked fine for a single webhook event until I attempted to add the for loop to accommodate multiple webhook events sent in one array.
Code
Example JSON from webhook
{
"events": [{
"TypeId": "9999-999e",
"DateOccurred": "2018-12-15T20:39:42.2487557Z",
"EventData": {
"ActorUserId": 1234,
"ContentId": "5678-999c",
"ContentTypeId": "9012-999d",
"WikiPageId": 3456,
"WikiId": 1
}
},
{
"TypeId": "1111-111f",
"DateOccurred": "2018-12-15T22:55:37.7846546Z",
"EventData": {
"ActorUserId": 2345,
"ContentId": "2222-222b",
"ContentTypeId": "3333-333a",
"ForumReplyId": 4567,
"ForumThreadId": 8901,
"ForumId": 2
}
},
{
"TypeId": "9012-888f",
"DateOccurred": "2018-12-15T22:44:57.7091846Z",
"EventData": {
"ActorUserId": 9876,
"CommentId": "8900-123a"
}
}
]
}
Example JSON returned from API request
The slightly different structure in hierarchies is accurate.
(for contents)
{
"Content": {
"CreatedByUser": {
"ProfileUrl": "https://<company>.telligenthosting.net/members/<user>",
"Username": "<user>"
},
"HtmlName": "Title",
"HtmlDescription": "Text",
"Url": "https://<company>.telligenthosting.net/<link>"
}
}
(for comments)
{
"Comment": {
"Content": {
"CreatedByUser": {
"ProfileUrl": "https://<company>.telligenthosting.net/members/<user>",
"Username": "<user>"
},
"HtmlName": "Title",
"HtmlDescription": "Text",
"Url": "https://<company>.telligenthosting.net/<link>"
}
}
}
JavaScript file (as an Azure Function)
module.exports = function (context, data) {
var json = data.body;
var request = require('request');
// Parse the webhook event JSON body
var unparsed = JSON.stringify(json.events);
var parsed = JSON.parse(unparsed);
console.log(parsed) // RESULTS ARE AS EXPECTED (the JSON nested beneath `events`, beginning and ending with `[]`)
for (var i = 0; i < parsed.length; i++) {
// Parse out Id of webhook event (for all content types but comments)
// This Id retrieves details about the content
var ContentId, ContentTypeId;
if (parsed[i].EventData.hasOwnProperty('ContentId')) {
var ContentId = parsed[i].EventData.ContentId;
var ContentTypeId = parsed[i].EventData.ContentTypeId;
console.log(ContentTypeId); // RESULTS ARE NOT AS EXPECTED: Prints the same Id twice
var options = {
url: "https://<company>.telligenthosting.net/api.ashx/v2/genericcontent/" + ContentId + "/" + ContentTypeId + ".json",
headers: {
"Rest-User-Token": "<token>",
"Content-Type": "application/json"
}
};
};
// Parse out Id of a webhook event (for comments only)
// This Id retrieves details about a comment
var CommentId;
if (parsed[i].EventData.hasOwnProperty('CommentId')) {
var CommentId = parsed[i].EventData.CommentId;
var options = {
url: "https://<company>.telligenthosting.net/api.ashx/v2/comments/" + CommentId + ".json",
headers: {
"Rest-User-Token": "<token>",
"Content-Type": "application/json"
}
};
};
function callback(error, response, body) {
if (!error && response.statusCode == 200) {
var info = JSON.parse(body);
//For all content types but comments
var username, profileUrl, subject, url, text;
if (info.hasOwnProperty('Content')) {
username = info.Content.CreatedByUser.Username;
profileUrl = info.Content.CreatedByUser.ProfileUrl;
subject = info.Content.HtmlName;
url = info.Content.Url;
text = info.Content.HtmlDescription;
};
//For comments
if (info.hasOwnProperty('Comment')) {
username = info.Comment.User.DisplayName;
profileUrl = info.Comment.User.ProfileUrl;
subject = info.Comment.Content.HtmlName;
url = info.Comment.Url;
text = info.Comment.Body;
};
};
//Send to Slack
function sendToSlack(theUsername, theIconEmoji) {
var theUsername = "Bot";
var theIconEmoji = ":bot:";
var payload = {
attachments: [{
author_name: username,
author_link: profileUrl,
title: subject,
title_link: url,
text: text
}]
};
if (theUsername !== undefined) {
payload.username = theUsername;
}
if (theIconEmoji !== undefined) {
payload.icon_emoji = theIconEmoji;
}
var theRequest = {
url: urlWebHook,
method: "POST",
json: payload
};
request(theRequest, function (error, response, body) {});
}
var urlWebHook = "https://hooks.slack.com/services/<Id>";
sendToSlack();
};
};
request(options, callback);
};
Issue
As commented out in the code above, it appears that the loop is not going through each event but rather through the first event multiple times.
Much of what I read indicates for (var i = 0; i < json.length; i++) { is the proper approach but no matter what I try the Azure Function throws a 500 Internal Service Error and eventually times out. No information is provided in the debug console even though detailed logging is turned on.
Thank you
Thank you for any advice or education.
EventData is not defined because you're not constructing your object properly.
Here's how you might do it:
var json = require("./test.json");
var unparsedEvents = json.events;
for (let event of unparsedEvents) {
var ContentId = event.EventData.ContentId;
var ContentTypeId = event.EventData.ContentTypeId;
var CommentId = event.EventData.CommentId;
var options = new Object();
console.log("ContentId:", ContentId);
console.log("ContentTypeId:", ContentTypeId);
console.log("CommentId:", CommentId);
if (CommentId) {
options.url = "https://<company>.telligenthosting.net/api.ashx/v2/comments/" + CommentId + ".json";
options.headers = {
"Rest-User-Token": "<token>",
"Content-Type": "application/json",
};
} else {
options.url = "https://<company>.telligenthosting.net/api.ashx/v2/genericcontent/" + ContentId + "/" + ContentTypeId + ".json";
options.headers = {
"Rest-User-Token": "<token>",
"Content-Type": "application/json",
};
}
console.log("options:", options);
console.log();
}
I believe you need to change parsed[0] to parsed[i]. Currently you are looping through the array but only accessing the first element, which is why you see the first event multiple times.
i've got a Problem that i can't solve and i can't find any related solutions on SO or somewhere else, unfortunatly.
Basically i just want to send 3 Arrays with Data to the Client Javascript. Its working fine for 2 Arrays, but the 3rd one becomes empty when i refresh the page in browser, and i don't know why.
heres the related code;
app.post("/", function(req, res) {
ssn = req.session;
ssn.anlagen = [];
var Betreiber = {
TableName: "XXX",
KeyConditionExpression: "#usr = :user",
ExpressionAttributeNames: {
"#usr": "User",
},
ExpressionAttributeValues: {
":user": req.body.name
},
};
docClient.query(Betreiber, function(err, data) {
if (data.Count == 0 || req.body.passwort != data.Items[0].Passwort) {
res.render(__dirname + '/views/login', {text: "Der Nutzername oder das Passwort sind falsch! Bitte versuchen Sie es erneut."});
}
else if (req.body.passwort == data.Items[0].Passwort) {
anlagenarray = [];
ssn.user = data.Items[0].User; // WORKS FINE
ssn.admin = data.Items[0].Admin; // WORKS FINE
if (ssn.admin == false) { // Normal
for (var i = 0; i < data.Items[0].Anlagen.length; i++) {
ssn.ident = data.Items[0].Anlagen[Object.keys(data.Items[0].Anlagen)[i]];
var Anlagenbezeichnung = {
TableName: "SCR-Anlagen",
KeyConditionExpression: "#ident = :id",
ExpressionAttributeNames: {
"#ident": "Id",
},
ExpressionAttributeValues: {
":id": ssn.ident
},
};
docClient.query(Anlagenbezeichnung, function(err, data) {
ssn.anlagen = data.Items[0].Bezeichnung // this Variable gets "[]" after refresh
ssn.anlagen.sort();
});
}
}
Heres the Code to send Arrays to the Client Side JS;
app.post("/anlagen", function(req, res) {
if (ssn.user) {
res.send({
name: ssn.user, // WORKS
adminstatus: ssn.admin, // WORKS
bezeichnungen: ssn.anlagen // [] after Page refresh
});
}
else {
res.render(__dirname + '/views/login');
}
});
I just can't find any solution for this. i literally tried everything, but i keep getting this error.
detailed solution approaches highly appreciated.
it's my first try on backend web development, don't be too hard, if my complete code is total bullshit^^
thanks in advance.
Fixed the Problem by myself.
If you have a similar Problem. Just put req.session.save(); inside the query.
like this:
docClient.query(params, function(err, data) {
req.session.bezeichnung.push([data.Items[0].Bezeichnung,data.Items[0].Id]);
req.session.save();
});
This will make the Variable accessible outside of the query scope.
I have created a simple web scraper that pulls in the article titles and URL from this website: http://espn.go.com/college-football/. However, the scraper only returns 46-50 articles, instead of all the articles from the site. I've tried changing the CSS selector that cheerio uses, but nothing changes with regards to the number of articles it scrapes. Here is the code I'm using:
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var mongo = require('mongoskin');
var db = mongo.db("mongodb://localhost:27017/test", { native_parser: true });
url = 'http://espn.go.com/college-football/';
function Headline(title, link) {
this.Title = title;
this.link = link;
}
request(url, function (error, response, html) {
if (!error) {
var $ = cheerio.load(html);
var result = [];
// Grab the articles titles/url
$('.text-container h1 a.realStory', '#news-feed-content').each(function (i, elem) {
console.log($(elem).text(), elem.attribs.href);
var articleObject = new Headline($(elem).text(), elem.attribs.href);
result.push(articleObject);
});
}
fs.writeFile('espn_articles.json', JSON.stringify(result, null, 4), function (err) {
console.log('File successfully written! - Check your project directory for the output.json file');
})
db.collection('articles').insert(result, function (error, record) {
if (error) throw error;
console.log("data saved");
});
});
Here's an example using Osmosis.
osmosis('http://espn.go.com/college-football/')
.find('#news-feed-content .text-container')
.set({
author: '.author',
category: '.category-link',
title: '.realStory',
link: '.realStory#href',
blurb: 'p'
})
.follow('.realStory#href')
.set({
date: '.article-meta #data-date',
images: [ 'picture #srcset' ],
content: '.article-body'
})
.data(function (article) {
/*
{ author: '...',
category: '...',
title: 'Harbaugh, Michigan reel in Florida OL Herbert',
link: '...',
blurb: 'Jim Harbaugh and Michigan have landed another recruit from SEC country in Kai-Leon Herbert of Florida.',
date: '2016-07-06T17:25:09Z',
images: [ '...', '...' ],
content: '...'
}
*/
db.collection('articles').insert(article, function (error, record) {
// ...
});
})
.log(console.log)
.error(console.log)
.debug(console.log);
when you take a look at the page with the chrome dev tools you'll see that it makes an api call everytime it renders more posts . here's the url : http://cdn.espn.go.com/core/now?render=true&partial=nowfeed&xhr=1&sport=ncf&offset=0&device=desktop&userab=8
I assume that the offset params is used for pagination.
Keep in mind that scraping is "illegal" in certain cases so better to ask for permission first
hope it helps !
I am trying to pull the name, address, and url from the madeinnyc tech map for a personal project in learning how to use mapbox. In the process I wanted to scrape the listings and export them in a json file that I could use for the project. The issue I am running into is that the information is not being correctly written to the json file. When I log the scrapped data in the console, I am receiving the correct format and all of the data, but the written file from the scrapped data is incorrect and only received one random piece of data. I think the current setup is scraping individual pieces and overwriting it multiple times, because I am getting multiple File Success logs in my console. Is my writeFile method incorrect?
Here is the info logged to the console console.log(metadata) (correct data, condensed since you get the idea of the rest of the listings):
{ name: 'Todayships',
address: '4447 Seton Ave 2',
url: 'http://todayships.com/' }
{ name: 'Togather',
address: '45 Main St Suite 404',
url: 'http://www.togather.com' }
{ name: 'Tomorrow Networks, LLC',
address: '1270 Avenue of the Americas 2920',
url: 'http://www.tomorrownetworks.com' }
{ name: 'Topi',
address: '20 W 20 Street 2nd Floor',
url: 'http://topi.com' }
output.json
{
"name": "Berlinica Publishing LLC",
"address": "255 West 43rd Street Suite 1012",
"url": "http://www.berlinica.com"
}s.com"
}ackground.html"
}drianyc.com/ersp.html"
}
scrape.js
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
request('http://mappedinny.com/', function (error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
$('li').each(function(i, element){
var li = $(this)
var name = li.attr('data-name');
var address = li.attr('data-address');
var url = li.attr('data-url');
var metadata = {
name : name,
address : address,
url : url
};
console.log(metadata);
fs.writeFile('output.json', JSON.stringify(metadata, null, 4), function(err){
console.log('File Success');
});
});
}
});
The problem is that you're asynchronously writing to the same file in a synchronous loop (your each()).
If your intention is to write all results to the same file, you might try:
var results = [];
$('li').each(function(i, element){
var li = $(this)
var name = li.attr('data-name');
var address = li.attr('data-address');
var url = li.attr('data-url');
var metadata = {
name : name,
address : address,
url : url
};
console.log(metadata);
results.push(metadata);
});
fs.writeFile('output.json', JSON.stringify(results, null, 4), function(err){
if (err)
console.log('File Error: ' + err);
else
console.log('File Success');
});