Written File from Node Scrape Incorrect - javascript

I am trying to pull the name, address, and URL from the Made in NYC tech map for a personal project to learn how to use Mapbox. As part of that, I wanted to scrape the listings and export them to a JSON file that I could use for the project. The issue I am running into is that the information is not being correctly written to the JSON file. When I log the scraped data to the console, I receive the correct format and all of the data, but the file written from the scraped data is incorrect and contains only one random piece of data. I think the current setup is scraping individual pieces and overwriting the file multiple times, because I am getting multiple "File Success" logs in my console. Is my writeFile method incorrect?
Here is the info logged to the console by console.log(metadata) (correct data, condensed since you get the idea of the rest of the listings):
{ name: 'Todayships',
  address: '4447 Seton Ave 2',
  url: 'http://todayships.com/' }
{ name: 'Togather',
  address: '45 Main St Suite 404',
  url: 'http://www.togather.com' }
{ name: 'Tomorrow Networks, LLC',
  address: '1270 Avenue of the Americas 2920',
  url: 'http://www.tomorrownetworks.com' }
{ name: 'Topi',
  address: '20 W 20 Street 2nd Floor',
  url: 'http://topi.com' }
output.json
{
    "name": "Berlinica Publishing LLC",
    "address": "255 West 43rd Street Suite 1012",
    "url": "http://www.berlinica.com"
}s.com"
}ackground.html"
}drianyc.com/ersp.html"
}
scrape.js
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');

request('http://mappedinny.com/', function (error, response, html) {
    if (!error && response.statusCode == 200) {
        var $ = cheerio.load(html);
        $('li').each(function (i, element) {
            var li = $(this);
            var name = li.attr('data-name');
            var address = li.attr('data-address');
            var url = li.attr('data-url');
            var metadata = {
                name: name,
                address: address,
                url: url
            };
            console.log(metadata);
            fs.writeFile('output.json', JSON.stringify(metadata, null, 4), function (err) {
                console.log('File Success');
            });
        });
    }
});

The problem is that you're asynchronously writing to the same file from inside a synchronous loop (your each()). Every iteration starts its own fs.writeFile('output.json', ...), so each write clobbers the others and whichever one finishes last wins; that's why you see multiple "File Success" logs but only one (seemingly random) listing in the file.
If your intention is to write all results to the same file, collect them in an array and write once after the loop:
var results = [];

$('li').each(function (i, element) {
    var li = $(this);
    var name = li.attr('data-name');
    var address = li.attr('data-address');
    var url = li.attr('data-url');
    var metadata = {
        name: name,
        address: address,
        url: url
    };
    console.log(metadata);
    results.push(metadata);
});

fs.writeFile('output.json', JSON.stringify(results, null, 4), function (err) {
    if (err)
        console.log('File Error: ' + err);
    else
        console.log('File Success');
});
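A minimal sketch of the same collect-then-write-once pattern using fs.promises, in case you prefer async/await (my addition, not part of the original answer; it assumes Node 10+ and that $ is the loaded cheerio instance from the question):

const fsp = require('fs').promises;

// Sketch only: gather every listing first, then perform a single write.
async function saveListings($) {
    const results = [];
    $('li').each(function () {
        const li = $(this);
        results.push({
            name: li.attr('data-name'),
            address: li.attr('data-address'),
            url: li.attr('data-url')
        });
    });
    // One write for the whole array, so nothing gets clobbered.
    await fsp.writeFile('output.json', JSON.stringify(results, null, 4));
    console.log('File Success');
}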

JavaScript loop to accommodate filtered array of objects received from a webhook

Goal
Capture each event sent through a webhook and turn it into a Slack post. Events include new blog posts, questions, discussions, wiki pages, etc. (qualified as contents) and comments (qualified as comments) posted in an online community. Sometimes multiple events are sent in the webhook at once.
Attempted method
This simple JavaScript Azure Function is intended to:
1. Receive one or more webhook events sent in a JSON array
2. Filter objects qualified as contents from those qualified as comments
3. Send an API request for each content and/or comment object (each has its own URL endpoint)
4. Parse each object returned (contents and comments return a similar but different hierarchy of keys)
5. Assemble the values into JSON objects (one per event, regardless of whether it is a content or comment) and send them to Slack
Results
The following code worked fine for a single webhook event until I attempted to add the for loop to accommodate multiple webhook events sent in one array.
Code
Example JSON from webhook
{
    "events": [{
            "TypeId": "9999-999e",
            "DateOccurred": "2018-12-15T20:39:42.2487557Z",
            "EventData": {
                "ActorUserId": 1234,
                "ContentId": "5678-999c",
                "ContentTypeId": "9012-999d",
                "WikiPageId": 3456,
                "WikiId": 1
            }
        },
        {
            "TypeId": "1111-111f",
            "DateOccurred": "2018-12-15T22:55:37.7846546Z",
            "EventData": {
                "ActorUserId": 2345,
                "ContentId": "2222-222b",
                "ContentTypeId": "3333-333a",
                "ForumReplyId": 4567,
                "ForumThreadId": 8901,
                "ForumId": 2
            }
        },
        {
            "TypeId": "9012-888f",
            "DateOccurred": "2018-12-15T22:44:57.7091846Z",
            "EventData": {
                "ActorUserId": 9876,
                "CommentId": "8900-123a"
            }
        }
    ]
}
Example JSON returned from API request
The slight difference between the two hierarchies is accurate.
(for contents)
{
    "Content": {
        "CreatedByUser": {
            "ProfileUrl": "https://<company>.telligenthosting.net/members/<user>",
            "Username": "<user>"
        },
        "HtmlName": "Title",
        "HtmlDescription": "Text",
        "Url": "https://<company>.telligenthosting.net/<link>"
    }
}
(for comments)
{
    "Comment": {
        "Content": {
            "CreatedByUser": {
                "ProfileUrl": "https://<company>.telligenthosting.net/members/<user>",
                "Username": "<user>"
            },
            "HtmlName": "Title",
            "HtmlDescription": "Text",
            "Url": "https://<company>.telligenthosting.net/<link>"
        }
    }
}
JavaScript file (as an Azure Function)
module.exports = function (context, data) {
    var json = data.body;
    var request = require('request');

    // Parse the webhook event JSON body
    var unparsed = JSON.stringify(json.events);
    var parsed = JSON.parse(unparsed);
    console.log(parsed); // RESULTS ARE AS EXPECTED (the JSON nested beneath `events`, beginning and ending with `[]`)

    for (var i = 0; i < parsed.length; i++) {

        // Parse out Id of webhook event (for all content types but comments)
        // This Id retrieves details about the content
        var ContentId, ContentTypeId;
        if (parsed[i].EventData.hasOwnProperty('ContentId')) {
            var ContentId = parsed[i].EventData.ContentId;
            var ContentTypeId = parsed[i].EventData.ContentTypeId;
            console.log(ContentTypeId); // RESULTS ARE NOT AS EXPECTED: Prints the same Id twice
            var options = {
                url: "https://<company>.telligenthosting.net/api.ashx/v2/genericcontent/" + ContentId + "/" + ContentTypeId + ".json",
                headers: {
                    "Rest-User-Token": "<token>",
                    "Content-Type": "application/json"
                }
            };
        };

        // Parse out Id of a webhook event (for comments only)
        // This Id retrieves details about a comment
        var CommentId;
        if (parsed[i].EventData.hasOwnProperty('CommentId')) {
            var CommentId = parsed[i].EventData.CommentId;
            var options = {
                url: "https://<company>.telligenthosting.net/api.ashx/v2/comments/" + CommentId + ".json",
                headers: {
                    "Rest-User-Token": "<token>",
                    "Content-Type": "application/json"
                }
            };
        };

        function callback(error, response, body) {
            if (!error && response.statusCode == 200) {
                var info = JSON.parse(body);

                // For all content types but comments
                var username, profileUrl, subject, url, text;
                if (info.hasOwnProperty('Content')) {
                    username = info.Content.CreatedByUser.Username;
                    profileUrl = info.Content.CreatedByUser.ProfileUrl;
                    subject = info.Content.HtmlName;
                    url = info.Content.Url;
                    text = info.Content.HtmlDescription;
                };

                // For comments
                if (info.hasOwnProperty('Comment')) {
                    username = info.Comment.User.DisplayName;
                    profileUrl = info.Comment.User.ProfileUrl;
                    subject = info.Comment.Content.HtmlName;
                    url = info.Comment.Url;
                    text = info.Comment.Body;
                };
            };

            // Send to Slack
            function sendToSlack(theUsername, theIconEmoji) {
                var theUsername = "Bot";
                var theIconEmoji = ":bot:";
                var payload = {
                    attachments: [{
                        author_name: username,
                        author_link: profileUrl,
                        title: subject,
                        title_link: url,
                        text: text
                    }]
                };
                if (theUsername !== undefined) {
                    payload.username = theUsername;
                }
                if (theIconEmoji !== undefined) {
                    payload.icon_emoji = theIconEmoji;
                }
                var theRequest = {
                    url: urlWebHook,
                    method: "POST",
                    json: payload
                };
                request(theRequest, function (error, response, body) {});
            }

            var urlWebHook = "https://hooks.slack.com/services/<Id>";
            sendToSlack();
        };
    };

    request(options, callback);
};
Issue
As noted in the comments in the code above, it appears that the loop is not going through each event but rather through the first event multiple times.
Much of what I read indicates for (var i = 0; i < json.length; i++) { is the proper approach, but no matter what I try, the Azure Function throws a 500 Internal Server Error and eventually times out. No information is provided in the debug console even though detailed logging is turned on.
Thank you for any advice or education.
EventData is not defined because you're not constructing your object properly.
Here's how you might do it:
var json = require("./test.json");
var unparsedEvents = json.events;

for (let event of unparsedEvents) {
    var ContentId = event.EventData.ContentId;
    var ContentTypeId = event.EventData.ContentTypeId;
    var CommentId = event.EventData.CommentId;
    var options = new Object();

    console.log("ContentId:", ContentId);
    console.log("ContentTypeId:", ContentTypeId);
    console.log("CommentId:", CommentId);

    if (CommentId) {
        options.url = "https://<company>.telligenthosting.net/api.ashx/v2/comments/" + CommentId + ".json";
        options.headers = {
            "Rest-User-Token": "<token>",
            "Content-Type": "application/json",
        };
    } else {
        options.url = "https://<company>.telligenthosting.net/api.ashx/v2/genericcontent/" + ContentId + "/" + ContentTypeId + ".json";
        options.headers = {
            "Rest-User-Token": "<token>",
            "Content-Type": "application/json",
        };
    }

    console.log("options:", options);
    console.log();
}
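One related detail worth flagging, since it explains the single-request symptom: in the question's code, request(options, callback) sits after the for loop, so only one request is ever issued, using whatever options was assigned last. Here is a minimal sketch of issuing one request per event (my illustration, not part of the original answer; buildOptions is a hypothetical helper wrapping the if/else above):

var request = require('request');

// Sketch only: fire one request per event by moving the call inside the loop.
for (let event of unparsedEvents) {
    let options = buildOptions(event); // hypothetical helper, see note above
    request(options, function (error, response, body) {
        if (!error && response.statusCode == 200) {
            // This callback now corresponds to exactly one event.
            console.log(JSON.parse(body));
        }
    });
}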
I believe you need to change parsed[0] to parsed[i]. Currently you are looping through the array but only accessing the first element, which is why you see the first event multiple times.

S3: Grant access to xml list by access key + javascript

I have JavaScript code that gets the XML listing (http://BUCKETNAME.s3.REGION.amazonaws.com/) of an S3 bucket and uses it as a playlist:
AWS.config = {
    "accessKeyId": "ACCESS KEY",
    "secretAccessKey": "SECRET KEY",
    "region": "REGION"
};

// Create S3 service object
s3 = new AWS.S3();

var params = {
    Bucket: 'BUCKET NAME', /* required */
    Delimiter: '',
    EncodingType: 'url',
    Marker: '',
    MaxKeys: 0,
    Prefix: '',
    RequestPayer: 'requester'
};

s3.listObjects(params, function (err, data) {
    if (err) console.log(err, err.stack); // an error occurred
    else {
        console.log('the list is approved'); // successful response

        // Here is the function that converts the file list in the xml to an array
        var b = document.documentElement;
        b.setAttribute('data-useragent', navigator.userAgent);
        b.setAttribute('data-platform', navigator.platform);

        var radioName;
        var radioTitle;
        var tracklength = 0;

        // setupPlayer function
        function setupPlayer(href, name) {
            radioName = href;
            radioTitle = name;
            $.ajax({
                type: "GET",
                url: "http://BUCKETNAME.s3.REGION.amazonaws.com/?prefix=radio/" + radioName + "/",
                dataType: "xml",
                success: function (xml) {
                    //tracklength = 0;
                    tracks = [];
                    $(xml).find('Contents').each(function () {
                        tracklength = tracklength + 1;
                        tracks.push({
                            "track": tracklength,
                            "file": $(this).find('Key').text()
                        });
                    });
                    radio(tracks);
                },
                error: function () {
                    alert("An error occurred while processing XML file.");
                }
            });
        }
    }
});
As you can see, in this code I am taking the XML file and adding a radio name (which is the folder name); after that, the AJAX call saves all the file names in that folder to an array, tracks.
This code works perfectly if there is a List grantee permission for Everyone, in which case there is no need for the AWS config here: I can run the code inside the else statement of the listObjects callback on its own and it will give me the same response.
What I want is to grant access by this access key only, so that this function does not work without the access key and secret key.
So no one can access the XML list except those who have the access and secret keys.
Is that possible?
(This is not the full code, but you get the idea: accessing the XML file of the bucket, getting the keys, and saving them to an array.)
You should use s3.getObject (http://docs.aws.amazon.com/AWSJavaScriptSDK/latest/AWS/S3.html#getObject-property) to fetch your XML files instead of the unsigned $.ajax call; the SDK signs every request with your credentials, so the bucket no longer needs public List permission.
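For the per-folder listing that setupPlayer performs, the signed equivalent is arguably s3.listObjects with a Prefix, since the SDK parses the listing for you. A minimal sketch, assuming the s3 client, radioName, and radio() from the question (this is my adaptation of the answer's suggestion, not the answerer's code):

// Sketch: replace the unsigned $.ajax listing with a signed SDK call.
// Assumes `s3` is the configured AWS.S3 client from the question.
function setupPlayer(href, name) {
    var radioName = href;
    s3.listObjects({
        Bucket: 'BUCKET NAME',             // placeholder, as in the question
        Prefix: 'radio/' + radioName + '/'
    }, function (err, data) {
        if (err) return console.log(err, err.stack);
        // data.Contents is already an array of { Key: ... } objects,
        // so no XML parsing is needed.
        var tracks = data.Contents.map(function (obj, i) {
            return { track: i + 1, file: obj.Key };
        });
        radio(tracks);
    });
}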

Array push takes forever in Node JS after response

I'm trying to scrape data from Yelp and have attached the code below. I'm having a problem storing the data into an array.
Here is my code:
...
var id, title, link, neighborhood, address, phone = [];

router.get('/', function (req, res, next) {
    var cheerio = require('cheerio');
    while (scrapepage) {
        var options = {
            uri: 'https://www.yelp.co.uk/search?find_desc=' + find + '&find_loc=' + city + '' + '&start=' + page,
            transform: function (body) {
                return cheerio.load(body);
            }
        };
        page += 10;
        rp(options)
            .then(function ($) {
                var json = { id: "", title: "", link: "", neighborhood: "", address: "", phone: "" };
                $('.biz-name span').filter(function () {
                    var data = $(this).text();
                    console.log(data);
                    //title.push(data);
                    title_count++;
                });
                ...
                res.send('Check your console!');
            })
            .catch(function (err) {
                // Crawling failed or Cheerio choked...
            });
    }
});
So whenever I try to push the data to the array, it just does not work and keeps waiting forever. If I remove the push, it logs all the data to the console.
I also tried each instead of filter, but no luck, and I tried assigning to an array index manually, which also did not work. May I know what I am doing wrong in the code?
UPDATE
I have added this at the top of the page.
var id, title, link, neighborhood, address, phone = [];
I'd have to ask: where is title initialized? I see the declaration, but nothing that initializes title as an array. Note that in var id, title, link, neighborhood, address, phone = [];, only phone receives the = [] initializer; id through address are all undefined.
try
...
router.get('/', function (req, res, next) {
    var cheerio = require('cheerio');
    while (scrapepage) {
        var options = {
            uri: 'https://www.yelp.co.uk/search?find_desc=' + find + '&find_loc=' + city + '' + '&start=' + page,
            transform: function (body) {
                return cheerio.load(body);
            }
        };
        page += 10;
        rp(options)
            .then(function ($) {
                var title = [],
                    release, rating;
                var json = { id: "", title: "", link: "", neighborhood: "", address: "", phone: "" };
                $('.biz-name span').filter(function () {
                    var data = $(this).text();
                    console.log(data);
                    title.push(data);
                    title_count++;
                });
                ...
                res.send('Check your console!');
            })
            .catch(function (err) {
                // Crawling failed or Cheerio choked...
            });
    }
});
Without that initialization, title is undefined, so title.push(data) throws a TypeError; because the throw happens inside the promise chain and your empty .catch swallows the error silently, res.send never runs and the request appears to hang forever. Explicitly initializing the variable as an array fixes this.
Also, with this you shouldn't need title_count, as title.length already holds the count of elements.
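A tiny sketch of the declaration pitfall itself (my illustration, not from the original post):

// In a comma-separated var declaration, only the last name gets the initializer:
var id, title, phone = [];
console.log(title); // undefined
console.log(phone); // []
// title.push('x'); // TypeError: title is undefined

// To make every variable an array, give each its own initializer:
var id = [], title = [], link = [], neighborhood = [], address = [], phone = [];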
push will not work until title is assigned an array type:
then(function ($) {
    var title = [];
    var release, rating;
    var json = { id: "", title: "", link: "", neighborhood: "", address: "", phone: "" };
    $('.biz-name span').filter(function () {
        var data = $(this).text();
        console.log(data);
        title.push(data);
        title_count++;
    });
    ...
    res.send('Check your console!');
})

Web Scraping Using Nodejs

I have created a simple web scraper that pulls in the article titles and URLs from this website: http://espn.go.com/college-football/. However, the scraper only returns 46-50 articles instead of all the articles on the site. I've tried changing the CSS selector that cheerio uses, but nothing changes with regard to the number of articles it scrapes. Here is the code I'm using:
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var mongo = require('mongoskin');
var db = mongo.db("mongodb://localhost:27017/test", { native_parser: true });

url = 'http://espn.go.com/college-football/';

function Headline(title, link) {
    this.Title = title;
    this.link = link;
}

request(url, function (error, response, html) {
    if (!error) {
        var $ = cheerio.load(html);
        var result = [];

        // Grab the article titles/urls
        $('.text-container h1 a.realStory', '#news-feed-content').each(function (i, elem) {
            console.log($(elem).text(), elem.attribs.href);
            var articleObject = new Headline($(elem).text(), elem.attribs.href);
            result.push(articleObject);
        });
    }
    fs.writeFile('espn_articles.json', JSON.stringify(result, null, 4), function (err) {
        console.log('File successfully written! - Check your project directory for the output.json file');
    });
    db.collection('articles').insert(result, function (error, record) {
        if (error) throw error;
        console.log("data saved");
    });
});
Here's an example using Osmosis.
var osmosis = require('osmosis');

osmosis('http://espn.go.com/college-football/')
    .find('#news-feed-content .text-container')
    .set({
        author: '.author',
        category: '.category-link',
        title: '.realStory',
        link: '.realStory@href',
        blurb: 'p'
    })
    .follow('.realStory@href')
    .set({
        date: '.article-meta@data-date',
        images: ['picture@srcset'],
        content: '.article-body'
    })
    .data(function (article) {
        /*
        { author: '...',
          category: '...',
          title: 'Harbaugh, Michigan reel in Florida OL Herbert',
          link: '...',
          blurb: 'Jim Harbaugh and Michigan have landed another recruit from SEC country in Kai-Leon Herbert of Florida.',
          date: '2016-07-06T17:25:09Z',
          images: [ '...', '...' ],
          content: '...' }
        */
        db.collection('articles').insert(article, function (error, record) {
            // ...
        });
    })
    .log(console.log)
    .error(console.log)
    .debug(console.log);
When you take a look at the page with the Chrome dev tools, you'll see that it makes an API call every time it renders more posts. Here's the URL: http://cdn.espn.go.com/core/now?render=true&partial=nowfeed&xhr=1&sport=ncf&offset=0&device=desktop&userab=8
I assume the offset param is used for pagination.
Keep in mind that scraping is "illegal" in certain cases, so it's better to ask for permission first.
Hope it helps!
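A rough sketch of paging through that endpoint with the offset parameter (my illustration, not from the original answer; the response shape is unverified, so it only logs the parsed JSON, and the page size and stop condition are assumptions):

var request = require('request');

// Sketch: walk the feed API by bumping `offset`. The query parameters are
// copied from the URL above; the step of 10 per page is an assumption.
function fetchPage(offset) {
    var url = 'http://cdn.espn.go.com/core/now?render=true&partial=nowfeed&xhr=1' +
        '&sport=ncf&offset=' + offset + '&device=desktop&userab=8';
    request(url, function (error, response, body) {
        if (error || response.statusCode != 200) return;
        var data = JSON.parse(body);
        console.log(data); // inspect the payload to locate the article list
        if (offset < 100) fetchPage(offset + 10); // arbitrary stop condition
    });
}

fetchPage(0);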

Web FTP Portal Logins

I have a web FTP portal that was created a few years ago by a developer who is no longer around. The code for the website is written in Node.js. Inside app.js is the following code:
var validUsers = [{
    name: 'x',
    user: '907c78ef73998eafc2680e5fdd4798a8eef0881a',
    pass: '95489cf3039eb2f5938e3daa954d04276bbf90e7',
    dir: ''
}, {
    name: 'y',
    user: 'b26e5ebda152e81099ec78be2f9c191ee25e1cd6',
    pass: 'e3725873ae302e3f12eb97b02feb7457de9706c2',
    dir: 'y'
}, {
    name: 'y2',
    user: '3182b54d9f4d08641b5a9a0fb33f74df5d76b222',
    pass: '916b2e1941c9e23610f8bd3462cdb19f55b5c631',
    dir: 'y2'
}, {
    name: 'y3',
    user: '38aa53de31c04bcfae9163cc23b7963ed9cf90f7',
    pass: '7a98cf84c2c61a30f6c4e3984c0cad2eb29f5d6f',
    dir: 'y3'
}, {
    name: 'y4',
    user: '51e822c50cc62cdbdb850a439ea75b6d45ac487b',
    pass: 'da6a77293ddcdc7047dd461a94c88c8377753265',
    dir: 'y4'
}, {
    name: 'y5',
    user: '14ad0aca26e00f615990946181ee3405c6ede0f1',
    pass: '4eb4e0e1ea0f04422b5bc6031ee37c8dc971236d',
    dir: 'y5'
}, {
    name: 'y6',
    user: '4ec9bdb28c5da0f9813e9eed55a0f1dc6217a305',
    pass: 'e72bd0bbd37423bb0c9b9edfb9ce94446161c511',
    dir: 'y6'
}, {
    name: 'y7',
    user: 'f4603bd4ae9e4aa2a11d903d0b178b37a57b1bac',
    pass: '8a6a67f235738c4b2e4f88d4608bdcf0bbc49f51',
    dir: 'y7'
}, {
    name: 'Guest',
    user: '35675e68f4b5af7b995d9205ad0fc43842f16450',
    pass: '370bb444ef91a3999b1c36af97e166f18848e7b7',
    dir: 'Guest'
}, {
    name: 'y8',
    user: 'd8f51fbf5e13e9f2637a8d5c4bd1ab251bd61c30',
    pass: '1a047e6dd554ffdd67524916820a8fa23acd2c6e',
    dir: 'y8'
}];
The x and y1-8 are substitutions for the actual client names and corresponding directories (the 'Guest' name and directory are real examples). My question is: the user and pass values are hash digests from crypto, yet they correspond to specific usernames and passwords. If I wanted to reset a username or password, or add another, how would I figure out the corresponding hash values to add to the code based on the username/password strings I want to use?
Any input would be very helpful.
EDIT:
The rest of the FTP code:
app.get('/ftp/', function (req, res) {
    var pageName = 'File Transfer Portal';
    var rNav = '',
        sNav = '',
        cNav = '',
        imNav = '',
        title = 'companyNameOmitted: ' + pageName,
        bodyClass = 'top ftp',
        keywords = 'keywordsOmitted',
        description = 'descriptionOmiited',
        url = '/ftp/';
    res.render('ftp', {
        title: title,
        bodyClass: bodyClass,
        keywords: keywords,
        description: description,
        url: siteRoot + url,
        pageEmail: 'mailto:?subject=' + escape(title) + '&body=' + escape(description) + '%0A' + siteRoot + url,
        eUrl: escape(siteRoot + url),
        eTitle: escape(title),
        eDescription: escape(description),
        rNav: rNav,
        sNav: sNav,
        cNav: cNav,
        imNav: imNav
    });
    //console.log(uniqId()+':'+pageName);
});

app.post('/ftp/upload', function (req, res) {
    //console.log(req.files);
    var SID = req.cookies.SID;
    var sessionUser = (users[SID]) ? users[SID] : false;
    if (!!sessionUser) {
        _.each(req.files, function (file) {
            console.log(new Date(curTime()).toGMTString() + ' | Recieved ' + file.name + ' (' + file.size + ' bytes) from ' + sessionUser.name);
            var newPath = __dirname + '/complete/' + _.where(validUsers, { user: sessionUser.user })[0].dir + '/' + file.name;
            fs.rename(file.path, newPath, function (err) {
                if (err) throw err;
                else {
                    res.redirect('back');
                    if (sessionUser.name != 'adminOmitted') {
                        var htmlString = '<b>' + sessionUser.name + '</b> has uploaded a file <b>' + file.name + '</b>.<br /><br />View it on the File Transfer Portal.';
                        var transport = nodemailer.createTransport("SMTP", {
                            host: "hostname.com", // hostname
                            secureConnection: true, // use SSL
                            port: 465, // port for secure SMTP
                            auth: {
                                user: "user#host.com",
                                pass: "pass"
                            }
                        });
                        transport.sendMail({
                            sender: 'sender#host.com',
                            to: 'receiver#host.com',
                            subject: 'File Upload: ' + sessionUser.name + ' uploaded ' + file.name,
                            html: htmlString
                        }, function (err) {
                            if (err) console.log(err);
                            else console.log('Notification Sent: S&A File Upload: ' + sessionUser.name + ' uploaded ' + file.name);
                        });
                    }
                }
            });
And the login code...
app.get('/ftp/d/:hash/:filename', function (req, res) {
    var SID = req.cookies.SID;
    var ip = req.ip;
    //console.log(ip);
    var sessionUser = (users[SID]) ? ((users[SID].md5 == req.params.hash) && (users[SID].ip == ip)) ? users[SID] : false : false;
    if (sessionUser) {
        var realpath = __dirname + '/complete/' + _.where(validUsers, { user: sessionUser.user })[0].dir + '/' + req.params.filename.replace('>', '/');
        console.log(new Date(curTime()).toGMTString() + ' | Sending ' + realpath.substr(realpath.indexOf('complete') + 9) + ' to ' + sessionUser.name);
        res.download(realpath, realpath.substr(realpath.lastIndexOf('/') + 1), function (err) {
            if (err) {
                res.redirect(302, '/ftp/');
                throw (err);
            }
        });
    } else {
        console.log(new Date(curTime()).toGMTString() + ' | Download request failed authorization for ' + req.params.filename);
        console.log(new Date(curTime()).toGMTString() + ' | Hash: ' + req.params.hash);
        console.log(new Date(curTime()).toGMTString() + ' | SID: ' + req.cookies.SID);
        res.redirect(302, '/ftp/');
    }
});

sio.sockets.on('connection', function (socket) {
    var SID = socket.handshake.SID;
    if (!users[SID]) register(SID, socket.handshake.address.address);
    //console.log(users);
    socket.on('login', function (data) {
        var thisUser = _.where(validUsers, { user: data.u, pass: data.p })[0];
        if (_.isEmpty(thisUser)) {
            if (!!users[SID].ip) {
                console.log(new Date(curTime()).toGMTString() + ' | ' + users[SID].ip + ' has failed logging in.');
                console.log(new Date(curTime()).toGMTString() + ' | ' + 'U:' + data.u);
                console.log(new Date(curTime()).toGMTString() + ' | ' + 'P:' + data.p);
            }
            socket.emit('login', { complete: false, name: false });
        } else {
            console.log(new Date(curTime()).toGMTString() + ' | ' + thisUser.name + ' has logged in.');
            users[SID].name = thisUser.name;
            users[SID].user = thisUser.user;
            socket.emit('login', { complete: true, name: thisUser.name });
        }
    });
And the disconnect function. The only code between the login and disconnect functions is a move-file and a delete-file function, which I doubt are of any use.
    //console.log(users);
    socket.on('disconnect', function () {
        setTimeout(function () {
            if (!!users[SID]) {
                if (curTime() - users[SID].lastTap > 30000)
                    unregister(SID);
                else console.log('Not removing; connection still active. (' + users[SID].name + ')');
            } else (unregister(SID));
        }, 30000);
    });
});
And finally, the crypto functions:
function getMD5(string) {
    return crypto.createHash('md5').update(string).digest("hex");
}

function getSHA1(string) {
    return crypto.createHash('sha1').update(string).digest("hex");
}
I know the formatting isn't perfect; I've tried to keep it as neat as possible, and I think that's all of the relevant functions. I doubt the .jade file for the FTP portal would be of any use.
You can't.
The usernames and passwords have been put through a one-way cryptographic hash (the getSHA1 function and the 40-character hex digests point to SHA-1; getMD5 exists too but produces 32-character digests). Hashes are not reversible, and this was likely done to protect the users' personal information if the server is hacked.
You're still missing the part of the code that handles the authentication and sets the session cookie.
If you can find the code that handles the auth, and you know a username beforehand, you could re-hash it to cross-reference the username against the entries list.
Otherwise, your only option is to crack the usernames/passwords, which can be difficult or impossible depending on their complexity.
Good luck...
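That said, adding or resetting a user doesn't require reversing anything: you can generate fresh digests with the same getSHA1 function shown above and drop them into validUsers. A sketch, assuming (as the _.where(validUsers, {user: data.u, pass: data.p}) comparison in the login handler suggests) that the client sends SHA-1 digests of the credentials; verify against the client-side login code before relying on this:

var crypto = require('crypto');

// Same helper as in app.js above.
function getSHA1(string) {
    return crypto.createHash('sha1').update(string).digest("hex");
}

// Hypothetical new entry: 'z', 'newusername', and 'newpassword' are
// placeholders for the values you actually want to add.
var newEntry = {
    name: 'z',                    // display name, stored in plaintext
    user: getSHA1('newusername'), // SHA-1 digest of the username
    pass: getSHA1('newpassword'), // SHA-1 digest of the password
    dir: 'z'                      // upload/download directory
};

console.log(newEntry); // paste the resulting digests into validUsers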
