I have created a simple web scraper that pulls in article titles and URLs from this website: http://espn.go.com/college-football/. However, the scraper only returns 46-50 articles instead of all the articles on the site. I've tried changing the CSS selector that cheerio uses, but the number of articles it scrapes never changes. Here is the code I'm using:
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var mongo = require('mongoskin');
var db = mongo.db("mongodb://localhost:27017/test", { native_parser: true });
var url = 'http://espn.go.com/college-football/';

function Headline(title, link) {
    this.Title = title;
    this.link = link;
}

request(url, function (error, response, html) {
    if (!error) {
        var $ = cheerio.load(html);
        var result = [];
        // Grab the article titles/URLs
        $('.text-container h1 a.realStory', '#news-feed-content').each(function (i, elem) {
            console.log($(elem).text(), elem.attribs.href);
            var articleObject = new Headline($(elem).text(), elem.attribs.href);
            result.push(articleObject);
        });
        fs.writeFile('espn_articles.json', JSON.stringify(result, null, 4), function (err) {
            console.log('File successfully written! - Check your project directory for the espn_articles.json file');
        });
        db.collection('articles').insert(result, function (error, record) {
            if (error) throw error;
            console.log("data saved");
        });
    }
});
Here's an example using Osmosis.
var osmosis = require('osmosis');

osmosis('http://espn.go.com/college-football/')
    .find('#news-feed-content .text-container')
    .set({
        author: '.author',
        category: '.category-link',
        title: '.realStory',
        link: '.realStory@href',
        blurb: 'p'
    })
    .follow('.realStory@href')
    .set({
        date: '.article-meta @data-date',
        images: [ 'picture @srcset' ],
        content: '.article-body'
    })
    .data(function (article) {
        /*
        { author: '...',
          category: '...',
          title: 'Harbaugh, Michigan reel in Florida OL Herbert',
          link: '...',
          blurb: 'Jim Harbaugh and Michigan have landed another recruit from SEC country in Kai-Leon Herbert of Florida.',
          date: '2016-07-06T17:25:09Z',
          images: [ '...', '...' ],
          content: '...'
        }
        */
        db.collection('articles').insert(article, function (error, record) {
            // ...
        });
    })
    .log(console.log)
    .error(console.log)
    .debug(console.log);
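Note that the @href, @srcset, and @data-date parts of those selectors pull attribute values rather than element text.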
When you take a look at the page with the Chrome dev tools, you'll see that it makes an API call every time it renders more posts. Here's the URL: http://cdn.espn.go.com/core/now?render=true&partial=nowfeed&xhr=1&sport=ncf&offset=0&device=desktop&userab=8
I assume that the offset param is used for pagination.
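As a rough sketch, you could walk that endpoint with request by bumping the offset until a page comes back empty. I'm guessing at the response shape here (with render=true the JSON seems to carry rendered markup), so check the real payload in dev tools and adjust the property path before relying on it:

var request = require('request');
var cheerio = require('cheerio');

function fetchPage(offset, done) {
    var feedUrl = 'http://cdn.espn.go.com/core/now?render=true&partial=nowfeed' +
        '&xhr=1&sport=ncf&device=desktop&userab=8&offset=' + offset;
    request({ url: feedUrl, json: true }, function (error, response, body) {
        if (error) return done(error);
        // Assumption: the JSON wraps the feed's rendered HTML; inspect the
        // actual response and fix this property path to match it.
        var $ = cheerio.load(body.content.html);
        var articles = [];
        $('.text-container h1 a.realStory').each(function (i, elem) {
            articles.push({ title: $(elem).text(), link: elem.attribs.href });
        });
        done(null, articles);
    });
}

// Keep paging until the feed stops returning articles.
function fetchAll(offset, all, done) {
    fetchPage(offset, function (error, articles) {
        if (error || articles.length === 0) return done(error, all);
        fetchAll(offset + articles.length, all.concat(articles), done);
    });
}

fetchAll(0, [], function (error, articles) {
    if (error) throw error;
    console.log(articles.length + ' articles scraped');
});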
Keep in mind that scraping is "illegal" in certain cases, so it's better to ask for permission first.
Hope it helps!
I am trying to create a custom field with jsforce and am having a heck of a time doing it. I can create a custom object fine, but creating a field that the client's users can use to sort their leads is giving me a heck of a time. This is what I did to create an object:
var jsforce = require('jsforce');
var conn = new jsforce.Connection();
conn.login('myemail', 'my password and token', function(err, res) {
    if (err) { return console.error(err); }
    var metadata = [{
        fullName: 'TestObject1__c',
        label: 'Test Object 1',
        pluralLabel: 'Test Object 1',
        nameField: {
            type: 'Text',
            label: 'Test Object Name'
        },
        deploymentStatus: 'Deployed',
        sharingModel: 'ReadWrite'
    }];
    conn.metadata.create('CustomObject', metadata, function(err, results) {
        if (err) { return console.error(err); }
        for (var i = 0; i < results.length; i++) {
            var result = results[i];
            console.log('success ? : ' + result.success);
            console.log('fullName : ' + result.fullName);
        }
    });
});
That works fine, but it is not what I need. Any help would be greatly appreciated, as the client wants this out. It is part of a larger project we have already completed, but now the fields have to be created dynamically so the end users don't have to make the fields themselves before we can push data to their account. We are currently pushing data, but it lands under another field whose name doesn't make sense.
I have been able to accomplish this by making a package on the Salesforce website. All my client's users have to do is click a simple link, click install, and it makes a list view and creates the fields I need.
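For what it's worth, if you do need to create the field programmatically, the same Metadata API call should accept a CustomField type. This is only a sketch: TestObject1__c.SortOrder__c is a placeholder name, and in a real org you'd typically also have to grant field-level security before users can see the field.

var fieldMetadata = [{
    fullName: 'TestObject1__c.SortOrder__c', // placeholder Object.Field name
    label: 'Sort Order',
    type: 'Text', // Text fields require a length
    length: 100
}];
conn.metadata.create('CustomField', fieldMetadata, function(err, results) {
    if (err) { return console.error(err); }
    console.log(results);
});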
I'm trying to scrape from Yelp and have attached the code below. I'm having a problem storing the data into arrays.
Here is my code:
...
var id, title, link, neighborhood, address, phone = [];
router.get('/', function(req, res, next) {
var cheerio = require('cheerio');
while (scrapepage) {
var options = {
uri: 'https://www.yelp.co.uk/search?find_desc='+find+'&find_loc='+city+''+'&start='+page,
transform: function (body) {
return cheerio.load(body);
}
};
page += 10;
rp(options)
.then(function ($) {
var json = { id: "", title : "", link : "", neighborhood : "", address : "", phone : ""};
$('.biz-name span').filter(function(){
var data = $(this).text();
console.log(data);
//title.push(data);
title_count++;
});
...
res.send('Check your console!')
})
.catch(function (err) {
// Crawling failed or Cheerio choked...
});
}
});
So whenever I try to push the data to an array, it just does not work and keeps waiting forever. If I remove the push, it logs all the data to the console.
I also tried each instead of filter, but no luck. I also tried putting values into array indexes manually; that still did not work. What am I doing wrong in the code?
UPDATE
I have added this at the top of the page.
var id, title, link, neighborhood, address, phone = [];
I'd have to ask: where is title initialized? I see the declaration, but nothing that tells the system to initialize title as an array.
Try:
...
router.get('/', function(req, res, next) {
var cheerio = require('cheerio');
while (scrapepage) {
var options = {
uri: 'https://www.yelp.co.uk/search?find_desc='+find+'&find_loc='+city+''+'&start='+page,
transform: function (body) {
return cheerio.load(body);
}
};
page += 10;
rp(options)
.then(function ($) {
var title = [],
release, rating;
var json = { id: "", title : "", link : "", neighborhood : "", address : "", phone : ""};
$('.biz-name span').filter(function(){
var data = $(this).text();
console.log(data);
title.push(data);
title_count++;
});
...
res.send('Check your console!')
})
.catch(function (err) {
// Crawling failed or Cheerio choked...
});
}
});
Without the initialization, title is undefined, so title.push(data) throws instead of appending. In var id, title, link, neighborhood, address, phone = []; only phone actually gets the array; every other name in the list stays undefined. Explicitly initializing each variable fixes that.
Also, with this you shouldn't need title_count, as title.length already gives you the count of elements.
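In other words, each name in a comma-separated var declaration needs its own initializer:

// Only the last variable gets the array here; the rest are undefined:
var id, title, link, neighborhood, address, phone = [];

// Give each one its own initializer instead:
var id = [], title = [], link = [], neighborhood = [], address = [], phone = [];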
push will not work until you assign title as an array:
then(function ($) {
var title=[];
var release, rating;
var json = { id: "", title : "", link : "", neighborhood : "", address : "", phone : ""};
$('.biz-name span').filter(function(){
var data = $(this).text();
console.log(data);
title.push(data);
title_count++;
});
...
res.send('Check your console!')
})
My requirement is to get the exact date/time when a user gets disabled.
To achieve this, I need to query for the line 'user set to INACTIVE' in the Revision History to find the date the user was switched from enabled to disabled.
How can I get the revision history using Node.js?
I tried the code below, but it's not working; the Rally support team mentioned that it's older code and that I have to try v2.0.
Can somebody help me achieve my requirement?
var revisions = story.RevisionHistory.Revisions;
revisions.sort(byRevisionNumber);
var story_was_blocked = false;
// it doesn't matter how many revs have been in BLOCKED state, presence of one is sufficient
for (var rix = 0; rix < revisions.length && story_was_blocked === false; rix++) {
var rev = revisions[rix];
if (rev.Description.indexOf("BLOCKED changed from ") >= 0) {
story_was_blocked = true;
}
}
Here's a brief example of how this might be accomplished. Note that this just shows how to fetch all revisions in the collection on a User; you'd need to iterate through them and match the Description field on 'Disabled':
var rally = require('rally'),
queryUtils = rally.util.query,
rallyApi = rally({
// Example key, not valid
apiKey: '_UkMasZfjPZfquDIMExfEKnAboQUlyT2SP4UppMHir',
server: 'https://rally1.rallydev.com',
requestOptions: {
headers: {
'X-RallyIntegrationName': 'Query User Revisions',
'X-RallyIntegrationVendor': 'Stackoverflow user4211235',
'X-RallyIntegrationVersion': '1.0'
}
}
});
function onError(error) {
console.log('Failure!', error);
}
function queryUserRevisions(result) {
var revisions = result.Revisions;
rallyApi.query({
ref: revisions,
start: 1,
limit: Infinity,
order: 'RevisionNumber',
fetch: ['RevisionNumber','Description','CreationDate']
}, function(error, result) {
if(error) {
onError(error);
} else {
console.log('Success querying User Revisions...');
console.log('Summary of revisions on User:');
console.log(result);
}
});
}
function queryUserRevisionHistory(result) {
rallyApi.query({
ref: result.Results[0].RevisionHistory,
start: 1,
limit: Infinity,
fetch: ['Revisions','RevisionNumber','Description','CreationDate']
}, function(error, result) {
if(error) {
onError(error);
} else {
console.log('Success querying User Revision History. Querying Revisions...');
queryUserRevisions(result);
}
});
}
function queryUser(callback) {
rallyApi.query({
type: 'user',
start: 1,
pageSize: 2,
limit: 10,
order: 'CreationDate',
fetch: ['UserName', 'EmailAddress', 'RevisionHistory'],
query: queryUtils.where('UserName', '=', "user#company.com")
}, function(error, result) {
if(error) {
onError(error);
} else {
console.log('Success querying User. Querying Revision History...');
callback(result);
}
});
}
queryUser(queryUserRevisionHistory);
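Once those revisions come back, the matching step could look something like the sketch below, called in place of the console.log(result) line in queryUserRevisions. The exact Description text Rally records when a user is disabled is an assumption on my part; log a real revision first and adjust the string you match on:

function findDisabledDate(result) {
    // Assumption: the revision Description mentions 'Disabled' when the
    // account is deactivated; verify against an actual revision's text.
    for (var i = 0; i < result.Results.length; i++) {
        var rev = result.Results[i];
        if (rev.Description && rev.Description.indexOf('Disabled') >= 0) {
            return rev.CreationDate;
        }
    }
    return null;
}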
Maybe it's an absolutely stupid question, but I have a PouchDB database that looks like this:
var fragDB = {
_id: new Date().toISOString(),
question: quest,
right: right,
wrong1: wrong1,
wrong2: wrong2,
wrong3: wrong3,
source1: source1,
source2: source2,
tags: tagarr
}
db.put(fragDB);
Now I want to retrieve the tags from the last document as an Array. I tried something like this, but this obviously didn't work.
var alltags = function(){
db.allDocs({include_docs: true, limit: 1, descending: true}, function(err, response){
if (err) {
console.log(err);
console.log("loading Standard Tags");
return ["..."];
}
if (response){
console.log(response.tags);
return response.tags;
}
});
};
What am I missing?
Not a dumb question – it's just hard to grok, because PouchDB is asynchronous: http://pouchdb.com/guides/async-code.html
In your code, you have a function within a function, and it's the inner function that's returning response.tags, not the outer function.
I suggest you read the link above, so you can learn how to write promisey async code, e.g.:
db.allDocs({
    include_docs: true,
    limit: 1,
    descending: true
}).then(function (response) {
    // with include_docs, the doc (and its tags array) is nested under rows
    var tags = response.rows[0].doc.tags;
    console.log(tags);
    return tags;
}).catch(function (err) {
    console.log(err);
    console.log("loading Standard Tags");
    return ["..."];
}).then(function (tags) {
    // do something with the tags
});
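The same applies to your alltags wrapper: rather than returning from inside the callback, return the promise chain itself and let callers consume the tags asynchronously. A minimal sketch:

var alltags = function () {
    return db.allDocs({ include_docs: true, limit: 1, descending: true })
        .then(function (response) {
            return response.rows[0].doc.tags;
        })
        .catch(function (err) {
            console.log(err);
            console.log("loading Standard Tags");
            return ["..."];
        });
};

// usage: the tags are only available inside the callback
alltags().then(function (tags) {
    console.log(tags);
});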
I am trying to pull the name, address, and URL from the madeinnyc tech map for a personal project to learn how to use Mapbox. In the process I wanted to scrape the listings and export them to a JSON file that I could use for the project. The issue I am running into is that the information is not being written to the JSON file correctly. When I log the scraped data in the console, I get all of the data in the correct format, but the written file is incorrect and contains only one random piece of data. I think the current setup is scraping individual pieces and overwriting the file multiple times, because I am getting multiple 'File Success' logs in my console. Is my writeFile method incorrect?
Here is the info logged to the console by console.log(metadata) (correct data, condensed since you get the idea of the rest of the listings):
{ name: 'Todayships',
address: '4447 Seton Ave 2',
url: 'http://todayships.com/' }
{ name: 'Togather',
address: '45 Main St Suite 404',
url: 'http://www.togather.com' }
{ name: 'Tomorrow Networks, LLC',
address: '1270 Avenue of the Americas 2920',
url: 'http://www.tomorrownetworks.com' }
{ name: 'Topi',
address: '20 W 20 Street 2nd Floor',
url: 'http://topi.com' }
output.json
{
"name": "Berlinica Publishing LLC",
"address": "255 West 43rd Street Suite 1012",
"url": "http://www.berlinica.com"
}s.com"
}ackground.html"
}drianyc.com/ersp.html"
}
scrape.js
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
request('http://mappedinny.com/', function (error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
$('li').each(function(i, element){
var li = $(this)
var name = li.attr('data-name');
var address = li.attr('data-address');
var url = li.attr('data-url');
var metadata = {
name : name,
address : address,
url : url
};
console.log(metadata);
fs.writeFile('output.json', JSON.stringify(metadata, null, 4), function(err){
console.log('File Success');
});
});
}
});
The problem is that you're asynchronously writing to the same file from inside a synchronous loop (your each()). Each writeFile call replaces the file's contents, and because the calls all overlap, they race one another; that's why you see multiple 'File Success' logs and why stray fragments of longer payloads are left at the end of output.json.
If your intention is to write all results to the same file, you might try:
var results = [];
$('li').each(function(i, element){
var li = $(this)
var name = li.attr('data-name');
var address = li.attr('data-address');
var url = li.attr('data-url');
var metadata = {
name : name,
address : address,
url : url
};
console.log(metadata);
results.push(metadata);
});
fs.writeFile('output.json', JSON.stringify(results, null, 4), function(err){
if (err)
console.log('File Error: ' + err);
else
console.log('File Success');
});