I am playing with NodeJS and for this purpose created an email extractor. Somehow when i create multiple http requests the node.exe memory useage in windows task manager keeps increasing. I understand that the node needs more memory to process the requests but what i noticed that this memory usage does not come down even after all requests have been successfully processed.
When i start nodejs it consumes about 35000K memory but after about 80-100 request this goes upto 50000K and stays.
Here is my simple email extractor module:
var request = require('request'),
cheerio = require('cheerio'),
async = require('async'),
urlHelper = require('url');
function Extractor(config) {
this.baseUrl = config.url;
this.parsedUrl = urlHelper.parse(config.url);
this.urls = [];
this.emails = [];
}
Extractor.prototype.getEmails = function getEmails(html) {
var foundEmails = html.match(/([a-zA-Z0-9._-]+#[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+)/gi) || [];
if(foundEmails.length) this.emails = this.emails.concat(foundEmails);
}
Extractor.prototype.extract = function extract(html) {
var $ = cheerio.load(html),
that = this;
if($('body')){
this.getEmails($('body').html());
}
if(!this.emails.length){
$("a[href^='http://" + this.parsedUrl.host + "'], a[href^='https://" + this.parsedUrl.host + "'], a[href^='/'], a[href^='./'], a[href^='../']").each(function(k, v) {
that.urls.push(urlHelper.resolve(that.baseUrl, $(v).attr('href')));
});
}
};
/**
* Process the base URL
*/
Extractor.prototype.processBase = function processBase(next) {
request(this.baseUrl, function(err, response, body) {
return next(err, body);
});
}
/**
* Process the internal pages
*/
Extractor.prototype.processInternal = function processInternal(cb) {
var that = this;
async.whilst(
// while this condition returns true
function () { return that.emails.length === 0 && that.urls.length > 0; },
// do this
function (callback) {
request(that.urls.shift(), function (err, response, body) {
var $ = cheerio.load(body);
if($(body)){
that.getEmails($('body').html());
}
callback(); // async internal, needs to be called after we are done with our thing
});
},
// call this if any errors occur. An error also stops the series
// this is also called on successful completion of the series
function (err) {
cb(that);
}
);
}
Extractor.prototype.process = function process(next) {
var that = this;
this.processBase(function(err, html) {
if(err) {
console.log(err);
} else {
that.extract(html);
if(!that.emails.length) {
that.processInternal(function(res) {
return next(null, that);
});
}
}
});
}
module.exports = Extractor;
and here is how i call it:
var express = require('express');
var router = express.Router();
var Extractor = require('../services/Extractor');
router.get('/', function(req, res) {
res.json({msg: 'okay'});
var extractor = new Extractor({url: 'http://lior-197784.use1-2.nitrousbox.com:4000/crawl'});
extractor.process(function(err, res) {});
});
module.exports = router;
Related
Update on issue. I now have everything in my mongo db but nothing is showing up on my page. Appears to be something to do with bootbox. If I click scrape new articles previous dialog disappears and scrape happens successfully. Upon inspect a new bootbox section appears with an empty h3 tag.
https://github.com/TylerCEdge/tester
Currently I get all 20 of the results console logged and all are going to mongo but nothing is showing up on my page. It's probably a spelling error but I'm having trouble with it.
// Scrape Script
// =============
// Require axios and cheerio
var axios = require("axios");
var cheerio = require("cheerio");
var URL = "https://thehackernews.com/search?"
function scrape(cb) {
// First, we grab the body of the html with axios
axios.get(URL).then(function (response) {
// Then, we load that into cheerio and save it to $ for a shorthand selector
var $ = cheerio.load(response.data);
var articles = [];
// Now, we grab every h2 within an article tag, and do the following:
$("h2").each(function () {
// Save an empty dataToAdd object
var dataToAdd = {};
// Add the text and href of every link, and save them as properties of the dataToAdd object
dataToAdd.title = $(this)
.text()
// Removes unwanted characters from the titles
.replace(/[^a-z0-9\s]/gi, '').replace(/[_\s]/g, ' ');
dataToAdd.link = $(this)
.parent().parent().parent().parent()
.find("a")
.attr("href");
dataToAdd.img = $(this)
.parent().parent().parent()
.find("div.home-img").find("div.img-ratio").find("img")
.attr("data-src")
dataToAdd.summary = $(this)
.parent()
.find("div.home-desc")
.text()
.replace(/[^a-z0-9\s]/gi, '').replace(/[_\s]/g, ' ')
// console.log(dataToAdd);
articles.push(dataToAdd);
// for (i = 0; i = dataToAdd.length; i++) {
// articles.push(dataToAdd[i]);
// }
});
cb(articles);
console.log(articles);
});
};
module.exports = scrape;
// Server routes
// Import Scrape
var scrape = require("../scripts/scrape");
// Import headlines/notes
var headlinesController = require("../controllers/headlines");
var notesController = require("../controllers/notes");
module.exports = function (router) {
router.get("/", function (req, res) {
res.render("home");
});
router.get("/saved", function (req, res) {
res.render("saved");
});
router.get("/api/fetch", function (req, res) {
headlinesController.fetch(function (err, docs) {
if (!docs || docs.insertedCount === 0) {
res.json({
message: "no new articles today. Check back tomorrow!"
});
}
else {
res.json({
message: "Added " + docs.insertedCount + " new articles!"
});
}
});
});
router.get("/api/headlines", function (req, res) {
var query = {};
if (req.query.saved) {
query = req.query;
}
headlinesController.get(query, function (data) {
res.json(data);
});
});
router.delete("/api/headlines/:id", function (req, res) {
var query = {};
query._id = req.params.id;
headlinesController.delete(query, function (err, data) {
res.json(data);
});
});
router.get("/api/notes/:headline_id?", function (req, res) {
var query = {};
if (req.params.headline_id) {
query._id = req.params.headline_id;
}
notesController.get(query, function (err, data) {
res.json(data);
});
});
router.delete("/api/notes/:id", function (req, res) {
var query = {};
query._id = req.params.id;
notesController.delete(query, function (err, data) {
res.json(data);
});
});
router.post("/api/notes", function (req, res) {
notesController.save(req.body, function (data) {
res.json(data);
});
});
}
In main.js in server folder,
I have this code,
var DDP = require('ddp');
var DDPlogin = require('ddp-login');
var Job = require('meteor-job');
var ddp = new DDP({
host: "127.0.0.1",
port: 3000,
use_ejson: true
});
Meteor.startup(() => {
var myJobs = JobCollection('myJobQueue');
Job.setDDP(ddp);
ddp.connect(function (err) {
if(err) throw err;
DDPlogin(ddp, function (err, token) {
if (err) throw err;
});
});
myJobs.allow({
admin: function (userId, method, params) {
return true;
}
});
Meteor.publish('allJobs', function () {
return myJobs.find({});
});
myJobs.startJobServer();
var workers = Job.processJobs('myJobQueue', 'sendEmail',
function (job, cb) {
console.log(job.data.text);
job.done();
cb(null);
}
);
And in my main.js in client folder,
I have this code,
var myJobs = JobCollection('myJobQueue');
var jobSub = null;
class App extends Component {
componentDidMount(){
if(jobSub !== null)
jobSub.stop();
jobSub = Meteor.subscribe('allJobs');
var job = new Job(myJobs, 'sendEmail',
{
text: 'bozo#clowns.com'
}
);
job.priority('normal')
.retry({ retries: 5,
wait: 60*1000 }) // 1 minute between attempts
.delay(0) // start immediately
.save();
}
...
render(){
console.log(myJobs.find().fetch());
...
}
}
I am using the vsivsi:meteor-job-collection package.
The problem is that console.log() is not executed.
What is wrong in my step by step installation and usage?
I need to console.log() every minute.
I have been trying to learn the MEAN stack and ran into some issues with the javascript below that is in node.js. I have been having trouble with my module.exports.homelist function in which the functions, like function (err, response, body), have been giving me those values as undefined. I have been searching for answers for a while and came across asynchronous code and the callback function, but I was unable to find the solution that fits my situation, especially when the function is called on from within a request.
The Code:
var request = require('request');
var apiOptions = {
server : "https://localhost:3000"
};
if (process.env.NODE_ENV === 'production') {
apiOptions.server = "https://getting-mean-loc8r.herokuapp.com";
}
var renderHomepage = function (req, res, responseBody) {
var message;
if (!(responseBody instanceof Array)) {
message = "API lookup error";
responseBody = [];
} else {
if (!responseBody.length) {
message = "No places found nearby";
}
}
res.render('locations-list', {
title: 'Loc8r - find a place to work with wifi',
pageHeader: {
title: 'Loc8r',
strapline: 'Find places to work with wifi near you!'
},
sidebar: "Looking for wifi and a seat? Loc8r helps you find places to work when out and about. Perhaps with coffee, cake or a pint? Let Loc8r help you find the place you're looking for.",
locations: responseBody,
message: message
});
}
/* GET 'home' page */
module.exports.homelist = function(req, res) {
var requestOptions, path;
path = '/api/locations';
requestOptions = {
url : apiOptions.server + path,
method : "GET",
json : {},
qs : {
lng : -0.7992599,
lat : 51.378091,
maxDistance : 20
}
};
request(
requestOptions,
function(err, response, body) {
var i, data;
data = body;
if (data !== undefined && response !== undefined && response.statusCode === 200 && data.length) {
for (i=0; i<data.length; i++) {
data[i].distance = _formatDistance(data[i].distance);
}
}
renderHomepage(req, res, data);
}
);
var _formatDistance = function (distance) {
var numDistance, unit;
if (distance > 1) {
numDistance = parseFloat(distance).toFixed(1);
unit = 'km';
} else {
numDistance = parseInt(distance * 1000,10);
unit = 'm';
}
return numDistance + unit;
}
};
EDIT: This is the code I have in another file that uses my homelist function to render an HTML homepage:
var express = require('express'); var router = express.Router();
var ctrlLocations = require('../controllers/locations');
router.get('/', ctrlLocations.homelist);
module.exports = router;
You mention MEAN stack- are you requiring express? Please read the documentation on the express website.
I'm really can't find any example of using x-ray and .driver(phantom()) for authentication..
I've trawled through the documentation for x-ray and x-ray-phantom yet can't find any help.
/**
* Module Dependencies
*/
var Crawler = require('x-ray-crawler');
var cheerio = require('cheerio');
var join = require('path').join;
var assert = require('assert');
var phantom = require('../');
var fs = require('fs');
/**
* Tests
*/
describe('phantom driver', function() {
it('should have sensible defaults', function(done) {
var crawler = Crawler()
.driver(phantom())
crawler('http://google.com', function(err, ctx) {
if (err) return done(err);
var $ = cheerio.load(ctx.body);
var title = $('title').text();
assert.equal('Google', title);
done();
})
});
it('should work with client-side pages', function(done) {
var crawler = Crawler()
.driver(phantom());
crawler('https://exchange.coinbase.com/trade', function(err, ctx) {
if (err) return done(err);
var $ = cheerio.load(ctx.body);
var price = $('.market-num').text();
assert.equal(false, isNaN(+price));
done();
})
})
it('should support custom functions', function(done) {
var crawler = Crawler()
.driver(phantom(runner));
crawler('http://mat.io', function(err, ctx) {
if (err) return done(err);
var $ = cheerio.load(ctx.body);
var title = $('title').text();
assert.equal('Lapwing Labs', title);
done();
})
function runner(ctx, nightmare) {
return nightmare
.goto(ctx.url)
.click('.Header-logo-item+ .Header-list-item a')
.wait()
}
})
})
/**
* Read
*/
function get(path) {
return require(join(__dirname, 'fixtures', path));
}
Refer below github links if it helps.
https://github.com/lapwinglabs/x-ray-phantom
https://github.com/lapwinglabs/x-ray/issues/22
I have an Express route like this in an node server (file is required):
var redis = require('../modules/redis');
module.exports = function (app) {
var redisClient = redis.init();
app.post('/auth/ticket', cors(), function (req, res) {
var hashes = ['hash1','hash2', 'hash3'];
var candidates = []; // An array to collect valid hashes
var key;
// to check each hash against a RedisDB I use a For Loop
for (key in hashes) {
var hash = hashes[key];
console.log("Hash " + hash + " will be proofed now:");
//now I try to collect the valid hashes in the candidates array
if (redisClient.exists(hash) === 1) candidates.push(hash);
}
console.log(JSON.stringify(candidates));
});
};
Now here is the code of my module which shall manage all the redis requests:
exports.init = function () {
Redis = exports.Redis = function () {
var promiseFactory = require("q").Promise,
redis = require('promise-redis')(promiseFactory);
this.client = redis.createClient();
this.client.on('error', function (err) {
console.log('redis error – ' + client.host + ':' + client.port + ' – ' + err);
});
Redis.prototype.exists = function (key) {
this.client.exists(key, function (err, data) {
return data === 1 ? true : false;
});
};
return new Redis();
};
So what I experience is that the module is able to console.log the results properly. If a hash is valid, it returns true and otherwise false. This works as expected.
Problem is, that the for-loop continuous the execution without fetching getting the results. I think this is caused by race-conditions.
As you can see, I have started to workout something there with the use of Q and promise-redis in the top of my code:
var promiseFactory = require("q").Promise,
redis = require('promise-redis')(promiseFactory);
this.client = redis.createClient();
I like to know, how I make my for-loop (in the Express route) waiting for the results of redisClient.exists(hash) or in other words, to get all valid hashes into my candidates array.
Please help
like #brad said, you could use Q.all, it would take an array of promises as input and then return an array of results when all the promises are finished:
there is a mistake in your answer:
Redis.prototype.exists = function (key) {
return this.client.exists(key) // CHANGED, you still need to return a promise.
.then(function (reply) {
console.log("reply " + reply);
return (reply);
})
.catch(console.log);
};
If I understand correctly, what you want is something like
exports.init = function () {
Redis = exports.Redis = function () {
var Q = require("q"),
promiseFactory = Q.Promise,
redis = require('promise-redis')(promiseFactory);
this.client = redis.createClient();
this.client.on('error', function (err) {
console.log('redis error – ' + client.host + ':' + client.port + ' – ' + err);
});
Redis.prototype.exists = function (key) {
return this.client.exists(key).then(function (data) {
return data === 1 ? true : false;
});
};
Redis.prototype.getActive = function (arry) {
var self = this;
return Q.all(arry.map(self.exists.bind(self))
).then(function(res){
return arry.filter(function(val, idx){ return res[idx];});
});
};
return new Redis();
};
# mido22: But did you also recognize that I outsourced all the reds functions to the module file (1st Codeblock) which requires the promise-redid and builds a factory for Q. I changed the code inside the module file to:
Redis.prototype.exists = function (key) {
this.client.exists(key)
.then(function (reply) {
console.log("reply " + reply);
return (reply);
})
.catch(console.log);
};
and this results correctly like the console.log evidently shows.
Your codechange of the for-loop works very well but I think it don't fulfills my needs perfectly. If I could, I would like to have it completely outsourced in to the module file, so that I can use the prototyped method in similar cases from everywhere. Is that possible anyhow?
I see, that it would result in having two promise supported functionalities, if I would create an Instance of Redis Client with promise-redid and Q inside the auth/ticket/ router, too.
like this:
var Q = require('q'),
promiseFactory = Q.Promise,
redis = require("promise-redis")(promiseFactory),
client;
an then the express route (there are a lot of more routes each in a single file) like in your code.
Do you understand what I mean? Of course your solution will be fine for my needs at all, but a module resolving the job completely could have more elegance if possible so far.
Using with redis, bluebird and typescript:
import { RedisClient, createClient, ClientOpts } from "redis";
import { promisifyAll, PromisifyAllOptions } from "bluebird";
export module FMC_Redis {
export class Redis {
opt: ClientOpts;
private rc: RedisClient;
private rcPromise: any;
private static _instance: Redis = null;
public static current(_opt?: ClientOpts): Redis {
if (!Redis._instance) {
Redis._instance = new Redis(_opt);
Redis._instance.redisConnect();
}
return Redis._instance;
}
public get client(): RedisClient {
if (!this.rc.connected) throw new Error("There is no connection to Redis DB!");
return this.rc;
}
/******* BLUEBIRD ********/
public get clientAsync(): any {
// promisifyAll functions of redisClient
// creating new redis client object which contains xxxAsync(..) functions.
return this.rcPromise = promisifyAll(this.client);
}
private constructor(_opt?: ClientOpts) {
if (Redis._instance) return;
this.opt = _opt
? _opt
: {
host: "127.0.0.1",
port: 6379,
db: "0"
};
}
public redisConnect(): void {
this.rc = createClient(this.opt);
this.rc
.on("ready", this.onReady)
.on("end", this.onEnd)
.on("error", this.onError);
}
private onReady(): void { console.log("Redis connection was successfully established." + arguments); }
private onEnd(): void { console.warn("Redis connection was closed."); }
private onError(err: any): void { console.error("There is an error: " + err); }
/****** PROMISE *********/
// promise redis test
public getRegularPromise() {
let rc = this.client;
return new Promise(function (res, rej) {
console.warn("> getKeyPromise() ::");
rc.get("cem", function (err, val) {
console.log("DB Response OK.");
// if DB generated error:
if (err) rej(err);
// DB generated result:
else res(val);
});
});
}
/******* ASYNC - AWAIT *******/
// async - await test function
public delay(ms) {
return new Promise<string>((fnResolve, fnReject) => {
setTimeout(fnResolve("> delay(" + ms + ") > successfull result"), ms);
});
}
public async delayTest() {
console.log("\n****** delayTest ")
let a = this.delay(500).then(a => console.log("\t" + a));
let b = await this.delay(400);
console.log("\tb::: " + b);
}
// async - await function
public async getKey(key: string) {
let reply = await this.clientAsync.getAsync("cem");
return reply.toString();
}
}
}
let a = FMC_Redis.Redis.current();
// setTimeout(function () {
// console.warn(a.client.set("cem", "naber"));
// console.warn(a.client.get("cem"));
// console.warn(a.client.keys("cem"));
// }, 1000);
/***** async await test client *****/
a.delayTest();
/** Standart Redis Client test client */
setTimeout(function () {
a.client.get("cem", function (err, val) {
console.log("\n****** Standart Redis Client")
if (err) console.error("\tError: " + err);
else console.log("\tValue ::" + val);
});
}, 100)
/***** Using regular Promise with Redis Client > test client *****/
setTimeout(function () {
a.getRegularPromise().then(function (v) {
console.log("\n***** Regular Promise with Redis Client")
console.log("\t> Then ::" + v);
}).catch(function (e) {
console.error("\t> Catch ::" + e);
});
}, 100);
/***** Using bluebird promisify with Redis Client > test client *****/
setTimeout(function () {
var header = "\n***** bluebird promisify with Redis Client";
a.clientAsync.getAsync("cem").then(result => console.log(header + result)).catch(console.error);
}, 100);