Nodejs with Phantom-Cluster just stops - javascript

I'm running Node.js, not as a web server, but from the command line, against a pretty heavily modified version of the example.js that comes with the phantom-cluster package. The server is Ubuntu 13.10 on an AWS instance.
My goal is to "ping" more than 64,000 URLs to test for 404 or 500 HTTP errors. If there is an error, log that URL with the error for later processing.
Here is my code:
(function() {
    var WEBS, cluster, enqueueRequests, main, phantomCluster;
    var fs = require('fs');

    phantomCluster = require("./index");
    cluster = require("cluster");
    WEBS = [];

    // Read the URL list (one URL per line) and pass it to the callback.
    function loadUrls(callback) {
        console.log("starting loaded");
        var urls = [];
        fs.readFile("/home/ubuntu/phantom-cluster/theurls.txt", 'utf8', function(err, data) {
            if (err) throw err;
            var myArray = data.split("\n");
            for (var i = 0; i < myArray.length; i++) {
                urls.push(myArray[i]);
            }
            callback(null, urls);
        });
    }

    // Push every URL onto the cluster's work queue.
    enqueueRequests = function(engine) {
        var fulfilled = 0;
        loadUrls(function(err, WEBS) {
            console.log(">>" + WEBS.length + " urls to process");
            var enqueuer, i, _results;
            enqueuer = function(request) {
                var item = engine.enqueue(request);
                item.on("timeout", function() {
                    fs.appendFile("/home/ubuntu/error_log.log", "TIMEOUT: " + request + "\r\n", function(err) {});
                });
                return item.on("response", function() {
                    fulfilled++;
                    console.log(fulfilled);
                });
            };
            _results = [];
            for (i = 0; i < 1; i++) {
                _results.push((function() {
                    var _results1 = [];
                    for (var x = 0; x < WEBS.length; x++) {
                        _results1.push(enqueuer(WEBS[x]));
                    }
                    return _results1;
                })());
            }
            return _results;
        });
    };

    main = function() {
        var engine = phantomCluster.createQueued({
            workers: 20,
            workerIterations: 1,
            phantomBasePort: 54321
        });

        if (cluster.isMaster) {
            enqueueRequests(engine);
        }

        engine.on("queueItemReady", function(url) {
            var _this = this;
            var retVal;
            var urlArray = url.split("|");
            var phantom = this.ph;
            var curPage = phantom.createPage(function(page) {
                page.set('settings.loadImages', false);
                page.set('settings.javascriptEnabled', false);
                page.set('settings.resourceTimeout', 5000);
                page.set('settings.userAgent', 'Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
                page.set('onError', function(msg, trace) {
                    var msgStack = ['ERROR: ' + msg];
                    if (trace && trace.length) {
                        msgStack.push('TRACE:');
                        trace.forEach(function(t) {
                            msgStack.push(' -> ' + t.file + ': ' + t.line + (t.function ? ' (in function "' + t.function + '")' : ''));
                        });
                    }
                    console.error(msgStack.join('\n'));
                });
                page.set('onResourceReceived', function(response) {
                    // Log only 404s and 500s for the URL we actually requested.
                    if ((response.status == "404") || (response.status == "500")) {
                        var myUrl = decodeURI(response.url);
                        if (myUrl == urlArray[0]) {
                            retVal = response.status + "|" + url;
                            fs.appendFile("/home/ubuntu/error_log.log", response.status + "|" + url + "\r\n", function(err) {});
                            return retVal;
                        }
                    }
                });
                page.open(urlArray[0], function(status) {
                    _this.next(); // _this is a PhantomQueuedClusterClient object
                    return _this.queueItemResponse(status);
                });
            });
        });

        return engine.start();
    };

    main();
}).call(this);
The file referenced as index.js is here:
https://github.com/dailymuse/phantom-cluster/blob/master/index.js
I have not modified it at all.
This works great and spins up 20 worker processes, which go out and get the initial response code for the queued URLs.
Here is the problem:
After processing anywhere from 960-990 URLs, the whole thing just stops: no error code, no nothing.
I've tried everything I can think of, from some sort of Node timeout, to an issue with a given URL, to banging my head against my desk. The first two would return an error when I create a test for them. The third just makes my head hurt.
Anyone have any help or experience working with this?
EDIT: I updated the code, added the on("response") callback, and then called the nextTick method to remove the item from the queue. I still have the same issue.

Have you taken a look at link-crawler? It uses phantom-cluster and prerender to do almost exactly what you're looking for.

If all you're looking to do is check HTTP status codes, you don't need a headless browser for that. Node can do it on its own using http.request(), or something that uses promises, like request-promise.
Unless you need to verify something in the rendering of the pages you're crawling, there's no need to render the page in a browser; just make HTTP calls to the URLs and inspect their statuses.
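For example, here is a minimal sketch using only Node's built-in http module; the URL-list path, log path, and concurrency limit are illustrative assumptions, not part of the original question:
// Minimal sketch: check status codes with plain HTTP requests instead of
// PhantomJS. File paths and the concurrency limit are illustrative.
var http = require('http');
var fs = require('fs');
var url = require('url');

var urls = fs.readFileSync("theurls.txt", "utf8").split("\n").filter(Boolean);
var concurrency = 20;
var index = 0;

function checkNext() {
    if (index >= urls.length) return;
    var target = urls[index++];
    var parsed = url.parse(target);
    var req = http.get({ host: parsed.hostname, path: parsed.path || '/' }, function(res) {
        if (res.statusCode === 404 || res.statusCode === 500) {
            fs.appendFile("error_log.log", res.statusCode + "|" + target + "\r\n", function() {});
        }
        res.resume(); // Discard the body; we only care about the status code.
        checkNext();
    });
    req.on('error', function(e) {
        fs.appendFile("error_log.log", "ERROR: " + target + " " + e.message + "\r\n", function() {});
        checkNext();
    });
}

// Start a fixed number of parallel "workers".
for (var i = 0; i < concurrency; i++) checkNext();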

Related

send binary response in node from phantomjs child process

I have created a Node endpoint to create rasterised versions of my SVG charts.
app.post('/dxexport', function(req, res) {
    node2Phantom.createPhantomProcess(req, res);
});
My node-to-phantom function uses spawn to run PhantomJS:
var spawn = require('child_process').spawn;

exports.createPhantomProcess = function(req, res) {
    var userRequest = JSON.stringify(req.body);
    var bin = "node_modules/.bin/phantomjs";
    var args = ['./dxexport/exporter-server.js', userRequest, res];
    var cspr = spawn(bin, args);

    cspr.stdout.on('data', function(data) {
        var buff = new Buffer(data);
        res.send(data);
    });

    cspr.stderr.on('data', function(data) {
        data += '';
        console.log(data.replace("\n", "\nstderr: "));
    });

    cspr.on('exit', function(code) {
        console.log('child process exited with code ' + code);
        process.exit(code);
    });
};
When rendering is completed and the file is successfully created, I call the renderCompleted function inside PhantomJS:
var renderCompleted = function (parameters) {
    var exportFile = fileSystem.open(parameters.exportFileName, "rb"),
        exportFileContent = exportFile.read();

    parameters.response.statusCode = 200;
    parameters.response.headers = {
        "Access-Control-Allow-Origin": parameters.url,
        "Content-Type": contentTypes[parameters.format],
        "Content-Disposition": "attachment; fileName=" + parameters.fileName + "." + parameters.format,
        "Content-Length": exportFileContent.length
    };
    parameters.response.setEncoding("binary");
    parameters.response.write(exportFileContent);
    /////// somehow send exportFileContent as node res object for download \\\\\\\\
    exportFile.close();

    parameters.format !== "svg" && fileSystem.remove(parameters.exportFileName);
    for (var i = 0; i < parameters.filesPath.length; i++)
        fileSystem.remove(parameters.filesPath[i]);
    parameters.filesPath = [];
    parameters.response.close();
};
This response object is passed in from Node.js; however, this code is calling PhantomJS methods on it, so I get errors like:
TypeError: 'undefined' is not a function (evaluating 'parameters.response.setEncoding("binary")')
How can I send the binary file response somehow to the node function so it can be sent with my node server to the user?
Any help is appreciated.
OK, after some struggle, here is the working solution in case someone stumbles on this post.
As Artjom B. mentioned, I found that the easiest way was to delegate the rendering and file creation of the visualisation to PhantomJS, and then, once done, send all the parameters related to those operations through the console.
I also updated the answer based on @ArtjomB.'s advice to wrap the console message in unique beginning and end strings, so the risk of other output being mistaken for the intended rendered-file object is mitigated.
var renderCompleted = function (parameters) {
    console.log("STRTORNDRD" + JSON.stringify(parameters) + "ENDORNDRD");
};
This then gets picked up on stdout and used like this:
// (Requires assumed at the top of the module.)
var spawn = require('child_process').spawn;
var fs = require('fs');

exports.exportVisual = function (req, res) {
    var userRequest = JSON.stringify(req.body);
    var bin = "node_modules/.bin/phantomjs";
    var args = ['./dxexport/exporter-server.js', userRequest, res];
    var cspr = spawn(bin, args);
    var contentTypes = {
        pdf: "application/pdf",
        svg: "image/svg+xml",
        png: "image/png"
    };

    cspr.stdout.on('data', function (data) {
        var buff = new Buffer(data).toString('utf8');
        var strData = buff.match(new RegExp("STRTORNDRD" + "(.*)" + "ENDORNDRD"));
        if (strData) {
            var parameters = JSON.parse(strData[1]);
            var img = fs.readFileSync(parameters.exportFileName);
            var headers = {
                "Access-Control-Allow-Origin": parameters.url,
                "Content-Type": contentTypes[parameters.format],
                "Content-Disposition": "attachment; fileName=" + parameters.fileName + "." + parameters.format,
                "Content-Length": img.length
            };
            res.writeHead(200, headers);
            res.end(img, 'binary');
            // delete files after done
            if (parameters.format != "svg") {
                fs.unlink(parameters.exportFileName);
            }
            for (var i = 0; i < parameters.filesPath.length; i++)
                fs.unlink(parameters.filesPath[i]);
            // done. kill it
            cspr.kill('SIGINT');
        }
    });

    cspr.stderr.on('data', function (data) {
        data += '';
        console.log(data.replace("\n", "\nstderr: "));
    });

    cspr.on('exit', function (code) {
        console.log('child process exited with code ' + code);
        process.exit(code);
    });
};

How to send JSON data from client to node.js server.

Here is server.js
var express = require("express"),
    http = require("http"),
    mongoose = require("mongoose"),
    app = express();

app.use(express.static(__dirname + "/client"));
app.use(express.urlencoded());

mongoose.connect('mongodb://localhost/PvdEnroll', function(err) {
    if (err) {
        console.log(err);
    } else {
        console.log('Connected to mongodb!');
    }
});

var CheckBoxSchema = mongoose.Schema({
    npi: String,
    boxes: [String]
});

var CheckBox = mongoose.model("CheckBox", CheckBoxSchema);

http.createServer(app).listen(3000);

// here's where we get something from the client.
app.get("/checkbox.json", function (req, res) {
    CheckBox.find({}, function(err, checkBoxes) {
        console.log("STUBB2", checkBoxes);
        res.json(checkBoxes);
    });
});

app.post("/checkbox", function (req, res) {
    console.log("POSTING TO DB: ", req.body);
    var newCkBoxData = new CheckBox({ "npi": req.body.npi, "boxes": req.body.boxes });
    newCkBoxData.save(function(err, results) {
        if (err !== null) {
            console.log(err);
            res.send("ERROR");
        } else {
            CheckBox.find({}, function(err, result) {
                if (err !== null) {
                    // the element did not get saved
                    res.send("ERROR");
                }
                res.json(result);
            });
        }
    });
});
The client, secA.js, pertains to a single HTML page.
var main = function (checkBoxObjects) {
    "use strict";
    $.getJSON("../data/checkBoxesA.json", function(checkBoxTxt) {
        checkBoxTxt.forEach(function (data) {
            $(".checkbox-input").append("<input type='checkbox' unchecked/>");
            $(".checkbox-input").append(' ' + data.label + "<br/>");
            $(".checkbox-input").append(' ' + data.note + "<br/>");
            $(".checkbox-input").append(' ' + "<br/>");
        });
    });
};

$(document).ready(main);

var providerNPI_ckBs = [];
var NPI_number = [];

var loopForm = function(form) {
    for (var i = 0; i < form.elements.length; i++) {
        if (form.elements[i].type == 'checkbox') {
            if (form.elements[i].checked == true) {
                providerNPI_ckBs += 1 + ' ';
            } else {
                providerNPI_ckBs += 0 + ' ';
            }
        }
    }
    if (providerNPI_ckBs.length > 0)
        if (NPI_number.length > 0)
            createJSONobj();
};

var getNPI = function() {
    NPI_number = document.getElementById("text_field1").value;
    if (NPI_number.length > 0)
        if (providerNPI_ckBs.length > 0) {
            createJSONobj();
        }
};

var createJSONobj = function() {
    var JSONobj = '{' + JSON.stringify(NPI_number) + ':' +
        JSON.stringify(providerNPI_ckBs) + '}';
    JSON.stringify(JSONobj);
    console.log(JSONobj);
    // here we'll do a quick post to our todos route
    $.post("npi_checks", JSONobj, function (response) {
        console.log("We posted and the server responded!");
        console.log(response);
    });
};

// Note: This is temporary as I'm only intending to send JSON data one way
// to the server. I'd just like to verify that I can send data both ways.
$(document).ready(function (checkBoxObjects) {
    $.getJSON("checkbox.json", function (checkBoxObjects) {
        console.log("Client Received Array from Server: ", checkBoxObjects);
        main(checkBoxObjects);
    });
});
The Chrome console responds immediately with GET http://127.0.0.1:3000/html/checkbox.json 404 (Not Found)
The page loads and will accept data which the secA.js script formats as JSON. The database has been started by the server. All I need to know is how to send the data over to the server!
I'm clearly new to javascript and producing this application is part of learning the language along with MongoDB. I've structured this application similarly to an example tutorial book. One difference is that in the tutorial the traffic is two ways between client and server.
Any help is appreciated!
If the first argument to $.post on the client side is changed from "npi_checks" to "/checkbox", so that it matches the first argument of app.post, the data gets to the server and is loaded into MongoDB. This is the simple solution.
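To illustrate, here is a minimal sketch of the corrected client-side call; the payload shape is an assumption based on the CheckBox schema in server.js above, and passing a plain object lets jQuery serialize it as form data that express.urlencoded() can parse into req.body:
// Minimal sketch of the corrected client-side POST. The route matches the
// app.post("/checkbox", ...) handler above; the payload shape is illustrative.
var payload = { npi: NPI_number, boxes: providerNPI_ckBs };

$.post("/checkbox", payload, function (response) {
    console.log("We posted and the server responded!");
    console.log(response);
});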

node js azure SDK getBlobToStream uses lots of memory

I am writing a backup script that simply downloads all the blobs in all the blob containers of a specific Azure account.
The script uses async.js to make sure only so many downloads can run at the same time, so it doesn't overload the server. When I run this script it works fine, but when it hits large files it runs out of memory. I'm guessing the download runs faster than the disk can write, and it eventually fills up the in-memory buffer so badly that I run out of memory entirely, but debugging the exact cause has been impossible so far.
The specific function which appears to use a lot of memory is called as follows:
blobService.getBlobToStream(
    containerName,
    blob.name,
    fs.createWriteStream(fullPath),
    function(error) {
        if (error) { // Something went wrong; write it to the console but finish the queue item and continue.
            console.log("Failed writing " + blob.name + " (" + error + ")");
            callback();
        }
        else if (!error) { // Write the last modified date and finish the queue item silently.
            fs.writeFile(fullPath + ".date", blobLastModified, function(err) {
                if (err) console.log("Couldn't write .date file: " + err);
            });
            callback();
        }
    });
Even a single 700MB download will easily fill up 1GB of memory on my side.
Is there any way around this? Am I missing a parameter which magically prevents the Azure SDK from buffering everything and the kitchen sink?
Full code:
#!/usr/bin/env node

// Requires
var azure = require('azure');
var fs = require('fs');
var mkdirp = require('mkdirp');
var path = require('path');
var async = require('async');

var maxconcurrency = 1; // Max amount of simultaneous running threads of getBlobsAndSaveThem() running through async.js.
var blobService = azure.createBlobService();
var backupPrefix = '/backups/azurebackup/'; // Always end with a '/'!!

// Main flow of the script is near the bottom of the file.
var containerProcessingQueue = async.queue(
    function getBlobsAndSaveThem(containerName) {
        console.log(containerName); // DEBUG
        blobService.listBlobs(containerName, function(error, blobs) {
            if (!error) {
                var blobProcessingQueue = async.queue(function(index, callback) {
                    var blob = blobs[index];
                    console.log(blob); // DEBUG
                    var fullPath = backupPrefix + containerName + '/' + blob.name;
                    var blobLastModified = new Date(blob.properties['last-modified']);

                    // Only create if the directory doesn't exist, since mkdirp fails if the directory exists.
                    if (!fs.existsSync(path.dirname(fullPath))) { // And do it sync, because otherwise it'll check 99999 times if the directory exists simultaneously, doesn't find it, then fails to create it 99998 times.
                        mkdirp.sync(path.dirname(fullPath), function(err) { console.log('Failed to create directory ' + path.dirname(fullPath) + " (" + err + ")"); });
                    }

                    if (fs.existsSync(fullPath + ".date")) {
                        if (blobLastModified == fs.readFileSync(fullPath + ".date").toString()) {
                            callback();
                            return; // If the file is unmodified, return. No, this won't exit the program, because it's called within a function definition (async.queue(function ...)).
                        }
                    }

                    blobService.getBlobToStream(
                        containerName,
                        blob.name,
                        fs.createWriteStream(fullPath),
                        function(error) {
                            if (error) { // Something went wrong; write it to the console but finish the queue item and continue.
                                console.log("Failed writing " + blob.name + " (" + error + ")");
                                callback();
                            }
                            else if (!error) { // Write the last modified date and finish the queue item silently.
                                fs.writeFile(fullPath + ".date", blobLastModified, function(err) {
                                    if (err) console.log("Couldn't write .date file: " + err);
                                });
                                callback();
                            }
                        });
                }, maxconcurrency);

                for (var blobindex in blobs) {
                    blobProcessingQueue.push(blobindex);
                } // Push new items to the queue for processing.
            }
            else {
                console.log("An error occurred listing the blobs: " + error);
            }
        });
    }, 1);

blobService.listContainers(function(err, result) {
    for (var i = 0; i < result.length; i++) {
        containerProcessingQueue.push(result[i].name);
    }
});
For all those now curious: the variables for the start and end have changed. They are now just rangeStart and rangeEnd.
Here is the Azure Node documentation for more help:
http://dl.windowsazure.com/nodestoragedocs/BlobService.html
One thing that you could possibly do is read only a chunk of data into the stream instead of the whole blob, append that to the file, and then read the next chunk. The Blob Storage service supports that. If you look at the source code for getBlobToStream (https://github.com/WindowsAzure/azure-sdk-for-node/blob/master/lib/services/blob/blobservice.js), you can specify from/to bytes in the options (rangeStartHeader and rangeEndHeader). See if that helps.
I have hacked together some code which does just that (as you can see from my code, my knowledge of node.js is quite primitive :)). [Please use this code just to get an idea of how you can do a chunked download, as I think it still has some glitches.]
var azure = require('azure');
var fs = require('fs');

var blobService = azure.createBlobService("account", "accountkey");
var containerName = "container name";
var blobName = "blob name";
var blobSize;
var chunkSize = 1024 * 512; // Chunk size -- we'll read 512 KB at a time.
var startPos = 0;
var fullPath = "D:\\node\\";

var blobProperties = blobService.getBlobProperties(containerName, blobName, null, function (error, blob) {
    if (error) {
        throw error;
    }
    else {
        blobSize = blob.contentLength;
        fullPath = fullPath + blobName;
        console.log(fullPath);
        doDownload();
    }
});

function doDownload() {
    // Open the file in append mode so each chunk lands after the previous one.
    var stream = fs.createWriteStream(fullPath, { flags: 'a' });
    var endPos = startPos + chunkSize;
    if (endPos > blobSize) {
        endPos = blobSize;
    }
    console.log("Downloading " + (endPos - startPos) + " bytes starting from " + startPos + " marker.");
    blobService.getBlobToStream(containerName, blobName, stream,
        { "rangeStartHeader": startPos, "rangeEndHeader": endPos - 1 }, function (error) {
            if (error) {
                throw error;
            }
            else if (!error) {
                startPos = endPos;
                if (startPos <= blobSize - 1) {
                    doDownload();
                }
            }
        });
}
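As noted above, in later versions of the SDK the range options are just rangeStart and rangeEnd rather than rangeStartHeader and rangeEndHeader. A minimal sketch of the equivalent call, assuming such a newer version (otherwise identical to the code above):
// Minimal sketch assuming a newer azure SDK where the range options are
// named rangeStart/rangeEnd (see the note above).
blobService.getBlobToStream(containerName, blobName, stream,
    { "rangeStart": startPos, "rangeEnd": endPos - 1 }, function (error) {
        if (error) {
            throw error;
        }
        startPos = endPos;
        if (startPos <= blobSize - 1) {
            doDownload();
        }
    });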

With Node.js, how do I set a var to the response from an HTTP client?

I know this is probably Asynchronous Javascript 101 and I do have some books on the Kindle I could consult, but I am nowhere near my device.
I have a node app with a variable being assigned to a module that I'm loading. The module has one function that downloads a string of data from a URL.
The problem is, how do I not set the variable until the request has returned?
My code looks like this:
Downloader.js:
var http = require('http');

exports.downloadString = function(str) {
    console.log("Downloading from " + str);
    http.get(str, function(res) {
        var data = [];
        console.log("Got response: " + res.statusCode);
        res.on('data', function (chunk) {
            data.push(chunk);
        });
        res.on('end', function() {
            return data.join('');
        });
    }).on('error', function(e) {
        console.log("Got error: " + e.message);
    });
}
app.js:
var downloader = require('./lib/Downloader')
  , dateParser = require('./lib/DateParser')
  , eventIdGenerator = require('./lib/EventIdGenerator');
var report = downloader.downloadString("http://exampleapi.com");
console.log(report);
I need to wait until the variable named "report" is populated.
Obviously this means my Mocha tests are also failing as I'm still unsure of how to tell the test to wait until the variable is filled.
I'm sure this is extremely simple, but I am drawing a blank!
Thanks!
Node.js is (mostly) asynchronous, so you'd need to pass a callback function to your module:
Downloader.js:
var http = require('http');

exports.downloadString = function(str, callback) {
    console.log("Downloading from " + str);
    http.get(str, function(res) {
        var data = [];
        console.log("Got response: " + res.statusCode);
        res.on('data', function (chunk) {
            data.push(chunk);
        });
        res.on('end', function() {
            callback(data.join(''));
        });
    }).on('error', function(e) {
        console.log("Got error: " + e.message);
    });
};
app.js:
var downloader = require('./lib/Downloader')
  , dateParser = require('./lib/DateParser')
  , eventIdGenerator = require('./lib/EventIdGenerator');

downloader.downloadString("http://exampleapi.com", function(report) {
    console.log(report);
});
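Since the question also mentions Mocha: an asynchronous test can wait for the callback by using Mocha's done parameter. A minimal sketch, assuming the callback-based Downloader module above (the URL and assertion are illustrative):
// Minimal sketch of an asynchronous Mocha test for the callback-based
// downloadString shown above; the URL and assertion are illustrative.
var assert = require('assert');
var downloader = require('./lib/Downloader');

describe('downloadString', function() {
    it('passes the downloaded string to the callback', function(done) {
        downloader.downloadString("http://exampleapi.com", function(report) {
            assert.ok(report.length > 0);
            done(); // Tells Mocha the async work has finished.
        });
    });
});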

Node.js script would break when requesting HTTP responses from a site that does not exist

Using Node.js, when one makes an HTTP request, in optimal circumstances the request comes back with an HTTP response.
However, sometimes the request breaks because the site, for example, returns a 404 code, or the site does not exist at all. When requesting a batch of URLs, if there is a 404 code on, say, the 200th URL out of the 1000 URLs requested, the entire script breaks. Here is my code:
var http = require('http'); // (db is assumed to be initialised elsewhere.)

var hostNames = ['www.gizmodo.com', 'www.sitethatdoesnotexist123.com', 'www.google.com'];

for (var i = 0; i < hostNames.length; i++) {
    var options = {
        host: hostNames[i],
        path: '/'
    };
    (function (i) {
        http.get(options, function(res) {
            var obj = {};
            obj.url = hostNames[i];
            obj.statusCode = res.statusCode;
            obj.headers = res.headers;
            db.scrape.save(obj);
        }).on('error', function(e) {
            console.log("Error Site: " + hostNames[i]);
        });
    })(i);
}
Is there a way, that for example, if the site does not exist, I simply skip to the next URL, instead of having the script break?
EDIT: Fixed. Thanks user DavidKemp
Use a try/catch block to catch any errors that might occur, and then continue on from there.
For example:
var hostNames = ['www.gizmodo.com', 'www.sitethatdoesnotexist123.com', 'www.google.com'];

// Moved the function out so we do not have to keep redefining it:
var get_url = function (i) {
    http.get(options, function(res) {
        var obj = {};
        obj.url = hostNames[i];
        obj.statusCode = res.statusCode;
        obj.headers = res.headers;
        console.log(JSON.stringify(obj, null, 4));
    });
};

for (var i = 0; i < hostNames.length; i++) {
    var options = {
        host: hostNames[i],
        path: '/'
    };
    try {
        get_url(i);
    }
    catch (err) {
        // do something with err
    }
}
You need to bind an error handler to your request. I also cleaned up the code a bit.
hostNames.forEach(function(hostName) {
    var req = http.get({ host: hostName }, function(res) {
        var obj = {
            url: hostName,
            statusCode: res.statusCode,
            headers: res.headers
        };
        console.log(JSON.stringify(obj, null, 4));
    });
    req.on('error', function(err) {
        console.log('Failed to fetch', hostName);
    });
});
You can use the uncaughtException event. This lets the script keep running even after an exception (see the Node.js process documentation).
process.on('uncaughtException', function(err) {
    console.log('Caught exception: ' + err);
});

var hostNames = ['www.gizmodo.com', 'www.sitethatdoesnotexist123.com', 'www.google.com'];

for (var i = 0; i < hostNames.length; i++) {
    var options = {
        host: hostNames[i],
        path: '/'
    };
    (function (i) {
        http.get(options, function(res) {
            var obj = {};
            obj.url = hostNames[i];
            obj.statusCode = res.statusCode;
            obj.headers = res.headers;
            db.scrape.save(obj);
        }).on('error', function(e) {
            console.log("Error Site: " + hostNames[i]);
        });
    })(i);
}
Added a callback for when there's an error. It logs the site that returned an error to the console. This error is usually triggered by a 404 or a request that takes too long.
The full docs are at http://nodejs.org/api/http.html#http_http_get_options_callback at the time of writing. loganfsmyth's answer provides a useful example.
