I am writing a backup script that simply downloads all the blobs in all the blob containers of a specific Azure account.
The script uses async.js to make sure only so many workers can run at the same time, so it doesn't overload the server. When I run this script it works fine, but when it hits large files it runs out of memory. I'm guessing the download runs faster than the disk can write, and it eventually fills up the in-memory buffer so badly that I run out of memory entirely, but debugging the exact cause has been impossible so far.
The specific function which appears to use a lot of memory is called as follows:
blobService.getBlobToStream(
    containerName,
    blob.name,
    fs.createWriteStream(fullPath),
    function(error) {
        if(error) { //Something went wrong, write it to the console but finish the queue item and continue.
            console.log("Failed writing " + blob.name + " (" + error + ")");
            callback();
        }
        else { //Write the last modified date and finish the queue item silently
            fs.writeFile(fullPath + ".date", blobLastModified, function(err) {
                if(err) console.log("Couldn't write .date file: " + err);
            });
            callback();
        }
    });
Even a single 700MB download will easily fill up 1GB of memory on my side.
Is there any way around this? Am I missing a parameter which magically prevents the Azure SDK from buffering everything and the kitchen sink?
Full code:
#!/usr/bin/env node
//Requires
var azure = require('azure');
var fs = require('fs');
var mkdirp = require('mkdirp');
var path = require('path');
var async = require('async');
var maxconcurrency = 1; //Max number of getBlobsAndSaveThem() workers running simultaneously through async.js.
var blobService = azure.createBlobService();
var backupPrefix = '/backups/azurebackup/'; //Always end with a '/'!!
//Main flow of the script is near the bottom of the file.
var containerProcessingQueue = async.queue(
    function getBlobsAndSaveThem(containerName) {
        console.log(containerName); //DEBUG
        blobService.listBlobs(containerName,
            function(error, blobs) {
                if(!error) {
                    var blobProcessingQueue =
                        async.queue(function(index, callback) {
                            var blob = blobs[index];
                            console.log(blob); //DEBUG
                            var fullPath = backupPrefix + containerName + '/' + blob.name;
                            var blobLastModified = new Date(blob.properties['last-modified']);
                            //Only create if the directory doesn't exist, since mkdirp fails if the directory exists.
                            if(!fs.existsSync(path.dirname(fullPath))) { //And do it sync, because otherwise it'll check 99999 times if the directory exists simultaneously, doesn't find it, then fails to create it 99998 times.
                                try {
                                    mkdirp.sync(path.dirname(fullPath)); //mkdirp.sync takes no callback; it throws on failure.
                                } catch(err) {
                                    console.log('Failed to create directory ' + path.dirname(fullPath) + " (" + err + ")");
                                }
                            }
                            if(fs.existsSync(fullPath + ".date")) {
                                if(blobLastModified == fs.readFileSync(fullPath + ".date").toString()) {
                                    callback();
                                    return; //If the file is unmodified, return. No, this won't exit the program, because it's called within a function definition (async.queue(function ...)).
                                }
                            }
                            blobService.getBlobToStream(
                                containerName,
                                blob.name,
                                fs.createWriteStream(fullPath),
                                function(error) {
                                    if(error) { //Something went wrong, write it to the console but finish the queue item and continue.
                                        console.log("Failed writing " + blob.name + " (" + error + ")");
                                        callback();
                                    }
                                    else { //Write the last modified date and finish the queue item silently
                                        fs.writeFile(fullPath + ".date", blobLastModified, function(err) {
                                            if(err) console.log("Couldn't write .date file: " + err);
                                        });
                                        callback();
                                    }
                                });
                        }, maxconcurrency);
                    //Push new items to the queue for processing
                    for(var blobindex in blobs) {
                        blobProcessingQueue.push(blobindex);
                    }
                }
                else {
                    console.log("An error occurred listing the blobs: " + error);
                }
            });
    }, 1);
blobService.listContainers(function(err, result){
for(var i=0;i<result.length;i++) {
containerProcessingQueue.push(result[i].name);
}
});
For all those now curious: the option names for the range start and end have since changed. They are now just rangeStart and rangeEnd.
Here is the Azure Node documentation for more help:
http://dl.windowsazure.com/nodestoragedocs/BlobService.html
One thing that you could possibly do is read only a chunk of data into the stream instead of the whole blob, append that to the file, and then read the next chunk; the Blob Storage service supports that. If you look at the source code for getBlobToStream (https://github.com/WindowsAzure/azure-sdk-for-node/blob/master/lib/services/blob/blobservice.js), you can specify from/to bytes in the options - rangeStartHeader and rangeEndHeader. See if that helps.
I have hacked together some code which does just that (as you can see from my code, my knowledge of node.js is quite primitive :)). [Please use this code just to get an idea of how you can do a chunked download, as I think it still has some glitches.]
var azure = require('azure');
var fs = require('fs');
var blobService = azure.createBlobService("account", "accountkey");
var containerName = "container name";
var blobName = "blob name";
var blobSize;
var chunkSize = 1024 * 512;//chunk size -- we'll read 512 KB at a time.
var startPos = 0;
var fullPath = "D:\\node\\";
blobService.getBlobProperties(containerName, blobName, null, function (error, blob) {
    if (error) {
        throw error;
    }
    else {
        blobSize = blob.contentLength;
        fullPath = fullPath + blobName;
        console.log(fullPath);
        doDownload();
    }
});
function doDownload() {
    var stream = fs.createWriteStream(fullPath, {flags: 'a'});
    var endPos = startPos + chunkSize;
    if (endPos > blobSize) {
        endPos = blobSize;
    }
    console.log("Downloading " + (endPos - startPos) + " bytes starting from " + startPos + " marker.");
    blobService.getBlobToStream(containerName, blobName, stream,
        { "rangeStartHeader": startPos, "rangeEndHeader": endPos - 1 }, function(error) {
            if (error) {
                throw error;
            }
            else {
                startPos = endPos;
                if (startPos <= blobSize - 1) {
                    doDownload();
                }
            }
        });
}
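If you are on a newer SDK build that uses the renamed options mentioned at the top of this answer, the same range request would presumably look like the sketch below; this is untested, and only the option names change:
// Untested sketch: the same chunked call with the newer option names
// (rangeStart/rangeEnd); everything else stays as in doDownload().
blobService.getBlobToStream(containerName, blobName, stream,
    { rangeStart: startPos, rangeEnd: endPos - 1 }, function (error) {
        if (error) { throw error; }
        startPos = endPos;
        if (startPos <= blobSize - 1) {
            doDownload();
        }
    });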
Related
Similar to this question,
I have a script that downloads a file from a given URL via http.get.
How can I make sure the pipe has finished before continuing to the next iteration, using just the http/https module?
//nodejs default libs
var fs = require("fs");
var http = require('https');
function dlFile(fullFilePath, dlUrl, fsize, fname){
var file = fs.createWriteStream(fullFilePath); //fullFilePath will dictate where we will save the file + filename.
var rsult ='';
var downloadedFsize;
var stats; //stats of the file will be included here
var request = http.get( dlUrl, function(response) {
let rsult = response.statusCode;
//will respond with a 200 if the file is present
//404 if file is missing
response.pipe(file);
/*pipe writes the file...
how do we stop the iteration while it is not yet finished writing?
*/
console.log(" \n FILE : " + fname);
console.log("File analysis finished : statusCode: " + rsult + " || Saved on " + fullFilePath);
console.log(' \n Downloaded from :' + dlUrl);
console.log(' \n SQL File size is : ' + fsize);
//identify filesize
stats = fs.statSync(fullFilePath);
downloadedFsize = stats["size"]; //0 because the pipe isn't finished yet...
console.log(' actual file size is : ' + downloadedFsize);
}).on('error', function(e) {
console.error(e);
//log that an error happened to the file
}).on('end', function(e){
//tried putting the above script here but nothing happens
});
return rsult;
}
Is there a cleaner approach similar to what I have in mind above, or should I approach this differently? I tried putting the code in .on('end'), but it does nothing.
The end event is not triggered on the request; instead, it is triggered on the response (docs):
response.on("end", function() {
console.log("done");
});
As @Jonas Wilms says, the trigger was indeed on the response.
//nodejs default libs
var fs = require("fs");
var http = require('https');
function dlFile(fullFilePath, dlUrl, fsize, fname){
var file = fs.createWriteStream(fullFilePath); //fullFilePath will dictate where we will save the file + filename.
var rsult ='';
var downloadedFsize;
var stats; //stats of the file will be included here
var request = http.get( dlUrl, function(response) {
let rsult = response.statusCode;
//will respond with a 200 if the file is present
//404 if file is missing
response.pipe(file).on('finish', function(e){
console.log(" \n FILE : " + fname);
console.log("File analysis finished : statusCode: " + rsult + " || Saved on " + fullFilePath);
console.log(' \n Downloaded from :' + dlUrl);
console.log(' \n SQL File size is : ' + fsize);
//identify filesize
stats = fs.statSync(fullFilePath);
downloadedFsize = stats["size"];
console.log(' actual file size is : ' + downloadedFsize);
});
/*pipe writes the file above, and output the results once it's done */
}).on('error', function(e) {
console.error(e);
//log that an error happened to the file
}).on('end', function(e){
//tried putting the above script here but nothing happens
});
return rsult;
}
I have a node application that reads an uploaded file like so:
router.route('/moduleUpload')
.post(function (request, response) {
request.files.file.originalname = request.files.file.originalname.replace(/ +?/g, '');
var media = new Media(request.files.file, './user_resources/module/' + request.body.module_id + '/');
if (!fs.existsSync(media.targetDir)) {
try {
    fs.mkdirSync(media.targetDir, 0777); //mkdirSync takes no callback; it throws on failure
} catch (err) {
    console.log(err);
    response.send("ERROR! Can't make the directory! \n"); // echo the result back
}
fs.chmodSync(media.targetDir, 0777);
}
moveFile(media);
var token = jwt.encode({
mediaObject: media
}, require('../secret')());
response.status(200).json(token);
});
Now, when this file is uploaded and status code 200 is received, my system calls the following route:
router.route('/resourcePath/:encodedString')
.all(function (req, res) {
var decoded = jwt.decode(req.params.encodedString, require('../secret')());
var mediaObject = decoded.mediaObject;
var ext = mediaObject.file.originalname.substr(mediaObject.file.originalname.lastIndexOf('.'));
var path = 'app_server' + mediaObject.targetDir.substring(1) + mediaObject.fileName + ext;
var fileExist = false;
res.status(200).send(path)
});
Now, for some reason, this route is called before the file is correctly in place, which means that sometimes my users cannot see the content.
To make sure the file is in the folder, I thought of adding the following code:
var fileExist = false;
while (!fileExist) {
if (fs.existsSync('/var/www/learningbankapp/'+path)) {
fileExist = true;
}
}
However, I'm not sure this is a good solution, namely because it goes against Node.js's nature. So my question is: is there a better way to do it?
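One direction (a minimal sketch, not the poster's actual code: it assumes moveFile can be changed to accept a completion callback and that it copies the upload with streams) is to send the response only after the move has finished, instead of polling for the file:
// Hypothetical rework: moveFile takes a callback, and the route only responds
// once the file has been flushed to disk. moveFile's internals (media.file.path,
// media.fileName) are assumptions for illustration.
function moveFile(media, done) {
    var source = fs.createReadStream(media.file.path); // assumed temp upload path
    var target = fs.createWriteStream(media.targetDir + media.fileName); // assumed target name
    source.pipe(target);
    source.on('error', done);
    target.on('error', done);
    target.on('finish', done); // file is fully written at this point
}

router.route('/moduleUpload')
    .post(function (request, response) {
        var media = new Media(request.files.file, './user_resources/module/' + request.body.module_id + '/');
        moveFile(media, function (err) {
            if (err) {
                return response.status(500).send("ERROR! Could not move the uploaded file.\n");
            }
            var token = jwt.encode({ mediaObject: media }, require('../secret')());
            response.status(200).json(token); // the client only gets the token once the file exists
        });
    });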
I'm running nodejs, not as a webserver, but from the command line, against a pretty heavily modified version of the example.js which comes with the phantom-cluster package. The server is Ubuntu 13.10 on an AWS instance.
My goal is to "ping" more than 64000 URLs to test for 404 or 500 HTTP errors. If there is an error, log that URL with the error for later processing.
Here is my code:
(function() {
var WEBSITES, cluster, enqueueRequests, main, phantomCluster;
var fs = require('fs');
phantomCluster = require("./index");
cluster = require("cluster");
WEBS = [];
function loadUrls(callback)
{
console.log("starting loaded");
var fs = require('fs');
var urls = [];
fs.readFile("/home/ubuntu/phantom-cluster/theurls.txt", 'utf8', function (err, data)
{
if (err) throw err;
var myArray = data.split("\n");
for(i=0;i<myArray.length;i++)
{
urls.push(myArray[i]);
}
callback(null,urls);
})
}
enqueueRequests = function(engine)
{
fulfilled = 0;
loadUrls(function(err,WEBS)
{
console.log(">>" + WEBS.length + " urls to process");
var enqueuer, i, key, _i, _results;
enqueuer = function(request)
{
var item;
item = engine.enqueue(request);
item.on("timeout", function()
{
fs.appendFile("/home/ubuntu/error_log.log", "TIMEOUT: " + request + "\r\n", function (err) {});
});
return item.on("response", function()
{
fulfilled++;
console.log(fulfilled);
});
};
_results = [];
for (i = i = 0;i < 1; i++)
{
_results.push((function()
{
var _results1;
_results1 = [];
for(x=0;x<WEBS.length;x++)
{
_results1.push(enqueuer(WEBS[x]));
}
return _results1;
})());
}
return _results;
});
};
main = function()
{
var engine;
engine = phantomCluster.createQueued(
{
workers: 20,
workerIterations: 1,
phantomBasePort: 54321
});
if (cluster.isMaster)
{
enqueueRequests(engine);
}
engine.on("queueItemReady", function(url)
{
var _this = this;
var retVal;
urlArray = url.split("|");
var phantom = this.ph;
var curPage = phantom.createPage(function(page)
{
page.set('settings.loadImages', false);
page.set('settings.javascriptEnabled', false);
page.set('settings.resourceTimeout', 5000);
page.set('settings.userAgent','Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
page.set('onError', function(msg, trace)
{
var msgStack = ['ERROR: ' + msg];
if (trace && trace.length)
{
msgStack.push('TRACE:');
trace.forEach(function(t)
{
msgStack.push(' -> ' + t.file + ': ' + t.line + (t.function ? ' (in function "' + t.function +'")' : ''));
});
}
console.error(msgStack.join('\n'));
});
page.set('onResourceReceived', function(response)
{
if((response.status == "404") || (response.status == "500"))
{
myUrl = decodeURI(response.url);
if(myUrl == urlArray[0])
{
retVal = response.status + "|" + url;
fs.appendFile("/home/ubuntu/error_log.log", response.status + "|" + url + "\r\n", function (err) {});
return retVal;
}
}
});
page.open(urlArray[0], function(status)
{
_this.next(); // _this is a PhantomQueuedClusterClient object
return _this.queueItemResponse(status);
});
});
});
return engine.start();
};
main();
}).call(this);
The file which is referenced as index.js is here:
https://github.com/dailymuse/phantom-cluster/blob/master/index.js
and I have not modified it at all.
This works great, and sparks up 20 worker processes which go out and get the initial response code for the queued urls.
Here is the problem:
After processing anywhere from 960 to 990 URLs, the whole thing just stops. No error code, no nothing.
I've tried everything I can think of, from some sort of Node timeout, to an issue with a given URL, to banging my head against my desk. The first two return an error when I create a test for them. The third just makes my head hurt.
Anyone have any help or experience working with this?
EDIT: I made an update to the code, added the on('response') callback, and then called the nextTick method to remove the item from the queue. I still have the same issue.
Have you taken a look at link-crawler? It uses phantom-cluster and prerender to do almost exactly what you're looking for.
If all you're looking to do is check HTTP status codes, you don't need a headless browser for that. Node can do it on its own using http.request() or something that utilizes promises, like request-promise.
Unless you need to verify something in the rendering of the pages you're crawling, there's no need to render them in a browser; just make HTTP calls to the URLs and inspect their status codes.
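As a rough illustration of that suggestion, here is a minimal sketch using only the built-in https module (the URL list and log path are placeholders; for 64000 URLs you would still want to throttle concurrency, for example with async.queue as in the first question above):
var https = require('https');
var fs = require('fs');

// Placeholder list; in the question the URLs come from theurls.txt.
var urls = ['https://example.com/', 'https://example.com/missing'];

urls.forEach(function (url) {
    https.get(url, function (res) {
        if (res.statusCode === 404 || res.statusCode === 500) {
            fs.appendFile('error_log.log', res.statusCode + '|' + url + '\n', function () {});
        }
        res.resume(); // discard the body; only the status code matters here
    }).on('error', function (err) {
        fs.appendFile('error_log.log', 'ERROR|' + url + '|' + err.message + '\n', function () {});
    });
});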
I've searched around, and either can't find the exact question I'm trying to answer, or I need someone to explain it to me like I'm 5.
Basically, I have a Node.js script using the Net library. I'm connecting to multiple hosts, sending commands, and listening for return data.
var net = require('net');
var nodes = [
'HOST1,192.168.179.8',
'HOST2,192.168.179.9',
'HOST3,192.168.179.10',
'HOST4,192.168.179.11'
];
function connectToServer(tid, ip) {
var conn = net.createConnection(23, ip);
conn.on('connect', function() {
conn.write (login_string); // login string hidden in pretend variable
});
conn.on('data', function(data) {
var read = data.toString();
if (read.match(/Login Successful/)) {
console.log ("Connected to " + ip);
conn.write(command_string);
}
else if (read.match(/Command OK/)) { // command_string returned successful,
// read until /\r\nEND\r\n/
// First part of data comes in here
console.log("Got a response from " + ip + ':' + read);
}
else {
//rest of data comes in here
console.log("Atonomous message from " + ip + ':' + read);
}
});
conn.on('end', function() {
console.log("Lost conncection to " + ip + "!!");
});
conn.on('error', function(err) {
console.log("Connection error: " + err + " for ip " + ip);
});
}
nodes.forEach(function(node) {
var nodeinfo = node.split(",");
connectToServer(nodeinfo[0], nodeinfo[1]);
});
The data ends up being split into two chunks. Even if I store the data in a hash and append the first part to the remainder when I read the /\r\nEND\r\n/ delimiter, there's a chunk missing out of the middle. How do I properly buffer the data in order to make sure I get the complete message from the stream?
EDIT: Ok, this seems to be working better:
function connectToServer(tid, ip) {
var conn = net.createConnection(23, ip);
var completeData = '';
conn.on('connect', function() {
conn.write (login_string);
});
conn.on('data', function(data) {
var read = data.toString();
if (read.match(/Login Successful/)) {
console.log ("Connected to " + ip);
conn.write(command_string);
}
else {
completeData += read;
}
if (completeData.match(/Command OK/)) {
if (completeData.match(/\r\nEND\r\n/)) {
console.log("Response: " + completeData);
}
}
});
conn.on('end', function() {
console.log("Connection closed to " + ip );
});
conn.on('error', function(err) {
console.log("Connection error: " + err + " for ip " + ip);
});
}
My biggest problem was apparently a logic error: I was either waiting for the chunk that began the reply or the chunk that ended it, and I wasn't saving everything in between.
I guess if I wanted to get all Node-ish about it, I should fire an event whenever a complete message came in (beginning with a blank line, ending with 'END' on a line by itself), and do the processing there.
You shouldn't do anything with the data you receive until you get the end event. The end callback means that all data chunks have been sent through the stream to your callbacks. If data comes in more than one chunk, you need to create a variable within your function closure to store it. Most programs can work just fine ignoring this fact, because data usually comes across in one chunk, but sometimes it doesn't, and it doesn't even necessarily depend on the amount of data. If you're in that situation, I created an example that demos how to handle it. I basically used your code but removed all the fluff... this just demos the logic you need to collect all the data and do work on it.
function connectToServer(tid, ip) {
    var conn = net.createConnection(23, ip);
    var completeData = '';
    conn.on('connect', function() {
        conn.write(login_string); // login string hidden in pretend variable
    });
    conn.on('data', function(data) {
        completeData += data;
        var dataArray = completeData.split('your delimiter');
        if(dataArray.length > 1) { //If our data was split into several pieces, we have a complete chunk saved in the 0th position of the array
            doWorkOnTheFirstHalfOfData(dataArray[0]);
            completeData = dataArray[1]; //The second portion of data may yet be incomplete; this may need more complete logic if you can get more than one delimiter at a time...
        }
    });
    conn.on('end', function() {
        //do stuff with the "completeData" variable in here.
    });
}
My problem was a logic problem. I was either looking for the chunk that began the message or the chunk that ended the message, and ignoring everything in between. I guess I expected the entirety of the reply to come in one or two chunks.
Here's the working code, pasted from above. There's probably a more Node-ish way of doing it (I should really emit an event for each chunk of information), but I'll mark this as the answer unless someone posts a better version by this time tomorrow.
function connectToServer(tid, ip) {
var conn = net.createConnection(23, ip);
var completeData = '';
conn.on('connect', function() {
conn.write (login_string);
});
conn.on('data', function(data) {
var read = data.toString();
if (read.match(/Login Successful/)) {
console.log ("Connected to " + ip);
conn.write(command_string);
}
else {
completeData += read;
}
if (completeData.match(/Command OK/)) {
if (completeData.match(/\r\nEND\r\n/)) {
console.log("Response: " + completeData);
}
}
});
conn.on('end', function() {
console.log("Connection closed to " + ip );
});
conn.on('error', function(err) {
console.log("Connection error: " + err + " for ip " + ip);
});
}
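For the "more Node-ish" route mentioned above, one option is to wrap the connection in an EventEmitter that buffers incoming chunks and emits a message event whenever a complete /\r\nEND\r\n/-terminated reply has arrived. This is a rough sketch only; the delimiter handling and the login/command strings are assumptions, not tested against the real devices:
var net = require('net');
var EventEmitter = require('events').EventEmitter;

// Rough sketch: buffer data and emit a 'message' event for every complete
// block ending in \r\nEND\r\n. Protocol details are assumptions.
function createMessageStream(ip, port) {
    var emitter = new EventEmitter();
    var buffer = '';
    var conn = net.createConnection(port, ip);
    conn.on('data', function (data) {
        buffer += data.toString();
        var boundary = buffer.indexOf('\r\nEND\r\n');
        while (boundary !== -1) {
            emitter.emit('message', buffer.slice(0, boundary));
            buffer = buffer.slice(boundary + '\r\nEND\r\n'.length);
            boundary = buffer.indexOf('\r\nEND\r\n');
        }
    });
    conn.on('error', function (err) { emitter.emit('error', err); });
    emitter.connection = conn; // expose the socket so callers can write commands
    return emitter;
}

// Usage sketch:
// var stream = createMessageStream('192.168.179.8', 23);
// stream.connection.write(login_string);
// stream.on('message', function (msg) { console.log('Complete reply:', msg); });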
I'm trying to create a file downloader as a background service, but when a large file is scheduled, it's first put in memory and then, at the end of the download, written to disk.
How can I make the file be written to disk gradually, preserving memory, considering that I may have lots of files being downloaded at the same time?
Here's the code I'm using:
var sys = require("sys"),
http = require("http"),
url = require("url"),
path = require("path"),
fs = require("fs"),
events = require("events");
var downloadfile = "http://nodejs.org/dist/node-v0.2.6.tar.gz";
var host = url.parse(downloadfile).hostname
var filename = url.parse(downloadfile).pathname.split("/").pop()
var theurl = http.createClient(80, host);
var requestUrl = downloadfile;
sys.puts("Downloading file: " + filename);
sys.puts("Before download request");
var request = theurl.request('GET', requestUrl, {"host": host});
request.end();
var dlprogress = 0;
setInterval(function () {
sys.puts("Download progress: " + dlprogress + " bytes");
}, 1000);
request.addListener('response', function (response) {
response.setEncoding('binary')
sys.puts("File size: " + response.headers['content-length'] + " bytes.")
var body = '';
response.addListener('data', function (chunk) {
dlprogress += chunk.length;
body += chunk;
});
response.addListener("end", function() {
fs.writeFileSync(filename, body, 'binary');
sys.puts("After download finished");
});
});
I changed the callback to:
request.addListener('response', function (response) {
var downloadfile = fs.createWriteStream(filename, {'flags': 'a'});
sys.puts("File size " + filename + ": " + response.headers['content-length'] + " bytes.");
response.addListener('data', function (chunk) {
dlprogress += chunk.length;
downloadfile.write(chunk, 'binary');
});
response.addListener("end", function() {
downloadfile.end();
sys.puts("Finished downloading " + filename);
});
});
This worked perfectly.
Does the request package work for your use case?
It lets you do things like this:
request(downloadurl).pipe(fs.createWriteStream(downloadtohere))
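A slightly fuller sketch of the same idea (the URL and destination path are placeholders), waiting for the write stream to finish and logging errors:
var fs = require('fs');
var request = require('request'); // npm install request

// Placeholder URL and destination path.
var downloadurl = 'http://nodejs.org/dist/node-v0.2.6.tar.gz';
var downloadtohere = 'node-v0.2.6.tar.gz';

request(downloadurl)
    .on('error', function (err) { console.error('Download failed: ' + err); })
    .pipe(fs.createWriteStream(downloadtohere))
    .on('finish', function () { console.log('Saved ' + downloadtohere); })
    .on('error', function (err) { console.error('Write failed: ' + err); });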
Take a look at http-request:
// shorthand syntax, buffered response
http.get('http://localhost/get', function (err, res) {
if (err) throw err;
console.log(res.code, res.headers, res.buffer.toString());
});
// save the response to 'myfile.bin' with a progress callback
http.get({
url: 'http://localhost/get',
progress: function (current, total) {
console.log('downloaded %d bytes from %d', current, total);
}
}, 'myfile.bin', function (err, res) {
if (err) throw err;
console.log(res.code, res.headers, res.file);
});
When downloading a large file, please use fs.write and not writeFile, as writeFile will overwrite the previously written content.
function downloadfile(res) {
    // 'options' and the sendstatus/sendendstatus helpers are assumed to be defined elsewhere.
    var size = 0;
    var requestserver = http.request(options, function(r) {
        console.log('STATUS: ' + r.statusCode);
        console.log('HEADERS: ' + JSON.stringify(r.headers));
        var fd = fs.openSync('sai.tar.gz', 'w');
        r.on('data', function (chunk) {
            size += chunk.length;
            console.log(size + ' bytes received');
            sendstatus(res, size);
            fs.write(fd, chunk, 0, chunk.length, null, function(er, written) {
            });
        });
        r.on('end', function() {
            console.log('\nended from server');
            fs.closeSync(fd);
            sendendstatus(res);
        });
    });
    requestserver.end(); // actually send the request
}
Instead of holding the content in memory in the "data" event listener, you should write to the file in append mode.
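For example (a minimal sketch with a placeholder URL), piping the response straight to a write stream sends each chunk to disk as it arrives instead of accumulating it in memory; writing each data chunk to an append-mode stream, as in the accepted answer, has the same effect:
var http = require('http');
var fs = require('fs');

// Placeholder URL; response.pipe() handles backpressure and writes chunks as they arrive.
http.get('http://nodejs.org/dist/node-v0.2.6.tar.gz', function (response) {
    var file = fs.createWriteStream('node-v0.2.6.tar.gz');
    response.pipe(file);
    file.on('finish', function () { console.log('Download finished'); });
});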
Use streams, as Carter Cole suggested. Here is a more complete example:
var inspect = require('eyespect').inspector();
var request = require('request');
var filed = require('filed');
var temp = require('temp');
var downloadURL = 'http://upload.wikimedia.org/wikipedia/commons/e/ec/Hazard_Creek_Kayaker.JPG';
var downloadPath = temp.path({prefix: 'singlePageRaw', suffix: '.jpg'});
var downloadFile = filed(downloadPath);
var r = request(downloadURL).pipe(downloadFile);
r.on('data', function(data) {
inspect('binary data received');
});
downloadFile.on('end', function () {
inspect(downloadPath, 'file downloaded to path');
});
downloadFile.on('error', function (err) {
inspect(err, 'error downloading file');
});
You may need to install the modules, which you can do via:
npm install filed request eyespect temp