Asynchronously Perform Recursive Data Tree Construction? - javascript

I am working on a web application that makes use of a file tree. The frontend JavaScript performs an ajax request to my Node.js server which calls my browse2 exported function. This function is then responsible for supplying the correct path to my function, getFolderContents(), that recursively builds the file system hierarchy object structure.
My issue is that I am currently doing things synchronously. Having done research into the inner workings of Node.js, it seems as though I should avoid synchronous operations at all costs. As such, I wanted to convert my code to work asynchronously. However, I couldn't get it working and all of my solutions were convoluted.
I have tried managing the flow using the "async" package. I had no luck with figuring that out. I tried implementing my own system of counters/loops/callbacks to determine when processes had finished executing. Ultimately, I suppose I can't wrap my mind around asynchronous execution flow.
I would like to ask two questions:
1. In this case, would it be detrimental to perform this request synchronously instead of asynchronously?
2. If yes to the first question, how should I go about converting this code to be asynchronous?
Note: When I tried to do things asynchronously, I used each synchronous function's asynchronous counterpart.
Below is my synchronous (working) code:
var path = require('path');
var fs = require('fs');
exports.browse2 = function(request, response) {
var tree = getFolderContents('C:\\Users\\AccountName\\folder1\\folder2\\folder3\\test\\');
response.send(tree);
};
// Synchronously builds a tree object for the directory at `route`.
// Returns { title, folder: true, children }, where each child is either a
// nested directory object of the same shape or a { title, path } leaf.
// Directories are detected with lstat, so symbolic links are not followed.
function getFolderContents(route) {
  var title = path.basename(route);
  var children = fs.readdirSync(route).map(function(name) {
    var fullPath = path.join(route, name);
    if (fs.lstatSync(fullPath).isDirectory()) {
      return getFolderContents(fullPath);
    }
    // Leaf entry: note that "path" holds the entry name, not the joined path.
    return {
      "title" : path.basename(name),
      "path" : name
    };
  });
  return {
    title: title,
    folder: true,
    children: children
  };
}
I appreciate all input!
Edit:
Added asynchronous code attempt. Not fully working. Only a part of the tree is received.
exports.browse2 = function(request, response) {
getFolderContents(
'C:\\Users\\AccountName\\folder1\\folder2\\folder3\\test\\',
function(tree) {
response.send(tree);
});
};
// Asynchronous attempt at building the folder tree for `route`; `callback`
// is handed the branch object. As the surrounding text notes, this version
// is not fully working.
function getFolderContents(route, callback) {
var branch = {};
branch.title = path.basename(route);
branch.folder = true;
branch.children = [];
fs.readdir(route, function(err, files) {
// NOTE(review): `err` is not checked, and an empty directory never reaches
// the callback() call below because forEach has nothing to iterate.
files.forEach(function(file) {
var concatPath = path.join(route, file);
fs.lstat(concatPath, function(err, stats) {
if (stats.isDirectory())
// NOTE(review): this function has no return statement, so the value pushed
// here is undefined; the nested call is also given the caller's `callback`.
branch.children.push(getFolderContents(concatPath, callback));
else
branch.children.push({
"title" : path.basename(file),
"path" : file
});
// NOTE(review): runs once per directory entry, so `callback` fires after
// each individual lstat rather than once when the whole tree is complete.
callback(branch);
});
});
});
}

The basic problem you're having is that when you use asynchronous calls, you can't just assign things to the return of the function. The entire point of async is that the function won't wait. So for example:
// Counter-example: uses the return value of an asynchronous call as if the
// result were already available.
function get_data(a) {
var data = some_async_call(a);
//at this point, data is undefined because execution won't wait on the calls to finish
data.do_something(); // this breaks because of the above
}
So instead what you do is pass an anonymous function to the asynchronous function called a callback, and the asynchronous function calls that function once the operations actually complete. The above example would become this:
// Callback form: the work that needs `data` lives in a function that is
// handed to some_async_call and run by it once the data exists.
function get_data(a) {
  some_async_call(a, function onData(data) {
    data.do_something();
  });
}
// Thin adapter: forwards `variable` as the request data and registers
// `callback` as the success handler of call_async.
function some_async_call(variable, callback) {
  var options = {
    data: variable,
    success: callback
  };
  call_async(options);
}
And in your case that would look like this:
exports.browse2 = function(request, response) {
getFolderContents('C:\\Users\\AccountName\\folder1\\folder2\\folder3\\test\\', function(tree) {
response.send(tree);
});
};
// Sketch of the callback-based signature: build `branch`, then pass it to
// `callback` instead of returning it. The `...` line stands in for the
// elided tree-building code and is not valid JavaScript as written.
function getFolderContents(route, callback) {
var branch = {};
branch.title = path.basename(route);
...
callback(branch);
}
If you're familiar with setTimeout, this is how that works - the design pattern is to pass an anonymous function that does the work, and that function then executes once the data/information is actually available.

I managed to get it working. Here are my answers to my own questions:
It is better to perform the tasks asynchronously, because doing them synchronously blocks the whole application: other users would not receive their responses until every request ahead of theirs had been fully processed.
The way to convert the synchronous code to asynchronous code is to use a parallel loop. The code for my particular case is this:
var path = require('path');
var fs = require('fs');
exports.browse2 = function(request, response) {
getFolderContents(
'C:\\Users\\AccountName\\folder1\\folder2\\folder3\\test\\',
function(err, tree) {
if (err)
throw err;
response.send(tree);
});
};
// Asynchronously builds a tree object for the directory at `route`.
//
// callback(err, branch) is invoked exactly once:
//   - with an error if readdir, lstat or any nested directory fails;
//   - otherwise with { title, folder: true, children }, where each child is
//     a nested directory object of the same shape or a { title, path } leaf.
// All entries of a directory are processed in parallel, but children keep
// the order reported by fs.readdir because each result is stored by index.
function getFolderContents(route, callback) {
  var branch = {
    title: path.basename(route),
    folder: true,
    children: []
  };

  fs.readdir(route, function(err, files) {
    if (err)
      return callback(err);

    var pending = files.length;
    var failed = false;
    if (pending === 0)
      return callback(null, branch);

    // Reports the first error only; later completions are ignored.
    function fail(error) {
      if (failed)
        return;
      failed = true;
      callback(error);
    }

    // Records one finished child and completes once all entries are in.
    function settle(index, child) {
      if (failed)
        return;
      branch.children[index] = child;
      pending -= 1;
      if (pending === 0)
        callback(null, branch);
    }

    files.forEach(function(file, index) {
      var concatPath = path.join(route, file);
      fs.lstat(concatPath, function(err, stats) {
        if (err)
          return fail(err);
        if (!stats.isDirectory())
          return settle(index, {
            "title" : path.basename(file),
            "path" : file
          });
        getFolderContents(concatPath, function(err, res) {
          if (err)
            return fail(err);
          settle(index, res);
        });
      });
    });
  });
}
Thanks to user "chjj" with his response to a similar question on this thread: node.js fs.readdir recursive directory search
And thanks to user "Dan Smolinske" for directing me to the thread.

Related

Node.js with Express: Push to an empty Array returns an empty Array

I am listing all files from all directories in /home/myComputer/Desktop/Research, and then filtering them with an if statement to only get the .txt files that I would like to read and store into arrays. All works fine, but pushing the data into the arrays is not functioning. When I console log them, they return no value [].
I tried promise as well as call back function, but they didn't work for me because I didn't know how to implement them properly.
// GET /jsonData: walks the Research directory tree, and for every file named
// 'written-speech.txt' records its name, category and contents in three
// parallel arrays.
// NOTE(review): nothing in this handler ever writes to `res`.
app.get('/jsonData', function(req, res) {
/* Define Arrays */
var theFile = [];
var theCategory = [];
var theContent = [];
// Recursively collects file paths under `dir`, one entry at a time, and
// reports them through done(err, results).
var walk = function(dir, done) {
var results = [];
fs.readdir(dir, function(err, list) {
if (err) return done(err);
var i = 0;
// Processes list[i] and calls itself again from the stat/walk callbacks, so
// entries are handled serially.
(function next() {
var file = list[i++];
if (!file) return done(null, results);
file = dir + '/' + file;
fs.stat(file, function(err, stat) {
// NOTE(review): a failed stat leaves `stat` undefined, so the entry falls
// through to the else branch and is recorded as a file.
if (stat && stat.isDirectory()) {
walk(file, function(err, res) {
// NOTE(review): `err` from the nested walk is not checked here.
results = results.concat(res);
next();
});
} else {
results.push(file);
next();
}
});
})();
});
};
//walk(process.env.HOME, function(err, results) {
walk("/home/myComputer/Desktop/Research", function(err, results) {
if (err) throw err;
//console.log(results);
// map() is used purely for its side effects; its return value is discarded.
results.map(function(val) {
//Get the filename
var fileName = val.match(/[^\/]+$/).join();
//Get the category
// NOTE(review): 48 is a hard-coded character offset, presumably the length
// of a fixed path prefix — confirm against the real directory layout.
var category = val.substr(48).match(/[^\/]+/);
if (fileName == 'written-speech.txt') {
console.log('FOUND!: ' + fileName + ' Category: ' + category) //this works
// The pushes below happen inside the readFile callback, i.e. later than any
// code that follows the walk(...) call.
fs.readFile(val, 'utf8', function(err, contents) {
console.log(contents); // this works
theFile.push(fileName);
theCategory.push(category);
theContent.push(contents);
});
}
})
});
// These three statements execute right after walk() has been started, before
// any of the callbacks above have pushed into the arrays.
console.log(theFile); // The problem: This returns an empty Array []
console.log(theCategory); // The problem: This returns an empty Array []
console.log(theContent); // The problem: This returns an empty Array []
});
I expect console.log(theFile); console.log(theCategory); and console.log(theContent); to return the data pushed in them.
The reason for this is that many callbacks in JavaScript are asynchronous: both fs.readdir and fs.readFile are asynchronous, so their callbacks are not called immediately but slightly later (please read about the event loop in JavaScript). At the moment you log your arrays they are therefore still empty; the data is only pushed into them later. To avoid this you can use the synchronous methods (fs.readdirSync and fs.readFileSync), which is ugly and can cause performance issues if the app has a lot of other asynchronous operations. If in your case it is just a simple script to read some data, that might be fine.
And the other, preferred way is to use promises or some library for managing callbacks, e.g. async. Please read some articles regarding managing async code if these concepts are fully unfamiliar for you, e.g. https://dev.to/mrm8488/from-callbacks-to-fspromises-to-handle-the-file-system-in-nodejs-56p2 to get a basic understanding and see some use case examples.
Regarding your current version, there is no easy way to make it work without a lot of changes. It is better to rewrite it to use the concepts I described earlier.
walk is an asynchronous function because fs.readdir is an asynchronous method and the console.log statements are running (in a synchronous manner) before the callback of fs.readdir getting invoked.
You can console the values of these variables at the end inside the callback of walk.

How to make some synchronous code run before some other asynchronous code?

I have a function like this:
// Requests `url` and, for every chunk of the response, appends the chunk to
// disk using synchronous fs calls.
// NOTE(review): the `name` parameter is never read; `dir` is the literal
// string './name'.
var download = function(url, name) {
http.get(url, function(response) {
// part1 : create a new folder if it doesn't exist
// NOTE(review): `dir` is assigned without var/let/const, making it an
// implicit global.
dir = './name';
if (!fs.existsSync(dir)){
fs.mkdirSync(dir);
}
// part 2: download and save file into that folder
response.on('data', function (data) {
// NOTE(review): the append target is `dir` itself (the folder path), not a
// file inside that folder.
fs.appendFileSync(dir, data);
});
})
}
I want part 1 to finish before part 2 runs (so that I can have the dir for part 2). How can I do that ?
(In the code above, as I know so far ( i am new to node.js), both parts will run simultaneously, so i'm not sure that part 1 will always finish before part 2 runs).
both parts will run simultaneously
No, they will not. existsSync and mkdirSync are blocking calls. So, only after they are executed the Event handler will be attached.
But, we should take advantage of the asynchronicity whenever applicable. In this case, you can use the exists and mkdir asynchronous counterparts.
So, your code can be loosely refactored like this
// Requests `url` and appends the response body, chunk by chunk, to a file
// inside the './name' directory, creating that directory first if needed.
//
// url  - address handed to http.get.
// name - used as the file name inside the directory.
//        NOTE(review): the earlier version never defined `filename` and never
//        read `name`; mapping one to the other is an assumption — confirm.
//
// Errors are logged with console.error because the function takes no
// callback through which they could be returned.
function download(url, name) {
  function reportError(err) {
    console.error(err);
  }

  // Appends every incoming chunk to `filename`.
  function attachAppender(filename, response) {
    response.on('data', function (data) {
      // Hold the stream while this chunk is written so that the asynchronous
      // appends cannot complete out of order.
      response.pause();
      fs.appendFile(filename, data, function (err) {
        if (err) {
          reportError(err);
          response.destroy();
          return;
        }
        response.resume();
      });
    });
  }

  http.get(url, function (response) {
    var dir = './name';
    // pass the actual full file name
    var filename = dir + '/' + name;
    fs.exists(dir, function (exists) {
      if (exists) {
        attachAppender(filename, response);
        return;
      }
      fs.mkdir(dir, function (err) {
        // EEXIST means the directory appeared between the exists check and
        // mkdir, which is fine for our purposes.
        if (err && err.code !== 'EEXIST') {
          reportError(err);
          response.destroy();
          return;
        }
        attachAppender(filename, response);
      });
    });
  }).on('error', reportError);
}
Note: fs.exists is deprecated and possibly removed soon. Better use fs.stat instead of it.
You are using sync functions, so that the calls are blocking. However, as thefoureye mentioned it is better to use the async versions, for performance reasons.
If you want to avoid the callback hell (i.e your code becomes more and more difficult to read as you chain asynchronous calls), you can use a library such as async.js that is written in the intent of trying to make it easier to write (and of course, easier to read).
Here is an example taken from the unit tests of async.js: each async function is called after the other.
// Test case for async.series: three timer-based tasks are given to
// async.series, and the final callback checks both the collected results and
// the order in which the tasks ran. `test` is the harness object supplying
// ok(), same() and done().
var series = function(test){
var call_order = [];
async.series([
function(callback){
setTimeout(function(){
call_order.push(1);
callback(null, 1);
}, 25);
},
function(callback){
setTimeout(function(){
call_order.push(2);
callback(null, 2);
}, 50);
},
function(callback){
// Shortest delay of the three, yet expected last in call_order below.
setTimeout(function(){
call_order.push(3);
// Two result values are passed; the assertion below expects them to be
// collected as the array [3,3].
callback(null, 3,3);
}, 15);
}
],
function(err, results){
test.ok(err === null, err + " passed instead of 'null'");
test.same(results, [1,2,[3,3]]);
test.same(call_order, [1,2,3]);
test.done();
});
}
There are lots of other initiatives in order to make series of async calls easier to read and write (async/await, fibers.js for example)

NodeJS Variable outside function scope

For the life of me I cannot work this one out. Have look around and tried many many different ways of trying to get this to go. Currently have the following code.
var config = require("./config.js");
var cradle = require('cradle')
var MikroNode = require('mikronode');
var WebServer = require('./bin/www');
// Placeholder value; replaced by the temporaryView callback further down.
var Routers = "Hasnt changed";
var conndb = new(cradle.Connection)(config.couchdb.host);
var db = conndb.database(config.couchdb.db);
// Creates the database when the existence check reports it is missing.
db.exists(function(err, exists){
if (err) { console.log('error', err);}
else if (exists) { console.log('Seems the Force is with you - Database Exists');}
else { db.create(); }
});
// Queries documents of type 'ConfigRouter' and stores the JSON-serialised
// result in Routers.
// NOTE(review): `err` is not checked in the result callback.
db.temporaryView({
map: function (doc){
if (doc.type=='ConfigRouter') emit(doc.name, doc);
}
}, function (err, res){
Routers = JSON.stringify(res);
}
);
// Executes immediately after the view request has been issued; the output
// shown below the snippet confirms it prints the placeholder value.
console.log(Routers);
As it stands it will respond with:
E:\Dev\MM>npm start
> MM#0.0.1 start E:\Dev\MM
> node ./Start.js
Hasnt changed
Seems the Force is with you - Database Exists
I am assuming it is an asynchronous call to the CouchDB and is not filling the result in time before it displays the result. How do I get around this issue?
You are right, the call is asynchronous so when console.log(Routers); is processed, Routers is "Hasnt changed".
One way of doing it would be to use promises thanks to the Q npm module:
// Wraps the callback-based temporaryView call in a Q deferred so that code
// depending on the result can be chained with then().
var Q = require('q');
var deferred = Q.defer();
db.temporaryView({
map: function (doc) {
if (doc.type=='ConfigRouter') emit(doc.name, doc);
}
}, function (err, res) {
  // Reject on failure so the error reaches the promise chain instead of
  // resolving with the serialisation of an undefined result.
  if (err) {
    deferred.reject(err);
    return;
  }
  deferred.resolve(JSON.stringify(res));
});
deferred.promise
  .then(function (data) {
    Routers = data;
    console.log(Routers);
    // do some stuff...
  })
  // done() ends the chain; a rejection that nothing handled is rethrown
  // rather than silently dropped.
  .done();
Maybe it's possible to do something better without using Q.defer and adapting directly the callback:
https://github.com/kriskowal/q#adapting-node

Asynchronously Write Large Array of Objects to Redis with Node.js

I created a Node.js script that creates a large array of randomly generated test data and I want to write it to a Redis DB. I am using the redis client library and the async library. Initially, I tried executing a redisClient.hset(...) command within the for loop that generates my test data, but after some Googling, I learned the Redis method is asynchronous while the for loop is synchronous. After seeing some questions on StackOverflow, I can't get it to work the way I want.
I can write to Redis without a problem with a small array or larger, such as one with 100,000 items. However, it does not work well when I have an array of 5,000,000 items. I end up not having enough memory because the redis commands seem to be queueing up, but aren't executed until after async.each(...) is complete and the node process does not exit. How do I get the Redis client to actually execute the commands, as I call redisClient.hset(...)?
Here a fragment of the code I am working with.
var redis = require('redis');
var async = require('async');
// Connects to Redis, generates the test data, then issues one HSET per item
// through async.each.
var redisClient = redis.createClient(6379, '192.168.1.150');
var testData = generateTestData();
async.each(testData, function(item, callback) {
var someData = JSON.stringify(item.data);
redisClient.hset('item:'+item.key, 'hashKey', someData, function(err, reply) {
console.log("Item was persisted. Result: " +reply);
});
// Called right after hset() has been issued, not from inside its reply
// callback, so async.each treats the item as done before Redis has answered.
callback();
}, function(err) {
if (err) {
console.error(err);
} else {
// NOTE(review): `console.log.info` — console.log is a function and does not
// normally carry an `info` member; console.info was presumably intended.
console.log.info("Items have been persisted to Redis.");
}
});
You could call eachLimit to ensure you are not executing too many redisClient.hset calls at the same time.
To avoid overflowing the call stack you could do setTimeout(callback, 0); instead of calling the callback directly.
edit:
Forget what I said about setTimeout. All you need to do is call the callback at the right place. Like so:
// Signal completion of this item only once Redis has replied; a failure is
// handed to the iteration callback so the final handler can see it.
redisClient.hset('item:'+item.key, 'hashKey', someData, function(err, reply) {
  if (err) {
    callback(err);
    return;
  }
  console.log("Item was persisted. Result: " +reply);
  callback();
});
You may still want to use eachLimit and try out which limit works best.
By the way - async.each is supposed to be used only on code that schedules the invocation of the callback in the javascript event queue (e.g. timer, network, etc) . Never use it on code that calls the callback immediately as was the case in your original code.
edit:
You can implement your own eachLimit function that, instead of an array, takes a generator as its first argument. Then you write a generator function to create the test data. For that to work, node needs to be run with "node --harmony code.js".
// Applies `iterator(value, done)` to every value produced by `generator`,
// working in batches of at most `limit` values: a new batch is pulled only
// after every item of the current batch has reported completion.
//
// `callback(null)` is invoked when the generator is exhausted. The first
// error passed to a `done` callback is forwarded to `callback(err)`, after
// which no further batches are started and later completions are ignored.
function eachLimit(generator, limit, iterator, callback) {
  var failed = false;

  function runBatch() {
    // Pull up to `limit` values without running ahead of the generator.
    var batch = [];
    while (batch.length < limit) {
      var step = generator.next();
      if (step.done) {
        break;
      }
      batch.push(step.value);
    }

    var outstanding = batch.length;
    if (outstanding === 0) {
      callback(null);
      return;
    }

    function onItemDone(err) {
      if (failed) {
        return;
      }
      if (err) {
        callback(err);
        failed = true;
        return;
      }
      outstanding -= 1;
      if (outstanding === 0) {
        runBatch();
      }
    }

    for (var k = 0; k < batch.length; k++) {
      iterator(batch[k], onItemDone);
    }
  }

  runBatch();
}
// Generator that yields one freshly constructed Data item per loop
// iteration. Both `...` occurrences are placeholders for the real loop
// condition and constructor arguments; the snippet is not runnable as-is.
function* testData() {
while(...) {
yield new Data(...);
}
}
// Persists every generated item to Redis, with at most 10 HSET commands in
// flight at a time. Each item reports completion (or its error) only from
// inside the Redis reply callback.
eachLimit(testData(), 10, function(item, callback) {
  var someData = JSON.stringify(item.data);
  redisClient.hset('item:'+item.key, 'hashKey', someData, function(err, reply) {
    if(err) callback(err);
    else {
      console.log("Item was persisted. Result: " +reply);
      callback();
    }
  });
}, function(err) {
  if (err) {
    console.error(err);
  } else {
    // console.info is the logging function; `console.log.info` does not exist
    // and would throw on the success path.
    console.info("Items have been persisted to Redis.");
  }
});

node-mysql timing

i have a recursive query like this (note: this is just an example):
// First attempt: a user object whose loadMinions() queries the users owned
// by this user and recursively loads theirs as well.
var user = function(data)
{
this.minions = [];
this.loadMinions = function()
{
// NOTE(review): `_user` is assigned without var, so it is a single implicit
// global shared by every instance; query callbacks that run later read
// whichever instance assigned it last.
_user = this;
// NOTE(review): data.id is concatenated straight into the SQL text.
database.query('select * from users where owner='+data.id,function(err,result,fields)
{
for(var m in result)
{
_user.minions[result[m].id] = new user(result[m]);
_user.minions[result[m].id].loadMinions();
}
}
// NOTE(review): the brace above closes the callback, but the closing `);` of
// the database.query( call appears to be missing.
// This log line sits outside the query callback, so it runs as soon as the
// query has been issued.
console.log("loaded all minions");
}
}
// NOTE(review): loadMinions() is never called on currentUser here, so the
// loop below iterates the initial empty array.
currentUser = new user(ID);
for (var m in currentUser.minions)
{
console.log("minion found!");
}
This doesn't work because the timing is all wrong: the code doesn't wait for the query to finish.
i've tried to do this:
// Attempt to make database.query look synchronous: start the query, then
// spin until its callback has set Done, and return the stored result.
var MyQuery = function(QueryString){
var Data;
var Done = false;
database.query(QueryString, function(err, result, fields) {
Data = result;
Done = true;
});
// Busy-wait: this loop never returns control, so the callback above (the
// only code that sets Done) is never given a chance to run.
while(Done != true){};
return Data;
}
// Second attempt: same user object, but loadMinions() obtains its rows from
// the blocking MyQuery helper instead of a query callback.
var user = function(data)
{
this.minions = [];
this.loadMinions = function()
{
// NOTE(review): `_user` and `result` are assigned without var and are
// therefore implicit globals shared across the recursive calls.
_user = this;
result= MyQuery('select * from users where owner='+data.id);
for(var m in result)
{
_user.minions[result[m].id] = new user(result[m]);
_user.minions[result[m].id].loadMinions();
}
console.log("loaded all minions");
}
}
// NOTE(review): loadMinions() is not invoked on currentUser in this snippet.
currentUser = new user(ID);
for (var m in currentUser.minions)
{
console.log("minion found!");
}
But it just freezes on the while loop. Am I missing something?
The first hurdle to solving your problem is understanding that I/O in Node.js is asynchronous. Once you know how this applies to your problem the recursive part will be much easier (especially if you use a flow control library like Async or Step).
Here is an example that does some of what you're trying to do (minus the recursion). Personally, I would avoid recursively loading a possibly unknown number/depth of records like that; Instead load them on demand, like in this example:
// Wraps a user record and loads the users it owns ("minions") on demand.
//
// data - the user's row; data.id is used as the owner key in getMinions().
var User = function(data) {
  this.data = data;
  // Cache filled by the first successful getMinions() call.
  this.minions = null;
};

// Fetches the users owned by this user.
//
// primaryCallback(error, minions) receives the query error, or the rows.
// After the first successful query the cached rows are returned and the
// database is not contacted again.
User.prototype.getMinions = function(primaryCallback) {
  var that = this; // scope handle
  if (this.minions) { // bypass the db query if results cached
    return primaryCallback(null, this.minions);
  }
  // Callback invoked by database.query when it has the records
  var aCallback = function(error, results, fields) {
    if (error) {
      return primaryCallback(error);
    }
    // This is where you would put your recursive minion initialization
    // The problem you are going to have is callback counting, using a library
    // like async or step would make this part much much easier
    that.minions = results; // bypass the db query after this
    primaryCallback(null, results);
  };
  // `data` exists only as the constructor's parameter, so the id has to be
  // read from the instance. The `?` placeholder leaves escaping of the value
  // to the driver (assumes a node-mysql style query(sql, values, callback)
  // signature — confirm).
  database.query('SELECT * FROM users WHERE owner = ?', [this.data.id], aCallback);
};
// Usage: everything that needs the minions runs inside the function that
// getMinions invokes through primaryCallback(...).
var user = new User(someData);
user.getMinions(function onMinions(error, minions) {
  if (error) {
    throw error;
  }
  minions.forEach(function logMinion(minion) {
    console.log('found this minion:', minion);
  });
});
The biggest thing to note in this example are the callbacks. The database.query(...) is asynchronous and you don't want to tie up the event loop waiting for it to finish. This is solved by providing a callback, aCallback, to the query, which is executed when the results are ready. Once that callback fires and after you perform whatever processing you want to do on the records you can fire the primaryCallback with the final results.
Each Node.js process is single-threaded, so the line
while(Done != true){};
takes over the thread, and the callback that would have set Done to true never gets run because the thread is blocked in an infinite loop.
You need to refactor your program so that code that depends on the results of the query is included within the callback itself. For example, make MyQuery take a callback argument:
MyQuery = function(QueryString, callback){
Then call the callback at the end of your database.query callback -- or even supply it as the database.query callback.
The freezing is unfortunately correct behaviour, as Node is single-threaded.
You need a scheduler package to fix this. Personally, I have been using Fibers-promise for this kind of issue. You might want to look at this or another promise library or at async

Categories

Resources