Dir walking to get abs path array of various files

Dir walking to get abs path array of various files - javascript

Goal: Get a List of Absolute Paths for all files in a directory recursively leveraging NodeJs.
Info: As a python dev, I normally use the python packages which handle this in a platform independent fashion. My boss wanted some javascript code which would handle this goal... and as a JS dev previously, I was like "oh this is easy. Let's look up the node as I never got a chance to get my hands dirty with it." but I seem to be mistaken.
I don't see anything in node relating to Dir Walking, or a way I could hack together to create such a thing.
I was looking in "Child Process", "Console", "File System", "OS", "Path", and "Process". I didn't see anything which would do anything akin to:
pushd .
cd $dir
for folder in $(ls);
do
pushd .
cd $folder
//call again
ls $(pwd)$flag >> $dir/files_matching.txt
popd
done;
// or any platform independent means of recursively getting
// all files and their abs path which meet flag criterion,
// such as "*.txt" || "_*found*"
I could use child process to carry out Command Line items, but then I need to create a bunch of conditionals based on the OS consuming the app, and figured this would be something which already exists.
I don't want to reinvent the wheel, but figured this has already been done; I just don't see it in the base modules.
Is there a node module I would need which accomplishes this, which is outside of the base modules?
I am trying not to have to hand roll a conditional os based system to get an exhaustive list of abs paths for all files under a directory (or subset due to extensions, etc.)

I'd do it like this:
synchronous:
const fs = require("fs");
const { resolve } = require("path");
const getFiles = dir => {
const stack = [resolve(dir)];
const files = [];
while (stack.length) {
dir = stack.pop();
fs.readdirSync(dir).forEach(item => {
const path = resolve(dir, item);
(fs.statSync(path).isDirectory() ? stack : files).push(path);
});
}
return files;
};
console.log(getFiles("."));
asynchronous:
const fs = require("fs");
const { resolve } = require("path");
const pify = require("util").promisify;
const readdir = pify(fs.readdir);
const stat = pify(fs.stat);
const getFiles = async dir => {
const files = await readdir(resolve(dir));
const filesP = files.map(async file => {
const path = resolve(dir, file);
return (await stat(path)).isDirectory() ? getFiles(path) : path;
});
// return (await Promise.all(filesP)).flat(); // flat supported in node ~11
return [].concat(...(await Promise.all(filesP)));
};
getFiles(".").then(console.log);
async demo https://repl.it/#marzelin/getFiles

So, I was looking at the filesystem module and noticed the function readDir
https://nodejs.org/dist/latest-v8.x/docs/api/fs.html#fs_fs_readdir_path_options_callback
which does the trick in part. I guess it wasnt named a method i would have looking for. I was looking for things involving LIST and DIR, but not READ.
Anyways, here is a way to read Dir.
var fs = require('fs');
if (process.argv.length <= 2) {
console.log("Usage: " + __filename + " path/to/directory");
process.exit(-1);
}
var path = process.argv[2];
fs.readdir(path, function(err, items) {
console.log(items);
for (var i=0; i<items.length; i++) {
console.log(items[i]);
}
});
You notice that this one above is Async, but there is a Sync variant, just add "Sync" to the signature. Now you need to determine if something is a directory:
let file = fs.statSync("path/to/directory")
let isDir = file.isDirectory()
So you can couple this all together.
var fs = require('fs')
function recurse_file_system(path, contains) {
let files = fs.readdirSync(path);
let dArr = [];
let fArr = [];
for (let i in files){
let newPath = path + "/" + files[i]
if (fs.statSync(newPath).isDirectory()){
dArr.push(newPath)
}else{
if (filter(files[i], ".txt")){
fArr.push(newPath)
}
}
}
if (arr.length == 0){
return fArr;
}else{
for (let d in dArr){
let rslt = recurse_file_system(dArr[d]);
for (let i in rslt){
fArr.push(rslt[i])
}
}
return fArr;
}
}
console.log("Files:")
console.log(recurse_file_system("/"))
Now if you want to extend this, all you need to do is add a filter to say, limit the size of returns based on particular criterion, such as file name limitation.
function filter(filename, contains){
let reg = new RegEx(contains)
return reg.test(filename)
}
and you can add it to the base case, where you see filter... OR you can just return the WHOLE set and filter it there with the List method, filter.

Related

Get desktop file icons using NodeJs

I am trying to create a desktop launcher application in Electron that reads the number of files in the user's desktop and gathers the information from each file found. I am gathering the files and constructing the path based on the directory, but all I can get is the filenames. I am not sure how to store the file itself and extract the desktop icon from it. I haven't seen many solutions to it, other than using the AxtiveXobject however supposedly certain implementations do not work in the latest nodejs. Here is my code so far.
//requiring path and fs modules
const path = require('path');
const fs = require('fs');
//gets home directory
const homedir = require('os').homedir();
//specifies to desktop
const dir = `${homedir}/Desktop`;
var walk = require('walk');
var filepaths = [];
//storing desktop path
var desktopDir = dir;
console.log(desktopDir);
//gets the desktop files and paths
function getDesktopFiles(_dir){
//read directory
fs.readdir(_dir, (err, files) => {
if (err)
console.log(err);
else {
files.forEach(_file => {
//console.log(_file);
let _p = _dir + '/'+_file;
//changes slashing for file paths
let _path = _p.replace(/\\/g, "/");
filepaths.push(_path);
})
}
})
for(let p of filepaths){
console.log(p);
}
}
getDesktopFiles(desktopDir);

Here is a quick snippet of code which works for me in an Electron renderer process; it has been successfully tested on both macOS and Linux, and should be platform-independent.
It lists all the files located on the user's desktop and displays each file's icon and name at the end of the HTML page; it makes use of the following Electron API functions:
app.getPath
app.getFileIcon
image.toDataURL
image.getSize
const { app, nativeImage } = require ('electron').remote;
const path = require ('path');
const fs = require ('fs');
//
const desktopPath = app.getPath ('desktop');
let filePaths = fs.readdirSync (desktopPath);
for (let filePath of filePaths)
{
app.getFileIcon (filePath)
.then
(
(fileIcon) =>
{
let div = document.createElement ('div');
let img = document.createElement ('img');
img.setAttribute ('src', fileIcon.toDataURL ());
let size = fileIcon.getSize ();
img.setAttribute ('width', size.width);
img.setAttribute ('height', size.height);
div.appendChild (img);
div.appendChild (document.createTextNode (" " + path.basename (filePath)));
// For test purposes, add each file icon and name to the end of <body>
document.body.appendChild (div);
}
);
}
You may find some interesting hints about app.getFileIcon in the post: Is there a standard way for an Electron or Node.js app to access system-level icons?

Define a function for the same as:
function load(icon) {
if (cache[icon]) return cache[icon];
return cache[icon] = fs.readFileSync(__dirname + '/public/icons/' + icon, 'base64');
}
Here you can get the inspiration for the same.

Async and Sync Functions Node JS

I have a couple of functions I am running to download a zip file from Nexus, then unzip/extract the contents of the zip file, finally a search for a specific file type. All function work, however the synchronous search for some reason is not producing any results. If I simply run the download and extract functions in 1 script, then execute the search in another script I get my expected results. I am almost positive it is due to the search being synchronous whereas the download and extract are both async. Is there a quick way to add the find function at the end after the download & extract functions have run? Below is the code:
//npm modules
const fs = require('fs-extra');
const download = require('download');
const unzipper = require('unzipper');
const path = require('path');
//Custom Variables
const artifact = 'SOME_FILE.zip';
const repo = "SOME NEXUS REPOSITORY";
const url = "http://SOME URL/repository/";
const directory = 'SOME DIRECTORY';
//Get Artifact and Extract it to local directory function
const getArtifact = async () => {
const getArtifact = await download(url+repo, "./unzip")
const file = await fs.writeFileSync(directory+artifact, await download(url+repo))
const readStream = await fs.createReadStream(directory + artifact).pipe(unzipper.Extract({path:
directory}))
}
//Find function which should run after download and extract have been fulfilled
const findFile = function (dir, pattern) {
var results = [];
fs.readdirSync(dir).forEach(function (dirInner) {
dirInner = path.resolve(dir, dirInner);
var stat = fs.statSync(dirInner);
console.log(stat)
if (stat.isDirectory()) {
results = results.concat(findFile(dirInner, pattern));
}
if (stat.isFile() && dirInner.endsWith(pattern)) {
results.push(dirInner);
}
});
console.log(results)
return results;
};
//clear contents of directory before new download and extract
fs.emptyDirSync(directory)
//call download and extract function
getArtifact()
When I run "findFile" after the download & extract by itself in a separate script I get expected array output. However, when I try to incorporate (see below) this into the same script I get the an empty array:
getArtifact().then(function findFile (dir, pattern) {
var results = [];
fs.readdirSync(directory).forEach(function (dirInner) {
dirInner = path.resolve(directory, dirInner);
var stat = fs.statSync(dirInner);
console.log(stat)
if (stat.isDirectory()) {
results = results.concat(findFile(dirInner, pattern))
if (stat.isFile() && dirInner.endsWith(pattern)) {
results.push(dirInner);
}
}
console.log(results)
return results;
})
})
//Output
[]
//If I try the following:
getArtifact().then(findFile(directory, file))
// I get same empty array
[]
//If I run "findFile" in its own script after the download extract I get the following:
[
'SOME_FILE_PATH\\document1',
'SOME_FILE_PATH\\document2',
'SOME_FILE_PATH\\document3',
'SOME_FILE_PATH\\document4',
'SOME_FILE_PATH\\document5',
'SOME_FILE_PATH\\document6',
'SOME_FILE_PATH\\document7
]
Any help with how I can incorporate my findFile function into my existing download&extract function is appreciated...

fs.createWriteStream doesn't use back-pressure when writing data to a file, causing high memory usage

Problem
I'm trying to scan a drive directory (recursively walk all the paths) and write all the paths to a file (as it's finding them) using fs.createWriteStream in order to keep the memory usage low, but it doesn't work, the memory usage reaches 2GB during the scan.
Expected
I was expecting fs.createWriteStream to automatically handle memory/disk usage at all times, keeping memory usage at a minimum with back-pressure.
Code
const fs = require('fs')
const walkdir = require('walkdir')
let dir = 'C:/'
let options = {
"max_depth": 0,
"track_inodes": true,
"return_object": false,
"no_return": true,
}
const wstream = fs.createWriteStream("C:/Users/USERNAME/Desktop/paths.txt")
let walker = walkdir(dir, options)
walker.on('path', (path) => {
wstream.write(path + '\n')
})
walker.on('end', (path) => {
wstream.end()
})
Is it because I'm not using .pipe()? I tried creating a new Stream.Readable({read{}}) and then inside the .on('path' emitter pushing paths into it with readable.push(path) but that didn't really work.
UPDATE:
Method 2:
I tried the proposed in the answers drain method but it doesn't help much, it does reduce memory usage to 500mb (which is still too much for a stream) but it slows down the code significantly (from seconds to minutes)
Method 3:
I also tried using readdirp, it uses even less memory (~400mb) and is faster but I don't know how to pause it and use the drain method there to reduce the memory usage further:
const readdirp = require('readdirp')
let dir = 'C:/'
const wstream = fs.createWriteStream("C:/Users/USERNAME/Desktop/paths.txt")
readdirp(dir, {alwaysStat: false, type: 'files_directories'})
.on('data', (entry) => {
wstream.write(`${entry.fullPath}\n`)
})
Method 4:
I also tried doing this operation with a custom recursive walker, and even though it uses only 30mb of memory, which is what I wanted, but it is like 10 times slower than the readdirp method and it is synchronous which is undesirable:
const fs = require('fs')
const path = require('path')
let dir = 'C:/'
function customRecursiveWalker(dir) {
fs.readdirSync(dir).forEach(file => {
let fullPath = path.join(dir, file)
// Folders
if (fs.lstatSync(fullPath).isDirectory()) {
fs.appendFileSync("C:/Users/USERNAME/Desktop/paths.txt", `${fullPath}\n`)
customRecursiveWalker(fullPath)
}
// Files
else {
fs.appendFileSync("C:/Users/USERNAME/Desktop/paths.txt", `${fullPath}\n`)
}
})
}
customRecursiveWalker(dir)

Preliminary observation: you've attempted to get the results you want using multiple approaches. One complication when comparing the approaches you used is that they do not all do the same work. If you run tests on file tree that contains only regular files, that tree does not contain mount points, you can probably compare the approaches fairly, but when you start adding mount points, symbolic links, etc, you may get different memory and time statistics merely due to the fact that one approach excludes files that another approach includes.
I've initially attempted a solution using readdirp, but unfortunately, but that library appears buggy to me. Running it on my system here, I got inconsistent results. One run would output 10Mb of data, another run with the same input parameters would output 22Mb, then I'd get another number, etc. I looked at the code and found that it does not respect the return value of push:
_push(entry) {
if (this.readable) {
this.push(entry);
}
}
As per the documentation the push method may return a false value, in which case the Readable stream should stop producing data and wait until _read is called again. readdirp entirely ignores that part of the specification. It is crucial to pay attention to the return value of push to get proper handling of back-pressure. There are also other things that seemed questionable in that code.
So I abandoned that and worked on a proof of concept showing how it could be done. The crucial parts are:
When the push method returns false it is imperative to stop adding data to the stream. Instead, we record where we were, and stop.
We start again only when _read is called.
If you uncomment the console.log statements that print START and STOP. You'll see them printed out in succession on the console. We start, produce data until Node tells us to stop, and then we stop, until Node tells us to start again, and so on.
const stream = require("stream");
const fs = require("fs");
const { readdir, lstat } = fs.promises;
const path = require("path");
class Walk extends stream.Readable {
constructor(root, maxDepth = Infinity) {
super();
this._maxDepth = maxDepth;
// These fields allow us to remember where we were when we have to pause our
// work.
// The path of the directory to process when we resume processing, and the
// depth of this directory.
this._curdir = [root, 1];
// The directories still to process.
this._dirs = [this._curdir];
// The list of files to process when we resume processing.
this._files = [];
// The location in `this._files` were to continue processing when we resume.
this._ix = 0;
// A flag recording whether or not the fetching of files is currently going
// on.
this._started = false;
}
async _fetch() {
// Recall where we were by loading the state in local variables.
let files = this._files;
let dirs = this._dirs;
let [dir, depth] = this._curdir;
let ix = this._ix;
while (true) {
// If we've gone past the end of the files we were processing, then
// just forget about them. This simplifies the code that follows a bit.
if (ix >= files.length) {
ix = 0;
files = [];
}
// Read directories until we have files to process.
while (!files.length) {
// We've read everything, end the stream.
if (dirs.length === 0) {
// This is how the stream API requires us to indicate the stream has
// ended.
this.push(null);
// We're no longer running.
this._started = false;
return;
}
// Here, we get the next directory to process and get the list of
// files in it.
[dir, depth] = dirs.pop();
try {
files = await readdir(dir, { withFileTypes: true });
}
catch (ex) {
// This is a proof-of-concept. In a real application, you should
// determine what exceptions you want to ignore (e.g. EPERM).
}
}
// Process each file.
for (; ix < files.length; ++ix) {
const dirent = files[ix];
// Don't include in the results those files that are not directories,
// files or symbolic links.
if (!(dirent.isFile() || dirent.isDirectory() || dirent.isSymbolicLink())) {
continue;
}
const fullPath = path.join(dir, dirent.name);
if (dirent.isDirectory() & depth < this._maxDepth) {
// Keep track that we need to walk this directory.
dirs.push([fullPath, depth + 1]);
}
// Finally, we can put the data into the stream!
if (!this.push(`${fullPath}\n`)) {
// If the push returned false, we have to stop pushing results to the
// stream until _read is called again, so we have to stop.
// Uncomment this if you want to see when the stream stops.
// console.log("STOP");
// Record where we were in our processing.
this._files = files;
// The element at ix *has* been processed, so ix + 1.
this._ix = ix + 1;
this._curdir = [dir, depth];
// We're stopping, so indicate that!
this._started = false;
return;
}
}
}
}
async _read() {
// Do not start the process that puts data on the stream over and over
// again.
if (this._started) {
return;
}
this._started = true; // Yep, we've started.
// Uncomment this if you want to see when the stream starts.
// console.log("START");
await this._fetch();
}
}
// Change the paths to something that makes sense for you.
stream.pipeline(new Walk("/home/", 5),
fs.createWriteStream("/tmp/paths3.txt"),
(err) => console.log("ended with", err));
When I run the first attempt you made with walkdir here, I get the following statistics:
Elapsed time (wall clock): 59 sec
Maximum resident set size: 2.90 GB
When I use the code I've shown above:
Elapsed time (wall clock): 35 sec
Maximum resident set size: 0.1 GB
The file tree I use for the tests produces a file listing of 792 MB

You could exploit the returned value from WritableStream.write(): it essentially states if you should continue to read or not. a WritableStream has an internal property that stores the threshold after which the buffer should be processed by the OS. The drain event will be emitted when the buffer has been flushed, i.e. you can call safely call WritableStream.write() without risking to excessively fill the buffer (which means the RAM). Luckily for you, walkdir let you control the process: you can emit pause(pause the walk. no more events will be emitted until resume) and resume(resume the walk) event from the walkdir object, pausing and resuming the writing process on you stream accordingly. Try with this:
let is_emitter_paused = false;
wstream.on('drain', (evt) => {
if (is_emitter_paused) {
walkdir.resume();
}
});
walkdir.on('path', function(path, stat) {
is_emitter_paused = !wstream.write(path + '\n');
if (is_emitter_paused) {
walkdir.pause();
}
});

Here's an implementation inspired by #Louis's answer. I think it's a bit easier to follow and in my minimal testing it performs about the same.
const fs = require('fs');
const path = require('path');
const stream = require('stream');
class Walker extends stream.Readable {
constructor(root = process.cwd(), maxDepth = Infinity) {
super();
// Dirs to process
this._dirs = [{ path: root, depth: 0 }];
// Max traversal depth
this._maxDepth = maxDepth;
// Files to flush
this._files = [];
}
_drain() {
while (this._files.length > 0) {
const file = this._files.pop();
if (file.isFile() || file.isDirectory() || file.isSymbolicLink()) {
const filePath = path.join(this._dir.path, file.name);
if (file.isDirectory() && this._maxDepth > this._dir.depth) {
// Add directory to be walked at a later time
this._dirs.push({ path: filePath, depth: this._dir.depth + 1 });
}
if (!this.push(`${filePath}\n`)) {
// Hault walking
return false;
}
}
}
if (this._dirs.length === 0) {
// Walking complete
this.push(null);
return false;
}
// Continue walking
return true;
}
async _step() {
try {
this._dir = this._dirs.pop();
this._files = await fs.promises.readdir(this._dir.path, { withFileTypes: true });
} catch (e) {
this.emit('error', e); // Uh oh...
}
}
async _walk() {
this.walking = true;
while (this._drain()) {
await this._step();
}
this.walking = false;
}
_read() {
if (!this.walking) {
this._walk();
}
}
}
stream.pipeline(new Walker('some/dir/path', 5),
fs.createWriteStream('output.txt'),
(err) => console.log('ended with', err));

javascript fs seach files for strings in array. Avoid bad performance

I'm building a tool that will clean up a JSON file containing localization strings if they are no longer in use in the source code.
First, I parse the localization file into an array with all the id's that are (or no longer are) used in the source code to get the string value in the right language.
so I have an array looking something like this:
const ids = ['home.title', 'home.description', 'menu.contact', 'menu.social'];
etc. you get the point.
I'm using node.js fs promisified readFile and glob to search .js source code files like this:
const jsFiles = await globbing('./**/*.js', {cwd: directory, ignore: './**/*test.js'});
const results = jsFiles.map(async file => {
const filePath = path.join(directory, file);
return readFile(filePath, 'utf8').then((data) => {
// handle match here
}).catch(console.log);
});
I also have Ramda available for fancy list/collection functions, but no other libraries.
So, I will be able to loop through the ids array and for each item scan the entire source code for a match with the function from above. But that seems a bit overkill to scan the entire source code times ids.length. The ids array is on around 400 ids' and the source code is hundreds of large files.
To avoid O(M*N), is there a way to match the entire array with the entire source code, and discard the not matched array items? Or what would be the best practice here?
current solution:
const cleanLocal = async () => {
const localIdList = Object.keys(await getLocalMap());
const matches = [];
localIdList.map(async id => {
const directory = path.join(__dirname, '..');
const jsFiles = await globbing('./**/*.js', {cwd: directory, ignore: './**/*test.js'});
jsFiles.map(async file => {
const filePath = path.join(directory, file);
return readFile(filePath, 'utf8').then((data) => {
if (data.indexOf(id) >= 0) {
console.log(id);
matches.push(id);
}
}).catch(console.log);
});
});
};

You can't avoid the O(M*N) complexity in this case.
However, to improve performance you can switch the order of your operations: first loop over the files and then loop over the array. This is because looping over the files is a costly IO operation, while looping over the array is a fast memory operation.
In your code, you have M memory operations and M*N IO (filesystem) operations.
If you first loop over the files, you would have N IO operations and M*N memory operations.

As it is not possible to avoid O(M*N) in this case I have only been able to optimize this search function by looping through the source files once and then over the ids' for each file as proposed by #mihai as an optimization opportunity.
The end result looks like this:
const cleanLocal = async () => {
const localIdList = Object.keys(await getLocalMap()); // ids' array
const matches = [];
const directory = path.join(__dirname, '..');
const jsFiles = await globbing('./**/*.js', {cwd: directory, ignore: './**/*test.js'}); // list of files to scan
const results = jsFiles.map(async file => {
const filePath = path.join(directory, file);
return readFile(filePath, 'utf8').then((data) => {
localIdList.map(id => {
if (R.contains(id, data)) { // R = ramda.js
matches.push(id);
}
});
}).catch(console.log);
});
await Promise.all(results);
console.log('matches: ' + R.uniq(matches).length);
console.log('in local.json: ' + localIdList.length);
};
Please let me know if there are any other way to optimize this.

recursively find the closest package.json node.js

I am newly with node js, and I would like to find recursively the closest package.json. Actually, continue finding package.json until will not hit it.
My folder tree
root/
-contarats/
-proto/
some.proto
-package.json
"script": {
"contracts": "generate-some-contracts contracts/proto contracts",
}
const input = process.argv[2]
const settings = require(path.resolve(input, 'package.json'))

Are you looking for a way to iterate through directories? If so heres a synchronous function that would do that
function search_sync(dir) {
var results = []
var list = fs.readdirSync(dir)
list.forEach(function(file) {
file = path.resolve(dir, file)
filename = file.split('\\');
filename = filename[filename.length-1]
var stat = fs.statSync(file)
if (stat && stat.isDirectory()) results = results.concat(search_sync(file))
else if(filename == 'package.json')results.push(file)
})
return results
}
That will return an array of any files that are named package.json with their full file path. EG:
search_sync('./')
[C:\Users\User\MyNodeProject\package.json,
C:\Users\User\MyNodeProject\npm\someDependency\package.json,
C:\Users\User\MyNodeProject\npm\someOtherDependency\package.json]
Personally, I'd then break each line by their '\' character and see which one is closer to my root folder

Looking at your tree of directories, the package.json file is not in contracts/proto, but in contracts. (I assume that contaracts is a typo.) Changing the first argument on the command line should help:
generate-some-contracts contracts contracts
Nevertheless, you ask about the recursive search for the nearest package.json. NPM does it, when looking for the package root. It starts in the current directory and then follows the ancestors, until it finds a package.json. A function reading and parsing that package.json, similarly to require, could look like this:
const { readFile } = require('fs/promises')
const { join, resolve } = require('path')
async function loadPackageJson(cwd) {
const startDir = cwd || process.env.INIT_CWD || process.cwd()
let dir = startDir, prevDir
do {
try {
const path = join(dir, 'package.json')
const content = await readFile(path, 'utf8')
return JSON.parse(content)
} catch (err) {
if (err.code !== 'ENOENT') throw err
}
prevDir = dir
dir = resolve(dir, '..')
} while (prevDir !== dir)
throw new Error(`package.json not found in ${startDir} and its ancestors`)
}

Develop Reference

JavaScript is the programming language of the Web.