Async and Sync Functions in Node.js - javascript

I have a couple of functions I am running to download a zip file from Nexus, then unzip/extract the contents of the zip file, and finally search for a specific file type. All the functions work; however, the synchronous search for some reason is not producing any results. If I simply run the download and extract functions in one script, then execute the search in another script, I get my expected results. I am almost positive it is because the search is synchronous whereas the download and extract are both async. Is there a quick way to add the find function at the end, after the download & extract functions have run? Below is the code:
//npm modules
const fs = require('fs-extra');
const download = require('download');
const unzipper = require('unzipper');
const path = require('path');
//Custom Variables
const artifact = 'SOME_FILE.zip';
const repo = "SOME NEXUS REPOSITORY";
const url = "http://SOME URL/repository/";
const directory = 'SOME DIRECTORY';
//Get Artifact and Extract it to local directory function
const getArtifact = async () => {
    const getArtifact = await download(url + repo, "./unzip")
    const file = await fs.writeFileSync(directory + artifact, await download(url + repo))
    const readStream = await fs.createReadStream(directory + artifact)
        .pipe(unzipper.Extract({path: directory}))
}
//Find function which should run after download and extract have been fulfilled
const findFile = function (dir, pattern) {
    var results = [];
    fs.readdirSync(dir).forEach(function (dirInner) {
        dirInner = path.resolve(dir, dirInner);
        var stat = fs.statSync(dirInner);
        console.log(stat)
        if (stat.isDirectory()) {
            results = results.concat(findFile(dirInner, pattern));
        }
        if (stat.isFile() && dirInner.endsWith(pattern)) {
            results.push(dirInner);
        }
    });
    console.log(results)
    return results;
};
//clear contents of directory before new download and extract
fs.emptyDirSync(directory)
//call download and extract function
getArtifact()
When I run "findFile" after the download & extract by itself in a separate script I get expected array output. However, when I try to incorporate (see below) this into the same script I get the an empty array:
getArtifact().then(function findFile (dir, pattern) {
    var results = [];
    fs.readdirSync(directory).forEach(function (dirInner) {
        dirInner = path.resolve(directory, dirInner);
        var stat = fs.statSync(dirInner);
        console.log(stat)
        if (stat.isDirectory()) {
            results = results.concat(findFile(dirInner, pattern))
            if (stat.isFile() && dirInner.endsWith(pattern)) {
                results.push(dirInner);
            }
        }
        console.log(results)
        return results;
    })
})
//Output
[]
//If I try the following:
getArtifact().then(findFile(directory, file))
// I get same empty array
[]
//If I run "findFile" in its own script after the download extract I get the following:
[
'SOME_FILE_PATH\\document1',
'SOME_FILE_PATH\\document2',
'SOME_FILE_PATH\\document3',
'SOME_FILE_PATH\\document4',
'SOME_FILE_PATH\\document5',
'SOME_FILE_PATH\\document6',
'SOME_FILE_PATH\\document7'
]
Any help with how I can incorporate my findFile function into my existing download & extract function is appreciated...
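One way to make the search wait is to have getArtifact() resolve only after the unzip stream has closed, and then call findFile from .then(). This is a minimal, untested sketch based on the code above, assuming unzipper's Extract stream emits 'close' when extraction completes and that fs-extra's promise-based writeFile is used; the '.txt' pattern is only a placeholder.
const getArtifact = async () => {
    // write the downloaded artifact to disk (fs-extra's writeFile returns a promise)
    await fs.writeFile(directory + artifact, await download(url + repo));
    // wait for the extract stream to finish before resolving
    await new Promise((resolve, reject) => {
        fs.createReadStream(directory + artifact)
            .pipe(unzipper.Extract({path: directory}))
            .on('close', resolve)
            .on('error', reject);
    });
};

getArtifact()
    .then(() => {
        // runs only after extraction is done; '.txt' is a placeholder pattern
        const results = findFile(directory, '.txt');
        console.log(results);
    })
    .catch(console.error);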

Related

How can I optimize my JavaScript code to handle large log files (over 1 GB)? [duplicate]

I need to do some parsing of large (5-10 GB) log files in JavaScript/Node.js (I'm using Cube).
The logline looks something like:
10:00:43.343423 I'm a friendly log message. There are 5 cats, and 7 dogs. We are in state "SUCCESS".
We need to read each line, do some parsing (e.g. strip out 5, 7 and SUCCESS), then pump this data into Cube (https://github.com/square/cube) using their JS client.
Firstly, what is the canonical way in Node to read in a file, line by line?
It seems to be fairly common question online:
http://www.quora.com/What-is-the-best-way-to-read-a-file-line-by-line-in-node-js
Read a file one line at a time in node.js?
A lot of the answers seem to point to a bunch of third-party modules:
https://github.com/nickewing/line-reader
https://github.com/jahewson/node-byline
https://github.com/pkrumins/node-lazy
https://github.com/Gagle/Node-BufferedReader
However, this seems like a fairly basic task - surely, there's a simple way within the stdlib to read in a textfile, line-by-line?
Secondly, I then need to process each line (e.g. convert the timestamp into a Date object, and extract useful fields).
What's the best way to do this, maximising throughput? Is there some way that won't block on either reading in each line, or on sending it to Cube?
Thirdly - I'm guessing that using string splits and the JS equivalent of contains (indexOf !== -1?) will be a lot faster than regexes? Has anybody had much experience in parsing massive amounts of text data in Node.js?
I searched for a solution to parse very large files (GBs) line by line using a stream. All the third-party libraries and examples did not suit my needs, since they either did not process the file line by line (like 1, 2, 3, 4...) or read the entire file into memory.
The following solution can parse very large files, line by line, using stream & pipe. For testing I used a 2.1 GB file with 17,000,000 records. RAM usage did not exceed 60 MB.
First, install the event-stream package:
npm install event-stream
Then:
var fs = require('fs')
    , es = require('event-stream');

var lineNr = 0;

var s = fs.createReadStream('very-large-file.csv')
    .pipe(es.split())
    .pipe(es.mapSync(function(line){
        // pause the readstream
        s.pause();
        lineNr += 1;
        // process line here and call s.resume() when rdy
        // function below was for logging memory usage
        logMemoryUsage(lineNr);
        // resume the readstream, possibly from a callback
        s.resume();
    })
    .on('error', function(err){
        console.log('Error while reading file.', err);
    })
    .on('end', function(){
        console.log('Read entire file.')
    })
);
Please let me know how it goes!
You can use the built-in readline module (see the docs here). I also use the stream module to create a new output stream.
var fs = require('fs'),
    readline = require('readline'),
    stream = require('stream');

var instream = fs.createReadStream('/path/to/file');
var outstream = new stream;
outstream.readable = true;
outstream.writable = true;

var rl = readline.createInterface({
    input: instream,
    output: outstream,
    terminal: false
});

rl.on('line', function(line) {
    console.log(line);
    //Do your stuff ...
    //Then write to output stream
    rl.write(line);
});
Large files will take some time to process. Do tell if it works.
I really liked @gerard's answer, which actually deserves to be the accepted answer here. I made some improvements:
Code is in a class (modular)
Parsing is included
The ability to resume is exposed to the outside, in case an asynchronous job is chained to reading the CSV, like inserting into a DB or making an HTTP request
Reading happens in chunk/batch sizes that the user can declare. I took care of encoding in the stream too, in case you have files in a different encoding.
Here's the code:
'use strict'

const fs = require('fs'),
    util = require('util'),
    stream = require('stream'),
    es = require('event-stream'),
    parse = require("csv-parse"),
    iconv = require('iconv-lite');

class CSVReader {
    constructor(filename, batchSize, columns) {
        this.reader = fs.createReadStream(filename).pipe(iconv.decodeStream('utf8'))
        this.batchSize = batchSize || 1000
        this.lineNumber = 0
        this.data = []
        this.parseOptions = {delimiter: '\t', columns: true, escape: '/', relax: true}
    }

    read(callback) {
        this.reader
            .pipe(es.split())
            .pipe(es.mapSync(line => {
                ++this.lineNumber
                parse(line, this.parseOptions, (err, d) => {
                    this.data.push(d[0])
                })
                if (this.lineNumber % this.batchSize === 0) {
                    callback(this.data)
                }
            })
            .on('error', function(){
                console.log('Error while reading file.')
            })
            .on('end', function(){
                console.log('Read entire file.')
            }))
    }

    continue () {
        this.data = []
        this.reader.resume()
    }
}

module.exports = CSVReader
So basically, here is how you will use it:
let reader = new CSVReader('path_to_file.csv')
reader.read(() => reader.continue())
I tested this with a 35 GB CSV file and it worked for me, which is why I chose to build on @gerard's answer. Feedback is welcome.
I used https://www.npmjs.com/package/line-by-line for reading more than 1,000,000 lines from a text file. In this case, RAM usage was about 50-60 MB.
const LineByLineReader = require('line-by-line'),
    lr = new LineByLineReader('big_file.txt');

lr.on('error', function (err) {
    // 'err' contains error object
});

lr.on('line', function (line) {
    // pause emitting of lines...
    lr.pause();
    // ...do your asynchronous line processing..
    setTimeout(function () {
        // ...and continue emitting lines.
        lr.resume();
    }, 100);
});

lr.on('end', function () {
    // All lines are read, file is closed now.
});
The Node.js Documentation offers a very elegant example using the Readline module.
Example: Read File Stream Line-by-Line
const { once } = require('node:events');
const fs = require('fs');
const readline = require('readline');

(async function processLineByLine() {
    const rl = readline.createInterface({
        input: fs.createReadStream('sample.txt'),
        crlfDelay: Infinity
    });

    rl.on('line', (line) => {
        console.log(`Line from file: ${line}`);
    });

    await once(rl, 'close');
})();
Note: we use the crlfDelay option to recognize all instances of CR LF ('\r\n') as a single line break.
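As a side note, in more recent Node versions (roughly 11.4 and later) the readline interface is also async-iterable, so the same docs example can be written with for await...of instead of a 'line' listener; a minimal sketch, again assuming a local sample.txt:
const fs = require('fs');
const readline = require('readline');

(async function processLineByLine() {
    const rl = readline.createInterface({
        input: fs.createReadStream('sample.txt'),
        crlfDelay: Infinity
    });

    // each iteration of the loop yields one line of the file
    for await (const line of rl) {
        console.log(`Line from file: ${line}`);
    }
})();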
Apart from reading the big file line by line, you can also read it chunk by chunk. For more, refer to this article.
var fs = require('fs');

var lines = [];
var offset = 0;
var chunkSize = 2048;
var chunkBuffer = Buffer.alloc(chunkSize);
var fp = fs.openSync('filepath', 'r');
var bytesRead = 0;

while ((bytesRead = fs.readSync(fp, chunkBuffer, 0, chunkSize, offset)) !== 0) {
    offset += bytesRead;
    var str = chunkBuffer.slice(0, bytesRead).toString();
    var arr = str.split('\n');

    if (bytesRead === chunkSize) {
        // the last item of the arr may not be a full line, leave it to the next chunk
        offset -= arr.pop().length;
    }
    lines.push(arr);
}
console.log(lines);
I had the same problem. After comparing several modules that seemed to have this feature, I decided to do it myself; it's simpler than I thought.
gist: https://gist.github.com/deemstone/8279565
var fetchBlock = lineByline(filepath, onEnd);
fetchBlock(function(lines, start){ ... }); //lines{array} start{int} lines[0] No.
It keeps the file open in a closure; the fetchBlock() function that is returned will fetch a block from the file and split it into an array of lines (it deals with the leftover segment from the last fetch).
I've set the block size to 1024 for each read operation. This may have bugs, but the code logic is obvious; try it yourself.
Reading/writing files using streams with the native Node.js modules (fs, readline):
const fs = require('fs');
const readline = require('readline');

const rl = readline.createInterface({
    input: fs.createReadStream('input.json'),
    output: fs.createWriteStream('output.json')
});

rl.on('line', function(line) {
    console.log(line);
    // Do any 'line' processing if you want and then write to the output file
    this.output.write(`${line}\n`);
});

rl.on('close', function() {
    console.log(`Created "${this.output.path}"`);
});
Based on this question's answers I implemented a class you can use to read a file synchronously line by line with fs.readSync(). You can make it "pause" and "resume" by using a Q promise (jQuery's promises seem to require a DOM, so they can't be used with Node.js):
var fs = require('fs');
var Q = require('q');

var lr = new LineReader(filenameToLoad);
lr.open();

var promise;
workOnLine = function () {
    var line = lr.readNextLine();
    promise = complexLineTransformation(line).then(
        function() {console.log('ok');workOnLine();},
        function() {console.log('error');}
    );
}
workOnLine();

complexLineTransformation = function (line) {
    var deferred = Q.defer();
    // ... async call goes here, in callback: deferred.resolve('done ok'); or deferred.reject(new Error(error));
    return deferred.promise;
}

function LineReader (filename) {
    this.moreLinesAvailable = true;
    this.fd = undefined;
    this.bufferSize = 1024*1024;
    this.buffer = new Buffer(this.bufferSize);
    this.leftOver = '';
    this.read = undefined;
    this.idxStart = undefined;
    this.idx = undefined;
    this.lineNumber = 0;
    this._bundleOfLines = [];

    this.open = function() {
        this.fd = fs.openSync(filename, 'r');
    };

    this.readNextLine = function () {
        if (this._bundleOfLines.length === 0) {
            this._readNextBundleOfLines();
        }
        this.lineNumber++;
        var lineToReturn = this._bundleOfLines[0];
        this._bundleOfLines.splice(0, 1); // remove first element (pos, howmany)
        return lineToReturn;
    };

    this.getLineNumber = function() {
        return this.lineNumber;
    };

    this._readNextBundleOfLines = function() {
        var line = "";
        while ((this.read = fs.readSync(this.fd, this.buffer, 0, this.bufferSize, null)) !== 0) { // read next bytes until end of file
            this.leftOver += this.buffer.toString('utf8', 0, this.read); // append to leftOver
            this.idxStart = 0;
            while ((this.idx = this.leftOver.indexOf("\n", this.idxStart)) !== -1) { // as long as there is a newline-char in leftOver
                line = this.leftOver.substring(this.idxStart, this.idx);
                this._bundleOfLines.push(line);
                this.idxStart = this.idx + 1;
            }
            this.leftOver = this.leftOver.substring(this.idxStart);
            if (line !== "") {
                break;
            }
        }
    };
}
node-byline uses streams, so I would prefer that one for your huge files.
For your date conversions I would use moment.js.
For maximising your throughput you could think about using a software cluster. There are some nice modules which wrap the node-native cluster module quite well. I like cluster-master from isaacs. E.g. you could create a cluster of x workers which all compute a file.
For benchmarking splits vs regexes, use benchmark.js. I haven't tested it until now. benchmark.js is available as a node module.
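To make the last point concrete, here is a minimal benchmark.js sketch (untested, as noted above) comparing indexOf against a regex on a made-up log line:
const Benchmark = require('benchmark');

const line = '10:00:43.343423 I\'m a friendly log message. We are in state "SUCCESS".';
const suite = new Benchmark.Suite();

suite
    .add('indexOf', function () {
        return line.indexOf('SUCCESS') !== -1;
    })
    .add('regex', function () {
        return /state "(\w+)"/.test(line);
    })
    .on('cycle', function (event) {
        // prints ops/sec for each case
        console.log(String(event.target));
    })
    .on('complete', function () {
        console.log('Fastest is ' + this.filter('fastest').map('name'));
    })
    .run();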
import * as csv from 'fast-csv';
import * as fs from 'fs';

interface Row {
    [s: string]: string;
}

type RowCallBack = (data: Row, index: number) => object;

export class CSVReader {
    protected file: string;
    protected csvOptions = {
        delimiter: ',',
        headers: true,
        ignoreEmpty: true,
        trim: true
    };

    constructor(file: string, csvOptions = {}) {
        if (!fs.existsSync(file)) {
            throw new Error(`File ${file} not found.`);
        }
        this.file = file;
        this.csvOptions = Object.assign({}, this.csvOptions, csvOptions);
    }

    public read(callback: RowCallBack): Promise<Array<object>> {
        return new Promise<Array<object>>(resolve => {
            const readStream = fs.createReadStream(this.file);
            const results: Array<any> = [];
            let index = 0;
            const csvStream = csv.parse(this.csvOptions).on('data', async (data: Row) => {
                index++;
                results.push(await callback(data, index));
            }).on('error', (err: Error) => {
                console.error(err.message);
                throw err;
            }).on('end', () => {
                resolve(results);
            });
            readStream.pipe(csvStream);
        });
    }
}
import { CSVReader } from '../src/helpers/CSVReader';

(async () => {
    const reader = new CSVReader('./database/migrations/csv/users.csv');
    const users = await reader.read(async data => {
        return {
            username: data.username,
            name: data.name,
            email: data.email,
            cellPhone: data.cell_phone,
            homePhone: data.home_phone,
            roleId: data.role_id,
            description: data.description,
            state: data.state,
        };
    });
    console.log(users);
})();
I have made a node module to read large files asynchronously, as text or JSON.
Tested on large files.
var fs = require('fs')
    , util = require('util')
    , stream = require('stream')
    , es = require('event-stream');

module.exports = FileReader;

function FileReader(){
}

FileReader.prototype.read = function(pathToFile, callback){
    var returnTxt = '';
    var s = fs.createReadStream(pathToFile)
        .pipe(es.split())
        .pipe(es.mapSync(function(line){
            // pause the readstream
            s.pause();
            //console.log('reading line: '+line);
            returnTxt += line;
            // resume the readstream, possibly from a callback
            s.resume();
        })
        .on('error', function(){
            console.log('Error while reading file.');
        })
        .on('end', function(){
            console.log('Read entire file.');
            callback(returnTxt);
        })
    );
};

FileReader.prototype.readJSON = function(pathToFile, callback){
    try{
        this.read(pathToFile, function(txt){callback(JSON.parse(txt));});
    }
    catch(err){
        throw new Error('json file is not valid! '+err.stack);
    }
};
Just save the file as file-reader.js, and use it like this:
var FileReader = require('./file-reader');
var fileReader = new FileReader();
fileReader.readJSON(__dirname + '/largeFile.json', function(jsonObj){/*callback logic here*/});

Node.JS reading json file isn't displaying new file updates

I have two configuration files (log.txt and config.json) that are updated after each computation, and the next computation depends on the results of the previous computation, which are stored in the two files.
The problem is, when I read the JSON files during computation I only get the results of the initial computation. The new updates are not reflected in the new file reads. But when I check the files, both have been updated.
What could be the problem?
Thanks in advance.
Script for Reading the configuration files (pFunctions.js):
const fs = require('fs');

// Read logs
function readLog() {
    try {
        return fs.readFileSync(__dirname + '/' + 'log.txt', {'encoding':'utf8','flag':'rs+'});
    }
    catch (err) {
        return '';
    }
}

// Read configurations
function readConfig() {
    try {
        return fs.readFileSync(__dirname + '/' + 'config.json', {'encoding':'utf8','flag':'rs+'});
    }
    catch (err) {
        return '';
    }
}

// Function to read dataset
function readDataset() {
    ....
    ....
}

// Read data from configuration files
let config = readConfig(); // config.json
let log = readLog(); // log.txt

// Parse data to JSON
let cfg = JSON.parse(config);
let lcg = JSON.parse(log);
....
....

// Fetch dataset based on the configuration and log file
let dataset = readDataset();

// Create parameter function
const pFunctions = {
    parameter1: dataset // set dataset as value of parameter1
}

// Export function as JSON Object
module.exports = {pFunctions};
I then import the object in the computation.js script like this:
const {pFunctions} = require('./app/config/pFunctions');
// Capture new 'uParams' computing parameters
let newParams = pFunctions;
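A likely cause, offered here as an assumption rather than a confirmed diagnosis: pFunctions.js reads log.txt and config.json once at module load time, and require() caches the module, so the exported object keeps the values from the first computation. One minimal sketch of a fix is to export functions that re-read the files on every call; the getConfig/getLog names below are just for illustration.
const fs = require('fs');
const path = require('path');

// Re-read and re-parse the files each time they are needed
function getConfig() {
    try {
        return JSON.parse(fs.readFileSync(path.join(__dirname, 'config.json'), 'utf8'));
    } catch (err) {
        return {};
    }
}

function getLog() {
    try {
        return JSON.parse(fs.readFileSync(path.join(__dirname, 'log.txt'), 'utf8'));
    } catch (err) {
        return {};
    }
}

// Export functions instead of precomputed values, so each call sees the latest file contents
module.exports = { getConfig, getLog };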

javascript fs search files for strings in array. Avoid bad performance

I'm building a tool that will clean up a JSON file containing localization strings if they are no longer in use in the source code.
First, I parse the localization file into an array with all the ids that are (or are no longer) used in the source code to get the string value in the right language.
so I have an array looking something like this:
const ids = ['home.title', 'home.description', 'menu.contact', 'menu.social'];
etc. you get the point.
I'm using node.js fs promisified readFile and glob to search .js source code files like this:
const jsFiles = await globbing('./**/*.js', {cwd: directory, ignore: './**/*test.js'});
const results = jsFiles.map(async file => {
    const filePath = path.join(directory, file);
    return readFile(filePath, 'utf8').then((data) => {
        // handle match here
    }).catch(console.log);
});
I also have Ramda available for fancy list/collection functions, but no other libraries.
So, I will be able to loop through the ids array and for each item scan the entire source code for a match with the function from above. But that seems a bit overkill, scanning the entire source code ids.length times. The ids array has around 400 ids and the source code is hundreds of large files.
To avoid O(M*N), is there a way to match the entire array with the entire source code, and discard the not matched array items? Or what would be the best practice here?
current solution:
const cleanLocal = async () => {
    const localIdList = Object.keys(await getLocalMap());
    const matches = [];
    localIdList.map(async id => {
        const directory = path.join(__dirname, '..');
        const jsFiles = await globbing('./**/*.js', {cwd: directory, ignore: './**/*test.js'});
        jsFiles.map(async file => {
            const filePath = path.join(directory, file);
            return readFile(filePath, 'utf8').then((data) => {
                if (data.indexOf(id) >= 0) {
                    console.log(id);
                    matches.push(id);
                }
            }).catch(console.log);
        });
    });
};
You can't avoid the O(M*N) complexity in this case.
However, to improve performance you can switch the order of your operations: first loop over the files and then loop over the array. This is because looping over the files is a costly IO operation, while looping over the array is a fast memory operation.
In your code, you have M memory operations and M*N IO (filesystem) operations.
If you first loop over the files, you would have N IO operations and M*N memory operations.
As it is not possible to avoid O(M*N) in this case, I have only been able to optimize the search function by looping through the source files once and then over the ids for each file, as proposed by @mihai.
The end result looks like this:
const cleanLocal = async () => {
    const localIdList = Object.keys(await getLocalMap()); // ids array
    const matches = [];
    const directory = path.join(__dirname, '..');
    const jsFiles = await globbing('./**/*.js', {cwd: directory, ignore: './**/*test.js'}); // list of files to scan
    const results = jsFiles.map(async file => {
        const filePath = path.join(directory, file);
        return readFile(filePath, 'utf8').then((data) => {
            localIdList.map(id => {
                if (R.contains(id, data)) { // R = ramda.js
                    matches.push(id);
                }
            });
        }).catch(console.log);
    });
    await Promise.all(results);
    console.log('matches: ' + R.uniq(matches).length);
    console.log('in local.json: ' + localIdList.length);
};
Please let me know if there are any other ways to optimize this.
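One further tweak, sketched here without testing it against the real project: since each id only needs to be found once, you can drop ids from the list as soon as they match, so later files scan a shrinking set (getLocalMap, globbing and readFile are the helpers from the question):
const cleanLocal = async () => {
    let remaining = Object.keys(await getLocalMap()); // ids still unmatched
    const matches = [];
    const directory = path.join(__dirname, '..');
    const jsFiles = await globbing('./**/*.js', {cwd: directory, ignore: './**/*test.js'});

    for (const file of jsFiles) {
        if (remaining.length === 0) break; // every id already matched
        const data = await readFile(path.join(directory, file), 'utf8');
        remaining = remaining.filter(id => {
            if (data.includes(id)) {
                matches.push(id);
                return false; // stop looking for this id in later files
            }
            return true;
        });
    }

    console.log('matches: ' + matches.length);
    console.log('unused ids: ' + remaining.length);
};
Reading the files sequentially here trades some parallelism for the ability to stop early; whether that is a net win depends on how quickly the id list shrinks.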

Dir walking to get abs path array of various files

Goal: Get a List of Absolute Paths for all files in a directory recursively leveraging NodeJs.
Info: As a Python dev, I normally use the Python packages which handle this in a platform-independent fashion. My boss wanted some JavaScript code which would handle this goal... and as a former JS dev, I was like "oh this is easy. Let's look up how Node does it, as I never got a chance to get my hands dirty with it," but I seem to be mistaken.
I don't see anything in node relating to Dir Walking, or a way I could hack together to create such a thing.
I was looking in "Child Process", "Console", "File System", "OS", "Path", and "Process". I didn't see anything which would do anything akin to:
pushd .
cd $dir
for folder in $(ls);
do
    pushd .
    cd $folder
    //call again
    ls $(pwd)$flag >> $dir/files_matching.txt
    popd
done;
// or any platform independent means of recursively getting
// all files and their abs path which meet flag criterion,
// such as "*.txt" || "_*found*"
I could use child process to carry out Command Line items, but then I need to create a bunch of conditionals based on the OS consuming the app, and figured this would be something which already exists.
I don't want to reinvent the wheel, but figured this has already been done; I just don't see it in the base modules.
Is there a node module I would need which accomplishes this, which is outside of the base modules?
I am trying not to have to hand roll a conditional os based system to get an exhaustive list of abs paths for all files under a directory (or subset due to extensions, etc.)
I'd do it like this:
synchronous:
const fs = require("fs");
const { resolve } = require("path");
const getFiles = dir => {
const stack = [resolve(dir)];
const files = [];
while (stack.length) {
dir = stack.pop();
fs.readdirSync(dir).forEach(item => {
const path = resolve(dir, item);
(fs.statSync(path).isDirectory() ? stack : files).push(path);
});
}
return files;
};
console.log(getFiles("."));
asynchronous:
const fs = require("fs");
const { resolve } = require("path");
const pify = require("util").promisify;
const readdir = pify(fs.readdir);
const stat = pify(fs.stat);
const getFiles = async dir => {
const files = await readdir(resolve(dir));
const filesP = files.map(async file => {
const path = resolve(dir, file);
return (await stat(path)).isDirectory() ? getFiles(path) : path;
});
// return (await Promise.all(filesP)).flat(); // flat supported in node ~11
return [].concat(...(await Promise.all(filesP)));
};
getFiles(".").then(console.log);
async demo https://repl.it/#marzelin/getFiles
So, I was looking at the filesystem module and noticed the function readdir
https://nodejs.org/dist/latest-v8.x/docs/api/fs.html#fs_fs_readdir_path_options_callback
which does the trick in part. I guess it wasn't named the way I would have searched for it; I was looking for things involving LIST and DIR, but not READ.
Anyway, here is a way to read a directory.
var fs = require('fs');

if (process.argv.length <= 2) {
    console.log("Usage: " + __filename + " path/to/directory");
    process.exit(-1);
}

var path = process.argv[2];

fs.readdir(path, function(err, items) {
    console.log(items);
    for (var i = 0; i < items.length; i++) {
        console.log(items[i]);
    }
});
You'll notice that the one above is async, but there is a sync variant; just add "Sync" to the name. Now you need to determine if something is a directory:
let file = fs.statSync("path/to/directory")
let isDir = file.isDirectory()
So you can couple this all together.
var fs = require('fs')

function recurse_file_system(path, contains) {
    let files = fs.readdirSync(path);
    let dArr = [];
    let fArr = [];
    for (let i in files) {
        let newPath = path + "/" + files[i]
        if (fs.statSync(newPath).isDirectory()) {
            dArr.push(newPath)
        } else {
            // keep only files whose names match the criterion
            if (filter(files[i], contains)) {
                fArr.push(newPath)
            }
        }
    }
    if (dArr.length == 0) {
        return fArr;
    } else {
        for (let d in dArr) {
            let rslt = recurse_file_system(dArr[d], contains);
            for (let i in rslt) {
                fArr.push(rslt[i])
            }
        }
        return fArr;
    }
}

console.log("Files:")
console.log(recurse_file_system("/", ".txt"))
Now if you want to extend this, all you need to do is add a filter to say, limit the size of returns based on particular criterion, such as file name limitation.
function filter(filename, contains) {
    let reg = new RegExp(contains)
    return reg.test(filename)
}
and you can add it to the base case, where you see filter... or you can just return the whole set and filter it afterwards with the array method filter.

get one file from each directory

In my app the user can have multiple directories in settings. I want to get one file from each folder. Note that these directories can have their own directories and can be nested arbitrarily deep.
my config.json
{"dirs":["F:/music/Ellie Goulding","F:/music/Eminem - Revival [Album] [iTunes Version] - [7tunes]"]}
var findSongs = async function (baseDir, isDir = false, musics) {
    try {
        musics = musics || []
        var files = await getDirFiles(baseDir)
        for (let i = 0; i < files.length; i++) {
            try {
                let file = files[i]
                var musicPath = `${baseDir}/${file}`
                let stat = fs.lstatSync(musicPath)
                if (stat.isDirectory()) {
                    await findSongs(musicPath, true, musics)
                } else {
                    let meta = await getMusicMeta(musicPath)
                    if (meta) {
                        musics.push(meta)
                    }
                    if (isDir) {
                        break
                    }
                }
            } catch (err) {
                console.error(err) // closing braces restored; the original snippet was cut off here
            }
        }
        return musics
    } catch (err) {
        console.error(err)
    }
}
The issue with this code is that it gets all the songs of the second folder and doesn't return anything from folder one, but if I only specify one folder, like F:/music, it works as expected.
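For the stated goal of one file per folder, here is a simpler, untested sketch (assuming getMusicMeta is the question's own helper) that takes the first file it finds in each directory and then recurses into every subdirectory:
const fs = require('fs');
const path = require('path');

async function findOnePerDir(baseDir, musics = []) {
    const entries = await fs.promises.readdir(baseDir, {withFileTypes: true});
    const files = entries.filter(e => e.isFile());
    const dirs = entries.filter(e => e.isDirectory());

    // take the first file (if any) from this directory
    if (files.length > 0) {
        const meta = await getMusicMeta(path.join(baseDir, files[0].name)); // helper from the question
        if (meta) musics.push(meta);
    }

    // then recurse into every subdirectory, sharing the same musics array
    for (const dir of dirs) {
        await findOnePerDir(path.join(baseDir, dir.name), musics);
    }
    return musics;
}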
