JavaScript - .map running out of memory

My libraries:
const Promise = require('bluebird');
const fs = Promise.promisifyAll(require('graceful-fs'));
const path = require('path');
const xml2js = Promise.promisifyAll(require('xml2js'));
I have a large number of XML files I want to parse. I am able to create an array of paths to all the files using this function:
function getFileNames(rootPath) {
  // Read content of path
  return fs.readdirAsync(rootPath)
    // Return all directories
    .then(function(content) {
      return content.filter(function(file) {
        return fs.statSync(path.join(rootPath, file)).isDirectory();
      });
    })
    // For every directory
    .map(function(directory) {
      // Save current path
      let currentPath = path.join(rootPath, directory);
      // Read files in the directory
      return fs.readdirAsync(currentPath)
        // Filter out the XMLs
        .filter(function(file) {
          return path.extname(file) === '.XML';
        })
        // Return path to file
        .map(function(file) {
          return path.join(rootPath, directory, file);
        });
    })
    // Flatten array of results
    .reduce(function(a, b) {
      return a.concat(b);
    });
}
Now I want to go through every single file and parse it.
I have two functions to do so:
function openFile(filePath) {
  return fs.readFileAsync('./' + filePath)
    .then(function(fileData) {
      return fileData;
    });
}
function parseFile(data) {
  return xml2js.parseStringAsync(data)
    .then(function(xmlObject) {
      return xmlObject;
    });
}
Now when I call this with the .map function (getFileNames outputs an array of over 20k file-path strings):
getFileNames('./XML')
  .map(function(file) {
    openFile(file)
      .then(function(data) {
        parseFile(data)
          .then(function(object) {
            console.log(object);
          });
      });
  });
I get a JavaScript heap out of memory error:
FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - JavaScript heap
out of memory
But when I run the function a single time by passing in the path to the actual file:
openFile('./XML/2016-10-1/EUROTIPOLD2016-10-1T00-00-22.5756240530.XML')
  .then(function(data) {
    parseFile(data)
      .then(function(object) {
        console.log(object);
      });
  });
I get the desired output.
What am I doing wrong?

Iterating over that many files happens asynchronously:
1) You're getting the list of files.
2) By doing .map you're calling openFile and parseFile, which are async functions, and it takes time to read and parse each file.
Because of that asynchronicity, the code proceeds to the next file without waiting for the previous one to finish, so the garbage collector never gets a chance to sweep memory, and that is where the insufficient-memory problem comes from. Think about reading 20K files of different sizes all at once.
So here is the solution: use the async library to serialize (eachSeries) or limit (eachLimit) the iteration.
const async = require('async'); // install: npm i --save async

// getFileNames() returns a promise, so wait for the file list first
getFileNames('./XML').then((files) => {
  // use async.eachLimit(files, 3, ...) instead to allow limited parallelism
  async.eachSeries(files, (file, next) => {
    openFile(file)
      .then(parseFile)
      .then((object) => {
        // successfully parsed file, so log it out and proceed to next file
        console.log(object);
        next();
      })
      .catch((err) => {
        console.error('Cannot open or parse file:', file, err);
        next();
      });
  });
});
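Since the question already uses Bluebird, an alternative sketch is to keep the existing promise chain and let Bluebird's Promise.map concurrency option throttle how many files are open at once (the value 10 below is only an example):
getFileNames('./XML')
  .map(function(file) {
    // returning the promise makes Bluebird wait for it before starting more work
    return openFile(file)
      .then(parseFile)
      .then(function(object) {
        console.log(object);
      });
  }, { concurrency: 10 });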
P.S. Feel free to comment on and fix any code issues in my answer.

This is a simple case of your workload needing more resources. I would look at increasing the heap size to meet the demand, rather than changing the source code.
I recommend setting --max_old_space_size accordingly to meet the requirement; finding the right value may be an iterative process, though.
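For example, assuming the script's entry point is a file called parse.js (a placeholder name), the heap limit can be raised to 4 GB for a single run like this:
node --max_old_space_size=4096 parse.js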
Hope this helps.

Related

Cypress identify the downloaded file using regex

I have one scenario where I have to verify the downloaded text file's data against an API response.
Below is the code that I have tried.
Test:
const path = require('path')
const downloadsFolder = Cypress.config('downloadsFolder')
cy.task('deleteFolder', downloadsFolder)
const downloadedFilename = path.join(downloadsFolder, 'ABCDEF.txt') // '*.txt'
....
cy.get('#portmemo').its('response.body')
  .then((response) => {
    var json = JSON.parse(response);
    const resCon = json[0].content.replaceAll(/[\n\r]/g, '');
    cy.readFile(downloadedFilename).then((fc) => {
      const formatedfc = fc.replaceAll(/[\n\r]/g, '');
      cy.wrap(formatedfc).should('contains', resCon)
    })
  })
Task in /cypress/plugins/index.js
const { rmdir } = require('fs')
module.exports = (on, config) => {
  console.log("cucumber started")
  on('task', {
    deleteFolder(folderName) {
      return new Promise((resolve, reject) => {
        rmdir(folderName, { maxRetries: 5, recursive: true }, (err) => {
          if (err) {
            console.error(err);
            return reject(err)
          }
          resolve(null)
        })
      })
    },
  })
}
When I have the downloadedFilename as 'ABCDEF.txt', it works fine [I have hard-coded it here]. But I need some help to get the (dynamic) file name, as it changes every time [e.g. AUADLFA.txt, CIABJPT.txt, SVACJTM.txt, PKPQ1TM.txt, etc.].
I tried to use '*.txt', but I get a 'Timed out retrying after 4000ms: cy.readFile("C:\Repositories\xyz-testautomation\cypress\downloads/*.txt") failed because the file does not exist' error.
I referred to this doc as well but no luck yet.
What is the right way to use regex to achieve the same? Also wondering is there a way to get the recently downloaded file name?
You can make use of the task shown in the question "How can I verify if the downloaded file contains name that is dynamic".
/cypress/plugins/index.js
const fs = require('fs');
on('task', {
  downloads: (downloadspath) => {
    return fs.readdirSync(downloadspath)
  }
})
This returns a list of the files in the downloads folder.
Ideally you'd make it easy on yourself, and set the trashAssetsBeforeRuns configuration. That way, the array will only contain the one file and there's no need to compare arrays before and after the download.
(Just noticed you have a task for it).
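For reference, a rough sketch of how that task could be used in the test to pick up whichever single file ends up in the downloads folder (the API-response comparison stays the same as in the question):
const path = require('path')
const downloadsFolder = Cypress.config('downloadsFolder')
cy.task('downloads', downloadsFolder).then((files) => {
  // with trashAssetsBeforeRuns (or the deleteFolder task) there should be exactly one file here
  const downloadedFilename = path.join(downloadsFolder, files[0])
  cy.readFile(downloadedFilename).then((fc) => {
    const formatedfc = fc.replaceAll(/[\n\r]/g, '')
    // compare formatedfc against resCon as in the original test
  })
})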

Reading, parsing files and inserting documents using NestJS and MongoDB causing JavaScript heap out of memory

My NestJS application has a simple purpose to:
Loop through an array of large files (29 files, each with about 12k to 70k lines)
Read a file line by line and parse it
Insert (each line) into my MongoDB collection
The most important part of my code consists of:
for(let file of FILES){
result = await this.processFile(file);
resultInsert += result;
}
and the function processFile()
async processFile(fileName: string): Promise<number> {
count = 0;
return new Promise((resolve, reject) => {
let s = fs
.createReadStream(BASE_PATH + fileName, {encoding: 'latin1'})
.pipe(es.split())
.pipe(
es
.mapSync(async (line: string) => {
count++;
console.log(line);
let line_splited = line.split("#");
let user = {
name: line_splited[0],
age: line_splited[1],
address: line_splited[2],
job: line_splited[3],
country: line_splited[4]
}
await this.userModel.updateOne(
user,
user,
{ upsert: true }
);
})
.on('end', () => {
resolve(count);
})
.on('error', err => {
reject(err);
})
);
});
}
The main problem is that around the iteration of the ~9th file, I get a memory failure: Allocation failed - JavaScript heap out of memory.
I saw that my problem is similar to Parsing huge logfiles in Node.js - read in line-by-line but the code still managed to fail.
I suspect that opening and reading a new file while documents from the previous one are still being inserted causes the problem, but I don't know how to handle it.
I could make it work by changing updateOne() to insertMany().
Quick explanation: instead of inserting documents one by one, we insert them in batches of 100k.
So I just created an array of users, and when it reached 100k documents, we inserted them with insertMany().
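A rough sketch of that batching idea applied to processFile(), assuming the same es.split() pipeline and that userModel exposes Mongoose's insertMany() (the 100k batch size is the one from the explanation above; awaiting the intermediate inserts and their error handling are omitted for brevity):
async processFile(fileName) {
  let count = 0;
  let batch = [];
  const BATCH_SIZE = 100000; // flush to MongoDB once this many parsed lines are buffered
  return new Promise((resolve, reject) => {
    fs.createReadStream(BASE_PATH + fileName, { encoding: 'latin1' })
      .pipe(es.split())
      .pipe(
        es
          .mapSync((line) => {
            count++;
            let line_splited = line.split("#");
            batch.push({
              name: line_splited[0],
              age: line_splited[1],
              address: line_splited[2],
              job: line_splited[3],
              country: line_splited[4]
            });
            if (batch.length >= BATCH_SIZE) {
              // one bulk insert instead of one updateOne per line
              this.userModel.insertMany(batch);
              batch = [];
            }
          })
          .on('end', async () => {
            if (batch.length > 0) {
              await this.userModel.insertMany(batch); // flush the remainder
            }
            resolve(count);
          })
          .on('error', err => {
            reject(err);
          })
      );
  });
}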

Why fs.readFileSync returns nothing inside a promise on serverside?

I found this "Rendering PDFs with React components" tutorial on themeteorchef about creating PDF files on Meteor serverside and then sending them back to client. I had no really need for PDF files, but docx files instead and thought that maybe I could follow similar approach when creating docx files with officegen
I created very similar server side module that generates a docx file from inputs on clientside and then tries to transform them into base64 string that is then supposed to be sent to the client. However, the base64 string is never created.
Here's the module:
let myModule;
const getBase64String = (loc) => {
try {
const file = fs.readFileSync(loc);
return new Buffer(file).toString('base64');
} catch (exception) {
myModule.reject(exception);
}
}
const generateBase64Docx = (path, fileName) => {
try {
myModule.resolve({fileName, base64: getBase64String(path+fileName)});
fs.unlink(loc);
} catch (exception) {
myModule.reject(exception);
}
}
const formatComponentAsDocx = (props, fileName) => {
try {
var docxFile = officegen({
'type': 'docx',
'orientation': 'portrait',
'title': props.title,
});
var pObj = docxFile.createP();
pObj.addText(props.body);
var path = './';
output = fs.createWriteStream(path+fileName);
docxFile.generate(output);
return path;
} catch (exception) {
myModule.reject(exception);
}
}
const handler = ({props, fileName}, promise) => {
myModule = promise;
const path = formatComponentAsDocx(props, fileName);
if (path) {
generateBase64Docx(path, fileName);
}
}
export const generateComponentAsDocx = (options) => {
return new Promise((resolve, reject) => {
return handler(options, { resolve, reject });
});
};
The problem here is the fs.readFileSync part. It always returns an empty buffer, and that's why the file is never transformed into a base64 string and never sent back to the client. Why is that? The file itself is always created on the server and can always be found.
If I change the const file = fs.readFileSync(loc); part to, for example, this:
fs.readFile(loc, (err, data) => {
  if (err) myModule.reject(err);
  console.log(JSON.stringify(data));
});
I can see some data in data, but not enough for the whole file.
What am I doing wrong here? Am I missing something?
You need to wait until the file generated by officegen is complete before you try to get the base64 out of it. That's the minimal change you need to make. I don't recommend waiting on the finalize event generated by officegen as this event is buggy. I recommend waiting on the finish event of the output stream. However, there are additional issues with the code you show:
Since you have code to unlink the file immediately after you use it, I infer you do not need a file. So you can just create the data in memory and get a base64 string from that.
The whole rigmarole with myModule is awful awful design. If one of my colleagues presented such code, strong words would be exchanged. Yes, it is that bad. It is much better to convert the entire code base to work with promises.
The whole module can be reduced to the following. I've done a modicum of testing on this code but I don't claim that it deals with every eventuality.
import * as stream from "stream";
import officegen from "officegen";
function formatComponentAsDocx(props) {
return new Promise((resolve, reject) => {
// There's no need to wrap this in try...catch only to call reject. If any
// exception is raised in this function, the promise is automatically
// rejected.
const docxFile = officegen({
'type': 'docx',
'orientation': 'portrait',
'title': props.title,
});
const pObj = docxFile.createP();
pObj.addText(props.body);
// We record the output in our own buffer instead of writing to disk,
// and reading later.
let buf = Buffer.alloc(0);
const output = new stream.Writable({
write(chunk, encoding, callback) {
buf = Buffer.concat([buf, chunk]);
callback();
},
});
docxFile.generate(output, {
// Do propagate errors from officegen.
error: reject,
});
// We don't use the "finalize" event that docxFile.generate would emit
// because it is buggy. Instead, we wait for the output stream to emit
// the "finish" event.
output.on('finish', () => {
resolve(buf);
});
});
}
export function generateComponentAsDocx({ props }) {
return formatComponentAsDocx(props).then((data) => {
return { base64: data.toString("base64") };
});
};
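For completeness, a hypothetical call site for the rewritten module (the prop values are made up):
generateComponentAsDocx({ props: { title: 'My document', body: 'Hello world' } })
  .then(({ base64 }) => {
    // base64 is ready to be sent back to the client here
    console.log(base64.slice(0, 20) + '...');
  })
  .catch(console.error);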
Your problem is that docxFile.generate(output); is not synchronous. Thus, while your local file exists (it was created by the fs.createWriteStream() call), it is still empty, and your synchronous fs.readFileSync is catching just that: an empty file.
You should subscribe to docxFile's finalize event to catch the end of file generation:
docxFile.on('finalize', function (writtenBytes) {
  // do your work with the generated file here
});
Thus, rewriting your code:
const handler = ({props, fileName}, promise) => {
myModule = promise;
formatComponentAsDocx(props, fileName);
}
const formatComponentAsDocx = (props, fileName) => {
try {
var docxFile = officegen({
'type': 'docx',
'orientation': 'portrait',
'title': props.title,
});
var pObj = docxFile.createP();
pObj.addText(props.body);
var path = './';
output = fs.createWriteStream(path+fileName);
docxFile.on('error', function (err) {
myModule.reject(err);
});
docxFile.on('finalize', function () {
generateBase64Docx(path, fileName);
});
docxFile.generate(output);
} catch (exception) {
myModule.reject(exception);
}
}
readFileSync is synchronous, so it doesn't deal in promises.
https://nodejs.org/api/fs.html#fs_fs_readfilesync_file_options
Synchronous version of fs.readFile. Returns the contents of the file.
You probably want to use fs.readFile.
https://nodejs.org/api/fs.html#fs_fs_readfile_file_options_callback
The callback is passed two arguments (err, data), where data is the contents of the file.
If no encoding is specified, then the raw buffer is returned.
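As a rough sketch, the question's getBase64String could be switched to the callback form like this (keeping the original myModule promise wiring; note that, as the other answers point out, the docx still has to be fully generated before it is read):
const getBase64String = (loc) => {
  // asynchronous read: the callback fires once the whole file has been read
  fs.readFile(loc, (err, data) => {
    if (err) {
      myModule.reject(err);
      return;
    }
    myModule.resolve({ base64: data.toString('base64') });
  });
};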

Saving JSON in Electron

I am building an app using Electron. In this app, I am building a data structure using JSON. My data structure looks like this:
{
items: [
{ id:1, name:'football' },
{ id:2, name:'soccer ball' },
{ id:3, name:'basketball' }
]
}
I want to save this JSON to a file called "data.json". I want to save it to a file because I want to load it the next time the application starts. My challenge is, I do not know how to save the data. In fact, I'm not sure where I should even save the file. Do I save it in the same directory as the app? Or is there some cross-platform approach I should use?
Currently, I have the following:
saveClick: function() {
var json = JSON.stringify(this.data);
// assume json matches the JSON provided above.
// Now, I'm not sure how to actually save the file.
}
So, how / where do I save JSON to the local file system for use at a later time?
Electron lacks an easy way to persist and read user settings for your application. electron-json-storage implements an API somewhat similar to localStorage to write and read JSON objects to/from the operating system's application data directory, as defined by app.getPath('userData').
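A minimal sketch of what using it could look like (assuming electron-json-storage's callback-style set/get API; the 'data' key name is arbitrary):
const storage = require('electron-json-storage');
// writes a JSON file into app.getPath('userData') under the given key
storage.set('data', { items: [{ id: 1, name: 'football' }] }, function (error) {
  if (error) throw error;
  storage.get('data', function (error, data) {
    if (error) throw error;
    console.log(data.items);
  });
});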
Electron uses node.js as its core. You can use the following:
var fs = require("fs");
read_file = function(path){
return fs.readFileSync(path, 'utf8');
}
write_file = function(path, output){
fs.writeFileSync(path, output);
}
For write_file(), you can pass "document.txt" as the path and it will write to the same directory the HTML file was run from. You can also put in a full path like "C:/Users/usern/document.txt" and it will write to the specific location you want.
Also, you can choose any file extension you want (e.g. ".txt", ".js", ".json", etc.). You can even make up your own!
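Putting that together with the question's data, a sketch of hypothetical saveData/loadData helpers that keep data.json in Electron's per-user data directory (app.getPath('userData')) instead of next to the app:
const { app } = require('electron'); // in a renderer process, reach app via IPC or @electron/remote
const fs = require('fs');
const path = require('path');

function saveData(data) {
  // userData resolves to a per-user, per-app folder on Windows, macOS and Linux
  const filePath = path.join(app.getPath('userData'), 'data.json');
  fs.writeFileSync(filePath, JSON.stringify(data, null, 2));
}

function loadData() {
  const filePath = path.join(app.getPath('userData'), 'data.json');
  if (!fs.existsSync(filePath)) return { items: [] }; // nothing saved yet on first run
  return JSON.parse(fs.readFileSync(filePath, 'utf8'));
}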
I wrote a simple library that you can use, with a simple interface. It also creates subdirectories and works with promises/callbacks.
It will save the data into app.getPath("appData") as the root folder.
https://github.com/ran-y/electron-storage
Installation
$ npm install --save electron-storage
Usage
const storage = require('electron-storage');
API
storage.get(filePath, (err, data) => {
if (err) {
console.error(err)
} else {
console.log(data);
}
});
storage.get(filePath)
.then(data => {
console.log(data);
})
.catch(err => {
console.error(err);
});
storage.set(filePath, data, (err) => {
if (err) {
console.error(err)
}
});
storage.set(filePath, data)
.then(data => {
console.log(data);
})
.catch(err => {
console.error(err);
});
const fs = require('fs');
let student = {
name: 'Mike',
age: 23,
gender: 'Male',
department: 'English',
car: 'Honda'
};
let data = JSON.stringify(student, null, 2);
fs.writeFile('student-3.json', data, (err) => {
if (err) throw err;
console.log('Data written to file');
});
console.log('This is after the write call');
There are multiple steps:
Step 1: As of version 5, the default for nodeIntegration changed from true to false. You can enable it when creating the Browser Window:
const createWindow = () => {
const win = new BrowserWindow({
width: 1000,
height: 800,
webPreferences: {
nodeIntegration: true,
contextIsolation: false,
}
})
}
Step 2:
function writetofile() {
let configsettings = {
break: output.innerHTML,
alwaysonoff: toggleoutput.innerHTML,
};
let settings_data = JSON.stringify(configsettings, null, 2);
const fs = require("fs");
fs.writeFileSync("assets/configs/settings.json", settings_data);
}

Looping through files in a folder Node.JS

I am trying to loop through and pick up files in a directory, but I am having some trouble implementing it. How do I pull in multiple files and then move them to another folder?
var dirname = 'C:/FolderwithFiles';
console.log("Going to get file info!");
fs.stat(dirname, function (err, stats) {
if (err) {
return console.error(err);
}
console.log(stats);
console.log("Got file info successfully!");
// Check file type
console.log("isFile ? " + stats.isFile());
console.log("isDirectory ? " + stats.isDirectory());
});
Older answer with callbacks
You want to use the fs.readdir function to get the directory contents and the fs.rename function to actually do the renaming. Both of these functions have synchronous versions if you need to wait for them to finish before running the code that follows.
I wrote a quick script that does what you described.
var fs = require('fs');
var path = require('path');
// In newer Node.js versions where process is already global this isn't necessary.
var process = require("process");
var moveFrom = "/home/mike/dev/node/sonar/moveme";
var moveTo = "/home/mike/dev/node/sonar/tome"
// Loop through all the files in the temp directory
fs.readdir(moveFrom, function (err, files) {
if (err) {
console.error("Could not list the directory.", err);
process.exit(1);
}
files.forEach(function (file, index) {
// Make one pass and make the file complete
var fromPath = path.join(moveFrom, file);
var toPath = path.join(moveTo, file);
fs.stat(fromPath, function (error, stat) {
if (error) {
console.error("Error stating file.", error);
return;
}
if (stat.isFile())
console.log("'%s' is a file.", fromPath);
else if (stat.isDirectory())
console.log("'%s' is a directory.", fromPath);
fs.rename(fromPath, toPath, function (error) {
if (error) {
console.error("File moving error.", error);
} else {
console.log("Moved file '%s' to '%s'.", fromPath, toPath);
}
});
});
});
});
Tested on my local machine.
node testme.js
'/home/mike/dev/node/sonar/moveme/hello' is a file.
'/home/mike/dev/node/sonar/moveme/test' is a directory.
'/home/mike/dev/node/sonar/moveme/test2' is a directory.
'/home/mike/dev/node/sonar/moveme/test23' is a directory.
'/home/mike/dev/node/sonar/moveme/test234' is a directory.
Moved file '/home/mike/dev/node/sonar/moveme/hello' to '/home/mike/dev/node/sonar/tome/hello'.
Moved file '/home/mike/dev/node/sonar/moveme/test' to '/home/mike/dev/node/sonar/tome/test'.
Moved file '/home/mike/dev/node/sonar/moveme/test2' to '/home/mike/dev/node/sonar/tome/test2'.
Moved file '/home/mike/dev/node/sonar/moveme/test23' to '/home/mike/dev/node/sonar/tome/test23'.
Moved file '/home/mike/dev/node/sonar/moveme/test234' to '/home/mike/dev/node/sonar/tome/test234'.
Update: fs.promises functions with async/await
Inspired by ma11hew28's answer (shown here), here is the same thing as above but with the async functions in fs.promises. As noted by ma11hew28, this may have memory limitations versus fs.promises.opendir added in v12.12.0.
Quick code below.
//jshint esversion:8
//jshint node:true
const fs = require( 'fs' );
const path = require( 'path' );
const moveFrom = "/tmp/movefrom";
const moveTo = "/tmp/moveto";
// Make an async function that gets executed immediately
(async ()=>{
// Our starting point
try {
// Get the files as an array
const files = await fs.promises.readdir( moveFrom );
// Loop them all with the new for...of
for( const file of files ) {
// Get the full paths
const fromPath = path.join( moveFrom, file );
const toPath = path.join( moveTo, file );
// Stat the file to see if we have a file or dir
const stat = await fs.promises.stat( fromPath );
if( stat.isFile() )
console.log( "'%s' is a file.", fromPath );
else if( stat.isDirectory() )
console.log( "'%s' is a directory.", fromPath );
// Now move async
await fs.promises.rename( fromPath, toPath );
// Log because we're crazy
console.log( "Moved '%s'->'%s'", fromPath, toPath );
} // End for...of
}
catch( e ) {
// Catch anything bad that happens
console.error( "We've thrown! Whoops!", e );
}
})(); // Wrap in parenthesis and call now
fs.readdir(path[, options], callback) (which Mikey A. Leonetti used in his answer) and its variants (fsPromises.readdir(path[, options]) and fs.readdirSync(path[, options])) each reads all of a directory's entries into memory at once. That's good for most cases, but if the directory has very many entries and/or you want to lower your application's memory footprint, you could instead iterate over the directory's entries one at a time.
Asynchronously
Directories are async iterable, so you could do something like this:
const fs = require('fs')
async function ls(path) {
const dir = await fs.promises.opendir(path)
for await (const dirent of dir) {
console.log(dirent.name)
}
}
ls('.').catch(console.error)
Or, you could use dir.read() and/or dir.read(callback) directly.
Synchronously
Directories aren't sync iterable, but you could use dir.readSync() directly. For example:
const fs = require('fs')
const dir = fs.opendirSync('.')
let dirent
while ((dirent = dir.readSync()) !== null) {
console.log(dirent.name)
}
dir.closeSync()
Or, you could make directories sync iterable. For example:
const fs = require('fs')
function makeDirectoriesSyncIterable() {
const p = fs.Dir.prototype
if (p.hasOwnProperty(Symbol.iterator)) { return }
const entriesSync = function* () {
try {
let dirent
while ((dirent = this.readSync()) !== null) { yield dirent }
} finally { this.closeSync() }
}
if (!p.hasOwnProperty('entriesSync')) { p.entriesSync = entriesSync }
Object.defineProperty(p, Symbol.iterator, {
configurable: true,
enumerable: false,
value: entriesSync,
writable: true
})
}
makeDirectoriesSyncIterable()
And then, you could do something like this:
const dir = fs.opendirSync('.')
for (const dirent of dir) {
console.log(dirent.name)
}
Note: "In busy processes, use the asynchronous versions of these calls. The synchronous versions will block the entire process until they complete, halting all connections."
References:
Node.js Documentation: File System: Class fs.Dir
Node.js source code: fs.Dir
GitHub: nodejs/node: Issues: streaming / iterative fs.readdir #583
Read all folders in a directory
const readAllFolder = (dirMain) => {
const readDirMain = fs.readdirSync(dirMain);
console.log(dirMain);
console.log(readDirMain);
readDirMain.forEach((dirNext) => {
console.log(dirNext, fs.lstatSync(dirMain + "/" + dirNext).isDirectory());
if (fs.lstatSync(dirMain + "/" + dirNext).isDirectory()) {
readAllFolder(dirMain + "/" + dirNext);
}
});
};
The answers provided are for a single folder. Here is an asynchronous implementation for multiple folders, where all the folders are processed simultaneously but the smaller folders or files get completed first.
Please comment if you have any feedback.
Asynchronously Multiple Folders
const fs = require('fs')
const util = require('util')
const path = require('path')
// Multiple folders list
const in_dir_list = [
'Folder 1 Large',
'Folder 2 Small', // small folder and files will complete first
'Folder 3 Extra Large'
]
// BEST PRACTICES: (1) Faster folder list For loop has to be outside async_capture_callback functions for async to make sense
// (2) Slower Read Write or I/O processes best be contained in an async_capture_callback functions because these processes are slower than for loop events and faster completed items get callback-ed out first
for (i = 0; i < in_dir_list.length; i++) {
var in_dir = in_dir_list[i]
// function is created (see below) so each folder is processed asynchronously for readFile_async that follows
readdir_async_capture(in_dir, function(files_path) {
console.log("Processing folders asynchronously ...")
for (j = 0; j < files_path.length; j++) {
file_path = files_path[j]
file = file_path.substr(file_path.lastIndexOf("/") + 1, file_path.length)
// function is created (see below) so all files are read simultaneously but the smallest file will be completed first and get callback-ed first
readFile_async_capture(file_path, file, function(file_string) {
try {
console.log(file_path)
console.log(file_string)
} catch (error) {
console.log(error)
console.log("System exiting first to catch error if not async will continue...")
process.exit()
}
})
}
})
}
// fs.readdir async_capture function to deal with asynchronous code above
function readdir_async_capture(in_dir, callback) {
fs.readdir(in_dir, function(error, files) {
if (error) { return console.log(error) }
files_path = files.map(function(x) { return path.join(in_dir, x) })
callback(files_path)
})
}
// fs.readFile async_capture function to deal with asynchronous code above
function readFile_async_capture(file_path, file, callback) {
fs.readFile(file_path, function(error, data) {
if (error) { return console.log(error) }
file_string = data.toString()
callback(file_string)
})
}
