How do I run the data in Chunks for Papa Parse? - javascript

My file size is 2483385 KB, and the following code using Papa Parse does not work for me:
this.readFile((output) => {
  _this.sample = get(Papa.parse(output, {preview: 2, skipEmptyLines: true, chunkSize: 1024 * 1024 * 45}), "data");
  _this.csv = get(Papa.parse(output, {skipEmptyLines: true, chunkSize: 1024 * 1024 * 45}), "data");
});
It cannot read the large CSV file. Am I doing it wrong?

You can try streaming.
const { createReadStream } = require('fs');
const papa = require('papaparse');

/**
 * Returns an async generator that yields the CSV file line-by-line.
 *
 * @param path {String}
 * @param options {papa.ParseConfig}
 */
async function* readCsvStream(path, options) {
  const csvStream = createReadStream(path);
  const parseStream = papa.parse(papa.NODE_STREAM_INPUT, options);
  csvStream.pipe(parseStream);
  for await (const chunk of parseStream) {
    yield chunk;
  }
}
(async () => {
  try {
    const asyncIterator = readCsvStream(`${__dirname}/cad_fi.csv`, {
      header: true,
      dynamicTyping: true,
    });
    for await (const chunk of asyncIterator) {
      // You can do anything with each row of the `.csv` file here,
      // like saving it to a DB row by row
      console.log(chunk);
    }
  } catch (error) {
    console.error(error);
  }
})();
Or, alternatively, you can pass a step callback to the parse function, which will be called for each line, so you can process rows one at a time instead of keeping the whole parsed result in memory.
const { readFile } = require('fs/promises');

/**
 * Parses a CSV string, calling `stepCallback` for each row.
 *
 * @param csvString {String}
 * @param stepCallback {Function}
 * @param options {papa.ParseConfig}
 */
function readCsv(csvString, stepCallback, options) {
  papa.parse(csvString, {
    ...options,
    step: stepCallback,
  });
}
(async () => {
  // I read it from FS because it is NodeJs, but you can get the string by any means.
  const csv = (await readFile(`${__dirname}/cad_fi.csv`)).toString();
  const data = [];
  const errors = [];
  const stepCallback = (results, parser) => {
    if (results?.errors?.length) {
      errors.push(...results.errors);
    }
    data.push(results.data);
  };
  const papaparseOptions = {
    header: true,
    dynamicTyping: true,
  };
  try {
    readCsv(csv, stepCallback, papaparseOptions);
    console.log(errors);
    console.log(data.length);
  } catch (error) {
    console.error(error);
  }
})();
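If the CSV lives in a file (or a ReadableStream) rather than a string, Papa Parse can also call a chunk callback together with chunkSize, so only one chunk of parsed rows is held at a time. Here is a minimal browser-side sketch, assuming a hypothetical file input (#csv-input) and handler (handleRows) that are not part of the original question:
// Parse a File selected in the browser chunk by chunk.
// `#csv-input` and `handleRows` are placeholder names, not from the question.
const input = document.querySelector('#csv-input');

input.addEventListener('change', () => {
  Papa.parse(input.files[0], {
    header: true,
    skipEmptyLines: true,
    chunkSize: 1024 * 1024 * 5, // parse roughly 5 MB at a time
    chunk: (results, parser) => {
      // `results.data` holds only the rows of the current chunk.
      handleRows(results.data);
    },
    complete: () => console.log('done'),
    error: (err) => console.error(err),
  });
});

function handleRows(rows) {
  // Placeholder: send the rows to a server, aggregate them, etc.
  console.log(`${rows.length} rows in this chunk`);
}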

Related

Why decorations and data is not updating in my VScode extension?

I am trying to write a Quokka.js-like VS Code extension. I am using the decorations API to display var/const/let values in the editor, and the Chrome DevTools inspector Node API to run and compile the code. In the activate function I subscribe to the VS Code document-save event and read the values from the global scope. But it only works once; after that the data doesn't update. Why?
// Import the module and reference it with the alias vscode in your code below
import * as vscode from 'vscode';
import * as inspector from 'inspector';
import * as path from 'path';
import * as util from 'util';
import { memoryUsage, versions } from 'process';
import * as fs from 'fs';
import * as kill from 'tree-kill';
// console.log(vscode.Disposable);
const activeTextEditor = vscode.window.activeTextEditor;
let decorationTypes: any = [];
const makeDecorationWithText = (
contentText: string,
line: number,
column: number,
activeEditor: vscode.TextEditor
) => {
const decorationType = vscode.window.createTextEditorDecorationType({
after: {
contentText,
margin: '20px'
}
});
decorationTypes.push(decorationType);
const range = new vscode.Range(
new vscode.Position(line, column),
new vscode.Position(line, column)
);
activeEditor.setDecorations(decorationType, [{ range }]);
};
const clearDecorations = () => {
if (decorationTypes.length > 0) {
for (let i = 0; i < decorationTypes.length; i++) {
activeTextEditor?.setDecorations(decorationTypes[i], []);
decorationTypes[i].dispose();
}
decorationTypes = [];
} else {
vscode.window.showInformationMessage('Array is empty');
}
};
export async function activate(context: vscode.ExtensionContext) {
clearDecorations();
let disposable = vscode.commands.registerCommand('coderunner3.coderun', async () => {
vscode.window.showInformationMessage('Coderun is working!');
vscode.workspace.onDidSaveTextDocument(async () => {
const session = new inspector.Session();
session.connect();
const post = <any>util.promisify(session.post).bind(session);
clearDecorations();
// await post('Memory.forciblyPurgeJavaScriptMemory');
await post('Runtime.disable');
// await post('Runtime.addBinding', 'variables');
await post('Runtime.enable');
await post('Debugger.enable');
const activeEditor = vscode.window.activeTextEditor;
if (!activeEditor) {
return;
}
const document = activeEditor.document;
const filename = path.basename(document.uri.toString());
const { scriptId } = await post('Runtime.compileScript',
{
expression: vscode.window.activeTextEditor?.document.getText(),
sourceURL: filename,
persistScript: true
});
// console.log(scriptId);
await post('Runtime.runScript', { scriptId });
const data = await post('Runtime.globalLexicalScopeNames', {
executionContextId: 1
});
// console.log(data.names[0]);
data.names.map(async (expression: string) => {
console.log(expression);
const executionResult = await post('Runtime.evaluate', { expression, contextId: 1 });
const { value } = executionResult.result;
// console.log(executionResult.result);
const { result } = await post('Debugger.searchInContent', {
scriptId, query: expression
});
// for(let i = 0; i < result.length; i++) {
// let test = await post('Runtime.evaluate', { expression: result[i].lineContent, contextId: 1 });
// console.log(test);
// }
// console.log(executionResult);
makeDecorationWithText(`${value}`, result[0].lineNumber, result[0].lineContent.length, activeEditor);
});
await post('Runtime.disable');
await post('Debugger.disable');
// await post('Debugger.setBreakpoint', { scriptId, lineNumber: 1 });
// await post('CacheStorage.deleteCache', { cacheId: scriptId });
session.disconnect();
});
});
context.subscriptions.push(disposable);
}
export function deactivate() {
clearDecorations();
}

Upload byte array from axios to Node server

Background
The JavaScript library for Microsoft Office add-ins allows you to get the raw content of the DOCX file through the getFileAsync() API, which returns a slice of up to 4 MB in one go. You keep calling the function using a sliding-window approach until you have read the entire content. I need to upload these slices to the server and then join them back together to recreate the original DOCX file.
My attempt
I'm using axios on the client side and the busboy-based express-chunked-file-upload middleware on my Node server. As I call getFileAsync recursively, I get a raw array of bytes that I then convert to a Blob and append to FormData before posting it to the Node server. The entire thing works and I get the slice on the server. However, the chunk that gets written to disk on the server is much larger than the blob I uploaded, normally on the order of 3 times larger, so it is obviously not what I sent.
My suspicion is that this may have to do with stream encoding, but the node middleware does not expose any options to set encoding.
Here is the current state of code:
Client-side
public sendActiveDocument(uploadAs: string, sliceSize: number): Promise<boolean> {
return new Promise<boolean>((resolve) => {
Office.context.document.getFileAsync(Office.FileType.Compressed,
{ sliceSize: sliceSize },
async (result) => {
if (result.status == Office.AsyncResultStatus.Succeeded) {
// Get the File object from the result.
const myFile = result.value;
const state = {
file: myFile,
filename: uploadAs,
counter: 0,
sliceCount: myFile.sliceCount,
chunkSize: sliceSize
} as getFileState;
console.log("Getting file of " + myFile.size + " bytes");
const hash = makeId(12)
this.getSlice(state, hash).then(resolve(true))
} else {
resolve(false)
}
})
})
}
private async getSlice(state: getFileState, fileHash: string): Promise<boolean> {
const result = await this.getSliceAsyncPromise(state.file, state.counter)
if (result.status == Office.AsyncResultStatus.Succeeded) {
const data = result.value.data;
if (data) {
const formData = new FormData();
formData.append("file", new Blob([data]), state.filename);
const boundary = makeId(12);
const start = state.counter * state.chunkSize
const end = (state.counter + 1) * state.chunkSize
const total = state.file.size
return await Axios.post('/upload', formData, {
headers: {
"Content-Type": `multipart/form-data; boundary=${boundary}`,
"file-chunk-id": fileHash,
"file-chunk-size": state.chunkSize,
"Content-Range": 'bytes ' + start + '-' + end + '/' + total,
},
}).then(async res => {
if (res.status === 200) {
state.counter++;
if (state.counter < state.sliceCount) {
return await this.getSlice(state, fileHash);
}
else {
this.closeFile(state);
return true
}
}
else {
return false
}
}).catch(err => {
console.log(err)
this.closeFile(state)
return false
})
} else {
return false
}
}
else {
console.log(result.status);
return false
}
}
private getSliceAsyncPromise(file: Office.File, sliceNumber: number): Promise<Office.AsyncResult<Office.Slice>> {
return new Promise(function (resolve) {
file.getSliceAsync(sliceNumber, result => resolve(result))
})
}
Server-side
This code is totally from the npm package (link above), so I'm not supposed to change anything in here, but still for reference:
makeMiddleware = () => {
return (req, res, next) => {
const busboy = new Busboy({ headers: req.headers });
busboy.on('file', (fieldName, file, filename, _0, _1) => {
if (this.fileField !== fieldName) { // Current field is not handled.
return next();
}
const chunkSize = req.headers[this.chunkSizeHeader] || 500000; // Default: 500Kb.
const chunkId = req.headers[this.chunkIdHeader] || 'unique-file-id'; // If not specified, will reuse same chunk id.
// NOTE: Using the same chunk id for multiple file uploads in parallel will corrupt the result.
const contentRangeHeader = req.headers['content-range'];
let contentRange;
const errorMessage = util.format(
'Invalid Content-Range header: %s', contentRangeHeader
);
try {
contentRange = parse(contentRangeHeader);
} catch (err) {
return next(new Error(errorMessage));
}
if (!contentRange) {
return next(new Error(errorMessage));
}
const part = contentRange.start / chunkSize;
const partFilename = util.format('%i.part', part);
const tmpDir = util.format('/tmp/%s', chunkId);
this._makeSureDirExists(tmpDir);
const partPath = path.join(tmpDir, partFilename);
const writableStream = fs.createWriteStream(partPath);
file.pipe(writableStream);
file.on('end', () => {
req.filePart = part;
if (this._isLastPart(contentRange)) {
req.isLastPart = true;
this._buildOriginalFile(chunkId, chunkSize, contentRange, filename).then(() => {
next();
}).catch(_ => {
const errorMessage = 'Failed merging parts.';
next(new Error(errorMessage));
});
} else {
req.isLastPart = false;
next();
}
});
});
req.pipe(busboy);
};
}
Update
So it looks like I have found the problem, at least. busboy appears to be writing my array of bytes as text in the output file. I get 80,75,3,4,20,0,6,0,8,0,0,0,33,0,44,25 (as text) when I upload the array of bytes [80,75,3,4,20,0,6,0,8,0,0,0,33,0,44,25]. Now I need to figure out how to force it to write it as a binary stream.
Figured it out. Just in case it helps anyone: there was no problem with busboy, office.js or axios. I just had to convert the incoming chunk of data to a Uint8Array before creating a Blob from it. So instead of:
formData.append("file", new Blob([data]), state.filename);
I did this:
const blob = new Blob([ new Uint8Array(data) ])
formData.append("file", blob, state.filename);
And it worked like a charm.
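For anyone wondering why the file on disk was roughly three times larger: passing a plain array of numbers to the Blob constructor stringifies it, so the bytes end up as comma-separated text. A quick sketch that can be run in a browser console to see the difference:
// A plain number array is stringified by the Blob constructor,
// so [80, 75, 3] becomes the text "80,75,3" (7 characters).
const asText = new Blob([[80, 75, 3]]);
console.log(asText.size); // 7

// A Uint8Array is treated as raw bytes instead.
const asBinary = new Blob([new Uint8Array([80, 75, 3])]);
console.log(asBinary.size); // 3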

Better way to write CSV from Parquet in Javascript

I am converting from Parquet to CSV using JavaScript.
The example below works, but I am storing the whole array of values read from Parquet in memory, in records.
Parquet library uses AsyncIterator while the CSV library uses Node Stream API.
I would like to know how to implement a more elegant solution, leveraging streams and reducing memory footprint. TIA
libraries -
Parquet: https://github.com/ironSource/parquetjs
CSV: https://csv.js.org/
import pts from 'parquets'
let { ParquetSchema, ParquetWriter, ParquetReader } = pts
import * as fs from 'fs'
import stringify from 'csv-stringify'
// declare a schema for the `PI` table
let schema = new ParquetSchema({
Source: { type: 'UTF8' },
TagID: { type: 'UTF8' },
Timestamp: { type: 'TIMESTAMP_MILLIS' },
Value: { type: 'DOUBLE' },
});
const WriterParquet = async () => {
// create new ParquetWriter that writes to 'pi.parquet`
let writer = await ParquetWriter.openFile(schema, 'pi.parquet')
// append a few rows to the file
await writer.appendRow({Source: 'PI/NO-SVG-PISRV01', TagID: 'OGP8TI198Z.PV', Timestamp: new Date(), Value: 410 })
await writer.appendRow({Source: 'PI/NO-SVG-PISRV01', TagID: 'OGP8TI198Z.PV', Timestamp: new Date(), Value: 420 })
await writer.close()
}
const WriterCSV = async () => {
// create new ParquetReader that reads from 'pi.parquet`
let reader = await ParquetReader.openFile('pi.parquet')
// create a new cursor
let cursor = reader.getCursor()
// read all records from the file and print them
let records = []
let record = null;
while (record = await cursor.next()) {
console.log(record)
records.push(record)
}
await reader.close()
// write to CSV
stringify(records, {
header: true
}, function (err, output) {
console.log(output)
fs.writeFile('./pi.csv', output, () => {});
})
}
const Main = async () => {
console.log('writing parquet...')
await WriterParquet()
console.log('reading parquet and writing csv...')
await WriterCSV()
}
Main()
Instead of using the cursor, I used Readable.from(reader) to create a readable stream; after that, it was easy to pipe into csv-stringify:
import { Readable } from 'stream'

const WriterCSV = async () => {
  // create new ParquetReader that reads from 'pi.parquet'
  let reader = await ParquetReader.openFile('pi.parquet')
  // read all records from the file and pipe them into the CSV stringifier
  const readStream = Readable.from(reader)
  readStream.pipe(
    stringify({
      header: true,
      columns: {
        Source: 'Source',
        TagID: 'TagID',
        Timestamp: 'Timestamp',
        Value: 'Value'
      }
    }, function (error, output) {
      fs.writeFile('./pi.csv', output, () => {});
    })
  )
  readStream.on('end', async function () {
    await reader.close();
  });
}
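Note that the callback form of csv-stringify above still assembles the entire CSV output string in memory before writing it. If the goal is to keep the memory footprint low end to end, the stringifier can also be used purely as a transform stream and piped straight into a file write stream. A minimal sketch under that assumption (same file name and columns as above, not tested against parquetjs):
import { Readable } from 'stream'
import * as fs from 'fs'
import stringify from 'csv-stringify'

const WriteCSVFullyStreamed = async (reader) => {
  // `reader` is an already-opened ParquetReader, as in the snippet above.
  const readStream = Readable.from(reader)

  await new Promise((resolve, reject) => {
    readStream
      // rows -> CSV lines; csv-stringify acts as a Transform stream here
      .pipe(stringify({
        header: true,
        columns: {
          Source: 'Source',
          TagID: 'TagID',
          Timestamp: 'Timestamp',
          Value: 'Value'
        }
      }))
      // each CSV line is written as soon as it is produced
      .pipe(fs.createWriteStream('./pi.csv'))
      .on('finish', resolve)
      .on('error', reject)
  })

  await reader.close()
}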

ftp directory download triggers maximum call stack exceeded error

I'm currently working on a backup script with NodeJS. The script downloads a directory and its files and subdirectories recursively using FTP/FTPS. I'm using the basic-ftp package to do the FTP calls.
When I try to download a big directory with a lot of subdirectories, I get the Maximum call stack size exceeded error, but I can't figure out why or where it happens. I don't see any infinite loop or any missing return calls. After hours of debugging, I have no more ideas.
I don't use the downloadDirTo method from basic-ftp, because I don't want downloading to stop after an error happens. When an error occurs, the script should keep going and add the error to the log file.
The repository is here: https://github.com/julianpoemp/webspace-backup.
As soon as the FTPManager is ready, I call the doBackup method (see method in BackupManager). This method calls the downloadFolder method defined in FTPManager.
export class BackupManager {
private ftpManager: FtpManager;
constructor() {
osLocale().then((locale) => {
ConsoleOutput.info(`locale is ${locale}`);
moment.locale(locale);
}).catch((error) => {
ConsoleOutput.error(error);
});
this.ftpManager = new FtpManager(AppSettings.settings.backup.root, {
host: AppSettings.settings.server.host,
port: AppSettings.settings.server.port,
user: AppSettings.settings.server.user,
password: AppSettings.settings.server.password,
pasvTimeout: AppSettings.settings.server.pasvTimeout
});
this.ftpManager.afterManagerIsReady().then(() => {
this.doBackup();
}).catch((error) => {
ConsoleOutput.error(error);
});
}
public doBackup() {
let errors = '';
if (fs.existsSync(path.join(AppSettings.appPath, 'errors.log'))) {
fs.unlinkSync(path.join(AppSettings.appPath, 'errors.log'));
}
if (fs.existsSync(path.join(AppSettings.appPath, 'statistics.txt'))) {
fs.unlinkSync(path.join(AppSettings.appPath, 'statistics.txt'));
}
const subscr = this.ftpManager.error.subscribe((message: string) => {
ConsoleOutput.error(`${moment().format('L LTS')}: ${message}`);
const line = `${moment().format('L LTS')}:\t${message}\n`;
errors += line;
fs.appendFile(path.join(AppSettings.appPath, 'errors.log'), line, {
encoding: 'Utf8'
}, () => {
});
});
let name = AppSettings.settings.backup.root.substring(0, AppSettings.settings.backup.root.lastIndexOf('/'));
name = name.substring(name.lastIndexOf('/') + 1);
const downloadPath = (AppSettings.settings.backup.downloadPath === '') ? AppSettings.appPath : AppSettings.settings.backup.downloadPath;
ConsoleOutput.info(`Remote path: ${AppSettings.settings.backup.root}\nDownload path: ${downloadPath}\n`);
this.ftpManager.statistics.started = Date.now();
this.ftpManager.downloadFolder(AppSettings.settings.backup.root, path.join(downloadPath, name)).then(() => {
this.ftpManager.statistics.ended = Date.now();
this.ftpManager.statistics.duration = (this.ftpManager.statistics.ended - this.ftpManager.statistics.started) / 1000 / 60;
ConsoleOutput.success('Backup finished!');
const statistics = `\n-- Statistics: --
Started: ${moment(this.ftpManager.statistics.started).format('L LTS')}
Ended: ${moment(this.ftpManager.statistics.ended).format('L LTS')}
Duration: ${this.ftpManager.getTimeString(this.ftpManager.statistics.duration * 60 * 1000)} (H:m:s)
Folders: ${this.ftpManager.statistics.folders}
Files: ${this.ftpManager.statistics.files}
Errors: ${errors.split('\n').length - 1}`;
ConsoleOutput.log('\n' + statistics);
fs.writeFileSync(path.join(AppSettings.appPath, 'statistics.txt'), statistics, {
encoding: 'utf-8'
});
if (errors !== '') {
ConsoleOutput.error(`There are errors. Please read the errors.log file for further information.`);
}
subscr.unsubscribe();
this.ftpManager.close();
}).catch((error) => {
ConsoleOutput.error(error);
this.ftpManager.close();
});
}
}
import * as ftp from 'basic-ftp';
import {FileInfo} from 'basic-ftp';
import * as Path from 'path';
import * as fs from 'fs';
import {Subject} from 'rxjs';
import {FtpEntry, FTPFolder} from './ftp-entry';
import {ConsoleOutput} from './ConsoleOutput';
import moment = require('moment');
export class FtpManager {
private isReady = false;
private _client: ftp.Client;
private currentDirectory = '';
public readyChange: Subject<boolean>;
public error: Subject<string>;
private connectionOptions: FTPConnectionOptions;
public statistics = {
folders: 0,
files: 0,
started: 0,
ended: 0,
duration: 0
};
private recursives = 0;
constructor(path: string, options: FTPConnectionOptions) {
this._client = new ftp.Client();
this._client.ftp.verbose = false;
this.readyChange = new Subject<boolean>();
this.error = new Subject<string>();
this.currentDirectory = path;
this.connectionOptions = options;
this.connect().then(() => {
this.isReady = true;
this.gotTo(path).then(() => {
this.onReady();
}).catch((error) => {
ConsoleOutput.error('ERROR: ' + error);
this.onConnectionFailed();
});
});
}
private connect(): Promise<void> {
return new Promise<void>((resolve, reject) => {
this._client.access({
host: this.connectionOptions.host,
user: this.connectionOptions.user,
password: this.connectionOptions.password,
secure: true
}).then(() => {
resolve();
}).catch((error) => {
reject(error);
});
});
}
private onReady = () => {
this.isReady = true;
this.readyChange.next(true);
};
private onConnectionFailed() {
this.isReady = false;
this.readyChange.next(false);
}
public close() {
this._client.close();
}
public async gotTo(path: string) {
return new Promise<void>((resolve, reject) => {
if (this.isReady) {
ConsoleOutput.info(`open ${path}`);
this._client.cd(path).then(() => {
this._client.pwd().then((dir) => {
this.currentDirectory = dir;
resolve();
}).catch((error) => {
reject(error);
});
}).catch((error) => {
reject(error);
});
} else {
reject(`FTPManager is not ready. gotTo ${path}`);
}
});
}
public async listEntries(path: string): Promise<FileInfo[]> {
if (this.isReady) {
return this._client.list(path);
} else {
throw new Error('FtpManager is not ready. list entries');
}
}
public afterManagerIsReady(): Promise<void> {
return new Promise<void>((resolve, reject) => {
if (this.isReady) {
resolve();
} else {
this.readyChange.subscribe(() => {
resolve();
},
(error) => {
reject(error);
},
() => {
});
}
});
}
public async downloadFolder(remotePath: string, downloadPath: string) {
this.recursives++;
if (this.recursives % 100 === 99) {
ConsoleOutput.info('WAIT');
await this.wait(0);
}
if (!fs.existsSync(downloadPath)) {
fs.mkdirSync(downloadPath);
}
try {
const list = await this.listEntries(remotePath);
for (const fileInfo of list) {
if (fileInfo.isDirectory) {
const folderPath = remotePath + fileInfo.name + '/';
try {
await this.downloadFolder(folderPath, Path.join(downloadPath, fileInfo.name));
this.statistics.folders++;
ConsoleOutput.success(`${this.getCurrentTimeString()}===> Directory downloaded: ${remotePath}\n`);
} catch (e) {
this.error.next(e);
}
} else if (fileInfo.isFile) {
try {
const filePath = remotePath + fileInfo.name;
if (this.recursives % 100 === 99) {
ConsoleOutput.info('WAIT');
await this.wait(0);
}
await this.downloadFile(filePath, downloadPath, fileInfo);
} catch (e) {
this.error.next(e);
}
}
}
return true;
} catch (e) {
this.error.next(e);
return true;
}
}
public async downloadFile(path: string, downloadPath: string, fileInfo: FileInfo) {
this.recursives++;
if (fs.existsSync(downloadPath)) {
const handler = (info) => {
let procent = Math.round((info.bytes / fileInfo.size) * 10000) / 100;
if (isNaN(procent)) {
procent = 0;
}
let procentStr = '';
if (procent < 10) {
procentStr = '__';
} else if (procent < 100) {
procentStr = '_';
}
procentStr += procent.toFixed(2);
ConsoleOutput.log(`${this.getCurrentTimeString()}---> ${info.type} (${procentStr}%): ${info.name}`);
};
if (this._client.closed) {
try {
await this.connect();
} catch (e) {
throw new Error(e);
}
}
this._client.trackProgress(handler);
try {
await this._client.downloadTo(Path.join(downloadPath, fileInfo.name), path);
this._client.trackProgress(undefined);
this.statistics.files++;
return true;
} catch (e) {
throw new Error(e);
}
} else {
throw new Error('downloadPath does not exist');
}
}
public chmod(path: string, permission: string): Promise<void> {
return new Promise<void>((resolve, reject) => {
this._client.send(`SITE CHMOD ${permission} ${path}`).then(() => {
console.log(`changed chmod of ${path} to ${permission}`);
resolve();
}).catch((error) => {
reject(error);
});
});
}
public getCurrentTimeString(): string {
const duration = Date.now() - this.statistics.started;
return moment().format('L LTS') + ' | Duration: ' + this.getTimeString(duration) + ' ';
}
public getTimeString(timespan: number) {
if (timespan < 0) {
timespan = 0;
}
let result = '';
const minutes: string = this.formatNumber(this.getMinutes(timespan), 2);
const seconds: string = this.formatNumber(this.getSeconds(timespan), 2);
const hours: string = this.formatNumber(this.getHours(timespan), 2);
result += hours + ':' + minutes + ':' + seconds;
return result;
}
private formatNumber = (num, length): string => {
let result = '' + num.toFixed(0);
while (result.length < length) {
result = '0' + result;
}
return result;
};
private getSeconds(timespan: number): number {
return Math.floor(timespan / 1000) % 60;
}
private getMinutes(timespan: number): number {
return Math.floor(timespan / 1000 / 60) % 60;
}
private getHours(timespan: number): number {
return Math.floor(timespan / 1000 / 60 / 60);
}
public async wait(time: number): Promise<void> {
return new Promise<void>((resolve) => {
setTimeout(() => {
resolve();
}, time);
});
}
}
export interface FTPConnectionOptions {
host: string;
port: number;
user: string;
password: string;
pasvTimeout: number;
}
Problem
Inside the FtpManager.downloadFolder function, I see recursive calls to the same downloadFolder method with an await. Your Maximum call stack exceeded error could come from there, since your initial call will need to keep everything in memory while traversing all subdirectories.
Proposed solution
Instead of awaiting everything recursively, you could set up a queue system, with an algorithm like this:
Add the current folder to a queue
While that queue is not empty:
Get the first folder in the queue (and remove it from it)
List all entries in it
Download all files
Add all subfolders to the queue
This allows you to download a lot of folders in a loop, instead of using recursion. Each loop iteration runs independently, meaning that the result of the root directory download won't depend on the deeply nested file tree inside it.
Using a queue manager
There are plenty of queue manager modules for NodeJS, which allow you to have concurrency, timeouts, etc. One I've used in the past is simply named queue. It has a lot of useful features, but will require a little more work to implement in your project. Hence, for this answer, I used no external queue module, so that you can see the logic behind it. Feel free to search for queue, job, concurrency...
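For orientation only, a job queue built with that package might look roughly like this. This is a from-memory sketch (the exact option names, and whether the module is called as a factory or with new, vary between versions), so check the package's README before copying it:
const queue = require('queue');

// Assumed option: run at most 2 jobs at a time.
// Depending on the version you may need `new Queue({...})` instead.
const q = queue({ concurrency: 2 });

// Each job is a function that receives a callback to signal completion.
q.push(done => {
  downloadSomething()          // hypothetical async work, not a real API
    .then(() => done())
    .catch(err => done(err));
});

// Start processing; the callback fires once the queue has drained.
q.start(err => {
  if (err) console.error(err);
  else console.log('all jobs finished');
});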
Example
I wanted to implement that logic directly in your own code, but I don't use TypeScript, so I thought I'd make a simple folder-copy function which uses the same logic.
Note: for simplicity, I've not added any error handling; this is just a proof of concept! You can find a demo project which uses this here on my GitHub.
Here is how I've done it:
const fs = require('fs-extra');
const Path = require('path');
class CopyManager {
constructor() {
// Create a queue accessible by all methods
this.folderQueue = [];
}
/**
* Copies a directory
* @param {String} remotePath
* @param {String} downloadPath
*/
async copyFolder(remotePath, downloadPath) {
// Add the folder to the queue
this.folderQueue.push({ remotePath, downloadPath });
// While the queue contains folders to download
while (this.folderQueue.length > 0) {
// Download them
const { remotePath, downloadPath } = this.folderQueue.shift();
console.log(`Copy directory: ${remotePath} to ${downloadPath}`);
await this._copyFolderAux(remotePath, downloadPath);
}
}
/**
* Private internal method which copies the files from a folder,
* but if it finds subfolders, simply adds them to the folderQueue
* @param {String} remotePath
* @param {String} downloadPath
*/
async _copyFolderAux(remotePath, downloadPath) {
await fs.mkdir(downloadPath);
const list = await this.listEntries(remotePath);
for (const fileInfo of list) {
if (fileInfo.isDirectory) {
const folderPath = Path.join(remotePath, fileInfo.name);
const targetPath = Path.join(downloadPath, fileInfo.name);
// Push the folder to the queue
this.folderQueue.push({ remotePath: folderPath, downloadPath: targetPath });
} else if (fileInfo.isFile) {
const filePath = Path.join(remotePath, fileInfo.name);
await this.copyFile(filePath, downloadPath, fileInfo);
}
}
}
/**
* Copies a file
* @param {String} filePath
* @param {String} downloadPath
* @param {Object} fileInfo
*/
async copyFile(filePath, downloadPath, fileInfo) {
const targetPath = Path.join(downloadPath, fileInfo.name);
console.log(`Copy file: ${filePath} to ${targetPath}`);
return await fs.copy(filePath, targetPath);
}
/**
* Lists entries from a folder
* @param {String} remotePath
*/
async listEntries(remotePath) {
const fileNames = await fs.readdir(remotePath);
return Promise.all(
fileNames.map(async name => {
const stats = await fs.lstat(Path.join(remotePath, name));
return {
name,
isDirectory: stats.isDirectory(),
isFile: stats.isFile()
};
})
);
}
}
module.exports = CopyManager;
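For completeness, a minimal usage sketch, assuming the class above is saved as CopyManager.js and that ./source and ./destination are placeholder paths:
const CopyManager = require('./CopyManager');

(async () => {
  const manager = new CopyManager();
  // Copies ./source into ./destination folder by folder, using the queue.
  await manager.copyFolder('./source', './destination');
  console.log('Copy finished');
})();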
I found the source of the problem. It's the pkg package that emits the maximum call stack exceeded error: www.github.com/zeit/pkg/issues/681.
When I test it directly using node on Windows, it works. I will either downgrade to Node 10 or look for another solution.
Thanks @blex for the help!

Why does my code using insertMany() skip some of the records and insert the same records multiple times?

I have 9577 unique records in a csv file.
This code inserts 9800 records, but not all of the unique records; it inserts duplicates of some of them. Any idea why it does not insert exactly the 9577 unique records, and instead duplicates some of them? Below I also include the remaining part of the code so you get the whole picture.
function bulkImportToMongo(arrayToImport, mongooseModel) {
const Model = require(`../../../models/${mongooseModel}`);
let batchCount = Math.ceil(arrayToImport.length / 100);
console.log(arrayToImport.length);
let ops = [];
for (let i = 0; i < batchCount; i++) {
// console.log(i);
let batch = arrayToImport.slice(i, i + 100);
console.log(batch.length);
ops.push(Model.insertMany(batch));
}
return ops;
return Promise.all(ops).then(results => {
// results is an array of results for each batch
console.log("results: ", results);
});
}
This is how I parse the CSV file:
const Promise = require("bluebird");
const csv = require("fast-csv");
const path = require("path");
const fs = Promise.promisifyAll(require("fs"));
const promiseCSV = Promise.method((filePath, options) => {
return new Promise((resolve, reject) => {
var records = [];
csv
.fromPath(filePath, options)
.on("data", record => {
records.push(record);
})
.on("end", () => {
// console.log(records);
resolve(records);
});
});
});
And here is the script that connects it all together:
const path = require("path");
const promiseCSV = require("./helpers/ImportCSVFiles");
const {
connectToMongo,
bulkImportToMongo
} = require("./helpers/mongoOperations");
const filePath = path.join(__dirname, "../../data/parts.csv");
const options = {
delimiter: ";",
noheader: true,
headers: [
"facility",
"partNumber",
"partName",
"partDescription",
"netWeight",
"customsTariff"
]
};
connectToMongo("autoMDM");
promiseCSV(filePath, options).then(records => {
bulkImportToMongo(records, "parts.js");
});
It looks like your issue is simply i++. Perhaps you meant i += 100?
for (let i = 0; i < batchCount; i += 100 /* NOT i++ */) {
  //...
}
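For clarity, a complete slice-based batching loop could look like the sketch below. Note that when the index advances by the batch size, the loop bound should be the array length rather than batchCount; this is an illustration of the idea, not the asker's final code:
const batchSize = 100;
const ops = [];

// Step through the array one non-overlapping batch at a time.
for (let i = 0; i < arrayToImport.length; i += batchSize) {
  const batch = arrayToImport.slice(i, i + batchSize);
  ops.push(Model.insertMany(batch));
}

return Promise.all(ops);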
I solved it.
I hope this helps others... :-)
I had two errors: one in the function promiseCSV (renamed to parseCSV), and second, bad logic in bulkImportToMongo.
Complete solution:
I parsed and imported 602,198 objects, and here is how long it took using node --max_old_space_size=8000 on a MacBook Pro with 8 GB of RAM.
console
➜ database git:(master) ✗ node --max_old_space_size=8000 partImport.js
Connected to db!
Time to parse file: : 5209.325ms
Disconnected from db!
Time to import parsed objects to db: : 153606.545ms
➜ database git:(master) ✗
parseCSV.js
const csv = require("fast-csv");
function promiseCSV(filePath, options) {
return new Promise((resolve, reject) => {
console.time("Time to parse file");
var records = [];
csv
.fromPath(filePath, options)
.on("data", record => {
records.push(record);
})
.on("end", () => {
console.timeEnd("Time to parse file");
resolve(records);
});
});
}
module.exports = promiseCSV;
mongodb.js
const mongoose = require("mongoose");
mongoose.Promise = global.Promise;
function connectToMongo(databaseName) {
mongoose.connect(`mongodb://localhost:27017/${databaseName}`, {
keepAlive: true,
reconnectTries: Number.MAX_VALUE,
useMongoClient: true
});
console.log("Connected to db!");
}
function disconnectFromMongo() {
mongoose.disconnect();
console.log("Disconnected from db!");
}
function bulkImportToMongo(arrayToImport, mongooseModel) {
const Model = require(`../../../models/${mongooseModel}`);
const batchSize = 100;
let batchCount = Math.ceil(arrayToImport.length / batchSize);
let recordsLeft = arrayToImport.length;
let ops = [];
let counter = 0;
for (let i = 0; i < batchCount; i++) {
let batch = arrayToImport.slice(counter, counter + batchSize);
counter += batchSize;
ops.push(Model.insertMany(batch));
}
return Promise.all(ops);
}
module.exports.bulkImportToMongo = bulkImportToMongo;
module.exports.connectToMongo = connectToMongo;
module.exports.disconnectFromMongo = disconnectFromMongo;
partImport.js
const path = require("path");
const parseCSV = require("./helpers/parseCSV");
const {
connectToMongo,
disconnectFromMongo,
bulkImportToMongo
} = require("./helpers/mongodb");
const filePath = path.join(__dirname, "../../data/parts.csv");
const options = {
delimiter: ";",
noheader: true,
headers: [
"facility",
"partNumber",
"partName",
"partDescription",
"netWeight",
"customsTariff"
]
};
connectToMongo("autoMDM");
parseCSV(filePath, options)
.then(records => {
console.time("Time to import parsed objects to db");
return bulkImportToMongo(records, "parts.js");
})
/* .then(result =>
console.log("Total batches inserted: ", result, result.length)
) */
.then(() => {
disconnectFromMongo();
console.timeEnd("Time to import parsed objects to db");
})
.catch(error => console.log(error));
