Firefox UI becomes unresponsive while downloading many files with Addon SDK API - javascript

I have a problem that is rather hard to debug: I need to download a lot (~400) of rather small (~3-4 MB) files in the background using the Firefox Add-on SDK API.
I tried using the old API (nsIWebBrowserPersist) as well as the new API (Downloads.jsm) (shortened code):
Task.spawn(function () {
  for (var i = 0; i < documents.length; i++) {
    var url = ...;
    var file = ...;
    let download = yield Downloads.createDownload({
      source: url,
      target: file,
    });
    yield download.start();
    yield download.finalize();
  }
});
But the UI gets extremely unresponsive after some time. I tried reusing and overwriting the same file, because my first guess was Windows file handles accumulating over time, but it didn't help. It does not seem to be related to system performance either: sometimes it works, and on the same machine five minutes later it fails.
Is there a known issue with downloading a lot of files using the Firefox SDK API, or am I doing something wrong?

I found that by using an alternative API the downloads became faster and the UI more responsive:
function downloadFromUrl(url, file, callback) {
  var channel = chrome.Cc["@mozilla.org/network/io-service;1"]
    .getService(chrome.Ci.nsIIOService)
    .newChannel(url, 0, null);
  var bstream = chrome.Cc["@mozilla.org/binaryinputstream;1"]
    .createInstance(chrome.Ci.nsIBinaryInputStream);
  bstream.setInputStream(channel.open());
  var fos = chrome.Cc["@mozilla.org/network/safe-file-output-stream;1"]
    .createInstance(chrome.Ci.nsIFileOutputStream);
  try {
    fos.init(file, 0x04 | 0x08 | 0x10 | 0x20 | 0x40, 0600, 0); // see: https://developer.mozilla.org/en-US/docs/PR_Open#Parameters
    var length = 0;
    var size = 0;
    while ((size = bstream.available())) {
      fos.write(bstream.readBytes(size), size);
      length += size;
      callback(length);
    }
  } finally {
    if (fos instanceof chrome.Ci.nsISafeOutputStream) {
      fos.finish();
    } else {
      fos.close();
    }
  }
}
I know that this is a rather primitive API, but it works much better than the alternatives.
Edit:
I improved the above function; it may be a bit bloated, but here it is anyway:
/**
 * Downloads from a given url to a local file.
 * @param url url to download
 * @param file local file
 * @param callback called during the download, signature: callback(currentBytes)
 * @returns downloadResult {contentType, error: false | ExceptionObject}
 */
function downloadFromUrl(url, file, callback) {
  let result = {
    contentType: null,
    error: false
  };
  try {
    let channel = chrome.Cc["@mozilla.org/network/io-service;1"]
      .getService(chrome.Ci.nsIIOService)
      .newChannel(url, 0, null);
    let bstream = chrome.Cc["@mozilla.org/binaryinputstream;1"]
      .createInstance(chrome.Ci.nsIBinaryInputStream);
    bstream.setInputStream(channel.open());
    let fos = chrome.Cc["@mozilla.org/network/safe-file-output-stream;1"]
      .createInstance(chrome.Ci.nsIFileOutputStream);
    try {
      // const values from https://developer.mozilla.org/en-US/docs/PR_Open#Parameters
      const PR_RDWR = 0x04;        // Open for reading and writing.
      const PR_CREATE_FILE = 0x08; // If the file does not exist, the file is created. If the file exists, this flag has no effect.
      const PR_APPEND = 0x10;      // The file pointer is set to the end of the file prior to each write.
      const PR_TRUNCATE = 0x20;    // If the file exists, its length is truncated to 0.
      const PR_SYNC = 0x40;        // If set, each write will wait for both the file data and file status to be physically updated.
      fos.init(file, PR_RDWR | PR_CREATE_FILE | PR_APPEND | PR_TRUNCATE | PR_SYNC, 0600, 0);
      let length = 0;
      let size = bstream.available();
      while (size) {
        fos.write(bstream.readBytes(size), size);
        length += size;
        callback(length);
        size = bstream.available();
      }
      fos.flush();
      result.contentType = channel.contentType;
    } finally {
      if (fos instanceof chrome.Ci.nsISafeOutputStream) {
        fos.finish();
      } else {
        fos.close();
      }
    }
  } catch (e) {
    result.error = e;
  }
  return result;
}
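For completeness, a minimal usage sketch of the function above. It assumes the surrounding SDK module already has chrome privileges and that file is an nsIFile pointing at a writable target; the URL is a placeholder:
// Sketch only: "file" is assumed to be an nsIFile for the target location,
// obtained elsewhere; the URL below is a placeholder.
var result = downloadFromUrl("http://example.com/some.pdf", file, function (currentBytes) {
  console.log("downloaded " + currentBytes + " bytes so far");
});
if (result.error) {
  console.error("download failed:", result.error);
} else {
  console.log("done, content type:", result.contentType);
}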

Related

How can I optimize my JavaScript code to handle large log files (over 1 GB)? [duplicate]

I need to do some parsing of large (5-10 GB) log files in JavaScript/Node.js (I'm using Cube).
The logline looks something like:
10:00:43.343423 I'm a friendly log message. There are 5 cats, and 7 dogs. We are in state "SUCCESS".
We need to read each line, do some parsing (e.g. strip out 5, 7 and SUCCESS), then pump this data into Cube (https://github.com/square/cube) using their JS client.
Firstly, what is the canonical way in Node to read in a file, line by line?
It seems to be fairly common question online:
http://www.quora.com/What-is-the-best-way-to-read-a-file-line-by-line-in-node-js
Read a file one line at a time in node.js?
A lot of the answers seem to point to a bunch of third-party modules:
https://github.com/nickewing/line-reader
https://github.com/jahewson/node-byline
https://github.com/pkrumins/node-lazy
https://github.com/Gagle/Node-BufferedReader
However, this seems like a fairly basic task - surely, there's a simple way within the stdlib to read in a textfile, line-by-line?
Secondly, I then need to process each line (e.g. convert the timestamp into a Date object, and extract useful fields).
What's the best way to do this, maximising throughput? Is there some way that won't block on either reading in each line, or on sending it to Cube?
Thirdly - I'm guessing using string splits and the JS equivalent of contains (indexOf != -1?) will be a lot faster than regexes? Has anybody had much experience parsing massive amounts of text data in Node.js?
I searched for a solution to parse very large files (GBs) line by line using a stream. None of the third-party libraries and examples suited my needs, since they either did not process the files line by line or read the entire file into memory.
The following solution can parse very large files, line by line, using a stream and pipe. For testing I used a 2.1 GB file with 17,000,000 records; RAM usage did not exceed 60 MB.
First, install the event-stream package:
npm install event-stream
Then:
var fs = require('fs')
  , es = require('event-stream');

var lineNr = 0;

var s = fs.createReadStream('very-large-file.csv')
  .pipe(es.split())
  .pipe(es.mapSync(function(line) {
      // pause the readstream
      s.pause();

      lineNr += 1;

      // process line here and call s.resume() when ready
      // function below was for logging memory usage
      logMemoryUsage(lineNr);

      // resume the readstream, possibly from a callback
      s.resume();
    })
    .on('error', function(err) {
      console.log('Error while reading file.', err);
    })
    .on('end', function() {
      console.log('Read entire file.');
    })
  );
Please let me know how it goes!
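If the per-line work is asynchronous (for example pushing the parsed event into Cube over the network), the same pause/resume pair can bracket that call. A minimal sketch, assuming the same event-stream setup; sendToCube is a hypothetical stand-in for your own async call:
var fs = require('fs')
  , es = require('event-stream');

// Placeholder for whatever asynchronous work you do per line.
function sendToCube(line, cb) {
  setImmediate(cb); // simulate async work
}

var s = fs.createReadStream('very-large-file.csv')
  .pipe(es.split())
  .pipe(es.mapSync(function (line) {
    s.pause();                      // stop reading while the async work runs
    sendToCube(line, function () {
      s.resume();                   // continue only after this line was handled
    });
  }));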
You can use the built-in readline module, see the docs. I use stream to create a new output stream.
var fs = require('fs'),
    readline = require('readline'),
    stream = require('stream');

var instream = fs.createReadStream('/path/to/file');
var outstream = new stream;
outstream.readable = true;
outstream.writable = true;

var rl = readline.createInterface({
  input: instream,
  output: outstream,
  terminal: false
});

rl.on('line', function(line) {
  console.log(line);
  // Do your stuff ...
  // Then write to the output stream
  rl.write(line);
});
Large files will take some time to process. Do tell if it works.
I really liked @gerard's answer, which actually deserves to be the accepted answer here. I made some improvements:
Code is in a class (modular)
Parsing is included
The ability to resume is exposed to the caller, in case an asynchronous job is chained to reading the CSV, like inserting into a DB or making an HTTP request
Reading happens in chunk/batch sizes that the user can declare. I took care of encoding in the stream too, in case you have files in a different encoding.
Here's the code:
'use strict'
const fs = require('fs'),
    util = require('util'),
    stream = require('stream'),
    es = require('event-stream'),
    parse = require("csv-parse"),
    iconv = require('iconv-lite');

class CSVReader {
  constructor(filename, batchSize, columns) {
    this.reader = fs.createReadStream(filename).pipe(iconv.decodeStream('utf8'))
    this.batchSize = batchSize || 1000
    this.lineNumber = 0
    this.data = []
    this.parseOptions = {delimiter: '\t', columns: true, escape: '/', relax: true}
  }

  read(callback) {
    this.reader
      .pipe(es.split())
      .pipe(es.mapSync(line => {
          ++this.lineNumber
          parse(line, this.parseOptions, (err, d) => {
            this.data.push(d[0])
          })
          if (this.lineNumber % this.batchSize === 0) {
            callback(this.data)
          }
        })
        .on('error', function() {
          console.log('Error while reading file.')
        })
        .on('end', function() {
          console.log('Read entire file.')
        }))
  }

  continue () {
    this.data = []
    this.reader.resume()
  }
}

module.exports = CSVReader
So basically, here is how you would use it:
let reader = new CSVReader('path_to_file.csv')
reader.read(() => reader.continue())
I tested this with a 35 GB CSV file and it worked for me, which is why I chose to build on @gerard's answer. Feedback is welcome.
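To make the chaining idea concrete: if each batch feeds an asynchronous job, the callback can call continue() only once that job finishes. A sketch based on the usage above, where insertBatch is a hypothetical stand-in for your own DB insert or HTTP call and the require path is an assumption:
const CSVReader = require('./csv-reader'); // path to the class above is assumed

// Placeholder for your own asynchronous work per batch.
function insertBatch(rows, done) {
  setImmediate(done);
}

const reader = new CSVReader('path_to_file.csv');
reader.read(batch => {
  insertBatch(batch, () => reader.continue()); // resume reading once the batch is stored
});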
I used https://www.npmjs.com/package/line-by-line to read more than 1,000,000 lines from a text file. In this case, RAM usage was about 50-60 MB.
const LineByLineReader = require('line-by-line'),
    lr = new LineByLineReader('big_file.txt');

lr.on('error', function (err) {
  // 'err' contains error object
});

lr.on('line', function (line) {
  // pause emitting of lines...
  lr.pause();

  // ...do your asynchronous line processing..
  setTimeout(function () {
    // ...and continue emitting lines.
    lr.resume();
  }, 100);
});

lr.on('end', function () {
  // All lines are read, file is closed now.
});
The Node.js Documentation offers a very elegant example using the Readline module.
Example: Read File Stream Line-by-Line
const { once } = require('node:events');
const fs = require('fs');
const readline = require('readline');

const rl = readline.createInterface({
  input: fs.createReadStream('sample.txt'),
  crlfDelay: Infinity
});

rl.on('line', (line) => {
  console.log(`Line from file: ${line}`);
});

await once(rl, 'close');
Note: we use the crlfDelay option to recognize all instances of CR LF ('\r\n') as a single line break.
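The Node docs also show an async-iterator form of the same thing, which reads a bit more naturally if you are already inside an async function; a minimal sketch of that variant:
const fs = require('fs');
const readline = require('readline');

async function processLineByLine() {
  const rl = readline.createInterface({
    input: fs.createReadStream('sample.txt'),
    crlfDelay: Infinity
  });

  // rl is async-iterable; each iteration yields one line.
  for await (const line of rl) {
    console.log(`Line from file: ${line}`);
  }
}

processLineByLine();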
Apart from reading the big file line by line, you can also read it chunk by chunk. For more, refer to this article:
var fs = require('fs');

var offset = 0;
var chunkSize = 2048;
var chunkBuffer = new Buffer(chunkSize);
var fp = fs.openSync('filepath', 'r');
var bytesRead = 0;
var lines = [];

while ((bytesRead = fs.readSync(fp, chunkBuffer, 0, chunkSize, offset))) {
  offset += bytesRead;
  var str = chunkBuffer.slice(0, bytesRead).toString();
  var arr = str.split('\n');

  if (bytesRead === chunkSize) {
    // the last item of the arr may not be a full line, leave it to the next chunk
    offset -= arr.pop().length;
  }
  lines.push(arr);
}
console.log(lines);
I had the same problem. After comparing several modules that seemed to have this feature, I decided to do it myself; it's simpler than I thought.
gist: https://gist.github.com/deemstone/8279565
var fetchBlock = lineByline(filepath, onEnd);
fetchBlock(function(lines, start){ ... }); //lines{array} start{int} lines[0] No.
It keeps the opened file in a closure; the returned fetchBlock() fetches a block from the file and splits it into an array of lines (it also handles the leftover segment from the last fetch).
I've set the block size to 1024 for each read operation. This may have bugs, but the code logic is obvious; try it yourself.
Reading/writing files using streams with the native Node.js modules (fs, readline):
const fs = require('fs');
const readline = require('readline');

const rl = readline.createInterface({
  input: fs.createReadStream('input.json'),
  output: fs.createWriteStream('output.json')
});

rl.on('line', function(line) {
  console.log(line);
  // Do any 'line' processing if you want and then write to the output file
  this.output.write(`${line}\n`);
});

rl.on('close', function() {
  console.log(`Created "${this.output.path}"`);
});
Based on this question's answers I implemented a class you can use to read a file synchronously line by line with fs.readSync(). You can make it "pause" and "resume" by using a Q promise (jQuery seems to require a DOM, so it can't be used with Node.js):
var fs = require('fs');
var Q = require('q');

var lr = new LineReader(filenameToLoad);
lr.open();

var promise;
workOnLine = function () {
  var line = lr.readNextLine();
  promise = complexLineTransformation(line).then(
    function() { console.log('ok'); workOnLine(); },
    function() { console.log('error'); }
  );
}
workOnLine();

complexLineTransformation = function (line) {
  var deferred = Q.defer();
  // ... async call goes here, in callback: deferred.resolve('done ok'); or deferred.reject(new Error(error));
  return deferred.promise;
}

function LineReader (filename) {
  this.moreLinesAvailable = true;
  this.fd = undefined;
  this.bufferSize = 1024 * 1024;
  this.buffer = new Buffer(this.bufferSize);
  this.leftOver = '';
  this.read = undefined;
  this.idxStart = undefined;
  this.idx = undefined;
  this.lineNumber = 0;
  this._bundleOfLines = [];

  this.open = function() {
    this.fd = fs.openSync(filename, 'r');
  };

  this.readNextLine = function () {
    if (this._bundleOfLines.length === 0) {
      this._readNextBundleOfLines();
    }
    this.lineNumber++;
    var lineToReturn = this._bundleOfLines[0];
    this._bundleOfLines.splice(0, 1); // remove first element (pos, howmany)
    return lineToReturn;
  };

  this.getLineNumber = function() {
    return this.lineNumber;
  };

  this._readNextBundleOfLines = function() {
    var line = "";
    while ((this.read = fs.readSync(this.fd, this.buffer, 0, this.bufferSize, null)) !== 0) { // read next bytes until end of file
      this.leftOver += this.buffer.toString('utf8', 0, this.read); // append to leftOver
      this.idxStart = 0;
      while ((this.idx = this.leftOver.indexOf("\n", this.idxStart)) !== -1) { // as long as there is a newline-char in leftOver
        line = this.leftOver.substring(this.idxStart, this.idx);
        this._bundleOfLines.push(line);
        this.idxStart = this.idx + 1;
      }
      this.leftOver = this.leftOver.substring(this.idxStart);
      if (line !== "") {
        break;
      }
    }
  };
}
node-byline uses streams, so I would prefer that one for your huge files.
For your date conversions I would use moment.js.
For maximising your throughput you could think about using a software cluster. There are some nice modules which wrap the node-native cluster module quite well. I like cluster-master from isaacs: e.g. you could create a cluster of x workers which all process parts of a file.
For benchmarking splits vs regexes use benchmark.js; I haven't tested it until now. benchmark.js is available as a node module.
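As a rough illustration of that last point, a minimal benchmark.js sketch comparing a split/indexOf approach against a regex on the sample log line from the question; the extraction logic here is a placeholder, not a tuned parser:
const Benchmark = require('benchmark');

const line = '10:00:43.343423 I\'m a friendly log message. There are 5 cats, and 7 dogs. We are in state "SUCCESS".';

new Benchmark.Suite()
  .add('split/indexOf', function () {
    // crude extraction with plain string operations
    const hasSuccess = line.indexOf('"SUCCESS"') !== -1;
    const timestamp = line.split(' ')[0];
    return [timestamp, hasSuccess];
  })
  .add('regex', function () {
    // crude extraction with a regular expression
    const m = /^(\S+).*state "(\w+)"/.exec(line);
    return m && [m[1], m[2]];
  })
  .on('cycle', function (event) {
    console.log(String(event.target));
  })
  .on('complete', function () {
    console.log('Fastest is ' + this.filter('fastest').map('name'));
  })
  .run();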
import * as csv from 'fast-csv';
import * as fs from 'fs';

interface Row {
  [s: string]: string;
}

type RowCallBack = (data: Row, index: number) => object;

export class CSVReader {
  protected file: string;
  protected csvOptions = {
    delimiter: ',',
    headers: true,
    ignoreEmpty: true,
    trim: true
  };

  constructor(file: string, csvOptions = {}) {
    if (!fs.existsSync(file)) {
      throw new Error(`File ${file} not found.`);
    }
    this.file = file;
    this.csvOptions = Object.assign({}, this.csvOptions, csvOptions);
  }

  public read(callback: RowCallBack): Promise<Array<object>> {
    return new Promise<Array<object>>(resolve => {
      const readStream = fs.createReadStream(this.file);
      const results: Array<any> = [];
      let index = 0;
      const csvStream = csv.parse(this.csvOptions).on('data', async (data: Row) => {
        index++;
        results.push(await callback(data, index));
      }).on('error', (err: Error) => {
        console.error(err.message);
        throw err;
      }).on('end', () => {
        resolve(results);
      });
      readStream.pipe(csvStream);
    });
  }
}
import { CSVReader } from '../src/helpers/CSVReader';

(async () => {
  const reader = new CSVReader('./database/migrations/csv/users.csv');
  const users = await reader.read(async data => {
    return {
      username: data.username,
      name: data.name,
      email: data.email,
      cellPhone: data.cell_phone,
      homePhone: data.home_phone,
      roleId: data.role_id,
      description: data.description,
      state: data.state,
    };
  });
  console.log(users);
})();
I have made a node module to asynchronously read a large file as text or JSON.
Tested on large files.
var fs = require('fs')
  , util = require('util')
  , stream = require('stream')
  , es = require('event-stream');

module.exports = FileReader;

function FileReader() {
}

FileReader.prototype.read = function(pathToFile, callback) {
  var returnTxt = '';
  var s = fs.createReadStream(pathToFile)
    .pipe(es.split())
    .pipe(es.mapSync(function(line) {
        // pause the readstream
        s.pause();

        //console.log('reading line: '+line);
        returnTxt += line;

        // resume the readstream, possibly from a callback
        s.resume();
      })
      .on('error', function() {
        console.log('Error while reading file.');
      })
      .on('end', function() {
        console.log('Read entire file.');
        callback(returnTxt);
      })
    );
};

FileReader.prototype.readJSON = function(pathToFile, callback) {
  try {
    this.read(pathToFile, function(txt) { callback(JSON.parse(txt)); });
  }
  catch(err) {
    throw new Error('json file is not valid! ' + err.stack);
  }
};
Just save the file as file-reader.js, and use it like this:
var FileReader = require('./file-reader');
var fileReader = new FileReader();
fileReader.readJSON(__dirname + '/largeFile.json', function(jsonObj){/*callback logic here*/});

How can I read chunks from a stream in JavaScript?

I have a file open in the browser that I want to turn into a stream and read with JS. I want to read the file in chunks of 1 KB, but it always reads the whole file.
import { ReadableStream as PolyfillReadableStream } from 'web-streams-polyfill';
import { createReadableStreamWrapper } from '@mattiasbuelens/web-streams-adapter';

const toPolyfillReadable = createReadableStreamWrapper(PolyfillReadableStream);

const file = myFile;
const fStreamReader = toPolyfillReadable(file.stream(), new ByteLengthQueuingStrategy({
  highWaterMark: 1024,
}));

const stream = [];
for await (const value of fStreamReader) { // This works because I'm using web-streams-polyfill
  console.log(value); // This only runs once and prints the whole file.
}
The following is a simplified version of https://stackoverflow.com/a/28318964/16462950. When I used it to read a 28MB file, the allocation timeline always stayed at around 1KB:
async function read(file) {
  for (var offset = 0; offset < file.size; offset += 1024) {
    var oneKB = await file.slice(offset, offset + 1024).text();
    console.log(oneKB);
  }
}
<input name="file" type="file" onchange="read(this.files[0])" />
A file.stream() is faster, but consumes 64KB chunks and therefore allocates more memory.
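For comparison, a minimal sketch of consuming file.stream() directly with a reader; note the chunk sizes here are chosen by the browser (typically around 64 KB), not by the caller:
async function readViaStream(file) {
  const reader = file.stream().getReader();
  let total = 0;
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    total += value.byteLength; // value is a Uint8Array chunk
    console.log('chunk of', value.byteLength, 'bytes, total', total);
  }
}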

fs.createWriteStream doesn't use back-pressure when writing data to a file, causing high memory usage

Problem
I'm trying to scan a drive directory (recursively walk all the paths) and write all the paths to a file (as it's finding them) using fs.createWriteStream in order to keep the memory usage low, but it doesn't work; the memory usage reaches 2 GB during the scan.
Expected
I was expecting fs.createWriteStream to automatically handle memory/disk usage at all times, keeping memory usage at a minimum with back-pressure.
Code
const fs = require('fs')
const walkdir = require('walkdir')

let dir = 'C:/'
let options = {
  "max_depth": 0,
  "track_inodes": true,
  "return_object": false,
  "no_return": true,
}

const wstream = fs.createWriteStream("C:/Users/USERNAME/Desktop/paths.txt")
let walker = walkdir(dir, options)

walker.on('path', (path) => {
  wstream.write(path + '\n')
})

walker.on('end', (path) => {
  wstream.end()
})
Is it because I'm not using .pipe()? I tried creating a new Stream.Readable({ read() {} }) and then, inside the .on('path') handler, pushing paths into it with readable.push(path), but that didn't really work.
UPDATE:
Method 2:
I tried the drain method proposed in the answers, but it doesn't help much; it does reduce memory usage to 500 MB (which is still too much for a stream), but it slows down the code significantly (from seconds to minutes).
Method 3:
I also tried using readdirp; it uses even less memory (~400 MB) and is faster, but I don't know how to pause it and use the drain method there to reduce the memory usage further:
const readdirp = require('readdirp')

let dir = 'C:/'
const wstream = fs.createWriteStream("C:/Users/USERNAME/Desktop/paths.txt")

readdirp(dir, {alwaysStat: false, type: 'files_directories'})
  .on('data', (entry) => {
    wstream.write(`${entry.fullPath}\n`)
  })
Method 4:
I also tried doing this operation with a custom recursive walker. Even though it uses only 30 MB of memory, which is what I wanted, it is about 10 times slower than the readdirp method, and it is synchronous, which is undesirable:
const fs = require('fs')
const path = require('path')

let dir = 'C:/'

function customRecursiveWalker(dir) {
  fs.readdirSync(dir).forEach(file => {
    let fullPath = path.join(dir, file)
    // Folders
    if (fs.lstatSync(fullPath).isDirectory()) {
      fs.appendFileSync("C:/Users/USERNAME/Desktop/paths.txt", `${fullPath}\n`)
      customRecursiveWalker(fullPath)
    }
    // Files
    else {
      fs.appendFileSync("C:/Users/USERNAME/Desktop/paths.txt", `${fullPath}\n`)
    }
  })
}

customRecursiveWalker(dir)
Preliminary observation: you've attempted to get the results you want using multiple approaches. One complication when comparing them is that they do not all do the same work. If you run tests on a file tree that contains only regular files and no mount points, you can probably compare the approaches fairly, but once you add mount points, symbolic links, etc., you may get different memory and time statistics merely because one approach excludes files that another approach includes.
I initially attempted a solution using readdirp, but unfortunately that library appears buggy to me. Running it on my system here, I got inconsistent results: one run would output 10 MB of data, another run with the same input parameters would output 22 MB, then I'd get another number, and so on. I looked at the code and found that it does not respect the return value of push:
_push(entry) {
  if (this.readable) {
    this.push(entry);
  }
}
As per the documentation the push method may return a false value, in which case the Readable stream should stop producing data and wait until _read is called again. readdirp entirely ignores that part of the specification. It is crucial to pay attention to the return value of push to get proper handling of back-pressure. There are also other things that seemed questionable in that code.
So I abandoned that and worked on a proof of concept showing how it could be done. The crucial parts are:
When the push method returns false it is imperative to stop adding data to the stream. Instead, we record where we were, and stop.
We start again only when _read is called.
If you uncomment the console.log statements that print START and STOP, you'll see them printed out in succession on the console: we start, produce data until Node tells us to stop, then stop until Node tells us to start again, and so on.
const stream = require("stream");
const fs = require("fs");
const { readdir, lstat } = fs.promises;
const path = require("path");
class Walk extends stream.Readable {
constructor(root, maxDepth = Infinity) {
super();
this._maxDepth = maxDepth;
// These fields allow us to remember where we were when we have to pause our
// work.
// The path of the directory to process when we resume processing, and the
// depth of this directory.
this._curdir = [root, 1];
// The directories still to process.
this._dirs = [this._curdir];
// The list of files to process when we resume processing.
this._files = [];
// The location in `this._files` were to continue processing when we resume.
this._ix = 0;
// A flag recording whether or not the fetching of files is currently going
// on.
this._started = false;
}
async _fetch() {
// Recall where we were by loading the state in local variables.
let files = this._files;
let dirs = this._dirs;
let [dir, depth] = this._curdir;
let ix = this._ix;
while (true) {
// If we've gone past the end of the files we were processing, then
// just forget about them. This simplifies the code that follows a bit.
if (ix >= files.length) {
ix = 0;
files = [];
}
// Read directories until we have files to process.
while (!files.length) {
// We've read everything, end the stream.
if (dirs.length === 0) {
// This is how the stream API requires us to indicate the stream has
// ended.
this.push(null);
// We're no longer running.
this._started = false;
return;
}
// Here, we get the next directory to process and get the list of
// files in it.
[dir, depth] = dirs.pop();
try {
files = await readdir(dir, { withFileTypes: true });
}
catch (ex) {
// This is a proof-of-concept. In a real application, you should
// determine what exceptions you want to ignore (e.g. EPERM).
}
}
// Process each file.
for (; ix < files.length; ++ix) {
const dirent = files[ix];
// Don't include in the results those files that are not directories,
// files or symbolic links.
if (!(dirent.isFile() || dirent.isDirectory() || dirent.isSymbolicLink())) {
continue;
}
const fullPath = path.join(dir, dirent.name);
if (dirent.isDirectory() & depth < this._maxDepth) {
// Keep track that we need to walk this directory.
dirs.push([fullPath, depth + 1]);
}
// Finally, we can put the data into the stream!
if (!this.push(`${fullPath}\n`)) {
// If the push returned false, we have to stop pushing results to the
// stream until _read is called again, so we have to stop.
// Uncomment this if you want to see when the stream stops.
// console.log("STOP");
// Record where we were in our processing.
this._files = files;
// The element at ix *has* been processed, so ix + 1.
this._ix = ix + 1;
this._curdir = [dir, depth];
// We're stopping, so indicate that!
this._started = false;
return;
}
}
}
}
async _read() {
// Do not start the process that puts data on the stream over and over
// again.
if (this._started) {
return;
}
this._started = true; // Yep, we've started.
// Uncomment this if you want to see when the stream starts.
// console.log("START");
await this._fetch();
}
}
// Change the paths to something that makes sense for you.
stream.pipeline(new Walk("/home/", 5),
fs.createWriteStream("/tmp/paths3.txt"),
(err) => console.log("ended with", err));
When I run the first attempt you made with walkdir here, I get the following statistics:
Elapsed time (wall clock): 59 sec
Maximum resident set size: 2.90 GB
When I use the code I've shown above:
Elapsed time (wall clock): 35 sec
Maximum resident set size: 0.1 GB
The file tree I use for the tests produces a file listing of 792 MB
You could exploit the value returned from WritableStream.write(): it essentially states whether you should continue to read or not. A WritableStream has an internal property that stores the threshold after which the buffer should be processed by the OS. The drain event is emitted when the buffer has been flushed, i.e. you can then safely call WritableStream.write() again without risking excessively filling the buffer (which means the RAM). Luckily for you, walkdir lets you control the process: you can emit pause (pause the walk; no more events will be emitted until resume) and resume (resume the walk) events on the walkdir object, pausing and resuming the writing process on your stream accordingly. Try this:
let is_emitter_paused = false;

wstream.on('drain', (evt) => {
  if (is_emitter_paused) {
    walkdir.resume();
  }
});

walkdir.on('path', function(path, stat) {
  is_emitter_paused = !wstream.write(path + '\n');
  if (is_emitter_paused) {
    walkdir.pause();
  }
});
Here's an implementation inspired by @Louis's answer. I think it's a bit easier to follow, and in my minimal testing it performs about the same.
const fs = require('fs');
const path = require('path');
const stream = require('stream');

class Walker extends stream.Readable {
  constructor(root = process.cwd(), maxDepth = Infinity) {
    super();

    // Dirs to process
    this._dirs = [{ path: root, depth: 0 }];

    // Max traversal depth
    this._maxDepth = maxDepth;

    // Files to flush
    this._files = [];
  }

  _drain() {
    while (this._files.length > 0) {
      const file = this._files.pop();
      if (file.isFile() || file.isDirectory() || file.isSymbolicLink()) {
        const filePath = path.join(this._dir.path, file.name);
        if (file.isDirectory() && this._maxDepth > this._dir.depth) {
          // Add directory to be walked at a later time
          this._dirs.push({ path: filePath, depth: this._dir.depth + 1 });
        }
        if (!this.push(`${filePath}\n`)) {
          // Halt walking
          return false;
        }
      }
    }
    if (this._dirs.length === 0) {
      // Walking complete
      this.push(null);
      return false;
    }
    // Continue walking
    return true;
  }

  async _step() {
    try {
      this._dir = this._dirs.pop();
      this._files = await fs.promises.readdir(this._dir.path, { withFileTypes: true });
    } catch (e) {
      this.emit('error', e); // Uh oh...
    }
  }

  async _walk() {
    this.walking = true;
    while (this._drain()) {
      await this._step();
    }
    this.walking = false;
  }

  _read() {
    if (!this.walking) {
      this._walk();
    }
  }
}

stream.pipeline(new Walker('some/dir/path', 5),
                fs.createWriteStream('output.txt'),
                (err) => console.log('ended with', err));

How is Node so much faster in this benchmark?

In an attempt to learn some systems programming, I was going to write a tokeniser in Rust. Immediately, though, I found it to be extremely slow at iterating over a string's chars. I put together a simple benchmark to show what I mean.
src/bench.html is an HTML doc with approx. 3000 chars
node:
var input = require('fs').readFileSync('src/bench.html', 'utf8');
var len = input.length;

for (var i = 0; i < 100; i += 1) run();

function run() {
  var index = 0;
  while (index < len) {
    var c = input.charAt(index);
    // noop
    index++;
  }
}
rust:
use std::error::Error;
use std::fs::File;
use std::io::prelude::*;
use std::path::Path;

fn main() {
    // Create a path to the desired file
    let path = Path::new("src/bench.html");
    let display = path.display();

    // Open the path in read-only mode, returns `io::Result<File>`
    let mut file = match File::open(&path) {
        // The `description` method of `io::Error` returns a string that
        // describes the error
        Err(why) => panic!("couldn't open {}: {}", display,
                           Error::description(&why)),
        Ok(file) => file,
    };

    // Read the file contents into a string, returns `io::Result<usize>`
    let mut s = String::new();
    match file.read_to_string(&mut s) {
        Err(why) => panic!("couldn't read {}: {}", display,
                           Error::description(&why)),
        Ok(_) => {
            for x in 1..100 {
                for token in s.chars() {
                    match token {
                        _ => {
                            // noop
                        }
                    }
                }
            }
            println!("done!");
        }
    }
}
Can someone explain what I'm doing incorrectly in the Rust example to make it 10x slower than the same thing in Node?
All code can be found here https://github.com/shakyShane/rust-vs-node
Simple answer: when benchmarking, don't use target/debug/program; run cargo build --release first. This will give you target/release/program for your benchmarks :)

Converting Uint8Array crashing browser for large files

Have an app where there is an input of type "file". The following methods grab the file, then prep it to be sent to the server via AJAX.
private StartUpload = (files) => {
  if (files && files.length === 1) {
    this.GetFileProperties(files[0])
      .done((properties: IFileProperties) => {
        $('input[type=file]').val("");
        if (this._compatibleTypes.indexOf(properties.Extension) >= 0) {
          var base64 = this.ArrayBufferToBase64(properties.ArrayBuffer);
          this.DoFileUpload(base64, properties.Extension).always(() => {
            this.ShowDialogMessage('edit_document_upload_complete', 'edit_document_upload_complete');
          });
        } else {
          this.ShowDialogMessage('edit_document_upload_incompatible', 'edit_document_upload_compatible_types', this._compatibleTypes);
        }
      });
  } else {
    this.ShowDialogMessage('edit_document_upload_one_file', 'edit_document_upload_one_file_msg');
  }
};

private ArrayBufferToBase64(buffer): any {
  var binary = '';
  var bytes = new Uint8Array(buffer);
  for (var xx = 0, len = bytes.byteLength; xx < len; xx++) {
    binary += String.fromCharCode(bytes[xx]);
  }
  return window.btoa(binary);
}

private DoFileUpload = (base64, extension) => {
  this.IsLoading(true);
  var dfd = $.Deferred();
  var data = {
    data: base64
  };

  UpdateFormDigest((<any>window)._spPageContextInfo.webServerRelativeUrl, (<any>window)._spFormDigestRefreshInterval);

  var methodUrl = "_vti_bin/viewfile/FileInformation.asmx/AddScannedItemAlt";
  $.ajax({
    headers: {
      "X-RequestDigest": $("#__REQUESTDIGEST").val()
    },
    url: methodUrl,
    contentType: "application/json",
    data: JSON.stringify(data),
    dataType: 'json',
    type: "POST",
    success: (response) => {
      // do stuff
    },
    error: (e) => {
      // do stuff
    }
  });

  return dfd;
};
This works perfectly in the vast majority of cases. However, when the file size is large (say 200MB+) it kills the browser.
Chrome shows a blackish-grey page with the "aw snap" message and basically dies.
IE shows an "Out of Memory" console error but continues to work.
FF shows an "Unresponsive script" warning. Choosing "don't show me again" lets it run until an "out of memory" console error shows up.
This is where it dies:
for (var xx = 0, len = bytes.byteLength; xx < len; xx++) {
  binary += String.fromCharCode(bytes[xx]);
}
Wrapping a try/catch around this does nothing and no error is caught.
I can step into the loop without a crash, but stepping through every iteration is tough since len = 210164805. For this I tried to add console.log(xx) to the loop and let it fly - but the browser crashes before anything shows up in the log.
Is there some limit to the size a string can be that could be causing the browser to crash once exceeded?
Thanks
You need to do this in an asynchronous way by breaking up the code either in blocks or time segments.
This means your code will need to use a callback, but otherwise it's straightforward -
Example
var bytes = new Uint8Array(256*1024*1024); // 256 MB buffer

convert(bytes, function(str) { // invoke the process with a callback defined
  alert("Done!");
});

function convert(bytes, callback) {
  var binary = "", blockSize = 2*1024*1024, // 2 MB block
      block = blockSize,                    // block segment
      xx = 0, len = bytes.byteLength;

  (function _loop() {
    while (xx < len && --block > 0) { // copy until block segment = 0
      binary += String.fromCharCode(bytes[xx++]);
    }
    if (xx < len) {            // more data to copy?
      block = blockSize;       // reinit new block segment
      binary = "";             // for demo to avoid out-of-memory
      setTimeout(_loop, 10);   // KEY: async wait

      // update a progress bar so we can see something going on:
      document.querySelector("div").style.width = (xx / len) * 100 + "%";
    }
    else callback(binary);     // if done, invoke callback
  })();                        // self-invoke loop
}
html, body {width:100%;margin:0;overflow:hidden}
div {background:#4288F7;height:10px}
<div></div>
Converting large buffers to strings may well make the client run out of memory. A 200 MB buffer converted to a string adds another 2 x 200 MB, as strings are stored as UTF-16 (i.e. 2 bytes per char), so here we use 600 MB out of the box.
It depends on the browser and how it deals with memory allocations, as well as on the system, of course. The browser will try to protect the computer against malevolent scripts that attempt to fill up the memory, for example.
You should be able to stay in ArrayBuffer form and send that to the server.
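A rough sketch of that idea, posting the raw bytes instead of a base64-encoded string. The endpoint is the one from the question, but whether it accepts a binary body (rather than the JSON payload used above) is an assumption, so the server side would need adapting:
// Sketch only: send the raw bytes; skipping the base64 step avoids the giant
// intermediate string. Whether the endpoint accepts a binary body is an assumption.
function uploadArrayBuffer(buffer, onDone, onError) {
  var xhr = new XMLHttpRequest();
  xhr.open("POST", "_vti_bin/viewfile/FileInformation.asmx/AddScannedItemAlt");
  xhr.setRequestHeader("Content-Type", "application/octet-stream");
  xhr.onload = function () { onDone(xhr.response); };
  xhr.onerror = onError;
  xhr.send(new Blob([buffer])); // browsers stream the Blob without copying it into a string
}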
