Node Streams - Pushing to Read unintentionally splits into 3 Write streams - javascript

Goal: Objects will be pushed to a readable stream and then saved to a separate .csv depending on which channel (Email, Push, In-App) they come from.
Problem: I am unable to separate the streams into different .pipe() "lines" so that each .csv log receives only its channel-specific event objects. In the current iteration, all of the .csv files created by the write streams are receiving the event objects from every channel.
Questions:
Can I create the multiple channel "pipe() lines" programmatically in the setup() function, or is my current approach correct?
Is the manual creation of the "pipe() lines" the reason all of the .csv files are being populated with every event? Can this be solved with one "pipe() line" and dynamic routing?
A brief explanation of the code below:
setup() calls makeStreams(), which creates, per channel, an object holding a Readable and a Writable (a rotating-file-system writable stream). (setup() is an unnecessary function right now but will hold more setup tasks later.)
pushStream() is called when an inbound event occurs and pushes an object like: {Email: {queryParam:1, queryParam:2, etc.}}. The event is sorted by its top-level key (in this case "Email") and pushed to the matching readable stream, which in theory should pipe it to the matching writable stream.
Unfortunately this isn't the case: the event object is being sent to all of the writable streams. How can I send it only to the correct stream?
CODE:
const Readable = require('stream').Readable
const Json2csvTransform = require('json2csv').Transform;
var rfs = require("rotating-file-stream");
const channelTypes = ['Push Notification', 'Email', 'In-app Message']
var streamArr = setup(channelTypes);
const opts = {};
const transformOpts = {
objectMode: true
};
const json2csv = new Json2csvTransform(opts, transformOpts);
function setup(list) {
console.log("Setting up streams...")
streamArr = makeStreams(list) //makes streams out of each endpoint
return streamArr
}
//Stream Builder for Logging Based Upon Channel Name
function makeStreams(listArray) {
listArray = ['Push Notification', 'Email', 'In-app Message']
var length = listArray.length
var streamObjs = {}
for (var name = 0; name < length; name++) {
var fileName = listArray[name] + '.csv'
const readStream = new Readable({
objectMode: true,
read() {}
})
const writeStream = rfs(fileName, {
size: "50M", // rotate every 50 MegaBytes written
interval: "1d" // rotate daily
//compress: "gzip" // compress rotated files
});
var objName = listArray[name]
var obj = {
instream: readStream,
outstream: writeStream
}
streamObjs[objName] = obj
}
return streamObjs
}
function pushStream(obj) {
var keys = Object.keys(obj)
if (streamArr[keys]) {
streamArr[keys].instream.push(obj[keys])
} else {
console.log("event without a matching channel error")
}
}
//Had to make each pipe line here manually. Can this be improved? Is it the reason all of the files are receiving all events?
streamArr['Email'].instream.pipe(json2csv).pipe(streamArr['Email'].outstream)
streamArr['In-app Message'].instream.pipe(json2csv).pipe(streamArr['In-app Message'].outstream)
streamArr['Push Notification'].instream.pipe(json2csv).pipe(streamArr['Push Notification'].outstream)
module.exports = {
makeStreams,
pushStream,
setup
}
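A likely explanation for the behaviour described above: all three readables are piped into the same json2csv Transform instance, and that single transform is then piped to all three writables, so every event that reaches it is copied to every file. A minimal sketch (not the original code) that builds the pipe lines dynamically inside makeStreams and gives each channel its own transform:
// Sketch: one Json2csvTransform per channel, wired up inside makeStreams
function makeStreams(listArray) {
  var streamObjs = {}
  listArray.forEach(function (channelName) {
    const readStream = new Readable({ objectMode: true, read() {} })
    const writeStream = rfs(channelName + '.csv', {
      size: "50M",  // rotate every 50 MegaBytes written
      interval: "1d" // rotate daily
    })
    // one transform per channel: sharing a single json2csv instance across
    // several pipes is what copies every event into every file
    const toCsv = new Json2csvTransform(opts, { objectMode: true })
    readStream.pipe(toCsv).pipe(writeStream)
    streamObjs[channelName] = { instream: readStream, outstream: writeStream }
  })
  return streamObjs
}
With the pipes wired per channel inside the loop, the three manual .pipe() lines above become unnecessary.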

Related

How to access input/output of worklet node from main file?

I have created a real-time voice chat application for a game I am making. I got it to work completely fine using the audioContext.createScriptProcessor() method.
Here's the code; I left out parts that weren't relevant:
//establish websocket connection
const audioData = []
//websocket connection.onMessage (data) =>
audioData.push(decodeBase64(data)) //push audio data coming from another player into array
//on get user media (stream) =>
const audioCtx = new AudioContext({latencyHint: "interactive", sampleRate: 22050,})
const inputNode = audioCtx.createMediaStreamSource(stream)
var processor = audioCtx.createScriptProcessor(2048, 1, 1);
var outputNode = audioCtx.destination
inputNode.connect(tunerNode)
processor.connect(outputNode)
processor.onaudioprocess = function (e) {
var input = e.inputBuffer.getChannelData(0);
webSocketSend(input) //send microphone input to other sockets via a function set up in a different file, all it does is base 64 encode then send.
//if there is data from the server, play it, else, play nothing
var output
if(audioData.length > 0){
output = audioData[0]
audioData.splice(0,1)
}else output = new Array(2048).fill(0)
};
The only issue is that the createScriptProcessor() method is deprecated. As recommended, I attempted to do this using Audio Worklet Nodes. However, I quickly ran into a problem: I can't access the user's microphone input, or set the output, from the main file where the WebSocket connection is.
Here is my code for main.js:
document.getElementById('btn').onclick = () => {createVoiceChatSession()}
//establish websocket connection
const audioData = []
//webSocket connection.onMessage (data) =>
audioData.push(data) //how do I get this data to the worklet Node???
var voiceChatContext
function createVoiceChatSession(){
voiceChatContext = new AudioContext()
navigator.mediaDevices.getUserMedia({audio: true}).then( async stream => {
await voiceChatContext.audioWorklet.addModule('module.js')
const microphone = voiceChatContext.createMediaStreamSource(stream)
const processor = new AudioWorkletNode(voiceChatContext, 'processor')
microphone.connect(processor).connect(voiceChatContext.destination)
}).catch(err => console.log(err))
}
Here is my code for module.js:
class processor extends AudioWorkletProcessor {
constructor() {
super()
}
//copies the input to the output
process(inputList, outputList) { // how do I get the input list data (the data from my microphone) to the main file so I can send it via websocket ???
for(var i = 0; i < inputList[0][0].length; i++){
outputList[0][0][i] = inputList[0][0][i]
outputList[0][1][i] = inputList[0][1][i]
}
return true;
}
}
registerProcessor("processor", processor);
So I can record and process the input, but I can't send input via WebSocket or pass in data that is coming from the server to the worklet node because I can't access the input list or output list from the main file where the WebSocket connection is. Does anyone know a way to work around this? Or is there a better solution that doesn't use audio worklet nodes?
Thank you to all who can help!
I figured it out: all I needed to do was use the port.onmessage method to exchange data between the worklet and the main file.
processor.port.onmessage = (e) => {//do something with e.data}
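For completeness, a minimal sketch of that two-way port messaging, reusing the names from the snippets above (processor, audioData, webSocketSend); the buffering strategy is only illustrative:
// main.js - wire the worklet node's port to the WebSocket
processor.port.onmessage = (e) => webSocketSend(e.data); // mic samples from the worklet
// when audio arrives from another player: processor.port.postMessage(decodedSamples)

// module.js
class Processor extends AudioWorkletProcessor {
  constructor() {
    super();
    this.remote = []; // chunks received from main via the port
    this.port.onmessage = (e) => this.remote.push(e.data);
  }
  process(inputList, outputList) {
    const input = inputList[0][0];
    if (input) this.port.postMessage(input.slice(0)); // copy mic samples out to main
    const output = outputList[0][0];
    const chunk = this.remote.shift();
    for (let i = 0; i < output.length; i += 1) output[i] = chunk ? chunk[i] || 0 : 0;
    return true;
  }
}
registerProcessor('processor', Processor);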

NodeJS: Using Pipe To Write A File From A Readable Stream Gives Heap Memory Error

I am trying to create 150 million lines of data and write the data into a csv file so that I can insert the data into different databases with little modification.
I am using a few functions to generate seemingly random data and pushing the data into the writable stream.
The code that I have right now is unsuccessful at handling the memory issue.
After a few hours of research, I am starting to think that I should not be pushing each piece of data at the end of the for loop, because it seems that the pipe method simply cannot handle garbage collection this way.
Also, I found a few Stack Overflow answers and Node.js docs that recommend against using push at all.
However, I am very new to NodeJS and I feel like I am blocked and do not know how to proceed from here.
If someone can provide me any guidance on how to proceed and give me an example, I would really appreciate it.
Below is a part of my code to give you a better understanding of what I am trying to achieve.
P.S. -
I have found a way to successfully handle the memory issue without using the pipe method at all --I used the drain event-- but I had to start from scratch, and now I am curious to know if there is a simple way to handle it without completely changing this bit of code.
Also, I have been trying to avoid using any library because I feel like there should be a relatively easy tweak to make this work without one, but please tell me if I am wrong. Thank you in advance.
// This is my target number of data
const targetDataNum = 150000000;
// Create readable stream
const readableStream = new Stream.Readable({
read() {}
});
// Create writable stream
const writableStream = fs.createWriteStream('./database/RDBMS/test.csv');
// Write columns first
writableStream.write('id, body, date, dp\n', 'utf8');
// Then, push a number of data to the readable stream (150M in this case)
for (var i = 1; i <= targetDataNum; i += 1) {
const id = i;
const body = lorem.paragraph(1);
const date = randomDate(new Date(2014, 0, 1), new Date());
const dp = randomNumber(1, 1000);
const data = `${id},${body},${date},${dp}\n`;
readableStream.push(data, 'utf8');
};
// Pipe readable stream to writeable stream
readableStream.pipe(writableStream);
// End the stream
readableStream.push(null);
Since you're new to streams, maybe start with an easier abstraction: generators. Generators generate data only when it is consumed (just like Streams should), but they don't have buffering and complicated constructors and methods.
This is just your for loop, moved into a generator function:
function * generateData(targetDataNum) {
for (var i = 1; i <= targetDataNum; i += 1) {
const id = i;
const body = lorem.paragraph(1);
const date = randomDate(new Date(2014, 0, 1), new Date());
const dp = randomNumber(1, 1000);
yield `${id},${body},${date},${dp}\n`;
}
}
In Node 12, you can create a Readable stream directly from any iterable, including generators and async generators:
const { Readable } = require('stream');
const stream = Readable.from(generateData(targetDataNum), { encoding: 'utf8' });
stream.pipe(writableStream); // the writableStream created in the question
I suggest trying a solution like the following:
const { Readable } = require('readable-stream');
class CustomReadable extends Readable {
constructor(max, options = {}) {
super(options);
this.targetDataNum = max;
this.i = 1;
}
_read(size) {
if (this.i <= this.targetDataNum) {
// your code to build the csv content (and advance this.i)
this.push(data, 'utf8');
return;
}
this.push(null);
}
}
const rs = new CustomReadable(150000000);
rs.pipe(ws);
Just complete it with your portion of code to fill the csv and create the writable stream.
With this solution you leave the calls to push to the internal _read method, which Node keeps invoking until this.push(null) is called. Before, you were probably filling the internal stream buffer too fast by calling push manually in a loop, which produced the out-of-memory error.
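For reference, here is one way the _read body might be completed, reusing the lorem, randomDate and randomNumber helpers from the question (only a sketch; any row-building code works):
_read(size) {
  if (this.i <= this.targetDataNum) {
    const id = this.i;
    const body = lorem.paragraph(1);
    const date = randomDate(new Date(2014, 0, 1), new Date());
    const dp = randomNumber(1, 1000);
    this.i += 1;
    // push one row; Node calls _read again once the internal buffer has drained
    this.push(`${id},${body},${date},${dp}\n`, 'utf8');
    return;
  }
  this.push(null); // no more rows
}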
Try piping to the WritableStream before you start pumping data into the ReadableStream, and yield before you write the next chunk.
...
// Write columns first
writableStream.write('id, body, date, dp\n', 'utf8');
// Pipe readable stream to writeable stream
readableStream.pipe(writableStream);
// Then, push a number of data to the readable stream (150M in this case)
for (var i = 1; i <= targetDataNum; i += 1) {
const id = i;
const body = lorem.paragraph(1);
const date = randomDate(new Date(2014, 0, 1), new Date());
const dp = randomNumber(1, 1000);
const data = `${id},${body},${date},${dp}\n`;
readableStream.push(data, 'utf8');
// somehow YIELD for the STREAM to drain out.
};
...
The entire Stream implementation of Node.js relies on the fact that the wire is slow and that the CPU can actually have a downtime before the next chunk of data comes in from the stream source or till the next chunk of data has been written to the stream destination.
In the current implementation, since the for-loop has booked up the CPU, there is no downtime for the actual piping of the data to the write stream. You will be able to catch this if you watch cat test.csv, which will not change while the loop is running.
As (I am sure) you know, pipe helps in guaranteeing that the data you are working with is buffered in memory only in chunks and not as a whole. But that guarantee only holds true if the CPU gets enough downtime to actually drain the data.
Having said all that, I wrapped your entire code into an async IIFE and ran it with an await on a setImmediate, which ensures that I yield for the stream to drain the data.
let fs = require('fs');
let Stream = require('stream');
(async function () {
// This is my target number of data
const targetDataNum = 150000000;
// Create readable stream
const readableStream = new Stream.Readable({
read() { }
});
// Create writable stream
const writableStream = fs.createWriteStream('./test.csv');
// Write columns first
writableStream.write('id, body, date, dp\n', 'utf8');
// Pipe readable stream to writeable stream
readableStream.pipe(writableStream);
// Then, push a number of data to the readable stream (150M in this case)
for (var i = 1; i <= targetDataNum; i += 1) {
console.log(`Pushing ${i}`);
const id = i;
const body = `body${i}`;
const date = `date${i}`;
const dp = `dp${i}`;
const data = `${id},${body},${date},${dp}\n`;
readableStream.push(data, 'utf8');
await new Promise(resolve => setImmediate(resolve));
};
// End the stream
readableStream.push(null);
})();
This is what top looks like pretty much the whole time I am running this.
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
15213 binaek ** ** ****** ***** ***** * ***.* 0.5 *:**.** node
Notice the %MEM which stays more-or-less static.
You were running out of memory because you were pre-generating all the data in memory before you wrote any of it to disk. Instead, you need a strategy to write it as you generate, so you don't have to hold large amounts of data in memory.
It does not seem like you need .pipe() here because you control the generation of the data (it's not coming from some random readStream).
So, you can just generate the data and immediately write it and handle the drain event when needed. Here's a runnable example (this creates a very large file):
const {once} = require('events');
const fs = require('fs');
// This is my target number of data
const targetDataNum = 150000000;
async function run() {
// Create writable stream
const writableStream = fs.createWriteStream('./test.csv');
// Write columns first
writableStream.write('id, body, date, dp\n', 'utf8');
// Then, push a number of data to the readable stream (150M in this case)
for (let i = 1; i <= targetDataNum; i += 1) {
const id = i;
const body = lorem.paragraph(1);
const date = randomDate(new Date(2014, 0, 1), new Date());
const dp = randomNumber(1, 1000);
const data = `${id},${body},${date},${dp}\n`;
const canWriteMore = writableStream.write(data);
if (!canWriteMore) {
// wait for stream to be ready for more writing
await once(writableStream, "drain");
}
}
writableStream.end();
}
run().then(() => {
console.log("done");
}).catch(err => {
console.log("got rejection: ", err);
});
// placeholders for the functions that were being used
function randomDate(low, high) {
let rand = randomNumber(low.getTime(), high.getTime());
return new Date(rand);
}
function randomNumber(low, high) {
return Math.floor(Math.random() * (high - low)) + low;
}
const lorem = {
paragraph: function() {
return "random paragraph";
}
}

Javascript: Compare two selected files by file path

I'm looking for a way to check if two files/documents (PDF, JPG, PNG) are the same.
If a user selects one or more files, I convert the File object to a plain JavaScript object. I keep the size, type, and filename, and I create a blob so I can store the object in my Redux store.
When a user selects another file, I want to compare it with the files that have already been added (so I can set the same blobURL).
I can check whether two files have the same name, type and size, but there is a chance that all of these properties match and the files still aren't the same, so I would like to check the file path. Unfortunately, that property isn't provided in the File object. Is there a way to get it, or another solution to make sure both files are (not) the same?
No, there is no way to get the real path, but that doesn't matter.
All you have access to is a FakePath, in the form C:\fakepath\yourfilename.ext (from input.value), and sometimes a bit more if you gained access to a directory.
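To illustrate (a minimal example; the selector is only illustrative), reading a file input's value in any modern browser yields that sanitized placeholder and nothing more:
const input = document.querySelector('input[type="file"]');
input.addEventListener('change', () => {
  console.log(input.value);         // e.g. "C:\fakepath\photo.jpg"
  console.log(input.files[0].name); // "photo.jpg" - no directory information
});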
But anyway you probably don't want to check that two files came from the same place on the hard-disk, this has no importance whatsoever, since they could very well have been modified since first access.
What you can and probably want to do, however, is to check whether their contents are the same.
For this, you can compare their byte content:
inp1.onchange = inp2.onchange = e => {
const file1 = inp1.files[0];
const file2 = inp2.files[0];
if(!file1 || !file2) return;
compare(file1, file2)
.then(res => console.log('are same ? ', res));
};
function compare(file1, file2) {
// they don't have the same size, they are different
if(file1.size !== file2.size)
return Promise.resolve(false);
// load both as ArrayBuffers
return Promise.all([
readAsArrayBuffer(file1),
readAsArrayBuffer(file2)
]).then(([buf1, buf2]) => {
// create views over our ArrayBuffers
const arr1 = new Uint8Array(buf1);
const arr2 = new Uint8Array(buf2);
return !arr1.some((val, i) =>
arr2[i] !== val // search for diffs
);
});
}
function readAsArrayBuffer(file) {
// we could also have used a FileReader,
// but Response is conveniently already Promise based
return new Response(file).arrayBuffer();
}
<input type="file" id="inp1">
<input type="file" id="inp2">
Now, you say that you don't have access to the original Files anymore, and that you can only store serializable data. In this case, one less performant solution is to generate a hash from your Files.
This can be done on the front end thanks to the SubtleCrypto API,
but this operation is quite slow for big files, so you may want to do it systematically on the server instead, and only do it on the front end when the sizes are the same:
// a fake storage object like OP has
const store = [
{ /* an utf-8 text file whose content is `hello world`*/
name: "helloworld.txt",
size: 11,
hash: "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9" // generated from server
}
];
// the same file as the one we stored in the fake store above
const sameFile = new File(['hello world'], 'same-file.txt');
// a file the same size as the one we stored (needs deep check)
const sameSizeButDifferentContent = new File(['random text'], 'differentcontent.txt');
inp.onchange = e => tryToStore(inp.files[0]);
tryToStore(sameFile); // false
tryToStore(sameSizeButDifferentContent);
// hash: "a4e082f56a58e0855a6abbf2f4ebd08895ff85ea80e634e02b210def84b557dd"
function tryToStore(file) {
checkShouldStore(file)
.then(result => {
console.log('should store', file.name, result)
if(result) {
store.push(result);
// this is just for demo, in your case you would do it on the server
if(!result.hash)
generateHash(file).then(h => result.hash = h);
}
});
}
async function checkShouldStore(file) {
const {name, size} = file;
const toStore = {name, size, file}; // create a wrapper object
// first check against the sizes (fast checking)
const sameSizes = store.filter(obj => obj.size === file.size);
// only if some files have the same size
if(sameSizes.length) {
// then we generate a hash directly
const hash = await generateHash(file);
if(sameSizes.some(obj => obj.hash === hash)) {
return false; // is already in our store
}
toStore.hash = hash; // save the hash so we don't have to generate it on server
}
return toStore;
}
async function generateHash(file) {
// read as ArrayBuffer
const buf = await new Response(file).arrayBuffer();
// generate SHA-256 hash using crypto API
const hash_buf = await crypto.subtle.digest("SHA-256", buf);
// convert to Hex
const hash_arr = [...new Uint8Array(hash_buf)]
.map(v => v.toString(16).padStart(2, "0"));
return hash_arr.join('');
}
<input type="file" id="inp">

Process large array of objects in JSON file [duplicate]

I have a file which stores many JavaScript objects in JSON form and I need to read the file, create each of the objects, and do something with them (insert them into a db in my case). The JavaScript objects can be represented in a format:
Format A:
[{name: 'thing1'},
....
{name: 'thing999999999'}]
or Format B:
{name: 'thing1'} // <== My choice.
...
{name: 'thing999999999'}
Note that the ... indicates a lot of JSON objects. I am aware I could read the entire file into memory and then use JSON.parse() like this:
fs.readFile(filePath, 'utf-8', function (err, fileContents) {
if (err) throw err;
console.log(JSON.parse(fileContents));
});
However, the file could be really large, so I would prefer to use a stream to accomplish this. The problem I see with a stream is that the file contents could be broken into data chunks at any point, so how can I use JSON.parse() on such objects?
Ideally, each object would be read as a separate data chunk, but I am not sure how to do that.
var importStream = fs.createReadStream(filePath, {flags: 'r', encoding: 'utf-8'});
importStream.on('data', function(chunk) {
var pleaseBeAJSObject = JSON.parse(chunk);
// insert pleaseBeAJSObject in a database
});
importStream.on('end', function(item) {
console.log("Woot, imported objects into the database!");
});
Note, I wish to prevent reading the entire file into memory. Time efficiency does not matter to me. Yes, I could try to read a number of objects at once and insert them all at once, but that's a performance tweak - I need a way that is guaranteed not to cause a memory overload, no matter how many objects are contained in the file.
I can choose to use Format A or Format B or maybe something else, just please specify in your answer. Thanks!
To process a file line-by-line, you simply need to decouple the reading of the file and the code that acts upon that input. You can accomplish this by buffering your input until you hit a newline. Assuming we have one JSON object per line (basically, format B):
var stream = fs.createReadStream(filePath, {flags: 'r', encoding: 'utf-8'});
var buf = '';
stream.on('data', function(d) {
buf += d.toString(); // when data is read, stash it in a string buffer
pump(); // then process the buffer
});
function pump() {
var pos;
while ((pos = buf.indexOf('\n')) >= 0) { // keep going while there's a newline somewhere in the buffer
if (pos == 0) { // if there's more than one newline in a row, the buffer will now start with a newline
buf = buf.slice(1); // discard it
continue; // so that the next iteration will start with data
}
processLine(buf.slice(0,pos)); // hand off the line
buf = buf.slice(pos+1); // and slice the processed data off the buffer
}
}
function processLine(line) { // here's where we do something with a line
if (line[line.length-1] == '\r') line=line.substr(0,line.length-1); // discard CR (0x0D)
if (line.length > 0) { // ignore empty lines
var obj = JSON.parse(line); // parse the JSON
console.log(obj); // do something with the data here!
}
}
Each time the file stream receives data from the file system, it's stashed in a buffer, and then pump is called.
If there's no newline in the buffer, pump simply returns without doing anything. More data (and potentially a newline) will be added to the buffer the next time the stream gets data, and then we'll have a complete object.
If there is a newline, pump slices off the buffer from the beginning to the newline and hands it off to processLine. It then checks again whether there's another newline in the buffer (the while loop). In this way, we can process all of the lines that were read in the current chunk.
Finally, processLine is called once per input line. If present, it strips off the carriage return character (to avoid issues with line endings, LF vs CRLF), and then calls JSON.parse on the line. At this point, you can do whatever you need to with your object.
Note that JSON.parse is strict about what it accepts as input; you must quote your identifiers and string values with double quotes. In other words, {name:'thing1'} will throw an error; you must use {"name":"thing1"}.
Because no more than a chunk of data will ever be in memory at a time, this will be extremely memory efficient. It will also be extremely fast. A quick test showed I processed 10,000 rows in under 15ms.
Just as I was thinking that it would be fun to write a streaming JSON parser, I also thought that maybe I should do a quick search to see if there's one already available.
Turns out there is.
JSONStream "streaming JSON.parse and stringify"
Since I just found it, I've obviously not used it, so I can't comment on its quality, but I'll be interested to hear if it works.
It does work. Consider the following JavaScript, which uses _.isString:
stream.pipe(JSONStream.parse('*'))
.on('data', (d) => {
console.log(typeof d);
console.log("isString: " + _.isString(d))
});
This will log objects as they come in if the stream is an array of objects. Therefore the only thing being buffered is one object at a time.
As of October 2014, you can just do something like the following (using JSONStream) - https://www.npmjs.org/package/JSONStream
var fs = require('fs'),
JSONStream = require('JSONStream');
var getStream = function () {
var jsonData = 'myData.json',
stream = fs.createReadStream(jsonData, { encoding: 'utf8' }),
parser = JSONStream.parse('*');
return stream.pipe(parser);
}
getStream().pipe(MyTransformToDoWhateverProcessingAsNeeded).on('error', function (err) {
// handle any errors
});
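MyTransformToDoWhateverProcessingAsNeeded above is a placeholder; a minimal object-mode Transform that could stand in for it (defined before it is used, and with the per-object processing entirely up to you) might look like:
var { Transform } = require('stream');

var MyTransformToDoWhateverProcessingAsNeeded = new Transform({
  objectMode: true, // JSONStream.parse('*') emits parsed objects, not buffers
  transform(obj, encoding, callback) {
    // ...do whatever processing is needed with each parsed object...
    callback(null, JSON.stringify(obj) + '\n'); // pass something downstream
  }
});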
To demonstrate with a working example:
npm install JSONStream event-stream
data.json:
{
"greeting": "hello world"
}
hello.js:
var fs = require('fs'),
JSONStream = require('JSONStream'),
es = require('event-stream');
var getStream = function () {
var jsonData = 'data.json',
stream = fs.createReadStream(jsonData, { encoding: 'utf8' }),
parser = JSONStream.parse('*');
return stream.pipe(parser);
};
getStream()
.pipe(es.mapSync(function (data) {
console.log(data);
}));
$ node hello.js
// hello world
I had a similar requirement: I needed to read a large JSON file in Node.js, process the data in chunks, call an API, and save the results in MongoDB.
inputFile.json is like:
{
"customers":[
{ /*customer data*/},
{ /*customer data*/},
{ /*customer data*/}....
]
}
Now I used JSONStream and event-stream to achieve this synchronously.
var JSONStream = require("JSONStream");
var es = require("event-stream");
var fileStream = fs.createReadStream(filePath, { encoding: "utf8" });
fileStream.pipe(JSONStream.parse("customers.*")).pipe(
es.through(function(data) {
console.log("printing one customer object read from file ::");
console.log(data);
this.pause();
processOneCustomer(data, this);
return data;
},
function end() {
console.log("stream reading ended");
this.emit("end");
})
);
function processOneCustomer(data, es) {
DataModel.save(function(err, dataModel) {
es.resume();
});
}
I realize that you want to avoid reading the whole JSON file into memory if possible, however if you have the memory available it may not be a bad idea performance-wise. Using node.js's require() on a json file loads the data into memory really fast.
I ran two tests to see what the performance looked like when printing out an attribute from each feature of an 81MB geojson file.
In the 1st test, I read the entire geojson file into memory using var data = require('./geo.json'). That took 3330 milliseconds, and then printing out an attribute from each feature took 804 milliseconds, for a grand total of 4134 milliseconds. However, it appeared that node.js was using 411MB of memory.
In the second test, I used @arcseldon's answer with JSONStream + event-stream. I modified the JSONPath query to select only what I needed. This time the memory never went higher than 82MB; however, the whole thing now took 70 seconds to complete!
I wrote a module that can do this, called BFJ. Specifically, the method bfj.match can be used to break up a large stream into discrete chunks of JSON:
const bfj = require('bfj');
const fs = require('fs');
const stream = fs.createReadStream(filePath);
bfj.match(stream, (key, value, depth) => depth === 0, { ndjson: true })
.on('data', object => {
// do whatever you need to do with object
})
.on('dataError', error => {
// a syntax error was found in the JSON
})
.on('error', error => {
// some kind of operational error occurred
})
.on('end', error => {
// finished processing the stream
});
Here, bfj.match returns a readable, object-mode stream that will receive the parsed data items. bfj.match itself is passed 3 arguments:
A readable stream containing the input JSON.
A predicate that indicates which items from the parsed JSON will be pushed to the result stream.
An options object indicating that the input is newline-delimited JSON (this is to process format B from the question, it's not required for format A).
Upon being called, bfj.match will parse JSON from the input stream depth-first, calling the predicate with each value to determine whether or not to push that item to the result stream. The predicate is passed three arguments:
The property key or array index (this will be undefined for top-level items).
The value itself.
The depth of the item in the JSON structure (zero for top-level items).
Of course a more complex predicate can also be used as necessary according to requirements. You can also pass a string or a regular expression instead of a predicate function, if you want to perform simple matches against property keys.
If you have control over the input file, and it's an array of objects, you can solve this more easily. Arrange to output the file with each record on one line, like this:
[
{"key": value},
{"key": value},
...
This is still valid JSON.
Then, use the node.js readline module to process them one line at a time.
var fs = require("fs");
var lineReader = require('readline').createInterface({
input: fs.createReadStream("input.txt")
});
lineReader.on('line', function (line) {
line = line.trim();
if (line.charAt(line.length-1) === ',') {
line = line.substr(0, line.length-1);
}
if (line.charAt(0) === '{') {
processRecord(JSON.parse(line));
}
});
function processRecord(record) {
// Process the records one at a time here!
}
I solved this problem using the split npm module. Pipe your stream into split, and it will "Break up a stream and reassemble it so that each line is a chunk".
Sample code:
var fs = require('fs')
, split = require('split')
;
var stream = fs.createReadStream(filePath, {flags: 'r', encoding: 'utf-8'});
var lineStream = stream.pipe(split());
lineStream.on('data', function(chunk) {
var json = JSON.parse(chunk);
// ...
});
Using the @josh3736 answer, but for ES2021 and Node.js 16+ with async/await + Airbnb rules:
import fs from 'node:fs';
const file = 'file.json';
/**
* @callback itemProcessorCb
* @param {object} item The current item
*/
/**
* Process each data chunk in a stream.
*
* @param {import('fs').ReadStream} readable The readable stream
* @param {itemProcessorCb} itemProcessor A function to process each item
*/
async function processChunk(readable, itemProcessor) {
let data = '';
let total = 0;
// eslint-disable-next-line no-restricted-syntax
for await (const chunk of readable) {
// join with last result, remove CR and get lines
const lines = (data + chunk).replace('\r', '').split('\n');
// clear last result
data = '';
// process lines
let line = lines.shift();
const items = [];
while (line) {
// check that it isn't an empty line or an array definition
if (line !== '' && !/[\[\]]+/.test(line)) {
try {
// remove the last comma and parse json
const json = JSON.parse(line.replace(/\s?(,)+\s?$/, ''));
items.push(json);
} catch (error) {
// last line gets only a partial line from chunk
// so we add this to join at next loop
data += line;
}
}
// continue
line = lines.shift();
}
total += items.length;
// Process items in parallel
await Promise.all(items.map(itemProcessor));
}
console.log(`${total} items processed.`);
}
// Process each item
async function processItem(item) {
console.log(item);
}
// Init
try {
const readable = fs.createReadStream(file, {
flags: 'r',
encoding: 'utf-8',
});
processChunk(readable, processItem);
} catch (error) {
console.error(error.message);
}
For a JSON like:
[
{ "name": "A", "active": true },
{ "name": "B", "active": false },
...
]
https.get(url1 , function(response) {
var data = "";
response.on('data', function(chunk) {
data += chunk.toString();
})
.on('end', function() {
console.log(data)
});
});
I think you need to use a database. MongoDB is a good choice in this case because it is JSON compatible.
UPDATE:
You can use the mongoimport tool to import JSON data into MongoDB.
mongoimport --collection collection --file collection.json

Multiple pipes on writeStream open event gives undefined

I am downloading a file from a URL and then I want to write the metadata for that file to the destination stream, but only after I make sure that the destination file has been created (i.e. after fs.createWriteStream(path) is successful). So I have used the "open" event of the writable stream to proceed further. However, this code gives me the error exactly on the second pipe:
Cannot call method 'pipe' on undefined
There is more code beyond this which uses the hashes that are calculated here, but somehow I am stuck with this error at the moment. I have been struggling with this for quite a while. Any help/pointers are very much appreciated.
Also I tried to run the example
var fs = require('fs');
var digestStream = require('digest-stream')
callHandle();
function callHandle(){
var stream = request.get(url)
var result = {}
handle(reader, result)
}
function handle(reader,metadata){
const writer = fs.createWriteStream('pathToFile');
writer.on('open', function(){
reader.pipe(
digestStream('sha1', 'hex', function(digest, length) {
result.sha1 = digest;
result.size = length;
}))
.pipe(
digestStream('md5', 'hex', function(digest) {
result.md5 = digest;
})
).pipe(writer)
})
}
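A note on the snippet as posted: callHandle passes a reader variable that is never assigned (the request stream is stored in stream), and handle names its second parameter metadata while the callbacks write to result. A minimal sketch with the names lined up, reusing the digest-stream calls exactly as above (whether this alone clears the reported error depends on the code not shown):
function callHandle() {
  var reader = request.get(url) // give the source stream the name handle() expects
  var result = {}
  handle(reader, result)
}

function handle(reader, result) {
  const writer = fs.createWriteStream('pathToFile')
  writer.on('open', function () {
    reader
      .pipe(digestStream('sha1', 'hex', function (digest, length) {
        result.sha1 = digest
        result.size = length
      }))
      .pipe(digestStream('md5', 'hex', function (digest) {
        result.md5 = digest
      }))
      .pipe(writer)
  })
}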
