Error when introducing huge data in Neo4j from Node.js - javascript

I am trying to introduce a huge amount of data in neo4j from a file. I am using node.js code, simple javascript with no much complexity.
The thing is that I have 386213 lines or 'nodes' to introduce, but when executed (and wait 3 hours) I only see the half moreless. I think some of the queries are lost in the way, but I do not know why...
I am using npm node-neo4j package for the connection and that.
Here my node.js code:
var neo4j = require('neo4j');
var readline = require("readline");
var fs = require("fs")
var db = new neo4j.GraphDatabase('http://neo4j:Gemitis26#localhost:7474');
var rl = readline.createInterface({
input: fs.createReadStream('C:/Users/RRamos/Documents/Projects/test-neo4j/Files/kaggle_songs.txt')
});
var i=1;
rl.on('line', function (line) {
var str = line.split(" ");
db.cypher({
query: "CREATE (:Song {id: '{line1}', num_id: {line2}})",
params: {
line1: str[0],
line2: str[1],
},
}, callback);
console.log(i + " " + "CREATE (:Song {id: '"+str[0]+"', num_id: "+str[1]+"})");
i = i+1;
});
function callback(err, results){
if(err) throw err;
}

Making 386213 separate Cypher REST queries (in separate transactions) is probably the slowest possible way to create such a large number of nodes.
There are at least 3 better ways (in order of increasing performance):
Create multiple nodes at a time by sending as a parameter an array containing the data for multiple nodes. For example, you can create 8 nodes by sending this array parameter: [['a', 1],['b', 2],['c', 3],['d', 4],['e', 5],['f', 6],['g', 7],['h', 8]], and using this query:
UNWIND {data} AS d
CREATE (:Song {id: d[0], num_id: d[0]})
You can use the LOAD CSV clause to create the nodes. Since your input file seems to use a space to separate node property values, this might work for you:
LOAD CSV FROM 'file:///C:/Users/RRamos/Documents/Projects/test-neo4j/Files/kaggle_songs.txt' AS line
FIELDTERMINATOR ' '
CREATE (:Song {id: line[0], num_id: line[1]})
For even better performance, you could use the Import tool, which is a command line tool for initializing a new DB.

Related

How can I update or add new objects in the middle of a stream process in Nodejs?

I'm just new in programming and now self-studying how to use createStream. I'm kind of lost how to use stream in nodejs using JS. Basically, what I wanted to do is to read a JSON file (more than 1GB) which have a massive array of object. Update the existing value of a certain object or add another set of object. I able to do it using the normal read, update or add function then write. Problem is I'm getting a large spike in RAM usage.
My code is like this:
const fs = require(`fs-extra`);
async function updateOrAdd () {
var datafile = await fs.readJson(`./bigJSONfile.json`);
var tofind = {user:alexa, age:21,country: japan, pending: 1, paid: 0};
foundData = datafile.filter(x => x.user === tofind.user && x.country === tofind.country);
if (foundData === null){
datafile = datafile.concat(tofind)
} else {
foundData.pending += 1
foundData.paid += 1
}
await fs.writeJson(`./bigJSONfile.json`, datafile)
}
I saw some codes for reference in createStream and they say pipe is the most efficient way for memory usage. Though, mostly what I saw is like making a copy only from the original one.
I really appreciate it if anyone can teach me how to do this using stream or if you can provide me the code for it :).

Is it possible to write text in the middle of a file with fs.createWriteStream ? (or in nodejs in general)

I'm trying to write in a text file, but not at the end like appendFile() do or by replacing the entiere content...
I saw it was possible to chose where you want to start with start parameter of fs.createwritestream() -> https://nodejs.org/api/fs.html#fs_fs_createwritestream_path_options
But there is no parameter to say where to stop writting, right ? So it remove all the end of my file after I wrote with this function.
const fs = require('fs');
var logger = fs.createWriteStream('result.csv', {
flags: 'r+',
start: 20 //start to write at the 20th caracter
})
logger.write('5258,525,98951,0,1\n') //example a new line to write
Is there a way to specify where to stop writting in the file to have something like:
....
data from begining
....
5258,525,98951,0,1
...
data till the end
...
I suspect you mean, "Is it possible to insert in the middle of the file." The answer to that is: No, it isn't.
Instead, to insert, you have to:
Determine how big what you're inserting is
Copy the data at your insertion point to that many bytes later in the file
Write your data
Obviously when doing #2 you need to be sure that you're not overwriting data you haven't copied yet (either by reading it all into memory first or by working in blocks, from the end of the file toward the insertion point).
(I've never looked for one, but there may be an npm module out there that does this for you...)
You could read/parse your file at first. Then apply the modifications and save the new file.
Something like:
const fs = require("fs");
const fileData = fs.readFileSync("result.csv", { encoding: "utf8" });
const fileDataArray = fileData.split("\n");
const newData = "5258,525,98951,0,1";
const index = 2; // after each row to insert your data
fileDataArray.splice(index, 0, newData); // insert data into the array
const newFileData = fileDataArray.join("\n"); // create the new file
fs.writeFileSync("result.csv", newFileData, { encoding: "utf8" }); // save it

Pass large array to node child process

I have complex CPU intensive work I want to do on a large array. Ideally, I'd like to pass this to the child process.
var spawn = require('child_process').spawn;
// dataAsNumbers is a large 2D array
var child = spawn(process.execPath, ['/child_process_scripts/getStatistics', dataAsNumbers]);
child.stdout.on('data', function(data){
console.log('from child: ', data.toString());
});
But when I do, node gives the error:
spawn E2BIG
I came across this article
So piping the data to the child process seems to be the way to go. My code is now:
var spawn = require('child_process').spawn;
console.log('creating child........................');
var options = { stdio: [null, null, null, 'pipe'] };
var args = [ '/getStatistics' ];
var child = spawn(process.execPath, args, options);
var pipe = child.stdio[3];
pipe.write(Buffer('awesome'));
child.stdout.on('data', function(data){
console.log('from child: ', data.toString());
});
And then in getStatistics.js:
console.log('im inside child');
process.stdin.on('data', function(data) {
console.log('data is ', data);
process.exit(0);
});
However the callback in process.stdin.on isn't reached. How can I receive a stream in my child script?
EDIT
I had to abandon the buffer approach. Now I'm sending the array as a message:
var cp = require('child_process');
var child = cp.fork('/getStatistics.js');
child.send({
dataAsNumbers: dataAsNumbers
});
But this only works when the length of dataAsNumbers is below about 20,000, otherwise it times out.
With such a massive amount of data, I would look into using shared memory rather than copying the data into the child process (which is what is happening when you use a pipe or pass messages). This will save memory, take less CPU time for the parent process, and be unlikely to bump into some limit.
shm-typed-array is a very simple module that seems suited to your application. Example:
parent.js
"use strict";
const shm = require('shm-typed-array');
const fork = require('child_process').fork;
// Create shared memory
const SIZE = 20000000;
const data = shm.create(SIZE, 'Float64Array');
// Fill with dummy data
Array.prototype.fill.call(data, 1);
// Spawn child, set up communication, and give shared memory
const child = fork("child.js");
child.on('message', sum => {
console.log(`Got answer: ${sum}`);
// Demo only; ideally you'd re-use the same child
child.kill();
});
child.send(data.key);
child.js
"use strict";
const shm = require('shm-typed-array');
process.on('message', key => {
// Get access to shared memory
const data = shm.get(key, 'Float64Array');
// Perform processing
const sum = Array.prototype.reduce.call(data, (a, b) => a + b, 0);
// Return processed data
process.send(sum);
});
Note that we are only sending a small "key" from the parent to the child process through IPC, not the whole data. Thus, we save a ton of memory and time.
Of course, you can change 'Float64Array' (e.g. a double) to whatever typed array your application requires. Note that this library in particular only handles single-dimensional typed arrays; but that should only be a minor obstacle.
I too was able to reproduce the delay your were experiencing, but maybe not as bad as you. I used the following
// main.js
const fork = require('child_process').fork
const child = fork('./getStats.js')
const dataAsNumbers = Array(100000).fill(0).map(() =>
Array(100).fill(0).map(() => Math.round(Math.random() * 100)))
child.send({
dataAsNumbers: dataAsNumbers,
})
And
// getStats.js
process.on('message', function (data) {
console.log('data is ', data)
process.exit(0)
})
node main.js 2.72s user 0.45s system 103% cpu 3.045 total
I'm generating 100k elements composed of 100 numbers to mock your data, make sure you are using the message event on process. But maybe your children are more complex and might be the reason of the failure, also depends on the timeout you set on your query.
If you want to get better results, what you could do is chunk your data into multiple pieces that will be sent to the child process and reconstructed to form the initial array.
Also one possibility would be to use a third-party library or protocol, even if it's a bit more work. You could have a look to messenger.js or even something like an AMQP queue that could allow you to communicate between the two process with a pool and a guaranty of the message been acknowledged by the sub process. There is a few node implementations of it, like amqp.node, but it would still require a bit of setup and configuration work.
Use an in memory cache like https://github.com/ptarjan/node-cache, and let the parent process store the array contents with some key, the child process would retreive the contents through that key.
You could consider using OS pipes you'll find a gist here as an input to your node child application.
I know this is not exactly what you're asking for, but you could use the cluster module (included in node). This way you can get as many instances as cores you machine has to speed up processing. Moreover consider using streams if you don't need to have all the data available before you start processing. If the data to be processed is too large i would store it in a file so you can reinilize if there is any error during the process.
Here is an example of clustering.
var cluster = require('cluster');
var numCPUs = 4;
if (cluster.isMaster) {
for (var i = 0; i < numCPUs; i++) {
var worker = cluster.fork();
console.log('id', worker.id)
}
} else {
doSomeWork()
}
function doSomeWork(){
for (var i=1; i<10; i++){
console.log(i)
}
}
More info sending messages across workers question 8534462.
Why do you want to make a subprocess? The sending of the data across subprocesses is likely to cost more in terms of cpu and realtime than you will save in making the processing happen within the same process.
Instead, I would suggest that for super efficient coding you consider to do your statistics calculations in a worker thread that runs within the same memory as the nodejs main process.
You can use the NAN to write C++ code that you can post to a worker thread, and then have that worker thread to post the result and an event back to your nodejs event loop when done.
The benefit of this is that you don't need extra time to send the data across to a different process, but the downside is that you will write a bit of C++ code for the threaded action, but the NAN extension should take care of most of the difficult task for you.
To address the performance issue while passing large data to the child process, save the data to the .json or .txt file and pass only the filename to the childprocess. I've achieved 70% performance improvement with this approach.
For long process tasks you could use something like gearman You could do the heavy work process on workers, in this way you can setup how many workers you need, for example I do some file processing in this way, if I need scale you create more worker instance, also I have different workers for different tasks, process zip files, generate thumbnails, etc, the good of this is the workers can be written on any language node.js, Java, python and can be integrated on your project with ease
// worker-unzip.js
const debug = require('debug')('worker:unzip');
const {series, apply} = require('async');
const gearman = require('gearmanode');
const {mkdirpSync} = require('fs-extra');
const extract = require('extract-zip');
module.exports.unzip = unzip;
module.exports.worker = worker;
function unzip(inputPath, outputDirPath, done) {
debug('unzipping', inputPath, 'to', outputDirPath);
mkdirpSync(outputDirPath);
extract(inputPath, {dir: outputDirPath}, done);
}
/**
*
* #param {Job} job
*/
function workerUnzip(job) {
const {inputPath, outputDirPath} = JSON.parse(job.payload);
series([
apply(unzip, inputPath, outputDirPath),
(done) => job.workComplete(outputDirPath)
], (err) => {
if (err) {
console.error(err);
job.reportError();
}
});
}
function worker(config) {
const worker = gearman.worker(config);
if (config.id) {
worker.setWorkerId(config.id);
}
worker.addFunction('unzip', workerUnzip, {timeout: 10, toStringEncoding: 'ascii'});
worker.on('error', (err) => console.error(err));
return worker;
}
a simple index.js
const unzip = require('./worker-unzip').worker;
unzip(config); // pass host and port of the Gearman server
I normally run workers with PM2
the integration with your code it's very easy. something like
//initialize
const gearman = require('gearmanode');
gearman.Client.logger.transports.console.level = 'error';
const client = gearman.client(configGearman); // same host and port
the just add work to the queue passing the name of the functions
const taskpayload = {inputPath: '/tmp/sample-file.zip', outputDirPath: '/tmp/unzip/sample-file/'}
const job client.submitJob('unzip', JSON.stringify(taskpayload));
job.on('complete', jobCompleteCallback);
job.on('error', jobErrorCallback);

Running out of memory writing to a file in NodeJS

I'm processing a very large amount of data that I'm manipulating and storing it in a file. I iterate over the dataset, then I want to store it all in a JSON file.
My initial method using fs, storing it all in an object then dumping it didn't work as I was running out of memory and it became extremely slow.
I'm now using fs.createWriteStream but as far as I can tell it's still storing it all in memory.
I want the data to be written object by object to the file, unless someone can recommend a better way of doing it.
Part of my code:
// Top of the file
var wstream = fs.createWriteStream('mydata.json');
...
// In a loop
let JSONtoWrite = {}
JSONtoWrite[entry.word] = wordData
wstream.write(JSON.stringify(JSONtoWrite))
...
// Outside my loop (when memory is probably maxed out)
wstream.end()
I think I'm using Streams wrong, can someone tell me how to write all this data to a file without running out of memory? Every example I find online relates to reading a stream in but because of the calculations I'm doing on the data, I can't use a readable stream. I need to add to this file sequentially.
The problem is that you're not waiting for the data to be flushed to the filesystem, but instead keep throwing new and new data to the stream synchronously in a tight loop.
Here's an piece of pseudocode that should work for you:
// Top of the file
const wstream = fs.createWriteStream('mydata.json');
// I'm no sure how're you getting the data, let's say you have it all in an object
const entry = {};
const words = Object.keys(entry);
function writeCB(index) {
if (index >= words.length) {
wstream.end()
return;
}
const JSONtoWrite = {};
JSONtoWrite[words[index]] = entry[words[index]];
wstream.write(JSON.stringify(JSONtoWrite), writeCB.bind(index + 1));
}
wstream.write(JSON.stringify(JSONtoWrite), writeCB.bind(0));
You should wrap your data source in a readable stream too. I don't know what is your source, but you have to make sure, it does not load all your data in memory.
For example, assuming your data set come from another file where JSON objects are splitted with end of line character, you could create a Read stream as follow:
const Readable = require('stream').Readable;
class JSONReader extends Readable {
constructor(options={}){
super(options);
this._source=options.source: // the source stream
this._buffer='';
source.on('readable', function() {
this.read();
}.bind(this));//read whenever the source is ready
}
_read(size){
var chunk;
var line;
var lineIndex;
var result;
if (this._buffer.length === 0) {
chunk = this._source.read(); // read more from source when buffer is empty
this._buffer += chunk;
}
lineIndex = this._buffer.indexOf('\n'); // find end of line
if (lineIndex !== -1) { //we have a end of line and therefore a new object
line = this._buffer.slice(0, lineIndex); // get the character related to the object
if (line) {
result = JSON.parse(line);
this._buffer = this._buffer.slice(lineIndex + 1);
this.push(JSON.stringify(line) // push to the internal read queue
} else {
this._buffer.slice(1)
}
}
}}
now you can use
const source = fs.createReadStream('mySourceFile');
const reader = new JSONReader({source});
const target = fs.createWriteStream('myTargetFile');
reader.pipe(target);
then you'll have a better memory flow:
Please note that the picture and the above example are taken from the excellent nodejs in practice book

Running multiple threads in parallel using Webworker-Threads for Node.js

I am VERY new to using Node.js and WebWorker-Threads for Node.js (https://github.com/audreyt/node-webworker-threads). The WebWorker-Threads module is based on Thread-A-Gogo (https://github.com/xk/node-threads-a-gogo).
Here is a summary of my problem:
I have a file called "readFile.js", which contains code to read a csv
file that is passed in, and converts the csv data into a 2D array.
I want the function ""exports.parseCSV" within "readFile.js" to be
loaded and executed in multiple worker threads, with each thread
running in parallel.
I have a file, "test1.js", which attempts to do point 2 by using a
thread pool:
var Threads = require('webworker-threads');
var parser = require('./readFile.js');
var numThreads = 3;
var threadPool = Threads.createPool(numThreads);
threadPool.all.eval(parser.parseCSV);
for (var i = 1; i <=3; i++) {
(function(i) {
threadPool.any.eval(parser.parseCSV("csvFile.csv"), function (err, val) {
if (i == 3) {
console.log("bye!");
threadPool.destroy();
}
});
})(i);
}
I have another file, "test2.js", which attempts to do point 2 by
manually creating threads:
function cb (err, result) {
if (err) {
console.log("\n" + " ERROR! ERROR! ERROR!");
throw err;
}
console.log(" NO ERROR!");
thread.destroy();
}
var Threads = require('webworker-threads');
var parser = require('./readFile.js');
var thread = Threads.create();
thread.eval(parser.parseCSV);
thread.eval(parser.parseCSV("csvFile.csv"), cb);
thread.eval(parser.parseCSV);
thread.eval(parser.parseCSV("csvFile.csv"), cb);
thread.eval(parser.parseCSV);
thread.eval(parser.parseCSV("csvFile.csv"), cb);
When I run a "test[x].js" file using the command node test[x].js in the command prompt, the threads seem to be working but it appears as if they are being executed sequentially, and not in parallel. I say this based on the outputs of the time each thread starts, which I print out to command prompt window.
How do I execute multiple Webworker-Threads which run in parallel in the background? I want the threads to start at the same time and end at the same (if that is possible). I have looked at the API for the WebWorker-Threads but I haven't been able to solve this problem. Once this works, I would like to use the same methodology to not only read the csv files, but also store their contents into a database.
Any help would be greatly appreciated! Thank you very much :)

Categories

Resources