nodejs running out of memory processing csv files

I've read through a number of SO questions about nodejs running out of memory, but I haven't seen anything that sounds similar to my situation.
I'm trying to process about 20GB of data across 250 CSV files (so ~80MB per file). I launch the node script with --max-old-space-size=8192 on a server with 90GB of free memory, using node v5.9.1. After 9 minutes of processing, the script quits with an out-of-memory error.
I'm new to Node programming, but I thought I wrote the script to process data one line at a time and not to keep anything in memory. Yet it seems some object references are being held on to by something, so the script is leaking memory. Here's the full script:
var fs = require('fs');
var readline = require('readline');
var mongoose = require('mongoose');

mongoose.connect('mongodb://buzzard/xtra');
var db = mongoose.connection;
db.on('error', console.error.bind(console, 'connection error:'));

var DeviceSchema = mongoose.Schema({
    _id: String,
    serial: String
});
var Device = mongoose.model('Device', DeviceSchema, 'devices');

function processLine(line) {
    var serial = line.split(',')[8];
    Device({
        _id: serial,
        serial: serial
    }).save(function (err) {
        if (err) return console.error(err);
    });
}

function processFile(baseDir, fileName) {
    if (!fileName.startsWith('qcx3'))
        return;

    var fullPath = `${baseDir}/${fileName}`;
    var lineReader = readline.createInterface({
        input: fs.createReadStream(fullPath)
    });
    lineReader.on('line', processLine);
}

function findFiles(rootDir) {
    fs.readdir(rootDir, function (error, files) {
        if (error) {
            console.log(`Error: ${error}`);
            return;
        }
        files.forEach(function (file) {
            if (file.startsWith('.'))
                return;

            var fullPath = `${rootDir}/${file}`;
            fs.stat(fullPath, function (error, stat) {
                if (error) {
                    console.log(`Error: ${error}`);
                    return;
                }
                if (stat.isDirectory())
                    dir(fullPath);
                else
                    processFile(rootDir, file);
            });
        });
    });
}
findFiles('c://temp/logs/compress');
I also noticed that when I run the script on a much smaller test set that it can finish processing completely, the script doesn't exit at the end; it just keeps hanging there until I Ctrl+C it. Could this be somehow related?
What am I doing wrong?

The script is not exiting because you have an open connection to mongoose; after all the files have been processed you should close the connection and the script will finish.
You have the right idea of using streams, but I think you missed something along the way. I suggest the following article on the readline interface and its events: https://coderwall.com/p/ohjerg/read-large-text-files-in-nodejs
Another source of the problem could be mongodb: you issue a lot of inserts, and if they come in faster than mongodb's I/O can absorb them, the pending operations pile up and exhaust the memory.
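For example, here is a minimal, untested sketch of how processFile could apply backpressure, using the same Device model and readline setup as in the question: pause the line reader while each save is in flight and resume it once mongo has acknowledged the write, so lines cannot queue up in memory faster than they are inserted.
var lineReader = readline.createInterface({
    input: fs.createReadStream(fullPath)
});

lineReader.on('line', function (line) {
    var serial = line.split(',')[8];
    // stop reading further lines until this insert has been acknowledged
    lineReader.pause();
    Device({ _id: serial, serial: serial }).save(function (err) {
        if (err) console.error(err);
        lineReader.resume();
    });
});

// once every file has been processed, close the mongoose connection
// (e.g. mongoose.disconnect()) so the script can exit
This trades some throughput for bounded memory; batching lines and inserting each batch with a single call would be a further refinement.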

Related

How to fix the maxlistenersexceeded warning in nodejs

In my studies with NodeJS I've found an issue/warning that is getting me a bit worried, because I would like to avoid memory leaks, but since this language is a bit new to me, I can't find which part of my code is causing it.
const MongoClient = require('mongodb').MongoClient;
const assert = require('assert');
const fs = require("fs");
const conversor = require("./converterSiteEmObjeto");

// Connection URL
const url = 'mongodb://localhost:27017';

// Database config
const client = new MongoClient(url, { useNewUrlParser: true, useUnifiedTopology: true });

async function salvarBanco(listaLinks) {
    try {
        await client.connect();
        console.log("Connected successfully to server");
        const db = client.db('consultadados');
        // Iterates through the list of links
        for (link of listaLinks) {
            // scrape data from link and return a JS object
            let resultado = await conversor(link);
            // insert object into DB and asserts
            let r = await db.collection('cnpjs').insertOne(resultado);
            assert.equal(1, r.insertedCount);
        }
    } catch (err) {
        console.log(err.stack);
    }
    client.close();
    console.log("Done...");
};

function converterSitemapEmListaLinks() {
    // Reads file containing links
    fs.readFile('links/sitemap.txt', 'utf8', function (err, contents) {
        // creates an array of links (100+ links)
        let listaLinks = contents.split("\n");
        // saves to database
        salvarBanco(listaLinks);
    });
}

converterSitemapEmListaLinks();
It runs just fine, but after a few dozen iterations over listaLinks (the list of links) I get the following message:
" MaxListenersExceededWarning: Possible EventEmitter memory leak detected. 11 SIGHUP listeners added to [process]. Use emitter.setMaxListeners() to increase limit "
I know what it means; what I would like to know is how to fix it instead of simply setting the listener limit to a higher max value.

In Node, how to execute sql from global database connection

I am unable to execute SQL when using a global database connection in node.js.
I have followed the steps in the Azure documentation (https://learn.microsoft.com/en-us/azure/mysql/connect-nodejs) and I am able to display the output on the console. However, I want to put all of my Azure SQL database connection code in a separate file, and now the select query is not printing its output to the console.
DatabaseManager.js
var Connection = require('tedious').Connection;
var Request = require('tedious').Request;

var sqlConnection = function sqlConnection() {
    // Create connection to database
    var config = {
        userName: 'uname',
        password: 'password',
        server: 'dbserver.database.windows.net',
        options: {
            database: 'mydatabase',
            encrypt: true
        }
    }

    var connection = new Connection(config);

    // Attempt to connect and execute queries if connection goes through
    connection.on('connect', function (err) {
        if (err) {
            console.log(err)
        } else {
            console.log('CONNECTED TO DATABASE');
        }
    });
}

module.exports = sqlConnection;
app.js
var restify = require('restify');
var builder = require('botbuilder');
var botbuilder_azure = require("botbuilder-azure");
var azure = require('azure-storage');
var dbconnection = require('./DatabaseManager');

bot.dialog('profileDialog',
    (session) => {
        session.send('You reached the profile intent. You said \'%s\'.', session.message.text);
        console.log('Reading rows from the Table...');
        dbconnection("select FNAME from StudentProfile where ID=1"),
            function (err, result, fields) {
                if (err) throw err;
                console.log(result);
            }
        session.endDialog();
    }
Console Output:
Reading rows from the Table...
CONNECTED TO DATABASE
I was expecting the output of FNAME, but nothing is printing on the console. Is there anything I am missing?
Thank you.
There are a couple of problems here. First off, you should only ever import a module once per file. This is just a performance consideration and won't actually break your code.
Next, pay attention to what you're exporting from your DatabaseManager module. Right now, you're exporting a function that creates the connection and then doesn't do anything with it. We can fix this by using a pattern called a "callback" which lets us provide a function that will then be called with the connection as an argument.
I added a ton of comments to the code explaining things. This code won't run as-is - there are a couple of places where I show "do this or this". You'll have to choose one.
var Tedious = require('tedious'); // Only require a library once per file
var Connection = Tedious.Connection;
var Request = Tedious.Request;

// Or using object destructuring
var { Connection, Request } = require('tedious');

// You called this `sqlConnection`. I'm going to use a verb since it's a
// function and not a variable containing the connection. I'm also going
// to change the declaration syntax to be clearer.
function connect(cb) { // cb is short for callback. It should be a function.
    var config = {
        userName: 'uname',
        password: 'password',
        server: 'dbserver.database.windows.net',
        options: {
            database: 'mydatabase',
            encrypt: true
        }
    }; // Put a semi-colon on your variable assignments

    var connection = new Connection(config);

    // Attempt to connect and execute queries if connection goes through
    connection.on('connect', function(err) {
        if (err) {
            console.log(err);
            return; // Stop executing the function if it failed
        }
        // We don't need an "else" because of the return statement above
        console.log('CONNECTED TO DATABASE');
        // We have a connection, now let's do something with it. Call the
        // callback and pass it the connection.
        cb(connection);
    });
}

module.exports = connect; // This exports a function that creates the connection
Then back in your main file, you can use it like so.
var restify = require('restify');
var builder = require('botbuilder');
var botbuilder_azure = require('botbuilder-azure');
var azure = require('azure-storage');
var connect = require('./DatabaseManager'); // renamed to be a verb since it's a function.

bot.dialog('profileDialog', (session) => { // Hey, this is a callback too!
    session.send('You reached the profile intent. You said \'%s\'.', session.message.text);
    console.log('Creating a connection');
    connect((connection) => {
    // or with the traditional function notation
    connect(function(connection) {
        console.log('Reading rows from the Table...');
        // Execute your queries here using your connection. This code is
        // taken from
        // https://github.com/tediousjs/tedious/blob/master/examples/minimal.js
        var request = new Request("select FNAME from StudentProfile where ID=1", function(err, rowCount) { // Look another callback!
            if (err) {
                console.log(err);
            } else {
                console.log(rowCount + ' rows');
            }
            connection.close();
        });

        request.on('row', function(columns) { // Iterate through the rows using a callback
            columns.forEach(function(column) {
                if (column.value === null) {
                    console.log('NULL');
                } else {
                    console.log(column.value);
                }
            });
        });

        connection.execSql(request);
    });
}); // closes bot.dialog

readFile not running inside while loop

I am trying to send the contents of a text file through a socket connection every time the text file updates using Express:
console.log('Server running!');
var express = require('express');
var app = express();
var server = app.listen(3000);
var fs = require("fs");
var x = 0;

app.use(express.static('public'));

var socket = require('socket.io');
var io = socket(server);

io.sockets.on('connection', newConnection);

function newConnection(socket) {
    console.log("New connection: " + socket.id);
    while (true) {
        fs.readFile('song.txt', function (err, data) {
            if (err) throw err;
            console.log(data);
            if (data != x) {
                var songdata = data;
                console.log(songdata);
                io.sockets.emit('update', songdata);
                x = data;
            } else {
                console.log("Song is not different:)");
            }
        })
    }
}
Without the while loop, everything works just fine and I receive the contents in the separate client. However, now nothing is happening, not even a console log of the data. This indicates the readFile is suddenly no longer running. Why?
Thanks:)
First off, some basics. node.js runs your Javascript as single threaded and thus this is a single threaded server. It can only do one thing with your Javascript at a time. But, if you program it carefully, it can scale really well and do lots of things.
Second off, you pretty much never want to do while (true) in server-side Javascript. That's just going to run forever and never let anything else run on your server. Nothing else.
Third, you are attempting to create a new version of that infinite loop every time a new client connects. That's not a correct design (even if there wasn't an infinite loop). You only need one instance of code checking the file, not N.
Now, if what you're really trying to do is to "poll" for changes in song.txt and notify the client whenever it changes, you need to pick a reasonable time delta between checks on the file and use a timer. This will check that file every so often and let your server run normally all the rest of the time.
Here's a simple version that polls with setInterval():
console.log('Server code started!');
const express = require('express');
const app = express();
const server = app.listen(3000);
const fs = require("fs");

let lastSongData = 0;

app.use(express.static('public'));

const io = require('socket.io')(server);

// get initial songData for future use
// there will not be any connected clients yet so we don't need to broadcast it
try {
    lastSongData = fs.readFileSync('song.txt');
} catch(e) {
    console.log(e, "\nDidn't find song.txt on server initialization");
}

// here, we create a polling loop that notifies all connected clients
// any time the song has changed
const pollInterval = 60*1000; // 60 seconds, ideally it should be longer than this
const pollTimer = setInterval(() => {
    fs.readFile('song.txt', (err, songData) => {
        if (!err && songData !== lastSongData) {
            // notify all connect clients
            console.log("found changed songData");
            io.emit('update', songData);
            lastSongData = songData;
        }
    });
}, pollInterval);

io.sockets.on('connection', socket => {
    console.log("New connection: " + socket.id);
});
If your songData is binary, then you will have to change both how you send the data to the client and how you compare it to the previous data, so that you are sending and receiving binary data (not string data) and comparing buffers (not strings).
Here are some references on sending binary data with socket.io:
How to send binary data with socket.io?
How to send binary data from a Node.js socket.io server to a browser client?
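As a rough, hypothetical sketch of just the comparison side (the links above cover the sending side), the string comparison in the polling loop could be replaced with a buffer comparison using Buffer.equals:
fs.readFile('song.txt', (err, songData) => {
    // songData is a Buffer here because no encoding was passed to readFile
    if (!err && (!Buffer.isBuffer(lastSongData) || !songData.equals(lastSongData))) {
        console.log("found changed songData");
        io.emit('update', songData); // socket.io sends Buffers as binary frames
        lastSongData = songData;
    }
});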
A more efficient way to detect changes to the file is fs.watch(), which should notify you whenever the file changes. The feature has a number of platform caveats (it does not behave identically on all platforms), so you will have to test it thoroughly on your platform to make sure it works the way you want.
console.log('Server code started!');
const express = require('express');
const app = express();
const server = app.listen(3000);
const fs = require("fs");

let lastSongData = 0;

app.use(express.static('public'));

const io = require('socket.io')(server);

// get initial songData for future use
// there will not be any connected clients yet so we don't need to broadcast it
try {
    lastSongData = fs.readFileSync('song.txt');
} catch(e) {
    console.log(e, "\nDidn't find song.txt on server initialization");
}

// ask node.js to tell us when song.txt is modified
fs.watch('song.txt', (eventType, filename) => {
    // check the file for all eventTypes
    fs.readFile('song.txt', (err, songData) => {
        if (!err && songData !== lastSongData) {
            // notify all connect clients
            console.log("found changed songData");
            lastSongData = songData;
            io.emit('update', songData);
        }
    });
});

io.sockets.on('connection', socket => {
    console.log("New connection: " + socket.id);
});
It is unclear from your original code if you need to send the songData to each new connection (whether it has recently changed or not).
If so, you can just change your connection event handler to this:
io.sockets.on('connection', socket => {
    console.log("New connection: " + socket.id);
    // send most recent songData to each newly connected client
    if (lastSongData) {
        socket.emit('update', lastSongData);
    }
});
Continuously reading the file to detect changes is not a good idea. Instead, you should use fs.watch(filename[, options][, listener]) to be notified when the file has changed. When a new socket connects, only that socket should have the content sent to it; broadcasting to every client is redundant.
io.sockets.on('connection', newConnection);

var filename = 'song.txt';

function update(socket) {
    fs.readFile(filename, function (err, data) {
        if (err) throw err;
        socket.emit('update', data);
    });
}

function newConnection(socket) {
    console.log("New connection: " + socket.id);
    update(socket); // Read and send to just this socket
}

fs.watch(filename, function () {
    console.log("File changed");
    update(io.sockets); // Read and send to all sockets.
});

How to add many records to mongoDB from directory of JSON files?

I have about a million JSON files saved across many sub-directories of the directory "D:/njs/nodetest1/imports/source1/" and I want to import them into the collection "users" in my mongoDB database.
The following code correctly traverses the file system. As you can see, it reads each item in the directory, and if that item is a directory it reads each item inside it. For each item that is not a directory, it performs some operations on it before passing a variable holding an object to a function.
function traverseFS (path){
    var files = fs.readdirSync(path);
    for (var i in files){
        var currentFile = path + '/' + files[i];
        var stats = fs.statSync(currentFile);
        if (stats.isFile())
            runOnFile(currentFile);
        else
            traverseFS(currentFile);
    }
}
traverseFS("D:/njs/nodetest1/imports/source1/")
Next, I run a few operations on each file (see below). This reads the file, parses it into a JSON object, reads two attributes of that object into variables, creates an object in the variable "entry" and passes that variable to another function.
function runOnFile(currentFile){
    var fileText = fs.readFileSync(currentFile,'utf8');
    var generatedJSON = JSON.parse(fileText);
    var recordID = generatedJSON.recordID;
    var recordText = generatedJSON.recordTexts;
    var entry = {recordID:recordID, recordText:recordText};
    insertRecord(entry);
}
The final function should then be used to insert the data into mongoDB. I think this is where things go wrong.
function insertRecord(entry){
    var MongoClient = mongodb.MongoClient;
    var MongoURL = 'mongodb://localhost:27017/my_database_name';
    MongoClient.connect(MongoURL, function (err, db) {
        var collection = db.collection('users');
        collection.insert([entry], function (err, result) {
            db.close();
        });
    });
}
I expected this to run through the file structure, reading the JSON files into objects and then inserting those objects into my mongoDB. Instead it reads the first file into the database and then stops/hangs.
Notes:
I don't want to use mongoimport because I don't want to insert all the data from these files into my MongoDB database. I however am not tied to any aspect of this approach. If some other solution exists I am open to it.
This connects to the database just fine. For each item in the directory this successfully creates an "entry" object and passes it to the insertRecord function. In other words, the problem must be occurring in the insertRecord section, but it obviously could be caused by something earlier in the process.
If I add error handling, no errors are produced. I have left the error handling out of this post because it clutters the readability of the code snippets.
As per the mongodb 2.2 (current latest) driver documentation, insert is deprecated:
DEPRECATED
Use insertOne, insertMany or bulkWrite
So the short answer is probably to change collection.insert([entry], ...) to collection.insertOne(entry, ...) and you're done.
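Applied to the insertRecord function from the question, that change looks roughly like this:
function insertRecord(entry) {
    var MongoClient = mongodb.MongoClient;
    var MongoURL = 'mongodb://localhost:27017/my_database_name';
    MongoClient.connect(MongoURL, function (err, db) {
        var collection = db.collection('users');
        // insertOne takes the document itself, not an array of documents
        collection.insertOne(entry, function (err, result) {
            db.close();
        });
    });
}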
Then for the long answer: you say "about a million JSON files", which typically deserves a fully async approach with the least amount of overhead.
There are two (potential) bottlenecks in the sample code:
fs.readFileSync, this is a blocking operation
the connecting, inserting a record and closing the database connection
Both are executed "about a million times". Granted, an import is not usually done over and over again, and (hopefully) not on a machine which needs its performance for other important tasks. Still, the sample code can easily be made more robust.
Consider using the glob module to obtain the list of JSON files.
glob('imports/**/*.json', function(error, files) {...})
This provides you with the full list of files easily in an async fashion.
Then consider connecting to the database just once, insert everything and close once.
Maintaining more or less the same steps you have in the sample, I'd suggest something like:
var glob = require('glob'),
    mongodb = require('mongodb'),
    fs = require('fs'),
    MongoClient = mongodb.MongoClient,
    mongoDSN = 'mongodb://localhost:27017/my_database_name',
    collection; // moved this to the "global" scope so we can do it only once

function insertRecord(json, done) {
    var recordID = json.recordID || null,
        recordText = json.recordText || null;

    // the question implies some kind of validation/sanitation/preparation..
    if (recordID && recordText) {
        // NOTE: insert was changed to insertOne
        return collection.insertOne({recordID: recordID, recordText: recordText}, done);
    }

    done('No recordID and/or recordText');
}

function runOnFile(file, done) {
    // moved to be async
    fs.readFile(file, function(error, data) {
        if (error) {
            return done(error);
        }

        var json = JSON.parse(data);
        if (!json) {
            return done('Unable to parse JSON: ' + file);
        }

        insertRecord(json, done);
    });
}

function processFiles(files, done) {
    var next = files.length ? files.shift() : null;

    if (next) {
        return runOnFile(next, function(error) {
            if (error) {
                console.error(error);
                // you may or may not want to stop here by throwing an Error
            }

            processFiles(files, done);
        });
    }

    done();
}

MongoClient.connect(mongoDSN, function(error, db) {
    if (error) {
        throw new Error(error);
    }

    collection = db.collection('users');

    glob('imports/**/*.json', function(error, files) {
        if (error) {
            throw new Error(error);
        }

        processFiles(files, function() {
            console.log('all done');
            db.close();
        });
    });
});
NOTE: You can collect multiple "entry" records to leverage the performance gain of batched inserts using insertMany, though I have the feeling the inserted records are more complicated than described, and it might cause memory issues if not handled correctly.
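As a rough, untested sketch of that batching idea, reusing the collection variable and callback convention from the code above (BATCH_SIZE is a hypothetical tuning value), insertRecord could buffer documents and flush them with insertMany:
var batch = [];
var BATCH_SIZE = 1000; // hypothetical batch size; tune for your documents

function insertRecord(json, done) {
    var recordID = json.recordID || null,
        recordText = json.recordText || null;

    if (recordID && recordText) {
        batch.push({recordID: recordID, recordText: recordText});
        // only hit the database once per BATCH_SIZE records
        if (batch.length >= BATCH_SIZE) {
            var docs = batch;
            batch = [];
            return collection.insertMany(docs, done);
        }
        return done();
    }

    done('No recordID and/or recordText');
}

// after processFiles() finishes, flush whatever is left in the buffer
function flushBatch(done) {
    if (!batch.length) return done();
    collection.insertMany(batch, done);
}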
Just structure your data into one big array of objects, then run db.collection.insertMany.
I suggest doing this using Promises:
const Bluebird = require('bluebird');
const glob = Bluebird.promisify(require('glob'));
const mongodb = require('mongodb');
const fs = Bluebird.promisifyAll(require('fs'));
const Path = require('path');
const MongoClient = mongodb.MongoClient;

const insertMillionsFromPath = Bluebird.coroutine(function *(path, mongoConnString) {
    const db = yield MongoClient.connect(mongoConnString);
    try {
        const collection = db.collection('users');
        const files = yield glob(Path.join(path, "*.json"));
        yield Bluebird.map(
            files,
            Bluebird.coroutine(function *(filename) {
                console.log("reading", filename);
                const fileContent = yield fs.readFileAsync(filename);
                const obj = JSON.parse(fileContent);
                console.log("inserting", filename);
                yield collection.insertOne(obj);
            }),
            {concurrency: 10} // You can increase concurrency here
        );
    } finally {
        yield db.close();
    }
});

insertMillionsFromPath("./myFiles", "mongodb://localhost:27017/database")
    .then(() => console.log("OK"))
    .catch((err) => console.log("ERROR", err));
In order to work, you will need to install the following packages:
npm install --save mongodb bluebird glob
You will also need node.js version 6 or greater; otherwise you will need to transpile your JavaScript (due to the use of function *() generators).

Nodejs unable to kill external windows executable program executed by Node itself

I am hosting Node as a server locally to interact with hardware.
My web application then makes a request to Node to execute two executables; whichever executable returns data first has its result sent back as the response to my web application.
Because of this, the other executable keeps running in the background, waiting for a response from the hardware.
I am unable to kill off that process either; I have to either manually stop Node and run it again, or taskkill that executable.
My code are below:
Node.JS
var exec = require('child_process').exec;

app.get('/Page', function (req, res) {
    var Page = function () {
        var a = exec('F:/Example1.exe', function (err, data) {
            console.log(err);
            console.log(data);
            b.kill();
            if (!res.headersSent) {
                res.send(data);
            }
        });
        var b = exec('F:/Example2.exe', function (err, data) {
            console.log(err);
            console.log(data);
            a.kill();
            if (!res.headersSent) {
                res.send(data);
            }
        });
    }
    Page();
});
Apparently, even with the kill command, I am still unable to terminate the process.
I should let you guys know, I am also using AngularJS for my front-end.
I have searched online for a solution, however Google's results are all slowly turning purple.
Thank you so much to those who post their solutions; please explain the details of the solution as well, as I would really like to learn more.
Thank you so much.
The problem with exec is that it waits until the program has finished executing before its callback runs.
You can use spawn instead; then you have control over the process while it's running.
var spawn = require('child_process').spawn;

app.get('/Page', function(req, res) {
    var Page = function() {
        var a = spawn('F:/Example1.exe');
        var b = spawn('F:/Example2.exe');
        var aData = '',
            bData = '';

        a.stdout.on('data', function(data) {
            aData += data.toString();
        });
        b.stdout.on('data', function(data) {
            bData += data.toString();
        });

        a.on('close', function() {
            b.kill();
            if (!res.headersSent) {
                res.send(aData);
            }
        });
        b.on('close', function() {
            a.kill();
            if (!res.headersSent) {
                res.send(bData);
            }
        });
    }
    Page();
});
I have never used exec in nodejs, but because of JavaScript scoping I think Page is executed on every request, so the a and b processes from previous requests are no longer around.
You could store references to the processes globally so that each request has access to them (a rough, incomplete example):
var exec = require('child_process').exec;
var a = null;
var b = null;

app.get('/Page', function (req, res) {
    var Page = function () {
        if (a) {
            // a is already a running process? Do something?
        } else {
            // start a ?
            a = exec('your command');
        }
        if (b) {
            // b is already running? Do something?
        }
    }
    Page();
});
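As a slightly more concrete, untested sketch of that idea, combining the global references with spawn from the other answer: kill whatever is still running from an earlier request before starting the new pair of processes.
var spawn = require('child_process').spawn;
var running = []; // child processes started by the most recent request

app.get('/Page', function (req, res) {
    // clean up anything still waiting on the hardware from a previous request
    running.forEach(function (child) {
        child.kill();
    });

    var a = spawn('F:/Example1.exe');
    var b = spawn('F:/Example2.exe');
    running = [a, b];

    var aData = '';
    var bData = '';
    a.stdout.on('data', function (chunk) { aData += chunk.toString(); });
    b.stdout.on('data', function (chunk) { bData += chunk.toString(); });

    // whichever process finishes first answers the request and kills the other
    a.on('close', function () {
        b.kill();
        if (!res.headersSent) res.send(aData);
    });
    b.on('close', function () {
        a.kill();
        if (!res.headersSent) res.send(bData);
    });
});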
