Node.js MongoDB: fetching document size without actually fetching the cursor - javascript

My question is: how can I get a cursor's size (in KB) without actually fetching it?
I've already examined a lot of questions such as this one, but I don't want to fetch the query result just to learn how many KB it is.
I just want something like:
var MongoClient = require('mongodb').MongoClient,
    test = require('assert');

MongoClient.connect('mongodb://localhost:27017/test', function(err, db) {
  var collection = db.collection('simple_query');
  // Insert a bunch of documents for the testing
  collection.insertMany([{a:1}, {a:2}, {a:3}], {w:1}, function(err, result) {
    test.equal(null, err);
    collection.find(/**SOME QUERY*/).size(function(err, SIZE) {
      test.equal(null, err);
      test.equal(32111351, SIZE); // in bytes or kilobytes whatever
      db.close();
    });
  });
});

Something like this?
// from the mongo shell: average document size for the collection
var avgSize = db.collectionName.stats().avgObjSize;
// ...
collection.count(/* some query */, function(err, count) {
  var approximateSize = count * avgSize; // This could work for simple database models
});
I know it's not perfect, but it is the best way I found.
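A rough Node.js sketch of the same idea, assuming the 2.x driver where collection.stats() and collection.count() take callbacks (the query is a placeholder):
var MongoClient = require('mongodb').MongoClient;

MongoClient.connect('mongodb://localhost:27017/test', function(err, db) {
  if (err) throw err;
  var collection = db.collection('simple_query');
  // collStats gives the average document size for the whole collection
  collection.stats(function(err, stats) {
    if (err) throw err;
    // count only the documents matching the (placeholder) query
    collection.count({ a: { $gte: 2 } }, function(err, count) {
      if (err) throw err;
      var approximateBytes = count * stats.avgObjSize; // rough estimate, not exact
      console.log('approximate result size:', approximateBytes, 'bytes');
      db.close();
    });
  });
});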

Related

How to add many records to MongoDB from a directory of JSON files?

I have about a million JSON files saved across many sub-directories of the directory "D:/njs/nodetest1/imports/source1/" and I want to import them into the collection "users" in my MongoDB database.
The following code correctly traverses the file system. As you can see, it reads each item in the directory, and if that item is a directory it reads each item in it. For each item that is not a directory it performs some operations on it before passing it to a function.
function traverseFS(path) {
  var files = fs.readdirSync(path);
  for (var i in files) {
    var currentFile = path + '/' + files[i];
    var stats = fs.statSync(currentFile);
    if (stats.isFile())
      runOnFile(currentFile);
    else
      traverseFS(currentFile);
  }
}
traverseFS("D:/njs/nodetest1/imports/source1/");
Next, I run a few operations on each file (see below). This reads the file, parses it into a JSON object, reads two attributes of that object into variables, creates an object in the variable "entry" and passes that variable to another function.
function runOnFile(currentFile) {
  var fileText = fs.readFileSync(currentFile, 'utf8');
  var generatedJSON = JSON.parse(fileText);
  var recordID = generatedJSON.recordID;
  var recordText = generatedJSON.recordTexts;
  var entry = {recordID: recordID, recordText: recordText};
  insertRecord(entry);
}
The final function should then insert the data into MongoDB. I think this is where things go wrong.
function insertRecord(entry) {
  var MongoClient = mongodb.MongoClient;
  var MongoURL = 'mongodb://localhost:27017/my_database_name';
  MongoClient.connect(MongoURL, function (err, db) {
    var collection = db.collection('users');
    collection.insert([entry], function (err, result) {
      db.close();
    });
  });
}
I expected this to run through the file structure, reading the JSON files into objects and then inserting those objects into my mongoDB. Instead it reads the first file into the database and then stops/hangs.
Notes:
I don't want to use mongoimport because I don't want to insert all the data from these files into my MongoDB database. I however am not tied to any aspect of this approach. If some other solution exists I am open to it.
This connects to the database just fine. For each item in the directory this successfully creates an "entry" object and passes it to the insertRecord function. In other words, the problem must be occurring in the insertRecord section. But it obviously could be caused by something earlier in the process.
If I add error handling, no errors are produced. I have left the error handling out of this post because it clutters the readability of the code snippets.
As per the mongodb 2.2 driver (current latest) documentation, insert is deprecated:
DEPRECATED
Use insertOne, insertMany or bulkWrite
So the short answer is probably to change collection.insert([entry], ...) to collection.insertOne(entry, ...) and you're done.
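For illustration, the question's insertRecord with only that change applied (same connect-per-record pattern, just insertOne instead of insert) would look like:
function insertRecord(entry) {
  var MongoClient = mongodb.MongoClient;
  var MongoURL = 'mongodb://localhost:27017/my_database_name';
  MongoClient.connect(MongoURL, function (err, db) {
    var collection = db.collection('users');
    // insertOne takes the document itself, not an array
    collection.insertOne(entry, function (err, result) {
      db.close();
    });
  });
}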
Then for the long answer: you say "about a million JSON files", which typically deserves a fully async approach with the least amount of overhead.
There are two (potential) bottlenecks in the sample code:
fs.readFileSync, this is a blocking operation
the connecting, inserting a record and closing the database connection
Both are executed about a million times. Granted, an import is not usually done over and over again and (hopefully) not on a machine which needs its performance for other important tasks. Still, the sample code can easily be made more robust.
Consider using the glob module to obtain the list of JSON files.
glob('imports/**/*.json', function(error, files) {...})
This provides you with the full list of files easily in an async fashion.
Then consider connecting to the database just once, insert everything and close once.
Maintaining more or less the same steps you have in the sample, I'd suggest something like:
var glob = require('glob'),
    mongodb = require('mongodb'),
    fs = require('fs'),
    MongoClient = mongodb.MongoClient,
    mongoDSN = 'mongodb://localhost:27017/my_database_name',
    collection; // moved this to the "global" scope so we can do it only once

function insertRecord(json, done) {
  var recordID = json.recordID || null,
      recordText = json.recordText || null;
  // the question implies some kind of validation/sanitation/preparation..
  if (recordID && recordText) {
    // NOTE: insert was changed to insertOne
    return collection.insertOne({recordID: recordID, recordText: recordText}, done);
  }
  done('No recordID and/or recordText');
}

function runOnFile(file, done) {
  // moved to be async
  fs.readFile(file, function(error, data) {
    if (error) {
      return done(error);
    }
    var json = JSON.parse(data);
    if (!json) {
      return done('Unable to parse JSON: ' + file);
    }
    insertRecord(json, done);
  });
}

function processFiles(files, done) {
  var next = files.length ? files.shift() : null;
  if (next) {
    return runOnFile(next, function(error) {
      if (error) {
        console.error(error);
        // you may or may not want to stop here by throwing an Error
      }
      processFiles(files, done);
    });
  }
  done();
}

MongoClient.connect(mongoDSN, function(error, db) {
  if (error) {
    throw new Error(error);
  }
  collection = db.collection('users');
  glob('imports/**/*.json', function(error, files) {
    if (error) {
      throw new Error(error);
    }
    processFiles(files, function() {
      console.log('all done');
      db.close();
    });
  });
});
NOTE: You can collect multiple "entry" records to leverage the performance gain of multiple inserts using insertMany (a rough sketch of that batching idea follows below), though I have the feeling the inserted records are more complicated than described, and it might cause memory issues if not handled correctly.
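A minimal sketch of that batching idea, assuming the code above (BATCH_SIZE, queueRecord and flushBatch are illustrative names, not part of the original):
var BATCH_SIZE = 1000,
    batch = [];

// queueRecord would replace insertRecord in the example above
function queueRecord(json, done) {
  var recordID = json.recordID || null,
      recordText = json.recordText || null;
  if (!recordID || !recordText) {
    return done('No recordID and/or recordText');
  }
  batch.push({recordID: recordID, recordText: recordText});
  if (batch.length >= BATCH_SIZE) {
    return flushBatch(done);
  }
  done();
}

// inserts whatever has accumulated in one insertMany call
function flushBatch(done) {
  if (!batch.length) {
    return done();
  }
  var docs = batch;
  batch = [];
  collection.insertMany(docs, done);
}
With this, the done callback passed to processFiles would call flushBatch one last time before db.close(), so the final partial batch is not lost.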
Just structure your data into one big array of objects, then run db.collection.insertMany.
I suggest doing this using Promises:
const Bluebird = require('bluebird');
const glob = Bluebird.promisify(require('glob'));
const mongodb = require('mongodb');
const fs = Bluebird.promisifyAll(require('fs'));
const Path = require('path');
const MongoClient = mongodb.MongoClient;

const insertMillionsFromPath = Bluebird.coroutine(function *(path, mongoConnString) {
  const db = yield MongoClient.connect(mongoConnString);
  try {
    const collection = db.collection('users');
    const files = yield glob(Path.join(path, "*.json"));
    yield Bluebird.map(
      files,
      Bluebird.coroutine(function *(filename) {
        console.log("reading", filename);
        const fileContent = yield fs.readFileAsync(filename);
        const obj = JSON.parse(fileContent);
        console.log("inserting", filename);
        yield collection.insertOne(obj);
      }),
      {concurrency: 10} // You can increase concurrency here
    );
  } finally {
    yield db.close();
  }
});

insertMillionsFromPath("./myFiles", "mongodb://localhost:27017/database")
  .then(() => console.log("OK"))
  .catch((err) => console.log("ERROR", err));
For this to work, you will need to install the following packages:
npm install --save mongodb bluebird glob
and you will need Node.js version 6 or greater; otherwise you will need to transpile your JavaScript (due to the use of function *() generators).

Correct way to insert many records into MongoDB with Node.js

I was wondering what the correct way is to do bulk inserts into MongoDB (although it could be any other database) with Node.js.
I have written the following code as an example, although I believe it is flawed, as db.close() may be run before all the asynchronous collection.insert calls have completed.
MongoClient.connect('mongodb://127.0.0.1:27017/test', function (err, db) {
  var i, collection;
  if (err) {
    throw err;
  }
  collection = db.collection('entries');
  for (i = 0; i < entries.length; i++) {
    collection.insert(entries[i].entry);
  }
  db.close();
});
If your MongoDB server is 2.6 or newer, it would be better to take advantage of the write commands Bulk API, which allows for the execution of bulk insert operations. These are simply abstractions on top of the server that make it easy to build bulk operations and thus get performance gains with your inserts over large collections.
Sending the bulk insert operations in batches results in less traffic to the server and thus more efficient wire transactions, by not sending everything in individual statements but rather breaking it up into manageable chunks for server commitment. There is also less time spent waiting for the response in the callback with this approach.
These bulk operations come mainly in two flavours:
Ordered bulk operations. These operations execute all the operations in order and error out on the first write error.
Unordered bulk operations. These operations execute all the operations in parallel and aggregate all the errors. Unordered bulk operations do not guarantee order of execution.
Note: for servers older than 2.6, the API will down-convert the operations. However, it is not possible to down-convert 100%, so there might be some edge cases where it cannot correctly report the right numbers.
In your case, you could implement the Bulk API insert operation in batches of 1000 like this:
For MongoDB 3.2+ using bulkWrite
var MongoClient = require('mongodb').MongoClient;
var url = 'mongodb://localhost:27017/test';
var entries = [ ... ]; // a huge array containing the entry objects

var createNewEntries = function(db, entries, callback) {
  // Get the collection and bulk api artefacts
  var collection = db.collection('entries'),
      bulkUpdateOps = [];

  entries.forEach(function(doc) {
    bulkUpdateOps.push({ "insertOne": { "document": doc } });
    if (bulkUpdateOps.length === 1000) {
      collection.bulkWrite(bulkUpdateOps).then(function(r) {
        // do something with result
      });
      bulkUpdateOps = [];
    }
  });

  if (bulkUpdateOps.length > 0) {
    collection.bulkWrite(bulkUpdateOps).then(function(r) {
      // do something with result
    });
  }
};
For MongoDB <3.2
var MongoClient = require('mongodb').MongoClient;
var url = 'mongodb://localhost:27017/test';
var entries = [ ... ]; // a huge array containing the entry objects

var createNewEntries = function(db, entries, callback) {
  // Get the collection and bulk api artefacts
  var collection = db.collection('entries'),
      bulk = collection.initializeOrderedBulkOp(), // Initialize the Ordered Batch
      counter = 0;

  // Execute the forEach method, triggers for each entry in the array
  entries.forEach(function(obj) {
    bulk.insert(obj);
    counter++;
    if (counter % 1000 == 0) {
      // Execute the operation
      bulk.execute(function(err, result) {
        // re-initialise batch operation
        bulk = collection.initializeOrderedBulkOp();
        callback();
      });
    }
  });

  if (counter % 1000 != 0) {
    bulk.execute(function(err, result) {
      // do something with result
      callback();
    });
  }
};
Call the createNewEntries() function.
MongoClient.connect(url, function(err, db) {
  createNewEntries(db, entries, function() {
    db.close();
  });
});
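For completeness, an unordered variant of the pre-3.2 example only swaps the bulk initializer; a sketch (createNewEntriesUnordered is just an illustrative name):
var createNewEntriesUnordered = function(db, entries, callback) {
  var collection = db.collection('entries'),
      // unordered: the server may execute inserts in parallel and
      // aggregates all write errors instead of stopping at the first
      bulk = collection.initializeUnorderedBulkOp(),
      counter = 0;

  entries.forEach(function(obj) {
    bulk.insert(obj);
    counter++;
    if (counter % 1000 == 0) {
      var executing = bulk;
      bulk = collection.initializeUnorderedBulkOp(); // start a fresh batch right away
      executing.execute(function(err, result) {
        // do something with result
      });
    }
  });

  if (counter % 1000 != 0) {
    bulk.execute(function(err, result) {
      callback();
    });
  }
};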
You can use insertMany. It accepts an array of objects. Check the API.
New in version 3.2.
The db.collection.bulkWrite() method provides the ability to perform bulk insert, update, and remove operations. MongoDB also supports bulk insert through db.collection.insertMany().
bulkWrite supports only insertOne, updateOne, updateMany, replaceOne, deleteOne and deleteMany.
In your case, to insert data using a single line of code, you can use insertMany.
MongoClient.connect('mongodb://127.0.0.1:27017/test', function (err, db) {
  var collection;
  if (err) {
    throw err;
  }
  collection = db.collection('entries');
  collection.insertMany(entries, function (err, result) {
    if (err) {
      throw err;
    }
    db.close();
  });
});
var MongoClient = require('mongodb').MongoClient;
var url = 'mongodb://localhost:27017/test';

var data1 = {
  name: 'Data1',
  work: 'student',
  No: 4355453,
  Date_of_birth: new Date(1996, 10, 17)
};
var data2 = {
  name: 'Data2',
  work: 'student',
  No: 4355453,
  Date_of_birth: new Date(1996, 10, 17)
};

MongoClient.connect(url, function(err, db) {
  if (err != null) {
    return console.log(err.message);
  }
  // insertOne
  db.collection("App").insertOne(data1, function (err, data) {
    if (err != null) {
      return console.log(err);
    }
    console.log(data.ops[0]);
  });
  // insertMany
  var Data = [data1, data2];
  db.collection("App").insertMany(Data, {forceServerObjectId: true}, function (err, data) {
    if (err != null) {
      return console.log(err);
    }
    console.log(data.ops);
  });
  db.close();
});

How to use 64-bit long auto-increment counters in MongoDB

I'm implementing a logger database using MongoDB. The capped collection will contain log messages collected from several sources across the network. Since I want to do $lte/$gte queries on _id afterwards I need to have an _id that grows as a monotonic function.
To achieve that I've implemented the auto-incremented counter described in this article http://docs.mongodb.org/manual/tutorial/create-an-auto-incrementing-field/
My code looks like this:
var mongo = require("mongodb");
var Promise = require('es6-promise').Promise;

function connectToDB(mongo, uri) {
  return new Promise(function(resolve, reject) {
    mongo.MongoClient.connect(uri, function (err, db) {
      if (err) reject(err);
      else resolve(db);
    });
  });
}

function getNextSequenceNumber(db, counterName) {
  return new Promise(function(resolve, reject) {
    db.collection("counters", function (err, collection) {
      if (err) reject(err);
      else {
        var criteria = { _id: counterName };
        var sort = { $natural: 1 };
        var update = { $inc: { seq: 1 } };
        var options = { remove: false, new: true, upsert: true };
        collection.findAndModify(criteria, sort, update, options, function(err, res) {
          if (err) reject(err);
          else resolve(res.seq);
        });
      }
    });
  });
}
It works perfectly fine, but I've read that by default the number fields used in MongoDB are actually floats. The problem is that my database is a capped collection of log entries and it is going to have lots of entries. Moreover, since this is a capped collection, the old entries will be overwritten but the counter will keep growing. With the counter stored as a float, I cannot guarantee the system will keep working after a few years.
My question is: how can I force MongoDB to use a 64-bit counter in this particular case?
Please provide some code examples.
MongoDB (or rather BSON) has the NumberLong type, which is a 64-bit signed integer.
From Node.js you can use it in your update statement to create the seq property of that type:
var update = { $inc : { seq : mongo.Long(1) } };
This also seems to convert the seq property of existing documents to NumberLong.
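Applied to the question's getNextSequenceNumber, a sketch could look like this (Long.fromNumber is one way to construct the value; the rest mirrors the original code):
function getNextSequenceNumber(db, counterName) {
  return new Promise(function(resolve, reject) {
    db.collection("counters", function (err, collection) {
      if (err) return reject(err);
      var criteria = { _id: counterName };
      var sort = { $natural: 1 };
      // increment with a 64-bit NumberLong instead of a double
      var update = { $inc: { seq: mongo.Long.fromNumber(1) } };
      var options = { remove: false, new: true, upsert: true };
      collection.findAndModify(criteria, sort, update, options, function(err, res) {
        if (err) reject(err);
        else resolve(res.seq);
      });
    });
  });
}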

Processing a database query in Node.js

What's the correct way to process information from a database query after it has been retrieved?
Assuming in the example below that dataObj is just a JS object that contains a field, name, which is defined, is this how I should be processing data in Node.js?
EDIT:
I forgot to mention that, of course, I can't return the data object because this is an async call.
function processData(dataObj) {
  if (dataObj.name == "CS") {
    console.log("true");
  }
}

function getData(anon) {
  var dat = .... sql query that works and its an object now that works correctly...
  anon(dat);
}

getData(processData);
Here's a snippet of some code I'm working on using Mongoose:
var mongoose = require('mongoose'),
    db = mongoose.createConnection(uristring);

db.once('open', function() {
  var recipientSchema = mongoose.Schema({
    username: String,
    recipientEmail: String
  });
  var Recipient = db.model('Recipient', recipientSchema);
  Recipient.find({ username: user }, function(err, recipients) {
    res.json(recipients);
  });
});
Hope that helps!
One of Node.js' strengths is streaming support. I would expect a proper SQL driver to give me one row at a time. If I want, I can collect all of them, but most of the time I don't want all the rows in memory. Instead, I would stream them one by one to a consumer.
My preferred DB code would look like...
var db = require('<somedriver>');
var sqlStatement = ".... sql query that works and its an object now that works correctly...";

var query = db.execute(sqlStatement);
query.on('row', function (row) {
  // Do something with a row object...
  console.log("This is a row", row);
});
query.on('done', function (err, rowcount) {
  if (err) { throw err; } // or do something else...
  console.log("Number of rows received", rowcount);
});
Hope this helps

NodeJS and node-mongodb-native

Just getting started with Node, and trying to get the Mongo driver to work. I've got my connection set up, and oddly I can insert things just fine; however, calling find on a collection produces craziness.
var db = new mongo.Db('things', new mongo.Server('192.168.2.6', mongo.Connection.DEFAULT_PORT, {}), {});
db.open(function(err, db) {
  db.collection('things', function(err, collection) {
    // collection.insert(row);
    collection.find({}, null, function(err, cursor) {
      cursor.each(function(err, doc) {
        sys.puts(sys.inspect(doc, true));
      });
    });
  });
});
If I uncomment the insert and comment out the find, it works a treat.
The inverse unfortunately doesn't hold; I receive this error:
collection.find({}, null, function(err, cursor) {
^
TypeError: Cannot call method 'find' of null
I'm sure I'm doing something silly, but for the life of me I can't
find it...
I got the same thing just now. I realized that db.collection is being called over and over again for some reason, so I did something like this (hacking away on your code):
var db = new mongo.Db('things', new mongo.Server('192.168.2.6', mongo.Connection.DEFAULT_PORT, {}), {});
var Things;
db.open(function(err, db) {
  db.collection('things', function(err, collection) {
    Things = Things || collection;
  });
  var findThings = function() {
    Things.find({}, null, function(err, cursor) {
      cursor.each(function(err, doc) {
        sys.puts(sys.inspect(doc, true));
      });
    });
  };
});
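Unrelated to the caching trick, checking the err arguments may also help pin this down; a purely illustrative sketch based on the question's original code:
db.open(function(err, db) {
  if (err) { return sys.puts('open failed: ' + err); }
  db.collection('things', function(err, collection) {
    // if this callback reports an error, collection will be null,
    // which would explain "Cannot call method 'find' of null"
    if (err || !collection) { return sys.puts('collection failed: ' + err); }
    collection.find({}, null, function(err, cursor) {
      if (err) { return sys.puts('find failed: ' + err); }
      cursor.each(function(err, doc) {
        sys.puts(sys.inspect(doc, true));
      });
    });
  });
});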
I realize you asked this 9 months ago. Hope this grave digging still helps someone. Good luck!
Try calling collection.save() after your insert to flush your row.
Take a look at http://www.learnboost.com/mongoose/
"Currently Mongoose only supports manual flushing of data to the server."
