MongoDB & JavaScript heap out of memory - javascript

The data size in the telemetry collection is HUGE, so I get a "JavaScript heap out of memory" error.
How do I overcome that error?
const aloUrl = `mongodb://${userName}:${pwd}@${host}:${port}/${dbName}`;

MongoClient.connect(aloUrl, function (err, client) {
    if (err) {
        return console.log('ERROR:: ', err);
    }
    console.log("INFO:: OK");
    const db = client.db(dbName);
    var arr = db.collection('endpoint').find({provider: "KMR"}).map(e => e._id).toArray((err, result) => {
        if (err) {
            console.log("ERROR", err)
        }
        var son = db.collection('telemetry').find({endpoint: {$in: result}}).toArray().then(arr => {
            console.log("Let's start to party")
            for (let i = 0; i < 10; i++) {
                console.log("\t" + arr[i]._id)
            }
        }).catch(e => {
            console.log(`ERROR::${e}`)
        })
    })
});

From the MongoDB docs:
The toArray() method returns an array that contains all the documents
from a cursor. The method iterates completely the cursor, loading all
the documents into RAM and exhausting the cursor.
Thus, instead of calling toArray, you should use next or forEach (or some other method which doesn't load everything into RAM at once) to iterate through the documents one by one.
For example, to print all the documents in your telemetry collection ONE BY ONE, you can do this,
db.collection('telemetry')
    .find({
        endpoint: {
            $in: result
        }
    })
    .forEach((document) => {
        console.log(document)
    });

I would suggest you use forEach instead of toArray in order to fetch documents without exhausting the heap.
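If you prefer explicit iteration, here is a minimal sketch using the cursor's hasNext()/next(), assuming the same db and result as above and a driver version that returns promises when no callback is given (it must run inside an async function):

const cursor = db.collection('telemetry').find({endpoint: {$in: result}});
while (await cursor.hasNext()) {
    const doc = await cursor.next(); // only the current document is held in memory
    console.log("\t" + doc._id);
}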

For huge data sets it's always advisable to stream (in MongoDB this is achieved with a cursor).
Also:
$lookup is new in MongoDB 3.2. It performs a left outer join to an unsharded collection in the same database to filter in documents from the “joined” collection for processing.
You can have a look at MongoDB's aggregation pipeline.
Updating your code to use aggregate:
var MongoClient = require('mongodb').MongoClient;

// Connection URL
const aloUrl = `mongodb://${userName}:${pwd}@${host}:${port}/${dbName}`;

MongoClient.connect(aloUrl, function (err, client) {
    if (err) {
        return console.log('ERROR:: ', err);
    }
    console.log("INFO:: OK");
    const db = client.db(dbName);
    const col = db.collection('endpoint');
    var cursor = col.aggregate([
        {
            $match: {provider: "KMR"}
        },
        {
            $lookup: {
                from: "telemetry",
                localField: "_id",
                foreignField: "endpoint",
                as: "telemetry"
            }
        }
    ]);
    console.log("Let's start to party")
    cursor.on('data', function (data) {
        console.log("\t" + data._id)
    });
    cursor.on('end', function () {
        console.log("Done");
    });
});

Related

MongoDB Aggregation not updating my collection

I'm trying to run an aggregation query in Mongo using their Node.js driver that takes some of my fields and adds/finds averages, etc. I built the aggregation in Mongo Cloud and exported it to Node, but when I run the code, I get the following responses from Mongo:
this.res could not be serialized.
steps.update_mongo.res:null
Here's the code (history field is an array of objects):
const agg = [
    {
        '$addFields': {
            'avgGrowth': {
                '$ceil': {
                    '$avg': '$history.growth'
                }
            },
            'avgDecline': {
                '$ceil': {
                    '$avg': '$history.decline'
                }
            }
        }
    },
    {
        '$merge': {
            'into': {'db': 'mydb', 'coll': 'test'},
            'on': '$_id'
        },
    }
];
const coll = await db.collection('test');
this.res = await coll.aggregate(agg, async (cmdErr, result) => {
    await assert.equal(null, cmdErr);
});
this.res = await coll.aggregate(agg, async (cmdErr, result) => {
    await assert.equal(null, cmdErr);
});
is incorrect syntax: you either provide a callback OR use await, not both.
Try using it as follows:
this.res = await coll.aggregate(agg);
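As a hedged aside (not part of the original answer): in several driver versions aggregate() returns an AggregationCursor, and the pipeline, including the $merge stage, only runs once the cursor is consumed. If the merge still does not happen, something like the following may be needed:

// Assumes the same coll and agg as above; consuming the cursor (e.g. via toArray)
// is what actually triggers the pipeline, including the $merge stage.
this.res = await coll.aggregate(agg).toArray();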

Retrieving documents for MongoDB cluster

I am trying to retrieve all the documents from a MongoDB cluster. I have followed code I've seen online; however, I am facing a small problem.
const MongoClient = require('mongodb');
const uri = "mongodb+srv://<user>:<password>@cluster0-10soy.mongodb.net/test?retryWrites=true&w=majority";

var questionsArray = [];

MongoClient.connect(uri, function (err, client) {
    const database = client.db("WhatSportWereYouMadeFor");
    database.collection("Questions").find({}, (error, cursor) => {
        cursor.each(function (error, item) {
            if (item == null) {
                console.log(error);
            }
            questionsArray.push(item);
        });
    })
});

module.exports = { questionsArray };
I connect to the database fine; however, I've set a breakpoint at the stop variable, and that gets hit before any of the documents retrieved from the database get pushed to the questions array.
I've also tried wrapping the code inside an async function and awaiting it before the stop variable, but that breakpoint still gets hit first, and only afterwards do the documents get pushed to the array.
What I would do is wrap the whole thing in a promise, and then export that.
const MyExport = () => {
    return new Promise((resolve, reject) => {
        var questionsArray = [];
        MongoClient.connect(uri, function (err, client) {
            const database = client.db("WhatSportWereYouMadeFor");
            database.collection("Questions").find({}, (error, cursor) => {
                cursor.each(function (error, item) {
                    if (item == null) {
                        // A null item means the cursor is exhausted: resolve only now,
                        // after every document has been pushed.
                        return resolve(questionsArray);
                    }
                    questionsArray.push(item);
                });
            })
        });
    })
}
module.exports.questionsArray = MyExport
But then when you import it, you need to call the function and await the result:
const questionsArrayFunc = require("path/to/this/file").questionsArray
const questionsArray = await questionsArrayFunc()
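Since top-level await isn't available in a plain CommonJS module, here is a minimal sketch of doing that inside an async wrapper:

(async () => {
    const questionsArray = await questionsArrayFunc();
    console.log(questionsArray.length); // use the documents here
})();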
I hope this is what you're looking for. There might be some other way, but I think this works.

Nested for loop in nodejs seems to be running asynchronously

So I have two for loops, one nested inside the other, but the code seems to run the first loop and return its results before the nested loop runs. How can I make it behave synchronously?
For example, all the topicData values get printed in a row instead of printing one topicData and then moving on to the nested for loop.
I'm not sure if this is the proper way to implement async/await. Any pointers would be appreciated. Thanks.
exports.create = (event, context, callback) => {
    var topicTimestamp = "";
    var endpoint = "";
    sns.listTopics(async function (err, data) {
        if (err) {
            console.log(err, err.stack);
        } else {
            console.log(data);
            for (var topic in data.Topics) { //first loop
                //var topicData = "";
                //retrieve each topic and append to topicList if it is lakeview topic
                var topicData = await data.Topics[topic].TopicArn;
                topicTimestamp = topicData.slice(22, 34); //get only the topic createdAt
                var params = {
                    TopicArn: topicData //topicData
                };
                console.log("SUBS per" + params.TopicArn);
                //retrieve subscriptions attached to each topic
                sns.listSubscriptionsByTopic(params, async function (err, subscriptionData) {
                    console.log(subscriptionData);
                    //console.log("SUBS per" + params.TopicArn);
                    if (err) {
                        console.log(err, err.stack); // an error occurred
                    } else {
                        var endpointList = [];
                        for (var sub in subscriptionData.Subscriptions) { //nested loop
                            endpoint = await subscriptionData.Subscriptions[sub].Endpoint;
                            console.log("ENDPOINT:: " + endpoint);
                            endpointList.push(endpoint);
                        }
                    } // end of else listSub
                    //put topic info into table
                    var topicsParams = {
                        TableName: tableName,
                        Item: {
                            id: uuidv4(),
                            createdAt: timestamp,
                            topicCreatedAt: topicTimestamp,
                            topic: topicData,
                            phoneNumbers: endpointList
                        },
                    };
                    endpointList = []; //reset to empty array
                    dynamoDb.put(topicsParams, (error) => {...}
There are a couple of issues here:
You are writing callback-style code inside loops while promise-based methods are available.
Because of the callback style, the code is very complicated.
You are awaiting where it is not required, for example inside the callbacks.
You could also do things in parallel using Promise.all (see the sketch after the example below).
You can try it this way:
exports.create = async (event, context, callback) => {
    try {
        let topicTimestamp = "";
        let endpoint = "";
        const data = await sns.listTopics().promise();
        // eslint-disable-next-line guard-for-in
        for (const topic in data.Topics) { // first loop
            // var topicData = "";
            // retrieve each topic and append to topicList if it is lakeview topic
            const topicData = data.Topics[topic].TopicArn;
            topicTimestamp = topicData.slice(22, 34); // get only the topic createdAt
            const params = {
                "TopicArn": topicData // topicData
            };
            console.log(`SUBS per ${params.TopicArn}`);
            const subscriptionData = await sns.listSubscriptionsByTopic(params).promise();
            const endpointList = [];
            // eslint-disable-next-line guard-for-in
            for (const sub in subscriptionData.Subscriptions) { // nested loop
                endpoint = subscriptionData.Subscriptions[sub].Endpoint;
                console.log(`ENDPOINT:: ${endpoint}`);
                endpointList.push(endpoint);
            }
            // put topic info into table
            const topicsParams = {
                "TableName": tableName,
                "Item": {
                    "id": uuidv4(),
                    "createdAt": timestamp,
                    "topicCreatedAt": topicTimestamp,
                    "topic": topicData,
                    "phoneNumbers": endpointList
                }
            };
            // Similarly use dynamodb .promise functions here
        }
    } catch (Err) {
        console.log(Err);
    }
};
aws-sdk supports callback style by default. To convert a call to a promise you need to add .promise() at the end.
At the moment this example uses a for loop, but you could do the same thing with Promise.all, as sketched below.
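A minimal sketch of the parallel variant, assuming the same sns, dynamoDb, tableName, timestamp and uuidv4 bindings as above (error handling omitted for brevity):

exports.create = async (event, context) => {
    const data = await sns.listTopics().promise();
    // Process every topic concurrently instead of one at a time.
    await Promise.all((data.Topics || []).map(async ({ TopicArn }) => {
        const subs = await sns.listSubscriptionsByTopic({ TopicArn }).promise();
        const endpointList = (subs.Subscriptions || []).map(s => s.Endpoint);
        await dynamoDb.put({
            TableName: tableName,
            Item: {
                id: uuidv4(),
                createdAt: timestamp,
                topicCreatedAt: TopicArn.slice(22, 34),
                topic: TopicArn,
                phoneNumbers: endpointList
            }
        }).promise();
    }));
};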
Hope this helps.

Batch update in knex

I'd like to perform a batch update using Knex.js
For example:
'UPDATE foo SET [theValues] WHERE idFoo = 1'
'UPDATE foo SET [theValues] WHERE idFoo = 2'
with values:
{ name: "FooName1", checked: true } // to `idFoo = 1`
{ name: "FooName2", checked: false } // to `idFoo = 2`
I was previously using node-mysql, which allows multiple statements. With it I simply built a multiple-statement query string and sent that over the wire in a single run.
I'm not sure how to achieve the same with Knex. I can see batchInsert as an API method I can use, but nothing as far as batchUpdate is concerned.
Note:
I can do an async iteration and update each row separately. That's bad because it means lots of round trips from the server to the DB.
I can use Knex's raw() and probably do something similar to what I did with node-mysql. However, that defeats the whole purpose of Knex being a DB abstraction layer (it introduces strong DB coupling).
So I'd like to do this using something "knex-y".
Any ideas welcome.
I needed to perform a batch update inside a transaction (I didn't want to end up with partial updates in case something went wrong).
I've resolved it the following way:
// I wrap knex as 'connection'
return connection.transaction(trx => {
    const queries = [];
    users.forEach(user => {
        const query = connection('users')
            .where('id', user.id)
            .update({
                lastActivity: user.lastActivity,
                points: user.points,
            })
            .transacting(trx); // This makes every update be part of the same transaction
        queries.push(query);
    });
    Promise.all(queries) // Once every query is written
        .then(trx.commit) // We try to execute all of them
        .catch(trx.rollback); // And rollback in case any of them goes wrong
});
Assuming you have a collection of valid keys/values for the given table:
// abstract transactional batch update
function batchUpdate(table, collection) {
    return knex.transaction(trx => {
        const queries = collection.map(tuple =>
            knex(table)
                .where('id', tuple.id)
                .update(tuple)
                .transacting(trx)
        );
        return Promise.all(queries)
            .then(trx.commit)
            .catch(trx.rollback);
    });
}
To call it
batchUpdate('user', [...]);
Are you unfortunately subject to non-conventional column names? No worries, I got you fam:
function batchUpdate(options, collection) {
    return knex.transaction(trx => {
        const queries = collection.map(tuple =>
            knex(options.table)
                .where(options.column, tuple[options.column])
                .update(tuple)
                .transacting(trx)
        );
        return Promise.all(queries)
            .then(trx.commit)
            .catch(trx.rollback);
    });
}
To call it
batchUpdate({ table: 'user', column: 'user_id' }, [...]);
Modern Syntax Version:
const batchUpdate = async (options, collection) => {
    const { table, column } = options;
    const trx = await knex.transaction();
    try {
        await Promise.all(collection.map(tuple =>
            knex(table)
                .where(column, tuple[column])
                .update(tuple)
                .transacting(trx)
        ));
        await trx.commit();
    } catch (error) {
        await trx.rollback();
    }
}
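To call it from inside an async function (the rows here are hypothetical placeholders):

await batchUpdate(
    { table: 'user', column: 'user_id' },
    [{ user_id: 1, points: 10 }, { user_id: 2, points: 20 }] // hypothetical rows
);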
You have a good idea of the pros and cons of each approach. I would recommend a raw query that bulk updates over several async updates. Yes, you can run them in parallel, but your bottleneck becomes the time it takes the DB to run each update. Details can be found here.
Below is an example of a batch upsert using knex.raw. Assume that records is an array of objects (one object for each row we want to update) whose property names line up with the columns in the database table you want to update:
var knex = require('knex'),
    _ = require('underscore');

function bulkUpdate(records) {
    var updateQuery = [
            'INSERT INTO mytable (primaryKeyCol, col2, colN) VALUES',
            _.map(records, () => '(?)').join(','),
            'ON DUPLICATE KEY UPDATE',
            'col2 = VALUES(col2),',
            'colN = VALUES(colN)'
        ].join(' '),
        vals = [];

    _(records).map(record => {
        vals.push(_(record).values());
    });

    return knex.raw(updateQuery, vals);
}
This answer does a great job explaining the runtime relationship between the two approaches.
Edit:
It was requested that I show what records would look like in this example.
var records = [
    { primaryKeyCol: 123, col2: 'foo', colN: 'bar' },
    { /* some other record, same props */ }
];
Please note that if your records have additional properties beyond the ones specified in the query, you cannot do:
_(records).map(record => {
    vals.push(_(record).values());
});
because you will hand the query too many values per record, and knex will fail to match the property values of each record with the ? placeholders in the query. Instead, you will need to explicitly push the values of each record that you want to insert into the array, like so:
// assume a record has an additional property `type` that you don't want to
// insert into the database
// example: { primaryKeyCol: 123, col2: 'foo', colN: 'bar', type: 'baz' }
_(records).map(record => {
    vals.push(record.primaryKeyCol);
    vals.push(record.col2);
    vals.push(record.colN);
});
There are less repetitive ways of doing the above explicit references (one possibility is sketched below), but this is just an example. Hope this helps!
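For instance, a minimal sketch of one such variant, assuming the same column list as in the query above:

// Push only the whitelisted columns, in a fixed order, for every record.
var cols = ['primaryKeyCol', 'col2', 'colN'];
_(records).each(record => {
    cols.forEach(col => vals.push(record[col]));
});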
The solution works great for me! I just included an ID parameter to make it dynamic across tables with custom ID columns. Chenhai, here's my snippet, including a way to return a single array of ID values for the transaction:
function batchUpdate(table, id, collection) {
    return knex.transaction((trx) => {
        const queries = collection.map(async (tuple) => {
            const [tupleId] = await knex(table)
                .where(`${id}`, tuple[id])
                .update(tuple)
                .transacting(trx)
                .returning(id);
            return tupleId;
        });
        return Promise.all(queries).then(trx.commit).catch(trx.rollback);
    });
}
You can use
response = await batchUpdate("table_name", "custom_table_id", [array of rows to update])
to get the returned array of IDs.
The update can be done in batches, e.g. 1000 rows per batch.
As long as it works in batches, Bluebird's map can be used to limit concurrency.
For more information on bluebird map: http://bluebirdjs.com/docs/api/promise.map.html
const Promise = require('bluebird'); // Bluebird's Promise.map gives controlled concurrency
// assumes a configured knex instance and dbTable name are in scope

const limit = 1000;
const totalRows = 50000;
const seq = count => Array(Math.ceil(count / limit)).keys();

const updateTable = async (dbTable, page) => {
    let offset = limit * page;
    return knex(dbTable).pluck('id').limit(limit).offset(offset).then(ids => {
        return knex(dbTable)
            .whereIn('id', ids)
            .update({ date: new Date() })
            .then((rows) => {
                console.log(`${page} - Updated rows of the table ${dbTable} from ${offset} to ${offset + limit}: `, rows);
            })
            .catch((err) => {
                console.log({ err });
            });
    })
    .catch((err) => {
        console.log({ err });
    });
};

// one batch at a time; updateTable must be defined before this call
Promise.map(seq(totalRows), page => updateTable(dbTable, page), { concurrency: 1 });
Here pluck('id') is used to get the ids in array form.

NodeJs Mongoose collecting data in loop

Hoping for your help.
I have a tasks collection with documents following this schema:
Task = {
title:'taskName',
performers:[ {userId:1,price:230}, {userId:2,price:260} ]
}
Profiles = { id:1, name: 'Alex', surname: 'Robinson', etc.. }
Finally, I should collect all the data and return an array of profile objects in the response. The problem is that the for loop ends before all the .findOne() calls for the elements have finished, so it returns an empty array.
This is the code from the GET handler:
apiRoutes.get('/performers/:id', function (req, res, next) {
    var profArr = [];
    Task.findOne({'_id': req.params.id}, function (err, doc) {
        for (var i = 0; i < doc.performers.length; i++) {
            var profile = {
                price: 0,
                name: '',
                surname: ''
            };
            profile.price = doc.performers[i].price;
            Profile.findOne({'_id': doc.performers[i].userId}, function (err, doc) {
                if (err) throw err;
                profile.name = doc.name;
                profile.surname = doc.surname;
                profArr.push(profile);
            });
        }
        return res.json({
            success: true,
            message: 'Performers data collected',
            data: profArr
        });
    });
});
The problem is that you need to return the response inside the Mongoose query callback. You can't use values assigned inside the callback from outside of it. For example:
var sampleArr = [];
Users.find({}, function (err, users) {
    users.forEach(function (user) {
        Students.find({'userid': user.id}, function (err, student) {
            sampleArr.push({
                'student': student
            });
        })
        console.log(sampleArr);
        // It will only return an empty array: []
    })
})
So, your task should be like this:
apiRoutes.get('/performers/:id', function (req, res, next) {
    var profArr = [];
    // Get a task by ID
    Task.findById(req.params.id, function (err, task) {
        // Get all profiles
        Profile.find({}, function (err, profiles) {
            task.performers.forEach(function (taskPerformer) {
                profiles.forEach(function (profile) {
                    // Check whether the performer ID is the same as the user ID
                    if (profile._id == taskPerformer.userId) {
                        profArr.push({
                            price: taskPerformer.price,
                            name: profile.name,
                            surname: profile.surname
                        });
                    }
                })
            });
            return res.json({
                success: true,
                message: 'Performers data collected',
                data: profArr
            });
        });
    });
});
A simple idea would be to introduce a countdown counter before you start your for loop, like this:
var countdown = doc.performers.length;
Decrement the countdown in the callback function of the findOne calls. Check whether you have reached 0 and, if so, send the result from there, as in the sketch below.
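A minimal sketch of that idea, keeping the structure of the original handler (one Profile.findOne per performer, with the response sent only once the counter reaches zero):

apiRoutes.get('/performers/:id', function (req, res, next) {
    Task.findOne({'_id': req.params.id}, function (err, doc) {
        if (err) throw err;
        var profArr = [];
        var countdown = doc.performers.length;
        doc.performers.forEach(function (performer) {
            Profile.findOne({'_id': performer.userId}, function (err, profile) {
                if (err) throw err;
                profArr.push({
                    price: performer.price,
                    name: profile.name,
                    surname: profile.surname
                });
                countdown--;
                if (countdown === 0) {
                    // Every findOne callback has completed, so it is safe to respond now.
                    res.json({success: true, message: 'Performers data collected', data: profArr});
                }
            });
        });
    });
});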
Still, your code doesn't look very efficient: there are a lot of calls to the DB. Maybe you could rethink your data model in order to minimize the calls to the DB.
Your "for" loop will be finished before findOne will be finished.
