Aggregation Unwind Document Keys as New Documents - javascript

I am having some problems altering the schema I am using for a time series database I have built with MongoDB. Currently, I have records like the one shown below:
{
    "_id" : 20,
    "name" : "Bob",
    "location" : "London",
    "01/01/1993" : {
        "height" : "110cm",
        "weight" : "60kg"
    },
    "02/01/1993" : {
        "height" : "112cm",
        "weight" : "61kg"
    }
}
I wish to use the aggregation framework to create several records for each "person", one for each "time-value" subdocument in the original record:
{
    "_id" : 20,
    "name" : "Bob",
    "date" : "01/01/1993",
    "location" : "London",
    "height" : "110cm",
    "weight" : "60kg"
},
{
    "_id" : 20,
    "name" : "Bob",
    "date" : "02/01/1993",
    "location" : "London",
    "height" : "112cm",
    "weight" : "61kg"
}
The new schema should be much more efficient when adding a large number of time series values to each record, and I shouldn't run into the maximum document size error!
Any help on how to do this using the MongoDB aggregation pipeline would be greatly appreciated!

Whilst there are operators in modern releases of the aggregation framework that allow you to do this sort of thing, mileage may vary as to whether it is actually the best solution for this.
In essence you can create an array of entries composed of the document keys, excluding the other top-level keys which should be carried into each output document. That array can then be processed with $unwind and the whole result reshaped into new documents:
db.getCollection('input').aggregate([
  { "$project": {
    "name": 1,
    "location": 1,
    "data": {
      "$filter": {
        "input": { "$objectToArray": "$$ROOT" },
        "as": "d",
        "cond": {
          "$not": { "$in": [ "$$d.k", ["_id", "name", "location"] ] }
        }
      }
    }
  }},
  { "$unwind": "$data" },
  { "$replaceRoot": {
    "newRoot": {
      "$arrayToObject": {
        "$concatArrays": [
          [{ "k": "id", "v": "$_id" },
           { "k": "name", "v": "$name" },
           { "k": "location", "v": "$location" },
           { "k": "date", "v": "$data.k" }],
          { "$objectToArray": "$data.v" }
        ]
      }
    }
  }},
  { "$out": "output" }
])
Or alternatively do all the reshaping in the initial $project, within the array elements produced:
db.getCollection('input').aggregate([
  { "$project": {
    "_id": 0,
    "data": {
      "$map": {
        "input": {
          "$filter": {
            "input": { "$objectToArray": "$$ROOT" },
            "as": "d",
            "cond": {
              "$not": { "$in": [ "$$d.k", ["_id", "name", "location"] ] }
            }
          }
        },
        "as": "d",
        "in": {
          "$arrayToObject": {
            "$concatArrays": [
              { "$filter": {
                "input": { "$objectToArray": "$$ROOT" },
                "as": "r",
                "cond": { "$in": [ "$$r.k", ["_id", "name", "location"] ] }
              }},
              [{ "k": "date", "v": "$$d.k" }],
              { "$objectToArray": "$$d.v" }
            ]
          }
        }
      }
    }
  }},
  { "$unwind": "$data" },
  { "$replaceRoot": { "newRoot": "$data" } },
  { "$out": "output" }
])
So you use $objectToArray and $filter in order to make an array from only the keys which actually contain the data points for each date.
After $unwind we basically apply $arrayToObject over a set of named keys in the "array format" in order to construct the newRoot for $replaceRoot, and then write to the new collection with $out, producing one new document for each data key.
That may only get you part of the way though, as you really should change the "date" data to a BSON Date. It takes a lot less storage space, and is easier to query as well.
var updates = [];

db.getCollection('output').find().forEach( d => {
  updates.push({
    "updateOne": {
      "filter": { "_id": d._id },
      "update": {
        "$set": {
          "date": new Date(
            Date.UTC.apply(null,
              d.date.split('/')
                .reverse().map((e,i) => (i == 1) ? parseInt(e)-1 : parseInt(e))
            )
          )
        }
      }
    }
  });

  if ( updates.length >= 500 ) {
    db.getCollection('output').bulkWrite(updates);
    updates = [];
  }
})

if ( updates.length != 0 ) {
  db.getCollection('output').bulkWrite(updates);
  updates = [];
}
Of course, if your MongoDB server lacks those aggregation features then you are better off just writing the output to a new collection by iterating the loop in the first place:
var output = [];

db.getCollection('input').find().forEach( d => {
  output = [
    ...output,
    ...Object.keys(d)
      .filter(k => ['_id','name','location'].indexOf(k) === -1)
      .map(k => Object.assign(
        {
          id: d._id,
          name: d.name,
          location: d.location,
          date: new Date(
            Date.UTC.apply(null,
              k.split('/')
                .reverse().map((e,i) => (i == 1) ? parseInt(e)-1 : parseInt(e))
            )
          )
        },
        d[k]
      ))
  ];

  if ( output.length >= 500 ) {
    db.getCollection('output').insertMany(output);
    output = [];
  }
})

if ( output.length != 0 ) {
  db.getCollection('output').insertMany(output);
  output = [];
}
In either of those cases we want to apply Date.UTC to the reversed string elements from the existing "string" based date and get a value that can be cast into a BSON Date.
The aggregation framework itself does not allow casting of types, so the only solution for that part ( and it is a necessary part ) is to actually loop and update, but using the bulk forms at least makes it efficient to loop and update.
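For clarity, the date handling on its own looks like the following sketch; it assumes the strings are always in "DD/MM/YYYY" form:
var parts = "01/01/1993".split('/').reverse()                 // [ "1993", "01", "01" ]
  .map((e,i) => (i == 1) ? parseInt(e) - 1 : parseInt(e));    // JavaScript months are zero-based
var asDate = new Date(Date.UTC.apply(null, parts));           // 1993-01-01T00:00:00.000Z
Once stored as a real Date, range queries such as { "date": { "$gte": new Date("1993-01-01"), "$lt": new Date("1994-01-01") } } become straightforward and can use an index.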
Either case gives you the same end output:
/* 1 */
{
    "_id" : ObjectId("599275b1e38f41729f1d64fe"),
    "id" : 20.0,
    "name" : "Bob",
    "location" : "London",
    "date" : ISODate("1993-01-01T00:00:00.000Z"),
    "height" : "110cm",
    "weight" : "60kg"
}

/* 2 */
{
    "_id" : ObjectId("599275b1e38f41729f1d64ff"),
    "id" : 20.0,
    "name" : "Bob",
    "location" : "London",
    "date" : ISODate("1993-01-02T00:00:00.000Z"),
    "height" : "112cm",
    "weight" : "61kg"
}

Related

Aggregation Accumulate Inner Objects

I'm new to Mongo aggregations, and I need help creating one.
I have a collection with the following document as an example:
{
    "_id" : ObjectId("5afc2f06e1da131c9802071e"),
    "_class" : "Traveler",
    "name" : "John Due",
    "startTimestamp" : 1526476550933,
    "endTimestamp" : 1526476554823,
    "source" : "istanbul",
    "cities" : [
        {
            "_id" : "ef8f6b26328f-0663202f94faeaeb-3981",
            "name" : "Moscow",
            "timestamp" : 1526476550940,
            "timeSpent" : 3180
        },
        {
            "_id" : "ef8f6b26328f-0663202f94faeaeb-1122",
            "name" : "Cairo",
            "timestamp" : 1625476550940,
            "timeSpent" : 318000
        },
        {
            "_id" : "ef8f6b26328f-0663202f94faeaeb-3981",
            "name" : "Moscow",
            "timestamp" : 15211276550940,
            "timeSpent" : 318011
        }
    ],
    "variables" : [
        {
            "_id" : "cd4318a83c9b-a8478d76bfd3e4b6-5967",
            "name" : "Customer Profile",
            "lastValue" : "",
            "values" : [],
            "additionalData" : {}
        },
        {
            "_id" : "366cb8c07996-c62c37a87a86d526-d3e7",
            "name" : "Target Telephony Queue",
            "lastValue" : "",
            "values" : [],
            "additionalData" : {}
        },
        {
            "_id" : "4ed84742da33-d70ba8a809b712f3-bdf4",
            "name" : "IMEI",
            "lastValue" : "",
            "values" : [],
            "additionalData" : {}
        },
        {
            "_id" : "c8103687c1c8-97d749e349d785c8-9154",
            "name" : "Budget",
            "defaultValue" : "",
            "lastValue" : "",
            "values" : [
                {
                    "value" : "3000",
                    "timestamp" : NumberLong(1526476550940),
                    "element" : "c8103687c1c8-97d749e349d785c8-9154"
                }
            ],
            "additionalData" : {}
        }
    ]
}
I need a resulting document showing how many times each city has been visited by each traveler in the collection, and the average budget (budget is an element in the variables array),
so the resulting document will be similar to:
{
    "_id" : ObjectId("5afc2f06e1da131c9802071e"),
    "_class" : "Traveler",
    "name" : "John Due",
    "startTimestamp" : 1526476550933,
    "endTimestamp" : 1526476554823,
    "source" : "istanbul",
    "cities" : [
        {
            "_id" : "ef8f6b26328f-0663202f94faeaeb-3981",
            "name" : "Moscow",
            "visited" : 2
        },
        {
            "_id" : "ef8f6b26328f-0663202f94faeaeb-1122",
            "name" : "Cairo",
            "visited" : 1
        }
    ],
    "variables" : [
        {
            "_id" : "c8103687c1c8-97d749e349d785c8-9154",
            "name" : "Budget",
            "defaultValue" : "",
            "lastValue" : "",
            "values" : [
                {
                    "value" : "3000"
                }
            ]
        }
    ]
}
Thank you for your help
As a quick note, you need to change your "value" field inside the "values" array to be numeric, since it's presently a string.
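A one-off conversion could be as simple as the following sketch; it assumes every entry of "values" holds a numeric string and rewrites the array in place:
db.collection.find({ "variables.values.0": { "$exists": true } }).forEach(doc => {
  doc.variables.forEach(variable => {
    (variable.values || []).forEach(v => {
      v.value = parseFloat(v.value);   // "3000" -> 3000
    });
  });
  db.collection.updateOne(
    { "_id": doc._id },
    { "$set": { "variables": doc.variables } }
  );
});
With that out of the way, on to the answer.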
If you have access to $reduce from MongoDB 3.4, then you can actually do something like this:
db.collection.aggregate([
  { "$addFields": {
    "cities": {
      "$reduce": {
        "input": "$cities",
        "initialValue": [],
        "in": {
          "$cond": {
            "if": { "$ne": [{ "$indexOfArray": ["$$value._id", "$$this._id"] }, -1] },
            "then": {
              "$concatArrays": [
                { "$filter": {
                  "input": "$$value",
                  "as": "v",
                  "cond": { "$ne": [ "$$this._id", "$$v._id" ] }
                }},
                [{
                  "_id": "$$this._id",
                  "name": "$$this.name",
                  "visited": {
                    "$add": [
                      { "$arrayElemAt": [
                        "$$value.visited",
                        { "$indexOfArray": [ "$$value._id", "$$this._id" ] }
                      ]},
                      1
                    ]
                  }
                }]
              ]
            },
            "else": {
              "$concatArrays": [
                "$$value",
                [{
                  "_id": "$$this._id",
                  "name": "$$this.name",
                  "visited": 1
                }]
              ]
            }
          }
        }
      }
    },
    "variables": {
      "$map": {
        "input": {
          "$filter": {
            "input": "$variables",
            "cond": { "$eq": ["$$this.name", "Budget"] }
          }
        },
        "in": {
          "_id": "$$this._id",
          "name": "$$this.name",
          "defaultValue": "$$this.defaultValue",
          "lastValue": "$$this.lastValue",
          "value": { "$avg": "$$this.values.value" }
        }
      }
    }
  }}
])
If you have MongoDB 3.6, you can clean that up a bit with $mergeObjects:
db.collection.aggregate([
  { "$addFields": {
    "cities": {
      "$reduce": {
        "input": "$cities",
        "initialValue": [],
        "in": {
          "$cond": {
            "if": { "$ne": [{ "$indexOfArray": ["$$value._id", "$$this._id"] }, -1] },
            "then": {
              "$concatArrays": [
                { "$filter": {
                  "input": "$$value",
                  "as": "v",
                  "cond": { "$ne": [ "$$this._id", "$$v._id" ] }
                }},
                [{
                  "_id": "$$this._id",
                  "name": "$$this.name",
                  "visited": {
                    "$add": [
                      { "$arrayElemAt": [
                        "$$value.visited",
                        { "$indexOfArray": [ "$$value._id", "$$this._id" ] }
                      ]},
                      1
                    ]
                  }
                }]
              ]
            },
            "else": {
              "$concatArrays": [
                "$$value",
                [{
                  "_id": "$$this._id",
                  "name": "$$this.name",
                  "visited": 1
                }]
              ]
            }
          }
        }
      }
    },
    "variables": {
      "$map": {
        "input": {
          "$filter": {
            "input": "$variables",
            "cond": { "$eq": ["$$this.name", "Budget"] }
          }
        },
        "in": {
          "$mergeObjects": [
            "$$this",
            { "values": { "$avg": "$$this.values.value" } }
          ]
        }
      }
    }
  }}
])
But it's more or less the same thing, except that $mergeObjects keeps all the existing fields of each element, such as additionalData.
Going back a little before that version, you can always $unwind the "cities" to accumulate instead:
db.collection.aggregate([
  { "$unwind": "$cities" },
  { "$group": {
    "_id": {
      "_id": "$_id",
      "cities": {
        "_id": "$cities._id",
        "name": "$cities.name"
      }
    },
    "_class": { "$first": "$_class" },
    "name": { "$first": "$name" },
    "startTimestamp": { "$first": "$startTimestamp" },
    "endTimestamp": { "$first": "$endTimestamp" },
    "source": { "$first": "$source" },
    "variables": { "$first": "$variables" },
    "visited": { "$sum": 1 }
  }},
  { "$group": {
    "_id": "$_id._id",
    "_class": { "$first": "$_class" },
    "name": { "$first": "$name" },
    "startTimestamp": { "$first": "$startTimestamp" },
    "endTimestamp": { "$first": "$endTimestamp" },
    "source": { "$first": "$source" },
    "cities": {
      "$push": {
        "_id": "$_id.cities._id",
        "name": "$_id.cities.name",
        "visited": "$visited"
      }
    },
    "variables": { "$first": "$variables" }
  }},
  { "$addFields": {
    "variables": {
      "$map": {
        "input": {
          "$filter": {
            "input": "$variables",
            "cond": { "$eq": ["$$this.name", "Budget"] }
          }
        },
        "in": {
          "_id": "$$this._id",
          "name": "$$this.name",
          "defaultValue": "$$this.defaultValue",
          "lastValue": "$$this.lastValue",
          "value": { "$avg": "$$this.values.value" }
        }
      }
    }
  }}
])
All return (almost) the same thing:
{
    "_id" : ObjectId("5afc2f06e1da131c9802071e"),
    "_class" : "Traveler",
    "name" : "John Due",
    "startTimestamp" : 1526476550933,
    "endTimestamp" : 1526476554823,
    "source" : "istanbul",
    "cities" : [
        {
            "_id" : "ef8f6b26328f-0663202f94faeaeb-1122",
            "name" : "Cairo",
            "visited" : 1
        },
        {
            "_id" : "ef8f6b26328f-0663202f94faeaeb-3981",
            "name" : "Moscow",
            "visited" : 2
        }
    ],
    "variables" : [
        {
            "_id" : "c8103687c1c8-97d749e349d785c8-9154",
            "name" : "Budget",
            "defaultValue" : "",
            "lastValue" : "",
            "value" : 3000
        }
    ]
}
The first two forms are of course the most optimal thing to do since they are simply working "within" the same document at all times.
Operators like $reduce allow "accumulation" expressions on arrays, so we can use it here to keep a "reduced" array which we test for the unique "_id" value using $indexOfArray, in order to see if there already is an accumulated item that matches. A result of -1 means it's not there.
In order to construct a "reduced array" we take the "initialValue" of [] as an empty array and then add to it via $concatArrays. All of that is decided via the "ternary" $cond operator, which evaluates the "if" condition and "then" joins the output of the $filter on the current $$value (excluding the entry with the current _id) with another "array" containing the single replacement object.
For that "object" we again use the $indexOfArray to actually get the matched index since we know that the item "is there", and use that to extract the current "visited" value from that entry via $arrayElemAt and $add to it in order to increment.
In the "else" case we simply add an "array" as an "object" which just has a default "visited" value of 1. Using both those cases effectively accumulates unique values within the array to output.
In the latter version, we just $unwind the array and use successive $group stages in order to first "count" the unique inner entries, and then "re-construct the array" into a similar form.
Using $unwind looks far simpler, but since what it actually does is take a copy of the document for every array entry, it adds considerable overhead to processing. In modern versions there are generally array operators which mean you don't need to use this unless your intention is to "accumulate across documents". So if you actually need to $group on a value of a key from "inside" an array, then that is where you actually do need to use it.
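For instance, a minimal sketch of that "accumulate across documents" case, counting total visits per city name over the whole collection rather than within each traveler:
db.collection.aggregate([
  { "$unwind": "$cities" },
  { "$group": {
    "_id": "$cities.name",          // grouping key comes from "inside" the array
    "totalVisits": { "$sum": 1 }    // so $unwind followed by $group is appropriate here
  }}
])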
As for the "variables" then we can simply use the $filter again here to get the matching "Budget" entry. We do this as the input to the $map operator which allows "re-shaping" of the array content. We mainly want that so you can take the content of the "values" ( once you make it all numeric ) and use the $avg operator, which is supplied that "field path notation" form directly to the array values because it can in fact return a result from such an input.
That generally makes the tour of pretty much ALL of the main "array operators" for the aggregation pipeline ( excluding the "set" operators ) all within a single pipeline stage.
Also don't ever forget that you just about always want to $match with regular query operators as the "very first stage" of any aggregation pipeline, in order to select only the documents you need, ideally using an index.
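For example, as a sketch assuming you only wanted this one traveler and had an index on "name", the pipeline above would simply gain a first stage:
db.collection.aggregate([
  { "$match": { "name": "John Due" } },   // narrows the input first, ideally via an index on "name"
  // ... the $addFields / $reduce stages shown above
])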
Alternates
The alternative is working through the documents in client code. That generally would not be recommended, since all the methods above actually "reduce" the content returned from the server, which is generally the point of "server aggregations".
It "may" be possible, due to the "document based" nature, that larger result sets take considerably more time with $unwind, so client processing could be an option there, but I would consider it far more likely that the server forms above remain the better choice.
Below is a listing that demonstrates applying a transform to the cursor stream as results are returned, doing the same thing. There are three demonstrated versions of the transform: one showing "exactly" the same logic as above, an implementation with lodash methods for accumulation, and a "natural" accumulation on the Map implementation:
const { MongoClient } = require('mongodb');
const { chain } = require('lodash');

const uri = 'mongodb://localhost:27017';
const opts = { useNewUrlParser: true };

const log = data => console.log(JSON.stringify(data, undefined, 2));

const transform = ({ cities, variables, ...d }) => ({
  ...d,
  cities: cities.reduce((o,{ _id, name }) =>
    (o.map(i => i._id).indexOf(_id) != -1)
      ? [
          ...o.filter(i => i._id != _id),
          { _id, name, visited: o.find(e => e._id === _id).visited + 1 }
        ]
      : [ ...o, { _id, name, visited: 1 } ]
  , []).sort((a,b) => b.visited - a.visited),
  variables: variables.filter(v => v.name === "Budget")
    .map(({ values, additionalData, ...v }) => ({
      ...v,
      values: (values != undefined)
        ? values.reduce((o,e) => o + e.value, 0) / values.length
        : 0
    }))
});

const alternate = ({ cities, variables, ...d }) => ({
  ...d,
  cities: chain(cities)
    .groupBy("_id")
    .toPairs()
    .map(([k,v]) =>
      ({
        ...v.reduce((o,{ _id, name }) => ({ ...o, _id, name }),{}),
        visited: v.length
      })
    )
    .sort((a,b) => b.visited - a.visited)
    .value(),
  variables: variables.filter(v => v.name === "Budget")
    .map(({ values, additionalData, ...v }) => ({
      ...v,
      values: (values != undefined)
        ? values.reduce((o,e) => o + e.value, 0) / values.length
        : 0
    }))
});

const natural = ({ cities, variables, ...d }) => ({
  ...d,
  cities: [
      ...cities
        .reduce((o,{ _id, name }) => o.set(_id,
          [ ...(o.has(_id) ? o.get(_id) : []), { _id, name } ]), new Map())
        .entries()
    ]
    .map(([k,v]) =>
      ({
        ...v.reduce((o,{ _id, name }) => ({ ...o, _id, name }),{}),
        visited: v.length
      })
    )
    .sort((a,b) => b.visited - a.visited),
  variables: variables.filter(v => v.name === "Budget")
    .map(({ values, additionalData, ...v }) => ({
      ...v,
      values: (values != undefined)
        ? values.reduce((o,e) => o + e.value, 0) / values.length
        : 0
    }))
});

(async function() {

  try {

    const client = await MongoClient.connect(uri, opts);
    let db = client.db('test');
    let coll = db.collection('junk');

    let cursor = coll.find().map(natural);

    while (await cursor.hasNext()) {
      let doc = await cursor.next();
      log(doc);
    }

    client.close();

  } catch(e) {
    console.error(e)
  } finally {
    process.exit()
  }

})()

Unwind Multiple Document Arrays Into New Documents

Today I ran into a situation where I need to sync a MongoDB collection to Vertica (an SQL database), where my object keys will be the columns of the table in SQL.
I use the MongoDB aggregation framework, first to query, manipulate and project the wanted result document, and then I sync it to Vertica.
The schema I want to aggregate looks like this:
{
    userId: 123,
    firstProperty: {
        firstArray: ['x','y','z'],
        anotherAttr: 'abc'
    },
    anotherProperty: {
        secondArray: ['a','b','c'],
        anotherAttr: 'def'
    }
}
Since each array's values are not related to the values of the other array, what I need is for each value of a nested array to end up in a separate result document.
For that I use the following aggregation pipeline:
db.collection('myCollection').aggregate([
  {
    $match: {
      $or: [
        { 'firstProperty.firstArray.1': { $exists: true } },
        { 'secondProperty.secondArray.1': { $exists: true } }
      ]
    }
  }, {
    $project: {
      userId: 1,
      firstProperty: 1,
      secondProperty: 1
    }
  }, {
    $unwind: { path: '$firstProperty.firstArray' }
  }, {
    $unwind: { path: '$secondProperty.secondArray' }
  }, {
    $project: {
      userId: 1,
      firstProperty: '$firstProperty.firstArray',
      firstPropertyAttr: '$firstProperty.anotherAttr',
      secondProperty: '$secondProperty.secondArray',
      secondPropertyAttr: '$secondProperty.anotherAttr'
    }
  }, {
    $out: 'another_collection'
  }
])
What I expect is the following result:
{
    userId: 'x1',
    firstProperty: 'x',
    firstPropertyAttr: 'a'
}
{
    userId: 'x1',
    firstProperty: 'y',
    firstPropertyAttr: 'a'
}
{
    userId: 'x1',
    firstProperty: 'z',
    firstPropertyAttr: 'a'
}
{
    userId: 'x1',
    secondProperty: 'a',
    firstPropertyAttr: 'b'
}
{
    userId: 'x1',
    secondProperty: 'b',
    firstPropertyAttr: 'b'
}
{
    userId: 'x1',
    secondProperty: 'c',
    firstPropertyAttr: 'b'
}
Instead I get something like this:
{
    userId: 'x1',
    firstProperty: 'x',
    firstPropertyAttr: 'b',
    secondProperty: 'a',
    secondPropertyAttr: 'b'
}
{
    userId: 'x1',
    firstProperty: 'y',
    firstPropertyAttr: 'b',
    secondProperty: 'b',
    secondPropertyAttr: 'b'
}
{
    userId: 'x1',
    firstProperty: 'z',
    firstPropertyAttr: 'b',
    secondProperty: 'c',
    secondPropertyAttr: 'b'
}
What exactly am I missing, and how can I fix it?
This is actually a much "curlier" problem than you might think it is, and it all really boils down to "named keys", which are generally a real problem; your data "should" not be using "data points" in the naming of such keys.
The other obvious problem in your attempt is called a "cartesian product". This is where you $unwind one array and then $unwind another, which results in the items from the "first" $unwind being repeated for every value present in the "second".
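As a quick illustration of that "cartesian product" ( using made-up data, not your schema ), unwinding two independent arrays pairs every element of one with every element of the other:
// { "a": [ 1, 2 ], "b": [ "x", "y" ] }
// after { "$unwind": "$a" } followed by { "$unwind": "$b" }:
{ "a": 1, "b": "x" }
{ "a": 1, "b": "y" }
{ "a": 2, "b": "x" }
{ "a": 2, "b": "y" }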
Addressing that second problem, the basic approach is to "combine the arrays" in order that you only $unwind from a single source. This is pretty common to all remaining approaches.
As for the approaches, these differ in the MongoDB version you have available and the general practicality of application. So let's step through them:
Remove the named keys
The simplest approach here is to not expect named keys in the output, and instead mark each entry with a "name" identifying its source in the final output. So all we want to do is specify each "expected" key within the construction of an initial "combined" array, and then simply $filter that for any null values resulting from named paths not existing in the present document.
db.getCollection('myCollection').aggregate([
  { "$match": {
    "$or": [
      { "firstProperty.firstArray.0": { "$exists": true } },
      { "anotherProperty.secondArray.0": { "$exists": true } }
    ]
  }},
  { "$project": {
    "_id": 0,
    "userId": 1,
    "combined": {
      "$filter": {
        "input": [
          {
            "name": { "$literal": "first" },
            "array": "$firstProperty.firstArray",
            "attr": "$firstProperty.anotherAttr"
          },
          {
            "name": { "$literal": "another" },
            "array": "$anotherProperty.secondArray",
            "attr": "$anotherProperty.anotherAttr"
          }
        ],
        "cond": {
          "$ne": [ "$$this.array", null ]
        }
      }
    }
  }},
  { "$unwind": "$combined" },
  { "$unwind": "$combined.array" },
  { "$project": {
    "userId": 1,
    "name": "$combined.name",
    "value": "$combined.array",
    "attr": "$combined.attr"
  }}
])
From the data included in your question this would produce:
/* 1 */
{
    "userId" : 123.0,
    "name" : "first",
    "value" : "x",
    "attr" : "abc"
}

/* 2 */
{
    "userId" : 123.0,
    "name" : "first",
    "value" : "y",
    "attr" : "abc"
}

/* 3 */
{
    "userId" : 123.0,
    "name" : "first",
    "value" : "z",
    "attr" : "abc"
}

/* 4 */
{
    "userId" : 123.0,
    "name" : "another",
    "value" : "a",
    "attr" : "def"
}

/* 5 */
{
    "userId" : 123.0,
    "name" : "another",
    "value" : "b",
    "attr" : "def"
}

/* 6 */
{
    "userId" : 123.0,
    "name" : "another",
    "value" : "c",
    "attr" : "def"
}
Merge Objects - Requires MongoDB 3.4.4 minimum
To actually use "named keys" we need the $objectToArray and $arrayToObject operators that were only available since MongoDB 3.4.4. Using these and the $replaceRoot pipeline stage we can simply process to your desired output without explicitly naming the keys to output at any stage:
db.getCollection('myCollection').aggregate([
  { "$match": {
    "$or": [
      { "firstProperty.firstArray.0": { "$exists": true } },
      { "anotherProperty.secondArray.0": { "$exists": true } }
    ]
  }},
  { "$project": {
    "_id": 0,
    "userId": 1,
    "data": {
      "$reduce": {
        "input": {
          "$map": {
            "input": {
              "$filter": {
                "input": { "$objectToArray": "$$ROOT" },
                "cond": { "$not": { "$in": [ "$$this.k", ["_id", "userId"] ] } }
              }
            },
            "as": "d",
            "in": {
              "$let": {
                "vars": {
                  "inner": {
                    "$map": {
                      "input": { "$objectToArray": "$$d.v" },
                      "as": "i",
                      "in": {
                        "k": {
                          "$cond": {
                            "if": { "$ne": [{ "$indexOfCP": ["$$i.k", "Array"] }, -1] },
                            "then": "$$d.k",
                            "else": { "$concat": ["$$d.k", "Attr"] }
                          }
                        },
                        "v": "$$i.v"
                      }
                    }
                  }
                },
                "in": {
                  "$map": {
                    "input": {
                      "$arrayElemAt": [
                        "$$inner.v",
                        { "$indexOfArray": ["$$inner.k", "$$d.k"] }
                      ]
                    },
                    "as": "v",
                    "in": {
                      "$arrayToObject": [[
                        { "k": "$$d.k", "v": "$$v" },
                        {
                          "k": { "$concat": ["$$d.k", "Attr"] },
                          "v": {
                            "$arrayElemAt": [
                              "$$inner.v",
                              { "$indexOfArray": ["$$inner.k", { "$concat": ["$$d.k", "Attr"] }] }
                            ]
                          }
                        }
                      ]]
                    }
                  }
                }
              }
            }
          }
        },
        "initialValue": [],
        "in": { "$concatArrays": [ "$$value", "$$this" ] }
      }
    }
  }},
  { "$unwind": "$data" },
  { "$replaceRoot": {
    "newRoot": {
      "$arrayToObject": {
        "$concatArrays": [
          [{ "k": "userId", "v": "$userId" }],
          { "$objectToArray": "$data" }
        ]
      }
    }
  }}
])
Which gets pretty monstrous from converting the "keys" into an array, then the "sub-keys" into an array and mapping the values from those inner arrays onto the pair of keys in output.
The key part is that $objectToArray is essentially needed to "transform" your "nested key" structures into arrays of "k" and "v" representing the "name" of the key and the "value". This gets called twice: once for the "outer" parts of the document, excluding the "constant" fields such as "_id" and "userId" from that array structure, and then a second time on each of those "array" elements in order to turn the "inner keys" into a similar "array".
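For reference, this is roughly the "k" and "v" array the first { "$objectToArray": "$$ROOT" } call produces for the sample document, before the $filter strips out "_id" and "userId":
[
  { "k": "_id", "v": ObjectId("...") },
  { "k": "userId", "v": 123 },
  { "k": "firstProperty", "v": { "firstArray": [ "x", "y", "z" ], "anotherAttr": "abc" } },
  { "k": "anotherProperty", "v": { "secondArray": [ "a", "b", "c" ], "anotherAttr": "def" } }
]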
Matching is then done using $indexOfCP to work out which "inner key" is the one for the value and which is the "Attr". The keys are then renamed here to the "outer" key value, which we can access because that's a "v" courtesy of $objectToArray.
Then for the "inner value" which is an "array", we want to $map each entry into a combined "array" which basically has the form:
[
  { "k": "firstProperty", "v": "x" },
  { "k": "firstPropertyAttr", "v": "abc" }
]
This happens for each "inner array" element, for which $arrayToObject reverses the process and turns each "k" and "v" into "key" and "value" of an object respectively.
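So, for illustration, $arrayToObject applied to the two-element array above gives back a plain object, roughly:
{ "firstProperty": "x", "firstPropertyAttr": "abc" }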
Since the output is still an "array of arrays" of the "inner keys" at this point, the $reduce wraps that output and applies $concatArrays while processing each element in order to "join" into a single array for "data".
All that remains is to simply $unwind the array produced from each source document, and then apply $replaceRoot, which is the part that actually allows "different key names" at the "root" of each document output.
The "merging" here is done by supplying an array of object of the same "k" and "v" construction notated for "userId", and "concatentating" that with the $objectToArray transform of the "data". Of course this "new array" is then converted to an object via $arrayToObject one final time, which forms the "object" argument to "newRoot" as an expression.
You do something like that when there is a large number of "named keys" that you can't really name explicitly. And it actually gives you the result you want:
/* 1 */
{
    "userId" : 123.0,
    "firstProperty" : "x",
    "firstPropertyAttr" : "abc"
}

/* 2 */
{
    "userId" : 123.0,
    "firstProperty" : "y",
    "firstPropertyAttr" : "abc"
}

/* 3 */
{
    "userId" : 123.0,
    "firstProperty" : "z",
    "firstPropertyAttr" : "abc"
}

/* 4 */
{
    "userId" : 123.0,
    "anotherProperty" : "a",
    "anotherPropertyAttr" : "def"
}

/* 5 */
{
    "userId" : 123.0,
    "anotherProperty" : "b",
    "anotherPropertyAttr" : "def"
}

/* 6 */
{
    "userId" : 123.0,
    "anotherProperty" : "c",
    "anotherPropertyAttr" : "def"
}
Named Keys without MongoDB 3.4.4 or Greater
Without the operator support as shown in the above listing, it's simply not possible for the aggregation framework to output documents with different key names.
So though it's not possible to instruct the "server" to do this via $out, you can of course simply iterate the cursor and write to a new collection:
var ops = [];

db.getCollection('myCollection').find().forEach( d => {
  ops = ops.concat(
    Object.keys(d).filter(k => ['_id','userId'].indexOf(k) === -1 )
      .map(k =>
        d[k][Object.keys(d[k]).find(ki => /Array$/.test(ki))]
          .map(v => ({
            [k]: v,
            [`${k}Attr`]: d[k][Object.keys(d[k]).find(ki => /Attr$/.test(ki))]
          }))
      )
      .reduce((acc,curr) => acc.concat(curr),[])
      .map( o => Object.assign({ userId: d.userId }, o) )
  );

  if (ops.length >= 1000) {
    db.getCollection("another_collection").insertMany(ops);
    ops = [];
  }
})

if ( ops.length > 0 ) {
  db.getCollection("another_collection").insertMany(ops);
  ops = [];
}
Same sort of thing as is being done in the earlier aggregation, but just "externally". It essentially produces, for each document, an array of output documents matching the "inner" arrays, like so:
[
    {
        "userId" : 123.0,
        "firstProperty" : "x",
        "firstPropertyAttr" : "abc"
    },
    {
        "userId" : 123.0,
        "firstProperty" : "y",
        "firstPropertyAttr" : "abc"
    },
    {
        "userId" : 123.0,
        "firstProperty" : "z",
        "firstPropertyAttr" : "abc"
    },
    {
        "userId" : 123.0,
        "anotherProperty" : "a",
        "anotherPropertyAttr" : "def"
    },
    {
        "userId" : 123.0,
        "anotherProperty" : "b",
        "anotherPropertyAttr" : "def"
    },
    {
        "userId" : 123.0,
        "anotherProperty" : "c",
        "anotherPropertyAttr" : "def"
    }
]
These get "cached" into a big array, which when that reaches a length of 1000 or more is finally written to the new collection via .insertMany(). Of course that requires "back and forth" communication with the server, but it does get the job done in the most efficient way possible if you don't have the features available for the previous aggregation.
Conclusion
The overall point here is that unless you actually have a MongoDB that supports it, then you are not going to get documents with "different key names" in the output, solely from the aggregation pipeline.
So when you do not have that support, you either go with the first option and use $out, discarding named keys, or you take the final approach and simply manipulate the cursor results and write them back to the new collection.

Project array of Objects to Key Value

After aggregation I get this result:
{
    "_id" : {
        "date" : ISODate("2017-08-30T00:00:00.000Z")
    },
    "aggr" : [
        {
            "gender" : "unknown",
            "count" : 365
        },
        {
            "gender" : "male",
            "count" : 2
        }
    ]
}
Is it possible to convert this into the format below?
{
    "date" : ISODate("2017-08-30T00:00:00.000Z"),
    "unknown" : 365,
    "male" : 2
}
I tried using $unwind and $project, but couldn't convert the array objects into key/value pairs.
Yes, using $arrayToObject and $map to convert the existing array to a format it accepts:
db.collection.aggregate([
  { "$replaceRoot": {
    "newRoot": {
      "$arrayToObject": {
        "$concatArrays": [
          [{ "k": "date", "v": "$_id.date" }],
          { "$map": {
            "input": "$aggr",
            "in": { "k": "$$this.gender", "v": "$$this.count" }
          }}
        ]
      }
    }
  }}
])
Of course if this is actually only on the "tail" of an existing aggregation and you don't have at least MongoDB 3.4.4 where the operator is introduced, then you can simply reshape the result in the client code:
db.collection.aggregate([
  // existing pipeline
]).map(d =>
  Object.assign(
    { date: d._id.date },
    d.aggr.reduce((acc,curr) =>
      Object.assign(acc, { [curr.gender]: curr.count }), {}
    )
  )
)

Find maximum length of data in keys for the collection

{
    "_id" : ObjectId("59786a62a96166007d7e364dsadasfafsdfsdgdfgfd"),
    "someotherdata" : {
        "place1" : "lwekjfrhweriufesdfwergfwr",
        "place2" : "sgfertgryrctshyctrhysdthc ",
        "place3" : "sdfsdgfrdgfvk",
        "place4" : "asdfkjaseeeeeeeeeeeeeeeeefjnhwklegvds."
    }
}
I have thousands of these in my collection. I need to look through all the someotherdata and do the following:
Check to see if the key is present (in some records I have place1 and not place4)
Find the longest value (in terms of string length)
The output must look something like this (showing the character count of the longest value for each key):
{
    place1: 123,
    place2: 12,
    place3: 17,
    place4: 445
}
I'm using MongoDB 3.2.9, so I don't have access to the newer aggregation operators. But I do have the MongoDB shell.
EDIT: To be clear, I want the longest throughout the whole collection. So there might be 1000 documents, but only one result with the longest length for each field across the whole collection.
Use .mapReduce() for this to reduce down to the largest values for each key:
db.collection.mapReduce(
  function() {
    emit(null,
      Object.keys(this.someotherdata)
        .map(k => ({ [k]: this.someotherdata[k].length }))
        .reduce((acc,curr) => Object.assign(acc,curr),{})
    );
  },
  function(key,values) {
    var result = {};
    values.forEach(value => {
      Object.keys(value).forEach(k => {
        if (!result.hasOwnProperty(k))
          result[k] = 0;
        if ( value[k] > result[k] )
          result[k] = value[k];
      });
    });
    return result;
  },
  {
    "out": { "inline": 1 },
    "query": { "someotherdata": { "$exists": true } }
  }
)
Which basically emits the "length" of each key present in the sub-document path for each document, and then in "reduction", only the largest "length" for each key is actually returned.
Note that in mapReduce you need to put out the same structure you put in, since the way it deals with a large number of documents is by "reducing" in gradual batches. Which is why we emit in numeric form, just like the "reduce" function does.
This gives the following output for the document shown in the question. Of course it's the "max" across all documents in the collection when you have more:
{
    "_id" : null,
    "value" : {
        "place1" : 25.0,
        "place2" : 26.0,
        "place3" : 13.0,
        "place4" : 38.0
    }
}
For the interested, the context of the question is in fact that features of MongoDB 3.4 were not available to them. But to do the same thing using .aggregate() where the features are available:
db.collection.aggregate([
  { "$match": { "someotherdata": { "$exists": true } } },
  { "$project": {
    "_id": 0,
    "someotherdata": {
      "$map": {
        "input": { "$objectToArray": "$someotherdata" },
        "as": "s",
        "in": { "k": "$$s.k", "v": { "$strLenCP": "$$s.v" } }
      }
    }
  }},
  { "$unwind": "$someotherdata" },
  { "$group": {
    "_id": "$someotherdata.k",
    "v": { "$max": "$someotherdata.v" }
  }},
  { "$sort": { "_id": 1 } },
  { "$group": {
    "_id": null,
    "data": {
      "$push": { "k": "$_id", "v": "$v" }
    }
  }},
  { "$replaceRoot": {
    "newRoot": {
      "$arrayToObject": "$data"
    }
  }}
])
With the same output:
{
    "place1" : 25,
    "place2" : 26,
    "place3" : 13,
    "place4" : 38
}
Alternatively, use cursor.forEach to iterate through the collection.
Keep track of the longest placeN values (starting from -1 and updating whenever a greater length is found), then print the values with print() or printjson(); a rough sketch follows.
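A rough sketch of that shell approach, assuming the same someotherdata structure as above:
var longest = {};

db.collection.find({ "someotherdata": { "$exists": true } }).forEach(doc => {
  Object.keys(doc.someotherdata).forEach(k => {
    var len = doc.someotherdata[k].length;
    if ( !longest.hasOwnProperty(k) || len > longest[k] )
      longest[k] = len;                 // keep the largest length seen so far for each key
  });
});

printjson(longest);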

How can I remove a duplicate object from a MongoDB array?

My data looks like this:
{
    "foo_list": [
        {
            "id": "98aa4987-d812-4aba-ac20-92d1079f87b2",
            "name": "Foo 1",
            "slug": "foo-1"
        },
        {
            "id": "98aa4987-d812-4aba-ac20-92d1079f87b2",
            "name": "Foo 1",
            "slug": "foo-1"
        },
        {
            "id": "157569ec-abab-4bfb-b732-55e9c8f4a57d",
            "name": "Foo 3",
            "slug": "foo-3"
        }
    ]
}
Where foo_list is a field in a model called Bar. Notice that the first and second objects in the array are complete duplicates.
Aside from the obvious solution of switching to PostgreSQL, what MongoDB query can I run to remove duplicate entries from foo_list?
Similar answers that do not quite cut it:
https://stackoverflow.com/a/16907596/432
https://stackoverflow.com/a/18804460/432
Those answers address the case where the array holds bare strings. However, in my situation the array is filled with objects.
I hope it is clear that I am not interested in merely querying the database; I want the duplicates to be gone from the database forever.
Purely from an aggregation framework point of view there are a few approaches to this.
You can either just apply $setUnion in modern releases:
db.collection.aggregate([
  { "$project": {
    "foo_list": { "$setUnion": [ "$foo_list", "$foo_list" ] }
  }}
])
Or more traditionally with $unwind and $addToSet:
db.collection.aggregate([
  { "$unwind": "$foo_list" },
  { "$group": {
    "_id": "$_id",
    "foo_list": { "$addToSet": "$foo_list" }
  }}
])
Or if you were just interested in the duplicates only then by general grouping:
db.collection.aggregate([
  { "$unwind": "$foo_list" },
  { "$group": {
    "_id": {
      "_id": "$_id",
      "foo_list": "$foo_list"
    },
    "count": { "$sum": 1 }
  }},
  { "$match": { "count": { "$ne": 1 } } },
  { "$group": {
    "_id": "$_id._id",
    "foo_list": { "$push": "$_id.foo_list" }
  }}
])
The last form could be useful to you if you actually want to "remove" the duplicates from your data with another update statement as it identifies the elements which are duplicates.
So in that last form the returned result from your sample data identifies the duplicate:
{
    "_id" : ObjectId("53f5f7314ffa9b02cf01c076"),
    "foo_list" : [
        {
            "id" : "98aa4987-d812-4aba-ac20-92d1079f87b2",
            "name" : "Foo 1",
            "slug" : "foo-1"
        }
    ]
}
Results are returned per document in your collection that contains duplicate entries in the array, showing which entries are duplicated. This is the information you need for the update: you loop the results and use them to build the update statements that remove the duplicates.
This is actually done with two update statements per document, as a simple $pull operation would remove "both" items, which is not what you want:
var cursor = db.collection.aggregate([
  { "$unwind": "$foo_list" },
  { "$group": {
    "_id": {
      "_id": "$_id",
      "foo_list": "$foo_list"
    },
    "count": { "$sum": 1 }
  }},
  { "$match": { "count": { "$ne": 1 } } },
  { "$group": {
    "_id": "$_id._id",
    "foo_list": { "$push": "$_id.foo_list" }
  }}
])

var batch = db.collection.initializeOrderedBulkOp();
var count = 0;

cursor.forEach(function(doc) {
  doc.foo_list.forEach(function(dup) {
    batch.find({ "_id": doc._id, "foo_list": { "$elemMatch": dup } }).updateOne({
      "$unset": { "foo_list.$": "" }
    });
    batch.find({ "_id": doc._id }).updateOne({
      "$pull": { "foo_list": null }
    });
  });

  count++;
  if ( count % 500 == 0 ) {
    batch.execute();
    batch = db.collection.initializeOrderedBulkOp();
  }
});

if ( count % 500 != 0 ) {
  batch.execute();
}
That's the modern MongoDB 2.6 and above way to do it, with a cursor result from aggregation and Bulk operations for updates. But the principles remain the same:
Identify the duplicates in documents
Loop the results to issue the updates to the affected documents
Use $unset with the positional $ operator to set the "first" matched array element to null
Use $pull to remove the null entry from the array
So after processing the above operations your sample now looks like this:
{
    "_id" : ObjectId("53f5f7314ffa9b02cf01c076"),
    "foo_list" : [
        {
            "id" : "98aa4987-d812-4aba-ac20-92d1079f87b2",
            "name" : "Foo 1",
            "slug" : "foo-1"
        },
        {
            "id" : "157569ec-abab-4bfb-b732-55e9c8f4a57d",
            "name" : "Foo 3",
            "slug" : "foo-3"
        }
    ]
}
The duplicate is removed while one copy of the "duplicated" item remains intact. That is how you identify and remove the duplicate data from your collection.
