I have a collection of various documents similar to what is shown below as 3 objects.
{
comment:{
text_sentiment: "positive",
topic: "A"
}
}, // DOC-1
{
comment:{
text_sentiment: "negative",
topic: "A"
}}, // DOC-2
{
comment:{
text_sentiment: "positive",
topic: "B"
}},..//DOC-3 ..
I want to write an aggregation that returns results in the following structure:
{
topic: "A",
topicOccurance: 2,
sentiment: {
positive: 3,
negative: 2,
neutral: 0
}
},...
I have written an aggregation that is able to group for topic and text_sentiment but I do not know, how can I create a structure similar to the one shown above. Here is the aggregation that I created.
db.MyCollection.aggregate({
$match: {
_id: "xyz",
"comment.topic": {$exists: 1},
}
},{
$group: {
_id: {
topic: "$comment.topic",
text_sentiment: "$comment.text_sentiment"
},
total: {$sum: 1},
}
},{
$project: {
topic: {
name: "$_id.topic",
occurence: "$total"
},
sentiment: "$_id.text_sentiment"
}
},{
$sort: {"topic.occurence": -1}
})
It grouped by topic and sentiment, but the structure does not match the one above. How can I get a similar structure?
Answer 1
You need 2 $group stages.
$match
$group - Group by comment.topic and comment.topic and $sum.
$group - Group by _id.topic, $sum; and add text_sentiment and total from previous stage into text_sentiments via $push.
$project - Decorate output documents. Set sentiment with converting from text_sentiments array to key-value pair via $arrayToObject.
$sort
db.collection.aggregate([
{
$match: {
_id: "xyz",
"comment.topic": {
$exists: 1
},
}
},
{
$group: {
_id: {
topic: "$comment.topic",
text_sentiment: "$comment.text_sentiment"
},
total: {
$sum: 1
},
}
},
{
$group: {
_id: "$_id.topic",
total: {
$sum: 1
},
text_sentiments: {
$push: {
k: "$_id.text_sentiment",
v: "$total"
}
}
}
},
{
$project: {
topic: "$_id",
topicOccurance: "$total",
sentiment: {
"$arrayToObject": "$text_sentiments"
}
}
},
{
$sort: {
"topicOccurance": -1
}
}
])
Sample Mongo Playground (Answer 1)
Answer 2
As mentioned text_sentiment values are fixed, you can use the query below:
db.collection.aggregate([
{
$match: {
_id: "xyz",
"comment.topic": {
$exists: 1
},
}
},
{
$group: {
_id: "$comment.topic",
total: {
$sum: 1
},
text_sentiments: {
$push: "$comment.text_sentiment"
}
}
},
{
$project: {
topic: "$_id",
topicOccurance: "$total",
sentiment: {
"positive": {
$reduce: {
input: "$text_sentiments",
initialValue: 0,
in: {
$sum: [
"$$value",
{
"$cond": {
"if": {
$eq: [
"$$this",
"positive"
]
},
"then": 1,
"else": 0
}
}
]
}
}
},
"negative": {
$reduce: {
input: "$text_sentiments",
initialValue: 0,
in: {
$sum: [
"$$value",
{
"$cond": {
"if": {
$eq: [
"$$this",
"negative"
]
},
"then": 1,
"else": 0
}
}
]
}
}
},
"neutral": {
$reduce: {
input: "$text_sentiments",
initialValue: 0,
in: {
$sum: [
"$$value",
{
"$cond": {
"if": {
$eq: [
"$$this",
"neutral"
]
},
"then": 1,
"else": 0
}
}
]
}
}
}
}
}
},
{
$sort: {
"topicOccurance": -1
}
}
])
Disadvantage: When the text_sentiment value is added/removed, then you have to modify the query.
Sample Mongo Playground (Answer 2)
Answer 3
Another approach similar to Answer 2 is using $size and $filter to replace $reduce.
db.collection.aggregate([
{
$match: {
//_id: "xyz",
"comment.topic": {
$exists: 1
},
}
},
{
$group: {
_id: "$comment.topic",
total: {
$sum: 1
},
text_sentiments: {
$push: "$comment.text_sentiment"
}
}
},
{
$project: {
topic: "$_id",
topicOccurance: "$total",
sentiment: {
"positive": {
$size: {
$filter: {
input: "$text_sentiments",
cond: {
$eq: [
"$$this",
"positive"
]
}
}
}
},
"negative": {
$size: {
$filter: {
input: "$text_sentiments",
cond: {
$eq: [
"$$this",
"negative"
]
}
}
}
},
"neutral": {
$size: {
$filter: {
input: "$text_sentiments",
cond: {
$eq: [
"$$this",
"neutral"
]
}
}
}
},
}
}
},
{
$sort: {
"topicOccurance": -1
}
}
])
Sample Mongo Playground (Answer 3)
Related
Example Documents:
[
{"subdocument": {"value":100,"additionalValue":300}},
{"subdocument": {"value":100} // additionalValue doesn't exist on this one
]
What I want: at the end of my aggregation:
{
"largest": // The entire first item because 300 is the highest overall value
"smallest": // The entire second item because 100 is the smallest "" average
"average": 150 // 1st item average is 200, 2nd item is 100 and their combined average is 150
}
What I did:
{ $sort: { 'subdocument.value': -1 } },
{
$group: {
_id: null,
average: { $avg: '$subdocument.value' },
items: { $push: '$$ROOT' },
},
},
{ $set: { largest: { $first: '$items' } } },
{ $set: { smallest: { $last: '$items' } } },
{ $project: { largest: 1, smallest: 1, average: 1 } },
But this does not include the additonalValue field.
And I don't know of any way to get a "larger one" expression
something like: average: { $max: { $larger: ['$subdocument.value', 'subdocument.additonalValue'] }},
IMPORTANT NOTE: the additionalValue field is optional.
Simply applying the operation twice on the values.
db.collection.aggregate([
{
$group: {
_id: null,
largest: {
$max: {
$max: [
"$subdocument.value",
"$subdocument.additionalValue"
]
}
},
smallest: {
$min: {
$min: [
"$subdocument.value",
"$subdocument.additionalValue"
]
}
},
average: {
$avg: {
$avg: [
"$subdocument.value",
"$subdocument.additionalValue"
]
}
}
}
},
{
"$lookup": {
"from": "collection",
"let": {
l: "$largest"
},
"pipeline": [
{
$match: {
$expr: {
$eq: [
"$$l",
{
$max: [
"$subdocument.value",
"$subdocument.additionalValue"
]
}
]
}
}
}
],
"as": "largestItems"
}
},
{
"$lookup": {
"from": "collection",
"let": {
s: "$smallest"
},
"pipeline": [
{
$match: {
$expr: {
$eq: [
"$$s",
{
$min: [
"$subdocument.value",
"$subdocument.additionalValue"
]
}
]
}
}
}
],
"as": "smallestItems"
}
}
])
Mongo Playground
Just as reference the solution by #ray combined with a filter instead of lookup that I came up with:
{ $sort: { 'subdoc.value': -1 } },
{
$group: {
_id: null,
min: { $min: '$subdoc.value' },
average: {
$avg: { $avg: ['$subdoc.value', '$subdoc.additionalValue'] },
},
max: {
$max: { $max: ['$subdoc.additionalValue', '$subdoc.value'] },
},
items: { $push: '$$ROOT' },
},
},
{
$set: {
largest: {
$first: {
$filter: {
input: '$items',
as: 'item',
cond: {
$eq: [
{
$max: [
'$$item.subdoc.value',
'$$item.subdoc.additionalValue',
],
},
'$max',
],
},
},
},
},
},
},
{ $set: { smallest: { $last: '$items' } } },
{ $project: { largest: 1, smallest: 1, averaget: 1 } },
I have this pipeline :
let pipeline = [
{
$match: {
date: { $gte: new Date("2022-10-19"), $lte: new Date("2022-10-26") },
},
},
{
$group: {
_id: "$date",
tasks: { $push: "$$ROOT" },
},
},
{
$sort: { _id: -1 },
},
];
const aggregationData = await ScheduleTaskModel.aggregate(pipeline);
where i group all "tasks" between a date range by date and i get that result :
[
{
"date": "2022-10-21T00:00:00.000Z",
"tasks": [...tasks with this date]
},
{
"date": "2022-10-20T00:00:00.000Z",
"tasks": [...tasks with this date]
}
]
as you see i have "tasks" only for 2 dates in that range,what if i want all dates to appear even the ones with no tasks so it would be like this with empty arrays ?
[
{
"date": "2022-10-26T00:00:00.000Z",
"tasks": []
},
{
"date": "2022-10-25T00:00:00.000Z",
"tasks": []
},
{
"date": "2022-10-24T00:00:00.000Z",
"tasks": []
},
{
"date": "2022-10-23T00:00:00.000Z",
"tasks": []
},
{
"date": "2022-10-22T00:00:00.000Z",
"tasks": []
},
{
"date": "2022-10-21T00:00:00.000Z",
"tasks": [...tasks with this date]
},
{
"date": "2022-10-20T00:00:00.000Z",
"tasks": [...tasks with this date]
},
{
"date": "2022-10-19T00:00:00.000Z",
"tasks": []
},
]
i tried to use $densify but unfortunately it requires upgrading my mongoDb atlas cluster which is not possible..
The answer of #WernfriedDomscheitAnother has a downside of grouping together all the documents in the collection, creating one large document, while a document has a size limit. A variation on it, without this downside, can be:
$match only the relevant document, same as in your current query
Use $facet to handle the case of no relevant documents at all. This will allow you to group all the relevant documents as you did in your query, but to keep a working-document even if there are any.
Add the relevant dates inside an array (since we use $facet this will happen even if the first match is empty)
Concatenate the array of matched data with the array of empty entries, use the real-data first.
$unwind the separate the documents by date, and $group again by date to remove the duplicates.
Format the result
db.collection.aggregate([
{$match: {date: {$gte: new Date("2022-10-19"), $lte: new Date("2022-10-26")}}},
{$facet: {
data: [
{$group: {_id: "$date", tasks: {$push: "$$ROOT"}}},
{$project: {date: "$_id", tasks: 1}}
]
}},
{$addFields: {
dates: {$map: {
input: {$range: [0, 8]},
// maybe more dynamic with $dateDiff -> { $dateDiff: { startDate: new Date("2022-10-19"), endDate: new Date("2022-10-26") }, unit: "day" } }
in: {
date: {$dateAdd: {
startDate: ISODate("2022-10-19T00:00:00.000Z"),
unit: "day",
amount: "$$this"
}},
tasks: []
}
}}
}},
{$project: {data: {$concatArrays: ["$data", "$dates"]}}},
{$unwind: "$data"},
{$group: {_id: "$data.date", "tasks": {$first: "$data.tasks"}}},
{$project: { _id: 0, date: "$_id", tasks: 1 }},
{$sort: { date: -1 }},
])
See how it works on the playground example
New function $densify would be the simplest, of course. The manual way of doing it would be this one:
db.collection.aggregate([
{
$group: {
_id: null,
data: { $push: "$$ROOT" }
}
},
{
$set: {
dates: {
$map: {
input: { $range: [ 0, 8 ] }, // maybe more dynamic with $dateDiff -> { $dateDiff: { startDate: new Date("2022-10-19"), endDate: new Date("2022-10-26") }, unit: "day" } }
in: {
date: {
$dateAdd: {
startDate: ISODate("2022-10-19T00:00:00.000Z"),
unit: "day",
amount: "$$this"
}
}
}
}
}
}
},
{
$set: {
dates: {
$map: {
input: "$dates",
as: "d",
in: {
$mergeObjects: [
"$$d",
{
tasks: {
$filter: {
input: "$data",
cond: { $eq: [ "$$d.date", "$$this.date" ] }
}
}
}
]
}
}
}
}
},
{
$project: {
data: {
$map: {
input: "$dates",
in: {
$cond: {
if: { $eq: [ "$$this.tasks", [] ] },
then: "$$this",
else: { $first: "$$this.tasks" }
}
}
}
}
}
},
{ $unwind: "$data" },
{ $replaceWith: "$data" }
])
Mongo Playground
I'm working on a feature that recursively looks up a thread of comments using $graphLookUp, and I almost have it. (albeit in a somewhat convoluted way but it is working!)
The last step I need is the following:
instead of having the nested posteriorThread as a property of the root array ($$ROOT), just merge it onto the root itself.
AGGREGATION CODE:
const posteriorThread = await Comment.aggregate([
{
$match: {
_id: post.threadDescendant
}
},
{
$graphLookup: {
from: 'baseposts',
startWith: '$threadDescendant',
connectFromField: 'threadDescendant',
connectToField: '_id',
as: 'posteriorThread'
}
},
{
$unwind: '$posteriorThread'
},
{
$sort: { 'posteriorThread.depth': 1 }
},
{
$group: {
_id: '$_id',
posteriorThread: { $push: '$posteriorThread' },
root: { $first: '$$ROOT' }
}
},
{
$project: {
'root.posteriorThread': 0
}
},
{
$replaceRoot: {
newRoot: {
$mergeObjects: [
{
posteriorThread: '$posteriorThread'
},
'$root'
]
}
}
}
]);
CURRENT OUTPUT
OUTPUT: posteriorThread
[
{
_id: '5f7eab40575e6fc56ee07604',
onModel: 'BasePost',
depth: 1,
user: '5f5da45245c07cc06e51b09f',
text: 'thread 0',
isThread: true,
threadDescendant: '5f7eabad575e6fc56ee07607',
posteriorThread: [
{
_id: '5f7eabad575e6fc56ee07607',
onModel: 'Comment',
depth: 2,
user: '5f5da45245c07cc06e51b09f',
text: 'thread 1',
isThread: true,
threadDescendant: '5f7eac82575e6fc56ee07609'
},
{
_id: '5f7eac82575e6fc56ee07609',
onModel: 'Comment',
depth: 3,
user: '5f5da45245c07cc06e51b09f',
text: 'thread 2',
isThread: true
}
]
}
];
DESIRED OUTPUT
OUTPUT: posteriorThread
[
{
_id: '5f7eab40575e6fc56ee07604',
onModel: 'BasePost',
depth: 1,
user: '5f5da45245c07cc06e51b09f',
text: 'thread 0',
isThread: true,
threadDescendant: '5f7eabad575e6fc56ee07607'
},
{
_id: '5f7eabad575e6fc56ee07607',
onModel: 'Comment',
depth: 2,
user: '5f5da45245c07cc06e51b09f',
text: 'thread 1',
isThread: true,
threadDescendant: '5f7eac82575e6fc56ee07609'
},
{
_id: '5f7eac82575e6fc56ee07609',
onModel: 'Comment',
depth: 3,
user: '5f5da45245c07cc06e51b09f',
text: 'thread 2',
isThread: true
}
];
I could accomplish this after the aggregation in regular js, but I would prefer to do it all in the aggregation.
The part that needs to be replaced is the mergeObjects bit and replaced with something else or the group aggregation and taking a different strategy, but I'm not sure what to put in it's place.
Also, if you have any other suggestions to make this cleaner, I'm all ears.
Thanks in advance.
Its really challenging. Atleast for me. And really very interesting case. Lets try my solution. Hope it works..
db.test.aggregate([
// PREVIOSU STEPS YOU ALREADY DID
{
$group: {
_id: "$_id",
items: {$push: "$$ROOT"},
subItems: {$first: "$posteriorThread"}
}
},
{
$project: {
"items.posteriorThread": 0
}
},
{
$addFields: {
allItems: {
$concatArrays: ["$items", "$subItems"]
}
}
},
{
$group: {
_id: null,
mergedItems: {$push: "$allItems"}
}
},
{
$unwind: "$mergedItems"
},
{
$unwind: "$mergedItems"
},
{
$replaceRoot: {
newRoot: "$mergedItems"
}
}
])
Thanks to #Sunil K Samanta for steering me in the right direction. It's not the prettiest solution, but it does give me the right solution.
const posteriorThread = await Comment.aggregate([
{
$match: {
_id: post.threadDescendant
}
},
{
$graphLookup: {
from: 'baseposts',
startWith: '$threadDescendant',
connectFromField: 'threadDescendant',
connectToField: '_id',
as: 'posteriorThread'
}
},
{
$unwind: '$posteriorThread'
},
{
$sort: { 'posteriorThread.depth': 1 }
},
{
$group: {
_id: '$_id',
items: { $push: '$$ROOT.posteriorThread' },
root: { $push: '$$ROOT' },
},
},
{
$project: {
items: 1,
root: { $slice: ['$$ROOT.root', 0, 1] },
},
},
{
$project: {
'root.posteriorThread': 0,
},
},
{
$addFields: {
allItems: {
$concatArrays: ['$root', '$items'],
},
},
},
{
$replaceRoot: {
newRoot: {
full_posterior: '$$ROOT.allItems',
},
},
},
])
)[0].full_posterior;
I am doing a $lookup to find 'events' where a customer is an attendee. The list of attendants is an array like this:
attendee: [{customer: <ID>}]
I tried this but it always returns an empty array:
$lookup: {
from: "events",
let: { customer: "$_id" },
pipeline: [
{
$match: {
$expr: {
$and: [
{ $eq: ['$attendee.customer', '$$customer'] },
]
},
}
},
{ $limit: 1 },
{ $sort: {start: -1} },
{ $project: { id: "$_id", start: 1, end: 1, name: 1, host: 1 } },
],
as: "event"
}
You are matching the fields on array so just replace $eq to $in
Your new code will be
$lookup: {
from: "events",
let: { customer: "$_id" },
pipeline: [
{
$match: {
$expr: {
$in: [
"$attendee.customer",
"$$customer"
]
},
}
},
{ $limit: 1 },
{ $sort: {start: -1} },
{ $project: { id: "$_id", start: 1, end: 1, name: 1, host: 1 } },
],
as: "event"
}
I feel like this could be really complex. I know this is possible in JS by taking each record, determine the difference in months, then increase each model.month by 1 within a for loop that stops once the months difference has been met. I just can't wrap my head around the possibility with aggregation. Any help would be a lifesaver!
Here is my data:
{
model: car
serial: bbbaaa
warranty_start_date: 03/05/2018
warranty_end_date: 06/16/2018
},
{
model: car
serial: jjjwww
warranty_start_date: 03/02/2018
warranty_end_date: 05/20/2018
},
{
model: truck
serial: tttvvv
warranty_start_date: 05/06/2016
warranty_end_date: 07/15/2016
}
This is how I want it to end up:
{
model: car
in_warranty_these_months: { 03/2018: 2, 04/2018: 2, 05/2018: 2, 06/2018: 1 }
},
{
model: truck
in_warranty_these_months: { 05/2016: 1, 06/2016: 1, 07/2016: 1 }
}
UPDATE
with major help from #mickl,
the below code works like a charm:
db.col.aggregate([
{
$addFields: {
monthsRange: {
$range: [
{ $add: [
{ $multiply: [12, {$year: "$warranty_start_date"}] },
{$month: "$warranty_start_date"} ]
},
{ $add: [
{ $multiply: [12, {$year: "$warranty_end_date"}] },
{ $add: [{$month: "$warranty_end_date"}, 1] } ]
}, 1]
}
}
},
{
$unwind: "$monthsRange"
},
{
$group: {
_id: { model: "$model", month: "$monthsRange" },
count: {$sum:1}
}
},
{
$group: {
_id: "$_id.model",
pairs: {
$push: {
k: {
$dateToString: {
format: "%m_%Y",
date: {
$dateFromParts: {
day: 1,
month: {
$cond: [{
$eq: [0, {
$mod: ["$_id.month", 12]
}]
}, {
$trunc: 12
}, {
$mod: ["$_id.month", 12]
}]
},
year: {
$cond: [{
$eq: [0, {
$mod: ["$_id.month", 12]
}]
}, {
$subtract: [{
$trunc: {
$divide: ["$_id.month", 12]
}
}, 1]
}, {
$trunc: {
$divide: ["$_id.month", 12]
}
}]
}
}
}
}
},
v: "$count"
}
}
}
},
{
$project: {
_id: 0,
model: "$_id",
in_warranty_these_months: {
$arrayToObject: "$pairs"
}
}
}
])
You can use following aggrregation:
db.col.aggregate([
{
$addFields: {
monthsRange: {
$range: [
{ $add: [
{ $multiply: [12, {$year: "$warranty_start_date"}] },
{$month: "$warranty_start_date"} ]
},
{ $add: [
{ $multiply: [12, {$year: "$warranty_end_date"}] },
{ $add: [{$month: "$warranty_end_date"}, 1] } ]
}, 1]
}
}
},
{
$unwind: "$monthsRange"
},
{
$group: {
_id: { model: "$model", month: "$monthsRange" },
count: {$sum:1}
}
},
{
$group: {
_id: "$_id.model",
pairs: {
$push: {
k: {
$dateToString: {
format: "%m/%Y",
date: {
$dateFromParts: {
day: 1,
month: { $add: [{ $mod: [ "$_id.month", 12 ] }, 1] },
year: { $trunc: { $divide: [ "$_id.month", 12 ] } }
}
}
}
},
v: "$count"
}
}
}
},
{
$project: {
_id: 0,
model: "$_id",
in_warranty_these_months: {
$arrayToObject: "$pairs"
}
}
}
])
Basically you have to generate ranges (using $range) based on your dates. To do that you can convert your dates to numbers based on following formula: 12 *year + month. This will give you a possibility to use $range to generate four values for first document, three values for second, etc.
Then you can use $group to count each month per model.
In last step we want to use $arrayToObject so we have to transform our data to objects with two properties, k and v. To do that we have to transform our numbers back to desired format using $dateFromParts and $dateToString