wget -qO - https://www.mongodb.org/static/pgp/server-4.2.asc | sudo apt-key add -
echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu bionic/mongodb-org/4.2 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-4.2.list
sudo apt-get update
sudo apt-get install -y mongodb-org
# if there's a package conflict with an older install, purge it first: sudo apt-get purge mongodb mongodb-server mongodb-server-core mongodb-clients mongodb-org mongo-tools
sudo systemctl start mongod
sudo systemctl status mongod
sudo systemctl enable mongod
mongo
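The last command drops you into the mongo shell; a quick sanity check there confirms the server is really up (both commands are part of the standard shell API):
db.version()               // should print 4.2.x
db.runCommand({ ping: 1 }) // should return { "ok" : 1 }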
After installing, create a new Go file (don't forget to go get the external imports; the exact commands are shown after the listing):
package main

import (
	"context"
	"fmt"
	"math/rand"
	"time"

	"github.com/kokizzu/gotro/L"
	//"github.com/simagix/keyhole/mdb"
	"go.mongodb.org/mongo-driver/bson"
	"go.mongodb.org/mongo-driver/mongo"
	"go.mongodb.org/mongo-driver/mongo/options"
)

type Score struct {
	Id    string `bson:"_id"`
	User  int64  `bson:"user"`
	RefId int64  `bson:"ref_id"`
	Epoch int64  `bson:"epoch"`
	Score int    `bson:"score"`
}
func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel() // discarding the cancel func would leak the timeout context
	client, err := mongo.Connect(ctx, options.Client().ApplyURI("mongodb://127.0.0.1:27017"))
	if L.IsError(err, `cannot connect to mongodb`) {
		return
	}
	collection := client.Database("test1").Collection("scores")
	// upsert 1 million synthetic rows; the id is derived from user+ref so reruns overwrite
	for x := 0; x < 1000*1000; x++ {
		userId := 1 + rand.Intn(1000*10)
		refId := 1 + rand.Intn(1000)
		epoch := time.Now().AddDate(0, 0, -(rand.Intn(365) + 1)).Unix() // 1..365 days ago
		score := 10 + rand.Intn(100)
		id := fmt.Sprintf("user%dref%d", userId, refId)
		update := bson.M{
			"$set": &Score{
				Id:    id,
				User:  int64(userId),
				RefId: int64(refId),
				Epoch: epoch,
				Score: score,
			},
		}
		filter := bson.M{"_id": bson.M{"$eq": id}}
		_, err = collection.UpdateOne(context.Background(), filter, update, options.Update().SetUpsert(true))
		if L.IsError(err, `failed upsert`) {
			return
		}
	}
	// query: top-10 users by total score within the last 7, 30, and 365 days
	window := []int{7, 30, 365}
	for _, delta := range window {
		fmt.Println(delta)
		epoch := time.Now().AddDate(0, 0, -delta).Unix()
		/* equivalent mongo shell query:
		db.scores.aggregate([
			{$match: {epoch: {$gt: 1251251}}},
			{$group: {_id: '$user', score: {$sum: '$score'}}},
			{$sort: {score: -1}},
			{$limit: 10}
		])
		*/
		t := time.Now()
		cursor, err := collection.Aggregate(context.Background(), []bson.M{{
			`$match`: bson.M{
				`epoch`: bson.M{`$gt`: epoch},
			},
		}, {
			`$group`: bson.M{
				`_id`:   `$user`,
				`score`: bson.M{`$sum`: `$score`},
			},
		}, {
			`$sort`: bson.M{
				`score`: -1,
			},
		}, {
			`$limit`: 10,
		}}, options.Aggregate())
		if L.IsError(err, `failed query`) {
			return
		}
		for cursor.Next(context.Background()) {
			//row := Score{} // a typed struct doesn't fit here: after $group, _id is the user id
			var row map[string]interface{}
			if err := cursor.Decode(&row); L.IsError(err, `failed decode`) {
				return
			}
			fmt.Printf("Row: %v\n", row)
		}
		cursor.Close(context.Background())
		dur := time.Since(t)
		fmt.Printf("Done in %s\n", dur)
	}
}
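To fetch the dependencies and run it, something like this should work (the module name is just a placeholder):
go mod init mongotest
go get github.com/kokizzu/gotro/L
go get go.mongodb.org/mongo-driver/mongo
go run main.go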
Well, this is the first time I've actually used MongoDB (I studied the syntax back when MongoDB still silently discarded strings that were too long, probably 8 years ago?). Also, having to pass a context on every call is quite annoying; it would be nicer if a default could be set globally on the connection, with each method split in two: one using the default context and one taking an explicit one, as in the sketch below.
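A minimal sketch of that idea, assuming the same mongo-driver; Coll, UpsertOne, UpsertOneCtx, and defaultCtx are hypothetical names I made up, not part of the official driver API:
package score

import (
	"context"
	"time"

	"go.mongodb.org/mongo-driver/bson"
	"go.mongodb.org/mongo-driver/mongo"
	"go.mongodb.org/mongo-driver/mongo/options"
)

type Coll struct {
	coll *mongo.Collection
}

// defaultCtx stands in for a connection-wide default: a fresh 10s timeout per call
func defaultCtx() (context.Context, context.CancelFunc) {
	return context.WithTimeout(context.Background(), 10*time.Second)
}

// UpsertOne is the convenience variant that uses the default context
func (c Coll) UpsertOne(filter, update bson.M) error {
	ctx, cancel := defaultCtx()
	defer cancel()
	return c.UpsertOneCtx(ctx, filter, update)
}

// UpsertOneCtx is the explicit variant for callers that bring their own context
func (c Coll) UpsertOneCtx(ctx context.Context, filter, update bson.M) error {
	_, err := c.coll.UpdateOne(ctx, filter, update, options.Update().SetUpsert(true))
	return err
}
Anyway, the result is quite nice: the performance is blazing fast for 9.5k rows: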
7
Row: map[_id:188 score:207]
Row: map[_id:491 score:205]
Row: map[_id:806 score:204]
Row: map[_id:720 score:186]
Row: map[_id:544 score:180]
Row: map[_id:682 score:163]
Row: map[_id:693 score:154]
Row: map[_id:138 score:152]
Row: map[_id:24 score:139]
Row: map[_id:174 score:127]
Done in 6.30876ms
30
Row: map[_id:693 score:467]
Row: map[_id:188 score:446]
Row: map[_id:780 score:300]
Row: map[_id:682 score:290]
Row: map[_id:570 score:280]
Row: map[_id:544 score:279]
Row: map[_id:943 score:261]
Row: map[_id:340 score:251]
Row: map[_id:714 score:250]
Row: map[_id:737 score:248]
Done in 6.561656ms
365
Row: map[_id:398 score:1346]
Row: map[_id:837 score:1324]
Row: map[_id:570 score:1180]
Row: map[_id:947 score:1173]
Row: map[_id:81 score:1165]
Row: map[_id:531 score:1121]
Row: map[_id:821 score:1105]
Row: map[_id:919 score:1104]
Row: map[_id:278 score:1095]
Row: map[_id:12 score:1091]
Done in 11.173371ms
Upsert performance on my PC is about 5k rows per second. The query syntax will take a while to get used to, unlike standard SQL; the equivalent SQL is shown below for comparison.
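For reference, the aggregation above maps to this standard SQL, assuming an equivalent scores table (the epoch literal is just a placeholder, like in the shell comment earlier):
SELECT user, SUM(score) AS score
FROM scores
WHERE epoch > 1251251
GROUP BY user
ORDER BY score DESC
LIMIT 10;
The query performance for 759k rows: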
7
Row: map[_id:47896 score:281]
Row: map[_id:56009 score:268]
Row: map[_id:84819 score:268]
Row: map[_id:69869 score:253]
Row: map[_id:74753 score:241]
Row: map[_id:89237 score:233]
Row: map[_id:138 score:221]
Row: map[_id:30651 score:215]
Row: map[_id:7545 score:213]
Row: map[_id:13532 score:213]
Done in 335.78427ms
30
Row: map[_id:693 score:467]
Row: map[_id:188 score:446]
Row: map[_id:756 score:366]
Row: map[_id:40581 score:363]
Row: map[_id:42851 score:347]
Row: map[_id:60245 score:340]
Row: map[_id:89237 score:332]
Row: map[_id:138 score:327]
Row: map[_id:22743 score:325]
Row: map[_id:858 score:325]
Done in 422.796678ms
365
Row: map[_id:85 score:1688]
Row: map[_id:837 score:1546]
Row: map[_id:398 score:1517]
Row: map[_id:531 score:1498]
Row: map[_id:406 score:1492]
Row: map[_id:821 score:1489]
Row: map[_id:570 score:1468]
Row: map[_id:81 score:1458]
Row: map[_id:178 score:1444]
Row: map[_id:947 score:1437]
Done in 1.488410158s
Now let's check the index performance. We'll add two indexes: user ascending and epoch descending:
db.scores.createIndex( { user: 1 } )
db.scores.createIndex( { epoch: -1 } )
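To double-check that both were created, list them in the mongo shell (the index names below are MongoDB's auto-generated defaults):
db.scores.getIndexes()   // should list _id_, user_1, and epoch_-1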
The query performance for the same number of rows:
7
Row: map[_id:47896 score:281]
Row: map[_id:56009 score:268]
Row: map[_id:84819 score:268]
Row: map[_id:69869 score:253]
Row: map[_id:74753 score:241]
Row: map[_id:89237 score:233]
Row: map[_id:138 score:221]
Row: map[_id:30651 score:215]
Row: map[_id:13532 score:213]
Row: map[_id:7545 score:213]
Done in 62.307086ms
30
Row: map[_id:693 score:467]
Row: map[_id:188 score:446]
Row: map[_id:756 score:366]
Row: map[_id:40581 score:363]
Row: map[_id:42851 score:347]
Row: map[_id:60245 score:340]
Row: map[_id:89237 score:332]
Row: map[_id:138 score:327]
Row: map[_id:858 score:325]
Row: map[_id:22743 score:325]
Done in 254.839226ms
365
Row: map[_id:85 score:1688]
Row: map[_id:837 score:1546]
Row: map[_id:398 score:1517]
Row: map[_id:531 score:1498]
Row: map[_id:406 score:1492]
Row: map[_id:821 score:1489]
Row: map[_id:570 score:1468]
Row: map[_id:81 score:1458]
Row: map[_id:178 score:1444]
Row: map[_id:947 score:1437]
Done in 2.849935466s
Weirdly, when the index can't filter the result set well (365 days back, which probably matches every row), performance gets almost 2x worse. That is plausible: walking the epoch index and then fetching each matching document is more work than a straight collection scan when nearly everything matches.
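To see which plan the optimizer actually chose (IXSCAN vs COLLSCAN) and how many documents were examined, explain on the same pipeline helps; the epoch literal is again just a placeholder:
db.scores.explain("executionStats").aggregate([
	{$match: {epoch: {$gt: 1251251}}},
	{$group: {_id: '$user', score: {$sum: '$score'}}},
	{$sort: {score: -1}},
	{$limit: 10}
])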