Skip to content

Commit

Permalink
post review fixup
Browse files Browse the repository at this point in the history
  • Loading branch information
benzekrimaha committed Sep 25, 2024
1 parent 92ba78e commit b92c716
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 39 deletions.
57 changes: 35 additions & 22 deletions monitoring/mongodb/alerts.test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -259,24 +259,33 @@ tests:
description: MongoDb has low disk space
summary: MongoDb has low disk space

- name: MongoDbNodeRecovering

- name: MongoDbPodRecovering
interval: 1m
input_series:
- series: mongodb_rs_members_state{namespace="zenko", pod="data-db-mongodb-sharded-mongos-0", rs_state="3"}
values: 3x60
values: 3x1440

alert_rule_test:
- alertname: MongoDbNodeRecovering
eval_time: 1h
- alertname: MongoDbPodRecovering
eval_time: 1d
exp_alerts:
- exp_labels:
namespace: zenko
severity: warning
severity: critical
pod: data-db-mongodb-sharded-mongos-0
rs_state: 3
exp_annotations:
description: "MongoDB pod `data-db-mongodb-sharded-mongos-0` has been in the 'RECOVERING' state for more than 24 hours. The instance may be failing to catch up and recover."
summary: MongoDB is recovering
- exp_labels:
namespace: zenko
pod: data-db-mongodb-sharded-mongos-0
rs_state: 3
severity: warning
exp_annotations:
description: "The Mongodb instance `data-db-mongodb-sharded-mongos-0` is in the 'RECOVERING' state for over an hour. The instance may not be able to join the replica set if the platform ingests a large number of operations during this time. This alert is expected if the 'Resync a Data Services MongoDB Member' procedure has recently been executed."
summary: MongoDB node recovering for too long
description: "MongoDB pod `data-db-mongodb-sharded-mongos-0` has been in 'RECOVERING' state for more than 1 hour. This may be expected if the 'Resync a Data Services MongoDB Member' procedure has recently been executed."
summary: MongoDB is recovering

- name: MongoDbInvalidState
interval: 1m
Expand All @@ -290,25 +299,25 @@ tests:

alert_rule_test:
- alertname: MongoDbInvalidState
eval_time: 1m
eval_time: 5m
exp_alerts:
- exp_labels:
namespace: zenko
severity: critical
pod: data-db-mongodb-sharded-mongos-0
rs_state: 6
exp_annotations:
description: "The Mongodb instance `data-db-mongodb-sharded-mongos-0` is in the `6` state. Immediate attention is required."
description: "MongoDB pod `data-db-mongodb-sharded-mongos-0` is in an invalid state (`6`)."
summary: "MongoDB node in an invalid state: 6 (UNKNOWN), 8 (DOWN), 10 (REMOVED)"

- name: MongoDbNodeStartup2
- name: MongoDbPodStartup2
interval: 1m
input_series:
- series: mongodb_rs_members_state{namespace="zenko", pod="data-db-mongodb-sharded-mongos-0", rs_state="5"}
values: 5x60

alert_rule_test:
- alertname: MongoDbNodeStartup2
- alertname: MongoDbPodStartup2
eval_time: 1h
exp_alerts:
- exp_labels:
Expand All @@ -317,26 +326,30 @@ tests:
pod: data-db-mongodb-sharded-mongos-0
rs_state: 5
exp_annotations:
description: "The Mongodb instance `data-db-mongodb-sharded-mongos-0` is in the 'STARTUP2' state for an hour. The instance might be stuck."
description: "MongoDB pod `data-db-mongodb-sharded-mongos-0` has been in the 'STARTUP2' state for more than 1 hour. Please ensure that the instance is running properly."
summary: MongoDB node in STARTUP2 state for too long


- name: MongoDbNodeNotSynced
- name: MongoDbRSNotSynced
interval: 1m
input_series:
- series: mongodb_mongod_replset_number_of_members{set="data-db-mongodb-sharded-shard-0", pod="mongodb-1"}
values: 1+0x9

- series: mongodb_rs_members_state{namespace="zenko", rs_nm="rs0", pod="data-db-mongodb-sharded-mongos-0", member_state="SECONDARY"}
values: 0x10
- series: mongodb_rs_members_state{namespace="zenko", rs_nm="rs0", pod="data-db-mongodb-sharded-mongos-1", member_state="SECONDARY"}
values: 0x10
- series: mongodb_rs_members_state{namespace="zenko", rs_nm="rs0", pod="data-db-mongodb-sharded-mongos-2", member_state="SECONDARY"}
values: 1x10

alert_rule_test:
- alertname: MongoDbNodeNotSynced
eval_time: 1m
- alertname: MongoDbRSNotSynced
eval_time: 10m
exp_alerts:
- exp_labels:
severity: critical
pod: mongodb-1
severity: warning
rs_nm: rs0
exp_annotations:
description: "The MongoDB instance `mongodb-1` is out of the replica set. It does not longer receive any data and must be added back to the cluster to avoid performance and storage problems."
summary: MongoDB node not in replica set
description: "MongoDB replica set `rs0` is not in the expected state. It currently has `1` SECONDARY members instead of the expected number. Please ensure that all instances are running properly."
summary: MongoDB replica set out of sync



43 changes: 26 additions & 17 deletions monitoring/mongodb/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -140,44 +140,53 @@ groups:
description: 'MongoDb has low disk space'
summary: 'MongoDb has low disk space'

- alert: MongoDbNodeRecovering
- alert: MongoDbPodRecovering
expr: |
avg_over_time(mongodb_rs_members_state{namespace="${namespace}", pod=~"${service}.*", rs_state="3"}[1h]) == 3
mongodb_rs_members_state{namespace="${namespace}", pod=~"${service}.*"} == 3
for: 1h
labels:
severity: warning
annotations:
description: "The Mongodb instance `{{ $labels.pod }}` is in the 'RECOVERING' state for over an hour. The instance may not be able to join the replica set if the platform ingests a large number of operations during this time. This alert is expected if the 'Resync a Data Services MongoDB Member' procedure has recently been executed."
summary: MongoDB node recovering for too long
description: "MongoDB pod `{{ $labels.pod }}` has been in 'RECOVERING' state for more than 1 hour. This may be expected if the 'Resync a Data Services MongoDB Member' procedure has recently been executed."
summary: MongoDB is recovering

- alert: MongoDbPodRecovering
expr: |
mongodb_rs_members_state{namespace="${namespace}", pod=~"${service}.*"} == 3
for: 1d
labels:
severity: critical
annotations:
description: "MongoDB pod `{{ $labels.pod }}` has been in the 'RECOVERING' state for more than 24 hours. The instance may be failing to catch up and recover."
summary: MongoDB is recovering

- alert: MongoDbInvalidState
expr: |
avg_over_time(mongodb_rs_members_state{namespace="${namespace}", pod=~"${service}.*", rs_state=~"6|8|10"}[1m]) > 0
for: 1m
mongodb_rs_members_state{namespace="${namespace}", pod=~"${service}.*", rs_state=~"6|8|10"} > 0
for: 5m
labels:
severity: critical
annotations:
description: "The Mongodb instance `{{ $labels.pod }}` is in the `{{ $labels.rs_state }}` state. Immediate attention is required."
description: "MongoDB pod `{{ $labels.pod }}` is in an invalid state (`{{ $labels.rs_state }}`)."
summary: "MongoDB node in an invalid state: 6 (UNKNOWN), 8 (DOWN), 10 (REMOVED)"


- alert: MongoDbNodeStartup2
- alert: MongoDbPodStartup2
expr: |
avg_over_time(mongodb_rs_members_state{namespace="${namespace}", pod=~"${service}.*", rs_state="5"}[1h]) == 5
mongodb_rs_members_state{namespace="${namespace}", pod=~"${service}.*"} == 5
for: 1h
labels:
severity: warning
annotations:
description: "The Mongodb instance `{{ $labels.pod }}` is in the 'STARTUP2' state for an hour. The instance might be stuck."
description: "MongoDB pod `{{ $labels.pod }}` has been in the 'STARTUP2' state for more than 1 hour. Please ensure that the instance is running properly."
summary: MongoDB node in STARTUP2 state for too long

- alert: MongoDbNodeNotSynced
- alert: MongoDbRSNotSynced
expr: |
sum by (pod) (mongodb_mongod_replset_number_of_members{set="data-db-mongodb-sharded-shard-0"}) != ${replicas}
for: 1m
sum by (rs_nm) (mongodb_rs_members_state{namespace="${namespace}", pod=~"${service}.*", member_state="SECONDARY"}) != (${replicas} - 1)
for: 10m
labels:
severity: critical
severity: warning
annotations:
description: "The MongoDB instance `{{ $labels.pod }}` is out of the replica set. It does not longer receive any data and must be added back to the cluster to avoid performance and storage problems."
summary: MongoDB node not in replica set

description: "MongoDB replica set `{{ $labels.rs_nm }}` is not in the expected state. It currently has `{{ $value }}` SECONDARY members instead of the expected number. Please ensure that all instances are running properly."
summary: MongoDB replica set out of sync

0 comments on commit b92c716

Please sign in to comment.