Skip to content

Commit

Permalink
Alerts for high kubernetes memory and cpu utilization (#542)
Browse files Browse the repository at this point in the history
* alerts for k8s memory and cpu utilization > 90%

Signed-off-by: Ceridwen Coghlan <cdriskill@google.com>

* swap to allocatable utilization

Signed-off-by: Ceridwen Coghlan <cdriskill@google.com>

---------

Signed-off-by: Ceridwen Coghlan <cdriskill@google.com>
  • Loading branch information
Ceridwen Coghlan authored Mar 7, 2023
1 parent a8b1c4d commit 4c4a392
Showing 1 changed file with 86 additions and 0 deletions.
86 changes: 86 additions & 0 deletions terraform/gcp/modules/monitoring/infra/alerts.tf
Original file line number Diff line number Diff line change
Expand Up @@ -246,3 +246,89 @@ resource "google_monitoring_alert_policy" "kms_crypto_request_alert" {
notification_channels = local.notification_channels
project = var.project_id
}

### Kubernetes Alerts

# Kubernetes Node Memory Allocatable Utilization > 90%
resource "google_monitoring_alert_policy" "k8s_container_memory_allocatable_utilization" {
  # Alert policy: fires when a Kubernetes node's memory allocatable utilization
  # (5-minute mean) goes above 90%.
  #
  # NOTE: the Terraform resource name still says "k8s_container" for
  # state/address compatibility, but the metric being watched is node-level.

  # In the absence of data, incident will auto-close in 7 days (604800s).
  alert_strategy {
    auto_close = "604800s"
  }

  # Only one condition is defined, so the combiner has no practical effect.
  combiner = "OR"

  conditions {
    condition_threshold {
      # Average each time series over 5-minute windows before comparing.
      aggregations {
        alignment_period   = "300s"
        per_series_aligner = "ALIGN_MEAN"
      }

      comparison = "COMPARISON_GT"
      # "0s": a single aligned point above the threshold is enough to fire.
      duration = "0s"
      # kubernetes.io/node/* metrics are written against the k8s_node monitored
      # resource; filtering on k8s_container matches no time series, which
      # would leave this alert permanently silent.
      filter = "metric.type=\"kubernetes.io/node/memory/allocatable_utilization\" resource.type=\"k8s_node\""
      # Utilization is a fraction of allocatable memory; 0.9 == 90%.
      threshold_value = "0.9"

      # Fire as soon as at least one time series violates the threshold.
      trigger {
        count   = "1"
        percent = "0"
      }
    }

    display_name = "Kubernetes Node - Memory Allocatable Utilization [MEAN]"
  }

  display_name = "Kubernetes Node Memory Allocatable Utilization > 90%"

  documentation {
    content   = "Kubernetes Node using >90% of allocatable memory. Please investigate possible memory leak."
    mime_type = "text/markdown"
  }

  enabled               = "true"
  notification_channels = local.notification_channels
  project               = var.project_id
}

# Kubernetes Node CPU Allocatable Utilization > 90%
resource "google_monitoring_alert_policy" "k8s_container_cpu_allocatable_utilization" {
  # Alert policy: fires when a Kubernetes node's CPU allocatable utilization
  # (5-minute mean) goes above 90%.
  #
  # NOTE: the Terraform resource name still says "k8s_container" for
  # state/address compatibility, but the metric being watched is node-level.

  # In the absence of data, incident will auto-close in 7 days (604800s).
  alert_strategy {
    auto_close = "604800s"
  }

  # Only one condition is defined, so the combiner has no practical effect.
  combiner = "OR"

  conditions {
    condition_threshold {
      # Average each time series over 5-minute windows before comparing.
      aggregations {
        alignment_period   = "300s"
        per_series_aligner = "ALIGN_MEAN"
      }

      comparison = "COMPARISON_GT"
      # "0s": a single aligned point above the threshold is enough to fire.
      duration = "0s"
      # kubernetes.io/node/* metrics are written against the k8s_node monitored
      # resource; filtering on k8s_container matches no time series, which
      # would leave this alert permanently silent.
      filter = "metric.type=\"kubernetes.io/node/cpu/allocatable_utilization\" resource.type=\"k8s_node\""
      # Utilization is a fraction of allocatable CPU; 0.9 == 90%.
      threshold_value = "0.9"

      # Fire as soon as at least one time series violates the threshold.
      trigger {
        count   = "1"
        percent = "0"
      }
    }

    display_name = "Kubernetes Node - CPU Allocatable Utilization [MEAN]"
  }

  display_name = "Kubernetes Node CPU Allocatable Utilization > 90%"

  documentation {
    content   = "Kubernetes Node using >90% of allocatable CPU. Please investigate running processes."
    mime_type = "text/markdown"
  }

  enabled               = "true"
  notification_channels = local.notification_channels
  project               = var.project_id
}

0 comments on commit 4c4a392

Please sign in to comment.