From cd3df73944a82321ea9c64c543065883d20a2678 Mon Sep 17 00:00:00 2001 From: Dan Mace Date: Mon, 27 Jul 2020 17:23:04 -0400 Subject: [PATCH] Documentation: Further improve etcdMembersDown alert The default window for the etcdMembersDown network failure rate function was recently changed to 1 minute. While this helps detect an etcd recovery more quickly, it depends on scrape intervals of <= 15s to collect sufficient data points for the rate function. In practice, an interval of >= 30s is more typical, which causes the rate function to be less accurate. This patch increases the window to 2m, which is a compromise between the original value of 3m and the 1m change introduced with 2aa5684, and should accommodate more typical scrape intervals. To offset the window change and to further improve the chance that the alert will only fire when etcd is truly dead, this patch changes the `for` clause from 3m to 10m. The rationale is as follows: 1. There can be significant variance in durations following a reboot before etcd is scraped and detected as available. 2. A conservative trigger like 10m seems less likely to produce a false alarm in the face of such variance. 3. In this alerting situation, if the outage is real, it seems unlikely that an additional 7 minutes of delay before (for example) paging somebody will make a significant impact on the overall response. --- Documentation/etcd-mixin/mixin.libsonnet | 9 +++++--- Documentation/etcd-mixin/test.yaml | 26 +++++++++++------------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/Documentation/etcd-mixin/mixin.libsonnet b/Documentation/etcd-mixin/mixin.libsonnet index 5468da393aa..cf74da8dcd5 100644 --- a/Documentation/etcd-mixin/mixin.libsonnet +++ b/Documentation/etcd-mixin/mixin.libsonnet @@ -7,6 +7,9 @@ // instances are deployed on K8s, you will likely want to change // this to 'instance, pod'. 
etcd_instance_labels: 'instance', + // scrape_interval_seconds is the global scrape interval which can be + // used to dynamically adjust rate windows as a function of the interval. + scrape_interval_seconds: 30, }, prometheusAlerts+:: { @@ -21,12 +24,12 @@ sum without (%(etcd_instance_labels)s) (up{%(etcd_selector)s} == bool 0) or count without (To) ( - sum without (%(etcd_instance_labels)s) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[1m])) > 0.01 + sum without (%(etcd_instance_labels)s) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[%(network_failure_range)ss])) > 0.01 ) ) > 0 - ||| % $._config, - 'for': '3m', + ||| % {etcd_instance_labels: $._config.etcd_instance_labels, etcd_selector: $._config.etcd_selector, network_failure_range: $._config.scrape_interval_seconds*4}, + 'for': '10m', labels: { severity: 'critical', }, diff --git a/Documentation/etcd-mixin/test.yaml b/Documentation/etcd-mixin/test.yaml index e7ad4cac61d..3f11ca82235 100644 --- a/Documentation/etcd-mixin/test.yaml +++ b/Documentation/etcd-mixin/test.yaml @@ -17,16 +17,16 @@ tests: alertname: etcdInsufficientMembers - eval_time: 5m alertname: etcdInsufficientMembers - - eval_time: 5m + - eval_time: 12m alertname: etcdMembersDown - - eval_time: 7m + - eval_time: 14m alertname: etcdMembersDown exp_alerts: - exp_labels: job: etcd severity: critical exp_annotations: - message: 'etcd cluster "etcd": members are down (1).' + message: 'etcd cluster "etcd": members are down (3).' 
- eval_time: 7m alertname: etcdInsufficientMembers - eval_time: 11m @@ -49,33 +49,31 @@ tests: - interval: 1m input_series: - series: 'up{job="etcd",instance="10.10.10.0"}' - values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0' + values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0' - series: 'up{job="etcd",instance="10.10.10.1"}' - values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0' + values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0' - series: 'up{job="etcd",instance="10.10.10.2"}' - values: '1 1 1 1 0 0 0 0' + values: '1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' alert_rule_test: - - eval_time: 10m + - eval_time: 14m alertname: etcdMembersDown exp_alerts: - exp_labels: job: etcd severity: critical exp_annotations: - message: 'etcd cluster "etcd": members are down (2).' + message: 'etcd cluster "etcd": members are down (3).' - interval: 1m input_series: - series: 'up{job="etcd",instance="10.10.10.0"}' - values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0' - series: 'up{job="etcd",instance="10.10.10.1"}' - values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0' + values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0' - series: 'etcd_network_peer_sent_failures_total{To="member-1",job="etcd",endpoint="test"}' - values: '0 0 1 2 3 4 5 6 7 8 9 10' + values: '0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18' alert_rule_test: - - eval_time: 4m - alertname: etcdMembersDown - - eval_time: 6m + - eval_time: 13m alertname: etcdMembersDown exp_alerts: - exp_labels: