From 065eef63fe7311744db35a738a2eb854d6fadd63 Mon Sep 17 00:00:00 2001 From: Evan Hearne <156197717+ehearneRedHat@users.noreply.github.com> Date: Wed, 21 Aug 2024 15:01:15 +0100 Subject: [PATCH] Add Alerts Panels to PE Dashboard (#753) * added pending/firing panels Signed-off-by: ehearneredhat * Apply formatting, query & tooltip updates Signed-off-by: ehearneredhat * currently active columns center and active state timeline width full dashboard Signed-off-by: ehearneredhat --------- Signed-off-by: ehearneredhat Co-authored-by: David Martin --- .../controller-resources-metrics.json | 5 +- .../controller-runtime-metrics.json | 1 - examples/dashboards/platform_engineer.json | 447 +++++++++++++++++- 3 files changed, 447 insertions(+), 6 deletions(-) diff --git a/examples/dashboards/controller-resources-metrics.json b/examples/dashboards/controller-resources-metrics.json index f6a092baa..1658cc8e0 100644 --- a/examples/dashboards/controller-resources-metrics.json +++ b/examples/dashboards/controller-resources-metrics.json @@ -232,7 +232,6 @@ "style": "dark", "tags": [], "templating": { - "list": [ { "current": { @@ -253,7 +252,7 @@ "skipUrlSync": false, "type": "datasource" }, - { + { "datasource": { "type": "prometheus", "uid": "${datasource}" @@ -287,7 +286,7 @@ "definition": "label_values(controller_runtime_reconcile_total, namespace)", "hide": 0, "includeAll": false, - "multi": false, + "multi": false, "name": "namespace", "options": [], "query": { diff --git a/examples/dashboards/controller-runtime-metrics.json b/examples/dashboards/controller-runtime-metrics.json index b8d7dafe5..e2c8ec6e5 100644 --- a/examples/dashboards/controller-runtime-metrics.json +++ b/examples/dashboards/controller-runtime-metrics.json @@ -1,5 +1,4 @@ { - "__requires": [ { "type": "datasource", diff --git a/examples/dashboards/platform_engineer.json b/examples/dashboards/platform_engineer.json index 843a0dcd3..24589a3a7 100644 --- a/examples/dashboards/platform_engineer.json +++ 
b/examples/dashboards/platform_engineer.json @@ -24,6 +24,12 @@ "name": "Stat", "version": "" }, + { + "type": "panel", + "id": "state-timeline", + "name": "State timeline", + "version": "" + }, { "type": "panel", "id": "table", @@ -2037,6 +2043,439 @@ ], "title": "Errors (req/s)", "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 47 + }, + "id": 155, + "panels": [], + "title": "Alerts", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Total number of firing alerts, grouped by Alert Name. Note that there may be more than 1 instance of an alert active based on different labels (like pods or namespace). These will only be counted once here if the alertname is the same for each one.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "semi-dark-red", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 48 + }, + "id": 156, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.5.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(sum by(alertname, alertstate, severity, cluster_id) (ALERTS{alertstate=\"firing\"}))", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Total Firing", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Currently pending or firing alerts, grouped by Alert Name. 
Note that there may be more than 1 instance of an alert active based on different labels (like pods or namespace). The total number of instances of an alert in that state is shown in the '# Active' column. For further details on the individual alert instances, check your alerting system.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "center", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "alertstate" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "firing": { + "color": "semi-dark-red", + "index": 1, + "text": "Firing" + }, + "pending": { + "color": "orange", + "index": 0, + "text": "Pending" + } + }, + "type": "value" + } + ] + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 21, + "x": 3, + "y": 48 + }, + "id": 157, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "9.5.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum by(alertname, alertstate, severity, cluster_id) (ALERTS{alertstate=\"firing\"} or ALERTS{alertstate=\"pending\"})", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Currently Active", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": false, + "__name__": true, + "container": true, + "endpoint": true, + "instance": true, + "job": true, + 
"namespace": true, + "pod": true, + "prometheus": true, + "receive": true, + "replica": true, + "rule_group": true, + "service": true, + "tenant_id": true + }, + "indexByName": { + "Time": 0, + "Value": 2, + "alertname": 1, + "alertstate": 3, + "cluster_id": 4, + "severity": 5 + }, + "renameByName": { + "Value": "# Active", + "alertname": "Name", + "alertstate": "State", + "cluster_id": "Cluster ID", + "severity": "Severity" + } + } + }, + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "Name" + } + ] + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Total number of pending alerts, grouped by Alert Name. Note that there may be more than 1 instance of an alert active based on different labels (like pods or namespace). These will only be counted once here if the alertname is the same for each one.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "orange", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 52 + }, + "id": 159, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.5.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(sum by(alertname, alertstate, severity, cluster_id) (ALERTS{alertstate=\"pending\"}))", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Total Pending", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Historical state of any 
alerts that were pending or firing, grouped by Alert Name. Note that more than 1 instance of an alert may have been active based on different labels (like pods or namespace). For further details on the individual alert instances, check your alerting system.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "fillOpacity": 100, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [ + { + "options": { + "firing": { + "color": "semi-dark-red", + "index": 1, + "text": "Firing" + }, + "pending": { + "color": "orange", + "index": 0, + "text": "Pending" + } + }, + "type": "value" + }, + { + "options": { + "match": "empty", + "result": { + "color": "light-green", + "index": 2, + "text": "Inactive" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "light-green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 56 + }, + "id": 158, + "options": { + "alignValue": "center", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum by(alertname, alertstate, severity, cluster_id) (ALERTS{alertstate=\"firing\"} or ALERTS{alertstate=\"pending\"})", + "format": "table", + "instant": false, + "legendFormat": "{{alertname}}", + "range": true, + "refId": "A" + } + ], + "title": "Active State Timeline", + "transformations": [ + { + "id": "groupingToMatrix", + "options": { + "columnField": "alertname", + "emptyValue": "empty", + "rowField": "Time", + "valueField": "alertstate" + } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "time", + "targetField": "Time\\alertname" + 
} + ], + "fields": {} + } + }, + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "desc": false, + "field": "Time\\alertname" + } + ] + } + } + ], + "type": "state-timeline" } ], "refresh": "30s", @@ -2066,7 +2505,11 @@ "type": "datasource" }, { - "current": {}, + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, "datasource": { "type": "prometheus", "uid": "${datasource}" @@ -2140,7 +2583,7 @@ ] }, "time": { - "from": "now-30m", + "from": "now-1h", "to": "now" }, "timepicker": {