diff --git a/operator/CHANGELOG.md b/operator/CHANGELOG.md index f0bf7f4d5b5f..49f72d84858b 100644 --- a/operator/CHANGELOG.md +++ b/operator/CHANGELOG.md @@ -1,5 +1,9 @@ ## Main +## Release 5.8.11 + +- [13512](https://github.com/grafana/loki/pull/13512) **xperimental**: feat(operator): Add alert for discarded samples + ## Release 5.8.10 - [13562](https://github.com/grafana/loki/pull/13562) **xperimental**: fix(operator): Set object storage for delete requests when using retention diff --git a/operator/docs/lokistack/sop.md b/operator/docs/lokistack/sop.md index 8c437bd53b67..00ac13a071be 100644 --- a/operator/docs/lokistack/sop.md +++ b/operator/docs/lokistack/sop.md @@ -308,3 +308,35 @@ The query queue is currently under high load. ### Steps - Increase the number of queriers + +## Loki Discarded Samples Warning + +### Impact + +Loki is discarding samples (log entries) because they fail validation. This alert only fires for errors that are not retryable. This means that the discarded samples are lost. + +### Summary + +Loki can reject log entries (samples) during submission when they fail validation. This happens on a per-stream basis, so only the specific samples or streams failing validation are lost. + +The possible validation errors are documented in the [Loki documentation](https://grafana.com/docs/loki/latest/operations/request-validation-rate-limits/#validation-errors). This alert only fires for the validation errors that are not retryable, which means that discarded samples are permanently lost. + +The alerting can only show the affected Loki tenant. Since Loki 3.1.0 more detailed information about the affected streams is provided in an error message emitted by the distributor component. + +This information can be used to pinpoint the application sending the offending logs. For some of the validations there are configuration parameters that can be tuned in LokiStack's `limits` structure, if the messages should be accepted. 
Usually it is recommended to fix the issue either in the emitting application (if possible) or by changing the collector configuration to fix non-compliant messages before sending them to Loki. + +### Severity + +`Warning` + +### Access Required + +- Console access to the cluster +- View access in the namespace where the LokiStack is deployed + - OpenShift + - `openshift-logging` (LokiStack) + +### Steps + +- View detailed log output from the Loki distributors to identify affected streams +- Decide on further steps depending on log source and validation error diff --git a/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml b/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml index f378c49fd78c..e0c49d61fe0a 100644 --- a/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml +++ b/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml @@ -175,3 +175,21 @@ groups: for: 15m labels: severity: warning + - alert: LokiDiscardedSamplesWarning + annotations: + message: |- + Loki in namespace {{ $labels.namespace }} is discarding samples in the "{{ $labels.tenant }}" tenant during ingestion. + Samples are discarded because of "{{ $labels.reason }}" at a rate of {{ .Value | humanize }} samples per second. + summary: Loki is discarding samples during ingestion because they fail validation.
+ runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning" + expr: | + sum by(namespace, tenant, reason) ( + irate(loki_discarded_samples_total{ + reason!="rate_limited", + reason!="per_stream_rate_limit", + reason!="stream_limit"}[2m]) + ) + > 0 + for: 15m + labels: + severity: warning diff --git a/operator/internal/manifests/internal/alerts/testdata/test.yaml b/operator/internal/manifests/internal/alerts/testdata/test.yaml index a4d8bec8a6a4..d60e3befa002 100644 --- a/operator/internal/manifests/internal/alerts/testdata/test.yaml +++ b/operator/internal/manifests/internal/alerts/testdata/test.yaml @@ -63,6 +63,9 @@ tests: - series: 'loki_logql_querystats_latency_seconds_bucket{namespace="my-ns", job="querier", route="my-route", le="+Inf"}' values: '0+100x20' + - series: 'loki_discarded_samples_total{namespace="my-ns", tenant="application", reason="line_too_long"}' + values: '0x5 0+120x25 3000' + alert_rule_test: - eval_time: 16m alertname: LokiRequestErrors @@ -177,3 +180,17 @@ tests: summary: "The read path has high volume of queries, causing longer response times." message: "The read path is experiencing high load." runbook_url: "[[ .RunbookURL ]]#Loki-Read-Path-High-Load" + - eval_time: 22m + alertname: LokiDiscardedSamplesWarning + exp_alerts: + - exp_labels: + namespace: my-ns + tenant: application + severity: warning + reason: line_too_long + exp_annotations: + message: |- + Loki in namespace my-ns is discarding samples in the "application" tenant during ingestion. + Samples are discarded because of "line_too_long" at a rate of 2 samples per second. + summary: Loki is discarding samples during ingestion because they fail validation. + runbook_url: "[[ .RunbookURL]]#Loki-Discarded-Samples-Warning"