From 5b30589acffbe6d5e52427d76588680314678970 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Wed, 3 Nov 2021 15:12:52 +0200 Subject: [PATCH] Documenting recording rules per-tenant WAL (#4566) * WIP Signed-off-by: Danny Kopping * Document ruler config options Signed-off-by: Danny Kopping * Self-review updates Signed-off-by: Danny Kopping * Clarifying WAL benefit in response to review Signed-off-by: Danny Kopping * Adding dashboard link Signed-off-by: Danny Kopping * Update docs/sources/configuration/_index.md Co-authored-by: Owen Diehl * Ensuring ruler and ingester default WAL paths do not conflict Signed-off-by: Danny Kopping Co-authored-by: Owen Diehl --- docs/sources/configuration/_index.md | 139 +++++++++++++++++--- docs/sources/operations/recording-rules.md | 144 +++++++++++++++++++++ docs/sources/rules/_index.md | 29 +---- pkg/ruler/config.go | 2 +- pkg/ruler/registry.go | 1 + pkg/ruler/storage/cleaner/cleaner.go | 2 +- pkg/ruler/storage/instance/instance.go | 2 +- 7 files changed, 274 insertions(+), 45 deletions(-) create mode 100644 docs/sources/operations/recording-rules.md diff --git a/docs/sources/configuration/_index.md b/docs/sources/configuration/_index.md index 718524ff639e..ac9894e97363 100644 --- a/docs/sources/configuration/_index.md +++ b/docs/sources/configuration/_index.md @@ -296,7 +296,7 @@ engine: ## query_scheduler_config The `query_scheduler_config` block configures the Loki query scheduler. - + ```yaml # Maximum number of outstanding requests per tenant per query-scheduler. # In-flight requests above this limit will fail with HTTP response status code @@ -544,6 +544,25 @@ remote_write: # Enable remote-write functionality. # CLI flag: -ruler.remote-write.enabled [enabled: | default = false] + # Minimum period to wait between refreshing remote-write reconfigurations. + # This should be greater than or equivalent to -limits.per-user-override-period. + [config_refresh_period: | default = 10s] + + wal: + # The directory in which to write tenant WAL files. Each tenant will have its own + # directory one level below this directory. + [dir: | default = "ruler-wal"] + # Frequency with which to run the WAL truncation process. + [truncate_frequency: | default = 60m] + # Minimum and maximum time series should exist in the WAL for. + [min_age: | default = 5m] + [max_age: | default = 4h] + + wal_cleaner: + # The minimum age of a WAL to consider for cleaning. + [min_age: | default = 12h] + # How often to run the WAL cleaner. + [period: | default = 0s (disabled)] client: # The URL of the endpoint to send samples to. @@ -553,12 +572,18 @@ remote_write: [remote_timeout: | default = 30s] # Custom HTTP headers to be sent along with each remote write request. - # Be aware that headers that are set by Prometheus itself can't be overwritten. + # Be aware that headers that are set by Loki itself can't be overwritten. headers: [: ...] - # HTTP proxy server to use to connect to the targets. - [proxy_url: ] + # List of remote write relabel configurations. + write_relabel_configs: + [- ...] + + # Name of the remote write config, which if specified must be unique among remote write configs. + # The name will be used in metrics and logging in place of a generated value to help users distinguish between + # remote write configs. + [name: ] # Sets the `Authorization` header on every remote write request with the # configured username and password. @@ -568,32 +593,78 @@ remote_write: [password: ] [password_file: ] - # `Authorization` header configuration. 
+ # Optional `Authorization` header configuration. authorization: # Sets the authentication type. [type: | default: Bearer] # Sets the credentials. It is mutually exclusive with # `credentials_file`. [credentials: ] - # Sets the credentials with the credentials read from the configured file. + # Sets the credentials to the credentials read from the configured file. # It is mutually exclusive with `credentials`. [credentials_file: ] + # Optionally configures AWS's Signature Verification 4 signing process to + # sign requests. Cannot be set at the same time as basic_auth, authorization, or oauth2. + # To use the default credentials from the AWS SDK, use `sigv4: {}`. + sigv4: + # The AWS region. If blank, the region from the default credentials chain + # is used. + [region: ] + + # The AWS API keys. If blank, the environment variables `AWS_ACCESS_KEY_ID` + # and `AWS_SECRET_ACCESS_KEY` are used. + [access_key: ] + [secret_key: ] + + # Named AWS profile used to authenticate. + [profile: ] + + # AWS Role ARN, an alternative to using AWS API keys. + [role_arn: ] + + # Configures the remote write request's TLS settings. tls_config: # CA certificate to validate API server certificate with. [ca_file: ] - # Certificate and key files for client cert authentication to the server. [cert_file: ] [key_file: ] - # ServerName extension to indicate the name of the server. # https://tools.ietf.org/html/rfc4366#section-3.1 [server_name: ] - # Disable validation of the server certificate. [insecure_skip_verify: ] + # Optional proxy URL. + [proxy_url: ] + + # Configure whether HTTP requests follow HTTP 3xx redirects. + [follow_redirects: | default = true] + + # Configures the queue used to write to remote storage. + queue_config: + # Number of samples to buffer per shard before we block reading of more + # samples from the WAL. It is recommended to have enough capacity in each + # shard to buffer several requests to keep throughput up while processing + # occasional slow remote requests. + [capacity: | default = 2500] + # Maximum number of shards, i.e. amount of concurrency. + [max_shards: | default = 200] + # Minimum number of shards, i.e. amount of concurrency. + [min_shards: | default = 1] + # Maximum number of samples per send. + [max_samples_per_send: | default = 500] + # Maximum time a sample will wait in buffer. + [batch_send_deadline: | default = 5s] + # Initial retry delay. Gets doubled for every retry. + [min_backoff: | default = 30ms] + # Maximum retry delay. + [max_backoff: | default = 100ms] + # Retry upon receiving a 429 status code from the remote-write storage. + # This is experimental and might change in the future. + [retry_on_http_429: | default = false] + # File path to store temporary rule files. # CLI flag: -ruler.rule-path [rule_path: | default = "/rules"] @@ -2027,8 +2098,7 @@ compactor_ring: ## limits_config -The `limits_config` block configures global and per-tenant limits for ingesting -logs in Loki. +The `limits_config` block configures global and per-tenant limits in Loki. ```yaml # Whether the ingestion rate limit should be applied individually to each @@ -2169,10 +2239,6 @@ logs in Loki. # If no rule is matched the `retention_period` is used. [retention_stream: | default = none] -# Capacity of remote-write queues; if a queue exceeds its capacity it will evict oldest samples. 
-# CLI flag: -ruler.remote-write.queue-capacity -[ruler_remote_write_queue_capacity: | default = 10000] - # Feature renamed to 'runtime configuration', flag deprecated in favor of -runtime-config.file # (runtime_config.file in YAML). # CLI flag: -limits.per-user-override-config @@ -2216,6 +2282,49 @@ logs in Loki. # The default value of 0 does not set a limit. # CLI flag: -querier.max-query-lookback [max_query_lookback: | default = 0] + +# Disable recording rules remote-write. +[ruler_remote_write_disabled: | default = false] + +# The URL of the endpoint to send samples to. +[ruler_remote_write_url: ] + +# Timeout for requests to the remote write endpoint. +[ruler_remote_write_timeout: ] + +# Custom HTTP headers to be sent along with each remote write request. +# Be aware that headers that are set by Loki itself can't be overwritten. +[ruler_remote_write_headers: ] + +# List of remote write relabel configurations. +[ruler_remote_write_relabel_configs: ] + +# Number of samples to buffer per shard before we block reading of more +# samples from the WAL. It is recommended to have enough capacity in each +# shard to buffer several requests to keep throughput up while processing +# occasional slow remote requests. +[ruler_remote_write_queue_capacity: ] + +# Minimum number of shards, i.e. amount of concurrency. +[ruler_remote_write_queue_min_shards: ] + +# Maximum number of shards, i.e. amount of concurrency. +[ruler_remote_write_queue_max_shards: ] + +# Maximum number of samples per send. +[ruler_remote_write_queue_max_samples_per_send: ] + +# Maximum time a sample will wait in buffer. +[ruler_remote_write_queue_batch_send_deadline: ] + +# Initial retry delay. Gets doubled for every retry. +[ruler_remote_write_queue_min_backoff: ] + +# Maximum retry delay. +[ruler_remote_write_queue_max_backoff: ] +# Retry upon receiving a 429 status code from the remote-write storage. +# This is experimental and might change in the future. +[ruler_remote_write_queue_retry_on_ratelimit: ] ``` ### grpc_client_config diff --git a/docs/sources/operations/recording-rules.md b/docs/sources/operations/recording-rules.md new file mode 100644 index 000000000000..abd56f1854c3 --- /dev/null +++ b/docs/sources/operations/recording-rules.md @@ -0,0 +1,144 @@ +--- +title: Recording Rules +--- + +# Recording Rules + +Recording rules are evaluated by the `ruler` component. Each `ruler` acts as its own `querier`, in the sense that it +executes queries against the store without using the `query-frontend` or `querier` components. It will respect all query +[limits](https://grafana.com/docs/loki/latest/configuration/#limits_config) put in place for the `querier`. + +Loki's implementation of recording rules largely reuses Prometheus' code. + +Samples generated by recording rules are sent to Prometheus using Prometheus' **remote-write** feature. + +## Write-Ahead Log (WAL) + +All samples generated by recording rules are written to a WAL. The WAL's main benefit is that it persists the samples +generated by recording rules to disk, which means that if your `ruler` crashes, you won't lose any data. +We are trading off extra memory usage and slower start-up times for this functionality. + +A WAL is created per tenant; this is done to prevent cross-tenant interactions. If all samples were to be written +to a single WAL, this would increase the chances that one tenant could cause data-loss for others. 
A typical scenario here
is that Prometheus will, for example, reject a remote-write request with 100 samples if just 1 of those samples is invalid in some way.

### Start-up

When the `ruler` starts up, it will load the WALs for the tenants who have recording rules. These WAL files are stored
on disk and are loaded into memory.

Note: WALs are loaded one at a time upon start-up. This is a current limitation of the Cortex Ruler, which Loki inherits.
For this reason, it is advisable that the number of rule groups serviced by a ruler be kept to a reasonable size, since
_no rule evaluation occurs while WAL replay is in progress (this includes alerting rules)_.

### Truncation

WAL files are regularly truncated to reduce their size on disk.
[This guide](https://ganeshvernekar.com/blog/prometheus-tsdb-wal-and-checkpoint/#wal-truncation-and-checkpointing)
from one of the Prometheus maintainers (Ganesh Vernekar) gives an excellent overview of the truncation, checkpointing,
and replaying of the WAL.

### Cleaner

WAL Cleaner is an experimental feature.

The WAL Cleaner watches for abandoned WALs (tenants that no longer have recording rules associated with them) and deletes them.
Enable this feature only if you are running into storage concerns with WALs that are too large; WALs should not grow
excessively large, because they are regularly truncated.

## Scaling

Loki's `ruler` component is based on Cortex's `ruler`.

See Cortex's guide for [horizontally scaling the `ruler`](https://cortexmetrics.io/docs/guides/ruler-sharding/) using the ring.

Note: the `ruler` shards by rule _group_, not by individual rules. This is an artifact of the fact that Prometheus
recording rules need to run in order since one recording rule can reuse another - but this is not possible in Loki.

## Deployment

The `ruler` needs to persist its WAL files to disk, and it incurs a bit of a start-up cost by reading these WALs into memory.
As such, it is recommended that you try to minimize churn of individual `ruler` instances, since rule evaluation is blocked
while the WALs are being read from disk.

### Kubernetes

It is recommended that you run the `rulers` using `StatefulSets`. The `ruler` will write its WAL files to persistent storage,
so a `Persistent Volume` should be utilised.

## Remote-Write

### Per-Tenant Limits

Remote-write can be configured at a global level in the base configuration, and certain parameters tuned specifically on
a per-tenant basis. Most of the configuration options [defined here](../configuration/#ruler_config)
have [override options](../configuration/#limits_config) (which can also be applied at runtime!).

### Tuning

Remote-write can be tuned if the default configuration is insufficient (see [Failure Modes](#failure-modes) below).

There is a [guide](https://prometheus.io/docs/practices/remote_write/) on the Prometheus website; all of it applies to Loki, too.

## Observability

Since Loki reuses the Prometheus code for recording rules and WALs, it also gains all of Prometheus' observability.

Prometheus exposes a number of metrics for its WAL implementation, and these have all been prefixed with `loki_ruler_wal_`.

For example: `prometheus_remote_storage_bytes_total` → `loki_ruler_wal_prometheus_remote_storage_bytes_total`

Additional metrics are exposed, also with the prefix `loki_ruler_wal_`. All per-tenant metrics contain a `tenant`
label, so be aware that cardinality could begin to be a concern if the number of tenants grows sufficiently large.
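As a rough sketch of how these per-tenant metrics can be used, the hypothetical Prometheus alerting rules below (assuming a separate Prometheus instance scrapes the ruler's metrics) cover the two failure modes described later on this page. The metric names come from this page; the alert names, thresholds, and `for` durations are illustrative only.

```yaml
groups:
  - name: loki-ruler-wal
    rules:
      # Fires when a tenant's WAL appender has not become ready, for example because
      # WAL replay is still in progress or the WAL is corrupt (see "Appender Not Ready").
      - alert: LokiRulerWALAppenderNotReady
        expr: loki_ruler_wal_appender_ready < 1
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "WAL appender for tenant {{ $labels.tenant }} is not ready"
      # Fires when remote-write for a tenant lags the samples appended to its WAL
      # by more than 5 minutes (see "Remote-Write Lagging").
      - alert: LokiRulerRemoteWriteLagging
        expr: |
          (
              max by (tenant) (loki_ruler_wal_prometheus_remote_storage_highest_timestamp_in_seconds)
            - max by (tenant) (loki_ruler_wal_prometheus_remote_storage_queue_highest_sent_timestamp_seconds)
          ) > 300
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Remote-write for tenant {{ $labels.tenant }} is lagging"
```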
+
Some key metrics to note are:
- `loki_ruler_wal_appender_ready`: whether a WAL appender is ready to accept samples (1) or not (0)
- `loki_ruler_wal_prometheus_remote_storage_samples_total`: number of samples sent per tenant to remote storage
- `loki_ruler_wal_prometheus_remote_storage_samples...`
  - `loki_ruler_wal_prometheus_remote_storage_samples_pending_total`: samples buffered in memory, waiting to be sent to remote storage
  - `loki_ruler_wal_prometheus_remote_storage_samples_failed_total`: samples that failed to be sent to remote storage
  - `loki_ruler_wal_prometheus_remote_storage_samples_dropped_total`: samples dropped by relabel configurations
  - `loki_ruler_wal_prometheus_remote_storage_samples_retried_total`: samples re-sent to remote storage
- `loki_ruler_wal_prometheus_remote_storage_highest_timestamp_in_seconds`: highest timestamp of sample appended to WAL
- `loki_ruler_wal_prometheus_remote_storage_queue_highest_sent_timestamp_seconds`: highest timestamp of sample sent to remote storage.

We've created a basic [dashboard in our loki-mixin](https://github.com/grafana/loki/tree/main/production/loki-mixin/dashboards/recording-rules.libsonnet)
which you can use to administer recording rules.

## Failure Modes

### Remote-Write Lagging

Remote-write can lag behind for many reasons:

1. Remote-write storage (Prometheus) is temporarily unavailable
2. A tenant is producing samples too quickly from a recording rule
3. Remote-write is tuned too low, creating backpressure

The lag can be determined by subtracting
`loki_ruler_wal_prometheus_remote_storage_queue_highest_sent_timestamp_seconds` from
`loki_ruler_wal_prometheus_remote_storage_highest_timestamp_in_seconds`.

In case 1, the `ruler` will continue to retry sending these samples until the remote storage becomes available again. Be
aware that if the remote storage is down for longer than `ruler.wal.max-age`, data loss may occur once the WAL is truncated.

In cases 2 & 3, you should consider [tuning](#tuning) remote-write appropriately.

Further reading: see [this blog post](https://grafana.com/blog/2021/04/12/how-to-troubleshoot-remote-write-issues-in-prometheus/)
by Prometheus maintainer Callum Styan.

### Appender Not Ready

Each tenant's WAL has an "appender" internally; this appender is used to _append_ samples to the WAL. The appender is marked
as _not ready_ until the WAL replay is complete upon startup. If the WAL is corrupted for some reason, or is taking a long
time to replay, you can determine this by alerting on `loki_ruler_wal_appender_ready < 1`.

### Corrupt WAL

If a disk fails or the `ruler` does not terminate correctly, there's a chance one or more tenant WALs can become corrupted.
A mechanism exists for automatically repairing the WAL, but this cannot handle every conceivable scenario. In this case,
the `loki_ruler_wal_corruptions_repair_failed_total` metric will be incremented.

### Found another failure mode?

Please open an [issue](https://github.com/grafana/loki/issues) and tell us about it!
\ No newline at end of file
diff --git a/docs/sources/rules/_index.md b/docs/sources/rules/_index.md
index 217b0491aedb..3ce6bed3a23b 100644
--- a/docs/sources/rules/_index.md
+++ b/docs/sources/rules/_index.md
@@ -70,8 +70,6 @@ groups:

## Recording Rules

-Recording rules are an experimental feature.
-
We support [Prometheus-compatible](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/#recording-rules) recording rules.
From Prometheus' documentation: > Recording rules allow you to precompute frequently needed or computationally expensive expressions and save their result as a new set of time series. @@ -126,32 +124,9 @@ ruler: Further configuration options can be found under [ruler_config](/configuration#ruler_config). -### Resilience and Durability - -Given the above remote-write configuration, one needs to take into account what would happen if the remote-write receiver -becomes unavailable. - -The Ruler component ensures some durability guarantees by buffering all outgoing writes in an in-memory queue. This queue -holds all metric samples that are due to be written to the remote-write receiver, and while that receiver is down, the buffer -will grow in size. - -Once the queue is full, the oldest samples will be evicted from the queue. The size of this queue is controllable globally, -or on a per-tenant basis, with the [`ruler_remote_write_queue_capacity`](/configuration#limits_config) limit setting. By default, this value is set to 10000 samples. - -**NOTE**: this queue only exists in-memory at this time; there is no Write-Ahead Log (WAL) functionality available yet. -This means that if your Ruler instance crashes, all pending metric samples in the queue that have not yet been written will be lost. - -### Operational Considerations - -Metrics are available to monitor recording rule evaluations and writes. +### Operations -| Metric | Description | -|---|---| -| `recording_rules_samples_queued_current` | Number of samples queued to be remote-written. | -| `recording_rules_samples_queued_total` | Total number of samples queued. | -| `recording_rules_samples_queue_capacity` | Number of samples that can be queued before eviction of the oldest samples occurs. | -| `recording_rules_samples_evicted_total` | Number of samples evicted from queue because the queue is full. | -| `recording_rules_remote_write_errors` | Number of samples that failed to be remote-written due to error. | +Please refer to the [Recording Rules](../operations/recording-rules/) page. ## Use cases diff --git a/pkg/ruler/config.go b/pkg/ruler/config.go index 52fa0efec5a9..9d7e53d7eb25 100644 --- a/pkg/ruler/config.go +++ b/pkg/ruler/config.go @@ -79,5 +79,5 @@ func (c *RemoteWriteConfig) Clone() (*RemoteWriteConfig, error) { // RegisterFlags adds the flags required to config this to the given FlagSet. func (c *RemoteWriteConfig) RegisterFlags(f *flag.FlagSet) { f.BoolVar(&c.Enabled, "ruler.remote-write.enabled", false, "Remote-write recording rule samples to Prometheus-compatible remote-write receiver.") - f.DurationVar(&c.ConfigRefreshPeriod, "ruler.remote-write.config-refresh-period", 10*time.Second, "Minimum period to wait between remote-write reconfigurations. This should be greater than or equivalent to -limits.per-user-override-period.") + f.DurationVar(&c.ConfigRefreshPeriod, "ruler.remote-write.config-refresh-period", 10*time.Second, "Minimum period to wait between refreshing remote-write reconfigurations. 
This should be greater than or equivalent to -limits.per-user-override-period.") } diff --git a/pkg/ruler/registry.go b/pkg/ruler/registry.go index f3ab5de0dbbc..7dfa4479ae6c 100644 --- a/pkg/ruler/registry.go +++ b/pkg/ruler/registry.go @@ -381,6 +381,7 @@ func newStorageRegistryMetrics(reg prometheus.Registerer) *storageRegistryMetric reg: reg, appenderReady: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "appender_ready", + Help: "Whether a WAL appender is ready to accept samples (1) or not (0)", }, []string{"tenant"}), } diff --git a/pkg/ruler/storage/cleaner/cleaner.go b/pkg/ruler/storage/cleaner/cleaner.go index fcde4e316c60..0c5b3ba705d5 100644 --- a/pkg/ruler/storage/cleaner/cleaner.go +++ b/pkg/ruler/storage/cleaner/cleaner.go @@ -20,7 +20,7 @@ import ( // Default settings for the WAL cleaner. const ( DefaultCleanupAge = 12 * time.Hour - DefaultCleanupPeriod = 30 * time.Minute + DefaultCleanupPeriod = 0 * time.Second // disabled by default ) // lastModifiedFunc gets the last modified time of the most recent segment of a WAL diff --git a/pkg/ruler/storage/instance/instance.go b/pkg/ruler/storage/instance/instance.go index f8fb999c47b9..d8bed8481f11 100644 --- a/pkg/ruler/storage/instance/instance.go +++ b/pkg/ruler/storage/instance/instance.go @@ -42,7 +42,7 @@ var ( // Default configuration values var ( DefaultConfig = Config{ - Dir: "wal", + Dir: "ruler-wal", TruncateFrequency: 60 * time.Minute, MinAge: 5 * time.Minute, MaxAge: 4 * time.Hour,