From 5b30589acffbe6d5e52427d76588680314678970 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Wed, 3 Nov 2021 15:12:52 +0200 Subject: [PATCH] Documenting recording rules per-tenant WAL (#4566) * WIP Signed-off-by: Danny Kopping * Document ruler config options Signed-off-by: Danny Kopping * Self-review updates Signed-off-by: Danny Kopping * Clarifying WAL benefit in response to review Signed-off-by: Danny Kopping * Adding dashboard link Signed-off-by: Danny Kopping * Update docs/sources/configuration/_index.md Co-authored-by: Owen Diehl * Ensuring ruler and ingester default WAL paths do not conflict Signed-off-by: Danny Kopping Co-authored-by: Owen Diehl --- docs/sources/configuration/_index.md | 139 +++++++++++++++++--- docs/sources/operations/recording-rules.md | 144 +++++++++++++++++++++ docs/sources/rules/_index.md | 29 +---- pkg/ruler/config.go | 2 +- pkg/ruler/registry.go | 1 + pkg/ruler/storage/cleaner/cleaner.go | 2 +- pkg/ruler/storage/instance/instance.go | 2 +- 7 files changed, 274 insertions(+), 45 deletions(-) create mode 100644 docs/sources/operations/recording-rules.md diff --git a/docs/sources/configuration/_index.md b/docs/sources/configuration/_index.md index 718524ff639e..ac9894e97363 100644 --- a/docs/sources/configuration/_index.md +++ b/docs/sources/configuration/_index.md @@ -296,7 +296,7 @@ engine: ## query_scheduler_config The `query_scheduler_config` block configures the Loki query scheduler. - + ```yaml # Maximum number of outstanding requests per tenant per query-scheduler. # In-flight requests above this limit will fail with HTTP response status code @@ -544,6 +544,25 @@ remote_write: # Enable remote-write functionality. # CLI flag: -ruler.remote-write.enabled [enabled: | default = false] + # Minimum period to wait between refreshing remote-write reconfigurations. + # This should be greater than or equivalent to -limits.per-user-override-period. + [config_refresh_period: | default = 10s] + + wal: + # The directory in which to write tenant WAL files. Each tenant will have its own + # directory one level below this directory. + [dir: | default = "ruler-wal"] + # Frequency with which to run the WAL truncation process. + [truncate_frequency: | default = 60m] + # Minimum and maximum time series should exist in the WAL for. + [min_age: | default = 5m] + [max_age: | default = 4h] + + wal_cleaner: + # The minimum age of a WAL to consider for cleaning. + [min_age: | default = 12h] + # How often to run the WAL cleaner. + [period: | default = 0s (disabled)] client: # The URL of the endpoint to send samples to. @@ -553,12 +572,18 @@ remote_write: [remote_timeout: | default = 30s] # Custom HTTP headers to be sent along with each remote write request. - # Be aware that headers that are set by Prometheus itself can't be overwritten. + # Be aware that headers that are set by Loki itself can't be overwritten. headers: [: ...] - # HTTP proxy server to use to connect to the targets. - [proxy_url: ] + # List of remote write relabel configurations. + write_relabel_configs: + [- ...] + + # Name of the remote write config, which if specified must be unique among remote write configs. + # The name will be used in metrics and logging in place of a generated value to help users distinguish between + # remote write configs. + [name: ] # Sets the `Authorization` header on every remote write request with the # configured username and password. @@ -568,32 +593,78 @@ remote_write: [password: ] [password_file: ] - # `Authorization` header configuration. 
+ # Optional `Authorization` header configuration. authorization: # Sets the authentication type. [type: | default: Bearer] # Sets the credentials. It is mutually exclusive with # `credentials_file`. [credentials: ] - # Sets the credentials with the credentials read from the configured file. + # Sets the credentials to the credentials read from the configured file. # It is mutually exclusive with `credentials`. [credentials_file: ] + # Optionally configures AWS's Signature Verification 4 signing process to + # sign requests. Cannot be set at the same time as basic_auth, authorization, or oauth2. + # To use the default credentials from the AWS SDK, use `sigv4: {}`. + sigv4: + # The AWS region. If blank, the region from the default credentials chain + # is used. + [region: ] + + # The AWS API keys. If blank, the environment variables `AWS_ACCESS_KEY_ID` + # and `AWS_SECRET_ACCESS_KEY` are used. + [access_key: ] + [secret_key: ] + + # Named AWS profile used to authenticate. + [profile: ] + + # AWS Role ARN, an alternative to using AWS API keys. + [role_arn: ] + + # Configures the remote write request's TLS settings. tls_config: # CA certificate to validate API server certificate with. [ca_file: ] - # Certificate and key files for client cert authentication to the server. [cert_file: ] [key_file: ] - # ServerName extension to indicate the name of the server. # https://tools.ietf.org/html/rfc4366#section-3.1 [server_name: ] - # Disable validation of the server certificate. [insecure_skip_verify: ] + # Optional proxy URL. + [proxy_url: ] + + # Configure whether HTTP requests follow HTTP 3xx redirects. + [follow_redirects: | default = true] + + # Configures the queue used to write to remote storage. + queue_config: + # Number of samples to buffer per shard before we block reading of more + # samples from the WAL. It is recommended to have enough capacity in each + # shard to buffer several requests to keep throughput up while processing + # occasional slow remote requests. + [capacity: | default = 2500] + # Maximum number of shards, i.e. amount of concurrency. + [max_shards: | default = 200] + # Minimum number of shards, i.e. amount of concurrency. + [min_shards: | default = 1] + # Maximum number of samples per send. + [max_samples_per_send: | default = 500] + # Maximum time a sample will wait in buffer. + [batch_send_deadline: | default = 5s] + # Initial retry delay. Gets doubled for every retry. + [min_backoff: | default = 30ms] + # Maximum retry delay. + [max_backoff: | default = 100ms] + # Retry upon receiving a 429 status code from the remote-write storage. + # This is experimental and might change in the future. + [retry_on_http_429: | default = false] + # File path to store temporary rule files. # CLI flag: -ruler.rule-path [rule_path: | default = "/rules"] @@ -2027,8 +2098,7 @@ compactor_ring: ## limits_config -The `limits_config` block configures global and per-tenant limits for ingesting -logs in Loki. +The `limits_config` block configures global and per-tenant limits in Loki. ```yaml # Whether the ingestion rate limit should be applied individually to each @@ -2169,10 +2239,6 @@ logs in Loki. # If no rule is matched the `retention_period` is used. [retention_stream: | default = none] -# Capacity of remote-write queues; if a queue exceeds its capacity it will evict oldest samples. 
-# CLI flag: -ruler.remote-write.queue-capacity -[ruler_remote_write_queue_capacity: | default = 10000] - # Feature renamed to 'runtime configuration', flag deprecated in favor of -runtime-config.file # (runtime_config.file in YAML). # CLI flag: -limits.per-user-override-config @@ -2216,6 +2282,49 @@ logs in Loki. # The default value of 0 does not set a limit. # CLI flag: -querier.max-query-lookback [max_query_lookback: | default = 0] + +# Disable recording rules remote-write. +[ruler_remote_write_disabled: | default = false] + +# The URL of the endpoint to send samples to. +[ruler_remote_write_url: ] + +# Timeout for requests to the remote write endpoint. +[ruler_remote_write_timeout: ] + +# Custom HTTP headers to be sent along with each remote write request. +# Be aware that headers that are set by Loki itself can't be overwritten. +[ruler_remote_write_headers: ] + +# List of remote write relabel configurations. +[ruler_remote_write_relabel_configs: ] + +# Number of samples to buffer per shard before we block reading of more +# samples from the WAL. It is recommended to have enough capacity in each +# shard to buffer several requests to keep throughput up while processing +# occasional slow remote requests. +[ruler_remote_write_queue_capacity: ] + +# Minimum number of shards, i.e. amount of concurrency. +[ruler_remote_write_queue_min_shards: ] + +# Maximum number of shards, i.e. amount of concurrency. +[ruler_remote_write_queue_max_shards: ] + +# Maximum number of samples per send. +[ruler_remote_write_queue_max_samples_per_send: ] + +# Maximum time a sample will wait in buffer. +[ruler_remote_write_queue_batch_send_deadline: ] + +# Initial retry delay. Gets doubled for every retry. +[ruler_remote_write_queue_min_backoff: ] + +# Maximum retry delay. +[ruler_remote_write_queue_max_backoff: ] +# Retry upon receiving a 429 status code from the remote-write storage. +# This is experimental and might change in the future. +[ruler_remote_write_queue_retry_on_ratelimit: ] ``` ### grpc_client_config diff --git a/docs/sources/operations/recording-rules.md b/docs/sources/operations/recording-rules.md new file mode 100644 index 000000000000..abd56f1854c3 --- /dev/null +++ b/docs/sources/operations/recording-rules.md @@ -0,0 +1,144 @@ +--- +title: Recording Rules +--- + +# Recording Rules + +Recording rules are evaluated by the `ruler` component. Each `ruler` acts as its own `querier`, in the sense that it +executes queries against the store without using the `query-frontend` or `querier` components. It will respect all query +[limits](https://grafana.com/docs/loki/latest/configuration/#limits_config) put in place for the `querier`. + +Loki's implementation of recording rules largely reuses Prometheus' code. + +Samples generated by recording rules are sent to Prometheus using Prometheus' **remote-write** feature. + +## Write-Ahead Log (WAL) + +All samples generated by recording rules are written to a WAL. The WAL's main benefit is that it persists the samples +generated by recording rules to disk, which means that if your `ruler` crashes, you won't lose any data. +We are trading off extra memory usage and slower start-up times for this functionality. + +A WAL is created per tenant; this is done to prevent cross-tenant interactions. If all samples were to be written +to a single WAL, this would increase the chances that one tenant could cause data-loss for others. 
A typical scenario here
is that Prometheus will, for example, reject a remote-write request with 100 samples if just 1 of those samples is invalid in some way.

### Start-up

When the `ruler` starts up, it will load the WALs for the tenants who have recording rules. These WAL files are stored
on disk and are loaded into memory.

Note: WALs are loaded one at a time upon start-up. This is a current limitation of the Cortex Ruler, which Loki inherits.
For this reason, it is advisable that the number of rule groups serviced by a ruler be kept to a reasonable size, since
_no rule evaluation occurs while WAL replay is in progress (this includes alerting rules)_.

### Truncation

WAL files are regularly truncated to reduce their size on disk.
[This guide](https://ganeshvernekar.com/blog/prometheus-tsdb-wal-and-checkpoint/#wal-truncation-and-checkpointing)
from one of the Prometheus maintainers (Ganesh Vernekar) gives an excellent overview of the truncation, checkpointing,
and replaying of the WAL.

### Cleaner

WAL Cleaner is an experimental feature.

The WAL Cleaner watches for abandoned WALs (tenants that no longer have recording rules associated with them) and deletes them.
Enable this feature only if you are running into storage concerns with WALs that are too large; WALs should not grow
excessively large, because they are regularly truncated.

## Scaling

Loki's `ruler` component is based on Cortex's `ruler`.

See Cortex's guide for [horizontally scaling the `ruler`](https://cortexmetrics.io/docs/guides/ruler-sharding/) using the ring.

Note: the `ruler` shards by rule _group_, not by individual rules. This is an artifact of the fact that Prometheus
recording rules need to run in order since one recording rule can reuse another - but this is not possible in Loki.

## Deployment

The `ruler` needs to persist its WAL files to disk, and it incurs a bit of a start-up cost by reading these WALs into memory.
As such, it is recommended that you try to minimize churn of individual `ruler` instances, since rule evaluation is blocked
while the WALs are being read from disk.

### Kubernetes

It is recommended that you run the `rulers` using `StatefulSets`. The `ruler` will write its WAL files to persistent storage,
so a `Persistent Volume` should be utilised.

## Remote-Write

### Per-Tenant Limits

Remote-write can be configured at a global level in the base configuration, and certain parameters tuned specifically on
a per-tenant basis. Most of the configuration options [defined here](../configuration/#ruler_config)
have [override options](../configuration/#limits_config) (which can also be applied at runtime!).

### Tuning

Remote-write can be tuned if the default configuration is insufficient (see [Failure Modes](#failure-modes) below).

There is a [guide](https://prometheus.io/docs/practices/remote_write/) on the Prometheus website; all of it applies to Loki, too.

## Observability

Since Loki reuses the Prometheus code for recording rules and WALs, it also gains all of Prometheus' observability.

Prometheus exposes a number of metrics for its WAL implementation, and these have all been prefixed with `loki_ruler_wal_`.

For example: `prometheus_remote_storage_bytes_total` → `loki_ruler_wal_prometheus_remote_storage_bytes_total`

Additional metrics are exposed, also with the prefix `loki_ruler_wal_`. All per-tenant metrics contain a `tenant`
label, so be aware that cardinality could begin to be a concern if the number of tenants grows sufficiently large.
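As a rough sketch of how these per-tenant metrics can be used, the hypothetical Prometheus alerting rules below (assuming a separate Prometheus instance scrapes the ruler's metrics) cover the two failure modes described later on this page. The metric names come from this page; the alert names, thresholds, and `for` durations are illustrative only.

```yaml
groups:
  - name: loki-ruler-wal
    rules:
      # Fires when a tenant's WAL appender has not become ready, for example because
      # WAL replay is still in progress or the WAL is corrupt (see "Appender Not Ready").
      - alert: LokiRulerWALAppenderNotReady
        expr: loki_ruler_wal_appender_ready < 1
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "WAL appender for tenant {{ $labels.tenant }} is not ready"
      # Fires when remote-write for a tenant lags the samples appended to its WAL
      # by more than 5 minutes (see "Remote-Write Lagging").
      - alert: LokiRulerRemoteWriteLagging
        expr: |
          (
              max by (tenant) (loki_ruler_wal_prometheus_remote_storage_highest_timestamp_in_seconds)
            - max by (tenant) (loki_ruler_wal_prometheus_remote_storage_queue_highest_sent_timestamp_seconds)
          ) > 300
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Remote-write for tenant {{ $labels.tenant }} is lagging"
```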
+
Some key metrics to note are:
- `loki_ruler_wal_appender_ready`: whether a WAL appender is ready to accept samples (1) or not (0)
- `loki_ruler_wal_prometheus_remote_storage_samples_total`: number of samples sent per tenant to remote storage
- `loki_ruler_wal_prometheus_remote_storage_samples...`
  - `loki_ruler_wal_prometheus_remote_storage_samples_pending_total`: samples buffered in memory, waiting to be sent to remote storage
  - `loki_ruler_wal_prometheus_remote_storage_samples_failed_total`: samples that failed to be sent to remote storage
  - `loki_ruler_wal_prometheus_remote_storage_samples_dropped_total`: samples dropped by relabel configurations
  - `loki_ruler_wal_prometheus_remote_storage_samples_retried_total`: samples re-sent to remote storage
- `loki_ruler_wal_prometheus_remote_storage_highest_timestamp_in_seconds`: highest timestamp of sample appended to WAL
- `loki_ruler_wal_prometheus_remote_storage_queue_highest_sent_timestamp_seconds`: highest timestamp of sample sent to remote storage.

We've created a basic [dashboard in our loki-mixin](https://github.com/grafana/loki/tree/main/production/loki-mixin/dashboards/recording-rules.libsonnet)
which you can use to administer recording rules.

## Failure Modes

### Remote-Write Lagging

Remote-write can lag behind for many reasons:

1. Remote-write storage (Prometheus) is temporarily unavailable
2. A tenant is producing samples too quickly from a recording rule
3. Remote-write is tuned too low, creating backpressure

The lag can be determined by subtracting
`loki_ruler_wal_prometheus_remote_storage_queue_highest_sent_timestamp_seconds` from
`loki_ruler_wal_prometheus_remote_storage_highest_timestamp_in_seconds`.

In case 1, the `ruler` will continue to retry sending these samples until the remote storage becomes available again. Be
aware that if the remote storage is down for longer than `ruler.wal.max-age`, data loss may occur once the WAL is truncated.

In cases 2 & 3, you should consider [tuning](#tuning) remote-write appropriately.

Further reading: see [this blog post](https://grafana.com/blog/2021/04/12/how-to-troubleshoot-remote-write-issues-in-prometheus/)
by Prometheus maintainer Callum Styan.

### Appender Not Ready

Each tenant's WAL has an "appender" internally; this appender is used to _append_ samples to the WAL. The appender is marked
as _not ready_ until the WAL replay is complete upon startup. If the WAL is corrupted for some reason, or is taking a long
time to replay, you can determine this by alerting on `loki_ruler_wal_appender_ready < 1`.

### Corrupt WAL

If a disk fails or the `ruler` does not terminate correctly, there's a chance one or more tenant WALs can become corrupted.
A mechanism exists for automatically repairing the WAL, but this cannot handle every conceivable scenario. In this case,
the `loki_ruler_wal_corruptions_repair_failed_total` metric will be incremented.

### Found another failure mode?

Please open an [issue](https://github.com/grafana/loki/issues) and tell us about it!
\ No newline at end of file
diff --git a/docs/sources/rules/_index.md b/docs/sources/rules/_index.md
index 217b0491aedb..3ce6bed3a23b 100644
--- a/docs/sources/rules/_index.md
+++ b/docs/sources/rules/_index.md
@@ -70,8 +70,6 @@ groups:

## Recording Rules

-Recording rules are an experimental feature.
-
We support [Prometheus-compatible](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/#recording-rules) recording rules.
From Prometheus' documentation: > Recording rules allow you to precompute frequently needed or computationally expensive expressions and save their result as a new set of time series. @@ -126,32 +124,9 @@ ruler: Further configuration options can be found under [ruler_config](/configuration#ruler_config). -### Resilience and Durability - -Given the above remote-write configuration, one needs to take into account what would happen if the remote-write receiver -becomes unavailable. - -The Ruler component ensures some durability guarantees by buffering all outgoing writes in an in-memory queue. This queue -holds all metric samples that are due to be written to the remote-write receiver, and while that receiver is down, the buffer -will grow in size. - -Once the queue is full, the oldest samples will be evicted from the queue. The size of this queue is controllable globally, -or on a per-tenant basis, with the [`ruler_remote_write_queue_capacity`](/configuration#limits_config) limit setting. By default, this value is set to 10000 samples. - -**NOTE**: this queue only exists in-memory at this time; there is no Write-Ahead Log (WAL) functionality available yet. -This means that if your Ruler instance crashes, all pending metric samples in the queue that have not yet been written will be lost. - -### Operational Considerations - -Metrics are available to monitor recording rule evaluations and writes. +### Operations -| Metric | Description | -|---|---| -| `recording_rules_samples_queued_current` | Number of samples queued to be remote-written. | -| `recording_rules_samples_queued_total` | Total number of samples queued. | -| `recording_rules_samples_queue_capacity` | Number of samples that can be queued before eviction of the oldest samples occurs. | -| `recording_rules_samples_evicted_total` | Number of samples evicted from queue because the queue is full. | -| `recording_rules_remote_write_errors` | Number of samples that failed to be remote-written due to error. | +Please refer to the [Recording Rules](../operations/recording-rules/) page. ## Use cases diff --git a/pkg/ruler/config.go b/pkg/ruler/config.go index 52fa0efec5a9..9d7e53d7eb25 100644 --- a/pkg/ruler/config.go +++ b/pkg/ruler/config.go @@ -79,5 +79,5 @@ func (c *RemoteWriteConfig) Clone() (*RemoteWriteConfig, error) { // RegisterFlags adds the flags required to config this to the given FlagSet. func (c *RemoteWriteConfig) RegisterFlags(f *flag.FlagSet) { f.BoolVar(&c.Enabled, "ruler.remote-write.enabled", false, "Remote-write recording rule samples to Prometheus-compatible remote-write receiver.") - f.DurationVar(&c.ConfigRefreshPeriod, "ruler.remote-write.config-refresh-period", 10*time.Second, "Minimum period to wait between remote-write reconfigurations. This should be greater than or equivalent to -limits.per-user-override-period.") + f.DurationVar(&c.ConfigRefreshPeriod, "ruler.remote-write.config-refresh-period", 10*time.Second, "Minimum period to wait between refreshing remote-write reconfigurations. 
This should be greater than or equivalent to -limits.per-user-override-period.") } diff --git a/pkg/ruler/registry.go b/pkg/ruler/registry.go index f3ab5de0dbbc..7dfa4479ae6c 100644 --- a/pkg/ruler/registry.go +++ b/pkg/ruler/registry.go @@ -381,6 +381,7 @@ func newStorageRegistryMetrics(reg prometheus.Registerer) *storageRegistryMetric reg: reg, appenderReady: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "appender_ready", + Help: "Whether a WAL appender is ready to accept samples (1) or not (0)", }, []string{"tenant"}), } diff --git a/pkg/ruler/storage/cleaner/cleaner.go b/pkg/ruler/storage/cleaner/cleaner.go index fcde4e316c60..0c5b3ba705d5 100644 --- a/pkg/ruler/storage/cleaner/cleaner.go +++ b/pkg/ruler/storage/cleaner/cleaner.go @@ -20,7 +20,7 @@ import ( // Default settings for the WAL cleaner. const ( DefaultCleanupAge = 12 * time.Hour - DefaultCleanupPeriod = 30 * time.Minute + DefaultCleanupPeriod = 0 * time.Second // disabled by default ) // lastModifiedFunc gets the last modified time of the most recent segment of a WAL diff --git a/pkg/ruler/storage/instance/instance.go b/pkg/ruler/storage/instance/instance.go index f8fb999c47b9..d8bed8481f11 100644 --- a/pkg/ruler/storage/instance/instance.go +++ b/pkg/ruler/storage/instance/instance.go @@ -42,7 +42,7 @@ var ( // Default configuration values var ( DefaultConfig = Config{ - Dir: "wal", + Dir: "ruler-wal", TruncateFrequency: 60 * time.Minute, MinAge: 5 * time.Minute, MaxAge: 4 * time.Hour,