Don't fail writes due to full WAL disk #3136

Merged 3 commits on Jan 7, 2021

Changes from 1 commit

docs/sources/operations/storage/wal.md (24 changes: 21 additions & 3 deletions)

@@ -4,11 +4,29 @@ title: Write Ahead Log

# Write Ahead Log (WAL)

-Ingesters store all their data in memory. If there is a crash, there can be data loss. The WAL helps fill this gap in reliability.
-To use the WAL, there are some changes that needs to be made.
-This section will use Kubernetes as a reference.

Ingesters temporarily store data in memory. In the event of a crash, there could be data loss. The WAL helps fill this gap in reliability.

The WAL in Loki records incoming data and stores it on the local file system in order to guarantee persistence of acknowledged data in the event of a process crash. Upon restart, Loki will "replay" all of the data in the log before registering itself as ready for subsequent writes. This allows Loki to maintain the performance & cost benefits of buffering data in memory _and_ durability benefits (it won't lose data once a write has been acknowledged).

This section will use Kubernetes as a reference deployment paradigm in the examples.

## Disclaimer & WAL nuances

The Write Ahead Log in Loki makes a few particular tradeoffs compared to other WALs you may be familiar with. The WAL aims to add additional durability guarantees, but _not at the expense of availability_. In particular, there are two scenarios where the WAL sacrifices these durability guarantees.

1) Corruption/Deletion of the WAL prior to replaying it

In the event the WAL is corrupted or partially deleted, Loki will not be able to recover all of its data. In this case, Loki will attempt to recover any data it can, but the corruption will not prevent Loki from starting.

Note: the Prometheus metric `loki_ingester_wal_corruptions_total` can be used to track and alert when this happens.

1) No space left on disk

In the event the underlying WAL disk is full, Loki will not fail incoming writes, but neither will it log them to the WAL. In this case, the persistence guarantees across process restarts will not hold.

Review comment from @slim-bean (Collaborator), Jan 7, 2021:

Interesting thought, feel free to tell me this is too much scope.

If we know writing to the WAL is failing can we force a flush on shutdown?

Reply from the PR author (Member):

I think this is a great idea. Otherwise, we'd end up having to remove an ingester from traffic and wait for chunk_idle to elapse before shutting it down.


Note: the Prometheus metric `loki_ingester_wal_disk_full_failures_total` can be used to track and alert when this happens.

### Metrics

## Changes to deployment

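The flush-on-shutdown idea raised in the review thread above is not part of this commit's diff. Below is a minimal sketch of how such a mechanism could look, assuming a hypothetical `onceSwitch` latch and a `flushAllChunks` helper on the ingester (both names are illustrative, not Loki APIs): the latch is flipped the first time a WAL write fails, and the shutdown path consults it to decide whether to flush in-memory chunks before exiting.

```go
// Hypothetical sketch (not part of this PR's diff): one way to implement the
// "force a flush on shutdown" idea from the review thread above.
package main

import (
	"fmt"
	"sync/atomic"
)

// onceSwitch is a concurrency-safe flag that only transitions from off to on.
type onceSwitch struct {
	triggered atomic.Bool
}

// Trigger turns the switch on; safe to call from many goroutines.
func (o *onceSwitch) Trigger() { o.triggered.Store(true) }

// Get reports whether the switch has been triggered.
func (o *onceSwitch) Get() bool { return o.triggered.Load() }

type ingester struct {
	flushOnShutdownSwitch *onceSwitch
}

// onWALWriteError would be called wherever a WAL append fails (e.g. ENOSPC).
func (i *ingester) onWALWriteError() {
	// The WAL can no longer guarantee durability, so make sure in-memory data
	// is flushed to long-term storage when the process stops.
	i.flushOnShutdownSwitch.Trigger()
}

// stopping is the shutdown path.
func (i *ingester) stopping() {
	if i.flushOnShutdownSwitch.Get() {
		i.flushAllChunks() // flush because the WAL could not persist recent writes
	}
}

// flushAllChunks stands in for the ingester's real flush logic.
func (i *ingester) flushAllChunks() { fmt.Println("flushing all in-memory chunks to storage") }

func main() {
	ing := &ingester{flushOnShutdownSwitch: &onceSwitch{}}
	ing.onWALWriteError()
	ing.stopping()
}
```

Because the latch only moves from off to on, even a transient disk-full episode still results in a safety flush at shutdown rather than relying on the incomplete WAL.
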
pkg/ingester/instance.go (9 changes: 8 additions & 1 deletion)

@@ -3,7 +3,9 @@ package ingester
import (
	"context"
	"net/http"
	"os"
	"sync"
	"syscall"

	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"

@@ -161,8 +163,13 @@ func (i *instance) Push(ctx context.Context, req *logproto.PushRequest) error {

	if !record.IsEmpty() {
		if err := i.wal.Log(record); err != nil {
-			return err
			if e, ok := err.(*os.PathError); ok && e.Err == syscall.ENOSPC {
				i.metrics.walDiskFullFailures.Inc()

Review comment from a Collaborator:

It would be nice if we could log that this was happening, but we don't want to spam the logs.

Thoughts on using a boolean kept in the instance to log something one time, and then perhaps clear the bool and log that the error is cleared if writes start succeeding again?

			} else {
				return err
			}
		}

	}

	return appendErr
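
The log-once suggestion in the review comment above can be expressed with a small state flag on the instance. This is only a sketch of that idea; the `walFailing` field, the `observeWALError` helper, and the plain standard-library logger are illustrative rather than Loki's actual types or logging setup.

```go
// Hypothetical sketch (not part of this PR's diff): log the first WAL failure
// and the first subsequent success, instead of logging on every push.
package main

import (
	"errors"
	"log"
	"sync"
)

type instance struct {
	mtx        sync.Mutex
	walFailing bool // whether the last WAL append failed (e.g. disk full)
}

// observeWALError flips the flag on state changes and logs only at those
// transitions, so a full disk does not spam the logs on every request.
func (i *instance) observeWALError(err error) {
	i.mtx.Lock()
	defer i.mtx.Unlock()

	switch {
	case err != nil && !i.walFailing:
		i.walFailing = true
		log.Printf("WAL appends are failing; writes will not be persisted: %v", err)
	case err == nil && i.walFailing:
		i.walFailing = false
		log.Printf("WAL appends recovered; writes are being persisted again")
	}
}

func main() {
	inst := &instance{}
	walErr := errors.New("write /loki/wal/00000001: no space left on device")
	inst.observeWALError(walErr) // logged once
	inst.observeWALError(walErr) // suppressed: already in the failing state
	inst.observeWALError(nil)    // logs the recovery once
}
```
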
pkg/ingester/metrics.go (5 changes: 5 additions & 0 deletions)

@@ -13,6 +13,7 @@ type ingesterMetrics struct {
	checkpointDuration         prometheus.Summary
	checkpointLoggedBytesTotal prometheus.Counter

	walDiskFullFailures prometheus.Counter
	walReplayDuration   prometheus.Gauge
	walCorruptionsTotal *prometheus.CounterVec
	walLoggedBytesTotal prometheus.Counter

@@ -30,6 +31,10 @@ const (

func newIngesterMetrics(r prometheus.Registerer) *ingesterMetrics {
	return &ingesterMetrics{
		walDiskFullFailures: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_wal_disk_full_failures_total",
			Help: "Total number of wal write failures due to full disk.",
		}),
		walReplayDuration: promauto.With(r).NewGauge(prometheus.GaugeOpts{
			Name: "loki_ingester_wal_replay_duration_seconds",
			Help: "Time taken to replay the checkpoint and the WAL.",
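
To illustrate the behaviour the diff above introduces, here is a small standalone sketch (not from the PR) that reproduces the ENOSPC handling and reads the new counter back with the Prometheus `testutil` package. The `newWALDiskFullCounter` and `handleWALError` helpers and the WAL path in the constructed error are made up for the example; only the metric name, help text, and the ENOSPC check mirror the changes above.

```go
// Hypothetical sketch: an ENOSPC error from a WAL append is swallowed and
// counted, while any other error is still returned to the caller.
package main

import (
	"fmt"
	"os"
	"syscall"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

// newWALDiskFullCounter registers a counter equivalent to the one added to
// ingesterMetrics in this PR (helper name is illustrative).
func newWALDiskFullCounter(r prometheus.Registerer) prometheus.Counter {
	return promauto.With(r).NewCounter(prometheus.CounterOpts{
		Name: "loki_ingester_wal_disk_full_failures_total",
		Help: "Total number of wal write failures due to full disk.",
	})
}

// handleWALError mirrors the logic added in instance.Push: count ENOSPC
// instead of failing the write, return every other error unchanged.
func handleWALError(err error, diskFull prometheus.Counter) error {
	if e, ok := err.(*os.PathError); ok && e.Err == syscall.ENOSPC {
		diskFull.Inc()
		return nil
	}
	return err
}

func main() {
	reg := prometheus.NewRegistry()
	counter := newWALDiskFullCounter(reg)

	// Simulate the error a full WAL disk would produce (path is invented).
	walErr := &os.PathError{Op: "write", Path: "/loki/wal/00000123", Err: syscall.ENOSPC}
	if err := handleWALError(walErr, counter); err != nil {
		fmt.Println("unexpected error:", err)
	}

	// The write was not failed, but the failure was recorded for alerting.
	fmt.Println("disk-full failures recorded:", testutil.ToFloat64(counter)) // 1
}
```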