Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[enhancement] querier index cache: cacheStore should be off the query path; fixes issue #4862, similar to PR #5083 #5198

Closed
wants to merge 14 commits into from
Closed
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pkg/storage/chunk/aws/dynamodb_storage_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,10 @@ type dynamoDBStorageClient struct {
metrics *dynamoDBMetrics
}

func (a dynamoDBStorageClient) AsyncQueueLength() int {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would suggest not adding this method which is only relevant for caching index client and has no use beyond just test.

Copy link
Contributor Author

@liguozhong liguozhong Jan 24, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done,fix. Move AsyncQueueLength() Interface from IndexClient Interface and use it in the test through interface conversion

		assert.Eventually(t, func() bool {
			if asyncClient, ok := client.(IndexAsyncClient); ok {
				return asyncClient.AsyncQueueLength() == 0
			}
			return true
		}, time.Second, 10*time.Millisecond)

type IndexAsyncClient interface { AsyncQueueLength() int }

return 0
}

// NewDynamoDBIndexClient makes a new DynamoDB-backed IndexClient.
func NewDynamoDBIndexClient(cfg DynamoDBConfig, schemaCfg chunk.SchemaConfig, reg prometheus.Registerer) (chunk.IndexClient, error) {
return newDynamoDBStorageClient(cfg, schemaCfg, reg)
Expand Down
4 changes: 4 additions & 0 deletions pkg/storage/chunk/cassandra/storage_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,10 @@ type StorageClient struct {
querySemaphore *semaphore.Weighted
}

// AsyncQueueLength reports the number of pending asynchronous cache
// write-backs. The Cassandra client performs no asynchronous cache writes,
// so this is always zero; it exists only to satisfy the async-queue
// interface used by the caching index client.
func (s *StorageClient) AsyncQueueLength() int {
	return 0
}

// NewStorageClient returns a new StorageClient.
func NewStorageClient(cfg Config, schemaCfg chunk.SchemaConfig, registerer prometheus.Registerer) (*StorageClient, error) {
readSession, err := cfg.session("index-read", registerer)
Expand Down
4 changes: 4 additions & 0 deletions pkg/storage/chunk/gcp/bigtable_index_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ type storageClientColumnKey struct {
keysFn keysFn
}

// AsyncQueueLength reports the number of pending asynchronous cache
// write-backs. The Bigtable client performs no asynchronous cache writes,
// so this is always zero; it exists only to satisfy the async-queue
// interface used by the caching index client.
func (s *storageClientColumnKey) AsyncQueueLength() int {
	return 0
}

// storageClientV1 implements chunk.storageClient for GCP.
type storageClientV1 struct {
storageClientColumnKey
Expand Down
4 changes: 4 additions & 0 deletions pkg/storage/chunk/grpc/storage_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ type StorageClient struct {
connection *grpc.ClientConn
}

// AsyncQueueLength reports the number of pending asynchronous cache
// write-backs. The gRPC storage client performs no asynchronous cache
// writes, so this is always zero; it exists only to satisfy the
// async-queue interface used by the caching index client.
func (s *StorageClient) AsyncQueueLength() int {
	return 0
}

// NewStorageClient returns a new StorageClient.
func NewStorageClient(cfg Config, schemaCfg chunk.SchemaConfig) (*StorageClient, error) {
grpcClient, conn, err := connectToGrpcServer(cfg.Address)
Expand Down
4 changes: 4 additions & 0 deletions pkg/storage/chunk/inmemory_storage_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ type MockStorage struct {
mode MockStorageMode
}

// AsyncQueueLength reports the number of pending asynchronous cache
// write-backs. The in-memory mock performs no asynchronous cache writes,
// so this is always zero; it exists only to satisfy the async-queue
// interface used by the caching index client.
func (m *MockStorage) AsyncQueueLength() int {
	return 0
}

type mockTable struct {
items map[string][]mockItem
write, read int64
Expand Down
4 changes: 4 additions & 0 deletions pkg/storage/chunk/local/boltdb_index_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ type BoltIndexClient struct {
wait sync.WaitGroup
}

// AsyncQueueLength reports the number of pending asynchronous cache
// write-backs. The BoltDB client performs no asynchronous cache writes,
// so this is always zero; it exists only to satisfy the async-queue
// interface used by the caching index client.
func (b *BoltIndexClient) AsyncQueueLength() int {
	return 0
}

// NewBoltDBIndexClient creates a new IndexClient that used BoltDB.
func NewBoltDBIndexClient(cfg BoltDBConfig) (*BoltIndexClient, error) {
if err := chunk_util.EnsureDirectory(cfg.Directory); err != nil {
Expand Down
2 changes: 1 addition & 1 deletion pkg/storage/chunk/storage/caching_fixtures.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ func (f fixture) Clients() (chunk.IndexClient, chunk.Client, chunk.TableClient,
indexClient = newCachingIndexClient(indexClient, cache.NewFifoCache("index-fifo", cache.FifoCacheConfig{
MaxSizeItems: 500,
Validity: 5 * time.Minute,
}, reg, logger), 5*time.Minute, limits, logger, false)
}, reg, logger), 5*time.Minute, limits, logger, false, 10, 100)
return indexClient, chunkClient, tableClient, schemaConfig, closer, err
}

Expand Down
72 changes: 68 additions & 4 deletions pkg/storage/chunk/storage/caching_index_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,21 @@ import (
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/gogo/protobuf/proto"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"

"github.com/grafana/loki/pkg/storage/chunk"
"github.com/grafana/loki/pkg/storage/chunk/cache"
chunk_util "github.com/grafana/loki/pkg/storage/chunk/util"
"github.com/grafana/loki/pkg/tenant"
util_log "github.com/grafana/loki/pkg/util/log"
"github.com/grafana/loki/pkg/util/spanlogger"
)

var (
cacheCorruptErrs = promauto.NewCounter(prometheus.CounterOpts{
errAsyncBufferFull = errors.New("the async buffer is full")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO the error could be a bit more descriptive:

Suggested change
errAsyncBufferFull = errors.New("the async buffer is full")
errAsyncBufferFull = errors.New("the async buffer of the caching index client is full")

cacheCorruptErrs = promauto.NewCounter(prometheus.CounterOpts{
Namespace: "loki",
Name: "querier_index_cache_corruptions_total",
Help: "The number of cache corruptions for the index cache.",
Expand All @@ -44,6 +47,16 @@ var (
Name: "querier_index_cache_encode_errors_total",
Help: "The number of errors for the index cache while encoding the body.",
})
cacheClientQueueEnqueue = promauto.NewCounter(prometheus.CounterOpts{
Namespace: "loki",
Name: "querier_index_client_cache_enqueued_total",
Help: "Total number of index enqueued to a buffer to be asynchronously written back to the index cache.",
})
cacheClientQueueDequeue = promauto.NewCounter(prometheus.CounterOpts{
Namespace: "loki",
Name: "querier_index_client_cache_dequeued_total",
Help: "Total number of index dequeued to a buffer to be asynchronously written back to the index cache.",
})
)

const sep = "\xff"
Expand All @@ -55,26 +68,45 @@ type cachingIndexClient struct {
limits StoreLimits
logger log.Logger
disableBroadQueries bool
asyncQueue chan cacheEntry
maxAsyncConcurrency int
maxAsyncBufferSize int
stop chan struct{}
}

func newCachingIndexClient(client chunk.IndexClient, c cache.Cache, validity time.Duration, limits StoreLimits, logger log.Logger, disableBroadQueries bool) chunk.IndexClient {
func newCachingIndexClient(client chunk.IndexClient, c cache.Cache, validity time.Duration, limits StoreLimits, logger log.Logger, disableBroadQueries bool, maxAsyncConcurrency int, maxAsyncBufferSize int) chunk.IndexClient {
if c == nil || cache.IsEmptyTieredCache(c) {
return client
}

return &cachingIndexClient{
cacheClient := &cachingIndexClient{
IndexClient: client,
cache: cache.NewSnappy(c, logger),
validity: validity,
limits: limits,
logger: logger,
disableBroadQueries: disableBroadQueries,
maxAsyncConcurrency: maxAsyncConcurrency,
maxAsyncBufferSize: maxAsyncBufferSize,
stop: make(chan struct{}),
}
cacheClient.asyncQueue = make(chan cacheEntry, cacheClient.maxAsyncBufferSize)
for i := 0; i < cacheClient.maxAsyncConcurrency; i++ {
go cacheClient.asyncWriteBackCacheQueueProcessLoop()
}

return cacheClient
}

// cacheEntry is one unit of work on the asynchronous write-back queue: the
// cache keys and the read batches to store under them. keys and batches are
// parallel slices produced by queryPages.
type cacheEntry struct {
	keys    []string
	batches []ReadBatch
}

func (s *cachingIndexClient) Stop() {
s.cache.Stop()
s.IndexClient.Stop()
close(s.stop)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After closing, should we wait for the queue to be emptied before Stop() returns?

}

func (s *cachingIndexClient) QueryPages(ctx context.Context, queries []chunk.IndexQuery, callback func(chunk.IndexQuery, chunk.ReadBatch) (shouldContinue bool)) error {
Expand All @@ -89,6 +121,10 @@ func (s *cachingIndexClient) QueryPages(ctx context.Context, queries []chunk.Ind
return s.doQueries(ctx, queries, callback)
}

// AsyncQueueLength reports how many cache write-back entries are currently
// buffered in asyncQueue awaiting processing by the background workers.
// Used by tests to wait for the queue to drain.
func (s *cachingIndexClient) AsyncQueueLength() int {
	return len(s.asyncQueue)
}

func (s *cachingIndexClient) queryPages(ctx context.Context, queries []chunk.IndexQuery, callback chunk_util.Callback,
buildIndexQuery func(query chunk.IndexQuery) chunk.IndexQuery, buildQueryKey func(query chunk.IndexQuery) string) error {
if len(queries) == 0 {
Expand Down Expand Up @@ -199,10 +235,13 @@ func (s *cachingIndexClient) queryPages(ctx context.Context, queries []chunk.Ind
}
}

err := s.cacheStore(ctx, keys, batches)
cacheErr := s.cacheStoreAsync(keys, batches)
if cardinalityErr != nil {
return cardinalityErr
}
if cacheErr != nil {
level.Warn(util_log.Logger).Log("msg", "could not write fetched index from storage into index cache", "err", cacheErr)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
level.Warn(util_log.Logger).Log("msg", "could not write fetched index from storage into index cache", "err", cacheErr)
level.Warn(s.logger).Log("msg", "could not write fetched index from storage into index cache", "err", cacheErr)

}
return err
}
}
Expand Down Expand Up @@ -273,6 +312,31 @@ func isChunksQuery(q chunk.IndexQuery) bool {
return len(q.RangeValueStart) != 0
}

// cacheStoreAsync enqueues keys and their read batches for background
// write-back to the index cache. It never blocks the query path: when the
// buffer is full the entry is dropped and errAsyncBufferFull is returned.
func (s *cachingIndexClient) cacheStoreAsync(keys []string, batches []ReadBatch) error {
	entry := cacheEntry{keys: keys, batches: batches}
	select {
	case s.asyncQueue <- entry:
		cacheClientQueueEnqueue.Add(float64(len(batches)))
		return nil
	default:
	}
	return errAsyncBufferFull
}

func (s *cachingIndexClient) asyncWriteBackCacheQueueProcessLoop() {
for {
select {
case cacheEntry := <-s.asyncQueue:
cacheClientQueueDequeue.Add(float64(len(cacheEntry.batches)))
cacheErr := s.cacheStore(context.Background(), cacheEntry.keys, cacheEntry.batches)
if cacheErr != nil {
level.Warn(util_log.Logger).Log("msg", "could not write fetched index from storage into index cache", "err", cacheErr)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
level.Warn(util_log.Logger).Log("msg", "could not write fetched index from storage into index cache", "err", cacheErr)
level.Warn(s.logger).Log("msg", "could not write fetched index from storage into index cache", "err", cacheErr)

}
case <-s.stop:
return
}
}
}

func (s *cachingIndexClient) cacheStore(ctx context.Context, keys []string, batches []ReadBatch) error {
cachePuts.Add(float64(len(keys)))

Expand Down
Loading