From 6f1f0ce8bc34a7a9941e33e96dca44b678563e73 Mon Sep 17 00:00:00 2001
From: Ashish Naware
Date: Wed, 4 Sep 2024 16:39:41 -0700
Subject: [PATCH] Added metrics for LRU data cache

Signed-off-by: Ashish Naware
---
 docs/content/en/docs/reference/metrics.md | 20 ++++++
 pkg/observer/cache.go                     | 75 +++++++++++++++++++++++
 pkg/observer/data.go                      | 30 ++++-----
 pkg/observer/metrics.go                   | 51 +++++++++++++++
 4 files changed, 161 insertions(+), 15 deletions(-)
 create mode 100644 pkg/observer/cache.go

diff --git a/docs/content/en/docs/reference/metrics.md b/docs/content/en/docs/reference/metrics.md
index b3b750fc593..72ca585368f 100644
--- a/docs/content/en/docs/reference/metrics.md
+++ b/docs/content/en/docs/reference/metrics.md
@@ -29,6 +29,26 @@ Build information about tetragon
 | `modified` | `false` |
 | `time ` | `2022-05-13T15:54:45Z` |
 
+### `tetragon_data_cache_capacity`
+
+The capacity of the data cache.
+
+### `tetragon_data_cache_evictions_total`
+
+Number of data cache LRU evictions.
+
+### `tetragon_data_cache_misses_total`
+
+Number of data cache misses.
+
+| label | values |
+| ----- | ------ |
+| `operation` | `get, remove` |
+
+### `tetragon_data_cache_size`
+
+The size of the data cache
+
 ### `tetragon_data_event_size`
 
 The size of received data events.
diff --git a/pkg/observer/cache.go b/pkg/observer/cache.go
new file mode 100644
index 00000000000..64c4e39b27c
--- /dev/null
+++ b/pkg/observer/cache.go
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright Authors of Tetragon
+
+package observer
+
+import (
+	"fmt"
+
+	"github.com/cilium/tetragon/pkg/api/dataapi"
+	lru "github.com/hashicorp/golang-lru/v2"
+)
+
+// cache wraps the LRU store for data events and keeps the data cache metrics
+// in sync with it.
+type cache struct {
+	cache *lru.Cache[dataapi.DataEventId, []byte]
+	size  int
+}
+
+// newCache constructs an LRU cache holding at most dataCacheSize entries.
+//
+// No eviction callback is installed: golang-lru invokes it for explicit
+// Remove and Purge calls as well, which would count every consumed entry as
+// an eviction. Capacity evictions are counted in add instead.
+func newCache(dataCacheSize int) (*cache, error) {
+	lruCache, err := lru.New[dataapi.DataEventId, []byte](dataCacheSize)
+	if err != nil {
+		return nil, err
+	}
+	return &cache{
+		cache: lruCache,
+		size:  dataCacheSize,
+	}, nil
+}
+
+// get returns the data stored for dataEventId. A missing entry is counted as
+// a "get" miss and reported as an error.
+func (c *cache) get(dataEventId dataapi.DataEventId) ([]byte, error) {
+	data, ok := c.cache.Get(dataEventId)
+	if !ok {
+		dataCacheMisses.WithLabelValues("get").Inc()
+		return nil, fmt.Errorf("data event with id : %v not found", dataEventId)
+	}
+	return data, nil
+}
+
+// add stores msgData under id and reports whether an older entry was evicted
+// to make room for it.
+func (c *cache) add(id dataapi.DataEventId, msgData []byte) bool {
+	evicted := c.cache.Add(id, msgData)
+	if evicted {
+		dataCacheEvictions.Inc()
+	}
+	// Add also overwrites existing ids (DataAdd appends chunks that way), so
+	// publish the real length instead of incrementing on every call.
+	dataCacheTotal.Set(float64(c.cache.Len()))
+	return evicted
+}
+
+// remove deletes the entry for desc.Id and reports whether it was present. An
+// absent entry is counted as a "remove" miss.
+func (c *cache) remove(desc dataapi.DataEventDesc) bool {
+	present := c.cache.Remove(desc.Id)
+	if !present {
+		dataCacheMisses.WithLabelValues("remove").Inc()
+	}
+	dataCacheTotal.Set(float64(c.cache.Len()))
+	return present
+}
+
+// purge drops every entry and resets the size gauge.
+func (c *cache) purge() {
+	c.cache.Purge()
+	dataCacheTotal.Set(float64(c.cache.Len()))
+}
diff --git a/pkg/observer/data.go b/pkg/observer/data.go
index 15632fc4e19..4d0b82e4b5d 100644
--- a/pkg/observer/data.go
+++ b/pkg/observer/data.go
@@ -12,7 +12,6 @@ import (
 	"github.com/cilium/tetragon/pkg/api/dataapi"
 	"github.com/cilium/tetragon/pkg/api/ops"
 	"github.com/cilium/tetragon/pkg/logger"
-	lru "github.com/hashicorp/golang-lru/v2"
 )
 
 func init() {
@@ -20,29 +19,30 @@ func init() {
 }
 
 var (
-	dataMap *lru.Cache[dataapi.DataEventId, []byte]
+	dataCache *cache
 )
 
 func InitDataCache(size int) error {
 	var err error
-
-	dataMap, err = lru.New[dataapi.DataEventId, []byte](size)
+	dataCache, err = newCache(size)
 	return err
 }
 
 func DataAdd(id dataapi.DataEventId, msgData []byte) error {
 	size := len(msgData)
-	data, ok := dataMap.Get(id)
+	// Look the id up directly: a first chunk is expected to be absent and
+	// must not be counted as a cache miss.
+	data, ok := dataCache.cache.Get(id)
 	if !ok {
-		dataMap.Add(id, msgData)
+		dataCache.add(id, msgData)
 		DataEventMetricInc(DataEventAdded)
 	} else {
 		data = append(data, msgData...)
-		dataMap.Add(id, data)
+		dataCache.add(id, data)
 		DataEventMetricInc(DataEventAppended)
+		logger.GetLogger().WithFields(nil).Tracef("Data message received id %v, size %v, total %v", id, size, len(data))
 	}
 
-	logger.GetLogger().WithFields(nil).Tracef("Data message received id %v, size %v, total %v", id, size, len(data))
 	return nil
 }
 
@@ -60,13 +60,13 @@ func add(r *bytes.Reader, m *dataapi.MsgData) error {
 }
 
 func DataGet(desc dataapi.DataEventDesc) ([]byte, error) {
-	data, ok := dataMap.Get(desc.Id)
-	if !ok {
+	data, err := dataCache.get(desc.Id)
+	if err != nil {
 		DataEventMetricInc(DataEventNotMatched)
-		return nil, fmt.Errorf("failed to find data for id: %v", desc.Id)
+		return nil, err
 	}
 
-	dataMap.Remove(desc.Id)
+	dataCache.remove(desc)
 
 	// make sure we did not loose anything on the way through ring buffer
 	if len(data) != int(desc.Size-desc.Leftover) {
@@ -88,12 +88,12 @@ func HandleData(r *bytes.Reader) ([]Event, error) {
 	m := dataapi.MsgData{}
 	err := binary.Read(r, binary.LittleEndian, &m)
 	if err != nil {
-		return nil, fmt.Errorf("Failed to read data msg")
+		return nil, fmt.Errorf("failed to read data msg")
 	}
 
 	err = add(r, &m)
 	if err != nil {
-		return nil, fmt.Errorf("Failed to add data msg")
+		return nil, fmt.Errorf("failed to add data msg")
 	}
 
 	// we don't send the event further
@@ -101,5 +101,5 @@ func HandleData(r *bytes.Reader) ([]Event, error) {
 }
 
 func DataPurge() {
-	dataMap.Purge()
+	dataCache.purge()
 }
diff --git a/pkg/observer/metrics.go b/pkg/observer/metrics.go
index 639ac965ee3..1da07831688 100644
--- a/pkg/observer/metrics.go
+++ b/pkg/observer/metrics.go
@@ -13,6 +13,13 @@ const (
 	subsystem = "observer"
 )
 
+var (
+	operationLabel = metrics.ConstrainedLabel{
+		Name:   "operation",
+		Values: []string{"get", "remove"},
+	}
+)
+
 var (
 	// TODO: These metrics are also stored as Observer struct fields. We could
 	// collect them only once: https://github.com/cilium/tetragon/issues/2834
@@ -53,12 +60,56 @@ var (
 		Help:        "Number of perf events Tetragon ring buffer events queue lost.",
 		ConstLabels: nil,
 	})
+
+	dataCacheTotal = prometheus.NewGauge(prometheus.GaugeOpts{
+		Namespace:   consts.MetricsNamespace,
+		Name:        "data_cache_size",
+		Help:        "The size of the data cache",
+		ConstLabels: nil,
+	})
+	dataCacheCapacity = metrics.MustNewCustomGauge(metrics.NewOpts(
+		consts.MetricsNamespace, "", "data_cache_capacity",
+		"The capacity of the data cache.",
+		nil, nil, nil,
+	))
+	dataCacheEvictions = prometheus.NewCounter(prometheus.CounterOpts{
+		Namespace: consts.MetricsNamespace,
+		Name:      "data_cache_evictions_total",
+		Help:      "Number of data cache LRU evictions.",
+	})
+	dataCacheMisses = metrics.MustNewCounter(metrics.NewOpts(
+		consts.MetricsNamespace, "",
+		"data_cache_misses_total",
+		"Number of data cache misses.",
+		nil,
+		[]metrics.ConstrainedLabel{operationLabel},
+		nil,
+	), nil)
 )
 
+func newCacheCollector() prometheus.Collector {
+	return metrics.NewCustomCollector(
+		metrics.CustomMetrics{dataCacheCapacity},
+		func(ch chan<- prometheus.Metric) {
+			capacity := 0
+			if dataCache != nil {
+				capacity = dataCache.size
+			}
+			ch <- dataCacheCapacity.MustMetric(float64(capacity))
+		},
+		nil,
+	)
+}
+
 func RegisterHealthMetrics(group metrics.Group) {
 	group.MustRegister(RingbufReceived)
 	group.MustRegister(RingbufLost)
 	group.MustRegister(RingbufErrors)
 	group.MustRegister(queueReceived)
 	group.MustRegister(queueLost)
+	group.MustRegister(
+		dataCacheTotal,
+		dataCacheEvictions,
+		dataCacheMisses,
+		newCacheCollector())
 }