Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added metrics for LRU data cache #2908

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions docs/content/en/docs/reference/metrics.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

62 changes: 62 additions & 0 deletions pkg/observer/cache.go
AshishNaware marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Tetragon

package observer

import (
"fmt"

"github.com/cilium/tetragon/pkg/api/dataapi"
lru "github.com/hashicorp/golang-lru/v2"
)

// cache pairs the LRU store that backs the data event cache with the
// capacity it was created with.
type cache struct {
	// cache maps a data event id to the bytes stored for it.
	cache *lru.Cache[dataapi.DataEventId, []byte]
	// size is the capacity handed to newCache; the capacity collector in
	// metrics.go reads it.
	size int
}

// newCache constructs a cache of fixed size with the callback function that increments
// data_cache_evictions_total counter every time the cache is evicted.
func newCache(dataCacheSize int) (*cache, error) {
lruCache, err := lru.NewWithEvict(
AshishNaware marked this conversation as resolved.
Show resolved Hide resolved
dataCacheSize,
func(_ dataapi.DataEventId, _ []byte) {
dataCacheEvictions.Inc()
},
)
if err != nil {
return nil, err
}
cache := &cache{
cache: lruCache,
size: dataCacheSize,
}
return cache, nil
}

func (c *cache) get(dataEventId dataapi.DataEventId) ([]byte, error) {
data, ok := c.cache.Get(dataEventId)
if !ok {
dataCacheMisses.WithLabelValues("get").Inc()
return nil, fmt.Errorf("data event with id : %v not found", dataEventId)
}
return data, nil
}

func (c *cache) add(id dataapi.DataEventId, msgData []byte) bool {
evicted := c.cache.Add(id, msgData)
if !evicted {
dataCacheTotal.Inc()
}
return evicted
AshishNaware marked this conversation as resolved.
Show resolved Hide resolved
}

// remove deletes the entry for desc.Id and reports whether it was present.
// A missing entry is counted in data_cache_misses_total under the "remove"
// operation.
func (c *cache) remove(desc dataapi.DataEventDesc) bool {
	if !c.cache.Remove(desc.Id) {
		dataCacheMisses.WithLabelValues("remove").Inc()
		return false
	}
	// Publish the actual entry count rather than decrementing, so the gauge
	// falls back in line with the cache even if it was out of step before
	// this call (for example after the LRU was purged directly).
	dataCacheTotal.Set(float64(c.cache.Len()))
	return true
}
30 changes: 14 additions & 16 deletions pkg/observer/data.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,37 +12,35 @@ import (
"github.com/cilium/tetragon/pkg/api/dataapi"
"github.com/cilium/tetragon/pkg/api/ops"
"github.com/cilium/tetragon/pkg/logger"
lru "github.com/hashicorp/golang-lru/v2"
)

// init registers HandleData as the handler for MSG_OP_DATA events.
func init() {
RegisterEventHandlerAtInit(ops.MSG_OP_DATA, HandleData)
}

// Package-level store for data event payloads; InitDataCache assigns it.
var (
dataMap *lru.Cache[dataapi.DataEventId, []byte]
dataCache *cache
)

// InitDataCache builds the package-level data cache from size and returns
// the error reported by the cache constructor, if any.
func InitDataCache(size int) error {
var err error

dataMap, err = lru.New[dataapi.DataEventId, []byte](size)
dataCache, err = newCache(size)
return err
}

// DataAdd records one chunk of a data event. When nothing is cached for id
// yet, msgData is stored as-is and DataEventAdded is counted; otherwise
// msgData is appended to the cached bytes, the result is stored back under
// id and DataEventAppended is counted. The returned error is always nil.
//
// NOTE(review): the lookup through dataCache.get counts a "get" miss for the
// first chunk of every event — confirm that this is intended for the
// data_cache_misses_total metric.
func DataAdd(id dataapi.DataEventId, msgData []byte) error {
size := len(msgData)
data, ok := dataMap.Get(id)
if !ok {
dataMap.Add(id, msgData)
data, err := dataCache.get(id)
if err != nil {
dataCache.add(id, msgData)
DataEventMetricInc(DataEventAdded)
} else {
data = append(data, msgData...)
dataMap.Add(id, data)
dataCache.add(id, data)
DataEventMetricInc(DataEventAppended)
logger.GetLogger().WithFields(nil).Tracef("Data message received id %v, size %v, total %v", id, size, len(data))
}

logger.GetLogger().WithFields(nil).Tracef("Data message received id %v, size %v, total %v", id, size, len(data))
return nil
}

Expand All @@ -60,13 +58,13 @@ func add(r *bytes.Reader, m *dataapi.MsgData) error {
}

func DataGet(desc dataapi.DataEventDesc) ([]byte, error) {
data, ok := dataMap.Get(desc.Id)
if !ok {
data, err := dataCache.get(desc.Id)
if err != nil {
DataEventMetricInc(DataEventNotMatched)
return nil, fmt.Errorf("failed to find data for id: %v", desc.Id)
return nil, err
}

dataMap.Remove(desc.Id)
dataCache.remove(desc)

// make sure we did not lose anything on the way through the ring buffer
if len(data) != int(desc.Size-desc.Leftover) {
Expand All @@ -88,18 +86,18 @@ func HandleData(r *bytes.Reader) ([]Event, error) {
m := dataapi.MsgData{}
err := binary.Read(r, binary.LittleEndian, &m)
if err != nil {
return nil, fmt.Errorf("Failed to read data msg")
return nil, fmt.Errorf("failed to read data msg")
}

err = add(r, &m)
if err != nil {
return nil, fmt.Errorf("Failed to add data msg")
return nil, fmt.Errorf("failed to add data msg")
}

// we don't send the event further
return nil, nil
}

// DataPurge empties the data cache.
//
// NOTE(review): the purge is issued on the underlying LRU directly, so the
// data_cache_size gauge is not adjusted here — confirm it is brought back in
// line elsewhere.
func DataPurge() {
dataMap.Purge()
dataCache.cache.Purge()
}
51 changes: 51 additions & 0 deletions pkg/observer/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@ const (
subsystem = "observer"
)

// operationLabel is the "operation" label attached to the data cache miss
// counter; its values name the two cache operations that record a miss.
var operationLabel = metrics.ConstrainedLabel{
	Name:   "operation",
	Values: []string{"get", "remove"},
}

var (
// TODO: These metrics are also stored as Observer struct fields. We could
// collect them only once: https://github.com/cilium/tetragon/issues/2834
Expand Down Expand Up @@ -53,12 +60,56 @@ var (
Help: "Number of perf events Tetragon ring buffer events queue lost.",
ConstLabels: nil,
})

dataCacheTotal = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: consts.MetricsNamespace,
Name: "data_cache_size",
Help: "The size of the data cache",
ConstLabels: nil,
})
dataCacheCapacity = metrics.MustNewCustomGauge(metrics.NewOpts(
consts.MetricsNamespace, "", "data_cache_capacity",
"The capacity of the data cache.",
nil, nil, nil,
))
dataCacheEvictions = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: consts.MetricsNamespace,
Name: "data_cache_evictions_total",
Help: "Number of data cache LRU evictions.",
})
dataCacheMisses = metrics.MustNewCounter(metrics.NewOpts(
consts.MetricsNamespace, "",
"data_cache_misses_total",
"Number of data cache misses.",
nil,
[]metrics.ConstrainedLabel{operationLabel},
nil,
), nil)
)

// newCacheCollector builds the custom collector for data_cache_capacity.
// Its callback emits the size of the package-level data cache, or 0 while
// that cache has not been created yet.
func newCacheCollector() prometheus.Collector {
	collect := func(ch chan<- prometheus.Metric) {
		var capacity int
		if dataCache != nil {
			capacity = dataCache.size
		}
		ch <- dataCacheCapacity.MustMetric(float64(capacity))
	}
	return metrics.NewCustomCollector(
		metrics.CustomMetrics{dataCacheCapacity},
		collect,
		nil,
	)
}

// RegisterHealthMetrics registers the observer's ring buffer, event queue
// and data cache metrics with group.
func RegisterHealthMetrics(group metrics.Group) {
	// Ring buffer and event queue counters.
	group.MustRegister(
		RingbufReceived,
		RingbufLost,
		RingbufErrors,
		queueReceived,
		queueLost,
	)
	// Data cache metrics, including the capacity collector.
	group.MustRegister(
		dataCacheTotal,
		dataCacheEvictions,
		dataCacheMisses,
		newCacheCollector(),
	)
}
Loading