From 205eee5b336f13bea4d9ce38db20ac882a5cc9d4 Mon Sep 17 00:00:00 2001 From: Anna Kapuscinska Date: Mon, 2 Sep 2024 11:59:20 +0200 Subject: [PATCH] eventcache: Refactor retries counter Use new helpers from pkg/metrics to define custom metrics and collector. Rename the metric to tetragon_event_cache_fetch_retries_total, to make it clear what is being retried. Signed-off-by: Anna Kapuscinska --- contrib/upgrade-notes/latest.md | 1 + docs/content/en/docs/reference/metrics.md | 12 ++++++------ pkg/eventcache/eventcache.go | 6 +++--- pkg/eventcache/metrics.go | 19 ++++++++----------- pkg/grpc/exec/exec.go | 10 +++++----- 5 files changed, 23 insertions(+), 25 deletions(-) diff --git a/contrib/upgrade-notes/latest.md b/contrib/upgrade-notes/latest.md index d12acb1b3db..11e36bd1570 100644 --- a/contrib/upgrade-notes/latest.md +++ b/contrib/upgrade-notes/latest.md @@ -59,3 +59,4 @@ tetragon: * `tetragon_event_cache_<entry_type>_errors_total` metrics are replaced by `tetragon_event_cache_fetch_failures_total{entry_type="<entry_type>"}`. * `tetragon_event_cache_accesses_total` metric is renamed to `tetragon_event_cache_inserts_total`. +* `tetragon_event_cache_retries_total` metric is renamed to `tetragon_event_cache_fetch_retries_total`. diff --git a/docs/content/en/docs/reference/metrics.md b/docs/content/en/docs/reference/metrics.md index df2df5534e3..61a013a839d 100644 --- a/docs/content/en/docs/reference/metrics.md +++ b/docs/content/en/docs/reference/metrics.md @@ -75,18 +75,18 @@ Number of failed fetches from the event cache. These won't be retried as they al | `entry_type` | `parent_info, pod_info, process_info` | | `event_type` | `PROCESS_EXEC, PROCESS_EXIT, PROCESS_KPROBE, PROCESS_LOADER, PROCESS_LSM, PROCESS_THROTTLE, PROCESS_TRACEPOINT, PROCESS_UPROBE, RATE_LIMIT_INFO` | -### `tetragon_event_cache_inserts_total` - -Number of inserts to the event cache. 
- -### `tetragon_event_cache_retries_total` +### `tetragon_event_cache_fetch_retries_total` -The total number of retries for event caching per entry type. +Number of retries when fetching info from the event cache. | label | values | | ----- | ------ | | `entry_type` | `parent_info, pod_info, process_info` | +### `tetragon_event_cache_inserts_total` + +Number of inserts to the event cache. + ### `tetragon_events_exported_bytes_total` Number of bytes exported for events diff --git a/pkg/eventcache/eventcache.go b/pkg/eventcache/eventcache.go index 8b86c7395a6..8e29a4f2f68 100644 --- a/pkg/eventcache/eventcache.go +++ b/pkg/eventcache/eventcache.go @@ -68,7 +68,7 @@ func HandleGenericInternal(ev notify.Event, pid uint32, tid *uint32, timestamp u if parent != nil { ev.SetParent(parent.UnsafeGetProcess()) } else { - EventCacheRetries(ParentInfo).Inc() + CacheRetries(ParentInfo).Inc() err = ErrFailedToGetParentInfo } @@ -84,7 +84,7 @@ func HandleGenericInternal(ev notify.Event, pid uint32, tid *uint32, timestamp u process.UpdateEventProcessTid(proc, tid) ev.SetProcess(proc) } else { - EventCacheRetries(ProcessInfo).Inc() + CacheRetries(ProcessInfo).Inc() err = ErrFailedToGetProcessInfo } @@ -101,7 +101,7 @@ func HandleGenericInternal(ev notify.Event, pid uint32, tid *uint32, timestamp u func HandleGenericEvent(internal *process.ProcessInternal, ev notify.Event, tid *uint32) error { p := internal.UnsafeGetProcess() if option.Config.EnableK8s && p.Pod == nil { - EventCacheRetries(PodInfo).Inc() + CacheRetries(PodInfo).Inc() return ErrFailedToGetPodInfo } diff --git a/pkg/eventcache/metrics.go b/pkg/eventcache/metrics.go index d359aa90cb7..abf2dffa73f 100644 --- a/pkg/eventcache/metrics.go +++ b/pkg/eventcache/metrics.go @@ -62,11 +62,6 @@ var ( Help: "The total of errors encountered while fetching process exec information from the cache.", ConstLabels: nil, }, []string{"error", "event_type"}) - eventCacheRetriesTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ - 
Namespace: consts.MetricsNamespace, - Name: "event_cache_retries_total", - Help: "The total number of retries for event caching per entry type.", - }, []string{"entry_type"}) cacheSize = metrics.MustNewCustomGauge(metrics.NewOpts( consts.MetricsNamespace, "", "event_cache_entries", "The number of entries in the event cache.", @@ -79,6 +74,11 @@ var ( Help: "Number of inserts to the event cache.", ConstLabels: nil, }) + cacheRetries = metrics.MustNewCounter(metrics.NewOpts( + consts.MetricsNamespace, subsystem, "fetch_retries_total", + "Number of retries when fetching info from the event cache.", + nil, []metrics.ConstrainedLabel{entryTypeLabel}, nil, + ), nil) failedFetches = metrics.MustNewCounter(metrics.NewOpts( consts.MetricsNamespace, subsystem, "fetch_failures_total", "Number of failed fetches from the event cache. These won't be retried as they already exceeded the limit.", @@ -102,19 +102,16 @@ func newCacheCollector() prometheus.Collector { func RegisterMetrics(group metrics.Group) { group.MustRegister(eventCacheErrorsTotal) - group.MustRegister(eventCacheRetriesTotal) group.MustRegister( newCacheCollector(), cacheInserts, + cacheRetries, failedFetches, ) } func InitMetrics() { // Initialize metrics with labels - for en := range cacheEntryTypeLabelValues { - EventCacheRetries(en).Add(0) - } for ev := range tetragon.EventType_name { if tetragon.EventType(ev) != tetragon.EventType_UNDEF && tetragon.EventType(ev) != tetragon.EventType_TEST { for er := range cacheErrorLabelValues { @@ -130,6 +127,6 @@ func EventCacheError(er CacheError, eventType tetragon.EventType) prometheus.Cou } // Get a new handle on an eventCacheRetriesTotal metric for an entryType -func EventCacheRetries(entryType CacheEntryType) prometheus.Counter { - return eventCacheRetriesTotal.WithLabelValues(entryType.String()) +func CacheRetries(entryType CacheEntryType) prometheus.Counter { + return cacheRetries.WithLabelValues(entryType.String()) } diff --git a/pkg/grpc/exec/exec.go 
b/pkg/grpc/exec/exec.go index 7e9d98befba..ed3799578f1 100644 --- a/pkg/grpc/exec/exec.go +++ b/pkg/grpc/exec/exec.go @@ -191,7 +191,7 @@ func (msg *MsgExecveEventUnix) Retry(internal *process.ProcessInternal, ev notif cgroupID := msg.Unix.Kube.Cgrpid podInfo = process.GetPodInfo(cgroupID, containerId, filename, args, nspid) if podInfo == nil { - eventcache.EventCacheRetries(eventcache.PodInfo).Inc() + eventcache.CacheRetries(eventcache.PodInfo).Inc() return eventcache.ErrFailedToGetPodInfo } } @@ -420,7 +420,7 @@ func (msg *MsgExitEventUnix) RetryInternal(ev notify.Event, timestamp uint64) (* msg.RefCntDone[ParentRefCnt] = true } } else { - eventcache.EventCacheRetries(eventcache.ParentInfo).Inc() + eventcache.CacheRetries(eventcache.ParentInfo).Inc() err = eventcache.ErrFailedToGetParentInfo } @@ -432,7 +432,7 @@ func (msg *MsgExitEventUnix) RetryInternal(ev notify.Event, timestamp uint64) (* msg.RefCntDone[ProcessRefCnt] = true } } else { - eventcache.EventCacheRetries(eventcache.ProcessInfo).Inc() + eventcache.CacheRetries(eventcache.ProcessInfo).Inc() err = eventcache.ErrFailedToGetProcessInfo } @@ -486,7 +486,7 @@ func (msg *MsgProcessCleanupEventUnix) RetryInternal(_ notify.Event, timestamp u msg.RefCntDone[ParentRefCnt] = true } } else { - eventcache.EventCacheRetries(eventcache.ParentInfo).Inc() + eventcache.CacheRetries(eventcache.ParentInfo).Inc() err = eventcache.ErrFailedToGetParentInfo } @@ -496,7 +496,7 @@ func (msg *MsgProcessCleanupEventUnix) RetryInternal(_ notify.Event, timestamp u msg.RefCntDone[ProcessRefCnt] = true } } else { - eventcache.EventCacheRetries(eventcache.ProcessInfo).Inc() + eventcache.CacheRetries(eventcache.ProcessInfo).Inc() err = eventcache.ErrFailedToGetProcessInfo }