From 95974f4ba0029b2f179d49ebf935d7742c42e970 Mon Sep 17 00:00:00 2001 From: sadath-12 Date: Fri, 29 Dec 2023 18:28:03 +0530 Subject: [PATCH] perf: handle metrics correctly added eventCacheRetriesTotal metric for capturing pod,process,parent(Info) retries along parentInfoErrors, processInfoErrors and podInfoErrors update Signed-off-by: sadath-12 --- pkg/eventcache/eventcache.go | 11 +++---- pkg/grpc/exec/exec.go | 8 +++-- pkg/metrics/errormetrics/errormetrics.go | 6 ---- .../eventcachemetrics/eventcachemetrics.go | 29 +++++++++++++++++++ 4 files changed, 40 insertions(+), 14 deletions(-) diff --git a/pkg/eventcache/eventcache.go b/pkg/eventcache/eventcache.go index 9755bf686d9..5f5a03649ce 100644 --- a/pkg/eventcache/eventcache.go +++ b/pkg/eventcache/eventcache.go @@ -9,7 +9,6 @@ import ( "github.com/cilium/tetragon/api/v1/tetragon" "github.com/cilium/tetragon/pkg/ktime" - "github.com/cilium/tetragon/pkg/metrics/errormetrics" "github.com/cilium/tetragon/pkg/metrics/eventcachemetrics" "github.com/cilium/tetragon/pkg/option" "github.com/cilium/tetragon/pkg/process" @@ -70,7 +69,7 @@ func HandleGenericInternal(ev notify.Event, pid uint32, tid *uint32, timestamp u if parent != nil { ev.SetParent(parent.UnsafeGetProcess()) } else { - errormetrics.ErrorTotalInc(errormetrics.EventCacheParentInfoFailed) + eventcachemetrics.EventCacheRetries(eventcachemetrics.ParentInfo).Inc() err = ErrFailedToGetParentInfo } @@ -86,7 +85,7 @@ func HandleGenericInternal(ev notify.Event, pid uint32, tid *uint32, timestamp u process.UpdateEventProcessTid(proc, tid) ev.SetProcess(proc) } else { - errormetrics.ErrorTotalInc(errormetrics.EventCacheProcessInfoFailed) + eventcachemetrics.EventCacheRetries(eventcachemetrics.ProcessInfo).Inc() err = ErrFailedToGetProcessInfo } @@ -103,7 +102,7 @@ func HandleGenericInternal(ev notify.Event, pid uint32, tid *uint32, timestamp u func HandleGenericEvent(internal *process.ProcessInternal, ev notify.Event, tid *uint32) error { p := 
internal.UnsafeGetProcess() if option.Config.EnableK8s && p.Pod == nil { - errormetrics.ErrorTotalInc(errormetrics.EventCachePodInfoRetryFailed) + eventcachemetrics.EventCacheRetries(eventcachemetrics.PodInfo).Inc() return ErrFailedToGetPodInfo } @@ -141,7 +140,9 @@ func (ec *Cache) handleEvents() { tmp = append(tmp, event) continue } - if errors.Is(err, ErrFailedToGetProcessInfo) { + if errors.Is(err, ErrFailedToGetParentInfo) { + eventcachemetrics.ParentInfoError(notify.EventTypeString(event.event)).Inc() + } else if errors.Is(err, ErrFailedToGetProcessInfo) { eventcachemetrics.ProcessInfoError(notify.EventTypeString(event.event)).Inc() } else if errors.Is(err, ErrFailedToGetPodInfo) { eventcachemetrics.PodInfoError(notify.EventTypeString(event.event)).Inc() diff --git a/pkg/grpc/exec/exec.go b/pkg/grpc/exec/exec.go index 096fb2d9aea..de9fa122f98 100644 --- a/pkg/grpc/exec/exec.go +++ b/pkg/grpc/exec/exec.go @@ -196,7 +196,7 @@ func (msg *MsgExecveEventUnix) Retry(internal *process.ProcessInternal, ev notif if option.Config.EnableK8s && containerId != "" { podInfo = process.GetPodInfo(containerId, filename, args, nspid) if podInfo == nil { - errormetrics.ErrorTotalInc(errormetrics.EventCachePodInfoRetryFailed) + eventcachemetrics.EventCacheRetries(eventcachemetrics.PodInfo).Inc() return eventcache.ErrFailedToGetPodInfo } } @@ -434,7 +434,7 @@ func (msg *MsgExitEventUnix) RetryInternal(ev notify.Event, timestamp uint64) (* msg.RefCntDone[ParentRefCnt] = true } } else { - errormetrics.ErrorTotalInc(errormetrics.EventCacheParentInfoFailed) + eventcachemetrics.EventCacheRetries(eventcachemetrics.ParentInfo).Inc() err = eventcache.ErrFailedToGetParentInfo } @@ -446,7 +446,7 @@ func (msg *MsgExitEventUnix) RetryInternal(ev notify.Event, timestamp uint64) (* msg.RefCntDone[ProcessRefCnt] = true } } else { - errormetrics.ErrorTotalInc(errormetrics.EventCacheProcessInfoFailed) + eventcachemetrics.EventCacheRetries(eventcachemetrics.ProcessInfo).Inc() err = 
eventcache.ErrFailedToGetProcessInfo } @@ -505,6 +505,7 @@ func (msg *MsgProcessCleanupEventUnix) RetryInternal(_ notify.Event, timestamp u msg.RefCntDone[ParentRefCnt] = true } } else { + eventcachemetrics.EventCacheRetries(eventcachemetrics.ParentInfo).Inc() err = eventcache.ErrFailedToGetParentInfo } @@ -514,6 +515,7 @@ func (msg *MsgProcessCleanupEventUnix) RetryInternal(_ notify.Event, timestamp u msg.RefCntDone[ProcessRefCnt] = true } } else { + eventcachemetrics.EventCacheRetries(eventcachemetrics.ProcessInfo).Inc() err = eventcache.ErrFailedToGetProcessInfo } diff --git a/pkg/metrics/errormetrics/errormetrics.go b/pkg/metrics/errormetrics/errormetrics.go index a0de0cb331b..c6c24ec0aa6 100644 --- a/pkg/metrics/errormetrics/errormetrics.go +++ b/pkg/metrics/errormetrics/errormetrics.go @@ -23,12 +23,6 @@ var ( ProcessCacheMissOnRemove ErrorType = "process_cache_miss_on_remove" // Tid and Pid mismatch that could affect BPF and user space caching logic ProcessPidTidMismatch ErrorType = "process_pid_tid_mismatch" - // Event cache podInfo retries failed. - EventCachePodInfoRetryFailed ErrorType = "event_cache_podinfo_retry_failed" - // Event cache failed to set process information for an event. - EventCacheProcessInfoFailed ErrorType = "event_cache_process_info_failed" - // Event cache failed to set parent information for an event. - EventCacheParentInfoFailed ErrorType = "event_cache_parent_info_failed" // An event is missing process info. EventMissingProcessInfo ErrorType = "event_missing_process_info" // An error occurred in an event handler. 
diff --git a/pkg/metrics/eventcachemetrics/eventcachemetrics.go b/pkg/metrics/eventcachemetrics/eventcachemetrics.go index 9fa660b5a24..669fd61ba81 100644 --- a/pkg/metrics/eventcachemetrics/eventcachemetrics.go +++ b/pkg/metrics/eventcachemetrics/eventcachemetrics.go @@ -8,6 +8,12 @@ import ( "github.com/prometheus/client_golang/prometheus" ) +const ( + ProcessInfo = "process_info" + ParentInfo = "parent_info" + PodInfo = "pod_info" +) + var ( processInfoErrors = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: consts.MetricsNamespace, @@ -33,6 +39,17 @@ var ( Help: "The total of errors encountered while fetching process exec information from the cache.", ConstLabels: nil, }, []string{"error"}) + eventCacheRetriesTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: consts.MetricsNamespace, + Name: "event_cache_retries_total", + Help: "The total number of retries for event caching per entry type.", + }, []string{"entry_type"}) + parentInfoErrors = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: consts.MetricsNamespace, + Name: "event_cache_parent_info_errors_total", + Help: "The total of times we failed to fetch cached parent info for a given event type.", + ConstLabels: nil, + }, []string{"event_type"}) ) func InitMetrics(registry *prometheus.Registry) { @@ -40,6 +57,8 @@ func InitMetrics(registry *prometheus.Registry) { registry.MustRegister(podInfoErrors) registry.MustRegister(EventCacheCount) registry.MustRegister(eventCacheErrorsTotal) + registry.MustRegister(eventCacheRetriesTotal) + registry.MustRegister(parentInfoErrors) } // Get a new handle on an processInfoErrors metric for an eventType @@ -56,3 +75,13 @@ func PodInfoError(eventType string) prometheus.Counter { func EventCacheError(err string) prometheus.Counter { return eventCacheErrorsTotal.WithLabelValues(err) } + +// Get a new handle on the eventCacheRetriesTotal metric for an entryType +func EventCacheRetries(entryType string) prometheus.Counter { + return 
eventCacheRetriesTotal.WithLabelValues(entryType) +} + +// Get a new handle on a parentInfoErrors metric for an eventType +func ParentInfoError(eventType string) prometheus.Counter { + return parentInfoErrors.WithLabelValues(eventType) +}