Skip to content

Commit

Permalink
eventcache: Refactor retries counter
Browse files Browse the repository at this point in the history
Use new helpers from pkg/metrics to define custom metrics and collector.

Rename the metric to tetragon_event_cache_fetch_retries_total, to make it clear
what is being retried.

Signed-off-by: Anna Kapuscinska <anna@isovalent.com>
  • Loading branch information
lambdanis committed Sep 2, 2024
1 parent 798c77b commit 205eee5
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 25 deletions.
1 change: 1 addition & 0 deletions contrib/upgrade-notes/latest.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,4 @@ tetragon:
* `tetragon_event_cache_<entry_type>_errors_total` metrics are replaced by
`tetragon_event_cache_fetch_failures_total{entry_type="<entry_type>"}`.
* `tetragon_event_cache_accesses_total` metric is renamed to `tetragon_event_cache_inserts_total`.
* `tetragon_event_cache_retries_total` metric is renamed to `tetragon_event_cache_fetch_retries_total`.
12 changes: 6 additions & 6 deletions docs/content/en/docs/reference/metrics.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions pkg/eventcache/eventcache.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ func HandleGenericInternal(ev notify.Event, pid uint32, tid *uint32, timestamp u
if parent != nil {
ev.SetParent(parent.UnsafeGetProcess())
} else {
EventCacheRetries(ParentInfo).Inc()
CacheRetries(ParentInfo).Inc()
err = ErrFailedToGetParentInfo
}

Expand All @@ -84,7 +84,7 @@ func HandleGenericInternal(ev notify.Event, pid uint32, tid *uint32, timestamp u
process.UpdateEventProcessTid(proc, tid)
ev.SetProcess(proc)
} else {
EventCacheRetries(ProcessInfo).Inc()
CacheRetries(ProcessInfo).Inc()
err = ErrFailedToGetProcessInfo
}

Expand All @@ -101,7 +101,7 @@ func HandleGenericInternal(ev notify.Event, pid uint32, tid *uint32, timestamp u
func HandleGenericEvent(internal *process.ProcessInternal, ev notify.Event, tid *uint32) error {
p := internal.UnsafeGetProcess()
if option.Config.EnableK8s && p.Pod == nil {
EventCacheRetries(PodInfo).Inc()
CacheRetries(PodInfo).Inc()
return ErrFailedToGetPodInfo
}

Expand Down
19 changes: 8 additions & 11 deletions pkg/eventcache/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,6 @@ var (
Help: "The total of errors encountered while fetching process exec information from the cache.",
ConstLabels: nil,
}, []string{"error", "event_type"})
eventCacheRetriesTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: consts.MetricsNamespace,
Name: "event_cache_retries_total",
Help: "The total number of retries for event caching per entry type.",
}, []string{"entry_type"})
cacheSize = metrics.MustNewCustomGauge(metrics.NewOpts(
consts.MetricsNamespace, "", "event_cache_entries",
"The number of entries in the event cache.",
Expand All @@ -79,6 +74,11 @@ var (
Help: "Number of inserts to the event cache.",
ConstLabels: nil,
})
cacheRetries = metrics.MustNewCounter(metrics.NewOpts(
consts.MetricsNamespace, subsystem, "fetch_retries_total",
"Number of retries when fetching info from the event cache.",
nil, []metrics.ConstrainedLabel{entryTypeLabel}, nil,
), nil)
failedFetches = metrics.MustNewCounter(metrics.NewOpts(
consts.MetricsNamespace, subsystem, "fetch_failures_total",
"Number of failed fetches from the event cache. These won't be retried as they already exceeded the limit.",
Expand All @@ -102,19 +102,16 @@ func newCacheCollector() prometheus.Collector {

func RegisterMetrics(group metrics.Group) {
group.MustRegister(eventCacheErrorsTotal)
group.MustRegister(eventCacheRetriesTotal)
group.MustRegister(
newCacheCollector(),
cacheInserts,
cacheRetries,
failedFetches,
)
}

func InitMetrics() {
// Initialize metrics with labels
for en := range cacheEntryTypeLabelValues {
EventCacheRetries(en).Add(0)
}
for ev := range tetragon.EventType_name {
if tetragon.EventType(ev) != tetragon.EventType_UNDEF && tetragon.EventType(ev) != tetragon.EventType_TEST {
for er := range cacheErrorLabelValues {
Expand All @@ -130,6 +127,6 @@ func EventCacheError(er CacheError, eventType tetragon.EventType) prometheus.Cou
}

// CacheRetries returns a handle on the cache fetch retries counter for the given entryType.
func EventCacheRetries(entryType CacheEntryType) prometheus.Counter {
return eventCacheRetriesTotal.WithLabelValues(entryType.String())
func CacheRetries(entryType CacheEntryType) prometheus.Counter {
return cacheRetries.WithLabelValues(entryType.String())
}
10 changes: 5 additions & 5 deletions pkg/grpc/exec/exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ func (msg *MsgExecveEventUnix) Retry(internal *process.ProcessInternal, ev notif
cgroupID := msg.Unix.Kube.Cgrpid
podInfo = process.GetPodInfo(cgroupID, containerId, filename, args, nspid)
if podInfo == nil {
eventcache.EventCacheRetries(eventcache.PodInfo).Inc()
eventcache.CacheRetries(eventcache.PodInfo).Inc()
return eventcache.ErrFailedToGetPodInfo
}
}
Expand Down Expand Up @@ -420,7 +420,7 @@ func (msg *MsgExitEventUnix) RetryInternal(ev notify.Event, timestamp uint64) (*
msg.RefCntDone[ParentRefCnt] = true
}
} else {
eventcache.EventCacheRetries(eventcache.ParentInfo).Inc()
eventcache.CacheRetries(eventcache.ParentInfo).Inc()
err = eventcache.ErrFailedToGetParentInfo
}

Expand All @@ -432,7 +432,7 @@ func (msg *MsgExitEventUnix) RetryInternal(ev notify.Event, timestamp uint64) (*
msg.RefCntDone[ProcessRefCnt] = true
}
} else {
eventcache.EventCacheRetries(eventcache.ProcessInfo).Inc()
eventcache.CacheRetries(eventcache.ProcessInfo).Inc()
err = eventcache.ErrFailedToGetProcessInfo
}

Expand Down Expand Up @@ -486,7 +486,7 @@ func (msg *MsgProcessCleanupEventUnix) RetryInternal(_ notify.Event, timestamp u
msg.RefCntDone[ParentRefCnt] = true
}
} else {
eventcache.EventCacheRetries(eventcache.ParentInfo).Inc()
eventcache.CacheRetries(eventcache.ParentInfo).Inc()
err = eventcache.ErrFailedToGetParentInfo
}

Expand All @@ -496,7 +496,7 @@ func (msg *MsgProcessCleanupEventUnix) RetryInternal(_ notify.Event, timestamp u
msg.RefCntDone[ProcessRefCnt] = true
}
} else {
eventcache.EventCacheRetries(eventcache.ProcessInfo).Inc()
eventcache.CacheRetries(eventcache.ProcessInfo).Inc()
err = eventcache.ErrFailedToGetProcessInfo
}

Expand Down

0 comments on commit 205eee5

Please sign in to comment.