From 4166a3fa3fd05e8ad4cfae509a985a3f4db2bbd9 Mon Sep 17 00:00:00 2001 From: Anna Kapuscinska Date: Sun, 1 Sep 2024 12:18:38 +0200 Subject: [PATCH] metrics: Expose more errors in tetragon_bpf_missed_events_total counter When testing, I saw a bunch of "unknown" errors. To further investigate, let's split out ENOENT, E2BIG and EINVAL into separate label values. This should cover almost all errors returned by perf_event_output; I left out only EOPNOTSUPP. Signed-off-by: Anna Kapuscinska --- bpf/lib/process.h | 17 +++++++++++++---- docs/content/en/docs/reference/metrics.md | 2 +- pkg/api/processapi/processapi.go | 3 +++ pkg/metrics/eventmetrics/eventmetrics.go | 3 +++ 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/bpf/lib/process.h b/bpf/lib/process.h index 25c5e0c6db6..dfb39410b74 100644 --- a/bpf/lib/process.h +++ b/bpf/lib/process.h @@ -565,9 +565,12 @@ _Static_assert(sizeof(struct execve_map_value) % 8 == 0, "struct execve_map_value should have size multiple of 8 bytes"); #define SENT_FAILED_UNKNOWN 0 // unknown error -#define SENT_FAILED_EBUSY 1 // EBUSY -#define SENT_FAILED_ENOSPC 2 // ENOSPC -#define SENT_FAILED_MAX 3 +#define SENT_FAILED_ENOENT 1 // ENOENT +#define SENT_FAILED_E2BIG 2 // E2BIG +#define SENT_FAILED_EBUSY 3 // EBUSY +#define SENT_FAILED_EINVAL 4 // EINVAL +#define SENT_FAILED_ENOSPC 5 // ENOSPC +#define SENT_FAILED_MAX 6 struct kernel_stats { __u64 sent_failed[256][SENT_FAILED_MAX]; @@ -591,8 +594,14 @@ perf_event_output_metric(void *ctx, u8 msg_op, void *map, u64 flags, void *data, if (err < 0) { valp = map_lookup_elem(&tg_stats_map, &zero); if (valp) { - if (err == -16) // EBUSY + if (err == -2) // ENOENT + __sync_fetch_and_add(&valp->sent_failed[msg_op][SENT_FAILED_ENOENT], 1); + else if (err == -7) // E2BIG + __sync_fetch_and_add(&valp->sent_failed[msg_op][SENT_FAILED_E2BIG], 1); + else if (err == -16) // EBUSY __sync_fetch_and_add(&valp->sent_failed[msg_op][SENT_FAILED_EBUSY], 1); + else if (err == -22) // EINVAL + 
__sync_fetch_and_add(&valp->sent_failed[msg_op][SENT_FAILED_EINVAL], 1); else if (err == -28) // ENOSPC __sync_fetch_and_add(&valp->sent_failed[msg_op][SENT_FAILED_ENOSPC], 1); else diff --git a/docs/content/en/docs/reference/metrics.md b/docs/content/en/docs/reference/metrics.md index 1f2825cc717..e747f92f84f 100644 --- a/docs/content/en/docs/reference/metrics.md +++ b/docs/content/en/docs/reference/metrics.md @@ -15,7 +15,7 @@ Number of Tetragon perf events that are failed to be sent from the kernel. | label | values | | ----- | ------ | -| `error` | `EBUSY, ENOSPC, unknown` | +| `error` | `E2BIG, EBUSY, EINVAL, ENOENT, ENOSPC, unknown` | | `msg_op` | `13, 14, 15, 16, 23, 24, 25, 26, 27, 5, 7` | ### `tetragon_build_info` diff --git a/pkg/api/processapi/processapi.go b/pkg/api/processapi/processapi.go index 7da1e640e18..426492df394 100644 --- a/pkg/api/processapi/processapi.go +++ b/pkg/api/processapi/processapi.go @@ -51,7 +51,10 @@ const ( const ( SentFailedUnknown = iota + SentFailedEnoent + SentFailedE2big SentFailedEbusy + SentFailedEinval SentFailedEnospc SentFailedMax ) diff --git a/pkg/metrics/eventmetrics/eventmetrics.go b/pkg/metrics/eventmetrics/eventmetrics.go index 4bee11f2bda..d599112302c 100644 --- a/pkg/metrics/eventmetrics/eventmetrics.go +++ b/pkg/metrics/eventmetrics/eventmetrics.go @@ -25,7 +25,10 @@ import ( var ( perfEventErrors = map[int]string{ processapi.SentFailedUnknown: "unknown", + processapi.SentFailedEnoent: "ENOENT", + processapi.SentFailedE2big: "E2BIG", processapi.SentFailedEbusy: "EBUSY", + processapi.SentFailedEinval: "EINVAL", processapi.SentFailedEnospc: "ENOSPC", } perfEventErrorLabel = metrics.ConstrainedLabel{