From 2b81f07ab5e567a9b5d19b04f3e0ce619305220f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 26 Apr 2024 13:33:02 +0000 Subject: [PATCH 1/8] tetragon: Factor AllPrograms maintaining At the moment we keep unloaded programs in AllPrograms. Making AllPrograms local and locked by mutex. Adding new helpers progsAdd and progsCleanup to add and cleanup programs (on sensor unload) respectively. Signed-off-by: Jiri Olsa --- pkg/sensors/load.go | 10 ++++++---- pkg/sensors/sensors.go | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/pkg/sensors/load.go b/pkg/sensors/load.go index b7ce819304a..87bb1c7cc56 100644 --- a/pkg/sensors/load.go +++ b/pkg/sensors/load.go @@ -72,10 +72,6 @@ func (s *Sensor) Load(bpfDir string) error { return fmt.Errorf("sensor %s has been previously destroyed, please recreate it before loading", s.Name) } - // Add the loaded programs and maps to All* so they can be unloaded on shutdown. - AllPrograms = append(AllPrograms, s.Progs...) - AllMaps = append(AllMaps, s.Maps...) - logger.GetLogger().WithField("metadata", cachedbtf.GetCachedBTFFile()).Info("BTF file: using metadata file") if _, err := observerMinReqs(); err != nil { return fmt.Errorf("tetragon, aborting minimum requirements not met: %w", err) @@ -114,6 +110,11 @@ func (s *Sensor) Load(bpfDir string) error { p.LoadState.RefInc() l.WithField("prog", p.Name).WithField("label", p.Label).Debugf("BPF prog was loaded") } + + // Add the *loaded* programs and maps, so they can be unloaded later + progsAdd(s.Progs) + AllMaps = append(AllMaps, s.Maps...) 
+ l.WithField("sensor", s.Name).Infof("Loaded BPF maps and events for sensor successfully") s.Loaded = true return nil @@ -149,6 +150,7 @@ func (s *Sensor) Unload() error { } } + progsCleanup() return nil } diff --git a/pkg/sensors/sensors.go b/pkg/sensors/sensors.go index e406e924779..4222ae5a691 100644 --- a/pkg/sensors/sensors.go +++ b/pkg/sensors/sensors.go @@ -5,6 +5,7 @@ package sensors import ( "fmt" + "sync" "github.com/cilium/tetragon/pkg/logger" "github.com/cilium/tetragon/pkg/policyfilter" @@ -17,8 +18,10 @@ import ( ) var ( - // AllPrograms are all the loaded programs. For use with Unload(). - AllPrograms = []*program.Program{} + // allPrograms are all the loaded programs. For use with Unload(). + allPrograms = []*program.Program{} + // allPrograms lock + allProgramsMutex sync.Mutex // AllMaps are all the loaded programs. For use with Unload(). AllMaps = []*program.Map{} ) @@ -167,3 +170,29 @@ func GetMergedSensorFromParserPolicy(tp tracingpolicy.TracingPolicy) (SensorIfac return SensorCombine(tp.TpName(), sensors...), nil } + +func progsAdd(progs []*program.Program) { + allProgramsMutex.Lock() + defer allProgramsMutex.Unlock() + + allPrograms = append(allPrograms, progs...) +} + +func progsCleanup() { + allProgramsMutex.Lock() + defer allProgramsMutex.Unlock() + + progs := []*program.Program{} + + for _, p := range allPrograms { + if p.LoadState.IsLoaded() { + progs = append(progs, p) + } + } + + allPrograms = progs +} + +func AllPrograms() []*program.Program { + return append([]*program.Program{}, allPrograms...) +} From e9310af3dd6dc4ba507f8cfe2d3c68b1707a17d2 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 28 Mar 2024 13:09:10 +0000 Subject: [PATCH 2/8] tetragon: Adding kprobe_multi prefix to program Attach string This just matters for log output, changing from: time="2024-03-28T13:08:34Z" level=info msg="Loading registered BPF probe" Attach="3 functions" Program=... 
into: time="2024-03-28T13:08:34Z" level=info msg="Loading registered BPF probe" Attach="kprobe_multi (3 functions)" Program=... It has more info and will help in following changes where the Attach string will serve as the metric label. Signed-off-by: Jiri Olsa --- pkg/sensors/tracing/generickprobe.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/sensors/tracing/generickprobe.go b/pkg/sensors/tracing/generickprobe.go index 6bf0ab96c07..64217de74b7 100644 --- a/pkg/sensors/tracing/generickprobe.go +++ b/pkg/sensors/tracing/generickprobe.go @@ -310,7 +310,7 @@ func createMultiKprobeSensor(sensorPath, policyName string, multiIDs []idtable.E load := program.Builder( path.Join(option.Config.HubbleLib, loadProgName), - fmt.Sprintf("%d functions", len(multiIDs)), + fmt.Sprintf("kprobe_multi (%d functions)", len(multiIDs)), "kprobe.multi/generic_kprobe", pinPath, "generic_kprobe"). From 191c72426d5562c3b25080f4bea1db8faa16046b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 10 Aug 2023 08:44:53 +0000 Subject: [PATCH 3/8] tetragon: Store link in program.Program Storing link in program so we can access link's stats and store them as metrics. 
Signed-off-by: Jiri Olsa --- pkg/sensors/program/loader.go | 2 ++ pkg/sensors/program/program.go | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/pkg/sensors/program/loader.go b/pkg/sensors/program/loader.go index d3d247c8d96..3955549b4e3 100644 --- a/pkg/sensors/program/loader.go +++ b/pkg/sensors/program/loader.go @@ -202,6 +202,7 @@ func kprobeAttach(load *Program, prog *ebpf.Program, spec *ebpf.ProgramSpec, lnk.Close() return nil, err } + load.Link = lnk return &unloader.RelinkUnloader{ UnloadProg: unloader.PinUnloader{Prog: prog}.Unload, IsLinked: true, @@ -474,6 +475,7 @@ func multiKprobeAttach(load *Program, prog *ebpf.Program, lnk.Close() return nil, err } + load.Link = lnk return unloader.ChainUnloader{ unloader.PinUnloader{ Prog: prog, diff --git a/pkg/sensors/program/program.go b/pkg/sensors/program/program.go index 1ced40c6707..b8e4e00a6cb 100644 --- a/pkg/sensors/program/program.go +++ b/pkg/sensors/program/program.go @@ -8,6 +8,7 @@ import ( "github.com/cilium/ebpf" "github.com/cilium/ebpf/btf" + "github.com/cilium/ebpf/link" "github.com/cilium/tetragon/pkg/sensors/unloader" ) @@ -29,6 +30,7 @@ func Builder( MapLoad: nil, unloader: nil, PinMap: make(map[string]*Map), + Link: nil, } } @@ -120,6 +122,8 @@ type Program struct { // Tail call prefix/map TcPrefix string TcMap *Map + + Link link.Link } func (p *Program) SetRetProbe(ret bool) *Program { From dfef6861744c9b14e17a4c399e247e1302b5da21 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 11 May 2024 17:53:40 +0000 Subject: [PATCH 4/8] tetragon: Store ebpf.Program instance in program.Program Storing ebpf.Program instance in program.Program to allow ebpf related stats retrieval in following changes. 
Signed-off-by: Jiri Olsa --- pkg/sensors/program/loader.go | 2 ++ pkg/sensors/program/program.go | 2 ++ 2 files changed, 4 insertions(+) diff --git a/pkg/sensors/program/loader.go b/pkg/sensors/program/loader.go index 3955549b4e3..b904f2b1334 100644 --- a/pkg/sensors/program/loader.go +++ b/pkg/sensors/program/loader.go @@ -911,6 +911,8 @@ func doLoadProgram( return nil, err } + load.Prog = prog + // Copy the loaded collection before it's destroyed if KeepCollection { return copyLoadedCollection(coll) diff --git a/pkg/sensors/program/program.go b/pkg/sensors/program/program.go index b8e4e00a6cb..99d86ed38c5 100644 --- a/pkg/sensors/program/program.go +++ b/pkg/sensors/program/program.go @@ -31,6 +31,7 @@ func Builder( unloader: nil, PinMap: make(map[string]*Map), Link: nil, + Prog: nil, } } @@ -124,6 +125,7 @@ type Program struct { TcMap *Map Link link.Link + Prog *ebpf.Program } func (p *Program) SetRetProbe(ret bool) *Program { From a6b25165dfbef1318b93d3f722943226b92ab672 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 26 Apr 2024 19:55:12 +0000 Subject: [PATCH 5/8] tetragon: Add support to set policy name for program Adding support to set policy name for program so we can use it as a metric label in following changes. We are going to support missed stats retrieval for kprobes in following changes so for now we set policy for kprobe programs which are in the base sensor and the generic kprobe sensor. 
Signed-off-by: Jiri Olsa --- pkg/sensors/base/base.go | 16 +++++++++------- pkg/sensors/program/program.go | 9 +++++++++ pkg/sensors/tracing/generickprobe.go | 12 ++++++++---- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/pkg/sensors/base/base.go b/pkg/sensors/base/base.go index b2bf22c11e4..34afc2d48c9 100644 --- a/pkg/sensors/base/base.go +++ b/pkg/sensors/base/base.go @@ -21,13 +21,15 @@ const ( ) var ( + basePolicy = "__base__" + Execve = program.Builder( config.ExecObj(), "sched/sched_process_exec", "tracepoint/sys_execve", "event_execve", "execve", - ) + ).SetPolicy(basePolicy) ExecveBprmCommit = program.Builder( "bpf_execve_bprm_commit_creds.o", @@ -35,7 +37,7 @@ var ( "kprobe/security_bprm_committing_creds", "tg_kp_bprm_committing_creds", "kprobe", - ) + ).SetPolicy(basePolicy) Exit = program.Builder( "bpf_exit.o", @@ -43,7 +45,7 @@ var ( "kprobe/acct_process", "event_exit", "kprobe", - ) + ).SetPolicy(basePolicy) Fork = program.Builder( "bpf_fork.o", @@ -51,7 +53,7 @@ var ( "kprobe/wake_up_new_task", "kprobe_pid_clear", "kprobe", - ) + ).SetPolicy(basePolicy) CgroupRmdir = program.Builder( "bpf_cgroup.o", @@ -59,7 +61,7 @@ var ( "raw_tracepoint/cgroup_rmdir", "tg_cgroup_rmdir", "raw_tracepoint", - ) + ).SetPolicy(basePolicy) /* Event Ring map */ TCPMonMap = program.MapBuilder("tcpmon_map", Execve) @@ -84,12 +86,12 @@ var ( MatchBinariesSetMap = program.MapBuilder(mbset.MapName, Execve) sensor = sensors.Sensor{ - Name: "__base__", + Name: basePolicy, } sensorInit sync.Once sensorTest = sensors.Sensor{ - Name: "__base__", + Name: basePolicy, } sensorTestInit sync.Once ) diff --git a/pkg/sensors/program/program.go b/pkg/sensors/program/program.go index 99d86ed38c5..e52cc796761 100644 --- a/pkg/sensors/program/program.go +++ b/pkg/sensors/program/program.go @@ -32,6 +32,7 @@ func Builder( PinMap: make(map[string]*Map), Link: nil, Prog: nil, + Policy: "", } } @@ -126,6 +127,9 @@ type Program struct { Link link.Link Prog *ebpf.Program + + // 
policy name the program belongs to + Policy string } func (p *Program) SetRetProbe(ret bool) *Program { @@ -149,6 +153,11 @@ func (p *Program) SetTailCall(prefix string, m *Map) *Program { return p } +func (p *Program) SetPolicy(policy string) *Program { + p.Policy = policy + return p +} + func (p *Program) Unload() error { if p.unloader == nil { return nil diff --git a/pkg/sensors/tracing/generickprobe.go b/pkg/sensors/tracing/generickprobe.go index 64217de74b7..a4c4fbff7b8 100644 --- a/pkg/sensors/tracing/generickprobe.go +++ b/pkg/sensors/tracing/generickprobe.go @@ -314,7 +314,8 @@ func createMultiKprobeSensor(sensorPath, policyName string, multiIDs []idtable.E "kprobe.multi/generic_kprobe", pinPath, "generic_kprobe"). - SetLoaderData(multiIDs) + SetLoaderData(multiIDs). + SetPolicy(policyName) progs = append(progs, load) fdinstall := program.MapBuilderPin("fdinstall_map", sensors.PathJoin(sensorPath, "fdinstall_map"), load) @@ -391,7 +392,8 @@ func createMultiKprobeSensor(sensorPath, policyName string, multiIDs []idtable.E "multi_retkprobe", "generic_kprobe"). SetRetProbe(true). - SetLoaderData(multiRetIDs) + SetLoaderData(multiRetIDs). + SetPolicy(policyName) progs = append(progs, loadret) retProbe := program.MapBuilderPin("retprobe_map", sensors.PathJoin(pinPath, "retprobe_map"), loadret) @@ -900,7 +902,8 @@ func createKprobeSensorFromEntry(kprobeEntry *genericKprobe, sensorPath string, "kprobe/generic_kprobe", pinProg, "generic_kprobe"). - SetLoaderData(kprobeEntry.tableId) + SetLoaderData(kprobeEntry.tableId). + SetPolicy(kprobeEntry.policyName) load.Override = kprobeEntry.hasOverride if load.Override { load.OverrideFmodRet = isSecurityFunc && bpf.HasModifyReturn() @@ -991,7 +994,8 @@ func createKprobeSensorFromEntry(kprobeEntry *genericKprobe, sensorPath string, pinRetProg, "generic_kprobe"). SetRetProbe(true). - SetLoaderData(kprobeEntry.tableId) + SetLoaderData(kprobeEntry.tableId). 
+ SetPolicy(kprobeEntry.policyName) progs = append(progs, loadret) retProbe := program.MapBuilderPin("retprobe_map", sensors.PathJoin(pinPath, "retprobe_map"), loadret) From 26b1be192efef4febc17d9c9a2ffd04907c7aa35 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 22 Apr 2024 15:58:39 +0000 Subject: [PATCH 6/8] tetragon: Add HasMissedStatsKprobe/Multi functions Adding HasMissedStatsPerfEvent and HasMissedStatsKprobeMulti functions to detect support of kprobe and kprobe-multi missed stats. Checking this by searching BTF for needed structs, because AFAICS there's no 'functional' test possible. Signed-off-by: Jiri Olsa --- cmd/tetragon/main.go | 6 +- pkg/bpf/detect.go | 128 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 122 insertions(+), 12 deletions(-) diff --git a/cmd/tetragon/main.go b/cmd/tetragon/main.go index c4912387d26..e2d07019439 100644 --- a/cmd/tetragon/main.go +++ b/cmd/tetragon/main.go @@ -284,9 +284,6 @@ func tetragonExecute() error { return fmt.Errorf("Failed to move old tetragon base directory: %w", err) } - // we need file system mounts setup above before we detect features - log.Info("BPF detected features: ", bpf.LogFeatures()) - if option.Config.PprofAddr != "" { go func() { if err := servePprof(option.Config.PprofAddr); err != nil { @@ -393,6 +390,9 @@ func tetragonExecute() error { return err } + // needs BTF, so calling it after InitCachedBTF + log.Info("BPF detected features: ", bpf.LogFeatures()) + if err := observer.InitDataCache(option.Config.DataCacheSize); err != nil { return err } diff --git a/pkg/bpf/detect.go b/pkg/bpf/detect.go index e0ecccff2e0..71d22e13402 100644 --- a/pkg/bpf/detect.go +++ b/pkg/bpf/detect.go @@ -17,9 +17,11 @@ import ( "github.com/cilium/ebpf" "github.com/cilium/ebpf/asm" + ebtf "github.com/cilium/ebpf/btf" "github.com/cilium/ebpf/features" "github.com/cilium/ebpf/link" "github.com/cilium/tetragon/pkg/arch" + "github.com/cilium/tetragon/pkg/btf" "github.com/cilium/tetragon/pkg/logger" "golang.org/x/sys/unix" ) @@ -30,13 +32,15 @@ type 
Feature struct { } var ( - kprobeMulti Feature - uprobeMulti Feature - buildid Feature - modifyReturn Feature - modifyReturnSyscall Feature - linkPin Feature - lsm Feature + kprobeMulti Feature + uprobeMulti Feature + buildid Feature + modifyReturn Feature + modifyReturnSyscall Feature + linkPin Feature + lsm Feature + missedStatsKprobe Feature + missedStatsKprobeMulti Feature ) func HasOverrideHelper() bool { @@ -311,9 +315,115 @@ func HasLinkPin() bool { return linkPin.detected } +func detectMissedStats() (bool, bool) { + spec, err := btf.NewBTF() + if err != nil { + return false, false + } + + // bpf_link_info + var linkInfo *ebtf.Struct + if err := spec.TypeByName("bpf_link_info", &linkInfo); err != nil { + return false, false + } + + if len(linkInfo.Members) < 4 { + return false, false + } + + // bpf_link_info::union + m := linkInfo.Members[3] + union, ok := m.Type.(*ebtf.Union) + if !ok { + return false, false + } + + kprobe := false + kprobeMulti := false + + hasField := func(st *ebtf.Struct, name string) bool { + for _, m := range st.Members { + if m.Name == name { + return true + } + } + return false + } + + detectKprobeMulti := func(m ebtf.Member) bool { + // bpf_link_info::kprobe_multi + st, ok := m.Type.(*ebtf.Struct) + if !ok { + return false + } + // bpf_link_info::kprobe_multi::missed + return hasField(st, "missed") + } + + detectKprobe := func(m ebtf.Member) bool { + // bpf_link_info::perf_event + st, ok := m.Type.(*ebtf.Struct) + if !ok { + return false + } + + if len(st.Members) < 2 { + return false + } + + // bpf_link_info::perf_event::union + tm := st.Members[1] + un, ok := tm.Type.(*ebtf.Union) + if !ok { + return false + } + + for _, mu := range un.Members { + // bpf_link_info::perf_event::kprobe + if mu.Name == "kprobe" { + st2, ok := mu.Type.(*ebtf.Struct) + if !ok { + return false + } + // bpf_link_info::perf_event::kprobe::missed + return hasField(st2, "missed") + } + } + return false + } + + for _, m := range union.Members { + if m.Name 
== "kprobe_multi" { + kprobeMulti = detectKprobeMulti(m) + } else if m.Name == "perf_event" { + kprobe = detectKprobe(m) + } + } + + return kprobe, kprobeMulti +} + +func detectMissedStatsOnce() { + missedStatsKprobe.init.Do(func() { + kprobe, kprobeMulti := detectMissedStats() + missedStatsKprobe.detected = kprobe + missedStatsKprobeMulti.detected = kprobeMulti + }) +} + +func HasMissedStatsPerfEvent() bool { + detectMissedStatsOnce() + return missedStatsKprobe.detected +} + +func HasMissedStatsKprobeMulti() bool { + detectMissedStatsOnce() + return missedStatsKprobeMulti.detected +} + func LogFeatures() string { - return fmt.Sprintf("override_return: %t, buildid: %t, kprobe_multi: %t, uprobe_multi %t, fmodret: %t, fmodret_syscall: %t, signal: %t, large: %t, link_pin: %t, lsm: %t", + return fmt.Sprintf("override_return: %t, buildid: %t, kprobe_multi: %t, uprobe_multi %t, fmodret: %t, fmodret_syscall: %t, signal: %t, large: %t, link_pin: %t, lsm: %t, missed_stats_kprobe_multi: %t, missed_stats_kprobe: %t", HasOverrideHelper(), HasBuildId(), HasKprobeMulti(), HasUprobeMulti(), HasModifyReturn(), HasModifyReturnSyscall(), HasSignalHelper(), HasProgramLargeSize(), - HasLinkPin(), HasLSMPrograms()) + HasLinkPin(), HasLSMPrograms(), HasMissedStatsKprobeMulti(), HasMissedStatsPerfEvent()) } From f4650adddecddeb3d7122e563c6219da674cde9b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jan 2024 21:47:23 +0000 Subject: [PATCH 7/8] tetragon: Add missed stats to kprobemetrics package Adding metrics for missed runs on program and link level to kprobemetrics package and logic to store and collect missed stats. The missed stats are supported for all programs and kprobe/kprobe-multi links. They are stored per 'attach name' and 'policy name'. 
For programs (not just kprobes): tetragon_missed_prog_probes_total{attach="__x64_sys_linkat",policy="sys-linkat-passwd"} 68 tetragon_missed_prog_probes_total{attach="acct_process",policy="__base__"} 60 tetragon_missed_prog_probes_total{attach="sched/sched_process_exec",policy="__base__"} 64 tetragon_missed_prog_probes_total{attach="security_bprm_committing_creds",policy="__base__"} 66 tetragon_missed_prog_probes_total{attach="wake_up_new_task",policy="__base__"} 62 For kprobe and kprobe-multi links: tetragon_missed_link_probes_total{attach="__x64_sys_linkat",policy="sys-linkat-passwd"} 45 tetragon_missed_link_probes_total{attach="acct_process",policy="__base__"} 39 tetragon_missed_link_probes_total{attach="security_bprm_committing_creds",policy="__base__"} 43 tetragon_missed_link_probes_total{attach="wake_up_new_task",policy="__base__"} 41 tetragon_missed_prog_probes_total{attach="acct_process",policy="__base__"} 40 tetragon_missed_prog_probes_total{attach="kprobe_multi (1 functions)",policy="sys-linkat-passwd"} 48 tetragon_missed_prog_probes_total{attach="sched/sched_process_exec",policy="__base__"} 44 tetragon_missed_prog_probes_total{attach="security_bprm_committing_creds",policy="__base__"} 46 tetragon_missed_prog_probes_total{attach="wake_up_new_task",policy="__base__"} 42 Note changing the healthMetrics group to be created as not constrained, so it can carry new metrics. It will be addressed in future by adding debug metrics group. 
Signed-off-by: Jiri Olsa --- docs/content/en/docs/reference/metrics.md | 18 +++++ pkg/metrics/kprobemetrics/collector.go | 83 +++++++++++++++++++++++ pkg/metrics/kprobemetrics/missed.go | 29 ++++++++ pkg/metricsconfig/healthmetrics.go | 4 +- 4 files changed, 133 insertions(+), 1 deletion(-) create mode 100644 pkg/metrics/kprobemetrics/collector.go create mode 100644 pkg/metrics/kprobemetrics/missed.go diff --git a/docs/content/en/docs/reference/metrics.md b/docs/content/en/docs/reference/metrics.md index ed089fa40c6..f9d49d54a09 100644 --- a/docs/content/en/docs/reference/metrics.md +++ b/docs/content/en/docs/reference/metrics.md @@ -181,6 +181,24 @@ The total number of Tetragon events per type that are failed to sent from the ke | ----- | ------ | | `msg_op` | `11, 13, 14, 15, 23, 24, 25, 26, 5, 7` | +### `tetragon_missed_link_probes_total` + +The total number of Tetragon probe missed by link. + +| label | values | +| ----- | ------ | +| `attach` | `sys_panic` | +| `policy` | `monitor_panic` | + +### `tetragon_missed_prog_probes_total` + +The total number of Tetragon probe missed by program. + +| label | values | +| ----- | ------ | +| `attach` | `sys_panic` | +| `policy` | `monitor_panic` | + ### `tetragon_msg_op_total` The total number of times we encounter a given message opcode. For internal use only. 
diff --git a/pkg/metrics/kprobemetrics/collector.go b/pkg/metrics/kprobemetrics/collector.go new file mode 100644 index 00000000000..487e844b88c --- /dev/null +++ b/pkg/metrics/kprobemetrics/collector.go @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright Authors of Tetragon + +package kprobemetrics + +import ( + "github.com/cilium/ebpf/link" + "github.com/cilium/tetragon/pkg/bpf" + "github.com/cilium/tetragon/pkg/metrics" + "github.com/cilium/tetragon/pkg/sensors" + "github.com/cilium/tetragon/pkg/sensors/program" + "github.com/prometheus/client_golang/prometheus" + "golang.org/x/sys/unix" +) + +func NewBPFCollector() prometheus.Collector { + return metrics.NewCustomCollector( + metrics.CustomMetrics{ + MissedLink, + MissedProg, + }, + collect, + collectForDocs, + ) +} + +func collectLink(ch chan<- prometheus.Metric, load *program.Program) { + if load.Link == nil { + return + } + + info, err := load.Link.Info() + if err != nil { + return + } + + missed := uint64(0) + + switch info.Type { + case link.PerfEventType: + if !bpf.HasMissedStatsPerfEvent() { + return + } + pevent := info.PerfEvent() + switch pevent.Type { + case unix.BPF_PERF_EVENT_KPROBE, unix.BPF_PERF_EVENT_KRETPROBE: + kprobe := pevent.Kprobe() + missed, _ = kprobe.Missed() + } + case link.KprobeMultiType: + if !bpf.HasMissedStatsKprobeMulti() { + return + } + kmulti := info.KprobeMulti() + missed, _ = kmulti.Missed() + default: + } + + ch <- MissedLink.MustMetric(float64(missed), load.Policy, load.Attach) +} + +func collectProg(ch chan<- prometheus.Metric, load *program.Program) { + info, err := load.Prog.Info() + if err != nil { + return + } + + missed, _ := info.RecursionMisses() + ch <- MissedProg.MustMetric(float64(missed), load.Policy, load.Attach) +} + +func collect(ch chan<- prometheus.Metric) { + allPrograms := sensors.AllPrograms() + for _, prog := range allPrograms { + collectLink(ch, prog) + collectProg(ch, prog) + } +} + +func collectForDocs(ch chan<- prometheus.Metric) 
{ + ch <- MissedLink.MustMetric(0, "monitor_panic", "sys_panic") + ch <- MissedProg.MustMetric(0, "monitor_panic", "sys_panic") +} diff --git a/pkg/metrics/kprobemetrics/missed.go b/pkg/metrics/kprobemetrics/missed.go new file mode 100644 index 00000000000..a10d92e0d8a --- /dev/null +++ b/pkg/metrics/kprobemetrics/missed.go @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright Authors of Tetragon + +package kprobemetrics + +import ( + "github.com/cilium/tetragon/pkg/metrics" + "github.com/cilium/tetragon/pkg/metrics/consts" +) + +var ( + MissedLink = metrics.MustNewCustomCounter(metrics.NewOpts( + consts.MetricsNamespace, "", "missed_link_probes_total", + "The total number of Tetragon probe missed by link.", + nil, nil, []metrics.UnconstrainedLabel{ + metrics.UnconstrainedLabel{Name: "policy", ExampleValue: "monitor_panic"}, + metrics.UnconstrainedLabel{Name: "attach", ExampleValue: "sys_panic"}, + }, + )) + + MissedProg = metrics.MustNewCustomCounter(metrics.NewOpts( + consts.MetricsNamespace, "", "missed_prog_probes_total", + "The total number of Tetragon probe missed by program.", + nil, nil, []metrics.UnconstrainedLabel{ + metrics.UnconstrainedLabel{Name: "policy", ExampleValue: "monitor_panic"}, + metrics.UnconstrainedLabel{Name: "attach", ExampleValue: "sys_panic"}, + }, + )) +) diff --git a/pkg/metricsconfig/healthmetrics.go b/pkg/metricsconfig/healthmetrics.go index 9de0b4719b1..70182dc5fc4 100644 --- a/pkg/metricsconfig/healthmetrics.go +++ b/pkg/metricsconfig/healthmetrics.go @@ -36,7 +36,7 @@ var ( func GetHealthGroup() metrics.Group { healthMetricsOnce.Do(func() { - healthMetrics = metrics.NewMetricsGroup(true) + healthMetrics = metrics.NewMetricsGroup(false) }) return healthMetrics } @@ -102,4 +102,6 @@ func registerHealthMetrics(group metrics.Group) { group.MustRegister(policystatemetrics.NewPolicyStateCollector()) // gRPC metrics group.MustRegister(grpcmetrics.NewServerMetrics()) + // missed metris + 
group.MustRegister(kprobemetrics.NewBPFCollector()) } From 02900fe11ddf5e3899d45287ff3d40b1bcd2d615 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 19 Jun 2024 09:04:10 +0000 Subject: [PATCH 8/8] tetragon: Add test for tetragon_missed_prog_probes_total metric Adding test for tetragon_missed_prog_probes_total metric. At the moment I'm able to trigger a missed event only for kprobe_multi and for prog recursion miss. We will need to do special setup to hit other missed counts, adding that to my todo list. Signed-off-by: Jiri Olsa --- pkg/sensors/tracing/kprobe_test.go | 69 ++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/pkg/sensors/tracing/kprobe_test.go b/pkg/sensors/tracing/kprobe_test.go index 5ecc0d2902b..f68403a1bc8 100644 --- a/pkg/sensors/tracing/kprobe_test.go +++ b/pkg/sensors/tracing/kprobe_test.go @@ -37,6 +37,8 @@ import ( bc "github.com/cilium/tetragon/pkg/matchers/bytesmatcher" lc "github.com/cilium/tetragon/pkg/matchers/listmatcher" sm "github.com/cilium/tetragon/pkg/matchers/stringmatcher" + "github.com/cilium/tetragon/pkg/metrics/consts" + "github.com/cilium/tetragon/pkg/metricsconfig" "github.com/cilium/tetragon/pkg/observer" "github.com/cilium/tetragon/pkg/observer/observertesthelper" "github.com/cilium/tetragon/pkg/option" @@ -47,6 +49,8 @@ import ( "github.com/cilium/tetragon/pkg/testutils/perfring" tus "github.com/cilium/tetragon/pkg/testutils/sensors" "github.com/cilium/tetragon/pkg/tracingpolicy" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" "github.com/sirupsen/logrus" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -6808,3 +6812,68 @@ spec: err = jsonchecker.JsonTestCheck(t, checker) assert.NoError(t, err) } + +func TestMissedProgStatsKprobeMulti(t *testing.T) { + var doneWG, readyWG sync.WaitGroup + defer doneWG.Wait() + + ctx, cancel := context.WithTimeout(context.Background(), tus.Conf().CmdWaitTime) + defer cancel() + + // we need kernel support 
to count the prog's missed count added in: + // f915fcb38553 ("bpf: Count stats for kprobe_multi programs") + // which was added in v6.7, adding also the kprobe-multi check + // just to be sure we have that + if !kernels.MinKernelVersion("6.7") || !bpf.HasKprobeMulti() { + t.Skip("Test requires kprobe multi and kernel version 6.7") + } + + testNop := testutils.RepoRootPath("contrib/tester-progs/nop") + + tracingPolicy := ` +apiVersion: cilium.io/v1alpha1 +kind: TracingPolicy +metadata: + name: "syswritefollowfdpsswd" +spec: + kprobes: + - call: "sys_read" + syscall: true + selectors: + - matchBinaries: + - operator: "In" + values: + - "` + testNop + `" + matchActions: + - action: Signal + argSig: 10 + - call: "group_send_sig_info" + syscall: false +` + + createCrdFile(t, tracingPolicy) + + obs, err := observertesthelper.GetDefaultObserverWithFile(t, ctx, testConfigFile, tus.Conf().TetragonLib) + if err != nil { + t.Fatalf("GetDefaultObserverWithFile error: %s", err) + } + observertesthelper.LoopEvents(ctx, t, &doneWG, &readyWG, obs) + readyWG.Wait() + + if err := exec.Command(testNop).Run(); err != nil { + fmt.Printf("Failed to execute test binary: %s\n", err) + } + + expected := strings.NewReader(` # HELP tetragon_missed_prog_probes_total The total number of Tetragon probe missed by program. +# TYPE tetragon_missed_prog_probes_total counter +tetragon_missed_prog_probes_total{attach="acct_process",policy="__base__"} 0 +tetragon_missed_prog_probes_total{attach="kprobe_multi (2 functions)",policy="syswritefollowfdpsswd"} 1 +tetragon_missed_prog_probes_total{attach="sched/sched_process_exec",policy="__base__"} 0 +tetragon_missed_prog_probes_total{attach="security_bprm_committing_creds",policy="__base__"} 0 +tetragon_missed_prog_probes_total{attach="wake_up_new_task",policy="__base__"} 0 +`) + + assert.NoError(t, testutil.GatherAndCompare(metricsconfig.GetRegistry(), expected, + prometheus.BuildFQName(consts.MetricsNamespace, "", "missed_prog_probes_total"))) + +}