From 91d06529365a9da85106a5c2b04a2d65e6a732bc Mon Sep 17 00:00:00 2001 From: Anastasios Papagiannis Date: Mon, 9 Sep 2024 15:10:55 +0000 Subject: [PATCH] [bugfix] Fix clone event caching due to missing pod info [upstream commit 20bba351bd5dd0476315bfb8334189cc351ba1b2] The eventcache API provides 2 handlers. These are: RetryInternal -> called to set up process information Retry -> called to set up pod information In the case of clone events, we used to have an empty implementation of the Retry handler. This results in an issue with missing pod information, which is described in detail here: https://github.com/cilium/tetragon/issues/2902 This patch provides the proper Retry implementation to also handle those cases. FIXES: https://github.com/cilium/tetragon/issues/2902 Signed-off-by: Anastasios Papagiannis --- pkg/grpc/exec/exec.go | 23 +++++++++++++++++++---- pkg/process/process.go | 8 ++++---- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/pkg/grpc/exec/exec.go b/pkg/grpc/exec/exec.go index 368c43fbdae..03e240ba108 100644 --- a/pkg/grpc/exec/exec.go +++ b/pkg/grpc/exec/exec.go @@ -302,19 +302,34 @@ func (msg *MsgCloneEventUnix) Notify() bool { } func (msg *MsgCloneEventUnix) RetryInternal(_ notify.Event, _ uint64) (*process.ProcessInternal, error) { - return nil, process.AddCloneEvent(&msg.MsgCloneEvent) + return process.AddCloneEvent(&msg.MsgCloneEvent) } -func (msg *MsgCloneEventUnix) Retry(_ *process.ProcessInternal, _ notify.Event) error { +func (msg *MsgCloneEventUnix) Retry(internal *process.ProcessInternal, _ notify.Event) error { + proc := internal.UnsafeGetProcess() + if option.Config.EnableK8s && proc.Docker != "" && proc.Pod == nil { + podInfo := process.GetPodInfo(proc.Docker, proc.Binary, proc.Arguments, msg.NSPID) + if podInfo == nil { + errormetrics.ErrorTotalInc(errormetrics.EventCachePodInfoRetryFailed) + return eventcache.ErrFailedToGetPodInfo + } + internal.AddPodInfo(podInfo) + } return nil } func (msg *MsgCloneEventUnix) 
HandleMessage() *tetragon.GetEventsResponse { switch msg.Common.Op { case ops.MSG_OP_CLONE: - if err := process.AddCloneEvent(&msg.MsgCloneEvent); err != nil { - ec := eventcache.Get() + ec := eventcache.Get() + if internal, err := process.AddCloneEvent(&msg.MsgCloneEvent); err == nil { + if ec != nil && ec.Needed(internal.UnsafeGetProcess()) { + // adding to the cache due to missing pod info + ec.Add(internal, nil, msg.MsgCloneEvent.Common.Ktime, msg.MsgCloneEvent.Ktime, msg) + } + } else { if ec != nil { + // adding to the cache due to missing parent ec.Add(nil, nil, msg.MsgCloneEvent.Common.Ktime, msg.MsgCloneEvent.Ktime, msg) } } diff --git a/pkg/process/process.go b/pkg/process/process.go index 053fc56cbb6..b4e67785314 100644 --- a/pkg/process/process.go +++ b/pkg/process/process.go @@ -440,7 +440,7 @@ func AddExecEvent(event *tetragonAPI.MsgExecveEventUnix) *ProcessInternal { } // AddCloneEvent adds a new process into the cache from a CloneEvent -func AddCloneEvent(event *tetragonAPI.MsgCloneEvent) error { +func AddCloneEvent(event *tetragonAPI.MsgCloneEvent) (*ProcessInternal, error) { parentExecId := GetProcessID(event.Parent.Pid, event.Parent.Ktime) parent, err := Get(parentExecId) if err != nil { @@ -449,17 +449,17 @@ func AddCloneEvent(event *tetragonAPI.MsgCloneEvent) error { "event.parent.pid": event.Parent.Pid, "event.parent.exec_id": parentExecId, }).WithError(err).Debug("CloneEvent: parent process not found in cache") - return err + return nil, err } proc, err := initProcessInternalClone(event, parent, parentExecId) if err != nil { - return err + return nil, err } parent.RefInc() procCache.add(proc) - return nil + return proc, nil } func Get(execId string) (*ProcessInternal, error) {