From e26b38518093d2b70e28c57951c7eac873437645 Mon Sep 17 00:00:00 2001 From: Kornilios Kourtis Date: Tue, 2 Apr 2024 17:12:11 +0200 Subject: [PATCH 1/3] policystatemetrics: use observer.GetSensorManager() policystatemetrics needs a reference to the sensor manager so that it can collect metrics. Currently, this reference is passed using observer.GetSensorManager() at initialization time. In observer tests, we currently do not restart the metrics (see [1]) which means that if we create a new observer, then the metrics will still reference the old sensor manager. Fix this by having policystatemetrics to call observer.GetSensorManager() to get the latest version of the sensor manager. [1] https://github.com/cilium/tetragon/blob/22eb995b19207ac0ced2dd83950ec8e8aedd122d/pkg/observer/observertesthelper/observer_test_helper.go#L272-L276 Signed-off-by: Kornilios Kourtis --- .../policystatemetrics/policystatemetrics.go | 17 ++++++++--------- .../policystatemetrics_test.go | 10 +++++++++- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/pkg/metrics/policystatemetrics/policystatemetrics.go b/pkg/metrics/policystatemetrics/policystatemetrics.go index 9feca0d2ae9..e59b7721869 100644 --- a/pkg/metrics/policystatemetrics/policystatemetrics.go +++ b/pkg/metrics/policystatemetrics/policystatemetrics.go @@ -11,17 +11,15 @@ import ( "github.com/cilium/tetragon/pkg/logger" "github.com/cilium/tetragon/pkg/metrics/consts" "github.com/cilium/tetragon/pkg/observer" - "github.com/cilium/tetragon/pkg/sensors" "github.com/prometheus/client_golang/prometheus" ) type policyStateCollector struct { - descriptor *prometheus.Desc - sensorManager *sensors.Manager + descriptor *prometheus.Desc } func InitMetrics(registry *prometheus.Registry) { - registry.MustRegister(newPolicyStateCollector(observer.GetSensorManager())) + registry.MustRegister(newPolicyStateCollector()) } func InitMetricsForDocs(registry *prometheus.Registry) { @@ -30,14 +28,13 @@ func InitMetricsForDocs(registry *prometheus.Registry) { // This metric collector converts the output of ListTracingPolicies into a few // gauges metrics on collection. Thus, it needs a sensor manager to query. -func newPolicyStateCollector(sensorManager *sensors.Manager) *policyStateCollector { +func newPolicyStateCollector() *policyStateCollector { return &policyStateCollector{ descriptor: prometheus.NewDesc( prometheus.BuildFQName(consts.MetricsNamespace, "", "tracingpolicy_loaded"), "The number of loaded tracing policy by state.", []string{"state"}, nil, ), - sensorManager: sensorManager, } } @@ -46,11 +43,13 @@ func (c *policyStateCollector) Describe(ch chan<- *prometheus.Desc) { } func (c *policyStateCollector) Collect(ch chan<- prometheus.Metric) { - if c.sensorManager == nil { + + sm := observer.GetSensorManager() + if sm == nil { logger.GetLogger().Debug("failed retrieving the sensor manager: manager is nil") return } - list, err := c.sensorManager.ListTracingPolicies(context.Background()) + list, err := sm.ListTracingPolicies(context.Background()) if err != nil { logger.GetLogger().WithError(err).Warn("error listing tracing policies to collect policies state") return @@ -97,7 +96,7 @@ type policyStateZeroCollector struct { func newPolicyStateZeroCollector() prometheus.Collector { return &policyStateZeroCollector{ - policyStateCollector: *newPolicyStateCollector(nil), + policyStateCollector: *newPolicyStateCollector(), } } diff --git a/pkg/metrics/policystatemetrics/policystatemetrics_test.go b/pkg/metrics/policystatemetrics/policystatemetrics_test.go index 0673297f029..227306b654f 100644 --- a/pkg/metrics/policystatemetrics/policystatemetrics_test.go +++ b/pkg/metrics/policystatemetrics/policystatemetrics_test.go @@ -10,6 +10,7 @@ import ( "strings" "testing" + "github.com/cilium/tetragon/pkg/observer" tus "github.com/cilium/tetragon/pkg/testutils/sensors" "github.com/cilium/tetragon/pkg/tracingpolicy" "github.com/prometheus/client_golang/prometheus" @@ -30,9 +31,16 @@ tetragon_tracingpolicy_loaded{state="load_error"} %d } reg := prometheus.NewRegistry() + + // NB(kkourt): the policy state collector uses observer.GetSensorManager() to get the sensor + // manager because in the observer tests we only initialize metrics while the observer + // changes for every test (see: + // https://github.com/cilium/tetragon/blob/22eb995b19207ac0ced2dd83950ec8e8aedd122d/pkg/observer/observertesthelper/observer_test_helper.go#L272-L276) manager := tus.GetTestSensorManager(context.TODO(), t).Manager + observer.SetSensorManager(manager) + t.Cleanup(observer.ResetSensorManager) - collector := newPolicyStateCollector(manager) + collector := newPolicyStateCollector() reg.Register(collector) err := manager.AddTracingPolicy(context.TODO(), &tracingpolicy.GenericTracingPolicy{ From 33b40145a51dca75c7f1796b065a32ff8b4fc939 Mon Sep 17 00:00:00 2001 From: Kornilios Kourtis Date: Tue, 2 Apr 2024 18:05:19 +0200 Subject: [PATCH 2/3] sensors: respect ctx in ListTracingPolicies We should also do the same in the other operations, but we leave that as a followup. Signed-off-by: Kornilios Kourtis --- pkg/sensors/manager.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pkg/sensors/manager.go b/pkg/sensors/manager.go index 84d4ce49dd0..eaf90834081 100644 --- a/pkg/sensors/manager.go +++ b/pkg/sensors/manager.go @@ -249,9 +249,15 @@ func (h *Manager) ListTracingPolicies(ctx context.Context) (*tetragon.ListTracin retChan: retc, } - h.sensorCtl <- op - err := <-retc - return op.result, err + select { + case h.sensorCtl <- op: + err := <-retc + return op.result, err + + case <-ctx.Done(): + return nil, ctx.Err() + } + } func (h *Manager) RemoveSensor(ctx context.Context, sensorName string) error { From 5ce0f715dd1a1b70a488277a9f333a4556595efc Mon Sep 17 00:00:00 2001 From: Kornilios Kourtis Date: Tue, 2 Apr 2024 18:06:38 +0200 Subject: [PATCH 3/3] policystatemetrics: timeout for ListTracingPolicies This patch adds a timeout for ListTracingPolicies. It can be the case that the sensor manager is stuck or misbehaving. This patch (combined with the previous one) ensures that metrics will continue after a timeout. Tested manually using: ```diff diff --git a/pkg/metrics/policystatemetrics/policystatemetrics_test.go b/pkg/metrics/policystatemetrics/policystatemetrics_test.go index 227306b65..fd581392b 100644 --- a/pkg/metrics/policystatemetrics/policystatemetrics_test.go +++ b/pkg/metrics/policystatemetrics/policystatemetrics_test.go @@ -9,6 +9,7 @@ import ( "io" "strings" "testing" + "time" "github.com/cilium/tetragon/pkg/observer" tus "github.com/cilium/tetragon/pkg/testutils/sensors" @@ -57,3 +58,22 @@ tetragon_tracingpolicy_loaded{state="load_error"} %d err = testutil.CollectAndCompare(collector, expectedMetrics(1, 0, 0, 0)) assert.NoError(t, err) } + +func TestTimeout(t *testing.T) { + reg := prometheus.NewRegistry() + + manager := tus.GetTestSensorManager(context.TODO(), t).Manager + observer.SetSensorManager(manager) + t.Cleanup(observer.ResetSensorManager) + + collector := newPolicyStateCollector() + reg.Register(collector) + + go func() { + err := manager.SleepForTesting(context.TODO(), t, 1*time.Second) + assert.NoError(t, err) + }() + + err := testutil.CollectAndCompare(collector, strings.NewReader("")) + assert.NoError(t, err) +} diff --git a/pkg/sensors/manager.go b/pkg/sensors/manager.go index eaf908340..291a58c8f 100644 --- a/pkg/sensors/manager.go +++ b/pkg/sensors/manager.go @@ -8,6 +8,8 @@ import ( "errors" "fmt" "strings" + "testing" + "time" "github.com/cilium/tetragon/api/v1/tetragon" "github.com/cilium/tetragon/pkg/k8s/apis/cilium.io/v1alpha1" @@ -96,6 +98,13 @@ func startSensorManager( logger.GetLogger().Debugf("stopping sensor controller...") done = true err = nil + + // NB(kkourt): for testing + case *sensorManagerSleep: + time.Sleep(op.d) + err = nil + default: err = fmt.Errorf("unknown sensorOp: %v", op) } @@ -421,6 +430,13 @@ type sensorCtlStop struct { retChan chan error } +// sensorManagerSleep just sleeps. Intended only for testing. +type sensorManagerSleep struct { + ctx context.Context + retChan chan error + d time.Duration +} + type LoadArg struct{} type UnloadArg = LoadArg @@ -436,5 +452,18 @@ func (s *sensorEnable) sensorOpDone(e error) { s.retChan <- e } func (s *sensorDisable) sensorOpDone(e error) { s.retChan <- e } func (s *sensorList) sensorOpDone(e error) { s.retChan <- e } func (s *sensorCtlStop) sensorOpDone(e error) { s.retChan <- e } +func (s *sensorManagerSleep) sensorOpDone(e error) { s.retChan <- e } type sensorCtlHandle = chan<- sensorOp + +func (h *Manager) SleepForTesting(ctx context.Context, t *testing.T, d time.Duration) error { + retc := make(chan error) + op := &sensorManagerSleep{ + ctx: ctx, + retChan: retc, + d: d, + } + + h.sensorCtl <- op + return <-retc +} ``` Signed-off-by: Kornilios Kourtis --- pkg/metrics/policystatemetrics/policystatemetrics.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/metrics/policystatemetrics/policystatemetrics.go b/pkg/metrics/policystatemetrics/policystatemetrics.go index e59b7721869..6bf2ef4107f 100644 --- a/pkg/metrics/policystatemetrics/policystatemetrics.go +++ b/pkg/metrics/policystatemetrics/policystatemetrics.go @@ -6,6 +6,7 @@ package policystatemetrics import ( "context" "strings" + "time" "github.com/cilium/tetragon/api/v1/tetragon" "github.com/cilium/tetragon/pkg/logger" @@ -49,7 +50,10 @@ func (c *policyStateCollector) Collect(ch chan<- prometheus.Metric) { logger.GetLogger().Debug("failed retrieving the sensor manager: manager is nil") return } - list, err := sm.ListTracingPolicies(context.Background()) + + ctx, cancel := context.WithTimeout(context.TODO(), 900*time.Millisecond) + defer cancel() + list, err := sm.ListTracingPolicies(ctx) if err != nil { logger.GetLogger().WithError(err).Warn("error listing tracing policies to collect policies state") return