Machine ID: Add Prometheus metrics for loop tasks (#52410) (#52729)

timothyb89 · web-flow · commit eba79259b253 · 2025-03-11T15:02:20.000Z
* Machine ID: Add Prometheus metrics for loop tasks

This adds a number of Prometheus metrics to help track success,
failure, and timing for loop iterations. The loop helper is used
across tbot services, so these metrics universally cover identity
and output renewals, among other tasks.

Also renames `service_heatbeat.go` to `service_heartbeat.go`, fixing the misspelled filename.

* Include the service name as a label; rename metrics to follow Prometheus naming conventions
diff --git a/lib/tbot/loop.go b/lib/tbot/loop.go
@@ -26,14 +26,45 @@ import (
 
 	"github.com/gravitational/trace"
 	"github.com/jonboulle/clockwork"
+	"github.com/prometheus/client_golang/prometheus"
 
 	"github.com/gravitational/teleport/api/utils/retryutils"
 )
 
+var (
+	loopIterationsCounter = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "tbot_task_iterations_total",
+			Help: "Number of task iteration attempts, not counting retries",
+		}, []string{"service", "name"},
+	)
+	loopIterationsSuccessCounter = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name:    "tbot_task_iterations_successful",
+			Help:    "Histogram of task iterations that ultimately succeeded, bucketed by number of retries before success",
+			Buckets: []float64{0, 1, 2, 3, 4, 5},
+		}, []string{"service", "name"},
+	)
+	loopIterationsFailureCounter = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "tbot_task_iterations_failed",
+			Help: "Number of task iterations that ultimately failed, not counting retries",
+		}, []string{"service", "name"},
+	)
+	loopIterationTime = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name:    "tbot_task_iteration_duration_seconds",
+			Help:    "Time between beginning and ultimate end of one task iteration regardless of outcome, including all retries",
+			Buckets: prometheus.ExponentialBuckets(0.1, 1.75, 6),
+		}, []string{"service", "name"},
+	)
+)
+
 type runOnIntervalConfig struct {
-	name  string
-	f     func(ctx context.Context) error
-	clock clockwork.Clock
+	service string
+	name    string
+	f       func(ctx context.Context) error
+	clock   clockwork.Clock
 	// reloadCh allows the task to be triggered immediately, ideal for handling
 	// CA rotations or a manual signal from a user.
 	// reloadCh can be nil, in which case, the task will only run on the
@@ -49,8 +80,6 @@ type runOnIntervalConfig struct {
 // runOnInterval runs a function on a given interval, with retries and jitter.
 //
 // TODO(noah): Emit Prometheus metrics for:
-// - Success/Failure of attempts
-// - Time taken to execute attempt
 // - Time of next attempt
 func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error {
 	switch {
@@ -87,6 +116,9 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error {
 		}
 		firstRun = false
 
+		loopIterationsCounter.WithLabelValues(cfg.service, cfg.name).Inc()
+		startTime := time.Now()
+
 		var err error
 		for attempt := 1; attempt <= cfg.retryLimit; attempt++ {
 			log.InfoContext(
@@ -97,6 +129,7 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error {
 			)
 			err = cfg.f(ctx)
 			if err == nil {
+				loopIterationsSuccessCounter.WithLabelValues(cfg.service, cfg.name).Observe(float64(attempt - 1))
 				break
 			}
 
@@ -114,12 +147,20 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error {
 				)
 				select {
 				case <-ctx.Done():
+					// Note: will discard metric update for this loop. It
+					// probably won't be collected if we're shutting down,
+					// anyway.
 					return nil
 				case <-cfg.clock.After(backoffTime):
 				}
 			}
 		}
+
+		loopIterationTime.WithLabelValues(cfg.service, cfg.name).Observe(time.Since(startTime).Seconds())
+
 		if err != nil {
+			loopIterationsFailureCounter.WithLabelValues(cfg.service, cfg.name).Inc()
+
 			if cfg.exitOnRetryExhausted {
 				log.ErrorContext(
 					ctx,
diff --git a/lib/tbot/service_application_output.go b/lib/tbot/service_application_output.go
@@ -60,6 +60,7 @@ func (s *ApplicationOutputService) Run(ctx context.Context) error {
 	defer unsubscribe()
 
 	err := runOnInterval(ctx, runOnIntervalConfig{
+		service:    s.String(),
 		name:       "output-renewal",
 		f:          s.generate,
 		interval:   s.botCfg.RenewalInterval,
diff --git a/lib/tbot/service_bot_identity.go b/lib/tbot/service_bot_identity.go
@@ -264,7 +264,8 @@ func (s *identityService) Run(ctx context.Context) error {
 	)
 
 	err := runOnInterval(ctx, runOnIntervalConfig{
-		name: "bot-identity-renewal",
+		service: s.String(),
+		name:    "bot-identity-renewal",
 		f: func(ctx context.Context) error {
 			return s.renew(ctx, storageDestination)
 		},
diff --git a/lib/tbot/service_client_credential.go b/lib/tbot/service_client_credential.go
@@ -55,6 +55,7 @@ func (s *ClientCredentialOutputService) Run(ctx context.Context) error {
 	defer unsubscribe()
 
 	err := runOnInterval(ctx, runOnIntervalConfig{
+		service:    s.String(),
 		name:       "output-renewal",
 		f:          s.generate,
 		interval:   s.botCfg.RenewalInterval,
diff --git a/lib/tbot/service_database_output.go b/lib/tbot/service_database_output.go
@@ -60,6 +60,7 @@ func (s *DatabaseOutputService) Run(ctx context.Context) error {
 	defer unsubscribe()
 
 	err := runOnInterval(ctx, runOnIntervalConfig{
+		service:    s.String(),
 		name:       "output-renewal",
 		f:          s.generate,
 		interval:   s.botCfg.RenewalInterval,
diff --git a/lib/tbot/service_heartbeat.go b/lib/tbot/service_heartbeat.go
@@ -95,6 +95,7 @@ func (s *heartbeatService) OneShot(ctx context.Context) error {
 func (s *heartbeatService) Run(ctx context.Context) error {
 	isStartup := true
 	err := runOnInterval(ctx, runOnIntervalConfig{
+		service:    s.String(),
 		name:       "submit-heartbeat",
 		log:        s.log,
 		interval:   s.interval,
diff --git a/lib/tbot/service_identity_output.go b/lib/tbot/service_identity_output.go
@@ -82,6 +82,7 @@ func (s *IdentityOutputService) Run(ctx context.Context) error {
 	defer unsubscribe()
 
 	err := runOnInterval(ctx, runOnIntervalConfig{
+		service:    s.String(),
 		name:       "output-renewal",
 		f:          s.generate,
 		interval:   s.botCfg.RenewalInterval,
diff --git a/lib/tbot/service_kubernetes_output.go b/lib/tbot/service_kubernetes_output.go
@@ -77,6 +77,7 @@ func (s *KubernetesOutputService) Run(ctx context.Context) error {
 	defer unsubscribe()
 
 	err := runOnInterval(ctx, runOnIntervalConfig{
+		service:    s.String(),
 		name:       "output-renewal",
 		f:          s.generate,
 		interval:   s.botCfg.RenewalInterval,
diff --git a/lib/tbot/service_ssh_host_output.go b/lib/tbot/service_ssh_host_output.go
@@ -61,6 +61,7 @@ func (s *SSHHostOutputService) Run(ctx context.Context) error {
 	defer unsubscribe()
 
 	err := runOnInterval(ctx, runOnIntervalConfig{
+		service:    s.String(),
 		name:       "output-renewal",
 		f:          s.generate,
 		interval:   s.botCfg.RenewalInterval,
diff --git a/lib/tbot/service_ssh_multiplexer.go b/lib/tbot/service_ssh_multiplexer.go
@@ -391,7 +391,8 @@ func (s *SSHMultiplexerService) identityRenewalLoop(
 	reloadCh, unsubscribe := s.reloadBroadcaster.subscribe()
 	defer unsubscribe()
 	err := runOnInterval(ctx, runOnIntervalConfig{
-		name: "identity-renewal",
+		service: s.String(),
+		name:    "identity-renewal",
 		f: func(ctx context.Context) error {
 			id, err := s.generateIdentity(ctx)
 			if err != nil {
diff --git a/lib/tbot/tbot.go b/lib/tbot/tbot.go
@@ -133,7 +133,14 @@ func (b *Bot) Run(ctx context.Context) (err error) {
 	defer func() { apitracing.EndSpan(span, err) }()
 	startedAt := time.Now()
 
-	if err := metrics.RegisterPrometheusCollectors(clientMetrics); err != nil {
+	if err := metrics.RegisterPrometheusCollectors(
+		metrics.BuildCollector(),
+		clientMetrics,
+		loopIterationsCounter,
+		loopIterationsSuccessCounter,
+		loopIterationsFailureCounter,
+		loopIterationTime,
+	); err != nil {
 		return trace.Wrap(err)
 	}
 

Original file line number	Diff line number	Diff line change
`@@ -264,7 +264,8 @@ func (s *identityService) Run(ctx context.Context) error {`
`264`	`264`	`)`
`265`	`265`
`266`	`266`	`err := runOnInterval(ctx, runOnIntervalConfig{`
`267`		`- name: "bot-identity-renewal",`
	`267`	`+ service: s.String(),`
	`268`	`+ name: "bot-identity-renewal",`
`268`	`269`	`f: func(ctx context.Context) error {`
`269`	`270`	`return s.renew(ctx, storageDestination)`
`270`	`271`	`},`