Skip to content

Commit eba7925

Browse files
authored
Machine ID: Add Prometheus metrics for loop tasks (#52410) (#52729)
* Machine ID: Add Prometheus metrics for loop tasks.
  This adds a number of Prometheus metrics to help track success, failure, and timing for loop iterations. The loop helper is used across tbot services, so these metrics universally cover identity and output renewals, among other tasks. It also renames `service_heatbeat.go`, whose filename was misspelled, to `service_heartbeat.go`.
* Include the service name as a label; rename metrics to follow naming conventions.
1 parent 15e0119 commit eba7925

11 files changed

+65
-8
lines changed

lib/tbot/loop.go

+46-5
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,45 @@ import (
2626

2727
"github.com/gravitational/trace"
2828
"github.com/jonboulle/clockwork"
29+
"github.com/prometheus/client_golang/prometheus"
2930

3031
"github.com/gravitational/teleport/api/utils/retryutils"
3132
)
3233

34+
var (
35+
loopIterationsCounter = prometheus.NewCounterVec(
36+
prometheus.CounterOpts{
37+
Name: "tbot_task_iterations_total",
38+
Help: "Number of task iteration attempts, not counting retries",
39+
}, []string{"service", "name"},
40+
)
41+
loopIterationsSuccessCounter = prometheus.NewHistogramVec(
42+
prometheus.HistogramOpts{
43+
Name: "tbot_task_iterations_successful",
44+
Help: "Histogram of task iterations that ultimately succeeded, bucketed by number of retries before success",
45+
Buckets: []float64{0, 1, 2, 3, 4, 5},
46+
}, []string{"service", "name"},
47+
)
48+
loopIterationsFailureCounter = prometheus.NewCounterVec(
49+
prometheus.CounterOpts{
50+
Name: "tbot_task_iterations_failed",
51+
Help: "Number of task iterations that ultimately failed, not counting retries",
52+
}, []string{"service", "name"},
53+
)
54+
loopIterationTime = prometheus.NewHistogramVec(
55+
prometheus.HistogramOpts{
56+
Name: "tbot_task_iteration_duration_seconds",
57+
Help: "Time between beginning and ultimate end of one task iteration regardless of outcome, including all retries",
58+
Buckets: prometheus.ExponentialBuckets(0.1, 1.75, 6),
59+
}, []string{"service", "name"},
60+
)
61+
)
62+
3363
type runOnIntervalConfig struct {
34-
name string
35-
f func(ctx context.Context) error
36-
clock clockwork.Clock
64+
service string
65+
name string
66+
f func(ctx context.Context) error
67+
clock clockwork.Clock
3768
// reloadCh allows the task to be triggered immediately, ideal for handling
3869
// CA rotations or a manual signal from a user.
3970
// reloadCh can be nil, in which case, the task will only run on the
@@ -49,8 +80,6 @@ type runOnIntervalConfig struct {
4980
// runOnInterval runs a function on a given interval, with retries and jitter.
5081
//
5182
// TODO(noah): Emit Prometheus metrics for:
52-
// - Success/Failure of attempts
53-
// - Time taken to execute attempt
5483
// - Time of next attempt
5584
func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error {
5685
switch {
@@ -87,6 +116,9 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error {
87116
}
88117
firstRun = false
89118

119+
loopIterationsCounter.WithLabelValues(cfg.service, cfg.name).Inc()
120+
startTime := time.Now()
121+
90122
var err error
91123
for attempt := 1; attempt <= cfg.retryLimit; attempt++ {
92124
log.InfoContext(
@@ -97,6 +129,7 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error {
97129
)
98130
err = cfg.f(ctx)
99131
if err == nil {
132+
loopIterationsSuccessCounter.WithLabelValues(cfg.service, cfg.name).Observe(float64(attempt - 1))
100133
break
101134
}
102135

@@ -114,12 +147,20 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error {
114147
)
115148
select {
116149
case <-ctx.Done():
150+
// Note: will discard metric update for this loop. It
151+
// probably won't be collected if we're shutting down,
152+
// anyway.
117153
return nil
118154
case <-cfg.clock.After(backoffTime):
119155
}
120156
}
121157
}
158+
159+
loopIterationTime.WithLabelValues(cfg.service, cfg.name).Observe(time.Since(startTime).Seconds())
160+
122161
if err != nil {
162+
loopIterationsFailureCounter.WithLabelValues(cfg.service, cfg.name).Inc()
163+
123164
if cfg.exitOnRetryExhausted {
124165
log.ErrorContext(
125166
ctx,

lib/tbot/service_application_output.go

+1
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ func (s *ApplicationOutputService) Run(ctx context.Context) error {
6060
defer unsubscribe()
6161

6262
err := runOnInterval(ctx, runOnIntervalConfig{
63+
service: s.String(),
6364
name: "output-renewal",
6465
f: s.generate,
6566
interval: s.botCfg.RenewalInterval,

lib/tbot/service_bot_identity.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,8 @@ func (s *identityService) Run(ctx context.Context) error {
264264
)
265265

266266
err := runOnInterval(ctx, runOnIntervalConfig{
267-
name: "bot-identity-renewal",
267+
service: s.String(),
268+
name: "bot-identity-renewal",
268269
f: func(ctx context.Context) error {
269270
return s.renew(ctx, storageDestination)
270271
},

lib/tbot/service_client_credential.go

+1
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ func (s *ClientCredentialOutputService) Run(ctx context.Context) error {
5555
defer unsubscribe()
5656

5757
err := runOnInterval(ctx, runOnIntervalConfig{
58+
service: s.String(),
5859
name: "output-renewal",
5960
f: s.generate,
6061
interval: s.botCfg.RenewalInterval,

lib/tbot/service_database_output.go

+1
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ func (s *DatabaseOutputService) Run(ctx context.Context) error {
6060
defer unsubscribe()
6161

6262
err := runOnInterval(ctx, runOnIntervalConfig{
63+
service: s.String(),
6364
name: "output-renewal",
6465
f: s.generate,
6566
interval: s.botCfg.RenewalInterval,

lib/tbot/service_heatbeat.go lib/tbot/service_heartbeat.go

+1
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ func (s *heartbeatService) OneShot(ctx context.Context) error {
9595
func (s *heartbeatService) Run(ctx context.Context) error {
9696
isStartup := true
9797
err := runOnInterval(ctx, runOnIntervalConfig{
98+
service: s.String(),
9899
name: "submit-heartbeat",
99100
log: s.log,
100101
interval: s.interval,

lib/tbot/service_identity_output.go

+1
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ func (s *IdentityOutputService) Run(ctx context.Context) error {
8282
defer unsubscribe()
8383

8484
err := runOnInterval(ctx, runOnIntervalConfig{
85+
service: s.String(),
8586
name: "output-renewal",
8687
f: s.generate,
8788
interval: s.botCfg.RenewalInterval,

lib/tbot/service_kubernetes_output.go

+1
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ func (s *KubernetesOutputService) Run(ctx context.Context) error {
7777
defer unsubscribe()
7878

7979
err := runOnInterval(ctx, runOnIntervalConfig{
80+
service: s.String(),
8081
name: "output-renewal",
8182
f: s.generate,
8283
interval: s.botCfg.RenewalInterval,

lib/tbot/service_ssh_host_output.go

+1
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ func (s *SSHHostOutputService) Run(ctx context.Context) error {
6161
defer unsubscribe()
6262

6363
err := runOnInterval(ctx, runOnIntervalConfig{
64+
service: s.String(),
6465
name: "output-renewal",
6566
f: s.generate,
6667
interval: s.botCfg.RenewalInterval,

lib/tbot/service_ssh_multiplexer.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -391,7 +391,8 @@ func (s *SSHMultiplexerService) identityRenewalLoop(
391391
reloadCh, unsubscribe := s.reloadBroadcaster.subscribe()
392392
defer unsubscribe()
393393
err := runOnInterval(ctx, runOnIntervalConfig{
394-
name: "identity-renewal",
394+
service: s.String(),
395+
name: "identity-renewal",
395396
f: func(ctx context.Context) error {
396397
id, err := s.generateIdentity(ctx)
397398
if err != nil {

lib/tbot/tbot.go

+8-1
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,14 @@ func (b *Bot) Run(ctx context.Context) (err error) {
133133
defer func() { apitracing.EndSpan(span, err) }()
134134
startedAt := time.Now()
135135

136-
if err := metrics.RegisterPrometheusCollectors(clientMetrics); err != nil {
136+
if err := metrics.RegisterPrometheusCollectors(
137+
metrics.BuildCollector(),
138+
clientMetrics,
139+
loopIterationsCounter,
140+
loopIterationsSuccessCounter,
141+
loopIterationsFailureCounter,
142+
loopIterationTime,
143+
); err != nil {
137144
return trace.Wrap(err)
138145
}
139146

0 commit comments

Comments
 (0)