@@ -26,14 +26,45 @@ import (
26
26
27
27
"github.com/gravitational/trace"
28
28
"github.com/jonboulle/clockwork"
29
+ "github.com/prometheus/client_golang/prometheus"
29
30
30
31
"github.com/gravitational/teleport/api/utils/retryutils"
31
32
)
32
33
34
+ var (
35
+ loopIterationsCounter = prometheus .NewCounterVec (
36
+ prometheus.CounterOpts {
37
+ Name : "tbot_task_iterations_total" ,
38
+ Help : "Number of task iteration attempts, not counting retries" ,
39
+ }, []string {"service" , "name" },
40
+ )
41
+ loopIterationsSuccessCounter = prometheus .NewHistogramVec (
42
+ prometheus.HistogramOpts {
43
+ Name : "tbot_task_iterations_successful" ,
44
+ Help : "Histogram of task iterations that ultimately succeeded, bucketed by number of retries before success" ,
45
+ Buckets : []float64 {0 , 1 , 2 , 3 , 4 , 5 },
46
+ }, []string {"service" , "name" },
47
+ )
48
+ loopIterationsFailureCounter = prometheus .NewCounterVec (
49
+ prometheus.CounterOpts {
50
+ Name : "tbot_task_iterations_failed" ,
51
+ Help : "Number of task iterations that ultimately failed, not counting retries" ,
52
+ }, []string {"service" , "name" },
53
+ )
54
+ loopIterationTime = prometheus .NewHistogramVec (
55
+ prometheus.HistogramOpts {
56
+ Name : "tbot_task_iteration_duration_seconds" ,
57
+ Help : "Time between beginning and ultimate end of one task iteration regardless of outcome, including all retries" ,
58
+ Buckets : prometheus .ExponentialBuckets (0.1 , 1.75 , 6 ),
59
+ }, []string {"service" , "name" },
60
+ )
61
+ )
62
+
33
63
type runOnIntervalConfig struct {
34
- name string
35
- f func (ctx context.Context ) error
36
- clock clockwork.Clock
64
+ service string
65
+ name string
66
+ f func (ctx context.Context ) error
67
+ clock clockwork.Clock
37
68
// reloadCh allows the task to be triggered immediately, ideal for handling
38
69
// CA rotations or a manual signal from a user.
39
70
// reloadCh can be nil, in which case, the task will only run on the
@@ -49,8 +80,6 @@ type runOnIntervalConfig struct {
49
80
// runOnInterval runs a function on a given interval, with retries and jitter.
50
81
//
51
82
// TODO(noah): Emit Prometheus metrics for:
52
- // - Success/Failure of attempts
53
- // - Time taken to execute attempt
54
83
// - Time of next attempt
55
84
func runOnInterval (ctx context.Context , cfg runOnIntervalConfig ) error {
56
85
switch {
@@ -87,6 +116,9 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error {
87
116
}
88
117
firstRun = false
89
118
119
+ loopIterationsCounter .WithLabelValues (cfg .service , cfg .name ).Inc ()
120
+ startTime := time .Now ()
121
+
90
122
var err error
91
123
for attempt := 1 ; attempt <= cfg .retryLimit ; attempt ++ {
92
124
log .InfoContext (
@@ -97,6 +129,7 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error {
97
129
)
98
130
err = cfg .f (ctx )
99
131
if err == nil {
132
+ loopIterationsSuccessCounter .WithLabelValues (cfg .service , cfg .name ).Observe (float64 (attempt - 1 ))
100
133
break
101
134
}
102
135
@@ -114,12 +147,20 @@ func runOnInterval(ctx context.Context, cfg runOnIntervalConfig) error {
114
147
)
115
148
select {
116
149
case <- ctx .Done ():
150
+ // Note: will discard metric update for this loop. It
151
+ // probably won't be collected if we're shutting down,
152
+ // anyway.
117
153
return nil
118
154
case <- cfg .clock .After (backoffTime ):
119
155
}
120
156
}
121
157
}
158
+
159
+ loopIterationTime .WithLabelValues (cfg .service , cfg .name ).Observe (time .Since (startTime ).Seconds ())
160
+
122
161
if err != nil {
162
+ loopIterationsFailureCounter .WithLabelValues (cfg .service , cfg .name ).Inc ()
163
+
123
164
if cfg .exitOnRetryExhausted {
124
165
log .ErrorContext (
125
166
ctx ,
0 commit comments