From 4e0f4c20edd2743426dc974dae48d26dc3316b42 Mon Sep 17 00:00:00 2001 From: Simon Pasquier Date: Tue, 18 Feb 2025 17:07:55 +0100 Subject: [PATCH] frontend: refactor metrics This commit refactors the frontend service to report more sensible and useful metrics. The visible changes are: * The `endpoint` label of the `frontend_health` gauge metric is dropped. * The `frontend_requests_total` gauge metric is renamed to `frontend_http_requests_total` (counter). * The `frontend_duration` gauge metric is renamed to `frontend_http_requests_duration_seconds` (histogram). The histogram buckets have been chosen to match the MSFT requirements in terms of latency: [0.25s, 0.5s, 1s, 2.5s, 5s, 10s]. Signed-off-by: Simon Pasquier --- frontend/cmd/cmd.go | 5 +- frontend/go.mod | 3 +- frontend/go.sum | 2 - frontend/pkg/frontend/const.go | 7 + frontend/pkg/frontend/context.go | 46 +++++- frontend/pkg/frontend/frontend.go | 46 +++--- frontend/pkg/frontend/frontend_test.go | 201 +++++++++++++++++------- frontend/pkg/frontend/metrics.go | 158 +++++++++---------- frontend/pkg/frontend/middleware.go | 19 ++- frontend/pkg/frontend/node_pool_test.go | 47 +++--- frontend/pkg/frontend/routes.go | 7 +- frontend/pkg/metrics/metrics.go | 20 ++- go.work.sum | 2 + 13 files changed, 361 insertions(+), 202 deletions(-) diff --git a/frontend/cmd/cmd.go b/frontend/cmd/cmd.go index 265d10c94..d15410d1b 100644 --- a/frontend/cmd/cmd.go +++ b/frontend/cmd/cmd.go @@ -109,9 +109,6 @@ func (opts *FrontendOpts) Run() error { logger := util.DefaultLogger() logger.Info(fmt.Sprintf("%s (%s) started", frontend.ProgramName, util.Version())) - // Initialize the Prometheus emitter. - prometheusEmitter := frontend.NewPrometheusEmitter(prometheus.DefaultRegisterer) - // Initialize the global OpenTelemetry tracer. otelShutdown, err := frontend.ConfigureOpenTelemetryTracer(ctx, logger, semconv.CloudRegion(opts.location)) if err != nil { @@ -179,7 +176,7 @@ func (opts *FrontendOpts) Run() error { } logger.Info(fmt.Sprintf("Application running in %s", opts.location)) - f := frontend.NewFrontend(logger, listener, metricsListener, prometheusEmitter, dbClient, opts.location, &csClient) + f := frontend.NewFrontend(logger, listener, metricsListener, prometheus.DefaultRegisterer, dbClient, opts.location, &csClient) stop := make(chan struct{}) signalChannel := make(chan os.Signal, 1) diff --git a/frontend/go.mod b/frontend/go.mod index 681a84250..fae4bd638 100644 --- a/frontend/go.mod +++ b/frontend/go.mod @@ -17,7 +17,6 @@ require ( go.opentelemetry.io/otel/sdk v1.34.0 go.opentelemetry.io/otel/trace v1.34.0 go.uber.org/mock v0.5.0 - golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c golang.org/x/sync v0.11.0 ) @@ -86,7 +85,7 @@ require ( github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/openshift/api v0.0.0-20240429104249-ac9356ba1784 github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect - github.com/prometheus/client_model v0.6.1 // indirect + github.com/prometheus/client_model v0.6.1 github.com/prometheus/common v0.62.0 // indirect github.com/prometheus/procfs v0.15.1 // indirect github.com/skratchdot/open-golang v0.0.0-20200116055534-eef842397966 // indirect diff --git a/frontend/go.sum b/frontend/go.sum index 4802aa5c7..8232df741 100644 --- a/frontend/go.sum +++ b/frontend/go.sum @@ -228,8 +228,6 @@ golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.33.0 h1:IOBPskki6Lysi0lo9qQvbxiQ+FvsCC/YWOecCHAixus= golang.org/x/crypto v0.33.0/go.mod h1:bVdXmD7IV/4GdElGPozy6U7lWdRXA4qyRVGJV57uQ5M= -golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c h1:7dEasQXItcW1xKJ2+gg5VOiBnqWrJc+rq0DPKyvvdbY= -golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c/go.mod h1:NQtJDoLvd6faHhE7m4T/1IY708gDefGGjR/iUW8yQQ8= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= diff --git a/frontend/pkg/frontend/const.go b/frontend/pkg/frontend/const.go index 8609cd394..bba10c213 100644 --- a/frontend/pkg/frontend/const.go +++ b/frontend/pkg/frontend/const.go @@ -18,4 +18,11 @@ const ( PathSegmentResourceGroupName = "resourcegroupname" PathSegmentResourceName = "resourcename" PathSegmentSubscriptionID = "subscriptionid" + + healthGaugeName = "frontend_health" + requestCounterName = "frontend_http_requests_total" + requestDurationName = "frontend_http_requests_duration_seconds" + + noMatchRouteLabel = "" + unknownVersionLabel = "" ) diff --git a/frontend/pkg/frontend/context.go b/frontend/pkg/frontend/context.go index 671b0dbb8..de93d1306 100644 --- a/frontend/pkg/frontend/context.go +++ b/frontend/pkg/frontend/context.go @@ -15,17 +15,43 @@ import ( type ContextError struct { got any + key contextKey } func (c *ContextError) Error() string { return fmt.Sprintf( - "error retrieving value from context, value obtained was '%v' and type obtained was '%T'", + "error retrieving value for key %q from context, value obtained was '%v' and type obtained was '%T'", + c.key, c.got, c.got) } type contextKey int +func (c contextKey) String() string { + switch c { + case contextKeyOriginalPath: + return "originalPath" + case contextKeyBody: + return "body" + case contextKeyLogger: + return "logger" + case contextKeyVersion: + return "version" + case contextKeyDBClient: + return "dbClient" + case contextKeyResourceID: + return "resourceID" + case contextKeyCorrelationData: + return "correlationData" + case contextKeySystemData: + return "systemData" + case contextKeyPattern: + return "pattern" + } + return "" +} + const ( // Keys for request-scoped data in http.Request contexts contextKeyOriginalPath contextKey = iota @@ -36,6 +62,7 @@ const ( contextKeyResourceID contextKeyCorrelationData contextKeySystemData + contextKeyPattern ) func ContextWithOriginalPath(ctx context.Context, originalPath string) context.Context { @@ -47,6 +74,7 @@ func OriginalPathFromContext(ctx context.Context) (string, error) { if !ok { err := &ContextError{ got: originalPath, + key: contextKeyOriginalPath, } return originalPath, err } @@ -62,6 +90,7 @@ func BodyFromContext(ctx context.Context) ([]byte, error) { if !ok { err := &ContextError{ got: body, + key: contextKeyBody, } return body, err } @@ -77,6 +106,7 @@ func LoggerFromContext(ctx context.Context) *slog.Logger { if !ok { err := &ContextError{ got: logger, + key: contextKeyLogger, } // Return the default logger as a fail-safe, but log // the failure to obtain the logger from the context. @@ -95,6 +125,7 @@ func VersionFromContext(ctx context.Context) (api.Version, error) { if !ok { err := &ContextError{ got: version, + key: contextKeyVersion, } return version, err } @@ -110,6 +141,7 @@ func DBClientFromContext(ctx context.Context) (database.DBClient, error) { if !ok { err := &ContextError{ got: dbClient, + key: contextKeyDBClient, } return dbClient, err } @@ -125,6 +157,7 @@ func ResourceIDFromContext(ctx context.Context) (*azcorearm.ResourceID, error) { if !ok { err := &ContextError{ got: resourceID, + key: contextKeyResourceID, } return resourceID, err } @@ -140,6 +173,7 @@ func CorrelationDataFromContext(ctx context.Context) (*arm.CorrelationData, erro if !ok { err := &ContextError{ got: correlationData, + key: contextKeyCorrelationData, } return correlationData, err } @@ -155,8 +189,18 @@ func SystemDataFromContext(ctx context.Context) (*arm.SystemData, error) { if !ok { err := &ContextError{ got: systemData, + key: contextKeySystemData, } return systemData, err } return systemData, nil } + +func ContextWithPattern(ctx context.Context, pattern *string) context.Context { + return context.WithValue(ctx, contextKeyPattern, pattern) +} + +func PatternFromContext(ctx context.Context) *string { + pattern, _ := ctx.Value(contextKeyPattern).(*string) + return pattern +} diff --git a/frontend/pkg/frontend/frontend.go b/frontend/pkg/frontend/frontend.go index efbfb2b46..606ff751f 100644 --- a/frontend/pkg/frontend/frontend.go +++ b/frontend/pkg/frontend/frontend.go @@ -21,6 +21,7 @@ import ( azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" arohcpv1alpha1 "github.com/openshift-online/ocm-sdk-go/arohcp/v1alpha1" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" "golang.org/x/sync/errgroup" "github.com/Azure/ARO-HCP/frontend/pkg/metrics" @@ -39,16 +40,24 @@ type Frontend struct { dbClient database.DBClient ready atomic.Value done chan struct{} - metrics Emitter location string + collector *metrics.SubscriptionCollector + healthGauge prometheus.Gauge } -func NewFrontend(logger *slog.Logger, listener net.Listener, metricsListener net.Listener, emitter Emitter, dbClient database.DBClient, location string, csClient ocm.ClusterServiceClientSpec) *Frontend { +func NewFrontend( + logger *slog.Logger, + listener net.Listener, + metricsListener net.Listener, + reg prometheus.Registerer, + dbClient database.DBClient, + location string, + csClient ocm.ClusterServiceClientSpec, +) *Frontend { f := &Frontend{ clusterServiceClient: csClient, listener: listener, metricsListener: metricsListener, - metrics: emitter, server: http.Server{ ErrorLog: slog.NewLogLogger(logger.Handler(), slog.LevelError), BaseContext: func(net.Listener) context.Context { @@ -64,12 +73,19 @@ func NewFrontend(logger *slog.Logger, listener net.Listener, metricsListener net return ContextWithLogger(context.Background(), logger) }, }, - dbClient: dbClient, - done: make(chan struct{}), - location: strings.ToLower(location), + dbClient: dbClient, + done: make(chan struct{}), + location: strings.ToLower(location), + collector: metrics.NewSubscriptionCollector(reg, dbClient, location), + healthGauge: promauto.With(reg).NewGauge( + prometheus.GaugeOpts{ + Name: healthGaugeName, + Help: "Reports the health status of the service (0: not healthy, 1: healthy).", + }, + ), } - f.server.Handler = f.routes() + f.server.Handler = f.routes(reg) f.metricsServer.Handler = f.metricsRoutes() return f @@ -101,8 +117,7 @@ func (f *Frontend) Run(ctx context.Context, stop <-chan struct{}) { return f.metricsServer.Serve(f.metricsListener) }) errs.Go(func() error { - collector := metrics.NewSubscriptionCollector(prometheus.DefaultRegisterer, f.dbClient, f.location) - collector.Run(logger, stop) + f.collector.Run(logger, stop) return nil }) @@ -137,19 +152,14 @@ func (f *Frontend) NotFound(writer http.ResponseWriter, request *http.Request) { } func (f *Frontend) Healthz(writer http.ResponseWriter, request *http.Request) { - var healthStatus float64 - if f.CheckReady(request.Context()) { writer.WriteHeader(http.StatusOK) - healthStatus = 1.0 - } else { - arm.WriteInternalServerError(writer) - healthStatus = 0.0 + f.healthGauge.Set(1.0) + return } - f.metrics.EmitGauge("frontend_health", healthStatus, map[string]string{ - "endpoint": "/healthz", - }) + arm.WriteInternalServerError(writer) + f.healthGauge.Set(0.0) } func (f *Frontend) ArmResourceList(writer http.ResponseWriter, request *http.Request) { diff --git a/frontend/pkg/frontend/frontend_test.go b/frontend/pkg/frontend/frontend_test.go index 0838f3d9d..6dc3f68b4 100644 --- a/frontend/pkg/frontend/frontend_test.go +++ b/frontend/pkg/frontend/frontend_test.go @@ -2,13 +2,12 @@ package frontend import ( "bytes" - "context" "encoding/json" "io" "log/slog" - "net" "net/http" "net/http/httptest" + "slices" "strings" "testing" "time" @@ -16,6 +15,9 @@ import ( azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" + dto "github.com/prometheus/client_model/go" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "go.uber.org/mock/gomock" "github.com/Azure/ARO-HCP/internal/api" @@ -66,29 +68,30 @@ func TestReadiness(t *testing.T) { mockDBClient := mocks.NewMockDBClient(ctrl) reg := prometheus.NewRegistry() - f := &Frontend{ - dbClient: mockDBClient, - metrics: NewPrometheusEmitter(reg), - } + f := NewFrontend( + testLogger, + nil, + nil, + reg, + mockDBClient, + "", + nil, + ) f.ready.Store(test.ready) - ts := httptest.NewServer(f.routes()) - ts.Config.BaseContext = func(net.Listener) context.Context { - return ContextWithLogger(context.Background(), testLogger) - } - // Call expected but is irrelevant to the test. mockDBClient.EXPECT().DBConnectionTest(gomock.Any()) - rs, err := ts.Client().Get(ts.URL + "/healthz") - if err != nil { - t.Fatal(err) - } + ts := newHTTPServer(f, ctrl, mockDBClient) - if rs.StatusCode != test.expectedStatusCode { - t.Errorf("expected status code %d, got %d", test.expectedStatusCode, rs.StatusCode) - } + rs, err := ts.Client().Get(ts.URL + "/healthz") + require.NoError(t, err) + require.Equal(t, test.expectedStatusCode, rs.StatusCode) lintMetrics(t, reg) + + got, err := testutil.GatherAndCount(reg, healthGaugeName) + require.NoError(t, err) + assert.Equal(t, 1, got) }) } } @@ -122,24 +125,28 @@ func TestSubscriptionsGET(t *testing.T) { mockDBClient := mocks.NewMockDBClient(ctrl) reg := prometheus.NewRegistry() - f := &Frontend{ - dbClient: mockDBClient, - metrics: NewPrometheusEmitter(reg), - } - - // ArmSubscriptionGet and MetricsMiddleware + f := NewFrontend( + testLogger, + nil, + nil, + reg, + mockDBClient, + "", + nil, + ) + + // ArmSubscriptionGet. mockDBClient.EXPECT(). GetSubscriptionDoc(gomock.Any(), gomock.Any()). Return(getMockDBDoc(test.subDoc)). - Times(2) - - ts := httptest.NewServer(f.routes()) - ts.Config.BaseContext = func(net.Listener) context.Context { - ctx := context.Background() - ctx = ContextWithLogger(ctx, testLogger) - ctx = ContextWithDBClient(ctx, f.dbClient) - return ctx + Times(1) + + // The subscription collector lists all documents once. + var subs []*database.SubscriptionDocument + if test.subDoc != nil { + subs = append(subs, test.subDoc) } + ts := newHTTPServer(f, ctrl, mockDBClient, subs...) rs, err := ts.Client().Get(ts.URL + "/subscriptions/" + subscriptionID + "?api-version=2.0") if err != nil { @@ -151,6 +158,7 @@ func TestSubscriptionsGET(t *testing.T) { } lintMetrics(t, reg) + assertHTTPMetrics(t, reg, test.subDoc) }) } } @@ -240,10 +248,15 @@ func TestSubscriptionsPUT(t *testing.T) { mockDBClient := mocks.NewMockDBClient(ctrl) reg := prometheus.NewRegistry() - f := &Frontend{ - dbClient: mockDBClient, - metrics: NewPrometheusEmitter(reg), - } + f := NewFrontend( + testLogger, + nil, + nil, + reg, + mockDBClient, + "", + nil, + ) body, err := json.Marshal(&test.subscription) if err != nil { @@ -269,20 +282,12 @@ func TestSubscriptionsPUT(t *testing.T) { UpdateSubscriptionDoc(gomock.Any(), gomock.Any(), gomock.Any()) } } - // MiddlewareMetrics - // (except when MiddlewareValidateStatic fails) - mockDBClient.EXPECT(). - GetSubscriptionDoc(gomock.Any(), gomock.Any()). - Return(database.NewSubscriptionDocument(subscriptionID, test.subscription), nil). - MaxTimes(1) - ts := httptest.NewServer(f.routes()) - ts.Config.BaseContext = func(net.Listener) context.Context { - ctx := context.Background() - ctx = ContextWithLogger(ctx, testLogger) - ctx = ContextWithDBClient(ctx, f.dbClient) - return ctx + var subs []*database.SubscriptionDocument + if test.subDoc != nil { + subs = append(subs, test.subDoc) } + ts := newHTTPServer(f, ctrl, mockDBClient, subs...) req, err := http.NewRequest(http.MethodPut, ts.URL+test.urlPath, bytes.NewReader(body)) if err != nil { @@ -291,15 +296,14 @@ func TestSubscriptionsPUT(t *testing.T) { req.Header.Set("Content-Type", "application/json") rs, err := ts.Client().Do(req) - if err != nil { - t.Fatal(err) - } + require.NoError(t, err) - if rs.StatusCode != test.expectedStatusCode { - t.Errorf("expected status code %d, got %d", test.expectedStatusCode, rs.StatusCode) - } + assert.Equal(t, test.expectedStatusCode, rs.StatusCode) lintMetrics(t, reg) + if test.expectedStatusCode != http.StatusBadRequest { + assertHTTPMetrics(t, reg, test.subDoc) + } }) } } @@ -308,11 +312,94 @@ func lintMetrics(t *testing.T, r prometheus.Gatherer) { t.Helper() problems, err := testutil.GatherAndLint(r) - if err != nil { - t.Fatal(err) - } + require.NoError(t, err) for _, p := range problems { t.Errorf("metric %q: %s", p.Metric, p.Text) } } + +// assertHTTPMetrics ensures that HTTP metrics have been recorded. +func assertHTTPMetrics(t *testing.T, r prometheus.Gatherer, subscriptionDoc *database.SubscriptionDocument) { + t.Helper() + + metrics, err := r.Gather() + assert.NoError(t, err) + + var mfs []*dto.MetricFamily + for _, mf := range metrics { + if mf.GetName() != requestCounterName && mf.GetName() != requestDurationName { + continue + } + + mfs = append(mfs, mf) + + for _, m := range mf.GetMetric() { + var ( + route string + apiVersion string + state string + ) + for _, l := range m.GetLabel() { + switch l.GetName() { + case "route": + route = l.GetValue() + case "api_version": + apiVersion = l.GetValue() + case "state": + state = l.GetValue() + } + } + + // Verify that route and API version labels have known values. + assert.NotEmpty(t, route) + assert.NotEqual(t, route, noMatchRouteLabel) + assert.NotEmpty(t, apiVersion) + assert.NotEqual(t, apiVersion, unknownVersionLabel) + + if mf.GetName() == requestCounterName { + assert.NotEmpty(t, state) + if subscriptionDoc != nil { + assert.Equal(t, string(subscriptionDoc.Subscription.State), state) + } else { + assert.Equal(t, "Unknown", state) + } + } + } + } + + // We need request counter and latency histogram. + assert.Len(t, mfs, 2) +} + +// newHTTPServer returns a test HTTP server. When a mock DB client is provided, +// the subscription collector will be bootstrapped with the provided +// subscription documents. +func newHTTPServer(f *Frontend, ctrl *gomock.Controller, mockDBClient *mocks.MockDBClient, subs ...*database.SubscriptionDocument) *httptest.Server { + ts := httptest.NewUnstartedServer(f.server.Handler) + ts.Config.BaseContext = f.server.BaseContext + ts.Start() + + mockIter := mocks.NewMockDBClientIterator[database.SubscriptionDocument](ctrl) + mockIter.EXPECT(). + Items(gomock.Any()). + Return(database.DBClientIteratorItem[database.SubscriptionDocument](slices.Values(subs))) + + mockIter.EXPECT(). + GetError(). + Return(nil) + + mockDBClient.EXPECT(). + ListAllSubscriptionDocs(). + Return(mockIter). + Times(1) + + // The initialization of the subscriptions collector is normally part of + // the Run() method but the method doesn't get called in the tests so it's + // executed here. + stop := make(chan struct{}) + close(stop) + f.collector.Run(testLogger, stop) + + return ts +} diff --git a/frontend/pkg/frontend/metrics.go b/frontend/pkg/frontend/metrics.go index 4b54587f0..3d9d1a248 100644 --- a/frontend/pkg/frontend/metrics.go +++ b/frontend/pkg/frontend/metrics.go @@ -7,68 +7,24 @@ import ( "net/http" "regexp" "strconv" - "sync" "time" + azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" "github.com/prometheus/client_golang/prometheus" - "golang.org/x/exp/maps" - - "github.com/Azure/ARO-HCP/internal/database" + "github.com/prometheus/client_golang/prometheus/promauto" ) -// Emitter emits different types of metrics -type Emitter interface { - AddCounter(metricName string, value float64, labels map[string]string) - EmitGauge(metricName string, value float64, labels map[string]string) -} - -type PrometheusEmitter struct { - mutex sync.Mutex - gauges map[string]*prometheus.GaugeVec - counters map[string]*prometheus.CounterVec - registry prometheus.Registerer -} - -func NewPrometheusEmitter(r prometheus.Registerer) *PrometheusEmitter { - return &PrometheusEmitter{ - gauges: make(map[string]*prometheus.GaugeVec), - counters: make(map[string]*prometheus.CounterVec), - registry: r, - } -} - -func (pe *PrometheusEmitter) EmitGauge(name string, value float64, labels map[string]string) { - pe.mutex.Lock() - defer pe.mutex.Unlock() - vec, exists := pe.gauges[name] - if !exists { - labelKeys := maps.Keys(labels) - vec = prometheus.NewGaugeVec(prometheus.GaugeOpts{Name: name}, labelKeys) - pe.registry.MustRegister(vec) - pe.gauges[name] = vec - } - vec.With(labels).Set(value) -} - -func (pe *PrometheusEmitter) AddCounter(name string, value float64, labels map[string]string) { - pe.mutex.Lock() - defer pe.mutex.Unlock() - vec, exists := pe.counters[name] - if !exists { - labelKeys := maps.Keys(labels) - vec = prometheus.NewCounterVec(prometheus.CounterOpts{Name: name}, labelKeys) - pe.registry.MustRegister(vec) - pe.counters[name] = vec - } - vec.With(labels).Add(value) -} - // patternRe is used to strip the METHOD string from the [ServerMux] pattern string. var patternRe = regexp.MustCompile(`^[^\s]*\s+`) +type SubscriptionStateGetter interface { + GetSubscriptionState(string) string +} + type MetricsMiddleware struct { - Emitter - dbClient database.DBClient + ssg SubscriptionStateGetter + requestCounter *prometheus.CounterVec + requestDuration *prometheus.HistogramVec } type logResponseWriter struct { @@ -82,49 +38,85 @@ func (lrw *logResponseWriter) WriteHeader(code int) { lrw.ResponseWriter.WriteHeader(code) } +func NewMetricsMiddleware(r prometheus.Registerer, ssg SubscriptionStateGetter) *MetricsMiddleware { + mm := &MetricsMiddleware{ + ssg: ssg, + requestCounter: promauto.With(r).NewCounterVec( + prometheus.CounterOpts{ + Name: requestCounterName, + Help: "Counter for HTTP requests by method, code and route.", + }, + []string{"api_version", "method", "code", "route", "state"}, + ), + requestDuration: promauto.With(r).NewHistogramVec( + prometheus.HistogramOpts{ + Name: requestDurationName, + Help: "Histogram of latencies for HTTP requests by method, code and route.", + // The bucket values are chosen to match the general + // recommendation in terms of latency for resource providers + // (e.g. latency less than or equal to 1 second). + Buckets: []float64{.25, .5, 1, 2.5, 5, 10}, + // Enable native histogram (sparse buckets). The settings have + // been chosen to offer a balance between accuracy and memory + // usage. + // Note that it requires support from the scraper (e.g. Prometheus). + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: 1 * time.Hour, + }, + []string{"api_version", "method", "code", "route"}, + ), + } + + return mm +} + // Metrics middleware to capture response time and status code func (mm MetricsMiddleware) Metrics() MiddlewareFunc { return func(w http.ResponseWriter, r *http.Request, next http.HandlerFunc) { - ctx := r.Context() - logger := LoggerFromContext(ctx) - startTime := time.Now() lrw := &logResponseWriter{ResponseWriter: w} - next(lrw, r) // Process the request - - duration := time.Since(startTime).Seconds() + next(lrw, r) // Process the request. + + // Get the route pattern that matched. + // Note that the value can be empty if one of the middlewares executing + // before the ServeMux handler returned early. + var ( + routePattern = PatternFromContext(r.Context()) + route string + ) + if routePattern != nil { + route = patternRe.ReplaceAllString(*routePattern, "") + } + if route == "" { + route = noMatchRouteLabel + } - // Get the route pattern that matched - routePattern := r.Pattern - routePattern = patternRe.ReplaceAllString(routePattern, "") + apiVersion := r.URL.Query().Get(APIVersionKey) + if apiVersion == "" { + apiVersion = unknownVersionLabel + } - subscriptionState := "Unknown" - subscriptionId := r.PathValue(PathSegmentSubscriptionID) - if subscriptionId != "" { - sub, err := mm.dbClient.GetSubscriptionDoc(r.Context(), subscriptionId) - if err != nil { - // If we can't determine the subscription state, we can still expose a metric for subscriptionState "Unknown" - logger.Info("unable to retrieve subscription document for the `frontend_requests_total` metric", "subscriptionId", subscriptionId, "error", err) - } else { - subscriptionState = string(sub.Subscription.State) - } + var subscriptionID string + if resource, _ := azcorearm.ParseResourceID(r.URL.Path); resource != nil { + subscriptionID = resource.SubscriptionID } - mm.Emitter.AddCounter("frontend_requests_total", 1.0, map[string]string{ - "verb": r.Method, - "api_version": r.URL.Query().Get(APIVersionKey), + mm.requestCounter.With(prometheus.Labels{ + "method": r.Method, + "api_version": apiVersion, "code": strconv.Itoa(lrw.statusCode), - "route": routePattern, - "state": subscriptionState, - }) + "route": route, + "state": mm.ssg.GetSubscriptionState(subscriptionID), + }).Inc() - mm.Emitter.EmitGauge("frontend_duration", float64(duration), map[string]string{ - "verb": r.Method, - "api_version": r.URL.Query().Get(APIVersionKey), + mm.requestDuration.With(prometheus.Labels{ + "method": r.Method, + "api_version": apiVersion, "code": strconv.Itoa(lrw.statusCode), - "route": routePattern, - }) + "route": route, + }).Observe(time.Since(startTime).Seconds()) } } diff --git a/frontend/pkg/frontend/middleware.go b/frontend/pkg/frontend/middleware.go index 751c4b9e5..8bbffabf5 100644 --- a/frontend/pkg/frontend/middleware.go +++ b/frontend/pkg/frontend/middleware.go @@ -80,5 +80,22 @@ func NewMiddlewareMux(functions ...MiddlewareFunc) *MiddlewareMux { // ServeHTTP dispatches the request to each middleware function, and then to // the handler whose pattern most closely matches the request URL. func (mux *MiddlewareMux) ServeHTTP(w http.ResponseWriter, r *http.Request) { - mux.middleware.Handler(&mux.ServeMux).ServeHTTP(w, r) + // Initialize a string pointer to record the pattern matched by ServeMux. + // + // This is useful for middlewares that are executed before ServeMux and + // want to know the matched pattern. They can't rely on the value stored in + // r.Pattern because the original request can be mutated by following + // middlewares in which case r.Pattern remains empty. + // + // Since the handlers execute sequentially, there's no risk of concurrent + // access to the value. + patt := new(string) + r = r.WithContext(ContextWithPattern(r.Context(), patt)) + + mainHandler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + mux.ServeMux.ServeHTTP(w, r) + *patt = r.Pattern + }) + + mux.middleware.Handler(mainHandler).ServeHTTP(w, r) } diff --git a/frontend/pkg/frontend/node_pool_test.go b/frontend/pkg/frontend/node_pool_test.go index 7553e5248..50c796ca7 100644 --- a/frontend/pkg/frontend/node_pool_test.go +++ b/frontend/pkg/frontend/node_pool_test.go @@ -6,15 +6,15 @@ import ( "bytes" "context" "encoding/json" - "net" "net/http" - "net/http/httptest" "testing" "time" azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "go.uber.org/mock/gomock" "github.com/Azure/ARO-HCP/internal/api" @@ -70,7 +70,8 @@ func TestCreateNodePool(t *testing.T) { { name: "PUT Node Pool - Create a new Node Pool", urlPath: dummyNodePoolID + "?api-version=2024-06-10-preview", - subDoc: database.NewSubscriptionDocument(dummySubscriptionId, + subDoc: database.NewSubscriptionDocument( + dummySubscriptionId, &arm.Subscription{ State: arm.SubscriptionStateRegistered, RegistrationDate: api.Ptr(time.Now().String()), @@ -78,7 +79,6 @@ func TestCreateNodePool(t *testing.T) { }), clusterDoc: clusterDoc, nodePoolDoc: nodePoolDoc, - systemData: &arm.SystemData{}, expectedStatusCode: http.StatusCreated, }, } @@ -90,26 +90,22 @@ func TestCreateNodePool(t *testing.T) { mockCSClient := mocks.NewMockClusterServiceClientSpec(ctrl) reg := prometheus.NewRegistry() - f := &Frontend{ - dbClient: mockDBClient, - metrics: NewPrometheusEmitter(reg), - clusterServiceClient: mockCSClient, - } + f := NewFrontend( + testLogger, + nil, + nil, + reg, + mockDBClient, + "", + mockCSClient, + ) requestHeader := make(http.Header) requestHeader.Add(arm.HeaderNameHomeTenantID, dummyTenantId) body, _ := json.Marshal(requestBody) - ts := httptest.NewServer(f.routes()) - ts.Config.BaseContext = func(net.Listener) context.Context { - ctx := context.Background() - ctx = ContextWithLogger(ctx, testLogger) // defined in frontend_test.go - ctx = ContextWithDBClient(ctx, f.dbClient) - ctx = ContextWithSystemData(ctx, test.systemData) - - return ctx - } + ts := newHTTPServer(f, ctrl, mockDBClient, test.subDoc) // CreateOrUpdateNodePool mockCSClient.EXPECT(). @@ -126,11 +122,11 @@ func TestCreateNodePool(t *testing.T) { // MiddlewareLockSubscription mockDBClient.EXPECT(). GetLockClient() - // MiddlewareValidateSubscriptionState and MetricsMiddleware + // MiddlewareValidateSubscriptionState mockDBClient.EXPECT(). GetSubscriptionDoc(gomock.Any(), test.subDoc.ID). Return(test.subDoc, nil). - Times(2) + Times(1) // CreateOrUpdateNodePool mockDBClient.EXPECT(). GetResourceDoc(gomock.Any(), equalResourceID(test.nodePoolDoc.ResourceID)). @@ -155,19 +151,16 @@ func TestCreateNodePool(t *testing.T) { t.Fatal(err) } req.Header.Set("Content-Type", "application/json") + req.Header.Set(arm.HeaderNameARMResourceSystemData, "{}") rs, err := ts.Client().Do(req) t.Log(rs) - if err != nil { - t.Log(err) - t.Fatal(err) - } + require.NoError(t, err) - if rs.StatusCode != test.expectedStatusCode { - t.Errorf("expected status code %d, got %d", test.expectedStatusCode, rs.StatusCode) - } + assert.Equal(t, test.expectedStatusCode, rs.StatusCode) lintMetrics(t, reg) + assertHTTPMetrics(t, reg, test.subDoc) }) } } diff --git a/frontend/pkg/frontend/routes.go b/frontend/pkg/frontend/routes.go index 838ccc10c..afdfe1586 100644 --- a/frontend/pkg/frontend/routes.go +++ b/frontend/pkg/frontend/routes.go @@ -6,6 +6,7 @@ import ( "path" "strings" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/Azure/ARO-HCP/internal/api" @@ -39,12 +40,13 @@ func MuxPattern(method string, segments ...string) string { return fmt.Sprintf("%s /%s", method, strings.ToLower(path.Join(segments...))) } -func (f *Frontend) routes() *MiddlewareMux { +func (f *Frontend) routes(r prometheus.Registerer) *MiddlewareMux { // Setup metrics middleware - metricsMiddleware := MetricsMiddleware{dbClient: f.dbClient, Emitter: f.metrics} + metricsMiddleware := NewMetricsMiddleware(r, f.collector) mux := NewMiddlewareMux( MiddlewarePanic, + metricsMiddleware.Metrics(), MiddlewareCorrelationData, MiddlewareTracing, MiddlewareLogging, @@ -56,7 +58,6 @@ func (f *Frontend) routes() *MiddlewareMux { MiddlewareLowercase, MiddlewareSystemData, MiddlewareValidateStatic, - metricsMiddleware.Metrics(), ) // Unauthenticated routes diff --git a/frontend/pkg/metrics/metrics.go b/frontend/pkg/metrics/metrics.go index 060c71e8c..673005e57 100644 --- a/frontend/pkg/metrics/metrics.go +++ b/frontend/pkg/metrics/metrics.go @@ -32,7 +32,7 @@ type SubscriptionCollector struct { lastSuccessSyncTimestamp prometheus.Gauge mtx sync.RWMutex - subscriptions []subscription + subscriptions map[string]subscription } const ( @@ -123,15 +123,15 @@ func (sc *SubscriptionCollector) refresh(logger *slog.Logger) { } func (sc *SubscriptionCollector) updateCache() error { - var subscriptions []subscription + subscriptions := make(map[string]subscription) iter := sc.dbClient.ListAllSubscriptionDocs() for sub := range iter.Items(context.Background()) { - subscriptions = append(subscriptions, subscription{ + subscriptions[sub.ID] = subscription{ id: sub.ID, state: string(sub.Subscription.State), updated_at: int64(sub.CosmosTimestamp), - }) + } } if err := iter.GetError(); err != nil { return err @@ -144,6 +144,18 @@ func (sc *SubscriptionCollector) updateCache() error { return nil } +// GetSubscriptionState returns the state of the subscription. +func (sc *SubscriptionCollector) GetSubscriptionState(id string) string { + sc.mtx.RLock() + defer sc.mtx.RUnlock() + + if s, found := sc.subscriptions[id]; found { + return s.state + } + + return "Unknown" +} + var ( subscriptionStateDesc = prometheus.NewDesc( subscriptionStateName, diff --git a/go.work.sum b/go.work.sum index 59931950b..9427f60f6 100644 --- a/go.work.sum +++ b/go.work.sum @@ -2301,6 +2301,8 @@ golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4= golang.org/x/term v0.25.0/go.mod h1:RPyXicDX+6vLxogjjRxjgD2TKtmAO6NZBsBRfrOLu7M= +golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU= +golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=