Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Add initial support for opentelemetry tracing in maestro server #253

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 99 additions & 0 deletions cmd/maestro/common/otlp_sdk.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
package common
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should move the initialization to a place where it can be reused by the frontend too. What do you think?

E.g. here: https://github.com/Azure/ARO-HCP/tree/main/internal

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, but should it be somewhere other than the "internal" package? An internal package usually means it is not exportable or should not be exportable.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

github.com/openshift-online/ocm-common might be the best option since it's already imported by https://gitlab.cee.redhat.com/service/uhc-clusters-service/ too. And it should be ok to import it in https://github.com/Azure/ARO-HCP/ too.


import (
"context"
"fmt"
"os"
"time"

"go.opentelemetry.io/contrib/exporters/autoexport"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/propagation"
"go.opentelemetry.io/otel/sdk/resource"
tracesdk "go.opentelemetry.io/otel/sdk/trace"
semconv "go.opentelemetry.io/otel/semconv/v1.25.0"

errors "github.com/zgalor/weberr"

"github.com/openshift-online/maestro/pkg/constants"
"github.com/openshift-online/maestro/pkg/logger"
)

// InstallOpenTelemetryTracer installs the global OpenTelemetry tracer
// provider, text-map propagator (baggage and trace context) and error handler.
//
// The span exporter is chosen by autoexport; newNoopFactory is registered as
// the fallback, so without a specific configuration a noop exporter is used.
// At least two environment variables must be configured to enable trace export:
//
//	OTEL_EXPORTER_OTLP_ENDPOINT=http(s)://<service>.<namespace>:4318
//	OTEL_TRACES_EXPORTER=otlp
//
// On success it returns a function that shuts the tracer provider down; the
// shutdown is bounded by a five second timeout derived from the context
// passed to that function. An error is returned when the exporter or the
// trace resources cannot be created, in which case no global state is changed.
func InstallOpenTelemetryTracer(ctx context.Context, log logger.OCMLogger) (func(context.Context) error, error) {
	log.Info("initializing OpenTelemetry tracer")

	exp, err := autoexport.NewSpanExporter(ctx, autoexport.WithFallbackSpanExporter(newNoopFactory))
	if err != nil {
		return nil, errors.Errorf("failed to create OTEL exporter: %s", err)
	}

	// Use the caller's context so cancellation and deadlines are honoured
	// while the resource detectors run.
	resources, err := resource.New(ctx,
		resource.WithAttributes(
			semconv.ServiceNameKey.String(constants.DefaultSourceID),
		),
		resource.WithHost(),
	)
	if err != nil {
		// No tracer provider owns the exporter yet, so it has to be released
		// here; otherwise it would be leaked on this error path.
		if shutdownErr := exp.Shutdown(ctx); shutdownErr != nil {
			log.Error(fmt.Sprintf("failed to shut down OTEL exporter: %v", shutdownErr))
		}
		return nil, errors.Errorf("failed to initialize trace resources: %s", err)
	}

	tp := tracesdk.NewTracerProvider(
		tracesdk.WithBatcher(exp),
		tracesdk.WithResource(resources),
	)
	otel.SetTracerProvider(tp)

	shutdown := func(ctx context.Context) error {
		ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
		defer cancel()
		return tp.Shutdown(ctx)
	}

	propagator := propagation.NewCompositeTextMapPropagator(propagation.Baggage{}, propagation.TraceContext{})
	otel.SetTextMapPropagator(propagator)

	// Route errors reported by the OpenTelemetry SDK to the service logger.
	otel.SetErrorHandler(otelErrorHandlerFunc(func(err error) {
		log.Error(fmt.Sprintf("OpenTelemetry.ErrorHandler: %v", err))
	}))

	return shutdown, nil
}

// TracingEnabled reports whether a trace exporter is requested through the
// OTEL_TRACES_EXPORTER environment variable.
//
// It returns false when the variable is unset, when it is set to the empty
// string, or when it is set to "none"; any other value yields true.
func TracingEnabled() bool {
	switch os.Getenv("OTEL_TRACES_EXPORTER") {
	case "", "none":
		// An empty value is treated like an unset variable, and "none"
		// explicitly asks for no exporter.
		return false
	default:
		return true
	}
}

// otelErrorHandlerFunc adapts an ordinary func(error) so that it can be
// passed wherever an otel.ErrorHandler is expected.
type otelErrorHandlerFunc func(error)

// Handle implements otel.ErrorHandler by forwarding err to the wrapped function.
func (fn otelErrorHandlerFunc) Handle(err error) {
	fn(err)
}

// newNoopFactory builds a span exporter that performs no operations. It is
// handed to autoexport as the fallback exporter factory; the context argument
// is unused and the returned error is always nil.
func newNoopFactory(_ context.Context) (tracesdk.SpanExporter, error) {
	var exporter tracesdk.SpanExporter = &noopSpanExporter{}
	return exporter, nil
}

// noopSpanExporter is a trace.SpanExporter implementation that performs no
// operations: spans handed to it are dropped.
type noopSpanExporter struct{}

// Compile-time check that noopSpanExporter satisfies tracesdk.SpanExporter.
var _ tracesdk.SpanExporter = noopSpanExporter{}

// ExportSpans is part of the trace.SpanExporter interface. It ignores its
// arguments and always returns nil.
func (noopSpanExporter) ExportSpans(_ context.Context, _ []tracesdk.ReadOnlySpan) error {
	return nil
}

// Shutdown is part of the trace.SpanExporter interface. There is nothing to
// release, so it always returns nil.
func (noopSpanExporter) Shutdown(_ context.Context) error {
	return nil
}
21 changes: 21 additions & 0 deletions cmd/maestro/servecmd/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,23 @@ package servecmd

import (
"context"
"fmt"
"os"
"os/signal"
"syscall"

"github.com/spf13/cobra"
"k8s.io/klog/v2"

"github.com/openshift-online/maestro/cmd/maestro/common"
"github.com/openshift-online/maestro/cmd/maestro/environments"
"github.com/openshift-online/maestro/cmd/maestro/server"
"github.com/openshift-online/maestro/pkg/config"
"github.com/openshift-online/maestro/pkg/controllers"
"github.com/openshift-online/maestro/pkg/db"
"github.com/openshift-online/maestro/pkg/dispatcher"
"github.com/openshift-online/maestro/pkg/event"
"github.com/openshift-online/maestro/pkg/logger"
)

func NewServerCommand() *cobra.Command {
Expand Down Expand Up @@ -76,6 +79,20 @@ func runServer(cmd *cobra.Command, args []string) {

ctx, cancel := context.WithCancel(context.Background())

// tracingShutdown defaults to a no-op so the shutdown path can call it
// unconditionally, whether or not tracing was installed.
tracingShutdown := func(context.Context) error { return nil }
log := logger.NewOCMLogger(ctx)
if common.TracingEnabled() {
	tracingShutdown, err = common.InstallOpenTelemetryTracer(ctx, log)
	// A failed tracer installation is fatal; this is the only place the
	// error can originate, so it is checked exactly once.
	if err != nil {
		log.Error(fmt.Sprintf("Can't initialize OpenTelemetry trace provider: %v", err))
		os.Exit(1)
	}
}
Comment on lines +91 to +94

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it seems duplicated code?

Suggested change
if err != nil {
log.Error(fmt.Sprintf("Can't initialize OpenTelemetry trace provider: %v", err))
os.Exit(1)
}


stopCh := make(chan os.Signal, 1)
signal.Notify(stopCh, syscall.SIGINT, syscall.SIGTERM)
go func() {
Expand All @@ -89,6 +106,10 @@ func runServer(cmd *cobra.Command, args []string) {
if err := metricsServer.Stop(); err != nil {
klog.Errorf("Failed to stop metrics server, %v", err)
}

// Capture the error returned by the shutdown function itself, so the warning
// reports the actual failure rather than the unrelated outer err.
if tracingShutdown != nil {
	if shutdownErr := tracingShutdown(ctx); shutdownErr != nil {
		log.Warning(fmt.Sprintf("OpenTelemetry trace provider failed to shutdown: %v", shutdownErr))
	}
}
}()

// Start the event broadcaster
Expand Down
12 changes: 12 additions & 0 deletions cmd/maestro/server/api_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@ import (
gorillahandlers "github.com/gorilla/handlers"
sdk "github.com/openshift-online/ocm-sdk-go"
"github.com/openshift-online/ocm-sdk-go/authentication"
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
"k8s.io/klog/v2"

"github.com/openshift-online/maestro/cmd/maestro/common"
"github.com/openshift-online/maestro/cmd/maestro/environments"
"github.com/openshift-online/maestro/data/generated/openapi"
"github.com/openshift-online/maestro/pkg/errors"
Expand Down Expand Up @@ -119,6 +121,16 @@ func NewAPIServer(eventBroadcaster *event.EventBroadcaster) Server {
)(mainHandler)

mainHandler = removeTrailingSlash(mainHandler)
mainHandler = traceAttributeMiddleware(mainHandler)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Like we did for CS, we can add more attributes to the span in a follow-up PR to keep things simple.

if common.TracingEnabled() {
mainHandler = otelhttp.NewHandler(mainHandler, "apiserver",
otelhttp.WithSpanNameFormatter(
func(operation string, r *http.Request) string {
return fmt.Sprintf("%s %s %s", operation, "HTTP", r.Method)
},
),
)
}

s.httpServer = &http.Server{
Addr: env().Config.HTTPServer.Hostname + ":" + env().Config.HTTPServer.BindPort,
Expand Down
6 changes: 6 additions & 0 deletions cmd/maestro/server/routes.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ import (

gorillahandlers "github.com/gorilla/handlers"
"github.com/gorilla/mux"
"go.opentelemetry.io/contrib/instrumentation/github.com/gorilla/mux/otelmux"

"github.com/openshift-online/maestro/cmd/maestro/common"
"github.com/openshift-online/maestro/cmd/maestro/server/logging"
"github.com/openshift-online/maestro/pkg/api"
"github.com/openshift-online/maestro/pkg/auth"
Expand Down Expand Up @@ -47,6 +49,10 @@ func (s *apiServer) routes() *mux.Router {
mainRouter := mux.NewRouter()
mainRouter.NotFoundHandler = http.HandlerFunc(api.SendNotFound)

if common.TracingEnabled() {
mainRouter.Use(otelmux.Middleware("serve"))
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's use proto + method here as the span name.

}

// Operation ID middleware sets a relatively unique operation ID in the context of each request for debugging purposes
mainRouter.Use(logger.OperationIDMiddleware)

Expand Down
38 changes: 38 additions & 0 deletions cmd/maestro/server/trace_handler.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package server

import (
"net/http"

"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/baggage"
"go.opentelemetry.io/otel/trace"

"github.com/openshift-online/maestro/pkg/constants"
"github.com/openshift-online/maestro/pkg/logger"
)

// traceAttributeMiddleware is currently only relevant for the correlation of
// requests by the ARO-HCP resource provider frontend.
//
// The middleware extracts correlation data transferred in the baggage and sets
// it as an attribute in the currently active span.
// This middleware has no effect if tracing is deactivated or if there is no
// data in the transferred baggage.
func traceAttributeMiddleware(h http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
ctx := logger.WithOpID(r.Context())
b := baggage.FromContext(ctx)
attrs := []attribute.KeyValue{}
bvalues := []string{constants.ClusterServiceClusterID, constants.AROCorrelationID, constants.AROClientRequestID, constants.ARORequestID, string(logger.OpIDKey)}
for _, k := range bvalues {
if v := b.Member(k).Value(); v != "" {
attrs = append(attrs, attribute.String(k, b.Member(k).Value()))
}
}

if len(attrs) > 0 {
trace.SpanFromContext(ctx).SetAttributes(attrs...)
}
h.ServeHTTP(w, r)
})
}
53 changes: 35 additions & 18 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ require (
github.com/go-gormigrate/gormigrate/v2 v2.0.0
github.com/go-logr/zapr v1.3.0
github.com/golang-jwt/jwt/v4 v4.5.1
github.com/golang/glog v1.2.1
github.com/golang/glog v1.2.2
github.com/google/uuid v1.6.0
github.com/gorilla/handlers v1.5.1
github.com/gorilla/mux v1.8.1
Expand All @@ -34,15 +34,22 @@ require (
github.com/openshift/library-go v0.0.0-20241107160307-0064ad7bd060
github.com/prometheus/client_golang v1.20.5
github.com/prometheus/client_model v0.6.1
github.com/prometheus/common v0.55.0
github.com/prometheus/common v0.62.0
github.com/segmentio/ksuid v1.0.2
github.com/spf13/cobra v1.8.1
github.com/spf13/pflag v1.0.5
github.com/yaacov/tree-search-language v0.0.0-20190923184055-1c2dad2e354b
github.com/zgalor/weberr v0.8.2
go.opentelemetry.io/contrib/exporters/autoexport v0.59.0
go.opentelemetry.io/contrib/instrumentation/github.com/gorilla/mux/otelmux v0.59.0
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0
go.opentelemetry.io/otel v1.34.0
go.opentelemetry.io/otel/sdk v1.34.0
go.opentelemetry.io/otel/trace v1.34.0
go.uber.org/zap v1.27.0
golang.org/x/oauth2 v0.21.0
google.golang.org/grpc v1.65.0
google.golang.org/protobuf v1.35.1
golang.org/x/oauth2 v0.24.0
google.golang.org/grpc v1.69.4
google.golang.org/protobuf v1.36.3
gopkg.in/resty.v1 v1.12.0
gorm.io/datatypes v1.2.0
gorm.io/driver/postgres v1.5.0
Expand All @@ -60,7 +67,7 @@ require (
)

require (
cloud.google.com/go/compute/metadata v0.3.0 // indirect
cloud.google.com/go/compute/metadata v0.5.2 // indirect
github.com/Azure/azure-sdk-for-go/sdk/internal v1.9.0 // indirect
github.com/AzureAD/microsoft-authentication-library-for-go v1.2.2 // indirect
github.com/NYTimes/gziphandler v1.1.1 // indirect
Expand Down Expand Up @@ -103,15 +110,15 @@ require (
github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db // indirect
github.com/gorilla/css v1.0.0 // indirect
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect
github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect
github.com/grpc-ecosystem/grpc-gateway/v2 v2.25.1 // indirect
github.com/imdario/mergo v0.3.16 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/jackc/pgpassfile v1.0.0 // indirect
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
github.com/jinzhu/now v1.1.5 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.17.9 // indirect
github.com/klauspost/compress v1.17.11 // indirect
github.com/kylelemons/godebug v1.1.0 // indirect
github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect
github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect
Expand All @@ -136,15 +143,25 @@ require (
go.etcd.io/etcd/client/pkg/v3 v3.5.14 // indirect
go.etcd.io/etcd/client/v3 v3.5.14 // indirect
go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
go.opentelemetry.io/contrib/bridges/prometheus v0.59.0 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.53.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 // indirect
go.opentelemetry.io/otel v1.28.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.28.0 // indirect
go.opentelemetry.io/otel/metric v1.28.0 // indirect
go.opentelemetry.io/otel/sdk v1.28.0 // indirect
go.opentelemetry.io/otel/trace v1.28.0 // indirect
go.opentelemetry.io/proto/otlp v1.3.1 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.10.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.10.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.34.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.34.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.34.0 // indirect
go.opentelemetry.io/otel/exporters/prometheus v0.56.0 // indirect
go.opentelemetry.io/otel/exporters/stdout/stdoutlog v0.10.0 // indirect
go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.34.0 // indirect
go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.34.0 // indirect
go.opentelemetry.io/otel/log v0.10.0 // indirect
go.opentelemetry.io/otel/metric v1.34.0 // indirect
go.opentelemetry.io/otel/sdk/log v0.10.0 // indirect
go.opentelemetry.io/otel/sdk/metric v1.34.0 // indirect
go.opentelemetry.io/proto/otlp v1.5.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/crypto v0.32.0 // indirect
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect
Expand All @@ -156,8 +173,8 @@ require (
golang.org/x/time v0.5.0 // indirect
golang.org/x/tools v0.26.0 // indirect
google.golang.org/genproto v0.0.0-20240123012728-ef4313101c80 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240701130421-f6361c86f094 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20250115164207-1a7da9e5054f // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20250115164207-1a7da9e5054f // indirect
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect
Expand Down
Loading