Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initialize metrics with labels #2151

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
c667092
metrics: Separate metrics registration and initialization
lambdanis Feb 23, 2024
b97c19b
errormetrics: Define error types as integers not strings
lambdanis Feb 23, 2024
d470f04
errormetrics: Use ops.OpCode type instead of basic ints
lambdanis Feb 24, 2024
92ae579
api/ops: Define OpCodeStrings map with opcode string values
lambdanis Feb 26, 2024
9379ceb
api/ops: Add missing OpCode values
lambdanis Feb 26, 2024
00a28f8
errormetrics, observer: Define error_type label values
lambdanis Feb 26, 2024
3df24c0
errormetrics: Initialize metrics with labels
lambdanis Feb 23, 2024
846f7e3
eventcachemetrics: Define entry_type label values
lambdanis Feb 23, 2024
05472e2
eventcachemetrics: Use tetragon.EventType as event_type label
lambdanis Feb 26, 2024
74f3604
eventcachemetrics: Add event_type label to errors metric
lambdanis Feb 26, 2024
c6df341
eventcachemetrics: Define error label values
lambdanis Feb 26, 2024
491d339
eventcachemetrics: Initialize metrics with labels
lambdanis Feb 23, 2024
48505a2
reader/exec: Define FlagStrings map with flag string values
lambdanis Feb 26, 2024
f207ebe
eventmetrics: Initialize metrics with labels
lambdanis Feb 28, 2024
0e66b7b
kprobemetrics: Define curr_type and prev_type labels values
lambdanis Feb 26, 2024
b9fd806
opcodemetrics: Use ops.OpCode type instead of basic ints
lambdanis Feb 26, 2024
db41ecd
opcodemetrics: Initialize metrics with labels
lambdanis Feb 26, 2024
e40bc60
policyfiltermetrics: Define subsys label values
lambdanis Feb 27, 2024
35f7835
policyfiltermetrics: Define op label values
lambdanis Feb 27, 2024
cc9068a
policyfiltermetrics: Remove error label
lambdanis Feb 27, 2024
96d82a1
policyfiltermetrics: Initialize metrics with labels
lambdanis Feb 27, 2024
1512ea1
Remove pkg/metrics/processexecmetrics
lambdanis Feb 27, 2024
28c69e7
watchermetrics: Define watcher label values
lambdanis Feb 27, 2024
e0458ca
watchermetrics: Initialize metrics with labels
lambdanis Feb 27, 2024
efd3107
observer: Define op as integers not strings
lambdanis Feb 28, 2024
b29398e
observer: Initialize metrics with labels
lambdanis Feb 27, 2024
57d02d5
tracing: Initialize metrics with labels
lambdanis Feb 27, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ require (
github.com/mennanov/fieldmask-utils v1.1.0
github.com/opencontainers/runtime-spec v1.2.0
github.com/pelletier/go-toml v1.9.5
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.18.0
github.com/prometheus/client_model v0.6.0
github.com/sirupsen/logrus v1.9.3
Expand Down Expand Up @@ -149,6 +148,7 @@ require (
github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b // indirect
github.com/pelletier/go-toml/v2 v2.1.0 // indirect
github.com/petermattis/goid v0.0.0-20180202154549-b0b1615b78e5 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
github.com/prometheus/common v0.45.0 // indirect
Expand Down
36 changes: 25 additions & 11 deletions pkg/api/ops/ops.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,21 +65,35 @@ const (
MsgOpKfreeSkb = 11
MsgOpGenericKprobe = 13
MsgOpGeneric_Tracepoint = 14
MsgOpGenericUprobe = 15
MsgOpClone = 23
MsgOpData = 24
MsgOpCgroup = 25
MsgOpLoader = 26
MsgOpTest = 254
)

// OpCodeStrings maps OpCode values to their human-readable names.
// An OpCode that has no entry here has no name; a plain index expression on
// this map yields the empty string for it.
var OpCodeStrings = map[OpCode]string{
MsgOpUndef: "Undef",
MsgOpExecve: "Execve",
MsgOpExit: "Exit",
MsgOpKfreeSkb: "KfreeSkb",
MsgOpGenericKprobe: "GenericKprobe",
MsgOpGeneric_Tracepoint: "GenericTracepoint",
MsgOpGenericUprobe: "GenericUprobe",
MsgOpClone: "Clone",
MsgOpData: "Data",
MsgOpCgroup: "Cgroup",
MsgOpLoader: "Loader",
MsgOpTest: "Test",
}

func (op OpCode) String() string {
return [...]string{
0: "Undef",
5: "Execve",
7: "Exit",
13: "GenericKprobe",
14: "GenericTracepoint",
23: "Clone",
24: "Data",
25: "Cgroup",
254: "Test",
}[op]
s, ok := OpCodeStrings[op]
if !ok {
return ""
}
return s
}

func (op CgroupOpCode) String() string {
Expand Down
6 changes: 3 additions & 3 deletions pkg/eventcache/eventcache.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,11 +141,11 @@ func (ec *Cache) handleEvents() {
continue
}
if errors.Is(err, ErrFailedToGetParentInfo) {
eventcachemetrics.ParentInfoError(notify.EventTypeString(event.event)).Inc()
eventcachemetrics.ParentInfoError(notify.EventType(event.event)).Inc()
} else if errors.Is(err, ErrFailedToGetProcessInfo) {
eventcachemetrics.ProcessInfoError(notify.EventTypeString(event.event)).Inc()
eventcachemetrics.ProcessInfoError(notify.EventType(event.event)).Inc()
} else if errors.Is(err, ErrFailedToGetPodInfo) {
eventcachemetrics.PodInfoError(notify.EventTypeString(event.event)).Inc()
eventcachemetrics.PodInfoError(notify.EventType(event.event)).Inc()
}
}

Expand Down
4 changes: 2 additions & 2 deletions pkg/grpc/exec/exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ func GetProcessExec(event *MsgExecveEventUnix, useCache bool) *tetragon.ProcessE
}

if tetragonProcess.Pid == nil {
eventcachemetrics.EventCacheError("GetProcessExec: nil Process.Pid").Inc()
eventcachemetrics.EventCacheError(eventcachemetrics.NilProcessPid, notify.EventType(tetragonEvent)).Inc()
return nil
}

Expand Down Expand Up @@ -394,7 +394,7 @@ func GetProcessExit(event *MsgExitEventUnix) *tetragon.ProcessExit {
}

if tetragonProcess.Pid == nil {
eventcachemetrics.EventCacheError("GetProcessExit: nil Process.Pid").Inc()
eventcachemetrics.EventCacheError(eventcachemetrics.NilProcessPid, notify.EventType(tetragonEvent)).Inc()
return nil
}

Expand Down
14 changes: 13 additions & 1 deletion pkg/grpc/tracing/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,22 @@ var (
}, []string{"count"})
)

func InitMetrics(registry *prometheus.Registry) {
func registerMetrics(registry *prometheus.Registry) {
registry.MustRegister(LoaderStats)
}

// InitMetrics registers the loader metrics on registry and then touches one
// LoaderStats series per entry of LoaderTypeStrings, so that every known
// label value has a series before the first real observation.
//
// Open question: rename the process_loader_stats metric (to e.g.
// process_loader_events_total) and its count label (to e.g. event)?
func InitMetrics(registry *prometheus.Registry) {
	registerMetrics(registry)

	// Adding zero creates the labelled series without changing its value.
	for _, label := range LoaderTypeStrings {
		series := LoaderStats.WithLabelValues(label)
		series.Add(0)
	}
}

type LoaderType int

const (
Expand Down
30 changes: 14 additions & 16 deletions pkg/grpc/tracing/tracing.go
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ func GetProcessKprobe(event *MsgGenericKprobeUnix) *tetragon.ProcessKprobe {
}

if tetragonProcess.Pid == nil {
eventcachemetrics.EventCacheError("GetProcessKprobe: nil Process.Pid").Inc()
eventcachemetrics.EventCacheError(eventcachemetrics.NilProcessPid, notify.EventType(tetragonEvent)).Inc()
return nil
}

Expand Down Expand Up @@ -447,7 +447,7 @@ func (msg *MsgGenericTracepointUnix) HandleMessage() *tetragon.GetEventsResponse
}

if tetragonProcess.Pid == nil {
eventcachemetrics.EventCacheError("GetProcessTracepoint: nil Process.Pid").Inc()
eventcachemetrics.EventCacheError(eventcachemetrics.NilProcessPid, notify.EventType(tetragonEvent)).Inc()
return nil
}

Expand Down Expand Up @@ -565,28 +565,26 @@ func GetProcessLoader(msg *MsgProcessLoaderUnix) *tetragon.ProcessLoader {
tetragonProcess = process.UnsafeGetProcess()
}

notifyEvent := &ProcessLoaderNotify{
ProcessLoader: tetragon.ProcessLoader{
Process: tetragonProcess,
Path: msg.Path,
Buildid: msg.Buildid,
},
}

if tetragonProcess.Pid == nil {
eventcachemetrics.EventCacheError("GetProcessLoader: nil Process.Pid").Inc()
eventcachemetrics.EventCacheError(eventcachemetrics.NilProcessPid, notify.EventType(notifyEvent)).Inc()
return nil
}

if ec := eventcache.Get(); ec != nil &&
(ec.Needed(tetragonProcess) || (tetragonProcess.Pid.Value > 1)) {
tetragonEvent := &ProcessLoaderNotify{}
tetragonEvent.Process = tetragonProcess
tetragonEvent.Path = msg.Path
tetragonEvent.Buildid = msg.Buildid
ec.Add(nil, tetragonEvent, msg.Msg.Common.Ktime, msg.Msg.ProcessKey.Ktime, msg)
ec.Add(nil, notifyEvent, msg.Msg.Common.Ktime, msg.Msg.ProcessKey.Ktime, msg)
return nil
}

tetragonEvent := &tetragon.ProcessLoader{
Process: tetragonProcess,
Path: msg.Path,
Buildid: msg.Buildid,
}

return tetragonEvent
return &notifyEvent.ProcessLoader
}

func (msg *MsgProcessLoaderUnix) Notify() bool {
Expand Down Expand Up @@ -685,7 +683,7 @@ func GetProcessUprobe(event *MsgGenericUprobeUnix) *tetragon.ProcessUprobe {
}

if tetragonProcess.Pid == nil {
eventcachemetrics.EventCacheError("GetProcessUprobe: nil Process.Pid").Inc()
eventcachemetrics.EventCacheError(eventcachemetrics.NilProcessPid, notify.EventType(tetragonEvent)).Inc()
return nil
}

Expand Down
94 changes: 74 additions & 20 deletions pkg/metrics/errormetrics/errormetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,32 +5,62 @@ package errormetrics

import (
"fmt"
"strings"

"github.com/cilium/tetragon/pkg/api/ops"
"github.com/cilium/tetragon/pkg/metrics/consts"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
)

type ErrorType string
type ErrorType int

var (
const (
// Process not found on get() call.
ProcessCacheMissOnGet ErrorType = "process_cache_miss_on_get"
ProcessCacheMissOnGet ErrorType = iota
// Process evicted from the cache.
ProcessCacheEvicted ErrorType = "process_cache_evicted"
ProcessCacheEvicted
// Process not found on remove() call.
ProcessCacheMissOnRemove ErrorType = "process_cache_miss_on_remove"
ProcessCacheMissOnRemove
// Tid and Pid mismatch that could affect BPF and user space caching logic
ProcessPidTidMismatch ErrorType = "process_pid_tid_mismatch"
ProcessPidTidMismatch
// An event is missing process info.
EventMissingProcessInfo ErrorType = "event_missing_process_info"
EventMissingProcessInfo
// An error occurred in an event handler.
HandlerError ErrorType = "handler_error"
HandlerError
// An event finalizer on Process failed
EventFinalizeProcessInfoFailed ErrorType = "event_finalize_process_info_failed"
EventFinalizeProcessInfoFailed
)

// errorTypeLabelValues maps each ErrorType constant to the string used as
// its metric label value. Ranging over this map enumerates every ErrorType
// that has a label.
var errorTypeLabelValues = map[ErrorType]string{
ProcessCacheMissOnGet: "process_cache_miss_on_get",
ProcessCacheEvicted: "process_cache_evicted",
ProcessCacheMissOnRemove: "process_cache_miss_on_remove",
ProcessPidTidMismatch: "process_pid_tid_mismatch",
EventMissingProcessInfo: "event_missing_process_info",
HandlerError: "handler_error",
EventFinalizeProcessInfoFailed: "event_finalize_process_info_failed",
}

// String returns the label value recorded for e in errorTypeLabelValues.
// An ErrorType without an entry yields the empty string.
func (e ErrorType) String() string {
	if label, known := errorTypeLabelValues[e]; known {
		return label
	}
	return ""
}

// EventHandlerError is an integer enumeration of event-handler failure
// categories. Its values start at zero and follow declaration order (iota).
type EventHandlerError int

// TODO: Recognize different errors returned by individual handlers
const (
// HandlePerfUnknownOp is the category whose label value is "unknown_opcode".
HandlePerfUnknownOp EventHandlerError = iota
// HandlePerfHandlerError is the category whose label value is
// "event_handler_failed".
HandlePerfHandlerError
)

// eventHandlerErrorLabelValues maps each EventHandlerError constant to the
// string used as its metric label value.
var eventHandlerErrorLabelValues = map[EventHandlerError]string{
HandlePerfUnknownOp: "unknown_opcode",
HandlePerfHandlerError: "event_handler_failed",
}

// String returns the label value recorded for e in
// eventHandlerErrorLabelValues. An EventHandlerError without an entry yields
// the empty string.
func (e EventHandlerError) String() string {
	if label, known := eventHandlerErrorLabelValues[e]; known {
		return label
	}
	return ""
}

var (
ErrorTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: consts.MetricsNamespace,
Expand All @@ -47,27 +77,51 @@ var (
}, []string{"opcode", "error_type"})
)

func InitMetrics(registry *prometheus.Registry) {
func registerMetrics(registry *prometheus.Registry) {
registry.MustRegister(ErrorTotal)
registry.MustRegister(HandlerErrors)
}

// InitMetrics registers the error metrics on registry and then touches every
// label combination that is known up front, so that each series exists
// before the first error is counted.
//
// Open questions kept from the original notes:
//   - op, msg_op, opcode: standardize on one label (and add a human-readable
//     label).
//   - error, error_type, type: standardize on one label.
//   - Delete errors_total{type="handler_error"}; it duplicates
//     handler_errors_total.
//   - Consider further splitting errors_total.
//   - Rename handler_errors_total to event_handler_errors_total?
func InitMetrics(registry *prometheus.Registry) {
	registerMetrics(registry)

	// One ErrorTotal series per error type that has a label value.
	for errType := range errorTypeLabelValues {
		GetErrorTotal(errType).Add(0)
	}

	// One handler-failure series per named opcode, leaving out the Undef and
	// Test opcodes.
	for op := range ops.OpCodeStrings {
		if op == ops.MsgOpUndef || op == ops.MsgOpTest {
			continue
		}
		GetHandlerErrors(op, HandlePerfHandlerError).Add(0)
	}

	// Only ops.MsgOpUndef is initialized for unknown_opcode, although that
	// error can occur for any opcode that is not explicitly handled.
	GetHandlerErrors(ops.MsgOpUndef, HandlePerfUnknownOp).Add(0)
}

// Get a new handle on an ErrorTotal metric for an ErrorType
func GetErrorTotal(t ErrorType) prometheus.Counter {
return ErrorTotal.WithLabelValues(string(t))
func GetErrorTotal(er ErrorType) prometheus.Counter {
return ErrorTotal.WithLabelValues(er.String())
}

// Increment an ErrorTotal for an ErrorType
func ErrorTotalInc(t ErrorType) {
GetErrorTotal(t).Inc()
func ErrorTotalInc(er ErrorType) {
GetErrorTotal(er).Inc()
}

// Get a new handle on the HandlerErrors metric
func GetHandlerErrors(opcode int, err error) prometheus.Counter {
return HandlerErrors.WithLabelValues(fmt.Sprint(opcode), strings.ReplaceAll(fmt.Sprintf("%T", errors.Cause(err)), "*", ""))
func GetHandlerErrors(opcode ops.OpCode, er EventHandlerError) prometheus.Counter {
return HandlerErrors.WithLabelValues(fmt.Sprint(int32(opcode)), er.String())
}

// Increment the HandlerErrors metric
func HandlerErrorsInc(opcode int, err error) {
GetHandlerErrors(opcode, err).Inc()
func HandlerErrorsInc(opcode ops.OpCode, er EventHandlerError) {
GetHandlerErrors(opcode, er).Inc()
}
Loading
Loading