From 82f0e8f007b436cb3b33e7d639b4cd6d453f5ffe Mon Sep 17 00:00:00 2001 From: Rob Best Date: Thu, 29 Apr 2021 10:27:50 +0100 Subject: [PATCH] metrics: initialize counters with 0 value The increase() function won't recognise a move from no value -> 1 as an increase, which means it's possible for alerts to miss events. To address this, initialize every possible set of labels for each counter with a 0 value. The `semaphore_policy_sync_queue_full_failures` and `semaphore_policy_sync_requeue` CounterVecs were using the names of networksets as labels, which has an almost unbounded possible list of potential values which we obviously can't initialize ahead of time. I've decided to remove the label because: 1. It has the potential for high cardinality 2. I don't think it's necessarily important information for either metric. I'm not sure if we'd expect issues with requeueing to be networkset specific or if we'd want to split alerts by networkset. 3. If we do have an issue with a specific networkset, that information will be in the logs. --- metrics/prometheus.go | 28 ++++++++++++++++------------ networksets.go | 4 ++-- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 2178c6c..441f6d3 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -26,23 +26,31 @@ var ( }, []string{"type"}, ) - syncQueueFullFailures = prometheus.NewCounterVec( + syncQueueFullFailures = prometheus.NewCounter( prometheus.CounterOpts{ Name: "semaphore_policy_sync_queue_full_failures", Help: "Number of times a sync task was not added to the sync queue in time because the queue was full.", }, - []string{"globalnetworkset"}, ) - syncRequeue = prometheus.NewCounterVec( + syncRequeue = prometheus.NewCounter( prometheus.CounterOpts{ Name: "semaphore_policy_sync_requeue", Help: "Number of attempts to requeue a sync.", }, - []string{"globalnetworkset"}, ) ) func init() { + // Retrieving a Counter from a CounterVec will initialize it with a 0 value if it + // doesn't already have a value. This ensures that all possible counters + // start with a 0 value. + for _, t := range []string{"get", "list", "create", "update", "patch", "watch", "delete"} { + podWatcherFailures.With(prometheus.Labels{"type": t}) + for _, s := range []string{"0", "1"} { + calicoClientRequest.With(prometheus.Labels{"type": t, "success": s}) + } + } + prometheus.MustRegister(calicoClientRequest) prometheus.MustRegister(podWatcherFailures) prometheus.MustRegister(syncQueueFullFailures) @@ -66,14 +74,10 @@ func IncPodWatcherFailures(t string) { }).Inc() } -func IncSyncQueueFullFailures(globalnetworkset string) { - syncQueueFullFailures.With(prometheus.Labels{ - "globalnetworkset": globalnetworkset, - }).Inc() +func IncSyncQueueFullFailures() { + syncQueueFullFailures.Inc() } -func IncSyncRequeue(globalnetworkset string) { - syncRequeue.With(prometheus.Labels{ - "globalnetworkset": globalnetworkset, - }).Inc() +func IncSyncRequeue() { + syncRequeue.Inc() } diff --git a/networksets.go b/networksets.go index 7b03b22..6d1b01c 100644 --- a/networksets.go +++ b/networksets.go @@ -156,7 +156,7 @@ func (nss *NetworkSetStore) RunSyncLoop() { func (nss *NetworkSetStore) requeue(id string) { log.Logger.Debug("Requeueing sync task", "id", id) - metrics.IncSyncRequeue(id) + metrics.IncSyncRequeue() go func() { time.Sleep(1) nss.enqueue(id) @@ -170,7 +170,7 @@ func (nss *NetworkSetStore) enqueue(id string) { log.Logger.Debug("Sync task queued", "id", id) case <-time.After(5 * time.Second): log.Logger.Error("Timed out trying to queue a sync action for netset, run queue is full", "id", id) - metrics.IncSyncQueueFullFailures(id) + metrics.IncSyncQueueFullFailures() nss.requeue(id) } }