Skip to content

Commit

Permalink
metrics: initialize counters with 0 value
Browse files Browse the repository at this point in the history
The increase() function  won't recognise a move from no value -> 1
as an increase, which means it's possible for alerts to miss events.

To address this, initialize every possible set of labels for each
counter with a 0 value.

The `semaphore_policy_sync_queue_full_failures` and
`semaphore_policy_sync_requeue` CounterVecs were using the names of
networksets as labels, which has an almost unbounded possible list of
potential values which we obviously can't initialize ahead of time.

I've decided to remove the label because:
1. It has the potential for high cardinality
2. I don't think it's necessarily important information for either
   metric. I'm not sure if we'd expect issues with requeueing to be
   networkset specific or if we'd want to split alerts by networkset.
3. If we do have an issue with a specific networkset, that information
   will be in the logs.
  • Loading branch information
ribbybibby committed Apr 29, 2021
1 parent 0b2ef93 commit 82f0e8f
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 14 deletions.
28 changes: 16 additions & 12 deletions metrics/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,23 +26,31 @@ var (
},
[]string{"type"},
)
syncQueueFullFailures = prometheus.NewCounterVec(
syncQueueFullFailures = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "semaphore_policy_sync_queue_full_failures",
Help: "Number of times a sync task was not added to the sync queue in time because the queue was full.",
},
[]string{"globalnetworkset"},
)
syncRequeue = prometheus.NewCounterVec(
syncRequeue = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "semaphore_policy_sync_requeue",
Help: "Number of attempts to requeue a sync.",
},
[]string{"globalnetworkset"},
)
)

func init() {
// Retrieving a Counter from a CounterVec will initialize it with a 0 value if it
// doesn't already have a value. This ensures that all possible counters
// start with a 0 value.
for _, t := range []string{"get", "list", "create", "update", "patch", "watch", "delete"} {
podWatcherFailures.With(prometheus.Labels{"type": t})
for _, s := range []string{"0", "1"} {
calicoClientRequest.With(prometheus.Labels{"type": t, "success": s})
}
}

prometheus.MustRegister(calicoClientRequest)
prometheus.MustRegister(podWatcherFailures)
prometheus.MustRegister(syncQueueFullFailures)
Expand All @@ -66,14 +74,10 @@ func IncPodWatcherFailures(t string) {
}).Inc()
}

func IncSyncQueueFullFailures(globalnetworkset string) {
syncQueueFullFailures.With(prometheus.Labels{
"globalnetworkset": globalnetworkset,
}).Inc()
func IncSyncQueueFullFailures() {
syncQueueFullFailures.Inc()
}

func IncSyncRequeue(globalnetworkset string) {
syncRequeue.With(prometheus.Labels{
"globalnetworkset": globalnetworkset,
}).Inc()
func IncSyncRequeue() {
syncRequeue.Inc()
}
4 changes: 2 additions & 2 deletions networksets.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ func (nss *NetworkSetStore) RunSyncLoop() {

func (nss *NetworkSetStore) requeue(id string) {
log.Logger.Debug("Requeueing sync task", "id", id)
metrics.IncSyncRequeue(id)
metrics.IncSyncRequeue()
go func() {
time.Sleep(1)
nss.enqueue(id)
Expand All @@ -170,7 +170,7 @@ func (nss *NetworkSetStore) enqueue(id string) {
log.Logger.Debug("Sync task queued", "id", id)
case <-time.After(5 * time.Second):
log.Logger.Error("Timed out trying to queue a sync action for netset, run queue is full", "id", id)
metrics.IncSyncQueueFullFailures(id)
metrics.IncSyncQueueFullFailures()
nss.requeue(id)
}
}
Expand Down

0 comments on commit 82f0e8f

Please sign in to comment.