From 5a189153f38a3b87874f90a590651dd6792ea000 Mon Sep 17 00:00:00 2001 From: yacut Date: Sat, 27 Feb 2021 21:40:17 +0100 Subject: [PATCH] Split pod and node links for better configuration opportunities --- CHANGELOG.md | 4 ++++ cmd/flag.go | 6 ++++-- config.yaml | 14 ++++++++++---- pkg/config/main.go | 6 ++++++ pkg/incident/main.go | 25 +++---------------------- pkg/watcher/node.go | 28 ++++++++++++++++++++++++++++ pkg/watcher/node_checker.go | 8 +++++--- pkg/watcher/node_informer.go | 3 ++- pkg/watcher/pod.go | 28 ++++++++++++++++++++++++++++ pkg/watcher/pod_checker.go | 8 +++++--- pkg/watcher/pod_informer.go | 9 ++++++--- version.go | 2 +- 12 files changed, 102 insertions(+), 39 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a92910..aa07206 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## v1.3.0 / 2021-02-27 + +- [ENHANCEMENT] Split pod and node links for better configuration opportunities + ## v1.2.4 / 2021-02-27 - [FIX] Fix cron jobs diff --git a/cmd/flag.go b/cmd/flag.go index 2747f37..6c0936f 100644 --- a/cmd/flag.go +++ b/cmd/flag.go @@ -57,8 +57,10 @@ func parseAndValidateFlags() *config.Config { flag.String("alarms.nodes.resources.priority", "LOW", "The node resources alarm incident priority") flag.Int("alarms.nodes.resources.threshold", 90, "The node resources percentage threshold from 1 to 100") - flag.String("links.metrics", "", "Metrics URL for the alarm-related incident. Your can use following mustache variables here: pod_namespace, pod_name, node_name, cluster_name") - flag.String("links.logs", "", "Logs URL for the alarm-related incident. Your can use following mustache variables here: pod_namespace, pod_name, node_name, cluster_name") + flag.String("links.pods.metrics", "", "Metrics URL for the alarm-related incident. Your can use following mustache variables here: pod_namespace, pod_name, cluster_name") + flag.String("links.pods.logs", "", "Logs URL for the alarm-related incident. 
Your can use following mustache variables here: pod_namespace, pod_name, cluster_name") + flag.String("links.nodes.metrics", "", "Metrics URL for the alarm-related incident. Your can use following mustache variables here: node_name, cluster_name") + flag.String("links.nodes.logs", "", "Logs URL for the alarm-related incident. Your can use following mustache variables here: node_name, cluster_name") pflag.CommandLine.AddGoFlagSet(flag.CommandLine) pflag.Parse() diff --git a/config.yaml b/config.yaml index 3a9ba9c..37b8d1e 100644 --- a/config.yaml +++ b/config.yaml @@ -78,7 +78,13 @@ alarms: threshold: 90 links: - ## Metrics URL for the alarm-related incident. Your can use following mustache variables here: pod_namespace, pod_name, node_name, cluster_name - # metrics: "https://grafana.example.com/d/kubernetes/kubernetes-overview?var-Node=All&var-Pod={{pod_name}}" - ## Logs URL for the alarm-related incident. Your can use following mustache variables here: pod_namespace, pod_name, node_name, cluster_name - # logs: "https://grafana.example.com/explore?left=%5B%22now-1h%22,%22now%22,%22Loki%22,%7B%22expr%22:%22%7Binstance%3D%5C%22{{pod_name}}%5C%22,namespace%3D%5C%22{{pod_namespace}}%5C%22%7D%22%7D%5D" + pods: + ## Metrics URL for the alarm-related incident. You can use the following mustache variables here: pod_namespace, pod_name, cluster_name + # metrics: "https://grafana.example.com/d/kubernetes/kubernetes-overview?var-Node=All&var-Pod={{pod_name}}" + ## Logs URL for the alarm-related incident. You can use the following mustache variables here: pod_namespace, pod_name, cluster_name + # logs: "https://grafana.example.com/explore?left=%5B%22now-1h%22,%22now%22,%22Loki%22,%7B%22expr%22:%22%7Binstance%3D%5C%22{{pod_name}}%5C%22,namespace%3D%5C%22{{pod_namespace}}%5C%22%7D%22%7D%5D" + nodes: + ## Metrics URL for the alarm-related incident. 
You can use the following mustache variables here: node_name, cluster_name + # metrics: "https://grafana.example.com/d/kubernetes/kubernetes-overview?var-Node={{node_name}}&var-Pod=All" + ## Logs URL for the alarm-related incident. You can use the following mustache variables here: node_name, cluster_name + # logs: "https://grafana.example.com/explore?left=%5B%22now-1h%22,%22now%22,%22Loki%22,%7B%22expr%22:%22%7Binstance%3D%5C%22{{node_name}}%5C%22%7D%22%7D%5D" diff --git a/pkg/config/main.go b/pkg/config/main.go index d0f4dd6..85c3f8f 100644 --- a/pkg/config/main.go +++ b/pkg/config/main.go @@ -62,6 +62,12 @@ type ConfigAlarmSettingWithThreshold struct { // ConfigLinks definition type ConfigLinks struct { + Pods ConfigLinksSetting `yaml:"pods"` + Nodes ConfigLinksSetting `yaml:"nodes"` +} + +// ConfigLinksSetting definition +type ConfigLinksSetting struct { Metrics string `yaml:"metrics"` Logs string `yaml:"logs"` } diff --git a/pkg/incident/main.go b/pkg/incident/main.go index ceb795a..d25d2d5 100644 --- a/pkg/incident/main.go +++ b/pkg/incident/main.go @@ -5,7 +5,6 @@ import ( "strconv" "strings" - "github.com/cbroglie/mustache" "github.com/rs/zerolog/log" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -22,7 +21,7 @@ var ilertClient *ilert.Client // CreateEvent creates an incident event func CreateEvent( cfg *config.Config, - mustacheValues map[string]string, + links []ilert.IncidentLink, incidentKey string, summary string, details string, @@ -40,27 +39,9 @@ func CreateEvent( EventType: eventType, APIKey: cfg.Settings.APIKey, Priority: priority, + Links: links, } - links := make([]ilert.IncidentLink, 0) - if cfg.Links.Metrics != "" { - url, err := mustache.Render(cfg.Links.Metrics, mustacheValues) - if err == nil && url != "" { - links = append(links, ilert.IncidentLink{ - Href: url, - Text: "Metrics", - }) - } - } - if cfg.Links.Logs != "" { - url, err := 
mustache.Render(cfg.Links.Logs, mustacheValues) - if err == nil && url != "" { - links = append(links, ilert.IncidentLink{ - Href: url, - Text: "Logs", - }) - } - } - event.Links = links + log.Debug().Interface("event", event).Msg("Creating incident event") output, err := ilertClient.CreateEvent(&ilert.CreateEventInput{ diff --git a/pkg/watcher/node.go b/pkg/watcher/node.go index 5635d28..3dd7c5e 100644 --- a/pkg/watcher/node.go +++ b/pkg/watcher/node.go @@ -3,6 +3,9 @@ package watcher import ( "fmt" + "github.com/cbroglie/mustache" + "github.com/iLert/ilert-go" + "github.com/iLert/ilert-kube-agent/pkg/config" api "k8s.io/api/core/v1" "k8s.io/client-go/kubernetes" ) @@ -42,3 +45,28 @@ func getNodeMustacheValues(node *api.Node) map[string]string { "cluster_name": node.GetClusterName(), } } + +func getNodeLinks(cfg *config.Config, node *api.Node) []ilert.IncidentLink { + mustacheValues := getNodeMustacheValues(node) + + links := make([]ilert.IncidentLink, 0) + if cfg.Links.Nodes.Metrics != "" { + url, err := mustache.Render(cfg.Links.Nodes.Metrics, mustacheValues) + if err == nil && url != "" { + links = append(links, ilert.IncidentLink{ + Href: url, + Text: "Metrics", + }) + } + } + if cfg.Links.Nodes.Logs != "" { + url, err := mustache.Render(cfg.Links.Nodes.Logs, mustacheValues) + if err == nil && url != "" { + links = append(links, ilert.IncidentLink{ + Href: url, + Text: "Logs", + }) + } + } + return links +} diff --git a/pkg/watcher/node_checker.go b/pkg/watcher/node_checker.go index 97cdcd1..53c83b4 100644 --- a/pkg/watcher/node_checker.go +++ b/pkg/watcher/node_checker.go @@ -83,7 +83,8 @@ func checkNodes(kubeClient *kubernetes.Clientset, metricsClient *metrics.Clients if incidentRef == nil { summary := fmt.Sprintf("Node %s CPU limit reached > %d%%", node.GetName(), cfg.Alarms.Nodes.Resources.Threshold) details := getNodeDetailsWithUsageLimit(kubeClient, &node, fmt.Sprintf("%.3f CPU", cpuUsage), fmt.Sprintf("%.3f CPU", cpuLimit)) - incidentID := 
incident.CreateEvent(cfg, getNodeMustacheValues(&node), nodeKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Nodes.Resources.Priority) + links := getNodeLinks(cfg, &node) + incidentID := incident.CreateEvent(cfg, links, nodeKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Nodes.Resources.Priority) incident.CreateIncidentRef(agentKubeClient, node.GetName(), cfg.Settings.Namespace, incidentID, summary, details) } } @@ -101,14 +102,15 @@ func checkNodes(kubeClient *kubernetes.Clientset, metricsClient *metrics.Clients if incidentRef == nil { summary := fmt.Sprintf("Node %s memory limit reached > %d%%", node.GetName(), cfg.Alarms.Nodes.Resources.Threshold) details := getNodeDetailsWithUsageLimit(kubeClient, &node, humanize.Bytes(uint64(memoryUsage)), humanize.Bytes(uint64(memoryLimit))) - incidentID := incident.CreateEvent(cfg, getNodeMustacheValues(&node), nodeKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Nodes.Resources.Priority) + links := getNodeLinks(cfg, &node) + incidentID := incident.CreateEvent(cfg, links, nodeKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Nodes.Resources.Priority) incident.CreateIncidentRef(agentKubeClient, node.GetName(), cfg.Settings.Namespace, incidentID, summary, details) } } } if healthy && incidentRef != nil && incidentRef.Spec.ID > 0 { - incident.CreateEvent(cfg, getNodeMustacheValues(&node), nodeKey, fmt.Sprintf("Node %s recovered", node.GetName()), "", ilert.EventTypes.Resolve, cfg.Alarms.Nodes.Resources.Priority) + incident.CreateEvent(cfg, nil, nodeKey, fmt.Sprintf("Node %s recovered", node.GetName()), "", ilert.EventTypes.Resolve, cfg.Alarms.Nodes.Resources.Priority) incident.DeleteIncidentRef(agentKubeClient, node.GetName(), cfg.Settings.Namespace) } } diff --git a/pkg/watcher/node_informer.go b/pkg/watcher/node_informer.go index c4efcbb..9a0e5e1 100644 --- a/pkg/watcher/node_informer.go +++ b/pkg/watcher/node_informer.go @@ -32,7 +32,8 @@ func startNodeInformer(kubeClient 
*kubernetes.Clientset, agentKubeClient *agentc if node.Status.Phase == api.NodeTerminated && incidentRef == nil { summary := fmt.Sprintf("Node %s terminated", node.GetName()) details := getNodeDetails(kubeClient, node) - incidentID := incident.CreateEvent(cfg, getNodeMustacheValues(node), nodeKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Nodes.Resources.Priority) + links := getNodeLinks(cfg, node) + incidentID := incident.CreateEvent(cfg, links, nodeKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Nodes.Resources.Priority) incident.CreateIncidentRef(agentKubeClient, node.GetName(), cfg.Settings.Namespace, incidentID, summary, details) } }, diff --git a/pkg/watcher/pod.go b/pkg/watcher/pod.go index 2d6594e..fe8e3ee 100644 --- a/pkg/watcher/pod.go +++ b/pkg/watcher/pod.go @@ -5,6 +5,9 @@ import ( "fmt" "io" + "github.com/cbroglie/mustache" + "github.com/iLert/ilert-go" + "github.com/iLert/ilert-kube-agent/pkg/config" "github.com/iLert/ilert-kube-agent/pkg/utils" api "k8s.io/api/core/v1" "k8s.io/client-go/kubernetes" @@ -86,3 +89,28 @@ func getPodMustacheValues(pod *api.Pod) map[string]string { "cluster_name": pod.GetClusterName(), } } + +func getPodLinks(cfg *config.Config, node *api.Pod) []ilert.IncidentLink { + mustacheValues := getPodMustacheValues(node) + + links := make([]ilert.IncidentLink, 0) + if cfg.Links.Pods.Metrics != "" { + url, err := mustache.Render(cfg.Links.Pods.Metrics, mustacheValues) + if err == nil && url != "" { + links = append(links, ilert.IncidentLink{ + Href: url, + Text: "Metrics", + }) + } + } + if cfg.Links.Pods.Logs != "" { + url, err := mustache.Render(cfg.Links.Pods.Logs, mustacheValues) + if err == nil && url != "" { + links = append(links, ilert.IncidentLink{ + Href: url, + Text: "Logs", + }) + } + } + return links +} diff --git a/pkg/watcher/pod_checker.go b/pkg/watcher/pod_checker.go index 89e2fcf..64572e2 100644 --- a/pkg/watcher/pod_checker.go +++ b/pkg/watcher/pod_checker.go @@ -97,7 +97,8 @@ func 
checkPods(kubeClient *kubernetes.Clientset, metricsClient *metrics.Clientse if incidentRef == nil { summary := fmt.Sprintf("Pod %s/%s CPU limit reached > %d%%", pod.GetNamespace(), pod.GetName(), cfg.Alarms.Pods.Resources.Threshold) details := getPodDetailsWithUsageLimit(kubeClient, &pod, fmt.Sprintf("%.3f CPU", cpuUsage), fmt.Sprintf("%.3f CPU", cpuLimit)) - incidentID := incident.CreateEvent(cfg, getPodMustacheValues(&pod), podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Resources.Priority) + links := getPodLinks(cfg, &pod) + incidentID := incident.CreateEvent(cfg, links, podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Resources.Priority) incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details) } } @@ -118,7 +119,8 @@ func checkPods(kubeClient *kubernetes.Clientset, metricsClient *metrics.Clientse if incidentRef == nil { summary := fmt.Sprintf("Pod %s/%s memory limit reached > %d%%", pod.GetNamespace(), pod.GetName(), cfg.Alarms.Pods.Resources.Threshold) details := getPodDetailsWithUsageLimit(kubeClient, &pod, humanize.Bytes(uint64(memoryUsage)), humanize.Bytes(uint64(memoryLimit))) - incidentID := incident.CreateEvent(cfg, getPodMustacheValues(&pod), podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Resources.Priority) + links := getPodLinks(cfg, &pod) + incidentID := incident.CreateEvent(cfg, links, podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Resources.Priority) incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details) } } @@ -126,7 +128,7 @@ func checkPods(kubeClient *kubernetes.Clientset, metricsClient *metrics.Clientse } } if healthy && incidentRef != nil && incidentRef.Spec.ID > 0 { - incident.CreateEvent(cfg, getPodMustacheValues(&pod), podKey, fmt.Sprintf("Pod %s/%s recovered", pod.GetNamespace(), pod.GetName()), "", ilert.EventTypes.Resolve, cfg.Alarms.Pods.Resources.Priority) + 
incident.CreateEvent(cfg, nil, podKey, fmt.Sprintf("Pod %s/%s recovered", pod.GetNamespace(), pod.GetName()), "", ilert.EventTypes.Resolve, cfg.Alarms.Pods.Resources.Priority) incident.DeleteIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace()) } } diff --git a/pkg/watcher/pod_informer.go b/pkg/watcher/pod_informer.go index 687381f..7b30878 100644 --- a/pkg/watcher/pod_informer.go +++ b/pkg/watcher/pod_informer.go @@ -35,7 +35,8 @@ func startPodInformer(kubeClient *kubernetes.Clientset, agentKubeClient *agentcl cfg.Alarms.Pods.Terminate.Enabled && incidentRef == nil { summary := fmt.Sprintf("Pod %s/%s terminated - %s", pod.GetNamespace(), pod.GetName(), containerStatus.State.Terminated.Reason) details := getPodDetailsWithStatus(kubeClient, pod, &containerStatus) - incidentID := incident.CreateEvent(cfg, getPodMustacheValues(pod), podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Terminate.Priority) + links := getPodLinks(cfg, pod) + incidentID := incident.CreateEvent(cfg, links, podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Terminate.Priority) incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details) break } @@ -45,7 +46,8 @@ func startPodInformer(kubeClient *kubernetes.Clientset, agentKubeClient *agentcl cfg.Alarms.Pods.Waiting.Enabled && incidentRef == nil { summary := fmt.Sprintf("Pod %s/%s waiting - %s", pod.GetNamespace(), pod.GetName(), containerStatus.State.Waiting.Reason) details := getPodDetailsWithStatus(kubeClient, pod, &containerStatus) - incidentID := incident.CreateEvent(cfg, getPodMustacheValues(pod), podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Waiting.Priority) + links := getPodLinks(cfg, pod) + incidentID := incident.CreateEvent(cfg, links, podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Waiting.Priority) incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details) 
break } @@ -53,7 +55,8 @@ func startPodInformer(kubeClient *kubernetes.Clientset, agentKubeClient *agentcl if cfg.Alarms.Pods.Restarts.Enabled && containerStatus.RestartCount >= cfg.Alarms.Pods.Restarts.Threshold && incidentRef == nil { summary := fmt.Sprintf("Pod %s/%s restarts threshold reached: %d", pod.GetNamespace(), pod.GetName(), containerStatus.RestartCount) details := getPodDetailsWithStatus(kubeClient, pod, &containerStatus) - incidentID := incident.CreateEvent(cfg, getPodMustacheValues(pod), podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Restarts.Priority) + links := getPodLinks(cfg, pod) + incidentID := incident.CreateEvent(cfg, links, podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Restarts.Priority) incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details) break } diff --git a/version.go b/version.go index c0eebc1..5d70bcd 100644 --- a/version.go +++ b/version.go @@ -1,7 +1,7 @@ package shared // Version current version -const Version = "v1.2.4" +const Version = "v1.3.0" // App name const App = "ilert-kube-agent"