Skip to content

Commit

Permalink
Split pod and node links for better configuration opportunities
Browse files Browse the repository at this point in the history
  • Loading branch information
yacut committed Feb 27, 2021
1 parent 2e60ff8 commit 5a18915
Show file tree
Hide file tree
Showing 12 changed files with 102 additions and 39 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## v1.3.0 / 2021-02-27

- [ENHANCEMENT] Split pod and node links for better configuration opportunities

## v1.2.4 / 2021-02-27

- [FIX] Fix cron jobs
Expand Down
6 changes: 4 additions & 2 deletions cmd/flag.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,10 @@ func parseAndValidateFlags() *config.Config {
flag.String("alarms.nodes.resources.priority", "LOW", "The node resources alarm incident priority")
flag.Int("alarms.nodes.resources.threshold", 90, "The node resources percentage threshold from 1 to 100")

flag.String("links.metrics", "", "Metrics URL for the alarm-related incident. Your can use following mustache variables here: pod_namespace, pod_name, node_name, cluster_name")
flag.String("links.logs", "", "Logs URL for the alarm-related incident. Your can use following mustache variables here: pod_namespace, pod_name, node_name, cluster_name")
flag.String("links.pods.metrics", "", "Metrics URL for the alarm-related incident. Your can use following mustache variables here: pod_namespace, pod_name, cluster_name")
flag.String("links.pods.logs", "", "Logs URL for the alarm-related incident. Your can use following mustache variables here: pod_namespace, pod_name, cluster_name")
flag.String("links.nodes.metrics", "", "Metrics URL for the alarm-related incident. Your can use following mustache variables here: node_name, cluster_name")
flag.String("links.nodes.logs", "", "Logs URL for the alarm-related incident. Your can use following mustache variables here: node_name, cluster_name")

pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
pflag.Parse()
Expand Down
14 changes: 10 additions & 4 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,13 @@ alarms:
threshold: 90

links:
## Metrics URL for the alarm-related incident. Your can use following mustache variables here: pod_namespace, pod_name, node_name, cluster_name
# metrics: "https://grafana.example.com/d/kubernetes/kubernetes-overview?var-Node=All&var-Pod={{pod_name}}"
## Logs URL for the alarm-related incident. Your can use following mustache variables here: pod_namespace, pod_name, node_name, cluster_name
# logs: "https://grafana.example.com/explore?left=%5B%22now-1h%22,%22now%22,%22Loki%22,%7B%22expr%22:%22%7Binstance%3D%5C%22{{pod_name}}%5C%22,namespace%3D%5C%22{{pod_namespace}}%5C%22%7D%22%7D%5D"
pods:
## Metrics URL for the alarm-related incident. You can use the following mustache variables here: pod_namespace, pod_name, cluster_name
# metrics: "https://grafana.example.com/d/kubernetes/kubernetes-overview?var-Node=All&var-Pod={{pod_name}}"
## Logs URL for the alarm-related incident. You can use the following mustache variables here: pod_namespace, pod_name, cluster_name
# logs: "https://grafana.example.com/explore?left=%5B%22now-1h%22,%22now%22,%22Loki%22,%7B%22expr%22:%22%7Binstance%3D%5C%22{{pod_name}}%5C%22,namespace%3D%5C%22{{pod_namespace}}%5C%22%7D%22%7D%5D"
nodes:
## Metrics URL for the alarm-related incident. You can use the following mustache variables here: node_name, cluster_name
# metrics: "https://grafana.example.com/d/kubernetes/kubernetes-overview?var-Node={{node_name}}"
## Logs URL for the alarm-related incident. You can use the following mustache variables here: node_name, cluster_name
# logs: "https://grafana.example.com/explore?left=%5B%22now-1h%22,%22now%22,%22Loki%22,%7B%22expr%22:%22%7Binstance%3D%5C%22{{pod_name}}%5C%22,namespace%3D%5C%22{{pod_namespace}}%5C%22%7D%22%7D%5D"
6 changes: 6 additions & 0 deletions pkg/config/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,12 @@ type ConfigAlarmSettingWithThreshold struct {

// ConfigLinks groups the incident link templates, configured separately for
// pod alarms and node alarms.
type ConfigLinks struct {
// Pods holds the link templates read from the "pods" YAML key.
Pods ConfigLinksSetting `yaml:"pods"`
// Nodes holds the link templates read from the "nodes" YAML key.
Nodes ConfigLinksSetting `yaml:"nodes"`
}

// ConfigLinksSetting holds the link URL templates for one kind of resource.
// The values are rendered as mustache templates when an incident event is
// built; an empty value produces no link.
type ConfigLinksSetting struct {
// Metrics is the template for the "Metrics" incident link.
Metrics string `yaml:"metrics"`
// Logs is the template for the "Logs" incident link.
Logs string `yaml:"logs"`
}
25 changes: 3 additions & 22 deletions pkg/incident/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import (
"strconv"
"strings"

"github.com/cbroglie/mustache"
"github.com/rs/zerolog/log"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

Expand All @@ -22,7 +21,7 @@ var ilertClient *ilert.Client
// CreateEvent creates an incident event
func CreateEvent(
cfg *config.Config,
mustacheValues map[string]string,
links []ilert.IncidentLink,
incidentKey string,
summary string,
details string,
Expand All @@ -40,27 +39,9 @@ func CreateEvent(
EventType: eventType,
APIKey: cfg.Settings.APIKey,
Priority: priority,
Links: links,
}
links := make([]ilert.IncidentLink, 0)
if cfg.Links.Metrics != "" {
url, err := mustache.Render(cfg.Links.Metrics, mustacheValues)
if err == nil && url != "" {
links = append(links, ilert.IncidentLink{
Href: url,
Text: "Metrics",
})
}
}
if cfg.Links.Logs != "" {
url, err := mustache.Render(cfg.Links.Logs, mustacheValues)
if err == nil && url != "" {
links = append(links, ilert.IncidentLink{
Href: url,
Text: "Logs",
})
}
}
event.Links = links

log.Debug().Interface("event", event).Msg("Creating incident event")

output, err := ilertClient.CreateEvent(&ilert.CreateEventInput{
Expand Down
28 changes: 28 additions & 0 deletions pkg/watcher/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ package watcher
import (
"fmt"

"github.com/cbroglie/mustache"
"github.com/iLert/ilert-go"
"github.com/iLert/ilert-kube-agent/pkg/config"
api "k8s.io/api/core/v1"
"k8s.io/client-go/kubernetes"
)
Expand Down Expand Up @@ -42,3 +45,28 @@ func getNodeMustacheValues(node *api.Node) map[string]string {
"cluster_name": node.GetClusterName(),
}
}

// getNodeLinks renders the link templates from cfg.Links.Nodes with the
// mustache values of the given node and returns the resulting incident links.
//
// The "Metrics" link is evaluated first, then "Logs". A link is left out when
// its template is empty, when rendering fails, or when the rendered URL is
// empty. The returned slice is never nil.
func getNodeLinks(cfg *config.Config, node *api.Node) []ilert.IncidentLink {
	values := getNodeMustacheValues(node)

	// Table of candidate links in the order they must appear in the result.
	candidates := []struct {
		text     string
		template string
	}{
		{text: "Metrics", template: cfg.Links.Nodes.Metrics},
		{text: "Logs", template: cfg.Links.Nodes.Logs},
	}

	result := make([]ilert.IncidentLink, 0)
	for _, candidate := range candidates {
		if candidate.template == "" {
			continue
		}
		href, err := mustache.Render(candidate.template, values)
		if err != nil || href == "" {
			// Best effort: a broken or empty template only drops its own link.
			continue
		}
		result = append(result, ilert.IncidentLink{
			Href: href,
			Text: candidate.text,
		})
	}
	return result
}
8 changes: 5 additions & 3 deletions pkg/watcher/node_checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ func checkNodes(kubeClient *kubernetes.Clientset, metricsClient *metrics.Clients
if incidentRef == nil {
summary := fmt.Sprintf("Node %s CPU limit reached > %d%%", node.GetName(), cfg.Alarms.Nodes.Resources.Threshold)
details := getNodeDetailsWithUsageLimit(kubeClient, &node, fmt.Sprintf("%.3f CPU", cpuUsage), fmt.Sprintf("%.3f CPU", cpuLimit))
incidentID := incident.CreateEvent(cfg, getNodeMustacheValues(&node), nodeKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Nodes.Resources.Priority)
links := getNodeLinks(cfg, &node)
incidentID := incident.CreateEvent(cfg, links, nodeKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Nodes.Resources.Priority)
incident.CreateIncidentRef(agentKubeClient, node.GetName(), cfg.Settings.Namespace, incidentID, summary, details)
}
}
Expand All @@ -101,14 +102,15 @@ func checkNodes(kubeClient *kubernetes.Clientset, metricsClient *metrics.Clients
if incidentRef == nil {
summary := fmt.Sprintf("Node %s memory limit reached > %d%%", node.GetName(), cfg.Alarms.Nodes.Resources.Threshold)
details := getNodeDetailsWithUsageLimit(kubeClient, &node, humanize.Bytes(uint64(memoryUsage)), humanize.Bytes(uint64(memoryLimit)))
incidentID := incident.CreateEvent(cfg, getNodeMustacheValues(&node), nodeKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Nodes.Resources.Priority)
links := getNodeLinks(cfg, &node)
incidentID := incident.CreateEvent(cfg, links, nodeKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Nodes.Resources.Priority)
incident.CreateIncidentRef(agentKubeClient, node.GetName(), cfg.Settings.Namespace, incidentID, summary, details)
}
}
}

if healthy && incidentRef != nil && incidentRef.Spec.ID > 0 {
incident.CreateEvent(cfg, getNodeMustacheValues(&node), nodeKey, fmt.Sprintf("Node %s recovered", node.GetName()), "", ilert.EventTypes.Resolve, cfg.Alarms.Nodes.Resources.Priority)
incident.CreateEvent(cfg, nil, nodeKey, fmt.Sprintf("Node %s recovered", node.GetName()), "", ilert.EventTypes.Resolve, cfg.Alarms.Nodes.Resources.Priority)
incident.DeleteIncidentRef(agentKubeClient, node.GetName(), cfg.Settings.Namespace)
}
}
Expand Down
3 changes: 2 additions & 1 deletion pkg/watcher/node_informer.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ func startNodeInformer(kubeClient *kubernetes.Clientset, agentKubeClient *agentc
if node.Status.Phase == api.NodeTerminated && incidentRef == nil {
summary := fmt.Sprintf("Node %s terminated", node.GetName())
details := getNodeDetails(kubeClient, node)
incidentID := incident.CreateEvent(cfg, getNodeMustacheValues(node), nodeKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Nodes.Resources.Priority)
links := getNodeLinks(cfg, node)
incidentID := incident.CreateEvent(cfg, links, nodeKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Nodes.Resources.Priority)
incident.CreateIncidentRef(agentKubeClient, node.GetName(), cfg.Settings.Namespace, incidentID, summary, details)
}
},
Expand Down
28 changes: 28 additions & 0 deletions pkg/watcher/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ import (
"fmt"
"io"

"github.com/cbroglie/mustache"
"github.com/iLert/ilert-go"
"github.com/iLert/ilert-kube-agent/pkg/config"
"github.com/iLert/ilert-kube-agent/pkg/utils"
api "k8s.io/api/core/v1"
"k8s.io/client-go/kubernetes"
Expand Down Expand Up @@ -86,3 +89,28 @@ func getPodMustacheValues(pod *api.Pod) map[string]string {
"cluster_name": pod.GetClusterName(),
}
}

// getPodLinks renders the link templates from cfg.Links.Pods with the
// mustache values of the given pod and returns the resulting incident links.
//
// The "Metrics" link is evaluated first, then "Logs". A link is left out when
// its template is empty, when rendering fails, or when the rendered URL is
// empty. The returned slice is never nil.
func getPodLinks(cfg *config.Config, pod *api.Pod) []ilert.IncidentLink {
	mustacheValues := getPodMustacheValues(pod)

	// Candidate links in the order they must appear in the result.
	templates := []struct {
		text     string
		template string
	}{
		{text: "Metrics", template: cfg.Links.Pods.Metrics},
		{text: "Logs", template: cfg.Links.Pods.Logs},
	}

	links := make([]ilert.IncidentLink, 0)
	for _, t := range templates {
		if t.template == "" {
			continue
		}
		url, err := mustache.Render(t.template, mustacheValues)
		if err != nil || url == "" {
			// Best effort: a broken or empty template only drops its own link.
			continue
		}
		links = append(links, ilert.IncidentLink{
			Href: url,
			Text: t.text,
		})
	}
	return links
}
8 changes: 5 additions & 3 deletions pkg/watcher/pod_checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,8 @@ func checkPods(kubeClient *kubernetes.Clientset, metricsClient *metrics.Clientse
if incidentRef == nil {
summary := fmt.Sprintf("Pod %s/%s CPU limit reached > %d%%", pod.GetNamespace(), pod.GetName(), cfg.Alarms.Pods.Resources.Threshold)
details := getPodDetailsWithUsageLimit(kubeClient, &pod, fmt.Sprintf("%.3f CPU", cpuUsage), fmt.Sprintf("%.3f CPU", cpuLimit))
incidentID := incident.CreateEvent(cfg, getPodMustacheValues(&pod), podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Resources.Priority)
links := getPodLinks(cfg, &pod)
incidentID := incident.CreateEvent(cfg, links, podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Resources.Priority)
incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details)
}
}
Expand All @@ -118,15 +119,16 @@ func checkPods(kubeClient *kubernetes.Clientset, metricsClient *metrics.Clientse
if incidentRef == nil {
summary := fmt.Sprintf("Pod %s/%s memory limit reached > %d%%", pod.GetNamespace(), pod.GetName(), cfg.Alarms.Pods.Resources.Threshold)
details := getPodDetailsWithUsageLimit(kubeClient, &pod, humanize.Bytes(uint64(memoryUsage)), humanize.Bytes(uint64(memoryLimit)))
incidentID := incident.CreateEvent(cfg, getPodMustacheValues(&pod), podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Resources.Priority)
links := getPodLinks(cfg, &pod)
incidentID := incident.CreateEvent(cfg, links, podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Resources.Priority)
incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details)
}
}
}
}
}
if healthy && incidentRef != nil && incidentRef.Spec.ID > 0 {
incident.CreateEvent(cfg, getPodMustacheValues(&pod), podKey, fmt.Sprintf("Pod %s/%s recovered", pod.GetNamespace(), pod.GetName()), "", ilert.EventTypes.Resolve, cfg.Alarms.Pods.Resources.Priority)
incident.CreateEvent(cfg, nil, podKey, fmt.Sprintf("Pod %s/%s recovered", pod.GetNamespace(), pod.GetName()), "", ilert.EventTypes.Resolve, cfg.Alarms.Pods.Resources.Priority)
incident.DeleteIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace())
}
}
Expand Down
9 changes: 6 additions & 3 deletions pkg/watcher/pod_informer.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ func startPodInformer(kubeClient *kubernetes.Clientset, agentKubeClient *agentcl
cfg.Alarms.Pods.Terminate.Enabled && incidentRef == nil {
summary := fmt.Sprintf("Pod %s/%s terminated - %s", pod.GetNamespace(), pod.GetName(), containerStatus.State.Terminated.Reason)
details := getPodDetailsWithStatus(kubeClient, pod, &containerStatus)
incidentID := incident.CreateEvent(cfg, getPodMustacheValues(pod), podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Terminate.Priority)
links := getPodLinks(cfg, pod)
incidentID := incident.CreateEvent(cfg, links, podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Terminate.Priority)
incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details)
break
}
Expand All @@ -45,15 +46,17 @@ func startPodInformer(kubeClient *kubernetes.Clientset, agentKubeClient *agentcl
cfg.Alarms.Pods.Waiting.Enabled && incidentRef == nil {
summary := fmt.Sprintf("Pod %s/%s waiting - %s", pod.GetNamespace(), pod.GetName(), containerStatus.State.Waiting.Reason)
details := getPodDetailsWithStatus(kubeClient, pod, &containerStatus)
incidentID := incident.CreateEvent(cfg, getPodMustacheValues(pod), podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Waiting.Priority)
links := getPodLinks(cfg, pod)
incidentID := incident.CreateEvent(cfg, links, podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Waiting.Priority)
incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details)
break
}

if cfg.Alarms.Pods.Restarts.Enabled && containerStatus.RestartCount >= cfg.Alarms.Pods.Restarts.Threshold && incidentRef == nil {
summary := fmt.Sprintf("Pod %s/%s restarts threshold reached: %d", pod.GetNamespace(), pod.GetName(), containerStatus.RestartCount)
details := getPodDetailsWithStatus(kubeClient, pod, &containerStatus)
incidentID := incident.CreateEvent(cfg, getPodMustacheValues(pod), podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Restarts.Priority)
links := getPodLinks(cfg, pod)
incidentID := incident.CreateEvent(cfg, links, podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Restarts.Priority)
incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details)
break
}
Expand Down
2 changes: 1 addition & 1 deletion version.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package shared

// Version current version
const Version = "v1.2.4"
const Version = "v1.3.0"

// App name
const App = "ilert-kube-agent"

0 comments on commit 5a18915

Please sign in to comment.