Skip to content

Commit

Permalink
dynamic links
Browse files Browse the repository at this point in the history
add alarm type
  • Loading branch information
yacut committed Feb 28, 2021
1 parent 2175bb4 commit 705ba39
Show file tree
Hide file tree
Showing 14 changed files with 65 additions and 53 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## v1.4.0 / 2021-02-28

- [ENHANCEMENT] Make pod and node links dynamic for more flexible configuration
- [ENHANCEMENT] Add alarm type to incident ref
- [FIX] Incident resolution based on alarm type

## v1.3.0 / 2021-02-27

- [ENHANCEMENT] Split pod and node links for better configuration opportunities
Expand Down
30 changes: 25 additions & 5 deletions cmd/flag.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,6 @@ func parseAndValidateFlags() *config.Config {
flag.String("alarms.nodes.resources.priority", "LOW", "The node resources alarm incident priority")
flag.Int("alarms.nodes.resources.threshold", 90, "The node resources percentage threshold from 1 to 100")

flag.String("links.pods.metrics", "", "Metrics URL for the alarm-related incident. Your can use following mustache variables here: pod_namespace, pod_name, cluster_name")
flag.String("links.pods.logs", "", "Logs URL for the alarm-related incident. Your can use following mustache variables here: pod_namespace, pod_name, cluster_name")
flag.String("links.nodes.metrics", "", "Metrics URL for the alarm-related incident. Your can use following mustache variables here: node_name, cluster_name")
flag.String("links.nodes.logs", "", "Logs URL for the alarm-related incident. Your can use following mustache variables here: node_name, cluster_name")

pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
pflag.Parse()

Expand Down Expand Up @@ -100,6 +95,31 @@ func parseAndValidateFlags() *config.Config {
log.Fatal().Err(err).Msg("Unable to decode config")
}

if cfg.Links.Pods == nil {
cfg.Links.Pods = make([]config.ConfigLinksSetting, 0)
}
if cfg.Links.Nodes == nil {
cfg.Links.Nodes = make([]config.ConfigLinksSetting, 0)
}

for _, e := range os.Environ() {
pair := strings.SplitN(e, "=", 2)
if strings.HasPrefix(pair[0], "ILERT_LINKS_PODS_") {
link := strings.ReplaceAll(pair[0], "ILERT_LINKS_PODS_", "")
cfg.Links.Pods = append(cfg.Links.Pods, config.ConfigLinksSetting{
Name: strings.Title(strings.ToLower(strings.ReplaceAll(link, "_", " "))),
Href: pair[1],
})
}

if strings.HasPrefix(pair[0], "ILERT_LINKS_NODES_") {
cfg.Links.Nodes = append(cfg.Links.Nodes, config.ConfigLinksSetting{
Name: strings.Title(strings.ToLower(strings.ReplaceAll(strings.ReplaceAll(pair[0], "ILERT_LINKS_NODES_", ""), "_", " "))),
Href: pair[1],
})
}
}

logger.Init(cfg.Settings.Log)

ilertAPIKeyEnv := utils.GetEnv("ILERT_API_KEY", "")
Expand Down
16 changes: 8 additions & 8 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,12 @@ alarms:

links:
pods:
## Metrics URL for the alarm-related incident. Your can use following mustache variables here: pod_namespace, pod_name, node_name, cluster_name
# metrics: "https://grafana.example.com/d/kubernetes/kubernetes-overview?var-Node=All&var-Pod={{pod_name}}"
## Logs URL for the alarm-related incident. Your can use following mustache variables here: pod_namespace, pod_name, node_name, cluster_name
# logs: "https://grafana.example.com/explore?left=%5B%22now-1h%22,%22now%22,%22Loki%22,%7B%22expr%22:%22%7Binstance%3D%5C%22{{pod_name}}%5C%22,namespace%3D%5C%22{{pod_namespace}}%5C%22%7D%22%7D%5D"
## Pods URL for the alarm-related incident. You can use the following mustache variables here: pod_namespace, pod_name, cluster_name
# - name: Metrics
# href: "https://grafana.example.com/d/kubernetes/kubernetes-overview?var-Node=All&var-Pod={{pod_name}}"
# - name: Logs
# href: "https://grafana.example.com/explore?left=%5B%22now-1h%22,%22now%22,%22Loki%22,%7B%22expr%22:%22%7Binstance%3D%5C%22{{pod_name}}%5C%22,namespace%3D%5C%22{{pod_namespace}}%5C%22%7D%22%7D%5D"
nodes:
## Metrics URL for the alarm-related incident. Your can use following mustache variables here: pod_namespace, pod_name, node_name, cluster_name
# metrics: "https://grafana.example.com/d/kubernetes/kubernetes-overview?var-Node=All&var-Pod={{pod_name}}"
## Logs URL for the alarm-related incident. Your can use following mustache variables here: pod_namespace, pod_name, node_name, cluster_name
# logs: "https://grafana.example.com/explore?left=%5B%22now-1h%22,%22now%22,%22Loki%22,%7B%22expr%22:%22%7Binstance%3D%5C%22{{pod_name}}%5C%22,namespace%3D%5C%22{{pod_namespace}}%5C%22%7D%22%7D%5D"
## Nodes URL for the alarm-related incident. You can use the following mustache variables here: node_name, cluster_name
# - name: Metrics
# href: "https://grafana.example.com/d/kubernetes/kubernetes-overview?var-Node={{node_name}}"
2 changes: 2 additions & 0 deletions example/standard/00-crds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,5 @@ spec:
type: "string"
details:
type: "string"
type:
type: "string"
1 change: 1 addition & 0 deletions pkg/apis/incident/v1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ type IncidentSpec struct {
ID int64 `json:"id,omitempty"`
Summary string `json:"summary,omitempty"`
Details string `json:"details,omitempty"`
Type string `json:"type,omitempty"`
}

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
Expand Down
8 changes: 4 additions & 4 deletions pkg/config/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,12 @@ type ConfigAlarmSettingWithThreshold struct {

// ConfigLinks definition
type ConfigLinks struct {
Pods ConfigLinksSetting `yaml:"pods"`
Nodes ConfigLinksSetting `yaml:"nodes"`
Pods []ConfigLinksSetting `yaml:"pods"`
Nodes []ConfigLinksSetting `yaml:"nodes"`
}

// ConfigLinksSetting definition
type ConfigLinksSetting struct {
Metrics string `yaml:"metrics"`
Logs string `yaml:"logs"`
Name string `yaml:"name"`
Href string `yaml:"href"`
}
3 changes: 2 additions & 1 deletion pkg/incident/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func CreateEvent(
}

// CreateIncidentRef definition
func CreateIncidentRef(agentKubeClient *agentclientset.Clientset, name string, namespace string, incidentID *int64, summary string, details string) {
func CreateIncidentRef(agentKubeClient *agentclientset.Clientset, name string, namespace string, incidentID *int64, summary string, details string, incidentType string) {
if incidentID != nil && *incidentID > 0 {
log.Debug().Int64("incident_id", *incidentID).Str("name", name).Str("namespace", namespace).Msg("Creating incident ref")
incident := &v1.Incident{
Expand All @@ -79,6 +79,7 @@ func CreateIncidentRef(agentKubeClient *agentclientset.Clientset, name string, n
ID: *incidentID,
Summary: summary,
Details: details,
Type: incidentType,
},
}
_, err := agentKubeClient.IlertV1().Incidents(namespace).Create(incident)
Expand Down
15 changes: 3 additions & 12 deletions pkg/watcher/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,21 +50,12 @@ func getNodeLinks(cfg *config.Config, node *api.Node) []ilert.IncidentLink {
mustacheValues := getNodeMustacheValues(node)

links := make([]ilert.IncidentLink, 0)
if cfg.Links.Nodes.Metrics != "" {
url, err := mustache.Render(cfg.Links.Nodes.Metrics, mustacheValues)
for _, link := range cfg.Links.Nodes {
url, err := mustache.Render(link.Href, mustacheValues)
if err == nil && url != "" {
links = append(links, ilert.IncidentLink{
Href: url,
Text: "Metrics",
})
}
}
if cfg.Links.Nodes.Logs != "" {
url, err := mustache.Render(cfg.Links.Nodes.Logs, mustacheValues)
if err == nil && url != "" {
links = append(links, ilert.IncidentLink{
Href: url,
Text: "Logs",
Text: link.Name,
})
}
}
Expand Down
6 changes: 3 additions & 3 deletions pkg/watcher/node_checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ func checkNodes(kubeClient *kubernetes.Clientset, metricsClient *metrics.Clients
details := getNodeDetailsWithUsageLimit(kubeClient, &node, fmt.Sprintf("%.3f CPU", cpuUsage), fmt.Sprintf("%.3f CPU", cpuLimit))
links := getNodeLinks(cfg, &node)
incidentID := incident.CreateEvent(cfg, links, nodeKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Nodes.Resources.Priority)
incident.CreateIncidentRef(agentKubeClient, node.GetName(), cfg.Settings.Namespace, incidentID, summary, details)
incident.CreateIncidentRef(agentKubeClient, node.GetName(), cfg.Settings.Namespace, incidentID, summary, details, "resources")
}
}
}
Expand All @@ -104,12 +104,12 @@ func checkNodes(kubeClient *kubernetes.Clientset, metricsClient *metrics.Clients
details := getNodeDetailsWithUsageLimit(kubeClient, &node, humanize.Bytes(uint64(memoryUsage)), humanize.Bytes(uint64(memoryLimit)))
links := getNodeLinks(cfg, &node)
incidentID := incident.CreateEvent(cfg, links, nodeKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Nodes.Resources.Priority)
incident.CreateIncidentRef(agentKubeClient, node.GetName(), cfg.Settings.Namespace, incidentID, summary, details)
incident.CreateIncidentRef(agentKubeClient, node.GetName(), cfg.Settings.Namespace, incidentID, summary, details, "resources")
}
}
}

if healthy && incidentRef != nil && incidentRef.Spec.ID > 0 {
if healthy && incidentRef != nil && incidentRef.Spec.ID > 0 && incidentRef.Spec.Type == "resources" {
incident.CreateEvent(cfg, nil, nodeKey, fmt.Sprintf("Node %s recovered", node.GetName()), "", ilert.EventTypes.Resolve, cfg.Alarms.Nodes.Resources.Priority)
incident.DeleteIncidentRef(agentKubeClient, node.GetName(), cfg.Settings.Namespace)
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/watcher/node_informer.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ func startNodeInformer(kubeClient *kubernetes.Clientset, agentKubeClient *agentc
details := getNodeDetails(kubeClient, node)
links := getNodeLinks(cfg, node)
incidentID := incident.CreateEvent(cfg, links, nodeKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Nodes.Resources.Priority)
incident.CreateIncidentRef(agentKubeClient, node.GetName(), cfg.Settings.Namespace, incidentID, summary, details)
incident.CreateIncidentRef(agentKubeClient, node.GetName(), cfg.Settings.Namespace, incidentID, summary, details, "terminate")
}
},
})
Expand Down
15 changes: 3 additions & 12 deletions pkg/watcher/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,21 +94,12 @@ func getPodLinks(cfg *config.Config, node *api.Pod) []ilert.IncidentLink {
mustacheValues := getPodMustacheValues(node)

links := make([]ilert.IncidentLink, 0)
if cfg.Links.Pods.Metrics != "" {
url, err := mustache.Render(cfg.Links.Pods.Metrics, mustacheValues)
for _, link := range cfg.Links.Pods {
url, err := mustache.Render(link.Href, mustacheValues)
if err == nil && url != "" {
links = append(links, ilert.IncidentLink{
Href: url,
Text: "Metrics",
})
}
}
if cfg.Links.Pods.Logs != "" {
url, err := mustache.Render(cfg.Links.Pods.Logs, mustacheValues)
if err == nil && url != "" {
links = append(links, ilert.IncidentLink{
Href: url,
Text: "Logs",
Text: link.Name,
})
}
}
Expand Down
6 changes: 3 additions & 3 deletions pkg/watcher/pod_checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ func checkPods(kubeClient *kubernetes.Clientset, metricsClient *metrics.Clientse
details := getPodDetailsWithUsageLimit(kubeClient, &pod, fmt.Sprintf("%.3f CPU", cpuUsage), fmt.Sprintf("%.3f CPU", cpuLimit))
links := getPodLinks(cfg, &pod)
incidentID := incident.CreateEvent(cfg, links, podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Resources.Priority)
incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details)
incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details, "resources")
}
}
}
Expand All @@ -121,13 +121,13 @@ func checkPods(kubeClient *kubernetes.Clientset, metricsClient *metrics.Clientse
details := getPodDetailsWithUsageLimit(kubeClient, &pod, humanize.Bytes(uint64(memoryUsage)), humanize.Bytes(uint64(memoryLimit)))
links := getPodLinks(cfg, &pod)
incidentID := incident.CreateEvent(cfg, links, podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Resources.Priority)
incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details)
incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details, "resources")
}
}
}
}
}
if healthy && incidentRef != nil && incidentRef.Spec.ID > 0 {
if healthy && incidentRef != nil && incidentRef.Spec.ID > 0 && incidentRef.Spec.Type == "resources" {
incident.CreateEvent(cfg, nil, podKey, fmt.Sprintf("Pod %s/%s recovered", pod.GetNamespace(), pod.GetName()), "", ilert.EventTypes.Resolve, cfg.Alarms.Pods.Resources.Priority)
incident.DeleteIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace())
}
Expand Down
6 changes: 3 additions & 3 deletions pkg/watcher/pod_informer.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ func startPodInformer(kubeClient *kubernetes.Clientset, agentKubeClient *agentcl
details := getPodDetailsWithStatus(kubeClient, pod, &containerStatus)
links := getPodLinks(cfg, pod)
incidentID := incident.CreateEvent(cfg, links, podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Terminate.Priority)
incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details)
incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details, "terminate")
break
}

Expand All @@ -48,7 +48,7 @@ func startPodInformer(kubeClient *kubernetes.Clientset, agentKubeClient *agentcl
details := getPodDetailsWithStatus(kubeClient, pod, &containerStatus)
links := getPodLinks(cfg, pod)
incidentID := incident.CreateEvent(cfg, links, podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Waiting.Priority)
incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details)
incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details, "waiting")
break
}

Expand All @@ -57,7 +57,7 @@ func startPodInformer(kubeClient *kubernetes.Clientset, agentKubeClient *agentcl
details := getPodDetailsWithStatus(kubeClient, pod, &containerStatus)
links := getPodLinks(cfg, pod)
incidentID := incident.CreateEvent(cfg, links, podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Restarts.Priority)
incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details)
incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details, "restarts")
break
}
}
Expand Down
2 changes: 1 addition & 1 deletion version.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package shared

// Version current version
const Version = "v1.3.0"
const Version = "v1.4.0"

// App name
const App = "ilert-kube-agent"

0 comments on commit 705ba39

Please sign in to comment.