Split CPU and memory resource config
yacut committed Mar 2, 2021
1 parent d3dd2ed commit af9c65b
Showing 9 changed files with 116 additions and 67 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,7 @@
## v1.5.0 / 2021-03-02

- [ENHANCEMENT] Split the CPU and memory resource alarm config so each can be enabled, prioritized, and thresholded independently

## v1.4.3 / 2021-03-01

- [ENHANCEMENT] Move to native Kubernetes incident URL
28 changes: 20 additions & 8 deletions cmd/flag.go
@@ -47,15 +47,23 @@ func parseAndValidateFlags() *config.Config {
flag.String("alarms.pods.restarts.priority", "LOW", "The pod waiting alarm incident priority")
flag.Int("alarms.pods.restarts.threshold", 10, "Pod restart threshold to alarm")
flag.Bool("alarms.pods.resources.enabled", true, "Enable pod resources alarms")
flag.String("alarms.pods.resources.priority", "LOW", "The pod resources alarm incident priority")
flag.Int("alarms.pods.resources.threshold", 90, "The pod resources percentage threshold from 1 to 100")
flag.Bool("alarms.pods.resources.cpu.enabled", true, "Enable pod CPU resources alarms")
flag.String("alarms.pods.resources.cpu.priority", "LOW", "The pod CPU resources alarm incident priority")
flag.Int("alarms.pods.resources.cpu.threshold", 90, "The pod CPU resources percentage threshold from 1 to 100")
flag.Bool("alarms.pods.resources.memory.enabled", true, "Enable pod memory resources alarms")
flag.String("alarms.pods.resources.memory.priority", "LOW", "The pod memory resources alarm incident priority")
flag.Int("alarms.pods.resources.memory.threshold", 90, "The pod memory resources percentage threshold from 1 to 100")

flag.Bool("alarms.nodes.enabled", true, "Enable node alarms")
flag.Bool("alarms.nodes.terminate.enabled", true, "Enable node terminate alarms")
flag.String("alarms.nodes.terminate.priority", "HIGH", "The node terminate alarm incident priority")
flag.Bool("alarms.nodes.resources.enabled", true, "Enable node resources alarms")
flag.String("alarms.nodes.resources.priority", "LOW", "The node resources alarm incident priority")
flag.Int("alarms.nodes.resources.threshold", 90, "The node resources percentage threshold from 1 to 100")
flag.Bool("alarms.nodes.resources.cpu.enabled", true, "Enable node CPU resources alarms")
flag.String("alarms.nodes.resources.cpu.priority", "LOW", "The node CPU resources alarm incident priority")
flag.Int("alarms.nodes.resources.cpu.threshold", 90, "The node CPU resources percentage threshold from 1 to 100")
flag.Bool("alarms.nodes.resources.memory.enabled", true, "Enable node memory resources alarms")
flag.String("alarms.nodes.resources.memory.priority", "LOW", "The node memory resources alarm incident priority")
flag.Int("alarms.nodes.resources.memory.threshold", 90, "The node memory resources percentage threshold from 1 to 100")

pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
pflag.Parse()
@@ -156,13 +164,17 @@ func parseAndValidateFlags() *config.Config {
checkPriorityConfig(cfg.Alarms.Pods.Terminate.Priority, "--alarms.pods.terminate.priority")
checkPriorityConfig(cfg.Alarms.Pods.Waiting.Priority, "--alarms.pods.waiting.priority")
checkPriorityConfig(cfg.Alarms.Pods.Restarts.Priority, "--alarms.pods.restarts.priority")
checkPriorityConfig(cfg.Alarms.Pods.Resources.Priority, "--alarms.pods.resources.priority")
checkPriorityConfig(cfg.Alarms.Pods.Resources.CPU.Priority, "--alarms.pods.resources.cpu.priority")
checkPriorityConfig(cfg.Alarms.Pods.Resources.Memory.Priority, "--alarms.pods.resources.memory.priority")
checkPriorityConfig(cfg.Alarms.Nodes.Terminate.Priority, "--alarms.nodes.terminate.priority")
checkPriorityConfig(cfg.Alarms.Nodes.Resources.Priority, "--alarms.nodes.resources.priority")
checkPriorityConfig(cfg.Alarms.Nodes.Resources.CPU.Priority, "--alarms.nodes.resources.cpu.priority")
checkPriorityConfig(cfg.Alarms.Nodes.Resources.Memory.Priority, "--alarms.nodes.resources.memory.priority")

checkThresholdConfig(cfg.Alarms.Pods.Resources.Threshold, 1, 100, "--alarms.pods.resources.threshold")
checkThresholdConfig(cfg.Alarms.Pods.Resources.CPU.Threshold, 1, 100, "--alarms.pods.resources.cpu.threshold")
checkThresholdConfig(cfg.Alarms.Pods.Resources.Memory.Threshold, 1, 100, "--alarms.pods.resources.memory.threshold")
checkThresholdConfig(cfg.Alarms.Pods.Restarts.Threshold, 1, 1000000, "--alarms.pods.restarts.threshold")
checkThresholdConfig(cfg.Alarms.Pods.Resources.Threshold, 1, 100, "--alarms.nodes.resources.threshold")
checkThresholdConfig(cfg.Alarms.Nodes.Resources.CPU.Threshold, 1, 100, "--alarms.nodes.resources.cpu.threshold")
checkThresholdConfig(cfg.Alarms.Nodes.Resources.Memory.Threshold, 1, 100, "--alarms.nodes.resources.memory.threshold")

return cfg
}
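Aside: the new flag names map one-to-one onto the nested config keys. Below is a minimal, self-contained sketch of that flag surface using only Go's standard flag package; the real agent additionally hands flag.CommandLine to pflag, as the hunk above shows. Names and defaults mirror three of the flags added in this commit.

```go
package main

import (
	"flag"
	"fmt"
)

func main() {
	// Flag names and defaults taken from the flags added in this commit.
	cpuEnabled := flag.Bool("alarms.pods.resources.cpu.enabled", true,
		"Enable pod CPU resources alarms")
	cpuThreshold := flag.Int("alarms.pods.resources.cpu.threshold", 90,
		"The pod CPU resources percentage threshold from 1 to 100")
	memThreshold := flag.Int("alarms.pods.resources.memory.threshold", 90,
		"The pod memory resources percentage threshold from 1 to 100")
	flag.Parse()

	fmt.Printf("cpu: enabled=%t threshold=%d%% | memory: threshold=%d%%\n",
		*cpuEnabled, *cpuThreshold, *memThreshold)
}
```

For example, passing --alarms.pods.resources.cpu.threshold=80 now tightens only the CPU alarm while the memory alarm keeps its own threshold.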
36 changes: 28 additions & 8 deletions config.yaml
@@ -54,10 +54,20 @@ alarms:
resources:
## Enables pod resources alarms
enabled: true
## The pod resources alarm incident priority
priority: LOW
## The pod resources percentage threshold from 1 to 100
threshold: 90
cpu:
## Enables pod CPU resources alarms
enabled: true
## The pod CPU resources alarm incident priority
priority: LOW
## The pod CPU resources percentage threshold from 1 to 100
threshold: 90
memory:
## Enables pod memory resources alarms
enabled: true
## The pod memory resources alarm incident priority
priority: LOW
## The pod memory resources percentage threshold from 1 to 100
threshold: 90

nodes:
## Enables all node alarms
@@ -72,10 +82,20 @@ alarms:
resources:
## Enables node resources alarms
enabled: true
## The node resources alarm incident priority
priority: LOW
## The node resources percentage threshold from 1 to 100
threshold: 90
cpu:
## Enables node CPU resources alarms
enabled: true
## The node CPU resources alarm incident priority
priority: LOW
## The node CPU resources percentage threshold from 1 to 100
threshold: 90
memory:
## Enables node memory resources alarms
enabled: true
## The node memory resources alarm incident priority
priority: LOW
## The node memory resources percentage threshold from 1 to 100
threshold: 90

links:
pods:
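To see how the nested block binds, here is a self-contained sketch that parses a resources subtree like the one above. The structs are a hypothetical mirror of the real config types in pkg/config/main.go, and gopkg.in/yaml.v2 is an assumption about which YAML library the project uses, not something this diff confirms.

```go
package main

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

// settings mirrors ConfigAlarmSettingWithThreshold for illustration only.
type settings struct {
	Enabled   bool   `yaml:"enabled"`
	Priority  string `yaml:"priority"`
	Threshold int32  `yaml:"threshold"`
}

// resources mirrors ConfigAlarmSettingResources for illustration only.
type resources struct {
	Enabled bool     `yaml:"enabled"`
	CPU     settings `yaml:"cpu"`
	Memory  settings `yaml:"memory"`
}

func main() {
	doc := []byte(`
enabled: true
cpu:
  enabled: true
  priority: LOW
  threshold: 90
memory:
  enabled: true
  priority: HIGH
  threshold: 80
`)
	var r resources
	if err := yaml.Unmarshal(doc, &r); err != nil {
		panic(err)
	}
	fmt.Printf("cpu: %d%% (%s), memory: %d%% (%s)\n",
		r.CPU.Threshold, r.CPU.Priority, r.Memory.Threshold, r.Memory.Priority)
}
```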
2 changes: 1 addition & 1 deletion example/standard/30-deployment.yaml
@@ -23,7 +23,7 @@ spec:
serviceAccountName: ilert-kube-agent
containers:
- name: ilert-kube-agent
image: "ilert/ilert-kube-agent:v1.4.1"
image: "ilert/ilert-kube-agent:v1.5.0"
imagePullPolicy: Always
env:
- name: NAMESPACE
15 changes: 11 additions & 4 deletions pkg/config/main.go
@@ -37,14 +37,14 @@ type ConfigAlarmsPods struct {
Terminate ConfigAlarmSetting `yaml:"terminate"`
Waiting ConfigAlarmSetting `yaml:"waiting"`
Restarts ConfigAlarmSettingWithThreshold `yaml:"restarts"`
Resources ConfigAlarmSettingWithThreshold `yaml:"resources"`
Resources ConfigAlarmSettingResources `yaml:"resources"`
}

// ConfigAlarmsNodes definition
type ConfigAlarmsNodes struct {
Enabled bool `yaml:"enabled"`
Terminate ConfigAlarmSetting `yaml:"terminate"`
Resources ConfigAlarmSettingWithThreshold `yaml:"resources"`
Enabled bool `yaml:"enabled"`
Terminate ConfigAlarmSetting `yaml:"terminate"`
Resources ConfigAlarmSettingResources `yaml:"resources"`
}

// ConfigAlarmSetting definition
@@ -60,6 +60,13 @@ type ConfigAlarmSettingWithThreshold struct {
Threshold int32 `yaml:"threshold"`
}

// ConfigAlarmSettingResources definition
type ConfigAlarmSettingResources struct {
Enabled bool `yaml:"enabled"`
CPU ConfigAlarmSettingWithThreshold `yaml:"cpu"`
Memory ConfigAlarmSettingWithThreshold `yaml:"memory"`
}

// ConfigLinks definition
type ConfigLinks struct {
Pods []ConfigLinksSetting `yaml:"pods"`
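The practical effect of the type change is that call sites move from the flat Resources.Threshold to a CPU or Memory subtree, as the watcher hunks below show. A tiny runnable sketch with stand-in types; the Enabled and Priority fields on the threshold setting are assumed from their use elsewhere in this diff (e.g. cfg.Alarms.Nodes.Resources.CPU.Priority):

```go
package main

import "fmt"

// Stand-ins for the real types in pkg/config/main.go, for illustration only.
type alarmSetting struct {
	Enabled   bool
	Priority  string
	Threshold int32
}

type alarmResources struct {
	Enabled bool
	CPU     alarmSetting
	Memory  alarmSetting
}

func main() {
	res := alarmResources{
		Enabled: true,
		CPU:     alarmSetting{Enabled: true, Priority: "LOW", Threshold: 90},
		Memory:  alarmSetting{Enabled: true, Priority: "HIGH", Threshold: 80},
	}
	// Flat access like res.Threshold no longer compiles; each dimension
	// is now addressed through its own subtree:
	fmt.Println("cpu:", res.CPU.Threshold, "memory:", res.Memory.Threshold)
}
```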
74 changes: 39 additions & 35 deletions pkg/watcher/node_checker.go
@@ -67,50 +67,54 @@ func checkNodes(kubeClient *kubernetes.Clientset, metricsClient *metrics.Clients
memoryUsage = 0
}

cpuLimitDec := node.Status.Capacity.Cpu().AsDec().String()
cpuLimit, err = strconv.ParseFloat(cpuLimitDec, 64)
if err != nil {
cpuLimit = 0
}
if ok && cpuLimit > 0 && cpuUsage > 0 {
log.Debug().
Str("node", node.GetName()).
Float64("limit", cpuLimit).
Float64("usage", cpuUsage).
Msg("Checking CPU limit")
if cpuUsage >= (float64(cfg.Alarms.Nodes.Resources.Threshold) * (cpuLimit / 100)) {
healthy = false
if incidentRef == nil {
summary := fmt.Sprintf("Node %s CPU limit reached > %d%%", node.GetName(), cfg.Alarms.Nodes.Resources.Threshold)
details := getNodeDetailsWithUsageLimit(kubeClient, &node, fmt.Sprintf("%.3f CPU", cpuUsage), fmt.Sprintf("%.3f CPU", cpuLimit))
links := getNodeLinks(cfg, &node)
incidentID := incident.CreateEvent(cfg, links, nodeKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Nodes.Resources.Priority)
incident.CreateIncidentRef(agentKubeClient, node.GetName(), cfg.Settings.Namespace, incidentID, summary, details, "resources")
if cfg.Alarms.Nodes.Resources.CPU.Enabled {
cpuLimitDec := node.Status.Capacity.Cpu().AsDec().String()
cpuLimit, err = strconv.ParseFloat(cpuLimitDec, 64)
if err != nil {
cpuLimit = 0
}
if ok && cpuLimit > 0 && cpuUsage > 0 {
log.Debug().
Str("node", node.GetName()).
Float64("limit", cpuLimit).
Float64("usage", cpuUsage).
Msg("Checking CPU limit")
if cpuUsage >= (float64(cfg.Alarms.Nodes.Resources.CPU.Threshold) * (cpuLimit / 100)) {
healthy = false
if incidentRef == nil {
summary := fmt.Sprintf("Node %s CPU limit reached > %d%%", node.GetName(), cfg.Alarms.Nodes.Resources.CPU.Threshold)
details := getNodeDetailsWithUsageLimit(kubeClient, &node, fmt.Sprintf("%.3f CPU", cpuUsage), fmt.Sprintf("%.3f CPU", cpuLimit))
links := getNodeLinks(cfg, &node)
incidentID := incident.CreateEvent(cfg, links, nodeKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Nodes.Resources.CPU.Priority)
incident.CreateIncidentRef(agentKubeClient, node.GetName(), cfg.Settings.Namespace, incidentID, summary, details, "resources")
}
}
}
}

memoryLimit, ok := node.Status.Capacity.Memory().AsInt64()
if ok && memoryLimit > 0 && memoryUsage > 0 {
log.Debug().
Str("node", node.GetName()).
Int64("limit", memoryLimit).
Int64("usage", memoryUsage).
Msg("Checking memory limit")
if memoryUsage >= (int64(cfg.Alarms.Nodes.Resources.Threshold) * (memoryLimit / 100)) {
healthy = false
if incidentRef == nil {
summary := fmt.Sprintf("Node %s memory limit reached > %d%%", node.GetName(), cfg.Alarms.Nodes.Resources.Threshold)
details := getNodeDetailsWithUsageLimit(kubeClient, &node, humanize.Bytes(uint64(memoryUsage)), humanize.Bytes(uint64(memoryLimit)))
links := getNodeLinks(cfg, &node)
incidentID := incident.CreateEvent(cfg, links, nodeKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Nodes.Resources.Priority)
incident.CreateIncidentRef(agentKubeClient, node.GetName(), cfg.Settings.Namespace, incidentID, summary, details, "resources")
if cfg.Alarms.Nodes.Resources.Memory.Enabled {
memoryLimit, ok := node.Status.Capacity.Memory().AsInt64()
if ok && memoryLimit > 0 && memoryUsage > 0 {
log.Debug().
Str("node", node.GetName()).
Int64("limit", memoryLimit).
Int64("usage", memoryUsage).
Msg("Checking memory limit")
if memoryUsage >= (int64(cfg.Alarms.Nodes.Resources.Memory.Threshold) * (memoryLimit / 100)) {
healthy = false
if incidentRef == nil {
summary := fmt.Sprintf("Node %s memory limit reached > %d%%", node.GetName(), cfg.Alarms.Nodes.Resources.Memory.Threshold)
details := getNodeDetailsWithUsageLimit(kubeClient, &node, humanize.Bytes(uint64(memoryUsage)), humanize.Bytes(uint64(memoryLimit)))
links := getNodeLinks(cfg, &node)
incidentID := incident.CreateEvent(cfg, links, nodeKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Nodes.Resources.Memory.Priority)
incident.CreateIncidentRef(agentKubeClient, node.GetName(), cfg.Settings.Namespace, incidentID, summary, details, "resources")
}
}
}
}

if healthy && incidentRef != nil && incidentRef.Spec.ID > 0 && incidentRef.Spec.Type == "resources" {
incident.CreateEvent(cfg, nil, nodeKey, fmt.Sprintf("Node %s recovered", node.GetName()), "", ilert.EventTypes.Resolve, cfg.Alarms.Nodes.Resources.Priority)
incident.CreateEvent(cfg, nil, nodeKey, fmt.Sprintf("Node %s recovered", node.GetName()), "", ilert.EventTypes.Resolve, "")
incident.DeleteIncidentRef(agentKubeClient, node.GetName(), cfg.Settings.Namespace)
}
}
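Both the CPU and memory branches above reduce to the same comparison: alarm once usage reaches the configured percentage of the node's capacity. A minimal standalone sketch of the CPU form, assuming usage and capacity are both measured in cores; the function name is illustrative, not from the agent:

```go
package main

import "fmt"

// cpuOverThreshold mirrors the comparison in checkNodes: alarm once usage
// reaches thresholdPercent of the node's CPU capacity. A non-positive limit
// means capacity could not be parsed, so the check is skipped.
func cpuOverThreshold(usageCores, limitCores float64, thresholdPercent int32) bool {
	if limitCores <= 0 || usageCores <= 0 {
		return false
	}
	return usageCores >= float64(thresholdPercent)*(limitCores/100)
}

func main() {
	fmt.Println(cpuOverThreshold(3.7, 4.0, 90)) // true: 3.7 >= 3.6
	fmt.Println(cpuOverThreshold(3.5, 4.0, 90)) // false: 3.5 < 3.6
}
```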
2 changes: 1 addition & 1 deletion pkg/watcher/node_informer.go
@@ -33,7 +33,7 @@ func startNodeInformer(kubeClient *kubernetes.Clientset, agentKubeClient *agentc
summary := fmt.Sprintf("Node %s terminated", node.GetName())
details := getNodeDetails(kubeClient, node)
links := getNodeLinks(cfg, node)
incidentID := incident.CreateEvent(cfg, links, nodeKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Nodes.Resources.Priority)
incidentID := incident.CreateEvent(cfg, links, nodeKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Nodes.Terminate.Priority)
incident.CreateIncidentRef(agentKubeClient, node.GetName(), cfg.Settings.Namespace, incidentID, summary, details, "terminate")
}
},
20 changes: 11 additions & 9 deletions pkg/watcher/pod_checker.go
@@ -78,7 +78,8 @@ func checkPods(kubeClient *kubernetes.Clientset, metricsClient *metrics.Clientse
if !ok {
memoryUsage = 0
}
if cpuUsage > 0 && container.Resources.Limits.Cpu() != nil {

if cfg.Alarms.Pods.Resources.CPU.Enabled && cpuUsage > 0 && container.Resources.Limits.Cpu() != nil {
cpuLimitDec := container.Resources.Limits.Cpu().AsDec().String()
cpuLimit, err = strconv.ParseFloat(cpuLimitDec, 64)
if err != nil {
Expand All @@ -92,19 +93,20 @@ func checkPods(kubeClient *kubernetes.Clientset, metricsClient *metrics.Clientse
Float64("limit", cpuLimit).
Float64("usage", cpuUsage).
Msg("Checking CPU limit")
if cpuUsage >= (float64(cfg.Alarms.Pods.Resources.Threshold) * (cpuLimit / 100)) {
if cpuUsage >= (float64(cfg.Alarms.Pods.Resources.CPU.Threshold) * (cpuLimit / 100)) {
healthy = false
if incidentRef == nil {
summary := fmt.Sprintf("Pod %s/%s CPU limit reached > %d%%", pod.GetNamespace(), pod.GetName(), cfg.Alarms.Pods.Resources.Threshold)
summary := fmt.Sprintf("Pod %s/%s CPU limit reached > %d%%", pod.GetNamespace(), pod.GetName(), cfg.Alarms.Pods.Resources.CPU.Threshold)
details := getPodDetailsWithUsageLimit(kubeClient, &pod, fmt.Sprintf("%.3f CPU", cpuUsage), fmt.Sprintf("%.3f CPU", cpuLimit))
links := getPodLinks(cfg, &pod)
incidentID := incident.CreateEvent(cfg, links, podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Resources.Priority)
incidentID := incident.CreateEvent(cfg, links, podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Resources.CPU.Priority)
incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details, "resources")
}
}
}
}
if memoryUsage > 0 && container.Resources.Limits.Memory() != nil {

if cfg.Alarms.Pods.Resources.Memory.Enabled && memoryUsage > 0 && container.Resources.Limits.Memory() != nil {
memoryLimit, ok := container.Resources.Limits.Memory().AsInt64()
if ok && memoryLimit > 0 {
log.Debug().
Expand All @@ -114,21 +116,21 @@ func checkPods(kubeClient *kubernetes.Clientset, metricsClient *metrics.Clientse
Int64("limit", memoryLimit).
Int64("usage", memoryUsage).
Msg("Checking memory limit")
if memoryUsage >= (int64(cfg.Alarms.Pods.Resources.Threshold) * (memoryLimit / 100)) {
if memoryUsage >= (int64(cfg.Alarms.Pods.Resources.Memory.Threshold) * (memoryLimit / 100)) {
healthy = false
if incidentRef == nil {
summary := fmt.Sprintf("Pod %s/%s memory limit reached > %d%%", pod.GetNamespace(), pod.GetName(), cfg.Alarms.Pods.Resources.Threshold)
summary := fmt.Sprintf("Pod %s/%s memory limit reached > %d%%", pod.GetNamespace(), pod.GetName(), cfg.Alarms.Pods.Resources.Memory.Threshold)
details := getPodDetailsWithUsageLimit(kubeClient, &pod, humanize.Bytes(uint64(memoryUsage)), humanize.Bytes(uint64(memoryLimit)))
links := getPodLinks(cfg, &pod)
incidentID := incident.CreateEvent(cfg, links, podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Resources.Priority)
incidentID := incident.CreateEvent(cfg, links, podKey, summary, details, ilert.EventTypes.Alert, cfg.Alarms.Pods.Resources.Memory.Priority)
incident.CreateIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace(), incidentID, summary, details, "resources")
}
}
}
}
}
if healthy && incidentRef != nil && incidentRef.Spec.ID > 0 && incidentRef.Spec.Type == "resources" {
incident.CreateEvent(cfg, nil, podKey, fmt.Sprintf("Pod %s/%s recovered", pod.GetNamespace(), pod.GetName()), "", ilert.EventTypes.Resolve, cfg.Alarms.Pods.Resources.Priority)
incident.CreateEvent(cfg, nil, podKey, fmt.Sprintf("Pod %s/%s recovered", pod.GetNamespace(), pod.GetName()), "", ilert.EventTypes.Resolve, "")
incident.DeleteIncidentRef(agentKubeClient, pod.GetName(), pod.GetNamespace())
}
}
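The per-container memory branch uses the same shape with integer arithmetic. A standalone sketch, assuming usage and limit are both byte counts; note that limitBytes/100 truncates, which loses at most 99 bytes of precision per comparison and is negligible at byte scale:

```go
package main

import "fmt"

// memoryOverThreshold mirrors the per-container check in checkPods:
// containers without a parseable memory limit are not checked.
func memoryOverThreshold(usageBytes, limitBytes int64, thresholdPercent int32) bool {
	if limitBytes <= 0 || usageBytes <= 0 {
		return false
	}
	return usageBytes >= int64(thresholdPercent)*(limitBytes/100)
}

func main() {
	const mib = int64(1) << 20
	fmt.Println(memoryOverThreshold(950*mib, 1024*mib, 90)) // true: 950Mi >= ~921.6Mi
	fmt.Println(memoryOverThreshold(900*mib, 1024*mib, 90)) // false: 900Mi < ~921.6Mi
}
```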
2 changes: 1 addition & 1 deletion version.go
@@ -1,7 +1,7 @@
package shared

// Version current version
const Version = "v1.4.3"
const Version = "v1.5.0"

// App name
const App = "ilert-kube-agent"
