From a88a6931585e72e4993b522bb8842b1a0e38354c Mon Sep 17 00:00:00 2001 From: Bjorn Jorgensen Date: Tue, 23 Apr 2024 20:47:17 +0200 Subject: [PATCH] Fix buddy race condition, random crash --- CHANGELOG.md | 9 +++-- cmd/dashgoat/alertmanager.go | 10 ++++- cmd/dashgoat/buddy.go | 62 +++++++++++++++++++++++------- cmd/dashgoat/handler.go | 4 +- cmd/dashgoat/main.go | 5 +++ cmd/dashgoat/pagerduty.go | 10 +++-- cmd/dashgoat/state.go | 6 ++- deploy/azure-functions/.funcignore | 1 + 8 files changed, 81 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9bb941c..c6f5d7c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,13 +1,14 @@ # Changelog -## [v1.7.3] - 2024-04-23 -Change: - - Upgrade Go dependencies - +## [v1.7.2] - 2024-04-23 Fix: - Buddy state inconsistencies - Buddy state race problems, making the app reset dataset - Change timestamp not updated correctly +## [v1.7.1] - 2024-04-22 +Change: + - Upgrade Go dependencies + ## [v1.7.0] - 2024-04-18 Add: - Config element logformat diff --git a/cmd/dashgoat/alertmanager.go b/cmd/dashgoat/alertmanager.go index 71ef182..76fb132 100644 --- a/cmd/dashgoat/alertmanager.go +++ b/cmd/dashgoat/alertmanager.go @@ -4,6 +4,7 @@ import ( "encoding/json" "fmt" "net/http" + "strings" "time" "github.com/labstack/echo/v4" @@ -73,7 +74,7 @@ func parseAlertmanagerHookMessage(message HookMessage) error { var post_service_state ServiceState post_service_state.UpdateKey = "valid" - post_service_state.Severity = message.CommonLabels["severity"] + post_service_state.Severity = strings.ToLower(message.CommonLabels["severity"]) if message.CommonLabels["severity"] == "" { err := fmt.Errorf("missing CommonLabels[Severity]") logger.Error("parseAlertmanagerHookMessage", "CommonLabels", err) @@ -98,6 +99,8 @@ func parseAlertmanagerHookMessage(message HookMessage) error { err := fmt.Errorf("missing CommonLabels['prometheus_cluster'], CommonLabels['cluster'] or CommonLabels['prometheus']") return err } + 
post_service_state.Host = strings.ToLower(post_service_state.Host) + post_service_state.Service = message.CommonLabels["namespace"] if message.CommonLabels["namespace"] == "" { logger.Info("parseAlertmanagerHookMessage", "missing", "CommonLabels['namespace']") @@ -140,6 +143,8 @@ func parseAlertmanagerHookMessage(message HookMessage) error { post_service_state = runDependOn(post_service_state) ss.serviceStateList[host_service] = post_service_state + + go updateBuddy(post_service_state, "") } return nil @@ -165,8 +170,9 @@ func parseAlertmanagerAlert(alert Alert, service_state ServiceState) (ServiceSta } if service_state.Service == "" { logger.Error("parseAlertmanagerAlert", "service", "Cant find namespace or container", "alert object", alert.Labels) + } else { + service_state.Service = strings.ToLower(service_state.Service) } - return service_state, nil } diff --git a/cmd/dashgoat/buddy.go b/cmd/dashgoat/buddy.go index f9ce6cb..bc1753f 100644 --- a/cmd/dashgoat/buddy.go +++ b/cmd/dashgoat/buddy.go @@ -24,6 +24,42 @@ type ( } ) +// setStateDown on buddy backlog +func setStateDown(host string, data int64) { + if host == "" { + logger.Info("setStateDown", "error", "no host") + return + } + + backlog.mutex.Lock() + defer backlog.mutex.Unlock() + backlog.StateDown[host] = data +} + +// getStateDown on buddy backlog +func getStateDown() map[string]int64 { + backlog.mutex.RLock() + defer backlog.mutex.RUnlock() + + copy := make(map[string]int64) + for k, v := range backlog.StateDown { + copy[k] = v + } + return copy +} + +// setBacklog on buddy backlog +func setBacklog(host string, data []string) { + if host == "" { + logger.Info("setBacklog", "error", "no host") + return + } + + backlog.mutex.Lock() + defer backlog.mutex.Unlock() + backlog.buddyBacklog[host] = data +} + // Update Buddies with newly recieved msg func updateBuddy(event ServiceState, delete string) { to_update := listBuddies() @@ -32,9 +68,7 @@ func updateBuddy(event ServiceState, delete string) { return 
//No buddy to tell } - backlog.mutex.RLock() - buddyDown := backlog.StateDown - backlog.mutex.RUnlock() + buddyDown := getStateDown() for _, bhost := range to_update { if !contains(event.From, bhost.Name) { @@ -117,7 +151,7 @@ func talkToBuddyApi(event ServiceState, host Buddy, delete string) { func findBuddy(buddyConfig []Buddy) { initBuddyConf(buddyConfig) - buddyAmount := len(buddyRunningConfig.Buddies) + buddyAmount := len(listBuddies()) if buddyAmount < 1 { setDashGoatReady(true) @@ -167,29 +201,30 @@ func findBuddy(buddyConfig []Buddy) { // report back to UI, stausList func tellBuddyState(host string, up bool, servicehost string) { + var empty_slice []string + var default_int64 int64 now := time.Now() - backlog.mutex.Lock() - defer backlog.mutex.Unlock() if _, ok := backlog.StateDown[host]; !ok { - backlog.StateDown[host] = 0 + setStateDown(host, default_int64) } if up { - if backlog.StateDown[host] != 0 { + if getStateDown()[host] != 0 { tellServiceListAboutBuddy(host, up) } - backlog.StateDown[host] = 0 + setStateDown(host, default_int64) deliverBacklog(host, backlog.buddyBacklog[host]) - backlog.buddyBacklog[host] = nil + setBacklog(host, empty_slice) //empty backlog for host } else { if servicehost != "" { - backlog.buddyBacklog[host] = append(backlog.buddyBacklog[host], servicehost) + backlog_tmp := append(backlog.buddyBacklog[host], servicehost) + setBacklog(host, backlog_tmp) } if backlog.StateDown[host] == 0 { tellServiceListAboutBuddy(host, up) - backlog.StateDown[host] = now.Unix() + setStateDown(host, now.Unix()) } } } @@ -308,7 +343,9 @@ func AskApiFullStatusList(bhost Buddy) error { for servicehost, status := range resultMap { if status.Service != "buddy" { + ss.mutex.Lock() ss.serviceStateList[servicehost] = status + ss.mutex.Unlock() } } @@ -375,7 +412,6 @@ func tellServiceListAboutBuddy(buddyName string, up bool) { logger.Error("tellServiceListAboutBuddy", "error", err) } - // logger.Info("tellServiceListAboutBuddy", "debugger", result) 
iSnewState(result) ss.serviceStateList[serviceName] = result diff --git a/cmd/dashgoat/handler.go b/cmd/dashgoat/handler.go index 05c84fa..be61e8a 100644 --- a/cmd/dashgoat/handler.go +++ b/cmd/dashgoat/handler.go @@ -76,6 +76,9 @@ func heartBeat(c echo.Context) error { result = post_service_state.Host + post_service_state.Service ss.serviceStateList[result] = post_service_state + + go updateBuddy(post_service_state, "") + return c.JSON(http.StatusOK, result) } @@ -237,7 +240,6 @@ func checkUpdatekey(key string) bool { } func checkUrnKey(key string) bool { - //logger.Info("Config urn", "key", config.UrnKey) return key == config.UrnKey } diff --git a/cmd/dashgoat/main.go b/cmd/dashgoat/main.go index 4b9ed5a..16e8f6a 100644 --- a/cmd/dashgoat/main.go +++ b/cmd/dashgoat/main.go @@ -50,9 +50,14 @@ var serviceStateCollector *ServiceStateCollector func main() { var configfile string + ss.mutex.Lock() ss.serviceStateList = make(map[string]ServiceState) + ss.mutex.Unlock() + + backlog.mutex.Lock() backlog.buddyBacklog = make(map[string][]string) backlog.StateDown = make(map[string]int64) + backlog.mutex.Unlock() e := echo.New() diff --git a/cmd/dashgoat/pagerduty.go b/cmd/dashgoat/pagerduty.go index 298840a..fdea58b 100644 --- a/cmd/dashgoat/pagerduty.go +++ b/cmd/dashgoat/pagerduty.go @@ -112,6 +112,7 @@ func shouldPagerDutyTrigger(severity_to_check string) bool { trigger_level := indexOf(severitys[:], config.PagerdutyConfig.TriggerLevel) to_check := indexOf(severitys[:], severity_to_check) + logger.Info("shouldPagerDutyTrigger", "severity_to_check", to_check, "trigger_level", trigger_level) return to_check >= trigger_level } @@ -148,7 +149,7 @@ func (c *PdClient) CompilePdEvent(fromstate string, dgss ServiceState) { err := pdClient.TellPagerDuty(pdevent) if err != nil { - logger.Error("Error sending to PagerDuty:", err) + logger.Error("CompilePdEvent", "error", "update was not send") } } @@ -190,7 +191,7 @@ func (c *PdClient) TellPagerDuty(pdevent PagerDutyEvent) error { 
req, err := http.NewRequest("POST", c.config.URL, payload) if err != nil { - logger.Error("PagerDuty POST failed", err) + logger.Error("TellPagerDuty", "POST failed", err) return err } @@ -198,13 +199,14 @@ func (c *PdClient) TellPagerDuty(pdevent PagerDutyEvent) error { res, err := client.Do(req) if err != nil { - logger.Error("PagerDuty error client", err) + logger.Error("TellPagerDuty", "Do error", err) + return err } defer res.Body.Close() body, err := io.ReadAll(res.Body) if err != nil { - logger.Error("Failed reading PagerDuty response", err) + logger.Error("TellPagerDuty", "ReadAll PagerDuty", err) return err } diff --git a/cmd/dashgoat/state.go b/cmd/dashgoat/state.go index aa0f228..35ee564 100644 --- a/cmd/dashgoat/state.go +++ b/cmd/dashgoat/state.go @@ -15,13 +15,15 @@ func iSnewState(checkss ServiceState) (change string, new_service bool) { if _, ok := ss.serviceStateList[hostservice]; ok { + current_status := ss.serviceStateList[hostservice].Status + // no change - if ss.serviceStateList[hostservice].Status == checkss.Status { + if current_status == checkss.Status { return "", false } // change - go reportStateChange(ss.serviceStateList[hostservice].Status, checkss) + go reportStateChange(current_status, checkss) return checkss.Status, false } diff --git a/deploy/azure-functions/.funcignore b/deploy/azure-functions/.funcignore index c857635..92c1539 100644 --- a/deploy/azure-functions/.funcignore +++ b/deploy/azure-functions/.funcignore @@ -4,6 +4,7 @@ __azurite_db*__.json __blobstorage__ __queuestorage__ local.settings.json +prepare-azure-files.sh build deploy doc