Skip to content

Commit fdc2d32

Browse files
committed
Fixed a permission issue with pgbouncer and added logging of events to a file for troubleshooting
1 parent 104e878 commit fdc2d32

File tree

3 files changed

+84
-11
lines changed

3 files changed

+84
-11
lines changed

cmd/event_handler/main.go

+71-10
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@ import (
44
"context"
55
"flag"
66
"fmt"
7+
"os"
78
"strconv"
9+
"time"
810

911
"github.com/fly-apps/postgres-flex/pkg/flypg"
1012
)
@@ -21,27 +23,87 @@ func main() {
2123
details := flag.String("details", "", "details")
2224
flag.Parse()
2325

24-
fmt.Printf("Event: %s\n Node: %d\n Success: %s\n Details: %s\n",
25-
*event, *nodeID, *success, *details)
26+
eventDetails := fmt.Sprintf("%s - Event: %s\n Node: %d\n Success: %s\n Details: %s\n", time.Now().String(), *event, *nodeID, *success, *details)
27+
28+
// TODO - Use an actual logging framework instead of just writing strings to a file.
29+
logFile, err := os.OpenFile("/data/event.log", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
30+
if err != nil {
31+
fmt.Printf("failed to open event log: %s", err)
32+
}
33+
defer logFile.Close()
34+
35+
logFile.WriteString(eventDetails)
2636

2737
switch *event {
38+
2839
case "repmgrd_failover_promote", "standby_promote":
2940
// TODO - Need to figure out what to do when success == 0.
30-
if err := reconfigurePGBouncer(*nodeID); err != nil {
31-
fmt.Println(err.Error())
32-
return
41+
42+
retry := 0
43+
maxRetries := 5
44+
success := false
45+
46+
for retry < maxRetries {
47+
if err := reconfigurePGBouncer(*nodeID); err != nil {
48+
errMsg := fmt.Sprintf("%s [%s] attempt: %d - failed to reconfigure pgbouncer: %s\n", *event, time.Now().String(), retry, err)
49+
logFile.WriteString(errMsg)
50+
51+
retry++
52+
time.Sleep(1 * time.Second)
53+
continue
54+
}
55+
56+
success = true
57+
break
58+
}
59+
60+
if success {
61+
msg := fmt.Sprintf("%s [%s] Successfully reconfigured pgBouncer to %d\n", *event, time.Now().String(), *nodeID)
62+
logFile.WriteString(msg)
63+
os.Exit(0)
64+
} else {
65+
msg := fmt.Sprintf(" %s [%s] Failed ot reconfigured pgBouncer to %d\n", *event, time.Now().String(), *nodeID)
66+
logFile.WriteString(msg)
67+
os.Exit(1)
3368
}
3469

3570
case "standby_follow":
71+
3672
newMemberID, err := strconv.Atoi(*newPrimary)
3773
if err != nil {
38-
fmt.Printf("failed to parse new member id: %s", err)
74+
errMsg := fmt.Sprintf("failed to parse newMemberID %s: %s\n", *newPrimary, err)
75+
logFile.WriteString(errMsg)
76+
os.Exit(1)
3977
}
4078

41-
if err := reconfigurePGBouncer(newMemberID); err != nil {
42-
fmt.Println(err.Error())
43-
return
79+
retry := 0
80+
maxRetries := 5
81+
success := false
82+
83+
for retry < maxRetries {
84+
if err := reconfigurePGBouncer(*&newMemberID); err != nil {
85+
errMsg := fmt.Sprintf("%s [%s] attempt: %d - failed to reconfigure pgbouncer: %s\n", *event, time.Now().String(), retry, err)
86+
logFile.WriteString(errMsg)
87+
88+
retry++
89+
time.Sleep(1 * time.Second)
90+
continue
91+
}
92+
93+
success = true
94+
break
95+
}
96+
97+
if success {
98+
msg := fmt.Sprintf("%s [%s] Successfully reconfigured pgBouncer to %d\n", *event, time.Now().String(), newMemberID)
99+
logFile.WriteString(msg)
100+
os.Exit(0)
101+
} else {
102+
msg := fmt.Sprintf(" %s [%s] Failed ot reconfigured pgBouncer to %d\n", *event, time.Now().String(), newMemberID)
103+
logFile.WriteString(msg)
104+
os.Exit(1)
44105
}
106+
45107
default:
46108
// noop
47109
}
@@ -63,7 +125,6 @@ func reconfigurePGBouncer(id int) error {
63125
return err
64126
}
65127

66-
fmt.Println("Reconfiguring pgbouncer primary")
67128
if err := node.PGBouncer.ConfigurePrimary(context.TODO(), member.Hostname, true); err != nil {
68129
return fmt.Errorf("failed to reconfigure pgbouncer primary %s", err)
69130
}

pkg/flycheck/role.go

+6-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,12 @@ func PostgreSQLRole(ctx context.Context, checks *check.CheckSuite) (*check.Check
3838

3939
switch member.Role {
4040
case flypg.PrimaryRoleName:
41-
return "primary", nil
41+
if member.Active {
42+
return "primary", nil
43+
} else {
44+
return "zombie", nil
45+
}
46+
4247
case flypg.StandbyRoleName:
4348
return "replica", nil
4449
default:

pkg/flypg/node.go

+7
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,8 @@ func (n *Node) Init(ctx context.Context) error {
173173
return fmt.Errorf("failed to rejoin cluster: %s", err)
174174
}
175175

176+
// TODO - Wait for target cluster to register self as a standby.
177+
176178
if err := removeZombieLock(); err != nil {
177179
return fmt.Errorf("failed to remove zombie lock: %s", err)
178180
}
@@ -231,6 +233,10 @@ func (n *Node) Init(ctx context.Context) error {
231233
return fmt.Errorf("failed to configure postgres: %s", err)
232234
}
233235

236+
if err := setDirOwnership(); err != nil {
237+
return err
238+
}
239+
234240
return nil
235241
}
236242

@@ -488,6 +494,7 @@ func (n *Node) configure(ctx context.Context, store *state.Store) error {
488494
}
489495

490496
// Clear target and wait for primary resolution
497+
fmt.Println("Disabling PGBouncer until primary is resolved")
491498
if err := n.PGBouncer.ConfigurePrimary(ctx, "", false); err != nil {
492499
return fmt.Errorf("failed to set pgbouncer target: %s", err)
493500
}

0 commit comments

Comments
 (0)