Skip to content

Commit 4a3b553

Browse files
authored
Revert "switch node names to machine ID (#251)" (#258)
This reverts commit a39f76f.
1 parent a39f76f commit 4a3b553

File tree

9 files changed

+47
-220
lines changed

9 files changed

+47
-220
lines changed

bin/restart-repmgrd

+1-8
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,3 @@
11
#!/bin/bash
22

3-
if [ -f /tmp/repmgrd.pid ]; then
4-
PID=$(cat /tmp/repmgrd.pid)
5-
6-
# Check if the process is running
7-
if ps -p $PID > /dev/null 2>&1; then
8-
kill $PID
9-
fi
10-
fi
3+
kill `cat /tmp/repmgrd.pid`

cmd/pg_unregister/main.go

+1-10
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package main
33
import (
44
"context"
55
"encoding/base64"
6-
"errors"
76
"fmt"
87
"log"
98
"os"
@@ -45,15 +44,7 @@ func processUnregistration(ctx context.Context) error {
4544
defer func() { _ = conn.Close(ctx) }()
4645

4746
member, err := node.RepMgr.MemberByHostname(ctx, conn, string(hostnameBytes))
48-
if errors.Is(err, pgx.ErrNoRows) {
49-
// for historical reasons, old versions of flyctl passes in the 6pn as the hostname
50-
// most likely this won't work because the hostname does not resolve if the machine is stopped,
51-
// but we try anyway
52-
member, err = node.RepMgr.MemberBy6PN(ctx, conn, string(hostnameBytes))
53-
if err != nil {
54-
return fmt.Errorf("failed to resolve member by hostname and 6pn: %s", err)
55-
}
56-
} else if err != nil {
47+
if err != nil {
5748
return fmt.Errorf("failed to resolve member: %s", err)
5849
}
5950

go.mod

+2-2
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,8 @@ require (
88
github.com/hashicorp/consul/api v1.18.0
99
github.com/jackc/pgconn v1.14.3
1010
github.com/jackc/pgx/v5 v5.5.4
11-
github.com/olekukonko/tablewriter v0.0.5
1211
github.com/pkg/errors v0.9.1
1312
github.com/pkg/term v1.1.0
14-
github.com/spf13/cobra v1.8.1
1513
github.com/superfly/fly-checks v0.0.0-20230510154016-d189351293f2
1614
golang.org/x/exp v0.0.0-20230105202349-8879d0199aa3
1715
golang.org/x/sync v0.1.0
@@ -38,6 +36,8 @@ require (
3836
github.com/mattn/go-runewidth v0.0.9 // indirect
3937
github.com/mitchellh/go-homedir v1.1.0 // indirect
4038
github.com/mitchellh/mapstructure v1.4.1 // indirect
39+
github.com/olekukonko/tablewriter v0.0.5 // indirect
40+
github.com/spf13/cobra v1.8.1 // indirect
4141
github.com/spf13/pflag v1.0.5 // indirect
4242
github.com/stretchr/objx v0.5.0 // indirect
4343
golang.org/x/crypto v0.20.0 // indirect

internal/flypg/node.go

+3-62
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,10 @@ import (
1616
"github.com/fly-apps/postgres-flex/internal/privnet"
1717
"github.com/fly-apps/postgres-flex/internal/utils"
1818
"github.com/jackc/pgx/v5"
19-
"golang.org/x/exp/slices"
2019
)
2120

2221
type Node struct {
2322
AppName string
24-
MachineID string
2523
PrivateIP string
2624
PrimaryRegion string
2725
DataDir string
@@ -54,8 +52,6 @@ func NewNode() (*Node, error) {
5452

5553
node.PrivateIP = ipv6.String()
5654

57-
node.MachineID = os.Getenv("FLY_MACHINE_ID")
58-
5955
node.PrimaryRegion = os.Getenv("PRIMARY_REGION")
6056
if node.PrimaryRegion == "" {
6157
return nil, fmt.Errorf("PRIMARY_REGION environment variable must be set")
@@ -93,7 +89,6 @@ func NewNode() (*Node, error) {
9389
PasswordConfigPath: "/data/.pgpass",
9490
DataDir: node.DataDir,
9591
PrivateIP: node.PrivateIP,
96-
MachineID: node.MachineID,
9792
Port: 5433,
9893
DatabaseName: "repmgr",
9994
Credentials: node.ReplCredentials,
@@ -270,7 +265,7 @@ func (n *Node) PostInit(ctx context.Context) error {
270265
return fmt.Errorf("failed to resolve member role: %s", err)
271266
}
272267

273-
// Restart repmgrd in the event the machine ID changes for an already registered node.
268+
// Restart repmgrd in the event the IP changes for an already registered node.
274269
// This can happen if the underlying volume is moved to a different node.
275270
daemonRestartRequired := n.RepMgr.daemonRestartRequired(member)
276271

@@ -284,8 +279,6 @@ func (n *Node) PostInit(ctx context.Context) error {
284279
if err := Quarantine(ctx, n, primary); err != nil {
285280
return fmt.Errorf("failed to quarantine failed primary: %s", err)
286281
}
287-
288-
panic(err)
289282
} else if errors.Is(err, ErrZombieDiscovered) {
290283
log.Printf("[ERROR] The majority of registered members agree that '%s' is the real primary.\n", primary)
291284
// Turn member read-only
@@ -299,10 +292,10 @@ func (n *Node) PostInit(ctx context.Context) error {
299292
}
300293

301294
// This should never happen
302-
if primary != n.RepMgr.machineIdToDNS(n.MachineID) {
295+
if primary != n.PrivateIP {
303296
return fmt.Errorf("resolved primary '%s' does not match ourself '%s'. this should not happen",
304297
primary,
305-
n.RepMgr.machineIdToDNS(n.MachineID),
298+
n.PrivateIP,
306299
)
307300
}
308301

@@ -318,11 +311,6 @@ func (n *Node) PostInit(ctx context.Context) error {
318311
}
319312
}
320313
case StandbyRoleName:
321-
if err := n.migrateNodeNameIfNeeded(ctx, repConn); err != nil {
322-
log.Printf("[ERROR] failed to migrate node name: %s", err)
323-
// We try to bring the standby up anyway
324-
}
325-
326314
// Register existing standby to apply any configuration changes.
327315
if err := n.RepMgr.registerStandby(daemonRestartRequired); err != nil {
328316
return fmt.Errorf("failed to register existing standby: %s", err)
@@ -539,50 +527,3 @@ func (n *Node) handleRemoteRestore(ctx context.Context, store *state.Store) erro
539527

540528
return nil
541529
}
542-
543-
// migrate node name from 6pn to machine ID if needed
544-
func (n *Node) migrateNodeNameIfNeeded(ctx context.Context, repConn *pgx.Conn) error {
545-
primary, err := n.RepMgr.PrimaryMember(ctx, repConn)
546-
if err != nil {
547-
return fmt.Errorf("failed to resolve primary member when updating standby: %s", err)
548-
}
549-
550-
primaryConn, err := n.RepMgr.NewRemoteConnection(ctx, primary.Hostname)
551-
if err != nil {
552-
return fmt.Errorf("failed to establish connection to primary: %s", err)
553-
}
554-
defer func() { _ = primaryConn.Close(ctx) }()
555-
556-
rows, err := primaryConn.Query(ctx, "select application_name from pg_stat_replication")
557-
if err != nil {
558-
return fmt.Errorf("failed to query pg_stat_replication: %s", err)
559-
}
560-
defer rows.Close()
561-
562-
var applicationNames []string
563-
for rows.Next() {
564-
var applicationName string
565-
if err := rows.Scan(&applicationName); err != nil {
566-
return fmt.Errorf("failed to scan application_name: %s", err)
567-
}
568-
applicationNames = append(applicationNames, applicationName)
569-
}
570-
if err := rows.Err(); err != nil {
571-
return fmt.Errorf("failed to iterate over rows: %s", err)
572-
}
573-
574-
// if we find our 6pn as application_name, we need to regenerate postgresql.auto.conf and reload postgresql
575-
if slices.Contains(applicationNames, n.PrivateIP) {
576-
log.Printf("pg_stat_replication on the primary has our ipv6 address as application_name, converting to machine ID...")
577-
578-
if err := n.RepMgr.regenReplicationConf(ctx); err != nil {
579-
return fmt.Errorf("failed to clone standby: %s", err)
580-
}
581-
582-
if err := admin.ReloadPostgresConfig(ctx, repConn); err != nil {
583-
return fmt.Errorf("failed to reload postgresql: %s", err)
584-
}
585-
}
586-
587-
return nil
588-
}

internal/flypg/readonly.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ func BroadcastReadonlyChange(ctx context.Context, n *Node, enabled bool) error {
7070

7171
for _, member := range members {
7272
if member.Role == PrimaryRoleName {
73-
endpoint := fmt.Sprintf("http://%s:5500/%s", member.Hostname, target)
73+
endpoint := fmt.Sprintf("http://[%s]:5500/%s", member.Hostname, target)
7474
resp, err := http.Get(endpoint)
7575
if err != nil {
7676
log.Printf("[WARN] Failed to broadcast readonly state change to member %s: %s", member.Hostname, err)
@@ -85,7 +85,7 @@ func BroadcastReadonlyChange(ctx context.Context, n *Node, enabled bool) error {
8585
}
8686

8787
for _, member := range members {
88-
endpoint := fmt.Sprintf("http://%s:5500/%s", member.Hostname, RestartHaproxyEndpoint)
88+
endpoint := fmt.Sprintf("http://[%s]:5500/%s", member.Hostname, RestartHaproxyEndpoint)
8989
resp, err := http.Get(endpoint)
9090
if err != nil {
9191
log.Printf("[WARN] Failed to restart haproxy on member %s: %s", member.Hostname, err)

internal/flypg/repmgr.go

+10-81
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ type RepMgr struct {
3434
PrimaryRegion string
3535
Region string
3636
PrivateIP string
37-
MachineID string
3837
DataDir string
3938
DatabaseName string
4039
Credentials admin.Credential
@@ -162,12 +161,10 @@ func (r *RepMgr) setDefaults() error {
162161
return err
163162
}
164163

165-
hostname := r.machineIdToDNS(r.MachineID)
166-
167164
conf := ConfigMap{
168165
"node_id": nodeID,
169-
"node_name": fmt.Sprintf("'%s'", hostname),
170-
"conninfo": fmt.Sprintf("'host=%s port=%d user=%s dbname=%s connect_timeout=5'", hostname, r.Port, r.Credentials.Username, r.DatabaseName),
166+
"node_name": fmt.Sprintf("'%s'", r.PrivateIP),
167+
"conninfo": fmt.Sprintf("'host=%s port=%d user=%s dbname=%s connect_timeout=5'", r.PrivateIP, r.Port, r.Credentials.Username, r.DatabaseName),
171168
"data_directory": fmt.Sprintf("'%s'", r.DataDir),
172169
"failover": "'automatic'",
173170
"use_replication_slots": "yes",
@@ -279,7 +276,7 @@ func (*RepMgr) restartDaemon() error {
279276
}
280277

281278
func (r *RepMgr) daemonRestartRequired(m *Member) bool {
282-
return m.Hostname != r.MachineID
279+
return m.Hostname != r.PrivateIP
283280
}
284281

285282
func (r *RepMgr) unregisterWitness(id int) error {
@@ -304,14 +301,14 @@ func (r *RepMgr) rejoinCluster(hostname string) error {
304301
return err
305302
}
306303

307-
func (r *RepMgr) clonePrimary(hostname string) error {
304+
func (r *RepMgr) clonePrimary(ipStr string) error {
308305
cmdStr := fmt.Sprintf("mkdir -p %s", r.DataDir)
309306
if _, err := utils.RunCommand(cmdStr, "postgres"); err != nil {
310307
return fmt.Errorf("failed to create pg directory: %s", err)
311308
}
312309

313310
cmdStr = fmt.Sprintf("repmgr -h %s -p %d -d %s -U %s -f %s standby clone -c -F",
314-
hostname,
311+
ipStr,
315312
r.Port,
316313
r.DatabaseName,
317314
r.Credentials.Username,
@@ -325,21 +322,6 @@ func (r *RepMgr) clonePrimary(hostname string) error {
325322
return nil
326323
}
327324

328-
func (r *RepMgr) regenReplicationConf(ctx context.Context) error {
329-
// TODO: do we need -c?
330-
if _, err := utils.RunCmd(ctx, "postgres",
331-
"repmgr", "--replication-conf-only",
332-
"-h", "",
333-
"-p", fmt.Sprint(r.Port),
334-
"-d", r.DatabaseName,
335-
"-U", r.Credentials.Username,
336-
"-f", r.ConfigPath,
337-
"standby", "clone", "-F"); err != nil {
338-
return fmt.Errorf("failed to regenerate replication conf: %s", err)
339-
}
340-
return nil
341-
}
342-
343325
type Member struct {
344326
ID int
345327
Hostname string
@@ -449,56 +431,26 @@ func (*RepMgr) MemberByHostname(ctx context.Context, pg *pgx.Conn, hostname stri
449431
return &member, nil
450432
}
451433

452-
// MemberBy6PN returns a member by its 6PN address.
453-
func (r *RepMgr) MemberBy6PN(ctx context.Context, pg *pgx.Conn, ip string) (*Member, error) {
454-
members, err := r.Members(ctx, pg)
455-
if err != nil {
456-
return nil, err
457-
}
458-
459-
resolver := privnet.GetResolver()
460-
var lastErr error
461-
for _, member := range members {
462-
ips, err := resolver.LookupIPAddr(ctx, member.Hostname)
463-
if err != nil {
464-
lastErr = err
465-
continue
466-
}
467-
468-
for _, addr := range ips {
469-
if addr.IP.String() == ip {
470-
return &member, nil
471-
}
472-
}
473-
}
474-
475-
if lastErr != nil {
476-
return nil, fmt.Errorf("no matches found for %s, and error encountered: %s", ip, lastErr)
477-
}
478-
479-
return nil, nil
480-
}
481-
482434
func (r *RepMgr) ResolveMemberOverDNS(ctx context.Context) (*Member, error) {
483-
machineIds, err := r.InRegionPeerMachines(ctx)
435+
ips, err := r.InRegionPeerIPs(ctx)
484436
if err != nil {
485437
return nil, err
486438
}
487439

488440
var target *Member
489441

490-
for _, machineId := range machineIds {
491-
if machineId == r.MachineID {
442+
for _, ip := range ips {
443+
if ip.String() == r.PrivateIP {
492444
continue
493445
}
494446

495-
conn, err := r.NewRemoteConnection(ctx, r.machineIdToDNS(machineId))
447+
conn, err := r.NewRemoteConnection(ctx, ip.String())
496448
if err != nil {
497449
continue
498450
}
499451
defer func() { _ = conn.Close(ctx) }()
500452

501-
member, err := r.MemberByHostname(ctx, conn, r.machineIdToDNS(machineId))
453+
member, err := r.MemberByHostname(ctx, conn, ip.String())
502454
if err != nil {
503455
continue
504456
}
@@ -525,21 +477,6 @@ func (r *RepMgr) InRegionPeerIPs(ctx context.Context) ([]net.IPAddr, error) {
525477
return privnet.AllPeers(ctx, targets)
526478
}
527479

528-
func (r *RepMgr) InRegionPeerMachines(ctx context.Context) ([]string, error) {
529-
machines, err := privnet.AllMachines(ctx, r.AppName)
530-
if err != nil {
531-
return nil, err
532-
}
533-
534-
var machineIDs []string
535-
for _, machine := range machines {
536-
if machine.Region == r.PrimaryRegion {
537-
machineIDs = append(machineIDs, machine.Id)
538-
}
539-
}
540-
return machineIDs, nil
541-
}
542-
543480
func (r *RepMgr) HostInRegion(ctx context.Context, hostname string) (bool, error) {
544481
ips, err := r.InRegionPeerIPs(ctx)
545482
if err != nil {
@@ -577,11 +514,3 @@ func (r *RepMgr) UnregisterMember(member Member) error {
577514
func (r *RepMgr) eligiblePrimary() bool {
578515
return r.Region == r.PrimaryRegion
579516
}
580-
581-
func (r *RepMgr) machineIdToDNS(nodeName string) string {
582-
if len(nodeName) != 14 {
583-
panic("invalid machine id")
584-
}
585-
586-
return fmt.Sprintf("%s.vm.%s.internal", nodeName, r.AppName)
587-
}

internal/flypg/repmgr_test.go

+2-4
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ func TestRepmgrInitialization(t *testing.T) {
3333
UserConfigPath: repgmrUserConfigFilePath,
3434
PasswordConfigPath: repgmrPasswordConfigFilePath,
3535
DataDir: repmgrTestDirectory,
36-
MachineID: "abcdefg1234567",
3736
PrivateIP: "127.0.0.1",
3837
Credentials: admin.Credential{
3938
Username: "user",
@@ -92,8 +91,8 @@ func TestRepmgrInitialization(t *testing.T) {
9291
t.Fatal(err)
9392
}
9493

95-
if config["node_name"] != "'abcdefg1234567.vm.test-app.internal'" {
96-
t.Fatalf("expected node_name to be 'abcdefg1234567.vm.test-app.internal', got %v", config["node_name"])
94+
if config["node_name"] != "'127.0.0.1'" {
95+
t.Fatalf("expected node_name to be '127.0.0.1', got %v", config["node_name"])
9796
}
9897

9998
if config["location"] != "'dev'" {
@@ -123,7 +122,6 @@ func TestRepmgrNodeIDGeneration(t *testing.T) {
123122

124123
DataDir: repmgrTestDirectory,
125124
PrivateIP: "127.0.0.1",
126-
MachineID: "abcdefg1234567",
127125
Port: 5433,
128126
DatabaseName: "repmgr",
129127
Credentials: admin.Credential{

0 commit comments

Comments
 (0)