Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clear demotePrimaryStalled field after the function ends #17823

Merged
merged 3 commits into from
Feb 21, 2025
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions go/vt/vttablet/tabletmanager/rpc_replication.go
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,7 @@ func (tm *TabletManager) demotePrimary(ctx context.Context, revertPartialFailure
return nil, err
}
defer tm.unlock()
defer tm.QueryServiceControl.ClearDemotePrimaryStalled()

finishCtx, cancel := context.WithCancel(context.Background())
defer cancel()
Expand All @@ -550,6 +551,12 @@ func (tm *TabletManager) demotePrimary(ctx context.Context, revertPartialFailure
buf := make([]byte, 1<<16) // 64 KB buffer size
stackSize := runtime.Stack(buf, true)
log.Errorf("Stack trace:\n%s", string(buf[:stackSize]))
// This condition check is only to handle the race, where we start to set the demote primary stalled
// but then the function finishes. So, after we set demote primary stalled, we check if the
// function has finished and if it has, we clear the demote primary stalled.
if finishCtx.Err() != nil {
tm.QueryServiceControl.ClearDemotePrimaryStalled()
}
}
}()

Expand Down
28 changes: 22 additions & 6 deletions go/vt/vttablet/tabletmanager/rpc_replication_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,16 +50,20 @@ func TestWaitForGrantsToHaveApplied(t *testing.T) {

type demotePrimaryStallQS struct {
tabletserver.Controller
waitTime time.Duration
qsWaitChan chan any
primaryStalled atomic.Bool
}

func (d *demotePrimaryStallQS) SetDemotePrimaryStalled() {
d.primaryStalled.Store(true)
}

func (d *demotePrimaryStallQS) ClearDemotePrimaryStalled() {
d.primaryStalled.Store(false)
}

func (d *demotePrimaryStallQS) IsServing() bool {
time.Sleep(d.waitTime)
<-d.qsWaitChan
return false
}

Expand All @@ -74,7 +78,7 @@ func TestDemotePrimaryStalled(t *testing.T) {

// Create a fake query service control to intercept calls from DemotePrimary function.
qsc := &demotePrimaryStallQS{
waitTime: 2 * time.Second,
qsWaitChan: make(chan any),
}
// Create a tablet manager with a replica type tablet.
tm := &TabletManager{
Expand All @@ -88,8 +92,20 @@ func TestDemotePrimaryStalled(t *testing.T) {
QueryServiceControl: qsc,
}

// We make IsServing stall for over 2 seconds, which is longer than 10 * remote operation timeout.
go func() {
tm.demotePrimary(context.Background(), false)
}()
// We make IsServing stall by making it wait on a channel.
// This should cause the demote primary operation to be stalled.
tm.demotePrimary(context.Background(), false)
require.True(t, qsc.primaryStalled.Load())
require.Eventually(t, func() bool {
return qsc.primaryStalled.Load()
}, 5*time.Second, 100*time.Millisecond)

// Unblock the DemotePrimary call by closing the channel.
close(qsc.qsWaitChan)

// Eventually demote primary will succeed, and we want the stalled field to be cleared.
require.Eventually(t, func() bool {
return !qsc.primaryStalled.Load()
}, 5*time.Second, 100*time.Millisecond)
}
3 changes: 3 additions & 0 deletions go/vt/vttablet/tabletserver/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,9 @@ type Controller interface {
// SetDemotePrimaryStalled marks that demote primary is stalled in the state manager.
SetDemotePrimaryStalled()

// ClearDemotePrimaryStalled clears the demote primary stalled field in the state manager.
ClearDemotePrimaryStalled()

// IsDiskStalled returns if the disk is stalled.
IsDiskStalled() bool
}
Expand Down
8 changes: 8 additions & 0 deletions go/vt/vttablet/tabletserver/tabletserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -769,6 +769,14 @@ func (tsv *TabletServer) SetDemotePrimaryStalled() {
tsv.BroadcastHealth()
}

// ClearDemotePrimaryStalled clears the demote primary stalled field in the state manager.
func (tsv *TabletServer) ClearDemotePrimaryStalled() {
tsv.sm.mu.Lock()
tsv.sm.demotePrimaryStalled = false
tsv.sm.mu.Unlock()
tsv.BroadcastHealth()
}

// IsDiskStalled returns if the disk is stalled or not.
func (tsv *TabletServer) IsDiskStalled() bool {
return tsv.sm.diskHealthMonitor.IsDiskStalled()
Expand Down
5 changes: 5 additions & 0 deletions go/vt/vttablet/tabletservermock/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,11 @@ func (tqsc *Controller) SetDemotePrimaryStalled() {
tqsc.MethodCalled["SetDemotePrimaryStalled"] = true
}

// ClearDemotePrimaryStalled is part of the tabletserver.Controller interface
func (tqsc *Controller) ClearDemotePrimaryStalled() {
tqsc.MethodCalled["ClearDemotePrimaryStalled"] = true
}

// IsDiskStalled is part of the tabletserver.Controller interface
func (tqsc *Controller) IsDiskStalled() bool {
tqsc.MethodCalled["IsDiskStalled"] = true
Expand Down
Loading