From 3471ef133daed5b8e7608fe66da33ef9bbdab5e6 Mon Sep 17 00:00:00 2001 From: Marek Siarkowicz Date: Wed, 20 Dec 2023 15:32:55 +0100 Subject: [PATCH] Add an e2e test and robustness failpoint around recovering from snapshot backend Signed-off-by: Marek Siarkowicz --- server/etcdserver/server.go | 2 +- tests/e2e/leader_snapshot_no_proxy_test.go | 98 ++++++++++++++++++++++ tests/robustness/failpoint/failpoint.go | 1 + tests/robustness/failpoint/gofail.go | 1 + tests/robustness/failpoint/network.go | 4 +- 5 files changed, 103 insertions(+), 3 deletions(-) create mode 100644 tests/e2e/leader_snapshot_no_proxy_test.go diff --git a/server/etcdserver/server.go b/server/etcdserver/server.go index 00c385674b8..3f75a9899ab 100644 --- a/server/etcdserver/server.go +++ b/server/etcdserver/server.go @@ -1001,7 +1001,7 @@ func (s *EtcdServer) applySnapshot(ep *etcdProgress, toApply *toApply) { // wait for raftNode to persist snapshot onto the disk <-toApply.notifyc - // gofail: var beforeOpenSnapshotBackend struct{} + // gofail: var applyBeforeOpenSnapshot struct{} newbe, err := serverstorage.OpenSnapshotBackend(s.Cfg, s.snapshotter, toApply.snapshot, s.beHooks) if err != nil { lg.Panic("failed to open snapshot backend", zap.Error(err)) diff --git a/tests/e2e/leader_snapshot_no_proxy_test.go b/tests/e2e/leader_snapshot_no_proxy_test.go new file mode 100644 index 00000000000..1dffbe956d9 --- /dev/null +++ b/tests/e2e/leader_snapshot_no_proxy_test.go @@ -0,0 +1,98 @@ +// Copyright 2016 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !cluster_proxy + +package e2e + +import ( + "context" + "sync" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap" + + clientv3 "go.etcd.io/etcd/client/v3" + "go.etcd.io/etcd/pkg/v3/expect" + "go.etcd.io/etcd/tests/v3/framework/config" + "go.etcd.io/etcd/tests/v3/framework/e2e" + "go.etcd.io/etcd/tests/v3/robustness/failpoint" +) + +func TestRecoverSnapshotBackend(t *testing.T) { + e2e.BeforeTest(t) + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + epc, err := e2e.NewEtcdProcessCluster(ctx, t, + e2e.WithClusterSize(3), + e2e.WithKeepDataDir(true), + e2e.WithPeerProxy(true), + e2e.WithSnapshotCatchUpEntries(50), + e2e.WithSnapshotCount(50), + e2e.WithGoFailEnabled(true), + e2e.WithIsPeerTLS(true), + ) + require.NoError(t, err) + + defer epc.Close() + + blackholedMember := epc.Procs[0] + otherMember := epc.Procs[1] + + wg := sync.WaitGroup{} + + trafficCtx, trafficCancel := context.WithCancel(ctx) + c, err := clientv3.New(clientv3.Config{ + Endpoints: otherMember.EndpointsGRPC(), + Logger: zap.NewNop(), + DialKeepAliveTime: 10 * time.Second, + DialKeepAliveTimeout: 100 * time.Millisecond, + }) + require.NoError(t, err) + defer c.Close() + wg.Add(1) + go func() { + defer wg.Done() + for { + select { + case <-trafficCtx.Done(): + return + default: + } + putCtx, putCancel := context.WithTimeout(trafficCtx, 50*time.Millisecond) + c.Put(putCtx, "a", "b") + putCancel() + time.Sleep(10 * time.Millisecond) + } + }() + + err = blackholedMember.Failpoints().SetupHTTP(ctx, "applyBeforeOpenSnapshot", "panic") + require.NoError(t, err) + err = failpoint.Blackhole(ctx, t, blackholedMember, epc, true) + require.NoError(t, err) + err = blackholedMember.Wait(ctx) + require.NoError(t, err) + trafficCancel() + wg.Wait() + err = blackholedMember.Start(ctx) + require.NoError(t, err) + _, err = blackholedMember.Logs().ExpectWithContext(ctx, expect.ExpectedResponse{Value: "Recovering from snapshot backend"}) + assert.NoError(t, err) + err = blackholedMember.Etcdctl().Put(ctx, "a", "1", config.PutOptions{}) + assert.NoError(t, err) +} diff --git a/tests/robustness/failpoint/failpoint.go b/tests/robustness/failpoint/failpoint.go index de5bfe53e9f..12a72b69d96 100644 --- a/tests/robustness/failpoint/failpoint.go +++ b/tests/robustness/failpoint/failpoint.go @@ -50,6 +50,7 @@ var ( DropPeerNetwork, RaftBeforeSaveSleep, RaftAfterSaveSleep, + ApplyBeforeOpenSnapshot, } ) diff --git a/tests/robustness/failpoint/gofail.go b/tests/robustness/failpoint/gofail.go index 7c12945ef6a..3d90c5ddd8f 100644 --- a/tests/robustness/failpoint/gofail.go +++ b/tests/robustness/failpoint/gofail.go @@ -53,6 +53,7 @@ var ( RaftAfterWALReleasePanic Failpoint = goPanicFailpoint{"raftAfterWALRelease", triggerBlackhole{waitTillSnapshot: true}, Follower} RaftBeforeSaveSnapPanic Failpoint = goPanicFailpoint{"raftBeforeSaveSnap", triggerBlackhole{waitTillSnapshot: true}, Follower} RaftAfterSaveSnapPanic Failpoint = goPanicFailpoint{"raftAfterSaveSnap", triggerBlackhole{waitTillSnapshot: true}, Follower} + ApplyBeforeOpenSnapshot Failpoint = goPanicFailpoint{"applyBeforeOpenSnapshot", triggerBlackhole{waitTillSnapshot: true}, Follower} BeforeApplyOneConfChangeSleep Failpoint = killAndGofailSleep{"beforeApplyOneConfChange", time.Second} RaftBeforeSaveSleep Failpoint = gofailSleepAndDeactivate{"raftBeforeSave", time.Second} RaftAfterSaveSleep Failpoint = gofailSleepAndDeactivate{"raftAfterSave", time.Second} diff --git a/tests/robustness/failpoint/network.go b/tests/robustness/failpoint/network.go index a765c427ec2..5d59fba3d99 100644 --- a/tests/robustness/failpoint/network.go +++ b/tests/robustness/failpoint/network.go @@ -51,7 +51,7 @@ type triggerBlackhole struct { } func (tb triggerBlackhole) Trigger(ctx context.Context, t *testing.T, member e2e.EtcdProcess, clus *e2e.EtcdProcessCluster) error { - return blackhole(ctx, t, member, clus, tb.waitTillSnapshot) + return Blackhole(ctx, t, member, clus, tb.waitTillSnapshot) } func (tb triggerBlackhole) Available(config e2e.EtcdProcessClusterConfig, process e2e.EtcdProcess) bool { @@ -61,7 +61,7 @@ func (tb triggerBlackhole) Available(config e2e.EtcdProcessClusterConfig, proces return config.ClusterSize > 1 && process.PeerProxy() != nil } -func blackhole(ctx context.Context, t *testing.T, member e2e.EtcdProcess, clus *e2e.EtcdProcessCluster, shouldWaitTillSnapshot bool) error { +func Blackhole(ctx context.Context, t *testing.T, member e2e.EtcdProcess, clus *e2e.EtcdProcessCluster, shouldWaitTillSnapshot bool) error { proxy := member.PeerProxy() // Blackholing will cause peers to not be able to use streamWriters registered with member