From 70171e78963f0108781bc040c94ba3879da344cf Mon Sep 17 00:00:00 2001 From: Abhishek Bansal Date: Tue, 5 Nov 2024 11:24:53 +0530 Subject: [PATCH] fix: unexpected downtime in rollouts Signed-off-by: Abhishek Bansal --- rollout/trafficrouting.go | 22 +++++++++++ rollout/trafficrouting_test.go | 67 ++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/rollout/trafficrouting.go b/rollout/trafficrouting.go index eef25e1c1c..76278a79be 100644 --- a/rollout/trafficrouting.go +++ b/rollout/trafficrouting.go @@ -27,6 +27,7 @@ import ( replicasetutil "github.com/argoproj/argo-rollouts/utils/replicaset" rolloututil "github.com/argoproj/argo-rollouts/utils/rollout" "github.com/argoproj/argo-rollouts/utils/weightutil" + appsv1 "k8s.io/api/apps/v1" ) // NewTrafficRoutingReconciler identifies return the TrafficRouting Plugin that the rollout wants to modify @@ -133,6 +134,23 @@ func (c *Controller) NewTrafficRoutingReconciler(roCtx *rolloutContext) ([]traff return nil, nil } +func (c *rolloutContext) checkReplicasAvailable(rs *appsv1.ReplicaSet, desiredWeight int32) bool { + if rs == nil { + return false + } + availableReplicas := rs.Status.AvailableReplicas + totalReplicas := *c.rollout.Spec.Replicas + + desiredReplicas := (desiredWeight * totalReplicas) / 100 + if availableReplicas < desiredReplicas { + c.log.Infof("ReplicaSet '%s' has %d available replicas, waiting for %d", rs.Name, availableReplicas, desiredReplicas) + return false + } + + return true + +} + // this currently only be used in the canary strategy func (c *rolloutContext) reconcileTrafficRouting() error { reconcilers, err := c.newTrafficRoutingReconciler(c) @@ -243,6 +261,10 @@ func (c *rolloutContext) reconcileTrafficRouting() error { desiredWeight = weightutil.MaxTrafficWeight(c.rollout) } } + + if !c.checkReplicasAvailable(c.stableRS, weightutil.MaxTrafficWeight(c.rollout)-desiredWeight) { + return nil + } // We need to check for revision > 1 because when we first install the rollout we run step 0 this prevents that. // There is a bigger fix needed for the reasons on why we run step 0 on rollout install, that needs to be explored. revision, revisionFound := annotations.GetRevisionAnnotation(c.rollout) diff --git a/rollout/trafficrouting_test.go b/rollout/trafficrouting_test.go index 1022ed09ef..a5bb05af0b 100644 --- a/rollout/trafficrouting_test.go +++ b/rollout/trafficrouting_test.go @@ -1435,3 +1435,70 @@ func TestDontWeightOrHaveManagedRoutesDuringInterruptedUpdate(t *testing.T) { f.fakeTrafficRouting.AssertCalled(t, "RemoveManagedRoutes", mock.Anything, mock.Anything) } + +// This test verifies that if we are shifting traffic to stable replicaset without the stable replicaset being available proportional to the weight, the traffic shouldn't be switched immediately to the stable replicaset. +func TestCheckReplicaSetAvailable(t *testing.T) { + fix := newFixture(t) + defer fix.Close() + + steps := []v1alpha1.CanaryStep{ + { + SetWeight: pointer.Int32(60), + }, + { + Pause: &v1alpha1.RolloutPause{}, + }, + } + + rollout1 := newCanaryRollout("test-rollout", 10, nil, steps, pointer.Int32(1), intstr.FromInt(1), intstr.FromInt(1)) + rollout1.Spec.Strategy.Canary.DynamicStableScale = true + rollout1.Spec.Strategy.Canary.TrafficRouting = &v1alpha1.RolloutTrafficRouting{ + SMI: &v1alpha1.SMITrafficRouting{}, + } + rollout1.Spec.Strategy.Canary.CanaryService = "canary-service" + rollout1.Spec.Strategy.Canary.StableService = "stable-service" + rollout1.Status.ReadyReplicas = 10 + rollout1.Status.AvailableReplicas = 10 + + rollout2 := bumpVersion(rollout1) + + replicaSet1 := newReplicaSetWithStatus(rollout1, 1, 1) + replicaSet2 := newReplicaSetWithStatus(rollout2, 9, 9) + + replicaSet1Hash := replicaSet1.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + replicaSet2Hash := replicaSet2.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + canarySelector := map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: replicaSet2Hash} + stableSelector := map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: replicaSet1Hash} + canarySvc := newService("canary-service", 80, canarySelector, rollout1) + stableSvc := newService("stable-service", 80, stableSelector, rollout1) + + rollout2.Spec = rollout1.Spec + rollout2.Status.StableRS = replicaSet1Hash + rollout2.Status.CurrentPodHash = replicaSet1Hash + rollout2.Status.Canary.Weights = &v1alpha1.TrafficWeights{ + Canary: v1alpha1.WeightDestination{ + Weight: 10, + ServiceName: "canary-service", + PodTemplateHash: replicaSet2Hash, + }, + Stable: v1alpha1.WeightDestination{ + Weight: 90, + ServiceName: "stable-service", + PodTemplateHash: replicaSet1Hash, + }, + } + + fix.kubeobjects = append(fix.kubeobjects, replicaSet1, replicaSet2, canarySvc, stableSvc) + fix.replicaSetLister = append(fix.replicaSetLister, replicaSet1, replicaSet2) + + fix.rolloutLister = append(fix.rolloutLister, rollout2) + fix.objects = append(fix.objects, rollout2) + + fix.expectUpdateReplicaSetAction(replicaSet1) + fix.expectUpdateRolloutAction(rollout2) + fix.expectUpdateReplicaSetAction(replicaSet1) + fix.expectPatchRolloutAction(rollout2) + fix.fakeTrafficRouting = newUnmockedFakeTrafficRoutingReconciler() + fix.fakeTrafficRouting.On("RemoveManagedRoutes", mock.Anything, mock.Anything, mock.Anything).Return(nil) + fix.run(getKey(rollout1, t)) +}