diff --git a/.golangci.yml b/.golangci.yml index 0bf5244c2..ad749fde3 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -85,7 +85,9 @@ linters-settings: - alias: pkgctx pkg: github.com/vmware-tanzu/vm-operator/pkg/context - alias: pkgerr - pkg: github.com/vmware-tanzu/vm-operator/pkg/pkgerr + pkg: github.com/vmware-tanzu/vm-operator/pkg/errors + - alias: pkgexit + pkg: github.com/vmware-tanzu/vm-operator/pkg/exit - alias: ctxop pkg: github.com/vmware-tanzu/vm-operator/pkg/context/operation - alias: pkgmgr diff --git a/config/local/vmoperator/local_env_var_patch.yaml b/config/local/vmoperator/local_env_var_patch.yaml index 71771fce2..1237728da 100644 --- a/config/local/vmoperator/local_env_var_patch.yaml +++ b/config/local/vmoperator/local_env_var_patch.yaml @@ -13,6 +13,8 @@ spec: value: "true" - name: ASYNC_CREATE_ENABLED value: "true" + - name: EXIT_DELAY + value: "3s" - name: MEM_STATS_PERIOD value: "10m" - name: VSPHERE_NETWORKING diff --git a/config/wcp/vmoperator/manager_env_var_patch.yaml b/config/wcp/vmoperator/manager_env_var_patch.yaml index ed8e053f9..d09581568 100644 --- a/config/wcp/vmoperator/manager_env_var_patch.yaml +++ b/config/wcp/vmoperator/manager_env_var_patch.yaml @@ -40,6 +40,12 @@ name: PRIVILEGED_USERS value: "" +- op: add + path: /spec/template/spec/containers/0/env/- + value: + name: EXIT_DELAY + value: "3s" + - op: add path: /spec/template/spec/containers/0/env/- value: diff --git a/controllers/infra/capability/configmap/configmap_capability_controller.go b/controllers/infra/capability/configmap/configmap_capability_controller.go index 954ef6c22..e577e6181 100644 --- a/controllers/infra/capability/configmap/configmap_capability_controller.go +++ b/controllers/infra/capability/configmap/configmap_capability_controller.go @@ -20,10 +20,10 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/manager" - "github.com/vmware-tanzu/vm-operator/controllers/infra/capability/exit" pkgcfg "github.com/vmware-tanzu/vm-operator/pkg/config" "github.com/vmware-tanzu/vm-operator/pkg/config/capabilities" pkgctx "github.com/vmware-tanzu/vm-operator/pkg/context" + pkgexit "github.com/vmware-tanzu/vm-operator/pkg/exit" pkgmgr "github.com/vmware-tanzu/vm-operator/pkg/manager" "github.com/vmware-tanzu/vm-operator/pkg/record" kubeutil "github.com/vmware-tanzu/vm-operator/pkg/util/kube" @@ -53,6 +53,7 @@ func AddToManager(ctx *pkgctx.ControllerManagerContext, mgr manager.Manager) err cache, ctrl.Log.WithName("controllers").WithName(controllerName), record.New(mgr.GetEventRecorderFor(controllerNameLong)), + mgr.Elected(), ) // This controller is also run on the non-leaders (webhooks) pods too @@ -89,13 +90,15 @@ func NewReconciler( ctx context.Context, client ctrlclient.Reader, logger logr.Logger, - recorder record.Recorder) *Reconciler { + recorder record.Recorder, + elected <-chan struct{}) *Reconciler { return &Reconciler{ Context: ctx, Client: client, Logger: logger, Recorder: recorder, + Elected: elected, } } @@ -104,6 +107,7 @@ type Reconciler struct { Client ctrlclient.Reader Logger logr.Logger Recorder record.Recorder + Elected <-chan struct{} } // +kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch @@ -120,8 +124,10 @@ func (r *Reconciler) Reconcile( } if capabilities.UpdateCapabilitiesFeatures(ctx, obj) { - r.Logger.Info("killing pod due to changed capabilities") - exit.Exit() + pkgexit.Exit( + logr.NewContext(ctx, r.Logger), + "capabilities have changed", + r.Elected) } return ctrl.Result{}, nil diff --git a/controllers/infra/capability/configmap/configmap_capability_controller_suite_test.go b/controllers/infra/capability/configmap/configmap_capability_controller_suite_test.go index 3f838c9df..8d53cb6a9 100644 --- a/controllers/infra/capability/configmap/configmap_capability_controller_suite_test.go +++ b/controllers/infra/capability/configmap/configmap_capability_controller_suite_test.go @@ -7,21 +7,26 @@ package capability_test import ( "sync/atomic" "testing" + "time" . "github.com/onsi/ginkgo/v2" capability "github.com/vmware-tanzu/vm-operator/controllers/infra/capability/configmap" - "github.com/vmware-tanzu/vm-operator/controllers/infra/capability/exit" pkgcfg "github.com/vmware-tanzu/vm-operator/pkg/config" + pkgexit "github.com/vmware-tanzu/vm-operator/pkg/exit" "github.com/vmware-tanzu/vm-operator/pkg/manager" "github.com/vmware-tanzu/vm-operator/test/builder" ) -var numExits int32 +var ( + numExits int32 + exited = make(chan struct{}) +) func init() { - exit.Exit = func() { + pkgexit.ExitFn = func(_ time.Duration) { atomic.AddInt32(&numExits, 1) + exited <- struct{}{} } } @@ -30,6 +35,7 @@ var suite = builder.NewTestSuiteForControllerWithContext( pkgcfg.NewContextWithDefaultConfig(), func(config *pkgcfg.Config) { config.Features.SVAsyncUpgrade = false + config.ExitDelay = time.Millisecond * 100 }, ), capability.AddToManager, diff --git a/controllers/infra/capability/configmap/configmap_capability_controller_test.go b/controllers/infra/capability/configmap/configmap_capability_controller_test.go index 18c19b601..c3718679f 100644 --- a/controllers/infra/capability/configmap/configmap_capability_controller_test.go +++ b/controllers/infra/capability/configmap/configmap_capability_controller_test.go @@ -28,6 +28,7 @@ var _ = Describe( testlabels.EnvTest, testlabels.V1Alpha3, ), + Ordered, func() { var ( @@ -61,6 +62,12 @@ var _ = Describe( }) When("the capabilities have changed", func() { + JustBeforeEach(func() { + Eventually(exited).Should(Receive()) + }) + AfterEach(func() { + Consistently(exited).ShouldNot(Receive()) + }) When("some capabilities are enabled", func() { BeforeEach(func() { pkgcfg.SetContext( @@ -127,12 +134,20 @@ var _ = Describe( }) JustBeforeEach(func() { + Eventually(exited).Should(Receive()) + Eventually(func(g Gomega) { g.Expect(pkgcfg.FromContext(suite.Context).Features.TKGMultipleCL).To(BeTrue()) }, time.Second*5).Should(Succeed()) obj.Data[capabilities.CapabilityKeyTKGMultipleContentLibraries] = "false" Expect(ctx.Client.Update(ctx, obj)).To(Succeed()) + + Eventually(exited).Should(Receive()) + }) + + AfterEach(func() { + Consistently(exited).ShouldNot(Receive()) }) Specify("the pod was exited once on create and once on update", func() { @@ -162,6 +177,9 @@ var _ = Describe( capabilities.CapabilityKeyWorkloadIsolation: "false", } }) + JustBeforeEach(func() { + Consistently(exited).ShouldNot(Receive()) + }) Specify("the pod was not exited and features were not updated", func() { Consistently(func(g Gomega) { g.Expect(atomic.LoadInt32(&numExits)).To(Equal(int32(0))) diff --git a/controllers/infra/capability/crd/crd_capability_controller.go b/controllers/infra/capability/crd/crd_capability_controller.go index 5a5348481..caeb305f2 100644 --- a/controllers/infra/capability/crd/crd_capability_controller.go +++ b/controllers/infra/capability/crd/crd_capability_controller.go @@ -16,11 +16,11 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/predicate" - "github.com/vmware-tanzu/vm-operator/controllers/infra/capability/exit" capv1 "github.com/vmware-tanzu/vm-operator/external/capabilities/api/v1alpha1" pkgcfg "github.com/vmware-tanzu/vm-operator/pkg/config" "github.com/vmware-tanzu/vm-operator/pkg/config/capabilities" pkgctx "github.com/vmware-tanzu/vm-operator/pkg/context" + pkgexit "github.com/vmware-tanzu/vm-operator/pkg/exit" "github.com/vmware-tanzu/vm-operator/pkg/record" "github.com/vmware-tanzu/vm-operator/pkg/util/ptr" ) @@ -39,6 +39,7 @@ func AddToManager(ctx *pkgctx.ControllerManagerContext, mgr manager.Manager) err mgr.GetClient(), ctrl.Log.WithName("controllers").WithName(controllerName), record.New(mgr.GetEventRecorderFor(controllerNameLong)), + mgr.Elected(), ) return ctrl.NewControllerManagedBy(mgr). @@ -66,13 +67,15 @@ func NewReconciler( ctx context.Context, client ctrlclient.Client, logger logr.Logger, - recorder record.Recorder) *Reconciler { + recorder record.Recorder, + elected <-chan struct{}) *Reconciler { return &Reconciler{ Context: ctx, Client: client, Logger: logger, Recorder: recorder, + Elected: elected, } } @@ -81,6 +84,7 @@ type Reconciler struct { Client ctrlclient.Client Logger logr.Logger Recorder record.Recorder + Elected <-chan struct{} } // +kubebuilder:rbac:groups=iaas.vmware.com,resources=capabilities,verbs=get;list;watch @@ -98,8 +102,10 @@ func (r *Reconciler) Reconcile( } if capabilities.UpdateCapabilitiesFeatures(ctx, obj) { - r.Logger.Info("killing pod due to changed capabilities") - exit.Exit() + pkgexit.Exit( + logr.NewContext(ctx, r.Logger), + "capabilities have changed", + r.Elected) } return ctrl.Result{}, nil diff --git a/controllers/infra/capability/crd/crd_capability_controller_suite_test.go b/controllers/infra/capability/crd/crd_capability_controller_suite_test.go index a5418ae26..f99d9aea1 100644 --- a/controllers/infra/capability/crd/crd_capability_controller_suite_test.go +++ b/controllers/infra/capability/crd/crd_capability_controller_suite_test.go @@ -7,21 +7,26 @@ package capability_test import ( "sync/atomic" "testing" + "time" . "github.com/onsi/ginkgo/v2" capability "github.com/vmware-tanzu/vm-operator/controllers/infra/capability/crd" - "github.com/vmware-tanzu/vm-operator/controllers/infra/capability/exit" pkgcfg "github.com/vmware-tanzu/vm-operator/pkg/config" + pkgexit "github.com/vmware-tanzu/vm-operator/pkg/exit" "github.com/vmware-tanzu/vm-operator/pkg/manager" "github.com/vmware-tanzu/vm-operator/test/builder" ) -var numExits int32 +var ( + numExits int32 + exited = make(chan struct{}) +) func init() { - exit.Exit = func() { + pkgexit.ExitFn = func(_ time.Duration) { atomic.AddInt32(&numExits, 1) + exited <- struct{}{} } } @@ -30,6 +35,7 @@ var suite = builder.NewTestSuiteForControllerWithContext( pkgcfg.NewContextWithDefaultConfig(), func(config *pkgcfg.Config) { config.Features.SVAsyncUpgrade = true + config.ExitDelay = time.Millisecond * 100 }, ), capability.AddToManager, diff --git a/controllers/infra/capability/crd/crd_capability_controller_test.go b/controllers/infra/capability/crd/crd_capability_controller_test.go index 6ce84a79b..c4c1b2be0 100644 --- a/controllers/infra/capability/crd/crd_capability_controller_test.go +++ b/controllers/infra/capability/crd/crd_capability_controller_test.go @@ -28,6 +28,7 @@ var _ = Describe( testlabels.EnvTest, testlabels.V1Alpha3, ), + Ordered, func() { var ( @@ -64,6 +65,13 @@ var _ = Describe( }) When("the capabilities have changed", func() { + JustBeforeEach(func() { + Eventually(exited).Should(Receive()) + }) + AfterEach(func() { + Consistently(exited).ShouldNot(Receive()) + }) + When("some capabilities are enabled", func() { BeforeEach(func() { pkgcfg.SetContext( @@ -164,6 +172,8 @@ var _ = Describe( }) JustBeforeEach(func() { + Eventually(exited).Should(Receive()) + Eventually(func(g Gomega) { g.Expect(pkgcfg.FromContext(suite.Context).Features.BringYourOwnEncryptionKey).To(BeTrue()) g.Expect(pkgcfg.FromContext(suite.Context).Features.TKGMultipleCL).To(BeTrue()) @@ -174,6 +184,12 @@ var _ = Describe( Activated: false, } Expect(ctx.Client.Status().Update(ctx, obj)).To(Succeed()) + + Eventually(exited).Should(Receive()) + }) + + AfterEach(func() { + Consistently(exited).ShouldNot(Receive()) }) Specify("the pod was exited once on create and once on update", func() { @@ -214,6 +230,9 @@ var _ = Describe( }, } }) + JustBeforeEach(func() { + Consistently(exited).ShouldNot(Receive()) + }) Specify("the pod was not exited and features were not updated", func() { Consistently(func(g Gomega) { g.Expect(atomic.LoadInt32(&numExits)).To(Equal(int32(0))) diff --git a/controllers/infra/capability/exit/exit.go b/controllers/infra/capability/exit/exit.go deleted file mode 100644 index 7c279c45b..000000000 --- a/controllers/infra/capability/exit/exit.go +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) 2024 Broadcom. All Rights Reserved. -// Broadcom Confidential. The term "Broadcom" refers to Broadcom Inc. -// and/or its subsidiaries. - -package exit - -import ( - "os" -) - -// Exit is used in testing to assert the capability controller exits the process -// when the capabilities have changed. -var Exit func() - -func init() { - Exit = func() { os.Exit(1) } -} diff --git a/main.go b/main.go index aa3081a3b..4fa2f462a 100644 --- a/main.go +++ b/main.go @@ -34,6 +34,7 @@ import ( pkgcfg "github.com/vmware-tanzu/vm-operator/pkg/config" "github.com/vmware-tanzu/vm-operator/pkg/config/capabilities" pkgctx "github.com/vmware-tanzu/vm-operator/pkg/context" + pkgexit "github.com/vmware-tanzu/vm-operator/pkg/exit" pkgmgr "github.com/vmware-tanzu/vm-operator/pkg/manager" pkgmgrinit "github.com/vmware-tanzu/vm-operator/pkg/manager/init" "github.com/vmware-tanzu/vm-operator/pkg/mem" @@ -72,6 +73,8 @@ func main() { "buildtype", pkg.BuildType, "commit", pkg.BuildCommit) + initExitFn() + initContext() initFlags() @@ -128,6 +131,10 @@ func initMemStats() { metrics.Registry.MustRegister) } +func initExitFn() { + pkgexit.ExitFn = func(_ time.Duration) { os.Exit(1) } +} + func initContext() { ctx = pkgcfg.WithConfig(defaultConfig) ctx = cource.WithContext(ctx) diff --git a/pkg/config/config.go b/pkg/config/config.go index e411e65f5..1cf5991e3 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -106,6 +106,14 @@ type Config struct { // such as passwords. Defaults to false. LogSensitiveData bool + // ExitDelay may be set to a time.Duration value and is the maximum delay + // used when exiting the pod due to an event that requires the pod be + // restarted, ex. the capabilities have changed. The exit will be delayed + // by some duration between 0-ExitDelay. + // + // Defaults to 3s. + ExitDelay time.Duration + // AsyncSignalEnabled may be set to false to disable the vm-watcher service // used to reconcile VirtualMachine objects if their backend state has // changed. diff --git a/pkg/config/default.go b/pkg/config/default.go index e67f0a548..d1349ab53 100644 --- a/pkg/config/default.go +++ b/pkg/config/default.go @@ -39,6 +39,7 @@ func Default() Config { LeaderElectionID: defaultPrefix + "controller-manager-runtime", MaxCreateVMsOnProvider: 80, MaxConcurrentReconciles: 1, + ExitDelay: 3 * time.Second, AsyncSignalEnabled: true, AsyncCreateEnabled: true, MemStatsPeriod: 10 * time.Minute, diff --git a/pkg/config/env.go b/pkg/config/env.go index 30a71a5ff..4604552e5 100644 --- a/pkg/config/env.go +++ b/pkg/config/env.go @@ -29,6 +29,7 @@ func FromEnv() Config { setBool(env.VSphereNetworking, &config.VSphereNetworking) setStringSlice(env.PrivilegedUsers, &config.PrivilegedUsers) setBool(env.LogSensitiveData, &config.LogSensitiveData) + setDuration(env.ExitDelay, &config.ExitDelay) setBool(env.AsyncSignalEnabled, &config.AsyncSignalEnabled) setBool(env.AsyncCreateEnabled, &config.AsyncCreateEnabled) setDuration(env.MemStatsPeriod, &config.MemStatsPeriod) diff --git a/pkg/config/env/env.go b/pkg/config/env/env.go index b9206cdc7..689b1e9cd 100644 --- a/pkg/config/env/env.go +++ b/pkg/config/env/env.go @@ -14,6 +14,7 @@ type VarName uint8 const ( _varNameBegin VarName = iota + ExitDelay DefaultVMClassControllerName MaxCreateVMsOnProvider CreateVMRequeueDelay @@ -89,6 +90,8 @@ func All() []VarName { //nolint:gocyclo func (n VarName) String() string { switch n { + case ExitDelay: + return "EXIT_DELAY" case DefaultVMClassControllerName: return "DEFAULT_VM_CLASS_CONTROLLER_NAME" case MaxCreateVMsOnProvider: diff --git a/pkg/config/env_test.go b/pkg/config/env_test.go index b3015984b..56d8a1545 100644 --- a/pkg/config/env_test.go +++ b/pkg/config/env_test.go @@ -108,6 +108,7 @@ var _ = Describe( Expect(os.Setenv("POWERED_ON_VM_HAS_IP_REQUEUE_DELAY", "126h")).To(Succeed()) Expect(os.Setenv("MEM_STATS_PERIOD", "127h")).To(Succeed()) Expect(os.Setenv("SYNC_IMAGE_REQUEUE_DELAY", "128h")).To(Succeed()) + Expect(os.Setenv("EXIT_DELAY", "129h")).To(Succeed()) }) It("Should return a default config overridden by the environment", func() { Expect(config).To(BeComparableTo(pkgcfg.Config{ @@ -162,6 +163,7 @@ var _ = Describe( PoweredOnVMHasIPRequeueDelay: 126 * time.Hour, MemStatsPeriod: 127 * time.Hour, SyncImageRequeueDelay: 128 * time.Hour, + ExitDelay: 129 * time.Hour, })) }) }) diff --git a/pkg/exit/exit.go b/pkg/exit/exit.go new file mode 100644 index 000000000..3a975e846 --- /dev/null +++ b/pkg/exit/exit.go @@ -0,0 +1,53 @@ +// Copyright (c) 2024 Broadcom. All Rights Reserved. +// Broadcom Confidential. The term "Broadcom" refers to Broadcom Inc. +// and/or its subsidiaries. + +package exit + +import ( + "context" + "math/rand/v2" + "time" + + "github.com/go-logr/logr" + + pkgcfg "github.com/vmware-tanzu/vm-operator/pkg/config" +) + +// ExitFn is used in testing to assert the Exit() function is called. +// The default value is a no-op. +var ExitFn func(exitDelay time.Duration) + +func init() { + ExitFn = func(_ time.Duration) {} +} + +// Exit will exit the pod immediately if it is not the leader, otherwise if +// the pod is the leader or leader election is not enabled, the pod will exit +// after a brief delay. +func Exit( + ctx context.Context, + reason string, + elected <-chan struct{}) { + + var exitDelay time.Duration + select { + case <-elected: + // When the leader or when there is no leader election, delay the + // exit up to some random amount of time. + n := rand.Float64() //nolint:gosec + exitDelay = time.Duration(n * float64(pkgcfg.FromContext(ctx).ExitDelay)) + default: + // When not the leader, exit immediately. This should allow one of + // the non-leader pods to come back online to become the leader when + // the leader pod exits. + } + if exitDelay > 0 { + time.Sleep(exitDelay) + } + + logr.FromContextOrDiscard(ctx). + Info("Exiting pod", "exitDelay", exitDelay, "reason", reason) + + ExitFn(exitDelay) +} diff --git a/pkg/exit/exit_suite_test.go b/pkg/exit/exit_suite_test.go new file mode 100644 index 000000000..b3df9034a --- /dev/null +++ b/pkg/exit/exit_suite_test.go @@ -0,0 +1,17 @@ +// © Broadcom. All Rights Reserved. +// The term “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. +// SPDX-License-Identifier: Apache-2.0 + +package exit_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestExit(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Exit Suite") +} diff --git a/pkg/exit/exit_test.go b/pkg/exit/exit_test.go new file mode 100644 index 000000000..f887200a8 --- /dev/null +++ b/pkg/exit/exit_test.go @@ -0,0 +1,80 @@ +// © Broadcom. All Rights Reserved. +// The term “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. +// SPDX-License-Identifier: Apache-2.0 + +package exit_test + +import ( + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + pkgcfg "github.com/vmware-tanzu/vm-operator/pkg/config" + pkgexit "github.com/vmware-tanzu/vm-operator/pkg/exit" +) + +var _ = DescribeTable("Exit", Ordered, MustPassRepeatedly(5), + func(exitDelay time.Duration, leaderOrNoElection bool) { + + uniqueExitDelayValues := map[time.Duration]struct{}{} + + fn := func() { + var ( + actualExitDelay time.Duration + + ctx = pkgcfg.WithConfig(pkgcfg.Config{ExitDelay: exitDelay}) + done = make(chan struct{}) + elected = make(chan struct{}) + ) + + pkgexit.ExitFn = func(d time.Duration) { + actualExitDelay = d + uniqueExitDelayValues[d] = struct{}{} + close(done) + } + + if leaderOrNoElection { + close(elected) + } + + pkgexit.Exit(ctx, "", elected) + <-done + + if leaderOrNoElection { + Expect(actualExitDelay).ToNot(BeZero()) + Expect(actualExitDelay).To(BeNumerically("<=", exitDelay)) + } else { + Expect(actualExitDelay).To(BeZero()) + } + } + + n := 1 + if leaderOrNoElection { + // If the test is for a leader or when leader election is not + // configured, then run the test fn N times and assert there are + // N unique exitDelay values. + n = 10 + } + + for i := 0; i < n; i++ { + fn() + } + Expect(uniqueExitDelayValues).To(HaveLen(n)) + }, + Entry( + "leader election is not configured", + time.Millisecond*100, + true, + ), + Entry( + "leader election is configured and pod is the leader", + time.Millisecond*100, + true, + ), + Entry( + "leader election is configured and pod is not the leader", + time.Millisecond*100, + false, + ), +)