From 46934e7527fbac1a2c7aefe7fa88ecbab639f106 Mon Sep 17 00:00:00 2001 From: Ethan Mosbaugh Date: Tue, 28 Jan 2025 13:24:31 -0800 Subject: [PATCH] conditionally migrate --- .../controllers/installation_controller.go | 202 +++++++++++++++++- operator/pkg/cli/upgrade_job.go | 9 +- 2 files changed, 207 insertions(+), 4 deletions(-) diff --git a/operator/controllers/installation_controller.go b/operator/controllers/installation_controller.go index 8c493d56d..14aa9cb6a 100644 --- a/operator/controllers/installation_controller.go +++ b/operator/controllers/installation_controller.go @@ -26,6 +26,8 @@ import ( apv1b2 "github.com/k0sproject/k0s/pkg/apis/autopilot/v1beta2" k0shelm "github.com/k0sproject/k0s/pkg/apis/helm/v1beta1" + k0sv1beta1 "github.com/k0sproject/k0s/pkg/apis/k0s/v1beta1" + apcore "github.com/k0sproject/k0s/pkg/autopilot/controller/plans/core" "github.com/replicatedhq/embedded-cluster/pkg/kubeutils" "github.com/replicatedhq/embedded-cluster/pkg/runtimeconfig" batchv1 "k8s.io/api/batch/v1" @@ -42,8 +44,14 @@ import ( "sigs.k8s.io/controller-runtime/pkg/handler" "github.com/replicatedhq/embedded-cluster/kinds/apis/v1beta1" + ectypes "github.com/replicatedhq/embedded-cluster/kinds/types" + "github.com/replicatedhq/embedded-cluster/operator/pkg/autopilot" + "github.com/replicatedhq/embedded-cluster/operator/pkg/charts" + "github.com/replicatedhq/embedded-cluster/operator/pkg/k8sutil" "github.com/replicatedhq/embedded-cluster/operator/pkg/metrics" "github.com/replicatedhq/embedded-cluster/operator/pkg/openebs" + "github.com/replicatedhq/embedded-cluster/operator/pkg/registry" + "github.com/replicatedhq/embedded-cluster/operator/pkg/upgrade" "github.com/replicatedhq/embedded-cluster/operator/pkg/util" ) @@ -255,6 +263,169 @@ func (r *InstallationReconciler) ReconcileOpenebs(ctx context.Context, in *v1bet return nil } +// ReconcileRegistry reconciles registry components, ensuring that the necessary secrets are +// created as well as rebalancing stateful pods 
when nodes are removed from the cluster.
+func (r *InstallationReconciler) ReconcileRegistry(ctx context.Context, in *v1beta1.Installation) error {
+	if in == nil || !in.Spec.AirGap || !in.Spec.HighAvailability {
+		// nothing to do: everything below only applies to installations that are both airgapped and HA
+		return nil
+	}
+
+	log := ctrl.LoggerFrom(ctx)
+
+	// fetch the k0s ClusterConfig from kube-system; the object is not read afterwards, so this only acts as a precondition check
+	var clusterConfig k0sv1beta1.ClusterConfig
+	if err := r.Get(ctx, client.ObjectKey{Name: "k0s", Namespace: "kube-system"}, &clusterConfig); err != nil {
+		return fmt.Errorf("failed to get cluster config: %w", err)
+	}
+
+	err := registry.MigrateRegistryData(ctx, in, r.Client)
+	if err != nil { // NOTE(review): presumably MigrateRegistryData records progress on in.Status before failing - confirm
+		if err := r.Status().Update(ctx, in); err != nil { // best-effort status write before bailing out
+			log.Error(err, "Failed to update installation status") // only logged, so the migration error below is the one returned
+		}
+		return fmt.Errorf("failed to migrate registry data: %w", err)
+
+	}
+
+	return nil
+}
+
+// ReconcileHAStatus reconciles the HA migration status condition for the installation.
+// The condition is true only when HA is enabled, the admin-console chart is healthy and the installation state is Installed;
+// airgapped installations additionally need healthy seaweedfs and docker-registry charts and a migrated registry.
+func (r *InstallationReconciler) ReconcileHAStatus(ctx context.Context, in *v1beta1.Installation) error {
+	if in == nil {
+		return nil
+	}
+
+	if !in.Spec.HighAvailability {
+		in.Status.SetCondition(metav1.Condition{
+			Type:               HAConditionType,
+			Status:             metav1.ConditionFalse,
+			Reason:             "HANotEnabled",
+			ObservedGeneration: in.Generation,
+		})
+		return nil
+	}
+
+	if in.Spec.AirGap { // these checks only run for airgapped installations
+		seaweedReady, err := k8sutil.GetChartHealth(ctx, r.Client, "seaweedfs")
+		if err != nil {
+			return fmt.Errorf("failed to check seaweedfs readiness: %w", err)
+		}
+		if !seaweedReady {
+			in.Status.SetCondition(metav1.Condition{
+				Type:               HAConditionType,
+				Status:             metav1.ConditionFalse,
+				Reason:             "SeaweedFSNotReady",
+				ObservedGeneration: in.Generation,
+			})
+			return nil
+		}
+
+		registryMigrated, err := registry.HasRegistryMigrated(ctx, r.Client)
+		if err != nil {
+			return fmt.Errorf("failed to check registry migration status: %w", err)
+		}
+		if !registryMigrated {
+			in.Status.SetCondition(metav1.Condition{
+				Type:               HAConditionType,
+				Status:             metav1.ConditionFalse,
+				Reason:             "RegistryNotMigrated",
+				ObservedGeneration: in.Generation,
+			})
+			return nil
+		}
+
+		registryReady, err := k8sutil.GetChartHealth(ctx, r.Client, "docker-registry")
+		if err != nil {
+			return fmt.Errorf("failed to check docker-registry readiness: %w", err)
+		}
+		if !registryReady {
+			in.Status.SetCondition(metav1.Condition{
+				Type:               HAConditionType,
+				Status:             metav1.ConditionFalse,
+				Reason:             "RegistryNotReady",
+				ObservedGeneration: in.Generation,
+			})
+			return nil
+		}
+	}
+
+	adminConsole, err := k8sutil.GetChartHealth(ctx, r.Client, "admin-console")
+	if err != nil {
+		return fmt.Errorf("failed to check admin-console readiness: %w", err)
+	}
+	if !adminConsole {
+		in.Status.SetCondition(metav1.Condition{
+			Type:               HAConditionType,
+			Status:             metav1.ConditionFalse,
+			Reason:             "AdminConsoleNotReady",
+			ObservedGeneration: in.Generation,
+		})
+		return nil
+	}
+
+	if in.Status.State != v1beta1.InstallationStateInstalled { // charts are healthy but the installation itself is not Installed yet
+		in.Status.SetCondition(metav1.Condition{
+			Type:               HAConditionType,
+			Status:             metav1.ConditionFalse,
+			Reason:             "InstallationNotReady",
+			ObservedGeneration: in.Generation,
+		})
+		return nil
+	}
+
+	in.Status.SetCondition(metav1.Condition{
+		Type:               HAConditionType,
+		Status:             metav1.ConditionTrue,
+		Reason:             "HAReady",
+		ObservedGeneration: in.Generation,
+	})
+
+	return nil
+}
+
+// SetStateBasedOnPlan sets the installation state based on the Plan state: no state yet is Enqueued, schedulable
+// states are Installing, completed is KubernetesInstalled and any other state is Failed (recorded as a Warning event).
+// For now we do not report anything fancy; consider reporting a summary of how many nodes have been upgraded and how many are pending.
+func (r *InstallationReconciler) SetStateBasedOnPlan(in *v1beta1.Installation, plan apv1b2.Plan, desiredVersion string) {
+	reason := autopilot.ReasonForState(plan)
+	switch plan.Status.State {
+	case "": // the plan has not reported a state yet
+		in.Status.SetState(v1beta1.InstallationStateEnqueued, reason, nil)
+	case apcore.PlanIncompleteTargets:
+		fallthrough
+	case apcore.PlanInconsistentTargets:
+		fallthrough
+	case apcore.PlanRestricted:
+		fallthrough
+	case apcore.PlanWarning:
+		fallthrough
+	case apcore.PlanMissingSignalNode:
+		fallthrough
+	case apcore.PlanApplyFailed: // every state above falls through to this shared failure handling
+		r.Recorder.Eventf(in, corev1.EventTypeWarning, "K0sUpgradeFailed", "Upgrade of k0s to %s failed (%q)", desiredVersion, plan.Status.State)
+		in.Status.SetState(v1beta1.InstallationStateFailed, reason, nil)
+	case apcore.PlanSchedulable:
+		fallthrough
+	case apcore.PlanSchedulableWait:
+		in.Status.SetState(v1beta1.InstallationStateInstalling, reason, nil)
+	case apcore.PlanCompleted:
+		r.Recorder.Eventf(in, corev1.EventTypeNormal, "K0sUpgradeComplete", "Upgrade of k0s to %s completed", desiredVersion)
+		in.Status.SetState(v1beta1.InstallationStateKubernetesInstalled, reason, nil)
+	default: // a plan state this controller does not know about is treated as a failure
+		r.Recorder.Eventf(in, corev1.EventTypeWarning, "K0sUpgradeUnknownState", "Upgrade of k0s to %s has an unknown state %q", desiredVersion, plan.Status.State)
+		in.Status.SetState(v1beta1.InstallationStateFailed, reason, nil)
+	}
+}
+
+// 
StartAutopilotUpgrade creates an autopilot plan to upgrade to the version specified in spec.config.version (delegates to upgrade.StartAutopilotUpgrade).
+func (r *InstallationReconciler) StartAutopilotUpgrade(ctx context.Context, in *v1beta1.Installation, meta *ectypes.ReleaseMetadata) error {
+	return upgrade.StartAutopilotUpgrade(ctx, r.Client, in, meta)
+}
+
 // CoalesceInstallations goes through all the installation objects and make sure that the
 // status of the newest one is coherent with whole cluster status. Returns the newest
 // installation object.
@@ -425,7 +596,12 @@ func (r *InstallationReconciler) Reconcile(ctx context.Context, req ctrl.Request
 	// parse the config otherwise we risk moving on with a reconcile
 	// using an erroneous config.
 	if err := r.ReadClusterConfigSpecFromSecret(ctx, in); err != nil {
-		return ctrl.Result{}, fmt.Errorf("failed to read cluster config from secret: %w", err)
+		in.Status.SetState(v1beta1.InstallationStateFailed, err.Error(), nil) // surface the config error on the installation status
+		if err := r.Status().Update(ctx, in); err != nil {
+			return ctrl.Result{}, fmt.Errorf("failed to update installation status: %w", err)
+		}
+		r.DisableOldInstallations(ctx, items)
+		return ctrl.Result{}, fmt.Errorf("failed to read cluster config from secret: %w", err) // err is the config read error; the status update error is scoped to the if above
 	}
 
 	// verify if a new node has been added, removed or changed.
@@ -444,6 +620,25 @@ func (r *InstallationReconciler) Reconcile(ctx context.Context, req ctrl.Request
 		return ctrl.Result{}, fmt.Errorf("failed to reconcile openebs: %w", err)
 	}
 
+	// reconcile helm chart dependencies including secrets.
+	if err := r.ReconcileRegistry(ctx, in); err != nil {
+		return ctrl.Result{}, fmt.Errorf("failed to pre-reconcile helm charts: %w", err)
+	}
+
+	// reconcile the add-ons (k0s helm extensions).
+	log.Info("Reconciling helm charts")
+	ev, err := charts.ReconcileHelmCharts(ctx, r.Client, in)
+	if err != nil {
+		return ctrl.Result{}, fmt.Errorf("failed to reconcile helm charts: %w", err)
+	}
+	if ev != nil { // forward the event returned by the chart reconciliation, if any
+		r.Recorder.Event(in, corev1.EventTypeNormal, ev.Reason, ev.Message)
+	}
+
+	if err := r.ReconcileHAStatus(ctx, in); err != nil {
+		return ctrl.Result{}, fmt.Errorf("failed to reconcile HA status: %w", err)
+	}
+
 	// save the installation status. nothing more to do with it.
 	if err := r.Status().Update(ctx, in.DeepCopy()); err != nil {
 		if k8serrors.IsConflict(err) {
@@ -452,6 +647,11 @@ func (r *InstallationReconciler) Reconcile(ctx context.Context, req ctrl.Request
 		return ctrl.Result{}, fmt.Errorf("failed to update installation status: %w", err)
 	}
 
+	// now that the status has been updated we can flag all older installation
+	// objects as obsolete. these are not necessary anymore and are kept only
+	// for historic reasons.
+	r.DisableOldInstallations(ctx, items)
+
 	// if we are not in an airgap environment this is the time to call back to
 	// replicated and inform the status of this installation.
 	if !in.Spec.AirGap {
diff --git a/operator/pkg/cli/upgrade_job.go b/operator/pkg/cli/upgrade_job.go
index 530350e5f..30762b61e 100644
--- a/operator/pkg/cli/upgrade_job.go
+++ b/operator/pkg/cli/upgrade_job.go
@@ -2,6 +2,7 @@ package cli
 
 import (
 	"fmt"
+	"os"
 	"time"
 
 	ecv1beta1 "github.com/replicatedhq/embedded-cluster/kinds/apis/v1beta1"
@@ -54,9 +55,11 @@ func UpgradeJobCmd() *cobra.Command {
 				fmt.Println(fmt.Sprintf(format, args...))
 			}
 
-			err = migratev2.Run(ctx, logf, cli, installation)
-			if err != nil {
-				return fmt.Errorf("failed to run v2 migration: %w", err)
+			if os.Getenv("MIGRATE_V2") == "true" { // the v2 migration only runs when MIGRATE_V2 is exactly "true"
+				err := migratev2.Run(ctx, logf, cli, installation)
+				if err != nil {
+					return fmt.Errorf("failed to run v2 migration: %w", err)
+				}
 			}
 
 			err = upgrade.Upgrade(ctx, cli, installation)