From c10c0f463dbc43689dd4b06dbeab03a0d15d3e14 Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Thu, 23 Jan 2025 15:37:34 -0600 Subject: [PATCH 01/13] Refactor GetCompletedBackup to be shared between different controllers Signed-off-by: Florent Poinsard --- .../vitessshard/reconcile_backup_job.go | 29 ++++--------- pkg/operator/vitessbackup/object.go | 41 ++++++++++++++++++- 2 files changed, 47 insertions(+), 23 deletions(-) diff --git a/pkg/controller/vitessshard/reconcile_backup_job.go b/pkg/controller/vitessshard/reconcile_backup_job.go index f580ce91..623a066f 100644 --- a/pkg/controller/vitessshard/reconcile_backup_job.go +++ b/pkg/controller/vitessshard/reconcile_backup_job.go @@ -22,7 +22,6 @@ import ( corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" - apilabels "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/reconcile" @@ -56,27 +55,15 @@ func (r *ReconcileVitessShard) reconcileBackupJob(ctx context.Context, vts *plan vitessbackup.TypeLabel: vitessbackup.TypeInit, } - // List all backups for this shard, across all storage locations. - // We'll use the latest observed state of backups to decide whether to take - // a new one. This list could be out of date because it's populated by - // polling the Vitess API (see the VitessBackupStorage controller), but as - // long as it's eventually consistent, we'll converge to the right behavior. - allBackups := &planetscalev2.VitessBackupList{} - listOpts := &client.ListOptions{ - Namespace: vts.Namespace, - LabelSelector: apilabels.SelectorFromSet(apilabels.Set{ - planetscalev2.ClusterLabel: clusterName, - planetscalev2.KeyspaceLabel: keyspaceName, - planetscalev2.ShardLabel: shardSafeName, - }), - } - if err := r.client.List(ctx, allBackups, listOpts); err != nil { + allBackups, completeBackups, err := vitessbackup.GetBackups(ctx, vts.Namespace, clusterName, keyspaceName, shardSafeName, + func(ctx context.Context, allBackupsList *planetscalev2.VitessBackupList, listOpts *client.ListOptions) error { + return r.client.List(ctx, allBackupsList, listOpts) + }, + ) + if err != nil { return resultBuilder.Error(err) } - updateBackupStatus(vts, allBackups.Items) - - // Here we only care about complete backups. - completeBackups := vitessbackup.CompleteBackups(allBackups.Items) + updateBackupStatus(vts, allBackups) // Generate keys (object names) for all desired backup Pods and PVCs. // Keep a map back from generated names to the backup specs. @@ -112,7 +99,7 @@ func (r *ReconcileVitessShard) reconcileBackupJob(ctx context.Context, vts *plan // Reconcile vtbackup PVCs. Use the same key as the corresponding Pod, // but only if the Pod expects a PVC. - err := r.reconciler.ReconcileObjectSet(ctx, vts, pvcKeys, labels, reconciler.Strategy{ + err = r.reconciler.ReconcileObjectSet(ctx, vts, pvcKeys, labels, reconciler.Strategy{ Kind: &corev1.PersistentVolumeClaim{}, New: func(key client.ObjectKey) runtime.Object { diff --git a/pkg/operator/vitessbackup/object.go b/pkg/operator/vitessbackup/object.go index b5c011d8..7c74495a 100644 --- a/pkg/operator/vitessbackup/object.go +++ b/pkg/operator/vitessbackup/object.go @@ -17,11 +17,14 @@ limitations under the License. 
package vitessbackup import ( + "context" "fmt" "strconv" "strings" "time" + apilabels "k8s.io/apimachinery/pkg/labels" + "sigs.k8s.io/controller-runtime/pkg/client" topodatapb "vitess.io/vitess/go/vt/proto/topodata" "vitess.io/vitess/go/vt/topo/topoproto" @@ -99,8 +102,42 @@ func LatestForLocation(locationName string, backups []*planetscalev2.VitessBacku return latest } -// CompleteBackups returns a list of only the complete backups from the input. -func CompleteBackups(backups []planetscalev2.VitessBackup) []*planetscalev2.VitessBackup { +// GetBackups returns a list of all backups, along with only completed backups, for the given +// keyspace/shard in the given cluster. +// A function to list the backup using the controller's client is necessary. +func GetBackups( + ctx context.Context, + namespace, clusterName, keyspaceName, shardSafeName string, + listBackups func(context.Context, *planetscalev2.VitessBackupList, *client.ListOptions) error, +) (allBackups []planetscalev2.VitessBackup, completeBackups []*planetscalev2.VitessBackup, err error) { + // List all backups for this shard, across all storage locations. + // We'll use the latest observed state of backups to decide whether to take + // a new one. This list could be out of date because it's populated by + // polling the Vitess API (see the VitessBackupStorage controller), but as + // long as it's eventually consistent, we'll converge to the right behavior. + allBackupsList := &planetscalev2.VitessBackupList{} + listOpts := &client.ListOptions{ + Namespace: namespace, + LabelSelector: apilabels.SelectorFromSet(apilabels.Set{ + planetscalev2.ClusterLabel: clusterName, + planetscalev2.KeyspaceLabel: keyspaceName, + planetscalev2.ShardLabel: shardSafeName, + }), + } + if err = listBackups(ctx, allBackupsList, listOpts); err != nil { + return nil, nil, err + } + + allBackups = allBackupsList.Items + + // Filter by complete backups. + completeBackups = getCompleteBackups(allBackups) + + return allBackups, completeBackups, nil +} + +// getCompleteBackups returns a list of only the complete backups from the input. 
+func getCompleteBackups(backups []planetscalev2.VitessBackup) []*planetscalev2.VitessBackup { completeBackups := []*planetscalev2.VitessBackup{} for i := range backups { if backups[i].Status.Complete { From 9f75193e2c2c18dfd38a18fbcfa7d318aa1aaf77 Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Tue, 28 Jan 2025 17:10:28 -0600 Subject: [PATCH 02/13] Use Forbid as the default concurrency setting in vbs Signed-off-by: Florent Poinsard --- deploy/crds/planetscale.com_vitessbackupschedules.yaml | 1 + deploy/crds/planetscale.com_vitessclusters.yaml | 1 + docs/api.md | 4 ++-- docs/api/index.html | 4 ++-- pkg/apis/planetscale/v2/vitessbackupschedule_types.go | 5 +++-- 5 files changed, 9 insertions(+), 6 deletions(-) diff --git a/deploy/crds/planetscale.com_vitessbackupschedules.yaml b/deploy/crds/planetscale.com_vitessbackupschedules.yaml index 2bb4c666..dda63530 100644 --- a/deploy/crds/planetscale.com_vitessbackupschedules.yaml +++ b/deploy/crds/planetscale.com_vitessbackupschedules.yaml @@ -38,6 +38,7 @@ spec: cluster: type: string concurrencyPolicy: + default: Forbid enum: - Allow - Forbid diff --git a/deploy/crds/planetscale.com_vitessclusters.yaml b/deploy/crds/planetscale.com_vitessclusters.yaml index cefa2884..43cd6d0c 100644 --- a/deploy/crds/planetscale.com_vitessclusters.yaml +++ b/deploy/crds/planetscale.com_vitessclusters.yaml @@ -168,6 +168,7 @@ spec: type: string type: object concurrencyPolicy: + default: Forbid enum: - Allow - Forbid diff --git a/docs/api.md b/docs/api.md index acb8ad03..8e9b6d2f 100644 --- a/docs/api.md +++ b/docs/api.md @@ -2828,8 +2828,8 @@ ConcurrencyPolicy (Optional)

ConcurrencyPolicy specifies how to treat concurrent executions of a Job. Valid values are: -- “Allow” (default): allows CronJobs to run concurrently; -- “Forbid”: forbids concurrent runs, skipping next run if previous run hasn’t finished yet; +- “Allow”: allows CronJobs to run concurrently; +- “Forbid” (default): forbids concurrent runs, skipping next run if previous run hasn’t finished yet; - “Replace”: cancels currently running job and replaces it with a new one.
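For context on the new default, a minimal sketch of where concurrencyPolicy sits in a VitessCluster backup schedule; the layout follows the 401_scheduled_backups.yaml manifest added later in this series, while the cron expression and the keyspace/shard values are illustrative placeholders and resource settings are omitted:

    backup:
      schedules:
        - name: "commerce"
          schedule: "0 3 * * *"        # placeholder cron expression
          # With this patch, concurrencyPolicy may be omitted entirely: the CRD
          # default is now Forbid, so an overlapping run of the same schedule is
          # skipped instead of started concurrently. Allow and Replace remain
          # valid explicit settings.
          concurrencyPolicy: Forbid
          strategies:
            - name: BackupShard
              keyspace: "commerce"
              shard: "-"
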

diff --git a/docs/api/index.html b/docs/api/index.html index f9974877..b5b32f91 100644 --- a/docs/api/index.html +++ b/docs/api/index.html @@ -2830,8 +2830,8 @@

VitessBackupScheduleTem (Optional)

ConcurrencyPolicy specifies how to treat concurrent executions of a Job. Valid values are: -- “Allow” (default): allows CronJobs to run concurrently; -- “Forbid”: forbids concurrent runs, skipping next run if previous run hasn’t finished yet; +- “Allow”: allows CronJobs to run concurrently; +- “Forbid” (default): forbids concurrent runs, skipping next run if previous run hasn’t finished yet; - “Replace”: cancels currently running job and replaces it with a new one.

diff --git a/pkg/apis/planetscale/v2/vitessbackupschedule_types.go b/pkg/apis/planetscale/v2/vitessbackupschedule_types.go index ff29654d..dbc5f3f9 100644 --- a/pkg/apis/planetscale/v2/vitessbackupschedule_types.go +++ b/pkg/apis/planetscale/v2/vitessbackupschedule_types.go @@ -136,11 +136,12 @@ type VitessBackupScheduleTemplate struct { // ConcurrencyPolicy specifies ho to treat concurrent executions of a Job. // Valid values are: - // - "Allow" (default): allows CronJobs to run concurrently; - // - "Forbid": forbids concurrent runs, skipping next run if previous run hasn't finished yet; + // - "Allow": allows CronJobs to run concurrently; + // - "Forbid" (default): forbids concurrent runs, skipping next run if previous run hasn't finished yet; // - "Replace": cancels currently running job and replaces it with a new one. // +optional // +kubebuilder:example="Forbid" + // +kubebuilder:default="Forbid" ConcurrencyPolicy ConcurrencyPolicy `json:"concurrencyPolicy,omitempty"` // AllowedMissedRuns defines how many missed run of the schedule will be allowed before giving up on finding the last job. From 0e982c72e020565be684719af544b5af5ad21816 Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Tue, 28 Jan 2025 17:14:05 -0600 Subject: [PATCH 03/13] Fix tests Signed-off-by: Florent Poinsard --- test/endtoend/operator/101_initial_cluster_backup_schedule.yaml | 2 +- test/endtoend/operator/operator-latest.yaml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml b/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml index b31c3032..7fe852cf 100644 --- a/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml +++ b/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml @@ -52,7 +52,7 @@ spec: vtorc: vitess/lite:latest vtbackup: vitess/lite:latest mysqld: - mysql80Compatible: mysql:8.0.40 + mysql80Compatible: vitess/lite:latest mysqldExporter: prom/mysqld-exporter:v0.14.0 cells: - name: zone1 diff --git a/test/endtoend/operator/operator-latest.yaml b/test/endtoend/operator/operator-latest.yaml index a7ab8a87..f4488761 100644 --- a/test/endtoend/operator/operator-latest.yaml +++ b/test/endtoend/operator/operator-latest.yaml @@ -416,6 +416,7 @@ spec: cluster: type: string concurrencyPolicy: + default: Forbid enum: - Allow - Forbid @@ -2028,6 +2029,7 @@ spec: type: string type: object concurrencyPolicy: + default: Forbid enum: - Allow - Forbid From 97bed04ae95b5b5b657f49445604a42dca921e07 Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Tue, 28 Jan 2025 17:19:44 -0600 Subject: [PATCH 04/13] Run vtbackup pods instead of vtctldclient Signed-off-by: Florent Poinsard --- .../vitessbackupschedule_controller.go | 309 +++++++++++------- .../vitessshard/reconcile_backup_job.go | 8 +- pkg/operator/vttablet/flags.go | 1 + pkg/operator/vttablet/vtbackup_pod.go | 2 + 4 files changed, 200 insertions(+), 120 deletions(-) diff --git a/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go b/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go index c105fe9d..b6595f73 100644 --- a/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go +++ b/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go @@ -34,9 +34,12 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" apilabels "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/tools/record" + "planetscale.dev/vitess-operator/pkg/controller/vitessshard" 
"planetscale.dev/vitess-operator/pkg/operator/metrics" "planetscale.dev/vitess-operator/pkg/operator/reconciler" "planetscale.dev/vitess-operator/pkg/operator/results" + "planetscale.dev/vitess-operator/pkg/operator/vitessbackup" + "planetscale.dev/vitess-operator/pkg/operator/vttablet" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -179,83 +182,100 @@ func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl reconcileCount.WithLabelValues(vbsc.Name, metrics.Result(err)).Inc() }() - jobs, mostRecentTime, err := r.getJobsList(ctx, req, vbsc.Name) - if err != nil { - // We had an error reading the jobs, we can requeue. - return resultBuilder.Error(err) - } + var scheduledResult ctrl.Result + var activeJobs []*kbatch.Job + for _, strategy := range vbsc.Spec.Strategy { + start, end, ok := strings.Cut(strategy.Shard, "-") + if !ok { + return resultBuilder.Error(fmt.Errorf("invalid strategy shard: %s", strategy.Shard)) + } + vkr := planetscalev2.VitessKeyRange{ + Start: start, + End: end, + } + jobName := vbsc.Name + "-" + strategy.Keyspace + "-" + vkr.SafeName() - err = r.updateVitessBackupScheduleStatus(ctx, mostRecentTime, vbsc, jobs.active) - if err != nil { - // We had an error updating the status, we can requeue. - return resultBuilder.Error(err) - } + jobs, mostRecentTime, err := r.getJobsList(ctx, req, vbsc.Name, jobName) + if err != nil { + // We had an error reading the jobs, we can requeue. + return resultBuilder.Error(err) + } - // We must clean up old jobs to not overcrowd the number of Pods and Jobs in the cluster. - // This will be done according to both failedJobsHistoryLimit and successfulJobsHistoryLimit fields. - r.cleanupJobsWithLimit(ctx, jobs.failed, vbsc.GetFailedJobsLimit()) - r.cleanupJobsWithLimit(ctx, jobs.successful, vbsc.GetSuccessfulJobsLimit()) + activeJobs = append(activeJobs, jobs.active...) + err = r.updateVitessBackupScheduleStatus(ctx, mostRecentTime, vbsc, activeJobs) + if err != nil { + // We had an error updating the status, we can requeue. + return resultBuilder.Error(err) + } - err = r.removeTimeoutJobs(ctx, jobs.active, vbsc.Name, vbsc.Spec.JobTimeoutMinutes) - if err != nil { - // We had an error while removing timed out jobs, we can requeue - return resultBuilder.Error(err) - } + // We must clean up old jobs to not overcrowd the number of Pods and Jobs in the cluster. + // This will be done according to both failedJobsHistoryLimit and successfulJobsHistoryLimit fields. + r.cleanupJobsWithLimit(ctx, jobs.failed, vbsc.GetFailedJobsLimit()) + r.cleanupJobsWithLimit(ctx, jobs.successful, vbsc.GetSuccessfulJobsLimit()) - // If the Suspend setting is set to true, we can skip adding any job, our work is done here. - if vbsc.Spec.Suspend != nil && *vbsc.Spec.Suspend { - log.Info("VitessBackupSchedule suspended, skipping") - return ctrl.Result{}, nil - } + err = r.removeTimeoutJobs(ctx, jobs.active, vbsc.Name, vbsc.Spec.JobTimeoutMinutes) + if err != nil { + // We had an error while removing timed out jobs, we can requeue + return resultBuilder.Error(err) + } - missedRun, nextRun, err := getNextSchedule(vbsc, time.Now()) - if err != nil { - log.Error(err, "unable to figure out VitessBackupSchedule schedule") - // Re-queuing here does not make sense as we have an error with the schedule and the user needs to fix it first. - return ctrl.Result{}, nil - } + // If the Suspend setting is set to true, we can skip adding any job, our work is done here. 
+ if vbsc.Spec.Suspend != nil && *vbsc.Spec.Suspend { + log.Info("VitessBackupSchedule suspended, skipping") + return ctrl.Result{}, nil + } - // Ask kubernetes to re-queue for the next scheduled job, and skip if we don't miss any run. - scheduledResult := ctrl.Result{RequeueAfter: nextRun.Sub(time.Now())} - if missedRun.IsZero() { - return scheduledResult, nil - } + missedRun, nextRun, err := getNextSchedule(vbsc, time.Now()) + if err != nil { + log.Error(err, "unable to figure out VitessBackupSchedule schedule") + // Re-queuing here does not make sense as we have an error with the schedule and the user needs to fix it first. + return ctrl.Result{}, nil + } - // Check whether we are too late to create this Job or not. The startingDeadlineSeconds field will help us - // schedule Jobs that are late. - tooLate := false - if vbsc.Spec.StartingDeadlineSeconds != nil { - tooLate = missedRun.Add(time.Duration(*vbsc.Spec.StartingDeadlineSeconds) * time.Second).Before(time.Now()) - } - if tooLate { - log.Infof("missed starting deadline for latest run; skipping; next run is scheduled for: %s", nextRun.Format(time.RFC3339)) - return scheduledResult, nil - } + // Ask kubernetes to re-queue for the next scheduled job, and skip if we don't miss any run. + scheduledResult = ctrl.Result{RequeueAfter: nextRun.Sub(time.Now())} + if missedRun.IsZero() { + return scheduledResult, nil + } - // Check concurrency policy and skip this job if we have ForbidConcurrent set plus an active job - if vbsc.Spec.ConcurrencyPolicy == planetscalev2.ForbidConcurrent && len(jobs.active) > 0 { - log.Infof("concurrency policy blocks concurrent runs: skipping, number of active jobs: %d", len(jobs.active)) - return scheduledResult, nil - } + // Check whether we are too late to create this Job or not. The startingDeadlineSeconds field will help us + // schedule Jobs that are late. + tooLate := false + if vbsc.Spec.StartingDeadlineSeconds != nil { + tooLate = missedRun.Add(time.Duration(*vbsc.Spec.StartingDeadlineSeconds) * time.Second).Before(time.Now()) + } + if tooLate { + log.Infof("missed starting deadline for latest run; skipping; next run is scheduled for: %s", nextRun.Format(time.RFC3339)) + return scheduledResult, nil + } - // Now that the different policies are checked, we can create and apply our new job. - job, err := r.createJob(ctx, &vbsc, missedRun) - if err != nil { - // Re-queuing here does not make sense as we have an error with the template and the user needs to fix it first. - log.WithError(err).Error("unable to construct job from template") - return ctrl.Result{}, err - } - if err = r.client.Create(ctx, job); err != nil { - // if the job already exists it means another reconciling loop created the job since we last fetched - // the list of jobs to create, we can safely return without failing. - if apierrors.IsAlreadyExists(err) { - return ctrl.Result{}, nil + // Check concurrency policy and skip this job if we have ForbidConcurrent set plus an active job + if vbsc.Spec.ConcurrencyPolicy == planetscalev2.ForbidConcurrent && len(jobs.active) > 0 { + log.Infof("concurrency policy blocks concurrent runs: skipping, number of active jobs: %d", len(jobs.active)) + return scheduledResult, nil + } + + // Now that the different policies are checked, we can create and apply our new job. + job, err := r.createJob(ctx, jobName, &vbsc, strategy, missedRun, vkr) + if err != nil { + // Re-queuing here does not make sense as we have an error with the template and the user needs to fix it first. 
+ log.WithError(err).Error("unable to construct job from template") + return ctrl.Result{}, err + } + + if err = r.client.Create(ctx, job); err != nil { + // if the job already exists it means another reconciling loop created the job since we last fetched + // the list of jobs to create, we can safely return without failing. + if apierrors.IsAlreadyExists(err) { + log.Infof("job %s already exists, will retry in %s", job.Name, scheduledResult.RequeueAfter.String()) + return scheduledResult, nil + } + // Simply re-queue here + return resultBuilder.Error(err) } - // Simply re-queue here - return resultBuilder.Error(err) - } - log.Infof("created new job: %s, next job scheduled in %s", job.Name, scheduledResult.RequeueAfter.String()) + log.Infof("created new job: %s, next job scheduled in %s", job.Name, scheduledResult.RequeueAfter.String()) + } return scheduledResult, nil } @@ -330,7 +350,7 @@ func (r *ReconcileVitessBackupsSchedule) updateVitessBackupScheduleStatus(ctx co // getJobsList fetches all existing Jobs in the cluster and return them by categories: active, failed or successful. // It also returns at what time was the last job created, which is needed to update VitessBackupSchedule's status, // and plan future jobs. -func (r *ReconcileVitessBackupsSchedule) getJobsList(ctx context.Context, req ctrl.Request, vbscName string) (jobsList, *time.Time, error) { +func (r *ReconcileVitessBackupsSchedule) getJobsList(ctx context.Context, req ctrl.Request, vbscName string, jobName string) (jobsList, *time.Time, error) { var existingJobs kbatch.JobList err := r.client.List(ctx, &existingJobs, client.InNamespace(req.Namespace), client.MatchingLabels{planetscalev2.BackupScheduleLabel: vbscName}) @@ -344,6 +364,10 @@ func (r *ReconcileVitessBackupsSchedule) getJobsList(ctx context.Context, req ct var mostRecentTime *time.Time for i, job := range existingJobs.Items { + if !strings.HasPrefix(job.Name, jobName) { + continue + } + _, jobType := isJobFinished(&job) switch jobType { case kbatch.JobFailed, kbatch.JobFailureTarget: @@ -386,11 +410,27 @@ func (r *ReconcileVitessBackupsSchedule) cleanupJobsWithLimit(ctx context.Contex if int32(i) >= int32(len(jobs))-limit { break } + // delete the job if err := r.client.Delete(ctx, job, client.PropagationPolicy(metav1.DeletePropagationBackground)); (err) != nil { log.WithError(err).Errorf("unable to delete old job: %s", job.Name) } else { log.Infof("deleted old job: %s", job.Name) } + + // delete the vtbackup pod's PVC + pvc := &corev1.PersistentVolumeClaim{} + err := r.client.Get(ctx, client.ObjectKey{Namespace: job.Namespace, Name: job.Name}, pvc) + if err != nil { + log.WithError(err).Errorf("unable to get PVC for job: %s", job.Name) + if apierrors.IsNotFound(err) { + continue + } + } + if err := r.client.Delete(ctx, pvc, client.PropagationPolicy(metav1.DeletePropagationBackground)); (err) != nil { + log.WithError(err).Errorf("unable to delete old PVC for job: %s", job.Name) + } else { + log.Infof("deleted old PVC for job: %s", job.Name) + } } } @@ -410,6 +450,20 @@ func (r *ReconcileVitessBackupsSchedule) removeTimeoutJobs(ctx context.Context, log.Infof("deleted timed out job: %s", job.Name) } timeoutJobsCount.WithLabelValues(vbscName, metrics.Result(err)).Inc() + + pvc := &corev1.PersistentVolumeClaim{} + err := r.client.Get(ctx, client.ObjectKey{Namespace: job.Namespace, Name: job.Name}, pvc) + if err != nil { + log.WithError(err).Errorf("unable to get PVC for timed out job: %s", job.Name) + if apierrors.IsNotFound(err) { + continue + } + } + if err := 
r.client.Delete(ctx, pvc, client.PropagationPolicy(metav1.DeletePropagationBackground)); (err) != nil { + log.WithError(err).Errorf("unable to delete old PVC for timed out job: %s", job.Name) + } else { + log.Infof("deleted old PVC for timed out job: %s", job.Name) + } } } return nil @@ -439,9 +493,15 @@ func getScheduledTimeForJob(job *kbatch.Job) (*time.Time, error) { return &timeParsed, nil } -func (r *ReconcileVitessBackupsSchedule) createJob(ctx context.Context, vbsc *planetscalev2.VitessBackupSchedule, scheduledTime time.Time) (*kbatch.Job, error) { - name := fmt.Sprintf("%s-%d", vbsc.Name, scheduledTime.Unix()) - +func (r *ReconcileVitessBackupsSchedule) createJob( + ctx context.Context, + jobName string, + vbsc *planetscalev2.VitessBackupSchedule, + strategy planetscalev2.VitessBackupScheduleStrategy, + scheduledTime time.Time, + vkr planetscalev2.VitessKeyRange, +) (*kbatch.Job, error) { + name := fmt.Sprintf("%s-%d", jobName, scheduledTime.Unix()) meta := metav1.ObjectMeta{ Labels: map[string]string{ planetscalev2.BackupScheduleLabel: vbsc.Name, @@ -457,7 +517,7 @@ func (r *ReconcileVitessBackupsSchedule) createJob(ctx context.Context, vbsc *pl maps.Copy(meta.Labels, vbsc.Labels) - pod, err := r.createJobPod(ctx, vbsc, name) + pod, vtbackupSpec, err := r.createJobPod(ctx, vbsc, strategy, name, vkr) if err != nil { return nil, err } @@ -466,7 +526,7 @@ func (r *ReconcileVitessBackupsSchedule) createJob(ctx context.Context, vbsc *pl Spec: kbatch.JobSpec{ Template: corev1.PodTemplateSpec{ ObjectMeta: meta, - Spec: pod, + Spec: pod.Spec, }, }, } @@ -475,55 +535,71 @@ func (r *ReconcileVitessBackupsSchedule) createJob(ctx context.Context, vbsc *pl return nil, err } - return job, nil -} - -func (r *ReconcileVitessBackupsSchedule) createJobPod(ctx context.Context, vbsc *planetscalev2.VitessBackupSchedule, name string) (pod corev1.PodSpec, err error) { - getVtctldServiceName := func(cluster string) (string, error) { - vtctldServiceName, vtctldServicePort, err := r.getVtctldServiceName(ctx, vbsc, cluster) + // Create the corresponding PVC for the new vtbackup pod + pvc := &corev1.PersistentVolumeClaim{} + key := client.ObjectKey{ + Namespace: job.Namespace, + Name: name, + } + err = r.client.Get(ctx, key, pvc) + if err != nil { + if !apierrors.IsNotFound(err) { + return nil, err + } + newPVC := vttablet.NewPVC(key, vtbackupSpec.TabletSpec) + if err := ctrl.SetControllerReference(vbsc, newPVC, r.scheme); err != nil { + return nil, err + } + err = r.client.Create(ctx, newPVC) if err != nil { - return "", err + return nil, err } - return fmt.Sprintf("--server=%s:%d", vtctldServiceName, vtctldServicePort), nil } - // It is fine to not have any default in the event there is no strategy as the CRD validation - // ensures that there will be at least one item in this list. The YAML cannot be applied with - // empty list of strategies. 
- var cmd strings.Builder + return job, nil +} - addNewCmd := func(i int) { - if i > 0 { - cmd.WriteString(" && ") - } +func (r *ReconcileVitessBackupsSchedule) createJobPod( + ctx context.Context, + vbsc *planetscalev2.VitessBackupSchedule, + strategy planetscalev2.VitessBackupScheduleStrategy, + name string, + vkr planetscalev2.VitessKeyRange, +) (pod *corev1.Pod, spec *vttablet.BackupSpec, err error) { + vts, err := r.getShardFromKeyspace(ctx, vbsc.Namespace, vbsc.Spec.Cluster, strategy.Keyspace, strategy.Shard) + if err != nil { + return nil, nil, err } - for i, strategy := range vbsc.Spec.Strategy { - vtctldclientServerArg, err := getVtctldServiceName(vbsc.Spec.Cluster) - if err != nil { - return corev1.PodSpec{}, err - } - - addNewCmd(i) - switch strategy.Name { - case planetscalev2.BackupShard: - createVtctldClientCommand(&cmd, vtctldclientServerArg, strategy.ExtraFlags, strategy.Keyspace, strategy.Shard) - } - + _, completeBackups, err := vitessbackup.GetBackups(ctx, vbsc.Namespace, vbsc.Spec.Cluster, strategy.Keyspace, vkr.SafeName(), + func(ctx context.Context, allBackupsList *planetscalev2.VitessBackupList, listOpts *client.ListOptions) error { + return r.client.List(ctx, allBackupsList, listOpts) + }, + ) + if err != nil { + return nil, nil, err + } + backupType := vitessbackup.TypeUpdate + if len(completeBackups) == 0 { + backupType = vitessbackup.TypeInit } - pod = corev1.PodSpec{ - Containers: []corev1.Container{{ - Name: name, - Image: vbsc.Spec.Image, - ImagePullPolicy: vbsc.Spec.ImagePullPolicy, - Resources: vbsc.Spec.Resources, - Args: []string{"/bin/sh", "-c", cmd.String()}, - }}, - RestartPolicy: corev1.RestartPolicyOnFailure, - Affinity: vbsc.Spec.Affinity, + podKey := client.ObjectKey{ + Namespace: vbsc.Namespace, + Name: name, + } + labels := map[string]string{ + planetscalev2.BackupScheduleLabel: vbsc.Name, + planetscalev2.ClusterLabel: vbsc.Spec.Cluster, + planetscalev2.KeyspaceLabel: strategy.Keyspace, + planetscalev2.ShardLabel: vkr.SafeName(), } - return pod, nil + vtbackupSpec := vitessshard.MakeVtbackupSpec(podKey, &vts, labels, backupType) + p := vttablet.NewBackupPod(podKey, vtbackupSpec, vts.Spec.Images.Mysqld.Image()) + + p.Spec.RestartPolicy = corev1.RestartPolicyOnFailure + p.Spec.Affinity = vbsc.Spec.Affinity + return p, vtbackupSpec, nil } func createVtctldClientCommand(cmd *strings.Builder, serverAddr string, extraFlags map[string]string, keyspace, shard string) { @@ -568,7 +644,7 @@ func (r *ReconcileVitessBackupsSchedule) getVtctldServiceName(ctx context.Contex return svcName, svcPort, nil } -func (r *ReconcileVitessBackupsSchedule) getAllShardsInKeyspace(ctx context.Context, namespace, cluster, keyspace string) ([]string, error) { +func (r *ReconcileVitessBackupsSchedule) getShardFromKeyspace(ctx context.Context, namespace, cluster, keyspace, shard string) (planetscalev2.VitessShard, error) { shardsList := &planetscalev2.VitessShardList{} listOpts := &client.ListOptions{ Namespace: namespace, @@ -578,13 +654,14 @@ func (r *ReconcileVitessBackupsSchedule) getAllShardsInKeyspace(ctx context.Cont }.AsSelector(), } if err := r.client.List(ctx, shardsList, listOpts); err != nil { - return nil, fmt.Errorf("unable to list shards of keyspace %s in %s: %v", keyspace, namespace, err) + return planetscalev2.VitessShard{}, fmt.Errorf("unable to list shards of keyspace %s in %s: %v", keyspace, namespace, err) } - var result []string for _, item := range shardsList.Items { - result = append(result, item.Spec.Name) + if item.Spec.KeyRange.String() == shard { + 
return item, nil + } } - return result, nil + return planetscalev2.VitessShard{}, fmt.Errorf("unable to find shard %s in keyspace %s in %s", shard, keyspace, namespace) } type keyspace struct { diff --git a/pkg/controller/vitessshard/reconcile_backup_job.go b/pkg/controller/vitessshard/reconcile_backup_job.go index 623a066f..df4fcd65 100644 --- a/pkg/controller/vitessshard/reconcile_backup_job.go +++ b/pkg/controller/vitessshard/reconcile_backup_job.go @@ -84,7 +84,7 @@ func (r *ReconcileVitessShard) reconcileBackupJob(ctx context.Context, vts *plan // scratch (not from any tablet). If we're wrong and a backup exists // already, the idempotent vtbackup "initial backup" mode will just do // nothing and return success. - initSpec := vtbackupInitSpec(initPodKey, vts, labels) + initSpec := MakeVtbackupSpec(initPodKey, vts, labels, vitessbackup.TypeInit) if initSpec != nil { podKeys = append(podKeys, initPodKey) if initSpec.TabletSpec.DataVolumePVCSpec != nil { @@ -168,7 +168,7 @@ func (r *ReconcileVitessShard) reconcileBackupJob(ctx context.Context, vts *plan return resultBuilder.Result() } -func vtbackupInitSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, parentLabels map[string]string) *vttablet.BackupSpec { +func MakeVtbackupSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, parentLabels map[string]string, typ string) *vttablet.BackupSpec { // If we specifically set our cluster to avoid initial backups, bail early. if !*vts.Spec.Replication.InitializeBackup { return nil @@ -183,7 +183,7 @@ func vtbackupInitSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, pare // Make a vtbackup spec that's a similar shape to the first tablet pool. // This should give it enough resources to run mysqld and restore a backup, // since all tablets need to be able to do that, regardless of type. - return vtbackupSpec(key, vts, parentLabels, &vts.Spec.TabletPools[0], vitessbackup.TypeInit) + return vtbackupSpec(key, vts, parentLabels, &vts.Spec.TabletPools[0], typ) } func vtbackupSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, parentLabels map[string]string, pool *planetscalev2.VitessShardTabletPool, backupType string) *vttablet.BackupSpec { @@ -240,7 +240,7 @@ func vtbackupSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, parentLa } return &vttablet.BackupSpec{ - InitialBackup: backupType == vitessbackup.TypeInit, + AllowFirstBackup: backupType == vitessbackup.TypeInit, MinBackupInterval: minBackupInterval, MinRetentionTime: minRetentionTime, MinRetentionCount: minRetentionCount, diff --git a/pkg/operator/vttablet/flags.go b/pkg/operator/vttablet/flags.go index 1ec42d0c..6bc34847 100644 --- a/pkg/operator/vttablet/flags.go +++ b/pkg/operator/vttablet/flags.go @@ -104,6 +104,7 @@ func init() { // vtbackup-specific flags. "concurrency": vtbackupConcurrency, "initial_backup": backupSpec.InitialBackup, + "allow_first_backup": backupSpec.AllowFirstBackup, "min_backup_interval": backupSpec.MinBackupInterval, "min_retention_time": backupSpec.MinRetentionTime, "min_retention_count": backupSpec.MinRetentionCount, diff --git a/pkg/operator/vttablet/vtbackup_pod.go b/pkg/operator/vttablet/vtbackup_pod.go index 813257f9..68b38567 100644 --- a/pkg/operator/vttablet/vtbackup_pod.go +++ b/pkg/operator/vttablet/vtbackup_pod.go @@ -61,6 +61,8 @@ type BackupSpec struct { // aren't any. Instead, bootstrap the shard with a backup of an empty // database. InitialBackup bool + // AllowFirstBackup enables vtbackup to take a backup even if none exist. 
+ AllowFirstBackup bool // MinBackupInterval is the minimum spacing between backups. // A new backup will only be taken if it's been at least this long since the // most recent backup. From 460e94da8001b31687dc6951b6f43d00af31bf2b Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Wed, 29 Jan 2025 10:40:42 -0600 Subject: [PATCH 05/13] Edit tests Signed-off-by: Florent Poinsard --- test/endtoend/backup_schedule_test.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/endtoend/backup_schedule_test.sh b/test/endtoend/backup_schedule_test.sh index 55c37a50..2e6a1d36 100755 --- a/test/endtoend/backup_schedule_test.sh +++ b/test/endtoend/backup_schedule_test.sh @@ -31,14 +31,14 @@ function verifyListBackupsOutputWithSchedule() { checkVitessBackupScheduleStatusWithTimeout "example-vbsc-every-five-minute(.*)" echo -e "Check for number of backups in the cluster" - # Sleep for 6 minutes, during this time we should have at the very minimum 7 backups. - # At least: 6 backups from the every-minute schedule, and 1 backup from the every-five-minute schedule. + # Sleep for 6 minutes, after 6 minutes we should at least have 3 backups: 1 from the initial vtbackup pod + # 1 minimum from the every minute schedule, and 1 from the every-five minute schedule sleep 360 backupCount=$(kubectl get vtb --no-headers | wc -l) - if [[ "${backupCount}" -lt 7 ]]; then - echo "Did not find at least 7 backups" - return 0 + if [[ "${backupCount}" -lt 3 ]]; then + echo "Did not find at least 3 backups" + exit 1 fi echo -e "Check for Jobs' pods" From e9e15940d5b94180ea48b721f88ccdd49a91854d Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Thu, 30 Jan 2025 11:39:10 -0600 Subject: [PATCH 06/13] Fix service name length and initial backup issues Signed-off-by: Florent Poinsard --- .../vitessbackupschedule_controller.go | 79 +++++++++++-------- .../vitessshard/reconcile_backup_job.go | 9 ++- pkg/operator/vitessbackup/labels.go | 2 + 3 files changed, 56 insertions(+), 34 deletions(-) diff --git a/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go b/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go index b6595f73..5d2ddfb8 100644 --- a/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go +++ b/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go @@ -22,6 +22,7 @@ import ( "fmt" "maps" "math/rand/v2" + "strconv" "strings" "sort" @@ -36,6 +37,7 @@ import ( "k8s.io/client-go/tools/record" "planetscale.dev/vitess-operator/pkg/controller/vitessshard" "planetscale.dev/vitess-operator/pkg/operator/metrics" + "planetscale.dev/vitess-operator/pkg/operator/names" "planetscale.dev/vitess-operator/pkg/operator/reconciler" "planetscale.dev/vitess-operator/pkg/operator/results" "planetscale.dev/vitess-operator/pkg/operator/vitessbackup" @@ -193,9 +195,7 @@ func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl Start: start, End: end, } - jobName := vbsc.Name + "-" + strategy.Keyspace + "-" + vkr.SafeName() - - jobs, mostRecentTime, err := r.getJobsList(ctx, req, vbsc.Name, jobName) + jobs, mostRecentTime, err := r.getJobsList(ctx, req, vbsc, strategy.Keyspace, vkr.SafeName()) if err != nil { // We had an error reading the jobs, we can requeue. return resultBuilder.Error(err) @@ -235,7 +235,7 @@ func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl // Ask kubernetes to re-queue for the next scheduled job, and skip if we don't miss any run. 
scheduledResult = ctrl.Result{RequeueAfter: nextRun.Sub(time.Now())} if missedRun.IsZero() { - return scheduledResult, nil + continue } // Check whether we are too late to create this Job or not. The startingDeadlineSeconds field will help us @@ -246,17 +246,17 @@ func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl } if tooLate { log.Infof("missed starting deadline for latest run; skipping; next run is scheduled for: %s", nextRun.Format(time.RFC3339)) - return scheduledResult, nil + continue } // Check concurrency policy and skip this job if we have ForbidConcurrent set plus an active job if vbsc.Spec.ConcurrencyPolicy == planetscalev2.ForbidConcurrent && len(jobs.active) > 0 { log.Infof("concurrency policy blocks concurrent runs: skipping, number of active jobs: %d", len(jobs.active)) - return scheduledResult, nil + continue } // Now that the different policies are checked, we can create and apply our new job. - job, err := r.createJob(ctx, jobName, &vbsc, strategy, missedRun, vkr) + job, err := r.createJob(ctx, vbsc, strategy, missedRun, vkr) if err != nil { // Re-queuing here does not make sense as we have an error with the template and the user needs to fix it first. log.WithError(err).Error("unable to construct job from template") @@ -268,7 +268,7 @@ func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl // the list of jobs to create, we can safely return without failing. if apierrors.IsAlreadyExists(err) { log.Infof("job %s already exists, will retry in %s", job.Name, scheduledResult.RequeueAfter.String()) - return scheduledResult, nil + continue } // Simply re-queue here return resultBuilder.Error(err) @@ -350,10 +350,21 @@ func (r *ReconcileVitessBackupsSchedule) updateVitessBackupScheduleStatus(ctx co // getJobsList fetches all existing Jobs in the cluster and return them by categories: active, failed or successful. // It also returns at what time was the last job created, which is needed to update VitessBackupSchedule's status, // and plan future jobs. 
-func (r *ReconcileVitessBackupsSchedule) getJobsList(ctx context.Context, req ctrl.Request, vbscName string, jobName string) (jobsList, *time.Time, error) { +func (r *ReconcileVitessBackupsSchedule) getJobsList( + ctx context.Context, + req ctrl.Request, + vbsc planetscalev2.VitessBackupSchedule, + keyspace string, + shardSafeName string, +) (jobsList, *time.Time, error) { var existingJobs kbatch.JobList - err := r.client.List(ctx, &existingJobs, client.InNamespace(req.Namespace), client.MatchingLabels{planetscalev2.BackupScheduleLabel: vbscName}) + err := r.client.List(ctx, &existingJobs, client.InNamespace(req.Namespace), client.MatchingLabels{ + planetscalev2.BackupScheduleLabel: vbsc.Name, + planetscalev2.ClusterLabel: vbsc.Spec.Cluster, + planetscalev2.KeyspaceLabel: keyspace, + planetscalev2.ShardLabel: shardSafeName, + }) if err != nil && !apierrors.IsNotFound(err) { log.WithError(err).Error("unable to list Jobs in cluster") return jobsList{}, nil, err @@ -364,10 +375,6 @@ func (r *ReconcileVitessBackupsSchedule) getJobsList(ctx context.Context, req ct var mostRecentTime *time.Time for i, job := range existingJobs.Items { - if !strings.HasPrefix(job.Name, jobName) { - continue - } - _, jobType := isJobFinished(&job) switch jobType { case kbatch.JobFailed, kbatch.JobFailureTarget: @@ -495,17 +502,22 @@ func getScheduledTimeForJob(job *kbatch.Job) (*time.Time, error) { func (r *ReconcileVitessBackupsSchedule) createJob( ctx context.Context, - jobName string, - vbsc *planetscalev2.VitessBackupSchedule, + vbsc planetscalev2.VitessBackupSchedule, strategy planetscalev2.VitessBackupScheduleStrategy, scheduledTime time.Time, vkr planetscalev2.VitessKeyRange, ) (*kbatch.Job, error) { - name := fmt.Sprintf("%s-%d", jobName, scheduledTime.Unix()) + name := names.JoinWithConstraints(names.ServiceConstraints, vbsc.Name, strategy.Keyspace, vkr.SafeName(), strconv.Itoa(int(scheduledTime.Unix()))) + + labels := map[string]string{ + planetscalev2.BackupScheduleLabel: vbsc.Name, + planetscalev2.ClusterLabel: vbsc.Spec.Cluster, + planetscalev2.KeyspaceLabel: strategy.Keyspace, + planetscalev2.ShardLabel: vkr.SafeName(), + } + meta := metav1.ObjectMeta{ - Labels: map[string]string{ - planetscalev2.BackupScheduleLabel: vbsc.Name, - }, + Labels: labels, Annotations: make(map[string]string), Name: name, Namespace: vbsc.Namespace, @@ -517,7 +529,7 @@ func (r *ReconcileVitessBackupsSchedule) createJob( maps.Copy(meta.Labels, vbsc.Labels) - pod, vtbackupSpec, err := r.createJobPod(ctx, vbsc, strategy, name, vkr) + pod, vtbackupSpec, err := r.createJobPod(ctx, vbsc, strategy, name, vkr, labels) if err != nil { return nil, err } @@ -531,7 +543,7 @@ func (r *ReconcileVitessBackupsSchedule) createJob( }, } - if err := ctrl.SetControllerReference(vbsc, job, r.scheme); err != nil { + if err := ctrl.SetControllerReference(&vbsc, job, r.scheme); err != nil { return nil, err } @@ -547,7 +559,7 @@ func (r *ReconcileVitessBackupsSchedule) createJob( return nil, err } newPVC := vttablet.NewPVC(key, vtbackupSpec.TabletSpec) - if err := ctrl.SetControllerReference(vbsc, newPVC, r.scheme); err != nil { + if err := ctrl.SetControllerReference(&vbsc, newPVC, r.scheme); err != nil { return nil, err } err = r.client.Create(ctx, newPVC) @@ -561,10 +573,11 @@ func (r *ReconcileVitessBackupsSchedule) createJob( func (r *ReconcileVitessBackupsSchedule) createJobPod( ctx context.Context, - vbsc *planetscalev2.VitessBackupSchedule, + vbsc planetscalev2.VitessBackupSchedule, strategy planetscalev2.VitessBackupScheduleStrategy, name 
string, vkr planetscalev2.VitessKeyRange, + labels map[string]string, ) (pod *corev1.Pod, spec *vttablet.BackupSpec, err error) { vts, err := r.getShardFromKeyspace(ctx, vbsc.Namespace, vbsc.Spec.Cluster, strategy.Keyspace, strategy.Shard) if err != nil { @@ -579,25 +592,27 @@ func (r *ReconcileVitessBackupsSchedule) createJobPod( if err != nil { return nil, nil, err } + backupType := vitessbackup.TypeUpdate if len(completeBackups) == 0 { - backupType = vitessbackup.TypeInit + if vts.Status.HasMaster == corev1.ConditionTrue { + backupType = vitessbackup.TypeFirstBackup + } else { + backupType = vitessbackup.TypeInit + } } podKey := client.ObjectKey{ Namespace: vbsc.Namespace, Name: name, } - labels := map[string]string{ - planetscalev2.BackupScheduleLabel: vbsc.Name, - planetscalev2.ClusterLabel: vbsc.Spec.Cluster, - planetscalev2.KeyspaceLabel: strategy.Keyspace, - planetscalev2.ShardLabel: vkr.SafeName(), - } vtbackupSpec := vitessshard.MakeVtbackupSpec(podKey, &vts, labels, backupType) p := vttablet.NewBackupPod(podKey, vtbackupSpec, vts.Spec.Images.Mysqld.Image()) - p.Spec.RestartPolicy = corev1.RestartPolicyOnFailure + // Explicitly do not restart on failure. The VitessBackupSchedule controller will retry the failed job + // during the next scheduled run. + p.Spec.RestartPolicy = corev1.RestartPolicyNever + p.Spec.Affinity = vbsc.Spec.Affinity return p, vtbackupSpec, nil } diff --git a/pkg/controller/vitessshard/reconcile_backup_job.go b/pkg/controller/vitessshard/reconcile_backup_job.go index df4fcd65..23a82973 100644 --- a/pkg/controller/vitessshard/reconcile_backup_job.go +++ b/pkg/controller/vitessshard/reconcile_backup_job.go @@ -84,7 +84,11 @@ func (r *ReconcileVitessShard) reconcileBackupJob(ctx context.Context, vts *plan // scratch (not from any tablet). If we're wrong and a backup exists // already, the idempotent vtbackup "initial backup" mode will just do // nothing and return success. - initSpec := MakeVtbackupSpec(initPodKey, vts, labels, vitessbackup.TypeInit) + backupType := vitessbackup.TypeUpdate + if vts.Status.HasMaster != corev1.ConditionTrue { + backupType = vitessbackup.TypeInit + } + initSpec := MakeVtbackupSpec(initPodKey, vts, labels, backupType) if initSpec != nil { podKeys = append(podKeys, initPodKey) if initSpec.TabletSpec.DataVolumePVCSpec != nil { @@ -240,7 +244,8 @@ func vtbackupSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, parentLa } return &vttablet.BackupSpec{ - AllowFirstBackup: backupType == vitessbackup.TypeInit, + InitialBackup: backupType == vitessbackup.TypeInit, + AllowFirstBackup: backupType == vitessbackup.TypeFirstBackup, MinBackupInterval: minBackupInterval, MinRetentionTime: minRetentionTime, MinRetentionCount: minRetentionCount, diff --git a/pkg/operator/vitessbackup/labels.go b/pkg/operator/vitessbackup/labels.go index e8e9a785..8c6d9819 100644 --- a/pkg/operator/vitessbackup/labels.go +++ b/pkg/operator/vitessbackup/labels.go @@ -24,6 +24,8 @@ const ( // TypeInit is a backup taken to initialize an empty shard. TypeInit = "init" + // TypeFirstBackup is a backup taken when no other backup exist in an existing shard. + TypeFirstBackup = "empty" // TypeUpdate is a backup taken to update the latest backup for a shard. 
TypeUpdate = "update" ) From 504b77137445182c5534864f2713fc3bc24e434a Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Mon, 24 Feb 2025 08:49:29 -0600 Subject: [PATCH 07/13] wip Signed-off-by: Florent Poinsard --- .../operator/101_initial_cluster.yaml | 9 +- .../operator/201_customer_tablets.yaml | 9 +- test/endtoend/operator/302_new_shards.yaml | 9 +- test/endtoend/operator/306_down_shard_0.yaml | 9 +- .../operator/401_scheduled_backups.yaml | 167 ++++++++++++++++++ test/endtoend/operator/cluster_upgrade.yaml | 9 +- test/endtoend/upgrade_test.sh | 21 ++- 7 files changed, 224 insertions(+), 9 deletions(-) create mode 100644 test/endtoend/operator/401_scheduled_backups.yaml diff --git a/test/endtoend/operator/101_initial_cluster.yaml b/test/endtoend/operator/101_initial_cluster.yaml index 76ef570a..e0a34ed1 100644 --- a/test/endtoend/operator/101_initial_cluster.yaml +++ b/test/endtoend/operator/101_initial_cluster.yaml @@ -7,6 +7,13 @@ kind: VitessCluster metadata: name: example spec: + backup: + engine: xtrabackup + locations: + - volume: + hostPath: + path: /backup + type: Directory images: vtctld: vitess/lite:v21.0.0 vtgate: vitess/lite:v21.0.0 @@ -14,7 +21,7 @@ spec: vtorc: vitess/lite:v21.0.0 vtbackup: vitess/lite:v21.0.0 mysqld: - mysql80Compatible: mysql:8.0.40 + mysql80Compatible: vitess/lite:v21.0.0 mysqldExporter: prom/mysqld-exporter:v0.14.0 cells: - name: zone1 diff --git a/test/endtoend/operator/201_customer_tablets.yaml b/test/endtoend/operator/201_customer_tablets.yaml index ef1f3546..c0565c7d 100644 --- a/test/endtoend/operator/201_customer_tablets.yaml +++ b/test/endtoend/operator/201_customer_tablets.yaml @@ -3,6 +3,13 @@ kind: VitessCluster metadata: name: example spec: + backup: + engine: xtrabackup + locations: + - volume: + hostPath: + path: /backup + type: Directory images: vtctld: vitess/lite:latest vtgate: vitess/lite:latest @@ -10,7 +17,7 @@ spec: vtorc: vitess/lite:latest vtbackup: vitess/lite:latest mysqld: - mysql80Compatible: mysql:8.0.40 + mysql80Compatible: vitess/lite:latest mysqldExporter: prom/mysqld-exporter:v0.14.0 cells: - name: zone1 diff --git a/test/endtoend/operator/302_new_shards.yaml b/test/endtoend/operator/302_new_shards.yaml index 8731afea..2808349b 100644 --- a/test/endtoend/operator/302_new_shards.yaml +++ b/test/endtoend/operator/302_new_shards.yaml @@ -3,6 +3,13 @@ kind: VitessCluster metadata: name: example spec: + backup: + engine: xtrabackup + locations: + - volume: + hostPath: + path: /backup + type: Directory images: vtctld: vitess/lite:latest vtgate: vitess/lite:latest @@ -10,7 +17,7 @@ spec: vtorc: vitess/lite:latest vtbackup: vitess/lite:latest mysqld: - mysql80Compatible: mysql:8.0.40 + mysql80Compatible: vitess/lite:latest mysqldExporter: prom/mysqld-exporter:v0.14.0 cells: - name: zone1 diff --git a/test/endtoend/operator/306_down_shard_0.yaml b/test/endtoend/operator/306_down_shard_0.yaml index 9684f5f2..2cfcf5bd 100644 --- a/test/endtoend/operator/306_down_shard_0.yaml +++ b/test/endtoend/operator/306_down_shard_0.yaml @@ -3,6 +3,13 @@ kind: VitessCluster metadata: name: example spec: + backup: + engine: xtrabackup + locations: + - volume: + hostPath: + path: /backup + type: Directory images: vtctld: vitess/lite:latest vtgate: vitess/lite:latest @@ -10,7 +17,7 @@ spec: vtorc: vitess/lite:latest vtbackup: vitess/lite:latest mysqld: - mysql80Compatible: mysql:8.0.40 + mysql80Compatible: vitess/lite:latest mysqldExporter: prom/mysqld-exporter:v0.14.0 cells: - name: zone1 diff --git 
a/test/endtoend/operator/401_scheduled_backups.yaml b/test/endtoend/operator/401_scheduled_backups.yaml new file mode 100644 index 00000000..52c18053 --- /dev/null +++ b/test/endtoend/operator/401_scheduled_backups.yaml @@ -0,0 +1,167 @@ +apiVersion: planetscale.com/v2 +kind: VitessCluster +metadata: + name: example +spec: + backup: + engine: xtrabackup + locations: + - volume: + hostPath: + path: /backup + type: Directory + schedules: + - name: "customer" + schedule: "* * * * *" + resources: + requests: + cpu: 100m + memory: 1024Mi + limits: + memory: 1024Mi + successfulJobsHistoryLimit: 2 + failedJobsHistoryLimit: 3 + jobTimeoutMinute: 5 + suspend: true + strategies: + - name: BackupShard + keyspace: "customer" + shard: "80-" + - name: BackupShard + keyspace: "customer" + shard: "-80" + - name: "commerce" + schedule: "* * * * *" + resources: + requests: + cpu: 100m + memory: 1024Mi + limits: + memory: 1024Mi + successfulJobsHistoryLimit: 2 + failedJobsHistoryLimit: 3 + jobTimeoutMinute: 5 + strategies: + - name: BackupShard + keyspace: "commerce" + shard: "-" + images: + vtctld: vitess/lite:latest + vtgate: vitess/lite:latest + vttablet: vitess/lite:latest + vtorc: vitess/lite:latest + vtbackup: vitess/lite:latest + mysqld: + mysql80Compatible: vitess/lite:latest + mysqldExporter: prom/mysqld-exporter:v0.14.0 + cells: + - name: zone1 + gateway: + authentication: + static: + secret: + name: example-cluster-config + key: users.json + replicas: 1 + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + memory: 256Mi + vitessDashboard: + cells: + - zone1 + extraFlags: + security_policy: read-only + replicas: 1 + resources: + limits: + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + + keyspaces: + - name: commerce + durabilityPolicy: semi_sync + vitessOrchestrator: + resources: + limits: + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + extraFlags: + instance-poll-time: 3s + turndownPolicy: Immediate + partitionings: + - equal: + parts: 1 + shardTemplate: + databaseInitScriptSecret: + name: example-cluster-config + key: init_db.sql + tabletPools: + - cell: zone1 + type: replica + replicas: 3 + vttablet: + extraFlags: + db_charset: utf8mb4 + resources: + requests: + cpu: 100m + memory: 256Mi + mysqld: + resources: + requests: + cpu: 100m + memory: 512Mi + dataVolumeClaimTemplate: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 10Gi + - name: customer + durabilityPolicy: semi_sync + vitessOrchestrator: + resources: + limits: + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + extraFlags: + instance-poll-time: 3s + turndownPolicy: Immediate + partitionings: + - equal: + parts: 2 + shardTemplate: + databaseInitScriptSecret: + name: example-cluster-config + key: init_db.sql + tabletPools: + - cell: zone1 + type: replica + replicas: 3 + vttablet: + extraFlags: + db_charset: utf8mb4 + resources: + requests: + cpu: 100m + memory: 256Mi + mysqld: + resources: + requests: + cpu: 100m + memory: 512Mi + dataVolumeClaimTemplate: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 10Gi + updateStrategy: + type: Immediate diff --git a/test/endtoend/operator/cluster_upgrade.yaml b/test/endtoend/operator/cluster_upgrade.yaml index 18cd5484..cb0216e6 100644 --- a/test/endtoend/operator/cluster_upgrade.yaml +++ b/test/endtoend/operator/cluster_upgrade.yaml @@ -7,6 +7,13 @@ kind: VitessCluster metadata: name: example spec: + backup: + engine: xtrabackup + locations: + - volume: + hostPath: + path: /backup + type: Directory images: vtctld: 
vitess/lite:latest vtgate: vitess/lite:latest @@ -14,7 +21,7 @@ spec: vtorc: vitess/lite:latest vtbackup: vitess/lite:latest mysqld: - mysql80Compatible: mysql:8.0.40 + mysql80Compatible: vitess/lite:latest mysqldExporter: prom/mysqld-exporter:v0.14.0 cells: - name: zone1 diff --git a/test/endtoend/upgrade_test.sh b/test/endtoend/upgrade_test.sh index f1c47413..3bc4552c 100755 --- a/test/endtoend/upgrade_test.sh +++ b/test/endtoend/upgrade_test.sh @@ -233,10 +233,23 @@ EOF } # Test setup +mkdir -p -m 777 ./vtdataroot/backup echo "Building the docker image" docker build -f build/Dockerfile.release -t vitess-operator-pr:latest . echo "Creating Kind cluster" -kind create cluster --wait 30s --name kind-${BUILDKITE_BUILD_ID} --image ${KIND_VERSION} +if [[ "$BUILDKITE_BUILD_ID" != "0" ]]; then + # The script is being run from buildkite, so we can't mount the current + # working directory to kind. The current directory in the docker is workdir + # So if we try and mount that, we get an error. Instead we need to mount the + # path where the code was checked out be buildkite + dockerContainerName=$(docker container ls --filter "ancestor=docker" --format '{{.Names}}') + CHECKOUT_PATH=$(docker container inspect -f '{{range .Mounts}}{{ if eq .Destination "/workdir" }}{{println .Source }}{{ end }}{{end}}' "$dockerContainerName") + BACKUP_DIR="$CHECKOUT_PATH/vtdataroot/backup" +else + BACKUP_DIR="$PWD/vtdataroot/backup" +fi +cat ./test/endtoend/kindBackupConfig.yaml | sed "s,PATH,$BACKUP_DIR,1" > ./vtdataroot/config.yaml +kind create cluster --wait 30s --name kind-${BUILDKITE_BUILD_ID} --image ${KIND_VERSION} --config ./vtdataroot/config.yaml echo "Loading docker image into Kind cluster" kind load docker-image vitess-operator-pr:latest --name kind-${BUILDKITE_BUILD_ID} @@ -244,7 +257,7 @@ cd "$PWD/test/endtoend/operator" killall kubectl setupKubectlAccessForCI -get_started "operator.yaml" "101_initial_cluster.yaml" +get_started "operator-latest.yaml" "101_initial_cluster.yaml" verifyVtGateVersion "21.0.0" checkSemiSyncSetup # Initially too durability policy should be specified @@ -258,5 +271,5 @@ move_tables resharding # Teardown -echo "Deleting Kind cluster. This also deletes the volume associated with it" -kind delete cluster --name kind-${BUILDKITE_BUILD_ID} +#echo "Deleting Kind cluster. 
This also deletes the volume associated with it" +#kind delete cluster --name kind-${BUILDKITE_BUILD_ID} From ae049fdfc06ef1abbb9ecc027cacef54d3b5ff4e Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Mon, 24 Feb 2025 17:08:38 -0600 Subject: [PATCH 08/13] Rework how multiple strategies are handled and how the Status is used Signed-off-by: Florent Poinsard --- ...planetscale.com_vitessbackupschedules.yaml | 7 +- .../crds/planetscale.com_vitessclusters.yaml | 2 - docs/api.md | 45 ++-- docs/api/index.html | 45 ++-- .../v2/vitessbackupschedule_types.go | 42 +-- .../planetscale/v2/zz_generated.deepcopy.go | 8 + .../vitessbackupschedule_controller.go | 254 +++++++++--------- test/endtoend/backup_schedule_test.sh | 19 +- .../101_initial_cluster_backup_schedule.yaml | 4 +- test/endtoend/operator/operator-latest.yaml | 9 +- test/endtoend/utils.sh | 22 ++ 11 files changed, 260 insertions(+), 197 deletions(-) diff --git a/deploy/crds/planetscale.com_vitessbackupschedules.yaml b/deploy/crds/planetscale.com_vitessbackupschedules.yaml index dda63530..dbce574a 100644 --- a/deploy/crds/planetscale.com_vitessbackupschedules.yaml +++ b/deploy/crds/planetscale.com_vitessbackupschedules.yaml @@ -114,8 +114,6 @@ spec: example: commerce type: string name: - enum: - - BackupShard type: string shard: example: '-' @@ -165,6 +163,11 @@ spec: lastScheduledTime: format: date-time type: string + lastScheduledTimes: + additionalProperties: + format: date-time + type: string + type: object type: object type: object served: true diff --git a/deploy/crds/planetscale.com_vitessclusters.yaml b/deploy/crds/planetscale.com_vitessclusters.yaml index 0d7361d7..3646e9d9 100644 --- a/deploy/crds/planetscale.com_vitessclusters.yaml +++ b/deploy/crds/planetscale.com_vitessclusters.yaml @@ -240,8 +240,6 @@ spec: example: commerce type: string name: - enum: - - BackupShard type: string shard: example: '-' diff --git a/docs/api.md b/docs/api.md index 23073710..661e2975 100644 --- a/docs/api.md +++ b/docs/api.md @@ -647,16 +647,6 @@ SecretSource -

BackupStrategyName -(string alias)

-

-(Appears on: -VitessBackupScheduleStrategy) -

-

-

BackupStrategyName describes the vtctldclient command that will be used to take a backup. -When scheduling a backup, you must specify at least one strategy.

-

CephBackupLocation

@@ -2600,7 +2590,8 @@ The PullPolicy used will be the same as the one used to pull the vtctld image. (Optional) -

A list of pointers to currently running jobs.

+

A list of pointers to currently running jobs. +This field is deprecated and no longer used in >= v2.15. It will be removed in a future release.

@@ -2614,7 +2605,25 @@ Kubernetes meta/v1.Time (Optional) -

Information when was the last time the job was successfully scheduled.

+

Information when was the last time the job was successfully scheduled. +This field is deprecated and no longer used in >= v2.15. It will be removed in a future release. +Please use lastScheduledTimes instead which maps the last schedule time to each VitessBackupScheduleStrategy +in the VitessBackupSchedule.

+ + + + +lastScheduledTimes
+ + +map[string]*k8s.io/apimachinery/pkg/apis/meta/v1.Time + + + + +(Optional) +

LastScheduledTimes lists for every VitessBackupScheduleStrategy what is the last schedule we executed. +It does not list when the last execution started, but to which scheduled time it corresponds.
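As a rough illustration of how this field is consumed, the sketch below reads the per-strategy times from a populated status. Only the LastScheduledTimes field and its map[string]*metav1.Time type come from this patch; the strategy name "commerce" and the sample value are made up.

    package main

    import (
        "fmt"
        "time"

        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
        planetscalev2 "planetscale.dev/vitess-operator/pkg/apis/planetscale/v2"
    )

    func main() {
        // Hypothetical status; in practice it is read from the cluster.
        status := planetscalev2.VitessBackupScheduleStatus{
            LastScheduledTimes: map[string]*metav1.Time{
                "commerce": {Time: time.Now().Add(-2 * time.Minute)},
            },
        }
        // Each key is a strategy name, each value the schedule time the last
        // execution corresponded to (not when that execution started).
        for name, t := range status.LastScheduledTimes {
            fmt.Printf("strategy %q last corresponded to %s\n", name, t.Time.Format(time.RFC3339))
        }
    }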

@@ -2642,9 +2651,7 @@ command line that will be executed in the Job’s pod.

name
- -BackupStrategyName - +string @@ -2741,11 +2748,9 @@ string

Strategy defines how we are going to take a backup. If you want to take several backups within the same schedule you can add more items -to the Strategy list. Each VitessBackupScheduleStrategy will be executed by the same -kubernetes job. This is useful if for instance you have one schedule, and you want to -take a backup of all shards in a keyspace and don’t want to re-create a second schedule. -All the VitessBackupScheduleStrategy are concatenated into a single shell command that -is executed when the Job’s container starts.

+to the Strategy list. Each VitessBackupScheduleStrategy will be executed within different +K8S Jobs. This is useful if you want to have a single schedule backing up multiple shards +at the same time.
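For a concrete picture of the new behavior, here is a minimal sketch of a template with two strategies, which now translates into two Jobs per scheduled run. The field names (Strategy, Name, Keyspace, Shard) come from this patch; the values simply mirror the customer shards used in the test fixtures later in this series.

    package main

    import (
        "fmt"

        planetscalev2 "planetscale.dev/vitess-operator/pkg/apis/planetscale/v2"
    )

    func main() {
        // Each entry in Strategy is reconciled as its own Kubernetes Job.
        tmpl := planetscalev2.VitessBackupScheduleTemplate{
            Strategy: []planetscalev2.VitessBackupScheduleStrategy{
                {Name: "customer_x-80", Keyspace: "customer", Shard: "-80"},
                {Name: "customer_80-x", Keyspace: "customer", Shard: "80-"},
            },
        }
        fmt.Printf("each scheduled run creates %d Jobs, one per strategy\n", len(tmpl.Strategy))
    }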

diff --git a/docs/api/index.html b/docs/api/index.html index 970b5750..2934ee89 100644 --- a/docs/api/index.html +++ b/docs/api/index.html @@ -649,16 +649,6 @@

AzblobBackupLocation -

BackupStrategyName -(string alias)

-

-(Appears on: -VitessBackupScheduleStrategy) -

-

-

BackupStrategyName describes the vtctldclient command that will be used to take a backup. -When scheduling a backup, you must specify at least one strategy.

-

CephBackupLocation

@@ -2602,7 +2592,8 @@

VitessBackupScheduleStatu (Optional) -

A list of pointers to currently running jobs.

+

A list of pointers to currently running jobs. +This field is deprecated and no longer used in >= v2.15. It will be removed in a future release.

@@ -2616,7 +2607,25 @@

VitessBackupScheduleStatu (Optional) -

Information when was the last time the job was successfully scheduled.

+

Information when was the last time the job was successfully scheduled. +This field is deprecated and no longer used in >= v2.15. It will be removed in a future release. +Please use lastScheduledTimes instead which maps the last schedule time to each VitessBackupScheduleStrategy +in the VitessBackupSchedule.

+ + + + +lastScheduledTimes
+ + +map[string]*k8s.io/apimachinery/pkg/apis/meta/v1.Time + + + + +(Optional) +

LastScheduledTimes lists for every VitessBackupScheduleStrategy what is the last schedule we executed. +It does not list when the last execution started, but to which scheduled time it corresponds.
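The write side is worth a matching sketch: the NewVitessBackupScheduleStatus helper added to vitessbackupschedule_types.go later in this patch guarantees a non-nil map before the controller records the schedule time a strategy was executed for. The strategy name "commerce" below is an example value only.

    package main

    import (
        "fmt"
        "time"

        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
        planetscalev2 "planetscale.dev/vitess-operator/pkg/apis/planetscale/v2"
    )

    func main() {
        var vbsc planetscalev2.VitessBackupSchedule

        // Reset the status while carrying over previously recorded times; the
        // helper initializes LastScheduledTimes when it is still nil.
        vbsc.Status = planetscalev2.NewVitessBackupScheduleStatus(vbsc.Status)

        // Record which scheduled time the strategy was just executed for.
        missedRun := time.Now().Truncate(time.Minute)
        vbsc.Status.LastScheduledTimes["commerce"] = &metav1.Time{Time: missedRun}

        fmt.Println(vbsc.Status.LastScheduledTimes["commerce"].Time.Format(time.RFC3339))
    }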

@@ -2644,9 +2653,7 @@

VitessBackupScheduleStr name
- -BackupStrategyName - +string @@ -2743,11 +2750,9 @@

VitessBackupScheduleTem

Strategy defines how we are going to take a backup. If you want to take several backups within the same schedule you can add more items -to the Strategy list. Each VitessBackupScheduleStrategy will be executed by the same -kubernetes job. This is useful if for instance you have one schedule, and you want to -take a backup of all shards in a keyspace and don’t want to re-create a second schedule. -All the VitessBackupScheduleStrategy are concatenated into a single shell command that -is executed when the Job’s container starts.

+to the Strategy list. Each VitessBackupScheduleStrategy will be executed within different +K8S Jobs. This is useful if you want to have a single schedule backing up multiple shards +at the same time.

diff --git a/pkg/apis/planetscale/v2/vitessbackupschedule_types.go b/pkg/apis/planetscale/v2/vitessbackupschedule_types.go index dbc5f3f9..e33d4061 100644 --- a/pkg/apis/planetscale/v2/vitessbackupschedule_types.go +++ b/pkg/apis/planetscale/v2/vitessbackupschedule_types.go @@ -34,16 +34,6 @@ const ( ForbidConcurrent ConcurrencyPolicy = "Forbid" ) -// BackupStrategyName describes the vtctldclient command that will be used to take a backup. -// When scheduling a backup, you must specify at least one strategy. -// +kubebuilder:validation:Enum=BackupShard -type BackupStrategyName string - -const ( - // BackupShard will use the "vtctldclient BackupShard" command to take a backup - BackupShard BackupStrategyName = "BackupShard" -) - // VitessBackupSchedule is the Schema for the VitessBackupSchedule API. // +kubebuilder:object:root=true // +kubebuilder:subresource:status @@ -98,12 +88,12 @@ type VitessBackupScheduleTemplate struct { // Strategy defines how we are going to take a backup. // If you want to take several backups within the same schedule you can add more items - // to the Strategy list. Each VitessBackupScheduleStrategy will be executed by the same - // kubernetes job. This is useful if for instance you have one schedule, and you want to - // take a backup of all shards in a keyspace and don't want to re-create a second schedule. - // All the VitessBackupScheduleStrategy are concatenated into a single shell command that - // is executed when the Job's container starts. + // to the Strategy list. Each VitessBackupScheduleStrategy will be executed within different + // K8S Jobs. This is useful if you want to have a single schedule backing up multiple shards + // at the same time. // +kubebuilder:validation:MinItems=1 + // +patchMergeKey=name + // +patchStrategy=merge Strategy []VitessBackupScheduleStrategy `json:"strategies"` // Resources specify the compute resources to allocate for every Jobs's pod. @@ -181,7 +171,7 @@ type VitessBackupScheduleTemplate struct { // command line that will be executed in the Job's pod. type VitessBackupScheduleStrategy struct { // Name of the backup strategy. - Name BackupStrategyName `json:"name"` + Name string `json:"name"` // Keyspace defines the keyspace on which we want to take the backup. // +kubebuilder:example="commerce" @@ -199,12 +189,32 @@ type VitessBackupScheduleStrategy struct { // VitessBackupScheduleStatus defines the observed state of VitessBackupSchedule type VitessBackupScheduleStatus struct { // A list of pointers to currently running jobs. + // This field is deprecated and no longer used in >= v2.15. It will be removed in a future release. // +optional Active []corev1.ObjectReference `json:"active,omitempty"` // Information when was the last time the job was successfully scheduled. + // This field is deprecated and no longer used in >= v2.15. It will be removed in a future release. + // Please use lastScheduledTimes instead which maps the last schedule time to each VitessBackupScheduleStrategy + // in the VitessBackupSchedule. // +optional LastScheduledTime *metav1.Time `json:"lastScheduledTime,omitempty"` + + // LastScheduledTimes lists for every VitessBackupScheduleStrategy what is the last schedule we executed. + // It does not list when the last execution started, but to which scheduled time it corresponds. + // +optional + LastScheduledTimes map[string]*metav1.Time `json:"lastScheduledTimes,omitempty"` +} + +// NewVitessBackupScheduleStatus creates a new status with default values. 
+func NewVitessBackupScheduleStatus(status VitessBackupScheduleStatus) VitessBackupScheduleStatus { + newStatus := VitessBackupScheduleStatus{ + LastScheduledTimes: status.LastScheduledTimes, + } + if status.LastScheduledTimes == nil { + newStatus.LastScheduledTimes = make(map[string]*metav1.Time) + } + return newStatus } func init() { diff --git a/pkg/apis/planetscale/v2/zz_generated.deepcopy.go b/pkg/apis/planetscale/v2/zz_generated.deepcopy.go index ace6b890..700716d2 100644 --- a/pkg/apis/planetscale/v2/zz_generated.deepcopy.go +++ b/pkg/apis/planetscale/v2/zz_generated.deepcopy.go @@ -7,6 +7,7 @@ package v2 import ( autoscalingv2 "k8s.io/api/autoscaling/v2" "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" ) @@ -855,6 +856,13 @@ func (in *VitessBackupScheduleStatus) DeepCopyInto(out *VitessBackupScheduleStat in, out := &in.LastScheduledTime, &out.LastScheduledTime *out = (*in).DeepCopy() } + if in.LastScheduledTimes != nil { + in, out := &in.LastScheduledTimes, &out.LastScheduledTimes + *out = make(map[string]*metav1.Time, len(*in)) + for key, val := range *in { + (*out)[key] = val.DeepCopy() + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VitessBackupScheduleStatus. diff --git a/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go b/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go index 5d2ddfb8..4091163a 100644 --- a/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go +++ b/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go @@ -32,6 +32,7 @@ import ( "github.com/sirupsen/logrus" kbatch "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" + apiequality "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" apilabels "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/tools/record" @@ -40,6 +41,7 @@ import ( "planetscale.dev/vitess-operator/pkg/operator/names" "planetscale.dev/vitess-operator/pkg/operator/reconciler" "planetscale.dev/vitess-operator/pkg/operator/results" + "planetscale.dev/vitess-operator/pkg/operator/resync" "planetscale.dev/vitess-operator/pkg/operator/vitessbackup" "planetscale.dev/vitess-operator/pkg/operator/vttablet" "sigs.k8s.io/controller-runtime/pkg/controller" @@ -51,7 +53,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" - ref "k8s.io/client-go/tools/reference" planetscalev2 "planetscale.dev/vitess-operator/pkg/apis/planetscale/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -64,6 +65,7 @@ const ( var ( maxConcurrentReconciles = flag.Int("vitessbackupschedule_concurrent_reconciles", 10, "the maximum number of different vitessbackupschedule resources to reconcile concurrently") + resyncPeriod = flag.Duration("vitessbackupschedule_resync_period", 1*time.Minute, "reconcile vitessbackupschedules with this period even if no Kubernetes events occur") scheduledTimeAnnotation = "planetscale.com/backup-scheduled-at" @@ -80,6 +82,7 @@ type ( ReconcileVitessBackupsSchedule struct { client client.Client scheme *runtime.Scheme + resync *resync.Periodic recorder record.EventRecorder reconciler *reconciler.Reconciler } @@ -111,6 +114,7 @@ func newReconciler(mgr manager.Manager) (*ReconcileVitessBackupsSchedule, error) return &ReconcileVitessBackupsSchedule{ client: c, scheme: scheme, + resync: resync.NewPeriodic(controllerName, *resyncPeriod), recorder: recorder, reconciler: 
reconciler.New(c, scheme, recorder), }, nil @@ -145,6 +149,11 @@ func add(mgr manager.Manager, r *ReconcileVitessBackupsSchedule) error { } } + // Periodically resync even when no Kubernetes events have come in. + if err := c.Watch(r.resync.WatchSource()); err != nil { + return err + } + return nil } @@ -156,16 +165,14 @@ func add(mgr manager.Manager, r *ReconcileVitessBackupsSchedule) error { // - List all jobs and define the last scheduled Job // - Clean up old Job objects // - Create a new Job if needed -func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - resultBuilder := &results.Builder{} - +// - Update the VitessBackupSchedule Status +func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl.Request) (res ctrl.Result, err error) { log = log.WithFields(logrus.Fields{ "namespace": req.Namespace, "VitessBackupSchedule": req.Name, }) log.Info("Reconciling VitessBackupSchedule") - var err error var vbsc planetscalev2.VitessBackupSchedule if err = r.client.Get(ctx, req.NamespacedName, &vbsc); err != nil { log.WithError(err).Error(" unable to fetch VitessBackupSchedule") @@ -173,113 +180,145 @@ func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl // Request object not found, could have been deleted after reconcile request. // Owned objects are automatically garbage collected. For additional cleanup logic use finalizers. // Return and don't requeue - return resultBuilder.Result() + return ctrl.Result{}, nil } // Error reading the object - requeue the request. - return resultBuilder.Error(err) + return ctrl.Result{}, err } + // If the Suspend setting is set to true, we can skip adding any job, our work is done here. + if vbsc.Spec.Suspend != nil && *vbsc.Spec.Suspend { + log.Info("VitessBackupSchedule suspended, skipping") + return ctrl.Result{}, nil + } + + oldStatus := vbsc.DeepCopy() + vbsc.Status = planetscalev2.NewVitessBackupScheduleStatus(vbsc.Status) + // Register this reconciling attempt no matter if we fail or succeed. defer func() { reconcileCount.WithLabelValues(vbsc.Name, metrics.Result(err)).Inc() }() - var scheduledResult ctrl.Result - var activeJobs []*kbatch.Job - for _, strategy := range vbsc.Spec.Strategy { - start, end, ok := strings.Cut(strategy.Shard, "-") - if !ok { - return resultBuilder.Error(fmt.Errorf("invalid strategy shard: %s", strategy.Shard)) - } - vkr := planetscalev2.VitessKeyRange{ - Start: start, - End: end, - } - jobs, mostRecentTime, err := r.getJobsList(ctx, req, vbsc, strategy.Keyspace, vkr.SafeName()) - if err != nil { - // We had an error reading the jobs, we can requeue. - return resultBuilder.Error(err) - } + resultBuilder := &results.Builder{} + _, _ = resultBuilder.Merge(r.reconcileStrategies(ctx, req, vbsc)) - activeJobs = append(activeJobs, jobs.active...) - err = r.updateVitessBackupScheduleStatus(ctx, mostRecentTime, vbsc, activeJobs) - if err != nil { - // We had an error updating the status, we can requeue. - return resultBuilder.Error(err) + if !apiequality.Semantic.DeepEqual(&vbsc.Status, &oldStatus) { + if err := r.client.Status().Update(ctx, &vbsc); err != nil { + if !apierrors.IsConflict(err) { + log.WithError(err).Error("unable to update VitessBackupSchedule status") + } + _, _ = resultBuilder.Error(err) } + } - // We must clean up old jobs to not overcrowd the number of Pods and Jobs in the cluster. - // This will be done according to both failedJobsHistoryLimit and successfulJobsHistoryLimit fields. 
- r.cleanupJobsWithLimit(ctx, jobs.failed, vbsc.GetFailedJobsLimit()) - r.cleanupJobsWithLimit(ctx, jobs.successful, vbsc.GetSuccessfulJobsLimit()) + // Request a periodic resync of this VitessBackupSchedule object to check + // if existing Jobs have finished or timed out and need to be cleaned up + // even if no Kubernetes events have occurred. + r.resync.Enqueue(req.NamespacedName) - err = r.removeTimeoutJobs(ctx, jobs.active, vbsc.Name, vbsc.Spec.JobTimeoutMinutes) - if err != nil { - // We had an error while removing timed out jobs, we can requeue - return resultBuilder.Error(err) - } + return resultBuilder.Result() +} - // If the Suspend setting is set to true, we can skip adding any job, our work is done here. - if vbsc.Spec.Suspend != nil && *vbsc.Spec.Suspend { - log.Info("VitessBackupSchedule suspended, skipping") - return ctrl.Result{}, nil - } +func (r *ReconcileVitessBackupsSchedule) reconcileStrategies(ctx context.Context, req ctrl.Request, vbsc planetscalev2.VitessBackupSchedule) (ctrl.Result, error) { + resultBuilder := &results.Builder{} - missedRun, nextRun, err := getNextSchedule(vbsc, time.Now()) - if err != nil { - log.Error(err, "unable to figure out VitessBackupSchedule schedule") - // Re-queuing here does not make sense as we have an error with the schedule and the user needs to fix it first. - return ctrl.Result{}, nil - } + for _, strategy := range vbsc.Spec.Strategy { + _, _ = resultBuilder.Merge(r.reconcileStrategy(ctx, strategy, req, vbsc)) + } + return resultBuilder.Result() +} - // Ask kubernetes to re-queue for the next scheduled job, and skip if we don't miss any run. - scheduledResult = ctrl.Result{RequeueAfter: nextRun.Sub(time.Now())} - if missedRun.IsZero() { - continue - } +func (r *ReconcileVitessBackupsSchedule) reconcileStrategy( + ctx context.Context, + strategy planetscalev2.VitessBackupScheduleStrategy, + req ctrl.Request, + vbsc planetscalev2.VitessBackupSchedule, +) (reconcile.Result, error) { + resultBuilder := &results.Builder{} - // Check whether we are too late to create this Job or not. The startingDeadlineSeconds field will help us - // schedule Jobs that are late. - tooLate := false - if vbsc.Spec.StartingDeadlineSeconds != nil { - tooLate = missedRun.Add(time.Duration(*vbsc.Spec.StartingDeadlineSeconds) * time.Second).Before(time.Now()) - } - if tooLate { - log.Infof("missed starting deadline for latest run; skipping; next run is scheduled for: %s", nextRun.Format(time.RFC3339)) - continue - } + start, end, ok := strings.Cut(strategy.Shard, "-") + if !ok { + return resultBuilder.Error(fmt.Errorf("invalid strategy shard: %s", strategy.Shard)) + } + vkr := planetscalev2.VitessKeyRange{ + Start: start, + End: end, + } + jobs, mostRecentTime, err := r.getJobsList(ctx, req, vbsc, strategy.Keyspace, vkr.SafeName()) + if err != nil { + // We had an error reading the jobs, we can requeue. + return resultBuilder.Error(err) + } - // Check concurrency policy and skip this job if we have ForbidConcurrent set plus an active job - if vbsc.Spec.ConcurrencyPolicy == planetscalev2.ForbidConcurrent && len(jobs.active) > 0 { - log.Infof("concurrency policy blocks concurrent runs: skipping, number of active jobs: %d", len(jobs.active)) - continue - } + // We must clean up old jobs to not overcrowd the number of Pods and Jobs in the cluster. + // This will be done according to both failedJobsHistoryLimit and successfulJobsHistoryLimit fields. 
+ r.cleanupJobsWithLimit(ctx, jobs.failed, vbsc.GetFailedJobsLimit()) + r.cleanupJobsWithLimit(ctx, jobs.successful, vbsc.GetSuccessfulJobsLimit()) - // Now that the different policies are checked, we can create and apply our new job. - job, err := r.createJob(ctx, vbsc, strategy, missedRun, vkr) - if err != nil { - // Re-queuing here does not make sense as we have an error with the template and the user needs to fix it first. - log.WithError(err).Error("unable to construct job from template") - return ctrl.Result{}, err - } + err = r.removeTimeoutJobs(ctx, jobs.active, vbsc.Name, vbsc.Spec.JobTimeoutMinutes) + if err != nil { + // We had an error while removing timed out jobs, we can requeue + return resultBuilder.Error(err) + } - if err = r.client.Create(ctx, job); err != nil { - // if the job already exists it means another reconciling loop created the job since we last fetched - // the list of jobs to create, we can safely return without failing. - if apierrors.IsAlreadyExists(err) { - log.Infof("job %s already exists, will retry in %s", job.Name, scheduledResult.RequeueAfter.String()) - continue - } - // Simply re-queue here - return resultBuilder.Error(err) - } + missedRun, nextRun, err := getNextSchedule(vbsc, time.Now(), mostRecentTime) + if err != nil { + log.Error(err, "unable to figure out VitessBackupSchedule schedule") + // Re-queuing here does not make sense as we have an error with the schedule and the user needs to fix it first. + return resultBuilder.Error(reconcile.TerminalError(err)) + } - log.Infof("created new job: %s, next job scheduled in %s", job.Name, scheduledResult.RequeueAfter.String()) + // If we did not miss any run, we can skip and not requeue anything + if missedRun.IsZero() { + return resultBuilder.Result() } - return scheduledResult, nil + + // Keep track of when we need to requeue this job + requeueAfter := nextRun.Sub(time.Now()) + _, _ = resultBuilder.RequeueAfter(requeueAfter) + + // Check whether we are too late to create this Job or not. The startingDeadlineSeconds field will help us + // schedule Jobs that are late. + tooLate := false + if vbsc.Spec.StartingDeadlineSeconds != nil { + tooLate = missedRun.Add(time.Duration(*vbsc.Spec.StartingDeadlineSeconds) * time.Second).Before(time.Now()) + } + if tooLate { + log.Infof("missed starting deadline for latest run; skipping; next run is scheduled for: %s", nextRun.Format(time.RFC3339)) + return resultBuilder.Result() + } + + // Check concurrency policy and skip this job if we have ForbidConcurrent set plus an active job + if vbsc.Spec.ConcurrencyPolicy == planetscalev2.ForbidConcurrent && len(jobs.active) > 0 { + log.Infof("concurrency policy blocks concurrent runs: skipping, number of active jobs: %d", len(jobs.active)) + return resultBuilder.Result() + } + + // Now that the different policies are checked, we can create and apply our new job. + job, err := r.createJob(ctx, vbsc, strategy, missedRun, vkr) + if err != nil { + // Re-queuing here does not make sense as we have an error with the template and the user needs to fix it first. + log.WithError(err).Error("unable to construct job from template") + return resultBuilder.Error(reconcile.TerminalError(err)) + } + + if err = r.client.Create(ctx, job); err != nil { + // if the job already exists it means another reconciling loop created the job since we last fetched + // the list of jobs to create, we can safely return without failing. 
+ if apierrors.IsAlreadyExists(err) { + log.Infof("job %s already exists, will retry in %s", job.Name, requeueAfter.String()) + return resultBuilder.Result() + } + // Simply re-queue here + return resultBuilder.Error(err) + } + log.Infof("created new job: %s, next job scheduled in %s", job.Name, requeueAfter.String()) + vbsc.Status.LastScheduledTimes[strategy.Name] = &metav1.Time{Time: missedRun} + return resultBuilder.Result() } -func getNextSchedule(vbsc planetscalev2.VitessBackupSchedule, now time.Time) (time.Time, time.Time, error) { +func getNextSchedule(vbsc planetscalev2.VitessBackupSchedule, now time.Time, mostRecentTime *time.Time) (time.Time, time.Time, error) { sched, err := cron.ParseStandard(vbsc.Spec.Schedule) if err != nil { return time.Time{}, time.Time{}, fmt.Errorf("unable to parse schedule %q: %v", vbsc.Spec.Schedule, err) @@ -287,30 +326,27 @@ func getNextSchedule(vbsc planetscalev2.VitessBackupSchedule, now time.Time) (ti // Set the last scheduled time by either looking at the VitessBackupSchedule's Status or // by looking at its creation time. - var latestRun time.Time - if vbsc.Status.LastScheduledTime != nil { - latestRun = vbsc.Status.LastScheduledTime.Time - } else { - latestRun = vbsc.ObjectMeta.CreationTimestamp.Time + if mostRecentTime == nil { + mostRecentTime = &vbsc.ObjectMeta.CreationTimestamp.Time } if vbsc.Spec.StartingDeadlineSeconds != nil { // controller is not going to schedule anything below this point schedulingDeadline := now.Add(-time.Second * time.Duration(*vbsc.Spec.StartingDeadlineSeconds)) - if schedulingDeadline.After(latestRun) { - latestRun = schedulingDeadline + if schedulingDeadline.After(*mostRecentTime) { + *mostRecentTime = schedulingDeadline } } // Next schedule is later, simply return the next scheduled time. - if latestRun.After(now) { + if mostRecentTime.After(now) { return time.Time{}, sched.Next(now), nil } var lastMissed time.Time missedRuns := 0 - for t := sched.Next(latestRun); !t.After(now); t = sched.Next(t) { + for t := sched.Next(*mostRecentTime); !t.After(now); t = sched.Next(t) { lastMissed = t missedRuns++ @@ -323,30 +359,6 @@ func getNextSchedule(vbsc planetscalev2.VitessBackupSchedule, now time.Time) (ti return lastMissed, sched.Next(now), nil } -func (r *ReconcileVitessBackupsSchedule) updateVitessBackupScheduleStatus(ctx context.Context, mostRecentTime *time.Time, vbsc planetscalev2.VitessBackupSchedule, activeJobs []*kbatch.Job) error { - if mostRecentTime != nil { - vbsc.Status.LastScheduledTime = &metav1.Time{Time: *mostRecentTime} - } else { - vbsc.Status.LastScheduledTime = nil - } - - vbsc.Status.Active = make([]corev1.ObjectReference, 0, len(activeJobs)) - for _, activeJob := range activeJobs { - jobRef, err := ref.GetReference(r.scheme, activeJob) - if err != nil { - log.WithError(err).Errorf("unable to make reference to active job: %s", jobRef.Name) - continue - } - vbsc.Status.Active = append(vbsc.Status.Active, *jobRef) - } - - if err := r.client.Status().Update(ctx, &vbsc); err != nil { - log.WithError(err).Error("unable to update VitessBackupSchedule status") - return err - } - return nil -} - // getJobsList fetches all existing Jobs in the cluster and return them by categories: active, failed or successful. // It also returns at what time was the last job created, which is needed to update VitessBackupSchedule's status, // and plan future jobs. 
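Since the rewritten flow leans heavily on getNextSchedule, a compact standalone sketch of the same cron walk may help. It assumes the robfig/cron module that provides ParseStandard (as used above) and deliberately omits the missed-run counter and the startingDeadlineSeconds handling; it is an illustration, not the controller's exact code.

    package main

    import (
        "fmt"
        "time"

        "github.com/robfig/cron/v3"
    )

    // nextSchedule walks the cron spec forward from the most recent recorded
    // run and returns the last missed run (zero if none) plus the next one.
    func nextSchedule(spec string, lastRun, now time.Time) (time.Time, time.Time, error) {
        sched, err := cron.ParseStandard(spec)
        if err != nil {
            return time.Time{}, time.Time{}, fmt.Errorf("unable to parse schedule %q: %v", spec, err)
        }
        var lastMissed time.Time
        for t := sched.Next(lastRun); !t.After(now); t = sched.Next(t) {
            lastMissed = t
        }
        return lastMissed, sched.Next(now), nil
    }

    func main() {
        now := time.Date(2025, 2, 25, 12, 7, 30, 0, time.UTC)
        lastRun := now.Add(-11 * time.Minute)
        missed, next, err := nextSchedule("*/5 * * * *", lastRun, now)
        if err != nil {
            panic(err)
        }
        // With the values above: missed run at 12:05, next run at 12:10.
        fmt.Println("missed:", missed, "next:", next)
    }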
diff --git a/test/endtoend/backup_schedule_test.sh b/test/endtoend/backup_schedule_test.sh index ce661ed4..8620bbc0 100755 --- a/test/endtoend/backup_schedule_test.sh +++ b/test/endtoend/backup_schedule_test.sh @@ -31,27 +31,26 @@ function verifyListBackupsOutputWithSchedule() { checkVitessBackupScheduleStatusWithTimeout "example-vbsc-every-five-minute(.*)" echo -e "Check for number of backups in the cluster" - # Sleep for over 6 minutes, during this time we should have at the very minimum - # 7 backups. At least: 6 backups from the every-minute schedule, and 1 backup - # from the every-five-minute schedule. - for i in {1..6} ; do + # Sleep for 6 minutes, after 6 minutes we should at least have 3 backups: 1 from the initial vtbackup pod + # 1 minimum from the every minute schedule, and 1 from the every-five minute schedule + for i in {1..360} ; do # Ensure that we can view the backup files from the host. docker exec -it $(docker container ls --format '{{.Names}}' | grep kind) chmod o+rwx -R /backup > /dev/null backupCount=$(kubectl get vtb --no-headers | wc -l) echo "Found ${backupCount} backups" - if [[ "${backupCount}" -ge 7 ]]; then + if [[ "${backupCount}" -ge 3 ]]; then break fi - sleep 100 + sleep 1 done - if [[ "${backupCount}" -lt 7 ]]; then - echo "Did not find at least 7 backups" + if [[ "${backupCount}" -lt 3 ]]; then + echo "Did not find at least 3 backups" exit 1 fi echo -e "Check for Jobs' pods" - checkPodStatusWithTimeout "example-vbsc-every-minute-(.*)0/1(.*)Completed(.*)" 3 - checkPodStatusWithTimeout "example-vbsc-every-five-minute-(.*)0/1(.*)Completed(.*)" 2 + checkPodExistWithTimeout "example-vbsc-every-minute-(.*)0/1(.*)Completed(.*)" + checkPodExistWithTimeout "example-vbsc-every-five-minute-(.*)0/1(.*)Completed(.*)" } # Test setup diff --git a/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml b/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml index 7fe852cf..b6379901 100644 --- a/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml +++ b/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml @@ -27,7 +27,7 @@ spec: failedJobsHistoryLimit: 3 jobTimeoutMinute: 5 strategies: - - name: BackupShard + - name: commerce-x keyspace: "commerce" shard: "-" - name: "every-five-minute" @@ -42,7 +42,7 @@ spec: failedJobsHistoryLimit: 3 jobTimeoutMinute: 5 strategies: - - name: BackupShard + - name: commerce-x keyspace: "commerce" shard: "-" images: diff --git a/test/endtoend/operator/operator-latest.yaml b/test/endtoend/operator/operator-latest.yaml index 4d41262a..be23f61a 100644 --- a/test/endtoend/operator/operator-latest.yaml +++ b/test/endtoend/operator/operator-latest.yaml @@ -492,8 +492,6 @@ spec: example: commerce type: string name: - enum: - - BackupShard type: string shard: example: '-' @@ -543,6 +541,11 @@ spec: lastScheduledTime: format: date-time type: string + lastScheduledTimes: + additionalProperties: + format: date-time + type: string + type: object type: object type: object served: true @@ -2101,8 +2104,6 @@ spec: example: commerce type: string name: - enum: - - BackupShard type: string shard: example: '-' diff --git a/test/endtoend/utils.sh b/test/endtoend/utils.sh index 4de6add1..74e380f8 100644 --- a/test/endtoend/utils.sh +++ b/test/endtoend/utils.sh @@ -155,6 +155,28 @@ function checkPodStatusWithTimeout() { exit 1 } +function checkPodExistWithTimeout() { + regex=$1 + + # We use this for loop instead of `kubectl wait` because we don't have access to the full pod name + # and `kubectl wait` does not support 
regex to match resource name. + for i in {1..1200} ; do + out=$(kubectl get pods) + echo "$out" | grep -E "$regex" > /dev/null 2>&1 + if [[ $? -eq 0 ]]; then + echo "$regex found" + return + fi + sleep 1 + done + echo -e "ERROR: checkPodStatusWithTimeout timeout to find pod matching:\ngot:\n$out\nfor regex: $regex" + echo "$regex" | grep "vttablet" > /dev/null 2>&1 + if [[ $? -eq 0 ]]; then + printMysqlErrorFiles + fi + exit 1 +} + # ensurePodResourcesSet: # $1: regex used to match pod names function ensurePodResourcesSet() { From e35d9e2df593e2c9c3111643743a66a9e9e463dc Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Tue, 25 Feb 2025 13:06:46 -0600 Subject: [PATCH 09/13] Add more tests both in upgrade and scheduled backup Signed-off-by: Florent Poinsard --- test/endtoend/backup_schedule_test.sh | 21 ++---- .../operator/401_scheduled_backups.yaml | 27 ++++---- test/endtoend/upgrade_test.sh | 65 ++++++++++++++----- test/endtoend/utils.sh | 16 ++++- 4 files changed, 79 insertions(+), 50 deletions(-) diff --git a/test/endtoend/backup_schedule_test.sh b/test/endtoend/backup_schedule_test.sh index 8620bbc0..34ca9f94 100755 --- a/test/endtoend/backup_schedule_test.sh +++ b/test/endtoend/backup_schedule_test.sh @@ -11,29 +11,15 @@ function takedownShard() { checkPodStatusWithTimeout "example-vttablet-zone1" 0 } -function checkVitessBackupScheduleStatusWithTimeout() { - regex=$1 - - for i in {1..1200} ; do - if [[ $(kubectl get VitessBackupSchedule | grep -E "${regex}" | wc -l) -eq 1 ]]; then - echo "$regex found" - return - fi - sleep 1 - done - echo -e "ERROR: checkPodStatusWithTimeout timeout to find pod matching:\ngot:\n$out\nfor regex: $regex" - exit 1 -} - function verifyListBackupsOutputWithSchedule() { echo -e "Check for VitessBackupSchedule status" checkVitessBackupScheduleStatusWithTimeout "example-vbsc-every-minute(.*)" checkVitessBackupScheduleStatusWithTimeout "example-vbsc-every-five-minute(.*)" echo -e "Check for number of backups in the cluster" - # Sleep for 6 minutes, after 6 minutes we should at least have 3 backups: 1 from the initial vtbackup pod + # Sleep for 7 minutes, after 7 minutes we should at least have 3 backups: 1 from the initial vtbackup pod # 1 minimum from the every minute schedule, and 1 from the every-five minute schedule - for i in {1..360} ; do + for i in {1..420} ; do # Ensure that we can view the backup files from the host. 
docker exec -it $(docker container ls --format '{{.Names}}' | grep kind) chmod o+rwx -R /backup > /dev/null backupCount=$(kubectl get vtb --no-headers | wc -l) @@ -43,12 +29,13 @@ function verifyListBackupsOutputWithSchedule() { fi sleep 1 done - if [[ "${backupCount}" -lt 3 ]]; then + if [[ "${backupCount}" -ge 3 ]]; then echo "Did not find at least 3 backups" exit 1 fi echo -e "Check for Jobs' pods" + # Here check explicitly that the every five minute schedule ran at least once during the 7 minutes sleep checkPodExistWithTimeout "example-vbsc-every-minute-(.*)0/1(.*)Completed(.*)" checkPodExistWithTimeout "example-vbsc-every-five-minute-(.*)0/1(.*)Completed(.*)" } diff --git a/test/endtoend/operator/401_scheduled_backups.yaml b/test/endtoend/operator/401_scheduled_backups.yaml index 52c18053..221ab22b 100644 --- a/test/endtoend/operator/401_scheduled_backups.yaml +++ b/test/endtoend/operator/401_scheduled_backups.yaml @@ -11,8 +11,8 @@ spec: path: /backup type: Directory schedules: - - name: "customer" - schedule: "* * * * *" + - name: "commerce" + schedule: "*/2 * * * *" resources: requests: cpu: 100m @@ -22,16 +22,12 @@ spec: successfulJobsHistoryLimit: 2 failedJobsHistoryLimit: 3 jobTimeoutMinute: 5 - suspend: true strategies: - - name: BackupShard - keyspace: "customer" - shard: "80-" - - name: BackupShard - keyspace: "customer" - shard: "-80" - - name: "commerce" - schedule: "* * * * *" + - name: commerce_x + keyspace: "commerce" + shard: "-" + - name: "customer" + schedule: "*/2 * * * *" resources: requests: cpu: 100m @@ -42,9 +38,12 @@ spec: failedJobsHistoryLimit: 3 jobTimeoutMinute: 5 strategies: - - name: BackupShard - keyspace: "commerce" - shard: "-" + - name: customer_80-x + keyspace: "customer" + shard: "80-" + - name: customer_x-80 + keyspace: "customer" + shard: "-80" images: vtctld: vitess/lite:latest vtgate: vitess/lite:latest diff --git a/test/endtoend/upgrade_test.sh b/test/endtoend/upgrade_test.sh index 3bc4552c..256e2b7b 100755 --- a/test/endtoend/upgrade_test.sh +++ b/test/endtoend/upgrade_test.sh @@ -17,6 +17,7 @@ function move_tables() { sleep 10 + echo "Execute MoveTables" vtctldclient LegacyVtctlCommand -- MoveTables --source commerce --tables 'customer,corder' Create customer.commerce2customer if [ $? -ne 0 ]; then echo "MoveTables failed" @@ -26,6 +27,7 @@ function move_tables() { sleep 10 + echo "Execute VDiff" vdiff_out=$(vtctldclient LegacyVtctlCommand -- VDiff customer.commerce2customer) echo "$vdiff_out" | grep "ProcessedRows: 5" | wc -l | grep "2" > /dev/null if [ $? -ne 0 ]; then @@ -33,6 +35,7 @@ function move_tables() { # Allow failure fi + echo "SwitchTraffic for rdonly" vtctldclient LegacyVtctlCommand -- MoveTables --tablet_types='rdonly,replica' SwitchTraffic customer.commerce2customer if [ $? -ne 0 ]; then echo "SwitchTraffic for rdonly and replica failed" @@ -40,6 +43,7 @@ function move_tables() { exit 1 fi + echo "SwitchTraffic for primary" vtctldclient LegacyVtctlCommand -- MoveTables --tablet_types='primary' SwitchTraffic customer.commerce2customer if [ $? -ne 0 ]; then echo "SwitchTraffic for primary failed" @@ -47,6 +51,7 @@ function move_tables() { exit 1 fi + echo "Complete MoveTables" vtctldclient LegacyVtctlCommand -- MoveTables Complete customer.commerce2customer if [ $? 
-ne 0 ]; then echo "MoveTables Complete failed" @@ -177,6 +182,39 @@ EOF waitForKeyspaceToBeServing customer 80- 2 } +function scheduledBackups() { + echo "Apply 401_scheduled_backups.yaml" + kubectl apply -f 401_scheduled_backups.yaml > /dev/null + + checkVitessBackupScheduleStatusWithTimeout "example-vbsc-commerce(.*)" + checkVitessBackupScheduleStatusWithTimeout "example-vbsc-customer(.*)" + + docker exec -it $(docker container ls --format '{{.Names}}' | grep kind) chmod o+rwx -R /backup > /dev/null + initialCommerceBackups=$(kubectl get vtb --no-headers | grep "commerce-x-x" | wc -l) + initialCustomerFirstShardBackups=$(kubectl get vtb --no-headers | grep "customer-x-80" | wc -l) + initialCustomerSecondShardBackups=$(kubectl get vtb --no-headers | grep "customer-80-x" | wc -l) + + for i in {1..300} ; do + commerceBackups=$(kubectl get vtb --no-headers | grep "commerce-x-x" | wc -l) + customerFirstShardBackups=$(kubectl get vtb --no-headers | grep "customer-x-80" | wc -l) + customerSecondShardBackups=$(kubectl get vtb --no-headers | grep "customer-80-x" | wc -l) + + if [[ "${customerFirstShardBackups}" -ge $(( initialCustomerFirstShardBackups + 2 )) && "${customerSecondShardBackups}" -ge $(( initialCustomerSecondShardBackups + 2 )) && "${commerceBackups}" -ge $(( initialCommerceBackups + 2 )) ]]; then + echo "Found all backups" + return + else + echo "Got: ${customerFirstShardBackups} customer-x-80 backups but want: $(( initialCustomerFirstShardBackups + 2 ))" + echo "Got: ${customerSecondShardBackups} customer-80-x backups but want: $(( initialCustomerSecondShardBackups + 2 ))" + echo "Got: ${commerceBackups} commerce-x-x backups but want: $(( initialCommerceBackups + 2 ))" + echo "" + fi + sleep 10 + done + + echo "Did not find the backups on time" + exit 1 +} + function waitAndVerifySetup() { sleep 10 checkPodStatusWithTimeout "example-zone1-vtctld(.*)1/1(.*)Running(.*)" @@ -236,22 +274,8 @@ EOF mkdir -p -m 777 ./vtdataroot/backup echo "Building the docker image" docker build -f build/Dockerfile.release -t vitess-operator-pr:latest . -echo "Creating Kind cluster" -if [[ "$BUILDKITE_BUILD_ID" != "0" ]]; then - # The script is being run from buildkite, so we can't mount the current - # working directory to kind. The current directory in the docker is workdir - # So if we try and mount that, we get an error. Instead we need to mount the - # path where the code was checked out be buildkite - dockerContainerName=$(docker container ls --filter "ancestor=docker" --format '{{.Names}}') - CHECKOUT_PATH=$(docker container inspect -f '{{range .Mounts}}{{ if eq .Destination "/workdir" }}{{println .Source }}{{ end }}{{end}}' "$dockerContainerName") - BACKUP_DIR="$CHECKOUT_PATH/vtdataroot/backup" -else - BACKUP_DIR="$PWD/vtdataroot/backup" -fi -cat ./test/endtoend/kindBackupConfig.yaml | sed "s,PATH,$BACKUP_DIR,1" > ./vtdataroot/config.yaml -kind create cluster --wait 30s --name kind-${BUILDKITE_BUILD_ID} --image ${KIND_VERSION} --config ./vtdataroot/config.yaml -echo "Loading docker image into Kind cluster" -kind load docker-image vitess-operator-pr:latest --name kind-${BUILDKITE_BUILD_ID} +setupKindConfig +createKindCluster cd "$PWD/test/endtoend/operator" killall kubectl @@ -270,6 +294,11 @@ verifyDurabilityPolicy "commerce" "semi_sync" move_tables resharding +scheduledBackups + # Teardown -#echo "Deleting Kind cluster. 
This also deletes the volume associated with it" -#kind delete cluster --name kind-${BUILDKITE_BUILD_ID} +echo "Removing the temporary directory" +removeBackupFiles +rm -rf "$STARTING_DIR/vtdataroot" +echo "Deleting Kind cluster. This also deletes the volume associated with it" +kind delete cluster --name kind-${BUILDKITE_BUILD_ID} diff --git a/test/endtoend/utils.sh b/test/endtoend/utils.sh index 74e380f8..fbdd55c0 100644 --- a/test/endtoend/utils.sh +++ b/test/endtoend/utils.sh @@ -247,7 +247,7 @@ function waitForKeyspaceToBeServing() { fi echo "Shard $ks/$shard is not fully serving. Output: $out" echo "Retrying (attempt #$i) ..." - sleep 10 + sleep 1 done } @@ -397,3 +397,17 @@ COrder +----------+-------------+----------+-------+ EOF } + +function checkVitessBackupScheduleStatusWithTimeout() { + regex=$1 + + for i in {1..1200} ; do + if [[ $(kubectl get VitessBackupSchedule | grep -E "${regex}" | wc -l) -eq 1 ]]; then + echo "$regex found" + return + fi + sleep 1 + done + echo -e "ERROR: checkPodStatusWithTimeout timeout to find pod matching:\ngot:\n$out\nfor regex: $regex" + exit 1 +} From d29a799e8569b8511318affa89896685baef981b Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Tue, 25 Feb 2025 13:07:36 -0600 Subject: [PATCH 10/13] Remove unused function in vbsc controller Signed-off-by: Florent Poinsard --- .../vitessbackupschedule_controller.go | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go b/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go index 4091163a..16becc10 100644 --- a/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go +++ b/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go @@ -59,8 +59,7 @@ import ( ) const ( - controllerName = "vitessbackupschedule-controller" - vtctldclientPath = "/vt/bin/vtctldclient" + controllerName = "vitessbackupschedule-controller" ) var ( @@ -629,18 +628,6 @@ func (r *ReconcileVitessBackupsSchedule) createJobPod( return p, vtbackupSpec, nil } -func createVtctldClientCommand(cmd *strings.Builder, serverAddr string, extraFlags map[string]string, keyspace, shard string) { - cmd.WriteString(fmt.Sprintf("%s %s BackupShard", vtctldclientPath, serverAddr)) - - // Add any flags - for key, value := range extraFlags { - cmd.WriteString(fmt.Sprintf(" --%s=%s", key, value)) - } - - // Add keyspace/shard - cmd.WriteString(fmt.Sprintf(" %s/%s", keyspace, shard)) -} - func (r *ReconcileVitessBackupsSchedule) getVtctldServiceName(ctx context.Context, vbsc *planetscalev2.VitessBackupSchedule, cluster string) (svcName string, svcPort int32, err error) { svcList := &corev1.ServiceList{} listOpts := &client.ListOptions{ From 8d4f5ee559c762421e1d3bac9fd9647b67fe2c7d Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Tue, 25 Feb 2025 14:46:26 -0600 Subject: [PATCH 11/13] Increase timeout in verifyListBackupsOutputWithSchedule Signed-off-by: Florent Poinsard --- test/endtoend/backup_schedule_test.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/endtoend/backup_schedule_test.sh b/test/endtoend/backup_schedule_test.sh index 34ca9f94..db46a508 100755 --- a/test/endtoend/backup_schedule_test.sh +++ b/test/endtoend/backup_schedule_test.sh @@ -17,9 +17,9 @@ function verifyListBackupsOutputWithSchedule() { checkVitessBackupScheduleStatusWithTimeout "example-vbsc-every-five-minute(.*)" echo -e "Check for number of backups in the cluster" - # Sleep for 7 minutes, after 7 
minutes we should at least have 3 backups: 1 from the initial vtbackup pod + # Sleep for 10 minutes, after 10 minutes we should at least have 3 backups: 1 from the initial vtbackup pod # 1 minimum from the every minute schedule, and 1 from the every-five minute schedule - for i in {1..420} ; do + for i in {1..600} ; do # Ensure that we can view the backup files from the host. docker exec -it $(docker container ls --format '{{.Names}}' | grep kind) chmod o+rwx -R /backup > /dev/null backupCount=$(kubectl get vtb --no-headers | wc -l) @@ -35,7 +35,7 @@ function verifyListBackupsOutputWithSchedule() { fi echo -e "Check for Jobs' pods" - # Here check explicitly that the every five minute schedule ran at least once during the 7 minutes sleep + # Here check explicitly that the every five minute schedule ran at least once during the 10 minutes sleep checkPodExistWithTimeout "example-vbsc-every-minute-(.*)0/1(.*)Completed(.*)" checkPodExistWithTimeout "example-vbsc-every-five-minute-(.*)0/1(.*)Completed(.*)" } From 28875f9c973d4534ef696ef647a198d4ec032408 Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Tue, 25 Feb 2025 15:25:31 -0600 Subject: [PATCH 12/13] Self review Signed-off-by: Florent Poinsard --- pkg/controller/vitessshard/reconcile_backup_job.go | 2 +- test/endtoend/upgrade_test.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/controller/vitessshard/reconcile_backup_job.go b/pkg/controller/vitessshard/reconcile_backup_job.go index 23a82973..14aef2a7 100644 --- a/pkg/controller/vitessshard/reconcile_backup_job.go +++ b/pkg/controller/vitessshard/reconcile_backup_job.go @@ -84,7 +84,7 @@ func (r *ReconcileVitessShard) reconcileBackupJob(ctx context.Context, vts *plan // scratch (not from any tablet). If we're wrong and a backup exists // already, the idempotent vtbackup "initial backup" mode will just do // nothing and return success. 
- backupType := vitessbackup.TypeUpdate + backupType := vitessbackup.TypeFirstBackup if vts.Status.HasMaster != corev1.ConditionTrue { backupType = vitessbackup.TypeInit } diff --git a/test/endtoend/upgrade_test.sh b/test/endtoend/upgrade_test.sh index 256e2b7b..4aca2c50 100755 --- a/test/endtoend/upgrade_test.sh +++ b/test/endtoend/upgrade_test.sh @@ -194,7 +194,7 @@ function scheduledBackups() { initialCustomerFirstShardBackups=$(kubectl get vtb --no-headers | grep "customer-x-80" | wc -l) initialCustomerSecondShardBackups=$(kubectl get vtb --no-headers | grep "customer-80-x" | wc -l) - for i in {1..300} ; do + for i in {1..60} ; do commerceBackups=$(kubectl get vtb --no-headers | grep "commerce-x-x" | wc -l) customerFirstShardBackups=$(kubectl get vtb --no-headers | grep "customer-x-80" | wc -l) customerSecondShardBackups=$(kubectl get vtb --no-headers | grep "customer-80-x" | wc -l) @@ -281,7 +281,7 @@ cd "$PWD/test/endtoend/operator" killall kubectl setupKubectlAccessForCI -get_started "operator-latest.yaml" "101_initial_cluster.yaml" +get_started "operator.yaml" "101_initial_cluster.yaml" verifyVtGateVersion "21.0.0" checkSemiSyncSetup # Initially too durability policy should be specified From 91897cc8a220c94cbb0182dc7f769fac3e86c79b Mon Sep 17 00:00:00 2001 From: Florent Poinsard Date: Tue, 25 Feb 2025 16:16:05 -0600 Subject: [PATCH 13/13] Fix test logic Signed-off-by: Florent Poinsard --- test/endtoend/backup_schedule_test.sh | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/test/endtoend/backup_schedule_test.sh b/test/endtoend/backup_schedule_test.sh index db46a508..23393289 100755 --- a/test/endtoend/backup_schedule_test.sh +++ b/test/endtoend/backup_schedule_test.sh @@ -25,19 +25,16 @@ function verifyListBackupsOutputWithSchedule() { backupCount=$(kubectl get vtb --no-headers | wc -l) echo "Found ${backupCount} backups" if [[ "${backupCount}" -ge 3 ]]; then - break + echo -e "Check for Jobs' pods" + # Here check explicitly that the every five minute schedule ran at least once during the 10 minutes sleep + checkPodExistWithTimeout "example-vbsc-every-minute-(.*)0/1(.*)Completed(.*)" + checkPodExistWithTimeout "example-vbsc-every-five-minute-(.*)0/1(.*)Completed(.*)" + return fi sleep 1 done - if [[ "${backupCount}" -ge 3 ]]; then - echo "Did not find at least 3 backups" - exit 1 - fi - - echo -e "Check for Jobs' pods" - # Here check explicitly that the every five minute schedule ran at least once during the 10 minutes sleep - checkPodExistWithTimeout "example-vbsc-every-minute-(.*)0/1(.*)Completed(.*)" - checkPodExistWithTimeout "example-vbsc-every-five-minute-(.*)0/1(.*)Completed(.*)" + echo "Did not find at least 3 backups" + exit 1 } # Test setup
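A quick sanity check on the timing used by these tests: with an every-minute and an every-five-minute schedule, the 10-minute polling window in backup_schedule_test.sh should see roughly ten runs of the first schedule and two of the second, so waiting for at least 3 VitessBackup objects (the initial vtbackup plus one per schedule) leaves comfortable headroom. Similarly, the "*/2 * * * *" schedules in upgrade_test.sh fire about five times during the 60 iterations of 10 seconds, so requiring two additional backups per shard is a conservative lower bound.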