From c10c0f463dbc43689dd4b06dbeab03a0d15d3e14 Mon Sep 17 00:00:00 2001
From: Florent Poinsard
Date: Thu, 23 Jan 2025 15:37:34 -0600
Subject: [PATCH 01/13] Refactor GetCompletedBackup to be shared between
different controllers
Signed-off-by: Florent Poinsard
---
.../vitessshard/reconcile_backup_job.go | 29 ++++---------
pkg/operator/vitessbackup/object.go | 41 ++++++++++++++++++-
2 files changed, 47 insertions(+), 23 deletions(-)
diff --git a/pkg/controller/vitessshard/reconcile_backup_job.go b/pkg/controller/vitessshard/reconcile_backup_job.go
index f580ce91..623a066f 100644
--- a/pkg/controller/vitessshard/reconcile_backup_job.go
+++ b/pkg/controller/vitessshard/reconcile_backup_job.go
@@ -22,7 +22,6 @@ import (
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
- apilabels "k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
@@ -56,27 +55,15 @@ func (r *ReconcileVitessShard) reconcileBackupJob(ctx context.Context, vts *plan
vitessbackup.TypeLabel: vitessbackup.TypeInit,
}
- // List all backups for this shard, across all storage locations.
- // We'll use the latest observed state of backups to decide whether to take
- // a new one. This list could be out of date because it's populated by
- // polling the Vitess API (see the VitessBackupStorage controller), but as
- // long as it's eventually consistent, we'll converge to the right behavior.
- allBackups := &planetscalev2.VitessBackupList{}
- listOpts := &client.ListOptions{
- Namespace: vts.Namespace,
- LabelSelector: apilabels.SelectorFromSet(apilabels.Set{
- planetscalev2.ClusterLabel: clusterName,
- planetscalev2.KeyspaceLabel: keyspaceName,
- planetscalev2.ShardLabel: shardSafeName,
- }),
- }
- if err := r.client.List(ctx, allBackups, listOpts); err != nil {
+ allBackups, completeBackups, err := vitessbackup.GetBackups(ctx, vts.Namespace, clusterName, keyspaceName, shardSafeName,
+ func(ctx context.Context, allBackupsList *planetscalev2.VitessBackupList, listOpts *client.ListOptions) error {
+ return r.client.List(ctx, allBackupsList, listOpts)
+ },
+ )
+ if err != nil {
return resultBuilder.Error(err)
}
- updateBackupStatus(vts, allBackups.Items)
-
- // Here we only care about complete backups.
- completeBackups := vitessbackup.CompleteBackups(allBackups.Items)
+ updateBackupStatus(vts, allBackups)
// Generate keys (object names) for all desired backup Pods and PVCs.
// Keep a map back from generated names to the backup specs.
@@ -112,7 +99,7 @@ func (r *ReconcileVitessShard) reconcileBackupJob(ctx context.Context, vts *plan
// Reconcile vtbackup PVCs. Use the same key as the corresponding Pod,
// but only if the Pod expects a PVC.
- err := r.reconciler.ReconcileObjectSet(ctx, vts, pvcKeys, labels, reconciler.Strategy{
+ err = r.reconciler.ReconcileObjectSet(ctx, vts, pvcKeys, labels, reconciler.Strategy{
Kind: &corev1.PersistentVolumeClaim{},
New: func(key client.ObjectKey) runtime.Object {
diff --git a/pkg/operator/vitessbackup/object.go b/pkg/operator/vitessbackup/object.go
index b5c011d8..7c74495a 100644
--- a/pkg/operator/vitessbackup/object.go
+++ b/pkg/operator/vitessbackup/object.go
@@ -17,11 +17,14 @@ limitations under the License.
package vitessbackup
import (
+ "context"
"fmt"
"strconv"
"strings"
"time"
+ apilabels "k8s.io/apimachinery/pkg/labels"
+ "sigs.k8s.io/controller-runtime/pkg/client"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
"vitess.io/vitess/go/vt/topo/topoproto"
@@ -99,8 +102,42 @@ func LatestForLocation(locationName string, backups []*planetscalev2.VitessBacku
return latest
}
-// CompleteBackups returns a list of only the complete backups from the input.
-func CompleteBackups(backups []planetscalev2.VitessBackup) []*planetscalev2.VitessBackup {
+// GetBackups returns the list of all backups, along with the subset of complete backups, for the given
+// keyspace/shard in the given cluster.
+// The caller must supply a listBackups function that performs the List call with its controller's client.
+func GetBackups(
+ ctx context.Context,
+ namespace, clusterName, keyspaceName, shardSafeName string,
+ listBackups func(context.Context, *planetscalev2.VitessBackupList, *client.ListOptions) error,
+) (allBackups []planetscalev2.VitessBackup, completeBackups []*planetscalev2.VitessBackup, err error) {
+ // List all backups for this shard, across all storage locations.
+ // We'll use the latest observed state of backups to decide whether to take
+ // a new one. This list could be out of date because it's populated by
+ // polling the Vitess API (see the VitessBackupStorage controller), but as
+ // long as it's eventually consistent, we'll converge to the right behavior.
+ allBackupsList := &planetscalev2.VitessBackupList{}
+ listOpts := &client.ListOptions{
+ Namespace: namespace,
+ LabelSelector: apilabels.SelectorFromSet(apilabels.Set{
+ planetscalev2.ClusterLabel: clusterName,
+ planetscalev2.KeyspaceLabel: keyspaceName,
+ planetscalev2.ShardLabel: shardSafeName,
+ }),
+ }
+ if err = listBackups(ctx, allBackupsList, listOpts); err != nil {
+ return nil, nil, err
+ }
+
+ allBackups = allBackupsList.Items
+
+ // Keep only the complete backups.
+ completeBackups = getCompleteBackups(allBackups)
+
+ return allBackups, completeBackups, nil
+}
+
+// getCompleteBackups returns a list of only the complete backups from the input.
+func getCompleteBackups(backups []planetscalev2.VitessBackup) []*planetscalev2.VitessBackup {
completeBackups := []*planetscalev2.VitessBackup{}
for i := range backups {
if backups[i].Status.Complete {
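A minimal usage sketch of the shared helper, assuming a reconciler type with a controller-runtime client; the exampleReconciler type and the listShardBackups name are hypothetical, not code from this patch:

    // Assumed imports: context, planetscalev2 (apis/planetscale/v2),
    // vitessbackup (operator/vitessbackup), and sigs.k8s.io/controller-runtime/pkg/client.
    type exampleReconciler struct {
        client client.Client
    }

    // listShardBackups returns only the complete backups for one shard by
    // delegating the List call to the reconciler's own client.
    func (r *exampleReconciler) listShardBackups(ctx context.Context, ns, cluster, keyspace, shardSafeName string) ([]*planetscalev2.VitessBackup, error) {
        _, complete, err := vitessbackup.GetBackups(ctx, ns, cluster, keyspace, shardSafeName,
            func(ctx context.Context, list *planetscalev2.VitessBackupList, opts *client.ListOptions) error {
                return r.client.List(ctx, list, opts)
            },
        )
        return complete, err
    }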
From 9f75193e2c2c18dfd38a18fbcfa7d318aa1aaf77 Mon Sep 17 00:00:00 2001
From: Florent Poinsard
Date: Tue, 28 Jan 2025 17:10:28 -0600
Subject: [PATCH 02/13] Use Forbid as the default concurrency setting in vbs
Signed-off-by: Florent Poinsard
---
deploy/crds/planetscale.com_vitessbackupschedules.yaml | 1 +
deploy/crds/planetscale.com_vitessclusters.yaml | 1 +
docs/api.md | 4 ++--
docs/api/index.html | 4 ++--
pkg/apis/planetscale/v2/vitessbackupschedule_types.go | 5 +++--
5 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/deploy/crds/planetscale.com_vitessbackupschedules.yaml b/deploy/crds/planetscale.com_vitessbackupschedules.yaml
index 2bb4c666..dda63530 100644
--- a/deploy/crds/planetscale.com_vitessbackupschedules.yaml
+++ b/deploy/crds/planetscale.com_vitessbackupschedules.yaml
@@ -38,6 +38,7 @@ spec:
cluster:
type: string
concurrencyPolicy:
+ default: Forbid
enum:
- Allow
- Forbid
diff --git a/deploy/crds/planetscale.com_vitessclusters.yaml b/deploy/crds/planetscale.com_vitessclusters.yaml
index cefa2884..43cd6d0c 100644
--- a/deploy/crds/planetscale.com_vitessclusters.yaml
+++ b/deploy/crds/planetscale.com_vitessclusters.yaml
@@ -168,6 +168,7 @@ spec:
type: string
type: object
concurrencyPolicy:
+ default: Forbid
enum:
- Allow
- Forbid
diff --git a/docs/api.md b/docs/api.md
index acb8ad03..8e9b6d2f 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -2828,8 +2828,8 @@ ConcurrencyPolicy
(Optional)
ConcurrencyPolicy specifies ho to treat concurrent executions of a Job.
Valid values are:
-- “Allow” (default): allows CronJobs to run concurrently;
-- “Forbid”: forbids concurrent runs, skipping next run if previous run hasn’t finished yet;
+- “Allow”: allows CronJobs to run concurrently;
+- “Forbid” (default): forbids concurrent runs, skipping next run if previous run hasn’t finished yet;
- “Replace”: cancels currently running job and replaces it with a new one.
diff --git a/docs/api/index.html b/docs/api/index.html
index f9974877..b5b32f91 100644
--- a/docs/api/index.html
+++ b/docs/api/index.html
@@ -2830,8 +2830,8 @@ VitessBackupScheduleTem
(Optional)
ConcurrencyPolicy specifies ho to treat concurrent executions of a Job.
Valid values are:
-- “Allow” (default): allows CronJobs to run concurrently;
-- “Forbid”: forbids concurrent runs, skipping next run if previous run hasn’t finished yet;
+- “Allow”: allows CronJobs to run concurrently;
+- “Forbid” (default): forbids concurrent runs, skipping next run if previous run hasn’t finished yet;
- “Replace”: cancels currently running job and replaces it with a new one.
diff --git a/pkg/apis/planetscale/v2/vitessbackupschedule_types.go b/pkg/apis/planetscale/v2/vitessbackupschedule_types.go
index ff29654d..dbc5f3f9 100644
--- a/pkg/apis/planetscale/v2/vitessbackupschedule_types.go
+++ b/pkg/apis/planetscale/v2/vitessbackupschedule_types.go
@@ -136,11 +136,12 @@ type VitessBackupScheduleTemplate struct {
// ConcurrencyPolicy specifies ho to treat concurrent executions of a Job.
// Valid values are:
- // - "Allow" (default): allows CronJobs to run concurrently;
- // - "Forbid": forbids concurrent runs, skipping next run if previous run hasn't finished yet;
+ // - "Allow": allows CronJobs to run concurrently;
+ // - "Forbid" (default): forbids concurrent runs, skipping next run if previous run hasn't finished yet;
// - "Replace": cancels currently running job and replaces it with a new one.
// +optional
// +kubebuilder:example="Forbid"
+ // +kubebuilder:default="Forbid"
ConcurrencyPolicy ConcurrencyPolicy `json:"concurrencyPolicy,omitempty"`
// AllowedMissedRuns defines how many missed run of the schedule will be allowed before giving up on finding the last job.
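For context on what this default change governs, the ConcurrencyPolicy check it feeds appears later in this series in the VitessBackupSchedule controller; shown here only as an illustrative excerpt:

    // With concurrencyPolicy now defaulting to Forbid, a schedule that omits
    // the field skips a run while a previous backup Job is still active.
    if vbsc.Spec.ConcurrencyPolicy == planetscalev2.ForbidConcurrent && len(jobs.active) > 0 {
        log.Infof("concurrency policy blocks concurrent runs: skipping, number of active jobs: %d", len(jobs.active))
        return scheduledResult, nil
    }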
From 0e982c72e020565be684719af544b5af5ad21816 Mon Sep 17 00:00:00 2001
From: Florent Poinsard
Date: Tue, 28 Jan 2025 17:14:05 -0600
Subject: [PATCH 03/13] Fix tests
Signed-off-by: Florent Poinsard
---
test/endtoend/operator/101_initial_cluster_backup_schedule.yaml | 2 +-
test/endtoend/operator/operator-latest.yaml | 2 ++
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml b/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml
index b31c3032..7fe852cf 100644
--- a/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml
+++ b/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml
@@ -52,7 +52,7 @@ spec:
vtorc: vitess/lite:latest
vtbackup: vitess/lite:latest
mysqld:
- mysql80Compatible: mysql:8.0.40
+ mysql80Compatible: vitess/lite:latest
mysqldExporter: prom/mysqld-exporter:v0.14.0
cells:
- name: zone1
diff --git a/test/endtoend/operator/operator-latest.yaml b/test/endtoend/operator/operator-latest.yaml
index a7ab8a87..f4488761 100644
--- a/test/endtoend/operator/operator-latest.yaml
+++ b/test/endtoend/operator/operator-latest.yaml
@@ -416,6 +416,7 @@ spec:
cluster:
type: string
concurrencyPolicy:
+ default: Forbid
enum:
- Allow
- Forbid
@@ -2028,6 +2029,7 @@ spec:
type: string
type: object
concurrencyPolicy:
+ default: Forbid
enum:
- Allow
- Forbid
From 97bed04ae95b5b5b657f49445604a42dca921e07 Mon Sep 17 00:00:00 2001
From: Florent Poinsard
Date: Tue, 28 Jan 2025 17:19:44 -0600
Subject: [PATCH 04/13] Run vtbackup pods instead of vtctldclient
Signed-off-by: Florent Poinsard
---
.../vitessbackupschedule_controller.go | 309 +++++++++++-------
.../vitessshard/reconcile_backup_job.go | 8 +-
pkg/operator/vttablet/flags.go | 1 +
pkg/operator/vttablet/vtbackup_pod.go | 2 +
4 files changed, 200 insertions(+), 120 deletions(-)
diff --git a/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go b/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go
index c105fe9d..b6595f73 100644
--- a/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go
+++ b/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go
@@ -34,9 +34,12 @@ import (
apierrors "k8s.io/apimachinery/pkg/api/errors"
apilabels "k8s.io/apimachinery/pkg/labels"
"k8s.io/client-go/tools/record"
+ "planetscale.dev/vitess-operator/pkg/controller/vitessshard"
"planetscale.dev/vitess-operator/pkg/operator/metrics"
"planetscale.dev/vitess-operator/pkg/operator/reconciler"
"planetscale.dev/vitess-operator/pkg/operator/results"
+ "planetscale.dev/vitess-operator/pkg/operator/vitessbackup"
+ "planetscale.dev/vitess-operator/pkg/operator/vttablet"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/manager"
@@ -179,83 +182,100 @@ func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl
reconcileCount.WithLabelValues(vbsc.Name, metrics.Result(err)).Inc()
}()
- jobs, mostRecentTime, err := r.getJobsList(ctx, req, vbsc.Name)
- if err != nil {
- // We had an error reading the jobs, we can requeue.
- return resultBuilder.Error(err)
- }
+ var scheduledResult ctrl.Result
+ var activeJobs []*kbatch.Job
+ for _, strategy := range vbsc.Spec.Strategy {
+ start, end, ok := strings.Cut(strategy.Shard, "-")
+ if !ok {
+ return resultBuilder.Error(fmt.Errorf("invalid strategy shard: %s", strategy.Shard))
+ }
+ vkr := planetscalev2.VitessKeyRange{
+ Start: start,
+ End: end,
+ }
+ jobName := vbsc.Name + "-" + strategy.Keyspace + "-" + vkr.SafeName()
- err = r.updateVitessBackupScheduleStatus(ctx, mostRecentTime, vbsc, jobs.active)
- if err != nil {
- // We had an error updating the status, we can requeue.
- return resultBuilder.Error(err)
- }
+ jobs, mostRecentTime, err := r.getJobsList(ctx, req, vbsc.Name, jobName)
+ if err != nil {
+ // We had an error reading the jobs, we can requeue.
+ return resultBuilder.Error(err)
+ }
- // We must clean up old jobs to not overcrowd the number of Pods and Jobs in the cluster.
- // This will be done according to both failedJobsHistoryLimit and successfulJobsHistoryLimit fields.
- r.cleanupJobsWithLimit(ctx, jobs.failed, vbsc.GetFailedJobsLimit())
- r.cleanupJobsWithLimit(ctx, jobs.successful, vbsc.GetSuccessfulJobsLimit())
+ activeJobs = append(activeJobs, jobs.active...)
+ err = r.updateVitessBackupScheduleStatus(ctx, mostRecentTime, vbsc, activeJobs)
+ if err != nil {
+ // We had an error updating the status, we can requeue.
+ return resultBuilder.Error(err)
+ }
- err = r.removeTimeoutJobs(ctx, jobs.active, vbsc.Name, vbsc.Spec.JobTimeoutMinutes)
- if err != nil {
- // We had an error while removing timed out jobs, we can requeue
- return resultBuilder.Error(err)
- }
+ // We must clean up old jobs to not overcrowd the number of Pods and Jobs in the cluster.
+ // This will be done according to both failedJobsHistoryLimit and successfulJobsHistoryLimit fields.
+ r.cleanupJobsWithLimit(ctx, jobs.failed, vbsc.GetFailedJobsLimit())
+ r.cleanupJobsWithLimit(ctx, jobs.successful, vbsc.GetSuccessfulJobsLimit())
- // If the Suspend setting is set to true, we can skip adding any job, our work is done here.
- if vbsc.Spec.Suspend != nil && *vbsc.Spec.Suspend {
- log.Info("VitessBackupSchedule suspended, skipping")
- return ctrl.Result{}, nil
- }
+ err = r.removeTimeoutJobs(ctx, jobs.active, vbsc.Name, vbsc.Spec.JobTimeoutMinutes)
+ if err != nil {
+ // We had an error while removing timed out jobs, we can requeue
+ return resultBuilder.Error(err)
+ }
- missedRun, nextRun, err := getNextSchedule(vbsc, time.Now())
- if err != nil {
- log.Error(err, "unable to figure out VitessBackupSchedule schedule")
- // Re-queuing here does not make sense as we have an error with the schedule and the user needs to fix it first.
- return ctrl.Result{}, nil
- }
+ // If the Suspend setting is set to true, we can skip adding any job, our work is done here.
+ if vbsc.Spec.Suspend != nil && *vbsc.Spec.Suspend {
+ log.Info("VitessBackupSchedule suspended, skipping")
+ return ctrl.Result{}, nil
+ }
- // Ask kubernetes to re-queue for the next scheduled job, and skip if we don't miss any run.
- scheduledResult := ctrl.Result{RequeueAfter: nextRun.Sub(time.Now())}
- if missedRun.IsZero() {
- return scheduledResult, nil
- }
+ missedRun, nextRun, err := getNextSchedule(vbsc, time.Now())
+ if err != nil {
+ log.Error(err, "unable to figure out VitessBackupSchedule schedule")
+ // Re-queuing here does not make sense as we have an error with the schedule and the user needs to fix it first.
+ return ctrl.Result{}, nil
+ }
- // Check whether we are too late to create this Job or not. The startingDeadlineSeconds field will help us
- // schedule Jobs that are late.
- tooLate := false
- if vbsc.Spec.StartingDeadlineSeconds != nil {
- tooLate = missedRun.Add(time.Duration(*vbsc.Spec.StartingDeadlineSeconds) * time.Second).Before(time.Now())
- }
- if tooLate {
- log.Infof("missed starting deadline for latest run; skipping; next run is scheduled for: %s", nextRun.Format(time.RFC3339))
- return scheduledResult, nil
- }
+ // Ask kubernetes to re-queue for the next scheduled job, and skip if we don't miss any run.
+ scheduledResult = ctrl.Result{RequeueAfter: nextRun.Sub(time.Now())}
+ if missedRun.IsZero() {
+ return scheduledResult, nil
+ }
- // Check concurrency policy and skip this job if we have ForbidConcurrent set plus an active job
- if vbsc.Spec.ConcurrencyPolicy == planetscalev2.ForbidConcurrent && len(jobs.active) > 0 {
- log.Infof("concurrency policy blocks concurrent runs: skipping, number of active jobs: %d", len(jobs.active))
- return scheduledResult, nil
- }
+ // Check whether we are too late to create this Job or not. The startingDeadlineSeconds field will help us
+ // schedule Jobs that are late.
+ tooLate := false
+ if vbsc.Spec.StartingDeadlineSeconds != nil {
+ tooLate = missedRun.Add(time.Duration(*vbsc.Spec.StartingDeadlineSeconds) * time.Second).Before(time.Now())
+ }
+ if tooLate {
+ log.Infof("missed starting deadline for latest run; skipping; next run is scheduled for: %s", nextRun.Format(time.RFC3339))
+ return scheduledResult, nil
+ }
- // Now that the different policies are checked, we can create and apply our new job.
- job, err := r.createJob(ctx, &vbsc, missedRun)
- if err != nil {
- // Re-queuing here does not make sense as we have an error with the template and the user needs to fix it first.
- log.WithError(err).Error("unable to construct job from template")
- return ctrl.Result{}, err
- }
- if err = r.client.Create(ctx, job); err != nil {
- // if the job already exists it means another reconciling loop created the job since we last fetched
- // the list of jobs to create, we can safely return without failing.
- if apierrors.IsAlreadyExists(err) {
- return ctrl.Result{}, nil
+ // Check concurrency policy and skip this job if we have ForbidConcurrent set plus an active job
+ if vbsc.Spec.ConcurrencyPolicy == planetscalev2.ForbidConcurrent && len(jobs.active) > 0 {
+ log.Infof("concurrency policy blocks concurrent runs: skipping, number of active jobs: %d", len(jobs.active))
+ return scheduledResult, nil
+ }
+
+ // Now that the different policies are checked, we can create and apply our new job.
+ job, err := r.createJob(ctx, jobName, &vbsc, strategy, missedRun, vkr)
+ if err != nil {
+ // Re-queuing here does not make sense as we have an error with the template and the user needs to fix it first.
+ log.WithError(err).Error("unable to construct job from template")
+ return ctrl.Result{}, err
+ }
+
+ if err = r.client.Create(ctx, job); err != nil {
+ // if the job already exists it means another reconciling loop created the job since we last fetched
+ // the list of jobs to create, we can safely return without failing.
+ if apierrors.IsAlreadyExists(err) {
+ log.Infof("job %s already exists, will retry in %s", job.Name, scheduledResult.RequeueAfter.String())
+ return scheduledResult, nil
+ }
+ // Simply re-queue here
+ return resultBuilder.Error(err)
}
- // Simply re-queue here
- return resultBuilder.Error(err)
- }
- log.Infof("created new job: %s, next job scheduled in %s", job.Name, scheduledResult.RequeueAfter.String())
+ log.Infof("created new job: %s, next job scheduled in %s", job.Name, scheduledResult.RequeueAfter.String())
+ }
return scheduledResult, nil
}
@@ -330,7 +350,7 @@ func (r *ReconcileVitessBackupsSchedule) updateVitessBackupScheduleStatus(ctx co
// getJobsList fetches all existing Jobs in the cluster and return them by categories: active, failed or successful.
// It also returns at what time was the last job created, which is needed to update VitessBackupSchedule's status,
// and plan future jobs.
-func (r *ReconcileVitessBackupsSchedule) getJobsList(ctx context.Context, req ctrl.Request, vbscName string) (jobsList, *time.Time, error) {
+func (r *ReconcileVitessBackupsSchedule) getJobsList(ctx context.Context, req ctrl.Request, vbscName string, jobName string) (jobsList, *time.Time, error) {
var existingJobs kbatch.JobList
err := r.client.List(ctx, &existingJobs, client.InNamespace(req.Namespace), client.MatchingLabels{planetscalev2.BackupScheduleLabel: vbscName})
@@ -344,6 +364,10 @@ func (r *ReconcileVitessBackupsSchedule) getJobsList(ctx context.Context, req ct
var mostRecentTime *time.Time
for i, job := range existingJobs.Items {
+ if !strings.HasPrefix(job.Name, jobName) {
+ continue
+ }
+
_, jobType := isJobFinished(&job)
switch jobType {
case kbatch.JobFailed, kbatch.JobFailureTarget:
@@ -386,11 +410,27 @@ func (r *ReconcileVitessBackupsSchedule) cleanupJobsWithLimit(ctx context.Contex
if int32(i) >= int32(len(jobs))-limit {
break
}
+ // delete the job
if err := r.client.Delete(ctx, job, client.PropagationPolicy(metav1.DeletePropagationBackground)); (err) != nil {
log.WithError(err).Errorf("unable to delete old job: %s", job.Name)
} else {
log.Infof("deleted old job: %s", job.Name)
}
+
+ // delete the vtbackup pod's PVC
+ pvc := &corev1.PersistentVolumeClaim{}
+ err := r.client.Get(ctx, client.ObjectKey{Namespace: job.Namespace, Name: job.Name}, pvc)
+ if err != nil {
+ if !apierrors.IsNotFound(err) {
+ log.WithError(err).Errorf("unable to get PVC for job: %s", job.Name)
+ }
+ continue
+ }
+ if err := r.client.Delete(ctx, pvc, client.PropagationPolicy(metav1.DeletePropagationBackground)); err != nil {
+ log.WithError(err).Errorf("unable to delete old PVC for job: %s", job.Name)
+ } else {
+ log.Infof("deleted old PVC for job: %s", job.Name)
+ }
}
}
@@ -410,6 +450,20 @@ func (r *ReconcileVitessBackupsSchedule) removeTimeoutJobs(ctx context.Context,
log.Infof("deleted timed out job: %s", job.Name)
}
timeoutJobsCount.WithLabelValues(vbscName, metrics.Result(err)).Inc()
+
+ pvc := &corev1.PersistentVolumeClaim{}
+ err := r.client.Get(ctx, client.ObjectKey{Namespace: job.Namespace, Name: job.Name}, pvc)
+ if err != nil {
+ if !apierrors.IsNotFound(err) {
+ log.WithError(err).Errorf("unable to get PVC for timed out job: %s", job.Name)
+ }
+ continue
+ }
+ if err := r.client.Delete(ctx, pvc, client.PropagationPolicy(metav1.DeletePropagationBackground)); err != nil {
+ log.WithError(err).Errorf("unable to delete old PVC for timed out job: %s", job.Name)
+ } else {
+ log.Infof("deleted old PVC for timed out job: %s", job.Name)
+ }
}
}
return nil
@@ -439,9 +493,15 @@ func getScheduledTimeForJob(job *kbatch.Job) (*time.Time, error) {
return &timeParsed, nil
}
-func (r *ReconcileVitessBackupsSchedule) createJob(ctx context.Context, vbsc *planetscalev2.VitessBackupSchedule, scheduledTime time.Time) (*kbatch.Job, error) {
- name := fmt.Sprintf("%s-%d", vbsc.Name, scheduledTime.Unix())
-
+func (r *ReconcileVitessBackupsSchedule) createJob(
+ ctx context.Context,
+ jobName string,
+ vbsc *planetscalev2.VitessBackupSchedule,
+ strategy planetscalev2.VitessBackupScheduleStrategy,
+ scheduledTime time.Time,
+ vkr planetscalev2.VitessKeyRange,
+) (*kbatch.Job, error) {
+ name := fmt.Sprintf("%s-%d", jobName, scheduledTime.Unix())
meta := metav1.ObjectMeta{
Labels: map[string]string{
planetscalev2.BackupScheduleLabel: vbsc.Name,
@@ -457,7 +517,7 @@ func (r *ReconcileVitessBackupsSchedule) createJob(ctx context.Context, vbsc *pl
maps.Copy(meta.Labels, vbsc.Labels)
- pod, err := r.createJobPod(ctx, vbsc, name)
+ pod, vtbackupSpec, err := r.createJobPod(ctx, vbsc, strategy, name, vkr)
if err != nil {
return nil, err
}
@@ -466,7 +526,7 @@ func (r *ReconcileVitessBackupsSchedule) createJob(ctx context.Context, vbsc *pl
Spec: kbatch.JobSpec{
Template: corev1.PodTemplateSpec{
ObjectMeta: meta,
- Spec: pod,
+ Spec: pod.Spec,
},
},
}
@@ -475,55 +535,71 @@ func (r *ReconcileVitessBackupsSchedule) createJob(ctx context.Context, vbsc *pl
return nil, err
}
- return job, nil
-}
-
-func (r *ReconcileVitessBackupsSchedule) createJobPod(ctx context.Context, vbsc *planetscalev2.VitessBackupSchedule, name string) (pod corev1.PodSpec, err error) {
- getVtctldServiceName := func(cluster string) (string, error) {
- vtctldServiceName, vtctldServicePort, err := r.getVtctldServiceName(ctx, vbsc, cluster)
+ // Create the corresponding PVC for the new vtbackup pod
+ pvc := &corev1.PersistentVolumeClaim{}
+ key := client.ObjectKey{
+ Namespace: job.Namespace,
+ Name: name,
+ }
+ err = r.client.Get(ctx, key, pvc)
+ if err != nil {
+ if !apierrors.IsNotFound(err) {
+ return nil, err
+ }
+ newPVC := vttablet.NewPVC(key, vtbackupSpec.TabletSpec)
+ if err := ctrl.SetControllerReference(vbsc, newPVC, r.scheme); err != nil {
+ return nil, err
+ }
+ err = r.client.Create(ctx, newPVC)
if err != nil {
- return "", err
+ return nil, err
}
- return fmt.Sprintf("--server=%s:%d", vtctldServiceName, vtctldServicePort), nil
}
- // It is fine to not have any default in the event there is no strategy as the CRD validation
- // ensures that there will be at least one item in this list. The YAML cannot be applied with
- // empty list of strategies.
- var cmd strings.Builder
+ return job, nil
+}
- addNewCmd := func(i int) {
- if i > 0 {
- cmd.WriteString(" && ")
- }
+func (r *ReconcileVitessBackupsSchedule) createJobPod(
+ ctx context.Context,
+ vbsc *planetscalev2.VitessBackupSchedule,
+ strategy planetscalev2.VitessBackupScheduleStrategy,
+ name string,
+ vkr planetscalev2.VitessKeyRange,
+) (pod *corev1.Pod, spec *vttablet.BackupSpec, err error) {
+ vts, err := r.getShardFromKeyspace(ctx, vbsc.Namespace, vbsc.Spec.Cluster, strategy.Keyspace, strategy.Shard)
+ if err != nil {
+ return nil, nil, err
}
- for i, strategy := range vbsc.Spec.Strategy {
- vtctldclientServerArg, err := getVtctldServiceName(vbsc.Spec.Cluster)
- if err != nil {
- return corev1.PodSpec{}, err
- }
-
- addNewCmd(i)
- switch strategy.Name {
- case planetscalev2.BackupShard:
- createVtctldClientCommand(&cmd, vtctldclientServerArg, strategy.ExtraFlags, strategy.Keyspace, strategy.Shard)
- }
-
+ _, completeBackups, err := vitessbackup.GetBackups(ctx, vbsc.Namespace, vbsc.Spec.Cluster, strategy.Keyspace, vkr.SafeName(),
+ func(ctx context.Context, allBackupsList *planetscalev2.VitessBackupList, listOpts *client.ListOptions) error {
+ return r.client.List(ctx, allBackupsList, listOpts)
+ },
+ )
+ if err != nil {
+ return nil, nil, err
+ }
+ backupType := vitessbackup.TypeUpdate
+ if len(completeBackups) == 0 {
+ backupType = vitessbackup.TypeInit
}
- pod = corev1.PodSpec{
- Containers: []corev1.Container{{
- Name: name,
- Image: vbsc.Spec.Image,
- ImagePullPolicy: vbsc.Spec.ImagePullPolicy,
- Resources: vbsc.Spec.Resources,
- Args: []string{"/bin/sh", "-c", cmd.String()},
- }},
- RestartPolicy: corev1.RestartPolicyOnFailure,
- Affinity: vbsc.Spec.Affinity,
+ podKey := client.ObjectKey{
+ Namespace: vbsc.Namespace,
+ Name: name,
+ }
+ labels := map[string]string{
+ planetscalev2.BackupScheduleLabel: vbsc.Name,
+ planetscalev2.ClusterLabel: vbsc.Spec.Cluster,
+ planetscalev2.KeyspaceLabel: strategy.Keyspace,
+ planetscalev2.ShardLabel: vkr.SafeName(),
}
- return pod, nil
+ vtbackupSpec := vitessshard.MakeVtbackupSpec(podKey, &vts, labels, backupType)
+ p := vttablet.NewBackupPod(podKey, vtbackupSpec, vts.Spec.Images.Mysqld.Image())
+
+ p.Spec.RestartPolicy = corev1.RestartPolicyOnFailure
+ p.Spec.Affinity = vbsc.Spec.Affinity
+ return p, vtbackupSpec, nil
}
func createVtctldClientCommand(cmd *strings.Builder, serverAddr string, extraFlags map[string]string, keyspace, shard string) {
@@ -568,7 +644,7 @@ func (r *ReconcileVitessBackupsSchedule) getVtctldServiceName(ctx context.Contex
return svcName, svcPort, nil
}
-func (r *ReconcileVitessBackupsSchedule) getAllShardsInKeyspace(ctx context.Context, namespace, cluster, keyspace string) ([]string, error) {
+func (r *ReconcileVitessBackupsSchedule) getShardFromKeyspace(ctx context.Context, namespace, cluster, keyspace, shard string) (planetscalev2.VitessShard, error) {
shardsList := &planetscalev2.VitessShardList{}
listOpts := &client.ListOptions{
Namespace: namespace,
@@ -578,13 +654,14 @@ func (r *ReconcileVitessBackupsSchedule) getAllShardsInKeyspace(ctx context.Cont
}.AsSelector(),
}
if err := r.client.List(ctx, shardsList, listOpts); err != nil {
- return nil, fmt.Errorf("unable to list shards of keyspace %s in %s: %v", keyspace, namespace, err)
+ return planetscalev2.VitessShard{}, fmt.Errorf("unable to list shards of keyspace %s in %s: %v", keyspace, namespace, err)
}
- var result []string
for _, item := range shardsList.Items {
- result = append(result, item.Spec.Name)
+ if item.Spec.KeyRange.String() == shard {
+ return item, nil
+ }
}
- return result, nil
+ return planetscalev2.VitessShard{}, fmt.Errorf("unable to find shard %s in keyspace %s in %s", shard, keyspace, namespace)
}
type keyspace struct {
diff --git a/pkg/controller/vitessshard/reconcile_backup_job.go b/pkg/controller/vitessshard/reconcile_backup_job.go
index 623a066f..df4fcd65 100644
--- a/pkg/controller/vitessshard/reconcile_backup_job.go
+++ b/pkg/controller/vitessshard/reconcile_backup_job.go
@@ -84,7 +84,7 @@ func (r *ReconcileVitessShard) reconcileBackupJob(ctx context.Context, vts *plan
// scratch (not from any tablet). If we're wrong and a backup exists
// already, the idempotent vtbackup "initial backup" mode will just do
// nothing and return success.
- initSpec := vtbackupInitSpec(initPodKey, vts, labels)
+ initSpec := MakeVtbackupSpec(initPodKey, vts, labels, vitessbackup.TypeInit)
if initSpec != nil {
podKeys = append(podKeys, initPodKey)
if initSpec.TabletSpec.DataVolumePVCSpec != nil {
@@ -168,7 +168,7 @@ func (r *ReconcileVitessShard) reconcileBackupJob(ctx context.Context, vts *plan
return resultBuilder.Result()
}
-func vtbackupInitSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, parentLabels map[string]string) *vttablet.BackupSpec {
+func MakeVtbackupSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, parentLabels map[string]string, typ string) *vttablet.BackupSpec {
// If we specifically set our cluster to avoid initial backups, bail early.
if !*vts.Spec.Replication.InitializeBackup {
return nil
@@ -183,7 +183,7 @@ func vtbackupInitSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, pare
// Make a vtbackup spec that's a similar shape to the first tablet pool.
// This should give it enough resources to run mysqld and restore a backup,
// since all tablets need to be able to do that, regardless of type.
- return vtbackupSpec(key, vts, parentLabels, &vts.Spec.TabletPools[0], vitessbackup.TypeInit)
+ return vtbackupSpec(key, vts, parentLabels, &vts.Spec.TabletPools[0], typ)
}
func vtbackupSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, parentLabels map[string]string, pool *planetscalev2.VitessShardTabletPool, backupType string) *vttablet.BackupSpec {
@@ -240,7 +240,7 @@ func vtbackupSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, parentLa
}
return &vttablet.BackupSpec{
- InitialBackup: backupType == vitessbackup.TypeInit,
+ AllowFirstBackup: backupType == vitessbackup.TypeInit,
MinBackupInterval: minBackupInterval,
MinRetentionTime: minRetentionTime,
MinRetentionCount: minRetentionCount,
diff --git a/pkg/operator/vttablet/flags.go b/pkg/operator/vttablet/flags.go
index 1ec42d0c..6bc34847 100644
--- a/pkg/operator/vttablet/flags.go
+++ b/pkg/operator/vttablet/flags.go
@@ -104,6 +104,7 @@ func init() {
// vtbackup-specific flags.
"concurrency": vtbackupConcurrency,
"initial_backup": backupSpec.InitialBackup,
+ "allow_first_backup": backupSpec.AllowFirstBackup,
"min_backup_interval": backupSpec.MinBackupInterval,
"min_retention_time": backupSpec.MinRetentionTime,
"min_retention_count": backupSpec.MinRetentionCount,
diff --git a/pkg/operator/vttablet/vtbackup_pod.go b/pkg/operator/vttablet/vtbackup_pod.go
index 813257f9..68b38567 100644
--- a/pkg/operator/vttablet/vtbackup_pod.go
+++ b/pkg/operator/vttablet/vtbackup_pod.go
@@ -61,6 +61,8 @@ type BackupSpec struct {
// aren't any. Instead, bootstrap the shard with a backup of an empty
// database.
InitialBackup bool
+ // AllowFirstBackup enables vtbackup to take a backup even if none exist.
+ AllowFirstBackup bool
// MinBackupInterval is the minimum spacing between backups.
// A new backup will only be taken if it's been at least this long since the
// most recent backup.
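In condensed form, the new per-strategy flow introduced by this patch looks roughly like the sketch below; listFn, podKey, and labels stand in for values built inside createJob/createJobPod, and error handling is elided:

    // 1. Resolve the VitessShard that matches the strategy's keyspace/shard.
    vts, _ := r.getShardFromKeyspace(ctx, vbsc.Namespace, vbsc.Spec.Cluster, strategy.Keyspace, strategy.Shard)
    // 2. List existing backups to decide whether this is an init or update backup.
    _, completeBackups, _ := vitessbackup.GetBackups(ctx, vbsc.Namespace, vbsc.Spec.Cluster, strategy.Keyspace, vkr.SafeName(), listFn)
    backupType := vitessbackup.TypeUpdate
    if len(completeBackups) == 0 {
        backupType = vitessbackup.TypeInit
    }
    // 3. Build a vtbackup spec and Pod, then wrap the Pod spec in a Job and
    //    create a matching PVC named after the Job.
    vtbackupSpec := vitessshard.MakeVtbackupSpec(podKey, &vts, labels, backupType)
    pod := vttablet.NewBackupPod(podKey, vtbackupSpec, vts.Spec.Images.Mysqld.Image())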
From 460e94da8001b31687dc6951b6f43d00af31bf2b Mon Sep 17 00:00:00 2001
From: Florent Poinsard
Date: Wed, 29 Jan 2025 10:40:42 -0600
Subject: [PATCH 05/13] Edit tests
Signed-off-by: Florent Poinsard
---
test/endtoend/backup_schedule_test.sh | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/test/endtoend/backup_schedule_test.sh b/test/endtoend/backup_schedule_test.sh
index 55c37a50..2e6a1d36 100755
--- a/test/endtoend/backup_schedule_test.sh
+++ b/test/endtoend/backup_schedule_test.sh
@@ -31,14 +31,14 @@ function verifyListBackupsOutputWithSchedule() {
checkVitessBackupScheduleStatusWithTimeout "example-vbsc-every-five-minute(.*)"
echo -e "Check for number of backups in the cluster"
- # Sleep for 6 minutes, during this time we should have at the very minimum 7 backups.
- # At least: 6 backups from the every-minute schedule, and 1 backup from the every-five-minute schedule.
+ # Sleep for 6 minutes. After that we should have at least 3 backups: 1 from the initial vtbackup pod,
+ # at least 1 from the every-minute schedule, and 1 from the every-five-minute schedule.
sleep 360
backupCount=$(kubectl get vtb --no-headers | wc -l)
- if [[ "${backupCount}" -lt 7 ]]; then
- echo "Did not find at least 7 backups"
- return 0
+ if [[ "${backupCount}" -lt 3 ]]; then
+ echo "Did not find at least 3 backups"
+ exit 1
fi
echo -e "Check for Jobs' pods"
From e9e15940d5b94180ea48b721f88ccdd49a91854d Mon Sep 17 00:00:00 2001
From: Florent Poinsard
Date: Thu, 30 Jan 2025 11:39:10 -0600
Subject: [PATCH 06/13] Fix service name length and initial backup issues
Signed-off-by: Florent Poinsard
---
.../vitessbackupschedule_controller.go | 79 +++++++++++--------
.../vitessshard/reconcile_backup_job.go | 9 ++-
pkg/operator/vitessbackup/labels.go | 2 +
3 files changed, 56 insertions(+), 34 deletions(-)
diff --git a/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go b/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go
index b6595f73..5d2ddfb8 100644
--- a/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go
+++ b/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go
@@ -22,6 +22,7 @@ import (
"fmt"
"maps"
"math/rand/v2"
+ "strconv"
"strings"
"sort"
@@ -36,6 +37,7 @@ import (
"k8s.io/client-go/tools/record"
"planetscale.dev/vitess-operator/pkg/controller/vitessshard"
"planetscale.dev/vitess-operator/pkg/operator/metrics"
+ "planetscale.dev/vitess-operator/pkg/operator/names"
"planetscale.dev/vitess-operator/pkg/operator/reconciler"
"planetscale.dev/vitess-operator/pkg/operator/results"
"planetscale.dev/vitess-operator/pkg/operator/vitessbackup"
@@ -193,9 +195,7 @@ func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl
Start: start,
End: end,
}
- jobName := vbsc.Name + "-" + strategy.Keyspace + "-" + vkr.SafeName()
-
- jobs, mostRecentTime, err := r.getJobsList(ctx, req, vbsc.Name, jobName)
+ jobs, mostRecentTime, err := r.getJobsList(ctx, req, vbsc, strategy.Keyspace, vkr.SafeName())
if err != nil {
// We had an error reading the jobs, we can requeue.
return resultBuilder.Error(err)
@@ -235,7 +235,7 @@ func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl
// Ask kubernetes to re-queue for the next scheduled job, and skip if we don't miss any run.
scheduledResult = ctrl.Result{RequeueAfter: nextRun.Sub(time.Now())}
if missedRun.IsZero() {
- return scheduledResult, nil
+ continue
}
// Check whether we are too late to create this Job or not. The startingDeadlineSeconds field will help us
@@ -246,17 +246,17 @@ func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl
}
if tooLate {
log.Infof("missed starting deadline for latest run; skipping; next run is scheduled for: %s", nextRun.Format(time.RFC3339))
- return scheduledResult, nil
+ continue
}
// Check concurrency policy and skip this job if we have ForbidConcurrent set plus an active job
if vbsc.Spec.ConcurrencyPolicy == planetscalev2.ForbidConcurrent && len(jobs.active) > 0 {
log.Infof("concurrency policy blocks concurrent runs: skipping, number of active jobs: %d", len(jobs.active))
- return scheduledResult, nil
+ continue
}
// Now that the different policies are checked, we can create and apply our new job.
- job, err := r.createJob(ctx, jobName, &vbsc, strategy, missedRun, vkr)
+ job, err := r.createJob(ctx, vbsc, strategy, missedRun, vkr)
if err != nil {
// Re-queuing here does not make sense as we have an error with the template and the user needs to fix it first.
log.WithError(err).Error("unable to construct job from template")
@@ -268,7 +268,7 @@ func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl
// the list of jobs to create, we can safely return without failing.
if apierrors.IsAlreadyExists(err) {
log.Infof("job %s already exists, will retry in %s", job.Name, scheduledResult.RequeueAfter.String())
- return scheduledResult, nil
+ continue
}
// Simply re-queue here
return resultBuilder.Error(err)
@@ -350,10 +350,21 @@ func (r *ReconcileVitessBackupsSchedule) updateVitessBackupScheduleStatus(ctx co
// getJobsList fetches all existing Jobs in the cluster and return them by categories: active, failed or successful.
// It also returns at what time was the last job created, which is needed to update VitessBackupSchedule's status,
// and plan future jobs.
-func (r *ReconcileVitessBackupsSchedule) getJobsList(ctx context.Context, req ctrl.Request, vbscName string, jobName string) (jobsList, *time.Time, error) {
+func (r *ReconcileVitessBackupsSchedule) getJobsList(
+ ctx context.Context,
+ req ctrl.Request,
+ vbsc planetscalev2.VitessBackupSchedule,
+ keyspace string,
+ shardSafeName string,
+) (jobsList, *time.Time, error) {
var existingJobs kbatch.JobList
- err := r.client.List(ctx, &existingJobs, client.InNamespace(req.Namespace), client.MatchingLabels{planetscalev2.BackupScheduleLabel: vbscName})
+ err := r.client.List(ctx, &existingJobs, client.InNamespace(req.Namespace), client.MatchingLabels{
+ planetscalev2.BackupScheduleLabel: vbsc.Name,
+ planetscalev2.ClusterLabel: vbsc.Spec.Cluster,
+ planetscalev2.KeyspaceLabel: keyspace,
+ planetscalev2.ShardLabel: shardSafeName,
+ })
if err != nil && !apierrors.IsNotFound(err) {
log.WithError(err).Error("unable to list Jobs in cluster")
return jobsList{}, nil, err
@@ -364,10 +375,6 @@ func (r *ReconcileVitessBackupsSchedule) getJobsList(ctx context.Context, req ct
var mostRecentTime *time.Time
for i, job := range existingJobs.Items {
- if !strings.HasPrefix(job.Name, jobName) {
- continue
- }
-
_, jobType := isJobFinished(&job)
switch jobType {
case kbatch.JobFailed, kbatch.JobFailureTarget:
@@ -495,17 +502,22 @@ func getScheduledTimeForJob(job *kbatch.Job) (*time.Time, error) {
func (r *ReconcileVitessBackupsSchedule) createJob(
ctx context.Context,
- jobName string,
- vbsc *planetscalev2.VitessBackupSchedule,
+ vbsc planetscalev2.VitessBackupSchedule,
strategy planetscalev2.VitessBackupScheduleStrategy,
scheduledTime time.Time,
vkr planetscalev2.VitessKeyRange,
) (*kbatch.Job, error) {
- name := fmt.Sprintf("%s-%d", jobName, scheduledTime.Unix())
+ name := names.JoinWithConstraints(names.ServiceConstraints, vbsc.Name, strategy.Keyspace, vkr.SafeName(), strconv.Itoa(int(scheduledTime.Unix())))
+
+ labels := map[string]string{
+ planetscalev2.BackupScheduleLabel: vbsc.Name,
+ planetscalev2.ClusterLabel: vbsc.Spec.Cluster,
+ planetscalev2.KeyspaceLabel: strategy.Keyspace,
+ planetscalev2.ShardLabel: vkr.SafeName(),
+ }
+
meta := metav1.ObjectMeta{
- Labels: map[string]string{
- planetscalev2.BackupScheduleLabel: vbsc.Name,
- },
+ Labels: labels,
Annotations: make(map[string]string),
Name: name,
Namespace: vbsc.Namespace,
@@ -517,7 +529,7 @@ func (r *ReconcileVitessBackupsSchedule) createJob(
maps.Copy(meta.Labels, vbsc.Labels)
- pod, vtbackupSpec, err := r.createJobPod(ctx, vbsc, strategy, name, vkr)
+ pod, vtbackupSpec, err := r.createJobPod(ctx, vbsc, strategy, name, vkr, labels)
if err != nil {
return nil, err
}
@@ -531,7 +543,7 @@ func (r *ReconcileVitessBackupsSchedule) createJob(
},
}
- if err := ctrl.SetControllerReference(vbsc, job, r.scheme); err != nil {
+ if err := ctrl.SetControllerReference(&vbsc, job, r.scheme); err != nil {
return nil, err
}
@@ -547,7 +559,7 @@ func (r *ReconcileVitessBackupsSchedule) createJob(
return nil, err
}
newPVC := vttablet.NewPVC(key, vtbackupSpec.TabletSpec)
- if err := ctrl.SetControllerReference(vbsc, newPVC, r.scheme); err != nil {
+ if err := ctrl.SetControllerReference(&vbsc, newPVC, r.scheme); err != nil {
return nil, err
}
err = r.client.Create(ctx, newPVC)
@@ -561,10 +573,11 @@ func (r *ReconcileVitessBackupsSchedule) createJob(
func (r *ReconcileVitessBackupsSchedule) createJobPod(
ctx context.Context,
- vbsc *planetscalev2.VitessBackupSchedule,
+ vbsc planetscalev2.VitessBackupSchedule,
strategy planetscalev2.VitessBackupScheduleStrategy,
name string,
vkr planetscalev2.VitessKeyRange,
+ labels map[string]string,
) (pod *corev1.Pod, spec *vttablet.BackupSpec, err error) {
vts, err := r.getShardFromKeyspace(ctx, vbsc.Namespace, vbsc.Spec.Cluster, strategy.Keyspace, strategy.Shard)
if err != nil {
@@ -579,25 +592,27 @@ func (r *ReconcileVitessBackupsSchedule) createJobPod(
if err != nil {
return nil, nil, err
}
+
backupType := vitessbackup.TypeUpdate
if len(completeBackups) == 0 {
- backupType = vitessbackup.TypeInit
+ if vts.Status.HasMaster == corev1.ConditionTrue {
+ backupType = vitessbackup.TypeFirstBackup
+ } else {
+ backupType = vitessbackup.TypeInit
+ }
}
podKey := client.ObjectKey{
Namespace: vbsc.Namespace,
Name: name,
}
- labels := map[string]string{
- planetscalev2.BackupScheduleLabel: vbsc.Name,
- planetscalev2.ClusterLabel: vbsc.Spec.Cluster,
- planetscalev2.KeyspaceLabel: strategy.Keyspace,
- planetscalev2.ShardLabel: vkr.SafeName(),
- }
vtbackupSpec := vitessshard.MakeVtbackupSpec(podKey, &vts, labels, backupType)
p := vttablet.NewBackupPod(podKey, vtbackupSpec, vts.Spec.Images.Mysqld.Image())
- p.Spec.RestartPolicy = corev1.RestartPolicyOnFailure
+ // Explicitly do not restart on failure. The VitessBackupSchedule controller will retry the failed job
+ // during the next scheduled run.
+ p.Spec.RestartPolicy = corev1.RestartPolicyNever
+
p.Spec.Affinity = vbsc.Spec.Affinity
return p, vtbackupSpec, nil
}
diff --git a/pkg/controller/vitessshard/reconcile_backup_job.go b/pkg/controller/vitessshard/reconcile_backup_job.go
index df4fcd65..23a82973 100644
--- a/pkg/controller/vitessshard/reconcile_backup_job.go
+++ b/pkg/controller/vitessshard/reconcile_backup_job.go
@@ -84,7 +84,11 @@ func (r *ReconcileVitessShard) reconcileBackupJob(ctx context.Context, vts *plan
// scratch (not from any tablet). If we're wrong and a backup exists
// already, the idempotent vtbackup "initial backup" mode will just do
// nothing and return success.
- initSpec := MakeVtbackupSpec(initPodKey, vts, labels, vitessbackup.TypeInit)
+ backupType := vitessbackup.TypeUpdate
+ if vts.Status.HasMaster != corev1.ConditionTrue {
+ backupType = vitessbackup.TypeInit
+ }
+ initSpec := MakeVtbackupSpec(initPodKey, vts, labels, backupType)
if initSpec != nil {
podKeys = append(podKeys, initPodKey)
if initSpec.TabletSpec.DataVolumePVCSpec != nil {
@@ -240,7 +244,8 @@ func vtbackupSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, parentLa
}
return &vttablet.BackupSpec{
- AllowFirstBackup: backupType == vitessbackup.TypeInit,
+ InitialBackup: backupType == vitessbackup.TypeInit,
+ AllowFirstBackup: backupType == vitessbackup.TypeFirstBackup,
MinBackupInterval: minBackupInterval,
MinRetentionTime: minRetentionTime,
MinRetentionCount: minRetentionCount,
diff --git a/pkg/operator/vitessbackup/labels.go b/pkg/operator/vitessbackup/labels.go
index e8e9a785..8c6d9819 100644
--- a/pkg/operator/vitessbackup/labels.go
+++ b/pkg/operator/vitessbackup/labels.go
@@ -24,6 +24,8 @@ const (
// TypeInit is a backup taken to initialize an empty shard.
TypeInit = "init"
+ // TypeFirstBackup is a backup taken when no other backups exist in an existing shard.
+ TypeFirstBackup = "empty"
// TypeUpdate is a backup taken to update the latest backup for a shard.
TypeUpdate = "update"
)
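Taken together with the vtbackupSpec change above, the schedule controller's backup-type selection can be summarized by this hypothetical helper; the real logic is inline in createJobPod and compares vts.Status.HasMaster, and vtbackupSpec then maps TypeInit to --initial_backup and TypeFirstBackup to --allow_first_backup:

    // pickBackupType is illustrative only, not part of the patch.
    func pickBackupType(completeBackups []*planetscalev2.VitessBackup, hasMaster corev1.ConditionStatus) string {
        if len(completeBackups) > 0 {
            return vitessbackup.TypeUpdate // refresh the shard's latest backup
        }
        if hasMaster == corev1.ConditionTrue {
            // The shard is already initialized and serving: take its first backup.
            return vitessbackup.TypeFirstBackup
        }
        // Empty shard with no primary yet: seed it with an initial backup.
        return vitessbackup.TypeInit
    }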
From 504b77137445182c5534864f2713fc3bc24e434a Mon Sep 17 00:00:00 2001
From: Florent Poinsard
Date: Mon, 24 Feb 2025 08:49:29 -0600
Subject: [PATCH 07/13] wip
Signed-off-by: Florent Poinsard
---
.../operator/101_initial_cluster.yaml | 9 +-
.../operator/201_customer_tablets.yaml | 9 +-
test/endtoend/operator/302_new_shards.yaml | 9 +-
test/endtoend/operator/306_down_shard_0.yaml | 9 +-
.../operator/401_scheduled_backups.yaml | 167 ++++++++++++++++++
test/endtoend/operator/cluster_upgrade.yaml | 9 +-
test/endtoend/upgrade_test.sh | 21 ++-
7 files changed, 224 insertions(+), 9 deletions(-)
create mode 100644 test/endtoend/operator/401_scheduled_backups.yaml
diff --git a/test/endtoend/operator/101_initial_cluster.yaml b/test/endtoend/operator/101_initial_cluster.yaml
index 76ef570a..e0a34ed1 100644
--- a/test/endtoend/operator/101_initial_cluster.yaml
+++ b/test/endtoend/operator/101_initial_cluster.yaml
@@ -7,6 +7,13 @@ kind: VitessCluster
metadata:
name: example
spec:
+ backup:
+ engine: xtrabackup
+ locations:
+ - volume:
+ hostPath:
+ path: /backup
+ type: Directory
images:
vtctld: vitess/lite:v21.0.0
vtgate: vitess/lite:v21.0.0
@@ -14,7 +21,7 @@ spec:
vtorc: vitess/lite:v21.0.0
vtbackup: vitess/lite:v21.0.0
mysqld:
- mysql80Compatible: mysql:8.0.40
+ mysql80Compatible: vitess/lite:v21.0.0
mysqldExporter: prom/mysqld-exporter:v0.14.0
cells:
- name: zone1
diff --git a/test/endtoend/operator/201_customer_tablets.yaml b/test/endtoend/operator/201_customer_tablets.yaml
index ef1f3546..c0565c7d 100644
--- a/test/endtoend/operator/201_customer_tablets.yaml
+++ b/test/endtoend/operator/201_customer_tablets.yaml
@@ -3,6 +3,13 @@ kind: VitessCluster
metadata:
name: example
spec:
+ backup:
+ engine: xtrabackup
+ locations:
+ - volume:
+ hostPath:
+ path: /backup
+ type: Directory
images:
vtctld: vitess/lite:latest
vtgate: vitess/lite:latest
@@ -10,7 +17,7 @@ spec:
vtorc: vitess/lite:latest
vtbackup: vitess/lite:latest
mysqld:
- mysql80Compatible: mysql:8.0.40
+ mysql80Compatible: vitess/lite:latest
mysqldExporter: prom/mysqld-exporter:v0.14.0
cells:
- name: zone1
diff --git a/test/endtoend/operator/302_new_shards.yaml b/test/endtoend/operator/302_new_shards.yaml
index 8731afea..2808349b 100644
--- a/test/endtoend/operator/302_new_shards.yaml
+++ b/test/endtoend/operator/302_new_shards.yaml
@@ -3,6 +3,13 @@ kind: VitessCluster
metadata:
name: example
spec:
+ backup:
+ engine: xtrabackup
+ locations:
+ - volume:
+ hostPath:
+ path: /backup
+ type: Directory
images:
vtctld: vitess/lite:latest
vtgate: vitess/lite:latest
@@ -10,7 +17,7 @@ spec:
vtorc: vitess/lite:latest
vtbackup: vitess/lite:latest
mysqld:
- mysql80Compatible: mysql:8.0.40
+ mysql80Compatible: vitess/lite:latest
mysqldExporter: prom/mysqld-exporter:v0.14.0
cells:
- name: zone1
diff --git a/test/endtoend/operator/306_down_shard_0.yaml b/test/endtoend/operator/306_down_shard_0.yaml
index 9684f5f2..2cfcf5bd 100644
--- a/test/endtoend/operator/306_down_shard_0.yaml
+++ b/test/endtoend/operator/306_down_shard_0.yaml
@@ -3,6 +3,13 @@ kind: VitessCluster
metadata:
name: example
spec:
+ backup:
+ engine: xtrabackup
+ locations:
+ - volume:
+ hostPath:
+ path: /backup
+ type: Directory
images:
vtctld: vitess/lite:latest
vtgate: vitess/lite:latest
@@ -10,7 +17,7 @@ spec:
vtorc: vitess/lite:latest
vtbackup: vitess/lite:latest
mysqld:
- mysql80Compatible: mysql:8.0.40
+ mysql80Compatible: vitess/lite:latest
mysqldExporter: prom/mysqld-exporter:v0.14.0
cells:
- name: zone1
diff --git a/test/endtoend/operator/401_scheduled_backups.yaml b/test/endtoend/operator/401_scheduled_backups.yaml
new file mode 100644
index 00000000..52c18053
--- /dev/null
+++ b/test/endtoend/operator/401_scheduled_backups.yaml
@@ -0,0 +1,167 @@
+apiVersion: planetscale.com/v2
+kind: VitessCluster
+metadata:
+ name: example
+spec:
+ backup:
+ engine: xtrabackup
+ locations:
+ - volume:
+ hostPath:
+ path: /backup
+ type: Directory
+ schedules:
+ - name: "customer"
+ schedule: "* * * * *"
+ resources:
+ requests:
+ cpu: 100m
+ memory: 1024Mi
+ limits:
+ memory: 1024Mi
+ successfulJobsHistoryLimit: 2
+ failedJobsHistoryLimit: 3
+ jobTimeoutMinute: 5
+ suspend: true
+ strategies:
+ - name: BackupShard
+ keyspace: "customer"
+ shard: "80-"
+ - name: BackupShard
+ keyspace: "customer"
+ shard: "-80"
+ - name: "commerce"
+ schedule: "* * * * *"
+ resources:
+ requests:
+ cpu: 100m
+ memory: 1024Mi
+ limits:
+ memory: 1024Mi
+ successfulJobsHistoryLimit: 2
+ failedJobsHistoryLimit: 3
+ jobTimeoutMinute: 5
+ strategies:
+ - name: BackupShard
+ keyspace: "commerce"
+ shard: "-"
+ images:
+ vtctld: vitess/lite:latest
+ vtgate: vitess/lite:latest
+ vttablet: vitess/lite:latest
+ vtorc: vitess/lite:latest
+ vtbackup: vitess/lite:latest
+ mysqld:
+ mysql80Compatible: vitess/lite:latest
+ mysqldExporter: prom/mysqld-exporter:v0.14.0
+ cells:
+ - name: zone1
+ gateway:
+ authentication:
+ static:
+ secret:
+ name: example-cluster-config
+ key: users.json
+ replicas: 1
+ resources:
+ requests:
+ cpu: 100m
+ memory: 256Mi
+ limits:
+ memory: 256Mi
+ vitessDashboard:
+ cells:
+ - zone1
+ extraFlags:
+ security_policy: read-only
+ replicas: 1
+ resources:
+ limits:
+ memory: 128Mi
+ requests:
+ cpu: 100m
+ memory: 128Mi
+
+ keyspaces:
+ - name: commerce
+ durabilityPolicy: semi_sync
+ vitessOrchestrator:
+ resources:
+ limits:
+ memory: 128Mi
+ requests:
+ cpu: 100m
+ memory: 128Mi
+ extraFlags:
+ instance-poll-time: 3s
+ turndownPolicy: Immediate
+ partitionings:
+ - equal:
+ parts: 1
+ shardTemplate:
+ databaseInitScriptSecret:
+ name: example-cluster-config
+ key: init_db.sql
+ tabletPools:
+ - cell: zone1
+ type: replica
+ replicas: 3
+ vttablet:
+ extraFlags:
+ db_charset: utf8mb4
+ resources:
+ requests:
+ cpu: 100m
+ memory: 256Mi
+ mysqld:
+ resources:
+ requests:
+ cpu: 100m
+ memory: 512Mi
+ dataVolumeClaimTemplate:
+ accessModes: ["ReadWriteOnce"]
+ resources:
+ requests:
+ storage: 10Gi
+ - name: customer
+ durabilityPolicy: semi_sync
+ vitessOrchestrator:
+ resources:
+ limits:
+ memory: 128Mi
+ requests:
+ cpu: 100m
+ memory: 128Mi
+ extraFlags:
+ instance-poll-time: 3s
+ turndownPolicy: Immediate
+ partitionings:
+ - equal:
+ parts: 2
+ shardTemplate:
+ databaseInitScriptSecret:
+ name: example-cluster-config
+ key: init_db.sql
+ tabletPools:
+ - cell: zone1
+ type: replica
+ replicas: 3
+ vttablet:
+ extraFlags:
+ db_charset: utf8mb4
+ resources:
+ requests:
+ cpu: 100m
+ memory: 256Mi
+ mysqld:
+ resources:
+ requests:
+ cpu: 100m
+ memory: 512Mi
+ dataVolumeClaimTemplate:
+ accessModes: ["ReadWriteOnce"]
+ resources:
+ requests:
+ storage: 10Gi
+ updateStrategy:
+ type: Immediate
diff --git a/test/endtoend/operator/cluster_upgrade.yaml b/test/endtoend/operator/cluster_upgrade.yaml
index 18cd5484..cb0216e6 100644
--- a/test/endtoend/operator/cluster_upgrade.yaml
+++ b/test/endtoend/operator/cluster_upgrade.yaml
@@ -7,6 +7,13 @@ kind: VitessCluster
metadata:
name: example
spec:
+ backup:
+ engine: xtrabackup
+ locations:
+ - volume:
+ hostPath:
+ path: /backup
+ type: Directory
images:
vtctld: vitess/lite:latest
vtgate: vitess/lite:latest
@@ -14,7 +21,7 @@ spec:
vtorc: vitess/lite:latest
vtbackup: vitess/lite:latest
mysqld:
- mysql80Compatible: mysql:8.0.40
+ mysql80Compatible: vitess/lite:latest
mysqldExporter: prom/mysqld-exporter:v0.14.0
cells:
- name: zone1
diff --git a/test/endtoend/upgrade_test.sh b/test/endtoend/upgrade_test.sh
index f1c47413..3bc4552c 100755
--- a/test/endtoend/upgrade_test.sh
+++ b/test/endtoend/upgrade_test.sh
@@ -233,10 +233,23 @@ EOF
}
# Test setup
+mkdir -p -m 777 ./vtdataroot/backup
echo "Building the docker image"
docker build -f build/Dockerfile.release -t vitess-operator-pr:latest .
echo "Creating Kind cluster"
-kind create cluster --wait 30s --name kind-${BUILDKITE_BUILD_ID} --image ${KIND_VERSION}
+if [[ "$BUILDKITE_BUILD_ID" != "0" ]]; then
+ # The script is being run from Buildkite, so we can't mount the current
+ # working directory into kind. The current directory inside the Docker
+ # container is /workdir, so if we try to mount that, we get an error.
+ # Instead we need to mount the path where the code was checked out by Buildkite.
+ dockerContainerName=$(docker container ls --filter "ancestor=docker" --format '{{.Names}}')
+ CHECKOUT_PATH=$(docker container inspect -f '{{range .Mounts}}{{ if eq .Destination "/workdir" }}{{println .Source }}{{ end }}{{end}}' "$dockerContainerName")
+ BACKUP_DIR="$CHECKOUT_PATH/vtdataroot/backup"
+else
+ BACKUP_DIR="$PWD/vtdataroot/backup"
+fi
+sed "s,PATH,$BACKUP_DIR,1" ./test/endtoend/kindBackupConfig.yaml > ./vtdataroot/config.yaml
+kind create cluster --wait 30s --name kind-${BUILDKITE_BUILD_ID} --image ${KIND_VERSION} --config ./vtdataroot/config.yaml
echo "Loading docker image into Kind cluster"
kind load docker-image vitess-operator-pr:latest --name kind-${BUILDKITE_BUILD_ID}
@@ -244,7 +257,7 @@ cd "$PWD/test/endtoend/operator"
killall kubectl
setupKubectlAccessForCI
-get_started "operator.yaml" "101_initial_cluster.yaml"
+get_started "operator-latest.yaml" "101_initial_cluster.yaml"
verifyVtGateVersion "21.0.0"
checkSemiSyncSetup
# Initially too durability policy should be specified
@@ -258,5 +271,5 @@ move_tables
resharding
# Teardown
-echo "Deleting Kind cluster. This also deletes the volume associated with it"
-kind delete cluster --name kind-${BUILDKITE_BUILD_ID}
+#echo "Deleting Kind cluster. This also deletes the volume associated with it"
+#kind delete cluster --name kind-${BUILDKITE_BUILD_ID}
From ae049fdfc06ef1abbb9ecc027cacef54d3b5ff4e Mon Sep 17 00:00:00 2001
From: Florent Poinsard
Date: Mon, 24 Feb 2025 17:08:38 -0600
Subject: [PATCH 08/13] Rework how multiple strategies are handled and how the
Status is used
Signed-off-by: Florent Poinsard
---
...planetscale.com_vitessbackupschedules.yaml | 7 +-
.../crds/planetscale.com_vitessclusters.yaml | 2 -
docs/api.md | 45 ++--
docs/api/index.html | 45 ++--
.../v2/vitessbackupschedule_types.go | 42 +--
.../planetscale/v2/zz_generated.deepcopy.go | 8 +
.../vitessbackupschedule_controller.go | 254 +++++++++---------
test/endtoend/backup_schedule_test.sh | 19 +-
.../101_initial_cluster_backup_schedule.yaml | 4 +-
test/endtoend/operator/operator-latest.yaml | 9 +-
test/endtoend/utils.sh | 22 ++
11 files changed, 260 insertions(+), 197 deletions(-)
diff --git a/deploy/crds/planetscale.com_vitessbackupschedules.yaml b/deploy/crds/planetscale.com_vitessbackupschedules.yaml
index dda63530..dbce574a 100644
--- a/deploy/crds/planetscale.com_vitessbackupschedules.yaml
+++ b/deploy/crds/planetscale.com_vitessbackupschedules.yaml
@@ -114,8 +114,6 @@ spec:
example: commerce
type: string
name:
- enum:
- - BackupShard
type: string
shard:
example: '-'
@@ -165,6 +163,11 @@ spec:
lastScheduledTime:
format: date-time
type: string
+ lastScheduledTimes:
+ additionalProperties:
+ format: date-time
+ type: string
+ type: object
type: object
type: object
served: true
diff --git a/deploy/crds/planetscale.com_vitessclusters.yaml b/deploy/crds/planetscale.com_vitessclusters.yaml
index 0d7361d7..3646e9d9 100644
--- a/deploy/crds/planetscale.com_vitessclusters.yaml
+++ b/deploy/crds/planetscale.com_vitessclusters.yaml
@@ -240,8 +240,6 @@ spec:
example: commerce
type: string
name:
- enum:
- - BackupShard
type: string
shard:
example: '-'
diff --git a/docs/api.md b/docs/api.md
index 23073710..661e2975 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -647,16 +647,6 @@ SecretSource
-BackupStrategyName
-(string
alias)
-
-(Appears on:
-VitessBackupScheduleStrategy)
-
-
-
BackupStrategyName describes the vtctldclient command that will be used to take a backup.
-When scheduling a backup, you must specify at least one strategy.
-
CephBackupLocation
@@ -2600,7 +2590,8 @@ The PullPolicy used will be the same as the one used to pull the vtctld image.
(Optional)
- A list of pointers to currently running jobs.
+A list of pointers to currently running jobs.
+This field is deprecated and no longer used in >= v2.15. It will be removed in a future release.
|
@@ -2614,7 +2605,25 @@ Kubernetes meta/v1.Time
(Optional)
- Information when was the last time the job was successfully scheduled.
+Information when was the last time the job was successfully scheduled.
+This field is deprecated and no longer used in >= v2.15. It will be removed in a future release.
+Please use lastScheduledTimes instead, which maps each VitessBackupScheduleStrategy in the
+VitessBackupSchedule to its last scheduled time.
+ |
+
+
+
+lastScheduledTimes
+
+
+map[string]*k8s.io/apimachinery/pkg/apis/meta/v1.Time
+
+
+ |
+
+(Optional)
+ LastScheduledTimes lists, for every VitessBackupScheduleStrategy, the last schedule that was executed.
+It records the scheduled time each execution corresponds to, not when the execution started.
|
@@ -2642,9 +2651,7 @@ command line that will be executed in the Job’s pod.
name
-
-BackupStrategyName
-
+string
|
@@ -2741,11 +2748,9 @@ string
|
Strategy defines how we are going to take a backup.
If you want to take several backups within the same schedule you can add more items
-to the Strategy list. Each VitessBackupScheduleStrategy will be executed by the same
-kubernetes job. This is useful if for instance you have one schedule, and you want to
-take a backup of all shards in a keyspace and don’t want to re-create a second schedule.
-All the VitessBackupScheduleStrategy are concatenated into a single shell command that
-is executed when the Job’s container starts.
+to the Strategy list. Each VitessBackupScheduleStrategy will be executed in its own
+Kubernetes Job. This is useful if you want a single schedule to back up multiple shards
+at the same time.
|
diff --git a/docs/api/index.html b/docs/api/index.html
index 970b5750..2934ee89 100644
--- a/docs/api/index.html
+++ b/docs/api/index.html
@@ -649,16 +649,6 @@ AzblobBackupLocation
-BackupStrategyName
-(string
alias)
-
-(Appears on:
-VitessBackupScheduleStrategy)
-
-
-
BackupStrategyName describes the vtctldclient command that will be used to take a backup.
-When scheduling a backup, you must specify at least one strategy.
-
CephBackupLocation
@@ -2602,7 +2592,8 @@
VitessBackupScheduleStatu
(Optional)
- A list of pointers to currently running jobs.
+A list of pointers to currently running jobs.
+This field is deprecated and no longer used in >= v2.15. It will be removed in a future release.
|
@@ -2616,7 +2607,25 @@ VitessBackupScheduleStatu
(Optional)
- Information when was the last time the job was successfully scheduled.
+Information when was the last time the job was successfully scheduled.
+This field is deprecated and no longer used in >= v2.15. It will be removed in a future release.
+Please use lastScheduledTimes instead, which maps each VitessBackupScheduleStrategy in the
+VitessBackupSchedule to its last scheduled time.
+ |
+
+
+
+lastScheduledTimes
+
+
+map[string]*k8s.io/apimachinery/pkg/apis/meta/v1.Time
+
+
+ |
+
+(Optional)
+ LastScheduledTimes lists, for every VitessBackupScheduleStrategy, the last schedule that was executed.
+It records the scheduled time each execution corresponds to, not when the execution started.
|
@@ -2644,9 +2653,7 @@ VitessBackupScheduleStr
name
-
-BackupStrategyName
-
+string
|
@@ -2743,11 +2750,9 @@ VitessBackupScheduleTem
Strategy defines how we are going to take a backup.
If you want to take several backups within the same schedule you can add more items
-to the Strategy list. Each VitessBackupScheduleStrategy will be executed by the same
-kubernetes job. This is useful if for instance you have one schedule, and you want to
-take a backup of all shards in a keyspace and don’t want to re-create a second schedule.
-All the VitessBackupScheduleStrategy are concatenated into a single shell command that
-is executed when the Job’s container starts.
+to the Strategy list. Each VitessBackupScheduleStrategy will be executed in its own
+Kubernetes Job. This is useful if you want a single schedule to back up multiple shards
+at the same time.
|
diff --git a/pkg/apis/planetscale/v2/vitessbackupschedule_types.go b/pkg/apis/planetscale/v2/vitessbackupschedule_types.go
index dbc5f3f9..e33d4061 100644
--- a/pkg/apis/planetscale/v2/vitessbackupschedule_types.go
+++ b/pkg/apis/planetscale/v2/vitessbackupschedule_types.go
@@ -34,16 +34,6 @@ const (
ForbidConcurrent ConcurrencyPolicy = "Forbid"
)
-// BackupStrategyName describes the vtctldclient command that will be used to take a backup.
-// When scheduling a backup, you must specify at least one strategy.
-// +kubebuilder:validation:Enum=BackupShard
-type BackupStrategyName string
-
-const (
- // BackupShard will use the "vtctldclient BackupShard" command to take a backup
- BackupShard BackupStrategyName = "BackupShard"
-)
-
// VitessBackupSchedule is the Schema for the VitessBackupSchedule API.
// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
@@ -98,12 +88,12 @@ type VitessBackupScheduleTemplate struct {
// Strategy defines how we are going to take a backup.
// If you want to take several backups within the same schedule you can add more items
- // to the Strategy list. Each VitessBackupScheduleStrategy will be executed by the same
- // kubernetes job. This is useful if for instance you have one schedule, and you want to
- // take a backup of all shards in a keyspace and don't want to re-create a second schedule.
- // All the VitessBackupScheduleStrategy are concatenated into a single shell command that
- // is executed when the Job's container starts.
+ // to the Strategy list. Each VitessBackupScheduleStrategy will be executed in its own
+ // Kubernetes Job. This is useful if you want a single schedule to back up multiple shards
+ // at the same time.
// +kubebuilder:validation:MinItems=1
+ // +patchMergeKey=name
+ // +patchStrategy=merge
Strategy []VitessBackupScheduleStrategy `json:"strategies"`
// Resources specify the compute resources to allocate for every Jobs's pod.
@@ -181,7 +171,7 @@ type VitessBackupScheduleTemplate struct {
// command line that will be executed in the Job's pod.
type VitessBackupScheduleStrategy struct {
// Name of the backup strategy.
- Name BackupStrategyName `json:"name"`
+ Name string `json:"name"`
// Keyspace defines the keyspace on which we want to take the backup.
// +kubebuilder:example="commerce"
@@ -199,12 +189,32 @@ type VitessBackupScheduleStrategy struct {
// VitessBackupScheduleStatus defines the observed state of VitessBackupSchedule
type VitessBackupScheduleStatus struct {
// A list of pointers to currently running jobs.
+ // This field is deprecated and no longer used in >= v2.15. It will be removed in a future release.
// +optional
Active []corev1.ObjectReference `json:"active,omitempty"`
// Information when was the last time the job was successfully scheduled.
+ // This field is deprecated and no longer used in >= v2.15. It will be removed in a future release.
+ // Please use lastScheduledTimes instead, which maps each VitessBackupScheduleStrategy in the
+ // VitessBackupSchedule to its last scheduled time.
// +optional
LastScheduledTime *metav1.Time `json:"lastScheduledTime,omitempty"`
+
+ // LastScheduledTimes lists, for every VitessBackupScheduleStrategy, the last schedule that was executed.
+ // It records the scheduled time each execution corresponds to, not when the execution started.
+ // +optional
+ LastScheduledTimes map[string]*metav1.Time `json:"lastScheduledTimes,omitempty"`
+}
+
+// NewVitessBackupScheduleStatus creates a new status with default values.
+func NewVitessBackupScheduleStatus(status VitessBackupScheduleStatus) VitessBackupScheduleStatus {
+ newStatus := VitessBackupScheduleStatus{
+ LastScheduledTimes: status.LastScheduledTimes,
+ }
+ if status.LastScheduledTimes == nil {
+ newStatus.LastScheduledTimes = make(map[string]*metav1.Time)
+ }
+ return newStatus
}
func init() {
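
To illustrate the per-strategy bookkeeping introduced by this patch, here is a minimal stand-alone Go sketch. The structs below only mirror the fields added above (they are not the real planetscalev2 types), and the strategy names are illustrative, chosen to match the test YAML further down; the point is that each strategy gets its own Job and its own entry in LastScheduledTimes.

package main

import (
	"fmt"
	"time"
)

// Stand-ins that mirror the fields added above; the real types live in
// planetscale.dev/vitess-operator/pkg/apis/planetscale/v2.
type BackupStrategy struct {
	Name     string // free-form now that the BackupShard enum is gone
	Keyspace string
	Shard    string
}

type ScheduleStatus struct {
	// Maps each strategy name to the scheduled time of its last executed run.
	LastScheduledTimes map[string]time.Time
}

func main() {
	strategies := []BackupStrategy{
		{Name: "commerce_x", Keyspace: "commerce", Shard: "-"},
		{Name: "customer_x-80", Keyspace: "customer", Shard: "-80"},
		{Name: "customer_80-x", Keyspace: "customer", Shard: "80-"},
	}

	status := ScheduleStatus{LastScheduledTimes: map[string]time.Time{}}
	scheduledAt := time.Now()

	// One Kubernetes Job per strategy, each tracked under its own key.
	for _, s := range strategies {
		fmt.Printf("would create one Job for %s/%s (strategy %q)\n", s.Keyspace, s.Shard, s.Name)
		status.LastScheduledTimes[s.Name] = scheduledAt
	}
	fmt.Println("strategies tracked in the status:", len(status.LastScheduledTimes))
}
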
diff --git a/pkg/apis/planetscale/v2/zz_generated.deepcopy.go b/pkg/apis/planetscale/v2/zz_generated.deepcopy.go
index ace6b890..700716d2 100644
--- a/pkg/apis/planetscale/v2/zz_generated.deepcopy.go
+++ b/pkg/apis/planetscale/v2/zz_generated.deepcopy.go
@@ -7,6 +7,7 @@ package v2
import (
autoscalingv2 "k8s.io/api/autoscaling/v2"
"k8s.io/api/core/v1"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
)
@@ -855,6 +856,13 @@ func (in *VitessBackupScheduleStatus) DeepCopyInto(out *VitessBackupScheduleStat
in, out := &in.LastScheduledTime, &out.LastScheduledTime
*out = (*in).DeepCopy()
}
+ if in.LastScheduledTimes != nil {
+ in, out := &in.LastScheduledTimes, &out.LastScheduledTimes
+ *out = make(map[string]*metav1.Time, len(*in))
+ for key, val := range *in {
+ (*out)[key] = val.DeepCopy()
+ }
+ }
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VitessBackupScheduleStatus.
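
A quick aside on why the generated code copies each map value with val.DeepCopy() instead of copying the pointers: a pointer copy would leave the copied status aliasing the original timestamps. The sketch below uses plain *time.Time in place of *metav1.Time so it stays dependency-free; it only demonstrates the aliasing difference.

package main

import (
	"fmt"
	"time"
)

func main() {
	start := time.Now()
	later := start.Add(time.Hour)

	t := start
	original := map[string]*time.Time{"commerce_x": &t}

	// Shallow copy: both maps share the same underlying time.Time value.
	shallow := make(map[string]*time.Time, len(original))
	for k, v := range original {
		shallow[k] = v
	}

	// Deep copy: each entry gets its own value, as zz_generated.deepcopy.go does.
	deep := make(map[string]*time.Time, len(original))
	for k, v := range original {
		c := *v
		deep[k] = &c
	}

	// Mutate the original entry through its pointer.
	*original["commerce_x"] = later

	fmt.Println("shallow copy sees the mutation:", shallow["commerce_x"].Equal(later)) // true
	fmt.Println("deep copy keeps the old value: ", deep["commerce_x"].Equal(start))    // true
}
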
diff --git a/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go b/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go
index 5d2ddfb8..4091163a 100644
--- a/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go
+++ b/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go
@@ -32,6 +32,7 @@ import (
"github.com/sirupsen/logrus"
kbatch "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
+ apiequality "k8s.io/apimachinery/pkg/api/equality"
apierrors "k8s.io/apimachinery/pkg/api/errors"
apilabels "k8s.io/apimachinery/pkg/labels"
"k8s.io/client-go/tools/record"
@@ -40,6 +41,7 @@ import (
"planetscale.dev/vitess-operator/pkg/operator/names"
"planetscale.dev/vitess-operator/pkg/operator/reconciler"
"planetscale.dev/vitess-operator/pkg/operator/results"
+ "planetscale.dev/vitess-operator/pkg/operator/resync"
"planetscale.dev/vitess-operator/pkg/operator/vitessbackup"
"planetscale.dev/vitess-operator/pkg/operator/vttablet"
"sigs.k8s.io/controller-runtime/pkg/controller"
@@ -51,7 +53,6 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
- ref "k8s.io/client-go/tools/reference"
planetscalev2 "planetscale.dev/vitess-operator/pkg/apis/planetscale/v2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -64,6 +65,7 @@ const (
var (
maxConcurrentReconciles = flag.Int("vitessbackupschedule_concurrent_reconciles", 10, "the maximum number of different vitessbackupschedule resources to reconcile concurrently")
+ resyncPeriod = flag.Duration("vitessbackupschedule_resync_period", 1*time.Minute, "reconcile vitessbackupschedules with this period even if no Kubernetes events occur")
scheduledTimeAnnotation = "planetscale.com/backup-scheduled-at"
@@ -80,6 +82,7 @@ type (
ReconcileVitessBackupsSchedule struct {
client client.Client
scheme *runtime.Scheme
+ resync *resync.Periodic
recorder record.EventRecorder
reconciler *reconciler.Reconciler
}
@@ -111,6 +114,7 @@ func newReconciler(mgr manager.Manager) (*ReconcileVitessBackupsSchedule, error)
return &ReconcileVitessBackupsSchedule{
client: c,
scheme: scheme,
+ resync: resync.NewPeriodic(controllerName, *resyncPeriod),
recorder: recorder,
reconciler: reconciler.New(c, scheme, recorder),
}, nil
@@ -145,6 +149,11 @@ func add(mgr manager.Manager, r *ReconcileVitessBackupsSchedule) error {
}
}
+ // Periodically resync even when no Kubernetes events have come in.
+ if err := c.Watch(r.resync.WatchSource()); err != nil {
+ return err
+ }
+
return nil
}
@@ -156,16 +165,14 @@ func add(mgr manager.Manager, r *ReconcileVitessBackupsSchedule) error {
// - List all jobs and define the last scheduled Job
// - Clean up old Job objects
// - Create a new Job if needed
-func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
- resultBuilder := &results.Builder{}
-
+// - Update the VitessBackupSchedule Status
+func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl.Request) (res ctrl.Result, err error) {
log = log.WithFields(logrus.Fields{
"namespace": req.Namespace,
"VitessBackupSchedule": req.Name,
})
log.Info("Reconciling VitessBackupSchedule")
- var err error
var vbsc planetscalev2.VitessBackupSchedule
if err = r.client.Get(ctx, req.NamespacedName, &vbsc); err != nil {
log.WithError(err).Error(" unable to fetch VitessBackupSchedule")
@@ -173,113 +180,145 @@ func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl
// Request object not found, could have been deleted after reconcile request.
// Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
// Return and don't requeue
- return resultBuilder.Result()
+ return ctrl.Result{}, nil
}
// Error reading the object - requeue the request.
- return resultBuilder.Error(err)
+ return ctrl.Result{}, err
}
+ // If the Suspend setting is set to true, we can skip adding any job, our work is done here.
+ if vbsc.Spec.Suspend != nil && *vbsc.Spec.Suspend {
+ log.Info("VitessBackupSchedule suspended, skipping")
+ return ctrl.Result{}, nil
+ }
+
+ oldStatus := vbsc.DeepCopy()
+ vbsc.Status = planetscalev2.NewVitessBackupScheduleStatus(vbsc.Status)
+
// Register this reconciling attempt no matter if we fail or succeed.
defer func() {
reconcileCount.WithLabelValues(vbsc.Name, metrics.Result(err)).Inc()
}()
- var scheduledResult ctrl.Result
- var activeJobs []*kbatch.Job
- for _, strategy := range vbsc.Spec.Strategy {
- start, end, ok := strings.Cut(strategy.Shard, "-")
- if !ok {
- return resultBuilder.Error(fmt.Errorf("invalid strategy shard: %s", strategy.Shard))
- }
- vkr := planetscalev2.VitessKeyRange{
- Start: start,
- End: end,
- }
- jobs, mostRecentTime, err := r.getJobsList(ctx, req, vbsc, strategy.Keyspace, vkr.SafeName())
- if err != nil {
- // We had an error reading the jobs, we can requeue.
- return resultBuilder.Error(err)
- }
+ resultBuilder := &results.Builder{}
+ _, _ = resultBuilder.Merge(r.reconcileStrategies(ctx, req, vbsc))
- activeJobs = append(activeJobs, jobs.active...)
- err = r.updateVitessBackupScheduleStatus(ctx, mostRecentTime, vbsc, activeJobs)
- if err != nil {
- // We had an error updating the status, we can requeue.
- return resultBuilder.Error(err)
+ if !apiequality.Semantic.DeepEqual(&vbsc.Status, &oldStatus.Status) {
+ if err := r.client.Status().Update(ctx, &vbsc); err != nil {
+ if !apierrors.IsConflict(err) {
+ log.WithError(err).Error("unable to update VitessBackupSchedule status")
+ }
+ _, _ = resultBuilder.Error(err)
}
+ }
- // We must clean up old jobs to not overcrowd the number of Pods and Jobs in the cluster.
- // This will be done according to both failedJobsHistoryLimit and successfulJobsHistoryLimit fields.
- r.cleanupJobsWithLimit(ctx, jobs.failed, vbsc.GetFailedJobsLimit())
- r.cleanupJobsWithLimit(ctx, jobs.successful, vbsc.GetSuccessfulJobsLimit())
+ // Request a periodic resync of this VitessBackupSchedule object to check
+ // if existing Jobs have finished or timed out and need to be cleaned up
+ // even if no Kubernetes events have occurred.
+ r.resync.Enqueue(req.NamespacedName)
- err = r.removeTimeoutJobs(ctx, jobs.active, vbsc.Name, vbsc.Spec.JobTimeoutMinutes)
- if err != nil {
- // We had an error while removing timed out jobs, we can requeue
- return resultBuilder.Error(err)
- }
+ return resultBuilder.Result()
+}
- // If the Suspend setting is set to true, we can skip adding any job, our work is done here.
- if vbsc.Spec.Suspend != nil && *vbsc.Spec.Suspend {
- log.Info("VitessBackupSchedule suspended, skipping")
- return ctrl.Result{}, nil
- }
+func (r *ReconcileVitessBackupsSchedule) reconcileStrategies(ctx context.Context, req ctrl.Request, vbsc planetscalev2.VitessBackupSchedule) (ctrl.Result, error) {
+ resultBuilder := &results.Builder{}
- missedRun, nextRun, err := getNextSchedule(vbsc, time.Now())
- if err != nil {
- log.Error(err, "unable to figure out VitessBackupSchedule schedule")
- // Re-queuing here does not make sense as we have an error with the schedule and the user needs to fix it first.
- return ctrl.Result{}, nil
- }
+ for _, strategy := range vbsc.Spec.Strategy {
+ _, _ = resultBuilder.Merge(r.reconcileStrategy(ctx, strategy, req, vbsc))
+ }
+ return resultBuilder.Result()
+}
- // Ask kubernetes to re-queue for the next scheduled job, and skip if we don't miss any run.
- scheduledResult = ctrl.Result{RequeueAfter: nextRun.Sub(time.Now())}
- if missedRun.IsZero() {
- continue
- }
+func (r *ReconcileVitessBackupsSchedule) reconcileStrategy(
+ ctx context.Context,
+ strategy planetscalev2.VitessBackupScheduleStrategy,
+ req ctrl.Request,
+ vbsc planetscalev2.VitessBackupSchedule,
+) (reconcile.Result, error) {
+ resultBuilder := &results.Builder{}
- // Check whether we are too late to create this Job or not. The startingDeadlineSeconds field will help us
- // schedule Jobs that are late.
- tooLate := false
- if vbsc.Spec.StartingDeadlineSeconds != nil {
- tooLate = missedRun.Add(time.Duration(*vbsc.Spec.StartingDeadlineSeconds) * time.Second).Before(time.Now())
- }
- if tooLate {
- log.Infof("missed starting deadline for latest run; skipping; next run is scheduled for: %s", nextRun.Format(time.RFC3339))
- continue
- }
+ start, end, ok := strings.Cut(strategy.Shard, "-")
+ if !ok {
+ return resultBuilder.Error(fmt.Errorf("invalid strategy shard: %s", strategy.Shard))
+ }
+ vkr := planetscalev2.VitessKeyRange{
+ Start: start,
+ End: end,
+ }
+ jobs, mostRecentTime, err := r.getJobsList(ctx, req, vbsc, strategy.Keyspace, vkr.SafeName())
+ if err != nil {
+ // We had an error reading the jobs, we can requeue.
+ return resultBuilder.Error(err)
+ }
- // Check concurrency policy and skip this job if we have ForbidConcurrent set plus an active job
- if vbsc.Spec.ConcurrencyPolicy == planetscalev2.ForbidConcurrent && len(jobs.active) > 0 {
- log.Infof("concurrency policy blocks concurrent runs: skipping, number of active jobs: %d", len(jobs.active))
- continue
- }
+ // We must clean up old jobs to not overcrowd the number of Pods and Jobs in the cluster.
+ // This will be done according to both failedJobsHistoryLimit and successfulJobsHistoryLimit fields.
+ r.cleanupJobsWithLimit(ctx, jobs.failed, vbsc.GetFailedJobsLimit())
+ r.cleanupJobsWithLimit(ctx, jobs.successful, vbsc.GetSuccessfulJobsLimit())
- // Now that the different policies are checked, we can create and apply our new job.
- job, err := r.createJob(ctx, vbsc, strategy, missedRun, vkr)
- if err != nil {
- // Re-queuing here does not make sense as we have an error with the template and the user needs to fix it first.
- log.WithError(err).Error("unable to construct job from template")
- return ctrl.Result{}, err
- }
+ err = r.removeTimeoutJobs(ctx, jobs.active, vbsc.Name, vbsc.Spec.JobTimeoutMinutes)
+ if err != nil {
+ // We had an error while removing timed out jobs, we can requeue
+ return resultBuilder.Error(err)
+ }
- if err = r.client.Create(ctx, job); err != nil {
- // if the job already exists it means another reconciling loop created the job since we last fetched
- // the list of jobs to create, we can safely return without failing.
- if apierrors.IsAlreadyExists(err) {
- log.Infof("job %s already exists, will retry in %s", job.Name, scheduledResult.RequeueAfter.String())
- continue
- }
- // Simply re-queue here
- return resultBuilder.Error(err)
- }
+ missedRun, nextRun, err := getNextSchedule(vbsc, time.Now(), mostRecentTime)
+ if err != nil {
+ log.Error(err, "unable to figure out VitessBackupSchedule schedule")
+ // Re-queuing here does not make sense as we have an error with the schedule and the user needs to fix it first.
+ return resultBuilder.Error(reconcile.TerminalError(err))
+ }
- log.Infof("created new job: %s, next job scheduled in %s", job.Name, scheduledResult.RequeueAfter.String())
+ // If we did not miss any run, we can skip and not requeue anything
+ if missedRun.IsZero() {
+ return resultBuilder.Result()
}
- return scheduledResult, nil
+
+ // Keep track of when we need to requeue this job
+ requeueAfter := nextRun.Sub(time.Now())
+ _, _ = resultBuilder.RequeueAfter(requeueAfter)
+
+ // Check whether we are too late to create this Job or not. The startingDeadlineSeconds field will help us
+ // schedule Jobs that are late.
+ tooLate := false
+ if vbsc.Spec.StartingDeadlineSeconds != nil {
+ tooLate = missedRun.Add(time.Duration(*vbsc.Spec.StartingDeadlineSeconds) * time.Second).Before(time.Now())
+ }
+ if tooLate {
+ log.Infof("missed starting deadline for latest run; skipping; next run is scheduled for: %s", nextRun.Format(time.RFC3339))
+ return resultBuilder.Result()
+ }
+
+ // Check concurrency policy and skip this job if we have ForbidConcurrent set plus an active job
+ if vbsc.Spec.ConcurrencyPolicy == planetscalev2.ForbidConcurrent && len(jobs.active) > 0 {
+ log.Infof("concurrency policy blocks concurrent runs: skipping, number of active jobs: %d", len(jobs.active))
+ return resultBuilder.Result()
+ }
+
+ // Now that the different policies are checked, we can create and apply our new job.
+ job, err := r.createJob(ctx, vbsc, strategy, missedRun, vkr)
+ if err != nil {
+ // Re-queuing here does not make sense as we have an error with the template and the user needs to fix it first.
+ log.WithError(err).Error("unable to construct job from template")
+ return resultBuilder.Error(reconcile.TerminalError(err))
+ }
+
+ if err = r.client.Create(ctx, job); err != nil {
+ // if the job already exists it means another reconciling loop created the job since we last fetched
+ // the list of jobs to create, we can safely return without failing.
+ if apierrors.IsAlreadyExists(err) {
+ log.Infof("job %s already exists, will retry in %s", job.Name, requeueAfter.String())
+ return resultBuilder.Result()
+ }
+ // Simply re-queue here
+ return resultBuilder.Error(err)
+ }
+ log.Infof("created new job: %s, next job scheduled in %s", job.Name, requeueAfter.String())
+ vbsc.Status.LastScheduledTimes[strategy.Name] = &metav1.Time{Time: missedRun}
+ return resultBuilder.Result()
}
-func getNextSchedule(vbsc planetscalev2.VitessBackupSchedule, now time.Time) (time.Time, time.Time, error) {
+func getNextSchedule(vbsc planetscalev2.VitessBackupSchedule, now time.Time, mostRecentTime *time.Time) (time.Time, time.Time, error) {
sched, err := cron.ParseStandard(vbsc.Spec.Schedule)
if err != nil {
return time.Time{}, time.Time{}, fmt.Errorf("unable to parse schedule %q: %v", vbsc.Spec.Schedule, err)
@@ -287,30 +326,27 @@ func getNextSchedule(vbsc planetscalev2.VitessBackupSchedule, now time.Time) (ti
// Set the last scheduled time by either looking at the VitessBackupSchedule's Status or
// by looking at its creation time.
- var latestRun time.Time
- if vbsc.Status.LastScheduledTime != nil {
- latestRun = vbsc.Status.LastScheduledTime.Time
- } else {
- latestRun = vbsc.ObjectMeta.CreationTimestamp.Time
+ if mostRecentTime == nil {
+ mostRecentTime = &vbsc.ObjectMeta.CreationTimestamp.Time
}
if vbsc.Spec.StartingDeadlineSeconds != nil {
// controller is not going to schedule anything below this point
schedulingDeadline := now.Add(-time.Second * time.Duration(*vbsc.Spec.StartingDeadlineSeconds))
- if schedulingDeadline.After(latestRun) {
- latestRun = schedulingDeadline
+ if schedulingDeadline.After(*mostRecentTime) {
+ *mostRecentTime = schedulingDeadline
}
}
// Next schedule is later, simply return the next scheduled time.
- if latestRun.After(now) {
+ if mostRecentTime.After(now) {
return time.Time{}, sched.Next(now), nil
}
var lastMissed time.Time
missedRuns := 0
- for t := sched.Next(latestRun); !t.After(now); t = sched.Next(t) {
+ for t := sched.Next(*mostRecentTime); !t.After(now); t = sched.Next(t) {
lastMissed = t
missedRuns++
@@ -323,30 +359,6 @@ func getNextSchedule(vbsc planetscalev2.VitessBackupSchedule, now time.Time) (ti
return lastMissed, sched.Next(now), nil
}
-func (r *ReconcileVitessBackupsSchedule) updateVitessBackupScheduleStatus(ctx context.Context, mostRecentTime *time.Time, vbsc planetscalev2.VitessBackupSchedule, activeJobs []*kbatch.Job) error {
- if mostRecentTime != nil {
- vbsc.Status.LastScheduledTime = &metav1.Time{Time: *mostRecentTime}
- } else {
- vbsc.Status.LastScheduledTime = nil
- }
-
- vbsc.Status.Active = make([]corev1.ObjectReference, 0, len(activeJobs))
- for _, activeJob := range activeJobs {
- jobRef, err := ref.GetReference(r.scheme, activeJob)
- if err != nil {
- log.WithError(err).Errorf("unable to make reference to active job: %s", jobRef.Name)
- continue
- }
- vbsc.Status.Active = append(vbsc.Status.Active, *jobRef)
- }
-
- if err := r.client.Status().Update(ctx, &vbsc); err != nil {
- log.WithError(err).Error("unable to update VitessBackupSchedule status")
- return err
- }
- return nil
-}
-
// getJobsList fetches all existing Jobs in the cluster and return them by categories: active, failed or successful.
// It also returns at what time was the last job created, which is needed to update VitessBackupSchedule's status,
// and plan future jobs.
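
The missed-run/next-run computation in getNextSchedule can be exercised on its own. The sketch below assumes the cron package is github.com/robfig/cron/v3 (which provides ParseStandard and Schedule.Next as used above) and leaves out the startingDeadlineSeconds clamp and the cap on missed runs.

package main

import (
	"fmt"
	"time"

	"github.com/robfig/cron/v3"
)

// missedAndNext returns the most recent run that should have happened since
// lastScheduled (zero if none was missed) and the next run after now.
func missedAndNext(spec string, lastScheduled, now time.Time) (missed, next time.Time, err error) {
	sched, err := cron.ParseStandard(spec) // standard 5-field cron expression
	if err != nil {
		return time.Time{}, time.Time{}, err
	}
	if lastScheduled.After(now) {
		// Nothing can have been missed yet; just report the next run.
		return time.Time{}, sched.Next(now), nil
	}
	for t := sched.Next(lastScheduled); !t.After(now); t = sched.Next(t) {
		missed = t // keep the most recent missed run
	}
	return missed, sched.Next(now), nil
}

func main() {
	now := time.Now()
	missed, next, err := missedAndNext("*/5 * * * *", now.Add(-7*time.Minute), now)
	if err != nil {
		panic(err)
	}
	fmt.Println("missed run:", missed, "next run:", next)
}
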
diff --git a/test/endtoend/backup_schedule_test.sh b/test/endtoend/backup_schedule_test.sh
index ce661ed4..8620bbc0 100755
--- a/test/endtoend/backup_schedule_test.sh
+++ b/test/endtoend/backup_schedule_test.sh
@@ -31,27 +31,26 @@ function verifyListBackupsOutputWithSchedule() {
checkVitessBackupScheduleStatusWithTimeout "example-vbsc-every-five-minute(.*)"
echo -e "Check for number of backups in the cluster"
- # Sleep for over 6 minutes, during this time we should have at the very minimum
- # 7 backups. At least: 6 backups from the every-minute schedule, and 1 backup
- # from the every-five-minute schedule.
- for i in {1..6} ; do
+ # Sleep for 6 minutes, after 6 minutes we should at least have 3 backups: 1 from the initial vtbackup pod
+ # 1 minimum from the every minute schedule, and 1 from the every-five minute schedule
+ for i in {1..360} ; do
# Ensure that we can view the backup files from the host.
docker exec -it $(docker container ls --format '{{.Names}}' | grep kind) chmod o+rwx -R /backup > /dev/null
backupCount=$(kubectl get vtb --no-headers | wc -l)
echo "Found ${backupCount} backups"
- if [[ "${backupCount}" -ge 7 ]]; then
+ if [[ "${backupCount}" -ge 3 ]]; then
break
fi
- sleep 100
+ sleep 1
done
- if [[ "${backupCount}" -lt 7 ]]; then
- echo "Did not find at least 7 backups"
+ if [[ "${backupCount}" -lt 3 ]]; then
+ echo "Did not find at least 3 backups"
exit 1
fi
echo -e "Check for Jobs' pods"
- checkPodStatusWithTimeout "example-vbsc-every-minute-(.*)0/1(.*)Completed(.*)" 3
- checkPodStatusWithTimeout "example-vbsc-every-five-minute-(.*)0/1(.*)Completed(.*)" 2
+ checkPodExistWithTimeout "example-vbsc-every-minute-(.*)0/1(.*)Completed(.*)"
+ checkPodExistWithTimeout "example-vbsc-every-five-minute-(.*)0/1(.*)Completed(.*)"
}
# Test setup
diff --git a/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml b/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml
index 7fe852cf..b6379901 100644
--- a/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml
+++ b/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml
@@ -27,7 +27,7 @@ spec:
failedJobsHistoryLimit: 3
jobTimeoutMinute: 5
strategies:
- - name: BackupShard
+ - name: commerce-x
keyspace: "commerce"
shard: "-"
- name: "every-five-minute"
@@ -42,7 +42,7 @@ spec:
failedJobsHistoryLimit: 3
jobTimeoutMinute: 5
strategies:
- - name: BackupShard
+ - name: commerce-x
keyspace: "commerce"
shard: "-"
images:
diff --git a/test/endtoend/operator/operator-latest.yaml b/test/endtoend/operator/operator-latest.yaml
index 4d41262a..be23f61a 100644
--- a/test/endtoend/operator/operator-latest.yaml
+++ b/test/endtoend/operator/operator-latest.yaml
@@ -492,8 +492,6 @@ spec:
example: commerce
type: string
name:
- enum:
- - BackupShard
type: string
shard:
example: '-'
@@ -543,6 +541,11 @@ spec:
lastScheduledTime:
format: date-time
type: string
+ lastScheduledTimes:
+ additionalProperties:
+ format: date-time
+ type: string
+ type: object
type: object
type: object
served: true
@@ -2101,8 +2104,6 @@ spec:
example: commerce
type: string
name:
- enum:
- - BackupShard
type: string
shard:
example: '-'
diff --git a/test/endtoend/utils.sh b/test/endtoend/utils.sh
index 4de6add1..74e380f8 100644
--- a/test/endtoend/utils.sh
+++ b/test/endtoend/utils.sh
@@ -155,6 +155,28 @@ function checkPodStatusWithTimeout() {
exit 1
}
+function checkPodExistWithTimeout() {
+ regex=$1
+
+ # We use this for loop instead of `kubectl wait` because we don't have access to the full pod name
+ # and `kubectl wait` does not support regex to match resource name.
+ for i in {1..1200} ; do
+ out=$(kubectl get pods)
+ echo "$out" | grep -E "$regex" > /dev/null 2>&1
+ if [[ $? -eq 0 ]]; then
+ echo "$regex found"
+ return
+ fi
+ sleep 1
+ done
+  echo -e "ERROR: checkPodExistWithTimeout timed out trying to find a pod matching:\ngot:\n$out\nfor regex: $regex"
+ echo "$regex" | grep "vttablet" > /dev/null 2>&1
+ if [[ $? -eq 0 ]]; then
+ printMysqlErrorFiles
+ fi
+ exit 1
+}
+
# ensurePodResourcesSet:
# $1: regex used to match pod names
function ensurePodResourcesSet() {
From e35d9e2df593e2c9c3111643743a66a9e9e463dc Mon Sep 17 00:00:00 2001
From: Florent Poinsard
Date: Tue, 25 Feb 2025 13:06:46 -0600
Subject: [PATCH 09/13] Add more tests both in upgrade and scheduled backup
Signed-off-by: Florent Poinsard
---
test/endtoend/backup_schedule_test.sh | 21 ++----
.../operator/401_scheduled_backups.yaml | 27 ++++----
test/endtoend/upgrade_test.sh | 65 ++++++++++++++-----
test/endtoend/utils.sh | 16 ++++-
4 files changed, 79 insertions(+), 50 deletions(-)
diff --git a/test/endtoend/backup_schedule_test.sh b/test/endtoend/backup_schedule_test.sh
index 8620bbc0..34ca9f94 100755
--- a/test/endtoend/backup_schedule_test.sh
+++ b/test/endtoend/backup_schedule_test.sh
@@ -11,29 +11,15 @@ function takedownShard() {
checkPodStatusWithTimeout "example-vttablet-zone1" 0
}
-function checkVitessBackupScheduleStatusWithTimeout() {
- regex=$1
-
- for i in {1..1200} ; do
- if [[ $(kubectl get VitessBackupSchedule | grep -E "${regex}" | wc -l) -eq 1 ]]; then
- echo "$regex found"
- return
- fi
- sleep 1
- done
- echo -e "ERROR: checkPodStatusWithTimeout timeout to find pod matching:\ngot:\n$out\nfor regex: $regex"
- exit 1
-}
-
function verifyListBackupsOutputWithSchedule() {
echo -e "Check for VitessBackupSchedule status"
checkVitessBackupScheduleStatusWithTimeout "example-vbsc-every-minute(.*)"
checkVitessBackupScheduleStatusWithTimeout "example-vbsc-every-five-minute(.*)"
echo -e "Check for number of backups in the cluster"
- # Sleep for 6 minutes, after 6 minutes we should at least have 3 backups: 1 from the initial vtbackup pod
+ # Sleep for 7 minutes, after 7 minutes we should at least have 3 backups: 1 from the initial vtbackup pod
# 1 minimum from the every minute schedule, and 1 from the every-five minute schedule
- for i in {1..360} ; do
+ for i in {1..420} ; do
# Ensure that we can view the backup files from the host.
docker exec -it $(docker container ls --format '{{.Names}}' | grep kind) chmod o+rwx -R /backup > /dev/null
backupCount=$(kubectl get vtb --no-headers | wc -l)
@@ -43,12 +29,13 @@ function verifyListBackupsOutputWithSchedule() {
fi
sleep 1
done
- if [[ "${backupCount}" -lt 3 ]]; then
+ if [[ "${backupCount}" -ge 3 ]]; then
echo "Did not find at least 3 backups"
exit 1
fi
echo -e "Check for Jobs' pods"
+ # Here check explicitly that the every five minute schedule ran at least once during the 7 minutes sleep
checkPodExistWithTimeout "example-vbsc-every-minute-(.*)0/1(.*)Completed(.*)"
checkPodExistWithTimeout "example-vbsc-every-five-minute-(.*)0/1(.*)Completed(.*)"
}
diff --git a/test/endtoend/operator/401_scheduled_backups.yaml b/test/endtoend/operator/401_scheduled_backups.yaml
index 52c18053..221ab22b 100644
--- a/test/endtoend/operator/401_scheduled_backups.yaml
+++ b/test/endtoend/operator/401_scheduled_backups.yaml
@@ -11,8 +11,8 @@ spec:
path: /backup
type: Directory
schedules:
- - name: "customer"
- schedule: "* * * * *"
+ - name: "commerce"
+ schedule: "*/2 * * * *"
resources:
requests:
cpu: 100m
@@ -22,16 +22,12 @@ spec:
successfulJobsHistoryLimit: 2
failedJobsHistoryLimit: 3
jobTimeoutMinute: 5
- suspend: true
strategies:
- - name: BackupShard
- keyspace: "customer"
- shard: "80-"
- - name: BackupShard
- keyspace: "customer"
- shard: "-80"
- - name: "commerce"
- schedule: "* * * * *"
+ - name: commerce_x
+ keyspace: "commerce"
+ shard: "-"
+ - name: "customer"
+ schedule: "*/2 * * * *"
resources:
requests:
cpu: 100m
@@ -42,9 +38,12 @@ spec:
failedJobsHistoryLimit: 3
jobTimeoutMinute: 5
strategies:
- - name: BackupShard
- keyspace: "commerce"
- shard: "-"
+ - name: customer_80-x
+ keyspace: "customer"
+ shard: "80-"
+ - name: customer_x-80
+ keyspace: "customer"
+ shard: "-80"
images:
vtctld: vitess/lite:latest
vtgate: vitess/lite:latest
diff --git a/test/endtoend/upgrade_test.sh b/test/endtoend/upgrade_test.sh
index 3bc4552c..256e2b7b 100755
--- a/test/endtoend/upgrade_test.sh
+++ b/test/endtoend/upgrade_test.sh
@@ -17,6 +17,7 @@ function move_tables() {
sleep 10
+ echo "Execute MoveTables"
vtctldclient LegacyVtctlCommand -- MoveTables --source commerce --tables 'customer,corder' Create customer.commerce2customer
if [ $? -ne 0 ]; then
echo "MoveTables failed"
@@ -26,6 +27,7 @@ function move_tables() {
sleep 10
+ echo "Execute VDiff"
vdiff_out=$(vtctldclient LegacyVtctlCommand -- VDiff customer.commerce2customer)
echo "$vdiff_out" | grep "ProcessedRows: 5" | wc -l | grep "2" > /dev/null
if [ $? -ne 0 ]; then
@@ -33,6 +35,7 @@ function move_tables() {
# Allow failure
fi
+ echo "SwitchTraffic for rdonly"
vtctldclient LegacyVtctlCommand -- MoveTables --tablet_types='rdonly,replica' SwitchTraffic customer.commerce2customer
if [ $? -ne 0 ]; then
echo "SwitchTraffic for rdonly and replica failed"
@@ -40,6 +43,7 @@ function move_tables() {
exit 1
fi
+ echo "SwitchTraffic for primary"
vtctldclient LegacyVtctlCommand -- MoveTables --tablet_types='primary' SwitchTraffic customer.commerce2customer
if [ $? -ne 0 ]; then
echo "SwitchTraffic for primary failed"
@@ -47,6 +51,7 @@ function move_tables() {
exit 1
fi
+ echo "Complete MoveTables"
vtctldclient LegacyVtctlCommand -- MoveTables Complete customer.commerce2customer
if [ $? -ne 0 ]; then
echo "MoveTables Complete failed"
@@ -177,6 +182,39 @@ EOF
waitForKeyspaceToBeServing customer 80- 2
}
+function scheduledBackups() {
+ echo "Apply 401_scheduled_backups.yaml"
+ kubectl apply -f 401_scheduled_backups.yaml > /dev/null
+
+ checkVitessBackupScheduleStatusWithTimeout "example-vbsc-commerce(.*)"
+ checkVitessBackupScheduleStatusWithTimeout "example-vbsc-customer(.*)"
+
+ docker exec -it $(docker container ls --format '{{.Names}}' | grep kind) chmod o+rwx -R /backup > /dev/null
+ initialCommerceBackups=$(kubectl get vtb --no-headers | grep "commerce-x-x" | wc -l)
+ initialCustomerFirstShardBackups=$(kubectl get vtb --no-headers | grep "customer-x-80" | wc -l)
+ initialCustomerSecondShardBackups=$(kubectl get vtb --no-headers | grep "customer-80-x" | wc -l)
+
+ for i in {1..300} ; do
+ commerceBackups=$(kubectl get vtb --no-headers | grep "commerce-x-x" | wc -l)
+ customerFirstShardBackups=$(kubectl get vtb --no-headers | grep "customer-x-80" | wc -l)
+ customerSecondShardBackups=$(kubectl get vtb --no-headers | grep "customer-80-x" | wc -l)
+
+ if [[ "${customerFirstShardBackups}" -ge $(( initialCustomerFirstShardBackups + 2 )) && "${customerSecondShardBackups}" -ge $(( initialCustomerSecondShardBackups + 2 )) && "${commerceBackups}" -ge $(( initialCommerceBackups + 2 )) ]]; then
+ echo "Found all backups"
+ return
+ else
+ echo "Got: ${customerFirstShardBackups} customer-x-80 backups but want: $(( initialCustomerFirstShardBackups + 2 ))"
+ echo "Got: ${customerSecondShardBackups} customer-80-x backups but want: $(( initialCustomerSecondShardBackups + 2 ))"
+ echo "Got: ${commerceBackups} commerce-x-x backups but want: $(( initialCommerceBackups + 2 ))"
+ echo ""
+ fi
+ sleep 10
+ done
+
+ echo "Did not find the backups on time"
+ exit 1
+}
+
function waitAndVerifySetup() {
sleep 10
checkPodStatusWithTimeout "example-zone1-vtctld(.*)1/1(.*)Running(.*)"
@@ -236,22 +274,8 @@ EOF
mkdir -p -m 777 ./vtdataroot/backup
echo "Building the docker image"
docker build -f build/Dockerfile.release -t vitess-operator-pr:latest .
-echo "Creating Kind cluster"
-if [[ "$BUILDKITE_BUILD_ID" != "0" ]]; then
- # The script is being run from buildkite, so we can't mount the current
- # working directory to kind. The current directory in the docker is workdir
- # So if we try and mount that, we get an error. Instead we need to mount the
- # path where the code was checked out be buildkite
- dockerContainerName=$(docker container ls --filter "ancestor=docker" --format '{{.Names}}')
- CHECKOUT_PATH=$(docker container inspect -f '{{range .Mounts}}{{ if eq .Destination "/workdir" }}{{println .Source }}{{ end }}{{end}}' "$dockerContainerName")
- BACKUP_DIR="$CHECKOUT_PATH/vtdataroot/backup"
-else
- BACKUP_DIR="$PWD/vtdataroot/backup"
-fi
-cat ./test/endtoend/kindBackupConfig.yaml | sed "s,PATH,$BACKUP_DIR,1" > ./vtdataroot/config.yaml
-kind create cluster --wait 30s --name kind-${BUILDKITE_BUILD_ID} --image ${KIND_VERSION} --config ./vtdataroot/config.yaml
-echo "Loading docker image into Kind cluster"
-kind load docker-image vitess-operator-pr:latest --name kind-${BUILDKITE_BUILD_ID}
+setupKindConfig
+createKindCluster
cd "$PWD/test/endtoend/operator"
killall kubectl
@@ -270,6 +294,11 @@ verifyDurabilityPolicy "commerce" "semi_sync"
move_tables
resharding
+scheduledBackups
+
# Teardown
-#echo "Deleting Kind cluster. This also deletes the volume associated with it"
-#kind delete cluster --name kind-${BUILDKITE_BUILD_ID}
+echo "Removing the temporary directory"
+removeBackupFiles
+rm -rf "$STARTING_DIR/vtdataroot"
+echo "Deleting Kind cluster. This also deletes the volume associated with it"
+kind delete cluster --name kind-${BUILDKITE_BUILD_ID}
diff --git a/test/endtoend/utils.sh b/test/endtoend/utils.sh
index 74e380f8..fbdd55c0 100644
--- a/test/endtoend/utils.sh
+++ b/test/endtoend/utils.sh
@@ -247,7 +247,7 @@ function waitForKeyspaceToBeServing() {
fi
echo "Shard $ks/$shard is not fully serving. Output: $out"
echo "Retrying (attempt #$i) ..."
- sleep 10
+ sleep 1
done
}
@@ -397,3 +397,17 @@ COrder
+----------+-------------+----------+-------+
EOF
}
+
+function checkVitessBackupScheduleStatusWithTimeout() {
+ regex=$1
+
+ for i in {1..1200} ; do
+ if [[ $(kubectl get VitessBackupSchedule | grep -E "${regex}" | wc -l) -eq 1 ]]; then
+ echo "$regex found"
+ return
+ fi
+ sleep 1
+ done
+  echo -e "ERROR: checkVitessBackupScheduleStatusWithTimeout timed out waiting for a VitessBackupSchedule matching: $regex"
+ exit 1
+}
From d29a799e8569b8511318affa89896685baef981b Mon Sep 17 00:00:00 2001
From: Florent Poinsard
Date: Tue, 25 Feb 2025 13:07:36 -0600
Subject: [PATCH 10/13] Remove unused function in vbsc controller
Signed-off-by: Florent Poinsard
---
.../vitessbackupschedule_controller.go | 15 +--------------
1 file changed, 1 insertion(+), 14 deletions(-)
diff --git a/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go b/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go
index 4091163a..16becc10 100644
--- a/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go
+++ b/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go
@@ -59,8 +59,7 @@ import (
)
const (
- controllerName = "vitessbackupschedule-controller"
- vtctldclientPath = "/vt/bin/vtctldclient"
+ controllerName = "vitessbackupschedule-controller"
)
var (
@@ -629,18 +628,6 @@ func (r *ReconcileVitessBackupsSchedule) createJobPod(
return p, vtbackupSpec, nil
}
-func createVtctldClientCommand(cmd *strings.Builder, serverAddr string, extraFlags map[string]string, keyspace, shard string) {
- cmd.WriteString(fmt.Sprintf("%s %s BackupShard", vtctldclientPath, serverAddr))
-
- // Add any flags
- for key, value := range extraFlags {
- cmd.WriteString(fmt.Sprintf(" --%s=%s", key, value))
- }
-
- // Add keyspace/shard
- cmd.WriteString(fmt.Sprintf(" %s/%s", keyspace, shard))
-}
-
func (r *ReconcileVitessBackupsSchedule) getVtctldServiceName(ctx context.Context, vbsc *planetscalev2.VitessBackupSchedule, cluster string) (svcName string, svcPort int32, err error) {
svcList := &corev1.ServiceList{}
listOpts := &client.ListOptions{
From 8d4f5ee559c762421e1d3bac9fd9647b67fe2c7d Mon Sep 17 00:00:00 2001
From: Florent Poinsard
Date: Tue, 25 Feb 2025 14:46:26 -0600
Subject: [PATCH 11/13] Increase timeout in verifyListBackupsOutputWithSchedule
Signed-off-by: Florent Poinsard
---
test/endtoend/backup_schedule_test.sh | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/test/endtoend/backup_schedule_test.sh b/test/endtoend/backup_schedule_test.sh
index 34ca9f94..db46a508 100755
--- a/test/endtoend/backup_schedule_test.sh
+++ b/test/endtoend/backup_schedule_test.sh
@@ -17,9 +17,9 @@ function verifyListBackupsOutputWithSchedule() {
checkVitessBackupScheduleStatusWithTimeout "example-vbsc-every-five-minute(.*)"
echo -e "Check for number of backups in the cluster"
- # Sleep for 7 minutes, after 7 minutes we should at least have 3 backups: 1 from the initial vtbackup pod
+ # Sleep for 10 minutes, after 10 minutes we should at least have 3 backups: 1 from the initial vtbackup pod
# 1 minimum from the every minute schedule, and 1 from the every-five minute schedule
- for i in {1..420} ; do
+ for i in {1..600} ; do
# Ensure that we can view the backup files from the host.
docker exec -it $(docker container ls --format '{{.Names}}' | grep kind) chmod o+rwx -R /backup > /dev/null
backupCount=$(kubectl get vtb --no-headers | wc -l)
@@ -35,7 +35,7 @@ function verifyListBackupsOutputWithSchedule() {
fi
echo -e "Check for Jobs' pods"
- # Here check explicitly that the every five minute schedule ran at least once during the 7 minutes sleep
+ # Here check explicitly that the every five minute schedule ran at least once during the 10 minutes sleep
checkPodExistWithTimeout "example-vbsc-every-minute-(.*)0/1(.*)Completed(.*)"
checkPodExistWithTimeout "example-vbsc-every-five-minute-(.*)0/1(.*)Completed(.*)"
}
From 28875f9c973d4534ef696ef647a198d4ec032408 Mon Sep 17 00:00:00 2001
From: Florent Poinsard
Date: Tue, 25 Feb 2025 15:25:31 -0600
Subject: [PATCH 12/13] Self review
Signed-off-by: Florent Poinsard
---
pkg/controller/vitessshard/reconcile_backup_job.go | 2 +-
test/endtoend/upgrade_test.sh | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/pkg/controller/vitessshard/reconcile_backup_job.go b/pkg/controller/vitessshard/reconcile_backup_job.go
index 23a82973..14aef2a7 100644
--- a/pkg/controller/vitessshard/reconcile_backup_job.go
+++ b/pkg/controller/vitessshard/reconcile_backup_job.go
@@ -84,7 +84,7 @@ func (r *ReconcileVitessShard) reconcileBackupJob(ctx context.Context, vts *plan
// scratch (not from any tablet). If we're wrong and a backup exists
// already, the idempotent vtbackup "initial backup" mode will just do
// nothing and return success.
- backupType := vitessbackup.TypeUpdate
+ backupType := vitessbackup.TypeFirstBackup
if vts.Status.HasMaster != corev1.ConditionTrue {
backupType = vitessbackup.TypeInit
}
diff --git a/test/endtoend/upgrade_test.sh b/test/endtoend/upgrade_test.sh
index 256e2b7b..4aca2c50 100755
--- a/test/endtoend/upgrade_test.sh
+++ b/test/endtoend/upgrade_test.sh
@@ -194,7 +194,7 @@ function scheduledBackups() {
initialCustomerFirstShardBackups=$(kubectl get vtb --no-headers | grep "customer-x-80" | wc -l)
initialCustomerSecondShardBackups=$(kubectl get vtb --no-headers | grep "customer-80-x" | wc -l)
- for i in {1..300} ; do
+ for i in {1..60} ; do
commerceBackups=$(kubectl get vtb --no-headers | grep "commerce-x-x" | wc -l)
customerFirstShardBackups=$(kubectl get vtb --no-headers | grep "customer-x-80" | wc -l)
customerSecondShardBackups=$(kubectl get vtb --no-headers | grep "customer-80-x" | wc -l)
@@ -281,7 +281,7 @@ cd "$PWD/test/endtoend/operator"
killall kubectl
setupKubectlAccessForCI
-get_started "operator-latest.yaml" "101_initial_cluster.yaml"
+get_started "operator.yaml" "101_initial_cluster.yaml"
verifyVtGateVersion "21.0.0"
checkSemiSyncSetup
# Initially too durability policy should be specified
From 91897cc8a220c94cbb0182dc7f769fac3e86c79b Mon Sep 17 00:00:00 2001
From: Florent Poinsard
Date: Tue, 25 Feb 2025 16:16:05 -0600
Subject: [PATCH 13/13] Fix test logic
Signed-off-by: Florent Poinsard
---
test/endtoend/backup_schedule_test.sh | 17 +++++++----------
1 file changed, 7 insertions(+), 10 deletions(-)
diff --git a/test/endtoend/backup_schedule_test.sh b/test/endtoend/backup_schedule_test.sh
index db46a508..23393289 100755
--- a/test/endtoend/backup_schedule_test.sh
+++ b/test/endtoend/backup_schedule_test.sh
@@ -25,19 +25,16 @@ function verifyListBackupsOutputWithSchedule() {
backupCount=$(kubectl get vtb --no-headers | wc -l)
echo "Found ${backupCount} backups"
if [[ "${backupCount}" -ge 3 ]]; then
- break
+ echo -e "Check for Jobs' pods"
+ # Here check explicitly that the every five minute schedule ran at least once during the 10 minutes sleep
+ checkPodExistWithTimeout "example-vbsc-every-minute-(.*)0/1(.*)Completed(.*)"
+ checkPodExistWithTimeout "example-vbsc-every-five-minute-(.*)0/1(.*)Completed(.*)"
+ return
fi
sleep 1
done
- if [[ "${backupCount}" -ge 3 ]]; then
- echo "Did not find at least 3 backups"
- exit 1
- fi
-
- echo -e "Check for Jobs' pods"
- # Here check explicitly that the every five minute schedule ran at least once during the 10 minutes sleep
- checkPodExistWithTimeout "example-vbsc-every-minute-(.*)0/1(.*)Completed(.*)"
- checkPodExistWithTimeout "example-vbsc-every-five-minute-(.*)0/1(.*)Completed(.*)"
+ echo "Did not find at least 3 backups"
+ exit 1
}
# Test setup