diff --git a/deploy/crds/planetscale.com_vitessbackupschedules.yaml b/deploy/crds/planetscale.com_vitessbackupschedules.yaml index 2bb4c666..dbce574a 100644 --- a/deploy/crds/planetscale.com_vitessbackupschedules.yaml +++ b/deploy/crds/planetscale.com_vitessbackupschedules.yaml @@ -38,6 +38,7 @@ spec: cluster: type: string concurrencyPolicy: + default: Forbid enum: - Allow - Forbid @@ -113,8 +114,6 @@ spec: example: commerce type: string name: - enum: - - BackupShard type: string shard: example: '-' @@ -164,6 +163,11 @@ spec: lastScheduledTime: format: date-time type: string + lastScheduledTimes: + additionalProperties: + format: date-time + type: string + type: object type: object type: object served: true diff --git a/deploy/crds/planetscale.com_vitessclusters.yaml b/deploy/crds/planetscale.com_vitessclusters.yaml index 241540e6..3646e9d9 100644 --- a/deploy/crds/planetscale.com_vitessclusters.yaml +++ b/deploy/crds/planetscale.com_vitessclusters.yaml @@ -168,6 +168,7 @@ spec: type: string type: object concurrencyPolicy: + default: Forbid enum: - Allow - Forbid @@ -239,8 +240,6 @@ spec: example: commerce type: string name: - enum: - - BackupShard type: string shard: example: '-' diff --git a/docs/api.md b/docs/api.md index f654e80a..661e2975 100644 --- a/docs/api.md +++ b/docs/api.md @@ -647,16 +647,6 @@ SecretSource -

BackupStrategyName -(string alias)

-

-(Appears on: -VitessBackupScheduleStrategy) -

-

-

BackupStrategyName describes the vtctldclient command that will be used to take a backup. -When scheduling a backup, you must specify at least one strategy.

-

CephBackupLocation

@@ -2600,7 +2590,8 @@ The PullPolicy used will be the same as the one used to pull the vtctld image. (Optional) -

A list of pointers to currently running jobs.

+

A list of pointers to currently running jobs. +This field is deprecated and no longer used in >= v2.15. It will be removed in a future release.

@@ -2614,7 +2605,25 @@ Kubernetes meta/v1.Time (Optional) -

Information when was the last time the job was successfully scheduled.

+

Information when was the last time the job was successfully scheduled. +This field is deprecated and no longer used in >= v2.15. It will be removed in a future release. +Please use lastScheduledTimes instead which maps the last schedule time to each VitessBackupScheduleStrategy +in the VitessBackupSchedule.

+ + + + +lastScheduledTimes
+ + +map[string]*k8s.io/apimachinery/pkg/apis/meta/v1.Time + + + + +(Optional) +

LastScheduledTimes lists for every VitessBackupScheduleStrategy what is the last schedule we executed. +It does not list when the last execution started, but to which scheduled time it corresponds.

@@ -2642,9 +2651,7 @@ command line that will be executed in the Job’s pod.

name
- -BackupStrategyName - +string @@ -2741,11 +2748,9 @@ string

Strategy defines how we are going to take a backup. If you want to take several backups within the same schedule you can add more items -to the Strategy list. Each VitessBackupScheduleStrategy will be executed by the same -kubernetes job. This is useful if for instance you have one schedule, and you want to -take a backup of all shards in a keyspace and don’t want to re-create a second schedule. -All the VitessBackupScheduleStrategy are concatenated into a single shell command that -is executed when the Job’s container starts.

+to the Strategy list. Each VitessBackupScheduleStrategy will be executed within different +K8S Jobs. This is useful if you want to have a single schedule backing up multiple shards +at the same time.

@@ -2828,8 +2833,8 @@ ConcurrencyPolicy (Optional)

ConcurrencyPolicy specifies ho to treat concurrent executions of a Job. Valid values are: -- “Allow” (default): allows CronJobs to run concurrently; -- “Forbid”: forbids concurrent runs, skipping next run if previous run hasn’t finished yet; +- “Allow”: allows CronJobs to run concurrently; +- “Forbid” (default): forbids concurrent runs, skipping next run if previous run hasn’t finished yet; - “Replace”: cancels currently running job and replaces it with a new one.
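To make the reworked strategies contract documented above concrete: each entry in the strategies list is now expected to map to its own Kubernetes Job, and name is a plain string that also serves as the key in the new lastScheduledTimes status map. Below is a minimal, hedged Go sketch using the planetscalev2 types touched by this change; the strategy names, keyspace, and shards are sample values only, and compiling it assumes the vitess-operator module is on the import path.

```go
package main

import (
	"fmt"

	planetscalev2 "planetscale.dev/vitess-operator/pkg/apis/planetscale/v2"
)

func main() {
	// One schedule, two strategies: with this change the operator is expected to
	// create a dedicated Job per strategy instead of concatenating all strategies
	// into a single shell command.
	strategies := []planetscalev2.VitessBackupScheduleStrategy{
		{Name: "customer_x-80", Keyspace: "customer", Shard: "-80"},
		{Name: "customer_80-x", Keyspace: "customer", Shard: "80-"},
	}
	for _, s := range strategies {
		fmt.Printf("strategy %q -> one backup Job for %s/%s\n", s.Name, s.Keyspace, s.Shard)
	}
}
```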

diff --git a/docs/api/index.html b/docs/api/index.html index 870a3e63..2934ee89 100644 --- a/docs/api/index.html +++ b/docs/api/index.html @@ -649,16 +649,6 @@

AzblobBackupLocation -

BackupStrategyName -(string alias)

-

-(Appears on: -VitessBackupScheduleStrategy) -

-

-

BackupStrategyName describes the vtctldclient command that will be used to take a backup. -When scheduling a backup, you must specify at least one strategy.

-

CephBackupLocation

@@ -2602,7 +2592,8 @@

VitessBackupScheduleStatu (Optional) -

A list of pointers to currently running jobs.

+

A list of pointers to currently running jobs. +This field is deprecated and no longer used in >= v2.15. It will be removed in a future release.

@@ -2616,7 +2607,25 @@

VitessBackupScheduleStatu (Optional) -

Information when was the last time the job was successfully scheduled.

+

Information when was the last time the job was successfully scheduled. +This field is deprecated and no longer used in >= v2.15. It will be removed in a future release. +Please use lastScheduledTimes instead which maps the last schedule time to each VitessBackupScheduleStrategy +in the VitessBackupSchedule.

+ + + + +lastScheduledTimes
+ + +map[string]*k8s.io/apimachinery/pkg/apis/meta/v1.Time + + + + +(Optional) +

LastScheduledTimes lists for every VitessBackupScheduleStrategy what is the last schedule we executed. +It does not list when the last execution started, but to which scheduled time it corresponds.

@@ -2644,9 +2653,7 @@

VitessBackupScheduleStr name
- -BackupStrategyName - +string @@ -2743,11 +2750,9 @@

VitessBackupScheduleTem

Strategy defines how we are going to take a backup. If you want to take several backups within the same schedule you can add more items -to the Strategy list. Each VitessBackupScheduleStrategy will be executed by the same -kubernetes job. This is useful if for instance you have one schedule, and you want to -take a backup of all shards in a keyspace and don’t want to re-create a second schedule. -All the VitessBackupScheduleStrategy are concatenated into a single shell command that -is executed when the Job’s container starts.

+to the Strategy list. Each VitessBackupScheduleStrategy will be executed within different +K8S Jobs. This is useful if you want to have a single schedule backing up multiple shards +at the same time.

@@ -2830,8 +2835,8 @@

VitessBackupScheduleTem (Optional)

ConcurrencyPolicy specifies ho to treat concurrent executions of a Job. Valid values are: -- “Allow” (default): allows CronJobs to run concurrently; -- “Forbid”: forbids concurrent runs, skipping next run if previous run hasn’t finished yet; +- “Allow”: allows CronJobs to run concurrently; +- “Forbid” (default): forbids concurrent runs, skipping next run if previous run hasn’t finished yet; - “Replace”: cancels currently running job and replaces it with a new one.
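The lastScheduledTimes field documented above is a map from strategy name to the scheduled time that the most recent Job corresponded to (not the time that Job started). A small hedged sketch of the shape of that data, using only apimachinery types; the strategy name is a sample value.

```go
package main

import (
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// Shape of status.lastScheduledTimes: one entry per strategy name, holding
	// the cron slot the most recent Job was created for.
	lastScheduledTimes := map[string]*metav1.Time{
		"customer_x-80": {Time: time.Now().Truncate(time.Minute)},
	}
	for name, t := range lastScheduledTimes {
		fmt.Printf("strategy %s last scheduled for %s\n", name, t.Format(time.RFC3339))
	}
}
```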

diff --git a/pkg/apis/planetscale/v2/vitessbackupschedule_types.go b/pkg/apis/planetscale/v2/vitessbackupschedule_types.go index ff29654d..e33d4061 100644 --- a/pkg/apis/planetscale/v2/vitessbackupschedule_types.go +++ b/pkg/apis/planetscale/v2/vitessbackupschedule_types.go @@ -34,16 +34,6 @@ const ( ForbidConcurrent ConcurrencyPolicy = "Forbid" ) -// BackupStrategyName describes the vtctldclient command that will be used to take a backup. -// When scheduling a backup, you must specify at least one strategy. -// +kubebuilder:validation:Enum=BackupShard -type BackupStrategyName string - -const ( - // BackupShard will use the "vtctldclient BackupShard" command to take a backup - BackupShard BackupStrategyName = "BackupShard" -) - // VitessBackupSchedule is the Schema for the VitessBackupSchedule API. // +kubebuilder:object:root=true // +kubebuilder:subresource:status @@ -98,12 +88,12 @@ type VitessBackupScheduleTemplate struct { // Strategy defines how we are going to take a backup. // If you want to take several backups within the same schedule you can add more items - // to the Strategy list. Each VitessBackupScheduleStrategy will be executed by the same - // kubernetes job. This is useful if for instance you have one schedule, and you want to - // take a backup of all shards in a keyspace and don't want to re-create a second schedule. - // All the VitessBackupScheduleStrategy are concatenated into a single shell command that - // is executed when the Job's container starts. + // to the Strategy list. Each VitessBackupScheduleStrategy will be executed within different + // K8S Jobs. This is useful if you want to have a single schedule backing up multiple shards + // at the same time. // +kubebuilder:validation:MinItems=1 + // +patchMergeKey=name + // +patchStrategy=merge Strategy []VitessBackupScheduleStrategy `json:"strategies"` // Resources specify the compute resources to allocate for every Jobs's pod. @@ -136,11 +126,12 @@ type VitessBackupScheduleTemplate struct { // ConcurrencyPolicy specifies ho to treat concurrent executions of a Job. // Valid values are: - // - "Allow" (default): allows CronJobs to run concurrently; - // - "Forbid": forbids concurrent runs, skipping next run if previous run hasn't finished yet; + // - "Allow": allows CronJobs to run concurrently; + // - "Forbid" (default): forbids concurrent runs, skipping next run if previous run hasn't finished yet; // - "Replace": cancels currently running job and replaces it with a new one. // +optional // +kubebuilder:example="Forbid" + // +kubebuilder:default="Forbid" ConcurrencyPolicy ConcurrencyPolicy `json:"concurrencyPolicy,omitempty"` // AllowedMissedRuns defines how many missed run of the schedule will be allowed before giving up on finding the last job. @@ -180,7 +171,7 @@ type VitessBackupScheduleTemplate struct { // command line that will be executed in the Job's pod. type VitessBackupScheduleStrategy struct { // Name of the backup strategy. - Name BackupStrategyName `json:"name"` + Name string `json:"name"` // Keyspace defines the keyspace on which we want to take the backup. // +kubebuilder:example="commerce" @@ -198,12 +189,32 @@ type VitessBackupScheduleStrategy struct { // VitessBackupScheduleStatus defines the observed state of VitessBackupSchedule type VitessBackupScheduleStatus struct { // A list of pointers to currently running jobs. + // This field is deprecated and no longer used in >= v2.15. It will be removed in a future release. 
// +optional Active []corev1.ObjectReference `json:"active,omitempty"` // Information when was the last time the job was successfully scheduled. + // This field is deprecated and no longer used in >= v2.15. It will be removed in a future release. + // Please use lastScheduledTimes instead which maps the last schedule time to each VitessBackupScheduleStrategy + // in the VitessBackupSchedule. // +optional LastScheduledTime *metav1.Time `json:"lastScheduledTime,omitempty"` + + // LastScheduledTimes lists for every VitessBackupScheduleStrategy what is the last schedule we executed. + // It does not list when the last execution started, but to which scheduled time it corresponds. + // +optional + LastScheduledTimes map[string]*metav1.Time `json:"lastScheduledTimes,omitempty"` +} + +// NewVitessBackupScheduleStatus creates a new status with default values. +func NewVitessBackupScheduleStatus(status VitessBackupScheduleStatus) VitessBackupScheduleStatus { + newStatus := VitessBackupScheduleStatus{ + LastScheduledTimes: status.LastScheduledTimes, + } + if status.LastScheduledTimes == nil { + newStatus.LastScheduledTimes = make(map[string]*metav1.Time) + } + return newStatus } func init() { diff --git a/pkg/apis/planetscale/v2/zz_generated.deepcopy.go b/pkg/apis/planetscale/v2/zz_generated.deepcopy.go index ace6b890..700716d2 100644 --- a/pkg/apis/planetscale/v2/zz_generated.deepcopy.go +++ b/pkg/apis/planetscale/v2/zz_generated.deepcopy.go @@ -7,6 +7,7 @@ package v2 import ( autoscalingv2 "k8s.io/api/autoscaling/v2" "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" ) @@ -855,6 +856,13 @@ func (in *VitessBackupScheduleStatus) DeepCopyInto(out *VitessBackupScheduleStat in, out := &in.LastScheduledTime, &out.LastScheduledTime *out = (*in).DeepCopy() } + if in.LastScheduledTimes != nil { + in, out := &in.LastScheduledTimes, &out.LastScheduledTimes + *out = make(map[string]*metav1.Time, len(*in)) + for key, val := range *in { + (*out)[key] = val.DeepCopy() + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VitessBackupScheduleStatus. 
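NewVitessBackupScheduleStatus above guarantees a non-nil LastScheduledTimes map, so the controller can record per-strategy times without a nil-map check, while carrying over any entries that are already present. A hedged usage sketch, again assuming the vitess-operator module is importable:

```go
package main

import (
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	planetscalev2 "planetscale.dev/vitess-operator/pkg/apis/planetscale/v2"
)

func main() {
	// Fresh object: the constructor initializes an empty, non-nil map.
	status := planetscalev2.NewVitessBackupScheduleStatus(planetscalev2.VitessBackupScheduleStatus{})
	status.LastScheduledTimes["commerce_x"] = &metav1.Time{Time: time.Now()}

	// Subsequent reconciles: existing entries are carried over as-is.
	next := planetscalev2.NewVitessBackupScheduleStatus(status)
	fmt.Println(next.LastScheduledTimes["commerce_x"] != nil) // true
}
```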
diff --git a/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go b/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go index c105fe9d..16becc10 100644 --- a/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go +++ b/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go @@ -22,6 +22,7 @@ import ( "fmt" "maps" "math/rand/v2" + "strconv" "strings" "sort" @@ -31,12 +32,18 @@ import ( "github.com/sirupsen/logrus" kbatch "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" + apiequality "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" apilabels "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/tools/record" + "planetscale.dev/vitess-operator/pkg/controller/vitessshard" "planetscale.dev/vitess-operator/pkg/operator/metrics" + "planetscale.dev/vitess-operator/pkg/operator/names" "planetscale.dev/vitess-operator/pkg/operator/reconciler" "planetscale.dev/vitess-operator/pkg/operator/results" + "planetscale.dev/vitess-operator/pkg/operator/resync" + "planetscale.dev/vitess-operator/pkg/operator/vitessbackup" + "planetscale.dev/vitess-operator/pkg/operator/vttablet" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -46,19 +53,18 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" - ref "k8s.io/client-go/tools/reference" planetscalev2 "planetscale.dev/vitess-operator/pkg/apis/planetscale/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" ) const ( - controllerName = "vitessbackupschedule-controller" - vtctldclientPath = "/vt/bin/vtctldclient" + controllerName = "vitessbackupschedule-controller" ) var ( maxConcurrentReconciles = flag.Int("vitessbackupschedule_concurrent_reconciles", 10, "the maximum number of different vitessbackupschedule resources to reconcile concurrently") + resyncPeriod = flag.Duration("vitessbackupschedule_resync_period", 1*time.Minute, "reconcile vitessbackupschedules with this period even if no Kubernetes events occur") scheduledTimeAnnotation = "planetscale.com/backup-scheduled-at" @@ -75,6 +81,7 @@ type ( ReconcileVitessBackupsSchedule struct { client client.Client scheme *runtime.Scheme + resync *resync.Periodic recorder record.EventRecorder reconciler *reconciler.Reconciler } @@ -106,6 +113,7 @@ func newReconciler(mgr manager.Manager) (*ReconcileVitessBackupsSchedule, error) return &ReconcileVitessBackupsSchedule{ client: c, scheme: scheme, + resync: resync.NewPeriodic(controllerName, *resyncPeriod), recorder: recorder, reconciler: reconciler.New(c, scheme, recorder), }, nil @@ -140,6 +148,11 @@ func add(mgr manager.Manager, r *ReconcileVitessBackupsSchedule) error { } } + // Periodically resync even when no Kubernetes events have come in. 
+ if err := c.Watch(r.resync.WatchSource()); err != nil { + return err + } + return nil } @@ -151,16 +164,14 @@ func add(mgr manager.Manager, r *ReconcileVitessBackupsSchedule) error { // - List all jobs and define the last scheduled Job // - Clean up old Job objects // - Create a new Job if needed -func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - resultBuilder := &results.Builder{} - +// - Update the VitessBackupSchedule Status +func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl.Request) (res ctrl.Result, err error) { log = log.WithFields(logrus.Fields{ "namespace": req.Namespace, "VitessBackupSchedule": req.Name, }) log.Info("Reconciling VitessBackupSchedule") - var err error var vbsc planetscalev2.VitessBackupSchedule if err = r.client.Get(ctx, req.NamespacedName, &vbsc); err != nil { log.WithError(err).Error(" unable to fetch VitessBackupSchedule") @@ -168,26 +179,74 @@ func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl // Request object not found, could have been deleted after reconcile request. // Owned objects are automatically garbage collected. For additional cleanup logic use finalizers. // Return and don't requeue - return resultBuilder.Result() + return ctrl.Result{}, nil } // Error reading the object - requeue the request. - return resultBuilder.Error(err) + return ctrl.Result{}, err } + // If the Suspend setting is set to true, we can skip adding any job, our work is done here. + if vbsc.Spec.Suspend != nil && *vbsc.Spec.Suspend { + log.Info("VitessBackupSchedule suspended, skipping") + return ctrl.Result{}, nil + } + + oldStatus := vbsc.DeepCopy() + vbsc.Status = planetscalev2.NewVitessBackupScheduleStatus(vbsc.Status) + // Register this reconciling attempt no matter if we fail or succeed. defer func() { reconcileCount.WithLabelValues(vbsc.Name, metrics.Result(err)).Inc() }() - jobs, mostRecentTime, err := r.getJobsList(ctx, req, vbsc.Name) - if err != nil { - // We had an error reading the jobs, we can requeue. - return resultBuilder.Error(err) + resultBuilder := &results.Builder{} + _, _ = resultBuilder.Merge(r.reconcileStrategies(ctx, req, vbsc)) + + if !apiequality.Semantic.DeepEqual(&vbsc.Status, &oldStatus) { + if err := r.client.Status().Update(ctx, &vbsc); err != nil { + if !apierrors.IsConflict(err) { + log.WithError(err).Error("unable to update VitessBackupSchedule status") + } + _, _ = resultBuilder.Error(err) + } + } + + // Request a periodic resync of this VitessBackupSchedule object to check + // if existing Jobs have finished or timed out and need to be cleaned up + // even if no Kubernetes events have occurred. 
+ r.resync.Enqueue(req.NamespacedName) + + return resultBuilder.Result() +} + +func (r *ReconcileVitessBackupsSchedule) reconcileStrategies(ctx context.Context, req ctrl.Request, vbsc planetscalev2.VitessBackupSchedule) (ctrl.Result, error) { + resultBuilder := &results.Builder{} + + for _, strategy := range vbsc.Spec.Strategy { + _, _ = resultBuilder.Merge(r.reconcileStrategy(ctx, strategy, req, vbsc)) } + return resultBuilder.Result() +} - err = r.updateVitessBackupScheduleStatus(ctx, mostRecentTime, vbsc, jobs.active) +func (r *ReconcileVitessBackupsSchedule) reconcileStrategy( + ctx context.Context, + strategy planetscalev2.VitessBackupScheduleStrategy, + req ctrl.Request, + vbsc planetscalev2.VitessBackupSchedule, +) (reconcile.Result, error) { + resultBuilder := &results.Builder{} + + start, end, ok := strings.Cut(strategy.Shard, "-") + if !ok { + return resultBuilder.Error(fmt.Errorf("invalid strategy shard: %s", strategy.Shard)) + } + vkr := planetscalev2.VitessKeyRange{ + Start: start, + End: end, + } + jobs, mostRecentTime, err := r.getJobsList(ctx, req, vbsc, strategy.Keyspace, vkr.SafeName()) if err != nil { - // We had an error updating the status, we can requeue. + // We had an error reading the jobs, we can requeue. return resultBuilder.Error(err) } @@ -202,25 +261,22 @@ func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl return resultBuilder.Error(err) } - // If the Suspend setting is set to true, we can skip adding any job, our work is done here. - if vbsc.Spec.Suspend != nil && *vbsc.Spec.Suspend { - log.Info("VitessBackupSchedule suspended, skipping") - return ctrl.Result{}, nil - } - - missedRun, nextRun, err := getNextSchedule(vbsc, time.Now()) + missedRun, nextRun, err := getNextSchedule(vbsc, time.Now(), mostRecentTime) if err != nil { log.Error(err, "unable to figure out VitessBackupSchedule schedule") // Re-queuing here does not make sense as we have an error with the schedule and the user needs to fix it first. - return ctrl.Result{}, nil + return resultBuilder.Error(reconcile.TerminalError(err)) } - // Ask kubernetes to re-queue for the next scheduled job, and skip if we don't miss any run. - scheduledResult := ctrl.Result{RequeueAfter: nextRun.Sub(time.Now())} + // If we did not miss any run, we can skip and not requeue anything if missedRun.IsZero() { - return scheduledResult, nil + return resultBuilder.Result() } + // Keep track of when we need to requeue this job + requeueAfter := nextRun.Sub(time.Now()) + _, _ = resultBuilder.RequeueAfter(requeueAfter) + // Check whether we are too late to create this Job or not. The startingDeadlineSeconds field will help us // schedule Jobs that are late. tooLate := false @@ -229,37 +285,39 @@ func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl } if tooLate { log.Infof("missed starting deadline for latest run; skipping; next run is scheduled for: %s", nextRun.Format(time.RFC3339)) - return scheduledResult, nil + return resultBuilder.Result() } // Check concurrency policy and skip this job if we have ForbidConcurrent set plus an active job if vbsc.Spec.ConcurrencyPolicy == planetscalev2.ForbidConcurrent && len(jobs.active) > 0 { log.Infof("concurrency policy blocks concurrent runs: skipping, number of active jobs: %d", len(jobs.active)) - return scheduledResult, nil + return resultBuilder.Result() } // Now that the different policies are checked, we can create and apply our new job. 
- job, err := r.createJob(ctx, &vbsc, missedRun) + job, err := r.createJob(ctx, vbsc, strategy, missedRun, vkr) if err != nil { // Re-queuing here does not make sense as we have an error with the template and the user needs to fix it first. log.WithError(err).Error("unable to construct job from template") - return ctrl.Result{}, err + return resultBuilder.Error(reconcile.TerminalError(err)) } + if err = r.client.Create(ctx, job); err != nil { // if the job already exists it means another reconciling loop created the job since we last fetched // the list of jobs to create, we can safely return without failing. if apierrors.IsAlreadyExists(err) { - return ctrl.Result{}, nil + log.Infof("job %s already exists, will retry in %s", job.Name, requeueAfter.String()) + return resultBuilder.Result() } // Simply re-queue here return resultBuilder.Error(err) } - - log.Infof("created new job: %s, next job scheduled in %s", job.Name, scheduledResult.RequeueAfter.String()) - return scheduledResult, nil + log.Infof("created new job: %s, next job scheduled in %s", job.Name, requeueAfter.String()) + vbsc.Status.LastScheduledTimes[strategy.Name] = &metav1.Time{Time: missedRun} + return resultBuilder.Result() } -func getNextSchedule(vbsc planetscalev2.VitessBackupSchedule, now time.Time) (time.Time, time.Time, error) { +func getNextSchedule(vbsc planetscalev2.VitessBackupSchedule, now time.Time, mostRecentTime *time.Time) (time.Time, time.Time, error) { sched, err := cron.ParseStandard(vbsc.Spec.Schedule) if err != nil { return time.Time{}, time.Time{}, fmt.Errorf("unable to parse schedule %q: %v", vbsc.Spec.Schedule, err) @@ -267,30 +325,27 @@ func getNextSchedule(vbsc planetscalev2.VitessBackupSchedule, now time.Time) (ti // Set the last scheduled time by either looking at the VitessBackupSchedule's Status or // by looking at its creation time. - var latestRun time.Time - if vbsc.Status.LastScheduledTime != nil { - latestRun = vbsc.Status.LastScheduledTime.Time - } else { - latestRun = vbsc.ObjectMeta.CreationTimestamp.Time + if mostRecentTime == nil { + mostRecentTime = &vbsc.ObjectMeta.CreationTimestamp.Time } if vbsc.Spec.StartingDeadlineSeconds != nil { // controller is not going to schedule anything below this point schedulingDeadline := now.Add(-time.Second * time.Duration(*vbsc.Spec.StartingDeadlineSeconds)) - if schedulingDeadline.After(latestRun) { - latestRun = schedulingDeadline + if schedulingDeadline.After(*mostRecentTime) { + *mostRecentTime = schedulingDeadline } } // Next schedule is later, simply return the next scheduled time. 
- if latestRun.After(now) { + if mostRecentTime.After(now) { return time.Time{}, sched.Next(now), nil } var lastMissed time.Time missedRuns := 0 - for t := sched.Next(latestRun); !t.After(now); t = sched.Next(t) { + for t := sched.Next(*mostRecentTime); !t.After(now); t = sched.Next(t) { lastMissed = t missedRuns++ @@ -303,37 +358,24 @@ func getNextSchedule(vbsc planetscalev2.VitessBackupSchedule, now time.Time) (ti return lastMissed, sched.Next(now), nil } -func (r *ReconcileVitessBackupsSchedule) updateVitessBackupScheduleStatus(ctx context.Context, mostRecentTime *time.Time, vbsc planetscalev2.VitessBackupSchedule, activeJobs []*kbatch.Job) error { - if mostRecentTime != nil { - vbsc.Status.LastScheduledTime = &metav1.Time{Time: *mostRecentTime} - } else { - vbsc.Status.LastScheduledTime = nil - } - - vbsc.Status.Active = make([]corev1.ObjectReference, 0, len(activeJobs)) - for _, activeJob := range activeJobs { - jobRef, err := ref.GetReference(r.scheme, activeJob) - if err != nil { - log.WithError(err).Errorf("unable to make reference to active job: %s", jobRef.Name) - continue - } - vbsc.Status.Active = append(vbsc.Status.Active, *jobRef) - } - - if err := r.client.Status().Update(ctx, &vbsc); err != nil { - log.WithError(err).Error("unable to update VitessBackupSchedule status") - return err - } - return nil -} - // getJobsList fetches all existing Jobs in the cluster and return them by categories: active, failed or successful. // It also returns at what time was the last job created, which is needed to update VitessBackupSchedule's status, // and plan future jobs. -func (r *ReconcileVitessBackupsSchedule) getJobsList(ctx context.Context, req ctrl.Request, vbscName string) (jobsList, *time.Time, error) { +func (r *ReconcileVitessBackupsSchedule) getJobsList( + ctx context.Context, + req ctrl.Request, + vbsc planetscalev2.VitessBackupSchedule, + keyspace string, + shardSafeName string, +) (jobsList, *time.Time, error) { var existingJobs kbatch.JobList - err := r.client.List(ctx, &existingJobs, client.InNamespace(req.Namespace), client.MatchingLabels{planetscalev2.BackupScheduleLabel: vbscName}) + err := r.client.List(ctx, &existingJobs, client.InNamespace(req.Namespace), client.MatchingLabels{ + planetscalev2.BackupScheduleLabel: vbsc.Name, + planetscalev2.ClusterLabel: vbsc.Spec.Cluster, + planetscalev2.KeyspaceLabel: keyspace, + planetscalev2.ShardLabel: shardSafeName, + }) if err != nil && !apierrors.IsNotFound(err) { log.WithError(err).Error("unable to list Jobs in cluster") return jobsList{}, nil, err @@ -386,11 +428,27 @@ func (r *ReconcileVitessBackupsSchedule) cleanupJobsWithLimit(ctx context.Contex if int32(i) >= int32(len(jobs))-limit { break } + // delete the job if err := r.client.Delete(ctx, job, client.PropagationPolicy(metav1.DeletePropagationBackground)); (err) != nil { log.WithError(err).Errorf("unable to delete old job: %s", job.Name) } else { log.Infof("deleted old job: %s", job.Name) } + + // delete the vtbackup pod's PVC + pvc := &corev1.PersistentVolumeClaim{} + err := r.client.Get(ctx, client.ObjectKey{Namespace: job.Namespace, Name: job.Name}, pvc) + if err != nil { + log.WithError(err).Errorf("unable to get PVC for job: %s", job.Name) + if apierrors.IsNotFound(err) { + continue + } + } + if err := r.client.Delete(ctx, pvc, client.PropagationPolicy(metav1.DeletePropagationBackground)); (err) != nil { + log.WithError(err).Errorf("unable to delete old PVC for job: %s", job.Name) + } else { + log.Infof("deleted old PVC for job: %s", job.Name) + } } } @@ -410,6 
+468,20 @@ func (r *ReconcileVitessBackupsSchedule) removeTimeoutJobs(ctx context.Context, log.Infof("deleted timed out job: %s", job.Name) } timeoutJobsCount.WithLabelValues(vbscName, metrics.Result(err)).Inc() + + pvc := &corev1.PersistentVolumeClaim{} + err := r.client.Get(ctx, client.ObjectKey{Namespace: job.Namespace, Name: job.Name}, pvc) + if err != nil { + log.WithError(err).Errorf("unable to get PVC for timed out job: %s", job.Name) + if apierrors.IsNotFound(err) { + continue + } + } + if err := r.client.Delete(ctx, pvc, client.PropagationPolicy(metav1.DeletePropagationBackground)); (err) != nil { + log.WithError(err).Errorf("unable to delete old PVC for timed out job: %s", job.Name) + } else { + log.Infof("deleted old PVC for timed out job: %s", job.Name) + } } } return nil @@ -439,13 +511,24 @@ func getScheduledTimeForJob(job *kbatch.Job) (*time.Time, error) { return &timeParsed, nil } -func (r *ReconcileVitessBackupsSchedule) createJob(ctx context.Context, vbsc *planetscalev2.VitessBackupSchedule, scheduledTime time.Time) (*kbatch.Job, error) { - name := fmt.Sprintf("%s-%d", vbsc.Name, scheduledTime.Unix()) +func (r *ReconcileVitessBackupsSchedule) createJob( + ctx context.Context, + vbsc planetscalev2.VitessBackupSchedule, + strategy planetscalev2.VitessBackupScheduleStrategy, + scheduledTime time.Time, + vkr planetscalev2.VitessKeyRange, +) (*kbatch.Job, error) { + name := names.JoinWithConstraints(names.ServiceConstraints, vbsc.Name, strategy.Keyspace, vkr.SafeName(), strconv.Itoa(int(scheduledTime.Unix()))) + + labels := map[string]string{ + planetscalev2.BackupScheduleLabel: vbsc.Name, + planetscalev2.ClusterLabel: vbsc.Spec.Cluster, + planetscalev2.KeyspaceLabel: strategy.Keyspace, + planetscalev2.ShardLabel: vkr.SafeName(), + } meta := metav1.ObjectMeta{ - Labels: map[string]string{ - planetscalev2.BackupScheduleLabel: vbsc.Name, - }, + Labels: labels, Annotations: make(map[string]string), Name: name, Namespace: vbsc.Namespace, @@ -457,7 +540,7 @@ func (r *ReconcileVitessBackupsSchedule) createJob(ctx context.Context, vbsc *pl maps.Copy(meta.Labels, vbsc.Labels) - pod, err := r.createJobPod(ctx, vbsc, name) + pod, vtbackupSpec, err := r.createJobPod(ctx, vbsc, strategy, name, vkr, labels) if err != nil { return nil, err } @@ -466,76 +549,83 @@ func (r *ReconcileVitessBackupsSchedule) createJob(ctx context.Context, vbsc *pl Spec: kbatch.JobSpec{ Template: corev1.PodTemplateSpec{ ObjectMeta: meta, - Spec: pod, + Spec: pod.Spec, }, }, } - if err := ctrl.SetControllerReference(vbsc, job, r.scheme); err != nil { + if err := ctrl.SetControllerReference(&vbsc, job, r.scheme); err != nil { return nil, err } - return job, nil -} - -func (r *ReconcileVitessBackupsSchedule) createJobPod(ctx context.Context, vbsc *planetscalev2.VitessBackupSchedule, name string) (pod corev1.PodSpec, err error) { - getVtctldServiceName := func(cluster string) (string, error) { - vtctldServiceName, vtctldServicePort, err := r.getVtctldServiceName(ctx, vbsc, cluster) + // Create the corresponding PVC for the new vtbackup pod + pvc := &corev1.PersistentVolumeClaim{} + key := client.ObjectKey{ + Namespace: job.Namespace, + Name: name, + } + err = r.client.Get(ctx, key, pvc) + if err != nil { + if !apierrors.IsNotFound(err) { + return nil, err + } + newPVC := vttablet.NewPVC(key, vtbackupSpec.TabletSpec) + if err := ctrl.SetControllerReference(&vbsc, newPVC, r.scheme); err != nil { + return nil, err + } + err = r.client.Create(ctx, newPVC) if err != nil { - return "", err + return nil, err } - return 
fmt.Sprintf("--server=%s:%d", vtctldServiceName, vtctldServicePort), nil } - // It is fine to not have any default in the event there is no strategy as the CRD validation - // ensures that there will be at least one item in this list. The YAML cannot be applied with - // empty list of strategies. - var cmd strings.Builder + return job, nil +} - addNewCmd := func(i int) { - if i > 0 { - cmd.WriteString(" && ") - } +func (r *ReconcileVitessBackupsSchedule) createJobPod( + ctx context.Context, + vbsc planetscalev2.VitessBackupSchedule, + strategy planetscalev2.VitessBackupScheduleStrategy, + name string, + vkr planetscalev2.VitessKeyRange, + labels map[string]string, +) (pod *corev1.Pod, spec *vttablet.BackupSpec, err error) { + vts, err := r.getShardFromKeyspace(ctx, vbsc.Namespace, vbsc.Spec.Cluster, strategy.Keyspace, strategy.Shard) + if err != nil { + return nil, nil, err } - for i, strategy := range vbsc.Spec.Strategy { - vtctldclientServerArg, err := getVtctldServiceName(vbsc.Spec.Cluster) - if err != nil { - return corev1.PodSpec{}, err - } + _, completeBackups, err := vitessbackup.GetBackups(ctx, vbsc.Namespace, vbsc.Spec.Cluster, strategy.Keyspace, vkr.SafeName(), + func(ctx context.Context, allBackupsList *planetscalev2.VitessBackupList, listOpts *client.ListOptions) error { + return r.client.List(ctx, allBackupsList, listOpts) + }, + ) + if err != nil { + return nil, nil, err + } - addNewCmd(i) - switch strategy.Name { - case planetscalev2.BackupShard: - createVtctldClientCommand(&cmd, vtctldclientServerArg, strategy.ExtraFlags, strategy.Keyspace, strategy.Shard) + backupType := vitessbackup.TypeUpdate + if len(completeBackups) == 0 { + if vts.Status.HasMaster == corev1.ConditionTrue { + backupType = vitessbackup.TypeFirstBackup + } else { + backupType = vitessbackup.TypeInit } - } - pod = corev1.PodSpec{ - Containers: []corev1.Container{{ - Name: name, - Image: vbsc.Spec.Image, - ImagePullPolicy: vbsc.Spec.ImagePullPolicy, - Resources: vbsc.Spec.Resources, - Args: []string{"/bin/sh", "-c", cmd.String()}, - }}, - RestartPolicy: corev1.RestartPolicyOnFailure, - Affinity: vbsc.Spec.Affinity, + podKey := client.ObjectKey{ + Namespace: vbsc.Namespace, + Name: name, } - return pod, nil -} - -func createVtctldClientCommand(cmd *strings.Builder, serverAddr string, extraFlags map[string]string, keyspace, shard string) { - cmd.WriteString(fmt.Sprintf("%s %s BackupShard", vtctldclientPath, serverAddr)) + vtbackupSpec := vitessshard.MakeVtbackupSpec(podKey, &vts, labels, backupType) + p := vttablet.NewBackupPod(podKey, vtbackupSpec, vts.Spec.Images.Mysqld.Image()) - // Add any flags - for key, value := range extraFlags { - cmd.WriteString(fmt.Sprintf(" --%s=%s", key, value)) - } + // Explicitly do not restart on failure. The VitessBackupSchedule controller will retry the failed job + // during the next scheduled run. 
+ p.Spec.RestartPolicy = corev1.RestartPolicyNever - // Add keyspace/shard - cmd.WriteString(fmt.Sprintf(" %s/%s", keyspace, shard)) + p.Spec.Affinity = vbsc.Spec.Affinity + return p, vtbackupSpec, nil } func (r *ReconcileVitessBackupsSchedule) getVtctldServiceName(ctx context.Context, vbsc *planetscalev2.VitessBackupSchedule, cluster string) (svcName string, svcPort int32, err error) { @@ -568,7 +658,7 @@ func (r *ReconcileVitessBackupsSchedule) getVtctldServiceName(ctx context.Contex return svcName, svcPort, nil } -func (r *ReconcileVitessBackupsSchedule) getAllShardsInKeyspace(ctx context.Context, namespace, cluster, keyspace string) ([]string, error) { +func (r *ReconcileVitessBackupsSchedule) getShardFromKeyspace(ctx context.Context, namespace, cluster, keyspace, shard string) (planetscalev2.VitessShard, error) { shardsList := &planetscalev2.VitessShardList{} listOpts := &client.ListOptions{ Namespace: namespace, @@ -578,13 +668,14 @@ func (r *ReconcileVitessBackupsSchedule) getAllShardsInKeyspace(ctx context.Cont }.AsSelector(), } if err := r.client.List(ctx, shardsList, listOpts); err != nil { - return nil, fmt.Errorf("unable to list shards of keyspace %s in %s: %v", keyspace, namespace, err) + return planetscalev2.VitessShard{}, fmt.Errorf("unable to list shards of keyspace %s in %s: %v", keyspace, namespace, err) } - var result []string for _, item := range shardsList.Items { - result = append(result, item.Spec.Name) + if item.Spec.KeyRange.String() == shard { + return item, nil + } } - return result, nil + return planetscalev2.VitessShard{}, fmt.Errorf("unable to find shard %s in keyspace %s in %s", shard, keyspace, namespace) } type keyspace struct { diff --git a/pkg/controller/vitessshard/reconcile_backup_job.go b/pkg/controller/vitessshard/reconcile_backup_job.go index f580ce91..14aef2a7 100644 --- a/pkg/controller/vitessshard/reconcile_backup_job.go +++ b/pkg/controller/vitessshard/reconcile_backup_job.go @@ -22,7 +22,6 @@ import ( corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" - apilabels "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/reconcile" @@ -56,27 +55,15 @@ func (r *ReconcileVitessShard) reconcileBackupJob(ctx context.Context, vts *plan vitessbackup.TypeLabel: vitessbackup.TypeInit, } - // List all backups for this shard, across all storage locations. - // We'll use the latest observed state of backups to decide whether to take - // a new one. This list could be out of date because it's populated by - // polling the Vitess API (see the VitessBackupStorage controller), but as - // long as it's eventually consistent, we'll converge to the right behavior. 
- allBackups := &planetscalev2.VitessBackupList{} - listOpts := &client.ListOptions{ - Namespace: vts.Namespace, - LabelSelector: apilabels.SelectorFromSet(apilabels.Set{ - planetscalev2.ClusterLabel: clusterName, - planetscalev2.KeyspaceLabel: keyspaceName, - planetscalev2.ShardLabel: shardSafeName, - }), - } - if err := r.client.List(ctx, allBackups, listOpts); err != nil { + allBackups, completeBackups, err := vitessbackup.GetBackups(ctx, vts.Namespace, clusterName, keyspaceName, shardSafeName, + func(ctx context.Context, allBackupsList *planetscalev2.VitessBackupList, listOpts *client.ListOptions) error { + return r.client.List(ctx, allBackupsList, listOpts) + }, + ) + if err != nil { return resultBuilder.Error(err) } - updateBackupStatus(vts, allBackups.Items) - - // Here we only care about complete backups. - completeBackups := vitessbackup.CompleteBackups(allBackups.Items) + updateBackupStatus(vts, allBackups) // Generate keys (object names) for all desired backup Pods and PVCs. // Keep a map back from generated names to the backup specs. @@ -97,7 +84,11 @@ func (r *ReconcileVitessShard) reconcileBackupJob(ctx context.Context, vts *plan // scratch (not from any tablet). If we're wrong and a backup exists // already, the idempotent vtbackup "initial backup" mode will just do // nothing and return success. - initSpec := vtbackupInitSpec(initPodKey, vts, labels) + backupType := vitessbackup.TypeFirstBackup + if vts.Status.HasMaster != corev1.ConditionTrue { + backupType = vitessbackup.TypeInit + } + initSpec := MakeVtbackupSpec(initPodKey, vts, labels, backupType) if initSpec != nil { podKeys = append(podKeys, initPodKey) if initSpec.TabletSpec.DataVolumePVCSpec != nil { @@ -112,7 +103,7 @@ func (r *ReconcileVitessShard) reconcileBackupJob(ctx context.Context, vts *plan // Reconcile vtbackup PVCs. Use the same key as the corresponding Pod, // but only if the Pod expects a PVC. - err := r.reconciler.ReconcileObjectSet(ctx, vts, pvcKeys, labels, reconciler.Strategy{ + err = r.reconciler.ReconcileObjectSet(ctx, vts, pvcKeys, labels, reconciler.Strategy{ Kind: &corev1.PersistentVolumeClaim{}, New: func(key client.ObjectKey) runtime.Object { @@ -181,7 +172,7 @@ func (r *ReconcileVitessShard) reconcileBackupJob(ctx context.Context, vts *plan return resultBuilder.Result() } -func vtbackupInitSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, parentLabels map[string]string) *vttablet.BackupSpec { +func MakeVtbackupSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, parentLabels map[string]string, typ string) *vttablet.BackupSpec { // If we specifically set our cluster to avoid initial backups, bail early. if !*vts.Spec.Replication.InitializeBackup { return nil @@ -196,7 +187,7 @@ func vtbackupInitSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, pare // Make a vtbackup spec that's a similar shape to the first tablet pool. // This should give it enough resources to run mysqld and restore a backup, // since all tablets need to be able to do that, regardless of type. 
- return vtbackupSpec(key, vts, parentLabels, &vts.Spec.TabletPools[0], vitessbackup.TypeInit) + return vtbackupSpec(key, vts, parentLabels, &vts.Spec.TabletPools[0], typ) } func vtbackupSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, parentLabels map[string]string, pool *planetscalev2.VitessShardTabletPool, backupType string) *vttablet.BackupSpec { @@ -254,6 +245,7 @@ func vtbackupSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, parentLa return &vttablet.BackupSpec{ InitialBackup: backupType == vitessbackup.TypeInit, + AllowFirstBackup: backupType == vitessbackup.TypeFirstBackup, MinBackupInterval: minBackupInterval, MinRetentionTime: minRetentionTime, MinRetentionCount: minRetentionCount, diff --git a/pkg/operator/vitessbackup/labels.go b/pkg/operator/vitessbackup/labels.go index e8e9a785..8c6d9819 100644 --- a/pkg/operator/vitessbackup/labels.go +++ b/pkg/operator/vitessbackup/labels.go @@ -24,6 +24,8 @@ const ( // TypeInit is a backup taken to initialize an empty shard. TypeInit = "init" + // TypeFirstBackup is a backup taken when no other backup exist in an existing shard. + TypeFirstBackup = "empty" // TypeUpdate is a backup taken to update the latest backup for a shard. TypeUpdate = "update" ) diff --git a/pkg/operator/vitessbackup/object.go b/pkg/operator/vitessbackup/object.go index b5c011d8..7c74495a 100644 --- a/pkg/operator/vitessbackup/object.go +++ b/pkg/operator/vitessbackup/object.go @@ -17,11 +17,14 @@ limitations under the License. package vitessbackup import ( + "context" "fmt" "strconv" "strings" "time" + apilabels "k8s.io/apimachinery/pkg/labels" + "sigs.k8s.io/controller-runtime/pkg/client" topodatapb "vitess.io/vitess/go/vt/proto/topodata" "vitess.io/vitess/go/vt/topo/topoproto" @@ -99,8 +102,42 @@ func LatestForLocation(locationName string, backups []*planetscalev2.VitessBacku return latest } -// CompleteBackups returns a list of only the complete backups from the input. -func CompleteBackups(backups []planetscalev2.VitessBackup) []*planetscalev2.VitessBackup { +// GetBackups returns a list of all backups, along with only completed backups, for the given +// keyspace/shard in the given cluster. +// A function to list the backup using the controller's client is necessary. +func GetBackups( + ctx context.Context, + namespace, clusterName, keyspaceName, shardSafeName string, + listBackups func(context.Context, *planetscalev2.VitessBackupList, *client.ListOptions) error, +) (allBackups []planetscalev2.VitessBackup, completeBackups []*planetscalev2.VitessBackup, err error) { + // List all backups for this shard, across all storage locations. + // We'll use the latest observed state of backups to decide whether to take + // a new one. This list could be out of date because it's populated by + // polling the Vitess API (see the VitessBackupStorage controller), but as + // long as it's eventually consistent, we'll converge to the right behavior. + allBackupsList := &planetscalev2.VitessBackupList{} + listOpts := &client.ListOptions{ + Namespace: namespace, + LabelSelector: apilabels.SelectorFromSet(apilabels.Set{ + planetscalev2.ClusterLabel: clusterName, + planetscalev2.KeyspaceLabel: keyspaceName, + planetscalev2.ShardLabel: shardSafeName, + }), + } + if err = listBackups(ctx, allBackupsList, listOpts); err != nil { + return nil, nil, err + } + + allBackups = allBackupsList.Items + + // Filter by complete backups. 
+ completeBackups = getCompleteBackups(allBackups) + + return allBackups, completeBackups, nil +} + +// getCompleteBackups returns a list of only the complete backups from the input. +func getCompleteBackups(backups []planetscalev2.VitessBackup) []*planetscalev2.VitessBackup { completeBackups := []*planetscalev2.VitessBackup{} for i := range backups { if backups[i].Status.Complete { diff --git a/pkg/operator/vttablet/flags.go b/pkg/operator/vttablet/flags.go index 1ec42d0c..6bc34847 100644 --- a/pkg/operator/vttablet/flags.go +++ b/pkg/operator/vttablet/flags.go @@ -104,6 +104,7 @@ func init() { // vtbackup-specific flags. "concurrency": vtbackupConcurrency, "initial_backup": backupSpec.InitialBackup, + "allow_first_backup": backupSpec.AllowFirstBackup, "min_backup_interval": backupSpec.MinBackupInterval, "min_retention_time": backupSpec.MinRetentionTime, "min_retention_count": backupSpec.MinRetentionCount, diff --git a/pkg/operator/vttablet/vtbackup_pod.go b/pkg/operator/vttablet/vtbackup_pod.go index 813257f9..68b38567 100644 --- a/pkg/operator/vttablet/vtbackup_pod.go +++ b/pkg/operator/vttablet/vtbackup_pod.go @@ -61,6 +61,8 @@ type BackupSpec struct { // aren't any. Instead, bootstrap the shard with a backup of an empty // database. InitialBackup bool + // AllowFirstBackup enables vtbackup to take a backup even if none exist. + AllowFirstBackup bool // MinBackupInterval is the minimum spacing between backups. // A new backup will only be taken if it's been at least this long since the // most recent backup. diff --git a/test/endtoend/backup_schedule_test.sh b/test/endtoend/backup_schedule_test.sh index e540bf4c..23393289 100755 --- a/test/endtoend/backup_schedule_test.sh +++ b/test/endtoend/backup_schedule_test.sh @@ -11,47 +11,30 @@ function takedownShard() { checkPodStatusWithTimeout "example-vttablet-zone1" 0 } -function checkVitessBackupScheduleStatusWithTimeout() { - regex=$1 - - for i in {1..1200} ; do - if [[ $(kubectl get VitessBackupSchedule | grep -E "${regex}" | wc -l) -eq 1 ]]; then - echo "$regex found" - return - fi - sleep 1 - done - echo -e "ERROR: checkPodStatusWithTimeout timeout to find pod matching:\ngot:\n$out\nfor regex: $regex" - exit 1 -} - function verifyListBackupsOutputWithSchedule() { echo -e "Check for VitessBackupSchedule status" checkVitessBackupScheduleStatusWithTimeout "example-vbsc-every-minute(.*)" checkVitessBackupScheduleStatusWithTimeout "example-vbsc-every-five-minute(.*)" echo -e "Check for number of backups in the cluster" - # Sleep for over 6 minutes, during this time we should have at the very minimum - # 7 backups. At least: 6 backups from the every-minute schedule, and 1 backup - # from the every-five-minute schedule. - for i in {1..6} ; do + # Sleep for 10 minutes, after 10 minutes we should at least have 3 backups: 1 from the initial vtbackup pod + # 1 minimum from the every minute schedule, and 1 from the every-five minute schedule + for i in {1..600} ; do # Ensure that we can view the backup files from the host. 
docker exec -it $(docker container ls --format '{{.Names}}' | grep kind) chmod o+rwx -R /backup > /dev/null backupCount=$(kubectl get vtb --no-headers | wc -l) echo "Found ${backupCount} backups" - if [[ "${backupCount}" -ge 7 ]]; then - break + if [[ "${backupCount}" -ge 3 ]]; then + echo -e "Check for Jobs' pods" + # Here check explicitly that the every five minute schedule ran at least once during the 10 minutes sleep + checkPodExistWithTimeout "example-vbsc-every-minute-(.*)0/1(.*)Completed(.*)" + checkPodExistWithTimeout "example-vbsc-every-five-minute-(.*)0/1(.*)Completed(.*)" + return fi - sleep 100 + sleep 1 done - if [[ "${backupCount}" -lt 7 ]]; then - echo "Did not find at least 7 backups" - exit 1 - fi - - echo -e "Check for Jobs' pods" - checkPodStatusWithTimeout "example-vbsc-every-minute-(.*)0/1(.*)Completed(.*)" 3 - checkPodStatusWithTimeout "example-vbsc-every-five-minute-(.*)0/1(.*)Completed(.*)" 2 + echo "Did not find at least 3 backups" + exit 1 } # Test setup diff --git a/test/endtoend/operator/101_initial_cluster.yaml b/test/endtoend/operator/101_initial_cluster.yaml index 76ef570a..e0a34ed1 100644 --- a/test/endtoend/operator/101_initial_cluster.yaml +++ b/test/endtoend/operator/101_initial_cluster.yaml @@ -7,6 +7,13 @@ kind: VitessCluster metadata: name: example spec: + backup: + engine: xtrabackup + locations: + - volume: + hostPath: + path: /backup + type: Directory images: vtctld: vitess/lite:v21.0.0 vtgate: vitess/lite:v21.0.0 @@ -14,7 +21,7 @@ spec: vtorc: vitess/lite:v21.0.0 vtbackup: vitess/lite:v21.0.0 mysqld: - mysql80Compatible: mysql:8.0.40 + mysql80Compatible: vitess/lite:v21.0.0 mysqldExporter: prom/mysqld-exporter:v0.14.0 cells: - name: zone1 diff --git a/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml b/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml index b31c3032..b6379901 100644 --- a/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml +++ b/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml @@ -27,7 +27,7 @@ spec: failedJobsHistoryLimit: 3 jobTimeoutMinute: 5 strategies: - - name: BackupShard + - name: commerce-x keyspace: "commerce" shard: "-" - name: "every-five-minute" @@ -42,7 +42,7 @@ spec: failedJobsHistoryLimit: 3 jobTimeoutMinute: 5 strategies: - - name: BackupShard + - name: commerce-x keyspace: "commerce" shard: "-" images: @@ -52,7 +52,7 @@ spec: vtorc: vitess/lite:latest vtbackup: vitess/lite:latest mysqld: - mysql80Compatible: mysql:8.0.40 + mysql80Compatible: vitess/lite:latest mysqldExporter: prom/mysqld-exporter:v0.14.0 cells: - name: zone1 diff --git a/test/endtoend/operator/201_customer_tablets.yaml b/test/endtoend/operator/201_customer_tablets.yaml index ef1f3546..c0565c7d 100644 --- a/test/endtoend/operator/201_customer_tablets.yaml +++ b/test/endtoend/operator/201_customer_tablets.yaml @@ -3,6 +3,13 @@ kind: VitessCluster metadata: name: example spec: + backup: + engine: xtrabackup + locations: + - volume: + hostPath: + path: /backup + type: Directory images: vtctld: vitess/lite:latest vtgate: vitess/lite:latest @@ -10,7 +17,7 @@ spec: vtorc: vitess/lite:latest vtbackup: vitess/lite:latest mysqld: - mysql80Compatible: mysql:8.0.40 + mysql80Compatible: vitess/lite:latest mysqldExporter: prom/mysqld-exporter:v0.14.0 cells: - name: zone1 diff --git a/test/endtoend/operator/302_new_shards.yaml b/test/endtoend/operator/302_new_shards.yaml index 8731afea..2808349b 100644 --- a/test/endtoend/operator/302_new_shards.yaml +++ b/test/endtoend/operator/302_new_shards.yaml @@ -3,6 
+3,13 @@ kind: VitessCluster metadata: name: example spec: + backup: + engine: xtrabackup + locations: + - volume: + hostPath: + path: /backup + type: Directory images: vtctld: vitess/lite:latest vtgate: vitess/lite:latest @@ -10,7 +17,7 @@ spec: vtorc: vitess/lite:latest vtbackup: vitess/lite:latest mysqld: - mysql80Compatible: mysql:8.0.40 + mysql80Compatible: vitess/lite:latest mysqldExporter: prom/mysqld-exporter:v0.14.0 cells: - name: zone1 diff --git a/test/endtoend/operator/306_down_shard_0.yaml b/test/endtoend/operator/306_down_shard_0.yaml index 9684f5f2..2cfcf5bd 100644 --- a/test/endtoend/operator/306_down_shard_0.yaml +++ b/test/endtoend/operator/306_down_shard_0.yaml @@ -3,6 +3,13 @@ kind: VitessCluster metadata: name: example spec: + backup: + engine: xtrabackup + locations: + - volume: + hostPath: + path: /backup + type: Directory images: vtctld: vitess/lite:latest vtgate: vitess/lite:latest @@ -10,7 +17,7 @@ spec: vtorc: vitess/lite:latest vtbackup: vitess/lite:latest mysqld: - mysql80Compatible: mysql:8.0.40 + mysql80Compatible: vitess/lite:latest mysqldExporter: prom/mysqld-exporter:v0.14.0 cells: - name: zone1 diff --git a/test/endtoend/operator/401_scheduled_backups.yaml b/test/endtoend/operator/401_scheduled_backups.yaml new file mode 100644 index 00000000..221ab22b --- /dev/null +++ b/test/endtoend/operator/401_scheduled_backups.yaml @@ -0,0 +1,166 @@ +apiVersion: planetscale.com/v2 +kind: VitessCluster +metadata: + name: example +spec: + backup: + engine: xtrabackup + locations: + - volume: + hostPath: + path: /backup + type: Directory + schedules: + - name: "commerce" + schedule: "*/2 * * * *" + resources: + requests: + cpu: 100m + memory: 1024Mi + limits: + memory: 1024Mi + successfulJobsHistoryLimit: 2 + failedJobsHistoryLimit: 3 + jobTimeoutMinute: 5 + strategies: + - name: commerce_x + keyspace: "commerce" + shard: "-" + - name: "customer" + schedule: "*/2 * * * *" + resources: + requests: + cpu: 100m + memory: 1024Mi + limits: + memory: 1024Mi + successfulJobsHistoryLimit: 2 + failedJobsHistoryLimit: 3 + jobTimeoutMinute: 5 + strategies: + - name: customer_80-x + keyspace: "customer" + shard: "80-" + - name: customer_x-80 + keyspace: "customer" + shard: "-80" + images: + vtctld: vitess/lite:latest + vtgate: vitess/lite:latest + vttablet: vitess/lite:latest + vtorc: vitess/lite:latest + vtbackup: vitess/lite:latest + mysqld: + mysql80Compatible: vitess/lite:latest + mysqldExporter: prom/mysqld-exporter:v0.14.0 + cells: + - name: zone1 + gateway: + authentication: + static: + secret: + name: example-cluster-config + key: users.json + replicas: 1 + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + memory: 256Mi + vitessDashboard: + cells: + - zone1 + extraFlags: + security_policy: read-only + replicas: 1 + resources: + limits: + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + + keyspaces: + - name: commerce + durabilityPolicy: semi_sync + vitessOrchestrator: + resources: + limits: + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + extraFlags: + instance-poll-time: 3s + turndownPolicy: Immediate + partitionings: + - equal: + parts: 1 + shardTemplate: + databaseInitScriptSecret: + name: example-cluster-config + key: init_db.sql + tabletPools: + - cell: zone1 + type: replica + replicas: 3 + vttablet: + extraFlags: + db_charset: utf8mb4 + resources: + requests: + cpu: 100m + memory: 256Mi + mysqld: + resources: + requests: + cpu: 100m + memory: 512Mi + dataVolumeClaimTemplate: + accessModes: ["ReadWriteOnce"] + resources: + requests: + 
storage: 10Gi + - name: customer + durabilityPolicy: semi_sync + vitessOrchestrator: + resources: + limits: + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + extraFlags: + instance-poll-time: 3s + turndownPolicy: Immediate + partitionings: + - equal: + parts: 2 + shardTemplate: + databaseInitScriptSecret: + name: example-cluster-config + key: init_db.sql + tabletPools: + - cell: zone1 + type: replica + replicas: 3 + vttablet: + extraFlags: + db_charset: utf8mb4 + resources: + requests: + cpu: 100m + memory: 256Mi + mysqld: + resources: + requests: + cpu: 100m + memory: 512Mi + dataVolumeClaimTemplate: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 10Gi + updateStrategy: + type: Immediate diff --git a/test/endtoend/operator/cluster_upgrade.yaml b/test/endtoend/operator/cluster_upgrade.yaml index 18cd5484..cb0216e6 100644 --- a/test/endtoend/operator/cluster_upgrade.yaml +++ b/test/endtoend/operator/cluster_upgrade.yaml @@ -7,6 +7,13 @@ kind: VitessCluster metadata: name: example spec: + backup: + engine: xtrabackup + locations: + - volume: + hostPath: + path: /backup + type: Directory images: vtctld: vitess/lite:latest vtgate: vitess/lite:latest @@ -14,7 +21,7 @@ spec: vtorc: vitess/lite:latest vtbackup: vitess/lite:latest mysqld: - mysql80Compatible: mysql:8.0.40 + mysql80Compatible: vitess/lite:latest mysqldExporter: prom/mysqld-exporter:v0.14.0 cells: - name: zone1 diff --git a/test/endtoend/operator/operator-latest.yaml b/test/endtoend/operator/operator-latest.yaml index 46ccb358..be23f61a 100644 --- a/test/endtoend/operator/operator-latest.yaml +++ b/test/endtoend/operator/operator-latest.yaml @@ -416,6 +416,7 @@ spec: cluster: type: string concurrencyPolicy: + default: Forbid enum: - Allow - Forbid @@ -491,8 +492,6 @@ spec: example: commerce type: string name: - enum: - - BackupShard type: string shard: example: '-' @@ -542,6 +541,11 @@ spec: lastScheduledTime: format: date-time type: string + lastScheduledTimes: + additionalProperties: + format: date-time + type: string + type: object type: object type: object served: true @@ -2028,6 +2032,7 @@ spec: type: string type: object concurrencyPolicy: + default: Forbid enum: - Allow - Forbid @@ -2099,8 +2104,6 @@ spec: example: commerce type: string name: - enum: - - BackupShard type: string shard: example: '-' diff --git a/test/endtoend/upgrade_test.sh b/test/endtoend/upgrade_test.sh index f1c47413..4aca2c50 100755 --- a/test/endtoend/upgrade_test.sh +++ b/test/endtoend/upgrade_test.sh @@ -17,6 +17,7 @@ function move_tables() { sleep 10 + echo "Execute MoveTables" vtctldclient LegacyVtctlCommand -- MoveTables --source commerce --tables 'customer,corder' Create customer.commerce2customer if [ $? -ne 0 ]; then echo "MoveTables failed" @@ -26,6 +27,7 @@ function move_tables() { sleep 10 + echo "Execute VDiff" vdiff_out=$(vtctldclient LegacyVtctlCommand -- VDiff customer.commerce2customer) echo "$vdiff_out" | grep "ProcessedRows: 5" | wc -l | grep "2" > /dev/null if [ $? -ne 0 ]; then @@ -33,6 +35,7 @@ function move_tables() { # Allow failure fi + echo "SwitchTraffic for rdonly" vtctldclient LegacyVtctlCommand -- MoveTables --tablet_types='rdonly,replica' SwitchTraffic customer.commerce2customer if [ $? -ne 0 ]; then echo "SwitchTraffic for rdonly and replica failed" @@ -40,6 +43,7 @@ function move_tables() { exit 1 fi + echo "SwitchTraffic for primary" vtctldclient LegacyVtctlCommand -- MoveTables --tablet_types='primary' SwitchTraffic customer.commerce2customer if [ $? 
-ne 0 ]; then echo "SwitchTraffic for primary failed" @@ -47,6 +51,7 @@ function move_tables() { exit 1 fi + echo "Complete MoveTables" vtctldclient LegacyVtctlCommand -- MoveTables Complete customer.commerce2customer if [ $? -ne 0 ]; then echo "MoveTables Complete failed" @@ -177,6 +182,39 @@ EOF waitForKeyspaceToBeServing customer 80- 2 } +function scheduledBackups() { + echo "Apply 401_scheduled_backups.yaml" + kubectl apply -f 401_scheduled_backups.yaml > /dev/null + + checkVitessBackupScheduleStatusWithTimeout "example-vbsc-commerce(.*)" + checkVitessBackupScheduleStatusWithTimeout "example-vbsc-customer(.*)" + + docker exec -it $(docker container ls --format '{{.Names}}' | grep kind) chmod o+rwx -R /backup > /dev/null + initialCommerceBackups=$(kubectl get vtb --no-headers | grep "commerce-x-x" | wc -l) + initialCustomerFirstShardBackups=$(kubectl get vtb --no-headers | grep "customer-x-80" | wc -l) + initialCustomerSecondShardBackups=$(kubectl get vtb --no-headers | grep "customer-80-x" | wc -l) + + for i in {1..60} ; do + commerceBackups=$(kubectl get vtb --no-headers | grep "commerce-x-x" | wc -l) + customerFirstShardBackups=$(kubectl get vtb --no-headers | grep "customer-x-80" | wc -l) + customerSecondShardBackups=$(kubectl get vtb --no-headers | grep "customer-80-x" | wc -l) + + if [[ "${customerFirstShardBackups}" -ge $(( initialCustomerFirstShardBackups + 2 )) && "${customerSecondShardBackups}" -ge $(( initialCustomerSecondShardBackups + 2 )) && "${commerceBackups}" -ge $(( initialCommerceBackups + 2 )) ]]; then + echo "Found all backups" + return + else + echo "Got: ${customerFirstShardBackups} customer-x-80 backups but want: $(( initialCustomerFirstShardBackups + 2 ))" + echo "Got: ${customerSecondShardBackups} customer-80-x backups but want: $(( initialCustomerSecondShardBackups + 2 ))" + echo "Got: ${commerceBackups} commerce-x-x backups but want: $(( initialCommerceBackups + 2 ))" + echo "" + fi + sleep 10 + done + + echo "Did not find the backups on time" + exit 1 +} + function waitAndVerifySetup() { sleep 10 checkPodStatusWithTimeout "example-zone1-vtctld(.*)1/1(.*)Running(.*)" @@ -233,12 +271,11 @@ EOF } # Test setup +mkdir -p -m 777 ./vtdataroot/backup echo "Building the docker image" docker build -f build/Dockerfile.release -t vitess-operator-pr:latest . -echo "Creating Kind cluster" -kind create cluster --wait 30s --name kind-${BUILDKITE_BUILD_ID} --image ${KIND_VERSION} -echo "Loading docker image into Kind cluster" -kind load docker-image vitess-operator-pr:latest --name kind-${BUILDKITE_BUILD_ID} +setupKindConfig +createKindCluster cd "$PWD/test/endtoend/operator" killall kubectl @@ -257,6 +294,11 @@ verifyDurabilityPolicy "commerce" "semi_sync" move_tables resharding +scheduledBackups + # Teardown +echo "Removing the temporary directory" +removeBackupFiles +rm -rf "$STARTING_DIR/vtdataroot" echo "Deleting Kind cluster. This also deletes the volume associated with it" kind delete cluster --name kind-${BUILDKITE_BUILD_ID} diff --git a/test/endtoend/utils.sh b/test/endtoend/utils.sh index 4de6add1..fbdd55c0 100644 --- a/test/endtoend/utils.sh +++ b/test/endtoend/utils.sh @@ -155,6 +155,28 @@ function checkPodStatusWithTimeout() { exit 1 } +function checkPodExistWithTimeout() { + regex=$1 + + # We use this for loop instead of `kubectl wait` because we don't have access to the full pod name + # and `kubectl wait` does not support regex to match resource name. 
+ for i in {1..1200} ; do + out=$(kubectl get pods) + echo "$out" | grep -E "$regex" > /dev/null 2>&1 + if [[ $? -eq 0 ]]; then + echo "$regex found" + return + fi + sleep 1 + done + echo -e "ERROR: checkPodExistWithTimeout timeout to find pod matching:\ngot:\n$out\nfor regex: $regex" + echo "$regex" | grep "vttablet" > /dev/null 2>&1 + if [[ $? -eq 0 ]]; then + printMysqlErrorFiles + fi + exit 1 +} + # ensurePodResourcesSet: # $1: regex used to match pod names function ensurePodResourcesSet() { @@ -225,7 +247,7 @@ function waitForKeyspaceToBeServing() { fi echo "Shard $ks/$shard is not fully serving. Output: $out" echo "Retrying (attempt #$i) ..." - sleep 10 + sleep 1 done } @@ -375,3 +397,17 @@ COrder +----------+-------------+----------+-------+ EOF } + +function checkVitessBackupScheduleStatusWithTimeout() { + regex=$1 + + for i in {1..1200} ; do + if [[ $(kubectl get VitessBackupSchedule | grep -E "${regex}" | wc -l) -eq 1 ]]; then + echo "$regex found" + return + fi + sleep 1 + done + echo -e "ERROR: checkVitessBackupScheduleStatusWithTimeout timeout to find VitessBackupSchedule matching regex: $regex" + exit 1 +}
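The end-to-end assertions above rely on the controller's per-strategy scheduling: for each strategy it walks the cron schedule forward from that strategy's most recent scheduled time to find the last missed slot and the next run (see getNextSchedule in the controller diff). Below is a standalone, simplified sketch of that computation that leaves out the missed-run cap handling; the github.com/robfig/cron/v3 import path is an assumption, since the diff only shows cron.ParseStandard being called.

```go
package main

import (
	"fmt"
	"time"

	cron "github.com/robfig/cron/v3" // assumed import path; the diff only shows cron.ParseStandard
)

// lastMissedAndNext mirrors the shape of getNextSchedule: starting from the most
// recent scheduled time for a strategy, return the last missed slot (zero if none)
// and the next upcoming run.
func lastMissedAndNext(spec string, mostRecent, now time.Time) (time.Time, time.Time, error) {
	sched, err := cron.ParseStandard(spec)
	if err != nil {
		return time.Time{}, time.Time{}, err
	}
	var lastMissed time.Time
	for t := sched.Next(mostRecent); !t.After(now); t = sched.Next(t) {
		lastMissed = t
	}
	return lastMissed, sched.Next(now), nil
}

func main() {
	now := time.Now()
	missed, next, err := lastMissedAndNext("*/2 * * * *", now.Add(-5*time.Minute), now)
	if err != nil {
		panic(err)
	}
	fmt.Printf("last missed: %s, next run: %s\n", missed.Format(time.RFC3339), next.Format(time.RFC3339))
}
```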