VitessBackupScheduleStrategy
name
-BackupStrategyName
+string
@@ -2743,11 +2750,9 @@ VitessBackupScheduleTem
Strategy defines how we are going to take a backup.
If you want to take several backups within the same schedule you can add more items
-to the Strategy list. Each VitessBackupScheduleStrategy will be executed by the same
-kubernetes job. This is useful if for instance you have one schedule, and you want to
-take a backup of all shards in a keyspace and don’t want to re-create a second schedule.
-All the VitessBackupScheduleStrategy are concatenated into a single shell command that
-is executed when the Job’s container starts.
+to the Strategy list. Each VitessBackupScheduleStrategy will be executed in its own
+Kubernetes Job. This is useful if you want a single schedule to back up multiple shards
+at the same time.
@@ -2830,8 +2835,8 @@ VitessBackupScheduleTem
(Optional)
ConcurrencyPolicy specifies how to treat concurrent executions of a Job.
Valid values are:
-- “Allow” (default): allows CronJobs to run concurrently;
-- “Forbid”: forbids concurrent runs, skipping next run if previous run hasn’t finished yet;
+- “Allow”: allows CronJobs to run concurrently;
+- “Forbid” (default): forbids concurrent runs, skipping next run if previous run hasn’t finished yet;
- “Replace”: cancels currently running job and replaces it with a new one.
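Note: with this change the strategy name becomes a free-form string (the BackupStrategyName enum is removed below), each list entry gets its own Kubernetes Job, and "Forbid" becomes the defaulted concurrency policy. A minimal sketch of the new shape, with illustrative values borrowed from the test YAML later in this diff:

    strategies := []planetscalev2.VitessBackupScheduleStrategy{
        // One Job per entry; the name is also the key used in status.lastScheduledTimes.
        {Name: "customer_x-80", Keyspace: "customer", Shard: "-80"},
        {Name: "customer_80-x", Keyspace: "customer", Shard: "80-"},
    }
    tmpl := planetscalev2.VitessBackupScheduleTemplate{
        Strategy:          strategies,
        ConcurrencyPolicy: planetscalev2.ForbidConcurrent, // matches the new CRD default
    }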
diff --git a/pkg/apis/planetscale/v2/vitessbackupschedule_types.go b/pkg/apis/planetscale/v2/vitessbackupschedule_types.go
index ff29654d..e33d4061 100644
--- a/pkg/apis/planetscale/v2/vitessbackupschedule_types.go
+++ b/pkg/apis/planetscale/v2/vitessbackupschedule_types.go
@@ -34,16 +34,6 @@ const (
ForbidConcurrent ConcurrencyPolicy = "Forbid"
)
-// BackupStrategyName describes the vtctldclient command that will be used to take a backup.
-// When scheduling a backup, you must specify at least one strategy.
-// +kubebuilder:validation:Enum=BackupShard
-type BackupStrategyName string
-
-const (
- // BackupShard will use the "vtctldclient BackupShard" command to take a backup
- BackupShard BackupStrategyName = "BackupShard"
-)
-
// VitessBackupSchedule is the Schema for the VitessBackupSchedule API.
// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
@@ -98,12 +88,12 @@ type VitessBackupScheduleTemplate struct {
// Strategy defines how we are going to take a backup.
// If you want to take several backups within the same schedule you can add more items
- // to the Strategy list. Each VitessBackupScheduleStrategy will be executed by the same
- // kubernetes job. This is useful if for instance you have one schedule, and you want to
- // take a backup of all shards in a keyspace and don't want to re-create a second schedule.
- // All the VitessBackupScheduleStrategy are concatenated into a single shell command that
- // is executed when the Job's container starts.
+ // to the Strategy list. Each VitessBackupScheduleStrategy will be executed in its own
+ // Kubernetes Job. This is useful if you want a single schedule to back up multiple shards
+ // at the same time.
// +kubebuilder:validation:MinItems=1
+ // +patchMergeKey=name
+ // +patchStrategy=merge
Strategy []VitessBackupScheduleStrategy `json:"strategies"`
// Resources specify the compute resources to allocate for every Jobs's pod.
@@ -136,11 +126,12 @@ type VitessBackupScheduleTemplate struct {
// ConcurrencyPolicy specifies how to treat concurrent executions of a Job.
// Valid values are:
- // - "Allow" (default): allows CronJobs to run concurrently;
- // - "Forbid": forbids concurrent runs, skipping next run if previous run hasn't finished yet;
+ // - "Allow": allows CronJobs to run concurrently;
+ // - "Forbid" (default): forbids concurrent runs, skipping next run if previous run hasn't finished yet;
// - "Replace": cancels currently running job and replaces it with a new one.
// +optional
// +kubebuilder:example="Forbid"
+ // +kubebuilder:default="Forbid"
ConcurrencyPolicy ConcurrencyPolicy `json:"concurrencyPolicy,omitempty"`
// AllowedMissedRuns defines how many missed run of the schedule will be allowed before giving up on finding the last job.
@@ -180,7 +171,7 @@ type VitessBackupScheduleTemplate struct {
// command line that will be executed in the Job's pod.
type VitessBackupScheduleStrategy struct {
// Name of the backup strategy.
- Name BackupStrategyName `json:"name"`
+ Name string `json:"name"`
// Keyspace defines the keyspace on which we want to take the backup.
// +kubebuilder:example="commerce"
@@ -198,12 +189,32 @@ type VitessBackupScheduleStrategy struct {
// VitessBackupScheduleStatus defines the observed state of VitessBackupSchedule
type VitessBackupScheduleStatus struct {
// A list of pointers to currently running jobs.
+ // This field is deprecated and no longer used in >= v2.15. It will be removed in a future release.
// +optional
Active []corev1.ObjectReference `json:"active,omitempty"`
// Information when was the last time the job was successfully scheduled.
+ // This field is deprecated and no longer used in >= v2.15. It will be removed in a future release.
+ // Please use lastScheduledTimes instead which maps the last schedule time to each VitessBackupScheduleStrategy
+ // in the VitessBackupSchedule.
// +optional
LastScheduledTime *metav1.Time `json:"lastScheduledTime,omitempty"`
+
+ // LastScheduledTimes lists, for each VitessBackupScheduleStrategy, the last schedule time we executed.
+ // It records not when the last execution started, but the scheduled time that execution corresponds to.
+ // +optional
+ LastScheduledTimes map[string]*metav1.Time `json:"lastScheduledTimes,omitempty"`
+}
+
+// NewVitessBackupScheduleStatus creates a new status with default values.
+func NewVitessBackupScheduleStatus(status VitessBackupScheduleStatus) VitessBackupScheduleStatus {
+ newStatus := VitessBackupScheduleStatus{
+ LastScheduledTimes: status.LastScheduledTimes,
+ }
+ if status.LastScheduledTimes == nil {
+ newStatus.LastScheduledTimes = make(map[string]*metav1.Time)
+ }
+ return newStatus
}
func init() {
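Note: the status now records the last scheduled time per strategy name instead of a single timestamp. A condensed sketch of how the controller below uses the helper and the map (the strategy name is illustrative, taken from the test YAML):

    // Reset the status, preserving (or allocating) the per-strategy map.
    vbsc.Status = planetscalev2.NewVitessBackupScheduleStatus(vbsc.Status)
    // After creating a Job for a strategy, record which scheduled run it corresponds to.
    vbsc.Status.LastScheduledTimes["commerce_x"] = &metav1.Time{Time: missedRun}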
diff --git a/pkg/apis/planetscale/v2/zz_generated.deepcopy.go b/pkg/apis/planetscale/v2/zz_generated.deepcopy.go
index ace6b890..700716d2 100644
--- a/pkg/apis/planetscale/v2/zz_generated.deepcopy.go
+++ b/pkg/apis/planetscale/v2/zz_generated.deepcopy.go
@@ -7,6 +7,7 @@ package v2
import (
autoscalingv2 "k8s.io/api/autoscaling/v2"
"k8s.io/api/core/v1"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
)
@@ -855,6 +856,13 @@ func (in *VitessBackupScheduleStatus) DeepCopyInto(out *VitessBackupScheduleStat
in, out := &in.LastScheduledTime, &out.LastScheduledTime
*out = (*in).DeepCopy()
}
+ if in.LastScheduledTimes != nil {
+ in, out := &in.LastScheduledTimes, &out.LastScheduledTimes
+ *out = make(map[string]*metav1.Time, len(*in))
+ for key, val := range *in {
+ (*out)[key] = val.DeepCopy()
+ }
+ }
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VitessBackupScheduleStatus.
diff --git a/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go b/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go
index c105fe9d..16becc10 100644
--- a/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go
+++ b/pkg/controller/vitessbackupschedule/vitessbackupschedule_controller.go
@@ -22,6 +22,7 @@ import (
"fmt"
"maps"
"math/rand/v2"
+ "strconv"
"strings"
"sort"
@@ -31,12 +32,18 @@ import (
"github.com/sirupsen/logrus"
kbatch "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
+ apiequality "k8s.io/apimachinery/pkg/api/equality"
apierrors "k8s.io/apimachinery/pkg/api/errors"
apilabels "k8s.io/apimachinery/pkg/labels"
"k8s.io/client-go/tools/record"
+ "planetscale.dev/vitess-operator/pkg/controller/vitessshard"
"planetscale.dev/vitess-operator/pkg/operator/metrics"
+ "planetscale.dev/vitess-operator/pkg/operator/names"
"planetscale.dev/vitess-operator/pkg/operator/reconciler"
"planetscale.dev/vitess-operator/pkg/operator/results"
+ "planetscale.dev/vitess-operator/pkg/operator/resync"
+ "planetscale.dev/vitess-operator/pkg/operator/vitessbackup"
+ "planetscale.dev/vitess-operator/pkg/operator/vttablet"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/manager"
@@ -46,19 +53,18 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
- ref "k8s.io/client-go/tools/reference"
planetscalev2 "planetscale.dev/vitess-operator/pkg/apis/planetscale/v2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
)
const (
- controllerName = "vitessbackupschedule-controller"
- vtctldclientPath = "/vt/bin/vtctldclient"
+ controllerName = "vitessbackupschedule-controller"
)
var (
maxConcurrentReconciles = flag.Int("vitessbackupschedule_concurrent_reconciles", 10, "the maximum number of different vitessbackupschedule resources to reconcile concurrently")
+ resyncPeriod = flag.Duration("vitessbackupschedule_resync_period", 1*time.Minute, "reconcile vitessbackupschedules with this period even if no Kubernetes events occur")
scheduledTimeAnnotation = "planetscale.com/backup-scheduled-at"
@@ -75,6 +81,7 @@ type (
ReconcileVitessBackupsSchedule struct {
client client.Client
scheme *runtime.Scheme
+ resync *resync.Periodic
recorder record.EventRecorder
reconciler *reconciler.Reconciler
}
@@ -106,6 +113,7 @@ func newReconciler(mgr manager.Manager) (*ReconcileVitessBackupsSchedule, error)
return &ReconcileVitessBackupsSchedule{
client: c,
scheme: scheme,
+ resync: resync.NewPeriodic(controllerName, *resyncPeriod),
recorder: recorder,
reconciler: reconciler.New(c, scheme, recorder),
}, nil
@@ -140,6 +148,11 @@ func add(mgr manager.Manager, r *ReconcileVitessBackupsSchedule) error {
}
}
+ // Periodically resync even when no Kubernetes events have come in.
+ if err := c.Watch(r.resync.WatchSource()); err != nil {
+ return err
+ }
+
return nil
}
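Note: the periodic resync lets the controller re-check running Jobs (completion, timeouts, PVC cleanup) even when no Kubernetes events arrive. Its three touchpoints are spread across this file; condensed:

    // newReconciler(): create the periodic resync queue.
    r.resync = resync.NewPeriodic(controllerName, *resyncPeriod)
    // add(): register it as an event source for the controller.
    if err := c.Watch(r.resync.WatchSource()); err != nil {
        return err
    }
    // Reconcile(): ask for this object to be revisited after the resync period.
    r.resync.Enqueue(req.NamespacedName)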
@@ -151,16 +164,14 @@ func add(mgr manager.Manager, r *ReconcileVitessBackupsSchedule) error {
// - List all jobs and define the last scheduled Job
// - Clean up old Job objects
// - Create a new Job if needed
-func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
- resultBuilder := &results.Builder{}
-
+// - Update the VitessBackupSchedule Status
+func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl.Request) (res ctrl.Result, err error) {
log = log.WithFields(logrus.Fields{
"namespace": req.Namespace,
"VitessBackupSchedule": req.Name,
})
log.Info("Reconciling VitessBackupSchedule")
- var err error
var vbsc planetscalev2.VitessBackupSchedule
if err = r.client.Get(ctx, req.NamespacedName, &vbsc); err != nil {
log.WithError(err).Error(" unable to fetch VitessBackupSchedule")
@@ -168,26 +179,74 @@ func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl
// Request object not found, could have been deleted after reconcile request.
// Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
// Return and don't requeue
- return resultBuilder.Result()
+ return ctrl.Result{}, nil
}
// Error reading the object - requeue the request.
- return resultBuilder.Error(err)
+ return ctrl.Result{}, err
}
+ // If the Suspend setting is set to true, we can skip adding any job, our work is done here.
+ if vbsc.Spec.Suspend != nil && *vbsc.Spec.Suspend {
+ log.Info("VitessBackupSchedule suspended, skipping")
+ return ctrl.Result{}, nil
+ }
+
+ oldStatus := vbsc.DeepCopy()
+ vbsc.Status = planetscalev2.NewVitessBackupScheduleStatus(vbsc.Status)
+
// Register this reconciling attempt no matter if we fail or succeed.
defer func() {
reconcileCount.WithLabelValues(vbsc.Name, metrics.Result(err)).Inc()
}()
- jobs, mostRecentTime, err := r.getJobsList(ctx, req, vbsc.Name)
- if err != nil {
- // We had an error reading the jobs, we can requeue.
- return resultBuilder.Error(err)
+ resultBuilder := &results.Builder{}
+ _, _ = resultBuilder.Merge(r.reconcileStrategies(ctx, req, vbsc))
+
+ if !apiequality.Semantic.DeepEqual(&vbsc.Status, &oldStatus.Status) {
+ if err := r.client.Status().Update(ctx, &vbsc); err != nil {
+ if !apierrors.IsConflict(err) {
+ log.WithError(err).Error("unable to update VitessBackupSchedule status")
+ }
+ _, _ = resultBuilder.Error(err)
+ }
+ }
+
+ // Request a periodic resync of this VitessBackupSchedule object to check
+ // if existing Jobs have finished or timed out and need to be cleaned up
+ // even if no Kubernetes events have occurred.
+ r.resync.Enqueue(req.NamespacedName)
+
+ return resultBuilder.Result()
+}
+
+func (r *ReconcileVitessBackupsSchedule) reconcileStrategies(ctx context.Context, req ctrl.Request, vbsc planetscalev2.VitessBackupSchedule) (ctrl.Result, error) {
+ resultBuilder := &results.Builder{}
+
+ for _, strategy := range vbsc.Spec.Strategy {
+ _, _ = resultBuilder.Merge(r.reconcileStrategy(ctx, strategy, req, vbsc))
}
+ return resultBuilder.Result()
+}
- err = r.updateVitessBackupScheduleStatus(ctx, mostRecentTime, vbsc, jobs.active)
+func (r *ReconcileVitessBackupsSchedule) reconcileStrategy(
+ ctx context.Context,
+ strategy planetscalev2.VitessBackupScheduleStrategy,
+ req ctrl.Request,
+ vbsc planetscalev2.VitessBackupSchedule,
+) (reconcile.Result, error) {
+ resultBuilder := &results.Builder{}
+
+ start, end, ok := strings.Cut(strategy.Shard, "-")
+ if !ok {
+ return resultBuilder.Error(fmt.Errorf("invalid strategy shard: %s", strategy.Shard))
+ }
+ vkr := planetscalev2.VitessKeyRange{
+ Start: start,
+ End: end,
+ }
+ jobs, mostRecentTime, err := r.getJobsList(ctx, req, vbsc, strategy.Keyspace, vkr.SafeName())
if err != nil {
- // We had an error updating the status, we can requeue.
+ // We had an error reading the jobs, we can requeue.
return resultBuilder.Error(err)
}
@@ -202,25 +261,22 @@ func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl
return resultBuilder.Error(err)
}
- // If the Suspend setting is set to true, we can skip adding any job, our work is done here.
- if vbsc.Spec.Suspend != nil && *vbsc.Spec.Suspend {
- log.Info("VitessBackupSchedule suspended, skipping")
- return ctrl.Result{}, nil
- }
-
- missedRun, nextRun, err := getNextSchedule(vbsc, time.Now())
+ missedRun, nextRun, err := getNextSchedule(vbsc, time.Now(), mostRecentTime)
if err != nil {
log.Error(err, "unable to figure out VitessBackupSchedule schedule")
// Re-queuing here does not make sense as we have an error with the schedule and the user needs to fix it first.
- return ctrl.Result{}, nil
+ return resultBuilder.Error(reconcile.TerminalError(err))
}
- // Ask kubernetes to re-queue for the next scheduled job, and skip if we don't miss any run.
- scheduledResult := ctrl.Result{RequeueAfter: nextRun.Sub(time.Now())}
+ // If we did not miss any run, we can skip and not requeue anything
if missedRun.IsZero() {
- return scheduledResult, nil
+ return resultBuilder.Result()
}
+ // Keep track of when we need to requeue this job
+ requeueAfter := nextRun.Sub(time.Now())
+ _, _ = resultBuilder.RequeueAfter(requeueAfter)
+
// Check whether we are too late to create this Job or not. The startingDeadlineSeconds field will help us
// schedule Jobs that are late.
tooLate := false
@@ -229,37 +285,39 @@ func (r *ReconcileVitessBackupsSchedule) Reconcile(ctx context.Context, req ctrl
}
if tooLate {
log.Infof("missed starting deadline for latest run; skipping; next run is scheduled for: %s", nextRun.Format(time.RFC3339))
- return scheduledResult, nil
+ return resultBuilder.Result()
}
// Check concurrency policy and skip this job if we have ForbidConcurrent set plus an active job
if vbsc.Spec.ConcurrencyPolicy == planetscalev2.ForbidConcurrent && len(jobs.active) > 0 {
log.Infof("concurrency policy blocks concurrent runs: skipping, number of active jobs: %d", len(jobs.active))
- return scheduledResult, nil
+ return resultBuilder.Result()
}
// Now that the different policies are checked, we can create and apply our new job.
- job, err := r.createJob(ctx, &vbsc, missedRun)
+ job, err := r.createJob(ctx, vbsc, strategy, missedRun, vkr)
if err != nil {
// Re-queuing here does not make sense as we have an error with the template and the user needs to fix it first.
log.WithError(err).Error("unable to construct job from template")
- return ctrl.Result{}, err
+ return resultBuilder.Error(reconcile.TerminalError(err))
}
+
if err = r.client.Create(ctx, job); err != nil {
// if the job already exists it means another reconciling loop created the job since we last fetched
// the list of jobs to create, we can safely return without failing.
if apierrors.IsAlreadyExists(err) {
- return ctrl.Result{}, nil
+ log.Infof("job %s already exists, will retry in %s", job.Name, requeueAfter.String())
+ return resultBuilder.Result()
}
// Simply re-queue here
return resultBuilder.Error(err)
}
-
- log.Infof("created new job: %s, next job scheduled in %s", job.Name, scheduledResult.RequeueAfter.String())
- return scheduledResult, nil
+ log.Infof("created new job: %s, next job scheduled in %s", job.Name, requeueAfter.String())
+ vbsc.Status.LastScheduledTimes[strategy.Name] = &metav1.Time{Time: missedRun}
+ return resultBuilder.Result()
}
-func getNextSchedule(vbsc planetscalev2.VitessBackupSchedule, now time.Time) (time.Time, time.Time, error) {
+func getNextSchedule(vbsc planetscalev2.VitessBackupSchedule, now time.Time, mostRecentTime *time.Time) (time.Time, time.Time, error) {
sched, err := cron.ParseStandard(vbsc.Spec.Schedule)
if err != nil {
return time.Time{}, time.Time{}, fmt.Errorf("unable to parse schedule %q: %v", vbsc.Spec.Schedule, err)
@@ -267,30 +325,27 @@ func getNextSchedule(vbsc planetscalev2.VitessBackupSchedule, now time.Time) (ti
// Set the last scheduled time by either looking at the VitessBackupSchedule's Status or
// by looking at its creation time.
- var latestRun time.Time
- if vbsc.Status.LastScheduledTime != nil {
- latestRun = vbsc.Status.LastScheduledTime.Time
- } else {
- latestRun = vbsc.ObjectMeta.CreationTimestamp.Time
+ if mostRecentTime == nil {
+ mostRecentTime = &vbsc.ObjectMeta.CreationTimestamp.Time
}
if vbsc.Spec.StartingDeadlineSeconds != nil {
// controller is not going to schedule anything below this point
schedulingDeadline := now.Add(-time.Second * time.Duration(*vbsc.Spec.StartingDeadlineSeconds))
- if schedulingDeadline.After(latestRun) {
- latestRun = schedulingDeadline
+ if schedulingDeadline.After(*mostRecentTime) {
+ *mostRecentTime = schedulingDeadline
}
}
// Next schedule is later, simply return the next scheduled time.
- if latestRun.After(now) {
+ if mostRecentTime.After(now) {
return time.Time{}, sched.Next(now), nil
}
var lastMissed time.Time
missedRuns := 0
- for t := sched.Next(latestRun); !t.After(now); t = sched.Next(t) {
+ for t := sched.Next(*mostRecentTime); !t.After(now); t = sched.Next(t) {
lastMissed = t
missedRuns++
@@ -303,37 +358,24 @@ func getNextSchedule(vbsc planetscalev2.VitessBackupSchedule, now time.Time) (ti
return lastMissed, sched.Next(now), nil
}
-func (r *ReconcileVitessBackupsSchedule) updateVitessBackupScheduleStatus(ctx context.Context, mostRecentTime *time.Time, vbsc planetscalev2.VitessBackupSchedule, activeJobs []*kbatch.Job) error {
- if mostRecentTime != nil {
- vbsc.Status.LastScheduledTime = &metav1.Time{Time: *mostRecentTime}
- } else {
- vbsc.Status.LastScheduledTime = nil
- }
-
- vbsc.Status.Active = make([]corev1.ObjectReference, 0, len(activeJobs))
- for _, activeJob := range activeJobs {
- jobRef, err := ref.GetReference(r.scheme, activeJob)
- if err != nil {
- log.WithError(err).Errorf("unable to make reference to active job: %s", jobRef.Name)
- continue
- }
- vbsc.Status.Active = append(vbsc.Status.Active, *jobRef)
- }
-
- if err := r.client.Status().Update(ctx, &vbsc); err != nil {
- log.WithError(err).Error("unable to update VitessBackupSchedule status")
- return err
- }
- return nil
-}
-
// getJobsList fetches all existing Jobs in the cluster and return them by categories: active, failed or successful.
// It also returns at what time was the last job created, which is needed to update VitessBackupSchedule's status,
// and plan future jobs.
-func (r *ReconcileVitessBackupsSchedule) getJobsList(ctx context.Context, req ctrl.Request, vbscName string) (jobsList, *time.Time, error) {
+func (r *ReconcileVitessBackupsSchedule) getJobsList(
+ ctx context.Context,
+ req ctrl.Request,
+ vbsc planetscalev2.VitessBackupSchedule,
+ keyspace string,
+ shardSafeName string,
+) (jobsList, *time.Time, error) {
var existingJobs kbatch.JobList
- err := r.client.List(ctx, &existingJobs, client.InNamespace(req.Namespace), client.MatchingLabels{planetscalev2.BackupScheduleLabel: vbscName})
+ err := r.client.List(ctx, &existingJobs, client.InNamespace(req.Namespace), client.MatchingLabels{
+ planetscalev2.BackupScheduleLabel: vbsc.Name,
+ planetscalev2.ClusterLabel: vbsc.Spec.Cluster,
+ planetscalev2.KeyspaceLabel: keyspace,
+ planetscalev2.ShardLabel: shardSafeName,
+ })
if err != nil && !apierrors.IsNotFound(err) {
log.WithError(err).Error("unable to list Jobs in cluster")
return jobsList{}, nil, err
@@ -386,11 +428,27 @@ func (r *ReconcileVitessBackupsSchedule) cleanupJobsWithLimit(ctx context.Contex
if int32(i) >= int32(len(jobs))-limit {
break
}
+ // delete the job
if err := r.client.Delete(ctx, job, client.PropagationPolicy(metav1.DeletePropagationBackground)); (err) != nil {
log.WithError(err).Errorf("unable to delete old job: %s", job.Name)
} else {
log.Infof("deleted old job: %s", job.Name)
}
+
+ // delete the vtbackup pod's PVC
+ pvc := &corev1.PersistentVolumeClaim{}
+ err := r.client.Get(ctx, client.ObjectKey{Namespace: job.Namespace, Name: job.Name}, pvc)
+ if err != nil {
+ log.WithError(err).Errorf("unable to get PVC for job: %s", job.Name)
+ if apierrors.IsNotFound(err) {
+ continue
+ }
+ }
+ if err := r.client.Delete(ctx, pvc, client.PropagationPolicy(metav1.DeletePropagationBackground)); (err) != nil {
+ log.WithError(err).Errorf("unable to delete old PVC for job: %s", job.Name)
+ } else {
+ log.Infof("deleted old PVC for job: %s", job.Name)
+ }
}
}
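Note: every scheduled vtbackup Job now has a PVC that shares the Job's name, so both cleanup paths (the history limits above and the timeout handling below) delete the PVC together with the Job. A hypothetical helper, not part of this diff, showing the shared pattern; unlike the inlined code it simply gives up on any Get error:

    func (r *ReconcileVitessBackupsSchedule) deleteJobPVC(ctx context.Context, job *kbatch.Job) {
        pvc := &corev1.PersistentVolumeClaim{}
        key := client.ObjectKey{Namespace: job.Namespace, Name: job.Name} // the PVC shares the Job's name
        if err := r.client.Get(ctx, key, pvc); err != nil {
            log.WithError(err).Errorf("unable to get PVC for job: %s", job.Name)
            return
        }
        if err := r.client.Delete(ctx, pvc, client.PropagationPolicy(metav1.DeletePropagationBackground)); err != nil {
            log.WithError(err).Errorf("unable to delete old PVC for job: %s", job.Name)
            return
        }
        log.Infof("deleted old PVC for job: %s", job.Name)
    }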
@@ -410,6 +468,20 @@ func (r *ReconcileVitessBackupsSchedule) removeTimeoutJobs(ctx context.Context,
log.Infof("deleted timed out job: %s", job.Name)
}
timeoutJobsCount.WithLabelValues(vbscName, metrics.Result(err)).Inc()
+
+ pvc := &corev1.PersistentVolumeClaim{}
+ err := r.client.Get(ctx, client.ObjectKey{Namespace: job.Namespace, Name: job.Name}, pvc)
+ if err != nil {
+ log.WithError(err).Errorf("unable to get PVC for timed out job: %s", job.Name)
+ if apierrors.IsNotFound(err) {
+ continue
+ }
+ }
+ if err := r.client.Delete(ctx, pvc, client.PropagationPolicy(metav1.DeletePropagationBackground)); (err) != nil {
+ log.WithError(err).Errorf("unable to delete old PVC for timed out job: %s", job.Name)
+ } else {
+ log.Infof("deleted old PVC for timed out job: %s", job.Name)
+ }
}
}
return nil
@@ -439,13 +511,24 @@ func getScheduledTimeForJob(job *kbatch.Job) (*time.Time, error) {
return &timeParsed, nil
}
-func (r *ReconcileVitessBackupsSchedule) createJob(ctx context.Context, vbsc *planetscalev2.VitessBackupSchedule, scheduledTime time.Time) (*kbatch.Job, error) {
- name := fmt.Sprintf("%s-%d", vbsc.Name, scheduledTime.Unix())
+func (r *ReconcileVitessBackupsSchedule) createJob(
+ ctx context.Context,
+ vbsc planetscalev2.VitessBackupSchedule,
+ strategy planetscalev2.VitessBackupScheduleStrategy,
+ scheduledTime time.Time,
+ vkr planetscalev2.VitessKeyRange,
+) (*kbatch.Job, error) {
+ name := names.JoinWithConstraints(names.ServiceConstraints, vbsc.Name, strategy.Keyspace, vkr.SafeName(), strconv.Itoa(int(scheduledTime.Unix())))
+
+ labels := map[string]string{
+ planetscalev2.BackupScheduleLabel: vbsc.Name,
+ planetscalev2.ClusterLabel: vbsc.Spec.Cluster,
+ planetscalev2.KeyspaceLabel: strategy.Keyspace,
+ planetscalev2.ShardLabel: vkr.SafeName(),
+ }
meta := metav1.ObjectMeta{
- Labels: map[string]string{
- planetscalev2.BackupScheduleLabel: vbsc.Name,
- },
+ Labels: labels,
Annotations: make(map[string]string),
Name: name,
Namespace: vbsc.Namespace,
@@ -457,7 +540,7 @@ func (r *ReconcileVitessBackupsSchedule) createJob(ctx context.Context, vbsc *pl
maps.Copy(meta.Labels, vbsc.Labels)
- pod, err := r.createJobPod(ctx, vbsc, name)
+ pod, vtbackupSpec, err := r.createJobPod(ctx, vbsc, strategy, name, vkr, labels)
if err != nil {
return nil, err
}
@@ -466,76 +549,83 @@ func (r *ReconcileVitessBackupsSchedule) createJob(ctx context.Context, vbsc *pl
Spec: kbatch.JobSpec{
Template: corev1.PodTemplateSpec{
ObjectMeta: meta,
- Spec: pod,
+ Spec: pod.Spec,
},
},
}
- if err := ctrl.SetControllerReference(vbsc, job, r.scheme); err != nil {
+ if err := ctrl.SetControllerReference(&vbsc, job, r.scheme); err != nil {
return nil, err
}
- return job, nil
-}
-
-func (r *ReconcileVitessBackupsSchedule) createJobPod(ctx context.Context, vbsc *planetscalev2.VitessBackupSchedule, name string) (pod corev1.PodSpec, err error) {
- getVtctldServiceName := func(cluster string) (string, error) {
- vtctldServiceName, vtctldServicePort, err := r.getVtctldServiceName(ctx, vbsc, cluster)
+ // Create the corresponding PVC for the new vtbackup pod
+ pvc := &corev1.PersistentVolumeClaim{}
+ key := client.ObjectKey{
+ Namespace: job.Namespace,
+ Name: name,
+ }
+ err = r.client.Get(ctx, key, pvc)
+ if err != nil {
+ if !apierrors.IsNotFound(err) {
+ return nil, err
+ }
+ newPVC := vttablet.NewPVC(key, vtbackupSpec.TabletSpec)
+ if err := ctrl.SetControllerReference(&vbsc, newPVC, r.scheme); err != nil {
+ return nil, err
+ }
+ err = r.client.Create(ctx, newPVC)
if err != nil {
- return "", err
+ return nil, err
}
- return fmt.Sprintf("--server=%s:%d", vtctldServiceName, vtctldServicePort), nil
}
- // It is fine to not have any default in the event there is no strategy as the CRD validation
- // ensures that there will be at least one item in this list. The YAML cannot be applied with
- // empty list of strategies.
- var cmd strings.Builder
+ return job, nil
+}
- addNewCmd := func(i int) {
- if i > 0 {
- cmd.WriteString(" && ")
- }
+func (r *ReconcileVitessBackupsSchedule) createJobPod(
+ ctx context.Context,
+ vbsc planetscalev2.VitessBackupSchedule,
+ strategy planetscalev2.VitessBackupScheduleStrategy,
+ name string,
+ vkr planetscalev2.VitessKeyRange,
+ labels map[string]string,
+) (pod *corev1.Pod, spec *vttablet.BackupSpec, err error) {
+ vts, err := r.getShardFromKeyspace(ctx, vbsc.Namespace, vbsc.Spec.Cluster, strategy.Keyspace, strategy.Shard)
+ if err != nil {
+ return nil, nil, err
}
- for i, strategy := range vbsc.Spec.Strategy {
- vtctldclientServerArg, err := getVtctldServiceName(vbsc.Spec.Cluster)
- if err != nil {
- return corev1.PodSpec{}, err
- }
+ _, completeBackups, err := vitessbackup.GetBackups(ctx, vbsc.Namespace, vbsc.Spec.Cluster, strategy.Keyspace, vkr.SafeName(),
+ func(ctx context.Context, allBackupsList *planetscalev2.VitessBackupList, listOpts *client.ListOptions) error {
+ return r.client.List(ctx, allBackupsList, listOpts)
+ },
+ )
+ if err != nil {
+ return nil, nil, err
+ }
- addNewCmd(i)
- switch strategy.Name {
- case planetscalev2.BackupShard:
- createVtctldClientCommand(&cmd, vtctldclientServerArg, strategy.ExtraFlags, strategy.Keyspace, strategy.Shard)
+ backupType := vitessbackup.TypeUpdate
+ if len(completeBackups) == 0 {
+ if vts.Status.HasMaster == corev1.ConditionTrue {
+ backupType = vitessbackup.TypeFirstBackup
+ } else {
+ backupType = vitessbackup.TypeInit
}
-
}
- pod = corev1.PodSpec{
- Containers: []corev1.Container{{
- Name: name,
- Image: vbsc.Spec.Image,
- ImagePullPolicy: vbsc.Spec.ImagePullPolicy,
- Resources: vbsc.Spec.Resources,
- Args: []string{"/bin/sh", "-c", cmd.String()},
- }},
- RestartPolicy: corev1.RestartPolicyOnFailure,
- Affinity: vbsc.Spec.Affinity,
+ podKey := client.ObjectKey{
+ Namespace: vbsc.Namespace,
+ Name: name,
}
- return pod, nil
-}
-
-func createVtctldClientCommand(cmd *strings.Builder, serverAddr string, extraFlags map[string]string, keyspace, shard string) {
- cmd.WriteString(fmt.Sprintf("%s %s BackupShard", vtctldclientPath, serverAddr))
+ vtbackupSpec := vitessshard.MakeVtbackupSpec(podKey, &vts, labels, backupType)
+ p := vttablet.NewBackupPod(podKey, vtbackupSpec, vts.Spec.Images.Mysqld.Image())
- // Add any flags
- for key, value := range extraFlags {
- cmd.WriteString(fmt.Sprintf(" --%s=%s", key, value))
- }
+ // Explicitly do not restart on failure. The VitessBackupSchedule controller will retry the failed job
+ // during the next scheduled run.
+ p.Spec.RestartPolicy = corev1.RestartPolicyNever
- // Add keyspace/shard
- cmd.WriteString(fmt.Sprintf(" %s/%s", keyspace, shard))
+ p.Spec.Affinity = vbsc.Spec.Affinity
+ return p, vtbackupSpec, nil
}
func (r *ReconcileVitessBackupsSchedule) getVtctldServiceName(ctx context.Context, vbsc *planetscalev2.VitessBackupSchedule, cluster string) (svcName string, svcPort int32, err error) {
@@ -568,7 +658,7 @@ func (r *ReconcileVitessBackupsSchedule) getVtctldServiceName(ctx context.Contex
return svcName, svcPort, nil
}
-func (r *ReconcileVitessBackupsSchedule) getAllShardsInKeyspace(ctx context.Context, namespace, cluster, keyspace string) ([]string, error) {
+func (r *ReconcileVitessBackupsSchedule) getShardFromKeyspace(ctx context.Context, namespace, cluster, keyspace, shard string) (planetscalev2.VitessShard, error) {
shardsList := &planetscalev2.VitessShardList{}
listOpts := &client.ListOptions{
Namespace: namespace,
@@ -578,13 +668,14 @@ func (r *ReconcileVitessBackupsSchedule) getAllShardsInKeyspace(ctx context.Cont
}.AsSelector(),
}
if err := r.client.List(ctx, shardsList, listOpts); err != nil {
- return nil, fmt.Errorf("unable to list shards of keyspace %s in %s: %v", keyspace, namespace, err)
+ return planetscalev2.VitessShard{}, fmt.Errorf("unable to list shards of keyspace %s in %s: %v", keyspace, namespace, err)
}
- var result []string
for _, item := range shardsList.Items {
- result = append(result, item.Spec.Name)
+ if item.Spec.KeyRange.String() == shard {
+ return item, nil
+ }
}
- return result, nil
+ return planetscalev2.VitessShard{}, fmt.Errorf("unable to find shard %s in keyspace %s in %s", shard, keyspace, namespace)
}
type keyspace struct {
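Note: Jobs are now named and labeled per strategy, and the same labels are used when listing existing Jobs (getJobsList) and when looking up backups (GetBackups). Condensed, a strategy's shard string turns into the labels and Job name as follows (the SafeName form, e.g. "-80" rendered as "x-80", is what the end-to-end tests grep for):

    start, end, _ := strings.Cut(strategy.Shard, "-") // e.g. "-80" -> "", "80"
    vkr := planetscalev2.VitessKeyRange{Start: start, End: end}
    labels := map[string]string{
        planetscalev2.BackupScheduleLabel: vbsc.Name,
        planetscalev2.ClusterLabel:        vbsc.Spec.Cluster,
        planetscalev2.KeyspaceLabel:       strategy.Keyspace,
        planetscalev2.ShardLabel:          vkr.SafeName(),
    }
    name := names.JoinWithConstraints(names.ServiceConstraints,
        vbsc.Name, strategy.Keyspace, vkr.SafeName(), strconv.Itoa(int(scheduledTime.Unix())))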
diff --git a/pkg/controller/vitessshard/reconcile_backup_job.go b/pkg/controller/vitessshard/reconcile_backup_job.go
index f580ce91..14aef2a7 100644
--- a/pkg/controller/vitessshard/reconcile_backup_job.go
+++ b/pkg/controller/vitessshard/reconcile_backup_job.go
@@ -22,7 +22,6 @@ import (
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
- apilabels "k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
@@ -56,27 +55,15 @@ func (r *ReconcileVitessShard) reconcileBackupJob(ctx context.Context, vts *plan
vitessbackup.TypeLabel: vitessbackup.TypeInit,
}
- // List all backups for this shard, across all storage locations.
- // We'll use the latest observed state of backups to decide whether to take
- // a new one. This list could be out of date because it's populated by
- // polling the Vitess API (see the VitessBackupStorage controller), but as
- // long as it's eventually consistent, we'll converge to the right behavior.
- allBackups := &planetscalev2.VitessBackupList{}
- listOpts := &client.ListOptions{
- Namespace: vts.Namespace,
- LabelSelector: apilabels.SelectorFromSet(apilabels.Set{
- planetscalev2.ClusterLabel: clusterName,
- planetscalev2.KeyspaceLabel: keyspaceName,
- planetscalev2.ShardLabel: shardSafeName,
- }),
- }
- if err := r.client.List(ctx, allBackups, listOpts); err != nil {
+ allBackups, completeBackups, err := vitessbackup.GetBackups(ctx, vts.Namespace, clusterName, keyspaceName, shardSafeName,
+ func(ctx context.Context, allBackupsList *planetscalev2.VitessBackupList, listOpts *client.ListOptions) error {
+ return r.client.List(ctx, allBackupsList, listOpts)
+ },
+ )
+ if err != nil {
return resultBuilder.Error(err)
}
- updateBackupStatus(vts, allBackups.Items)
-
- // Here we only care about complete backups.
- completeBackups := vitessbackup.CompleteBackups(allBackups.Items)
+ updateBackupStatus(vts, allBackups)
// Generate keys (object names) for all desired backup Pods and PVCs.
// Keep a map back from generated names to the backup specs.
@@ -97,7 +84,11 @@ func (r *ReconcileVitessShard) reconcileBackupJob(ctx context.Context, vts *plan
// scratch (not from any tablet). If we're wrong and a backup exists
// already, the idempotent vtbackup "initial backup" mode will just do
// nothing and return success.
- initSpec := vtbackupInitSpec(initPodKey, vts, labels)
+ backupType := vitessbackup.TypeFirstBackup
+ if vts.Status.HasMaster != corev1.ConditionTrue {
+ backupType = vitessbackup.TypeInit
+ }
+ initSpec := MakeVtbackupSpec(initPodKey, vts, labels, backupType)
if initSpec != nil {
podKeys = append(podKeys, initPodKey)
if initSpec.TabletSpec.DataVolumePVCSpec != nil {
@@ -112,7 +103,7 @@ func (r *ReconcileVitessShard) reconcileBackupJob(ctx context.Context, vts *plan
// Reconcile vtbackup PVCs. Use the same key as the corresponding Pod,
// but only if the Pod expects a PVC.
- err := r.reconciler.ReconcileObjectSet(ctx, vts, pvcKeys, labels, reconciler.Strategy{
+ err = r.reconciler.ReconcileObjectSet(ctx, vts, pvcKeys, labels, reconciler.Strategy{
Kind: &corev1.PersistentVolumeClaim{},
New: func(key client.ObjectKey) runtime.Object {
@@ -181,7 +172,7 @@ func (r *ReconcileVitessShard) reconcileBackupJob(ctx context.Context, vts *plan
return resultBuilder.Result()
}
-func vtbackupInitSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, parentLabels map[string]string) *vttablet.BackupSpec {
+func MakeVtbackupSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, parentLabels map[string]string, typ string) *vttablet.BackupSpec {
// If we specifically set our cluster to avoid initial backups, bail early.
if !*vts.Spec.Replication.InitializeBackup {
return nil
@@ -196,7 +187,7 @@ func vtbackupInitSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, pare
// Make a vtbackup spec that's a similar shape to the first tablet pool.
// This should give it enough resources to run mysqld and restore a backup,
// since all tablets need to be able to do that, regardless of type.
- return vtbackupSpec(key, vts, parentLabels, &vts.Spec.TabletPools[0], vitessbackup.TypeInit)
+ return vtbackupSpec(key, vts, parentLabels, &vts.Spec.TabletPools[0], typ)
}
func vtbackupSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, parentLabels map[string]string, pool *planetscalev2.VitessShardTabletPool, backupType string) *vttablet.BackupSpec {
@@ -254,6 +245,7 @@ func vtbackupSpec(key client.ObjectKey, vts *planetscalev2.VitessShard, parentLa
return &vttablet.BackupSpec{
InitialBackup: backupType == vitessbackup.TypeInit,
+ AllowFirstBackup: backupType == vitessbackup.TypeFirstBackup,
MinBackupInterval: minBackupInterval,
MinRetentionTime: minRetentionTime,
MinRetentionCount: minRetentionCount,
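Note: MakeVtbackupSpec (the renamed, exported vtbackupInitSpec) is now shared by the shard controller and the schedule controller, with the backup type decided by the caller. Condensed, the decision made at the schedule controller's call site is:

    backupType := vitessbackup.TypeUpdate // complete backups already exist
    if len(completeBackups) == 0 {
        if vts.Status.HasMaster == corev1.ConditionTrue {
            // The shard already has a primary but no backup yet: sets AllowFirstBackup.
            backupType = vitessbackup.TypeFirstBackup
        } else {
            // Empty shard: seed it with an initial backup (sets InitialBackup).
            backupType = vitessbackup.TypeInit
        }
    }
    spec := vitessshard.MakeVtbackupSpec(podKey, &vts, labels, backupType)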
diff --git a/pkg/operator/vitessbackup/labels.go b/pkg/operator/vitessbackup/labels.go
index e8e9a785..8c6d9819 100644
--- a/pkg/operator/vitessbackup/labels.go
+++ b/pkg/operator/vitessbackup/labels.go
@@ -24,6 +24,8 @@ const (
// TypeInit is a backup taken to initialize an empty shard.
TypeInit = "init"
+ // TypeFirstBackup is a backup taken when no other backups exist in an existing shard.
+ TypeFirstBackup = "empty"
// TypeUpdate is a backup taken to update the latest backup for a shard.
TypeUpdate = "update"
)
diff --git a/pkg/operator/vitessbackup/object.go b/pkg/operator/vitessbackup/object.go
index b5c011d8..7c74495a 100644
--- a/pkg/operator/vitessbackup/object.go
+++ b/pkg/operator/vitessbackup/object.go
@@ -17,11 +17,14 @@ limitations under the License.
package vitessbackup
import (
+ "context"
"fmt"
"strconv"
"strings"
"time"
+ apilabels "k8s.io/apimachinery/pkg/labels"
+ "sigs.k8s.io/controller-runtime/pkg/client"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
"vitess.io/vitess/go/vt/topo/topoproto"
@@ -99,8 +102,42 @@ func LatestForLocation(locationName string, backups []*planetscalev2.VitessBacku
return latest
}
-// CompleteBackups returns a list of only the complete backups from the input.
-func CompleteBackups(backups []planetscalev2.VitessBackup) []*planetscalev2.VitessBackup {
+// GetBackups returns the list of all backups, along with the subset of complete backups, for the given
+// keyspace/shard in the given cluster.
+// A callback that lists backups using the controller's client must be provided.
+func GetBackups(
+ ctx context.Context,
+ namespace, clusterName, keyspaceName, shardSafeName string,
+ listBackups func(context.Context, *planetscalev2.VitessBackupList, *client.ListOptions) error,
+) (allBackups []planetscalev2.VitessBackup, completeBackups []*planetscalev2.VitessBackup, err error) {
+ // List all backups for this shard, across all storage locations.
+ // We'll use the latest observed state of backups to decide whether to take
+ // a new one. This list could be out of date because it's populated by
+ // polling the Vitess API (see the VitessBackupStorage controller), but as
+ // long as it's eventually consistent, we'll converge to the right behavior.
+ allBackupsList := &planetscalev2.VitessBackupList{}
+ listOpts := &client.ListOptions{
+ Namespace: namespace,
+ LabelSelector: apilabels.SelectorFromSet(apilabels.Set{
+ planetscalev2.ClusterLabel: clusterName,
+ planetscalev2.KeyspaceLabel: keyspaceName,
+ planetscalev2.ShardLabel: shardSafeName,
+ }),
+ }
+ if err = listBackups(ctx, allBackupsList, listOpts); err != nil {
+ return nil, nil, err
+ }
+
+ allBackups = allBackupsList.Items
+
+ // Filter by complete backups.
+ completeBackups = getCompleteBackups(allBackups)
+
+ return allBackups, completeBackups, nil
+}
+
+// getCompleteBackups returns a list of only the complete backups from the input.
+func getCompleteBackups(backups []planetscalev2.VitessBackup) []*planetscalev2.VitessBackup {
completeBackups := []*planetscalev2.VitessBackup{}
for i := range backups {
if backups[i].Status.Complete {
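Note: GetBackups takes a list callback rather than a client so each controller can list with its own client; both call sites in this diff use the same shape:

    allBackups, completeBackups, err := vitessbackup.GetBackups(ctx, namespace, clusterName, keyspaceName, shardSafeName,
        func(ctx context.Context, list *planetscalev2.VitessBackupList, opts *client.ListOptions) error {
            return r.client.List(ctx, list, opts)
        },
    )
    if err != nil {
        return resultBuilder.Error(err)
    }
    // allBackups feeds status updates (updateBackupStatus in the shard controller);
    // completeBackups drives the backup-type decision shown above.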
diff --git a/pkg/operator/vttablet/flags.go b/pkg/operator/vttablet/flags.go
index 1ec42d0c..6bc34847 100644
--- a/pkg/operator/vttablet/flags.go
+++ b/pkg/operator/vttablet/flags.go
@@ -104,6 +104,7 @@ func init() {
// vtbackup-specific flags.
"concurrency": vtbackupConcurrency,
"initial_backup": backupSpec.InitialBackup,
+ "allow_first_backup": backupSpec.AllowFirstBackup,
"min_backup_interval": backupSpec.MinBackupInterval,
"min_retention_time": backupSpec.MinRetentionTime,
"min_retention_count": backupSpec.MinRetentionCount,
diff --git a/pkg/operator/vttablet/vtbackup_pod.go b/pkg/operator/vttablet/vtbackup_pod.go
index 813257f9..68b38567 100644
--- a/pkg/operator/vttablet/vtbackup_pod.go
+++ b/pkg/operator/vttablet/vtbackup_pod.go
@@ -61,6 +61,8 @@ type BackupSpec struct {
// aren't any. Instead, bootstrap the shard with a backup of an empty
// database.
InitialBackup bool
+ // AllowFirstBackup enables vtbackup to take a backup even if none exist.
+ AllowFirstBackup bool
// MinBackupInterval is the minimum spacing between backups.
// A new backup will only be taken if it's been at least this long since the
// most recent backup.
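Note: AllowFirstBackup complements InitialBackup: InitialBackup bootstraps an empty shard with a backup of an empty database, while AllowFirstBackup lets vtbackup take the first real backup of a shard that already has a primary but no backups; flags.go above maps it to vtbackup's allow_first_backup flag. A sketch of the two spec shapes (all other BackupSpec fields omitted):

    // Empty shard, no primary elected yet.
    initSpec := &vttablet.BackupSpec{InitialBackup: true}
    // Shard already serving, but no backups exist yet.
    firstSpec := &vttablet.BackupSpec{AllowFirstBackup: true}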
diff --git a/test/endtoend/backup_schedule_test.sh b/test/endtoend/backup_schedule_test.sh
index e540bf4c..23393289 100755
--- a/test/endtoend/backup_schedule_test.sh
+++ b/test/endtoend/backup_schedule_test.sh
@@ -11,47 +11,30 @@ function takedownShard() {
checkPodStatusWithTimeout "example-vttablet-zone1" 0
}
-function checkVitessBackupScheduleStatusWithTimeout() {
- regex=$1
-
- for i in {1..1200} ; do
- if [[ $(kubectl get VitessBackupSchedule | grep -E "${regex}" | wc -l) -eq 1 ]]; then
- echo "$regex found"
- return
- fi
- sleep 1
- done
- echo -e "ERROR: checkPodStatusWithTimeout timeout to find pod matching:\ngot:\n$out\nfor regex: $regex"
- exit 1
-}
-
function verifyListBackupsOutputWithSchedule() {
echo -e "Check for VitessBackupSchedule status"
checkVitessBackupScheduleStatusWithTimeout "example-vbsc-every-minute(.*)"
checkVitessBackupScheduleStatusWithTimeout "example-vbsc-every-five-minute(.*)"
echo -e "Check for number of backups in the cluster"
- # Sleep for over 6 minutes, during this time we should have at the very minimum
- # 7 backups. At least: 6 backups from the every-minute schedule, and 1 backup
- # from the every-five-minute schedule.
- for i in {1..6} ; do
+ # Wait for up to 10 minutes; by then we should have at least 3 backups: 1 from the initial vtbackup pod,
+ # at least 1 from the every-minute schedule, and 1 from the every-five-minute schedule.
+ for i in {1..600} ; do
# Ensure that we can view the backup files from the host.
docker exec -it $(docker container ls --format '{{.Names}}' | grep kind) chmod o+rwx -R /backup > /dev/null
backupCount=$(kubectl get vtb --no-headers | wc -l)
echo "Found ${backupCount} backups"
- if [[ "${backupCount}" -ge 7 ]]; then
- break
+ if [[ "${backupCount}" -ge 3 ]]; then
+ echo -e "Check for Jobs' pods"
+ # Here check explicitly that the every five minute schedule ran at least once during the 10 minutes sleep
+ checkPodExistWithTimeout "example-vbsc-every-minute-(.*)0/1(.*)Completed(.*)"
+ checkPodExistWithTimeout "example-vbsc-every-five-minute-(.*)0/1(.*)Completed(.*)"
+ return
fi
- sleep 100
+ sleep 1
done
- if [[ "${backupCount}" -lt 7 ]]; then
- echo "Did not find at least 7 backups"
- exit 1
- fi
-
- echo -e "Check for Jobs' pods"
- checkPodStatusWithTimeout "example-vbsc-every-minute-(.*)0/1(.*)Completed(.*)" 3
- checkPodStatusWithTimeout "example-vbsc-every-five-minute-(.*)0/1(.*)Completed(.*)" 2
+ echo "Did not find at least 3 backups"
+ exit 1
}
# Test setup
diff --git a/test/endtoend/operator/101_initial_cluster.yaml b/test/endtoend/operator/101_initial_cluster.yaml
index 76ef570a..e0a34ed1 100644
--- a/test/endtoend/operator/101_initial_cluster.yaml
+++ b/test/endtoend/operator/101_initial_cluster.yaml
@@ -7,6 +7,13 @@ kind: VitessCluster
metadata:
name: example
spec:
+ backup:
+ engine: xtrabackup
+ locations:
+ - volume:
+ hostPath:
+ path: /backup
+ type: Directory
images:
vtctld: vitess/lite:v21.0.0
vtgate: vitess/lite:v21.0.0
@@ -14,7 +21,7 @@ spec:
vtorc: vitess/lite:v21.0.0
vtbackup: vitess/lite:v21.0.0
mysqld:
- mysql80Compatible: mysql:8.0.40
+ mysql80Compatible: vitess/lite:v21.0.0
mysqldExporter: prom/mysqld-exporter:v0.14.0
cells:
- name: zone1
diff --git a/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml b/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml
index b31c3032..b6379901 100644
--- a/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml
+++ b/test/endtoend/operator/101_initial_cluster_backup_schedule.yaml
@@ -27,7 +27,7 @@ spec:
failedJobsHistoryLimit: 3
jobTimeoutMinute: 5
strategies:
- - name: BackupShard
+ - name: commerce-x
keyspace: "commerce"
shard: "-"
- name: "every-five-minute"
@@ -42,7 +42,7 @@ spec:
failedJobsHistoryLimit: 3
jobTimeoutMinute: 5
strategies:
- - name: BackupShard
+ - name: commerce-x
keyspace: "commerce"
shard: "-"
images:
@@ -52,7 +52,7 @@ spec:
vtorc: vitess/lite:latest
vtbackup: vitess/lite:latest
mysqld:
- mysql80Compatible: mysql:8.0.40
+ mysql80Compatible: vitess/lite:latest
mysqldExporter: prom/mysqld-exporter:v0.14.0
cells:
- name: zone1
diff --git a/test/endtoend/operator/201_customer_tablets.yaml b/test/endtoend/operator/201_customer_tablets.yaml
index ef1f3546..c0565c7d 100644
--- a/test/endtoend/operator/201_customer_tablets.yaml
+++ b/test/endtoend/operator/201_customer_tablets.yaml
@@ -3,6 +3,13 @@ kind: VitessCluster
metadata:
name: example
spec:
+ backup:
+ engine: xtrabackup
+ locations:
+ - volume:
+ hostPath:
+ path: /backup
+ type: Directory
images:
vtctld: vitess/lite:latest
vtgate: vitess/lite:latest
@@ -10,7 +17,7 @@ spec:
vtorc: vitess/lite:latest
vtbackup: vitess/lite:latest
mysqld:
- mysql80Compatible: mysql:8.0.40
+ mysql80Compatible: vitess/lite:latest
mysqldExporter: prom/mysqld-exporter:v0.14.0
cells:
- name: zone1
diff --git a/test/endtoend/operator/302_new_shards.yaml b/test/endtoend/operator/302_new_shards.yaml
index 8731afea..2808349b 100644
--- a/test/endtoend/operator/302_new_shards.yaml
+++ b/test/endtoend/operator/302_new_shards.yaml
@@ -3,6 +3,13 @@ kind: VitessCluster
metadata:
name: example
spec:
+ backup:
+ engine: xtrabackup
+ locations:
+ - volume:
+ hostPath:
+ path: /backup
+ type: Directory
images:
vtctld: vitess/lite:latest
vtgate: vitess/lite:latest
@@ -10,7 +17,7 @@ spec:
vtorc: vitess/lite:latest
vtbackup: vitess/lite:latest
mysqld:
- mysql80Compatible: mysql:8.0.40
+ mysql80Compatible: vitess/lite:latest
mysqldExporter: prom/mysqld-exporter:v0.14.0
cells:
- name: zone1
diff --git a/test/endtoend/operator/306_down_shard_0.yaml b/test/endtoend/operator/306_down_shard_0.yaml
index 9684f5f2..2cfcf5bd 100644
--- a/test/endtoend/operator/306_down_shard_0.yaml
+++ b/test/endtoend/operator/306_down_shard_0.yaml
@@ -3,6 +3,13 @@ kind: VitessCluster
metadata:
name: example
spec:
+ backup:
+ engine: xtrabackup
+ locations:
+ - volume:
+ hostPath:
+ path: /backup
+ type: Directory
images:
vtctld: vitess/lite:latest
vtgate: vitess/lite:latest
@@ -10,7 +17,7 @@ spec:
vtorc: vitess/lite:latest
vtbackup: vitess/lite:latest
mysqld:
- mysql80Compatible: mysql:8.0.40
+ mysql80Compatible: vitess/lite:latest
mysqldExporter: prom/mysqld-exporter:v0.14.0
cells:
- name: zone1
diff --git a/test/endtoend/operator/401_scheduled_backups.yaml b/test/endtoend/operator/401_scheduled_backups.yaml
new file mode 100644
index 00000000..221ab22b
--- /dev/null
+++ b/test/endtoend/operator/401_scheduled_backups.yaml
@@ -0,0 +1,166 @@
+apiVersion: planetscale.com/v2
+kind: VitessCluster
+metadata:
+ name: example
+spec:
+ backup:
+ engine: xtrabackup
+ locations:
+ - volume:
+ hostPath:
+ path: /backup
+ type: Directory
+ schedules:
+ - name: "commerce"
+ schedule: "*/2 * * * *"
+ resources:
+ requests:
+ cpu: 100m
+ memory: 1024Mi
+ limits:
+ memory: 1024Mi
+ successfulJobsHistoryLimit: 2
+ failedJobsHistoryLimit: 3
+ jobTimeoutMinute: 5
+ strategies:
+ - name: commerce_x
+ keyspace: "commerce"
+ shard: "-"
+ - name: "customer"
+ schedule: "*/2 * * * *"
+ resources:
+ requests:
+ cpu: 100m
+ memory: 1024Mi
+ limits:
+ memory: 1024Mi
+ successfulJobsHistoryLimit: 2
+ failedJobsHistoryLimit: 3
+ jobTimeoutMinute: 5
+ strategies:
+ - name: customer_80-x
+ keyspace: "customer"
+ shard: "80-"
+ - name: customer_x-80
+ keyspace: "customer"
+ shard: "-80"
+ images:
+ vtctld: vitess/lite:latest
+ vtgate: vitess/lite:latest
+ vttablet: vitess/lite:latest
+ vtorc: vitess/lite:latest
+ vtbackup: vitess/lite:latest
+ mysqld:
+ mysql80Compatible: vitess/lite:latest
+ mysqldExporter: prom/mysqld-exporter:v0.14.0
+ cells:
+ - name: zone1
+ gateway:
+ authentication:
+ static:
+ secret:
+ name: example-cluster-config
+ key: users.json
+ replicas: 1
+ resources:
+ requests:
+ cpu: 100m
+ memory: 256Mi
+ limits:
+ memory: 256Mi
+ vitessDashboard:
+ cells:
+ - zone1
+ extraFlags:
+ security_policy: read-only
+ replicas: 1
+ resources:
+ limits:
+ memory: 128Mi
+ requests:
+ cpu: 100m
+ memory: 128Mi
+
+ keyspaces:
+ - name: commerce
+ durabilityPolicy: semi_sync
+ vitessOrchestrator:
+ resources:
+ limits:
+ memory: 128Mi
+ requests:
+ cpu: 100m
+ memory: 128Mi
+ extraFlags:
+ instance-poll-time: 3s
+ turndownPolicy: Immediate
+ partitionings:
+ - equal:
+ parts: 1
+ shardTemplate:
+ databaseInitScriptSecret:
+ name: example-cluster-config
+ key: init_db.sql
+ tabletPools:
+ - cell: zone1
+ type: replica
+ replicas: 3
+ vttablet:
+ extraFlags:
+ db_charset: utf8mb4
+ resources:
+ requests:
+ cpu: 100m
+ memory: 256Mi
+ mysqld:
+ resources:
+ requests:
+ cpu: 100m
+ memory: 512Mi
+ dataVolumeClaimTemplate:
+ accessModes: ["ReadWriteOnce"]
+ resources:
+ requests:
+ storage: 10Gi
+ - name: customer
+ durabilityPolicy: semi_sync
+ vitessOrchestrator:
+ resources:
+ limits:
+ memory: 128Mi
+ requests:
+ cpu: 100m
+ memory: 128Mi
+ extraFlags:
+ instance-poll-time: 3s
+ turndownPolicy: Immediate
+ partitionings:
+ - equal:
+ parts: 2
+ shardTemplate:
+ databaseInitScriptSecret:
+ name: example-cluster-config
+ key: init_db.sql
+ tabletPools:
+ - cell: zone1
+ type: replica
+ replicas: 3
+ vttablet:
+ extraFlags:
+ db_charset: utf8mb4
+ resources:
+ requests:
+ cpu: 100m
+ memory: 256Mi
+ mysqld:
+ resources:
+ requests:
+ cpu: 100m
+ memory: 512Mi
+ dataVolumeClaimTemplate:
+ accessModes: ["ReadWriteOnce"]
+ resources:
+ requests:
+ storage: 10Gi
+ updateStrategy:
+ type: Immediate
diff --git a/test/endtoend/operator/cluster_upgrade.yaml b/test/endtoend/operator/cluster_upgrade.yaml
index 18cd5484..cb0216e6 100644
--- a/test/endtoend/operator/cluster_upgrade.yaml
+++ b/test/endtoend/operator/cluster_upgrade.yaml
@@ -7,6 +7,13 @@ kind: VitessCluster
metadata:
name: example
spec:
+ backup:
+ engine: xtrabackup
+ locations:
+ - volume:
+ hostPath:
+ path: /backup
+ type: Directory
images:
vtctld: vitess/lite:latest
vtgate: vitess/lite:latest
@@ -14,7 +21,7 @@ spec:
vtorc: vitess/lite:latest
vtbackup: vitess/lite:latest
mysqld:
- mysql80Compatible: mysql:8.0.40
+ mysql80Compatible: vitess/lite:latest
mysqldExporter: prom/mysqld-exporter:v0.14.0
cells:
- name: zone1
diff --git a/test/endtoend/operator/operator-latest.yaml b/test/endtoend/operator/operator-latest.yaml
index 46ccb358..be23f61a 100644
--- a/test/endtoend/operator/operator-latest.yaml
+++ b/test/endtoend/operator/operator-latest.yaml
@@ -416,6 +416,7 @@ spec:
cluster:
type: string
concurrencyPolicy:
+ default: Forbid
enum:
- Allow
- Forbid
@@ -491,8 +492,6 @@ spec:
example: commerce
type: string
name:
- enum:
- - BackupShard
type: string
shard:
example: '-'
@@ -542,6 +541,11 @@ spec:
lastScheduledTime:
format: date-time
type: string
+ lastScheduledTimes:
+ additionalProperties:
+ format: date-time
+ type: string
+ type: object
type: object
type: object
served: true
@@ -2028,6 +2032,7 @@ spec:
type: string
type: object
concurrencyPolicy:
+ default: Forbid
enum:
- Allow
- Forbid
@@ -2099,8 +2104,6 @@ spec:
example: commerce
type: string
name:
- enum:
- - BackupShard
type: string
shard:
example: '-'
diff --git a/test/endtoend/upgrade_test.sh b/test/endtoend/upgrade_test.sh
index f1c47413..4aca2c50 100755
--- a/test/endtoend/upgrade_test.sh
+++ b/test/endtoend/upgrade_test.sh
@@ -17,6 +17,7 @@ function move_tables() {
sleep 10
+ echo "Execute MoveTables"
vtctldclient LegacyVtctlCommand -- MoveTables --source commerce --tables 'customer,corder' Create customer.commerce2customer
if [ $? -ne 0 ]; then
echo "MoveTables failed"
@@ -26,6 +27,7 @@ function move_tables() {
sleep 10
+ echo "Execute VDiff"
vdiff_out=$(vtctldclient LegacyVtctlCommand -- VDiff customer.commerce2customer)
echo "$vdiff_out" | grep "ProcessedRows: 5" | wc -l | grep "2" > /dev/null
if [ $? -ne 0 ]; then
@@ -33,6 +35,7 @@ function move_tables() {
# Allow failure
fi
+ echo "SwitchTraffic for rdonly"
vtctldclient LegacyVtctlCommand -- MoveTables --tablet_types='rdonly,replica' SwitchTraffic customer.commerce2customer
if [ $? -ne 0 ]; then
echo "SwitchTraffic for rdonly and replica failed"
@@ -40,6 +43,7 @@ function move_tables() {
exit 1
fi
+ echo "SwitchTraffic for primary"
vtctldclient LegacyVtctlCommand -- MoveTables --tablet_types='primary' SwitchTraffic customer.commerce2customer
if [ $? -ne 0 ]; then
echo "SwitchTraffic for primary failed"
@@ -47,6 +51,7 @@ function move_tables() {
exit 1
fi
+ echo "Complete MoveTables"
vtctldclient LegacyVtctlCommand -- MoveTables Complete customer.commerce2customer
if [ $? -ne 0 ]; then
echo "MoveTables Complete failed"
@@ -177,6 +182,39 @@ EOF
waitForKeyspaceToBeServing customer 80- 2
}
+function scheduledBackups() {
+ echo "Apply 401_scheduled_backups.yaml"
+ kubectl apply -f 401_scheduled_backups.yaml > /dev/null
+
+ checkVitessBackupScheduleStatusWithTimeout "example-vbsc-commerce(.*)"
+ checkVitessBackupScheduleStatusWithTimeout "example-vbsc-customer(.*)"
+
+ docker exec -it $(docker container ls --format '{{.Names}}' | grep kind) chmod o+rwx -R /backup > /dev/null
+ initialCommerceBackups=$(kubectl get vtb --no-headers | grep "commerce-x-x" | wc -l)
+ initialCustomerFirstShardBackups=$(kubectl get vtb --no-headers | grep "customer-x-80" | wc -l)
+ initialCustomerSecondShardBackups=$(kubectl get vtb --no-headers | grep "customer-80-x" | wc -l)
+
+ for i in {1..60} ; do
+ commerceBackups=$(kubectl get vtb --no-headers | grep "commerce-x-x" | wc -l)
+ customerFirstShardBackups=$(kubectl get vtb --no-headers | grep "customer-x-80" | wc -l)
+ customerSecondShardBackups=$(kubectl get vtb --no-headers | grep "customer-80-x" | wc -l)
+
+ if [[ "${customerFirstShardBackups}" -ge $(( initialCustomerFirstShardBackups + 2 )) && "${customerSecondShardBackups}" -ge $(( initialCustomerSecondShardBackups + 2 )) && "${commerceBackups}" -ge $(( initialCommerceBackups + 2 )) ]]; then
+ echo "Found all backups"
+ return
+ else
+ echo "Got: ${customerFirstShardBackups} customer-x-80 backups but want: $(( initialCustomerFirstShardBackups + 2 ))"
+ echo "Got: ${customerSecondShardBackups} customer-80-x backups but want: $(( initialCustomerSecondShardBackups + 2 ))"
+ echo "Got: ${commerceBackups} commerce-x-x backups but want: $(( initialCommerceBackups + 2 ))"
+ echo ""
+ fi
+ sleep 10
+ done
+
+ echo "Did not find the backups on time"
+ exit 1
+}
+
function waitAndVerifySetup() {
sleep 10
checkPodStatusWithTimeout "example-zone1-vtctld(.*)1/1(.*)Running(.*)"
@@ -233,12 +271,11 @@ EOF
}
# Test setup
+mkdir -p -m 777 ./vtdataroot/backup
echo "Building the docker image"
docker build -f build/Dockerfile.release -t vitess-operator-pr:latest .
-echo "Creating Kind cluster"
-kind create cluster --wait 30s --name kind-${BUILDKITE_BUILD_ID} --image ${KIND_VERSION}
-echo "Loading docker image into Kind cluster"
-kind load docker-image vitess-operator-pr:latest --name kind-${BUILDKITE_BUILD_ID}
+setupKindConfig
+createKindCluster
cd "$PWD/test/endtoend/operator"
killall kubectl
@@ -257,6 +294,11 @@ verifyDurabilityPolicy "commerce" "semi_sync"
move_tables
resharding
+scheduledBackups
+
# Teardown
+echo "Removing the temporary directory"
+removeBackupFiles
+rm -rf "$STARTING_DIR/vtdataroot"
echo "Deleting Kind cluster. This also deletes the volume associated with it"
kind delete cluster --name kind-${BUILDKITE_BUILD_ID}
diff --git a/test/endtoend/utils.sh b/test/endtoend/utils.sh
index 4de6add1..fbdd55c0 100644
--- a/test/endtoend/utils.sh
+++ b/test/endtoend/utils.sh
@@ -155,6 +155,28 @@ function checkPodStatusWithTimeout() {
exit 1
}
+function checkPodExistWithTimeout() {
+ regex=$1
+
+ # We use this for loop instead of `kubectl wait` because we don't have access to the full pod name
+ # and `kubectl wait` does not support regex to match resource name.
+ for i in {1..1200} ; do
+ out=$(kubectl get pods)
+ echo "$out" | grep -E "$regex" > /dev/null 2>&1
+ if [[ $? -eq 0 ]]; then
+ echo "$regex found"
+ return
+ fi
+ sleep 1
+ done
+ echo -e "ERROR: checkPodStatusWithTimeout timeout to find pod matching:\ngot:\n$out\nfor regex: $regex"
+ echo "$regex" | grep "vttablet" > /dev/null 2>&1
+ if [[ $? -eq 0 ]]; then
+ printMysqlErrorFiles
+ fi
+ exit 1
+}
+
# ensurePodResourcesSet:
# $1: regex used to match pod names
function ensurePodResourcesSet() {
@@ -225,7 +247,7 @@ function waitForKeyspaceToBeServing() {
fi
echo "Shard $ks/$shard is not fully serving. Output: $out"
echo "Retrying (attempt #$i) ..."
- sleep 10
+ sleep 1
done
}
@@ -375,3 +397,17 @@ COrder
+----------+-------------+----------+-------+
EOF
}
+
+function checkVitessBackupScheduleStatusWithTimeout() {
+ regex=$1
+
+ for i in {1..1200} ; do
+ if [[ $(kubectl get VitessBackupSchedule | grep -E "${regex}" | wc -l) -eq 1 ]]; then
+ echo "$regex found"
+ return
+ fi
+ sleep 1
+ done
+ echo -e "ERROR: checkPodStatusWithTimeout timeout to find pod matching:\ngot:\n$out\nfor regex: $regex"
+ exit 1
+}