diff --git a/.werf/consts.yaml b/.werf/consts.yaml index 5d41585c..c3d0dd52 100644 --- a/.werf/consts.yaml +++ b/.werf/consts.yaml @@ -1,7 +1,7 @@ # base images {{- $_ := set $ "BASE_ALT" "registry.deckhouse.io/base_images/alt:p10@sha256:f105773c682498700680d7cd61a702a4315c4235aee3622757591fd510fb8b4a" }} {{- $_ := set $ "BASE_ALT_P11" "registry.deckhouse.io/base_images/alt:p11@sha256:e47d84424485d3674240cb2f67d3a1801b37d327e6d1eb8cc8d01be8ed3b34f3" }} -{{- $_ := set $ "BASE_GOLANG_1_23" "registry.deckhouse.io/base_images/golang:1.23.5-alpine3.20@sha256:623ef3f63012bbd648021a2f097de3f411889332ba83bd98f0ac8d1288bdaa06" }} +{{- $_ := set $ "BASE_GOLANG_1_23" "registry.deckhouse.io/base_images/golang:1.23.6-alpine3.20@sha256:3058c63e0e2532881949c4186414baa24a0f9a8f9349b1853daa49be816f42e9" }} {{- $_ := set $ "BASE_SCRATCH" "registry.deckhouse.io/base_images/scratch@sha256:653ae76965c98c8cd1c8c9ff7725316d2983986f896655b30e0f44d2f8b2dd7e" }} {{- $_ := set $ "BASE_ALPINE" "registry.deckhouse.io/base_images/alpine:3.20.3@sha256:41628df7c9b935d248f64542634e7a843f9bc7f2252d7f878e77f7b79a947466" }} diff --git a/api/v1alpha1/const.go b/api/v1alpha1/const.go index 5b302610..7f6e6e03 100644 --- a/api/v1alpha1/const.go +++ b/api/v1alpha1/const.go @@ -19,6 +19,7 @@ package v1alpha1 const ( PhaseCreated = "Created" PhasePending = "Pending" + PhaseCleaning = "Cleaning" PhaseResizing = "Resizing" PhaseFailed = "Failed" PhaseNotReady = "NotReady" diff --git a/api/v1alpha1/lvm_logical_volume.go b/api/v1alpha1/lvm_logical_volume.go index 1dc6af93..37696505 100644 --- a/api/v1alpha1/lvm_logical_volume.go +++ b/api/v1alpha1/lvm_logical_volume.go @@ -51,7 +51,8 @@ type LVMLogicalVolumeThinSpec struct { } type LVMLogicalVolumeThickSpec struct { - Contiguous *bool `json:"contiguous"` + Contiguous *bool `json:"contiguous,omitempty"` + VolumeCleanup *string `json:"volumeCleanup,omitempty"` } type LVMLogicalVolumeStatus struct { Phase string `json:"phase"` diff --git a/crds/doc-ru-lvmlogicalvolume.yaml b/crds/doc-ru-lvmlogicalvolume.yaml index cb0dacfc..a39e80a9 100644 --- a/crds/doc-ru-lvmlogicalvolume.yaml +++ b/crds/doc-ru-lvmlogicalvolume.yaml @@ -38,6 +38,14 @@ spec: contiguous: description: | Если true, логический том будет создан с флагом contiguous. Примечание: Этот флаг следует использовать с осторожностью, так как он может привести к невозможности создания LV, не смотря на наличие свободного пространства. + volumeCleanup: + description: | + Метод очистки тома после удаления PV. + Если параметр не задан, после удаления PV данные могут удалиться, либо остаться. Гарантий удаления или неудаления нет. + Допустимые значения: + - `RandomFillSinglePass` - том будет перезаписан случайными данными один раз перед удалением. Использовать эту опцию не рекомендуется для твердотельных накопителей, так как она уменьшает ресурс накопителя. + - `RandomFillThreePass` - том будет перезаписан случайными данными три раза перед удалением. Использовать эту опцию не рекомендуется для твердотельных накопителей, так как она уменьшает ресурс накопителя. + - `Discard` - все блоки тома будут отмечены как свободные с использованием системного вызова `discard` перед удалением. Эта опция имеет смысл только для твердотельных накопителей. status: description: | Описывает состояние ресурса. 
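For illustration, a minimal `LVMLogicalVolume` manifest using the new field might look like the sketch below. The `thick.volumeCleanup` values and the `Cleaning` phase come from this patch; the resource names and the fields not visible in the diff (`size`, `lvmVolumeGroupName`, the `apiVersion` group) are assumptions based on the existing CRD.

```yaml
apiVersion: storage.deckhouse.io/v1alpha1  # group assumed, not shown in this diff
kind: LVMLogicalVolume
metadata:
  name: example-llv                        # hypothetical name
spec:
  actualLVNameOnTheNode: example-lv        # hypothetical LV name
  type: Thick
  size: 10Gi                               # field assumed from the existing CRD
  lvmVolumeGroupName: example-lvg          # field assumed from the existing CRD
  thick:
    # One of RandomFillSinglePass | RandomFillThreePass | Discard (see the CRD enum below);
    # omit the field to skip any cleanup before deletion.
    volumeCleanup: Discard
```

With `volumeCleanup` set, the agent is expected to move the resource through the new `Cleaning` phase while wiping the LV, and only then run `lvremove`.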
diff --git a/crds/lvmlogicalvolume.yaml b/crds/lvmlogicalvolume.yaml index 0e5247d1..f59e5efc 100644 --- a/crds/lvmlogicalvolume.yaml +++ b/crds/lvmlogicalvolume.yaml @@ -35,11 +35,14 @@ spec: - rule: | (self.type == "Thin" && has(self.thin) && !has(self.thick)) || self.type != "Thin" message: "Field 'thin' is required and field 'thick' is forbidden when 'type' is 'Thin'." - - rule: | - (!has(oldSelf.thick) || has(self.thick)) - message: "Field 'thick' cannot be removed." - - rule: | - (!has(self.thick) || !has(self.thick.contiguous) || (has(self.thick.contiguous) && self.thick.contiguous == oldSelf.thick.contiguous)) + - rule: > + ( + (!has(self.thick) || !has(self.thick.contiguous)) && + (!has(oldSelf.thick) || !has(oldSelf.thick.contiguous)) + ) || ( + has(self.thick) && has(self.thick.contiguous) && + has(oldSelf.thick) && has(oldSelf.thick.contiguous) + ) message: "Field 'contiguous' is immutable and cannot be added if not specified at creation." required: - actualLVNameOnTheNode @@ -100,13 +103,9 @@ spec: thick: type: object x-kubernetes-validations: - - rule: self == oldSelf - message: Value is immutable. - rule: | (!has(oldSelf.contiguous) || has(self.contiguous)) message: "Field 'contiguous' cannot be removed." - required: - - contiguous properties: contiguous: type: boolean @@ -115,6 +114,16 @@ spec: message: Value is immutable. description: | If true, the Logical Volume will be created with the contiguous flag. Use it carefully as LV might not be created even if there is enough space in VG. + volumeCleanup: + type: string + enum: [RandomFillThreePass, RandomFillSinglePass, Discard] + description: | + The method of the volume cleanup before deletion. + If the parameter is not set, after deleting the PV, the data may be deleted or it may remain. There is no guarantee of deletion or non-deletion. + Allowed values: + - `RandomFillSinglePass`: The volume will be overwritten with random data once before deletion. This option is not recommended for solid-state drives, as it reduces the lifespan of the drive. + - `RandomFillThreePass`: The volume will be overwritten with random data three times before deletion. This option is also not recommended for solid-state drives, as it reduces the lifespan of the drive. + - `Discard`: All blocks of the volume will be marked as free using the `discard` system call before deletion. This option is only applicable to solid-state drives. source: type: object description: | @@ -149,7 +158,7 @@ spec: properties: phase: type: string - enum: [Created, Pending, Resizing, Failed] + enum: [Created, Pending, Cleaning, Resizing, Failed] description: | The current resource's phase. reason: diff --git a/docs/USAGE.md b/docs/USAGE.md index 1d4ea7c0..65059adc 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -170,3 +170,38 @@ To extract the `BlockDevice` resource from the `LVMVolumeGroup` resource, you ne > **Caution!** If the deleting `LVM Volume Group` resource contains any `Logical Volume` (even if it is only the `Thin-pool` that is specified in `spec`), a user must delete all those `Logical Volumes` manually. Otherwise, the `LVMVolumeGroup` resource and its `Volume Group` will not be deleted. > A user can forbid to delete the `LVMVolumeGroup` resource by annotate it with `storage.deckhouse.io/deletion-protection`. If the controller finds the annotation, it will not delete nether the resource or the `Volume Group` till the annotation removal.
+ +## Protection against data leakage between volumes + +When deleting files, the operating system does not physically delete the contents, but only marks the corresponding blocks as “free”. If a new volume receives physical blocks previously used by another volume, the previous user's data may remain in them. + +This is possible, for example, in the following case: + + - user 1 placed files in a volume requested from StorageClass 1 on node 1 (whether in “Block” or “Filesystem” mode); + - user 1 deleted the files and the volume; + - the physical blocks the volume occupied become “free” but are not wiped; + - user 2 requested a new volume from StorageClass 1 on node 1 in “Block” mode; + - there is a risk that some or all of the blocks previously occupied by user 1 will be reallocated to user 2; + - in which case user 2 would be able to recover user 1's data. + +### Thick volumes + +The `volumeCleanup` parameter is provided to prevent leaks through thick volumes. +It allows you to select the volume cleanup method before deleting the PV. +Allowed values: + +* parameter not specified — do not perform any additional actions when deleting a volume. The data may be available to the next user; + +* `RandomFillSinglePass` - the volume will be overwritten with random data once before deletion. Use of this option is not recommended for solid-state drives as it reduces the lifespan of the drive. + +* `RandomFillThreePass` - the volume will be overwritten with random data three times before deletion. Use of this option is not recommended for solid-state drives as it reduces the lifespan of the drive. + +* `Discard` - all blocks of the volume will be marked as free using the `discard` system call before deletion. This option is only applicable to solid-state drives. + +Most modern solid-state drives guarantee that a block marked with `discard` will not return its previous data when read. This makes the `Discard` option the most effective way to prevent leakage when using solid-state drives. +However, clearing a cell is a relatively long operation, so the device performs it in the background. Moreover, many drives cannot clear individual cells, only groups of cells (pages). Because of this, not all drives guarantee that freed data becomes unavailable immediately, and not every drive that does make this guarantee actually honors it. +If the device does not guarantee Deterministic TRIM (DRAT) and Deterministic Read Zero after TRIM (RZAT), and has not been verified, using the `Discard` option for it is not recommended. + +### Thin volumes + +When a thin-pool block is released via `discard` by the guest operating system, this command is forwarded to the device. If a hard disk drive is used, or if the solid-state drive does not support `discard`, the data may remain in the thin-pool until the block is reused. However, users are only given access to thin volumes, not to the thin-pool itself. They can only receive a volume from the pool, and thin-pool blocks are zeroed before being reused by a thin volume, which prevents leakage between clients. This is guaranteed by setting `thin_pool_zero=1` in LVM. diff --git a/docs/USAGE.ru.md b/docs/USAGE.ru.md index 45d96f58..f7ae6a42 100644 --- a/docs/USAGE.ru.md +++ b/docs/USAGE.ru.md @@ -170,3 +170,38 @@ kubectl delete lvg %lvg-name% > **Внимание!** Если удаляемый ресурс `LVMVolumeGroup` содержит `Logical Volume` (даже если это только `Thin-pool`, который указан в `spec`) пользователю необходимо самостоятельно удалить все `Logical Volume`, которые содержит удаляемая `Volume Group`.
В противном случае ни ресурс, ни `Volume Group` удалены не будут. > Пользователь может запретить удаление `LVMVolumeGroup` ресурса, повесив на ресурс специальную аннотацию `storage.deckhouse.io/deletion-protection`. При наличии данной аннотации контроллер не будет удалять ни ресурс, ни соответствующую `Volume Group` до тех пор, пока аннотация не будет снята с ресурса. + +## Защита от утечек данных между томами + +При удалении файлов операционная система не удаляет содержимое физически, а лишь помечает соответствующие блоки как «свободные». Если новый том получает физические блоки, ранее использовавшиеся другим томом, в них могут остаться данные предыдущего пользователя. + +Это возможно, например, в таком случае: + + - пользователь №1 разместил файлы в томе, запрошенном из StorageClass 1 и на узле 1 (неважно, в режиме «Block» или «Filesystem»); + - пользователь №1 удалил файлы и том; + - физические блоки, которые он занимал, становятся «свободными», но не затертыми; + - пользователь №2 запросил новый том из StorageClass 1 и на узле 1 в режиме «Block»; + - есть риск, что часть или все блоки, ранее занимаемые пользователем №1, будут снова выделены пользователю №2; + - в этом случае пользователь №2 имеет возможность восстановить данные пользователя №1. + +### Thick-тома + +Для предотвращения утечек через thick-тома предусмотрен параметр `volumeCleanup`. +Он позволяет выбрать метод очистки тома перед удалением PV. +Возможные значения: + +* параметр не задан — не выполнять никаких дополнительных действий при удалении тома. Данные могут оказаться доступными следующему пользователю; + +* `RandomFillSinglePass` — том будет перезаписан случайными данными один раз перед удалением. Использовать эту опцию не рекомендуется для твердотельных накопителей, так как она уменьшает ресурс накопителя. + +* `RandomFillThreePass` — том будет перезаписан случайными данными три раза перед удалением. Использовать эту опцию не рекомендуется для твердотельных накопителей, так как она уменьшает ресурс накопителя. + +* `Discard` — все блоки тома будут отмечены как свободные с использованием системного вызова `discard` перед удалением. Эта опция имеет смысл только для твердотельных накопителей. + +Большинство современных твердотельных накопителей гарантирует, что помеченный `discard` блок при чтении не вернет предыдущие данные. Это делает опцию `Discard` самым эффективным способом предотвращения утечек при использовании твердотельных накопителей. +Однако очистка ячейки является относительно долгой операцией, поэтому она выполняется устройством в фоне. К тому же многие диски не могут очищать отдельные ячейки, а только группы ячеек (страницы). Из-за этого не все накопители гарантируют немедленную недоступность освобожденных данных. К тому же не все накопители, гарантирующие это, держат обещание. +Если устройство не гарантирует Deterministic TRIM (DRAT), Deterministic Read Zero after TRIM (RZAT) и не является проверенным, то использовать его не рекомендуется. + +### Thin-тома + +В момент освобождения блока thin-тома через `discard` гостевой операционной системой эта команда пересылается на устройство. В случае использования жесткого диска или отсутствия поддержки `discard` со стороны твердотельного накопителя, данные могут остаться на thin-pool до нового использования такого блока. Однако пользователям предоставляется доступ только к thin-томам, а не к самому thin-пулу. Они могут получить только том из пула, а для thin-томов производится зануление блока thin-pool при новом использовании, что предотвращает утечки между клиентами.
Это гарантируется настройкой `thin_pool_zero=1` в LVM. diff --git a/images/agent/src/cmd/main.go b/images/agent/src/cmd/main.go index 96a90e26..ad78dc4d 100644 --- a/images/agent/src/cmd/main.go +++ b/images/agent/src/cmd/main.go @@ -75,6 +75,7 @@ func main() { log.Info(fmt.Sprintf("[main] OS/Arch:Go OS/Arch:%s/%s ", goruntime.GOOS, goruntime.GOARCH)) log.Info(fmt.Sprintf("[main] Feature SnapshotsEnabled: %t", feature.SnapshotsEnabled())) + log.Info(fmt.Sprintf("[main] Feature VolumeCleanupEnabled: %t", feature.VolumeCleanupEnabled())) log.Info("[main] CfgParams has been successfully created") log.Info(fmt.Sprintf("[main] %s = %s", config.LogLevel, cfgParams.Loglevel)) diff --git a/images/agent/src/internal/controller/llv/reconciler.go b/images/agent/src/internal/controller/llv/reconciler.go index 5e8b6cfb..478e2ef7 100644 --- a/images/agent/src/internal/controller/llv/reconciler.go +++ b/images/agent/src/internal/controller/llv/reconciler.go @@ -22,6 +22,7 @@ import ( "fmt" "reflect" "strings" + "sync" "time" "github.com/deckhouse/sds-node-configurator/api/v1alpha1" @@ -42,16 +43,32 @@ import ( const ReconcilerName = "lvm-logical-volume-watcher-controller" +type cleanupsKey struct { + vgName, lvName string +} + +type cleanupStatus struct { + cleanupRunning bool + prevFailedMethod *string +} +type cleanups struct { + m sync.Mutex + status map[cleanupsKey]cleanupStatus +} type Reconciler struct { - cl client.Client - log logger.Logger - lvgCl *utils.LVGClient - llvCl *utils.LLVClient - metrics monitoring.Metrics - sdsCache *cache.Cache - cfg ReconcilerConfig + cl client.Client + log logger.Logger + lvgCl *utils.LVGClient + llvCl *utils.LLVClient + metrics monitoring.Metrics + sdsCache *cache.Cache + cfg ReconcilerConfig + runningCleanups cleanups } +var errAlreadyRunning = errors.New("reconcile in progress") +var errCleanupSameAsPreviouslyFailed = errors.New("cleanup method was failed and not changed") + type ReconcilerConfig struct { NodeName string Loglevel logger.Verbosity @@ -82,9 +99,43 @@ func NewReconciler( metrics: metrics, sdsCache: sdsCache, cfg: cfg, + runningCleanups: cleanups{ + status: make(map[cleanupsKey]cleanupStatus, 50), + }, } } +func (r *Reconciler) startCleanupRunning(vgName, lvName string) (inserted bool, prevFailedMethod *string) { + r.runningCleanups.m.Lock() + defer r.runningCleanups.m.Unlock() + key := cleanupsKey{vgName: vgName, lvName: lvName} + value, exists := r.runningCleanups.status[key] + if exists && value.cleanupRunning { + return false, nil + } + value.cleanupRunning = true + r.runningCleanups.status[key] = value + return true, value.prevFailedMethod +} + +func (r *Reconciler) stopCleanupRunning(vgName, lvName string, failedMethod *string) error { + r.runningCleanups.m.Lock() + defer r.runningCleanups.m.Unlock() + key := cleanupsKey{vgName: vgName, lvName: lvName} + value, exists := r.runningCleanups.status[key] + if !exists || !value.cleanupRunning { + return errors.New("cleanup is not running") + } + if failedMethod == nil { + delete(r.runningCleanups.status, key) + } else { + value.prevFailedMethod = failedMethod + value.cleanupRunning = false + r.runningCleanups.status[key] = value + } + return nil +} + // Name implements controller.Reconciler. 
func (r *Reconciler) Name() string { return ReconcilerName @@ -129,7 +180,7 @@ func (r *Reconciler) Reconcile( lvg, err := r.lvgCl.GetLVMVolumeGroup(ctx, llv.Spec.LVMVolumeGroupName) if err != nil { if k8serr.IsNotFound(err) { - r.log.Error(err, fmt.Sprintf("[ReconcileLVMLogicalVolume] LVMVolumeGroup %s not found for LVMLogicalVolume %s. Retry in %s", llv.Spec.LVMVolumeGroupName, llv.Name, r.cfg.VolumeGroupScanInterval.String())) + r.log.Error(err, fmt.Sprintf("[Reconcile] LVMVolumeGroup %s not found for LVMLogicalVolume %s. Retry in %s", llv.Spec.LVMVolumeGroupName, llv.Name, r.cfg.VolumeGroupScanInterval.String())) err = r.llvCl.UpdatePhaseIfNeeded( ctx, llv, @@ -137,7 +188,7 @@ func (r *Reconciler) Reconcile( fmt.Sprintf("LVMVolumeGroup %s not found", llv.Spec.LVMVolumeGroupName), ) if err != nil { - r.log.Error(err, fmt.Sprintf("[ReconcileLVMLogicalVolume] unable to update the LVMLogicalVolume %s", llv.Name)) + r.log.Error(err, fmt.Sprintf("[Reconcile] unable to update the LVMLogicalVolume %s", llv.Name)) return controller.Result{}, err } @@ -153,16 +204,16 @@ func (r *Reconciler) Reconcile( fmt.Sprintf("Unable to get selected LVMVolumeGroup, err: %s", err.Error()), ) if err != nil { - r.log.Error(err, fmt.Sprintf("[ReconcileLVMLogicalVolume] unable to update the LVMLogicalVolume %s", llv.Name)) + r.log.Error(err, fmt.Sprintf("[Reconcile] unable to update the LVMLogicalVolume %s", llv.Name)) } return controller.Result{}, err } if !utils.LVGBelongsToNode(lvg, r.cfg.NodeName) { - r.log.Info(fmt.Sprintf("[ReconcileLVMLogicalVolume] the LVMVolumeGroup %s of the LVMLogicalVolume %s does not belongs to the current node: %s. Reconciliation stopped", lvg.Name, llv.Name, r.cfg.NodeName)) + r.log.Info(fmt.Sprintf("[Reconcile] the LVMVolumeGroup %s of the LVMLogicalVolume %s does not belongs to the current node: %s. Reconciliation stopped", lvg.Name, llv.Name, r.cfg.NodeName)) return controller.Result{}, nil } - r.log.Info(fmt.Sprintf("[ReconcileLVMLogicalVolume] the LVMVolumeGroup %s of the LVMLogicalVolume %s belongs to the current node: %s. Reconciliation continues", lvg.Name, llv.Name, r.cfg.NodeName)) + r.log.Info(fmt.Sprintf("[Reconcile] the LVMVolumeGroup %s of the LVMLogicalVolume %s belongs to the current node: %s. 
Reconciliation continues", lvg.Name, llv.Name, r.cfg.NodeName)) // this case prevents the unexpected behavior when the controller runs up with existing LVMLogicalVolumes if vgs, _ := r.sdsCache.GetVGs(); len(vgs) == 0 { @@ -170,47 +221,49 @@ func (r *Reconciler) Reconcile( return controller.Result{RequeueAfter: r.cfg.VolumeGroupScanInterval}, nil } - r.log.Debug(fmt.Sprintf("[ReconcileLVMLogicalVolume] tries to add the finalizer %s to the LVMLogicalVolume %s", internal.SdsNodeConfiguratorFinalizer, llv.Name)) + r.log.Debug(fmt.Sprintf("[Reconcile] tries to add the finalizer %s to the LVMLogicalVolume %s", internal.SdsNodeConfiguratorFinalizer, llv.Name)) added, err := r.addLLVFinalizerIfNotExist(ctx, llv) if err != nil { - r.log.Error(err, fmt.Sprintf("[ReconcileLVMLogicalVolume] unable to update the LVMLogicalVolume %s", llv.Name)) + r.log.Error(err, fmt.Sprintf("[Reconcile] unable to update the LVMLogicalVolume %s", llv.Name)) return controller.Result{}, err } if added { - r.log.Debug(fmt.Sprintf("[ReconcileLVMLogicalVolume] successfully added the finalizer %s to the LVMLogicalVolume %s", internal.SdsNodeConfiguratorFinalizer, llv.Name)) + r.log.Debug(fmt.Sprintf("[Reconcile] successfully added the finalizer %s to the LVMLogicalVolume %s", internal.SdsNodeConfiguratorFinalizer, llv.Name)) } else { - r.log.Debug(fmt.Sprintf("[ReconcileLVMLogicalVolume] no need to add the finalizer %s to the LVMLogicalVolume %s", internal.SdsNodeConfiguratorFinalizer, llv.Name)) + r.log.Debug(fmt.Sprintf("[Reconcile] no need to add the finalizer %s to the LVMLogicalVolume %s", internal.SdsNodeConfiguratorFinalizer, llv.Name)) } - r.log.Info(fmt.Sprintf("[ReconcileLVMLogicalVolume] starts to validate the LVMLogicalVolume %s", llv.Name)) + r.log.Info(fmt.Sprintf("[Reconcile] starts to validate the LVMLogicalVolume %s", llv.Name)) valid, reason := r.validateLVMLogicalVolume(llv, lvg) if !valid { - r.log.Warning(fmt.Sprintf("[ReconcileLVMLogicalVolume] the LVMLogicalVolume %s is not valid, reason: %s", llv.Name, reason)) + r.log.Warning(fmt.Sprintf("[Reconcile] the LVMLogicalVolume %s is not valid, reason: %s", llv.Name, reason)) err = r.llvCl.UpdatePhaseIfNeeded(ctx, llv, v1alpha1.PhaseFailed, reason) if err != nil { - r.log.Error(err, fmt.Sprintf("[ReconcileLVMLogicalVolume] unable to update the LVMLogicalVolume %s", llv.Name)) + r.log.Error(err, fmt.Sprintf("[Reconcile] unable to update the LVMLogicalVolume %s", llv.Name)) return controller.Result{}, err } return controller.Result{}, nil } - r.log.Info(fmt.Sprintf("[ReconcileLVMLogicalVolume] successfully validated the LVMLogicalVolume %s", llv.Name)) + r.log.Info(fmt.Sprintf("[Reconcile] successfully validated the LVMLogicalVolume %s", llv.Name)) shouldRequeue, err := r.ReconcileLVMLogicalVolume(ctx, llv, lvg) if err != nil { - r.log.Error(err, fmt.Sprintf("[RunLVMLogicalVolumeWatcherController] an error occurred while reconciling the LVMLogicalVolume: %s", llv.Name)) - updErr := r.llvCl.UpdatePhaseIfNeeded(ctx, llv, v1alpha1.PhaseFailed, err.Error()) - if updErr != nil { - r.log.Error(updErr, fmt.Sprintf("[RunLVMLogicalVolumeWatcherController] unable to update the LVMLogicalVolume %s", llv.Name)) - return controller.Result{}, updErr + r.log.Error(err, fmt.Sprintf("[Reconcile] an error occurred while reconciling the LVMLogicalVolume: %s", llv.Name)) + if !errors.Is(err, errAlreadyRunning) && !errors.Is(err, errCleanupSameAsPreviouslyFailed) { + updErr := r.llvCl.UpdatePhaseIfNeeded(ctx, llv, v1alpha1.PhaseFailed, err.Error()) + if updErr != nil { + 
r.log.Error(updErr, fmt.Sprintf("[Reconcile] unable to update the LVMLogicalVolume %s", llv.Name)) + return controller.Result{}, updErr + } } } if shouldRequeue { - r.log.Info(fmt.Sprintf("[RunLVMLogicalVolumeWatcherController] some issues were occurred while reconciliation the LVMLogicalVolume %s. Requeue the request in %s", llv.Name, r.cfg.LLVRequeueInterval.String())) + r.log.Info(fmt.Sprintf("[Reconcile] some issues were occurred while reconciliation the LVMLogicalVolume %s. Requeue the request in %s", llv.Name, r.cfg.LLVRequeueInterval.String())) return controller.Result{RequeueAfter: r.cfg.LLVRequeueInterval}, nil } - r.log.Info(fmt.Sprintf("[RunLVMLogicalVolumeWatcherController] successfully ended reconciliation of the LVMLogicalVolume %s", llv.Name)) + r.log.Info(fmt.Sprintf("[Reconcile] successfully ended reconciliation of the LVMLogicalVolume %s", llv.Name)) return controller.Result{}, nil } @@ -228,7 +281,7 @@ func (r *Reconciler) ReconcileLVMLogicalVolume(ctx context.Context, llv *v1alpha case internal.DeleteReconcile: return r.reconcileLLVDeleteFunc(ctx, llv, lvg) default: - r.log.Info(fmt.Sprintf("[runEventReconcile] the LVMLogicalVolume %s has compeleted configuration and should not be reconciled", llv.Name)) + r.log.Info(fmt.Sprintf("[runEventReconcile] the LVMLogicalVolume %s has completed configuration and should not be reconciled", llv.Name)) if llv.Status.Phase != v1alpha1.PhaseCreated { r.log.Warning(fmt.Sprintf("[runEventReconcile] the LVMLogicalVolume %s should not be reconciled but has an unexpected phase: %s. Setting the phase to %s", llv.Name, llv.Status.Phase, v1alpha1.PhaseCreated)) err := r.llvCl.UpdatePhaseIfNeeded(ctx, llv, v1alpha1.PhaseCreated, "") @@ -452,10 +505,10 @@ func (r *Reconciler) reconcileLLVDeleteFunc( } } - err := r.deleteLVIfNeeded(lvg.Spec.ActualVGNameOnTheNode, llv) + shouldRequeue, err := r.deleteLVIfNeeded(ctx, lvg.Spec.ActualVGNameOnTheNode, llv) if err != nil { r.log.Error(err, fmt.Sprintf("[reconcileLLVDeleteFunc] unable to delete the LV %s in VG %s", llv.Spec.ActualLVNameOnTheNode, lvg.Spec.ActualVGNameOnTheNode)) - return true, err + return shouldRequeue, err } r.log.Info(fmt.Sprintf("[reconcileLLVDeleteFunc] successfully deleted the LV %s in VG %s", llv.Spec.ActualLVNameOnTheNode, lvg.Spec.ActualVGNameOnTheNode)) @@ -536,30 +589,72 @@ func checkIfLVBelongsToLLV(llv *v1alpha1.LVMLogicalVolume, lv *internal.LVData) return true } -func (r *Reconciler) deleteLVIfNeeded(vgName string, llv *v1alpha1.LVMLogicalVolume) error { +func (r *Reconciler) deleteLVIfNeeded(ctx context.Context, vgName string, llv *v1alpha1.LVMLogicalVolume) (shouldRequeue bool, err error) { lv := r.sdsCache.FindLV(vgName, llv.Spec.ActualLVNameOnTheNode) if lv == nil || !lv.Exist { r.log.Warning(fmt.Sprintf("[deleteLVIfNeeded] did not find LV %s in VG %s", llv.Spec.ActualLVNameOnTheNode, vgName)) - return nil + return false, nil } // this case prevents unexpected same-name LV deletions which does not actually belong to our LLV if !checkIfLVBelongsToLLV(llv, &lv.Data) { - r.log.Warning(fmt.Sprintf("[deleteLVIfNeeded] no need to delete LV %s as it doesnt belong to LVMLogicalVolume %s", lv.Data.LVName, llv.Name)) - return nil + r.log.Warning(fmt.Sprintf("[deleteLVIfNeeded] no need to delete LV %s as it doesn't belong to LVMLogicalVolume %s", lv.Data.LVName, llv.Name)) + return false, nil + } + + if llv.Spec.Type == internal.Thick && llv.Spec.Thick != nil && llv.Spec.Thick.VolumeCleanup != nil { + method := *llv.Spec.Thick.VolumeCleanup + lvName := 
llv.Spec.ActualLVNameOnTheNode + started, prevFailedMethod := r.startCleanupRunning(vgName, lvName) + if !started { + r.log.Debug(fmt.Sprintf("[deleteLVIfNeeded] cleanup already running for LV %s in VG %s", lvName, vgName)) + return false, errAlreadyRunning + } + r.log.Trace(fmt.Sprintf("[deleteLVIfNeeded] starting cleaning up for LV %s in VG %s with method %s", lvName, vgName, method)) + defer func() { + r.log.Trace(fmt.Sprintf("[deleteLVIfNeeded] stopping cleaning up for LV %s in VG %s with method %s", lvName, vgName, method)) + err := r.stopCleanupRunning(vgName, lvName, prevFailedMethod) + if err != nil { + r.log.Error(err, fmt.Sprintf("[deleteLVIfNeeded] can't unregister running cleanup for LV %s in VG %s", lvName, vgName)) + } + }() + + // prevent doing cleanup with previously failed method + if prevFailedMethod != nil && *prevFailedMethod == method { + r.log.Debug(fmt.Sprintf("[deleteLVIfNeeded] was already failed with method %s for LV %s in VG %s", *prevFailedMethod, lvName, vgName)) + return false, errCleanupSameAsPreviouslyFailed + } + err := r.llvCl.UpdatePhaseIfNeeded( + ctx, + llv, + v1alpha1.PhaseCleaning, + fmt.Sprintf("Cleaning up volume %s in %s group using %s", lvName, vgName, method), + ) + if err != nil { + r.log.Error(err, "[deleteLVIfNeeded] changing phase to Cleaning") + return true, fmt.Errorf("changing phase to Cleaning :%w", err) + } + prevFailedMethod = &method + r.log.Debug(fmt.Sprintf("[deleteLVIfNeeded] running cleanup for LV %s in VG %s with method %s", lvName, vgName, method)) + err = utils.VolumeCleanup(ctx, r.log, vgName, lvName, method) + if err != nil { + r.log.Error(err, fmt.Sprintf("[deleteLVIfNeeded] unable to clean up LV %s in VG %s with method %s", lvName, vgName, method)) + return true, err + } + prevFailedMethod = nil } cmd, err := utils.RemoveLV(vgName, llv.Spec.ActualLVNameOnTheNode) r.log.Debug(fmt.Sprintf("[deleteLVIfNeeded] runs cmd: %s", cmd)) if err != nil { r.log.Error(err, fmt.Sprintf("[deleteLVIfNeeded] unable to remove LV %s from VG %s", llv.Spec.ActualLVNameOnTheNode, vgName)) - return err + return true, err } r.log.Debug(fmt.Sprintf("[deleteLVIfNeeded] mark LV %s in the cache as removed", lv.Data.LVName)) r.sdsCache.MarkLVAsRemoved(lv.Data.VGName, lv.Data.LVName) - return nil + return false, nil } func (r *Reconciler) getLVActualSize(vgName, lvName string) resource.Quantity { @@ -683,7 +778,7 @@ func (r *Reconciler) shouldReconcileByUpdateFunc(vgName string, llv *v1alpha1.LV } func isContiguous(llv *v1alpha1.LVMLogicalVolume) bool { - if llv.Spec.Thick == nil { + if llv.Spec.Thick == nil || llv.Spec.Thick.Contiguous == nil { return false } diff --git a/images/agent/src/internal/utils/client_llv.go b/images/agent/src/internal/utils/client_llv.go index 2551d973..497df356 100644 --- a/images/agent/src/internal/utils/client_llv.go +++ b/images/agent/src/internal/utils/client_llv.go @@ -62,7 +62,7 @@ func (llvCl *LLVClient) UpdatePhaseToCreatedIfNeeded( actualSize resource.Quantity, ) error { var contiguous *bool - if llv.Spec.Thick != nil { + if llv.Spec.Thick != nil && llv.Spec.Thick.Contiguous != nil { if *llv.Spec.Thick.Contiguous { contiguous = llv.Spec.Thick.Contiguous } diff --git a/images/agent/src/internal/utils/volume_cleanup_ce.go b/images/agent/src/internal/utils/volume_cleanup_ce.go new file mode 100644 index 00000000..374ecad0 --- /dev/null +++ b/images/agent/src/internal/utils/volume_cleanup_ce.go @@ -0,0 +1,27 @@ +//go:build ce + +/* +Copyright 2025 Flant JSC +Licensed under the Apache License, Version 2.0 (the "License"); 
+you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package utils + +import ( + "context" + "fmt" + + "agent/internal/logger" +) + +func VolumeCleanup(_ context.Context, _ logger.Logger, _, _, _ string) error { + return fmt.Errorf("volume cleanup is not supported in your edition") +} diff --git a/images/agent/src/internal/utils/volume_cleanup_ee.go b/images/agent/src/internal/utils/volume_cleanup_ee.go new file mode 100644 index 00000000..8d2613c5 --- /dev/null +++ b/images/agent/src/internal/utils/volume_cleanup_ee.go @@ -0,0 +1,237 @@ +//go:build !ce + +/* +Copyright 2025 Flant JSC +Licensed under the Deckhouse Platform Enterprise Edition (EE) license. See https://github.com/deckhouse/deckhouse/blob/main/ee/LICENSE +*/ + +package utils + +import ( + "context" + "errors" + "fmt" + "io" + "os" + "syscall" + "time" + "unsafe" + + "github.com/deckhouse/sds-node-configurator/lib/go/common/pkg/feature" + + "agent/internal/logger" +) + +func VolumeCleanup(ctx context.Context, log logger.Logger, vgName, lvName, volumeCleanup string) error { + log.Trace(fmt.Sprintf("[VolumeCleanup] cleaning up volume %s in volume group %s using %s", lvName, vgName, volumeCleanup)) + if !feature.VolumeCleanupEnabled() { + return fmt.Errorf("volume cleanup is not supported in your edition") + } + + devicePath := fmt.Sprintf("/dev/%s/%s", vgName, lvName) + randomSource := "/dev/urandom" + + var err error + closingErrors := []error{} + + switch volumeCleanup { + case "RandomFillSinglePass": + err = volumeCleanupOverwrite(ctx, log, &closingErrors, devicePath, randomSource, 1) + case "RandomFillThreePass": + err = volumeCleanupOverwrite(ctx, log, &closingErrors, devicePath, randomSource, 3) + case "Discard": + err = volumeCleanupDiscard(ctx, log, &closingErrors, devicePath) + default: + return fmt.Errorf("unknown cleanup method %s", volumeCleanup) + } + + if err != nil && len(closingErrors) > 0 { + closingErrors = append([]error{err}, closingErrors...) + } + + if len(closingErrors) > 0 { + err = errors.Join(closingErrors...) 
+ } + + if err == nil { + return nil + } + + log.Error(err, fmt.Sprintf("[VolumeCleanup] fail to cleanup volume %s", devicePath)) + return fmt.Errorf("cleaning volume %s: %w", devicePath, err) +} + +func volumeSize(log logger.Logger, device *os.File) (int64, error) { + log.Trace(fmt.Sprintf("[volumeSize] finding size of device %v", device)) + var stat syscall.Stat_t + log.Debug("[volumeSize] Calling fstat") + if err := syscall.Fstat(int(device.Fd()), &stat); err != nil { + log.Error(err, "[volumeSize] Calling fstat") + return 0, fmt.Errorf("fstat call failed: %w", err) + } + + if stat.Size > 0 { + log.Debug(fmt.Sprintf("[volumeSize] Size %d is valid.", stat.Size)) + return stat.Size, nil + } + + if stat.Mode&S_IFMT != S_IFBLK { + log.Debug(fmt.Sprintf("[volumeSize] Device mode %x", stat.Mode)) + return 0, fmt.Errorf("not a block device, mode: %x", stat.Mode) + } + + var blockDeviceSize uint64 + _, _, errno := syscall.Syscall( + syscall.SYS_IOCTL, + device.Fd(), + uintptr(BLKGETSIZE64), + uintptr(unsafe.Pointer(&blockDeviceSize))) + if errno != 0 { + err := errors.New(errno.Error()) + log.Error(err, "[volumeSize] calling ioctl BLKGETSIZE64") + return 0, fmt.Errorf("error calling ioctl BLKGETSIZE64: %w", err) + } + log.Debug(fmt.Sprintf("Block device size is %d", blockDeviceSize)) + if blockDeviceSize <= 0 { + return 0, fmt.Errorf("block size is invalid") + } + + return int64(blockDeviceSize), nil +} + +func volumeCleanupOverwrite(_ context.Context, log logger.Logger, closingErrors *[]error, devicePath, inputPath string, passes int) error { + log.Trace(fmt.Sprintf("[volumeCleanupOverwrite] overwriting %s by %s in %d passes", devicePath, inputPath, passes)) + closeFile := func(file *os.File) { + log.Trace(fmt.Sprintf("[volumeCleanupOverwrite] closing %s", file.Name())) + err := file.Close() + if err != nil { + log.Error(err, fmt.Sprintf("[volumeCleanupOverwrite] While closing file %s", file.Name())) + *closingErrors = append(*closingErrors, fmt.Errorf("closing file %s: %w", file.Name(), err)) + } + } + + input, err := os.OpenFile(inputPath, syscall.O_RDONLY, os.ModeDevice) + if err != nil { + log.Error(err, fmt.Sprintf("[volumeCleanupOverwrite] Opening file %s", inputPath)) + return fmt.Errorf("opening source device %s to wipe: %w", inputPath, err) + } + defer closeFile(input) + + output, err := os.OpenFile(devicePath, syscall.O_DIRECT|syscall.O_RDWR, os.ModeDevice) + if err != nil { + log.Error(err, fmt.Sprintf("[volumeCleanupOverwrite] Opening file %s", devicePath)) + return fmt.Errorf("opening device %s to wipe: %w", devicePath, err) + } + defer closeFile(output) + + bytesToWrite, err := volumeSize(log, output) + if err != nil { + log.Error(err, "[volumeCleanupOverwrite] Finding volume size") + return fmt.Errorf("can't find the size of device %s: %w", devicePath, err) + } + + bufferSize := 1024 * 1024 * 4 + buffer := make([]byte, bufferSize) + for pass := 0; pass < passes; pass++ { + log.Debug(fmt.Sprintf("[volumeCleanupOverwrite] Overwriting %d bytes. 
Pass %d", bytesToWrite, pass)) + start := time.Now() + written, err := io.CopyBuffer( + io.NewOffsetWriter(output, 0), + io.LimitReader(input, bytesToWrite), + buffer) + log.Info(fmt.Sprintf("[volumeCleanupOverwrite] Overwriting is done in %s", time.Since(start).String())) + if err != nil { + log.Error(err, fmt.Sprintf("[volumeCleanupOverwrite] copying from %s to %s", inputPath, devicePath)) + return fmt.Errorf("copying from %s to %s: %w", inputPath, devicePath, err) + } + + if written != bytesToWrite { + log.Error(err, fmt.Sprintf("[volumeCleanupOverwrite] only %d bytes written, expected %d", written, bytesToWrite)) + return fmt.Errorf("only %d bytes written, expected %d", written, bytesToWrite) + } + } + + return err +} + +/* To find these constants run: +gcc -o test -x c - <<EOF +#include <stdio.h> +#include <sys/stat.h> +#include <linux/fs.h> + +#define PRINT_CONSTANT(name, fmt) printf(#name " = " fmt "\n", name) + +int main() { + PRINT_CONSTANT(S_IFMT, "0x%x"); + PRINT_CONSTANT(S_IFBLK, "0x%x"); + PRINT_CONSTANT(BLKGETSIZE64, "0x%lx"); + PRINT_CONSTANT(BLKDISCARD, "0x%x"); + return 0; +} +EOF +*/ + +// TODO: It will be nice to figure them out during compilation or maybe runtime? +// +//nolint:revive +const ( + BLKDISCARD = 0x1277 + + BLKGETSIZE64 = 0x80081272 + + S_IFMT = 0xf000 /* type of file mask */ + S_IFBLK = 0x6000 /* block special */ +) + +type Range struct { + start, count uint64 +} + +func volumeCleanupDiscard(_ context.Context, log logger.Logger, closingErrors *[]error, devicePath string) error { + log.Trace(fmt.Sprintf("[volumeCleanupDiscard] discarding %s", devicePath)) + device, err := os.OpenFile(devicePath, syscall.O_RDWR, os.ModeDevice) + if err != nil { + log.Error(err, fmt.Sprintf("[volumeCleanupDiscard] Opening device %s", devicePath)) + return fmt.Errorf("opening device %s to wipe: %w", devicePath, err) + } + defer func() { + log.Trace(fmt.Sprintf("Closing file %s", devicePath)) + err := device.Close() + if err != nil { + log.Error(err, fmt.Sprintf("[volumeCleanupDiscard] While closing device %s", devicePath)) + *closingErrors = append(*closingErrors, fmt.Errorf("closing file %s: %w", device.Name(), err)) + } + }() + + deviceSize, err := volumeSize(log, device) + if err != nil { + log.Error(err, fmt.Sprintf("[volumeCleanupDiscard] can't find the size of device %s", devicePath)) + return fmt.Errorf("can't find the size of device %s: %w", devicePath, err) + } + + rng := Range{ + start: 0, + count: uint64(deviceSize), + } + + log.Debug(fmt.Sprintf("[volumeCleanupDiscard] calling BLKDISCARD fd: %d, range %v", device.Fd(), rng)) + start := time.Now() + + _, _, errno := syscall.Syscall( + syscall.SYS_IOCTL, + device.Fd(), + uintptr(BLKDISCARD), + uintptr(unsafe.Pointer(&rng))) + + log.Info(fmt.Sprintf("[volumeCleanupDiscard] BLKDISCARD is done in %s", time.Since(start).String())) + + if errno != 0 { + err := errors.New(errno.Error()) + log.Error(err, "[volumeCleanupDiscard] error calling BLKDISCARD") + return fmt.Errorf("calling ioctl BLKDISCARD: %s", err) + } + + return nil +} diff --git a/lib/go/common/pkg/feature/const_ce.go b/lib/go/common/pkg/feature/const_ce.go index f1240f87..aec6ce27 100644 --- a/lib/go/common/pkg/feature/const_ce.go +++ b/lib/go/common/pkg/feature/const_ce.go @@ -16,3 +16,4 @@ limitations under the License.
package feature const snapshotsEnabled = false +const volumeCleanupEnabled = false diff --git a/lib/go/common/pkg/feature/const_csepro.go b/lib/go/common/pkg/feature/const_csepro.go index 3a0b2587..bdd208fb 100644 --- a/lib/go/common/pkg/feature/const_csepro.go +++ b/lib/go/common/pkg/feature/const_csepro.go @@ -9,3 +9,4 @@ See https://github.com/deckhouse/deckhouse/blob/main/ee/LICENSE package feature const snapshotsEnabled = true +const volumeCleanupEnabled = true diff --git a/lib/go/common/pkg/feature/const_ee.go b/lib/go/common/pkg/feature/const_ee.go index cf1d2165..0fd07f6b 100644 --- a/lib/go/common/pkg/feature/const_ee.go +++ b/lib/go/common/pkg/feature/const_ee.go @@ -9,3 +9,4 @@ See https://github.com/deckhouse/deckhouse/blob/main/ee/LICENSE package feature const snapshotsEnabled = true +const volumeCleanupEnabled = true diff --git a/lib/go/common/pkg/feature/const_se.go b/lib/go/common/pkg/feature/const_se.go index 8a5bbca0..fdb30460 100644 --- a/lib/go/common/pkg/feature/const_se.go +++ b/lib/go/common/pkg/feature/const_se.go @@ -9,3 +9,4 @@ See https://github.com/deckhouse/deckhouse/blob/main/ee/LICENSE package feature const snapshotsEnabled = true +const volumeCleanupEnabled = false diff --git a/lib/go/common/pkg/feature/const_seplus.go b/lib/go/common/pkg/feature/const_seplus.go index d2445b57..61f6352c 100644 --- a/lib/go/common/pkg/feature/const_seplus.go +++ b/lib/go/common/pkg/feature/const_seplus.go @@ -9,3 +9,4 @@ See https://github.com/deckhouse/deckhouse/blob/main/ee/LICENSE package feature const snapshotsEnabled = true +const volumeCleanupEnabled = false diff --git a/lib/go/common/pkg/feature/feature.go b/lib/go/common/pkg/feature/feature.go index 18ed9940..c82186b8 100644 --- a/lib/go/common/pkg/feature/feature.go +++ b/lib/go/common/pkg/feature/feature.go @@ -3,3 +3,7 @@ package feature func SnapshotsEnabled() bool { return snapshotsEnabled } + +func VolumeCleanupEnabled() bool { + return volumeCleanupEnabled +}
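As a rough sketch of the runtime behavior: in editions where `volumeCleanupEnabled` is true (EE and CSE Pro per the constants above), an `LVMLogicalVolume` observed during deletion with `volumeCleanup` set would report the new phase roughly like this (LV and VG names are hypothetical; the reason format follows `deleteLVIfNeeded` in this patch):

```yaml
status:
  phase: Cleaning
  # Message format taken from deleteLVIfNeeded in this patch; names are hypothetical.
  reason: Cleaning up volume example-lv in example-vg group using Discard
```

In CE, SE, and SE+ builds `VolumeCleanupEnabled()` returns false and `VolumeCleanup` fails with "volume cleanup is not supported in your edition", so the `Cleaning` phase is never reached there.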