From 07aca7bab20a61b38cb91d932cc036c48b6eb401 Mon Sep 17 00:00:00 2001
From: georgel-ms <41901384+georgel-ms@users.noreply.github.com>
Date: Mon, 18 Nov 2024 21:21:23 +0800
Subject: [PATCH] wait for volume attachments to be detached before deleting a
 vm (#281)

* wait for volume attachments to be detached before deleting a vm

* decrease volume check interval to 2 sec.
---
 cloud/scope/virtualmachine.go                 |  4 +
 .../azurestackhcivirtualmachine_reconciler.go | 66 ++++++++++++++++
 go.mod                                        |  1 +
 pkg/util/util.go                              | 79 ++++++++++++++++++-
 4 files changed, 149 insertions(+), 1 deletion(-)

diff --git a/cloud/scope/virtualmachine.go b/cloud/scope/virtualmachine.go
index e539b405..3133a2a2 100644
--- a/cloud/scope/virtualmachine.go
+++ b/cloud/scope/virtualmachine.go
@@ -139,6 +139,10 @@ func (m *VirtualMachineScope) ClusterName() string {
 	return m.AzureStackHCIVirtualMachine.Spec.ClusterName
 }
 
+func (m *VirtualMachineScope) Client() client.Client {
+	return m.client
+}
+
 // Location returns the AzureStackHCIVirtualMachine location.
 func (m *VirtualMachineScope) Location() string {
 	return m.AzureStackHCIVirtualMachine.Spec.Location
diff --git a/controllers/azurestackhcivirtualmachine_reconciler.go b/controllers/azurestackhcivirtualmachine_reconciler.go
index 2b338e57..a0bc915f 100644
--- a/controllers/azurestackhcivirtualmachine_reconciler.go
+++ b/controllers/azurestackhcivirtualmachine_reconciler.go
@@ -19,6 +19,7 @@ package controllers
 
 import (
 	"encoding/base64"
+	"time"
 
 	infrav1 "github.com/microsoft/cluster-api-provider-azurestackhci/api/v1beta1"
 	azurestackhci "github.com/microsoft/cluster-api-provider-azurestackhci/cloud"
@@ -26,8 +27,16 @@
 	"github.com/microsoft/cluster-api-provider-azurestackhci/cloud/services/disks"
 	"github.com/microsoft/cluster-api-provider-azurestackhci/cloud/services/networkinterfaces"
 	"github.com/microsoft/cluster-api-provider-azurestackhci/cloud/services/virtualmachines"
+	infrav1util "github.com/microsoft/cluster-api-provider-azurestackhci/pkg/util"
 	sdk_compute "github.com/microsoft/moc-sdk-for-go/services/compute"
 	"github.com/pkg/errors"
+	"k8s.io/apimachinery/pkg/util/wait"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+const (
+	waitVolumeAttachmentsInterval = time.Second * 2
+	waitVolumeAttachmentsTimeout  = time.Minute * 5
 )
 
 // azureStackHCIVirtualMachineService are list of services required by cluster actuator, easy to create a fake
@@ -90,6 +99,29 @@ func (s *azureStackHCIVirtualMachineService) Delete() error {
 		Name: s.vmScope.Name(),
 	}
 
+	now := time.Now()
+
+	if err := wait.PollImmediate(waitVolumeAttachmentsInterval, waitVolumeAttachmentsTimeout, func() (bool, error) {
+		volumes, err := s.ListVolumeAttachments()
+		if err != nil {
+			s.vmScope.Error(err, "failed to check volume attachment on vm", "vmName", s.vmScope.Name())
+			return true, nil
+		}
+		if len(volumes) == 0 {
+			s.vmScope.Info("No volume attachments found on vm", "vmName", s.vmScope.Name())
+			return true, nil
+		}
+		for _, volume := range volumes {
+			s.vmScope.Info("VolumeAttachment is still attached on vm, waiting for the volume to be detached before deleting the vm", "volume", volume, "vmName", s.vmScope.Name())
+		}
+		return false, nil
+	}); err != nil {
+		s.vmScope.Error(err, "failed to wait for volume attachments to be detached on vm", "vmName", s.vmScope.Name())
+	}
+
+	latency := time.Since(now)
+	s.vmScope.Info("Waited for volume attachments to be detached on vm", "vmName", s.vmScope.Name(), "duration", latency.String())
+
 	err := s.virtualMachinesSvc.Delete(s.vmScope.Context, vmSpec)
 	if err != nil {
 		return errors.Wrapf(err, "failed to delete machine")
@@ -117,6 +149,40 @@ func (s *azureStackHCIVirtualMachineService) Delete() error {
 	return nil
 }
 
+func (s *azureStackHCIVirtualMachineService) ListVolumeAttachments() ([]string, error) {
+	// target cluster key
+	clusterKey := client.ObjectKey{
+		Namespace: s.vmScope.AzureStackHCIVirtualMachine.Namespace,
+		Name:      s.vmScope.AzureStackHCIVirtualMachine.Spec.ClusterName,
+	}
+
+	targetClusterClient, err := infrav1util.NewTargetClusterClient(s.vmScope.Context, s.vmScope.Client(), clusterKey)
+	if err != nil {
+		s.vmScope.Error(err, "failed to create target cluster client", "namespace", clusterKey.Namespace, "name", clusterKey.Name)
+		return nil, errors.Wrapf(err, "failed to create target cluster client for cluster %s:%s", clusterKey.Namespace, clusterKey.Name)
+	}
+
+	// get the kubernetes node name of the AzureStackHCIVirtualMachine being reconciled
+	nodeName, err := infrav1util.GetNodeName(s.vmScope.Context, s.vmScope.Client(), s.vmScope.AzureStackHCIVirtualMachine.ObjectMeta)
+	if err != nil {
+		s.vmScope.Error(err, "failed to get valid node name for vm", "vmName", s.vmScope.Name())
+		return nil, errors.Wrapf(err, "failed to get node name for vm %s", s.vmScope.Name())
+	}
+
+	if nodeName == "" {
+		s.vmScope.Info("Node name is empty, skipping volume attachment check", "vmName", s.vmScope.Name())
+		return nil, nil
+	}
+
+	// get volume attachments from the target cluster
+	volumes, err := infrav1util.ListVolumeAttachmentOnNode(s.vmScope.Context, targetClusterClient, clusterKey, nodeName)
+	if err != nil {
+		s.vmScope.Error(err, "failed to check volume attachment on vm", "vmName", s.vmScope.Name())
+		return nil, errors.Wrapf(err, "failed to check volume attachment on vm %s", s.vmScope.Name())
+	}
+	return volumes, nil
+}
+
 func (s *azureStackHCIVirtualMachineService) VMIfExists() (*infrav1.VM, error) {
 	vmSpec := &virtualmachines.Spec{
diff --git a/go.mod b/go.mod
index c64e604d..957bb0bd 100644
--- a/go.mod
+++ b/go.mod
@@ -98,6 +98,7 @@
 	gopkg.in/yaml.v2 v2.4.0 // indirect
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 	k8s.io/apiextensions-apiserver v0.29.3 // indirect
+	k8s.io/cluster-bootstrap v0.29.3 // indirect
 	k8s.io/component-base v0.29.3 // indirect
 	k8s.io/klog v1.0.0 // indirect
 	k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect
diff --git a/pkg/util/util.go b/pkg/util/util.go
index 5206a949..7e508586 100644
--- a/pkg/util/util.go
+++ b/pkg/util/util.go
@@ -4,17 +4,30 @@ import (
 	"context"
 	"crypto/rand"
 	"math/big"
+	"strings"
+	"time"
 
 	infrav1 "github.com/microsoft/cluster-api-provider-azurestackhci/api/v1beta1"
+	"github.com/pkg/errors"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/uuid"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+	"k8s.io/client-go/tools/clientcmd"
+	"k8s.io/utils/pointer"
+
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
+	capiutil "sigs.k8s.io/cluster-api/util"
+	"sigs.k8s.io/cluster-api/util/kubeconfig"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/controller"
 )
 
 const (
-	charSet = "abcdefghijklmnopqrstuvwxyz0123456789"
+	charSet       = "abcdefghijklmnopqrstuvwxyz0123456789"
+	diskCsiDriver = "disk.csi.akshci.com"
 )
 
 // GetAzureStackHCIMachinesInCluster gets a cluster's AzureStackHCIMachines resources.
@@ -37,6 +50,70 @@ func GetAzureStackHCIMachinesInCluster(ctx context.Context, controllerClient cli
 	return machines, nil
 }
 
+// NewTargetClusterConfig creates a target cluster config based on the kubeconfig secret in the management cluster.
+func NewTargetClusterConfig(ctx context.Context, controllerClient client.Reader, clusterKey client.ObjectKey) (*rest.Config, error) {
+	kubeconfig, err := kubeconfig.FromSecret(ctx, controllerClient, clusterKey)
+	if err != nil {
+		return nil, errors.Wrapf(err, "failed to retrieve kubeconfig secret for cluster %s:%s", clusterKey.Namespace, clusterKey.Name)
+	}
+
+	restConfig, err := clientcmd.RESTConfigFromKubeConfig(kubeconfig)
+	if err != nil {
+		return nil, errors.Wrapf(err, "failed to create client configuration for cluster %s:%s", clusterKey.Namespace, clusterKey.Name)
+	}
+
+	return restConfig, nil
+}
+
+func NewTargetClusterClient(ctx context.Context, controllerClient client.Client, clusterKey client.ObjectKey) (*kubernetes.Clientset, error) {
+	restConfig, err := NewTargetClusterConfig(ctx, controllerClient, clusterKey)
+	if err != nil {
+		return nil, errors.Wrapf(err, "failed to create client configuration for cluster %s:%s", clusterKey.Namespace, clusterKey.Name)
+	}
+
+	// set a timeout; otherwise this defaults to 0 (i.e. no timeout)
+	restConfig.Timeout = 10 * time.Second
+
+	targetClusterClient, err := kubernetes.NewForConfig(restConfig)
+	if err != nil {
+		return nil, errors.Wrapf(err, "failed to connect to the cluster %s:%s", clusterKey.Namespace, clusterKey.Name)
+	}
+
+	return targetClusterClient, nil
+}
+
+// GetNodeName returns the node name from the resource's owning CAPI machine object.
+func GetNodeName(ctx context.Context, client client.Client, obj metav1.ObjectMeta) (string, error) {
+	machine, err := capiutil.GetOwnerMachine(ctx, client, obj)
+	if err != nil {
+		return "", errors.Wrapf(err, "failed to get owner machine for %s.%s", obj.Namespace, obj.Name)
+	}
+	if machine == nil {
+		return "", errors.Errorf("resource %s.%s has no owning machine", obj.Namespace, obj.Name)
+	}
+	if machine.Status.NodeRef == nil {
+		return "", errors.Errorf("machine %s.%s has no node ref", machine.Namespace, machine.Name)
+	}
+	return machine.Status.NodeRef.Name, nil
+}
+
+func ListVolumeAttachmentOnNode(ctx context.Context, client *kubernetes.Clientset, clusterKey client.ObjectKey, nodeName string) ([]string, error) {
+	volumeAttachmentList, err := client.StorageV1().VolumeAttachments().List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return nil, errors.Wrapf(err, "failed to list VolumeAttachments for Cluster %s:%s", clusterKey.Namespace, clusterKey.Name)
+	}
+
+	res := []string{}
+	if volumeAttachmentList != nil && len(volumeAttachmentList.Items) > 0 {
+		for _, va := range volumeAttachmentList.Items {
+			if va.Spec.Attacher == diskCsiDriver && strings.EqualFold(va.Spec.NodeName, nodeName) {
+				res = append(res, pointer.StringDeref(va.Spec.Source.PersistentVolumeName, ""))
+			}
+		}
+	}
+	return res, nil
+}
+
 // RandomAlphaNumericString returns a random alphanumeric string.
 func RandomAlphaNumericString(n int) (string, error) {
 	result := make([]byte, n)
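
Note on the wait loop added to Delete(): wait.PollImmediate works here but is deprecated in the k8s.io/apimachinery release this module already pins (v0.29.x) in favor of context-aware helpers. The sketch below shows the same loop against wait.PollUntilContextTimeout; the helper name waitForDetach and its standalone shape are illustrative only, not part of the patch.

package controllers

import (
	"context"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

// waitForDetach mirrors the loop in Delete(): poll until listVolumes reports
// no attached volumes, or the timeout expires. listVolumes stands in for
// s.ListVolumeAttachments().
func waitForDetach(ctx context.Context, interval, timeout time.Duration, listVolumes func() ([]string, error)) error {
	// The immediate argument (true) runs the first check right away,
	// matching wait.PollImmediate semantics.
	return wait.PollUntilContextTimeout(ctx, interval, timeout, true, func(ctx context.Context) (bool, error) {
		volumes, listErr := listVolumes()
		if listErr != nil {
			// As in the patch: a listing error ends the wait early instead of
			// blocking VM deletion.
			return true, nil
		}
		return len(volumes) == 0, nil
	})
}

Either way, a timeout is only logged and deletion proceeds, so a stuck detach cannot block VM cleanup forever.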
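For orientation, the new pkg/util helpers are meant to compose: resolve the workload cluster's kubeconfig secret into a clientset, then query the VolumeAttachments still bound to one of its nodes. A minimal usage sketch; the names listNodeVolumes and mgmtClient are hypothetical, and error handling is reduced to the minimum.

package example

import (
	"context"
	"fmt"

	infrav1util "github.com/microsoft/cluster-api-provider-azurestackhci/pkg/util"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// listNodeVolumes builds a clientset for the workload cluster from its
// kubeconfig secret, then lists the AKS-HCI disk CSI volume attachments
// still bound to the given node. mgmtClient is a controller-runtime client
// for the management cluster.
func listNodeVolumes(ctx context.Context, mgmtClient client.Client, namespace, clusterName, nodeName string) error {
	clusterKey := client.ObjectKey{Namespace: namespace, Name: clusterName}

	targetClient, err := infrav1util.NewTargetClusterClient(ctx, mgmtClient, clusterKey)
	if err != nil {
		return err
	}

	volumes, err := infrav1util.ListVolumeAttachmentOnNode(ctx, targetClient, clusterKey, nodeName)
	if err != nil {
		return err
	}

	fmt.Printf("volumes still attached to %s: %v\n", nodeName, volumes)
	return nil
}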
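One testability note: ListVolumeAttachmentOnNode takes the concrete *kubernetes.Clientset, which the fake clientset from k8s.io/client-go/kubernetes/fake cannot satisfy. The sketch below assumes the parameter were widened to kubernetes.Interface (an assumption, not what the patch ships) and reproduces the filter locally to show how the behavior could then be unit-tested.

package util_test

import (
	"context"
	"strings"
	"testing"

	storagev1 "k8s.io/api/storage/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/fake"
	"k8s.io/utils/pointer"
)

// listVolumeAttachmentOnNode is a local copy of the patch's filter logic,
// written against kubernetes.Interface so the fake clientset satisfies it.
func listVolumeAttachmentOnNode(ctx context.Context, c kubernetes.Interface, nodeName string) ([]string, error) {
	list, err := c.StorageV1().VolumeAttachments().List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, err
	}
	res := []string{}
	for _, va := range list.Items {
		if va.Spec.Attacher == "disk.csi.akshci.com" && strings.EqualFold(va.Spec.NodeName, nodeName) {
			res = append(res, pointer.StringDeref(va.Spec.Source.PersistentVolumeName, ""))
		}
	}
	return res, nil
}

func TestListVolumeAttachmentOnNode(t *testing.T) {
	va := &storagev1.VolumeAttachment{
		ObjectMeta: metav1.ObjectMeta{Name: "va-1"},
		Spec: storagev1.VolumeAttachmentSpec{
			Attacher: "disk.csi.akshci.com",
			NodeName: "node-1",
			Source:   storagev1.VolumeAttachmentSource{PersistentVolumeName: pointer.String("pv-1")},
		},
	}
	clientset := fake.NewSimpleClientset(va)

	volumes, err := listVolumeAttachmentOnNode(context.Background(), clientset, "node-1")
	if err != nil {
		t.Fatal(err)
	}
	if len(volumes) != 1 || volumes[0] != "pv-1" {
		t.Fatalf("expected [pv-1], got %v", volumes)
	}
}

Since *kubernetes.Clientset already implements kubernetes.Interface, widening the parameter would not break the existing caller in the reconciler.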