From 1de4af952c9fa14b7c31929da2502ff889986718 Mon Sep 17 00:00:00 2001 From: Andrea Mazzotti Date: Wed, 10 Jul 2024 15:22:43 +0200 Subject: [PATCH] Cleanup code and add comments Signed-off-by: Andrea Mazzotti --- .obs/chartfile/crds/templates/crds.yaml | 73 +------------------ api/v1beta1/machineinventory_types.go | 1 - api/v1beta1/machineselector_types.go | 3 - api/v1beta1/types.go | 11 ++- api/v1beta1/zz_generated.deepcopy.go | 1 - ...l.cattle.io_machineinventoryselectors.yaml | 33 --------- ....io_machineinventoryselectortemplates.yaml | 35 --------- ...mental.cattle.io_machineregistrations.yaml | 5 +- controllers/machineinventory_controller.go | 25 +++---- pkg/install/install.go | 46 ++++++------ pkg/network/network.go | 27 +++++++ 11 files changed, 69 insertions(+), 191 deletions(-) diff --git a/.obs/chartfile/crds/templates/crds.yaml b/.obs/chartfile/crds/templates/crds.yaml index 2a027a70c..b4671fbd8 100644 --- a/.obs/chartfile/crds/templates/crds.yaml +++ b/.obs/chartfile/crds/templates/crds.yaml @@ -348,39 +348,6 @@ spec: type: object spec: properties: - network: - description: Network is the network template to be applied during - MachineInventory adoption. - properties: - connections: - additionalProperties: - type: string - type: object - ipAddresses: - additionalProperties: - description: |- - TypedLocalObjectReference contains enough information to let you locate the - typed referenced object inside the same namespace. - properties: - apiGroup: - description: |- - APIGroup is the group for the resource being referenced. - If APIGroup is not specified, the specified Kind must be in the core API group. - For any other third-party types, APIGroup is required. - type: string - kind: - description: Kind is the type of resource being referenced - type: string - name: - description: Name is the name of resource being referenced - type: string - required: - - kind - - name - type: object - x-kubernetes-map-type: atomic - type: object - type: object providerID: description: |- ProviderID the identifier for the elemental instance. @@ -614,41 +581,6 @@ spec: type: object spec: properties: - network: - description: Network is the network template to be applied - during MachineInventory adoption. - properties: - connections: - additionalProperties: - type: string - type: object - ipAddresses: - additionalProperties: - description: |- - TypedLocalObjectReference contains enough information to let you locate the - typed referenced object inside the same namespace. - properties: - apiGroup: - description: |- - APIGroup is the group for the resource being referenced. - If APIGroup is not specified, the specified Kind must be in the core API group. - For any other third-party types, APIGroup is required. - type: string - kind: - description: Kind is the type of resource being - referenced - type: string - name: - description: Name is the name of resource being - referenced - type: string - required: - - kind - - name - type: object - x-kubernetes-map-type: atomic - type: object - type: object providerID: description: |- ProviderID the identifier for the elemental instance. @@ -1002,9 +934,8 @@ spec: type: object type: object network: - description: |- - NetworkTemplate contains a list of IPAddressPools and a network config template. - This template can be defined in both MachineRegistrations and MachineSelectors. + description: NetworkTemplate contains a map of IPAddressPools + and a map of connection templates. properties: connections: additionalProperties: diff --git a/api/v1beta1/machineinventory_types.go b/api/v1beta1/machineinventory_types.go index 478baf822..65502655a 100644 --- a/api/v1beta1/machineinventory_types.go +++ b/api/v1beta1/machineinventory_types.go @@ -30,7 +30,6 @@ const ( PlanTypeReset = "reset" MachineInventoryResettableAnnotation = "elemental.cattle.io/resettable" MachineInventoryOSUnmanagedAnnotation = "elemental.cattle.io/os.unmanaged" - MachineInventoryNetworkConfigApplied = "elemental.cattle.io/network.applied" ) type MachineInventorySpec struct { diff --git a/api/v1beta1/machineselector_types.go b/api/v1beta1/machineselector_types.go index 4c1f517f7..34d1d90a7 100644 --- a/api/v1beta1/machineselector_types.go +++ b/api/v1beta1/machineselector_types.go @@ -38,9 +38,6 @@ type MachineInventorySelectorSpec struct { ProviderID string `json:"providerID,omitempty"` // Selector selector to choose elemental machines. Selector metav1.LabelSelector `json:"selector,omitempty"` - // Network is the network template to be applied during MachineInventory adoption. - // +optional - Network NetworkTemplate `json:"network,omitempty" yaml:"network"` } type MachineInventorySelectorStatus struct { diff --git a/api/v1beta1/types.go b/api/v1beta1/types.go index e316d5371..2ec1f983f 100644 --- a/api/v1beta1/types.go +++ b/api/v1beta1/types.go @@ -173,17 +173,22 @@ const ( DeviceSelectorKeySize DeviceSelectorKey = "Size" ) -// NetworkTemplate contains a list of IPAddressPools and a network config template. -// This template can be defined in both MachineRegistrations and MachineSelectors. +// NetworkTemplate contains a map of IPAddressPools and a map of connection templates. type NetworkTemplate struct { IPAddresses map[string]*corev1.TypedLocalObjectReference `json:"ipAddresses,omitempty" yaml:"ipAddresses,omitempty"` Connections map[string]string `json:"connections,omitempty" yaml:"connections,omitempty"` } -// NetworkConfig contains a list of claimed IPAddresses and a network config template. +// NetworkConfig contains a map of claimed IPAddresses and a map of connection templates. // This config is a digested NetworkTemplate, the MachineInventory Ready condition highlight that // this config is ready to be consumed, this means all needed IPAddressClaims for this machine // have been created and the IPAM provider served real IPAddresses that can be applied to the machine. +// +// Note that Connections still carries the same connection templates from the NetworkTemplate object. +// An alternative could be to simplify this object to contain final connections where the variable +// substitution already took place (fully digested). +// Right now we send both connection templates and real ipaddressed so the consumer (elemental-register) +// can do the substitution itself. type NetworkConfig struct { IPAddresses map[string]string `json:"ipAddresses,omitempty" yaml:"ipAddresses,omitempty"` Connections map[string]string `json:"connections,omitempty" yaml:"connections,omitempty"` diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index 809273537..f0d635fef 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -335,7 +335,6 @@ func (in *MachineInventorySelectorList) DeepCopyObject() runtime.Object { func (in *MachineInventorySelectorSpec) DeepCopyInto(out *MachineInventorySelectorSpec) { *out = *in in.Selector.DeepCopyInto(&out.Selector) - in.Network.DeepCopyInto(&out.Network) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MachineInventorySelectorSpec. diff --git a/config/crd/bases/elemental.cattle.io_machineinventoryselectors.yaml b/config/crd/bases/elemental.cattle.io_machineinventoryselectors.yaml index 3680b930d..4d1be4dde 100644 --- a/config/crd/bases/elemental.cattle.io_machineinventoryselectors.yaml +++ b/config/crd/bases/elemental.cattle.io_machineinventoryselectors.yaml @@ -37,39 +37,6 @@ spec: type: object spec: properties: - network: - description: Network is the network template to be applied during - MachineInventory adoption. - properties: - connections: - additionalProperties: - type: string - type: object - ipAddresses: - additionalProperties: - description: |- - TypedLocalObjectReference contains enough information to let you locate the - typed referenced object inside the same namespace. - properties: - apiGroup: - description: |- - APIGroup is the group for the resource being referenced. - If APIGroup is not specified, the specified Kind must be in the core API group. - For any other third-party types, APIGroup is required. - type: string - kind: - description: Kind is the type of resource being referenced - type: string - name: - description: Name is the name of resource being referenced - type: string - required: - - kind - - name - type: object - x-kubernetes-map-type: atomic - type: object - type: object providerID: description: |- ProviderID the identifier for the elemental instance. diff --git a/config/crd/bases/elemental.cattle.io_machineinventoryselectortemplates.yaml b/config/crd/bases/elemental.cattle.io_machineinventoryselectortemplates.yaml index a6c7785f4..13dd0f5a1 100644 --- a/config/crd/bases/elemental.cattle.io_machineinventoryselectortemplates.yaml +++ b/config/crd/bases/elemental.cattle.io_machineinventoryselectortemplates.yaml @@ -59,41 +59,6 @@ spec: type: object spec: properties: - network: - description: Network is the network template to be applied - during MachineInventory adoption. - properties: - connections: - additionalProperties: - type: string - type: object - ipAddresses: - additionalProperties: - description: |- - TypedLocalObjectReference contains enough information to let you locate the - typed referenced object inside the same namespace. - properties: - apiGroup: - description: |- - APIGroup is the group for the resource being referenced. - If APIGroup is not specified, the specified Kind must be in the core API group. - For any other third-party types, APIGroup is required. - type: string - kind: - description: Kind is the type of resource being - referenced - type: string - name: - description: Name is the name of resource being - referenced - type: string - required: - - kind - - name - type: object - x-kubernetes-map-type: atomic - type: object - type: object providerID: description: |- ProviderID the identifier for the elemental instance. diff --git a/config/crd/bases/elemental.cattle.io_machineregistrations.yaml b/config/crd/bases/elemental.cattle.io_machineregistrations.yaml index 105cd2e90..75914b6fa 100644 --- a/config/crd/bases/elemental.cattle.io_machineregistrations.yaml +++ b/config/crd/bases/elemental.cattle.io_machineregistrations.yaml @@ -173,9 +173,8 @@ spec: type: object type: object network: - description: |- - NetworkTemplate contains a list of IPAddressPools and a network config template. - This template can be defined in both MachineRegistrations and MachineSelectors. + description: NetworkTemplate contains a map of IPAddressPools + and a map of connection templates. properties: connections: additionalProperties: diff --git a/controllers/machineinventory_controller.go b/controllers/machineinventory_controller.go index e32e6f858..4e1bf3296 100644 --- a/controllers/machineinventory_controller.go +++ b/controllers/machineinventory_controller.go @@ -161,6 +161,12 @@ func (r *MachineInventoryReconciler) reconcile(ctx context.Context, mInventory * } if r.networkNeedsReconcile(*mInventory) { + meta.SetStatusCondition(&mInventory.Status.Conditions, metav1.Condition{ + Type: elementalv1.ReadyCondition, + Reason: elementalv1.ReconcilingNetworkConfig, + Status: metav1.ConditionFalse, + Message: "NetworkConfig needs reconcile", + }) result, err := r.reconcileNetworkConfig(ctx, mInventory) if err != nil { meta.SetStatusCondition(&mInventory.Status.Conditions, metav1.Condition{ @@ -310,28 +316,14 @@ func (r *MachineInventoryReconciler) reconcileNetworkConfig(ctx context.Context, logger := ctrl.LoggerFrom(ctx) logger.Info("Reconciling Network Config") - // Before we do any change, we make sure the MachineInventory is patched with the appropriate status and annotation. - // This will put the MachineInventory "on hold" with a Ready=false condition, so that the elemental-register knows - // the config is not ready yet to be consumed. IPAddressClaims must be created and IPAddresses must be served first. - if mInventory.Annotations[elementalv1.MachineInventoryNetworkConfigApplied] != "false" { - logger.Info("Marking Network Config as deprecated") - mInventory.Annotations[elementalv1.MachineInventoryNetworkConfigApplied] = "false" - meta.SetStatusCondition(&mInventory.Status.Conditions, metav1.Condition{ - Type: elementalv1.NetworkConfigReady, - Reason: elementalv1.ReconcilingNetworkConfig, - Status: metav1.ConditionFalse, - Message: "NetworkConfig needs changes", - }) - return ctrl.Result{RequeueAfter: time.Second}, nil - } - - // Loops over the IPAddressPools and create an IPClaim for each + // Loops over the IPAddressPools and create an IPAddressClaim for each for name, ipPoolRef := range mInventory.Spec.IPAddressPools { ipClaimName := fmt.Sprintf("%s-%s", mInventory.Name, name) ipClaim := &ipamv1.IPAddressClaim{ ObjectMeta: metav1.ObjectMeta{ Name: ipClaimName, Namespace: mInventory.Namespace, + // Ownership takes care of IPAddressClaim deletion when the MachineInventory is also deleted. OwnerReferences: []metav1.OwnerReference{ { APIVersion: mInventory.APIVersion, @@ -355,6 +347,7 @@ func (r *MachineInventoryReconciler) reconcileNetworkConfig(ctx context.Context, if err := r.Get(ctx, client.ObjectKeyFromObject(ipClaim), ipClaim); err != nil { return ctrl.Result{}, fmt.Errorf("getting IPAddressClaim '%s': %w", ipClaimName, err) } + // Just for safety, prevent usage of any IPClaim that is undergoing deletion (we don't have an IPAddressClaim watch for a further reconcile) if !ipClaim.DeletionTimestamp.IsZero() { return ctrl.Result{}, fmt.Errorf("Waiting for IPAddressClaim deletion '%s'", ipClaimName) } diff --git a/pkg/install/install.go b/pkg/install/install.go index a2b6c2a40..dd157f9aa 100644 --- a/pkg/install/install.go +++ b/pkg/install/install.go @@ -100,24 +100,13 @@ func (i *installer) InstallElemental(config elementalv1.Config, state register.S log.Warningf("Both device and device-selector set, using device-field '%s'", config.Elemental.Install.Device) } - additionalConfigs, err := i.getCloudInitConfigs(config, state) + additionalConfigs, err := i.getCloudInitConfigs(config, state, networkConfig) if err != nil { return fmt.Errorf("generating additional cloud configs: %w", err) } config.Elemental.Install.ConfigURLs = append(config.Elemental.Install.ConfigURLs, additionalConfigs...) - log.Info("Installing Network config") - networkYipConfig := i.networkConfigurator.GetNetworkConfigApplicator(networkConfig) - networkConfigBytes, err := yaml.Marshal(networkYipConfig) - if err != nil { - return fmt.Errorf("marshalling network config: %w", err) - } - if err := i.fs.WriteFile(tempNetworkConfig, networkConfigBytes, 0600); err != nil { - return fmt.Errorf("writing file '%s': %w", tempNetworkConfig, err) - } - config.Elemental.Install.ConfigURLs = append(config.Elemental.Install.ConfigURLs, tempNetworkConfig) - if err := i.runner.Install(config.Elemental.Install); err != nil { return fmt.Errorf("failed to install elemental: %w", err) } @@ -131,23 +120,12 @@ func (i *installer) ResetElemental(config elementalv1.Config, state register.Sta config.Elemental.Reset.ConfigURLs = []string{} } - additionalConfigs, err := i.getCloudInitConfigs(config, state) + additionalConfigs, err := i.getCloudInitConfigs(config, state, networkConfig) if err != nil { return fmt.Errorf("generating additional cloud configs: %w", err) } config.Elemental.Reset.ConfigURLs = append(config.Elemental.Reset.ConfigURLs, additionalConfigs...) - log.Info("Installing Network config") - networkYipConfig := i.networkConfigurator.GetNetworkConfigApplicator(networkConfig) - networkConfigBytes, err := yaml.Marshal(networkYipConfig) - if err != nil { - return fmt.Errorf("marshalling network config: %w", err) - } - if err := i.fs.WriteFile(tempNetworkConfig, networkConfigBytes, 0600); err != nil { - return fmt.Errorf("writing file '%s': %w", tempNetworkConfig, err) - } - config.Elemental.Reset.ConfigURLs = append(config.Elemental.Reset.ConfigURLs, tempNetworkConfig) - if err := i.runner.Reset(config.Elemental.Reset); err != nil { return fmt.Errorf("failed to reset elemental: %w", err) } @@ -263,7 +241,7 @@ func matchesGt(disk *block.Disk, req elementalv1.DeviceSelectorRequirement) (boo // getCloudInitConfigs creates cloud-init configuration files that can be passed as additional `config-urls` // to the `elemental` cli. We exploit this mechanism to persist information during `elemental install` // or `elemental reset` calls into the newly installed or resetted system. -func (i *installer) getCloudInitConfigs(config elementalv1.Config, state register.State) ([]string, error) { +func (i *installer) getCloudInitConfigs(config elementalv1.Config, state register.State, networkConfig elementalv1.NetworkConfig) ([]string, error) { configs := []string{} agentConfPath, err := i.writeSystemAgentConfig(config.Elemental) if err != nil { @@ -291,6 +269,12 @@ func (i *installer) getCloudInitConfigs(config elementalv1.Config, state registe } configs = append(configs, registrationStatePath) + networkConfigPath, err := i.writeNetworkConfig(networkConfig) + if err != nil { + return nil, fmt.Errorf("writing temporary network config: %w", err) + } + configs = append(configs, networkConfigPath) + return configs, nil } @@ -393,6 +377,18 @@ func (i *installer) writeCloudInit(cloudConfig map[string]runtime.RawExtension) return f.Name(), nil } +func (i *installer) writeNetworkConfig(networkConfig elementalv1.NetworkConfig) (string, error) { + networkYipConfig := i.networkConfigurator.GetNetworkConfigApplicator(networkConfig) + networkConfigBytes, err := yaml.Marshal(networkYipConfig) + if err != nil { + return "", fmt.Errorf("marshalling network config: %w", err) + } + if err := i.fs.WriteFile(tempNetworkConfig, networkConfigBytes, 0600); err != nil { + return "", fmt.Errorf("writing file '%s': %w", tempNetworkConfig, err) + } + return tempNetworkConfig, nil +} + func (i *installer) getConnectionInfoBytes(config elementalv1.Elemental) ([]byte, error) { kubeConfig := api.Config{ Kind: "Config", diff --git a/pkg/network/network.go b/pkg/network/network.go index bf40d108e..ed3334f86 100644 --- a/pkg/network/network.go +++ b/pkg/network/network.go @@ -85,7 +85,21 @@ func (n *networkManagerConfigurator) GetNetworkConfigApplicator(networkConfig el return config } +// ResetNetworkConfig is invoked during reset trigger. +// It is important that once the elemental-system-agent marks the reset-trigger plan as completed, +// the assigned IPs are no longer used. +// This to prevent any race condition in which the remote MachineInventory is deleted, +// the associated IPAddressClaims are also deleted, but the machine is still using the IPAddresses from those claims. +// +// Also note that this method is called twice, since the first time we restart the elemental-system-agent. +// The reason for that restart is that after restarting the NetworkManager, the elemental-system-agent process hangs +// waiting for the (old) connection timeout. This can lead to the scheduled shutdown to trigger (and reset from recovery start), +// before the elemental-system-agent has time to recover from the connection change and confirm the application of the reset-trigger plan. +// Potentially this can lead to an infinite reset loop. func (n *networkManagerConfigurator) ResetNetworkConfig() error { + // If there are no /etc/NetworkManager/system-connections/*.nmconnection files, + // then this is the second time we invoke this method, or we never configured any + // network config so we have nothing to do. connectionFiles, err := n.fs.ReadDir(systemConnectionsDir) if err != nil { return fmt.Errorf("reading files in dir '%s': %w", systemConnectionsDir, err) @@ -102,6 +116,9 @@ func (n *networkManagerConfigurator) ResetNetworkConfig() error { return nil } + // Delete all .nmconnection files. This will also delete any "static" connection that the user defined in the base image for example. + // Which means maybe that is not supported anymore, or if we want to support it we should make sure we only delete the ones created by elemental, + // for example prefixing all files with "elemental-" or just parsing the network config again at this stage to determine the file names. log.Debug("Deleting all .nmconnection configs") cmd := exec.Command("find", systemConnectionsDir, "-name", "*.nmconnection", "-type", "f", "-delete") cmd.Stdout = os.Stdout @@ -111,6 +128,11 @@ func (n *networkManagerConfigurator) ResetNetworkConfig() error { return fmt.Errorf("deleting all %s/*.nmconnection: %w", systemConnectionsDir, err) } + //TODO: Here is where we can create "first boot" connections, for example by parsing the network config from the remote registration. + // In this way we can always reset to a deterministic network configuration, rather than simply delete all connections and revert on default (dhcp) + + // We need to invoke nmcli connection reload to tell NetworkManager to reload connections from disk. + // NetworkManager won't reload them alone with a simple restart. log.Debug("Reloading connections") cmd = exec.Command("nmcli", "connection", "reload") cmd.Stdout = os.Stdout @@ -120,6 +142,7 @@ func (n *networkManagerConfigurator) ResetNetworkConfig() error { return fmt.Errorf("running: nmcli connection reload: %w", err) } + // Restart NetworkManager to restart connections. log.Debug("Restarting NetworkManager") cmd = exec.Command("systemctl", "restart", "NetworkManager.service") cmd.Stdout = os.Stdout @@ -129,6 +152,8 @@ func (n *networkManagerConfigurator) ResetNetworkConfig() error { return fmt.Errorf("running command: systemctl restart NetworkManager.service: %w", err) } + // Not entirely necessary, but this mitigates the risk of continuing with any potential elemental-system-agent + // plan confirmation while the network is offline. log.Debug("Waiting NetworkManager online") cmd = exec.Command("systemctl", "start", "NetworkManager-wait-online.service") cmd.Stdout = os.Stdout @@ -138,6 +163,8 @@ func (n *networkManagerConfigurator) ResetNetworkConfig() error { return fmt.Errorf("running command: systemctl start NetworkManager-wait-online.service: %w", err) } + // Restarts the elemental-system-agent to start a new connection using the new config. + // This will make the plan be executed a second time. log.Debug("Restarting elemental-system-agent") cmd = exec.Command("systemctl", "restart", "elemental-system-agent.service") cmd.Stdout = os.Stdout