From 6f014b1ea164dd0872619422c9db9af3b8141681 Mon Sep 17 00:00:00 2001
From: Utku Ozdemir
Date: Thu, 30 Jan 2025 15:44:50 +0100
Subject: [PATCH] fix: fix node resolution cache for nodes in maintenance mode

There was a problem with the node resolution (a.k.a. DNS) cache.

When a machine is in maintenance mode, there is a corresponding
`MachineStatus` resource for it, but there isn't any
`ClusterMachineIdentity`. Both of these resource types trigger updates
in the node resolution cache. When a machine has never been part of a
cluster, the only source is `MachineStatus`, and the updates triggered
by it did not populate the machine ID in the cache. This caused the
GRPC router to pick the wrong destination.

Furthermore, the cluster and node name information was not removed from
the cache when a machine was removed from a cluster. The cache then
contained obsolete cluster information, so the Talos GRPC proxy no
longer proxied requests correctly after a machine was removed from a
cluster.

Co-authored-by: Artem Chernyshev
Signed-off-by: Utku Ozdemir
---
 internal/backend/dns/service.go      |  3 ++
 internal/backend/dns/service_test.go | 46 +++++++++++++++++++++++++++-
 2 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/internal/backend/dns/service.go b/internal/backend/dns/service.go
index f8ca3662..7abbedfe 100644
--- a/internal/backend/dns/service.go
+++ b/internal/backend/dns/service.go
@@ -281,6 +281,7 @@ func (d *Service) updateEntryByMachineStatus(res *omni.MachineStatus) {
 
 	info := d.machineIDToInfo[res.Metadata().ID()]
 
+	info.ID = res.Metadata().ID()
 	info.TalosVersion = version
 	info.managementEndpoint = res.TypedSpec().Value.ManagementAddress
 
@@ -316,6 +317,8 @@ func (d *Service) deleteIdentityMappings(id resource.ID) {
 		d.nodenameToID.remove(info.Name, id)
 	}
 
+	info.Cluster = ""
+	info.Name = ""
 	info.address = ""
 
 	d.machineIDToInfo[id] = info
diff --git a/internal/backend/dns/service_test.go b/internal/backend/dns/service_test.go
index 5ac3d728..ce608a48 100644
--- a/internal/backend/dns/service_test.go
+++ b/internal/backend/dns/service_test.go
@@ -3,6 +3,7 @@
 // Use of this software is governed by the Business Source License
 // included in the LICENSE file.
 
+//nolint:goconst
 package dns_test
 
 import (
@@ -115,7 +116,7 @@ func (suite *ServiceSuite) TestResolve() {
 	// destroy the identity, assert that it doesn't resolve anymore
 	suite.Require().NoError(suite.state.Destroy(suite.ctx, identity.Metadata()))
 
-	expected = dns.NewInfo(cluster, "test-1", "test-1-node", "")
+	expected = dns.NewInfo("", "test-1", "", "")
 	expected.TalosVersion = machineStatus.TypedSpec().Value.TalosVersion
 
 	// still resolves by the node id, but has an empty address
@@ -129,6 +130,49 @@ func (suite *ServiceSuite) TestResolve() {
 	suite.assertResolve("test-1", zeroInfo)
 }
 
+func (suite *ServiceSuite) TestResolveAllocateAndDeallocate() {
+	expected := dns.NewInfo("", "test-1", "", "")
+
+	expected.TalosVersion = "3.2.1"
+
+	// In the maintenance mode, we only have MachineStatus, so we start with that
+	// (means cache will be initialized with the data on MachineStatus and nothing else - no ClusterMachineIdentity)
+	machineStatus := omni.NewMachineStatus(resources.DefaultNamespace, "test-1")
+
+	machineStatus.TypedSpec().Value.TalosVersion = "3.2.1"
+
+	suite.Require().NoError(suite.state.Create(suite.ctx, machineStatus))
+
+	suite.assertResolve("test-1", expected)
+
+	// allocate the machine to a cluster by creating a ClusterMachineIdentity
+
+	identity := omni.NewClusterMachineIdentity(resources.DefaultNamespace, "test-1")
+	identity.Metadata().Labels().Set(omni.LabelCluster, "test-cluster-1")
+
+	identity.TypedSpec().Value.Nodename = "test-1-node"
+
+	suite.Require().NoError(suite.state.Create(suite.ctx, identity))
+
+	// assert that cluster information gets resolved
+
+	expected.Cluster = "test-cluster-1"
+	expected.Name = "test-1-node"
+
+	suite.assertResolve("test-1", expected)
+
+	// deallocate the machine by destroying the ClusterMachineIdentity
+
+	suite.Require().NoError(suite.state.Destroy(suite.ctx, identity.Metadata()))
+
+	// assert that the machine still resolves but the cluster information is gone
+
+	expected.Cluster = ""
+	expected.Name = ""
+
+	suite.assertResolve("test-1", expected)
+}
+
 func (suite *ServiceSuite) assertResolveAddress(cluster, node, expected string) {
 	err := retry.Constant(3*time.Second, retry.WithUnits(100*time.Millisecond)).RetryWithContext(suite.ctx, func(context.Context) error {
 		resolved := suite.dnsService.Resolve(cluster, node)