From 7e2b804cb715e032ae4f9134e09f398aaf714124 Mon Sep 17 00:00:00 2001 From: Grant Spence Date: Wed, 29 Jan 2025 14:38:05 -0500 Subject: [PATCH 1/2] OCPBUGS-48780: Extend E2E DNS Resolution Timeout to 10 Minutes Fix IBMCloud DNS resolution issues in our E2E tests by extending the timeout to 10 minutes. This fix defines the timeout as a documented constant, ensuring that all DNS resolution logic is updated to reference it. Testing showed that new IBMCloud DNS records were resolving within 7 minutes for external (e.g., test runner cluster) queries. Setting the timeout to 10 minutes provides a reasonable buffer to accommodate DNS propagation across all platforms. --- test/e2e/idle_connection_test.go | 2 +- test/e2e/operator_test.go | 13 +++++++++++-- test/e2e/util_gatewayapi_test.go | 2 +- test/e2e/util_test.go | 2 +- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/test/e2e/idle_connection_test.go b/test/e2e/idle_connection_test.go index 623253de8e..ed4a2dc079 100644 --- a/test/e2e/idle_connection_test.go +++ b/test/e2e/idle_connection_test.go @@ -415,7 +415,7 @@ func idleConnectionTerminationPolicyRunTest(t *testing.T, policy operatorv1.Ingr elbAddress := elbHostname if net.ParseIP(elbHostname) == nil { - if err := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, 5*time.Minute, false, func(ctx context.Context) (bool, error) { + if err := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, dnsResolutionTimeout, false, func(ctx context.Context) (bool, error) { addrs, err := net.LookupHost(elbHostname) if err != nil { t.Logf("%v error resolving %s: %v, retrying...", time.Now(), elbHostname, err) diff --git a/test/e2e/operator_test.go b/test/e2e/operator_test.go index 3aa8745dad..d659ce7126 100644 --- a/test/e2e/operator_test.go +++ b/test/e2e/operator_test.go @@ -127,6 +127,15 @@ var ( clusterConfigName = types.NamespacedName{Namespace: operatorNamespace, Name: manifests.ClusterIngressConfigName} ) +const ( + // dnsResolutionTimeout is the maximum time allowed for the test runner cluster to resolve a newly created + // DNS record. This record may be a wildcard DNS record or a load balancer hostname (for AWS & IBMCloud). This + // timeout accounts for both DNS propagation time from the test cluster's hosted zone to the test runner + // cluster (which may be on different platforms) and any negative caching along the way. As of writing this, AWS + // typically resolves within ~1 minute (see OCPBUGS-14966), while IBMCloud takes ~7 minutes (see OCPBUGS-48780). + dnsResolutionTimeout = 10 * time.Minute +) + func init() { // This is required because controller-runtime expects its consumers to // set a logger through log.SetLogger within 30 seconds of the program's @@ -3056,7 +3065,7 @@ func TestAWSELBConnectionIdleTimeout(t *testing.T) { elbHostname := wildcardRecord.Spec.Targets[0] // Wait until we can resolve the ELB's hostname - if err := wait.PollImmediate(5*time.Second, 5*time.Minute, func() (bool, error) { + if err := wait.PollImmediate(5*time.Second, dnsResolutionTimeout, func() (bool, error) { _, err := net.LookupIP(elbHostname) if err != nil { t.Log(err) @@ -3270,7 +3279,7 @@ func TestConnectTimeout(t *testing.T) { lbHostname := wildcardRecord.Spec.Targets[0] // Wait until we can resolve the LB's hostname - if err := wait.PollUntilContextTimeout(context.Background(), 5*time.Second, 5*time.Minute, true, func(ctx context.Context) (bool, error) { + if err := wait.PollUntilContextTimeout(context.Background(), 5*time.Second, dnsResolutionTimeout, true, func(ctx context.Context) (bool, error) { _, err := net.LookupIP(lbHostname) if err != nil { t.Log(err) diff --git a/test/e2e/util_gatewayapi_test.go b/test/e2e/util_gatewayapi_test.go index 8d505772c4..4e350d4943 100644 --- a/test/e2e/util_gatewayapi_test.go +++ b/test/e2e/util_gatewayapi_test.go @@ -527,7 +527,7 @@ func assertHttpRouteConnection(t *testing.T, hostname string, gateway *gatewayap // Wait and check that the dns name resolves first. Takes a long time, so // if the hostname is actually an IP address, skip this. if net.ParseIP(hostname) == nil { - if err := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, 5*time.Minute, false, func(context context.Context) (bool, error) { + if err := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, dnsResolutionTimeout, false, func(context context.Context) (bool, error) { _, err := net.LookupHost(hostname) if err != nil { t.Logf("%v waiting for HTTP route name %s to resolve (%v)", time.Now(), hostname, err) diff --git a/test/e2e/util_test.go b/test/e2e/util_test.go index b174e755c9..f72172a39e 100644 --- a/test/e2e/util_test.go +++ b/test/e2e/util_test.go @@ -688,7 +688,7 @@ func verifyExternalIngressController(t *testing.T, name types.NamespacedName, ho // If we have a DNS as an external IP address, make sure we can resolve it before moving on. // This just limits the number of "could not resolve host" errors which can be confusing. if net.ParseIP(address) == nil { - if err := wait.PollImmediate(10*time.Second, 5*time.Minute, func() (bool, error) { + if err := wait.PollImmediate(10*time.Second, dnsResolutionTimeout, func() (bool, error) { _, err := net.LookupIP(address) if err != nil { t.Logf("waiting for loadbalancer domain %s to resolve...", address) From 7ee110b86850ee71fce690598ff12415f3e80ec0 Mon Sep 17 00:00:00 2001 From: Grant Spence Date: Wed, 29 Jan 2025 18:15:21 -0500 Subject: [PATCH 2/2] OCPBUGS-42045: Add an Internal Warmup for IBMCloud DNS in E2E During testing, we found that IBMCloud's DNS resolution works well from outside the cluster (e.g., the test runner cluster). However, internal DNS queries within the test cluster trigger to an unchangeable ~30-minute negative caching TTL. This E2E test fix introduces an internal warmup period for IBMCloud clusters to mitigate the negative caching issue. Only one test, TestUnmanagedDNSToManagedDNSInternalIngressController, requires this workaround. --- test/e2e/operator_test.go | 8 ++++++++ test/e2e/util_test.go | 14 ++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/test/e2e/operator_test.go b/test/e2e/operator_test.go index d659ce7126..9ff927c458 100644 --- a/test/e2e/operator_test.go +++ b/test/e2e/operator_test.go @@ -125,6 +125,14 @@ var ( operandNamespace = operatorcontroller.DefaultOperandNamespace defaultName = types.NamespacedName{Namespace: operatorNamespace, Name: manifests.DefaultIngressControllerName} clusterConfigName = types.NamespacedName{Namespace: operatorNamespace, Name: manifests.ClusterIngressConfigName} + + // Platforms that need a DNS "warmup" period for internal (inside the test cluster) DNS resolution. + // The warmup period is a period of delay before the first query is executed to avoid negative caching. + // This is not intended for external (i.e. the test runner cluster) DNS resolution. + platformsNeedInternalDNSWarmup = map[configv1.PlatformType]time.Duration{ + // 7 minutes of warmup was required past testing for internal IBMCloud DNS queries. + configv1.IBMCloudPlatformType: 7 * time.Minute, + } ) const ( diff --git a/test/e2e/util_test.go b/test/e2e/util_test.go index f72172a39e..a3f693b96e 100644 --- a/test/e2e/util_test.go +++ b/test/e2e/util_test.go @@ -766,6 +766,10 @@ func verifyInternalIngressController(t *testing.T, name types.NamespacedName, ho } }() + // Since we've created a new IngressController, and we are about to use a curl pod to query the + // wildcard DNS record, we need to wait for platforms that require a warmup period for internal queries. + waitForInternalDNSWarmup(t) + extraArgs := []string{ "--header", "HOST:" + echoRoute.Spec.Host, "-v", @@ -849,6 +853,16 @@ func verifyInternalIngressController(t *testing.T, name types.NamespacedName, ho } } +// waitForInternalDNSWarmup waits for a designated "warmup" period +// to prevent negative caching on platforms that require it. +func waitForInternalDNSWarmup(t *testing.T) { + warmup, _ := platformsNeedInternalDNSWarmup[infraConfig.Status.PlatformStatus.Type] + if warmup > 0 { + t.Logf("this platform requires DNS warmup time to avoid negative caching...waiting for %s", warmup) + time.Sleep(warmup) + } +} + // assertDeleted tries to delete a cluster resource, and causes test failure if the delete fails. func assertDeleted(t *testing.T, cl client.Client, thing client.Object) { t.Helper()