Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OCPBUGS-48780: Fix IBMCloud DNS Propagation Issues in E2E #1164

Merged
merged 2 commits into from
Jan 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion test/e2e/idle_connection_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,7 @@ func idleConnectionTerminationPolicyRunTest(t *testing.T, policy operatorv1.Ingr
elbAddress := elbHostname

if net.ParseIP(elbHostname) == nil {
if err := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, 5*time.Minute, false, func(ctx context.Context) (bool, error) {
if err := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, dnsResolutionTimeout, false, func(ctx context.Context) (bool, error) {
addrs, err := net.LookupHost(elbHostname)
if err != nil {
t.Logf("%v error resolving %s: %v, retrying...", time.Now(), elbHostname, err)
Expand Down
21 changes: 19 additions & 2 deletions test/e2e/operator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,23 @@ var (
operandNamespace = operatorcontroller.DefaultOperandNamespace
defaultName = types.NamespacedName{Namespace: operatorNamespace, Name: manifests.DefaultIngressControllerName}
clusterConfigName = types.NamespacedName{Namespace: operatorNamespace, Name: manifests.ClusterIngressConfigName}

// Platforms that need a DNS "warmup" period for internal (inside the test cluster) DNS resolution.
// The warmup period is a period of delay before the first query is executed to avoid negative caching.
// This is not intended for external (i.e. the test runner cluster) DNS resolution.
platformsNeedInternalDNSWarmup = map[configv1.PlatformType]time.Duration{
// 7 minutes of warmup was required past testing for internal IBMCloud DNS queries.
configv1.IBMCloudPlatformType: 7 * time.Minute,
}
)

const (
// dnsResolutionTimeout is the maximum time allowed for the test runner cluster to resolve a newly created
// DNS record. This record may be a wildcard DNS record or a load balancer hostname (for AWS & IBMCloud). This
// timeout accounts for both DNS propagation time from the test cluster's hosted zone to the test runner
// cluster (which may be on different platforms) and any negative caching along the way. As of writing this, AWS
// typically resolves within ~1 minute (see OCPBUGS-14966), while IBMCloud takes ~7 minutes (see OCPBUGS-48780).
dnsResolutionTimeout = 10 * time.Minute
)

func init() {
Expand Down Expand Up @@ -3056,7 +3073,7 @@ func TestAWSELBConnectionIdleTimeout(t *testing.T) {
elbHostname := wildcardRecord.Spec.Targets[0]

// Wait until we can resolve the ELB's hostname
if err := wait.PollImmediate(5*time.Second, 5*time.Minute, func() (bool, error) {
if err := wait.PollImmediate(5*time.Second, dnsResolutionTimeout, func() (bool, error) {
_, err := net.LookupIP(elbHostname)
if err != nil {
t.Log(err)
Expand Down Expand Up @@ -3270,7 +3287,7 @@ func TestConnectTimeout(t *testing.T) {
lbHostname := wildcardRecord.Spec.Targets[0]

// Wait until we can resolve the LB's hostname
if err := wait.PollUntilContextTimeout(context.Background(), 5*time.Second, 5*time.Minute, true, func(ctx context.Context) (bool, error) {
if err := wait.PollUntilContextTimeout(context.Background(), 5*time.Second, dnsResolutionTimeout, true, func(ctx context.Context) (bool, error) {
_, err := net.LookupIP(lbHostname)
if err != nil {
t.Log(err)
Expand Down
2 changes: 1 addition & 1 deletion test/e2e/util_gatewayapi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ func assertHttpRouteConnection(t *testing.T, hostname string, gateway *gatewayap
// Wait and check that the dns name resolves first. Takes a long time, so
// if the hostname is actually an IP address, skip this.
if net.ParseIP(hostname) == nil {
if err := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, 5*time.Minute, false, func(context context.Context) (bool, error) {
if err := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, dnsResolutionTimeout, false, func(context context.Context) (bool, error) {
_, err := net.LookupHost(hostname)
if err != nil {
t.Logf("%v waiting for HTTP route name %s to resolve (%v)", time.Now(), hostname, err)
Expand Down
16 changes: 15 additions & 1 deletion test/e2e/util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -688,7 +688,7 @@ func verifyExternalIngressController(t *testing.T, name types.NamespacedName, ho
// If we have a DNS as an external IP address, make sure we can resolve it before moving on.
// This just limits the number of "could not resolve host" errors which can be confusing.
if net.ParseIP(address) == nil {
if err := wait.PollImmediate(10*time.Second, 5*time.Minute, func() (bool, error) {
if err := wait.PollImmediate(10*time.Second, dnsResolutionTimeout, func() (bool, error) {
_, err := net.LookupIP(address)
if err != nil {
t.Logf("waiting for loadbalancer domain %s to resolve...", address)
Expand Down Expand Up @@ -766,6 +766,10 @@ func verifyInternalIngressController(t *testing.T, name types.NamespacedName, ho
}
}()

// Since we've created a new IngressController, and we are about to use a curl pod to query the
// wildcard DNS record, we need to wait for platforms that require a warmup period for internal queries.
waitForInternalDNSWarmup(t)

extraArgs := []string{
"--header", "HOST:" + echoRoute.Spec.Host,
"-v",
Expand Down Expand Up @@ -849,6 +853,16 @@ func verifyInternalIngressController(t *testing.T, name types.NamespacedName, ho
}
}

// waitForInternalDNSWarmup waits for a designated "warmup" period
// to prevent negative caching on platforms that require it.
func waitForInternalDNSWarmup(t *testing.T) {
warmup, _ := platformsNeedInternalDNSWarmup[infraConfig.Status.PlatformStatus.Type]
if warmup > 0 {
t.Logf("this platform requires DNS warmup time to avoid negative caching...waiting for %s", warmup)
time.Sleep(warmup)
}
}

// assertDeleted tries to delete a cluster resource, and causes test failure if the delete fails.
func assertDeleted(t *testing.T, cl client.Client, thing client.Object) {
t.Helper()
Expand Down