Skip to content

Commit

Permalink
Fix IBMCloud DNS Propagation Issues
Browse files Browse the repository at this point in the history
Resolves IBMCloud DNS names inside the cluster because they don't
propagate to the test runner clusters in a reasonable time.
  • Loading branch information
gcs278 committed Oct 29, 2024
1 parent 8bf1b36 commit 34c2e67
Show file tree
Hide file tree
Showing 2 changed files with 163 additions and 23 deletions.
43 changes: 32 additions & 11 deletions test/e2e/operator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,12 @@ var (
operandNamespace = operatorcontroller.DefaultOperandNamespace
defaultName = types.NamespacedName{Namespace: operatorNamespace, Name: manifests.DefaultIngressControllerName}
clusterConfigName = types.NamespacedName{Namespace: operatorNamespace, Name: manifests.ClusterIngressConfigName}

// Platforms that need to resolve DNS hostnames from within the cluster.
// Typically because new DNS hostnames won't propagate to the test runner in a reasonable time.
platformsNeedInternalDNSResolutionAndWarmup = map[configv1.PlatformType]time.Duration{
configv1.IBMCloudPlatformType: 3 * time.Minute,
}
)

func init() {
Expand Down Expand Up @@ -3186,21 +3192,14 @@ func TestConnectTimeout(t *testing.T) {
lbHostname := wildcardRecord.Spec.Targets[0]

// Wait until we can resolve the LB's hostname
if err := wait.PollUntilContextTimeout(context.Background(), 5*time.Second, 5*time.Minute, true, func(ctx context.Context) (bool, error) {
_, err := net.LookupIP(lbHostname)
if err != nil {
t.Log(err)
return false, nil
}

return true, nil
}); err != nil {
t.Fatalf("failed to observe expected condition: %v", err)
lbIPAddress, err := waitForLookupIP(t, lbHostname, 5*time.Minute, false)
if err != nil {
t.Fatalf("failed to wait for load balancer hostname: %v", err)
}

// Open a connection to the route, send a request, and verify that the
// connection times out.
request, err := http.NewRequest("GET", "http://"+lbHostname, nil)
request, err := http.NewRequest("GET", "http://"+lbIPAddress, nil)
if err != nil {
t.Fatalf("failed to create HTTP request: %v", err)
}
Expand Down Expand Up @@ -4110,6 +4109,28 @@ func waitForPodReady(t *testing.T, cl client.Client, pod *corev1.Pod, timeout ti
return nil
}

// waitForPodComplete waits for a pod to reach a terminal phase, that is,
// either PodSucceeded or PodFailed. It polls every 5 seconds, starting
// immediately, until the given timeout elapses. Errors from getting the pod
// are logged and retried rather than aborting the wait. It returns an error
// wrapping the poll failure if the pod did not complete in time.
func waitForPodComplete(t *testing.T, cl client.Client, pod *corev1.Pod, timeout time.Duration) error {
	t.Helper()
	name := types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name}
	p := &corev1.Pod{}
	err := wait.PollUntilContextTimeout(context.Background(), 5*time.Second, timeout, true, func(ctx context.Context) (bool, error) {
		if err := cl.Get(ctx, name, p); err != nil {
			t.Logf("error getting pod %s: %v", name, err)
			return false, nil
		}
		switch p.Status.Phase {
		case corev1.PodSucceeded, corev1.PodFailed:
			return true, nil
		}
		t.Logf("pod %s has not completed yet", name)
		return false, nil
	})
	if err != nil {
		// Keep the underlying poll error so callers can see why the wait ended.
		return fmt.Errorf("failed to wait for pod %s to complete: %w", name, err)
	}
	return nil
}

func clusterOperatorConditionMap(conditions ...configv1.ClusterOperatorStatusCondition) map[string]string {
conds := map[string]string{}
for _, cond := range conditions {
Expand Down
143 changes: 131 additions & 12 deletions test/e2e/util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -660,20 +660,12 @@ func verifyExternalIngressController(t *testing.T, name types.NamespacedName, ho

// If we have a DNS as an external IP address, make sure we can resolve it before moving on.
// This just limits the number of "could not resolve host" errors which can be confusing.
if net.ParseIP(address) == nil {
if err := wait.PollImmediate(10*time.Second, 5*time.Minute, func() (bool, error) {
_, err := net.LookupIP(address)
if err != nil {
t.Logf("waiting for loadbalancer domain %s to resolve...", address)
return false, nil
}
return true, nil
}); err != nil {
t.Fatalf("loadbalancer domain %s was unable to resolve: %v", address, err)
}
ipAddress, err := waitForLookupIP(t, address, 5*time.Minute, false)
if err != nil {
t.Fatalf("failed to wait for load balancer hostname: %v", err)
}

req, err := http.NewRequest(http.MethodGet, fmt.Sprintf("http://%s", address), nil)
req, err := http.NewRequest(http.MethodGet, fmt.Sprintf("http://%s", ipAddress), nil)
if err != nil {
t.Fatalf("failed to build client request: %v", err)
}
Expand Down Expand Up @@ -739,6 +731,9 @@ func verifyInternalIngressController(t *testing.T, name types.NamespacedName, ho
}
}()

// Wait for DNS to resolve, including warmup period if needed.
waitForLookupIP(t, address, 5*time.Minute, true)

extraArgs := []string{
"--header", "HOST:" + echoRoute.Spec.Host,
"-v",
Expand Down Expand Up @@ -822,6 +817,130 @@ func verifyInternalIngressController(t *testing.T, name types.NamespacedName, ho
}
}

// waitForLookupIP resolves host to a single IP address, retrying every 10
// seconds (starting immediately) until timeout elapses.
//
// If host is already an IP address, it is returned unchanged. Otherwise the
// lookup is done from inside the cluster when the current platform requires
// it or when forceInternal is true, and with the test runner's resolver in
// all other cases. On platforms that configure a DNS warmup duration, each
// in-cluster lookup attempt is preceded by a sleep of that duration.
//
// If the hostname cannot be resolved before the timeout, the test is failed
// with t.Fatalf, so the returned error is currently always nil. The error
// return is kept for callers that check it.
func waitForLookupIP(t *testing.T, host string, timeout time.Duration, forceInternal bool) (string, error) {
	t.Helper()

	// If already an IP, return as is.
	if net.ParseIP(host) != nil {
		return host, nil
	}

	var ips []net.IP
	err := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, timeout, true, func(_ context.Context) (bool, error) {
		var lookupErr error
		if platformNeedInternalDNSResolution() || forceInternal {
			if warmup := platformDNSWarmupDuration(); warmup > 0 {
				t.Logf("this platform requires DNS warmup time to avoid negative caching...waiting for %s", warmup)
				time.Sleep(warmup)
			}
			ips, lookupErr = lookupIPInsideCluster(t, host)
		} else {
			ips, lookupErr = net.LookupIP(host)
		}
		if lookupErr != nil {
			t.Logf("waiting for hostname %s to resolve: %v", host, lookupErr)
			return false, nil
		}
		// Treat an empty result as "not resolved yet" so that indexing the
		// first address below cannot panic.
		return len(ips) > 0, nil
	})
	if err != nil {
		t.Fatalf("hostname %s was unable to resolve: %v", host, err)
	}

	return ips[0].String(), nil
}

// platformNeedInternalDNSResolution returns true if the current platform
// requires internal cluster DNS resolution, and false if not.
func platformNeedInternalDNSResolution() bool {
if _, exists := platformsNeedInternalDNSResolutionAndWarmup[infraConfig.Status.PlatformStatus.Type]; exists {
return true
}
return false
}

func platformDNSWarmupDuration() time.Duration {
if warmup, exists := platformsNeedInternalDNSResolutionAndWarmup[infraConfig.Status.PlatformStatus.Type]; exists {
return warmup
}
return 0
}

// lookupIPInsideCluster resolves host from inside the cluster by running
// "dig +short" in a short-lived pod in the operand namespace and parsing the
// pod's logs. The pod uses the image of the first container of the
// "router-default" deployment.
//
// It returns every line of the dig output that parses as an IP address, or an
// error if the deployment cannot be read, the pod cannot be created or does
// not complete within one minute, the logs cannot be fetched, or no IP
// address is found. Failing to build a log client, or failing to delete the
// pod for a reason other than "not found", fails the test with t.Fatalf.
func lookupIPInsideCluster(t *testing.T, host string) ([]net.IP, error) {
	t.Helper()
	ns := operandNamespace

	// Get current router image so we can use it for running dig.
	deployment := &appsv1.Deployment{}
	deploymentName := types.NamespacedName{Namespace: controller.DefaultOperandNamespace, Name: "router-default"}
	if err := kclient.Get(context.TODO(), deploymentName, deployment); err != nil {
		return nil, err
	}
	containers := deployment.Spec.Template.Spec.Containers
	if len(containers) == 0 {
		return nil, fmt.Errorf("deployment %s has no containers to take the dig image from", deploymentName)
	}
	image := containers[0].Image

	digPod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			GenerateName: "dig-lookup-",
			Namespace:    ns,
		},
		Spec: corev1.PodSpec{
			Containers: []corev1.Container{
				{
					Name:    "dig",
					Image:   image,
					Command: []string{"sh", "-c", fmt.Sprintf("dig +short %s", host)},
				},
			},
			RestartPolicy: corev1.RestartPolicyNever,
		},
	}
	if err := kclient.Create(context.Background(), digPod); err != nil {
		// The generated name is not known when creation fails, so report
		// the name prefix instead of the still-empty Name field.
		return nil, fmt.Errorf("failed to create dig pod %s/%s: %w", digPod.Namespace, digPod.GenerateName, err)
	}
	defer func() {
		if err := kclient.Delete(context.TODO(), digPod); err != nil {
			if !errors.IsNotFound(err) {
				t.Fatalf("failed to delete pod %s/%s: %v", digPod.Namespace, digPod.Name, err)
			}
		}
	}()

	if err := waitForPodComplete(t, kclient, digPod, 1*time.Minute); err != nil {
		return nil, fmt.Errorf("failed to observe dig pod completion: %w", err)
	}

	// Get client-go ClientSet for retrieving logs.
	kubeConfig, err := config.GetConfig()
	if err != nil {
		t.Fatalf("failed to get kube config: %v", err)
	}
	cl, err := kubernetes.NewForConfig(kubeConfig)
	if err != nil {
		t.Fatalf("failed to create kube client: %v", err)
	}

	// Retrieve logs from dig pod.
	logs, err := cl.CoreV1().Pods(ns).GetLogs(digPod.Name, &corev1.PodLogOptions{
		Container: "dig",
		Follow:    false,
	}).DoRaw(context.Background())
	if err != nil {
		return nil, fmt.Errorf("failed to get %s/%s pod logs: %w", digPod.Namespace, digPod.Name, err)
	}

	// Attempt to parse out the IPs from the logs. Lines are trimmed first so
	// that stray whitespace or carriage returns do not hide a valid address;
	// lines that are not IPs (for example CNAME targets) are skipped.
	var ips []net.IP
	for _, line := range strings.Split(string(logs), "\n") {
		if ip := net.ParseIP(strings.TrimSpace(line)); ip != nil {
			ips = append(ips, ip)
		}
	}

	if len(ips) == 0 {
		return nil, fmt.Errorf("no valid IP addresses found for host using internal-cluster dig: %s", host)
	}

	return ips, nil
}

// assertDeleted tries to delete a cluster resource, and causes test failure if the delete fails.
func assertDeleted(t *testing.T, cl client.Client, thing client.Object) {
t.Helper()
Expand Down

0 comments on commit 34c2e67

Please sign in to comment.