diff --git a/test/extended/imagepolicy/imagepolicy.go b/test/extended/imagepolicy/imagepolicy.go index eff805a06057..65284f7e5567 100644 --- a/test/extended/imagepolicy/imagepolicy.go +++ b/test/extended/imagepolicy/imagepolicy.go @@ -216,8 +216,7 @@ func updateImageConfig(oc *exutil.CLI, allowedRegistries []string) { return err }) o.Expect(err).NotTo(o.HaveOccurred(), "error updating image config") - WaitForMCPConfigSpecChangeAndUpdated(oc, workerPool, initialWorkerSpec) - WaitForMCPConfigSpecChangeAndUpdated(oc, masterPool, initialMasterSpec) + WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) } func cleanupImageConfig(oc *exutil.CLI) error { @@ -238,8 +237,7 @@ func cleanupImageConfig(oc *exutil.CLI) error { return err }) o.Expect(err).NotTo(o.HaveOccurred(), "error cleaning up image config") - WaitForMCPConfigSpecChangeAndUpdated(oc, workerPool, initialWorkerSpec) - WaitForMCPConfigSpecChangeAndUpdated(oc, masterPool, initialMasterSpec) + WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) return nil } @@ -285,8 +283,7 @@ func createClusterImagePolicy(oc *exutil.CLI, policy configv1.ClusterImagePolicy _, err := oc.AdminConfigClient().ConfigV1().ClusterImagePolicies().Create(context.TODO(), &policy, metav1.CreateOptions{}) o.Expect(err).NotTo(o.HaveOccurred()) - WaitForMCPConfigSpecChangeAndUpdated(oc, workerPool, initialWorkerSpec) - WaitForMCPConfigSpecChangeAndUpdated(oc, masterPool, initialMasterSpec) + WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) } func deleteClusterImagePolicy(oc *exutil.CLI, policyName string) error { @@ -296,8 +293,7 @@ func deleteClusterImagePolicy(oc *exutil.CLI, policyName string) error { if err := oc.AdminConfigClient().ConfigV1().ClusterImagePolicies().Delete(context.TODO(), policyName, metav1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) { return fmt.Errorf("failed to delete cluster image policy %s: %v", policyName, err) } - WaitForMCPConfigSpecChangeAndUpdated(oc, workerPool, initialWorkerSpec) - WaitForMCPConfigSpecChangeAndUpdated(oc, masterPool, initialMasterSpec) + WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) return nil } @@ -312,8 +308,7 @@ func createImagePolicy(oc *exutil.CLI, policy configv1.ImagePolicy, namespace st // Wait until each pool's Spec.Configuration.Name changes from the initial value // and the pool reports Updated=true - WaitForMCPConfigSpecChangeAndUpdated(oc, workerPool, initialWorkerSpec) - WaitForMCPConfigSpecChangeAndUpdated(oc, masterPool, initialMasterSpec) + WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) } func deleteImagePolicy(oc *exutil.CLI, policyName string, namespace string) error { @@ -323,8 +318,7 @@ func deleteImagePolicy(oc *exutil.CLI, policyName string, namespace string) erro if err := oc.AdminConfigClient().ConfigV1().ImagePolicies(namespace).Delete(context.TODO(), policyName, metav1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) { return fmt.Errorf("failed to delete image policy %s in namespace %s: %v", policyName, namespace, err) } - WaitForMCPConfigSpecChangeAndUpdated(oc, workerPool, initialWorkerSpec) - WaitForMCPConfigSpecChangeAndUpdated(oc, masterPool, initialMasterSpec) + WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) return nil } @@ -707,7 +701,42 @@ func WaitForMCPConfigSpecChangeAndUpdated(oc *exutil.CLI, pool string, initialSp return false } return machineconfighelper.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdated) - }, 20*time.Minute, 10*time.Second).Should(o.BeTrue()) + }, 15*time.Minute, 10*time.Second).Should(o.BeTrue()) +} + +func WaitForMCPsConfigSpecChangeAndUpdated(oc *exutil.CLI, workerInitialSpec, masterInitialSpec string) { + e2e.Logf("Waiting for worker and master pools to complete") + clientSet, err := machineconfigclient.NewForConfig(oc.KubeFramework().ClientConfig()) + o.Expect(err).NotTo(o.HaveOccurred()) + + o.Eventually(func() bool { + workerMCP, err := clientSet.MachineconfigurationV1().MachineConfigPools().Get(context.TODO(), "worker", metav1.GetOptions{}) + if err != nil { + return false + } + masterMCP, err := clientSet.MachineconfigurationV1().MachineConfigPools().Get(context.TODO(), "master", metav1.GetOptions{}) + if err != nil { + return false + } + + workerReady := workerMCP.Status.Configuration.Name != workerInitialSpec && + workerMCP.Spec.Configuration.Name == workerMCP.Status.Configuration.Name && + machineconfighelper.IsMachineConfigPoolConditionTrue(workerMCP.Status.Conditions, mcfgv1.MachineConfigPoolUpdated) + + masterReady := masterMCP.Status.Configuration.Name != masterInitialSpec && + masterMCP.Spec.Configuration.Name == masterMCP.Status.Configuration.Name && + machineconfighelper.IsMachineConfigPoolConditionTrue(masterMCP.Status.Conditions, mcfgv1.MachineConfigPoolUpdated) + + if !workerReady { + e2e.Logf("Worker MCP not ready yet") + } + if !masterReady { + e2e.Logf("Master MCP not ready yet") + } + + return workerReady && masterReady + }, 15*time.Minute, 10*time.Second).Should(o.BeTrue()) + e2e.Logf("Both worker and master pools completed successfully") } func isDisconnectedCluster(oc *exutil.CLI) bool { diff --git a/test/extended/node/criocredentialprovider.go b/test/extended/node/criocredentialprovider.go index 9e3d72910dd6..3daf87c0d9ca 100644 --- a/test/extended/node/criocredentialprovider.go +++ b/test/extended/node/criocredentialprovider.go @@ -195,8 +195,7 @@ func updateCRIOCredentialProviderConfig(oc *exutil.CLI, matchImages []string, ex return } - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, workerPool, initialWorkerSpec) - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, masterPool, initialMasterSpec) + imagepolicy.WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) } func getWorkerNodes(oc *exutil.CLI) ([]corev1.Node, error) { @@ -289,8 +288,7 @@ func createIDMSResources(oc *exutil.CLI) { e2e.Logf("Created ImageDigestMirrorSet %q", idms.Name) - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, workerPool, initialWorkerSpec) - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, masterPool, initialMasterSpec) + imagepolicy.WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) } func cleanupIDMSResources(oc *exutil.CLI) { @@ -302,8 +300,7 @@ func cleanupIDMSResources(oc *exutil.CLI) { e2e.Logf("Deleted ImageDigestMirrorSet %q", "digest-mirror") - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, workerPool, initialWorkerSpec) - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, masterPool, initialMasterSpec) + imagepolicy.WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) } func createNamespaceRBAC(f *e2e.Framework, namespace string) { diff --git a/test/extended/node/image_volume.go b/test/extended/node/image_volume.go index dea55d4e1a2f..622bb03edc15 100644 --- a/test/extended/node/image_volume.go +++ b/test/extended/node/image_volume.go @@ -66,13 +66,15 @@ func describeImageVolumeTests(config imageVolumeTestConfig) bool { podName = config.frameworkName + "-test" ) - g.BeforeEach(func() { + g.BeforeEach(func(ctx context.Context) { // Microshift doesn't inherit OCP feature gates, and ImageVolume won't work either isMicroshift, err := exutil.IsMicroShiftCluster(oc.AdminKubeClient()) o.Expect(err).NotTo(o.HaveOccurred()) if isMicroshift { g.Skip("Not supported on Microshift") } + + EnsureNodesReady(ctx, oc) }) g.It("should succeed with pod and pull policy of Always", func(ctx context.Context) { diff --git a/test/extended/node/kubelet_secret_pulled_images.go b/test/extended/node/kubelet_secret_pulled_images.go index af1c74c9b527..a1b1e1f9145f 100644 --- a/test/extended/node/kubelet_secret_pulled_images.go +++ b/test/extended/node/kubelet_secret_pulled_images.go @@ -206,7 +206,7 @@ var _ = g.Describe("[sig-node][Suite:openshift/disruptive-longrunning][Disruptiv g.DeferCleanup(func() { _ = deleteKC(oc, kcName) - _ = waitForMCP(ctx, mcClient, "worker", 30*time.Minute) + _ = waitForMCP(ctx, mcClient, "worker", 15*time.Minute) }) g.By("Pre-caching private image on the node with a valid secret") @@ -215,7 +215,7 @@ var _ = g.Describe("[sig-node][Suite:openshift/disruptive-longrunning][Disruptiv g.By("Applying NeverVerify policy and waiting for MCO rollout") credVerifyApplyPolicy(ctx, mcClient, kcName, `{"imagePullCredentialsVerificationPolicy":"NeverVerify"}`) credVerifyWaitForMCPUpdating(ctx, mcClient, "worker") - err = waitForMCP(ctx, mcClient, "worker", 30*time.Minute) + err = waitForMCP(ctx, mcClient, "worker", 15*time.Minute) o.Expect(err).NotTo(o.HaveOccurred()) g.By("Verifying NeverVerify policy allows pod without secret to use cached image") @@ -224,7 +224,7 @@ var _ = g.Describe("[sig-node][Suite:openshift/disruptive-longrunning][Disruptiv g.By("Switching to AlwaysVerify policy and waiting for MCO rollout") credVerifyApplyPolicy(ctx, mcClient, kcName, `{"imagePullCredentialsVerificationPolicy":"AlwaysVerify"}`) credVerifyWaitForMCPUpdating(ctx, mcClient, "worker") - err = waitForMCP(ctx, mcClient, "worker", 30*time.Minute) + err = waitForMCP(ctx, mcClient, "worker", 15*time.Minute) o.Expect(err).NotTo(o.HaveOccurred()) // This pod also re-caches the image after MCO rollout since pull records are cleared diff --git a/test/extended/node/kubeletconfig_tls.go b/test/extended/node/kubeletconfig_tls.go index 369cccda386f..dcb992badcbb 100644 --- a/test/extended/node/kubeletconfig_tls.go +++ b/test/extended/node/kubeletconfig_tls.go @@ -220,7 +220,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv "Timed out waiting for MachineConfigPool %q to start updating", testMCPName) g.By(fmt.Sprintf("Waiting for MachineConfigPool %s to complete rollout", testMCPName)) - err = waitForMCP(ctx, mcClient, testMCPName, 30*time.Minute) + err = waitForMCP(ctx, mcClient, testMCPName, 15*time.Minute) o.Expect(err).NotTo(o.HaveOccurred(), "Error waiting for MachineConfigPool %q to become ready", testMCPName) framework.Logf("MachineConfigPool %s has completed rollout", testMCPName) diff --git a/test/extended/node/node_e2e/container_runtime_config.go b/test/extended/node/node_e2e/container_runtime_config.go index dc9c0d39fb1a..ff799d4825d3 100644 --- a/test/extended/node/node_e2e/container_runtime_config.go +++ b/test/extended/node/node_e2e/container_runtime_config.go @@ -28,12 +28,14 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv oc = exutil.NewCLIWithoutNamespace("ctrcfg") ) - g.BeforeEach(func() { + g.BeforeEach(func(ctx context.Context) { isMicroShift, err := exutil.IsMicroShiftCluster(oc.AdminKubeClient()) o.Expect(err).NotTo(o.HaveOccurred(), "failed to detect MicroShift cluster") if isMicroShift { g.Skip("Skipping test on MicroShift cluster - MachineConfig resources are not available") } + + nodeutils.EnsureNodesReady(ctx, oc) }) // Validates that ContainerRuntimeConfig pidsLimit setting is correctly applied @@ -51,12 +53,12 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv workerNode := workers[0].Name g.By("Make a manual change to crio.conf on worker node") - _, err = nodeutils.ExecOnNodeWithChroot(oc, workerNode, + _, err = nodeutils.ExecOnNodeWithChroot(ctx, oc, workerNode, "/bin/bash", "-c", `sed -i '/^\[crio\.runtime\]/a log_level = "debug"' /etc/crio/crio.conf`) o.Expect(err).NotTo(o.HaveOccurred(), "failed to edit crio.conf on node %s", workerNode) g.By("Verify the manual crio.conf edit took effect") - editedConf, err := nodeutils.ExecOnNodeWithChroot(oc, workerNode, "cat", "/etc/crio/crio.conf") + editedConf, err := nodeutils.ExecOnNodeWithChroot(ctx, oc, workerNode, "cat", "/etc/crio/crio.conf") o.Expect(err).NotTo(o.HaveOccurred(), "failed to read crio.conf on node %s", workerNode) o.Expect(editedConf).To(o.ContainSubstring(`log_level = "debug"`), "sed edit did not apply: expected log_level = debug in crio.conf") @@ -100,7 +102,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv var crioConfig string o.Eventually(func() error { var execErr error - crioConfig, execErr = nodeutils.ExecOnNodeWithChroot(oc, workerNode, + crioConfig, execErr = nodeutils.ExecOnNodeWithChroot(ctx, oc, workerNode, "/bin/bash", "-c", "crio config 2>/dev/null") return execErr }, 30*time.Second, 5*time.Second).Should(o.Succeed(), "failed to get crio config on node %s", workerNode) @@ -163,7 +165,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv e2e.Logf("Worker node rolled out successfully") g.By("Check overlaySize takes effect in storage.conf on worker node") - storageConf, err := nodeutils.ExecOnNodeWithChroot(oc, workerNode, + storageConf, err := nodeutils.ExecOnNodeWithChroot(ctx, oc, workerNode, "/bin/bash", "-c", "head -n 7 /etc/containers/storage.conf | grep size") o.Expect(err).NotTo(o.HaveOccurred(), "failed to read storage.conf on node %s", workerNode) e2e.Logf("storage.conf size line: %s", storageConf) diff --git a/test/extended/node/node_e2e/image_registry_config.go b/test/extended/node/node_e2e/image_registry_config.go index ce3a02189813..14bc176f60e1 100644 --- a/test/extended/node/node_e2e/image_registry_config.go +++ b/test/extended/node/node_e2e/image_registry_config.go @@ -24,12 +24,14 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv oc = exutil.NewCLIWithoutNamespace("imgcfg") ) - g.BeforeEach(func() { + g.BeforeEach(func(ctx context.Context) { isMicroShift, err := exutil.IsMicroShiftCluster(oc.AdminKubeClient()) o.Expect(err).NotTo(o.HaveOccurred(), "failed to detect cluster type") if isMicroShift { g.Skip("Skipping test on MicroShift cluster - MachineConfig resources are not available") } + + nodeutils.EnsureNodesReady(ctx, oc) }) // Verifies that updating image.config.openshift.io/cluster with a new search @@ -62,8 +64,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv cleanupWorkerSpec := imagepolicy.GetMCPCurrentSpecConfigName(oc, "worker") cleanupMasterSpec := imagepolicy.GetMCPCurrentSpecConfigName(oc, "master") - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, "worker", cleanupWorkerSpec) - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, "master", cleanupMasterSpec) + imagepolicy.WaitForMCPsConfigSpecChangeAndUpdated(oc, cleanupWorkerSpec, cleanupMasterSpec) e2e.Logf("Cleanup: waiting for all cluster operators to settle") waitErr := operator.WaitForOperatorsToSettle(ctx, oc.AdminConfigClient(), 10) @@ -90,8 +91,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv o.Expect(err).NotTo(o.HaveOccurred(), "failed to update image.config.openshift.io/cluster") g.By("Wait for worker and master MCP rollout to complete") - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, "worker", initialWorkerSpec) - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, "master", initialMasterSpec) + imagepolicy.WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) g.By("Verify search registries config on a worker node") workers, err := exutil.GetReadySchedulableWorkerNodes(ctx, oc.AdminKubeClient()) @@ -101,7 +101,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv var registriesConf string o.Eventually(func() error { var execErr error - registriesConf, execErr = nodeutils.ExecOnNodeWithChroot(oc, workers[0].Name, + registriesConf, execErr = nodeutils.ExecOnNodeWithChroot(ctx, oc, workers[0].Name, "cat", "/etc/containers/registries.conf.d/01-image-searchRegistries.conf") if execErr != nil { return execErr @@ -115,7 +115,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv e2e.Logf("Registries config on %s:\n%s", workers[0].Name, registriesConf) g.By("Verify policy.json is updated with allowed registries") - policyJSON, err := nodeutils.ExecOnNodeWithChroot(oc, workers[0].Name, + policyJSON, err := nodeutils.ExecOnNodeWithChroot(ctx, oc, workers[0].Name, "cat", "/etc/containers/policy.json") o.Expect(err).NotTo(o.HaveOccurred(), "failed to read policy.json on node %s", workers[0].Name) e2e.Logf("policy.json on %s:\n%s", workers[0].Name, policyJSON) diff --git a/test/extended/node/node_e2e/initcontainer.go b/test/extended/node/node_e2e/initcontainer.go index 6e095fe222d7..9d539f722872 100644 --- a/test/extended/node/node_e2e/initcontainer.go +++ b/test/extended/node/node_e2e/initcontainer.go @@ -26,12 +26,14 @@ var _ = g.Describe("[sig-node] [Jira:Node/Kubelet] NODE initContainer policy,vol ) // Skip all tests on MicroShift clusters as MachineConfig resources are not available - g.BeforeEach(func() { + g.BeforeEach(func(ctx context.Context) { isMicroShift, err := exutil.IsMicroShiftCluster(oc.AdminKubeClient()) o.Expect(err).NotTo(o.HaveOccurred()) if isMicroShift { g.Skip("Skipping test on MicroShift cluster - MachineConfig resources are not available") } + + nodeutils.EnsureNodesReady(ctx, oc) }) //author: bgudi@redhat.com @@ -127,7 +129,7 @@ var _ = g.Describe("[sig-node] [Jira:Node/Kubelet] NODE initContainer policy,vol actualContainerID := matches[1] g.By("Delete init container from node") - output, err := nodeutils.ExecOnNodeWithChroot(oc, nodeName, "crictl", "rm", actualContainerID) + output, err := nodeutils.ExecOnNodeWithChroot(ctx, oc, nodeName, "crictl", "rm", actualContainerID) o.Expect(err).NotTo(o.HaveOccurred(), "fail to delete container") e2e.Logf("Container deletion output: %s", output) diff --git a/test/extended/node/node_e2e/node.go b/test/extended/node/node_e2e/node.go index 5f43c93e20af..319396edbc82 100644 --- a/test/extended/node/node_e2e/node.go +++ b/test/extended/node/node_e2e/node.go @@ -29,16 +29,18 @@ var _ = g.Describe("[sig-node] [Jira:Node/Kubelet] Kubelet, CRI-O, CPU manager", ) // Skip all tests on MicroShift clusters as MachineConfig resources are not available - g.BeforeEach(func() { + g.BeforeEach(func(ctx context.Context) { isMicroShift, err := exutil.IsMicroShiftCluster(oc.AdminKubeClient()) o.Expect(err).NotTo(o.HaveOccurred()) if isMicroShift { g.Skip("Skipping test on MicroShift cluster - MachineConfig resources are not available") } + + nodeutils.EnsureNodesReady(ctx, oc) }) //author: asahay@redhat.com - g.It("[OTP] validate KUBELET_LOG_LEVEL", func() { + g.It("[OTP] validate KUBELET_LOG_LEVEL", func(ctx context.Context) { var kubeservice string var kubelet string var err error @@ -59,11 +61,11 @@ var _ = g.Describe("[sig-node] [Jira:Node/Kubelet] Kubelet, CRI-O, CPU manager", if nodeStatus == "True" { g.By("Checking KUBELET_LOG_LEVEL in kubelet.service on node " + node) - kubeservice, err = nodeutils.ExecOnNodeWithChroot(oc, node, "/bin/bash", "-c", "systemctl show kubelet.service | grep KUBELET_LOG_LEVEL") + kubeservice, err = nodeutils.ExecOnNodeWithChroot(ctx, oc, node, "/bin/bash", "-c", "systemctl show kubelet.service | grep KUBELET_LOG_LEVEL") o.Expect(err).NotTo(o.HaveOccurred()) g.By("Checking kubelet process for --v=2 flag on node " + node) - kubelet, err = nodeutils.ExecOnNodeWithChroot(oc, node, "/bin/bash", "-c", "ps aux | grep [k]ubelet") + kubelet, err = nodeutils.ExecOnNodeWithChroot(ctx, oc, node, "/bin/bash", "-c", "ps aux | grep [k]ubelet") o.Expect(err).NotTo(o.HaveOccurred()) g.By("Verifying KUBELET_LOG_LEVEL is set and kubelet is running with --v=2") @@ -89,7 +91,7 @@ var _ = g.Describe("[sig-node] [Jira:Node/Kubelet] Kubelet, CRI-O, CPU manager", }) //author: cmaurya@redhat.com - g.It("[OTP] validate cgroupv2 is default [OCP-80983]", func() { + g.It("[OTP] validate cgroupv2 is default [OCP-80983]", func(ctx context.Context) { g.By("Check cgroup version on all Ready worker nodes") nodeNames, err := oc.AsAdmin().WithoutNamespace().Run("get").Args("nodes", "-l", "node-role.kubernetes.io/worker", "-o=jsonpath={.items[*].metadata.name}").Output() o.Expect(err).NotTo(o.HaveOccurred()) @@ -103,7 +105,7 @@ var _ = g.Describe("[sig-node] [Jira:Node/Kubelet] Kubelet, CRI-O, CPU manager", e2e.Logf("Skipping worker node %s (not Ready)", worker) continue } - cgroupV, err := nodeutils.ExecOnNodeWithChroot(oc, worker, "/bin/bash", "-c", "stat -c %T -f /sys/fs/cgroup") + cgroupV, err := nodeutils.ExecOnNodeWithChroot(ctx, oc, worker, "/bin/bash", "-c", "stat -c %T -f /sys/fs/cgroup") o.Expect(err).NotTo(o.HaveOccurred()) e2e.Logf("cgroup version on node %s: [%v]", worker, cgroupV) o.Expect(cgroupV).To(o.ContainSubstring("cgroup2fs"), "Node %s does not have cgroupv2", worker) @@ -116,7 +118,7 @@ var _ = g.Describe("[sig-node] [Jira:Node/Kubelet] Kubelet, CRI-O, CPU manager", }) //author: cmaurya@redhat.com - g.It("[OTP] Allow dev fuse by default in CRI-O [OCP-70987]", func() { + g.It("[OTP] Allow dev fuse by default in CRI-O [OCP-70987]", func(ctx context.Context) { podName := "pod-devfuse" ns := "devfuse-test" @@ -127,7 +129,7 @@ var _ = g.Describe("[sig-node] [Jira:Node/Kubelet] Kubelet, CRI-O, CPU manager", "nodes", "-l", "node-role.kubernetes.io/worker", "-o=jsonpath={.items[0].metadata.name}").Output() o.Expect(err).NotTo(o.HaveOccurred()) o.Expect(node).NotTo(o.BeEmpty()) - runtime, err := nodeutils.ExecOnNodeWithChroot(oc, node, "/bin/bash", "-c", + runtime, err := nodeutils.ExecOnNodeWithChroot(ctx, oc, node, "/bin/bash", "-c", "crio status config 2>/dev/null | awk -F'\"' '/default_runtime/{print $2}'") o.Expect(err).NotTo(o.HaveOccurred()) if strings.TrimSpace(runtime) == "runc" { @@ -169,19 +171,20 @@ var _ = g.Describe("[sig-node] [Jira:Node/Kubelet] Kubelet, CRI-O, CPU manager", // author: asahay@redhat.com var _ = g.Describe("[sig-node][Suite:openshift/disruptive-longrunning][Disruptive][Serial] ImageTagMirrorSet and ImageDigestMirrorSet", func() { var ( - oc = exutil.NewCLIWithoutNamespace("image-mirror-set") - ctx = context.Background() + oc = exutil.NewCLIWithoutNamespace("image-mirror-set") ) - g.BeforeEach(func() { + g.BeforeEach(func(ctx context.Context) { isMicroShift, err := exutil.IsMicroShiftCluster(oc.AdminKubeClient()) o.Expect(err).NotTo(o.HaveOccurred()) if isMicroShift { g.Skip("Skipping test on MicroShift cluster - MachineConfig resources are not available") } + + nodeutils.EnsureNodesReady(ctx, oc) }) - g.It("[OTP] Create ImageDigestMirrorSet and ImageTagMirrorSet and verify registries.conf [OCP-57401]", func() { + g.It("[OTP] Create ImageDigestMirrorSet and ImageTagMirrorSet and verify registries.conf [OCP-57401]", func(ctx context.Context) { configClient := oc.AdminConfigClient().ConfigV1() suffix := utilrand.String(5) idmsName := fmt.Sprintf("digest-mirror-%s", suffix) @@ -220,21 +223,21 @@ var _ = g.Describe("[sig-node][Suite:openshift/disruptive-longrunning][Disruptiv e2e.Logf("ImageDigestMirrorSet %q created successfully", createdIDMS.Name) g.DeferCleanup(func() { + cleanupCtx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + defer cancel() g.By("Cleanup: Delete IDMS and ITMS resources") - cleanupWorkerSpec := imagepolicy.GetMCPCurrentSpecConfigName(oc, "worker") - cleanupMasterSpec := imagepolicy.GetMCPCurrentSpecConfigName(oc, "master") - if delErr := configClient.ImageTagMirrorSets().Delete(ctx, itmsName, metav1.DeleteOptions{}); delErr != nil { + if delErr := configClient.ImageTagMirrorSets().Delete(cleanupCtx, itmsName, metav1.DeleteOptions{}); delErr != nil { e2e.Logf("Warning: failed to delete ImageTagMirrorSet: %v", delErr) } - if delErr := configClient.ImageDigestMirrorSets().Delete(ctx, idmsName, metav1.DeleteOptions{}); delErr != nil { + if delErr := configClient.ImageDigestMirrorSets().Delete(cleanupCtx, idmsName, metav1.DeleteOptions{}); delErr != nil { e2e.Logf("Warning: failed to delete ImageDigestMirrorSet: %v", delErr) } - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, "worker", cleanupWorkerSpec) - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, "master", cleanupMasterSpec) + cleanupWorkerSpec := imagepolicy.GetMCPCurrentSpecConfigName(oc, "worker") + cleanupMasterSpec := imagepolicy.GetMCPCurrentSpecConfigName(oc, "master") + imagepolicy.WaitForMCPsConfigSpecChangeAndUpdated(oc, cleanupWorkerSpec, cleanupMasterSpec) }) - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, "worker", initialWorkerSpec) - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, "master", initialMasterSpec) + imagepolicy.WaitForMCPsConfigSpecChangeAndUpdated(oc, initialWorkerSpec, initialMasterSpec) e2e.Logf("IDMS MCP rollout complete") g.By("Step 2: Create an ImageTagMirrorSet") @@ -271,15 +274,14 @@ var _ = g.Describe("[sig-node][Suite:openshift/disruptive-longrunning][Disruptiv e2e.Logf("ImageTagMirrorSet %q created successfully", createdITMS.Name) g.By("Step 3: Wait for all nodes to finish rolling out") - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, "worker", itmsWorkerSpec) - imagepolicy.WaitForMCPConfigSpecChangeAndUpdated(oc, "master", itmsMasterSpec) + imagepolicy.WaitForMCPsConfigSpecChangeAndUpdated(oc, itmsWorkerSpec, itmsMasterSpec) e2e.Logf("All MCPs have finished rolling out") g.By("Step 4: Verify /etc/containers/registries.conf on a worker node") workerNodeName := nodeutils.GetFirstReadyWorkerNode(oc) o.Expect(workerNodeName).NotTo(o.BeEmpty(), "no ready worker node found") - registriesConf, err := nodeutils.ExecOnNodeWithChroot(oc, workerNodeName, "cat", "/etc/containers/registries.conf") + registriesConf, err := nodeutils.ExecOnNodeWithChroot(ctx, oc, workerNodeName, "cat", "/etc/containers/registries.conf") o.Expect(err).NotTo(o.HaveOccurred(), "failed to read registries.conf from node %s", workerNodeName) e2e.Logf("registries.conf content:\n%s", registriesConf) diff --git a/test/extended/node/node_e2e/pdb_drain.go b/test/extended/node/node_e2e/pdb_drain.go index dc1c95ce14d7..18856a1394bf 100644 --- a/test/extended/node/node_e2e/pdb_drain.go +++ b/test/extended/node/node_e2e/pdb_drain.go @@ -18,6 +18,7 @@ import ( e2e "k8s.io/kubernetes/test/e2e/framework" "k8s.io/utils/ptr" + nodeutils "github.com/openshift/origin/test/extended/node" exutil "github.com/openshift/origin/test/extended/util" "github.com/openshift/origin/test/extended/util/operator" ) @@ -27,12 +28,14 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv oc = exutil.NewCLIWithoutNamespace("pdb-drain") ) - g.BeforeEach(func() { + g.BeforeEach(func(ctx context.Context) { isMicroShift, err := exutil.IsMicroShiftCluster(oc.AdminKubeClient()) o.Expect(err).NotTo(o.HaveOccurred()) if isMicroShift { g.Skip("Skipping test on MicroShift cluster") } + + nodeutils.EnsureNodesReady(ctx, oc) }) //author: bgudi@redhat.com diff --git a/test/extended/node/node_e2e/probe_termination.go b/test/extended/node/node_e2e/probe_termination.go index 6114cc69494b..a5fb9c02058d 100644 --- a/test/extended/node/node_e2e/probe_termination.go +++ b/test/extended/node/node_e2e/probe_termination.go @@ -27,12 +27,14 @@ var _ = g.Describe("[sig-node] Probe configuration", func() { oc = exutil.NewCLIWithoutNamespace("probe-termination") ) - g.BeforeEach(func() { + g.BeforeEach(func(ctx context.Context) { isMicroShift, err := exutil.IsMicroShiftCluster(oc.AdminKubeClient()) o.Expect(err).NotTo(o.HaveOccurred()) if isMicroShift { g.Skip("Skipping test on MicroShift cluster") } + + nodeutils.EnsureNodesReady(ctx, oc) }) //author: bgudi@redhat.com diff --git a/test/extended/node/node_sizing.go b/test/extended/node/node_sizing.go index bfba9942473d..40d67ac46c74 100644 --- a/test/extended/node/node_sizing.go +++ b/test/extended/node/node_sizing.go @@ -31,6 +31,8 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv if isMicroShift { g.Skip("Skipping test on MicroShift cluster") } + + EnsureNodesReady(ctx, oc) }) g.It("should have NODE_SIZING_ENABLED=true by default and NODE_SIZING_ENABLED=false when KubeletConfig with autoSizingReserved=false is applied", func(ctx context.Context) { @@ -154,7 +156,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv err = waitForMCP(ctx, mcClient, testMCPName, 5*time.Minute) o.Expect(err).NotTo(o.HaveOccurred(), "Custom MachineConfigPool should become ready") - verifyNodeSizingEnabledFile(oc, nodeName, "true") + verifyNodeSizingEnabledFile(ctx, oc, nodeName, "true") // Now apply KubeletConfig and verify NODE_SIZING_ENABLED=false @@ -232,7 +234,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv err = waitForMCP(ctx, mcClient, testMCPName, 15*time.Minute) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("%s MCP should become ready with new configuration", testMCPName)) - verifyNodeSizingEnabledFile(oc, nodeName, "false") + verifyNodeSizingEnabledFile(ctx, oc, nodeName, "false") // Explicit cleanup on success; DeferCleanup ensures cleanup also runs on failure cleanupKubeletConfig() @@ -242,14 +244,14 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv }) // verifyNodeSizingEnabledFile verifies the NODE_SIZING_ENABLED value in the env file -func verifyNodeSizingEnabledFile(oc *exutil.CLI, nodeName, expectedValue string) { +func verifyNodeSizingEnabledFile(ctx context.Context, oc *exutil.CLI, nodeName, expectedValue string) { g.By("Verifying /etc/node-sizing-enabled.env file exists") - output, err := ExecOnNodeWithChroot(oc, nodeName, "test", "-f", "/etc/node-sizing-enabled.env") + output, err := ExecOnNodeWithChroot(ctx, oc, nodeName, "test", "-f", "/etc/node-sizing-enabled.env") o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("File /etc/node-sizing-enabled.env should exist on node %s. Output: %s", nodeName, output)) g.By("Reading /etc/node-sizing-enabled.env file contents") - output, err = ExecOnNodeWithChroot(oc, nodeName, "cat", "/etc/node-sizing-enabled.env") + output, err = ExecOnNodeWithChroot(ctx, oc, nodeName, "cat", "/etc/node-sizing-enabled.env") o.Expect(err).NotTo(o.HaveOccurred(), "Should be able to read /etc/node-sizing-enabled.env") framework.Logf("Contents of /etc/node-sizing-enabled.env:\n%s", output) diff --git a/test/extended/node/node_swap.go b/test/extended/node/node_swap.go index f3f0c151c0ce..80f194c9ea11 100644 --- a/test/extended/node/node_swap.go +++ b/test/extended/node/node_swap.go @@ -35,6 +35,8 @@ var _ = g.Describe("[Jira:Node][sig-node] Node non-cnv swap configuration", func if isMicroShift { g.Skip("Skipping test on MicroShift cluster") } + + EnsureNodesReady(ctx, oc) }) // This test validates that: diff --git a/test/extended/node/node_swap_cnv.go b/test/extended/node/node_swap_cnv.go index 2a5669addb25..a04e881befb4 100644 --- a/test/extended/node/node_swap_cnv.go +++ b/test/extended/node/node_swap_cnv.go @@ -91,9 +91,8 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr } }) - // TC1: Verify drop-in directory exists on all nodes (created by MCO for kubelet config) - // Per MCO PR #6044: directory is mandatory on ALL nodes (masters, workers) - g.It("TC1: should verify drop-in directory exists on all nodes with correct ownership", func(ctx context.Context) { + // TC1: Verify silent creation and ownership of drop-in directory + g.It("TC1: should verify silent creation and ownership of drop-in directory on CNV nodes", func(ctx context.Context) { // Get a CNV worker node for tests cnvWorkerNode = getCNVWorkerNodeName(ctx, oc) o.Expect(cnvWorkerNode).NotTo(o.BeEmpty(), "No CNV worker nodes available") @@ -116,7 +115,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Checking drop-in directory exists on ALL worker nodes") for _, workerNode := range workerNodeNames { framework.Logf("Running command: ls -ld %s on node %s", cnvDropInDir, workerNode) - output, err := ExecOnNodeWithChroot(oc, workerNode, "ls", "-ld", cnvDropInDir) + output, err := ExecOnNodeWithChroot(ctx, oc, workerNode, "ls", "-ld", cnvDropInDir) if err != nil { framework.Logf("Drop-in directory does not exist on worker node %s: %v", workerNode, err) e2eskipper.Skipf("Drop-in directory not present on worker node %s - CNV operator may not be installed", workerNode) @@ -129,7 +128,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Checking directory permissions on all worker nodes (should be 755 or stricter)") for _, workerNode := range workerNodeNames { framework.Logf("Running command: stat -c %%a %s on node %s", cnvDropInDir, workerNode) - output, err := ExecOnNodeWithChroot(oc, workerNode, "stat", "-c", "%a", cnvDropInDir) + output, err := ExecOnNodeWithChroot(ctx, oc, workerNode, "stat", "-c", "%a", cnvDropInDir) o.Expect(err).NotTo(o.HaveOccurred()) perms := strings.TrimSpace(output) framework.Logf("Output from node %s: permissions=%s", workerNode, perms) @@ -139,7 +138,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Checking SELinux context on worker nodes") framework.Logf("Running command: ls -ldZ %s on node %s", cnvDropInDir, cnvWorkerNode) - output, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "ls", "-ldZ", cnvDropInDir) + output, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "ls", "-ldZ", cnvDropInDir) if err == nil { framework.Logf("Output: %s", output) } @@ -154,28 +153,27 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr o.Expect(lowerOutput).NotTo(o.ContainSubstring("failed to load kubelet config"), "Should not have kubelet config load failures") o.Expect(lowerOutput).NotTo(o.ContainSubstring("error reading drop-in"), "Should not have errors reading drop-in files") - // Verify drop-in directory also exists on control plane nodes + // Skip on Hypershift - MachineConfig API is not available controlPlaneTopology, err := exutil.GetControlPlaneTopology(oc) o.Expect(err).NotTo(o.HaveOccurred()) if *controlPlaneTopology != configv1.ExternalTopologyMode { - g.By("Verifying drop-in directory EXISTS on control plane/master nodes") + g.By("Verifying drop-in directory does NOT exist on control plane/master nodes") controlPlaneNodes, err := getNodesByLabel(ctx, oc, "node-role.kubernetes.io/master") o.Expect(err).NotTo(o.HaveOccurred()) - o.Expect(controlPlaneNodes).NotTo(o.BeEmpty(), - "expected at least one control-plane/master node in non-external topology") framework.Logf("Found %d control plane/master nodes", len(controlPlaneNodes)) - // Drop-in directory SHOULD exist on control plane nodes (created by MCO for all nodes) + // Drop-in directory should NOT exist on control plane nodes for _, cpNode := range controlPlaneNodes { - output, err := ExecOnNodeWithChroot(oc, cpNode.Name, "ls", "-ld", cnvDropInDir) - o.Expect(err).NotTo(o.HaveOccurred(), "Drop-in directory should exist on control plane node %s", cpNode.Name) - framework.Logf("Control plane node %s has drop-in directory (expected): %s", cpNode.Name, strings.TrimSpace(output)) - - // Verify ownership - o.Expect(output).To(o.ContainSubstring("root root"), "Directory should be owned by root:root on control plane node %s", cpNode.Name) + _, err = ExecOnNodeWithChroot(ctx, oc, cpNode.Name, "ls", "-ld", cnvDropInDir) + if err == nil { + framework.Logf("ERROR: Drop-in directory exists on control plane node %s - this is unexpected", cpNode.Name) + o.Expect(err).To(o.HaveOccurred(), "Drop-in directory should NOT exist on control plane node %s", cpNode.Name) + } else { + framework.Logf("Drop-in directory does NOT exist on control plane node %s (expected)", cpNode.Name) + } } - framework.Logf("TC1 PASSED: Drop-in directory is present on all %d worker nodes and all %d control plane nodes", len(workerNodeNames), len(controlPlaneNodes)) + framework.Logf("TC1 PASSED: Drop-in directory is present on all %d worker nodes and NOT present on any control plane nodes", len(workerNodeNames)) } else { framework.Logf("TC1 PASSED: Drop-in directory is present on all %d worker nodes (skipped control plane validation on Hypershift)", len(workerNodeNames)) } @@ -190,14 +188,14 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr framework.Logf("Using CNV worker node for tests: %s", cnvWorkerNode) g.By("Checking if drop-in directory exists and is empty") - output, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "ls", "-la", cnvDropInDir) + output, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "ls", "-la", cnvDropInDir) if err != nil { e2eskipper.Skipf("Drop-in directory not present") } framework.Logf("Directory contents: %s", output) g.By("Verifying kubelet is running") - output, err = ExecOnNodeWithChroot(oc, cnvWorkerNode, "systemctl", "is-active", "kubelet") + output, err = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "systemctl", "is-active", "kubelet") o.Expect(err).NotTo(o.HaveOccurred()) o.Expect(strings.TrimSpace(output)).To(o.Equal("active"), "Kubelet should be active") @@ -241,19 +239,21 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Creating drop-in file with LimitedSwap configuration in /etc/openshift/kubelet.conf.d/") framework.Logf("Creating file: %s with content:\n%s", cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) - err = createDropInFile(oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) + err = createDropInFile(ctx, oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) o.Expect(err).NotTo(o.HaveOccurred()) g.By("Verifying drop-in file was created successfully") - output, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "cat", cnvDropInFilePath) + output, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "cat", cnvDropInFilePath) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Drop-in file content:\n%s", output) o.Expect(output).To(o.ContainSubstring("LimitedSwap"), "Drop-in file should contain LimitedSwap configuration") // Defer cleanup defer func() { + cleanupCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() g.By("Cleaning up - removing drop-in file and restarting kubelet") - cleanupDropInAndRestartKubelet(ctx, oc, cnvWorkerNode, cnvDropInFilePath) + cleanupDropInAndRestartKubelet(cleanupCtx, oc, cnvWorkerNode, cnvDropInFilePath) }() g.By("Restarting kubelet to load the new configuration") @@ -293,7 +293,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr if configInitial.MemorySwap.SwapBehavior != "LimitedSwap" { g.By("Creating drop-in file with LimitedSwap configuration") framework.Logf("Creating file: %s", cnvDropInFilePath) - err = createDropInFile(oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) + err = createDropInFile(ctx, oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) o.Expect(err).NotTo(o.HaveOccurred()) g.By("Restarting kubelet to apply LimitedSwap") @@ -323,26 +323,138 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr framework.Logf("=== TC4 PASSED ===") }) - // TC5: Validate security and permissions of drop-in directory - g.It("TC5: should validate security and permissions of drop-in directory", func(ctx context.Context) { + // TC5: Verify kubelet ignores drop-in configuration on ALL control plane nodes + g.It("TC5: should verify control plane kubelets ignore drop-in config", func(ctx context.Context) { + framework.Logf("=== TC5: Testing control plane ignores drop-in configuration ===") + + // skip these tests on hypershift platforms + if ok, _ := exutil.IsHypershift(ctx, oc.AdminConfigClient()); ok { + g.Skip("MachineConfigNodes is not supported on hypershift. Skipping tests.") + } + + // Get all control plane nodes + controlPlaneNodes, err := getControlPlaneNodes(ctx, oc) + o.Expect(err).NotTo(o.HaveOccurred()) + if len(controlPlaneNodes) == 0 { + e2eskipper.Skipf("No control plane nodes available") + } + framework.Logf("Found %d control plane nodes to test", len(controlPlaneNodes)) + + for i, cpNode := range controlPlaneNodes { + cpNodeName := cpNode.Name + framework.Logf("--- Testing control plane node %d/%d: %s ---", i+1, len(controlPlaneNodes), cpNodeName) + + g.By(fmt.Sprintf("Getting kubelet config BEFORE placing drop-in file on %s", cpNodeName)) + configBefore, err := getKubeletConfigFromNode(ctx, oc, cpNodeName) + o.Expect(err).NotTo(o.HaveOccurred()) + framework.Logf("Control plane %s swapBehavior BEFORE: '%s'", cpNodeName, configBefore.MemorySwap.SwapBehavior) + + g.By(fmt.Sprintf("Creating drop-in directory on %s if not exists", cpNodeName)) + _, _ = ExecOnNodeWithChroot(ctx, oc, cpNodeName, "mkdir", "-p", cnvDropInDir) + + g.By(fmt.Sprintf("Creating drop-in file on %s", cpNodeName)) + err = createDropInFile(ctx, oc, cpNodeName, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) + o.Expect(err).NotTo(o.HaveOccurred()) + framework.Logf("Created drop-in file: %s on %s", cnvDropInFilePath, cpNodeName) + + g.By(fmt.Sprintf("Restarting kubelet on %s", cpNodeName)) + err = restartKubeletOnNode(ctx, oc, cpNodeName) + o.Expect(err).NotTo(o.HaveOccurred()) + waitForNodeToBeReady(ctx, oc, cpNodeName) + + g.By(fmt.Sprintf("Verifying %s did NOT apply LimitedSwap from drop-in", cpNodeName)) + configAfter, err := getKubeletConfigFromNode(ctx, oc, cpNodeName) + o.Expect(err).NotTo(o.HaveOccurred()) + framework.Logf("Control plane %s swapBehavior AFTER: '%s'", cpNodeName, configAfter.MemorySwap.SwapBehavior) + + // Control plane should not apply LimitedSwap from drop-in (config-dir not configured for control plane) + o.Expect(configAfter.MemorySwap.SwapBehavior).NotTo(o.Equal("LimitedSwap"), + fmt.Sprintf("Control plane %s should NOT apply LimitedSwap from drop-in", cpNodeName)) + + framework.Logf("Control plane %s ignored drop-in file as expected (swapBehavior: '%s' -> '%s')", + cpNodeName, configBefore.MemorySwap.SwapBehavior, configAfter.MemorySwap.SwapBehavior) + + g.By(fmt.Sprintf("Cleaning up %s", cpNodeName)) + removeDropInFile(ctx, oc, cpNodeName, cnvDropInFilePath) + // Also remove the drop-in directory we created on control plane + _, _ = ExecOnNodeWithChroot(ctx, oc, cpNodeName, "rmdir", cnvDropInDir) + framework.Logf("Removed drop-in directory from control plane node %s", cpNodeName) + } + + framework.Logf("=== TC5 PASSED ===") + framework.Logf("All %d control plane nodes ignored drop-in file as expected", len(controlPlaneNodes)) + }) + + // TC6: Verify directory is auto-recreated after deletion and kubelet restart + g.It("TC6: should verify drop-in directory is auto-recreated after deletion", func(ctx context.Context) { + skipOnSingleNodeTopology(oc) //skip this test for SNO + // Get a CNV worker node for tests + cnvWorkerNode = getCNVWorkerNodeName(ctx, oc) + o.Expect(cnvWorkerNode).NotTo(o.BeEmpty(), "No CNV worker nodes available") + + framework.Logf("=== TC6: Testing drop-in directory auto-recreation ===") + framework.Logf("Executing on node: %s", cnvWorkerNode) + + g.By("Checking if directory exists before deletion") + output, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "ls", "-la", cnvDropInDir) + if err != nil { + framework.Logf("Directory does not exist") + } else { + framework.Logf("Output:\n%s", output) + } + + g.By("Deleting drop-in directory") + framework.Logf("Running: rm -rf %s", cnvDropInDir) + _, _ = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "rm", "-rf", cnvDropInDir) + framework.Logf("Directory deletion command executed") + + g.By("Verifying directory is deleted") + framework.Logf("Running: ls -la %s (expecting failure)", cnvDropInDir) + _, err = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "ls", "-la", cnvDropInDir) + o.Expect(err).To(o.HaveOccurred(), "Directory should not exist after deletion") + framework.Logf("Confirmed: Directory does not exist after deletion") + + g.By("Restarting kubelet") + err = restartKubeletOnNode(ctx, oc, cnvWorkerNode) + o.Expect(err).NotTo(o.HaveOccurred()) + + g.By("Waiting for node to be ready") + waitForNodeToBeReady(ctx, oc, cnvWorkerNode) + + g.By("Verifying directory was auto-recreated") + output, err = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "ls", "-la", cnvDropInDir) + o.Expect(err).NotTo(o.HaveOccurred(), "Directory should be auto-recreated after kubelet restart") + framework.Logf("Output:\n%s", output) + + g.By("Verifying kubelet is running") + output, err = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "systemctl", "is-active", "kubelet") + o.Expect(err).NotTo(o.HaveOccurred()) + framework.Logf("kubelet status: %s", strings.TrimSpace(output)) + o.Expect(strings.TrimSpace(output)).To(o.Equal("active")) + + framework.Logf("=== TC6 PASSED ===") + }) + + // TC7: Validate security and permissions of drop-in directory + g.It("TC7: should validate security and permissions of drop-in directory", func(ctx context.Context) { skipOnSingleNodeTopology(oc) //skip this test for SNO // Get a CNV worker node for tests cnvWorkerNode = getCNVWorkerNodeName(ctx, oc) o.Expect(cnvWorkerNode).NotTo(o.BeEmpty(), "No CNV worker nodes available") - framework.Logf("=== TC5: Testing security and permissions of drop-in directory ===") + framework.Logf("=== TC7: Testing security and permissions of drop-in directory ===") framework.Logf("Executing on node: %s", cnvWorkerNode) framework.Logf("Drop-in directory: %s", cnvDropInDir) g.By("Ensuring drop-in directory exists") framework.Logf("Running: mkdir -p %s", cnvDropInDir) - _, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "mkdir", "-p", cnvDropInDir) + _, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "mkdir", "-p", cnvDropInDir) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Directory exists or created successfully") g.By("Verifying directory ownership is root:root") framework.Logf("Running: stat -c %%U:%%G %s", cnvDropInDir) - output, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "stat", "-c", "%U:%G", cnvDropInDir) + output, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "stat", "-c", "%U:%G", cnvDropInDir) o.Expect(err).NotTo(o.HaveOccurred()) ownership := strings.TrimSpace(output) framework.Logf("Directory ownership: %s", ownership) @@ -350,7 +462,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Verifying directory permissions") framework.Logf("Running: stat -c %%a %s", cnvDropInDir) - output, err = ExecOnNodeWithChroot(oc, cnvWorkerNode, "stat", "-c", "%a", cnvDropInDir) + output, err = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "stat", "-c", "%a", cnvDropInDir) o.Expect(err).NotTo(o.HaveOccurred()) perms := strings.TrimSpace(output) framework.Logf("Directory permissions: %s", perms) @@ -358,7 +470,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Checking SELinux context of directory") framework.Logf("Running: ls -ldZ %s", cnvDropInDir) - output, err = ExecOnNodeWithChroot(oc, cnvWorkerNode, "ls", "-ldZ", cnvDropInDir) + output, err = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "ls", "-ldZ", cnvDropInDir) if err == nil { framework.Logf("SELinux context: %s", strings.TrimSpace(output)) } @@ -367,27 +479,27 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr testFile := cnvDropInDir + "/test-permissions.conf" framework.Logf("Creating test file: %s", testFile) framework.Logf("File content:\n%s", loadConfigFromFile(cnvLimitedSwapConfigPath)) - err = createDropInFile(oc, cnvWorkerNode, testFile, loadConfigFromFile(cnvLimitedSwapConfigPath)) + err = createDropInFile(ctx, oc, cnvWorkerNode, testFile, loadConfigFromFile(cnvLimitedSwapConfigPath)) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Test file created successfully") - defer removeDropInFile(oc, cnvWorkerNode, testFile) + defer removeDropInFile(ctx, oc, cnvWorkerNode, testFile) g.By("Verifying config file ownership") framework.Logf("Running: stat -c %%U:%%G %s", testFile) - output, err = ExecOnNodeWithChroot(oc, cnvWorkerNode, "stat", "-c", "%U:%G", testFile) + output, err = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "stat", "-c", "%U:%G", testFile) o.Expect(err).NotTo(o.HaveOccurred()) fileOwnership := strings.TrimSpace(output) framework.Logf("File ownership: %s", fileOwnership) g.By("Verifying config file permissions (should be 644 or 600)") framework.Logf("Running: stat -c %%a %s", testFile) - output, err = ExecOnNodeWithChroot(oc, cnvWorkerNode, "stat", "-c", "%a", testFile) + output, err = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "stat", "-c", "%a", testFile) o.Expect(err).NotTo(o.HaveOccurred()) filePerms := strings.TrimSpace(output) framework.Logf("File permissions: %s", filePerms) o.Expect(filePerms).To(o.Or(o.Equal("644"), o.Equal("600"))) - framework.Logf("=== TC5 PASSED ===") + framework.Logf("=== TC7 PASSED ===") framework.Logf("Security and permissions summary:") framework.Logf("- Directory: %s", cnvDropInDir) framework.Logf("- Directory ownership: %s (expected: root:root)", ownership) @@ -397,31 +509,33 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr framework.Logf("- File permissions: %s (expected: 644/600)", filePerms) }) - // TC6: Validate cluster stability and performance - g.It("TC6: should verify cluster stability with LimitedSwap enabled", func(ctx context.Context) { + // TC8: Validate cluster stability and performance + g.It("TC8: should verify cluster stability with LimitedSwap enabled", func(ctx context.Context) { skipOnSingleNodeTopology(oc) //skip this test for SNO // Get a CNV worker node for tests cnvWorkerNode = getCNVWorkerNodeName(ctx, oc) o.Expect(cnvWorkerNode).NotTo(o.BeEmpty(), "No CNV worker nodes available") - framework.Logf("=== TC6: Testing cluster stability with LimitedSwap enabled ===") + framework.Logf("=== TC8: Testing cluster stability with LimitedSwap enabled ===") framework.Logf("Executing on node: %s", cnvWorkerNode) g.By("Creating LimitedSwap configuration") framework.Logf("Creating drop-in file: %s", cnvDropInFilePath) framework.Logf("Drop-in file content:\n%s", loadConfigFromFile(cnvLimitedSwapConfigPath)) - err := createDropInFile(oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) + err := createDropInFile(ctx, oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Drop-in file created successfully") // Verify file was created - output, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "cat", cnvDropInFilePath) + output, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "cat", cnvDropInFilePath) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Verified drop-in file content:\n%s", output) defer func() { + cleanupCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() g.By("Cleaning up") - cleanupDropInAndRestartKubelet(ctx, oc, cnvWorkerNode, cnvDropInFilePath) + cleanupDropInAndRestartKubelet(cleanupCtx, oc, cnvWorkerNode, cnvDropInFilePath) }() g.By("Restarting kubelet") @@ -473,7 +587,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr } framework.Logf("✅ No memory pressure detected") - framework.Logf("=== TC6 PASSED ===") + framework.Logf("=== TC8 PASSED ===") framework.Logf("Cluster stability verification:") framework.Logf("- Node: %s", cnvWorkerNode) framework.Logf("- swapBehavior: LimitedSwap") @@ -482,9 +596,9 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr framework.Logf("- Stability after 30 seconds: CONFIRMED") }) - // TC7: Validate non-CNV cluster unaffected - g.It("TC7: should verify non-CNV workers have no swap configuration", func(ctx context.Context) { - framework.Logf("=== TC7: Testing non-CNV workers have no swap configuration ===") + // TC9: Validate non-CNV cluster unaffected + g.It("TC9: should verify non-CNV workers have no swap configuration", func(ctx context.Context) { + framework.Logf("=== TC9: Testing non-CNV workers have no swap configuration ===") // Get a CNV worker node and temporarily remove its CNV label cnvWorkerNode = getCNVWorkerNodeName(ctx, oc) @@ -526,13 +640,13 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Checking drop-in directory on non-CNV node") framework.Logf("Running: ls -ld %s on node %s", cnvDropInDir, nonCNVWorkerNode) - output, err = ExecOnNodeWithChroot(oc, nonCNVWorkerNode, "ls", "-ld", cnvDropInDir) + output, err = ExecOnNodeWithChroot(ctx, oc, nonCNVWorkerNode, "ls", "-ld", cnvDropInDir) if err == nil { framework.Logf("Drop-in directory exists: %s", strings.TrimSpace(output)) framework.Logf("Note: Directory exists because CNV was previously installed on this node") g.By("Checking directory contents") framework.Logf("Running: ls -la %s", cnvDropInDir) - dirOutput, _ := ExecOnNodeWithChroot(oc, nonCNVWorkerNode, "ls", "-la", cnvDropInDir) + dirOutput, _ := ExecOnNodeWithChroot(ctx, oc, nonCNVWorkerNode, "ls", "-la", cnvDropInDir) framework.Logf("Directory contents:\n%s", dirOutput) } else { framework.Logf("Drop-in directory does not exist on non-CNV node (expected for truly non-CNV nodes)") @@ -547,21 +661,21 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr o.Expect(config.MemorySwap.SwapBehavior).To(o.Or(o.BeEmpty(), o.Equal("NoSwap")), "swapBehavior should be empty or NoSwap on non-CNV node") - framework.Logf("=== TC7 PASSED ===") + framework.Logf("=== TC9 PASSED ===") framework.Logf("Non-CNV worker verification:") framework.Logf("- Node: %s", nonCNVWorkerNode) framework.Logf("- CNV label removed: YES") framework.Logf("- swapBehavior: %s (NoSwap/default)", config.MemorySwap.SwapBehavior) }) - // TC8: Validate behavior with multiple conflicting drop-in files - g.It("TC8: should apply correct precedence with multiple files", func(ctx context.Context) { + // TC10: Validate behavior with multiple conflicting drop-in files + g.It("TC10: should apply correct precedence with multiple files", func(ctx context.Context) { skipOnSingleNodeTopology(oc) //skip this test for SNO // Get a CNV worker node for tests cnvWorkerNode = getCNVWorkerNodeName(ctx, oc) o.Expect(cnvWorkerNode).NotTo(o.BeEmpty(), "No CNV worker nodes available") - framework.Logf("=== TC8: Testing file precedence with multiple drop-in files ===") + framework.Logf("=== TC10: Testing file precedence with multiple drop-in files ===") framework.Logf("Executing on node: %s", cnvWorkerNode) framework.Logf("Drop-in directory: %s", cnvDropInDir) @@ -571,31 +685,37 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Creating 98-swap-disabled.conf with NoSwap") framework.Logf("Creating file: %s", file98) framework.Logf("Content:\n%s", loadConfigFromFile(cnvNoSwapConfigPath)) - err := createDropInFile(oc, cnvWorkerNode, file98, loadConfigFromFile(cnvNoSwapConfigPath)) + err := createDropInFile(ctx, oc, cnvWorkerNode, file98, loadConfigFromFile(cnvNoSwapConfigPath)) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Created: %s (NoSwap)", file98) g.By("Creating 99-swap-limited.conf with LimitedSwap") framework.Logf("Creating file: %s", file99) framework.Logf("Content:\n%s", loadConfigFromFile(cnvLimitedSwapConfigPath)) - err = createDropInFile(oc, cnvWorkerNode, file99, loadConfigFromFile(cnvLimitedSwapConfigPath)) + err = createDropInFile(ctx, oc, cnvWorkerNode, file99, loadConfigFromFile(cnvLimitedSwapConfigPath)) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Created: %s (LimitedSwap)", file99) g.By("Listing drop-in directory contents") framework.Logf("Running: ls -la %s", cnvDropInDir) - output, _ := ExecOnNodeWithChroot(oc, cnvWorkerNode, "ls", "-la", cnvDropInDir) + output, _ := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "ls", "-la", cnvDropInDir) framework.Logf("Directory contents:\n%s", output) defer func() { + cleanupCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() g.By("Cleaning up multiple config files") framework.Logf("Removing: %s", file98) - removeDropInFile(oc, cnvWorkerNode, file98) + if err := removeDropInFile(cleanupCtx, oc, cnvWorkerNode, file98); err != nil { + framework.Logf("Warning: failed to remove %s: %v", file98, err) + } framework.Logf("Removing: %s", file99) - removeDropInFile(oc, cnvWorkerNode, file99) + if err := removeDropInFile(cleanupCtx, oc, cnvWorkerNode, file99); err != nil { + framework.Logf("Warning: failed to remove %s: %v", file99, err) + } framework.Logf("Running: systemctl restart kubelet") - restartKubeletOnNode(ctx, oc, cnvWorkerNode) - waitForNodeToBeReady(ctx, oc, cnvWorkerNode) + restartKubeletOnNode(cleanupCtx, oc, cnvWorkerNode) + waitForNodeToBeReady(cleanupCtx, oc, cnvWorkerNode) framework.Logf("Cleanup completed") }() @@ -615,7 +735,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr o.Expect(config.MemorySwap.SwapBehavior).To(o.Equal("LimitedSwap"), "99-* file should take precedence over 98-* file") - framework.Logf("=== TC8 PASSED ===") + framework.Logf("=== TC10 PASSED ===") framework.Logf("File precedence verification:") framework.Logf("- File 1: 98-swap-disabled.conf (NoSwap)") framework.Logf("- File 2: 99-swap-limited.conf (LimitedSwap)") @@ -623,10 +743,10 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr framework.Logf("- 99-* file correctly overrides 98-* file (lexicographic order)") }) - // TC9: Validate multi-node consistency and synchronization with checksum verification - g.It("TC9: should maintain consistent configuration with checksum verification across CNV nodes", func(ctx context.Context) { + // TC11: Validate multi-node consistency and synchronization with checksum verification + g.It("TC11: should maintain consistent configuration with checksum verification across CNV nodes", func(ctx context.Context) { skipOnSingleNodeTopology(oc) //skip this test for SNO - framework.Logf("=== TC9: Testing multi-node consistency with checksum verification ===") + framework.Logf("=== TC11: Testing multi-node consistency with checksum verification ===") g.By("Getting all CNV worker nodes") // Get nodes with both worker role and CNV schedulable label @@ -655,22 +775,26 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr framework.Logf("Content:\n%s", loadConfigFromFile(cnvLimitedSwapConfigPath)) for _, node := range cnvNodes { framework.Logf("Creating drop-in file on node: %s", node) - err := createDropInFile(oc, node, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) + err := createDropInFile(ctx, oc, node, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf(" -> Created successfully on %s", node) } defer func() { + cleanupCtx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + defer cancel() g.By("Cleaning up all CNV nodes") for _, node := range cnvNodes { framework.Logf("Removing drop-in file from node: %s", node) - removeDropInFile(oc, node, cnvDropInFilePath) + if err := removeDropInFile(cleanupCtx, oc, node, cnvDropInFilePath); err != nil { + framework.Logf("Warning: failed to remove drop-in from %s: %v", node, err) + } framework.Logf("Restarting kubelet on node: %s", node) - restartKubeletOnNode(ctx, oc, node) + restartKubeletOnNode(cleanupCtx, oc, node) } for _, node := range cnvNodes { framework.Logf("Waiting for node %s to be ready...", node) - waitForNodeToBeReady(ctx, oc, node) + waitForNodeToBeReady(cleanupCtx, oc, node) } framework.Logf("Cleanup completed on all %d CNV nodes", len(cnvNodes)) }() @@ -679,7 +803,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr checksums := make(map[string]string) for _, node := range cnvNodes { framework.Logf("Running: md5sum %s on node %s", cnvDropInFilePath, node) - output, err := ExecOnNodeWithChroot(oc, node, "md5sum", cnvDropInFilePath) + output, err := ExecOnNodeWithChroot(ctx, oc, node, "md5sum", cnvDropInFilePath) o.Expect(err).NotTo(o.HaveOccurred()) // Extract checksum (first field) checksum := strings.Fields(strings.TrimSpace(output))[0] @@ -740,7 +864,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr driftDetected := false for _, node := range cnvNodes { framework.Logf("Running: md5sum %s on node %s (after wait)", cnvDropInFilePath, node) - output, err := ExecOnNodeWithChroot(oc, node, "md5sum", cnvDropInFilePath) + output, err := ExecOnNodeWithChroot(ctx, oc, node, "md5sum", cnvDropInFilePath) o.Expect(err).NotTo(o.HaveOccurred()) checksum := strings.Fields(strings.TrimSpace(output))[0] framework.Logf("Checksum for %s (after wait): %s", node, checksum) @@ -762,7 +886,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr "Node %s should still have LimitedSwap after wait", node) } - framework.Logf("=== TC9 PASSED ===") + framework.Logf("=== TC11 PASSED ===") framework.Logf("Multi-node consistency verification:") framework.Logf("- Total CNV nodes: %d", len(cnvNodes)) framework.Logf("- Configuration checksum: %s (identical across all nodes)", referenceChecksum) @@ -771,11 +895,11 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr framework.Logf("- All nodes remain Ready: YES") }) - // TC10: Validate LimitedSwap config when OS-level swap is not enabled + // TC12: Validate LimitedSwap config when OS-level swap is not enabled // This test verifies kubelet gracefully handles LimitedSwap config even without OS swap - g.It("TC10: should handle LimitedSwap config gracefully when OS swap is disabled", func(ctx context.Context) { + g.It("TC12: should handle LimitedSwap config gracefully when OS swap is disabled", func(ctx context.Context) { skipOnSingleNodeTopology(oc) //skip this test for SNO - framework.Logf("=== TC10: Testing LimitedSwap config when OS swap is disabled ===") + framework.Logf("=== TC12: Testing LimitedSwap config when OS swap is disabled ===") // Get a CNV worker node for tests cnvWorkerNode = getCNVWorkerNodeName(ctx, oc) @@ -784,7 +908,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Checking initial OS-level swap status") framework.Logf("Running: swapon -s") - initialSwapOutput, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "swapon", "-s") + initialSwapOutput, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "swapon", "-s") o.Expect(err).NotTo(o.HaveOccurred(), "Failed to check initial swap status on node %s: %v", cnvWorkerNode, err) framework.Logf("Initial swapon -s output:\n%s", initialSwapOutput) initialHasSwap := strings.TrimSpace(initialSwapOutput) != "" && initialSwapOutput != "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority" @@ -793,7 +917,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr if initialHasSwap { g.By("Disabling existing OS-level swap for test") framework.Logf("Running: swapoff -a") - swapoffOutput, swapoffErr := ExecOnNodeWithNsenter(oc, cnvWorkerNode, "swapoff", "-a") + swapoffOutput, swapoffErr := ExecOnNodeWithNsenter(ctx, oc, cnvWorkerNode, "swapoff", "-a") if swapoffErr != nil { framework.Failf("Failed to disable swap on node %s: %v (output: %s)", cnvWorkerNode, swapoffErr, swapoffOutput) } @@ -802,7 +926,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Verifying no OS-level swap is present") framework.Logf("Running: swapon -s") - swapOutput, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "swapon", "-s") + swapOutput, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "swapon", "-s") o.Expect(err).NotTo(o.HaveOccurred(), "Failed to verify swap status on node %s: %v", cnvWorkerNode, err) framework.Logf("swapon -s output:\n%s", swapOutput) hasOSSwap := strings.TrimSpace(swapOutput) != "" && swapOutput != "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority" @@ -818,26 +942,29 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Ensuring drop-in directory exists") framework.Logf("Running: mkdir -p %s", cnvDropInDir) - _, _ = ExecOnNodeWithChroot(oc, cnvWorkerNode, "mkdir", "-p", cnvDropInDir) + _, _ = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "mkdir", "-p", cnvDropInDir) g.By("Creating LimitedSwap drop-in configuration") framework.Logf("Creating drop-in file: %s", cnvDropInFilePath) framework.Logf("Content:\n%s", loadConfigFromFile(cnvLimitedSwapConfigPath)) - err = createDropInFile(oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) + err = createDropInFile(ctx, oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Drop-in file created successfully") defer func() { + cleanupCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() g.By("Cleaning up") framework.Logf("Removing drop-in file: %s", cnvDropInFilePath) - removeDropInFile(oc, cnvWorkerNode, cnvDropInFilePath) - // Re-enable swap if it was initially present + if err := removeDropInFile(cleanupCtx, oc, cnvWorkerNode, cnvDropInFilePath); err != nil { + framework.Logf("Warning: failed to remove drop-in: %v", err) + } if initialHasSwap { framework.Logf("Note: OS swap was initially enabled, may need manual re-enable") } framework.Logf("Restarting kubelet on node: %s", cnvWorkerNode) - restartKubeletOnNode(ctx, oc, cnvWorkerNode) - waitForNodeToBeReady(ctx, oc, cnvWorkerNode) + restartKubeletOnNode(cleanupCtx, oc, cnvWorkerNode) + waitForNodeToBeReady(cleanupCtx, oc, cnvWorkerNode) }() g.By("Restarting kubelet with LimitedSwap config but no OS swap") @@ -902,7 +1029,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Verifying /proc/meminfo shows swap fields (even if 0)") framework.Logf("Running: grep -i swap /proc/meminfo") - meminfoOutput, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "grep", "-i", "swap", "/proc/meminfo") + meminfoOutput, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "grep", "-i", "swap", "/proc/meminfo") o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Swap info from /proc/meminfo:\n%s", strings.TrimSpace(meminfoOutput)) o.Expect(meminfoOutput).To(o.ContainSubstring("SwapTotal")) @@ -910,7 +1037,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Verifying free -h shows swap status") framework.Logf("Running: free -h") - freeOutput, _ := ExecOnNodeWithChroot(oc, cnvWorkerNode, "free", "-h") + freeOutput, _ := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "free", "-h") framework.Logf("free -h output:\n%s", freeOutput) g.By("Verifying node has no memory pressure conditions") @@ -922,7 +1049,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr } } - framework.Logf("=== TC10 PASSED ===") + framework.Logf("=== TC12 PASSED ===") framework.Logf("LimitedSwap config without OS swap verification:") framework.Logf("- Node: %s", cnvWorkerNode) framework.Logf("- OS swap: disabled/not present") @@ -933,16 +1060,16 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr framework.Logf("- Kubelet handles LimitedSwap gracefully even without OS swap") }) - // TC11: Validate behavior with various swap sizes + // TC13: Validate behavior with various swap sizes // This test creates temporary swap files on the node for testing different sizes // It requires sufficient disk space and may take longer to complete - g.It("TC11: should work correctly with various swap sizes", func(ctx context.Context) { + g.It("TC13: should work correctly with various swap sizes", func(ctx context.Context) { skipOnSingleNodeTopology(oc) //skip this test for SNO // Get a CNV worker node for tests cnvWorkerNode = getCNVWorkerNodeName(ctx, oc) o.Expect(cnvWorkerNode).NotTo(o.BeEmpty(), "No CNV worker nodes available") - framework.Logf("=== TC11: Testing LimitedSwap with various swap sizes ===") + framework.Logf("=== TC13: Testing LimitedSwap with various swap sizes ===") framework.Logf("Executing on node: %s", cnvWorkerNode) // Define swap sizes to test (in MB) @@ -960,22 +1087,28 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Creating LimitedSwap drop-in configuration") framework.Logf("Creating drop-in file: %s", cnvDropInFilePath) framework.Logf("Content:\n%s", loadConfigFromFile(cnvLimitedSwapConfigPath)) - err := createDropInFile(oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) + err := createDropInFile(ctx, oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Drop-in file created successfully") defer func() { + cleanupCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() g.By("Final cleanup") - // Disable and remove any test swap file framework.Logf("Disabling test swap file if present") - ExecOnNodeWithNsenter(oc, cnvWorkerNode, "swapoff", swapFilePath) - ExecOnNodeWithChroot(oc, cnvWorkerNode, "rm", "-f", swapFilePath) - // Remove drop-in config + if _, err := ExecOnNodeWithNsenter(cleanupCtx, oc, cnvWorkerNode, "swapoff", swapFilePath); err != nil { + framework.Logf("Warning: failed to disable swap on %s: %v", cnvWorkerNode, err) + } + if _, err := ExecOnNodeWithChroot(cleanupCtx, oc, cnvWorkerNode, "rm", "-f", swapFilePath); err != nil { + framework.Logf("Warning: failed to remove swap file %s on %s: %v", swapFilePath, cnvWorkerNode, err) + } framework.Logf("Removing drop-in file: %s", cnvDropInFilePath) - removeDropInFile(oc, cnvWorkerNode, cnvDropInFilePath) + if err := removeDropInFile(cleanupCtx, oc, cnvWorkerNode, cnvDropInFilePath); err != nil { + framework.Logf("Warning: failed to remove drop-in %s on %s: %v", cnvDropInFilePath, cnvWorkerNode, err) + } framework.Logf("Restarting kubelet") - restartKubeletOnNode(ctx, oc, cnvWorkerNode) - waitForNodeToBeReady(ctx, oc, cnvWorkerNode) + restartKubeletOnNode(cleanupCtx, oc, cnvWorkerNode) + waitForNodeToBeReady(cleanupCtx, oc, cnvWorkerNode) framework.Logf("Final cleanup completed") }() @@ -999,19 +1132,19 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By(fmt.Sprintf("Disabling any existing swap for %s test", swapSize.name)) framework.Logf("Running: swapoff -a on node %s", cnvWorkerNode) - swapoffOutput, swapoffErr := ExecOnNodeWithNsenter(oc, cnvWorkerNode, "swapoff", "-a") + swapoffOutput, swapoffErr := ExecOnNodeWithNsenter(ctx, oc, cnvWorkerNode, "swapoff", "-a") if swapoffErr != nil { framework.Failf("Failed to disable swap on node %s for %s test: %v (output: %s)", cnvWorkerNode, swapSize.name, swapoffErr, swapoffOutput) } framework.Logf("Running: rm -f %s on node %s", swapFilePath, cnvWorkerNode) - rmOutput, rmErr := ExecOnNodeWithChroot(oc, cnvWorkerNode, "rm", "-f", swapFilePath) + rmOutput, rmErr := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "rm", "-f", swapFilePath) if rmErr != nil { framework.Failf("Failed to remove swap file %s on node %s for %s test: %v (output: %s)", swapFilePath, cnvWorkerNode, swapSize.name, rmErr, rmOutput) } g.By(fmt.Sprintf("Creating %dMB swap file", swapSize.sizeMB)) framework.Logf("Running: dd if=/dev/zero of=%s bs=1M count=%d", swapFilePath, swapSize.sizeMB) - _, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "dd", "if=/dev/zero", fmt.Sprintf("of=%s", swapFilePath), + _, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "dd", "if=/dev/zero", fmt.Sprintf("of=%s", swapFilePath), "bs=1M", fmt.Sprintf("count=%d", swapSize.sizeMB)) if err != nil { framework.Logf("Warning: Failed to create swap file: %v", err) @@ -1021,10 +1154,10 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr } framework.Logf("Running: chmod 600 %s", swapFilePath) - ExecOnNodeWithChroot(oc, cnvWorkerNode, "chmod", "600", swapFilePath) + ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "chmod", "600", swapFilePath) framework.Logf("Running: mkswap %s", swapFilePath) - _, err = ExecOnNodeWithChroot(oc, cnvWorkerNode, "mkswap", swapFilePath) + _, err = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "mkswap", swapFilePath) if err != nil { framework.Logf("Warning: Failed to mkswap: %v", err) result.success = false @@ -1033,7 +1166,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr } framework.Logf("Running: swapon %s", swapFilePath) - _, err = ExecOnNodeWithNsenter(oc, cnvWorkerNode, "swapon", swapFilePath) + _, err = ExecOnNodeWithNsenter(ctx, oc, cnvWorkerNode, "swapon", swapFilePath) if err != nil { framework.Logf("Warning: Failed to enable swap: %v", err) result.success = false @@ -1061,11 +1194,11 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By(fmt.Sprintf("Verifying swap metrics with %s swap", swapSize.name)) framework.Logf("Running: swapon -s") - swapOutput, _ := ExecOnNodeWithChroot(oc, cnvWorkerNode, "swapon", "-s") + swapOutput, _ := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "swapon", "-s") framework.Logf("swapon -s output:\n%s", swapOutput) framework.Logf("Running: grep -i swap /proc/meminfo") - meminfoOutput, _ := ExecOnNodeWithChroot(oc, cnvWorkerNode, "grep", "-i", "swap", "/proc/meminfo") + meminfoOutput, _ := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "grep", "-i", "swap", "/proc/meminfo") framework.Logf("Swap info from /proc/meminfo:\n%s", strings.TrimSpace(meminfoOutput)) // Parse SwapTotal @@ -1079,7 +1212,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr } framework.Logf("Running: free -h") - freeOutput, _ := ExecOnNodeWithChroot(oc, cnvWorkerNode, "free", "-h") + freeOutput, _ := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "free", "-h") framework.Logf("free -h output:\n%s", freeOutput) // Verify swap size is approximately what we configured (within 10%) @@ -1102,7 +1235,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr framework.Logf("--- %s swap (%dMB) test PASSED ---", swapSize.name, swapSize.sizeMB) } - framework.Logf("=== TC11 PASSED ===") + framework.Logf("=== TC13 PASSED ===") framework.Logf("Swap size verification results:") for _, r := range results { framework.Logf("- %s (%dMB): Success=%v, SwapTotal=%dMB, NodeReady=%v, ConfigOK=%v", @@ -1111,14 +1244,14 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr framework.Logf("LimitedSwap works correctly with all tested swap sizes") }) - // TC12: Validate swap metrics and observability via Prometheus - g.It("TC12: should expose swap metrics correctly via Prometheus", func(ctx context.Context) { + // TC14: Validate swap metrics and observability via Prometheus + g.It("TC14: should expose swap metrics correctly via Prometheus", func(ctx context.Context) { skipOnSingleNodeTopology(oc) //skip this test for SNO // Get a CNV worker node for tests cnvWorkerNode = getCNVWorkerNodeName(ctx, oc) o.Expect(cnvWorkerNode).NotTo(o.BeEmpty(), "No CNV worker nodes available") - framework.Logf("=== TC12: Testing swap metrics and observability via Prometheus ===") + framework.Logf("=== TC14: Testing swap metrics and observability via Prometheus ===") framework.Logf("Executing on node: %s", cnvWorkerNode) swapFilePath := "/var/swapfile" @@ -1127,7 +1260,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Checking OS-level swap status") framework.Logf("Running: swapon -s") - swapOutput, _ := ExecOnNodeWithChroot(oc, cnvWorkerNode, "swapon", "-s") + swapOutput, _ := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "swapon", "-s") framework.Logf("swapon -s output:\n%s", swapOutput) hasOSSwap := strings.TrimSpace(swapOutput) != "" && swapOutput != "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority" @@ -1138,49 +1271,55 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By(fmt.Sprintf("Creating %dMB swap file at %s", swapSizeMB, swapFilePath)) framework.Logf("Running: dd if=/dev/zero of=%s bs=1M count=%d", swapFilePath, swapSizeMB) - ddOutput, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "dd", "if=/dev/zero", fmt.Sprintf("of=%s", swapFilePath), "bs=1M", fmt.Sprintf("count=%d", swapSizeMB)) + ddOutput, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "dd", "if=/dev/zero", fmt.Sprintf("of=%s", swapFilePath), "bs=1M", fmt.Sprintf("count=%d", swapSizeMB)) if err != nil { framework.Logf("Warning: dd command returned error (may still have succeeded): %v", err) } framework.Logf("dd output: %s", ddOutput) framework.Logf("Running: chmod 600 %s", swapFilePath) - _, err = ExecOnNodeWithChroot(oc, cnvWorkerNode, "chmod", "600", swapFilePath) + _, err = ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "chmod", "600", swapFilePath) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Running: mkswap %s", swapFilePath) - mkswapOutput, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "mkswap", swapFilePath) + mkswapOutput, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "mkswap", swapFilePath) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("mkswap output: %s", mkswapOutput) g.By("Enabling swap") framework.Logf("Running: swapon %s", swapFilePath) - _, err = ExecOnNodeWithNsenter(oc, cnvWorkerNode, "swapon", swapFilePath) + _, err = ExecOnNodeWithNsenter(ctx, oc, cnvWorkerNode, "swapon", swapFilePath) o.Expect(err).NotTo(o.HaveOccurred()) swapCreated = true // Verify swap is now enabled framework.Logf("Verifying swap is enabled...") - swapVerify, _ := ExecOnNodeWithChroot(oc, cnvWorkerNode, "swapon", "-s") + swapVerify, _ := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "swapon", "-s") framework.Logf("swapon -s after enabling:\n%s", swapVerify) hasOSSwap = true } defer func() { + cleanupCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() g.By("Cleaning up swap file and drop-in configuration") if swapCreated { framework.Logf("Disabling swap: swapoff %s", swapFilePath) - ExecOnNodeWithNsenter(oc, cnvWorkerNode, "swapoff", swapFilePath) + if _, err := ExecOnNodeWithNsenter(cleanupCtx, oc, cnvWorkerNode, "swapoff", swapFilePath); err != nil { + framework.Logf("Warning: failed to disable swap: %v", err) + } framework.Logf("Removing swap file: rm -f %s", swapFilePath) - ExecOnNodeWithChroot(oc, cnvWorkerNode, "rm", "-f", swapFilePath) + if _, err := ExecOnNodeWithChroot(cleanupCtx, oc, cnvWorkerNode, "rm", "-f", swapFilePath); err != nil { + framework.Logf("Warning: failed to remove swap file: %v", err) + } } - cleanupDropInAndRestartKubelet(ctx, oc, cnvWorkerNode, cnvDropInFilePath) + cleanupDropInAndRestartKubelet(cleanupCtx, oc, cnvWorkerNode, cnvDropInFilePath) }() g.By("Creating LimitedSwap configuration") framework.Logf("Creating drop-in file: %s", cnvDropInFilePath) framework.Logf("Content:\n%s", loadConfigFromFile(cnvLimitedSwapConfigPath)) - err := createDropInFile(oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) + err := createDropInFile(ctx, oc, cnvWorkerNode, cnvDropInFilePath, loadConfigFromFile(cnvLimitedSwapConfigPath)) o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Drop-in file created successfully") @@ -1200,7 +1339,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Getting swap metrics from /proc/meminfo (baseline)") framework.Logf("Running: grep -i swap /proc/meminfo") - meminfoOutput, err := ExecOnNodeWithChroot(oc, cnvWorkerNode, "grep", "-i", "swap", "/proc/meminfo") + meminfoOutput, err := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "grep", "-i", "swap", "/proc/meminfo") o.Expect(err).NotTo(o.HaveOccurred()) framework.Logf("Swap metrics from /proc/meminfo:\n%s", strings.TrimSpace(meminfoOutput)) @@ -1219,7 +1358,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr g.By("Checking free -h output for swap") framework.Logf("Running: free -h") - freeOutput, _ := ExecOnNodeWithChroot(oc, cnvWorkerNode, "free", "-h") + freeOutput, _ := ExecOnNodeWithChroot(ctx, oc, cnvWorkerNode, "free", "-h") framework.Logf("free -h output:\n%s", freeOutput) g.By("Querying Prometheus for node swap metrics") @@ -1330,7 +1469,7 @@ var _ = g.Describe("[Jira:Node/Kubelet][sig-node][Feature:NodeSwap][Serial][Disr } else if hasOSSwap { osSwapStatus = "enabled (pre-existing)" } - framework.Logf("=== TC12 PASSED ===") + framework.Logf("=== TC14 PASSED ===") framework.Logf("Swap metrics and observability verification:") framework.Logf("- Node: %s", cnvWorkerNode) framework.Logf("- OS swap: %s", osSwapStatus) diff --git a/test/extended/node/node_utils.go b/test/extended/node/node_utils.go index b4060e7e5b36..9cca6e545033 100644 --- a/test/extended/node/node_utils.go +++ b/test/extended/node/node_utils.go @@ -156,34 +156,69 @@ func getCNVWorkerNodeName(ctx context.Context, oc *exutil.CLI) string { return nodes[rand.Intn(len(nodes))].Name } -// ExecOnNodeWithChroot runs a command on a node using oc debug with chroot /host -func ExecOnNodeWithChroot(oc *exutil.CLI, nodeName string, cmd ...string) (string, error) { +func execOnNodeWithDebug(ctx context.Context, oc *exutil.CLI, nodeName string, timeout time.Duration, args []string) (string, error) { + timeoutCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + execCmd, stdOutBuf, stdErrBuf, err := oc.AsAdmin().WithoutNamespace().Run("debug").Args(args...).Background() + if err != nil { + return "", err + } + + type result struct { + err error + } + resultCh := make(chan result, 1) + + go func() { + resultCh <- result{err: execCmd.Wait()} + }() + + select { + case res := <-resultCh: + stdOut := strings.TrimSpace(stdOutBuf.String()) + stdErr := strings.TrimSpace(stdErrBuf.String()) + if res.err != nil { + return stdOut, fmt.Errorf("oc debug failed: %w\nStdErr: %s", res.err, stdErr) + } + return stdOut, nil + case <-timeoutCtx.Done(): + var killErr error + if execCmd.Process != nil { + killErr = execCmd.Process.Kill() + } + if ctx.Err() != nil { + return "", fmt.Errorf("oc debug command canceled on node %s: %w", nodeName, ctx.Err()) + } + if killErr != nil { + return "", fmt.Errorf("oc debug command timed out after %v on node %s; failed to stop debug process: %w", timeout, nodeName, killErr) + } + return "", fmt.Errorf("oc debug command timed out after %v on node %s (cleanup likely hung)", timeout, nodeName) + } +} + +func ExecOnNodeWithChroot(ctx context.Context, oc *exutil.CLI, nodeName string, cmd ...string) (string, error) { args := append([]string{"node/" + nodeName, "-n" + DebugNamespace, "--", "chroot", "/host"}, cmd...) - stdOut, _, err := oc.AsAdmin().WithoutNamespace().Run("debug").Args(args...).Outputs() - return stdOut, err + return execOnNodeWithDebug(ctx, oc, nodeName, 2*time.Minute, args) } -// ExecOnNodeWithNsenter runs a command on a node using nsenter to access host namespaces -// This is needed for swap operations (swapon/swapoff) that require direct namespace access -func ExecOnNodeWithNsenter(oc *exutil.CLI, nodeName string, cmd ...string) (string, error) { +func ExecOnNodeWithNsenter(ctx context.Context, oc *exutil.CLI, nodeName string, cmd ...string) (string, error) { nsenterCmd := append([]string{"nsenter", "-a", "-t", "1"}, cmd...) args := append([]string{"node/" + nodeName, "-n" + DebugNamespace, "--"}, nsenterCmd...) - stdOut, _, err := oc.AsAdmin().WithoutNamespace().Run("debug").Args(args...).Outputs() - return stdOut, err + return execOnNodeWithDebug(ctx, oc, nodeName, 2*time.Minute, args) } // createDropInFile creates a drop-in configuration file on the specified node -func createDropInFile(oc *exutil.CLI, nodeName, filePath, content string) error { - // Escape content for shell +func createDropInFile(ctx context.Context, oc *exutil.CLI, nodeName, filePath, content string) error { escapedContent := strings.ReplaceAll(content, "'", "'\\''") cmd := fmt.Sprintf("echo '%s' > %s && chmod 644 %s", escapedContent, filePath, filePath) - _, err := ExecOnNodeWithChroot(oc, nodeName, "sh", "-c", cmd) + _, err := ExecOnNodeWithChroot(ctx, oc, nodeName, "sh", "-c", cmd) return err } // removeDropInFile removes a drop-in configuration file from the specified node -func removeDropInFile(oc *exutil.CLI, nodeName, filePath string) error { - _, err := ExecOnNodeWithChroot(oc, nodeName, "rm", "-f", filePath) +func removeDropInFile(ctx context.Context, oc *exutil.CLI, nodeName, filePath string) error { + _, err := ExecOnNodeWithChroot(ctx, oc, nodeName, "rm", "-f", filePath) return err } @@ -202,7 +237,7 @@ func restartKubeletOnNode(ctx context.Context, oc *exutil.CLI, nodeName string) const maxAttempts = 3 var lastErr error for attempt := 0; attempt < maxAttempts; attempt++ { - _, err := ExecOnNodeWithChroot(oc, nodeName, "systemctl", "restart", "kubelet") + _, err := ExecOnNodeWithChroot(ctx, oc, nodeName, "systemctl", "restart", "kubelet") if err == nil { return nil } @@ -271,7 +306,7 @@ func isNodeInReadyState(node *corev1.Node) bool { // cleanupDropInAndRestartKubelet removes the drop-in file and restarts kubelet func cleanupDropInAndRestartKubelet(ctx context.Context, oc *exutil.CLI, nodeName, filePath string) { framework.Logf("Removing drop-in file: %s", filePath) - removeDropInFile(oc, nodeName, filePath) + removeDropInFile(ctx, oc, nodeName, filePath) framework.Logf("Restarting kubelet on node: %s", nodeName) restartKubeletOnNode(ctx, oc, nodeName) framework.Logf("Waiting for node to be ready...") @@ -442,7 +477,7 @@ func installCNVOperator(ctx context.Context, oc *exutil.CLI) error { return fmt.Errorf("failed to create MC client for MCP check: %w", err) } - err = waitForMCP(ctx, mcClient, "worker", 30*time.Minute) + err = waitForMCP(ctx, mcClient, "worker", 15*time.Minute) if err != nil { return fmt.Errorf("MCP rollout failed after CNV installation: %w", err) } @@ -485,7 +520,7 @@ func waitForCNVOperatorReady(ctx context.Context, oc *exutil.CLI) error { func waitForHyperConvergedReady(ctx context.Context, oc *exutil.CLI) error { dynamicClient := oc.AdminDynamicClient() - return wait.PollUntilContextTimeout(ctx, 15*time.Second, 20*time.Minute, true, func(ctx context.Context) (bool, error) { + return wait.PollUntilContextTimeout(ctx, 15*time.Second, 15*time.Minute, true, func(ctx context.Context) (bool, error) { hc, err := dynamicClient.Resource(hyperConvergedGVR).Namespace(cnvNamespace).Get(ctx, cnvHyperConverged, metav1.GetOptions{}) if err != nil { framework.Logf("Error getting HyperConverged: %v", err) @@ -715,7 +750,7 @@ func uninstallCNVOperator(ctx context.Context, oc *exutil.CLI) error { if err != nil { framework.Logf("Warning: failed to create MC client for MCP check: %v", err) } else { - err = waitForMCP(ctx, mcClient, "worker", 30*time.Minute) + err = waitForMCP(ctx, mcClient, "worker", 15*time.Minute) if err != nil { framework.Logf("Warning: MCP rollout check failed: %v", err) } @@ -733,7 +768,7 @@ func ensureDropInDirectoryExists(ctx context.Context, oc *exutil.CLI, dirPath st } for _, node := range nodes { - _, err := ExecOnNodeWithChroot(oc, node.Name, "mkdir", "-p", dirPath) + _, err := ExecOnNodeWithChroot(ctx, oc, node.Name, "mkdir", "-p", dirPath) if err != nil { framework.Logf("Warning: failed to create directory on node %s: %v", node.Name, err) } @@ -781,3 +816,26 @@ func CalculateEventTimeDiff(startEvent, endEvent *corev1.Event) time.Duration { } return endTime.Sub(startTime) } + +func GetNotReadyNodes(ctx context.Context, oc *exutil.CLI) ([]string, error) { + nodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, err + } + + var notReadyNodes []string + for _, node := range nodes.Items { + if !isNodeInReadyState(&node) { + notReadyNodes = append(notReadyNodes, node.Name) + } + } + + return notReadyNodes, nil +} + +func EnsureNodesReady(ctx context.Context, oc *exutil.CLI) { + notReadyNodes, err := GetNotReadyNodes(ctx, oc) + o.Expect(err).NotTo(o.HaveOccurred(), "failed to check node readiness") + o.Expect(notReadyNodes).To(o.BeEmpty(), + "Cannot start test: nodes not Ready: %v. Cluster may be recovering from previous test.", notReadyNodes) +} diff --git a/test/extended/node/system_compressible.go b/test/extended/node/system_compressible.go index 6eb0dcc6351d..1358ec46bb5a 100644 --- a/test/extended/node/system_compressible.go +++ b/test/extended/node/system_compressible.go @@ -34,6 +34,8 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv if isMicroShift { g.Skip("Skipping test on MicroShift cluster") } + + EnsureNodesReady(ctx, oc) }) g.It("should enforce system compressible CPU limit by default", func(ctx context.Context) { @@ -57,7 +59,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv // Read SYSTEM_RESERVED_CPU from /etc/node-sizing.env g.By("Reading SYSTEM_RESERVED_CPU from /etc/node-sizing.env") - nodeSizingOutput, err := ExecOnNodeWithChroot(oc, nodeName, "cat", "/etc/node-sizing.env") + nodeSizingOutput, err := ExecOnNodeWithChroot(ctx, oc, nodeName, "cat", "/etc/node-sizing.env") o.Expect(err).NotTo(o.HaveOccurred(), "Should be able to read /etc/node-sizing.env") framework.Logf("/etc/node-sizing.env contents:\n%s", nodeSizingOutput) @@ -81,7 +83,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv // Check cgroup cpu.weight configuration for system.slice g.By("Verifying system.slice cgroup CPU weight") - actualWeight, err := readCgroupCPUWeight(oc, nodeName, "system.slice") + actualWeight, err := readCgroupCPUWeight(ctx, oc, nodeName, "system.slice") o.Expect(err).NotTo(o.HaveOccurred(), "Should be able to read cpu.weight for system.slice") framework.Logf("system.slice actual cpu.weight: %d", actualWeight) @@ -265,7 +267,7 @@ var _ = g.Describe("[Suite:openshift/disruptive-longrunning][sig-node][Disruptiv // Check cgroup cpu.weight configuration for system.slice g.By("Verifying system.slice cgroup CPU weight when system compressible is disabled") - actualWeight, err := readCgroupCPUWeight(oc, nodeName, "system.slice") + actualWeight, err := readCgroupCPUWeight(ctx, oc, nodeName, "system.slice") o.Expect(err).NotTo(o.HaveOccurred(), "Should be able to read cpu.weight for system.slice") framework.Logf("system.slice actual cpu.weight when disabled: %d", actualWeight) @@ -562,10 +564,10 @@ func selectTestNode(ctx context.Context, oc *exutil.CLI, minCPUs int) (string, i } // readCgroupCPUWeight reads cpu.weight file for a cgroup slice -func readCgroupCPUWeight(oc *exutil.CLI, nodeName, slicePath string) (uint64, error) { +func readCgroupCPUWeight(ctx context.Context, oc *exutil.CLI, nodeName, slicePath string) (uint64, error) { weightPath := fmt.Sprintf("/sys/fs/cgroup/%s/cpu.weight", slicePath) - output, err := ExecOnNodeWithChroot(oc, nodeName, "cat", weightPath) + output, err := ExecOnNodeWithChroot(ctx, oc, nodeName, "cat", weightPath) if err != nil { return 0, fmt.Errorf("failed to read %s: %w", weightPath, err) }