From 4ba04a125d6c1a6f606523460d5b2b9df0dba2b3 Mon Sep 17 00:00:00 2001 From: Isabella Janssen Date: Wed, 27 May 2026 14:46:58 -0400 Subject: [PATCH 1/4] tests: make ImageModeStatusReporting MCN property and condition transition tests resilient on SNO --- test/extended-priv/mco_ocb.go | 6 +- .../machineconfigs/master-extension-mc.yaml | 17 ++ test/extended/image_mode_status_reporting.go | 185 ++++++++++++++---- test/extended/machineconfignode.go | 152 ++++++++++---- test/extended/machineconfigpool.go | 17 +- test/extended/node.go | 4 +- 6 files changed, 294 insertions(+), 87 deletions(-) create mode 100644 test/extended-priv/testdata/files/machineconfigs/master-extension-mc.yaml diff --git a/test/extended-priv/mco_ocb.go b/test/extended-priv/mco_ocb.go index dffb210ad8..46f357fee0 100644 --- a/test/extended-priv/mco_ocb.go +++ b/test/extended-priv/mco_ocb.go @@ -311,7 +311,9 @@ func ValidateSuccessfulMOSC(mosc *MachineOSConfig, checkers []Checker) { logger.Infof("OK!\n") exutil.By("Check that the machine-os-builder is using leader election without failing") - o.Expect(mOSBuilder.Logs()).To(o.And( + o.Eventually(func() (string, error) { + return mOSBuilder.Logs() + }, "5m", "10s").Should(o.And( o.MatchRegexp("(?i)"+regexp.QuoteMeta("attempting to acquire leader lease")), o.MatchRegexp("(?i)"+regexp.QuoteMeta("successfully acquired lease"))), "The machine os builder pod is not using the leader election without failures") @@ -442,7 +444,7 @@ func ValidateMOSCIsGarbageCollected(mosc *MachineOSConfig, mcp *MachineConfigPoo logger.Infof("Validating that machine-os-builder pod was garbage collected") mOSBuilder := NewNamespacedResource(mosc.GetOC().AsAdmin(), "deployment", MachineConfigNamespace, "machine-os-builder") - o.Eventually(mOSBuilder, "2m", "30s").ShouldNot(Exist(), + o.Eventually(mOSBuilder, "5m", "30s").ShouldNot(Exist(), "The machine-os-builder deployment was not removed when the infra pool was unlabeled") logger.Infof("Validating that configmaps were garbage collected") diff --git a/test/extended-priv/testdata/files/machineconfigs/master-extension-mc.yaml b/test/extended-priv/testdata/files/machineconfigs/master-extension-mc.yaml new file mode 100644 index 0000000000..8e7e61c995 --- /dev/null +++ b/test/extended-priv/testdata/files/machineconfigs/master-extension-mc.yaml @@ -0,0 +1,17 @@ +apiVersion: template.openshift.io/v1 +kind: Template +metadata: + name: master-extension-mc +objects: +- apiVersion: machineconfiguration.openshift.io/v1 + kind: MachineConfig + metadata: + labels: + machineconfiguration.openshift.io/role: master + name: 90-master-extension + spec: + config: + ignition: + version: 3.2.0 + extensions: + - usbguard diff --git a/test/extended/image_mode_status_reporting.go b/test/extended/image_mode_status_reporting.go index 0d85ec4fae..593ef98108 100644 --- a/test/extended/image_mode_status_reporting.go +++ b/test/extended/image_mode_status_reporting.go @@ -19,9 +19,10 @@ import ( var ( mcNameToFixtureMap = map[string]string{ - "90-infra-extension": filepath.Join("machineconfigs", "infra-extension-mc.yaml"), - "90-infra-testfile": filepath.Join("machineconfigs", "infra-testfile-mc.yaml"), - "90-master-testfile": filepath.Join("machineconfigs", "master-testfile-mc.yaml"), + "90-infra-extension": filepath.Join("machineconfigs", "infra-extension-mc.yaml"), + "90-infra-testfile": filepath.Join("machineconfigs", "infra-testfile-mc.yaml"), + "90-master-extension": filepath.Join("machineconfigs", "master-extension-mc.yaml"), + "90-master-testfile": filepath.Join("machineconfigs", "master-testfile-mc.yaml"), } nodeDisruptionFixture = filepath.Join("machineconfigurations", "nodedisruptionpolicy-rebootless-path.yaml") nodeDisruptionEmptyFixture = filepath.Join("machineconfigurations", "managedbootimages-empty.yaml") @@ -46,14 +47,15 @@ var _ = g.Describe("[sig-mco][Suite:openshift/machine-config-operator/disruptive machineConfigClient, clientErr := machineconfigclient.NewForConfig(oc.KubeFramework().ClientConfig()) o.Expect(clientErr).NotTo(o.HaveOccurred(), "Error creating client set for test: %s", clientErr) - // Skip this test if there are no machines in the worker MCP, as this will mean we cannot - // create a custom MCP. + // If there are no machines in the worker MCP, we cannot create a custom MCP. In these + // cases, run the test against the default `master` MCP. if !DoesMachineConfigPoolHaveMachines(machineConfigClient, "worker") { - g.Skip(fmt.Sprintf("Skipping this test since the cluster does not have nodes in the `worker` MCP.")) + logger.Infof("Cluster has no `worker` machines, running test against the `master` MCP.") + runImageModeMCNTestDefaultMCP(oc, machineConfigClient, "master", "", false) + } else { + // Run the standard image mode MCN test for a custom MCP named `infra` with no MC to apply + runImageModeMCNTestCustomMCP(oc, machineConfigClient, "infra", "", false) } - - // Run the standard image mode MCN test for a custom MCP named `infra` with no MC to apply - runImageModeMCNTest(oc, machineConfigClient, "infra", "", false) }) g.It("MachineConfigNode conditions should properly transition on an image based update when OCB is enabled in a custom MCP [apigroup:machineconfiguration.openshift.io]", func() { @@ -61,14 +63,15 @@ var _ = g.Describe("[sig-mco][Suite:openshift/machine-config-operator/disruptive machineConfigClient, clientErr := machineconfigclient.NewForConfig(oc.KubeFramework().ClientConfig()) o.Expect(clientErr).NotTo(o.HaveOccurred(), "Error creating client set for test: %s", clientErr) - // Skip this test if there are no machines in the worker MCP, as this will mean we cannot - // create a custom MCP. + // If there are no machines in the worker MCP, we cannot create a custom MCP. In these + // cases, run the test against the default `master` MCP. if !DoesMachineConfigPoolHaveMachines(machineConfigClient, "worker") { - g.Skip(fmt.Sprintf("Skipping this test since the cluster does not have nodes in the `worker` MCP.")) + logger.Infof("Cluster has no `worker` machines, running test against the `master` MCP.") + runImageModeMCNTestDefaultMCP(oc, machineConfigClient, "master", "90-master-extension", true) + } else { + // Run the standard image mode MCN test for a custom MCP named `infra` with no MC to apply + runImageModeMCNTestCustomMCP(oc, machineConfigClient, "infra", "90-infra-extension", true) } - - // Run the standard image mode MCN test for a custom MCP named `infra` with no MC to apply - runImageModeMCNTest(oc, machineConfigClient, "infra", "90-infra-extension", true) }) g.It("MachineConfigNode conditions should properly transition on a non-image based update when OCB is enabled in a custom MCP [apigroup:machineconfiguration.openshift.io]", func() { @@ -76,14 +79,15 @@ var _ = g.Describe("[sig-mco][Suite:openshift/machine-config-operator/disruptive machineConfigClient, clientErr := machineconfigclient.NewForConfig(oc.KubeFramework().ClientConfig()) o.Expect(clientErr).NotTo(o.HaveOccurred(), "Error creating client set for test: %s", clientErr) - // Skip this test if there are no machines in the worker MCP, as this will mean we cannot - // create a custom MCP. + // If there are no machines in the worker MCP, we cannot create a custom MCP. In these + // cases, run the test against the default `master` MCP. if !DoesMachineConfigPoolHaveMachines(machineConfigClient, "worker") { - g.Skip(fmt.Sprintf("Skipping this test since the cluster does not have nodes in the `worker` MCP.")) + logger.Infof("Cluster has no `worker` machines, running test against the `master` MCP.") + runImageModeMCNTestDefaultMCP(oc, machineConfigClient, "master", "90-master-testfile", false) + } else { + // Run the standard image mode MCN test for a custom MCP named `infra` + runImageModeMCNTestCustomMCP(oc, machineConfigClient, "infra", "90-infra-testfile", false) } - - // Run the standard image mode MCN test for a custom MCP named `infra` with no MC to apply - runImageModeMCNTest(oc, machineConfigClient, "infra", "90-infra-testfile", false) }) g.It("MachineConfigPool machine counts should transition correctly on an update in a default MCP [apigroup:machineconfiguration.openshift.io]", func() { @@ -112,10 +116,10 @@ var _ = g.Describe("[sig-mco][Suite:openshift/machine-config-operator/disruptive }) }) -// `runImageModeMCNTest` runs through the general flow of validating the MCN of a node for image -// mode enabled workflows. The steps for the test are as follows: +// `runImageModeMCNTestCustomMCP` runs through the general flow of validating the MCN of a node for +// image mode enabled workflows in a custom MCP. The steps for the test are as follows: // 1. Select a worker node to use throughout the test -// 2. Validate the starting properties of te MCN associated with the test node +// 2. Validate the starting properties of the MCN associated with the test node // 3. Create a custom MCP named the value of `mcpAndMoscName` and add the test node to it // 4. Validate the properties of the MCN associated with the test node // 5. Configure on cluster image mode in the custom MCP & validate the MOSC applied successfully @@ -126,7 +130,7 @@ var _ = g.Describe("[sig-mco][Suite:openshift/machine-config-operator/disruptive // 9. Validate the properties of the MCN associated with the test node // 10. Remove the custom MCP // 11. Validate the properties of the MCN associated with the test node -func runImageModeMCNTest(oc *exutil.CLI, machineConfigClient *machineconfigclient.Clientset, mcpAndMoscName, mcName string, isImageUpdate bool) { +func runImageModeMCNTestCustomMCP(oc *exutil.CLI, machineConfigClient *machineconfigclient.Clientset, mcpAndMoscName, mcName string, isImageUpdate bool) { exutil.By("Select a node to follow in this test") workerNodes, err := GetNodesByRole(oc, "worker") o.Expect(err).NotTo(o.HaveOccurred(), "Error getting worker nodes: %s", err) @@ -149,7 +153,7 @@ func runImageModeMCNTest(oc *exutil.CLI, machineConfigClient *machineconfigclien err = oc.AsAdmin().Run("label").Args(fmt.Sprintf("node/%s", nodeToTestName), fmt.Sprintf("node-role.kubernetes.io/%s=", mcpAndMoscName)).Execute() o.Expect(err).NotTo(o.HaveOccurred(), "Error labeing node `%s` for MCP `%s`: %s", nodeToTestName, mcpAndMoscName, err) // Wait for the new `infra` MCP to be ready - WaitForMCPToBeReady(machineConfigClient, mcpAndMoscName, 1, "") + WaitForMCPToBeReady(machineConfigClient, mcpAndMoscName, 1, "", 5*time.Minute) logger.Infof("OK!\n") exutil.By("Validate node's custom MCP MCN properties") @@ -175,21 +179,27 @@ func runImageModeMCNTest(oc *exutil.CLI, machineConfigClient *machineconfigclien // If an MC has been provided, apply it and validate the MCN conditions transition correctly // throughout the update. if mcName != "" { + // If a rebootless (non-image based) update is desired, apply the NodeDisruptionPolicy to prevent reboots. + if !isImageUpdate { + exutil.By("Applying the NodeDisruptionPolicy") + err = ApplyMachineConfigFixture(oc, nodeDisruptionFixture) + defer ApplyMachineConfigFixture(oc, nodeDisruptionEmptyFixture) + o.Expect(err).NotTo(o.HaveOccurred(), "Error applying the NodeDisruptionPolicy: %s", err) + logger.Infof("OK!\n") + } + exutil.By("Applying the MC") err = ApplyMachineConfigFixture(oc, mcNameToFixtureMap[mcName]) - defer DeleteMCAndWaitForMCPUpdate(oc, machineConfigClient, mcName, mcpAndMoscName) + defer DeleteMCAndWaitForMCPUpdate(oc, machineConfigClient, mcName, mcpAndMoscName, false) o.Expect(err).NotTo(o.HaveOccurred(), "Error applying MC `%s`: %s", mcName, err) logger.Infof("OK!\n") exutil.By("Validating the MCN condition transitions") - validateMCNTransitions(machineConfigClient, nodeToTestName, isImageUpdate) - logger.Infof("OK!\n") - - exutil.By("Removing the MC") - DeleteMCAndWaitForMCPUpdate(oc, machineConfigClient, mcName, mcpAndMoscName) + validateMCNTransitions(oc, machineConfigClient, nodeToTestName, isImageUpdate) logger.Infof("OK!\n") } + // Remove the MOSC before the MC to avoid an unnecessary OCB rebuild. exutil.By("Remove the MachineOSConfig resource") o.Expect(extpriv.DisableOCL(mosc)).To(o.Succeed(), "Error cleaning up MOSC `%s`", mosc) logger.Infof("OK!\n") @@ -198,9 +208,16 @@ func runImageModeMCNTest(oc *exutil.CLI, machineConfigClient *machineconfigclien extpriv.ValidateMOSCIsGarbageCollected(mosc, infraMcp) logger.Infof("OK!\n") + if mcName != "" { + exutil.By("Removing the MC") + DeleteMCAndWaitForMCPUpdate(oc, machineConfigClient, mcName, mcpAndMoscName, false) + logger.Infof("OK!\n") + } + exutil.By("Validate the node in `infra` MCP has correct MCN properties") - err = ValidateMCNForNode(oc, machineConfigClient, nodeToTestName, mcpAndMoscName) - o.Expect(err).NotTo(o.HaveOccurred(), "Error validating MCN for node `%s`: %s", nodeToTestName, err) + o.Eventually(func() error { + return ValidateMCNForNode(oc, machineConfigClient, nodeToTestName, mcpAndMoscName) + }, 1*time.Minute, 5*time.Second).Should(o.Succeed(), "Error validating MCN for node `%s`", nodeToTestName) logger.Infof("OK!\n") exutil.By("Delete the `infra` MCP") @@ -214,13 +231,101 @@ func runImageModeMCNTest(oc *exutil.CLI, machineConfigClient *machineconfigclien logger.Infof("OK!\n") } +// `runImageModeMCNTestDefaultMCP` runs through the general flow of validating the MCN of a node +// for image mode enabled workflows in a default MCP. The steps for the test are as follows: +// 1. Select a node in the desired MCP to follow throughout the test +// 2. Validate the starting properties of the MCN associated with the test node +// 3. Configure on cluster image mode in the desired MCP & validate the MOSC applied successfully +// 4. Validate the properties of the MCN associated with the test node +// 5. If a MachineConfig has been provided, apply it, validate the MCN conditions trasnition +// throughout the update, then remove the MC +// 6. Disable on cluster image mode in the desired MCP & validate the MOSC removal was successful +// 7. Validate the properties of the MCN associated with the test node +func runImageModeMCNTestDefaultMCP(oc *exutil.CLI, machineConfigClient *machineconfigclient.Clientset, mcpAndMoscName, mcName string, isImageUpdate bool) { + exutil.By("Select a node to follow in this test") + mcpNodes, err := GetNodesByRole(oc, mcpAndMoscName) + o.Expect(err).NotTo(o.HaveOccurred(), "Error getting nodes from `%v` MCP: %s", mcpAndMoscName, err) + o.Expect(len(mcpNodes)).To(o.BeNumerically(">=", 1), "Less than one node in desired pool") + nodeToTestName := mcpNodes[0].Name + logger.Infof("Using `%s` as node for test", nodeToTestName) + logger.Infof("OK!\n") + + exutil.By("Validate node's starting MCN properties") + err = ValidateMCNForNode(oc, machineConfigClient, nodeToTestName, mcpAndMoscName) + o.Expect(err).NotTo(o.HaveOccurred(), "Error validating MCN for node `%s`: %s", nodeToTestName, err) + logger.Infof("OK!\n") + + exutil.By("Configure OCB functionality for the desired MCP") + mosc, err := extpriv.CreateMachineOSConfigUsingExternalOrInternalRegistry(oc.AsAdmin(), MachineConfigNamespace, mcpAndMoscName, mcpAndMoscName, nil) + defer extpriv.DisableOCL(mosc) + o.Expect(err).NotTo(o.HaveOccurred(), "Error creating the MachineOSConfig resource: %s", err) + logger.Infof("OK!\n") + + exutil.By("Validating the MOSC applied successfully") + extpriv.ValidateSuccessfulMOSC(mosc, nil) + logger.Infof("OK!\n") + + exutil.By("Validate the test node has correct MCN properties") + err = ValidateMCNForNode(oc, machineConfigClient, nodeToTestName, mcpAndMoscName) + o.Expect(err).NotTo(o.HaveOccurred(), "Error validating MCN for node `%s`: %s", nodeToTestName, err) + logger.Infof("OK!\n") + + // If an MC has been provided, apply it and validate the MCN conditions transition correctly + // throughout the update. + if mcName != "" { + // If a rebootless (non image-based) update is desired, apply the NodeDisruptionPolicy to prevent reboots. + if !isImageUpdate { + exutil.By("Applying the NodeDisruptionPolicy") + err = ApplyMachineConfigFixture(oc, nodeDisruptionFixture) + defer ApplyMachineConfigFixture(oc, nodeDisruptionEmptyFixture) + o.Expect(err).NotTo(o.HaveOccurred(), "Error applying the NodeDisruptionPolicy: %s", err) + logger.Infof("OK!\n") + } + + exutil.By("Applying the MC") + err = ApplyMachineConfigFixture(oc, mcNameToFixtureMap[mcName]) + defer DeleteMCAndWaitForMCPUpdate(oc, machineConfigClient, mcName, mcpAndMoscName, true) + o.Expect(err).NotTo(o.HaveOccurred(), "Error applying MC `%s`: %s", mcName, err) + logger.Infof("OK!\n") + + exutil.By("Validating the MCN condition transitions") + validateMCNTransitions(oc, machineConfigClient, nodeToTestName, isImageUpdate) + logger.Infof("OK!\n") + } + + // Remove the MOSC before the MC to avoid an unnecessary OCB rebuild. Removing the + // MC while image mode is active would trigger a new image build, consuming additional + // disk space that can cause DiskPressure on SNO nodes. By exiting image mode first, + // the subsequent MC deletion is a standard non-layered update. + exutil.By("Remove the MachineOSConfig resource") + o.Expect(extpriv.DisableOCL(mosc)).To(o.Succeed(), "Error cleaning up MOSC `%s`", mosc) + logger.Infof("OK!\n") + + exutil.By("Validating the MOSC was removed successfully") + mcp := extpriv.NewMachineConfigPool(oc.AsAdmin(), mcpAndMoscName) + extpriv.ValidateMOSCIsGarbageCollected(mosc, mcp) + logger.Infof("OK!\n") + + if mcName != "" { + exutil.By("Removing the MC") + DeleteMCAndWaitForMCPUpdate(oc, machineConfigClient, mcName, mcpAndMoscName, true) + logger.Infof("OK!\n") + } + + exutil.By("Validate the test node has correct MCN properties") + o.Eventually(func() error { + return ValidateMCNForNode(oc, machineConfigClient, nodeToTestName, mcpAndMoscName) + }, 1*time.Minute, 5*time.Second).Should(o.Succeed(), "Error validating MCN for node `%s`", nodeToTestName) + logger.Infof("OK!\n") +} + // `validateMCNTransitions` applies a MC, validates that the MCN conditions properly // transition during the update, removes the MC, then validates that the MCN conditions properly // transition during the update. -func validateMCNTransitions(machineConfigClient *machineconfigclient.Clientset, nodeToTestName string, isImageUpdate bool) { +func validateMCNTransitions(oc *exutil.CLI, machineConfigClient *machineconfigclient.Clientset, nodeToTestName string, isImageUpdate bool) { // Validate transition through conditions for MCN exutil.By("Validating the transitions through MCN conditions") - ValidateTransitionThroughConditions(machineConfigClient, nodeToTestName, false, isImageUpdate) + ValidateTransitionThroughConditions(oc, machineConfigClient, nodeToTestName, isImageUpdate) logger.Infof("OK!\n") // When an update is complete, all conditions other than `Updated` must be false @@ -261,7 +366,7 @@ func runMachineCountTest(machineConfigClient *machineconfigclient.Clientset, oc // Apply machine config exutil.By("Applying the MC") err = ApplyMachineConfigFixture(oc, mcNameToFixtureMap[mcName]) - defer DeleteMCAndWaitForMCPUpdate(oc, machineConfigClient, mcName, mcpName) + defer DeleteMCAndWaitForMCPUpdate(oc, machineConfigClient, mcName, mcpName, false) o.Expect(err).NotTo(o.HaveOccurred(), "Error applying MC `%s`: %s", mcName, err) logger.Infof("OK!\n") } else { // Handle the layered MCP case @@ -316,7 +421,9 @@ func validateMCPMachineCountTransitions(machineConfigClient *machineconfigclient interval = 30 * time.Second // SNO clusters require a longer time to reconcile due to the MCC restart in // reboot-required updates, so allow for more retries. - if isSNO, _ := extpriv.IsSNOSafe(oc); isSNO { + isSNO, isSNOErr := extpriv.IsSNOSafe(oc) + o.Expect(isSNOErr).NotTo(o.HaveOccurred(), fmt.Sprintf("Error checking if cluster is SNO: %v", isSNOErr)) + if isSNO { logger.Infof("Cluster is SNO, setting higher retry count.") loopCount = 10 } @@ -348,7 +455,7 @@ func validateMCPMachineCountTransitions(machineConfigClient *machineconfigclient // between the expected and actual machine counts. o.Expect(i).NotTo(o.BeNumerically("==", loopCount), "The actual MCP machine counts did not match the expected machine counts") // If we have not exhausted our attempts, wait a few seconds and check again - if i != 2 { + if i != (loopCount - 1) { logger.Infof("The MCP machine counts did match the expected values. Waiting %v seconds then trying again.", (i+1)*4) time.Sleep(time.Duration((i+1)*4) * time.Second) } diff --git a/test/extended/machineconfignode.go b/test/extended/machineconfignode.go index 1d674f0d97..1662dc1f3a 100644 --- a/test/extended/machineconfignode.go +++ b/test/extended/machineconfignode.go @@ -3,13 +3,18 @@ package extended import ( "context" + "errors" "fmt" + "io" + "net" + "syscall" "time" o "github.com/onsi/gomega" mcfgv1 "github.com/openshift/api/machineconfiguration/v1" machineconfigclient "github.com/openshift/client-go/machineconfiguration/clientset/versioned" "github.com/openshift/machine-config-operator/pkg/daemon/constants" + extpriv "github.com/openshift/machine-config-operator/test/extended-priv" exutil "github.com/openshift/machine-config-operator/test/extended-priv/util" logger "github.com/openshift/machine-config-operator/test/extended-priv/util/logext" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -135,13 +140,20 @@ func getMCNConditionStatus(mcn *mcfgv1.MachineConfigNode, conditionType mcfgv1.S // confirm "Updated" is "False") func checkMCNConditionStatus(mcn *mcfgv1.MachineConfigNode, conditionType mcfgv1.StateProgress, status metav1.ConditionStatus) bool { conditionStatus := getMCNConditionStatus(mcn, conditionType) - if conditionStatus != status && conditionType == mcfgv1.MachineConfigNodeResumed { - condition := getMCNCondition(mcn, conditionType) - logger.Infof("LastTransitionTime: %v, Message: %v, ObservedGeneration: %v, Reason: %v, Status: %v, Type: %v", condition.LastTransitionTime, condition.Message, condition.ObservedGeneration, condition.Reason, condition.Status, condition.Type) - } return conditionStatus == status } +// `isTransientConnectionError` returns true if the error indicates the API server is temporarily +// unreachable. On SNO this happens during node reboots. +func isTransientConnectionError(err error) bool { + var netErr net.Error + return err != nil && + (errors.Is(err, syscall.ECONNREFUSED) || + errors.Is(err, io.EOF) || + errors.Is(err, io.ErrUnexpectedEOF) || + (errors.As(err, &netErr) && netErr.Timeout())) +} + // `WaitForMCNConditionStatus` waits up to a specified timeout for the desired MCN condition to // match the desired status (ex. wait until "Updated" is "False"). If the desired condition is // "Unknown," the function will also return true if the condition is "True," which ensures that we @@ -168,15 +180,21 @@ func waitForMCNConditionStatus(machineConfigClient *machineconfigclient.Clientse // trough the "Unknown" phase, check if the condition has flipped to `True`. if !conditionMet && status == metav1.ConditionUnknown { conditionMet = checkMCNConditionStatus(workerNodeMCN, conditionType, metav1.ConditionTrue) - logger.Infof("MCN '%v' %v condition was %v, missed transition through %v.", mcnName, conditionType, metav1.ConditionTrue, status) + if conditionMet { + logger.Infof("MCN '%v' %v condition was %v, missed transition through %v.", mcnName, conditionType, metav1.ConditionTrue, status) + } } return conditionMet, nil }); err != nil { logger.Infof("The desired MCN condition was never met: %v", err) // Handle the situation where there were errors getting the MCN resource if conditionErr != nil { + if isTransientConnectionError(conditionErr) { + logger.Infof("Got a transient connection error waiting for MCN '%v' %v condition to be %v: %v", mcnName, conditionType, status, conditionErr) + return conditionMet, conditionErr + } logger.Infof("An error occurred waiting for MCN '%v' %v condition to be %v: %v", mcnName, conditionType, status, conditionErr) - return conditionMet, fmt.Errorf("MCN '%v' %v condition was not %v: %v", mcnName, conditionType, status, conditionErr) + return conditionMet, fmt.Errorf("MCN '%v' %v condition was not %v: %w", mcnName, conditionType, status, conditionErr) } // Handle case when no errors occur grabbing the MCN, but we time out waiting for the condition to be in the desired state logger.Infof("A timeout occurred waiting for MCN '%v' %v condition was not %v.", mcnName, conditionType, status) @@ -192,7 +210,7 @@ func waitForMCNConditionStatus(machineConfigClient *machineconfigclient.Clientse // status, a warning will be logged instead of erroring out the test. // //nolint:dupl // (ijanssen): Ignoring a duplication error the linter is throwing because of two similar, but unique if blocks. -func ValidateTransitionThroughConditions(machineConfigClient *machineconfigclient.Clientset, updatingNodeName string, isRebootless, isImageMode bool) { +func ValidateTransitionThroughConditions(oc *exutil.CLI, machineConfigClient *machineconfigclient.Clientset, updatingNodeName string, isImageMode bool) { // Get the start time of the update updateStartTime := metav1.Now() @@ -202,9 +220,16 @@ func ValidateTransitionThroughConditions(machineConfigClient *machineconfigclien // long for the condition to flip (that would mean something is wrong and would waste time). updatingWaitTime := 1 * time.Minute updatingWaitInterval := 1 * time.Second + isSNO, isSNOErr := extpriv.IsSNOSafe(oc) + o.Expect(isSNOErr).NotTo(o.HaveOccurred(), fmt.Sprintf("Error checking if cluster is SNO: %v", isSNOErr)) if isImageMode { updatingWaitTime = 25 * time.Minute updatingWaitInterval = 5 * time.Second + } else if isSNO { + // SNO transition times can also be a bit longer due to cluster connection instability, so + // set the times for longer in such clusters. (Longer image updates timing takes precedence.) + updatingWaitTime = 5 * time.Minute + updatingWaitInterval = 5 * time.Second } // Test the condition transitions. @@ -223,8 +248,10 @@ func ValidateTransitionThroughConditions(machineConfigClient *machineconfigclien o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for UpdateExecuted=Unknown: %v", err)) o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect UpdateExecuted=Unknown.") - // On standard, non-rebootless, update, check that node transitions through "Cordoned" and "Drained" phases - if !isRebootless { + // On standard, non-rebootless (image based), update, check that node transitions through + // "Cordoned" and "Drained" phases. + // Nodes do not cordon or drain on SNO, so also skip these checks if the cluster is SNO. + if isImageMode && !isSNO { logger.Infof("Waiting for Cordoned=True") conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateCordoned, metav1.ConditionTrue, 30*time.Second, 1*time.Second) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for Cordoned=True: %v", err)) @@ -250,43 +277,76 @@ func ValidateTransitionThroughConditions(machineConfigClient *machineconfigclien o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect AppliedOSImage=Unknown.") logger.Infof("Waiting for ImagePulledFromRegistry=Unknown") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeImagePulledFromRegistry, metav1.ConditionUnknown, 30*time.Second, 1*time.Second) + timeout := 30 * time.Second + if isSNO { + timeout = 1 * time.Minute + } + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeImagePulledFromRegistry, metav1.ConditionUnknown, timeout, 1*time.Second) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for ImagePulledFromRegistry=Unknown: %v", err)) o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect ImagePulledFromRegistry=Unknown.") logger.Infof("Waiting for AppliedOSImage=True") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateOS, metav1.ConditionTrue, 3*time.Minute, 1*time.Second) - o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for AppliedOSImage=True: %v", err)) - o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect AppliedOSImage=True.") + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateOS, metav1.ConditionTrue, 4*time.Minute, 2*time.Second) + if isSNO && isTransientConnectionError(err) { + logger.Infof("Warning, got connection error detecting AppliedOSImage=True. The node likely started rebooting.") + } else { + o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for AppliedOSImage=True: %v", err)) + o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect AppliedOSImage=True.") + } } else { // On a non-image mode update, check that node transitions through the "AppliedFiles" phase logger.Infof("Waiting for AppliedFiles=Unknown") conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateFiles, metav1.ConditionUnknown, 30*time.Second, 1*time.Second) - o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for AppliedFiles=Unknown: %v", err)) + // A node update in SNO can quickly transition through the "AppliedFiles" state and cause + // us to catch the update during the node reboot. During reboot, we will get connection + // errors and, thus, should not error out on such errors. + if isSNO && isTransientConnectionError(err) { + logger.Infof("Warning, got connection error detecting AppliedFiles=Unknown.") + } else { + o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for AppliedFiles=Unknown: %v", err)) + } if !conditionMet { logger.Infof("Warning, could not detect AppliedFiles=Unknown.") } logger.Infof("Waiting for AppliedFiles=True") conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateFiles, metav1.ConditionTrue, 3*time.Minute, 1*time.Second) - o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for AppliedFiles=True: %v", err)) - o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect AppliedFiles=True.") + // A node update in SNO can quickly transition through the "AppliedFiles" state and cause + // us to catch the update during the node reboot. During reboot, we will get connection + // errors and, thus, should not error out on such errors. + if isSNO && isTransientConnectionError(err) { + logger.Infof("Warning, got connection error detecting AppliedFiles=True. The node likely started rebooting.") + } else { + o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for AppliedFiles=True: %v", err)) + o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect AppliedFiles=True.") + } } logger.Infof("Waiting for UpdateExecuted=True") conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateExecuted, metav1.ConditionTrue, 20*time.Second, 1*time.Second) - o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for UpdateExecuted=True: %v", err)) - o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect UpdateExecuted=True.") + // A node update in SNO can quickly transition through the "UpdateExecuted" state and cause + // us to catch the update during the node reboot. During reboot, we will get connection + // errors and, thus, should not error out on such errors. + if isSNO && isTransientConnectionError(err) { + logger.Infof("Warning, got connection error detecting UpdateExecuted=True. The node likely started rebooting.") + } else { + o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for UpdateExecuted=True: %v", err)) + o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect UpdateExecuted=True.") + } // On image mode update, check that node transitions through the "ImagePulledFromRegistry" phase if isImageMode { logger.Infof("Waiting for ImagePulledFromRegistry=True") conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeImagePulledFromRegistry, metav1.ConditionTrue, 1*time.Minute, 1*time.Second) - o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for ImagePulledFromRegistry=True: %v", err)) - o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect ImagePulledFromRegistry=True.") + if isSNO && isTransientConnectionError(err) { + logger.Infof("Warning, got connection error detecting ImagePulledFromRegistry=True. The node likely started rebooting.") + } else { + o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for ImagePulledFromRegistry=True: %v", err)) + o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect ImagePulledFromRegistry=True.") + } } - // On rebootless update, check that node transitions through "UpdatePostActionComplete" phase - if isRebootless { + // On rebootless (non-image based) update, check that node transitions through "UpdatePostActionComplete" phase + if !isImageMode { logger.Infof("Waiting for UpdatePostActionComplete=True") conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdatePostActionComplete, metav1.ConditionTrue, 1*time.Minute, 1*time.Second) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for UpdatePostActionComplete=True: %v", err)) @@ -294,11 +354,15 @@ func ValidateTransitionThroughConditions(machineConfigClient *machineconfigclien } else { // On standard, non-rebootless, update, check that node transitions through "RebootedNode" phase logger.Infof("Waiting for RebootedNode=Unknown") conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateRebooted, metav1.ConditionUnknown, 15*time.Second, 1*time.Second) - o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for RebootedNode=Unknown: %v", err)) - o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect RebootedNode=Unknown.") + if isSNO && isTransientConnectionError(err) { + logger.Infof("Warning, got connection error detecting RebootedNode=Unknown. The node is likely already rebooting.") + } else { + o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for RebootedNode=Unknown: %v", err)) + o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect RebootedNode=Unknown.") + } logger.Infof("Waiting for RebootedNode=True") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateRebooted, metav1.ConditionTrue, 10*time.Minute, 1*time.Second) + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateRebooted, metav1.ConditionTrue, 15*time.Minute, 1*time.Second) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for RebootedNode=True: %v", err)) o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect RebootedNode=True.") } @@ -306,26 +370,38 @@ func ValidateTransitionThroughConditions(machineConfigClient *machineconfigclien // The final steps of the update happen quickly, so sometimes we can miss the final condition // transitions. If we do, we will not error out, but record that the condition was missed. logger.Infof("Waiting for Resumed=True") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeResumed, metav1.ConditionTrue, 5*time.Second, 1*time.Second) + timeout := 45 * time.Second + if isSNO { + timeout = 3 * time.Minute + } + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeResumed, metav1.ConditionTrue, timeout, 1*time.Second) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for Resumed=True: %v", err)) if !conditionMet { logger.Infof("Warning, could not detect Resumed=True.") } - logger.Infof("Waiting for UpdateComplete=True") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateComplete, metav1.ConditionTrue, 10*time.Second, 1*time.Second) - o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for UpdateComplete=True: %v", err)) - if !conditionMet { - logger.Infof("Warning, could not detect UpdateComplete=True.") - } - logger.Infof("Waiting for Uncordoned=True") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateUncordoned, metav1.ConditionTrue, 10*time.Second, 1*time.Second) - o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for UpdateComplete=True: %v", err)) - if !conditionMet { - logger.Infof("Warning, could not detect UpdateComplete=True.") + // Only nodes that cordon and drain go through the "UpdateComplete" and "Uncordoned" stages, so + // skip these checks for standard, non-rebootless (non-image based), updates and in SNO clusters. + if isImageMode && !isSNO { + logger.Infof("Waiting for UpdateComplete=True") + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateComplete, metav1.ConditionTrue, 10*time.Second, 1*time.Second) + o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for UpdateComplete=True: %v", err)) + if !conditionMet { + logger.Infof("Warning, could not detect UpdateComplete=True.") + } + logger.Infof("Waiting for Uncordoned=True") + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateUncordoned, metav1.ConditionTrue, 10*time.Second, 1*time.Second) + o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for Uncordoned=True: %v", err)) + if !conditionMet { + logger.Infof("Warning, could not detect Uncordoned=True.") + } } logger.Infof("Waiting for Updated=True") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdated, metav1.ConditionTrue, 1*time.Minute, 1*time.Second) + timeout = 1 * time.Minute + if isSNO { + timeout = 10 * time.Minute + } + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdated, metav1.ConditionTrue, timeout, 1*time.Second) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for Updated=True: %v", err)) o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect Updated=True.") diff --git a/test/extended/machineconfigpool.go b/test/extended/machineconfigpool.go index 6b90d58f73..929b244922 100644 --- a/test/extended/machineconfigpool.go +++ b/test/extended/machineconfigpool.go @@ -29,9 +29,9 @@ func DoesMachineConfigPoolHaveMachines(machineConfigClient *machineconfigclient. return mcp.Status.MachineCount > 0 } -// `WaitForMCPToBeReady` waits up to 5 minutes for a pool to be in an updated state with +// `WaitForMCPToBeReady` waits up to a specified timeout for a pool to be in an updated state with // a specified number of ready machines -func WaitForMCPToBeReady(machineConfigClient *machineconfigclient.Clientset, poolName string, readyMachineCount int32, oldRenderedMC string) { +func WaitForMCPToBeReady(machineConfigClient *machineconfigclient.Clientset, poolName string, readyMachineCount int32, oldRenderedMC string, timeout time.Duration) { o.Eventually(func() bool { mcp, err := machineConfigClient.MachineconfigurationV1().MachineConfigPools().Get(context.TODO(), poolName, metav1.GetOptions{}) if err != nil { @@ -61,7 +61,7 @@ func WaitForMCPToBeReady(machineConfigClient *machineconfigclient.Clientset, poo logger.Infof("MCP '%v' has %v ready machines. Waiting for the desired ready machine count of %v.", poolName, mcp.Status.UpdatedMachineCount, readyMachineCount) } return false - }, 5*time.Minute, 10*time.Second).Should(o.BeTrue(), "Timed out waiting for MCP '%v' to be in 'Updated' state with %v ready machines.", poolName, readyMachineCount) + }, timeout, 10*time.Second).Should(o.BeTrue(), "Timed out waiting for MCP '%v' to be in 'Updated' state with %v ready machines.", poolName, readyMachineCount) } // `CleanupCustomMCP` cleans up a custom MCP if it exists through the following steps: @@ -92,7 +92,7 @@ func CleanupCustomMCP(oc *exutil.CLI, machineConfigClient *machineconfigclient.C // Wait for custom MCP to report no ready nodes logger.Infof("Waiting for %v MCP to be updated with %v ready machines.", customMCPName, 0) - WaitForMCPToBeReady(machineConfigClient, customMCPName, 0, "") + WaitForMCPToBeReady(machineConfigClient, customMCPName, 0, "", 5*time.Minute) // Wait for node to have a current config version equal to the worker MCP's config version workerMcp, workerMcpErr := machineConfigClient.MachineconfigurationV1().MachineConfigPools().Get(context.TODO(), "worker", metav1.GetOptions{}) @@ -115,7 +115,7 @@ func CleanupCustomMCP(oc *exutil.CLI, machineConfigClient *machineconfigclient.C // `DeleteMCAndWaitForMCPUpdate` deletes the desired MC and waits for the associated MCP // to return to an updated state -func DeleteMCAndWaitForMCPUpdate(oc *exutil.CLI, machineConfigClient *machineconfigclient.Clientset, mcName, mcpName string) { +func DeleteMCAndWaitForMCPUpdate(oc *exutil.CLI, machineConfigClient *machineconfigclient.Clientset, mcName, mcpName string, isSNO bool) { oldRenderedMC := "" // Get the rendered config of the MCP before the MC deletion, if the MCP still exists mcp, mcpErr := machineConfigClient.MachineconfigurationV1().MachineConfigPools().Get(context.TODO(), mcpName, metav1.GetOptions{}) @@ -131,8 +131,13 @@ func DeleteMCAndWaitForMCPUpdate(oc *exutil.CLI, machineConfigClient *machinecon // Only wait for the MCP to return to an updated state if the MC existed and needed deletion // and if the targeted MCP still exists if mcDeleted && mcpErr == nil { + timeout := 8 * time.Minute + // SNO clusters can take a bit longer to finish an update, so set a longer timeout + if isSNO { + timeout = 15 * time.Minute + } logger.Infof("Waiting for %v MCP to be updated with %v ready machines.", mcpName, 1) - WaitForMCPToBeReady(machineConfigClient, mcpName, 1, oldRenderedMC) + WaitForMCPToBeReady(machineConfigClient, mcpName, 1, oldRenderedMC, timeout) } } diff --git a/test/extended/node.go b/test/extended/node.go index 83dcad89c1..cca6cd91a2 100644 --- a/test/extended/node.go +++ b/test/extended/node.go @@ -28,7 +28,7 @@ func GetNodesByRole(oc *exutil.CLI, role string) ([]corev1.Node, error) { return nodes.Items, nil } -// `WaitForNodeCurrentConfig` waits up to 5 minutes for a input node to have a current +// `WaitForNodeCurrentConfig` waits up to 8 minutes for a input node to have a current // config equal to the `config` parameter func WaitForNodeCurrentConfig(oc *exutil.CLI, nodeName, config string) { o.Eventually(func() bool { @@ -46,5 +46,5 @@ func WaitForNodeCurrentConfig(oc *exutil.CLI, nodeName, config string) { } logger.Infof("Node '%v' has a current config version of '%v'. Waiting for the node's current config version to be '%v'.", nodeName, nodeCurrentConfig, config) return false - }, 5*time.Minute, 10*time.Second).Should(o.BeTrue(), "Timed out waiting for node '%v' to have a current config version of '%v'.", nodeName, config) + }, 8*time.Minute, 10*time.Second).Should(o.BeTrue(), "Timed out waiting for node '%v' to have a current config version of '%v'.", nodeName, config) } From eb38f08188dfa55edf31764e0971216889244ef0 Mon Sep 17 00:00:00 2001 From: Isabella Janssen Date: Wed, 10 Jun 2026 09:13:10 -0400 Subject: [PATCH 2/4] don't ignore transient errors & bump timeout --- test/extended-priv/mco_ocb.go | 2 +- test/extended/machineconfignode.go | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/test/extended-priv/mco_ocb.go b/test/extended-priv/mco_ocb.go index 46f357fee0..9653032e61 100644 --- a/test/extended-priv/mco_ocb.go +++ b/test/extended-priv/mco_ocb.go @@ -313,7 +313,7 @@ func ValidateSuccessfulMOSC(mosc *MachineOSConfig, checkers []Checker) { exutil.By("Check that the machine-os-builder is using leader election without failing") o.Eventually(func() (string, error) { return mOSBuilder.Logs() - }, "5m", "10s").Should(o.And( + }, "7m", "10s").Should(o.And( o.MatchRegexp("(?i)"+regexp.QuoteMeta("attempting to acquire leader lease")), o.MatchRegexp("(?i)"+regexp.QuoteMeta("successfully acquired lease"))), "The machine os builder pod is not using the leader election without failures") diff --git a/test/extended/machineconfignode.go b/test/extended/machineconfignode.go index 1662dc1f3a..38f7463ab9 100644 --- a/test/extended/machineconfignode.go +++ b/test/extended/machineconfignode.go @@ -163,6 +163,7 @@ func waitForMCNConditionStatus(machineConfigClient *machineconfigclient.Clientse conditionMet := false var conditionErr error + var transientErr error var workerNodeMCN *mcfgv1.MachineConfigNode if err := wait.PollUntilContextTimeout(context.TODO(), interval, timeout, true, func(_ context.Context) (bool, error) { logger.Infof("Waiting for MCN '%v' %v condition to be %v.", mcnName, conditionType, status) @@ -183,6 +184,9 @@ func waitForMCNConditionStatus(machineConfigClient *machineconfigclient.Clientse if conditionMet { logger.Infof("MCN '%v' %v condition was %v, missed transition through %v.", mcnName, conditionType, metav1.ConditionTrue, status) } + if transientErr == nil && isTransientConnectionError(conditionErr) { + transientErr = conditionErr + } } return conditionMet, nil }); err != nil { @@ -196,6 +200,10 @@ func waitForMCNConditionStatus(machineConfigClient *machineconfigclient.Clientse logger.Infof("An error occurred waiting for MCN '%v' %v condition to be %v: %v", mcnName, conditionType, status, conditionErr) return conditionMet, fmt.Errorf("MCN '%v' %v condition was not %v: %w", mcnName, conditionType, status, conditionErr) } + if transientErr != nil { + logger.Infof("Got a transient connection error waiting for MCN '%v' %v condition to be %v: %v", mcnName, conditionType, status, transientErr) + return conditionMet, transientErr + } // Handle case when no errors occur grabbing the MCN, but we time out waiting for the condition to be in the desired state logger.Infof("A timeout occurred waiting for MCN '%v' %v condition was not %v.", mcnName, conditionType, status) return conditionMet, nil From 82dce394367470cb80ac1add5f0b546a2a7d9f76 Mon Sep 17 00:00:00 2001 From: Isabella Janssen Date: Wed, 10 Jun 2026 12:29:09 -0400 Subject: [PATCH 3/4] don't keep polling on pre-reboot conditions when node has entered reboot --- test/extended/machineconfignode.go | 82 +++++++++++++----------------- 1 file changed, 34 insertions(+), 48 deletions(-) diff --git a/test/extended/machineconfignode.go b/test/extended/machineconfignode.go index 38f7463ab9..be5c266e37 100644 --- a/test/extended/machineconfignode.go +++ b/test/extended/machineconfignode.go @@ -159,19 +159,20 @@ func isTransientConnectionError(err error) bool { // "Unknown," the function will also return true if the condition is "True," which ensures that we // do not fail when an update progresses quickly through the intermediary "Unknown" phase. func waitForMCNConditionStatus(machineConfigClient *machineconfigclient.Clientset, mcnName string, conditionType mcfgv1.StateProgress, status metav1.ConditionStatus, - timeout time.Duration, interval time.Duration) (bool, error) { + timeout time.Duration, interval time.Duration, stopOnTransientError bool) (bool, error) { conditionMet := false var conditionErr error - var transientErr error var workerNodeMCN *mcfgv1.MachineConfigNode if err := wait.PollUntilContextTimeout(context.TODO(), interval, timeout, true, func(_ context.Context) (bool, error) { logger.Infof("Waiting for MCN '%v' %v condition to be %v.", mcnName, conditionType, status) workerNodeMCN, conditionErr = machineConfigClient.MachineconfigurationV1().MachineConfigNodes().Get(context.TODO(), mcnName, metav1.GetOptions{}) - // Record if an error occurs when getting the MCN resource if conditionErr != nil { logger.Infof("Error getting MCN for node '%v': %v", mcnName, conditionErr) + if stopOnTransientError && isTransientConnectionError(conditionErr) { + return false, conditionErr + } return false, nil } @@ -184,27 +185,22 @@ func waitForMCNConditionStatus(machineConfigClient *machineconfigclient.Clientse if conditionMet { logger.Infof("MCN '%v' %v condition was %v, missed transition through %v.", mcnName, conditionType, metav1.ConditionTrue, status) } - if transientErr == nil && isTransientConnectionError(conditionErr) { - transientErr = conditionErr - } } return conditionMet, nil }); err != nil { + if isTransientConnectionError(err) { + logger.Infof("Got a transient connection error waiting for MCN '%v' %v condition to be %v: %v", mcnName, conditionType, status, err) + return conditionMet, err + } logger.Infof("The desired MCN condition was never met: %v", err) - // Handle the situation where there were errors getting the MCN resource if conditionErr != nil { if isTransientConnectionError(conditionErr) { - logger.Infof("Got a transient connection error waiting for MCN '%v' %v condition to be %v: %v", mcnName, conditionType, status, conditionErr) - return conditionMet, conditionErr + logger.Infof("Timed out with transient connection error waiting for MCN '%v' %v condition to be %v: %v", mcnName, conditionType, status, conditionErr) + return conditionMet, nil } logger.Infof("An error occurred waiting for MCN '%v' %v condition to be %v: %v", mcnName, conditionType, status, conditionErr) return conditionMet, fmt.Errorf("MCN '%v' %v condition was not %v: %w", mcnName, conditionType, status, conditionErr) } - if transientErr != nil { - logger.Infof("Got a transient connection error waiting for MCN '%v' %v condition to be %v: %v", mcnName, conditionType, status, transientErr) - return conditionMet, transientErr - } - // Handle case when no errors occur grabbing the MCN, but we time out waiting for the condition to be in the desired state logger.Infof("A timeout occurred waiting for MCN '%v' %v condition was not %v.", mcnName, conditionType, status) return conditionMet, nil } @@ -242,17 +238,17 @@ func ValidateTransitionThroughConditions(oc *exutil.CLI, machineConfigClient *ma // Test the condition transitions. logger.Infof("Waiting for Updated=False") - conditionMet, err := waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdated, metav1.ConditionFalse, updatingWaitTime, updatingWaitInterval) + conditionMet, err := waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdated, metav1.ConditionFalse, updatingWaitTime, updatingWaitInterval, false) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for Updated=False: %v", err)) o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect Updated=False.") logger.Infof("Waiting for UpdatePrepared=True") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdatePrepared, metav1.ConditionTrue, 1*time.Minute, 1*time.Second) + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdatePrepared, metav1.ConditionTrue, 1*time.Minute, 1*time.Second, false) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for UpdatePrepared=True: %v", err)) o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect UpdatePrepared=True.") logger.Infof("Waiting for UpdateExecuted=Unknown") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateExecuted, metav1.ConditionUnknown, 30*time.Second, 1*time.Second) + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateExecuted, metav1.ConditionUnknown, 30*time.Second, 1*time.Second, false) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for UpdateExecuted=Unknown: %v", err)) o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect UpdateExecuted=Unknown.") @@ -261,17 +257,17 @@ func ValidateTransitionThroughConditions(oc *exutil.CLI, machineConfigClient *ma // Nodes do not cordon or drain on SNO, so also skip these checks if the cluster is SNO. if isImageMode && !isSNO { logger.Infof("Waiting for Cordoned=True") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateCordoned, metav1.ConditionTrue, 30*time.Second, 1*time.Second) + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateCordoned, metav1.ConditionTrue, 30*time.Second, 1*time.Second, false) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for Cordoned=True: %v", err)) o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect Cordoned=True.") logger.Infof("Waiting for Drained=Unknown") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateDrained, metav1.ConditionUnknown, 15*time.Second, 1*time.Second) + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateDrained, metav1.ConditionUnknown, 15*time.Second, 1*time.Second, false) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for Drained=Unknown: %v", err)) o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect Drained=Unknown.") logger.Infof("Waiting for Drained=True") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateDrained, metav1.ConditionTrue, 4*time.Minute, 1*time.Second) + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateDrained, metav1.ConditionTrue, 4*time.Minute, 1*time.Second, false) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for Drained=True: %v", err)) o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect Drained=True.") } @@ -280,7 +276,7 @@ func ValidateTransitionThroughConditions(oc *exutil.CLI, machineConfigClient *ma // "ImagePulledFromRegistry" phases if isImageMode { logger.Infof("Waiting for AppliedOSImage=Unknown") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateOS, metav1.ConditionUnknown, 30*time.Second, 1*time.Second) + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateOS, metav1.ConditionUnknown, 30*time.Second, 1*time.Second, false) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for AppliedOSImage=Unknown: %v", err)) o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect AppliedOSImage=Unknown.") @@ -289,12 +285,12 @@ func ValidateTransitionThroughConditions(oc *exutil.CLI, machineConfigClient *ma if isSNO { timeout = 1 * time.Minute } - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeImagePulledFromRegistry, metav1.ConditionUnknown, timeout, 1*time.Second) + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeImagePulledFromRegistry, metav1.ConditionUnknown, timeout, 1*time.Second, false) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for ImagePulledFromRegistry=Unknown: %v", err)) o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect ImagePulledFromRegistry=Unknown.") logger.Infof("Waiting for AppliedOSImage=True") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateOS, metav1.ConditionTrue, 4*time.Minute, 2*time.Second) + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateOS, metav1.ConditionTrue, 4*time.Minute, 2*time.Second, true) if isSNO && isTransientConnectionError(err) { logger.Infof("Warning, got connection error detecting AppliedOSImage=True. The node likely started rebooting.") } else { @@ -303,24 +299,14 @@ func ValidateTransitionThroughConditions(oc *exutil.CLI, machineConfigClient *ma } } else { // On a non-image mode update, check that node transitions through the "AppliedFiles" phase logger.Infof("Waiting for AppliedFiles=Unknown") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateFiles, metav1.ConditionUnknown, 30*time.Second, 1*time.Second) - // A node update in SNO can quickly transition through the "AppliedFiles" state and cause - // us to catch the update during the node reboot. During reboot, we will get connection - // errors and, thus, should not error out on such errors. - if isSNO && isTransientConnectionError(err) { - logger.Infof("Warning, got connection error detecting AppliedFiles=Unknown.") - } else { - o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for AppliedFiles=Unknown: %v", err)) - } + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateFiles, metav1.ConditionUnknown, 30*time.Second, 1*time.Second, false) + o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for AppliedFiles=Unknown: %v", err)) if !conditionMet { logger.Infof("Warning, could not detect AppliedFiles=Unknown.") } logger.Infof("Waiting for AppliedFiles=True") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateFiles, metav1.ConditionTrue, 3*time.Minute, 1*time.Second) - // A node update in SNO can quickly transition through the "AppliedFiles" state and cause - // us to catch the update during the node reboot. During reboot, we will get connection - // errors and, thus, should not error out on such errors. + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateFiles, metav1.ConditionTrue, 3*time.Minute, 1*time.Second, true) if isSNO && isTransientConnectionError(err) { logger.Infof("Warning, got connection error detecting AppliedFiles=True. The node likely started rebooting.") } else { @@ -330,7 +316,7 @@ func ValidateTransitionThroughConditions(oc *exutil.CLI, machineConfigClient *ma } logger.Infof("Waiting for UpdateExecuted=True") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateExecuted, metav1.ConditionTrue, 20*time.Second, 1*time.Second) + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateExecuted, metav1.ConditionTrue, 20*time.Second, 1*time.Second, true) // A node update in SNO can quickly transition through the "UpdateExecuted" state and cause // us to catch the update during the node reboot. During reboot, we will get connection // errors and, thus, should not error out on such errors. @@ -344,7 +330,7 @@ func ValidateTransitionThroughConditions(oc *exutil.CLI, machineConfigClient *ma // On image mode update, check that node transitions through the "ImagePulledFromRegistry" phase if isImageMode { logger.Infof("Waiting for ImagePulledFromRegistry=True") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeImagePulledFromRegistry, metav1.ConditionTrue, 1*time.Minute, 1*time.Second) + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeImagePulledFromRegistry, metav1.ConditionTrue, 1*time.Minute, 1*time.Second, true) if isSNO && isTransientConnectionError(err) { logger.Infof("Warning, got connection error detecting ImagePulledFromRegistry=True. The node likely started rebooting.") } else { @@ -356,21 +342,21 @@ func ValidateTransitionThroughConditions(oc *exutil.CLI, machineConfigClient *ma // On rebootless (non-image based) update, check that node transitions through "UpdatePostActionComplete" phase if !isImageMode { logger.Infof("Waiting for UpdatePostActionComplete=True") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdatePostActionComplete, metav1.ConditionTrue, 1*time.Minute, 1*time.Second) + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdatePostActionComplete, metav1.ConditionTrue, 1*time.Minute, 1*time.Second, false) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for UpdatePostActionComplete=True: %v", err)) o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect UpdatePostActionComplete=True.") } else { // On standard, non-rebootless, update, check that node transitions through "RebootedNode" phase logger.Infof("Waiting for RebootedNode=Unknown") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateRebooted, metav1.ConditionUnknown, 15*time.Second, 1*time.Second) - if isSNO && isTransientConnectionError(err) { - logger.Infof("Warning, got connection error detecting RebootedNode=Unknown. The node is likely already rebooting.") + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateRebooted, metav1.ConditionUnknown, 15*time.Second, 1*time.Second, false) + o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for RebootedNode=Unknown: %v", err)) + if !conditionMet { + logger.Infof("Warning, could not detect RebootedNode=Unknown.") } else { - o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for RebootedNode=Unknown: %v", err)) o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect RebootedNode=Unknown.") } logger.Infof("Waiting for RebootedNode=True") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateRebooted, metav1.ConditionTrue, 15*time.Minute, 1*time.Second) + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateRebooted, metav1.ConditionTrue, 15*time.Minute, 1*time.Second, false) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for RebootedNode=True: %v", err)) o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect RebootedNode=True.") } @@ -382,7 +368,7 @@ func ValidateTransitionThroughConditions(oc *exutil.CLI, machineConfigClient *ma if isSNO { timeout = 3 * time.Minute } - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeResumed, metav1.ConditionTrue, timeout, 1*time.Second) + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeResumed, metav1.ConditionTrue, timeout, 1*time.Second, false) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for Resumed=True: %v", err)) if !conditionMet { logger.Infof("Warning, could not detect Resumed=True.") @@ -391,13 +377,13 @@ func ValidateTransitionThroughConditions(oc *exutil.CLI, machineConfigClient *ma // skip these checks for standard, non-rebootless (non-image based), updates and in SNO clusters. if isImageMode && !isSNO { logger.Infof("Waiting for UpdateComplete=True") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateComplete, metav1.ConditionTrue, 10*time.Second, 1*time.Second) + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateComplete, metav1.ConditionTrue, 10*time.Second, 1*time.Second, false) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for UpdateComplete=True: %v", err)) if !conditionMet { logger.Infof("Warning, could not detect UpdateComplete=True.") } logger.Infof("Waiting for Uncordoned=True") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateUncordoned, metav1.ConditionTrue, 10*time.Second, 1*time.Second) + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateUncordoned, metav1.ConditionTrue, 10*time.Second, 1*time.Second, false) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for Uncordoned=True: %v", err)) if !conditionMet { logger.Infof("Warning, could not detect Uncordoned=True.") @@ -409,7 +395,7 @@ func ValidateTransitionThroughConditions(oc *exutil.CLI, machineConfigClient *ma if isSNO { timeout = 10 * time.Minute } - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdated, metav1.ConditionTrue, timeout, 1*time.Second) + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdated, metav1.ConditionTrue, timeout, 1*time.Second, false) o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for Updated=True: %v", err)) o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect Updated=True.") From fa482bd346a230eacb78adcc2c69f7cc0e39389e Mon Sep 17 00:00:00 2001 From: Isabella Janssen Date: Wed, 10 Jun 2026 23:11:57 -0400 Subject: [PATCH 4/4] apply same transient error stop retry to rebooted = unknown condition --- test/extended/machineconfignode.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/extended/machineconfignode.go b/test/extended/machineconfignode.go index be5c266e37..5f493b3da4 100644 --- a/test/extended/machineconfignode.go +++ b/test/extended/machineconfignode.go @@ -347,11 +347,11 @@ func ValidateTransitionThroughConditions(oc *exutil.CLI, machineConfigClient *ma o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect UpdatePostActionComplete=True.") } else { // On standard, non-rebootless, update, check that node transitions through "RebootedNode" phase logger.Infof("Waiting for RebootedNode=Unknown") - conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateRebooted, metav1.ConditionUnknown, 15*time.Second, 1*time.Second, false) - o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for RebootedNode=Unknown: %v", err)) - if !conditionMet { - logger.Infof("Warning, could not detect RebootedNode=Unknown.") + conditionMet, err = waitForMCNConditionStatus(machineConfigClient, updatingNodeName, mcfgv1.MachineConfigNodeUpdateRebooted, metav1.ConditionUnknown, 15*time.Second, 1*time.Second, true) + if isSNO && isTransientConnectionError(err) { + logger.Infof("Warning, got connection error detecting RebootedNode=Unknown. The node likely started rebooting.") } else { + o.Expect(err).NotTo(o.HaveOccurred(), fmt.Sprintf("Error occurred while waiting for RebootedNode=Unknown: %v", err)) o.Expect(conditionMet).To(o.BeTrue(), "Error, could not detect RebootedNode=Unknown.") }