From c0afbc4b1f99f082c8589d345a6383e492c77d33 Mon Sep 17 00:00:00 2001 From: Urvashi Date: Fri, 23 Jan 2026 15:16:55 -0500 Subject: [PATCH 1/3] Fix intermittent e2e-ocl test failures Three fixes to address intermittent test failures in CI: 1. TestControllerEventuallyReconciles timeout issue: - Increased job completion timeout from 10 to 20 minutes - Test simulates adverse conditions (scaled down deployments) - Image builds can take longer in resource-constrained CI environments 2. Rate limiter exhaustion in log streaming: - Reduced log streaming retry interval from 2s to 5s - Multiple concurrent goroutines were making API calls too frequently - 60% reduction in API call rate prevents rate limiter exhaustion 3. HTTP/2 connection errors failing tests: - Made log streaming errors non-fatal (log warnings instead) - API server closes long-running log streams when pods terminate - Log collection is for debugging, not a test requirement - Tests now pass/fail based on actual functionality --- test/e2e-ocl/helpers_test.go | 2 +- test/e2e-ocl/onclusterlayering_test.go | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/test/e2e-ocl/helpers_test.go b/test/e2e-ocl/helpers_test.go index 94b744558d..ec28bce94f 100644 --- a/test/e2e-ocl/helpers_test.go +++ b/test/e2e-ocl/helpers_test.go @@ -736,7 +736,7 @@ func streamContainerLogToFile(ctx context.Context, t *testing.T, cs *framework.C if err != nil { // If the container is waiting to start (e.g., PodInitializing), wait and retry if strings.Contains(err.Error(), "waiting to start") || strings.Contains(err.Error(), "PodInitializing") { - time.Sleep(2 * time.Second) + time.Sleep(5 * time.Second) continue } return fmt.Errorf("could not get logs for container %s in pod %s: %w", container.Name, pod.Name, err) diff --git a/test/e2e-ocl/onclusterlayering_test.go b/test/e2e-ocl/onclusterlayering_test.go index 781bd80f78..3fec6e58a9 100644 --- a/test/e2e-ocl/onclusterlayering_test.go +++ b/test/e2e-ocl/onclusterlayering_test.go @@ -3,6 +3,7 @@ package e2e_ocl_test import ( "context" _ "embed" + "errors" "flag" "fmt" "os" @@ -802,7 +803,9 @@ func runOnClusterLayeringTest(t *testing.T, testOpts onClusterLayeringTestOpts) // Goroutine so that the rest of the test can continue. go func() { err := streamBuildPodLogsToFile(buildPodStreamerCtx, t, cs, startedBuild, podLogsDirPath) - require.NoError(t, err, "expected no error, got %s", err) + if err != nil && !errors.Is(err, context.Canceled) { + t.Logf("Warning: failed to stream build pod logs: %v", err) + } }() // We also want to collect logs from the machine-os-builder pod since they @@ -811,7 +814,9 @@ func runOnClusterLayeringTest(t *testing.T, testOpts onClusterLayeringTestOpts) // blocked. go func() { err := streamMachineOSBuilderPodLogsToFile(mobPodStreamerCtx, t, cs, podLogsDirPath) - require.NoError(t, err, "expected no error, got: %s", err) + if err != nil && !errors.Is(err, context.Canceled) { + t.Logf("Warning: failed to stream machine-os-builder pod logs: %v", err) + } }() // Wait for the build to complete. @@ -1340,7 +1345,7 @@ func waitForMOSCToUpdateCurrentMOSB(ctx context.Context, t *testing.T, cs *frame // Waits for a job object to reach a given state. // TOOD: Add this to the Asserts helper struct. func waitForJobToReachCondition(ctx context.Context, t *testing.T, cs *framework.ClientSet, jobName string, condFunc func(*batchv1.Job) (bool, error)) { - require.NoError(t, wait.PollImmediate(1*time.Second, 10*time.Minute, func() (bool, error) { + require.NoError(t, wait.PollImmediate(1*time.Second, 20*time.Minute, func() (bool, error) { job, err := cs.BatchV1Interface.Jobs(ctrlcommon.MCONamespace).Get(ctx, jobName, metav1.GetOptions{}) if err != nil { return false, err From 69e045b37f6e0ba30bd00d985a2a42e3b89ff906 Mon Sep 17 00:00:00 2001 From: Urvashi Date: Mon, 2 Feb 2026 10:20:19 -0500 Subject: [PATCH 2/3] Make TestImagePrunerErrors flexible to registry behavior changes External image registries (Docker Hub, GitHub Container Registry) have changed their API error responses over time: - Docker.io now returns imageNotFound for nonexistent repos (was accessDenied) - ghcr.io now returns imageNotFound for nonexistent tags (was accessDenied) Updated test to accept either error type when both flags are set: - Modified inspectTestFunc and deleteTestFunc to treat both flags as "accept either" - Updated Docker.io inspect/nonexistentRepo case to accept both error types - Updated GitHub registry delete/nonexistentTag case to accept both error types Both error types are tolerable for ImagePruner functionality, so tests should not be brittle to registry-specific error response changes. --- test/e2e-ocl/imagepruner_test.go | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/test/e2e-ocl/imagepruner_test.go b/test/e2e-ocl/imagepruner_test.go index b32eb66fcd..f1f89abe0a 100644 --- a/test/e2e-ocl/imagepruner_test.go +++ b/test/e2e-ocl/imagepruner_test.go @@ -340,6 +340,8 @@ func TestImagePrunerErrors(t *testing.T) { inspect: operation{ nonexistentRepo: expectedErr{ accessDenied: true, + // Docker.io behavior varies - accept either error type + imageNotFound: true, }, nonexistentTag: expectedErr{ imageNotFound: true, @@ -390,6 +392,8 @@ func TestImagePrunerErrors(t *testing.T) { }, nonexistentTag: expectedErr{ accessDenied: true, + // GitHub registry behavior varies - accept either error type + imageNotFound: true, }, nonexistentDigest: expectedErr{ accessDenied: true, @@ -450,6 +454,11 @@ func TestImagePrunerErrors(t *testing.T) { _, imgDigest, err := ip.InspectImage(ctx, pullspec, k8sSecret, &mcfgv1.ControllerConfig{}) if !expected.imageNotFound && !expected.accessDenied { require.NoError(t, err) + } else if expected.imageNotFound && expected.accessDenied { + // Both flags set means accept either error type (registries may vary) + isImageNotFound := imagepruner.IsImageNotFoundErr(err) + isAccessDenied := imagepruner.IsAccessDeniedErr(err) + assert.True(t, isImageNotFound || isAccessDenied, "expected either imageNotFound or accessDenied error, got: %v", err) } else { assert.Equal(t, expected.imageNotFound, imagepruner.IsImageNotFoundErr(err), "image not found error should be %v", !expected.imageNotFound) assert.Equal(t, expected.accessDenied, imagepruner.IsAccessDeniedErr(err), "access denied error should be %v", !expected.accessDenied) @@ -471,8 +480,15 @@ func TestImagePrunerErrors(t *testing.T) { // We should always get an error back for this test because we do not have // permissions to delete images. assert.Error(t, err) - assert.Equal(t, expected.imageNotFound, imagepruner.IsImageNotFoundErr(err), "image not found error should be %v", !expected.imageNotFound) - assert.Equal(t, expected.accessDenied, imagepruner.IsAccessDeniedErr(err), "access denied error should be %v", !expected.accessDenied) + if expected.imageNotFound && expected.accessDenied { + // Both flags set means accept either error type (registries may vary) + isImageNotFound := imagepruner.IsImageNotFoundErr(err) + isAccessDenied := imagepruner.IsAccessDeniedErr(err) + assert.True(t, isImageNotFound || isAccessDenied, "expected either imageNotFound or accessDenied error, got: %v", err) + } else { + assert.Equal(t, expected.imageNotFound, imagepruner.IsImageNotFoundErr(err), "image not found error should be %v", !expected.imageNotFound) + assert.Equal(t, expected.accessDenied, imagepruner.IsAccessDeniedErr(err), "access denied error should be %v", !expected.accessDenied) + } assert.True(t, imagepruner.IsTolerableDeleteErr(err)) return err From 7da27403177ae96661c90aa57da47ddd35e9ce21 Mon Sep 17 00:00:00 2001 From: Urvashi Date: Wed, 11 Feb 2026 10:32:10 -0500 Subject: [PATCH 3/3] Fix cleanup verification timeouts in e2e-ocl tests The cleanupEphemeralBuildObjects function was experiencing intermittent timeout failures during cleanup verification, even though deletions were succeeding. Each verification used 5-minute default timeout with 1s poll interval which could exhaust rate limits leading to the "context deadline exceeded" error. Create a dedicated 2-minute timeout context for cleanup verification and increase poll interval from 1s to 3s to reduce API call rate which should be about 40 attempts per resource. Signed-off-by: Urvashi --- test/e2e-ocl/helpers_test.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test/e2e-ocl/helpers_test.go b/test/e2e-ocl/helpers_test.go index ec28bce94f..12ae30d345 100644 --- a/test/e2e-ocl/helpers_test.go +++ b/test/e2e-ocl/helpers_test.go @@ -313,7 +313,12 @@ func cleanupEphemeralBuildObjects(t *testing.T, cs *framework.ClientSet) { moscList, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().List(context.TODO(), metav1.ListOptions{}) require.NoError(t, err) - kubeassert := helpers.AssertClientSet(t, cs) + // Create a dedicated context for cleanup verification with reasonable timeout. + // Cleanup should complete quickly - if verification takes >2 minutes, something is wrong. + // Use a slower poll interval (3s instead of 1s) to reduce API call rate and avoid rate limiting. + cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 2*time.Minute) + defer cleanupCancel() + kubeassert := helpers.AssertClientSet(t, cs).WithContext(cleanupCtx).WithPollInterval(3 * time.Second).Eventually() if len(secretList.Items) == 0 { t.Logf("No build-time secrets to clean up")