From b716a5fa2d11e8eb773c9050baf646fc0c94ad4f Mon Sep 17 00:00:00 2001 From: Urvashi Date: Tue, 2 Jun 2026 10:51:33 -0400 Subject: [PATCH 1/3] Replace monolithic e2e-ocl test suite with sharded suite Replace the monolithic e2e-ocl test suite with two sharded suites (e2e-ocl-1of2 and e2e-ocl-2of2) and a shared helper package (e2e-ocl-shared) to reduce overall CI time by running in parallel. The sharded tests incorporate all stabilization fixes from main: - Fix cleanup verification timeouts with per-resource contexts - Fix cleanup ordering to delete owners before dependents - Increase poll intervals to avoid rate limiter exhaustion - Make log streaming errors non-fatal - Increase job completion timeout for CI environments - Make TestImagePrunerErrors flexible to registry behavior changes Adapt MachineOSBuild construction calls to use the NewMachineOSBuildFromAPIOrDie API available on this branch. Exclude TestStaleAnnotationClearedOnLayerOnlyChange as it requires a corresponding production fix not present on this branch. Makefile targets: - Remove: test-e2e-ocl (190m) - Add: test-e2e-ocl-1of2 (120m), test-e2e-ocl-2of2 (150m) Signed-off-by: Urvashi --- Makefile | 10 +- test/{e2e-ocl => e2e-ocl-1of2}/main_test.go | 2 +- test/e2e-ocl-1of2/onclusterlayering_test.go | 910 ++++++++ .../imagepruner_test.go | 26 +- .../layered_image_scaleup_test.go | 15 +- test/e2e-ocl-2of2/main_test.go | 16 + .../onclusterlayering_test.go | 2033 +++++++---------- .../Containerfile.cowsay | 0 .../Containerfile.entitled | 0 .../Containerfile.okd-fcos | 0 .../Containerfile.simple | 0 .../Containerfile.yum-repos-d | 0 test/e2e-ocl-shared/containerfiles.go | 30 + .../helpers.go} | 278 +-- 14 files changed, 1952 insertions(+), 1368 deletions(-) rename test/{e2e-ocl => e2e-ocl-1of2}/main_test.go (90%) create mode 100644 test/e2e-ocl-1of2/onclusterlayering_test.go rename test/{e2e-ocl => e2e-ocl-2of2}/imagepruner_test.go (95%) rename test/{e2e-ocl => e2e-ocl-2of2}/layered_image_scaleup_test.go (92%) create mode 100644 test/e2e-ocl-2of2/main_test.go rename test/{e2e-ocl => e2e-ocl-2of2}/onclusterlayering_test.go (71%) rename test/{e2e-ocl => e2e-ocl-shared}/Containerfile.cowsay (100%) rename test/{e2e-ocl => e2e-ocl-shared}/Containerfile.entitled (100%) rename test/{e2e-ocl => e2e-ocl-shared}/Containerfile.okd-fcos (100%) rename test/{e2e-ocl => e2e-ocl-shared}/Containerfile.simple (100%) rename test/{e2e-ocl => e2e-ocl-shared}/Containerfile.yum-repos-d (100%) create mode 100644 test/e2e-ocl-shared/containerfiles.go rename test/{e2e-ocl/helpers_test.go => e2e-ocl-shared/helpers.go} (77%) diff --git a/Makefile b/Makefile index ddeeccd680..a8d58fbbba 100644 --- a/Makefile +++ b/Makefile @@ -127,7 +127,7 @@ else endif # install-skopeo is purposely omitted from this target because it is only -# needed for a single test target (test-e2e-ocl). +# needed for the e2e-ocl test targets (test-e2e-ocl-1of2, test-e2e-ocl-2of2). install-tools: install-golangci-lint install-go-junit-report install-setup-envtest # Runs golangci-lint @@ -222,9 +222,11 @@ test-e2e-techpreview: install-go-junit-report test-e2e-single-node: install-go-junit-report set -o pipefail; go test -tags=$(GOTAGS) -failfast -timeout 120m -v$${WHAT:+ -run="$$WHAT"} ./test/e2e-single-node/ | ./hack/test-with-junit.sh $(@) -test-e2e-ocl: install-go-junit-report install-skopeo - # Temporarily include /tmp/skopeo/bin in our PATH variable so that the test suite can find skopeo. - set -o pipefail; PATH="$(PATH):/tmp/skopeo/bin" go test -tags=$(GOTAGS) -failfast -timeout 190m -v$${WHAT:+ -run="$$WHAT"} ./test/e2e-ocl/ | ./hack/test-with-junit.sh $(@) +test-e2e-ocl-1of2: install-go-junit-report install-skopeo + set -o pipefail; PATH="$(PATH):/tmp/skopeo/bin" go test -tags=$(GOTAGS) -failfast -timeout 120m -v$${WHAT:+ -run="$$WHAT"} ./test/e2e-ocl-1of2/ ./test/e2e-ocl-shared/ | ./hack/test-with-junit.sh $(@) + +test-e2e-ocl-2of2: install-go-junit-report install-skopeo + set -o pipefail; PATH="$(PATH):/tmp/skopeo/bin" go test -tags=$(GOTAGS) -failfast -timeout 150m -v$${WHAT:+ -run="$$WHAT"} ./test/e2e-ocl-2of2/ ./test/e2e-ocl-shared/ | ./hack/test-with-junit.sh $(@) test-e2e-iri: install-go-junit-report set -o pipefail; go test -tags=$(GOTAGS) -failfast -timeout 120m -v$${WHAT:+ -run="$$WHAT"} ./test/e2e-iri/ | ./hack/test-with-junit.sh $(@) diff --git a/test/e2e-ocl/main_test.go b/test/e2e-ocl-1of2/main_test.go similarity index 90% rename from test/e2e-ocl/main_test.go rename to test/e2e-ocl-1of2/main_test.go index e6f51b52b0..cc53198e46 100644 --- a/test/e2e-ocl/main_test.go +++ b/test/e2e-ocl-1of2/main_test.go @@ -1,4 +1,4 @@ -package e2e_ocl_test +package e2e_ocl_1of2_test import ( "flag" diff --git a/test/e2e-ocl-1of2/onclusterlayering_test.go b/test/e2e-ocl-1of2/onclusterlayering_test.go new file mode 100644 index 0000000000..db173f0489 --- /dev/null +++ b/test/e2e-ocl-1of2/onclusterlayering_test.go @@ -0,0 +1,910 @@ +package e2e_ocl_1of2_test + +import ( + "context" + "errors" + "flag" + "fmt" + "os" + "os/exec" + "path/filepath" + goruntime "runtime" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + + mcfgv1 "github.com/openshift/api/machineconfiguration/v1" + + "github.com/openshift/machine-config-operator/pkg/daemon/runtimeassets" + "github.com/openshift/machine-config-operator/test/framework" + "github.com/openshift/machine-config-operator/test/helpers" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/openshift/machine-config-operator/pkg/controller/build/buildrequest" + "github.com/openshift/machine-config-operator/pkg/controller/build/constants" + "github.com/openshift/machine-config-operator/pkg/controller/build/utils" + ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" + ocltesthelper "github.com/openshift/machine-config-operator/test/e2e-ocl-shared" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" +) + +const ( + // The MachineConfigPool to create for the tests. + layeredMCPName string = "layered" + + // The MachineConfig names to create for the tests. + mcNameUsbguard string = "inspect-usbguard" +) + +var skipCleanupAlways bool +var skipCleanupOnlyAfterFailure bool + +func init() { + // Skips running the cleanup functions. Useful for debugging tests. + flag.BoolVar(&skipCleanupAlways, "skip-cleanup", false, "Skips running cleanups regardless of outcome") + // Skips running the cleanup function only when the test fails. + flag.BoolVar(&skipCleanupOnlyAfterFailure, "skip-cleanup-on-failure", false, "Skips running cleanups only after failure") +} + +// Holds elements common for each on-cluster build tests. +type onClusterLayeringTestOpts struct { + // Which image builder type to use for the test. + imageBuilderType mcfgv1.MachineOSImageBuilderType + + // The custom Dockerfiles to use for the test. This is a map of MachineConfigPool name to Dockerfile content. + customDockerfiles map[string]string + + // What node should be targeted for the test. + targetNode *corev1.Node + + // What MachineConfigPool name to use for the test. + poolName string + + // Use RHEL entitlements + entitlementRequired bool + + // Inject YUM repo information from a Centos 9 stream container + useYumRepos bool + + // Apply the following MachineConfigs before beginning the build. + machineConfigs []*mcfgv1.MachineConfig +} + +// Tests that an on-cluster build can be performed with the Custom Pod Builder. +func TestOnClusterLayering(t *testing.T) { + _, firstMosb := runOnClusterLayeringTest(t, onClusterLayeringTestOpts{ + poolName: layeredMCPName, + customDockerfiles: map[string]string{ + layeredMCPName: ocltesthelper.CowsayDockerfile, + }, + }) + + assert.NotEqual(t, string(firstMosb.UID), "") + + // Test rebuild annotation works + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + + t.Logf("Applying rebuild annotation (%q) to MachineOSConfig (%q) to cause a rebuild", constants.RebuildMachineOSConfigAnnotationKey, layeredMCPName) + + cs := framework.NewClientSet("") + + mosc, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, layeredMCPName, metav1.GetOptions{}) + require.NoError(t, err) + + helpers.SetRebuildAnnotationOnMachineOSConfig(ctx, t, cs.GetMcfgclient(), mosc) + + // Use the UID of the previous MOSB to ensure it is deleted as the rebuild will trigger a MOSB with the same name + t.Logf("Waiting for the previous MachineOSBuild with UID %q to be deleted", firstMosb.UID) + waitForMOSBToBeDeleted(t, cs, firstMosb) + + // Wait for the build to start + secondMosb := waitForBuildToStartForPoolAndConfig(t, cs, layeredMCPName, mosc.Name) + assert.NotEqual(t, firstMosb.UID, secondMosb.UID) +} + +// Tests that an on-cluster build can be performed and that the resulting image +// is rolled out to an opted-in node. +func TestOnClusterBuildRollsOutImage(t *testing.T) { + requiredKernelType := ctrlcommon.KernelTypeRealtime + if goruntime.GOARCH == "arm64" { + requiredKernelType = ctrlcommon.KernelType64kPages + } + + imagePullspec, _ := runOnClusterLayeringTest(t, onClusterLayeringTestOpts{ + poolName: layeredMCPName, + customDockerfiles: map[string]string{ + layeredMCPName: ocltesthelper.CowsayDockerfile, + }, + machineConfigs: []*mcfgv1.MachineConfig{ + ocltesthelper.NewMachineConfigWithKernelType(fmt.Sprintf("%s-kernel-machineconfig", requiredKernelType), layeredMCPName, requiredKernelType), + }, + }) + + cs := framework.NewClientSet("") + node := helpers.GetRandomNode(t, cs, "worker") + + unlabelFunc := ocltesthelper.MakeIdempotentAndRegisterAlwaysRun(t, helpers.LabelNode(t, cs, node, helpers.MCPNameToRole(layeredMCPName))) + helpers.WaitForNodeImageChange(t, cs, node, imagePullspec) + + helpers.AssertNodeBootedIntoImage(t, cs, node, imagePullspec) + t.Logf("Node %s is booted into image %q", node.Name, imagePullspec) + t.Log(helpers.ExecCmdOnNode(t, cs, node, "chroot", "/rootfs", "cowsay", "Moo!")) + + // Check that the booted image has the requested kernel + foundKernel := helpers.ExecCmdOnNode(t, cs, node, "chroot", "/rootfs", "uname", "-r") + t.Logf("Node %s running kernel: %s", node.Name, foundKernel) + if !ocltesthelper.CompareKernelType(t, foundKernel, requiredKernelType) { + t.Fatalf("Kernel type requested %s, got %s", requiredKernelType, foundKernel) + } + + unlabelFunc() + + assertNodeRevertsToNonLayered(t, cs, node) + + // Check that the reverted image has the default kernel. + requiredKernelType = ctrlcommon.KernelTypeDefault + foundKernel = helpers.ExecCmdOnNode(t, cs, node, "chroot", "/rootfs", "uname", "-r") + t.Logf("Node %s running kernel: %s", node.Name, foundKernel) + if !ocltesthelper.CompareKernelType(t, foundKernel, requiredKernelType) { + t.Fatalf("Kernel type requested %s, got %s", requiredKernelType, foundKernel) + } +} + +func TestMissingImageIsRebuilt(t *testing.T) { + cs := framework.NewClientSet("") + + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + + kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx) + + firstImagePullspec, firstMOSB := runOnClusterLayeringTest(t, onClusterLayeringTestOpts{ + poolName: layeredMCPName, + customDockerfiles: map[string]string{ + layeredMCPName: ocltesthelper.CowsayDockerfile, + }, + }) + + moscName := layeredMCPName + t.Logf("Waiting for MachineOSConfig %q to have a new pullspec", moscName) + waitForMOSCToGetNewPullspec(ctx, t, cs, moscName, firstImagePullspec) + + // Create a MC to create another MOSB + testMC := ocltesthelper.NewMachineConfigTriggersImageRebuild(mcNameUsbguard, layeredMCPName, []string{"usbguard"}) + t.Logf("Creating MachineConfig %q", testMC.Name) + firstMC, err := cs.MachineConfigs().Create(ctx, testMC, metav1.CreateOptions{}) + require.NoError(t, err) + t.Logf("Created MachineConfig %q", firstMC.Name) + kubeassert.MachineConfigExists(firstMC) + + // Wait for the build to start + t.Logf("Waiting for 2nd build to start...") + secondMOSBName := waitForMOSCToUpdateCurrentMOSB(ctx, t, cs, moscName, firstMOSB.Name) + secondMOSB, err := cs.GetMcfgclient().MachineconfigurationV1().MachineOSBuilds().Get(ctx, secondMOSBName, metav1.GetOptions{}) + require.NoError(t, err) + secondMOSB = waitForBuildToStart(t, cs, secondMOSB) + t.Logf("MachineOSBuild %q has started", secondMOSB.Name) + + // Wait for the build to finish + t.Logf("Waiting for 2nd build completion...") + secondFinishedBuild := waitForBuildToComplete(t, cs, secondMOSB) + secondImagePullspec := string(secondFinishedBuild.Status.DigestedImagePushSpec) + t.Logf("MachineOSBuild %q has completed and produced image: %s", secondFinishedBuild.Name, secondImagePullspec) + waitForMOSCToGetNewPullspec(ctx, t, cs, moscName, secondImagePullspec) + + // Delete the first image- simulating image deletion + t.Logf("Deleting image %q", firstImagePullspec) + istName := fmt.Sprintf("os-image:%s", firstMOSB.Name) + err = cs.ImageStreamTags(ctrlcommon.MCONamespace).Delete(ctx, istName, metav1.DeleteOptions{}) + require.NoError(t, err) + kubeassert.ImageDoesNotExist(istName) + t.Logf("Deleted image %q", firstImagePullspec) + + // Delete the first MC + t.Logf("Deleting MachineConfig %q to retrigger build", firstMC.Name) + err = cs.MachineConfigs().Delete(ctx, firstMC.Name, metav1.DeleteOptions{}) + require.NoError(t, err) + kubeassert.MachineConfigDoesNotExist(firstMC) + t.Logf("Deleted MachineConfig %q", firstMC.Name) + + // Wait for the build to start + t.Logf("Waiting for 3rd build (rebuild of image1) to start...") + thirdMOSBName := waitForMOSCToUpdateCurrentMOSB(ctx, t, cs, moscName, secondMOSB.Name) + thirdMOSB, err := cs.GetMcfgclient().MachineconfigurationV1().MachineOSBuilds().Get(ctx, thirdMOSBName, metav1.GetOptions{}) + require.NoError(t, err) + thirdMOSB = waitForBuildToStart(t, cs, thirdMOSB) + t.Logf("MachineOSBuild %q has started (rebuild of image1)", thirdMOSB.Name) + + // Wait for the build to finish + t.Logf("Waiting for 3rd build completion...") + thirdFinishedBuild := waitForBuildToComplete(t, cs, thirdMOSB) + thirdImagePullspec := string(thirdFinishedBuild.Status.DigestedImagePushSpec) + t.Logf("MachineOSBuild %q has completed and produced image: %s", thirdFinishedBuild.Name, thirdImagePullspec) + waitForMOSCToGetNewPullspec(ctx, t, cs, moscName, thirdImagePullspec) + + // Apply the MC again + t.Logf("Re‐applying the same MachineConfig %q to confirm no new build for image2", testMC.Name) + secondMC, err := cs.MachineConfigs().Create(ctx, testMC, metav1.CreateOptions{}) + require.NoError(t, err) + kubeassert.MachineConfigExists(secondMC) + t.Logf("Created MachineConfig %q", secondMC.Name) + + // waitForMOSCToGetNewPullspec(ctx, t, cs, moscName, secondImagePullspec) + + t.Logf("Waiting for recycled USBGuard MOSB %q to finish (or to prove there is none)", secondMOSB.Name) + secondMOSB, err = cs.GetMcfgclient().MachineconfigurationV1().MachineOSBuilds().Get(ctx, secondMOSB.Name, metav1.GetOptions{}) + require.NoError(t, err) + secondMOSB = waitForBuildToComplete(t, cs, secondMOSB) + t.Logf("MOSB %q is now complete (reused image)", secondMOSB.Name) + + t.Logf("Deleting MachineOSBuild %q (MOSB3) to test pruning of image1", thirdMOSB.Name) + err = cs.MachineconfigurationV1Interface.MachineOSBuilds().Delete(ctx, thirdMOSB.Name, metav1.DeleteOptions{}) + require.NoError(t, err) + kubeassert.MachineOSBuildDoesNotExist(thirdMOSB) + t.Logf("Deleted MachineOSBuild %q", thirdMOSB.Name) + + deletedIst := fmt.Sprintf("os-image:%s", thirdMOSBName) + kubeassert.ImageDoesNotExist(deletedIst) + t.Logf("ImageStreamTag %q has been pruned", deletedIst) + + t.Logf("Deleting MachineConfig %q for cleanup", secondMC.Name) + err = cs.MachineConfigs().Delete(ctx, secondMC.Name, metav1.DeleteOptions{}) + require.NoError(t, err) + kubeassert.MachineConfigDoesNotExist(secondMC) + t.Logf("Deleted MachineConfig %q", secondMC.Name) +} + +// This test extracts the /etc/yum.repos.d and /etc/pki/rpm-gpg content from a +// Centos Stream 9 image and injects them into the MCO namespace. It then +// performs a build with the expectation that these artifacts will be used, +// simulating a build where someone has added this content; usually a Red Hat +// Satellite user. +func TestYumReposBuilds(t *testing.T) { + // Skipping this test as it is having a package conflict issue unrelated to MCO + t.Skip() + runOnClusterLayeringTest(t, onClusterLayeringTestOpts{ + poolName: layeredMCPName, + customDockerfiles: map[string]string{ + layeredMCPName: ocltesthelper.YumReposDockerfile, + }, + useYumRepos: true, + }) +} + +// Then performs an on-cluster layering build which should consume the +// etc-pki-entitlement certificates. +func TestEntitledBuilds(t *testing.T) { + ocltesthelper.SkipOnOKD(t) + + runOnClusterLayeringTest(t, onClusterLayeringTestOpts{ + poolName: layeredMCPName, + customDockerfiles: map[string]string{ + layeredMCPName: ocltesthelper.EntitledDockerfile, + }, + entitlementRequired: true, + }) +} + +// This test verifies that if a change is made to a given MachineOSConfig, that +// any in-progress builds are terminated and that only the latest change is +// being built. +func TestMachineOSConfigChangeRestartsBuild(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + + cs := framework.NewClientSet("") + + mosc := prepareForOnClusterLayeringTest(t, cs, onClusterLayeringTestOpts{ + poolName: layeredMCPName, + customDockerfiles: map[string]string{ + layeredMCPName: ocltesthelper.CowsayDockerfile, + }, + }) + + createMachineOSConfig(t, cs, mosc) + + mcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, layeredMCPName, metav1.GetOptions{}) + require.NoError(t, err) + + firstMosb := buildrequest.NewMachineOSBuildFromAPIOrDie(ctx, cs.GetKubeclient(), mosc, mcp) + + // First, we get a MachineOSBuild started as usual. + waitForBuildToStart(t, cs, firstMosb) + + // Next, we update the Containerfile. + t.Logf("Initial build has started, updating Containerfile...") + + apiMosc := helpers.SetContainerfileContentsOnMachineOSConfig(ctx, t, cs.GetMcfgclient(), mosc, "FROM configs AS final\nRUN echo 'hello' > /etc/hello") + + moscChangeMosb := buildrequest.NewMachineOSBuildFromAPIOrDie(ctx, cs.GetKubeclient(), apiMosc, mcp) + + kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx) + + assertBuildObjectsAreCreated(t, kubeassert, firstMosb) + + t.Logf("Containerfile is updated, waiting for new build %s to start", moscChangeMosb.Name) + + // Wait for the second build to start. + waitForBuildToStart(t, cs, moscChangeMosb) + + t.Logf("Waiting for initial MachineOSBuild %s to be deleted", firstMosb.Name) + // Wait for the first build to be deleted. + waitForBuildToBeDeleted(t, cs, firstMosb) + + // Ensure that the second build still exists. + _, err = cs.MachineconfigurationV1Interface.MachineOSBuilds().Get(context.TODO(), moscChangeMosb.Name, metav1.GetOptions{}) + require.NoError(t, err) +} + +// This test verifies that a change to the MachineConfigPool, such as the +// presence of a new rendered MachineConfig, will halt the currently running +// build, replacing it with a new build instead. +func TestMachineConfigPoolChangeRestartsBuild(t *testing.T) { + cs := framework.NewClientSet("") + + mosc := prepareForOnClusterLayeringTest(t, cs, onClusterLayeringTestOpts{ + poolName: layeredMCPName, + customDockerfiles: map[string]string{ + layeredMCPName: ocltesthelper.CowsayDockerfile, + }, + }) + + createMachineOSConfig(t, cs, mosc) + + // Wait for the first build to start. + firstMosb := waitForBuildToStartForPoolAndConfig(t, cs, layeredMCPName, mosc.Name) + + // Once the first build has started, we create a new MachineConfig, wait for + // the rendered config to appear, then we check that a new MachineOSBuild has + // started for that new MachineConfig. + mcName := "new-machineconfig" + mc := ocltesthelper.NewMachineConfigTriggersImageRebuild(mcName, layeredMCPName, []string{"usbguard"}) + applyMC(t, cs, mc) + + _, err := helpers.WaitForRenderedConfig(t, cs, layeredMCPName, mcName) + require.NoError(t, err) + + // We wait for the first build to be deleted. + waitForBuildToBeDeleted(t, cs, firstMosb) + + // Next, we wait for the new build to be started. + secondMosb := waitForBuildToStartForPoolAndConfig(t, cs, layeredMCPName, mosc.Name) + + _, err = cs.MachineconfigurationV1Interface.MachineOSBuilds().Get(context.TODO(), secondMosb.Name, metav1.GetOptions{}) + require.NoError(t, err) +} + +// This test verifies that if the rebuild annotation is added to a given MachineOSConfig, that +// the build is restarted +func TestRebuildAnnotationRestartsBuild(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + + cs := framework.NewClientSet("") + + mosc := prepareForOnClusterLayeringTest(t, cs, onClusterLayeringTestOpts{ + poolName: layeredMCPName, + customDockerfiles: map[string]string{ + layeredMCPName: ocltesthelper.CowsayDockerfile, + }, + }) + + createMachineOSConfig(t, cs, mosc) + + // First, we get a MachineOSBuild started as usual. + firstMosb := waitForBuildToStartForPoolAndConfig(t, cs, layeredMCPName, mosc.Name) + + assert.NotEqual(t, string(firstMosb.UID), "") + + kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx) + assertBuildObjectsAreCreated(t, kubeassert, firstMosb) + + firstJob, err := cs.BatchV1Interface.Jobs(ctrlcommon.MCONamespace).Get(ctx, utils.GetBuildJobName(firstMosb), metav1.GetOptions{}) + require.NoError(t, err) + + pod, err := ocltesthelper.GetPodFromJob(ctx, cs, utils.GetBuildJobName(firstMosb)) + require.NoError(t, err) + t.Logf("Initial build has started, delete the job to interrupt the build...") + // Delete the builder + bgDeletion := metav1.DeletePropagationBackground + err = cs.BatchV1Interface.Jobs(ctrlcommon.MCONamespace).Delete(ctx, utils.GetBuildJobName(firstMosb), metav1.DeleteOptions{PropagationPolicy: &bgDeletion}) + require.NoError(t, err) + + // Wait for the build to be interrupted. + waitForBuildToBeInterrupted(t, cs, firstMosb) + + // Wait for the job and pod to be deleted. + kubeassert.Eventually().JobDoesNotExist(utils.GetBuildJobName(firstMosb)) + kubeassert.Eventually().PodDoesNotExist(pod.Name) + + t.Logf("Add rebuild annotation to the MOSC...") + helpers.SetRebuildAnnotationOnMachineOSConfig(ctx, t, cs.GetMcfgclient(), mosc) + + // Wait for the MOSB to be deleted + t.Logf("Waiting for MachineOSBuild with UID %s to be deleted", firstMosb.UID) + waitForMOSBToBeDeleted(t, cs, firstMosb) + + t.Logf("Annotation is updated, waiting for new build %s to start", firstMosb.Name) + // Wait for the build to start. + secondMosb := waitForBuildToStart(t, cs, firstMosb) + + secondJob, err := cs.BatchV1Interface.Jobs(ctrlcommon.MCONamespace).Get(ctx, utils.GetBuildJobName(secondMosb), metav1.GetOptions{}) + require.NoError(t, err) + + // Ensure that the names are the same, but that the first and second + // MachineOSBuilds have different UIDs. + assert.Equal(t, firstMosb.Name, secondMosb.Name) + assert.NotEqual(t, firstMosb.UID, secondMosb.UID) + + // Ensure that the build jobs have also changed. + assert.Equal(t, firstJob.Name, secondJob.Name) + assert.NotEqual(t, firstJob.UID, secondJob.UID) +} + +func assertNodeRevertsToNonLayered(t *testing.T, cs *framework.ClientSet, node corev1.Node) { + workerMCName := helpers.GetMcName(t, cs, "worker") + workerMC, err := cs.MachineConfigs().Get(context.TODO(), workerMCName, metav1.GetOptions{}) + require.NoError(t, err) + + helpers.WaitForNodeConfigAndImageChange(t, cs, node, workerMCName, "") + + helpers.AssertNodeBootedIntoImage(t, cs, node, workerMC.Spec.OSImageURL) + t.Logf("Node %s has reverted to OS image %q", node.Name, workerMC.Spec.OSImageURL) + + helpers.AssertFileNotOnNode(t, cs, node, filepath.Join("/etc/systemd/system", runtimeassets.RevertServiceName)) + helpers.AssertFileNotOnNode(t, cs, node, runtimeassets.RevertServiceMachineConfigFile) +} + +func assertBuildObjectsAreCreated(t *testing.T, kubeassert *helpers.Assertions, mosb *mcfgv1.MachineOSBuild) { + t.Helper() + + kubeassert.JobExists(utils.GetBuildJobName(mosb)) + kubeassert.ConfigMapExists(utils.GetContainerfileConfigMapName(mosb)) + kubeassert.ConfigMapExists(utils.GetMCConfigMapName(mosb)) + kubeassert.ConfigMapExists(utils.GetEtcPolicyConfigMapName(mosb)) + kubeassert.ConfigMapExists(utils.GetEtcRegistriesConfigMapName(mosb)) + kubeassert.SecretExists(utils.GetBasePullSecretName(mosb)) + kubeassert.SecretExists(utils.GetFinalPushSecretName(mosb)) + + // Check that ownerReferences are set as well + kubeassert.ConfigMapHasOwnerSet(utils.GetContainerfileConfigMapName(mosb)) + kubeassert.ConfigMapHasOwnerSet(utils.GetMCConfigMapName(mosb)) + kubeassert.ConfigMapHasOwnerSet(utils.GetEtcPolicyConfigMapName(mosb)) + kubeassert.ConfigMapHasOwnerSet(utils.GetEtcRegistriesConfigMapName(mosb)) + kubeassert.SecretHasOwnerSet(utils.GetBasePullSecretName(mosb)) + kubeassert.SecretHasOwnerSet(utils.GetFinalPushSecretName(mosb)) +} + +func assertBuildObjectsAreDeleted(t *testing.T, kubeassert *helpers.Assertions, mosb *mcfgv1.MachineOSBuild) { + t.Helper() + + kubeassert.JobDoesNotExist(utils.GetBuildJobName(mosb)) + kubeassert.ConfigMapDoesNotExist(utils.GetContainerfileConfigMapName(mosb)) + kubeassert.ConfigMapDoesNotExist(utils.GetMCConfigMapName(mosb)) + kubeassert.ConfigMapDoesNotExist(utils.GetEtcPolicyConfigMapName(mosb)) + kubeassert.ConfigMapDoesNotExist(utils.GetEtcRegistriesConfigMapName(mosb)) + kubeassert.SecretDoesNotExist(utils.GetBasePullSecretName(mosb)) + kubeassert.SecretDoesNotExist(utils.GetFinalPushSecretName(mosb)) +} + +// Sets up and performs an on-cluster build for a given set of parameters. +// Returns the built image pullspec for later consumption. +func runOnClusterLayeringTest(t *testing.T, testOpts onClusterLayeringTestOpts) (string, *mcfgv1.MachineOSBuild) { + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + + cs := framework.NewClientSet("") + + imageBuilder := testOpts.imageBuilderType + if testOpts.imageBuilderType == "" { + imageBuilder = mcfgv1.JobBuilder + } + + t.Logf("Running with ImageBuilder type: %s", imageBuilder) + + mosc := prepareForOnClusterLayeringTest(t, cs, testOpts) + + // Create our MachineOSConfig. + createMachineOSConfig(t, cs, mosc) + + // Create a child context for the machine-os-builder pod log streamer. We + // create it here because we want the cancellation to run before the + // MachineOSConfig object is removed. + mobPodStreamerCtx, mobPodStreamerCancel := context.WithCancel(ctx) + t.Cleanup(mobPodStreamerCancel) + + // Wait for the build to start + startedBuild := waitForBuildToStartForPoolAndConfig(t, cs, testOpts.poolName, mosc.Name) + t.Logf("MachineOSBuild %q has started", startedBuild.Name) + + t.Logf("Waiting for build completion...") + + // Create a child context for the build pod log streamer. This is so we can + // cancel it independently of the parent context or the context for the + // machine-os-build pod watcher (which has its own separate context). + buildPodStreamerCtx, buildPodStreamerCancel := context.WithCancel(ctx) + + // We wire this to both t.Cleanup() as well as defer because we want to + // cancel this context either at the end of this function or when the test + // fails, whichever comes first. + buildPodWatcherShutdown := ocltesthelper.MakeIdempotentAndRegisterAlwaysRun(t, buildPodStreamerCancel) + defer buildPodWatcherShutdown() + + dirPath := ocltesthelper.GetBuildArtifactDir(t) + + podLogsDirPath := filepath.Join(dirPath, "pod-logs") + require.NoError(t, os.MkdirAll(podLogsDirPath, 0o755)) + + // In the event of a test failure, we want to dump all of the build artifacts + // to files for easy reference later. + t.Cleanup(func() { + if t.Failed() { + ocltesthelper.WriteBuildArtifactsToFiles(t, cs) + } + }) + + // The pod log collection blocks the main Goroutine since we follow the logs + // for each container in the build pod. So they must run in a separate + // Goroutine so that the rest of the test can continue. + go func() { + err := ocltesthelper.StreamBuildPodLogsToFile(buildPodStreamerCtx, t, cs, startedBuild, podLogsDirPath) + if err != nil && !errors.Is(err, context.Canceled) { + t.Logf("Warning: failed to stream build pod logs: %v", err) + } + }() + + // We also want to collect logs from the machine-os-builder pod since they + // can provide a valuable window in how / why a test failed. As mentioned + // above, we need to run this in a separate Goroutine so that the test is not + // blocked. + go func() { + err := ocltesthelper.StreamMachineOSBuilderPodLogsToFile(mobPodStreamerCtx, t, cs, podLogsDirPath) + if err != nil && !errors.Is(err, context.Canceled) { + t.Logf("Warning: failed to stream machine-os-builder pod logs: %v", err) + } + }() + + // Wait for the build to complete. + finishedBuild := waitForBuildToComplete(t, cs, startedBuild) + + t.Logf("MachineOSBuild %q has completed and produced image: %s", finishedBuild.Name, finishedBuild.Status.DigestedImagePushSpec) + + require.NoError(t, archiveBuildPodLogs(t, podLogsDirPath)) + + return string(finishedBuild.Status.DigestedImagePushSpec), startedBuild +} + +func archiveBuildPodLogs(t *testing.T, podLogsDirPath string) error { + archiveName := fmt.Sprintf("%s-pod-logs.tar.gz", helpers.SanitizeTestName(t)) + + archive, err := helpers.NewArtifactArchive(t, archiveName) + if err != nil { + return err + } + + cmd := exec.Command("mv", podLogsDirPath, archive.StagingDir()) + output, err := cmd.CombinedOutput() + if err != nil { + t.Log(string(output)) + return err + } + + return archive.WriteArchive() +} + +// Waits for the build to start and returns the started MachineOSBuild object. +func waitForBuildToStartForPoolAndConfig(t *testing.T, cs *framework.ClientSet, poolName, moscName string) *mcfgv1.MachineOSBuild { + t.Helper() + + var mosbName string + + require.NoError(t, wait.PollImmediate(2*time.Second, 3*time.Minute, func() (bool, error) { + // Get the name for the MachineOSBuild based upon the MachineConfigPool and MachineOSConfig state. + name, err := ocltesthelper.GetMachineOSBuildNameForPool(cs, poolName, moscName) + if err != nil { + return false, nil + } + + mosbName = name + return true, nil + })) + + // Create a "dummy" MachineOSBuild object with just the name field set so + // that waitForMachineOSBuildToReachState() can use it. + mosb := &mcfgv1.MachineOSBuild{ + ObjectMeta: metav1.ObjectMeta{ + Name: mosbName, + }, + } + + return waitForBuildToStart(t, cs, mosb) +} + +// Waits for a MachineOSBuild to start building. +func waitForBuildToStart(t *testing.T, cs *framework.ClientSet, build *mcfgv1.MachineOSBuild) *mcfgv1.MachineOSBuild { + t.Helper() + + t.Logf("Waiting for MachineOSBuild %s to start", build.Name) + + ctx, cancel := context.WithTimeout(context.Background(), time.Minute*5) + defer cancel() + + start := time.Now() + + kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx) + kubeassert.Eventually().MachineOSBuildExists(build) + t.Logf("MachineOSBuild %s created after %s", build.Name, time.Since(start)) + kubeassert.Eventually().MachineOSBuildIsRunning(build) + t.Logf("MachineOSBuild %s running after %s", build.Name, time.Since(start)) + + // Get the job for the MOSB created by comparing the job UID with the MOSB annotation + buildJobName, err := ocltesthelper.GetJobForMOSB(ctx, cs, build) + require.NoError(t, err) + kubeassert.Eventually().JobExists(buildJobName) + t.Logf("Build job %s created after %s", buildJobName, time.Since(start)) + // Get the pod created by the job + buildPod, err := ocltesthelper.GetPodFromJob(ctx, cs, buildJobName) + require.NoError(t, err) + kubeassert.Eventually().PodIsRunning(buildPod.Name) + t.Logf("Build pod %s running after %s", buildPod.Name, time.Since(start)) + kubeassert.Eventually().PodHasOwnerSet(buildPod.Name) + t.Logf("Build pod %s has owner set after %s", buildPod.Name, time.Since(start)) + + mosb, err := cs.MachineconfigurationV1Interface.MachineOSBuilds().Get(ctx, build.Name, metav1.GetOptions{}) + require.NoError(t, err) + + assertBuildObjectsAreCreated(t, kubeassert.Eventually(), mosb) + t.Logf("Build objects created after %s", time.Since(start)) + + return mosb +} + +// Waits for a MachineOSBuild with a specific UID to be deleted. +func waitForMOSBToBeDeleted(t *testing.T, cs *framework.ClientSet, mosb *mcfgv1.MachineOSBuild) { + t.Helper() + + ctx, cancel := context.WithTimeout(context.Background(), time.Minute*5) + defer cancel() + + start := time.Now() + + // If the given MachineOSBuild does not have a UID, e.g., from the + // NewMachineOSBuildFromAPIOrDie() helper, then we query the API server to + // find it. + if mosb.UID == "" { + t.Logf("No UID provided for MachineOSBuild %s, querying API for UID", mosb.Name) + // Get the MOSB from the API to get the UID + apiMosb, err := cs.MachineconfigurationV1Interface.MachineOSBuilds().Get(context.Background(), mosb.Name, metav1.GetOptions{}) + require.NoError(t, err) + + if k8serrors.IsNotFound(err) { + t.Logf("MachineOSBuild %s is not found, must have already been deleted", mosb.Name) + return + } + + require.NoError(t, err) + + mosb = apiMosb + } + + mosbName := mosb.Name + mosbUID := mosb.UID + + t.Logf("Waiting for MachineOSBuild %s with UID %s to be deleted", mosbName, mosbUID) + + // Assert does not adequately handle the case where the object is deleted. + // See https://issues.redhat.com/browse/OCPBUGS-63048 for details. + err := wait.PollImmediate(time.Second, time.Minute*5, func() (bool, error) { + mosbs, err := cs.MachineconfigurationV1Interface.MachineOSBuilds().List(ctx, metav1.ListOptions{}) + if err != nil { + return false, err + } + + for _, mosb := range mosbs.Items { + // If we find a MachineOSBuild with the same name and UID, then we know + // it has not been deleted yet. + if mosb.Name == mosbName && mosb.UID == mosbUID { + return false, nil + } + } + + return true, nil + }) + + t.Logf("MachineOSBuild %s with UID %s deleted after %s", mosb.Name, mosb.UID, time.Since(start)) + + require.NoError(t, err, "MachineOSBuild %s with UID %s not deleted after %s", mosb.Name, mosb.UID, time.Since(start)) +} + +// Waits for a MachineOSBuild to be deleted. This is different than +// waitForMOSBToBeDeleted since it then asserts that all of the objects +// associated with the MOSB are deleted. +func waitForBuildToBeDeleted(t *testing.T, cs *framework.ClientSet, build *mcfgv1.MachineOSBuild) { + t.Helper() + + ctx, cancel := context.WithTimeout(context.Background(), time.Minute*5) + defer cancel() + + kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx) + + t.Logf("Waiting for MachineOSBuild %s to be deleted", build.Name) + + start := time.Now() + + waitForMOSBToBeDeleted(t, cs, build) + + assertBuildObjectsAreDeleted(t, kubeassert.Eventually(), build) + t.Logf("Build objects deleted after %s", time.Since(start)) +} + +// Waits for the given MachineOSBuild to complete and returns the completed +// MachineOSBuild object. +func waitForBuildToComplete(t *testing.T, cs *framework.ClientSet, startedBuild *mcfgv1.MachineOSBuild) *mcfgv1.MachineOSBuild { + t.Helper() + + t.Logf("Waiting for MachineOSBuild %s to complete", startedBuild.Name) + + ctx, cancel := context.WithTimeout(context.Background(), time.Minute*20) + defer cancel() + + start := time.Now() + + kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx) + kubeassert.Eventually().MachineOSBuildIsSuccessful(startedBuild) + t.Logf("MachineOSBuild %s successful after %s", startedBuild.Name, time.Since(start)) + assertBuildObjectsAreDeleted(t, kubeassert.Eventually(), startedBuild) + t.Logf("Build objects deleted after %s", time.Since(start)) + + mosb, err := cs.MachineconfigurationV1Interface.MachineOSBuilds().Get(ctx, startedBuild.Name, metav1.GetOptions{}) + require.NoError(t, err) + + return mosb +} + +func waitForBuildToBeInterrupted(t *testing.T, cs *framework.ClientSet, startedBuild *mcfgv1.MachineOSBuild) *mcfgv1.MachineOSBuild { + t.Helper() + + t.Logf("Waiting for MachineOSBuild %s to be interrupted", startedBuild.Name) + + ctx, cancel := context.WithTimeout(context.Background(), time.Minute*5) + defer cancel() + + start := time.Now() + + kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx) + kubeassert.Eventually().MachineOSBuildIsInterrupted(startedBuild) + t.Logf("MachineOSBuild %s interrupted after %s", startedBuild.Name, time.Since(start)) + + mosb, err := cs.MachineconfigurationV1Interface.MachineOSBuilds().Get(ctx, startedBuild.Name, metav1.GetOptions{}) + require.NoError(t, err) + + return mosb +} + +// Prepares for an on-cluster build test by performing the following: +// - Gets the Docker Builder secret name from the MCO namespace. +// - Creates the imagestream to use for the test. +// - Clones the global pull secret into the MCO namespace. +// - If requested, clones the RHEL entitlement secret into the MCO namespace. +// - Creates the on-cluster-build-config ConfigMap. +// - Creates the target MachineConfigPool and waits for it to get a rendered config. +// - Creates the on-cluster-build-custom-dockerfile ConfigMap. +// +// Each of the object creation steps registers an idempotent cleanup function +// that will delete the object at the end of the test. +// +// Returns a MachineOSConfig object for the caller to create to begin the build +// process. +func prepareForOnClusterLayeringTest(t *testing.T, cs *framework.ClientSet, testOpts onClusterLayeringTestOpts) *mcfgv1.MachineOSConfig { + // If the test requires RHEL entitlements, ensure they are present + // in the test cluster. If not found, the test is skipped. + if testOpts.entitlementRequired { + ocltesthelper.SkipIfEntitlementNotPresent(t, cs) + } + + // If the test requires /etc/yum.repos.d and /etc/pki/rpm-gpg, pull a Centos + // Stream 9 container image and populate them from there. This is intended to + // emulate the Red Hat Satellite enablement process, but does not actually + // require any Red Hat Satellite creds to work. + if testOpts.useYumRepos { + ocltesthelper.InjectYumRepos(t, cs, skipCleanupAlways, skipCleanupOnlyAfterFailure) + } + + // Register ephemeral object cleanup function. + ocltesthelper.MakeIdempotentAndRegister(t, skipCleanupAlways, skipCleanupOnlyAfterFailure, func() { + ocltesthelper.CleanupEphemeralBuildObjects(t, cs) + }) + + imagestreamObjMeta := metav1.ObjectMeta{ + Name: "os-image", + } + + pushSecretName, finalPullspec, _ := ocltesthelper.SetupImageStream(t, cs, imagestreamObjMeta, skipCleanupAlways, skipCleanupOnlyAfterFailure) + + if testOpts.targetNode != nil { + ocltesthelper.MakeIdempotentAndRegister(t, skipCleanupAlways, skipCleanupOnlyAfterFailure, helpers.CreatePoolWithNode(t, cs, testOpts.poolName, *testOpts.targetNode)) + } else { + ocltesthelper.MakeIdempotentAndRegister(t, skipCleanupAlways, skipCleanupOnlyAfterFailure, helpers.CreateMCP(t, cs, testOpts.poolName)) + } + + mcNames := []string{"00-worker"} + if len(testOpts.machineConfigs) > 0 { + for _, mc := range testOpts.machineConfigs { + ocltesthelper.MakeIdempotentAndRegister(t, skipCleanupAlways, skipCleanupOnlyAfterFailure, helpers.ApplyMC(t, cs, mc)) + mcNames = append(mcNames, mc.Name) + } + } + + _, err := helpers.WaitForRenderedConfigs(t, cs, testOpts.poolName, mcNames...) + require.NoError(t, err) + + mosc := &mcfgv1.MachineOSConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: testOpts.poolName, + }, + Spec: mcfgv1.MachineOSConfigSpec{ + MachineConfigPool: mcfgv1.MachineConfigPoolReference{ + Name: testOpts.poolName, + }, + RenderedImagePushSecret: mcfgv1.ImageSecretObjectReference{ + Name: pushSecretName, + }, + RenderedImagePushSpec: mcfgv1.ImageTagFormat(finalPullspec), + ImageBuilder: mcfgv1.MachineOSImageBuilder{ + ImageBuilderType: mcfgv1.JobBuilder, + }, + Containerfile: []mcfgv1.MachineOSContainerfile{ + { + ContainerfileArch: mcfgv1.NoArch, + Content: testOpts.customDockerfiles[testOpts.poolName], + }, + }, + }, + } + + helpers.SetMetadataOnObject(t, mosc) + + return mosc +} + +func createMachineOSConfig(t *testing.T, cs *framework.ClientSet, mosc *mcfgv1.MachineOSConfig) { + ocltesthelper.CreateMachineOSConfig(t, cs, mosc, skipCleanupAlways, skipCleanupOnlyAfterFailure) +} + +func applyMC(t *testing.T, cs *framework.ClientSet, mc *mcfgv1.MachineConfig) func() { + return ocltesthelper.ApplyMC(t, cs, mc, skipCleanupAlways, skipCleanupOnlyAfterFailure) +} + +func waitForMOSCToGetNewPullspec(ctx context.Context, t *testing.T, cs *framework.ClientSet, moscName, pullspec string) { + require.NoError(t, wait.PollImmediate(1*time.Second, 5*time.Minute, func() (bool, error) { + mosc, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, moscName, metav1.GetOptions{}) + if err != nil { + return false, err + } + + return mosc.Status.CurrentImagePullSpec != "" && string(mosc.Status.CurrentImagePullSpec) == pullspec, nil + })) +} + +func waitForMOSCToUpdateCurrentMOSB(ctx context.Context, t *testing.T, cs *framework.ClientSet, moscName, mosbName string) string { + var currentMOSB string + require.NoError(t, wait.PollImmediate(1*time.Second, 5*time.Minute, func() (bool, error) { + mosc, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, moscName, metav1.GetOptions{}) + if err != nil { + return false, err + } + + currentMOSB = mosc.GetAnnotations()[constants.CurrentMachineOSBuildAnnotationKey] + // Wait for annotation to be non-empty AND different from the old MOSB. + // The stale annotation fix may temporarily clear the annotation, so we need + // to wait for the new MOSB to be created and annotation set. + return currentMOSB != "" && currentMOSB != mosbName, nil + + })) + return currentMOSB +} diff --git a/test/e2e-ocl/imagepruner_test.go b/test/e2e-ocl-2of2/imagepruner_test.go similarity index 95% rename from test/e2e-ocl/imagepruner_test.go rename to test/e2e-ocl-2of2/imagepruner_test.go index b32eb66fcd..a7ba311a9c 100644 --- a/test/e2e-ocl/imagepruner_test.go +++ b/test/e2e-ocl-2of2/imagepruner_test.go @@ -1,4 +1,4 @@ -package e2e_ocl_test +package e2e_ocl_2of2_test import ( "context" @@ -34,6 +34,8 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + + e2e_ocl_shared_test "github.com/openshift/machine-config-operator/test/e2e-ocl-shared" ) // Used by TestImagePruner only when flags are passed. @@ -120,7 +122,7 @@ func TestImagePrunerOnCluster(t *testing.T) { }) // Set up the imagestream for this test. - pushSecretName, pullspec, cleanupFunc := setupImageStream(t, cs, metav1.ObjectMeta{Name: "imagepruner", Namespace: ctrlcommon.MCONamespace}) + pushSecretName, pullspec, cleanupFunc := e2e_ocl_shared_test.SetupImageStream(t, cs, metav1.ObjectMeta{Name: "imagepruner", Namespace: ctrlcommon.MCONamespace}, false, false) t.Cleanup(cleanupFunc) // Parse the internal registry hostname from the image pullspec. @@ -340,6 +342,8 @@ func TestImagePrunerErrors(t *testing.T) { inspect: operation{ nonexistentRepo: expectedErr{ accessDenied: true, + // Docker.io behavior varies - accept either error type + imageNotFound: true, }, nonexistentTag: expectedErr{ imageNotFound: true, @@ -390,6 +394,8 @@ func TestImagePrunerErrors(t *testing.T) { }, nonexistentTag: expectedErr{ accessDenied: true, + // GitHub registry behavior varies - accept either error type + imageNotFound: true, }, nonexistentDigest: expectedErr{ accessDenied: true, @@ -450,6 +456,11 @@ func TestImagePrunerErrors(t *testing.T) { _, imgDigest, err := ip.InspectImage(ctx, pullspec, k8sSecret, &mcfgv1.ControllerConfig{}) if !expected.imageNotFound && !expected.accessDenied { require.NoError(t, err) + } else if expected.imageNotFound && expected.accessDenied { + // Both flags set means accept either error type (registries may vary) + isImageNotFound := imagepruner.IsImageNotFoundErr(err) + isAccessDenied := imagepruner.IsAccessDeniedErr(err) + assert.True(t, isImageNotFound || isAccessDenied, "expected either imageNotFound or accessDenied error, got: %v", err) } else { assert.Equal(t, expected.imageNotFound, imagepruner.IsImageNotFoundErr(err), "image not found error should be %v", !expected.imageNotFound) assert.Equal(t, expected.accessDenied, imagepruner.IsAccessDeniedErr(err), "access denied error should be %v", !expected.accessDenied) @@ -471,8 +482,15 @@ func TestImagePrunerErrors(t *testing.T) { // We should always get an error back for this test because we do not have // permissions to delete images. assert.Error(t, err) - assert.Equal(t, expected.imageNotFound, imagepruner.IsImageNotFoundErr(err), "image not found error should be %v", !expected.imageNotFound) - assert.Equal(t, expected.accessDenied, imagepruner.IsAccessDeniedErr(err), "access denied error should be %v", !expected.accessDenied) + if expected.imageNotFound && expected.accessDenied { + // Both flags set means accept either error type (registries may vary) + isImageNotFound := imagepruner.IsImageNotFoundErr(err) + isAccessDenied := imagepruner.IsAccessDeniedErr(err) + assert.True(t, isImageNotFound || isAccessDenied, "expected either imageNotFound or accessDenied error, got: %v", err) + } else { + assert.Equal(t, expected.imageNotFound, imagepruner.IsImageNotFoundErr(err), "image not found error should be %v", !expected.imageNotFound) + assert.Equal(t, expected.accessDenied, imagepruner.IsAccessDeniedErr(err), "access denied error should be %v", !expected.accessDenied) + } assert.True(t, imagepruner.IsTolerableDeleteErr(err)) return err diff --git a/test/e2e-ocl/layered_image_scaleup_test.go b/test/e2e-ocl-2of2/layered_image_scaleup_test.go similarity index 92% rename from test/e2e-ocl/layered_image_scaleup_test.go rename to test/e2e-ocl-2of2/layered_image_scaleup_test.go index cd05427a89..57da9b6387 100644 --- a/test/e2e-ocl/layered_image_scaleup_test.go +++ b/test/e2e-ocl-2of2/layered_image_scaleup_test.go @@ -1,17 +1,20 @@ -package e2e_ocl_test +package e2e_ocl_2of2_test import ( "context" "testing" "time" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + machineclientset "github.com/openshift/client-go/machine/clientset/versioned" "github.com/openshift/machine-config-operator/pkg/daemon/constants" "github.com/openshift/machine-config-operator/test/framework" "github.com/openshift/machine-config-operator/test/helpers" "github.com/stretchr/testify/require" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/wait" + + e2e_ocl_shared_test "github.com/openshift/machine-config-operator/test/e2e-ocl-shared" ) // TestLayeredImageServingDuringNodeScaleUp tests the 1-reboot vs 2-reboot behavior when scaling up nodes @@ -41,7 +44,7 @@ func TestLayeredImageServingDuringNodeScaleUp(t *testing.T) { imagePullspec, _ := runOnClusterLayeringTest(t, onClusterLayeringTestOpts{ poolName: layeredMCPName, customDockerfiles: map[string]string{ - layeredMCPName: cowsayDockerfile, + layeredMCPName: e2e_ocl_shared_test.CowsayDockerfile, }, }) @@ -53,7 +56,7 @@ func TestLayeredImageServingDuringNodeScaleUp(t *testing.T) { t.Logf("Selected node %s to opt into layered pool", existingNode.Name) // Label the node to add it to the layered pool - unlabelFunc := makeIdempotentAndRegisterAlwaysRun(t, helpers.LabelNode(t, cs, existingNode, helpers.MCPNameToRole(layeredMCPName))) + unlabelFunc := e2e_ocl_shared_test.MakeIdempotentAndRegisterAlwaysRun(t, helpers.LabelNode(t, cs, existingNode, helpers.MCPNameToRole(layeredMCPName))) // Step 3: Wait for the existing node to adopt the layered image t.Logf("Waiting for existing node %s to adopt layered image", existingNode.Name) @@ -97,7 +100,7 @@ func TestLayeredImageServingDuringNodeScaleUp(t *testing.T) { t.Logf("New node %s has been created and is ready", newNode.Name) // Label the new node to add it to the layered pool - newNodeUnlabelFunc := makeIdempotentAndRegisterAlwaysRun(t, helpers.LabelNode(t, cs, *newNode, helpers.MCPNameToRole(layeredMCPName))) + newNodeUnlabelFunc := e2e_ocl_shared_test.MakeIdempotentAndRegisterAlwaysRun(t, helpers.LabelNode(t, cs, *newNode, helpers.MCPNameToRole(layeredMCPName))) // Step 5: Verify the new node gets the layered image t.Logf("Waiting for new node %s to adopt layered image during bootstrap", newNode.Name) diff --git a/test/e2e-ocl-2of2/main_test.go b/test/e2e-ocl-2of2/main_test.go new file mode 100644 index 0000000000..466eda7ca7 --- /dev/null +++ b/test/e2e-ocl-2of2/main_test.go @@ -0,0 +1,16 @@ +package e2e_ocl_2of2_test + +import ( + "flag" + "os" + "testing" + + "k8s.io/klog/v2" +) + +func TestMain(m *testing.M) { + flag.Parse() + klog.Infof("-skip-cleanup: %v", skipCleanupAlways) + klog.Infof("-skip-cleanup-on-failure: %v", skipCleanupOnlyAfterFailure) + os.Exit(m.Run()) +} diff --git a/test/e2e-ocl/onclusterlayering_test.go b/test/e2e-ocl-2of2/onclusterlayering_test.go similarity index 71% rename from test/e2e-ocl/onclusterlayering_test.go rename to test/e2e-ocl-2of2/onclusterlayering_test.go index 781bd80f78..6ee4dea5cd 100644 --- a/test/e2e-ocl/onclusterlayering_test.go +++ b/test/e2e-ocl-2of2/onclusterlayering_test.go @@ -1,41 +1,39 @@ -package e2e_ocl_test +package e2e_ocl_2of2_test import ( "context" - _ "embed" + "errors" "flag" "fmt" "os" "os/exec" "path/filepath" - goruntime "runtime" - "strings" "testing" "time" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/wait" ign3types "github.com/coreos/ignition/v2/config/v3_5/types" mcfgv1 "github.com/openshift/api/machineconfiguration/v1" "github.com/openshift/machine-config-operator/pkg/apihelpers" + "github.com/openshift/machine-config-operator/pkg/controller/build/buildrequest" + "github.com/openshift/machine-config-operator/pkg/controller/build/constants" + "github.com/openshift/machine-config-operator/pkg/controller/build/imagebuilder" + "github.com/openshift/machine-config-operator/pkg/controller/build/utils" + ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" daemonconsts "github.com/openshift/machine-config-operator/pkg/daemon/constants" "github.com/openshift/machine-config-operator/pkg/daemon/runtimeassets" + ocltesthelper "github.com/openshift/machine-config-operator/test/e2e-ocl-shared" "github.com/openshift/machine-config-operator/test/framework" "github.com/openshift/machine-config-operator/test/helpers" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - - "github.com/openshift/machine-config-operator/pkg/controller/build/buildrequest" - "github.com/openshift/machine-config-operator/pkg/controller/build/constants" - "github.com/openshift/machine-config-operator/pkg/controller/build/imagebuilder" - "github.com/openshift/machine-config-operator/pkg/controller/build/utils" - ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" - k8serrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/util/wait" ) const ( @@ -44,34 +42,6 @@ const ( // The MachineConfig names to create for the tests. mcNameUsbguard string = "inspect-usbguard" - - // The name of the global pull secret copy to use for the tests. - globalPullSecretCloneName string = "global-pull-secret-copy" -) - -var ( - // Provides a Containerfile that installs cowsayusing the Centos Stream 9 - // EPEL repository to do so without requiring any entitlements. - //go:embed Containerfile.cowsay - cowsayDockerfile string - - // Provides a Containerfile that installs Buildah from the default RHCOS RPM - // repositories. If the installation succeeds, the entitlement certificate is - // working. - //go:embed Containerfile.entitled - entitledDockerfile string - - // Provides a Containerfile that works similarly to the cowsay Dockerfile - // with the exception that the /etc/yum.repos.d and /etc/pki/rpm-gpg key - // content is mounted into the build context by the BuildController. - //go:embed Containerfile.yum-repos-d - yumReposDockerfile string - - //go:embed Containerfile.okd-fcos - okdFcosDockerfile string - - //go:embed Containerfile.simple - simpleDockerfile string ) var skipCleanupAlways bool @@ -109,335 +79,14 @@ type onClusterLayeringTestOpts struct { } func TestOnClusterLayeringOnOKD(t *testing.T) { - skipOnOCP(t) - - runOnClusterLayeringTest(t, onClusterLayeringTestOpts{ - poolName: layeredMCPName, - customDockerfiles: map[string]string{ - layeredMCPName: okdFcosDockerfile, - }, - }) -} - -// Tests that an on-cluster build can be performed with the Custom Pod Builder. -func TestOnClusterLayering(t *testing.T) { - _, firstMosb := runOnClusterLayeringTest(t, onClusterLayeringTestOpts{ - poolName: layeredMCPName, - customDockerfiles: map[string]string{ - layeredMCPName: cowsayDockerfile, - }, - }) - - assert.NotEqual(t, string(firstMosb.UID), "") - - // Test rebuild annotation works - ctx, cancel := context.WithCancel(context.Background()) - t.Cleanup(cancel) - - t.Logf("Applying rebuild annotation (%q) to MachineOSConfig (%q) to cause a rebuild", constants.RebuildMachineOSConfigAnnotationKey, layeredMCPName) - - cs := framework.NewClientSet("") - - mosc, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, layeredMCPName, metav1.GetOptions{}) - require.NoError(t, err) - - helpers.SetRebuildAnnotationOnMachineOSConfig(ctx, t, cs.GetMcfgclient(), mosc) - - // Use the UID of the previous MOSB to ensure it is deleted as the rebuild will trigger a MOSB with the same name - t.Logf("Waiting for the previous MachineOSBuild with UID %q to be deleted", firstMosb.UID) - waitForMOSBToBeDeleted(t, cs, firstMosb) - - // Wait for the build to start - secondMosb := waitForBuildToStartForPoolAndConfig(t, cs, layeredMCPName, mosc.Name) - assert.NotEqual(t, firstMosb.UID, secondMosb.UID) -} - -// Tests that an on-cluster build can be performed and that the resulting image -// is rolled out to an opted-in node. -func TestOnClusterBuildRollsOutImage(t *testing.T) { - requiredKernelType := ctrlcommon.KernelTypeRealtime - if goruntime.GOARCH == "arm64" { - requiredKernelType = ctrlcommon.KernelType64kPages - } - - imagePullspec, _ := runOnClusterLayeringTest(t, onClusterLayeringTestOpts{ - poolName: layeredMCPName, - customDockerfiles: map[string]string{ - layeredMCPName: cowsayDockerfile, - }, - machineConfigs: []*mcfgv1.MachineConfig{ - newMachineConfigWithKernelType(fmt.Sprintf("%s-kernel-machineconfig", requiredKernelType), layeredMCPName, requiredKernelType), - }, - }) - - cs := framework.NewClientSet("") - node := helpers.GetRandomNode(t, cs, "worker") - - unlabelFunc := makeIdempotentAndRegisterAlwaysRun(t, helpers.LabelNode(t, cs, node, helpers.MCPNameToRole(layeredMCPName))) - helpers.WaitForNodeImageChange(t, cs, node, imagePullspec) - - helpers.AssertNodeBootedIntoImage(t, cs, node, imagePullspec) - t.Logf("Node %s is booted into image %q", node.Name, imagePullspec) - t.Log(helpers.ExecCmdOnNode(t, cs, node, "chroot", "/rootfs", "cowsay", "Moo!")) - - // Check that the booted image has the requested kernel - foundKernel := helpers.ExecCmdOnNode(t, cs, node, "chroot", "/rootfs", "uname", "-r") - t.Logf("Node %s running kernel: %s", node.Name, foundKernel) - if !compareKernelType(t, foundKernel, requiredKernelType) { - t.Fatalf("Kernel type requested %s, got %s", requiredKernelType, foundKernel) - } - - unlabelFunc() - - assertNodeRevertsToNonLayered(t, cs, node) - - // Check that the reverted image has the default kernel. - requiredKernelType = ctrlcommon.KernelTypeDefault - foundKernel = helpers.ExecCmdOnNode(t, cs, node, "chroot", "/rootfs", "uname", "-r") - t.Logf("Node %s running kernel: %s", node.Name, foundKernel) - if !compareKernelType(t, foundKernel, requiredKernelType) { - t.Fatalf("Kernel type requested %s, got %s", requiredKernelType, foundKernel) - } -} - -func TestMissingImageIsRebuilt(t *testing.T) { - cs := framework.NewClientSet("") - - ctx, cancel := context.WithCancel(context.Background()) - t.Cleanup(cancel) - - kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx) - - firstImagePullspec, firstMOSB := runOnClusterLayeringTest(t, onClusterLayeringTestOpts{ - poolName: layeredMCPName, - customDockerfiles: map[string]string{ - layeredMCPName: cowsayDockerfile, - }, - }) - - moscName := layeredMCPName - t.Logf("Waiting for MachineOSConfig %q to have a new pullspec", moscName) - waitForMOSCToGetNewPullspec(ctx, t, cs, moscName, firstImagePullspec) - - // Create a MC to create another MOSB - testMC := newMachineConfigTriggersImageRebuild(mcNameUsbguard, layeredMCPName, []string{"usbguard"}) - t.Logf("Creating MachineConfig %q", testMC.Name) - firstMC, err := cs.MachineConfigs().Create(ctx, testMC, metav1.CreateOptions{}) - require.NoError(t, err) - t.Logf("Created MachineConfig %q", firstMC.Name) - kubeassert.MachineConfigExists(firstMC) - - // Wait for the build to start - t.Logf("Waiting for 2nd build to start...") - secondMOSBName := waitForMOSCToUpdateCurrentMOSB(ctx, t, cs, moscName, firstMOSB.Name) - secondMOSB, err := cs.GetMcfgclient().MachineconfigurationV1().MachineOSBuilds().Get(ctx, secondMOSBName, metav1.GetOptions{}) - require.NoError(t, err) - secondMOSB = waitForBuildToStart(t, cs, secondMOSB) - t.Logf("MachineOSBuild %q has started", secondMOSB.Name) - assertBuildJobIsAsExpected(t, cs, secondMOSB) - - // Wait for the build to finish - t.Logf("Waiting for 2nd build completion...") - secondFinishedBuild := waitForBuildToComplete(t, cs, secondMOSB) - secondImagePullspec := string(secondFinishedBuild.Status.DigestedImagePushSpec) - t.Logf("MachineOSBuild %q has completed and produced image: %s", secondFinishedBuild.Name, secondImagePullspec) - waitForMOSCToGetNewPullspec(ctx, t, cs, moscName, secondImagePullspec) - - // Delete the first image- simulating image deletion - t.Logf("Deleting image %q", firstImagePullspec) - istName := fmt.Sprintf("os-image:%s", firstMOSB.Name) - err = cs.ImageStreamTags(ctrlcommon.MCONamespace).Delete(ctx, istName, metav1.DeleteOptions{}) - require.NoError(t, err) - kubeassert.ImageDoesNotExist(istName) - t.Logf("Deleted image %q", firstImagePullspec) - - // Delete the first MC - t.Logf("Deleting MachineConfig %q to retrigger build", firstMC.Name) - err = cs.MachineConfigs().Delete(ctx, firstMC.Name, metav1.DeleteOptions{}) - require.NoError(t, err) - kubeassert.MachineConfigDoesNotExist(firstMC) - t.Logf("Deleted MachineConfig %q", firstMC.Name) - - // Wait for the build to start - t.Logf("Waiting for 3rd build (rebuild of image1) to start...") - thirdMOSBName := waitForMOSCToUpdateCurrentMOSB(ctx, t, cs, moscName, secondMOSB.Name) - thirdMOSB, err := cs.GetMcfgclient().MachineconfigurationV1().MachineOSBuilds().Get(ctx, thirdMOSBName, metav1.GetOptions{}) - require.NoError(t, err) - thirdMOSB = waitForBuildToStart(t, cs, thirdMOSB) - t.Logf("MachineOSBuild %q has started (rebuild of image1)", thirdMOSB.Name) - assertBuildJobIsAsExpected(t, cs, thirdMOSB) - - // Wait for the build to finish - t.Logf("Waiting for 3rd build completion...") - thirdFinishedBuild := waitForBuildToComplete(t, cs, thirdMOSB) - thirdImagePullspec := string(thirdFinishedBuild.Status.DigestedImagePushSpec) - t.Logf("MachineOSBuild %q has completed and produced image: %s", thirdFinishedBuild.Name, thirdImagePullspec) - waitForMOSCToGetNewPullspec(ctx, t, cs, moscName, thirdImagePullspec) - - // Apply the MC again - t.Logf("Re‐applying the same MachineConfig %q to confirm no new build for image2", testMC.Name) - secondMC, err := cs.MachineConfigs().Create(ctx, testMC, metav1.CreateOptions{}) - require.NoError(t, err) - kubeassert.MachineConfigExists(secondMC) - t.Logf("Created MachineConfig %q", secondMC.Name) - - // waitForMOSCToGetNewPullspec(ctx, t, cs, moscName, secondImagePullspec) - - t.Logf("Waiting for recycled USBGuard MOSB %q to finish (or to prove there is none)", secondMOSB.Name) - secondMOSB, err = cs.GetMcfgclient().MachineconfigurationV1().MachineOSBuilds().Get(ctx, secondMOSB.Name, metav1.GetOptions{}) - require.NoError(t, err) - secondMOSB = waitForBuildToComplete(t, cs, secondMOSB) - t.Logf("MOSB %q is now complete (reused image)", secondMOSB.Name) - - t.Logf("Deleting MachineOSBuild %q (MOSB3) to test pruning of image1", thirdMOSB.Name) - err = cs.MachineconfigurationV1Interface.MachineOSBuilds().Delete(ctx, thirdMOSB.Name, metav1.DeleteOptions{}) - require.NoError(t, err) - kubeassert.MachineOSBuildDoesNotExist(thirdMOSB) - t.Logf("Deleted MachineOSBuild %q", thirdMOSB.Name) - - deletedIst := fmt.Sprintf("os-image:%s", thirdMOSBName) - kubeassert.ImageDoesNotExist(deletedIst) - t.Logf("ImageStreamTag %q has been pruned", deletedIst) - - t.Logf("Deleting MachineConfig %q for cleanup", secondMC.Name) - err = cs.MachineConfigs().Delete(ctx, secondMC.Name, metav1.DeleteOptions{}) - require.NoError(t, err) - kubeassert.MachineConfigDoesNotExist(secondMC) - t.Logf("Deleted MachineConfig %q", secondMC.Name) -} - -func assertNodeRevertsToNonLayered(t *testing.T, cs *framework.ClientSet, node corev1.Node) { - workerMCName := helpers.GetMcName(t, cs, "worker") - workerMC, err := cs.MachineConfigs().Get(context.TODO(), workerMCName, metav1.GetOptions{}) - require.NoError(t, err) - - helpers.WaitForNodeConfigAndImageChange(t, cs, node, workerMCName, "") - - helpers.AssertNodeBootedIntoImage(t, cs, node, workerMC.Spec.OSImageURL) - t.Logf("Node %s has reverted to OS image %q", node.Name, workerMC.Spec.OSImageURL) - - helpers.AssertFileNotOnNode(t, cs, node, filepath.Join("/etc/systemd/system", runtimeassets.RevertServiceName)) - helpers.AssertFileNotOnNode(t, cs, node, runtimeassets.RevertServiceMachineConfigFile) -} - -// This test extracts the /etc/yum.repos.d and /etc/pki/rpm-gpg content from a -// Centos Stream 9 image and injects them into the MCO namespace. It then -// performs a build with the expectation that these artifacts will be used, -// simulating a build where someone has added this content; usually a Red Hat -// Satellite user. -func TestYumReposBuilds(t *testing.T) { - // Skipping this test as it is having a package conflict issue unrelated to MCO - t.Skip() - runOnClusterLayeringTest(t, onClusterLayeringTestOpts{ - poolName: layeredMCPName, - customDockerfiles: map[string]string{ - layeredMCPName: yumReposDockerfile, - }, - useYumRepos: true, - }) -} - -// Then performs an on-cluster layering build which should consume the -// etc-pki-entitlement certificates. -func TestEntitledBuilds(t *testing.T) { - skipOnOKD(t) + ocltesthelper.SkipOnOCP(t) runOnClusterLayeringTest(t, onClusterLayeringTestOpts{ poolName: layeredMCPName, customDockerfiles: map[string]string{ - layeredMCPName: entitledDockerfile, - }, - entitlementRequired: true, - }) -} - -// This test verifies that if a change is made to a given MachineOSConfig, that -// any in-progress builds are terminated and that only the latest change is -// being built. -func TestMachineOSConfigChangeRestartsBuild(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - t.Cleanup(cancel) - - cs := framework.NewClientSet("") - - mosc := prepareForOnClusterLayeringTest(t, cs, onClusterLayeringTestOpts{ - poolName: layeredMCPName, - customDockerfiles: map[string]string{ - layeredMCPName: cowsayDockerfile, - }, - }) - - createMachineOSConfig(t, cs, mosc) - - mcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, layeredMCPName, metav1.GetOptions{}) - require.NoError(t, err) - - firstMosb := buildrequest.NewMachineOSBuildFromAPIOrDie(ctx, cs.GetKubeclient(), mosc, mcp) - - // First, we get a MachineOSBuild started as usual. - waitForBuildToStart(t, cs, firstMosb) - - // Next, we update the Containerfile. - t.Logf("Initial build has started, updating Containerfile...") - - apiMosc := helpers.SetContainerfileContentsOnMachineOSConfig(ctx, t, cs.GetMcfgclient(), mosc, "FROM configs AS final\nRUN echo 'hello' > /etc/hello") - - moscChangeMosb := buildrequest.NewMachineOSBuildFromAPIOrDie(ctx, cs.GetKubeclient(), apiMosc, mcp) - - kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx) - - assertBuildObjectsAreCreated(t, kubeassert, firstMosb) - - t.Logf("Containerfile is updated, waiting for new build %s to start", moscChangeMosb.Name) - - // Wait for the second build to start. - waitForBuildToStart(t, cs, moscChangeMosb) - - t.Logf("Waiting for initial MachineOSBuild %s to be deleted", firstMosb.Name) - // Wait for the first build to be deleted. - waitForBuildToBeDeleted(t, cs, firstMosb) - - // Ensure that the second build still exists. - _, err = cs.MachineconfigurationV1Interface.MachineOSBuilds().Get(context.TODO(), moscChangeMosb.Name, metav1.GetOptions{}) - require.NoError(t, err) -} - -// This test verifies that a change to the MachineConfigPool, such as the -// presence of a new rendered MachineConfig, will halt the currently running -// build, replacing it with a new build instead. -func TestMachineConfigPoolChangeRestartsBuild(t *testing.T) { - cs := framework.NewClientSet("") - - mosc := prepareForOnClusterLayeringTest(t, cs, onClusterLayeringTestOpts{ - poolName: layeredMCPName, - customDockerfiles: map[string]string{ - layeredMCPName: cowsayDockerfile, + layeredMCPName: ocltesthelper.OkdFcosDockerfile, }, }) - - createMachineOSConfig(t, cs, mosc) - - // Wait for the first build to start. - firstMosb := waitForBuildToStartForPoolAndConfig(t, cs, layeredMCPName, mosc.Name) - - // Once the first build has started, we create a new MachineConfig, wait for - // the rendered config to appear, then we check that a new MachineOSBuild has - // started for that new MachineConfig. - mcName := "new-machineconfig" - mc := newMachineConfigTriggersImageRebuild(mcName, layeredMCPName, []string{"usbguard"}) - applyMC(t, cs, mc) - - _, err := helpers.WaitForRenderedConfig(t, cs, layeredMCPName, mcName) - require.NoError(t, err) - - // We wait for the first build to be deleted. - waitForBuildToBeDeleted(t, cs, firstMosb) - - // Next, we wait for the new build to be started. - secondMosb := waitForBuildToStartForPoolAndConfig(t, cs, layeredMCPName, mosc.Name) - - _, err = cs.MachineconfigurationV1Interface.MachineOSBuilds().Get(context.TODO(), secondMosb.Name, metav1.GetOptions{}) - require.NoError(t, err) } // This test starts a build that it then forces to fail by deleting the build @@ -453,7 +102,7 @@ func TestGracefulBuildFailureRecovery(t *testing.T) { mosc := prepareForOnClusterLayeringTest(t, cs, onClusterLayeringTestOpts{ poolName: layeredMCPName, customDockerfiles: map[string]string{ - layeredMCPName: cowsayDockerfile, + layeredMCPName: ocltesthelper.CowsayDockerfile, }, }) @@ -466,7 +115,7 @@ func TestGracefulBuildFailureRecovery(t *testing.T) { // Repeatedly delete the build pod until the job fails to cause a failure. // Otherwise, it takes a very long time for the job to actually fail. - require.NoError(t, forceMachineOSBuildToFail(ctx, t, cs, firstMosb)) + require.NoError(t, ocltesthelper.ForceMachineOSBuildToFail(ctx, t, cs, firstMosb)) // Wait for the build to fail. kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx) @@ -478,14 +127,14 @@ func TestGracefulBuildFailureRecovery(t *testing.T) { apiMosc.Spec.Containerfile = []mcfgv1.MachineOSContainerfile{} - updated, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().Update(ctx, apiMosc, metav1.UpdateOptions{}) + updatedMosc, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().Update(ctx, apiMosc, metav1.UpdateOptions{}) require.NoError(t, err) mcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, layeredMCPName, metav1.GetOptions{}) require.NoError(t, err) // Compute the new MachineOSBuild image name. - moscChangeMosb := buildrequest.NewMachineOSBuildFromAPIOrDie(ctx, cs.GetKubeclient(), updated, mcp) + moscChangeMosb := buildrequest.NewMachineOSBuildFromAPIOrDie(ctx, cs.GetKubeclient(), updatedMosc, mcp) // Wait for the second build to start. secondMosb := waitForBuildToStart(t, cs, moscChangeMosb) @@ -512,7 +161,7 @@ func TestDeletedBuilderInterruptsMachineOSBuild(t *testing.T) { mosc := prepareForOnClusterLayeringTest(t, cs, onClusterLayeringTestOpts{ poolName: poolName, customDockerfiles: map[string]string{ - layeredMCPName: cowsayDockerfile, + layeredMCPName: ocltesthelper.CowsayDockerfile, }, }) @@ -524,7 +173,7 @@ func TestDeletedBuilderInterruptsMachineOSBuild(t *testing.T) { startedBuild := waitForBuildToStartForPoolAndConfig(t, cs, poolName, mosc.Name) t.Logf("MachineOSBuild %q has started", startedBuild.Name) - pod, err := getPodFromJob(ctx, cs, utils.GetBuildJobName(startedBuild)) + pod, err := ocltesthelper.GetPodFromJob(ctx, cs, utils.GetBuildJobName(startedBuild)) require.NoError(t, err) // Delete the builder @@ -554,7 +203,7 @@ func TestDeletedPodDoesNotInterruptMachineOSBuild(t *testing.T) { mosc := prepareForOnClusterLayeringTest(t, cs, onClusterLayeringTestOpts{ poolName: poolName, customDockerfiles: map[string]string{ - layeredMCPName: cowsayDockerfile, + layeredMCPName: ocltesthelper.CowsayDockerfile, }, }) @@ -567,7 +216,7 @@ func TestDeletedPodDoesNotInterruptMachineOSBuild(t *testing.T) { t.Logf("MachineOSBuild %q has started", startedBuild.Name) // Get the pod created by the build Job - pod, err := getPodFromJob(ctx, cs, utils.GetBuildJobName(startedBuild)) + pod, err := ocltesthelper.GetPodFromJob(ctx, cs, utils.GetBuildJobName(startedBuild)) require.NoError(t, err) // Delete the pod @@ -582,7 +231,7 @@ func TestDeletedPodDoesNotInterruptMachineOSBuild(t *testing.T) { kubeassert.MachineOSBuildIsRunning(startedBuild) // Check that a new pod was created - podNew, err := getPodFromJob(ctx, cs, utils.GetBuildJobName(startedBuild)) + podNew, err := ocltesthelper.GetPodFromJob(ctx, cs, utils.GetBuildJobName(startedBuild)) require.NoError(t, err) assert.NotEqual(t, podNew, pod) } @@ -599,7 +248,7 @@ func TestDeletedTransientMachineOSBuildIsRecreated(t *testing.T) { mosc := prepareForOnClusterLayeringTest(t, cs, onClusterLayeringTestOpts{ poolName: poolName, customDockerfiles: map[string]string{ - layeredMCPName: cowsayDockerfile, + layeredMCPName: ocltesthelper.CowsayDockerfile, }, }) @@ -638,1104 +287,1044 @@ func TestDeletedTransientMachineOSBuildIsRecreated(t *testing.T) { assert.NotEqual(t, firstJob.UID, secondJob.UID) } -// This test verifies that if the rebuild annotation is added to a given MachineOSConfig, that -// the build is restarted -func TestRebuildAnnotationRestartsBuild(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - t.Cleanup(cancel) +func TestSSHKeyAndPasswordForOSBuilder(t *testing.T) { + t.Skip() cs := framework.NewClientSet("") - mosc := prepareForOnClusterLayeringTest(t, cs, onClusterLayeringTestOpts{ - poolName: layeredMCPName, - customDockerfiles: map[string]string{ - layeredMCPName: cowsayDockerfile, - }, + // label random node from pool, get the node + unlabelFunc := helpers.LabelRandomNodeFromPool(t, cs, "worker", "node-role.kubernetes.io/layered") + osNode := helpers.GetSingleNodeByRole(t, cs, layeredMCPName) + + // prepare for on cluster build test + prepareForOnClusterLayeringTest(t, cs, onClusterLayeringTestOpts{ + poolName: layeredMCPName, + customDockerfiles: map[string]string{}, }) - createMachineOSConfig(t, cs, mosc) + // Set up Ignition config with the desired SSH key and password + testIgnConfig := ctrlcommon.NewIgnConfig() + sshKeyContent := "testsshkey11" + passwordHash := "testpassword11" - // First, we get a MachineOSBuild started as usual. - firstMosb := waitForBuildToStartForPoolAndConfig(t, cs, layeredMCPName, mosc.Name) + // retreive initial etc/shadow contents + initialEtcShadowContents := helpers.ExecCmdOnNode(t, cs, osNode, "grep", "^core:", "/rootfs/etc/shadow") - assert.NotEqual(t, string(firstMosb.UID), "") + testIgnConfig.Passwd.Users = []ign3types.PasswdUser{ + { + Name: "core", + SSHAuthorizedKeys: []ign3types.SSHAuthorizedKey{ign3types.SSHAuthorizedKey(sshKeyContent)}, + PasswordHash: &passwordHash, + }, + } - kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx) - assertBuildObjectsAreCreated(t, kubeassert, firstMosb) + testConfig := &mcfgv1.MachineConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: "99-test-ssh-and-password", + Labels: helpers.MCLabelForRole(layeredMCPName), + }, + Spec: mcfgv1.MachineConfigSpec{ + Config: runtime.RawExtension{ + Raw: helpers.MarshalOrDie(testIgnConfig), + }, + }, + } - firstJob, err := cs.BatchV1Interface.Jobs(ctrlcommon.MCONamespace).Get(ctx, utils.GetBuildJobName(firstMosb), metav1.GetOptions{}) - require.NoError(t, err) + helpers.SetMetadataOnObject(t, testConfig) - pod, err := getPodFromJob(ctx, cs, utils.GetBuildJobName(firstMosb)) - require.NoError(t, err) - t.Logf("Initial build has started, delete the job to interrupt the build...") - // Delete the builder - bgDeletion := metav1.DeletePropagationBackground - err = cs.BatchV1Interface.Jobs(ctrlcommon.MCONamespace).Delete(ctx, utils.GetBuildJobName(firstMosb), metav1.DeleteOptions{PropagationPolicy: &bgDeletion}) - require.NoError(t, err) + // Create the MachineConfig and wait for the configuration to be applied + mcCleanupFunc := applyMC(t, cs, testConfig) - // Wait for the build to be interrupted. - waitForBuildToBeInterrupted(t, cs, firstMosb) + // wait for rendered config to finish creating + renderedConfig, err := helpers.WaitForRenderedConfig(t, cs, layeredMCPName, testConfig.Name) + require.Nil(t, err) + t.Logf("Finished rendering config") - // Wait for the job and pod to be deleted. - kubeassert.Eventually().JobDoesNotExist(utils.GetBuildJobName(firstMosb)) - kubeassert.Eventually().PodDoesNotExist(pod.Name) + // wait for mcp to complete updating + err = helpers.WaitForPoolComplete(t, cs, layeredMCPName, renderedConfig) + require.Nil(t, err) + t.Logf("Pool completed updating") - t.Logf("Add rebuild annotation to the MOSC...") - helpers.SetRebuildAnnotationOnMachineOSConfig(ctx, t, cs.GetMcfgclient(), mosc) + // Validate the SSH key and password + osNode = helpers.GetSingleNodeByRole(t, cs, layeredMCPName) // Re-fetch node with updated configurations - // Wait for the MOSB to be deleted - t.Logf("Waiting for MachineOSBuild with UID %s to be deleted", firstMosb.UID) - waitForMOSBToBeDeleted(t, cs, firstMosb) + foundSSHKey := helpers.ExecCmdOnNode(t, cs, osNode, "cat", "/rootfs/home/core/.ssh/authorized_keys.d/ignition") + if !contains(foundSSHKey, sshKeyContent) { + t.Fatalf("updated ssh key not found, got %s", foundSSHKey) + } + t.Logf("updated ssh hash found, got %s", foundSSHKey) - t.Logf("Annotation is updated, waiting for new build %s to start", firstMosb.Name) - // Wait for the build to start. - secondMosb := waitForBuildToStart(t, cs, firstMosb) + currentEtcShadowContents := helpers.ExecCmdOnNode(t, cs, osNode, "grep", "^core:", "/rootfs/etc/shadow") + if currentEtcShadowContents == initialEtcShadowContents { + t.Fatalf("updated password hash not found in /etc/shadow, got %s", currentEtcShadowContents) + } + t.Logf("updated password hash found in /etc/shadow, got %s", currentEtcShadowContents) - secondJob, err := cs.BatchV1Interface.Jobs(ctrlcommon.MCONamespace).Get(ctx, utils.GetBuildJobName(secondMosb), metav1.GetOptions{}) - require.NoError(t, err) - - // Ensure that the names are the same, but that the first and second - // MachineOSBuilds have different UIDs. - assert.Equal(t, firstMosb.Name, secondMosb.Name) - assert.NotEqual(t, firstMosb.UID, secondMosb.UID) - - // Ensure that the build jobs have also changed. - assert.Equal(t, firstJob.Name, secondJob.Name) - assert.NotEqual(t, firstJob.UID, secondJob.UID) -} - -func assertBuildObjectsAreCreated(t *testing.T, kubeassert *helpers.Assertions, mosb *mcfgv1.MachineOSBuild) { - t.Helper() - - kubeassert.JobExists(utils.GetBuildJobName(mosb)) - kubeassert.ConfigMapExists(utils.GetContainerfileConfigMapName(mosb)) - kubeassert.ConfigMapExists(utils.GetMCConfigMapName(mosb)) - kubeassert.ConfigMapExists(utils.GetEtcPolicyConfigMapName(mosb)) - kubeassert.ConfigMapExists(utils.GetEtcRegistriesConfigMapName(mosb)) - kubeassert.SecretExists(utils.GetBasePullSecretName(mosb)) - kubeassert.SecretExists(utils.GetFinalPushSecretName(mosb)) - - // Check that ownerReferences are set as well - kubeassert.ConfigMapHasOwnerSet(utils.GetContainerfileConfigMapName(mosb)) - kubeassert.ConfigMapHasOwnerSet(utils.GetMCConfigMapName(mosb)) - kubeassert.ConfigMapHasOwnerSet(utils.GetEtcPolicyConfigMapName(mosb)) - kubeassert.ConfigMapHasOwnerSet(utils.GetEtcRegistriesConfigMapName(mosb)) - kubeassert.SecretHasOwnerSet(utils.GetBasePullSecretName(mosb)) - kubeassert.SecretHasOwnerSet(utils.GetFinalPushSecretName(mosb)) -} + t.Logf("Node %s has correct SSH Key and Password Hash", osNode.Name) -func assertBuildObjectsAreDeleted(t *testing.T, kubeassert *helpers.Assertions, mosb *mcfgv1.MachineOSBuild) { - t.Helper() + // Clean-up: Delete the applied MachineConfig and ensure configurations are rolled back - kubeassert.JobDoesNotExist(utils.GetBuildJobName(mosb)) - kubeassert.ConfigMapDoesNotExist(utils.GetContainerfileConfigMapName(mosb)) - kubeassert.ConfigMapDoesNotExist(utils.GetMCConfigMapName(mosb)) - kubeassert.ConfigMapDoesNotExist(utils.GetEtcPolicyConfigMapName(mosb)) - kubeassert.ConfigMapDoesNotExist(utils.GetEtcRegistriesConfigMapName(mosb)) - kubeassert.SecretDoesNotExist(utils.GetBasePullSecretName(mosb)) - kubeassert.SecretDoesNotExist(utils.GetFinalPushSecretName(mosb)) + t.Cleanup(func() { + unlabelFunc() + mcCleanupFunc() + }) } -// Sets up and performs an on-cluster build for a given set of parameters. -// Returns the built image pullspec for later consumption. -func runOnClusterLayeringTest(t *testing.T, testOpts onClusterLayeringTestOpts) (string, *mcfgv1.MachineOSBuild) { +// This test starts a build and then immediately scales down the +// machine-os-builder deployment until the underlying build job has completed. +// The rationale behind this test is so that if the machine-os-builder pod gets +// rescheduled onto a different node while a build is occurring that the +// MachineOSBuild object will eventually be reconciled, even if the build +// completed during the rescheduling operation. +func TestControllerEventuallyReconciles(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) t.Cleanup(cancel) cs := framework.NewClientSet("") - imageBuilder := testOpts.imageBuilderType - if testOpts.imageBuilderType == "" { - imageBuilder = mcfgv1.JobBuilder - } + poolName := layeredMCPName - t.Logf("Running with ImageBuilder type: %s", imageBuilder) + mosc := prepareForOnClusterLayeringTest(t, cs, onClusterLayeringTestOpts{ + poolName: poolName, + customDockerfiles: map[string]string{ + layeredMCPName: ocltesthelper.CowsayDockerfile, + }, + }) - mosc := prepareForOnClusterLayeringTest(t, cs, testOpts) + mcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, poolName, metav1.GetOptions{}) + require.NoError(t, err) - // Create our MachineOSConfig. createMachineOSConfig(t, cs, mosc) - // Create a child context for the machine-os-builder pod log streamer. We - // create it here because we want the cancellation to run before the - // MachineOSConfig object is removed. - mobPodStreamerCtx, mobPodStreamerCancel := context.WithCancel(ctx) - t.Cleanup(mobPodStreamerCancel) - - // Wait for the build to start - startedBuild := waitForBuildToStartForPoolAndConfig(t, cs, testOpts.poolName, mosc.Name) - t.Logf("MachineOSBuild %q has started", startedBuild.Name) - - // Assert that the build job has certain properties and configuration. - assertBuildJobIsAsExpected(t, cs, startedBuild) - - t.Logf("Waiting for build completion...") - - // Create a child context for the build pod log streamer. This is so we can - // cancel it independently of the parent context or the context for the - // machine-os-build pod watcher (which has its own separate context). - buildPodStreamerCtx, buildPodStreamerCancel := context.WithCancel(ctx) - - // We wire this to both t.Cleanup() as well as defer because we want to - // cancel this context either at the end of this function or when the test - // fails, whichever comes first. - buildPodWatcherShutdown := makeIdempotentAndRegisterAlwaysRun(t, buildPodStreamerCancel) - defer buildPodWatcherShutdown() + mosb := buildrequest.NewMachineOSBuildFromAPIOrDie(ctx, cs.GetKubeclient(), mosc, mcp) - dirPath, err := helpers.GetBuildArtifactDir(t) + // Wait for the MachineOSBuild to exist. + kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx).Eventually() + kubeassert.MachineOSBuildExists(mosb) + jobName, err := ocltesthelper.GetJobForMOSB(ctx, cs, mosb) require.NoError(t, err) + kubeassert.JobExists(jobName) + assertBuildObjectsAreCreated(t, kubeassert, mosb) - podLogsDirPath := filepath.Join(dirPath, "pod-logs") - require.NoError(t, os.MkdirAll(podLogsDirPath, 0o755)) - - // In the event of a test failure, we want to dump all of the build artifacts - // to files for easy reference later. - t.Cleanup(func() { - if t.Failed() { - writeBuildArtifactsToFiles(t, cs, testOpts.poolName) - } - }) - - // The pod log collection blocks the main Goroutine since we follow the logs - // for each container in the build pod. So they must run in a separate - // Goroutine so that the rest of the test can continue. - go func() { - err := streamBuildPodLogsToFile(buildPodStreamerCtx, t, cs, startedBuild, podLogsDirPath) - require.NoError(t, err, "expected no error, got %s", err) - }() + t.Logf("MachineOSBuild %q exists, stopping machine-os-builder", mosb.Name) - // We also want to collect logs from the machine-os-builder pod since they - // can provide a valuable window in how / why a test failed. As mentioned - // above, we need to run this in a separate Goroutine so that the test is not - // blocked. - go func() { - err := streamMachineOSBuilderPodLogsToFile(mobPodStreamerCtx, t, cs, podLogsDirPath) - require.NoError(t, err, "expected no error, got: %s", err) - }() + // As soon as the MachineOSBuild exists, scale down the machine-os-builder + // deployment and any other deployments which may inadvertantly cause its + // replica count to increase. This is done to simulate the machine-os-builder + // pod being scheduled onto a different node. + restoreDeployments := ocltesthelper.ScaleDownDeployments(t, cs) - // Wait for the build to complete. - finishedBuild := waitForBuildToComplete(t, cs, startedBuild) + // Wait for the job to start running. + waitForJobToReachMOSBCondition(ctx, t, cs, jobName, mcfgv1.MachineOSBuilding) - t.Logf("MachineOSBuild %q has completed and produced image: %s", finishedBuild.Name, finishedBuild.Status.DigestedImagePushSpec) + t.Logf("Job %s has started running, starting machine-os-builder", jobName) - require.NoError(t, archiveBuildPodLogs(t, podLogsDirPath)) + // Restore the deployments. + restoreDeployments() - return string(finishedBuild.Status.DigestedImagePushSpec), startedBuild -} + // Ensure that the MachineOSBuild object eventually gets updated. + kubeassert.MachineOSBuildIsRunning(mosb) -func archiveBuildPodLogs(t *testing.T, podLogsDirPath string) error { - archiveName := fmt.Sprintf("%s-pod-logs.tar.gz", helpers.SanitizeTestName(t)) + t.Logf("MachineOSBuild %s is now running, stopping machine-os-builder", mosb.Name) - archive, err := helpers.NewArtifactArchive(t, archiveName) - if err != nil { - return err - } + // Stop the deployments again. + restoreDeployments = ocltesthelper.ScaleDownDeployments(t, cs) - cmd := exec.Command("mv", podLogsDirPath, archive.StagingDir()) - output, err := cmd.CombinedOutput() - if err != nil { - t.Log(string(output)) - return err - } + // Wait for the job to complete. + waitForJobToReachMOSBCondition(ctx, t, cs, jobName, mcfgv1.MachineOSBuildSucceeded) - return archive.WriteArchive() -} + t.Logf("Job %q finished, starting machine-os-builder", jobName) -// Waits for the build to start and returns the started MachineOSBuild object. -func waitForBuildToStartForPoolAndConfig(t *testing.T, cs *framework.ClientSet, poolName, moscName string) *mcfgv1.MachineOSBuild { - t.Helper() + // Restore the deployments again. + restoreDeployments() - var mosbName string + // At this point, the machine-os-builder is running, so we wait for the build + // itself to complete and be updated. + mosb = waitForBuildToComplete(t, cs, mosb) - require.NoError(t, wait.PollImmediate(2*time.Second, 3*time.Minute, func() (bool, error) { - // Get the name for the MachineOSBuild based upon the MachineConfigPool and MachineOSConfig state. - name, err := getMachineOSBuildNameForPool(cs, poolName, moscName) + // Wait until the MachineOSConfig gets the digested pullspec from the MachineOSBuild. + require.NoError(t, wait.PollImmediate(1*time.Second, 5*time.Minute, func() (bool, error) { + mosc, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, mosc.Name, metav1.GetOptions{}) if err != nil { - return false, nil + return false, err } - mosbName = name - return true, nil + return mosc.Status.CurrentImagePullSpec != "" && mosc.Status.CurrentImagePullSpec == mosb.Status.DigestedImagePushSpec, nil })) +} - // Create a "dummy" MachineOSBuild object with just the name field set so - // that waitForMachineOSBuildToReachState() can use it. - mosb := &mcfgv1.MachineOSBuild{ - ObjectMeta: metav1.ObjectMeta{ - Name: mosbName, - }, - } +// TestImageBuildDegradedOnFailureAndClearedOnBuildStart tests that the +// ImageBuildDegraded condition is set to True when a MachineOSBuild fails, and +// is set to False when a MachineOSBuild is started after a previous failure. +// Previously, this test waited until the build was completed before verifying +// that the state was no longer degraded. +func TestImageBuildDegradedOnFailureAndClearedOnBuildStart(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) - return waitForBuildToStart(t, cs, mosb) -} + cs := framework.NewClientSet("") -// Waits for a MachineOSBuild to start building. -func waitForBuildToStart(t *testing.T, cs *framework.ClientSet, build *mcfgv1.MachineOSBuild) *mcfgv1.MachineOSBuild { - t.Helper() + mosc := prepareForOnClusterLayeringTest(t, cs, onClusterLayeringTestOpts{ + poolName: layeredMCPName, + customDockerfiles: map[string]string{ + layeredMCPName: ocltesthelper.CowsayDockerfile, + }, + }) - t.Logf("Waiting for MachineOSBuild %s to start", build.Name) + // First, add a bad containerfile to cause a build failure. However, we will + // actually delete the build pod to force the failure to happen faster. + t.Logf("Adding a bad containerfile for MachineOSConfig %s to cause a build failure", mosc.Name) + mosc.Spec.Containerfile = ocltesthelper.GetBadContainerFileForFailureTest() - ctx, cancel := context.WithTimeout(context.Background(), time.Minute*5) - defer cancel() + createMachineOSConfig(t, cs, mosc) - start := time.Now() + // Wait for the build to start and fail + firstMosb := waitForBuildToStartForPoolAndConfig(t, cs, layeredMCPName, mosc.Name) + t.Logf("Waiting for MachineOSBuild %s to fail", firstMosb.Name) + + // Force the build to fail faster by repeatedly deleting the build pods until + // the job reflects a failure status. + require.NoError(t, ocltesthelper.ForceMachineOSBuildToFail(ctx, t, cs, firstMosb)) kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx) - kubeassert.Eventually().MachineOSBuildExists(build) - t.Logf("MachineOSBuild %s created after %s", build.Name, time.Since(start)) - kubeassert.Eventually().MachineOSBuildIsRunning(build) - t.Logf("MachineOSBuild %s running after %s", build.Name, time.Since(start)) + kubeassert.Eventually().MachineOSBuildIsFailure(firstMosb) - // Get the job for the MOSB created by comparing the job UID with the MOSB annotation - buildJobName, err := getJobForMOSB(ctx, cs, build) - require.NoError(t, err) - kubeassert.Eventually().JobExists(buildJobName) - t.Logf("Build job %s created after %s", buildJobName, time.Since(start)) - // Get the pod created by the job - buildPod, err := getPodFromJob(ctx, cs, buildJobName) + // Wait for and verify ImageBuildDegraded condition is set to True + degradedCondition := waitForImageBuildDegradedCondition(ctx, t, cs, layeredMCPName, corev1.ConditionTrue) + require.NotNil(t, degradedCondition, "ImageBuildDegraded condition should be present") + assert.Equal(t, string(mcfgv1.MachineConfigPoolBuildFailed), degradedCondition.Reason, "ImageBuildDegraded reason should be BuildFailed") + assert.Contains(t, degradedCondition.Message, fmt.Sprintf("Failed to build OS image for pool %s", layeredMCPName), "ImageBuildDegraded message should contain pool name") + assert.Contains(t, degradedCondition.Message, firstMosb.Name, "ImageBuildDegraded message should contain MachineOSBuild name") + + t.Logf("ImageBuildDegraded condition correctly set to True with message: %s", degradedCondition.Message) + + // Now fix the MachineOSConfig with a good containerfile + apiMosc, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, mosc.Name, metav1.GetOptions{}) require.NoError(t, err) - kubeassert.Eventually().PodIsRunning(buildPod.Name) - t.Logf("Build pod %s running after %s", buildPod.Name, time.Since(start)) - kubeassert.Eventually().PodHasOwnerSet(buildPod.Name) - t.Logf("Build pod %s has owner set after %s", buildPod.Name, time.Since(start)) - mosb, err := cs.MachineconfigurationV1Interface.MachineOSBuilds().Get(ctx, build.Name, metav1.GetOptions{}) + apiMosc.Spec.Containerfile = []mcfgv1.MachineOSContainerfile{ + { + ContainerfileArch: mcfgv1.NoArch, + Content: ocltesthelper.CowsayDockerfile, + }, + } + + updatedMosc, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().Update(ctx, apiMosc, metav1.UpdateOptions{}) require.NoError(t, err) - assertBuildObjectsAreCreated(t, kubeassert.Eventually(), mosb) - t.Logf("Build objects created after %s", time.Since(start)) + t.Logf("Fixed containerfile, waiting for new build to start") - return mosb -} + mcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, layeredMCPName, metav1.GetOptions{}) + require.NoError(t, err) -// Waits for a MachineOSBuild with a specific UID to be deleted. -func waitForMOSBToBeDeleted(t *testing.T, cs *framework.ClientSet, mosb *mcfgv1.MachineOSBuild) { - t.Helper() + // Compute the new MachineOSBuild name + moscChangeMosb := buildrequest.NewMachineOSBuildFromAPIOrDie(ctx, cs.GetKubeclient(), updatedMosc, mcp) - ctx, cancel := context.WithTimeout(context.Background(), time.Minute*5) - defer cancel() - - start := time.Now() - - // If the given MachineOSBuild does not have a UID, e.g., from the - // NewMachineOSBuildFromAPIOrDie() helper, then we query the API server to - // find it. - if mosb.UID == "" { - t.Logf("No UID provided for MachineOSBuild %s, querying API for UID", mosb.Name) - // Get the MOSB from the API to get the UID - apiMosb, err := cs.MachineconfigurationV1Interface.MachineOSBuilds().Get(context.Background(), mosb.Name, metav1.GetOptions{}) - require.NoError(t, err) + // Wait for the second build to start + secondMosb := waitForBuildToStart(t, cs, moscChangeMosb) + t.Logf("Second build started successfully: %s", secondMosb.Name) - if k8serrors.IsNotFound(err) { - t.Logf("MachineOSBuild %s is not found, must have already been deleted", mosb.Name) - return - } + // Wait for and verify ImageBuildDegraded condition is False after the new build starts. + // The condition should be cleared when the build starts. + degradedCondition = waitForImageBuildDegradedCondition(ctx, t, cs, layeredMCPName, corev1.ConditionFalse) + require.NotNil(t, degradedCondition, "ImageBuildDegraded condition should still be present") + assert.Equal(t, string(mcfgv1.MachineConfigPoolBuilding), degradedCondition.Reason, "ImageBuildDegraded reason should be Building") + t.Logf("ImageBuildDegraded condition correctly cleared to False when build started with message: %s", degradedCondition.Message) - require.NoError(t, err) + // Wait for the second build to complete successfully + finishedBuild := waitForBuildToComplete(t, cs, secondMosb) + t.Logf("Second build completed successfully: %s", finishedBuild.Name) - mosb = apiMosb - } + // Wait for the MachineOSConfig to get the new pullspec, which indicates full reconciliation + waitForMOSCToGetNewPullspec(ctx, t, cs, mosc.Name, string(finishedBuild.Status.DigestedImagePushSpec)) - mosbName := mosb.Name - mosbUID := mosb.UID + // Wait for and verify ImageBuildDegraded condition is False with reason BuildSucceeded + degradedCondition = waitForImageBuildDegradedCondition(ctx, t, cs, layeredMCPName, corev1.ConditionFalse) + require.NotNil(t, degradedCondition, "ImageBuildDegraded condition should still be present") + assert.Equal(t, string(mcfgv1.MachineConfigPoolBuildSuccess), degradedCondition.Reason, "ImageBuildDegraded reason should be BuildSuccess") + t.Logf("ImageBuildDegraded condition correctly set to False when build succeeded with message: %s", degradedCondition.Message) - t.Logf("Waiting for MachineOSBuild %s with UID %s to be deleted", mosbName, mosbUID) + // Verify MCP status is correct after successful build and full reconciliation + successMcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, layeredMCPName, metav1.GetOptions{}) + require.NoError(t, err) - // Assert does not adequately handle the case where the object is deleted. - // See https://issues.redhat.com/browse/OCPBUGS-63048 for details. - err := wait.PollImmediate(time.Second, time.Minute*5, func() (bool, error) { - mosbs, err := cs.MachineconfigurationV1Interface.MachineOSBuilds().List(ctx, metav1.ListOptions{}) + // After successful build completion and full reconciliation, MCP should show: + // Updated=True, Updating=False, Degraded=False, ImageBuildDegraded=False + kubeassert.Eventually().MachineConfigPoolReachesState(successMcp, func(mcp *mcfgv1.MachineConfigPool, err error) (bool, error) { if err != nil { return false, err } - - for _, mosb := range mosbs.Items { - // If we find a MachineOSBuild with the same name and UID, then we know - // it has not been deleted yet. - if mosb.Name == mosbName && mosb.UID == mosbUID { - return false, nil - } + // Return false (keep polling) if conditions don't match expected state + if !apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdated) || + apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdating) || + apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolDegraded) || + apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolImageBuildDegraded) { + return false, nil } + // Return true when expected state is reached + t.Logf("MCP status after successful build - Updated: %v, Updating: %v, Degraded: %v, ImageBuildDegraded: %v", + apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdated), + apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdating), + apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolDegraded), + apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolImageBuildDegraded)) + return true, nil - }) + }, "MCP should reach correct state after successful build (Updated=True, Updating=False, Degraded=False, ImageBuildDegraded=False)") - t.Logf("MachineOSBuild %s with UID %s deleted after %s", mosb.Name, mosb.UID, time.Since(start)) + // Now trigger another build to test MCP status transitions when a new build starts + t.Logf("Triggering a third build to test MCP status transitions") - require.NoError(t, err, "MachineOSBuild %s with UID %s not deleted after %s", mosb.Name, mosb.UID, time.Since(start)) -} + // Modify the containerfile slightly to trigger a new build + apiMosc, err = cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, mosc.Name, metav1.GetOptions{}) + require.NoError(t, err) -// Waits for a MachineOSBuild to be deleted. This is different than -// waitForMOSBToBeDeleted since it then asserts that all of the objects -// associated with the MOSB are deleted. -func waitForBuildToBeDeleted(t *testing.T, cs *framework.ClientSet, build *mcfgv1.MachineOSBuild) { - t.Helper() + // Add a comment to the containerfile to change it and trigger a new build + modifiedDockerfile := ocltesthelper.CowsayDockerfile + "\n# Comment to trigger new build" + apiMosc.Spec.Containerfile = []mcfgv1.MachineOSContainerfile{ + { + ContainerfileArch: mcfgv1.NoArch, + Content: modifiedDockerfile, + }, + } - ctx, cancel := context.WithTimeout(context.Background(), time.Minute*5) - defer cancel() + updatedMosc, err = cs.MachineconfigurationV1Interface.MachineOSConfigs().Update(ctx, apiMosc, metav1.UpdateOptions{}) + require.NoError(t, err) - kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx) + t.Logf("Modified containerfile, waiting for third build to start") - t.Logf("Waiting for MachineOSBuild %s to be deleted", build.Name) + // Get the updated MCP to compute the new build + mcp, err = cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, layeredMCPName, metav1.GetOptions{}) + require.NoError(t, err) - start := time.Now() + // Compute the new MachineOSBuild name for the third build + thirdMoscMosb := buildrequest.NewMachineOSBuildFromAPIOrDie(ctx, cs.GetKubeclient(), updatedMosc, mcp) - waitForMOSBToBeDeleted(t, cs, build) + // Wait for the third build to start + thirdMosb := waitForBuildToStart(t, cs, thirdMoscMosb) + t.Logf("Third build started: %s", thirdMosb.Name) - assertBuildObjectsAreDeleted(t, kubeassert.Eventually(), build) - t.Logf("Build objects deleted after %s", time.Since(start)) -} + // Verify MCP status during active build: + // Updated=False, Updating=True, Degraded=False, ImageBuildDegraded=False + buildingMcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, layeredMCPName, metav1.GetOptions{}) + require.NoError(t, err) -// Waits for the given MachineOSBuild to complete and returns the completed -// MachineOSBuild object. -func waitForBuildToComplete(t *testing.T, cs *framework.ClientSet, startedBuild *mcfgv1.MachineOSBuild) *mcfgv1.MachineOSBuild { - t.Helper() + kubeassert.Eventually().MachineConfigPoolReachesState(buildingMcp, func(mcp *mcfgv1.MachineConfigPool, err error) (bool, error) { + if err != nil { + return false, err + } + // During build, MCP should show: Updated=False, Updating=True + if apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdated) || + !apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdating) || + apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolDegraded) || + apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolImageBuildDegraded) { + return false, nil + } - t.Logf("Waiting for MachineOSBuild %s to complete", startedBuild.Name) + t.Logf("MCP status during active build - Updated: %v, Updating: %v, Degraded: %v, ImageBuildDegraded: %v", + apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdated), + apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdating), + apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolDegraded), + apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolImageBuildDegraded)) - ctx, cancel := context.WithTimeout(context.Background(), time.Minute*20) - defer cancel() + return true, nil + }, "MCP should reach correct state during active build (Updated=False, Updating=True, Degraded=False, ImageBuildDegraded=False)") - start := time.Now() + // Wait for the third build to complete successfully + finalBuild := waitForBuildToComplete(t, cs, thirdMosb) + t.Logf("Third build completed successfully: %s", finalBuild.Name) - kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx) - kubeassert.Eventually().MachineOSBuildIsSuccessful(startedBuild) - t.Logf("MachineOSBuild %s successful after %s", startedBuild.Name, time.Since(start)) - assertBuildObjectsAreDeleted(t, kubeassert.Eventually(), startedBuild) - t.Logf("Build objects deleted after %s", time.Since(start)) + // Wait for the MachineOSConfig to get the new pullspec, which indicates full reconciliation + waitForMOSCToGetNewPullspec(ctx, t, cs, mosc.Name, string(finalBuild.Status.DigestedImagePushSpec)) - mosb, err := cs.MachineconfigurationV1Interface.MachineOSBuilds().Get(ctx, startedBuild.Name, metav1.GetOptions{}) + // Final verification: MCP status should return to: + // Updated=True, Updating=False, Degraded=False, ImageBuildDegraded=False + finalMcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, layeredMCPName, metav1.GetOptions{}) require.NoError(t, err) - return mosb -} + kubeassert.Eventually().MachineConfigPoolReachesState(finalMcp, func(mcp *mcfgv1.MachineConfigPool, err error) (bool, error) { + if err != nil { + return false, err + } + // Return false (keep polling) if conditions don't match expected state + if !apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdated) || + apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdating) || + apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolDegraded) || + apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolImageBuildDegraded) { + return false, nil + } -func waitForBuildToBeInterrupted(t *testing.T, cs *framework.ClientSet, startedBuild *mcfgv1.MachineOSBuild) *mcfgv1.MachineOSBuild { - t.Helper() + // Return true when expected state is reached + t.Logf("Final MCP status after third build completion - Updated: %v, Updating: %v, Degraded: %v, ImageBuildDegraded: %v", + apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdated), + apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdating), + apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolDegraded), + apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolImageBuildDegraded)) - t.Logf("Waiting for MachineOSBuild %s to be interrupted", startedBuild.Name) + return true, nil + }, "MCP should return to correct state after final build completion (Updated=True, Updating=False, Degraded=False, ImageBuildDegraded=False)") - ctx, cancel := context.WithTimeout(context.Background(), time.Minute*5) - defer cancel() + t.Logf("All MCP status transitions verified successfully across build failure, success, and subsequent new build") +} - start := time.Now() +// TestCurrentMachineOSBuildAnnotationHandling tests that the node controller correctly uses the +// current-machine-os-build annotation on the MachineOSConfig to select the correct MOSB when +// multiple MOSBs exist for the same rendered MachineConfig. This can happen during rapid updates +// or when a rebuild annotation is applied. +func TestCurrentMachineOSBuildAnnotationHandling(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) - kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx) - kubeassert.Eventually().MachineOSBuildIsInterrupted(startedBuild) - t.Logf("MachineOSBuild %s interrupted after %s", startedBuild.Name, time.Since(start)) + cs := framework.NewClientSet("") - mosb, err := cs.MachineconfigurationV1Interface.MachineOSBuilds().Get(ctx, startedBuild.Name, metav1.GetOptions{}) - require.NoError(t, err) + // Setup: Create initial layered pool and build + mosc := prepareForOnClusterLayeringTest(t, cs, onClusterLayeringTestOpts{ + poolName: layeredMCPName, + customDockerfiles: map[string]string{ + layeredMCPName: ocltesthelper.SimpleDockerfile, + }, + }) - return mosb -} + createMachineOSConfig(t, cs, mosc) -// Validates that the build job is configured correctly. In this case, -// "correctly" means that it has the correct container images. Future -// assertions could include things like ensuring that the proper volume mounts -// are present, etc. -func assertBuildJobIsAsExpected(t *testing.T, cs *framework.ClientSet, mosb *mcfgv1.MachineOSBuild) { - t.Helper() + // Wait for the first build to complete + firstMosb := waitForBuildToStartForPoolAndConfig(t, cs, layeredMCPName, mosc.Name) + t.Logf("First MachineOSBuild %q has started", firstMosb.Name) - osImageURLConfig, err := ctrlcommon.GetOSImageURLConfig(context.TODO(), cs.GetKubeclient()) - require.NoError(t, err) + firstFinishedBuild := waitForBuildToComplete(t, cs, firstMosb) + firstImagePullspec := string(firstFinishedBuild.Status.DigestedImagePushSpec) + t.Logf("First MachineOSBuild %q completed with image: %s", firstFinishedBuild.Name, firstImagePullspec) - mcoImages, err := ctrlcommon.GetImagesConfig(context.TODO(), cs.GetKubeclient()) + // Verify the MOSC has the current-machine-os-build annotation set to the first build + apiMosc, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, mosc.Name, metav1.GetOptions{}) require.NoError(t, err) + currentBuildAnnotation := apiMosc.GetAnnotations()[constants.CurrentMachineOSBuildAnnotationKey] + assert.Equal(t, firstFinishedBuild.Name, currentBuildAnnotation, + "MOSC should have current-machine-os-build annotation pointing to first build") + t.Logf("Verified MOSC has current-machine-os-build annotation: %s", currentBuildAnnotation) - buildPod, err := getPodFromJob(context.TODO(), cs, mosb.Status.Builder.Job.Name) + // Trigger a second build by editing the MOSC (e.g., updating the containerfile) + // This does NOT create a new rendered MC, which is the scenario we're testing + t.Logf("Updating MachineOSConfig containerfile to trigger second build without new rendered MC") + apiMosc, err = cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, mosc.Name, metav1.GetOptions{}) require.NoError(t, err) - assertContainerIsUsingExpectedImage := func(c corev1.Container, containerName, expectedImage string) { - if c.Name == containerName { - assert.Equal(t, c.Image, expectedImage) - } + // Update the containerfile to trigger a rebuild + apiMosc.Spec.Containerfile = []mcfgv1.MachineOSContainerfile{ + { + ContainerfileArch: mcfgv1.NoArch, + Content: ocltesthelper.SimpleDockerfile + "\nRUN echo 'test annotation handling' > /etc/test-annotation", + }, } + apiMosc, err = cs.MachineconfigurationV1Interface.MachineOSConfigs().Update(ctx, apiMosc, metav1.UpdateOptions{}) + require.NoError(t, err) + t.Logf("Updated MachineOSConfig %q containerfile", apiMosc.Name) - for _, container := range buildPod.Spec.Containers { - assertContainerIsUsingExpectedImage(container, "image-build", mcoImages.MachineConfigOperator) - assertContainerIsUsingExpectedImage(container, "wait-for-done", osImageURLConfig.BaseOSContainerImage) - } -} + // Wait for the second build to start + t.Logf("Waiting for second build to start...") + secondMosbName := waitForMOSCToUpdateCurrentMOSB(ctx, t, cs, mosc.Name, firstMosb.Name) + secondMosb, err := cs.GetMcfgclient().MachineconfigurationV1().MachineOSBuilds().Get(ctx, secondMosbName, metav1.GetOptions{}) + require.NoError(t, err) + secondMosb = waitForBuildToStart(t, cs, secondMosb) + t.Logf("Second MachineOSBuild %q has started", secondMosb.Name) -// Prepares for an on-cluster build test by performing the following: -// - Gets the Docker Builder secret name from the MCO namespace. -// - Creates the imagestream to use for the test. -// - Clones the global pull secret into the MCO namespace. -// - If requested, clones the RHEL entitlement secret into the MCO namespace. -// - Creates the on-cluster-build-config ConfigMap. -// - Creates the target MachineConfigPool and waits for it to get a rendered config. -// - Creates the on-cluster-build-custom-dockerfile ConfigMap. -// -// Each of the object creation steps registers an idempotent cleanup function -// that will delete the object at the end of the test. -// -// Returns a MachineOSConfig object for the caller to create to begin the build -// process. -func prepareForOnClusterLayeringTest(t *testing.T, cs *framework.ClientSet, testOpts onClusterLayeringTestOpts) *mcfgv1.MachineOSConfig { - // If the test requires RHEL entitlements, ensure they are present - // in the test cluster. If not found, the test is skipped. - if testOpts.entitlementRequired { - skipIfEntitlementNotPresent(t, cs) - } - - // If the test requires /etc/yum.repos.d and /etc/pki/rpm-gpg, pull a Centos - // Stream 9 container image and populate them from there. This is intended to - // emulate the Red Hat Satellite enablement process, but does not actually - // require any Red Hat Satellite creds to work. - if testOpts.useYumRepos { - injectYumRepos(t, cs) - } - - // Register ephemeral object cleanup function. - makeIdempotentAndRegister(t, func() { - cleanupEphemeralBuildObjects(t, cs) - }) - - imagestreamObjMeta := metav1.ObjectMeta{ - Name: "os-image", - } - - pushSecretName, finalPullspec, _ := setupImageStream(t, cs, imagestreamObjMeta) - - if testOpts.targetNode != nil { - makeIdempotentAndRegister(t, helpers.CreatePoolWithNode(t, cs, testOpts.poolName, *testOpts.targetNode)) - } else { - makeIdempotentAndRegister(t, helpers.CreateMCP(t, cs, testOpts.poolName)) - } + // At this point, both MOSBs exist: + // - firstMosb is completed (with original containerfile) + // - secondMosb is building (with updated containerfile, but SAME rendered MC) + // This is the critical scenario: multiple MOSBs for the same rendered MachineConfig - mcNames := []string{"00-worker"} - if len(testOpts.machineConfigs) > 0 { - for _, mc := range testOpts.machineConfigs { - makeIdempotentAndRegister(t, helpers.ApplyMC(t, cs, mc)) - mcNames = append(mcNames, mc.Name) - } - } + // Verify that the MOSC annotation now points to the second build + apiMosc, err = cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, mosc.Name, metav1.GetOptions{}) + require.NoError(t, err) + currentBuildAnnotation = apiMosc.GetAnnotations()[constants.CurrentMachineOSBuildAnnotationKey] + assert.Equal(t, secondMosb.Name, currentBuildAnnotation, + "MOSC should have current-machine-os-build annotation pointing to second build") + t.Logf("Verified MOSC annotation updated to second build: %s", currentBuildAnnotation) - _, err := helpers.WaitForRenderedConfigs(t, cs, testOpts.poolName, mcNames...) + // List all MOSBs to confirm both exist + allMosbs, err := cs.MachineconfigurationV1Interface.MachineOSBuilds().List(ctx, metav1.ListOptions{}) require.NoError(t, err) - mosc := &mcfgv1.MachineOSConfig{ - ObjectMeta: metav1.ObjectMeta{ - Name: testOpts.poolName, - }, - Spec: mcfgv1.MachineOSConfigSpec{ - MachineConfigPool: mcfgv1.MachineConfigPoolReference{ - Name: testOpts.poolName, - }, - RenderedImagePushSecret: mcfgv1.ImageSecretObjectReference{ - Name: pushSecretName, - }, - RenderedImagePushSpec: mcfgv1.ImageTagFormat(finalPullspec), - ImageBuilder: mcfgv1.MachineOSImageBuilder{ - ImageBuilderType: mcfgv1.JobBuilder, - }, - Containerfile: []mcfgv1.MachineOSContainerfile{ - { - ContainerfileArch: mcfgv1.NoArch, - Content: testOpts.customDockerfiles[testOpts.poolName], - }, - }, - }, + mosbNames := []string{} + for _, mosb := range allMosbs.Items { + if mosb.Spec.MachineOSConfig.Name == mosc.Name { + mosbNames = append(mosbNames, mosb.Name) + } } + t.Logf("Found %d MOSBs for MachineOSConfig %q: %v", len(mosbNames), mosc.Name, mosbNames) + assert.GreaterOrEqual(t, len(mosbNames), 2, "Should have at least 2 MOSBs at this point") - helpers.SetMetadataOnObject(t, mosc) + // The critical test: The node controller should use the MOSB specified by the annotation + // (secondMosb) even though firstMosb also exists and matches the MOSC name. + // This is implicitly tested by the fact that the pool status should reflect the second build. + // We verify this by checking the pool is waiting for the second build, not using the first. - return mosc -} + t.Logf("Verifying that pool targets the correct (second) build based on annotation") + // The pool should be waiting for the second build to complete, not using the first completed build + // We can verify this by checking that the pool doesn't have the first image in its status -func TestSSHKeyAndPasswordForOSBuilder(t *testing.T) { - t.Skip() + // Wait for the second build to complete + t.Logf("Waiting for second build to complete...") + secondFinishedBuild := waitForBuildToComplete(t, cs, secondMosb) + secondImagePullspec := string(secondFinishedBuild.Status.DigestedImagePushSpec) + t.Logf("Second MachineOSBuild %q completed with image: %s", secondFinishedBuild.Name, secondImagePullspec) - cs := framework.NewClientSet("") + // Verify the images are different (proving we built a new image, not reusing the old one) + assert.NotEqual(t, firstImagePullspec, secondImagePullspec, + "First and second builds should produce different images") - // label random node from pool, get the node - unlabelFunc := helpers.LabelRandomNodeFromPool(t, cs, "worker", "node-role.kubernetes.io/layered") - osNode := helpers.GetSingleNodeByRole(t, cs, layeredMCPName) + // Verify that the MOSC status reflects the second build's image + waitForMOSCToGetNewPullspec(ctx, t, cs, mosc.Name, secondImagePullspec) + apiMosc, err = cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, mosc.Name, metav1.GetOptions{}) + require.NoError(t, err) + assert.Equal(t, mcfgv1.ImageDigestFormat(secondImagePullspec), apiMosc.Status.CurrentImagePullSpec, + "MOSC status should have the second build's image pullspec") - // prepare for on cluster build test - prepareForOnClusterLayeringTest(t, cs, onClusterLayeringTestOpts{ - poolName: layeredMCPName, - customDockerfiles: map[string]string{}, - }) + // The critical test: Verify the node controller uses the annotation to select the correct MOSB + // Add a node to the pool and verify it gets the SECOND build's image, not the first + t.Logf("Adding node to pool to verify node controller uses annotation-based MOSB selection") + node := helpers.GetRandomNode(t, cs, "worker") - // Set up Ignition config with the desired SSH key and password - testIgnConfig := ctrlcommon.NewIgnConfig() - sshKeyContent := "testsshkey11" - passwordHash := "testpassword11" + unlabelFunc := ocltesthelper.MakeIdempotentAndRegisterAlwaysRun(t, helpers.LabelNode(t, cs, node, helpers.MCPNameToRole(layeredMCPName))) + defer unlabelFunc() - // retreive initial etc/shadow contents - initialEtcShadowContents := helpers.ExecCmdOnNode(t, cs, osNode, "grep", "^core:", "/rootfs/etc/shadow") + // Wait for the node controller to update the node's desiredImage annotation + // The node controller should use the annotation on the MOSC to select the second MOSB + // and therefore set the desiredImage to the second build's image, NOT the first + t.Logf("Waiting for node %s to have desiredImage set to second build's image", node.Name) + helpers.WaitForNodeImageChange(t, cs, node, secondImagePullspec) - testIgnConfig.Passwd.Users = []ign3types.PasswdUser{ - { - Name: "core", - SSHAuthorizedKeys: []ign3types.SSHAuthorizedKey{ign3types.SSHAuthorizedKey(sshKeyContent)}, - PasswordHash: &passwordHash, - }, - } + // Verify the node's desiredImage annotation matches the second build + updatedNode, err := cs.CoreV1Interface.Nodes().Get(ctx, node.Name, metav1.GetOptions{}) + require.NoError(t, err) + desiredImage := updatedNode.Annotations[daemonconsts.DesiredImageAnnotationKey] + assert.Equal(t, secondImagePullspec, desiredImage, + "Node controller should use annotation to select second build, not first build") + t.Logf("Node controller correctly selected second build based on annotation") - testConfig := &mcfgv1.MachineConfig{ - ObjectMeta: metav1.ObjectMeta{ - Name: "99-test-ssh-and-password", - Labels: helpers.MCLabelForRole(layeredMCPName), - }, - Spec: mcfgv1.MachineConfigSpec{ - Config: runtime.RawExtension{ - Raw: helpers.MarshalOrDie(testIgnConfig), - }, - }, - } + // Also verify it's NOT the first build's image + assert.NotEqual(t, firstImagePullspec, desiredImage, + "Node should NOT have first build's image (would indicate annotation was ignored)") - helpers.SetMetadataOnObject(t, testConfig) + t.Logf("Successfully verified that node controller uses annotation-based build selection") +} - // Create the MachineConfig and wait for the configuration to be applied - mcCleanupFunc := applyMC(t, cs, testConfig) +func assertNodeRevertsToNonLayered(t *testing.T, cs *framework.ClientSet, node corev1.Node) { + workerMCName := helpers.GetMcName(t, cs, "worker") + workerMC, err := cs.MachineConfigs().Get(context.TODO(), workerMCName, metav1.GetOptions{}) + require.NoError(t, err) - // wait for rendered config to finish creating - renderedConfig, err := helpers.WaitForRenderedConfig(t, cs, layeredMCPName, testConfig.Name) - require.Nil(t, err) - t.Logf("Finished rendering config") + helpers.WaitForNodeConfigAndImageChange(t, cs, node, workerMCName, "") - // wait for mcp to complete updating - err = helpers.WaitForPoolComplete(t, cs, layeredMCPName, renderedConfig) - require.Nil(t, err) - t.Logf("Pool completed updating") + helpers.AssertNodeBootedIntoImage(t, cs, node, workerMC.Spec.OSImageURL) + t.Logf("Node %s has reverted to OS image %q", node.Name, workerMC.Spec.OSImageURL) - // Validate the SSH key and password - osNode = helpers.GetSingleNodeByRole(t, cs, layeredMCPName) // Re-fetch node with updated configurations + helpers.AssertFileNotOnNode(t, cs, node, filepath.Join("/etc/systemd/system", runtimeassets.RevertServiceName)) + helpers.AssertFileNotOnNode(t, cs, node, runtimeassets.RevertServiceMachineConfigFile) +} - foundSSHKey := helpers.ExecCmdOnNode(t, cs, osNode, "cat", "/rootfs/home/core/.ssh/authorized_keys.d/ignition") - if !strings.Contains(foundSSHKey, sshKeyContent) { - t.Fatalf("updated ssh key not found, got %s", foundSSHKey) - } - t.Logf("updated ssh hash found, got %s", foundSSHKey) +func assertBuildObjectsAreCreated(t *testing.T, kubeassert *helpers.Assertions, mosb *mcfgv1.MachineOSBuild) { + t.Helper() - currentEtcShadowContents := helpers.ExecCmdOnNode(t, cs, osNode, "grep", "^core:", "/rootfs/etc/shadow") - if currentEtcShadowContents == initialEtcShadowContents { - t.Fatalf("updated password hash not found in /etc/shadow, got %s", currentEtcShadowContents) - } - t.Logf("updated password hash found in /etc/shadow, got %s", currentEtcShadowContents) + kubeassert.JobExists(utils.GetBuildJobName(mosb)) + kubeassert.ConfigMapExists(utils.GetContainerfileConfigMapName(mosb)) + kubeassert.ConfigMapExists(utils.GetMCConfigMapName(mosb)) + kubeassert.ConfigMapExists(utils.GetEtcPolicyConfigMapName(mosb)) + kubeassert.ConfigMapExists(utils.GetEtcRegistriesConfigMapName(mosb)) + kubeassert.SecretExists(utils.GetBasePullSecretName(mosb)) + kubeassert.SecretExists(utils.GetFinalPushSecretName(mosb)) - t.Logf("Node %s has correct SSH Key and Password Hash", osNode.Name) + // Check that ownerReferences are set as well + kubeassert.ConfigMapHasOwnerSet(utils.GetContainerfileConfigMapName(mosb)) + kubeassert.ConfigMapHasOwnerSet(utils.GetMCConfigMapName(mosb)) + kubeassert.ConfigMapHasOwnerSet(utils.GetEtcPolicyConfigMapName(mosb)) + kubeassert.ConfigMapHasOwnerSet(utils.GetEtcRegistriesConfigMapName(mosb)) + kubeassert.SecretHasOwnerSet(utils.GetBasePullSecretName(mosb)) + kubeassert.SecretHasOwnerSet(utils.GetFinalPushSecretName(mosb)) +} - // Clean-up: Delete the applied MachineConfig and ensure configurations are rolled back +func assertBuildObjectsAreDeleted(t *testing.T, kubeassert *helpers.Assertions, mosb *mcfgv1.MachineOSBuild) { + t.Helper() - t.Cleanup(func() { - unlabelFunc() - mcCleanupFunc() - }) + kubeassert.JobDoesNotExist(utils.GetBuildJobName(mosb)) + kubeassert.ConfigMapDoesNotExist(utils.GetContainerfileConfigMapName(mosb)) + kubeassert.ConfigMapDoesNotExist(utils.GetMCConfigMapName(mosb)) + kubeassert.ConfigMapDoesNotExist(utils.GetEtcPolicyConfigMapName(mosb)) + kubeassert.ConfigMapDoesNotExist(utils.GetEtcRegistriesConfigMapName(mosb)) + kubeassert.SecretDoesNotExist(utils.GetBasePullSecretName(mosb)) + kubeassert.SecretDoesNotExist(utils.GetFinalPushSecretName(mosb)) } -// This test starts a build and then immediately scales down the -// machine-os-builder deployment until the underlying build job has completed. -// The rationale behind this test is so that if the machine-os-builder pod gets -// rescheduled onto a different node while a build is occurring that the -// MachineOSBuild object will eventually be reconciled, even if the build -// completed during the rescheduling operation. -func TestControllerEventuallyReconciles(t *testing.T) { +// Sets up and performs an on-cluster build for a given set of parameters. +// Returns the built image pullspec for later consumption. +func runOnClusterLayeringTest(t *testing.T, testOpts onClusterLayeringTestOpts) (string, *mcfgv1.MachineOSBuild) { ctx, cancel := context.WithCancel(context.Background()) t.Cleanup(cancel) cs := framework.NewClientSet("") - poolName := layeredMCPName + imageBuilder := testOpts.imageBuilderType + if testOpts.imageBuilderType == "" { + imageBuilder = mcfgv1.JobBuilder + } - mosc := prepareForOnClusterLayeringTest(t, cs, onClusterLayeringTestOpts{ - poolName: poolName, - customDockerfiles: map[string]string{ - layeredMCPName: cowsayDockerfile, - }, - }) + t.Logf("Running with ImageBuilder type: %s", imageBuilder) - mcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, poolName, metav1.GetOptions{}) - require.NoError(t, err) + mosc := prepareForOnClusterLayeringTest(t, cs, testOpts) + // Create our MachineOSConfig. createMachineOSConfig(t, cs, mosc) - mosb := buildrequest.NewMachineOSBuildFromAPIOrDie(ctx, cs.GetKubeclient(), mosc, mcp) + // Create a child context for the machine-os-builder pod log streamer. We + // create it here because we want the cancellation to run before the + // MachineOSConfig object is removed. + mobPodStreamerCtx, mobPodStreamerCancel := context.WithCancel(ctx) + t.Cleanup(mobPodStreamerCancel) - // Wait for the MachineOSBuild to exist. - kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx).Eventually() - kubeassert.MachineOSBuildExists(mosb) - jobName, err := getJobForMOSB(ctx, cs, mosb) - require.NoError(t, err) - kubeassert.JobExists(jobName) - assertBuildObjectsAreCreated(t, kubeassert, mosb) + // Wait for the build to start + startedBuild := waitForBuildToStartForPoolAndConfig(t, cs, testOpts.poolName, mosc.Name) + t.Logf("MachineOSBuild %q has started", startedBuild.Name) - t.Logf("MachineOSBuild %q exists, stopping machine-os-builder", mosb.Name) + t.Logf("Waiting for build completion...") - // As soon as the MachineOSBuild exists, scale down the machine-os-builder - // deployment and any other deployments which may inadvertantly cause its - // replica count to increase. This is done to simulate the machine-os-builder - // pod being scheduled onto a different node. - restoreDeployments := scaleDownDeployments(t, cs) + // Create a child context for the build pod log streamer. This is so we can + // cancel it independently of the parent context or the context for the + // machine-os-build pod watcher (which has its own separate context). + buildPodStreamerCtx, buildPodStreamerCancel := context.WithCancel(ctx) - // Wait for the job to start running. - waitForJobToReachMOSBCondition(ctx, t, cs, jobName, mcfgv1.MachineOSBuilding) + // We wire this to both t.Cleanup() as well as defer because we want to + // cancel this context either at the end of this function or when the test + // fails, whichever comes first. + buildPodWatcherShutdown := ocltesthelper.MakeIdempotentAndRegisterAlwaysRun(t, buildPodStreamerCancel) + defer buildPodWatcherShutdown() - t.Logf("Job %s has started running, starting machine-os-builder", jobName) + dirPath := ocltesthelper.GetBuildArtifactDir(t) - // Restore the deployments. - restoreDeployments() + podLogsDirPath := filepath.Join(dirPath, "pod-logs") + require.NoError(t, os.MkdirAll(podLogsDirPath, 0o755)) - // Ensure that the MachineOSBuild object eventually gets updated. - kubeassert.MachineOSBuildIsRunning(mosb) + // In the event of a test failure, we want to dump all of the build artifacts + // to files for easy reference later. + t.Cleanup(func() { + if t.Failed() { + ocltesthelper.WriteBuildArtifactsToFiles(t, cs) + } + }) - t.Logf("MachineOSBuild %s is now running, stopping machine-os-builder", mosb.Name) + // The pod log collection blocks the main Goroutine since we follow the logs + // for each container in the build pod. So they must run in a separate + // Goroutine so that the rest of the test can continue. + go func() { + err := ocltesthelper.StreamBuildPodLogsToFile(buildPodStreamerCtx, t, cs, startedBuild, podLogsDirPath) + if err != nil && !errors.Is(err, context.Canceled) { + t.Logf("Warning: failed to stream build pod logs: %v", err) + } + }() - // Stop the deployments again. - restoreDeployments = scaleDownDeployments(t, cs) + // We also want to collect logs from the machine-os-builder pod since they + // can provide a valuable window in how / why a test failed. As mentioned + // above, we need to run this in a separate Goroutine so that the test is not + // blocked. + go func() { + err := ocltesthelper.StreamMachineOSBuilderPodLogsToFile(mobPodStreamerCtx, t, cs, podLogsDirPath) + if err != nil && !errors.Is(err, context.Canceled) { + t.Logf("Warning: failed to stream machine-os-builder pod logs: %v", err) + } + }() - // Wait for the job to complete. - waitForJobToReachMOSBCondition(ctx, t, cs, jobName, mcfgv1.MachineOSBuildSucceeded) + // Wait for the build to complete. + finishedBuild := waitForBuildToComplete(t, cs, startedBuild) - t.Logf("Job %q finished, starting machine-os-builder", jobName) + t.Logf("MachineOSBuild %q has completed and produced image: %s", finishedBuild.Name, finishedBuild.Status.DigestedImagePushSpec) - // Restore the deployments again. - restoreDeployments() + require.NoError(t, archiveBuildPodLogs(t, podLogsDirPath)) - // At this point, the machine-os-builder is running, so we wait for the build - // itself to complete and be updated. - mosb = waitForBuildToComplete(t, cs, mosb) + return string(finishedBuild.Status.DigestedImagePushSpec), startedBuild +} - // Wait until the MachineOSConfig gets the digested pullspec from the MachineOSBuild. - require.NoError(t, wait.PollImmediate(1*time.Second, 5*time.Minute, func() (bool, error) { - mosc, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, mosc.Name, metav1.GetOptions{}) - if err != nil { - return false, err - } +// Waits for the build to start and returns the started MachineOSBuild object. +func waitForBuildToStartForPoolAndConfig(t *testing.T, cs *framework.ClientSet, poolName, moscName string) *mcfgv1.MachineOSBuild { + t.Helper() - return mosc.Status.CurrentImagePullSpec != "" && mosc.Status.CurrentImagePullSpec == mosb.Status.DigestedImagePushSpec, nil - })) -} + var mosbName string -func waitForMOSCToGetNewPullspec(ctx context.Context, t *testing.T, cs *framework.ClientSet, moscName, pullspec string) { - require.NoError(t, wait.PollImmediate(1*time.Second, 5*time.Minute, func() (bool, error) { - mosc, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, moscName, metav1.GetOptions{}) + require.NoError(t, wait.PollImmediate(2*time.Second, 3*time.Minute, func() (bool, error) { + // Get the name for the MachineOSBuild based upon the MachineConfigPool and MachineOSConfig state. + name, err := ocltesthelper.GetMachineOSBuildNameForPool(cs, poolName, moscName) if err != nil { - return false, err + return false, nil } - return mosc.Status.CurrentImagePullSpec != "" && string(mosc.Status.CurrentImagePullSpec) == pullspec, nil + mosbName = name + return true, nil })) + + // Create a "dummy" MachineOSBuild object with just the name field set so + // that waitForMachineOSBuildToReachState() can use it. + mosb := &mcfgv1.MachineOSBuild{ + ObjectMeta: metav1.ObjectMeta{ + Name: mosbName, + }, + } + + return waitForBuildToStart(t, cs, mosb) } -func waitForMOSCToUpdateCurrentMOSB(ctx context.Context, t *testing.T, cs *framework.ClientSet, moscName, mosbName string) string { - var currentMOSB string - require.NoError(t, wait.PollImmediate(1*time.Second, 5*time.Minute, func() (bool, error) { - mosc, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, moscName, metav1.GetOptions{}) - if err != nil { - return false, err - } +// Waits for a MachineOSBuild to start building. +func waitForBuildToStart(t *testing.T, cs *framework.ClientSet, build *mcfgv1.MachineOSBuild) *mcfgv1.MachineOSBuild { + t.Helper() - currentMOSB = mosc.GetAnnotations()[constants.CurrentMachineOSBuildAnnotationKey] - return currentMOSB != mosbName, nil + t.Logf("Waiting for MachineOSBuild %s to start", build.Name) - })) - return currentMOSB -} + ctx, cancel := context.WithTimeout(context.Background(), time.Minute*5) + defer cancel() -// Waits for a job object to reach a given state. -// TOOD: Add this to the Asserts helper struct. -func waitForJobToReachCondition(ctx context.Context, t *testing.T, cs *framework.ClientSet, jobName string, condFunc func(*batchv1.Job) (bool, error)) { - require.NoError(t, wait.PollImmediate(1*time.Second, 10*time.Minute, func() (bool, error) { - job, err := cs.BatchV1Interface.Jobs(ctrlcommon.MCONamespace).Get(ctx, jobName, metav1.GetOptions{}) - if err != nil { - return false, err - } + start := time.Now() - return condFunc(job) - })) -} + kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx) + kubeassert.Eventually().MachineOSBuildExists(build) + t.Logf("MachineOSBuild %s created after %s", build.Name, time.Since(start)) + kubeassert.Eventually().MachineOSBuildIsRunning(build) + t.Logf("MachineOSBuild %s running after %s", build.Name, time.Since(start)) -// Waits for a job object to be mapped to a given MachineOSBuild state. Will always fail the test if the job reaches a failed state unexpectedly. -func waitForJobToReachMOSBCondition(ctx context.Context, t *testing.T, cs *framework.ClientSet, jobName string, expectedCondition mcfgv1.BuildProgress) { - waitForJobToReachCondition(ctx, t, cs, jobName, func(job *batchv1.Job) (bool, error) { - buildprogress, _ := imagebuilder.MapJobStatusToBuildStatus(job) - if buildprogress == mcfgv1.MachineOSBuildFailed && expectedCondition != mcfgv1.MachineOSBuildFailed { - return false, fmt.Errorf("job %q failed unexpectedly", jobName) - } + // Get the job for the MOSB created by comparing the job UID with the MOSB annotation + buildJobName, err := ocltesthelper.GetJobForMOSB(ctx, cs, build) + require.NoError(t, err) + kubeassert.Eventually().JobExists(buildJobName) + t.Logf("Build job %s created after %s", buildJobName, time.Since(start)) + // Get the pod created by the job + buildPod, err := ocltesthelper.GetPodFromJob(ctx, cs, buildJobName) + require.NoError(t, err) + kubeassert.Eventually().PodIsRunning(buildPod.Name) + t.Logf("Build pod %s running after %s", buildPod.Name, time.Since(start)) + kubeassert.Eventually().PodHasOwnerSet(buildPod.Name) + t.Logf("Build pod %s has owner set after %s", buildPod.Name, time.Since(start)) - return expectedCondition == buildprogress, nil - }) + mosb, err := cs.MachineconfigurationV1Interface.MachineOSBuilds().Get(ctx, build.Name, metav1.GetOptions{}) + require.NoError(t, err) + + assertBuildObjectsAreCreated(t, kubeassert.Eventually(), mosb) + t.Logf("Build objects created after %s", time.Since(start)) + + return mosb } -// waitForImageBuildDegradedCondition waits for the ImageBuildDegraded condition to reach the expected state -func waitForImageBuildDegradedCondition(ctx context.Context, t *testing.T, cs *framework.ClientSet, poolName string, expectedStatus corev1.ConditionStatus) *mcfgv1.MachineConfigPoolCondition { +// Waits for a MachineOSBuild with a specific UID to be deleted. +func waitForMOSBToBeDeleted(t *testing.T, cs *framework.ClientSet, mosb *mcfgv1.MachineOSBuild) { t.Helper() - var condition *mcfgv1.MachineConfigPoolCondition - require.NoError(t, wait.PollImmediate(1*time.Second, 2*time.Minute, func() (bool, error) { - mcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, poolName, metav1.GetOptions{}) - if err != nil { - return false, err - } + ctx, cancel := context.WithTimeout(context.Background(), time.Minute*5) + defer cancel() - condition = apihelpers.GetMachineConfigPoolCondition(mcp.Status, mcfgv1.MachineConfigPoolImageBuildDegraded) - if condition == nil { - return false, nil + start := time.Now() + + // If the given MachineOSBuild does not have a UID, e.g., from the + // NewMachineOSBuildFromAPIOrDie() helper, then we query the API server to + // find it. + if mosb.UID == "" { + t.Logf("No UID provided for MachineOSBuild %s, querying API for UID", mosb.Name) + // Get the MOSB from the API to get the UID + apiMosb, err := cs.MachineconfigurationV1Interface.MachineOSBuilds().Get(context.Background(), mosb.Name, metav1.GetOptions{}) + require.NoError(t, err) + + if k8serrors.IsNotFound(err) { + t.Logf("MachineOSBuild %s is not found, must have already been deleted", mosb.Name) + return } - return condition.Status == expectedStatus, nil - })) + require.NoError(t, err) - return condition -} + mosb = apiMosb + } -// TestImageBuildDegradedOnFailureAndClearedOnBuildStart tests that the -// ImageBuildDegraded condition is set to True when a MachineOSBuild fails, and -// is set to False when a MachineOSBuild is started after a previous failure. -// Previously, this test waited until the build was completed before verifying -// that the state was no longer degraded. -func TestImageBuildDegradedOnFailureAndClearedOnBuildStart(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - t.Cleanup(cancel) + mosbName := mosb.Name + mosbUID := mosb.UID - cs := framework.NewClientSet("") + t.Logf("Waiting for MachineOSBuild %s with UID %s to be deleted", mosbName, mosbUID) - mosc := prepareForOnClusterLayeringTest(t, cs, onClusterLayeringTestOpts{ - poolName: layeredMCPName, - customDockerfiles: map[string]string{ - layeredMCPName: cowsayDockerfile, - }, + // Assert does not adequately handle the case where the object is deleted. + // See https://issues.redhat.com/browse/OCPBUGS-63048 for details. + err := wait.PollImmediate(time.Second, time.Minute*5, func() (bool, error) { + mosbs, err := cs.MachineconfigurationV1Interface.MachineOSBuilds().List(ctx, metav1.ListOptions{}) + if err != nil { + return false, err + } + + for _, mosb := range mosbs.Items { + // If we find a MachineOSBuild with the same name and UID, then we know + // it has not been deleted yet. + if mosb.Name == mosbName && mosb.UID == mosbUID { + return false, nil + } + } + + return true, nil }) - // First, add a bad containerfile to cause a build failure. However, we will - // actually delete the build pod to force the failure to happen faster. - t.Logf("Adding a bad containerfile for MachineOSConfig %s to cause a build failure", mosc.Name) - mosc.Spec.Containerfile = getBadContainerFileForFailureTest() + t.Logf("MachineOSBuild %s with UID %s deleted after %s", mosb.Name, mosb.UID, time.Since(start)) - createMachineOSConfig(t, cs, mosc) + require.NoError(t, err, "MachineOSBuild %s with UID %s not deleted after %s", mosb.Name, mosb.UID, time.Since(start)) +} - // Wait for the build to start and fail - firstMosb := waitForBuildToStartForPoolAndConfig(t, cs, layeredMCPName, mosc.Name) - t.Logf("Waiting for MachineOSBuild %s to fail", firstMosb.Name) +// Waits for a MachineOSBuild to be deleted. This is different than +// waitForMOSBToBeDeleted since it then asserts that all of the objects +// associated with the MOSB are deleted. +func waitForBuildToBeDeleted(t *testing.T, cs *framework.ClientSet, build *mcfgv1.MachineOSBuild) { + t.Helper() - // Force the build to fail faster by repeatedly deleting the build pods until - // the job reflects a failure status. - require.NoError(t, forceMachineOSBuildToFail(ctx, t, cs, firstMosb)) + ctx, cancel := context.WithTimeout(context.Background(), time.Minute*5) + defer cancel() kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx) - kubeassert.Eventually().MachineOSBuildIsFailure(firstMosb) - // Wait for and verify ImageBuildDegraded condition is set to True - degradedCondition := waitForImageBuildDegradedCondition(ctx, t, cs, layeredMCPName, corev1.ConditionTrue) - require.NotNil(t, degradedCondition, "ImageBuildDegraded condition should be present") - assert.Equal(t, string(mcfgv1.MachineConfigPoolBuildFailed), degradedCondition.Reason, "ImageBuildDegraded reason should be BuildFailed") - assert.Contains(t, degradedCondition.Message, fmt.Sprintf("Failed to build OS image for pool %s", layeredMCPName), "ImageBuildDegraded message should contain pool name") - assert.Contains(t, degradedCondition.Message, firstMosb.Name, "ImageBuildDegraded message should contain MachineOSBuild name") + t.Logf("Waiting for MachineOSBuild %s to be deleted", build.Name) - t.Logf("ImageBuildDegraded condition correctly set to True with message: %s", degradedCondition.Message) + start := time.Now() - // Now fix the MachineOSConfig with a good containerfile - apiMosc, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, mosc.Name, metav1.GetOptions{}) - require.NoError(t, err) + waitForMOSBToBeDeleted(t, cs, build) - apiMosc.Spec.Containerfile = []mcfgv1.MachineOSContainerfile{ - { - ContainerfileArch: mcfgv1.NoArch, - Content: cowsayDockerfile, - }, - } + assertBuildObjectsAreDeleted(t, kubeassert.Eventually(), build) + t.Logf("Build objects deleted after %s", time.Since(start)) +} - updated, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().Update(ctx, apiMosc, metav1.UpdateOptions{}) - require.NoError(t, err) +// Waits for the given MachineOSBuild to complete and returns the completed +// MachineOSBuild object. +func waitForBuildToComplete(t *testing.T, cs *framework.ClientSet, startedBuild *mcfgv1.MachineOSBuild) *mcfgv1.MachineOSBuild { + t.Helper() - t.Logf("Fixed containerfile, waiting for new build to start") + t.Logf("Waiting for MachineOSBuild %s to complete", startedBuild.Name) - mcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, layeredMCPName, metav1.GetOptions{}) + ctx, cancel := context.WithTimeout(context.Background(), time.Minute*20) + defer cancel() + + start := time.Now() + + kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx) + kubeassert.Eventually().MachineOSBuildIsSuccessful(startedBuild) + t.Logf("MachineOSBuild %s successful after %s", startedBuild.Name, time.Since(start)) + assertBuildObjectsAreDeleted(t, kubeassert.Eventually(), startedBuild) + t.Logf("Build objects deleted after %s", time.Since(start)) + + mosb, err := cs.MachineconfigurationV1Interface.MachineOSBuilds().Get(ctx, startedBuild.Name, metav1.GetOptions{}) require.NoError(t, err) - // Compute the new MachineOSBuild name - moscChangeMosb := buildrequest.NewMachineOSBuildFromAPIOrDie(ctx, cs.GetKubeclient(), updated, mcp) + return mosb +} - // Wait for the second build to start - secondMosb := waitForBuildToStart(t, cs, moscChangeMosb) - t.Logf("Second build started successfully: %s", secondMosb.Name) +func waitForBuildToBeInterrupted(t *testing.T, cs *framework.ClientSet, startedBuild *mcfgv1.MachineOSBuild) *mcfgv1.MachineOSBuild { + t.Helper() - // Wait for and verify ImageBuildDegraded condition is False after the new build starts. - // The condition should be cleared when the build starts. - degradedCondition = waitForImageBuildDegradedCondition(ctx, t, cs, layeredMCPName, corev1.ConditionFalse) - require.NotNil(t, degradedCondition, "ImageBuildDegraded condition should still be present") - assert.Equal(t, string(mcfgv1.MachineConfigPoolBuilding), degradedCondition.Reason, "ImageBuildDegraded reason should be Building") - t.Logf("ImageBuildDegraded condition correctly cleared to False when build started with message: %s", degradedCondition.Message) + t.Logf("Waiting for MachineOSBuild %s to be interrupted", startedBuild.Name) - // Wait for the second build to complete successfully - finishedBuild := waitForBuildToComplete(t, cs, secondMosb) - t.Logf("Second build completed successfully: %s", finishedBuild.Name) + ctx, cancel := context.WithTimeout(context.Background(), time.Minute*5) + defer cancel() - // Wait for the MachineOSConfig to get the new pullspec, which indicates full reconciliation - waitForMOSCToGetNewPullspec(ctx, t, cs, mosc.Name, string(finishedBuild.Status.DigestedImagePushSpec)) + start := time.Now() - // Wait for and verify ImageBuildDegraded condition is False with reason BuildSucceeded - degradedCondition = waitForImageBuildDegradedCondition(ctx, t, cs, layeredMCPName, corev1.ConditionFalse) - require.NotNil(t, degradedCondition, "ImageBuildDegraded condition should still be present") - assert.Equal(t, string(mcfgv1.MachineConfigPoolBuildSuccess), degradedCondition.Reason, "ImageBuildDegraded reason should be BuildSuccess") - t.Logf("ImageBuildDegraded condition correctly set to False when build succeeded with message: %s", degradedCondition.Message) + kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx) + kubeassert.Eventually().MachineOSBuildIsInterrupted(startedBuild) + t.Logf("MachineOSBuild %s interrupted after %s", startedBuild.Name, time.Since(start)) - // Verify MCP status is correct after successful build and full reconciliation - successMcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, layeredMCPName, metav1.GetOptions{}) + mosb, err := cs.MachineconfigurationV1Interface.MachineOSBuilds().Get(ctx, startedBuild.Name, metav1.GetOptions{}) require.NoError(t, err) - // After successful build completion and full reconciliation, MCP should show: - // Updated=True, Updating=False, Degraded=False, ImageBuildDegraded=False - kubeassert.Eventually().MachineConfigPoolReachesState(successMcp, func(mcp *mcfgv1.MachineConfigPool, err error) (bool, error) { - if err != nil { - return false, err - } - // Return false (keep polling) if conditions don't match expected state - if !apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdated) || - apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdating) || - apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolDegraded) || - apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolImageBuildDegraded) { - return false, nil - } + return mosb +} - // Return true when expected state is reached - t.Logf("MCP status after successful build - Updated: %v, Updating: %v, Degraded: %v, ImageBuildDegraded: %v", - apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdated), - apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdating), - apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolDegraded), - apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolImageBuildDegraded)) +// Prepares for an on-cluster build test by performing the following: +// - Gets the Docker Builder secret name from the MCO namespace. +// - Creates the imagestream to use for the test. +// - Clones the global pull secret into the MCO namespace. +// - If requested, clones the RHEL entitlement secret into the MCO namespace. +// - Creates the on-cluster-build-config ConfigMap. +// - Creates the target MachineConfigPool and waits for it to get a rendered config. +// - Creates the on-cluster-build-custom-dockerfile ConfigMap. +// +// Each of the object creation steps registers an idempotent cleanup function +// that will delete the object at the end of the test. +// +// Returns a MachineOSConfig object for the caller to create to begin the build +// process. +func prepareForOnClusterLayeringTest(t *testing.T, cs *framework.ClientSet, testOpts onClusterLayeringTestOpts) *mcfgv1.MachineOSConfig { + // If the test requires RHEL entitlements, ensure they are present + // in the test cluster. If not found, the test is skipped. + if testOpts.entitlementRequired { + ocltesthelper.SkipIfEntitlementNotPresent(t, cs) + } - return true, nil - }, "MCP should reach correct state after successful build (Updated=True, Updating=False, Degraded=False, ImageBuildDegraded=False)") + // If the test requires /etc/yum.repos.d and /etc/pki/rpm-gpg, pull a Centos + // Stream 9 container image and populate them from there. This is intended to + // emulate the Red Hat Satellite enablement process, but does not actually + // require any Red Hat Satellite creds to work. + if testOpts.useYumRepos { + ocltesthelper.InjectYumRepos(t, cs, skipCleanupAlways, skipCleanupOnlyAfterFailure) + } - // Now trigger another build to test MCP status transitions when a new build starts - t.Logf("Triggering a third build to test MCP status transitions") + // Register ephemeral object cleanup function. + ocltesthelper.MakeIdempotentAndRegister(t, skipCleanupAlways, skipCleanupOnlyAfterFailure, func() { + ocltesthelper.CleanupEphemeralBuildObjects(t, cs) + }) - // Modify the containerfile slightly to trigger a new build - apiMosc, err = cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, mosc.Name, metav1.GetOptions{}) - require.NoError(t, err) + imagestreamObjMeta := metav1.ObjectMeta{ + Name: "os-image", + } - // Add a comment to the containerfile to change it and trigger a new build - modifiedDockerfile := cowsayDockerfile + "\n# Comment to trigger new build" - apiMosc.Spec.Containerfile = []mcfgv1.MachineOSContainerfile{ - { - ContainerfileArch: mcfgv1.NoArch, - Content: modifiedDockerfile, - }, + pushSecretName, finalPullspec, _ := ocltesthelper.SetupImageStream(t, cs, imagestreamObjMeta, skipCleanupAlways, skipCleanupOnlyAfterFailure) + + if testOpts.targetNode != nil { + ocltesthelper.MakeIdempotentAndRegister(t, skipCleanupAlways, skipCleanupOnlyAfterFailure, helpers.CreatePoolWithNode(t, cs, testOpts.poolName, *testOpts.targetNode)) + } else { + ocltesthelper.MakeIdempotentAndRegister(t, skipCleanupAlways, skipCleanupOnlyAfterFailure, helpers.CreateMCP(t, cs, testOpts.poolName)) } - updated, err = cs.MachineconfigurationV1Interface.MachineOSConfigs().Update(ctx, apiMosc, metav1.UpdateOptions{}) + mcNames := []string{"00-worker"} + if len(testOpts.machineConfigs) > 0 { + for _, mc := range testOpts.machineConfigs { + ocltesthelper.MakeIdempotentAndRegister(t, skipCleanupAlways, skipCleanupOnlyAfterFailure, helpers.ApplyMC(t, cs, mc)) + mcNames = append(mcNames, mc.Name) + } + } + + _, err := helpers.WaitForRenderedConfigs(t, cs, testOpts.poolName, mcNames...) require.NoError(t, err) - t.Logf("Modified containerfile, waiting for third build to start") + mosc := &mcfgv1.MachineOSConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: testOpts.poolName, + }, + Spec: mcfgv1.MachineOSConfigSpec{ + MachineConfigPool: mcfgv1.MachineConfigPoolReference{ + Name: testOpts.poolName, + }, + RenderedImagePushSecret: mcfgv1.ImageSecretObjectReference{ + Name: pushSecretName, + }, + RenderedImagePushSpec: mcfgv1.ImageTagFormat(finalPullspec), + ImageBuilder: mcfgv1.MachineOSImageBuilder{ + ImageBuilderType: mcfgv1.JobBuilder, + }, + Containerfile: []mcfgv1.MachineOSContainerfile{ + { + ContainerfileArch: mcfgv1.NoArch, + Content: testOpts.customDockerfiles[testOpts.poolName], + }, + }, + }, + } - // Get the updated MCP to compute the new build - mcp, err = cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, layeredMCPName, metav1.GetOptions{}) - require.NoError(t, err) + helpers.SetMetadataOnObject(t, mosc) - // Compute the new MachineOSBuild name for the third build - thirdMoscMosb := buildrequest.NewMachineOSBuildFromAPIOrDie(ctx, cs.GetKubeclient(), updated, mcp) + return mosc +} - // Wait for the third build to start - thirdMosb := waitForBuildToStart(t, cs, thirdMoscMosb) - t.Logf("Third build started: %s", thirdMosb.Name) +func createMachineOSConfig(t *testing.T, cs *framework.ClientSet, mosc *mcfgv1.MachineOSConfig) { + ocltesthelper.CreateMachineOSConfig(t, cs, mosc, skipCleanupAlways, skipCleanupOnlyAfterFailure) +} - // Verify MCP status during active build: - // Updated=False, Updating=True, Degraded=False, ImageBuildDegraded=False - buildingMcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, layeredMCPName, metav1.GetOptions{}) - require.NoError(t, err) +func applyMC(t *testing.T, cs *framework.ClientSet, mc *mcfgv1.MachineConfig) func() { + return ocltesthelper.ApplyMC(t, cs, mc, skipCleanupAlways, skipCleanupOnlyAfterFailure) +} - kubeassert.Eventually().MachineConfigPoolReachesState(buildingMcp, func(mcp *mcfgv1.MachineConfigPool, err error) (bool, error) { +func waitForMOSCToGetNewPullspec(ctx context.Context, t *testing.T, cs *framework.ClientSet, moscName, pullspec string) { + require.NoError(t, wait.PollImmediate(1*time.Second, 5*time.Minute, func() (bool, error) { + mosc, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, moscName, metav1.GetOptions{}) if err != nil { return false, err } - // During build, MCP should show: Updated=False, Updating=True - if apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdated) || - !apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdating) || - apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolDegraded) || - apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolImageBuildDegraded) { - return false, nil - } - t.Logf("MCP status during active build - Updated: %v, Updating: %v, Degraded: %v, ImageBuildDegraded: %v", - apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdated), - apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdating), - apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolDegraded), - apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolImageBuildDegraded)) - - return true, nil - }, "MCP should reach correct state during active build (Updated=False, Updating=True, Degraded=False, ImageBuildDegraded=False)") - - // Wait for the third build to complete successfully - finalBuild := waitForBuildToComplete(t, cs, thirdMosb) - t.Logf("Third build completed successfully: %s", finalBuild.Name) - - // Wait for the MachineOSConfig to get the new pullspec, which indicates full reconciliation - waitForMOSCToGetNewPullspec(ctx, t, cs, mosc.Name, string(finalBuild.Status.DigestedImagePushSpec)) - - // Final verification: MCP status should return to: - // Updated=True, Updating=False, Degraded=False, ImageBuildDegraded=False - finalMcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, layeredMCPName, metav1.GetOptions{}) - require.NoError(t, err) + return mosc.Status.CurrentImagePullSpec != "" && string(mosc.Status.CurrentImagePullSpec) == pullspec, nil + })) +} - kubeassert.Eventually().MachineConfigPoolReachesState(finalMcp, func(mcp *mcfgv1.MachineConfigPool, err error) (bool, error) { +func waitForMOSCToUpdateCurrentMOSB(ctx context.Context, t *testing.T, cs *framework.ClientSet, moscName, mosbName string) string { + var currentMOSB string + require.NoError(t, wait.PollImmediate(1*time.Second, 5*time.Minute, func() (bool, error) { + mosc, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, moscName, metav1.GetOptions{}) if err != nil { return false, err } - // Return false (keep polling) if conditions don't match expected state - if !apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdated) || - apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdating) || - apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolDegraded) || - apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolImageBuildDegraded) { - return false, nil - } - // Return true when expected state is reached - t.Logf("Final MCP status after third build completion - Updated: %v, Updating: %v, Degraded: %v, ImageBuildDegraded: %v", - apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdated), - apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolUpdating), - apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolDegraded), - apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, mcfgv1.MachineConfigPoolImageBuildDegraded)) - - return true, nil - }, "MCP should return to correct state after final build completion (Updated=True, Updating=False, Degraded=False, ImageBuildDegraded=False)") + currentMOSB = mosc.GetAnnotations()[constants.CurrentMachineOSBuildAnnotationKey] + return currentMOSB != mosbName, nil - t.Logf("All MCP status transitions verified successfully across build failure, success, and subsequent new build") + })) + return currentMOSB } -// TestCurrentMachineOSBuildAnnotationHandling tests that the node controller correctly uses the -// current-machine-os-build annotation on the MachineOSConfig to select the correct MOSB when -// multiple MOSBs exist for the same rendered MachineConfig. This can happen during rapid updates -// or when a rebuild annotation is applied. -func TestCurrentMachineOSBuildAnnotationHandling(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - t.Cleanup(cancel) - - cs := framework.NewClientSet("") +// Waits for a job object to be mapped to a given MachineOSBuild state. Will always fail the test if the job reaches a failed state unexpectedly. +func waitForJobToReachMOSBCondition(ctx context.Context, t *testing.T, cs *framework.ClientSet, jobName string, expectedCondition mcfgv1.BuildProgress) { + waitForJobToReachCondition(ctx, t, cs, jobName, func(job *batchv1.Job) (bool, error) { + buildprogress, _ := imagebuilder.MapJobStatusToBuildStatus(job) + if buildprogress == mcfgv1.MachineOSBuildFailed && expectedCondition != mcfgv1.MachineOSBuildFailed { + return false, fmt.Errorf("job %q failed unexpectedly", jobName) + } - // Setup: Create initial layered pool and build - mosc := prepareForOnClusterLayeringTest(t, cs, onClusterLayeringTestOpts{ - poolName: layeredMCPName, - customDockerfiles: map[string]string{ - layeredMCPName: simpleDockerfile, - }, + return expectedCondition == buildprogress, nil }) +} - createMachineOSConfig(t, cs, mosc) - - // Wait for the first build to complete - firstMosb := waitForBuildToStartForPoolAndConfig(t, cs, layeredMCPName, mosc.Name) - t.Logf("First MachineOSBuild %q has started", firstMosb.Name) - - firstFinishedBuild := waitForBuildToComplete(t, cs, firstMosb) - firstImagePullspec := string(firstFinishedBuild.Status.DigestedImagePushSpec) - t.Logf("First MachineOSBuild %q completed with image: %s", firstFinishedBuild.Name, firstImagePullspec) - - // Verify the MOSC has the current-machine-os-build annotation set to the first build - apiMosc, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, mosc.Name, metav1.GetOptions{}) - require.NoError(t, err) - currentBuildAnnotation := apiMosc.GetAnnotations()[constants.CurrentMachineOSBuildAnnotationKey] - assert.Equal(t, firstFinishedBuild.Name, currentBuildAnnotation, - "MOSC should have current-machine-os-build annotation pointing to first build") - t.Logf("Verified MOSC has current-machine-os-build annotation: %s", currentBuildAnnotation) - - // Trigger a second build by editing the MOSC (e.g., updating the containerfile) - // This does NOT create a new rendered MC, which is the scenario we're testing - t.Logf("Updating MachineOSConfig containerfile to trigger second build without new rendered MC") - apiMosc, err = cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, mosc.Name, metav1.GetOptions{}) - require.NoError(t, err) - - // Update the containerfile to trigger a rebuild - apiMosc.Spec.Containerfile = []mcfgv1.MachineOSContainerfile{ - { - ContainerfileArch: mcfgv1.NoArch, - Content: simpleDockerfile + "\nRUN echo 'test annotation handling' > /etc/test-annotation", - }, - } - apiMosc, err = cs.MachineconfigurationV1Interface.MachineOSConfigs().Update(ctx, apiMosc, metav1.UpdateOptions{}) - require.NoError(t, err) - t.Logf("Updated MachineOSConfig %q containerfile", apiMosc.Name) - - // Wait for the second build to start - t.Logf("Waiting for second build to start...") - secondMosbName := waitForMOSCToUpdateCurrentMOSB(ctx, t, cs, mosc.Name, firstMosb.Name) - secondMosb, err := cs.GetMcfgclient().MachineconfigurationV1().MachineOSBuilds().Get(ctx, secondMosbName, metav1.GetOptions{}) - require.NoError(t, err) - secondMosb = waitForBuildToStart(t, cs, secondMosb) - t.Logf("Second MachineOSBuild %q has started", secondMosb.Name) - - // At this point, both MOSBs exist: - // - firstMosb is completed (with original containerfile) - // - secondMosb is building (with updated containerfile, but SAME rendered MC) - // This is the critical scenario: multiple MOSBs for the same rendered MachineConfig +// Waits for a job object to reach a given state. +func waitForJobToReachCondition(ctx context.Context, t *testing.T, cs *framework.ClientSet, jobName string, condFunc func(*batchv1.Job) (bool, error)) { + require.NoError(t, wait.PollImmediate(1*time.Second, 20*time.Minute, func() (bool, error) { + job, err := cs.BatchV1Interface.Jobs(ctrlcommon.MCONamespace).Get(ctx, jobName, metav1.GetOptions{}) + if err != nil { + return false, err + } - // Verify that the MOSC annotation now points to the second build - apiMosc, err = cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, mosc.Name, metav1.GetOptions{}) - require.NoError(t, err) - currentBuildAnnotation = apiMosc.GetAnnotations()[constants.CurrentMachineOSBuildAnnotationKey] - assert.Equal(t, secondMosb.Name, currentBuildAnnotation, - "MOSC should have current-machine-os-build annotation pointing to second build") - t.Logf("Verified MOSC annotation updated to second build: %s", currentBuildAnnotation) + return condFunc(job) + })) +} - // List all MOSBs to confirm both exist - allMosbs, err := cs.MachineconfigurationV1Interface.MachineOSBuilds().List(ctx, metav1.ListOptions{}) - require.NoError(t, err) +// waitForImageBuildDegradedCondition waits for the ImageBuildDegraded condition to reach the expected state +func waitForImageBuildDegradedCondition(ctx context.Context, t *testing.T, cs *framework.ClientSet, poolName string, expectedStatus corev1.ConditionStatus) *mcfgv1.MachineConfigPoolCondition { + t.Helper() - mosbNames := []string{} - for _, mosb := range allMosbs.Items { - if mosb.Spec.MachineOSConfig.Name == mosc.Name { - mosbNames = append(mosbNames, mosb.Name) + var condition *mcfgv1.MachineConfigPoolCondition + require.NoError(t, wait.PollImmediate(1*time.Second, 2*time.Minute, func() (bool, error) { + mcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, poolName, metav1.GetOptions{}) + if err != nil { + return false, err } - } - t.Logf("Found %d MOSBs for MachineOSConfig %q: %v", len(mosbNames), mosc.Name, mosbNames) - assert.GreaterOrEqual(t, len(mosbNames), 2, "Should have at least 2 MOSBs at this point") - // The critical test: The node controller should use the MOSB specified by the annotation - // (secondMosb) even though firstMosb also exists and matches the MOSC name. - // This is implicitly tested by the fact that the pool status should reflect the second build. - // We verify this by checking the pool is waiting for the second build, not using the first. - - t.Logf("Verifying that pool targets the correct (second) build based on annotation") - // The pool should be waiting for the second build to complete, not using the first completed build - // We can verify this by checking that the pool doesn't have the first image in its status - - // Wait for the second build to complete - t.Logf("Waiting for second build to complete...") - secondFinishedBuild := waitForBuildToComplete(t, cs, secondMosb) - secondImagePullspec := string(secondFinishedBuild.Status.DigestedImagePushSpec) - t.Logf("Second MachineOSBuild %q completed with image: %s", secondFinishedBuild.Name, secondImagePullspec) + condition = apihelpers.GetMachineConfigPoolCondition(mcp.Status, mcfgv1.MachineConfigPoolImageBuildDegraded) + if condition == nil { + return false, nil + } - // Verify the images are different (proving we built a new image, not reusing the old one) - assert.NotEqual(t, firstImagePullspec, secondImagePullspec, - "First and second builds should produce different images") + return condition.Status == expectedStatus, nil + })) - // Verify that the MOSC status reflects the second build's image - waitForMOSCToGetNewPullspec(ctx, t, cs, mosc.Name, secondImagePullspec) - apiMosc, err = cs.MachineconfigurationV1Interface.MachineOSConfigs().Get(ctx, mosc.Name, metav1.GetOptions{}) - require.NoError(t, err) - assert.Equal(t, mcfgv1.ImageDigestFormat(secondImagePullspec), apiMosc.Status.CurrentImagePullSpec, - "MOSC status should have the second build's image pullspec") + return condition +} - // The critical test: Verify the node controller uses the annotation to select the correct MOSB - // Add a node to the pool and verify it gets the SECOND build's image, not the first - t.Logf("Adding node to pool to verify node controller uses annotation-based MOSB selection") - node := helpers.GetRandomNode(t, cs, "worker") +func archiveBuildPodLogs(t *testing.T, podLogsDirPath string) error { + archiveName := fmt.Sprintf("%s-pod-logs.tar.gz", helpers.SanitizeTestName(t)) - unlabelFunc := makeIdempotentAndRegisterAlwaysRun(t, helpers.LabelNode(t, cs, node, helpers.MCPNameToRole(layeredMCPName))) - defer unlabelFunc() + archive, err := helpers.NewArtifactArchive(t, archiveName) + if err != nil { + return err + } - // Wait for the node controller to update the node's desiredImage annotation - // The node controller should use the annotation on the MOSC to select the second MOSB - // and therefore set the desiredImage to the second build's image, NOT the first - t.Logf("Waiting for node %s to have desiredImage set to second build's image", node.Name) - helpers.WaitForNodeImageChange(t, cs, node, secondImagePullspec) + cmd := exec.Command("mv", podLogsDirPath, archive.StagingDir()) + output, err := cmd.CombinedOutput() + if err != nil { + t.Log(string(output)) + return err + } - // Verify the node's desiredImage annotation matches the second build - updatedNode, err := cs.CoreV1Interface.Nodes().Get(ctx, node.Name, metav1.GetOptions{}) - require.NoError(t, err) - desiredImage := updatedNode.Annotations[daemonconsts.DesiredImageAnnotationKey] - assert.Equal(t, secondImagePullspec, desiredImage, - "Node controller should use annotation to select second build, not first build") - t.Logf("Node controller correctly selected second build based on annotation") + return archive.WriteArchive() +} - // Also verify it's NOT the first build's image - assert.NotEqual(t, firstImagePullspec, desiredImage, - "Node should NOT have first build's image (would indicate annotation was ignored)") +// Helper function to check if a string contains a substring (used in SSH key test) +func contains(s, substr string) bool { + return len(s) >= len(substr) && (s == substr || len(s) > len(substr) && s[:len(substr)] == substr || s[len(s)-len(substr):] == substr || containsMiddle(s, substr)) +} - t.Logf("Successfully verified that node controller uses annotation-based build selection") +func containsMiddle(s, substr string) bool { + for i := 0; i <= len(s)-len(substr); i++ { + if s[i:i+len(substr)] == substr { + return true + } + } + return false } diff --git a/test/e2e-ocl/Containerfile.cowsay b/test/e2e-ocl-shared/Containerfile.cowsay similarity index 100% rename from test/e2e-ocl/Containerfile.cowsay rename to test/e2e-ocl-shared/Containerfile.cowsay diff --git a/test/e2e-ocl/Containerfile.entitled b/test/e2e-ocl-shared/Containerfile.entitled similarity index 100% rename from test/e2e-ocl/Containerfile.entitled rename to test/e2e-ocl-shared/Containerfile.entitled diff --git a/test/e2e-ocl/Containerfile.okd-fcos b/test/e2e-ocl-shared/Containerfile.okd-fcos similarity index 100% rename from test/e2e-ocl/Containerfile.okd-fcos rename to test/e2e-ocl-shared/Containerfile.okd-fcos diff --git a/test/e2e-ocl/Containerfile.simple b/test/e2e-ocl-shared/Containerfile.simple similarity index 100% rename from test/e2e-ocl/Containerfile.simple rename to test/e2e-ocl-shared/Containerfile.simple diff --git a/test/e2e-ocl/Containerfile.yum-repos-d b/test/e2e-ocl-shared/Containerfile.yum-repos-d similarity index 100% rename from test/e2e-ocl/Containerfile.yum-repos-d rename to test/e2e-ocl-shared/Containerfile.yum-repos-d diff --git a/test/e2e-ocl-shared/containerfiles.go b/test/e2e-ocl-shared/containerfiles.go new file mode 100644 index 0000000000..4b27fddd03 --- /dev/null +++ b/test/e2e-ocl-shared/containerfiles.go @@ -0,0 +1,30 @@ +package e2e_ocl_shared_test + +import ( + _ "embed" +) + +var ( + // Provides a Containerfile that installs cowsayusing the Centos Stream 9 + // EPEL repository to do so without requiring any entitlements. + //go:embed Containerfile.cowsay + CowsayDockerfile string + + // Provides a Containerfile that installs Buildah from the default RHCOS RPM + // repositories. If the installation succeeds, the entitlement certificate is + // working. + //go:embed Containerfile.entitled + EntitledDockerfile string + + // Provides a Containerfile that works similarly to the cowsay Dockerfile + // with the exception that the /etc/yum.repos.d and /etc/pki/rpm-gpg key + // content is mounted into the build context by the BuildController. + //go:embed Containerfile.yum-repos-d + YumReposDockerfile string + + //go:embed Containerfile.okd-fcos + OkdFcosDockerfile string + + //go:embed Containerfile.simple + SimpleDockerfile string +) diff --git a/test/e2e-ocl/helpers_test.go b/test/e2e-ocl-shared/helpers.go similarity index 77% rename from test/e2e-ocl/helpers_test.go rename to test/e2e-ocl-shared/helpers.go index 94b744558d..0816dfb974 100644 --- a/test/e2e-ocl/helpers_test.go +++ b/test/e2e-ocl-shared/helpers.go @@ -1,4 +1,4 @@ -package e2e_ocl_test +package e2e_ocl_shared_test import ( "bytes" @@ -6,7 +6,6 @@ import ( "errors" "fmt" "io" - "io/ioutil" "os" "os/exec" "path/filepath" @@ -43,17 +42,17 @@ var ( InspectMC = "inspect-mc" ) -func applyMC(t *testing.T, cs *framework.ClientSet, mc *mcfgv1.MachineConfig) func() { +func ApplyMC(t *testing.T, cs *framework.ClientSet, mc *mcfgv1.MachineConfig, skipCleanupAlways, skipCleanupOnlyAfterFailure bool) func() { cleanupFunc := helpers.ApplyMC(t, cs, mc) t.Logf("Created new MachineConfig %q", mc.Name) - return makeIdempotentAndRegister(t, func() { + return MakeIdempotentAndRegister(t, skipCleanupAlways, skipCleanupOnlyAfterFailure, func() { cleanupFunc() t.Logf("Deleted MachineConfig %q", mc.Name) }) } -func createMachineOSConfig(t *testing.T, cs *framework.ClientSet, mosc *mcfgv1.MachineOSConfig) func() { +func CreateMachineOSConfig(t *testing.T, cs *framework.ClientSet, mosc *mcfgv1.MachineOSConfig, skipCleanupAlways, skipCleanupOnlyAfterFailure bool) func() { helpers.SetMetadataOnObject(t, mosc) _, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().Create(context.TODO(), mosc, metav1.CreateOptions{}) @@ -61,7 +60,7 @@ func createMachineOSConfig(t *testing.T, cs *framework.ClientSet, mosc *mcfgv1.M t.Logf("Created MachineOSConfig %q", mosc.Name) - return makeIdempotentAndRegister(t, func() { + return MakeIdempotentAndRegister(t, skipCleanupAlways, skipCleanupOnlyAfterFailure, func() { require.NoError(t, cs.MachineconfigurationV1Interface.MachineOSConfigs().Delete(context.TODO(), mosc.Name, metav1.DeleteOptions{})) t.Logf("Deleted MachineOSConfig %q", mosc.Name) }) @@ -71,7 +70,7 @@ func createMachineOSConfig(t *testing.T, cs *framework.ClientSet, mosc *mcfgv1.M // namespace than the MCO, it will create the namespace and clone the pull // secret into the MCO namespace. Returns the name of the push secret used, the // image pullspec, and an idempotent cleanup function. -func setupImageStream(t *testing.T, cs *framework.ClientSet, objMeta metav1.ObjectMeta) (string, string, func()) { +func SetupImageStream(t *testing.T, cs *framework.ClientSet, objMeta metav1.ObjectMeta, skipCleanupAlways, skipCleanupOnlyAfterFailure bool) (string, string, func()) { t.Helper() cleanups := helpers.NewCleanupFuncs() @@ -92,14 +91,14 @@ func setupImageStream(t *testing.T, cs *framework.ClientSet, objMeta metav1.Obje // to do some additional steps. if objMeta.Namespace != ctrlcommon.MCONamespace { // Create the namespace. - cleanups.Add(createNamespace(t, cs, objMeta)) + cleanups.Add(CreateNamespace(t, cs, objMeta, skipCleanupAlways, skipCleanupOnlyAfterFailure)) // Wait for the builder service account to exist within the new namespace. - require.NoError(t, waitForServiceAccountToExist(cs, builderSAObjMeta)) + require.NoError(t, WaitForServiceAccountToExist(cs, builderSAObjMeta)) } // Create the Imagestream. - pullspec, isCleanupFunc := createImagestream(t, cs, objMeta) + pullspec, isCleanupFunc := CreateImagestream(t, cs, objMeta, skipCleanupAlways, skipCleanupOnlyAfterFailure) cleanups.Add(isCleanupFunc) // Create a long-lived image registry pull secret so that it will not get @@ -116,11 +115,11 @@ func setupImageStream(t *testing.T, cs *framework.ClientSet, objMeta metav1.Obje cleanups.Add(helpers.CreateLongLivedPullSecretForTest(context.TODO(), t, cs, opts)) - return pushSecretName, pullspec, makeIdempotentAndRegister(t, cleanups.Run) + return pushSecretName, pullspec, MakeIdempotentAndRegister(t, skipCleanupAlways, skipCleanupOnlyAfterFailure, cleanups.Run) } // Creates a namespace. Returns an idempotent cleanup function. -func createNamespace(t *testing.T, cs *framework.ClientSet, objMeta metav1.ObjectMeta) func() { +func CreateNamespace(t *testing.T, cs *framework.ClientSet, objMeta metav1.ObjectMeta, skipCleanupAlways, skipCleanupOnlyAfterFailure bool) func() { ns := &corev1.Namespace{ ObjectMeta: metav1.ObjectMeta{ Name: objMeta.Namespace, @@ -133,7 +132,7 @@ func createNamespace(t *testing.T, cs *framework.ClientSet, objMeta metav1.Objec require.NoError(t, err) t.Logf("Created namespace %q", ns.Name) - return makeIdempotentAndRegister(t, func() { + return MakeIdempotentAndRegister(t, skipCleanupAlways, skipCleanupOnlyAfterFailure, func() { require.NoError(t, cs.CoreV1Interface.Namespaces().Delete(context.TODO(), ns.Name, metav1.DeleteOptions{})) t.Logf("Deleted namespace %q", ns.Name) }) @@ -142,9 +141,10 @@ func createNamespace(t *testing.T, cs *framework.ClientSet, objMeta metav1.Objec // There may be a delay between the time a new namespace is created and its // service accounts to be created. This will wait up to one minute for the // specified service account to be created. -func waitForServiceAccountToExist(cs *framework.ClientSet, objMeta metav1.ObjectMeta) error { - return wait.PollImmediate(1*time.Second, 1*time.Minute, func() (bool, error) { - builderSA, err := cs.CoreV1Interface.ServiceAccounts(objMeta.Namespace).Get(context.TODO(), objMeta.Name, metav1.GetOptions{}) +func WaitForServiceAccountToExist(cs *framework.ClientSet, objMeta metav1.ObjectMeta) error { + ctx := context.Background() + return wait.PollUntilContextTimeout(ctx, 1*time.Second, 1*time.Minute, true, func(ctx context.Context) (bool, error) { + builderSA, err := cs.CoreV1Interface.ServiceAccounts(objMeta.Namespace).Get(ctx, objMeta.Name, metav1.GetOptions{}) if err != nil && !k8serrors.IsNotFound(err) { return false, err } @@ -156,7 +156,7 @@ func waitForServiceAccountToExist(cs *framework.ClientSet, objMeta metav1.Object // Creates an OpenShift ImageStream in the MCO namespace for the test and // registers a cleanup function. Returns the pullspec with the latest tag for // the newly-created ImageStream. -func createImagestream(t *testing.T, cs *framework.ClientSet, objMeta metav1.ObjectMeta) (string, func()) { +func CreateImagestream(t *testing.T, cs *framework.ClientSet, objMeta metav1.ObjectMeta, skipCleanupAlways, skipCleanupOnlyAfterFailure bool) (string, func()) { is := &imagev1.ImageStream{ ObjectMeta: metav1.ObjectMeta{ Name: objMeta.Name, @@ -173,14 +173,14 @@ func createImagestream(t *testing.T, cs *framework.ClientSet, objMeta metav1.Obj pullspec := fmt.Sprintf("%s:latest", created.Status.DockerImageRepository) t.Logf("Created ImageStream \"%s/%s\", has pullspec %q", is.Namespace, is.Name, pullspec) - return pullspec, makeIdempotentAndRegister(t, func() { + return pullspec, MakeIdempotentAndRegister(t, skipCleanupAlways, skipCleanupOnlyAfterFailure, func() { require.NoError(t, cs.ImageV1Interface.ImageStreams(is.Namespace).Delete(context.TODO(), is.Name, metav1.DeleteOptions{})) t.Logf("Deleted ImageStream \"%s/%s\"", is.Namespace, is.Name) }) } // Creates a given ConfigMap and registers a cleanup function to delete it. -func createConfigMap(t *testing.T, cs *framework.ClientSet, cm *corev1.ConfigMap) func() { +func CreateConfigMap(t *testing.T, cs *framework.ClientSet, cm *corev1.ConfigMap, skipCleanupAlways, skipCleanupOnlyAfterFailure bool) func() { helpers.SetMetadataOnObject(t, cm) _, err := cs.CoreV1Interface.ConfigMaps(cm.Namespace).Create(context.TODO(), cm, metav1.CreateOptions{}) @@ -188,14 +188,14 @@ func createConfigMap(t *testing.T, cs *framework.ClientSet, cm *corev1.ConfigMap t.Logf("Created ConfigMap \"%s/%s\"", cm.Namespace, cm.Name) - return makeIdempotentAndRegister(t, func() { + return MakeIdempotentAndRegister(t, skipCleanupAlways, skipCleanupOnlyAfterFailure, func() { require.NoError(t, cs.CoreV1Interface.ConfigMaps(cm.Namespace).Delete(context.TODO(), cm.Name, metav1.DeleteOptions{})) t.Logf("Deleted ConfigMap \"%s/%s\"", cm.Namespace, cm.Name) }) } // Creates a given Secret and registers a cleanup function to delete it. -func createSecret(t *testing.T, cs *framework.ClientSet, secret *corev1.Secret) func() { +func CreateSecret(t *testing.T, cs *framework.ClientSet, secret *corev1.Secret, skipCleanupAlways, skipCleanupOnlyAfterFailure bool) func() { helpers.SetMetadataOnObject(t, secret) _, err := cs.CoreV1Interface.Secrets(secret.Namespace).Create(context.TODO(), secret, metav1.CreateOptions{}) @@ -203,14 +203,14 @@ func createSecret(t *testing.T, cs *framework.ClientSet, secret *corev1.Secret) t.Logf("Created secret \"%s/%s\"", secret.Namespace, secret.Name) - return makeIdempotentAndRegister(t, func() { + return MakeIdempotentAndRegister(t, skipCleanupAlways, skipCleanupOnlyAfterFailure, func() { require.NoError(t, cs.CoreV1Interface.Secrets(ctrlcommon.MCONamespace).Delete(context.TODO(), secret.Name, metav1.DeleteOptions{})) t.Logf("Deleted secret \"%s/%s\"", secret.Namespace, secret.Name) }) } // Computes the name of the currently-running MachineOSBuild given a MachineConfigPool and MachineOSConfig. -func getMachineOSBuildNameForPool(cs *framework.ClientSet, poolName, moscName string) (string, error) { +func GetMachineOSBuildNameForPool(cs *framework.ClientSet, poolName, moscName string) (string, error) { mcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(context.TODO(), poolName, metav1.GetOptions{}) if err != nil { return "", err @@ -241,9 +241,10 @@ func getMachineOSBuildNameForPool(cs *framework.ClientSet, poolName, moscName st } // Waits for the target MachineConfigPool to reach a state defined in a supplied function. -func waitForPoolToReachState(t *testing.T, cs *framework.ClientSet, poolName string, condFunc func(*mcfgv1.MachineConfigPool) bool) { - err := wait.PollImmediate(1*time.Second, 10*time.Minute, func() (bool, error) { - mcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(context.TODO(), poolName, metav1.GetOptions{}) +func WaitForPoolToReachState(t *testing.T, cs *framework.ClientSet, poolName string, condFunc func(*mcfgv1.MachineConfigPool) bool) { + ctx := context.Background() + err := wait.PollUntilContextTimeout(ctx, 1*time.Second, 10*time.Minute, true, func(ctx context.Context) (bool, error) { + mcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, poolName, metav1.GetOptions{}) if err != nil { return false, err } @@ -257,7 +258,7 @@ func waitForPoolToReachState(t *testing.T, cs *framework.ClientSet, poolName str // Registers a cleanup function, making it idempotent, and wiring up the skip // cleanup checks which will cause cleanup to be skipped under certain // conditions. -func makeIdempotentAndRegister(t *testing.T, cleanupFunc func()) func() { +func MakeIdempotentAndRegister(t *testing.T, skipCleanupAlways, skipCleanupOnlyAfterFailure bool, cleanupFunc func()) func() { cfg := helpers.IdempotentConfig{ SkipAlways: skipCleanupAlways, SkipOnlyOnFailure: skipCleanupOnlyAfterFailure, @@ -273,12 +274,12 @@ func makeIdempotentAndRegister(t *testing.T, cleanupFunc func()) func() { // function is only called once despite there being multiple calls to the // returned function. If there is only one call to the returned function // anyway, use t.Cleanup() instead for clarity. -func makeIdempotentAndRegisterAlwaysRun(t *testing.T, cleanupFunc func()) func() { +func MakeIdempotentAndRegisterAlwaysRun(t *testing.T, cleanupFunc func()) func() { return helpers.MakeIdempotentAndRegister(t, cleanupFunc) } -// TOOD: Refactor into smaller functions. -func cleanupEphemeralBuildObjects(t *testing.T, cs *framework.ClientSet) { +// TODO: Refactor into smaller functions. +func CleanupEphemeralBuildObjects(t *testing.T, cs *framework.ClientSet) { labelSelector := utils.OSBuildSelector().String() // Any secrets that get created by BuildController should have different @@ -313,7 +314,16 @@ func cleanupEphemeralBuildObjects(t *testing.T, cs *framework.ClientSet) { moscList, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().List(context.TODO(), metav1.ListOptions{}) require.NoError(t, err) - kubeassert := helpers.AssertClientSet(t, cs) + // Helper function to create a fresh timeout context for each resource verification. + // Each resource gets its own 3-minute timeout to prevent slow deletions from + // consuming the timeout budget of other resources. Use a 5-second poll interval + // to significantly reduce API call rate and avoid exhausting the rate limiter + // (~36 attempts per resource, vs 120 attempts with 1s interval). + newCleanupAssertion := func() *helpers.Assertions { + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) + t.Cleanup(cancel) + return helpers.AssertClientSet(t, cs).WithContext(ctx).WithPollInterval(5 * time.Second).Eventually() + } if len(secretList.Items) == 0 { t.Logf("No build-time secrets to clean up") @@ -339,26 +349,36 @@ func cleanupEphemeralBuildObjects(t *testing.T, cs *framework.ClientSet) { t.Logf("No MachineOSConfigs to clean up") } - for _, item := range secretList.Items { - t.Logf("Cleaning up build-time Secret %s", item.Name) - require.NoError(t, deleteObject(context.TODO(), t, &item, cs.CoreV1Interface.Secrets(ctrlcommon.MCONamespace))) - kubeassert.SecretDoesNotExist(item.Name) + // Delete owners first (MachineOSConfigs, MachineOSBuilds) before their dependents. + // MachineOSBuilds have FinalizerDeleteDependents which blocks deletion until owned + // resources are gone. Deleting ConfigMaps/Secrets before their owners creates a race + // with Kubernetes GC that causes intermittent timeout failures. + for _, item := range moscList.Items { + t.Logf("Cleaning up MachineOSConfig %q", item.Name) + require.NoError(t, DeleteObject(context.TODO(), t, &item, cs.MachineconfigurationV1Interface.MachineOSConfigs())) + newCleanupAssertion().MachineOSConfigDoesNotExist(&item) } - for _, item := range cmList.Items { - t.Logf("Cleaning up ephemeral ConfigMap %q", item.Name) - require.NoError(t, deleteObject(context.TODO(), t, &item, cs.CoreV1Interface.ConfigMaps(ctrlcommon.MCONamespace))) - kubeassert.ConfigMapDoesNotExist(item.Name) + for _, item := range mosbList.Items { + t.Logf("Cleaning up MachineOSBuild %q", item.Name) + require.NoError(t, DeleteObject(context.TODO(), t, &item, cs.MachineconfigurationV1Interface.MachineOSBuilds())) + newCleanupAssertion().MachineOSBuildDoesNotExist(&item) + + // Also clean up the digest ConfigMap + t.Logf("Cleaning up ephemeral digest ConfigMap %q", utils.GetDigestConfigMapName(&item)) + require.NoError(t, CleanupDigestConfigMap(t, cs, &item)) + newCleanupAssertion().ConfigMapDoesNotExist(utils.GetDigestConfigMapName(&item)) } + // Now delete Jobs and their dependent resources for _, item := range jobList.Items { jobUID := string(item.UID) t.Logf("Cleaning up build job %q", item.Name) bgDeletion := metav1.DeletePropagationBackground - require.NoError(t, deleteObjectWithOpts(context.TODO(), t, &item, cs.BatchV1Interface.Jobs(ctrlcommon.MCONamespace), metav1.DeleteOptions{ + require.NoError(t, DeleteObjectWithOpts(context.TODO(), t, &item, cs.BatchV1Interface.Jobs(ctrlcommon.MCONamespace), metav1.DeleteOptions{ PropagationPolicy: &bgDeletion, })) - kubeassert.JobDoesNotExist(item.Name) + newCleanupAssertion().JobDoesNotExist(item.Name) // Delete any pods that were created by the job pods, err := cs.CoreV1Interface.Pods(ctrlcommon.MCONamespace).List(context.TODO(), metav1.ListOptions{ @@ -366,57 +386,53 @@ func cleanupEphemeralBuildObjects(t *testing.T, cs *framework.ClientSet) { }) require.NoError(t, err) for _, pod := range pods.Items { - require.NoError(t, deleteObject(context.TODO(), t, &pod, cs.CoreV1Interface.Pods(ctrlcommon.MCONamespace))) - kubeassert.PodDoesNotExist(pod.Name) + require.NoError(t, DeleteObject(context.TODO(), t, &pod, cs.CoreV1Interface.Pods(ctrlcommon.MCONamespace))) + newCleanupAssertion().PodDoesNotExist(pod.Name) } } for _, item := range podList.Items { t.Logf("Cleaning up build pod %q", item.Name) - require.NoError(t, deleteObject(context.TODO(), t, &item, cs.CoreV1Interface.Pods(ctrlcommon.MCONamespace))) - kubeassert.PodDoesNotExist(item.Name) + require.NoError(t, DeleteObject(context.TODO(), t, &item, cs.CoreV1Interface.Pods(ctrlcommon.MCONamespace))) + newCleanupAssertion().PodDoesNotExist(item.Name) } - for _, item := range moscList.Items { - t.Logf("Cleaning up MachineOSConfig %q", item.Name) - require.NoError(t, deleteObject(context.TODO(), t, &item, cs.MachineconfigurationV1Interface.MachineOSConfigs())) - kubeassert.MachineOSConfigDoesNotExist(&item) + // Clean up remaining ConfigMaps and Secrets after their owners are deleted + for _, item := range cmList.Items { + t.Logf("Cleaning up ephemeral ConfigMap %q", item.Name) + require.NoError(t, DeleteObject(context.TODO(), t, &item, cs.CoreV1Interface.ConfigMaps(ctrlcommon.MCONamespace))) + newCleanupAssertion().ConfigMapDoesNotExist(item.Name) } - for _, item := range mosbList.Items { - t.Logf("Cleaning up MachineOSBuild %q", item.Name) - require.NoError(t, deleteObject(context.TODO(), t, &item, cs.MachineconfigurationV1Interface.MachineOSBuilds())) - kubeassert.MachineOSBuildDoesNotExist(&item) - - // Also clean up the digest ConfigMap - t.Logf("Cleaning up ephemeral digest ConfigMap %q", utils.GetDigestConfigMapName(&item)) - require.NoError(t, cleanupDigestConfigMap(t, cs, &item)) - kubeassert.ConfigMapDoesNotExist(utils.GetDigestConfigMapName(&item)) + for _, item := range secretList.Items { + t.Logf("Cleaning up build-time Secret %s", item.Name) + require.NoError(t, DeleteObject(context.TODO(), t, &item, cs.CoreV1Interface.Secrets(ctrlcommon.MCONamespace))) + newCleanupAssertion().SecretDoesNotExist(item.Name) } // Clean up inspect MC if it exists machineConfig, err := cs.MachineConfigs().Get(context.TODO(), InspectMC, metav1.GetOptions{}) if err == nil { t.Logf("Cleaning up MachineConfig %q", InspectMC) - require.NoError(t, deleteObject(context.TODO(), t, machineConfig, cs.MachineConfigs())) - kubeassert.MachineConfigDoesNotExist(machineConfig) + require.NoError(t, DeleteObject(context.TODO(), t, machineConfig, cs.MachineConfigs())) + newCleanupAssertion().MachineConfigDoesNotExist(machineConfig) } } -type deleter interface { +type Deleter interface { Delete(context.Context, string, metav1.DeleteOptions) error } -type kubeObject interface { +type KubeObject interface { runtime.Object GetName() string } -func deleteObject(ctx context.Context, t *testing.T, obj kubeObject, deleter deleter) error { - return deleteObjectWithOpts(ctx, t, obj, deleter, metav1.DeleteOptions{}) +func DeleteObject(ctx context.Context, t *testing.T, obj KubeObject, deleter Deleter) error { + return DeleteObjectWithOpts(ctx, t, obj, deleter, metav1.DeleteOptions{}) } -func deleteObjectWithOpts(ctx context.Context, t *testing.T, obj kubeObject, deleter deleter, opts metav1.DeleteOptions) error { +func DeleteObjectWithOpts(ctx context.Context, t *testing.T, obj KubeObject, deleter Deleter, opts metav1.DeleteOptions) error { kind, err := utils.GetKindForObject(obj) if err != nil && kind == "" { kind = "" @@ -437,10 +453,10 @@ func deleteObjectWithOpts(ctx context.Context, t *testing.T, obj kubeObject, del return err } -func cleanupDigestConfigMap(t *testing.T, cs *framework.ClientSet, mosb *mcfgv1.MachineOSBuild) error { +func CleanupDigestConfigMap(t *testing.T, cs *framework.ClientSet, mosb *mcfgv1.MachineOSBuild) error { cm, err := cs.CoreV1Interface.ConfigMaps(ctrlcommon.MCONamespace).Get(context.TODO(), utils.GetDigestConfigMapName(mosb), metav1.GetOptions{}) if err == nil { - return deleteObject(context.TODO(), t, cm, cs.CoreV1Interface.ConfigMaps(ctrlcommon.MCONamespace)) + return DeleteObject(context.TODO(), t, cm, cs.CoreV1Interface.ConfigMaps(ctrlcommon.MCONamespace)) } if k8serrors.IsNotFound(err) { t.Logf("%s already cleaned up", utils.GetDigestConfigMapName(mosb)) @@ -456,7 +472,7 @@ func cleanupDigestConfigMap(t *testing.T, cs *framework.ClientSet, mosb *mcfgv1. // // If this env var is not set, these files will be written to the current // working directory. -func getBuildArtifactDir(t *testing.T) string { +func GetBuildArtifactDir(t *testing.T) string { artifactDir := os.Getenv("ARTIFACT_DIR") if artifactDir != "" { return artifactDir @@ -468,7 +484,7 @@ func getBuildArtifactDir(t *testing.T) string { } // Writes any ephemeral build objects to disk as YAML files. -func writeBuildArtifactsToFiles(t *testing.T, cs *framework.ClientSet, poolName string) { +func WriteBuildArtifactsToFiles(t *testing.T, cs *framework.ClientSet) { lo := metav1.ListOptions{ LabelSelector: utils.OSBuildSelector().String(), } @@ -479,10 +495,10 @@ func writeBuildArtifactsToFiles(t *testing.T, cs *framework.ClientSet, poolName require.NoError(t, err) err = aggerrs.NewAggregate([]error{ - writeConfigMapsToFile(t, cs, lo, archive.StagingDir()), - writeBuildPodsToFile(t, cs, lo, archive.StagingDir()), - writeMachineOSBuildsToFile(t, cs, archive.StagingDir()), - writeMachineOSConfigsToFile(t, cs, archive.StagingDir()), + WriteConfigMapsToFile(t, cs, lo, archive.StagingDir()), + WriteBuildPodsToFile(t, cs, lo, archive.StagingDir()), + WriteMachineOSBuildsToFile(t, cs, archive.StagingDir()), + WriteMachineOSConfigsToFile(t, cs, archive.StagingDir()), }) require.NoError(t, err, "could not write build artifacts to files, got: %s", err) @@ -491,7 +507,7 @@ func writeBuildArtifactsToFiles(t *testing.T, cs *framework.ClientSet, poolName } // Writes all MachineOSBuilds to a file. -func writeMachineOSBuildsToFile(t *testing.T, cs *framework.ClientSet, archiveDir string) error { +func WriteMachineOSBuildsToFile(t *testing.T, cs *framework.ClientSet, archiveDir string) error { mosbList, err := cs.MachineconfigurationV1Interface.MachineOSBuilds().List(context.TODO(), metav1.ListOptions{}) if err != nil { return err @@ -502,11 +518,11 @@ func writeMachineOSBuildsToFile(t *testing.T, cs *framework.ClientSet, archiveDi return nil } - return dumpObjectToYAMLFile(t, mosbList, filepath.Join(archiveDir, "machineosbuilds.yaml")) + return DumpObjectToYAMLFile(mosbList, filepath.Join(archiveDir, "machineosbuilds.yaml")) } // Writes all MachineOSConfigs to a file. -func writeMachineOSConfigsToFile(t *testing.T, cs *framework.ClientSet, archiveDir string) error { +func WriteMachineOSConfigsToFile(t *testing.T, cs *framework.ClientSet, archiveDir string) error { moscList, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().List(context.TODO(), metav1.ListOptions{}) if err != nil { return err @@ -517,11 +533,11 @@ func writeMachineOSConfigsToFile(t *testing.T, cs *framework.ClientSet, archiveD return nil } - return dumpObjectToYAMLFile(t, moscList, filepath.Join(archiveDir, "machineosconfigs.yaml")) + return DumpObjectToYAMLFile(moscList, filepath.Join(archiveDir, "machineosconfigs.yaml")) } // Writes all ConfigMaps that match the OS Build labels to files. -func writeConfigMapsToFile(t *testing.T, cs *framework.ClientSet, lo metav1.ListOptions, archiveDir string) error { +func WriteConfigMapsToFile(t *testing.T, cs *framework.ClientSet, lo metav1.ListOptions, archiveDir string) error { cmList, err := cs.CoreV1Interface.ConfigMaps(ctrlcommon.MCONamespace).List(context.TODO(), lo) if err != nil { @@ -533,11 +549,11 @@ func writeConfigMapsToFile(t *testing.T, cs *framework.ClientSet, lo metav1.List return nil } - return dumpObjectToYAMLFile(t, cmList, filepath.Join(archiveDir, "configmaps.yaml")) + return DumpObjectToYAMLFile(cmList, filepath.Join(archiveDir, "configmaps.yaml")) } // Wrttes all pod specs that match the OS Build labels to files. -func writeBuildPodsToFile(t *testing.T, cs *framework.ClientSet, lo metav1.ListOptions, archiveDir string) error { +func WriteBuildPodsToFile(t *testing.T, cs *framework.ClientSet, lo metav1.ListOptions, archiveDir string) error { podList, err := cs.CoreV1Interface.Pods(ctrlcommon.MCONamespace).List(context.TODO(), lo) if err != nil { return err @@ -548,12 +564,12 @@ func writeBuildPodsToFile(t *testing.T, cs *framework.ClientSet, lo metav1.ListO return nil } - return dumpObjectToYAMLFile(t, podList, filepath.Join(archiveDir, "pods.yaml")) + return DumpObjectToYAMLFile(podList, filepath.Join(archiveDir, "pods.yaml")) } // Dumps a struct to the provided filename in YAML format, creating any // parent directories as needed. -func dumpObjectToYAMLFile(t *testing.T, obj interface{}, filename string) error { +func DumpObjectToYAMLFile(obj interface{}, filename string) error { if err := os.MkdirAll(filepath.Dir(filename), 0o755); err != nil { return err } @@ -569,7 +585,7 @@ func dumpObjectToYAMLFile(t *testing.T, obj interface{}, filename string) error // Streams the logs from the Machine OS Builder pod containers to a set of // files. This can provide a valuable window into how / why the e2e test suite // failed. -func streamMachineOSBuilderPodLogsToFile(ctx context.Context, t *testing.T, cs *framework.ClientSet, dirPath string) error { +func StreamMachineOSBuilderPodLogsToFile(ctx context.Context, t *testing.T, cs *framework.ClientSet, dirPath string) error { pods, err := cs.CoreV1Interface.Pods(ctrlcommon.MCONamespace).List(ctx, metav1.ListOptions{ LabelSelector: "k8s-app=machine-os-builder", }) @@ -577,24 +593,24 @@ func streamMachineOSBuilderPodLogsToFile(ctx context.Context, t *testing.T, cs * require.NoError(t, err) mobPod := &pods.Items[0] - return streamPodContainerLogsToFile(ctx, t, cs, mobPod, dirPath) + return StreamPodContainerLogsToFile(ctx, t, cs, mobPod, dirPath) } // Streams the logs for all of the containers running in the build pod. The pod // logs can provide a valuable window into how / why a given build failed. -func streamBuildPodLogsToFile(ctx context.Context, t *testing.T, cs *framework.ClientSet, mosb *mcfgv1.MachineOSBuild, dirPath string) error { +func StreamBuildPodLogsToFile(ctx context.Context, t *testing.T, cs *framework.ClientSet, mosb *mcfgv1.MachineOSBuild, dirPath string) error { jobName := mosb.Status.Builder.Job.Name - pod, err := getPodFromJob(ctx, cs, jobName) + pod, err := GetPodFromJob(ctx, cs, jobName) if err != nil { return err } - return streamPodContainerLogsToFile(ctx, t, cs, pod, dirPath) + return StreamPodContainerLogsToFile(ctx, t, cs, pod, dirPath) } // Returns a list of pods that match a given job name. -func listPodsForJob(ctx context.Context, cs *framework.ClientSet, jobName string) (*corev1.PodList, error) { +func ListPodsForJob(ctx context.Context, cs *framework.ClientSet, jobName string) (*corev1.PodList, error) { job, err := cs.BatchV1Interface.Jobs(ctrlcommon.MCONamespace).Get(ctx, jobName, metav1.GetOptions{}) if err != nil { return nil, fmt.Errorf("could not get job %s: %w", job, err) @@ -609,8 +625,8 @@ func listPodsForJob(ctx context.Context, cs *framework.ClientSet, jobName string } // Retrieves the currently running build pod for a given job name. -func getPodFromJob(ctx context.Context, cs *framework.ClientSet, jobName string) (*corev1.Pod, error) { - podList, err := listPodsForJob(ctx, cs, jobName) +func GetPodFromJob(ctx context.Context, cs *framework.ClientSet, jobName string) (*corev1.Pod, error) { + podList, err := ListPodsForJob(ctx, cs, jobName) if err != nil { return nil, fmt.Errorf("could not list pods for job %s: %w", jobName, err) } @@ -623,7 +639,7 @@ func getPodFromJob(ctx context.Context, cs *framework.ClientSet, jobName string) // this is needed when we test the case for a new pod being created after deleting the existing one // as sometimes it takes time for the old pod to be completely deleted for _, pod := range podList.Items { - if isBuildPodRunning(&pod) { + if IsBuildPodRunning(&pod) { return &pod, nil } } @@ -634,7 +650,7 @@ func getPodFromJob(ctx context.Context, cs *framework.ClientSet, jobName string) // Determines if a build pod is running by first examining the init container // statuses and then the main container statuses. -func isBuildPodRunning(pod *corev1.Pod) bool { +func IsBuildPodRunning(pod *corev1.Pod) bool { for _, status := range pod.Status.InitContainerStatuses { if status.State.Running != nil { return true @@ -652,7 +668,7 @@ func isBuildPodRunning(pod *corev1.Pod) bool { // getJobForMOSB returns the name of the job that was created for the given MOSB by comparing the job UID // to the UID stored in the MOSB annotation -func getJobForMOSB(ctx context.Context, cs *framework.ClientSet, build *mcfgv1.MachineOSBuild) (string, error) { +func GetJobForMOSB(ctx context.Context, cs *framework.ClientSet, build *mcfgv1.MachineOSBuild) (string, error) { jobName := "" mosbJobUID := "" @@ -686,7 +702,7 @@ func getJobForMOSB(ctx context.Context, cs *framework.ClientSet, build *mcfgv1.M // Attaches a follower to each of the containers within a given pod in order to // stream their logs to disk for future debugging. -func streamPodContainerLogsToFile(ctx context.Context, t *testing.T, cs *framework.ClientSet, pod *corev1.Pod, dirPath string) error { +func StreamPodContainerLogsToFile(ctx context.Context, t *testing.T, cs *framework.ClientSet, pod *corev1.Pod, dirPath string) error { errGroup, egCtx := errgroup.WithContext(ctx) // Stream logs from init containers @@ -695,7 +711,7 @@ func streamPodContainerLogsToFile(ctx context.Context, t *testing.T, cs *framewo pod := pod.DeepCopy() errGroup.Go(func() error { - return streamContainerLogToFile(egCtx, t, cs, pod, container, dirPath) + return StreamContainerLogToFile(egCtx, t, cs, pod, container, dirPath) }) } @@ -708,7 +724,7 @@ func streamPodContainerLogsToFile(ctx context.Context, t *testing.T, cs *framewo // blocks the current Goroutine. So we run each log stream operation in a // separate Goroutine to avoid blocking the main Goroutine. errGroup.Go(func() error { - return streamContainerLogToFile(egCtx, t, cs, pod, container, dirPath) + return StreamContainerLogToFile(egCtx, t, cs, pod, container, dirPath) }) } @@ -721,7 +737,7 @@ func streamPodContainerLogsToFile(ctx context.Context, t *testing.T, cs *framewo } // Streams the logs for a given container to a file. -func streamContainerLogToFile(ctx context.Context, t *testing.T, cs *framework.ClientSet, pod *corev1.Pod, container corev1.Container, dirPath string) error { +func StreamContainerLogToFile(ctx context.Context, t *testing.T, cs *framework.ClientSet, pod *corev1.Pod, container corev1.Container, dirPath string) error { // Wait for the container to be ready to stream logs for { select { @@ -736,7 +752,7 @@ func streamContainerLogToFile(ctx context.Context, t *testing.T, cs *framework.C if err != nil { // If the container is waiting to start (e.g., PodInitializing), wait and retry if strings.Contains(err.Error(), "waiting to start") || strings.Contains(err.Error(), "PodInitializing") { - time.Sleep(2 * time.Second) + time.Sleep(5 * time.Second) continue } return fmt.Errorf("could not get logs for container %s in pod %s: %w", container.Name, pod.Name, err) @@ -764,7 +780,7 @@ func streamContainerLogToFile(ctx context.Context, t *testing.T, cs *framework.C // Skips a given test if it is detected that the cluster is running OKD. We // skip these tests because they're either irrelevant for OKD or would fail. -func skipOnOKD(t *testing.T) { +func SkipOnOKD(t *testing.T) { cs := framework.NewClientSet("") isOKD, err := helpers.IsOKDCluster(cs) @@ -776,7 +792,7 @@ func skipOnOKD(t *testing.T) { } } -func skipOnOCP(t *testing.T) { +func SkipOnOCP(t *testing.T) { cs := framework.NewClientSet("") isOKD, err := helpers.IsOKDCluster(cs) require.NoError(t, err) @@ -790,7 +806,7 @@ func skipOnOCP(t *testing.T) { // Extracts the contents of a directory within a given container to a temporary // directory. Next, it loads them into a bytes map keyed by filename. It does // not handle nested directories, so use with caution. -func convertFilesFromContainerImageToBytesMap(t *testing.T, pullspec, containerFilepath string) map[string][]byte { +func ConvertFilesFromContainerImageToBytesMap(t *testing.T, pullspec, containerFilepath string) map[string][]byte { tempDir := t.TempDir() path := fmt.Sprintf("%s:%s", containerFilepath, tempDir) @@ -813,7 +829,7 @@ func convertFilesFromContainerImageToBytesMap(t *testing.T, pullspec, containerF return nil } - contents, err := ioutil.ReadFile(path) + contents, err := os.ReadFile(path) if err != nil { return err } @@ -833,7 +849,7 @@ func convertFilesFromContainerImageToBytesMap(t *testing.T, pullspec, containerF } // Skips the test if the entitlement secret is not present. -func skipIfEntitlementNotPresent(t *testing.T, cs *framework.ClientSet) { +func SkipIfEntitlementNotPresent(t *testing.T, cs *framework.ClientSet) { _, err := cs.CoreV1Interface.Secrets(constants.EtcPkiEntitlementSecretName).Get(context.TODO(), ctrlcommon.OpenshiftConfigManagedNamespace, metav1.GetOptions{}) if k8serrors.IsNotFound(err) { @@ -850,17 +866,17 @@ func skipIfEntitlementNotPresent(t *testing.T, cs *framework.ClientSet) { // ConfigMap and Secret, respectively. This is so that the build process will // consume those objects as part of the build process, injecting them into the // build context. -func injectYumRepos(t *testing.T, cs *framework.ClientSet) func() { +func InjectYumRepos(t *testing.T, cs *framework.ClientSet, skipCleanupAlways, skipCleanupOnlyAfterFailure bool) func() { tempDir := t.TempDir() yumReposPath := filepath.Join(tempDir, "yum-repos-d") require.NoError(t, os.MkdirAll(yumReposPath, 0o755)) centosPullspec := "quay.io/centos/centos:stream9" - yumReposContents := convertFilesFromContainerImageToBytesMap(t, centosPullspec, "/etc/yum.repos.d/") - rpmGpgContents := convertFilesFromContainerImageToBytesMap(t, centosPullspec, "/etc/pki/rpm-gpg/") + yumReposContents := ConvertFilesFromContainerImageToBytesMap(t, centosPullspec, "/etc/yum.repos.d/") + rpmGpgContents := ConvertFilesFromContainerImageToBytesMap(t, centosPullspec, "/etc/pki/rpm-gpg/") - configMapCleanupFunc := createConfigMap(t, cs, &corev1.ConfigMap{ + configMapCleanupFunc := CreateConfigMap(t, cs, &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: "etc-yum-repos-d", Namespace: ctrlcommon.MCONamespace, @@ -871,23 +887,23 @@ func injectYumRepos(t *testing.T, cs *framework.ClientSet) func() { // because the Build Pod will use its contents the same regardless of // whether its string data or binary data. BinaryData: yumReposContents, - }) + }, skipCleanupAlways, skipCleanupOnlyAfterFailure) - secretCleanupFunc := createSecret(t, cs, &corev1.Secret{ + secretCleanupFunc := CreateSecret(t, cs, &corev1.Secret{ ObjectMeta: metav1.ObjectMeta{ Name: "etc-pki-rpm-gpg", Namespace: ctrlcommon.MCONamespace, }, Data: rpmGpgContents, - }) + }, skipCleanupAlways, skipCleanupOnlyAfterFailure) - return makeIdempotentAndRegister(t, func() { + return MakeIdempotentAndRegister(t, skipCleanupAlways, skipCleanupOnlyAfterFailure, func() { configMapCleanupFunc() secretCleanupFunc() }) } -func newMachineConfig(name, pool string) *mcfgv1.MachineConfig { +func NewMachineConfig(name, pool string) *mcfgv1.MachineConfig { mode := 420 testfiledata := fmt.Sprintf("data:,%s-%s", name, pool) path := fmt.Sprintf("/etc/%s-%s", name, pool) @@ -907,20 +923,20 @@ func newMachineConfig(name, pool string) *mcfgv1.MachineConfig { } // newMachineConfigWithExtensions returns the same base MC, but adds the given extensions to trigger an image rebuild -func newMachineConfigTriggersImageRebuild(name, pool string, exts []string) *mcfgv1.MachineConfig { - mc := newMachineConfig(name, pool) +func NewMachineConfigTriggersImageRebuild(name, pool string, exts []string) *mcfgv1.MachineConfig { + mc := NewMachineConfig(name, pool) mc.Spec.Extensions = append(mc.Spec.Extensions, exts...) return mc } // newMachineConfigWithKernelType returns the same base MC, but adds the given kernel -func newMachineConfigWithKernelType(name, pool, kernelType string) *mcfgv1.MachineConfig { - mc := newMachineConfig(name, pool) +func NewMachineConfigWithKernelType(name, pool, kernelType string) *mcfgv1.MachineConfig { + mc := NewMachineConfig(name, pool) mc.Spec.KernelType = kernelType return mc } -func compareKernelType(t *testing.T, foundKernel, requiredKernelType string) bool { +func CompareKernelType(t *testing.T, foundKernel, requiredKernelType string) bool { switch requiredKernelType { case ctrlcommon.KernelTypeDefault: return !strings.Contains(foundKernel, "rt") && !strings.Contains(foundKernel, "64k") @@ -945,7 +961,7 @@ func compareKernelType(t *testing.T, foundKernel, requiredKernelType string) boo // reason why we look up that image is because it does not require a pull // secret in order to get its digest, which the BaseOSImagePullspec field // requires. -func getImagePullspecForFailureTest(ctx context.Context, cs *framework.ClientSet) (string, error) { +func GetImagePullspecForFailureTest(ctx context.Context, cs *framework.ClientSet) (string, error) { images, err := ctrlcommon.GetImagesConfig(ctx, cs.GetKubeclient()) if err != nil { return "", err @@ -960,13 +976,13 @@ func getImagePullspecForFailureTest(ctx context.Context, cs *framework.ClientSet case reference.Digested: return images.MachineConfigOperator, nil case reference.Tagged: - return resolveTaggedPullspecToDigestedPullspec(ctx, "registry.fedoraproject.org/fedora:latest") + return ResolveTaggedPullspecToDigestedPullspec(ctx, "registry.fedoraproject.org/fedora:latest") default: return "", fmt.Errorf("unknown image reference spec %q", images.MachineConfigOperator) } } -func getBadContainerFileForFailureTest() []mcfgv1.MachineOSContainerfile { +func GetBadContainerFileForFailureTest() []mcfgv1.MachineOSContainerfile { return []mcfgv1.MachineOSContainerfile{{ ContainerfileArch: mcfgv1.NoArch, Content: "THIS IS A BAD CONTAINERFILE", @@ -977,7 +993,7 @@ func getBadContainerFileForFailureTest() []mcfgv1.MachineOSContainerfile { // supplied image pullspec. Note: Only supports public image registries. This // is the same as doing: // $ skopeo inspect docker://image-pullspec | jq '.Digest' -func resolveTaggedPullspecToDigestedPullspec(ctx context.Context, pullspec string) (string, error) { +func ResolveTaggedPullspecToDigestedPullspec(ctx context.Context, pullspec string) (string, error) { sysCtx := &types.SystemContext{} tagged, err := docker.ParseReference("//" + pullspec) @@ -1001,7 +1017,7 @@ func resolveTaggedPullspecToDigestedPullspec(ctx context.Context, pullspec strin // TODO: Deduplicate this definition from machine-config-operator/devex/internal/pkg/rollout/rollout.go // Having "internal" in the module path prevents us from reusing it here since // it is internal to the devex directory. -func setDeploymentReplicas(t *testing.T, cs *framework.ClientSet, deployment metav1.ObjectMeta, replicas int32) error { +func SetDeploymentReplicas(t *testing.T, cs *framework.ClientSet, deployment metav1.ObjectMeta, replicas int32) error { return retry.RetryOnConflict(retry.DefaultBackoff, func() error { t.Logf("Setting replicas for %s/%s to %d", deployment.Namespace, deployment.Name, replicas) scale, err := cs.AppsV1Interface.Deployments(deployment.Namespace).GetScale(context.TODO(), deployment.Name, metav1.GetOptions{}) @@ -1019,7 +1035,7 @@ func setDeploymentReplicas(t *testing.T, cs *framework.ClientSet, deployment met // Scales down the machine-os-builder, machine-config-opreator, and // cluster-version-operator deployments. Registers and returns an idempotent // function that will scale the deployments back to their original values. -func scaleDownDeployments(t *testing.T, cs *framework.ClientSet) func() { +func ScaleDownDeployments(t *testing.T, cs *framework.ClientSet) func() { deployments := []metav1.ObjectMeta{ // Scale down the cluster-version-operator since it could set the desired // replicas for the MCO to 1. @@ -1044,7 +1060,7 @@ func scaleDownDeployments(t *testing.T, cs *framework.ClientSet) func() { restoreFuncs := []func(){} for _, deployment := range deployments { - restoreFuncs = append(restoreFuncs, scaleDownDeployment(t, cs, deployment)) + restoreFuncs = append(restoreFuncs, ScaleDownDeployment(t, cs, deployment)) } return helpers.MakeIdempotentAndRegister(t, func() { @@ -1063,7 +1079,7 @@ func scaleDownDeployments(t *testing.T, cs *framework.ClientSet) func() { // replicas, in which case it no-ops. Registers and returns an idempotent // restoral function that will revert the deployment back to its original // setting. -func scaleDownDeployment(t *testing.T, cs *framework.ClientSet, deployment metav1.ObjectMeta) func() { +func ScaleDownDeployment(t *testing.T, cs *framework.ClientSet, deployment metav1.ObjectMeta) func() { ctx := context.TODO() originalDeployment, err := cs.AppsV1Interface.Deployments(deployment.Namespace).Get(ctx, deployment.Name, metav1.GetOptions{}) @@ -1081,26 +1097,26 @@ func scaleDownDeployment(t *testing.T, cs *framework.ClientSet, deployment metav }) } - require.NoError(t, setDeploymentReplicas(t, cs, deployment, 0)) + require.NoError(t, SetDeploymentReplicas(t, cs, deployment, 0)) return helpers.MakeIdempotentAndRegister(t, func() { - require.NoError(t, setDeploymentReplicas(t, cs, deployment, originalReplicas)) + require.NoError(t, SetDeploymentReplicas(t, cs, deployment, originalReplicas)) }) } // forceMachineOSBuildToFail() repeatedly deletes the build pod associated // with the given MachineOSBuild so that the job will fail. -func forceMachineOSBuildToFail(ctx context.Context, t *testing.T, cs *framework.ClientSet, mosb *mcfgv1.MachineOSBuild) error { +func ForceMachineOSBuildToFail(ctx context.Context, t *testing.T, cs *framework.ClientSet, mosb *mcfgv1.MachineOSBuild) error { start := time.Now() - jobName, err := getJobForMOSB(ctx, cs, mosb) + jobName, err := GetJobForMOSB(ctx, cs, mosb) if err != nil { return fmt.Errorf("could not identify job for MachineOSBuild %s: %w", mosb.Name, err) } t.Logf("Found job %s for MachineOSBuild %s, will delete pods belonging to this job to cause build failure", jobName, mosb.Name) - return wait.PollImmediate(1*time.Second, 5*time.Minute, func() (bool, error) { + return wait.PollUntilContextTimeout(ctx, 1*time.Second, 5*time.Minute, true, func(ctx context.Context) (bool, error) { job, err := cs.BatchV1Interface.Jobs(ctrlcommon.MCONamespace).Get(ctx, jobName, metav1.GetOptions{}) if err != nil { return false, fmt.Errorf("could not get job %s for MachineOSBuild %s: %w", jobName, mosb.Name, err) @@ -1113,7 +1129,7 @@ func forceMachineOSBuildToFail(ctx context.Context, t *testing.T, cs *framework. } } - podList, err := listPodsForJob(ctx, cs, jobName) + podList, err := ListPodsForJob(ctx, cs, jobName) if err != nil { return false, fmt.Errorf("could not list pods for job %s: %w", jobName, err) } From a5b20d957b972299e52560f7dc210d22cdf2feda Mon Sep 17 00:00:00 2001 From: Urvashi Date: Wed, 6 May 2026 10:22:58 -0400 Subject: [PATCH 2/3] Fix e2e-ocl-1of2 flake where MOSB is deleted by stale controller event TestMachineOSConfigChangeRestartsBuild flakes because the build controller (running 3 concurrent workers) races between deleteMachineOSConfig for the previous test's MOSC and addMachineOSConfig for the new test's MOSC. Since both MOSCs share the same name, deleteMachineOSConfig's label-based query matches and deletes MOSBs created by the new MOSC. Fix both the controller and test assertions. Signed-off-by: Urvashi --- pkg/controller/build/reconciler.go | 7 +++++++ test/helpers/assertions.go | 3 +++ 2 files changed, 10 insertions(+) diff --git a/pkg/controller/build/reconciler.go b/pkg/controller/build/reconciler.go index d3da1cf799..3e775d7fa8 100644 --- a/pkg/controller/build/reconciler.go +++ b/pkg/controller/build/reconciler.go @@ -191,6 +191,13 @@ func (b *buildReconciler) deleteMachineOSConfig(ctx context.Context, mosc *mcfgv } for _, mosb := range mosbList { + // Only delete MOSBs owned by this specific MOSC instance. A new MOSC + // with the same name (but different UID) may already exist and have + // created new MOSBs that match the same label selector. + if !metav1.IsControlledBy(mosb, mosc) { + klog.Infof("Skipping MachineOSBuild %s: not owned by deleted MachineOSConfig %s (UID %s)", mosb.Name, mosc.Name, mosc.UID) + continue + } if err := b.deleteMachineOSBuild(ctx, mosb); err != nil { return fmt.Errorf("could not delete MachineOSBuild %s for MachineOSConfig %s: %w", mosb.Name, mosc.Name, err) } diff --git a/test/helpers/assertions.go b/test/helpers/assertions.go index d2e222b940..9e9e6175f7 100644 --- a/test/helpers/assertions.go +++ b/test/helpers/assertions.go @@ -674,6 +674,9 @@ func (a *Assertions) machineOSBuildHasConditionTrue(mosb *mcfgv1.MachineOSBuild, stateFunc := func(apiMosb *mcfgv1.MachineOSBuild, err error) (bool, error) { if err != nil { + if a.poll && k8serrors.IsNotFound(err) { + return false, nil + } return false, err } From 49e33c43d24588b43d25e20432f980d8e31fc576 Mon Sep 17 00:00:00 2001 From: Urvashi Date: Tue, 2 Jun 2026 13:52:08 -0400 Subject: [PATCH 3/3] Restore assertBuildJobIsAsExpected and fix test inconsistencies 1. Restore assertBuildJobIsAsExpected function and its call sites in both shards. This was present in the 4.21 monolithic suite and validates that build pods use the correct MCO container images. 2. Fix waitForMOSCToUpdateCurrentMOSB in 2of2 to include the empty annotation guard (currentMOSB != "") matching 1of2, so it handles temporarily cleared annotations correctly. 3. Restore strings.Contains in TestSSHKeyAndPasswordForOSBuilder instead of the custom contains() helper from main, matching the original 4.21 implementation. Signed-off-by: Urvashi --- test/e2e-ocl-1of2/onclusterlayering_test.go | 28 +++++++++++++ test/e2e-ocl-2of2/onclusterlayering_test.go | 44 ++++++++++++++------- 2 files changed, 57 insertions(+), 15 deletions(-) diff --git a/test/e2e-ocl-1of2/onclusterlayering_test.go b/test/e2e-ocl-1of2/onclusterlayering_test.go index db173f0489..d4d3c7a890 100644 --- a/test/e2e-ocl-1of2/onclusterlayering_test.go +++ b/test/e2e-ocl-1of2/onclusterlayering_test.go @@ -189,6 +189,7 @@ func TestMissingImageIsRebuilt(t *testing.T) { require.NoError(t, err) secondMOSB = waitForBuildToStart(t, cs, secondMOSB) t.Logf("MachineOSBuild %q has started", secondMOSB.Name) + assertBuildJobIsAsExpected(t, cs, secondMOSB) // Wait for the build to finish t.Logf("Waiting for 2nd build completion...") @@ -219,6 +220,7 @@ func TestMissingImageIsRebuilt(t *testing.T) { require.NoError(t, err) thirdMOSB = waitForBuildToStart(t, cs, thirdMOSB) t.Logf("MachineOSBuild %q has started (rebuild of image1)", thirdMOSB.Name) + assertBuildJobIsAsExpected(t, cs, thirdMOSB) // Wait for the build to finish t.Logf("Waiting for 3rd build completion...") @@ -522,6 +524,8 @@ func runOnClusterLayeringTest(t *testing.T, testOpts onClusterLayeringTestOpts) startedBuild := waitForBuildToStartForPoolAndConfig(t, cs, testOpts.poolName, mosc.Name) t.Logf("MachineOSBuild %q has started", startedBuild.Name) + assertBuildJobIsAsExpected(t, cs, startedBuild) + t.Logf("Waiting for build completion...") // Create a child context for the build pod log streamer. This is so we can @@ -625,6 +629,30 @@ func waitForBuildToStartForPoolAndConfig(t *testing.T, cs *framework.ClientSet, return waitForBuildToStart(t, cs, mosb) } +func assertBuildJobIsAsExpected(t *testing.T, cs *framework.ClientSet, mosb *mcfgv1.MachineOSBuild) { + t.Helper() + + osImageURLConfig, err := ctrlcommon.GetOSImageURLConfig(context.TODO(), cs.GetKubeclient()) + require.NoError(t, err) + + mcoImages, err := ctrlcommon.GetImagesConfig(context.TODO(), cs.GetKubeclient()) + require.NoError(t, err) + + buildPod, err := ocltesthelper.GetPodFromJob(context.TODO(), cs, mosb.Status.Builder.Job.Name) + require.NoError(t, err) + + assertContainerIsUsingExpectedImage := func(c corev1.Container, containerName, expectedImage string) { + if c.Name == containerName { + assert.Equal(t, c.Image, expectedImage) + } + } + + for _, container := range buildPod.Spec.Containers { + assertContainerIsUsingExpectedImage(container, "image-build", mcoImages.MachineConfigOperator) + assertContainerIsUsingExpectedImage(container, "wait-for-done", osImageURLConfig.BaseOSContainerImage) + } +} + // Waits for a MachineOSBuild to start building. func waitForBuildToStart(t *testing.T, cs *framework.ClientSet, build *mcfgv1.MachineOSBuild) *mcfgv1.MachineOSBuild { t.Helper() diff --git a/test/e2e-ocl-2of2/onclusterlayering_test.go b/test/e2e-ocl-2of2/onclusterlayering_test.go index 6ee4dea5cd..20e019537f 100644 --- a/test/e2e-ocl-2of2/onclusterlayering_test.go +++ b/test/e2e-ocl-2of2/onclusterlayering_test.go @@ -8,6 +8,7 @@ import ( "os" "os/exec" "path/filepath" + "strings" "testing" "time" @@ -349,7 +350,7 @@ func TestSSHKeyAndPasswordForOSBuilder(t *testing.T) { osNode = helpers.GetSingleNodeByRole(t, cs, layeredMCPName) // Re-fetch node with updated configurations foundSSHKey := helpers.ExecCmdOnNode(t, cs, osNode, "cat", "/rootfs/home/core/.ssh/authorized_keys.d/ignition") - if !contains(foundSSHKey, sshKeyContent) { + if !strings.Contains(foundSSHKey, sshKeyContent) { t.Fatalf("updated ssh key not found, got %s", foundSSHKey) } t.Logf("updated ssh hash found, got %s", foundSSHKey) @@ -885,6 +886,8 @@ func runOnClusterLayeringTest(t *testing.T, testOpts onClusterLayeringTestOpts) startedBuild := waitForBuildToStartForPoolAndConfig(t, cs, testOpts.poolName, mosc.Name) t.Logf("MachineOSBuild %q has started", startedBuild.Name) + assertBuildJobIsAsExpected(t, cs, startedBuild) + t.Logf("Waiting for build completion...") // Create a child context for the build pod log streamer. This is so we can @@ -970,6 +973,30 @@ func waitForBuildToStartForPoolAndConfig(t *testing.T, cs *framework.ClientSet, return waitForBuildToStart(t, cs, mosb) } +func assertBuildJobIsAsExpected(t *testing.T, cs *framework.ClientSet, mosb *mcfgv1.MachineOSBuild) { + t.Helper() + + osImageURLConfig, err := ctrlcommon.GetOSImageURLConfig(context.TODO(), cs.GetKubeclient()) + require.NoError(t, err) + + mcoImages, err := ctrlcommon.GetImagesConfig(context.TODO(), cs.GetKubeclient()) + require.NoError(t, err) + + buildPod, err := ocltesthelper.GetPodFromJob(context.TODO(), cs, mosb.Status.Builder.Job.Name) + require.NoError(t, err) + + assertContainerIsUsingExpectedImage := func(c corev1.Container, containerName, expectedImage string) { + if c.Name == containerName { + assert.Equal(t, c.Image, expectedImage) + } + } + + for _, container := range buildPod.Spec.Containers { + assertContainerIsUsingExpectedImage(container, "image-build", mcoImages.MachineConfigOperator) + assertContainerIsUsingExpectedImage(container, "wait-for-done", osImageURLConfig.BaseOSContainerImage) + } +} + // Waits for a MachineOSBuild to start building. func waitForBuildToStart(t *testing.T, cs *framework.ClientSet, build *mcfgv1.MachineOSBuild) *mcfgv1.MachineOSBuild { t.Helper() @@ -1245,7 +1272,7 @@ func waitForMOSCToUpdateCurrentMOSB(ctx context.Context, t *testing.T, cs *frame } currentMOSB = mosc.GetAnnotations()[constants.CurrentMachineOSBuildAnnotationKey] - return currentMOSB != mosbName, nil + return currentMOSB != "" && currentMOSB != mosbName, nil })) return currentMOSB @@ -1315,16 +1342,3 @@ func archiveBuildPodLogs(t *testing.T, podLogsDirPath string) error { return archive.WriteArchive() } -// Helper function to check if a string contains a substring (used in SSH key test) -func contains(s, substr string) bool { - return len(s) >= len(substr) && (s == substr || len(s) > len(substr) && s[:len(substr)] == substr || s[len(s)-len(substr):] == substr || containsMiddle(s, substr)) -} - -func containsMiddle(s, substr string) bool { - for i := 0; i <= len(s)-len(substr); i++ { - if s[i:i+len(substr)] == substr { - return true - } - } - return false -}