From f37a63b0d4fc7dfb0590c2c4fa162c2a18f67e10 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Fri, 15 May 2026 08:13:57 -0400 Subject: [PATCH 01/11] test(e2ev2): register env vars for lifecycle tests Register 6 new environment variables for lifecycle tests: release images (latest, previous, N-1, N-2), Azure DES ID, and Azure credentials file path. --- test/e2e/v2/internal/env_vars.go | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/test/e2e/v2/internal/env_vars.go b/test/e2e/v2/internal/env_vars.go index 5e28d5dedd7..f5cbd4b7cba 100644 --- a/test/e2e/v2/internal/env_vars.go +++ b/test/e2e/v2/internal/env_vars.go @@ -171,4 +171,35 @@ func init() { "Comma-separated list of Azure subscription IDs permitted to create Private Endpoints.", false, ) + // Release image env vars for lifecycle tests + RegisterEnvVar( + "E2E_LATEST_RELEASE_IMAGE", + "Latest OCP release image for control plane upgrade tests.", + false, + ) + RegisterEnvVar( + "E2E_PREVIOUS_RELEASE_IMAGE", + "N-1 OCP release image (previous minor) for control plane upgrade tests.", + false, + ) + RegisterEnvVar( + "E2E_N1_RELEASE_IMAGE", + "N-1 minor release image for NodePool previous-release tests.", + false, + ) + RegisterEnvVar( + "E2E_N2_RELEASE_IMAGE", + "N-2 minor release image for NodePool previous-release tests.", + false, + ) + RegisterEnvVar( + "E2E_AZURE_CREDENTIALS_FILE", + "Path to Azure service principal credentials JSON file for platform-specific tests (auto-repair, disk encryption).", + false, + ) + RegisterEnvVar( + "E2E_AZURE_DISK_ENCRYPTION_SET_ID", + "Azure DiskEncryptionSet resource ID for disk encryption NodePool tests.", + false, + ) } From f692b8aaf5a819a1756a7ba5e28d674a450f4d66 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Fri, 15 May 2026 08:18:07 -0400 Subject: [PATCH 02/11] test(e2ev2): add nodepool autoscaling tests Port autoscaling scale-up/down and multi-NodePool balancing tests from v1 to v2 Ginkgo framework. 
Label: nodepool-autoscaling
---
 .../e2e/v2/tests/nodepool_autoscaling_test.go | 314 ++++++++++++++++++
 1 file changed, 314 insertions(+)
 create mode 100644 test/e2e/v2/tests/nodepool_autoscaling_test.go

diff --git a/test/e2e/v2/tests/nodepool_autoscaling_test.go b/test/e2e/v2/tests/nodepool_autoscaling_test.go
new file mode 100644
index 00000000000..b3bd48987f9
--- /dev/null
+++ b/test/e2e/v2/tests/nodepool_autoscaling_test.go
@@ -0,0 +1,314 @@
+//go:build e2ev2
+
+/*
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package tests
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1"
+	e2eutil "github.com/openshift/hypershift/test/e2e/util"
+	"github.com/openshift/hypershift/test/e2e/v2/internal"
+
+	batchv1 "k8s.io/api/batch/v1"
+	corev1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/resource"
+	"k8s.io/apimachinery/pkg/labels"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/utils/ptr"
+	crclient "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// AutoscalingScaleUpDownTest registers a spec that creates an autoscaling
+// NodePool (min=1, max=3), schedules a memory-heavy Job to force a scale-up to
+// three nodes, then deletes the Job and waits for the pool to return to one node.
+func AutoscalingScaleUpDownTest(getTestCtx internal.TestContextGetter) {
+	It("should scale up when workload increases and scale down when workload decreases", func() {
+		testCtx := getTestCtx()
+		Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+
+		hc := testCtx.GetHostedCluster()
+		Expect(hc).NotTo(BeNil(), "hosted cluster should be available")
+
+		guestClient := testCtx.GetHostedClusterClient()
+		Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		ctx := testCtx.Context
+
+		// Find the default NodePool to copy platform config
+		defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc)
+		Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist")
+
+		// Create autoscaling NodePool with min=1, max=3
+		autoscalingNP := buildAutoscalingNodePool(defaultNP, 1, 3)
+		err := testCtx.MgmtClient.Create(ctx, autoscalingNP)
+		Expect(err).NotTo(HaveOccurred(), "failed to create autoscaling NodePool")
+		GinkgoWriter.Printf("Created autoscaling NodePool %s with min=1, max=3\n", autoscalingNP.Name)
+
+		// Ensure cleanup
+		defer cleanupNodePool(ctx, testCtx.MgmtClient, autoscalingNP)
+
+		npLabelSelector := e2eutil.WithClientOptions(crclient.MatchingLabelsSelector{
+			Selector: labels.SelectorFromSet(labels.Set{hyperv1.NodePoolLabel: autoscalingNP.Name}),
+		})
+
+		// Wait for NodePool to be ready with 1 node (min replicas)
+		nodes := e2eutil.WaitForNReadyNodesWithOptions(GinkgoTB(), ctx, guestClient, 1, hc.Spec.Platform.Type, fmt.Sprintf("for NodePool %s", autoscalingNP.Name), npLabelSelector)
+		Expect(nodes).To(HaveLen(1), "should have exactly 1 node initially")
+
+		// Get node capacity for workload sizing
+		memCapacity := nodes[0].Status.Allocatable[corev1.ResourceMemory]
+		bytes, ok := memCapacity.AsInt64()
+		Expect(ok).To(BeTrue(), "memory capacity should be convertible to int64")
+
+		// Create workload that requires 3 nodes (50% memory per pod, 3 pods)
+		workloadMemRequest := *resource.NewQuantity(bytes/2, resource.BinarySI)
+		workload := newAutoscalingWorkload(3, workloadMemRequest)
+		err = guestClient.Create(ctx, workload)
+		Expect(err).NotTo(HaveOccurred(), "failed to create workload")
+
+		// Safety net for early failures; cleanupWorkload tolerates NotFound, so
+		// running it again after the explicit delete below is harmless.
+		defer cleanupWorkload(ctx, guestClient, workload)
+
+		// Wait for scale-up to 3 nodes
+		e2eutil.WaitForNReadyNodesWithOptions(GinkgoTB(), ctx, guestClient, 3, hc.Spec.Platform.Type, fmt.Sprintf("for NodePool %s", autoscalingNP.Name), npLabelSelector)
+
+		// Delete workload to trigger scale-down
+		cleanupWorkload(ctx, guestClient, workload)
+
+		// Wait for scale-down to 1 node (min replicas)
+		e2eutil.WaitForNReadyNodesWithOptions(GinkgoTB(), ctx, guestClient, 1, hc.Spec.Platform.Type, fmt.Sprintf("for NodePool %s", autoscalingNP.Name), npLabelSelector)
+	})
+}
+
+// AutoscalingBalancingTest registers a spec, gated by
+// e2eutil.GinkgoAtLeast(e2eutil.Version420), that creates two autoscaling
+// NodePools, schedules a Job sized to need four nodes, and waits until the
+// NodePools report at least four replicas in total, split evenly between them.
+func AutoscalingBalancingTest(getTestCtx internal.TestContextGetter) {
+	It("should balance pods across multiple autoscaling NodePools", func() {
+		testCtx := getTestCtx()
+		Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+
+		e2eutil.GinkgoAtLeast(e2eutil.Version420)
+
+		hc := testCtx.GetHostedCluster()
+		Expect(hc).NotTo(BeNil(), "hosted cluster should be available")
+
+		guestClient := testCtx.GetHostedClusterClient()
+		Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		ctx := testCtx.Context
+
+		// Find the default NodePool to copy platform config
+		defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc)
+		Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist")
+
+		// Create two autoscaling NodePools
+		autoscalingNP1 := buildAutoscalingNodePool(defaultNP, 1, 3)
+		err := testCtx.MgmtClient.Create(ctx, autoscalingNP1)
+		Expect(err).NotTo(HaveOccurred(), "failed to create first autoscaling NodePool")
+		defer cleanupNodePool(ctx, testCtx.MgmtClient, autoscalingNP1)
+
+		autoscalingNP2 := buildAutoscalingNodePool(defaultNP, 1, 3)
+		err = testCtx.MgmtClient.Create(ctx, autoscalingNP2)
+		Expect(err).NotTo(HaveOccurred(), "failed to create second autoscaling NodePool")
+		defer cleanupNodePool(ctx, testCtx.MgmtClient, autoscalingNP2)
+
+		np1LabelSelector := e2eutil.WithClientOptions(crclient.MatchingLabelsSelector{
+			Selector: labels.SelectorFromSet(labels.Set{hyperv1.NodePoolLabel: autoscalingNP1.Name}),
+		})
+		np2LabelSelector := e2eutil.WithClientOptions(crclient.MatchingLabelsSelector{
+			Selector: labels.SelectorFromSet(labels.Set{hyperv1.NodePoolLabel: autoscalingNP2.Name}),
+		})
+
+		// Wait for initial nodes (1 per NodePool at min replicas)
+		nodes := e2eutil.WaitForNReadyNodesWithOptions(GinkgoTB(), ctx, guestClient, 1, hc.Spec.Platform.Type, "for NP1", np1LabelSelector)
+		e2eutil.WaitForNReadyNodesWithOptions(GinkgoTB(), ctx, guestClient, 1, hc.Spec.Platform.Type, "for NP2", np2LabelSelector)
+
+		// Get node capacity for workload sizing
+		memCapacity := nodes[0].Status.Allocatable[corev1.ResourceMemory]
+		bytes, ok := memCapacity.AsInt64()
+		Expect(ok).To(BeTrue(), "memory capacity should be convertible to int64")
+
+		// Create workload that requires 4 nodes (50% memory per pod, 4 pods)
+		workloadMemRequest := *resource.NewQuantity(bytes/2, resource.BinarySI)
+		workload := newAutoscalingWorkload(4, workloadMemRequest)
+		err = guestClient.Create(ctx, workload)
+		Expect(err).NotTo(HaveOccurred(), "failed to create workload")
+		defer cleanupWorkload(ctx, guestClient, workload)
+
+		// Wait for total 4 nodes across both NPs, then verify balanced distribution
+		Eventually(func() (bool, error) {
+			if err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(autoscalingNP1), autoscalingNP1); err != nil {
+				return false, err
+			}
+			if err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(autoscalingNP2), autoscalingNP2); err != nil {
+				return false, err
+			}
+
+			np1Replicas := autoscalingNP1.Status.Replicas
+			np2Replicas := autoscalingNP2.Status.Replicas
+			if np1Replicas+np2Replicas < 4 {
+				return false, nil
+			}
+
+			// Both pools start at min=1, so "each pool has >= 1 replica" is always
+			// true and proves nothing about balancing. Require the pools to differ
+			// by at most one replica instead (e.g. 2+2 passes, 3+1 does not).
+			// NOTE(review): assumes the autoscaler is expected to split the
+			// scale-up evenly across the two pools - confirm against the v1 test.
+			diff := np1Replicas - np2Replicas
+			if diff < 0 {
+				diff = -diff
+			}
+			return diff <= 1, nil
+		}).WithTimeout(30 * time.Minute).
+			WithPolling(30 * time.Second).
+			Should(BeTrue(), "NodePools should have balanced distribution")
+	})
+}
+
+// Helper functions
+
+// getDefaultNodePool lists the NodePools in the HostedCluster's namespace and
+// returns the first one whose Spec.ClusterName equals hc.Name, to be used as a
+// template for platform config. It returns nil when no NodePool matches.
+func getDefaultNodePool(ctx context.Context, client crclient.Client, hc *hyperv1.HostedCluster) *hyperv1.NodePool {
+	GinkgoHelper()
+
+	npList := &hyperv1.NodePoolList{}
+	err := client.List(ctx, npList, crclient.InNamespace(hc.Namespace))
+	Expect(err).NotTo(HaveOccurred(), "failed to list NodePools")
+	Expect(npList.Items).NotTo(BeEmpty(), "should have at least one NodePool")
+
+	// Find a NodePool for this HostedCluster
+	for i := range npList.Items {
+		if npList.Items[i].Spec.ClusterName == hc.Name {
+			return &npList.Items[i]
+		}
+	}
+
+	return nil
+}
+
+// buildAutoscalingNodePool returns a new, not-yet-created NodePool in the
+// template's namespace whose spec is a deep copy of the template's, with fixed
+// replicas cleared and autoscaling bounds set to [min, max].
+func buildAutoscalingNodePool(template *hyperv1.NodePool, min, max int32) *hyperv1.NodePool {
+	GinkgoHelper()
+
+	name := e2eutil.SimpleNameGenerator.GenerateName(template.Spec.ClusterName + "-auto-")
+	np := &hyperv1.NodePool{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      name,
+			Namespace: template.Namespace,
+		},
+	}
+
+	// Deep copy the spec from template
+	template.Spec.DeepCopyInto(&np.Spec)
+
+	// Configure autoscaling
+	np.Spec.Replicas = nil // Must be nil when using autoscaling
+	np.Spec.AutoScaling = &hyperv1.NodePoolAutoScaling{
+		Min: ptr.To(min),
+		Max: max,
+	}
+
+	return np
+}
+
+// newAutoscalingWorkload builds (but does not create) a Job in the "default"
+// namespace that runs njobs parallel "sleep 86400" pods, each requesting
+// memoryRequest of memory and 500m CPU.
+func newAutoscalingWorkload(njobs int32, memoryRequest resource.Quantity) *batchv1.Job {
+	GinkgoHelper()
+
+	name := e2eutil.SimpleNameGenerator.GenerateName("autoscaling-workload-")
+	job := &batchv1.Job{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      name,
+			Namespace: "default",
+		},
+		Spec: batchv1.JobSpec{
+			Template: corev1.PodTemplateSpec{
+				Spec: corev1.PodSpec{
+					Containers: []corev1.Container{
+						{
+							Name:  "workload",
+							Image: "registry.access.redhat.com/ubi9/ubi-minimal:latest",
+							Command: []string{
+								"sleep",
+								"86400", // 1 day
+							},
+							Resources: corev1.ResourceRequirements{
+								// Typed keys, matching the corev1.ResourceMemory
+								// lookups used by the specs in this file.
+								Requests: corev1.ResourceList{
+									corev1.ResourceMemory: memoryRequest,
+									corev1.ResourceCPU:    resource.MustParse("500m"),
+								},
+							},
+							SecurityContext: &corev1.SecurityContext{
+								AllowPrivilegeEscalation: ptr.To(false),
+								Capabilities: &corev1.Capabilities{
+									Drop: []corev1.Capability{"ALL"},
+								},
+								RunAsNonRoot: ptr.To(false),
+								RunAsUser:    ptr.To(int64(0)),
+								SeccompProfile: &corev1.SeccompProfile{
+									Type: corev1.SeccompProfileTypeRuntimeDefault,
+								},
+							},
+						},
+					},
+					RestartPolicy: corev1.RestartPolicyNever,
+				},
+			},
+			BackoffLimit: ptr.To[int32](4),
+			Completions:  ptr.To(njobs),
+			Parallelism:  ptr.To(njobs),
+		},
+	}
+
+	return job
+}
+
+// cleanupNodePool deletes a NodePool on a best-effort basis: NotFound is
+// ignored silently and any other error is logged as a warning, not asserted.
+func cleanupNodePool(ctx context.Context, client crclient.Client, np *hyperv1.NodePool) {
+	GinkgoHelper()
+
+	err := client.Delete(ctx, np)
+	switch {
+	case err == nil:
+		GinkgoWriter.Printf("Deleted NodePool %s\n", np.Name)
+	case !apierrors.IsNotFound(err):
+		GinkgoWriter.Printf("Warning: failed to delete NodePool %s: %v\n", np.Name, err)
+	}
+}
+
+// cleanupWorkload deletes a Job workload with foreground propagation on a
+// best-effort basis: NotFound is ignored silently and any other error is
+// logged as a warning, not asserted.
+func cleanupWorkload(ctx context.Context, client crclient.Client, job *batchv1.Job) {
+	GinkgoHelper()
+
+	cascadeDelete := metav1.DeletePropagationForeground
+	err := client.Delete(ctx, job, &crclient.DeleteOptions{
+		PropagationPolicy: &cascadeDelete,
+	})
+	switch {
+	case err == nil:
+		GinkgoWriter.Printf("Deleted workload %s\n", job.Name)
+	case !apierrors.IsNotFound(err):
+		GinkgoWriter.Printf("Warning: failed to delete workload %s: %v\n", job.Name, err)
+	}
+}
+
+// RegisterNodePoolAutoscalingTests registers all autoscaling test cases
+func RegisterNodePoolAutoscalingTests(getTestCtx internal.TestContextGetter) {
+	AutoscalingScaleUpDownTest(getTestCtx)
+	AutoscalingBalancingTest(getTestCtx)
+}
+
+var _ = Describe("NodePool Autoscaling", Label("nodepool-autoscaling"), func() {
+	var testCtx *internal.TestContext
+
+	BeforeEach(func() {
+		testCtx = internal.GetTestContext()
+		Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+	})
+
+	RegisterNodePoolAutoscalingTests(func() *internal.TestContext { return testCtx })
+})

From 93cadedc4db5371314306245a32558fffbaa9365 Mon Sep 17 00:00:00 2001
From: Bryan Cox
Date: Fri, 15 May 2026 08:27:38 -0400
Subject: [PATCH 03/11] test(e2ev2): add etcd chaos tests

Port TestHAEtcdChaos from v1 to v2 Ginkgo framework. Uses Ordered decorator so
tests run sequentially (etcd needs recovery between tests). Assumes HA control
plane.
Label: etcd-chaos Co-Authored-By: Claude Opus 4.6 --- test/e2e/v2/tests/etcd_chaos_test.go | 440 +++++++++++++++++++++++++++ 1 file changed, 440 insertions(+) create mode 100644 test/e2e/v2/tests/etcd_chaos_test.go diff --git a/test/e2e/v2/tests/etcd_chaos_test.go b/test/e2e/v2/tests/etcd_chaos_test.go new file mode 100644 index 00000000000..2ab2f652da6 --- /dev/null +++ b/test/e2e/v2/tests/etcd_chaos_test.go @@ -0,0 +1,440 @@ +//go:build e2ev2 + +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package tests + +import ( + "context" + "fmt" + "math/rand" + "strings" + "sync" + "time" + + "github.com/google/go-cmp/cmp" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + cpomanifests "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/manifests" + etcdrecoverymanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/etcdrecovery" + e2eutil "github.com/openshift/hypershift/test/e2e/util" + "github.com/openshift/hypershift/test/e2e/v2/internal" + + appsv1 "k8s.io/api/apps/v1" + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/utils/ptr" + crclient "sigs.k8s.io/controller-runtime/pkg/client" +) + +// RegisterEtcdChaosTests registers all etcd chaos test cases. 
+func RegisterEtcdChaosTests(getTestCtx internal.TestContextGetter) {
+	EtcdSingleMemberRecoveryTest(getTestCtx)
+	EtcdKillRandomMembersTest(getTestCtx)
+	EtcdKillAllMembersTest(getTestCtx)
+	EtcdSingleMemberCorruptionTest(getTestCtx)
+	EtcdMissingMemberRecoveryTest(getTestCtx)
+}
+
+// The container is Ordered, so the specs registered below run sequentially in
+// registration order; the shared test context is fetched once in BeforeAll.
+var _ = Describe("Etcd Chaos", Label("etcd-chaos"), Ordered, func() {
+	var testCtx *internal.TestContext
+
+	BeforeAll(func() {
+		testCtx = internal.GetTestContext()
+		Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+	})
+
+	RegisterEtcdChaosTests(func() *internal.TestContext { return testCtx })
+})
+
+// EtcdSingleMemberRecoveryTest deletes one random etcd pod and its PVC simultaneously,
+// then verifies the pod is replaced (different UID) and the StatefulSet converges.
+func EtcdSingleMemberRecoveryTest(getTestCtx internal.TestContextGetter) {
+	It("should recover after a single member loses its data", func() {
+		testCtx := getTestCtx()
+		ctx := testCtx.Context
+		cpNamespace := testCtx.ControlPlaneNamespace
+
+		etcdSts, etcdPods := getEtcdStsAndPods(ctx, testCtx.MgmtClient, cpNamespace)
+
+		randomPod := randomEtcdPods(etcdPods.Items, 1)[0]
+		originalUID := randomPod.UID
+		// Pod "etcd-N" maps to PVC "data-etcd-N".
+		pvcName := "data-etcd" + strings.TrimPrefix(randomPod.Name, "etcd")
+		pvc := &corev1.PersistentVolumeClaim{
+			ObjectMeta: metav1.ObjectMeta{Name: pvcName, Namespace: cpNamespace},
+		}
+
+		GinkgoWriter.Printf("Deleting etcd pod %s and PVC %s\n", randomPod.Name, pvcName)
+
+		// Issue both deletes concurrently. GinkgoRecover is deferred first so it
+		// runs last and catches a failed Expect inside the goroutine.
+		var wg sync.WaitGroup
+		wg.Add(2)
+		go func() {
+			defer GinkgoRecover()
+			defer wg.Done()
+			Expect(testCtx.MgmtClient.Delete(ctx, &randomPod)).To(Succeed(), "failed to delete etcd pod %s", randomPod.Name)
+			GinkgoWriter.Printf("Deleted etcd pod %s\n", randomPod.Name)
+		}()
+		go func() {
+			defer GinkgoRecover()
+			defer wg.Done()
+			Expect(testCtx.MgmtClient.Delete(ctx, pvc)).To(Succeed(), "failed to delete etcd PVC %s", pvcName)
+			GinkgoWriter.Printf("Deleted etcd PVC %s\n", pvcName)
+		}()
+		wg.Wait()
+
+		// Verify pod is replaced with a new UID
+		e2eutil.EventuallyObject(GinkgoTB(), ctx, "deleted etcd pod is replaced",
+			func(ctx context.Context) (*corev1.Pod, error) {
+				pod := &corev1.Pod{}
+				err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(&randomPod), pod)
+				return pod, err
+			},
+			[]e2eutil.Predicate[*corev1.Pod]{func(pod *corev1.Pod) (bool, string, error) {
+				return originalUID != pod.UID, fmt.Sprintf("pod UID %s", pod.UID), nil
+			}},
+			e2eutil.WithInterval(5*time.Second),
+			e2eutil.WithTimeout(30*time.Minute),
+		)
+
+		waitForEtcdConvergence(ctx, testCtx.MgmtClient, cpNamespace, ptr.Deref(etcdSts.Spec.Replicas, 0))
+	})
+}
+
+// EtcdKillRandomMembersTest creates a marker ConfigMap in the hosted cluster,
+// deletes random etcd pods every 5 seconds for 30 seconds, then verifies
+// StatefulSet convergence and that the marker data survived.
+func EtcdKillRandomMembersTest(getTestCtx internal.TestContextGetter) {
+	It("should preserve data when random members are repeatedly killed", func() {
+		testCtx := getTestCtx()
+		ctx := testCtx.Context
+		cpNamespace := testCtx.ControlPlaneNamespace
+
+		guestClient := testCtx.GetHostedClusterClient()
+		Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		// Create marker data that should survive the chaos
+		markerCM := createMarkerConfigMap(ctx, guestClient)
+		DeferCleanup(func() {
+			if err := guestClient.Delete(ctx, markerCM); err != nil && !apierrors.IsNotFound(err) {
+				GinkgoWriter.Printf("Warning: failed to cleanup marker ConfigMap: %v\n", err)
+			}
+		})
+
+		etcdSts, etcdPods := getEtcdStsAndPods(ctx, testCtx.MgmtClient, cpNamespace)
+
+		// Delete random etcd pods every 5s for 30s
+		duration, period := 30*time.Second, 5*time.Second
+		GinkgoWriter.Printf("Deleting random etcd pods every %s for %s\n", period, duration)
+		deletionCount := 0
+		deadline := time.Now().Add(duration)
+		for time.Now().Before(deadline) && ctx.Err() == nil {
+			pod := randomEtcdPods(etcdPods.Items, 1)[0]
+			err := testCtx.MgmtClient.Delete(ctx, &pod, &crclient.DeleteOptions{
+				GracePeriodSeconds: ptr.To[int64](0),
+			})
+			if err != nil {
+				GinkgoWriter.Printf("Warning: failed to delete pod %s: %v\n", pod.Name, err)
+			} else {
+				GinkgoWriter.Printf("Deleted pod %s\n", pod.Name)
+				deletionCount++
+			}
+
+			// Wait one period, but wake up immediately if the test context is
+			// cancelled instead of sleeping through the remaining chaos window.
+			select {
+			case <-ctx.Done():
+			case <-time.After(period):
+			}
+		}
+		Expect(ctx.Err()).NotTo(HaveOccurred(), "test context was cancelled while deleting etcd pods")
+		Expect(deletionCount).To(BeNumerically(">", 0), "at least one pod deletion should have succeeded")
+
+		waitForEtcdConvergence(ctx, testCtx.MgmtClient, cpNamespace, ptr.Deref(etcdSts.Spec.Replicas, 0))
+
+		verifyMarkerSurvived(ctx, guestClient, markerCM)
+	})
+}
+
+// EtcdKillAllMembersTest creates a marker ConfigMap, deletes ALL etcd pods simultaneously
+// via goroutines, then verifies convergence and marker survival.
+func EtcdKillAllMembersTest(getTestCtx internal.TestContextGetter) {
+	It("should preserve data when all members are killed simultaneously", func() {
+		testCtx := getTestCtx()
+		ctx := testCtx.Context
+		cpNamespace := testCtx.ControlPlaneNamespace
+
+		guestClient := testCtx.GetHostedClusterClient()
+		Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		// Create marker data that should survive the chaos
+		markerCM := createMarkerConfigMap(ctx, guestClient)
+		DeferCleanup(func() {
+			if err := guestClient.Delete(ctx, markerCM); err != nil && !apierrors.IsNotFound(err) {
+				GinkgoWriter.Printf("Warning: failed to cleanup marker ConfigMap: %v\n", err)
+			}
+		})
+
+		etcdSts, etcdPods := getEtcdStsAndPods(ctx, testCtx.MgmtClient, cpNamespace)
+
+		// Delete all etcd pods simultaneously
+		GinkgoWriter.Printf("Deleting all %d etcd pods simultaneously\n", len(etcdPods.Items))
+		var wg sync.WaitGroup
+		wg.Add(len(etcdPods.Items))
+		for i := range etcdPods.Items {
+			go func(pod *corev1.Pod) {
+				defer GinkgoRecover()
+				defer wg.Done()
+				// Bound each delete so one slow API call cannot stall the spec.
+				deleteCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
+				defer cancel()
+				err := testCtx.MgmtClient.Delete(deleteCtx, pod, &crclient.DeleteOptions{
+					GracePeriodSeconds: ptr.To[int64](0),
+				})
+				if err != nil {
+					GinkgoWriter.Printf("Warning: failed to delete pod %s: %v\n", pod.Name, err)
+				} else {
+					GinkgoWriter.Printf("Deleted pod %s\n", pod.Name)
+				}
+			}(&etcdPods.Items[i])
+		}
+		wg.Wait()
+
+		// Verify all etcd pods are replaced with new UIDs
+		// NOTE(review): no collection-level predicate is passed (nil), so it is
+		// worth confirming how EventuallyObjects treats an empty pod list.
+		e2eutil.EventuallyObjects(GinkgoTB(), ctx, "etcd pods to be replaced",
+			func(ctx context.Context) ([]*corev1.Pod, error) {
+				pods := &corev1.PodList{}
+				err := testCtx.MgmtClient.List(ctx, pods, &crclient.ListOptions{
+					Namespace:     cpNamespace,
+					LabelSelector: labels.Set(etcdSts.Spec.Selector.MatchLabels).AsSelector(),
+				})
+				items := make([]*corev1.Pod, len(pods.Items))
+				for i := range pods.Items {
+					items[i] = &pods.Items[i]
+				}
+				return items, err
+			},
+			nil,
+			[]e2eutil.Predicate[*corev1.Pod]{func(pod *corev1.Pod) (bool, string, error) {
+				// Match each current pod to its pre-chaos namesake and compare UIDs.
+				for _, previousPod := range etcdPods.Items {
+					if previousPod.Namespace == pod.Namespace && previousPod.Name == pod.Name {
+						return previousPod.UID != pod.UID, fmt.Sprintf("pod UID %s", pod.UID), nil
+					}
+				}
+				return false, "pod not found in previous list", nil
+			}},
+			e2eutil.WithInterval(5*time.Second),
+			e2eutil.WithTimeout(30*time.Minute),
+		)
+
+		waitForEtcdConvergence(ctx, testCtx.MgmtClient, cpNamespace, ptr.Deref(etcdSts.Spec.Replicas, 0))
+
+		verifyMarkerSurvived(ctx, guestClient, markerCM)
+	})
+}
+
+// EtcdSingleMemberCorruptionTest corrupts a random member's WAL file using
+// RunCommandInPod, deletes the pod, verifies the etcd recovery job becomes active,
+// and waits for StatefulSet convergence.
+func EtcdSingleMemberCorruptionTest(getTestCtx internal.TestContextGetter) {
+	It("should recover after a single member's WAL is corrupted", func() {
+		testCtx := getTestCtx()
+		ctx := testCtx.Context
+		cpNamespace := testCtx.ControlPlaneNamespace
+
+		etcdSts, etcdPods := getEtcdStsAndPods(ctx, testCtx.MgmtClient, cpNamespace)
+
+		pod := randomEtcdPods(etcdPods.Items, 1)[0]
+		// Remove one randomly chosen *.wal file from the member's WAL directory.
+		command := `find /var/lib/data/member/wal -type f -name "*.wal" -print0 | shuf -z -n1 | xargs -0 rm`
+
+		// NOTE(review): RunCommandInPod is given "etcd" and the namespace but never
+		// pod.Name - confirm it execs into the same pod that is deleted below.
+		GinkgoWriter.Printf("Deleting WAL file from etcd pod: %s\n", pod.Name)
+		cmdStdout, err := e2eutil.RunCommandInPod(ctx, testCtx.MgmtClient, "etcd", pod.Namespace, []string{"/bin/sh", "-c", command}, "etcd", 5*time.Minute)
+		Expect(err).NotTo(HaveOccurred(), "failed to delete WAL file from etcd pod %s", pod.Name)
+		Expect(cmdStdout).NotTo(ContainSubstring("No such file or directory"), "failed to delete WAL file from etcd pod %s", pod.Name)
+
+		GinkgoWriter.Printf("Deleting pod: %s\n", pod.Name)
+		Expect(testCtx.MgmtClient.Delete(ctx, &pod)).To(Succeed(), "failed to delete pod %s", pod.Name)
+
+		// Etcd recovery job should be created.
+		waitForEtcdRecoveryJobActive(ctx, testCtx.MgmtClient, cpNamespace)
+
+		waitForEtcdConvergence(ctx, testCtx.MgmtClient, cpNamespace, ptr.Deref(etcdSts.Spec.Replicas, 0))
+	})
+}
+
+// EtcdMissingMemberRecoveryTest removes a member from the etcd cluster via
+// etcdctl member remove, deletes the pod, verifies the recovery job,
+// and waits for StatefulSet convergence.
+func EtcdMissingMemberRecoveryTest(getTestCtx internal.TestContextGetter) {
+	It("should recover after a member is removed from the etcd cluster", func() {
+		testCtx := getTestCtx()
+		ctx := testCtx.Context
+		cpNamespace := testCtx.ControlPlaneNamespace
+
+		etcdSts, etcdPods := getEtcdStsAndPods(ctx, testCtx.MgmtClient, cpNamespace)
+
+		pod := randomEtcdPods(etcdPods.Items, 1)[0]
+		ep := fmt.Sprintf("https://etcd-client.%s.svc:2379", cpNamespace)
+
+		// Step 1: Discover the member ID
+		// The pipeline keeps the "member list" row that mentions the pod name and
+		// prints its first column with the trailing comma stripped.
+		discoverCommand := []string{
+			"/bin/sh", "-c",
+			fmt.Sprintf("/usr/bin/etcdctl --cacert=/etc/etcd/tls/etcd-ca/ca.crt --cert=/etc/etcd/tls/server/server.crt --key=/etc/etcd/tls/server/server.key --endpoints=%s member list | grep %s | awk '{print $1}' | tr -d ,", ep, pod.Name),
+		}
+
+		GinkgoWriter.Printf("Discovering member ID for: %s\n", pod.Name)
+		memberID, err := e2eutil.RunCommandInPod(ctx, testCtx.MgmtClient, "etcd", pod.Namespace, discoverCommand, "etcd", 5*time.Minute)
+		Expect(err).NotTo(HaveOccurred(), "failed to discover etcd member ID for %s", pod.Name)
+		memberID = strings.TrimSpace(memberID)
+		Expect(memberID).NotTo(BeEmpty(), "member ID should not be empty for %s", pod.Name)
+
+		// Step 2: Remove the member
+		removeCommand := []string{
+			"/usr/bin/etcdctl",
+			"--cacert=/etc/etcd/tls/etcd-ca/ca.crt",
+			"--cert=/etc/etcd/tls/server/server.crt",
+			"--key=/etc/etcd/tls/server/server.key",
+			fmt.Sprintf("--endpoints=%s", ep),
+			"member", "remove", memberID,
+		}
+
+		GinkgoWriter.Printf("Removing etcd member %s (ID: %s)\n", pod.Name, memberID)
+		cmdStdout, err := e2eutil.RunCommandInPod(ctx, testCtx.MgmtClient, "etcd", pod.Namespace, removeCommand, "etcd", 5*time.Minute)
+		Expect(err).NotTo(HaveOccurred(), "failed to remove etcd member %s", pod.Name)
+		Expect(cmdStdout).NotTo(ContainSubstring("Error:"), "failed to remove etcd member %s", pod.Name)
+
+		GinkgoWriter.Printf("Deleting pod: %s\n", pod.Name)
+		Expect(testCtx.MgmtClient.Delete(ctx, &pod)).To(Succeed(), "failed to delete pod %s", pod.Name)
+
+		// Etcd recovery job should be created.
+		waitForEtcdRecoveryJobActive(ctx, testCtx.MgmtClient, cpNamespace)
+
+		waitForEtcdConvergence(ctx, testCtx.MgmtClient, cpNamespace, ptr.Deref(etcdSts.Spec.Replicas, 0))
+	})
+}
+
+// waitForEtcdRecoveryJobActive polls the etcd recovery Job in cpNamespace until
+// it reports exactly one active pod. Completion is deliberately not awaited
+// because the Job is deleted after it completes.
+func waitForEtcdRecoveryJobActive(ctx context.Context, client crclient.Client, cpNamespace string) {
+	GinkgoHelper()
+
+	e2eutil.EventuallyObject(GinkgoTB(), ctx, "etcd recovery job to be active",
+		func(ctx context.Context) (*batchv1.Job, error) {
+			recoveryJob := etcdrecoverymanifests.EtcdRecoveryJob(cpNamespace)
+			err := client.Get(ctx, crclient.ObjectKeyFromObject(recoveryJob), recoveryJob)
+			return recoveryJob, err
+		},
+		[]e2eutil.Predicate[*batchv1.Job]{func(job *batchv1.Job) (bool, string, error) {
+			const want int32 = 1
+			got := job.Status.Active
+			return got == want, fmt.Sprintf("wanted status active to be %d, got %d", want, got), nil
+		}},
+		e2eutil.WithInterval(5*time.Second),
+		e2eutil.WithTimeout(10*time.Minute),
+	)
+}
+
+// getEtcdStsAndPods fetches the etcd StatefulSet and its pods from the control plane namespace.
+func getEtcdStsAndPods(ctx context.Context, client crclient.Client, cpNamespace string) (*appsv1.StatefulSet, *corev1.PodList) {
+	GinkgoHelper()
+
+	// Start from the manifest stub so the lookup uses the canonical name/namespace.
+	sts := cpomanifests.EtcdStatefulSet(cpNamespace)
+	stsKey := crclient.ObjectKeyFromObject(sts)
+	Expect(client.Get(ctx, stsKey, sts)).To(Succeed(), "failed to get etcd StatefulSet")
+
+	// Select the pods through the StatefulSet's own label selector.
+	listOpts := &crclient.ListOptions{
+		Namespace:     cpNamespace,
+		LabelSelector: labels.Set(sts.Spec.Selector.MatchLabels).AsSelector(),
+	}
+	pods := &corev1.PodList{}
+	Expect(client.List(ctx, pods, listOpts)).To(Succeed(), "failed to list etcd pods")
+	Expect(pods.Items).NotTo(BeEmpty(), "no etcd pods found")
+	GinkgoWriter.Printf("Found %d etcd pods\n", len(pods.Items))
+
+	return sts, pods
+}
+
+// waitForEtcdConvergence blocks until the etcd StatefulSet in cpNamespace reports
+// ReadyReplicas equal to expectedReplicas. An expectedReplicas of 0 never
+// satisfies the predicate, so the wait then runs to its 30-minute timeout.
+func waitForEtcdConvergence(ctx context.Context, client crclient.Client, cpNamespace string, expectedReplicas int32) {
+	GinkgoHelper()
+
+	// fetchSts re-reads the StatefulSet on every poll.
+	fetchSts := func(ctx context.Context) (*appsv1.StatefulSet, error) {
+		current := cpomanifests.EtcdStatefulSet(cpNamespace)
+		err := client.Get(ctx, crclient.ObjectKeyFromObject(current), current)
+		return current, err
+	}
+
+	// allReady reports whether every expected replica is ready.
+	allReady := func(sts *appsv1.StatefulSet) (bool, string, error) {
+		ready := sts.Status.ReadyReplicas
+		reason := fmt.Sprintf("wanted %d ready replicas, got %d", expectedReplicas, ready)
+		return expectedReplicas != 0 && ready == expectedReplicas, reason, nil
+	}
+
+	e2eutil.EventuallyObject(GinkgoTB(), ctx, "etcd StatefulSet replicas to converge",
+		fetchSts,
+		[]e2eutil.Predicate[*appsv1.StatefulSet]{allReady},
+		e2eutil.WithInterval(5*time.Second),
+		e2eutil.WithTimeout(30*time.Minute),
+	)
+}
+
+// randomEtcdPods selects count random pods from the provided slice.
+func randomEtcdPods(pods []corev1.Pod, count int) []corev1.Pod {
+	// Clamp count to [0, len(pods)]: a larger count would index past the end of
+	// the permutation and a negative one would make the allocation panic.
+	if count > len(pods) {
+		count = len(pods)
+	}
+	if count < 0 {
+		count = 0
+	}
+
+	indexes := rand.Perm(len(pods))
+	selected := make([]corev1.Pod, count)
+	for i := range selected {
+		selected[i] = pods[indexes[i]]
+	}
+	return selected
+}
+
+// createMarkerConfigMap creates a ConfigMap with timestamp data in the hosted cluster
+// and returns it for later verification.
+func createMarkerConfigMap(ctx context.Context, client crclient.Client) *corev1.ConfigMap {
+	GinkgoHelper()
+
+	value, err := time.Now().MarshalText()
+	Expect(err).NotTo(HaveOccurred(), "failed to marshal timestamp")
+	cm := &corev1.ConfigMap{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: "default",
+			Name:      e2eutil.SimpleNameGenerator.GenerateName("marker-"),
+		},
+		Data: map[string]string{"value": string(value)},
+	}
+	e2eutil.EventuallyObject(GinkgoTB(), ctx, "create marker ConfigMap",
+		func(ctx context.Context) (*corev1.ConfigMap, error) {
+			err := client.Create(ctx, cm)
+			// The name is fixed before the first call, so AlreadyExists on a later
+			// attempt most likely means an earlier Create landed but its response
+			// was lost. Treat that as success instead of failing until timeout.
+			// NOTE(review): assumes EventuallyObject retries on error - confirm.
+			if apierrors.IsAlreadyExists(err) {
+				err = nil
+			}
+			return cm, err
+		}, nil,
+	)
+	GinkgoWriter.Printf("Created marker ConfigMap %s/%s\n", cm.Namespace, cm.Name)
+	return cm
+}
+
+// verifyMarkerSurvived verifies that the marker ConfigMap still has its original data
+// after etcd chaos operations.
+func verifyMarkerSurvived(ctx context.Context, client crclient.Client, expected *corev1.ConfigMap) {
+	GinkgoHelper()
+
+	// fetchMarker re-reads the marker from the hosted cluster on every poll.
+	fetchMarker := func(ctx context.Context) (*corev1.ConfigMap, error) {
+		current := &corev1.ConfigMap{}
+		err := client.Get(ctx, crclient.ObjectKeyFromObject(expected), current)
+		return current, err
+	}
+
+	// dataUnchanged passes only when the stored Data matches what was written.
+	dataUnchanged := func(configMap *corev1.ConfigMap) (bool, string, error) {
+		diff := cmp.Diff(expected.Data, configMap.Data)
+		return diff == "", fmt.Sprintf("incorrect data: %v", diff), nil
+	}
+
+	e2eutil.EventuallyObject(GinkgoTB(), ctx, "verify marker data survived disruption",
+		fetchMarker,
+		[]e2eutil.Predicate[*corev1.ConfigMap]{dataUnchanged},
+		e2eutil.WithInterval(5*time.Second),
+		e2eutil.WithTimeout(30*time.Minute),
+	)
+}

From d337f6d8ae538f777bfb0d4006b3f088a32a61f5 Mon Sep 17 00:00:00 2001
From: Bryan Cox
Date: Fri, 15 May 2026 08:33:19 -0400
Subject: [PATCH 04/11] test(e2ev2): add control plane upgrade test

Port TestUpgradeControlPlane from v1 to v2 Ginkgo framework. Inlines upgrade
wait logic using EventuallyObject/EventuallyObjects (testing.TB) since the v1
wrapper functions require *testing.T which is incompatible with GinkgoTB().

The test updates the hosted cluster release image, then waits for:
1. ControlPlaneComponent resources to report RolloutComplete (4.20+)
2. ControlPlaneVersion status to reach Completed state (4.22+)
3. Data plane (CVO) version history to show CompletedUpdate

Post-upgrade validation checks (EnsureFeatureGateStatus, EnsureNoCrashingPods,
etc.) are deferred until the e2eutil functions are refactored from *testing.T
to testing.TB.
Label: control-plane-upgrade Co-Authored-By: Claude Opus 4.6 --- .../v2/tests/control_plane_upgrade_test.go | 218 ++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100644 test/e2e/v2/tests/control_plane_upgrade_test.go diff --git a/test/e2e/v2/tests/control_plane_upgrade_test.go b/test/e2e/v2/tests/control_plane_upgrade_test.go new file mode 100644 index 00000000000..0c3316d651c --- /dev/null +++ b/test/e2e/v2/tests/control_plane_upgrade_test.go @@ -0,0 +1,218 @@ +//go:build e2ev2 + +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package tests + +import ( + "context" + "fmt" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + configv1 "github.com/openshift/api/config/v1" + hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" + e2eutil "github.com/openshift/hypershift/test/e2e/util" + "github.com/openshift/hypershift/test/e2e/v2/internal" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" + crclient "sigs.k8s.io/controller-runtime/pkg/client" +) + +// ControlPlaneUpgradeTest registers tests for control plane upgrade lifecycle. 
+func ControlPlaneUpgradeTest(getTestCtx internal.TestContextGetter) { + Context("Control Plane Upgrade", func() { + It("should upgrade the control plane to the latest release image", func() { + testCtx := getTestCtx() + ctx := testCtx.Context + hostedCluster := testCtx.GetHostedCluster() + Expect(hostedCluster).NotTo(BeNil(), "hosted cluster should be available") + + latestReleaseImage := internal.GetEnvVarValue("E2E_LATEST_RELEASE_IMAGE") + Expect(latestReleaseImage).NotTo(BeEmpty(), "E2E_LATEST_RELEASE_IMAGE must be set for upgrade tests") + + // Record the starting version from the current version history. + var startingVersion string + if hostedCluster.Status.Version != nil && len(hostedCluster.Status.Version.History) > 0 { + startingVersion = hostedCluster.Status.Version.History[0].Version + } + GinkgoLogr.Info("Starting control plane upgrade", + "startingVersion", startingVersion, + "targetImage", latestReleaseImage, + ) + + // Capture the last completion time before the upgrade so the data plane + // rollout predicate can detect when a *new* history entry completes. + var lastVersionCompletionTime *metav1.Time + if hostedCluster.Status.Version != nil && len(hostedCluster.Status.Version.History) > 0 { + lastVersionCompletionTime = hostedCluster.Status.Version.History[0].CompletionTime + } + + // Update the hosted cluster release image and set ForceUpgradeToAnnotation. + // UpdateObject takes testing.TB so GinkgoTB() works here. + err := e2eutil.UpdateObject(GinkgoTB(), ctx, testCtx.MgmtClient, hostedCluster, func(obj *hyperv1.HostedCluster) { + obj.Spec.Release.Image = latestReleaseImage + if obj.Annotations == nil { + obj.Annotations = make(map[string]string) + } + obj.Annotations[hyperv1.ForceUpgradeToAnnotation] = latestReleaseImage + }) + Expect(err).NotTo(HaveOccurred(), "failed to update hosted cluster release image") + + // Step 1: Wait for ControlPlaneComponent resources to complete rollout (4.20+). 
+ // Inlined from e2eutil.WaitForControlPlaneComponentRollout because the v1 + // wrapper takes *testing.T which is incompatible with GinkgoTB(). + By("Waiting for control plane components to complete rollout") + e2eutil.GinkgoAtLeast(e2eutil.Version420) + e2eutil.EventuallyObjects(GinkgoTB(), ctx, "control plane components to complete rollout", + func(ctx context.Context) ([]*hyperv1.ControlPlaneComponent, error) { + list := &hyperv1.ControlPlaneComponentList{} + err := testCtx.MgmtClient.List(ctx, list, crclient.InNamespace(testCtx.ControlPlaneNamespace)) + items := make([]*hyperv1.ControlPlaneComponent, len(list.Items)) + for i := range list.Items { + items[i] = &list.Items[i] + } + return items, err + }, + []e2eutil.Predicate[[]*hyperv1.ControlPlaneComponent]{ + func(cpComponents []*hyperv1.ControlPlaneComponent) (done bool, reasons string, err error) { + return len(cpComponents) > 10, "expecting more than 10 control plane components", nil + }, + }, + []e2eutil.Predicate[*hyperv1.ControlPlaneComponent]{ + e2eutil.ConditionPredicate[*hyperv1.ControlPlaneComponent](e2eutil.Condition{ + Type: string(hyperv1.ControlPlaneComponentRolloutComplete), + Status: metav1.ConditionTrue, + }), + func(cpComponent *hyperv1.ControlPlaneComponent) (done bool, reasons string, err error) { + if startingVersion != "" && cpComponent.Status.Version == startingVersion { + return false, fmt.Sprintf("component %s is still on version %s", cpComponent.Name, cpComponent.Status.Version), nil + } + return true, fmt.Sprintf("component %s has version: %s", cpComponent.Name, cpComponent.Status.Version), nil + }, + }, + e2eutil.WithTimeout(30*time.Minute), + e2eutil.WithInterval(10*time.Second), + ) + + // Step 2: Wait for controlPlaneVersion to complete rollout (4.22+). + // Inlined from e2eutil.WaitForControlPlaneRollout / isControlPlaneVersionCompleted + // because the v1 wrapper takes *testing.T. 
+ By("Waiting for control plane version to complete rollout") + e2eutil.GinkgoAtLeast(e2eutil.Version422) + e2eutil.EventuallyObject(GinkgoTB(), ctx, "control plane version to complete rollout", + func(ctx context.Context) (*hyperv1.HostedCluster, error) { + hc := &hyperv1.HostedCluster{} + err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(hostedCluster), hc) + return hc, err + }, + []e2eutil.Predicate[*hyperv1.HostedCluster]{ + func(hc *hyperv1.HostedCluster) (done bool, reasons string, err error) { + if hc.Status.ControlPlaneVersion.Desired.Image == "" { + return false, "HostedCluster has no controlPlaneVersion status", nil + } + if len(hc.Status.ControlPlaneVersion.History) == 0 { + return false, "HostedCluster controlPlaneVersion has no history", nil + } + entry := hc.Status.ControlPlaneVersion.History[0] + if entry.Image != hc.Status.ControlPlaneVersion.Desired.Image { + return false, fmt.Sprintf("controlPlaneVersion desired image %s doesn't match most recent image in history %s", + hc.Status.ControlPlaneVersion.Desired.Image, entry.Image), nil + } + if entry.State != configv1.CompletedUpdate { + return false, fmt.Sprintf("controlPlaneVersion state is %s, waiting for Completed", entry.State), nil + } + return true, "controlPlaneVersion reached Completed", nil + }, + }, + e2eutil.WithTimeout(30*time.Minute), + e2eutil.WithInterval(10*time.Second), + ) + + // Step 3: Wait for the data plane (CVO) rollout to complete. + // Inlined from e2eutil.WaitForDataPlaneRollout because the v1 wrapper + // takes *testing.T. 
+ By("Waiting for data plane (CVO) rollout to complete") + e2eutil.EventuallyObject(GinkgoTB(), ctx, "data plane to complete rollout", + func(ctx context.Context) (*hyperv1.HostedCluster, error) { + hc := &hyperv1.HostedCluster{} + err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(hostedCluster), hc) + return hc, err + }, + []e2eutil.Predicate[*hyperv1.HostedCluster]{ + e2eutil.ConditionPredicate[*hyperv1.HostedCluster](e2eutil.Condition{ + Type: string(hyperv1.HostedClusterAvailable), + Status: metav1.ConditionTrue, + }), + e2eutil.ConditionPredicate[*hyperv1.HostedCluster](e2eutil.Condition{ + Type: string(hyperv1.HostedClusterProgressing), + Status: metav1.ConditionFalse, + }), + func(hc *hyperv1.HostedCluster) (done bool, reasons string, err error) { + if len(ptr.Deref(hc.Status.Version, hyperv1.ClusterVersionStatus{}).History) == 0 { + return false, "HostedCluster has no version history", nil + } + if lastVersionCompletionTime != nil && + hc.Status.Version.History[0].CompletionTime != nil && + lastVersionCompletionTime.Equal(hc.Status.Version.History[0].CompletionTime) { + return false, "HostedCluster version history has not been updated yet", nil + } + if wanted, got := hc.Status.Version.Desired.Image, hc.Status.Version.History[0].Image; wanted != got { + return false, fmt.Sprintf("desired image %s doesn't match most recent image in history %s", wanted, got), nil + } + if wanted, got := configv1.CompletedUpdate, hc.Status.Version.History[0].State; wanted != got { + return false, fmt.Sprintf("wanted most recent version history to have state %s, has state %s", wanted, got), nil + } + return true, "cluster rolled out", nil + }, + }, + e2eutil.WithTimeout(30*time.Minute), + ) + + // TODO: Add post-upgrade validation checks once the e2eutil functions are + // refactored to accept testing.TB instead of *testing.T. 
The following + // checks are performed by the v1 test but cannot be called from Ginkgo: + // - e2eutil.EnsureFeatureGateStatus + // - e2eutil.EnsureNodeCountMatchesNodePoolReplicas + // - e2eutil.EnsureNoCrashingPods + // - e2eutil.EnsureMachineDeploymentGeneration + }) + }) +} + +// RegisterControlPlaneUpgradeTests registers all control plane upgrade tests. +func RegisterControlPlaneUpgradeTests(getTestCtx internal.TestContextGetter) { + ControlPlaneUpgradeTest(getTestCtx) +} + +var _ = Describe("Control Plane Upgrade", Label("control-plane-upgrade"), func() { + var ( + testCtx *internal.TestContext + ) + + BeforeEach(func() { + testCtx = internal.GetTestContext() + Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite") + + if err := testCtx.ValidateControlPlaneNamespace(); err != nil { + AbortSuite(err.Error()) + } + }) + + RegisterControlPlaneUpgradeTests(func() *internal.TestContext { return testCtx }) +}) From 88650c5a88bc1797076cb307756c6ec86337a553 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Fri, 15 May 2026 08:39:22 -0400 Subject: [PATCH 05/11] refactor(e2e): widen 12 e2eutil functions from *testing.T to testing.TB GinkgoTB() returns *GinkgoTBWrapper which implements testing.TB but not *testing.T. Widening these signatures lets v2 tests call shared helpers directly instead of inlining them. The change is backwards-compatible since *testing.T satisfies testing.TB. Functions that use t.Run() (Ensure*) cannot be widened and remain as TODOs in the v2 upgrade test. 
Co-Authored-By: Claude Opus 4.6 --- test/e2e/util/util.go | 24 +- .../v2/tests/control_plane_upgrade_test.go | 213 ++++-------------- 2 files changed, 56 insertions(+), 181 deletions(-) diff --git a/test/e2e/util/util.go b/test/e2e/util/util.go index dcbf6ed7e55..9928bccfcf2 100644 --- a/test/e2e/util/util.go +++ b/test/e2e/util/util.go @@ -343,7 +343,7 @@ func WaitForGuestRestConfig(t *testing.T, ctx context.Context, client crclient.C return guestConfig } -func WaitForGuestClient(t *testing.T, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster) crclient.Client { +func WaitForGuestClient(t testing.TB, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster) crclient.Client { g := NewWithT(t) guestKubeConfigSecretData := WaitForGuestKubeConfig(t, ctx, client, hostedCluster) @@ -463,23 +463,23 @@ func WaitForGuestKubeconfigHostResolutionUpdate(t *testing.T, ctx context.Contex g.Expect(err).NotTo(HaveOccurred(), "failed to wait for guest kubeconfig host resolution to update") } -func WaitForNReadyNodes(t *testing.T, ctx context.Context, client crclient.Client, n int32, platform hyperv1.PlatformType) []corev1.Node { +func WaitForNReadyNodes(t testing.TB, ctx context.Context, client crclient.Client, n int32, platform hyperv1.PlatformType) []corev1.Node { return WaitForNReadyNodesWithOptions(t, ctx, client, n, platform, "") } -func WaitForReadyNodesByNodePool(t *testing.T, ctx context.Context, client crclient.Client, np *hyperv1.NodePool, platform hyperv1.PlatformType, opts ...NodePoolPollOption) []corev1.Node { +func WaitForReadyNodesByNodePool(t testing.TB, ctx context.Context, client crclient.Client, np *hyperv1.NodePool, platform hyperv1.PlatformType, opts ...NodePoolPollOption) []corev1.Node { return WaitForNReadyNodesWithOptions(t, ctx, client, *np.Spec.Replicas, platform, fmt.Sprintf("for NodePool %s/%s", np.Namespace, np.Name), append(opts, WithClientOptions(crclient.MatchingLabelsSelector{Selector: 
labels.SelectorFromSet(labels.Set{hyperv1.NodePoolLabel: np.Name})}))...) } -func WaitForReadyNodesByLabels(t *testing.T, ctx context.Context, client crclient.Client, platform hyperv1.PlatformType, replicas int32, nodeLabels map[string]string) []corev1.Node { +func WaitForReadyNodesByLabels(t testing.TB, ctx context.Context, client crclient.Client, platform hyperv1.PlatformType, replicas int32, nodeLabels map[string]string) []corev1.Node { return WaitForNReadyNodesWithOptions(t, ctx, client, replicas, platform, "", WithClientOptions(crclient.MatchingLabelsSelector{Selector: labels.SelectorFromSet(labels.Set(nodeLabels))})) } -func WaitForNodePoolConfigUpdateComplete(t *testing.T, ctx context.Context, client crclient.Client, np *hyperv1.NodePool) { +func WaitForNodePoolConfigUpdateComplete(t testing.TB, ctx context.Context, client crclient.Client, np *hyperv1.NodePool) { WaitForNodePoolConfigUpdateCompleteWithPlatform(t, ctx, client, np, hyperv1.NonePlatform) } -func WaitForNodePoolConfigUpdateCompleteWithPlatform(t *testing.T, ctx context.Context, client crclient.Client, np *hyperv1.NodePool, platform hyperv1.PlatformType) { +func WaitForNodePoolConfigUpdateCompleteWithPlatform(t testing.TB, ctx context.Context, client crclient.Client, np *hyperv1.NodePool, platform hyperv1.PlatformType) { // configUpdateTimeout for config updates to complete configUpdateTimeout := 25 * time.Minute switch platform { @@ -557,7 +557,7 @@ func WithSuffix(suffix string) NodePoolPollOption { } } -func WaitForNReadyNodesWithOptions(t *testing.T, ctx context.Context, client crclient.Client, n int32, platform hyperv1.PlatformType, suffix string, opts ...NodePoolPollOption) []corev1.Node { +func WaitForNReadyNodesWithOptions(t testing.TB, ctx context.Context, client crclient.Client, n int32, platform hyperv1.PlatformType, suffix string, opts ...NodePoolPollOption) []corev1.Node { options := &NodePoolPollOptions{} for _, opt := range opts { opt(options) @@ -606,7 +606,7 @@ func 
WaitForNReadyNodesWithOptions(t *testing.T, ctx context.Context, client crc // This was renamed from WaitForImageRollout to clarify that it checks HC.Status.Version // (data-plane CVO rollout), in contrast to WaitForControlPlaneRollout which checks // HC.Status.ControlPlaneVersion (management-side components). -func WaitForDataPlaneRollout(t *testing.T, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster) { +func WaitForDataPlaneRollout(t testing.TB, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster) { var lastVersionCompletionTime *metav1.Time if hostedCluster.Status.Version != nil && len(hostedCluster.Status.Version.History) > 0 { @@ -651,14 +651,14 @@ func WaitForDataPlaneRollout(t *testing.T, ctx context.Context, client crclient. // WaitForImageRollout is a deprecated alias for WaitForDataPlaneRollout. // Deprecated: Use WaitForDataPlaneRollout instead. -func WaitForImageRollout(t *testing.T, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster) { +func WaitForImageRollout(t testing.TB, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster) { WaitForDataPlaneRollout(t, ctx, client, hostedCluster) } // WaitForControlPlaneRollout waits for HC.Status.ControlPlaneVersion to reach Completed state // with the desired image. This checks management-side component rollout independently from CVO. // Must be gated with AtLeast(t, Version422) at call sites since older clusters lack this field. 
-func WaitForControlPlaneRollout(t *testing.T, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster) { +func WaitForControlPlaneRollout(t testing.TB, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster) { EventuallyObject(t, ctx, fmt.Sprintf("HostedCluster %s/%s controlPlaneVersion to complete", hostedCluster.Namespace, hostedCluster.Name), func(ctx context.Context) (*hyperv1.HostedCluster, error) { hc := &hyperv1.HostedCluster{} @@ -676,7 +676,7 @@ func WaitForControlPlaneRollout(t *testing.T, ctx context.Context, client crclie // WaitForControlPlaneComponentRollout waits for all ControlPlaneComponent resources to report // RolloutComplete=True and a version different from initialVersion. This provides a belt-and-suspenders // check alongside WaitForControlPlaneRollout by directly inspecting individual component status. -func WaitForControlPlaneComponentRollout(t *testing.T, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster, initialVersion string) { +func WaitForControlPlaneComponentRollout(t testing.TB, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster, initialVersion string) { controlPlaneComponents := &hyperv1.ControlPlaneComponentList{} controlPlaneNamespace := manifests.HostedControlPlaneNamespace(hostedCluster.Namespace, hostedCluster.Name) EventuallyObjects(t, ctx, "control plane components to complete rollout", @@ -735,7 +735,7 @@ func WaitForConditionsOnHostedControlPlane(t *testing.T, ctx context.Context, cl ) } -func WaitForNodePoolDesiredNodes(t *testing.T, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster) { +func WaitForNodePoolDesiredNodes(t testing.TB, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster) { EventuallyObjects(t, ctx, fmt.Sprintf("NodePools for HostedCluster %s/%s to have all of their desired nodes", hostedCluster.Namespace, hostedCluster.Name), 
func(ctx context.Context) ([]*hyperv1.NodePool, error) { list := &hyperv1.NodePoolList{} diff --git a/test/e2e/v2/tests/control_plane_upgrade_test.go b/test/e2e/v2/tests/control_plane_upgrade_test.go index 0c3316d651c..d729d338601 100644 --- a/test/e2e/v2/tests/control_plane_upgrade_test.go +++ b/test/e2e/v2/tests/control_plane_upgrade_test.go @@ -17,181 +17,62 @@ limitations under the License. package tests import ( - "context" - "fmt" - "time" - . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - configv1 "github.com/openshift/api/config/v1" hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" e2eutil "github.com/openshift/hypershift/test/e2e/util" "github.com/openshift/hypershift/test/e2e/v2/internal" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/utils/ptr" crclient "sigs.k8s.io/controller-runtime/pkg/client" ) -// ControlPlaneUpgradeTest registers tests for control plane upgrade lifecycle. +// ControlPlaneUpgradeTest upgrades the hosted cluster from N-1 to the latest release image. func ControlPlaneUpgradeTest(getTestCtx internal.TestContextGetter) { - Context("Control Plane Upgrade", func() { - It("should upgrade the control plane to the latest release image", func() { - testCtx := getTestCtx() - ctx := testCtx.Context - hostedCluster := testCtx.GetHostedCluster() - Expect(hostedCluster).NotTo(BeNil(), "hosted cluster should be available") - - latestReleaseImage := internal.GetEnvVarValue("E2E_LATEST_RELEASE_IMAGE") - Expect(latestReleaseImage).NotTo(BeEmpty(), "E2E_LATEST_RELEASE_IMAGE must be set for upgrade tests") - - // Record the starting version from the current version history. 
- var startingVersion string - if hostedCluster.Status.Version != nil && len(hostedCluster.Status.Version.History) > 0 { - startingVersion = hostedCluster.Status.Version.History[0].Version - } - GinkgoLogr.Info("Starting control plane upgrade", - "startingVersion", startingVersion, - "targetImage", latestReleaseImage, - ) - - // Capture the last completion time before the upgrade so the data plane - // rollout predicate can detect when a *new* history entry completes. - var lastVersionCompletionTime *metav1.Time - if hostedCluster.Status.Version != nil && len(hostedCluster.Status.Version.History) > 0 { - lastVersionCompletionTime = hostedCluster.Status.Version.History[0].CompletionTime - } + It("should upgrade the control plane from N-1 to latest", func() { + testCtx := getTestCtx() + ctx := testCtx.Context + hc := testCtx.GetHostedCluster() + Expect(hc).NotTo(BeNil(), "hosted cluster should be available") + + latestImage := internal.GetEnvVarValue("E2E_LATEST_RELEASE_IMAGE") + Expect(latestImage).NotTo(BeEmpty(), "E2E_LATEST_RELEASE_IMAGE must be set for upgrade tests") + + var startingVersion string + if hc.Status.Version != nil && len(hc.Status.Version.History) > 0 { + startingVersion = hc.Status.Version.History[0].Version + } + GinkgoWriter.Printf("Starting upgrade from version %s to image %s\n", startingVersion, latestImage) - // Update the hosted cluster release image and set ForceUpgradeToAnnotation. - // UpdateObject takes testing.TB so GinkgoTB() works here. - err := e2eutil.UpdateObject(GinkgoTB(), ctx, testCtx.MgmtClient, hostedCluster, func(obj *hyperv1.HostedCluster) { - obj.Spec.Release.Image = latestReleaseImage - if obj.Annotations == nil { - obj.Annotations = make(map[string]string) - } - obj.Annotations[hyperv1.ForceUpgradeToAnnotation] = latestReleaseImage - }) - Expect(err).NotTo(HaveOccurred(), "failed to update hosted cluster release image") - - // Step 1: Wait for ControlPlaneComponent resources to complete rollout (4.20+). 
- // Inlined from e2eutil.WaitForControlPlaneComponentRollout because the v1 - // wrapper takes *testing.T which is incompatible with GinkgoTB(). - By("Waiting for control plane components to complete rollout") - e2eutil.GinkgoAtLeast(e2eutil.Version420) - e2eutil.EventuallyObjects(GinkgoTB(), ctx, "control plane components to complete rollout", - func(ctx context.Context) ([]*hyperv1.ControlPlaneComponent, error) { - list := &hyperv1.ControlPlaneComponentList{} - err := testCtx.MgmtClient.List(ctx, list, crclient.InNamespace(testCtx.ControlPlaneNamespace)) - items := make([]*hyperv1.ControlPlaneComponent, len(list.Items)) - for i := range list.Items { - items[i] = &list.Items[i] - } - return items, err - }, - []e2eutil.Predicate[[]*hyperv1.ControlPlaneComponent]{ - func(cpComponents []*hyperv1.ControlPlaneComponent) (done bool, reasons string, err error) { - return len(cpComponents) > 10, "expecting more than 10 control plane components", nil - }, - }, - []e2eutil.Predicate[*hyperv1.ControlPlaneComponent]{ - e2eutil.ConditionPredicate[*hyperv1.ControlPlaneComponent](e2eutil.Condition{ - Type: string(hyperv1.ControlPlaneComponentRolloutComplete), - Status: metav1.ConditionTrue, - }), - func(cpComponent *hyperv1.ControlPlaneComponent) (done bool, reasons string, err error) { - if startingVersion != "" && cpComponent.Status.Version == startingVersion { - return false, fmt.Sprintf("component %s is still on version %s", cpComponent.Name, cpComponent.Status.Version), nil - } - return true, fmt.Sprintf("component %s has version: %s", cpComponent.Name, cpComponent.Status.Version), nil - }, - }, - e2eutil.WithTimeout(30*time.Minute), - e2eutil.WithInterval(10*time.Second), - ) - - // Step 2: Wait for controlPlaneVersion to complete rollout (4.22+). - // Inlined from e2eutil.WaitForControlPlaneRollout / isControlPlaneVersionCompleted - // because the v1 wrapper takes *testing.T. 
- By("Waiting for control plane version to complete rollout") - e2eutil.GinkgoAtLeast(e2eutil.Version422) - e2eutil.EventuallyObject(GinkgoTB(), ctx, "control plane version to complete rollout", - func(ctx context.Context) (*hyperv1.HostedCluster, error) { - hc := &hyperv1.HostedCluster{} - err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(hostedCluster), hc) - return hc, err - }, - []e2eutil.Predicate[*hyperv1.HostedCluster]{ - func(hc *hyperv1.HostedCluster) (done bool, reasons string, err error) { - if hc.Status.ControlPlaneVersion.Desired.Image == "" { - return false, "HostedCluster has no controlPlaneVersion status", nil - } - if len(hc.Status.ControlPlaneVersion.History) == 0 { - return false, "HostedCluster controlPlaneVersion has no history", nil - } - entry := hc.Status.ControlPlaneVersion.History[0] - if entry.Image != hc.Status.ControlPlaneVersion.Desired.Image { - return false, fmt.Sprintf("controlPlaneVersion desired image %s doesn't match most recent image in history %s", - hc.Status.ControlPlaneVersion.Desired.Image, entry.Image), nil - } - if entry.State != configv1.CompletedUpdate { - return false, fmt.Sprintf("controlPlaneVersion state is %s, waiting for Completed", entry.State), nil - } - return true, "controlPlaneVersion reached Completed", nil - }, - }, - e2eutil.WithTimeout(30*time.Minute), - e2eutil.WithInterval(10*time.Second), - ) - - // Step 3: Wait for the data plane (CVO) rollout to complete. - // Inlined from e2eutil.WaitForDataPlaneRollout because the v1 wrapper - // takes *testing.T. 
- By("Waiting for data plane (CVO) rollout to complete") - e2eutil.EventuallyObject(GinkgoTB(), ctx, "data plane to complete rollout", - func(ctx context.Context) (*hyperv1.HostedCluster, error) { - hc := &hyperv1.HostedCluster{} - err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(hostedCluster), hc) - return hc, err - }, - []e2eutil.Predicate[*hyperv1.HostedCluster]{ - e2eutil.ConditionPredicate[*hyperv1.HostedCluster](e2eutil.Condition{ - Type: string(hyperv1.HostedClusterAvailable), - Status: metav1.ConditionTrue, - }), - e2eutil.ConditionPredicate[*hyperv1.HostedCluster](e2eutil.Condition{ - Type: string(hyperv1.HostedClusterProgressing), - Status: metav1.ConditionFalse, - }), - func(hc *hyperv1.HostedCluster) (done bool, reasons string, err error) { - if len(ptr.Deref(hc.Status.Version, hyperv1.ClusterVersionStatus{}).History) == 0 { - return false, "HostedCluster has no version history", nil - } - if lastVersionCompletionTime != nil && - hc.Status.Version.History[0].CompletionTime != nil && - lastVersionCompletionTime.Equal(hc.Status.Version.History[0].CompletionTime) { - return false, "HostedCluster version history has not been updated yet", nil - } - if wanted, got := hc.Status.Version.Desired.Image, hc.Status.Version.History[0].Image; wanted != got { - return false, fmt.Sprintf("desired image %s doesn't match most recent image in history %s", wanted, got), nil - } - if wanted, got := configv1.CompletedUpdate, hc.Status.Version.History[0].State; wanted != got { - return false, fmt.Sprintf("wanted most recent version history to have state %s, has state %s", wanted, got), nil - } - return true, "cluster rolled out", nil - }, - }, - e2eutil.WithTimeout(30*time.Minute), - ) - - // TODO: Add post-upgrade validation checks once the e2eutil functions are - // refactored to accept testing.TB instead of *testing.T. 
The following - // checks are performed by the v1 test but cannot be called from Ginkgo: - // - e2eutil.EnsureFeatureGateStatus - // - e2eutil.EnsureNodeCountMatchesNodePoolReplicas - // - e2eutil.EnsureNoCrashingPods - // - e2eutil.EnsureMachineDeploymentGeneration + err := e2eutil.UpdateObject(GinkgoTB(), ctx, testCtx.MgmtClient, hc, func(obj *hyperv1.HostedCluster) { + obj.Spec.Release.Image = latestImage + if obj.Annotations == nil { + obj.Annotations = make(map[string]string) + } + obj.Annotations[hyperv1.ForceUpgradeToAnnotation] = latestImage }) + Expect(err).NotTo(HaveOccurred(), "failed to update hosted cluster release image") + + By("Waiting for control plane components to complete rollout") + e2eutil.GinkgoAtLeast(e2eutil.Version420) + e2eutil.WaitForControlPlaneComponentRollout(GinkgoTB(), ctx, testCtx.MgmtClient, hc, startingVersion) + + By("Waiting for control plane version to complete rollout") + e2eutil.GinkgoAtLeast(e2eutil.Version422) + e2eutil.WaitForControlPlaneRollout(GinkgoTB(), ctx, testCtx.MgmtClient, hc) + + By("Waiting for data plane rollout to complete") + e2eutil.WaitForDataPlaneRollout(GinkgoTB(), ctx, testCtx.MgmtClient, hc) + + // Re-fetch HC after upgrade + Expect(testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(hc), hc)).To(Succeed()) + + // TODO: Add post-upgrade validation checks once the Ensure* functions + // in e2eutil are refactored from *testing.T to testing.TB: + // - EnsureFeatureGateStatus + // - EnsureNodeCountMatchesNodePoolReplicas + // - EnsureNoCrashingPods + // - EnsureMachineDeploymentGeneration }) } @@ -201,17 +82,11 @@ func RegisterControlPlaneUpgradeTests(getTestCtx internal.TestContextGetter) { } var _ = Describe("Control Plane Upgrade", Label("control-plane-upgrade"), func() { - var ( - testCtx *internal.TestContext - ) + var testCtx *internal.TestContext BeforeEach(func() { testCtx = internal.GetTestContext() Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite") - - if err := 
testCtx.ValidateControlPlaneNamespace(); err != nil { - AbortSuite(err.Error()) - } }) RegisterControlPlaneUpgradeTests(func() *internal.TestContext { return testCtx }) From 3b07c4ac2aa5312e99dabc07288be92b83f57f04 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Fri, 15 May 2026 08:48:39 -0400 Subject: [PATCH 06/11] test(e2ev2): add nodepool lifecycle tests Port 13 NodePool sub-tests from v1 to v2 Ginkgo framework. Each sub-test creates its own uniquely-named NodePool and cleans up, making tests parallelism-safe for shared clusters. Tests ported: MachineConfig rollout, NTO rollout (Replace + InPlace), NodePool upgrade (Replace + InPlace), rolling upgrade, previous release (N-1, N-2), mirror configs, trust bundle propagation, NTO performance profile, auto-repair skeleton, and disk encryption skeleton. Label: nodepool-lifecycle Co-Authored-By: Claude Opus 4.6 --- test/e2e/v2/tests/nodepool_lifecycle_test.go | 1487 ++++++++++++++++++ 1 file changed, 1487 insertions(+) create mode 100644 test/e2e/v2/tests/nodepool_lifecycle_test.go diff --git a/test/e2e/v2/tests/nodepool_lifecycle_test.go b/test/e2e/v2/tests/nodepool_lifecycle_test.go new file mode 100644 index 00000000000..4c892d9e97e --- /dev/null +++ b/test/e2e/v2/tests/nodepool_lifecycle_test.go @@ -0,0 +1,1487 @@ +//go:build e2ev2 + +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package tests + +import ( + "context" + "encoding/json" + "fmt" + "time" + + "github.com/google/go-cmp/cmp" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + ignitionapi "github.com/coreos/ignition/v2/config/v3_2/types" + mcfgv1 "github.com/openshift/api/machineconfiguration/v1" + hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" + "github.com/openshift/hypershift/hypershift-operator/controllers/manifests" + "github.com/openshift/hypershift/hypershift-operator/controllers/nodepool" + hyperapi "github.com/openshift/hypershift/support/api" + "github.com/openshift/hypershift/support/netutil" + "github.com/openshift/hypershift/support/podspec" + e2eutil "github.com/openshift/hypershift/test/e2e/util" + "github.com/openshift/hypershift/test/e2e/v2/internal" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/utils/ptr" + crclient "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/apiutil" + "sigs.k8s.io/yaml" +) + +// RegisterNodePoolLifecycleTests registers all NodePool lifecycle test cases. 
+func RegisterNodePoolLifecycleTests(getTestCtx internal.TestContextGetter) { + NodePoolMachineconfigRolloutTest(getTestCtx) + NodePoolNTORolloutTest(getTestCtx) + NodePoolNTOInPlaceTest(getTestCtx) + NodePoolReplaceUpgradeTest(getTestCtx) + NodePoolInPlaceUpgradeTest(getTestCtx) + NodePoolRollingUpgradeTest(getTestCtx) + NodePoolPrevReleaseN1Test(getTestCtx) + NodePoolPrevReleaseN2Test(getTestCtx) + NodePoolMirrorConfigsTest(getTestCtx) + NodePoolTrustBundleTest(getTestCtx) + NodePoolNTOPerformanceProfileTest(getTestCtx) + NodePoolAutoRepairTest(getTestCtx) + NodePoolDiskEncryptionTest(getTestCtx) +} + +var _ = Describe("NodePool Lifecycle", Label("nodepool-lifecycle"), func() { + var testCtx *internal.TestContext + + BeforeEach(func() { + testCtx = internal.GetTestContext() + Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite") + }) + + RegisterNodePoolLifecycleTests(func() *internal.TestContext { return testCtx }) +}) + +// NodePoolMachineconfigRolloutTest creates a NodePool with Replace upgrade strategy, +// applies a MachineConfig via ConfigMap, patches the NodePool to reference it, +// creates a verification DaemonSet in the hosted cluster, and waits for config update +// complete and DaemonSet rollout. 
+func NodePoolMachineconfigRolloutTest(getTestCtx internal.TestContextGetter) {
+	It("should roll out a MachineConfig change via Replace upgrade strategy", func() {
+		testCtx := getTestCtx()
+		Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+
+		hc := testCtx.GetHostedCluster()
+		Expect(hc).NotTo(BeNil(), "hosted cluster should be available")
+
+		if hc.Spec.Platform.Type == hyperv1.KubevirtPlatform {
+			Skip("test is skipped for KubeVirt platform until https://issues.redhat.com/browse/CNV-38196 is addressed")
+		}
+
+		guestClient := testCtx.GetHostedClusterClient()
+		Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		ctx := testCtx.Context
+
+		defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc)
+		Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist")
+
+		var oneReplica int32 = 1
+		np := buildTestNodePool(defaultNP, "mc-rollout", func(pool *hyperv1.NodePool) {
+			pool.Spec.Replicas = &oneReplica
+			pool.Spec.Management.Replace = &hyperv1.ReplaceUpgrade{
+				Strategy: hyperv1.UpgradeStrategyRollingUpdate,
+				RollingUpdate: &hyperv1.RollingUpdate{
+					MaxUnavailable: ptr.To(intstr.FromInt32(0)),
+					MaxSurge:       ptr.To(intstr.FromInt32(oneReplica)),
+				},
+			}
+		})
+
+		err := testCtx.MgmtClient.Create(ctx, np)
+		Expect(err).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name)
+		GinkgoWriter.Printf("Created NodePool %s\n", np.Name)
+		defer cleanupNodePool(ctx, testCtx.MgmtClient, np)
+
+		e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, hc.Spec.Platform.Type)
+
+		// Build MachineConfig with a custom file at /etc/custom-config
+		ignitionConfig := ignitionapi.Config{
+			Ignition: ignitionapi.Ignition{Version: "3.2.0"},
+			Storage: ignitionapi.Storage{
+				Files: []ignitionapi.File{{
+					Node:          ignitionapi.Node{Path: "/etc/custom-config"},
+					FileEmbedded1: ignitionapi.FileEmbedded1{Contents: ignitionapi.Resource{Source: ptr.To("data:,content%0A")}},
+				}},
+			},
+		}
+		serializedIgnition, err := json.Marshal(ignitionConfig)
+		Expect(err).NotTo(HaveOccurred(), "failed to serialize ignition config")
+
+		machineConfig := &mcfgv1.MachineConfig{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:   "custom",
+				Labels: map[string]string{"machineconfiguration.openshift.io/role": "worker"},
+			},
+			Spec: mcfgv1.MachineConfigSpec{Config: runtime.RawExtension{Raw: serializedIgnition}},
+		}
+		gvk, err := apiutil.GVKForObject(machineConfig, hyperapi.Scheme)
+		Expect(err).NotTo(HaveOccurred(), "failed to get GVK for MachineConfig")
+		machineConfig.SetGroupVersionKind(gvk)
+
+		serializedMC, err := yaml.Marshal(machineConfig)
+		Expect(err).NotTo(HaveOccurred(), "failed to serialize MachineConfig")
+
+		mcConfigMap := &corev1.ConfigMap{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      e2eutil.SimpleNameGenerator.GenerateName("custom-mc-"),
+				Namespace: hc.Namespace,
+			},
+			Data: map[string]string{"config": string(serializedMC)},
+		}
+		Expect(testCtx.MgmtClient.Create(ctx, mcConfigMap)).To(Succeed(), "failed to create MachineConfig ConfigMap")
+		GinkgoWriter.Printf("Created MachineConfig ConfigMap %s\n", mcConfigMap.Name)
+		// Do not leave the ConfigMap behind in the HostedCluster namespace. Deferred
+		// calls run last-in-first-out, so this runs before the NodePool teardown.
+		defer func() {
+			// Best-effort: a failed delete is logged, it must not mask the test result.
+			if delErr := testCtx.MgmtClient.Delete(ctx, mcConfigMap); delErr != nil {
+				GinkgoWriter.Printf("failed to delete MachineConfig ConfigMap %s: %v\n", mcConfigMap.Name, delErr)
+			}
+		}()
+
+		original := np.DeepCopy()
+		np.Spec.Config = append(np.Spec.Config, corev1.LocalObjectReference{Name: mcConfigMap.Name})
+		Expect(testCtx.MgmtClient.Patch(ctx, np, crclient.MergeFrom(original))).To(Succeed(),
+			"failed to patch NodePool %s with MachineConfig", np.Name)
+
+		// Build verification DaemonSet that checks /etc/custom-config exists
+		ds := buildMachineConfigVerificationDaemonSet(np)
+		Expect(guestClient.Create(ctx, ds)).To(Succeed(), "failed to create verification DaemonSet")
+		// The hosted cluster is shared between specs; remove the DaemonSet when done.
+		defer func() {
+			if delErr := guestClient.Delete(ctx, ds); delErr != nil {
+				GinkgoWriter.Printf("failed to delete verification DaemonSet %s: %v\n", ds.Name, delErr)
+			}
+		}()
+
+		e2eutil.WaitForNodePoolConfigUpdateCompleteWithPlatform(GinkgoTB(), ctx, testCtx.MgmtClient, np, hc.Spec.Platform.Type)
+		waitForDaemonSetRollout(ctx, guestClient, ds, 1, np.Spec.Platform.Type)
+		e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, hc.Spec.Platform.Type)
+
+		// TODO: EnsureNoCrashingPods, EnsureAllContainersHavePullPolicyIfNotPresent,
+		// EnsureHCPContainersHaveResourceRequests, EnsureNoPodsWithTooHighPriority
+		// require *testing.T and cannot be called from Ginkgo yet.
+	})
+}
+
+// NodePoolNTORolloutTest creates a NodePool with NTO Tuned config (hugepages),
+// patches the NodePool's TuningConfig, creates a verification DaemonSet,
+// and waits for rollout via Replace upgrade strategy. The Tuned ConfigMap and
+// the DaemonSet are deleted again when the spec finishes.
+func NodePoolNTORolloutTest(getTestCtx internal.TestContextGetter) {
+	It("should roll out an NTO Tuned config change via Replace upgrade strategy", func() {
+		testCtx := getTestCtx()
+		Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+
+		hc := testCtx.GetHostedCluster()
+		Expect(hc).NotTo(BeNil(), "hosted cluster should be available")
+
+		if hc.Spec.Platform.Type == hyperv1.KubevirtPlatform {
+			Skip("test is skipped for KubeVirt platform until https://issues.redhat.com/browse/CNV-38196 is addressed")
+		}
+		if hc.Spec.Platform.Type == hyperv1.OpenStackPlatform {
+			Skip("test is skipped for OpenStack platform until https://issues.redhat.com/browse/OSASINFRA-3566 is addressed")
+		}
+
+		guestClient := testCtx.GetHostedClusterClient()
+		Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		ctx := testCtx.Context
+
+		defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc)
+		Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist")
+
+		var twoReplicas int32 = 2
+		np := buildTestNodePool(defaultNP, "nto-replace", func(pool *hyperv1.NodePool) {
+			pool.Spec.Replicas = &twoReplicas
+			pool.Spec.Management.Replace = &hyperv1.ReplaceUpgrade{
+				Strategy: hyperv1.UpgradeStrategyRollingUpdate,
+				RollingUpdate: &hyperv1.RollingUpdate{
+					MaxUnavailable: ptr.To(intstr.FromInt32(0)),
+					MaxSurge:       ptr.To(intstr.FromInt32(twoReplicas)),
+				},
+			}
+		})
+
+		err := testCtx.MgmtClient.Create(ctx, np)
+		Expect(err).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name)
+		GinkgoWriter.Printf("Created NodePool %s\n", np.Name)
+		defer cleanupNodePool(ctx, testCtx.MgmtClient, np)
+
+		e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, hc.Spec.Platform.Type)
+
+		tuningCM := &corev1.ConfigMap{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      e2eutil.SimpleNameGenerator.GenerateName("hugepages-tuned-"),
+				Namespace: hc.Namespace,
+			},
+			Data: map[string]string{tuningConfigKey: hugepagesTunedYAML},
+		}
+		Expect(testCtx.MgmtClient.Create(ctx, tuningCM)).To(Succeed(), "failed to create Tuned ConfigMap")
+		// Deferred calls run last-in-first-out, so this runs before the NodePool teardown.
+		defer func() {
+			// Best-effort: a failed delete is logged, it must not mask the test result.
+			if delErr := testCtx.MgmtClient.Delete(ctx, tuningCM); delErr != nil {
+				GinkgoWriter.Printf("failed to delete Tuned ConfigMap %s: %v\n", tuningCM.Name, delErr)
+			}
+		}()
+
+		original := np.DeepCopy()
+		np.Spec.TuningConfig = append(np.Spec.TuningConfig, corev1.LocalObjectReference{Name: tuningCM.Name})
+		Expect(testCtx.MgmtClient.Patch(ctx, np, crclient.MergeFrom(original))).To(Succeed(),
+			"failed to patch NodePool %s with TuningConfig", np.Name)
+
+		ds := buildNTOVerificationDaemonSet(np)
+		Expect(guestClient.Create(ctx, ds)).To(Succeed(), "failed to create NTO verification DaemonSet")
+		// The hosted cluster is shared between specs; remove the DaemonSet when done.
+		defer func() {
+			if delErr := guestClient.Delete(ctx, ds); delErr != nil {
+				GinkgoWriter.Printf("failed to delete NTO verification DaemonSet %s: %v\n", ds.Name, delErr)
+			}
+		}()
+
+		e2eutil.WaitForNodePoolConfigUpdateCompleteWithPlatform(GinkgoTB(), ctx, testCtx.MgmtClient, np, hc.Spec.Platform.Type)
+		waitForDaemonSetRollout(ctx, guestClient, ds, 2, np.Spec.Platform.Type)
+		e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, hc.Spec.Platform.Type)
+
+		// TODO: EnsureNoCrashingPods, EnsureAllContainersHavePullPolicyIfNotPresent,
+		// EnsureHCPContainersHaveResourceRequests, EnsureNoPodsWithTooHighPriority
+		// require *testing.T and cannot be called from Ginkgo yet.
+	})
+}
+
+// NodePoolNTOInPlaceTest applies an NTO Tuned config with InPlace upgrade type.
+func NodePoolNTOInPlaceTest(getTestCtx internal.TestContextGetter) {
+	It("should roll out an NTO Tuned config change via InPlace upgrade strategy", func() {
+		testCtx := getTestCtx()
+		Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+
+		hc := testCtx.GetHostedCluster()
+		Expect(hc).NotTo(BeNil(), "hosted cluster should be available")
+
+		if hc.Spec.Platform.Type == hyperv1.KubevirtPlatform {
+			Skip("test is skipped for KubeVirt platform until https://issues.redhat.com/browse/CNV-38196 is addressed")
+		}
+		if hc.Spec.Platform.Type == hyperv1.OpenStackPlatform {
+			Skip("test is skipped for OpenStack platform until https://issues.redhat.com/browse/OSASINFRA-3566 is addressed")
+		}
+
+		guestClient := testCtx.GetHostedClusterClient()
+		Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		ctx := testCtx.Context
+
+		defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc)
+		Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist")
+
+		var twoReplicas int32 = 2
+		np := buildTestNodePool(defaultNP, "nto-inplace", func(pool *hyperv1.NodePool) {
+			pool.Spec.Replicas = &twoReplicas
+			pool.Spec.Management.UpgradeType = hyperv1.UpgradeTypeInPlace
+		})
+
+		err := testCtx.MgmtClient.Create(ctx, np)
+		Expect(err).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name)
+		GinkgoWriter.Printf("Created NodePool %s\n", np.Name)
+		defer cleanupNodePool(ctx, testCtx.MgmtClient, np)
+
+		e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, hc.Spec.Platform.Type)
+
+		tuningCM := &corev1.ConfigMap{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      e2eutil.SimpleNameGenerator.GenerateName("hugepages-inplace-"),
+				Namespace: hc.Namespace,
+			},
+			Data: map[string]string{tuningConfigKey: hugepagesTunedYAML},
+		}
+		Expect(testCtx.MgmtClient.Create(ctx, tuningCM)).To(Succeed(), "failed to create Tuned ConfigMap")
+		// Do not leave the ConfigMap behind in the HostedCluster namespace. Deferred
+		// calls run last-in-first-out, so this runs before the NodePool teardown.
+		defer func() {
+			// Best-effort: a failed delete is logged, it must not mask the test result.
+			if delErr := testCtx.MgmtClient.Delete(ctx, tuningCM); delErr != nil {
+				GinkgoWriter.Printf("failed to delete Tuned ConfigMap %s: %v\n", tuningCM.Name, delErr)
+			}
+		}()
+
+		original := np.DeepCopy()
+		np.Spec.TuningConfig = append(np.Spec.TuningConfig, corev1.LocalObjectReference{Name: tuningCM.Name})
+		Expect(testCtx.MgmtClient.Patch(ctx, np, crclient.MergeFrom(original))).To(Succeed(),
+			"failed to patch NodePool %s with TuningConfig", np.Name)
+
+		ds := buildNTOVerificationDaemonSet(np)
+		Expect(guestClient.Create(ctx, ds)).To(Succeed(), "failed to create NTO verification DaemonSet")
+		// The hosted cluster is shared between specs; remove the DaemonSet when done.
+		defer func() {
+			if delErr := guestClient.Delete(ctx, ds); delErr != nil {
+				GinkgoWriter.Printf("failed to delete NTO verification DaemonSet %s: %v\n", ds.Name, delErr)
+			}
+		}()
+
+		e2eutil.WaitForNodePoolConfigUpdateCompleteWithPlatform(GinkgoTB(), ctx, testCtx.MgmtClient, np, hc.Spec.Platform.Type)
+		waitForDaemonSetRollout(ctx, guestClient, ds, 2, np.Spec.Platform.Type)
+		e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, hc.Spec.Platform.Type)
+
+		// TODO: EnsureNoCrashingPods, EnsureAllContainersHavePullPolicyIfNotPresent,
+		// EnsureHCPContainersHaveResourceRequests, EnsureNoPodsWithTooHighPriority
+		// require *testing.T and cannot be called from Ginkgo yet.
+	})
+}
+
+// NodePoolReplaceUpgradeTest creates a NodePool at previous release image, waits for nodes,
+// upgrades to latest image, and waits for version to update via Replace upgrade strategy.
+func NodePoolReplaceUpgradeTest(getTestCtx internal.TestContextGetter) {
+	It("should upgrade a NodePool from previous to latest release via Replace strategy", func() {
+		tc := getTestCtx()
+		Expect(tc).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+
+		cluster := tc.GetHostedCluster()
+		Expect(cluster).NotTo(BeNil(), "hosted cluster should be available")
+
+		hostedClient := tc.GetHostedClusterClient()
+		Expect(hostedClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		fromImage := internal.GetEnvVarValue("E2E_PREVIOUS_RELEASE_IMAGE")
+		toImage := internal.GetEnvVarValue("E2E_LATEST_RELEASE_IMAGE")
+		if fromImage == "" || toImage == "" {
+			Skip("E2E_PREVIOUS_RELEASE_IMAGE and E2E_LATEST_RELEASE_IMAGE must be set for upgrade tests")
+		}
+
+		ctx := tc.Context
+
+		basePool := getDefaultNodePool(ctx, tc.MgmtClient, cluster)
+		Expect(basePool).NotTo(BeNil(), "default NodePool should exist")
+
+		const replicas int32 = 1
+		np := buildTestNodePool(basePool, "replace-upgrade", func(pool *hyperv1.NodePool) {
+			pool.Spec.Replicas = ptr.To(replicas)
+			pool.Spec.Release.Image = fromImage
+			pool.Spec.Management.Replace = &hyperv1.ReplaceUpgrade{
+				Strategy: hyperv1.UpgradeStrategyRollingUpdate,
+				RollingUpdate: &hyperv1.RollingUpdate{
+					MaxUnavailable: ptr.To(intstr.FromInt32(0)),
+					MaxSurge:       ptr.To(intstr.FromInt32(replicas)),
+				},
+			}
+		})
+
+		createErr := tc.MgmtClient.Create(ctx, np)
+		Expect(createErr).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name)
+		GinkgoWriter.Printf("Created NodePool %s at previous release %s\n", np.Name, fromImage)
+		defer cleanupNodePool(ctx, tc.MgmtClient, np)
+
+		e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, hostedClient, np, cluster.Spec.Platform.Type)
+
+		// Point the NodePool at the latest release image to trigger the upgrade.
+		GinkgoWriter.Printf("Upgrading NodePool %s to latest release %s\n", np.Name, toImage)
+		bumpImage := func(obj *hyperv1.NodePool) {
+			obj.Spec.Release.Image = toImage
+		}
+		Expect(e2eutil.UpdateObject(GinkgoTB(), ctx, tc.MgmtClient, np, bumpImage)).To(Succeed(), "failed to update NodePool release image")
+
+		// fetchPool re-reads the NodePool from the management cluster on every poll.
+		fetchPool := func(ctx context.Context) (*hyperv1.NodePool, error) {
+			latest := &hyperv1.NodePool{}
+			getErr := tc.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(np), latest)
+			return latest, getErr
+		}
+		// updatingVersionIs matches the UpdatingVersion condition against the given status.
+		updatingVersionIs := func(status metav1.ConditionStatus) []e2eutil.Predicate[*hyperv1.NodePool] {
+			return []e2eutil.Predicate[*hyperv1.NodePool]{
+				e2eutil.ConditionPredicate[*hyperv1.NodePool](e2eutil.Condition{
+					Type:   hyperv1.NodePoolUpdatingVersionConditionType,
+					Status: status,
+				}),
+			}
+		}
+
+		// Upgrade has started once the condition flips to True.
+		e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to start the upgrade", np.Namespace, np.Name),
+			fetchPool,
+			updatingVersionIs(metav1.ConditionTrue),
+		)
+
+		// Upgrade is complete once the condition is False again.
+		upgradeTimeout := nodePoolUpgradeTimeout(cluster.Spec.Platform.Type)
+		e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to complete the upgrade", np.Namespace, np.Name),
+			fetchPool,
+			updatingVersionIs(metav1.ConditionFalse),
+			e2eutil.WithTimeout(upgradeTimeout),
+		)
+
+		e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, hostedClient, np, cluster.Spec.Platform.Type)
+
+		// TODO: EnsureNodesLabelsAndTaints, EnsureNodesRuntime require *testing.T
+	})
+}
+
+// NodePoolInPlaceUpgradeTest creates a NodePool at previous release image, waits for nodes,
+// upgrades to latest image via InPlace upgrade strategy.
+func NodePoolInPlaceUpgradeTest(getTestCtx internal.TestContextGetter) {
+	It("should upgrade a NodePool from previous to latest release via InPlace strategy", func() {
+		tc := getTestCtx()
+		Expect(tc).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+
+		cluster := tc.GetHostedCluster()
+		Expect(cluster).NotTo(BeNil(), "hosted cluster should be available")
+
+		hostedClient := tc.GetHostedClusterClient()
+		Expect(hostedClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		fromImage := internal.GetEnvVarValue("E2E_PREVIOUS_RELEASE_IMAGE")
+		toImage := internal.GetEnvVarValue("E2E_LATEST_RELEASE_IMAGE")
+		if fromImage == "" || toImage == "" {
+			Skip("E2E_PREVIOUS_RELEASE_IMAGE and E2E_LATEST_RELEASE_IMAGE must be set for upgrade tests")
+		}
+
+		ctx := tc.Context
+
+		basePool := getDefaultNodePool(ctx, tc.MgmtClient, cluster)
+		Expect(basePool).NotTo(BeNil(), "default NodePool should exist")
+
+		np := buildTestNodePool(basePool, "inplace-upgrade", func(pool *hyperv1.NodePool) {
+			pool.Spec.Replicas = ptr.To(int32(1))
+			pool.Spec.Release.Image = fromImage
+			pool.Spec.Management.UpgradeType = hyperv1.UpgradeTypeInPlace
+		})
+
+		createErr := tc.MgmtClient.Create(ctx, np)
+		Expect(createErr).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name)
+		GinkgoWriter.Printf("Created NodePool %s at previous release %s\n", np.Name, fromImage)
+		defer cleanupNodePool(ctx, tc.MgmtClient, np)
+
+		e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, hostedClient, np, cluster.Spec.Platform.Type)
+
+		GinkgoWriter.Printf("Upgrading NodePool %s to latest release %s\n", np.Name, toImage)
+		bumpImage := func(obj *hyperv1.NodePool) {
+			obj.Spec.Release.Image = toImage
+		}
+		Expect(e2eutil.UpdateObject(GinkgoTB(), ctx, tc.MgmtClient, np, bumpImage)).To(Succeed(), "failed to update NodePool release image")
+
+		// fetchPool re-reads the NodePool from the management cluster on every poll.
+		fetchPool := func(ctx context.Context) (*hyperv1.NodePool, error) {
+			latest := &hyperv1.NodePool{}
+			getErr := tc.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(np), latest)
+			return latest, getErr
+		}
+		// updatingVersionIs matches the UpdatingVersion condition against the given status.
+		updatingVersionIs := func(status metav1.ConditionStatus) []e2eutil.Predicate[*hyperv1.NodePool] {
+			return []e2eutil.Predicate[*hyperv1.NodePool]{
+				e2eutil.ConditionPredicate[*hyperv1.NodePool](e2eutil.Condition{
+					Type:   hyperv1.NodePoolUpdatingVersionConditionType,
+					Status: status,
+				}),
+			}
+		}
+
+		// Upgrade has started once the condition flips to True.
+		e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to start the upgrade", np.Namespace, np.Name),
+			fetchPool,
+			updatingVersionIs(metav1.ConditionTrue),
+		)
+
+		// Upgrade is complete once the condition is False again.
+		upgradeTimeout := nodePoolUpgradeTimeout(cluster.Spec.Platform.Type)
+		e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to complete the upgrade", np.Namespace, np.Name),
+			fetchPool,
+			updatingVersionIs(metav1.ConditionFalse),
+			e2eutil.WithTimeout(upgradeTimeout),
+		)
+
+		e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, hostedClient, np, cluster.Spec.Platform.Type)
+
+		// TODO: EnsureNodesLabelsAndTaints, EnsureNodesRuntime require *testing.T
+	})
+}
+
+// NodePoolRollingUpgradeTest creates a NodePool with 2 replicas, changes instance type
+// (AWS) or VM size (Azure) to trigger a rolling upgrade, and waits for the rollout to
+// start and finish. Verifying the machine specs after the upgrade is still a TODO.
+// Only runs on AWS and Azure platforms.
+func NodePoolRollingUpgradeTest(getTestCtx internal.TestContextGetter) {
+	It("should perform a rolling upgrade when instance type or VM size changes", func() {
+		tc := getTestCtx()
+		Expect(tc).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+
+		cluster := tc.GetHostedCluster()
+		Expect(cluster).NotTo(BeNil(), "hosted cluster should be available")
+
+		platform := cluster.Spec.Platform.Type
+		if platform != hyperv1.AWSPlatform && platform != hyperv1.AzurePlatform {
+			Skip("rolling upgrade test only supported on AWS and Azure platforms")
+		}
+
+		hostedClient := tc.GetHostedClusterClient()
+		Expect(hostedClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		ctx := tc.Context
+
+		basePool := getDefaultNodePool(ctx, tc.MgmtClient, cluster)
+		Expect(basePool).NotTo(BeNil(), "default NodePool should exist")
+
+		np := buildTestNodePool(basePool, "rolling-upgrade", func(pool *hyperv1.NodePool) {
+			pool.Spec.Replicas = ptr.To(int32(2))
+			pool.Spec.Management.UpgradeType = hyperv1.UpgradeTypeReplace
+			// Starting machine size; it is changed below to force a rollout.
+			switch platform {
+			case hyperv1.AWSPlatform:
+				pool.Spec.Platform.AWS.InstanceType = "m5.large"
+			case hyperv1.AzurePlatform:
+				pool.Spec.Platform.Azure.VMSize = "Standard_D2s_v3"
+			}
+		})
+
+		createErr := tc.MgmtClient.Create(ctx, np)
+		Expect(createErr).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name)
+		GinkgoWriter.Printf("Created NodePool %s with 2 replicas\n", np.Name)
+		defer cleanupNodePool(ctx, tc.MgmtClient, np)
+
+		e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, hostedClient, np, platform)
+
+		// Change instance type / VM size to trigger rolling upgrade
+		resize := func(obj *hyperv1.NodePool) {
+			switch platform {
+			case hyperv1.AWSPlatform:
+				obj.Spec.Platform.AWS.InstanceType = "m5.xlarge"
+			case hyperv1.AzurePlatform:
+				obj.Spec.Platform.Azure.VMSize = "Standard_D4s_v5"
+			}
+		}
+		Expect(e2eutil.UpdateObject(GinkgoTB(), ctx, tc.MgmtClient, np, resize)).To(Succeed(), "failed to update NodePool instance type / VM size")
+
+		// fetchPool re-reads the NodePool from the management cluster on every poll.
+		fetchPool := func(ctx context.Context) (*hyperv1.NodePool, error) {
+			latest := &hyperv1.NodePool{}
+			getErr := tc.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(np), latest)
+			return latest, getErr
+		}
+		// updatingTemplateIs matches the UpdatingPlatformMachineTemplate condition
+		// against the given status.
+		updatingTemplateIs := func(status metav1.ConditionStatus) []e2eutil.Predicate[*hyperv1.NodePool] {
+			return []e2eutil.Predicate[*hyperv1.NodePool]{
+				e2eutil.ConditionPredicate[*hyperv1.NodePool](e2eutil.Condition{
+					Type:   hyperv1.NodePoolUpdatingPlatformMachineTemplateConditionType,
+					Status: status,
+				}),
+			}
+		}
+
+		// Wait for rolling upgrade to start
+		e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to start the rolling upgrade", np.Namespace, np.Name),
+			fetchPool,
+			updatingTemplateIs(metav1.ConditionTrue),
+			e2eutil.WithTimeout(2*time.Minute),
+		)
+
+		// Wait for rolling upgrade to complete
+		rollingTimeout := nodePoolUpgradeTimeout(platform)
+		e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to finish the rolling upgrade", np.Namespace, np.Name),
+			fetchPool,
+			updatingTemplateIs(metav1.ConditionFalse),
+			e2eutil.WithTimeout(rollingTimeout),
+		)
+
+		// TODO: Verify machine specs (AWSMachineList / AzureMachineList) after upgrade.
+		// The v1 test uses capiaws.AWSMachineList and capiazure.AzureMachineList to check
+		// that instance types / VM sizes match. This requires importing CAPI provider types
+		// which adds significant dependency. Implement once the pattern is established.
+	})
+}
+
+// NodePoolPrevReleaseN1Test creates a NodePool at N-1 release image and waits for nodes ready.
+func NodePoolPrevReleaseN1Test(getTestCtx internal.TestContextGetter) {
+	It("should create a NodePool at N-1 release and have ready nodes", func() {
+		tc := getTestCtx()
+		Expect(tc).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+
+		cluster := tc.GetHostedCluster()
+		Expect(cluster).NotTo(BeNil(), "hosted cluster should be available")
+
+		// Without an N-1 image there is nothing to test.
+		releaseImage := internal.GetEnvVarValue("E2E_N1_RELEASE_IMAGE")
+		if len(releaseImage) == 0 {
+			Skip("E2E_N1_RELEASE_IMAGE not set, skipping N-1 release test")
+		}
+
+		hostedClient := tc.GetHostedClusterClient()
+		Expect(hostedClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		ctx := tc.Context
+
+		basePool := getDefaultNodePool(ctx, tc.MgmtClient, cluster)
+		Expect(basePool).NotTo(BeNil(), "default NodePool should exist")
+
+		np := buildTestNodePool(basePool, "prev-n1", func(pool *hyperv1.NodePool) {
+			pool.Spec.Replicas = ptr.To(int32(1))
+			pool.Spec.Release.Image = releaseImage
+		})
+
+		createErr := tc.MgmtClient.Create(ctx, np)
+		Expect(createErr).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name)
+		GinkgoWriter.Printf("Created NodePool %s at N-1 release %s\n", np.Name, releaseImage)
+		defer cleanupNodePool(ctx, tc.MgmtClient, np)
+
+		e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, hostedClient, np, cluster.Spec.Platform.Type)
+
+		// TODO: EnsureNodesLabelsAndTaints requires *testing.T
+	})
+}
+
+// NodePoolPrevReleaseN2Test creates a NodePool at N-2 release image and waits for nodes ready.
+func NodePoolPrevReleaseN2Test(getTestCtx internal.TestContextGetter) {
+	It("should create a NodePool at N-2 release and have ready nodes", func() {
+		tc := getTestCtx()
+		Expect(tc).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+
+		cluster := tc.GetHostedCluster()
+		Expect(cluster).NotTo(BeNil(), "hosted cluster should be available")
+
+		// Without an N-2 image there is nothing to test.
+		releaseImage := internal.GetEnvVarValue("E2E_N2_RELEASE_IMAGE")
+		if len(releaseImage) == 0 {
+			Skip("E2E_N2_RELEASE_IMAGE not set, skipping N-2 release test")
+		}
+
+		hostedClient := tc.GetHostedClusterClient()
+		Expect(hostedClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		ctx := tc.Context
+
+		basePool := getDefaultNodePool(ctx, tc.MgmtClient, cluster)
+		Expect(basePool).NotTo(BeNil(), "default NodePool should exist")
+
+		np := buildTestNodePool(basePool, "prev-n2", func(pool *hyperv1.NodePool) {
+			pool.Spec.Replicas = ptr.To(int32(1))
+			pool.Spec.Release.Image = releaseImage
+		})
+
+		createErr := tc.MgmtClient.Create(ctx, np)
+		Expect(createErr).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name)
+		GinkgoWriter.Printf("Created NodePool %s at N-2 release %s\n", np.Name, releaseImage)
+		defer cleanupNodePool(ctx, tc.MgmtClient, np)
+
+		e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, hostedClient, np, cluster.Spec.Platform.Type)
+
+		// TODO: EnsureNodesLabelsAndTaints requires *testing.T
+	})
+}
+
+// NodePoolMirrorConfigsTest creates a KubeletConfig ConfigMap, patches NodePool config,
+// verifies the KubeletConfig gets mirrored to the hosted cluster's openshift-config-managed
+// namespace, then removes the config and verifies cleanup. Only for 4.18+.
+func NodePoolMirrorConfigsTest(getTestCtx internal.TestContextGetter) {
+	It("should mirror KubeletConfig to the hosted cluster and clean up on removal", func() {
+		tc := getTestCtx()
+		Expect(tc).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+
+		cluster := tc.GetHostedCluster()
+		Expect(cluster).NotTo(BeNil(), "hosted cluster should be available")
+
+		if e2eutil.IsLessThan(e2eutil.Version418) {
+			Skip("mirror configs test only applicable for 4.18+")
+		}
+
+		hostedClient := tc.GetHostedClusterClient()
+		Expect(hostedClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		ctx := tc.Context
+
+		basePool := getDefaultNodePool(ctx, tc.MgmtClient, cluster)
+		Expect(basePool).NotTo(BeNil(), "default NodePool should exist")
+
+		np := buildTestNodePool(basePool, "mirror-cfg", func(pool *hyperv1.NodePool) {
+			pool.Spec.Replicas = ptr.To(int32(1))
+		})
+
+		createErr := tc.MgmtClient.Create(ctx, np)
+		Expect(createErr).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name)
+		GinkgoWriter.Printf("Created NodePool %s\n", np.Name)
+		defer cleanupNodePool(ctx, tc.MgmtClient, np)
+
+		e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, hostedClient, np, cluster.Spec.Platform.Type)
+
+		kcConfigMap := &corev1.ConfigMap{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      e2eutil.SimpleNameGenerator.GenerateName("kc-test-"),
+				Namespace: np.Namespace,
+			},
+			Data: map[string]string{configKey: kubeletConfig1YAML},
+		}
+		Expect(tc.MgmtClient.Create(ctx, kcConfigMap)).To(Succeed(), "failed to create KubeletConfig ConfigMap")
+		defer func() {
+			// Best-effort cleanup; the delete error is intentionally ignored.
+			_ = tc.MgmtClient.Delete(ctx, kcConfigMap)
+		}()
+
+		withoutConfig := np.DeepCopy()
+		np.Spec.Config = append(np.Spec.Config, corev1.LocalObjectReference{Name: kcConfigMap.Name})
+		Expect(tc.MgmtClient.Patch(ctx, np, crclient.MergeFrom(withoutConfig))).To(Succeed(),
+			"failed to patch NodePool %s with KubeletConfig", np.Name)
+
+		// listMirrored returns the KubeletConfig ConfigMaps labelled for this NodePool
+		// in the hosted cluster's config-managed namespace.
+		listMirrored := func(ctx context.Context) ([]*corev1.ConfigMap, error) {
+			found := &corev1.ConfigMapList{}
+			listErr := hostedClient.List(ctx, found, crclient.InNamespace(configManagedNamespace),
+				crclient.MatchingLabels(map[string]string{
+					nodepool.KubeletConfigConfigMapLabel: "true",
+					hyperv1.NodePoolLabel:                np.Name,
+				}))
+			result := make([]*corev1.ConfigMap, len(found.Items))
+			for idx := range found.Items {
+				result[idx] = &found.Items[idx]
+			}
+			return result, listErr
+		}
+		// countIs builds the collection predicate asserting an exact number of ConfigMaps.
+		countIs := func(want int) []e2eutil.Predicate[[]*corev1.ConfigMap] {
+			return []e2eutil.Predicate[[]*corev1.ConfigMap]{
+				func(configMaps []*corev1.ConfigMap) (done bool, reasons string, err error) {
+					got := len(configMaps)
+					return want == got, fmt.Sprintf("expected %d KubeletConfig ConfigMaps, got %d", want, got), nil
+				},
+			}
+		}
+
+		// Verify mirrored ConfigMap appears in the hosted cluster
+		e2eutil.EventuallyObjects(GinkgoTB(), ctx, "KubeletConfig should be mirrored to the hosted cluster",
+			listMirrored,
+			countIs(1),
+			[]e2eutil.Predicate[*corev1.ConfigMap]{
+				func(cm *corev1.ConfigMap) (done bool, reasons string, err error) {
+					want := netutil.ShortenName(kcConfigMap.Name, np.Name, nodepool.QualifiedNameMaxLength)
+					if cm.Name == want {
+						return true, "ConfigMap name is as expected", nil
+					}
+					return false, fmt.Sprintf("expected ConfigMap name %q, got %q", want, cm.Name), nil
+				},
+				func(cm *corev1.ConfigMap) (done bool, reasons string, err error) {
+					// Compare only the three labels of interest.
+					actual := map[string]string{
+						nodepool.KubeletConfigConfigMapLabel: cm.Labels[nodepool.KubeletConfigConfigMapLabel],
+						hyperv1.NodePoolLabel:                cm.Labels[hyperv1.NodePoolLabel],
+						nodepool.NTOMirroredConfigLabel:      cm.Labels[nodepool.NTOMirroredConfigLabel],
+					}
+					expected := map[string]string{
+						nodepool.KubeletConfigConfigMapLabel: "true",
+						hyperv1.NodePoolLabel:                np.Name,
+						nodepool.NTOMirroredConfigLabel:      "true",
+					}
+					diff := cmp.Diff(actual, expected)
+					if diff == "" {
+						return true, "labels are correct", nil
+					}
+					return false, fmt.Sprintf("incorrect labels: %v", diff), nil
+				},
+			},
+		)
+
+		// Remove KubeletConfig from NodePool and verify cleanup
+		GinkgoWriter.Printf("Removing KubeletConfig reference from NodePool %s\n", np.Name)
+		withConfig := np.DeepCopy()
+		np.Spec = withoutConfig.Spec
+		Expect(tc.MgmtClient.Patch(ctx, np, crclient.MergeFrom(withConfig))).To(Succeed(),
+			"failed to remove KubeletConfig from NodePool %s", np.Name)
+
+		e2eutil.EventuallyObjects(GinkgoTB(), ctx, "KubeletConfig ConfigMap to be deleted from hosted cluster",
+			listMirrored,
+			countIs(0), nil,
+		)
+	})
+}
+
+// NodePoolTrustBundleTest creates an additional trust bundle ConfigMap, updates the
+// HostedCluster to reference it, waits for NodePool update cycle, verifies user-ca-bundle
+// exists in the hosted cluster, removes the trust bundle, verifies CPO deployment no longer
+// mounts it, waits for another update cycle, and verifies user-ca-bundle is deleted (4.22+).
+func NodePoolTrustBundleTest(getTestCtx internal.TestContextGetter) {
+	It("should propagate and remove additional trust bundle to/from the hosted cluster", func() {
+		testCtx := getTestCtx()
+		Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+
+		e2eutil.GinkgoAtLeast(e2eutil.Version418)
+
+		hc := testCtx.GetHostedCluster()
+		Expect(hc).NotTo(BeNil(), "hosted cluster should be available")
+
+		guestClient := testCtx.GetHostedClusterClient()
+		Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		ctx := testCtx.Context
+
+		// Create additional trust bundle ConfigMap
+		trustBundle := &corev1.ConfigMap{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      e2eutil.SimpleNameGenerator.GenerateName("trust-bundle-"),
+				Namespace: hc.Namespace,
+			},
+			Data: map[string]string{"ca-bundle.crt": "dummy"},
+		}
+		Expect(testCtx.MgmtClient.Create(ctx, trustBundle)).To(Succeed(), "failed to create trust bundle ConfigMap")
+
+		// Best-effort cleanup of the ConfigMap created above. It is registered before
+		// the HostedCluster cleanup below so that, with LIFO defer ordering, the
+		// ConfigMap is only deleted after the HostedCluster stops referencing it.
+		defer func() {
+			if err := crclient.IgnoreNotFound(testCtx.MgmtClient.Delete(ctx, trustBundle)); err != nil {
+				GinkgoWriter.Printf("cleanup: failed to delete trust bundle ConfigMap %s/%s: %v\n", trustBundle.Namespace, trustBundle.Name, err)
+			}
+		}()
+
+		// Update HostedCluster to reference the trust bundle
+		GinkgoWriter.Printf("Updating HostedCluster with additional trust bundle %s\n", trustBundle.Name)
+		Expect(e2eutil.UpdateObject(GinkgoTB(), ctx, testCtx.MgmtClient, hc, func(obj *hyperv1.HostedCluster) {
+			obj.Spec.AdditionalTrustBundle = &corev1.LocalObjectReference{Name: trustBundle.Name}
+		})).To(Succeed(), "failed to update HostedCluster with trust bundle")
+
+		// Defer cleanup: remove trust bundle reference from HostedCluster. This is
+		// best-effort (it must not mask the real test failure), but a failure is
+		// logged because it leaves the shared HostedCluster in a modified state.
+		defer func() {
+			err := e2eutil.UpdateObject(GinkgoTB(), ctx, testCtx.MgmtClient, hc, func(obj *hyperv1.HostedCluster) {
+				obj.Spec.AdditionalTrustBundle = nil
+			})
+			if err != nil {
+				GinkgoWriter.Printf("cleanup: failed to remove trust bundle reference from HostedCluster %s/%s: %v\n", hc.Namespace, hc.Name, err)
+			}
+		}()
+
+		defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc)
+		Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist")
+
+		// waitForNodePool polls the default NodePool every 10s until all predicates
+		// hold or the timeout expires. objective completes the sentence
+		// "NodePool <ns>/<name> to ..." used in progress and failure messages.
+		waitForNodePool := func(objective string, timeout time.Duration, predicates ...e2eutil.Predicate[*hyperv1.NodePool]) {
+			GinkgoHelper()
+			e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to %s", defaultNP.Namespace, defaultNP.Name, objective),
+				func(ctx context.Context) (*hyperv1.NodePool, error) {
+					pool := &hyperv1.NodePool{}
+					err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(defaultNP), pool)
+					return pool, err
+				},
+				predicates,
+				e2eutil.WithInterval(10*time.Second), e2eutil.WithTimeout(timeout),
+			)
+		}
+		updatingConfig := func(status metav1.ConditionStatus) e2eutil.Predicate[*hyperv1.NodePool] {
+			return e2eutil.ConditionPredicate[*hyperv1.NodePool](e2eutil.Condition{
+				Type:   hyperv1.NodePoolUpdatingConfigConditionType,
+				Status: status,
+			})
+		}
+		allNodesHealthy := func() e2eutil.Predicate[*hyperv1.NodePool] {
+			return e2eutil.ConditionPredicate[*hyperv1.NodePool](e2eutil.Condition{
+				Type:   hyperv1.NodePoolAllNodesHealthyConditionType,
+				Status: metav1.ConditionTrue,
+			})
+		}
+
+		// Wait for NodePool to begin updating
+		waitForNodePool("begin updating", 5*time.Minute, updatingConfig(metav1.ConditionTrue))
+
+		// Wait for NodePool to stop updating
+		waitForNodePool("stop updating", 20*time.Minute, updatingConfig(metav1.ConditionFalse), allNodesHealthy())
+
+		// Verify user-ca-bundle exists in the hosted cluster
+		userCAConfigMap := &corev1.ConfigMap{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      "user-ca-bundle",
+				Namespace: "openshift-config",
+			},
+		}
+		e2eutil.EventuallyObject(GinkgoTB(), ctx, "user-ca-bundle to exist in hosted cluster",
+			func(ctx context.Context) (*corev1.ConfigMap, error) {
+				cm := &corev1.ConfigMap{}
+				err := guestClient.Get(ctx, crclient.ObjectKeyFromObject(userCAConfigMap), cm)
+				return cm, err
+			},
+			[]e2eutil.Predicate[*corev1.ConfigMap]{
+				// Existence is the only requirement: the getter errors until the
+				// ConfigMap can be fetched, so the predicate itself always passes.
+				func(obj *corev1.ConfigMap) (bool, string, error) { return true, "exists", nil },
+			},
+			e2eutil.WithInterval(10*time.Second), e2eutil.WithTimeout(5*time.Minute),
+		)
+
+		// Remove trust bundle from HostedCluster
+		GinkgoWriter.Printf("Removing additional trust bundle from HostedCluster\n")
+		Expect(e2eutil.UpdateObject(GinkgoTB(), ctx, testCtx.MgmtClient, hc, func(obj *hyperv1.HostedCluster) {
+			obj.Spec.AdditionalTrustBundle = nil
+		})).To(Succeed(), "failed to remove trust bundle from HostedCluster")
+
+		// Verify CPO deployment no longer mounts the trust bundle
+		cpNamespace := manifests.HostedControlPlaneNamespace(hc.Namespace, hc.Name)
+		cpoDeployment := &appsv1.Deployment{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      "control-plane-operator",
+				Namespace: cpNamespace,
+			},
+		}
+		e2eutil.EventuallyObject(GinkgoTB(), ctx, "CPO deployment to stop mounting trust bundle",
+			func(ctx context.Context) (*appsv1.Deployment, error) {
+				deploy := &appsv1.Deployment{}
+				err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(cpoDeployment), deploy)
+				return deploy, err
+			},
+			[]e2eutil.Predicate[*appsv1.Deployment]{
+				func(obj *appsv1.Deployment) (bool, string, error) {
+					for _, volume := range obj.Spec.Template.Spec.Volumes {
+						if volume.ConfigMap != nil && volume.ConfigMap.Name == "trusted-ca" {
+							return false, "trust bundle volume still mounted in CPO", nil
+						}
+					}
+					if ready := podspec.IsDeploymentReady(ctx, obj); !ready {
+						return false, "CPO deployment is not ready", nil
+					}
+					return true, "trust bundle volume removed from CPO", nil
+				},
+			},
+		)
+
+		// Wait for NodePool to cycle again
+		waitForNodePool("begin updating after trust bundle removal", 5*time.Minute, updatingConfig(metav1.ConditionTrue))
+		waitForNodePool("stop updating after trust bundle removal", 20*time.Minute, updatingConfig(metav1.ConditionFalse), allNodesHealthy())
+
+		// Verify user-ca-bundle is deleted from the hosted cluster (4.22+)
+		if e2eutil.IsGreaterThanOrEqualTo(e2eutil.Version422) {
+			e2eutil.EventuallyNotFound(GinkgoTB(), ctx, guestClient, userCAConfigMap,
+				e2eutil.WithInterval(10*time.Second), e2eutil.WithTimeout(5*time.Minute),
+			)
+		}
+	})
+}
+
+// NodePoolNTOPerformanceProfileTest creates a PerformanceProfile via ConfigMap,
+// patches the NodePool's TuningConfig, verifies the PerformanceProfile ConfigMap and
+// status ConfigMap are created in the control plane namespace, and verifies cleanup.
+func NodePoolNTOPerformanceProfileTest(getTestCtx internal.TestContextGetter) {
+	It("should create and manage NTO PerformanceProfile via NodePool TuningConfig", func() {
+		testCtx := getTestCtx()
+		Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+
+		hc := testCtx.GetHostedCluster()
+		Expect(hc).NotTo(BeNil(), "hosted cluster should be available")
+
+		if hc.Spec.Platform.Type == hyperv1.OpenStackPlatform {
+			Skip("test is skipped for OpenStack platform until https://issues.redhat.com/browse/OSASINFRA-3566 is addressed")
+		}
+
+		guestClient := testCtx.GetHostedClusterClient()
+		Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		ctx := testCtx.Context
+
+		defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc)
+		Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist")
+
+		var oneReplica int32 = 1
+		np := buildTestNodePool(defaultNP, "nto-perfprof", func(pool *hyperv1.NodePool) {
+			pool.Spec.Replicas = &oneReplica
+		})
+
+		err := testCtx.MgmtClient.Create(ctx, np)
+		Expect(err).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name)
+		GinkgoWriter.Printf("Created NodePool %s\n", np.Name)
+		defer cleanupNodePool(ctx, testCtx.MgmtClient, np)
+
+		e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, hc.Spec.Platform.Type)
+
+		ppConfigMap := &corev1.ConfigMap{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      e2eutil.SimpleNameGenerator.GenerateName("pp-test-"),
+				Namespace: np.Namespace,
+			},
+			Data: map[string]string{tuningConfigKey: performanceProfileYAML},
+		}
+		Expect(testCtx.MgmtClient.Create(ctx, ppConfigMap)).To(Succeed(), "failed to create PerformanceProfile ConfigMap")
+		defer func() {
+			// Best-effort cleanup; a failure here must not mask the test result.
+			_ = testCtx.MgmtClient.Delete(ctx, ppConfigMap)
+		}()
+
+		original := np.DeepCopy()
+		np.Spec.TuningConfig = append(np.Spec.TuningConfig, corev1.LocalObjectReference{Name: ppConfigMap.Name})
+		Expect(testCtx.MgmtClient.Patch(ctx, np, crclient.MergeFrom(original))).To(Succeed(),
+			"failed to patch NodePool %s with PerformanceProfile config", np.Name)
+
+		cpNamespace := manifests.HostedControlPlaneNamespace(hc.Namespace, hc.Name)
+		expectedPPName := netutil.ShortenName(ppConfigMap.Name, np.Name, nodepool.QualifiedNameMaxLength)
+
+		// listControlPlaneConfigMaps returns a getter that lists the ConfigMaps in the
+		// control plane namespace matching selector, as a slice of pointers.
+		listControlPlaneConfigMaps := func(selector map[string]string) func(context.Context) ([]*corev1.ConfigMap, error) {
+			return func(ctx context.Context) ([]*corev1.ConfigMap, error) {
+				list := &corev1.ConfigMapList{}
+				err := testCtx.MgmtClient.List(ctx, list, crclient.InNamespace(cpNamespace), crclient.MatchingLabels(selector))
+				configMaps := make([]*corev1.ConfigMap, len(list.Items))
+				for i := range list.Items {
+					configMaps[i] = &list.Items[i]
+				}
+				return configMaps, err
+			}
+		}
+		// expectCount builds a collection predicate requiring exactly want items.
+		expectCount := func(want int, kind string) e2eutil.Predicate[[]*corev1.ConfigMap] {
+			return func(configMaps []*corev1.ConfigMap) (done bool, reasons string, err error) {
+				got := len(configMaps)
+				return want == got, fmt.Sprintf("expected %d %s ConfigMaps, got %d", want, kind, got), nil
+			}
+		}
+
+		// Scope the selector to this NodePool so PerformanceProfiles that belong to
+		// other NodePools in the same control plane namespace cannot skew the
+		// exact-count checks below.
+		ppSelector := map[string]string{
+			nodepool.PerformanceProfileConfigMapLabel: "true",
+			hyperv1.NodePoolLabel:                     np.Name,
+		}
+
+		// Verify PerformanceProfile ConfigMap exists in control plane namespace
+		e2eutil.EventuallyObjects(GinkgoTB(), ctx, "PerformanceProfile ConfigMap to exist with correct labels",
+			listControlPlaneConfigMaps(ppSelector),
+			[]e2eutil.Predicate[[]*corev1.ConfigMap]{
+				expectCount(1, "PerformanceProfile"),
+			},
+			[]e2eutil.Predicate[*corev1.ConfigMap]{
+				func(cm *corev1.ConfigMap) (done bool, reasons string, err error) {
+					if expectedPPName != cm.Name {
+						return false, fmt.Sprintf("expected PerformanceProfile ConfigMap name %q, got %q", expectedPPName, cm.Name), nil
+					}
+					return true, "PerformanceProfile ConfigMap name is as expected", nil
+				},
+				func(cm *corev1.ConfigMap) (done bool, reasons string, err error) {
+					if diff := cmp.Diff(map[string]string{
+						nodepool.PerformanceProfileConfigMapLabel: cm.Labels[nodepool.PerformanceProfileConfigMapLabel],
+						hyperv1.NodePoolLabel:                     cm.Labels[hyperv1.NodePoolLabel],
+					}, map[string]string{
+						nodepool.PerformanceProfileConfigMapLabel: "true",
+						hyperv1.NodePoolLabel:                     np.Name,
+					}); diff != "" {
+						return false, fmt.Sprintf("incorrect labels: %v", diff), nil
+					}
+					return true, "labels are correct", nil
+				},
+			},
+		)
+
+		// Verify status ConfigMap (4.17+)
+		if !e2eutil.IsLessThan(e2eutil.Version417) {
+			e2eutil.EventuallyObjects(GinkgoTB(), ctx, "PerformanceProfile status ConfigMap to exist",
+				listControlPlaneConfigMaps(map[string]string{
+					nodepool.NodeTuningGeneratedPerformanceProfileStatusLabel: "true",
+				}),
+				[]e2eutil.Predicate[[]*corev1.ConfigMap]{
+					expectCount(1, "status"),
+				},
+				[]e2eutil.Predicate[*corev1.ConfigMap]{
+					func(cm *corev1.ConfigMap) (done bool, reasons string, err error) {
+						want := fmt.Sprintf("status-%s", expectedPPName)
+						if want != cm.Name {
+							return false, fmt.Sprintf("expected status ConfigMap name %q, got %q", want, cm.Name), nil
+						}
+						return true, "status ConfigMap name is as expected", nil
+					},
+				},
+			)
+		}
+
+		// Remove PerformanceProfile from NodePool and verify cleanup
+		GinkgoWriter.Printf("Removing PerformanceProfile reference from NodePool %s\n", np.Name)
+		baseNP := np.DeepCopy()
+		np.Spec = original.Spec
+		Expect(testCtx.MgmtClient.Patch(ctx, np, crclient.MergeFrom(baseNP))).To(Succeed(),
+			"failed to remove PerformanceProfile from NodePool %s", np.Name)
+
+		e2eutil.EventuallyObjects(GinkgoTB(), ctx, "PerformanceProfile ConfigMap to be deleted",
+			listControlPlaneConfigMaps(ppSelector),
+			[]e2eutil.Predicate[[]*corev1.ConfigMap]{
+				expectCount(0, "PerformanceProfile"),
+			}, nil,
+		)
+	})
+}
+
+// NodePoolAutoRepairTest is a skeleton for platform-specific auto-repair tests.
+// The full implementation requires cloud SDK dependencies for instance termination.
+func NodePoolAutoRepairTest(getTestCtx internal.TestContextGetter) {
+	It("should auto-repair a NodePool when a node is terminated", func() {
+		// The spec is skipped unconditionally until instance termination is
+		// implemented; everything below documents the intended setup and does not
+		// run yet.
+		Skip("auto-repair instance termination not yet implemented for v2 framework")
+
+		testCtx := getTestCtx()
+		Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+
+		hc := testCtx.GetHostedCluster()
+		Expect(hc).NotTo(BeNil(), "hosted cluster should be available")
+
+		platform := hc.Spec.Platform.Type
+		if platform != hyperv1.AWSPlatform && platform != hyperv1.AzurePlatform {
+			Skip("auto-repair test only supported on AWS and Azure platforms")
+		}
+
+		guestClient := testCtx.GetHostedClusterClient()
+		Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		ctx := testCtx.Context
+
+		defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc)
+		Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist")
+
+		var oneReplica int32 = 1
+		np := buildTestNodePool(defaultNP, "autorepair", func(pool *hyperv1.NodePool) {
+			pool.Spec.Replicas = &oneReplica
+			pool.Spec.Management.AutoRepair = true
+		})
+
+		err := testCtx.MgmtClient.Create(ctx, np)
+		Expect(err).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name)
+		GinkgoWriter.Printf("Created auto-repair NodePool %s\n", np.Name)
+		defer cleanupNodePool(ctx, testCtx.MgmtClient, np)
+
+		e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, platform)
+
+		// TODO: Implement cloud-specific instance termination logic.
+		// For AWS: use EC2 TerminateInstances API to terminate the node's backing instance.
+		// For Azure: delete the VMSS instance backing the node.
+		// After termination, wait for the node to be replaced using:
+		//   e2eutil.WaitForReadyNodesByNodePool with WithCollectionPredicates and WithPredicates
+		//   to verify the old node is replaced and the new node is healthy.
+	})
+}
+
+// NodePoolDiskEncryptionTest is a skeleton for Azure disk encryption tests.
+// It creates a one-replica NodePool whose OS disk references the
+// DiskEncryptionSet given by E2E_AZURE_DISK_ENCRYPTION_SET_ID and waits for its
+// nodes to become ready. The spec is skipped on non-Azure platforms and when the
+// environment variable is unset.
+func NodePoolDiskEncryptionTest(getTestCtx internal.TestContextGetter) {
+	It("should create a NodePool with Azure DiskEncryptionSet and verify it is applied", func() {
+		testCtx := getTestCtx()
+		Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+
+		hc := testCtx.GetHostedCluster()
+		Expect(hc).NotTo(BeNil(), "hosted cluster should be available")
+
+		if hc.Spec.Platform.Type != hyperv1.AzurePlatform {
+			Skip("disk encryption test only supported on Azure platform")
+		}
+
+		diskEncryptionSetID := internal.GetEnvVarValue("E2E_AZURE_DISK_ENCRYPTION_SET_ID")
+		if diskEncryptionSetID == "" {
+			Skip("E2E_AZURE_DISK_ENCRYPTION_SET_ID not set, skipping disk encryption test")
+		}
+
+		guestClient := testCtx.GetHostedClusterClient()
+		Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		ctx := testCtx.Context
+
+		defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc)
+		Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist")
+		// Fail loudly instead of silently creating an unencrypted NodePool: without
+		// an Azure platform spec the encryption set could not be applied and the
+		// test would pass without exercising disk encryption at all.
+		Expect(defaultNP.Spec.Platform.Azure).NotTo(BeNil(),
+			"default NodePool %s should have an Azure platform spec", defaultNP.Name)
+
+		var oneReplica int32 = 1
+		np := buildTestNodePool(defaultNP, "disk-encrypt", func(pool *hyperv1.NodePool) {
+			pool.Spec.Replicas = &oneReplica
+			pool.Spec.Platform.Azure.OSDisk.EncryptionSetID = diskEncryptionSetID
+		})
+
+		err := testCtx.MgmtClient.Create(ctx, np)
+		Expect(err).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name)
+		GinkgoWriter.Printf("Created disk encryption NodePool %s\n", np.Name)
+		defer cleanupNodePool(ctx, testCtx.MgmtClient, np)
+
+		e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, hc.Spec.Platform.Type)
+
+		// TODO: Verify disk encryption is applied by checking AzureMachine specs
+		// in the control plane namespace. This requires importing CAPI Azure types
+		// (capiazure.AzureMachineList) and verifying DiskEncryptionSetID on each machine.
+	})
+}
+
+// Helper functions
+
+// buildTestNodePool builds a new NodePool from a template with the given name prefix
+// and applies the provided mutation function.
+//
+// The generated name is "<cluster name>-<namePrefix>-<suffix>", the namespace is
+// the template's, and the spec is a deep copy of the template's spec so mutations
+// never leak back into the template. mutate may be nil.
+func buildTestNodePool(template *hyperv1.NodePool, namePrefix string, mutate func(*hyperv1.NodePool)) *hyperv1.NodePool {
+	GinkgoHelper()
+
+	name := e2eutil.SimpleNameGenerator.GenerateName(template.Spec.ClusterName + "-" + namePrefix + "-")
+	np := &hyperv1.NodePool{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      name,
+			Namespace: template.Namespace,
+		},
+	}
+	template.Spec.DeepCopyInto(&np.Spec)
+
+	if mutate != nil {
+		mutate(np)
+	}
+
+	return np
+}
+
+// buildMachineConfigVerificationDaemonSet constructs a DaemonSet that verifies
+// /etc/custom-config exists on nodes (checks MachineConfig was applied).
+func buildMachineConfigVerificationDaemonSet(np *hyperv1.NodePool) *appsv1.DaemonSet {
+	GinkgoHelper()
+
+	// The host root filesystem is exposed read-only under /host so the readiness
+	// probe can look for the file written by the MachineConfig.
+	const hostVolume = "host"
+
+	generatedName := e2eutil.SimpleNameGenerator.GenerateName("mc-verify-")
+
+	// Every label map is built fresh so no two fields share a backing map.
+	poolLabels := func() map[string]string {
+		return map[string]string{hyperv1.NodePoolLabel: np.Name}
+	}
+	podLabels := func() map[string]string {
+		return map[string]string{
+			"name":                generatedName,
+			hyperv1.NodePoolLabel: np.Name,
+		}
+	}
+
+	// The container only sleeps; the pod turns Ready once the probe can read the
+	// expected file from the host.
+	verifier := corev1.Container{
+		Name:    generatedName,
+		Image:   "registry.access.redhat.com/ubi9/ubi-minimal:latest",
+		Command: []string{"/bin/sleep", "24h"},
+		Resources: corev1.ResourceRequirements{
+			Requests: corev1.ResourceList{
+				corev1.ResourceCPU:    resource.MustParse("100m"),
+				corev1.ResourceMemory: resource.MustParse("200Mi"),
+			},
+		},
+		ReadinessProbe: &corev1.Probe{
+			ProbeHandler: corev1.ProbeHandler{
+				Exec: &corev1.ExecAction{
+					Command: []string{"/bin/cat", "/host/etc/custom-config"},
+				},
+			},
+		},
+		VolumeMounts: []corev1.VolumeMount{{
+			Name:      hostVolume,
+			MountPath: "/host",
+			ReadOnly:  true,
+		}},
+	}
+
+	podSpec := corev1.PodSpec{
+		// Schedule only onto this NodePool's nodes, tolerating any taint.
+		NodeSelector:                  poolLabels(),
+		Tolerations:                   []corev1.Toleration{{Operator: corev1.TolerationOpExists}},
+		Containers:                    []corev1.Container{verifier},
+		TerminationGracePeriodSeconds: ptr.To[int64](30),
+		Volumes: []corev1.Volume{{
+			Name: hostVolume,
+			VolumeSource: corev1.VolumeSource{
+				HostPath: &corev1.HostPathVolumeSource{Path: "/"},
+			},
+		}},
+	}
+
+	return &appsv1.DaemonSet{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      generatedName,
+			Namespace: "kube-system",
+			Labels:    poolLabels(),
+		},
+		Spec: appsv1.DaemonSetSpec{
+			Selector: &metav1.LabelSelector{MatchLabels: podLabels()},
+			Template: corev1.PodTemplateSpec{
+				ObjectMeta: metav1.ObjectMeta{Labels: podLabels()},
+				Spec:       podSpec,
+			},
+		},
+	}
+}
+
+// buildNTOVerificationDaemonSet constructs a DaemonSet that verifies hugepages
+// are configured on nodes via /proc/cmdline (checks NTO Tuned config was applied).
+func buildNTOVerificationDaemonSet(np *hyperv1.NodePool) *appsv1.DaemonSet {
+	GinkgoHelper()
+
+	const hostVolume = "host"
+
+	generatedName := e2eutil.SimpleNameGenerator.GenerateName("nto-verify-")
+
+	// Every label map is built fresh so no two fields share a backing map.
+	poolLabels := func() map[string]string {
+		return map[string]string{hyperv1.NodePoolLabel: np.Name}
+	}
+	podLabels := func() map[string]string {
+		return map[string]string{
+			"name":                generatedName,
+			hyperv1.NodePoolLabel: np.Name,
+		}
+	}
+
+	// The container only sleeps; the pod turns Ready once the probe finds the
+	// hugepages arguments on the kernel command line.
+	verifier := corev1.Container{
+		Name:    generatedName,
+		Image:   "registry.access.redhat.com/ubi9/ubi-minimal:latest",
+		Command: []string{"/bin/sleep", "24h"},
+		Resources: corev1.ResourceRequirements{
+			Requests: corev1.ResourceList{
+				corev1.ResourceCPU:    resource.MustParse("100m"),
+				corev1.ResourceMemory: resource.MustParse("200Mi"),
+			},
+		},
+		ReadinessProbe: &corev1.Probe{
+			ProbeHandler: corev1.ProbeHandler{
+				Exec: &corev1.ExecAction{
+					Command: []string{"/bin/sh", "-c", `cat /proc/cmdline | grep "hugepagesz=2M hugepages=4"`},
+				},
+			},
+		},
+		VolumeMounts: []corev1.VolumeMount{{
+			Name:      hostVolume,
+			MountPath: "/host",
+			ReadOnly:  true,
+		}},
+	}
+
+	podSpec := corev1.PodSpec{
+		// Schedule only onto this NodePool's nodes, tolerating any taint.
+		NodeSelector:                  poolLabels(),
+		Tolerations:                   []corev1.Toleration{{Operator: corev1.TolerationOpExists}},
+		Containers:                    []corev1.Container{verifier},
+		TerminationGracePeriodSeconds: ptr.To[int64](30),
+		Volumes: []corev1.Volume{{
+			Name: hostVolume,
+			VolumeSource: corev1.VolumeSource{
+				HostPath: &corev1.HostPathVolumeSource{Path: "/"},
+			},
+		}},
+	}
+
+	return &appsv1.DaemonSet{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      generatedName,
+			Namespace: "kube-system",
+			Labels:    poolLabels(),
+		},
+		Spec: appsv1.DaemonSetSpec{
+			Selector: &metav1.LabelSelector{MatchLabels: podLabels()},
+			Template: corev1.PodTemplateSpec{
+				ObjectMeta: metav1.ObjectMeta{Labels: podLabels()},
+				Spec:       podSpec,
+			},
+		},
+	}
+}
+
+// waitForDaemonSetRollout polls until the DaemonSet has the expected number of ready pods.
+func waitForDaemonSetRollout(ctx context.Context, client crclient.Client, ds *appsv1.DaemonSet, expectedCount int, platform hyperv1.PlatformType) {
+	GinkgoHelper()
+
+	// KubeVirt gets a longer rollout budget than the other platforms.
+	var rolloutTimeout time.Duration
+	switch platform {
+	case hyperv1.KubevirtPlatform:
+		rolloutTimeout = 25 * time.Minute
+	default:
+		rolloutTimeout = 15 * time.Minute
+	}
+
+	// isReady reports whether the pod carries a PodReady condition set to True.
+	isReady := func(pod *corev1.Pod) bool {
+		for _, cond := range pod.Status.Conditions {
+			if cond.Type == corev1.PodReady && cond.Status == corev1.ConditionTrue {
+				return true
+			}
+		}
+		return false
+	}
+
+	// listReadyPods returns the ready pods selected by the DaemonSet's match
+	// labels, together with any List error.
+	listReadyPods := func(ctx context.Context) ([]*corev1.Pod, error) {
+		pods := &corev1.PodList{}
+		listErr := client.List(ctx, pods, crclient.InNamespace(ds.Namespace), crclient.MatchingLabels(ds.Spec.Selector.MatchLabels))
+		ready := []*corev1.Pod{}
+		for idx := range pods.Items {
+			if candidate := &pods.Items[idx]; isReady(candidate) {
+				ready = append(ready, candidate)
+			}
+		}
+		return ready, listErr
+	}
+
+	// readyCountMatches passes once exactly expectedCount pods are ready.
+	readyCountMatches := func(readyPods []*corev1.Pod) (done bool, reasons string, err error) {
+		got := len(readyPods)
+		return expectedCount == got, fmt.Sprintf("expected %d ready Pods, got %d", expectedCount, got), nil
+	}
+
+	objective := fmt.Sprintf("all pods in DaemonSet %s/%s to be ready", ds.Namespace, ds.Name)
+	e2eutil.EventuallyObjects(GinkgoTB(), ctx, objective,
+		listReadyPods,
+		[]e2eutil.Predicate[[]*corev1.Pod]{readyCountMatches},
+		nil,
+		e2eutil.WithTimeout(rolloutTimeout),
+		e2eutil.WithInterval(5*time.Second),
+	)
+}
+
+// nodePoolUpgradeTimeout returns the appropriate timeout for NodePool upgrades
+// based on the platform type.
+func nodePoolUpgradeTimeout(platform hyperv1.PlatformType) time.Duration {
+	switch platform {
+	// Azure and KubeVirt get a longer budget (45 minutes) than the 20-minute default.
+	case hyperv1.AzurePlatform, hyperv1.KubevirtPlatform:
+		return 45 * time.Minute
+	default:
+		return 20 * time.Minute
+	}
+}
+
+// Constants
+
+const (
+	// tuningConfigKey is the ConfigMap data key under which a tuning manifest
+	// (for example performanceProfileYAML) is stored.
+	tuningConfigKey = "tuning"
+	// configKey is a ConfigMap data key; presumably it holds config manifests such
+	// as kubeletConfig1YAML (its use is not visible here; verify against callers).
+	configKey = "config"
+	// configManagedNamespace is the hosted-cluster namespace in which the
+	// KubeletConfig ConfigMaps are looked up.
+	configManagedNamespace = "openshift-config-managed"
+
+	// hugepagesTunedYAML is a Tuned manifest that adds "hugepagesz=2M hugepages=4"
+	// to the kernel command line; buildNTOVerificationDaemonSet greps
+	// /proc/cmdline for exactly that string.
+	hugepagesTunedYAML = `apiVersion: tuned.openshift.io/v1
+kind: Tuned
+metadata:
+  name: hugepages
+  namespace: openshift-cluster-node-tuning-operator
+spec:
+  profile:
+  - data: |
+      [main]
+      summary=Boot time configuration for hugepages
+      include=openshift-node
+      [bootloader]
+      cmdline_openshift_node_hugepages=hugepagesz=2M hugepages=4
+    name: openshift-hugepages
+  recommend:
+  - priority: 20
+    profile: openshift-hugepages
+`
+
+	// kubeletConfig1YAML is a KubeletConfig manifest that sets maxPods to 100.
+	kubeletConfig1YAML = `
+apiVersion: machineconfiguration.openshift.io/v1
+kind: KubeletConfig
+metadata:
+  name: set-max-pods
+spec:
+  kubeletConfig:
+    maxPods: 100
+`
+
+	// performanceProfileYAML is the PerformanceProfile manifest stored under
+	// tuningConfigKey by NodePoolNTOPerformanceProfileTest.
+	performanceProfileYAML = `
+apiVersion: performance.openshift.io/v2
+kind: PerformanceProfile
+metadata:
+  name: perfprof-2
+spec:
+  cpu:
+    isolated: "1"
+    reserved: "0"
+  numa:
+    topologyPolicy: "single-numa-node"
+  nodeSelector:
+    node-role.kubernetes.io/worker-cnf: ""
+`
+)
From b7ed454ff1f31454f275e8b391fa37e5b589935c Mon Sep 17 00:00:00 2001
From: Bryan Cox
Date: Fri, 15 May 2026 08:56:28 -0400
Subject: [PATCH 07/11] test(e2ev2): add cluster creation binary for CI

Go binary that wraps the hypershift CLI to create HA clusters for
lifecycle tests. CI calls this from the create-selfmanaged-guests step
instead of inline shell scripts, keeping cluster creation logic
co-located with the tests.

Co-Authored-By: Claude Opus 4.6 --- Makefile | 4 + test/e2e/v2/cmd/create-guests/main.go | 243 ++++++++++++++++++++++++++ 2 files changed, 247 insertions(+) create mode 100644 test/e2e/v2/cmd/create-guests/main.go diff --git a/Makefile b/Makefile index c7342038963..e1023643e61 100644 --- a/Makefile +++ b/Makefile @@ -482,6 +482,10 @@ reqserving-e2e: e2ev2: $(GO_E2EV2_RECIPE) -o bin/test-e2e-v2 ./test/e2e/v2/tests +.PHONY: e2ev2-create-guests +e2ev2-create-guests: + $(GO_BUILD_RECIPE) -tags e2ev2 -o bin/create-guests ./test/e2e/v2/cmd/create-guests + .PHONY: backuprestore-e2e backuprestore-e2e: $(GO_BACKUPRESTORE_E2E_RECIPE) -o bin/test-backuprestore ./test/e2e/v2/tests diff --git a/test/e2e/v2/cmd/create-guests/main.go b/test/e2e/v2/cmd/create-guests/main.go new file mode 100644 index 00000000000..01109627eca --- /dev/null +++ b/test/e2e/v2/cmd/create-guests/main.go @@ -0,0 +1,243 @@ +//go:build e2ev2 + +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// create-guests creates HA HostedClusters for v2 e2e lifecycle tests. +// It shells out to the hypershift CLI to create an Azure cluster, waits +// for the HostedCluster to become Available, and writes the cluster name +// to SHARED_DIR for downstream CI steps. 
+package main
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"log"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strconv"
+	"time"
+
+	hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
+	"k8s.io/apimachinery/pkg/util/wait"
+	ctrl "sigs.k8s.io/controller-runtime"
+	crclient "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// scheme holds the API types the management-cluster client must be able to decode.
+var scheme = runtime.NewScheme()
+
+func init() {
+	utilruntime.Must(hyperv1.AddToScheme(scheme))
+}
+
+// main parses the command-line flags, validates the required ones and runs the
+// create-and-wait flow, exiting non-zero on any failure.
+func main() {
+	// Required flags.
+	name := flag.String("name", "", "Name of the HostedCluster to create (required)")
+	releaseImage := flag.String("release-image", "", "OCP release image (required)")
+	azureCreds := flag.String("azure-creds", "", "Path to Azure credentials JSON (required)")
+	pullSecret := flag.String("pull-secret", "", "Path to pull secret file (required)")
+	baseDomain := flag.String("base-domain", "", "DNS base domain (required)")
+
+	// Optional flags with defaults.
+	namespace := flag.String("namespace", "clusters", "Namespace for the HostedCluster")
+	location := flag.String("location", "centralus", "Azure region")
+	cpAvailabilityPolicy := flag.String("control-plane-availability-policy", "HighlyAvailable", "Control plane availability policy")
+	nodePoolReplicas := flag.Int("node-pool-replicas", 3, "Number of node pool replicas")
+	sharedDir := flag.String("shared-dir", os.Getenv("SHARED_DIR"), "SHARED_DIR to write the cluster name file to")
+	outputFile := flag.String("output-file", "cluster-name-upgrade", "Filename in SHARED_DIR to write the cluster name to")
+	hypershiftBinary := flag.String("hypershift-binary", "hypershift", "Path to the hypershift CLI binary")
+	waitTimeout := flag.Duration("wait-timeout", 45*time.Minute, "Timeout for waiting for the cluster to become Available")
+
+	// Azure-specific optional flags.
+	oidcIssuerURL := flag.String("oidc-issuer-url", "", "Azure OIDC issuer URL")
+	saTokenIssuerPrivateKeyPath := flag.String("sa-token-issuer-private-key-path", "", "Path to the SA token issuer private key")
+	workloadIdentitiesFile := flag.String("workload-identities-file", "", "Path to the workload identities JSON file")
+	dnsZoneRGName := flag.String("dns-zone-rg-name", "", "DNS zone resource group name")
+	assignSPRoles := flag.Bool("assign-service-principal-roles", true, "Assign service principal roles")
+	generateSSH := flag.Bool("generate-ssh", true, "Generate SSH key")
+	etcdStorageClass := flag.String("etcd-storage-class", "", "Etcd storage class")
+	externalDNSDomain := flag.String("external-dns-domain", "", "External DNS domain")
+
+	flag.Parse()
+
+	if *name == "" || *releaseImage == "" || *azureCreds == "" || *pullSecret == "" || *baseDomain == "" {
+		log.Fatal("--name, --release-image, --azure-creds, --pull-secret, and --base-domain are required")
+	}
+
+	// The overall deadline covers the CLI invocation plus the availability wait:
+	// the wait timeout and ten extra minutes.
+	ctx, cancel := context.WithTimeout(context.Background(), *waitTimeout+10*time.Minute)
+
+	err := run(ctx, runConfig{
+		name:                        *name,
+		releaseImage:                *releaseImage,
+		azureCreds:                  *azureCreds,
+		pullSecret:                  *pullSecret,
+		baseDomain:                  *baseDomain,
+		namespace:                   *namespace,
+		location:                    *location,
+		cpAvailabilityPolicy:        *cpAvailabilityPolicy,
+		nodePoolReplicas:            *nodePoolReplicas,
+		sharedDir:                   *sharedDir,
+		outputFile:                  *outputFile,
+		hypershiftBinary:            *hypershiftBinary,
+		waitTimeout:                 *waitTimeout,
+		oidcIssuerURL:               *oidcIssuerURL,
+		saTokenIssuerPrivateKeyPath: *saTokenIssuerPrivateKeyPath,
+		workloadIdentitiesFile:      *workloadIdentitiesFile,
+		dnsZoneRGName:               *dnsZoneRGName,
+		assignSPRoles:               *assignSPRoles,
+		generateSSH:                 *generateSSH,
+		etcdStorageClass:            *etcdStorageClass,
+		externalDNSDomain:           *externalDNSDomain,
+	})
+	// log.Fatalf exits the process without running deferred calls, so release the
+	// context explicitly before reporting a failure.
+	cancel()
+	if err != nil {
+		log.Fatalf("Error: %v", err)
+	}
+}
+
+// runConfig carries the parsed command-line options for a single cluster creation.
+type runConfig struct {
+	name                        string
+	releaseImage                string
+	azureCreds                  string
+	pullSecret                  string
+	baseDomain                  string
+	namespace                   string
+	location                    string
+	cpAvailabilityPolicy        string
+	nodePoolReplicas            int
+	sharedDir                   string
+	outputFile                  string
+	hypershiftBinary            string
+	waitTimeout                 time.Duration
+	oidcIssuerURL               string
+	saTokenIssuerPrivateKeyPath string
+	workloadIdentitiesFile      string
+	dnsZoneRGName               string
+	assignSPRoles               bool
+	generateSSH                 bool
+	etcdStorageClass            string
+	externalDNSDomain           string
+}
+
+// run creates the HostedCluster with the hypershift CLI, waits for it to become
+// Available and, when a shared directory is configured, writes the cluster name
+// to <sharedDir>/<outputFile>.
+func run(ctx context.Context, cfg runConfig) error {
+	args := buildCLIArgs(cfg)
+
+	log.Printf("Creating HostedCluster %s/%s with hypershift CLI", cfg.namespace, cfg.name)
+	log.Printf("Running: %s %v", cfg.hypershiftBinary, args)
+
+	cmd := exec.CommandContext(ctx, cfg.hypershiftBinary, args...)
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	if err := cmd.Run(); err != nil {
+		return fmt.Errorf("hypershift create cluster azure failed: %w", err)
+	}
+
+	log.Printf("Waiting for HostedCluster %s/%s to become Available (timeout: %s)", cfg.namespace, cfg.name, cfg.waitTimeout)
+	if err := waitForClusterAvailable(ctx, cfg.namespace, cfg.name, cfg.waitTimeout); err != nil {
+		return fmt.Errorf("waiting for HostedCluster to become Available: %w", err)
+	}
+
+	log.Printf("HostedCluster %s/%s is Available", cfg.namespace, cfg.name)
+
+	if cfg.sharedDir == "" {
+		// Make the missing hand-off file visible in the log instead of skipping it silently.
+		log.Printf("No shared directory configured; not writing cluster name file %q", cfg.outputFile)
+		return nil
+	}
+
+	outputPath := filepath.Join(cfg.sharedDir, cfg.outputFile)
+	if err := os.WriteFile(outputPath, []byte(cfg.name), 0600); err != nil {
+		return fmt.Errorf("writing cluster name to %s: %w", outputPath, err)
+	}
+	log.Printf("Wrote cluster name %q to %s", cfg.name, outputPath)
+
+	return nil
+}
+
+// buildCLIArgs translates cfg into the argument list for
+// "hypershift create cluster azure". Optional string flags are only emitted when
+// set, and the two boolean flags only when true.
+func buildCLIArgs(cfg runConfig) []string {
+	args := []string{
+		"create", "cluster", "azure",
+		"--name=" + cfg.name,
+		"--namespace=" + cfg.namespace,
+		"--release-image=" + cfg.releaseImage,
+		"--azure-creds=" + cfg.azureCreds,
+		"--pull-secret=" + cfg.pullSecret,
+		"--base-domain=" + cfg.baseDomain,
+		"--location=" + cfg.location,
+		"--control-plane-availability-policy=" + cfg.cpAvailabilityPolicy,
+		"--node-pool-replicas=" + strconv.Itoa(cfg.nodePoolReplicas),
+	}
+
+	if cfg.assignSPRoles {
+		args = append(args, "--assign-service-principal-roles=true")
+	}
+	if cfg.generateSSH {
+		args = append(args, "--generate-ssh")
+	}
+
+	// Append optional flags only when provided.
+	if cfg.oidcIssuerURL != "" {
+		args = append(args, "--oidc-issuer-url="+cfg.oidcIssuerURL)
+	}
+	if cfg.saTokenIssuerPrivateKeyPath != "" {
+		args = append(args, "--sa-token-issuer-private-key-path="+cfg.saTokenIssuerPrivateKeyPath)
+	}
+	if cfg.workloadIdentitiesFile != "" {
+		args = append(args, "--workload-identities-file="+cfg.workloadIdentitiesFile)
+	}
+	if cfg.dnsZoneRGName != "" {
+		args = append(args, "--dns-zone-rg-name="+cfg.dnsZoneRGName)
+	}
+	if cfg.etcdStorageClass != "" {
+		args = append(args, "--etcd-storage-class="+cfg.etcdStorageClass)
+	}
+	if cfg.externalDNSDomain != "" {
+		args = append(args, "--external-dns-domain="+cfg.externalDNSDomain)
+	}
+
+	return args
+}
+
+// waitForClusterAvailable polls the management cluster every 15 seconds until the
+// named HostedCluster reports the Available condition as True, the timeout
+// elapses, or ctx is done. Get failures are logged and retried; if the wait ends
+// unsuccessfully while the most recent Get was still failing, that error is
+// included in the returned error.
+func waitForClusterAvailable(ctx context.Context, namespace, name string, timeout time.Duration) error {
+	restConfig, err := ctrl.GetConfig()
+	if err != nil {
+		return fmt.Errorf("getting management cluster kubeconfig: %w", err)
+	}
+	mgmtClient, err := crclient.New(restConfig, crclient.Options{
+		Scheme: scheme,
+	})
+	if err != nil {
+		return fmt.Errorf("creating management cluster client: %w", err)
+	}
+
+	key := crclient.ObjectKey{Namespace: namespace, Name: name}
+	var lastGetErr error
+	err = wait.PollUntilContextTimeout(ctx, 15*time.Second, timeout, true, func(ctx context.Context) (bool, error) {
+		// Decode into a fresh object on every poll so fields omitted from a later
+		// response cannot linger from an earlier one.
+		hc := &hyperv1.HostedCluster{}
+		if err := mgmtClient.Get(ctx, key, hc); err != nil {
+			lastGetErr = err
+			log.Printf("Waiting for HostedCluster %s/%s: %v", namespace, name, err)
+			return false, nil
+		}
+		lastGetErr = nil
+		for _, cond := range hc.Status.Conditions {
+			if cond.Type == string(hyperv1.HostedClusterAvailable) && cond.Status == metav1.ConditionTrue {
+				return true, nil
+			}
+		}
+		desiredImage := ""
+		if hc.Status.Version != nil {
+			desiredImage = hc.Status.Version.Desired.Image
+		}
+		log.Printf("HostedCluster %s/%s not yet Available, current desired image: %s", namespace, name, desiredImage)
+		return false, nil
+	})
+	if err != nil && lastGetErr != nil {
+		return fmt.Errorf("%w (last Get error: %v)", err, lastGetErr)
+	}
+	return err
+}
From 1449dc612881c2d96d90a0652791550334af0502 Mon Sep 17 00:00:00 2001
From: Bryan Cox
Date: Fri, 15 May 2026 15:31:45 -0400
Subject: [PATCH 08/11] test(e2ev2): add Go binaries for CI guest cluster lifecycle

Replace the bash scripts in the openshift/release step registry with Go
binaries that live in the hypershift repo. This moves CI orchestration
closer to the code it tests, making it easier to maintain and evolve.

Four binaries under test/e2e/v2/cmd/:
- create-guests: creates 4 Azure HostedClusters in parallel (public,
  private, oauth-lb, HA upgrade) with watch-based waiting instead of
  poll loops, OperatorConfiguration patching, and JUnit XML output
- run-tests: dispatches test-e2e-v2 per cluster with Ginkgo label
  filters, running upgrade + etcd-chaos sequentially on the HA cluster
- destroy-guests: tears down all 4 clusters with best-effort semantics
- dump-guests: collects diagnostic artifacts, always exits 0

Cluster names are derived deterministically from PROW_JOB_ID via sha256,
matching the existing bash convention. The Dockerfile.e2e and Makefile
are updated to build and ship all four binaries.

Co-Authored-By: Claude Opus 4.6 --- Dockerfile.e2e | 6 +- Makefile | 12 + test/e2e/v2/cmd/create-guests/main.go | 586 ++++++++++++++++++------- test/e2e/v2/cmd/destroy-guests/main.go | 110 +++++ test/e2e/v2/cmd/dump-guests/main.go | 97 ++++ test/e2e/v2/cmd/run-tests/main.go | 177 ++++++++ test/e2e/v2/internal/test_context.go | 2 + test/e2e/v2/lifecycle/azure.go | 257 +++++++++++ test/e2e/v2/lifecycle/platform.go | 114 +++++ test/e2e/v2/tests/etcd_chaos_test.go | 4 +- 10 files changed, 1210 insertions(+), 155 deletions(-) create mode 100644 test/e2e/v2/cmd/destroy-guests/main.go create mode 100644 test/e2e/v2/cmd/dump-guests/main.go create mode 100644 test/e2e/v2/cmd/run-tests/main.go create mode 100644 test/e2e/v2/lifecycle/azure.go create mode 100644 test/e2e/v2/lifecycle/platform.go diff --git a/Dockerfile.e2e b/Dockerfile.e2e index 1b079d706d9..68086ed8bae 100644 --- a/Dockerfile.e2e +++ b/Dockerfile.e2e @@ -5,7 +5,7 @@ WORKDIR /hypershift COPY . . -RUN make e2e hypershift +RUN make e2e hypershift e2ev2-create-guests e2ev2-run-tests e2ev2-destroy-guests e2ev2-dump-guests # Reuse the same image as builder because we need go command in ci-test-e2e.sh # Multi-stage build lets us drop the source code and build cache from the final image @@ -20,6 +20,10 @@ COPY --from=builder /hypershift/bin/test-backuprestore /hypershift/bin/test-back COPY --from=builder /hypershift/bin/test-setup /hypershift/bin/test-setup COPY --from=builder /hypershift/bin/test-reqserving /hypershift/bin/test-reqserving COPY --from=builder /hypershift/bin/hypershift /hypershift/bin/hypershift +COPY --from=builder /hypershift/bin/create-guests /hypershift/bin/create-guests +COPY --from=builder /hypershift/bin/run-tests /hypershift/bin/run-tests +COPY --from=builder /hypershift/bin/destroy-guests /hypershift/bin/destroy-guests +COPY --from=builder /hypershift/bin/dump-guests /hypershift/bin/dump-guests COPY --from=builder /hypershift/hack/ci-test-e2e.sh /hypershift/hack/ci-test-e2e.sh COPY 
--from=builder /hypershift/hack/run-reqserving-e2e.sh /hypershift/hack/run-reqserving-e2e.sh diff --git a/Makefile b/Makefile index e1023643e61..4dce4a252e2 100644 --- a/Makefile +++ b/Makefile @@ -486,6 +486,18 @@ e2ev2: e2ev2-create-guests: $(GO_BUILD_RECIPE) -tags e2ev2 -o bin/create-guests ./test/e2e/v2/cmd/create-guests +.PHONY: e2ev2-run-tests +e2ev2-run-tests: + $(GO_BUILD_RECIPE) -tags e2ev2 -o bin/run-tests ./test/e2e/v2/cmd/run-tests + +.PHONY: e2ev2-destroy-guests +e2ev2-destroy-guests: + $(GO_BUILD_RECIPE) -tags e2ev2 -o bin/destroy-guests ./test/e2e/v2/cmd/destroy-guests + +.PHONY: e2ev2-dump-guests +e2ev2-dump-guests: + $(GO_BUILD_RECIPE) -tags e2ev2 -o bin/dump-guests ./test/e2e/v2/cmd/dump-guests + .PHONY: backuprestore-e2e backuprestore-e2e: $(GO_BACKUPRESTORE_E2E_RECIPE) -o bin/test-backuprestore ./test/e2e/v2/tests diff --git a/test/e2e/v2/cmd/create-guests/main.go b/test/e2e/v2/cmd/create-guests/main.go index 01109627eca..129110a3c8c 100644 --- a/test/e2e/v2/cmd/create-guests/main.go +++ b/test/e2e/v2/cmd/create-guests/main.go @@ -14,29 +14,37 @@ See the License for the specific language governing permissions and limitations under the License. */ -// create-guests creates HA HostedClusters for v2 e2e lifecycle tests. -// It shells out to the hypershift CLI to create an Azure cluster, waits -// for the HostedCluster to become Available, and writes the cluster name -// to SHARED_DIR for downstream CI steps. +// create-guests creates HostedClusters in parallel for v2 e2e +// lifecycle tests. The number and configuration of clusters is +// determined by the platform (HYPERSHIFT_PLATFORM env var). +// It shells out to the hypershift CLI for cluster creation, runs +// platform-specific post-create hooks, then uses controller-runtime +// watches to wait for Available condition and version rollout +// completion. Cluster names are derived deterministically from +// PROW_JOB_ID and written to SHARED_DIR for downstream CI steps. 
+// JUnit XML is emitted to ARTIFACT_DIR on rollout failure. package main import ( "context" - "flag" "fmt" "log" "os" "os/exec" "path/filepath" "strconv" + "strings" + "sync" "time" + configv1 "github.com/openshift/api/config/v1" hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" + "github.com/openshift/hypershift/test/e2e/v2/lifecycle" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/apimachinery/pkg/watch" ctrl "sigs.k8s.io/controller-runtime" crclient "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -47,197 +55,471 @@ func init() { utilruntime.Must(hyperv1.AddToScheme(scheme)) } +const defaultNamespace = "clusters" + +// envConfig captures the common environment configuration. +type envConfig struct { + prowJobID string + sharedDir string + artifactDir string + releaseImage string + n1Image string + + baseDomain string + nodeCount int + namespace string + externalDNS string + etcdSC string + pullSecret string + + platform lifecycle.PlatformConfig + hypershiftBinary string + waitTimeout time.Duration +} + func main() { - // Required flags. - name := flag.String("name", "", "Name of the HostedCluster to create (required)") - releaseImage := flag.String("release-image", "", "OCP release image (required)") - azureCreds := flag.String("azure-creds", "", "Path to Azure credentials JSON (required)") - pullSecret := flag.String("pull-secret", "", "Path to pull secret file (required)") - baseDomain := flag.String("base-domain", "", "DNS base domain (required)") - - // Optional flags with defaults. 
- namespace := flag.String("namespace", "clusters", "Namespace for the HostedCluster") - location := flag.String("location", "centralus", "Azure region") - cpAvailabilityPolicy := flag.String("control-plane-availability-policy", "HighlyAvailable", "Control plane availability policy") - nodePoolReplicas := flag.Int("node-pool-replicas", 3, "Number of node pool replicas") - sharedDir := flag.String("shared-dir", os.Getenv("SHARED_DIR"), "SHARED_DIR to write the cluster name file to") - outputFile := flag.String("output-file", "cluster-name-upgrade", "Filename in SHARED_DIR to write the cluster name to") - hypershiftBinary := flag.String("hypershift-binary", "hypershift", "Path to the hypershift CLI binary") - waitTimeout := flag.Duration("wait-timeout", 45*time.Minute, "Timeout for waiting for the cluster to become Available") - - // Azure-specific optional flags. - oidcIssuerURL := flag.String("oidc-issuer-url", "", "Azure OIDC issuer URL") - saTokenIssuerPrivateKeyPath := flag.String("sa-token-issuer-private-key-path", "", "Path to the SA token issuer private key") - workloadIdentitiesFile := flag.String("workload-identities-file", "", "Path to the workload identities JSON file") - dnsZoneRGName := flag.String("dns-zone-rg-name", "", "DNS zone resource group name") - assignSPRoles := flag.Bool("assign-service-principal-roles", true, "Assign service principal roles") - generateSSH := flag.Bool("generate-ssh", true, "Generate SSH key") - etcdStorageClass := flag.String("etcd-storage-class", "", "Etcd storage class") - externalDNSDomain := flag.String("external-dns-domain", "", "External DNS domain") - - flag.Parse() - - if *name == "" || *releaseImage == "" || *azureCreds == "" || *pullSecret == "" || *baseDomain == "" { - log.Fatal("--name, --release-image, --azure-creds, --pull-secret, and --base-domain are required") - } - - ctx, cancel := context.WithTimeout(context.Background(), *waitTimeout+10*time.Minute) + cfg := loadEnvConfig() + + ctx, cancel := 
context.WithTimeout(context.Background(), cfg.waitTimeout+10*time.Minute) defer cancel() - if err := run(ctx, runConfig{ - name: *name, - releaseImage: *releaseImage, - azureCreds: *azureCreds, - pullSecret: *pullSecret, - baseDomain: *baseDomain, - namespace: *namespace, - location: *location, - cpAvailabilityPolicy: *cpAvailabilityPolicy, - nodePoolReplicas: *nodePoolReplicas, - sharedDir: *sharedDir, - outputFile: *outputFile, - hypershiftBinary: *hypershiftBinary, - waitTimeout: *waitTimeout, - oidcIssuerURL: *oidcIssuerURL, - saTokenIssuerPrivateKeyPath: *saTokenIssuerPrivateKeyPath, - workloadIdentitiesFile: *workloadIdentitiesFile, - dnsZoneRGName: *dnsZoneRGName, - assignSPRoles: *assignSPRoles, - generateSSH: *generateSSH, - etcdStorageClass: *etcdStorageClass, - externalDNSDomain: *externalDNSDomain, - }); err != nil { + if err := run(ctx, cfg); err != nil { log.Fatalf("Error: %v", err) } } -type runConfig struct { - name string - releaseImage string - azureCreds string - pullSecret string - baseDomain string - namespace string - location string - cpAvailabilityPolicy string - nodePoolReplicas int - sharedDir string - outputFile string - hypershiftBinary string - waitTimeout time.Duration - oidcIssuerURL string - saTokenIssuerPrivateKeyPath string - workloadIdentitiesFile string - dnsZoneRGName string - assignSPRoles bool - generateSSH bool - etcdStorageClass string - externalDNSDomain string +func loadEnvConfig() envConfig { + sharedDir := mustGetenv("SHARED_DIR") + + platform, err := lifecycle.NewPlatformConfig(os.Getenv("HYPERSHIFT_PLATFORM"), sharedDir) + if err != nil { + log.Fatalf("Failed to initialize platform config: %v", err) + } + + cfg := envConfig{ + prowJobID: mustGetenv("PROW_JOB_ID"), + sharedDir: sharedDir, + artifactDir: mustGetenv("ARTIFACT_DIR"), + releaseImage: mustGetenv("RELEASE_IMAGE_LATEST"), + n1Image: os.Getenv("OCP_IMAGE_N1"), + + baseDomain: envOrDefault("HYPERSHIFT_BASE_DOMAIN", platform.DefaultBaseDomain()), + nodeCount: 
envOrDefaultInt("HYPERSHIFT_NODE_COUNT", 3), + namespace: envOrDefault("HYPERSHIFT_NAMESPACE", defaultNamespace), + externalDNS: os.Getenv("HYPERSHIFT_EXTERNAL_DNS_DOMAIN"), + etcdSC: os.Getenv("HYPERSHIFT_ETCD_STORAGE_CLASS"), + pullSecret: envOrDefault("PULL_SECRET", "/etc/ci-pull-credentials/.dockerconfigjson"), + + platform: platform, + hypershiftBinary: envOrDefault("HYPERSHIFT_BINARY", "hypershift"), + waitTimeout: 45 * time.Minute, + } + + if cfg.n1Image == "" { + cfg.n1Image = cfg.releaseImage + } + + return cfg } -func run(ctx context.Context, cfg runConfig) error { - args := buildCLIArgs(cfg) +func run(ctx context.Context, cfg envConfig) error { + specs := cfg.platform.ClusterSpecs(cfg.releaseImage, cfg.n1Image) + + // Derive cluster names and build the name map. + named := make([]namedSpec, len(specs)) + clusterNames := make(map[string]string) // outputFile -> name + for i, spec := range specs { + name := lifecycle.DeriveClusterName(cfg.prowJobID, spec.Suffix) + named[i] = namedSpec{ClusterSpec: spec, name: name} + clusterNames[spec.OutputFile] = name + } - log.Printf("Creating HostedCluster %s/%s with hypershift CLI", cfg.namespace, cfg.name) - log.Printf("Running: %s %v", cfg.hypershiftBinary, args) + // Phase 1: Create all clusters in parallel. + log.Printf("Phase 1: Creating %d clusters in parallel", len(named)) + createErrors := createClustersParallel(ctx, cfg, named) + for _, ns := range named { + if err := createErrors[ns.Variant]; err != nil { + log.Printf("ERROR: cluster %s (%s) creation failed: %v", ns.name, ns.Variant, err) + } else { + log.Printf("Cluster %s (%s) creation command completed", ns.name, ns.Variant) + } + } + for _, err := range createErrors { + if err != nil { + return fmt.Errorf("one or more cluster create commands failed") + } + } - cmd := exec.CommandContext(ctx, cfg.hypershiftBinary, args...) 
- cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - if err := cmd.Run(); err != nil { - return fmt.Errorf("hypershift create cluster azure failed: %w", err) + // Phase 2: Platform-specific post-create hooks. + log.Println("Phase 2: Running platform post-create hooks") + mgmtClient, err := newMgmtClient() + if err != nil { + return fmt.Errorf("creating management cluster client: %w", err) + } + if err := cfg.platform.PostCreate(ctx, mgmtClient, cfg.namespace, clusterNames); err != nil { + return fmt.Errorf("platform post-create hook: %w", err) } - log.Printf("Waiting for HostedCluster %s/%s to become Available (timeout: %s)", cfg.namespace, cfg.name, cfg.waitTimeout) - if err := waitForClusterAvailable(ctx, cfg.namespace, cfg.name, cfg.waitTimeout); err != nil { - return fmt.Errorf("waiting for HostedCluster to become Available: %w", err) + // Phase 3: Watch for Available condition on all clusters. + log.Println("Phase 3: Waiting for all clusters to become Available") + availableErrors := waitForClustersAvailable(ctx, mgmtClient, cfg.namespace, named, 30*time.Minute) + for _, ns := range named { + if err := availableErrors[ns.Variant]; err != nil { + log.Printf("ERROR: cluster %s (%s) did not become Available: %v", ns.name, ns.Variant, err) + } else { + log.Printf("Cluster %s (%s) is Available", ns.name, ns.Variant) + } + } + for _, err := range availableErrors { + if err != nil { + return fmt.Errorf("one or more clusters did not become Available") + } } - log.Printf("HostedCluster %s/%s is Available", cfg.namespace, cfg.name) + // Phase 4: Watch for version rollout completion on all clusters. 
+ log.Println("Phase 4: Waiting for version rollout completion on all clusters") + rolloutErrors := waitForVersionRollout(ctx, mgmtClient, cfg, named) + anyRolloutFailed := false + for _, ns := range named { + if err := rolloutErrors[ns.Variant]; err != nil { + log.Printf("ERROR: version rollout failed for %s (%s): %v", ns.name, ns.Variant, err) + emitJUnitFailure(ctx, mgmtClient, cfg, ns.name, ns.Variant) + anyRolloutFailed = true + } else { + log.Printf("Version rollout completed for %s (%s)", ns.name, ns.Variant) + emitJUnitSuccess(cfg, ns.name, ns.Variant) + } + } - if cfg.sharedDir != "" { - outputPath := filepath.Join(cfg.sharedDir, cfg.outputFile) - if err := os.WriteFile(outputPath, []byte(cfg.name), 0600); err != nil { + // Phase 5: Write cluster names to SHARED_DIR. + log.Println("Phase 5: Writing cluster names to SHARED_DIR") + for _, ns := range named { + outputPath := filepath.Join(cfg.sharedDir, ns.OutputFile) + if err := os.WriteFile(outputPath, []byte(ns.name), 0600); err != nil { return fmt.Errorf("writing cluster name to %s: %w", outputPath, err) } - log.Printf("Wrote cluster name %q to %s", cfg.name, outputPath) + log.Printf("Wrote cluster name %q to %s", ns.name, outputPath) + } + + if anyRolloutFailed { + return fmt.Errorf("one or more cluster version rollouts failed") } + log.Println("All clusters are ready") return nil } -func buildCLIArgs(cfg runConfig) []string { +// buildCreateArgs returns CLI arguments for creating a cluster. 
+func buildCreateArgs(cfg envConfig, name string, spec lifecycle.ClusterSpec) []string { + releaseImage := cfg.releaseImage + if spec.ReleaseImage != "" { + releaseImage = spec.ReleaseImage + } + args := []string{ - "create", "cluster", "azure", - "--name=" + cfg.name, - "--namespace=" + cfg.namespace, - "--release-image=" + cfg.releaseImage, - "--azure-creds=" + cfg.azureCreds, - "--pull-secret=" + cfg.pullSecret, + "create", "cluster", cfg.platform.Name(), + "--name=" + name, + "--node-pool-replicas=" + strconv.Itoa(cfg.nodeCount), "--base-domain=" + cfg.baseDomain, - "--location=" + cfg.location, - "--control-plane-availability-policy=" + cfg.cpAvailabilityPolicy, - "--node-pool-replicas=" + strconv.Itoa(cfg.nodePoolReplicas), + "--pull-secret=" + cfg.pullSecret, + "--release-image=" + releaseImage, + "--generate-ssh", } - if cfg.assignSPRoles { - args = append(args, "--assign-service-principal-roles=true") + if cfg.externalDNS != "" { + args = append(args, "--external-dns-domain="+cfg.externalDNS) } - if cfg.generateSSH { - args = append(args, "--generate-ssh") + if cfg.etcdSC != "" { + args = append(args, "--etcd-storage-class="+cfg.etcdSC) } - // Append optional flags only when provided. - if cfg.oidcIssuerURL != "" { - args = append(args, "--oidc-issuer-url="+cfg.oidcIssuerURL) + args = append(args, cfg.platform.CreateArgs()...) + args = append(args, spec.ExtraArgs...) + + return args +} + +type namedSpec struct { + lifecycle.ClusterSpec + name string +} + +func createClustersParallel(ctx context.Context, cfg envConfig, specs []namedSpec) map[string]error { + results := make(map[string]error) + var mu sync.Mutex + var wg sync.WaitGroup + + for _, ns := range specs { + wg.Add(1) + go func() { + defer wg.Done() + args := buildCreateArgs(cfg, ns.name, ns.ClusterSpec) + log.Printf("Creating %s cluster %s", ns.Variant, ns.name) + log.Printf("Running: %s %v", cfg.hypershiftBinary, args) + + cmd := exec.CommandContext(ctx, cfg.hypershiftBinary, args...) 
+ cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + err := cmd.Run() + + mu.Lock() + results[ns.Variant] = err + mu.Unlock() + }() } - if cfg.saTokenIssuerPrivateKeyPath != "" { - args = append(args, "--sa-token-issuer-private-key-path="+cfg.saTokenIssuerPrivateKeyPath) + wg.Wait() + return results +} + +func newMgmtClient() (crclient.WithWatch, error) { + restConfig, err := ctrl.GetConfig() + if err != nil { + return nil, fmt.Errorf("getting management cluster kubeconfig: %w", err) } - if cfg.workloadIdentitiesFile != "" { - args = append(args, "--workload-identities-file="+cfg.workloadIdentitiesFile) + return crclient.NewWithWatch(restConfig, crclient.Options{Scheme: scheme}) +} + +func waitForClustersAvailable(ctx context.Context, cl crclient.WithWatch, namespace string, specs []namedSpec, timeout time.Duration) map[string]error { + results := make(map[string]error) + var mu sync.Mutex + var wg sync.WaitGroup + + for _, ns := range specs { + wg.Add(1) + go func() { + defer wg.Done() + watchCtx, watchCancel := context.WithTimeout(ctx, timeout) + defer watchCancel() + + err := watchForCondition(watchCtx, cl, namespace, ns.name, func(hc *hyperv1.HostedCluster) bool { + for _, cond := range hc.Status.Conditions { + if cond.Type == string(hyperv1.HostedClusterAvailable) && cond.Status == metav1.ConditionTrue { + return true + } + } + return false + }) + + mu.Lock() + results[ns.Variant] = err + mu.Unlock() + }() } - if cfg.dnsZoneRGName != "" { - args = append(args, "--dns-zone-rg-name="+cfg.dnsZoneRGName) + wg.Wait() + return results +} + +func waitForVersionRollout(ctx context.Context, cl crclient.WithWatch, cfg envConfig, specs []namedSpec) map[string]error { + results := make(map[string]error) + var mu sync.Mutex + var wg sync.WaitGroup + + for _, ns := range specs { + wg.Add(1) + go func() { + defer wg.Done() + watchCtx, watchCancel := context.WithTimeout(ctx, cfg.waitTimeout) + defer watchCancel() + + err := watchForCondition(watchCtx, cl, cfg.namespace, 
ns.name, func(hc *hyperv1.HostedCluster) bool { + if hc.Status.Version == nil || len(hc.Status.Version.History) == 0 { + return false + } + for _, entry := range hc.Status.Version.History { + if entry.State != "" && entry.State != configv1.CompletedUpdate { + return false + } + if entry.State == "" { + return false + } + } + return true + }) + + mu.Lock() + results[ns.Variant] = err + mu.Unlock() + }() } - if cfg.etcdStorageClass != "" { - args = append(args, "--etcd-storage-class="+cfg.etcdStorageClass) + wg.Wait() + return results +} + +func watchForCondition(ctx context.Context, cl crclient.WithWatch, namespace, name string, predicate func(*hyperv1.HostedCluster) bool) error { + hc := &hyperv1.HostedCluster{} + if err := cl.Get(ctx, crclient.ObjectKey{Namespace: namespace, Name: name}, hc); err == nil { + if predicate(hc) { + return nil + } } - if cfg.externalDNSDomain != "" { - args = append(args, "--external-dns-domain="+cfg.externalDNSDomain) + + hcList := &hyperv1.HostedClusterList{} + watcher, err := cl.Watch(ctx, hcList, + crclient.InNamespace(namespace), + crclient.MatchingFields{"metadata.name": name}, + ) + if err != nil { + return fmt.Errorf("starting watch for %s/%s: %w", namespace, name, err) } + defer watcher.Stop() - return args + if err := cl.Get(ctx, crclient.ObjectKey{Namespace: namespace, Name: name}, hc); err == nil { + if predicate(hc) { + return nil + } + } + + for { + select { + case <-ctx.Done(): + return fmt.Errorf("timed out waiting for %s/%s: %w", namespace, name, ctx.Err()) + case event, ok := <-watcher.ResultChan(): + if !ok { + return fmt.Errorf("watch channel closed for %s/%s", namespace, name) + } + if event.Type == watch.Error { + return fmt.Errorf("watch error for %s/%s: %v", namespace, name, event.Object) + } + if event.Type != watch.Added && event.Type != watch.Modified { + continue + } + watchedHC, ok := event.Object.(*hyperv1.HostedCluster) + if !ok { + continue + } + logClusterProgress(watchedHC) + if predicate(watchedHC) { 
+ return nil + } + } + } } -func waitForClusterAvailable(ctx context.Context, namespace, name string, timeout time.Duration) error { - restConfig, err := ctrl.GetConfig() - if err != nil { - return fmt.Errorf("getting management cluster kubeconfig: %w", err) +func logClusterProgress(hc *hyperv1.HostedCluster) { + available := "Unknown" + for _, cond := range hc.Status.Conditions { + if cond.Type == string(hyperv1.HostedClusterAvailable) { + available = string(cond.Status) + break + } } - mgmtClient, err := crclient.New(restConfig, crclient.Options{ - Scheme: scheme, - }) - if err != nil { - return fmt.Errorf("creating management cluster client: %w", err) + + versionState := "" + if hc.Status.Version != nil && len(hc.Status.Version.History) > 0 { + versionState = string(hc.Status.Version.History[0].State) } + log.Printf("Cluster %s/%s: Available=%s, VersionState=%s", + hc.Namespace, hc.Name, available, versionState) +} + +func emitJUnitFailure(ctx context.Context, cl crclient.WithWatch, cfg envConfig, name, variant string) { hc := &hyperv1.HostedCluster{} - return wait.PollUntilContextTimeout(ctx, 15*time.Second, timeout, true, func(ctx context.Context) (bool, error) { - if err := mgmtClient.Get(ctx, crclient.ObjectKey{Namespace: namespace, Name: name}, hc); err != nil { - log.Printf("Waiting for HostedCluster %s/%s: %v", namespace, name, err) - return false, nil + _ = cl.Get(ctx, crclient.ObjectKey{Namespace: cfg.namespace, Name: name}, hc) + + degradedMsg := conditionMessage(hc, "Degraded") + cvSucceedingMsg := conditionMessage(hc, string(hyperv1.ClusterVersionSucceeding)) + diagnostics := collectDiagnostics(ctx, cl, cfg.namespace, name, hc) + + junitXML := fmt.Sprintf(` + + + + + + + +`, name, name, variant, degradedMsg, cvSucceedingMsg, diagnostics) + + junitPath := filepath.Join(cfg.artifactDir, fmt.Sprintf("junit_hosted_cluster_%s.xml", name)) + if err := os.WriteFile(junitPath, []byte(junitXML), 0600); err != nil { + log.Printf("WARNING: failed to write JUnit 
XML to %s: %v", junitPath, err) + } else { + log.Printf("Wrote JUnit failure XML to %s", junitPath) + } +} + +func emitJUnitSuccess(cfg envConfig, name, variant string) { + junitXML := fmt.Sprintf(` + + + + + + + +`, name, name, variant) + + junitPath := filepath.Join(cfg.artifactDir, fmt.Sprintf("junit_hosted_cluster_%s.xml", name)) + if err := os.WriteFile(junitPath, []byte(junitXML), 0600); err != nil { + log.Printf("WARNING: failed to write JUnit XML to %s: %v", junitPath, err) + } +} + +func conditionMessage(hc *hyperv1.HostedCluster, condType string) string { + if hc == nil { + return "" + } + for _, cond := range hc.Status.Conditions { + if cond.Type == condType { + return cond.Message } + } + return "" +} + +func collectDiagnostics(ctx context.Context, cl crclient.WithWatch, namespace, name string, hc *hyperv1.HostedCluster) string { + var sb strings.Builder + + if hc != nil && len(hc.Status.Conditions) > 0 { + sb.WriteString("HostedCluster conditions:\n") for _, cond := range hc.Status.Conditions { - if cond.Type == string(hyperv1.HostedClusterAvailable) && cond.Status == metav1.ConditionTrue { - return true, nil - } + fmt.Fprintf(&sb, " %s\t%s\t%s\t%s\n", cond.Type, cond.Status, cond.Reason, cond.Message) } - desiredImage := "" - if hc.Status.Version != nil { - desiredImage = hc.Status.Version.Desired.Image + } + + np := &hyperv1.NodePool{} + if err := cl.Get(ctx, crclient.ObjectKey{Namespace: namespace, Name: name}, np); err == nil { + sb.WriteString("NodePool conditions:\n") + for _, cond := range np.Status.Conditions { + fmt.Fprintf(&sb, " %s\t%s\t%s\t%s\n", cond.Type, cond.Status, cond.Reason, cond.Message) } - log.Printf("HostedCluster %s/%s not yet Available, current desired image: %s", namespace, name, desiredImage) - return false, nil - }) + } + + return sb.String() +} + +func mustGetenv(key string) string { + val := os.Getenv(key) + if val == "" { + log.Fatalf("%s environment variable is required", key) + } + return val +} + +func 
envOrDefault(key, defaultVal string) string { + if val := os.Getenv(key); val != "" { + return val + } + return defaultVal +} + +func envOrDefaultInt(key string, defaultVal int) int { + val := os.Getenv(key) + if val == "" { + return defaultVal + } + n, err := strconv.Atoi(val) + if err != nil { + log.Printf("WARNING: invalid integer for %s=%q, using default %d", key, val, defaultVal) + return defaultVal + } + return n } diff --git a/test/e2e/v2/cmd/destroy-guests/main.go b/test/e2e/v2/cmd/destroy-guests/main.go new file mode 100644 index 00000000000..021d1cdfe84 --- /dev/null +++ b/test/e2e/v2/cmd/destroy-guests/main.go @@ -0,0 +1,110 @@ +//go:build e2ev2 + +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// destroy-guests destroys all HostedClusters created by the v2 e2e +// lifecycle tests. Cluster names are re-derived from PROW_JOB_ID +// using the same sha256 hash logic as the create step. All clusters +// are destroyed in parallel with best-effort semantics. +// Platform selection is controlled by the HYPERSHIFT_PLATFORM +// environment variable (default: "azure"). 
+package main + +import ( + "fmt" + "log" + "os" + "os/exec" + "sync" + + "github.com/openshift/hypershift/test/e2e/v2/lifecycle" +) + +const clusterGracePeriod = "40m" + +func main() { + prowJobID := os.Getenv("PROW_JOB_ID") + if prowJobID == "" { + log.Fatal("PROW_JOB_ID is required") + } + + sharedDir := os.Getenv("SHARED_DIR") + + platform, err := lifecycle.NewPlatformConfig(os.Getenv("HYPERSHIFT_PLATFORM"), sharedDir) + if err != nil { + log.Fatalf("Failed to initialize platform config: %v", err) + } + + hypershiftBin := os.Getenv("HYPERSHIFT_BINARY") + if hypershiftBin == "" { + hypershiftBin = "hypershift" + } + + clusterNames := lifecycle.DeriveClusterNames(prowJobID, platform.Suffixes()) + + log.Printf("Destroying %d clusters derived from PROW_JOB_ID=%s", len(clusterNames), prowJobID) + for _, name := range clusterNames { + log.Printf(" cluster: %s", name) + } + + var ( + mu sync.Mutex + failed bool + wg sync.WaitGroup + ) + + for _, name := range clusterNames { + wg.Add(1) + go func(clusterName string) { + defer wg.Done() + if err := destroyCluster(hypershiftBin, clusterName, platform); err != nil { + log.Printf("WARNING: Failed to destroy cluster %s: %v", clusterName, err) + mu.Lock() + failed = true + mu.Unlock() + } + }(name) + } + + wg.Wait() + + if failed { + log.Fatal("One or more clusters failed to destroy") + } + log.Printf("All clusters destroyed successfully") +} + +func destroyCluster(hypershiftBin, name string, platform lifecycle.PlatformConfig) error { + log.Printf("Destroying cluster: %s", name) + + args := []string{ + "destroy", "cluster", platform.Name(), + "--name=" + name, + "--cluster-grace-period=" + clusterGracePeriod, + } + args = append(args, platform.DestroyArgs()...) + + log.Printf("Running: %s %v", hypershiftBin, args) + + cmd := exec.Command(hypershiftBin, args...) 
+ cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + return fmt.Errorf("hypershift destroy cluster %s failed for %s: %w", platform.Name(), name, err) + } + + log.Printf("Finished destroying cluster: %s", name) + return nil +} diff --git a/test/e2e/v2/cmd/dump-guests/main.go b/test/e2e/v2/cmd/dump-guests/main.go new file mode 100644 index 00000000000..c1210916090 --- /dev/null +++ b/test/e2e/v2/cmd/dump-guests/main.go @@ -0,0 +1,97 @@ +//go:build e2ev2 + +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// dump-guests collects diagnostic artifacts from all v2 e2e +// HostedClusters in parallel. It shells out to the hypershift CLI +// for each cluster and exits 0 even if individual dumps fail, so they +// never block teardown; missing PROW_JOB_ID/ARTIFACT_DIR is still fatal. +// Platform selection is controlled by the HYPERSHIFT_PLATFORM +// environment variable (default: "azure"). 
+package main + +import ( + "flag" + "log" + "os" + "os/exec" + "path/filepath" + "sync" + + "github.com/openshift/hypershift/test/e2e/v2/lifecycle" +) + +func main() { + hypershiftBinary := flag.String("hypershift-binary", "hypershift", "Path to the hypershift CLI binary") + flag.Parse() + + prowJobID := os.Getenv("PROW_JOB_ID") + if prowJobID == "" { + log.Fatal("PROW_JOB_ID environment variable is required") + } + artifactDir := os.Getenv("ARTIFACT_DIR") + if artifactDir == "" { + log.Fatal("ARTIFACT_DIR environment variable is required") + } + + sharedDir := os.Getenv("SHARED_DIR") + platform, err := lifecycle.NewPlatformConfig(os.Getenv("HYPERSHIFT_PLATFORM"), sharedDir) + if err != nil { + log.Fatalf("Failed to initialize platform config: %v", err) + } + + clusterNames := lifecycle.DeriveClusterNames(prowJobID, platform.Suffixes()) + log.Printf("Dumping %d clusters derived from PROW_JOB_ID=%s", len(clusterNames), prowJobID) + + var wg sync.WaitGroup + for _, name := range clusterNames { + wg.Add(1) + go func() { + defer wg.Done() + dumpCluster(*hypershiftBinary, artifactDir, name) + }() + } + wg.Wait() + + log.Println("All cluster dumps complete") +} + +func dumpCluster(hypershiftBinary, artifactDir, clusterName string) { + dumpDir := filepath.Join(artifactDir, clusterName) + if err := os.MkdirAll(dumpDir, 0755); err != nil { + log.Printf("WARNING: Failed to create artifact directory %s: %v", dumpDir, err) + return + } + + args := []string{ + "dump", "cluster", + "--artifact-dir=" + dumpDir, + "--dump-guest-cluster=true", + "--name=" + clusterName, + } + + log.Printf("Dumping cluster %s -> %s", clusterName, dumpDir) + log.Printf("Running: %s %v", hypershiftBinary, args) + + cmd := exec.Command(hypershiftBinary, args...) 
+ cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + log.Printf("WARNING: Failed to dump cluster %s: %v", clusterName, err) + return + } + + log.Printf("Successfully dumped cluster %s", clusterName) +} diff --git a/test/e2e/v2/cmd/run-tests/main.go b/test/e2e/v2/cmd/run-tests/main.go new file mode 100644 index 00000000000..260b82a4359 --- /dev/null +++ b/test/e2e/v2/cmd/run-tests/main.go @@ -0,0 +1,177 @@ +//go:build e2ev2 + +// run-tests dispatches the v2 e2e test suites in parallel, one per +// pre-created hosted cluster. Test groups and label filters are +// determined by the platform (HYPERSHIFT_PLATFORM env var). +package main + +import ( + "fmt" + "log" + "os" + "os/exec" + "path/filepath" + "strings" + "sync" + + "github.com/openshift/hypershift/test/e2e/v2/lifecycle" +) + +const ( + testBinary = "bin/test-e2e-v2" + clusterNS = "clusters" + defaultVerbose = "false" + defaultGinkgoTimeout = "3h" +) + +// testResult captures the outcome of a single test group execution. +type testResult struct { + name string + err error +} + +func main() { + log.SetFlags(log.LstdFlags) + + sharedDir := requireEnv("SHARED_DIR") + artifactDir := requireEnv("ARTIFACT_DIR") + releaseImage := os.Getenv("RELEASE_IMAGE_LATEST") + + eventuallyVerbose := os.Getenv("EVENTUALLY_VERBOSE") + if eventuallyVerbose == "" { + eventuallyVerbose = defaultVerbose + } + os.Setenv("EVENTUALLY_VERBOSE", eventuallyVerbose) + + platform, err := lifecycle.NewPlatformConfig(os.Getenv("HYPERSHIFT_PLATFORM"), sharedDir) + if err != nil { + log.Fatalf("Failed to initialize platform config: %v", err) + } + + // Let the platform set up any env vars it needs for tests. + platform.SetupTestEnv(sharedDir) + + matrix := platform.TestMatrix(releaseImage) + + var ( + mu sync.Mutex + results []testResult + wg sync.WaitGroup + ) + + // Launch parallel test groups. 
+ for _, g := range matrix.Parallel { + g := g + wg.Add(1) + go func() { + defer wg.Done() + clusterName := readClusterName(sharedDir, g.ClusterFile) + log.Printf("Running %s tests against %s...", g.Name, clusterName) + err := runTestBinary(clusterName, g.LabelFilter, g.Skip, + filepath.Join(artifactDir, g.JUnitFile), g.ExtraEnv) + mu.Lock() + results = append(results, testResult{name: g.Name, err: err}) + mu.Unlock() + if err != nil { + log.Printf("%s tests FAILED: %v", g.Name, err) + } else { + log.Printf("%s tests PASSED", g.Name) + } + }() + } + + // Launch sequential groups (each group runs in its own goroutine, + // but steps within a group run one after another). + for _, sg := range matrix.Sequential { + sg := sg + wg.Add(1) + go func() { + defer wg.Done() + for i, step := range sg.Steps { + clusterName := readClusterName(sharedDir, step.ClusterFile) + log.Printf("Running %s tests against %s...", step.Name, clusterName) + err := runTestBinary(clusterName, step.LabelFilter, step.Skip, + filepath.Join(artifactDir, step.JUnitFile), step.ExtraEnv) + mu.Lock() + results = append(results, testResult{name: step.Name, err: err}) + mu.Unlock() + if err != nil { + log.Printf("%s tests FAILED: %v — skipping remaining steps in %s", step.Name, err, sg.Name) + return + } + log.Printf("%s tests PASSED", step.Name) + if i < len(sg.Steps)-1 { + log.Printf("Continuing to next step in %s...", sg.Name) + } + } + }() + } + + log.Println("Waiting for all test suites to complete...") + wg.Wait() + + // Summarize and exit. 
+ failed := 0 + for _, r := range results { + if r.err != nil { + log.Printf("FAIL: %s — %v", r.name, r.err) + failed++ + } else { + log.Printf("PASS: %s", r.name) + } + } + if failed > 0 { + log.Fatalf("%d test group(s) failed", failed) + } + log.Println("All test groups passed") +} + +func runTestBinary(clusterName, labelFilter, skip, junitPath string, extraEnv []string) error { + ginkgoTimeout := os.Getenv("GINKGO_TIMEOUT") + if ginkgoTimeout == "" { + ginkgoTimeout = defaultGinkgoTimeout + } + + args := []string{ + fmt.Sprintf("--ginkgo.label-filter=%s", labelFilter), + fmt.Sprintf("--ginkgo.junit-report=%s", junitPath), + fmt.Sprintf("--ginkgo.timeout=%s", ginkgoTimeout), + "--ginkgo.v", + } + if skip != "" { + args = append(args, fmt.Sprintf("--ginkgo.skip=%s", skip)) + } + + cmd := exec.Command(testBinary, args...) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + cmd.Env = append(os.Environ(), + fmt.Sprintf("E2E_HOSTED_CLUSTER_NAME=%s", clusterName), + fmt.Sprintf("E2E_HOSTED_CLUSTER_NAMESPACE=%s", clusterNS), + ) + cmd.Env = append(cmd.Env, extraEnv...) 
+ + return cmd.Run() +} + +func readClusterName(sharedDir, filename string) string { + path := filepath.Join(sharedDir, filename) + data, err := os.ReadFile(path) + if err != nil { + log.Fatalf("Failed to read cluster name from %s: %v", path, err) + } + name := strings.TrimSpace(string(data)) + if name == "" { + log.Fatalf("Cluster name file %s is empty", path) + } + return name +} + +func requireEnv(key string) string { + val := os.Getenv(key) + if val == "" { + log.Fatalf("Required environment variable %s is not set", key) + } + return val +} diff --git a/test/e2e/v2/internal/test_context.go b/test/e2e/v2/internal/test_context.go index cba6f176bab..91cf144240e 100644 --- a/test/e2e/v2/internal/test_context.go +++ b/test/e2e/v2/internal/test_context.go @@ -108,6 +108,8 @@ func (tc *TestContext) GetHostedClusterClient() crclient.Client { if err != nil { panic(fmt.Sprintf("failed to create REST config from kubeconfig: %v", err)) } + restConfig.QPS = 200 + restConfig.Burst = 300 client, err := crclient.New(restConfig, crclient.Options{Scheme: hyperapi.Scheme}) if err != nil { diff --git a/test/e2e/v2/lifecycle/azure.go b/test/e2e/v2/lifecycle/azure.go new file mode 100644 index 00000000000..81f82217b01 --- /dev/null +++ b/test/e2e/v2/lifecycle/azure.go @@ -0,0 +1,257 @@ +//go:build e2ev2 + +package lifecycle + +import ( + "context" + "fmt" + "log" + "os" + "path/filepath" + "strings" + + operatorv1 "github.com/openshift/api/operator/v1" + hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" + crclient "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + defaultAzureCreds = "/etc/hypershift-ci-jobs-self-managed-azure/credentials.json" + defaultAzureLocation = "centralus" + defaultAzureDNSZoneRG = "os4-common" + + defaultOIDCIssuerURL = "https://smazure.blob.core.windows.net/smazure" + defaultSATokenKeyPath = "/etc/hypershift-ci-jobs-self-managed-azure-e2e/serviceaccount-signer.private" + defaultWorkloadIdentities = 
"/etc/hypershift-ci-jobs-self-managed-azure-e2e/workload-identities.json" +) + +// AzurePlatformConfig holds Azure-specific configuration for the +// hypershift CLI. +type AzurePlatformConfig struct { + creds string + location string + oidcIssuerURL string + saTokenKeyPath string + workloadIdentities string + dnsZoneRG string + privateNATSubnetID string + sharedDir string + + marketplacePublisher string + marketplaceOffer string + marketplaceSKU string + marketplaceVersion string +} + +// NewAzurePlatformConfig reads Azure-specific configuration from +// environment variables with CI defaults. +func NewAzurePlatformConfig(sharedDir string) *AzurePlatformConfig { + cfg := &AzurePlatformConfig{ + creds: envOrDefault("AZURE_CREDS", defaultAzureCreds), + location: envOrDefault("HYPERSHIFT_AZURE_LOCATION", defaultAzureLocation), + oidcIssuerURL: envOrDefault("AZURE_OIDC_ISSUER_URL", defaultOIDCIssuerURL), + saTokenKeyPath: envOrDefault("AZURE_SA_TOKEN_ISSUER_KEY_PATH", defaultSATokenKeyPath), + workloadIdentities: envOrDefault("AZURE_WORKLOAD_IDENTITIES_FILE", defaultWorkloadIdentities), + dnsZoneRG: defaultAzureDNSZoneRG, + sharedDir: sharedDir, + + marketplacePublisher: os.Getenv("HYPERSHIFT_AZURE_MARKETPLACE_IMAGE_PUBLISHER"), + marketplaceOffer: os.Getenv("HYPERSHIFT_AZURE_MARKETPLACE_IMAGE_OFFER"), + marketplaceSKU: os.Getenv("HYPERSHIFT_AZURE_MARKETPLACE_IMAGE_SKU"), + marketplaceVersion: os.Getenv("HYPERSHIFT_AZURE_MARKETPLACE_IMAGE_VERSION"), + } + + cfg.privateNATSubnetID = os.Getenv("AZURE_PRIVATE_NAT_SUBNET_ID") + if cfg.privateNATSubnetID == "" && sharedDir != "" { + if data, err := os.ReadFile(filepath.Join(sharedDir, "azure_private_nat_subnet_id")); err == nil { + cfg.privateNATSubnetID = strings.TrimSpace(string(data)) + } + } + if cfg.privateNATSubnetID == "" { + log.Printf("WARNING: AZURE_PRIVATE_NAT_SUBNET_ID is not set; private cluster creation will fail") + } + + if cfg.marketplaceSKU == "" && cfg.marketplacePublisher != "" && sharedDir != "" { + if 
data, err := os.ReadFile(filepath.Join(sharedDir, "azure-marketplace-image-sku")); err == nil { + cfg.marketplaceSKU = strings.TrimSpace(string(data)) + } + } + if cfg.marketplaceVersion == "" && cfg.marketplacePublisher != "" && sharedDir != "" { + if data, err := os.ReadFile(filepath.Join(sharedDir, "azure-marketplace-image-version")); err == nil { + cfg.marketplaceVersion = strings.TrimSpace(string(data)) + } + } + + return cfg +} + +func (a *AzurePlatformConfig) Name() string { return "azure" } + +func (a *AzurePlatformConfig) DefaultBaseDomain() string { + return "hcp-sm-azure.azure.devcluster.openshift.com" +} + +func (a *AzurePlatformConfig) Suffixes() []string { + return []string{"-pub", "-prv", "-oau", "-upg"} +} + +func (a *AzurePlatformConfig) ClusterSpecs(releaseImage, n1Image string) []ClusterSpec { + return []ClusterSpec{ + { + Variant: "public", + Suffix: "-pub", + OutputFile: "cluster-name-public", + }, + { + Variant: "private", + Suffix: "-prv", + OutputFile: "cluster-name-private", + ExtraArgs: []string{ + "--endpoint-access=Private", + "--endpoint-access-private-nat-subnet-id=" + a.privateNATSubnetID, + }, + }, + { + Variant: "oauth-lb", + Suffix: "-oau", + OutputFile: "cluster-name-oauth-lb", + ExtraArgs: []string{"--oauth-publishing-strategy=LoadBalancer"}, + }, + { + Variant: "upgrade", + Suffix: "-upg", + OutputFile: "cluster-name-upgrade", + ReleaseImage: n1Image, + ExtraArgs: []string{"--control-plane-availability-policy=HighlyAvailable"}, + }, + } +} + +func (a *AzurePlatformConfig) CreateArgs() []string { + args := []string{ + "--azure-creds=" + a.creds, + "--location=" + a.location, + "--oidc-issuer-url=" + a.oidcIssuerURL, + "--sa-token-issuer-private-key-path=" + a.saTokenKeyPath, + "--workload-identities-file=" + a.workloadIdentities, + "--assign-service-principal-roles", + "--dns-zone-rg-name=" + a.dnsZoneRG, + } + + if a.marketplacePublisher != "" { + args = append(args, "--marketplace-publisher="+a.marketplacePublisher) + args = 
append(args, "--marketplace-offer="+a.marketplaceOffer) + if a.marketplaceSKU != "" { + args = append(args, "--marketplace-sku="+a.marketplaceSKU) + } + if a.marketplaceVersion != "" { + args = append(args, "--marketplace-version="+a.marketplaceVersion) + } + } + + return args +} + +// PostCreate patches the public cluster's OperatorConfiguration with +// an IngressOperator using an internal LoadBalancer. This is specific +// to Azure self-managed testing. +func (a *AzurePlatformConfig) PostCreate(ctx context.Context, cl crclient.WithWatch, namespace string, clusterNames map[string]string) error { + publicName, ok := clusterNames["cluster-name-public"] + if !ok { + return nil + } + + hc := &hyperv1.HostedCluster{} + if err := cl.Get(ctx, crclient.ObjectKey{Namespace: namespace, Name: publicName}, hc); err != nil { + return fmt.Errorf("getting HostedCluster %s/%s: %w", namespace, publicName, err) + } + + patch := crclient.MergeFrom(hc.DeepCopy()) + if hc.Spec.OperatorConfiguration == nil { + hc.Spec.OperatorConfiguration = &hyperv1.OperatorConfiguration{} + } + hc.Spec.OperatorConfiguration.IngressOperator = &hyperv1.IngressOperatorSpec{ + EndpointPublishingStrategy: &operatorv1.EndpointPublishingStrategy{ + Type: operatorv1.LoadBalancerServiceStrategyType, + LoadBalancer: &operatorv1.LoadBalancerStrategy{ + Scope: operatorv1.InternalLoadBalancer, + }, + }, + } + if err := cl.Patch(ctx, hc, patch); err != nil { + return fmt.Errorf("patching HostedCluster %s/%s OperatorConfiguration: %w", namespace, publicName, err) + } + log.Printf("Patched public cluster %s/%s with OperatorConfiguration", namespace, publicName) + return nil +} + +func (a *AzurePlatformConfig) TestMatrix(releaseImage string) TestMatrix { + return TestMatrix{ + Parallel: []TestGroup{ + { + Name: "public", + ClusterFile: "cluster-name-public", + LabelFilter: "self-managed-azure-public || nodepool-autoscaling || nodepool-lifecycle", + Skip: "KAS allowed CIDRs", + JUnitFile: 
"junit_self_managed_azure_public.xml", + }, + { + Name: "private", + ClusterFile: "cluster-name-private", + LabelFilter: "self-managed-azure-private", + JUnitFile: "junit_self_managed_azure_private.xml", + }, + { + Name: "oauth-lb", + ClusterFile: "cluster-name-oauth-lb", + LabelFilter: "self-managed-azure-oauth-lb", + JUnitFile: "junit_self_managed_azure_oauth_lb.xml", + }, + }, + Sequential: []SequentialGroup{ + { + Name: "upgrade-and-chaos", + Steps: []TestGroup{ + { + Name: "upgrade", + ClusterFile: "cluster-name-upgrade", + LabelFilter: "control-plane-upgrade", + JUnitFile: "junit_lifecycle_upgrade.xml", + ExtraEnv: []string{fmt.Sprintf("E2E_LATEST_RELEASE_IMAGE=%s", releaseImage)}, + }, + { + Name: "etcd-chaos", + ClusterFile: "cluster-name-upgrade", + LabelFilter: "etcd-chaos", + JUnitFile: "junit_lifecycle_etcd_chaos.xml", + }, + }, + }, + }, + } +} + +// SetupTestEnv reads Azure-specific config from SHARED_DIR and sets +// environment variables for the test subprocesses. +func (a *AzurePlatformConfig) SetupTestEnv(sharedDir string) { + azurePrivateNATSubnetID := os.Getenv("AZURE_PRIVATE_NAT_SUBNET_ID") + if data, err := os.ReadFile(filepath.Join(sharedDir, "azure_private_nat_subnet_id")); err == nil { + azurePrivateNATSubnetID = strings.TrimSpace(string(data)) + } + os.Setenv("AZURE_PRIVATE_NAT_SUBNET_ID", azurePrivateNATSubnetID) +} + +func (a *AzurePlatformConfig) DestroyArgs() []string { + return []string{ + "--azure-creds=" + a.creds, + "--location=" + a.location, + "--dns-zone-rg-name=" + a.dnsZoneRG, + } +} + +func envOrDefault(key, defaultVal string) string { + if val := os.Getenv(key); val != "" { + return val + } + return defaultVal +} diff --git a/test/e2e/v2/lifecycle/platform.go b/test/e2e/v2/lifecycle/platform.go new file mode 100644 index 00000000000..43ee1a597d4 --- /dev/null +++ b/test/e2e/v2/lifecycle/platform.go @@ -0,0 +1,114 @@ +//go:build e2ev2 + +package lifecycle + +import ( + "context" + "crypto/sha256" + "fmt" + + crclient 
"sigs.k8s.io/controller-runtime/pkg/client" +) + +// ClusterSpec describes a single cluster to create for lifecycle tests. +type ClusterSpec struct { + Variant string + Suffix string // hash suffix for name derivation + OutputFile string // filename under SHARED_DIR + ExtraArgs []string + ReleaseImage string // override (empty = use default) +} + +// TestGroup describes one logical group of e2e tests to execute. +type TestGroup struct { + Name string + ClusterFile string // filename under SHARED_DIR containing cluster name + LabelFilter string + Skip string + JUnitFile string + ExtraEnv []string +} + +// SequentialGroup runs its Steps one after another within a single +// goroutine. If any step fails, subsequent steps are skipped. +type SequentialGroup struct { + Name string + Steps []TestGroup +} + +// TestMatrix defines the full set of test groups for a platform. +// Parallel groups all run concurrently. Each SequentialGroup also +// runs concurrently with everything else, but its internal Steps +// run one after another. +type TestMatrix struct { + Parallel []TestGroup + Sequential []SequentialGroup +} + +// PlatformConfig provides all platform-specific configuration for +// the v2 lifecycle binaries. Adding a new platform means implementing +// this interface — the cmd binaries should not need modification. +type PlatformConfig interface { + // Name returns the CLI subcommand name (e.g., "azure", "aws"). + Name() string + + // DefaultBaseDomain returns the platform's default base domain. + DefaultBaseDomain() string + + // ClusterSpecs returns the cluster variants this platform creates. + // The releaseImage and n1Image are the current and N-1 release + // images from the CI environment. + ClusterSpecs(releaseImage, n1Image string) []ClusterSpec + + // CreateArgs returns platform-specific args for + // "hypershift create cluster <platform>". 
+ CreateArgs() []string + + // PostCreate runs platform-specific setup after clusters are + // created (e.g., patching OperatorConfiguration). + PostCreate(ctx context.Context, cl crclient.WithWatch, namespace string, clusterNames map[string]string) error + + // TestMatrix returns the test groups for this platform. + TestMatrix(releaseImage string) TestMatrix + + // SetupTestEnv sets platform-specific environment variables + // before test execution (e.g., reading subnet IDs from + // SHARED_DIR files). + SetupTestEnv(sharedDir string) + + // DestroyArgs returns platform-specific args for + // "hypershift destroy cluster <platform>". + DestroyArgs() []string + + // Suffixes returns the hash suffixes for cluster name derivation, + // matching the order of ClusterSpecs. + Suffixes() []string +} + +// NewPlatformConfig creates a PlatformConfig for the given platform +// name. The sharedDir is passed for platforms that read fallback +// config from files. +func NewPlatformConfig(platform, sharedDir string) (PlatformConfig, error) { + switch platform { + case "azure", "": + return NewAzurePlatformConfig(sharedDir), nil + default: + return nil, fmt.Errorf("unsupported platform %q (supported: azure)", platform) + } +} + +// DeriveClusterName hashes prowJobID+suffix with SHA-256 and returns +// the first 20 hex characters. +func DeriveClusterName(prowJobID, suffix string) string { + hash := sha256.Sum256([]byte(prowJobID + suffix)) + return fmt.Sprintf("%x", hash)[:20] +} + +// DeriveClusterNames returns cluster names for the given suffixes. 
+func DeriveClusterNames(prowJobID string, suffixes []string) []string { + names := make([]string, 0, len(suffixes)) + for _, suffix := range suffixes { + names = append(names, DeriveClusterName(prowJobID, suffix)) + } + return names +} diff --git a/test/e2e/v2/tests/etcd_chaos_test.go b/test/e2e/v2/tests/etcd_chaos_test.go index 2ab2f652da6..1a8033f414e 100644 --- a/test/e2e/v2/tests/etcd_chaos_test.go +++ b/test/e2e/v2/tests/etcd_chaos_test.go @@ -276,7 +276,7 @@ func EtcdSingleMemberCorruptionTest(getTestCtx internal.TestContextGetter) { return want != 0 && want == got, fmt.Sprintf("wanted status active to be %d, got %d", want, got), nil }}, e2eutil.WithInterval(5*time.Second), - e2eutil.WithTimeout(10*time.Minute), + e2eutil.WithTimeout(15*time.Minute), ) waitForEtcdConvergence(ctx, testCtx.MgmtClient, cpNamespace, ptr.Deref(etcdSts.Spec.Replicas, 0)) @@ -341,7 +341,7 @@ func EtcdMissingMemberRecoveryTest(getTestCtx internal.TestContextGetter) { return want != 0 && want == got, fmt.Sprintf("wanted status active to be %d, got %d", want, got), nil }}, e2eutil.WithInterval(5*time.Second), - e2eutil.WithTimeout(10*time.Minute), + e2eutil.WithTimeout(15*time.Minute), ) waitForEtcdConvergence(ctx, testCtx.MgmtClient, cpNamespace, ptr.Deref(etcdSts.Spec.Replicas, 0)) From 1719d7671f065bde2c2926480c1676bd106efffc Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Mon, 18 May 2026 18:20:08 -0400 Subject: [PATCH 09/11] fix(e2ev2): fix CI failures in lifecycle tests - Etcd chaos: delete entire member directory instead of a single WAL file so etcd enters CrashLoopBackOff and the recovery controller detects it - Autoscaling scale-up: add node labels to NodePool and nodeSelector to workload so pods target only the autoscaling NP, forcing the autoscaler to scale it up - Autoscaling balancing: configure HostedCluster.Spec.Autoscaling with RandomExpander and BalancingIgnoredLabels so the autoscaler distributes across NodePools instead of favoring one - Split autoscaling tests onto 
a dedicated cluster to reduce sequential test count in the public group below the step timeout Co-Authored-By: Claude Opus 4.6 --- test/e2e/v2/cmd/destroy-guests/main.go | 22 ++-- test/e2e/v2/cmd/dump-guests/main.go | 13 +- test/e2e/v2/lifecycle/azure.go | 17 ++- test/e2e/v2/lifecycle/platform.go | 11 -- test/e2e/v2/tests/etcd_chaos_test.go | 35 +++--- .../e2e/v2/tests/nodepool_autoscaling_test.go | 114 +++++++++++++++--- test/e2e/v2/tests/nodepool_lifecycle_test.go | 39 +++--- 7 files changed, 171 insertions(+), 80 deletions(-) diff --git a/test/e2e/v2/cmd/destroy-guests/main.go b/test/e2e/v2/cmd/destroy-guests/main.go index 021d1cdfe84..28adc09b76e 100644 --- a/test/e2e/v2/cmd/destroy-guests/main.go +++ b/test/e2e/v2/cmd/destroy-guests/main.go @@ -52,12 +52,9 @@ func main() { hypershiftBin = "hypershift" } - clusterNames := lifecycle.DeriveClusterNames(prowJobID, platform.Suffixes()) + specs := platform.ClusterSpecs("", "") - log.Printf("Destroying %d clusters derived from PROW_JOB_ID=%s", len(clusterNames), prowJobID) - for _, name := range clusterNames { - log.Printf(" cluster: %s", name) - } + log.Printf("Destroying %d clusters derived from PROW_JOB_ID=%s", len(specs), prowJobID) var ( mu sync.Mutex @@ -65,17 +62,18 @@ func main() { wg sync.WaitGroup ) - for _, name := range clusterNames { + for _, spec := range specs { + clusterName := lifecycle.DeriveClusterName(prowJobID, spec.Suffix) wg.Add(1) - go func(clusterName string) { + go func() { defer wg.Done() - if err := destroyCluster(hypershiftBin, clusterName, platform); err != nil { - log.Printf("WARNING: Failed to destroy cluster %s: %v", clusterName, err) + if err := destroyCluster(hypershiftBin, clusterName, spec.Variant, platform); err != nil { + log.Printf("WARNING: Failed to destroy cluster %s (%s): %v", clusterName, spec.Variant, err) mu.Lock() failed = true mu.Unlock() } - }(name) + }() } wg.Wait() @@ -86,8 +84,8 @@ func main() { log.Printf("All clusters destroyed successfully") } -func 
destroyCluster(hypershiftBin, name string, platform lifecycle.PlatformConfig) error { - log.Printf("Destroying cluster: %s", name) +func destroyCluster(hypershiftBin, name, variant string, platform lifecycle.PlatformConfig) error { + log.Printf("Destroying cluster %s (%s)", name, variant) args := []string{ "destroy", "cluster", platform.Name(), diff --git a/test/e2e/v2/cmd/dump-guests/main.go b/test/e2e/v2/cmd/dump-guests/main.go index c1210916090..8379dd72682 100644 --- a/test/e2e/v2/cmd/dump-guests/main.go +++ b/test/e2e/v2/cmd/dump-guests/main.go @@ -52,15 +52,16 @@ func main() { log.Fatalf("Failed to initialize platform config: %v", err) } - clusterNames := lifecycle.DeriveClusterNames(prowJobID, platform.Suffixes()) - log.Printf("Dumping %d clusters derived from PROW_JOB_ID=%s", len(clusterNames), prowJobID) + specs := platform.ClusterSpecs("", "") + log.Printf("Dumping %d clusters derived from PROW_JOB_ID=%s", len(specs), prowJobID) var wg sync.WaitGroup - for _, name := range clusterNames { + for _, spec := range specs { + clusterName := lifecycle.DeriveClusterName(prowJobID, spec.Suffix) wg.Add(1) go func() { defer wg.Done() - dumpCluster(*hypershiftBinary, artifactDir, name) + dumpCluster(*hypershiftBinary, artifactDir, clusterName, spec.Variant) }() } wg.Wait() @@ -68,8 +69,8 @@ func main() { log.Println("All cluster dumps complete") } -func dumpCluster(hypershiftBinary, artifactDir, clusterName string) { - dumpDir := filepath.Join(artifactDir, clusterName) +func dumpCluster(hypershiftBinary, artifactDir, clusterName, variant string) { + dumpDir := filepath.Join(artifactDir, variant) if err := os.MkdirAll(dumpDir, 0755); err != nil { log.Printf("WARNING: Failed to create artifact directory %s: %v", dumpDir, err) return diff --git a/test/e2e/v2/lifecycle/azure.go b/test/e2e/v2/lifecycle/azure.go index 81f82217b01..8b9e8bfa764 100644 --- a/test/e2e/v2/lifecycle/azure.go +++ b/test/e2e/v2/lifecycle/azure.go @@ -91,10 +91,6 @@ func (a *AzurePlatformConfig) 
DefaultBaseDomain() string { return "hcp-sm-azure.azure.devcluster.openshift.com" } -func (a *AzurePlatformConfig) Suffixes() []string { - return []string{"-pub", "-prv", "-oau", "-upg"} -} - func (a *AzurePlatformConfig) ClusterSpecs(releaseImage, n1Image string) []ClusterSpec { return []ClusterSpec{ { @@ -124,6 +120,11 @@ func (a *AzurePlatformConfig) ClusterSpecs(releaseImage, n1Image string) []Clust ReleaseImage: n1Image, ExtraArgs: []string{"--control-plane-availability-policy=HighlyAvailable"}, }, + { + Variant: "autoscaling", + Suffix: "-asc", + OutputFile: "cluster-name-autoscaling", + }, } } @@ -191,7 +192,7 @@ func (a *AzurePlatformConfig) TestMatrix(releaseImage string) TestMatrix { { Name: "public", ClusterFile: "cluster-name-public", - LabelFilter: "self-managed-azure-public || nodepool-autoscaling || nodepool-lifecycle", + LabelFilter: "self-managed-azure-public || nodepool-lifecycle", Skip: "KAS allowed CIDRs", JUnitFile: "junit_self_managed_azure_public.xml", }, @@ -207,6 +208,12 @@ func (a *AzurePlatformConfig) TestMatrix(releaseImage string) TestMatrix { LabelFilter: "self-managed-azure-oauth-lb", JUnitFile: "junit_self_managed_azure_oauth_lb.xml", }, + { + Name: "autoscaling", + ClusterFile: "cluster-name-autoscaling", + LabelFilter: "nodepool-autoscaling", + JUnitFile: "junit_nodepool_autoscaling.xml", + }, }, Sequential: []SequentialGroup{ { diff --git a/test/e2e/v2/lifecycle/platform.go b/test/e2e/v2/lifecycle/platform.go index 43ee1a597d4..940fc98bce9 100644 --- a/test/e2e/v2/lifecycle/platform.go +++ b/test/e2e/v2/lifecycle/platform.go @@ -80,9 +80,6 @@ type PlatformConfig interface { // "hypershift destroy cluster <platform>". DestroyArgs() []string - // Suffixes returns the hash suffixes for cluster name derivation, - // matching the order of ClusterSpecs. 
- Suffixes() []string } // NewPlatformConfig creates a PlatformConfig for the given platform @@ -104,11 +101,3 @@ func DeriveClusterName(prowJobID, suffix string) string { return fmt.Sprintf("%x", hash)[:20] } -// DeriveClusterNames returns cluster names for the given suffixes. -func DeriveClusterNames(prowJobID string, suffixes []string) []string { - names := make([]string, 0, len(suffixes)) - for _, suffix := range suffixes { - names = append(names, DeriveClusterName(prowJobID, suffix)) - } - return names -} diff --git a/test/e2e/v2/tests/etcd_chaos_test.go b/test/e2e/v2/tests/etcd_chaos_test.go index 1a8033f414e..0a204d2bc5a 100644 --- a/test/e2e/v2/tests/etcd_chaos_test.go +++ b/test/e2e/v2/tests/etcd_chaos_test.go @@ -240,11 +240,11 @@ func EtcdKillAllMembersTest(getTestCtx internal.TestContextGetter) { }) } -// EtcdSingleMemberCorruptionTest corrupts a random member's WAL file using -// RunCommandInPod, deletes the pod, verifies the etcd recovery job becomes active, -// and waits for StatefulSet convergence. +// EtcdSingleMemberCorruptionTest destroys a random member's data directory using +// RunCommandInPod, then waits for etcd to crash in-place so the recovery +// controller detects the failing member and creates a recovery job. 
func EtcdSingleMemberCorruptionTest(getTestCtx internal.TestContextGetter) { - It("should recover after a single member's WAL is corrupted", func() { + It("should recover after a single member's data is corrupted", func() { testCtx := getTestCtx() ctx := testCtx.Context cpNamespace := testCtx.ControlPlaneNamespace @@ -252,15 +252,18 @@ func EtcdSingleMemberCorruptionTest(getTestCtx internal.TestContextGetter) { etcdSts, etcdPods := getEtcdStsAndPods(ctx, testCtx.MgmtClient, cpNamespace) pod := randomEtcdPods(etcdPods.Items, 1)[0] - command := `find /var/lib/data/member/wal -type f -name "*.wal" -print0 | shuf -z -n1 | xargs -0 rm` - - GinkgoWriter.Printf("Deleting WAL file from etcd pod: %s\n", pod.Name) - cmdStdout, err := e2eutil.RunCommandInPod(ctx, testCtx.MgmtClient, "etcd", pod.Namespace, []string{"/bin/sh", "-c", command}, "etcd", 5*time.Minute) - Expect(err).NotTo(HaveOccurred(), "failed to delete WAL file from etcd pod %s", pod.Name) - Expect(cmdStdout).NotTo(ContainSubstring("No such file or directory"), "failed to delete WAL file from etcd pod %s", pod.Name) - - GinkgoWriter.Printf("Deleting pod: %s\n", pod.Name) - Expect(testCtx.MgmtClient.Delete(ctx, &pod)).To(Succeed(), "failed to delete pod %s", pod.Name) + // Remove the entire member directory so etcd cannot start. + // Deleting only a single WAL file is insufficient because etcd + // can recover from partial WAL loss using its snapshot database. + // Do NOT delete the pod afterward — let etcd crash and restart + // in-place so RestartCount increments on the same pod. The + // recovery controller requires RestartCount > 0 to detect a + // failing member; deleting the pod resets RestartCount to 0. 
+ command := `rm -rf /var/lib/data/member` + + GinkgoWriter.Printf("Destroying data directory on etcd pod: %s\n", pod.Name) + _, err := e2eutil.RunCommandInPod(ctx, testCtx.MgmtClient, "etcd", pod.Namespace, []string{"/bin/sh", "-c", command}, "etcd", 5*time.Minute) + Expect(err).NotTo(HaveOccurred(), "failed to destroy data directory on etcd pod %s", pod.Name) // Etcd recovery job should be created. // We don't check if the job completed because it will be deleted after completion. @@ -271,9 +274,8 @@ func EtcdSingleMemberCorruptionTest(getTestCtx internal.TestContextGetter) { return recoveryJob, err }, []e2eutil.Predicate[*batchv1.Job]{func(job *batchv1.Job) (bool, string, error) { - want := int32(1) got := job.Status.Active - return want != 0 && want == got, fmt.Sprintf("wanted status active to be %d, got %d", want, got), nil + return got == 1, fmt.Sprintf("wanted status active to be 1, got %d", got), nil }}, e2eutil.WithInterval(5*time.Second), e2eutil.WithTimeout(15*time.Minute), @@ -336,9 +338,8 @@ func EtcdMissingMemberRecoveryTest(getTestCtx internal.TestContextGetter) { return recoveryJob, err }, []e2eutil.Predicate[*batchv1.Job]{func(job *batchv1.Job) (bool, string, error) { - want := int32(1) got := job.Status.Active - return want != 0 && want == got, fmt.Sprintf("wanted status active to be %d, got %d", want, got), nil + return got == 1, fmt.Sprintf("wanted status active to be 1, got %d", got), nil }}, e2eutil.WithInterval(5*time.Second), e2eutil.WithTimeout(15*time.Minute), diff --git a/test/e2e/v2/tests/nodepool_autoscaling_test.go b/test/e2e/v2/tests/nodepool_autoscaling_test.go index b3bd48987f9..0449a3bacf7 100644 --- a/test/e2e/v2/tests/nodepool_autoscaling_test.go +++ b/test/e2e/v2/tests/nodepool_autoscaling_test.go @@ -19,6 +19,7 @@ package tests import ( "context" "fmt" + "strings" "time" . 
"github.com/onsi/ginkgo/v2" @@ -28,6 +29,7 @@ import ( e2eutil "github.com/openshift/hypershift/test/e2e/util" "github.com/openshift/hypershift/test/e2e/v2/internal" + appsv1 "k8s.io/api/apps/v1" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -56,8 +58,10 @@ func AutoscalingScaleUpDownTest(getTestCtx internal.TestContextGetter) { defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc) Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist") - // Create autoscaling NodePool with min=1, max=3 - autoscalingNP := buildAutoscalingNodePool(defaultNP, 1, 3) + // Create autoscaling NodePool with min=1, max=3 and a unique node label + // so the workload targets only this NodePool's nodes. + autoscalingLabel := map[string]string{"e2e-autoscaling-test": "scale-up-down"} + autoscalingNP := buildAutoscalingNodePool(defaultNP, 1, 3, autoscalingLabel) err := testCtx.MgmtClient.Create(ctx, autoscalingNP) Expect(err).NotTo(HaveOccurred(), "failed to create autoscaling NodePool") GinkgoWriter.Printf("Created autoscaling NodePool %s with min=1, max=3\n", autoscalingNP.Name) @@ -78,9 +82,11 @@ func AutoscalingScaleUpDownTest(getTestCtx internal.TestContextGetter) { bytes, ok := memCapacity.AsInt64() Expect(ok).To(BeTrue(), "memory capacity should be convertible to int64") - // Create workload that requires 3 nodes (50% memory per pod, 3 pods) + // Create workload that requires 3 nodes (50% memory per pod, 3 pods). + // nodeSelector forces pods onto the autoscaling NodePool so the + // cluster autoscaler must scale it up. 
workloadMemRequest := *resource.NewQuantity(bytes/2, resource.BinarySI) - workload := newAutoscalingWorkload(3, workloadMemRequest) + workload := newAutoscalingWorkload(3, workloadMemRequest, autoscalingLabel) err = guestClient.Create(ctx, workload) Expect(err).NotTo(HaveOccurred(), "failed to create workload") @@ -97,7 +103,9 @@ func AutoscalingScaleUpDownTest(getTestCtx internal.TestContextGetter) { }) } -// AutoscalingBalancingTest tests that autoscaling balances workload across multiple NodePools +// AutoscalingBalancingTest tests that autoscaling balances workload across multiple NodePools. +// It configures the HostedCluster with the Random expander so the cluster autoscaler +// distributes scale-up events across NodePools instead of favoring one. func AutoscalingBalancingTest(getTestCtx internal.TestContextGetter) { It("should balance pods across multiple autoscaling NodePools", func() { testCtx := getTestCtx() @@ -112,18 +120,81 @@ func AutoscalingBalancingTest(getTestCtx internal.TestContextGetter) { Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available") ctx := testCtx.Context + cpNamespace := testCtx.ControlPlaneNamespace + + // Configure autoscaler with Random expander for balanced distribution. + // The default least-waste expander favors a single NodePool. 
+ balancingLabel := "e2e-balance-ignore" + originalHC := hc.DeepCopy() + hc.Spec.Autoscaling = hyperv1.ClusterAutoscaling{ + Expanders: []hyperv1.ExpanderString{ + hyperv1.RandomExpander, + }, + BalancingIgnoredLabels: []string{ + balancingLabel, + }, + MaxFreeDifferenceRatioPercent: ptr.To[int32](70), + } + err := testCtx.MgmtClient.Patch(ctx, hc, crclient.MergeFrom(originalHC)) + Expect(err).NotTo(HaveOccurred(), "failed to configure autoscaler on HostedCluster") + GinkgoWriter.Println("Configured HostedCluster autoscaling with Random expander") + + DeferCleanup(func() { + latest := &hyperv1.HostedCluster{} + if err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(hc), latest); err != nil { + GinkgoWriter.Printf("Warning: failed to get HostedCluster for cleanup: %v\n", err) + return + } + patch := crclient.MergeFrom(latest.DeepCopy()) + latest.Spec.Autoscaling = hyperv1.ClusterAutoscaling{} + if err := testCtx.MgmtClient.Patch(ctx, latest, patch); err != nil { + GinkgoWriter.Printf("Warning: failed to reset autoscaler config: %v\n", err) + } + }) + + // Wait for autoscaler deployment to pick up the new config + e2eutil.EventuallyObject(GinkgoTB(), ctx, "autoscaler deployment to have balancing config", + func(ctx context.Context) (*appsv1.Deployment, error) { + dep := &appsv1.Deployment{ObjectMeta: metav1.ObjectMeta{ + Namespace: cpNamespace, Name: "cluster-autoscaler", + }} + err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(dep), dep) + return dep, err + }, + []e2eutil.Predicate[*appsv1.Deployment]{func(dep *appsv1.Deployment) (bool, string, error) { + for _, arg := range dep.Spec.Template.Spec.Containers[0].Args { + if strings.Contains(arg, balancingLabel) { + return dep.Status.ReadyReplicas > 0, fmt.Sprintf("ready replicas: %d", dep.Status.ReadyReplicas), nil + } + } + return false, "balancing-ignore-label not found in autoscaler args", nil + }}, + e2eutil.WithInterval(10*time.Second), + e2eutil.WithTimeout(5*time.Minute), + ) // Find 
the default NodePool to copy platform config defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc) Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist") - // Create two autoscaling NodePools - autoscalingNP1 := buildAutoscalingNodePool(defaultNP, 1, 3) - err := testCtx.MgmtClient.Create(ctx, autoscalingNP1) + // Create two autoscaling NodePools with distinct labels for the + // balancing-ignored-labels config and a shared label for the workload nodeSelector. + sharedLabel := map[string]string{"e2e-autoscaling-test": "balance"} + np1Labels := map[string]string{ + "e2e-autoscaling-test": "balance", + balancingLabel: "np1", + } + np2Labels := map[string]string{ + "e2e-autoscaling-test": "balance", + balancingLabel: "np2", + } + + autoscalingNP1 := buildAutoscalingNodePool(defaultNP, 1, 3, np1Labels) + err = testCtx.MgmtClient.Create(ctx, autoscalingNP1) Expect(err).NotTo(HaveOccurred(), "failed to create first autoscaling NodePool") defer cleanupNodePool(ctx, testCtx.MgmtClient, autoscalingNP1) - autoscalingNP2 := buildAutoscalingNodePool(defaultNP, 1, 3) + autoscalingNP2 := buildAutoscalingNodePool(defaultNP, 1, 3, np2Labels) err = testCtx.MgmtClient.Create(ctx, autoscalingNP2) Expect(err).NotTo(HaveOccurred(), "failed to create second autoscaling NodePool") defer cleanupNodePool(ctx, testCtx.MgmtClient, autoscalingNP2) @@ -144,9 +215,9 @@ func AutoscalingBalancingTest(getTestCtx internal.TestContextGetter) { bytes, ok := memCapacity.AsInt64() Expect(ok).To(BeTrue(), "memory capacity should be convertible to int64") - // Create workload that requires 4 nodes (50% memory per pod, 4 pods) + // Create workload targeting the autoscaling NodePools via the shared label. 
workloadMemRequest := *resource.NewQuantity(bytes/2, resource.BinarySI) - workload := newAutoscalingWorkload(4, workloadMemRequest) + workload := newAutoscalingWorkload(4, workloadMemRequest, sharedLabel) err = guestClient.Create(ctx, workload) Expect(err).NotTo(HaveOccurred(), "failed to create workload") defer cleanupWorkload(ctx, guestClient, workload) @@ -192,8 +263,9 @@ func getDefaultNodePool(ctx context.Context, client crclient.Client, hc *hyperv1 return nil } -// buildAutoscalingNodePool creates a new NodePool with autoscaling enabled based on a template -func buildAutoscalingNodePool(template *hyperv1.NodePool, min, max int32) *hyperv1.NodePool { +// buildAutoscalingNodePool creates a new NodePool with autoscaling enabled based on a template. +// nodeLabels are applied to the NodePool's nodes so workloads can target them with a nodeSelector. +func buildAutoscalingNodePool(template *hyperv1.NodePool, min, max int32, nodeLabels map[string]string) *hyperv1.NodePool { GinkgoHelper() name := e2eutil.SimpleNameGenerator.GenerateName(template.Spec.ClusterName + "-auto-") @@ -214,11 +286,22 @@ func buildAutoscalingNodePool(template *hyperv1.NodePool, min, max int32) *hyper Max: max, } + if len(nodeLabels) > 0 { + if np.Spec.NodeLabels == nil { + np.Spec.NodeLabels = make(map[string]string) + } + for k, v := range nodeLabels { + np.Spec.NodeLabels[k] = v + } + } + return np } -// newAutoscalingWorkload creates a Job that spawns multiple pods for autoscaling tests -func newAutoscalingWorkload(njobs int32, memoryRequest resource.Quantity) *batchv1.Job { +// newAutoscalingWorkload creates a Job that spawns multiple pods for autoscaling tests. +// nodeSelector constrains pods to land on specific NodePool nodes so the +// cluster autoscaler is forced to scale the targeted NodePool. 
+func newAutoscalingWorkload(njobs int32, memoryRequest resource.Quantity, nodeSelector map[string]string) *batchv1.Job { GinkgoHelper() name := e2eutil.SimpleNameGenerator.GenerateName("autoscaling-workload-") @@ -257,6 +340,7 @@ func newAutoscalingWorkload(njobs int32, memoryRequest resource.Quantity) *batch }, }, }, + NodeSelector: nodeSelector, RestartPolicy: corev1.RestartPolicyNever, }, }, diff --git a/test/e2e/v2/tests/nodepool_lifecycle_test.go b/test/e2e/v2/tests/nodepool_lifecycle_test.go index 4c892d9e97e..7b91ce5552f 100644 --- a/test/e2e/v2/tests/nodepool_lifecycle_test.go +++ b/test/e2e/v2/tests/nodepool_lifecycle_test.go @@ -808,6 +808,19 @@ func NodePoolTrustBundleTest(getTestCtx internal.TestContextGetter) { ctx := testCtx.Context + defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc) + Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist") + + var oneReplica int32 = 1 + np := buildTestNodePool(defaultNP, "trust-bundle", func(pool *hyperv1.NodePool) { + pool.Spec.Replicas = &oneReplica + }) + Expect(testCtx.MgmtClient.Create(ctx, np)).To(Succeed(), "failed to create NodePool %s", np.Name) + GinkgoWriter.Printf("Created NodePool %s for trust bundle test\n", np.Name) + defer cleanupNodePool(ctx, testCtx.MgmtClient, np) + + e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, hc.Spec.Platform.Type) + // Create additional trust bundle ConfigMap trustBundle := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ @@ -826,19 +839,18 @@ func NodePoolTrustBundleTest(getTestCtx internal.TestContextGetter) { // Defer cleanup: remove trust bundle reference from HostedCluster defer func() { - _ = e2eutil.UpdateObject(GinkgoTB(), ctx, testCtx.MgmtClient, hc, func(obj *hyperv1.HostedCluster) { + err := e2eutil.UpdateObject(GinkgoTB(), ctx, testCtx.MgmtClient, hc, func(obj *hyperv1.HostedCluster) { obj.Spec.AdditionalTrustBundle = nil }) + if err != nil { + GinkgoWriter.Printf("WARNING: failed to clean up trust bundle reference: 
%v\n", err) + } }() - // Wait for NodePool to begin updating - defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc) - Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist") - - e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to begin updating", defaultNP.Namespace, defaultNP.Name), + e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to begin updating", np.Namespace, np.Name), func(ctx context.Context) (*hyperv1.NodePool, error) { pool := &hyperv1.NodePool{} - err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(defaultNP), pool) + err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(np), pool) return pool, err }, []e2eutil.Predicate[*hyperv1.NodePool]{ @@ -850,11 +862,10 @@ func NodePoolTrustBundleTest(getTestCtx internal.TestContextGetter) { e2eutil.WithInterval(10*time.Second), e2eutil.WithTimeout(5*time.Minute), ) - // Wait for NodePool to stop updating - e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to stop updating", defaultNP.Namespace, defaultNP.Name), + e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to stop updating", np.Namespace, np.Name), func(ctx context.Context) (*hyperv1.NodePool, error) { pool := &hyperv1.NodePool{} - err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(defaultNP), pool) + err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(np), pool) return pool, err }, []e2eutil.Predicate[*hyperv1.NodePool]{ @@ -925,10 +936,10 @@ func NodePoolTrustBundleTest(getTestCtx internal.TestContextGetter) { ) // Wait for NodePool to cycle again - e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to begin updating after trust bundle removal", defaultNP.Namespace, defaultNP.Name), + e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to begin updating after trust bundle removal", np.Namespace, np.Name), func(ctx context.Context) (*hyperv1.NodePool, error) { pool := 
&hyperv1.NodePool{} - err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(defaultNP), pool) + err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(np), pool) return pool, err }, []e2eutil.Predicate[*hyperv1.NodePool]{ @@ -940,10 +951,10 @@ func NodePoolTrustBundleTest(getTestCtx internal.TestContextGetter) { e2eutil.WithInterval(10*time.Second), e2eutil.WithTimeout(5*time.Minute), ) - e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to stop updating after trust bundle removal", defaultNP.Namespace, defaultNP.Name), + e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to stop updating after trust bundle removal", np.Namespace, np.Name), func(ctx context.Context) (*hyperv1.NodePool, error) { pool := &hyperv1.NodePool{} - err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(defaultNP), pool) + err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(np), pool) return pool, err }, []e2eutil.Predicate[*hyperv1.NodePool]{ From 5961891e6808ae5504553d8ae45463c1a080384f Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Tue, 19 May 2026 15:41:59 -0400 Subject: [PATCH 10/11] fix(e2ev2): use human-readable cluster names in CI Replace opaque SHA-256 hash cluster names with "{variant}-{hash10}" format so artifacts, namespaces, and dump directories are navigable. 
Co-Authored-By: Claude Opus 4.6 --- test/e2e/v2/cmd/create-guests/main.go | 2 +- test/e2e/v2/cmd/destroy-guests/main.go | 3 ++- test/e2e/v2/cmd/dump-guests/main.go | 8 ++++---- test/e2e/v2/lifecycle/azure.go | 5 ----- test/e2e/v2/lifecycle/platform.go | 14 ++++++++------ test/e2e/v2/tests/etcd_chaos_test.go | 6 ++++++ 6 files changed, 21 insertions(+), 17 deletions(-) diff --git a/test/e2e/v2/cmd/create-guests/main.go b/test/e2e/v2/cmd/create-guests/main.go index 129110a3c8c..9c843d17d76 100644 --- a/test/e2e/v2/cmd/create-guests/main.go +++ b/test/e2e/v2/cmd/create-guests/main.go @@ -129,7 +129,7 @@ func run(ctx context.Context, cfg envConfig) error { named := make([]namedSpec, len(specs)) clusterNames := make(map[string]string) // outputFile -> name for i, spec := range specs { - name := lifecycle.DeriveClusterName(cfg.prowJobID, spec.Suffix) + name := lifecycle.DeriveClusterName(cfg.prowJobID, spec.Variant) named[i] = namedSpec{ClusterSpec: spec, name: name} clusterNames[spec.OutputFile] = name } diff --git a/test/e2e/v2/cmd/destroy-guests/main.go b/test/e2e/v2/cmd/destroy-guests/main.go index 28adc09b76e..5f317b16ab5 100644 --- a/test/e2e/v2/cmd/destroy-guests/main.go +++ b/test/e2e/v2/cmd/destroy-guests/main.go @@ -63,12 +63,13 @@ func main() { ) for _, spec := range specs { - clusterName := lifecycle.DeriveClusterName(prowJobID, spec.Suffix) + clusterName := lifecycle.DeriveClusterName(prowJobID, spec.Variant) wg.Add(1) go func() { defer wg.Done() if err := destroyCluster(hypershiftBin, clusterName, spec.Variant, platform); err != nil { log.Printf("WARNING: Failed to destroy cluster %s (%s): %v", clusterName, spec.Variant, err) + log.Printf("ACTION REQUIRED: cloud resources for cluster %s may be orphaned and need manual cleanup (resource group, DNS records, etc.)", clusterName) mu.Lock() failed = true mu.Unlock() diff --git a/test/e2e/v2/cmd/dump-guests/main.go b/test/e2e/v2/cmd/dump-guests/main.go index 8379dd72682..3f0fec53999 100644 --- 
a/test/e2e/v2/cmd/dump-guests/main.go +++ b/test/e2e/v2/cmd/dump-guests/main.go @@ -57,11 +57,11 @@ func main() { var wg sync.WaitGroup for _, spec := range specs { - clusterName := lifecycle.DeriveClusterName(prowJobID, spec.Suffix) + clusterName := lifecycle.DeriveClusterName(prowJobID, spec.Variant) wg.Add(1) go func() { defer wg.Done() - dumpCluster(*hypershiftBinary, artifactDir, clusterName, spec.Variant) + dumpCluster(*hypershiftBinary, artifactDir, clusterName) }() } wg.Wait() @@ -69,8 +69,8 @@ func main() { log.Println("All cluster dumps complete") } -func dumpCluster(hypershiftBinary, artifactDir, clusterName, variant string) { - dumpDir := filepath.Join(artifactDir, variant) +func dumpCluster(hypershiftBinary, artifactDir, clusterName string) { + dumpDir := filepath.Join(artifactDir, clusterName) if err := os.MkdirAll(dumpDir, 0755); err != nil { log.Printf("WARNING: Failed to create artifact directory %s: %v", dumpDir, err) return diff --git a/test/e2e/v2/lifecycle/azure.go b/test/e2e/v2/lifecycle/azure.go index 8b9e8bfa764..47c1d6e1495 100644 --- a/test/e2e/v2/lifecycle/azure.go +++ b/test/e2e/v2/lifecycle/azure.go @@ -95,12 +95,10 @@ func (a *AzurePlatformConfig) ClusterSpecs(releaseImage, n1Image string) []Clust return []ClusterSpec{ { Variant: "public", - Suffix: "-pub", OutputFile: "cluster-name-public", }, { Variant: "private", - Suffix: "-prv", OutputFile: "cluster-name-private", ExtraArgs: []string{ "--endpoint-access=Private", @@ -109,20 +107,17 @@ func (a *AzurePlatformConfig) ClusterSpecs(releaseImage, n1Image string) []Clust }, { Variant: "oauth-lb", - Suffix: "-oau", OutputFile: "cluster-name-oauth-lb", ExtraArgs: []string{"--oauth-publishing-strategy=LoadBalancer"}, }, { Variant: "upgrade", - Suffix: "-upg", OutputFile: "cluster-name-upgrade", ReleaseImage: n1Image, ExtraArgs: []string{"--control-plane-availability-policy=HighlyAvailable"}, }, { Variant: "autoscaling", - Suffix: "-asc", OutputFile: "cluster-name-autoscaling", }, } diff 
--git a/test/e2e/v2/lifecycle/platform.go b/test/e2e/v2/lifecycle/platform.go index 940fc98bce9..a7ba5a1ebc7 100644 --- a/test/e2e/v2/lifecycle/platform.go +++ b/test/e2e/v2/lifecycle/platform.go @@ -13,7 +13,6 @@ import ( // ClusterSpec describes a single cluster to create for lifecycle tests. type ClusterSpec struct { Variant string - Suffix string // hash suffix for name derivation OutputFile string // filename under SHARED_DIR ExtraArgs []string ReleaseImage string // override (empty = use default) @@ -94,10 +93,13 @@ func NewPlatformConfig(platform, sharedDir string) (PlatformConfig, error) { } } -// DeriveClusterName hashes prowJobID+suffix with SHA-256 and returns -// the first 20 hex characters. -func DeriveClusterName(prowJobID, suffix string) string { - hash := sha256.Sum256([]byte(prowJobID + suffix)) - return fmt.Sprintf("%x", hash)[:20] +// DeriveClusterName builds a human-readable, deterministic cluster name +// from the prow job ID and cluster variant. The format is +// "{variant}-{hash10}" where hash10 is the first 10 hex characters of +// SHA-256(prowJobID), giving uniqueness per CI run while keeping the +// variant visible in artifacts and namespaces. 
+func DeriveClusterName(prowJobID, variant string) string { + hash := sha256.Sum256([]byte(prowJobID)) + return variant + "-" + fmt.Sprintf("%x", hash)[:10] } diff --git a/test/e2e/v2/tests/etcd_chaos_test.go b/test/e2e/v2/tests/etcd_chaos_test.go index 0a204d2bc5a..95de3987392 100644 --- a/test/e2e/v2/tests/etcd_chaos_test.go +++ b/test/e2e/v2/tests/etcd_chaos_test.go @@ -250,6 +250,9 @@ func EtcdSingleMemberCorruptionTest(getTestCtx internal.TestContextGetter) { cpNamespace := testCtx.ControlPlaneNamespace etcdSts, etcdPods := getEtcdStsAndPods(ctx, testCtx.MgmtClient, cpNamespace) + if ptr.Deref(etcdSts.Spec.Replicas, 0) < 3 { + Skip("etcd corruption recovery requires HighlyAvailable etcd (>= 3 replicas)") + } pod := randomEtcdPods(etcdPods.Items, 1)[0] // Remove the entire member directory so etcd cannot start. @@ -295,6 +298,9 @@ func EtcdMissingMemberRecoveryTest(getTestCtx internal.TestContextGetter) { cpNamespace := testCtx.ControlPlaneNamespace etcdSts, etcdPods := getEtcdStsAndPods(ctx, testCtx.MgmtClient, cpNamespace) + if ptr.Deref(etcdSts.Spec.Replicas, 0) < 3 { + Skip("etcd missing member recovery requires HighlyAvailable etcd (>= 3 replicas)") + } pod := randomEtcdPods(etcdPods.Items, 1)[0] ep := fmt.Sprintf("https://etcd-client.%s.svc:2379", cpNamespace) From d428dc133e8a473a2d920a73a6366bb7a2b72f3d Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Thu, 21 May 2026 05:27:56 -0400 Subject: [PATCH 11/11] test(e2ev2): add lifecycle umbrella label to lifecycle test suites Add Label("lifecycle") to all lifecycle test Describes so general v2 jobs can exclude them with a single !lifecycle filter instead of enumerating each label individually. 
Co-Authored-By: Claude Opus 4.6 --- test/e2e/v2/tests/control_plane_upgrade_test.go | 2 +- test/e2e/v2/tests/etcd_chaos_test.go | 2 +- test/e2e/v2/tests/nodepool_autoscaling_test.go | 2 +- test/e2e/v2/tests/nodepool_lifecycle_test.go | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/e2e/v2/tests/control_plane_upgrade_test.go b/test/e2e/v2/tests/control_plane_upgrade_test.go index d729d338601..a2387cfae05 100644 --- a/test/e2e/v2/tests/control_plane_upgrade_test.go +++ b/test/e2e/v2/tests/control_plane_upgrade_test.go @@ -81,7 +81,7 @@ func RegisterControlPlaneUpgradeTests(getTestCtx internal.TestContextGetter) { ControlPlaneUpgradeTest(getTestCtx) } -var _ = Describe("Control Plane Upgrade", Label("control-plane-upgrade"), func() { +var _ = Describe("Control Plane Upgrade", Label("lifecycle", "control-plane-upgrade"), func() { var testCtx *internal.TestContext BeforeEach(func() { diff --git a/test/e2e/v2/tests/etcd_chaos_test.go b/test/e2e/v2/tests/etcd_chaos_test.go index 95de3987392..e10a15b5561 100644 --- a/test/e2e/v2/tests/etcd_chaos_test.go +++ b/test/e2e/v2/tests/etcd_chaos_test.go @@ -52,7 +52,7 @@ func RegisterEtcdChaosTests(getTestCtx internal.TestContextGetter) { EtcdMissingMemberRecoveryTest(getTestCtx) } -var _ = Describe("Etcd Chaos", Label("etcd-chaos"), Ordered, func() { +var _ = Describe("Etcd Chaos", Label("lifecycle", "etcd-chaos"), Ordered, func() { var testCtx *internal.TestContext BeforeAll(func() { diff --git a/test/e2e/v2/tests/nodepool_autoscaling_test.go b/test/e2e/v2/tests/nodepool_autoscaling_test.go index 0449a3bacf7..5a2766d8a54 100644 --- a/test/e2e/v2/tests/nodepool_autoscaling_test.go +++ b/test/e2e/v2/tests/nodepool_autoscaling_test.go @@ -386,7 +386,7 @@ func RegisterNodePoolAutoscalingTests(getTestCtx internal.TestContextGetter) { AutoscalingBalancingTest(getTestCtx) } -var _ = Describe("NodePool Autoscaling", Label("nodepool-autoscaling"), func() { +var _ = Describe("NodePool Autoscaling", 
Label("lifecycle", "nodepool-autoscaling"), func() { var testCtx *internal.TestContext BeforeEach(func() { diff --git a/test/e2e/v2/tests/nodepool_lifecycle_test.go b/test/e2e/v2/tests/nodepool_lifecycle_test.go index 7b91ce5552f..22c0a754378 100644 --- a/test/e2e/v2/tests/nodepool_lifecycle_test.go +++ b/test/e2e/v2/tests/nodepool_lifecycle_test.go @@ -66,7 +66,7 @@ func RegisterNodePoolLifecycleTests(getTestCtx internal.TestContextGetter) { NodePoolDiskEncryptionTest(getTestCtx) } -var _ = Describe("NodePool Lifecycle", Label("nodepool-lifecycle"), func() { +var _ = Describe("NodePool Lifecycle", Label("lifecycle", "nodepool-lifecycle"), func() { var testCtx *internal.TestContext BeforeEach(func() {