diff --git a/Dockerfile.e2e b/Dockerfile.e2e index 1b079d706d9..68086ed8bae 100644 --- a/Dockerfile.e2e +++ b/Dockerfile.e2e @@ -5,7 +5,7 @@ WORKDIR /hypershift COPY . . -RUN make e2e hypershift +RUN make e2e hypershift e2ev2-create-guests e2ev2-run-tests e2ev2-destroy-guests e2ev2-dump-guests # Reuse the same image as builder because we need go command in ci-test-e2e.sh # Multi-stage build lets us drop the source code and build cache from the final image @@ -20,6 +20,10 @@ COPY --from=builder /hypershift/bin/test-backuprestore /hypershift/bin/test-back COPY --from=builder /hypershift/bin/test-setup /hypershift/bin/test-setup COPY --from=builder /hypershift/bin/test-reqserving /hypershift/bin/test-reqserving COPY --from=builder /hypershift/bin/hypershift /hypershift/bin/hypershift +COPY --from=builder /hypershift/bin/create-guests /hypershift/bin/create-guests +COPY --from=builder /hypershift/bin/run-tests /hypershift/bin/run-tests +COPY --from=builder /hypershift/bin/destroy-guests /hypershift/bin/destroy-guests +COPY --from=builder /hypershift/bin/dump-guests /hypershift/bin/dump-guests COPY --from=builder /hypershift/hack/ci-test-e2e.sh /hypershift/hack/ci-test-e2e.sh COPY --from=builder /hypershift/hack/run-reqserving-e2e.sh /hypershift/hack/run-reqserving-e2e.sh diff --git a/Makefile b/Makefile index c7342038963..4dce4a252e2 100644 --- a/Makefile +++ b/Makefile @@ -482,6 +482,22 @@ reqserving-e2e: e2ev2: $(GO_E2EV2_RECIPE) -o bin/test-e2e-v2 ./test/e2e/v2/tests +.PHONY: e2ev2-create-guests +e2ev2-create-guests: + $(GO_BUILD_RECIPE) -tags e2ev2 -o bin/create-guests ./test/e2e/v2/cmd/create-guests + +.PHONY: e2ev2-run-tests +e2ev2-run-tests: + $(GO_BUILD_RECIPE) -tags e2ev2 -o bin/run-tests ./test/e2e/v2/cmd/run-tests + +.PHONY: e2ev2-destroy-guests +e2ev2-destroy-guests: + $(GO_BUILD_RECIPE) -tags e2ev2 -o bin/destroy-guests ./test/e2e/v2/cmd/destroy-guests + +.PHONY: e2ev2-dump-guests +e2ev2-dump-guests: + $(GO_BUILD_RECIPE) -tags e2ev2 -o bin/dump-guests 
./test/e2e/v2/cmd/dump-guests + .PHONY: backuprestore-e2e backuprestore-e2e: $(GO_BACKUPRESTORE_E2E_RECIPE) -o bin/test-backuprestore ./test/e2e/v2/tests diff --git a/test/e2e/util/util.go b/test/e2e/util/util.go index dcbf6ed7e55..9928bccfcf2 100644 --- a/test/e2e/util/util.go +++ b/test/e2e/util/util.go @@ -343,7 +343,7 @@ func WaitForGuestRestConfig(t *testing.T, ctx context.Context, client crclient.C return guestConfig } -func WaitForGuestClient(t *testing.T, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster) crclient.Client { +func WaitForGuestClient(t testing.TB, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster) crclient.Client { g := NewWithT(t) guestKubeConfigSecretData := WaitForGuestKubeConfig(t, ctx, client, hostedCluster) @@ -463,23 +463,23 @@ func WaitForGuestKubeconfigHostResolutionUpdate(t *testing.T, ctx context.Contex g.Expect(err).NotTo(HaveOccurred(), "failed to wait for guest kubeconfig host resolution to update") } -func WaitForNReadyNodes(t *testing.T, ctx context.Context, client crclient.Client, n int32, platform hyperv1.PlatformType) []corev1.Node { +func WaitForNReadyNodes(t testing.TB, ctx context.Context, client crclient.Client, n int32, platform hyperv1.PlatformType) []corev1.Node { return WaitForNReadyNodesWithOptions(t, ctx, client, n, platform, "") } -func WaitForReadyNodesByNodePool(t *testing.T, ctx context.Context, client crclient.Client, np *hyperv1.NodePool, platform hyperv1.PlatformType, opts ...NodePoolPollOption) []corev1.Node { +func WaitForReadyNodesByNodePool(t testing.TB, ctx context.Context, client crclient.Client, np *hyperv1.NodePool, platform hyperv1.PlatformType, opts ...NodePoolPollOption) []corev1.Node { return WaitForNReadyNodesWithOptions(t, ctx, client, *np.Spec.Replicas, platform, fmt.Sprintf("for NodePool %s/%s", np.Namespace, np.Name), append(opts, WithClientOptions(crclient.MatchingLabelsSelector{Selector: 
labels.SelectorFromSet(labels.Set{hyperv1.NodePoolLabel: np.Name})}))...) } -func WaitForReadyNodesByLabels(t *testing.T, ctx context.Context, client crclient.Client, platform hyperv1.PlatformType, replicas int32, nodeLabels map[string]string) []corev1.Node { +func WaitForReadyNodesByLabels(t testing.TB, ctx context.Context, client crclient.Client, platform hyperv1.PlatformType, replicas int32, nodeLabels map[string]string) []corev1.Node { return WaitForNReadyNodesWithOptions(t, ctx, client, replicas, platform, "", WithClientOptions(crclient.MatchingLabelsSelector{Selector: labels.SelectorFromSet(labels.Set(nodeLabels))})) } -func WaitForNodePoolConfigUpdateComplete(t *testing.T, ctx context.Context, client crclient.Client, np *hyperv1.NodePool) { +func WaitForNodePoolConfigUpdateComplete(t testing.TB, ctx context.Context, client crclient.Client, np *hyperv1.NodePool) { WaitForNodePoolConfigUpdateCompleteWithPlatform(t, ctx, client, np, hyperv1.NonePlatform) } -func WaitForNodePoolConfigUpdateCompleteWithPlatform(t *testing.T, ctx context.Context, client crclient.Client, np *hyperv1.NodePool, platform hyperv1.PlatformType) { +func WaitForNodePoolConfigUpdateCompleteWithPlatform(t testing.TB, ctx context.Context, client crclient.Client, np *hyperv1.NodePool, platform hyperv1.PlatformType) { // configUpdateTimeout for config updates to complete configUpdateTimeout := 25 * time.Minute switch platform { @@ -557,7 +557,7 @@ func WithSuffix(suffix string) NodePoolPollOption { } } -func WaitForNReadyNodesWithOptions(t *testing.T, ctx context.Context, client crclient.Client, n int32, platform hyperv1.PlatformType, suffix string, opts ...NodePoolPollOption) []corev1.Node { +func WaitForNReadyNodesWithOptions(t testing.TB, ctx context.Context, client crclient.Client, n int32, platform hyperv1.PlatformType, suffix string, opts ...NodePoolPollOption) []corev1.Node { options := &NodePoolPollOptions{} for _, opt := range opts { opt(options) @@ -606,7 +606,7 @@ func 
WaitForNReadyNodesWithOptions(t *testing.T, ctx context.Context, client crc // This was renamed from WaitForImageRollout to clarify that it checks HC.Status.Version // (data-plane CVO rollout), in contrast to WaitForControlPlaneRollout which checks // HC.Status.ControlPlaneVersion (management-side components). -func WaitForDataPlaneRollout(t *testing.T, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster) { +func WaitForDataPlaneRollout(t testing.TB, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster) { var lastVersionCompletionTime *metav1.Time if hostedCluster.Status.Version != nil && len(hostedCluster.Status.Version.History) > 0 { @@ -651,14 +651,14 @@ func WaitForDataPlaneRollout(t *testing.T, ctx context.Context, client crclient. // WaitForImageRollout is a deprecated alias for WaitForDataPlaneRollout. // Deprecated: Use WaitForDataPlaneRollout instead. -func WaitForImageRollout(t *testing.T, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster) { +func WaitForImageRollout(t testing.TB, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster) { WaitForDataPlaneRollout(t, ctx, client, hostedCluster) } // WaitForControlPlaneRollout waits for HC.Status.ControlPlaneVersion to reach Completed state // with the desired image. This checks management-side component rollout independently from CVO. // Must be gated with AtLeast(t, Version422) at call sites since older clusters lack this field. 
-func WaitForControlPlaneRollout(t *testing.T, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster) { +func WaitForControlPlaneRollout(t testing.TB, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster) { EventuallyObject(t, ctx, fmt.Sprintf("HostedCluster %s/%s controlPlaneVersion to complete", hostedCluster.Namespace, hostedCluster.Name), func(ctx context.Context) (*hyperv1.HostedCluster, error) { hc := &hyperv1.HostedCluster{} @@ -676,7 +676,7 @@ func WaitForControlPlaneRollout(t *testing.T, ctx context.Context, client crclie // WaitForControlPlaneComponentRollout waits for all ControlPlaneComponent resources to report // RolloutComplete=True and a version different from initialVersion. This provides a belt-and-suspenders // check alongside WaitForControlPlaneRollout by directly inspecting individual component status. -func WaitForControlPlaneComponentRollout(t *testing.T, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster, initialVersion string) { +func WaitForControlPlaneComponentRollout(t testing.TB, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster, initialVersion string) { controlPlaneComponents := &hyperv1.ControlPlaneComponentList{} controlPlaneNamespace := manifests.HostedControlPlaneNamespace(hostedCluster.Namespace, hostedCluster.Name) EventuallyObjects(t, ctx, "control plane components to complete rollout", @@ -735,7 +735,7 @@ func WaitForConditionsOnHostedControlPlane(t *testing.T, ctx context.Context, cl ) } -func WaitForNodePoolDesiredNodes(t *testing.T, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster) { +func WaitForNodePoolDesiredNodes(t testing.TB, ctx context.Context, client crclient.Client, hostedCluster *hyperv1.HostedCluster) { EventuallyObjects(t, ctx, fmt.Sprintf("NodePools for HostedCluster %s/%s to have all of their desired nodes", hostedCluster.Namespace, hostedCluster.Name), 
func(ctx context.Context) ([]*hyperv1.NodePool, error) { list := &hyperv1.NodePoolList{} diff --git a/test/e2e/v2/cmd/create-guests/main.go b/test/e2e/v2/cmd/create-guests/main.go new file mode 100644 index 00000000000..9c843d17d76 --- /dev/null +++ b/test/e2e/v2/cmd/create-guests/main.go @@ -0,0 +1,525 @@ +//go:build e2ev2 + +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// create-guests creates HostedClusters in parallel for v2 e2e +// lifecycle tests. The number and configuration of clusters is +// determined by the platform (HYPERSHIFT_PLATFORM env var). +// It shells out to the hypershift CLI for cluster creation, runs +// platform-specific post-create hooks, then uses controller-runtime +// watches to wait for Available condition and version rollout +// completion. Cluster names are derived deterministically from +// PROW_JOB_ID and written to SHARED_DIR for downstream CI steps. +// JUnit XML is emitted to ARTIFACT_DIR on rollout failure. 
+package main
+
+import (
+	"context"
+	"fmt"
+	"log"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+
+	configv1 "github.com/openshift/api/config/v1"
+	hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1"
+	"github.com/openshift/hypershift/test/e2e/v2/lifecycle"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
+	"k8s.io/apimachinery/pkg/watch"
+	ctrl "sigs.k8s.io/controller-runtime"
+	crclient "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// scheme holds the API types the management-cluster client can decode.
+var scheme = runtime.NewScheme()
+
+func init() {
+	utilruntime.Must(hyperv1.AddToScheme(scheme))
+}
+
+// defaultNamespace is used when HYPERSHIFT_NAMESPACE is not set.
+const defaultNamespace = "clusters"
+
+// envConfig captures the common environment configuration.
+type envConfig struct {
+	prowJobID    string // PROW_JOB_ID (required); input to cluster-name derivation
+	sharedDir    string // SHARED_DIR (required); cluster names are written here
+	artifactDir  string // ARTIFACT_DIR (required); JUnit XML is written here
+	releaseImage string // RELEASE_IMAGE_LATEST (required)
+	n1Image      string // OCP_IMAGE_N1; falls back to releaseImage when unset
+
+	baseDomain  string // HYPERSHIFT_BASE_DOMAIN or the platform default
+	nodeCount   int    // HYPERSHIFT_NODE_COUNT, default 3
+	namespace   string // HYPERSHIFT_NAMESPACE, default defaultNamespace
+	externalDNS string // HYPERSHIFT_EXTERNAL_DNS_DOMAIN (optional)
+	etcdSC      string // HYPERSHIFT_ETCD_STORAGE_CLASS (optional)
+	pullSecret  string // PULL_SECRET path, default CI pull credentials
+
+	platform         lifecycle.PlatformConfig
+	hypershiftBinary string        // HYPERSHIFT_BINARY, default "hypershift"
+	waitTimeout      time.Duration // per-cluster version rollout timeout
+}
+
+// main loads the configuration from the environment and runs the create
+// workflow under an overall deadline of waitTimeout plus ten minutes.
+func main() {
+	cfg := loadEnvConfig()
+
+	ctx, cancel := context.WithTimeout(context.Background(), cfg.waitTimeout+10*time.Minute)
+	err := run(ctx, cfg)
+	// log.Fatalf terminates through os.Exit, which skips deferred calls, so
+	// the context is released explicitly before deciding how to exit.
+	cancel()
+	if err != nil {
+		log.Fatalf("Error: %v", err)
+	}
+}
+
+// loadEnvConfig builds an envConfig from environment variables. Missing
+// required variables and platform initialization failures terminate the
+// process via log.Fatalf.
+func loadEnvConfig() envConfig {
+	sharedDir := mustGetenv("SHARED_DIR")
+
+	platform, err := lifecycle.NewPlatformConfig(os.Getenv("HYPERSHIFT_PLATFORM"), sharedDir)
+	if err != nil {
+		log.Fatalf("Failed to initialize platform config: %v", err)
+	}
+
+	cfg := envConfig{
+		prowJobID:    mustGetenv("PROW_JOB_ID"),
+		sharedDir:    sharedDir,
+		artifactDir:  mustGetenv("ARTIFACT_DIR"),
+		releaseImage: mustGetenv("RELEASE_IMAGE_LATEST"),
+		n1Image:      os.Getenv("OCP_IMAGE_N1"),
+
+		baseDomain:  envOrDefault("HYPERSHIFT_BASE_DOMAIN", platform.DefaultBaseDomain()),
+		nodeCount:   envOrDefaultInt("HYPERSHIFT_NODE_COUNT", 3),
+		namespace:   envOrDefault("HYPERSHIFT_NAMESPACE", defaultNamespace),
+		externalDNS: os.Getenv("HYPERSHIFT_EXTERNAL_DNS_DOMAIN"),
+		etcdSC:      os.Getenv("HYPERSHIFT_ETCD_STORAGE_CLASS"),
+		pullSecret:  envOrDefault("PULL_SECRET", "/etc/ci-pull-credentials/.dockerconfigjson"),
+
+		platform:         platform,
+		hypershiftBinary: envOrDefault("HYPERSHIFT_BINARY", "hypershift"),
+		waitTimeout:      45 * time.Minute,
+	}
+
+	// Without a dedicated N-1 image, the N-1 variants use the latest release.
+	if cfg.n1Image == "" {
+		cfg.n1Image = cfg.releaseImage
+	}
+
+	return cfg
+}
+
+// run executes the five phases of guest creation: parallel create commands,
+// platform post-create hooks, waiting for Available, waiting for version
+// rollout, and writing the cluster names to SHARED_DIR. Phases 1-3 abort on
+// the first phase that has any failing cluster; a rollout failure is reported
+// only after the names were written so later steps can still find them.
+func run(ctx context.Context, cfg envConfig) error {
+	specs := cfg.platform.ClusterSpecs(cfg.releaseImage, cfg.n1Image)
+
+	// Derive cluster names and build the name map.
+	named := make([]namedSpec, len(specs))
+	clusterNames := make(map[string]string, len(specs)) // outputFile -> name
+	for i, spec := range specs {
+		name := lifecycle.DeriveClusterName(cfg.prowJobID, spec.Variant)
+		named[i] = namedSpec{ClusterSpec: spec, name: name}
+		clusterNames[spec.OutputFile] = name
+	}
+
+	// Phase 1: Create all clusters in parallel.
+	log.Printf("Phase 1: Creating %d clusters in parallel", len(named))
+	createErrors := createClustersParallel(ctx, cfg, named)
+	anyCreateFailed := false
+	for _, ns := range named {
+		if err := createErrors[ns.Variant]; err != nil {
+			log.Printf("ERROR: cluster %s (%s) creation failed: %v", ns.name, ns.Variant, err)
+			anyCreateFailed = true
+			continue
+		}
+		log.Printf("Cluster %s (%s) creation command completed", ns.name, ns.Variant)
+	}
+	if anyCreateFailed {
+		return fmt.Errorf("one or more cluster create commands failed")
+	}
+
+	// Phase 2: Platform-specific post-create hooks.
+	log.Println("Phase 2: Running platform post-create hooks")
+	mgmtClient, err := newMgmtClient()
+	if err != nil {
+		return fmt.Errorf("creating management cluster client: %w", err)
+	}
+	if err := cfg.platform.PostCreate(ctx, mgmtClient, cfg.namespace, clusterNames); err != nil {
+		return fmt.Errorf("platform post-create hook: %w", err)
+	}
+
+	// Phase 3: Watch for Available condition on all clusters.
+	log.Println("Phase 3: Waiting for all clusters to become Available")
+	availableErrors := waitForClustersAvailable(ctx, mgmtClient, cfg.namespace, named, 30*time.Minute)
+	anyUnavailable := false
+	for _, ns := range named {
+		if err := availableErrors[ns.Variant]; err != nil {
+			log.Printf("ERROR: cluster %s (%s) did not become Available: %v", ns.name, ns.Variant, err)
+			anyUnavailable = true
+			continue
+		}
+		log.Printf("Cluster %s (%s) is Available", ns.name, ns.Variant)
+	}
+	if anyUnavailable {
+		return fmt.Errorf("one or more clusters did not become Available")
+	}
+
+	// Phase 4: Watch for version rollout completion on all clusters.
+	log.Println("Phase 4: Waiting for version rollout completion on all clusters")
+	rolloutErrors := waitForVersionRollout(ctx, mgmtClient, cfg, named)
+	anyRolloutFailed := false
+	for _, ns := range named {
+		if err := rolloutErrors[ns.Variant]; err != nil {
+			log.Printf("ERROR: version rollout failed for %s (%s): %v", ns.name, ns.Variant, err)
+			emitJUnitFailure(ctx, mgmtClient, cfg, ns.name, ns.Variant)
+			anyRolloutFailed = true
+		} else {
+			log.Printf("Version rollout completed for %s (%s)", ns.name, ns.Variant)
+			emitJUnitSuccess(cfg, ns.name, ns.Variant)
+		}
+	}
+
+	// Phase 5: Write cluster names to SHARED_DIR.
+	log.Println("Phase 5: Writing cluster names to SHARED_DIR")
+	for _, ns := range named {
+		outputPath := filepath.Join(cfg.sharedDir, ns.OutputFile)
+		if err := os.WriteFile(outputPath, []byte(ns.name), 0600); err != nil {
+			return fmt.Errorf("writing cluster name to %s: %w", outputPath, err)
+		}
+		log.Printf("Wrote cluster name %q to %s", ns.name, outputPath)
+	}
+
+	if anyRolloutFailed {
+		return fmt.Errorf("one or more cluster version rollouts failed")
+	}
+
+	log.Println("All clusters are ready")
+	return nil
+}
+
+// buildCreateArgs returns CLI arguments for creating a cluster.
+func buildCreateArgs(cfg envConfig, name string, spec lifecycle.ClusterSpec) []string {
+	// A per-spec release image overrides the job-wide one.
+	releaseImage := cfg.releaseImage
+	if spec.ReleaseImage != "" {
+		releaseImage = spec.ReleaseImage
+	}
+
+	args := []string{
+		"create", "cluster", cfg.platform.Name(),
+		"--name=" + name,
+		// The wait phases look the cluster up in cfg.namespace, so the create
+		// command must target the same namespace when HYPERSHIFT_NAMESPACE
+		// overrides the default.
+		"--namespace=" + cfg.namespace,
+		"--node-pool-replicas=" + strconv.Itoa(cfg.nodeCount),
+		"--base-domain=" + cfg.baseDomain,
+		"--pull-secret=" + cfg.pullSecret,
+		"--release-image=" + releaseImage,
+		"--generate-ssh",
+	}
+
+	if cfg.externalDNS != "" {
+		args = append(args, "--external-dns-domain="+cfg.externalDNS)
+	}
+	if cfg.etcdSC != "" {
+		args = append(args, "--etcd-storage-class="+cfg.etcdSC)
+	}
+
+	// Platform-wide arguments come before the per-variant extras.
+	args = append(args, cfg.platform.CreateArgs()...)
+	args = append(args, spec.ExtraArgs...)
+
+	return args
+}
+
+// namedSpec pairs a platform ClusterSpec with the cluster name derived for it.
+type namedSpec struct {
+	lifecycle.ClusterSpec
+	name string
+}
+
+// createClustersParallel runs one "hypershift create cluster" command per spec
+// concurrently and returns the command error (nil on success) keyed by variant.
+func createClustersParallel(ctx context.Context, cfg envConfig, specs []namedSpec) map[string]error {
+	results := make(map[string]error, len(specs))
+	var mu sync.Mutex
+	var wg sync.WaitGroup
+
+	for _, ns := range specs {
+		wg.Add(1)
+		// The spec is passed as an argument so the goroutine never depends on
+		// per-iteration loop variable semantics.
+		go func(ns namedSpec) {
+			defer wg.Done()
+			args := buildCreateArgs(cfg, ns.name, ns.ClusterSpec)
+			log.Printf("Creating %s cluster %s", ns.Variant, ns.name)
+			log.Printf("Running: %s %v", cfg.hypershiftBinary, args)
+
+			cmd := exec.CommandContext(ctx, cfg.hypershiftBinary, args...)
+			cmd.Stdout = os.Stdout
+			cmd.Stderr = os.Stderr
+			err := cmd.Run()
+
+			mu.Lock()
+			results[ns.Variant] = err
+			mu.Unlock()
+		}(ns)
+	}
+	wg.Wait()
+	return results
+}
+
+// newMgmtClient returns a watch-capable client for the management cluster,
+// using the kubeconfig resolved by controller-runtime.
+func newMgmtClient() (crclient.WithWatch, error) {
+	restConfig, err := ctrl.GetConfig()
+	if err != nil {
+		return nil, fmt.Errorf("getting management cluster kubeconfig: %w", err)
+	}
+	return crclient.NewWithWatch(restConfig, crclient.Options{Scheme: scheme})
+}
+
+// waitForClustersAvailable waits concurrently, up to timeout per cluster, for
+// every HostedCluster to report Available=True. The result maps each variant
+// to its wait error (nil on success).
+func waitForClustersAvailable(ctx context.Context, cl crclient.WithWatch, namespace string, specs []namedSpec, timeout time.Duration) map[string]error {
+	results := make(map[string]error, len(specs))
+	var mu sync.Mutex
+	var wg sync.WaitGroup
+
+	for _, ns := range specs {
+		wg.Add(1)
+		go func(ns namedSpec) {
+			defer wg.Done()
+			watchCtx, watchCancel := context.WithTimeout(ctx, timeout)
+			defer watchCancel()
+
+			err := watchForCondition(watchCtx, cl, namespace, ns.name, func(hc *hyperv1.HostedCluster) bool {
+				for _, cond := range hc.Status.Conditions {
+					if cond.Type == string(hyperv1.HostedClusterAvailable) && cond.Status == metav1.ConditionTrue {
+						return true
+					}
+				}
+				return false
+			})
+
+			mu.Lock()
+			results[ns.Variant] = err
+			mu.Unlock()
+		}(ns)
+	}
+	wg.Wait()
+	return results
+}
+
+// waitForVersionRollout waits concurrently, up to cfg.waitTimeout per cluster,
+// until every entry in the HostedCluster version history is Completed. The
+// result maps each variant to its wait error (nil on success).
+func waitForVersionRollout(ctx context.Context, cl crclient.WithWatch, cfg envConfig, specs []namedSpec) map[string]error {
+	results := make(map[string]error, len(specs))
+	var mu sync.Mutex
+	var wg sync.WaitGroup
+
+	for _, ns := range specs {
+		wg.Add(1)
+		go func(ns namedSpec) {
+			defer wg.Done()
+			watchCtx, watchCancel := context.WithTimeout(ctx, cfg.waitTimeout)
+			defer watchCancel()
+
+			err := watchForCondition(watchCtx, cl, cfg.namespace, ns.name, func(hc *hyperv1.HostedCluster) bool {
+				if hc.Status.Version == nil || len(hc.Status.Version.History) == 0 {
+					return false
+				}
+				// An empty or partial state both count as "not finished".
+				for _, entry := range hc.Status.Version.History {
+					if entry.State != configv1.CompletedUpdate {
+						return false
+					}
+				}
+				return true
+			})
+
+			mu.Lock()
+			results[ns.Variant] = err
+			mu.Unlock()
+		}(ns)
+	}
+	wg.Wait()
+	return results
+}
+
+// watchForCondition blocks until predicate returns true for the named
+// HostedCluster or ctx is done. A watch that the server closes is
+// re-established rather than reported as a failure, because API servers end
+// long-running watch requests as a matter of course.
+func watchForCondition(ctx context.Context, cl crclient.WithWatch, namespace, name string, predicate func(*hyperv1.HostedCluster) bool) error {
+	key := crclient.ObjectKey{Namespace: namespace, Name: name}
+
+	// Fast path: no watch is needed when the cluster already qualifies. A Get
+	// error is tolerated here because the object may not exist yet.
+	hc := &hyperv1.HostedCluster{}
+	if err := cl.Get(ctx, key, hc); err == nil && predicate(hc) {
+		return nil
+	}
+
+	for {
+		satisfied, err := watchOnce(ctx, cl, key, predicate)
+		if err != nil {
+			return err
+		}
+		if satisfied {
+			return nil
+		}
+
+		log.Printf("Watch for %s/%s was closed; re-establishing", namespace, name)
+		// Brief pause so a server that keeps closing the watch cannot turn
+		// this loop into a busy spin.
+		select {
+		case <-ctx.Done():
+			return fmt.Errorf("timed out waiting for %s/%s: %w", namespace, name, ctx.Err())
+		case <-time.After(time.Second):
+		}
+	}
+}
+
+// watchOnce opens a single watch on the HostedCluster identified by key and
+// consumes it. It returns true once predicate holds, false with a nil error
+// when the watch channel was closed, and an error when the watch cannot be
+// started, reports an error event, or ctx is done.
+func watchOnce(ctx context.Context, cl crclient.WithWatch, key crclient.ObjectKey, predicate func(*hyperv1.HostedCluster) bool) (bool, error) {
+	watcher, err := cl.Watch(ctx, &hyperv1.HostedClusterList{},
+		crclient.InNamespace(key.Namespace),
+		crclient.MatchingFields{"metadata.name": key.Name},
+	)
+	if err != nil {
+		return false, fmt.Errorf("starting watch for %s/%s: %w", key.Namespace, key.Name, err)
+	}
+	defer watcher.Stop()
+
+	// Re-read after the watch is open so an update that landed between the
+	// previous read and the watch start is not missed.
+	hc := &hyperv1.HostedCluster{}
+	if err := cl.Get(ctx, key, hc); err == nil && predicate(hc) {
+		return true, nil
+	}
+
+	for {
+		select {
+		case <-ctx.Done():
+			return false, fmt.Errorf("timed out waiting for %s/%s: %w", key.Namespace, key.Name, ctx.Err())
+		case event, ok := <-watcher.ResultChan():
+			if !ok {
+				return false, nil
+			}
+			if event.Type == watch.Error {
+				return false, fmt.Errorf("watch error for %s/%s: %v", key.Namespace, key.Name, event.Object)
+			}
+			if event.Type != watch.Added && event.Type != watch.Modified {
+				continue
+			}
+			watchedHC, ok := event.Object.(*hyperv1.HostedCluster)
+			if !ok {
+				continue
+			}
+			logClusterProgress(watchedHC)
+			if predicate(watchedHC) {
+				return true, nil
+			}
+		}
+	}
+}
+
+// logClusterProgress logs the Available condition status and the state of the
+// first version history entry of hc.
+func logClusterProgress(hc *hyperv1.HostedCluster) {
+	available := "Unknown"
+	for _, cond := range hc.Status.Conditions {
+		if cond.Type == string(hyperv1.HostedClusterAvailable) {
+			available = string(cond.Status)
+			break
+		}
+	}
+
+	versionState := ""
+	if hc.Status.Version != nil && len(hc.Status.Version.History) > 0 {
+		versionState = string(hc.Status.Version.History[0].State)
+	}
+
+	log.Printf("Cluster %s/%s: Available=%s, VersionState=%s",
+		hc.Namespace, hc.Name, available, versionState)
+}
+
+func 
emitJUnitFailure(ctx context.Context, cl crclient.WithWatch, cfg envConfig, name, variant string) {
+	// Writes a failure report for the named cluster to
+	// ARTIFACT_DIR/junit_hosted_cluster_<name>.xml.
+	//
+	// Best effort: the Get error is deliberately ignored. If the read fails,
+	// hc stays zero-valued and the report carries empty condition messages.
+	hc := &hyperv1.HostedCluster{}
+	_ = cl.Get(ctx, crclient.ObjectKey{Namespace: cfg.namespace, Name: name}, hc)
+
+	degradedMsg := conditionMessage(hc, "Degraded")
+	cvSucceedingMsg := conditionMessage(hc, string(hyperv1.ClusterVersionSucceeding))
+	diagnostics := collectDiagnostics(ctx, cl, cfg.namespace, name, hc)
+
+	// NOTE(review): the template text is not visible in this view; confirm that
+	// it consumes all six arguments and escapes them for XML.
+	junitXML := fmt.Sprintf(`
+
+
+
+
+
+
+
+`, name, name, variant, degradedMsg, cvSucceedingMsg, diagnostics)
+
+	junitPath := filepath.Join(cfg.artifactDir, fmt.Sprintf("junit_hosted_cluster_%s.xml", name))
+	if err := os.WriteFile(junitPath, []byte(junitXML), 0600); err != nil {
+		log.Printf("WARNING: failed to write JUnit XML to %s: %v", junitPath, err)
+	} else {
+		log.Printf("Wrote JUnit failure XML to %s", junitPath)
+	}
+}
+
+// emitJUnitSuccess writes a passing report for the named cluster to
+// ARTIFACT_DIR/junit_hosted_cluster_<name>.xml. A write failure is logged as
+// a warning and otherwise ignored.
+func emitJUnitSuccess(cfg envConfig, name, variant string) {
+	// NOTE(review): the template text is not visible in this view; confirm that
+	// it consumes all three arguments.
+	junitXML := fmt.Sprintf(`
+
+
+
+
+
+
+
+`, name, name, variant)
+
+	junitPath := filepath.Join(cfg.artifactDir, fmt.Sprintf("junit_hosted_cluster_%s.xml", name))
+	if err := os.WriteFile(junitPath, []byte(junitXML), 0600); err != nil {
+		log.Printf("WARNING: failed to write JUnit XML to %s: %v", junitPath, err)
+	}
+}
+
+// conditionMessage returns the message of the first condition on hc whose
+// type equals condType, or "" when hc is nil or no such condition exists.
+func conditionMessage(hc *hyperv1.HostedCluster, condType string) string {
+	if hc == nil {
+		return ""
+	}
+	for _, cond := range hc.Status.Conditions {
+		if cond.Type == condType {
+			return cond.Message
+		}
+	}
+	return ""
+}
+
+// collectDiagnostics renders the HostedCluster conditions and, when it can be
+// read, the conditions of a NodePool as tab-separated text.
+func collectDiagnostics(ctx context.Context, cl crclient.WithWatch, namespace, name string, hc *hyperv1.HostedCluster) string {
+	var sb strings.Builder
+
+	if hc != nil && len(hc.Status.Conditions) > 0 {
+		sb.WriteString("HostedCluster conditions:\n")
+		for _, cond := range hc.Status.Conditions {
+			fmt.Fprintf(&sb, " %s\t%s\t%s\t%s\n", cond.Type, cond.Status, cond.Reason, cond.Message)
+		}
+	}
+
+	// Only the NodePool that shares the HostedCluster's name is inspected; a
+	// failed lookup just omits the NodePool section.
+	np := &hyperv1.NodePool{}
+	if err := cl.Get(ctx, crclient.ObjectKey{Namespace: namespace, Name: name}, np); err == nil {
+		sb.WriteString("NodePool conditions:\n")
+		for _, cond := range np.Status.Conditions {
+			fmt.Fprintf(&sb, " %s\t%s\t%s\t%s\n", cond.Type, cond.Status, cond.Reason, cond.Message)
+		}
+	}
+
+	return sb.String()
+}
+
+// mustGetenv returns the value of the environment variable key and terminates
+// the process via log.Fatalf when it is unset or empty.
+func mustGetenv(key string) string {
+	val := os.Getenv(key)
+	if val == "" {
+		log.Fatalf("%s environment variable is required", key)
+	}
+	return val
+}
+
+// envOrDefault returns the value of the environment variable key, or
+// defaultVal when it is unset or empty.
+func envOrDefault(key, defaultVal string) string {
+	if val := os.Getenv(key); val != "" {
+		return val
+	}
+	return defaultVal
+}
+
+// envOrDefaultInt returns the environment variable key parsed as an int. It
+// returns defaultVal when the variable is unset or empty, and also (after
+// logging a warning) when the value is not a valid integer.
+func envOrDefaultInt(key string, defaultVal int) int {
+	val := os.Getenv(key)
+	if val == "" {
+		return defaultVal
+	}
+	n, err := strconv.Atoi(val)
+	if err != nil {
+		log.Printf("WARNING: invalid integer for %s=%q, using default %d", key, val, defaultVal)
+		return defaultVal
+	}
+	return n
+}
diff --git a/test/e2e/v2/cmd/destroy-guests/main.go b/test/e2e/v2/cmd/destroy-guests/main.go
new file mode 100644
index 00000000000..5f317b16ab5
--- /dev/null
+++ b/test/e2e/v2/cmd/destroy-guests/main.go
@@ -0,0 +1,109 @@
+//go:build e2ev2
+
+/*
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// destroy-guests destroys all HostedClusters created by the v2 e2e
+// lifecycle tests. Cluster names are re-derived from PROW_JOB_ID
+// using the same sha256 hash logic as the create step. All clusters
+// are destroyed in parallel with best-effort semantics.
+// Platform selection is controlled by the HYPERSHIFT_PLATFORM
+// environment variable (default: "azure").
+package main
+
+import (
+	"fmt"
+	"log"
+	"os"
+	"os/exec"
+	"sync"
+
+	"github.com/openshift/hypershift/test/e2e/v2/lifecycle"
+)
+
+// clusterGracePeriod is passed to "hypershift destroy cluster" as
+// --cluster-grace-period.
+const clusterGracePeriod = "40m"
+
+// main destroys every cluster derived from PROW_JOB_ID in parallel. It exits
+// non-zero when PROW_JOB_ID is missing, the platform config cannot be built,
+// or any destroy command fails.
+func main() {
+	prowJobID := os.Getenv("PROW_JOB_ID")
+	if prowJobID == "" {
+		log.Fatal("PROW_JOB_ID is required")
+	}
+
+	sharedDir := os.Getenv("SHARED_DIR")
+
+	platform, err := lifecycle.NewPlatformConfig(os.Getenv("HYPERSHIFT_PLATFORM"), sharedDir)
+	if err != nil {
+		log.Fatalf("Failed to initialize platform config: %v", err)
+	}
+
+	hypershiftBin := os.Getenv("HYPERSHIFT_BINARY")
+	if hypershiftBin == "" {
+		hypershiftBin = "hypershift"
+	}
+
+	// Release images are irrelevant for teardown; only the variants are needed
+	// to re-derive the cluster names.
+	specs := platform.ClusterSpecs("", "")
+
+	log.Printf("Destroying %d clusters derived from PROW_JOB_ID=%s", len(specs), prowJobID)
+
+	var (
+		mu     sync.Mutex
+		failed bool
+		wg     sync.WaitGroup
+	)
+
+	for _, spec := range specs {
+		clusterName := lifecycle.DeriveClusterName(prowJobID, spec.Variant)
+		wg.Add(1)
+		// Name and variant are passed as arguments so the goroutine never
+		// depends on per-iteration loop variable semantics.
+		go func(clusterName, variant string) {
+			defer wg.Done()
+			if err := destroyCluster(hypershiftBin, clusterName, variant, platform); err != nil {
+				log.Printf("WARNING: Failed to destroy cluster %s (%s): %v", clusterName, variant, err)
+				log.Printf("ACTION REQUIRED: cloud resources for cluster %s may be orphaned and need manual cleanup (resource group, DNS records, etc.)", clusterName)
+				mu.Lock()
+				failed = true
+				mu.Unlock()
+			}
+		}(clusterName, spec.Variant)
+	}
+
+	wg.Wait()
+
+	if failed {
+		log.Fatal("One or more clusters failed to destroy")
+	}
+	log.Printf("All clusters destroyed successfully")
+}
+
+// destroyCluster runs "hypershift destroy cluster <platform>" for the named
+// cluster, streaming the command output to this process's stdout and stderr.
+// It returns an error wrapping the command failure, if any.
+func destroyCluster(hypershiftBin, name, variant string, platform lifecycle.PlatformConfig) error {
+	log.Printf("Destroying cluster %s (%s)", name, variant)
+
+	args := []string{
+		"destroy", "cluster", platform.Name(),
+		"--name=" + name,
+		"--cluster-grace-period=" + clusterGracePeriod,
+	}
+	args = append(args, platform.DestroyArgs()...)
+
+	log.Printf("Running: %s %v", hypershiftBin, args)
+
+	cmd := exec.Command(hypershiftBin, args...)
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	if err := cmd.Run(); err != nil {
+		return fmt.Errorf("hypershift destroy cluster %s failed for %s: %w", platform.Name(), name, err)
+	}
+
+	log.Printf("Finished destroying cluster: %s", name)
+	return nil
+}
diff --git a/test/e2e/v2/cmd/dump-guests/main.go b/test/e2e/v2/cmd/dump-guests/main.go
new file mode 100644
index 00000000000..3f0fec53999
--- /dev/null
+++ b/test/e2e/v2/cmd/dump-guests/main.go
@@ -0,0 +1,98 @@
+//go:build e2ev2
+
+/*
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// dump-guests collects diagnostic artifacts from all v2 e2e
+// HostedClusters in parallel. It shells out to the hypershift CLI
+// for each cluster; individual dump failures are only logged, so
+// they never block teardown. Missing required environment variables
+// or a platform configuration error are still fatal.
+// Platform selection is controlled by the HYPERSHIFT_PLATFORM
+// environment variable (default: "azure").
+package main
+
+import (
+	"flag"
+	"log"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"sync"
+
+	"github.com/openshift/hypershift/test/e2e/v2/lifecycle"
+)
+
+// main dumps every cluster derived from PROW_JOB_ID in parallel. Failures of
+// individual dumps are logged by dumpCluster and do not change the exit code.
+func main() {
+	// The sibling create and destroy tools read the CLI path from
+	// HYPERSHIFT_BINARY, so it is honored here as the flag default too; an
+	// explicit --hypershift-binary still takes precedence.
+	defaultBinary := os.Getenv("HYPERSHIFT_BINARY")
+	if defaultBinary == "" {
+		defaultBinary = "hypershift"
+	}
+	hypershiftBinary := flag.String("hypershift-binary", defaultBinary, "Path to the hypershift CLI binary")
+	flag.Parse()
+
+	prowJobID := os.Getenv("PROW_JOB_ID")
+	if prowJobID == "" {
+		log.Fatal("PROW_JOB_ID environment variable is required")
+	}
+	artifactDir := os.Getenv("ARTIFACT_DIR")
+	if artifactDir == "" {
+		log.Fatal("ARTIFACT_DIR environment variable is required")
+	}
+
+	sharedDir := os.Getenv("SHARED_DIR")
+	platform, err := lifecycle.NewPlatformConfig(os.Getenv("HYPERSHIFT_PLATFORM"), sharedDir)
+	if err != nil {
+		log.Fatalf("Failed to initialize platform config: %v", err)
+	}
+
+	// Only the variants matter here; they are used to re-derive cluster names.
+	specs := platform.ClusterSpecs("", "")
+	log.Printf("Dumping %d clusters derived from PROW_JOB_ID=%s", len(specs), prowJobID)
+
+	var wg sync.WaitGroup
+	for _, spec := range specs {
+		clusterName := lifecycle.DeriveClusterName(prowJobID, spec.Variant)
+		wg.Add(1)
+		go func(clusterName string) {
+			defer wg.Done()
+			dumpCluster(*hypershiftBinary, artifactDir, clusterName)
+		}(clusterName)
+	}
+	wg.Wait()
+
+	log.Println("All cluster dumps complete")
+}
+
+// dumpCluster runs "hypershift dump cluster" for clusterName, writing into
+// artifactDir/<clusterName>. Every failure is logged as a warning and
+// swallowed.
+func dumpCluster(hypershiftBinary, artifactDir, clusterName string) {
+	dumpDir := filepath.Join(artifactDir, clusterName)
+	if err := os.MkdirAll(dumpDir, 0755); err != nil {
+		log.Printf("WARNING: Failed to create artifact directory %s: %v", dumpDir, err)
+		return
+	}
+
+	args := []string{
+		"dump", "cluster",
+		"--artifact-dir=" + dumpDir,
+		"--dump-guest-cluster=true",
+		"--name=" + clusterName,
+	}
+
+	log.Printf("Dumping cluster %s -> %s", clusterName, dumpDir)
+	log.Printf("Running: %s %v", hypershiftBinary, args)
+
+	cmd := exec.Command(hypershiftBinary, args...)
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	if err := cmd.Run(); err != nil {
+		log.Printf("WARNING: Failed to dump cluster %s: %v", clusterName, err)
+		return
+	}
+
+	log.Printf("Successfully dumped cluster %s", clusterName)
+}
diff --git a/test/e2e/v2/cmd/run-tests/main.go b/test/e2e/v2/cmd/run-tests/main.go
new file mode 100644
index 00000000000..260b82a4359
--- /dev/null
+++ b/test/e2e/v2/cmd/run-tests/main.go
@@ -0,0 +1,177 @@
+//go:build e2ev2
+
+// run-tests dispatches the v2 e2e test suites in parallel, one per
+// pre-created hosted cluster. Test groups and label filters are
+// determined by the platform (HYPERSHIFT_PLATFORM env var).
+package main
+
+import (
+	"fmt"
+	"log"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"sync"
+
+	"github.com/openshift/hypershift/test/e2e/v2/lifecycle"
+)
+
+const (
+	testBinary           = "bin/test-e2e-v2" // test executable, relative to the working directory
+	clusterNS            = "clusters"        // exported as E2E_HOSTED_CLUSTER_NAMESPACE
+	defaultVerbose       = "false"           // EVENTUALLY_VERBOSE when unset
+	defaultGinkgoTimeout = "3h"              // --ginkgo.timeout when GINKGO_TIMEOUT is unset
+)
+
+// testResult captures the outcome of a single test group execution.
+type testResult struct {
+	name string
+	err  error
+}
+
+// main runs every test group of the platform's test matrix and exits non-zero
+// when at least one group failed. Parallel groups each get a goroutine; each
+// sequential group gets one goroutine that runs its steps in order and stops
+// at the first failing step.
+func main() {
+	log.SetFlags(log.LstdFlags)
+
+	sharedDir := requireEnv("SHARED_DIR")
+	artifactDir := requireEnv("ARTIFACT_DIR")
+	releaseImage := os.Getenv("RELEASE_IMAGE_LATEST")
+
+	eventuallyVerbose := os.Getenv("EVENTUALLY_VERBOSE")
+	if eventuallyVerbose == "" {
+		eventuallyVerbose = defaultVerbose
+	}
+	// Child test processes inherit this through os.Environ, so a failure to
+	// set it must not pass silently.
+	if err := os.Setenv("EVENTUALLY_VERBOSE", eventuallyVerbose); err != nil {
+		log.Fatalf("Failed to set EVENTUALLY_VERBOSE: %v", err)
+	}
+
+	platform, err := lifecycle.NewPlatformConfig(os.Getenv("HYPERSHIFT_PLATFORM"), sharedDir)
+	if err != nil {
+		log.Fatalf("Failed to initialize platform config: %v", err)
+	}
+
+	// Let the platform set up any env vars it needs for tests.
+	platform.SetupTestEnv(sharedDir)
+
+	matrix := platform.TestMatrix(releaseImage)
+
+	var (
+		mu      sync.Mutex
+		results []testResult
+		wg      sync.WaitGroup
+	)
+
+	// Launch parallel test groups.
+	for _, g := range matrix.Parallel {
+		g := g
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			clusterName := readClusterName(sharedDir, g.ClusterFile)
+			log.Printf("Running %s tests against %s...", g.Name, clusterName)
+			err := runTestBinary(clusterName, g.LabelFilter, g.Skip,
+				filepath.Join(artifactDir, g.JUnitFile), g.ExtraEnv)
+			mu.Lock()
+			results = append(results, testResult{name: g.Name, err: err})
+			mu.Unlock()
+			if err != nil {
+				log.Printf("%s tests FAILED: %v", g.Name, err)
+			} else {
+				log.Printf("%s tests PASSED", g.Name)
+			}
+		}()
+	}
+
+	// Launch sequential groups (each group runs in its own goroutine,
+	// but steps within a group run one after another).
+	for _, sg := range matrix.Sequential {
+		sg := sg
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for i, step := range sg.Steps {
+				clusterName := readClusterName(sharedDir, step.ClusterFile)
+				log.Printf("Running %s tests against %s...", step.Name, clusterName)
+				err := runTestBinary(clusterName, step.LabelFilter, step.Skip,
+					filepath.Join(artifactDir, step.JUnitFile), step.ExtraEnv)
+				mu.Lock()
+				results = append(results, testResult{name: step.Name, err: err})
+				mu.Unlock()
+				if err != nil {
+					log.Printf("%s tests FAILED: %v — skipping remaining steps in %s", step.Name, err, sg.Name)
+					return
+				}
+				log.Printf("%s tests PASSED", step.Name)
+				if i < len(sg.Steps)-1 {
+					log.Printf("Continuing to next step in %s...", sg.Name)
+				}
+			}
+		}()
+	}
+
+	log.Println("Waiting for all test suites to complete...")
+	wg.Wait()
+
+	// Summarize and exit.
+	failed := 0
+	for _, r := range results {
+		if r.err != nil {
+			log.Printf("FAIL: %s — %v", r.name, r.err)
+			failed++
+		} else {
+			log.Printf("PASS: %s", r.name)
+		}
+	}
+	if failed > 0 {
+		log.Fatalf("%d test group(s) failed", failed)
+	}
+	log.Println("All test groups passed")
+}
+
+func runTestBinary(clusterName, labelFilter, skip, junitPath string, extraEnv []string) error {
+	ginkgoTimeout := os.Getenv("GINKGO_TIMEOUT")
+	if ginkgoTimeout == "" {
+		ginkgoTimeout = defaultGinkgoTimeout
+	}
+
+	args := []string{
+		fmt.Sprintf("--ginkgo.label-filter=%s", labelFilter),
+		fmt.Sprintf("--ginkgo.junit-report=%s", junitPath),
+		fmt.Sprintf("--ginkgo.timeout=%s", ginkgoTimeout),
+		"--ginkgo.v",
+	}
+	if skip != "" {
+		args = append(args, fmt.Sprintf("--ginkgo.skip=%s", skip))
+	}
+
+	cmd := exec.Command(testBinary, args...)
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+
+	cmd.Env = append(os.Environ(),
+		fmt.Sprintf("E2E_HOSTED_CLUSTER_NAME=%s", clusterName),
+		fmt.Sprintf("E2E_HOSTED_CLUSTER_NAMESPACE=%s", clusterNS),
+	)
+	cmd.Env = append(cmd.Env, extraEnv...)
+ + return cmd.Run() +} + +func readClusterName(sharedDir, filename string) string { + path := filepath.Join(sharedDir, filename) + data, err := os.ReadFile(path) + if err != nil { + log.Fatalf("Failed to read cluster name from %s: %v", path, err) + } + name := strings.TrimSpace(string(data)) + if name == "" { + log.Fatalf("Cluster name file %s is empty", path) + } + return name +} + +func requireEnv(key string) string { + val := os.Getenv(key) + if val == "" { + log.Fatalf("Required environment variable %s is not set", key) + } + return val +} diff --git a/test/e2e/v2/internal/env_vars.go b/test/e2e/v2/internal/env_vars.go index 5e28d5dedd7..f5cbd4b7cba 100644 --- a/test/e2e/v2/internal/env_vars.go +++ b/test/e2e/v2/internal/env_vars.go @@ -171,4 +171,35 @@ func init() { "Comma-separated list of Azure subscription IDs permitted to create Private Endpoints.", false, ) + // Release image env vars for lifecycle tests + RegisterEnvVar( + "E2E_LATEST_RELEASE_IMAGE", + "Latest OCP release image for control plane upgrade tests.", + false, + ) + RegisterEnvVar( + "E2E_PREVIOUS_RELEASE_IMAGE", + "N-1 OCP release image (previous minor) for control plane upgrade tests.", + false, + ) + RegisterEnvVar( + "E2E_N1_RELEASE_IMAGE", + "N-1 minor release image for NodePool previous-release tests.", + false, + ) + RegisterEnvVar( + "E2E_N2_RELEASE_IMAGE", + "N-2 minor release image for NodePool previous-release tests.", + false, + ) + RegisterEnvVar( + "E2E_AZURE_CREDENTIALS_FILE", + "Path to Azure service principal credentials JSON file for platform-specific tests (auto-repair, disk encryption).", + false, + ) + RegisterEnvVar( + "E2E_AZURE_DISK_ENCRYPTION_SET_ID", + "Azure DiskEncryptionSet resource ID for disk encryption NodePool tests.", + false, + ) } diff --git a/test/e2e/v2/internal/test_context.go b/test/e2e/v2/internal/test_context.go index cba6f176bab..91cf144240e 100644 --- a/test/e2e/v2/internal/test_context.go +++ b/test/e2e/v2/internal/test_context.go @@ -108,6 +108,8 
@@ func (tc *TestContext) GetHostedClusterClient() crclient.Client { if err != nil { panic(fmt.Sprintf("failed to create REST config from kubeconfig: %v", err)) } + restConfig.QPS = 200 + restConfig.Burst = 300 client, err := crclient.New(restConfig, crclient.Options{Scheme: hyperapi.Scheme}) if err != nil { diff --git a/test/e2e/v2/lifecycle/azure.go b/test/e2e/v2/lifecycle/azure.go new file mode 100644 index 00000000000..47c1d6e1495 --- /dev/null +++ b/test/e2e/v2/lifecycle/azure.go @@ -0,0 +1,259 @@ +//go:build e2ev2 + +package lifecycle + +import ( + "context" + "fmt" + "log" + "os" + "path/filepath" + "strings" + + operatorv1 "github.com/openshift/api/operator/v1" + hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" + crclient "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + defaultAzureCreds = "/etc/hypershift-ci-jobs-self-managed-azure/credentials.json" + defaultAzureLocation = "centralus" + defaultAzureDNSZoneRG = "os4-common" + + defaultOIDCIssuerURL = "https://smazure.blob.core.windows.net/smazure" + defaultSATokenKeyPath = "/etc/hypershift-ci-jobs-self-managed-azure-e2e/serviceaccount-signer.private" + defaultWorkloadIdentities = "/etc/hypershift-ci-jobs-self-managed-azure-e2e/workload-identities.json" +) + +// AzurePlatformConfig holds Azure-specific configuration for the +// hypershift CLI. +type AzurePlatformConfig struct { + creds string + location string + oidcIssuerURL string + saTokenKeyPath string + workloadIdentities string + dnsZoneRG string + privateNATSubnetID string + sharedDir string + + marketplacePublisher string + marketplaceOffer string + marketplaceSKU string + marketplaceVersion string +} + +// NewAzurePlatformConfig reads Azure-specific configuration from +// environment variables with CI defaults. 
+func NewAzurePlatformConfig(sharedDir string) *AzurePlatformConfig {
+	// fromSharedDir returns the trimmed contents of a file under
+	// sharedDir and whether it could be read. Nothing is read when
+	// sharedDir is empty.
+	fromSharedDir := func(name string) (string, bool) {
+		if sharedDir == "" {
+			return "", false
+		}
+		raw, err := os.ReadFile(filepath.Join(sharedDir, name))
+		if err != nil {
+			return "", false
+		}
+		return strings.TrimSpace(string(raw)), true
+	}
+
+	cfg := new(AzurePlatformConfig)
+	cfg.sharedDir = sharedDir
+	cfg.dnsZoneRG = defaultAzureDNSZoneRG
+
+	// Settings that fall back to a CI default.
+	cfg.creds = envOrDefault("AZURE_CREDS", defaultAzureCreds)
+	cfg.location = envOrDefault("HYPERSHIFT_AZURE_LOCATION", defaultAzureLocation)
+	cfg.oidcIssuerURL = envOrDefault("AZURE_OIDC_ISSUER_URL", defaultOIDCIssuerURL)
+	cfg.saTokenKeyPath = envOrDefault("AZURE_SA_TOKEN_ISSUER_KEY_PATH", defaultSATokenKeyPath)
+	cfg.workloadIdentities = envOrDefault("AZURE_WORKLOAD_IDENTITIES_FILE", defaultWorkloadIdentities)
+
+	// Marketplace image settings have no default.
+	cfg.marketplacePublisher = os.Getenv("HYPERSHIFT_AZURE_MARKETPLACE_IMAGE_PUBLISHER")
+	cfg.marketplaceOffer = os.Getenv("HYPERSHIFT_AZURE_MARKETPLACE_IMAGE_OFFER")
+	cfg.marketplaceSKU = os.Getenv("HYPERSHIFT_AZURE_MARKETPLACE_IMAGE_SKU")
+	cfg.marketplaceVersion = os.Getenv("HYPERSHIFT_AZURE_MARKETPLACE_IMAGE_VERSION")
+
+	// The NAT subnet ID comes from the environment first and from the
+	// SHARED_DIR file only as a fallback.
+	cfg.privateNATSubnetID = os.Getenv("AZURE_PRIVATE_NAT_SUBNET_ID")
+	if cfg.privateNATSubnetID == "" {
+		if id, ok := fromSharedDir("azure_private_nat_subnet_id"); ok {
+			cfg.privateNATSubnetID = id
+		}
+	}
+	if cfg.privateNATSubnetID == "" {
+		log.Printf("WARNING: AZURE_PRIVATE_NAT_SUBNET_ID is not set; private cluster creation will fail")
+	}
+
+	// SKU and version fall back to SHARED_DIR files, but only when a
+	// marketplace publisher was configured.
+	if cfg.marketplacePublisher != "" {
+		if cfg.marketplaceSKU == "" {
+			if sku, ok := fromSharedDir("azure-marketplace-image-sku"); ok {
+				cfg.marketplaceSKU = sku
+			}
+		}
+		if cfg.marketplaceVersion == "" {
+			if version, ok := fromSharedDir("azure-marketplace-image-version"); ok {
+				cfg.marketplaceVersion = version
+			}
+		}
+	}
+
+	return cfg
+}
+
+// Name returns the platform identifier "azure".
+func (a *AzurePlatformConfig) Name() string {
+	return "azure"
+}
+
+// DefaultBaseDomain returns the base domain used for Azure clusters.
+func (a *AzurePlatformConfig) DefaultBaseDomain() string {
+	const baseDomain = "hcp-sm-azure.azure.devcluster.openshift.com"
+	return baseDomain
+}
+
+// ClusterSpecs lists the five cluster variants created for Azure:
+// public, private, oauth-lb, upgrade and autoscaling. Only the upgrade
+// variant overrides the release image (with n1Image); releaseImage is
+// not used by this implementation.
+func (a *AzurePlatformConfig) ClusterSpecs(releaseImage, n1Image string) []ClusterSpec {
+	publicSpec := ClusterSpec{
+		Variant:    "public",
+		OutputFile: "cluster-name-public",
+	}
+	privateSpec := ClusterSpec{
+		Variant:    "private",
+		OutputFile: "cluster-name-private",
+		ExtraArgs: []string{
+			"--endpoint-access=Private",
+			"--endpoint-access-private-nat-subnet-id=" + a.privateNATSubnetID,
+		},
+	}
+	oauthLBSpec := ClusterSpec{
+		Variant:    "oauth-lb",
+		OutputFile: "cluster-name-oauth-lb",
+		ExtraArgs:  []string{"--oauth-publishing-strategy=LoadBalancer"},
+	}
+	upgradeSpec := ClusterSpec{
+		Variant:      "upgrade",
+		OutputFile:   "cluster-name-upgrade",
+		ReleaseImage: n1Image,
+		ExtraArgs:    []string{"--control-plane-availability-policy=HighlyAvailable"},
+	}
+	autoscalingSpec := ClusterSpec{
+		Variant:    "autoscaling",
+		OutputFile: "cluster-name-autoscaling",
+	}
+
+	return []ClusterSpec{publicSpec, privateSpec, oauthLBSpec, upgradeSpec, autoscalingSpec}
+}
+
+// CreateArgs returns the Azure flags shared by every cluster creation.
+// Marketplace flags are added only when a publisher is configured; the
+// SKU and version flags are further omitted when their value is empty.
+func (a *AzurePlatformConfig) CreateArgs() []string {
+	args := []string{
+		"--azure-creds=" + a.creds,
+		"--location=" + a.location,
+		"--oidc-issuer-url=" + a.oidcIssuerURL,
+		"--sa-token-issuer-private-key-path=" + a.saTokenKeyPath,
+		"--workload-identities-file=" + a.workloadIdentities,
+		"--assign-service-principal-roles",
+		"--dns-zone-rg-name=" + a.dnsZoneRG,
+	}
+
+	if a.marketplacePublisher == "" {
+		return args
+	}
+
+	args = append(args,
+		"--marketplace-publisher="+a.marketplacePublisher,
+		"--marketplace-offer="+a.marketplaceOffer,
+	)
+	if a.marketplaceSKU != "" {
+		args = append(args, "--marketplace-sku="+a.marketplaceSKU)
+	}
+	if a.marketplaceVersion != "" {
+		args = append(args, "--marketplace-version="+a.marketplaceVersion)
+	}
+	return args
+}
+
+// PostCreate patches the public cluster's OperatorConfiguration with
+// an IngressOperator using an internal LoadBalancer. This is specific
+// to Azure self-managed testing.
+func (a *AzurePlatformConfig) PostCreate(ctx context.Context, cl crclient.WithWatch, namespace string, clusterNames map[string]string) error {
+	// clusterNames is looked up by the "cluster-name-public" key; without
+	// that entry there is nothing to patch.
+	publicName, ok := clusterNames["cluster-name-public"]
+	if !ok {
+		return nil
+	}
+
+	hc := &hyperv1.HostedCluster{}
+	if err := cl.Get(ctx, crclient.ObjectKey{Namespace: namespace, Name: publicName}, hc); err != nil {
+		return fmt.Errorf("getting HostedCluster %s/%s: %w", namespace, publicName, err)
+	}
+
+	// Snapshot the object before mutating it so the patch carries only
+	// the difference.
+	patch := crclient.MergeFrom(hc.DeepCopy())
+	if hc.Spec.OperatorConfiguration == nil {
+		hc.Spec.OperatorConfiguration = &hyperv1.OperatorConfiguration{}
+	}
+	hc.Spec.OperatorConfiguration.IngressOperator = &hyperv1.IngressOperatorSpec{
+		EndpointPublishingStrategy: &operatorv1.EndpointPublishingStrategy{
+			Type: operatorv1.LoadBalancerServiceStrategyType,
+			LoadBalancer: &operatorv1.LoadBalancerStrategy{
+				Scope: operatorv1.InternalLoadBalancer,
+			},
+		},
+	}
+	if err := cl.Patch(ctx, hc, patch); err != nil {
+		return fmt.Errorf("patching HostedCluster %s/%s OperatorConfiguration: %w", namespace, publicName, err)
+	}
+	log.Printf("Patched public cluster %s/%s with OperatorConfiguration", namespace, publicName)
+	return nil
+}
+
+// TestMatrix returns the Azure test groups. Four groups run in
+// parallel, each against its own cluster. The "upgrade-and-chaos"
+// sequence runs the control plane upgrade and then the etcd chaos tests
+// against the same upgrade cluster. releaseImage is handed to the
+// upgrade step as E2E_LATEST_RELEASE_IMAGE.
+func (a *AzurePlatformConfig) TestMatrix(releaseImage string) TestMatrix {
+	return TestMatrix{
+		Parallel: []TestGroup{
+			{
+				Name:        "public",
+				ClusterFile: "cluster-name-public",
+				LabelFilter: "self-managed-azure-public || nodepool-lifecycle",
+				Skip:        "KAS allowed CIDRs",
+				JUnitFile:   "junit_self_managed_azure_public.xml",
+			},
+			{
+				Name:        "private",
+				ClusterFile: "cluster-name-private",
+				LabelFilter: "self-managed-azure-private",
+				JUnitFile:   "junit_self_managed_azure_private.xml",
+			},
+			{
+				Name:        "oauth-lb",
+				ClusterFile: "cluster-name-oauth-lb",
+				LabelFilter: "self-managed-azure-oauth-lb",
+				JUnitFile:   "junit_self_managed_azure_oauth_lb.xml",
+			},
+			{
+				Name:        "autoscaling",
+				ClusterFile: "cluster-name-autoscaling",
+				LabelFilter: "nodepool-autoscaling",
+				JUnitFile:   "junit_nodepool_autoscaling.xml",
+			},
+		},
+		Sequential: []SequentialGroup{
+			{
+				Name: "upgrade-and-chaos",
+				Steps: []TestGroup{
+					{
+						Name:        "upgrade",
+						ClusterFile: "cluster-name-upgrade",
+						LabelFilter: "control-plane-upgrade",
+						JUnitFile:   "junit_lifecycle_upgrade.xml",
+						ExtraEnv:    []string{fmt.Sprintf("E2E_LATEST_RELEASE_IMAGE=%s", releaseImage)},
+					},
+					{
+						Name:        "etcd-chaos",
+						ClusterFile: "cluster-name-upgrade",
+						LabelFilter: "etcd-chaos",
+						JUnitFile:   "junit_lifecycle_etcd_chaos.xml",
+					},
+				},
+			},
+		},
+	}
+}
+
+// SetupTestEnv exports AZURE_PRIVATE_NAT_SUBNET_ID for the test
+// subprocesses.
+//
+// The value is resolved with the same precedence NewAzurePlatformConfig
+// uses: an explicit environment variable wins, and the SHARED_DIR file
+// "azure_private_nat_subnet_id" is only a fallback. That keeps the
+// create and test phases on the same subnet when both sources exist,
+// and stops an empty file from blanking out a variable that was set.
+// When neither source yields a value the environment is left untouched.
+func (a *AzurePlatformConfig) SetupTestEnv(sharedDir string) {
+	subnetID := os.Getenv("AZURE_PRIVATE_NAT_SUBNET_ID")
+	if subnetID == "" && sharedDir != "" {
+		if data, err := os.ReadFile(filepath.Join(sharedDir, "azure_private_nat_subnet_id")); err == nil {
+			subnetID = strings.TrimSpace(string(data))
+		}
+	}
+	if subnetID == "" {
+		return
+	}
+	if err := os.Setenv("AZURE_PRIVATE_NAT_SUBNET_ID", subnetID); err != nil {
+		log.Printf("WARNING: failed to set AZURE_PRIVATE_NAT_SUBNET_ID: %v", err)
+	}
+}
+
+// DestroyArgs returns the Azure flags passed when destroying a cluster:
+// credentials, location and the DNS zone resource group.
+func (a *AzurePlatformConfig) DestroyArgs() []string {
+	return []string{
+		"--azure-creds=" + a.creds,
+		"--location=" + a.location,
+		"--dns-zone-rg-name=" + a.dnsZoneRG,
+	}
+}
+
+// envOrDefault returns the value of the environment variable key, or
+// defaultVal when the variable is unset or empty.
+func envOrDefault(key, defaultVal string) string {
+	if val := os.Getenv(key); val != "" {
+		return val
+	}
+	return defaultVal
+}
diff --git a/test/e2e/v2/lifecycle/platform.go b/test/e2e/v2/lifecycle/platform.go
new file mode 100644
index 00000000000..a7ba5a1ebc7
--- /dev/null
+++ b/test/e2e/v2/lifecycle/platform.go
@@ -0,0 +1,105 @@
+//go:build e2ev2
+
+package lifecycle
+
+import (
+	"context"
+	"crypto/sha256"
+	"fmt"
+
+	crclient "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// ClusterSpec describes a single cluster to create for lifecycle tests.
+type ClusterSpec struct {
+	// Variant names the cluster flavor; DeriveClusterName uses it as
+	// the cluster name prefix.
+	Variant      string
+	OutputFile   string // filename under SHARED_DIR
+	ExtraArgs    []string
+	ReleaseImage string // override (empty = use default)
+}
+
+// TestGroup describes one logical group of e2e tests to execute.
+type TestGroup struct {
+	Name        string
+	ClusterFile string // filename under SHARED_DIR containing cluster name
+	// LabelFilter and Skip are forwarded to the test binary as
+	// --ginkgo.label-filter and --ginkgo.skip; an empty Skip omits the flag.
+	LabelFilter string
+	Skip        string
+	// JUnitFile is a file name relative to the artifact directory.
+	JUnitFile   string
+	// ExtraEnv holds additional "KEY=value" entries for the test process.
+	ExtraEnv    []string
+}
+
+// SequentialGroup runs its Steps one after another within a single
+// goroutine. If any step fails, subsequent steps are skipped.
+type SequentialGroup struct {
+	Name  string
+	Steps []TestGroup
+}
+
+// TestMatrix defines the full set of test groups for a platform.
+// Parallel groups all run concurrently. Each SequentialGroup also
+// runs concurrently with everything else, but its internal Steps
+// run one after another.
+type TestMatrix struct {
+	Parallel   []TestGroup
+	Sequential []SequentialGroup
+}
+
+// PlatformConfig provides all platform-specific configuration for
+// the v2 lifecycle binaries. Adding a new platform means implementing
+// this interface — the cmd binaries should not need modification.
+type PlatformConfig interface {
+	// Name returns the CLI subcommand name (e.g., "azure", "aws").
+	Name() string
+
+	// DefaultBaseDomain returns the platform's default base domain.
+	DefaultBaseDomain() string
+
+	// ClusterSpecs returns the cluster variants this platform creates.
+	// The releaseImage and n1Image are the current and N-1 release
+	// images from the CI environment.
+	ClusterSpecs(releaseImage, n1Image string) []ClusterSpec
+
+	// CreateArgs returns platform-specific args for
+	// "hypershift create cluster {platform}".
+	CreateArgs() []string
+
+	// PostCreate runs platform-specific setup after clusters are
+	// created (e.g., patching OperatorConfiguration).
+	PostCreate(ctx context.Context, cl crclient.WithWatch, namespace string, clusterNames map[string]string) error
+
+	// TestMatrix returns the test groups for this platform.
+	TestMatrix(releaseImage string) TestMatrix
+
+	// SetupTestEnv sets platform-specific environment variables
+	// before test execution (e.g., reading subnet IDs from
+	// SHARED_DIR files).
+	SetupTestEnv(sharedDir string)
+
+	// DestroyArgs returns platform-specific args for
+	// "hypershift destroy cluster {platform}".
+	DestroyArgs() []string
+
+}
+
+// NewPlatformConfig creates a PlatformConfig for the given platform
+// name. The sharedDir is passed for platforms that read fallback
+// config from files.
+//
+// An empty platform name selects Azure; any other unknown name is an
+// error.
+func NewPlatformConfig(platform, sharedDir string) (PlatformConfig, error) {
+	switch platform {
+	case "azure", "":
+		return NewAzurePlatformConfig(sharedDir), nil
+	default:
+		return nil, fmt.Errorf("unsupported platform %q (supported: azure)", platform)
+	}
+}
+
+// DeriveClusterName builds a human-readable, deterministic cluster name
+// from the prow job ID and cluster variant. The format is
+// "{variant}-{hash10}" where hash10 is the first 10 hex characters of
+// SHA-256(prowJobID), giving uniqueness per CI run while keeping the
+// variant visible in artifacts and namespaces.
+func DeriveClusterName(prowJobID, variant string) string {
+	hash := sha256.Sum256([]byte(prowJobID))
+	// %x renders the whole digest as lowercase hex; only the first 10
+	// characters are kept.
+	return variant + "-" + fmt.Sprintf("%x", hash)[:10]
+}
+
diff --git a/test/e2e/v2/tests/control_plane_upgrade_test.go b/test/e2e/v2/tests/control_plane_upgrade_test.go
new file mode 100644
index 00000000000..a2387cfae05
--- /dev/null
+++ b/test/e2e/v2/tests/control_plane_upgrade_test.go
@@ -0,0 +1,93 @@
+//go:build e2ev2
+
+/*
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package tests
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1"
+	e2eutil "github.com/openshift/hypershift/test/e2e/util"
+	"github.com/openshift/hypershift/test/e2e/v2/internal"
+
+	crclient "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// ControlPlaneUpgradeTest upgrades the hosted cluster from N-1 to the latest release image.
+//
+// The target image is read from E2E_LATEST_RELEASE_IMAGE and the spec
+// fails when it is empty. The spec sets spec.release.image together with
+// the force-upgrade annotation, then waits in turn for the control plane
+// component rollout, the control plane version rollout and the data
+// plane rollout.
+func ControlPlaneUpgradeTest(getTestCtx internal.TestContextGetter) {
+	It("should upgrade the control plane from N-1 to latest", func() {
+		testCtx := getTestCtx()
+		ctx := testCtx.Context
+		hc := testCtx.GetHostedCluster()
+		Expect(hc).NotTo(BeNil(), "hosted cluster should be available")
+
+		latestImage := internal.GetEnvVarValue("E2E_LATEST_RELEASE_IMAGE")
+		Expect(latestImage).NotTo(BeEmpty(), "E2E_LATEST_RELEASE_IMAGE must be set for upgrade tests")
+
+		// startingVersion stays empty when the cluster reports no version
+		// history yet.
+		// NOTE(review): assumes History[0] is the most recent entry, i.e.
+		// the version running before the upgrade — confirm.
+		var startingVersion string
+		if hc.Status.Version != nil && len(hc.Status.Version.History) > 0 {
+			startingVersion = hc.Status.Version.History[0].Version
+		}
+		GinkgoWriter.Printf("Starting upgrade from version %s to image %s\n", startingVersion, latestImage)
+
+		err := e2eutil.UpdateObject(GinkgoTB(), ctx, testCtx.MgmtClient, hc, func(obj *hyperv1.HostedCluster) {
+			obj.Spec.Release.Image = latestImage
+			if obj.Annotations == nil {
+				obj.Annotations = make(map[string]string)
+			}
+			obj.Annotations[hyperv1.ForceUpgradeToAnnotation] = latestImage
+		})
+		Expect(err).NotTo(HaveOccurred(), "failed to update hosted cluster release image")
+
+		// NOTE(review): GinkgoAtLeast presumably skips the spec on older
+		// versions. Both calls below run after the HostedCluster has already
+		// been mutated above — confirm that a skip at this point is intended.
+		By("Waiting for control plane components to complete rollout")
+		e2eutil.GinkgoAtLeast(e2eutil.Version420)
+		e2eutil.WaitForControlPlaneComponentRollout(GinkgoTB(), ctx, testCtx.MgmtClient, hc, startingVersion)
+
+		By("Waiting for control plane version to complete rollout")
+		e2eutil.GinkgoAtLeast(e2eutil.Version422)
+		e2eutil.WaitForControlPlaneRollout(GinkgoTB(), ctx, testCtx.MgmtClient, hc)
+
+		By("Waiting for data plane rollout to complete")
+		e2eutil.WaitForDataPlaneRollout(GinkgoTB(), ctx, testCtx.MgmtClient, hc)
+
+		// Re-fetch HC after upgrade
+		Expect(testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(hc), hc)).To(Succeed())
+
+		// TODO: Add post-upgrade validation checks once the Ensure* functions
+		// in e2eutil are refactored from *testing.T to testing.TB:
+		// - EnsureFeatureGateStatus
+		// - EnsureNodeCountMatchesNodePoolReplicas
+		// - EnsureNoCrashingPods
+		// - EnsureMachineDeploymentGeneration
+	})
+}
+
+// RegisterControlPlaneUpgradeTests registers all control plane upgrade tests.
+func RegisterControlPlaneUpgradeTests(getTestCtx internal.TestContextGetter) {
+	ControlPlaneUpgradeTest(getTestCtx)
+}
+
+// The specs are registered while the tree is built, before BeforeEach
+// has run, so they receive a getter that reads testCtx lazily.
+var _ = Describe("Control Plane Upgrade", Label("lifecycle", "control-plane-upgrade"), func() {
+	var testCtx *internal.TestContext
+
+	BeforeEach(func() {
+		testCtx = internal.GetTestContext()
+		Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+	})
+
+	RegisterControlPlaneUpgradeTests(func() *internal.TestContext { return testCtx })
+})
diff --git a/test/e2e/v2/tests/etcd_chaos_test.go b/test/e2e/v2/tests/etcd_chaos_test.go
new file mode 100644
index 00000000000..e10a15b5561
--- /dev/null
+++ b/test/e2e/v2/tests/etcd_chaos_test.go
@@ -0,0 +1,447 @@
+//go:build e2ev2
+
+/*
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package tests
+
+import (
+	"context"
+	"fmt"
+	"math/rand"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/google/go-cmp/cmp"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	cpomanifests "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/manifests"
+	etcdrecoverymanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/etcdrecovery"
+	e2eutil "github.com/openshift/hypershift/test/e2e/util"
+	"github.com/openshift/hypershift/test/e2e/v2/internal"
+
+	appsv1 "k8s.io/api/apps/v1"
+	batchv1 "k8s.io/api/batch/v1"
+	corev1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/utils/ptr"
+	crclient "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// RegisterEtcdChaosTests registers all etcd chaos test cases.
+func RegisterEtcdChaosTests(getTestCtx internal.TestContextGetter) {
+	// The container that calls this is declared Ordered, so the
+	// registration order here is the order the specs are defined in.
+	EtcdSingleMemberRecoveryTest(getTestCtx)
+	EtcdKillRandomMembersTest(getTestCtx)
+	EtcdKillAllMembersTest(getTestCtx)
+	EtcdSingleMemberCorruptionTest(getTestCtx)
+	EtcdMissingMemberRecoveryTest(getTestCtx)
+}
+
+// The specs are registered while the tree is built, before BeforeAll
+// has run, so they receive a getter that reads testCtx lazily.
+var _ = Describe("Etcd Chaos", Label("lifecycle", "etcd-chaos"), Ordered, func() {
+	var testCtx *internal.TestContext
+
+	BeforeAll(func() {
+		testCtx = internal.GetTestContext()
+		Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+	})
+
+	RegisterEtcdChaosTests(func() *internal.TestContext { return testCtx })
+})
+
+// EtcdSingleMemberRecoveryTest deletes one random etcd pod and its PVC simultaneously,
+// then verifies the pod is replaced (different UID) and the StatefulSet converges.
+func EtcdSingleMemberRecoveryTest(getTestCtx internal.TestContextGetter) {
+	It("should recover after a single member loses its data", func() {
+		testCtx := getTestCtx()
+		ctx := testCtx.Context
+		cpNamespace := testCtx.ControlPlaneNamespace
+
+		etcdSts, etcdPods := getEtcdStsAndPods(ctx, testCtx.MgmtClient, cpNamespace)
+
+		randomPod := randomEtcdPods(etcdPods.Items, 1)[0]
+		originalUID := randomPod.UID
+		// The PVC name is the pod name with "etcd" replaced by "data-etcd",
+		// e.g. pod "etcd-0" -> PVC "data-etcd-0".
+		pvcName := "data-etcd" + strings.TrimPrefix(randomPod.Name, "etcd")
+		pvc := &corev1.PersistentVolumeClaim{
+			ObjectMeta: metav1.ObjectMeta{Name: pvcName, Namespace: cpNamespace},
+		}
+
+		GinkgoWriter.Printf("Deleting etcd pod %s and PVC %s\n", randomPod.Name, pvcName)
+
+		// Issue both deletions concurrently. GinkgoRecover is deferred first
+		// so it runs last, after wg.Done, and turns a failed Expect inside
+		// the goroutine into a spec failure.
+		var wg sync.WaitGroup
+		wg.Add(2)
+		go func() {
+			defer GinkgoRecover()
+			defer wg.Done()
+			Expect(testCtx.MgmtClient.Delete(ctx, &randomPod)).To(Succeed(), "failed to delete etcd pod %s", randomPod.Name)
+			GinkgoWriter.Printf("Deleted etcd pod %s\n", randomPod.Name)
+		}()
+		go func() {
+			defer GinkgoRecover()
+			defer wg.Done()
+			Expect(testCtx.MgmtClient.Delete(ctx, pvc)).To(Succeed(), "failed to delete etcd PVC %s", pvcName)
+			GinkgoWriter.Printf("Deleted etcd PVC %s\n", pvcName)
+		}()
+		wg.Wait()
+
+		// Verify pod is replaced with a new UID
+		e2eutil.EventuallyObject(GinkgoTB(), ctx, "deleted etcd pod is replaced",
+			func(ctx context.Context) (*corev1.Pod, error) {
+				pod := &corev1.Pod{}
+				err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(&randomPod), pod)
+				return pod, err
+			},
+			[]e2eutil.Predicate[*corev1.Pod]{func(pod *corev1.Pod) (bool, string, error) {
+				return originalUID != pod.UID, fmt.Sprintf("pod UID %s", pod.UID), nil
+			}},
+			e2eutil.WithInterval(5*time.Second),
+			e2eutil.WithTimeout(30*time.Minute),
+		)
+
+		waitForEtcdConvergence(ctx, testCtx.MgmtClient, cpNamespace, ptr.Deref(etcdSts.Spec.Replicas, 0))
+	})
+}
+
+// EtcdKillRandomMembersTest creates a marker ConfigMap in the hosted cluster,
+// deletes random etcd pods every 5 seconds for 30 seconds, then verifies
+// StatefulSet convergence and that the marker data survived.
+func EtcdKillRandomMembersTest(getTestCtx internal.TestContextGetter) {
+	It("should preserve data when random members are repeatedly killed", func() {
+		testCtx := getTestCtx()
+		ctx := testCtx.Context
+		cpNamespace := testCtx.ControlPlaneNamespace
+
+		guestClient := testCtx.GetHostedClusterClient()
+		Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		// Create marker data that should survive the chaos
+		markerCM := createMarkerConfigMap(ctx, guestClient)
+		DeferCleanup(func() {
+			if err := guestClient.Delete(ctx, markerCM); err != nil && !apierrors.IsNotFound(err) {
+				GinkgoWriter.Printf("Warning: failed to cleanup marker ConfigMap: %v\n", err)
+			}
+		})
+
+		etcdSts, etcdPods := getEtcdStsAndPods(ctx, testCtx.MgmtClient, cpNamespace)
+
+		// Delete random etcd pods every 5s for 30s
+		// Victims are drawn from the pod list captured above; the list is
+		// not refreshed between iterations. A failed deletion is only
+		// logged, and the spec requires at least one success overall.
+		duration, period := 30*time.Second, 5*time.Second
+		GinkgoWriter.Printf("Deleting random etcd pods every %s for %s\n", period, duration)
+		deletionCount := 0
+		deadline := time.Now().Add(duration)
+		for time.Now().Before(deadline) {
+			pod := randomEtcdPods(etcdPods.Items, 1)[0]
+			// A grace period of 0 requests immediate deletion.
+			err := testCtx.MgmtClient.Delete(ctx, &pod, &crclient.DeleteOptions{
+				GracePeriodSeconds: ptr.To[int64](0),
+			})
+			if err != nil {
+				GinkgoWriter.Printf("Warning: failed to delete pod %s: %v\n", pod.Name, err)
+			} else {
+				GinkgoWriter.Printf("Deleted pod %s\n", pod.Name)
+				deletionCount++
+			}
+			time.Sleep(period)
+		}
+		Expect(deletionCount).To(BeNumerically(">", 0), "at least one pod deletion should have succeeded")
+
+		waitForEtcdConvergence(ctx, testCtx.MgmtClient, cpNamespace, ptr.Deref(etcdSts.Spec.Replicas, 0))
+
+		verifyMarkerSurvived(ctx, guestClient, markerCM)
+	})
+}
+
+// EtcdKillAllMembersTest creates a marker ConfigMap, deletes ALL etcd pods simultaneously
+// via goroutines, then verifies convergence and marker survival.
+func EtcdKillAllMembersTest(getTestCtx internal.TestContextGetter) {
+	It("should preserve data when all members are killed simultaneously", func() {
+		testCtx := getTestCtx()
+		ctx := testCtx.Context
+		cpNamespace := testCtx.ControlPlaneNamespace
+
+		guestClient := testCtx.GetHostedClusterClient()
+		Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		// Create marker data that should survive the chaos
+		markerCM := createMarkerConfigMap(ctx, guestClient)
+		DeferCleanup(func() {
+			if err := guestClient.Delete(ctx, markerCM); err != nil && !apierrors.IsNotFound(err) {
+				GinkgoWriter.Printf("Warning: failed to cleanup marker ConfigMap: %v\n", err)
+			}
+		})
+
+		etcdSts, etcdPods := getEtcdStsAndPods(ctx, testCtx.MgmtClient, cpNamespace)
+
+		// Delete all etcd pods simultaneously
+		GinkgoWriter.Printf("Deleting all %d etcd pods simultaneously\n", len(etcdPods.Items))
+		var wg sync.WaitGroup
+		wg.Add(len(etcdPods.Items))
+		for i := range etcdPods.Items {
+			// Each goroutine receives a pointer to its own slice element.
+			// Every delete call gets its own 5s timeout, and a failed
+			// deletion is only logged.
+			go func(pod *corev1.Pod) {
+				defer GinkgoRecover()
+				defer wg.Done()
+				deleteCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
+				defer cancel()
+				err := testCtx.MgmtClient.Delete(deleteCtx, pod, &crclient.DeleteOptions{
+					GracePeriodSeconds: ptr.To[int64](0),
+				})
+				if err != nil {
+					GinkgoWriter.Printf("Warning: failed to delete pod %s: %v\n", pod.Name, err)
+				} else {
+					GinkgoWriter.Printf("Deleted pod %s\n", pod.Name)
+				}
+			}(&etcdPods.Items[i])
+		}
+		wg.Wait()
+
+		// Verify all etcd pods are replaced with new UIDs
+		// Each listed pod is matched by namespace and name against the
+		// pre-deletion list and must carry a different UID; a pod with no
+		// match in that list does not satisfy the predicate.
+		// NOTE(review): the nil argument is presumably the list-level
+		// predicates — confirm against EventuallyObjects.
+		e2eutil.EventuallyObjects(GinkgoTB(), ctx, "etcd pods to be replaced",
+			func(ctx context.Context) ([]*corev1.Pod, error) {
+				pods := &corev1.PodList{}
+				err := testCtx.MgmtClient.List(ctx, pods, &crclient.ListOptions{
+					Namespace:     cpNamespace,
+					LabelSelector: labels.Set(etcdSts.Spec.Selector.MatchLabels).AsSelector(),
+				})
+				items := make([]*corev1.Pod, len(pods.Items))
+				for i := range pods.Items {
+					items[i] = &pods.Items[i]
+				}
+				return items, err
+			},
+			nil,
+			[]e2eutil.Predicate[*corev1.Pod]{func(pod *corev1.Pod) (bool, string, error) {
+				for _, previousPod := range etcdPods.Items {
+					if previousPod.Namespace == pod.Namespace && previousPod.Name == pod.Name {
+						return previousPod.UID != pod.UID, fmt.Sprintf("pod UID %s", pod.UID), nil
+					}
+				}
+				return false, "pod not found in previous list", nil
+			}},
+			e2eutil.WithInterval(5*time.Second),
+			e2eutil.WithTimeout(30*time.Minute),
+		)
+
+		waitForEtcdConvergence(ctx, testCtx.MgmtClient, cpNamespace, ptr.Deref(etcdSts.Spec.Replicas, 0))
+
+		verifyMarkerSurvived(ctx, guestClient, markerCM)
+	})
+}
+
+// EtcdSingleMemberCorruptionTest destroys a random member's data directory using
+// RunCommandInPod, then waits for etcd to crash in-place so the recovery
+// controller detects the failing member and creates a recovery job.
+func EtcdSingleMemberCorruptionTest(getTestCtx internal.TestContextGetter) {
+	It("should recover after a single member's data is corrupted", func() {
+		testCtx := getTestCtx()
+		ctx := testCtx.Context
+		cpNamespace := testCtx.ControlPlaneNamespace
+
+		etcdSts, etcdPods := getEtcdStsAndPods(ctx, testCtx.MgmtClient, cpNamespace)
+		if ptr.Deref(etcdSts.Spec.Replicas, 0) < 3 {
+			Skip("etcd corruption recovery requires HighlyAvailable etcd (>= 3 replicas)")
+		}
+
+		pod := randomEtcdPods(etcdPods.Items, 1)[0]
+		// Remove the entire member directory so etcd cannot start.
+		// Deleting only a single WAL file is insufficient because etcd
+		// can recover from partial WAL loss using its snapshot database.
+		// Do NOT delete the pod afterward — let etcd crash and restart
+		// in-place so RestartCount increments on the same pod. The
+		// recovery controller requires RestartCount > 0 to detect a
+		// failing member; deleting the pod resets RestartCount to 0.
+		command := `rm -rf /var/lib/data/member`
+
+		// NOTE(review): RunCommandInPod is given the literal "etcd" and the
+		// namespace, not pod.Name. Confirm the command really lands on the
+		// randomly chosen pod named in the log line and failure message below.
+		GinkgoWriter.Printf("Destroying data directory on etcd pod: %s\n", pod.Name)
+		_, err := e2eutil.RunCommandInPod(ctx, testCtx.MgmtClient, "etcd", pod.Namespace, []string{"/bin/sh", "-c", command}, "etcd", 5*time.Minute)
+		Expect(err).NotTo(HaveOccurred(), "failed to destroy data directory on etcd pod %s", pod.Name)
+
+		// Etcd recovery job should be created.
+		// We don't check if the job completed because it will be deleted after completion.
+		e2eutil.EventuallyObject(GinkgoTB(), ctx, "etcd recovery job to be active",
+			func(ctx context.Context) (*batchv1.Job, error) {
+				recoveryJob := etcdrecoverymanifests.EtcdRecoveryJob(cpNamespace)
+				err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(recoveryJob), recoveryJob)
+				return recoveryJob, err
+			},
+			[]e2eutil.Predicate[*batchv1.Job]{func(job *batchv1.Job) (bool, string, error) {
+				got := job.Status.Active
+				return got == 1, fmt.Sprintf("wanted status active to be 1, got %d", got), nil
+			}},
+			e2eutil.WithInterval(5*time.Second),
+			e2eutil.WithTimeout(15*time.Minute),
+		)
+
+		waitForEtcdConvergence(ctx, testCtx.MgmtClient, cpNamespace, ptr.Deref(etcdSts.Spec.Replicas, 0))
+	})
+}
+
+// EtcdMissingMemberRecoveryTest removes a member from the etcd cluster via
+// etcdctl member remove, deletes the pod, verifies the recovery job,
+// and waits for StatefulSet convergence.
+func EtcdMissingMemberRecoveryTest(getTestCtx internal.TestContextGetter) {
+	It("should recover after a member is removed from the etcd cluster", func() {
+		testCtx := getTestCtx()
+		ctx := testCtx.Context
+		cpNamespace := testCtx.ControlPlaneNamespace
+
+		etcdSts, etcdPods := getEtcdStsAndPods(ctx, testCtx.MgmtClient, cpNamespace)
+		if ptr.Deref(etcdSts.Spec.Replicas, 0) < 3 {
+			Skip("etcd missing member recovery requires HighlyAvailable etcd (>= 3 replicas)")
+		}
+
+		pod := randomEtcdPods(etcdPods.Items, 1)[0]
+		// Both etcdctl invocations go through the etcd-client service
+		// endpoint rather than an individual member.
+		ep := fmt.Sprintf("https://etcd-client.%s.svc:2379", cpNamespace)
+
+		// Step 1: Discover the member ID
+		// The pipeline keeps the "member list" line that contains the pod
+		// name, takes its first column and strips the trailing comma.
+		discoverCommand := []string{
+			"/bin/sh", "-c",
+			fmt.Sprintf("/usr/bin/etcdctl --cacert=/etc/etcd/tls/etcd-ca/ca.crt --cert=/etc/etcd/tls/server/server.crt --key=/etc/etcd/tls/server/server.key --endpoints=%s member list | grep %s | awk '{print $1}' | tr -d ,", ep, pod.Name),
+		}
+
+		GinkgoWriter.Printf("Discovering member ID for: %s\n", pod.Name)
+		memberID, err := e2eutil.RunCommandInPod(ctx, testCtx.MgmtClient, "etcd", pod.Namespace, discoverCommand, "etcd", 5*time.Minute)
+		Expect(err).NotTo(HaveOccurred(), "failed to discover etcd member ID for %s", pod.Name)
+		memberID = strings.TrimSpace(memberID)
+		Expect(memberID).NotTo(BeEmpty(), "member ID should not be empty for %s", pod.Name)
+
+		// Step 2: Remove the member
+		removeCommand := []string{
+			"/usr/bin/etcdctl",
+			"--cacert=/etc/etcd/tls/etcd-ca/ca.crt",
+			"--cert=/etc/etcd/tls/server/server.crt",
+			"--key=/etc/etcd/tls/server/server.key",
+			fmt.Sprintf("--endpoints=%s", ep),
+			"member", "remove", memberID,
+		}
+
+		GinkgoWriter.Printf("Removing etcd member %s (ID: %s)\n", pod.Name, memberID)
+		cmdStdout, err := e2eutil.RunCommandInPod(ctx, testCtx.MgmtClient, "etcd", pod.Namespace, removeCommand, "etcd", 5*time.Minute)
+		Expect(err).NotTo(HaveOccurred(), "failed to remove etcd member %s", pod.Name)
+		// The captured output is also checked for "Error:", in addition to
+		// the returned error.
+		Expect(cmdStdout).NotTo(ContainSubstring("Error:"), "failed to remove etcd member %s", pod.Name)
+
+		GinkgoWriter.Printf("Deleting pod: %s\n", pod.Name)
+		Expect(testCtx.MgmtClient.Delete(ctx, &pod)).To(Succeed(), "failed to delete pod %s", pod.Name)
+
+		// Etcd recovery job should be created.
+		// We don't check if the job completed because it will be deleted after completion.
+		e2eutil.EventuallyObject(GinkgoTB(), ctx, "etcd recovery job to be active",
+			func(ctx context.Context) (*batchv1.Job, error) {
+				recoveryJob := etcdrecoverymanifests.EtcdRecoveryJob(cpNamespace)
+				err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(recoveryJob), recoveryJob)
+				return recoveryJob, err
+			},
+			[]e2eutil.Predicate[*batchv1.Job]{func(job *batchv1.Job) (bool, string, error) {
+				got := job.Status.Active
+				return got == 1, fmt.Sprintf("wanted status active to be 1, got %d", got), nil
+			}},
+			e2eutil.WithInterval(5*time.Second),
+			e2eutil.WithTimeout(15*time.Minute),
+		)
+
+		waitForEtcdConvergence(ctx, testCtx.MgmtClient, cpNamespace, ptr.Deref(etcdSts.Spec.Replicas, 0))
+	})
+}
+
+// getEtcdStsAndPods fetches the etcd StatefulSet and its pods from the control plane namespace.
+//
+// Pods are selected with the StatefulSet's own label selector. The
+// calling spec fails when the StatefulSet cannot be fetched, the list
+// call fails, or no pod matches.
+func getEtcdStsAndPods(ctx context.Context, client crclient.Client, cpNamespace string) (*appsv1.StatefulSet, *corev1.PodList) {
+	// Mark this function as a helper so failures are attributed to the caller.
+	GinkgoHelper()
+
+	etcdSts := cpomanifests.EtcdStatefulSet(cpNamespace)
+	Expect(client.Get(ctx, crclient.ObjectKeyFromObject(etcdSts), etcdSts)).To(Succeed(), "failed to get etcd StatefulSet")
+
+	etcdPods := &corev1.PodList{}
+	Expect(client.List(ctx, etcdPods, &crclient.ListOptions{
+		Namespace:     cpNamespace,
+		LabelSelector: labels.Set(etcdSts.Spec.Selector.MatchLabels).AsSelector(),
+	})).To(Succeed(), "failed to list etcd pods")
+	Expect(etcdPods.Items).NotTo(BeEmpty(), "no etcd pods found")
+	GinkgoWriter.Printf("Found %d etcd pods\n", len(etcdPods.Items))
+
+	return etcdSts, etcdPods
+}
+
+// waitForEtcdConvergence polls the etcd StatefulSet until ReadyReplicas equals the expected replica count.
+// An expectedReplicas of 0 never satisfies the predicate, so the wait runs
+// into its 30 minute timeout in that case. The StatefulSet is re-read every
+// 5 seconds.
+func waitForEtcdConvergence(ctx context.Context, client crclient.Client, cpNamespace string, expectedReplicas int32) {
+	GinkgoHelper()
+
+	e2eutil.EventuallyObject(GinkgoTB(), ctx, "etcd StatefulSet replicas to converge",
+		func(ctx context.Context) (*appsv1.StatefulSet, error) {
+			sts := cpomanifests.EtcdStatefulSet(cpNamespace)
+			err := client.Get(ctx, crclient.ObjectKeyFromObject(sts), sts)
+			return sts, err
+		},
+		[]e2eutil.Predicate[*appsv1.StatefulSet]{func(sts *appsv1.StatefulSet) (bool, string, error) {
+			got := sts.Status.ReadyReplicas
+			return expectedReplicas != 0 && expectedReplicas == got, fmt.Sprintf("wanted %d ready replicas, got %d", expectedReplicas, got), nil
+		}},
+		e2eutil.WithInterval(5*time.Second),
+		e2eutil.WithTimeout(30*time.Minute),
+	)
+}
+
+// randomEtcdPods selects count random pods from the provided slice.
+//
+// count is clamped to the range [0, len(pods)]: asking for more pods than are
+// available returns every pod in random order, and a negative count returns an
+// empty slice. Without the clamp either case would panic (index out of range
+// or a negative length passed to make).
+func randomEtcdPods(pods []corev1.Pod, count int) []corev1.Pod {
+	if count > len(pods) {
+		count = len(pods)
+	}
+	if count < 0 {
+		count = 0
+	}
+
+	// A random permutation of the indexes yields distinct picks.
+	indexes := rand.Perm(len(pods))
+	selected := make([]corev1.Pod, 0, count)
+	for _, idx := range indexes[:count] {
+		selected = append(selected, pods[idx])
+	}
+	return selected
+}
+
+// createMarkerConfigMap creates a ConfigMap with timestamp data in the hosted cluster
+// and returns it for later verification.
+func createMarkerConfigMap(ctx context.Context, client crclient.Client) *corev1.ConfigMap {
+	GinkgoHelper()
+
+	// The marker payload is the current time in its text encoding.
+	stamp, marshalErr := time.Now().MarshalText()
+	Expect(marshalErr).NotTo(HaveOccurred(), "failed to marshal timestamp")
+
+	marker := &corev1.ConfigMap{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: "default",
+			Name:      e2eutil.SimpleNameGenerator.GenerateName("marker-"),
+		},
+		Data: map[string]string{"value": string(stamp)},
+	}
+
+	// The create call is retried through EventuallyObject; no predicates are
+	// supplied, only the create call itself has to succeed.
+	create := func(ctx context.Context) (*corev1.ConfigMap, error) {
+		return marker, client.Create(ctx, marker)
+	}
+	e2eutil.EventuallyObject(GinkgoTB(), ctx, "create marker ConfigMap", create, nil)
+
+	GinkgoWriter.Printf("Created marker ConfigMap %s/%s\n", marker.Namespace, marker.Name)
+	return marker
+}
+
+// verifyMarkerSurvived re-reads the marker ConfigMap and waits until its Data
+// matches the Data of expected, i.e. the marker written before the etcd chaos
+// operations is still intact afterwards.
+func verifyMarkerSurvived(ctx context.Context, client crclient.Client, expected *corev1.ConfigMap) {
+	GinkgoHelper()
+
+	// Fetch a fresh copy of the marker on every poll.
+	fetch := func(ctx context.Context) (*corev1.ConfigMap, error) {
+		current := &corev1.ConfigMap{}
+		getErr := client.Get(ctx, crclient.ObjectKeyFromObject(expected), current)
+		return current, getErr
+	}
+	// The data survived when there is no difference to the expected payload.
+	sameData := func(current *corev1.ConfigMap) (bool, string, error) {
+		delta := cmp.Diff(expected.Data, current.Data)
+		return delta == "", fmt.Sprintf("incorrect data: %v", delta), nil
+	}
+
+	e2eutil.EventuallyObject(GinkgoTB(), ctx, "verify marker data survived disruption",
+		fetch,
+		[]e2eutil.Predicate[*corev1.ConfigMap]{sameData},
+		e2eutil.WithInterval(5*time.Second),
+		e2eutil.WithTimeout(30*time.Minute),
+	)
+}
diff --git a/test/e2e/v2/tests/nodepool_autoscaling_test.go b/test/e2e/v2/tests/nodepool_autoscaling_test.go
new file mode 100644
index 00000000000..5a2766d8a54
--- /dev/null
+++ b/test/e2e/v2/tests/nodepool_autoscaling_test.go
@@ -0,0 +1,398 @@
+//go:build e2ev2
+
+/*
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package tests
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1"
+	e2eutil "github.com/openshift/hypershift/test/e2e/util"
+	"github.com/openshift/hypershift/test/e2e/v2/internal"
+
+	appsv1 "k8s.io/api/apps/v1"
+	batchv1 "k8s.io/api/batch/v1"
+	corev1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/resource"
+	"k8s.io/apimachinery/pkg/labels"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/utils/ptr"
+	crclient "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// AutoscalingScaleUpDownTest tests autoscaling scale-up and scale-down behavior.
+//
+// It creates a dedicated autoscaling NodePool (min=1, max=3), schedules a
+// workload of three pods that each request half of a node's allocatable
+// memory, waits for three ready nodes, deletes the workload and waits for the
+// NodePool to return to a single ready node.
+func AutoscalingScaleUpDownTest(getTestCtx internal.TestContextGetter) {
+	It("should scale up when workload increases and scale down when workload decreases", func() {
+		testCtx := getTestCtx()
+		Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+
+		hc := testCtx.GetHostedCluster()
+		Expect(hc).NotTo(BeNil(), "hosted cluster should be available")
+
+		guestClient := testCtx.GetHostedClusterClient()
+		Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		ctx := testCtx.Context
+
+		// Find the default NodePool to copy platform config
+		defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc)
+		Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist")
+
+		// Create autoscaling NodePool with min=1, max=3 and a unique node label
+		// so the workload targets only this NodePool's nodes.
+		autoscalingLabel := map[string]string{"e2e-autoscaling-test": "scale-up-down"}
+		autoscalingNP := buildAutoscalingNodePool(defaultNP, 1, 3, autoscalingLabel)
+		err := testCtx.MgmtClient.Create(ctx, autoscalingNP)
+		Expect(err).NotTo(HaveOccurred(), "failed to create autoscaling NodePool")
+		GinkgoWriter.Printf("Created autoscaling NodePool %s with min=1, max=3\n", autoscalingNP.Name)
+
+		// Ensure cleanup
+		defer cleanupNodePool(ctx, testCtx.MgmtClient, autoscalingNP)
+
+		npLabelSelector := e2eutil.WithClientOptions(crclient.MatchingLabelsSelector{
+			Selector: labels.SelectorFromSet(labels.Set{hyperv1.NodePoolLabel: autoscalingNP.Name}),
+		})
+
+		// Wait for NodePool to be ready with 1 node (min replicas)
+		nodes := e2eutil.WaitForNReadyNodesWithOptions(GinkgoTB(), ctx, guestClient, 1, hc.Spec.Platform.Type, fmt.Sprintf("for NodePool %s", autoscalingNP.Name), npLabelSelector)
+		Expect(nodes).To(HaveLen(1), "should have exactly 1 node initially")
+
+		// Get node capacity for workload sizing
+		memCapacity := nodes[0].Status.Allocatable[corev1.ResourceMemory]
+		bytes, ok := memCapacity.AsInt64()
+		Expect(ok).To(BeTrue(), "memory capacity should be convertible to int64")
+
+		// Create workload that requires 3 nodes (50% memory per pod, 3 pods).
+		// nodeSelector forces pods onto the autoscaling NodePool so the
+		// cluster autoscaler must scale it up.
+		workloadMemRequest := *resource.NewQuantity(bytes/2, resource.BinarySI)
+		workload := newAutoscalingWorkload(3, workloadMemRequest, autoscalingLabel)
+		err = guestClient.Create(ctx, workload)
+		Expect(err).NotTo(HaveOccurred(), "failed to create workload")
+
+		// Safety net for early failures; cleanupWorkload tolerates a Job that
+		// is already gone, so the explicit delete below is harmless.
+		defer cleanupWorkload(ctx, guestClient, workload)
+
+		// Wait for scale-up to 3 nodes
+		e2eutil.WaitForNReadyNodesWithOptions(GinkgoTB(), ctx, guestClient, 3, hc.Spec.Platform.Type, fmt.Sprintf("for NodePool %s", autoscalingNP.Name), npLabelSelector)
+
+		// Delete workload to trigger scale-down
+		cleanupWorkload(ctx, guestClient, workload)
+
+		// Wait for scale-down to 1 node (min replicas)
+		e2eutil.WaitForNReadyNodesWithOptions(GinkgoTB(), ctx, guestClient, 1, hc.Spec.Platform.Type, fmt.Sprintf("for NodePool %s", autoscalingNP.Name), npLabelSelector)
+	})
+}
+
+// AutoscalingBalancingTest tests that autoscaling balances workload across multiple NodePools.
+// It configures the HostedCluster with the Random expander so the cluster autoscaler
+// distributes scale-up events across NodePools instead of favoring one.
+//
+// The autoscaling configuration that was present on the HostedCluster before
+// the test is put back during cleanup.
+func AutoscalingBalancingTest(getTestCtx internal.TestContextGetter) {
+	It("should balance pods across multiple autoscaling NodePools", func() {
+		testCtx := getTestCtx()
+		Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+
+		e2eutil.GinkgoAtLeast(e2eutil.Version420)
+
+		hc := testCtx.GetHostedCluster()
+		Expect(hc).NotTo(BeNil(), "hosted cluster should be available")
+
+		guestClient := testCtx.GetHostedClusterClient()
+		Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		ctx := testCtx.Context
+		cpNamespace := testCtx.ControlPlaneNamespace
+
+		// Configure autoscaler with Random expander for balanced distribution.
+		// The default least-waste expander favors a single NodePool.
+		balancingLabel := "e2e-balance-ignore"
+		// originalHC keeps the pre-test spec; it is both the patch base and the
+		// source of the autoscaling configuration restored during cleanup.
+		originalHC := hc.DeepCopy()
+		hc.Spec.Autoscaling = hyperv1.ClusterAutoscaling{
+			Expanders: []hyperv1.ExpanderString{
+				hyperv1.RandomExpander,
+			},
+			BalancingIgnoredLabels: []string{
+				balancingLabel,
+			},
+			MaxFreeDifferenceRatioPercent: ptr.To[int32](70),
+		}
+		err := testCtx.MgmtClient.Patch(ctx, hc, crclient.MergeFrom(originalHC))
+		Expect(err).NotTo(HaveOccurred(), "failed to configure autoscaler on HostedCluster")
+		GinkgoWriter.Println("Configured HostedCluster autoscaling with Random expander")
+
+		DeferCleanup(func() {
+			latest := &hyperv1.HostedCluster{}
+			if err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(hc), latest); err != nil {
+				GinkgoWriter.Printf("Warning: failed to get HostedCluster for cleanup: %v\n", err)
+				return
+			}
+			patch := crclient.MergeFrom(latest.DeepCopy())
+			// Restore whatever autoscaling configuration existed before the
+			// test instead of wiping it, so a pre-configured cluster is left
+			// in its original state.
+			latest.Spec.Autoscaling = originalHC.Spec.Autoscaling
+			if err := testCtx.MgmtClient.Patch(ctx, latest, patch); err != nil {
+				GinkgoWriter.Printf("Warning: failed to reset autoscaler config: %v\n", err)
+			}
+		})
+
+		// Wait for autoscaler deployment to pick up the new config
+		e2eutil.EventuallyObject(GinkgoTB(), ctx, "autoscaler deployment to have balancing config",
+			func(ctx context.Context) (*appsv1.Deployment, error) {
+				dep := &appsv1.Deployment{ObjectMeta: metav1.ObjectMeta{
+					Namespace: cpNamespace, Name: "cluster-autoscaler",
+				}}
+				err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(dep), dep)
+				return dep, err
+			},
+			[]e2eutil.Predicate[*appsv1.Deployment]{func(dep *appsv1.Deployment) (bool, string, error) {
+				// Look at every container rather than indexing Containers[0],
+				// so an empty or reordered container list cannot panic the poll.
+				for _, container := range dep.Spec.Template.Spec.Containers {
+					for _, arg := range container.Args {
+						if strings.Contains(arg, balancingLabel) {
+							return dep.Status.ReadyReplicas > 0, fmt.Sprintf("ready replicas: %d", dep.Status.ReadyReplicas), nil
+						}
+					}
+				}
+				return false, "balancing-ignore-label not found in autoscaler args", nil
+			}},
+			e2eutil.WithInterval(10*time.Second),
+			e2eutil.WithTimeout(5*time.Minute),
+		)
+
+		// Find the default NodePool to copy platform config
+		defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc)
+		Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist")
+
+		// Create two autoscaling NodePools with distinct labels for the
+		// balancing-ignored-labels config and a shared label for the workload nodeSelector.
+		sharedLabel := map[string]string{"e2e-autoscaling-test": "balance"}
+		np1Labels := map[string]string{
+			"e2e-autoscaling-test": "balance",
+			balancingLabel:         "np1",
+		}
+		np2Labels := map[string]string{
+			"e2e-autoscaling-test": "balance",
+			balancingLabel:         "np2",
+		}
+
+		autoscalingNP1 := buildAutoscalingNodePool(defaultNP, 1, 3, np1Labels)
+		err = testCtx.MgmtClient.Create(ctx, autoscalingNP1)
+		Expect(err).NotTo(HaveOccurred(), "failed to create first autoscaling NodePool")
+		defer cleanupNodePool(ctx, testCtx.MgmtClient, autoscalingNP1)
+
+		autoscalingNP2 := buildAutoscalingNodePool(defaultNP, 1, 3, np2Labels)
+		err = testCtx.MgmtClient.Create(ctx, autoscalingNP2)
+		Expect(err).NotTo(HaveOccurred(), "failed to create second autoscaling NodePool")
+		defer cleanupNodePool(ctx, testCtx.MgmtClient, autoscalingNP2)
+
+		np1LabelSelector := e2eutil.WithClientOptions(crclient.MatchingLabelsSelector{
+			Selector: labels.SelectorFromSet(labels.Set{hyperv1.NodePoolLabel: autoscalingNP1.Name}),
+		})
+		np2LabelSelector := e2eutil.WithClientOptions(crclient.MatchingLabelsSelector{
+			Selector: labels.SelectorFromSet(labels.Set{hyperv1.NodePoolLabel: autoscalingNP2.Name}),
+		})
+
+		// Wait for initial nodes (1 per NodePool at min replicas)
+		nodes := e2eutil.WaitForNReadyNodesWithOptions(GinkgoTB(), ctx, guestClient, 1, hc.Spec.Platform.Type, "for NP1", np1LabelSelector)
+		e2eutil.WaitForNReadyNodesWithOptions(GinkgoTB(), ctx, guestClient, 1, hc.Spec.Platform.Type, "for NP2", np2LabelSelector)
+
+		// Get node capacity for workload sizing
+		memCapacity := nodes[0].Status.Allocatable[corev1.ResourceMemory]
+		bytes, ok := memCapacity.AsInt64()
+		Expect(ok).To(BeTrue(), "memory capacity should be convertible to int64")
+
+		// Create workload targeting the autoscaling NodePools via the shared label.
+		workloadMemRequest := *resource.NewQuantity(bytes/2, resource.BinarySI)
+		workload := newAutoscalingWorkload(4, workloadMemRequest, sharedLabel)
+		err = guestClient.Create(ctx, workload)
+		Expect(err).NotTo(HaveOccurred(), "failed to create workload")
+		defer cleanupWorkload(ctx, guestClient, workload)
+
+		// Wait for total 4 nodes across both NPs, then verify balanced distribution
+		Eventually(func() (bool, error) {
+			// Refresh both NodePools in place so Status.Replicas is current.
+			if err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(autoscalingNP1), autoscalingNP1); err != nil {
+				return false, err
+			}
+			if err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(autoscalingNP2), autoscalingNP2); err != nil {
+				return false, err
+			}
+
+			total := autoscalingNP1.Status.Replicas + autoscalingNP2.Status.Replicas
+			if total < 4 {
+				return false, nil
+			}
+			return autoscalingNP1.Status.Replicas >= 1 && autoscalingNP2.Status.Replicas >= 1, nil
+		}).WithTimeout(30 * time.Minute).
+			WithPolling(30 * time.Second).
+			Should(BeTrue(), "NodePools should have balanced distribution")
+	})
+}
+
+// Helper functions
+
+// getDefaultNodePool finds an existing NodePool for the hosted cluster to copy platform config.
+//
+// It lists the NodePools in the HostedCluster's namespace and returns the
+// first one whose Spec.ClusterName equals the HostedCluster name, or nil when
+// none matches. Listing errors and an empty list fail the current spec.
+func getDefaultNodePool(ctx context.Context, client crclient.Client, hc *hyperv1.HostedCluster) *hyperv1.NodePool {
+	GinkgoHelper()
+
+	npList := &hyperv1.NodePoolList{}
+	err := client.List(ctx, npList, crclient.InNamespace(hc.Namespace))
+	Expect(err).NotTo(HaveOccurred(), "failed to list NodePools")
+	Expect(npList.Items).NotTo(BeEmpty(), "should have at least one NodePool")
+
+	// Find a NodePool for this HostedCluster
+	for i := range npList.Items {
+		if npList.Items[i].Spec.ClusterName == hc.Name {
+			return &npList.Items[i]
+		}
+	}
+
+	return nil
+}
+
+// buildAutoscalingNodePool creates a new NodePool with autoscaling enabled based on a template.
+// nodeLabels are applied to the NodePool's nodes so workloads can target them with a nodeSelector.
+// The returned object is only built in memory; creating it is left to the caller.
+func buildAutoscalingNodePool(template *hyperv1.NodePool, min, max int32, nodeLabels map[string]string) *hyperv1.NodePool {
+	GinkgoHelper()
+
+	pool := &hyperv1.NodePool{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      e2eutil.SimpleNameGenerator.GenerateName(template.Spec.ClusterName + "-auto-"),
+			Namespace: template.Namespace,
+		},
+		// Start from an independent copy of the template's spec.
+		Spec: *template.Spec.DeepCopy(),
+	}
+
+	// Switch the copy to autoscaling: a fixed replica count must not be set
+	// together with the autoscaling bounds.
+	pool.Spec.Replicas = nil
+	pool.Spec.AutoScaling = &hyperv1.NodePoolAutoScaling{
+		Min: ptr.To(min),
+		Max: max,
+	}
+
+	// Merge the requested labels over the ones inherited from the template,
+	// allocating the map lazily when the template had none.
+	for key, value := range nodeLabels {
+		if pool.Spec.NodeLabels == nil {
+			pool.Spec.NodeLabels = make(map[string]string)
+		}
+		pool.Spec.NodeLabels[key] = value
+	}
+
+	return pool
+}
+
+// newAutoscalingWorkload creates a Job that spawns multiple pods for autoscaling tests.
+// nodeSelector constrains pods to land on specific NodePool nodes so the
+// cluster autoscaler is forced to scale the targeted NodePool.
+func newAutoscalingWorkload(njobs int32, memoryRequest resource.Quantity, nodeSelector map[string]string) *batchv1.Job {
+	GinkgoHelper()
+
+	// Single container that just sleeps while holding its resource requests.
+	sleeper := corev1.Container{
+		Name:    "workload",
+		Image:   "registry.access.redhat.com/ubi9/ubi-minimal:latest",
+		Command: []string{"sleep", "86400"}, // 1 day
+		Resources: corev1.ResourceRequirements{
+			Requests: corev1.ResourceList{
+				"memory": memoryRequest,
+				"cpu":    resource.MustParse("500m"),
+			},
+		},
+		SecurityContext: &corev1.SecurityContext{
+			AllowPrivilegeEscalation: ptr.To(false),
+			Capabilities: &corev1.Capabilities{
+				Drop: []corev1.Capability{"ALL"},
+			},
+			RunAsNonRoot: ptr.To(false),
+			RunAsUser:    ptr.To(int64(0)),
+			SeccompProfile: &corev1.SeccompProfile{
+				Type: corev1.SeccompProfileTypeRuntimeDefault,
+			},
+		},
+	}
+
+	// njobs pods run in parallel and all of them count as completions.
+	return &batchv1.Job{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      e2eutil.SimpleNameGenerator.GenerateName("autoscaling-workload-"),
+			Namespace: "default",
+		},
+		Spec: batchv1.JobSpec{
+			Template: corev1.PodTemplateSpec{
+				Spec: corev1.PodSpec{
+					Containers:    []corev1.Container{sleeper},
+					NodeSelector:  nodeSelector,
+					RestartPolicy: corev1.RestartPolicyNever,
+				},
+			},
+			BackoffLimit: ptr.To[int32](4),
+			Completions:  ptr.To(njobs),
+			Parallelism:  ptr.To(njobs),
+		},
+	}
+}
+
+// cleanupNodePool issues a best-effort delete for the NodePool. A NodePool
+// that is already gone is ignored silently; any other error is only logged.
+func cleanupNodePool(ctx context.Context, client crclient.Client, np *hyperv1.NodePool) {
+	GinkgoHelper()
+
+	switch err := client.Delete(ctx, np); {
+	case err == nil:
+		GinkgoWriter.Printf("Deleted NodePool %s\n", np.Name)
+	case !apierrors.IsNotFound(err):
+		GinkgoWriter.Printf("Warning: failed to delete NodePool %s: %v\n", np.Name, err)
+	}
+}
+
+// cleanupWorkload issues a best-effort foreground delete for the Job. A Job
+// that is already gone is ignored silently; any other error is only logged.
+func cleanupWorkload(ctx context.Context, client crclient.Client, job *batchv1.Job) {
+	GinkgoHelper()
+
+	foreground := metav1.DeletePropagationForeground
+	deleteOpts := &crclient.DeleteOptions{PropagationPolicy: &foreground}
+
+	switch err := client.Delete(ctx, job, deleteOpts); {
+	case err == nil:
+		GinkgoWriter.Printf("Deleted workload %s\n", job.Name)
+	case !apierrors.IsNotFound(err):
+		GinkgoWriter.Printf("Warning: failed to delete workload %s: %v\n", job.Name, err)
+	}
+}
+
+// RegisterNodePoolAutoscalingTests registers every autoscaling spec, in
+// declaration order, against the supplied test context getter.
+func RegisterNodePoolAutoscalingTests(getTestCtx internal.TestContextGetter) {
+	for _, register := range []func(internal.TestContextGetter){
+		AutoscalingScaleUpDownTest,
+		AutoscalingBalancingTest,
+	} {
+		register(getTestCtx)
+	}
+}
+
+var _ = Describe("NodePool Autoscaling", Label("lifecycle", "nodepool-autoscaling"), func() {
+	// current is refreshed before every spec and handed out through the getter.
+	var current *internal.TestContext
+
+	BeforeEach(func() {
+		current = internal.GetTestContext()
+		Expect(current).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+	})
+
+	RegisterNodePoolAutoscalingTests(func() *internal.TestContext { return current })
+})
diff --git a/test/e2e/v2/tests/nodepool_lifecycle_test.go b/test/e2e/v2/tests/nodepool_lifecycle_test.go
new file mode 100644
index 00000000000..22c0a754378
--- /dev/null
+++ b/test/e2e/v2/tests/nodepool_lifecycle_test.go
@@ -0,0 +1,1498 @@
+//go:build e2ev2
+
+/*
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package tests
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"time"
+
+	"github.com/google/go-cmp/cmp"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	ignitionapi "github.com/coreos/ignition/v2/config/v3_2/types"
+	mcfgv1 "github.com/openshift/api/machineconfiguration/v1"
+	hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1"
+	"github.com/openshift/hypershift/hypershift-operator/controllers/manifests"
+	"github.com/openshift/hypershift/hypershift-operator/controllers/nodepool"
+	hyperapi "github.com/openshift/hypershift/support/api"
+	"github.com/openshift/hypershift/support/netutil"
+	"github.com/openshift/hypershift/support/podspec"
+	e2eutil "github.com/openshift/hypershift/test/e2e/util"
+	"github.com/openshift/hypershift/test/e2e/v2/internal"
+
+	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/util/intstr"
+	"k8s.io/utils/ptr"
+	crclient "sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
+	"sigs.k8s.io/yaml"
+)
+
+// RegisterNodePoolLifecycleTests registers all NodePool lifecycle test cases.
+// The specs are registered in the order listed below.
+func RegisterNodePoolLifecycleTests(getTestCtx internal.TestContextGetter) {
+	registrars := []func(internal.TestContextGetter){
+		NodePoolMachineconfigRolloutTest,
+		NodePoolNTORolloutTest,
+		NodePoolNTOInPlaceTest,
+		NodePoolReplaceUpgradeTest,
+		NodePoolInPlaceUpgradeTest,
+		NodePoolRollingUpgradeTest,
+		NodePoolPrevReleaseN1Test,
+		NodePoolPrevReleaseN2Test,
+		NodePoolMirrorConfigsTest,
+		NodePoolTrustBundleTest,
+		NodePoolNTOPerformanceProfileTest,
+		NodePoolAutoRepairTest,
+		NodePoolDiskEncryptionTest,
+	}
+	for _, register := range registrars {
+		register(getTestCtx)
+	}
+}
+
+var _ = Describe("NodePool Lifecycle", Label("lifecycle", "nodepool-lifecycle"), func() {
+	// current is refreshed before every spec and handed out through the getter.
+	var current *internal.TestContext
+
+	BeforeEach(func() {
+		current = internal.GetTestContext()
+		Expect(current).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+	})
+
+	RegisterNodePoolLifecycleTests(func() *internal.TestContext { return current })
+})
+
+// NodePoolMachineconfigRolloutTest creates a NodePool with Replace upgrade strategy,
+// applies a MachineConfig via ConfigMap, patches the NodePool to reference it,
+// creates a verification DaemonSet in the hosted cluster, and waits for config update
+// complete and DaemonSet rollout.
+func NodePoolMachineconfigRolloutTest(getTestCtx internal.TestContextGetter) {
+	It("should roll out a MachineConfig change via Replace upgrade strategy", func() {
+		tc := getTestCtx()
+		Expect(tc).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+
+		hostedCluster := tc.GetHostedCluster()
+		Expect(hostedCluster).NotTo(BeNil(), "hosted cluster should be available")
+
+		platform := hostedCluster.Spec.Platform.Type
+		if platform == hyperv1.KubevirtPlatform {
+			Skip("test is skipped for KubeVirt platform until https://issues.redhat.com/browse/CNV-38196 is addressed")
+		}
+
+		hostedClient := tc.GetHostedClusterClient()
+		Expect(hostedClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		ctx, mgmt := tc.Context, tc.MgmtClient
+
+		templateNP := getDefaultNodePool(ctx, mgmt, hostedCluster)
+		Expect(templateNP).NotTo(BeNil(), "default NodePool should exist")
+
+		// One replica, replaced by surging one node and never dropping below it.
+		const replicas int32 = 1
+		nodePool := buildTestNodePool(templateNP, "mc-rollout", func(pool *hyperv1.NodePool) {
+			pool.Spec.Replicas = ptr.To(replicas)
+			pool.Spec.Management.Replace = &hyperv1.ReplaceUpgrade{
+				Strategy: hyperv1.UpgradeStrategyRollingUpdate,
+				RollingUpdate: &hyperv1.RollingUpdate{
+					MaxUnavailable: ptr.To(intstr.FromInt32(0)),
+					MaxSurge:       ptr.To(intstr.FromInt32(replicas)),
+				},
+			}
+		})
+
+		createErr := mgmt.Create(ctx, nodePool)
+		Expect(createErr).NotTo(HaveOccurred(), "failed to create NodePool %s", nodePool.Name)
+		GinkgoWriter.Printf("Created NodePool %s\n", nodePool.Name)
+		defer cleanupNodePool(ctx, mgmt, nodePool)
+
+		e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, hostedClient, nodePool, platform)
+
+		// Ignition payload that drops a custom file at /etc/custom-config.
+		customFile := ignitionapi.File{
+			Node:          ignitionapi.Node{Path: "/etc/custom-config"},
+			FileEmbedded1: ignitionapi.FileEmbedded1{Contents: ignitionapi.Resource{Source: ptr.To("data:,content%0A")}},
+		}
+		ignitionJSON, marshalErr := json.Marshal(ignitionapi.Config{
+			Ignition: ignitionapi.Ignition{Version: "3.2.0"},
+			Storage:  ignitionapi.Storage{Files: []ignitionapi.File{customFile}},
+		})
+		Expect(marshalErr).NotTo(HaveOccurred(), "failed to serialize ignition config")
+
+		// Wrap the ignition payload in a worker MachineConfig with its GVK set
+		// so the YAML form carries apiVersion and kind.
+		mc := &mcfgv1.MachineConfig{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:   "custom",
+				Labels: map[string]string{"machineconfiguration.openshift.io/role": "worker"},
+			},
+			Spec: mcfgv1.MachineConfigSpec{Config: runtime.RawExtension{Raw: ignitionJSON}},
+		}
+		mcGVK, gvkErr := apiutil.GVKForObject(mc, hyperapi.Scheme)
+		Expect(gvkErr).NotTo(HaveOccurred(), "failed to get GVK for MachineConfig")
+		mc.SetGroupVersionKind(mcGVK)
+
+		mcYAML, yamlErr := yaml.Marshal(mc)
+		Expect(yamlErr).NotTo(HaveOccurred(), "failed to serialize MachineConfig")
+
+		// The NodePool consumes the MachineConfig through a ConfigMap that
+		// lives in the HostedCluster's namespace.
+		configCM := &corev1.ConfigMap{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      e2eutil.SimpleNameGenerator.GenerateName("custom-mc-"),
+				Namespace: hostedCluster.Namespace,
+			},
+			Data: map[string]string{"config": string(mcYAML)},
+		}
+		Expect(mgmt.Create(ctx, configCM)).To(Succeed(), "failed to create MachineConfig ConfigMap")
+		GinkgoWriter.Printf("Created MachineConfig ConfigMap %s\n", configCM.Name)
+
+		// Reference the ConfigMap from the NodePool to start the rollout.
+		beforePatch := nodePool.DeepCopy()
+		nodePool.Spec.Config = append(nodePool.Spec.Config, corev1.LocalObjectReference{Name: configCM.Name})
+		Expect(mgmt.Patch(ctx, nodePool, crclient.MergeFrom(beforePatch))).To(Succeed(),
+			"failed to patch NodePool %s with MachineConfig", nodePool.Name)
+
+		// Verification DaemonSet that checks /etc/custom-config exists.
+		verifier := buildMachineConfigVerificationDaemonSet(nodePool)
+		Expect(hostedClient.Create(ctx, verifier)).To(Succeed(), "failed to create verification DaemonSet")
+
+		e2eutil.WaitForNodePoolConfigUpdateCompleteWithPlatform(GinkgoTB(), ctx, mgmt, nodePool, platform)
+		waitForDaemonSetRollout(ctx, hostedClient, verifier, 1, nodePool.Spec.Platform.Type)
+		e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, hostedClient, nodePool, platform)
+
+		// TODO: EnsureNoCrashingPods, EnsureAllContainersHavePullPolicyIfNotPresent,
+		// EnsureHCPContainersHaveResourceRequests, EnsureNoPodsWithTooHighPriority
+		// require *testing.T and cannot be called from Ginkgo yet.
+	})
+}
+
+// NodePoolNTORolloutTest creates a two-replica NodePool that uses the Replace
+// upgrade strategy, attaches an NTO Tuned config (hugepages) through the
+// NodePool's TuningConfig, creates a verification DaemonSet, and waits for the
+// config update, the DaemonSet rollout and the nodes to be ready again.
+func NodePoolNTORolloutTest(getTestCtx internal.TestContextGetter) {
+	It("should roll out an NTO Tuned config change via Replace upgrade strategy", func() {
+		tc := getTestCtx()
+		Expect(tc).NotTo(BeNil(), "test context should be set up in BeforeSuite")
+
+		hostedCluster := tc.GetHostedCluster()
+		Expect(hostedCluster).NotTo(BeNil(), "hosted cluster should be available")
+
+		platform := hostedCluster.Spec.Platform.Type
+		switch platform {
+		case hyperv1.KubevirtPlatform:
+			Skip("test is skipped for KubeVirt platform until https://issues.redhat.com/browse/CNV-38196 is addressed")
+		case hyperv1.OpenStackPlatform:
+			Skip("test is skipped for OpenStack platform until https://issues.redhat.com/browse/OSASINFRA-3566 is addressed")
+		}
+
+		hostedClient := tc.GetHostedClusterClient()
+		Expect(hostedClient).NotTo(BeNil(), "hosted cluster client should be available")
+
+		ctx, mgmt := tc.Context, tc.MgmtClient
+
+		templateNP := getDefaultNodePool(ctx, mgmt, hostedCluster)
+		Expect(templateNP).NotTo(BeNil(), "default NodePool should exist")
+
+		// Two replicas, replaced by surging both nodes and never dropping below them.
+		const replicas int32 = 2
+		nodePool := buildTestNodePool(templateNP, "nto-replace", func(pool *hyperv1.NodePool) {
+			pool.Spec.Replicas = ptr.To(replicas)
+			pool.Spec.Management.Replace = &hyperv1.ReplaceUpgrade{
+				Strategy: hyperv1.UpgradeStrategyRollingUpdate,
+				RollingUpdate: &hyperv1.RollingUpdate{
+					MaxUnavailable: ptr.To(intstr.FromInt32(0)),
+					MaxSurge:       ptr.To(intstr.FromInt32(replicas)),
+				},
+			}
+		})
+
+		createErr := mgmt.Create(ctx, nodePool)
+		Expect(createErr).NotTo(HaveOccurred(), "failed to create NodePool %s", nodePool.Name)
+		GinkgoWriter.Printf("Created NodePool %s\n", nodePool.Name)
+		defer cleanupNodePool(ctx, mgmt, nodePool)
+
+		e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, hostedClient, nodePool, platform)
+
+		// ConfigMap carrying the hugepages Tuned manifest.
+		tunedCM := &corev1.ConfigMap{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      e2eutil.SimpleNameGenerator.GenerateName("hugepages-tuned-"),
+				Namespace: hostedCluster.Namespace,
+			},
+			Data: map[string]string{tuningConfigKey: hugepagesTunedYAML},
+		}
+		Expect(mgmt.Create(ctx, tunedCM)).To(Succeed(), "failed to create Tuned ConfigMap")
+
+		// Reference the ConfigMap from the NodePool to start the rollout.
+		beforePatch := nodePool.DeepCopy()
+		nodePool.Spec.TuningConfig = append(nodePool.Spec.TuningConfig, corev1.LocalObjectReference{Name: tunedCM.Name})
+		Expect(mgmt.Patch(ctx, nodePool, crclient.MergeFrom(beforePatch))).To(Succeed(),
+			"failed to patch NodePool %s with TuningConfig", nodePool.Name)
+
+		verifier := buildNTOVerificationDaemonSet(nodePool)
+		Expect(hostedClient.Create(ctx, verifier)).To(Succeed(), "failed to create NTO verification DaemonSet")
+
+		e2eutil.WaitForNodePoolConfigUpdateCompleteWithPlatform(GinkgoTB(), ctx, mgmt, nodePool, platform)
+		waitForDaemonSetRollout(ctx, hostedClient, verifier, 2, nodePool.Spec.Platform.Type)
+		e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, hostedClient, nodePool, platform)
+
+		// TODO: EnsureNoCrashingPods, EnsureAllContainersHavePullPolicyIfNotPresent,
+		// EnsureHCPContainersHaveResourceRequests, EnsureNoPodsWithTooHighPriority
+		// require *testing.T and cannot be called from Ginkgo yet.
+	})
+}
+
+// NodePoolNTOInPlaceTest applies an NTO Tuned config with InPlace upgrade type.
+func NodePoolNTOInPlaceTest(getTestCtx internal.TestContextGetter) { + It("should roll out an NTO Tuned config change via InPlace upgrade strategy", func() { + testCtx := getTestCtx() + Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite") + + hc := testCtx.GetHostedCluster() + Expect(hc).NotTo(BeNil(), "hosted cluster should be available") + + if hc.Spec.Platform.Type == hyperv1.KubevirtPlatform { + Skip("test is skipped for KubeVirt platform until https://issues.redhat.com/browse/CNV-38196 is addressed") + } + if hc.Spec.Platform.Type == hyperv1.OpenStackPlatform { + Skip("test is skipped for OpenStack platform until https://issues.redhat.com/browse/OSASINFRA-3566 is addressed") + } + + guestClient := testCtx.GetHostedClusterClient() + Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available") + + ctx := testCtx.Context + + defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc) + Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist") + + var twoReplicas int32 = 2 + np := buildTestNodePool(defaultNP, "nto-inplace", func(pool *hyperv1.NodePool) { + pool.Spec.Replicas = &twoReplicas + pool.Spec.Management.UpgradeType = hyperv1.UpgradeTypeInPlace + }) + + err := testCtx.MgmtClient.Create(ctx, np) + Expect(err).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name) + GinkgoWriter.Printf("Created NodePool %s\n", np.Name) + defer cleanupNodePool(ctx, testCtx.MgmtClient, np) + + e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, hc.Spec.Platform.Type) + + tuningCM := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: e2eutil.SimpleNameGenerator.GenerateName("hugepages-inplace-"), + Namespace: hc.Namespace, + }, + Data: map[string]string{tuningConfigKey: hugepagesTunedYAML}, + } + Expect(testCtx.MgmtClient.Create(ctx, tuningCM)).To(Succeed(), "failed to create Tuned ConfigMap") + + original := np.DeepCopy() + np.Spec.TuningConfig = append(np.Spec.TuningConfig, 
corev1.LocalObjectReference{Name: tuningCM.Name}) + Expect(testCtx.MgmtClient.Patch(ctx, np, crclient.MergeFrom(original))).To(Succeed(), + "failed to patch NodePool %s with TuningConfig", np.Name) + + ds := buildNTOVerificationDaemonSet(np) + Expect(guestClient.Create(ctx, ds)).To(Succeed(), "failed to create NTO verification DaemonSet") + + e2eutil.WaitForNodePoolConfigUpdateCompleteWithPlatform(GinkgoTB(), ctx, testCtx.MgmtClient, np, hc.Spec.Platform.Type) + waitForDaemonSetRollout(ctx, guestClient, ds, 2, np.Spec.Platform.Type) + e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, hc.Spec.Platform.Type) + + // TODO: EnsureNoCrashingPods, EnsureAllContainersHavePullPolicyIfNotPresent, + // EnsureHCPContainersHaveResourceRequests, EnsureNoPodsWithTooHighPriority + // require *testing.T and cannot be called from Ginkgo yet. + }) +} + +// NodePoolReplaceUpgradeTest creates a NodePool at previous release image, waits for nodes, +// upgrades to latest image, and waits for version to update via Replace upgrade strategy. 
+func NodePoolReplaceUpgradeTest(getTestCtx internal.TestContextGetter) { + It("should upgrade a NodePool from previous to latest release via Replace strategy", func() { + testCtx := getTestCtx() + Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite") + + hc := testCtx.GetHostedCluster() + Expect(hc).NotTo(BeNil(), "hosted cluster should be available") + + guestClient := testCtx.GetHostedClusterClient() + Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available") + + previousImage := internal.GetEnvVarValue("E2E_PREVIOUS_RELEASE_IMAGE") + latestImage := internal.GetEnvVarValue("E2E_LATEST_RELEASE_IMAGE") + if previousImage == "" || latestImage == "" { + Skip("E2E_PREVIOUS_RELEASE_IMAGE and E2E_LATEST_RELEASE_IMAGE must be set for upgrade tests") + } + + ctx := testCtx.Context + + defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc) + Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist") + + var oneReplica int32 = 1 + np := buildTestNodePool(defaultNP, "replace-upgrade", func(pool *hyperv1.NodePool) { + pool.Spec.Replicas = &oneReplica + pool.Spec.Release.Image = previousImage + pool.Spec.Management.Replace = &hyperv1.ReplaceUpgrade{ + Strategy: hyperv1.UpgradeStrategyRollingUpdate, + RollingUpdate: &hyperv1.RollingUpdate{ + MaxUnavailable: ptr.To(intstr.FromInt32(0)), + MaxSurge: ptr.To(intstr.FromInt32(oneReplica)), + }, + } + }) + + err := testCtx.MgmtClient.Create(ctx, np) + Expect(err).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name) + GinkgoWriter.Printf("Created NodePool %s at previous release %s\n", np.Name, previousImage) + defer cleanupNodePool(ctx, testCtx.MgmtClient, np) + + e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, hc.Spec.Platform.Type) + + // Update NodePool to latest release image + GinkgoWriter.Printf("Upgrading NodePool %s to latest release %s\n", np.Name, latestImage) + Expect(e2eutil.UpdateObject(GinkgoTB(), ctx, testCtx.MgmtClient, np, 
func(obj *hyperv1.NodePool) { + obj.Spec.Release.Image = latestImage + })).To(Succeed(), "failed to update NodePool release image") + + // Wait for upgrade to start + e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to start the upgrade", np.Namespace, np.Name), + func(ctx context.Context) (*hyperv1.NodePool, error) { + pool := &hyperv1.NodePool{} + err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(np), pool) + return pool, err + }, + []e2eutil.Predicate[*hyperv1.NodePool]{ + e2eutil.ConditionPredicate[*hyperv1.NodePool](e2eutil.Condition{ + Type: hyperv1.NodePoolUpdatingVersionConditionType, + Status: metav1.ConditionTrue, + }), + }, + ) + + // Wait for upgrade to complete + upgradeTimeout := nodePoolUpgradeTimeout(hc.Spec.Platform.Type) + e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to complete the upgrade", np.Namespace, np.Name), + func(ctx context.Context) (*hyperv1.NodePool, error) { + pool := &hyperv1.NodePool{} + err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(np), pool) + return pool, err + }, + []e2eutil.Predicate[*hyperv1.NodePool]{ + e2eutil.ConditionPredicate[*hyperv1.NodePool](e2eutil.Condition{ + Type: hyperv1.NodePoolUpdatingVersionConditionType, + Status: metav1.ConditionFalse, + }), + }, + e2eutil.WithTimeout(upgradeTimeout), + ) + + e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, hc.Spec.Platform.Type) + + // TODO: EnsureNodesLabelsAndTaints, EnsureNodesRuntime require *testing.T + }) +} + +// NodePoolInPlaceUpgradeTest creates a NodePool at previous release image, waits for nodes, +// upgrades to latest image via InPlace upgrade strategy. 
+func NodePoolInPlaceUpgradeTest(getTestCtx internal.TestContextGetter) { + It("should upgrade a NodePool from previous to latest release via InPlace strategy", func() { + testCtx := getTestCtx() + Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite") + + hc := testCtx.GetHostedCluster() + Expect(hc).NotTo(BeNil(), "hosted cluster should be available") + + guestClient := testCtx.GetHostedClusterClient() + Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available") + + previousImage := internal.GetEnvVarValue("E2E_PREVIOUS_RELEASE_IMAGE") + latestImage := internal.GetEnvVarValue("E2E_LATEST_RELEASE_IMAGE") + if previousImage == "" || latestImage == "" { + Skip("E2E_PREVIOUS_RELEASE_IMAGE and E2E_LATEST_RELEASE_IMAGE must be set for upgrade tests") + } + + ctx := testCtx.Context + + defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc) + Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist") + + var oneReplica int32 = 1 + np := buildTestNodePool(defaultNP, "inplace-upgrade", func(pool *hyperv1.NodePool) { + pool.Spec.Replicas = &oneReplica + pool.Spec.Release.Image = previousImage + pool.Spec.Management.UpgradeType = hyperv1.UpgradeTypeInPlace + }) + + err := testCtx.MgmtClient.Create(ctx, np) + Expect(err).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name) + GinkgoWriter.Printf("Created NodePool %s at previous release %s\n", np.Name, previousImage) + defer cleanupNodePool(ctx, testCtx.MgmtClient, np) + + e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, hc.Spec.Platform.Type) + + GinkgoWriter.Printf("Upgrading NodePool %s to latest release %s\n", np.Name, latestImage) + Expect(e2eutil.UpdateObject(GinkgoTB(), ctx, testCtx.MgmtClient, np, func(obj *hyperv1.NodePool) { + obj.Spec.Release.Image = latestImage + })).To(Succeed(), "failed to update NodePool release image") + + // Wait for upgrade to start + e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to 
start the upgrade", np.Namespace, np.Name), + func(ctx context.Context) (*hyperv1.NodePool, error) { + pool := &hyperv1.NodePool{} + err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(np), pool) + return pool, err + }, + []e2eutil.Predicate[*hyperv1.NodePool]{ + e2eutil.ConditionPredicate[*hyperv1.NodePool](e2eutil.Condition{ + Type: hyperv1.NodePoolUpdatingVersionConditionType, + Status: metav1.ConditionTrue, + }), + }, + ) + + // Wait for upgrade to complete + upgradeTimeout := nodePoolUpgradeTimeout(hc.Spec.Platform.Type) + e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to complete the upgrade", np.Namespace, np.Name), + func(ctx context.Context) (*hyperv1.NodePool, error) { + pool := &hyperv1.NodePool{} + err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(np), pool) + return pool, err + }, + []e2eutil.Predicate[*hyperv1.NodePool]{ + e2eutil.ConditionPredicate[*hyperv1.NodePool](e2eutil.Condition{ + Type: hyperv1.NodePoolUpdatingVersionConditionType, + Status: metav1.ConditionFalse, + }), + }, + e2eutil.WithTimeout(upgradeTimeout), + ) + + e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, hc.Spec.Platform.Type) + + // TODO: EnsureNodesLabelsAndTaints, EnsureNodesRuntime require *testing.T + }) +} + +// NodePoolRollingUpgradeTest creates a NodePool with 2 replicas, changes instance type +// (AWS) or VM size (Azure) to trigger a rolling upgrade, and verifies the machine specs +// after upgrade. Only runs on AWS and Azure platforms. 
+func NodePoolRollingUpgradeTest(getTestCtx internal.TestContextGetter) { + It("should perform a rolling upgrade when instance type or VM size changes", func() { + testCtx := getTestCtx() + Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite") + + hc := testCtx.GetHostedCluster() + Expect(hc).NotTo(BeNil(), "hosted cluster should be available") + + platform := hc.Spec.Platform.Type + if platform != hyperv1.AWSPlatform && platform != hyperv1.AzurePlatform { + Skip("rolling upgrade test only supported on AWS and Azure platforms") + } + + guestClient := testCtx.GetHostedClusterClient() + Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available") + + ctx := testCtx.Context + + defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc) + Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist") + + var twoReplicas int32 = 2 + np := buildTestNodePool(defaultNP, "rolling-upgrade", func(pool *hyperv1.NodePool) { + pool.Spec.Replicas = &twoReplicas + pool.Spec.Management.UpgradeType = hyperv1.UpgradeTypeReplace + switch platform { + case hyperv1.AWSPlatform: + pool.Spec.Platform.AWS.InstanceType = "m5.large" + case hyperv1.AzurePlatform: + pool.Spec.Platform.Azure.VMSize = "Standard_D2s_v3" + } + }) + + err := testCtx.MgmtClient.Create(ctx, np) + Expect(err).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name) + GinkgoWriter.Printf("Created NodePool %s with 2 replicas\n", np.Name) + defer cleanupNodePool(ctx, testCtx.MgmtClient, np) + + e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, platform) + + // Change instance type / VM size to trigger rolling upgrade + var newInstanceType, newVMSize string + switch platform { + case hyperv1.AWSPlatform: + newInstanceType = "m5.xlarge" + case hyperv1.AzurePlatform: + newVMSize = "Standard_D4s_v5" + } + + Expect(e2eutil.UpdateObject(GinkgoTB(), ctx, testCtx.MgmtClient, np, func(obj *hyperv1.NodePool) { + switch platform { + case hyperv1.AWSPlatform: 
+ obj.Spec.Platform.AWS.InstanceType = newInstanceType + case hyperv1.AzurePlatform: + obj.Spec.Platform.Azure.VMSize = newVMSize + } + })).To(Succeed(), "failed to update NodePool instance type / VM size") + + // Wait for rolling upgrade to start + e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to start the rolling upgrade", np.Namespace, np.Name), + func(ctx context.Context) (*hyperv1.NodePool, error) { + pool := &hyperv1.NodePool{} + err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(np), pool) + return pool, err + }, + []e2eutil.Predicate[*hyperv1.NodePool]{ + e2eutil.ConditionPredicate[*hyperv1.NodePool](e2eutil.Condition{ + Type: hyperv1.NodePoolUpdatingPlatformMachineTemplateConditionType, + Status: metav1.ConditionTrue, + }), + }, + e2eutil.WithTimeout(2*time.Minute), + ) + + // Wait for rolling upgrade to complete + rollingTimeout := nodePoolUpgradeTimeout(platform) + e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to finish the rolling upgrade", np.Namespace, np.Name), + func(ctx context.Context) (*hyperv1.NodePool, error) { + pool := &hyperv1.NodePool{} + err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(np), pool) + return pool, err + }, + []e2eutil.Predicate[*hyperv1.NodePool]{ + e2eutil.ConditionPredicate[*hyperv1.NodePool](e2eutil.Condition{ + Type: hyperv1.NodePoolUpdatingPlatformMachineTemplateConditionType, + Status: metav1.ConditionFalse, + }), + }, + e2eutil.WithTimeout(rollingTimeout), + ) + + // TODO: Verify machine specs (AWSMachineList / AzureMachineList) after upgrade. + // The v1 test uses capiaws.AWSMachineList and capiazure.AzureMachineList to check + // that instance types / VM sizes match. This requires importing CAPI provider types + // which adds significant dependency. Implement once the pattern is established. + }) +} + +// NodePoolPrevReleaseN1Test creates a NodePool at N-1 release image and waits for nodes ready. 
+func NodePoolPrevReleaseN1Test(getTestCtx internal.TestContextGetter) { + It("should create a NodePool at N-1 release and have ready nodes", func() { + testCtx := getTestCtx() + Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite") + + hc := testCtx.GetHostedCluster() + Expect(hc).NotTo(BeNil(), "hosted cluster should be available") + + n1Image := internal.GetEnvVarValue("E2E_N1_RELEASE_IMAGE") + if n1Image == "" { + Skip("E2E_N1_RELEASE_IMAGE not set, skipping N-1 release test") + } + + guestClient := testCtx.GetHostedClusterClient() + Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available") + + ctx := testCtx.Context + + defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc) + Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist") + + var oneReplica int32 = 1 + np := buildTestNodePool(defaultNP, "prev-n1", func(pool *hyperv1.NodePool) { + pool.Spec.Replicas = &oneReplica + pool.Spec.Release.Image = n1Image + }) + + err := testCtx.MgmtClient.Create(ctx, np) + Expect(err).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name) + GinkgoWriter.Printf("Created NodePool %s at N-1 release %s\n", np.Name, n1Image) + defer cleanupNodePool(ctx, testCtx.MgmtClient, np) + + e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, hc.Spec.Platform.Type) + + // TODO: EnsureNodesLabelsAndTaints requires *testing.T + }) +} + +// NodePoolPrevReleaseN2Test creates a NodePool at N-2 release image and waits for nodes ready. 
+func NodePoolPrevReleaseN2Test(getTestCtx internal.TestContextGetter) { + It("should create a NodePool at N-2 release and have ready nodes", func() { + testCtx := getTestCtx() + Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite") + + hc := testCtx.GetHostedCluster() + Expect(hc).NotTo(BeNil(), "hosted cluster should be available") + + n2Image := internal.GetEnvVarValue("E2E_N2_RELEASE_IMAGE") + if n2Image == "" { + Skip("E2E_N2_RELEASE_IMAGE not set, skipping N-2 release test") + } + + guestClient := testCtx.GetHostedClusterClient() + Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available") + + ctx := testCtx.Context + + defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc) + Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist") + + var oneReplica int32 = 1 + np := buildTestNodePool(defaultNP, "prev-n2", func(pool *hyperv1.NodePool) { + pool.Spec.Replicas = &oneReplica + pool.Spec.Release.Image = n2Image + }) + + err := testCtx.MgmtClient.Create(ctx, np) + Expect(err).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name) + GinkgoWriter.Printf("Created NodePool %s at N-2 release %s\n", np.Name, n2Image) + defer cleanupNodePool(ctx, testCtx.MgmtClient, np) + + e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, hc.Spec.Platform.Type) + + // TODO: EnsureNodesLabelsAndTaints requires *testing.T + }) +} + +// NodePoolMirrorConfigsTest creates a KubeletConfig ConfigMap, patches NodePool config, +// verifies the KubeletConfig gets mirrored to the hosted cluster's openshift-config-managed +// namespace, then removes the config and verifies cleanup. Only for 4.18+. 
+func NodePoolMirrorConfigsTest(getTestCtx internal.TestContextGetter) { + It("should mirror KubeletConfig to the hosted cluster and clean up on removal", func() { + testCtx := getTestCtx() + Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite") + + hc := testCtx.GetHostedCluster() + Expect(hc).NotTo(BeNil(), "hosted cluster should be available") + + if e2eutil.IsLessThan(e2eutil.Version418) { + Skip("mirror configs test only applicable for 4.18+") + } + + guestClient := testCtx.GetHostedClusterClient() + Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available") + + ctx := testCtx.Context + + defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc) + Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist") + + var oneReplica int32 = 1 + np := buildTestNodePool(defaultNP, "mirror-cfg", func(pool *hyperv1.NodePool) { + pool.Spec.Replicas = &oneReplica + }) + + err := testCtx.MgmtClient.Create(ctx, np) + Expect(err).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name) + GinkgoWriter.Printf("Created NodePool %s\n", np.Name) + defer cleanupNodePool(ctx, testCtx.MgmtClient, np) + + e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, hc.Spec.Platform.Type) + + kcConfigMap := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: e2eutil.SimpleNameGenerator.GenerateName("kc-test-"), + Namespace: np.Namespace, + }, + Data: map[string]string{configKey: kubeletConfig1YAML}, + } + Expect(testCtx.MgmtClient.Create(ctx, kcConfigMap)).To(Succeed(), "failed to create KubeletConfig ConfigMap") + defer func() { + _ = testCtx.MgmtClient.Delete(ctx, kcConfigMap) + }() + + original := np.DeepCopy() + np.Spec.Config = append(np.Spec.Config, corev1.LocalObjectReference{Name: kcConfigMap.Name}) + Expect(testCtx.MgmtClient.Patch(ctx, np, crclient.MergeFrom(original))).To(Succeed(), + "failed to patch NodePool %s with KubeletConfig", np.Name) + + // Verify mirrored ConfigMap appears in the hosted 
cluster + e2eutil.EventuallyObjects(GinkgoTB(), ctx, "KubeletConfig should be mirrored to the hosted cluster", + func(ctx context.Context) ([]*corev1.ConfigMap, error) { + list := &corev1.ConfigMapList{} + err := guestClient.List(ctx, list, crclient.InNamespace(configManagedNamespace), + crclient.MatchingLabels(map[string]string{ + nodepool.KubeletConfigConfigMapLabel: "true", + hyperv1.NodePoolLabel: np.Name, + })) + configMaps := make([]*corev1.ConfigMap, len(list.Items)) + for i := range list.Items { + configMaps[i] = &list.Items[i] + } + return configMaps, err + }, + []e2eutil.Predicate[[]*corev1.ConfigMap]{ + func(configMaps []*corev1.ConfigMap) (done bool, reasons string, err error) { + want, got := 1, len(configMaps) + return want == got, fmt.Sprintf("expected %d KubeletConfig ConfigMaps, got %d", want, got), nil + }, + }, + []e2eutil.Predicate[*corev1.ConfigMap]{ + func(cm *corev1.ConfigMap) (done bool, reasons string, err error) { + want := netutil.ShortenName(kcConfigMap.Name, np.Name, nodepool.QualifiedNameMaxLength) + if want != cm.Name { + return false, fmt.Sprintf("expected ConfigMap name %q, got %q", want, cm.Name), nil + } + return true, "ConfigMap name is as expected", nil + }, + func(cm *corev1.ConfigMap) (done bool, reasons string, err error) { + if diff := cmp.Diff(map[string]string{ + nodepool.KubeletConfigConfigMapLabel: cm.Labels[nodepool.KubeletConfigConfigMapLabel], + hyperv1.NodePoolLabel: cm.Labels[hyperv1.NodePoolLabel], + nodepool.NTOMirroredConfigLabel: cm.Labels[nodepool.NTOMirroredConfigLabel], + }, map[string]string{ + nodepool.KubeletConfigConfigMapLabel: "true", + hyperv1.NodePoolLabel: np.Name, + nodepool.NTOMirroredConfigLabel: "true", + }); diff != "" { + return false, fmt.Sprintf("incorrect labels: %v", diff), nil + } + return true, "labels are correct", nil + }, + }, + ) + + // Remove KubeletConfig from NodePool and verify cleanup + GinkgoWriter.Printf("Removing KubeletConfig reference from NodePool %s\n", np.Name) + baseNP 
:= np.DeepCopy() + np.Spec = original.Spec + Expect(testCtx.MgmtClient.Patch(ctx, np, crclient.MergeFrom(baseNP))).To(Succeed(), + "failed to remove KubeletConfig from NodePool %s", np.Name) + + e2eutil.EventuallyObjects(GinkgoTB(), ctx, "KubeletConfig ConfigMap to be deleted from hosted cluster", + func(ctx context.Context) ([]*corev1.ConfigMap, error) { + list := &corev1.ConfigMapList{} + err := guestClient.List(ctx, list, crclient.InNamespace(configManagedNamespace), + crclient.MatchingLabels(map[string]string{ + nodepool.KubeletConfigConfigMapLabel: "true", + hyperv1.NodePoolLabel: np.Name, + })) + configMaps := make([]*corev1.ConfigMap, len(list.Items)) + for i := range list.Items { + configMaps[i] = &list.Items[i] + } + return configMaps, err + }, + []e2eutil.Predicate[[]*corev1.ConfigMap]{ + func(configMaps []*corev1.ConfigMap) (done bool, reasons string, err error) { + want, got := 0, len(configMaps) + return want == got, fmt.Sprintf("expected %d KubeletConfig ConfigMaps, got %d", want, got), nil + }, + }, nil, + ) + }) +} + +// NodePoolTrustBundleTest creates an additional trust bundle ConfigMap, updates the +// HostedCluster to reference it, waits for NodePool update cycle, verifies user-ca-bundle +// exists in the hosted cluster, removes the trust bundle, verifies CPO deployment no longer +// mounts it, waits for another update cycle, and verifies user-ca-bundle is deleted (4.22+). 
+func NodePoolTrustBundleTest(getTestCtx internal.TestContextGetter) { + It("should propagate and remove additional trust bundle to/from the hosted cluster", func() { + testCtx := getTestCtx() + Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite") + + e2eutil.GinkgoAtLeast(e2eutil.Version418) + + hc := testCtx.GetHostedCluster() + Expect(hc).NotTo(BeNil(), "hosted cluster should be available") + + guestClient := testCtx.GetHostedClusterClient() + Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available") + + ctx := testCtx.Context + + defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc) + Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist") + + var oneReplica int32 = 1 + np := buildTestNodePool(defaultNP, "trust-bundle", func(pool *hyperv1.NodePool) { + pool.Spec.Replicas = &oneReplica + }) + Expect(testCtx.MgmtClient.Create(ctx, np)).To(Succeed(), "failed to create NodePool %s", np.Name) + GinkgoWriter.Printf("Created NodePool %s for trust bundle test\n", np.Name) + defer cleanupNodePool(ctx, testCtx.MgmtClient, np) + + e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, hc.Spec.Platform.Type) + + // Create additional trust bundle ConfigMap + trustBundle := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: e2eutil.SimpleNameGenerator.GenerateName("trust-bundle-"), + Namespace: hc.Namespace, + }, + Data: map[string]string{"ca-bundle.crt": "dummy"}, + } + Expect(testCtx.MgmtClient.Create(ctx, trustBundle)).To(Succeed(), "failed to create trust bundle ConfigMap") + + // Update HostedCluster to reference the trust bundle + GinkgoWriter.Printf("Updating HostedCluster with additional trust bundle %s\n", trustBundle.Name) + Expect(e2eutil.UpdateObject(GinkgoTB(), ctx, testCtx.MgmtClient, hc, func(obj *hyperv1.HostedCluster) { + obj.Spec.AdditionalTrustBundle = &corev1.LocalObjectReference{Name: trustBundle.Name} + })).To(Succeed(), "failed to update HostedCluster with trust 
bundle") + + // Defer cleanup: remove trust bundle reference from HostedCluster + defer func() { + err := e2eutil.UpdateObject(GinkgoTB(), ctx, testCtx.MgmtClient, hc, func(obj *hyperv1.HostedCluster) { + obj.Spec.AdditionalTrustBundle = nil + }) + if err != nil { + GinkgoWriter.Printf("WARNING: failed to clean up trust bundle reference: %v\n", err) + } + }() + + e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to begin updating", np.Namespace, np.Name), + func(ctx context.Context) (*hyperv1.NodePool, error) { + pool := &hyperv1.NodePool{} + err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(np), pool) + return pool, err + }, + []e2eutil.Predicate[*hyperv1.NodePool]{ + e2eutil.ConditionPredicate[*hyperv1.NodePool](e2eutil.Condition{ + Type: hyperv1.NodePoolUpdatingConfigConditionType, + Status: metav1.ConditionTrue, + }), + }, + e2eutil.WithInterval(10*time.Second), e2eutil.WithTimeout(5*time.Minute), + ) + + e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to stop updating", np.Namespace, np.Name), + func(ctx context.Context) (*hyperv1.NodePool, error) { + pool := &hyperv1.NodePool{} + err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(np), pool) + return pool, err + }, + []e2eutil.Predicate[*hyperv1.NodePool]{ + e2eutil.ConditionPredicate[*hyperv1.NodePool](e2eutil.Condition{ + Type: hyperv1.NodePoolUpdatingConfigConditionType, + Status: metav1.ConditionFalse, + }), + e2eutil.ConditionPredicate[*hyperv1.NodePool](e2eutil.Condition{ + Type: hyperv1.NodePoolAllNodesHealthyConditionType, + Status: metav1.ConditionTrue, + }), + }, + e2eutil.WithInterval(10*time.Second), e2eutil.WithTimeout(20*time.Minute), + ) + + // Verify user-ca-bundle exists in the hosted cluster + userCAConfigMap := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "user-ca-bundle", + Namespace: "openshift-config", + }, + } + e2eutil.EventuallyObject(GinkgoTB(), ctx, "user-ca-bundle to exist in hosted cluster", + 
func(ctx context.Context) (*corev1.ConfigMap, error) { + cm := &corev1.ConfigMap{} + err := guestClient.Get(ctx, crclient.ObjectKeyFromObject(userCAConfigMap), cm) + return cm, err + }, + []e2eutil.Predicate[*corev1.ConfigMap]{ + func(obj *corev1.ConfigMap) (bool, string, error) { return true, "exists", nil }, + }, + e2eutil.WithInterval(10*time.Second), e2eutil.WithTimeout(5*time.Minute), + ) + + // Remove trust bundle from HostedCluster + GinkgoWriter.Printf("Removing additional trust bundle from HostedCluster\n") + Expect(e2eutil.UpdateObject(GinkgoTB(), ctx, testCtx.MgmtClient, hc, func(obj *hyperv1.HostedCluster) { + obj.Spec.AdditionalTrustBundle = nil + })).To(Succeed(), "failed to remove trust bundle from HostedCluster") + + // Verify CPO deployment no longer mounts the trust bundle + cpNamespace := manifests.HostedControlPlaneNamespace(hc.Namespace, hc.Name) + cpoDeployment := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "control-plane-operator", + Namespace: cpNamespace, + }, + } + e2eutil.EventuallyObject(GinkgoTB(), ctx, "CPO deployment to stop mounting trust bundle", + func(ctx context.Context) (*appsv1.Deployment, error) { + deploy := &appsv1.Deployment{} + err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(cpoDeployment), deploy) + return deploy, err + }, + []e2eutil.Predicate[*appsv1.Deployment]{ + func(obj *appsv1.Deployment) (bool, string, error) { + for _, volume := range obj.Spec.Template.Spec.Volumes { + if volume.ConfigMap != nil && volume.ConfigMap.Name == "trusted-ca" { + return false, "trust bundle volume still mounted in CPO", nil + } + } + if ready := podspec.IsDeploymentReady(ctx, obj); !ready { + return false, "CPO deployment is not ready", nil + } + return true, "trust bundle volume removed from CPO", nil + }, + }, + ) + + // Wait for NodePool to cycle again + e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to begin updating after trust bundle removal", np.Namespace, np.Name), + 
func(ctx context.Context) (*hyperv1.NodePool, error) { + pool := &hyperv1.NodePool{} + err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(np), pool) + return pool, err + }, + []e2eutil.Predicate[*hyperv1.NodePool]{ + e2eutil.ConditionPredicate[*hyperv1.NodePool](e2eutil.Condition{ + Type: hyperv1.NodePoolUpdatingConfigConditionType, + Status: metav1.ConditionTrue, + }), + }, + e2eutil.WithInterval(10*time.Second), e2eutil.WithTimeout(5*time.Minute), + ) + + e2eutil.EventuallyObject(GinkgoTB(), ctx, fmt.Sprintf("NodePool %s/%s to stop updating after trust bundle removal", np.Namespace, np.Name), + func(ctx context.Context) (*hyperv1.NodePool, error) { + pool := &hyperv1.NodePool{} + err := testCtx.MgmtClient.Get(ctx, crclient.ObjectKeyFromObject(np), pool) + return pool, err + }, + []e2eutil.Predicate[*hyperv1.NodePool]{ + e2eutil.ConditionPredicate[*hyperv1.NodePool](e2eutil.Condition{ + Type: hyperv1.NodePoolUpdatingConfigConditionType, + Status: metav1.ConditionFalse, + }), + e2eutil.ConditionPredicate[*hyperv1.NodePool](e2eutil.Condition{ + Type: hyperv1.NodePoolAllNodesHealthyConditionType, + Status: metav1.ConditionTrue, + }), + }, + e2eutil.WithInterval(10*time.Second), e2eutil.WithTimeout(20*time.Minute), + ) + + // Verify user-ca-bundle is deleted from the hosted cluster (4.22+) + if e2eutil.IsGreaterThanOrEqualTo(e2eutil.Version422) { + e2eutil.EventuallyNotFound(GinkgoTB(), ctx, guestClient, userCAConfigMap, + e2eutil.WithInterval(10*time.Second), e2eutil.WithTimeout(5*time.Minute), + ) + } + }) +} + +// NodePoolNTOPerformanceProfileTest creates a PerformanceProfile via ConfigMap, +// patches the NodePool's TuningConfig, verifies the PerformanceProfile ConfigMap and +// status ConfigMap are created in the control plane namespace, and verifies cleanup. 
+func NodePoolNTOPerformanceProfileTest(getTestCtx internal.TestContextGetter) { + It("should create and manage NTO PerformanceProfile via NodePool TuningConfig", func() { + testCtx := getTestCtx() + Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite") + + hc := testCtx.GetHostedCluster() + Expect(hc).NotTo(BeNil(), "hosted cluster should be available") + + if hc.Spec.Platform.Type == hyperv1.OpenStackPlatform { + Skip("test is skipped for OpenStack platform until https://issues.redhat.com/browse/OSASINFRA-3566 is addressed") + } + + guestClient := testCtx.GetHostedClusterClient() + Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available") + + ctx := testCtx.Context + + defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc) + Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist") + + var oneReplica int32 = 1 + np := buildTestNodePool(defaultNP, "nto-perfprof", func(pool *hyperv1.NodePool) { + pool.Spec.Replicas = &oneReplica + }) + + err := testCtx.MgmtClient.Create(ctx, np) + Expect(err).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name) + GinkgoWriter.Printf("Created NodePool %s\n", np.Name) + defer cleanupNodePool(ctx, testCtx.MgmtClient, np) + + e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, hc.Spec.Platform.Type) + + ppConfigMap := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: e2eutil.SimpleNameGenerator.GenerateName("pp-test-"), + Namespace: np.Namespace, + }, + Data: map[string]string{tuningConfigKey: performanceProfileYAML}, + } + Expect(testCtx.MgmtClient.Create(ctx, ppConfigMap)).To(Succeed(), "failed to create PerformanceProfile ConfigMap") + defer func() { + _ = testCtx.MgmtClient.Delete(ctx, ppConfigMap) + }() + + original := np.DeepCopy() + np.Spec.TuningConfig = append(np.Spec.TuningConfig, corev1.LocalObjectReference{Name: ppConfigMap.Name}) + Expect(testCtx.MgmtClient.Patch(ctx, np, crclient.MergeFrom(original))).To(Succeed(), + "failed 
to patch NodePool %s with PerformanceProfile config", np.Name) + + cpNamespace := manifests.HostedControlPlaneNamespace(hc.Namespace, hc.Name) + + // Verify PerformanceProfile ConfigMap exists in control plane namespace + e2eutil.EventuallyObjects(GinkgoTB(), ctx, "PerformanceProfile ConfigMap to exist with correct labels", + func(ctx context.Context) ([]*corev1.ConfigMap, error) { + list := &corev1.ConfigMapList{} + err := testCtx.MgmtClient.List(ctx, list, crclient.InNamespace(cpNamespace), + crclient.MatchingLabels(map[string]string{ + nodepool.PerformanceProfileConfigMapLabel: "true", + })) + configMaps := make([]*corev1.ConfigMap, len(list.Items)) + for i := range list.Items { + configMaps[i] = &list.Items[i] + } + return configMaps, err + }, + []e2eutil.Predicate[[]*corev1.ConfigMap]{ + func(configMaps []*corev1.ConfigMap) (done bool, reasons string, err error) { + want, got := 1, len(configMaps) + return want == got, fmt.Sprintf("expected %d PerformanceProfile ConfigMaps, got %d", want, got), nil + }, + }, + []e2eutil.Predicate[*corev1.ConfigMap]{ + func(cm *corev1.ConfigMap) (done bool, reasons string, err error) { + want := netutil.ShortenName(ppConfigMap.Name, np.Name, nodepool.QualifiedNameMaxLength) + if want != cm.Name { + return false, fmt.Sprintf("expected PerformanceProfile ConfigMap name %q, got %q", want, cm.Name), nil + } + return true, "PerformanceProfile ConfigMap name is as expected", nil + }, + func(cm *corev1.ConfigMap) (done bool, reasons string, err error) { + if diff := cmp.Diff(map[string]string{ + nodepool.PerformanceProfileConfigMapLabel: cm.Labels[nodepool.PerformanceProfileConfigMapLabel], + hyperv1.NodePoolLabel: cm.Labels[hyperv1.NodePoolLabel], + }, map[string]string{ + nodepool.PerformanceProfileConfigMapLabel: "true", + hyperv1.NodePoolLabel: np.Name, + }); diff != "" { + return false, fmt.Sprintf("incorrect labels: %v", diff), nil + } + return true, "labels are correct", nil + }, + }, + ) + + // Verify status ConfigMap (4.17+) 
+ if !e2eutil.IsLessThan(e2eutil.Version417) { + e2eutil.EventuallyObjects(GinkgoTB(), ctx, "PerformanceProfile status ConfigMap to exist", + func(ctx context.Context) ([]*corev1.ConfigMap, error) { + list := &corev1.ConfigMapList{} + err := testCtx.MgmtClient.List(ctx, list, crclient.InNamespace(cpNamespace), + crclient.MatchingLabels(map[string]string{ + nodepool.NodeTuningGeneratedPerformanceProfileStatusLabel: "true", + })) + configMaps := make([]*corev1.ConfigMap, len(list.Items)) + for i := range list.Items { + configMaps[i] = &list.Items[i] + } + return configMaps, err + }, + []e2eutil.Predicate[[]*corev1.ConfigMap]{ + func(configMaps []*corev1.ConfigMap) (done bool, reasons string, err error) { + want, got := 1, len(configMaps) + return want == got, fmt.Sprintf("expected %d status ConfigMaps, got %d", want, got), nil + }, + }, + []e2eutil.Predicate[*corev1.ConfigMap]{ + func(cm *corev1.ConfigMap) (done bool, reasons string, err error) { + want := fmt.Sprintf("status-%s", netutil.ShortenName(ppConfigMap.Name, np.Name, nodepool.QualifiedNameMaxLength)) + if want != cm.Name { + return false, fmt.Sprintf("expected status ConfigMap name %q, got %q", want, cm.Name), nil + } + return true, "status ConfigMap name is as expected", nil + }, + }, + ) + } + + // Remove PerformanceProfile from NodePool and verify cleanup + GinkgoWriter.Printf("Removing PerformanceProfile reference from NodePool %s\n", np.Name) + baseNP := np.DeepCopy() + np.Spec = original.Spec + Expect(testCtx.MgmtClient.Patch(ctx, np, crclient.MergeFrom(baseNP))).To(Succeed(), + "failed to remove PerformanceProfile from NodePool %s", np.Name) + + e2eutil.EventuallyObjects(GinkgoTB(), ctx, "PerformanceProfile ConfigMap to be deleted", + func(ctx context.Context) ([]*corev1.ConfigMap, error) { + list := &corev1.ConfigMapList{} + err := testCtx.MgmtClient.List(ctx, list, crclient.InNamespace(cpNamespace), + crclient.MatchingLabels(map[string]string{ + nodepool.PerformanceProfileConfigMapLabel: "true", + 
})) + configMaps := make([]*corev1.ConfigMap, len(list.Items)) + for i := range list.Items { + configMaps[i] = &list.Items[i] + } + return configMaps, err + }, + []e2eutil.Predicate[[]*corev1.ConfigMap]{ + func(configMaps []*corev1.ConfigMap) (done bool, reasons string, err error) { + want, got := 0, len(configMaps) + return want == got, fmt.Sprintf("expected %d PerformanceProfile ConfigMaps, got %d", want, got), nil + }, + }, nil, + ) + }) +} + +// NodePoolAutoRepairTest is a skeleton for platform-specific auto-repair tests. +// The full implementation requires cloud SDK dependencies for instance termination. +func NodePoolAutoRepairTest(getTestCtx internal.TestContextGetter) { + It("should auto-repair a NodePool when a node is terminated", func() { + Skip("auto-repair instance termination not yet implemented for v2 framework") + + testCtx := getTestCtx() + Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite") + + hc := testCtx.GetHostedCluster() + Expect(hc).NotTo(BeNil(), "hosted cluster should be available") + + platform := hc.Spec.Platform.Type + if platform != hyperv1.AWSPlatform && platform != hyperv1.AzurePlatform { + Skip("auto-repair test only supported on AWS and Azure platforms") + } + + guestClient := testCtx.GetHostedClusterClient() + Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available") + + ctx := testCtx.Context + + defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc) + Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist") + + var oneReplica int32 = 1 + np := buildTestNodePool(defaultNP, "autorepair", func(pool *hyperv1.NodePool) { + pool.Spec.Replicas = &oneReplica + pool.Spec.Management.AutoRepair = true + }) + + err := testCtx.MgmtClient.Create(ctx, np) + Expect(err).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name) + GinkgoWriter.Printf("Created auto-repair NodePool %s\n", np.Name) + defer cleanupNodePool(ctx, testCtx.MgmtClient, np) + + 
e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, platform) + + // TODO: Implement cloud-specific instance termination logic. + // For AWS: use EC2 TerminateInstances API to terminate the node's backing instance. + // For Azure: delete the VMSS instance backing the node. + // After termination, wait for the node to be replaced using: + // e2eutil.WaitForReadyNodesByNodePool with WithCollectionPredicates and WithPredicates + // to verify the old node is replaced and the new node is healthy. + }) +} + +// NodePoolDiskEncryptionTest is a skeleton for Azure disk encryption tests. +func NodePoolDiskEncryptionTest(getTestCtx internal.TestContextGetter) { + It("should create a NodePool with Azure DiskEncryptionSet and verify it is applied", func() { + testCtx := getTestCtx() + Expect(testCtx).NotTo(BeNil(), "test context should be set up in BeforeSuite") + + hc := testCtx.GetHostedCluster() + Expect(hc).NotTo(BeNil(), "hosted cluster should be available") + + if hc.Spec.Platform.Type != hyperv1.AzurePlatform { + Skip("disk encryption test only supported on Azure platform") + } + + diskEncryptionSetID := internal.GetEnvVarValue("E2E_AZURE_DISK_ENCRYPTION_SET_ID") + if diskEncryptionSetID == "" { + Skip("E2E_AZURE_DISK_ENCRYPTION_SET_ID not set, skipping disk encryption test") + } + + guestClient := testCtx.GetHostedClusterClient() + Expect(guestClient).NotTo(BeNil(), "hosted cluster client should be available") + + ctx := testCtx.Context + + defaultNP := getDefaultNodePool(ctx, testCtx.MgmtClient, hc) + Expect(defaultNP).NotTo(BeNil(), "default NodePool should exist") + + var oneReplica int32 = 1 + np := buildTestNodePool(defaultNP, "disk-encrypt", func(pool *hyperv1.NodePool) { + pool.Spec.Replicas = &oneReplica + if pool.Spec.Platform.Azure != nil { + pool.Spec.Platform.Azure.OSDisk.EncryptionSetID = diskEncryptionSetID + } + }) + + err := testCtx.MgmtClient.Create(ctx, np) + Expect(err).NotTo(HaveOccurred(), "failed to create NodePool %s", np.Name) + 
GinkgoWriter.Printf("Created disk encryption NodePool %s\n", np.Name) + defer cleanupNodePool(ctx, testCtx.MgmtClient, np) + + e2eutil.WaitForReadyNodesByNodePool(GinkgoTB(), ctx, guestClient, np, hc.Spec.Platform.Type) + + // TODO: Verify disk encryption is applied by checking AzureMachine specs + // in the control plane namespace. This requires importing CAPI Azure types + // (capiazure.AzureMachineList) and verifying DiskEncryptionSetID on each machine. + }) +} + +// Helper functions + +// buildTestNodePool builds a new NodePool from a template with the given name prefix +// and applies the provided mutation function. +func buildTestNodePool(template *hyperv1.NodePool, namePrefix string, mutate func(*hyperv1.NodePool)) *hyperv1.NodePool { + GinkgoHelper() + + name := e2eutil.SimpleNameGenerator.GenerateName(template.Spec.ClusterName + "-" + namePrefix + "-") + np := &hyperv1.NodePool{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: template.Namespace, + }, + } + template.Spec.DeepCopyInto(&np.Spec) + + if mutate != nil { + mutate(np) + } + + return np +} + +// buildMachineConfigVerificationDaemonSet constructs a DaemonSet that verifies +// /etc/custom-config exists on nodes (checks MachineConfig was applied). 
+func buildMachineConfigVerificationDaemonSet(np *hyperv1.NodePool) *appsv1.DaemonSet { + GinkgoHelper() + + dsName := e2eutil.SimpleNameGenerator.GenerateName("mc-verify-") + ds := &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: dsName, + Namespace: "kube-system", + Labels: map[string]string{ + hyperv1.NodePoolLabel: np.Name, + }, + }, + Spec: appsv1.DaemonSetSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "name": dsName, + hyperv1.NodePoolLabel: np.Name, + }, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "name": dsName, + hyperv1.NodePoolLabel: np.Name, + }, + }, + Spec: corev1.PodSpec{ + NodeSelector: map[string]string{ + hyperv1.NodePoolLabel: np.Name, + }, + Tolerations: []corev1.Toleration{{Operator: corev1.TolerationOpExists}}, + Containers: []corev1.Container{{ + Name: dsName, + Image: "registry.access.redhat.com/ubi9/ubi-minimal:latest", + Command: []string{"/bin/sleep", "24h"}, + Resources: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), + corev1.ResourceMemory: resource.MustParse("200Mi"), + }, + }, + ReadinessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + Exec: &corev1.ExecAction{ + Command: []string{"/bin/cat", "/host/etc/custom-config"}, + }, + }, + }, + VolumeMounts: []corev1.VolumeMount{{ + Name: "host", + MountPath: "/host", + ReadOnly: true, + }}, + }}, + TerminationGracePeriodSeconds: ptr.To[int64](30), + Volumes: []corev1.Volume{{ + Name: "host", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{Path: "/"}, + }, + }}, + }, + }, + }, + } + + return ds +} + +// buildNTOVerificationDaemonSet constructs a DaemonSet that verifies hugepages +// are configured on nodes via /proc/cmdline (checks NTO Tuned config was applied). 
+func buildNTOVerificationDaemonSet(np *hyperv1.NodePool) *appsv1.DaemonSet { + GinkgoHelper() + + dsName := e2eutil.SimpleNameGenerator.GenerateName("nto-verify-") + ds := &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: dsName, + Namespace: "kube-system", + Labels: map[string]string{ + hyperv1.NodePoolLabel: np.Name, + }, + }, + Spec: appsv1.DaemonSetSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "name": dsName, + hyperv1.NodePoolLabel: np.Name, + }, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "name": dsName, + hyperv1.NodePoolLabel: np.Name, + }, + }, + Spec: corev1.PodSpec{ + NodeSelector: map[string]string{ + hyperv1.NodePoolLabel: np.Name, + }, + Tolerations: []corev1.Toleration{{Operator: corev1.TolerationOpExists}}, + Containers: []corev1.Container{{ + Name: dsName, + Image: "registry.access.redhat.com/ubi9/ubi-minimal:latest", + Command: []string{"/bin/sleep", "24h"}, + Resources: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), + corev1.ResourceMemory: resource.MustParse("200Mi"), + }, + }, + ReadinessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + Exec: &corev1.ExecAction{ + Command: []string{"/bin/sh", "-c", `cat /proc/cmdline | grep "hugepagesz=2M hugepages=4"`}, + }, + }, + }, + VolumeMounts: []corev1.VolumeMount{{ + Name: "host", + MountPath: "/host", + ReadOnly: true, + }}, + }}, + TerminationGracePeriodSeconds: ptr.To[int64](30), + Volumes: []corev1.Volume{{ + Name: "host", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{Path: "/"}, + }, + }}, + }, + }, + }, + } + + return ds +} + +// waitForDaemonSetRollout polls until the DaemonSet has the expected number of ready pods. 
+func waitForDaemonSetRollout(ctx context.Context, client crclient.Client, ds *appsv1.DaemonSet, expectedCount int, platform hyperv1.PlatformType) { + GinkgoHelper() + + timeout := 15 * time.Minute + if platform == hyperv1.KubevirtPlatform { + timeout = 25 * time.Minute + } + + e2eutil.EventuallyObjects(GinkgoTB(), ctx, fmt.Sprintf("all pods in DaemonSet %s/%s to be ready", ds.Namespace, ds.Name), + func(ctx context.Context) ([]*corev1.Pod, error) { + list := &corev1.PodList{} + err := client.List(ctx, list, crclient.InNamespace(ds.Namespace), crclient.MatchingLabels(ds.Spec.Selector.MatchLabels)) + readyPods := []*corev1.Pod{} + for i := range list.Items { + pod := &list.Items[i] + for _, condition := range pod.Status.Conditions { + if condition.Type == corev1.PodReady && condition.Status == corev1.ConditionTrue { + readyPods = append(readyPods, pod) + break + } + } + } + return readyPods, err + }, + []e2eutil.Predicate[[]*corev1.Pod]{ + func(readyPods []*corev1.Pod) (done bool, reasons string, err error) { + want, got := expectedCount, len(readyPods) + return want == got, fmt.Sprintf("expected %d ready Pods, got %d", want, got), nil + }, + }, nil, + e2eutil.WithTimeout(timeout), + e2eutil.WithInterval(5*time.Second), + ) +} + +// nodePoolUpgradeTimeout returns the appropriate timeout for NodePool upgrades +// based on the platform type. 
+func nodePoolUpgradeTimeout(platform hyperv1.PlatformType) time.Duration { + switch platform { + case hyperv1.AzurePlatform, hyperv1.KubevirtPlatform: + return 45 * time.Minute + default: + return 20 * time.Minute + } +} + +// Constants + +const ( + tuningConfigKey = "tuning" + configKey = "config" + configManagedNamespace = "openshift-config-managed" + + hugepagesTunedYAML = `apiVersion: tuned.openshift.io/v1 +kind: Tuned +metadata: + name: hugepages + namespace: openshift-cluster-node-tuning-operator +spec: + profile: + - data: | + [main] + summary=Boot time configuration for hugepages + include=openshift-node + [bootloader] + cmdline_openshift_node_hugepages=hugepagesz=2M hugepages=4 + name: openshift-hugepages + recommend: + - priority: 20 + profile: openshift-hugepages +` + + kubeletConfig1YAML = ` +apiVersion: machineconfiguration.openshift.io/v1 +kind: KubeletConfig +metadata: + name: set-max-pods +spec: + kubeletConfig: + maxPods: 100 +` + + performanceProfileYAML = ` +apiVersion: performance.openshift.io/v2 +kind: PerformanceProfile +metadata: + name: perfprof-2 +spec: + cpu: + isolated: "1" + reserved: "0" + numa: + topologyPolicy: "single-numa-node" + nodeSelector: + node-role.kubernetes.io/worker-cnf: "" +` +)