From 1a9eec353b1e8d5b6ac77bbca12824ff082777af Mon Sep 17 00:00:00 2001 From: Divyam pateriya Date: Mon, 1 Jun 2026 00:14:59 +0530 Subject: [PATCH] OCPBUGS-86798: spot MHC not created when autoRepair=true and ignition endpoint not reached The spot-specific MachineHealthCheck was placed after the autoRepair gate in CAPI.Reconcile(). When autoRepair is true and the ignition endpoint has not been reached, the function returns early, preventing the spot MHC from being created. This causes a permanent deadlock: spot instances that fail to provision never get remediated because the MHC (which would detect the stuck Machine via NodeStartupTimeout) is never created, and the ignition endpoint is never reached because no node ever joins. Move the spot MHC reconciliation before the autoRepair gate so it is always created when spot instances are enabled, regardless of the autoRepair/ignition state. Co-authored-by: Cursor --- .../controllers/nodepool/capi.go | 50 ++-- .../controllers/nodepool/capi_test.go | 230 ++++++++++++++++++ 2 files changed, 256 insertions(+), 24 deletions(-) diff --git a/hypershift-operator/controllers/nodepool/capi.go b/hypershift-operator/controllers/nodepool/capi.go index f2965b8a555..e8e4591b28a 100644 --- a/hypershift-operator/controllers/nodepool/capi.go +++ b/hypershift-operator/controllers/nodepool/capi.go @@ -151,6 +151,32 @@ func (c *CAPI) Reconcile(ctx context.Context) error { } } + // Reconcile spot-specific MachineHealthCheck when spot instances are enabled. + // This is placed before the autoRepair gate because the spot MHC must exist + // independently of autoRepair — it serves as a safety net for spot instance + // failures even when the ignition endpoint has not been reached yet. + spotMHC := c.spotMachineHealthCheck() + if isSpotEnabled(nodePool) { + if result, err := ctrl.CreateOrUpdate(ctx, c.Client, spotMHC, func() error { + return c.reconcileSpotMachineHealthCheck(ctx, spotMHC) + }); err != nil { + return fmt.Errorf("failed to reconcile spot MachineHealthCheck %q: %w", + client.ObjectKeyFromObject(spotMHC).String(), err) + } else { + log.Info("Reconciled spot MachineHealthCheck", "result", result) + } + } else { + err := c.Get(ctx, client.ObjectKeyFromObject(spotMHC), spotMHC) + if err != nil && !apierrors.IsNotFound(err) { + return err + } + if err == nil { + if err := c.Delete(ctx, spotMHC); err != nil && !apierrors.IsNotFound(err) { + return err + } + } + } + mhc := c.machineHealthCheck() if nodePool.Spec.Management.AutoRepair { if c := FindStatusCondition(nodePool.Status.Conditions, hyperv1.NodePoolReachedIgnitionEndpoint); c == nil || c.Status != corev1.ConditionTrue { @@ -190,30 +216,6 @@ func (c *CAPI) Reconcile(ctx context.Context) error { }) } - // Reconcile spot-specific MachineHealthCheck when spot instances are enabled - spotMHC := c.spotMachineHealthCheck() - if isSpotEnabled(nodePool) { - if result, err := ctrl.CreateOrUpdate(ctx, c.Client, spotMHC, func() error { - return c.reconcileSpotMachineHealthCheck(ctx, spotMHC) - }); err != nil { - return fmt.Errorf("failed to reconcile spot MachineHealthCheck %q: %w", - client.ObjectKeyFromObject(spotMHC).String(), err) - } else { - log.Info("Reconciled spot MachineHealthCheck", "result", result) - } - } else { - // Delete spot MHC if spot is not enabled - err := c.Get(ctx, client.ObjectKeyFromObject(spotMHC), spotMHC) - if err != nil && !apierrors.IsNotFound(err) { - return err - } - if err == nil { - if err := c.Delete(ctx, spotMHC); err != nil && !apierrors.IsNotFound(err) { - return err - } - } - } - return nil } diff --git a/hypershift-operator/controllers/nodepool/capi_test.go b/hypershift-operator/controllers/nodepool/capi_test.go index cafeaf17922..8ec303f396c 100644 --- a/hypershift-operator/controllers/nodepool/capi_test.go +++ b/hypershift-operator/controllers/nodepool/capi_test.go @@ -3279,3 +3279,233 @@ func TestNewCAPI(t *testing.T) { }) } } + +func TestSpotMHCCreatedIndependentlyOfAutoRepairIgnitionGate(t *testing.T) { + t.Parallel() + + maxUnavailable := intstr.FromInt(0) + maxSurge := intstr.FromInt(1) + awsMachineTemplateName := "test-nodepool-28d5cf5a" + capiClusterName := "infra-id" + + tests := []struct { + name string + autoRepair bool + ignitionReached bool + spotEnabled bool + expectSpotMHC bool + expectRegularMHC bool + expectAutoRepairStatus corev1.ConditionStatus + }{ + { + name: "When spot is enabled and autoRepair is true and ignition is NOT reached it should create spot MHC and not create regular MHC", + autoRepair: true, + ignitionReached: false, + spotEnabled: true, + expectSpotMHC: true, + expectRegularMHC: false, + expectAutoRepairStatus: "", + }, + { + name: "When spot is enabled and autoRepair is true and ignition is reached it should create both MHCs", + autoRepair: true, + ignitionReached: true, + spotEnabled: true, + expectSpotMHC: true, + expectRegularMHC: true, + expectAutoRepairStatus: corev1.ConditionTrue, + }, + { + name: "When spot is enabled and autoRepair is false it should create spot MHC and not create regular MHC", + autoRepair: false, + ignitionReached: false, + spotEnabled: true, + expectSpotMHC: true, + expectRegularMHC: false, + expectAutoRepairStatus: corev1.ConditionFalse, + }, + { + name: "When spot is NOT enabled and autoRepair is true and ignition is NOT reached it should not create any MHC", + autoRepair: true, + ignitionReached: false, + spotEnabled: false, + expectSpotMHC: false, + expectRegularMHC: false, + expectAutoRepairStatus: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + annotations := map[string]string{} + if tt.spotEnabled { + annotations[AnnotationEnableSpot] = "true" + } + + var conditions []hyperv1.NodePoolCondition + if tt.ignitionReached { + conditions = append(conditions, hyperv1.NodePoolCondition{ + Type: hyperv1.NodePoolReachedIgnitionEndpoint, + Status: corev1.ConditionTrue, + }) + } else { + conditions = append(conditions, hyperv1.NodePoolCondition{ + Type: hyperv1.NodePoolReachedIgnitionEndpoint, + Status: corev1.ConditionFalse, + }) + } + + nodePool := &hyperv1.NodePool{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-nodepool", + Namespace: "test-namespace", + Annotations: annotations, + }, + Spec: hyperv1.NodePoolSpec{ + ClusterName: "test-cluster", + Management: hyperv1.NodePoolManagement{ + UpgradeType: hyperv1.UpgradeTypeReplace, + Replace: &hyperv1.ReplaceUpgrade{ + Strategy: hyperv1.UpgradeStrategyRollingUpdate, + RollingUpdate: &hyperv1.RollingUpdate{ + MaxUnavailable: &maxUnavailable, + MaxSurge: &maxSurge, + }, + }, + AutoRepair: tt.autoRepair, + }, + Replicas: ptr.To[int32](1), + Platform: hyperv1.NodePoolPlatform{ + Type: hyperv1.AWSPlatform, + AWS: &hyperv1.AWSNodePoolPlatform{ + AMI: "an-ami", + }, + }, + }, + Status: hyperv1.NodePoolStatus{ + Conditions: conditions, + }, + } + + hostedCluster := &hyperv1.HostedCluster{ + ObjectMeta: metav1.ObjectMeta{Name: "test-cluster", Namespace: "test-namespace"}, + Spec: hyperv1.HostedClusterSpec{ + Platform: hyperv1.PlatformSpec{ + Type: hyperv1.AWSPlatform, + AWS: &hyperv1.AWSPlatformSpec{ + Region: "", + CloudProviderConfig: &hyperv1.AWSCloudProviderConfig{}, + ServiceEndpoints: []hyperv1.AWSServiceEndpoint{}, + RolesRef: hyperv1.AWSRolesRef{}, + ResourceTags: []hyperv1.AWSResourceTag{}, + EndpointAccess: "", + AdditionalAllowedPrincipals: []string{}, + MultiArch: false, + }, + }, + }, + } + + machineSet := &capiv1.MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-machineset", + Namespace: "test-namespace-test-cluster", + Annotations: map[string]string{ + nodePoolAnnotation: "test-namespace/test-nodepool", + }, + }, + Spec: capiv1.MachineSetSpec{ + Template: capiv1.MachineTemplateSpec{ + Spec: capiv1.MachineSpec{ + InfrastructureRef: corev1.ObjectReference{ + Kind: "AWSMachineTemplate", + APIVersion: "infrastructure.cluster.x-k8s.io/v1beta2", + Namespace: "test-namespace-test-cluster", + Name: awsMachineTemplateName, + }, + }, + }, + }, + } + + template := &capiaws.AWSMachineTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: awsMachineTemplateName, + Namespace: "test-namespace-test-cluster", + Annotations: map[string]string{ + nodePoolAnnotation: "test-namespace/test-nodepool", + }, + }, + } + + controlpaneNamespace := manifests.HostedControlPlaneNamespace(hostedCluster.Namespace, hostedCluster.Name) + c := fake.NewClientBuilder(). + WithScheme(api.Scheme). + WithObjects(nodePool, machineSet, template). + Build() + + capi := &CAPI{ + Token: &Token{ + ConfigGenerator: &ConfigGenerator{ + Client: c, + hostedCluster: hostedCluster, + nodePool: nodePool, + controlplaneNamespace: controlpaneNamespace, + rolloutConfig: &rolloutConfig{ + releaseImage: &releaseinfo.ReleaseImage{ + ImageStream: &imageapi.ImageStream{ + ObjectMeta: metav1.ObjectMeta{ + Name: "target-version", + }, + }, + }, + }, + }, + cpoCapabilities: &CPOCapabilities{}, + CreateOrUpdateProvider: upsert.New(false), + }, + capiClusterName: capiClusterName, + ApplyProvider: upsert.NewApplyProvider(false), + } + + err := capi.Reconcile(t.Context()) + g.Expect(err).NotTo(HaveOccurred()) + + spotMHC := &capiv1.MachineHealthCheck{} + spotMHCErr := c.Get(t.Context(), client.ObjectKey{ + Namespace: controlpaneNamespace, + Name: nodePool.GetName() + "-spot", + }, spotMHC) + + if tt.expectSpotMHC { + g.Expect(spotMHCErr).NotTo(HaveOccurred(), "spot MHC should exist") + g.Expect(spotMHC.Spec.Selector.MatchLabels).To(HaveKeyWithValue(interruptibleInstanceLabel, "")) + g.Expect(spotMHC.Spec.MaxUnhealthy.String()).To(Equal("100%")) + } else { + g.Expect(apierrors.IsNotFound(spotMHCErr)).To(BeTrue(), "spot MHC should not exist") + } + + regularMHC := &capiv1.MachineHealthCheck{} + regularMHCErr := c.Get(t.Context(), client.ObjectKey{ + Namespace: controlpaneNamespace, + Name: nodePool.GetName(), + }, regularMHC) + + if tt.expectRegularMHC { + g.Expect(regularMHCErr).NotTo(HaveOccurred(), "regular MHC should exist") + g.Expect(regularMHC.Spec.ClusterName).To(Equal(capiClusterName)) + } else { + g.Expect(apierrors.IsNotFound(regularMHCErr)).To(BeTrue(), "regular MHC should not exist") + } + + if tt.expectAutoRepairStatus != "" { + cond := FindStatusCondition(nodePool.Status.Conditions, hyperv1.NodePoolAutorepairEnabledConditionType) + g.Expect(cond).NotTo(BeNil()) + g.Expect(cond.Status).To(Equal(tt.expectAutoRepairStatus)) + } + }) + } +}