diff --git a/hypershift-operator/controllers/nodepool/capi.go b/hypershift-operator/controllers/nodepool/capi.go index f2965b8a555..e8e4591b28a 100644 --- a/hypershift-operator/controllers/nodepool/capi.go +++ b/hypershift-operator/controllers/nodepool/capi.go @@ -151,6 +151,32 @@ func (c *CAPI) Reconcile(ctx context.Context) error { } } + // Reconcile spot-specific MachineHealthCheck when spot instances are enabled. + // This is placed before the autoRepair gate because the spot MHC must exist + // independently of autoRepair — it serves as a safety net for spot instance + // failures even when the ignition endpoint has not been reached yet. + spotMHC := c.spotMachineHealthCheck() + if isSpotEnabled(nodePool) { + if result, err := ctrl.CreateOrUpdate(ctx, c.Client, spotMHC, func() error { + return c.reconcileSpotMachineHealthCheck(ctx, spotMHC) + }); err != nil { + return fmt.Errorf("failed to reconcile spot MachineHealthCheck %q: %w", + client.ObjectKeyFromObject(spotMHC).String(), err) + } else { + log.Info("Reconciled spot MachineHealthCheck", "result", result) + } + } else { + err := c.Get(ctx, client.ObjectKeyFromObject(spotMHC), spotMHC) + if err != nil && !apierrors.IsNotFound(err) { + return err + } + if err == nil { + if err := c.Delete(ctx, spotMHC); err != nil && !apierrors.IsNotFound(err) { + return err + } + } + } + mhc := c.machineHealthCheck() if nodePool.Spec.Management.AutoRepair { if c := FindStatusCondition(nodePool.Status.Conditions, hyperv1.NodePoolReachedIgnitionEndpoint); c == nil || c.Status != corev1.ConditionTrue { @@ -190,30 +216,6 @@ func (c *CAPI) Reconcile(ctx context.Context) error { }) } - // Reconcile spot-specific MachineHealthCheck when spot instances are enabled - spotMHC := c.spotMachineHealthCheck() - if isSpotEnabled(nodePool) { - if result, err := ctrl.CreateOrUpdate(ctx, c.Client, spotMHC, func() error { - return c.reconcileSpotMachineHealthCheck(ctx, spotMHC) - }); err != nil { - return fmt.Errorf("failed to reconcile spot MachineHealthCheck %q: %w", - client.ObjectKeyFromObject(spotMHC).String(), err) - } else { - log.Info("Reconciled spot MachineHealthCheck", "result", result) - } - } else { - // Delete spot MHC if spot is not enabled - err := c.Get(ctx, client.ObjectKeyFromObject(spotMHC), spotMHC) - if err != nil && !apierrors.IsNotFound(err) { - return err - } - if err == nil { - if err := c.Delete(ctx, spotMHC); err != nil && !apierrors.IsNotFound(err) { - return err - } - } - } - return nil } diff --git a/hypershift-operator/controllers/nodepool/capi_test.go b/hypershift-operator/controllers/nodepool/capi_test.go index cafeaf17922..8ec303f396c 100644 --- a/hypershift-operator/controllers/nodepool/capi_test.go +++ b/hypershift-operator/controllers/nodepool/capi_test.go @@ -3279,3 +3279,233 @@ func TestNewCAPI(t *testing.T) { }) } } + +func TestSpotMHCCreatedIndependentlyOfAutoRepairIgnitionGate(t *testing.T) { + t.Parallel() + + maxUnavailable := intstr.FromInt(0) + maxSurge := intstr.FromInt(1) + awsMachineTemplateName := "test-nodepool-28d5cf5a" + capiClusterName := "infra-id" + + tests := []struct { + name string + autoRepair bool + ignitionReached bool + spotEnabled bool + expectSpotMHC bool + expectRegularMHC bool + expectAutoRepairStatus corev1.ConditionStatus + }{ + { + name: "When spot is enabled and autoRepair is true and ignition is NOT reached it should create spot MHC and not create regular MHC", + autoRepair: true, + ignitionReached: false, + spotEnabled: true, + expectSpotMHC: true, + expectRegularMHC: false, + expectAutoRepairStatus: "", + }, + { + name: "When spot is enabled and autoRepair is true and ignition is reached it should create both MHCs", + autoRepair: true, + ignitionReached: true, + spotEnabled: true, + expectSpotMHC: true, + expectRegularMHC: true, + expectAutoRepairStatus: corev1.ConditionTrue, + }, + { + name: "When spot is enabled and autoRepair is false it should create spot MHC and not create regular MHC", + autoRepair: false, + ignitionReached: false, + spotEnabled: true, + expectSpotMHC: true, + expectRegularMHC: false, + expectAutoRepairStatus: corev1.ConditionFalse, + }, + { + name: "When spot is NOT enabled and autoRepair is true and ignition is NOT reached it should not create any MHC", + autoRepair: true, + ignitionReached: false, + spotEnabled: false, + expectSpotMHC: false, + expectRegularMHC: false, + expectAutoRepairStatus: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + + annotations := map[string]string{} + if tt.spotEnabled { + annotations[AnnotationEnableSpot] = "true" + } + + var conditions []hyperv1.NodePoolCondition + if tt.ignitionReached { + conditions = append(conditions, hyperv1.NodePoolCondition{ + Type: hyperv1.NodePoolReachedIgnitionEndpoint, + Status: corev1.ConditionTrue, + }) + } else { + conditions = append(conditions, hyperv1.NodePoolCondition{ + Type: hyperv1.NodePoolReachedIgnitionEndpoint, + Status: corev1.ConditionFalse, + }) + } + + nodePool := &hyperv1.NodePool{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-nodepool", + Namespace: "test-namespace", + Annotations: annotations, + }, + Spec: hyperv1.NodePoolSpec{ + ClusterName: "test-cluster", + Management: hyperv1.NodePoolManagement{ + UpgradeType: hyperv1.UpgradeTypeReplace, + Replace: &hyperv1.ReplaceUpgrade{ + Strategy: hyperv1.UpgradeStrategyRollingUpdate, + RollingUpdate: &hyperv1.RollingUpdate{ + MaxUnavailable: &maxUnavailable, + MaxSurge: &maxSurge, + }, + }, + AutoRepair: tt.autoRepair, + }, + Replicas: ptr.To[int32](1), + Platform: hyperv1.NodePoolPlatform{ + Type: hyperv1.AWSPlatform, + AWS: &hyperv1.AWSNodePoolPlatform{ + AMI: "an-ami", + }, + }, + }, + Status: hyperv1.NodePoolStatus{ + Conditions: conditions, + }, + } + + hostedCluster := &hyperv1.HostedCluster{ + ObjectMeta: metav1.ObjectMeta{Name: "test-cluster", Namespace: "test-namespace"}, + Spec: hyperv1.HostedClusterSpec{ + Platform: hyperv1.PlatformSpec{ + Type: hyperv1.AWSPlatform, + AWS: &hyperv1.AWSPlatformSpec{ + Region: "", + CloudProviderConfig: &hyperv1.AWSCloudProviderConfig{}, + ServiceEndpoints: []hyperv1.AWSServiceEndpoint{}, + RolesRef: hyperv1.AWSRolesRef{}, + ResourceTags: []hyperv1.AWSResourceTag{}, + EndpointAccess: "", + AdditionalAllowedPrincipals: []string{}, + MultiArch: false, + }, + }, + }, + } + + machineSet := &capiv1.MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-machineset", + Namespace: "test-namespace-test-cluster", + Annotations: map[string]string{ + nodePoolAnnotation: "test-namespace/test-nodepool", + }, + }, + Spec: capiv1.MachineSetSpec{ + Template: capiv1.MachineTemplateSpec{ + Spec: capiv1.MachineSpec{ + InfrastructureRef: corev1.ObjectReference{ + Kind: "AWSMachineTemplate", + APIVersion: "infrastructure.cluster.x-k8s.io/v1beta2", + Namespace: "test-namespace-test-cluster", + Name: awsMachineTemplateName, + }, + }, + }, + }, + } + + template := &capiaws.AWSMachineTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: awsMachineTemplateName, + Namespace: "test-namespace-test-cluster", + Annotations: map[string]string{ + nodePoolAnnotation: "test-namespace/test-nodepool", + }, + }, + } + + controlpaneNamespace := manifests.HostedControlPlaneNamespace(hostedCluster.Namespace, hostedCluster.Name) + c := fake.NewClientBuilder(). + WithScheme(api.Scheme). + WithObjects(nodePool, machineSet, template). + Build() + + capi := &CAPI{ + Token: &Token{ + ConfigGenerator: &ConfigGenerator{ + Client: c, + hostedCluster: hostedCluster, + nodePool: nodePool, + controlplaneNamespace: controlpaneNamespace, + rolloutConfig: &rolloutConfig{ + releaseImage: &releaseinfo.ReleaseImage{ + ImageStream: &imageapi.ImageStream{ + ObjectMeta: metav1.ObjectMeta{ + Name: "target-version", + }, + }, + }, + }, + }, + cpoCapabilities: &CPOCapabilities{}, + CreateOrUpdateProvider: upsert.New(false), + }, + capiClusterName: capiClusterName, + ApplyProvider: upsert.NewApplyProvider(false), + } + + err := capi.Reconcile(t.Context()) + g.Expect(err).NotTo(HaveOccurred()) + + spotMHC := &capiv1.MachineHealthCheck{} + spotMHCErr := c.Get(t.Context(), client.ObjectKey{ + Namespace: controlpaneNamespace, + Name: nodePool.GetName() + "-spot", + }, spotMHC) + + if tt.expectSpotMHC { + g.Expect(spotMHCErr).NotTo(HaveOccurred(), "spot MHC should exist") + g.Expect(spotMHC.Spec.Selector.MatchLabels).To(HaveKeyWithValue(interruptibleInstanceLabel, "")) + g.Expect(spotMHC.Spec.MaxUnhealthy.String()).To(Equal("100%")) + } else { + g.Expect(apierrors.IsNotFound(spotMHCErr)).To(BeTrue(), "spot MHC should not exist") + } + + regularMHC := &capiv1.MachineHealthCheck{} + regularMHCErr := c.Get(t.Context(), client.ObjectKey{ + Namespace: controlpaneNamespace, + Name: nodePool.GetName(), + }, regularMHC) + + if tt.expectRegularMHC { + g.Expect(regularMHCErr).NotTo(HaveOccurred(), "regular MHC should exist") + g.Expect(regularMHC.Spec.ClusterName).To(Equal(capiClusterName)) + } else { + g.Expect(apierrors.IsNotFound(regularMHCErr)).To(BeTrue(), "regular MHC should not exist") + } + + if tt.expectAutoRepairStatus != "" { + cond := FindStatusCondition(nodePool.Status.Conditions, hyperv1.NodePoolAutorepairEnabledConditionType) + g.Expect(cond).NotTo(BeNil()) + g.Expect(cond.Status).To(Equal(tt.expectAutoRepairStatus)) + } + }) + } +}