diff --git a/api/hypershift/v1beta1/nodepool_types.go b/api/hypershift/v1beta1/nodepool_types.go index dc17f42ee9e..5ebc5378064 100644 --- a/api/hypershift/v1beta1/nodepool_types.go +++ b/api/hypershift/v1beta1/nodepool_types.go @@ -106,7 +106,7 @@ type NodePool struct { // +kubebuilder:validation:XValidation:rule="!has(self.replicas) || !has(self.autoScaling)", message="Both replicas or autoScaling should not be set" // +kubebuilder:validation:XValidation:rule="self.arch != 's390x' || has(self.platform.kubevirt)", message="s390x is only supported on KubeVirt platform" // +kubebuilder:validation:XValidation:rule="!has(self.platform.aws) || !has(self.platform.aws.imageType) || self.platform.aws.imageType != 'Windows' || self.arch == 'amd64'", message="ImageType 'Windows' requires arch 'amd64' (AWS only)" -// +kubebuilder:validation:XValidation:rule="!has(self.autoScaling) || self.autoScaling.min > 0 || self.platform.type == 'AWS'", message="Scale-from-zero (autoScaling.min=0) is currently only supported for AWS platform" +// +kubebuilder:validation:XValidation:rule="!has(self.autoScaling) || self.autoScaling.min > 0 || self.platform.type == 'AWS' || self.platform.type == 'Azure'", message="Scale-from-zero (autoScaling.min=0) is currently only supported for AWS and Azure platforms" type NodePoolSpec struct { // clusterName is the name of the HostedCluster this NodePool belongs to. // If a HostedCluster with this name doesn't exist, the controller will no-op until it exists. @@ -501,7 +501,7 @@ type NodePoolManagement struct { // +kubebuilder:validation:XValidation:rule="self.max >= self.min", message="max must be equal or greater than min" type NodePoolAutoScaling struct { // min is the minimum number of nodes to maintain in the pool. - // Can be set to 0 for scale-from-zero for AWS platform. + // Can be set to 0 for scale-from-zero for AWS and Azure platforms. // Must be >= 0 and <= .Max. // // +kubebuilder:validation:Minimum=0 diff --git a/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/AAA_ungated.yaml b/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/AAA_ungated.yaml index 9463ef0840c..28b0b76bc4a 100644 --- a/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/AAA_ungated.yaml +++ b/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/AAA_ungated.yaml @@ -108,7 +108,7 @@ spec: min: description: |- min is the minimum number of nodes to maintain in the pool. - Can be set to 0 for scale-from-zero for AWS platform. + Can be set to 0 for scale-from-zero for AWS and Azure platforms. Must be >= 0 and <= .Max. format: int32 minimum: 0 @@ -1518,9 +1518,9 @@ spec: rule: '!has(self.platform.aws) || !has(self.platform.aws.imageType) || self.platform.aws.imageType != ''Windows'' || self.arch == ''amd64''' - message: Scale-from-zero (autoScaling.min=0) is currently only supported - for AWS platform + for AWS and Azure platforms rule: '!has(self.autoScaling) || self.autoScaling.min > 0 || self.platform.type - == ''AWS''' + == ''AWS'' || self.platform.type == ''Azure''' status: description: status is the latest observed status of the NodePool. properties: diff --git a/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/GCPPlatform.yaml b/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/GCPPlatform.yaml index ea37b91c13a..2705760fb21 100644 --- a/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/GCPPlatform.yaml +++ b/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/GCPPlatform.yaml @@ -108,7 +108,7 @@ spec: min: description: |- min is the minimum number of nodes to maintain in the pool. - Can be set to 0 for scale-from-zero for AWS platform. + Can be set to 0 for scale-from-zero for AWS and Azure platforms. Must be >= 0 and <= .Max. format: int32 minimum: 0 @@ -1787,9 +1787,9 @@ spec: rule: '!has(self.platform.aws) || !has(self.platform.aws.imageType) || self.platform.aws.imageType != ''Windows'' || self.arch == ''amd64''' - message: Scale-from-zero (autoScaling.min=0) is currently only supported - for AWS platform + for AWS and Azure platforms rule: '!has(self.autoScaling) || self.autoScaling.min > 0 || self.platform.type - == ''AWS''' + == ''AWS'' || self.platform.type == ''Azure''' status: description: status is the latest observed status of the NodePool. properties: diff --git a/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/OpenStack.yaml b/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/OpenStack.yaml index baed37846b3..a7c77a9ab07 100644 --- a/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/OpenStack.yaml +++ b/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/OpenStack.yaml @@ -108,7 +108,7 @@ spec: min: description: |- min is the minimum number of nodes to maintain in the pool. - Can be set to 0 for scale-from-zero for AWS platform. + Can be set to 0 for scale-from-zero for AWS and Azure platforms. Must be >= 0 and <= .Max. format: int32 minimum: 0 @@ -1705,9 +1705,9 @@ spec: rule: '!has(self.platform.aws) || !has(self.platform.aws.imageType) || self.platform.aws.imageType != ''Windows'' || self.arch == ''amd64''' - message: Scale-from-zero (autoScaling.min=0) is currently only supported - for AWS platform + for AWS and Azure platforms rule: '!has(self.autoScaling) || self.autoScaling.min > 0 || self.platform.type - == ''AWS''' + == ''AWS'' || self.platform.type == ''Azure''' status: description: status is the latest observed status of the NodePool. properties: diff --git a/cmd/install/assets/crds/hypershift-operator/tests/nodepools.hypershift.openshift.io/stable.nodepools.autoscaling.testsuite.yaml b/cmd/install/assets/crds/hypershift-operator/tests/nodepools.hypershift.openshift.io/stable.nodepools.autoscaling.testsuite.yaml index 924499411ac..10ee8107523 100644 --- a/cmd/install/assets/crds/hypershift-operator/tests/nodepools.hypershift.openshift.io/stable.nodepools.autoscaling.testsuite.yaml +++ b/cmd/install/assets/crds/hypershift-operator/tests/nodepools.hypershift.openshift.io/stable.nodepools.autoscaling.testsuite.yaml @@ -31,7 +31,7 @@ tests: id: "subnet-01234567" type: AWS - - name: when autoScaling min=0 on Azure platform it should fail + - name: when autoScaling min=0 on Azure platform it should pass initial: | apiVersion: hypershift.openshift.io/v1beta1 kind: NodePool @@ -56,7 +56,6 @@ tests: diskStorageAccountType: Premium_LRS subnetID: "/subscriptions/12345678-1234-5678-9012-123456789012/resourceGroups/test-rg/providers/Microsoft.Network/virtualNetworks/test-vnet/subnets/test-subnet" type: Azure - expectedError: "Scale-from-zero (autoScaling.min=0) is currently only supported for AWS platform" - name: when autoScaling min=0 on Agent platform it should fail initial: | @@ -77,7 +76,7 @@ tests: agent: {} type: Agent - expectedError: "Scale-from-zero (autoScaling.min=0) is currently only supported for AWS platform" + expectedError: "Scale-from-zero (autoScaling.min=0) is currently only supported for AWS and Azure platforms" - name: when autoScaling min=0 on KubeVirt platform it should fail initial: | @@ -101,7 +100,7 @@ tests: persistent: size: 32Gi type: KubeVirt - expectedError: "Scale-from-zero (autoScaling.min=0) is currently only supported for AWS platform" + expectedError: "Scale-from-zero (autoScaling.min=0) is currently only supported for AWS and Azure platforms" - name: when autoScaling min=1 on Azure platform it should pass initial: | diff --git a/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-CustomNoUpgrade.crd.yaml b/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-CustomNoUpgrade.crd.yaml index 055967ac455..4e38e77b912 100644 --- a/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-CustomNoUpgrade.crd.yaml +++ b/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-CustomNoUpgrade.crd.yaml @@ -111,7 +111,7 @@ spec: min: description: |- min is the minimum number of nodes to maintain in the pool. - Can be set to 0 for scale-from-zero for AWS platform. + Can be set to 0 for scale-from-zero for AWS and Azure platforms. Must be >= 0 and <= .Max. format: int32 minimum: 0 @@ -1975,9 +1975,9 @@ spec: rule: '!has(self.platform.aws) || !has(self.platform.aws.imageType) || self.platform.aws.imageType != ''Windows'' || self.arch == ''amd64''' - message: Scale-from-zero (autoScaling.min=0) is currently only supported - for AWS platform + for AWS and Azure platforms rule: '!has(self.autoScaling) || self.autoScaling.min > 0 || self.platform.type - == ''AWS''' + == ''AWS'' || self.platform.type == ''Azure''' status: description: status is the latest observed status of the NodePool. properties: diff --git a/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-Default.crd.yaml b/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-Default.crd.yaml index d317df1b41a..0df74130419 100644 --- a/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-Default.crd.yaml +++ b/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-Default.crd.yaml @@ -111,7 +111,7 @@ spec: min: description: |- min is the minimum number of nodes to maintain in the pool. - Can be set to 0 for scale-from-zero for AWS platform. + Can be set to 0 for scale-from-zero for AWS and Azure platforms. Must be >= 0 and <= .Max. format: int32 minimum: 0 @@ -1521,9 +1521,9 @@ spec: rule: '!has(self.platform.aws) || !has(self.platform.aws.imageType) || self.platform.aws.imageType != ''Windows'' || self.arch == ''amd64''' - message: Scale-from-zero (autoScaling.min=0) is currently only supported - for AWS platform + for AWS and Azure platforms rule: '!has(self.autoScaling) || self.autoScaling.min > 0 || self.platform.type - == ''AWS''' + == ''AWS'' || self.platform.type == ''Azure''' status: description: status is the latest observed status of the NodePool. properties: diff --git a/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-TechPreviewNoUpgrade.crd.yaml b/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-TechPreviewNoUpgrade.crd.yaml index f8650900421..4680973aea3 100644 --- a/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-TechPreviewNoUpgrade.crd.yaml +++ b/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-TechPreviewNoUpgrade.crd.yaml @@ -111,7 +111,7 @@ spec: min: description: |- min is the minimum number of nodes to maintain in the pool. - Can be set to 0 for scale-from-zero for AWS platform. + Can be set to 0 for scale-from-zero for AWS and Azure platforms. Must be >= 0 and <= .Max. format: int32 minimum: 0 @@ -1975,9 +1975,9 @@ spec: rule: '!has(self.platform.aws) || !has(self.platform.aws.imageType) || self.platform.aws.imageType != ''Windows'' || self.arch == ''amd64''' - message: Scale-from-zero (autoScaling.min=0) is currently only supported - for AWS platform + for AWS and Azure platforms rule: '!has(self.autoScaling) || self.autoScaling.min > 0 || self.platform.type - == ''AWS''' + == ''AWS'' || self.platform.type == ''Azure''' status: description: status is the latest observed status of the NodePool. properties: diff --git a/cmd/install/install.go b/cmd/install/install.go index c13fd743ec1..c2b0366ef96 100644 --- a/cmd/install/install.go +++ b/cmd/install/install.go @@ -291,18 +291,15 @@ func (o *Options) validateScaleFromZeroConfig() []error { return nil } var errs []error - supportedProviders := set.New("aws") - // Check mutual exclusivity - only one of file or secret should be provided + supportedProviders := set.New("aws", "azure") if len(o.ScaleFromZeroCreds) != 0 && len(o.ScaleFromZeroCredentialsSecret) != 0 { errs = append(errs, fmt.Errorf("only one of --scale-from-zero-creds or --scale-from-zero-secret is supported")) } - // Provider is required when using scale-from-zero credentials if len(o.ScaleFromZeroProvider) == 0 { errs = append(errs, fmt.Errorf("--scale-from-zero-provider is required when using scale-from-zero credentials")) } else if !supportedProviders.Has(o.ScaleFromZeroProvider) { errs = append(errs, fmt.Errorf("invalid --scale-from-zero-provider: %s (must be one of: %v)", o.ScaleFromZeroProvider, supportedProviders.UnsortedList())) } - // Validate credentials file exists and is accessible if provided if len(o.ScaleFromZeroCreds) > 0 { if _, err := os.Stat(o.ScaleFromZeroCreds); err != nil { if os.IsNotExist(err) { @@ -435,7 +432,7 @@ func NewCommand() *cobra.Command { cmd.PersistentFlags().StringSliceVar(&opts.PlatformsToInstall, "limit-crd-install", opts.PlatformsToInstall, "Used to limit the CRDs that are installed to a per platform basis (example: --limit-crd-install=AWS,Azure). If this flag is not specified, all CRDs for all platforms will be installed. Valid, case-insensitive values are: AWS, Azure, IBMCloud, KubeVirt, Agent, OpenStack, GCP.") cmd.PersistentFlags().StringToStringVar(&opts.AdditionalOperatorEnvVars, "additional-operator-env-vars", opts.AdditionalOperatorEnvVars, "Set of additional environment variables to be set on the HyperShift Operator deployment.") cmd.PersistentFlags().BoolVar(&opts.EnableAuditLogPersistence, "enable-audit-log-persistence", opts.EnableAuditLogPersistence, "If true, enables persistent audit logs with automatic snapshots for kube-apiserver pods") - cmd.PersistentFlags().StringVar(&opts.ScaleFromZeroProvider, "scale-from-zero-provider", opts.ScaleFromZeroProvider, "Platform type for scale-from-zero autoscaling (aws)") + cmd.PersistentFlags().StringVar(&opts.ScaleFromZeroProvider, "scale-from-zero-provider", opts.ScaleFromZeroProvider, "Platform type for scale-from-zero autoscaling (aws, azure)") cmd.PersistentFlags().StringVar(&opts.ScaleFromZeroCreds, "scale-from-zero-creds", opts.ScaleFromZeroCreds, "Path to credentials file for scale-from-zero instance type queries") cmd.PersistentFlags().StringVar(&opts.ScaleFromZeroCredentialsSecret, "scale-from-zero-secret", opts.ScaleFromZeroCredentialsSecret, "Name of existing secret containing scale-from-zero credentials (alternative to --scale-from-zero-creds)") cmd.PersistentFlags().StringVar(&opts.ScaleFromZeroCredentialsSecretKey, "scale-from-zero-secret-key", opts.ScaleFromZeroCredentialsSecretKey, "Key within the scale-from-zero credentials secret (default: credentials)") diff --git a/docs/content/reference/aggregated-docs.md b/docs/content/reference/aggregated-docs.md index e1e9658ac27..34e7f5fb53c 100644 --- a/docs/content/reference/aggregated-docs.md +++ b/docs/content/reference/aggregated-docs.md @@ -48365,7 +48365,7 @@ int32

min is the minimum number of nodes to maintain in the pool. -Can be set to 0 for scale-from-zero for AWS platform. +Can be set to 0 for scale-from-zero for AWS and Azure platforms. Must be >= 0 and <= .Max.

diff --git a/docs/content/reference/api.md b/docs/content/reference/api.md index 828e8531ff1..91284eac765 100644 --- a/docs/content/reference/api.md +++ b/docs/content/reference/api.md @@ -12680,7 +12680,7 @@ int32

min is the minimum number of nodes to maintain in the pool. -Can be set to 0 for scale-from-zero for AWS platform. +Can be set to 0 for scale-from-zero for AWS and Azure platforms. Must be >= 0 and <= .Max.

diff --git a/go.mod b/go.mod index 536ea13a88a..493a7fd6ecb 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,7 @@ require ( github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.1 github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v2 v2.2.0 + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5 v5.7.0 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/dns/armdns v1.2.0 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/msi/armmsi v1.3.0 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v5 v5.2.0 @@ -137,7 +138,6 @@ require ( cloud.google.com/go/compute/metadata v0.9.0 // indirect cyphar.com/go-pathrs v0.2.1 // indirect github.com/Azure/azure-sdk-for-go/sdk/internal v1.12.0 // indirect - github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5 v5.7.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/azsecrets v1.4.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/internal v1.2.0 // indirect github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect diff --git a/hypershift-operator/controllers/nodepool/capi.go b/hypershift-operator/controllers/nodepool/capi.go index 248c7fc468b..907f4ab82d0 100644 --- a/hypershift-operator/controllers/nodepool/capi.go +++ b/hypershift-operator/controllers/nodepool/capi.go @@ -51,7 +51,8 @@ const ( // and let nodepool, hostedcluster, and client be fields of CAPI / interface methods. type CAPI struct { *Token - capiClusterName string + capiClusterName string + scaleFromZeroPlatform hyperv1.PlatformType upsert.ApplyProvider } @@ -472,7 +473,7 @@ func (c *CAPI) reconcileMachineDeployment(ctx context.Context, log logr.Logger, } } - setMachineDeploymentReplicas(nodePool, machineDeployment) + setMachineDeploymentReplicas(nodePool, machineDeployment, c.scaleFromZeroPlatform) if updated := c.propagateVersionAndTemplate(log, machineDeployment, machineTemplateCR); updated { return nil @@ -756,7 +757,7 @@ func (c *CAPI) reconcileMachineHealthCheck(ctx context.Context, // setMachineDeploymentReplicas sets wanted replicas: // If autoscaling is enabled we reconcile min/max annotations and leave replicas untouched. -func setMachineDeploymentReplicas(nodePool *hyperv1.NodePool, machineDeployment *capiv1.MachineDeployment) { +func setMachineDeploymentReplicas(nodePool *hyperv1.NodePool, machineDeployment *capiv1.MachineDeployment, scaleFromZeroPlatform hyperv1.PlatformType) { if machineDeployment.Annotations == nil { machineDeployment.Annotations = make(map[string]string) } @@ -773,7 +774,7 @@ func setMachineDeploymentReplicas(nodePool *hyperv1.NodePool, machineDeployment // NodePools from being permanently stuck at 0 replicas on platforms that don't support // scale-from-zero metadata. effectiveMin := ptr.Deref(nodePool.Spec.AutoScaling.Min, 0) - if effectiveMin == 0 && nodePool.Spec.Platform.Type != hyperv1.AWSPlatform { + if effectiveMin == 0 && nodePool.Spec.Platform.Type != hyperv1.AWSPlatform && nodePool.Spec.Platform.Type != scaleFromZeroPlatform { effectiveMin = 1 } @@ -957,7 +958,7 @@ func (c *CAPI) reconcileMachineSet(ctx context.Context, } machineSet.Spec.Template.Annotations[nodePoolAnnotationTaints] = taintsInJSON - setMachineSetReplicas(nodePool, machineSet) + setMachineSetReplicas(nodePool, machineSet, c.scaleFromZeroPlatform) isUpdating := false // Propagate version and userData Secret to the MachineSet. @@ -1064,7 +1065,7 @@ func machineSetInPlaceRolloutIsComplete(machineSet *capiv1.MachineSet) bool { // setMachineSetReplicas sets wanted replicas: // If autoscaling is enabled we reconcile min/max annotations and leave replicas untouched. -func setMachineSetReplicas(nodePool *hyperv1.NodePool, machineSet *capiv1.MachineSet) { +func setMachineSetReplicas(nodePool *hyperv1.NodePool, machineSet *capiv1.MachineSet, scaleFromZeroPlatform hyperv1.PlatformType) { if machineSet.Annotations == nil { machineSet.Annotations = make(map[string]string) } @@ -1081,7 +1082,7 @@ func setMachineSetReplicas(nodePool *hyperv1.NodePool, machineSet *capiv1.Machin // NodePools from being permanently stuck at 0 replicas on platforms that don't support // scale-from-zero metadata. effectiveMin := ptr.Deref(nodePool.Spec.AutoScaling.Min, 0) - if effectiveMin == 0 && nodePool.Spec.Platform.Type != hyperv1.AWSPlatform { + if effectiveMin == 0 && nodePool.Spec.Platform.Type != hyperv1.AWSPlatform && nodePool.Spec.Platform.Type != scaleFromZeroPlatform { effectiveMin = 1 } diff --git a/hypershift-operator/controllers/nodepool/capi_test.go b/hypershift-operator/controllers/nodepool/capi_test.go index 1fe35dc09f6..4250b5025c8 100644 --- a/hypershift-operator/controllers/nodepool/capi_test.go +++ b/hypershift-operator/controllers/nodepool/capi_test.go @@ -42,6 +42,7 @@ func TestSetMachineSetReplicas(t *testing.T) { name string nodePool *hyperv1.NodePool machineSet *capiv1.MachineSet + scaleFromZeroPlatform hyperv1.PlatformType expectReplicas int32 expectAutoscalerAnnotations map[string]string }{ @@ -178,7 +179,7 @@ func TestSetMachineSetReplicas(t *testing.T) { }, }, { - name: "it enforces min=1 for Azure platform even when NodePool specifies min=0", + name: "it allows min=0 for Azure platform (scale-from-zero)", nodePool: &hyperv1.NodePool{ ObjectMeta: metav1.ObjectMeta{}, Spec: hyperv1.NodePoolSpec{ @@ -199,9 +200,10 @@ func TestSetMachineSetReplicas(t *testing.T) { Replicas: nil, }, }, - expectReplicas: 1, + scaleFromZeroPlatform: hyperv1.AzurePlatform, + expectReplicas: 0, expectAutoscalerAnnotations: map[string]string{ - autoscalerMinAnnotation: "1", + autoscalerMinAnnotation: "0", autoscalerMaxAnnotation: "5", }, }, @@ -266,7 +268,7 @@ func TestSetMachineSetReplicas(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { g := NewWithT(t) - setMachineSetReplicas(tc.nodePool, tc.machineSet) + setMachineSetReplicas(tc.nodePool, tc.machineSet, tc.scaleFromZeroPlatform) g.Expect(*tc.machineSet.Spec.Replicas).To(Equal(tc.expectReplicas)) g.Expect(tc.machineSet.Annotations).To(Equal(tc.expectAutoscalerAnnotations)) }) @@ -279,6 +281,7 @@ func TestSetMachineDeploymentReplicas(t *testing.T) { name string nodePool *hyperv1.NodePool machineDeployment *capiv1.MachineDeployment + scaleFromZeroPlatform hyperv1.PlatformType expectReplicas int32 expectAutoscalerAnnotations map[string]string }{ @@ -504,7 +507,7 @@ func TestSetMachineDeploymentReplicas(t *testing.T) { }, }, { - name: "it enforces min=1 for Azure platform even when NodePool specifies min=0", + name: "it allows min=0 for Azure platform (scale-from-zero)", nodePool: &hyperv1.NodePool{ ObjectMeta: metav1.ObjectMeta{}, Spec: hyperv1.NodePoolSpec{ @@ -525,9 +528,10 @@ func TestSetMachineDeploymentReplicas(t *testing.T) { Replicas: nil, }, }, - expectReplicas: 1, + scaleFromZeroPlatform: hyperv1.AzurePlatform, + expectReplicas: 0, expectAutoscalerAnnotations: map[string]string{ - autoscalerMinAnnotation: "1", + autoscalerMinAnnotation: "0", autoscalerMaxAnnotation: "5", }, }, @@ -592,7 +596,7 @@ func TestSetMachineDeploymentReplicas(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { g := NewWithT(t) - setMachineDeploymentReplicas(tc.nodePool, tc.machineDeployment) + setMachineDeploymentReplicas(tc.nodePool, tc.machineDeployment, tc.scaleFromZeroPlatform) g.Expect(*tc.machineDeployment.Spec.Replicas).To(Equal(tc.expectReplicas)) g.Expect(tc.machineDeployment.Annotations).To(Equal(tc.expectAutoscalerAnnotations)) }) diff --git a/hypershift-operator/controllers/nodepool/conditions.go b/hypershift-operator/controllers/nodepool/conditions.go index cff009e3590..7b5d710620c 100644 --- a/hypershift-operator/controllers/nodepool/conditions.go +++ b/hypershift-operator/controllers/nodepool/conditions.go @@ -181,9 +181,7 @@ func (r *NodePoolReconciler) autoscalerEnabledCondition(_ context.Context, nodeP // Check platform-specific support var supported bool switch nodePool.Spec.Platform.Type { - case hyperv1.AWSPlatform: - // AWS supports scale-from-zero either natively (when CPO supports it) - // or via MachineDeployment controller workaround annotations + case hyperv1.AWSPlatform, hyperv1.AzurePlatform: supported = true default: // Other platforms don't support autoscaling from zero yet diff --git a/hypershift-operator/controllers/nodepool/instancetype/azure/provider.go b/hypershift-operator/controllers/nodepool/instancetype/azure/provider.go new file mode 100644 index 00000000000..57dd7a499ba --- /dev/null +++ b/hypershift-operator/controllers/nodepool/instancetype/azure/provider.go @@ -0,0 +1,168 @@ +package azure + +import ( + "context" + "fmt" + "math" + "strconv" + "strings" + "sync" + + hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" + "github.com/openshift/hypershift/hypershift-operator/controllers/nodepool/instancetype" + + azruntime "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5" +) + +// ResourceSKUsAPI defines the operations used from armcompute.ResourceSKUsClient. +type ResourceSKUsAPI interface { + NewListPager(options *armcompute.ResourceSKUsClientListOptions) *azruntime.Pager[armcompute.ResourceSKUsClientListResponse] +} + +// Compile-time check that the real client satisfies our interface. +var _ ResourceSKUsAPI = (*armcompute.ResourceSKUsClient)(nil) + +// Provider implements the instancetype.Provider interface for Azure. +// It queries the Azure Resource SKUs API to get VM size specifications. +type Provider struct { + skuClient ResourceSKUsAPI + location string + cache map[string]*instancetype.InstanceTypeInfo + mu sync.Mutex +} + +// NewProvider creates a new Azure instance type provider. +func NewProvider(skuClient ResourceSKUsAPI, location string) *Provider { + return &Provider{ + skuClient: skuClient, + location: location, + } +} + +// GetInstanceTypeInfo queries Azure Resource SKUs API for VM size specifications. +func (p *Provider) GetInstanceTypeInfo(ctx context.Context, instanceType string) (*instancetype.InstanceTypeInfo, error) { + p.mu.Lock() + defer p.mu.Unlock() + + if p.cache == nil { + if err := p.loadSKUs(ctx); err != nil { + return nil, fmt.Errorf("failed to load Azure Resource SKUs: %w", err) + } + } + + info, ok := p.cache[instanceType] + if !ok { + return nil, fmt.Errorf("VM size %q not found in Azure Resource SKUs for location %q", instanceType, p.location) + } + + copied := *info + return &copied, nil +} + +func (p *Provider) loadSKUs(ctx context.Context) error { + nextCache := make(map[string]*instancetype.InstanceTypeInfo) + + filter := fmt.Sprintf("location eq '%s'", p.location) + pager := p.skuClient.NewListPager(&armcompute.ResourceSKUsClientListOptions{ + Filter: &filter, + }) + + for pager.More() { + page, err := pager.NextPage(ctx) + if err != nil { + return fmt.Errorf("failed to list Azure Resource SKUs: %w", err) + } + + for _, sku := range page.Value { + if sku.ResourceType == nil || !strings.EqualFold(*sku.ResourceType, "virtualMachines") { + continue + } + + info, err := transformSKU(sku) + if err != nil { + continue + } + nextCache[info.InstanceType] = info + } + } + + p.cache = nextCache + return nil +} + +func transformSKU(sku *armcompute.ResourceSKU) (*instancetype.InstanceTypeInfo, error) { + if sku.Name == nil || *sku.Name == "" { + return nil, fmt.Errorf("SKU name is missing") + } + + name := *sku.Name + info := &instancetype.InstanceTypeInfo{ + InstanceType: name, + } + + vcpuStr, ok := getCapabilityValue(sku.Capabilities, "vCPUs") + if !ok { + return nil, fmt.Errorf("missing vCPUs capability for VM size %q", name) + } + vcpu, err := strconv.ParseInt(vcpuStr, 10, 32) + if err != nil { + return nil, fmt.Errorf("invalid vCPUs value %q for VM size %q: %w", vcpuStr, name, err) + } + if vcpu <= 0 { + return nil, fmt.Errorf("invalid vCPUs count %d for VM size %q", vcpu, name) + } + info.VCPU = int32(vcpu) + + memStr, ok := getCapabilityValue(sku.Capabilities, "MemoryGB") + if !ok { + return nil, fmt.Errorf("missing MemoryGB capability for VM size %q", name) + } + memGB, err := strconv.ParseFloat(memStr, 64) + if err != nil { + return nil, fmt.Errorf("invalid MemoryGB value %q for VM size %q: %w", memStr, name, err) + } + if memGB <= 0 { + return nil, fmt.Errorf("invalid MemoryGB value %v for VM size %q", memGB, name) + } + info.MemoryMb = int64(math.Round(memGB * 1024)) + + gpuStr, ok := getCapabilityValue(sku.Capabilities, "GPUs") + if ok { + gpu, err := strconv.ParseInt(gpuStr, 10, 32) + if err != nil { + return nil, fmt.Errorf("invalid GPUs value %q for VM size %q: %w", gpuStr, name, err) + } + if gpu < 0 { + return nil, fmt.Errorf("negative GPUs count %d for VM size %q", gpu, name) + } + info.GPU = int32(gpu) + } + + archStr, ok := getCapabilityValue(sku.Capabilities, "CpuArchitectureType") + if !ok { + return nil, fmt.Errorf("missing CpuArchitectureType capability for VM size %q", name) + } + switch strings.ToLower(archStr) { + case "x64": + info.CPUArchitecture = hyperv1.ArchitectureAMD64 + case "arm64": + info.CPUArchitecture = hyperv1.ArchitectureARM64 + default: + return nil, fmt.Errorf("unsupported CPU architecture %q for VM size %q", archStr, name) + } + + return info, nil +} + +func getCapabilityValue(capabilities []*armcompute.ResourceSKUCapabilities, name string) (string, bool) { + for _, cap := range capabilities { + if cap.Name != nil && *cap.Name == name { + if cap.Value != nil { + return *cap.Value, true + } + return "", false + } + } + return "", false +} diff --git a/hypershift-operator/controllers/nodepool/instancetype/azure/provider_test.go b/hypershift-operator/controllers/nodepool/instancetype/azure/provider_test.go new file mode 100644 index 00000000000..04a6dfafe9a --- /dev/null +++ b/hypershift-operator/controllers/nodepool/instancetype/azure/provider_test.go @@ -0,0 +1,424 @@ +package azure + +import ( + "context" + "fmt" + "testing" + + . "github.com/onsi/gomega" + + hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" + "github.com/openshift/hypershift/hypershift-operator/controllers/nodepool/instancetype" + + azruntime "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5" +) + +type mockResourceSKUsAPI struct { + skus []*armcompute.ResourceSKU + err error +} + +func (m *mockResourceSKUsAPI) NewListPager(_ *armcompute.ResourceSKUsClientListOptions) *azruntime.Pager[armcompute.ResourceSKUsClientListResponse] { + return azruntime.NewPager(azruntime.PagingHandler[armcompute.ResourceSKUsClientListResponse]{ + More: func(page armcompute.ResourceSKUsClientListResponse) bool { + return false + }, + Fetcher: func(ctx context.Context, page *armcompute.ResourceSKUsClientListResponse) (armcompute.ResourceSKUsClientListResponse, error) { + if m.err != nil { + return armcompute.ResourceSKUsClientListResponse{}, m.err + } + return armcompute.ResourceSKUsClientListResponse{ + ResourceSKUsResult: armcompute.ResourceSKUsResult{ + Value: m.skus, + }, + }, nil + }, + }) +} + +func makeSKU(name, resourceType string, capabilities map[string]string) *armcompute.ResourceSKU { + sku := &armcompute.ResourceSKU{ + Name: to.Ptr(name), + ResourceType: to.Ptr(resourceType), + } + for k, v := range capabilities { + sku.Capabilities = append(sku.Capabilities, &armcompute.ResourceSKUCapabilities{ + Name: to.Ptr(k), + Value: to.Ptr(v), + }) + } + return sku +} + +func TestTransformSKU_WhenValidInput_ItShouldTransformCorrectly(t *testing.T) { + tests := []struct { + name string + input *armcompute.ResourceSKU + expected *instancetype.InstanceTypeInfo + }{ + { + name: "When Standard_D4s_v3 with x64 arch it should transform correctly", + input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "4", + "MemoryGB": "16", + "CpuArchitectureType": "x64", + }), + expected: &instancetype.InstanceTypeInfo{ + InstanceType: "Standard_D4s_v3", + VCPU: 4, + MemoryMb: 16384, + GPU: 0, + CPUArchitecture: hyperv1.ArchitectureAMD64, + }, + }, + { + name: "When GPU VM it should set GPU count", + input: makeSKU("Standard_NC16as_T4_v3", "virtualMachines", map[string]string{ + "vCPUs": "16", + "MemoryGB": "110", + "GPUs": "1", + "CpuArchitectureType": "x64", + }), + expected: &instancetype.InstanceTypeInfo{ + InstanceType: "Standard_NC16as_T4_v3", + VCPU: 16, + MemoryMb: 112640, + GPU: 1, + CPUArchitecture: hyperv1.ArchitectureAMD64, + }, + }, + { + name: "When Arm64 VM it should set correct architecture", + input: makeSKU("Standard_D4ps_v5", "virtualMachines", map[string]string{ + "vCPUs": "4", + "MemoryGB": "16", + "CpuArchitectureType": "Arm64", + }), + expected: &instancetype.InstanceTypeInfo{ + InstanceType: "Standard_D4ps_v5", + VCPU: 4, + MemoryMb: 16384, + GPU: 0, + CPUArchitecture: hyperv1.ArchitectureARM64, + }, + }, + { + name: "When GPUs capability is absent it should default to 0", + input: makeSKU("Standard_B2s", "virtualMachines", map[string]string{ + "vCPUs": "2", + "MemoryGB": "4", + "CpuArchitectureType": "x64", + }), + expected: &instancetype.InstanceTypeInfo{ + InstanceType: "Standard_B2s", + VCPU: 2, + MemoryMb: 4096, + GPU: 0, + CPUArchitecture: hyperv1.ArchitectureAMD64, + }, + }, + { + name: "When MemoryGB is fractional it should convert correctly", + input: makeSKU("Standard_B1ls", "virtualMachines", map[string]string{ + "vCPUs": "1", + "MemoryGB": "0.5", + "CpuArchitectureType": "x64", + }), + expected: &instancetype.InstanceTypeInfo{ + InstanceType: "Standard_B1ls", + VCPU: 1, + MemoryMb: 512, + GPU: 0, + CPUArchitecture: hyperv1.ArchitectureAMD64, + }, + }, + { + name: "When MemoryGB is large it should convert correctly", + input: makeSKU("Standard_M416ms_v2", "virtualMachines", map[string]string{ + "vCPUs": "416", + "MemoryGB": "11400", + "CpuArchitectureType": "x64", + }), + expected: &instancetype.InstanceTypeInfo{ + InstanceType: "Standard_M416ms_v2", + VCPU: 416, + MemoryMb: 11673600, + GPU: 0, + CPUArchitecture: hyperv1.ArchitectureAMD64, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewGomegaWithT(t) + result, err := transformSKU(tt.input) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(result).To(Equal(tt.expected)) + }) + } +} + +func TestTransformSKU_WhenMissingRequiredFields_ItShouldReturnError(t *testing.T) { + tests := []struct { + name string + input *armcompute.ResourceSKU + expectedError string + }{ + { + name: "When SKU name is nil it should return error", + input: &armcompute.ResourceSKU{ + Name: nil, + ResourceType: to.Ptr("virtualMachines"), + Capabilities: []*armcompute.ResourceSKUCapabilities{ + {Name: to.Ptr("vCPUs"), Value: to.Ptr("4")}, + }, + }, + expectedError: "SKU name is missing", + }, + { + name: "When vCPUs capability is missing it should return error", + input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "MemoryGB": "16", + "CpuArchitectureType": "x64", + }), + expectedError: "missing vCPUs capability", + }, + { + name: "When MemoryGB capability is missing it should return error", + input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "4", + "CpuArchitectureType": "x64", + }), + expectedError: "missing MemoryGB capability", + }, + { + name: "When CpuArchitectureType capability is missing it should return error", + input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "4", + "MemoryGB": "16", + }), + expectedError: "missing CpuArchitectureType capability", + }, + { + name: "When vCPUs value is not a valid integer it should return error", + input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "abc", + "MemoryGB": "16", + "CpuArchitectureType": "x64", + }), + expectedError: "invalid vCPUs value", + }, + { + name: "When MemoryGB value is not a valid float it should return error", + input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "4", + "MemoryGB": "xyz", + "CpuArchitectureType": "x64", + }), + expectedError: "invalid MemoryGB value", + }, + { + name: "When vCPUs value is zero it should return error", + input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "0", + "MemoryGB": "16", + "CpuArchitectureType": "x64", + }), + expectedError: "invalid vCPUs count", + }, + { + name: "When MemoryGB value is zero it should return error", + input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "4", + "MemoryGB": "0", + "CpuArchitectureType": "x64", + }), + expectedError: "invalid MemoryGB value", + }, + { + name: "When CpuArchitectureType is unsupported it should return error", + input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "4", + "MemoryGB": "16", + "CpuArchitectureType": "i386", + }), + expectedError: "unsupported CPU architecture", + }, + { + name: "When GPUs value is not a valid integer it should return error", + input: makeSKU("Standard_NC6", "virtualMachines", map[string]string{ + "vCPUs": "6", + "MemoryGB": "56", + "GPUs": "abc", + "CpuArchitectureType": "x64", + }), + expectedError: "invalid GPUs value", + }, + { + name: "When GPUs value is negative it should return error", + input: makeSKU("Standard_NC6", "virtualMachines", map[string]string{ + "vCPUs": "6", + "MemoryGB": "56", + "GPUs": "-1", + "CpuArchitectureType": "x64", + }), + expectedError: "negative GPUs count", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewGomegaWithT(t) + _, err := transformSKU(tt.input) + g.Expect(err).To(HaveOccurred()) + g.Expect(err.Error()).To(ContainSubstring(tt.expectedError)) + }) + } +} + +func TestGetInstanceTypeInfo(t *testing.T) { + tests := []struct { + name string + skus []*armcompute.ResourceSKU + apiErr error + instanceType string + expected *instancetype.InstanceTypeInfo + expectedError string + }{ + { + name: "When VM size exists it should return info", + skus: []*armcompute.ResourceSKU{ + makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "4", "MemoryGB": "16", "CpuArchitectureType": "x64", + }), + }, + instanceType: "Standard_D4s_v3", + expected: &instancetype.InstanceTypeInfo{ + InstanceType: "Standard_D4s_v3", + VCPU: 4, + MemoryMb: 16384, + GPU: 0, + CPUArchitecture: hyperv1.ArchitectureAMD64, + }, + }, + { + name: "When VM size not found it should return error", + skus: []*armcompute.ResourceSKU{ + makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "4", "MemoryGB": "16", "CpuArchitectureType": "x64", + }), + }, + instanceType: "Standard_Nonexistent", + expectedError: "not found", + }, + { + name: "When API returns error it should propagate error", + apiErr: fmt.Errorf("API error: throttling"), + instanceType: "Standard_D4s_v3", + expectedError: "failed to load Azure Resource SKUs", + }, + { + name: "When SKU has matching name but wrong ResourceType it should return not found", + skus: []*armcompute.ResourceSKU{ + makeSKU("Standard_D4s_v3", "disks", map[string]string{ + "vCPUs": "4", "MemoryGB": "16", "CpuArchitectureType": "x64", + }), + }, + instanceType: "Standard_D4s_v3", + expectedError: "not found", + }, + { + name: "When multiple SKUs returned it should match only virtualMachines type", + skus: []*armcompute.ResourceSKU{ + makeSKU("Standard_D4s_v3", "disks", map[string]string{ + "vCPUs": "99", "MemoryGB": "99", "CpuArchitectureType": "x64", + }), + makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "4", "MemoryGB": "16", "CpuArchitectureType": "x64", + }), + }, + instanceType: "Standard_D4s_v3", + expected: &instancetype.InstanceTypeInfo{ + InstanceType: "Standard_D4s_v3", + VCPU: 4, + MemoryMb: 16384, + GPU: 0, + CPUArchitecture: hyperv1.ArchitectureAMD64, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewGomegaWithT(t) + mock := &mockResourceSKUsAPI{skus: tt.skus, err: tt.apiErr} + provider := NewProvider(mock, "eastus") + result, err := provider.GetInstanceTypeInfo(context.Background(), tt.instanceType) + + if tt.expectedError != "" { + g.Expect(err).To(HaveOccurred()) + g.Expect(err.Error()).To(ContainSubstring(tt.expectedError)) + } else { + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(result).To(Equal(tt.expected)) + } + }) + } +} + +func TestGetCapabilityValue(t *testing.T) { + tests := []struct { + name string + capabilities []*armcompute.ResourceSKUCapabilities + capName string + expectedVal string + expectedOK bool + }{ + { + name: "When capability exists it should return the value", + capabilities: []*armcompute.ResourceSKUCapabilities{ + {Name: to.Ptr("vCPUs"), Value: to.Ptr("4")}, + }, + capName: "vCPUs", + expectedVal: "4", + expectedOK: true, + }, + { + name: "When capability does not exist it should return empty and false", + capabilities: []*armcompute.ResourceSKUCapabilities{ + {Name: to.Ptr("vCPUs"), Value: to.Ptr("4")}, + }, + capName: "GPUs", + expectedVal: "", + expectedOK: false, + }, + { + name: "When capabilities slice is nil it should return empty and false", + capabilities: nil, + capName: "vCPUs", + expectedVal: "", + expectedOK: false, + }, + { + name: "When capability name has different case it should not match", + capabilities: []*armcompute.ResourceSKUCapabilities{ + {Name: to.Ptr("vcpus"), Value: to.Ptr("4")}, + }, + capName: "vCPUs", + expectedVal: "", + expectedOK: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewGomegaWithT(t) + val, ok := getCapabilityValue(tt.capabilities, tt.capName) + g.Expect(ok).To(Equal(tt.expectedOK)) + g.Expect(val).To(Equal(tt.expectedVal)) + }) + } +} diff --git a/hypershift-operator/controllers/nodepool/nodepool_controller.go b/hypershift-operator/controllers/nodepool/nodepool_controller.go index 8a550972e4a..2bfca1845ef 100644 --- a/hypershift-operator/controllers/nodepool/nodepool_controller.go +++ b/hypershift-operator/controllers/nodepool/nodepool_controller.go @@ -104,6 +104,7 @@ type NodePoolReconciler struct { KubevirtInfraClients kvinfra.KubevirtInfraClientMap EC2Client awsapi.EC2API InstanceTypeProvider instancetype.Provider + ScaleFromZeroPlatform hyperv1.PlatformType } type NotReadyError struct { @@ -385,6 +386,7 @@ func (r *NodePoolReconciler) reconcile(ctx context.Context, hcluster *hyperv1.Ho if err != nil { return ctrl.Result{}, err } + capi.scaleFromZeroPlatform = r.ScaleFromZeroPlatform if isPaused, duration := supportutil.IsReconciliationPaused(log, nodePool.Spec.PausedUntil); isPaused { if err := capi.Pause(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("error pausing CAPI: %w", err) @@ -426,7 +428,7 @@ func (r *NodePoolReconciler) reconcile(ctx context.Context, hcluster *hyperv1.Ho // Set scale-from-zero annotations if provider is configured and platform is supported // This works for both Replace (MachineDeployment) and InPlace (MachineSet) upgrade types - if isAutoscalingEnabled(nodePool) && r.InstanceTypeProvider != nil && supportedScaleFromZeroPlatform(nodePool.Spec.Platform.Type) { + if isAutoscalingEnabled(nodePool) && r.InstanceTypeProvider != nil && r.ScaleFromZeroPlatform == nodePool.Spec.Platform.Type { if err = r.reconcileScaleFromZeroAnnotations(ctx, nodePool, capi); err != nil { log.Error(err, "Failed to set scale-from-zero annotations, will retry") return ctrl.Result{RequeueAfter: 30 * time.Second}, nil @@ -436,11 +438,6 @@ func (r *NodePoolReconciler) reconcile(ctx context.Context, hcluster *hyperv1.Ho return ctrl.Result{}, nil } -// supportedScaleFromZeroPlatform checks if the platform supports scale-from-zero functionality. -func supportedScaleFromZeroPlatform(platform hyperv1.PlatformType) bool { - return platform == hyperv1.AWSPlatform -} - func (r *NodePoolReconciler) token(ctx context.Context, hcluster *hyperv1.HostedCluster, nodePool *hyperv1.NodePool) (*Token, error) { // Validate and get releaseImage. releaseImage, err := r.getReleaseImage(ctx, hcluster, nodePool.Status.Version, nodePool.Spec.Release.Image) @@ -1260,16 +1257,15 @@ func (r *NodePoolReconciler) reconcileScaleFromZeroAnnotations(ctx context.Conte } machineTemplate = awsMachineTemplate - // Future platform support can be added here: - // case hyperv1.AzurePlatform: - // azureTemplate := &capiazure.AzureMachineTemplate{} - // if err := capi.getExistingMachineTemplate(ctx, azureTemplate); err != nil { - // if apierrors.IsNotFound(err) { - // return nil - // } - // return fmt.Errorf("failed to get AzureMachineTemplate: %w", err) - // } - // machineTemplate = azureTemplate + case hyperv1.AzurePlatform: + azureTemplate := &capiazure.AzureMachineTemplate{} + if err := capi.getExistingMachineTemplate(ctx, azureTemplate); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to get AzureMachineTemplate: %w", err) + } + machineTemplate = azureTemplate default: return fmt.Errorf("unsupported platform for scale-from-zero: %s", nodePool.Spec.Platform.Type) diff --git a/hypershift-operator/controllers/nodepool/scale_from_zero.go b/hypershift-operator/controllers/nodepool/scale_from_zero.go index 5d24335bc19..b67c2a4ca65 100644 --- a/hypershift-operator/controllers/nodepool/scale_from_zero.go +++ b/hypershift-operator/controllers/nodepool/scale_from_zero.go @@ -27,6 +27,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2" + capiazure "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -80,10 +81,8 @@ func setScaleFromZeroAnnotationsOnObject(ctx context.Context, provider instancet case *infrav1.AWSMachineTemplate: instanceType = template.Spec.Template.Spec.InstanceType statusCapacity = template.Status.Capacity - // Future platform support can be added here: - // case *capiazure.AzureMachineTemplate: - // instanceType = template.Spec.Template.Spec.VMSize - // statusCapacity = template.Status.Capacity + case *capiazure.AzureMachineTemplate: + instanceType = template.Spec.Template.Spec.VMSize default: return fmt.Errorf("unsupported machine template type: %T", machineTemplate) } diff --git a/hypershift-operator/controllers/nodepool/scale_from_zero_test.go b/hypershift-operator/controllers/nodepool/scale_from_zero_test.go index bec100f788b..ae16ff36060 100644 --- a/hypershift-operator/controllers/nodepool/scale_from_zero_test.go +++ b/hypershift-operator/controllers/nodepool/scale_from_zero_test.go @@ -15,6 +15,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2" + capiazure "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1" capiv1 "sigs.k8s.io/cluster-api/api/core/v1beta1" ) @@ -90,6 +91,16 @@ func TestSetScaleFromZeroAnnotationsOnObject(t *testing.T) { } } + newAzureTemplate := func(vmSize string) *capiazure.AzureMachineTemplate { + return &capiazure.AzureMachineTemplate{ + Spec: capiazure.AzureMachineTemplateSpec{ + Template: capiazure.AzureMachineTemplateResource{ + Spec: capiazure.AzureMachineSpec{VMSize: vmSize}, + }, + }, + } + } + tests := []struct { name string provider instancetype.Provider @@ -200,6 +211,66 @@ func TestSetScaleFromZeroAnnotationsOnObject(t *testing.T) { g.Expect(a).ToNot(HaveKey(taintsKey)) }, }, + { + name: "When Azure template with valid VMSize and no GPU it should set basic annotations", + provider: &mockProvider{info: &instancetype.InstanceTypeInfo{ + VCPU: 4, MemoryMb: 16384, GPU: 0, CPUArchitecture: "amd64", + }}, + nodePool: &hyperv1.NodePool{}, + object: &capiv1.MachineDeployment{}, + machineTemplate: newAzureTemplate("Standard_D4s_v3"), + expectErr: false, + validate: func(g Gomega, md *capiv1.MachineDeployment) { + a := md.GetAnnotations() + g.Expect(a).To(HaveKeyWithValue(cpuKey, "4")) + g.Expect(a).To(HaveKeyWithValue(memoryKey, "16384")) + g.Expect(a).To(HaveKeyWithValue(labelsKey, "kubernetes.io/arch=amd64")) + g.Expect(a).ToNot(HaveKey(gpuKey)) + }, + }, + { + name: "When Azure template with empty VMSize it should return error", + provider: &mockProvider{}, + nodePool: &hyperv1.NodePool{}, + object: &capiv1.MachineDeployment{}, + machineTemplate: newAzureTemplate(""), + expectErr: true, + errSubstring: "instanceType is empty", + }, + { + name: "When Azure template with nil provider it should skip annotations", + provider: nil, + nodePool: &hyperv1.NodePool{}, + object: &capiv1.MachineDeployment{}, + machineTemplate: newAzureTemplate("Standard_D4s_v3"), + expectErr: false, + validate: func(g Gomega, md *capiv1.MachineDeployment) { + g.Expect(md.GetAnnotations()).ToNot(HaveKey(cpuKey)) + }, + }, + { + name: "When Azure template with GPU and taints it should set all annotations", + provider: &mockProvider{info: &instancetype.InstanceTypeInfo{ + VCPU: 6, MemoryMb: 114688, GPU: 1, CPUArchitecture: "amd64", + }}, + nodePool: &hyperv1.NodePool{ + Spec: hyperv1.NodePoolSpec{ + Taints: []hyperv1.Taint{ + {Key: "dedicated", Value: "gpu", Effect: corev1.TaintEffectNoSchedule}, + }, + }, + }, + object: &capiv1.MachineDeployment{}, + machineTemplate: newAzureTemplate("Standard_NC6s_v3"), + expectErr: false, + validate: func(g Gomega, md *capiv1.MachineDeployment) { + a := md.GetAnnotations() + g.Expect(a).To(HaveKeyWithValue(cpuKey, "6")) + g.Expect(a).To(HaveKeyWithValue(memoryKey, "114688")) + g.Expect(a).To(HaveKeyWithValue(gpuKey, "1")) + g.Expect(a).To(HaveKeyWithValue(taintsKey, "dedicated=gpu:NoSchedule")) + }, + }, { name: "When instance has GPU, labels with arch override, taints, and existing annotations it should set all correctly", provider: &mockProvider{info: &instancetype.InstanceTypeInfo{ diff --git a/hypershift-operator/main.go b/hypershift-operator/main.go index a663242bf0f..680084874e4 100644 --- a/hypershift-operator/main.go +++ b/hypershift-operator/main.go @@ -17,6 +17,7 @@ package main import ( "context" "crypto/tls" + "encoding/json" "fmt" "os" "strings" @@ -35,6 +36,7 @@ import ( "github.com/openshift/hypershift/hypershift-operator/controllers/nodepool" "github.com/openshift/hypershift/hypershift-operator/controllers/nodepool/instancetype" awsinstancetype "github.com/openshift/hypershift/hypershift-operator/controllers/nodepool/instancetype/aws" + azureinstancetype "github.com/openshift/hypershift/hypershift-operator/controllers/nodepool/instancetype/azure" npmetrics "github.com/openshift/hypershift/hypershift-operator/controllers/nodepool/metrics" "github.com/openshift/hypershift/hypershift-operator/controllers/platform/aws" azureplatform "github.com/openshift/hypershift/hypershift-operator/controllers/platform/azure" @@ -71,6 +73,7 @@ import ( "github.com/Azure/azure-sdk-for-go/sdk/azcore" "github.com/Azure/azure-sdk-for-go/sdk/azcore/cloud" "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v5" admissionregistrationv1 "k8s.io/api/admissionregistration/v1" @@ -351,7 +354,7 @@ func validateStartOptions(opts *StartOptions, log logr.Logger) error { return fmt.Errorf("--etcd-backup-max-count must be at least 1, got %d", opts.EtcdBackupMaxCount) } - supportedProviders := set.New("aws") + supportedProviders := set.New("aws", "azure") if opts.ScaleFromZeroCreds != "" { if opts.ScaleFromZeroProvider == "" { return fmt.Errorf("--scale-from-zero-provider is required when using --scale-from-zero-creds") @@ -596,6 +599,8 @@ func setupEC2Client(ctx context.Context, opts *StartOptions) awsapi.EC2API { func setupNodePoolController(ctx context.Context, mgr ctrl.Manager, opts *StartOptions, operatorImage string, createOrUpdate upsert.CreateOrUpdateProvider, registryProvider globalconfig.CommonRegistryProvider, ec2Client awsapi.EC2API, log logr.Logger) error { var instanceTypeProvider instancetype.Provider + var scaleFromZeroPlatform hyperv1.PlatformType + if opts.ScaleFromZeroCreds != "" && opts.ScaleFromZeroProvider != "" { switch strings.ToLower(opts.ScaleFromZeroProvider) { case "aws": @@ -605,7 +610,65 @@ func setupNodePoolController(ctx context.Context, mgr ctrl.Manager, opts *StartO o.Retryer = awsConfig() }) instanceTypeProvider = awsinstancetype.NewProvider(scaleFromZeroEC2Client) + scaleFromZeroPlatform = hyperv1.AWSPlatform log.Info("Instance type provider initialized", "provider", opts.ScaleFromZeroProvider) + case "azure": + raw, err := os.ReadFile(opts.ScaleFromZeroCreds) + if err != nil { + return fmt.Errorf("failed to read Azure scale-from-zero credentials: %w", err) + } + var azureCreds struct { + SubscriptionID string `json:"subscriptionId"` + ClientID string `json:"clientId"` + ClientSecret string `json:"clientSecret"` + TenantID string `json:"tenantId"` + Location string `json:"location"` + } + if err := json.Unmarshal(raw, &azureCreds); err != nil { + return fmt.Errorf("failed to parse Azure scale-from-zero credentials: %w", err) + } + var missing []string + if azureCreds.SubscriptionID == "" { + missing = append(missing, "subscriptionId") + } + if azureCreds.ClientID == "" { + missing = append(missing, "clientId") + } + if azureCreds.ClientSecret == "" { + missing = append(missing, "clientSecret") + } + if azureCreds.TenantID == "" { + missing = append(missing, "tenantId") + } + if azureCreds.Location == "" { + missing = append(missing, "location") + } + if len(missing) > 0 { + return fmt.Errorf("azure scale-from-zero credentials missing required fields: %s", strings.Join(missing, ", ")) + } + azureCloudName := os.Getenv("AZURE_CLOUD_NAME") + if azureCloudName == "" { + azureCloudName = config.DefaultAzureCloud + } + cloudConfig, err := azureutil.GetAzureCloudConfiguration(azureCloudName) + if err != nil { + return fmt.Errorf("failed to get Azure cloud configuration for scale-from-zero: %w", err) + } + cred, err := azidentity.NewClientSecretCredential(azureCreds.TenantID, azureCreds.ClientID, azureCreds.ClientSecret, + &azidentity.ClientSecretCredentialOptions{ + ClientOptions: azcore.ClientOptions{Cloud: cloudConfig}, + }, + ) + if err != nil { + return fmt.Errorf("failed to create Azure credentials for scale-from-zero: %w", err) + } + skuClient, err := armcompute.NewResourceSKUsClient(azureCreds.SubscriptionID, cred, azureutil.NewARMClientOptions(cloudConfig)) + if err != nil { + return fmt.Errorf("failed to create Azure ResourceSKUs client: %w", err) + } + instanceTypeProvider = azureinstancetype.NewProvider(skuClient, azureCreds.Location) + scaleFromZeroPlatform = hyperv1.AzurePlatform + log.Info("Instance type provider initialized", "provider", opts.ScaleFromZeroProvider, "location", azureCreds.Location) default: log.Info("WARNING: Unsupported scale-from-zero provider", "provider", opts.ScaleFromZeroProvider) } @@ -620,6 +683,7 @@ func setupNodePoolController(ctx context.Context, mgr ctrl.Manager, opts *StartO KubevirtInfraClients: kvinfra.NewKubevirtInfraClientMap(), EC2Client: ec2Client, InstanceTypeProvider: instanceTypeProvider, + ScaleFromZeroPlatform: scaleFromZeroPlatform, }).SetupWithManager(mgr); err != nil { return fmt.Errorf("unable to create controller: %w", err) } diff --git a/test/e2e/autoscaling_test.go b/test/e2e/autoscaling_test.go index 62f5c07374b..270af8cd991 100644 --- a/test/e2e/autoscaling_test.go +++ b/test/e2e/autoscaling_test.go @@ -675,8 +675,8 @@ func testAutoscalerRespectsNodePoolPause(ctx context.Context, mgtClient crclient } func TestNodePoolAutoscalingScaleFromZero(t *testing.T) { - if globalOpts.Platform != hyperv1.AWSPlatform { - t.Skip("test only supported on platform AWS") + if globalOpts.Platform != hyperv1.AWSPlatform && globalOpts.Platform != hyperv1.AzurePlatform { + t.Skip("test only supported on AWS and Azure platforms") } // Get management client to check for scale-from-zero secret diff --git a/test/e2e/nodepool_test.go b/test/e2e/nodepool_test.go index 6e40796a160..f72452ab793 100644 --- a/test/e2e/nodepool_test.go +++ b/test/e2e/nodepool_test.go @@ -12,10 +12,12 @@ import ( hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" "github.com/openshift/hypershift/support/conditions" e2eutil "github.com/openshift/hypershift/test/e2e/util" + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" + crclient "sigs.k8s.io/controller-runtime/pkg/client" . "github.com/onsi/gomega" diff --git a/vendor/github.com/openshift/hypershift/api/hypershift/v1beta1/nodepool_types.go b/vendor/github.com/openshift/hypershift/api/hypershift/v1beta1/nodepool_types.go index dc17f42ee9e..5ebc5378064 100644 --- a/vendor/github.com/openshift/hypershift/api/hypershift/v1beta1/nodepool_types.go +++ b/vendor/github.com/openshift/hypershift/api/hypershift/v1beta1/nodepool_types.go @@ -106,7 +106,7 @@ type NodePool struct { // +kubebuilder:validation:XValidation:rule="!has(self.replicas) || !has(self.autoScaling)", message="Both replicas or autoScaling should not be set" // +kubebuilder:validation:XValidation:rule="self.arch != 's390x' || has(self.platform.kubevirt)", message="s390x is only supported on KubeVirt platform" // +kubebuilder:validation:XValidation:rule="!has(self.platform.aws) || !has(self.platform.aws.imageType) || self.platform.aws.imageType != 'Windows' || self.arch == 'amd64'", message="ImageType 'Windows' requires arch 'amd64' (AWS only)" -// +kubebuilder:validation:XValidation:rule="!has(self.autoScaling) || self.autoScaling.min > 0 || self.platform.type == 'AWS'", message="Scale-from-zero (autoScaling.min=0) is currently only supported for AWS platform" +// +kubebuilder:validation:XValidation:rule="!has(self.autoScaling) || self.autoScaling.min > 0 || self.platform.type == 'AWS' || self.platform.type == 'Azure'", message="Scale-from-zero (autoScaling.min=0) is currently only supported for AWS and Azure platforms" type NodePoolSpec struct { // clusterName is the name of the HostedCluster this NodePool belongs to. // If a HostedCluster with this name doesn't exist, the controller will no-op until it exists. @@ -501,7 +501,7 @@ type NodePoolManagement struct { // +kubebuilder:validation:XValidation:rule="self.max >= self.min", message="max must be equal or greater than min" type NodePoolAutoScaling struct { // min is the minimum number of nodes to maintain in the pool. - // Can be set to 0 for scale-from-zero for AWS platform. + // Can be set to 0 for scale-from-zero for AWS and Azure platforms. // Must be >= 0 and <= .Max. // // +kubebuilder:validation:Minimum=0