From 57e21a57e8851790fd4cf5254f36ef14b8106d79 Mon Sep 17 00:00:00 2001 From: Jesse Jaggars Date: Fri, 24 Apr 2026 21:08:22 +0000 Subject: [PATCH 1/7] feat: CNTRLPLANE-2262: Add Azure scale-from-zero support Extend the existing scale-from-zero autoscaling framework to support Azure by implementing an Azure instance type provider that queries the Azure Resource SKUs API for VM size specifications and writing capacity annotations on MachineDeployments. Changes: - Add Azure instancetype.Provider using armcompute.ResourceSKUsClient - Add AzureMachineTemplate case to scale_from_zero.go type switch - Extend supportedScaleFromZeroPlatform() for Azure - Extend reconcileScaleFromZeroAnnotations() for Azure - Update autoscalerEnabledCondition() to accept Azure with min=0 - Update effectiveMin guard in capi.go to allow min=0 for Azure - Add "azure" to supportedProviders in main.go and install.go - Add Azure provider initialization with credential file parsing - Update CRD CEL validation to allow min=0 for Azure platform - Add unit tests for Azure provider and extended type switches Co-Authored-By: Claude Opus 4.6 (1M context) --- api/hypershift/v1beta1/nodepool_types.go | 2 +- cmd/install/install.go | 5 +- .../controllers/nodepool/capi.go | 4 +- .../controllers/nodepool/capi_test.go | 12 +- .../controllers/nodepool/conditions.go | 4 +- .../nodepool/instancetype/azure/provider.go | 163 +++++++ .../instancetype/azure/provider_test.go | 404 ++++++++++++++++++ .../nodepool/nodepool_controller.go | 21 +- .../controllers/nodepool/scale_from_zero.go | 7 +- .../nodepool/scale_from_zero_test.go | 71 +++ hypershift-operator/main.go | 33 +- 11 files changed, 694 insertions(+), 32 deletions(-) create mode 100644 hypershift-operator/controllers/nodepool/instancetype/azure/provider.go create mode 100644 hypershift-operator/controllers/nodepool/instancetype/azure/provider_test.go diff --git a/api/hypershift/v1beta1/nodepool_types.go b/api/hypershift/v1beta1/nodepool_types.go index 
dc17f42ee9e..7aeeef9988a 100644 --- a/api/hypershift/v1beta1/nodepool_types.go +++ b/api/hypershift/v1beta1/nodepool_types.go @@ -106,7 +106,7 @@ type NodePool struct { // +kubebuilder:validation:XValidation:rule="!has(self.replicas) || !has(self.autoScaling)", message="Both replicas or autoScaling should not be set" // +kubebuilder:validation:XValidation:rule="self.arch != 's390x' || has(self.platform.kubevirt)", message="s390x is only supported on KubeVirt platform" // +kubebuilder:validation:XValidation:rule="!has(self.platform.aws) || !has(self.platform.aws.imageType) || self.platform.aws.imageType != 'Windows' || self.arch == 'amd64'", message="ImageType 'Windows' requires arch 'amd64' (AWS only)" -// +kubebuilder:validation:XValidation:rule="!has(self.autoScaling) || self.autoScaling.min > 0 || self.platform.type == 'AWS'", message="Scale-from-zero (autoScaling.min=0) is currently only supported for AWS platform" +// +kubebuilder:validation:XValidation:rule="!has(self.autoScaling) || self.autoScaling.min > 0 || self.platform.type == 'AWS' || self.platform.type == 'Azure'", message="Scale-from-zero (autoScaling.min=0) is currently only supported for AWS and Azure platforms" type NodePoolSpec struct { // clusterName is the name of the HostedCluster this NodePool belongs to. // If a HostedCluster with this name doesn't exist, the controller will no-op until it exists. 
diff --git a/cmd/install/install.go b/cmd/install/install.go index c13fd743ec1..5f7a6560cac 100644 --- a/cmd/install/install.go +++ b/cmd/install/install.go @@ -291,18 +291,15 @@ func (o *Options) validateScaleFromZeroConfig() []error { return nil } var errs []error - supportedProviders := set.New("aws") - // Check mutual exclusivity - only one of file or secret should be provided + supportedProviders := set.New("aws", "azure") if len(o.ScaleFromZeroCreds) != 0 && len(o.ScaleFromZeroCredentialsSecret) != 0 { errs = append(errs, fmt.Errorf("only one of --scale-from-zero-creds or --scale-from-zero-secret is supported")) } - // Provider is required when using scale-from-zero credentials if len(o.ScaleFromZeroProvider) == 0 { errs = append(errs, fmt.Errorf("--scale-from-zero-provider is required when using scale-from-zero credentials")) } else if !supportedProviders.Has(o.ScaleFromZeroProvider) { errs = append(errs, fmt.Errorf("invalid --scale-from-zero-provider: %s (must be one of: %v)", o.ScaleFromZeroProvider, supportedProviders.UnsortedList())) } - // Validate credentials file exists and is accessible if provided if len(o.ScaleFromZeroCreds) > 0 { if _, err := os.Stat(o.ScaleFromZeroCreds); err != nil { if os.IsNotExist(err) { diff --git a/hypershift-operator/controllers/nodepool/capi.go b/hypershift-operator/controllers/nodepool/capi.go index 248c7fc468b..068e11dd2a6 100644 --- a/hypershift-operator/controllers/nodepool/capi.go +++ b/hypershift-operator/controllers/nodepool/capi.go @@ -773,7 +773,7 @@ func setMachineDeploymentReplicas(nodePool *hyperv1.NodePool, machineDeployment // NodePools from being permanently stuck at 0 replicas on platforms that don't support // scale-from-zero metadata. 
effectiveMin := ptr.Deref(nodePool.Spec.AutoScaling.Min, 0) - if effectiveMin == 0 && nodePool.Spec.Platform.Type != hyperv1.AWSPlatform { + if effectiveMin == 0 && nodePool.Spec.Platform.Type != hyperv1.AWSPlatform && nodePool.Spec.Platform.Type != hyperv1.AzurePlatform { effectiveMin = 1 } @@ -1081,7 +1081,7 @@ func setMachineSetReplicas(nodePool *hyperv1.NodePool, machineSet *capiv1.Machin // NodePools from being permanently stuck at 0 replicas on platforms that don't support // scale-from-zero metadata. effectiveMin := ptr.Deref(nodePool.Spec.AutoScaling.Min, 0) - if effectiveMin == 0 && nodePool.Spec.Platform.Type != hyperv1.AWSPlatform { + if effectiveMin == 0 && nodePool.Spec.Platform.Type != hyperv1.AWSPlatform && nodePool.Spec.Platform.Type != hyperv1.AzurePlatform { effectiveMin = 1 } diff --git a/hypershift-operator/controllers/nodepool/capi_test.go b/hypershift-operator/controllers/nodepool/capi_test.go index 1fe35dc09f6..dde546a1ead 100644 --- a/hypershift-operator/controllers/nodepool/capi_test.go +++ b/hypershift-operator/controllers/nodepool/capi_test.go @@ -178,7 +178,7 @@ func TestSetMachineSetReplicas(t *testing.T) { }, }, { - name: "it enforces min=1 for Azure platform even when NodePool specifies min=0", + name: "it allows min=0 for Azure platform (scale-from-zero)", nodePool: &hyperv1.NodePool{ ObjectMeta: metav1.ObjectMeta{}, Spec: hyperv1.NodePoolSpec{ @@ -199,9 +199,9 @@ func TestSetMachineSetReplicas(t *testing.T) { Replicas: nil, }, }, - expectReplicas: 1, + expectReplicas: 0, expectAutoscalerAnnotations: map[string]string{ - autoscalerMinAnnotation: "1", + autoscalerMinAnnotation: "0", autoscalerMaxAnnotation: "5", }, }, @@ -504,7 +504,7 @@ func TestSetMachineDeploymentReplicas(t *testing.T) { }, }, { - name: "it enforces min=1 for Azure platform even when NodePool specifies min=0", + name: "it allows min=0 for Azure platform (scale-from-zero)", nodePool: &hyperv1.NodePool{ ObjectMeta: metav1.ObjectMeta{}, Spec: hyperv1.NodePoolSpec{ @@ 
-525,9 +525,9 @@ func TestSetMachineDeploymentReplicas(t *testing.T) { Replicas: nil, }, }, - expectReplicas: 1, + expectReplicas: 0, expectAutoscalerAnnotations: map[string]string{ - autoscalerMinAnnotation: "1", + autoscalerMinAnnotation: "0", autoscalerMaxAnnotation: "5", }, }, diff --git a/hypershift-operator/controllers/nodepool/conditions.go b/hypershift-operator/controllers/nodepool/conditions.go index cff009e3590..7b5d710620c 100644 --- a/hypershift-operator/controllers/nodepool/conditions.go +++ b/hypershift-operator/controllers/nodepool/conditions.go @@ -181,9 +181,7 @@ func (r *NodePoolReconciler) autoscalerEnabledCondition(_ context.Context, nodeP // Check platform-specific support var supported bool switch nodePool.Spec.Platform.Type { - case hyperv1.AWSPlatform: - // AWS supports scale-from-zero either natively (when CPO supports it) - // or via MachineDeployment controller workaround annotations + case hyperv1.AWSPlatform, hyperv1.AzurePlatform: supported = true default: // Other platforms don't support autoscaling from zero yet diff --git a/hypershift-operator/controllers/nodepool/instancetype/azure/provider.go b/hypershift-operator/controllers/nodepool/instancetype/azure/provider.go new file mode 100644 index 00000000000..a5a6217a759 --- /dev/null +++ b/hypershift-operator/controllers/nodepool/instancetype/azure/provider.go @@ -0,0 +1,163 @@ +package azure + +import ( + "context" + "fmt" + "math" + "strconv" + "strings" + "sync" + + hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" + "github.com/openshift/hypershift/hypershift-operator/controllers/nodepool/instancetype" + + azruntime "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5" +) + +// ResourceSKUsAPI defines the operations used from armcompute.ResourceSKUsClient. 
+type ResourceSKUsAPI interface { + NewListPager(options *armcompute.ResourceSKUsClientListOptions) *azruntime.Pager[armcompute.ResourceSKUsClientListResponse] +} + +// Compile-time check that the real client satisfies our interface. +var _ ResourceSKUsAPI = (*armcompute.ResourceSKUsClient)(nil) + +// Provider implements the instancetype.Provider interface for Azure. +// It queries the Azure Resource SKUs API to get VM size specifications. +type Provider struct { + skuClient ResourceSKUsAPI + location string + cache map[string]*instancetype.InstanceTypeInfo + mu sync.Mutex +} + +// NewProvider creates a new Azure instance type provider. +func NewProvider(skuClient ResourceSKUsAPI, location string) *Provider { + return &Provider{ + skuClient: skuClient, + location: location, + } +} + +// GetInstanceTypeInfo queries Azure Resource SKUs API for VM size specifications. +func (p *Provider) GetInstanceTypeInfo(ctx context.Context, instanceType string) (*instancetype.InstanceTypeInfo, error) { + p.mu.Lock() + defer p.mu.Unlock() + + if p.cache == nil { + if err := p.loadSKUs(ctx); err != nil { + return nil, fmt.Errorf("failed to load Azure Resource SKUs: %w", err) + } + } + + info, ok := p.cache[instanceType] + if !ok { + return nil, fmt.Errorf("VM size %q not found in Azure Resource SKUs for location %q", instanceType, p.location) + } + + copied := *info + return &copied, nil +} + +func (p *Provider) loadSKUs(ctx context.Context) error { + p.cache = make(map[string]*instancetype.InstanceTypeInfo) + + filter := fmt.Sprintf("location eq '%s'", p.location) + pager := p.skuClient.NewListPager(&armcompute.ResourceSKUsClientListOptions{ + Filter: &filter, + }) + + for pager.More() { + page, err := pager.NextPage(ctx) + if err != nil { + return fmt.Errorf("failed to list Azure Resource SKUs: %w", err) + } + + for _, sku := range page.Value { + if sku.ResourceType == nil || !strings.EqualFold(*sku.ResourceType, "virtualMachines") { + continue + } + + info, err := 
transformSKU(sku) + if err != nil { + continue + } + p.cache[info.InstanceType] = info + } + } + + return nil +} + +func transformSKU(sku *armcompute.ResourceSKU) (*instancetype.InstanceTypeInfo, error) { + if sku.Name == nil || *sku.Name == "" { + return nil, fmt.Errorf("SKU name is missing") + } + + name := *sku.Name + info := &instancetype.InstanceTypeInfo{ + InstanceType: name, + } + + vcpuStr, ok := getCapabilityValue(sku.Capabilities, "vCPUs") + if !ok { + return nil, fmt.Errorf("missing vCPUs capability for VM size %q", name) + } + vcpu, err := strconv.ParseInt(vcpuStr, 10, 32) + if err != nil { + return nil, fmt.Errorf("invalid vCPUs value %q for VM size %q: %w", vcpuStr, name, err) + } + if vcpu <= 0 { + return nil, fmt.Errorf("invalid vCPUs count %d for VM size %q", vcpu, name) + } + info.VCPU = int32(vcpu) + + memStr, ok := getCapabilityValue(sku.Capabilities, "MemoryGB") + if !ok { + return nil, fmt.Errorf("missing MemoryGB capability for VM size %q", name) + } + memGB, err := strconv.ParseFloat(memStr, 64) + if err != nil { + return nil, fmt.Errorf("invalid MemoryGB value %q for VM size %q: %w", memStr, name, err) + } + if memGB <= 0 { + return nil, fmt.Errorf("invalid MemoryGB value %v for VM size %q", memGB, name) + } + info.MemoryMb = int64(math.Round(memGB * 1024)) + + gpuStr, ok := getCapabilityValue(sku.Capabilities, "GPUs") + if ok { + gpu, err := strconv.ParseInt(gpuStr, 10, 32) + if err == nil { + info.GPU = int32(gpu) + } + } + + archStr, ok := getCapabilityValue(sku.Capabilities, "CpuArchitectureType") + if !ok { + return nil, fmt.Errorf("missing CpuArchitectureType capability for VM size %q", name) + } + switch strings.ToLower(archStr) { + case "x64": + info.CPUArchitecture = hyperv1.ArchitectureAMD64 + case "arm64": + info.CPUArchitecture = hyperv1.ArchitectureARM64 + default: + return nil, fmt.Errorf("unsupported CPU architecture %q for VM size %q", archStr, name) + } + + return info, nil +} + +func getCapabilityValue(capabilities 
[]*armcompute.ResourceSKUCapabilities, name string) (string, bool) { + for _, cap := range capabilities { + if cap.Name != nil && *cap.Name == name { + if cap.Value != nil { + return *cap.Value, true + } + return "", false + } + } + return "", false +} diff --git a/hypershift-operator/controllers/nodepool/instancetype/azure/provider_test.go b/hypershift-operator/controllers/nodepool/instancetype/azure/provider_test.go new file mode 100644 index 00000000000..9e7f87ef65c --- /dev/null +++ b/hypershift-operator/controllers/nodepool/instancetype/azure/provider_test.go @@ -0,0 +1,404 @@ +package azure + +import ( + "context" + "fmt" + "testing" + + . "github.com/onsi/gomega" + + hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" + "github.com/openshift/hypershift/hypershift-operator/controllers/nodepool/instancetype" + + azruntime "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5" +) + +type mockResourceSKUsAPI struct { + skus []*armcompute.ResourceSKU + err error +} + +func (m *mockResourceSKUsAPI) NewListPager(_ *armcompute.ResourceSKUsClientListOptions) *azruntime.Pager[armcompute.ResourceSKUsClientListResponse] { + return azruntime.NewPager(azruntime.PagingHandler[armcompute.ResourceSKUsClientListResponse]{ + More: func(page armcompute.ResourceSKUsClientListResponse) bool { + return false + }, + Fetcher: func(ctx context.Context, page *armcompute.ResourceSKUsClientListResponse) (armcompute.ResourceSKUsClientListResponse, error) { + if m.err != nil { + return armcompute.ResourceSKUsClientListResponse{}, m.err + } + return armcompute.ResourceSKUsClientListResponse{ + ResourceSKUsResult: armcompute.ResourceSKUsResult{ + Value: m.skus, + }, + }, nil + }, + }) +} + +func makeSKU(name, resourceType string, capabilities map[string]string) *armcompute.ResourceSKU { + sku := &armcompute.ResourceSKU{ + Name: to.Ptr(name), + 
ResourceType: to.Ptr(resourceType), + } + for k, v := range capabilities { + sku.Capabilities = append(sku.Capabilities, &armcompute.ResourceSKUCapabilities{ + Name: to.Ptr(k), + Value: to.Ptr(v), + }) + } + return sku +} + +func TestTransformSKU_WhenValidInput_ItShouldTransformCorrectly(t *testing.T) { + tests := []struct { + name string + input *armcompute.ResourceSKU + expected *instancetype.InstanceTypeInfo + }{ + { + name: "When Standard_D4s_v3 with x64 arch it should transform correctly", + input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "4", + "MemoryGB": "16", + "CpuArchitectureType": "x64", + }), + expected: &instancetype.InstanceTypeInfo{ + InstanceType: "Standard_D4s_v3", + VCPU: 4, + MemoryMb: 16384, + GPU: 0, + CPUArchitecture: hyperv1.ArchitectureAMD64, + }, + }, + { + name: "When GPU VM it should set GPU count", + input: makeSKU("Standard_NC16as_T4_v3", "virtualMachines", map[string]string{ + "vCPUs": "16", + "MemoryGB": "110", + "GPUs": "1", + "CpuArchitectureType": "x64", + }), + expected: &instancetype.InstanceTypeInfo{ + InstanceType: "Standard_NC16as_T4_v3", + VCPU: 16, + MemoryMb: 112640, + GPU: 1, + CPUArchitecture: hyperv1.ArchitectureAMD64, + }, + }, + { + name: "When Arm64 VM it should set correct architecture", + input: makeSKU("Standard_D4ps_v5", "virtualMachines", map[string]string{ + "vCPUs": "4", + "MemoryGB": "16", + "CpuArchitectureType": "Arm64", + }), + expected: &instancetype.InstanceTypeInfo{ + InstanceType: "Standard_D4ps_v5", + VCPU: 4, + MemoryMb: 16384, + GPU: 0, + CPUArchitecture: hyperv1.ArchitectureARM64, + }, + }, + { + name: "When GPUs capability is absent it should default to 0", + input: makeSKU("Standard_B2s", "virtualMachines", map[string]string{ + "vCPUs": "2", + "MemoryGB": "4", + "CpuArchitectureType": "x64", + }), + expected: &instancetype.InstanceTypeInfo{ + InstanceType: "Standard_B2s", + VCPU: 2, + MemoryMb: 4096, + GPU: 0, + CPUArchitecture: hyperv1.ArchitectureAMD64, + }, + 
}, + { + name: "When MemoryGB is fractional it should convert correctly", + input: makeSKU("Standard_B1ls", "virtualMachines", map[string]string{ + "vCPUs": "1", + "MemoryGB": "0.5", + "CpuArchitectureType": "x64", + }), + expected: &instancetype.InstanceTypeInfo{ + InstanceType: "Standard_B1ls", + VCPU: 1, + MemoryMb: 512, + GPU: 0, + CPUArchitecture: hyperv1.ArchitectureAMD64, + }, + }, + { + name: "When MemoryGB is large it should convert correctly", + input: makeSKU("Standard_M416ms_v2", "virtualMachines", map[string]string{ + "vCPUs": "416", + "MemoryGB": "11400", + "CpuArchitectureType": "x64", + }), + expected: &instancetype.InstanceTypeInfo{ + InstanceType: "Standard_M416ms_v2", + VCPU: 416, + MemoryMb: 11673600, + GPU: 0, + CPUArchitecture: hyperv1.ArchitectureAMD64, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewGomegaWithT(t) + result, err := transformSKU(tt.input) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(result).To(Equal(tt.expected)) + }) + } +} + +func TestTransformSKU_WhenMissingRequiredFields_ItShouldReturnError(t *testing.T) { + tests := []struct { + name string + input *armcompute.ResourceSKU + expectedError string + }{ + { + name: "When SKU name is nil it should return error", + input: &armcompute.ResourceSKU{ + Name: nil, + ResourceType: to.Ptr("virtualMachines"), + Capabilities: []*armcompute.ResourceSKUCapabilities{ + {Name: to.Ptr("vCPUs"), Value: to.Ptr("4")}, + }, + }, + expectedError: "SKU name is missing", + }, + { + name: "When vCPUs capability is missing it should return error", + input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "MemoryGB": "16", + "CpuArchitectureType": "x64", + }), + expectedError: "missing vCPUs capability", + }, + { + name: "When MemoryGB capability is missing it should return error", + input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "4", + "CpuArchitectureType": "x64", + }), + expectedError: "missing 
MemoryGB capability", + }, + { + name: "When CpuArchitectureType capability is missing it should return error", + input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "4", + "MemoryGB": "16", + }), + expectedError: "missing CpuArchitectureType capability", + }, + { + name: "When vCPUs value is not a valid integer it should return error", + input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "abc", + "MemoryGB": "16", + "CpuArchitectureType": "x64", + }), + expectedError: "invalid vCPUs value", + }, + { + name: "When MemoryGB value is not a valid float it should return error", + input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "4", + "MemoryGB": "xyz", + "CpuArchitectureType": "x64", + }), + expectedError: "invalid MemoryGB value", + }, + { + name: "When vCPUs value is zero it should return error", + input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "0", + "MemoryGB": "16", + "CpuArchitectureType": "x64", + }), + expectedError: "invalid vCPUs count", + }, + { + name: "When MemoryGB value is zero it should return error", + input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "4", + "MemoryGB": "0", + "CpuArchitectureType": "x64", + }), + expectedError: "invalid MemoryGB value", + }, + { + name: "When CpuArchitectureType is unsupported it should return error", + input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "4", + "MemoryGB": "16", + "CpuArchitectureType": "i386", + }), + expectedError: "unsupported CPU architecture", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewGomegaWithT(t) + _, err := transformSKU(tt.input) + g.Expect(err).To(HaveOccurred()) + g.Expect(err.Error()).To(ContainSubstring(tt.expectedError)) + }) + } +} + +func TestGetInstanceTypeInfo(t *testing.T) { + tests := []struct { + name string + skus []*armcompute.ResourceSKU 
+ apiErr error + instanceType string + expected *instancetype.InstanceTypeInfo + expectedError string + }{ + { + name: "When VM size exists it should return info", + skus: []*armcompute.ResourceSKU{ + makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "4", "MemoryGB": "16", "CpuArchitectureType": "x64", + }), + }, + instanceType: "Standard_D4s_v3", + expected: &instancetype.InstanceTypeInfo{ + InstanceType: "Standard_D4s_v3", + VCPU: 4, + MemoryMb: 16384, + GPU: 0, + CPUArchitecture: hyperv1.ArchitectureAMD64, + }, + }, + { + name: "When VM size not found it should return error", + skus: []*armcompute.ResourceSKU{ + makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "4", "MemoryGB": "16", "CpuArchitectureType": "x64", + }), + }, + instanceType: "Standard_Nonexistent", + expectedError: "not found", + }, + { + name: "When API returns error it should propagate error", + apiErr: fmt.Errorf("API error: throttling"), + instanceType: "Standard_D4s_v3", + expectedError: "failed to load Azure Resource SKUs", + }, + { + name: "When SKU has matching name but wrong ResourceType it should return not found", + skus: []*armcompute.ResourceSKU{ + makeSKU("Standard_D4s_v3", "disks", map[string]string{ + "vCPUs": "4", "MemoryGB": "16", "CpuArchitectureType": "x64", + }), + }, + instanceType: "Standard_D4s_v3", + expectedError: "not found", + }, + { + name: "When multiple SKUs returned it should match only virtualMachines type", + skus: []*armcompute.ResourceSKU{ + makeSKU("Standard_D4s_v3", "disks", map[string]string{ + "vCPUs": "99", "MemoryGB": "99", "CpuArchitectureType": "x64", + }), + makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ + "vCPUs": "4", "MemoryGB": "16", "CpuArchitectureType": "x64", + }), + }, + instanceType: "Standard_D4s_v3", + expected: &instancetype.InstanceTypeInfo{ + InstanceType: "Standard_D4s_v3", + VCPU: 4, + MemoryMb: 16384, + GPU: 0, + CPUArchitecture: hyperv1.ArchitectureAMD64, + }, + }, 
+ } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewGomegaWithT(t) + mock := &mockResourceSKUsAPI{skus: tt.skus, err: tt.apiErr} + provider := NewProvider(mock, "eastus") + result, err := provider.GetInstanceTypeInfo(context.Background(), tt.instanceType) + + if tt.expectedError != "" { + g.Expect(err).To(HaveOccurred()) + g.Expect(err.Error()).To(ContainSubstring(tt.expectedError)) + } else { + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(result).To(Equal(tt.expected)) + } + }) + } +} + +func TestGetCapabilityValue(t *testing.T) { + tests := []struct { + name string + capabilities []*armcompute.ResourceSKUCapabilities + capName string + expectedVal string + expectedOK bool + }{ + { + name: "When capability exists it should return the value", + capabilities: []*armcompute.ResourceSKUCapabilities{ + {Name: to.Ptr("vCPUs"), Value: to.Ptr("4")}, + }, + capName: "vCPUs", + expectedVal: "4", + expectedOK: true, + }, + { + name: "When capability does not exist it should return empty and false", + capabilities: []*armcompute.ResourceSKUCapabilities{ + {Name: to.Ptr("vCPUs"), Value: to.Ptr("4")}, + }, + capName: "GPUs", + expectedVal: "", + expectedOK: false, + }, + { + name: "When capabilities slice is nil it should return empty and false", + capabilities: nil, + capName: "vCPUs", + expectedVal: "", + expectedOK: false, + }, + { + name: "When capability name has different case it should not match", + capabilities: []*armcompute.ResourceSKUCapabilities{ + {Name: to.Ptr("vcpus"), Value: to.Ptr("4")}, + }, + capName: "vCPUs", + expectedVal: "", + expectedOK: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewGomegaWithT(t) + val, ok := getCapabilityValue(tt.capabilities, tt.capName) + g.Expect(ok).To(Equal(tt.expectedOK)) + g.Expect(val).To(Equal(tt.expectedVal)) + }) + } +} diff --git a/hypershift-operator/controllers/nodepool/nodepool_controller.go 
b/hypershift-operator/controllers/nodepool/nodepool_controller.go index 8a550972e4a..22628a3c496 100644 --- a/hypershift-operator/controllers/nodepool/nodepool_controller.go +++ b/hypershift-operator/controllers/nodepool/nodepool_controller.go @@ -438,7 +438,7 @@ func (r *NodePoolReconciler) reconcile(ctx context.Context, hcluster *hyperv1.Ho // supportedScaleFromZeroPlatform checks if the platform supports scale-from-zero functionality. func supportedScaleFromZeroPlatform(platform hyperv1.PlatformType) bool { - return platform == hyperv1.AWSPlatform + return platform == hyperv1.AWSPlatform || platform == hyperv1.AzurePlatform } func (r *NodePoolReconciler) token(ctx context.Context, hcluster *hyperv1.HostedCluster, nodePool *hyperv1.NodePool) (*Token, error) { @@ -1260,16 +1260,15 @@ func (r *NodePoolReconciler) reconcileScaleFromZeroAnnotations(ctx context.Conte } machineTemplate = awsMachineTemplate - // Future platform support can be added here: - // case hyperv1.AzurePlatform: - // azureTemplate := &capiazure.AzureMachineTemplate{} - // if err := capi.getExistingMachineTemplate(ctx, azureTemplate); err != nil { - // if apierrors.IsNotFound(err) { - // return nil - // } - // return fmt.Errorf("failed to get AzureMachineTemplate: %w", err) - // } - // machineTemplate = azureTemplate + case hyperv1.AzurePlatform: + azureTemplate := &capiazure.AzureMachineTemplate{} + if err := capi.getExistingMachineTemplate(ctx, azureTemplate); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("failed to get AzureMachineTemplate: %w", err) + } + machineTemplate = azureTemplate default: return fmt.Errorf("unsupported platform for scale-from-zero: %s", nodePool.Spec.Platform.Type) diff --git a/hypershift-operator/controllers/nodepool/scale_from_zero.go b/hypershift-operator/controllers/nodepool/scale_from_zero.go index 5d24335bc19..b67c2a4ca65 100644 --- a/hypershift-operator/controllers/nodepool/scale_from_zero.go +++ 
b/hypershift-operator/controllers/nodepool/scale_from_zero.go @@ -27,6 +27,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2" + capiazure "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -80,10 +81,8 @@ func setScaleFromZeroAnnotationsOnObject(ctx context.Context, provider instancet case *infrav1.AWSMachineTemplate: instanceType = template.Spec.Template.Spec.InstanceType statusCapacity = template.Status.Capacity - // Future platform support can be added here: - // case *capiazure.AzureMachineTemplate: - // instanceType = template.Spec.Template.Spec.VMSize - // statusCapacity = template.Status.Capacity + case *capiazure.AzureMachineTemplate: + instanceType = template.Spec.Template.Spec.VMSize default: return fmt.Errorf("unsupported machine template type: %T", machineTemplate) } diff --git a/hypershift-operator/controllers/nodepool/scale_from_zero_test.go b/hypershift-operator/controllers/nodepool/scale_from_zero_test.go index bec100f788b..6b039040a9f 100644 --- a/hypershift-operator/controllers/nodepool/scale_from_zero_test.go +++ b/hypershift-operator/controllers/nodepool/scale_from_zero_test.go @@ -15,6 +15,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2" + capiazure "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1" capiv1 "sigs.k8s.io/cluster-api/api/core/v1beta1" ) @@ -90,6 +91,16 @@ func TestSetScaleFromZeroAnnotationsOnObject(t *testing.T) { } } + newAzureTemplate := func(vmSize string) *capiazure.AzureMachineTemplate { + return &capiazure.AzureMachineTemplate{ + Spec: capiazure.AzureMachineTemplateSpec{ + Template: capiazure.AzureMachineTemplateResource{ + Spec: capiazure.AzureMachineSpec{VMSize: vmSize}, + }, + }, + } + } + tests := []struct { name string provider instancetype.Provider @@ -200,6 +211,66 @@ func TestSetScaleFromZeroAnnotationsOnObject(t *testing.T) { 
g.Expect(a).ToNot(HaveKey(taintsKey)) }, }, + { + name: "When Azure template with valid VMSize and no GPU it should set basic annotations", + provider: &mockProvider{info: &instancetype.InstanceTypeInfo{ + VCPU: 4, MemoryMb: 16384, GPU: 0, CPUArchitecture: "amd64", + }}, + nodePool: &hyperv1.NodePool{}, + object: &capiv1.MachineDeployment{}, + machineTemplate: newAzureTemplate("Standard_D4s_v3"), + expectErr: false, + validate: func(g Gomega, md *capiv1.MachineDeployment) { + a := md.GetAnnotations() + g.Expect(a).To(HaveKeyWithValue(cpuKey, "4")) + g.Expect(a).To(HaveKeyWithValue(memoryKey, "16384")) + g.Expect(a).To(HaveKeyWithValue(labelsKey, "kubernetes.io/arch=amd64")) + g.Expect(a).ToNot(HaveKey(gpuKey)) + }, + }, + { + name: "When Azure template with empty VMSize it should return error", + provider: &mockProvider{}, + nodePool: &hyperv1.NodePool{}, + object: &capiv1.MachineDeployment{}, + machineTemplate: newAzureTemplate(""), + expectErr: true, + errSubstring: "instanceType is empty", + }, + { + name: "When Azure template with nil provider it should skip annotations", + provider: nil, + nodePool: &hyperv1.NodePool{}, + object: &capiv1.MachineDeployment{}, + machineTemplate: newAzureTemplate("Standard_D4s_v3"), + expectErr: false, + validate: func(g Gomega, md *capiv1.MachineDeployment) { + g.Expect(md.GetAnnotations()).ToNot(HaveKey(cpuKey)) + }, + }, + { + name: "When Azure template with GPU and taints it should set all annotations", + provider: &mockProvider{info: &instancetype.InstanceTypeInfo{ + VCPU: 6, MemoryMb: 114688, GPU: 1, CPUArchitecture: "amd64", + }}, + nodePool: &hyperv1.NodePool{ + Spec: hyperv1.NodePoolSpec{ + Taints: []hyperv1.Taint{ + {Key: "dedicated", Value: "gpu", Effect: corev1.TaintEffectNoSchedule}, + }, + }, + }, + object: &capiv1.MachineDeployment{}, + machineTemplate: newAzureTemplate("Standard_NC6s_v3"), + expectErr: false, + validate: func(g Gomega, md *capiv1.MachineDeployment) { + a := md.GetAnnotations() + 
g.Expect(a).To(HaveKeyWithValue(cpuKey, "6")) + g.Expect(a).To(HaveKeyWithValue(memoryKey, "114688")) + g.Expect(a).To(HaveKeyWithValue(gpuKey, "1")) + g.Expect(a).To(HaveKeyWithValue(taintsKey, "dedicated=gpu:NoSchedule")) + }, + }, { name: "When instance has GPU, labels with arch override, taints, and existing annotations it should set all correctly", provider: &mockProvider{info: &instancetype.InstanceTypeInfo{ diff --git a/hypershift-operator/main.go b/hypershift-operator/main.go index a663242bf0f..98e05e86610 100644 --- a/hypershift-operator/main.go +++ b/hypershift-operator/main.go @@ -18,6 +18,7 @@ import ( "context" "crypto/tls" "fmt" + "encoding/json" "os" "strings" "time" @@ -35,6 +36,7 @@ import ( "github.com/openshift/hypershift/hypershift-operator/controllers/nodepool" "github.com/openshift/hypershift/hypershift-operator/controllers/nodepool/instancetype" awsinstancetype "github.com/openshift/hypershift/hypershift-operator/controllers/nodepool/instancetype/aws" + azureinstancetype "github.com/openshift/hypershift/hypershift-operator/controllers/nodepool/instancetype/azure" npmetrics "github.com/openshift/hypershift/hypershift-operator/controllers/nodepool/metrics" "github.com/openshift/hypershift/hypershift-operator/controllers/platform/aws" azureplatform "github.com/openshift/hypershift/hypershift-operator/controllers/platform/azure" @@ -71,6 +73,7 @@ import ( "github.com/Azure/azure-sdk-for-go/sdk/azcore" "github.com/Azure/azure-sdk-for-go/sdk/azcore/cloud" "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v5" admissionregistrationv1 "k8s.io/api/admissionregistration/v1" @@ -351,7 +354,7 @@ func validateStartOptions(opts *StartOptions, log logr.Logger) error { return fmt.Errorf("--etcd-backup-max-count must be at least 1, got %d", opts.EtcdBackupMaxCount) } - supportedProviders := set.New("aws") + 
supportedProviders := set.New("aws", "azure") if opts.ScaleFromZeroCreds != "" { if opts.ScaleFromZeroProvider == "" { return fmt.Errorf("--scale-from-zero-provider is required when using --scale-from-zero-creds") @@ -606,6 +609,34 @@ func setupNodePoolController(ctx context.Context, mgr ctrl.Manager, opts *StartO }) instanceTypeProvider = awsinstancetype.NewProvider(scaleFromZeroEC2Client) log.Info("Instance type provider initialized", "provider", opts.ScaleFromZeroProvider) + case "azure": + raw, err := os.ReadFile(opts.ScaleFromZeroCreds) + if err != nil { + return fmt.Errorf("failed to read Azure scale-from-zero credentials: %w", err) + } + var azureCreds struct { + SubscriptionID string `json:"subscriptionId"` + ClientID string `json:"clientId"` + ClientSecret string `json:"clientSecret"` + TenantID string `json:"tenantId"` + Location string `json:"location"` + } + if err := json.Unmarshal(raw, &azureCreds); err != nil { + return fmt.Errorf("failed to parse Azure scale-from-zero credentials: %w", err) + } + if azureCreds.Location == "" { + return fmt.Errorf("Azure scale-from-zero credentials must include 'location'") + } + cred, err := azidentity.NewClientSecretCredential(azureCreds.TenantID, azureCreds.ClientID, azureCreds.ClientSecret, nil) + if err != nil { + return fmt.Errorf("failed to create Azure credentials for scale-from-zero: %w", err) + } + skuClient, err := armcompute.NewResourceSKUsClient(azureCreds.SubscriptionID, cred, nil) + if err != nil { + return fmt.Errorf("failed to create Azure ResourceSKUs client: %w", err) + } + instanceTypeProvider = azureinstancetype.NewProvider(skuClient, azureCreds.Location) + log.Info("Instance type provider initialized", "provider", opts.ScaleFromZeroProvider, "location", azureCreds.Location) default: log.Info("WARNING: Unsupported scale-from-zero provider", "provider", opts.ScaleFromZeroProvider) } From 77219083b9b3286fca4087230eb054d72f20dd55 Mon Sep 17 00:00:00 2001 From: Jesse Jaggars Date: Wed, 6 May 2026 
19:31:37 +0000 Subject: [PATCH 2/7] fix: address review feedback for Azure scale-from-zero - Update NodePoolAutoScaling.Min field comment and CRD validation rule to reflect Azure support alongside AWS - Regenerate CRD manifests with updated docs and validation - Fix partial SKU cache on Azure pager failure: build into local map and assign to cache only after full walk succeeds - Tighten platform gate: add ScaleFromZeroPlatform field so annotations are only set when nodepool platform matches the configured provider - Validate all required Azure credential fields (subscriptionId, clientId, clientSecret, tenantId, location) upfront with a clear error listing missing fields Co-Authored-By: Claude Opus 4.6 (1M context) --- api/hypershift/v1beta1/nodepool_types.go | 2 +- .../AAA_ungated.yaml | 6 ++--- .../GCPPlatform.yaml | 6 ++--- .../OpenStack.yaml | 6 ++--- .../nodepools-CustomNoUpgrade.crd.yaml | 6 ++--- .../nodepools-Default.crd.yaml | 6 ++--- .../nodepools-TechPreviewNoUpgrade.crd.yaml | 6 ++--- .../nodepool/instancetype/azure/provider.go | 5 ++-- .../nodepool/nodepool_controller.go | 8 ++----- hypershift-operator/main.go | 23 ++++++++++++++++++- 10 files changed, 46 insertions(+), 28 deletions(-) diff --git a/api/hypershift/v1beta1/nodepool_types.go b/api/hypershift/v1beta1/nodepool_types.go index 7aeeef9988a..5ebc5378064 100644 --- a/api/hypershift/v1beta1/nodepool_types.go +++ b/api/hypershift/v1beta1/nodepool_types.go @@ -501,7 +501,7 @@ type NodePoolManagement struct { // +kubebuilder:validation:XValidation:rule="self.max >= self.min", message="max must be equal or greater than min" type NodePoolAutoScaling struct { // min is the minimum number of nodes to maintain in the pool. - // Can be set to 0 for scale-from-zero for AWS platform. + // Can be set to 0 for scale-from-zero for AWS and Azure platforms. // Must be >= 0 and <= .Max. 
// // +kubebuilder:validation:Minimum=0 diff --git a/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/AAA_ungated.yaml b/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/AAA_ungated.yaml index 9463ef0840c..28b0b76bc4a 100644 --- a/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/AAA_ungated.yaml +++ b/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/AAA_ungated.yaml @@ -108,7 +108,7 @@ spec: min: description: |- min is the minimum number of nodes to maintain in the pool. - Can be set to 0 for scale-from-zero for AWS platform. + Can be set to 0 for scale-from-zero for AWS and Azure platforms. Must be >= 0 and <= .Max. format: int32 minimum: 0 @@ -1518,9 +1518,9 @@ spec: rule: '!has(self.platform.aws) || !has(self.platform.aws.imageType) || self.platform.aws.imageType != ''Windows'' || self.arch == ''amd64''' - message: Scale-from-zero (autoScaling.min=0) is currently only supported - for AWS platform + for AWS and Azure platforms rule: '!has(self.autoScaling) || self.autoScaling.min > 0 || self.platform.type - == ''AWS''' + == ''AWS'' || self.platform.type == ''Azure''' status: description: status is the latest observed status of the NodePool. properties: diff --git a/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/GCPPlatform.yaml b/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/GCPPlatform.yaml index ea37b91c13a..2705760fb21 100644 --- a/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/GCPPlatform.yaml +++ b/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/GCPPlatform.yaml @@ -108,7 +108,7 @@ spec: min: description: |- min is the minimum number of nodes to maintain in the pool. 
- Can be set to 0 for scale-from-zero for AWS platform. + Can be set to 0 for scale-from-zero for AWS and Azure platforms. Must be >= 0 and <= .Max. format: int32 minimum: 0 @@ -1787,9 +1787,9 @@ spec: rule: '!has(self.platform.aws) || !has(self.platform.aws.imageType) || self.platform.aws.imageType != ''Windows'' || self.arch == ''amd64''' - message: Scale-from-zero (autoScaling.min=0) is currently only supported - for AWS platform + for AWS and Azure platforms rule: '!has(self.autoScaling) || self.autoScaling.min > 0 || self.platform.type - == ''AWS''' + == ''AWS'' || self.platform.type == ''Azure''' status: description: status is the latest observed status of the NodePool. properties: diff --git a/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/OpenStack.yaml b/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/OpenStack.yaml index baed37846b3..a7c77a9ab07 100644 --- a/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/OpenStack.yaml +++ b/api/hypershift/v1beta1/zz_generated.featuregated-crd-manifests/nodepools.hypershift.openshift.io/OpenStack.yaml @@ -108,7 +108,7 @@ spec: min: description: |- min is the minimum number of nodes to maintain in the pool. - Can be set to 0 for scale-from-zero for AWS platform. + Can be set to 0 for scale-from-zero for AWS and Azure platforms. Must be >= 0 and <= .Max. 
format: int32 minimum: 0 @@ -1705,9 +1705,9 @@ spec: rule: '!has(self.platform.aws) || !has(self.platform.aws.imageType) || self.platform.aws.imageType != ''Windows'' || self.arch == ''amd64''' - message: Scale-from-zero (autoScaling.min=0) is currently only supported - for AWS platform + for AWS and Azure platforms rule: '!has(self.autoScaling) || self.autoScaling.min > 0 || self.platform.type - == ''AWS''' + == ''AWS'' || self.platform.type == ''Azure''' status: description: status is the latest observed status of the NodePool. properties: diff --git a/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-CustomNoUpgrade.crd.yaml b/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-CustomNoUpgrade.crd.yaml index 055967ac455..4e38e77b912 100644 --- a/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-CustomNoUpgrade.crd.yaml +++ b/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-CustomNoUpgrade.crd.yaml @@ -111,7 +111,7 @@ spec: min: description: |- min is the minimum number of nodes to maintain in the pool. - Can be set to 0 for scale-from-zero for AWS platform. + Can be set to 0 for scale-from-zero for AWS and Azure platforms. Must be >= 0 and <= .Max. format: int32 minimum: 0 @@ -1975,9 +1975,9 @@ spec: rule: '!has(self.platform.aws) || !has(self.platform.aws.imageType) || self.platform.aws.imageType != ''Windows'' || self.arch == ''amd64''' - message: Scale-from-zero (autoScaling.min=0) is currently only supported - for AWS platform + for AWS and Azure platforms rule: '!has(self.autoScaling) || self.autoScaling.min > 0 || self.platform.type - == ''AWS''' + == ''AWS'' || self.platform.type == ''Azure''' status: description: status is the latest observed status of the NodePool. 
properties: diff --git a/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-Default.crd.yaml b/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-Default.crd.yaml index d317df1b41a..0df74130419 100644 --- a/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-Default.crd.yaml +++ b/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-Default.crd.yaml @@ -111,7 +111,7 @@ spec: min: description: |- min is the minimum number of nodes to maintain in the pool. - Can be set to 0 for scale-from-zero for AWS platform. + Can be set to 0 for scale-from-zero for AWS and Azure platforms. Must be >= 0 and <= .Max. format: int32 minimum: 0 @@ -1521,9 +1521,9 @@ spec: rule: '!has(self.platform.aws) || !has(self.platform.aws.imageType) || self.platform.aws.imageType != ''Windows'' || self.arch == ''amd64''' - message: Scale-from-zero (autoScaling.min=0) is currently only supported - for AWS platform + for AWS and Azure platforms rule: '!has(self.autoScaling) || self.autoScaling.min > 0 || self.platform.type - == ''AWS''' + == ''AWS'' || self.platform.type == ''Azure''' status: description: status is the latest observed status of the NodePool. properties: diff --git a/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-TechPreviewNoUpgrade.crd.yaml b/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-TechPreviewNoUpgrade.crd.yaml index f8650900421..4680973aea3 100644 --- a/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-TechPreviewNoUpgrade.crd.yaml +++ b/cmd/install/assets/crds/hypershift-operator/zz_generated.crd-manifests/nodepools-TechPreviewNoUpgrade.crd.yaml @@ -111,7 +111,7 @@ spec: min: description: |- min is the minimum number of nodes to maintain in the pool. - Can be set to 0 for scale-from-zero for AWS platform. 
+ Can be set to 0 for scale-from-zero for AWS and Azure platforms. Must be >= 0 and <= .Max. format: int32 minimum: 0 @@ -1975,9 +1975,9 @@ spec: rule: '!has(self.platform.aws) || !has(self.platform.aws.imageType) || self.platform.aws.imageType != ''Windows'' || self.arch == ''amd64''' - message: Scale-from-zero (autoScaling.min=0) is currently only supported - for AWS platform + for AWS and Azure platforms rule: '!has(self.autoScaling) || self.autoScaling.min > 0 || self.platform.type - == ''AWS''' + == ''AWS'' || self.platform.type == ''Azure''' status: description: status is the latest observed status of the NodePool. properties: diff --git a/hypershift-operator/controllers/nodepool/instancetype/azure/provider.go b/hypershift-operator/controllers/nodepool/instancetype/azure/provider.go index a5a6217a759..1ee6d4ee350 100644 --- a/hypershift-operator/controllers/nodepool/instancetype/azure/provider.go +++ b/hypershift-operator/controllers/nodepool/instancetype/azure/provider.go @@ -61,7 +61,7 @@ func (p *Provider) GetInstanceTypeInfo(ctx context.Context, instanceType string) } func (p *Provider) loadSKUs(ctx context.Context) error { - p.cache = make(map[string]*instancetype.InstanceTypeInfo) + nextCache := make(map[string]*instancetype.InstanceTypeInfo) filter := fmt.Sprintf("location eq '%s'", p.location) pager := p.skuClient.NewListPager(&armcompute.ResourceSKUsClientListOptions{ @@ -83,10 +83,11 @@ func (p *Provider) loadSKUs(ctx context.Context) error { if err != nil { continue } - p.cache[info.InstanceType] = info + nextCache[info.InstanceType] = info } } + p.cache = nextCache return nil } diff --git a/hypershift-operator/controllers/nodepool/nodepool_controller.go b/hypershift-operator/controllers/nodepool/nodepool_controller.go index 22628a3c496..923ef898bf6 100644 --- a/hypershift-operator/controllers/nodepool/nodepool_controller.go +++ b/hypershift-operator/controllers/nodepool/nodepool_controller.go @@ -104,6 +104,7 @@ type NodePoolReconciler struct { 
KubevirtInfraClients kvinfra.KubevirtInfraClientMap EC2Client awsapi.EC2API InstanceTypeProvider instancetype.Provider + ScaleFromZeroPlatform hyperv1.PlatformType } type NotReadyError struct { @@ -426,7 +427,7 @@ func (r *NodePoolReconciler) reconcile(ctx context.Context, hcluster *hyperv1.Ho // Set scale-from-zero annotations if provider is configured and platform is supported // This works for both Replace (MachineDeployment) and InPlace (MachineSet) upgrade types - if isAutoscalingEnabled(nodePool) && r.InstanceTypeProvider != nil && supportedScaleFromZeroPlatform(nodePool.Spec.Platform.Type) { + if isAutoscalingEnabled(nodePool) && r.InstanceTypeProvider != nil && r.ScaleFromZeroPlatform == nodePool.Spec.Platform.Type { if err = r.reconcileScaleFromZeroAnnotations(ctx, nodePool, capi); err != nil { log.Error(err, "Failed to set scale-from-zero annotations, will retry") return ctrl.Result{RequeueAfter: 30 * time.Second}, nil @@ -436,11 +437,6 @@ func (r *NodePoolReconciler) reconcile(ctx context.Context, hcluster *hyperv1.Ho return ctrl.Result{}, nil } -// supportedScaleFromZeroPlatform checks if the platform supports scale-from-zero functionality. -func supportedScaleFromZeroPlatform(platform hyperv1.PlatformType) bool { - return platform == hyperv1.AWSPlatform || platform == hyperv1.AzurePlatform -} - func (r *NodePoolReconciler) token(ctx context.Context, hcluster *hyperv1.HostedCluster, nodePool *hyperv1.NodePool) (*Token, error) { // Validate and get releaseImage. 
releaseImage, err := r.getReleaseImage(ctx, hcluster, nodePool.Status.Version, nodePool.Spec.Release.Image) diff --git a/hypershift-operator/main.go b/hypershift-operator/main.go index 98e05e86610..85740e79bf7 100644 --- a/hypershift-operator/main.go +++ b/hypershift-operator/main.go @@ -599,6 +599,8 @@ func setupEC2Client(ctx context.Context, opts *StartOptions) awsapi.EC2API { func setupNodePoolController(ctx context.Context, mgr ctrl.Manager, opts *StartOptions, operatorImage string, createOrUpdate upsert.CreateOrUpdateProvider, registryProvider globalconfig.CommonRegistryProvider, ec2Client awsapi.EC2API, log logr.Logger) error { var instanceTypeProvider instancetype.Provider + var scaleFromZeroPlatform hyperv1.PlatformType + if opts.ScaleFromZeroCreds != "" && opts.ScaleFromZeroProvider != "" { switch strings.ToLower(opts.ScaleFromZeroProvider) { case "aws": @@ -608,6 +610,7 @@ func setupNodePoolController(ctx context.Context, mgr ctrl.Manager, opts *StartO o.Retryer = awsConfig() }) instanceTypeProvider = awsinstancetype.NewProvider(scaleFromZeroEC2Client) + scaleFromZeroPlatform = hyperv1.AWSPlatform log.Info("Instance type provider initialized", "provider", opts.ScaleFromZeroProvider) case "azure": raw, err := os.ReadFile(opts.ScaleFromZeroCreds) @@ -624,8 +627,24 @@ func setupNodePoolController(ctx context.Context, mgr ctrl.Manager, opts *StartO if err := json.Unmarshal(raw, &azureCreds); err != nil { return fmt.Errorf("failed to parse Azure scale-from-zero credentials: %w", err) } + var missing []string + if azureCreds.SubscriptionID == "" { + missing = append(missing, "subscriptionId") + } + if azureCreds.ClientID == "" { + missing = append(missing, "clientId") + } + if azureCreds.ClientSecret == "" { + missing = append(missing, "clientSecret") + } + if azureCreds.TenantID == "" { + missing = append(missing, "tenantId") + } if azureCreds.Location == "" { - return fmt.Errorf("Azure scale-from-zero credentials must include 'location'") + missing = 
append(missing, "location") + } + if len(missing) > 0 { + return fmt.Errorf("Azure scale-from-zero credentials missing required fields: %s", strings.Join(missing, ", ")) } cred, err := azidentity.NewClientSecretCredential(azureCreds.TenantID, azureCreds.ClientID, azureCreds.ClientSecret, nil) if err != nil { @@ -636,6 +655,7 @@ func setupNodePoolController(ctx context.Context, mgr ctrl.Manager, opts *StartO return fmt.Errorf("failed to create Azure ResourceSKUs client: %w", err) } instanceTypeProvider = azureinstancetype.NewProvider(skuClient, azureCreds.Location) + scaleFromZeroPlatform = hyperv1.AzurePlatform log.Info("Instance type provider initialized", "provider", opts.ScaleFromZeroProvider, "location", azureCreds.Location) default: log.Info("WARNING: Unsupported scale-from-zero provider", "provider", opts.ScaleFromZeroProvider) @@ -651,6 +671,7 @@ func setupNodePoolController(ctx context.Context, mgr ctrl.Manager, opts *StartO KubevirtInfraClients: kvinfra.NewKubevirtInfraClientMap(), EC2Client: ec2Client, InstanceTypeProvider: instanceTypeProvider, + ScaleFromZeroPlatform: scaleFromZeroPlatform, }).SetupWithManager(mgr); err != nil { return fmt.Errorf("unable to create controller: %w", err) } From 9bf4e1849faa7f0bd2a21355eef7070c358702d1 Mon Sep 17 00:00:00 2001 From: Jesse Jaggars Date: Thu, 7 May 2026 13:49:45 +0000 Subject: [PATCH 3/7] fix: update envtest suite for Azure scale-from-zero Update CRD test suite to match the updated validation rule that allows autoScaling.min=0 on Azure platform: - Change Azure min=0 test from expecting failure to expecting success - Update Agent and KubeVirt error messages to include Azure Co-Authored-By: Claude Opus 4.6 (1M context) --- .../stable.nodepools.autoscaling.testsuite.yaml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cmd/install/assets/crds/hypershift-operator/tests/nodepools.hypershift.openshift.io/stable.nodepools.autoscaling.testsuite.yaml 
b/cmd/install/assets/crds/hypershift-operator/tests/nodepools.hypershift.openshift.io/stable.nodepools.autoscaling.testsuite.yaml index 924499411ac..10ee8107523 100644 --- a/cmd/install/assets/crds/hypershift-operator/tests/nodepools.hypershift.openshift.io/stable.nodepools.autoscaling.testsuite.yaml +++ b/cmd/install/assets/crds/hypershift-operator/tests/nodepools.hypershift.openshift.io/stable.nodepools.autoscaling.testsuite.yaml @@ -31,7 +31,7 @@ tests: id: "subnet-01234567" type: AWS - - name: when autoScaling min=0 on Azure platform it should fail + - name: when autoScaling min=0 on Azure platform it should pass initial: | apiVersion: hypershift.openshift.io/v1beta1 kind: NodePool @@ -56,7 +56,6 @@ tests: diskStorageAccountType: Premium_LRS subnetID: "/subscriptions/12345678-1234-5678-9012-123456789012/resourceGroups/test-rg/providers/Microsoft.Network/virtualNetworks/test-vnet/subnets/test-subnet" type: Azure - expectedError: "Scale-from-zero (autoScaling.min=0) is currently only supported for AWS platform" - name: when autoScaling min=0 on Agent platform it should fail initial: | @@ -77,7 +76,7 @@ tests: agent: {} type: Agent - expectedError: "Scale-from-zero (autoScaling.min=0) is currently only supported for AWS platform" + expectedError: "Scale-from-zero (autoScaling.min=0) is currently only supported for AWS and Azure platforms" - name: when autoScaling min=0 on KubeVirt platform it should fail initial: | @@ -101,7 +100,7 @@ tests: persistent: size: 32Gi type: KubeVirt - expectedError: "Scale-from-zero (autoScaling.min=0) is currently only supported for AWS platform" + expectedError: "Scale-from-zero (autoScaling.min=0) is currently only supported for AWS and Azure platforms" - name: when autoScaling min=1 on Azure platform it should pass initial: | From 9e881ae970a88e060b153bdba6eb818db54a5907 Mon Sep 17 00:00:00 2001 From: Jesse Jaggars Date: Thu, 7 May 2026 20:39:06 +0000 Subject: [PATCH 4/7] fix: address lint and verify CI failures - Lowercase error 
string for Azure scale-from-zero credentials - Fix gci import ordering in main.go, provider_test.go, scale_from_zero_test.go, and nodepool_test.go Co-Authored-By: Claude Opus 4.6 (1M context) --- .../instancetype/azure/provider_test.go | 50 +++++++++---------- .../nodepool/scale_from_zero_test.go | 2 +- hypershift-operator/main.go | 4 +- test/e2e/nodepool_test.go | 2 + 4 files changed, 30 insertions(+), 28 deletions(-) diff --git a/hypershift-operator/controllers/nodepool/instancetype/azure/provider_test.go b/hypershift-operator/controllers/nodepool/instancetype/azure/provider_test.go index 9e7f87ef65c..ea8a653d790 100644 --- a/hypershift-operator/controllers/nodepool/instancetype/azure/provider_test.go +++ b/hypershift-operator/controllers/nodepool/instancetype/azure/provider_test.go @@ -61,8 +61,8 @@ func TestTransformSKU_WhenValidInput_ItShouldTransformCorrectly(t *testing.T) { { name: "When Standard_D4s_v3 with x64 arch it should transform correctly", input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ - "vCPUs": "4", - "MemoryGB": "16", + "vCPUs": "4", + "MemoryGB": "16", "CpuArchitectureType": "x64", }), expected: &instancetype.InstanceTypeInfo{ @@ -76,9 +76,9 @@ func TestTransformSKU_WhenValidInput_ItShouldTransformCorrectly(t *testing.T) { { name: "When GPU VM it should set GPU count", input: makeSKU("Standard_NC16as_T4_v3", "virtualMachines", map[string]string{ - "vCPUs": "16", - "MemoryGB": "110", - "GPUs": "1", + "vCPUs": "16", + "MemoryGB": "110", + "GPUs": "1", "CpuArchitectureType": "x64", }), expected: &instancetype.InstanceTypeInfo{ @@ -92,8 +92,8 @@ func TestTransformSKU_WhenValidInput_ItShouldTransformCorrectly(t *testing.T) { { name: "When Arm64 VM it should set correct architecture", input: makeSKU("Standard_D4ps_v5", "virtualMachines", map[string]string{ - "vCPUs": "4", - "MemoryGB": "16", + "vCPUs": "4", + "MemoryGB": "16", "CpuArchitectureType": "Arm64", }), expected: &instancetype.InstanceTypeInfo{ @@ -107,8 +107,8 @@ 
func TestTransformSKU_WhenValidInput_ItShouldTransformCorrectly(t *testing.T) { { name: "When GPUs capability is absent it should default to 0", input: makeSKU("Standard_B2s", "virtualMachines", map[string]string{ - "vCPUs": "2", - "MemoryGB": "4", + "vCPUs": "2", + "MemoryGB": "4", "CpuArchitectureType": "x64", }), expected: &instancetype.InstanceTypeInfo{ @@ -122,8 +122,8 @@ func TestTransformSKU_WhenValidInput_ItShouldTransformCorrectly(t *testing.T) { { name: "When MemoryGB is fractional it should convert correctly", input: makeSKU("Standard_B1ls", "virtualMachines", map[string]string{ - "vCPUs": "1", - "MemoryGB": "0.5", + "vCPUs": "1", + "MemoryGB": "0.5", "CpuArchitectureType": "x64", }), expected: &instancetype.InstanceTypeInfo{ @@ -137,8 +137,8 @@ func TestTransformSKU_WhenValidInput_ItShouldTransformCorrectly(t *testing.T) { { name: "When MemoryGB is large it should convert correctly", input: makeSKU("Standard_M416ms_v2", "virtualMachines", map[string]string{ - "vCPUs": "416", - "MemoryGB": "11400", + "vCPUs": "416", + "MemoryGB": "11400", "CpuArchitectureType": "x64", }), expected: &instancetype.InstanceTypeInfo{ @@ -181,7 +181,7 @@ func TestTransformSKU_WhenMissingRequiredFields_ItShouldReturnError(t *testing.T { name: "When vCPUs capability is missing it should return error", input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ - "MemoryGB": "16", + "MemoryGB": "16", "CpuArchitectureType": "x64", }), expectedError: "missing vCPUs capability", @@ -189,7 +189,7 @@ func TestTransformSKU_WhenMissingRequiredFields_ItShouldReturnError(t *testing.T { name: "When MemoryGB capability is missing it should return error", input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ - "vCPUs": "4", + "vCPUs": "4", "CpuArchitectureType": "x64", }), expectedError: "missing MemoryGB capability", @@ -205,8 +205,8 @@ func TestTransformSKU_WhenMissingRequiredFields_ItShouldReturnError(t *testing.T { name: "When vCPUs value is not a valid 
integer it should return error", input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ - "vCPUs": "abc", - "MemoryGB": "16", + "vCPUs": "abc", + "MemoryGB": "16", "CpuArchitectureType": "x64", }), expectedError: "invalid vCPUs value", @@ -214,8 +214,8 @@ func TestTransformSKU_WhenMissingRequiredFields_ItShouldReturnError(t *testing.T { name: "When MemoryGB value is not a valid float it should return error", input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ - "vCPUs": "4", - "MemoryGB": "xyz", + "vCPUs": "4", + "MemoryGB": "xyz", "CpuArchitectureType": "x64", }), expectedError: "invalid MemoryGB value", @@ -223,8 +223,8 @@ func TestTransformSKU_WhenMissingRequiredFields_ItShouldReturnError(t *testing.T { name: "When vCPUs value is zero it should return error", input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ - "vCPUs": "0", - "MemoryGB": "16", + "vCPUs": "0", + "MemoryGB": "16", "CpuArchitectureType": "x64", }), expectedError: "invalid vCPUs count", @@ -232,8 +232,8 @@ func TestTransformSKU_WhenMissingRequiredFields_ItShouldReturnError(t *testing.T { name: "When MemoryGB value is zero it should return error", input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ - "vCPUs": "4", - "MemoryGB": "0", + "vCPUs": "4", + "MemoryGB": "0", "CpuArchitectureType": "x64", }), expectedError: "invalid MemoryGB value", @@ -241,8 +241,8 @@ func TestTransformSKU_WhenMissingRequiredFields_ItShouldReturnError(t *testing.T { name: "When CpuArchitectureType is unsupported it should return error", input: makeSKU("Standard_D4s_v3", "virtualMachines", map[string]string{ - "vCPUs": "4", - "MemoryGB": "16", + "vCPUs": "4", + "MemoryGB": "16", "CpuArchitectureType": "i386", }), expectedError: "unsupported CPU architecture", diff --git a/hypershift-operator/controllers/nodepool/scale_from_zero_test.go b/hypershift-operator/controllers/nodepool/scale_from_zero_test.go index 6b039040a9f..ae16ff36060 100644 --- 
a/hypershift-operator/controllers/nodepool/scale_from_zero_test.go +++ b/hypershift-operator/controllers/nodepool/scale_from_zero_test.go @@ -212,7 +212,7 @@ func TestSetScaleFromZeroAnnotationsOnObject(t *testing.T) { }, }, { - name: "When Azure template with valid VMSize and no GPU it should set basic annotations", + name: "When Azure template with valid VMSize and no GPU it should set basic annotations", provider: &mockProvider{info: &instancetype.InstanceTypeInfo{ VCPU: 4, MemoryMb: 16384, GPU: 0, CPUArchitecture: "amd64", }}, diff --git a/hypershift-operator/main.go b/hypershift-operator/main.go index 85740e79bf7..83b1d8e6fba 100644 --- a/hypershift-operator/main.go +++ b/hypershift-operator/main.go @@ -17,8 +17,8 @@ package main import ( "context" "crypto/tls" - "fmt" "encoding/json" + "fmt" "os" "strings" "time" @@ -644,7 +644,7 @@ func setupNodePoolController(ctx context.Context, mgr ctrl.Manager, opts *StartO missing = append(missing, "location") } if len(missing) > 0 { - return fmt.Errorf("Azure scale-from-zero credentials missing required fields: %s", strings.Join(missing, ", ")) + return fmt.Errorf("azure scale-from-zero credentials missing required fields: %s", strings.Join(missing, ", ")) } cred, err := azidentity.NewClientSecretCredential(azureCreds.TenantID, azureCreds.ClientID, azureCreds.ClientSecret, nil) if err != nil { diff --git a/test/e2e/nodepool_test.go b/test/e2e/nodepool_test.go index 6e40796a160..f72452ab793 100644 --- a/test/e2e/nodepool_test.go +++ b/test/e2e/nodepool_test.go @@ -12,10 +12,12 @@ import ( hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" "github.com/openshift/hypershift/support/conditions" e2eutil "github.com/openshift/hypershift/test/e2e/util" + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" + crclient "sigs.k8s.io/controller-runtime/pkg/client" . 
"github.com/onsi/gomega" From d32f2b92d8eeea04633a4aef07ced231cf24cde6 Mon Sep 17 00:00:00 2001 From: Jesse Jaggars Date: Fri, 8 May 2026 18:58:46 +0000 Subject: [PATCH 5/7] fix: address CodeRabbit review for Azure scale-from-zero - Gate effectiveMin=0 on runtime-configured scaleFromZeroPlatform instead of static platform type check, preventing stalled pools when the scale-from-zero provider isn't wired up - Resolve AZURE_CLOUD_NAME for credential and SKU client construction in scale-from-zero init, matching sovereign cloud support used elsewhere - Return errors on invalid/negative GPU values in transformSKU instead of silently skipping, with VM size in error messages for debuggability Co-Authored-By: Claude Opus 4.6 (1M context) --- .../controllers/nodepool/capi.go | 15 +++++++------- .../controllers/nodepool/capi_test.go | 12 +++++++---- .../nodepool/instancetype/azure/provider.go | 8 ++++++-- .../instancetype/azure/provider_test.go | 20 +++++++++++++++++++ .../nodepool/nodepool_controller.go | 1 + hypershift-operator/main.go | 16 +++++++++++++-- 6 files changed, 57 insertions(+), 15 deletions(-) diff --git a/hypershift-operator/controllers/nodepool/capi.go b/hypershift-operator/controllers/nodepool/capi.go index 068e11dd2a6..907f4ab82d0 100644 --- a/hypershift-operator/controllers/nodepool/capi.go +++ b/hypershift-operator/controllers/nodepool/capi.go @@ -51,7 +51,8 @@ const ( // and let nodepool, hostedcluster, and client be fields of CAPI / interface methods. 
type CAPI struct { *Token - capiClusterName string + capiClusterName string + scaleFromZeroPlatform hyperv1.PlatformType upsert.ApplyProvider } @@ -472,7 +473,7 @@ func (c *CAPI) reconcileMachineDeployment(ctx context.Context, log logr.Logger, } } - setMachineDeploymentReplicas(nodePool, machineDeployment) + setMachineDeploymentReplicas(nodePool, machineDeployment, c.scaleFromZeroPlatform) if updated := c.propagateVersionAndTemplate(log, machineDeployment, machineTemplateCR); updated { return nil @@ -756,7 +757,7 @@ func (c *CAPI) reconcileMachineHealthCheck(ctx context.Context, // setMachineDeploymentReplicas sets wanted replicas: // If autoscaling is enabled we reconcile min/max annotations and leave replicas untouched. -func setMachineDeploymentReplicas(nodePool *hyperv1.NodePool, machineDeployment *capiv1.MachineDeployment) { +func setMachineDeploymentReplicas(nodePool *hyperv1.NodePool, machineDeployment *capiv1.MachineDeployment, scaleFromZeroPlatform hyperv1.PlatformType) { if machineDeployment.Annotations == nil { machineDeployment.Annotations = make(map[string]string) } @@ -773,7 +774,7 @@ func setMachineDeploymentReplicas(nodePool *hyperv1.NodePool, machineDeployment // NodePools from being permanently stuck at 0 replicas on platforms that don't support // scale-from-zero metadata. 
effectiveMin := ptr.Deref(nodePool.Spec.AutoScaling.Min, 0) - if effectiveMin == 0 && nodePool.Spec.Platform.Type != hyperv1.AWSPlatform && nodePool.Spec.Platform.Type != hyperv1.AzurePlatform { + if effectiveMin == 0 && nodePool.Spec.Platform.Type != hyperv1.AWSPlatform && nodePool.Spec.Platform.Type != scaleFromZeroPlatform { effectiveMin = 1 } @@ -957,7 +958,7 @@ func (c *CAPI) reconcileMachineSet(ctx context.Context, } machineSet.Spec.Template.Annotations[nodePoolAnnotationTaints] = taintsInJSON - setMachineSetReplicas(nodePool, machineSet) + setMachineSetReplicas(nodePool, machineSet, c.scaleFromZeroPlatform) isUpdating := false // Propagate version and userData Secret to the MachineSet. @@ -1064,7 +1065,7 @@ func machineSetInPlaceRolloutIsComplete(machineSet *capiv1.MachineSet) bool { // setMachineSetReplicas sets wanted replicas: // If autoscaling is enabled we reconcile min/max annotations and leave replicas untouched. -func setMachineSetReplicas(nodePool *hyperv1.NodePool, machineSet *capiv1.MachineSet) { +func setMachineSetReplicas(nodePool *hyperv1.NodePool, machineSet *capiv1.MachineSet, scaleFromZeroPlatform hyperv1.PlatformType) { if machineSet.Annotations == nil { machineSet.Annotations = make(map[string]string) } @@ -1081,7 +1082,7 @@ func setMachineSetReplicas(nodePool *hyperv1.NodePool, machineSet *capiv1.Machin // NodePools from being permanently stuck at 0 replicas on platforms that don't support // scale-from-zero metadata. 
effectiveMin := ptr.Deref(nodePool.Spec.AutoScaling.Min, 0) - if effectiveMin == 0 && nodePool.Spec.Platform.Type != hyperv1.AWSPlatform && nodePool.Spec.Platform.Type != hyperv1.AzurePlatform { + if effectiveMin == 0 && nodePool.Spec.Platform.Type != hyperv1.AWSPlatform && nodePool.Spec.Platform.Type != scaleFromZeroPlatform { effectiveMin = 1 } diff --git a/hypershift-operator/controllers/nodepool/capi_test.go b/hypershift-operator/controllers/nodepool/capi_test.go index dde546a1ead..4250b5025c8 100644 --- a/hypershift-operator/controllers/nodepool/capi_test.go +++ b/hypershift-operator/controllers/nodepool/capi_test.go @@ -42,6 +42,7 @@ func TestSetMachineSetReplicas(t *testing.T) { name string nodePool *hyperv1.NodePool machineSet *capiv1.MachineSet + scaleFromZeroPlatform hyperv1.PlatformType expectReplicas int32 expectAutoscalerAnnotations map[string]string }{ @@ -199,7 +200,8 @@ func TestSetMachineSetReplicas(t *testing.T) { Replicas: nil, }, }, - expectReplicas: 0, + scaleFromZeroPlatform: hyperv1.AzurePlatform, + expectReplicas: 0, expectAutoscalerAnnotations: map[string]string{ autoscalerMinAnnotation: "0", autoscalerMaxAnnotation: "5", @@ -266,7 +268,7 @@ func TestSetMachineSetReplicas(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { g := NewWithT(t) - setMachineSetReplicas(tc.nodePool, tc.machineSet) + setMachineSetReplicas(tc.nodePool, tc.machineSet, tc.scaleFromZeroPlatform) g.Expect(*tc.machineSet.Spec.Replicas).To(Equal(tc.expectReplicas)) g.Expect(tc.machineSet.Annotations).To(Equal(tc.expectAutoscalerAnnotations)) }) @@ -279,6 +281,7 @@ func TestSetMachineDeploymentReplicas(t *testing.T) { name string nodePool *hyperv1.NodePool machineDeployment *capiv1.MachineDeployment + scaleFromZeroPlatform hyperv1.PlatformType expectReplicas int32 expectAutoscalerAnnotations map[string]string }{ @@ -525,7 +528,8 @@ func TestSetMachineDeploymentReplicas(t *testing.T) { Replicas: nil, }, }, - expectReplicas: 0, + 
scaleFromZeroPlatform: hyperv1.AzurePlatform, + expectReplicas: 0, expectAutoscalerAnnotations: map[string]string{ autoscalerMinAnnotation: "0", autoscalerMaxAnnotation: "5", @@ -592,7 +596,7 @@ func TestSetMachineDeploymentReplicas(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { g := NewWithT(t) - setMachineDeploymentReplicas(tc.nodePool, tc.machineDeployment) + setMachineDeploymentReplicas(tc.nodePool, tc.machineDeployment, tc.scaleFromZeroPlatform) g.Expect(*tc.machineDeployment.Spec.Replicas).To(Equal(tc.expectReplicas)) g.Expect(tc.machineDeployment.Annotations).To(Equal(tc.expectAutoscalerAnnotations)) }) diff --git a/hypershift-operator/controllers/nodepool/instancetype/azure/provider.go b/hypershift-operator/controllers/nodepool/instancetype/azure/provider.go index 1ee6d4ee350..57dd7a499ba 100644 --- a/hypershift-operator/controllers/nodepool/instancetype/azure/provider.go +++ b/hypershift-operator/controllers/nodepool/instancetype/azure/provider.go @@ -130,9 +130,13 @@ func transformSKU(sku *armcompute.ResourceSKU) (*instancetype.InstanceTypeInfo, gpuStr, ok := getCapabilityValue(sku.Capabilities, "GPUs") if ok { gpu, err := strconv.ParseInt(gpuStr, 10, 32) - if err == nil { - info.GPU = int32(gpu) + if err != nil { + return nil, fmt.Errorf("invalid GPUs value %q for VM size %q: %w", gpuStr, name, err) + } + if gpu < 0 { + return nil, fmt.Errorf("negative GPUs count %d for VM size %q", gpu, name) } + info.GPU = int32(gpu) } archStr, ok := getCapabilityValue(sku.Capabilities, "CpuArchitectureType") diff --git a/hypershift-operator/controllers/nodepool/instancetype/azure/provider_test.go b/hypershift-operator/controllers/nodepool/instancetype/azure/provider_test.go index ea8a653d790..04a6dfafe9a 100644 --- a/hypershift-operator/controllers/nodepool/instancetype/azure/provider_test.go +++ b/hypershift-operator/controllers/nodepool/instancetype/azure/provider_test.go @@ -247,6 +247,26 @@ func 
TestTransformSKU_WhenMissingRequiredFields_ItShouldReturnError(t *testing.T }), expectedError: "unsupported CPU architecture", }, + { + name: "When GPUs value is not a valid integer it should return error", + input: makeSKU("Standard_NC6", "virtualMachines", map[string]string{ + "vCPUs": "6", + "MemoryGB": "56", + "GPUs": "abc", + "CpuArchitectureType": "x64", + }), + expectedError: "invalid GPUs value", + }, + { + name: "When GPUs value is negative it should return error", + input: makeSKU("Standard_NC6", "virtualMachines", map[string]string{ + "vCPUs": "6", + "MemoryGB": "56", + "GPUs": "-1", + "CpuArchitectureType": "x64", + }), + expectedError: "negative GPUs count", + }, } for _, tt := range tests { diff --git a/hypershift-operator/controllers/nodepool/nodepool_controller.go b/hypershift-operator/controllers/nodepool/nodepool_controller.go index 923ef898bf6..2bfca1845ef 100644 --- a/hypershift-operator/controllers/nodepool/nodepool_controller.go +++ b/hypershift-operator/controllers/nodepool/nodepool_controller.go @@ -386,6 +386,7 @@ func (r *NodePoolReconciler) reconcile(ctx context.Context, hcluster *hyperv1.Ho if err != nil { return ctrl.Result{}, err } + capi.scaleFromZeroPlatform = r.ScaleFromZeroPlatform if isPaused, duration := supportutil.IsReconciliationPaused(log, nodePool.Spec.PausedUntil); isPaused { if err := capi.Pause(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("error pausing CAPI: %w", err) diff --git a/hypershift-operator/main.go b/hypershift-operator/main.go index 83b1d8e6fba..680084874e4 100644 --- a/hypershift-operator/main.go +++ b/hypershift-operator/main.go @@ -646,11 +646,23 @@ func setupNodePoolController(ctx context.Context, mgr ctrl.Manager, opts *StartO if len(missing) > 0 { return fmt.Errorf("azure scale-from-zero credentials missing required fields: %s", strings.Join(missing, ", ")) } - cred, err := azidentity.NewClientSecretCredential(azureCreds.TenantID, azureCreds.ClientID, azureCreds.ClientSecret, nil) + azureCloudName 
:= os.Getenv("AZURE_CLOUD_NAME") + if azureCloudName == "" { + azureCloudName = config.DefaultAzureCloud + } + cloudConfig, err := azureutil.GetAzureCloudConfiguration(azureCloudName) + if err != nil { + return fmt.Errorf("failed to get Azure cloud configuration for scale-from-zero: %w", err) + } + cred, err := azidentity.NewClientSecretCredential(azureCreds.TenantID, azureCreds.ClientID, azureCreds.ClientSecret, + &azidentity.ClientSecretCredentialOptions{ + ClientOptions: azcore.ClientOptions{Cloud: cloudConfig}, + }, + ) if err != nil { return fmt.Errorf("failed to create Azure credentials for scale-from-zero: %w", err) } - skuClient, err := armcompute.NewResourceSKUsClient(azureCreds.SubscriptionID, cred, nil) + skuClient, err := armcompute.NewResourceSKUsClient(azureCreds.SubscriptionID, cred, azureutil.NewARMClientOptions(cloudConfig)) if err != nil { return fmt.Errorf("failed to create Azure ResourceSKUs client: %w", err) } From 686c55e915d158e1aeaf121d572f07ea0ca10b47 Mon Sep 17 00:00:00 2001 From: Jesse Jaggars Date: Tue, 26 May 2026 10:16:59 -0400 Subject: [PATCH 6/7] fix: update CLI help text and regenerate vendor files Update the --scale-from-zero-provider help text to list both aws and azure as supported platforms. Regenerate vendor and docs to sync with the rebased branch. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- cmd/install/install.go | 2 +- docs/content/reference/aggregated-docs.md | 2 +- docs/content/reference/api.md | 2 +- go.mod | 1 + .../hypershift/api/hypershift/v1beta1/nodepool_types.go | 4 ++-- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cmd/install/install.go b/cmd/install/install.go index 5f7a6560cac..c2b0366ef96 100644 --- a/cmd/install/install.go +++ b/cmd/install/install.go @@ -432,7 +432,7 @@ func NewCommand() *cobra.Command { cmd.PersistentFlags().StringSliceVar(&opts.PlatformsToInstall, "limit-crd-install", opts.PlatformsToInstall, "Used to limit the CRDs that are installed to a per platform basis (example: --limit-crd-install=AWS,Azure). If this flag is not specified, all CRDs for all platforms will be installed. Valid, case-insensitive values are: AWS, Azure, IBMCloud, KubeVirt, Agent, OpenStack, GCP.") cmd.PersistentFlags().StringToStringVar(&opts.AdditionalOperatorEnvVars, "additional-operator-env-vars", opts.AdditionalOperatorEnvVars, "Set of additional environment variables to be set on the HyperShift Operator deployment.") cmd.PersistentFlags().BoolVar(&opts.EnableAuditLogPersistence, "enable-audit-log-persistence", opts.EnableAuditLogPersistence, "If true, enables persistent audit logs with automatic snapshots for kube-apiserver pods") - cmd.PersistentFlags().StringVar(&opts.ScaleFromZeroProvider, "scale-from-zero-provider", opts.ScaleFromZeroProvider, "Platform type for scale-from-zero autoscaling (aws)") + cmd.PersistentFlags().StringVar(&opts.ScaleFromZeroProvider, "scale-from-zero-provider", opts.ScaleFromZeroProvider, "Platform type for scale-from-zero autoscaling (aws, azure)") cmd.PersistentFlags().StringVar(&opts.ScaleFromZeroCreds, "scale-from-zero-creds", opts.ScaleFromZeroCreds, "Path to credentials file for scale-from-zero instance type queries") cmd.PersistentFlags().StringVar(&opts.ScaleFromZeroCredentialsSecret, "scale-from-zero-secret", 
opts.ScaleFromZeroCredentialsSecret, "Name of existing secret containing scale-from-zero credentials (alternative to --scale-from-zero-creds)") cmd.PersistentFlags().StringVar(&opts.ScaleFromZeroCredentialsSecretKey, "scale-from-zero-secret-key", opts.ScaleFromZeroCredentialsSecretKey, "Key within the scale-from-zero credentials secret (default: credentials)") diff --git a/docs/content/reference/aggregated-docs.md b/docs/content/reference/aggregated-docs.md index e1e9658ac27..34e7f5fb53c 100644 --- a/docs/content/reference/aggregated-docs.md +++ b/docs/content/reference/aggregated-docs.md @@ -48365,7 +48365,7 @@ int32

min is the minimum number of nodes to maintain in the pool. -Can be set to 0 for scale-from-zero for AWS platform. +Can be set to 0 for scale-from-zero for AWS and Azure platforms. Must be >= 0 and <= .Max.

diff --git a/docs/content/reference/api.md b/docs/content/reference/api.md index 828e8531ff1..91284eac765 100644 --- a/docs/content/reference/api.md +++ b/docs/content/reference/api.md @@ -12680,7 +12680,7 @@ int32

min is the minimum number of nodes to maintain in the pool. -Can be set to 0 for scale-from-zero for AWS platform. +Can be set to 0 for scale-from-zero for AWS and Azure platforms. Must be >= 0 and <= .Max.

diff --git a/go.mod b/go.mod index 536ea13a88a..38cf7c22432 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,7 @@ require ( github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.1 github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/authorization/armauthorization/v2 v2.2.0 + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5 v5.7.0 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/dns/armdns v1.2.0 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/msi/armmsi v1.3.0 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v5 v5.2.0 diff --git a/vendor/github.com/openshift/hypershift/api/hypershift/v1beta1/nodepool_types.go b/vendor/github.com/openshift/hypershift/api/hypershift/v1beta1/nodepool_types.go index dc17f42ee9e..5ebc5378064 100644 --- a/vendor/github.com/openshift/hypershift/api/hypershift/v1beta1/nodepool_types.go +++ b/vendor/github.com/openshift/hypershift/api/hypershift/v1beta1/nodepool_types.go @@ -106,7 +106,7 @@ type NodePool struct { // +kubebuilder:validation:XValidation:rule="!has(self.replicas) || !has(self.autoScaling)", message="Both replicas or autoScaling should not be set" // +kubebuilder:validation:XValidation:rule="self.arch != 's390x' || has(self.platform.kubevirt)", message="s390x is only supported on KubeVirt platform" // +kubebuilder:validation:XValidation:rule="!has(self.platform.aws) || !has(self.platform.aws.imageType) || self.platform.aws.imageType != 'Windows' || self.arch == 'amd64'", message="ImageType 'Windows' requires arch 'amd64' (AWS only)" -// +kubebuilder:validation:XValidation:rule="!has(self.autoScaling) || self.autoScaling.min > 0 || self.platform.type == 'AWS'", message="Scale-from-zero (autoScaling.min=0) is currently only supported for AWS platform" +// +kubebuilder:validation:XValidation:rule="!has(self.autoScaling) || self.autoScaling.min > 0 || self.platform.type == 'AWS' || self.platform.type == 'Azure'", 
message="Scale-from-zero (autoScaling.min=0) is currently only supported for AWS and Azure platforms" type NodePoolSpec struct { // clusterName is the name of the HostedCluster this NodePool belongs to. // If a HostedCluster with this name doesn't exist, the controller will no-op until it exists. @@ -501,7 +501,7 @@ type NodePoolManagement struct { // +kubebuilder:validation:XValidation:rule="self.max >= self.min", message="max must be equal or greater than min" type NodePoolAutoScaling struct { // min is the minimum number of nodes to maintain in the pool. - // Can be set to 0 for scale-from-zero for AWS platform. + // Can be set to 0 for scale-from-zero for AWS and Azure platforms. // Must be >= 0 and <= .Max. // // +kubebuilder:validation:Minimum=0 From ee42ce8525da7c7a3e11b5fa23fa50c82eb93814 Mon Sep 17 00:00:00 2001 From: Jesse Jaggars Date: Wed, 27 May 2026 07:39:22 -0400 Subject: [PATCH 7/7] test(e2e): enable scale-from-zero e2e test for Azure platform The TestNodePoolAutoscalingScaleFromZero test was hardcoded to skip on non-AWS platforms. The test logic is already platform-agnostic (it copies the existing NodePool spec), so the only test change needed is widening the platform gate to include Azure. This commit also drops the now-redundant `// indirect` armcompute entry from go.mod, since that module is already required directly. A follow-up PR in openshift/release will configure the Azure CI jobs to install the operator with --scale-from-zero-provider=azure and the appropriate credentials.
Co-Authored-By: Claude Opus 4.6 (1M context) --- go.mod | 1 - test/e2e/autoscaling_test.go | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 38cf7c22432..493a7fd6ecb 100644 --- a/go.mod +++ b/go.mod @@ -138,7 +138,6 @@ require ( cloud.google.com/go/compute/metadata v0.9.0 // indirect cyphar.com/go-pathrs v0.2.1 // indirect github.com/Azure/azure-sdk-for-go/sdk/internal v1.12.0 // indirect - github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5 v5.7.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/azsecrets v1.4.0 // indirect github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/internal v1.2.0 // indirect github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect diff --git a/test/e2e/autoscaling_test.go b/test/e2e/autoscaling_test.go index 62f5c07374b..270af8cd991 100644 --- a/test/e2e/autoscaling_test.go +++ b/test/e2e/autoscaling_test.go @@ -675,8 +675,8 @@ func testAutoscalerRespectsNodePoolPause(ctx context.Context, mgtClient crclient } func TestNodePoolAutoscalingScaleFromZero(t *testing.T) { - if globalOpts.Platform != hyperv1.AWSPlatform { - t.Skip("test only supported on platform AWS") + if globalOpts.Platform != hyperv1.AWSPlatform && globalOpts.Platform != hyperv1.AzurePlatform { + t.Skip("test only supported on AWS and Azure platforms") } // Get management client to check for scale-from-zero secret