From ba7737f73fa926cb5cfd2f37c89dad9b381bc44d Mon Sep 17 00:00:00 2001 From: Vimal Solanki Date: Fri, 5 Jun 2026 16:37:17 +0530 Subject: [PATCH] fix(azure): prevent cluster deletion from hanging when resource groups are already deleted When Azure resource groups are deleted before the HostedCluster, two things block cleanup: CAPI AzureMachine finalizers can never be removed because the provider controller cannot reach the deleted resources, and the CLI destroy command hard-fails on the resource group pre-validation check. Implement the OrphanDeleter interface for Azure by detecting AzureMachines stuck in deletion for longer than 5 minutes and force-removing their finalizers, allowing the CAPI deletion cascade to proceed. Also change the destroy command's RG pre-validation to log a warning and continue on 404 instead of failing. Signed-off-by: Vimal Solanki --- cmd/cluster/azure/destroy.go | 9 +- .../internal/platform/azure/azure.go | 32 ++++ .../internal/platform/azure/azure_test.go | 143 ++++++++++++++++++ .../internal/platform/platform.go | 15 +- 4 files changed, 191 insertions(+), 8 deletions(-) diff --git a/cmd/cluster/azure/destroy.go b/cmd/cluster/azure/destroy.go index baf63d29acb..76863ba6a87 100644 --- a/cmd/cluster/azure/destroy.go +++ b/cmd/cluster/azure/destroy.go @@ -2,7 +2,9 @@ package azure import ( "context" + stderrors "errors" "fmt" + "net/http" "os" "os/signal" "syscall" @@ -140,7 +142,12 @@ func DestroyCluster(ctx context.Context, o *core.DestroyOptions) error { } if _, err = resourceGroupClient.Get(ctx, o.AzurePlatform.ResourceGroupName, nil); err != nil { - return fmt.Errorf("failed to get resource group name, '%s': %w", o.AzurePlatform.ResourceGroupName, err) + var respErr *azcore.ResponseError + if stderrors.As(err, &respErr) && respErr.StatusCode == http.StatusNotFound { + o.Log.Info("Resource group not found, continuing with cluster deletion", "resourceGroup", o.AzurePlatform.ResourceGroupName) + } else { + return fmt.Errorf("failed to get resource group name, '%s': %w", o.AzurePlatform.ResourceGroupName, err) + } } } else { o.AzurePlatform.ResourceGroupName = o.Name + "-" + o.InfraID diff --git a/hypershift-operator/controllers/hostedcluster/internal/platform/azure/azure.go b/hypershift-operator/controllers/hostedcluster/internal/platform/azure/azure.go index 6d633c16e84..9d727d09ce5 100644 --- a/hypershift-operator/controllers/hostedcluster/internal/platform/azure/azure.go +++ b/hypershift-operator/controllers/hostedcluster/internal/platform/azure/azure.go @@ -8,6 +8,7 @@ import ( "os" "path" "strings" + "time" hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/cloud/azure" @@ -24,11 +25,14 @@ import ( rbacv1 "k8s.io/api/rbac/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/utils/ptr" capiazure "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1" capiv1 "sigs.k8s.io/cluster-api/api/core/v1beta1" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "github.com/blang/semver" ) @@ -355,6 +359,34 @@ func (a Azure) DeleteCredentials(ctx context.Context, c client.Client, hcluster return nil } +const orphanedMachineDeletionThreshold = 5 * time.Minute + +func (Azure) DeleteOrphanedMachines(ctx context.Context, c client.Client, hc *hyperv1.HostedCluster, controlPlaneNamespace string) error { + azureMachineList := capiazure.AzureMachineList{} + if err := c.List(ctx, &azureMachineList, client.InNamespace(controlPlaneNamespace)); err != nil { + return fmt.Errorf("failed to list AzureMachines in %s: %w", controlPlaneNamespace, err) + } + logger := ctrl.LoggerFrom(ctx) + var errs []error + for i := range azureMachineList.Items { + m := &azureMachineList.Items[i] + if m.DeletionTimestamp.IsZero() { + continue + } + if time.Since(m.DeletionTimestamp.Time) < orphanedMachineDeletionThreshold { + continue + } + original := m.DeepCopy() + controllerutil.RemoveFinalizer(m, capiazure.MachineFinalizer) + if err := c.Patch(ctx, m, client.MergeFrom(original)); err != nil { + errs = append(errs, fmt.Errorf("failed to remove finalizers from AzureMachine %s/%s: %w", m.Namespace, m.Name, err)) + continue + } + logger.Info("Removed finalizers from orphaned AzureMachine stuck in deletion", "machine", client.ObjectKeyFromObject(m), "deletionTimestamp", m.DeletionTimestamp.Time) + } + return utilerrors.NewAggregate(errs) +} + func reconcileAzureCluster(azureCluster *capiazure.AzureCluster, hcluster *hyperv1.HostedCluster, apiEndpoint hyperv1.APIEndpoint, azureClusterIdentity *capiazure.AzureClusterIdentity, _ string) error { if azureCluster.Annotations == nil { azureCluster.Annotations = map[string]string{} diff --git a/hypershift-operator/controllers/hostedcluster/internal/platform/azure/azure_test.go b/hypershift-operator/controllers/hostedcluster/internal/platform/azure/azure_test.go index b71468ea237..1beaab3466b 100644 --- a/hypershift-operator/controllers/hostedcluster/internal/platform/azure/azure_test.go +++ b/hypershift-operator/controllers/hostedcluster/internal/platform/azure/azure_test.go @@ -5,6 +5,7 @@ import ( "encoding/json" "fmt" "testing" + "time" . "github.com/onsi/gomega" @@ -15,6 +16,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" capiazure "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1" "sigs.k8s.io/controller-runtime/pkg/client" @@ -527,3 +529,144 @@ func TestReconcileKMSConfigSecret(t *testing.T) { }) } } + +func TestDeleteOrphanedMachines(t *testing.T) { + controlPlaneNamespace := "test-cp-ns" + hc := &hyperv1.HostedCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hc", + Namespace: "clusters", + }, + } + + testCases := []struct { + name string + existingMachines []capiazure.AzureMachine + expectFinalizers map[string]bool + expectError bool + }{ + { + name: "When no AzureMachines exist, it should return nil", + existingMachines: nil, + expectFinalizers: map[string]bool{}, + }, + { + name: "When an AzureMachine is not being deleted, it should not remove finalizers", + existingMachines: []capiazure.AzureMachine{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "machine-active", + Namespace: controlPlaneNamespace, + Finalizers: []string{capiazure.MachineFinalizer}, + }, + Spec: capiazure.AzureMachineSpec{ + VMSize: "Standard_D4s_v4", + OSDisk: capiazure.OSDisk{OSType: "Linux", DiskSizeGB: ptr.To[int32](128), ManagedDisk: &capiazure.ManagedDiskParameters{StorageAccountType: "Premium_LRS"}}, + }, + }, + }, + expectFinalizers: map[string]bool{"machine-active": true}, + }, + { + name: "When an AzureMachine is being deleted but under the threshold, it should not remove finalizers", + existingMachines: []capiazure.AzureMachine{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "machine-recent", + Namespace: controlPlaneNamespace, + Finalizers: []string{capiazure.MachineFinalizer}, + DeletionTimestamp: &metav1.Time{Time: time.Now().Add(-1 * time.Minute)}, + }, + Spec: capiazure.AzureMachineSpec{ + VMSize: "Standard_D4s_v4", + OSDisk: capiazure.OSDisk{OSType: "Linux", DiskSizeGB: ptr.To[int32](128), ManagedDisk: &capiazure.ManagedDiskParameters{StorageAccountType: "Premium_LRS"}}, + }, + }, + }, + expectFinalizers: map[string]bool{"machine-recent": true}, + }, + { + name: "When an AzureMachine is being deleted and over the threshold, it should remove the Azure finalizer", + existingMachines: []capiazure.AzureMachine{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "machine-stuck", + Namespace: controlPlaneNamespace, + Finalizers: []string{capiazure.MachineFinalizer}, + DeletionTimestamp: &metav1.Time{Time: time.Now().Add(-10 * time.Minute)}, + }, + Spec: capiazure.AzureMachineSpec{ + VMSize: "Standard_D4s_v4", + OSDisk: capiazure.OSDisk{OSType: "Linux", DiskSizeGB: ptr.To[int32](128), ManagedDisk: &capiazure.ManagedDiskParameters{StorageAccountType: "Premium_LRS"}}, + }, + }, + }, + expectFinalizers: map[string]bool{"machine-stuck": false}, + }, + { + name: "When an AzureMachine has other finalizers, it should only remove the Azure finalizer", + existingMachines: []capiazure.AzureMachine{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "machine-multi-finalizer", + Namespace: controlPlaneNamespace, + Finalizers: []string{"some-other-controller.io/finalizer", capiazure.MachineFinalizer}, + DeletionTimestamp: &metav1.Time{Time: time.Now().Add(-10 * time.Minute)}, + }, + Spec: capiazure.AzureMachineSpec{ + VMSize: "Standard_D4s_v4", + OSDisk: capiazure.OSDisk{OSType: "Linux", DiskSizeGB: ptr.To[int32](128), ManagedDisk: &capiazure.ManagedDiskParameters{StorageAccountType: "Premium_LRS"}}, + }, + }, + }, + expectFinalizers: map[string]bool{"machine-multi-finalizer": false}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + g := NewWithT(t) + + var objects []client.Object + for i := range tc.existingMachines { + objects = append(objects, &tc.existingMachines[i]) + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(api.Scheme). + WithObjects(objects...). + Build() + + azure := Azure{} + err := azure.DeleteOrphanedMachines(t.Context(), fakeClient, hc, controlPlaneNamespace) + + if tc.expectError { + g.Expect(err).To(HaveOccurred(), "expected DeleteOrphanedMachines to return an error") + } else { + g.Expect(err).ToNot(HaveOccurred(), "expected DeleteOrphanedMachines to succeed") + } + + var machineList capiazure.AzureMachineList + g.Expect(fakeClient.List(t.Context(), &machineList, client.InNamespace(controlPlaneNamespace))).To(Succeed(), "expected to list AzureMachines") + + for _, m := range machineList.Items { + expectHasAzureFinalizer, exists := tc.expectFinalizers[m.Name] + if !exists { + continue + } + g.Expect(controllerutil.ContainsFinalizer(&m, capiazure.MachineFinalizer)).To(Equal(expectHasAzureFinalizer), + "unexpected Azure finalizer state on %s", m.Name) + } + + // Verify non-Azure finalizers are preserved on multi-finalizer machines + if tc.name == "When an AzureMachine has other finalizers, it should only remove the Azure finalizer" { + for _, m := range machineList.Items { + if m.Name == "machine-multi-finalizer" { + g.Expect(m.Finalizers).To(ContainElement("some-other-controller.io/finalizer"), + "non-Azure finalizer should be preserved") + } + } + } + }) + } +} diff --git a/hypershift-operator/controllers/hostedcluster/internal/platform/platform.go b/hypershift-operator/controllers/hostedcluster/internal/platform/platform.go index 6916ef0367b..598e934b21e 100644 --- a/hypershift-operator/controllers/hostedcluster/internal/platform/platform.go +++ b/hypershift-operator/controllers/hostedcluster/internal/platform/platform.go @@ -36,13 +36,14 @@ const ( ) var ( - _ Platform = aws.AWS{} - _ Platform = azure.Azure{} - _ Platform = ibmcloud.IBMCloud{} - _ Platform = none.None{} - _ Platform = agent.Agent{} - _ Platform = kubevirt.Kubevirt{} - _ Platform = gcp.GCP{} + _ Platform = aws.AWS{} + _ Platform = azure.Azure{} + _ Platform = ibmcloud.IBMCloud{} + _ Platform = none.None{} + _ Platform = agent.Agent{} + _ Platform = kubevirt.Kubevirt{} + _ Platform = gcp.GCP{} + _ OrphanDeleter = &azure.Azure{} ) type Platform interface {