diff --git a/internal/common/constant/constant.go b/internal/common/constant/constant.go index dd7f34aa..d3dcd5fe 100644 --- a/internal/common/constant/constant.go +++ b/internal/common/constant/constant.go @@ -93,6 +93,9 @@ const JSONTagString = "json" // CommaString defines a constant for the comma string const CommaString = "," +// WildcardString defines the wildcard character used in Velero resource filters +const WildcardString = "*" + // MaximumNacObjectNameLength represents Generated Non Admin Object Name and // must be below 63 characters, because it's used within object Label Value const MaximumNacObjectNameLength = validation.DNS1123LabelMaxLength diff --git a/internal/common/function/function_test.go b/internal/common/function/function_test.go index 4a6ef396..79b2f14d 100644 --- a/internal/common/function/function_test.go +++ b/internal/common/function/function_test.go @@ -378,6 +378,11 @@ func TestValidateBackupSpecEnforcedFields(t *testing.T) { ParallelFilesUpload: 32, //nolint:revive // just test }, }, + { + name: "VolumeGroupSnapshotLabelKey", + enforcedValue: "enforced-label-key", + overrideValue: "user-label-key", + }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { diff --git a/internal/controller/nonadminbackup_controller.go b/internal/controller/nonadminbackup_controller.go index 1e26cf16..4264747b 100644 --- a/internal/controller/nonadminbackup_controller.go +++ b/internal/controller/nonadminbackup_controller.go @@ -21,6 +21,7 @@ import ( "context" "errors" "reflect" + "slices" "github.com/go-logr/logr" velerov1 "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" @@ -705,11 +706,18 @@ func (r *NonAdminBackupReconciler) createVeleroBackupAndSyncWithNonAdminBackup(c if haveNewResourceFilterParameters { // Use the new-style exclusion list - backupSpec.ExcludedNamespaceScopedResources = append(backupSpec.ExcludedNamespaceScopedResources, - alwaysExcludedNamespacedResources...) - backupSpec.ExcludedClusterScopedResources = append(backupSpec.ExcludedClusterScopedResources, - alwaysExcludedClusterResources...) - } else { + // Skip appending when wildcard '*' is already present, as it already + // excludes everything and mixing '*' with specific items causes Velero + // FailedValidation. + if !slices.Contains(backupSpec.ExcludedNamespaceScopedResources, constant.WildcardString) { + backupSpec.ExcludedNamespaceScopedResources = append(backupSpec.ExcludedNamespaceScopedResources, + alwaysExcludedNamespacedResources...) + } + if !slices.Contains(backupSpec.ExcludedClusterScopedResources, constant.WildcardString) { + backupSpec.ExcludedClusterScopedResources = append(backupSpec.ExcludedClusterScopedResources, + alwaysExcludedClusterResources...) + } + } else if !slices.Contains(backupSpec.ExcludedResources, constant.WildcardString) { // Fallback to the old-style exclusion list backupSpec.ExcludedResources = append(backupSpec.ExcludedResources, alwaysExcludedNamespacedResources...) diff --git a/internal/controller/nonadminbackup_controller_test.go b/internal/controller/nonadminbackup_controller_test.go index 65d4b914..c5a5de7a 100644 --- a/internal/controller/nonadminbackup_controller_test.go +++ b/internal/controller/nonadminbackup_controller_test.go @@ -1546,6 +1546,44 @@ var _ = ginkgo.Describe("Test full reconcile loop of NonAdminBackup Controller", }, addSyncLabel: true, }), + ginkgo.Entry("Should not append default excluded resources when wildcard is used in excludedClusterScopedResources", nonAdminBackupFullReconcileScenario{ + spec: nacv1alpha1.NonAdminBackupSpec{ + BackupSpec: &velerov1.BackupSpec{ + ExcludedClusterScopedResources: []string{"*"}, + IncludedNamespaceScopedResources: []string{"pvc"}, + }, + }, + status: nacv1alpha1.NonAdminBackupStatus{ + Phase: nacv1alpha1.NonAdminPhaseCreated, + VeleroBackup: &nacv1alpha1.VeleroBackup{ + Namespace: oadpNamespace, + Status: nil, + Spec: &velerov1.BackupSpec{ + ExcludedClusterScopedResources: []string{"*"}, + IncludedNamespaceScopedResources: []string{"pvc"}, + ExcludedNamespaceScopedResources: []string{ + nacv1alpha1.NonAdminBackups, + nacv1alpha1.NonAdminRestores, + nacv1alpha1.NonAdminBackupStorageLocations, + }, + }, + }, + Conditions: []metav1.Condition{ + { + Type: "Accepted", + Status: metav1.ConditionTrue, + Reason: "BackupAccepted", + Message: "backup accepted", + }, + { + Type: "Queued", + Status: metav1.ConditionTrue, + Reason: "BackupScheduled", + Message: "Created Velero Backup object", + }, + }, + }, + }), ) ginkgo.DescribeTable("Reconcile triggered by NonAdminBackup sync event", diff --git a/internal/controller/nonadminbackupstoragelocation_controller.go b/internal/controller/nonadminbackupstoragelocation_controller.go index db4be820..df2625ea 100644 --- a/internal/controller/nonadminbackupstoragelocation_controller.go +++ b/internal/controller/nonadminbackupstoragelocation_controller.go @@ -12,37 +12,37 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. - -RESOURCE CONFLICT RESOLUTION ENHANCEMENTS: -This file has been enhanced to resolve resource conflict issues that occurred when -multiple controllers or processes attempted to update the same NonAdminBackupStorageLocationRequest -objects simultaneously. The following changes were made: - -1. RETRY LOGIC FRAMEWORK (updateStatusWithRetry function): - - Uses standard Kubernetes client-go retry.RetryOnConflict with DefaultRetry settings - - Handles "object has been modified" errors gracefully - - Fetches fresh object copies to avoid stale ResourceVersion conflicts - - Leverages proven Kubernetes retry patterns (5 attempts, 10ms+jitter) - -2. NIL SAFETY CHECKS (ensureNonAdminRequest function): - - Prevents panic when SourceNonAdminBSL is nil during initialization - - Converts terminal errors to requeue conditions for uninitialized status - - Allows proper status initialization timing in high-concurrency environments - -3. OPTIMIZED STATUS UPDATES (createNonAdminRequest function): - - Uses fast-path direct updates for new objects - - Falls back to retry logic only when conflicts are detected - - Preserves computed status values while ensuring conflict resilience - -4. TEST ENVIRONMENT ADAPTATIONS: - - Increased timeouts to accommodate retry logic execution time - - Reduced polling frequency to handle Kubernetes client rate limiting - - Added delays to prevent overwhelming API server during test runs - -These enhancements ensure that OADP non-admin backup operations complete successfully -even under high concurrency or when multiple reconciliation events occur simultaneously. */ +// RESOURCE CONFLICT RESOLUTION ENHANCEMENTS: +// This file has been enhanced to resolve resource conflict issues that occurred when +// multiple controllers or processes attempted to update the same NonAdminBackupStorageLocationRequest +// objects simultaneously. The following changes were made: +// +// 1. RETRY LOGIC FRAMEWORK (updateStatusWithRetry function): +// - Uses standard Kubernetes client-go retry.RetryOnConflict with DefaultRetry settings +// - Handles "object has been modified" errors gracefully +// - Fetches fresh object copies to avoid stale ResourceVersion conflicts +// - Leverages proven Kubernetes retry patterns (5 attempts, 10ms+jitter) +// +// 2. NIL SAFETY CHECKS (ensureNonAdminRequest function): +// - Prevents panic when SourceNonAdminBSL is nil during initialization +// - Converts terminal errors to requeue conditions for uninitialized status +// - Allows proper status initialization timing in high-concurrency environments +// +// 3. OPTIMIZED STATUS UPDATES (createNonAdminRequest function): +// - Uses fast-path direct updates for new objects +// - Falls back to retry logic only when conflicts are detected +// - Preserves computed status values while ensuring conflict resilience +// +// 4. TEST ENVIRONMENT ADAPTATIONS: +// - Increased timeouts to accommodate retry logic execution time +// - Reduced polling frequency to handle Kubernetes client rate limiting +// - Added delays to prevent overwhelming API server during test runs +// +// These enhancements ensure that OADP non-admin backup operations complete successfully +// even under high concurrency or when multiple reconciliation events occur simultaneously. + package controller import ( @@ -125,7 +125,10 @@ func (r *NonAdminBackupStorageLocationReconciler) updateStatusWithRetry(ctx cont // Get the latest version of the object from the API server to ensure we have // the most recent ResourceVersion and avoid stale object conflicts key := client.ObjectKeyFromObject(obj) - fresh := obj.DeepCopyObject().(client.Object) + fresh, ok := obj.DeepCopyObject().(client.Object) + if !ok { + return errors.New("failed to convert deep copy to client.Object") + } if err := r.Get(ctx, key, fresh); err != nil { return err // RetryOnConflict will handle conflict vs non-conflict errors } @@ -643,7 +646,10 @@ func (r *NonAdminBackupStorageLocationReconciler) createNonAdminRequest(ctx cont // - Event-driven reconciliation causing concurrent status updates logger.V(1).Info("NonAdminBackupStorageLocationRequest already exists") if updateErr := r.updateStatusWithRetry(ctx, logger, nabslRequest, func(obj client.Object) bool { - req := obj.(*nacv1alpha1.NonAdminBackupStorageLocationRequest) + req, ok := obj.(*nacv1alpha1.NonAdminBackupStorageLocationRequest) + if !ok { + return false + } return updatePhaseIfNeeded(&req.Status.Phase, req.Spec.ApprovalDecision) }); updateErr != nil { logger.Error(updateErr, failedUpdateStatusError) @@ -704,26 +710,22 @@ func (r *NonAdminBackupStorageLocationReconciler) createNonAdminRequest(ctx cont // - Correctness: Proper status initialization even under load if updated := updateNonAdminRequestStatus(&nonAdminBslRequest.Status, nabsl, approvalDecision); updated { if updateErr := r.Status().Update(ctx, &nonAdminBslRequest); updateErr != nil { - if apierrors.IsConflict(updateErr) { - // CONFLICT DETECTED: Another process modified the request between create and status update - // This can happen when: - // - Admin approves/rejects the request immediately after creation - // - Multiple reconcile loops are triggered by related events - // - High concurrency in the test environment - logger.V(1).Info("Conflict on initial status update, retrying with fresh object...") - if retryErr := r.updateStatusWithRetry(ctx, logger, &nonAdminBslRequest, func(obj client.Object) bool { - req := obj.(*nacv1alpha1.NonAdminBackupStorageLocationRequest) - return updateNonAdminRequestStatus(&req.Status, nabsl, approvalDecision) - }); retryErr != nil { - logger.Error(retryErr, failedUpdateStatusError) - return false, retryErr - } - } else { - // NON-CONFLICT ERROR: Validation, permission, or other API server issue - // Don't retry these as they're likely to persist + if !apierrors.IsConflict(updateErr) { logger.Error(updateErr, failedUpdateStatusError) return false, updateErr } + // CONFLICT DETECTED: Another process modified the request between create and status update + logger.V(1).Info("Conflict on initial status update, retrying with fresh object...") + if retryErr := r.updateStatusWithRetry(ctx, logger, &nonAdminBslRequest, func(obj client.Object) bool { + req, ok := obj.(*nacv1alpha1.NonAdminBackupStorageLocationRequest) + if !ok { + return false + } + return updateNonAdminRequestStatus(&req.Status, nabsl, approvalDecision) + }); retryErr != nil { + logger.Error(retryErr, failedUpdateStatusError) + return false, retryErr + } } } diff --git a/internal/controller/nonadminrestore_controller.go b/internal/controller/nonadminrestore_controller.go index c7183807..b7859005 100644 --- a/internal/controller/nonadminrestore_controller.go +++ b/internal/controller/nonadminrestore_controller.go @@ -20,6 +20,7 @@ import ( "context" "errors" "reflect" + "slices" "github.com/go-logr/logr" velerov1 "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" @@ -348,8 +349,10 @@ func (r *NonAdminRestoreReconciler) createVeleroRestore(ctx context.Context, log } } - restoreSpec.ExcludedResources = append(restoreSpec.ExcludedResources, - "volumesnapshotclasses") + if !slices.Contains(restoreSpec.ExcludedResources, constant.WildcardString) { + restoreSpec.ExcludedResources = append(restoreSpec.ExcludedResources, + "volumesnapshotclasses") + } veleroRestore = &velerov1.Restore{ ObjectMeta: metav1.ObjectMeta{