From 25baa38762fd19be2edf1cbfbfcfa6cd0fc5e074 Mon Sep 17 00:00:00 2001 From: Scott Seago Date: Tue, 10 Mar 2026 15:41:17 -0400 Subject: [PATCH] refactor: Optimize VSC handle readiness polling for VSS backups Co-authored-by: aider (gemini/gemini-2.5-pro) Signed-off-by: Scott Seago --- pkg/util/csi/volume_snapshot.go | 141 +++++++++++++++++++------------- 1 file changed, 83 insertions(+), 58 deletions(-) diff --git a/pkg/util/csi/volume_snapshot.go b/pkg/util/csi/volume_snapshot.go index 4e4103efa8..58e54341f9 100644 --- a/pkg/util/csi/volume_snapshot.go +++ b/pkg/util/csi/volume_snapshot.go @@ -604,72 +604,97 @@ func WaitUntilVSCHandleIsReady( return vsc, nil } - // We'll wait 10m for the VSC to be reconciled polling - // every 5s unless backup's csiSnapshotTimeout is set - interval := 5 * time.Second + // We'll wait for the VSC to be reconciled, trying a fast poll interval first + // before falling back to a slower poll interval for the full csiSnapshotTimeout. vsc := new(snapshotv1api.VolumeSnapshotContent) + var interval time.Duration + + pollFunc := func(ctx context.Context) (bool, error) { + vs := new(snapshotv1api.VolumeSnapshot) + if err := crClient.Get( + ctx, + crclient.ObjectKeyFromObject(volSnap), + vs, + ); err != nil { + return false, + errors.Wrapf( + err, + "failed to get volumesnapshot %s/%s", + volSnap.Namespace, volSnap.Name, + ) + } + + if vs.Status == nil || vs.Status.BoundVolumeSnapshotContentName == nil { + log.Infof("Waiting for CSI driver to reconcile volumesnapshot %s/%s. Retrying in %ds", + volSnap.Namespace, volSnap.Name, interval/time.Second) + return false, nil + } + + if err := crClient.Get( + ctx, + crclient.ObjectKey{ + Name: *vs.Status.BoundVolumeSnapshotContentName, + }, + vsc, + ); err != nil { + return false, + errors.Wrapf( + err, + "failed to get VolumeSnapshotContent %s for VolumeSnapshot %s/%s", + *vs.Status.BoundVolumeSnapshotContentName, vs.Namespace, vs.Name, + ) + } + // we need to wait for the VolumeSnapshotContent + // to have a snapshot handle because during restore, + // we'll use that snapshot handle as the source for + // the VolumeSnapshotContent so it's statically + // bound to the existing snapshot. + if vsc.Status == nil || + vsc.Status.SnapshotHandle == nil { + log.Infof( + "Waiting for VolumeSnapshotContents %s to have snapshot handle. Retrying in %ds", + vsc.Name, interval/time.Second) + if vsc.Status != nil && + vsc.Status.Error != nil { + log.Warnf("VolumeSnapshotContent %s has error: %v", + vsc.Name, *vsc.Status.Error.Message) + } + return false, nil + } + + return true, nil + } + + // The short interval for the first ten seconds is due to the fact that + // Microsoft VSS backups have a hard-coded unfreeze call after 10 seconds, + // so we need to minimize waiting time during the first 10 seconds. + // First poll with a short interval and timeout. + interval = 1 * time.Second + timeout := 10 * time.Second err := wait.PollUntilContextTimeout( context.Background(), interval, - csiSnapshotTimeout, + timeout, true, - func(ctx context.Context) (bool, error) { - vs := new(snapshotv1api.VolumeSnapshot) - if err := crClient.Get( - ctx, - crclient.ObjectKeyFromObject(volSnap), - vs, - ); err != nil { - return false, - errors.Wrapf( - err, - "failed to get volumesnapshot %s/%s", - volSnap.Namespace, volSnap.Name, - ) - } - - if vs.Status == nil || vs.Status.BoundVolumeSnapshotContentName == nil { - log.Infof("Waiting for CSI driver to reconcile volumesnapshot %s/%s. Retrying in %ds", - volSnap.Namespace, volSnap.Name, interval/time.Second) - return false, nil - } - - if err := crClient.Get( - ctx, - crclient.ObjectKey{ - Name: *vs.Status.BoundVolumeSnapshotContentName, - }, - vsc, - ); err != nil { - return false, - errors.Wrapf( - err, - "failed to get VolumeSnapshotContent %s for VolumeSnapshot %s/%s", - *vs.Status.BoundVolumeSnapshotContentName, vs.Namespace, vs.Name, - ) - } + pollFunc, + ) - // we need to wait for the VolumeSnapshotContent - // to have a snapshot handle because during restore, - // we'll use that snapshot handle as the source for - // the VolumeSnapshotContent so it's statically - // bound to the existing snapshot. - if vsc.Status == nil || - vsc.Status.SnapshotHandle == nil { - log.Infof( - "Waiting for VolumeSnapshotContents %s to have snapshot handle. Retrying in %ds", - vsc.Name, interval/time.Second) - if vsc.Status != nil && - vsc.Status.Error != nil { - log.Warnf("VolumeSnapshotContent %s has error: %v", - vsc.Name, *vsc.Status.Error.Message) - } - return false, nil - } + if err == nil { + return vsc, nil + } + if !wait.Interrupted(err) { + return nil, err + } - return true, nil - }, + // If the first poll timed out, poll with a longer interval and the full timeout. + interval = 5 * time.Second + err = wait.PollUntilContextTimeout( + context.Background(), + interval, + csiSnapshotTimeout, + true, + pollFunc, ) if err != nil {