From 35031dc35a8efc318c40e07fc003336782fa5d2e Mon Sep 17 00:00:00 2001 From: Walter Boring Date: Fri, 5 Jun 2026 10:37:48 -0400 Subject: [PATCH] [cinder-csi-plugin] Wait for volume availability before attach ControllerPublishVolume now waits for the volume to reach 'available' or 'in-use' status before calling the Cinder attachment API. Previously, if the CO called ControllerPublishVolume immediately after CreateVolume, the volume could still be in 'creating' state on the backend. This caused Cinder to reject the attachment with a 409 Conflict ('status must be available or downloading to reserve, but the current status is creating'), forcing the CO to retry blindly. The new behavior uses a context-aware poll (WaitVolumeTargetStatusWithContext) that respects the gRPC request deadline. The volume status is checked every 3 seconds until it reaches a target state, enters an error state, or the context expires. This eliminates unnecessary 409 errors against Cinder and reduces time-to-attach for volumes still being provisioned. Signed-off-by: Walter Boring --- pkg/csi/cinder/controllerserver.go | 17 ++++++++++ pkg/csi/cinder/controllerserver_test.go | 1 + pkg/csi/cinder/openstack/openstack.go | 2 ++ pkg/csi/cinder/openstack/openstack_mock.go | 15 +++++++++ pkg/csi/cinder/openstack/openstack_volumes.go | 31 +++++++++++++++++++ 5 files changed, 66 insertions(+) diff --git a/pkg/csi/cinder/controllerserver.go b/pkg/csi/cinder/controllerserver.go index 1a1b9e2ff6..de9a91549c 100644 --- a/pkg/csi/cinder/controllerserver.go +++ b/pkg/csi/cinder/controllerserver.go @@ -23,6 +23,7 @@ import ( "slices" "sort" "strconv" + "time" "github.com/container-storage-interface/spec/lib/go/csi" "github.com/gophercloud/gophercloud/v2/openstack/blockstorage/v3/backups" @@ -51,6 +52,10 @@ const ( cinderCSIClusterIDKey = "cinder.csi.openstack.org/cluster" affinityKey = "cinder.csi.openstack.org/affinity" antiAffinityKey = "cinder.csi.openstack.org/anti-affinity" + + // volumeReadyPollInterval is the interval at which the volume status is + // polled when waiting for a volume to become available before attaching. + volumeReadyPollInterval = 3 * time.Second ) func (cs *controllerServer) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest) (*csi.CreateVolumeResponse, error) { @@ -321,6 +326,18 @@ func (cs *controllerServer) ControllerPublishVolume(ctx context.Context, req *cs return nil, status.Errorf(codes.Internal, "[ControllerPublishVolume] get volume failed with error %v", err) } + // Wait for the volume to be available before attempting attach. + // Cinder rejects attachment requests for volumes not in "available" (or + // "in-use" for multiattach) state. Without this wait, the driver would + // immediately hit a 409 Conflict if CreateVolume has just been called and + // the volume is still being provisioned on the backend. + targetStatus := []string{openstack.VolumeAvailableStatus, openstack.VolumeInUseStatus} + err = cloud.WaitVolumeTargetStatusWithContext(ctx, volumeID, targetStatus, volumeReadyPollInterval) + if err != nil { + klog.Errorf("Failed waiting for volume %s to become available: %v", volumeID, err) + return nil, status.Errorf(codes.FailedPrecondition, "[ControllerPublishVolume] Volume %s is not available for attach: %v", volumeID, err) + } + _, err = cloud.GetInstanceByID(ctx, instanceID) if err != nil { if cpoerrors.IsNotFound(err) { diff --git a/pkg/csi/cinder/controllerserver_test.go b/pkg/csi/cinder/controllerserver_test.go index df608d1b2c..aa8c673f87 100644 --- a/pkg/csi/cinder/controllerserver_test.go +++ b/pkg/csi/cinder/controllerserver_test.go @@ -529,6 +529,7 @@ func TestDeleteVolume(t *testing.T) { func TestControllerPublishVolume(t *testing.T) { fakeCs, osmock := fakeControllerServer() + osmock.On("WaitVolumeTargetStatusWithContext", FakeVolID, mock.AnythingOfType("[]string"), mock.AnythingOfType("time.Duration")).Return(nil) osmock.On("AttachVolume", FakeNodeID, FakeVolID).Return(FakeVolID, nil) osmock.On("WaitDiskAttached", FakeNodeID, FakeVolID).Return(nil) osmock.On("GetAttachmentDiskPath", FakeNodeID, FakeVolID).Return(FakeDevicePath, nil) diff --git a/pkg/csi/cinder/openstack/openstack.go b/pkg/csi/cinder/openstack/openstack.go index 7402a61fbb..6f0d87f006 100644 --- a/pkg/csi/cinder/openstack/openstack.go +++ b/pkg/csi/cinder/openstack/openstack.go @@ -21,6 +21,7 @@ import ( "fmt" "net/http" "os" + "time" "github.com/gophercloud/gophercloud/v2" "github.com/gophercloud/gophercloud/v2/openstack" @@ -54,6 +55,7 @@ type IOpenStack interface { DetachVolume(ctx context.Context, instanceID, volumeID string) error WaitDiskDetached(ctx context.Context, instanceID string, volumeID string) error WaitVolumeTargetStatus(ctx context.Context, volumeID string, tStatus []string) error + WaitVolumeTargetStatusWithContext(ctx context.Context, volumeID string, tStatus []string, interval time.Duration) error GetAttachmentDiskPath(ctx context.Context, instanceID, volumeID string) (string, error) GetVolume(ctx context.Context, volumeID string) (*volumes.Volume, error) GetVolumesByName(ctx context.Context, name string) ([]volumes.Volume, error) diff --git a/pkg/csi/cinder/openstack/openstack_mock.go b/pkg/csi/cinder/openstack/openstack_mock.go index 8849b1bf53..d88fb960ef 100644 --- a/pkg/csi/cinder/openstack/openstack_mock.go +++ b/pkg/csi/cinder/openstack/openstack_mock.go @@ -19,6 +19,7 @@ package openstack import ( "context" "fmt" + "time" "github.com/gophercloud/gophercloud/v2/openstack/blockstorage/v3/backups" "github.com/gophercloud/gophercloud/v2/openstack/blockstorage/v3/snapshots" @@ -200,6 +201,20 @@ func (_m *OpenStackMock) WaitVolumeTargetStatus(ctx context.Context, volumeID st return r0 } +// WaitVolumeTargetStatusWithContext provides a mock function with given fields: volumeID, tStatus, interval +func (_m *OpenStackMock) WaitVolumeTargetStatusWithContext(ctx context.Context, volumeID string, tStatus []string, interval time.Duration) error { + ret := _m.Called(volumeID, tStatus, interval) + + var r0 error + if rf, ok := ret.Get(0).(func(string, []string, time.Duration) error); ok { + r0 = rf(volumeID, tStatus, interval) + } else { + r0 = ret.Error(0) + } + + return r0 +} + // WaitDiskDetached provides a mock function with given fields: instanceID, volumeID func (_m *OpenStackMock) WaitDiskDetached(ctx context.Context, instanceID string, volumeID string) error { ret := _m.Called(instanceID, volumeID) diff --git a/pkg/csi/cinder/openstack/openstack_volumes.go b/pkg/csi/cinder/openstack/openstack_volumes.go index 41499b77cf..f4f67c8b9b 100644 --- a/pkg/csi/cinder/openstack/openstack_volumes.go +++ b/pkg/csi/cinder/openstack/openstack_volumes.go @@ -298,6 +298,37 @@ func (os *OpenStack) WaitVolumeTargetStatus(ctx context.Context, volumeID string return waitErr } +// WaitVolumeTargetStatusWithContext waits for volume to be in target state, +// respecting the context deadline. This is useful when the caller has a +// context with a deadline (e.g., gRPC request context) and wants the wait +// to be bounded by that deadline rather than a fixed number of steps. +// The interval parameter controls how often the volume status is polled. +func (os *OpenStack) WaitVolumeTargetStatusWithContext(ctx context.Context, volumeID string, tStatus []string, interval time.Duration) error { + err := wait.PollUntilContextCancel(ctx, interval, true, func(ctx context.Context) (bool, error) { + vol, err := os.GetVolume(ctx, volumeID) + if err != nil { + return false, err + } + for _, t := range tStatus { + if vol.Status == t { + return true, nil + } + } + for _, eState := range volumeErrorStates { + if vol.Status == eState { + return false, fmt.Errorf("volume %s is in error state: %s", volumeID, vol.Status) + } + } + return false, nil + }) + + if err != nil && ctx.Err() != nil { + return fmt.Errorf("timeout waiting for volume %s to reach status %v: %w", volumeID, tStatus, err) + } + + return err +} + // DetachVolume detaches given cinder volume from the compute func (os *OpenStack) DetachVolume(ctx context.Context, instanceID, volumeID string) error { volume, err := os.GetVolume(ctx, volumeID)