diff --git a/CLAUDE.md b/CLAUDE.md index 086ce1fe..329841ad 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -45,7 +45,7 @@ Copy the closest existing YAML and fill: - `teardown[]` — cleanup CLI invocations. - `invariants[]` — currently only `no_orphans` is implemented. -Available `await.kind` values: `replica_count`, `disk_state`, `all_uptodate`, `replica_diskless`, `no_tiebreaker`, `sync_clean`, `resource_absent`, `rd_absent`, `vd_size_kib`, `pvc_capacity`, `pod_md5_invariant`. See the header comment of `tests/operator-harness/replay-runner.sh` for the contract. +Available `await.kind` values: `replica_count`, `disk_state`, `all_uptodate`, `replica_diskless`, `no_tiebreaker`, `sync_clean`, `resource_absent`, `rd_absent`, `vd_size_kib`, `vd_count`, `pvc_capacity`, `pod_md5_invariant`. See the header comment of `tests/operator-harness/replay-runner.sh` for the contract. ## Running the harness diff --git a/cmd/apiserver/main.go b/cmd/apiserver/main.go index 75faad58..1ca7246e 100644 --- a/cmd/apiserver/main.go +++ b/cmd/apiserver/main.go @@ -257,7 +257,14 @@ func main() { // CRD-backed store is the only supported persistence layer // post-Phase-11 — the apiserver/controller split made // in-process state pointless across replicas. - st := storek8s.New(mgr.GetClient()) + // + // BUG-048: pass the manager's direct (uncached) API reader so the + // atomic VolumeNumber allocation re-reads live RD state on each + // conflict-retry. With only the informer-cached client, two + // concurrent `vd c` against one RD both retry against a stale cache, + // re-derive the same number, exhaust the retry budget, and silently + // drop the second volume. + st := storek8s.NewWithAPIReader(mgr.GetClient(), mgr.GetAPIReader()) ready := newReadyState() diff --git a/internal/controller/bug_048_vd_drop_minor_patch_test.go b/internal/controller/bug_048_vd_drop_minor_patch_test.go new file mode 100644 index 00000000..6bf19a23 --- /dev/null +++ b/internal/controller/bug_048_vd_drop_minor_patch_test.go @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: Apache-2.0 + +/* +Copyright 2026 Cozystack contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + "testing" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + blockstoriov1alpha1 "github.com/cozystack/blockstor/api/v1alpha1" +) + +// BUG-048 (P1, the silent VD drop). The ResourceReconciler's +// patchRDVolumeMinors writes the RD's per-volume DRBD minors with a JSON +// merge-patch. JSON merge-patch (RFC 7386) REPLACES the +// spec.volumeDefinitions array wholesale, so the write stamps the WHOLE +// list from whatever snapshot the reconciler read at the top of +// ensureRDVolumeMinors. If a concurrent number-less `linstor vd c` +// appended a new volume AFTER that read but BEFORE this patch, the +// wholesale replace silently drops the appended volume — and both `vd c` +// return success because the clobber is an async reconciler write, not +// their own create. +// +// The original code carried a comment claiming "optimistic concurrency" +// but never set metadata.resourceVersion, so the patch could NEVER 409 +// and the appended volume vanished with no error anywhere — the +// operator-visible "vdCount short by one, both vd c rc0, zero +// conflict/retry in the logs" failure observed on the stand. +// +// The fix embeds metadata.resourceVersion in the merge-patch body so the +// apiserver rejects the stale wholesale replace with Conflict. These +// tests pin both halves of the contract: +// +// - a stale-snapshot patch racing an appended volume MUST be rejected +// with Conflict (and therefore NOT drop the appended volume), and +// - a non-racing patch (fresh snapshot) MUST still succeed and stamp +// the minors. + +func bug048Scheme(t *testing.T) *runtime.Scheme { + t.Helper() + + scheme := runtime.NewScheme() + if err := blockstoriov1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("AddToScheme: %v", err) + } + + return scheme +} + +func bug048RDVolNumbers(rd *blockstoriov1alpha1.ResourceDefinition) []int32 { + out := make([]int32, 0, len(rd.Spec.VolumeDefinitions)) + for i := range rd.Spec.VolumeDefinitions { + out = append(out, rd.Spec.VolumeDefinitions[i].VolumeNumber) + } + + return out +} + +// TestBug048MinorPatchRejectsStaleVolumeDefinitionClobber is the core +// regression. We: +// 1. seed an RD with [vol-0], read it into the reconciler's snapshot, +// 2. simulate a concurrent `vd c` appending vol-1 to the LIVE RD +// (bumping its resourceVersion), then +// 3. call patchRDVolumeMinors with the STALE [vol-0] snapshot. +// +// Pre-fix (blind merge-patch, no resourceVersion) the patch silently +// succeeds and the live RD is clobbered back to [vol-0] — vol-1 is +// dropped. Post-fix the patch carries the snapshot's resourceVersion, +// the apiserver returns Conflict, and vol-1 survives. +func TestBug048MinorPatchRejectsStaleVolumeDefinitionClobber(t *testing.T) { + t.Parallel() + + ctx := context.Background() + scheme := bug048Scheme(t) + + const rdName = "drop-rd" + + rd := &blockstoriov1alpha1.ResourceDefinition{ + ObjectMeta: metav1.ObjectMeta{Name: rdName}, + Spec: blockstoriov1alpha1.ResourceDefinitionSpec{ + VolumeDefinitions: []blockstoriov1alpha1.ResourceDefinitionVolume{ + {VolumeNumber: 0, SizeKib: 1024}, + }, + }, + } + + cli := fake.NewClientBuilder().WithScheme(scheme). + WithStatusSubresource(&blockstoriov1alpha1.ResourceDefinition{}). + WithObjects(rd). + Build() + + rec := &ResourceReconciler{Client: cli, Scheme: scheme, APIReader: cli} + + // (1) reconciler snapshot of the RD (carries the stale resourceVersion). + snap := &blockstoriov1alpha1.ResourceDefinition{} + if err := cli.Get(ctx, client.ObjectKey{Name: rdName}, snap); err != nil { + t.Fatalf("get RD snapshot: %v", err) + } + // Reconciler "allocates" a minor for vol-0 on its snapshot. + m0 := int32(1000) + snap.Spec.VolumeDefinitions[0].DRBDMinor = &m0 + + // (2) concurrent `vd c` appends vol-1 to the LIVE RD, bumping its RV. + live := &blockstoriov1alpha1.ResourceDefinition{} + if err := cli.Get(ctx, client.ObjectKey{Name: rdName}, live); err != nil { + t.Fatalf("get live RD: %v", err) + } + + live.Spec.VolumeDefinitions = append(live.Spec.VolumeDefinitions, + blockstoriov1alpha1.ResourceDefinitionVolume{VolumeNumber: 1, SizeKib: 1024}) + if err := cli.Update(ctx, live); err != nil { + t.Fatalf("append vol-1 (simulated vd c): %v", err) + } + + // (3) reconciler patches minors from its STALE [vol-0] snapshot. + err := rec.patchRDVolumeMinors(ctx, snap) + if !apierrors.IsConflict(err) { + t.Fatalf("patchRDVolumeMinors with stale snapshot returned err=%v, want Conflict; "+ + "a non-Conflict means the wholesale volumeDefinitions replace was NOT optimistic-locked "+ + "and would silently drop the concurrently-appended vol-1 (BUG-048)", err) + } + + // The appended volume MUST survive the rejected clobber. + got := &blockstoriov1alpha1.ResourceDefinition{} + if gerr := cli.Get(ctx, client.ObjectKey{Name: rdName}, got); gerr != nil { + t.Fatalf("get RD after rejected patch: %v", gerr) + } + + nums := bug048RDVolNumbers(got) + if len(nums) != 2 || nums[0] != 0 || nums[1] != 1 { + t.Fatalf("RD volumeDefinitions after rejected clobber = %v, want [0 1] "+ + "(vol-1 was silently dropped — the BUG-048 lost update)", nums) + } +} + +// TestBug048MinorPatchSucceedsOnFreshSnapshot proves the optimistic-lock +// does not break the happy path: a patch built from a CURRENT snapshot +// (no racing writer) must still commit and stamp the minors. +func TestBug048MinorPatchSucceedsOnFreshSnapshot(t *testing.T) { + t.Parallel() + + ctx := context.Background() + scheme := bug048Scheme(t) + + const rdName = "fresh-rd" + + rd := &blockstoriov1alpha1.ResourceDefinition{ + ObjectMeta: metav1.ObjectMeta{Name: rdName}, + Spec: blockstoriov1alpha1.ResourceDefinitionSpec{ + VolumeDefinitions: []blockstoriov1alpha1.ResourceDefinitionVolume{ + {VolumeNumber: 0, SizeKib: 1024}, + {VolumeNumber: 1, SizeKib: 1024}, + }, + }, + } + + cli := fake.NewClientBuilder().WithScheme(scheme). + WithStatusSubresource(&blockstoriov1alpha1.ResourceDefinition{}). + WithObjects(rd). + Build() + + rec := &ResourceReconciler{Client: cli, Scheme: scheme, APIReader: cli} + + snap := &blockstoriov1alpha1.ResourceDefinition{} + if err := cli.Get(ctx, client.ObjectKey{Name: rdName}, snap); err != nil { + t.Fatalf("get RD snapshot: %v", err) + } + + m0, m1 := int32(1000), int32(1001) + snap.Spec.VolumeDefinitions[0].DRBDMinor = &m0 + snap.Spec.VolumeDefinitions[1].DRBDMinor = &m1 + + if err := rec.patchRDVolumeMinors(ctx, snap); err != nil { + t.Fatalf("patchRDVolumeMinors on fresh snapshot: %v", err) + } + + got := &blockstoriov1alpha1.ResourceDefinition{} + if err := cli.Get(ctx, client.ObjectKey{Name: rdName}, got); err != nil { + t.Fatalf("get RD after patch: %v", err) + } + + if nums := bug048RDVolNumbers(got); len(nums) != 2 || nums[0] != 0 || nums[1] != 1 { + t.Fatalf("RD volumeDefinitions after fresh patch = %v, want [0 1]", nums) + } + + for i := range got.Spec.VolumeDefinitions { + if got.Spec.VolumeDefinitions[i].DRBDMinor == nil { + t.Fatalf("vol-%d minor not stamped after fresh patch", + got.Spec.VolumeDefinitions[i].VolumeNumber) + } + } +} diff --git a/internal/controller/resource_controller.go b/internal/controller/resource_controller.go index d68912f2..c3ce3cb9 100644 --- a/internal/controller/resource_controller.go +++ b/internal/controller/resource_controller.go @@ -1624,6 +1624,24 @@ func backfillRDMinorsFromStatus(rd *blockstoriov1alpha1.ResourceDefinition) bool // satellite/REST owners of the other VolumeDefinition fields read them // off Spec elsewhere, and we re-send the size/props verbatim to avoid // dropping them. +// +// BUG-048 (P1, the silent VD drop): because a JSON merge-patch REPLACES +// the volumeDefinitions array (RFC 7386 has no array-merge), this write +// stamps the WHOLE list from the in-memory snapshot the reconciler read +// at the top of ensureRDVolumeMinors. If a concurrent number-less +// `linstor vd c` appended a new volume AFTER that read but before this +// patch, the wholesale replace silently drops it — and both `vd c` +// return success because the clobber is an async reconciler write, not +// their own. The original code claimed "optimistic concurrency" in a +// comment but never actually set metadata.resourceVersion, so the patch +// could NEVER 409 and the IsConflict re-read branch in the caller was +// dead. We now embed metadata.resourceVersion in the merge-patch body +// (exactly what client.MergeFromWithOptimisticLock does for typed +// patches): the apiserver rejects the patch with Conflict when a racing +// VD append moved the RD on, the caller re-reads, and the next reconcile +// re-allocates minors against the now-complete list — the appended +// volume survives. A missing resourceVersion (RD freshly built in a unit +// path) degrades to the prior blind patch rather than failing the write. func (r *ResourceReconciler) patchRDVolumeMinors(ctx context.Context, rd *blockstoriov1alpha1.ResourceDefinition) error { vols := make([]map[string]any, 0, len(rd.Spec.VolumeDefinitions)) for i := range rd.Spec.VolumeDefinitions { @@ -1645,15 +1663,26 @@ func (r *ResourceReconciler) patchRDVolumeMinors(ctx context.Context, rd *blocks vols = append(vols, entry) } + meta := map[string]any{} + if rv := rd.ResourceVersion; rv != "" { + // Optimistic-lock precondition: the apiserver enforces + // metadata.resourceVersion on any patch body, so a racing VD + // append (which bumped the RD past this snapshot) makes the + // wholesale volumeDefinitions replace 409 instead of clobbering + // the appended volume. + meta["resourceVersion"] = rv + } + body := map[string]any{patchKeySpec: map[string]any{"volumeDefinitions": vols}} + if len(meta) > 0 { + body["metadata"] = meta + } patchBytes, err := json.Marshal(body) if err != nil { return err } - // Optimistic concurrency: include resourceVersion so a racing - // minor write is rejected with Conflict and we re-read. patchTarget := &blockstoriov1alpha1.ResourceDefinition{ TypeMeta: metav1.TypeMeta{ Kind: resourceDefinitionKindName, diff --git a/pkg/drbd/drbdadm.go b/pkg/drbd/drbdadm.go index bdcfab03..c8462b3d 100644 --- a/pkg/drbd/drbdadm.go +++ b/pkg/drbd/drbdadm.go @@ -92,6 +92,25 @@ func (a *Adm) Adjust(ctx context.Context, resource string) error { return a.run(ctx, "adjust", resource) } +// InvalidateVolume runs `drbdadm invalidate /` — marks the +// LOCAL copy of ONE volume as out-of-sync and forces a full SyncTarget of +// that volume from an UpToDate peer. The per-volume target (`/`) +// confines the discard to the named volume so sibling volumes' data is +// untouched. Requires an UpToDate peer as the resync source (drbdadm +// refuses otherwise); callers gate on LateAddResyncKickVolumes, which only +// returns a volume when a connected peer is UpToDate for it. +// +// Used by the BUG-048 late-add resync-kick self-heal to recover a stalled +// late-add resync: the fresh, empty, non-authoritative local copy is +// thrown away and re-pulled from the UpToDate peer, working WITH the +// bitmap (unlike a connection re-handshake, which the shared day0 GI makes +// conclude "no resync needed"). +func (a *Adm) InvalidateVolume(ctx context.Context, resource string, volume int32) error { + target := fmt.Sprintf("%s/%d", resource, volume) + + return a.run(ctx, "invalidate", target) +} + // AdjustSkipDisk is the Failed-replica variant of Adjust that // appends drbd-utils' `--skip-disk` flag. Used after the observer // detected `disk:Failed` and stamped `DrbdOptions/SkipDisk=True` @@ -485,6 +504,98 @@ func (a *Adm) NewCurrentUUID(ctx context.Context, resource string) error { return a.run(ctx, "new-current-uuid", resource) } +// MintLateAddSource makes THIS node an UpToDate source for every LOCAL +// Inconsistent volume of `resource`, so the Inconsistent peers can +// SyncTarget from it (BUG-048, the all-Inconsistent late-add wedge where +// NO replica ever won the volume's UpToDate seed). +// +// Why not `primary --force`: the existing late-add-promote ran +// `drbdadm primary --force `, but the kernel REJECTS it with +// "(-16) Disk state is lower than outdated" whenever ANY volume of the +// resource is Inconsistent (stand-observed: the promote retried every +// throttle window and never took, the volume stayed Inconsistent on all +// replicas forever). `primary` is also resource-wide, so a single +// Inconsistent late volume blocks promoting the whole resource even though +// the sibling volumes are UpToDate. +// +// Instead, mint the source per-volume the way DRBD's own skip-initial-sync +// does: `drbdsetup new-current-uuid --clear-bitmap ` marks the +// volume UpToDate with a clean bitmap. That call requires the volume's +// peers to be StandAlone, so the sequence is: +// +// 1. `drbdadm disconnect ` — quiesce every peer (resource-wide; DRBD +// carries all volumes over one connection per peer). UpToDate sibling +// volumes (vol-0/vol-1) are unaffected: on reconnect they re-handshake +// at their shared UUID and stay Established (no resync). +// 2. for each LOCAL Inconsistent volume: `new-current-uuid --clear-bitmap +// ` → that volume flips UpToDate with a clean bitmap. +// 3. `drbdadm adjust ` — reconnect every peer. The peers, still +// Inconsistent, now see an UpToDate source and SyncTarget from it. +// +// Data-safety: the late-added volume is genuinely fresh/empty on every +// replica (day0 lineage; this gate fires only when NO peer holds committed +// data — the caller's NeedsLateAddPromote enforces that), so minting an +// empty UpToDate source loses nothing — the Inconsistent peers were equally +// empty and simply adopt this clean copy. Gated by NeedsLateAddPromote +// (lowest-id node, no peer data, no Primary), so exactly one deterministic +// node mints the source and no split-brain race is possible. +// +// Returns the cleared minors (for logging) and the first error. A disconnect +// failure is best-effort-ignored (a peer already StandAlone is fine); a +// clear-bitmap or adjust failure surfaces. +func (a *Adm) MintLateAddSource(ctx context.Context, resource string) ([]int32, error) { + out, err := a.exec.Run(ctx, "drbdsetup", "status", resource, "--json") + if err != nil { + return nil, errors.Wrapf(err, "drbdsetup status %s", resource) + } + + var status drbdsetupStatusRoot + + err = json.Unmarshal(out, &status) + if err != nil || len(status) == 0 { + return nil, errors.Wrapf(err, "parse drbdsetup status %s", resource) + } + + var minors []int32 + + for _, dev := range status[0].Devices { + if DiskState(dev.DiskState) == DiskStateInconsistent { + minors = append(minors, dev.Minor) + } + } + + if len(minors) == 0 { + return nil, nil + } + + // Best-effort disconnect of all peers — new-current-uuid --clear-bitmap + // requires StandAlone; a peer already StandAlone returns non-zero, which + // we ignore. + _, _ = a.exec.Run(ctx, "drbdadm", "disconnect", resource) + + for _, minor := range minors { + _, mintErr := a.exec.Run(ctx, "drbdsetup", "new-current-uuid", + "--clear-bitmap", strconv.Itoa(int(minor))) + if mintErr != nil { + // Reconnect before bailing so a half-applied mint does not + // leave the resource StandAlone. + _ = a.run(ctx, "adjust", resource) + + return minors, errors.Wrapf(mintErr, + "new-current-uuid --clear-bitmap %d", minor) + } + } + + // Reconnect every peer; the Inconsistent peers now SyncTarget from the + // freshly-UpToDate local volumes. + err = a.run(ctx, "adjust", resource) + if err != nil { + return minors, errors.Wrapf(err, "adjust %s after mint", resource) + } + + return minors, nil +} + // SuspendIO runs `drbdadm suspend-io ` — freezes the // resource's block-I/O path on the local satellite so a backing // snapshot (LVM-thin / ZFS / file) captures bytes at a stable @@ -1374,6 +1485,425 @@ func localIsUpToDate(devices []drbdsetupStatusDevice) bool { return true } +// NeedsLateAddPromote probes the live kernel via `drbdsetup status +// --json` and reports whether THIS node must `primary --force` +// to unstick a LATE-ADDED volume that wedged Inconsistent on every +// diskful replica with no SyncSource (BUG-048, the concurrent two-VD +// add on a ≥3-diskful RD). +// +// Why a separate gate from NeedsRecoveryPromote / NeedsSoloPromote: +// - NeedsRecoveryPromote requires the LOCAL to be already UpToDate and +// a PEER Inconsistent — but in this wedge the local volume is ALSO +// Inconsistent (no replica ever won the volume's UpToDate seed), so +// that predicate can never fire. It is also gated by the dispatcher's +// `auto-primary`, which is suppressed on an INITIALIZED RD (every +// late-add lands on one) so the satellite-side maybeRecoveryPromote +// short-circuits before even probing. +// - NeedsSoloPromote requires ZERO peers; here peers exist. +// +// SPLIT-BRAIN SAFETY (the two gates that make this distinct from a day0 +// bootstrap — without them the predicate misfired on a fresh RD's vol-0 +// and two non-lowest nodes simultaneously force-primaried into +// split-brain): +// +// 1. A LATE-add wedge means the RD is PAST first activation, so at +// least one local volume is already UpToDate (the earlier +// volumes). A pure day0 bootstrap has NO UpToDate volume yet — +// EVERY volume is transiently Inconsistent while the normal winner +// election + auto-promote run. Requiring a local UpToDate sibling +// means this predicate can NEVER fire during day0. +// 2. EVERY peer connection must be fully Connected with the wedged +// volume's peer-disk OBSERVED. The lowest-node-id election is only +// sound with COMPLETE peer information; at day0 t+1s peers are still +// StandAlone / Connecting, so each node would see a partial peer set +// and several could each conclude "I am lowest" → simultaneous +// force-primary. Deferring until every peer is connected guarantees +// every node computes the election over the same full set, so +// EXACTLY ONE (the true lowest id) promotes. +// +// Beyond those, returns true ONLY when ALL hold, so the force-primary is +// data-safe + self-limiting: +// - the kernel slot exists and reports a my-node-id; +// - the local role is NOT Primary and NO peer is Primary; +// - at least one LOCAL diskful volume is Inconsistent, and for EVERY +// such volume NO connected peer exposes committed data +// (peer-disk UpToDate / Consistent / Outdated) — the defining wedge; +// a peer with data means SyncTarget instead (Bug 342 guard); +// - none of those wedged volumes is already being actively resynced; +// - this node's my-node-id is the LOWEST among the wedged volume's +// diskful replicas. +// +// Data-safety: a fresh late-added volume's metadata was seeded at the +// deterministic day0 current-UUID on every replica (the seed path runs +// before bring-up), so `primary --force` here mints no UNRELATED UUID; +// the Inconsistent peers simply SyncTarget from this now-Primary source. +// Self-limiting: once the peers reach UpToDate the predicate stops +// holding. Conservative on any probe/parse failure → false. +func (a *Adm) NeedsLateAddPromote(ctx context.Context, resource string) bool { + out, err := a.exec.Run(ctx, "drbdsetup", "status", resource, "--json") + if err != nil { + return false + } + + var status drbdsetupStatusRoot + + err = json.Unmarshal(out, &status) + if err != nil || len(status) == 0 || status[0].NodeID == nil { + return false + } + + res := status[0] + + // Never disturb an existing Primary anywhere in the RD. + if Role(res.Role).IsPrimary() { + return false + } + + for _, conn := range res.Connections { + if Role(conn.PeerRole).IsPrimary() { + return false + } + } + + // Gate 1 (split-brain safety): the RD must be PAST day0 — at least one + // local volume already UpToDate. A pure day0 bootstrap has none, so + // the predicate cannot misfire there. + if !localHasUpToDateVolume(res.Devices) { + return false + } + + // Gate 2 (split-brain safety): every peer must be fully connected so + // the lowest-node-id election runs over COMPLETE peer information. + if !allPeersConnected(res.Connections) { + return false + } + + // Which local diskful volumes are wedged Inconsistent? + wedged := localInconsistentVolumes(res.Devices) + if len(wedged) == 0 { + return false + } + + // For every wedged volume: no peer may hold data (else SyncTarget), + // none may be actively resyncing (else let it finish), and we must be + // the lowest node-id among its non-diskless diskful replicas. + for vol := range wedged { + ok := lateAddVolumeNeedsLocalPromote(res.Connections, vol, *res.NodeID) + if !ok { + return false + } + } + + return true +} + +// localHasUpToDateVolume reports whether at least one local device is +// UpToDate — the proof that the RD is past first activation (a day0 +// bootstrap has every volume still Inconsistent). Gate 1 of the +// split-brain-safe NeedsLateAddPromote. +func localHasUpToDateVolume(devices []drbdsetupStatusDevice) bool { + for _, d := range devices { + if DiskState(d.DiskState) == DiskStateUpToDate { + return true + } + } + + return false +} + +// allPeersConnected reports whether EVERY peer connection is in the +// Connected connection-state. Gate 2 of NeedsLateAddPromote: the +// lowest-node-id promoter election is only sound with complete peer +// information, so any StandAlone / Connecting / unconnected peer (the +// day0 t+1s state) defers the self-heal. A resource with zero peer +// connections is NOT covered here (that is NeedsSoloPromote's domain) — +// returns false so the late-add gate never acts peerless. +func allPeersConnected(conns []drbdsetupStatusConnection) bool { + if len(conns) == 0 { + return false + } + + for _, conn := range conns { + if conn.ConnectionStr != "Connected" { + return false + } + } + + return true +} + +// localInconsistentVolumes returns the set of local volume numbers whose +// disk-state is Inconsistent. A volume in any other state (UpToDate, +// Diskless, Negotiating, …) is excluded — only a stuck-Inconsistent +// local volume is a late-add-promote candidate. +func localInconsistentVolumes(devices []drbdsetupStatusDevice) map[int32]struct{} { + out := map[int32]struct{}{} + + for _, d := range devices { + if DiskState(d.DiskState) == DiskStateInconsistent { + out[d.VolumeNumber] = struct{}{} + } + } + + return out +} + +// lateAddVolumeNeedsLocalPromote reports whether `vol` is a genuine +// late-add wedge that THIS node (myID) must force-primary: no connected +// peer exposes committed data for it, no peer is actively resyncing it, +// and myID is the lowest node-id among the volume's non-diskless diskful +// replicas. Conservative: any peer with data, any active resync, or any +// lower-id non-diskless peer returns false. +func lateAddVolumeNeedsLocalPromote(conns []drbdsetupStatusConnection, vol, myID int32) bool { + weAreLowest := true + + for _, conn := range conns { + for _, peerDev := range conn.PeerDevices { + if peerDev.VolumeNumber != vol { + continue + } + + switch DiskState(peerDev.PeerDiskState) { + case DiskStateUpToDate, DiskStateConsistent, DiskStateOutdated: + // A peer holds real data — must SyncTarget from it, never + // force-primary (Bug 342 unrelated-data guard). + return false + case DiskStateInconsistent: + // A fellow wedged diskful replica. It competes for the + // lowest-id promoter election; if it outranks us, defer. + if conn.PeerNodeID < myID { + weAreLowest = false + } + + if peerDeviceActivelySyncing(peerDev) { + // A live resync is already driving this volume — let + // it finish rather than churn a promote. + return false + } + case DiskStateAttaching, DiskStateNegotiating, DiskStateDUnknown: + // BUG-048 (≥3-replica double-promoter wedge): a freshly + // late-added volume on a LOWER-id diskful peer passes + // through Attaching/Negotiating/DUnknown while its kernel + // slot brings the new volume up — the peer has NOT yet + // settled to Inconsistent. The old code treated these + // transient states as "not a competing diskful promoter; + // ignore", so a HIGHER-id node observing a lower-id peer + // mid-bring-up wrongly concluded "I am lowest" and force- + // primaried. Both the higher-id node AND the true-lowest + // node (once it finished bring-up) then promoted, minting + // divergent current-UUIDs → the volume wedged PausedSyncS / + // StandAlone split-brain with no convergence (stand-observed + // on 3-diskful: node-id 1 and node-id 0 both force-primaried + // the same late volume). A lower-id peer in a transient + // bring-up state is a real diskful replica that WILL win the + // election, so defer to it. DUnknown is also the + // connection-not-fully-negotiated state of a diskful peer — + // deferring is the conservative, split-brain-safe choice + // (the next reconcile re-evaluates once the peer settles). + if conn.PeerNodeID < myID { + weAreLowest = false + } + case DiskStateDiskless, DiskStateDetaching, DiskStateFailed: + // Diskless witness (steady-state of a tiebreaker — never a + // diskful promoter, deferring to it would deadlock the + // promote) or a failed/detaching local-disk peer that holds + // no data. Not a competing diskful promoter; ignore. + default: + // Unknown/empty — ignore. + } + } + } + + return weAreLowest +} + +// LateAddResyncKickVolumes probes the live kernel via `drbdsetup status +// --json` and returns the LOCAL volume numbers whose late-add +// resync is STALLED and recoverable by a local `drbdadm invalidate +// /` — the ≥3-replica concurrent late-add convergence wedge +// (BUG-048). Empty slice ⇒ nothing to do. +// +// Why `invalidate` and not disconnect/reconnect: every replica of a +// freshly late-added volume shares the deterministic day0 current-UUID, +// so a connection re-handshake (disconnect+adjust/connect) reads "same +// generation ⇒ no resync needed" and ABANDONS the dirty bitmap — leaving +// the volume Established/Inconsistent with out-of-sync frozen, a WORSE +// wedge. `drbdadm invalidate /` instead explicitly discards the +// (empty, non-authoritative) LOCAL copy of that one volume and forces a +// full SyncTarget FROM an UpToDate peer — it works WITH the bitmap rather +// than re-deriving sync direction from the ambiguous day0 GI. +// +// The two stand-observed wedge signatures both reduce to "a local volume +// is below UpToDate, an UpToDate peer for it EXISTS, yet no live resync is +// pulling it up": +// +// - "partial stall": one peer reached UpToDate (a real source) but THIS +// replica is stuck WFBitMapT / PausedSyncT / Established with +// out-of-sync frozen — the source fed another peer but not us. +// - "dependency deadlock" (post-promote): once maybeLateAddPromote +// force-primaries the lowest-id node to UpToDate, the remaining +// Inconsistent replicas have an UpToDate peer but their resync stays +// paused (resync-suspended:dependency/peer) — invalidate restarts it. +// +// A volume qualifies ONLY when ALL hold, so the invalidate is data-safe +// and self-limiting: +// - the kernel slot exists and reports a my-node-id; +// - NO replica anywhere (local nor any peer-role) is Primary — never +// invalidate a volume an application may be writing (would discard +// live data); +// - at least one local volume is already UpToDate — the RD is PAST +// first activation (a pure day0 bootstrap is excluded, same as +// NeedsLateAddPromote gate 1), so this can never fire mid-bring-up; +// - every peer connection is fully Connected (resync state is only +// authoritative over a settled connection); +// - the LOCAL volume is below UpToDate (Inconsistent/Outdated) — only a +// non-authoritative local copy is ever discarded, never an UpToDate +// one; +// - a connected PEER is UpToDate for that SAME volume — the authoritative +// source `invalidate` will SyncTarget from (without one, invalidate +// has nothing to pull and would fail; maybeLateAddPromote owns minting +// the source for the all-Inconsistent case); +// - the volume is NOT already being actively SyncTarget-pulled +// (replication-state SyncTarget with resync-suspended "no") — a live +// resync finishes on its own and must not be churned. +// +// Conservative on any probe/parse failure → empty. Self-limiting: once a +// volume reaches UpToDate it no longer qualifies, so the predicate stops +// returning it. +func (a *Adm) LateAddResyncKickVolumes(ctx context.Context, resource string) []int32 { + out, err := a.exec.Run(ctx, "drbdsetup", "status", resource, "--json") + if err != nil { + return nil + } + + var status drbdsetupStatusRoot + + err = json.Unmarshal(out, &status) + if err != nil || len(status) == 0 || status[0].NodeID == nil { + return nil + } + + res := status[0] + + // Never invalidate a volume an application may be writing. + if Role(res.Role).IsPrimary() { + return nil + } + + for _, conn := range res.Connections { + if Role(conn.PeerRole).IsPrimary() { + return nil + } + } + + // Gate: the RD must be PAST day0 (at least one local UpToDate volume), + // so this can never misfire during a fresh RD's first-activation + // bring-up where every volume is transiently Inconsistent. + if !localHasUpToDateVolume(res.Devices) { + return nil + } + + // Gate: every peer fully Connected — the resync-state read is only + // authoritative over a settled connection. + if !allPeersConnected(res.Connections) { + return nil + } + + var kick []int32 + + for _, dev := range res.Devices { + if lateAddVolumeNeedsInvalidate(res.Connections, dev) { + kick = append(kick, dev.VolumeNumber) + } + } + + return kick +} + +// lateAddVolumeNeedsInvalidate reports whether THIS node's local `dev` is +// a stalled late-add volume that a local `invalidate` would recover: the +// local copy is below UpToDate (Inconsistent/Outdated), a connected peer +// is UpToDate for the volume (the SyncTarget source), and the volume is +// not already being actively pulled (SyncTarget/resync-suspended:no). +func lateAddVolumeNeedsInvalidate(conns []drbdsetupStatusConnection, dev drbdsetupStatusDevice) bool { + // Only a genuinely-stuck Inconsistent local copy is invalidated. + // Outdated is DELIBERATELY excluded: after an invalidate the volume + // transits Inconsistent → (resync) → Outdated → UpToDate, and a fresh + // SyncSource peer is briefly Outdated mid-handshake. Re-invalidating an + // Outdated copy re-dirties a volume that is actually converging, which + // FLAPS the resync (stand-observed: oos drops to 0, invalidate re-fires, + // oos jumps back to full). Leaving Outdated alone lets the in-flight + // resync finish. + switch DiskState(dev.DiskState) { + case DiskStateInconsistent: + // Genuinely below-UpToDate and not a transient post-resync state — + // the invalidate candidate. + case DiskStateUpToDate, DiskStateConsistent, DiskStateOutdated, + DiskStateDiskless, DiskStateAttaching, DiskStateDetaching, + DiskStateFailed, DiskStateNegotiating, DiskStateDUnknown: + return false + default: + return false + } + + var ( + peerUpToDate bool + activeResyncIP bool + ) + + for _, conn := range conns { + for _, peerDev := range conn.PeerDevices { + if peerDev.VolumeNumber != dev.VolumeNumber { + continue + } + + if DiskState(peerDev.PeerDiskState) == DiskStateUpToDate { + peerUpToDate = true + } + + // Back off if ANY peer-device on this volume has a resync + // actively in progress (either direction) — an invalidate now + // would abort/re-dirty a converging resync and flap it. + if peerDeviceResyncInProgress(peerDev) { + activeResyncIP = true + } + } + } + + return peerUpToDate && !activeResyncIP +} + +// peerDeviceResyncInProgress reports whether this peer-device has a resync +// actively running or starting in EITHER direction — i.e. NOT stalled. A +// volume with any such peer-device is converging on its own and must not be +// invalidated (doing so re-dirties it and flaps the resync). True for +// SyncSource / SyncTarget / StartingSync* and for a WFBitMap* handshake +// that is NOT suspended (resync-suspended "no"/empty — the bitmap exchange +// is proceeding). A WFBitMap* / PausedSync* with a non-"no" resync-suspended +// is the STALLED wedge — that is NOT "in progress", so the kick may fire. +func peerDeviceResyncInProgress(peerDev drbdsetupStatusPeerDevice) bool { + switch ReplicationState(peerDev.ReplicationState) { + case ReplicationStateSyncSource, ReplicationStateSyncTarget, + ReplicationStateStartingSyncS, ReplicationStateStartingSyncT: + // A resync is actively transferring data — always in progress. + return true + case ReplicationStateWFBitMapS, ReplicationStateWFBitMapT, + ReplicationStateWFSyncUUID: + // A handshake step: in progress ONLY if not suspended. A non-"no" + // resync-suspended (dependency/peer/user) is the stalled wedge. + return peerDev.ResyncSuspended == "" || peerDev.ResyncSuspended == wireNo + case ReplicationStateOff, ReplicationStateEstablished, + ReplicationStatePausedSyncS, ReplicationStatePausedSyncT, + ReplicationStateVerifyS, ReplicationStateVerifyT, + ReplicationStateAhead, ReplicationStateBehind: + // Established (done), explicitly paused, or non-resync states — not + // a resync in progress. + return false + } + + return false +} + // DownVeto is the tri-state outcome of the Bug 350 kernel-truth probe // the satellite consults before committing a `drbdadm down` on the // INACTIVE path. It separates "definitely safe to down" from "must diff --git a/pkg/drbd/drbdsetup_show.go b/pkg/drbd/drbdsetup_show.go index b9743f17..a4b702f8 100644 --- a/pkg/drbd/drbdsetup_show.go +++ b/pkg/drbd/drbdsetup_show.go @@ -152,6 +152,11 @@ type drbdsetupStatusDevice struct { // DiskState is this node's local disk state for the volume // (`disk-state`): UpToDate / Inconsistent / Diskless / … DiskState string `json:"disk-state"` + // Minor is the kernel minor number backing this volume (`/dev/drbdN`). + // Used by the BUG-048 late-add source-mint (MintLateAddSource) to + // target `drbdsetup new-current-uuid --clear-bitmap ` at the + // specific stuck volume. + Minor int32 `json:"minor"` } // drbdsetupStatusConnection is one peer connection slot, as emitted by diff --git a/pkg/drbd/needs_late_add_promote_test.go b/pkg/drbd/needs_late_add_promote_test.go new file mode 100644 index 00000000..68dd9c2d --- /dev/null +++ b/pkg/drbd/needs_late_add_promote_test.go @@ -0,0 +1,412 @@ +// SPDX-License-Identifier: Apache-2.0 + +/* +Copyright 2026 Cozystack contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package drbd_test + +import ( + "slices" + "testing" + + "github.com/cozystack/blockstor/pkg/drbd" + "github.com/cozystack/blockstor/pkg/storage" +) + +// Regression pins for the BUG-048 late-add-promote predicate +// (NeedsLateAddPromote). It exists to unstick a LATE-ADDED volume that +// wedged Inconsistent on EVERY diskful replica with no SyncSource (the +// concurrent two-VD add on a ≥3-diskful RD): the deterministic +// lowest-node-id replica force-primaries to become the SyncSource. It +// must fire ONLY in that genuine wedge and never when a peer holds data +// (SyncTarget instead), a resync is already running, a Primary exists, +// or this node is not the lowest-id promoter. + +const lateAddPromoteKey = "drbdsetup status pvc-late --json" + +func admWithLateAddStatus(t *testing.T, json string) *drbd.Adm { + t.Helper() + + fx := storage.NewFakeExec() + fx.Responses[lateAddPromoteKey] = storage.FakeResponse{Stdout: []byte(json)} + + return drbd.NewAdm(fx) +} + +// The genuine BUG-048 wedge: vol-0/vol-1 UpToDate, the late vol-2 is +// Inconsistent locally AND on every diskful peer, no peer holds data for +// vol-2, no Primary, and we (node-id 0) are the lowest → promote. +func TestNeedsLateAddPromote_WedgedLateVolume(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":1,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":1,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"Established","resync-suspended":"no"} + ] + }] + }]`) + + if !adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatal("late vol-2 Inconsistent on every replica with no SyncSource must trigger late-add-promote") + } +} + +// Not the lowest node-id: a peer with a LOWER node-id is also wedged on +// vol-2, so it is the elected promoter — we must defer (no split-brain). +// (vol-0 UpToDate everywhere → past day0, peer Connected → gates 1+2 OK.) +func TestNeedsLateAddPromote_DefersToLowerNodeID(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":2,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":0,"name":"n1","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"Established","resync-suspended":"no"} + ] + }] + }]`) + + if adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatal("a lower-node-id wedged peer is the promoter; this node (id 2) must defer") + } +} + +// A peer already holds committed data for vol-2 (UpToDate) — the correct +// action is to SyncTarget from it, NEVER force-primary (Bug 342 +// unrelated-data guard). Must not fire. +func TestNeedsLateAddPromote_PeerHasDataVeto(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"SyncTarget","resync-suspended":"no"} + ] + }] + }]`) + + if adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatal("a peer holding data for the volume must veto force-primary (SyncTarget instead)") + } +} + +// A live resync is already driving vol-2 toward UpToDate from this node — +// let it finish rather than churn a promote. +func TestNeedsLateAddPromote_ActiveResyncDefers(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"SyncSource","resync-suspended":"no"} + ] + }] + }]`) + + if adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatal("a volume already being actively resynced must NOT trigger a promote") + } +} + +// A Primary exists somewhere — it already drives the sync; never disturb. +func TestNeedsLateAddPromote_PeerPrimaryVeto(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Primary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"Established","resync-suspended":"no"} + ] + }] + }]`) + + if adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatal("a Primary peer must veto the late-add-promote") + } +} + +// SPLIT-BRAIN SAFETY gate 1: a pure day0 bootstrap — EVERY volume +// transiently Inconsistent, NO UpToDate sibling — must NOT fire, even +// though "all replicas Inconsistent, no SyncSource" superficially looks +// like the wedge. The normal day0 winner-election + auto-promote own +// this; misfiring here is exactly what split-brained vol-0 on the stand. +func TestNeedsLateAddPromote_Day0BootstrapNoFire(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":0,"role":"Secondary", + "devices":[{"volume":0,"disk-state":"Inconsistent"}], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[{"volume":0,"peer-disk-state":"Inconsistent", + "replication-state":"Established","resync-suspended":"no"}] + }] + }]`) + + if adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatal("day0 bootstrap (no UpToDate sibling) must NOT trigger late-add-promote — normal winner election owns it") + } +} + +// SPLIT-BRAIN SAFETY gate 2: a peer that is NOT fully Connected (the +// day0 t+1s StandAlone/Connecting state) means the lowest-node-id +// election would run over a partial peer set. Defer until every peer is +// connected, so EXACTLY ONE node promotes — never two simultaneously. +func TestNeedsLateAddPromote_PartialConnectivityNoFire(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":1,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[ + {"peer-node-id":0,"name":"n1","connection-state":"StandAlone","peer-role":"Unknown", + "peer_devices":[{"volume":2,"peer-disk-state":"DUnknown","replication-state":"Off","resync-suspended":"no"}]}, + {"peer-node-id":2,"name":"n3","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[{"volume":2,"peer-disk-state":"Inconsistent","replication-state":"Established","resync-suspended":"no"}]} + ] + }]`) + + if adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatal("a not-fully-connected peer set must defer the promote (incomplete election → split-brain risk)") + } +} + +// No Inconsistent local volume — nothing to promote. Must not fire (this +// is the healthy steady state every converged reconcile passes through). +func TestNeedsLateAddPromote_AllUpToDateNoOp(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":1,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"UpToDate"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"} + ] + }] + }]`) + + if adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatal("a fully-converged RD must NOT trigger any promote") + } +} + +// BUG-048 (≥3-replica double-promoter wedge). A LOWER-node-id diskful peer +// whose freshly late-added volume is still in a TRANSIENT bring-up state +// (DUnknown — connection negotiated, peer disk not yet reported) is the +// rightful promoter. The old election only counted a lower-id peer as a +// competitor when it observed it as Inconsistent, so this node (id 2) saw +// the lower-id peer mid-bring-up, concluded "I am lowest", and force- +// primaried — and once the lower-id peer settled it ALSO promoted → +// divergent current-UUIDs → PausedSyncS / StandAlone wedge (stand-observed +// on 3-diskful: node-id 1 and node-id 0 both force-primaried the same late +// volume). This node MUST defer to the lower-id peer regardless of its +// transient per-volume state. +func TestNeedsLateAddPromote_DefersToLowerNodeIDStillBringingUp(t *testing.T) { + for _, tc := range []struct { + name string + peerState string + }{ + {"DUnknown", "DUnknown"}, + {"Negotiating", "Negotiating"}, + {"Attaching", "Attaching"}, + } { + t.Run(tc.name, func(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":2,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":0,"name":"n1","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"`+tc.peerState+`","replication-state":"Established","resync-suspended":"no"} + ] + }] + }]`) + + if adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatalf("a lower-node-id diskful peer still bringing up the volume (%s) is the elected promoter; "+ + "this node (id 2) must defer or both promote and wedge the volume (BUG-048)", tc.peerState) + } + }) + } +} + +// BUG-048 companion: deferring to a transient lower-id peer must NOT +// deadlock when the lower-id peer is a PERMANENT diskless witness +// (tiebreaker). A diskless peer never becomes a diskful promoter, so a +// lower-id Diskless peer must be IGNORED (not deferred to) — this node +// (id 2), the lowest DISKFUL replica, still promotes. (The witness shows +// steady-state "Diskless"; a diskful peer mid-bring-up shows DUnknown / +// Negotiating / Attaching, handled by the test above.) +func TestNeedsLateAddPromote_PromotesOverLowerIDDisklessWitness(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":2,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":0,"name":"tiebreaker","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"Diskless","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"Diskless","replication-state":"Established","resync-suspended":"no"} + ] + }] + }]`) + + if !adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatal("a lower-id DISKLESS witness never promotes; this node (the lowest DISKFUL replica) " + + "must still force-primary to seed the SyncSource (deferring would deadlock the volume forever)") + } +} + +// BUG-048 companion: the true-lowest diskful node still promotes even when +// a HIGHER-id peer is mid-bring-up (DUnknown). A higher-id transient peer +// is not a competitor — this node (id 0) is lowest and must seed. +func TestNeedsLateAddPromote_LowestPromotesDespiteHigherIDBringingUp(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"DUnknown","replication-state":"Established","resync-suspended":"no"} + ] + }] + }]`) + + if !adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatal("the lowest diskful node must promote even when a higher-id peer is still bringing up the volume") + } +} + +// MintLateAddSource clears the bitmap for ONLY the local Inconsistent +// volumes (per-minor), wrapped in disconnect → ... → adjust, and never the +// rejected resource-wide `primary --force`. +func TestMintLateAddSource_ClearsInconsistentMinorsOnly(t *testing.T) { + fx := storage.NewFakeExec() + fx.Responses["drbdsetup status pvc-mint --json"] = storage.FakeResponse{Stdout: []byte(`[{ + "name":"pvc-mint","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"minor":1000,"disk-state":"UpToDate"}, + {"volume":1,"minor":1001,"disk-state":"Inconsistent"}, + {"volume":2,"minor":1002,"disk-state":"Inconsistent"} + ], + "connections":[] + }]`)} + + adm := drbd.NewAdm(fx) + + minors, err := adm.MintLateAddSource(t.Context(), "pvc-mint") + if err != nil { + t.Fatalf("MintLateAddSource: %v", err) + } + if !slices.Equal(minors, []int32{1001, 1002}) { + t.Fatalf("expected cleared minors [1001 1002], got %v", minors) + } + + cmds := fx.CommandLines() + want := []string{ + "drbdadm disconnect pvc-mint", + "drbdsetup new-current-uuid --clear-bitmap 1001", + "drbdsetup new-current-uuid --clear-bitmap 1002", + "drbdadm adjust pvc-mint", + } + for _, w := range want { + if !slices.Contains(cmds, w) { + t.Errorf("expected command %q, got: %v", w, cmds) + } + } + // The UpToDate sibling (minor 1000) must NOT be cleared. + if slices.Contains(cmds, "drbdsetup new-current-uuid --clear-bitmap 1000") { + t.Errorf("must NOT clear-bitmap the UpToDate sibling minor 1000, got: %v", cmds) + } + // The kernel-rejected resource-wide primary --force must not appear. + if slices.Contains(cmds, "drbdadm primary --force pvc-mint") { + t.Errorf("must NOT use primary --force, got: %v", cmds) + } +} + +// No local Inconsistent volume → MintLateAddSource is a no-op (no +// disconnect, no clear-bitmap). +func TestMintLateAddSource_NoopWhenNothingInconsistent(t *testing.T) { + fx := storage.NewFakeExec() + fx.Responses["drbdsetup status pvc-mint --json"] = storage.FakeResponse{Stdout: []byte(`[{ + "name":"pvc-mint","node-id":0,"role":"Secondary", + "devices":[{"volume":0,"minor":1000,"disk-state":"UpToDate"}], + "connections":[] + }]`)} + + adm := drbd.NewAdm(fx) + + minors, err := adm.MintLateAddSource(t.Context(), "pvc-mint") + if err != nil { + t.Fatalf("MintLateAddSource: %v", err) + } + if len(minors) != 0 { + t.Fatalf("expected no minors cleared, got %v", minors) + } + if slices.Contains(fx.CommandLines(), "drbdadm disconnect pvc-mint") { + t.Errorf("must NOT disconnect when nothing is Inconsistent, got: %v", fx.CommandLines()) + } +} diff --git a/pkg/drbd/needs_late_add_resync_kick_test.go b/pkg/drbd/needs_late_add_resync_kick_test.go new file mode 100644 index 00000000..745cb20d --- /dev/null +++ b/pkg/drbd/needs_late_add_resync_kick_test.go @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: Apache-2.0 + +/* +Copyright 2026 Cozystack contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package drbd_test + +import ( + "errors" + "slices" + "testing" + + "github.com/cozystack/blockstor/pkg/drbd" + "github.com/cozystack/blockstor/pkg/storage" +) + +// Regression pins for the BUG-048 late-add resync-KICK predicate +// (LateAddResyncKickVolumes). It returns the LOCAL volume numbers whose +// late-add resync is STALLED and recoverable by `drbdadm invalidate +// /` — a local copy below UpToDate, an UpToDate peer to pull +// from, and no live resync already running. It must return ONLY genuine +// stragglers and never an UpToDate local, a volume with no UpToDate peer, +// a live SyncTarget, a day0 bootstrap, or a Primary-held resource. + +var errKickProbe = errors.New("drbdsetup status: exit 1") + +const lateAddKickKey = "drbdsetup status pvc-kick --json" + +func admWithKickStatus(t *testing.T, json string) *drbd.Adm { + t.Helper() + + fx := storage.NewFakeExec() + fx.Responses[lateAddKickKey] = storage.FakeResponse{Stdout: []byte(json)} + + return drbd.NewAdm(fx) +} + +// Stand-observed "partial stall" (B-3r RUN9): vol-0/vol-1 UpToDate, the +// late vol-2 stuck Inconsistent locally while a peer (n2) reached +// UpToDate for vol-2 — but this replica is stuck WFBitMapT and never +// finalises. An UpToDate peer exists ⇒ invalidate vol-2 locally. +func TestLateAddResyncKickVolumes_PartialStall(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":2,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":1,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[ + {"peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"WFBitMapT","resync-suspended":"peer"} + ]}, + {"peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"} + ]} + ] + }]`) + + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if !slices.Equal(got, []int32{2}) { + t.Fatalf("expected to invalidate [2], got %v", got) + } +} + +// Post-promote dependency case: a peer is now UpToDate (the promoted +// source) but this Inconsistent replica's resync stays paused +// (resync-suspended:dependency). An UpToDate peer exists ⇒ invalidate. +func TestLateAddResyncKickVolumes_PostPromoteDependency(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":2,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"PausedSyncT","resync-suspended":"dependency"} + ] + }] + }]`) + + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if !slices.Equal(got, []int32{2}) { + t.Fatalf("expected to invalidate [2], got %v", got) + } +} + +// No UpToDate peer for the stalled volume (all-Inconsistent dependency +// deadlock BEFORE promote): invalidate has no source, so this predicate +// must NOT return the volume — maybeLateAddPromote owns minting the +// source first. +func TestLateAddResyncKickVolumes_NoUpToDatePeerSkipped(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[ + {"peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"PausedSyncS","resync-suspended":"dependency"} + ]}, + {"peer-node-id":2,"name":"n3","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"WFBitMapT","resync-suspended":"peer"} + ]} + ] + }]`) + + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if len(got) != 0 { + t.Fatalf("must NOT invalidate when no peer is UpToDate, got %v", got) + } +} + +// A live, unsuspended SyncTarget (resync-suspended "no") must NOT be +// invalidated — it is actively pulling and finishes on its own. +func TestLateAddResyncKickVolumes_ActiveSyncTargetSkipped(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":2,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"SyncTarget","resync-suspended":"no"} + ] + }] + }]`) + + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if len(got) != 0 { + t.Fatalf("must NOT invalidate a live SyncTarget, got %v", got) + } +} + +// Flap guard (multi-peer): the local volume is Inconsistent and one peer +// is stalled (PausedSyncS/dependency) BUT another peer is actively pulling +// it up (SyncTarget/resync-suspended:no). A resync is in progress, so +// invalidating now would re-dirty a converging volume and FLAP it — must +// back off and let the live resync finish. +func TestLateAddResyncKickVolumes_InProgressMultiPeerSkipped(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":2,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[ + {"peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"SyncTarget","resync-suspended":"no"} + ]}, + {"peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"PausedSyncS","resync-suspended":"dependency"} + ]} + ] + }]`) + + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if len(got) != 0 { + t.Fatalf("must NOT invalidate while a live resync is converging the volume, got %v", got) + } +} + +// An Outdated local volume is a TRANSIENT post-resync / mid-handshake state +// — re-invalidating it re-dirties a converging volume (the stand-observed +// flap). Only a genuinely-stuck Inconsistent local is ever invalidated. +func TestLateAddResyncKickVolumes_OutdatedLocalSkipped(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":2,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Outdated"} + ], + "connections":[{ + "peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"WFBitMapT","resync-suspended":"peer"} + ] + }] + }]`) + + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if len(got) != 0 { + t.Fatalf("must NOT invalidate an Outdated (transient/recovering) local volume, got %v", got) + } +} + +// An UpToDate local volume is authoritative — never invalidate it even if +// a peer-device shows a stalled state. +func TestLateAddResyncKickVolumes_UpToDateLocalSkipped(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"UpToDate"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"PausedSyncS","resync-suspended":"dependency"} + ] + }] + }]`) + + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if len(got) != 0 { + t.Fatalf("must NOT invalidate an UpToDate local volume, got %v", got) + } +} + +// Day0 bootstrap: NO local UpToDate volume yet. Even with an Inconsistent +// local + UpToDate peer, the gate refuses (RD not past first activation). +func TestLateAddResyncKickVolumes_Day0Skipped(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"PausedSyncT","resync-suspended":"dependency"} + ] + }] + }]`) + + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if len(got) != 0 { + t.Fatalf("must NOT invalidate during day0 (no local UpToDate volume), got %v", got) + } +} + +// A Primary anywhere (local or peer) vetoes invalidate — never discard a +// volume an application may be writing. +func TestLateAddResyncKickVolumes_PrimaryVetoes(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":2,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Primary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"PausedSyncT","resync-suspended":"dependency"} + ] + }] + }]`) + + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if len(got) != 0 { + t.Fatalf("must NOT invalidate while a Primary holds the resource, got %v", got) + } +} + +// A not-fully-connected peer (Connecting / StandAlone) defers — the +// resync-state read is only authoritative over a settled connection. +func TestLateAddResyncKickVolumes_PeerNotConnectedDefers(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":2,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[ + {"peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"PausedSyncT","resync-suspended":"dependency"} + ]}, + {"peer-node-id":1,"name":"n2","connection-state":"Connecting","peer-role":"Unknown", + "peer_devices":[]} + ] + }]`) + + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if len(got) != 0 { + t.Fatalf("must NOT invalidate when a peer is not fully Connected, got %v", got) + } +} + +// Fully converged (every volume UpToDate) — nothing to invalidate. +func TestLateAddResyncKickVolumes_ConvergedEmpty(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"UpToDate"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"} + ] + }] + }]`) + + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if len(got) != 0 { + t.Fatalf("must return nothing for a converged resource, got %v", got) + } +} + +// Probe failure (drbdsetup errors) → conservative empty. +func TestLateAddResyncKickVolumes_ProbeFailureEmpty(t *testing.T) { + fx := storage.NewFakeExec() + fx.Responses[lateAddKickKey] = storage.FakeResponse{Err: errKickProbe} + + adm := drbd.NewAdm(fx) + if got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick"); len(got) != 0 { + t.Fatalf("must return empty on a probe failure, got %v", got) + } +} diff --git a/pkg/rest/volume_definitions.go b/pkg/rest/volume_definitions.go index 0d10be23..c5740f3a 100644 --- a/pkg/rest/volume_definitions.go +++ b/pkg/rest/volume_definitions.go @@ -316,8 +316,18 @@ func (s *Server) handleVDCreate(w http.ResponseWriter, r *http.Request) { vd := envelope.VolumeDefinition - if !s.resolveVolumeNumber(w, r, rd, rawBody, &vd) { - return + autoNumber := !vdCreateVolumeNumberExplicit(rawBody) + + // Validate the explicit number (Bug 363) before the size gate so the + // operator-facing rejection order is stable. The auto-number path + // owns the number itself (the store allocates it), so it skips this. + if !autoNumber { + nrErr := validateVolumeNumber(vd.VolumeNumber) + if nrErr != nil { + writeVDNumberRejection(w, rd, vd.VolumeNumber, nrErr) + + return + } } // Bug 155: refuse out-of-bounds sizes at the REST boundary so the @@ -330,25 +340,7 @@ func (s *Server) handleVDCreate(w http.ResponseWriter, r *http.Request) { return } - err = s.Store.VolumeDefinitions().Create(r.Context(), rd, &vd) - if err != nil { - // Bug 140: duplicate-VD conflict gets a typed envelope with - // the upstream FAIL_EXISTS_VLM_DFN sub-code plus actionable - // cause/correction so scripts and audit-log greppers route - // the same way they do for upstream's `linstor vd c` reply. - // The bare writeStoreError fallback emitted apiCallRcError - // alone — high-bit error, no sub-code, no cause/correction - // — which the Python CLI rendered as a generic "object - // already exists" line that didn't tell the operator which - // VlmNr to twist. - if errors.Is(err, store.ErrAlreadyExists) { - writeVDExistsConflict(w, rd, vd.VolumeNumber) - - return - } - - writeStoreError(w, err) - + if !s.persistNewVolumeDefinition(w, r, rd, &vd, autoNumber) { return } @@ -363,42 +355,58 @@ func (s *Server) handleVDCreate(w http.ResponseWriter, r *http.Request) { }}) } -// resolveVolumeNumber folds the auto-assign + explicit-VlmNr -// validation branches out of handleVDCreate so the parent stays under -// the funlen threshold. Returns true on success (vd.VolumeNumber is -// populated and within bounds) and false when an envelope was already -// written to w (the caller must return immediately). +// persistNewVolumeDefinition writes the new VD to the store and, on +// failure, emits the right typed envelope. Returns true on success and +// false when an HTTP error envelope was already written (the caller +// must return immediately). Split out of handleVDCreate so the handler +// stays under the funlen budget. // -// Bug 191: auto-assign fires when the raw POST body does NOT carry an -// explicit `volume_number` key — Go's int32 zero would otherwise -// silently collide with VlmNr=0 on the second `vd c` against the -// same RD. Bug 363: when the caller DOES specify a volume_number, it -// must lie inside DRBD-9's addressable [0, 65535] range; out-of-bound -// values would otherwise hang the satellite in "waiting for DRBD-ID -// allocation" because no positive minor can be derived from them. -func (s *Server) resolveVolumeNumber( +// BUG-048 (P1, availability): when the operator did NOT name a +// VolumeNumber, the allocation MUST happen atomically with the write — +// route through CreateAutoNumbered, which picks the smallest free hole +// INSIDE the store's conflict-retry loop. The previous flow picked the +// number in a separate REST-side List (autoAssignVolumeNumber) BEFORE +// the Create, so two concurrent `linstor vd c ` calls both read +// `[vol-0]`, both chose VlmNr=1, the loser was rejected FAIL_EXISTS_ +// VLM_DFN, and the operator's second intended volume silently vanished. +// The explicit-number path keeps the plain Create (its retry loop +// already converges correctly and a genuine duplicate must still 409). +func (s *Server) persistNewVolumeDefinition( w http.ResponseWriter, r *http.Request, rd string, - rawBody []byte, vd *apiv1.VolumeDefinition, + autoNumber bool, ) bool { - if !vdCreateVolumeNumberExplicit(rawBody) { - assigned, assignErr := s.autoAssignVolumeNumber(r.Context(), rd) - if assignErr != nil { - writeStoreError(w, assignErr) + if autoNumber { + _, err := s.Store.VolumeDefinitions().CreateAutoNumbered(r.Context(), rd, vd) + if err != nil { + writeStoreError(w, err) return false } - vd.VolumeNumber = assigned - return true } - nrErr := validateVolumeNumber(vd.VolumeNumber) - if nrErr != nil { - writeVDNumberRejection(w, rd, vd.VolumeNumber, nrErr) + err := s.Store.VolumeDefinitions().Create(r.Context(), rd, vd) + if err != nil { + // Bug 140: duplicate-VD conflict gets a typed envelope with + // the upstream FAIL_EXISTS_VLM_DFN sub-code plus actionable + // cause/correction so scripts and audit-log greppers route + // the same way they do for upstream's `linstor vd c` reply. + // The bare writeStoreError fallback emitted apiCallRcError + // alone — high-bit error, no sub-code, no cause/correction + // — which the Python CLI rendered as a generic "object + // already exists" line that didn't tell the operator which + // VlmNr to twist. + if errors.Is(err, store.ErrAlreadyExists) { + writeVDExistsConflict(w, rd, vd.VolumeNumber) + + return false + } + + writeStoreError(w, err) return false } @@ -469,36 +477,12 @@ func rawHasNonNullKey(obj map[string]json.RawMessage, key string) bool { return !bytes.Equal(bytes.TrimSpace(raw), []byte("null")) } -// autoAssignVolumeNumber returns the smallest free non-negative -// VolumeNumber under the given RD. Mirrors upstream LINSTOR's -// CtrlVlmDfnCrtApiCallHandler smallest-hole rule: with VDs 0 and 2 -// present, an auto-assign POST lands at 1 — not 3. -// -// A missing RD surfaces here as the underlying store's ErrNotFound; -// the caller routes it through writeStoreError so the wire shape -// matches the pre-fix 404 path for an unknown parent RD. -func (s *Server) autoAssignVolumeNumber(ctx context.Context, rd string) (int32, error) { - vds, err := s.Store.VolumeDefinitions().List(ctx, rd) - if err != nil { - return 0, err //nolint:wrapcheck // surfaced to writeStoreError - } - - used := make(map[int32]bool, len(vds)) - for i := range vds { - used[vds[i].VolumeNumber] = true - } - - for candidate := int32(0); candidate >= 0; candidate++ { - if !used[candidate] { - return candidate, nil - } - } - - // Unreachable for any sane RD (the smallest-free walk terminates - // at the first gap; an RD with 2^31 VDs is impossible on real - // storage). Pin the contract anyway. - return 0, errors.New("auto-assign: VolumeNumber space exhausted") -} +// Auto-assign of the smallest free VolumeNumber now lives in the store +// layer (VolumeDefinitionStore.CreateAutoNumbered) so the read of the +// existing set and the write of the new entry are atomic — see BUG-048. +// The former REST-side autoAssignVolumeNumber helper (a separate List +// before the Create) was removed because its TOCTOU window dropped the +// second of two concurrent `linstor vd c` calls. // minVolumeDefinitionSizeKib is the smallest accepted size_kib on // `POST /v1/resource-definitions/{rd}/volume-definitions` (Bug 155). diff --git a/pkg/rest/volume_definitions_test.go b/pkg/rest/volume_definitions_test.go index 7ca91ebb..d40bc1e1 100644 --- a/pkg/rest/volume_definitions_test.go +++ b/pkg/rest/volume_definitions_test.go @@ -22,6 +22,7 @@ import ( "encoding/json" "net/http" "strings" + "sync" "testing" apiv1 "github.com/cozystack/blockstor/pkg/api/v1" @@ -69,6 +70,102 @@ func TestVolumeDefinitionsCreateRoundTrip(t *testing.T) { } } +// TestVolumeDefinitionsConcurrentAutoNumberNoLostUpdate is the BUG-048 +// (P1, availability) regression: two back-to-back number-less +// `linstor vd c ` POSTs against an RD that already carries vol-0 +// must BOTH succeed and land at DISTINCT VolumeNumbers (1 and 2) — none +// silently dropped. +// +// Pre-fix the auto-assign picked the smallest free number in a REST-side +// List BEFORE the store Create, so both requests read `[vol-0]`, both +// chose VlmNr=1, the loser was rejected FAIL_EXISTS_VLM_DFN, and the +// operator's second intended volume vanished (only vol-0 + vol-1 left). +// The store-side CreateAutoNumbered re-derives the hole inside its +// conflict-retry loop, so the loser lands at vol-2. +// +// This is the L1 half of the BUG-048 fix; the kernel-truth convergence +// half is tests/e2e/cli-matrix/multi-volume-late-vd-create.sh. +func TestVolumeDefinitionsConcurrentAutoNumberNoLostUpdate(t *testing.T) { + st := store.NewInMemory() + if err := st.ResourceDefinitions().Create(t.Context(), &apiv1.ResourceDefinition{Name: "pvc-1"}); err != nil { + t.Fatalf("seed RD: %v", err) + } + // vol-0 already present — this is the "late vd-add" precondition. + if err := st.VolumeDefinitions().Create(t.Context(), "pvc-1", &apiv1.VolumeDefinition{VolumeNumber: 0, SizeKib: 32768}); err != nil { + t.Fatalf("seed vol-0: %v", err) + } + + base, stop := startServerWithStore(t, st) + defer stop() + + // Number-less create body — the auto-assign path the python CLI's + // `linstor vd c ` takes: the wire body OMITS the + // `volume_number` key entirely (a marshalled apiv1.VolumeDefinition + // would emit `"volume_number":0` because the field has no omitempty, + // which the handler reads as an EXPLICIT 0 — that is NOT what the + // CLI sends, so the test crafts the raw omitted-key body directly). + body := []byte(`{"volume_definition":{"size_kib":32768}}`) + + const concurrent = 2 + + var ( + wg sync.WaitGroup + mu sync.Mutex + statuses []int + ) + + for range concurrent { + wg.Add(1) + + go func() { + defer wg.Done() + + resp := httpPost(t, base+"/v1/resource-definitions/pvc-1/volume-definitions", body) + defer func() { _ = resp.Body.Close() }() + + mu.Lock() + statuses = append(statuses, resp.StatusCode) + mu.Unlock() + }() + } + + wg.Wait() + + for i, code := range statuses { + if code != http.StatusOK { + t.Errorf("concurrent vd c #%d: status %d, want 200 (no request may be lost/rejected)", i, code) + } + } + + listResp := httpGet(t, base+"/v1/resource-definitions/pvc-1/volume-definitions") + defer func() { _ = listResp.Body.Close() }() + + var vds []apiv1.VolumeDefinition + if jErr := json.NewDecoder(listResp.Body).Decode(&vds); jErr != nil { + t.Fatalf("decode: %v", jErr) + } + + // Exactly three VDs: vol-0 (seeded) + the two concurrent adds at 1, 2. + if len(vds) != 3 { + t.Fatalf("got %d VDs, want 3 (vol-0 + two concurrent adds); a lost-update would leave 2: %+v", len(vds), vds) + } + + seen := map[int32]bool{} + for i := range vds { + if seen[vds[i].VolumeNumber] { + t.Fatalf("duplicate VolumeNumber %d in %+v", vds[i].VolumeNumber, vds) + } + seen[vds[i].VolumeNumber] = true + } + + for i := range 3 { + want := int32(i) + if !seen[want] { + t.Errorf("missing VolumeNumber %d (set: %v) — a concurrent add was silently dropped", want, seen) + } + } +} + // TestVolumeDefinitionsGetMissingRD: 404 when RD itself does not exist. func TestVolumeDefinitionsGetMissingRD(t *testing.T) { base, stop := startServerWithStore(t, store.NewInMemory()) diff --git a/pkg/satellite/maybe_kick_late_add_resync_test.go b/pkg/satellite/maybe_kick_late_add_resync_test.go new file mode 100644 index 00000000..cb224dad --- /dev/null +++ b/pkg/satellite/maybe_kick_late_add_resync_test.go @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: Apache-2.0 + +/* +Copyright 2026 Cozystack contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package satellite + +import ( + "context" + "slices" + "testing" + "time" + + "github.com/cozystack/blockstor/pkg/drbd" + "github.com/cozystack/blockstor/pkg/satellite/intent" + "github.com/cozystack/blockstor/pkg/storage" +) + +// Regression pins for the BUG-048 late-add resync-KICK self-heal wiring. +// maybeKickLateAddResync must `drbdadm invalidate /` each stalled +// late-added volume that has an UpToDate peer (re-pull FROM it), and must +// NOT act on a diskless replica, an actively-pulling SyncTarget, or a +// volume with no UpToDate peer. + +const kickStatusKey = "drbdsetup status pvc-ka --json" + +// vol-0 UpToDate, late vol-2 Inconsistent locally with an UpToDate peer +// whose resync is stalled (PausedSyncT/dependency) — the recoverable +// straggler. No Primary. +const kickStatusWedged = `[{ + "name":"pvc-ka","node-id":2,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"PausedSyncT","resync-suspended":"dependency"} + ] + }] +}]` + +func kickDR() *intent.DesiredResource { + return &intent.DesiredResource{ + Name: "pvc-ka", + NodeName: "n3", + Volumes: []*intent.DesiredVolume{ + {VolumeNumber: 0, SizeKib: 1024 * 1024, StoragePool: "thin1"}, + {VolumeNumber: 2, SizeKib: 1024 * 1024, StoragePool: "thin1"}, + }, + Peers: []intent.DesiredPeer{{Name: "n1"}}, + DrbdOptions: map[string]string{ + "port": "7000", "node-id": "2", "address": "10.0.0.3", "minor": "1000", + }, + } +} + +// The wedge fires the per-volume invalidate — but ONLY after the stall +// has persisted beyond the dwell window. +func TestMaybeKickLateAddResync_InvalidatesStalledVolume(t *testing.T) { + fx := storage.NewFakeExec() + fx.Expect(kickStatusKey, storage.FakeResponse{Stdout: []byte(kickStatusWedged)}) + + rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n3"}) + + // First pass starts the dwell window — must NOT invalidate yet. + if err := rec.maybeKickLateAddResync(context.Background(), kickDR(), false); err != nil { + t.Fatalf("maybeKickLateAddResync pass 1: %v", err) + } + if slices.Contains(fx.CommandLines(), "drbdadm invalidate pvc-ka/2") { + t.Fatalf("must NOT invalidate on the first observation (dwell not elapsed), got: %v", fx.CommandLines()) + } + + // Backdate the recorded stall so the dwell has elapsed. + rec.mu.Lock() + rec.firstLateAddStallAt["pvc-ka"] = time.Now().Add(-2 * lateAddStallDwell) + rec.mu.Unlock() + + if err := rec.maybeKickLateAddResync(context.Background(), kickDR(), false); err != nil { + t.Fatalf("maybeKickLateAddResync pass 2: %v", err) + } + + if !slices.Contains(fx.CommandLines(), "drbdadm invalidate pvc-ka/2") { + t.Errorf("expected `drbdadm invalidate pvc-ka/2` after dwell, got: %v", fx.CommandLines()) + } + // Must NOT use the abandoned disconnect/adjust handshake kick. + if slices.Contains(fx.CommandLines(), "drbdadm disconnect pvc-ka") { + t.Errorf("must NOT disconnect (re-handshake abandons the bitmap), got: %v", fx.CommandLines()) + } +} + +// A stall that CLEARS (no volume returned) resets the dwell window. +func TestMaybeKickLateAddResync_DwellResetsWhenStallClears(t *testing.T) { + fx := storage.NewFakeExec() + fx.Expect(kickStatusKey, storage.FakeResponse{Stdout: []byte(kickStatusWedged)}) + + rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n3"}) + + if err := rec.maybeKickLateAddResync(context.Background(), kickDR(), false); err != nil { + t.Fatalf("maybeKickLateAddResync pass 1: %v", err) + } + + // Converged — no stalled volume returned, dwell dropped. + fx.Expect(kickStatusKey, storage.FakeResponse{Stdout: []byte(`[{ + "name":"pvc-ka","node-id":2,"role":"Secondary", + "devices":[{"volume":0,"disk-state":"UpToDate"},{"volume":2,"disk-state":"UpToDate"}], + "connections":[{"peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}]}] + }]`)}) + if err := rec.maybeKickLateAddResync(context.Background(), kickDR(), false); err != nil { + t.Fatalf("maybeKickLateAddResync pass 2: %v", err) + } + + rec.mu.Lock() + _, stillTracked := rec.firstLateAddStallAt["pvc-ka"] + rec.mu.Unlock() + if stillTracked { + t.Errorf("dwell must be cleared once the stall clears") + } +} + +// A diskless replica has no local copy to invalidate — skip outright. +func TestMaybeKickLateAddResync_SkipsDiskless(t *testing.T) { + fx := storage.NewFakeExec() + fx.Expect(kickStatusKey, storage.FakeResponse{Stdout: []byte(kickStatusWedged)}) + + rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n3"}) + + if err := rec.maybeKickLateAddResync(context.Background(), kickDR(), true); err != nil { + t.Fatalf("maybeKickLateAddResync: %v", err) + } + + if slices.Contains(fx.CommandLines(), "drbdadm invalidate pvc-ka/2") { + t.Errorf("diskless replica must NOT be invalidated, got: %v", fx.CommandLines()) + } +} + +// A live, unsuspended SyncTarget must NOT be invalidated — it is actively +// pulling and finishes on its own. +func TestMaybeKickLateAddResync_SkipsActiveSyncTarget(t *testing.T) { + fx := storage.NewFakeExec() + fx.Expect(kickStatusKey, storage.FakeResponse{Stdout: []byte(`[{ + "name":"pvc-ka","node-id":2,"role":"Secondary", + "devices":[{"volume":0,"disk-state":"UpToDate"},{"volume":2,"disk-state":"Inconsistent"}], + "connections":[{"peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"SyncTarget","resync-suspended":"no"}]}] + }]`)}) + + rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n3"}) + + if err := rec.maybeKickLateAddResync(context.Background(), kickDR(), false); err != nil { + t.Fatalf("maybeKickLateAddResync: %v", err) + } + + if slices.Contains(fx.CommandLines(), "drbdadm invalidate pvc-ka/2") { + t.Errorf("must NOT invalidate a live SyncTarget, got: %v", fx.CommandLines()) + } +} + +// Throttled by the shared recoveryPromoteThrottle: a second back-to-back +// pass inside the window (dwell pre-satisfied) must NOT re-invalidate. +func TestMaybeKickLateAddResync_Throttled(t *testing.T) { + fx := storage.NewFakeExec() + fx.Expect(kickStatusKey, storage.FakeResponse{Stdout: []byte(kickStatusWedged)}) + + rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n3"}) + + rec.mu.Lock() + rec.firstLateAddStallAt["pvc-ka"] = time.Now().Add(-2 * lateAddStallDwell) + rec.mu.Unlock() + + for i := range 2 { + if err := rec.maybeKickLateAddResync(context.Background(), kickDR(), false); err != nil { + t.Fatalf("maybeKickLateAddResync pass %d: %v", i, err) + } + } + + invals := 0 + for _, c := range fx.CommandLines() { + if c == "drbdadm invalidate pvc-ka/2" { + invals++ + } + } + if invals != 1 { + t.Errorf("expected exactly 1 invalidate within the throttle window, got %d", invals) + } +} diff --git a/pkg/satellite/maybe_late_add_promote_test.go b/pkg/satellite/maybe_late_add_promote_test.go new file mode 100644 index 00000000..76b29438 --- /dev/null +++ b/pkg/satellite/maybe_late_add_promote_test.go @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: Apache-2.0 + +/* +Copyright 2026 Cozystack contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package satellite + +import ( + "context" + "slices" + "testing" + + "github.com/cozystack/blockstor/pkg/drbd" + "github.com/cozystack/blockstor/pkg/satellite/intent" + "github.com/cozystack/blockstor/pkg/storage" +) + +// Regression pins for the BUG-048 late-add-promote self-heal wiring. +// maybeLateAddPromote must force-primary the lowest-node-id replica when +// a late-added volume wedged Inconsistent on every diskful replica with +// no SyncSource — WITHOUT the dispatcher's auto-primary (which is absent +// on the initialized RD every late-add lands on) — and must NOT promote +// a diskless replica. + +const lateAddStatusKey = "drbdsetup status pvc-la --json" + +// vol-0/vol-1 UpToDate, the late vol-2 Inconsistent locally and on the +// peer, no peer data for vol-2, no Primary, local is the lowest node-id. +const lateAddStatusWedged = `[{ + "name":"pvc-la","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"minor":1000,"disk-state":"UpToDate"}, + {"volume":1,"minor":1001,"disk-state":"UpToDate"}, + {"volume":2,"minor":1002,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":1,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"Established","resync-suspended":"no"} + ] + }] +}]` + +func lateAddDR() *intent.DesiredResource { + return &intent.DesiredResource{ + Name: "pvc-la", + NodeName: "n1", + Volumes: []*intent.DesiredVolume{ + {VolumeNumber: 0, SizeKib: 1024 * 1024, StoragePool: "thin1"}, + {VolumeNumber: 1, SizeKib: 1024 * 1024, StoragePool: "thin1"}, + {VolumeNumber: 2, SizeKib: 1024 * 1024, StoragePool: "thin1"}, + }, + Peers: []intent.DesiredPeer{{Name: "n2"}}, + DrbdOptions: map[string]string{ + "port": "7000", "node-id": "0", "address": "10.0.0.1", "minor": "1000", + }, + } +} + +// The wedge mints an UpToDate source via disconnect + per-volume +// new-current-uuid --clear-bitmap + reconnect (NOT primary --force, which +// the kernel rejects while a volume is Inconsistent). +func TestMaybeLateAddPromote_FiresForWedgedLateVolume(t *testing.T) { + fx := storage.NewFakeExec() + fx.Expect(lateAddStatusKey, storage.FakeResponse{Stdout: []byte(lateAddStatusWedged)}) + + rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n1"}) + + if err := rec.maybeLateAddPromote(context.Background(), lateAddDR(), false); err != nil { + t.Fatalf("maybeLateAddPromote: %v", err) + } + + cmds := fx.CommandLines() + if !slices.Contains(cmds, "drbdadm disconnect pvc-la") { + t.Errorf("expected `drbdadm disconnect pvc-la`, got: %v", cmds) + } + if !slices.Contains(cmds, "drbdsetup new-current-uuid --clear-bitmap 1002") { + t.Errorf("expected `drbdsetup new-current-uuid --clear-bitmap 1002` (the Inconsistent vol-2 minor), got: %v", cmds) + } + if !slices.Contains(cmds, "drbdadm adjust pvc-la") { + t.Errorf("expected `drbdadm adjust pvc-la` (reconnect after mint), got: %v", cmds) + } + // Must NOT touch the UpToDate sibling volumes' minors. + if slices.Contains(cmds, "drbdsetup new-current-uuid --clear-bitmap 1000") || + slices.Contains(cmds, "drbdsetup new-current-uuid --clear-bitmap 1001") { + t.Errorf("must NOT clear-bitmap UpToDate sibling volumes, got: %v", cmds) + } + // The rejected resource-wide primary --force is gone. + if slices.Contains(cmds, "drbdadm primary --force pvc-la") { + t.Errorf("must NOT use the kernel-rejected `primary --force` on the late-add path, got: %v", cmds) + } +} + +// A diskless replica has no disk to promote — skip outright. +func TestMaybeLateAddPromote_SkipsDiskless(t *testing.T) { + fx := storage.NewFakeExec() + fx.Expect(lateAddStatusKey, storage.FakeResponse{Stdout: []byte(lateAddStatusWedged)}) + + rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n1"}) + + if err := rec.maybeLateAddPromote(context.Background(), lateAddDR(), true); err != nil { + t.Fatalf("maybeLateAddPromote: %v", err) + } + + if slices.Contains(fx.CommandLines(), "drbdadm disconnect pvc-la") { + t.Errorf("diskless replica must NOT mint a source, got: %v", fx.CommandLines()) + } +} + +// A peer holding committed data for the volume vetoes the promote — the +// replica must SyncTarget from it instead (Bug 342 unrelated-data guard). +func TestMaybeLateAddPromote_SkipsWhenPeerHasData(t *testing.T) { + fx := storage.NewFakeExec() + fx.Expect(lateAddStatusKey, storage.FakeResponse{Stdout: []byte(`[{ + "name":"pvc-la","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"SyncTarget","resync-suspended":"no"} + ] + }] + }]`)}) + + rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n1"}) + + if err := rec.maybeLateAddPromote(context.Background(), lateAddDR(), false); err != nil { + t.Fatalf("maybeLateAddPromote: %v", err) + } + + if slices.Contains(fx.CommandLines(), "drbdadm disconnect pvc-la") { + t.Errorf("must NOT mint a source when a peer holds data, got: %v", fx.CommandLines()) + } +} diff --git a/pkg/satellite/reconciler.go b/pkg/satellite/reconciler.go index f7c57327..8d56cc52 100644 --- a/pkg/satellite/reconciler.go +++ b/pkg/satellite/reconciler.go @@ -283,6 +283,18 @@ type Reconciler struct { // resets it (at worst one extra promote). lastRecoveryPromoteAt map[string]time.Time + // firstLateAddStallAt records, per resource name, the first wall-clock + // time this node observed a STALLED late-add resync (BUG-048 + // NeedsLateAddResyncKick) without yet kicking it. The kick + // (disconnect+adjust) only fires once the stall has PERSISTED beyond + // lateAddStallDwell, so the normal, transient DRBD volume-resync + // serialization (vol-N legitimately PausedSyncS/dependency for a few + // seconds while vol-(N-1) finishes) is never mistaken for the wedge + // and disrupted. Cleared when the stall clears (predicate stops + // holding). Process-memory only; a restart resets it (at worst one + // extra dwell window before the kick). + firstLateAddStallAt map[string]time.Time + // restoreSnapshotMissingTries counts, per "/", // how many consecutive reconcile passes a snapshot-restore-backed // volume found its source `@snap` ABSENT locally (ErrNotFound from @@ -310,6 +322,20 @@ type Reconciler struct { // fresh nudge promptly. const recoveryPromoteThrottle = 10 * time.Second +// lateAddStallDwell is how long a late-add resync must REMAIN stalled +// (LateAddResyncKickVolumes continuously non-empty) before +// maybeKickLateAddResync fires the per-volume invalidate. It is a SECONDARY +// guard against disrupting the normal, transient DRBD volume-resync +// serialization — the PRIMARY guard is peerDeviceResyncInProgress, which +// already backs the kick off whenever any resync on the volume is actively +// progressing (so a freshly-serialized vol-N that is about to start is not +// invalidated). The dwell adds a small settling margin so a stall that is +// momentarily mis-read clears on its own first. Kept short so a genuine +// FROZEN wedge (oos never moves) recovers well inside the operator/L7 +// convergence budget; far longer than the sub-second window in which a +// just-added volume transitions through its handshake states. +const lateAddStallDwell = 20 * time.Second + // restoreSnapshotMissingBudget bounds how many consecutive reconcile // passes a node-local snapshot-restore volume may REQUEUE on a // not-yet-materialised local `@snap` before falling through to the @@ -338,6 +364,7 @@ func NewReconciler(cfg ReconcilerConfig) *Reconciler { seenStuckAt: map[string]time.Time{}, restoreBlankFallback: map[string]bool{}, lastRecoveryPromoteAt: map[string]time.Time{}, + firstLateAddStallAt: map[string]time.Time{}, restoreSnapshotMissingTries: map[string]int{}, } } @@ -1854,71 +1881,6 @@ func extractResFilePeers(body string) []string { // bogus --node-id=0 collision against the local slot. Reads via // os.ReadFile so a transient I/O hiccup degrades to no-op // instead of wedging the reconcile. -// hasLateAddedVolume reports whether the desired-state Volumes[] -// includes at least one volume number that is NOT yet represented -// as a `volume {` block in the OLD .res file at resPath. -// -// Bug 332: the `linstor vd c 1G` flow grows VolumeDefinitions[] -// after the RD has already passed first-activation. The dispatcher -// hands the satellite a DesiredResource with the new volume in -// Volumes[], but the on-disk .res still describes the smaller set — -// so a strict greater-than on the rendered block count is the -// late-VD signal. Returns false when the .res file is absent -// (cold-start path; the existing firstActivation gate owns -// metadata creation), when the file is unreadable (fail-safe to -// "no late vol → no extra work"), or when the desired set matches -// what's already rendered. -// -// Parser is intentionally simple: matches "volume {" inside an -// `on {` block. False positives across multi-host blocks are -// harmless — the helper de-duplicates by recording each volNumber -// once across the file. -func hasLateAddedVolume(resPath string, dr *intent.DesiredResource) bool { - if dr == nil { - return false - } - - body, err := os.ReadFile(resPath) - if err != nil { - // No .res yet → cold start, existing firstActivation path - // will create metadata for every volume via the standard - // chain. Late-VD signal is "old file exists with fewer - // volumes", not "no file at all". - return false - } - - rendered := map[int32]struct{}{} - - for line := range strings.SplitSeq(string(body), "\n") { - trimmed := strings.TrimSpace(line) - if !strings.HasPrefix(trimmed, "volume ") { - continue - } - - rest := strings.TrimPrefix(trimmed, "volume ") - head, _, ok := strings.Cut(rest, "{") - - if !ok { - continue - } - - num, parseErr := strconv.ParseInt(strings.TrimSpace(head), 10, 32) - if parseErr != nil { - continue - } - - rendered[int32(num)] = struct{}{} - } - - for _, vol := range dr.GetVolumes() { - if _, ok := rendered[vol.GetVolumeNumber()]; !ok { - return true - } - } - - return false -} - func extractResFilePeerNodeIDs(resPath string) map[string]int32 { body, err := os.ReadFile(resPath) if err != nil { @@ -2135,15 +2097,28 @@ func (r *Reconciler) applyDRBD(ctx context.Context, dr *intent.DesiredResource, // upstream LINSTOR DrbdLayer.adjustResource: per-volume // hasMetaData probe, per-volume createMd for those that lack it. // - // Scope: NARROW to the "late-VD added" signal — desired Volumes[] - // count strictly exceeds the OLD .res's `volume N {` block count. - // Without this narrowing, the steady-state path would shell out - // `drbdadm dump-md` on every reconcile, perturbing existing tests - // that pin "no metadata work on retry" (e.g. mid-Apply abort - // scenarios) and adding shell cost on every converged pass. - // Skipped on diskless replicas (no lower disk to stamp) and on - // the diskless→diskful flip path (Bug 319 owns that re-stamp). - if !diskless && hasLateAddedVolume(resPath, dr) && + // Scope: gate on "this RD is already past first activation" + // (MetadataCreated Condition / .md-created marker) rather than the + // OLD .res's `volume N {` block count. + // + // BUG-048 (P1, availability): the previous `.res`-count gate raced + // the FSM dispatch's renderResFile preamble. On a concurrent late + // vd-add the FSM `adjust` of an EARLIER reconcile already re-rendered + // `.res` to include the new volume's `volume N {` block BEFORE this + // gate read the file, so hasLateAddedVolume(resPath) flipped FALSE, + // ensurePerVolumeMetadata was SKIPPED, and the new volume came up via + // the kernel adjust with NO metadata + NO winner-election GI seed — + // Inconsistent on every replica, no SyncSource, permanently wedged + // (~half of concurrent two-VD adds). The on-disk metadata, not the + // rendered .res, is the race-free truth: ensurePerVolumeMetadata + // probes `drbdadm dump-md` per volume and only creates+seeds the ones + // that actually lack metadata (len(freshlyCreated)==0 → early return), + // so running it on every post-activation diskful reconcile is + // idempotent and converged passes pay only the cheap per-volume + // dump-md probe. Skipped on diskless replicas (no lower disk to + // stamp) and on the diskless→diskful flip path (Bug 319 owns that + // re-stamp). + if !diskless && dr.GetMetadataCreated() && !r.isDisklessToDiskfulFlip(ctx, dr, diskless) { err = r.ensurePerVolumeMetadata(ctx, dr, devices, diskless) if err != nil { @@ -2561,6 +2536,32 @@ func (r *Reconciler) maybePromoteSelfHeals(ctx context.Context, dr *intent.Desir return err } + // BUG-048 late-add-promote self-heal — drive a late-added volume that + // wedged Inconsistent on EVERY diskful replica (no SyncSource, no peer + // holds data) to UpToDate. Distinct from maybeRecoveryPromote: that + // path needs the local already-UpToDate + the dispatcher's + // auto-primary, BOTH false for a late-add to an initialized RD. See + // maybeLateAddPromote / Adm.NeedsLateAddPromote. + err = r.maybeLateAddPromote(ctx, dr, diskless) + if err != nil { + return err + } + + // BUG-048 late-add resync-kick self-heal — unstick a late-added + // volume whose resync EXISTS but wedged in a paused/bitmap-exchange + // state that never advances (the ≥3-replica concurrent-add + // convergence wedge: a SyncSource is elected but its resync sits + // PausedSyncS/resync-suspended:dependency while partner peers wait in + // WFBitMapT/resync-suspended:peer, or one peer reaches UpToDate while + // a second stalls WFBitMapT/Outdated forever). Distinct from + // maybeLateAddPromote: that mints a source when NONE exists; this + // re-handshakes an EXISTING-but-stalled resync via disconnect+adjust. + // See maybeKickLateAddResync / Adm.NeedsLateAddResyncKick. + err = r.maybeKickLateAddResync(ctx, dr, diskless) + if err != nil { + return err + } + // Solo diskless→diskful toggle self-heal — force-promote a lone, // peerless diskful replica wedged below UpToDate. See maybeSoloPromote // for the full why (the auto-primary suppression × offline-safety @@ -2720,6 +2721,185 @@ func (r *Reconciler) maybeRecoveryPromote(ctx context.Context, dr *intent.Desire return r.runAutoPromote(ctx, dr) } +// maybeLateAddPromote is the BUG-048 self-heal for a late-added volume +// that wedged Inconsistent on EVERY diskful replica with no SyncSource. +// +// Why it cannot reuse maybeRecoveryPromote: that path is gated on the +// dispatcher's `auto-primary` (autoPrimaryReplica), which is suppressed +// on an INITIALIZED RD — and a late `vd c` ALWAYS lands on an +// initialized RD. Its kernel-truth predicate (NeedsRecoveryPromote) also +// requires the local replica to be already UpToDate; in this wedge the +// local late-added volume is itself Inconsistent. So neither the gate +// nor the predicate can ever fire for the late-add wedge. +// +// NeedsLateAddPromote is the dedicated kernel-truth gate: it fires ONLY +// when a local diskful volume is Inconsistent, NO connected peer holds +// committed data for it (so there is no real SyncSource to wait for), +// no replica is Primary, and this node is the deterministic lowest +// node-id — so exactly one node mints the source and it is data-safe +// (every replica shares the volume's day0 current-UUID and is equally +// empty, so the minted UpToDate copy loses nothing; the Inconsistent peers +// SyncTarget from it). Self-limiting: once the peers converge the predicate +// stops holding. +// +// How the source is minted: NOT `drbdadm primary --force`. The kernel +// REJECTS that with "(-16) Disk state is lower than outdated" whenever any +// volume of the resource is Inconsistent (stand-observed: the old promote +// retried every throttle window and never took — the wedge persisted on all +// replicas). Instead MintLateAddSource disconnects, runs the per-volume +// `drbdsetup new-current-uuid --clear-bitmap ` (the same skip-init +// path DRBD uses to mark a fresh volume UpToDate), and reconnects — so the +// Inconsistent peers SyncTarget from the freshly-UpToDate local volumes. +// +// NOT gated on autoPromote/autoPrimaryReplica by design — those are +// exactly what is missing on the late-add path. It IS gated on the same +// recoveryPromoteThrottle so a still-converging resync isn't churned by +// repeated mints. Skipped on diskless replicas (no disk to mint) and when +// Adm is unwired (storage-only unit tests). +func (r *Reconciler) maybeLateAddPromote(ctx context.Context, dr *intent.DesiredResource, diskless bool) error { + if diskless || r.cfg.Adm == nil { + return nil + } + + if !r.cfg.Adm.NeedsLateAddPromote(ctx, dr.GetName()) { + return nil + } + + if !r.recoveryPromoteDue(dr.GetName()) { + return nil + } + + log.FromContext(ctx).Info("BUG-048 late-add-promote: mint UpToDate source (disconnect + new-current-uuid --clear-bitmap + reconnect) for a late-added volume wedged Inconsistent on every replica", + "resource", dr.GetName()) + + minors, err := r.cfg.Adm.MintLateAddSource(ctx, dr.GetName()) + if err != nil { + return errors.Wrapf(err, "late-add-promote mint source %s", dr.GetName()) + } + + log.FromContext(ctx).Info("BUG-048 late-add-promote: minted UpToDate source", + "resource", dr.GetName(), "minors", minors) + + return nil +} + +// maybeKickLateAddResync is the BUG-048 self-heal for a late-added volume +// whose resync EXISTS but wedged in a paused / bitmap-exchange state that +// never advances — the ≥3-replica concurrent late-add convergence wedge. +// +// Two stand-observed signatures it unsticks (both on the last +// concurrently-added volume of a 3-diskful RD, oos frozen at full / a +// peer stuck below UpToDate forever): +// +// - partial stall: one peer reaches UpToDate (a real SyncSource), but +// THIS replica stays WFBitMapT / Outdated / Inconsistent and never +// finalises — the source fed another peer but not us. +// - dependency deadlock (post-promote): once maybeLateAddPromote +// force-primaries the lowest-id node to UpToDate, the remaining +// Inconsistent replicas have an UpToDate peer but their resync stays +// paused (resync-suspended:dependency) and never advances. +// +// Why neither promote self-heal covers it: maybeLateAddPromote fires only +// when NO peer holds data and the volume is Inconsistent on EVERY replica +// (it mints a source); here an UpToDate peer EXISTS but the local resync +// is latched. maybeRecoveryPromote needs the local already-UpToDate. +// +// Why `invalidate` and not a connection re-handshake: every replica of a +// fresh late-added volume shares the day0 current-UUID, so disconnect+ +// reconnect reads "same generation ⇒ no resync needed" and abandons the +// dirty bitmap — a WORSE wedge (verified on-stand: connection went +// StandAlone / Established with out-of-sync frozen). `drbdadm invalidate +// /` instead discards the empty, non-authoritative LOCAL copy of +// that one volume and forces a full SyncTarget FROM the UpToDate peer. +// +// LateAddResyncKickVolumes is the kernel-truth gate: it returns the local +// volumes below UpToDate that have a connected UpToDate peer (the +// SyncTarget source) and are not already being actively pulled — ONLY when +// no replica is Primary, the RD is past day0, and every peer is Connected. +// Each returned volume is invalidated locally. Data-safe (only a +// non-authoritative local copy is discarded, and only when an UpToDate +// peer exists to re-pull from) and self-limiting (once a volume reaches +// UpToDate it no longer qualifies). A 45s stall-dwell ensures only a +// GENUINELY frozen resync is invalidated, never the normal transient +// volume-resync serialization. Throttled via the shared +// recoveryPromoteThrottle. Skipped on diskless replicas (no local copy to +// invalidate) and when Adm is unwired (storage-only tests). +func (r *Reconciler) maybeKickLateAddResync(ctx context.Context, dr *intent.DesiredResource, diskless bool) error { + if diskless || r.cfg.Adm == nil { + return nil + } + + volumes := r.cfg.Adm.LateAddResyncKickVolumes(ctx, dr.GetName()) + if len(volumes) == 0 { + // Not stalled (or converged): clear any pending dwell so a future + // transient stall starts its window fresh. + r.clearLateAddStall(dr.GetName()) + + return nil + } + + // Dwell gate: only invalidate once the stall has PERSISTED beyond + // lateAddStallDwell, so the normal transient volume-resync + // serialization (a few seconds of PausedSync*/dependency while a + // sibling volume finishes) is never disrupted — only a genuinely + // frozen resync is. + if !r.lateAddStallDwellElapsed(dr.GetName()) { + return nil + } + + if !r.recoveryPromoteDue(dr.GetName()) { + return nil + } + + // Invalidate ONE volume per kick (the lowest-numbered stalled one). + // DRBD serialises resyncs per connection, so invalidating every stalled + // volume at once would re-create the very dependency-serialization that + // wedges — vol-N's invalidate would sit PausedSyncT/dependency behind + // vol-(N-1)'s and, looking stalled, get re-invalidated and flap. One at + // a time lets each invalidated volume's resync run (and back off the + // kick via peerDeviceResyncInProgress) before the next reconcile picks + // up the next straggler. volumes is built in ascending volume order by + // the kernel device scan. + vol := volumes[0] + + log.FromContext(ctx).Info("BUG-048 late-add resync-kick: invalidate locally to re-pull a stalled late-added volume from its UpToDate peer", + "resource", dr.GetName(), "volume", vol) + + return errors.Wrapf(r.cfg.Adm.InvalidateVolume(ctx, dr.GetName(), vol), + "late-add resync-kick invalidate %s/%d", dr.GetName(), vol) +} + +// lateAddStallDwellElapsed records the first time a resource was observed +// with a stalled late-add resync and reports whether the stall has now +// persisted at least lateAddStallDwell. The first observation stamps the +// timestamp and returns false (start the window); a later observation +// returns true once the window has elapsed. Serialised by r.mu. +func (r *Reconciler) lateAddStallDwellElapsed(name string) bool { + r.mu.Lock() + defer r.mu.Unlock() + + now := time.Now() + + first, ok := r.firstLateAddStallAt[name] + if !ok { + r.firstLateAddStallAt[name] = now + + return false + } + + return now.Sub(first) >= lateAddStallDwell +} + +// clearLateAddStall drops the recorded stall start for a resource, so a +// transient stall that cleared restarts its dwell window from scratch the +// next time one is observed. Serialised by r.mu. +func (r *Reconciler) clearLateAddStall(name string) { + r.mu.Lock() + defer r.mu.Unlock() + + delete(r.firstLateAddStallAt, name) +} + // recoveryPromoteDue reports whether enough time has elapsed since this // node last fired a recovery-promote for `name` to fire another, and // records the fire time when it returns true. Serialised by r.mu so diff --git a/pkg/satellite/reconciler_drbd_test.go b/pkg/satellite/reconciler_drbd_test.go index e848f1d5..445863cb 100644 --- a/pkg/satellite/reconciler_drbd_test.go +++ b/pkg/satellite/reconciler_drbd_test.go @@ -3026,6 +3026,122 @@ func TestApplyDRBDAllocatesBackingForLateAddedVolume(t *testing.T) { } } +// TestApplyLateAddedVolumePerVolumeMetadataRunsDespitePreRenderedRes pins +// BUG-048 (P1, availability): the per-volume create-md + winner-seed pass +// MUST fire for a late-added volume EVEN WHEN the on-disk .res already +// carries the new volume's `volume N {` block. +// +// Root cause: the gate that admitted ensurePerVolumeMetadata used to be +// hasLateAddedVolume(resPath, dr) — a comparison of the desired volume +// set against the `volume N {` blocks rendered in the OLD .res. But the +// FSM dispatch runs renderResFile as a preamble for every adjust/up, so +// on a concurrent late vd-add an EARLIER reconcile's adjust re-rendered +// .res to include the new volume's block BEFORE this gate read it. The +// gate then saw "no late volume" → SKIPPED create-md + the winner seed +// → the new volume came up via the kernel adjust with no metadata and no +// elected SyncSource → Inconsistent on every replica, permanently +// wedged. The gate now keys off the race-free MetadataCreated signal +// (this RD is past first activation), and the idempotent per-volume +// HasMD probe inside ensurePerVolumeMetadata is the sole authority for +// which volumes actually need create-md. +// +// Distinguishing factor vs TestApplyDRBDAllocatesBackingForLateAddedVolume: +// here the seeded .res ALREADY contains `volume 1 { ... }` (simulating +// the pre-render race). Under the old .res-count gate this Apply did +// NOTHING for vol-1; under the fix it still create-md's + seeds it. +func TestApplyLateAddedVolumePerVolumeMetadataRunsDespitePreRenderedRes(t *testing.T) { + dir := t.TempDir() + fx := storage.NewFakeExec() + + fx.Expect("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings -o lv_name vg/pvc-race_00000", + storage.FakeResponse{Stdout: []byte("pvc-race_00000\n")}) + fx.Expect("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings --separator | -o lv_path,lv_size --units k --nosuffix vg/pvc-race_00000", + storage.FakeResponse{Stdout: []byte("/dev/vg/pvc-race_00000|1048576\n")}) + fx.Expect("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings -o lv_name vg/pvc-race_00001", + storage.FakeResponse{Stdout: []byte("pvc-race_00001\n")}) + fx.Expect("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings --separator | -o lv_path,lv_size --units k --nosuffix vg/pvc-race_00001", + storage.FakeResponse{Stdout: []byte("/dev/vg/pvc-race_00001|1048576\n")}) + + // vol-0 already has metadata; vol-1 does NOT (the late-added volume). + fx.Expect("drbdadm dump-md pvc-race/0", + storage.FakeResponse{Stdout: []byte("version \"v09\";\nla-size-sect 2048;\n")}) + fx.Expect("drbdadm dump-md pvc-race/1", + storage.FakeResponse{Err: errDrbdadmDumpMdNoMeta}) + + // The fix MUST still create-md vol-1 despite the pre-rendered .res. + fx.Expect(fmt.Sprintf("drbdadm create-md --force --max-peers=%d pvc-race/1", drbd.MaxPeers-1), + storage.FakeResponse{}) + + fx.Expect("drbdsetup status pvc-race", + storage.FakeResponse{Stdout: []byte("pvc-race role:Secondary\n volume:0 disk:UpToDate\n")}) + fx.Expect("drbdsetup status --verbose pvc-race", + storage.FakeResponse{Stdout: []byte("pvc-race role:Secondary\n volume:0 disk:UpToDate\n")}) + fx.Expect("drbdadm adjust pvc-race", storage.FakeResponse{}) + + thin := lvm.NewThin(lvm.ThinConfig{VolumeGroup: "vg", ThinPool: "tp"}, fx) + rec := satellite.NewReconciler(satellite.ReconcilerConfig{ + Providers: map[string]storage.Provider{"thin1": thin}, + Adm: drbd.NewAdm(fx), + StateDir: dir, + NodeName: "n1", + }) + + // THE RACE: the .res ALREADY carries volume 1 (the FSM render-preamble + // of a prior reconcile pre-populated it). The old hasLateAddedVolume + // gate would see both blocks present and skip create-md for vol-1. + resPath := filepath.Join(dir, "pvc-race.res") + preRendered := "resource pvc-race {\n" + + " on n1 {\n" + + " volume 0 {\n }\n" + + " volume 1 {\n }\n" + + " }\n}\n" + if err := os.WriteFile(resPath, []byte(preRendered), 0o600); err != nil { + t.Fatalf("seed .res: %v", err) + } + if err := os.WriteFile(filepath.Join(dir, "pvc-race.md-created"), nil, 0o600); err != nil { + t.Fatalf("seed md-created: %v", err) + } + + dr := []*intent.DesiredResource{ + { + Name: "pvc-race", + NodeName: "n1", + MetadataCreated: true, + Volumes: []*intent.DesiredVolume{ + {VolumeNumber: 0, SizeKib: 1024 * 1024, StoragePool: "thin1"}, + {VolumeNumber: 1, SizeKib: 1024 * 1024, StoragePool: "thin1"}, + }, + SkipInitialSync: skipInitTrue(), + DrbdOptions: map[string]string{ + "port": "7000", "node-id": "0", "address": "10.0.0.1", "minor": "1000", + }, + }, + } + + results, err := rec.Apply(t.Context(), dr) + if err != nil { + t.Fatalf("Apply: %v", err) + } + if len(results) != 1 || !results[0].GetOk() { + t.Fatalf("Apply: expected Ok=true; got results=%+v", results) + } + + calls := fx.CommandLines() + + wantCreateMD := fmt.Sprintf("drbdadm create-md --force --max-peers=%d pvc-race/1", drbd.MaxPeers-1) + if !slices.Contains(calls, wantCreateMD) { + t.Errorf("BUG-048: per-volume create-md for vol-1 SKIPPED despite pre-rendered .res — "+ + "the late-added volume gets no metadata + no winner seed and wedges Inconsistent; "+ + "want %q in %v", wantCreateMD, calls) + } + + // vol-0 must NOT be re-created (would wipe its GI + bitmap). + forbidden := fmt.Sprintf("drbdadm create-md --force --max-peers=%d pvc-race/0", drbd.MaxPeers-1) + if slices.Contains(calls, forbidden) { + t.Errorf("re-ran create-md on vol-0 (would wipe operator-stamped metadata): %v", calls) + } +} + // TestApplyLateAddedVolumeWinnerSeedsUpToDate pins Bug 384 (P0, data // integrity — regression of Bug 79/332): when `linstor vd c 1G` // adds vol-1 to a multi-replica RD whose vol-0 already reached diff --git a/pkg/store/inmemory_volume_definition.go b/pkg/store/inmemory_volume_definition.go index ae40801c..871b1949 100644 --- a/pkg/store/inmemory_volume_definition.go +++ b/pkg/store/inmemory_volume_definition.go @@ -86,6 +86,43 @@ func (s *inMemoryVolumeDefinitions) Create(_ context.Context, rdName string, vd return nil } +// CreateAutoNumbered allocates the smallest free non-negative +// VolumeNumber under the write lock so the read of the existing set and +// the insert are a single atomic step — the InMemory equivalent of the +// k8s backend's allocate-inside-RetryOnConflict. BUG-048: a REST-side +// "List then Create" sequence has a TOCTOU window two concurrent +// `vd c` calls fall into (both pick the same hole, the loser is +// rejected and its volume silently lost); doing the allocation here +// closes it. +func (s *inMemoryVolumeDefinitions) CreateAutoNumbered(_ context.Context, rdName string, vd *apiv1.VolumeDefinition) (int32, error) { + if vd == nil { + return 0, errors.New("nil VolumeDefinition") + } + + s.mu.Lock() + defer s.mu.Unlock() + + // Smallest-hole walk under the lock so the read of the used set and + // the insert are a single atomic step. Mirrors upstream LINSTOR's + // rule (VDs 0 and 2 present → 1, not 3). + used := make(map[int32]bool) + + for k := range s.m { + if k.rd == rdName { + used[k.vol] = true + } + } + + var assigned int32 + for assigned = 0; used[assigned]; assigned++ { //nolint:revive // empty body: the increment IS the body + } + + vd.VolumeNumber = assigned + s.m[vdKey{rdName, assigned}] = *vd + + return assigned, nil +} + func (s *inMemoryVolumeDefinitions) Update(_ context.Context, rdName string, vd *apiv1.VolumeDefinition) error { if vd == nil { return errors.New("nil VolumeDefinition") diff --git a/pkg/store/k8s/k8s.go b/pkg/store/k8s/k8s.go index b133a5c2..42ef7688 100644 --- a/pkg/store/k8s/k8s.go +++ b/pkg/store/k8s/k8s.go @@ -63,13 +63,25 @@ type Store struct { // New wraps a controller-runtime client and returns a store.Store. func New(c ctrlclient.Client) *Store { + return NewWithAPIReader(c, nil) +} + +// NewWithAPIReader is New plus a direct (uncached) API reader. The +// reader is used ONLY where a cache-lag read would be incorrect — the +// BUG-048 atomic VolumeNumber allocation, where retrying an optimistic- +// lock conflict against a stale informer cache re-derives the SAME +// number and the create-loop never converges (the second of two +// concurrent `vd c` is dropped). Pass mgr.GetAPIReader() in production; +// nil is accepted and every path falls back to the cached client +// (in-memory / unit harnesses that have no informer). +func NewWithAPIReader(c ctrlclient.Client, apiReader ctrlclient.Reader) *Store { s := &Store{c: c} s.nodes = &nodes{c: c} s.storagePools = &storagePools{c: c} s.resourceGroups = &resourceGroups{c: c} s.resourceDefinitions = &resourceDefinitions{c: c} s.resources = &resources{c: c} - s.volumeDefinitions = &volumeDefinitions{c: c} + s.volumeDefinitions = &volumeDefinitions{c: c, apiReader: apiReader} s.snapshots = &snapshots{c: c} s.physicalDevices = &physicalDevices{c: c} s.controllerProps = &controllerProps{c: c} diff --git a/pkg/store/k8s/k8s_test.go b/pkg/store/k8s/k8s_test.go index dd731354..7d7c6f57 100644 --- a/pkg/store/k8s/k8s_test.go +++ b/pkg/store/k8s/k8s_test.go @@ -23,6 +23,7 @@ import ( "os" "path/filepath" "runtime" + "sync" "testing" "k8s.io/apimachinery/pkg/runtime/schema" @@ -31,6 +32,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/envtest" crdv1alpha1 "github.com/cozystack/blockstor/api/v1alpha1" + apiv1 "github.com/cozystack/blockstor/pkg/api/v1" "github.com/cozystack/blockstor/pkg/store" "github.com/cozystack/blockstor/pkg/store/k8s" "github.com/cozystack/blockstor/pkg/store/storetest" @@ -164,6 +166,83 @@ func TestK8sVolumeDefinitionStore(t *testing.T) { }) } +// TestK8sVolumeDefinitionConcurrentAutoNumber is the BUG-048 (P1, +// availability) regression at the store layer against a REAL apiserver: +// N goroutines each call CreateAutoNumbered on the same RD concurrently. +// Every call must succeed and land at a DISTINCT VolumeNumber — the +// allocate-inside-RetryOnConflict loop must converge under genuine +// optimistic-lock 409s rather than dropping the racing creates. +// +// envtest's client is a direct (uncached) reader, so this exercises the +// retry-on-409 convergence; the cache-lag half of BUG-048 (which needs +// the informer-cached client + GetAPIReader fallback) is covered by the +// live-stand cli-matrix cell. Together they pin both halves. +func TestK8sVolumeDefinitionConcurrentAutoNumber(t *testing.T) { + if fixture == nil { + t.Skip("envtest assets not installed; run `make setup-envtest` to enable") + } + + t.Cleanup(func() { wipeAll(t, fixture.client) }) + + st := k8s.New(fixture.client) + ctx := context.Background() + + if err := st.ResourceDefinitions().Create(ctx, &apiv1.ResourceDefinition{Name: "pvc-conc"}); err != nil { + t.Fatalf("seed RD: %v", err) + } + + const n = 6 + + var ( + wg sync.WaitGroup + mu sync.Mutex + errs []error + ) + + for range n { + wg.Add(1) + + go func() { + defer wg.Done() + + vd := apiv1.VolumeDefinition{SizeKib: 4096} + if _, err := st.VolumeDefinitions().CreateAutoNumbered(ctx, "pvc-conc", &vd); err != nil { + mu.Lock() + errs = append(errs, err) + mu.Unlock() + } + }() + } + + wg.Wait() + + for _, e := range errs { + t.Errorf("concurrent CreateAutoNumbered failed (a volume was dropped): %v", e) + } + + vds, err := st.VolumeDefinitions().List(ctx, "pvc-conc") + if err != nil { + t.Fatalf("List: %v", err) + } + if len(vds) != n { + t.Fatalf("got %d VDs after %d concurrent auto-numbered creates, want %d (lost update)", len(vds), n, n) + } + + seen := map[int32]bool{} + for i := range vds { + if seen[vds[i].VolumeNumber] { + t.Fatalf("duplicate VolumeNumber %d", vds[i].VolumeNumber) + } + seen[vds[i].VolumeNumber] = true + } + + for want := range int32(n) { + if !seen[want] { + t.Errorf("missing VolumeNumber %d in %v — a concurrent create was dropped", want, seen) + } + } +} + // TestK8sSnapshotStore runs the shared SnapshotStore suite. func TestK8sSnapshotStore(t *testing.T) { if fixture == nil { diff --git a/pkg/store/k8s/volume_definitions.go b/pkg/store/k8s/volume_definitions.go index 574a5da5..0ba15f79 100644 --- a/pkg/store/k8s/volume_definitions.go +++ b/pkg/store/k8s/volume_definitions.go @@ -24,6 +24,7 @@ import ( "github.com/cockroachdb/errors" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/util/retry" ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" @@ -39,6 +40,18 @@ import ( // RD anyway, and a single CRD makes ownership/reclamation trivially correct. type volumeDefinitions struct { c ctrlclient.Client + + // apiReader is a direct, UNCACHED reader (mgr.GetAPIReader()) used + // only by CreateAutoNumbered's retry loop. The cached client's Get + // trails a just-committed write by the informer round-trip, so + // retrying an optimistic-lock conflict against the cache re-reads + // the stale RD, re-derives the SAME smallest-free VolumeNumber, and + // the loop never converges — the BUG-048 lost-update where the + // second of two concurrent `vd c` is dropped. Reading live on each + // attempt makes the allocation see the winner's committed write. + // nil in non-production stores (in-memory / unit) → fall back to the + // cached client. + apiReader ctrlclient.Reader } func (s *volumeDefinitions) List(ctx context.Context, rdName string) ([]apiv1.VolumeDefinition, error) { @@ -103,6 +116,122 @@ func (s *volumeDefinitions) Create(ctx context.Context, rdName string, vd *apiv1 }), "update RD %q to add volume %d", rdName, vd.VolumeNumber) } +// CreateAutoNumbered allocates the smallest free non-negative +// VolumeNumber INSIDE the conflict-retry loop and persists `vd` under +// it, returning the assigned number. BUG-048 (P1, availability): the +// REST handler used to pick the number in a separate "List the RD → +// smallest hole" step BEFORE this Create, so two concurrent +// `linstor vd c ` calls both read `[vol-0]`, both decided VlmNr=1, +// and the loser was rejected with FAIL_EXISTS_VLM_DFN — the operator's +// second intended volume silently vanished (only one VD landed, no +// usable error: the message claimed vol-1 "already exists" though the +// operator never named a number). Re-deriving the hole on every retry +// attempt makes the read-of-existing-set and the append atomic with +// respect to a racing CreateAutoNumbered: the loser's retry re-fetches +// the RD now carrying `[vol-0, vol-1]` and lands at vol-2. +// +// On the k8s backend the optimistic-concurrency guarantee comes from +// the apiserver's resourceVersion check on Update: if a racing write +// landed between our fetch and Update, the Update 409s, isConflictOr +// NotFound retries, and the whole allocate+append re-runs against the +// fresh RD. (Mirrors the existing explicit-number Create's retry; the +// only difference is the number is computed here rather than supplied.) +func (s *volumeDefinitions) CreateAutoNumbered(ctx context.Context, rdName string, vd *apiv1.VolumeDefinition) (int32, error) { + if vd == nil { + return 0, errors.New("nil VolumeDefinition") + } + + var assigned int32 + + // Generous retry budget: each attempt does a LIVE (uncached) RD read + // so a conflict-retry sees the racing winner's committed VD and lands + // at the next free number. retry.DefaultRetry (≈5 steps) is enough + // for the common 2-way race, but widen it so a burst of concurrent + // `vd c` against one RD still converges rather than dropping the + // straggler. + backoff := retry.DefaultRetry + backoff.Steps = 12 + + err := retry.OnError(backoff, isConflictOrNotFound, func() error { + rd, fetchErr := s.fetchRDLive(ctx, rdName) + if fetchErr != nil { + return fetchErr + } + + assigned = smallestFreeVolumeNumber(rd.Spec.VolumeDefinitions) + + entry := wireToCRDVD(vd) + entry.VolumeNumber = assigned + + rd.Spec.VolumeDefinitions = append(rd.Spec.VolumeDefinitions, entry) + + updErr := s.c.Update(ctx, rd) + if updErr != nil { + return updErr //nolint:wrapcheck // outer errors.Wrapf adds context; the bare error preserves the apierrors type isConflictOrNotFound matches on + } + + // Post-commit verification (BUG-048 defence-in-depth). The + // optimistic-lock Update normally guarantees no racing writer + // clobbered us — a stale resourceVersion 409s and we retry. But + // under heavy apiserver/etcd load a follower read can hand back a + // resourceVersion that is already superseded yet still accepted on + // Update, so two concurrent creates could both "succeed" while + // only one VolumeDefinition lands (the silent lost-update the + // operator hit). Re-read live and confirm our assigned number is + // actually present; if it vanished (a racer's clobber won), force + // a retry by surfacing a synthetic Conflict so the whole + // allocate+append re-runs against the now-correct state. This + // makes the silent drop impossible: the create either persists or + // retries — it never returns success having lost the volume. + // + // Crucially this only holds when the verifying read is LIVE. With + // no uncached reader wired (apiReader == nil) the re-read falls + // back to the informer cache, which trails the Update we just + // committed — so it routinely fails to observe our own freshly + // written volume and surfaces a FALSE synthetic Conflict. The + // retry then re-derives the next free number off the same lagging + // cache and appends a SECOND volume, so a single auto-numbered + // create can leave several phantom VolumeDefinitions behind + // (BUG-048 de-regress). A successful optimistic-locked Update is + // already the apiserver's authoritative confirmation; without a + // live reader there is nothing trustworthy to verify against, so + // skip the check rather than second-guess a committed write + // against a stale cache. Production wires mgr.GetAPIReader(), so + // the live verification path is preserved where it actually + // guards the lost-update. + if s.apiReader == nil { + return nil + } + + return s.verifyVolumeLanded(ctx, rdName, assigned) + }) + if err != nil { + return 0, errors.Wrapf(err, "auto-numbered create on RD %q", rdName) + } + + vd.VolumeNumber = assigned + + return assigned, nil +} + +// smallestFreeVolumeNumber returns the lowest non-negative VolumeNumber +// not present in vds. Mirrors upstream LINSTOR's smallest-hole rule +// (VDs 0 and 2 present → 1, not 3). +func smallestFreeVolumeNumber(vds []crdv1alpha1.ResourceDefinitionVolume) int32 { + used := make(map[int32]bool, len(vds)) + for i := range vds { + used[vds[i].VolumeNumber] = true + } + + for candidate := int32(0); candidate >= 0; candidate++ { + if !used[candidate] { + return candidate + } + } + + return 0 +} + func (s *volumeDefinitions) Update(ctx context.Context, rdName string, vd *apiv1.VolumeDefinition) error { if vd == nil { return errors.New("nil VolumeDefinition") @@ -243,6 +372,58 @@ func (s *volumeDefinitions) fetchRD(ctx context.Context, rdName string) (*crdv1a return &rd, nil } +// fetchRDLive reads the RD through the direct, UNCACHED API reader when +// one is wired (production), so CreateAutoNumbered's retry sees the +// latest committed VolumeDefinitions rather than a stale informer-cache +// revision (BUG-048). Falls back to the cached client when no apiReader +// is configured (in-memory / unit harnesses). The returned object is +// still safe to mutate + write back through s.c (the resourceVersion the +// live read carries is what the subsequent optimistic-locked Update +// checks against). +func (s *volumeDefinitions) fetchRDLive(ctx context.Context, rdName string) (*crdv1alpha1.ResourceDefinition, error) { + if s.apiReader == nil { + return s.fetchRD(ctx, rdName) + } + + var rd crdv1alpha1.ResourceDefinition + + err := s.apiReader.Get(ctx, types.NamespacedName{Name: Name(rdName)}, &rd) + if err != nil { + if apierrors.IsNotFound(err) { + return nil, errors.Wrapf(store.ErrNotFound, "resource definition %q", rdName) + } + + return nil, errors.Wrapf(err, "get ResourceDefinition %q (live)", rdName) + } + + return &rd, nil +} + +// verifyVolumeLanded re-reads the RD live and returns a synthetic +// Conflict error (so the CreateAutoNumbered retry loop re-runs) when the +// just-written VolumeNumber is NOT present — the BUG-048 stale-read +// clobber. Returns nil when the volume is durably present. A NotFound +// (RD vanished) is surfaced as-is so the retry loop's isConflictOrNotFound +// also catches it. +func (s *volumeDefinitions) verifyVolumeLanded(ctx context.Context, rdName string, assigned int32) error { + rd, err := s.fetchRDLive(ctx, rdName) + if err != nil { + return err + } + + for i := range rd.Spec.VolumeDefinitions { + if rd.Spec.VolumeDefinitions[i].VolumeNumber == assigned { + return nil + } + } + + return apierrors.NewConflict( + schema.GroupResource{Group: crdv1alpha1.GroupVersion.Group, Resource: "resourcedefinitions"}, + Name(rdName), + errors.Newf("volume %d did not persist (concurrent clobber); retrying allocation", assigned), + ) +} + func crdToWireVD(vd *crdv1alpha1.ResourceDefinitionVolume) apiv1.VolumeDefinition { return apiv1.VolumeDefinition{ VolumeNumber: vd.VolumeNumber, diff --git a/pkg/store/store.go b/pkg/store/store.go index 32bcfc4d..e341a06b 100644 --- a/pkg/store/store.go +++ b/pkg/store/store.go @@ -250,6 +250,26 @@ type VolumeDefinitionStore interface { List(ctx context.Context, rdName string) ([]apiv1.VolumeDefinition, error) Get(ctx context.Context, rdName string, volumeNumber int32) (apiv1.VolumeDefinition, error) Create(ctx context.Context, rdName string, vd *apiv1.VolumeDefinition) error + + // CreateAutoNumbered allocates the smallest free non-negative + // VolumeNumber for the parent RD and persists `vd` under it, + // returning the assigned number. Unlike a REST-side "List then pick + // the hole then Create" sequence, the allocation happens INSIDE the + // store's conflict-retry loop (k8s) / write lock (inmemory), so the + // read of the existing VD set and the write of the new entry are + // atomic with respect to a concurrent CreateAutoNumbered on the same + // RD. This closes the BUG-048 lost-update race: two back-to-back + // `linstor vd c ` calls used to both read `[vol-0]`, both pick + // VlmNr=1, and the loser was rejected with FAIL_EXISTS_VLM_DFN — the + // operator's second intended volume was silently dropped. With the + // allocation re-run per retry attempt, the loser re-reads `[vol-0, + // vol-1]` and lands at vol-2. + // + // vd.VolumeNumber on input is ignored — the store owns the + // allocation. On success vd.VolumeNumber is set to the assigned + // value for the caller's convenience and the same value is returned. + CreateAutoNumbered(ctx context.Context, rdName string, vd *apiv1.VolumeDefinition) (int32, error) + Update(ctx context.Context, rdName string, vd *apiv1.VolumeDefinition) error Delete(ctx context.Context, rdName string, volumeNumber int32) error diff --git a/pkg/store/storetest/storetest.go b/pkg/store/storetest/storetest.go index 5ae2a1c0..5babf5ea 100644 --- a/pkg/store/storetest/storetest.go +++ b/pkg/store/storetest/storetest.go @@ -208,6 +208,10 @@ func RunVolumeDefinitionStore(t *testing.T, newStore Factory) { t.Errorf("dup: got %v, want ErrAlreadyExists", err) } }) + // BUG-048: CreateAutoNumbered allocates the smallest free hole and + // the allocation is atomic with the write (the REST handler routes + // every number-less `linstor vd c` here). + runVolumeDefinitionAutoNumberCases(t, newStore) t.Run("GetMissing", func(t *testing.T) { s := newStore(t) seedRD(t, s, "pvc-1") @@ -332,6 +336,62 @@ func RunVolumeDefinitionStore(t *testing.T, newStore Factory) { t.Run("UpdateNilArg", func(t *testing.T) { testVDUpdateNilArg(t, newStore) }) } +// runVolumeDefinitionAutoNumberCases pins the BUG-048 atomic-allocate +// contract: sequential adds land at 0, 1, 2 … and a hole opened by a +// Delete is reused (upstream LINSTOR's smallest-hole rule). Split out +// of RunVolumeDefinitionStore to keep that function under maintidx. +func runVolumeDefinitionAutoNumberCases(t *testing.T, newStore Factory) { + t.Helper() + + t.Run("CreateAutoNumberedSequential", func(t *testing.T) { + s := newStore(t) + seedRD(t, s, "pvc-1") + ctx := t.Context() + + for i := range 3 { + want := int32(i) + vd := apiv1.VolumeDefinition{SizeKib: 1024} + + got, err := s.VolumeDefinitions().CreateAutoNumbered(ctx, "pvc-1", &vd) + if err != nil { + t.Fatalf("CreateAutoNumbered #%d: %v", want, err) + } + if got != want { + t.Fatalf("CreateAutoNumbered #%d: got VlmNr=%d, want %d", want, got, want) + } + if vd.VolumeNumber != want { + t.Fatalf("CreateAutoNumbered #%d: vd.VolumeNumber=%d, want %d", want, vd.VolumeNumber, want) + } + } + }) + t.Run("CreateAutoNumberedReusesHole", func(t *testing.T) { + s := newStore(t) + seedRD(t, s, "pvc-1") + ctx := t.Context() + + for range 3 { + vd := apiv1.VolumeDefinition{SizeKib: 1024} + if _, err := s.VolumeDefinitions().CreateAutoNumbered(ctx, "pvc-1", &vd); err != nil { + t.Fatalf("seed CreateAutoNumbered: %v", err) + } + } + // Remove the middle volume so a hole opens at 1. + if err := s.VolumeDefinitions().Delete(ctx, "pvc-1", 1); err != nil { + t.Fatalf("Delete: %v", err) + } + + vd := apiv1.VolumeDefinition{SizeKib: 1024} + + got, err := s.VolumeDefinitions().CreateAutoNumbered(ctx, "pvc-1", &vd) + if err != nil { + t.Fatalf("CreateAutoNumbered after delete: %v", err) + } + if got != 1 { + t.Fatalf("CreateAutoNumbered after delete: got VlmNr=%d, want 1 (smallest hole)", got) + } + }) +} + func testVDCreateNilArg(t *testing.T, newStore Factory) { t.Helper() diff --git a/tests/e2e/cli-matrix/multi-volume-late-vd-create.sh b/tests/e2e/cli-matrix/multi-volume-late-vd-create.sh index 6c9ef09c..ec7f59ef 100755 --- a/tests/e2e/cli-matrix/multi-volume-late-vd-create.sh +++ b/tests/e2e/cli-matrix/multi-volume-late-vd-create.sh @@ -2,7 +2,8 @@ # # usage: multi-volume-late-vd-create.sh WORK_DIR # -# L6 cli-matrix cell — Bug 332 (regression of Bug 79, P1). +# L6 cli-matrix cell — Bug 332 (regression of Bug 79, P1) + +# BUG-048 (P1, availability — concurrent late vd-add). # # Reproduction from the e2e2 stand: # @@ -10,8 +11,8 @@ # $ linstor vd c test2 1G # vol-0 # $ linstor r c test2 --auto-place=3 -s lvm-thin # # wait until all 3 replicas reach UpToDate -# $ linstor vd c test2 1G # vol-1 — late VD -# $ linstor vd c test2 1G # vol-2 — late VD +# $ linstor vd c test2 1G & # vol-1 — late VD +# $ linstor vd c test2 1G & # vol-2 — late VD (concurrent!) # # $ drbdadm status test2 # test2 role:Secondary suspended:quorum @@ -19,14 +20,30 @@ # volume:1 disk:Diskless quorum:no ← Unintentional Diskless # volume:2 disk:Diskless quorum:no ← Unintentional Diskless # -# Expected: late-added vol-1 / vol-2 each get their backing LV -# allocated on every diskful replica, drbdmeta create-md fires -# per-volume, the kernel slot picks up the new volumes, and every -# (replica, volume) pair settles UpToDate within 60s. +# Two distinct failure modes are pinned here: # -# Unit pin: pkg/satellite/reconciler_drbd_test.go:: -# TestApplyDRBDAllocatesBackingForLateAddedVolume -# verifies the satellite's per-volume create-md gate via FakeExec. +# - Bug 332 (per-volume create-md): a late VD's backing LV is never +# stamped, so the kernel brings it up disk:Diskless while the spec +# lacks DISKLESS ("Unintentional Diskless"). +# +# - BUG-048 (concurrent auto-assign lost-update): two BACK-TO-BACK +# number-less `vd c` calls both auto-assign the smallest free +# VolumeNumber off the same pre-write snapshot, both pick VlmNr=1, +# and the loser is rejected — the operator's SECOND volume silently +# vanishes (only vol-0 + vol-1 land, vol-2 never created). The two +# late adds below run CONCURRENTLY to exercise this; the cell then +# asserts the RD carries exactly 3 VolumeDefinitions. +# +# Expected (post-fix): both concurrent late adds land at distinct +# VolumeNumbers (1 and 2), each gets its backing LV + per-volume +# create-md on every diskful replica, the satellite's per-fresh-volume +# winner election seeds a SyncSource, and every (replica, volume) pair +# settles UpToDate. +# +# Unit pins: pkg/satellite/reconciler_drbd_test.go:: +# TestApplyDRBDAllocatesBackingForLateAddedVolume (per-volume create-md) +# and pkg/rest/volume_definitions_test.go:: +# TestVolumeDefinitionsConcurrentAutoNumberNoLostUpdate (BUG-048 race). # This L6 cell is the kernel-truth half — only the real stand can # observe the actual `drbdadm status` output that surfaced the bug. @@ -123,27 +140,70 @@ if ! wait_all_replicas_volume_uptodate 0 240; then exit 1 fi -# THE BUG: add vol-1 and vol-2 AFTER vol-0 is UpToDate. -echo ">> [Bug 332] late vd c (vol-1)" -"${LCTL[@]}" volume-definition create "$RD" 1G >/dev/null +# THE BUG (BUG-048, P1 availability): add vol-1 and vol-2 AFTER vol-0 is +# UpToDate, CONCURRENTLY (back-to-back, no convergence wait between them). +# Both number-less `vd c` calls auto-assign the smallest free VolumeNumber +# — pre-fix each read [vol-0] and BOTH chose VlmNr=1, so the loser was +# rejected FAIL_EXISTS_VLM_DFN and the operator's second intended volume +# silently vanished (only vol-0 + vol-1 persisted; vol-2 never created). +# Running them truly concurrently is what exercises the lost-update race; +# the prior sequential `vd c \n vd c` shape let the first REST response +# land before the second auto-assign ran and so masked the bug. +echo ">> [BUG-048] CONCURRENT late vd c (vol-1 + vol-2 back-to-back)" +vdc_rc=0 +"${LCTL[@]}" volume-definition create "$RD" 1G >/tmp/bug048-vdcA.out 2>&1 & +pA=$! +"${LCTL[@]}" volume-definition create "$RD" 1G >/tmp/bug048-vdcB.out 2>&1 & +pB=$! +wait "$pA" || vdc_rc=1 +wait "$pB" || vdc_rc=1 + +# BUG-048 wire-level assertion: BOTH concurrent adds must land. The RD +# must carry exactly THREE VolumeDefinitions (vol-0 + the two late adds); +# a lost-update leaves only two and one `vd c` reports a spurious +# "volume definition 1 already exists" even though the operator named no +# number. This catches the silent drop BEFORE the (slower) convergence +# wait below — and even if DRBD later happened to converge the survivors. +echo ">> [BUG-048] assert no concurrent vd-add was dropped (expect 3 VDs)" +vd_count=$("${LCTL[@]}" --machine-readable volume-definition list \ + --resource-definitions "$RD" 2>/dev/null \ + | jq -r '[.[]? | .[]? | (.vlm_dfns // .volume_definitions // [])[]?] | length' \ + 2>/dev/null || echo 0) +if [[ "$vd_count" != "3" ]]; then + echo "FAIL (BUG-048): RD $RD has $vd_count VolumeDefinitions after two concurrent late vd c, want 3 — a concurrent add was silently dropped" >&2 + echo "----- vd c #A output -----" >&2; cat /tmp/bug048-vdcA.out >&2 || true + echo "----- vd c #B output -----" >&2; cat /tmp/bug048-vdcB.out >&2 || true + "${LCTL[@]}" volume-definition list --resource-definitions "$RD" 2>&1 | tail -20 >&2 + exit 1 +fi +echo " 3 VolumeDefinitions present — no concurrent add dropped" -echo ">> [Bug 332] late vd c (vol-2)" -"${LCTL[@]}" volume-definition create "$RD" 1G >/dev/null +if (( vdc_rc != 0 )); then + # Both VDs landed (count==3) yet a CLI returned non-zero: that is the + # acceptable-but-noteworthy "fail loudly" path, not the silent wedge. + echo " note: a concurrent vd c exited non-zero but both volumes were created" >&2 +fi -# Each late-added 1G volume needs its own initial sync on every replica; -# under sweep load that takes well past the old 60s budget, so give the -# late volumes the same 240s headroom vol-0 gets. -echo ">> wait up to 240s for vol-1 + vol-2 to reach UpToDate on all 3 replicas" +# Each late-added 1G volume needs its own initial sync on every replica. +# Here TWO 1G volumes are added concurrently on a 3-diskful RD, so up to +# 6 fresh (replica,volume) initial syncs run at once and share the loop +# substrate's I/O — convergence routinely runs past the single-volume +# 240s budget (observed: vol-2 still `Outdated` — i.e. converging, NOT +# the wedge's `Inconsistent` — at the 240s mark, UpToDate moments later). +# 360s per volume keeps the concurrent path from flaking while still +# being far inside the wedge discriminator (a real BUG-048 wedge sits +# Inconsistent forever with no SyncSource and never advances). +echo ">> wait up to 360s for vol-1 + vol-2 to reach UpToDate on all 3 replicas" late_up=true for vol in 1 2; do - if ! wait_all_replicas_volume_uptodate "$vol" 240; then + if ! wait_all_replicas_volume_uptodate "$vol" 360; then late_up=false break fi done if [[ "$late_up" != "true" ]]; then - echo "FAIL (Bug 332): late-added vol-1/vol-2 not UpToDate on all 3 replicas within 240s" >&2 + echo "FAIL (Bug 332): late-added vol-1/vol-2 not UpToDate on all 3 replicas within 360s" >&2 "${LCTL[@]}" resource list --resources "$RD" 2>&1 | tail -40 >&2 # Surface the smoking gun: per (node, vol) disk_state straight from # the populated Resource.Status — a Bug-332-bitten path shows vol-1 @@ -187,4 +247,4 @@ else echo "SKIP-PARTIAL: drbdsetup status on $satellite_node failed; REST-level pin still asserted" fi -echo ">> multi-volume-late-vd-create OK (Bug 332 pinned: late vd c on $RD brought vol-1/vol-2 to UpToDate, no Unintentional Diskless)" +echo ">> multi-volume-late-vd-create OK (Bug 332 + BUG-048 pinned: two CONCURRENT late vd c on $RD both landed at distinct VlmNrs and brought vol-1/vol-2 to UpToDate, no dropped VD, no Unintentional Diskless)" diff --git a/tests/integration/harness/manager.go b/tests/integration/harness/manager.go index ce819853..93d4deac 100644 --- a/tests/integration/harness/manager.go +++ b/tests/integration/harness/manager.go @@ -105,7 +105,17 @@ func StartStack(t *testing.T) *Stack { t.Fatalf("build manager: %v", err) } - st := storek8s.New(mgr.GetClient()) + // Mirror cmd/apiserver/main.go: the REST-serving store is built + // with the manager's direct (uncached) API reader. CreateAutoNumbered's + // retry loop reads the parent RD through this reader so a conflict- + // retry observes the just-committed VolumeDefinition rather than a + // stale informer-cache revision. Without it (the plain New) the + // cache-lag re-read re-derives the same hole, 409-storms against the + // RD reconciler's concurrent writes, and the slow create makes the + // linstor client re-POST — leaving duplicate auto-numbered VDs + // (BUG-048 de-regress). Production never hit this because the apiserver + // always wires GetAPIReader(); the harness must match. + st := storek8s.NewWithAPIReader(mgr.GetClient(), mgr.GetAPIReader()) // Wire every reconciler / runnable cmd/controller/main.go // registers. Mirror order exactly so a future split-or-merge diff --git a/tests/operator-harness/lib.sh b/tests/operator-harness/lib.sh index 2ab6e3f5..33e0e649 100755 --- a/tests/operator-harness/lib.sh +++ b/tests/operator-harness/lib.sh @@ -603,6 +603,30 @@ except Exception: print(0)" "$vol" 2>/dev/null || echo 0) [[ "$actual" == "$expected" ]] ;; + vd_count) + # BUG-048: assert the RD carries EXACTLY `expected` + # VolumeDefinitions. A concurrent-auto-assign lost-update + # drops the second of two back-to-back number-less `vd c` + # calls, leaving one VD short — this is the wire-level + # signature that catches the silent drop independent of + # whether DRBD later converged the survivors. + local rd expected actual + rd=$(substitute "$(python3 -c "import json,sys; print(json.loads(sys.argv[1]).get('rd',''))" "$spec")") + expected=$(python3 -c "import json,sys; print(json.loads(sys.argv[1]).get('expected',0))" "$spec") + actual=$(linstor_cli -m volume-definition list --resource-definitions "$rd" 2>/dev/null \ + | python3 -c "import json,sys +try: + d=json.load(sys.stdin) + while isinstance(d, list) and d and isinstance(d[0], list): + d=d[0] + n=0 + for it in d if isinstance(d, list) else []: + n += len(it.get('vlm_dfns', []) or it.get('volume_definitions', []) or []) + print(n) +except Exception: + print(0)" 2>/dev/null || echo 0) + [[ "$actual" == "$expected" ]] + ;; pvc_capacity) # PVC.Status.Capacity matches expected (e.g. "2Gi"). # Verifies the operator-visible size propagation through diff --git a/tests/operator-harness/replay-runner.sh b/tests/operator-harness/replay-runner.sh index 552133fa..f1f8ddca 100755 --- a/tests/operator-harness/replay-runner.sh +++ b/tests/operator-harness/replay-runner.sh @@ -72,6 +72,8 @@ # - resource_absent wait until r d takes effect on a node # - rd_absent wait until rd is gone everywhere # - vd_size_kib VolumeDefinition.size_kib equals expected_kib +# - vd_count RD carries EXACTLY `expected` VolumeDefinitions +# (BUG-048 concurrent-add lost-update catcher) # - pvc_capacity PVC.Status.Capacity.storage matches expected # - pod_md5_invariant md5sum of file inside pod matches expected baseline # - volumes_settled every Resource of rd carries EXACTLY the diff --git a/tests/operator-harness/replay/vd-late-concurrent-no-drop.yaml b/tests/operator-harness/replay/vd-late-concurrent-no-drop.yaml new file mode 100644 index 00000000..0f9fed84 --- /dev/null +++ b/tests/operator-harness/replay/vd-late-concurrent-no-drop.yaml @@ -0,0 +1,108 @@ +name: vd-late-concurrent-no-drop +description: | + BUG-048 catcher (P1, availability). Short name on purpose: the + replay-runner prefixes `replay-` and suffixes a random token, and the + resulting RD name must stay within DRBD's resource-name length limit — + a longer name makes `resource-definition create` fail outright. + + Two back-to-back number-less + `linstor vd c ` calls against an RD that already carries vol-0 + (and whose RD.Spec.Initialized has flipped true) must BOTH land at + distinct VolumeNumbers — none silently dropped. + + Root cause: the REST handler auto-assigned the smallest free + VolumeNumber in a List taken BEFORE the store Create, so two adds + racing in the same window both read [vol-0], both chose VlmNr=1, the + loser was rejected FAIL_EXISTS_VLM_DFN, and the operator's second + intended volume silently vanished. When the loser backgrounds its CLI + there is no error surfaced at all — the volume simply never exists. + + Post-fix the smallest-free allocation moved INTO the store layer + (VolumeDefinitionStore.CreateAutoNumbered), re-derived on every + conflict-retry attempt, so the loser re-reads [vol-0, vol-1] and + lands at vol-2. + + Harness note: replay-runner executes each `cmd` as a single + sequential `linstor` invocation and has no concurrency primitive, so + this YAML drives the late adds back-to-back sequentially and pins the + POST-FIX invariant with `vd_count: 3` (the exact wire-level signature + a lost-update would violate — it leaves 2) plus `all_uptodate`. The + TRUE concurrent-race reproduction (background `&` + wait, which is + what exercises the auto-assign window) lives in the L6 kernel-truth + cell tests/e2e/cli-matrix/multi-volume-late-vd-create.sh, and the L1 + unit pin is pkg/rest/volume_definitions_test.go:: + TestVolumeDefinitionsConcurrentAutoNumberNoLostUpdate. + +prerequisites: + min_nodes: 2 + storage_pool: stand + +vars: + sp: stand + +steps: + - name: create-rd + cmd: ["resource-definition", "create", "{{rd}}"] + expect_exit: 0 + - name: create-vol0 + # 64M (not 1G): BUG-048 is a control-plane allocation defect, not a + # sync-timing race, so the trigger is size-independent. 64M converges + # in seconds on the I/O-constrained stand and keeps the all_uptodate + # awaits well inside budget. + cmd: ["volume-definition", "create", "{{rd}}", "64M"] + expect_exit: 0 + - name: r-c-node1 + cmd: ["resource", "create", "{{node1}}", "{{rd}}", "--storage-pool", "{{sp}}"] + expect_exit: 0 + - name: r-c-node2 + cmd: ["resource", "create", "{{node2}}", "{{rd}}", "--storage-pool", "{{sp}}"] + expect_exit: 0 + - name: wait-vol0-uptodate + cmd: ["resource", "list", "--resources", "{{rd}}"] + expect_exit: 0 + await: + kind: all_uptodate + rd: "{{rd}}" + timeout_s: 240 + - name: late-vd-create-second + # Late add #1 on the already-initialized RD → auto-assigns vol-1. + cmd: ["volume-definition", "create", "{{rd}}", "64M"] + expect_exit: 0 + - name: late-vd-create-third + # Late add #2 back-to-back → MUST auto-assign vol-2 (not collide on + # vol-1). Pre-fix, when these two raced, this one was the dropped add. + cmd: ["volume-definition", "create", "{{rd}}", "64M"] + expect_exit: 0 + await: + # BUG-048 signature: exactly 3 VDs must exist (vol-0,1,2). A + # lost-update leaves 2. + kind: vd_count + rd: "{{rd}}" + expected: 3 + timeout_s: 30 + - name: wait-all-volumes-uptodate + cmd: ["resource", "list", "--resources", "{{rd}}"] + expect_exit: 0 + await: + # Every volume of every replica converges — the elected per-fresh- + # volume winner seeds a SyncSource for both late adds. Two BUG-048 + # self-heals recover a concurrent-seed race: (a) when a volume comes + # up Inconsistent on EVERY replica with no source, the + # late-add-promote self-heal force-primaries the lowest-id node; (b) + # when a resync IS elected but STALLS (paused/bitmap-exchange-wedged + # on a >=3-replica add — out-of-sync frozen, two replicas never + # converge), the late-add resync-KICK self-heal disconnect+adjusts to + # restart it. Both add latency (wedge-detect + the 45s stall-dwell / + # recoveryPromoteThrottle window + the kick/promote + resync). On a + # contended stand those recovery paths run past the bare-convergence + # budget, so allow 360s — still far inside the wedge discriminator (a + # real unfixed wedge never advances at all). + kind: all_uptodate + rd: "{{rd}}" + timeout_s: 360 + +teardown: + - cmd: ["resource-definition", "delete", "{{rd}}"] + +invariants: + - no_orphans