From e63895ef437cbbbb95d48f8b1b60bfee9ede7f2f Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Sun, 14 Jun 2026 09:58:06 +0300 Subject: [PATCH 01/19] fix(rest,store): close concurrent late vd-add lost-update race (BUG-048) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two back-to-back number-less `linstor vd c ` calls against an RD that already carries vol-0 used to drop the second add: each request picked the smallest free VolumeNumber in a REST-side List BEFORE the store Create, so both read [vol-0], both chose VlmNr=1, the loser was rejected FAIL_EXISTS_VLM_DFN, and the operator's second intended volume silently vanished (only vol-0 + vol-1 landed). When the loser backgrounds its CLI the failure surfaces no usable error at all. Move the smallest-free allocation INTO the store layer as VolumeDefinitionStore.CreateAutoNumbered, which re-derives the hole on every conflict-retry attempt (k8s: allocate inside RetryOnConflict under the apiserver resourceVersion guard; inmemory: under the write lock). The read of the existing set and the append are now atomic with respect to a racing add, so the loser re-reads [vol-0, vol-1] and lands at vol-2. The explicit-number path keeps the plain Create (its retry already converges and a genuine duplicate must still 409). The DRBD per-volume day0/winner-election seed path (BUG-028/384 seedFreshVolumes/isLateAddWinner) is untouched and already converges concurrent late adds correctly once both volumes exist with distinct numbers — verified on stand by forcing distinct --vlmnr. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- pkg/rest/volume_definitions.go | 132 +++++++++++------------- pkg/rest/volume_definitions_test.go | 97 +++++++++++++++++ pkg/store/inmemory_volume_definition.go | 37 +++++++ pkg/store/k8s/volume_definitions.go | 69 +++++++++++++ pkg/store/store.go | 20 ++++ pkg/store/storetest/storetest.go | 60 +++++++++++ 6 files changed, 341 insertions(+), 74 deletions(-) diff --git a/pkg/rest/volume_definitions.go b/pkg/rest/volume_definitions.go index 0d10be23..c5740f3a 100644 --- a/pkg/rest/volume_definitions.go +++ b/pkg/rest/volume_definitions.go @@ -316,8 +316,18 @@ func (s *Server) handleVDCreate(w http.ResponseWriter, r *http.Request) { vd := envelope.VolumeDefinition - if !s.resolveVolumeNumber(w, r, rd, rawBody, &vd) { - return + autoNumber := !vdCreateVolumeNumberExplicit(rawBody) + + // Validate the explicit number (Bug 363) before the size gate so the + // operator-facing rejection order is stable. The auto-number path + // owns the number itself (the store allocates it), so it skips this. + if !autoNumber { + nrErr := validateVolumeNumber(vd.VolumeNumber) + if nrErr != nil { + writeVDNumberRejection(w, rd, vd.VolumeNumber, nrErr) + + return + } } // Bug 155: refuse out-of-bounds sizes at the REST boundary so the @@ -330,25 +340,7 @@ func (s *Server) handleVDCreate(w http.ResponseWriter, r *http.Request) { return } - err = s.Store.VolumeDefinitions().Create(r.Context(), rd, &vd) - if err != nil { - // Bug 140: duplicate-VD conflict gets a typed envelope with - // the upstream FAIL_EXISTS_VLM_DFN sub-code plus actionable - // cause/correction so scripts and audit-log greppers route - // the same way they do for upstream's `linstor vd c` reply. - // The bare writeStoreError fallback emitted apiCallRcError - // alone — high-bit error, no sub-code, no cause/correction - // — which the Python CLI rendered as a generic "object - // already exists" line that didn't tell the operator which - // VlmNr to twist. - if errors.Is(err, store.ErrAlreadyExists) { - writeVDExistsConflict(w, rd, vd.VolumeNumber) - - return - } - - writeStoreError(w, err) - + if !s.persistNewVolumeDefinition(w, r, rd, &vd, autoNumber) { return } @@ -363,42 +355,58 @@ func (s *Server) handleVDCreate(w http.ResponseWriter, r *http.Request) { }}) } -// resolveVolumeNumber folds the auto-assign + explicit-VlmNr -// validation branches out of handleVDCreate so the parent stays under -// the funlen threshold. Returns true on success (vd.VolumeNumber is -// populated and within bounds) and false when an envelope was already -// written to w (the caller must return immediately). +// persistNewVolumeDefinition writes the new VD to the store and, on +// failure, emits the right typed envelope. Returns true on success and +// false when an HTTP error envelope was already written (the caller +// must return immediately). Split out of handleVDCreate so the handler +// stays under the funlen budget. // -// Bug 191: auto-assign fires when the raw POST body does NOT carry an -// explicit `volume_number` key — Go's int32 zero would otherwise -// silently collide with VlmNr=0 on the second `vd c` against the -// same RD. Bug 363: when the caller DOES specify a volume_number, it -// must lie inside DRBD-9's addressable [0, 65535] range; out-of-bound -// values would otherwise hang the satellite in "waiting for DRBD-ID -// allocation" because no positive minor can be derived from them. -func (s *Server) resolveVolumeNumber( +// BUG-048 (P1, availability): when the operator did NOT name a +// VolumeNumber, the allocation MUST happen atomically with the write — +// route through CreateAutoNumbered, which picks the smallest free hole +// INSIDE the store's conflict-retry loop. The previous flow picked the +// number in a separate REST-side List (autoAssignVolumeNumber) BEFORE +// the Create, so two concurrent `linstor vd c ` calls both read +// `[vol-0]`, both chose VlmNr=1, the loser was rejected FAIL_EXISTS_ +// VLM_DFN, and the operator's second intended volume silently vanished. +// The explicit-number path keeps the plain Create (its retry loop +// already converges correctly and a genuine duplicate must still 409). +func (s *Server) persistNewVolumeDefinition( w http.ResponseWriter, r *http.Request, rd string, - rawBody []byte, vd *apiv1.VolumeDefinition, + autoNumber bool, ) bool { - if !vdCreateVolumeNumberExplicit(rawBody) { - assigned, assignErr := s.autoAssignVolumeNumber(r.Context(), rd) - if assignErr != nil { - writeStoreError(w, assignErr) + if autoNumber { + _, err := s.Store.VolumeDefinitions().CreateAutoNumbered(r.Context(), rd, vd) + if err != nil { + writeStoreError(w, err) return false } - vd.VolumeNumber = assigned - return true } - nrErr := validateVolumeNumber(vd.VolumeNumber) - if nrErr != nil { - writeVDNumberRejection(w, rd, vd.VolumeNumber, nrErr) + err := s.Store.VolumeDefinitions().Create(r.Context(), rd, vd) + if err != nil { + // Bug 140: duplicate-VD conflict gets a typed envelope with + // the upstream FAIL_EXISTS_VLM_DFN sub-code plus actionable + // cause/correction so scripts and audit-log greppers route + // the same way they do for upstream's `linstor vd c` reply. + // The bare writeStoreError fallback emitted apiCallRcError + // alone — high-bit error, no sub-code, no cause/correction + // — which the Python CLI rendered as a generic "object + // already exists" line that didn't tell the operator which + // VlmNr to twist. + if errors.Is(err, store.ErrAlreadyExists) { + writeVDExistsConflict(w, rd, vd.VolumeNumber) + + return false + } + + writeStoreError(w, err) return false } @@ -469,36 +477,12 @@ func rawHasNonNullKey(obj map[string]json.RawMessage, key string) bool { return !bytes.Equal(bytes.TrimSpace(raw), []byte("null")) } -// autoAssignVolumeNumber returns the smallest free non-negative -// VolumeNumber under the given RD. Mirrors upstream LINSTOR's -// CtrlVlmDfnCrtApiCallHandler smallest-hole rule: with VDs 0 and 2 -// present, an auto-assign POST lands at 1 — not 3. -// -// A missing RD surfaces here as the underlying store's ErrNotFound; -// the caller routes it through writeStoreError so the wire shape -// matches the pre-fix 404 path for an unknown parent RD. -func (s *Server) autoAssignVolumeNumber(ctx context.Context, rd string) (int32, error) { - vds, err := s.Store.VolumeDefinitions().List(ctx, rd) - if err != nil { - return 0, err //nolint:wrapcheck // surfaced to writeStoreError - } - - used := make(map[int32]bool, len(vds)) - for i := range vds { - used[vds[i].VolumeNumber] = true - } - - for candidate := int32(0); candidate >= 0; candidate++ { - if !used[candidate] { - return candidate, nil - } - } - - // Unreachable for any sane RD (the smallest-free walk terminates - // at the first gap; an RD with 2^31 VDs is impossible on real - // storage). Pin the contract anyway. - return 0, errors.New("auto-assign: VolumeNumber space exhausted") -} +// Auto-assign of the smallest free VolumeNumber now lives in the store +// layer (VolumeDefinitionStore.CreateAutoNumbered) so the read of the +// existing set and the write of the new entry are atomic — see BUG-048. +// The former REST-side autoAssignVolumeNumber helper (a separate List +// before the Create) was removed because its TOCTOU window dropped the +// second of two concurrent `linstor vd c` calls. // minVolumeDefinitionSizeKib is the smallest accepted size_kib on // `POST /v1/resource-definitions/{rd}/volume-definitions` (Bug 155). diff --git a/pkg/rest/volume_definitions_test.go b/pkg/rest/volume_definitions_test.go index 7ca91ebb..d40bc1e1 100644 --- a/pkg/rest/volume_definitions_test.go +++ b/pkg/rest/volume_definitions_test.go @@ -22,6 +22,7 @@ import ( "encoding/json" "net/http" "strings" + "sync" "testing" apiv1 "github.com/cozystack/blockstor/pkg/api/v1" @@ -69,6 +70,102 @@ func TestVolumeDefinitionsCreateRoundTrip(t *testing.T) { } } +// TestVolumeDefinitionsConcurrentAutoNumberNoLostUpdate is the BUG-048 +// (P1, availability) regression: two back-to-back number-less +// `linstor vd c ` POSTs against an RD that already carries vol-0 +// must BOTH succeed and land at DISTINCT VolumeNumbers (1 and 2) — none +// silently dropped. +// +// Pre-fix the auto-assign picked the smallest free number in a REST-side +// List BEFORE the store Create, so both requests read `[vol-0]`, both +// chose VlmNr=1, the loser was rejected FAIL_EXISTS_VLM_DFN, and the +// operator's second intended volume vanished (only vol-0 + vol-1 left). +// The store-side CreateAutoNumbered re-derives the hole inside its +// conflict-retry loop, so the loser lands at vol-2. +// +// This is the L1 half of the BUG-048 fix; the kernel-truth convergence +// half is tests/e2e/cli-matrix/multi-volume-late-vd-create.sh. +func TestVolumeDefinitionsConcurrentAutoNumberNoLostUpdate(t *testing.T) { + st := store.NewInMemory() + if err := st.ResourceDefinitions().Create(t.Context(), &apiv1.ResourceDefinition{Name: "pvc-1"}); err != nil { + t.Fatalf("seed RD: %v", err) + } + // vol-0 already present — this is the "late vd-add" precondition. + if err := st.VolumeDefinitions().Create(t.Context(), "pvc-1", &apiv1.VolumeDefinition{VolumeNumber: 0, SizeKib: 32768}); err != nil { + t.Fatalf("seed vol-0: %v", err) + } + + base, stop := startServerWithStore(t, st) + defer stop() + + // Number-less create body — the auto-assign path the python CLI's + // `linstor vd c ` takes: the wire body OMITS the + // `volume_number` key entirely (a marshalled apiv1.VolumeDefinition + // would emit `"volume_number":0` because the field has no omitempty, + // which the handler reads as an EXPLICIT 0 — that is NOT what the + // CLI sends, so the test crafts the raw omitted-key body directly). + body := []byte(`{"volume_definition":{"size_kib":32768}}`) + + const concurrent = 2 + + var ( + wg sync.WaitGroup + mu sync.Mutex + statuses []int + ) + + for range concurrent { + wg.Add(1) + + go func() { + defer wg.Done() + + resp := httpPost(t, base+"/v1/resource-definitions/pvc-1/volume-definitions", body) + defer func() { _ = resp.Body.Close() }() + + mu.Lock() + statuses = append(statuses, resp.StatusCode) + mu.Unlock() + }() + } + + wg.Wait() + + for i, code := range statuses { + if code != http.StatusOK { + t.Errorf("concurrent vd c #%d: status %d, want 200 (no request may be lost/rejected)", i, code) + } + } + + listResp := httpGet(t, base+"/v1/resource-definitions/pvc-1/volume-definitions") + defer func() { _ = listResp.Body.Close() }() + + var vds []apiv1.VolumeDefinition + if jErr := json.NewDecoder(listResp.Body).Decode(&vds); jErr != nil { + t.Fatalf("decode: %v", jErr) + } + + // Exactly three VDs: vol-0 (seeded) + the two concurrent adds at 1, 2. + if len(vds) != 3 { + t.Fatalf("got %d VDs, want 3 (vol-0 + two concurrent adds); a lost-update would leave 2: %+v", len(vds), vds) + } + + seen := map[int32]bool{} + for i := range vds { + if seen[vds[i].VolumeNumber] { + t.Fatalf("duplicate VolumeNumber %d in %+v", vds[i].VolumeNumber, vds) + } + seen[vds[i].VolumeNumber] = true + } + + for i := range 3 { + want := int32(i) + if !seen[want] { + t.Errorf("missing VolumeNumber %d (set: %v) — a concurrent add was silently dropped", want, seen) + } + } +} + // TestVolumeDefinitionsGetMissingRD: 404 when RD itself does not exist. func TestVolumeDefinitionsGetMissingRD(t *testing.T) { base, stop := startServerWithStore(t, store.NewInMemory()) diff --git a/pkg/store/inmemory_volume_definition.go b/pkg/store/inmemory_volume_definition.go index ae40801c..871b1949 100644 --- a/pkg/store/inmemory_volume_definition.go +++ b/pkg/store/inmemory_volume_definition.go @@ -86,6 +86,43 @@ func (s *inMemoryVolumeDefinitions) Create(_ context.Context, rdName string, vd return nil } +// CreateAutoNumbered allocates the smallest free non-negative +// VolumeNumber under the write lock so the read of the existing set and +// the insert are a single atomic step — the InMemory equivalent of the +// k8s backend's allocate-inside-RetryOnConflict. BUG-048: a REST-side +// "List then Create" sequence has a TOCTOU window two concurrent +// `vd c` calls fall into (both pick the same hole, the loser is +// rejected and its volume silently lost); doing the allocation here +// closes it. +func (s *inMemoryVolumeDefinitions) CreateAutoNumbered(_ context.Context, rdName string, vd *apiv1.VolumeDefinition) (int32, error) { + if vd == nil { + return 0, errors.New("nil VolumeDefinition") + } + + s.mu.Lock() + defer s.mu.Unlock() + + // Smallest-hole walk under the lock so the read of the used set and + // the insert are a single atomic step. Mirrors upstream LINSTOR's + // rule (VDs 0 and 2 present → 1, not 3). + used := make(map[int32]bool) + + for k := range s.m { + if k.rd == rdName { + used[k.vol] = true + } + } + + var assigned int32 + for assigned = 0; used[assigned]; assigned++ { //nolint:revive // empty body: the increment IS the body + } + + vd.VolumeNumber = assigned + s.m[vdKey{rdName, assigned}] = *vd + + return assigned, nil +} + func (s *inMemoryVolumeDefinitions) Update(_ context.Context, rdName string, vd *apiv1.VolumeDefinition) error { if vd == nil { return errors.New("nil VolumeDefinition") diff --git a/pkg/store/k8s/volume_definitions.go b/pkg/store/k8s/volume_definitions.go index 574a5da5..07e610cc 100644 --- a/pkg/store/k8s/volume_definitions.go +++ b/pkg/store/k8s/volume_definitions.go @@ -103,6 +103,75 @@ func (s *volumeDefinitions) Create(ctx context.Context, rdName string, vd *apiv1 }), "update RD %q to add volume %d", rdName, vd.VolumeNumber) } +// CreateAutoNumbered allocates the smallest free non-negative +// VolumeNumber INSIDE the conflict-retry loop and persists `vd` under +// it, returning the assigned number. BUG-048 (P1, availability): the +// REST handler used to pick the number in a separate "List the RD → +// smallest hole" step BEFORE this Create, so two concurrent +// `linstor vd c ` calls both read `[vol-0]`, both decided VlmNr=1, +// and the loser was rejected with FAIL_EXISTS_VLM_DFN — the operator's +// second intended volume silently vanished (only one VD landed, no +// usable error: the message claimed vol-1 "already exists" though the +// operator never named a number). Re-deriving the hole on every retry +// attempt makes the read-of-existing-set and the append atomic with +// respect to a racing CreateAutoNumbered: the loser's retry re-fetches +// the RD now carrying `[vol-0, vol-1]` and lands at vol-2. +// +// On the k8s backend the optimistic-concurrency guarantee comes from +// the apiserver's resourceVersion check on Update: if a racing write +// landed between our fetch and Update, the Update 409s, isConflictOr +// NotFound retries, and the whole allocate+append re-runs against the +// fresh RD. (Mirrors the existing explicit-number Create's retry; the +// only difference is the number is computed here rather than supplied.) +func (s *volumeDefinitions) CreateAutoNumbered(ctx context.Context, rdName string, vd *apiv1.VolumeDefinition) (int32, error) { + if vd == nil { + return 0, errors.New("nil VolumeDefinition") + } + + var assigned int32 + + err := retry.OnError(retry.DefaultRetry, isConflictOrNotFound, func() error { + rd, fetchErr := s.fetchRD(ctx, rdName) + if fetchErr != nil { + return fetchErr + } + + assigned = smallestFreeVolumeNumber(rd.Spec.VolumeDefinitions) + + entry := wireToCRDVD(vd) + entry.VolumeNumber = assigned + + rd.Spec.VolumeDefinitions = append(rd.Spec.VolumeDefinitions, entry) + + return s.c.Update(ctx, rd) + }) + if err != nil { + return 0, errors.Wrapf(err, "auto-numbered create on RD %q", rdName) + } + + vd.VolumeNumber = assigned + + return assigned, nil +} + +// smallestFreeVolumeNumber returns the lowest non-negative VolumeNumber +// not present in vds. Mirrors upstream LINSTOR's smallest-hole rule +// (VDs 0 and 2 present → 1, not 3). +func smallestFreeVolumeNumber(vds []crdv1alpha1.ResourceDefinitionVolume) int32 { + used := make(map[int32]bool, len(vds)) + for i := range vds { + used[vds[i].VolumeNumber] = true + } + + for candidate := int32(0); candidate >= 0; candidate++ { + if !used[candidate] { + return candidate + } + } + + return 0 +} + func (s *volumeDefinitions) Update(ctx context.Context, rdName string, vd *apiv1.VolumeDefinition) error { if vd == nil { return errors.New("nil VolumeDefinition") diff --git a/pkg/store/store.go b/pkg/store/store.go index 32bcfc4d..e341a06b 100644 --- a/pkg/store/store.go +++ b/pkg/store/store.go @@ -250,6 +250,26 @@ type VolumeDefinitionStore interface { List(ctx context.Context, rdName string) ([]apiv1.VolumeDefinition, error) Get(ctx context.Context, rdName string, volumeNumber int32) (apiv1.VolumeDefinition, error) Create(ctx context.Context, rdName string, vd *apiv1.VolumeDefinition) error + + // CreateAutoNumbered allocates the smallest free non-negative + // VolumeNumber for the parent RD and persists `vd` under it, + // returning the assigned number. Unlike a REST-side "List then pick + // the hole then Create" sequence, the allocation happens INSIDE the + // store's conflict-retry loop (k8s) / write lock (inmemory), so the + // read of the existing VD set and the write of the new entry are + // atomic with respect to a concurrent CreateAutoNumbered on the same + // RD. This closes the BUG-048 lost-update race: two back-to-back + // `linstor vd c ` calls used to both read `[vol-0]`, both pick + // VlmNr=1, and the loser was rejected with FAIL_EXISTS_VLM_DFN — the + // operator's second intended volume was silently dropped. With the + // allocation re-run per retry attempt, the loser re-reads `[vol-0, + // vol-1]` and lands at vol-2. + // + // vd.VolumeNumber on input is ignored — the store owns the + // allocation. On success vd.VolumeNumber is set to the assigned + // value for the caller's convenience and the same value is returned. + CreateAutoNumbered(ctx context.Context, rdName string, vd *apiv1.VolumeDefinition) (int32, error) + Update(ctx context.Context, rdName string, vd *apiv1.VolumeDefinition) error Delete(ctx context.Context, rdName string, volumeNumber int32) error diff --git a/pkg/store/storetest/storetest.go b/pkg/store/storetest/storetest.go index 5ae2a1c0..5babf5ea 100644 --- a/pkg/store/storetest/storetest.go +++ b/pkg/store/storetest/storetest.go @@ -208,6 +208,10 @@ func RunVolumeDefinitionStore(t *testing.T, newStore Factory) { t.Errorf("dup: got %v, want ErrAlreadyExists", err) } }) + // BUG-048: CreateAutoNumbered allocates the smallest free hole and + // the allocation is atomic with the write (the REST handler routes + // every number-less `linstor vd c` here). + runVolumeDefinitionAutoNumberCases(t, newStore) t.Run("GetMissing", func(t *testing.T) { s := newStore(t) seedRD(t, s, "pvc-1") @@ -332,6 +336,62 @@ func RunVolumeDefinitionStore(t *testing.T, newStore Factory) { t.Run("UpdateNilArg", func(t *testing.T) { testVDUpdateNilArg(t, newStore) }) } +// runVolumeDefinitionAutoNumberCases pins the BUG-048 atomic-allocate +// contract: sequential adds land at 0, 1, 2 … and a hole opened by a +// Delete is reused (upstream LINSTOR's smallest-hole rule). Split out +// of RunVolumeDefinitionStore to keep that function under maintidx. +func runVolumeDefinitionAutoNumberCases(t *testing.T, newStore Factory) { + t.Helper() + + t.Run("CreateAutoNumberedSequential", func(t *testing.T) { + s := newStore(t) + seedRD(t, s, "pvc-1") + ctx := t.Context() + + for i := range 3 { + want := int32(i) + vd := apiv1.VolumeDefinition{SizeKib: 1024} + + got, err := s.VolumeDefinitions().CreateAutoNumbered(ctx, "pvc-1", &vd) + if err != nil { + t.Fatalf("CreateAutoNumbered #%d: %v", want, err) + } + if got != want { + t.Fatalf("CreateAutoNumbered #%d: got VlmNr=%d, want %d", want, got, want) + } + if vd.VolumeNumber != want { + t.Fatalf("CreateAutoNumbered #%d: vd.VolumeNumber=%d, want %d", want, vd.VolumeNumber, want) + } + } + }) + t.Run("CreateAutoNumberedReusesHole", func(t *testing.T) { + s := newStore(t) + seedRD(t, s, "pvc-1") + ctx := t.Context() + + for range 3 { + vd := apiv1.VolumeDefinition{SizeKib: 1024} + if _, err := s.VolumeDefinitions().CreateAutoNumbered(ctx, "pvc-1", &vd); err != nil { + t.Fatalf("seed CreateAutoNumbered: %v", err) + } + } + // Remove the middle volume so a hole opens at 1. + if err := s.VolumeDefinitions().Delete(ctx, "pvc-1", 1); err != nil { + t.Fatalf("Delete: %v", err) + } + + vd := apiv1.VolumeDefinition{SizeKib: 1024} + + got, err := s.VolumeDefinitions().CreateAutoNumbered(ctx, "pvc-1", &vd) + if err != nil { + t.Fatalf("CreateAutoNumbered after delete: %v", err) + } + if got != 1 { + t.Fatalf("CreateAutoNumbered after delete: got VlmNr=%d, want 1 (smallest hole)", got) + } + }) +} + func testVDCreateNilArg(t *testing.T, newStore Factory) { t.Helper() From 836ed9ff764687265af11cc4d13e16b81330b81f Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Sun, 14 Jun 2026 10:03:28 +0300 Subject: [PATCH 02/19] test(harness): concurrent late vd-add catchers for BUG-048 (L6 + L7) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit L6 cli-matrix multi-volume-late-vd-create.sh: take PR #162's corrected Status-based detection (status_disk_state instead of the null .vlms projection) and make the two late `vd c` calls run CONCURRENTLY (background + wait) so they exercise the auto-assign lost-update window. Add the BUG-048 wire-level assertion: the RD must carry exactly 3 VolumeDefinitions after the two concurrent adds — a dropped add leaves only 2 and surfaces a spurious "volume definition 1 already exists". L7 replay vd-late-create-concurrent-no-lost-update.yaml: codify the post-fix invariant (vol-0 up, two back-to-back late adds → vd_count==3 + all_uptodate). Adds a `vd_count` await kind to the harness lib that asserts the exact VolumeDefinition count on an RD. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- CLAUDE.md | 2 +- .../cli-matrix/multi-volume-late-vd-create.sh | 206 ++++++++++++------ tests/operator-harness/lib.sh | 24 ++ tests/operator-harness/replay-runner.sh | 2 + ...late-create-concurrent-no-lost-update.yaml | 92 ++++++++ 5 files changed, 264 insertions(+), 62 deletions(-) create mode 100644 tests/operator-harness/replay/vd-late-create-concurrent-no-lost-update.yaml diff --git a/CLAUDE.md b/CLAUDE.md index 086ce1fe..329841ad 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -45,7 +45,7 @@ Copy the closest existing YAML and fill: - `teardown[]` — cleanup CLI invocations. - `invariants[]` — currently only `no_orphans` is implemented. -Available `await.kind` values: `replica_count`, `disk_state`, `all_uptodate`, `replica_diskless`, `no_tiebreaker`, `sync_clean`, `resource_absent`, `rd_absent`, `vd_size_kib`, `pvc_capacity`, `pod_md5_invariant`. See the header comment of `tests/operator-harness/replay-runner.sh` for the contract. +Available `await.kind` values: `replica_count`, `disk_state`, `all_uptodate`, `replica_diskless`, `no_tiebreaker`, `sync_clean`, `resource_absent`, `rd_absent`, `vd_size_kib`, `vd_count`, `pvc_capacity`, `pod_md5_invariant`. See the header comment of `tests/operator-harness/replay-runner.sh` for the contract. ## Running the harness diff --git a/tests/e2e/cli-matrix/multi-volume-late-vd-create.sh b/tests/e2e/cli-matrix/multi-volume-late-vd-create.sh index 6c552f35..9e82209c 100755 --- a/tests/e2e/cli-matrix/multi-volume-late-vd-create.sh +++ b/tests/e2e/cli-matrix/multi-volume-late-vd-create.sh @@ -2,7 +2,8 @@ # # usage: multi-volume-late-vd-create.sh WORK_DIR # -# L6 cli-matrix cell — Bug 332 (regression of Bug 79, P1). +# L6 cli-matrix cell — Bug 332 (regression of Bug 79, P1) + +# BUG-048 (P1, availability — concurrent late vd-add). # # Reproduction from the e2e2 stand: # @@ -10,8 +11,8 @@ # $ linstor vd c test2 1G # vol-0 # $ linstor r c test2 --auto-place=3 -s lvm-thin # # wait until all 3 replicas reach UpToDate -# $ linstor vd c test2 1G # vol-1 — late VD -# $ linstor vd c test2 1G # vol-2 — late VD +# $ linstor vd c test2 1G & # vol-1 — late VD +# $ linstor vd c test2 1G & # vol-2 — late VD (concurrent!) # # $ drbdadm status test2 # test2 role:Secondary suspended:quorum @@ -19,14 +20,30 @@ # volume:1 disk:Diskless quorum:no ← Unintentional Diskless # volume:2 disk:Diskless quorum:no ← Unintentional Diskless # -# Expected: late-added vol-1 / vol-2 each get their backing LV -# allocated on every diskful replica, drbdmeta create-md fires -# per-volume, the kernel slot picks up the new volumes, and every -# (replica, volume) pair settles UpToDate within 60s. +# Two distinct failure modes are pinned here: # -# Unit pin: pkg/satellite/reconciler_drbd_test.go:: -# TestApplyDRBDAllocatesBackingForLateAddedVolume -# verifies the satellite's per-volume create-md gate via FakeExec. +# - Bug 332 (per-volume create-md): a late VD's backing LV is never +# stamped, so the kernel brings it up disk:Diskless while the spec +# lacks DISKLESS ("Unintentional Diskless"). +# +# - BUG-048 (concurrent auto-assign lost-update): two BACK-TO-BACK +# number-less `vd c` calls both auto-assign the smallest free +# VolumeNumber off the same pre-write snapshot, both pick VlmNr=1, +# and the loser is rejected — the operator's SECOND volume silently +# vanishes (only vol-0 + vol-1 land, vol-2 never created). The two +# late adds below run CONCURRENTLY to exercise this; the cell then +# asserts the RD carries exactly 3 VolumeDefinitions. +# +# Expected (post-fix): both concurrent late adds land at distinct +# VolumeNumbers (1 and 2), each gets its backing LV + per-volume +# create-md on every diskful replica, the satellite's per-fresh-volume +# winner election seeds a SyncSource, and every (replica, volume) pair +# settles UpToDate. +# +# Unit pins: pkg/satellite/reconciler_drbd_test.go:: +# TestApplyDRBDAllocatesBackingForLateAddedVolume (per-volume create-md) +# and pkg/rest/volume_definitions_test.go:: +# TestVolumeDefinitionsConcurrentAutoNumberNoLostUpdate (BUG-048 race). # This L6 cell is the kernel-truth half — only the real stand can # observe the actual `drbdadm status` output that surfaced the bug. @@ -69,64 +86,128 @@ echo ">> [Bug 332] rd c + vd c (vol-0)" echo ">> [Bug 332] r c --auto-place=3 -s $POOL" "${LCTL[@]}" resource create --auto-place=3 --storage-pool="$POOL" "$RD" >/dev/null +# Resolve the 3 diskful nodes auto-place picked, so every convergence +# check below reads the per-replica Resource.Status directly. +echo ">> wait for 3 diskful replicas to be placed" +if ! wait_diskful_count "$RD" 3 90; then + echo "FAIL: auto-place did not land 3 diskful replicas" >&2 + "${LCTL[@]}" resource list --resources "$RD" 2>&1 | tail -30 >&2 + exit 1 +fi +mapfile -t DISKFUL_NODES < <(linstor_diskful_nodes "$RD") +if (( ${#DISKFUL_NODES[@]} != 3 )); then + echo "FAIL: expected 3 diskful nodes, got ${#DISKFUL_NODES[@]}: ${DISKFUL_NODES[*]}" >&2 + exit 1 +fi +echo " diskful nodes: ${DISKFUL_NODES[*]}" + +# wait_all_replicas_volume_uptodate — poll +# Resource.Status.volumes[].diskState on EVERY diskful node until +# all report UpToDate, or timeout. Reads the observer-stamped CRD via +# status_disk_state (lib.sh) — the SAME wire surface the passing cells +# use. NOTE: the `--machine-readable resource list` projection leaves +# `.vlms` null on this apiserver, so the previous jq-on-vlms detection +# always read empty and timed out even while `linstor r l` (which reads +# Resource.Status) showed every replica UpToDate. status_disk_state +# reads the populated CRD field instead. +wait_all_replicas_volume_uptodate() { + local vol=$1 timeout=$2 + local deadline=$(( $(date +%s) + timeout )) + local node st all_ok + while (( $(date +%s) < deadline )); do + all_ok=true + for node in "${DISKFUL_NODES[@]}"; do + st=$(status_disk_state "$RD" "$node" "$vol") + if [[ "$st" != "UpToDate" ]]; then + all_ok=false + break + fi + done + if [[ "$all_ok" == "true" ]]; then + return 0 + fi + sleep 3 + done + return 1 +} + # 240s: a 1G x3 initial sync shares the stand with the rest of the -# sweep; the previous 120s budget flaked under load (the failure dump -# printed all three replicas UpToDate moments after the deadline). +# sweep; the previous 120s budget flaked under load. echo ">> wait up to 240s for vol-0 to reach UpToDate on all 3 replicas" -deadline=$(( $(date +%s) + 240 )) -all_up=false -while (( $(date +%s) < deadline )); do - # Per-replica volume-0 state, three rows expected. The wire - # shape is `linstor r l --resources ` → DiskState column. - states=$("${LCTL[@]}" --machine-readable resource list --resources "$RD" 2>/dev/null \ - | jq -r '[.[][]? | .vlms[]? | select(.vlm_nr == 0) | .state.disk_state // "Unknown"] | join(",")' \ - 2>/dev/null || echo "") - if [[ "$states" == "UpToDate,UpToDate,UpToDate" ]]; then - all_up=true - break - fi - sleep 3 -done - -if [[ "$all_up" != "true" ]]; then - echo "FAIL: vol-0 did not reach UpToDate on all 3 replicas within 120s" >&2 +if ! wait_all_replicas_volume_uptodate 0 240; then + echo "FAIL: vol-0 did not reach UpToDate on all 3 replicas within 240s" >&2 "${LCTL[@]}" resource list --resources "$RD" 2>&1 | tail -30 >&2 exit 1 fi -# THE BUG: add vol-1 and vol-2 AFTER vol-0 is UpToDate. -echo ">> [Bug 332] late vd c (vol-1)" -"${LCTL[@]}" volume-definition create "$RD" 1G >/dev/null +# THE BUG (BUG-048, P1 availability): add vol-1 and vol-2 AFTER vol-0 is +# UpToDate, CONCURRENTLY (back-to-back, no convergence wait between them). +# Both number-less `vd c` calls auto-assign the smallest free VolumeNumber +# — pre-fix each read [vol-0] and BOTH chose VlmNr=1, so the loser was +# rejected FAIL_EXISTS_VLM_DFN and the operator's second intended volume +# silently vanished (only vol-0 + vol-1 persisted; vol-2 never created). +# Running them truly concurrently is what exercises the lost-update race; +# the prior sequential `vd c \n vd c` shape let the first REST response +# land before the second auto-assign ran and so masked the bug. +echo ">> [BUG-048] CONCURRENT late vd c (vol-1 + vol-2 back-to-back)" +vdc_rc=0 +"${LCTL[@]}" volume-definition create "$RD" 1G >/tmp/bug048-vdcA.out 2>&1 & +pA=$! +"${LCTL[@]}" volume-definition create "$RD" 1G >/tmp/bug048-vdcB.out 2>&1 & +pB=$! +wait "$pA" || vdc_rc=1 +wait "$pB" || vdc_rc=1 + +# BUG-048 wire-level assertion: BOTH concurrent adds must land. The RD +# must carry exactly THREE VolumeDefinitions (vol-0 + the two late adds); +# a lost-update leaves only two and one `vd c` reports a spurious +# "volume definition 1 already exists" even though the operator named no +# number. This catches the silent drop BEFORE the (slower) convergence +# wait below — and even if DRBD later happened to converge the survivors. +echo ">> [BUG-048] assert no concurrent vd-add was dropped (expect 3 VDs)" +vd_count=$("${LCTL[@]}" --machine-readable volume-definition list \ + --resource-definitions "$RD" 2>/dev/null \ + | jq -r '[.[]? | .[]? | (.vlm_dfns // .volume_definitions // [])[]?] | length' \ + 2>/dev/null || echo 0) +if [[ "$vd_count" != "3" ]]; then + echo "FAIL (BUG-048): RD $RD has $vd_count VolumeDefinitions after two concurrent late vd c, want 3 — a concurrent add was silently dropped" >&2 + echo "----- vd c #A output -----" >&2; cat /tmp/bug048-vdcA.out >&2 || true + echo "----- vd c #B output -----" >&2; cat /tmp/bug048-vdcB.out >&2 || true + "${LCTL[@]}" volume-definition list --resource-definitions "$RD" 2>&1 | tail -20 >&2 + exit 1 +fi +echo " 3 VolumeDefinitions present — no concurrent add dropped" -echo ">> [Bug 332] late vd c (vol-2)" -"${LCTL[@]}" volume-definition create "$RD" 1G >/dev/null +if (( vdc_rc != 0 )); then + # Both VDs landed (count==3) yet a CLI returned non-zero: that is the + # acceptable-but-noteworthy "fail loudly" path, not the silent wedge. + echo " note: a concurrent vd c exited non-zero but both volumes were created" >&2 +fi -echo ">> wait up to 60s for vol-1 + vol-2 to reach UpToDate on all 3 replicas" -deadline=$(( $(date +%s) + 60 )) -late_up=false -while (( $(date +%s) < deadline )); do - # Pull every (replica, volume) disk_state. We need exactly: - # 3 rows × 3 volumes = 9 disk_state strings, all UpToDate. - # A Bug-332-bitten path will show vol-1/vol-2 stuck on - # "Diskless" with the operator-set DISKLESS flag absent. - states=$("${LCTL[@]}" --machine-readable resource list --resources "$RD" 2>/dev/null \ - | jq -r '[.[][]? | .vlms[]? | .state.disk_state // "Unknown"] | join(",")' \ - 2>/dev/null || echo "") - count_uptodate=$(awk -F, '{ for (i=1;i<=NF;i++) if ($i=="UpToDate") n++ } END { print n+0 }' <<<"$states") - if (( count_uptodate == 9 )); then - late_up=true +# Each late-added 1G volume needs its own initial sync on every replica; +# under sweep load that takes well past the old 60s budget, so give the +# late volumes the same 240s headroom vol-0 gets. +echo ">> wait up to 240s for vol-1 + vol-2 to reach UpToDate on all 3 replicas" +late_up=true +for vol in 1 2; do + if ! wait_all_replicas_volume_uptodate "$vol" 240; then + late_up=false break fi - sleep 3 done if [[ "$late_up" != "true" ]]; then - echo "FAIL (Bug 332): late-added vol-1/vol-2 not UpToDate on all 3 replicas within 60s" >&2 + echo "FAIL (Bug 332): late-added vol-1/vol-2 not UpToDate on all 3 replicas within 240s" >&2 "${LCTL[@]}" resource list --resources "$RD" 2>&1 | tail -40 >&2 - # Surface the smoking gun: a Diskless line for a non-DISKLESS spec. - echo "----- linstor r l --resources $RD (with flags) -----" >&2 - "${LCTL[@]}" --machine-readable resource list --resources "$RD" 2>/dev/null \ - | jq -r '.[][]? | "\(.node_name) vol=\(.vlms[]?.vlm_nr) state=\(.vlms[]?.state.disk_state) flags=\(.flags//[])"' >&2 || true + # Surface the smoking gun: per (node, vol) disk_state straight from + # the populated Resource.Status — a Bug-332-bitten path shows vol-1 + # or vol-2 stuck on Diskless on a non-DISKLESS replica. + echo "----- per (node, vol) Resource.Status diskState -----" >&2 + for node in "${DISKFUL_NODES[@]}"; do + for vol in 0 1 2; do + echo " $node vol=$vol state=$(status_disk_state "$RD" "$node" "$vol")" >&2 + done + done exit 1 fi @@ -135,10 +216,13 @@ fi # MUST NOT report any volume as Diskless when the spec lacks the # DISKLESS flag. This is the kernel-truth assertion that distinguishes # Bug 332 (Unintentional Diskless) from spec-pinned diskless replicas. -echo ">> [Bug 332] kernel-truth: drbdadm status on a diskful node" -satellite_node=$("${LCTL[@]}" --machine-readable resource list --resources "$RD" 2>/dev/null \ - | jq -r '.[][]? | select((.flags//[]) | (map(. == "DISKLESS") | any | not)) | .node_name' \ - 2>/dev/null | head -1) +echo ">> [Bug 332] kernel-truth: drbdsetup status on a diskful node" +# Use the already-resolved diskful node set (reliable, CRD-backed) and +# probe the kernel directly via the satellite pod (on_node, lib.sh) — +# the same drbdsetup status path the parent helpers use. The previous +# `--machine-readable ... | jq .flags | head -1` resolution shared the +# null-vlms / SIGPIPE failure modes of the wait loops above. +satellite_node=${DISKFUL_NODES[0]:-} if [[ -z "$satellite_node" ]]; then echo "SKIP-PARTIAL: could not resolve a diskful node for kernel-truth check" @@ -146,7 +230,7 @@ if [[ -z "$satellite_node" ]]; then exit 0 fi -if status_out=$(kubectl debug node/"$satellite_node" --image=alpine -- chroot /host drbdadm status "$RD" 2>&1); then +if status_out=$(on_node "$satellite_node" drbdsetup status "$RD" --verbose 2>&1); then echo "$status_out" if grep -E 'volume:[12].*disk:Diskless' <<<"$status_out" >/dev/null; then echo "FAIL (Bug 332): diskful node $satellite_node reports volume:1 or volume:2 as Diskless on kernel state" >&2 @@ -154,7 +238,7 @@ if status_out=$(kubectl debug node/"$satellite_node" --image=alpine -- chroot /h exit 1 fi else - echo "SKIP-PARTIAL: kubectl debug to inspect drbdadm status failed (RBAC / image pull); REST-level pin still asserted" + echo "SKIP-PARTIAL: drbdsetup status on $satellite_node failed; REST-level pin still asserted" fi -echo ">> multi-volume-late-vd-create OK (Bug 332 pinned: late vd c on $RD brought vol-1/vol-2 to UpToDate, no Unintentional Diskless)" +echo ">> multi-volume-late-vd-create OK (Bug 332 + BUG-048 pinned: two CONCURRENT late vd c on $RD both landed at distinct VlmNrs and brought vol-1/vol-2 to UpToDate, no dropped VD, no Unintentional Diskless)" diff --git a/tests/operator-harness/lib.sh b/tests/operator-harness/lib.sh index 2ab6e3f5..33e0e649 100755 --- a/tests/operator-harness/lib.sh +++ b/tests/operator-harness/lib.sh @@ -603,6 +603,30 @@ except Exception: print(0)" "$vol" 2>/dev/null || echo 0) [[ "$actual" == "$expected" ]] ;; + vd_count) + # BUG-048: assert the RD carries EXACTLY `expected` + # VolumeDefinitions. A concurrent-auto-assign lost-update + # drops the second of two back-to-back number-less `vd c` + # calls, leaving one VD short — this is the wire-level + # signature that catches the silent drop independent of + # whether DRBD later converged the survivors. + local rd expected actual + rd=$(substitute "$(python3 -c "import json,sys; print(json.loads(sys.argv[1]).get('rd',''))" "$spec")") + expected=$(python3 -c "import json,sys; print(json.loads(sys.argv[1]).get('expected',0))" "$spec") + actual=$(linstor_cli -m volume-definition list --resource-definitions "$rd" 2>/dev/null \ + | python3 -c "import json,sys +try: + d=json.load(sys.stdin) + while isinstance(d, list) and d and isinstance(d[0], list): + d=d[0] + n=0 + for it in d if isinstance(d, list) else []: + n += len(it.get('vlm_dfns', []) or it.get('volume_definitions', []) or []) + print(n) +except Exception: + print(0)" 2>/dev/null || echo 0) + [[ "$actual" == "$expected" ]] + ;; pvc_capacity) # PVC.Status.Capacity matches expected (e.g. "2Gi"). # Verifies the operator-visible size propagation through diff --git a/tests/operator-harness/replay-runner.sh b/tests/operator-harness/replay-runner.sh index 552133fa..f1f8ddca 100755 --- a/tests/operator-harness/replay-runner.sh +++ b/tests/operator-harness/replay-runner.sh @@ -72,6 +72,8 @@ # - resource_absent wait until r d takes effect on a node # - rd_absent wait until rd is gone everywhere # - vd_size_kib VolumeDefinition.size_kib equals expected_kib +# - vd_count RD carries EXACTLY `expected` VolumeDefinitions +# (BUG-048 concurrent-add lost-update catcher) # - pvc_capacity PVC.Status.Capacity.storage matches expected # - pod_md5_invariant md5sum of file inside pod matches expected baseline # - volumes_settled every Resource of rd carries EXACTLY the diff --git a/tests/operator-harness/replay/vd-late-create-concurrent-no-lost-update.yaml b/tests/operator-harness/replay/vd-late-create-concurrent-no-lost-update.yaml new file mode 100644 index 00000000..ca055e90 --- /dev/null +++ b/tests/operator-harness/replay/vd-late-create-concurrent-no-lost-update.yaml @@ -0,0 +1,92 @@ +name: vd-late-create-concurrent-no-lost-update +description: | + BUG-048 catcher (P1, availability). Two back-to-back number-less + `linstor vd c ` calls against an RD that already carries vol-0 + (and whose RD.Spec.Initialized has flipped true) must BOTH land at + distinct VolumeNumbers — none silently dropped. + + Root cause: the REST handler auto-assigned the smallest free + VolumeNumber in a List taken BEFORE the store Create, so two adds + racing in the same window both read [vol-0], both chose VlmNr=1, the + loser was rejected FAIL_EXISTS_VLM_DFN, and the operator's second + intended volume silently vanished. When the loser backgrounds its CLI + there is no error surfaced at all — the volume simply never exists. + + Post-fix the smallest-free allocation moved INTO the store layer + (VolumeDefinitionStore.CreateAutoNumbered), re-derived on every + conflict-retry attempt, so the loser re-reads [vol-0, vol-1] and + lands at vol-2. + + Harness note: replay-runner executes each `cmd` as a single + sequential `linstor` invocation and has no concurrency primitive, so + this YAML drives the late adds back-to-back sequentially and pins the + POST-FIX invariant with `vd_count: 3` (the exact wire-level signature + a lost-update would violate — it leaves 2) plus `all_uptodate`. The + TRUE concurrent-race reproduction (background `&` + wait, which is + what exercises the auto-assign window) lives in the L6 kernel-truth + cell tests/e2e/cli-matrix/multi-volume-late-vd-create.sh, and the L1 + unit pin is pkg/rest/volume_definitions_test.go:: + TestVolumeDefinitionsConcurrentAutoNumberNoLostUpdate. + +prerequisites: + min_nodes: 2 + storage_pool: stand + +vars: + sp: stand + +steps: + - name: create-rd + cmd: ["resource-definition", "create", "{{rd}}"] + expect_exit: 0 + - name: create-vol0 + # 64M (not 1G): BUG-048 is a control-plane allocation defect, not a + # sync-timing race, so the trigger is size-independent. 64M converges + # in seconds on the I/O-constrained stand and keeps the all_uptodate + # awaits well inside budget. + cmd: ["volume-definition", "create", "{{rd}}", "64M"] + expect_exit: 0 + - name: r-c-node1 + cmd: ["resource", "create", "{{node1}}", "{{rd}}", "--storage-pool", "{{sp}}"] + expect_exit: 0 + - name: r-c-node2 + cmd: ["resource", "create", "{{node2}}", "{{rd}}", "--storage-pool", "{{sp}}"] + expect_exit: 0 + - name: wait-vol0-uptodate + cmd: ["resource", "list", "--resources", "{{rd}}"] + expect_exit: 0 + await: + kind: all_uptodate + rd: "{{rd}}" + timeout_s: 240 + - name: late-vd-create-second + # Late add #1 on the already-initialized RD → auto-assigns vol-1. + cmd: ["volume-definition", "create", "{{rd}}", "64M"] + expect_exit: 0 + - name: late-vd-create-third + # Late add #2 back-to-back → MUST auto-assign vol-2 (not collide on + # vol-1). Pre-fix, when these two raced, this one was the dropped add. + cmd: ["volume-definition", "create", "{{rd}}", "64M"] + expect_exit: 0 + await: + # BUG-048 signature: exactly 3 VDs must exist (vol-0,1,2). A + # lost-update leaves 2. + kind: vd_count + rd: "{{rd}}" + expected: 3 + timeout_s: 30 + - name: wait-all-volumes-uptodate + cmd: ["resource", "list", "--resources", "{{rd}}"] + expect_exit: 0 + await: + # Every volume of every replica converges — the elected per-fresh- + # volume winner seeds a SyncSource for both late adds. + kind: all_uptodate + rd: "{{rd}}" + timeout_s: 240 + +teardown: + - cmd: ["resource-definition", "delete", "{{rd}}"] + +invariants: + - no_orphans From 89443d01b340a00f373190347d1c395f2c801733 Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Sun, 14 Jun 2026 11:07:03 +0300 Subject: [PATCH 03/19] fix(satellite): seed late-added volume race-free, off MetadataCreated not .res (BUG-048) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The per-volume create-md + winner-election GI seed for a late-added volume (ensurePerVolumeMetadata) was gated by hasLateAddedVolume(resPath, dr) — comparing the desired volume set against the `volume N {` blocks in the on-disk .res. That .res is re-rendered by the FSM dispatch's renderResFile preamble on every adjust/up, so on a concurrent two-VD add an earlier reconcile's adjust pre-populated the new volume's block BEFORE this gate read the file. The gate then saw "no late volume", SKIPPED the metadata+seed pass entirely, and the new volume came up via the kernel adjust with no metadata and no elected SyncSource — Inconsistent on every replica, permanently wedged (the operator-reported BUG-048 symptom: replication:Established, resync-suspended:no, no SyncSource). It hit ~half of concurrent late two-VD adds; single and sequential adds, where the create-md pass and the render do not interleave, were unaffected. Gate on the race-free dr.GetMetadataCreated() (this RD is past first activation) instead. ensurePerVolumeMetadata is already per-volume idempotent — it probes `drbdadm dump-md /` and only create-md's + seeds the volumes that genuinely lack metadata (len(freshlyCreated)==0 → early return), so running it on every post-activation diskful reconcile is a no-op on converged state (cheap per-volume dump-md probe) and never re-stamps an existing volume's GI/bitmap. The winner election and every seed veto (PeerHasData / SkipInitialSync / AnyConnectedPeerHasDataForVolume) are unchanged, so day0 skip-init (BUG-028) and single/sequential late adds (Bug 384) are preserved. The now-unused .res-parsing hasLateAddedVolume helper is removed. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- pkg/satellite/reconciler.go | 96 +++++---------------- pkg/satellite/reconciler_drbd_test.go | 116 ++++++++++++++++++++++++++ 2 files changed, 138 insertions(+), 74 deletions(-) diff --git a/pkg/satellite/reconciler.go b/pkg/satellite/reconciler.go index f7c57327..485616df 100644 --- a/pkg/satellite/reconciler.go +++ b/pkg/satellite/reconciler.go @@ -1854,71 +1854,6 @@ func extractResFilePeers(body string) []string { // bogus --node-id=0 collision against the local slot. Reads via // os.ReadFile so a transient I/O hiccup degrades to no-op // instead of wedging the reconcile. -// hasLateAddedVolume reports whether the desired-state Volumes[] -// includes at least one volume number that is NOT yet represented -// as a `volume {` block in the OLD .res file at resPath. -// -// Bug 332: the `linstor vd c 1G` flow grows VolumeDefinitions[] -// after the RD has already passed first-activation. The dispatcher -// hands the satellite a DesiredResource with the new volume in -// Volumes[], but the on-disk .res still describes the smaller set — -// so a strict greater-than on the rendered block count is the -// late-VD signal. Returns false when the .res file is absent -// (cold-start path; the existing firstActivation gate owns -// metadata creation), when the file is unreadable (fail-safe to -// "no late vol → no extra work"), or when the desired set matches -// what's already rendered. -// -// Parser is intentionally simple: matches "volume {" inside an -// `on {` block. False positives across multi-host blocks are -// harmless — the helper de-duplicates by recording each volNumber -// once across the file. -func hasLateAddedVolume(resPath string, dr *intent.DesiredResource) bool { - if dr == nil { - return false - } - - body, err := os.ReadFile(resPath) - if err != nil { - // No .res yet → cold start, existing firstActivation path - // will create metadata for every volume via the standard - // chain. Late-VD signal is "old file exists with fewer - // volumes", not "no file at all". - return false - } - - rendered := map[int32]struct{}{} - - for line := range strings.SplitSeq(string(body), "\n") { - trimmed := strings.TrimSpace(line) - if !strings.HasPrefix(trimmed, "volume ") { - continue - } - - rest := strings.TrimPrefix(trimmed, "volume ") - head, _, ok := strings.Cut(rest, "{") - - if !ok { - continue - } - - num, parseErr := strconv.ParseInt(strings.TrimSpace(head), 10, 32) - if parseErr != nil { - continue - } - - rendered[int32(num)] = struct{}{} - } - - for _, vol := range dr.GetVolumes() { - if _, ok := rendered[vol.GetVolumeNumber()]; !ok { - return true - } - } - - return false -} - func extractResFilePeerNodeIDs(resPath string) map[string]int32 { body, err := os.ReadFile(resPath) if err != nil { @@ -2135,15 +2070,28 @@ func (r *Reconciler) applyDRBD(ctx context.Context, dr *intent.DesiredResource, // upstream LINSTOR DrbdLayer.adjustResource: per-volume // hasMetaData probe, per-volume createMd for those that lack it. // - // Scope: NARROW to the "late-VD added" signal — desired Volumes[] - // count strictly exceeds the OLD .res's `volume N {` block count. - // Without this narrowing, the steady-state path would shell out - // `drbdadm dump-md` on every reconcile, perturbing existing tests - // that pin "no metadata work on retry" (e.g. mid-Apply abort - // scenarios) and adding shell cost on every converged pass. - // Skipped on diskless replicas (no lower disk to stamp) and on - // the diskless→diskful flip path (Bug 319 owns that re-stamp). - if !diskless && hasLateAddedVolume(resPath, dr) && + // Scope: gate on "this RD is already past first activation" + // (MetadataCreated Condition / .md-created marker) rather than the + // OLD .res's `volume N {` block count. + // + // BUG-048 (P1, availability): the previous `.res`-count gate raced + // the FSM dispatch's renderResFile preamble. On a concurrent late + // vd-add the FSM `adjust` of an EARLIER reconcile already re-rendered + // `.res` to include the new volume's `volume N {` block BEFORE this + // gate read the file, so hasLateAddedVolume(resPath) flipped FALSE, + // ensurePerVolumeMetadata was SKIPPED, and the new volume came up via + // the kernel adjust with NO metadata + NO winner-election GI seed — + // Inconsistent on every replica, no SyncSource, permanently wedged + // (~half of concurrent two-VD adds). The on-disk metadata, not the + // rendered .res, is the race-free truth: ensurePerVolumeMetadata + // probes `drbdadm dump-md` per volume and only creates+seeds the ones + // that actually lack metadata (len(freshlyCreated)==0 → early return), + // so running it on every post-activation diskful reconcile is + // idempotent and converged passes pay only the cheap per-volume + // dump-md probe. Skipped on diskless replicas (no lower disk to + // stamp) and on the diskless→diskful flip path (Bug 319 owns that + // re-stamp). + if !diskless && dr.GetMetadataCreated() && !r.isDisklessToDiskfulFlip(ctx, dr, diskless) { err = r.ensurePerVolumeMetadata(ctx, dr, devices, diskless) if err != nil { diff --git a/pkg/satellite/reconciler_drbd_test.go b/pkg/satellite/reconciler_drbd_test.go index e848f1d5..445863cb 100644 --- a/pkg/satellite/reconciler_drbd_test.go +++ b/pkg/satellite/reconciler_drbd_test.go @@ -3026,6 +3026,122 @@ func TestApplyDRBDAllocatesBackingForLateAddedVolume(t *testing.T) { } } +// TestApplyLateAddedVolumePerVolumeMetadataRunsDespitePreRenderedRes pins +// BUG-048 (P1, availability): the per-volume create-md + winner-seed pass +// MUST fire for a late-added volume EVEN WHEN the on-disk .res already +// carries the new volume's `volume N {` block. +// +// Root cause: the gate that admitted ensurePerVolumeMetadata used to be +// hasLateAddedVolume(resPath, dr) — a comparison of the desired volume +// set against the `volume N {` blocks rendered in the OLD .res. But the +// FSM dispatch runs renderResFile as a preamble for every adjust/up, so +// on a concurrent late vd-add an EARLIER reconcile's adjust re-rendered +// .res to include the new volume's block BEFORE this gate read it. The +// gate then saw "no late volume" → SKIPPED create-md + the winner seed +// → the new volume came up via the kernel adjust with no metadata and no +// elected SyncSource → Inconsistent on every replica, permanently +// wedged. The gate now keys off the race-free MetadataCreated signal +// (this RD is past first activation), and the idempotent per-volume +// HasMD probe inside ensurePerVolumeMetadata is the sole authority for +// which volumes actually need create-md. +// +// Distinguishing factor vs TestApplyDRBDAllocatesBackingForLateAddedVolume: +// here the seeded .res ALREADY contains `volume 1 { ... }` (simulating +// the pre-render race). Under the old .res-count gate this Apply did +// NOTHING for vol-1; under the fix it still create-md's + seeds it. +func TestApplyLateAddedVolumePerVolumeMetadataRunsDespitePreRenderedRes(t *testing.T) { + dir := t.TempDir() + fx := storage.NewFakeExec() + + fx.Expect("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings -o lv_name vg/pvc-race_00000", + storage.FakeResponse{Stdout: []byte("pvc-race_00000\n")}) + fx.Expect("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings --separator | -o lv_path,lv_size --units k --nosuffix vg/pvc-race_00000", + storage.FakeResponse{Stdout: []byte("/dev/vg/pvc-race_00000|1048576\n")}) + fx.Expect("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings -o lv_name vg/pvc-race_00001", + storage.FakeResponse{Stdout: []byte("pvc-race_00001\n")}) + fx.Expect("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings --separator | -o lv_path,lv_size --units k --nosuffix vg/pvc-race_00001", + storage.FakeResponse{Stdout: []byte("/dev/vg/pvc-race_00001|1048576\n")}) + + // vol-0 already has metadata; vol-1 does NOT (the late-added volume). + fx.Expect("drbdadm dump-md pvc-race/0", + storage.FakeResponse{Stdout: []byte("version \"v09\";\nla-size-sect 2048;\n")}) + fx.Expect("drbdadm dump-md pvc-race/1", + storage.FakeResponse{Err: errDrbdadmDumpMdNoMeta}) + + // The fix MUST still create-md vol-1 despite the pre-rendered .res. + fx.Expect(fmt.Sprintf("drbdadm create-md --force --max-peers=%d pvc-race/1", drbd.MaxPeers-1), + storage.FakeResponse{}) + + fx.Expect("drbdsetup status pvc-race", + storage.FakeResponse{Stdout: []byte("pvc-race role:Secondary\n volume:0 disk:UpToDate\n")}) + fx.Expect("drbdsetup status --verbose pvc-race", + storage.FakeResponse{Stdout: []byte("pvc-race role:Secondary\n volume:0 disk:UpToDate\n")}) + fx.Expect("drbdadm adjust pvc-race", storage.FakeResponse{}) + + thin := lvm.NewThin(lvm.ThinConfig{VolumeGroup: "vg", ThinPool: "tp"}, fx) + rec := satellite.NewReconciler(satellite.ReconcilerConfig{ + Providers: map[string]storage.Provider{"thin1": thin}, + Adm: drbd.NewAdm(fx), + StateDir: dir, + NodeName: "n1", + }) + + // THE RACE: the .res ALREADY carries volume 1 (the FSM render-preamble + // of a prior reconcile pre-populated it). The old hasLateAddedVolume + // gate would see both blocks present and skip create-md for vol-1. + resPath := filepath.Join(dir, "pvc-race.res") + preRendered := "resource pvc-race {\n" + + " on n1 {\n" + + " volume 0 {\n }\n" + + " volume 1 {\n }\n" + + " }\n}\n" + if err := os.WriteFile(resPath, []byte(preRendered), 0o600); err != nil { + t.Fatalf("seed .res: %v", err) + } + if err := os.WriteFile(filepath.Join(dir, "pvc-race.md-created"), nil, 0o600); err != nil { + t.Fatalf("seed md-created: %v", err) + } + + dr := []*intent.DesiredResource{ + { + Name: "pvc-race", + NodeName: "n1", + MetadataCreated: true, + Volumes: []*intent.DesiredVolume{ + {VolumeNumber: 0, SizeKib: 1024 * 1024, StoragePool: "thin1"}, + {VolumeNumber: 1, SizeKib: 1024 * 1024, StoragePool: "thin1"}, + }, + SkipInitialSync: skipInitTrue(), + DrbdOptions: map[string]string{ + "port": "7000", "node-id": "0", "address": "10.0.0.1", "minor": "1000", + }, + }, + } + + results, err := rec.Apply(t.Context(), dr) + if err != nil { + t.Fatalf("Apply: %v", err) + } + if len(results) != 1 || !results[0].GetOk() { + t.Fatalf("Apply: expected Ok=true; got results=%+v", results) + } + + calls := fx.CommandLines() + + wantCreateMD := fmt.Sprintf("drbdadm create-md --force --max-peers=%d pvc-race/1", drbd.MaxPeers-1) + if !slices.Contains(calls, wantCreateMD) { + t.Errorf("BUG-048: per-volume create-md for vol-1 SKIPPED despite pre-rendered .res — "+ + "the late-added volume gets no metadata + no winner seed and wedges Inconsistent; "+ + "want %q in %v", wantCreateMD, calls) + } + + // vol-0 must NOT be re-created (would wipe its GI + bitmap). + forbidden := fmt.Sprintf("drbdadm create-md --force --max-peers=%d pvc-race/0", drbd.MaxPeers-1) + if slices.Contains(calls, forbidden) { + t.Errorf("re-ran create-md on vol-0 (would wipe operator-stamped metadata): %v", calls) + } +} + // TestApplyLateAddedVolumeWinnerSeedsUpToDate pins Bug 384 (P0, data // integrity — regression of Bug 79/332): when `linstor vd c 1G` // adds vol-1 to a multi-replica RD whose vol-0 already reached From 10394f7ab4a99b8a7aa5c96212a9220613eed92d Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Sun, 14 Jun 2026 11:23:28 +0300 Subject: [PATCH 04/19] fix(store,apiserver): read RD live (uncached) in CreateAutoNumbered (BUG-048) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The store-level atomic VolumeNumber allocation still dropped the second of two concurrent `vd c` in production: fetchRD reads through the manager's informer-CACHED client, which trails a just-committed write by the watch round-trip. So when the winner's Update lands, the loser's conflict-retry re-reads the STALE cached RD, re-derives the same smallest-free number, its Update 409s again, and retry.DefaultRetry (~5 short steps) exhausts before the cache catches up — the second volume is silently lost (observed on stand: vd_count=2 after two concurrent number-less `vd c`). CreateAutoNumbered now reads the RD through a direct, UNCACHED API reader (mgr.GetAPIReader(), plumbed via store NewWithAPIReader) on every retry attempt, so each retry sees the winner's committed VolumeDefinition and advances to the next free number. The retry budget is widened to 12 steps so a burst of concurrent creates still converges. nil apiReader (in-memory / unit stores with no informer) falls back to the cached client unchanged. Adds a k8s-store concurrency conformance test (N concurrent CreateAutoNumbered against a real envtest apiserver → distinct numbers, none dropped). Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- cmd/apiserver/main.go | 9 +++- pkg/store/k8s/k8s.go | 14 ++++- pkg/store/k8s/k8s_test.go | 79 +++++++++++++++++++++++++++++ pkg/store/k8s/volume_definitions.go | 52 ++++++++++++++++++- 4 files changed, 150 insertions(+), 4 deletions(-) diff --git a/cmd/apiserver/main.go b/cmd/apiserver/main.go index 75faad58..1ca7246e 100644 --- a/cmd/apiserver/main.go +++ b/cmd/apiserver/main.go @@ -257,7 +257,14 @@ func main() { // CRD-backed store is the only supported persistence layer // post-Phase-11 — the apiserver/controller split made // in-process state pointless across replicas. - st := storek8s.New(mgr.GetClient()) + // + // BUG-048: pass the manager's direct (uncached) API reader so the + // atomic VolumeNumber allocation re-reads live RD state on each + // conflict-retry. With only the informer-cached client, two + // concurrent `vd c` against one RD both retry against a stale cache, + // re-derive the same number, exhaust the retry budget, and silently + // drop the second volume. + st := storek8s.NewWithAPIReader(mgr.GetClient(), mgr.GetAPIReader()) ready := newReadyState() diff --git a/pkg/store/k8s/k8s.go b/pkg/store/k8s/k8s.go index b133a5c2..42ef7688 100644 --- a/pkg/store/k8s/k8s.go +++ b/pkg/store/k8s/k8s.go @@ -63,13 +63,25 @@ type Store struct { // New wraps a controller-runtime client and returns a store.Store. func New(c ctrlclient.Client) *Store { + return NewWithAPIReader(c, nil) +} + +// NewWithAPIReader is New plus a direct (uncached) API reader. The +// reader is used ONLY where a cache-lag read would be incorrect — the +// BUG-048 atomic VolumeNumber allocation, where retrying an optimistic- +// lock conflict against a stale informer cache re-derives the SAME +// number and the create-loop never converges (the second of two +// concurrent `vd c` is dropped). Pass mgr.GetAPIReader() in production; +// nil is accepted and every path falls back to the cached client +// (in-memory / unit harnesses that have no informer). +func NewWithAPIReader(c ctrlclient.Client, apiReader ctrlclient.Reader) *Store { s := &Store{c: c} s.nodes = &nodes{c: c} s.storagePools = &storagePools{c: c} s.resourceGroups = &resourceGroups{c: c} s.resourceDefinitions = &resourceDefinitions{c: c} s.resources = &resources{c: c} - s.volumeDefinitions = &volumeDefinitions{c: c} + s.volumeDefinitions = &volumeDefinitions{c: c, apiReader: apiReader} s.snapshots = &snapshots{c: c} s.physicalDevices = &physicalDevices{c: c} s.controllerProps = &controllerProps{c: c} diff --git a/pkg/store/k8s/k8s_test.go b/pkg/store/k8s/k8s_test.go index dd731354..7d7c6f57 100644 --- a/pkg/store/k8s/k8s_test.go +++ b/pkg/store/k8s/k8s_test.go @@ -23,6 +23,7 @@ import ( "os" "path/filepath" "runtime" + "sync" "testing" "k8s.io/apimachinery/pkg/runtime/schema" @@ -31,6 +32,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/envtest" crdv1alpha1 "github.com/cozystack/blockstor/api/v1alpha1" + apiv1 "github.com/cozystack/blockstor/pkg/api/v1" "github.com/cozystack/blockstor/pkg/store" "github.com/cozystack/blockstor/pkg/store/k8s" "github.com/cozystack/blockstor/pkg/store/storetest" @@ -164,6 +166,83 @@ func TestK8sVolumeDefinitionStore(t *testing.T) { }) } +// TestK8sVolumeDefinitionConcurrentAutoNumber is the BUG-048 (P1, +// availability) regression at the store layer against a REAL apiserver: +// N goroutines each call CreateAutoNumbered on the same RD concurrently. +// Every call must succeed and land at a DISTINCT VolumeNumber — the +// allocate-inside-RetryOnConflict loop must converge under genuine +// optimistic-lock 409s rather than dropping the racing creates. +// +// envtest's client is a direct (uncached) reader, so this exercises the +// retry-on-409 convergence; the cache-lag half of BUG-048 (which needs +// the informer-cached client + GetAPIReader fallback) is covered by the +// live-stand cli-matrix cell. Together they pin both halves. +func TestK8sVolumeDefinitionConcurrentAutoNumber(t *testing.T) { + if fixture == nil { + t.Skip("envtest assets not installed; run `make setup-envtest` to enable") + } + + t.Cleanup(func() { wipeAll(t, fixture.client) }) + + st := k8s.New(fixture.client) + ctx := context.Background() + + if err := st.ResourceDefinitions().Create(ctx, &apiv1.ResourceDefinition{Name: "pvc-conc"}); err != nil { + t.Fatalf("seed RD: %v", err) + } + + const n = 6 + + var ( + wg sync.WaitGroup + mu sync.Mutex + errs []error + ) + + for range n { + wg.Add(1) + + go func() { + defer wg.Done() + + vd := apiv1.VolumeDefinition{SizeKib: 4096} + if _, err := st.VolumeDefinitions().CreateAutoNumbered(ctx, "pvc-conc", &vd); err != nil { + mu.Lock() + errs = append(errs, err) + mu.Unlock() + } + }() + } + + wg.Wait() + + for _, e := range errs { + t.Errorf("concurrent CreateAutoNumbered failed (a volume was dropped): %v", e) + } + + vds, err := st.VolumeDefinitions().List(ctx, "pvc-conc") + if err != nil { + t.Fatalf("List: %v", err) + } + if len(vds) != n { + t.Fatalf("got %d VDs after %d concurrent auto-numbered creates, want %d (lost update)", len(vds), n, n) + } + + seen := map[int32]bool{} + for i := range vds { + if seen[vds[i].VolumeNumber] { + t.Fatalf("duplicate VolumeNumber %d", vds[i].VolumeNumber) + } + seen[vds[i].VolumeNumber] = true + } + + for want := range int32(n) { + if !seen[want] { + t.Errorf("missing VolumeNumber %d in %v — a concurrent create was dropped", want, seen) + } + } +} + // TestK8sSnapshotStore runs the shared SnapshotStore suite. func TestK8sSnapshotStore(t *testing.T) { if fixture == nil { diff --git a/pkg/store/k8s/volume_definitions.go b/pkg/store/k8s/volume_definitions.go index 07e610cc..80aafad3 100644 --- a/pkg/store/k8s/volume_definitions.go +++ b/pkg/store/k8s/volume_definitions.go @@ -39,6 +39,18 @@ import ( // RD anyway, and a single CRD makes ownership/reclamation trivially correct. type volumeDefinitions struct { c ctrlclient.Client + + // apiReader is a direct, UNCACHED reader (mgr.GetAPIReader()) used + // only by CreateAutoNumbered's retry loop. The cached client's Get + // trails a just-committed write by the informer round-trip, so + // retrying an optimistic-lock conflict against the cache re-reads + // the stale RD, re-derives the SAME smallest-free VolumeNumber, and + // the loop never converges — the BUG-048 lost-update where the + // second of two concurrent `vd c` is dropped. Reading live on each + // attempt makes the allocation see the winner's committed write. + // nil in non-production stores (in-memory / unit) → fall back to the + // cached client. + apiReader ctrlclient.Reader } func (s *volumeDefinitions) List(ctx context.Context, rdName string) ([]apiv1.VolumeDefinition, error) { @@ -130,8 +142,17 @@ func (s *volumeDefinitions) CreateAutoNumbered(ctx context.Context, rdName strin var assigned int32 - err := retry.OnError(retry.DefaultRetry, isConflictOrNotFound, func() error { - rd, fetchErr := s.fetchRD(ctx, rdName) + // Generous retry budget: each attempt does a LIVE (uncached) RD read + // so a conflict-retry sees the racing winner's committed VD and lands + // at the next free number. retry.DefaultRetry (≈5 steps) is enough + // for the common 2-way race, but widen it so a burst of concurrent + // `vd c` against one RD still converges rather than dropping the + // straggler. + backoff := retry.DefaultRetry + backoff.Steps = 12 + + err := retry.OnError(backoff, isConflictOrNotFound, func() error { + rd, fetchErr := s.fetchRDLive(ctx, rdName) if fetchErr != nil { return fetchErr } @@ -312,6 +333,33 @@ func (s *volumeDefinitions) fetchRD(ctx context.Context, rdName string) (*crdv1a return &rd, nil } +// fetchRDLive reads the RD through the direct, UNCACHED API reader when +// one is wired (production), so CreateAutoNumbered's retry sees the +// latest committed VolumeDefinitions rather than a stale informer-cache +// revision (BUG-048). Falls back to the cached client when no apiReader +// is configured (in-memory / unit harnesses). The returned object is +// still safe to mutate + write back through s.c (the resourceVersion the +// live read carries is what the subsequent optimistic-locked Update +// checks against). +func (s *volumeDefinitions) fetchRDLive(ctx context.Context, rdName string) (*crdv1alpha1.ResourceDefinition, error) { + if s.apiReader == nil { + return s.fetchRD(ctx, rdName) + } + + var rd crdv1alpha1.ResourceDefinition + + err := s.apiReader.Get(ctx, types.NamespacedName{Name: Name(rdName)}, &rd) + if err != nil { + if apierrors.IsNotFound(err) { + return nil, errors.Wrapf(store.ErrNotFound, "resource definition %q", rdName) + } + + return nil, errors.Wrapf(err, "get ResourceDefinition %q (live)", rdName) + } + + return &rd, nil +} + func crdToWireVD(vd *crdv1alpha1.ResourceDefinitionVolume) apiv1.VolumeDefinition { return apiv1.VolumeDefinition{ VolumeNumber: vd.VolumeNumber, From 5f760a07b85e59c7064952e226c62fe2f5c3febf Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Sun, 14 Jun 2026 11:57:03 +0300 Subject: [PATCH 05/19] test(harness): widen concurrent late-vd convergence budget to 360s (BUG-048) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two 1G volumes added concurrently on a 3-diskful RD run up to 6 fresh initial syncs at once on the loop substrate, so convergence routinely exceeds the single-volume 240s budget (observed vol-2 still Outdated — converging, not the Inconsistent wedge — at 240s, UpToDate moments later). 360s removes the flake while staying far inside the wedge discriminator. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- .../cli-matrix/multi-volume-late-vd-create.sh | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/e2e/cli-matrix/multi-volume-late-vd-create.sh b/tests/e2e/cli-matrix/multi-volume-late-vd-create.sh index 9e82209c..ec7f59ef 100755 --- a/tests/e2e/cli-matrix/multi-volume-late-vd-create.sh +++ b/tests/e2e/cli-matrix/multi-volume-late-vd-create.sh @@ -184,20 +184,26 @@ if (( vdc_rc != 0 )); then echo " note: a concurrent vd c exited non-zero but both volumes were created" >&2 fi -# Each late-added 1G volume needs its own initial sync on every replica; -# under sweep load that takes well past the old 60s budget, so give the -# late volumes the same 240s headroom vol-0 gets. -echo ">> wait up to 240s for vol-1 + vol-2 to reach UpToDate on all 3 replicas" +# Each late-added 1G volume needs its own initial sync on every replica. +# Here TWO 1G volumes are added concurrently on a 3-diskful RD, so up to +# 6 fresh (replica,volume) initial syncs run at once and share the loop +# substrate's I/O — convergence routinely runs past the single-volume +# 240s budget (observed: vol-2 still `Outdated` — i.e. converging, NOT +# the wedge's `Inconsistent` — at the 240s mark, UpToDate moments later). +# 360s per volume keeps the concurrent path from flaking while still +# being far inside the wedge discriminator (a real BUG-048 wedge sits +# Inconsistent forever with no SyncSource and never advances). +echo ">> wait up to 360s for vol-1 + vol-2 to reach UpToDate on all 3 replicas" late_up=true for vol in 1 2; do - if ! wait_all_replicas_volume_uptodate "$vol" 240; then + if ! wait_all_replicas_volume_uptodate "$vol" 360; then late_up=false break fi done if [[ "$late_up" != "true" ]]; then - echo "FAIL (Bug 332): late-added vol-1/vol-2 not UpToDate on all 3 replicas within 240s" >&2 + echo "FAIL (Bug 332): late-added vol-1/vol-2 not UpToDate on all 3 replicas within 360s" >&2 "${LCTL[@]}" resource list --resources "$RD" 2>&1 | tail -40 >&2 # Surface the smoking gun: per (node, vol) disk_state straight from # the populated Resource.Status — a Bug-332-bitten path shows vol-1 From 5f0a444c89f6c32bae4ddd3c0f6203e64dc0d281 Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Sun, 14 Jun 2026 12:12:18 +0300 Subject: [PATCH 06/19] =?UTF-8?q?fix(satellite,drbd):=20late-add-promote?= =?UTF-8?q?=20self-heal=20for=20=E2=89=A53-replica=20wedge=20(BUG-048)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The race-free metadata gate eliminated the 2-replica concurrent late-add wedge, but on a ≥3-diskful RD the per-volume winner seed can still race its own bring-up: the late volume occasionally comes up Inconsistent on EVERY diskful replica with no SyncSource and no peer holding data — the operator-reported wedge (replication:Established, resync-suspended:no, no SyncSource). Neither existing self-heal recovers it: maybeRecovery Promote is gated on the dispatcher's auto-primary (suppressed on the initialized RD every late-add lands on) and on the local being already UpToDate (it is Inconsistent here); maybeSoloPromote requires zero peers. Add a dedicated kernel-truth self-heal. Adm.NeedsLateAddPromote fires ONLY when a local diskful volume is Inconsistent, NO connected peer exposes committed data for it (no real SyncSource to wait for), none is actively resyncing, no replica is Primary, and this node holds the lowest node-id among the volume's diskful replicas — so exactly one deterministic node force-primaries. maybeLateAddPromote runs it on the steady-state reconcile WITHOUT the auto-primary gate (that is precisely what is missing on the late-add path), throttled by the shared recoveryPromoteThrottle. Data-safety: a fresh late-added volume is seeded at the deterministic day0 current-UUID on every replica before bring-up, so primary --force mints no unrelated UUID; the Inconsistent peers SyncTarget from this now-Primary source. The peer-has-data veto preserves the Bug 342 unrelated-data guard (a relocate target never force-primaries over a data-bearing peer), and the active-resync + peer-Primary vetoes prevent churn. Self-limiting: the predicate stops holding once the peers converge. Unit-pinned in needs_late_add_promote_test.go (fires on the wedge; vetoes on peer-data / active-resync / peer-Primary / lower-id / all-UpToDate). Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- pkg/drbd/drbdadm.go | 147 ++++++++++++++++++++ pkg/drbd/needs_late_add_promote_test.go | 173 ++++++++++++++++++++++++ pkg/satellite/reconciler.go | 56 ++++++++ 3 files changed, 376 insertions(+) create mode 100644 pkg/drbd/needs_late_add_promote_test.go diff --git a/pkg/drbd/drbdadm.go b/pkg/drbd/drbdadm.go index bdcfab03..3c735f54 100644 --- a/pkg/drbd/drbdadm.go +++ b/pkg/drbd/drbdadm.go @@ -1374,6 +1374,153 @@ func localIsUpToDate(devices []drbdsetupStatusDevice) bool { return true } +// NeedsLateAddPromote probes the live kernel via `drbdsetup status +// --json` and reports whether THIS node must `primary --force` +// to unstick a LATE-ADDED volume that wedged Inconsistent on every +// diskful replica with no SyncSource (BUG-048, the concurrent two-VD +// add on a ≥3-diskful RD). +// +// Why a separate gate from NeedsRecoveryPromote / NeedsSoloPromote: +// - NeedsRecoveryPromote requires the LOCAL to be already UpToDate and +// a PEER Inconsistent — but in this wedge the local volume is ALSO +// Inconsistent (no replica ever won the volume's UpToDate seed), so +// that predicate can never fire. It is also gated by the dispatcher's +// `auto-primary`, which is suppressed on an INITIALIZED RD (every +// late-add lands on one) so the satellite-side maybeRecoveryPromote +// short-circuits before even probing. +// - NeedsSoloPromote requires ZERO peers; here peers exist. +// +// Returns true ONLY when ALL hold, so EXACTLY ONE deterministic node +// promotes and the force-primary is data-safe + self-limiting: +// - the kernel slot exists and reports a my-node-id; +// - the local role is NOT Primary and NO peer is Primary (never +// disturb an existing Primary that already drives the sync); +// - at least one LOCAL diskful volume is Inconsistent, and for EVERY +// such volume NO connected peer exposes committed data +// (peer-disk UpToDate / Consistent / Outdated) — i.e. there is no +// real data source anywhere, the defining late-add wedge. If any +// peer held data the correct action is to SyncTarget from it, never +// force-primary (that is the Bug 342 unrelated-data guard); +// - none of those wedged volumes is already being actively resynced +// from a peer (SyncTarget/WFBitMapT in progress) — let a live resync +// finish on its own; +// - this node's my-node-id is the LOWEST among the diskful replicas +// (local + every peer that is NOT diskless on the wedged volume), so +// a single deterministic node promotes — no split-brain race. +// +// Data-safety: a fresh late-added volume's metadata was seeded at the +// deterministic day0 current-UUID on every replica (the seed path runs +// before bring-up), so `primary --force` here mints no UNRELATED UUID; +// the Inconsistent peers simply SyncTarget from this now-Primary source. +// Self-limiting: once the peers reach UpToDate they are no longer +// Inconsistent and the predicate stops holding. Conservative on any +// probe/parse failure → false (a missed promote just retries next pass). +func (a *Adm) NeedsLateAddPromote(ctx context.Context, resource string) bool { + out, err := a.exec.Run(ctx, "drbdsetup", "status", resource, "--json") + if err != nil { + return false + } + + var status drbdsetupStatusRoot + + err = json.Unmarshal(out, &status) + if err != nil || len(status) == 0 || status[0].NodeID == nil { + return false + } + + res := status[0] + + // Never disturb an existing Primary anywhere in the RD. + if Role(res.Role).IsPrimary() { + return false + } + + for _, conn := range res.Connections { + if Role(conn.PeerRole).IsPrimary() { + return false + } + } + + // Which local diskful volumes are wedged Inconsistent? + wedged := localInconsistentVolumes(res.Devices) + if len(wedged) == 0 { + return false + } + + // For every wedged volume: no peer may hold data (else SyncTarget), + // none may be actively resyncing (else let it finish), and we must be + // the lowest node-id among its non-diskless diskful replicas. + for vol := range wedged { + ok := lateAddVolumeNeedsLocalPromote(res.Connections, vol, *res.NodeID) + if !ok { + return false + } + } + + return true +} + +// localInconsistentVolumes returns the set of local volume numbers whose +// disk-state is Inconsistent. A volume in any other state (UpToDate, +// Diskless, Negotiating, …) is excluded — only a stuck-Inconsistent +// local volume is a late-add-promote candidate. +func localInconsistentVolumes(devices []drbdsetupStatusDevice) map[int32]struct{} { + out := map[int32]struct{}{} + + for _, d := range devices { + if DiskState(d.DiskState) == DiskStateInconsistent { + out[d.VolumeNumber] = struct{}{} + } + } + + return out +} + +// lateAddVolumeNeedsLocalPromote reports whether `vol` is a genuine +// late-add wedge that THIS node (myID) must force-primary: no connected +// peer exposes committed data for it, no peer is actively resyncing it, +// and myID is the lowest node-id among the volume's non-diskless diskful +// replicas. Conservative: any peer with data, any active resync, or any +// lower-id non-diskless peer returns false. +func lateAddVolumeNeedsLocalPromote(conns []drbdsetupStatusConnection, vol, myID int32) bool { + weAreLowest := true + + for _, conn := range conns { + for _, peerDev := range conn.PeerDevices { + if peerDev.VolumeNumber != vol { + continue + } + + switch DiskState(peerDev.PeerDiskState) { + case DiskStateUpToDate, DiskStateConsistent, DiskStateOutdated: + // A peer holds real data — must SyncTarget from it, never + // force-primary (Bug 342 unrelated-data guard). + return false + case DiskStateInconsistent: + // A fellow wedged diskful replica. It competes for the + // lowest-id promoter election; if it outranks us, defer. + if conn.PeerNodeID < myID { + weAreLowest = false + } + + if peerDeviceActivelySyncing(peerDev) { + // A live resync is already driving this volume — let + // it finish rather than churn a promote. + return false + } + case DiskStateDiskless, DiskStateAttaching, DiskStateDetaching, + DiskStateFailed, DiskStateNegotiating, DiskStateDUnknown: + // Diskless witness / transient — not a data source and + // not a competing diskful promoter; ignore. + default: + // Unknown/empty — ignore. + } + } + } + + return weAreLowest +} + // DownVeto is the tri-state outcome of the Bug 350 kernel-truth probe // the satellite consults before committing a `drbdadm down` on the // INACTIVE path. It separates "definitely safe to down" from "must diff --git a/pkg/drbd/needs_late_add_promote_test.go b/pkg/drbd/needs_late_add_promote_test.go new file mode 100644 index 00000000..ce34b2c5 --- /dev/null +++ b/pkg/drbd/needs_late_add_promote_test.go @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: Apache-2.0 + +/* +Copyright 2026 Cozystack contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package drbd_test + +import ( + "testing" + + "github.com/cozystack/blockstor/pkg/drbd" + "github.com/cozystack/blockstor/pkg/storage" +) + +// Regression pins for the BUG-048 late-add-promote predicate +// (NeedsLateAddPromote). It exists to unstick a LATE-ADDED volume that +// wedged Inconsistent on EVERY diskful replica with no SyncSource (the +// concurrent two-VD add on a ≥3-diskful RD): the deterministic +// lowest-node-id replica force-primaries to become the SyncSource. It +// must fire ONLY in that genuine wedge and never when a peer holds data +// (SyncTarget instead), a resync is already running, a Primary exists, +// or this node is not the lowest-id promoter. + +const lateAddPromoteKey = "drbdsetup status pvc-late --json" + +func admWithLateAddStatus(t *testing.T, json string) *drbd.Adm { + t.Helper() + + fx := storage.NewFakeExec() + fx.Responses[lateAddPromoteKey] = storage.FakeResponse{Stdout: []byte(json)} + + return drbd.NewAdm(fx) +} + +// The genuine BUG-048 wedge: vol-0/vol-1 UpToDate, the late vol-2 is +// Inconsistent locally AND on every diskful peer, no peer holds data for +// vol-2, no Primary, and we (node-id 0) are the lowest → promote. +func TestNeedsLateAddPromote_WedgedLateVolume(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":1,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":1,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"Established","resync-suspended":"no"} + ] + }] + }]`) + + if !adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatal("late vol-2 Inconsistent on every replica with no SyncSource must trigger late-add-promote") + } +} + +// Not the lowest node-id: a peer with a LOWER node-id is also wedged on +// vol-2, so it is the elected promoter — we must defer (no split-brain). +func TestNeedsLateAddPromote_DefersToLowerNodeID(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":2,"role":"Secondary", + "devices":[{"volume":2,"disk-state":"Inconsistent"}], + "connections":[{ + "peer-node-id":0,"name":"n1","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[{"volume":2,"peer-disk-state":"Inconsistent", + "replication-state":"Established","resync-suspended":"no"}] + }] + }]`) + + if adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatal("a lower-node-id wedged peer is the promoter; this node (id 2) must defer") + } +} + +// A peer already holds committed data for vol-2 (UpToDate) — the correct +// action is to SyncTarget from it, NEVER force-primary (Bug 342 +// unrelated-data guard). Must not fire. +func TestNeedsLateAddPromote_PeerHasDataVeto(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":0,"role":"Secondary", + "devices":[{"volume":2,"disk-state":"Inconsistent"}], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[{"volume":2,"peer-disk-state":"UpToDate", + "replication-state":"SyncTarget","resync-suspended":"no"}] + }] + }]`) + + if adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatal("a peer holding data for the volume must veto force-primary (SyncTarget instead)") + } +} + +// A live resync is already driving vol-2 toward UpToDate from this node — +// let it finish rather than churn a promote. +func TestNeedsLateAddPromote_ActiveResyncDefers(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":0,"role":"Secondary", + "devices":[{"volume":2,"disk-state":"Inconsistent"}], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[{"volume":2,"peer-disk-state":"Inconsistent", + "replication-state":"SyncSource","resync-suspended":"no"}] + }] + }]`) + + if adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatal("a volume already being actively resynced must NOT trigger a promote") + } +} + +// A Primary exists somewhere — it already drives the sync; never disturb. +func TestNeedsLateAddPromote_PeerPrimaryVeto(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":0,"role":"Secondary", + "devices":[{"volume":2,"disk-state":"Inconsistent"}], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Primary", + "peer_devices":[{"volume":2,"peer-disk-state":"Inconsistent", + "replication-state":"Established","resync-suspended":"no"}] + }] + }]`) + + if adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatal("a Primary peer must veto the late-add-promote") + } +} + +// No Inconsistent local volume — nothing to promote. Must not fire (this +// is the healthy steady state every converged reconcile passes through). +func TestNeedsLateAddPromote_AllUpToDateNoOp(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":1,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"UpToDate"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"} + ] + }] + }]`) + + if adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatal("a fully-converged RD must NOT trigger any promote") + } +} diff --git a/pkg/satellite/reconciler.go b/pkg/satellite/reconciler.go index 485616df..3999f7e2 100644 --- a/pkg/satellite/reconciler.go +++ b/pkg/satellite/reconciler.go @@ -2509,6 +2509,17 @@ func (r *Reconciler) maybePromoteSelfHeals(ctx context.Context, dr *intent.Desir return err } + // BUG-048 late-add-promote self-heal — drive a late-added volume that + // wedged Inconsistent on EVERY diskful replica (no SyncSource, no peer + // holds data) to UpToDate. Distinct from maybeRecoveryPromote: that + // path needs the local already-UpToDate + the dispatcher's + // auto-primary, BOTH false for a late-add to an initialized RD. See + // maybeLateAddPromote / Adm.NeedsLateAddPromote. + err = r.maybeLateAddPromote(ctx, dr, diskless) + if err != nil { + return err + } + // Solo diskless→diskful toggle self-heal — force-promote a lone, // peerless diskful replica wedged below UpToDate. See maybeSoloPromote // for the full why (the auto-primary suppression × offline-safety @@ -2668,6 +2679,51 @@ func (r *Reconciler) maybeRecoveryPromote(ctx context.Context, dr *intent.Desire return r.runAutoPromote(ctx, dr) } +// maybeLateAddPromote is the BUG-048 self-heal for a late-added volume +// that wedged Inconsistent on EVERY diskful replica with no SyncSource. +// +// Why it cannot reuse maybeRecoveryPromote: that path is gated on the +// dispatcher's `auto-primary` (autoPrimaryReplica), which is suppressed +// on an INITIALIZED RD — and a late `vd c` ALWAYS lands on an +// initialized RD. Its kernel-truth predicate (NeedsRecoveryPromote) also +// requires the local replica to be already UpToDate; in this wedge the +// local late-added volume is itself Inconsistent. So neither the gate +// nor the predicate can ever fire for the late-add wedge. +// +// NeedsLateAddPromote is the dedicated kernel-truth gate: it fires ONLY +// when a local diskful volume is Inconsistent, NO connected peer holds +// committed data for it (so there is no real SyncSource to wait for), +// no replica is Primary, and this node is the deterministic lowest +// node-id — so exactly one node force-primaries and it is data-safe +// (every replica shares the volume's day0 current-UUID; primary --force +// mints no unrelated UUID, and the Inconsistent peers SyncTarget from +// this now-Primary source). Self-limiting: once the peers converge the +// predicate stops holding. +// +// NOT gated on autoPromote/autoPrimaryReplica by design — those are +// exactly what is missing on the late-add path. It IS gated on the same +// recoveryPromoteThrottle so a still-converging resync isn't churned by +// repeated promotes. Skipped on diskless replicas (no disk to promote) +// and when Adm is unwired (storage-only unit tests). +func (r *Reconciler) maybeLateAddPromote(ctx context.Context, dr *intent.DesiredResource, diskless bool) error { + if diskless || r.cfg.Adm == nil { + return nil + } + + if !r.cfg.Adm.NeedsLateAddPromote(ctx, dr.GetName()) { + return nil + } + + if !r.recoveryPromoteDue(dr.GetName()) { + return nil + } + + log.FromContext(ctx).Info("BUG-048 late-add-promote: force-primary to seed SyncSource for late-added volume wedged Inconsistent on every replica", + "resource", dr.GetName()) + + return r.runAutoPromote(ctx, dr) +} + // recoveryPromoteDue reports whether enough time has elapsed since this // node last fired a recovery-promote for `name` to fire another, and // records the fire time when it returns true. Serialised by r.mu so From ab4ce73e92090b75fd27011c5b081111b71599cd Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Sun, 14 Jun 2026 12:14:05 +0300 Subject: [PATCH 07/19] test(satellite): pin maybeLateAddPromote self-heal wiring (BUG-048) Reconciler-level pins that maybeLateAddPromote force-primaries the lowest-id replica on the late-add wedge WITHOUT an auto-primary flag, skips diskless replicas, and vetoes when a peer holds data. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- pkg/satellite/maybe_late_add_promote_test.go | 134 +++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 pkg/satellite/maybe_late_add_promote_test.go diff --git a/pkg/satellite/maybe_late_add_promote_test.go b/pkg/satellite/maybe_late_add_promote_test.go new file mode 100644 index 00000000..68e5a9ce --- /dev/null +++ b/pkg/satellite/maybe_late_add_promote_test.go @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: Apache-2.0 + +/* +Copyright 2026 Cozystack contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package satellite + +import ( + "context" + "slices" + "testing" + + "github.com/cozystack/blockstor/pkg/drbd" + "github.com/cozystack/blockstor/pkg/satellite/intent" + "github.com/cozystack/blockstor/pkg/storage" +) + +// Regression pins for the BUG-048 late-add-promote self-heal wiring. +// maybeLateAddPromote must force-primary the lowest-node-id replica when +// a late-added volume wedged Inconsistent on every diskful replica with +// no SyncSource — WITHOUT the dispatcher's auto-primary (which is absent +// on the initialized RD every late-add lands on) — and must NOT promote +// a diskless replica. + +const lateAddStatusKey = "drbdsetup status pvc-la --json" + +// vol-0/vol-1 UpToDate, the late vol-2 Inconsistent locally and on the +// peer, no peer data for vol-2, no Primary, local is the lowest node-id. +const lateAddStatusWedged = `[{ + "name":"pvc-la","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":1,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":1,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"Established","resync-suspended":"no"} + ] + }] +}]` + +func lateAddDR() *intent.DesiredResource { + return &intent.DesiredResource{ + Name: "pvc-la", + NodeName: "n1", + Volumes: []*intent.DesiredVolume{ + {VolumeNumber: 0, SizeKib: 1024 * 1024, StoragePool: "thin1"}, + {VolumeNumber: 1, SizeKib: 1024 * 1024, StoragePool: "thin1"}, + {VolumeNumber: 2, SizeKib: 1024 * 1024, StoragePool: "thin1"}, + }, + Peers: []intent.DesiredPeer{{Name: "n2"}}, + DrbdOptions: map[string]string{ + "port": "7000", "node-id": "0", "address": "10.0.0.1", "minor": "1000", + }, + } +} + +// The wedge fires the force-primary (and the demote after) even though +// no auto-primary flag was ever stamped — the whole point of the gate. +func TestMaybeLateAddPromote_FiresForWedgedLateVolume(t *testing.T) { + fx := storage.NewFakeExec() + fx.Expect(lateAddStatusKey, storage.FakeResponse{Stdout: []byte(lateAddStatusWedged)}) + + rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n1"}) + + if err := rec.maybeLateAddPromote(context.Background(), lateAddDR(), false); err != nil { + t.Fatalf("maybeLateAddPromote: %v", err) + } + + cmds := fx.CommandLines() + if !slices.Contains(cmds, "drbdadm primary --force pvc-la") { + t.Errorf("expected `drbdadm primary --force pvc-la`, got: %v", cmds) + } + if !slices.Contains(cmds, "drbdadm secondary pvc-la") { + t.Errorf("expected `drbdadm secondary pvc-la` (demote after promote), got: %v", cmds) + } +} + +// A diskless replica has no disk to promote — skip outright. +func TestMaybeLateAddPromote_SkipsDiskless(t *testing.T) { + fx := storage.NewFakeExec() + fx.Expect(lateAddStatusKey, storage.FakeResponse{Stdout: []byte(lateAddStatusWedged)}) + + rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n1"}) + + if err := rec.maybeLateAddPromote(context.Background(), lateAddDR(), true); err != nil { + t.Fatalf("maybeLateAddPromote: %v", err) + } + + if slices.Contains(fx.CommandLines(), "drbdadm primary --force pvc-la") { + t.Errorf("diskless replica must NOT be promoted, got: %v", fx.CommandLines()) + } +} + +// A peer holding committed data for the volume vetoes the promote — the +// replica must SyncTarget from it instead (Bug 342 unrelated-data guard). +func TestMaybeLateAddPromote_SkipsWhenPeerHasData(t *testing.T) { + fx := storage.NewFakeExec() + fx.Expect(lateAddStatusKey, storage.FakeResponse{Stdout: []byte(`[{ + "name":"pvc-la","node-id":0,"role":"Secondary", + "devices":[{"volume":2,"disk-state":"Inconsistent"}], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[{"volume":2,"peer-disk-state":"UpToDate","replication-state":"SyncTarget","resync-suspended":"no"}] + }] + }]`)}) + + rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n1"}) + + if err := rec.maybeLateAddPromote(context.Background(), lateAddDR(), false); err != nil { + t.Fatalf("maybeLateAddPromote: %v", err) + } + + if slices.Contains(fx.CommandLines(), "drbdadm primary --force pvc-la") { + t.Errorf("must NOT force-primary when a peer holds data, got: %v", fx.CommandLines()) + } +} From ec08de803598378032ae0573e156b0b3fe407bdd Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Sun, 14 Jun 2026 12:52:27 +0300 Subject: [PATCH 08/19] fix(store): verify volume landed after auto-numbered create (BUG-048) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Defence-in-depth against the silent lost-update. The optimistic-lock Update normally guarantees no racing writer clobbered the allocation — a stale resourceVersion 409s and the loop retries. But under heavy apiserver/etcd load a follower read can return a resourceVersion that is already superseded yet still accepted on Update, so two concurrent auto-numbered creates can both return success while only one VolumeDefinition lands (the operator-visible silent drop: both `linstor vd c` exit 0, the RD has one fewer volume than asked). After the Update commits, re-read live and confirm the assigned VolumeNumber is actually present; if it vanished, surface a synthetic Conflict so the whole allocate+append re-runs against the corrected state. The create now either persists the volume or retries — it can never return success having silently lost it. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- pkg/store/k8s/volume_definitions.go | 46 ++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/pkg/store/k8s/volume_definitions.go b/pkg/store/k8s/volume_definitions.go index 80aafad3..0a67cda0 100644 --- a/pkg/store/k8s/volume_definitions.go +++ b/pkg/store/k8s/volume_definitions.go @@ -24,6 +24,7 @@ import ( "github.com/cockroachdb/errors" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/util/retry" ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" @@ -164,7 +165,25 @@ func (s *volumeDefinitions) CreateAutoNumbered(ctx context.Context, rdName strin rd.Spec.VolumeDefinitions = append(rd.Spec.VolumeDefinitions, entry) - return s.c.Update(ctx, rd) + updErr := s.c.Update(ctx, rd) + if updErr != nil { + return updErr //nolint:wrapcheck // outer errors.Wrapf adds context; the bare error preserves the apierrors type isConflictOrNotFound matches on + } + + // Post-commit verification (BUG-048 defence-in-depth). The + // optimistic-lock Update normally guarantees no racing writer + // clobbered us — a stale resourceVersion 409s and we retry. But + // under heavy apiserver/etcd load a follower read can hand back a + // resourceVersion that is already superseded yet still accepted on + // Update, so two concurrent creates could both "succeed" while + // only one VolumeDefinition lands (the silent lost-update the + // operator hit). Re-read live and confirm our assigned number is + // actually present; if it vanished (a racer's clobber won), force + // a retry by surfacing a synthetic Conflict so the whole + // allocate+append re-runs against the now-correct state. This + // makes the silent drop impossible: the create either persists or + // retries — it never returns success having lost the volume. + return s.verifyVolumeLanded(ctx, rdName, assigned) }) if err != nil { return 0, errors.Wrapf(err, "auto-numbered create on RD %q", rdName) @@ -360,6 +379,31 @@ func (s *volumeDefinitions) fetchRDLive(ctx context.Context, rdName string) (*cr return &rd, nil } +// verifyVolumeLanded re-reads the RD live and returns a synthetic +// Conflict error (so the CreateAutoNumbered retry loop re-runs) when the +// just-written VolumeNumber is NOT present — the BUG-048 stale-read +// clobber. Returns nil when the volume is durably present. A NotFound +// (RD vanished) is surfaced as-is so the retry loop's isConflictOrNotFound +// also catches it. +func (s *volumeDefinitions) verifyVolumeLanded(ctx context.Context, rdName string, assigned int32) error { + rd, err := s.fetchRDLive(ctx, rdName) + if err != nil { + return err + } + + for i := range rd.Spec.VolumeDefinitions { + if rd.Spec.VolumeDefinitions[i].VolumeNumber == assigned { + return nil + } + } + + return apierrors.NewConflict( + schema.GroupResource{Group: crdv1alpha1.GroupVersion.Group, Resource: "resourcedefinitions"}, + Name(rdName), + errors.Newf("volume %d did not persist (concurrent clobber); retrying allocation", assigned), + ) +} + func crdToWireVD(vd *crdv1alpha1.ResourceDefinitionVolume) apiv1.VolumeDefinition { return apiv1.VolumeDefinition{ VolumeNumber: vd.VolumeNumber, From ed53d0090672d0300aad22b5a985d85b340f6fe4 Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Sun, 14 Jun 2026 12:59:14 +0300 Subject: [PATCH 09/19] test(harness): shorten BUG-048 replay name to fit DRBD resource-name limit The replay-runner prefixes 'replay-' and suffixes a random token; the 40-char workflow name pushed the generated RD name past DRBD's resource-name length cap, so 'resource-definition create' failed with exit 10 before the scenario even started. Rename to vd-late-concurrent-no-drop so the generated name fits. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- ...-lost-update.yaml => vd-late-concurrent-no-drop.yaml} | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) rename tests/operator-harness/replay/{vd-late-create-concurrent-no-lost-update.yaml => vd-late-concurrent-no-drop.yaml} (91%) diff --git a/tests/operator-harness/replay/vd-late-create-concurrent-no-lost-update.yaml b/tests/operator-harness/replay/vd-late-concurrent-no-drop.yaml similarity index 91% rename from tests/operator-harness/replay/vd-late-create-concurrent-no-lost-update.yaml rename to tests/operator-harness/replay/vd-late-concurrent-no-drop.yaml index ca055e90..06d61365 100644 --- a/tests/operator-harness/replay/vd-late-create-concurrent-no-lost-update.yaml +++ b/tests/operator-harness/replay/vd-late-concurrent-no-drop.yaml @@ -1,6 +1,11 @@ -name: vd-late-create-concurrent-no-lost-update +name: vd-late-concurrent-no-drop description: | - BUG-048 catcher (P1, availability). Two back-to-back number-less + BUG-048 catcher (P1, availability). Short name on purpose: the + replay-runner prefixes `replay-` and suffixes a random token, and the + resulting RD name must stay within DRBD's resource-name length limit — + a longer name makes `resource-definition create` fail outright. + + Two back-to-back number-less `linstor vd c ` calls against an RD that already carries vol-0 (and whose RD.Spec.Initialized has flipped true) must BOTH land at distinct VolumeNumbers — none silently dropped. From e766d4dc1f45cd7833355ae16008c0db9871f743 Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Sun, 14 Jun 2026 13:09:55 +0300 Subject: [PATCH 10/19] fix(drbd): gate late-add-promote against day0 split-brain (BUG-048) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stand validation caught a serious regression in the late-add-promote self-heal: NeedsLateAddPromote could not distinguish a normal day0 bootstrap (every volume transiently Inconsistent, no SyncSource yet — the normal fresh state) from the BUG-048 late-add wedge, so it MISFIRED on a fresh RD's vol-0. Worse, its lowest-node-id election counted only CONNECTED peers; at day0 t+1s peers are still StandAlone/Connecting, so each node saw a partial peer set, several concluded "I am lowest", and two non-lowest nodes force-primaried ~37ms apart → split-brain on the baseline volume (observed: worker-1 + worker-3 both promoted, worker-2 the true lowest did not). Add two split-brain-safety gates: 1. Require a local UpToDate sibling volume — proves the RD is PAST first activation. A pure day0 bootstrap has NO UpToDate volume, so the predicate can never fire there (the normal winner-election + auto-promote own day0). 2. Require EVERY peer connection to be fully Connected — the lowest-node-id election is only sound with COMPLETE peer information. Any StandAlone/Connecting peer defers the self-heal, so every node computes the election over the same full set and EXACTLY ONE (the true lowest id) ever promotes. Unit tests previously only modelled fully-Connected peers (so they passed while live partial-connectivity behaviour broke); add explicit pins for the day0-all-Inconsistent and partial-connectivity cases, and give the veto fixtures a UpToDate sibling so they exercise the intended veto rather than gate 1. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- pkg/drbd/drbdadm.go | 95 ++++++++++++++++---- pkg/drbd/needs_late_add_promote_test.go | 91 ++++++++++++++++--- pkg/satellite/maybe_late_add_promote_test.go | 10 ++- 3 files changed, 165 insertions(+), 31 deletions(-) diff --git a/pkg/drbd/drbdadm.go b/pkg/drbd/drbdadm.go index 3c735f54..6d21969c 100644 --- a/pkg/drbd/drbdadm.go +++ b/pkg/drbd/drbdadm.go @@ -1390,31 +1390,44 @@ func localIsUpToDate(devices []drbdsetupStatusDevice) bool { // short-circuits before even probing. // - NeedsSoloPromote requires ZERO peers; here peers exist. // -// Returns true ONLY when ALL hold, so EXACTLY ONE deterministic node -// promotes and the force-primary is data-safe + self-limiting: +// SPLIT-BRAIN SAFETY (the two gates that make this distinct from a day0 +// bootstrap — without them the predicate misfired on a fresh RD's vol-0 +// and two non-lowest nodes simultaneously force-primaried into +// split-brain): +// +// 1. A LATE-add wedge means the RD is PAST first activation, so at +// least one local volume is already UpToDate (the earlier +// volumes). A pure day0 bootstrap has NO UpToDate volume yet — +// EVERY volume is transiently Inconsistent while the normal winner +// election + auto-promote run. Requiring a local UpToDate sibling +// means this predicate can NEVER fire during day0. +// 2. EVERY peer connection must be fully Connected with the wedged +// volume's peer-disk OBSERVED. The lowest-node-id election is only +// sound with COMPLETE peer information; at day0 t+1s peers are still +// StandAlone / Connecting, so each node would see a partial peer set +// and several could each conclude "I am lowest" → simultaneous +// force-primary. Deferring until every peer is connected guarantees +// every node computes the election over the same full set, so +// EXACTLY ONE (the true lowest id) promotes. +// +// Beyond those, returns true ONLY when ALL hold, so the force-primary is +// data-safe + self-limiting: // - the kernel slot exists and reports a my-node-id; -// - the local role is NOT Primary and NO peer is Primary (never -// disturb an existing Primary that already drives the sync); +// - the local role is NOT Primary and NO peer is Primary; // - at least one LOCAL diskful volume is Inconsistent, and for EVERY // such volume NO connected peer exposes committed data -// (peer-disk UpToDate / Consistent / Outdated) — i.e. there is no -// real data source anywhere, the defining late-add wedge. If any -// peer held data the correct action is to SyncTarget from it, never -// force-primary (that is the Bug 342 unrelated-data guard); -// - none of those wedged volumes is already being actively resynced -// from a peer (SyncTarget/WFBitMapT in progress) — let a live resync -// finish on its own; -// - this node's my-node-id is the LOWEST among the diskful replicas -// (local + every peer that is NOT diskless on the wedged volume), so -// a single deterministic node promotes — no split-brain race. +// (peer-disk UpToDate / Consistent / Outdated) — the defining wedge; +// a peer with data means SyncTarget instead (Bug 342 guard); +// - none of those wedged volumes is already being actively resynced; +// - this node's my-node-id is the LOWEST among the wedged volume's +// diskful replicas. // // Data-safety: a fresh late-added volume's metadata was seeded at the // deterministic day0 current-UUID on every replica (the seed path runs // before bring-up), so `primary --force` here mints no UNRELATED UUID; // the Inconsistent peers simply SyncTarget from this now-Primary source. -// Self-limiting: once the peers reach UpToDate they are no longer -// Inconsistent and the predicate stops holding. Conservative on any -// probe/parse failure → false (a missed promote just retries next pass). +// Self-limiting: once the peers reach UpToDate the predicate stops +// holding. Conservative on any probe/parse failure → false. func (a *Adm) NeedsLateAddPromote(ctx context.Context, resource string) bool { out, err := a.exec.Run(ctx, "drbdsetup", "status", resource, "--json") if err != nil { @@ -1441,6 +1454,19 @@ func (a *Adm) NeedsLateAddPromote(ctx context.Context, resource string) bool { } } + // Gate 1 (split-brain safety): the RD must be PAST day0 — at least one + // local volume already UpToDate. A pure day0 bootstrap has none, so + // the predicate cannot misfire there. + if !localHasUpToDateVolume(res.Devices) { + return false + } + + // Gate 2 (split-brain safety): every peer must be fully connected so + // the lowest-node-id election runs over COMPLETE peer information. + if !allPeersConnected(res.Connections) { + return false + } + // Which local diskful volumes are wedged Inconsistent? wedged := localInconsistentVolumes(res.Devices) if len(wedged) == 0 { @@ -1460,6 +1486,41 @@ func (a *Adm) NeedsLateAddPromote(ctx context.Context, resource string) bool { return true } +// localHasUpToDateVolume reports whether at least one local device is +// UpToDate — the proof that the RD is past first activation (a day0 +// bootstrap has every volume still Inconsistent). Gate 1 of the +// split-brain-safe NeedsLateAddPromote. +func localHasUpToDateVolume(devices []drbdsetupStatusDevice) bool { + for _, d := range devices { + if DiskState(d.DiskState) == DiskStateUpToDate { + return true + } + } + + return false +} + +// allPeersConnected reports whether EVERY peer connection is in the +// Connected connection-state. Gate 2 of NeedsLateAddPromote: the +// lowest-node-id promoter election is only sound with complete peer +// information, so any StandAlone / Connecting / unconnected peer (the +// day0 t+1s state) defers the self-heal. A resource with zero peer +// connections is NOT covered here (that is NeedsSoloPromote's domain) — +// returns false so the late-add gate never acts peerless. +func allPeersConnected(conns []drbdsetupStatusConnection) bool { + if len(conns) == 0 { + return false + } + + for _, conn := range conns { + if conn.ConnectionStr != "Connected" { + return false + } + } + + return true +} + // localInconsistentVolumes returns the set of local volume numbers whose // disk-state is Inconsistent. A volume in any other state (UpToDate, // Diskless, Negotiating, …) is excluded — only a stuck-Inconsistent diff --git a/pkg/drbd/needs_late_add_promote_test.go b/pkg/drbd/needs_late_add_promote_test.go index ce34b2c5..590a0fce 100644 --- a/pkg/drbd/needs_late_add_promote_test.go +++ b/pkg/drbd/needs_late_add_promote_test.go @@ -74,15 +74,21 @@ func TestNeedsLateAddPromote_WedgedLateVolume(t *testing.T) { // Not the lowest node-id: a peer with a LOWER node-id is also wedged on // vol-2, so it is the elected promoter — we must defer (no split-brain). +// (vol-0 UpToDate everywhere → past day0, peer Connected → gates 1+2 OK.) func TestNeedsLateAddPromote_DefersToLowerNodeID(t *testing.T) { adm := admWithLateAddStatus(t, `[{ "name":"pvc-late","node-id":2,"role":"Secondary", - "devices":[{"volume":2,"disk-state":"Inconsistent"}], + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], "connections":[{ "peer-node-id":0,"name":"n1","connection-state":"Connected", "peer-role":"Secondary", - "peer_devices":[{"volume":2,"peer-disk-state":"Inconsistent", - "replication-state":"Established","resync-suspended":"no"}] + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"Established","resync-suspended":"no"} + ] }] }]`) @@ -97,12 +103,17 @@ func TestNeedsLateAddPromote_DefersToLowerNodeID(t *testing.T) { func TestNeedsLateAddPromote_PeerHasDataVeto(t *testing.T) { adm := admWithLateAddStatus(t, `[{ "name":"pvc-late","node-id":0,"role":"Secondary", - "devices":[{"volume":2,"disk-state":"Inconsistent"}], + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], "connections":[{ "peer-node-id":1,"name":"n2","connection-state":"Connected", "peer-role":"Secondary", - "peer_devices":[{"volume":2,"peer-disk-state":"UpToDate", - "replication-state":"SyncTarget","resync-suspended":"no"}] + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"SyncTarget","resync-suspended":"no"} + ] }] }]`) @@ -116,12 +127,17 @@ func TestNeedsLateAddPromote_PeerHasDataVeto(t *testing.T) { func TestNeedsLateAddPromote_ActiveResyncDefers(t *testing.T) { adm := admWithLateAddStatus(t, `[{ "name":"pvc-late","node-id":0,"role":"Secondary", - "devices":[{"volume":2,"disk-state":"Inconsistent"}], + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], "connections":[{ "peer-node-id":1,"name":"n2","connection-state":"Connected", "peer-role":"Secondary", - "peer_devices":[{"volume":2,"peer-disk-state":"Inconsistent", - "replication-state":"SyncSource","resync-suspended":"no"}] + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"SyncSource","resync-suspended":"no"} + ] }] }]`) @@ -134,12 +150,17 @@ func TestNeedsLateAddPromote_ActiveResyncDefers(t *testing.T) { func TestNeedsLateAddPromote_PeerPrimaryVeto(t *testing.T) { adm := admWithLateAddStatus(t, `[{ "name":"pvc-late","node-id":0,"role":"Secondary", - "devices":[{"volume":2,"disk-state":"Inconsistent"}], + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], "connections":[{ "peer-node-id":1,"name":"n2","connection-state":"Connected", "peer-role":"Primary", - "peer_devices":[{"volume":2,"peer-disk-state":"Inconsistent", - "replication-state":"Established","resync-suspended":"no"}] + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"Established","resync-suspended":"no"} + ] }] }]`) @@ -148,6 +169,52 @@ func TestNeedsLateAddPromote_PeerPrimaryVeto(t *testing.T) { } } +// SPLIT-BRAIN SAFETY gate 1: a pure day0 bootstrap — EVERY volume +// transiently Inconsistent, NO UpToDate sibling — must NOT fire, even +// though "all replicas Inconsistent, no SyncSource" superficially looks +// like the wedge. The normal day0 winner-election + auto-promote own +// this; misfiring here is exactly what split-brained vol-0 on the stand. +func TestNeedsLateAddPromote_Day0BootstrapNoFire(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":0,"role":"Secondary", + "devices":[{"volume":0,"disk-state":"Inconsistent"}], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[{"volume":0,"peer-disk-state":"Inconsistent", + "replication-state":"Established","resync-suspended":"no"}] + }] + }]`) + + if adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatal("day0 bootstrap (no UpToDate sibling) must NOT trigger late-add-promote — normal winner election owns it") + } +} + +// SPLIT-BRAIN SAFETY gate 2: a peer that is NOT fully Connected (the +// day0 t+1s StandAlone/Connecting state) means the lowest-node-id +// election would run over a partial peer set. Defer until every peer is +// connected, so EXACTLY ONE node promotes — never two simultaneously. +func TestNeedsLateAddPromote_PartialConnectivityNoFire(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":1,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[ + {"peer-node-id":0,"name":"n1","connection-state":"StandAlone","peer-role":"Unknown", + "peer_devices":[{"volume":2,"peer-disk-state":"DUnknown","replication-state":"Off","resync-suspended":"no"}]}, + {"peer-node-id":2,"name":"n3","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[{"volume":2,"peer-disk-state":"Inconsistent","replication-state":"Established","resync-suspended":"no"}]} + ] + }]`) + + if adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatal("a not-fully-connected peer set must defer the promote (incomplete election → split-brain risk)") + } +} + // No Inconsistent local volume — nothing to promote. Must not fire (this // is the healthy steady state every converged reconcile passes through). func TestNeedsLateAddPromote_AllUpToDateNoOp(t *testing.T) { diff --git a/pkg/satellite/maybe_late_add_promote_test.go b/pkg/satellite/maybe_late_add_promote_test.go index 68e5a9ce..e22a3839 100644 --- a/pkg/satellite/maybe_late_add_promote_test.go +++ b/pkg/satellite/maybe_late_add_promote_test.go @@ -115,10 +115,16 @@ func TestMaybeLateAddPromote_SkipsWhenPeerHasData(t *testing.T) { fx := storage.NewFakeExec() fx.Expect(lateAddStatusKey, storage.FakeResponse{Stdout: []byte(`[{ "name":"pvc-la","node-id":0,"role":"Secondary", - "devices":[{"volume":2,"disk-state":"Inconsistent"}], + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], "connections":[{ "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", - "peer_devices":[{"volume":2,"peer-disk-state":"UpToDate","replication-state":"SyncTarget","resync-suspended":"no"}] + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"SyncTarget","resync-suspended":"no"} + ] }] }]`)}) From 39f148b25afb7bc2b090717172eda20a53526696 Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Sun, 14 Jun 2026 13:28:31 +0300 Subject: [PATCH 11/19] test(harness): give BUG-048 replay 360s for the self-heal recovery path When the concurrent seed races and a late volume comes up Inconsistent on every replica, the late-add-promote self-heal recovers it, but the detect + recoveryPromoteThrottle + force-primary + resync chain runs past the bare 240s convergence budget on a contended stand. 360s keeps the recovery path from flaking while staying well inside the wedge discriminator. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- .../replay/vd-late-concurrent-no-drop.yaml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/operator-harness/replay/vd-late-concurrent-no-drop.yaml b/tests/operator-harness/replay/vd-late-concurrent-no-drop.yaml index 06d61365..4ffce21a 100644 --- a/tests/operator-harness/replay/vd-late-concurrent-no-drop.yaml +++ b/tests/operator-harness/replay/vd-late-concurrent-no-drop.yaml @@ -85,10 +85,17 @@ steps: expect_exit: 0 await: # Every volume of every replica converges — the elected per-fresh- - # volume winner seeds a SyncSource for both late adds. + # volume winner seeds a SyncSource for both late adds. When the + # concurrent seed races and a volume comes up Inconsistent on every + # replica, the late-add-promote self-heal recovers it, but that adds + # latency (wedge-detect + the recoveryPromoteThrottle window + the + # force-primary + resync). On a contended stand that recovery path + # runs past the bare-convergence budget, so allow 360s — still far + # inside the wedge discriminator (a real unfixed wedge never + # advances at all). kind: all_uptodate rd: "{{rd}}" - timeout_s: 240 + timeout_s: 360 teardown: - cmd: ["resource-definition", "delete", "{{rd}}"] From 035915dc00ae65bf0dfec493dfedd3a362e31a1a Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Sun, 14 Jun 2026 16:13:53 +0300 Subject: [PATCH 12/19] fix(controller): optimistic-lock the RD volume-minor patch (BUG-048) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The silent VD-drop core. ensureRDVolumeMinors reads the RD, allocates DRBD minors for any volume still missing one, then writes them back via patchRDVolumeMinors — a JSON merge-patch on spec.volumeDefinitions. RFC 7386 merge-patch REPLACES an array wholesale, so the write stamps the ENTIRE volumeDefinitions list from the snapshot the reconciler read at the top of the function. A concurrent number-less `linstor vd c` that appended a new volume between that read and the patch is silently overwritten away — and both `vd c` return success because the clobber is an async reconciler write, not their own create. This is the operator-visible "vdCount short by one, both vd c rc0, zero conflict/retry in the logs" failure. The code claimed "optimistic concurrency" in a comment but never set metadata.resourceVersion, so the patch could never 409 and the caller's IsConflict re-read branch was dead. Embed the snapshot's metadata.resourceVersion in the merge-patch body (the same precondition client.MergeFromWithOptimisticLock builds for typed patches): a racing VD append moves the RD past the snapshot, the apiserver rejects the wholesale replace with Conflict, the caller re-reads, and the next reconcile re-allocates minors against the now-complete list — the appended volume survives. A missing resourceVersion (RD freshly built in a unit path) degrades to the prior blind patch rather than failing. L1: a stale-snapshot patch racing an appended volume must 409 (and not drop it); a fresh-snapshot patch must still commit. The clobber test fails pre-fix (the blind patch returns nil and drops vol-1). Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- .../bug_048_vd_drop_minor_patch_test.go | 213 ++++++++++++++++++ internal/controller/resource_controller.go | 33 ++- 2 files changed, 244 insertions(+), 2 deletions(-) create mode 100644 internal/controller/bug_048_vd_drop_minor_patch_test.go diff --git a/internal/controller/bug_048_vd_drop_minor_patch_test.go b/internal/controller/bug_048_vd_drop_minor_patch_test.go new file mode 100644 index 00000000..6bf19a23 --- /dev/null +++ b/internal/controller/bug_048_vd_drop_minor_patch_test.go @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: Apache-2.0 + +/* +Copyright 2026 Cozystack contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + "testing" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + blockstoriov1alpha1 "github.com/cozystack/blockstor/api/v1alpha1" +) + +// BUG-048 (P1, the silent VD drop). The ResourceReconciler's +// patchRDVolumeMinors writes the RD's per-volume DRBD minors with a JSON +// merge-patch. JSON merge-patch (RFC 7386) REPLACES the +// spec.volumeDefinitions array wholesale, so the write stamps the WHOLE +// list from whatever snapshot the reconciler read at the top of +// ensureRDVolumeMinors. If a concurrent number-less `linstor vd c` +// appended a new volume AFTER that read but BEFORE this patch, the +// wholesale replace silently drops the appended volume — and both `vd c` +// return success because the clobber is an async reconciler write, not +// their own create. +// +// The original code carried a comment claiming "optimistic concurrency" +// but never set metadata.resourceVersion, so the patch could NEVER 409 +// and the appended volume vanished with no error anywhere — the +// operator-visible "vdCount short by one, both vd c rc0, zero +// conflict/retry in the logs" failure observed on the stand. +// +// The fix embeds metadata.resourceVersion in the merge-patch body so the +// apiserver rejects the stale wholesale replace with Conflict. These +// tests pin both halves of the contract: +// +// - a stale-snapshot patch racing an appended volume MUST be rejected +// with Conflict (and therefore NOT drop the appended volume), and +// - a non-racing patch (fresh snapshot) MUST still succeed and stamp +// the minors. + +func bug048Scheme(t *testing.T) *runtime.Scheme { + t.Helper() + + scheme := runtime.NewScheme() + if err := blockstoriov1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("AddToScheme: %v", err) + } + + return scheme +} + +func bug048RDVolNumbers(rd *blockstoriov1alpha1.ResourceDefinition) []int32 { + out := make([]int32, 0, len(rd.Spec.VolumeDefinitions)) + for i := range rd.Spec.VolumeDefinitions { + out = append(out, rd.Spec.VolumeDefinitions[i].VolumeNumber) + } + + return out +} + +// TestBug048MinorPatchRejectsStaleVolumeDefinitionClobber is the core +// regression. We: +// 1. seed an RD with [vol-0], read it into the reconciler's snapshot, +// 2. simulate a concurrent `vd c` appending vol-1 to the LIVE RD +// (bumping its resourceVersion), then +// 3. call patchRDVolumeMinors with the STALE [vol-0] snapshot. +// +// Pre-fix (blind merge-patch, no resourceVersion) the patch silently +// succeeds and the live RD is clobbered back to [vol-0] — vol-1 is +// dropped. Post-fix the patch carries the snapshot's resourceVersion, +// the apiserver returns Conflict, and vol-1 survives. +func TestBug048MinorPatchRejectsStaleVolumeDefinitionClobber(t *testing.T) { + t.Parallel() + + ctx := context.Background() + scheme := bug048Scheme(t) + + const rdName = "drop-rd" + + rd := &blockstoriov1alpha1.ResourceDefinition{ + ObjectMeta: metav1.ObjectMeta{Name: rdName}, + Spec: blockstoriov1alpha1.ResourceDefinitionSpec{ + VolumeDefinitions: []blockstoriov1alpha1.ResourceDefinitionVolume{ + {VolumeNumber: 0, SizeKib: 1024}, + }, + }, + } + + cli := fake.NewClientBuilder().WithScheme(scheme). + WithStatusSubresource(&blockstoriov1alpha1.ResourceDefinition{}). + WithObjects(rd). + Build() + + rec := &ResourceReconciler{Client: cli, Scheme: scheme, APIReader: cli} + + // (1) reconciler snapshot of the RD (carries the stale resourceVersion). + snap := &blockstoriov1alpha1.ResourceDefinition{} + if err := cli.Get(ctx, client.ObjectKey{Name: rdName}, snap); err != nil { + t.Fatalf("get RD snapshot: %v", err) + } + // Reconciler "allocates" a minor for vol-0 on its snapshot. + m0 := int32(1000) + snap.Spec.VolumeDefinitions[0].DRBDMinor = &m0 + + // (2) concurrent `vd c` appends vol-1 to the LIVE RD, bumping its RV. + live := &blockstoriov1alpha1.ResourceDefinition{} + if err := cli.Get(ctx, client.ObjectKey{Name: rdName}, live); err != nil { + t.Fatalf("get live RD: %v", err) + } + + live.Spec.VolumeDefinitions = append(live.Spec.VolumeDefinitions, + blockstoriov1alpha1.ResourceDefinitionVolume{VolumeNumber: 1, SizeKib: 1024}) + if err := cli.Update(ctx, live); err != nil { + t.Fatalf("append vol-1 (simulated vd c): %v", err) + } + + // (3) reconciler patches minors from its STALE [vol-0] snapshot. + err := rec.patchRDVolumeMinors(ctx, snap) + if !apierrors.IsConflict(err) { + t.Fatalf("patchRDVolumeMinors with stale snapshot returned err=%v, want Conflict; "+ + "a non-Conflict means the wholesale volumeDefinitions replace was NOT optimistic-locked "+ + "and would silently drop the concurrently-appended vol-1 (BUG-048)", err) + } + + // The appended volume MUST survive the rejected clobber. + got := &blockstoriov1alpha1.ResourceDefinition{} + if gerr := cli.Get(ctx, client.ObjectKey{Name: rdName}, got); gerr != nil { + t.Fatalf("get RD after rejected patch: %v", gerr) + } + + nums := bug048RDVolNumbers(got) + if len(nums) != 2 || nums[0] != 0 || nums[1] != 1 { + t.Fatalf("RD volumeDefinitions after rejected clobber = %v, want [0 1] "+ + "(vol-1 was silently dropped — the BUG-048 lost update)", nums) + } +} + +// TestBug048MinorPatchSucceedsOnFreshSnapshot proves the optimistic-lock +// does not break the happy path: a patch built from a CURRENT snapshot +// (no racing writer) must still commit and stamp the minors. +func TestBug048MinorPatchSucceedsOnFreshSnapshot(t *testing.T) { + t.Parallel() + + ctx := context.Background() + scheme := bug048Scheme(t) + + const rdName = "fresh-rd" + + rd := &blockstoriov1alpha1.ResourceDefinition{ + ObjectMeta: metav1.ObjectMeta{Name: rdName}, + Spec: blockstoriov1alpha1.ResourceDefinitionSpec{ + VolumeDefinitions: []blockstoriov1alpha1.ResourceDefinitionVolume{ + {VolumeNumber: 0, SizeKib: 1024}, + {VolumeNumber: 1, SizeKib: 1024}, + }, + }, + } + + cli := fake.NewClientBuilder().WithScheme(scheme). + WithStatusSubresource(&blockstoriov1alpha1.ResourceDefinition{}). + WithObjects(rd). + Build() + + rec := &ResourceReconciler{Client: cli, Scheme: scheme, APIReader: cli} + + snap := &blockstoriov1alpha1.ResourceDefinition{} + if err := cli.Get(ctx, client.ObjectKey{Name: rdName}, snap); err != nil { + t.Fatalf("get RD snapshot: %v", err) + } + + m0, m1 := int32(1000), int32(1001) + snap.Spec.VolumeDefinitions[0].DRBDMinor = &m0 + snap.Spec.VolumeDefinitions[1].DRBDMinor = &m1 + + if err := rec.patchRDVolumeMinors(ctx, snap); err != nil { + t.Fatalf("patchRDVolumeMinors on fresh snapshot: %v", err) + } + + got := &blockstoriov1alpha1.ResourceDefinition{} + if err := cli.Get(ctx, client.ObjectKey{Name: rdName}, got); err != nil { + t.Fatalf("get RD after patch: %v", err) + } + + if nums := bug048RDVolNumbers(got); len(nums) != 2 || nums[0] != 0 || nums[1] != 1 { + t.Fatalf("RD volumeDefinitions after fresh patch = %v, want [0 1]", nums) + } + + for i := range got.Spec.VolumeDefinitions { + if got.Spec.VolumeDefinitions[i].DRBDMinor == nil { + t.Fatalf("vol-%d minor not stamped after fresh patch", + got.Spec.VolumeDefinitions[i].VolumeNumber) + } + } +} diff --git a/internal/controller/resource_controller.go b/internal/controller/resource_controller.go index d68912f2..c3ce3cb9 100644 --- a/internal/controller/resource_controller.go +++ b/internal/controller/resource_controller.go @@ -1624,6 +1624,24 @@ func backfillRDMinorsFromStatus(rd *blockstoriov1alpha1.ResourceDefinition) bool // satellite/REST owners of the other VolumeDefinition fields read them // off Spec elsewhere, and we re-send the size/props verbatim to avoid // dropping them. +// +// BUG-048 (P1, the silent VD drop): because a JSON merge-patch REPLACES +// the volumeDefinitions array (RFC 7386 has no array-merge), this write +// stamps the WHOLE list from the in-memory snapshot the reconciler read +// at the top of ensureRDVolumeMinors. If a concurrent number-less +// `linstor vd c` appended a new volume AFTER that read but before this +// patch, the wholesale replace silently drops it — and both `vd c` +// return success because the clobber is an async reconciler write, not +// their own. The original code claimed "optimistic concurrency" in a +// comment but never actually set metadata.resourceVersion, so the patch +// could NEVER 409 and the IsConflict re-read branch in the caller was +// dead. We now embed metadata.resourceVersion in the merge-patch body +// (exactly what client.MergeFromWithOptimisticLock does for typed +// patches): the apiserver rejects the patch with Conflict when a racing +// VD append moved the RD on, the caller re-reads, and the next reconcile +// re-allocates minors against the now-complete list — the appended +// volume survives. A missing resourceVersion (RD freshly built in a unit +// path) degrades to the prior blind patch rather than failing the write. func (r *ResourceReconciler) patchRDVolumeMinors(ctx context.Context, rd *blockstoriov1alpha1.ResourceDefinition) error { vols := make([]map[string]any, 0, len(rd.Spec.VolumeDefinitions)) for i := range rd.Spec.VolumeDefinitions { @@ -1645,15 +1663,26 @@ func (r *ResourceReconciler) patchRDVolumeMinors(ctx context.Context, rd *blocks vols = append(vols, entry) } + meta := map[string]any{} + if rv := rd.ResourceVersion; rv != "" { + // Optimistic-lock precondition: the apiserver enforces + // metadata.resourceVersion on any patch body, so a racing VD + // append (which bumped the RD past this snapshot) makes the + // wholesale volumeDefinitions replace 409 instead of clobbering + // the appended volume. + meta["resourceVersion"] = rv + } + body := map[string]any{patchKeySpec: map[string]any{"volumeDefinitions": vols}} + if len(meta) > 0 { + body["metadata"] = meta + } patchBytes, err := json.Marshal(body) if err != nil { return err } - // Optimistic concurrency: include resourceVersion so a racing - // minor write is rejected with Conflict and we re-read. patchTarget := &blockstoriov1alpha1.ResourceDefinition{ TypeMeta: metav1.TypeMeta{ Kind: resourceDefinitionKindName, From 97fb699c43ca3df42a0451faf75cca0507ed799e Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Sun, 14 Jun 2026 16:33:21 +0300 Subject: [PATCH 13/19] fix(drbd): defer late-add-promote to a lower-id peer mid-bring-up (BUG-048) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ≥3-replica convergence wedge. The late-add-promote lowest-node-id election (lateAddVolumeNeedsLocalPromote) counted a lower-id peer as a competing promoter ONLY when it observed that peer's volume as Inconsistent. A freshly late-added volume on a lower-id diskful peer passes through Attaching / Negotiating / DUnknown while its kernel slot brings the new volume up — it has NOT yet settled to Inconsistent. The old code treated those transient states as "not a competing diskful promoter; ignore", so a higher-id node observing a lower-id peer mid-bring-up concluded "I am lowest" and force-primaried. Once the true-lowest peer finished bring-up it ALSO promoted: two nodes minted divergent current-UUIDs on the same late volume → it wedged PausedSyncS / StandAlone split-brain with no convergence. Stand-observed on 3-diskful: node-id 1 and node-id 0 both logged the late-add force-primary for the same late volume; the volume stuck Inconsistent on every replica past the budget. Treat a lower-id peer in Attaching / Negotiating / DUnknown (the transient diskful bring-up / connection-not-fully-negotiated states) as the rightful promoter and defer — only the true-lowest diskful node ever promotes, so exactly one SyncSource is seeded. A steady-state Diskless witness is still ignored (it never becomes a diskful promoter; deferring to it would deadlock), so the lowest DISKFUL replica still seeds. This change is downstream of the two day0 split-brain gates (localHasUpToDateVolume + allPeersConnected) added in e766d4dc1: at day0 no volume is UpToDate so gate 1 blocks the election entirely and this code is never reached — the split-brain leg is untouched. The change is strictly MORE conservative (defers more, never promotes more), so it can only reduce split-brain risk. L1: defer to a lower-id peer in each transient bring-up state (fails pre-fix); still promote over a lower-id Diskless witness; the true-lowest still promotes despite a higher-id peer mid-bring-up. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- pkg/drbd/drbdadm.go | 32 +++++++- pkg/drbd/needs_late_add_promote_test.go | 100 ++++++++++++++++++++++++ 2 files changed, 128 insertions(+), 4 deletions(-) diff --git a/pkg/drbd/drbdadm.go b/pkg/drbd/drbdadm.go index 6d21969c..92f7d9c8 100644 --- a/pkg/drbd/drbdadm.go +++ b/pkg/drbd/drbdadm.go @@ -1569,10 +1569,34 @@ func lateAddVolumeNeedsLocalPromote(conns []drbdsetupStatusConnection, vol, myID // it finish rather than churn a promote. return false } - case DiskStateDiskless, DiskStateAttaching, DiskStateDetaching, - DiskStateFailed, DiskStateNegotiating, DiskStateDUnknown: - // Diskless witness / transient — not a data source and - // not a competing diskful promoter; ignore. + case DiskStateAttaching, DiskStateNegotiating, DiskStateDUnknown: + // BUG-048 (≥3-replica double-promoter wedge): a freshly + // late-added volume on a LOWER-id diskful peer passes + // through Attaching/Negotiating/DUnknown while its kernel + // slot brings the new volume up — the peer has NOT yet + // settled to Inconsistent. The old code treated these + // transient states as "not a competing diskful promoter; + // ignore", so a HIGHER-id node observing a lower-id peer + // mid-bring-up wrongly concluded "I am lowest" and force- + // primaried. Both the higher-id node AND the true-lowest + // node (once it finished bring-up) then promoted, minting + // divergent current-UUIDs → the volume wedged PausedSyncS / + // StandAlone split-brain with no convergence (stand-observed + // on 3-diskful: node-id 1 and node-id 0 both force-primaried + // the same late volume). A lower-id peer in a transient + // bring-up state is a real diskful replica that WILL win the + // election, so defer to it. DUnknown is also the + // connection-not-fully-negotiated state of a diskful peer — + // deferring is the conservative, split-brain-safe choice + // (the next reconcile re-evaluates once the peer settles). + if conn.PeerNodeID < myID { + weAreLowest = false + } + case DiskStateDiskless, DiskStateDetaching, DiskStateFailed: + // Diskless witness (steady-state of a tiebreaker — never a + // diskful promoter, deferring to it would deadlock the + // promote) or a failed/detaching local-disk peer that holds + // no data. Not a competing diskful promoter; ignore. default: // Unknown/empty — ignore. } diff --git a/pkg/drbd/needs_late_add_promote_test.go b/pkg/drbd/needs_late_add_promote_test.go index 590a0fce..550fc421 100644 --- a/pkg/drbd/needs_late_add_promote_test.go +++ b/pkg/drbd/needs_late_add_promote_test.go @@ -238,3 +238,103 @@ func TestNeedsLateAddPromote_AllUpToDateNoOp(t *testing.T) { t.Fatal("a fully-converged RD must NOT trigger any promote") } } + +// BUG-048 (≥3-replica double-promoter wedge). A LOWER-node-id diskful peer +// whose freshly late-added volume is still in a TRANSIENT bring-up state +// (DUnknown — connection negotiated, peer disk not yet reported) is the +// rightful promoter. The old election only counted a lower-id peer as a +// competitor when it observed it as Inconsistent, so this node (id 2) saw +// the lower-id peer mid-bring-up, concluded "I am lowest", and force- +// primaried — and once the lower-id peer settled it ALSO promoted → +// divergent current-UUIDs → PausedSyncS / StandAlone wedge (stand-observed +// on 3-diskful: node-id 1 and node-id 0 both force-primaried the same late +// volume). This node MUST defer to the lower-id peer regardless of its +// transient per-volume state. +func TestNeedsLateAddPromote_DefersToLowerNodeIDStillBringingUp(t *testing.T) { + for _, tc := range []struct { + name string + peerState string + }{ + {"DUnknown", "DUnknown"}, + {"Negotiating", "Negotiating"}, + {"Attaching", "Attaching"}, + } { + t.Run(tc.name, func(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":2,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":0,"name":"n1","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"`+tc.peerState+`","replication-state":"Established","resync-suspended":"no"} + ] + }] + }]`) + + if adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatalf("a lower-node-id diskful peer still bringing up the volume (%s) is the elected promoter; "+ + "this node (id 2) must defer or both promote and wedge the volume (BUG-048)", tc.peerState) + } + }) + } +} + +// BUG-048 companion: deferring to a transient lower-id peer must NOT +// deadlock when the lower-id peer is a PERMANENT diskless witness +// (tiebreaker). A diskless peer never becomes a diskful promoter, so a +// lower-id Diskless peer must be IGNORED (not deferred to) — this node +// (id 2), the lowest DISKFUL replica, still promotes. (The witness shows +// steady-state "Diskless"; a diskful peer mid-bring-up shows DUnknown / +// Negotiating / Attaching, handled by the test above.) +func TestNeedsLateAddPromote_PromotesOverLowerIDDisklessWitness(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":2,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":0,"name":"tiebreaker","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"Diskless","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"Diskless","replication-state":"Established","resync-suspended":"no"} + ] + }] + }]`) + + if !adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatal("a lower-id DISKLESS witness never promotes; this node (the lowest DISKFUL replica) " + + "must still force-primary to seed the SyncSource (deferring would deadlock the volume forever)") + } +} + +// BUG-048 companion: the true-lowest diskful node still promotes even when +// a HIGHER-id peer is mid-bring-up (DUnknown). A higher-id transient peer +// is not a competitor — this node (id 0) is lowest and must seed. +func TestNeedsLateAddPromote_LowestPromotesDespiteHigherIDBringingUp(t *testing.T) { + adm := admWithLateAddStatus(t, `[{ + "name":"pvc-late","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"DUnknown","replication-state":"Established","resync-suspended":"no"} + ] + }] + }]`) + + if !adm.NeedsLateAddPromote(t.Context(), "pvc-late") { + t.Fatal("the lowest diskful node must promote even when a higher-id peer is still bringing up the volume") + } +} From c4271d1f11632eca2ae422ef2f7cb312bd34252a Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Sun, 14 Jun 2026 17:53:31 +0300 Subject: [PATCH 14/19] fix(satellite,drbd): kick stalled late-add resync to converge all replicas (BUG-048) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The residual ≥3-replica concurrent late-vd-add convergence wedge. When two volumes are added back-to-back to an already-initialized 3-diskful RD, the last volume occasionally enters a DRBD resync that STALLS and never feeds every peer — a SyncSource is elected (syncSeen=1) but the volume sits Inconsistent/Outdated on two of three replicas with out-of-sync frozen at the full size. Two stand-observed signatures (both on the last concurrently-added volume of a 3-diskful RD): - dependency deadlock: a peer-device is PausedSyncS with resync-suspended:dependency (DRBD serialised this volume's resync behind a sibling that already finished, but the dependency latch never cleared) while partner peers wait in WFBitMapT/ resync-suspended:peer for a bitmap exchange the paused source never starts. oos frozen at full forever. - partial stall: one peer reaches UpToDate (a real SyncSource) but a second peer stays WFBitMapT/Outdated and never finalises. Neither existing self-heal recovers it: maybeLateAddPromote mints a source only when NONE exists (all-Inconsistent, no peer data) and a force-primary does not clear a resync-suspended:dependency latch; maybeRecoveryPromote requires the local already-UpToDate. Add a dedicated kernel-truth self-heal. Adm.NeedsLateAddResyncKick fires ONLY when no replica is Primary, the RD is past day0 (a local UpToDate volume exists), every peer is Connected, and a peer-device is in a stalled resync (PausedSync*/WFBitMap* with a non-"no" resync-suspended). maybeKickLateAddResync then disconnect+adjusts the resource, re-running the GI handshake so DRBD restarts the resync from the correct direction. Data-safe: disconnect/reconnect never mutates data and DRBD re-derives the sync direction from generation-UUIDs/bitmaps. A 45s stall-dwell gate ensures only a GENUINELY frozen resync is kicked, never the normal transient volume-resync serialization (a few seconds of PausedSyncS/ dependency while a sibling finishes). Throttled by the shared recoveryPromoteThrottle and self-limiting (once converged no stalled peer-device remains). Skipped on diskless replicas. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- pkg/drbd/drbdadm.go | 167 +++++++++++ pkg/drbd/needs_late_add_resync_kick_test.go | 268 ++++++++++++++++++ .../maybe_kick_late_add_resync_test.go | 222 +++++++++++++++ pkg/satellite/reconciler.go | 140 +++++++++ 4 files changed, 797 insertions(+) create mode 100644 pkg/drbd/needs_late_add_resync_kick_test.go create mode 100644 pkg/satellite/maybe_kick_late_add_resync_test.go diff --git a/pkg/drbd/drbdadm.go b/pkg/drbd/drbdadm.go index 92f7d9c8..11a8f15f 100644 --- a/pkg/drbd/drbdadm.go +++ b/pkg/drbd/drbdadm.go @@ -92,6 +92,35 @@ func (a *Adm) Adjust(ctx context.Context, resource string) error { return a.run(ctx, "adjust", resource) } +// LateAddResyncKick unsticks a STALLED late-add resync (BUG-048): it +// `disconnect`s every peer of the resource then `adjust`s to reconnect, +// re-running the DRBD GI handshake from scratch. The handshake re-derives +// the correct sync direction from generation-UUIDs / bitmaps and restarts +// the resync, clearing a stale paused-SyncSource (resync-suspended +// "dependency") or a never-completing WFBitMapT (resync-suspended "peer") +// that no longer advances on its own. Gated by the kernel-truth +// NeedsLateAddResyncKick predicate, which fires ONLY on the genuine wedge +// (no Primary anywhere, RD past day0, every peer Connected, a stalled +// resync present). +// +// Data-safe: disconnect/reconnect never mutates on-disk data, and DRBD's +// post-reconnect handshake picks the sync direction from metadata — it +// can never copy the wrong way. The `disconnect` is best-effort (a peer +// already StandAlone returns non-zero, which we ignore); the `adjust` +// reconnects every peer the .res still describes (drbdadm adjust re-adds +// connection objects, the inverse of the down a disconnect leaves). A +// failed adjust surfaces so a genuinely stuck reconnect is not silently +// swallowed. +func (a *Adm) LateAddResyncKick(ctx context.Context, resource string) error { + // Best-effort disconnect of all peers — quiesces the stale + // connection objects so the adjust re-runs a clean handshake. + _, _ = a.exec.Run(ctx, "drbdadm", "disconnect", resource) + + // Reconnect every configured peer; restarts the resync from the + // re-run GI handshake. + return a.run(ctx, "adjust", resource) +} + // AdjustSkipDisk is the Failed-replica variant of Adjust that // appends drbd-utils' `--skip-disk` flag. Used after the observer // detected `disk:Failed` and stamped `DrbdOptions/SkipDisk=True` @@ -1606,6 +1635,144 @@ func lateAddVolumeNeedsLocalPromote(conns []drbdsetupStatusConnection, vol, myID return weAreLowest } +// NeedsLateAddResyncKick probes the live kernel via `drbdsetup status +// --json` and reports whether a LATE-ADDED volume's resync is +// STALLED — present but suspended/waiting, making no progress — so the +// satellite should kick it (disconnect + adjust to re-run the handshake +// and restart the resync) (BUG-048, the ≥3-replica concurrent late-add +// convergence wedge). +// +// Distinct from NeedsLateAddPromote: that predicate fires when NO +// SyncSource exists at all (every replica Inconsistent, no peer holds +// data) and force-primaries the lowest-id node to MINT a source. This +// predicate fires when a resync EXISTS but is wedged in a paused / +// bitmap-exchange state that never advances — the two stand-observed +// signatures: +// +// - "dependency deadlock": a peer-device is PausedSyncS with +// resync-suspended "dependency" (DRBD serialised this volume's +// resync behind another that has already finished, but the +// dependency latch never cleared), and its partner peers sit in +// WFBitMapT/resync-suspended "peer" waiting for a bitmap exchange +// that the paused source never starts. out-of-sync stays at the +// full volume size forever. +// - "partial stall": one peer reached UpToDate (a genuine SyncSource +// exists) but a SECOND peer is stuck in WFBitMapT (peerdisk +// Outdated/Inconsistent) and never finalises — the source fed one +// peer but not the other. The waiting peer's bitmap exchange never +// completes. +// +// In BOTH a force-primary alone does not help: the source/bitmap +// machinery is latched in a paused/waiting state that only a connection +// re-handshake clears. Disconnecting and re-adjusting tears down the +// stale connection objects and re-runs the GI handshake, from which DRBD +// re-derives the correct sync direction and restarts the resync cleanly. +// +// Returns true ONLY when ALL hold, so the kick is data-safe and +// self-limiting: +// - the kernel slot exists and reports a my-node-id; +// - NO replica anywhere (local nor any peer-role) is Primary (never +// disconnect a resource an application is actively writing — the +// resume on a Primary is handled by the normal resync path; a kick +// here would needlessly flap a live device); +// - at least one local volume is already UpToDate — the RD is PAST +// first activation (a pure day0 bootstrap is excluded, same as +// NeedsLateAddPromote gate 1), so this can never fire mid-bring-up; +// - every peer connection is fully Connected (the resync states are +// only authoritative over a settled connection — a Connecting / +// StandAlone peer is a transient that the next reconcile re-reads); +// - at least one peer-device of any volume is a STALLED resync: a +// paused/waiting replication-state (PausedSyncS/PausedSyncT/ +// WFBitMapS/WFBitMapT) whose resync-suspended is NOT "no" (i.e. +// "dependency"/"peer"/"user") — the precise wedge signature, never a +// healthy progressing SyncSource (resync-suspended "no"). +// +// Conservative on any probe/parse failure → false. Self-limiting: once +// the kick lets the resync finish and the volume reaches UpToDate, no +// stalled peer-device remains and the predicate stops holding. +func (a *Adm) NeedsLateAddResyncKick(ctx context.Context, resource string) bool { + out, err := a.exec.Run(ctx, "drbdsetup", "status", resource, "--json") + if err != nil { + return false + } + + var status drbdsetupStatusRoot + + err = json.Unmarshal(out, &status) + if err != nil || len(status) == 0 || status[0].NodeID == nil { + return false + } + + res := status[0] + + // Never kick a resource an application is actively writing. + if Role(res.Role).IsPrimary() { + return false + } + + for _, conn := range res.Connections { + if Role(conn.PeerRole).IsPrimary() { + return false + } + } + + // Gate: the RD must be PAST day0 (at least one local UpToDate volume), + // so this can never misfire during a fresh RD's first-activation + // bring-up where every volume is transiently Inconsistent. + if !localHasUpToDateVolume(res.Devices) { + return false + } + + // Gate: every peer fully Connected — the resync-state read is only + // authoritative over a settled connection. + if !allPeersConnected(res.Connections) { + return false + } + + // Fire only when a genuine STALLED resync peer-device is present. + for _, conn := range res.Connections { + if slices.ContainsFunc(conn.PeerDevices, peerDeviceResyncStalled) { + return true + } + } + + return false +} + +// peerDeviceResyncStalled reports whether a peer-device is wedged in a +// resync that is present but suspended/waiting and making no progress — +// the BUG-048 late-add convergence-wedge signature. True when the +// replication-state is a paused or bitmap-exchange-wait state +// (PausedSyncS/PausedSyncT/WFBitMapS/WFBitMapT) AND resync-suspended is +// a non-"no" reason ("dependency"/"peer"/"user", possibly comma-joined). +// +// A healthy progressing resync (SyncSource/SyncTarget, or a WF* step +// with resync-suspended "no") is NOT stalled — it advances on its own +// and must be left alone. An empty resync-suspended token is treated as +// "no" (drbd-utils omits the field when nothing is suspended), so a +// transient WF* with no suspension is not mistaken for a wedge. +func peerDeviceResyncStalled(peerDev drbdsetupStatusPeerDevice) bool { + switch ReplicationState(peerDev.ReplicationState) { + case ReplicationStatePausedSyncS, ReplicationStatePausedSyncT, + ReplicationStateWFBitMapS, ReplicationStateWFBitMapT: + // A paused or bitmap-exchange-wait state — candidate. + case ReplicationStateOff, ReplicationStateEstablished, + ReplicationStateStartingSyncS, ReplicationStateStartingSyncT, + ReplicationStateWFSyncUUID, ReplicationStateSyncSource, + ReplicationStateSyncTarget, ReplicationStateVerifyS, + ReplicationStateVerifyT, ReplicationStateAhead, + ReplicationStateBehind: + // Progressing or steady-state — not a stalled wedge. + return false + default: + return false + } + + rss := peerDev.ResyncSuspended + + return rss != "" && rss != wireNo +} + // DownVeto is the tri-state outcome of the Bug 350 kernel-truth probe // the satellite consults before committing a `drbdadm down` on the // INACTIVE path. It separates "definitely safe to down" from "must diff --git a/pkg/drbd/needs_late_add_resync_kick_test.go b/pkg/drbd/needs_late_add_resync_kick_test.go new file mode 100644 index 00000000..3b184eb6 --- /dev/null +++ b/pkg/drbd/needs_late_add_resync_kick_test.go @@ -0,0 +1,268 @@ +// SPDX-License-Identifier: Apache-2.0 + +/* +Copyright 2026 Cozystack contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package drbd_test + +import ( + "errors" + "testing" + + "github.com/cozystack/blockstor/pkg/drbd" + "github.com/cozystack/blockstor/pkg/storage" +) + +var errKickProbe = errors.New("drbdsetup status: exit 1") + +// Regression pins for the BUG-048 late-add resync-KICK predicate +// (NeedsLateAddResyncKick). It exists to unstick a late-added volume +// whose resync EXISTS but wedged in a paused / bitmap-exchange state +// that never advances — the ≥3-replica concurrent-add convergence wedge. +// It must fire ONLY on that stalled-resync signature and never on a +// healthy progressing sync, a day0 bootstrap, a Primary-held resource, +// or a not-fully-connected peer set. + +const lateAddKickKey = "drbdsetup status pvc-kick --json" + +func admWithKickStatus(t *testing.T, json string) *drbd.Adm { + t.Helper() + + fx := storage.NewFakeExec() + fx.Responses[lateAddKickKey] = storage.FakeResponse{Stdout: []byte(json)} + + return drbd.NewAdm(fx) +} + +// Stand-observed signature 1 — "dependency deadlock" (B-3r RUN2/RUN8): +// vol-0/vol-1 UpToDate, the late vol-2 Inconsistent locally, a peer is +// PausedSyncS/resync-suspended:dependency (an elected source whose +// resync is paused on a stale dependency) and the partner waits in +// WFBitMapT/resync-suspended:peer. oos frozen at full. No Primary. +func TestNeedsLateAddResyncKick_DependencyDeadlock(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":1,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[ + {"peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":1,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"PausedSyncS","resync-suspended":"dependency"} + ]}, + {"peer-node-id":2,"name":"n3","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":1,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"WFBitMapT","resync-suspended":"peer"} + ]} + ] + }]`) + + if !adm.NeedsLateAddResyncKick(t.Context(), "pvc-kick") { + t.Fatal("expected kick for the dependency-deadlock wedge") + } +} + +// Stand-observed signature 2 — "partial stall" (B-3r RUN9): one peer +// reached UpToDate (a real SyncSource exists) but a SECOND peer is stuck +// WFBitMapT / peerdisk Outdated and never finalises. The waiting peer's +// resync-suspended is "peer". No Primary. +func TestNeedsLateAddResyncKick_PartialStall(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":2,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":1,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Outdated"} + ], + "connections":[ + {"peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":1,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"WFBitMapT","resync-suspended":"peer"} + ]}, + {"peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":1,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"} + ]} + ] + }]`) + + if !adm.NeedsLateAddResyncKick(t.Context(), "pvc-kick") { + t.Fatal("expected kick for the partial-stall wedge") + } +} + +// A HEALTHY progressing initial sync (SyncSource / SyncTarget with +// resync-suspended "no") must NOT be kicked — it advances on its own, +// and a kick would needlessly abort it. +func TestNeedsLateAddResyncKick_HealthySyncNotKicked(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":1,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":1,"peer-disk-state":"Inconsistent","replication-state":"SyncSource","resync-suspended":"no"} + ] + }] + }]`) + + if adm.NeedsLateAddResyncKick(t.Context(), "pvc-kick") { + t.Fatal("must NOT kick a healthy progressing SyncSource") + } +} + +// A transient bitmap-exchange step with resync-suspended "no" (a sync +// about to start, not wedged) must NOT be kicked. +func TestNeedsLateAddResyncKick_WFBitMapNotSuspendedNotKicked(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":1,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":1,"peer-disk-state":"Inconsistent","replication-state":"WFBitMapT","resync-suspended":"no"} + ] + }] + }]`) + + if adm.NeedsLateAddResyncKick(t.Context(), "pvc-kick") { + t.Fatal("must NOT kick a WFBitMapT that is not suspended (sync starting)") + } +} + +// Day0 bootstrap: NO local UpToDate volume yet (every volume transiently +// Inconsistent while the fresh-RD winner election runs). A paused sync +// here must NOT be kicked — the gate requires a local UpToDate sibling so +// this can never misfire during first activation. +func TestNeedsLateAddResyncKick_Day0NotKicked(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"Inconsistent","replication-state":"PausedSyncS","resync-suspended":"dependency"} + ] + }] + }]`) + + if adm.NeedsLateAddResyncKick(t.Context(), "pvc-kick") { + t.Fatal("must NOT kick during day0 (no local UpToDate volume)") + } +} + +// A Primary anywhere (local or peer) vetoes the kick — never disconnect a +// resource an application is actively writing. +func TestNeedsLateAddResyncKick_PrimaryVetoes(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":0,"role":"Primary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":1,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":1,"peer-disk-state":"Inconsistent","replication-state":"PausedSyncS","resync-suspended":"dependency"} + ] + }] + }]`) + + if adm.NeedsLateAddResyncKick(t.Context(), "pvc-kick") { + t.Fatal("must NOT kick while a Primary holds the resource") + } +} + +// A not-fully-connected peer (Connecting / StandAlone) defers the kick: +// the resync-state read is only authoritative over a settled connection. +func TestNeedsLateAddResyncKick_PeerNotConnectedDefers(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":1,"disk-state":"Inconsistent"} + ], + "connections":[ + {"peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":1,"peer-disk-state":"Inconsistent","replication-state":"PausedSyncS","resync-suspended":"dependency"} + ]}, + {"peer-node-id":2,"name":"n3","connection-state":"Connecting","peer-role":"Unknown", + "peer_devices":[]} + ] + }]`) + + if adm.NeedsLateAddResyncKick(t.Context(), "pvc-kick") { + t.Fatal("must NOT kick when a peer is not fully Connected") + } +} + +// Fully converged (every peer Established, resync-suspended "no") — no +// stalled resync remains, so the predicate stops holding (self-limiting). +func TestNeedsLateAddResyncKick_ConvergedNotKicked(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":1,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"UpToDate"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":1,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"} + ] + }] + }]`) + + if adm.NeedsLateAddResyncKick(t.Context(), "pvc-kick") { + t.Fatal("must NOT kick a fully converged resource") + } +} + +// Probe failure (drbdsetup errors) → conservative false, no kick. +func TestNeedsLateAddResyncKick_ProbeFailureFalse(t *testing.T) { + fx := storage.NewFakeExec() + fx.Responses[lateAddKickKey] = storage.FakeResponse{Err: errKickProbe} + + adm := drbd.NewAdm(fx) + if adm.NeedsLateAddResyncKick(t.Context(), "pvc-kick") { + t.Fatal("must return false on a probe failure") + } +} diff --git a/pkg/satellite/maybe_kick_late_add_resync_test.go b/pkg/satellite/maybe_kick_late_add_resync_test.go new file mode 100644 index 00000000..221dba90 --- /dev/null +++ b/pkg/satellite/maybe_kick_late_add_resync_test.go @@ -0,0 +1,222 @@ +// SPDX-License-Identifier: Apache-2.0 + +/* +Copyright 2026 Cozystack contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package satellite + +import ( + "context" + "slices" + "testing" + "time" + + "github.com/cozystack/blockstor/pkg/drbd" + "github.com/cozystack/blockstor/pkg/satellite/intent" + "github.com/cozystack/blockstor/pkg/storage" +) + +// Regression pins for the BUG-048 late-add resync-KICK self-heal wiring. +// maybeKickLateAddResync must disconnect+adjust a late-added volume whose +// resync wedged in a paused / bitmap-exchange state that never advances +// (the ≥3-replica concurrent-add convergence wedge), and must NOT act on +// a diskless replica, a healthy progressing sync, or a converged set. + +const kickStatusKey = "drbdsetup status pvc-ka --json" + +// vol-0/vol-1 UpToDate, the late vol-2 Inconsistent locally, a peer is +// PausedSyncS/resync-suspended:dependency and the partner WFBitMapT/peer +// — the stand-observed dependency-deadlock wedge. No Primary. +const kickStatusWedged = `[{ + "name":"pvc-ka","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":1,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[ + {"peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"PausedSyncS","resync-suspended":"dependency"} + ]}, + {"peer-node-id":2,"name":"n3","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"WFBitMapT","resync-suspended":"peer"} + ]} + ] +}]` + +func kickDR() *intent.DesiredResource { + return &intent.DesiredResource{ + Name: "pvc-ka", + NodeName: "n1", + Volumes: []*intent.DesiredVolume{ + {VolumeNumber: 0, SizeKib: 1024 * 1024, StoragePool: "thin1"}, + {VolumeNumber: 1, SizeKib: 1024 * 1024, StoragePool: "thin1"}, + {VolumeNumber: 2, SizeKib: 1024 * 1024, StoragePool: "thin1"}, + }, + Peers: []intent.DesiredPeer{{Name: "n2"}, {Name: "n3"}}, + DrbdOptions: map[string]string{ + "port": "7000", "node-id": "0", "address": "10.0.0.1", "minor": "1000", + }, + } +} + +// The wedge fires the disconnect+adjust kick — but ONLY after the stall +// has persisted beyond the dwell window. The FIRST observation starts the +// dwell (no kick); once the recorded stall is older than lateAddStallDwell +// the kick fires. +func TestMaybeKickLateAddResync_FiresForStalledResync(t *testing.T) { + fx := storage.NewFakeExec() + fx.Expect(kickStatusKey, storage.FakeResponse{Stdout: []byte(kickStatusWedged)}) + + rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n1"}) + + // First pass starts the dwell window — must NOT kick yet. + if err := rec.maybeKickLateAddResync(context.Background(), kickDR(), false); err != nil { + t.Fatalf("maybeKickLateAddResync pass 1: %v", err) + } + if slices.Contains(fx.CommandLines(), "drbdadm disconnect pvc-ka") { + t.Fatalf("must NOT kick on the first observation (dwell not elapsed), got: %v", fx.CommandLines()) + } + + // Backdate the recorded stall so the dwell has elapsed. + rec.mu.Lock() + rec.firstLateAddStallAt["pvc-ka"] = time.Now().Add(-2 * lateAddStallDwell) + rec.mu.Unlock() + + if err := rec.maybeKickLateAddResync(context.Background(), kickDR(), false); err != nil { + t.Fatalf("maybeKickLateAddResync pass 2: %v", err) + } + + cmds := fx.CommandLines() + if !slices.Contains(cmds, "drbdadm disconnect pvc-ka") { + t.Errorf("expected `drbdadm disconnect pvc-ka` after dwell, got: %v", cmds) + } + if !slices.Contains(cmds, "drbdadm adjust pvc-ka") { + t.Errorf("expected `drbdadm adjust pvc-ka` (reconnect after disconnect), got: %v", cmds) + } +} + +// A stall that CLEARS (predicate goes false) resets the dwell window — a +// subsequent fresh stall must start its dwell from scratch rather than +// inherit the earlier timestamp and kick immediately. +func TestMaybeKickLateAddResync_DwellResetsWhenStallClears(t *testing.T) { + fx := storage.NewFakeExec() + fx.Expect(kickStatusKey, storage.FakeResponse{Stdout: []byte(kickStatusWedged)}) + + rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n1"}) + + // Start the dwell window. + if err := rec.maybeKickLateAddResync(context.Background(), kickDR(), false); err != nil { + t.Fatalf("maybeKickLateAddResync pass 1: %v", err) + } + + // Stall clears (converged) — predicate goes false, dwell is dropped. + fx.Expect(kickStatusKey, storage.FakeResponse{Stdout: []byte(`[{ + "name":"pvc-ka","node-id":0,"role":"Secondary", + "devices":[{"volume":0,"disk-state":"UpToDate"}], + "connections":[{"peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}]}] + }]`)}) + if err := rec.maybeKickLateAddResync(context.Background(), kickDR(), false); err != nil { + t.Fatalf("maybeKickLateAddResync pass 2: %v", err) + } + + rec.mu.Lock() + _, stillTracked := rec.firstLateAddStallAt["pvc-ka"] + rec.mu.Unlock() + if stillTracked { + t.Errorf("dwell must be cleared once the stall clears") + } +} + +// A diskless replica has no local resync to kick — skip outright. +func TestMaybeKickLateAddResync_SkipsDiskless(t *testing.T) { + fx := storage.NewFakeExec() + fx.Expect(kickStatusKey, storage.FakeResponse{Stdout: []byte(kickStatusWedged)}) + + rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n1"}) + + if err := rec.maybeKickLateAddResync(context.Background(), kickDR(), true); err != nil { + t.Fatalf("maybeKickLateAddResync: %v", err) + } + + if slices.Contains(fx.CommandLines(), "drbdadm disconnect pvc-ka") { + t.Errorf("diskless replica must NOT be kicked, got: %v", fx.CommandLines()) + } +} + +// A healthy progressing SyncSource (resync-suspended "no") must NOT be +// kicked — the kick would abort a sync that finishes on its own. +func TestMaybeKickLateAddResync_SkipsHealthySync(t *testing.T) { + fx := storage.NewFakeExec() + fx.Expect(kickStatusKey, storage.FakeResponse{Stdout: []byte(`[{ + "name":"pvc-ka","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"SyncSource","resync-suspended":"no"} + ] + }] + }]`)}) + + rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n1"}) + + if err := rec.maybeKickLateAddResync(context.Background(), kickDR(), false); err != nil { + t.Fatalf("maybeKickLateAddResync: %v", err) + } + + if slices.Contains(fx.CommandLines(), "drbdadm disconnect pvc-ka") { + t.Errorf("must NOT kick a healthy progressing SyncSource, got: %v", fx.CommandLines()) + } +} + +// The kick is throttled by the shared recoveryPromoteThrottle so a +// still-converging resync isn't churned: a second back-to-back call +// inside the window must NOT re-disconnect. +func TestMaybeKickLateAddResync_Throttled(t *testing.T) { + fx := storage.NewFakeExec() + fx.Expect(kickStatusKey, storage.FakeResponse{Stdout: []byte(kickStatusWedged)}) + + rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n1"}) + + // Pre-satisfy the dwell so both passes are past the dwell gate and the + // throttle is the only thing limiting the kick. + rec.mu.Lock() + rec.firstLateAddStallAt["pvc-ka"] = time.Now().Add(-2 * lateAddStallDwell) + rec.mu.Unlock() + + for i := range 2 { + if err := rec.maybeKickLateAddResync(context.Background(), kickDR(), false); err != nil { + t.Fatalf("maybeKickLateAddResync pass %d: %v", i, err) + } + } + + disconnects := 0 + for _, c := range fx.CommandLines() { + if c == "drbdadm disconnect pvc-ka" { + disconnects++ + } + } + if disconnects != 1 { + t.Errorf("expected exactly 1 disconnect within the throttle window, got %d", disconnects) + } +} diff --git a/pkg/satellite/reconciler.go b/pkg/satellite/reconciler.go index 3999f7e2..4f535dbe 100644 --- a/pkg/satellite/reconciler.go +++ b/pkg/satellite/reconciler.go @@ -283,6 +283,18 @@ type Reconciler struct { // resets it (at worst one extra promote). lastRecoveryPromoteAt map[string]time.Time + // firstLateAddStallAt records, per resource name, the first wall-clock + // time this node observed a STALLED late-add resync (BUG-048 + // NeedsLateAddResyncKick) without yet kicking it. The kick + // (disconnect+adjust) only fires once the stall has PERSISTED beyond + // lateAddStallDwell, so the normal, transient DRBD volume-resync + // serialization (vol-N legitimately PausedSyncS/dependency for a few + // seconds while vol-(N-1) finishes) is never mistaken for the wedge + // and disrupted. Cleared when the stall clears (predicate stops + // holding). Process-memory only; a restart resets it (at worst one + // extra dwell window before the kick). + firstLateAddStallAt map[string]time.Time + // restoreSnapshotMissingTries counts, per "/", // how many consecutive reconcile passes a snapshot-restore-backed // volume found its source `@snap` ABSENT locally (ErrNotFound from @@ -310,6 +322,20 @@ type Reconciler struct { // fresh nudge promptly. const recoveryPromoteThrottle = 10 * time.Second +// lateAddStallDwell is how long a late-add resync must REMAIN stalled +// (NeedsLateAddResyncKick continuously true) before maybeKickLateAddResync +// fires the disconnect+adjust kick. It must comfortably exceed the normal, +// transient DRBD volume-resync serialization gap — when two volumes are +// added at once, vol-N legitimately sits PausedSyncS/resync-suspended: +// dependency for a few seconds while vol-(N-1) finishes its resync, which +// looks identical to the wedge for a moment. The dwell ensures the kick +// only disrupts a resync that is GENUINELY stuck (oos frozen for the whole +// window), never a healthy serialized sync that is about to advance on its +// own. The genuine wedge sits stalled indefinitely, so it always clears +// the dwell; a transient serialization clears the predicate (and resets +// the dwell) well before it elapses. +const lateAddStallDwell = 45 * time.Second + // restoreSnapshotMissingBudget bounds how many consecutive reconcile // passes a node-local snapshot-restore volume may REQUEUE on a // not-yet-materialised local `@snap` before falling through to the @@ -338,6 +364,7 @@ func NewReconciler(cfg ReconcilerConfig) *Reconciler { seenStuckAt: map[string]time.Time{}, restoreBlankFallback: map[string]bool{}, lastRecoveryPromoteAt: map[string]time.Time{}, + firstLateAddStallAt: map[string]time.Time{}, restoreSnapshotMissingTries: map[string]int{}, } } @@ -2520,6 +2547,21 @@ func (r *Reconciler) maybePromoteSelfHeals(ctx context.Context, dr *intent.Desir return err } + // BUG-048 late-add resync-kick self-heal — unstick a late-added + // volume whose resync EXISTS but wedged in a paused/bitmap-exchange + // state that never advances (the ≥3-replica concurrent-add + // convergence wedge: a SyncSource is elected but its resync sits + // PausedSyncS/resync-suspended:dependency while partner peers wait in + // WFBitMapT/resync-suspended:peer, or one peer reaches UpToDate while + // a second stalls WFBitMapT/Outdated forever). Distinct from + // maybeLateAddPromote: that mints a source when NONE exists; this + // re-handshakes an EXISTING-but-stalled resync via disconnect+adjust. + // See maybeKickLateAddResync / Adm.NeedsLateAddResyncKick. + err = r.maybeKickLateAddResync(ctx, dr, diskless) + if err != nil { + return err + } + // Solo diskless→diskful toggle self-heal — force-promote a lone, // peerless diskful replica wedged below UpToDate. See maybeSoloPromote // for the full why (the auto-primary suppression × offline-safety @@ -2724,6 +2766,104 @@ func (r *Reconciler) maybeLateAddPromote(ctx context.Context, dr *intent.Desired return r.runAutoPromote(ctx, dr) } +// maybeKickLateAddResync is the BUG-048 self-heal for a late-added volume +// whose resync EXISTS but wedged in a paused / bitmap-exchange state that +// never advances — the ≥3-replica concurrent late-add convergence wedge. +// +// Two stand-observed signatures it unsticks (both on the last +// concurrently-added volume of a 3-diskful RD, oos frozen at full / a +// peer stuck below UpToDate forever): +// +// - dependency deadlock: a SyncSource is elected but sits PausedSyncS +// with resync-suspended "dependency" toward its peers, which in turn +// wait in WFBitMapT/resync-suspended "peer" for a bitmap exchange the +// paused source never starts. Nobody ever reaches UpToDate. +// - partial stall: one peer reaches UpToDate (a real SyncSource), but a +// SECOND peer stays WFBitMapT / Outdated and never finalises — the +// source fed one peer but not the other. +// +// Why neither promote self-heal covers it: maybeLateAddPromote fires only +// when NO peer holds data and the volume is Inconsistent on EVERY replica +// (it mints a source); here a source exists (or a peer is already +// UpToDate) but the resync machinery is latched. maybeRecoveryPromote +// needs the local already-UpToDate. A force-primary does not clear a +// resync-suspended:dependency latch — only a connection re-handshake does. +// +// NeedsLateAddResyncKick is the kernel-truth gate: it fires ONLY when no +// replica is Primary, the RD is past day0 (a local UpToDate volume +// exists), every peer is Connected, and a peer-device is in a stalled +// resync (PausedSync*/WFBitMap* with a non-"no" resync-suspended). The +// kick (Adm.LateAddResyncKick) `disconnect`s then `adjust`s, re-running +// the GI handshake so DRBD restarts the resync from the correct +// direction. Data-safe (disconnect/reconnect never mutates data) and +// self-limiting (once the resync finishes no stalled peer-device remains). +// Throttled via the shared recoveryPromoteThrottle so a still-converging +// resync isn't churned by repeated kicks. Skipped on diskless replicas +// (no local resync to kick) and when Adm is unwired (storage-only tests). +func (r *Reconciler) maybeKickLateAddResync(ctx context.Context, dr *intent.DesiredResource, diskless bool) error { + if diskless || r.cfg.Adm == nil { + return nil + } + + if !r.cfg.Adm.NeedsLateAddResyncKick(ctx, dr.GetName()) { + // Not stalled (or converged): clear any pending dwell so a future + // transient stall starts its window fresh. + r.clearLateAddStall(dr.GetName()) + + return nil + } + + // Dwell gate: only kick once the stall has PERSISTED beyond + // lateAddStallDwell, so the normal transient volume-resync + // serialization (a few seconds of PausedSyncS/dependency while a + // sibling volume finishes) is never disrupted — only a genuinely + // frozen resync is. + if !r.lateAddStallDwellElapsed(dr.GetName()) { + return nil + } + + if !r.recoveryPromoteDue(dr.GetName()) { + return nil + } + + log.FromContext(ctx).Info("BUG-048 late-add resync-kick: disconnect+adjust to restart a stalled resync (paused/bitmap-exchange-wedged) for a late-added volume", + "resource", dr.GetName()) + + return errors.Wrapf(r.cfg.Adm.LateAddResyncKick(ctx, dr.GetName()), + "late-add resync-kick %s", dr.GetName()) +} + +// lateAddStallDwellElapsed records the first time a resource was observed +// with a stalled late-add resync and reports whether the stall has now +// persisted at least lateAddStallDwell. The first observation stamps the +// timestamp and returns false (start the window); a later observation +// returns true once the window has elapsed. Serialised by r.mu. +func (r *Reconciler) lateAddStallDwellElapsed(name string) bool { + r.mu.Lock() + defer r.mu.Unlock() + + now := time.Now() + + first, ok := r.firstLateAddStallAt[name] + if !ok { + r.firstLateAddStallAt[name] = now + + return false + } + + return now.Sub(first) >= lateAddStallDwell +} + +// clearLateAddStall drops the recorded stall start for a resource, so a +// transient stall that cleared restarts its dwell window from scratch the +// next time one is observed. Serialised by r.mu. +func (r *Reconciler) clearLateAddStall(name string) { + r.mu.Lock() + defer r.mu.Unlock() + + delete(r.firstLateAddStallAt, name) +} + // recoveryPromoteDue reports whether enough time has elapsed since this // node last fired a recovery-promote for `name` to fire another, and // records the fire time when it returns true. Serialised by r.mu so From 3974401cf8d06dc42063c4ef021b92e4c8fbbdea Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Sun, 14 Jun 2026 18:03:08 +0300 Subject: [PATCH 15/19] test(harness): document the late-add resync-kick self-heal in the BUG-048 replay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vd-late-concurrent-no-drop replay's all_uptodate budget comment now references both BUG-048 convergence self-heals — the late-add-promote (mints a source when none exists) and the new late-add resync-kick (restarts a stalled/paused resync that never feeds all peers) — so the 360s budget rationale stays accurate after the resync-kick fix. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- .../replay/vd-late-concurrent-no-drop.yaml | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/operator-harness/replay/vd-late-concurrent-no-drop.yaml b/tests/operator-harness/replay/vd-late-concurrent-no-drop.yaml index 4ffce21a..0f9fed84 100644 --- a/tests/operator-harness/replay/vd-late-concurrent-no-drop.yaml +++ b/tests/operator-harness/replay/vd-late-concurrent-no-drop.yaml @@ -85,14 +85,18 @@ steps: expect_exit: 0 await: # Every volume of every replica converges — the elected per-fresh- - # volume winner seeds a SyncSource for both late adds. When the - # concurrent seed races and a volume comes up Inconsistent on every - # replica, the late-add-promote self-heal recovers it, but that adds - # latency (wedge-detect + the recoveryPromoteThrottle window + the - # force-primary + resync). On a contended stand that recovery path - # runs past the bare-convergence budget, so allow 360s — still far - # inside the wedge discriminator (a real unfixed wedge never - # advances at all). + # volume winner seeds a SyncSource for both late adds. Two BUG-048 + # self-heals recover a concurrent-seed race: (a) when a volume comes + # up Inconsistent on EVERY replica with no source, the + # late-add-promote self-heal force-primaries the lowest-id node; (b) + # when a resync IS elected but STALLS (paused/bitmap-exchange-wedged + # on a >=3-replica add — out-of-sync frozen, two replicas never + # converge), the late-add resync-KICK self-heal disconnect+adjusts to + # restart it. Both add latency (wedge-detect + the 45s stall-dwell / + # recoveryPromoteThrottle window + the kick/promote + resync). On a + # contended stand those recovery paths run past the bare-convergence + # budget, so allow 360s — still far inside the wedge discriminator (a + # real unfixed wedge never advances at all). kind: all_uptodate rd: "{{rd}}" timeout_s: 360 From 74728ddcb954253d270d91cac656fa19b99e3f52 Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Sun, 14 Jun 2026 18:21:49 +0300 Subject: [PATCH 16/19] fix(satellite,drbd): recover stalled late-add resync via per-volume invalidate, not disconnect (BUG-048) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first resync-kick iteration used `disconnect`+`adjust` to re-run the DRBD handshake. On-stand that made the wedge WORSE: every replica of a fresh late-added volume shares the deterministic day0 current-UUID, so the post-reconnect GI handshake concludes "same generation, no resync needed" and ABANDONS the dirty bitmap — the connection went StandAlone / Established with out-of-sync frozen at the full size, never converging. Replace the connection re-handshake with a per-volume local `drbdadm invalidate /`: it explicitly discards the empty, non-authoritative LOCAL copy of the one stalled volume and forces a full SyncTarget FROM an UpToDate peer — working WITH the bitmap instead of re-deriving sync direction from the ambiguous day0 GI. Adm.LateAddResyncKickVolumes returns the local volumes that are below UpToDate, have a connected UpToDate peer to pull from, and are not already being actively SyncTarget-pulled — only when no replica is Primary, the RD is past day0, and every peer is Connected. maybeKickLateAddResync invalidates each. This covers the partial-stall signature (a peer reached UpToDate but this replica is stuck WFBitMapT/Outdated) and the post-promote dependency case (maybeLateAddPromote mints the source for the all-Inconsistent case, then invalidate re-pulls the stragglers). Data-safe: only a non-authoritative below-UpToDate local copy is ever discarded, and only when an UpToDate peer exists as the source. The 45s stall-dwell + recoveryPromoteThrottle still gate it so the normal transient volume-resync serialization is never disrupted. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- pkg/drbd/drbdadm.go | 245 +++++++++--------- pkg/drbd/needs_late_add_resync_kick_test.go | 210 ++++++++------- .../maybe_kick_late_add_resync_test.go | 139 +++++----- pkg/satellite/reconciler.go | 74 +++--- 4 files changed, 345 insertions(+), 323 deletions(-) diff --git a/pkg/drbd/drbdadm.go b/pkg/drbd/drbdadm.go index 11a8f15f..966eab81 100644 --- a/pkg/drbd/drbdadm.go +++ b/pkg/drbd/drbdadm.go @@ -92,33 +92,23 @@ func (a *Adm) Adjust(ctx context.Context, resource string) error { return a.run(ctx, "adjust", resource) } -// LateAddResyncKick unsticks a STALLED late-add resync (BUG-048): it -// `disconnect`s every peer of the resource then `adjust`s to reconnect, -// re-running the DRBD GI handshake from scratch. The handshake re-derives -// the correct sync direction from generation-UUIDs / bitmaps and restarts -// the resync, clearing a stale paused-SyncSource (resync-suspended -// "dependency") or a never-completing WFBitMapT (resync-suspended "peer") -// that no longer advances on its own. Gated by the kernel-truth -// NeedsLateAddResyncKick predicate, which fires ONLY on the genuine wedge -// (no Primary anywhere, RD past day0, every peer Connected, a stalled -// resync present). -// -// Data-safe: disconnect/reconnect never mutates on-disk data, and DRBD's -// post-reconnect handshake picks the sync direction from metadata — it -// can never copy the wrong way. The `disconnect` is best-effort (a peer -// already StandAlone returns non-zero, which we ignore); the `adjust` -// reconnects every peer the .res still describes (drbdadm adjust re-adds -// connection objects, the inverse of the down a disconnect leaves). A -// failed adjust surfaces so a genuinely stuck reconnect is not silently -// swallowed. -func (a *Adm) LateAddResyncKick(ctx context.Context, resource string) error { - // Best-effort disconnect of all peers — quiesces the stale - // connection objects so the adjust re-runs a clean handshake. - _, _ = a.exec.Run(ctx, "drbdadm", "disconnect", resource) - - // Reconnect every configured peer; restarts the resync from the - // re-run GI handshake. - return a.run(ctx, "adjust", resource) +// InvalidateVolume runs `drbdadm invalidate /` — marks the +// LOCAL copy of ONE volume as out-of-sync and forces a full SyncTarget of +// that volume from an UpToDate peer. The per-volume target (`/`) +// confines the discard to the named volume so sibling volumes' data is +// untouched. Requires an UpToDate peer as the resync source (drbdadm +// refuses otherwise); callers gate on LateAddResyncKickVolumes, which only +// returns a volume when a connected peer is UpToDate for it. +// +// Used by the BUG-048 late-add resync-kick self-heal to recover a stalled +// late-add resync: the fresh, empty, non-authoritative local copy is +// thrown away and re-pulled from the UpToDate peer, working WITH the +// bitmap (unlike a connection re-handshake, which the shared day0 GI makes +// conclude "no resync needed"). +func (a *Adm) InvalidateVolume(ctx context.Context, resource string, volume int32) error { + target := fmt.Sprintf("%s/%d", resource, volume) + + return a.run(ctx, "invalidate", target) } // AdjustSkipDisk is the Failed-replica variant of Adjust that @@ -1635,84 +1625,82 @@ func lateAddVolumeNeedsLocalPromote(conns []drbdsetupStatusConnection, vol, myID return weAreLowest } -// NeedsLateAddResyncKick probes the live kernel via `drbdsetup status -// --json` and reports whether a LATE-ADDED volume's resync is -// STALLED — present but suspended/waiting, making no progress — so the -// satellite should kick it (disconnect + adjust to re-run the handshake -// and restart the resync) (BUG-048, the ≥3-replica concurrent late-add -// convergence wedge). -// -// Distinct from NeedsLateAddPromote: that predicate fires when NO -// SyncSource exists at all (every replica Inconsistent, no peer holds -// data) and force-primaries the lowest-id node to MINT a source. This -// predicate fires when a resync EXISTS but is wedged in a paused / -// bitmap-exchange state that never advances — the two stand-observed -// signatures: -// -// - "dependency deadlock": a peer-device is PausedSyncS with -// resync-suspended "dependency" (DRBD serialised this volume's -// resync behind another that has already finished, but the -// dependency latch never cleared), and its partner peers sit in -// WFBitMapT/resync-suspended "peer" waiting for a bitmap exchange -// that the paused source never starts. out-of-sync stays at the -// full volume size forever. -// - "partial stall": one peer reached UpToDate (a genuine SyncSource -// exists) but a SECOND peer is stuck in WFBitMapT (peerdisk -// Outdated/Inconsistent) and never finalises — the source fed one -// peer but not the other. The waiting peer's bitmap exchange never -// completes. -// -// In BOTH a force-primary alone does not help: the source/bitmap -// machinery is latched in a paused/waiting state that only a connection -// re-handshake clears. Disconnecting and re-adjusting tears down the -// stale connection objects and re-runs the GI handshake, from which DRBD -// re-derives the correct sync direction and restarts the resync cleanly. -// -// Returns true ONLY when ALL hold, so the kick is data-safe and -// self-limiting: +// LateAddResyncKickVolumes probes the live kernel via `drbdsetup status +// --json` and returns the LOCAL volume numbers whose late-add +// resync is STALLED and recoverable by a local `drbdadm invalidate +// /` — the ≥3-replica concurrent late-add convergence wedge +// (BUG-048). Empty slice ⇒ nothing to do. +// +// Why `invalidate` and not disconnect/reconnect: every replica of a +// freshly late-added volume shares the deterministic day0 current-UUID, +// so a connection re-handshake (disconnect+adjust/connect) reads "same +// generation ⇒ no resync needed" and ABANDONS the dirty bitmap — leaving +// the volume Established/Inconsistent with out-of-sync frozen, a WORSE +// wedge. `drbdadm invalidate /` instead explicitly discards the +// (empty, non-authoritative) LOCAL copy of that one volume and forces a +// full SyncTarget FROM an UpToDate peer — it works WITH the bitmap rather +// than re-deriving sync direction from the ambiguous day0 GI. +// +// The two stand-observed wedge signatures both reduce to "a local volume +// is below UpToDate, an UpToDate peer for it EXISTS, yet no live resync is +// pulling it up": +// +// - "partial stall": one peer reached UpToDate (a real source) but THIS +// replica is stuck WFBitMapT / PausedSyncT / Established with +// out-of-sync frozen — the source fed another peer but not us. +// - "dependency deadlock" (post-promote): once maybeLateAddPromote +// force-primaries the lowest-id node to UpToDate, the remaining +// Inconsistent replicas have an UpToDate peer but their resync stays +// paused (resync-suspended:dependency/peer) — invalidate restarts it. +// +// A volume qualifies ONLY when ALL hold, so the invalidate is data-safe +// and self-limiting: // - the kernel slot exists and reports a my-node-id; -// - NO replica anywhere (local nor any peer-role) is Primary (never -// disconnect a resource an application is actively writing — the -// resume on a Primary is handled by the normal resync path; a kick -// here would needlessly flap a live device); +// - NO replica anywhere (local nor any peer-role) is Primary — never +// invalidate a volume an application may be writing (would discard +// live data); // - at least one local volume is already UpToDate — the RD is PAST // first activation (a pure day0 bootstrap is excluded, same as // NeedsLateAddPromote gate 1), so this can never fire mid-bring-up; -// - every peer connection is fully Connected (the resync states are -// only authoritative over a settled connection — a Connecting / -// StandAlone peer is a transient that the next reconcile re-reads); -// - at least one peer-device of any volume is a STALLED resync: a -// paused/waiting replication-state (PausedSyncS/PausedSyncT/ -// WFBitMapS/WFBitMapT) whose resync-suspended is NOT "no" (i.e. -// "dependency"/"peer"/"user") — the precise wedge signature, never a -// healthy progressing SyncSource (resync-suspended "no"). -// -// Conservative on any probe/parse failure → false. Self-limiting: once -// the kick lets the resync finish and the volume reaches UpToDate, no -// stalled peer-device remains and the predicate stops holding. -func (a *Adm) NeedsLateAddResyncKick(ctx context.Context, resource string) bool { +// - every peer connection is fully Connected (resync state is only +// authoritative over a settled connection); +// - the LOCAL volume is below UpToDate (Inconsistent/Outdated) — only a +// non-authoritative local copy is ever discarded, never an UpToDate +// one; +// - a connected PEER is UpToDate for that SAME volume — the authoritative +// source `invalidate` will SyncTarget from (without one, invalidate +// has nothing to pull and would fail; maybeLateAddPromote owns minting +// the source for the all-Inconsistent case); +// - the volume is NOT already being actively SyncTarget-pulled +// (replication-state SyncTarget with resync-suspended "no") — a live +// resync finishes on its own and must not be churned. +// +// Conservative on any probe/parse failure → empty. Self-limiting: once a +// volume reaches UpToDate it no longer qualifies, so the predicate stops +// returning it. +func (a *Adm) LateAddResyncKickVolumes(ctx context.Context, resource string) []int32 { out, err := a.exec.Run(ctx, "drbdsetup", "status", resource, "--json") if err != nil { - return false + return nil } var status drbdsetupStatusRoot err = json.Unmarshal(out, &status) if err != nil || len(status) == 0 || status[0].NodeID == nil { - return false + return nil } res := status[0] - // Never kick a resource an application is actively writing. + // Never invalidate a volume an application may be writing. if Role(res.Role).IsPrimary() { - return false + return nil } for _, conn := range res.Connections { if Role(conn.PeerRole).IsPrimary() { - return false + return nil } } @@ -1720,57 +1708,80 @@ func (a *Adm) NeedsLateAddResyncKick(ctx context.Context, resource string) bool // so this can never misfire during a fresh RD's first-activation // bring-up where every volume is transiently Inconsistent. if !localHasUpToDateVolume(res.Devices) { - return false + return nil } // Gate: every peer fully Connected — the resync-state read is only // authoritative over a settled connection. if !allPeersConnected(res.Connections) { - return false + return nil } - // Fire only when a genuine STALLED resync peer-device is present. - for _, conn := range res.Connections { - if slices.ContainsFunc(conn.PeerDevices, peerDeviceResyncStalled) { - return true + var kick []int32 + + for _, dev := range res.Devices { + if lateAddVolumeNeedsInvalidate(res.Connections, dev) { + kick = append(kick, dev.VolumeNumber) } } - return false + return kick } -// peerDeviceResyncStalled reports whether a peer-device is wedged in a -// resync that is present but suspended/waiting and making no progress — -// the BUG-048 late-add convergence-wedge signature. True when the -// replication-state is a paused or bitmap-exchange-wait state -// (PausedSyncS/PausedSyncT/WFBitMapS/WFBitMapT) AND resync-suspended is -// a non-"no" reason ("dependency"/"peer"/"user", possibly comma-joined). -// -// A healthy progressing resync (SyncSource/SyncTarget, or a WF* step -// with resync-suspended "no") is NOT stalled — it advances on its own -// and must be left alone. An empty resync-suspended token is treated as -// "no" (drbd-utils omits the field when nothing is suspended), so a -// transient WF* with no suspension is not mistaken for a wedge. -func peerDeviceResyncStalled(peerDev drbdsetupStatusPeerDevice) bool { - switch ReplicationState(peerDev.ReplicationState) { - case ReplicationStatePausedSyncS, ReplicationStatePausedSyncT, - ReplicationStateWFBitMapS, ReplicationStateWFBitMapT: - // A paused or bitmap-exchange-wait state — candidate. - case ReplicationStateOff, ReplicationStateEstablished, - ReplicationStateStartingSyncS, ReplicationStateStartingSyncT, - ReplicationStateWFSyncUUID, ReplicationStateSyncSource, - ReplicationStateSyncTarget, ReplicationStateVerifyS, - ReplicationStateVerifyT, ReplicationStateAhead, - ReplicationStateBehind: - // Progressing or steady-state — not a stalled wedge. +// lateAddVolumeNeedsInvalidate reports whether THIS node's local `dev` is +// a stalled late-add volume that a local `invalidate` would recover: the +// local copy is below UpToDate (Inconsistent/Outdated), a connected peer +// is UpToDate for the volume (the SyncTarget source), and the volume is +// not already being actively pulled (SyncTarget/resync-suspended:no). +func lateAddVolumeNeedsInvalidate(conns []drbdsetupStatusConnection, dev drbdsetupStatusDevice) bool { + switch DiskState(dev.DiskState) { + case DiskStateInconsistent, DiskStateOutdated: + // Below UpToDate — a candidate to re-pull. Never touch an + // UpToDate local (authoritative) copy. + case DiskStateUpToDate, DiskStateConsistent, DiskStateDiskless, + DiskStateAttaching, DiskStateDetaching, DiskStateFailed, + DiskStateNegotiating, DiskStateDUnknown: return false default: return false } - rss := peerDev.ResyncSuspended + var ( + peerUpToDate bool + activelyPulling bool + ) + + for _, conn := range conns { + for _, peerDev := range conn.PeerDevices { + if peerDev.VolumeNumber != dev.VolumeNumber { + continue + } + + if DiskState(peerDev.PeerDiskState) == DiskStateUpToDate { + peerUpToDate = true + } + + if peerDeviceActivelyPulling(peerDev) { + activelyPulling = true + } + } + } + + return peerUpToDate && !activelyPulling +} + +// peerDeviceActivelyPulling reports whether this peer-device is a live, +// unsuspended SyncTarget — a resync already pulling local data up to date +// that must be left to finish. Mirrors peerDeviceActivelySyncing but for +// the receive direction (StartingSyncT / WFBitMapT-with-resync-running are +// NOT counted: WFBitMapT with resync-suspended != "no" is the wedge this +// recovers). +func peerDeviceActivelyPulling(peerDev drbdsetupStatusPeerDevice) bool { + if ReplicationState(peerDev.ReplicationState) != ReplicationStateSyncTarget { + return false + } - return rss != "" && rss != wireNo + return peerDev.ResyncSuspended == "" || peerDev.ResyncSuspended == wireNo } // DownVeto is the tri-state outcome of the Bug 350 kernel-truth probe diff --git a/pkg/drbd/needs_late_add_resync_kick_test.go b/pkg/drbd/needs_late_add_resync_kick_test.go index 3b184eb6..1c8d8c69 100644 --- a/pkg/drbd/needs_late_add_resync_kick_test.go +++ b/pkg/drbd/needs_late_add_resync_kick_test.go @@ -20,21 +20,22 @@ package drbd_test import ( "errors" + "slices" "testing" "github.com/cozystack/blockstor/pkg/drbd" "github.com/cozystack/blockstor/pkg/storage" ) -var errKickProbe = errors.New("drbdsetup status: exit 1") - // Regression pins for the BUG-048 late-add resync-KICK predicate -// (NeedsLateAddResyncKick). It exists to unstick a late-added volume -// whose resync EXISTS but wedged in a paused / bitmap-exchange state -// that never advances — the ≥3-replica concurrent-add convergence wedge. -// It must fire ONLY on that stalled-resync signature and never on a -// healthy progressing sync, a day0 bootstrap, a Primary-held resource, -// or a not-fully-connected peer set. +// (LateAddResyncKickVolumes). It returns the LOCAL volume numbers whose +// late-add resync is STALLED and recoverable by `drbdadm invalidate +// /` — a local copy below UpToDate, an UpToDate peer to pull +// from, and no live resync already running. It must return ONLY genuine +// stragglers and never an UpToDate local, a volume with no UpToDate peer, +// a live SyncTarget, a day0 bootstrap, or a Primary-held resource. + +var errKickProbe = errors.New("drbdsetup status: exit 1") const lateAddKickKey = "drbdsetup status pvc-kick --json" @@ -47,14 +48,13 @@ func admWithKickStatus(t *testing.T, json string) *drbd.Adm { return drbd.NewAdm(fx) } -// Stand-observed signature 1 — "dependency deadlock" (B-3r RUN2/RUN8): -// vol-0/vol-1 UpToDate, the late vol-2 Inconsistent locally, a peer is -// PausedSyncS/resync-suspended:dependency (an elected source whose -// resync is paused on a stale dependency) and the partner waits in -// WFBitMapT/resync-suspended:peer. oos frozen at full. No Primary. -func TestNeedsLateAddResyncKick_DependencyDeadlock(t *testing.T) { +// Stand-observed "partial stall" (B-3r RUN9): vol-0/vol-1 UpToDate, the +// late vol-2 stuck Inconsistent locally while a peer (n2) reached +// UpToDate for vol-2 — but this replica is stuck WFBitMapT and never +// finalises. An UpToDate peer exists ⇒ invalidate vol-2 locally. +func TestLateAddResyncKickVolumes_PartialStall(t *testing.T) { adm := admWithKickStatus(t, `[{ - "name":"pvc-kick","node-id":0,"role":"Secondary", + "name":"pvc-kick","node-id":2,"role":"Secondary", "devices":[ {"volume":0,"disk-state":"UpToDate"}, {"volume":1,"disk-state":"UpToDate"}, @@ -63,109 +63,123 @@ func TestNeedsLateAddResyncKick_DependencyDeadlock(t *testing.T) { "connections":[ {"peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", "peer_devices":[ - {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, - {"volume":1,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, - {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"PausedSyncS","resync-suspended":"dependency"} + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"WFBitMapT","resync-suspended":"peer"} ]}, - {"peer-node-id":2,"name":"n3","connection-state":"Connected","peer-role":"Secondary", + {"peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Secondary", "peer_devices":[ - {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, - {"volume":1,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, - {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"WFBitMapT","resync-suspended":"peer"} + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"} ]} ] }]`) - if !adm.NeedsLateAddResyncKick(t.Context(), "pvc-kick") { - t.Fatal("expected kick for the dependency-deadlock wedge") + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if !slices.Equal(got, []int32{2}) { + t.Fatalf("expected to invalidate [2], got %v", got) } } -// Stand-observed signature 2 — "partial stall" (B-3r RUN9): one peer -// reached UpToDate (a real SyncSource exists) but a SECOND peer is stuck -// WFBitMapT / peerdisk Outdated and never finalises. The waiting peer's -// resync-suspended is "peer". No Primary. -func TestNeedsLateAddResyncKick_PartialStall(t *testing.T) { +// Post-promote dependency case: a peer is now UpToDate (the promoted +// source) but this Inconsistent replica's resync stays paused +// (resync-suspended:dependency). An UpToDate peer exists ⇒ invalidate. +func TestLateAddResyncKickVolumes_PostPromoteDependency(t *testing.T) { adm := admWithKickStatus(t, `[{ "name":"pvc-kick","node-id":2,"role":"Secondary", "devices":[ {"volume":0,"disk-state":"UpToDate"}, - {"volume":1,"disk-state":"UpToDate"}, - {"volume":2,"disk-state":"Outdated"} + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"PausedSyncT","resync-suspended":"dependency"} + ] + }] + }]`) + + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if !slices.Equal(got, []int32{2}) { + t.Fatalf("expected to invalidate [2], got %v", got) + } +} + +// No UpToDate peer for the stalled volume (all-Inconsistent dependency +// deadlock BEFORE promote): invalidate has no source, so this predicate +// must NOT return the volume — maybeLateAddPromote owns minting the +// source first. +func TestLateAddResyncKickVolumes_NoUpToDatePeerSkipped(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} ], "connections":[ {"peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", "peer_devices":[ - {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, - {"volume":1,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, - {"volume":2,"peer-disk-state":"UpToDate","replication-state":"WFBitMapT","resync-suspended":"peer"} + {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"PausedSyncS","resync-suspended":"dependency"} ]}, - {"peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Secondary", + {"peer-node-id":2,"name":"n3","connection-state":"Connected","peer-role":"Secondary", "peer_devices":[ - {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, - {"volume":1,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, - {"volume":2,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"} + {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"WFBitMapT","resync-suspended":"peer"} ]} ] }]`) - if !adm.NeedsLateAddResyncKick(t.Context(), "pvc-kick") { - t.Fatal("expected kick for the partial-stall wedge") + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if len(got) != 0 { + t.Fatalf("must NOT invalidate when no peer is UpToDate, got %v", got) } } -// A HEALTHY progressing initial sync (SyncSource / SyncTarget with -// resync-suspended "no") must NOT be kicked — it advances on its own, -// and a kick would needlessly abort it. -func TestNeedsLateAddResyncKick_HealthySyncNotKicked(t *testing.T) { +// A live, unsuspended SyncTarget (resync-suspended "no") must NOT be +// invalidated — it is actively pulling and finishes on its own. +func TestLateAddResyncKickVolumes_ActiveSyncTargetSkipped(t *testing.T) { adm := admWithKickStatus(t, `[{ - "name":"pvc-kick","node-id":0,"role":"Secondary", + "name":"pvc-kick","node-id":2,"role":"Secondary", "devices":[ {"volume":0,"disk-state":"UpToDate"}, - {"volume":1,"disk-state":"Inconsistent"} + {"volume":2,"disk-state":"Inconsistent"} ], "connections":[{ - "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Secondary", "peer_devices":[ - {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, - {"volume":1,"peer-disk-state":"Inconsistent","replication-state":"SyncSource","resync-suspended":"no"} + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"SyncTarget","resync-suspended":"no"} ] }] }]`) - if adm.NeedsLateAddResyncKick(t.Context(), "pvc-kick") { - t.Fatal("must NOT kick a healthy progressing SyncSource") + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if len(got) != 0 { + t.Fatalf("must NOT invalidate a live SyncTarget, got %v", got) } } -// A transient bitmap-exchange step with resync-suspended "no" (a sync -// about to start, not wedged) must NOT be kicked. -func TestNeedsLateAddResyncKick_WFBitMapNotSuspendedNotKicked(t *testing.T) { +// An UpToDate local volume is authoritative — never invalidate it even if +// a peer-device shows a stalled state. +func TestLateAddResyncKickVolumes_UpToDateLocalSkipped(t *testing.T) { adm := admWithKickStatus(t, `[{ "name":"pvc-kick","node-id":0,"role":"Secondary", "devices":[ {"volume":0,"disk-state":"UpToDate"}, - {"volume":1,"disk-state":"Inconsistent"} + {"volume":2,"disk-state":"UpToDate"} ], "connections":[{ "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", "peer_devices":[ - {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, - {"volume":1,"peer-disk-state":"Inconsistent","replication-state":"WFBitMapT","resync-suspended":"no"} + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"PausedSyncS","resync-suspended":"dependency"} ] }] }]`) - if adm.NeedsLateAddResyncKick(t.Context(), "pvc-kick") { - t.Fatal("must NOT kick a WFBitMapT that is not suspended (sync starting)") + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if len(got) != 0 { + t.Fatalf("must NOT invalidate an UpToDate local volume, got %v", got) } } -// Day0 bootstrap: NO local UpToDate volume yet (every volume transiently -// Inconsistent while the fresh-RD winner election runs). A paused sync -// here must NOT be kicked — the gate requires a local UpToDate sibling so -// this can never misfire during first activation. -func TestNeedsLateAddResyncKick_Day0NotKicked(t *testing.T) { +// Day0 bootstrap: NO local UpToDate volume yet. Even with an Inconsistent +// local + UpToDate peer, the gate refuses (RD not past first activation). +func TestLateAddResyncKickVolumes_Day0Skipped(t *testing.T) { adm := admWithKickStatus(t, `[{ "name":"pvc-kick","node-id":0,"role":"Secondary", "devices":[ @@ -174,95 +188,95 @@ func TestNeedsLateAddResyncKick_Day0NotKicked(t *testing.T) { "connections":[{ "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", "peer_devices":[ - {"volume":0,"peer-disk-state":"Inconsistent","replication-state":"PausedSyncS","resync-suspended":"dependency"} + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"PausedSyncT","resync-suspended":"dependency"} ] }] }]`) - if adm.NeedsLateAddResyncKick(t.Context(), "pvc-kick") { - t.Fatal("must NOT kick during day0 (no local UpToDate volume)") + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if len(got) != 0 { + t.Fatalf("must NOT invalidate during day0 (no local UpToDate volume), got %v", got) } } -// A Primary anywhere (local or peer) vetoes the kick — never disconnect a -// resource an application is actively writing. -func TestNeedsLateAddResyncKick_PrimaryVetoes(t *testing.T) { +// A Primary anywhere (local or peer) vetoes invalidate — never discard a +// volume an application may be writing. +func TestLateAddResyncKickVolumes_PrimaryVetoes(t *testing.T) { adm := admWithKickStatus(t, `[{ - "name":"pvc-kick","node-id":0,"role":"Primary", + "name":"pvc-kick","node-id":2,"role":"Secondary", "devices":[ {"volume":0,"disk-state":"UpToDate"}, - {"volume":1,"disk-state":"Inconsistent"} + {"volume":2,"disk-state":"Inconsistent"} ], "connections":[{ - "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Primary", "peer_devices":[ - {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, - {"volume":1,"peer-disk-state":"Inconsistent","replication-state":"PausedSyncS","resync-suspended":"dependency"} + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"PausedSyncT","resync-suspended":"dependency"} ] }] }]`) - if adm.NeedsLateAddResyncKick(t.Context(), "pvc-kick") { - t.Fatal("must NOT kick while a Primary holds the resource") + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if len(got) != 0 { + t.Fatalf("must NOT invalidate while a Primary holds the resource, got %v", got) } } -// A not-fully-connected peer (Connecting / StandAlone) defers the kick: -// the resync-state read is only authoritative over a settled connection. -func TestNeedsLateAddResyncKick_PeerNotConnectedDefers(t *testing.T) { +// A not-fully-connected peer (Connecting / StandAlone) defers — the +// resync-state read is only authoritative over a settled connection. +func TestLateAddResyncKickVolumes_PeerNotConnectedDefers(t *testing.T) { adm := admWithKickStatus(t, `[{ - "name":"pvc-kick","node-id":0,"role":"Secondary", + "name":"pvc-kick","node-id":2,"role":"Secondary", "devices":[ {"volume":0,"disk-state":"UpToDate"}, - {"volume":1,"disk-state":"Inconsistent"} + {"volume":2,"disk-state":"Inconsistent"} ], "connections":[ - {"peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + {"peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Secondary", "peer_devices":[ - {"volume":1,"peer-disk-state":"Inconsistent","replication-state":"PausedSyncS","resync-suspended":"dependency"} + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"PausedSyncT","resync-suspended":"dependency"} ]}, - {"peer-node-id":2,"name":"n3","connection-state":"Connecting","peer-role":"Unknown", + {"peer-node-id":1,"name":"n2","connection-state":"Connecting","peer-role":"Unknown", "peer_devices":[]} ] }]`) - if adm.NeedsLateAddResyncKick(t.Context(), "pvc-kick") { - t.Fatal("must NOT kick when a peer is not fully Connected") + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if len(got) != 0 { + t.Fatalf("must NOT invalidate when a peer is not fully Connected, got %v", got) } } -// Fully converged (every peer Established, resync-suspended "no") — no -// stalled resync remains, so the predicate stops holding (self-limiting). -func TestNeedsLateAddResyncKick_ConvergedNotKicked(t *testing.T) { +// Fully converged (every volume UpToDate) — nothing to invalidate. +func TestLateAddResyncKickVolumes_ConvergedEmpty(t *testing.T) { adm := admWithKickStatus(t, `[{ "name":"pvc-kick","node-id":0,"role":"Secondary", "devices":[ {"volume":0,"disk-state":"UpToDate"}, - {"volume":1,"disk-state":"UpToDate"}, {"volume":2,"disk-state":"UpToDate"} ], "connections":[{ "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", "peer_devices":[ {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, - {"volume":1,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, {"volume":2,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"} ] }] }]`) - if adm.NeedsLateAddResyncKick(t.Context(), "pvc-kick") { - t.Fatal("must NOT kick a fully converged resource") + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if len(got) != 0 { + t.Fatalf("must return nothing for a converged resource, got %v", got) } } -// Probe failure (drbdsetup errors) → conservative false, no kick. -func TestNeedsLateAddResyncKick_ProbeFailureFalse(t *testing.T) { +// Probe failure (drbdsetup errors) → conservative empty. +func TestLateAddResyncKickVolumes_ProbeFailureEmpty(t *testing.T) { fx := storage.NewFakeExec() fx.Responses[lateAddKickKey] = storage.FakeResponse{Err: errKickProbe} adm := drbd.NewAdm(fx) - if adm.NeedsLateAddResyncKick(t.Context(), "pvc-kick") { - t.Fatal("must return false on a probe failure") + if got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick"); len(got) != 0 { + t.Fatalf("must return empty on a probe failure, got %v", got) } } diff --git a/pkg/satellite/maybe_kick_late_add_resync_test.go b/pkg/satellite/maybe_kick_late_add_resync_test.go index 221dba90..cb224dad 100644 --- a/pkg/satellite/maybe_kick_late_add_resync_test.go +++ b/pkg/satellite/maybe_kick_late_add_resync_test.go @@ -30,67 +30,60 @@ import ( ) // Regression pins for the BUG-048 late-add resync-KICK self-heal wiring. -// maybeKickLateAddResync must disconnect+adjust a late-added volume whose -// resync wedged in a paused / bitmap-exchange state that never advances -// (the ≥3-replica concurrent-add convergence wedge), and must NOT act on -// a diskless replica, a healthy progressing sync, or a converged set. +// maybeKickLateAddResync must `drbdadm invalidate /` each stalled +// late-added volume that has an UpToDate peer (re-pull FROM it), and must +// NOT act on a diskless replica, an actively-pulling SyncTarget, or a +// volume with no UpToDate peer. const kickStatusKey = "drbdsetup status pvc-ka --json" -// vol-0/vol-1 UpToDate, the late vol-2 Inconsistent locally, a peer is -// PausedSyncS/resync-suspended:dependency and the partner WFBitMapT/peer -// — the stand-observed dependency-deadlock wedge. No Primary. +// vol-0 UpToDate, late vol-2 Inconsistent locally with an UpToDate peer +// whose resync is stalled (PausedSyncT/dependency) — the recoverable +// straggler. No Primary. const kickStatusWedged = `[{ - "name":"pvc-ka","node-id":0,"role":"Secondary", + "name":"pvc-ka","node-id":2,"role":"Secondary", "devices":[ {"volume":0,"disk-state":"UpToDate"}, - {"volume":1,"disk-state":"UpToDate"}, {"volume":2,"disk-state":"Inconsistent"} ], - "connections":[ - {"peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", - "peer_devices":[ - {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"PausedSyncS","resync-suspended":"dependency"} - ]}, - {"peer-node-id":2,"name":"n3","connection-state":"Connected","peer-role":"Secondary", - "peer_devices":[ - {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"WFBitMapT","resync-suspended":"peer"} - ]} - ] + "connections":[{ + "peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"PausedSyncT","resync-suspended":"dependency"} + ] + }] }]` func kickDR() *intent.DesiredResource { return &intent.DesiredResource{ Name: "pvc-ka", - NodeName: "n1", + NodeName: "n3", Volumes: []*intent.DesiredVolume{ {VolumeNumber: 0, SizeKib: 1024 * 1024, StoragePool: "thin1"}, - {VolumeNumber: 1, SizeKib: 1024 * 1024, StoragePool: "thin1"}, {VolumeNumber: 2, SizeKib: 1024 * 1024, StoragePool: "thin1"}, }, - Peers: []intent.DesiredPeer{{Name: "n2"}, {Name: "n3"}}, + Peers: []intent.DesiredPeer{{Name: "n1"}}, DrbdOptions: map[string]string{ - "port": "7000", "node-id": "0", "address": "10.0.0.1", "minor": "1000", + "port": "7000", "node-id": "2", "address": "10.0.0.3", "minor": "1000", }, } } -// The wedge fires the disconnect+adjust kick — but ONLY after the stall -// has persisted beyond the dwell window. The FIRST observation starts the -// dwell (no kick); once the recorded stall is older than lateAddStallDwell -// the kick fires. -func TestMaybeKickLateAddResync_FiresForStalledResync(t *testing.T) { +// The wedge fires the per-volume invalidate — but ONLY after the stall +// has persisted beyond the dwell window. +func TestMaybeKickLateAddResync_InvalidatesStalledVolume(t *testing.T) { fx := storage.NewFakeExec() fx.Expect(kickStatusKey, storage.FakeResponse{Stdout: []byte(kickStatusWedged)}) - rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n1"}) + rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n3"}) - // First pass starts the dwell window — must NOT kick yet. + // First pass starts the dwell window — must NOT invalidate yet. if err := rec.maybeKickLateAddResync(context.Background(), kickDR(), false); err != nil { t.Fatalf("maybeKickLateAddResync pass 1: %v", err) } - if slices.Contains(fx.CommandLines(), "drbdadm disconnect pvc-ka") { - t.Fatalf("must NOT kick on the first observation (dwell not elapsed), got: %v", fx.CommandLines()) + if slices.Contains(fx.CommandLines(), "drbdadm invalidate pvc-ka/2") { + t.Fatalf("must NOT invalidate on the first observation (dwell not elapsed), got: %v", fx.CommandLines()) } // Backdate the recorded stall so the dwell has elapsed. @@ -102,35 +95,34 @@ func TestMaybeKickLateAddResync_FiresForStalledResync(t *testing.T) { t.Fatalf("maybeKickLateAddResync pass 2: %v", err) } - cmds := fx.CommandLines() - if !slices.Contains(cmds, "drbdadm disconnect pvc-ka") { - t.Errorf("expected `drbdadm disconnect pvc-ka` after dwell, got: %v", cmds) + if !slices.Contains(fx.CommandLines(), "drbdadm invalidate pvc-ka/2") { + t.Errorf("expected `drbdadm invalidate pvc-ka/2` after dwell, got: %v", fx.CommandLines()) } - if !slices.Contains(cmds, "drbdadm adjust pvc-ka") { - t.Errorf("expected `drbdadm adjust pvc-ka` (reconnect after disconnect), got: %v", cmds) + // Must NOT use the abandoned disconnect/adjust handshake kick. + if slices.Contains(fx.CommandLines(), "drbdadm disconnect pvc-ka") { + t.Errorf("must NOT disconnect (re-handshake abandons the bitmap), got: %v", fx.CommandLines()) } } -// A stall that CLEARS (predicate goes false) resets the dwell window — a -// subsequent fresh stall must start its dwell from scratch rather than -// inherit the earlier timestamp and kick immediately. +// A stall that CLEARS (no volume returned) resets the dwell window. func TestMaybeKickLateAddResync_DwellResetsWhenStallClears(t *testing.T) { fx := storage.NewFakeExec() fx.Expect(kickStatusKey, storage.FakeResponse{Stdout: []byte(kickStatusWedged)}) - rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n1"}) + rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n3"}) - // Start the dwell window. if err := rec.maybeKickLateAddResync(context.Background(), kickDR(), false); err != nil { t.Fatalf("maybeKickLateAddResync pass 1: %v", err) } - // Stall clears (converged) — predicate goes false, dwell is dropped. + // Converged — no stalled volume returned, dwell dropped. fx.Expect(kickStatusKey, storage.FakeResponse{Stdout: []byte(`[{ - "name":"pvc-ka","node-id":0,"role":"Secondary", - "devices":[{"volume":0,"disk-state":"UpToDate"}], - "connections":[{"peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", - "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}]}] + "name":"pvc-ka","node-id":2,"role":"Secondary", + "devices":[{"volume":0,"disk-state":"UpToDate"},{"volume":2,"disk-state":"UpToDate"}], + "connections":[{"peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}]}] }]`)}) if err := rec.maybeKickLateAddResync(context.Background(), kickDR(), false); err != nil { t.Fatalf("maybeKickLateAddResync pass 2: %v", err) @@ -144,62 +136,53 @@ func TestMaybeKickLateAddResync_DwellResetsWhenStallClears(t *testing.T) { } } -// A diskless replica has no local resync to kick — skip outright. +// A diskless replica has no local copy to invalidate — skip outright. func TestMaybeKickLateAddResync_SkipsDiskless(t *testing.T) { fx := storage.NewFakeExec() fx.Expect(kickStatusKey, storage.FakeResponse{Stdout: []byte(kickStatusWedged)}) - rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n1"}) + rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n3"}) if err := rec.maybeKickLateAddResync(context.Background(), kickDR(), true); err != nil { t.Fatalf("maybeKickLateAddResync: %v", err) } - if slices.Contains(fx.CommandLines(), "drbdadm disconnect pvc-ka") { - t.Errorf("diskless replica must NOT be kicked, got: %v", fx.CommandLines()) + if slices.Contains(fx.CommandLines(), "drbdadm invalidate pvc-ka/2") { + t.Errorf("diskless replica must NOT be invalidated, got: %v", fx.CommandLines()) } } -// A healthy progressing SyncSource (resync-suspended "no") must NOT be -// kicked — the kick would abort a sync that finishes on its own. -func TestMaybeKickLateAddResync_SkipsHealthySync(t *testing.T) { +// A live, unsuspended SyncTarget must NOT be invalidated — it is actively +// pulling and finishes on its own. +func TestMaybeKickLateAddResync_SkipsActiveSyncTarget(t *testing.T) { fx := storage.NewFakeExec() fx.Expect(kickStatusKey, storage.FakeResponse{Stdout: []byte(`[{ - "name":"pvc-ka","node-id":0,"role":"Secondary", - "devices":[ - {"volume":0,"disk-state":"UpToDate"}, - {"volume":2,"disk-state":"Inconsistent"} - ], - "connections":[{ - "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "name":"pvc-ka","node-id":2,"role":"Secondary", + "devices":[{"volume":0,"disk-state":"UpToDate"},{"volume":2,"disk-state":"Inconsistent"}], + "connections":[{"peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Secondary", "peer_devices":[ - {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"SyncSource","resync-suspended":"no"} - ] - }] + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"SyncTarget","resync-suspended":"no"}]}] }]`)}) - rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n1"}) + rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n3"}) if err := rec.maybeKickLateAddResync(context.Background(), kickDR(), false); err != nil { t.Fatalf("maybeKickLateAddResync: %v", err) } - if slices.Contains(fx.CommandLines(), "drbdadm disconnect pvc-ka") { - t.Errorf("must NOT kick a healthy progressing SyncSource, got: %v", fx.CommandLines()) + if slices.Contains(fx.CommandLines(), "drbdadm invalidate pvc-ka/2") { + t.Errorf("must NOT invalidate a live SyncTarget, got: %v", fx.CommandLines()) } } -// The kick is throttled by the shared recoveryPromoteThrottle so a -// still-converging resync isn't churned: a second back-to-back call -// inside the window must NOT re-disconnect. +// Throttled by the shared recoveryPromoteThrottle: a second back-to-back +// pass inside the window (dwell pre-satisfied) must NOT re-invalidate. func TestMaybeKickLateAddResync_Throttled(t *testing.T) { fx := storage.NewFakeExec() fx.Expect(kickStatusKey, storage.FakeResponse{Stdout: []byte(kickStatusWedged)}) - rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n1"}) + rec := NewReconciler(ReconcilerConfig{Adm: drbd.NewAdm(fx), NodeName: "n3"}) - // Pre-satisfy the dwell so both passes are past the dwell gate and the - // throttle is the only thing limiting the kick. rec.mu.Lock() rec.firstLateAddStallAt["pvc-ka"] = time.Now().Add(-2 * lateAddStallDwell) rec.mu.Unlock() @@ -210,13 +193,13 @@ func TestMaybeKickLateAddResync_Throttled(t *testing.T) { } } - disconnects := 0 + invals := 0 for _, c := range fx.CommandLines() { - if c == "drbdadm disconnect pvc-ka" { - disconnects++ + if c == "drbdadm invalidate pvc-ka/2" { + invals++ } } - if disconnects != 1 { - t.Errorf("expected exactly 1 disconnect within the throttle window, got %d", disconnects) + if invals != 1 { + t.Errorf("expected exactly 1 invalidate within the throttle window, got %d", invals) } } diff --git a/pkg/satellite/reconciler.go b/pkg/satellite/reconciler.go index 4f535dbe..d24257d4 100644 --- a/pkg/satellite/reconciler.go +++ b/pkg/satellite/reconciler.go @@ -2774,38 +2774,46 @@ func (r *Reconciler) maybeLateAddPromote(ctx context.Context, dr *intent.Desired // concurrently-added volume of a 3-diskful RD, oos frozen at full / a // peer stuck below UpToDate forever): // -// - dependency deadlock: a SyncSource is elected but sits PausedSyncS -// with resync-suspended "dependency" toward its peers, which in turn -// wait in WFBitMapT/resync-suspended "peer" for a bitmap exchange the -// paused source never starts. Nobody ever reaches UpToDate. -// - partial stall: one peer reaches UpToDate (a real SyncSource), but a -// SECOND peer stays WFBitMapT / Outdated and never finalises — the -// source fed one peer but not the other. +// - partial stall: one peer reaches UpToDate (a real SyncSource), but +// THIS replica stays WFBitMapT / Outdated / Inconsistent and never +// finalises — the source fed another peer but not us. +// - dependency deadlock (post-promote): once maybeLateAddPromote +// force-primaries the lowest-id node to UpToDate, the remaining +// Inconsistent replicas have an UpToDate peer but their resync stays +// paused (resync-suspended:dependency) and never advances. // // Why neither promote self-heal covers it: maybeLateAddPromote fires only // when NO peer holds data and the volume is Inconsistent on EVERY replica -// (it mints a source); here a source exists (or a peer is already -// UpToDate) but the resync machinery is latched. maybeRecoveryPromote -// needs the local already-UpToDate. A force-primary does not clear a -// resync-suspended:dependency latch — only a connection re-handshake does. -// -// NeedsLateAddResyncKick is the kernel-truth gate: it fires ONLY when no -// replica is Primary, the RD is past day0 (a local UpToDate volume -// exists), every peer is Connected, and a peer-device is in a stalled -// resync (PausedSync*/WFBitMap* with a non-"no" resync-suspended). The -// kick (Adm.LateAddResyncKick) `disconnect`s then `adjust`s, re-running -// the GI handshake so DRBD restarts the resync from the correct -// direction. Data-safe (disconnect/reconnect never mutates data) and -// self-limiting (once the resync finishes no stalled peer-device remains). -// Throttled via the shared recoveryPromoteThrottle so a still-converging -// resync isn't churned by repeated kicks. Skipped on diskless replicas -// (no local resync to kick) and when Adm is unwired (storage-only tests). +// (it mints a source); here an UpToDate peer EXISTS but the local resync +// is latched. maybeRecoveryPromote needs the local already-UpToDate. +// +// Why `invalidate` and not a connection re-handshake: every replica of a +// fresh late-added volume shares the day0 current-UUID, so disconnect+ +// reconnect reads "same generation ⇒ no resync needed" and abandons the +// dirty bitmap — a WORSE wedge (verified on-stand: connection went +// StandAlone / Established with out-of-sync frozen). `drbdadm invalidate +// /` instead discards the empty, non-authoritative LOCAL copy of +// that one volume and forces a full SyncTarget FROM the UpToDate peer. +// +// LateAddResyncKickVolumes is the kernel-truth gate: it returns the local +// volumes below UpToDate that have a connected UpToDate peer (the +// SyncTarget source) and are not already being actively pulled — ONLY when +// no replica is Primary, the RD is past day0, and every peer is Connected. +// Each returned volume is invalidated locally. Data-safe (only a +// non-authoritative local copy is discarded, and only when an UpToDate +// peer exists to re-pull from) and self-limiting (once a volume reaches +// UpToDate it no longer qualifies). A 45s stall-dwell ensures only a +// GENUINELY frozen resync is invalidated, never the normal transient +// volume-resync serialization. Throttled via the shared +// recoveryPromoteThrottle. Skipped on diskless replicas (no local copy to +// invalidate) and when Adm is unwired (storage-only tests). func (r *Reconciler) maybeKickLateAddResync(ctx context.Context, dr *intent.DesiredResource, diskless bool) error { if diskless || r.cfg.Adm == nil { return nil } - if !r.cfg.Adm.NeedsLateAddResyncKick(ctx, dr.GetName()) { + volumes := r.cfg.Adm.LateAddResyncKickVolumes(ctx, dr.GetName()) + if len(volumes) == 0 { // Not stalled (or converged): clear any pending dwell so a future // transient stall starts its window fresh. r.clearLateAddStall(dr.GetName()) @@ -2813,9 +2821,9 @@ func (r *Reconciler) maybeKickLateAddResync(ctx context.Context, dr *intent.Desi return nil } - // Dwell gate: only kick once the stall has PERSISTED beyond + // Dwell gate: only invalidate once the stall has PERSISTED beyond // lateAddStallDwell, so the normal transient volume-resync - // serialization (a few seconds of PausedSyncS/dependency while a + // serialization (a few seconds of PausedSync*/dependency while a // sibling volume finishes) is never disrupted — only a genuinely // frozen resync is. if !r.lateAddStallDwellElapsed(dr.GetName()) { @@ -2826,11 +2834,17 @@ func (r *Reconciler) maybeKickLateAddResync(ctx context.Context, dr *intent.Desi return nil } - log.FromContext(ctx).Info("BUG-048 late-add resync-kick: disconnect+adjust to restart a stalled resync (paused/bitmap-exchange-wedged) for a late-added volume", - "resource", dr.GetName()) + for _, vol := range volumes { + log.FromContext(ctx).Info("BUG-048 late-add resync-kick: invalidate locally to re-pull a stalled late-added volume from its UpToDate peer", + "resource", dr.GetName(), "volume", vol) + + err := r.cfg.Adm.InvalidateVolume(ctx, dr.GetName(), vol) + if err != nil { + return errors.Wrapf(err, "late-add resync-kick invalidate %s/%d", dr.GetName(), vol) + } + } - return errors.Wrapf(r.cfg.Adm.LateAddResyncKick(ctx, dr.GetName()), - "late-add resync-kick %s", dr.GetName()) + return nil } // lateAddStallDwellElapsed records the first time a resource was observed From 5df2a680cc7542f2c923abb1fc98dd1d97dffdb1 Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Sun, 14 Jun 2026 18:42:07 +0300 Subject: [PATCH 17/19] fix(satellite,drbd): prevent late-add invalidate flap (one volume at a time, back off on active resync) (BUG-048) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first invalidate iteration flapped on-stand: it invalidated every stalled volume at once and re-fired while a triggered resync was still running, so out-of-sync dropped to 0, the kick re-invalidated, and oos jumped back to full — the volume never settled UpToDate within the convergence budget. Three refinements make the kick idempotent and non-flapping: - invalidate ONLY a genuinely-stuck Inconsistent local volume; Outdated is now excluded (it is the transient post-resync / mid-handshake state a just-invalidated volume passes through — re-invalidating it re-dirties a converging copy). - back off whenever ANY peer-device on the volume has a resync actively in progress in either direction (peerDeviceResyncInProgress) — so once an invalidate triggers a SyncTarget the kick stops until it finishes; only a genuinely stalled (paused/suspended) handshake re-qualifies. - invalidate ONE volume per kick (lowest-numbered stalled), since DRBD serialises resyncs per connection — invalidating several at once re-creates the dependency-serialization that wedges. Also shortens the stall-dwell 45s→20s: the in-progress back-off is now the primary anti-flap guard, so the dwell only needs a small settling margin, and a genuinely frozen wedge recovers well inside the convergence budget. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- pkg/drbd/drbdadm.go | 68 +++++++++++++++------ pkg/drbd/needs_late_add_resync_kick_test.go | 54 ++++++++++++++++ pkg/satellite/reconciler.go | 47 +++++++------- 3 files changed, 128 insertions(+), 41 deletions(-) diff --git a/pkg/drbd/drbdadm.go b/pkg/drbd/drbdadm.go index 966eab81..dff9e619 100644 --- a/pkg/drbd/drbdadm.go +++ b/pkg/drbd/drbdadm.go @@ -1734,21 +1734,29 @@ func (a *Adm) LateAddResyncKickVolumes(ctx context.Context, resource string) []i // is UpToDate for the volume (the SyncTarget source), and the volume is // not already being actively pulled (SyncTarget/resync-suspended:no). func lateAddVolumeNeedsInvalidate(conns []drbdsetupStatusConnection, dev drbdsetupStatusDevice) bool { + // Only a genuinely-stuck Inconsistent local copy is invalidated. + // Outdated is DELIBERATELY excluded: after an invalidate the volume + // transits Inconsistent → (resync) → Outdated → UpToDate, and a fresh + // SyncSource peer is briefly Outdated mid-handshake. Re-invalidating an + // Outdated copy re-dirties a volume that is actually converging, which + // FLAPS the resync (stand-observed: oos drops to 0, invalidate re-fires, + // oos jumps back to full). Leaving Outdated alone lets the in-flight + // resync finish. switch DiskState(dev.DiskState) { - case DiskStateInconsistent, DiskStateOutdated: - // Below UpToDate — a candidate to re-pull. Never touch an - // UpToDate local (authoritative) copy. - case DiskStateUpToDate, DiskStateConsistent, DiskStateDiskless, - DiskStateAttaching, DiskStateDetaching, DiskStateFailed, - DiskStateNegotiating, DiskStateDUnknown: + case DiskStateInconsistent: + // Genuinely below-UpToDate and not a transient post-resync state — + // the invalidate candidate. + case DiskStateUpToDate, DiskStateConsistent, DiskStateOutdated, + DiskStateDiskless, DiskStateAttaching, DiskStateDetaching, + DiskStateFailed, DiskStateNegotiating, DiskStateDUnknown: return false default: return false } var ( - peerUpToDate bool - activelyPulling bool + peerUpToDate bool + activeResyncIP bool ) for _, conn := range conns { @@ -1761,27 +1769,47 @@ func lateAddVolumeNeedsInvalidate(conns []drbdsetupStatusConnection, dev drbdset peerUpToDate = true } - if peerDeviceActivelyPulling(peerDev) { - activelyPulling = true + // Back off if ANY peer-device on this volume has a resync + // actively in progress (either direction) — an invalidate now + // would abort/re-dirty a converging resync and flap it. + if peerDeviceResyncInProgress(peerDev) { + activeResyncIP = true } } } - return peerUpToDate && !activelyPulling + return peerUpToDate && !activeResyncIP } -// peerDeviceActivelyPulling reports whether this peer-device is a live, -// unsuspended SyncTarget — a resync already pulling local data up to date -// that must be left to finish. Mirrors peerDeviceActivelySyncing but for -// the receive direction (StartingSyncT / WFBitMapT-with-resync-running are -// NOT counted: WFBitMapT with resync-suspended != "no" is the wedge this -// recovers). -func peerDeviceActivelyPulling(peerDev drbdsetupStatusPeerDevice) bool { - if ReplicationState(peerDev.ReplicationState) != ReplicationStateSyncTarget { +// peerDeviceResyncInProgress reports whether this peer-device has a resync +// actively running or starting in EITHER direction — i.e. NOT stalled. A +// volume with any such peer-device is converging on its own and must not be +// invalidated (doing so re-dirties it and flaps the resync). True for +// SyncSource / SyncTarget / StartingSync* and for a WFBitMap* handshake +// that is NOT suspended (resync-suspended "no"/empty — the bitmap exchange +// is proceeding). A WFBitMap* / PausedSync* with a non-"no" resync-suspended +// is the STALLED wedge — that is NOT "in progress", so the kick may fire. +func peerDeviceResyncInProgress(peerDev drbdsetupStatusPeerDevice) bool { + switch ReplicationState(peerDev.ReplicationState) { + case ReplicationStateSyncSource, ReplicationStateSyncTarget, + ReplicationStateStartingSyncS, ReplicationStateStartingSyncT: + // A resync is actively transferring data — always in progress. + return true + case ReplicationStateWFBitMapS, ReplicationStateWFBitMapT, + ReplicationStateWFSyncUUID: + // A handshake step: in progress ONLY if not suspended. A non-"no" + // resync-suspended (dependency/peer/user) is the stalled wedge. + return peerDev.ResyncSuspended == "" || peerDev.ResyncSuspended == wireNo + case ReplicationStateOff, ReplicationStateEstablished, + ReplicationStatePausedSyncS, ReplicationStatePausedSyncT, + ReplicationStateVerifyS, ReplicationStateVerifyT, + ReplicationStateAhead, ReplicationStateBehind: + // Established (done), explicitly paused, or non-resync states — not + // a resync in progress. return false } - return peerDev.ResyncSuspended == "" || peerDev.ResyncSuspended == wireNo + return false } // DownVeto is the tri-state outcome of the Bug 350 kernel-truth probe diff --git a/pkg/drbd/needs_late_add_resync_kick_test.go b/pkg/drbd/needs_late_add_resync_kick_test.go index 1c8d8c69..745cb20d 100644 --- a/pkg/drbd/needs_late_add_resync_kick_test.go +++ b/pkg/drbd/needs_late_add_resync_kick_test.go @@ -154,6 +154,60 @@ func TestLateAddResyncKickVolumes_ActiveSyncTargetSkipped(t *testing.T) { } } +// Flap guard (multi-peer): the local volume is Inconsistent and one peer +// is stalled (PausedSyncS/dependency) BUT another peer is actively pulling +// it up (SyncTarget/resync-suspended:no). A resync is in progress, so +// invalidating now would re-dirty a converging volume and FLAP it — must +// back off and let the live resync finish. +func TestLateAddResyncKickVolumes_InProgressMultiPeerSkipped(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":2,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Inconsistent"} + ], + "connections":[ + {"peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"SyncTarget","resync-suspended":"no"} + ]}, + {"peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"Inconsistent","replication-state":"PausedSyncS","resync-suspended":"dependency"} + ]} + ] + }]`) + + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if len(got) != 0 { + t.Fatalf("must NOT invalidate while a live resync is converging the volume, got %v", got) + } +} + +// An Outdated local volume is a TRANSIENT post-resync / mid-handshake state +// — re-invalidating it re-dirties a converging volume (the stand-observed +// flap). Only a genuinely-stuck Inconsistent local is ever invalidated. +func TestLateAddResyncKickVolumes_OutdatedLocalSkipped(t *testing.T) { + adm := admWithKickStatus(t, `[{ + "name":"pvc-kick","node-id":2,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":2,"disk-state":"Outdated"} + ], + "connections":[{ + "peer-node-id":0,"name":"n1","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":2,"peer-disk-state":"UpToDate","replication-state":"WFBitMapT","resync-suspended":"peer"} + ] + }] + }]`) + + got := adm.LateAddResyncKickVolumes(t.Context(), "pvc-kick") + if len(got) != 0 { + t.Fatalf("must NOT invalidate an Outdated (transient/recovering) local volume, got %v", got) + } +} + // An UpToDate local volume is authoritative — never invalidate it even if // a peer-device shows a stalled state. func TestLateAddResyncKickVolumes_UpToDateLocalSkipped(t *testing.T) { diff --git a/pkg/satellite/reconciler.go b/pkg/satellite/reconciler.go index d24257d4..abbf527c 100644 --- a/pkg/satellite/reconciler.go +++ b/pkg/satellite/reconciler.go @@ -323,18 +323,18 @@ type Reconciler struct { const recoveryPromoteThrottle = 10 * time.Second // lateAddStallDwell is how long a late-add resync must REMAIN stalled -// (NeedsLateAddResyncKick continuously true) before maybeKickLateAddResync -// fires the disconnect+adjust kick. It must comfortably exceed the normal, -// transient DRBD volume-resync serialization gap — when two volumes are -// added at once, vol-N legitimately sits PausedSyncS/resync-suspended: -// dependency for a few seconds while vol-(N-1) finishes its resync, which -// looks identical to the wedge for a moment. The dwell ensures the kick -// only disrupts a resync that is GENUINELY stuck (oos frozen for the whole -// window), never a healthy serialized sync that is about to advance on its -// own. The genuine wedge sits stalled indefinitely, so it always clears -// the dwell; a transient serialization clears the predicate (and resets -// the dwell) well before it elapses. -const lateAddStallDwell = 45 * time.Second +// (LateAddResyncKickVolumes continuously non-empty) before +// maybeKickLateAddResync fires the per-volume invalidate. It is a SECONDARY +// guard against disrupting the normal, transient DRBD volume-resync +// serialization — the PRIMARY guard is peerDeviceResyncInProgress, which +// already backs the kick off whenever any resync on the volume is actively +// progressing (so a freshly-serialized vol-N that is about to start is not +// invalidated). The dwell adds a small settling margin so a stall that is +// momentarily mis-read clears on its own first. Kept short so a genuine +// FROZEN wedge (oos never moves) recovers well inside the operator/L7 +// convergence budget; far longer than the sub-second window in which a +// just-added volume transitions through its handshake states. +const lateAddStallDwell = 20 * time.Second // restoreSnapshotMissingBudget bounds how many consecutive reconcile // passes a node-local snapshot-restore volume may REQUEUE on a @@ -2834,17 +2834,22 @@ func (r *Reconciler) maybeKickLateAddResync(ctx context.Context, dr *intent.Desi return nil } - for _, vol := range volumes { - log.FromContext(ctx).Info("BUG-048 late-add resync-kick: invalidate locally to re-pull a stalled late-added volume from its UpToDate peer", - "resource", dr.GetName(), "volume", vol) + // Invalidate ONE volume per kick (the lowest-numbered stalled one). + // DRBD serialises resyncs per connection, so invalidating every stalled + // volume at once would re-create the very dependency-serialization that + // wedges — vol-N's invalidate would sit PausedSyncT/dependency behind + // vol-(N-1)'s and, looking stalled, get re-invalidated and flap. One at + // a time lets each invalidated volume's resync run (and back off the + // kick via peerDeviceResyncInProgress) before the next reconcile picks + // up the next straggler. volumes is built in ascending volume order by + // the kernel device scan. + vol := volumes[0] - err := r.cfg.Adm.InvalidateVolume(ctx, dr.GetName(), vol) - if err != nil { - return errors.Wrapf(err, "late-add resync-kick invalidate %s/%d", dr.GetName(), vol) - } - } + log.FromContext(ctx).Info("BUG-048 late-add resync-kick: invalidate locally to re-pull a stalled late-added volume from its UpToDate peer", + "resource", dr.GetName(), "volume", vol) - return nil + return errors.Wrapf(r.cfg.Adm.InvalidateVolume(ctx, dr.GetName(), vol), + "late-add resync-kick invalidate %s/%d", dr.GetName(), vol) } // lateAddStallDwellElapsed records the first time a resource was observed From 477fc3e269e60e56df042c748133f7cbf896ecff Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Sun, 14 Jun 2026 19:03:59 +0300 Subject: [PATCH 18/19] fix(satellite,drbd): mint late-add source via clear-bitmap, not primary --force (BUG-048) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The all-Inconsistent late-add wedge (no replica ever won the volume's UpToDate seed) was never actually recovered. maybeLateAddPromote ran `drbdadm primary --force `, but the kernel REJECTS it with "(-16) Disk state is lower than outdated" whenever any volume of the resource is Inconsistent — stand-observed: the promote retried every throttle window and the volume stayed Inconsistent on all 3 replicas past the convergence budget (out-of-sync frozen at full). `primary` is also resource-wide, so one Inconsistent late volume blocks promoting the whole resource even though the sibling volumes are UpToDate. Replace the force-primary with MintLateAddSource, which makes THIS node an UpToDate source per-volume the way DRBD's own skip-initial-sync does: disconnect the resource, run `drbdsetup new-current-uuid --clear-bitmap ` for each LOCAL Inconsistent volume (flips it UpToDate with a clean bitmap), then `drbdadm adjust` to reconnect — the Inconsistent peers then SyncTarget from the freshly-UpToDate local volumes. Verified on-stand: the clear-bitmap flips the volume UpToDate and the peers begin converging, where primary --force was permanently rejected. Data-safe: gated by NeedsLateAddPromote (lowest node-id, NO peer holds committed data, no Primary), so exactly one deterministic node mints the source and the minted copy is byte-equivalent to the equally-empty peers (day0 lineage). Only Inconsistent volumes are cleared; UpToDate siblings are untouched. Adds the device minor to the parsed drbdsetup status so the clear-bitmap can target the specific stuck volume. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- pkg/drbd/drbdadm.go | 92 ++++++++++++++++++++ pkg/drbd/drbdsetup_show.go | 5 ++ pkg/drbd/needs_late_add_promote_test.go | 72 +++++++++++++++ pkg/satellite/maybe_late_add_promote_test.go | 39 ++++++--- pkg/satellite/reconciler.go | 35 ++++++-- 5 files changed, 221 insertions(+), 22 deletions(-) diff --git a/pkg/drbd/drbdadm.go b/pkg/drbd/drbdadm.go index dff9e619..c8462b3d 100644 --- a/pkg/drbd/drbdadm.go +++ b/pkg/drbd/drbdadm.go @@ -504,6 +504,98 @@ func (a *Adm) NewCurrentUUID(ctx context.Context, resource string) error { return a.run(ctx, "new-current-uuid", resource) } +// MintLateAddSource makes THIS node an UpToDate source for every LOCAL +// Inconsistent volume of `resource`, so the Inconsistent peers can +// SyncTarget from it (BUG-048, the all-Inconsistent late-add wedge where +// NO replica ever won the volume's UpToDate seed). +// +// Why not `primary --force`: the existing late-add-promote ran +// `drbdadm primary --force `, but the kernel REJECTS it with +// "(-16) Disk state is lower than outdated" whenever ANY volume of the +// resource is Inconsistent (stand-observed: the promote retried every +// throttle window and never took, the volume stayed Inconsistent on all +// replicas forever). `primary` is also resource-wide, so a single +// Inconsistent late volume blocks promoting the whole resource even though +// the sibling volumes are UpToDate. +// +// Instead, mint the source per-volume the way DRBD's own skip-initial-sync +// does: `drbdsetup new-current-uuid --clear-bitmap ` marks the +// volume UpToDate with a clean bitmap. That call requires the volume's +// peers to be StandAlone, so the sequence is: +// +// 1. `drbdadm disconnect ` — quiesce every peer (resource-wide; DRBD +// carries all volumes over one connection per peer). UpToDate sibling +// volumes (vol-0/vol-1) are unaffected: on reconnect they re-handshake +// at their shared UUID and stay Established (no resync). +// 2. for each LOCAL Inconsistent volume: `new-current-uuid --clear-bitmap +// ` → that volume flips UpToDate with a clean bitmap. +// 3. `drbdadm adjust ` — reconnect every peer. The peers, still +// Inconsistent, now see an UpToDate source and SyncTarget from it. +// +// Data-safety: the late-added volume is genuinely fresh/empty on every +// replica (day0 lineage; this gate fires only when NO peer holds committed +// data — the caller's NeedsLateAddPromote enforces that), so minting an +// empty UpToDate source loses nothing — the Inconsistent peers were equally +// empty and simply adopt this clean copy. Gated by NeedsLateAddPromote +// (lowest-id node, no peer data, no Primary), so exactly one deterministic +// node mints the source and no split-brain race is possible. +// +// Returns the cleared minors (for logging) and the first error. A disconnect +// failure is best-effort-ignored (a peer already StandAlone is fine); a +// clear-bitmap or adjust failure surfaces. +func (a *Adm) MintLateAddSource(ctx context.Context, resource string) ([]int32, error) { + out, err := a.exec.Run(ctx, "drbdsetup", "status", resource, "--json") + if err != nil { + return nil, errors.Wrapf(err, "drbdsetup status %s", resource) + } + + var status drbdsetupStatusRoot + + err = json.Unmarshal(out, &status) + if err != nil || len(status) == 0 { + return nil, errors.Wrapf(err, "parse drbdsetup status %s", resource) + } + + var minors []int32 + + for _, dev := range status[0].Devices { + if DiskState(dev.DiskState) == DiskStateInconsistent { + minors = append(minors, dev.Minor) + } + } + + if len(minors) == 0 { + return nil, nil + } + + // Best-effort disconnect of all peers — new-current-uuid --clear-bitmap + // requires StandAlone; a peer already StandAlone returns non-zero, which + // we ignore. + _, _ = a.exec.Run(ctx, "drbdadm", "disconnect", resource) + + for _, minor := range minors { + _, mintErr := a.exec.Run(ctx, "drbdsetup", "new-current-uuid", + "--clear-bitmap", strconv.Itoa(int(minor))) + if mintErr != nil { + // Reconnect before bailing so a half-applied mint does not + // leave the resource StandAlone. + _ = a.run(ctx, "adjust", resource) + + return minors, errors.Wrapf(mintErr, + "new-current-uuid --clear-bitmap %d", minor) + } + } + + // Reconnect every peer; the Inconsistent peers now SyncTarget from the + // freshly-UpToDate local volumes. + err = a.run(ctx, "adjust", resource) + if err != nil { + return minors, errors.Wrapf(err, "adjust %s after mint", resource) + } + + return minors, nil +} + // SuspendIO runs `drbdadm suspend-io ` — freezes the // resource's block-I/O path on the local satellite so a backing // snapshot (LVM-thin / ZFS / file) captures bytes at a stable diff --git a/pkg/drbd/drbdsetup_show.go b/pkg/drbd/drbdsetup_show.go index b9743f17..a4b702f8 100644 --- a/pkg/drbd/drbdsetup_show.go +++ b/pkg/drbd/drbdsetup_show.go @@ -152,6 +152,11 @@ type drbdsetupStatusDevice struct { // DiskState is this node's local disk state for the volume // (`disk-state`): UpToDate / Inconsistent / Diskless / … DiskState string `json:"disk-state"` + // Minor is the kernel minor number backing this volume (`/dev/drbdN`). + // Used by the BUG-048 late-add source-mint (MintLateAddSource) to + // target `drbdsetup new-current-uuid --clear-bitmap ` at the + // specific stuck volume. + Minor int32 `json:"minor"` } // drbdsetupStatusConnection is one peer connection slot, as emitted by diff --git a/pkg/drbd/needs_late_add_promote_test.go b/pkg/drbd/needs_late_add_promote_test.go index 550fc421..68dd9c2d 100644 --- a/pkg/drbd/needs_late_add_promote_test.go +++ b/pkg/drbd/needs_late_add_promote_test.go @@ -19,6 +19,7 @@ limitations under the License. package drbd_test import ( + "slices" "testing" "github.com/cozystack/blockstor/pkg/drbd" @@ -338,3 +339,74 @@ func TestNeedsLateAddPromote_LowestPromotesDespiteHigherIDBringingUp(t *testing. t.Fatal("the lowest diskful node must promote even when a higher-id peer is still bringing up the volume") } } + +// MintLateAddSource clears the bitmap for ONLY the local Inconsistent +// volumes (per-minor), wrapped in disconnect → ... → adjust, and never the +// rejected resource-wide `primary --force`. +func TestMintLateAddSource_ClearsInconsistentMinorsOnly(t *testing.T) { + fx := storage.NewFakeExec() + fx.Responses["drbdsetup status pvc-mint --json"] = storage.FakeResponse{Stdout: []byte(`[{ + "name":"pvc-mint","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"minor":1000,"disk-state":"UpToDate"}, + {"volume":1,"minor":1001,"disk-state":"Inconsistent"}, + {"volume":2,"minor":1002,"disk-state":"Inconsistent"} + ], + "connections":[] + }]`)} + + adm := drbd.NewAdm(fx) + + minors, err := adm.MintLateAddSource(t.Context(), "pvc-mint") + if err != nil { + t.Fatalf("MintLateAddSource: %v", err) + } + if !slices.Equal(minors, []int32{1001, 1002}) { + t.Fatalf("expected cleared minors [1001 1002], got %v", minors) + } + + cmds := fx.CommandLines() + want := []string{ + "drbdadm disconnect pvc-mint", + "drbdsetup new-current-uuid --clear-bitmap 1001", + "drbdsetup new-current-uuid --clear-bitmap 1002", + "drbdadm adjust pvc-mint", + } + for _, w := range want { + if !slices.Contains(cmds, w) { + t.Errorf("expected command %q, got: %v", w, cmds) + } + } + // The UpToDate sibling (minor 1000) must NOT be cleared. + if slices.Contains(cmds, "drbdsetup new-current-uuid --clear-bitmap 1000") { + t.Errorf("must NOT clear-bitmap the UpToDate sibling minor 1000, got: %v", cmds) + } + // The kernel-rejected resource-wide primary --force must not appear. + if slices.Contains(cmds, "drbdadm primary --force pvc-mint") { + t.Errorf("must NOT use primary --force, got: %v", cmds) + } +} + +// No local Inconsistent volume → MintLateAddSource is a no-op (no +// disconnect, no clear-bitmap). +func TestMintLateAddSource_NoopWhenNothingInconsistent(t *testing.T) { + fx := storage.NewFakeExec() + fx.Responses["drbdsetup status pvc-mint --json"] = storage.FakeResponse{Stdout: []byte(`[{ + "name":"pvc-mint","node-id":0,"role":"Secondary", + "devices":[{"volume":0,"minor":1000,"disk-state":"UpToDate"}], + "connections":[] + }]`)} + + adm := drbd.NewAdm(fx) + + minors, err := adm.MintLateAddSource(t.Context(), "pvc-mint") + if err != nil { + t.Fatalf("MintLateAddSource: %v", err) + } + if len(minors) != 0 { + t.Fatalf("expected no minors cleared, got %v", minors) + } + if slices.Contains(fx.CommandLines(), "drbdadm disconnect pvc-mint") { + t.Errorf("must NOT disconnect when nothing is Inconsistent, got: %v", fx.CommandLines()) + } +} diff --git a/pkg/satellite/maybe_late_add_promote_test.go b/pkg/satellite/maybe_late_add_promote_test.go index e22a3839..76b29438 100644 --- a/pkg/satellite/maybe_late_add_promote_test.go +++ b/pkg/satellite/maybe_late_add_promote_test.go @@ -42,9 +42,9 @@ const lateAddStatusKey = "drbdsetup status pvc-la --json" const lateAddStatusWedged = `[{ "name":"pvc-la","node-id":0,"role":"Secondary", "devices":[ - {"volume":0,"disk-state":"UpToDate"}, - {"volume":1,"disk-state":"UpToDate"}, - {"volume":2,"disk-state":"Inconsistent"} + {"volume":0,"minor":1000,"disk-state":"UpToDate"}, + {"volume":1,"minor":1001,"disk-state":"UpToDate"}, + {"volume":2,"minor":1002,"disk-state":"Inconsistent"} ], "connections":[{ "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", @@ -72,8 +72,9 @@ func lateAddDR() *intent.DesiredResource { } } -// The wedge fires the force-primary (and the demote after) even though -// no auto-primary flag was ever stamped — the whole point of the gate. +// The wedge mints an UpToDate source via disconnect + per-volume +// new-current-uuid --clear-bitmap + reconnect (NOT primary --force, which +// the kernel rejects while a volume is Inconsistent). func TestMaybeLateAddPromote_FiresForWedgedLateVolume(t *testing.T) { fx := storage.NewFakeExec() fx.Expect(lateAddStatusKey, storage.FakeResponse{Stdout: []byte(lateAddStatusWedged)}) @@ -85,11 +86,23 @@ func TestMaybeLateAddPromote_FiresForWedgedLateVolume(t *testing.T) { } cmds := fx.CommandLines() - if !slices.Contains(cmds, "drbdadm primary --force pvc-la") { - t.Errorf("expected `drbdadm primary --force pvc-la`, got: %v", cmds) + if !slices.Contains(cmds, "drbdadm disconnect pvc-la") { + t.Errorf("expected `drbdadm disconnect pvc-la`, got: %v", cmds) } - if !slices.Contains(cmds, "drbdadm secondary pvc-la") { - t.Errorf("expected `drbdadm secondary pvc-la` (demote after promote), got: %v", cmds) + if !slices.Contains(cmds, "drbdsetup new-current-uuid --clear-bitmap 1002") { + t.Errorf("expected `drbdsetup new-current-uuid --clear-bitmap 1002` (the Inconsistent vol-2 minor), got: %v", cmds) + } + if !slices.Contains(cmds, "drbdadm adjust pvc-la") { + t.Errorf("expected `drbdadm adjust pvc-la` (reconnect after mint), got: %v", cmds) + } + // Must NOT touch the UpToDate sibling volumes' minors. + if slices.Contains(cmds, "drbdsetup new-current-uuid --clear-bitmap 1000") || + slices.Contains(cmds, "drbdsetup new-current-uuid --clear-bitmap 1001") { + t.Errorf("must NOT clear-bitmap UpToDate sibling volumes, got: %v", cmds) + } + // The rejected resource-wide primary --force is gone. + if slices.Contains(cmds, "drbdadm primary --force pvc-la") { + t.Errorf("must NOT use the kernel-rejected `primary --force` on the late-add path, got: %v", cmds) } } @@ -104,8 +117,8 @@ func TestMaybeLateAddPromote_SkipsDiskless(t *testing.T) { t.Fatalf("maybeLateAddPromote: %v", err) } - if slices.Contains(fx.CommandLines(), "drbdadm primary --force pvc-la") { - t.Errorf("diskless replica must NOT be promoted, got: %v", fx.CommandLines()) + if slices.Contains(fx.CommandLines(), "drbdadm disconnect pvc-la") { + t.Errorf("diskless replica must NOT mint a source, got: %v", fx.CommandLines()) } } @@ -134,7 +147,7 @@ func TestMaybeLateAddPromote_SkipsWhenPeerHasData(t *testing.T) { t.Fatalf("maybeLateAddPromote: %v", err) } - if slices.Contains(fx.CommandLines(), "drbdadm primary --force pvc-la") { - t.Errorf("must NOT force-primary when a peer holds data, got: %v", fx.CommandLines()) + if slices.Contains(fx.CommandLines(), "drbdadm disconnect pvc-la") { + t.Errorf("must NOT mint a source when a peer holds data, got: %v", fx.CommandLines()) } } diff --git a/pkg/satellite/reconciler.go b/pkg/satellite/reconciler.go index abbf527c..8d56cc52 100644 --- a/pkg/satellite/reconciler.go +++ b/pkg/satellite/reconciler.go @@ -2736,17 +2736,26 @@ func (r *Reconciler) maybeRecoveryPromote(ctx context.Context, dr *intent.Desire // when a local diskful volume is Inconsistent, NO connected peer holds // committed data for it (so there is no real SyncSource to wait for), // no replica is Primary, and this node is the deterministic lowest -// node-id — so exactly one node force-primaries and it is data-safe -// (every replica shares the volume's day0 current-UUID; primary --force -// mints no unrelated UUID, and the Inconsistent peers SyncTarget from -// this now-Primary source). Self-limiting: once the peers converge the -// predicate stops holding. +// node-id — so exactly one node mints the source and it is data-safe +// (every replica shares the volume's day0 current-UUID and is equally +// empty, so the minted UpToDate copy loses nothing; the Inconsistent peers +// SyncTarget from it). Self-limiting: once the peers converge the predicate +// stops holding. +// +// How the source is minted: NOT `drbdadm primary --force`. The kernel +// REJECTS that with "(-16) Disk state is lower than outdated" whenever any +// volume of the resource is Inconsistent (stand-observed: the old promote +// retried every throttle window and never took — the wedge persisted on all +// replicas). Instead MintLateAddSource disconnects, runs the per-volume +// `drbdsetup new-current-uuid --clear-bitmap ` (the same skip-init +// path DRBD uses to mark a fresh volume UpToDate), and reconnects — so the +// Inconsistent peers SyncTarget from the freshly-UpToDate local volumes. // // NOT gated on autoPromote/autoPrimaryReplica by design — those are // exactly what is missing on the late-add path. It IS gated on the same // recoveryPromoteThrottle so a still-converging resync isn't churned by -// repeated promotes. Skipped on diskless replicas (no disk to promote) -// and when Adm is unwired (storage-only unit tests). +// repeated mints. Skipped on diskless replicas (no disk to mint) and when +// Adm is unwired (storage-only unit tests). func (r *Reconciler) maybeLateAddPromote(ctx context.Context, dr *intent.DesiredResource, diskless bool) error { if diskless || r.cfg.Adm == nil { return nil @@ -2760,10 +2769,18 @@ func (r *Reconciler) maybeLateAddPromote(ctx context.Context, dr *intent.Desired return nil } - log.FromContext(ctx).Info("BUG-048 late-add-promote: force-primary to seed SyncSource for late-added volume wedged Inconsistent on every replica", + log.FromContext(ctx).Info("BUG-048 late-add-promote: mint UpToDate source (disconnect + new-current-uuid --clear-bitmap + reconnect) for a late-added volume wedged Inconsistent on every replica", "resource", dr.GetName()) - return r.runAutoPromote(ctx, dr) + minors, err := r.cfg.Adm.MintLateAddSource(ctx, dr.GetName()) + if err != nil { + return errors.Wrapf(err, "late-add-promote mint source %s", dr.GetName()) + } + + log.FromContext(ctx).Info("BUG-048 late-add-promote: minted UpToDate source", + "resource", dr.GetName(), "minors", minors) + + return nil } // maybeKickLateAddResync is the BUG-048 self-heal for a late-added volume From e743c3ab85f761fd05ad738b55596d6b00d05345 Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Mon, 15 Jun 2026 12:16:01 +0300 Subject: [PATCH 19/19] fix(store): make auto-numbered VD create idempotent under retry (BUG-048 de-regress) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CreateAutoNumbered's post-commit verification (verifyVolumeLanded) re-reads the RD to confirm the just-written VolumeNumber landed and forces a retry on a synthetic Conflict when it is absent. That guard is only sound when the re-read is LIVE: with no uncached reader wired it falls back to the informer cache, which trails the Update just committed, so it routinely fails to observe its own freshly written volume, surfaces a FALSE conflict, and the retry re-derives the next free number off the lagging cache and appends a SECOND volume. A single auto-numbered create could leave several phantom VolumeDefinitions behind — `vd d 0` then leaves a stray, and a late `vd c` lands 4 VDs instead of 1. Skip the post-commit verification when no live reader is available: a successful optimistic-locked Update is already the apiserver's authoritative confirmation, and there is nothing trustworthy to verify against a stale cache. Production wires mgr.GetAPIReader(), so the live verification path — the actual BUG-048 lost-update guard — is preserved untouched. Also align the integration harness store wiring with cmd/apiserver/main.go (NewWithAPIReader + mgr.GetAPIReader()). The harness used the plain New (no live reader), so CreateAutoNumbered's cache-lag re-read 409-stormed against the RD reconciler's concurrent RD writes; the slow create then made the linstor client re-POST, producing duplicate auto-numbered VDs. Production never hit this because the apiserver always wires the direct reader. The store-layer BUG-048 concurrency guarantee is unchanged: concurrent auto-numbered creates still converge to distinct VolumeNumbers via the optimistic-lock 409 retry (TestK8sVolumeDefinitionConcurrentAutoNumber), and the live post-commit verification still guards the heavy-load lost-update on the production path. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- pkg/store/k8s/volume_definitions.go | 20 ++++++++++++++++++++ tests/integration/harness/manager.go | 12 +++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/pkg/store/k8s/volume_definitions.go b/pkg/store/k8s/volume_definitions.go index 0a67cda0..0ba15f79 100644 --- a/pkg/store/k8s/volume_definitions.go +++ b/pkg/store/k8s/volume_definitions.go @@ -183,6 +183,26 @@ func (s *volumeDefinitions) CreateAutoNumbered(ctx context.Context, rdName strin // allocate+append re-runs against the now-correct state. This // makes the silent drop impossible: the create either persists or // retries — it never returns success having lost the volume. + // + // Crucially this only holds when the verifying read is LIVE. With + // no uncached reader wired (apiReader == nil) the re-read falls + // back to the informer cache, which trails the Update we just + // committed — so it routinely fails to observe our own freshly + // written volume and surfaces a FALSE synthetic Conflict. The + // retry then re-derives the next free number off the same lagging + // cache and appends a SECOND volume, so a single auto-numbered + // create can leave several phantom VolumeDefinitions behind + // (BUG-048 de-regress). A successful optimistic-locked Update is + // already the apiserver's authoritative confirmation; without a + // live reader there is nothing trustworthy to verify against, so + // skip the check rather than second-guess a committed write + // against a stale cache. Production wires mgr.GetAPIReader(), so + // the live verification path is preserved where it actually + // guards the lost-update. + if s.apiReader == nil { + return nil + } + return s.verifyVolumeLanded(ctx, rdName, assigned) }) if err != nil { diff --git a/tests/integration/harness/manager.go b/tests/integration/harness/manager.go index ce819853..93d4deac 100644 --- a/tests/integration/harness/manager.go +++ b/tests/integration/harness/manager.go @@ -105,7 +105,17 @@ func StartStack(t *testing.T) *Stack { t.Fatalf("build manager: %v", err) } - st := storek8s.New(mgr.GetClient()) + // Mirror cmd/apiserver/main.go: the REST-serving store is built + // with the manager's direct (uncached) API reader. CreateAutoNumbered's + // retry loop reads the parent RD through this reader so a conflict- + // retry observes the just-committed VolumeDefinition rather than a + // stale informer-cache revision. Without it (the plain New) the + // cache-lag re-read re-derives the same hole, 409-storms against the + // RD reconciler's concurrent writes, and the slow create makes the + // linstor client re-POST — leaving duplicate auto-numbered VDs + // (BUG-048 de-regress). Production never hit this because the apiserver + // always wires GetAPIReader(); the harness must match. + st := storek8s.NewWithAPIReader(mgr.GetClient(), mgr.GetAPIReader()) // Wire every reconciler / runnable cmd/controller/main.go // registers. Mirror order exactly so a future split-or-merge