Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
e63895e
fix(rest,store): close concurrent late vd-add lost-update race (BUG-048)
kvaps Jun 14, 2026
836ed9f
test(harness): concurrent late vd-add catchers for BUG-048 (L6 + L7)
kvaps Jun 14, 2026
89443d0
fix(satellite): seed late-added volume race-free, off MetadataCreated…
kvaps Jun 14, 2026
10394f7
fix(store,apiserver): read RD live (uncached) in CreateAutoNumbered (…
kvaps Jun 14, 2026
5f760a0
test(harness): widen concurrent late-vd convergence budget to 360s (B…
kvaps Jun 14, 2026
5f0a444
fix(satellite,drbd): late-add-promote self-heal for ≥3-replica wedge …
kvaps Jun 14, 2026
ab4ce73
test(satellite): pin maybeLateAddPromote self-heal wiring (BUG-048)
kvaps Jun 14, 2026
ec08de8
fix(store): verify volume landed after auto-numbered create (BUG-048)
kvaps Jun 14, 2026
ed53d00
test(harness): shorten BUG-048 replay name to fit DRBD resource-name …
kvaps Jun 14, 2026
e766d4d
fix(drbd): gate late-add-promote against day0 split-brain (BUG-048)
kvaps Jun 14, 2026
39f148b
test(harness): give BUG-048 replay 360s for the self-heal recovery path
kvaps Jun 14, 2026
035915d
fix(controller): optimistic-lock the RD volume-minor patch (BUG-048)
kvaps Jun 14, 2026
97fb699
fix(drbd): defer late-add-promote to a lower-id peer mid-bring-up (BU…
kvaps Jun 14, 2026
c4271d1
fix(satellite,drbd): kick stalled late-add resync to converge all rep…
kvaps Jun 14, 2026
3974401
test(harness): document the late-add resync-kick self-heal in the BUG…
kvaps Jun 14, 2026
74728dd
fix(satellite,drbd): recover stalled late-add resync via per-volume i…
kvaps Jun 14, 2026
5df2a68
fix(satellite,drbd): prevent late-add invalidate flap (one volume at …
kvaps Jun 14, 2026
477fc3e
fix(satellite,drbd): mint late-add source via clear-bitmap, not prima…
kvaps Jun 14, 2026
8850ae3
Merge remote-tracking branch 'origin/main' into HEAD
kvaps Jun 14, 2026
e743c3a
fix(store): make auto-numbered VD create idempotent under retry (BUG-…
kvaps Jun 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ Copy the closest existing YAML and fill:
- `teardown[]` — cleanup CLI invocations.
- `invariants[]` — currently only `no_orphans` is implemented.

Available `await.kind` values: `replica_count`, `disk_state`, `all_uptodate`, `replica_diskless`, `no_tiebreaker`, `sync_clean`, `resource_absent`, `rd_absent`, `vd_size_kib`, `pvc_capacity`, `pod_md5_invariant`. See the header comment of `tests/operator-harness/replay-runner.sh` for the contract.
Available `await.kind` values: `replica_count`, `disk_state`, `all_uptodate`, `replica_diskless`, `no_tiebreaker`, `sync_clean`, `resource_absent`, `rd_absent`, `vd_size_kib`, `vd_count`, `pvc_capacity`, `pod_md5_invariant`. See the header comment of `tests/operator-harness/replay-runner.sh` for the contract.

## Running the harness

Expand Down
9 changes: 8 additions & 1 deletion cmd/apiserver/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,14 @@ func main() {
// CRD-backed store is the only supported persistence layer
// post-Phase-11 — the apiserver/controller split made
// in-process state pointless across replicas.
st := storek8s.New(mgr.GetClient())
//
// BUG-048: pass the manager's direct (uncached) API reader so the
// atomic VolumeNumber allocation re-reads live RD state on each
// conflict-retry. With only the informer-cached client, two
// concurrent `vd c` against one RD both retry against a stale cache,
// re-derive the same number, exhaust the retry budget, and silently
// drop the second volume.
st := storek8s.NewWithAPIReader(mgr.GetClient(), mgr.GetAPIReader())

ready := newReadyState()

Expand Down
213 changes: 213 additions & 0 deletions internal/controller/bug_048_vd_drop_minor_patch_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
// SPDX-License-Identifier: Apache-2.0

/*
Copyright 2026 Cozystack contributors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controller

import (
"context"
"testing"

apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"

blockstoriov1alpha1 "github.com/cozystack/blockstor/api/v1alpha1"
)

// BUG-048 (P1, the silent VD drop). The ResourceReconciler's
// patchRDVolumeMinors writes the RD's per-volume DRBD minors with a JSON
// merge-patch. JSON merge-patch (RFC 7386) REPLACES the
// spec.volumeDefinitions array wholesale, so the write stamps the WHOLE
// list from whatever snapshot the reconciler read at the top of
// ensureRDVolumeMinors. If a concurrent number-less `linstor vd c`
// appended a new volume AFTER that read but BEFORE this patch, the
// wholesale replace silently drops the appended volume — and both `vd c`
// return success because the clobber is an async reconciler write, not
// their own create.
//
// The original code carried a comment claiming "optimistic concurrency"
// but never set metadata.resourceVersion, so the patch could NEVER 409
// and the appended volume vanished with no error anywhere — the
// operator-visible "vdCount short by one, both vd c rc0, zero
// conflict/retry in the logs" failure observed on the stand.
//
// The fix embeds metadata.resourceVersion in the merge-patch body so the
// apiserver rejects the stale wholesale replace with Conflict. These
// tests pin both halves of the contract:
//
// - a stale-snapshot patch racing an appended volume MUST be rejected
// with Conflict (and therefore NOT drop the appended volume), and
// - a non-racing patch (fresh snapshot) MUST still succeed and stamp
// the minors.

func bug048Scheme(t *testing.T) *runtime.Scheme {
t.Helper()

scheme := runtime.NewScheme()
if err := blockstoriov1alpha1.AddToScheme(scheme); err != nil {
t.Fatalf("AddToScheme: %v", err)
}

return scheme
}

func bug048RDVolNumbers(rd *blockstoriov1alpha1.ResourceDefinition) []int32 {
out := make([]int32, 0, len(rd.Spec.VolumeDefinitions))
for i := range rd.Spec.VolumeDefinitions {
out = append(out, rd.Spec.VolumeDefinitions[i].VolumeNumber)
}

return out
}

// TestBug048MinorPatchRejectsStaleVolumeDefinitionClobber is the core
// regression. We:
// 1. seed an RD with [vol-0], read it into the reconciler's snapshot,
// 2. simulate a concurrent `vd c` appending vol-1 to the LIVE RD
// (bumping its resourceVersion), then
// 3. call patchRDVolumeMinors with the STALE [vol-0] snapshot.
//
// Pre-fix (blind merge-patch, no resourceVersion) the patch silently
// succeeds and the live RD is clobbered back to [vol-0] — vol-1 is
// dropped. Post-fix the patch carries the snapshot's resourceVersion,
// the apiserver returns Conflict, and vol-1 survives.
func TestBug048MinorPatchRejectsStaleVolumeDefinitionClobber(t *testing.T) {
t.Parallel()

ctx := context.Background()
scheme := bug048Scheme(t)

const rdName = "drop-rd"

rd := &blockstoriov1alpha1.ResourceDefinition{
ObjectMeta: metav1.ObjectMeta{Name: rdName},
Spec: blockstoriov1alpha1.ResourceDefinitionSpec{
VolumeDefinitions: []blockstoriov1alpha1.ResourceDefinitionVolume{
{VolumeNumber: 0, SizeKib: 1024},
},
},
}

cli := fake.NewClientBuilder().WithScheme(scheme).
WithStatusSubresource(&blockstoriov1alpha1.ResourceDefinition{}).
WithObjects(rd).
Build()

rec := &ResourceReconciler{Client: cli, Scheme: scheme, APIReader: cli}

// (1) reconciler snapshot of the RD (carries the stale resourceVersion).
snap := &blockstoriov1alpha1.ResourceDefinition{}
if err := cli.Get(ctx, client.ObjectKey{Name: rdName}, snap); err != nil {
t.Fatalf("get RD snapshot: %v", err)
}
// Reconciler "allocates" a minor for vol-0 on its snapshot.
m0 := int32(1000)
snap.Spec.VolumeDefinitions[0].DRBDMinor = &m0

// (2) concurrent `vd c` appends vol-1 to the LIVE RD, bumping its RV.
live := &blockstoriov1alpha1.ResourceDefinition{}
if err := cli.Get(ctx, client.ObjectKey{Name: rdName}, live); err != nil {
t.Fatalf("get live RD: %v", err)
}

live.Spec.VolumeDefinitions = append(live.Spec.VolumeDefinitions,
blockstoriov1alpha1.ResourceDefinitionVolume{VolumeNumber: 1, SizeKib: 1024})
if err := cli.Update(ctx, live); err != nil {
t.Fatalf("append vol-1 (simulated vd c): %v", err)
}

// (3) reconciler patches minors from its STALE [vol-0] snapshot.
err := rec.patchRDVolumeMinors(ctx, snap)
if !apierrors.IsConflict(err) {
t.Fatalf("patchRDVolumeMinors with stale snapshot returned err=%v, want Conflict; "+
"a non-Conflict means the wholesale volumeDefinitions replace was NOT optimistic-locked "+
"and would silently drop the concurrently-appended vol-1 (BUG-048)", err)
}

// The appended volume MUST survive the rejected clobber.
got := &blockstoriov1alpha1.ResourceDefinition{}
if gerr := cli.Get(ctx, client.ObjectKey{Name: rdName}, got); gerr != nil {
t.Fatalf("get RD after rejected patch: %v", gerr)
}

nums := bug048RDVolNumbers(got)
if len(nums) != 2 || nums[0] != 0 || nums[1] != 1 {
t.Fatalf("RD volumeDefinitions after rejected clobber = %v, want [0 1] "+
"(vol-1 was silently dropped — the BUG-048 lost update)", nums)
}
}

// TestBug048MinorPatchSucceedsOnFreshSnapshot proves the optimistic-lock
// does not break the happy path: a patch built from a CURRENT snapshot
// (no racing writer) must still commit and stamp the minors.
func TestBug048MinorPatchSucceedsOnFreshSnapshot(t *testing.T) {
t.Parallel()

ctx := context.Background()
scheme := bug048Scheme(t)

const rdName = "fresh-rd"

rd := &blockstoriov1alpha1.ResourceDefinition{
ObjectMeta: metav1.ObjectMeta{Name: rdName},
Spec: blockstoriov1alpha1.ResourceDefinitionSpec{
VolumeDefinitions: []blockstoriov1alpha1.ResourceDefinitionVolume{
{VolumeNumber: 0, SizeKib: 1024},
{VolumeNumber: 1, SizeKib: 1024},
},
},
}

cli := fake.NewClientBuilder().WithScheme(scheme).
WithStatusSubresource(&blockstoriov1alpha1.ResourceDefinition{}).
WithObjects(rd).
Build()

rec := &ResourceReconciler{Client: cli, Scheme: scheme, APIReader: cli}

snap := &blockstoriov1alpha1.ResourceDefinition{}
if err := cli.Get(ctx, client.ObjectKey{Name: rdName}, snap); err != nil {
t.Fatalf("get RD snapshot: %v", err)
}

m0, m1 := int32(1000), int32(1001)
snap.Spec.VolumeDefinitions[0].DRBDMinor = &m0
snap.Spec.VolumeDefinitions[1].DRBDMinor = &m1

if err := rec.patchRDVolumeMinors(ctx, snap); err != nil {
t.Fatalf("patchRDVolumeMinors on fresh snapshot: %v", err)
}

got := &blockstoriov1alpha1.ResourceDefinition{}
if err := cli.Get(ctx, client.ObjectKey{Name: rdName}, got); err != nil {
t.Fatalf("get RD after patch: %v", err)
}

if nums := bug048RDVolNumbers(got); len(nums) != 2 || nums[0] != 0 || nums[1] != 1 {
t.Fatalf("RD volumeDefinitions after fresh patch = %v, want [0 1]", nums)
}

for i := range got.Spec.VolumeDefinitions {
if got.Spec.VolumeDefinitions[i].DRBDMinor == nil {
t.Fatalf("vol-%d minor not stamped after fresh patch",
got.Spec.VolumeDefinitions[i].VolumeNumber)
}
}
}
33 changes: 31 additions & 2 deletions internal/controller/resource_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1624,6 +1624,24 @@ func backfillRDMinorsFromStatus(rd *blockstoriov1alpha1.ResourceDefinition) bool
// satellite/REST owners of the other VolumeDefinition fields read them
// off Spec elsewhere, and we re-send the size/props verbatim to avoid
// dropping them.
//
// BUG-048 (P1, the silent VD drop): because a JSON merge-patch REPLACES
// the volumeDefinitions array (RFC 7386 has no array-merge), this write
// stamps the WHOLE list from the in-memory snapshot the reconciler read
// at the top of ensureRDVolumeMinors. If a concurrent number-less
// `linstor vd c` appended a new volume AFTER that read but before this
// patch, the wholesale replace silently drops it — and both `vd c`
// return success because the clobber is an async reconciler write, not
// their own. The original code claimed "optimistic concurrency" in a
// comment but never actually set metadata.resourceVersion, so the patch
// could NEVER 409 and the IsConflict re-read branch in the caller was
// dead. We now embed metadata.resourceVersion in the merge-patch body
// (exactly what client.MergeFromWithOptimisticLock does for typed
// patches): the apiserver rejects the patch with Conflict when a racing
// VD append moved the RD on, the caller re-reads, and the next reconcile
// re-allocates minors against the now-complete list — the appended
// volume survives. A missing resourceVersion (RD freshly built in a unit
// path) degrades to the prior blind patch rather than failing the write.
func (r *ResourceReconciler) patchRDVolumeMinors(ctx context.Context, rd *blockstoriov1alpha1.ResourceDefinition) error {
vols := make([]map[string]any, 0, len(rd.Spec.VolumeDefinitions))
for i := range rd.Spec.VolumeDefinitions {
Expand All @@ -1645,15 +1663,26 @@ func (r *ResourceReconciler) patchRDVolumeMinors(ctx context.Context, rd *blocks
vols = append(vols, entry)
}

meta := map[string]any{}
if rv := rd.ResourceVersion; rv != "" {
// Optimistic-lock precondition: the apiserver enforces
// metadata.resourceVersion on any patch body, so a racing VD
// append (which bumped the RD past this snapshot) makes the
// wholesale volumeDefinitions replace 409 instead of clobbering
// the appended volume.
meta["resourceVersion"] = rv
}

body := map[string]any{patchKeySpec: map[string]any{"volumeDefinitions": vols}}
if len(meta) > 0 {
body["metadata"] = meta
}

patchBytes, err := json.Marshal(body)
if err != nil {
return err
}

// Optimistic concurrency: include resourceVersion so a racing
// minor write is rejected with Conflict and we re-read.
patchTarget := &blockstoriov1alpha1.ResourceDefinition{
TypeMeta: metav1.TypeMeta{
Kind: resourceDefinitionKindName,
Expand Down
Loading