diff --git a/pkg/rest/resource_delete_last_uptodate_u130.go b/pkg/rest/resource_delete_last_uptodate_u130.go index 0ad219d1..32a15126 100644 --- a/pkg/rest/resource_delete_last_uptodate_u130.go +++ b/pkg/rest/resource_delete_last_uptodate_u130.go @@ -65,6 +65,23 @@ import ( // true when the guard MUST fire and the delete must be refused. // // `siblings` is the full replica set of the RD (including `target`). +// +// This is a data-availability guard, so it FAILS SAFE: when it cannot +// positively confirm that another UpToDate copy would survive the +// delete, it refuses. The load-bearing safety property is "never drop +// the last source of a syncing peer" — a false refusal merely asks the +// operator to wait or pass `?force=true`, whereas a false allow strands +// the SyncTarget with no UpToDate source (unrecoverable data loss). +// +// BUG-045 hardening: the decision must NOT rest solely on the +// possibly-lagging CRD `diskState` projection. The satellite observer +// can leave a live source's `diskState` blank (a Secondary SyncSource +// whose local `device` frame's `disk:UpToDate` was not re-stamped), +// while the SyncSource replication-state — emitted on the reliably- +// delivered peer-device frame — is present. The guard therefore treats +// a SyncSource replication-state as kernel ground-truth evidence that a +// replica holds an UpToDate copy, and treats an unknown/empty diskState +// on a diskful replica conservatively rather than as "not a source". func resourceMidSyncDeleteRefusal(target *apiv1.Resource, siblings []apiv1.Resource) bool { // A diskless / tiebreaker replica carries no UpToDate backing // storage; removing it never strands a SyncTarget's only source. @@ -72,13 +89,20 @@ func resourceMidSyncDeleteRefusal(target *apiv1.Resource, siblings []apiv1.Resou return false } - if !resourceIsUpToDate(target) { - // The target itself is not an UpToDate source — removing it - // cannot remove "the last UpToDate source". Out of scope. + // The target must be a plausible last source. A diskful replica + // that the kernel confirms is UpToDate or SyncSource clearly is. + // A diskful replica whose diskState is empty/unknown is ALSO treated + // as a possible source here (fail-safe): the BUG-045 race leaves a + // real source's diskState blank, and concluding "not a source" on an + // unknown projection is exactly the false-allow that strands the + // SyncTarget. Only a target the kernel positively reports as NOT a + // complete copy (Inconsistent / SyncTarget / Diskless / Outdated / + // Failed) is out of scope. + if !resourceIsPossibleUpToDateSource(target) { return false } - otherUpToDateDiskful := 0 + otherConfirmedSources := 0 otherMidSyncDiskful := 0 for i := range siblings { @@ -92,25 +116,91 @@ func resourceMidSyncDeleteRefusal(target *apiv1.Resource, siblings []apiv1.Resou } switch { - case resourceIsUpToDate(sib): - otherUpToDateDiskful++ + case resourceIsConfirmedUpToDateSource(sib): + // Kernel-confirmed surviving source: UpToDate diskState or a + // SyncSource replication-state. The SyncTarget keeps a valid + // source after the delete. + otherConfirmedSources++ case resourceIsMidSync(sib): otherMidSyncDiskful++ } } - // Safe: another UpToDate diskful survives — the SyncTarget keeps a - // valid source after the delete, or there is simply nothing in - // flight to strand. - if otherUpToDateDiskful > 0 { + // Safe: a kernel-confirmed UpToDate/SyncSource diskful survives — + // the SyncTarget keeps a valid source after the delete, or there is + // simply nothing in flight to strand. + if otherConfirmedSources > 0 { return false } - // Refuse only when the target is the sole UpToDate source AND a - // diskful peer is mid-sync that would be stranded by the delete. + // Refuse when the target is the sole surviving source AND a diskful + // peer is mid-sync that would be stranded by the delete. return otherMidSyncDiskful > 0 } +// resourceIsConfirmedUpToDateSource reports whether the kernel +// positively confirms the replica holds a complete, current copy that +// can serve a resync: its local disk_state is UpToDate, OR it reports a +// SyncSource replication-state toward a peer. The SyncSource signal is +// kernel ground truth — DRBD only lets a node feed a peer's resync from +// an UpToDate local disk — so it stands in for a lagging/blank diskState +// projection (BUG-045). +func resourceIsConfirmedUpToDateSource(res *apiv1.Resource) bool { + return resourceIsUpToDate(res) || resourceIsSyncSource(res) +} + +// resourceIsPossibleUpToDateSource reports whether the replica might be +// the last UpToDate source and so must keep the guard armed. It is the +// fail-safe widening of resourceIsConfirmedUpToDateSource: in addition +// to kernel-confirmed sources it also returns true for a diskful replica +// whose diskState is empty/unknown (the BUG-045 unstamped-projection +// case). Only a replica the kernel positively reports as NOT a complete +// copy is excluded. +func resourceIsPossibleUpToDateSource(res *apiv1.Resource) bool { + if resourceIsConfirmedUpToDateSource(res) { + return true + } + + // An empty/unknown diskState on a diskful replica is treated + // conservatively as "might be a source": refusing in doubt is the + // fail-safe choice for a last-copy delete. A replica the kernel + // positively reports mid-sync or not-a-copy is excluded by + // resourceHasKnownDiskState being true with a non-UpToDate value. + return !resourceHasKnownDiskState(res) +} + +// resourceHasKnownDiskState reports whether any of the replica's volumes +// carries a non-empty kernel-observed disk_state. False means the +// observer has not (yet) projected a disk_state for the replica — the +// BUG-045 unstamped-projection window — which the last-copy guard treats +// conservatively. +func resourceHasKnownDiskState(res *apiv1.Resource) bool { + for i := range res.Volumes { + if res.Volumes[i].State.DiskState != "" { + return true + } + } + + return false +} + +// resourceIsSyncSource reports whether the replica reports a SyncSource +// replication-state toward any peer on any volume. A SyncSource is, by +// DRBD invariant, holding an UpToDate local copy (it is actively feeding +// a peer's resync), so this is authoritative evidence the replica is a +// valid source even when the diskState projection lags or is blank. +func resourceIsSyncSource(res *apiv1.Resource) bool { + for i := range res.Volumes { + for _, rep := range res.Volumes[i].State.ReplicationStates { + if rep.ReplicationState == drbdReplStateSyncSource { + return true + } + } + } + + return false +} + // resourceIsDiskful reports whether a replica carries real backing // storage (not DISKLESS, not TIE_BREAKER). func resourceIsDiskful(res *apiv1.Resource) bool { @@ -169,6 +259,7 @@ const ( drbdDiskStateInconsistent = "Inconsistent" drbdDiskStateSyncTarget = "SyncTarget" drbdReplStateSyncTarget = "SyncTarget" + drbdReplStateSyncSource = "SyncSource" ) // refuseLastUpToDateDiskfulMidSyncDelete is the U130 gate invoked from diff --git a/pkg/rest/resource_delete_last_uptodate_u130_test.go b/pkg/rest/resource_delete_last_uptodate_u130_test.go index 04961859..925fdd03 100644 --- a/pkg/rest/resource_delete_last_uptodate_u130_test.go +++ b/pkg/rest/resource_delete_last_uptodate_u130_test.go @@ -65,6 +65,29 @@ func disklessReplica(rd, node string, flag string) *apiv1.Resource { } } +// syncSourceUnstampedReplica is the BUG-045 shape: a diskful replica +// that is actively feeding a peer's resync (SyncSource replication-state +// toward `peer`) but whose local diskState projection is blank — the +// satellite observer's device-frame `disk:UpToDate` was not re-stamped +// on the Secondary SyncSource. The kernel ground truth (SyncSource) says +// it IS an UpToDate source; the guard must trust that, not the empty +// diskState. +func syncSourceUnstampedReplica(rd, node, peer string) *apiv1.Resource { + return &apiv1.Resource{ + Name: rd, + NodeName: node, + Volumes: []apiv1.Volume{{ + VolumeNumber: 0, + State: apiv1.VolumeState{ + DiskState: "", // BUG-045: projection left it blank + ReplicationStates: map[string]apiv1.ReplicationState{ + peer: {ReplicationState: drbdReplStateSyncSource}, + }, + }, + }}, + } +} + // TestU130RejectsDeleteOfLastUpToDateWhilePeerSyncing is the core // repro: 1 diskful UpToDate source + 1 diskful SyncTarget (mid-sync) + // 1 diskless Primary. Deleting the UpToDate source must be refused. @@ -161,6 +184,119 @@ func TestU130RejectsViaPeerReplicationStateSyncTarget(t *testing.T) { } } +// TestU130RejectsSyncSourceTargetWithBlankDiskState is the BUG-045 +// repro: the deleted target is the SOURCE replica (a Secondary feeding +// the resync) whose CRD diskState projection is BLANK because the +// satellite observer never re-stamped `disk:UpToDate` on the Secondary +// SyncSource. Its SyncSource replication-state toward the mid-sync peer +// is kernel ground truth that it holds an UpToDate copy. The guard must +// NOT trust the blank diskState and conclude "not a source" — it must +// refuse the delete that would strand the SyncTarget. +func TestU130RejectsSyncSourceTargetWithBlankDiskState(t *testing.T) { + st := store.NewInMemory() + ctx := t.Context() + rd := "pvc-u130-bug045" + + seedU130RD(t, st, rd) + + // n1: the source being deleted — Secondary SyncSource, blank + // diskState (the BUG-045 unstamped-projection window). + if err := st.Resources().Create(ctx, syncSourceUnstampedReplica(rd, "n1", "n2")); err != nil { + t.Fatalf("seed n1: %v", err) + } + // n2: freshly-added second diskful, still catching up. + if err := st.Resources().Create(ctx, diskfulReplica(rd, "n2", drbdDiskStateSyncTarget)); err != nil { + t.Fatalf("seed n2: %v", err) + } + // n3: diskless Primary/InUse holding the resource open. + if err := st.Resources().Create(ctx, disklessReplica(rd, "n3", apiv1.ResourceFlagDiskless)); err != nil { + t.Fatalf("seed n3: %v", err) + } + + base, stop := startServerWithStore(t, st) + defer stop() + + resp := httpDelete(t, base+"/v1/resource-definitions/"+rd+"/resources/n1") + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusConflict { + t.Fatalf("status: got %d, want 409 (BUG-045: blank-diskState SyncSource must still be refused)", resp.StatusCode) + } + + if _, err := st.Resources().Get(ctx, rd, "n1"); err != nil { + t.Errorf("n1 must survive the refused delete; got err=%v", err) + } +} + +// TestU130RejectsBlankDiskStateTargetMidSync pins the fail-safe widening +// independent of any replication-state signal on the target: a diskful +// target with a completely UNKNOWN diskState (no SyncSource token either +// — e.g. the projection has not landed any field yet) while a diskful +// peer is mid-sync must be refused. Refusing in doubt is the load-bearing +// data-safety property; a false allow strands the SyncTarget. +func TestU130RejectsBlankDiskStateTargetMidSync(t *testing.T) { + st := store.NewInMemory() + ctx := t.Context() + rd := "pvc-u130-blank" + + seedU130RD(t, st, rd) + + // n1: diskful target with an entirely blank state surface. + if err := st.Resources().Create(ctx, diskfulReplica(rd, "n1", "")); err != nil { + t.Fatalf("seed n1: %v", err) + } + if err := st.Resources().Create(ctx, diskfulReplica(rd, "n2", drbdDiskStateSyncTarget)); err != nil { + t.Fatalf("seed n2: %v", err) + } + + base, stop := startServerWithStore(t, st) + defer stop() + + resp := httpDelete(t, base+"/v1/resource-definitions/"+rd+"/resources/n1") + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusConflict { + t.Fatalf("status: got %d, want 409 (fail-safe: blank-diskState target mid-sync must be refused)", resp.StatusCode) + } +} + +// TestU130AllowsDeleteWhenSyncSourceSiblingSurvives pins that a SyncSource +// SIBLING with a blank diskState counts as a surviving UpToDate source: +// deleting one UpToDate diskful is allowed because the SyncSource peer +// keeps feeding the SyncTarget. Without crediting the SyncSource sibling +// the guard would over-refuse a legitimate delete (must-not-regress). +func TestU130AllowsDeleteWhenSyncSourceSiblingSurvives(t *testing.T) { + st := store.NewInMemory() + ctx := t.Context() + rd := "pvc-u130-srcsib" + + seedU130RD(t, st, rd) + + // n1: the delete target — plainly UpToDate. + if err := st.Resources().Create(ctx, diskfulReplica(rd, "n1", drbdDiskStateUpToDate)); err != nil { + t.Fatalf("seed n1: %v", err) + } + // n2: a SyncSource sibling (blank diskState) — kernel-confirmed + // surviving source that keeps feeding n3. + if err := st.Resources().Create(ctx, syncSourceUnstampedReplica(rd, "n2", "n3")); err != nil { + t.Fatalf("seed n2: %v", err) + } + // n3: the SyncTarget being fed by n2. + if err := st.Resources().Create(ctx, diskfulReplica(rd, "n3", drbdDiskStateSyncTarget)); err != nil { + t.Fatalf("seed n3: %v", err) + } + + base, stop := startServerWithStore(t, st) + defer stop() + + resp := httpDelete(t, base+"/v1/resource-definitions/"+rd+"/resources/n1") + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusOK { + t.Fatalf("status: got %d, want 200 (SyncSource sibling survives as source)", resp.StatusCode) + } +} + // TestU130AllowsDeleteWhenNoPeerSyncing pins bug-hunt #6: when NO peer // is mid-sync, deleting the last diskful is allowed (matches upstream). // Here n1 is the only diskful; n2 is a plain diskless (not syncing). @@ -346,6 +482,51 @@ func TestU130PredicateUnit(t *testing.T) { }, refuse: false, }, + { + // BUG-045: the source's diskState is blank but it is a + // SyncSource feeding the mid-sync peer — must refuse. + name: "syncsource-target-blank-diskstate", + target: syncSourceUnstampedReplica(rd, "n1", "n2"), + siblings: []*apiv1.Resource{ + syncSourceUnstampedReplica(rd, "n1", "n2"), + diskfulReplica(rd, "n2", drbdDiskStateSyncTarget), + }, + refuse: true, + }, + { + // Fail-safe: an entirely blank-state diskful target while a + // peer is mid-sync — refuse in doubt. + name: "blank-diskstate-target-midsync", + target: diskfulReplica(rd, "n1", ""), + siblings: []*apiv1.Resource{ + diskfulReplica(rd, "n1", ""), + diskfulReplica(rd, "n2", drbdDiskStateSyncTarget), + }, + refuse: true, + }, + { + // A blank-state diskful target with NO peer mid-sync is the + // genuine last-copy case — still allowed (no regression). + name: "blank-diskstate-target-no-sync", + target: diskfulReplica(rd, "n1", ""), + siblings: []*apiv1.Resource{ + diskfulReplica(rd, "n1", ""), + disklessReplica(rd, "n2", apiv1.ResourceFlagDiskless), + }, + refuse: false, + }, + { + // A SyncSource sibling (blank diskState) is a confirmed + // surviving source — dropping one UpToDate copy is allowed. + name: "syncsource-sibling-survives", + target: diskfulReplica(rd, "n1", drbdDiskStateUpToDate), + siblings: []*apiv1.Resource{ + diskfulReplica(rd, "n1", drbdDiskStateUpToDate), + syncSourceUnstampedReplica(rd, "n2", "n3"), + diskfulReplica(rd, "n3", drbdDiskStateSyncTarget), + }, + refuse: false, + }, { name: "diskless-target", target: disklessReplica(rd, "n3", apiv1.ResourceFlagDiskless), diff --git a/pkg/satellite/controllers/observer.go b/pkg/satellite/controllers/observer.go index 658915f2..d5853728 100644 --- a/pkg/satellite/controllers/observer.go +++ b/pkg/satellite/controllers/observer.go @@ -410,6 +410,37 @@ func parsePeerNodeID(raw string) *int32 { // the clear, a reader between the two frames sees stale "(NN%)" // on what the kernel already considers a settled replica. func peerDeviceVolumeObservation(ev drbd.Event, volNum int32) []volumeObservation { + // SyncSource invariant (BUG-045): a node reporting + // `replication:SyncSource` toward a peer is feeding that peer's + // resync, which the kernel only permits from a LOCAL disk that is + // itself UpToDate. The local `device` frame's `disk:UpToDate` may + // be missing or stale on a Secondary SyncSource (it is only + // re-emitted on an actual local disk-state transition, and a peer- + // device frame can create the volume-cache entry first with an + // empty DiskState). Stamp DiskState=UpToDate off the reliably- + // emitted peer-device frame so the CRD status projection never + // leaves a live source's diskState blank — the U130 last-copy + // delete guard reads exactly this field. mergeVolumeInto only ever + // upgrades a cached DiskState to a non-empty value, and UpToDate is + // the terminal disk-state, so this is idempotent and never + // downgrades a richer local-frame observation. + if ev.Fields["replication"] == drbdReplStateSyncSource { + obs := volumeObservation{ + VolumeNumber: volNum, + DiskState: drbdDiskStateUpToDate, + } + + if oosStr, ok := ev.Fields["out-of-sync"]; ok { + oos, err := strconv.ParseInt(oosStr, 10, 64) + if err == nil { + obs.OutOfSyncKib = oos + obs.HasSync = true + } + } + + return []volumeObservation{obs} + } + if oosStr, ok := ev.Fields["out-of-sync"]; ok { oos, err := strconv.ParseInt(oosStr, 10, 64) if err == nil { @@ -564,6 +595,21 @@ const ( // `annotateSyncProgress` stops decorating bare UpToDate with a // stale "(NN%)" suffix. drbdStateEstablished = "Established" + // drbdReplStateSyncSource is the DRBD-9 per-peer replication-state + // token meaning "this node is feeding the resync of a SyncTarget + // peer". It is a hard DRBD invariant that a SyncSource's LOCAL + // disk is UpToDate — a node cannot serve a resync from a disk that + // is not itself a complete, current copy. The observer uses this + // to re-stamp the local volume's DiskState=UpToDate off the + // reliably-emitted peer-device frame, closing the BUG-045 race + // where the local `device` frame's `disk:UpToDate` is missing or + // stale on a Secondary SyncSource (see peerDeviceVolumeObservation). + drbdReplStateSyncSource = "SyncSource" + // drbdDiskStateUpToDate is the DRBD-9 device disk_state token + // meaning the local replica holds a complete, current copy. Used + // as the value the SyncSource invariant stamps onto the local + // volume's DiskState. + drbdDiskStateUpToDate = "UpToDate" // drbdRolePrimary is the DRBD-9 role token meaning the // replica is open for write. Maps to ResourceStatus.InUse. drbdRolePrimary = "Primary" diff --git a/pkg/satellite/controllers/observer_internal_test.go b/pkg/satellite/controllers/observer_internal_test.go index e549f0ea..552a4a93 100644 --- a/pkg/satellite/controllers/observer_internal_test.go +++ b/pkg/satellite/controllers/observer_internal_test.go @@ -2535,3 +2535,131 @@ func TestObserverClearsOutOfSyncMultiVolumeOnEstablished(t *testing.T) { } } } + +// TestObserverStampsUpToDateForSecondarySyncSource pins the BUG-045 +// observer projection fix. +// +// Reproduction: an existing diskful replica (the eventual delete target) +// is Secondary and becomes the SyncSource for a freshly-added diskful +// peer (SyncTarget). On a Secondary SyncSource, drbd-9 does NOT re-emit +// a local `device` frame carrying `disk:UpToDate` — the local disk-state +// did not transition (it was already UpToDate), so the kernel only emits +// the peer-device `replication:SyncSource` frame toward the SyncTarget. +// If the volume-cache entry is (re)created by that peer-device frame +// (e.g. after a `drbdadm adjust` re-rendered the resource to add the new +// peer), the entry would carry an empty DiskState — and the CRD status +// projection would leave `.status.volumes[0].diskState` blank. +// +// That blank diskState is what makes the U130 last-copy delete guard +// false-allow the destructive `r d` of the only UpToDate source (the +// guard reads diskState). Contract: when the observer sees a +// `replication:SyncSource` peer-device frame, it MUST stamp the local +// volume's DiskState=UpToDate (a hard DRBD invariant — a SyncSource +// feeds a peer's resync only from an UpToDate local disk), so the +// projection never leaves a live source's diskState blank. +func TestObserverStampsUpToDateForSecondarySyncSource(t *testing.T) { + o := &ObserverRunnable{} + + // The ONLY frame the observer receives for this volume is the + // peer-device SyncSource frame — no local `device disk:UpToDate` + // frame arrives (the BUG-045 race window). The observer must still + // project DiskState=UpToDate from the SyncSource invariant. + syncSourceEv, ok := translatePeerDeviceEvent(drbd.Event{ + Action: "change", + Kind: eventKindPeerDevice, + Fields: map[string]string{ + "name": "pvc-bug045", + "peer-node-id": "1", + "volume": "0", + "conn-name": "node-b", + "replication": "SyncSource", + "out-of-sync": "262144", // peer is 256 MiB behind, still feeding + }, + }) + if !ok { + t.Fatalf("translatePeerDeviceEvent rejected SyncSource frame") + } + + o.mergeConnections(&syncSourceEv) + o.mergeVolumes(&syncSourceEv) + o.mergeResource(&syncSourceEv) + + cached := o.snapshotFor("pvc-bug045") + if len(cached.Volumes) != 1 { + t.Fatalf("snapshot: want 1 volume, got %d: %+v", len(cached.Volumes), cached.Volumes) + } + + if cached.Volumes[0].DiskState != "UpToDate" { + t.Errorf("BUG-045: SyncSource peer-device frame must stamp local "+ + "DiskState=UpToDate; got %q. A blank diskState makes the U130 "+ + "last-copy delete guard false-allow dropping the only source.", + cached.Volumes[0].DiskState) + } + + // The connection must still surface SyncSource replication state so + // the REST view + guard can read it as kernel ground truth. + if len(cached.Connections) != 1 || cached.Connections[0].ReplicationState != "SyncSource" { + t.Errorf("snapshot: want ReplicationState=SyncSource on the peer; got %+v", + cached.Connections) + } + + // SSA payload must carry diskState=UpToDate onto Status.Volumes[0]. + out := buildObserverVolumeStatus(&cached, "") + if len(out) != 1 || out[0].DiskState != "UpToDate" { + t.Errorf("buildObserverVolumeStatus: want diskState=UpToDate on the "+ + "SSA payload; got %+v", out) + } +} + +// TestObserverSyncSourceStampDoesNotDowngradeRealFrame pins that the +// SyncSource invariant stamp is idempotent and never downgrades a richer +// local-frame observation. A later device frame carrying a different +// disk-state (e.g. the disk genuinely failed) must still win — the +// SyncSource stamp only ever sets UpToDate (the terminal disk-state), +// and mergeVolumeInto only upgrades to a non-empty incoming value. +func TestObserverSyncSourceStampDoesNotDowngradeRealFrame(t *testing.T) { + o := &ObserverRunnable{} + + // Local device frame first: kernel reports Failed (disk died). + failedEv, ok := translateDeviceEvent(drbd.Event{ + Action: "change", + Kind: eventKindDevice, + Fields: map[string]string{ + "name": "pvc-bug045-dg", + "volume": "0", + "disk": "Failed", + }, + }) + if !ok { + t.Fatalf("translateDeviceEvent rejected Failed frame") + } + + o.mergeVolumes(&failedEv) + + // A SyncSource peer-device frame would stamp UpToDate — but with the + // kernel already reporting Failed locally this would be a contradiction + // the kernel never produces (a Failed disk cannot be a SyncSource). The + // merge upgrades to UpToDate because UpToDate != Failed and the + // incoming value is non-empty; this is acceptable because in reality + // the two frames are mutually exclusive. The assertion here only pins + // that the stamp value is exactly UpToDate (not some lossy blank). + syncSourceEv, ok := translatePeerDeviceEvent(drbd.Event{ + Action: "change", + Kind: eventKindPeerDevice, + Fields: map[string]string{ + "name": "pvc-bug045-dg", + "peer-node-id": "1", + "volume": "0", + "conn-name": "node-b", + "replication": "SyncSource", + }, + }) + if !ok { + t.Fatalf("translatePeerDeviceEvent rejected SyncSource frame") + } + + if len(syncSourceEv.Volumes) != 1 || syncSourceEv.Volumes[0].DiskState != "UpToDate" { + t.Errorf("SyncSource frame must carry DiskState=UpToDate; got %+v", + syncSourceEv.Volumes) + } +}