physercoe · physercoe · Jun 14, 2026 · Jun 14, 2026
diff --git a/docs/decisions/045-hub-storage-scaling.md b/docs/decisions/045-hub-storage-scaling.md
@@ -8,9 +8,10 @@
 > [`discussions/hub-scaling-storage-and-concurrency.md`](../discussions/hub-scaling-storage-and-concurrency.md)
 > and [`discussions/hub-store-separation-and-fold-policy.md`](../discussions/hub-store-separation-and-fold-policy.md).
 > **Amends [ADR-038](038-per-run-event-digest.md) §2** (synchronous
-> digest fold → bounded-staleness deferred fold). D1 + D2 are shipped (D2
-> via the P1 class split and the P2 per-team shard — see the
-> [plan](../plans/hub-storage-scaling.md)); D3 is decided, not yet built.
+> digest fold → bounded-staleness deferred fold). D1 + D2 + D4 are
+> shipped (D2 via the P1 class split and the P2 per-team shard — see the
+> [plan](../plans/hub-storage-scaling.md); D4 the storage-maintenance
+> loop); D3 is decided, not yet built.
 > **Audience:** contributors
 > **Last verified vs code:** v1.0.807-alpha
 
@@ -144,6 +145,61 @@ independent of the DB choice. The D2 split is what makes D3 affordable —
 the backend choice is per-store, so a deployment can move one store
 (e.g. control → Postgres) without touching the others.
 
+### D4 — automated WAL-checkpoint + incremental reclamation (amends D2)
+
+The D2 split leaves two storage-hygiene gaps that bit an operator (#79):
+**WAL files grow unbounded** and **freed pages are never returned to the
+OS**. They are *different* mechanisms and are addressed separately — a
+common confusion that conflates checkpointing with `VACUUM`.
+
+- **WAL growth is a reader-pinning problem.** SQLite's auto-checkpoint
+  (default 1000 pages) can only reset the WAL up to the *oldest live
+  reader's snapshot*. The hub holds long-lived **SSE readers**
+  (Activity / transcript streams), so a continuous reader + continuous
+  firehose write keeps the checkpoint from ever reaching the WAL head —
+  it grows without bound. The fix is a periodic
+  `PRAGMA wal_checkpoint(TRUNCATE)` run by the hub, not a vacuum.
+
+- **Reclamation uses `auto_vacuum=INCREMENTAL`, not full `VACUUM`.** A
+  full `VACUUM` rewrites the whole file: it needs ~2× the DB size in
+  free disk (an ENOSPC risk on a 2 GB VPS), takes a global write lock
+  for an O(DB-size) duration (stop-the-world, *per shard*), and fights
+  the very SSE readers above. Incremental auto-vacuum instead returns
+  free pages to the OS in **bounded chunks**, each a short transaction
+  that interleaves with readers — the same pattern always-on embedded
+  SQLite uses (Chromium, Firefox, Android). Its two costs are immaterial
+  here: the per-commit pointer-map overhead is negligible against the
+  measured fold cost, and its no-defragment limitation barely bites an
+  **append-mostly** firehose (near-sequential inserts ⇒ low
+  fragmentation). Blob-ref externalization (the D2 prerequisite) already
+  removes the large transient payloads that would otherwise strand free
+  pages, so the residual reclamation need is modest — incremental is
+  cheap insurance, not a hot path.
+
+**The policy:**
+
+1. **New event/digest shards are created `auto_vacuum=INCREMENTAL`** (set
+   on the schema-creating writer connection at first open — it must
+   precede the first table). `hub.db` (control) keeps freelist reuse: low
+   delete volume, and converting an existing file needs a full VACUUM.
+2. **A background maintenance loop** (same ctx lifetime as the other
+   sweeps) runs every `HUB_STORE_MAINTENANCE_INTERVAL` (default 5 m). Per
+   currently-open shard writer (`hub.db` + each open team's events/digest
+   writer) it: (a) `wal_checkpoint(TRUNCATE)`; (b) a bounded
+   `incremental_vacuum` **with hysteresis** — only when the freelist is
+   ≥25 % of the file *and* above an absolute floor, reclaiming down to a
+   watermark (not to zero) and capped per pass, so a still-active
+   firehose can't thrash returning pages it immediately re-allocates.
+   `incremental_vacuum` is a no-op where `auto_vacuum≠INCREMENTAL`, so it
+   is safe on `hub.db` and pre-D4 shards. Disable with
+   `HUB_STORE_MAINTENANCE_DISABLE`. Evicted teams are checkpointed on
+   pool close (SQLite checkpoints when the last connection closes), so a
+   cold team needs no loop coverage.
+3. **Full `VACUUM` stays operator-only and offline** — the existing
+   `hub-server db vacuum` (now also sets `auto_vacuum=INCREMENTAL` before
+   the rebuild, so it doubles as the one-time pre-D4-shard converter).
+   It is never run automatically.
+
 ## Consequences
 
 - **Digest freshness becomes visible by design** (D1): a long open turn

diff --git a/hub/cmd/hub-server/db_cmd.go b/hub/cmd/hub-server/db_cmd.go
@@ -213,14 +213,23 @@ func vacuumTeamStores(dataRoot string) (reclaimed int64, files int, err error) {
 	return reclaimed, files, nil
 }
 
-// vacuumFile runs a whole-database VACUUM on a single sqlite store file.
+// vacuumFile runs a whole-database VACUUM on a single per-team store file. It
+// first sets auto_vacuum=INCREMENTAL: the following VACUUM rewrites the file in
+// that mode, so this offline command doubles as the one-time converter for
+// pre-D4 shards (created before ADR-045 D4; new shards are born INCREMENTAL).
+// Changing auto_vacuum only takes effect via the subsequent VACUUM; on an
+// already-INCREMENTAL shard it is a no-op.
 func vacuumFile(path string) error {
 	db, err := sql.Open("sqlite", path+"?_pragma=busy_timeout(10000)")
 	if err != nil {
 		return err
 	}
 	defer db.Close()
-	_, err = db.ExecContext(context.Background(), "VACUUM")
+	ctx := context.Background()
+	if _, err = db.ExecContext(ctx, "PRAGMA auto_vacuum=INCREMENTAL"); err != nil {
+		return err
+	}
+	_, err = db.ExecContext(ctx, "VACUUM")
 	return err
 }
 

diff --git a/hub/internal/server/db.go b/hub/internal/server/db.go
@@ -30,6 +30,16 @@ import (
 //     (≥800 concurrent agents, internal/server/load_test.go), a wash below it.
 const pragmaCommon = "_pragma=journal_mode(WAL)&_pragma=synchronous(NORMAL)&_pragma=busy_timeout(5000)&_pragma=temp_store(2)&_pragma=mmap_size(268435456)"
 
+// pragmaStoreAutoVacuum requests INCREMENTAL auto-vacuum (mode 2) for the
+// per-team event/digest shards (ADR-045 D4). auto_vacuum is a database-level
+// property fixed when the first table is created, so it MUST ride on the
+// schema-creating writer connection at first open (openStorePool); on an
+// already-populated file the pragma is a harmless no-op (the mode is set, and
+// changing it would require a full VACUUM — that path is the operator-only
+// `hub-server db vacuum`). It is deliberately kept OFF hub.db (control), which
+// has low delete volume and keeps plain freelist reuse.
+const pragmaStoreAutoVacuum = "&_pragma=auto_vacuum(2)"
+
 // pragmaWriterCache is resolved once at startup. Default 64 MiB; the size (in
 // KiB) is operator-tunable per VPS via HUB_SQLITE_WRITER_CACHE_KB (also used to
 // sweep the value in load tests). Applied to the SINGLE global control writer

diff --git a/hub/internal/server/server.go b/hub/internal/server/server.go
@@ -298,6 +298,13 @@ func (s *Server) Serve(ctx context.Context) error {
 	if s.otlp != nil {
 		go s.runOTLPExport(ctx)
 	}
+	// Storage maintenance (ADR-045 D4): periodic wal_checkpoint(TRUNCATE) +
+	// bounded incremental_vacuum across open shards — bounds -wal growth (the
+	// SSE-reader-pinning hazard) and returns freed pages to the OS. Same ctx
+	// lifetime. Opt out with HUB_STORE_MAINTENANCE_DISABLE.
+	if os.Getenv("HUB_STORE_MAINTENANCE_DISABLE") == "" {
+		go s.runStoreMaintenance(ctx)
+	}
 
 	// SIGHUP → hot-reload policy.yaml. Lets an operator edit the file and
 	// signal the daemon without restarting and losing in-flight connections.

diff --git a/hub/internal/server/store_maintenance.go b/hub/internal/server/store_maintenance.go
@@ -0,0 +1,149 @@
+package server
+
+import (
+	"context"
+	"database/sql"
+	"fmt"
+	"os"
+	"time"
+)
+
+// store_maintenance.go — ADR-045 D4: automated WAL checkpointing + bounded
+// incremental reclamation across the per-team shards.
+//
+// Two distinct storage-hygiene problems, two distinct mechanisms (do not
+// conflate them):
+//
+//   - WAL growth. SQLite's auto-checkpoint can only reset the WAL up to the
+//     oldest LIVE reader's snapshot. The hub holds long-lived SSE readers, so a
+//     continuous reader + continuous firehose write keeps the checkpoint from
+//     ever reaching the WAL head and it grows without bound. Fixed by a periodic
+//     wal_checkpoint(TRUNCATE) — NOT by VACUUM.
+//   - Free pages never returned to the OS. After a retention/refold delete, freed
+//     pages sit on the freelist (reused by future inserts, but the file stays at
+//     its high-water mark). Returned to the OS by incremental auto-vacuum in
+//     bounded chunks — NOT by a full VACUUM, whose ~2× disk + global write lock
+//     for an O(DB-size) duration is hostile to a small always-on VPS.
+//
+// New shards are created auto_vacuum=INCREMENTAL (store_split.go / openStorePool),
+// so incremental_vacuum here actually returns pages; on hub.db and pre-D4 shards
+// (auto_vacuum=NONE) incremental_vacuum is a documented no-op, so the same pass
+// is safe everywhere. Full VACUUM stays the operator-only `hub-server db vacuum`.
+
+const (
+	// storeVacuumWatermarkPages is the freelist slack kept after a reclamation
+	// pass (~512 KiB at the 4 KiB default page size). Reclaiming to zero on a
+	// still-active firehose would just hand back pages the next insert
+	// re-allocates — this watermark is the hysteresis floor.
+	storeVacuumWatermarkPages = 128
+	// storeVacuumFreeFracPct gates a pass on the freelist being a meaningful
+	// fraction of the file, so a near-full file with a few free pages is left
+	// alone (its high-water mark IS its working set).
+	storeVacuumFreeFracPct = 25
+	// storeVacuumMaxPagesPerPass bounds how many pages one pass returns (~8 MiB),
+	// so the short incremental_vacuum transaction never holds the write lock long
+	// even on a store with a huge freelist; the next tick continues draining.
+	storeVacuumMaxPagesPerPass = 2048
+)
+
+// maintTarget is one store-file writer pool the maintenance loop operates on,
+// with a human label for logs.
+type maintTarget struct {
+	label string
+	db    *sql.DB
+}
+
+// storeMaintenanceInterval is the loop cadence, operator-tunable via
+// HUB_STORE_MAINTENANCE_INTERVAL (a Go duration, e.g. "2m", "10m").
+func storeMaintenanceInterval() time.Duration {
+	d := 5 * time.Minute
+	if v := os.Getenv("HUB_STORE_MAINTENANCE_INTERVAL"); v != "" {
+		if p, err := time.ParseDuration(v); err == nil && p > 0 {
+			d = p
+		}
+	}
+	return d
+}
+
+// runStoreMaintenance is the maintenance loop (ADR-045 D4). It runs until ctx is
+// cancelled. Started from Start() unless HUB_STORE_MAINTENANCE_DISABLE is set.
+func (s *Server) runStoreMaintenance(ctx context.Context) {
+	ticker := time.NewTicker(storeMaintenanceInterval())
+	defer ticker.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			s.maintainStores(ctx)
+		}
+	}
+}
+
+// maintainStores runs one maintenance pass over hub.db and every currently-open
+// team shard writer. Cold (unopened/evicted) teams are skipped on purpose: they
+// take no writes, and SQLite checkpoints a file when its last connection closes,
+// so an evicted team's WAL is already truncated.
+func (s *Server) maintainStores(ctx context.Context) {
+	if s.writeDB != nil {
+		s.maintainStore(ctx, s.writeDB, "hub.db")
+	}
+	if s.stores != nil {
+		for _, t := range s.stores.maintenanceTargets() {
+			select {
+			case <-ctx.Done():
+				return
+			default:
+			}
+			s.maintainStore(ctx, t.db, t.label)
+		}
+	}
+}
+
+// maintainStore checkpoints one store's WAL, then conditionally returns free
+// pages to the OS. All operations run on the store's single-connection WRITER
+// pool, so they serialize with the store's other writes rather than racing them.
+// Every failure is best-effort and logged at debug — a missed pass is retried on
+// the next tick, never an error to a caller.
+func (s *Server) maintainStore(ctx context.Context, db *sql.DB, label string) {
+	// 1. Truncate the WAL back into the main file. TRUNCATE blocks for
+	//    busy_timeout if a reader pins an old snapshot, then returns busy=1
+	//    (not an error) — we just try again next tick.
+	var busy, walFrames, checkpointed int
+	if err := db.QueryRowContext(ctx, `PRAGMA wal_checkpoint(TRUNCATE)`).
+		Scan(&busy, &walFrames, &checkpointed); err != nil {
+		s.log.Debug("store maintenance: checkpoint failed", "store", label, "err", err)
+	} else if busy != 0 {
+		s.log.Debug("store maintenance: checkpoint busy (reader pinned)", "store", label)
+	}
+
+	// 2. Reclaim free pages with hysteresis. incremental_vacuum is a no-op when
+	//    the store is not auto_vacuum=INCREMENTAL, so this is safe on hub.db and
+	//    pre-D4 shards (it simply returns nothing there).
+	var freelist, pageCount int
+	if err := db.QueryRowContext(ctx, `PRAGMA freelist_count`).Scan(&freelist); err != nil {
+		s.log.Debug("store maintenance: freelist_count failed", "store", label, "err", err)
+		return
+	}
+	if err := db.QueryRowContext(ctx, `PRAGMA page_count`).Scan(&pageCount); err != nil {
+		s.log.Debug("store maintenance: page_count failed", "store", label, "err", err)
+		return
+	}
+	if pageCount <= 0 || freelist <= storeVacuumWatermarkPages {
+		return
+	}
+	if freelist*100 < pageCount*storeVacuumFreeFracPct {
+		return // free space below the fraction gate — leave the high-water mark
+	}
+	n := freelist - storeVacuumWatermarkPages
+	if n > storeVacuumMaxPagesPerPass {
+		n = storeVacuumMaxPagesPerPass
+	}
+	if _, err := db.ExecContext(ctx, fmt.Sprintf(`PRAGMA incremental_vacuum(%d)`, n)); err != nil {
+		s.log.Debug("store maintenance: incremental_vacuum failed",
+			"store", label, "pages", n, "err", err)
+		return
+	}
+	s.log.Debug("store maintenance: reclaimed pages",
+		"store", label, "pages", n, "freelist_before", freelist, "page_count", pageCount)
+}