From 5ba2f2f00473adcc13a4d54f9c9a47b2c79e69cb Mon Sep 17 00:00:00 2001
From: Logan Nguyen <lg.131.dev@gmail.com>
Date: Wed, 10 Jun 2026 13:22:08 -0500
Subject: [PATCH 01/13] chore: ignore fetched engine binaries and model files

Signed-off-by: Logan Nguyen <lg.131.dev@gmail.com>
---
 .gitignore | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitignore b/.gitignore
index 0a114689..2a681a94 100644
--- a/.gitignore
+++ b/.gitignore
@@ -50,3 +50,7 @@ docs/superpowers/
 # SearXNG container dumps upstream defaults here on each start
 sandbox/search-box/searxng/settings.yml.new
 
+
+# Bundled inference engine artifacts (fetched by scripts/ensure-llama-server.ts)
+src-tauri/binaries/
+*.gguf

From eea89b86d91ee6b6993232504a5ba86afa0a9950 Mon Sep 17 00:00:00 2001
From: Logan Nguyen <lg.131.dev@gmail.com>
Date: Wed, 10 Jun 2026 13:32:43 -0500
Subject: [PATCH 02/13] feat: add idle_unload_minutes config field for the
 built-in engine

Signed-off-by: Logan Nguyen <lg.131.dev@gmail.com>
---
 docs/configurations.md                      |  5 ++
 src-tauri/src/config/defaults.rs            | 10 ++++
 src-tauri/src/config/loader.rs              | 20 ++++---
 src-tauri/src/config/schema.rs              | 13 +++--
 src-tauri/src/config/tests.rs               | 64 +++++++++++++++++----
 src-tauri/src/settings_commands/tests.rs    |  2 +-
 src/settings/SettingsWindow.test.tsx        |  1 +
 src/settings/components/SaveField.test.tsx  |  1 +
 src/settings/hooks/useConfigSync.test.ts    |  1 +
 src/settings/hooks/useDebouncedSave.test.ts |  1 +
 src/settings/tabs/tabs.test.tsx             |  1 +
 src/settings/types.ts                       |  1 +
 12 files changed, 98 insertions(+), 22 deletions(-)

diff --git a/docs/configurations.md b/docs/configurations.md
index 16523c67..a735d6c3 100644
--- a/docs/configurations.md
+++ b/docs/configurations.md
@@ -39,6 +39,10 @@ num_ctx = 16384
 # 0 = let Ollama manage (its own 5-minute default applies).
 # -1 = never release. Applies to the Ollama provider only.
 keep_warm_inactivity_minutes = 0
+# Minutes of inactivity before Thuki stops the built-in engine to free RAM.
+# 0 keeps the model loaded indefinitely for instant first tokens (default).
+# Applies to the built-in engine only. Valid range: 0-1440.
+idle_unload_minutes = 0
 
 # One block per provider. The built-in entry is always present. A provider's
 # selected model lives on its own `model` field (empty until you pick one in
@@ -143,6 +147,7 @@ Upgrading from an older version is automatic: a pre-providers config with a flat
 | `active_provider` | `"ollama"` | Yes      | id of a provider    | Which provider receives inference. Must match the `id` of one of the `[[inference.providers]]` entries; an empty or dangling value resets to `ollama`. Phase 1: leave this on `ollama` (the Built-in engine is not available yet).                                                                                                                                                                                                                                                                                              |
 | `num_ctx`         | `16384`    | Yes      | `[2048, 1048576]`   | Context window size in tokens sent to the active provider with every request. Warmup and chat share this value so Ollama reuses the same runner instance and its cached KV prefix for the system prompt: they must match or Ollama creates a second runner and the warmup saves nothing. Ollama silently clamps this to the model's physical maximum. Raise to fit longer conversations: each doubling roughly doubles VRAM for the KV cache; lower to reclaim GPU memory. See [Tuning the Context Window](./tuning-context-window.md). |
 | `keep_warm_inactivity_minutes` | `0` | Yes | `-1` or `[0, 1440]` | Minutes of inactivity before Thuki tells Ollama to release the model from VRAM. Applies to the Ollama provider only. `0` means do not manage: Ollama's own 5-minute default applies. `-1` means never release. Raise for longer sessions between uses; lower to reclaim VRAM sooner.                                                                                                                                                                                                                                            |
+| `idle_unload_minutes`          | `0` | Yes | `[0, 1440]`         | Minutes of inactivity before Thuki stops the built-in engine to free RAM. Applies to the built-in engine only; the Ollama provider uses `keep_warm_inactivity_minutes` instead. `0` keeps the model loaded indefinitely so the first token after a pause stays instant. Raise to free RAM on an idle Mac; keep `0` for instant first tokens.                                                                                                                                                                                   |
 
 Each `[[inference.providers]]` block has these fields:
 
diff --git a/src-tauri/src/config/defaults.rs b/src-tauri/src/config/defaults.rs
index 6b9d5126..c352843a 100644
--- a/src-tauri/src/config/defaults.rs
+++ b/src-tauri/src/config/defaults.rs
@@ -54,6 +54,15 @@ pub const BOUNDS_NUM_CTX: (u32, u32) = (2048, 1_048_576);
 /// Values below -1 or above 1440 are clamped to the compiled default.
 pub const BOUNDS_KEEP_WARM_INACTIVITY_MINUTES: (i32, i32) = (-1, 1440);
 
+/// Minutes of inactivity before Thuki stops the built-in engine to free RAM.
+/// 0 disables auto-unload: the model stays loaded and the first token stays
+/// instant (the default). Positive values free RAM after N idle minutes at
+/// the cost of a cold reload on the next message. Built-in engine only; the
+/// Ollama provider keeps `keep_warm_inactivity_minutes` (note the different
+/// meaning of 0 there: "use Ollama's own default").
+pub const DEFAULT_IDLE_UNLOAD_MINUTES: u32 = 0;
+pub const BOUNDS_IDLE_UNLOAD_MINUTES: (u32, u32) = (0, 1440);
+
 /// Built-in secretary persona prompt. User overrides via `[prompt] system` in
 /// the config file. The slash-command appendix is composed on top at load time
 /// and is never written back to the file.
@@ -322,6 +331,7 @@ pub const ALLOWED_FIELDS: &[(&str, &str)] = &[
     // [inference] — active_provider and the providers array are not flat fields;
     // they are written via set_active_model / set_ollama_url, not set_config_field.
     ("inference", "keep_warm_inactivity_minutes"),
+    ("inference", "idle_unload_minutes"),
     ("inference", "num_ctx"),
     // [prompt]
     ("prompt", "system"),
diff --git a/src-tauri/src/config/loader.rs b/src-tauri/src/config/loader.rs
index e1212a9e..a636ca11 100644
--- a/src-tauri/src/config/loader.rs
+++ b/src-tauri/src/config/loader.rs
@@ -22,15 +22,15 @@ use std::path::{Path, PathBuf};
 use std::time::{SystemTime, UNIX_EPOCH};
 
 use super::defaults::{
-    ALLOWED_FONT_WEIGHTS, BOUNDS_KEEP_WARM_INACTIVITY_MINUTES, BOUNDS_MAX_CHAT_HEIGHT,
-    BOUNDS_MAX_IMAGES, BOUNDS_MAX_ITERATIONS, BOUNDS_NUM_CTX, BOUNDS_OVERLAY_WIDTH,
-    BOUNDS_PIPELINE_WALL_CLOCK_BUDGET_S, BOUNDS_QUOTE_MAX_CONTEXT_LENGTH,
+    ALLOWED_FONT_WEIGHTS, BOUNDS_IDLE_UNLOAD_MINUTES, BOUNDS_KEEP_WARM_INACTIVITY_MINUTES,
+    BOUNDS_MAX_CHAT_HEIGHT, BOUNDS_MAX_IMAGES, BOUNDS_MAX_ITERATIONS, BOUNDS_NUM_CTX,
+    BOUNDS_OVERLAY_WIDTH, BOUNDS_PIPELINE_WALL_CLOCK_BUDGET_S, BOUNDS_QUOTE_MAX_CONTEXT_LENGTH,
     BOUNDS_QUOTE_MAX_DISPLAY_CHARS, BOUNDS_QUOTE_MAX_DISPLAY_LINES, BOUNDS_SEARXNG_MAX_RESULTS,
     BOUNDS_TEXT_BASE_PX, BOUNDS_TEXT_LETTER_SPACING_PX, BOUNDS_TEXT_LINE_HEIGHT, BOUNDS_TIMEOUT_S,
-    BOUNDS_TOP_K_URLS, BOUNDS_UPDATER_CHECK_INTERVAL_HOURS, DEFAULT_JUDGE_TIMEOUT_S,
-    DEFAULT_KEEP_WARM_INACTIVITY_MINUTES, DEFAULT_MAX_CHAT_HEIGHT, DEFAULT_MAX_IMAGES,
-    DEFAULT_MAX_ITERATIONS, DEFAULT_NUM_CTX, DEFAULT_OLLAMA_URL, DEFAULT_OVERLAY_WIDTH,
-    DEFAULT_PIPELINE_WALL_CLOCK_BUDGET_S, DEFAULT_QUOTE_MAX_CONTEXT_LENGTH,
+    BOUNDS_TOP_K_URLS, BOUNDS_UPDATER_CHECK_INTERVAL_HOURS, DEFAULT_IDLE_UNLOAD_MINUTES,
+    DEFAULT_JUDGE_TIMEOUT_S, DEFAULT_KEEP_WARM_INACTIVITY_MINUTES, DEFAULT_MAX_CHAT_HEIGHT,
+    DEFAULT_MAX_IMAGES, DEFAULT_MAX_ITERATIONS, DEFAULT_NUM_CTX, DEFAULT_OLLAMA_URL,
+    DEFAULT_OVERLAY_WIDTH, DEFAULT_PIPELINE_WALL_CLOCK_BUDGET_S, DEFAULT_QUOTE_MAX_CONTEXT_LENGTH,
     DEFAULT_QUOTE_MAX_DISPLAY_CHARS, DEFAULT_QUOTE_MAX_DISPLAY_LINES,
     DEFAULT_READER_BATCH_TIMEOUT_S, DEFAULT_READER_PER_URL_TIMEOUT_S, DEFAULT_READER_URL,
     DEFAULT_ROUTER_TIMEOUT_S, DEFAULT_SEARCH_TIMEOUT_S, DEFAULT_SEARXNG_MAX_RESULTS,
@@ -328,6 +328,12 @@ fn resolve_inference(inf: &mut crate::config::schema::InferenceSection) {
         DEFAULT_KEEP_WARM_INACTIVITY_MINUTES,
         "inference.keep_warm_inactivity_minutes",
     );
+    clamp_u32(
+        &mut inf.idle_unload_minutes,
+        BOUNDS_IDLE_UNLOAD_MINUTES,
+        DEFAULT_IDLE_UNLOAD_MINUTES,
+        "inference.idle_unload_minutes",
+    );
 
     // Migration: a pre-providers file has `ollama_url` and no `providers`.
     // Carry the URL onto a synthesized Ollama provider; the active model is
diff --git a/src-tauri/src/config/schema.rs b/src-tauri/src/config/schema.rs
index 8702c9f2..43ade2b2 100644
--- a/src-tauri/src/config/schema.rs
+++ b/src-tauri/src/config/schema.rs
@@ -15,10 +15,10 @@ use serde::{Deserialize, Serialize};
 
 use super::defaults::{
     DEFAULT_ACTIVE_PROVIDER, DEFAULT_AUTO_CLOSE, DEFAULT_AUTO_REPLACE, DEFAULT_BUILTIN_LABEL,
-    DEFAULT_DEBUG_TRACE_ENABLED, DEFAULT_JUDGE_TIMEOUT_S, DEFAULT_KEEP_WARM_INACTIVITY_MINUTES,
-    DEFAULT_MAX_CHAT_HEIGHT, DEFAULT_MAX_IMAGES, DEFAULT_MAX_ITERATIONS, DEFAULT_NUM_CTX,
-    DEFAULT_OLLAMA_LABEL, DEFAULT_OLLAMA_URL, DEFAULT_OVERLAY_WIDTH,
-    DEFAULT_PIPELINE_WALL_CLOCK_BUDGET_S, DEFAULT_QUOTE_MAX_CONTEXT_LENGTH,
+    DEFAULT_DEBUG_TRACE_ENABLED, DEFAULT_IDLE_UNLOAD_MINUTES, DEFAULT_JUDGE_TIMEOUT_S,
+    DEFAULT_KEEP_WARM_INACTIVITY_MINUTES, DEFAULT_MAX_CHAT_HEIGHT, DEFAULT_MAX_IMAGES,
+    DEFAULT_MAX_ITERATIONS, DEFAULT_NUM_CTX, DEFAULT_OLLAMA_LABEL, DEFAULT_OLLAMA_URL,
+    DEFAULT_OVERLAY_WIDTH, DEFAULT_PIPELINE_WALL_CLOCK_BUDGET_S, DEFAULT_QUOTE_MAX_CONTEXT_LENGTH,
     DEFAULT_QUOTE_MAX_DISPLAY_CHARS, DEFAULT_QUOTE_MAX_DISPLAY_LINES,
     DEFAULT_READER_BATCH_TIMEOUT_S, DEFAULT_READER_PER_URL_TIMEOUT_S, DEFAULT_READER_URL,
     DEFAULT_ROUTER_TIMEOUT_S, DEFAULT_SEARCH_TIMEOUT_S, DEFAULT_SEARXNG_MAX_RESULTS,
@@ -99,6 +99,10 @@ pub struct InferenceSection {
     /// 5-minute default applies). -1 means keep indefinitely. Valid range: -1
     /// or 0..=1440.
     pub keep_warm_inactivity_minutes: i32,
+    /// Minutes of inactivity before the built-in engine is stopped to free
+    /// RAM. 0 keeps the model loaded indefinitely (default). Built-in only;
+    /// Ollama keeps `keep_warm_inactivity_minutes`. Valid range: 0..=1440.
+    pub idle_unload_minutes: u32,
     /// The configured providers. Always contains the built-in entry after
     /// resolution. The field-level `#[serde(default)]` defaults a *missing*
     /// `providers` key to an empty Vec (not the seeded pair), so the loader can
@@ -119,6 +123,7 @@ impl Default for InferenceSection {
             active_provider: DEFAULT_ACTIVE_PROVIDER.to_string(),
             num_ctx: DEFAULT_NUM_CTX,
             keep_warm_inactivity_minutes: DEFAULT_KEEP_WARM_INACTIVITY_MINUTES,
+            idle_unload_minutes: DEFAULT_IDLE_UNLOAD_MINUTES,
             providers: default_providers(),
             legacy_ollama_url: None,
         }
diff --git a/src-tauri/src/config/tests.rs b/src-tauri/src/config/tests.rs
index 1e374479..c02f1902 100644
--- a/src-tauri/src/config/tests.rs
+++ b/src-tauri/src/config/tests.rs
@@ -14,16 +14,17 @@ use std::path::PathBuf;
 
 use super::defaults::{
     DEFAULT_ACTIVE_PROVIDER, DEFAULT_AUTO_CLOSE, DEFAULT_AUTO_REPLACE, DEFAULT_DEBUG_TRACE_ENABLED,
-    DEFAULT_JUDGE_TIMEOUT_S, DEFAULT_KEEP_WARM_INACTIVITY_MINUTES, DEFAULT_MAX_CHAT_HEIGHT,
-    DEFAULT_MAX_IMAGES, DEFAULT_MAX_ITERATIONS, DEFAULT_NUM_CTX, DEFAULT_OLLAMA_URL,
-    DEFAULT_OVERLAY_WIDTH, DEFAULT_QUOTE_MAX_CONTEXT_LENGTH, DEFAULT_QUOTE_MAX_DISPLAY_CHARS,
-    DEFAULT_QUOTE_MAX_DISPLAY_LINES, DEFAULT_READER_BATCH_TIMEOUT_S,
-    DEFAULT_READER_PER_URL_TIMEOUT_S, DEFAULT_READER_URL, DEFAULT_ROUTER_TIMEOUT_S,
-    DEFAULT_SEARCH_TIMEOUT_S, DEFAULT_SEARXNG_MAX_RESULTS, DEFAULT_SEARXNG_URL,
-    DEFAULT_SYSTEM_PROMPT_BASE, DEFAULT_TEXT_BASE_PX, DEFAULT_TEXT_FONT_WEIGHT,
-    DEFAULT_TEXT_LETTER_SPACING_PX, DEFAULT_TEXT_LINE_HEIGHT, DEFAULT_TOP_K_URLS,
-    DEFAULT_UPDATER_CHECK_INTERVAL_HOURS, DEFAULT_UPDATER_MANIFEST_URL, PROVIDER_ID_BUILTIN,
-    PROVIDER_ID_OLLAMA, PROVIDER_KIND_BUILTIN, PROVIDER_KIND_OLLAMA, SLASH_COMMAND_PROMPT_APPENDIX,
+    DEFAULT_IDLE_UNLOAD_MINUTES, DEFAULT_JUDGE_TIMEOUT_S, DEFAULT_KEEP_WARM_INACTIVITY_MINUTES,
+    DEFAULT_MAX_CHAT_HEIGHT, DEFAULT_MAX_IMAGES, DEFAULT_MAX_ITERATIONS, DEFAULT_NUM_CTX,
+    DEFAULT_OLLAMA_URL, DEFAULT_OVERLAY_WIDTH, DEFAULT_QUOTE_MAX_CONTEXT_LENGTH,
+    DEFAULT_QUOTE_MAX_DISPLAY_CHARS, DEFAULT_QUOTE_MAX_DISPLAY_LINES,
+    DEFAULT_READER_BATCH_TIMEOUT_S, DEFAULT_READER_PER_URL_TIMEOUT_S, DEFAULT_READER_URL,
+    DEFAULT_ROUTER_TIMEOUT_S, DEFAULT_SEARCH_TIMEOUT_S, DEFAULT_SEARXNG_MAX_RESULTS,
+    DEFAULT_SEARXNG_URL, DEFAULT_SYSTEM_PROMPT_BASE, DEFAULT_TEXT_BASE_PX,
+    DEFAULT_TEXT_FONT_WEIGHT, DEFAULT_TEXT_LETTER_SPACING_PX, DEFAULT_TEXT_LINE_HEIGHT,
+    DEFAULT_TOP_K_URLS, DEFAULT_UPDATER_CHECK_INTERVAL_HOURS, DEFAULT_UPDATER_MANIFEST_URL,
+    PROVIDER_ID_BUILTIN, PROVIDER_ID_OLLAMA, PROVIDER_KIND_BUILTIN, PROVIDER_KIND_OLLAMA,
+    SLASH_COMMAND_PROMPT_APPENDIX,
 };
 use super::error::ConfigError;
 use super::loader::{compose_system_prompt, load_from_path};
@@ -532,6 +533,49 @@ fn num_ctx_roundtrips_through_toml() {
     assert_eq!(reloaded.inference.num_ctx, config.inference.num_ctx);
 }
 
+#[test]
+fn idle_unload_default_matches_const() {
+    let c = AppConfig::default();
+    assert_eq!(c.inference.idle_unload_minutes, DEFAULT_IDLE_UNLOAD_MINUTES);
+}
+
+#[test]
+fn idle_unload_out_of_bounds_resets() {
+    let dir = fresh_temp_dir();
+    let path = config_path_in(&dir);
+    std::fs::write(&path, "[inference]\nidle_unload_minutes = 99999\n").unwrap();
+    let config = load_from_path(&path).unwrap();
+    assert_eq!(
+        config.inference.idle_unload_minutes,
+        DEFAULT_IDLE_UNLOAD_MINUTES
+    );
+}
+
+#[test]
+fn idle_unload_in_bounds_preserved() {
+    let dir = fresh_temp_dir();
+    let path = config_path_in(&dir);
+    std::fs::write(&path, "[inference]\nidle_unload_minutes = 30\n").unwrap();
+    let config = load_from_path(&path).unwrap();
+    assert_eq!(config.inference.idle_unload_minutes, 30);
+}
+
+#[test]
+fn idle_unload_roundtrips_through_toml() {
+    let dir = fresh_temp_dir();
+    let path = config_path_in(&dir);
+    std::fs::write(&path, "[inference]\nidle_unload_minutes = 15\n").unwrap();
+    let config = load_from_path(&path).unwrap();
+    assert_eq!(config.inference.idle_unload_minutes, 15);
+
+    atomic_write(&path, &config).unwrap();
+    let reloaded = load_from_path(&path).unwrap();
+    assert_eq!(
+        reloaded.inference.idle_unload_minutes,
+        config.inference.idle_unload_minutes
+    );
+}
+
 #[test]
 fn resolve_empty_ollama_url_falls_back() {
     let dir = fresh_temp_dir();
diff --git a/src-tauri/src/settings_commands/tests.rs b/src-tauri/src/settings_commands/tests.rs
index 86247dfe..f38045c8 100644
--- a/src-tauri/src/settings_commands/tests.rs
+++ b/src-tauri/src/settings_commands/tests.rs
@@ -95,7 +95,7 @@ fn allowed_fields_count_matches_schema_field_count() {
     // and is intentionally absent from ALLOWED_FIELDS. If this assertion fails, the
     // schema has drifted from the allowlist and someone added a field without
     // extending ALLOWED_FIELDS.
-    assert_eq!(ALLOWED_FIELDS.len(), 30);
+    assert_eq!(ALLOWED_FIELDS.len(), 31);
 }
 
 #[test]
diff --git a/src/settings/SettingsWindow.test.tsx b/src/settings/SettingsWindow.test.tsx
index c649da16..57cd7b27 100644
--- a/src/settings/SettingsWindow.test.tsx
+++ b/src/settings/SettingsWindow.test.tsx
@@ -19,6 +19,7 @@ const SAMPLE: RawAppConfig = {
   inference: {
     active_provider: 'ollama',
     keep_warm_inactivity_minutes: 0,
+    idle_unload_minutes: 0,
     num_ctx: 16384,
     providers: [
       {
diff --git a/src/settings/components/SaveField.test.tsx b/src/settings/components/SaveField.test.tsx
index 76e99590..c69c2a02 100644
--- a/src/settings/components/SaveField.test.tsx
+++ b/src/settings/components/SaveField.test.tsx
@@ -12,6 +12,7 @@ const SAMPLE: RawAppConfig = {
   inference: {
     active_provider: 'ollama',
     keep_warm_inactivity_minutes: 0,
+    idle_unload_minutes: 0,
     num_ctx: 16384,
     providers: [
       {
diff --git a/src/settings/hooks/useConfigSync.test.ts b/src/settings/hooks/useConfigSync.test.ts
index 70bd093f..08b5b575 100644
--- a/src/settings/hooks/useConfigSync.test.ts
+++ b/src/settings/hooks/useConfigSync.test.ts
@@ -16,6 +16,7 @@ const CONFIG_A: RawAppConfig = {
   inference: {
     active_provider: 'ollama',
     keep_warm_inactivity_minutes: 0,
+    idle_unload_minutes: 0,
     num_ctx: 16384,
     providers: [
       {
diff --git a/src/settings/hooks/useDebouncedSave.test.ts b/src/settings/hooks/useDebouncedSave.test.ts
index 714a4105..8ec59f88 100644
--- a/src/settings/hooks/useDebouncedSave.test.ts
+++ b/src/settings/hooks/useDebouncedSave.test.ts
@@ -13,6 +13,7 @@ const SAMPLE_CONFIG: RawAppConfig = {
   inference: {
     active_provider: 'ollama',
     keep_warm_inactivity_minutes: 0,
+    idle_unload_minutes: 0,
     num_ctx: 16384,
     providers: [
       {
diff --git a/src/settings/tabs/tabs.test.tsx b/src/settings/tabs/tabs.test.tsx
index 1545aa17..6fe4b307 100644
--- a/src/settings/tabs/tabs.test.tsx
+++ b/src/settings/tabs/tabs.test.tsx
@@ -37,6 +37,7 @@ const CONFIG: RawAppConfig = {
   inference: {
     active_provider: 'ollama',
     keep_warm_inactivity_minutes: 0,
+    idle_unload_minutes: 0,
     num_ctx: 16384,
     providers: [
       {
diff --git a/src/settings/types.ts b/src/settings/types.ts
index c69b387f..e2db1fe2 100644
--- a/src/settings/types.ts
+++ b/src/settings/types.ts
@@ -25,6 +25,7 @@ export interface RawAppConfig {
   inference: {
     active_provider: string;
     keep_warm_inactivity_minutes: number;
+    idle_unload_minutes: number;
     num_ctx: number;
     providers: RawProvider[];
   };

From dbcc8769d15b78bc175b7e2bc30e564a12f75f2d Mon Sep 17 00:00:00 2001
From: Logan Nguyen <lg.131.dev@gmail.com>
Date: Wed, 10 Jun 2026 15:56:50 -0500
Subject: [PATCH 03/13] feat: add engine residency state machine

Signed-off-by: Logan Nguyen <lg.131.dev@gmail.com>
---
 src-tauri/src/engine/mod.rs   |   9 +
 src-tauri/src/engine/state.rs | 518 ++++++++++++++++++++++++++++++++++
 src-tauri/src/lib.rs          |   1 +
 3 files changed, 528 insertions(+)
 create mode 100644 src-tauri/src/engine/mod.rs
 create mode 100644 src-tauri/src/engine/state.rs

diff --git a/src-tauri/src/engine/mod.rs b/src-tauri/src/engine/mod.rs
new file mode 100644
index 00000000..b0777d78
--- /dev/null
+++ b/src-tauri/src/engine/mod.rs
@@ -0,0 +1,9 @@
+//! Built-in inference engine management.
+//!
+//! Thuki bundles a single `llama-server` sidecar and manages its lifecycle:
+//! at most one engine process exists, two models are never resident, and a
+//! model (or context-size) switch always kills the old process and waits for
+//! a confirmed exit before spawning the new one. This module hosts the pieces
+//! of that lifecycle; the pure residency state machine lives in [`state`].
+
+pub mod state;
diff --git a/src-tauri/src/engine/state.rs b/src-tauri/src/engine/state.rs
new file mode 100644
index 00000000..ef81c90c
--- /dev/null
+++ b/src-tauri/src/engine/state.rs
@@ -0,0 +1,518 @@
+//! Pure residency state machine for the built-in engine process.
+//!
+//! Encodes the engine lifecycle invariants as a side-effect-free transition
+//! function: at most one engine process, never two models resident,
+//! kill-then-start with a confirmed exit, latest requested target wins.
+//! The machine owns no process, no IO, and no clock; the runner actor feeds
+//! it [`Event`]s and executes the [`Effect`] each transition requests.
+
+use std::path::PathBuf;
+
+/// A fully resolved engine configuration the runner can spawn.
+///
+/// Equality is the derived field-wise comparison; [`step`] uses it to decide
+/// whether an `Ensure` must restart the process. A `num_ctx` change is a new
+/// target, like a model switch: context size is fixed at `llama-server` start.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Target {
+    /// Path to the GGUF model file (expected absolute; not checked here).
+    pub model_path: PathBuf,
+    /// Optional multimodal projector file for vision-capable models.
+    pub mmproj_path: Option<PathBuf>,
+    /// Context window size (tokens) the server is started with.
+    pub num_ctx: u32,
+}
+
+/// Lifecycle position of the engine process, as tracked by [`step`].
+#[derive(Debug, Clone, PartialEq)]
+pub enum EngineState {
+    /// No process exists and none is wanted.
+    Stopped,
+    /// A spawn is in flight for the given target; not yet healthy.
+    Starting(Target),
+    /// The process is healthy and serving `target` on the reported `port`.
+    Loaded { target: Target, port: u16 },
+    /// A kill was issued and the exit is not yet confirmed. `next` is the
+    /// target to spawn once the exit lands, or `None` to stay stopped.
+    Stopping { next: Option<Target> },
+    /// The spawn failed or the process crashed (payload: the error text).
+    /// Sticky until `Ensure` (retry) or `Unload` (return to `Stopped`).
+    Failed(String),
+}
+
+/// Inputs the runner feeds into the machine: user intent (`Ensure`,
+/// `Unload`), process observations (`SpawnedHealthy`, `SpawnFailed`,
+/// `ExitConfirmed`, `ChildCrashed`), and timers (`IdleExpired`).
+#[derive(Debug, PartialEq)]
+pub enum Event {
+    /// Make this target resident, restarting the process if a different
+    /// target is currently starting or loaded.
+    Ensure(Target),
+    /// Stop the engine and release its memory. Always wins over a pending
+    /// restart queued behind a kill.
+    Unload,
+    /// The spawned process passed its health check and is serving on `port`.
+    SpawnedHealthy { port: u16 },
+    /// The spawn attempt itself failed before the process became healthy.
+    SpawnFailed(String),
+    /// The process exit requested by a kill has been observed.
+    ExitConfirmed,
+    /// The process died unasked; while `Stopping` it counts as the exit.
+    ChildCrashed(String),
+    /// The idle-unload timer elapsed; acted on only while `Loaded`.
+    IdleExpired,
+}
+
+/// The single side effect, if any, the runner must execute after a
+/// transition. The machine never requests more than one effect per step;
+/// chained work (kill then spawn) is sequenced through `Stopping` and a
+/// follow-up `ExitConfirmed`.
+#[derive(Debug, PartialEq)]
+pub enum Effect {
+    /// Nothing to do.
+    None,
+    /// Spawn this target; report `SpawnedHealthy` or `SpawnFailed` back.
+    Spawn(Target),
+    /// Kill the current process; report `ExitConfirmed` once it has exited.
+    Kill,
+}
+
+/// Advances the machine by one event; returns the new state and the effect
+/// the runner must execute. Total over all `(state, event)` pairs: stale,
+/// duplicate, or redundant combinations are explicit no-ops.
+///
+/// NOTE(review): events carry no process generation, so a late report from
+/// a killed spawn is applied to its replacement; confirm the runner drops it.
+pub fn step(state: EngineState, event: Event) -> (EngineState, Effect) {
+    match (state, event) {
+        // Stopped: only Ensure does anything; there is no process to
+        // observe, unload, or expire.
+        (EngineState::Stopped, Event::Ensure(target)) => {
+            (EngineState::Starting(target.clone()), Effect::Spawn(target))
+        }
+        (state @ EngineState::Stopped, Event::Unload)
+        | (state @ EngineState::Stopped, Event::SpawnedHealthy { .. })
+        | (state @ EngineState::Stopped, Event::SpawnFailed(_))
+        | (state @ EngineState::Stopped, Event::ExitConfirmed)
+        | (state @ EngineState::Stopped, Event::ChildCrashed(_))
+        | (state @ EngineState::Stopped, Event::IdleExpired) => (state, Effect::None),
+
+        // Starting: waiting on the health check of an in-flight spawn.
+        (EngineState::Starting(target), Event::SpawnedHealthy { port }) => {
+            (EngineState::Loaded { target, port }, Effect::None)
+        }
+        (EngineState::Starting(_), Event::SpawnFailed(err))
+        | (EngineState::Starting(_), Event::ChildCrashed(err)) => {
+            (EngineState::Failed(err), Effect::None)
+        }
+        (EngineState::Starting(current), Event::Ensure(requested)) => {
+            if requested == current {
+                // Already starting exactly this target; let the spawn finish.
+                (EngineState::Starting(current), Effect::None)
+            } else {
+                // Different target: abort the in-flight spawn and queue the
+                // new target behind the confirmed exit.
+                (
+                    EngineState::Stopping {
+                        next: Some(requested),
+                    },
+                    Effect::Kill,
+                )
+            }
+        }
+        (EngineState::Starting(_), Event::Unload) => {
+            (EngineState::Stopping { next: None }, Effect::Kill)
+        }
+        // A confirmed exit or idle timer cannot belong to a spawn that has
+        // not reported healthy yet; ignore.
+        (state @ EngineState::Starting(_), Event::ExitConfirmed)
+        | (state @ EngineState::Starting(_), Event::IdleExpired) => (state, Effect::None),
+
+        // Loaded: healthy and serving.
+        (EngineState::Loaded { target, port }, Event::Ensure(requested)) => {
+            if requested == target {
+                (EngineState::Loaded { target, port }, Effect::None)
+            } else {
+                (
+                    EngineState::Stopping {
+                        next: Some(requested),
+                    },
+                    Effect::Kill,
+                )
+            }
+        }
+        (EngineState::Loaded { .. }, Event::Unload)
+        | (EngineState::Loaded { .. }, Event::IdleExpired) => {
+            (EngineState::Stopping { next: None }, Effect::Kill)
+        }
+        (EngineState::Loaded { .. }, Event::ChildCrashed(err)) => {
+            (EngineState::Failed(err), Effect::None)
+        }
+        // A health report or spawn failure cannot apply to an already
+        // loaded process; ignore. An ExitConfirmed without a kill is stale.
+        (state @ EngineState::Loaded { .. }, Event::SpawnedHealthy { .. })
+        | (state @ EngineState::Loaded { .. }, Event::SpawnFailed(_))
+        | (state @ EngineState::Loaded { .. }, Event::ExitConfirmed) => (state, Effect::None),
+
+        // Stopping: a kill was issued; everything pivots on the exit.
+        // A crash while stopping confirms the exit just as well as the kill
+        // landing, so ChildCrashed is handled identically to ExitConfirmed.
+        (EngineState::Stopping { next: Some(next) }, Event::ExitConfirmed)
+        | (EngineState::Stopping { next: Some(next) }, Event::ChildCrashed(_)) => {
+            (EngineState::Starting(next.clone()), Effect::Spawn(next))
+        }
+        (EngineState::Stopping { next: None }, Event::ExitConfirmed)
+        | (EngineState::Stopping { next: None }, Event::ChildCrashed(_)) => {
+            (EngineState::Stopped, Effect::None)
+        }
+        // Latest target wins: replace whatever was queued. The kill is
+        // already in flight, so no new effect is needed.
+        (EngineState::Stopping { .. }, Event::Ensure(requested)) => (
+            EngineState::Stopping {
+                next: Some(requested),
+            },
+            Effect::None,
+        ),
+        // Manual Unload always wins over a pending restart.
+        (EngineState::Stopping { .. }, Event::Unload) => {
+            (EngineState::Stopping { next: None }, Effect::None)
+        }
+        // Health, spawn-failure, and idle events belong to a process
+        // generation that has already been superseded; ignore.
+        (state @ EngineState::Stopping { .. }, Event::SpawnedHealthy { .. })
+        | (state @ EngineState::Stopping { .. }, Event::SpawnFailed(_))
+        | (state @ EngineState::Stopping { .. }, Event::IdleExpired) => (state, Effect::None),
+
+        // Failed: sticky error awaiting user intent.
+        (EngineState::Failed(_), Event::Ensure(target)) => {
+            (EngineState::Starting(target.clone()), Effect::Spawn(target))
+        }
+        (EngineState::Failed(_), Event::Unload) => (EngineState::Stopped, Effect::None),
+        // No process exists in Failed; process and timer events are stale.
+        (state @ EngineState::Failed(_), Event::SpawnedHealthy { .. })
+        | (state @ EngineState::Failed(_), Event::SpawnFailed(_))
+        | (state @ EngineState::Failed(_), Event::ExitConfirmed)
+        | (state @ EngineState::Failed(_), Event::ChildCrashed(_))
+        | (state @ EngineState::Failed(_), Event::IdleExpired) => (state, Effect::None),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn target_a() -> Target {
+        Target {
+            model_path: PathBuf::from("/models/a.gguf"),
+            mmproj_path: None,
+            num_ctx: 8192,
+        }
+    }
+
+    fn target_b() -> Target {
+        Target {
+            model_path: PathBuf::from("/models/b.gguf"),
+            mmproj_path: Some(PathBuf::from("/models/b.mmproj.gguf")),
+            num_ctx: 8192,
+        }
+    }
+
+    fn loaded_a() -> EngineState {
+        EngineState::Loaded {
+            target: target_a(),
+            port: 4242,
+        }
+    }
+
+    // Stopped
+
+    #[test]
+    fn ensure_from_stopped_starts() {
+        let (state, effect) = step(EngineState::Stopped, Event::Ensure(target_a()));
+        assert_eq!(state, EngineState::Starting(target_a()));
+        assert_eq!(effect, Effect::Spawn(target_a()));
+    }
+
+    #[test]
+    fn stopped_ignores_non_ensure_events() {
+        for event in [
+            Event::Unload,
+            Event::SpawnedHealthy { port: 4242 },
+            Event::SpawnFailed("boom".into()),
+            Event::ExitConfirmed,
+            Event::ChildCrashed("boom".into()),
+            Event::IdleExpired,
+        ] {
+            let (state, effect) = step(EngineState::Stopped, event);
+            assert_eq!(state, EngineState::Stopped);
+            assert_eq!(effect, Effect::None);
+        }
+    }
+
+    // Starting
+
+    #[test]
+    fn health_ok_loads() {
+        let (state, effect) = step(
+            EngineState::Starting(target_a()),
+            Event::SpawnedHealthy { port: 4242 },
+        );
+        assert_eq!(state, loaded_a());
+        assert_eq!(effect, Effect::None);
+    }
+
+    #[test]
+    fn spawn_failed_while_starting_fails() {
+        let (state, effect) = step(
+            EngineState::Starting(target_a()),
+            Event::SpawnFailed("bind error".into()),
+        );
+        assert_eq!(state, EngineState::Failed("bind error".into()));
+        assert_eq!(effect, Effect::None);
+    }
+
+    #[test]
+    fn crash_while_starting_fails() {
+        let (state, effect) = step(
+            EngineState::Starting(target_a()),
+            Event::ChildCrashed("signal 9".into()),
+        );
+        assert_eq!(state, EngineState::Failed("signal 9".into()));
+        assert_eq!(effect, Effect::None);
+    }
+
+    #[test]
+    fn ensure_same_target_while_starting_keeps_starting() {
+        let (state, effect) = step(EngineState::Starting(target_a()), Event::Ensure(target_a()));
+        assert_eq!(state, EngineState::Starting(target_a()));
+        assert_eq!(effect, Effect::None);
+    }
+
+    #[test]
+    fn ensure_while_starting_aborts_inflight() {
+        let (state, effect) = step(EngineState::Starting(target_a()), Event::Ensure(target_b()));
+        assert_eq!(
+            state,
+            EngineState::Stopping {
+                next: Some(target_b())
+            }
+        );
+        assert_eq!(effect, Effect::Kill);
+    }
+
+    #[test]
+    fn unload_while_starting_stops() {
+        let (state, effect) = step(EngineState::Starting(target_a()), Event::Unload);
+        assert_eq!(state, EngineState::Stopping { next: None });
+        assert_eq!(effect, Effect::Kill);
+    }
+
+    #[test]
+    fn starting_ignores_exit_confirmed_and_idle() {
+        for event in [Event::ExitConfirmed, Event::IdleExpired] {
+            let (state, effect) = step(EngineState::Starting(target_a()), event);
+            assert_eq!(state, EngineState::Starting(target_a()));
+            assert_eq!(effect, Effect::None);
+        }
+    }
+
+    // Loaded
+
+    #[test]
+    fn ensure_same_target_while_loaded_is_noop() {
+        let (state, effect) = step(loaded_a(), Event::Ensure(target_a()));
+        assert_eq!(state, loaded_a());
+        assert_eq!(effect, Effect::None);
+    }
+
+    #[test]
+    fn ensure_new_target_while_loaded_stops_first() {
+        let (state, effect) = step(loaded_a(), Event::Ensure(target_b()));
+        assert_eq!(
+            state,
+            EngineState::Stopping {
+                next: Some(target_b())
+            }
+        );
+        assert_eq!(effect, Effect::Kill);
+    }
+
+    #[test]
+    fn num_ctx_change_is_new_target() {
+        let resized = Target {
+            num_ctx: 16384,
+            ..target_a()
+        };
+        let (state, effect) = step(loaded_a(), Event::Ensure(resized.clone()));
+        assert_eq!(
+            state,
+            EngineState::Stopping {
+                next: Some(resized)
+            }
+        );
+        assert_eq!(effect, Effect::Kill);
+    }
+
+    #[test]
+    fn unload_while_loaded_stops() {
+        let (state, effect) = step(loaded_a(), Event::Unload);
+        assert_eq!(state, EngineState::Stopping { next: None });
+        assert_eq!(effect, Effect::Kill);
+    }
+
+    #[test]
+    fn idle_expired_while_loaded_stops() {
+        let (state, effect) = step(loaded_a(), Event::IdleExpired);
+        assert_eq!(state, EngineState::Stopping { next: None });
+        assert_eq!(effect, Effect::Kill);
+    }
+
+    #[test]
+    fn crash_while_loaded_fails() {
+        let (state, effect) = step(loaded_a(), Event::ChildCrashed("oom".into()));
+        assert_eq!(state, EngineState::Failed("oom".into()));
+        assert_eq!(effect, Effect::None);
+    }
+
+    #[test]
+    fn loaded_ignores_stale_process_events() {
+        for event in [
+            Event::SpawnedHealthy { port: 9999 },
+            Event::SpawnFailed("stale".into()),
+            Event::ExitConfirmed,
+        ] {
+            let (state, effect) = step(loaded_a(), event);
+            assert_eq!(state, loaded_a());
+            assert_eq!(effect, Effect::None);
+        }
+    }
+
+    // Stopping
+
+    #[test]
+    fn exit_confirmed_with_next_starts_next() {
+        let (state, effect) = step(
+            EngineState::Stopping {
+                next: Some(target_b()),
+            },
+            Event::ExitConfirmed,
+        );
+        assert_eq!(state, EngineState::Starting(target_b()));
+        assert_eq!(effect, Effect::Spawn(target_b()));
+    }
+
+    #[test]
+    fn exit_confirmed_without_next_stops() {
+        let (state, effect) = step(EngineState::Stopping { next: None }, Event::ExitConfirmed);
+        assert_eq!(state, EngineState::Stopped);
+        assert_eq!(effect, Effect::None);
+    }
+
+    #[test]
+    fn crash_while_stopping_with_next_starts_next() {
+        let (state, effect) = step(
+            EngineState::Stopping {
+                next: Some(target_b()),
+            },
+            Event::ChildCrashed("killed".into()),
+        );
+        assert_eq!(state, EngineState::Starting(target_b()));
+        assert_eq!(effect, Effect::Spawn(target_b()));
+    }
+
+    #[test]
+    fn crash_while_stopping_without_next_stops() {
+        let (state, effect) = step(
+            EngineState::Stopping { next: None },
+            Event::ChildCrashed("killed".into()),
+        );
+        assert_eq!(state, EngineState::Stopped);
+        assert_eq!(effect, Effect::None);
+    }
+
+    #[test]
+    fn latest_target_wins_while_stopping() {
+        let (state, effect) = step(
+            EngineState::Stopping {
+                next: Some(target_a()),
+            },
+            Event::Ensure(target_b()),
+        );
+        assert_eq!(
+            state,
+            EngineState::Stopping {
+                next: Some(target_b())
+            }
+        );
+        assert_eq!(effect, Effect::None);
+    }
+
+    #[test]
+    fn unload_while_stopping_clears_next() {
+        let (state, effect) = step(
+            EngineState::Stopping {
+                next: Some(target_a()),
+            },
+            Event::Unload,
+        );
+        assert_eq!(state, EngineState::Stopping { next: None });
+        assert_eq!(effect, Effect::None);
+    }
+
+    #[test]
+    fn stopping_ignores_superseded_events() {
+        for event in [
+            Event::SpawnedHealthy { port: 4242 },
+            Event::SpawnFailed("stale".into()),
+            Event::IdleExpired,
+        ] {
+            let (state, effect) = step(
+                EngineState::Stopping {
+                    next: Some(target_a()),
+                },
+                event,
+            );
+            assert_eq!(
+                state,
+                EngineState::Stopping {
+                    next: Some(target_a())
+                }
+            );
+            assert_eq!(effect, Effect::None);
+        }
+    }
+
+    // Failed
+
+    #[test]
+    fn ensure_from_failed_retries() {
+        let (state, effect) = step(
+            EngineState::Failed("boom".into()),
+            Event::Ensure(target_a()),
+        );
+        assert_eq!(state, EngineState::Starting(target_a()));
+        assert_eq!(effect, Effect::Spawn(target_a()));
+    }
+
+    #[test]
+    fn unload_from_failed_stops() {
+        let (state, effect) = step(EngineState::Failed("boom".into()), Event::Unload);
+        assert_eq!(state, EngineState::Stopped);
+        assert_eq!(effect, Effect::None);
+    }
+
+    #[test]
+    fn failed_ignores_stale_process_events() {
+        for event in [
+            Event::SpawnedHealthy { port: 4242 },
+            Event::SpawnFailed("stale".into()),
+            Event::ExitConfirmed,
+            Event::ChildCrashed("stale".into()),
+            Event::IdleExpired,
+        ] {
+            let (state, effect) = step(EngineState::Failed("boom".into()), event);
+            assert_eq!(state, EngineState::Failed("boom".into()));
+            assert_eq!(effect, Effect::None);
+        }
+    }
+}
diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs
index 8b882e3b..959acc9b 100644
--- a/src-tauri/src/lib.rs
+++ b/src-tauri/src/lib.rs
@@ -18,6 +18,7 @@
 pub mod commands;
 pub mod config;
 pub mod database;
+pub mod engine;
 pub mod export;
 pub mod history;
 pub mod images;

From 233e73f4c487fa62da49615fa5a0563067d2c186 Mon Sep 17 00:00:00 2001
From: Logan Nguyen <lg.131.dev@gmail.com>
Date: Wed, 10 Jun 2026 16:38:26 -0500
Subject: [PATCH 04/13] feat: add engine runner actor with process trait seam

Signed-off-by: Logan Nguyen <lg.131.dev@gmail.com>
---
 docs/configurations.md           |    7 +-
 src-tauri/Cargo.lock             |   11 +
 src-tauri/Cargo.toml             |    4 +-
 src-tauri/src/config/defaults.rs |   36 +
 src-tauri/src/engine/mod.rs      |    2 +
 src-tauri/src/engine/process.rs  |  241 +++++++
 src-tauri/src/engine/runner.rs   | 1122 ++++++++++++++++++++++++++++++
 7 files changed, 1420 insertions(+), 3 deletions(-)
 create mode 100644 src-tauri/src/engine/process.rs
 create mode 100644 src-tauri/src/engine/runner.rs

diff --git a/docs/configurations.md b/docs/configurations.md
index a735d6c3..8322cdbe 100644
--- a/docs/configurations.md
+++ b/docs/configurations.md
@@ -161,7 +161,7 @@ Each `[[inference.providers]]` block has these fields:
 
 If the active model has been removed from Ollama between launches, Thuki silently falls back to the first installed model the next time you open the picker. If no models are installed at all, the next request surfaces a "Model not found" error with the exact `ollama pull <name>` command to run.
 
-The table below also lists the baked-in safety limits that govern Thuki's communication with the Ollama HTTP API. None are tunable.
+The table below also lists the baked-in safety limits that govern Thuki's communication with the Ollama HTTP API and the lifecycle of the built-in engine process. None are tunable.
 
 | Constant                                    | Default  | Tunable? | Why not tunable                                                                                                                                                         | Bounds | Description                                                                                                                                                                          |
 | :------------------------------------------ | :------- | :------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@@ -171,6 +171,11 @@ The table below also lists the baked-in safety limits that govern Thuki's commun
 | `MAX_OLLAMA_SHOW_BODY_BYTES`                | `4 MiB`  | No       | Defense-in-depth bound on attacker-controlled response body. Same rationale as `MAX_OLLAMA_TAGS_BODY_BYTES`.                                                            | —      | The largest `/api/show` response body Thuki will accept. Full Modelfiles and parameters can be sizable, but 4 MiB is well above any real model; larger responses are rejected.      |
 | `MAX_MODEL_SLUG_LEN`                        | `256 B`  | No       | Defense-in-depth bound on adversarial input. Real Ollama slugs are a handful of characters; capping the length stops malformed values long before any network or DB work. | —      | The longest model slug Thuki will accept from `set_active_model`. Anything longer is rejected immediately by `validate_model_slug`.                                                  |
 | `VRAM_POLL_INTERVAL_SECS`                   | `5 s`    | No       | Tuning this trades responsiveness against localhost polling load; 5 s is the sweet spot for loopback calls and matches Ollama's internal TTL resolution granularity. | —      | How often Thuki polls Ollama's `/api/ps` to detect VRAM changes made outside Thuki (for example, running `ollama stop` or a TTL expiry). The Settings panel VRAM indicator reflects these changes within one interval. |
+| `ENGINE_HEALTH_DEADLINE_SECS`               | `300 s`  | No       | Engine lifecycle contract: this bounds the worst-case "warming up" wait the UI can show before a start is declared failed, so changing it alters the UX contract rather than tuning a preference. | —      | How long Thuki waits for a freshly spawned built-in engine to pass its `/health` check before giving up and killing the process. Large GGUF models loading from a cold disk can legitimately take minutes, so the deadline is generous. |
+| `ENGINE_HEALTH_POLL_INTERVAL_MS`            | `250 ms` | No       | Pure loopback-load tuning: 250 ms detects readiness promptly without hammering the local server while it is busy loading the model.                                  | —      | How often Thuki probes the built-in engine's `/health` endpoint while it starts up. A `503` answer means the model is still loading and the poll continues; `200` means ready.       |
+| `ENGINE_IDLE_CHECK_INTERVAL_SECS`           | `30 s`   | No       | Internal timer granularity behind the user-facing `idle_unload_minutes` knob; 30 s keeps the unload within a minute-scale setting's precision at negligible cost.    | —      | How often the engine runner checks whether `idle_unload_minutes` of inactivity have elapsed and the built-in engine should be stopped to free RAM.                                   |
+| `ENGINE_HEALTH_PROBE_TIMEOUT_SECS`          | `5 s`    | No       | Internal lifecycle contract between the runner and the engine process. A wedged-but-connected server must not park the poll loop forever; loopback probes are normally instant so 5 s is generous. The overall startup wait is bounded separately by `ENGINE_HEALTH_DEADLINE_SECS`. | —      | How long a single `/health` GET is allowed to take inside the startup poll loop. If the engine has accepted the TCP connection but stopped responding, this timeout causes the probe to return an error (treated as "keep waiting" and retried after `ENGINE_HEALTH_POLL_INTERVAL_MS`). |
+| `ENGINE_COMMAND_QUEUE_CAPACITY`             | `64`     | No       | Bounds memory under command bursts; 64 slots is ample for all UI-driven traffic (Ensure, Touch, SetIdleMinutes, Shutdown) under any realistic usage pattern. | —      | Capacity of the bounded `mpsc` channel that carries commands from `EngineHandle` to the runner actor task. Back-pressure from a full queue is not observable in normal use. |
 
 ### `[prompt]`
 
diff --git a/src-tauri/Cargo.lock b/src-tauri/Cargo.lock
index 9b370f7b..37632062 100644
--- a/src-tauri/Cargo.lock
+++ b/src-tauri/Cargo.lock
@@ -3605,6 +3605,16 @@ version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
 
+[[package]]
+name = "signal-hook-registry"
+version = "1.4.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b"
+dependencies = [
+ "errno",
+ "libc",
+]
+
 [[package]]
 name = "simd-adler32"
 version = "0.3.8"
@@ -4389,6 +4399,7 @@ dependencies = [
  "mio",
  "parking_lot",
  "pin-project-lite",
+ "signal-hook-registry",
  "socket2",
  "tokio-macros",
  "windows-sys 0.61.2",
diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml
index 912ebf31..3d6b9ab1 100644
--- a/src-tauri/Cargo.toml
+++ b/src-tauri/Cargo.toml
@@ -27,7 +27,7 @@ tauri-plugin-dialog = "2"
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"
 reqwest = { version = "0.13.4", features = ["json", "stream"] }
-tokio = { version = "1.52.3", features = ["macros"] }
+tokio = { version = "1.52.3", features = ["macros", "process", "time", "sync"] }
 futures-util = "0.3.32"
 tokio-util = "0.7"
 toml = "1.0"
@@ -56,7 +56,7 @@ objc2-vision = { version = "0.3", features = ["VNRecognizeTextRequest", "VNReque
 
 [dev-dependencies]
 mockito = "1"
-tokio = { version = "1", features = ["rt", "macros", "net", "io-util"] }
+tokio = { version = "1", features = ["rt", "macros", "net", "io-util", "test-util"] }
 wiremock = "0.6"
 tempfile = "3"
 
diff --git a/src-tauri/src/config/defaults.rs b/src-tauri/src/config/defaults.rs
index c352843a..742441f7 100644
--- a/src-tauri/src/config/defaults.rs
+++ b/src-tauri/src/config/defaults.rs
@@ -63,6 +63,42 @@ pub const BOUNDS_KEEP_WARM_INACTIVITY_MINUTES: (i32, i32) = (-1, 1440);
 pub const DEFAULT_IDLE_UNLOAD_MINUTES: u32 = 0;
 pub const BOUNDS_IDLE_UNLOAD_MINUTES: (u32, u32) = (0, 1440);
 
+// Built-in engine lifecycle constants: baked in because they define the
+// engine runner's startup and idle-check contract, not a user preference.
+
+/// Wall-clock deadline (seconds) for a freshly spawned built-in engine to
+/// pass its `/health` check before the spawn is declared failed. Large GGUF
+/// models on a cold disk can take minutes to load, so the deadline is
+/// generous. Not user-tunable: it bounds the worst-case "warming up" wait the
+/// UI can present, so changing it alters the UX contract.
+pub const ENGINE_HEALTH_DEADLINE_SECS: u64 = 300;
+
+/// Interval (milliseconds) between `/health` probes while the built-in
+/// engine starts up. Not user-tunable: pure loopback-load tuning; 250 ms
+/// detects readiness promptly without hammering the local server while it is
+/// busy loading the model.
+pub const ENGINE_HEALTH_POLL_INTERVAL_MS: u64 = 250;
+
+/// Timeout (seconds) for a single `/health` GET inside the poll loop. Bounds
+/// a server that has accepted the TCP connection but stopped responding: a
+/// wedged-but-connected server would otherwise park the poll loop indefinitely.
+/// Loopback health probes are normally instant; 5 s is generous. Not
+/// user-tunable: internal lifecycle contract between the runner and the engine
+/// process; `ENGINE_HEALTH_DEADLINE_SECS` separately bounds the whole startup.
+pub const ENGINE_HEALTH_PROBE_TIMEOUT_SECS: u64 = 5;
+
+/// Interval (seconds) between idle-unload checks in the engine runner. Not
+/// user-tunable: internal timer granularity behind the user-facing
+/// `idle_unload_minutes` knob; 30 s keeps the unload within a minute-scale
+/// setting's precision at negligible cost.
+pub const ENGINE_IDLE_CHECK_INTERVAL_SECS: u64 = 30;
+
+/// Capacity of the engine runner command queue. Not user-tunable: bounds
+/// memory under command bursts; 64 slots is ample for all UI-driven traffic
+/// (Ensure, Touch, SetIdleMinutes, Shutdown) with no back-pressure under
+/// normal use.
+pub const ENGINE_COMMAND_QUEUE_CAPACITY: usize = 64;
+
 /// Built-in secretary persona prompt. User overrides via `[prompt] system` in
 /// the config file. The slash-command appendix is composed on top at load time
 /// and is never written back to the file.
diff --git a/src-tauri/src/engine/mod.rs b/src-tauri/src/engine/mod.rs
index b0777d78..856f337f 100644
--- a/src-tauri/src/engine/mod.rs
+++ b/src-tauri/src/engine/mod.rs
@@ -6,4 +6,6 @@
 //! a confirmed exit before spawning the new one. This module hosts the pieces
 //! of that lifecycle; the pure residency state machine lives in [`state`].
 
+pub mod process;
+pub mod runner;
 pub mod state;
diff --git a/src-tauri/src/engine/process.rs b/src-tauri/src/engine/process.rs
new file mode 100644
index 00000000..50a4df0f
--- /dev/null
+++ b/src-tauri/src/engine/process.rs
@@ -0,0 +1,241 @@
+//! Process seam for the built-in engine.
+//!
+//! [`EngineProcess`] abstracts spawning and health-probing the bundled
+//! `llama-server` binary so the runner actor in [`super::runner`] can be
+//! driven entirely by fakes in tests. The real implementation,
+//! [`TokioEngineProcess`], is a thin wrapper around `tokio::process` and
+//! `reqwest`; all the logic around it (health classification, the startup
+//! poll loop, command-line construction) lives in pure functions tested
+//! directly.
+
+use std::future::Future;
+use std::path::PathBuf;
+use std::time::Duration;
+
+use async_trait::async_trait;
+
+use crate::config::defaults::ENGINE_HEALTH_PROBE_TIMEOUT_SECS;
+
+/// Everything needed to launch one engine process.
+#[derive(Debug, Clone, PartialEq)]
+pub struct SpawnArgs {
+    /// Absolute path to the GGUF model file (`-m`).
+    pub model_path: PathBuf,
+    /// Optional multimodal projector file for vision models (`--mmproj`).
+    pub mmproj_path: Option<PathBuf>,
+    /// Context window size in tokens (`--ctx-size`).
+    pub num_ctx: u32,
+    /// Loopback port the server is told to listen on (`--port`).
+    pub port: u16,
+}
+
+/// A live engine process the runner can await or kill.
+#[async_trait]
+pub trait EngineChild: Send {
+    /// Resolves when the process exits (normally or by kill).
+    async fn wait_exit(&mut self);
+    /// Kills the process and waits for the exit to land.
+    async fn kill(&mut self);
+}
+
+/// Spawn-and-probe seam between the runner actor and the operating system.
+#[async_trait]
+pub trait EngineProcess: Send + Sync + 'static {
+    /// Launches one engine process described by `args`.
+    async fn spawn(&self, args: &SpawnArgs) -> Result<Box<dyn EngineChild>, String>;
+    /// Binds `127.0.0.1:0` and returns the free port the OS handed out.
+    fn free_port(&self) -> Result<u16, String>;
+    /// One GET `http://127.0.0.1:{port}/health` returning the raw HTTP
+    /// status code (`Err` on transport error). Implementations stay thin:
+    /// the poll loop and the status classification are the pure functions
+    /// [`poll_until_healthy`] and [`classify_health_status`] in this module.
+    async fn health_probe(&self, port: u16) -> Result<u16, String>;
+}
+
+/// What one health probe result means for the startup poll loop.
+#[derive(Debug, PartialEq)]
+pub enum HealthVerdict {
+    /// The server is up and the model is loaded.
+    Ready,
+    /// The server answered but the model is still loading; keep polling.
+    Wait,
+    /// The server answered with an unexpected status; abort the startup.
+    Fail(u16),
+}
+
+/// Pure: `200` means ready, `503` means keep waiting (`llama-server` returns
+/// 503 while the model loads), anything else is a startup failure.
+pub fn classify_health_status(status: u16) -> HealthVerdict {
+    match status {
+        200 => HealthVerdict::Ready,
+        503 => HealthVerdict::Wait,
+        other => HealthVerdict::Fail(other),
+    }
+}
+
+/// Polls `probe` every `interval` and returns `Ok(())` once it reports ready.
+/// Returns `Err` when a probe yields a hard-failure status or when `deadline`
+/// has elapsed (checked before each probe). A transport error counts as "keep
+/// waiting" because the server socket may not be accepting yet early in a spawn.
+pub async fn poll_until_healthy<F, Fut>(
+    probe: F,
+    deadline: Duration,
+    interval: Duration,
+) -> Result<(), String>
+where
+    F: Fn() -> Fut,
+    Fut: Future<Output = Result<u16, String>>,
+{
+    let start = tokio::time::Instant::now();
+    loop {
+        if start.elapsed() >= deadline {
+            return Err("engine did not become healthy before the deadline".to_string());
+        }
+        if let Ok(status) = probe().await {
+            match classify_health_status(status) {
+                HealthVerdict::Ready => return Ok(()),
+                HealthVerdict::Fail(code) => {
+                    return Err(format!("engine health check returned HTTP {code}"));
+                }
+                HealthVerdict::Wait => {}
+            }
+        }
+        tokio::time::sleep(interval).await;
+    }
+}
+
+/// Real [`EngineProcess`] backed by `tokio::process` and `reqwest`.
+///
+/// Every trait method is a thin OS or network wrapper excluded from
+/// coverage; the logic they lean on ([`classify_health_status`],
+/// [`poll_until_healthy`], [`llama_server_args`]) is tested directly, and
+/// the runner that consumes the trait is tested through fakes.
+pub struct TokioEngineProcess {
+    /// Configured path to the bundled `llama-server` binary.
+    pub binary: PathBuf,
+    /// Shared HTTP client used for health probes.
+    pub client: reqwest::Client,
+}
+
+/// Pure: the `llama-server` command line for one spawn:
+/// `-m <model> [--mmproj <p>] --ctx-size <n> --host 127.0.0.1 --port <p> --no-webui`.
+fn llama_server_args(args: &SpawnArgs) -> Vec<std::ffi::OsString> {
+    let mut argv: Vec<std::ffi::OsString> = vec!["-m".into(), args.model_path.clone().into()];
+    if let Some(mmproj) = &args.mmproj_path {
+        argv.push("--mmproj".into());
+        argv.push(mmproj.clone().into());
+    }
+    argv.push("--ctx-size".into());
+    argv.push(args.num_ctx.to_string().into());
+    argv.push("--host".into());
+    argv.push("127.0.0.1".into());
+    argv.push("--port".into());
+    argv.push(args.port.to_string().into());
+    argv.push("--no-webui".into());
+    argv
+}
+
+/// A spawned `llama-server` process.
+struct TokioChild {
+    inner: tokio::process::Child,
+}
+
+#[async_trait]
+impl EngineChild for TokioChild {
+    #[cfg_attr(coverage_nightly, coverage(off))]
+    async fn wait_exit(&mut self) {
+        let _ = self.inner.wait().await;
+    }
+
+    #[cfg_attr(coverage_nightly, coverage(off))]
+    async fn kill(&mut self) {
+        let _ = self.inner.start_kill();
+        let _ = self.inner.wait().await;
+    }
+}
+
+#[async_trait]
+impl EngineProcess for TokioEngineProcess {
+    #[cfg_attr(coverage_nightly, coverage(off))]
+    async fn spawn(&self, args: &SpawnArgs) -> Result<Box<dyn EngineChild>, String> {
+        let child = tokio::process::Command::new(&self.binary)
+            .args(llama_server_args(args))
+            .stdout(std::process::Stdio::null())
+            .stderr(std::process::Stdio::null())
+            .kill_on_drop(true)
+            .spawn()
+            .map_err(|e| e.to_string())?;
+        Ok(Box::new(TokioChild { inner: child }))
+    }
+
+    #[cfg_attr(coverage_nightly, coverage(off))]
+    fn free_port(&self) -> Result<u16, String> {
+        let listener = std::net::TcpListener::bind("127.0.0.1:0").map_err(|e| e.to_string())?;
+        Ok(listener.local_addr().map_err(|e| e.to_string())?.port())
+    }
+
+    #[cfg_attr(coverage_nightly, coverage(off))]
+    async fn health_probe(&self, port: u16) -> Result<u16, String> {
+        let probe = self
+            .client
+            .get(format!("http://127.0.0.1:{port}/health"))
+            .send();
+        tokio::time::timeout(Duration::from_secs(ENGINE_HEALTH_PROBE_TIMEOUT_SECS), probe)
+            .await
+            .map_err(|_| "health probe timed out".to_string())?
+            .map(|response| response.status().as_u16())
+            .map_err(|e| e.to_string())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn args(mmproj: Option<&str>) -> SpawnArgs {
+        SpawnArgs {
+            model_path: PathBuf::from("/models/a.gguf"),
+            mmproj_path: mmproj.map(PathBuf::from),
+            num_ctx: 8192,
+            port: 4242,
+        }
+    }
+
+    #[test]
+    fn llama_server_args_without_mmproj() {
+        assert_eq!(
+            llama_server_args(&args(None)),
+            vec![
+                "-m",
+                "/models/a.gguf",
+                "--ctx-size",
+                "8192",
+                "--host",
+                "127.0.0.1",
+                "--port",
+                "4242",
+                "--no-webui",
+            ]
+        );
+    }
+
+    #[test]
+    fn llama_server_args_with_mmproj() {
+        assert_eq!(
+            llama_server_args(&args(Some("/models/a.mmproj.gguf"))),
+            vec![
+                "-m",
+                "/models/a.gguf",
+                "--mmproj",
+                "/models/a.mmproj.gguf",
+                "--ctx-size",
+                "8192",
+                "--host",
+                "127.0.0.1",
+                "--port",
+                "4242",
+                "--no-webui",
+            ]
+        );
+    }
+}
diff --git a/src-tauri/src/engine/runner.rs b/src-tauri/src/engine/runner.rs
new file mode 100644
index 00000000..f28f3b68
--- /dev/null
+++ b/src-tauri/src/engine/runner.rs
@@ -0,0 +1,1122 @@
+//! Async runner actor that drives the pure engine state machine.
+//!
+//! The actor owns the live child process, the in-flight health poll, and the
+//! pending chat waiters. Every transition flows through [`state::step`]; the
+//! actor only executes the effects the machine requests. The invariants
+//! proven by the tests below: at most one engine process is ever alive, a
+//! model switch kills the old process and waits for its confirmed exit
+//! before spawning the new one, the latest requested target wins, and every
+//! chat waiter resolves with the port on `Loaded` or with a typed
+//! [`EnsureError`].
+//!
+//! All timing goes through `tokio::time` so the tests run under paused time.
+
+use std::future::Future;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::time::Duration;
+
+use tokio::sync::{mpsc, oneshot, watch};
+
+use super::process::{poll_until_healthy, EngineChild, EngineProcess, SpawnArgs};
+use super::state::{step, Effect, EngineState, Event, Target};
+use crate::config::defaults::{
+    ENGINE_COMMAND_QUEUE_CAPACITY, ENGINE_HEALTH_DEADLINE_SECS, ENGINE_HEALTH_POLL_INTERVAL_MS,
+};
+
+/// Snapshot of the engine lifecycle published through the status watch.
+#[derive(Debug, Clone, serde::Serialize, PartialEq)]
+pub struct EngineStatus {
+    /// `"stopped"`, `"starting"`, `"loaded"`, `"stopping"`, or `"failed"`.
+    pub state: String,
+    /// Path of the starting/loaded model, or of the next one while stopping; otherwise empty.
+    pub model_path: String,
+    /// The serving port; only set while loaded.
+    pub port: Option<u16>,
+    /// The error message; only set while failed.
+    pub error: Option<String>,
+}
+
+/// Why an [`EngineHandle::ensure_loaded`] call did not produce a port.
+#[derive(Debug, PartialEq)]
+pub enum EnsureError {
+    /// A newer Ensure replaced this request's target, or an unload aborted the start.
+    Superseded,
+    /// Start failed (spawn or health check) or the runner task is gone; holds the message.
+    StartFailed(String),
+}
+
+/// Messages from the handle to the actor task.
+enum Command {
+    Ensure {
+        target: Target,
+        reply: oneshot::Sender<Result<u16, EnsureError>>, // the port on success
+    },
+    Unload {
+        reply: oneshot::Sender<()>, // fired after the unload dispatch has settled
+    },
+    Touch, // resets the idle clock
+    SetIdleMinutes(u32), // 0 disables idle unload
+    Shutdown {
+        reply: oneshot::Sender<()>, // fired after the child is killed, before the actor ends
+    },
+}
+
+/// Cloneable handle to the engine runner actor.
+#[derive(Clone)]
+pub struct EngineHandle {
+    cmd_tx: mpsc::Sender<Command>, // bounded queue into the actor task
+    status_rx: watch::Receiver<EngineStatus>, // cloned out by `status()`
+}
+
+impl EngineHandle {
+    /// Spawns the actor task (needs a Tokio runtime). `idle_minutes == 0` disables idle unload.
+    pub fn spawn(
+        process: Arc<dyn EngineProcess>,
+        idle_minutes: u32,
+        idle_check_interval: Duration,
+    ) -> Self {
+        let (cmd_tx, cmd_rx) = mpsc::channel(ENGINE_COMMAND_QUEUE_CAPACITY);
+        let (status_tx, status_rx) = watch::channel(status_of(&EngineState::Stopped));
+        let core = Core {
+            process,
+            state: EngineState::Stopped,
+            child: None,
+            health: None,
+            pending_port: 0, // placeholder until the first spawn picks a port
+            waiters: Vec::new(),
+            status_tx,
+        };
+        tokio::spawn(run_actor(core, cmd_rx, idle_minutes, idle_check_interval));
+        Self { cmd_tx, status_rx }
+    }
+
+    /// Resolves with the port once the target is loaded; waits through any
+    /// in-flight transitions (kill, exit confirmation, spawn, health check).
+    pub async fn ensure_loaded(&self, target: Target) -> Result<u16, EnsureError> {
+        let (reply, rx) = oneshot::channel();
+        let send = self.cmd_tx.send(Command::Ensure { target, reply }).await;
+        if send.is_err() {
+            // The receiver is gone, i.e. the actor task has already ended.
+            return Err(EnsureError::StartFailed(
+                "engine runner is not running".to_string(),
+            ));
+        }
+        rx.await.unwrap_or_else(|_| {
+            Err(EnsureError::StartFailed(
+                "engine runner stopped before the model loaded".to_string(),
+            ))
+        })
+    }
+
+    /// Stops the engine and releases its memory. Resolved once the process
+    /// exit is confirmed.
+    pub async fn unload(&self) {
+        let (reply, rx) = oneshot::channel();
+        if self.cmd_tx.send(Command::Unload { reply }).await.is_ok() {
+            let _ = rx.await; // a dropped reply sender also ends the wait
+        }
+    }
+
+    /// Marks chat activity so the idle-unload timer starts over.
+    pub fn touch(&self) {
+        let _ = self.cmd_tx.try_send(Command::Touch); // full or closed queue: skipped silently
+    }
+
+    /// Applies a new idle-unload setting (`0` disables it) without restarting the actor.
+    pub async fn set_idle_minutes(&self, minutes: u32) {
+        let _ = self.cmd_tx.send(Command::SetIdleMinutes(minutes)).await;
+    }
+
+    /// Kills any live child, confirms its exit, and ends the actor task.
+    pub async fn shutdown(&self) {
+        let (reply, rx) = oneshot::channel();
+        if self.cmd_tx.send(Command::Shutdown { reply }).await.is_ok() {
+            let _ = rx.await; // a dropped reply sender also ends the wait
+        }
+    }
+
+    /// A watch receiver for the lifecycle status; it always holds the latest snapshot.
+    pub fn status(&self) -> watch::Receiver<EngineStatus> {
+        self.status_rx.clone()
+    }
+}
+
+/// Pure projection of the machine state into the published status snapshot.
+fn status_of(state: &EngineState) -> EngineStatus {
+    match state {
+        EngineState::Stopped => EngineStatus {
+            state: "stopped".to_string(),
+            model_path: String::new(),
+            port: None,
+            error: None,
+        },
+        EngineState::Starting(target) => EngineStatus {
+            state: "starting".to_string(),
+            model_path: target.model_path.display().to_string(),
+            port: None, // not published until the engine is loaded
+            error: None,
+        },
+        EngineState::Loaded { target, port } => EngineStatus {
+            state: "loaded".to_string(),
+            model_path: target.model_path.display().to_string(),
+            port: Some(*port),
+            error: None,
+        },
+        EngineState::Stopping { next } => EngineStatus {
+            state: "stopping".to_string(),
+            model_path: next // the target that follows this stop, if any
+                .as_ref()
+                .map(|target| target.model_path.display().to_string())
+                .unwrap_or_default(),
+            port: None,
+            error: None,
+        },
+        EngineState::Failed(error) => EngineStatus {
+            state: "failed".to_string(),
+            model_path: String::new(), // `Failed` carries only the message, no target
+            port: None,
+            error: Some(error.clone()),
+        },
+    }
+}
+
+/// The boxed in-flight health poll for the current spawn; `Ok(())` means healthy.
+type HealthFuture = Pin<Box<dyn Future<Output = Result<(), String>> + Send>>;
+
+/// A pending `ensure_loaded` call: the target it asked for and the one-shot
+/// channel that resolves it with a port or an [`EnsureError`].
+type Waiter = (Target, oneshot::Sender<Result<u16, EnsureError>>);
+
+/// Everything the actor mutates while executing transitions.
+struct Core {
+    process: Arc<dyn EngineProcess>, // spawns children, picks ports, probes health
+    state: EngineState, // current machine state; only replaced with the result of `step`
+    child: Option<Box<dyn EngineChild>>, // the child the actor currently owns, if any
+    health: Option<HealthFuture>, // armed by `begin_spawn`, cleared once resolved or aborted
+    pending_port: u16, // port of the latest spawn; reported with `SpawnedHealthy`
+    waiters: Vec<Waiter>, // unresolved `ensure_loaded` calls
+    status_tx: watch::Sender<EngineStatus>, // publishes `status_of(&state)`
+}
+
+impl Core {
+    /// Feeds one event through the state machine and executes the requested
+    /// effects until the machine settles (a kill chains into an exit
+    /// confirmation, which can chain into the next spawn).
+    async fn dispatch(&mut self, event: Event) {
+        let mut pending = Some(event);
+        while let Some(ev) = pending.take() {
+            let (next, effect) = step(self.state.clone(), ev);
+            self.state = next;
+            self.status_tx.send_replace(status_of(&self.state)); // stored even with no receivers
+            self.settle_waiters();
+            match effect {
+                Effect::None => {}
+                Effect::Kill => {
+                    self.health = None; // abandon the poll of the process being killed
+                    self.kill_child().await;
+                    pending = Some(Event::ExitConfirmed); // fed back through `step` next
+                }
+                Effect::Spawn(target) => {
+                    if let Err(error) = self.begin_spawn(&target).await {
+                        pending = Some(Event::SpawnFailed(error));
+                    }
+                }
+            }
+        }
+    }
+
+    /// Resolves every pending waiter when the machine reaches a settling
+    /// state: `Loaded` resolves matching targets with the port and the rest
+    /// as superseded, `Failed` propagates the error, and `Stopped` (reached
+    /// through an unload that aborted an in-flight start) supersedes
+    /// whatever was still waiting.
+    fn settle_waiters(&mut self) {
+        match &self.state {
+            EngineState::Loaded { target, port } => {
+                let (target, port) = (target.clone(), *port);
+                for (requested, reply) in self.waiters.drain(..) {
+                    let outcome = if requested == target {
+                        Ok(port)
+                    } else {
+                        Err(EnsureError::Superseded)
+                    };
+                    let _ = reply.send(outcome); // the caller may have stopped waiting
+                }
+            }
+            EngineState::Failed(error) => {
+                let error = error.clone();
+                for (_, reply) in self.waiters.drain(..) {
+                    let _ = reply.send(Err(EnsureError::StartFailed(error.clone())));
+                }
+            }
+            EngineState::Stopped => {
+                for (_, reply) in self.waiters.drain(..) {
+                    let _ = reply.send(Err(EnsureError::Superseded));
+                }
+            }
+            _ => {} // remaining states: waiters stay queued
+        }
+    }
+
+    /// Grabs a free port, spawns the process, and arms the health poll. The
+    /// health result is consumed by the actor loop, racing against an
+    /// unexpected child exit.
+    async fn begin_spawn(&mut self, target: &Target) -> Result<(), String> {
+        let port = self.process.free_port()?;
+        let args = SpawnArgs {
+            model_path: target.model_path.clone(),
+            mmproj_path: target.mmproj_path.clone(),
+            num_ctx: target.num_ctx,
+            port,
+        };
+        let child = self.process.spawn(&args).await?; // on error nothing below is touched
+        self.child = Some(child);
+        self.pending_port = port;
+        let process = Arc::clone(&self.process);
+        self.health = Some(Box::pin(poll_until_healthy(
+            move || {
+                let process = Arc::clone(&process); // each probe gets its own handle
+                async move { process.health_probe(port).await }
+            },
+            Duration::from_secs(ENGINE_HEALTH_DEADLINE_SECS),
+            Duration::from_millis(ENGINE_HEALTH_POLL_INTERVAL_MS),
+        )));
+        Ok(())
+    }
+
+    /// Kills the live child, if any, and waits for its exit to land.
+    async fn kill_child(&mut self) {
+        if let Some(mut child) = self.child.take() {
+            child.kill().await; // `child` is dropped right after the kill returns
+        }
+    }
+}
+
+/// What woke the actor loop.
+enum Wake {
+    Cmd(Option<Command>), // `None` once every handle has been dropped
+    Health(Result<(), String>), // outcome of the armed health poll
+    ChildExit, // the owned child exited without the actor killing it
+    Tick, // periodic idle check
+}
+
+/// The single actor task: owns the [`Core`], serializes every transition,
+/// and multiplexes commands, the health poll, child-exit detection, and the
+/// idle timer.
+async fn run_actor(
+    mut core: Core,
+    mut cmd_rx: mpsc::Receiver<Command>,
+    mut idle_minutes: u32,
+    idle_check_interval: Duration,
+) {
+    let mut last_activity = tokio::time::Instant::now(); // idle clock
+    let mut ticker = tokio::time::interval(idle_check_interval); // first tick fires at once
+    loop {
+        let health_armed = core.health.is_some();
+        let child_armed = core.child.is_some();
+        let wake = {
+            // health_fut and child_fut are taken as mutable borrows of
+            // disjoint fields before the select! macro so the borrow checker
+            // can see they do not alias core. The `if health_armed` /
+            // `if child_armed` guards disable the branch when the option is
+            // None, so the `.expect("armed")` inside each async block is
+            // unreachable when the branch is inactive and can never fire.
+            let health_fut = core.health.as_mut();
+            let child_fut = core.child.as_mut();
+            tokio::select! {
+                biased; // poll in source order: commands, then health, exit, tick
+                cmd = cmd_rx.recv() => Wake::Cmd(cmd),
+                result = async { health_fut.expect("armed").await }, if health_armed => {
+                    Wake::Health(result)
+                }
+                _ = async { child_fut.expect("armed").wait_exit().await }, if child_armed => {
+                    Wake::ChildExit
+                }
+                _ = ticker.tick() => Wake::Tick,
+            }
+        };
+        match wake {
+            Wake::Cmd(Some(Command::Ensure { target, reply })) => {
+                last_activity = tokio::time::Instant::now();
+                core.waiters.push((target.clone(), reply)); // queued first so dispatch can settle it
+                core.dispatch(Event::Ensure(target)).await;
+            }
+            Wake::Cmd(Some(Command::Unload { reply })) => {
+                core.dispatch(Event::Unload).await;
+                let _ = reply.send(());
+            }
+            Wake::Cmd(Some(Command::Touch)) => {
+                last_activity = tokio::time::Instant::now();
+            }
+            Wake::Cmd(Some(Command::SetIdleMinutes(minutes))) => {
+                idle_minutes = minutes; // read again on the next tick
+            }
+            Wake::Cmd(Some(Command::Shutdown { reply })) => {
+                core.health = None;
+                core.kill_child().await;
+                core.state = EngineState::Stopped; // set directly, bypassing `step`
+                core.status_tx.send_replace(status_of(&core.state));
+                let _ = reply.send(());
+                break; // `core` is dropped on return, along with any pending waiters
+            }
+            // Every handle is gone; tear down like a shutdown. Pending
+            // waiters are dropped, which `ensure_loaded` maps to a typed
+            // error.
+            Wake::Cmd(None) => {
+                core.health = None;
+                core.kill_child().await;
+                core.state = EngineState::Stopped;
+                core.status_tx.send_replace(status_of(&core.state));
+                break;
+            }
+            Wake::Health(Ok(())) => {
+                core.health = None; // the poll future has completed
+                let port = core.pending_port;
+                // Reset the idle clock so a slow model load cannot be
+                // idle-killed immediately: the idle window starts from the
+                // moment the engine becomes Loaded, not from when Ensure was
+                // received.
+                last_activity = tokio::time::Instant::now();
+                core.dispatch(Event::SpawnedHealthy { port }).await;
+            }
+            // Health gave up (deadline or hard failure): the process is
+            // running but useless, so kill it before reporting the failure.
+            Wake::Health(Err(error)) => {
+                core.health = None;
+                core.kill_child().await;
+                core.dispatch(Event::SpawnFailed(error)).await;
+            }
+            Wake::ChildExit => {
+                core.child = None; // already exited: dropped without issuing a kill
+                core.health = None;
+                core.dispatch(Event::ChildCrashed(
+                    "engine process exited unexpectedly".to_string(),
+                ))
+                .await;
+            }
+            Wake::Tick => {
+                if idle_minutes > 0 // 0 disables idle unload
+                    && matches!(core.state, EngineState::Loaded { .. })
+                    && last_activity.elapsed() >= Duration::from_secs(u64::from(idle_minutes) * 60)
+                {
+                    core.dispatch(Event::IdleExpired).await;
+                }
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::config::defaults::ENGINE_IDLE_CHECK_INTERVAL_SECS;
+    use crate::engine::process::{classify_health_status, HealthVerdict};
+    use std::cell::RefCell;
+    use std::collections::VecDeque;
+    use std::path::PathBuf;
+    use std::sync::Mutex;
+
+    // ── Fakes ──────────────────────────────────────────────────────────
+
+    #[derive(Default)]
+    struct FakeInner {
+        spawns: Vec<SpawnArgs>, // args of every successful spawn, in order
+        spawn_errors: VecDeque<String>, // each queued message fails one spawn
+        ports_handed: u16, // number of `free_port` calls so far
+        live: usize, // children currently alive
+        max_live: usize, // high-water mark of `live`
+        kills: usize, // number of `FakeChild::kill` calls
+        probes_served: usize, // scripted probe results consumed
+        log: Vec<String>, // ordered "spawn <path>" / "kill" / "exit" entries
+        current_exit: Option<Arc<watch::Sender<bool>>>, // exit signal of the latest spawn
+    }
+
+    /// Scriptable [`EngineProcess`]: records every spawn, hands out
+    /// sequential ports, serves health probes from a channel (a probe with
+    /// no queued result blocks, so paused time never runs away), and exposes
+    /// crash injection.
+    struct FakeProcess {
+        inner: Arc<Mutex<FakeInner>>, // shared with every `FakeChild`
+        health_tx: mpsc::UnboundedSender<Result<u16, String>>, // fed by `push_health`
+        health_rx: tokio::sync::Mutex<mpsc::UnboundedReceiver<Result<u16, String>>>,
+        /// When set, every probe answers 503 immediately instead of waiting
+        /// for a scripted result; paused time then drives the poll loop all
+        /// the way to its deadline.
+        always_wait: std::sync::atomic::AtomicBool,
+    }
+
+    impl FakeProcess {
+        fn new() -> Arc<Self> {
+            let (health_tx, health_rx) = mpsc::unbounded_channel();
+            Arc::new(Self {
+                inner: Arc::new(Mutex::new(FakeInner::default())),
+                health_tx,
+                health_rx: tokio::sync::Mutex::new(health_rx),
+                always_wait: std::sync::atomic::AtomicBool::new(false), // scripted mode by default
+            })
+        }
+
+        fn push_health(&self, result: Result<u16, String>) {
+            self.health_tx.send(result).expect("receiver lives in self");
+        }
+
+        fn push_spawn_error(&self, message: &str) {
+            self.inner
+                .lock()
+                .unwrap()
+                .spawn_errors
+                .push_back(message.to_string()); // consumed by the next `spawn` call
+        }
+
+        /// Makes the live child exit without a kill being issued.
+        fn crash_current(&self) {
+            let exit = {
+                let mut inner = self.inner.lock().unwrap();
+                inner.live -= 1;
+                inner.log.push("exit".to_string()); // "exit" with no preceding "kill"
+                inner.current_exit.take().expect("a child is live")
+            };
+            let _ = exit.send(true); // wakes `FakeChild::wait_exit`
+        }
+
+        fn snapshot<T>(&self, read: impl Fn(&FakeInner) -> T) -> T {
+            read(&self.inner.lock().unwrap()) // lock is released when the call returns
+        }
+    }
+
+    struct FakeChild {
+        inner: Arc<Mutex<FakeInner>>, // counters and log shared with `FakeProcess`
+        exit_tx: Arc<watch::Sender<bool>>, // also stored in `FakeInner::current_exit`
+        exit_rx: watch::Receiver<bool>, // becomes `true` on kill or injected crash
+    }
+
+    #[async_trait::async_trait]
+    impl EngineChild for FakeChild {
+        async fn wait_exit(&mut self) {
+            let _ = self.exit_rx.wait_for(|exited| *exited).await; // pending until the flag is set
+        }
+
+        async fn kill(&mut self) {
+            {
+                let mut inner = self.inner.lock().unwrap();
+                inner.kills += 1;
+                inner.log.push("kill".to_string());
+                inner.live -= 1; // the fake exits as part of the kill itself
+                inner.log.push("exit".to_string());
+            }
+            let _ = self.exit_tx.send(true);
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl EngineProcess for FakeProcess {
+        async fn spawn(&self, args: &SpawnArgs) -> Result<Box<dyn EngineChild>, String> {
+            let mut inner = self.inner.lock().unwrap();
+            if let Some(message) = inner.spawn_errors.pop_front() {
+                return Err(message); // a scripted failure records no spawn
+            }
+            inner.spawns.push(args.clone());
+            inner.live += 1;
+            inner.max_live = inner.max_live.max(inner.live);
+            inner
+                .log
+                .push(format!("spawn {}", args.model_path.display()));
+            let (exit_tx, exit_rx) = watch::channel(false);
+            let exit_tx = Arc::new(exit_tx);
+            inner.current_exit = Some(Arc::clone(&exit_tx)); // lets `crash_current` reach this child
+            Ok(Box::new(FakeChild {
+                inner: Arc::clone(&self.inner),
+                exit_tx,
+                exit_rx,
+            }))
+        }
+
+        fn free_port(&self) -> Result<u16, String> {
+            let mut inner = self.inner.lock().unwrap();
+            let port = 40000 + inner.ports_handed; // sequential: 40000, 40001, ...
+            inner.ports_handed += 1;
+            Ok(port)
+        }
+
+        async fn health_probe(&self, _port: u16) -> Result<u16, String> {
+            if self.always_wait.load(std::sync::atomic::Ordering::SeqCst) {
+                return Ok(503); // not counted in `probes_served`
+            }
+            let result = self
+                .health_rx
+                .lock()
+                .await
+                .recv() // pends until a test pushes a result
+                .await
+                .expect("sender lives in self");
+            self.inner.lock().unwrap().probes_served += 1;
+            result
+        }
+    }
+
+    // ── Helpers ────────────────────────────────────────────────────────
+
+    fn target(name: &str) -> Target {
+        Target {
+            model_path: PathBuf::from(format!("/models/{name}.gguf")),
+            mmproj_path: None, // constant: test targets differ only by model path
+            num_ctx: 4096,
+        }
+    }
+
+    fn spawn_handle(process: &Arc<FakeProcess>, idle_minutes: u32) -> EngineHandle {
+        EngineHandle::spawn(
+            Arc::clone(process) as Arc<dyn EngineProcess>,
+            idle_minutes, // 0 disables idle unload
+            Duration::from_secs(ENGINE_IDLE_CHECK_INTERVAL_SECS),
+        )
+    }
+
+    async fn load(handle: &EngineHandle, process: &Arc<FakeProcess>, name: &str) -> u16 {
+        process.push_health(Ok(200)); // script one healthy probe so the load can finish
+        handle.ensure_loaded(target(name)).await.expect("loads")
+    }
+
+    /// Sleeps in 1 ms steps of paused time until the fake satisfies `predicate`.
+    async fn wait_until(process: &Arc<FakeProcess>, predicate: impl Fn(&FakeInner) -> bool) {
+        while !process.snapshot(&predicate) {
+            tokio::time::sleep(Duration::from_millis(1)).await; // also lets the actor run
+        }
+    }
+
+    async fn wait_for_state(rx: &mut watch::Receiver<EngineStatus>, want: &str) {
+        while rx.borrow_and_update().state != want {
+            rx.changed().await.expect("status channel open"); // parks until the next publish
+        }
+    }
+
+    /// Yields 32 times so the actor can drain its ready work; time does not advance.
+    async fn drain_actor() {
+        for _ in 0..32 {
+            tokio::task::yield_now().await;
+        }
+    }
+
+    // ── Pure helpers: classification and poll loop ─────────────────────
+
+    #[test]
+    fn classify_health_status_maps_statuses() {
+        assert_eq!(classify_health_status(200), HealthVerdict::Ready);
+        assert_eq!(classify_health_status(503), HealthVerdict::Wait); // keep polling
+        assert_eq!(classify_health_status(500), HealthVerdict::Fail(500)); // carries the code
+        assert_eq!(classify_health_status(404), HealthVerdict::Fail(404));
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn poll_until_healthy_ready_immediately() {
+        let calls = RefCell::new(0);
+        let result = poll_until_healthy(
+            || {
+                *calls.borrow_mut() += 1;
+                async { Ok::<u16, String>(200) }
+            },
+            Duration::from_secs(5),
+            Duration::from_millis(250),
+        )
+        .await;
+        assert_eq!(result, Ok(()));
+        assert_eq!(*calls.borrow(), 1); // a single probe, no retry
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn poll_until_healthy_waits_through_503_and_transport_errors() {
+        let script = RefCell::new(VecDeque::from([
+            Ok(503),
+            Err("connection refused".to_string()),
+            Ok(200),
+        ]));
+        let start = tokio::time::Instant::now();
+        let result = poll_until_healthy(
+            || {
+                let next = script.borrow_mut().pop_front().expect("script covers");
+                async move { next }
+            },
+            Duration::from_secs(5),
+            Duration::from_millis(250),
+        )
+        .await;
+        assert_eq!(result, Ok(()));
+        assert!(script.borrow().is_empty()); // all three scripted results were consumed
+        assert_eq!(start.elapsed(), Duration::from_millis(500)); // two 250 ms waits
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn poll_until_healthy_deadline_exceeded() {
+        let result = poll_until_healthy(
+            || async { Ok::<u16, String>(503) }, // never becomes ready
+            Duration::from_secs(1),
+            Duration::from_millis(250),
+        )
+        .await;
+        assert_eq!(
+            result,
+            Err("engine did not become healthy before the deadline".to_string())
+        );
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn poll_until_healthy_fail_status_aborts() {
+        let calls = RefCell::new(0);
+        let result = poll_until_healthy(
+            || {
+                *calls.borrow_mut() += 1;
+                async { Ok::<u16, String>(500) }
+            },
+            Duration::from_secs(5),
+            Duration::from_millis(250),
+        )
+        .await;
+        assert_eq!(
+            result,
+            Err("engine health check returned HTTP 500".to_string())
+        );
+        assert_eq!(*calls.borrow(), 1); // aborted on the first probe
+    }
+
+    // ── Runner: load, reuse, switch ────────────────────────────────────
+
+    #[tokio::test(start_paused = true)]
+    async fn ensure_spawns_and_reports_loaded() {
+        let process = FakeProcess::new();
+        let handle = spawn_handle(&process, 0);
+
+        let port = load(&handle, &process, "a").await;
+
+        assert_eq!(port, 40000); // the first port the fake hands out
+        assert_eq!(
+            *handle.status().borrow(),
+            EngineStatus {
+                state: "loaded".to_string(),
+                model_path: "/models/a.gguf".to_string(),
+                port: Some(40000),
+                error: None,
+            }
+        );
+        assert_eq!(
+            process.snapshot(|i| i.spawns.clone()),
+            vec![SpawnArgs {
+                model_path: PathBuf::from("/models/a.gguf"),
+                mmproj_path: None,
+                num_ctx: 4096,
+                port: 40000, // same port as the one returned to the caller
+            }]
+        );
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn ensure_waits_for_health() {
+        let process = FakeProcess::new();
+        let handle = spawn_handle(&process, 0);
+        process.push_health(Ok(503));
+        process.push_health(Ok(503));
+
+        let h = handle.clone();
+        let waiter = tokio::spawn(async move { h.ensure_loaded(target("a")).await });
+        wait_until(&process, |i| i.probes_served == 2).await;
+
+        assert!(!waiter.is_finished()); // 503s alone must not resolve the waiter
+        assert_eq!(handle.status().borrow().state, "starting");
+
+        process.push_health(Ok(200));
+        assert_eq!(waiter.await.unwrap(), Ok(40000));
+        assert_eq!(handle.status().borrow().state, "loaded");
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn second_ensure_same_target_reuses() {
+        let process = FakeProcess::new();
+        let handle = spawn_handle(&process, 0);
+
+        let first = load(&handle, &process, "a").await;
+        let second = handle.ensure_loaded(target("a")).await.expect("reuses");
+
+        assert_eq!(first, second);
+        assert_eq!(process.snapshot(|i| i.spawns.len()), 1); // no second spawn
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn ensure_new_target_kills_then_spawns_once_exit_confirmed() {
+        let process = FakeProcess::new();
+        let handle = spawn_handle(&process, 0);
+
+        load(&handle, &process, "a").await;
+        let port = load(&handle, &process, "b").await;
+
+        assert_eq!(port, 40001); // second spawn, second port
+        assert_eq!(
+            process.snapshot(|i| i.log.clone()),
+            vec![
+                "spawn /models/a.gguf",
+                "kill",
+                "exit",
+                "spawn /models/b.gguf"
+            ]
+        );
+        assert_eq!(process.snapshot(|i| i.max_live), 1); // never two processes at once
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn rapid_ensures_converge_to_latest() {
+        let process = FakeProcess::new();
+        let handle = spawn_handle(&process, 0);
+
+        let h1 = handle.clone();
+        let w1 = tokio::spawn(async move { h1.ensure_loaded(target("a")).await });
+        wait_until(&process, |i| i.spawns.len() == 1).await;
+        let h2 = handle.clone();
+        let w2 = tokio::spawn(async move { h2.ensure_loaded(target("b")).await });
+        wait_until(&process, |i| i.spawns.len() == 2).await;
+        let h3 = handle.clone();
+        let w3 = tokio::spawn(async move { h3.ensure_loaded(target("c")).await });
+        wait_until(&process, |i| i.spawns.len() == 3).await;
+
+        process.push_health(Ok(200)); // the only health result scripted in this test
+
+        assert_eq!(w3.await.unwrap(), Ok(40002));
+        assert_eq!(w1.await.unwrap(), Err(EnsureError::Superseded));
+        assert_eq!(w2.await.unwrap(), Err(EnsureError::Superseded));
+        let status = handle.status().borrow().clone();
+        assert_eq!(status.state, "loaded");
+        assert_eq!(status.model_path, "/models/c.gguf");
+        assert_eq!(process.snapshot(|i| i.max_live), 1);
+        assert_eq!(process.snapshot(|i| i.kills), 2); // "a" and "b" were each killed once
+    }
+
+    // ── Runner: unload, waiters, failures ──────────────────────────────
+
+    #[tokio::test(start_paused = true)]
+    async fn unload_kills_and_stops() {
+        let process = FakeProcess::new();
+        let handle = spawn_handle(&process, 0);
+
+        load(&handle, &process, "a").await;
+        handle.unload().await; // returns only after the actor has handled the unload
+
+        assert_eq!(handle.status().borrow().state, "stopped");
+        assert_eq!(process.snapshot(|i| i.kills), 1);
+        assert_eq!(process.snapshot(|i| i.live), 0);
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn chat_waiter_mid_transition_resolves_on_loaded() {
+        let process = FakeProcess::new();
+        let handle = spawn_handle(&process, 0);
+        process.push_health(Ok(503));
+
+        let h1 = handle.clone();
+        let w1 = tokio::spawn(async move { h1.ensure_loaded(target("a")).await });
+        wait_until(&process, |i| i.probes_served == 1).await;
+        let h2 = handle.clone();
+        let w2 = tokio::spawn(async move { h2.ensure_loaded(target("a")).await });
+        drain_actor().await; // give the actor a chance to receive the second Ensure
+        assert!(!w1.is_finished());
+        assert!(!w2.is_finished());
+
+        process.push_health(Ok(200));
+
+        assert_eq!(w1.await.unwrap(), Ok(40000));
+        assert_eq!(w2.await.unwrap(), Ok(40000));
+        assert_eq!(process.snapshot(|i| i.spawns.len()), 1); // both waiters shared one spawn
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn spawn_failure_reports_failed_and_waiter_gets_error() {
+        let process = FakeProcess::new();
+        let handle = spawn_handle(&process, 0);
+        process.push_spawn_error("llama-server missing"); // the next spawn fails with this
+
+        let result = handle.ensure_loaded(target("a")).await;
+
+        assert_eq!(
+            result,
+            Err(EnsureError::StartFailed("llama-server missing".to_string()))
+        );
+        let status = handle.status().borrow().clone();
+        assert_eq!(status.state, "failed");
+        assert_eq!(status.error, Some("llama-server missing".to_string()));
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn superseded_waiter_gets_superseded_error() {
+        let process = FakeProcess::new();
+        let handle = spawn_handle(&process, 0);
+
+        let h1 = handle.clone();
+        let w1 = tokio::spawn(async move { h1.ensure_loaded(target("a")).await });
+        wait_until(&process, |i| i.spawns.len() == 1).await;
+        let h2 = handle.clone();
+        let w2 = tokio::spawn(async move { h2.ensure_loaded(target("b")).await });
+        wait_until(&process, |i| i.spawns.len() == 2).await;
+
+        process.push_health(Ok(200)); // scripted only after "b" has been spawned
+
+        assert_eq!(w2.await.unwrap(), Ok(40001));
+        assert_eq!(w1.await.unwrap(), Err(EnsureError::Superseded));
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn health_failure_kills_child_and_reports_failed() {
+        let process = FakeProcess::new();
+        let handle = spawn_handle(&process, 0);
+        process.push_health(Ok(500)); // a hard failure status, not a "keep waiting" 503
+
+        let result = handle.ensure_loaded(target("a")).await;
+
+        assert_eq!(
+            result,
+            Err(EnsureError::StartFailed(
+                "engine health check returned HTTP 500".to_string()
+            ))
+        );
+        assert_eq!(handle.status().borrow().state, "failed");
+        assert_eq!(process.snapshot(|i| i.kills), 1);
+        assert_eq!(process.snapshot(|i| i.live), 0);
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn health_deadline_failure_reports_failed() {
+        let process = FakeProcess::new();
+        process
+            .always_wait
+            .store(true, std::sync::atomic::Ordering::SeqCst);
+        let handle = spawn_handle(&process, 0);
+
+        let result = handle.ensure_loaded(target("a")).await;
+
+        assert_eq!(
+            result,
+            Err(EnsureError::StartFailed(
+                "engine did not become healthy before the deadline".to_string()
+            ))
+        );
+        assert_eq!(handle.status().borrow().state, "failed");
+        assert_eq!(process.snapshot(|i| i.kills), 1);
+        assert_eq!(process.snapshot(|i| i.live), 0);
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn transport_error_during_startup_counts_as_wait() {
+        let process = FakeProcess::new();
+        let handle = spawn_handle(&process, 0);
+        process.push_health(Err("connection refused".to_string()));
+        process.push_health(Ok(200));
+
+        let port = handle.ensure_loaded(target("a")).await.expect("loads");
+
+        assert_eq!(port, 40000);
+        assert_eq!(handle.status().borrow().state, "loaded");
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn unload_mid_start_resolves_waiter_superseded() {
+        let process = FakeProcess::new();
+        let handle = spawn_handle(&process, 0);
+
+        let h1 = handle.clone();
+        let w1 = tokio::spawn(async move { h1.ensure_loaded(target("a")).await });
+        wait_until(&process, |i| i.spawns.len() == 1).await;
+
+        handle.unload().await;
+
+        assert_eq!(w1.await.unwrap(), Err(EnsureError::Superseded));
+        assert_eq!(handle.status().borrow().state, "stopped");
+        assert_eq!(process.snapshot(|i| i.kills), 1);
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn crash_emits_failed_status() {
+        let process = FakeProcess::new();
+        let handle = spawn_handle(&process, 0);
+
+        load(&handle, &process, "a").await;
+        process.crash_current();
+
+        let mut rx = handle.status();
+        wait_for_state(&mut rx, "failed").await;
+        assert_eq!(
+            rx.borrow().error,
+            Some("engine process exited unexpectedly".to_string())
+        );
+        assert_eq!(process.snapshot(|i| i.live), 0);
+        assert_eq!(process.snapshot(|i| i.kills), 0);
+    }
+
+    // ── Runner: idle unload ────────────────────────────────────────────
+
+    #[tokio::test(start_paused = true)]
+    async fn idle_timeout_unloads() {
+        let process = FakeProcess::new();
+        let handle = spawn_handle(&process, 1);
+
+        load(&handle, &process, "a").await;
+
+        let mut rx = handle.status();
+        wait_for_state(&mut rx, "stopped").await;
+        assert_eq!(process.snapshot(|i| i.kills), 1);
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn touch_activity_defers_idle() {
+        let process = FakeProcess::new();
+        let handle = spawn_handle(&process, 1);
+
+        load(&handle, &process, "a").await;
+
+        tokio::time::advance(Duration::from_secs(45)).await;
+        drain_actor().await;
+        assert_eq!(handle.status().borrow().state, "loaded");
+
+        handle.touch();
+        drain_actor().await;
+        tokio::time::advance(Duration::from_secs(45)).await;
+        drain_actor().await;
+        // 90 s since load: without the touch the 60 s tick would have
+        // unloaded already.
+        assert_eq!(handle.status().borrow().state, "loaded");
+
+        let mut rx = handle.status();
+        wait_for_state(&mut rx, "stopped").await;
+        assert_eq!(process.snapshot(|i| i.kills), 1);
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn set_idle_minutes_applies_live() {
+        let process = FakeProcess::new();
+        let handle = spawn_handle(&process, 0);
+
+        load(&handle, &process, "a").await;
+
+        tokio::time::advance(Duration::from_secs(180)).await;
+        drain_actor().await;
+        assert_eq!(handle.status().borrow().state, "loaded");
+
+        handle.set_idle_minutes(1).await;
+
+        let mut rx = handle.status();
+        wait_for_state(&mut rx, "stopped").await;
+        assert_eq!(process.snapshot(|i| i.kills), 1);
+    }
+
+    /// Regression: a slow model load must not trigger an idle-unload
+    /// immediately after becoming Loaded. The idle clock must start at the
+    /// moment the health check returns Ok, not at the Ensure call.
+    ///
+    /// Scenario: idle_minutes = 1, health takes > 60 virtual seconds to
+    /// report Ok (scripted as Wait results while paused time advances past
+    /// the threshold), then Ready. Engine must stay Loaded and unload only
+    /// after a full idle minute of inactivity following load completion.
+    #[tokio::test(start_paused = true)]
+    async fn idle_clock_starts_at_loaded_not_at_ensure() {
+        let process = FakeProcess::new();
+        let handle = spawn_handle(&process, 1);
+
+        // Start loading; push two Wait probes so the health loop blocks.
+        process.push_health(Ok(503));
+        process.push_health(Ok(503));
+        let h = handle.clone();
+        let waiter = tokio::spawn(async move { h.ensure_loaded(target("a")).await });
+
+        // Wait for both Wait probes to be consumed (paused time does not
+        // advance automatically; the actor is blocked on the health channel).
+        wait_until(&process, |i| i.probes_served == 2).await;
+
+        // Advance virtual time past the 60 s idle threshold while the engine
+        // is still Starting (health has not yet returned Ok).
+        tokio::time::advance(Duration::from_secs(90)).await;
+        drain_actor().await;
+
+        // The engine is still Starting; no idle-kill should have fired.
+        assert_eq!(handle.status().borrow().state, "starting");
+
+        // Now let the health check succeed: the engine becomes Loaded and the
+        // idle clock restarts from now (the 90 s spent Starting do not count).
+        process.push_health(Ok(200));
+        assert_eq!(waiter.await.unwrap(), Ok(40000));
+        assert_eq!(handle.status().borrow().state, "loaded");
+
+        // Advance only 45 s: still within the idle window. Engine must stay loaded.
+        tokio::time::advance(Duration::from_secs(45)).await;
+        drain_actor().await;
+        assert_eq!(handle.status().borrow().state, "loaded");
+
+        // Advance past the full idle minute from the Loaded moment: now it unloads.
+        let mut rx = handle.status();
+        wait_for_state(&mut rx, "stopped").await;
+        assert_eq!(process.snapshot(|i| i.kills), 1);
+    }
+
+    // ── Runner: shutdown and teardown ──────────────────────────────────
+
+    #[tokio::test(start_paused = true)]
+    async fn shutdown_kills_child() {
+        let process = FakeProcess::new();
+        let handle = spawn_handle(&process, 0);
+
+        load(&handle, &process, "a").await;
+        handle.shutdown().await;
+
+        assert_eq!(handle.status().borrow().state, "stopped");
+        assert_eq!(process.snapshot(|i| i.kills), 1);
+        assert_eq!(process.snapshot(|i| i.live), 0);
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn shutdown_fails_pending_waiter() {
+        let process = FakeProcess::new();
+        let handle = spawn_handle(&process, 0);
+
+        let h1 = handle.clone();
+        let w1 = tokio::spawn(async move { h1.ensure_loaded(target("a")).await });
+        wait_until(&process, |i| i.spawns.len() == 1).await;
+
+        handle.shutdown().await;
+
+        assert_eq!(
+            w1.await.unwrap(),
+            Err(EnsureError::StartFailed(
+                "engine runner stopped before the model loaded".to_string()
+            ))
+        );
+        assert_eq!(process.snapshot(|i| i.kills), 1);
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn commands_after_shutdown_error_cleanly() {
+        let process = FakeProcess::new();
+        let handle = spawn_handle(&process, 0);
+
+        handle.shutdown().await;
+        drain_actor().await;
+
+        assert_eq!(
+            handle.ensure_loaded(target("a")).await,
+            Err(EnsureError::StartFailed(
+                "engine runner is not running".to_string()
+            ))
+        );
+        handle.unload().await;
+        handle.shutdown().await;
+        handle.touch();
+        handle.set_idle_minutes(5).await;
+        assert_eq!(process.snapshot(|i| i.spawns.len()), 0);
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn dropping_handle_stops_actor_and_kills_child() {
+        let process = FakeProcess::new();
+        let handle = spawn_handle(&process, 0);
+
+        load(&handle, &process, "a").await;
+        let mut rx = handle.status();
+        drop(handle);
+
+        wait_for_state(&mut rx, "stopped").await;
+        assert_eq!(process.snapshot(|i| i.kills), 1);
+        assert_eq!(process.snapshot(|i| i.live), 0);
+    }
+}

From 3f3b30a5ac8cec875af31f8891ae915db7498601 Mon Sep 17 00:00:00 2001
From: Logan Nguyen <lg.131.dev@gmail.com>
Date: Wed, 10 Jun 2026 17:04:24 -0500
Subject: [PATCH 05/13] refactor: make models a directory module

Signed-off-by: Logan Nguyen <lg.131.dev@gmail.com>
---
 src-tauri/src/{models.rs => models/mod.rs} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename src-tauri/src/{models.rs => models/mod.rs} (100%)

diff --git a/src-tauri/src/models.rs b/src-tauri/src/models/mod.rs
similarity index 100%
rename from src-tauri/src/models.rs
rename to src-tauri/src/models/mod.rs

From 55622ee9e1bbadde147b307d4daa28f14ac57c5e Mon Sep 17 00:00:00 2001
From: Logan Nguyen <lg.131.dev@gmail.com>
Date: Wed, 10 Jun 2026 17:23:33 -0500
Subject: [PATCH 06/13] feat: add installed-models manifest table

Signed-off-by: Logan Nguyen <lg.131.dev@gmail.com>
---
 src-tauri/src/database.rs        |   6 +
 src-tauri/src/models/manifest.rs | 447 +++++++++++++++++++++++++++++++
 src-tauri/src/models/mod.rs      |   2 +
 3 files changed, 455 insertions(+)
 create mode 100644 src-tauri/src/models/manifest.rs

diff --git a/src-tauri/src/database.rs b/src-tauri/src/database.rs
index c6f1065f..9bb412bc 100644
--- a/src-tauri/src/database.rs
+++ b/src-tauri/src/database.rs
@@ -206,6 +206,12 @@ fn run_migrations(conn: &Connection) -> SqlResult<()> {
         "  ON conversations(updated_at DESC);",
         "CREATE TABLE IF NOT EXISTS app_config (",
         "  key TEXT PRIMARY KEY, value TEXT NOT NULL);",
+        "CREATE TABLE IF NOT EXISTS installed_models (",
+        "  id TEXT PRIMARY KEY, display_name TEXT NOT NULL, repo TEXT NOT NULL,",
+        "  revision TEXT NOT NULL, file_name TEXT NOT NULL, sha256 TEXT NOT NULL,",
+        "  size_bytes INTEGER NOT NULL, quant TEXT NOT NULL,",
+        "  vision INTEGER NOT NULL, thinking INTEGER NOT NULL,",
+        "  mmproj_file TEXT, mmproj_sha256 TEXT, created_at INTEGER NOT NULL);",
     );
     conn.execute_batch(SCHEMA_DDL)?;
 
diff --git a/src-tauri/src/models/manifest.rs b/src-tauri/src/models/manifest.rs
new file mode 100644
index 00000000..4c941afe
--- /dev/null
+++ b/src-tauri/src/models/manifest.rs
@@ -0,0 +1,447 @@
+/*!
+ * Installed-model manifest: CRUD over the `installed_models` SQLite table.
+ *
+ * Each row represents a GGUF model blob that has been downloaded and
+ * content-addressed into the local blob store. The `id` field
+ * (`"<repo>:<file_name>"`) is the stable public key; `sha256` and
+ * `mmproj_sha256` are content addresses shared across rows (two models can
+ * reference the same mmproj blob).
+ *
+ * All functions take a `&rusqlite::Connection` and are synchronous. Callers
+ * inside async Tauri commands must use `spawn_blocking` or hold the
+ * connection behind a `Mutex`.
+ */
+
+use rusqlite::{params, Connection, OptionalExtension, Result as SqlResult};
+use serde::Serialize;
+
+/// A GGUF model that has been downloaded and recorded in the manifest.
+#[derive(Debug, Clone, PartialEq, Serialize)]
+pub struct InstalledModel {
+    /// Stable key: `"<repo>:<file_name>"`. Uniquely identifies a model
+    /// variant within the blob store.
+    pub id: String,
+    /// Human-readable label shown in the model picker (e.g. "Gemma 3 12B").
+    pub display_name: String,
+    /// Hugging Face repo slug (e.g. "google/gemma-3-12b-it-qat-gguf").
+    pub repo: String,
+    /// 40-hex git commit SHA pinned at download time; provenance anchor.
+    pub revision: String,
+    /// Filename within the repo (e.g. "gemma-3-12b-it-q4_k_m.gguf").
+    pub file_name: String,
+    /// SHA-256 hex digest of the weights blob.
+    pub sha256: String,
+    /// Compressed file size in bytes.
+    pub size_bytes: u64,
+    /// Quantisation label (e.g. "Q4_K_M").
+    pub quant: String,
+    /// Whether the model accepts image inputs.
+    pub vision: bool,
+    /// Whether the model exposes a thinking/scratchpad token stream.
+    pub thinking: bool,
+    /// Filename of the vision projection blob, if any.
+    pub mmproj_file: Option<String>,
+    /// SHA-256 hex digest of the mmproj blob, if any.
+    pub mmproj_sha256: Option<String>,
+}
+
+/// Inserts or replaces a model row in the manifest. If a row with the same
+/// `id` already exists it is replaced in full, so re-downloading a model
+/// always produces an up-to-date entry. `created_at` is set to the current
+/// Unix second timestamp inside this function.
+///
+/// # Errors
+///
+/// Returns a `rusqlite::Error` if the underlying SQL execution fails.
+pub fn insert(conn: &Connection, model: &InstalledModel) -> SqlResult<()> {
+    let created_at = std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .unwrap_or_default()
+        .as_secs() as i64;
+
+    conn.execute(
+        "INSERT OR REPLACE INTO installed_models \
+         (id, display_name, repo, revision, file_name, sha256, size_bytes, \
+          quant, vision, thinking, mmproj_file, mmproj_sha256, created_at) \
+         VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13)",
+        params![
+            model.id,
+            model.display_name,
+            model.repo,
+            model.revision,
+            model.file_name,
+            model.sha256,
+            model.size_bytes as i64,
+            model.quant,
+            model.vision as i32,
+            model.thinking as i32,
+            model.mmproj_file,
+            model.mmproj_sha256,
+            created_at,
+        ],
+    )?;
+    Ok(())
+}
+
+/// Returns all installed models ordered alphabetically by `display_name`.
+///
+/// # Errors
+///
+/// Returns a `rusqlite::Error` if the query fails.
+pub fn list(conn: &Connection) -> SqlResult<Vec<InstalledModel>> {
+    let mut stmt = conn.prepare(
+        "SELECT id, display_name, repo, revision, file_name, sha256, \
+                size_bytes, quant, vision, thinking, mmproj_file, mmproj_sha256 \
+         FROM installed_models ORDER BY display_name",
+    )?;
+    let rows = stmt.query_map([], row_to_model)?;
+    rows.collect()
+}
+
+/// Returns the model with the given `id`, or `None` if not present.
+///
+/// # Errors
+///
+/// Returns a `rusqlite::Error` if the query fails.
+pub fn get(conn: &Connection, id: &str) -> SqlResult<Option<InstalledModel>> {
+    conn.query_row(
+        "SELECT id, display_name, repo, revision, file_name, sha256, \
+                size_bytes, quant, vision, thinking, mmproj_file, mmproj_sha256 \
+         FROM installed_models WHERE id = ?1",
+        params![id],
+        row_to_model,
+    )
+    .optional()
+}
+
+/// Deletes the model row identified by `id` and returns the SHA-256 values
+/// (weights and mmproj) that are no longer referenced by any remaining row.
+///
+/// A blob SHA is included in the return value only when it is not referenced
+/// by any other row in either the `sha256` or `mmproj_sha256` column. The
+/// caller is responsible for removing the orphaned blobs from disk.
+///
+/// Returns an empty `Vec` if no row with the given `id` exists.
+///
+/// # Errors
+///
+/// Returns a `rusqlite::Error` if the delete or the reference-count query fails.
+pub fn delete(conn: &Connection, id: &str) -> SqlResult<Vec<String>> {
+    // Snapshot the SHA values of the row being deleted before it is gone.
+    let target: Option<(String, Option<String>)> = conn
+        .query_row(
+            "SELECT sha256, mmproj_sha256 FROM installed_models WHERE id = ?1",
+            params![id],
+            |row| Ok((row.get::<_, String>(0)?, row.get::<_, Option<String>>(1)?)),
+        )
+        .optional()?;
+
+    let Some((weights_sha, mmproj_sha)) = target else {
+        return Ok(vec![]);
+    };
+
+    conn.execute("DELETE FROM installed_models WHERE id = ?1", params![id])?;
+
+    // Collect candidate SHAs; deduplicate so a row whose weights and mmproj
+    // share one SHA does not produce duplicate return entries.
+    let mut candidates: Vec<String> = vec![weights_sha];
+    if let Some(ref s) = mmproj_sha {
+        if !candidates.contains(s) {
+            candidates.push(s.clone());
+        }
+    }
+
+    // Filter to those no longer referenced by any remaining row.
+    let mut orphans = Vec::new();
+    for sha in candidates {
+        if sha_refcount(conn, &sha)? == 0 {
+            orphans.push(sha);
+        }
+    }
+
+    Ok(orphans)
+}
+
+/// Counts the number of `installed_models` rows that reference `sha` in
+/// either the `sha256` or `mmproj_sha256` column.
+fn sha_refcount(conn: &Connection, sha: &str) -> SqlResult<i64> {
+    conn.query_row(
+        "SELECT COUNT(*) FROM installed_models \
+         WHERE sha256 = ?1 OR mmproj_sha256 = ?1",
+        params![sha],
+        |row| row.get(0),
+    )
+}
+
+/// Maps a SQLite row to an [`InstalledModel`].
+fn row_to_model(row: &rusqlite::Row<'_>) -> SqlResult<InstalledModel> {
+    Ok(InstalledModel {
+        id: row.get(0)?,
+        display_name: row.get(1)?,
+        repo: row.get(2)?,
+        revision: row.get(3)?,
+        file_name: row.get(4)?,
+        sha256: row.get(5)?,
+        size_bytes: row.get::<_, i64>(6)? as u64,
+        quant: row.get(7)?,
+        vision: row.get::<_, i32>(8)? != 0,
+        thinking: row.get::<_, i32>(9)? != 0,
+        mmproj_file: row.get(10)?,
+        mmproj_sha256: row.get(11)?,
+    })
+}
+
+// ─── Tests ───────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::database::open_in_memory;
+
+    fn make_model(id: &str, sha256: &str) -> InstalledModel {
+        InstalledModel {
+            id: id.to_string(),
+            display_name: format!("Model {id}"),
+            repo: "org/repo".to_string(),
+            revision: "a".repeat(40),
+            file_name: format!("{id}.gguf"),
+            sha256: sha256.to_string(),
+            size_bytes: 1_000_000,
+            quant: "Q4_K_M".to_string(),
+            vision: false,
+            thinking: false,
+            mmproj_file: None,
+            mmproj_sha256: None,
+        }
+    }
+
+    fn make_model_with_mmproj(id: &str, sha256: &str, mmproj_sha: &str) -> InstalledModel {
+        InstalledModel {
+            mmproj_file: Some(format!("{id}-mmproj.gguf")),
+            mmproj_sha256: Some(mmproj_sha.to_string()),
+            ..make_model(id, sha256)
+        }
+    }
+
+    #[test]
+    fn insert_and_list_roundtrip() {
+        let conn = open_in_memory().unwrap();
+        let m = make_model("org/repo:model.gguf", "sha_weights_1");
+        insert(&conn, &m).unwrap();
+
+        let rows = list(&conn).unwrap();
+        assert_eq!(rows.len(), 1);
+        assert_eq!(rows[0].id, m.id);
+        assert_eq!(rows[0].sha256, m.sha256);
+        assert_eq!(rows[0].mmproj_file, None);
+        assert_eq!(rows[0].mmproj_sha256, None);
+    }
+
+    #[test]
+    fn insert_and_list_roundtrip_with_mmproj() {
+        let conn = open_in_memory().unwrap();
+        let m = make_model_with_mmproj("org/repo:model.gguf", "sha_w", "sha_mm");
+        insert(&conn, &m).unwrap();
+
+        let rows = list(&conn).unwrap();
+        assert_eq!(rows.len(), 1);
+        assert_eq!(
+            rows[0].mmproj_file,
+            Some("org/repo:model.gguf-mmproj.gguf".to_string())
+        );
+        assert_eq!(rows[0].mmproj_sha256, Some("sha_mm".to_string()));
+    }
+
+    #[test]
+    fn list_is_ordered_by_display_name() {
+        let conn = open_in_memory().unwrap();
+        let mut b = make_model("id_b", "sha_b");
+        b.display_name = "Zebra Model".to_string();
+        let mut a = make_model("id_a", "sha_a");
+        a.display_name = "Alpha Model".to_string();
+        insert(&conn, &b).unwrap();
+        insert(&conn, &a).unwrap();
+
+        let rows = list(&conn).unwrap();
+        assert_eq!(rows[0].display_name, "Alpha Model");
+        assert_eq!(rows[1].display_name, "Zebra Model");
+    }
+
+    #[test]
+    fn get_by_id_finds_row() {
+        let conn = open_in_memory().unwrap();
+        let m = make_model("org/repo:find.gguf", "sha_find");
+        insert(&conn, &m).unwrap();
+
+        let found = get(&conn, "org/repo:find.gguf").unwrap();
+        assert!(found.is_some());
+        assert_eq!(found.unwrap().sha256, "sha_find");
+    }
+
+    #[test]
+    fn get_by_id_returns_none_for_missing() {
+        let conn = open_in_memory().unwrap();
+        let result = get(&conn, "does/not:exist.gguf").unwrap();
+        assert!(result.is_none());
+    }
+
+    #[test]
+    fn delete_returns_orphaned_blobs() {
+        let conn = open_in_memory().unwrap();
+        let m = make_model_with_mmproj("org/repo:model.gguf", "sha_w", "sha_mm");
+        insert(&conn, &m).unwrap();
+
+        let orphans = delete(&conn, "org/repo:model.gguf").unwrap();
+        // Both blobs are now unreferenced.
+        assert_eq!(orphans.len(), 2);
+        assert!(orphans.contains(&"sha_w".to_string()));
+        assert!(orphans.contains(&"sha_mm".to_string()));
+
+        // Row is gone.
+        assert!(get(&conn, "org/repo:model.gguf").unwrap().is_none());
+    }
+
+    #[test]
+    fn delete_returns_orphaned_blobs_no_mmproj() {
+        let conn = open_in_memory().unwrap();
+        let m = make_model("org/repo:simple.gguf", "sha_only");
+        insert(&conn, &m).unwrap();
+
+        let orphans = delete(&conn, "org/repo:simple.gguf").unwrap();
+        assert_eq!(orphans, vec!["sha_only".to_string()]);
+    }
+
+    #[test]
+    fn delete_keeps_shared_blob() {
+        let conn = open_in_memory().unwrap();
+        // Two models share the same mmproj SHA.
+        let m1 = make_model_with_mmproj("org/repo:model1.gguf", "sha_w1", "sha_shared_mm");
+        let m2 = make_model_with_mmproj("org/repo:model2.gguf", "sha_w2", "sha_shared_mm");
+        insert(&conn, &m1).unwrap();
+        insert(&conn, &m2).unwrap();
+
+        // Delete model1; its weights blob is orphaned but the shared mmproj is not.
+        let orphans = delete(&conn, "org/repo:model1.gguf").unwrap();
+        assert_eq!(orphans, vec!["sha_w1".to_string()]);
+        assert!(!orphans.contains(&"sha_shared_mm".to_string()));
+    }
+
+    #[test]
+    fn delete_keeps_sha_referenced_as_mmproj_by_another_row() {
+        let conn = open_in_memory().unwrap();
+        // model1's weights SHA is also used as model2's mmproj SHA.
+        let m1 = make_model("org/repo:model1.gguf", "sha_cross");
+        let mut m2 = make_model("org/repo:model2.gguf", "sha_w2");
+        m2.mmproj_sha256 = Some("sha_cross".to_string());
+        m2.mmproj_file = Some("mmproj.gguf".to_string());
+        insert(&conn, &m1).unwrap();
+        insert(&conn, &m2).unwrap();
+
+        // Deleting model1: sha_cross still referenced by model2's mmproj column.
+        let orphans = delete(&conn, "org/repo:model1.gguf").unwrap();
+        assert!(!orphans.contains(&"sha_cross".to_string()));
+    }
+
+    #[test]
+    fn duplicate_install_upserts() {
+        let conn = open_in_memory().unwrap();
+        let m1 = make_model("org/repo:model.gguf", "sha_v1");
+        insert(&conn, &m1).unwrap();
+
+        // Re-insert with a different display_name and sha256.
+        let mut m2 = make_model("org/repo:model.gguf", "sha_v2");
+        m2.display_name = "Updated Name".to_string();
+        insert(&conn, &m2).unwrap();
+
+        let rows = list(&conn).unwrap();
+        assert_eq!(rows.len(), 1, "upsert must not create a second row");
+        assert_eq!(rows[0].sha256, "sha_v2");
+        assert_eq!(rows[0].display_name, "Updated Name");
+    }
+
+    #[test]
+    fn delete_nonexistent_returns_empty() {
+        let conn = open_in_memory().unwrap();
+        let orphans = delete(&conn, "does/not:exist.gguf").unwrap();
+        assert!(orphans.is_empty());
+    }
+
+    #[test]
+    fn vision_and_thinking_flags_roundtrip() {
+        let conn = open_in_memory().unwrap();
+        let m = InstalledModel {
+            vision: true,
+            thinking: true,
+            ..make_model("org/repo:vt.gguf", "sha_vt")
+        };
+        insert(&conn, &m).unwrap();
+
+        let found = get(&conn, "org/repo:vt.gguf").unwrap().unwrap();
+        assert!(found.vision);
+        assert!(found.thinking);
+    }
+
+    #[test]
+    fn size_bytes_roundtrip_large_value() {
+        let conn = open_in_memory().unwrap();
+        let m = InstalledModel {
+            size_bytes: u32::MAX as u64 + 1,
+            ..make_model("org/repo:big.gguf", "sha_big")
+        };
+        insert(&conn, &m).unwrap();
+
+        let found = get(&conn, "org/repo:big.gguf").unwrap().unwrap();
+        assert_eq!(found.size_bytes, u32::MAX as u64 + 1);
+    }
+
+    // ── SQL-error paths (the `?` Err arms) ──────────────────────────────────
+    // Each test breaks the schema (drops the table or swaps it for a view) so
+    // the next call hits a real SQL error, exercising the `?` propagation
+    // branches that cannot be reached against a healthy schema.
+
+    #[test]
+    fn insert_propagates_sql_error_when_table_absent() {
+        let conn = open_in_memory().unwrap();
+        conn.execute_batch("DROP TABLE installed_models;").unwrap();
+        let m = make_model("x:y.gguf", "sha");
+        assert!(insert(&conn, &m).is_err());
+    }
+
+    #[test]
+    fn list_propagates_sql_error_when_table_absent() {
+        let conn = open_in_memory().unwrap();
+        conn.execute_batch("DROP TABLE installed_models;").unwrap();
+        assert!(list(&conn).is_err());
+    }
+
+    #[test]
+    fn delete_propagates_sql_error_when_table_absent() {
+        let conn = open_in_memory().unwrap();
+        // Insert a row first so the id is known to have existed, then drop the
+        // table so the first statement in `delete` (the snapshot SELECT) errors.
+        let m = make_model("x:y.gguf", "sha_d");
+        insert(&conn, &m).unwrap();
+        conn.execute_batch("DROP TABLE installed_models;").unwrap();
+        // The snapshot SELECT now fails because the table is gone.
+        assert!(delete(&conn, "x:y.gguf").is_err());
+    }
+
+    #[test]
+    fn delete_propagates_sql_error_on_delete_statement() {
+        // Insert a row then replace the table with a view so the snapshot
+        // SELECT still works (returning the row) but the DELETE statement
+        // fails because the target is now a view, not a base table.
+        // This exercises the `?` Err arm on the DELETE execute call.
+        let conn = open_in_memory().unwrap();
+        let m = make_model("x:y.gguf", "sha_rd");
+        insert(&conn, &m).unwrap();
+        // Rename the real table and create a non-updatable view in its place.
+        conn.execute_batch(
+            "ALTER TABLE installed_models RENAME TO installed_models_real; \
+             CREATE VIEW installed_models AS SELECT * FROM installed_models_real;",
+        )
+        .unwrap();
+        // snapshot SELECT works (reads through the view); DELETE on a view fails.
+        assert!(delete(&conn, "x:y.gguf").is_err());
+    }
+}
diff --git a/src-tauri/src/models/mod.rs b/src-tauri/src/models/mod.rs
index c9abf511..e3d1bbaf 100644
--- a/src-tauri/src/models/mod.rs
+++ b/src-tauri/src/models/mod.rs
@@ -15,6 +15,8 @@
  * the active model is `None` and the user is prompted to pick one.
  */
 
+pub mod manifest;
+
 use std::collections::HashMap;
 use std::sync::Mutex;
 

From 0f3f3c693dc522ae2ef6df921084395665eef97e Mon Sep 17 00:00:00 2001
From: Logan Nguyen <lg.131.dev@gmail.com>
Date: Wed, 10 Jun 2026 17:41:35 -0500
Subject: [PATCH 07/13] feat: add content-addressed model blob store

Signed-off-by: Logan Nguyen <lg.131.dev@gmail.com>
---
 src-tauri/Cargo.lock            |   2 +
 src-tauri/Cargo.toml            |   2 +
 src-tauri/src/models/mod.rs     |   1 +
 src-tauri/src/models/storage.rs | 316 ++++++++++++++++++++++++++++++++
 4 files changed, 321 insertions(+)
 create mode 100644 src-tauri/src/models/storage.rs

diff --git a/src-tauri/Cargo.lock b/src-tauri/Cargo.lock
index 37632062..39a9aed2 100644
--- a/src-tauri/Cargo.lock
+++ b/src-tauri/Cargo.lock
@@ -4305,6 +4305,7 @@ dependencies = [
  "futures-util",
  "html-escape",
  "image",
+ "libc",
  "mockito",
  "objc2",
  "objc2-app-kit",
@@ -4316,6 +4317,7 @@ dependencies = [
  "semver",
  "serde",
  "serde_json",
+ "sha2",
  "tauri",
  "tauri-build",
  "tauri-nspanel",
diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml
index 3d6b9ab1..df20fb47 100644
--- a/src-tauri/Cargo.toml
+++ b/src-tauri/Cargo.toml
@@ -43,6 +43,8 @@ thiserror = "2"
 time = { version = "0.3", features = ["std"] }
 async-trait = "0.1"
 semver = "1"
+sha2 = "0.10"
+libc = "0.2"
 
 [target.'cfg(target_os = "macos")'.dependencies]
 tauri-nspanel = { git = "https://github.com/ahkohd/tauri-nspanel", branch = "v2.1" }
diff --git a/src-tauri/src/models/mod.rs b/src-tauri/src/models/mod.rs
index e3d1bbaf..efb15e34 100644
--- a/src-tauri/src/models/mod.rs
+++ b/src-tauri/src/models/mod.rs
@@ -16,6 +16,7 @@
  */
 
 pub mod manifest;
+pub mod storage;
 
 use std::collections::HashMap;
 use std::sync::Mutex;
diff --git a/src-tauri/src/models/storage.rs b/src-tauri/src/models/storage.rs
new file mode 100644
index 00000000..a1413a3d
--- /dev/null
+++ b/src-tauri/src/models/storage.rs
@@ -0,0 +1,316 @@
+/*!
+ * Content-addressed model blob store.
+ *
+ * Downloaded GGUF files land here in two stages:
+ *
+ * 1. The downloader writes bytes into `root/tmp/<sha256>.partial` so
+ *    interrupted downloads can be resumed from the already-written offset.
+ * 2. On completion the store verifies the file by streaming it through
+ *    SHA-256 (chunked copy; the file is never held in memory whole) and, on
+ *    match, atomically renames it into `root/blobs/<sha256>`. A mismatch
+ *    deletes the partial and returns [`StorageError::VerifyFailed`].
+ *
+ * `free_disk_bytes` is a thin `libc::statfs` wrapper used by callers to show
+ * a low-disk warning before starting a download. Treating `None` as "unknown"
+ * and skipping the warning is safe; the function never panics.
+ */
+
+use std::io;
+use std::path::PathBuf;
+
+use sha2::{Digest, Sha256};
+
+/// Errors returned by [`ModelStore`] operations.
+#[derive(Debug, thiserror::Error)]
+pub enum StorageError {
+    /// The partial's SHA-256 (`actual`) did not match the requested digest (`expected`).
+    #[error("download did not verify: expected sha256 {expected}, got {actual}")]
+    VerifyFailed { expected: String, actual: String },
+    /// Any I/O failure (missing partial, permission error, rename failure), shown verbatim.
+    #[error(transparent)]
+    Io(#[from] io::Error),
+}
+
+/// Content-addressed store rooted at a caller-supplied directory (in the app
+/// this is `<app_data>/models`).
+///
+/// Layout:
+/// - `root/blobs/<sha256>`: completed, verified blobs.
+/// - `root/tmp/<sha256>.partial`: in-flight downloads (resume-safe).
+pub struct ModelStore {
+    root: PathBuf, // parent directory of the `blobs/` and `tmp/` subdirectories
+}
+
+impl ModelStore {
+    /// Creates the store handle and eagerly creates the `blobs/` and `tmp/`
+    /// subdirectories so callers do not have to guard against missing dirs.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if either subdirectory cannot be created.
+    pub fn new(root: PathBuf) -> Result<Self, io::Error> {
+        std::fs::create_dir_all(root.join("blobs"))?;
+        std::fs::create_dir_all(root.join("tmp"))?;
+        Ok(Self { root })
+    }
+
+    /// Path where a verified blob is stored: `root/blobs/<sha256>`.
+    pub fn blob_path(&self, sha256: &str) -> PathBuf {
+        self.root.join("blobs").join(sha256)
+    }
+
+    /// Path for an in-flight download: `root/tmp/<sha256>.partial`.
+    pub fn partial_path(&self, sha256: &str) -> PathBuf {
+        self.root.join("tmp").join(format!("{sha256}.partial"))
+    }
+
+    /// Streams `root/tmp/<sha256>.partial` through SHA-256 (buffered copy,
+    /// never whole-file in memory). On hash match the partial is atomically
+    /// renamed into `root/blobs/<sha256>` and the blob path is returned.
+    /// On mismatch the partial is deleted and [`StorageError::VerifyFailed`]
+    /// is returned. `sha256` must be a lowercase hex digest; the comparison
+    /// is case-sensitive.
+    pub fn verify_and_install(&self, sha256: &str) -> Result<PathBuf, StorageError> {
+        let partial = self.partial_path(sha256);
+        let mut file = std::fs::File::open(&partial)?;
+
+        let mut hasher = Sha256::new();
+        io::copy(&mut file, &mut hasher)?; // hasher is the `Write` sink: bytes are hashed as read
+        let actual = format!("{:x}", hasher.finalize());
+
+        if actual != sha256 {
+            // Best-effort delete; ignore secondary I/O errors.
+            let _ = std::fs::remove_file(&partial);
+            return Err(StorageError::VerifyFailed {
+                expected: sha256.to_string(),
+                actual,
+            });
+        }
+
+        let blob = self.blob_path(sha256);
+        std::fs::rename(&partial, &blob)?; // moves the file into place; the bytes are not re-copied
+        Ok(blob)
+    }
+
+    /// Removes each blob in `shas` from `root/blobs/`. Missing files are
+    /// silently ignored so callers do not need to pre-check existence.
+    pub fn remove_blobs(&self, shas: &[String]) -> io::Result<()> {
+        for sha in shas {
+            let path = self.blob_path(sha);
+            match std::fs::remove_file(&path) {
+                Ok(()) => {}
+                Err(e) if e.kind() == io::ErrorKind::NotFound => {}
+                Err(e) => return Err(e), // first hard failure aborts; later shas are not attempted
+            }
+        }
+        Ok(())
+    }
+
+    /// Returns the byte length of an in-flight partial download, or `None`
+    /// when its metadata cannot be read (typically: no partial exists for
+    /// `sha256`). Used by the downloader to resume from the already-written
+    /// offset. Inherently racy with a concurrent writer: the downloader must
+    /// tolerate the partial changing between this call and opening the file.
+    pub fn existing_partial_len(&self, sha256: &str) -> Option<u64> {
+        let meta = std::fs::metadata(self.partial_path(sha256)).ok()?;
+        Some(meta.len())
+    }
+}
+
+/// Free bytes available on the volume holding `path` (`f_bavail` x `f_bsize`).
+///
+/// Thin `libc::statfs` wrapper. Callers must treat `None` as "unknown" and
+/// skip disk-space warnings rather than blocking the download.
+///
+/// Not covered by the cargo coverage gate: this is a direct OS syscall with
+/// no branching logic beyond error propagation, making branch-level
+/// instrumentation meaningless here.
+#[cfg_attr(coverage_nightly, coverage(off))]
+pub fn free_disk_bytes(path: &std::path::Path) -> Option<u64> {
+    use std::ffi::CString;
+    use std::os::unix::ffi::OsStrExt; // Unix-only: exposes the path's raw bytes
+
+    let c_path = CString::new(path.as_os_str().as_bytes()).ok()?; // interior NUL byte => None
+    // SAFETY: `buf` is a valid, zeroed `libc::statfs` on the stack and
+    // `c_path` is a valid null-terminated C string that outlives the call.
+    // `buf` is read only after `statfs` reports success (return value 0).
+    unsafe {
+        let mut buf: libc::statfs = std::mem::zeroed();
+        if libc::statfs(c_path.as_ptr(), &mut buf) == 0 && buf.f_bsize > 0 {
+            (buf.f_bavail as u64).checked_mul(buf.f_bsize as u64) // overflow => None
+        } else {
+            None
+        }
+    }
+}
+
+// ─── Tests ───────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use sha2::{Digest, Sha256};
+    use tempfile::TempDir;
+
+    /// Build a fresh store rooted at a temporary directory.
+    fn make_store() -> (TempDir, ModelStore) {
+        let dir = TempDir::new().unwrap();
+        let store = ModelStore::new(dir.path().to_path_buf()).unwrap();
+        (dir, store)
+    }
+
+    /// Compute the hex SHA-256 of `data`.
+    fn sha256_of(data: &[u8]) -> String {
+        format!("{:x}", Sha256::digest(data))
+    }
+
+    /// Write `data` into the store's partial slot for `sha256`.
+    fn write_partial(store: &ModelStore, sha256: &str, data: &[u8]) {
+        std::fs::write(store.partial_path(sha256), data).unwrap();
+    }
+
+    // ── Path helpers ─────────────────────────────────────────────────────────
+
+    #[test]
+    fn blob_path_is_content_addressed() {
+        let (_dir, store) = make_store();
+        let p = store.blob_path("abc123");
+        assert!(p.ends_with("blobs/abc123"));
+    }
+
+    #[test]
+    fn partial_path_stable_for_resume() {
+        let (_dir, store) = make_store();
+        let p = store.partial_path("abc123");
+        assert!(p.ends_with("tmp/abc123.partial"));
+        // Calling twice must return the same path (stable across calls).
+        assert_eq!(store.partial_path("abc123"), p);
+    }
+
+    // ── verify_and_install ───────────────────────────────────────────────────
+
+    #[test]
+    fn install_renames_atomically() {
+        let (_dir, store) = make_store();
+        let data = b"hello content-addressed world";
+        let sha = sha256_of(data);
+
+        write_partial(&store, &sha, data);
+        let blob = store.verify_and_install(&sha).unwrap();
+
+        // Blob is at the expected path and contains the original bytes.
+        assert_eq!(blob, store.blob_path(&sha));
+        assert_eq!(std::fs::read(&blob).unwrap(), data);
+
+        // Partial must be gone after a successful install.
+        assert!(!store.partial_path(&sha).exists());
+    }
+
+    #[test]
+    fn install_rejects_sha_mismatch() {
+        let (_dir, store) = make_store();
+        let data = b"real bytes";
+        let real_sha = sha256_of(data);
+        let wrong_sha = "0000000000000000000000000000000000000000000000000000000000000000";
+
+        // Partial is filed under the wrong (expected) SHA.
+        write_partial(&store, wrong_sha, data);
+
+        let err = store.verify_and_install(wrong_sha).unwrap_err();
+        assert!(
+            matches!(&err, StorageError::VerifyFailed { .. }),
+            "expected VerifyFailed, got {err}"
+        );
+        // The Display message contains both hashes; check without branching on
+        // the enum variant so no instrumented line goes uncovered.
+        let msg = err.to_string();
+        assert!(msg.contains(wrong_sha), "message missing expected hash");
+        assert!(msg.contains(&real_sha), "message missing actual hash");
+
+        // Partial must be deleted on mismatch.
+        assert!(!store.partial_path(wrong_sha).exists());
+    }
+
+    #[test]
+    fn install_missing_partial_returns_io_error() {
+        let (_dir, store) = make_store();
+        let err = store.verify_and_install("deadbeef").unwrap_err();
+        assert!(matches!(err, StorageError::Io(_)));
+    }
+
+    // ── remove_blobs ─────────────────────────────────────────────────────────
+
+    #[test]
+    fn remove_blobs_deletes_files_and_tolerates_missing() {
+        let (_dir, store) = make_store();
+
+        // Write two blobs directly into the blobs dir.
+        let sha_a = "aaaa";
+        let sha_b = "bbbb";
+        std::fs::write(store.blob_path(sha_a), b"a").unwrap();
+        std::fs::write(store.blob_path(sha_b), b"b").unwrap();
+
+        // Remove both real blobs, with one that never existed listed between them.
+        let shas = vec![sha_a.to_string(), "cccc".to_string(), sha_b.to_string()];
+        store.remove_blobs(&shas).unwrap();
+
+        assert!(!store.blob_path(sha_a).exists());
+        assert!(!store.blob_path(sha_b).exists());
+    }
+
+    #[test]
+    fn remove_blobs_propagates_non_not_found_io_error() {
+        let (_dir, store) = make_store();
+        // Place a directory at the blob path so remove_file fails. Only "not NotFound" is
+        // asserted (the exact kind is presumably platform-dependent); it must surface as Err.
+        let sha = "dirblob";
+        let path = store.blob_path(sha);
+        std::fs::create_dir_all(&path).unwrap();
+        let err = store.remove_blobs(&[sha.to_string()]).unwrap_err();
+        assert_ne!(err.kind(), io::ErrorKind::NotFound);
+    }
+
+    // ── existing_partial_len ─────────────────────────────────────────────────
+
+    #[test]
+    fn existing_partial_len_some_and_none() {
+        let (_dir, store) = make_store();
+
+        // No partial yet: must return None.
+        assert_eq!(store.existing_partial_len("nothere"), None);
+
+        // Write 42 bytes into the partial slot.
+        let sha = "feedface";
+        write_partial(&store, sha, &[0u8; 42]);
+        assert_eq!(store.existing_partial_len(sha), Some(42));
+    }
+
+    // ── free_disk_bytes ───────────────────────────────────────────────────────
+
+    #[test]
+    fn free_disk_bytes_returns_some_on_real_fs() {
+        let (dir, _store) = make_store();
+        let free = free_disk_bytes(dir.path());
+        assert!(free.is_some(), "expected Some on a real filesystem");
+    }
+
+    // ── StorageError display ─────────────────────────────────────────────────
+
+    #[test]
+    fn storage_error_verify_failed_message_contains_both_hashes() {
+        let err = StorageError::VerifyFailed {
+            expected: "exp".to_string(),
+            actual: "act".to_string(),
+        };
+        let msg = err.to_string();
+        assert!(msg.contains("exp"), "message missing expected hash");
+        assert!(msg.contains("act"), "message missing actual hash");
+    }
+
+    #[test]
+    fn storage_error_io_is_transparent() {
+        let io_err = io::Error::new(io::ErrorKind::PermissionDenied, "denied");
+        let err = StorageError::Io(io_err);
+        assert!(err.to_string().contains("denied"));
+    }
+}

From 09917b59e033d2bcfc60c1ebcfa515f22e3ca61a Mon Sep 17 00:00:00 2001
From: Logan Nguyen <lg.131.dev@gmail.com>
Date: Wed, 10 Jun 2026 18:05:07 -0500
Subject: [PATCH 08/13] feat: add resumable model downloader with typed
 progress events

Signed-off-by: Logan Nguyen <lg.131.dev@gmail.com>
---
 docs/configurations.md           |    1 +
 src-tauri/src/config/defaults.rs |    6 +
 src-tauri/src/models/download.rs | 1069 ++++++++++++++++++++++++++++++
 src-tauri/src/models/mod.rs      |    1 +
 4 files changed, 1077 insertions(+)
 create mode 100644 src-tauri/src/models/download.rs

diff --git a/docs/configurations.md b/docs/configurations.md
index 8322cdbe..a407a3a0 100644
--- a/docs/configurations.md
+++ b/docs/configurations.md
@@ -176,6 +176,7 @@ The table below also lists the baked-in safety limits that govern Thuki's commun
 | `ENGINE_IDLE_CHECK_INTERVAL_SECS`           | `30 s`   | No       | Internal timer granularity behind the user-facing `idle_unload_minutes` knob; 30 s keeps the unload within a minute-scale setting's precision at negligible cost.    | —      | How often the engine runner checks whether `idle_unload_minutes` of inactivity have elapsed and the built-in engine should be stopped to free RAM.                                   |
 | `ENGINE_HEALTH_PROBE_TIMEOUT_SECS`          | `5 s`    | No       | Internal lifecycle contract between the runner and the engine process. A wedged-but-connected server must not park the poll loop forever; loopback probes are normally instant so 5 s is generous. The poll interval and deadline are the user-facing knobs. | —      | How long a single `/health` GET is allowed to take inside the startup poll loop. If the engine has accepted the TCP connection but stopped responding, this timeout causes the probe to return an error (treated as Wait and retried after `ENGINE_HEALTH_POLL_INTERVAL_MS`). |
 | `ENGINE_COMMAND_QUEUE_CAPACITY`             | `64`     | No       | Bounds memory under command bursts; 64 slots is ample for all UI-driven traffic (Ensure, Touch, SetIdleMinutes, Shutdown) under any realistic usage pattern. | —      | Capacity of the bounded `mpsc` channel that carries commands from `EngineHandle` to the runner actor task. Back-pressure from a full queue is not observable in normal use. |
+| `DOWNLOAD_PROGRESS_MIN_INTERVAL_MS`         | `500 ms` | No       | Pure IPC hygiene: a fast local connection can deliver thousands of chunks per second and the UI only needs a few updates per second, so throttling below the UI refresh rate is invisible to the user. | —      | Minimum interval between `Progress` events emitted while a model file downloads. An update is also emitted whenever at least 1% of the file has arrived since the last one, whichever comes first, and a final 100% update always precedes verification. |
 
 ### `[prompt]`
 
diff --git a/src-tauri/src/config/defaults.rs b/src-tauri/src/config/defaults.rs
index 742441f7..c89ed86a 100644
--- a/src-tauri/src/config/defaults.rs
+++ b/src-tauri/src/config/defaults.rs
@@ -99,6 +99,12 @@ pub const ENGINE_IDLE_CHECK_INTERVAL_SECS: u64 = 30;
 /// normal use.
 pub const ENGINE_COMMAND_QUEUE_CAPACITY: usize = 64;
 
+/// Minimum interval between Progress events emitted during a model download.
+/// Bounds IPC channel traffic: a fast local connection can deliver thousands
+/// of chunks per second and the UI only needs a few updates per second. Not
+/// user-tunable: pure IPC hygiene, invisible below the UI refresh rate.
+pub const DOWNLOAD_PROGRESS_MIN_INTERVAL_MS: u64 = 500;
+
 /// Built-in secretary persona prompt. User overrides via `[prompt] system` in
 /// the config file. The slash-command appendix is composed on top at load time
 /// and is never written back to the file.
diff --git a/src-tauri/src/models/download.rs b/src-tauri/src/models/download.rs
new file mode 100644
index 00000000..c1cd252f
--- /dev/null
+++ b/src-tauri/src/models/download.rs
@@ -0,0 +1,1069 @@
+/*!
+ * Resumable model downloader.
+ *
+ * Streams GGUF files from Hugging Face into [`ModelStore`] partials, emitting
+ * typed [`DownloadEvent`]s for the frontend download UI. A vision model is two
+ * specs (weights + mmproj companion) downloaded sequentially. Interrupted
+ * downloads resume with an HTTP `Range` request from the partial's length; a
+ * partial that already spans the full file skips the network entirely and
+ * goes straight to verification.
+ *
+ * Security: a spec's `sha256` arrives from the Hugging Face API and doubles
+ * as the storage key (a file name under the store root), so every spec is
+ * validated as exactly 64 lowercase ASCII hex chars before any filesystem
+ * use. An invalid digest aborts the whole download with a `Failed` event.
+ *
+ * Blocking contract: the verification step hashes the whole file with
+ * synchronous I/O, blocking the current runtime worker for seconds on a
+ * multi-GB model. `run_download` must therefore run on a spawned task of the
+ * multi-threaded runtime (the Tauri command path), never on a thread the UI
+ * waits on.
+ */
+
+use std::io::Write;
+use std::time::Duration;
+
+use futures_util::StreamExt;
+use tokio_util::sync::CancellationToken;
+
+use super::storage::{ModelStore, StorageError};
+use crate::config::defaults::DOWNLOAD_PROGRESS_MIN_INTERVAL_MS;
+
+/// Progress events streamed to the frontend while a model downloads.
+#[derive(Debug, Clone, serde::Serialize, PartialEq)]
+#[serde(tag = "type", content = "data")] // variant name under "type", its fields under "data"
+pub enum DownloadEvent {
+    /// A file's download began. `resumed_from` is the partial's prior length
+    /// (0 on a fresh download).
+    Started {
+        file: String,
+        total_bytes: u64,
+        resumed_from: u64,
+    },
+    /// Bytes on disk so far, resumed prefix included; throttled to a few updates per second.
+    Progress {
+        file: String,
+        bytes: u64,
+        total_bytes: u64,
+    },
+    /// All bytes are on disk; the SHA-256 check is running.
+    Verifying { file: String },
+    /// The file verified and was installed into the blob store.
+    FileDone { file: String },
+    /// Every spec finished; the model is fully installed.
+    AllDone,
+    /// The user cancelled; the partial is kept for a later resume.
+    Cancelled,
+    /// The download failed; `kind` drives the UI state machine.
+    Failed {
+        kind: DownloadFailKind,
+        message: String,
+    },
+}
+
+/// Coarse failure category for [`DownloadEvent::Failed`].
+#[derive(Debug, Clone, Copy, serde::Serialize, PartialEq)]
+#[serde(rename_all = "snake_case")]
+pub enum DownloadFailKind {
+    Offline, // the request got no response, or the body stream broke mid-download
+    Http, // the server answered with a status the downloader does not accept
+    Checksum, // the finished file's SHA-256 did not match the spec
+    DiskFull, // a local write failed with StorageFull or WriteZero
+    Other, // any other local I/O failure, or a spec with an invalid sha256
+}
+
+/// One file to download into the store.
+#[derive(Debug, Clone, PartialEq)]
+pub struct DownloadSpec {
+    /// Source URL, in the app `https://huggingface.co/<repo>/resolve/<rev>/<file>`.
+    pub url: String,
+    /// Display name for events.
+    pub file: String,
+    /// Expected digest (64 lowercase hex chars, checked up front); also the storage key.
+    pub sha256: String,
+    /// Expected file size in bytes; a partial at least this long skips the network.
+    pub total_bytes: u64,
+}
+
+/// Downloads every spec sequentially into store partials, emitting events via
+/// `emit`. Resumes with `Range: bytes=<len>-` when a partial exists; a partial
+/// whose length has already reached total_bytes skips the network entirely and
+/// goes straight to verify (no Range request is sent for such a file).
+/// Verifies + installs each file on completion (Verifying then FileDone).
+/// Emits AllDone after the last file. Cancellation: checked between chunks;
+/// emits Cancelled and returns Err(()); the partial is KEPT for resume.
+/// Every failure is emitted as a Failed event; the partial is kept except
+/// where verify_and_install already deleted it (checksum mismatch).
+#[allow(clippy::result_unit_err)] // Err carries no detail by design: every failure reaches the UI as a Failed event.
+pub async fn run_download(
+    specs: &[DownloadSpec],
+    store: &ModelStore,
+    client: &reqwest::Client,
+    cancel: CancellationToken,
+    emit: impl Fn(DownloadEvent),
+) -> Result<(), ()> {
+    // Validate every digest BEFORE any filesystem use: the sha256 becomes a
+    // file name in the store, so a malformed one must never reach a path.
+    for spec in specs {
+        if !is_valid_sha256(&spec.sha256) {
+            emit(DownloadEvent::Failed {
+                kind: DownloadFailKind::Other,
+                message: "invalid sha256 in download spec".to_string(),
+            });
+            return Err(()); // nothing has been downloaded or touched on disk yet
+        }
+    }
+
+    for spec in specs {
+        match download_one(spec, store, client, &cancel, &emit).await {
+            Ok(FileOutcome::Done) => {} // installed; move on to the next spec
+            Ok(FileOutcome::Cancelled) => {
+                emit(DownloadEvent::Cancelled);
+                return Err(()); // remaining specs are skipped
+            }
+            Err(e) => {
+                emit(DownloadEvent::Failed {
+                    kind: classify_download_error(&e),
+                    message: failure_message(&e),
+                });
+                return Err(()); // first failure aborts the whole batch
+            }
+        }
+    }
+
+    emit(DownloadEvent::AllDone); // also reached when `specs` is empty
+    Ok(())
+}
+
+/// Per-file result distinguishing completion from user cancellation.
+enum FileOutcome {
+    Done, // the step ran to completion
+    Cancelled, // the cancel token was seen set; the partial stays on disk
+}
+
+/// Downloads (or skips, when the partial is already full-length) one spec,
+/// then verifies and installs it.
+async fn download_one(
+    spec: &DownloadSpec,
+    store: &ModelStore,
+    client: &reqwest::Client,
+    cancel: &CancellationToken,
+    emit: &impl Fn(DownloadEvent),
+) -> Result<FileOutcome, DownloadIoError> {
+    let resumed_from = store.existing_partial_len(&spec.sha256).unwrap_or(0); // no partial => 0
+    emit(DownloadEvent::Started {
+        file: spec.file.clone(),
+        total_bytes: spec.total_bytes,
+        resumed_from,
+    });
+
+    // A full-length partial skips the network and goes straight to verify.
+    // Note: if upstream metadata ever overstates total_bytes, the partial can
+    // never reach it and a resume Range past the real EOF returns 416, which
+    // surfaces as an Http failure with the partial kept; Discard is the
+    // user's recovery path.
+    if resumed_from < spec.total_bytes
+        && matches!(
+            fetch_into_partial(spec, store, client, cancel, emit, resumed_from).await?,
+            FileOutcome::Cancelled
+        )
+    {
+        return Ok(FileOutcome::Cancelled);
+    }
+
+    // Final 100% Progress always precedes Verifying so the UI bar completes.
+    emit(DownloadEvent::Progress {
+        file: spec.file.clone(),
+        bytes: spec.total_bytes, // the nominal size, not a measured byte count
+        total_bytes: spec.total_bytes,
+    });
+    emit(DownloadEvent::Verifying {
+        file: spec.file.clone(),
+    });
+    store
+        .verify_and_install(&spec.sha256) // synchronous hashing: see the blocking contract above
+        .map_err(map_storage_error)?;
+    emit(DownloadEvent::FileDone {
+        file: spec.file.clone(),
+    });
+    Ok(FileOutcome::Done)
+}
+
+/// Streams the response body into the store partial, resuming from
+/// `resumed_from` when it is non-zero. A 200 answer to a Range request means
+/// the server ignored the range, so the partial is truncated and rewritten
+/// from scratch.
+async fn fetch_into_partial(
+    spec: &DownloadSpec,
+    store: &ModelStore,
+    client: &reqwest::Client,
+    cancel: &CancellationToken,
+    emit: &impl Fn(DownloadEvent),
+    resumed_from: u64,
+) -> Result<FileOutcome, DownloadIoError> {
+    let ranged = resumed_from > 0;
+    let mut request = client.get(&spec.url);
+    if ranged {
+        request = request.header(reqwest::header::RANGE, format!("bytes={resumed_from}-"));
+    }
+    let response = request
+        .send()
+        .await
+        .map_err(|e| DownloadIoError::Connect(e.to_string()))?;
+
+    // 206 continues the partial; 200 carries the full body (fresh download,
+    // or a server that ignored the range). Anything else is an HTTP failure.
+    let status = response.status().as_u16();
+    let start = match (ranged, status) {
+        (true, 206) => resumed_from,
+        (_, 200) => 0,
+        _ => return Err(DownloadIoError::HttpStatus(status)), // incl. 206 to an un-ranged request
+    };
+
+    let mut options = std::fs::OpenOptions::new();
+    options.create(true);
+    if start == 0 {
+        options.write(true).truncate(true); // full body: discard whatever the partial held
+    } else {
+        options.append(true); // continue after the bytes already on disk
+    }
+    let mut file = options
+        .open(store.partial_path(&spec.sha256))
+        .map_err(DownloadIoError::Write)?;
+
+    let mut written = start;
+    let mut throttle = ProgressThrottle::new(spec.total_bytes, written);
+    let mut stream = response.bytes_stream();
+    while let Some(chunk) = stream.next().await {
+        // Checked only when a chunk arrives; that chunk is dropped and the partial is kept.
+        if cancel.is_cancelled() {
+            return Ok(FileOutcome::Cancelled);
+        }
+        let chunk = chunk.map_err(|e| DownloadIoError::MidStream(e.to_string()))?;
+        file.write_all(&chunk).map_err(DownloadIoError::Write)?;
+        written += chunk.len() as u64;
+        if throttle.should_emit(written) {
+            emit(DownloadEvent::Progress {
+                file: spec.file.clone(),
+                bytes: written,
+                total_bytes: spec.total_bytes,
+            });
+        }
+    }
+    file.flush().map_err(DownloadIoError::Write)?;
+    Ok(FileOutcome::Done) // stream ended; `written` is not compared against total_bytes here
+}
+
+/// Rate limiter for Progress events: emits when either
+/// [`DOWNLOAD_PROGRESS_MIN_INTERVAL_MS`] has elapsed since the last emission
+/// or at least 1% of the total has been written since then, whichever comes
+/// first. Keeps IPC traffic to a few updates per second regardless of how
+/// many chunks the network layer delivers.
+struct ProgressThrottle {
+    last_emit: tokio::time::Instant, // when an emission was last allowed (creation time at first)
+    last_bytes: u64, // byte count recorded at that moment
+    percent_step: u64, // bytes that make up 1% of the total; never 0
+}
+
+impl ProgressThrottle {
+    fn new(total_bytes: u64, start_bytes: u64) -> Self {
+        Self {
+            last_emit: tokio::time::Instant::now(),
+            last_bytes: start_bytes, // on a resume the first 1% step is measured from the offset
+            percent_step: (total_bytes / 100).max(1), // max(1): totals under 100 bytes would give 0
+        }
+    }
+
+    fn should_emit(&mut self, bytes: u64) -> bool {
+        let interval = Duration::from_millis(DOWNLOAD_PROGRESS_MIN_INTERVAL_MS);
+        if self.last_emit.elapsed() >= interval || bytes - self.last_bytes >= self.percent_step {
+            self.last_emit = tokio::time::Instant::now();
+            self.last_bytes = bytes; // the unsigned subtraction above relies on `bytes` never decreasing
+            return true;
+        }
+        false
+    }
+}
+
+/// Internal download failure cause; [`classify_download_error`] maps it to a [`DownloadFailKind`].
+#[derive(Debug)]
+pub(crate) enum DownloadIoError {
+    /// The request failed before any response arrived (the `send()` call returned an error).
+    Connect(String),
+    /// bytes_stream chunk error (network drop mid-body).
+    MidStream(String),
+    /// HTTP status other than the expected 200 (or 206 on a resume).
+    HttpStatus(u16),
+    /// Local filesystem failure: open/write, or any I/O error during verify/install.
+    Write(std::io::Error),
+    /// SHA-256 mismatch after a complete download.
+    Verify { expected: String, actual: String },
+}
+
+pub(crate) fn classify_download_error(e: &DownloadIoError) -> DownloadFailKind {
+    match e {
+        // Both fit resume semantics: the partial is kept and a retry resumes.
+        DownloadIoError::Connect(_) | DownloadIoError::MidStream(_) => DownloadFailKind::Offline,
+        DownloadIoError::HttpStatus(_) => DownloadFailKind::Http,
+        DownloadIoError::Write(io) => match io.kind() {
+            std::io::ErrorKind::StorageFull | std::io::ErrorKind::WriteZero => {
+                DownloadFailKind::DiskFull // WriteZero: a write accepted no bytes at all
+            }
+            _ => DownloadFailKind::Other, // every remaining io::ErrorKind
+        },
+        DownloadIoError::Verify { .. } => DownloadFailKind::Checksum,
+    }
+}
+
+/// Builds the human-readable `message` of [`DownloadEvent::Failed`], one fixed prefix per cause.
+fn failure_message(e: &DownloadIoError) -> String {
+    match e {
+        DownloadIoError::Connect(m) => format!("connection failed: {m}"),
+        DownloadIoError::MidStream(m) => format!("download interrupted: {m}"),
+        DownloadIoError::HttpStatus(status) => format!("server returned HTTP {status}"),
+        DownloadIoError::Write(io) => format!("write failed: {io}"), // also used for verify-stage I/O
+        DownloadIoError::Verify { expected, actual } => {
+            format!("checksum mismatch: expected {expected}, got {actual}")
+        }
+    }
+}
+
+/// Maps a [`StorageError`] from verify/install onto the download error space.
+fn map_storage_error(e: StorageError) -> DownloadIoError {
+    match e {
+        StorageError::VerifyFailed { expected, actual } => {
+            DownloadIoError::Verify { expected, actual }
+        }
+        StorageError::Io(io) => DownloadIoError::Write(io), // open/read/rename errors land here too
+    }
+}
+
+/// True when `s` is exactly 64 lowercase ASCII hex chars: the only shape a
+/// sha256 may have before it is used as a file name in the store.
+fn is_valid_sha256(s: &str) -> bool {
+    s.len() == 64 && s.bytes().all(|b| matches!(b, b'0'..=b'9' | b'a'..=b'f')) // byte-wise check
+}
+
+// ─── Tests ───────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use sha2::{Digest, Sha256};
+    use std::sync::{Arc, Mutex};
+    use tempfile::TempDir;
+    use wiremock::matchers::{header, method, path};
+    use wiremock::{Mock, MockServer, ResponseTemplate};
+
+    /// Build a fresh store rooted at a temporary directory.
+    fn make_store() -> (TempDir, ModelStore) {
+        let dir = TempDir::new().unwrap();
+        let store = ModelStore::new(dir.path().to_path_buf()).unwrap();
+        (dir, store)
+    }
+
+    /// Compute the hex SHA-256 of `data`.
+    fn sha256_of(data: &[u8]) -> String {
+        format!("{:x}", Sha256::digest(data))
+    }
+
+    /// Event sink: returns the shared event log and an `emit` closure.
+    fn collector() -> (Arc<Mutex<Vec<DownloadEvent>>>, impl Fn(DownloadEvent)) {
+        let events = Arc::new(Mutex::new(Vec::new()));
+        let sink = Arc::clone(&events);
+        (events, move |e| sink.lock().unwrap().push(e))
+    }
+
+    /// Spec whose sha256/total match `body` exactly.
+    fn spec_for(url: String, file: &str, body: &[u8]) -> DownloadSpec {
+        DownloadSpec {
+            url,
+            file: file.to_string(),
+            sha256: sha256_of(body),
+            total_bytes: body.len() as u64,
+        }
+    }
+
+    /// Deterministic non-trivial body of `len` bytes.
+    fn body_of(len: usize) -> Vec<u8> {
+        (0..len).map(|i| (i % 251) as u8).collect()
+    }
+
+    fn last_event(events: &Arc<Mutex<Vec<DownloadEvent>>>) -> DownloadEvent {
+        events.lock().unwrap().last().unwrap().clone()
+    }
+
+    /// Kinds of every Failed event in emission order.
+    fn failed_kinds(events: &Arc<Mutex<Vec<DownloadEvent>>>) -> Vec<DownloadFailKind> {
+        events
+            .lock()
+            .unwrap()
+            .iter()
+            .filter_map(|e| match e {
+                DownloadEvent::Failed { kind, .. } => Some(*kind),
+                _ => None,
+            })
+            .collect()
+    }
+
+    // ── Happy path ───────────────────────────────────────────────────────────
+
+    #[tokio::test]
+    async fn downloads_and_reports_progress() {
+        let server = MockServer::start().await;
+        let payload = body_of(4096);
+        Mock::given(method("GET"))
+            .and(path("/q/resolve/main/w.gguf"))
+            .respond_with(ResponseTemplate::new(200).set_body_bytes(payload.clone()))
+            .mount(&server)
+            .await;
+
+        let (_dir, store) = make_store();
+        let spec = spec_for(
+            format!("{}/q/resolve/main/w.gguf", server.uri()),
+            "w.gguf",
+            &payload,
+        );
+        let digest = spec.sha256.clone();
+        let (log, emit) = collector();
+
+        let outcome = run_download(
+            &[spec],
+            &store,
+            &reqwest::Client::new(),
+            CancellationToken::new(),
+            emit,
+        )
+        .await;
+        assert_eq!(outcome, Ok(()));
+
+        let recorded = log.lock().unwrap();
+        assert_eq!(
+            recorded[0],
+            DownloadEvent::Started {
+                file: "w.gguf".to_string(),
+                total_bytes: 4096,
+                resumed_from: 0
+            }
+        );
+        // Verifying must be directly preceded by the closing 100% Progress.
+        let verify_idx = recorded
+            .iter()
+            .position(|e| matches!(e, DownloadEvent::Verifying { .. }))
+            .unwrap();
+        assert_eq!(
+            recorded[verify_idx - 1],
+            DownloadEvent::Progress {
+                file: "w.gguf".to_string(),
+                bytes: 4096,
+                total_bytes: 4096
+            }
+        );
+        assert_eq!(
+            recorded[verify_idx + 1],
+            DownloadEvent::FileDone {
+                file: "w.gguf".to_string()
+            }
+        );
+        assert_eq!(*recorded.last().unwrap(), DownloadEvent::AllDone);
+        assert_eq!(std::fs::read(store.blob_path(&digest)).unwrap(), payload);
+    }
+
+    // ── Resume semantics ─────────────────────────────────────────────────────
+
+    #[tokio::test]
+    async fn resumes_with_range_header_from_partial() {
+        let server = MockServer::start().await;
+        let payload = body_of(8192);
+        let digest = sha256_of(&payload);
+        // Matching is conditional on a Range header requesting the remainder;
+        // without it (or with a wrong one) wiremock answers 404 and the test fails.
+        Mock::given(method("GET"))
+            .and(path("/q/resolve/main/w.gguf"))
+            .and(header("range", "bytes=1000-"))
+            .respond_with(ResponseTemplate::new(206).set_body_bytes(payload[1000..].to_vec()))
+            .mount(&server)
+            .await;
+
+        let (_dir, store) = make_store();
+        std::fs::write(store.partial_path(&digest), &payload[..1000]).unwrap();
+        let spec = spec_for(
+            format!("{}/q/resolve/main/w.gguf", server.uri()),
+            "w.gguf",
+            &payload,
+        );
+        let (log, emit) = collector();
+
+        let outcome = run_download(
+            &[spec],
+            &store,
+            &reqwest::Client::new(),
+            CancellationToken::new(),
+            emit,
+        )
+        .await;
+        assert_eq!(outcome, Ok(()));
+        assert_eq!(
+            log.lock().unwrap()[0],
+            DownloadEvent::Started {
+                file: "w.gguf".to_string(),
+                total_bytes: 8192,
+                resumed_from: 1000
+            }
+        );
+        assert_eq!(std::fs::read(store.blob_path(&digest)).unwrap(), payload);
+    }
+
+    #[tokio::test]
+    async fn range_ignored_by_server_restarts_from_scratch() {
+        let server = MockServer::start().await;
+        let payload = body_of(4096);
+        let digest = sha256_of(&payload);
+        // The mock has no Range matcher and replies 200 with the WHOLE body.
+        Mock::given(method("GET"))
+            .and(path("/q/resolve/main/w.gguf"))
+            .respond_with(ResponseTemplate::new(200).set_body_bytes(payload.clone()))
+            .mount(&server)
+            .await;
+
+        let (_dir, store) = make_store();
+        // The seeded partial is junk, not a prefix of the payload, so the final
+        // file can only verify if it was truncated and rewritten from byte 0.
+        std::fs::write(store.partial_path(&digest), b"junk!").unwrap();
+        let spec = spec_for(
+            format!("{}/q/resolve/main/w.gguf", server.uri()),
+            "w.gguf",
+            &payload,
+        );
+        let (log, emit) = collector();
+
+        let outcome = run_download(
+            &[spec],
+            &store,
+            &reqwest::Client::new(),
+            CancellationToken::new(),
+            emit,
+        )
+        .await;
+        assert_eq!(outcome, Ok(()));
+        assert_eq!(
+            log.lock().unwrap()[0],
+            DownloadEvent::Started {
+                file: "w.gguf".to_string(),
+                total_bytes: 4096,
+                resumed_from: 5
+            }
+        );
+        assert_eq!(std::fs::read(store.blob_path(&digest)).unwrap(), payload);
+    }
+
+    #[tokio::test]
+    async fn full_length_partial_skips_to_verify() {
+        // Deliberately no mock server: when the partial already has every
+        // byte, the download must complete without any network access.
+        let payload = body_of(512);
+        let digest = sha256_of(&payload);
+        let (_dir, store) = make_store();
+        std::fs::write(store.partial_path(&digest), &payload).unwrap();
+        let spec = spec_for("http://127.0.0.1:9/unused".to_string(), "w.gguf", &payload);
+        let (log, emit) = collector();
+
+        let outcome = run_download(
+            &[spec],
+            &store,
+            &reqwest::Client::new(),
+            CancellationToken::new(),
+            emit,
+        )
+        .await;
+        assert_eq!(outcome, Ok(()));
+        assert_eq!(
+            *log.lock().unwrap(),
+            vec![
+                DownloadEvent::Started {
+                    file: "w.gguf".to_string(),
+                    total_bytes: 512,
+                    resumed_from: 512
+                },
+                DownloadEvent::Progress {
+                    file: "w.gguf".to_string(),
+                    bytes: 512,
+                    total_bytes: 512
+                },
+                DownloadEvent::Verifying {
+                    file: "w.gguf".to_string()
+                },
+                DownloadEvent::FileDone {
+                    file: "w.gguf".to_string()
+                },
+                DownloadEvent::AllDone,
+            ]
+        );
+        assert_eq!(std::fs::read(store.blob_path(&digest)).unwrap(), payload);
+    }
+
+    // ── Cancellation ─────────────────────────────────────────────────────────
+
+    #[tokio::test]
+    async fn cancel_keeps_partial() {
+        let server = MockServer::start().await;
+        let payload = body_of(4096);
+        let digest = sha256_of(&payload);
+        Mock::given(method("GET"))
+            .and(path("/q/resolve/main/w.gguf"))
+            .respond_with(ResponseTemplate::new(206).set_body_bytes(payload[100..].to_vec()))
+            .mount(&server)
+            .await;
+
+        let (_dir, store) = make_store();
+        std::fs::write(store.partial_path(&digest), &payload[..100]).unwrap();
+        let spec = spec_for(
+            format!("{}/q/resolve/main/w.gguf", server.uri()),
+            "w.gguf",
+            &payload,
+        );
+        let (log, emit) = collector();
+
+        let token = CancellationToken::new();
+        token.cancel();
+        let outcome = run_download(&[spec], &store, &reqwest::Client::new(), token, emit).await;
+        assert_eq!(outcome, Err(()));
+        assert_eq!(last_event(&log), DownloadEvent::Cancelled);
+        // The 100 seeded bytes must survive cancellation so a retry can resume.
+        assert_eq!(store.existing_partial_len(&digest), Some(100));
+        assert!(!store.blob_path(&digest).exists());
+    }
+
+    // ── Failure mapping (end to end) ─────────────────────────────────────────
+
+    #[tokio::test]
+    async fn http_500_maps_to_http_error() {
+        let failing = MockServer::start().await;
+        Mock::given(method("GET"))
+            .and(path("/q/resolve/main/w.gguf"))
+            .respond_with(ResponseTemplate::new(500))
+            .mount(&failing)
+            .await;
+
+        let (_dir, store) = make_store();
+        let spec = spec_for(
+            format!("{}/q/resolve/main/w.gguf", failing.uri()),
+            "w.gguf",
+            b"never served",
+        );
+        let (log, emit) = collector();
+
+        let outcome = run_download(
+            &[spec],
+            &store,
+            &reqwest::Client::new(),
+            CancellationToken::new(),
+            emit,
+        )
+        .await;
+        assert_eq!(outcome, Err(()));
+        assert_eq!(
+            last_event(&log),
+            DownloadEvent::Failed {
+                kind: DownloadFailKind::Http,
+                message: "server returned HTTP 500".to_string(),
+            }
+        );
+    }
+
+    #[tokio::test]
+    async fn offline_maps_to_offline() {
+        // Nothing listens on port 1, so the connect fails before any HTTP exchange.
+        let (_dir, store) = make_store();
+        let spec = spec_for(
+            "http://127.0.0.1:1/w.gguf".to_string(),
+            "w.gguf",
+            b"unreachable",
+        );
+        let (log, emit) = collector();
+
+        let outcome = run_download(
+            &[spec],
+            &store,
+            &reqwest::Client::new(),
+            CancellationToken::new(),
+            emit,
+        )
+        .await;
+        assert_eq!(outcome, Err(()));
+        assert_eq!(failed_kinds(&log), vec![DownloadFailKind::Offline]);
+    }
+
+    #[tokio::test]
+    async fn mid_stream_drop_maps_to_offline() {
+        use tokio::io::{AsyncReadExt, AsyncWriteExt};
+
+        // Hand-rolled server: it advertises 4096 bytes, sends 7, then hangs up.
+        // A raw socket is used because wiremock cannot cut a body mid-stream.
+        let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
+        let addr = listener.local_addr().unwrap();
+        tokio::spawn(async move {
+            let (mut conn, _) = listener.accept().await.unwrap();
+            let mut request = [0u8; 1024];
+            let _ = conn.read(&mut request).await;
+            conn.write_all(b"HTTP/1.1 200 OK\r\ncontent-length: 4096\r\n\r\npartial")
+                .await
+                .unwrap();
+            conn.shutdown().await.unwrap();
+        });
+
+        let (_dir, store) = make_store();
+        let payload = body_of(4096);
+        let spec = spec_for(format!("http://{addr}/w.gguf"), "w.gguf", &payload);
+        let digest = spec.sha256.clone();
+        let (log, emit) = collector();
+
+        let outcome = run_download(
+            &[spec],
+            &store,
+            &reqwest::Client::new(),
+            CancellationToken::new(),
+            emit,
+        )
+        .await;
+        assert_eq!(outcome, Err(()));
+        assert_eq!(failed_kinds(&log), vec![DownloadFailKind::Offline]);
+        // A partial file must remain so the received bytes can be resumed.
+        assert!(store.existing_partial_len(&digest).is_some());
+    }
+
+    #[tokio::test]
+    async fn sha_mismatch_after_complete_maps_to_checksum() {
+        let server = MockServer::start().await;
+        let served = body_of(2048);
+        Mock::given(method("GET"))
+            .and(path("/q/resolve/main/w.gguf"))
+            .respond_with(ResponseTemplate::new(200).set_body_bytes(served.clone()))
+            .mount(&server)
+            .await;
+
+        let (_dir, store) = make_store();
+        // Well-formed digest, but computed over bytes the server never sends.
+        let expected_sha = sha256_of(b"completely different content");
+        let spec = DownloadSpec {
+            url: format!("{}/q/resolve/main/w.gguf", server.uri()),
+            file: "w.gguf".to_string(),
+            sha256: expected_sha.clone(),
+            total_bytes: served.len() as u64,
+        };
+        let (log, emit) = collector();
+
+        let outcome = run_download(
+            &[spec],
+            &store,
+            &reqwest::Client::new(),
+            CancellationToken::new(),
+            emit,
+        )
+        .await;
+        assert_eq!(outcome, Err(()));
+        let recorded = log.lock().unwrap();
+        assert!(recorded
+            .iter()
+            .any(|e| matches!(e, DownloadEvent::Verifying { .. })));
+        // The failure message must name the expected and the actual digest.
+        assert_eq!(
+            *recorded.last().unwrap(),
+            DownloadEvent::Failed {
+                kind: DownloadFailKind::Checksum,
+                message: format!(
+                    "checksum mismatch: expected {expected_sha}, got {}",
+                    sha256_of(&served)
+                ),
+            }
+        );
+        // Neither the mismatched partial nor an installed blob may remain.
+        assert_eq!(store.existing_partial_len(&expected_sha), None);
+        assert!(!store.blob_path(&expected_sha).exists());
+    }
+
+    // ── Multi-file ordering ──────────────────────────────────────────────────
+
+    #[tokio::test]
+    async fn mmproj_downloaded_after_weights() {
+        let server = MockServer::start().await;
+        let weights = body_of(1024);
+        let mmproj = body_of(256);
+        Mock::given(method("GET"))
+            .and(path("/q/resolve/main/weights.gguf"))
+            .respond_with(ResponseTemplate::new(200).set_body_bytes(weights.clone()))
+            .mount(&server)
+            .await;
+        Mock::given(method("GET"))
+            .and(path("/q/resolve/main/mmproj.gguf"))
+            .respond_with(ResponseTemplate::new(200).set_body_bytes(mmproj.clone()))
+            .mount(&server)
+            .await;
+
+        let (_dir, store) = make_store();
+        let batch = vec![
+            spec_for(
+                format!("{}/q/resolve/main/weights.gguf", server.uri()),
+                "weights.gguf",
+                &weights,
+            ),
+            spec_for(
+                format!("{}/q/resolve/main/mmproj.gguf", server.uri()),
+                "mmproj.gguf",
+                &mmproj,
+            ),
+        ];
+        let weights_sha = batch[0].sha256.clone();
+        let mmproj_sha = batch[1].sha256.clone();
+        let (log, emit) = collector();
+
+        let outcome = run_download(
+            &batch,
+            &store,
+            &reqwest::Client::new(),
+            CancellationToken::new(),
+            emit,
+        )
+        .await;
+        assert_eq!(outcome, Ok(()));
+
+        let recorded = log.lock().unwrap();
+        let weights_done_at = recorded
+            .iter()
+            .position(|e| {
+                *e == DownloadEvent::FileDone {
+                    file: "weights.gguf".to_string(),
+                }
+            })
+            .unwrap();
+        let mmproj_started_at = recorded
+            .iter()
+            .position(|e| matches!(e, DownloadEvent::Started { file, .. } if file == "mmproj.gguf"))
+            .unwrap();
+        assert!(
+            weights_done_at < mmproj_started_at,
+            "mmproj must start only after the weights file is done"
+        );
+        assert_eq!(*recorded.last().unwrap(), DownloadEvent::AllDone);
+        assert_eq!(
+            std::fs::read(store.blob_path(&weights_sha)).unwrap(),
+            weights
+        );
+        assert_eq!(std::fs::read(store.blob_path(&mmproj_sha)).unwrap(), mmproj);
+    }
+
+    // ── sha256 validation ────────────────────────────────────────────────────
+
+    #[tokio::test]
+    async fn invalid_sha_rejected() {
+        let (dir, store) = make_store();
+        let bad_digests = [
+            String::new(),
+            "short".to_string(),
+            "z".repeat(64),                   // not hex
+            "A".repeat(64),                   // uppercase hex is rejected
+            "a".repeat(63),                   // too short
+            "a".repeat(65),                   // too long
+            format!("../{}", "a".repeat(61)), // path traversal shape
+        ];
+        for bad in bad_digests {
+            // The valid spec comes first on purpose: the whole batch must be
+            // validated before anything is downloaded or written to disk.
+            let valid = spec_for("http://127.0.0.1:9/v".to_string(), "v.gguf", b"valid");
+            let invalid = DownloadSpec {
+                url: "http://127.0.0.1:9/w".to_string(),
+                file: "w.gguf".to_string(),
+                sha256: bad,
+                total_bytes: 4,
+            };
+            let (events, emit) = collector();
+            let result = run_download(
+                &[valid, invalid],
+                &store,
+                &reqwest::Client::new(),
+                CancellationToken::new(),
+                emit,
+            )
+            .await;
+            assert_eq!(result, Err(()));
+            assert_eq!(
+                *events.lock().unwrap(),
+                vec![DownloadEvent::Failed {
+                    kind: DownloadFailKind::Other,
+                    message: "invalid sha256 in download spec".to_string(),
+                }]
+            );
+        }
+        // Both store directories are still empty after every rejected batch.
+        let root = dir.path();
+        assert_eq!(std::fs::read_dir(root.join("tmp")).unwrap().count(), 0);
+        assert_eq!(std::fs::read_dir(root.join("blobs")).unwrap().count(), 0);
+    }
+
+    // ── classify_download_error (pure) ───────────────────────────────────────
+
+    #[test]
+    fn classify_connect_and_midstream_map_to_offline() {
+        let network_errors = [
+            DownloadIoError::Connect("refused".to_string()),
+            DownloadIoError::MidStream("reset".to_string()),
+        ];
+        for error in &network_errors {
+            assert_eq!(classify_download_error(error), DownloadFailKind::Offline);
+        }
+    }
+
+    #[test]
+    fn classify_http_status_maps_to_http() {
+        let status = DownloadIoError::HttpStatus(503);
+        assert_eq!(classify_download_error(&status), DownloadFailKind::Http);
+    }
+
+    #[test]
+    fn classify_disk_full_from_storage_full_error() {
+        // StorageFull and WriteZero write errors must both classify as
+        // DiskFull; the message text is only carried along.
+        let cases = [
+            (std::io::ErrorKind::StorageFull, "no space left on device"),
+            (std::io::ErrorKind::WriteZero, "failed to write whole buffer"),
+        ];
+        for (kind, text) in cases {
+            let error = DownloadIoError::Write(std::io::Error::new(kind, text));
+            assert_eq!(classify_download_error(&error), DownloadFailKind::DiskFull);
+        }
+    }
+
+    #[test]
+    fn classify_other_write_error_maps_to_other() {
+        let denied = std::io::Error::new(std::io::ErrorKind::PermissionDenied, "denied");
+        let error = DownloadIoError::Write(denied);
+        // A PermissionDenied write error is expected to classify as Other.
+        let kind = classify_download_error(&error);
+        assert_eq!(kind, DownloadFailKind::Other);
+    }
+
+    #[test]
+    fn classify_verify_maps_to_checksum() {
+        let error = DownloadIoError::Verify {
+            expected: String::from("e"),
+            actual: String::from("a"),
+        };
+        assert_eq!(classify_download_error(&error), DownloadFailKind::Checksum);
+    }
+
+    // ── failure_message / map_storage_error (pure) ───────────────────────────
+
+    #[test]
+    fn failure_message_covers_every_variant() {
+        let samples = [
+            (DownloadIoError::Connect("refused".to_string()), "refused"),
+            (DownloadIoError::MidStream("reset".to_string()), "reset"),
+            (DownloadIoError::HttpStatus(404), "404"),
+            (
+                DownloadIoError::Write(std::io::Error::new(
+                    std::io::ErrorKind::PermissionDenied,
+                    "denied",
+                )),
+                "denied",
+            ),
+            (
+                DownloadIoError::Verify {
+                    expected: "exp".to_string(),
+                    actual: "act".to_string(),
+                },
+                "exp",
+            ),
+        ];
+        for (failure, needle) in samples {
+            let message = failure_message(&failure);
+            assert!(message.contains(needle), "{needle} missing in: {message}");
+        }
+    }
+
+    #[test]
+    fn map_storage_error_covers_both_variants() {
+        let mapped_verify = map_storage_error(StorageError::VerifyFailed {
+            expected: "exp".to_string(),
+            actual: "act".to_string(),
+        });
+        assert!(
+            matches!(mapped_verify, DownloadIoError::Verify { expected, actual } if expected == "exp" && actual == "act")
+        );
+
+        let mapped_io = map_storage_error(StorageError::Io(std::io::Error::new(
+            std::io::ErrorKind::NotFound,
+            "missing",
+        )));
+        assert!(
+            matches!(mapped_io, DownloadIoError::Write(err) if err.kind() == std::io::ErrorKind::NotFound)
+        );
+    }
+
+    // ── Progress throttle ────────────────────────────────────────────────────
+
+    #[tokio::test]
+    async fn throttle_emits_fewer_progress_events_than_chunks() {
+        let mut gate = ProgressThrottle::new(100_000, 0);
+        let mut chunks = 0u32;
+        let mut emitted = 0u32;
+        let mut received = 0u64;
+        while received < 100_000 {
+            received += 100;
+            chunks += 1;
+            if gate.should_emit(received) {
+                emitted += 1;
+            }
+        }
+        assert!(emitted > 0, "the 1% step must trigger emissions");
+        assert!(
+            emitted < chunks,
+            "throttle must emit fewer events ({emitted}) than chunks ({chunks})"
+        );
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn throttle_emits_after_min_interval_without_percent_step() {
+        // With a 1 GB total, byte deltas this small never reach the 1% step.
+        let mut gate = ProgressThrottle::new(1_000_000_000, 0);
+        assert!(!gate.should_emit(10));
+        tokio::time::advance(Duration::from_millis(DOWNLOAD_PROGRESS_MIN_INTERVAL_MS)).await;
+        assert!(gate.should_emit(20));
+        // An emission restarts the interval, so an immediate follow-up is held back.
+        assert!(!gate.should_emit(30));
+    }
+
+    // ── Wire format ──────────────────────────────────────────────────────────
+
+    #[test]
+    fn events_serialize_with_tag_and_content() {
+        let started_json = serde_json::to_value(DownloadEvent::Started {
+            file: "w.gguf".to_string(),
+            total_bytes: 10,
+            resumed_from: 2,
+        })
+        .unwrap();
+        assert_eq!(
+            started_json,
+            serde_json::json!({
+                "type": "Started",
+                "data": { "file": "w.gguf", "total_bytes": 10, "resumed_from": 2 }
+            })
+        );
+
+        let failed_json = serde_json::to_value(DownloadEvent::Failed {
+            kind: DownloadFailKind::DiskFull,
+            message: "no space".to_string(),
+        })
+        .unwrap();
+        assert_eq!(
+            failed_json,
+            serde_json::json!({
+                "type": "Failed",
+                "data": { "kind": "disk_full", "message": "no space" }
+            })
+        );
+
+        let all_done_json = serde_json::to_value(DownloadEvent::AllDone).unwrap();
+        assert_eq!(all_done_json, serde_json::json!({ "type": "AllDone" }));
+    }
+}
diff --git a/src-tauri/src/models/mod.rs b/src-tauri/src/models/mod.rs
index efb15e34..99e4a756 100644
--- a/src-tauri/src/models/mod.rs
+++ b/src-tauri/src/models/mod.rs
@@ -15,6 +15,7 @@
  * the active model is `None` and the user is prompted to pick one.
  */
 
+pub mod download;
 pub mod manifest;
 pub mod storage;
 

From e6f60907c3b73b3bfd0fd9f465b72e6c0522e69c Mon Sep 17 00:00:00 2001
From: Logan Nguyen <lg.131.dev@gmail.com>
Date: Wed, 10 Jun 2026 18:15:48 -0500
Subject: [PATCH 09/13] feat: add curated starter model registry with RAM-fit
 hints

Signed-off-by: Logan Nguyen <lg.131.dev@gmail.com>
---
 src-tauri/src/models/mod.rs      |   1 +
 src-tauri/src/models/registry.rs | 350 +++++++++++++++++++++++++++++++
 2 files changed, 351 insertions(+)
 create mode 100644 src-tauri/src/models/registry.rs

diff --git a/src-tauri/src/models/mod.rs b/src-tauri/src/models/mod.rs
index 99e4a756..722964ec 100644
--- a/src-tauri/src/models/mod.rs
+++ b/src-tauri/src/models/mod.rs
@@ -17,6 +17,7 @@
 
 pub mod download;
 pub mod manifest;
+pub mod registry;
 pub mod storage;
 
 use std::collections::HashMap;
diff --git a/src-tauri/src/models/registry.rs b/src-tauri/src/models/registry.rs
new file mode 100644
index 00000000..293c915d
--- /dev/null
+++ b/src-tauri/src/models/registry.rs
@@ -0,0 +1,350 @@
+/*!
+ * Curated starter model registry for the built-in llama.cpp engine.
+ *
+ * Three tiers (Fast / Balanced / Smartest) cover the RAM spectrum of Apple
+ * Silicon Macs. Every entry pins a Hugging Face repo at an exact git revision
+ * and carries the SHA-256 of each blob, so a starter download is reproducible
+ * and verifiable end to end (the digests feed straight into
+ * [`crate::models::download::DownloadSpec`] which verifies them on install).
+ *
+ * Hashes and sizes were read from the Hugging Face tree-at-revision API
+ * (`/api/models/<repo>/tree/<revision>`) on 2026-06-10, so each digest
+ * matches the pinned commit, not whatever `main` later points to.
+ */
+
+use crate::models::download::DownloadSpec;
+use crate::models::manifest::InstalledModel;
+
+/// Starter tier: a coarse speed/quality dial for the model picker.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize)]
+#[serde(rename_all = "snake_case")]
+pub enum Tier {
+    Fast,
+    Balanced,
+    Smartest,
+}
+
+/// One curated starter model: everything the download UI and the installer
+/// need, baked in at compile time.
+#[derive(Debug, Clone, serde::Serialize, PartialEq)]
+pub struct Starter {
+    /// Which speed/quality tier this entry fills.
+    pub tier: Tier,
+    /// Human-readable label shown in the picker (e.g. "Gemma 3 4B").
+    pub display_name: &'static str,
+    /// Hugging Face repo slug in `<owner>/<name>` form.
+    pub repo: &'static str,
+    /// 40-hex git commit SHA the download is pinned to.
+    pub revision: &'static str,
+    /// Weights file name within the repo at that revision.
+    pub file_name: &'static str,
+    /// Lowercase hex SHA-256 of the weights blob.
+    pub sha256: &'static str,
+    /// Weights file size in bytes.
+    pub size_bytes: u64,
+    /// Quantisation label (e.g. "Q4_K_M").
+    pub quant: &'static str,
+    /// Whether the model accepts image inputs.
+    pub vision: bool,
+    /// Whether the model emits a thinking/scratchpad token stream.
+    pub thinking: bool,
+    /// Vision projector (mmproj) file name, when the model is multimodal.
+    pub mmproj_file: Option<&'static str>,
+    /// Lowercase hex SHA-256 of the mmproj blob, when one is shipped.
+    pub mmproj_sha256: Option<&'static str>,
+    /// mmproj file size in bytes; 0 exactly when `mmproj_file` is `None`.
+    pub mmproj_bytes: u64,
+    /// Estimated resident memory in GiB, roughly
+    /// `(size_bytes + mmproj_bytes) / 2^30` plus the 16k-context KV cache
+    /// (sized from the model's layer/head geometry under llama.cpp's default
+    /// sliding-window-aware cache). Sanity-check any new entry against a
+    /// real load before trusting the estimate.
+    pub est_runtime_gb: f64,
+    /// Short license label surfaced next to the download button.
+    pub license_note: &'static str,
+}
+
+/// The curated starters, one per tier, ordered Fast, Balanced, Smartest.
+pub const STARTERS: &[Starter] = &[
+    Starter {
+        tier: Tier::Fast,
+        display_name: "Gemma 3 4B",
+        repo: "ggml-org/gemma-3-4b-it-GGUF",
+        revision: "d0976223747697cb51e056d85c532013931fe52e",
+        file_name: "gemma-3-4b-it-Q4_K_M.gguf",
+        sha256: "882e8d2db44dc554fb0ea5077cb7e4bc49e7342a1f0da57901c0802ea21a0863",
+        size_bytes: 2_489_757_856,
+        quant: "Q4_K_M",
+        vision: true,
+        thinking: false,
+        mmproj_file: Some("mmproj-model-f16.gguf"),
+        mmproj_sha256: Some("8c0fb064b019a6972856aaae2c7e4792858af3ca4561be2dbf649123ba6c40cb"),
+        mmproj_bytes: 851_251_104,
+        est_runtime_gb: 5.0,
+        license_note: "Gemma Terms of Use",
+    },
+    Starter {
+        tier: Tier::Balanced,
+        display_name: "Gemma 3 12B",
+        repo: "ggml-org/gemma-3-12b-it-GGUF",
+        revision: "ec0cbabd8dbff316f659876a50202295c3c4a314",
+        file_name: "gemma-3-12b-it-Q4_K_M.gguf",
+        sha256: "7bb69bff3f48a7b642355d64a90e481182a7794707b3133890646b1efa778ff5",
+        size_bytes: 7_300_574_976,
+        quant: "Q4_K_M",
+        vision: true,
+        thinking: false,
+        mmproj_file: Some("mmproj-model-f16.gguf"),
+        mmproj_sha256: Some("30c02d056410848227001830866e0a269fcc28aaf8ca971bded494003de9f5a5"),
+        mmproj_bytes: 854_200_224,
+        est_runtime_gb: 11.5,
+        license_note: "Gemma Terms of Use",
+    },
+    Starter {
+        tier: Tier::Smartest,
+        display_name: "Phi-4 14B",
+        repo: "bartowski/phi-4-GGUF",
+        revision: "19cd65f97c2f1712a81c506611d3f9c94b16a1e1",
+        file_name: "phi-4-Q4_K_M.gguf",
+        sha256: "009aba717c09d4a35890c7d35eb59d54e1dba884c7c526e7197d9c13ab5911d9",
+        size_bytes: 9_053_114_816,
+        quant: "Q4_K_M",
+        vision: false,
+        thinking: false,
+        mmproj_file: None,
+        mmproj_sha256: None,
+        mmproj_bytes: 0,
+        est_runtime_gb: 10.7,
+        license_note: "MIT",
+    },
+];
+
+/// RAM-fit hint rendered as a badge on each starter row.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize)]
+#[serde(rename_all = "snake_case")]
+pub enum RamFit {
+    Fits,
+    Tight,
+    TooBig,
+}
+
+/// Classifies how comfortably a model fits in RAM. `ram_bytes` is hw.memsize,
+/// converted to GiB as bytes / 2^30. `Fits` when est <= 0.60 * ram_gib (leaves
+/// headroom for the OS and other apps in unified memory), `Tight` when
+/// <= 0.85 * ram_gib (runs, but near the limit), `TooBig` above (heavy swap).
+pub fn ram_fit(est_runtime_gb: f64, ram_bytes: u64) -> RamFit {
+    let ram_gib = ram_bytes as f64 / (1u64 << 30) as f64;
+    if est_runtime_gb <= 0.60 * ram_gib {
+        return RamFit::Fits;
+    }
+    if est_runtime_gb <= 0.85 * ram_gib {
+        return RamFit::Tight;
+    }
+    RamFit::TooBig
+}
+
+/// Builds the download specs for a starter: the weights first, followed by
+/// the mmproj when the starter carries both an mmproj file name and digest.
+///
+/// Every URL has the form
+/// `https://huggingface.co/<repo>/resolve/<revision>/<file>`.
+pub fn download_specs(s: &Starter) -> Vec<DownloadSpec> {
+    // Single constructor shared by both files so the URL shape cannot drift.
+    let spec = |file: &str, sha256: &str, total_bytes: u64| DownloadSpec {
+        url: format!(
+            "https://huggingface.co/{}/resolve/{}/{}",
+            s.repo, s.revision, file
+        ),
+        file: file.to_string(),
+        sha256: sha256.to_string(),
+        total_bytes,
+    };
+    let weights = spec(s.file_name, s.sha256, s.size_bytes);
+    // `zip` yields a pair only when the file name and digest are both set.
+    let mmproj = s
+        .mmproj_file
+        .zip(s.mmproj_sha256)
+        .map(|(file, sha256)| spec(file, sha256, s.mmproj_bytes));
+    // Weights always come first; the optional mmproj follows.
+    std::iter::once(weights).chain(mmproj).collect()
+}
+
+/// Builds the manifest row for an installed starter; id is `"<repo>:<file_name>"`.
+pub fn to_installed_model(s: &Starter) -> InstalledModel {
+    InstalledModel {
+        id: [s.repo, s.file_name].join(":"),
+        display_name: s.display_name.to_owned(),
+        repo: s.repo.to_owned(),
+        revision: s.revision.to_owned(),
+        file_name: s.file_name.to_owned(),
+        sha256: s.sha256.to_owned(),
+        size_bytes: s.size_bytes,
+        quant: s.quant.to_owned(),
+        vision: s.vision,
+        thinking: s.thinking,
+        mmproj_file: s.mmproj_file.map(String::from),
+        mmproj_sha256: s.mmproj_sha256.map(String::from),
+    }
+}
+
+// ─── Tests ───────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Checks that `s` is exactly `len` ASCII hex digits, none of them uppercase.
+    fn is_lower_hex(s: &str, len: usize) -> bool {
+        s.len() == len && s.bytes().all(|b| b.is_ascii_digit() || (b'a'..=b'f').contains(&b))
+    }
+
+    fn starter(tier: Tier) -> &'static Starter {
+        STARTERS.iter().find(|entry| entry.tier == tier).unwrap()
+    }
+
+    #[test]
+    fn three_tiers_present() {
+        let tiers: Vec<Tier> = STARTERS.iter().map(|s| s.tier).collect();
+        assert_eq!(tiers.len(), 3);
+        // Exactly one entry per tier, listed in this order.
+        assert_eq!(tiers, [Tier::Fast, Tier::Balanced, Tier::Smartest]);
+        // (The length check above gives a clearer failure when an entry is added.)
+    }
+
+    #[test]
+    fn balanced_is_vision() {
+        let balanced = starter(Tier::Balanced);
+        assert!(balanced.vision);
+        assert!(balanced.mmproj_file.is_some());
+        assert!(balanced.mmproj_sha256.is_some());
+        assert!(balanced.mmproj_bytes > 0);
+    }
+
+    #[test]
+    fn all_revisions_are_40_hex() {
+        for s in STARTERS {
+            assert!(
+                is_lower_hex(s.revision, 40),
+                "{}: revision is not 40-hex: {}",
+                s.repo,
+                s.revision
+            );
+        }
+    }
+
+    #[test]
+    fn all_sha256_are_64_hex() {
+        for s in STARTERS {
+            assert!(
+                is_lower_hex(s.sha256, 64),
+                "{}: weights sha256 is not lowercase 64-hex: {}",
+                s.repo,
+                s.sha256
+            );
+            if let Some(mm) = s.mmproj_sha256 {
+                assert!(
+                    is_lower_hex(mm, 64),
+                    "{}: mmproj sha256 is not lowercase 64-hex: {mm}",
+                    s.repo
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn mmproj_hashes_are_distinct_between_gemma_tiers() {
+        let fast = starter(Tier::Fast);
+        let balanced = starter(Tier::Balanced);
+        // Both Gemma mmproj files share a name but differ in size, so their
+        // hashes must differ; identical hashes would mean a swap happened.
+        assert_ne!(fast.mmproj_bytes, balanced.mmproj_bytes);
+        assert_ne!(fast.mmproj_sha256.unwrap(), balanced.mmproj_sha256.unwrap());
+    }
+
+    #[test]
+    fn fit_cutoffs() {
+        const GIB: u64 = 1 << 30;
+        // (ram_gib, expected fit for Fast 5.0 / Balanced 11.5 / Smartest 10.7)
+        let table: &[(u64, [RamFit; 3])] = &[
+            (8, [RamFit::Tight, RamFit::TooBig, RamFit::TooBig]),
+            (16, [RamFit::Fits, RamFit::Tight, RamFit::Tight]),
+            (24, [RamFit::Fits, RamFit::Fits, RamFit::Fits]),
+            (32, [RamFit::Fits, RamFit::Fits, RamFit::Fits]),
+        ];
+        for (ram_gib, expected) in table {
+            for (s, want) in STARTERS.iter().zip(expected) {
+                let got = ram_fit(s.est_runtime_gb, ram_gib * GIB);
+                assert_eq!(
+                    got, *want,
+                    "{} at {ram_gib} GiB: expected {want:?}, got {got:?}",
+                    s.display_name
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn download_specs_includes_mmproj() {
+        let fast = starter(Tier::Fast);
+        let specs = download_specs(fast);
+        assert_eq!(specs.len(), 2);
+        assert_eq!(
+            specs[0].url,
+            format!(
+                "https://huggingface.co/{}/resolve/{}/{}",
+                fast.repo, fast.revision, fast.file_name
+            )
+        );
+        assert_eq!(specs[0].file, fast.file_name);
+        assert_eq!(specs[0].sha256, fast.sha256);
+        assert_eq!(specs[0].total_bytes, fast.size_bytes);
+        assert_eq!(
+            specs[1].url,
+            format!(
+                "https://huggingface.co/{}/resolve/{}/{}",
+                fast.repo,
+                fast.revision,
+                fast.mmproj_file.unwrap()
+            )
+        );
+        assert_eq!(specs[1].file, fast.mmproj_file.unwrap());
+        assert_eq!(specs[1].sha256, fast.mmproj_sha256.unwrap());
+        assert_eq!(specs[1].total_bytes, fast.mmproj_bytes);
+
+        let smartest = starter(Tier::Smartest);
+        let specs = download_specs(smartest);
+        assert_eq!(specs.len(), 1);
+        assert_eq!(
+            specs[0].url,
+            format!(
+                "https://huggingface.co/{}/resolve/{}/{}",
+                smartest.repo, smartest.revision, smartest.file_name
+            )
+        );
+    }
+
+    #[test]
+    fn to_installed_model_maps_fields() {
+        let balanced = starter(Tier::Balanced);
+        let m = to_installed_model(balanced);
+        assert_eq!(m.id, format!("{}:{}", balanced.repo, balanced.file_name));
+        assert_eq!(m.display_name, balanced.display_name);
+        assert_eq!(m.repo, balanced.repo);
+        assert_eq!(m.revision, balanced.revision);
+        assert_eq!(m.file_name, balanced.file_name);
+        assert_eq!(m.sha256, balanced.sha256);
+        assert_eq!(m.size_bytes, balanced.size_bytes);
+        assert_eq!(m.quant, balanced.quant);
+        assert_eq!(m.vision, balanced.vision);
+        assert_eq!(m.thinking, balanced.thinking);
+        assert_eq!(m.mmproj_file.as_deref(), balanced.mmproj_file);
+        assert_eq!(m.mmproj_sha256.as_deref(), balanced.mmproj_sha256);
+
+        // Text-only starter: the Option fields map to None.
+        let smartest = starter(Tier::Smartest);
+        let m = to_installed_model(smartest);
+        assert_eq!(m.id, format!("{}:{}", smartest.repo, smartest.file_name));
+        assert_eq!(m.mmproj_file, None);
+        assert_eq!(m.mmproj_sha256, None);
+    }
+}

From 7b45ba781fb9bb8004afb22a364b9825c3a3b6a0 Mon Sep 17 00:00:00 2001
From: Logan Nguyen <lg.131.dev@gmail.com>
Date: Wed, 10 Jun 2026 22:06:58 -0500
Subject: [PATCH 10/13] feat: add model download and library Tauri commands

Signed-off-by: Logan Nguyen <lg.131.dev@gmail.com>
---
 docs/configurations.md           |    3 +
 src-tauri/src/config/defaults.rs |   14 +
 src-tauri/src/lib.rs             |   24 +
 src-tauri/src/models/download.rs |    2 +-
 src-tauri/src/models/mod.rs      | 1376 +++++++++++++++++++++++++++++-
 src-tauri/src/models/registry.rs |   10 +-
 6 files changed, 1420 insertions(+), 9 deletions(-)

diff --git a/docs/configurations.md b/docs/configurations.md
index a407a3a0..9d8bf2d2 100644
--- a/docs/configurations.md
+++ b/docs/configurations.md
@@ -177,6 +177,9 @@ The table below also lists the baked-in safety limits that govern Thuki's commun
 | `ENGINE_HEALTH_PROBE_TIMEOUT_SECS`          | `5 s`    | No       | Internal lifecycle contract between the runner and the engine process. A wedged-but-connected server must not park the poll loop forever; loopback probes are normally instant so 5 s is generous. The poll interval and deadline are the user-facing knobs. | —      | How long a single `/health` GET is allowed to take inside the startup poll loop. If the engine has accepted the TCP connection but stopped responding, this timeout causes the probe to return an error (treated as Wait and retried after `ENGINE_HEALTH_POLL_INTERVAL_MS`). |
 | `ENGINE_COMMAND_QUEUE_CAPACITY`             | `64`     | No       | Bounds memory under command bursts; 64 slots is ample for all UI-driven traffic (Ensure, Touch, SetIdleMinutes, Shutdown) under any realistic usage pattern. | —      | Capacity of the bounded `mpsc` channel that carries commands from `EngineHandle` to the runner actor task. Back-pressure from a full queue is not observable in normal use. |
 | `DOWNLOAD_PROGRESS_MIN_INTERVAL_MS`         | `500 ms` | No       | Pure IPC hygiene: a fast local connection can deliver thousands of chunks per second and the UI only needs a few updates per second, so throttling below the UI refresh rate is invisible to the user. | —      | Minimum interval between `Progress` events emitted while a model file downloads. An update is also emitted whenever at least 1% of the file has arrived since the last one, whichever comes first, and a final 100% update always precedes verification. |
+| `MAX_HF_API_BODY_BYTES`                     | `4 MiB`  | No       | Defense-in-depth bound on attacker-controlled data from a remote service, mirroring `MAX_OLLAMA_TAGS_BODY_BYTES`. | —      | The largest Hugging Face API response body (repo file listings) Thuki will accept while resolving a model to download. Larger responses are rejected mid-stream and the request returns an error. |
+| `HF_API_TIMEOUT_SECS`                       | `15 s`   | No       | Protocol cap on a hung remote service so the download UI cannot stall on metadata resolution; 15 s is generous for a small metadata call over the internet. | —      | How long Thuki waits for a Hugging Face API metadata call (repo file listing) to respond before giving up. Applies to resolving pasted repo ids and listing a repo's GGUF files, not to the model download itself. |
+| `HF_BASE_URL`                               | `https://huggingface.co` | No | Single origin for model metadata and downloads; the sha256-pinning and provenance model assume the canonical Hub. Pointing downloads at an arbitrary mirror would bypass the integrity guarantees that make the curated starter registry safe. | — | The Hugging Face origin Thuki uses for all model metadata calls and blob downloads. Every starter in the registry pins a repo at an exact revision and carries a sha256 digest verified on install; those digests are read from this origin and only meaningful against it. |
 
 ### `[prompt]`
 
diff --git a/src-tauri/src/config/defaults.rs b/src-tauri/src/config/defaults.rs
index c89ed86a..82d31687 100644
--- a/src-tauri/src/config/defaults.rs
+++ b/src-tauri/src/config/defaults.rs
@@ -350,6 +350,20 @@ pub const MAX_OLLAMA_TAGS_BODY_BYTES: usize = 4 * 1024 * 1024;
 /// any real model and bounds attacker-controlled inputs.
 pub const MAX_OLLAMA_SHOW_BODY_BYTES: usize = 4 * 1024 * 1024;
 
+/// Maximum accepted body size for Hugging Face API responses (repo file
+/// listings). Bounds attacker-controlled data from a remote service,
+/// mirroring MAX_OLLAMA_TAGS_BODY_BYTES.
+pub const MAX_HF_API_BODY_BYTES: usize = 4 * 1024 * 1024;
+
+/// Per-request timeout (seconds) for Hugging Face API metadata calls.
+pub const HF_API_TIMEOUT_SECS: u64 = 15;
+
+/// Canonical Hugging Face origin used for both model metadata calls and blob
+/// downloads. Not user-tunable: the sha256-pinning + provenance model assumes
+/// the canonical Hub; pointing downloads at an arbitrary mirror would bypass
+/// the integrity guarantees that make the curated starter registry safe.
+pub const HF_BASE_URL: &str = "https://huggingface.co";
+
 /// Maximum accepted byte length for a model slug passed to `set_active_model`.
 /// Real Ollama slugs are a handful of characters; 256 is generous while still
 /// capping adversarial inputs long before any network or database work.
diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs
index 959acc9b..9a657166 100644
--- a/src-tauri/src/lib.rs
+++ b/src-tauri/src/lib.rs
@@ -1814,6 +1814,12 @@ pub fn run() {
             app.manage(models::ModelCapabilitiesCache::default());
             app.manage(history::Database(std::sync::Mutex::new(db_conn)));
 
+            // ── Model blob store + download slot for the built-in engine ──
+            let model_store = models::storage::ModelStore::new(app_data_dir.join("models"))
+                .expect("failed to initialise model blob store");
+            app.manage(model_store);
+            app.manage(models::DownloadState::default());
+
             // ── Orphaned image cleanup (startup + periodic) ─────────
             run_image_cleanup(app.handle());
             spawn_periodic_image_cleanup(app.handle().clone());
@@ -1857,6 +1863,24 @@ pub fn run() {
             #[cfg(not(coverage))]
             models::get_model_capabilities,
             #[cfg(not(coverage))]
+            models::get_starter_options,
+            #[cfg(not(coverage))]
+            models::get_system_ram_bytes,
+            #[cfg(not(coverage))]
+            models::download_starter,
+            #[cfg(not(coverage))]
+            models::download_repo_model,
+            #[cfg(not(coverage))]
+            models::list_hf_repo_ggufs,
+            #[cfg(not(coverage))]
+            models::cancel_model_download,
+            #[cfg(not(coverage))]
+            models::discard_partial_download,
+            #[cfg(not(coverage))]
+            models::list_installed_models,
+            #[cfg(not(coverage))]
+            models::delete_installed_model,
+            #[cfg(not(coverage))]
             history::save_conversation,
             #[cfg(not(coverage))]
             history::persist_message,
diff --git a/src-tauri/src/models/download.rs b/src-tauri/src/models/download.rs
index c1cd252f..95715aaf 100644
--- a/src-tauri/src/models/download.rs
+++ b/src-tauri/src/models/download.rs
@@ -340,7 +340,7 @@ fn map_storage_error(e: StorageError) -> DownloadIoError {
 
 /// True when `s` is exactly 64 lowercase ASCII hex chars: the only shape a
 /// sha256 may have before it is used as a file name in the store.
-fn is_valid_sha256(s: &str) -> bool {
+pub(crate) fn is_valid_sha256(s: &str) -> bool {
     s.len() == 64 && s.bytes().all(|b| matches!(b, b'0'..=b'9' | b'a'..=b'f'))
 }
 
diff --git a/src-tauri/src/models/mod.rs b/src-tauri/src/models/mod.rs
index 722964ec..e98860a5 100644
--- a/src-tauri/src/models/mod.rs
+++ b/src-tauri/src/models/mod.rs
@@ -25,10 +25,12 @@ use std::sync::Mutex;
 
 use futures_util::StreamExt;
 use serde::{Deserialize, Serialize};
+use tauri::Manager;
 
 use crate::config::defaults::{
     DEFAULT_OLLAMA_SHOW_REQUEST_TIMEOUT_SECS, DEFAULT_OLLAMA_TAGS_REQUEST_TIMEOUT_SECS,
-    MAX_MODEL_SLUG_LEN, MAX_OLLAMA_SHOW_BODY_BYTES, MAX_OLLAMA_TAGS_BODY_BYTES,
+    HF_API_TIMEOUT_SECS, HF_BASE_URL, MAX_HF_API_BODY_BYTES, MAX_MODEL_SLUG_LEN,
+    MAX_OLLAMA_SHOW_BODY_BYTES, MAX_OLLAMA_TAGS_BODY_BYTES, PROVIDER_ID_BUILTIN,
 };
 use crate::config::AppConfig;
 
@@ -789,6 +791,706 @@ async fn reconcile_capabilities(
     hits
 }
 
+// ─── Model library (built-in engine downloads) ──────────────────────────────
+
+/// Stable error returned when a repo id fails [`is_valid_repo_id`].
+const INVALID_REPO_ID_ERR: &str = "invalid Hugging Face repo id";
+
+/// Single-slot registry for the (at most one) in-flight model download.
+/// Holds `Some(token)` from [`claim_download`] until [`release_download`]
+/// resets it to `None`; [`cancel_active_download`] cancels the token but
+/// leaves the slot occupied, so a second download cannot start meanwhile.
+#[derive(Default)]
+pub struct DownloadState(pub std::sync::Mutex<Option<tokio_util::sync::CancellationToken>>);
+
+/// Takes the one download slot under the state lock and hands back the
+/// cancellation token now stored in it. Fails with an error string when
+/// the slot is already occupied or the mutex is poisoned.
+pub fn claim_download(
+    state: &DownloadState,
+) -> Result<tokio_util::sync::CancellationToken, String> {
+    let mut slot = state.0.lock().map_err(|poisoned| poisoned.to_string())?;
+    if slot.is_none() {
+        // Store the token and return a clone; both share one cancel state.
+        let fresh = tokio_util::sync::CancellationToken::new();
+        return Ok(slot.insert(fresh).clone());
+    }
+    Err("a download is already in progress".to_string())
+}
+
+/// Empties the download slot so the next download can be claimed. Runs on
+/// the task teardown path, so it is best-effort: when the lock is poisoned
+/// there is nothing useful left to do and the call returns silently.
+pub fn release_download(state: &DownloadState) {
+    let Ok(mut slot) = state.0.lock() else { return };
+    *slot = None;
+}
+
+/// Signals cancellation on the token of the in-flight download, if any. The
+/// slot is deliberately left occupied: the download task notices the
+/// cancellation, emits `Cancelled`, and releases the slot itself. A poisoned
+/// lock makes this a no-op.
+pub fn cancel_active_download(state: &DownloadState) {
+    let Ok(slot) = state.0.lock() else { return };
+    if let Some(active) = &*slot {
+        active.cancel();
+    }
+}
+
+/// Decides whether a finished download is recorded as installed: only a
+/// successful run that the user did not cancel before teardown qualifies.
+pub fn should_finalize(result_ok: bool, cancelled: bool) -> bool {
+    matches!((result_ok, cancelled), (true, false))
+}
+
+/// One row of the starter download picker, built by [`build_starter_options`]:
+/// the compile-time registry entry plus facts specific to this machine.
+#[derive(Debug, Clone, PartialEq, Serialize)]
+pub struct StarterOption {
+    /// The curated registry entry (tier, repo, sizes, license).
+    pub starter: registry::Starter,
+    /// RAM-fit badge computed by `registry::ram_fit` for this machine's RAM.
+    pub fit: registry::RamFit,
+    /// True when the manifest holds a row for this starter's model id.
+    pub installed: bool,
+    /// Length of the partial file kept under the weights sha256, when one exists.
+    pub partial_bytes: Option<u64>,
+}
+
+/// Assembles one picker row per curated starter, combining the manifest's
+/// installed state, any partial file in the blob store, and the RAM-fit
+/// badge for `ram_bytes`. A failed manifest lookup counts as "not installed".
+pub fn build_starter_options(
+    conn: &rusqlite::Connection,
+    store: &storage::ModelStore,
+    ram_bytes: u64,
+) -> Vec<StarterOption> {
+    let mut rows = Vec::with_capacity(registry::STARTERS.len());
+    for entry in registry::STARTERS.iter() {
+        let manifest_id = registry::to_installed_model(entry).id;
+        let lookup = manifest::get(conn, &manifest_id);
+        rows.push(StarterOption {
+            starter: entry.clone(),
+            fit: registry::ram_fit(entry.est_runtime_gb, ram_bytes),
+            installed: matches!(lookup, Ok(Some(_))),
+            partial_bytes: store.existing_partial_len(entry.sha256),
+        });
+    }
+    rows
+}
+
+/// Resolves a frontend tier string (`"fast" | "balanced" | "smartest"`) to
+/// its curated starter; any other string is an error. Registry tests assert
+/// one `STARTERS` entry per [`registry::Tier`], so the lookup cannot miss.
+pub fn starter_for_tier(tier: &str) -> Result<&'static registry::Starter, String> {
+    let wanted = [
+        ("fast", registry::Tier::Fast),
+        ("balanced", registry::Tier::Balanced),
+        ("smartest", registry::Tier::Smartest),
+    ]
+    .into_iter()
+    .find_map(|(name, t)| (name == tier).then_some(t))
+    .ok_or_else(|| format!("unknown starter tier: {tier}"))?;
+    let hit = registry::STARTERS.iter().find(|s| s.tier == wanted);
+    Ok(hit.expect("every tier has a starter"))
+}
+
+/// Model id configured on the provider whose id is `PROVIDER_ID_BUILTIN`;
+/// an empty string when no such provider is configured.
+pub fn builtin_provider_model(config: &AppConfig) -> String {
+    for provider in config.inference.providers.iter() {
+        if provider.id == PROVIDER_ID_BUILTIN {
+            return provider.model.clone();
+        }
+    }
+    String::new()
+}
+
+/// Checks that `repo` has the Hugging Face `<org>/<name>` shape: two segments
+/// of `[A-Za-z0-9_.-]` split by one `/`, each with at least one alphanumeric
+/// (so never empty, `.`, or `..`). Run before the id is embedded in any URL.
+pub fn is_valid_repo_id(repo: &str) -> bool {
+    fn segment_ok(seg: &str) -> bool {
+        let mut has_alnum = false;
+        for byte in seg.bytes() {
+            match byte {
+                b if b.is_ascii_alphanumeric() => has_alnum = true,
+                b'_' | b'.' | b'-' => {}
+                _ => return false,
+            }
+        }
+        has_alnum
+    }
+    repo.split_once('/')
+        .is_some_and(|(org, name)| segment_ok(org) && segment_ok(name))
+}
+
+/// Quantisation token from a GGUF file name: the first `-`/`.` separated
+/// token made of `[A-Z0-9_]` that starts `Q<digit>` after an optional `I`/`T`
+/// (`Q4_K_M`, `IQ4_XS`); all-caps name tokens like `QWEN` are skipped. Empty when none.
+pub fn quant_from_filename(file: &str) -> String {
+    let stem = file.strip_suffix(".gguf").unwrap_or(file);
+    stem.split(['-', '.'])
+        .find(|&t| {
+            let core = t.strip_prefix(['I', 'T']).unwrap_or(t);
+            core.strip_prefix('Q')
+                .is_some_and(|rest| rest.starts_with(|c: char| c.is_ascii_digit()))
+                && t.bytes().all(|b| matches!(b, b'A'..=b'Z' | b'0'..=b'9' | b'_'))
+        })
+        .map(str::to_string)
+        .unwrap_or_default()
+}
+
+/// One non-`mmproj` `.gguf` file from a Hugging Face repo listing, for the paste-a-repo UI.
+#[derive(Debug, Clone, PartialEq, Serialize)]
+pub struct HfGgufFile {
+    /// File name within the repo (`rfilename`).
+    pub file: String,
+    /// Size in bytes: the LFS size if reported, else the plain size, else 0.
+    pub size_bytes: u64,
+}
+
+/// Subset of the HF `/api/models/<repo>?blobs=true` response Thuki consumes.
+#[derive(Deserialize)]
+struct HfRepoInfo {
+    /// Current commit SHA of the repo's default branch; pinned as the
+    /// manifest revision so later repo pushes cannot change what was vetted.
+    #[serde(default)] // absent key decodes to `None`; shape is checked by the caller
+    sha: Option<String>,
+    #[serde(default)] // absent key decodes to an empty list
+    siblings: Vec<HfSibling>, // the repo's file entries
+}
+
+/// One repo file in the HF listing. Only LFS-backed `.gguf` files matter.
+#[derive(Deserialize)]
+struct HfSibling {
+    #[serde(default)] // absent key decodes to an empty name
+    rfilename: String, // file name within the repo
+    /// Plain (non-LFS) size; fallback for the file browser listing.
+    #[serde(default)]
+    size: Option<u64>,
+    #[serde(default)]
+    lfs: Option<HfLfs>, // LFS pointer metadata; `None` when the entry carries none
+}
+
+/// LFS pointer metadata: the digest the downloader verifies against.
+#[derive(Deserialize)]
+struct HfLfs {
+    #[serde(default)]
+    sha256: Option<String>, // digest as reported by the API; not validated here
+    #[serde(default)]
+    size: Option<u64>, // blob size in bytes
+}
+
+/// The sibling's `(sha256, size)` LFS pair; `None` unless the pointer has both.
+fn lfs_digest(s: &HfSibling) -> Option<(String, u64)> {
+    let pointer = s.lfs.as_ref()?;
+    pointer.sha256.clone().zip(pointer.size)
+}
+
+/// Result of resolving a pasted repo id + file against the HF listing: the
+/// pinned commit, the weights digest/size, and an optional mmproj companion.
+#[derive(Debug, Clone, PartialEq)]
+pub struct RepoResolved {
+    /// 40-char lowercase-hex commit SHA reported by the API at resolve time.
+    pub revision: String,
+    /// SHA-256 of the weights blob, taken from the file's LFS metadata.
+    pub weights_sha256: String,
+    /// Weights file size in bytes, taken from the file's LFS metadata.
+    pub weights_size_bytes: u64,
+    /// Vision projection companion, when present in the repo.
+    pub mmproj: Option<MmprojCompanion>,
+}
+
+/// An `mmproj*.gguf` sibling shipped next to the weights file.
+#[derive(Debug, Clone, PartialEq)]
+pub struct MmprojCompanion {
+    pub file: String, // file name within the repo (`rfilename`)
+    pub sha256: String, // digest from the sibling's LFS metadata
+    pub size_bytes: u64, // size from the sibling's LFS metadata
+}
+
+/// Parses an HF repo listing (pure, no I/O) into the spec for the target
+/// `file`. Pasted-repo capability rule: vision = the first `mmproj*.gguf`
+/// sibling with complete LFS metadata; thinking = false (detection: Phase 3).
+pub fn resolve_listing(body: &[u8], file: &str) -> Result<RepoResolved, String> {
+    let listing: HfRepoInfo = serde_json::from_slice(body)
+        .map_err(|e| format!("failed to decode Hugging Face API response: {e}"))?;
+    let is_hex = |b: u8| matches!(b, b'0'..=b'9' | b'a'..=b'f');
+    let revision = match listing.sha {
+        Some(rev) if rev.len() == 40 && rev.bytes().all(is_hex) => rev,
+        _ => return Err("Hugging Face API response carries no valid commit sha".to_string()),
+    };
+    let Some(target) = listing.siblings.iter().find(|s| s.rfilename == file) else {
+        return Err(format!("file not found in repo: {file}"));
+    };
+    let Some((weights_sha256, weights_size_bytes)) = lfs_digest(target) else {
+        return Err(format!("file has no LFS digest metadata: {file}"));
+    };
+    let mut mmproj = None;
+    for sib in &listing.siblings {
+        let name = &sib.rfilename;
+        if !(name.starts_with("mmproj") && name.ends_with(".gguf")) {
+            continue;
+        }
+        if let Some((sha256, size_bytes)) = lfs_digest(sib) {
+            mmproj = Some(MmprojCompanion {
+                file: name.clone(),
+                sha256,
+                size_bytes,
+            });
+            break;
+        }
+    }
+    Ok(RepoResolved {
+        revision,
+        weights_sha256,
+        weights_size_bytes,
+        mmproj,
+    })
+}
+
+/// Parses an HF repo listing (pure, no I/O) into the `.gguf` rows shown in
+/// the file browser. `mmproj*` companions are left out: they download with
+/// their weights file and are never picked directly.
+pub fn parse_gguf_listing(body: &[u8]) -> Result<Vec<HfGgufFile>, String> {
+    let listing: HfRepoInfo = serde_json::from_slice(body)
+        .map_err(|e| format!("failed to decode Hugging Face API response: {e}"))?;
+    let mut rows = Vec::new();
+    for sib in listing.siblings {
+        if sib.rfilename.starts_with("mmproj") || !sib.rfilename.ends_with(".gguf") {
+            continue;
+        }
+        let size_bytes = sib.lfs.and_then(|l| l.size).or(sib.size).unwrap_or(0);
+        rows.push(HfGgufFile {
+            file: sib.rfilename,
+            size_bytes,
+        });
+    }
+    Ok(rows)
+}
+
+/// Production entry point for the repo listing request: GETs
+/// `<base>/api/models/<repo>?blobs=true` and returns the raw body bytes,
+/// applying the `HF_API_TIMEOUT_SECS` timeout and the
+/// `MAX_HF_API_BODY_BYTES` body cap.
+async fn fetch_hf_repo_listing(
+    client: &reqwest::Client,
+    base_url: &str,
+    repo: &str,
+) -> Result<Vec<u8>, String> {
+    // Production limits; the inner function takes them as parameters so
+    // tests can drive the timeout and cap branches with small values.
+    let timeout = std::time::Duration::from_secs(HF_API_TIMEOUT_SECS);
+    let body_cap = MAX_HF_API_BODY_BYTES;
+    let request = fetch_hf_repo_listing_inner(client, base_url, repo, timeout, body_cap);
+    request.await
+}
+
+/// Innermost HF metadata fetcher with timeout and body cap configurable so
+/// the cap branches are testable. The cap is enforced incrementally during
+/// the streaming read, mirroring [`fetch_installed_model_names_inner`].
+async fn fetch_hf_repo_listing_inner(
+    client: &reqwest::Client,
+    base_url: &str,
+    repo: &str,
+    timeout: std::time::Duration,
+    max_body_bytes: usize,
+) -> Result<Vec<u8>, String> {
+    let url = format!(
+        "{}/api/models/{}?blobs=true",
+        base_url.trim_end_matches('/'),
+        repo
+    );
+    let response = client
+        .get(&url)
+        .timeout(timeout)
+        .send()
+        .await
+        .map_err(|e| format!("failed to reach Hugging Face: {e}"))?;
+
+    if !response.status().is_success() {
+        return Err(format!(
+            "Hugging Face API returned HTTP {}",
+            response.status().as_u16()
+        ));
+    }
+    // Early rejection on the declared length; compared as u64 so a huge value cannot wrap.
+    if let Some(declared_len) = response.content_length() {
+        if declared_len > max_body_bytes as u64 {
+            return Err(format!(
+                "Hugging Face API response exceeded {max_body_bytes} bytes"
+            ));
+        }
+    }
+
+    let mut stream = response.bytes_stream();
+    let mut buf: Vec<u8> = Vec::new();
+    while let Some(chunk) = stream.next().await {
+        let chunk = chunk.map_err(|e| format!("failed to read Hugging Face API body: {e}"))?;
+        if buf.len() + chunk.len() > max_body_bytes {
+            return Err(format!(
+                "Hugging Face API response exceeded {max_body_bytes} bytes"
+            ));
+        }
+        buf.extend_from_slice(&chunk);
+    }
+
+    Ok(buf)
+}
+
+/// Checks `repo`, fetches its listing from `base_url`, and resolves the
+/// download spec for `file` (plus the mmproj companion when present).
+/// Tests pass a mock server as `base_url`; production passes [`HF_BASE_URL`].
+pub async fn resolve_repo_spec(
+    client: &reqwest::Client,
+    base_url: &str,
+    repo: &str,
+    file: &str,
+) -> Result<RepoResolved, String> {
+    if is_valid_repo_id(repo) {
+        let listing = fetch_hf_repo_listing(client, base_url, repo).await?;
+        resolve_listing(&listing, file)
+    } else {
+        Err(INVALID_REPO_ID_ERR.to_string())
+    }
+}
+
+/// Checks `repo` and returns the `.gguf` rows for the paste-a-repo browser.
+/// Issues the same API call as [`resolve_repo_spec`].
+pub async fn fetch_repo_gguf_listing(
+    client: &reqwest::Client,
+    base_url: &str,
+    repo: &str,
+) -> Result<Vec<HfGgufFile>, String> {
+    if is_valid_repo_id(repo) {
+        let listing = fetch_hf_repo_listing(client, base_url, repo).await?;
+        return parse_gguf_listing(&listing);
+    }
+    Err(INVALID_REPO_ID_ERR.to_string())
+}
+
+/// Builds the download specs for a resolved repo model: the weights file
+/// first, then the mmproj companion when `resolved.mmproj` is set.
+///
+/// Every URL has the shape `<base>/<repo>/resolve/<revision>/<file>`, the
+/// same shape [`registry::download_specs`] produces, with trailing `/`
+/// characters trimmed from `base_url`. Digests and sizes are copied from
+/// `resolved` unchanged.
+pub fn repo_download_specs(
+    base_url: &str,
+    repo: &str,
+    file: &str,
+    resolved: &RepoResolved,
+) -> Vec<download::DownloadSpec> {
+    let origin = base_url.trim_end_matches('/');
+    let revision = &resolved.revision;
+    // Both files live under the same `<base>/<repo>/resolve/<revision>/` prefix.
+    let make_spec = |name: &str, sha256: &str, total_bytes: u64| {
+        let url = format!("{origin}/{repo}/resolve/{revision}/{name}");
+        download::DownloadSpec {
+            url,
+            file: name.to_string(),
+            sha256: sha256.to_string(),
+            total_bytes,
+        }
+    };
+
+    // Weights always come first; the companion follows when present.
+    let weights = make_spec(file, &resolved.weights_sha256, resolved.weights_size_bytes);
+    let mut specs = vec![weights];
+    if let Some(mm) = resolved.mmproj.as_ref() {
+        specs.push(make_spec(&mm.file, &mm.sha256, mm.size_bytes));
+    }
+    specs
+}
+
+/// Manifest row for a resolved repo model: id `"<repo>:<file>"`, name = file stem, pinned revision.
+pub fn repo_installed_model(
+    repo: &str,
+    file: &str,
+    resolved: &RepoResolved,
+) -> manifest::InstalledModel {
+    let companion = resolved.mmproj.as_ref();
+    manifest::InstalledModel {
+        id: [repo, file].join(":"),
+        repo: repo.to_owned(),
+        file_name: file.to_owned(),
+        display_name: file.strip_suffix(".gguf").unwrap_or(file).to_owned(),
+        quant: quant_from_filename(file),
+        revision: resolved.revision.clone(),
+        sha256: resolved.weights_sha256.clone(),
+        size_bytes: resolved.weights_size_bytes,
+        vision: companion.is_some(),
+        thinking: false,
+        mmproj_file: companion.map(|m| m.file.clone()),
+        mmproj_sha256: companion.map(|m| m.sha256.clone()),
+    }
+}
+
+/// Result of [`delete_installed_model_inner`], consumed by the thin Tauri wrapper.
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub struct DeleteOutcome {
+    /// True when the deleted id equals the builtin provider's configured
+    /// model, so the wrapper must clear that provider's `model` field.
+    pub clear_builtin: bool,
+}
+
+/// Deletes `id` from the manifest, then removes the blobs `manifest::delete` returned.
+/// Sets `clear_builtin` when `id` equals `builtin_model` (the builtin provider's model).
+pub fn delete_installed_model_inner(
+    conn: &rusqlite::Connection,
+    store: &storage::ModelStore,
+    id: &str,
+    builtin_model: &str,
+) -> Result<DeleteOutcome, String> {
+    let clear_builtin = id == builtin_model;
+    let orphaned = manifest::delete(conn, id).map_err(|err| err.to_string())?;
+    match store.remove_blobs(&orphaned) {
+        Ok(_) => Ok(DeleteOutcome { clear_builtin }),
+        Err(err) => Err(err.to_string()),
+    }
+}
+
+/// Deletes the partial file for `sha256` so the next download starts fresh.
+/// Rejects malformed digests (the digest doubles as a file name) and any call
+/// made while a download is claimed; the state lock is held until return.
+pub fn discard_partial_inner(
+    state: &DownloadState,
+    store: &storage::ModelStore,
+    sha256: &str,
+) -> Result<(), String> {
+    if !download::is_valid_sha256(sha256) {
+        return Err("invalid sha256".to_string());
+    }
+    let slot = state.0.lock().map_err(|poisoned| poisoned.to_string())?;
+    if slot.is_some() {
+        return Err("a download is already in progress".to_string());
+    }
+    let removal = std::fs::remove_file(store.partial_path(sha256));
+    match removal {
+        Err(e) if e.kind() != std::io::ErrorKind::NotFound => {
+            Err(format!("failed to remove partial download: {e}"))
+        }
+        _ => Ok(()),
+    }
+}
+
+/// Reads total physical RAM in bytes from `sysctlbyname("hw.memsize")`.
+/// Returns 0 when the syscall reports failure.
+///
+/// Excluded from the cargo coverage gate: the body is a direct OS syscall
+/// with no branching beyond error propagation, so instrumenting it would be
+/// meaningless (mirrors `storage::free_disk_bytes`).
+#[cfg_attr(coverage_nightly, coverage(off))]
+pub fn system_ram_bytes() -> u64 {
+    let mut mem_size: u64 = 0;
+    let mut buf_len: libc::size_t = std::mem::size_of::<u64>();
+    let out = (&mut mem_size as *mut u64).cast::<libc::c_void>();
+    // SAFETY: `out` points at `mem_size`, a valid 8-byte buffer, and `buf_len`
+    // carries its exact size; `sysctlbyname` writes at most `buf_len` bytes on
+    // success (return value 0). The name is a static NUL-terminated literal.
+    let status = unsafe {
+        libc::sysctlbyname(
+            c"hw.memsize".as_ptr(),
+            out,
+            &mut buf_len,
+            std::ptr::null_mut(),
+            0,
+        )
+    };
+    if status != 0 {
+        return 0;
+    }
+    mem_size
+}
+
+// ─── Model library Tauri commands (thin wrappers) ───────────────────────────
+
+/// Builds the rows shown in the starter picker: each registry starter with
+/// its RAM-fit hint, installed flag, and size of any resumable partial.
+#[cfg_attr(coverage_nightly, coverage(off))]
+#[cfg_attr(not(coverage), tauri::command)]
+pub fn get_starter_options(
+    db: tauri::State<'_, crate::history::Database>,
+    store: tauri::State<'_, storage::ModelStore>,
+) -> Result<Vec<StarterOption>, String> {
+    let guard = db.0.lock().map_err(|err| err.to_string())?;
+    Ok(build_starter_options(&guard, &store, system_ram_bytes()))
+}
+
+/// Tauri command exposing `system_ram_bytes()` (total physical RAM, bytes) to the frontend.
+#[cfg_attr(coverage_nightly, coverage(off))]
+#[cfg_attr(not(coverage), tauri::command)]
+pub fn get_system_ram_bytes() -> u64 {
+    system_ram_bytes()
+}
+
+/// Kicks off the download of one curated starter model, selected by `tier`
+/// ("fast" | "balanced" | "smartest"). An unknown tier, or a download that
+/// is already running, returns an error before anything is spawned.
+/// Progress streams over `on_event`; on success the model is recorded in the
+/// manifest and set as the builtin provider's model.
+#[cfg_attr(coverage_nightly, coverage(off))]
+#[cfg_attr(not(coverage), tauri::command)]
+pub fn download_starter(
+    tier: String,
+    on_event: tauri::ipc::Channel<download::DownloadEvent>,
+    app: tauri::AppHandle,
+    download_state: tauri::State<'_, DownloadState>,
+) -> Result<(), String> {
+    // Resolve the tier first so a bad value never occupies the download slot.
+    let starter = starter_for_tier(&tier)?;
+    let cancel_token = claim_download(&download_state)?;
+    let specs = registry::download_specs(starter);
+    let model = registry::to_installed_model(starter);
+    // The spawned task takes the token and releases the slot when it ends.
+    spawn_model_download(app, specs, model, cancel_token, on_event);
+    Ok(())
+}
+
+/// Kicks off the download of a model file from a user-supplied Hugging Face
+/// repo. The Hugging Face API is queried first (via `resolve_repo_spec`) for
+/// the file's digest, size, pinned revision, and optional mmproj companion.
+/// Progress is then streamed over `on_event` by the spawned download task;
+/// this command itself returns as soon as that task has been started.
+#[cfg_attr(coverage_nightly, coverage(off))]
+#[cfg_attr(not(coverage), tauri::command)]
+pub async fn download_repo_model(
+    repo: String,
+    file: String,
+    on_event: tauri::ipc::Channel<download::DownloadEvent>,
+    app: tauri::AppHandle,
+    client: tauri::State<'_, reqwest::Client>,
+    download_state: tauri::State<'_, DownloadState>,
+) -> Result<(), String> {
+    let resolved = resolve_repo_spec(&client, HF_BASE_URL, &repo, &file).await?;
+    // Claim only after resolution succeeds, so a failed lookup never holds the slot.
+    let cancel_token = claim_download(&download_state)?;
+    let specs = repo_download_specs(HF_BASE_URL, &repo, &file, &resolved);
+    let model = repo_installed_model(&repo, &file, &resolved);
+    spawn_model_download(app, specs, model, cancel_token, on_event);
+    Ok(())
+}
+
+/// Tauri command: lists a Hugging Face repo's `.gguf` files for the paste-a-repo UI.
+#[cfg_attr(coverage_nightly, coverage(off))]
+#[cfg_attr(not(coverage), tauri::command)]
+pub async fn list_hf_repo_ggufs(
+    repo: String,
+    client: tauri::State<'_, reqwest::Client>,
+) -> Result<Vec<HfGgufFile>, String> {
+    fetch_repo_gguf_listing(&client, HF_BASE_URL, &repo).await
+}
+
+/// Signals the in-flight model download, if any, to cancel; idle is a no-op.
+/// The download task emits `Cancelled` and keeps the partial for a later resume.
+#[cfg_attr(coverage_nightly, coverage(off))]
+#[cfg_attr(not(coverage), tauri::command)]
+pub fn cancel_model_download(download_state: tauri::State<'_, DownloadState>) {
+    cancel_active_download(&download_state);
+}
+
+/// Tauri wrapper over `discard_partial_inner`: drops the partial for `sha256` (Discard, not Resume).
+#[cfg_attr(coverage_nightly, coverage(off))]
+#[cfg_attr(not(coverage), tauri::command)]
+pub fn discard_partial_download(
+    sha256: String,
+    download_state: tauri::State<'_, DownloadState>,
+    store: tauri::State<'_, storage::ModelStore>,
+) -> Result<(), String> {
+    discard_partial_inner(&download_state, &store, &sha256)
+}
+
+/// Lists all models recorded in the install manifest.
+#[cfg_attr(coverage_nightly, coverage(off))]
+#[cfg_attr(not(coverage), tauri::command)]
+pub fn list_installed_models(
+    db: tauri::State<'_, crate::history::Database>,
+) -> Result<Vec<manifest::InstalledModel>, String> {
+    let guard = db.0.lock().map_err(|err| err.to_string())?;
+    manifest::list(&guard).map_err(|e| e.to_string())
+}
+
+/// Removes an installed model: its manifest row and orphaned blobs, and, when
+/// it was the builtin provider's selected model, that provider's `model` field.
+#[cfg_attr(coverage_nightly, coverage(off))]
+#[cfg_attr(not(coverage), tauri::command)]
+pub fn delete_installed_model(
+    id: String,
+    app: tauri::AppHandle,
+    db: tauri::State<'_, crate::history::Database>,
+    store: tauri::State<'_, storage::ModelStore>,
+    config: tauri::State<'_, parking_lot::RwLock<AppConfig>>,
+) -> Result<(), String> {
+    let selected = builtin_provider_model(&config.read());
+    let DeleteOutcome { clear_builtin } = {
+        let db_conn = db.0.lock().map_err(|e| e.to_string())?;
+        delete_installed_model_inner(&db_conn, &store, &id, &selected)?
+    };
+    if !clear_builtin {
+        return Ok(());
+    }
+    persist_active_provider_model(&app, &config, PROVIDER_ID_BUILTIN, "")
+}
+
+/// Converts a `finalize_install` error string into the `Failed` event that
+/// should be emitted over the download channel. Pure function; testable without
+/// Tauri state.
+pub(crate) fn finalize_error_event(message: String) -> download::DownloadEvent {
+    download::DownloadEvent::Failed {
+        kind: download::DownloadFailKind::Other,
+        message,
+    }
+}
+
+/// Drives an already-claimed download on the async runtime. Every event is
+/// forwarded to `on_event`; a successful, non-cancelled download is finalized
+/// (manifest row + builtin model), and the slot is released whatever the outcome.
+#[cfg_attr(coverage_nightly, coverage(off))]
+fn spawn_model_download(
+    app: tauri::AppHandle,
+    specs: Vec<download::DownloadSpec>,
+    model: manifest::InstalledModel,
+    token: tokio_util::sync::CancellationToken,
+    on_event: tauri::ipc::Channel<download::DownloadEvent>,
+) {
+    tauri::async_runtime::spawn(async move {
+        let http = app.state::<reqwest::Client>().inner().clone();
+        let failure_channel = on_event.clone();
+        let outcome = {
+            let store = app.state::<storage::ModelStore>();
+            let sink = move |event: download::DownloadEvent| {
+                let _ = on_event.send(event);
+            };
+            download::run_download(&specs, store.inner(), &http, token.clone(), sink).await
+        };
+        if should_finalize(outcome.is_ok(), token.is_cancelled()) {
+            if let Err(e) = finalize_install(&app, &model) {
+                eprintln!("thuki: [models] failed to record installed model: {e}");
+                let _ = failure_channel.send(finalize_error_event(e));
+            }
+        }
+        release_download(&app.state::<DownloadState>());
+    });
+}
+
+/// Persists a finished download in two steps: the manifest insert, then the
+/// builtin provider's `model` field (the active provider is never changed here).
+#[cfg_attr(coverage_nightly, coverage(off))]
+fn finalize_install(
+    app: &tauri::AppHandle,
+    model: &manifest::InstalledModel,
+) -> Result<(), String> {
+    let db = app.state::<crate::history::Database>();
+    {
+        let conn = db.0.lock().map_err(|e| e.to_string())?;
+        manifest::insert(&conn, model).map_err(|e| e.to_string())?;
+    }
+    let app_config = app.state::<parking_lot::RwLock<AppConfig>>();
+    persist_active_provider_model(app, &app_config, PROVIDER_ID_BUILTIN, &model.id)
+}
+
 // ─── Tests ──────────────────────────────────────────────────────────────────
 
 #[cfg(test)]
@@ -2008,6 +2710,678 @@ mod tests {
         assert!(result["x"].vision);
     }
 
+    // ── Model library: starter options ───────────────────────────────────────
+
+    /// Creates a `ModelStore` rooted in a fresh temporary directory, returned alongside it.
+    fn make_store() -> (tempfile::TempDir, storage::ModelStore) {
+        let tmp = tempfile::TempDir::new().unwrap();
+        let root = tmp.path().to_path_buf();
+        (tmp, storage::ModelStore::new(root).unwrap())
+    }
+
+    #[test]
+    fn build_starter_options_marks_installed_and_partial() {
+        let conn = crate::database::open_in_memory().unwrap();
+        let (_dir, store) = make_store();
+
+        // Starter 0 gets a manifest row (installed), starter 1 gets a 10-byte
+        // partial on disk, and starter 2 is left untouched.
+        let curated = registry::STARTERS;
+        manifest::insert(&conn, &registry::to_installed_model(&curated[0])).unwrap();
+        std::fs::write(store.partial_path(curated[1].sha256), [0u8; 10]).unwrap();
+
+        const GIB: u64 = 1 << 30;
+        let rows = build_starter_options(&conn, &store, 16 * GIB);
+
+        assert_eq!(rows.len(), curated.len());
+        assert_eq!(rows[0].starter, curated[0]);
+        assert!(rows[0].installed);
+        assert_eq!(rows[0].partial_bytes, None);
+        assert!(!rows[1].installed);
+        assert_eq!(rows[1].partial_bytes, Some(10));
+        assert!(!rows[2].installed);
+        assert_eq!(rows[2].partial_bytes, None);
+        // Each row's fit hint must equal registry::ram_fit for the same RAM.
+        for (row, s) in rows.iter().zip(curated) {
+            assert_eq!(row.fit, registry::ram_fit(s.est_runtime_gb, 16 * GIB));
+        }
+    }
+
+    #[test]
+    fn build_starter_options_treats_sql_error_as_not_installed() {
+        let conn = crate::database::open_in_memory().unwrap();
+        conn.execute_batch("DROP TABLE installed_models;").unwrap();
+        let (_dir, store) = make_store();
+        let rows = build_starter_options(&conn, &store, 16 * (1 << 30));
+        assert!(rows.iter().all(|row| !row.installed));
+    }
+
+    #[test]
+    fn starter_option_serializes_for_frontend() {
+        let row = StarterOption {
+            starter: registry::STARTERS[0].clone(),
+            fit: registry::RamFit::Fits,
+            installed: false,
+            partial_bytes: Some(42),
+        };
+        let json = serde_json::to_value(&row).unwrap();
+        assert_eq!(json["fit"], serde_json::json!("fits"));
+        assert_eq!(json["installed"], serde_json::json!(false));
+        assert_eq!(json["partial_bytes"], serde_json::json!(42));
+        assert_eq!(json["starter"]["tier"], serde_json::json!("fast"));
+    }
+
+    // ── Model library: tier parsing ──────────────────────────────────────────
+
+    #[test]
+    fn starter_for_tier_parses_and_rejects() {
+        let known = [
+            ("fast", registry::Tier::Fast),
+            ("balanced", registry::Tier::Balanced),
+            ("smartest", registry::Tier::Smartest),
+        ];
+        for (name, tier) in known {
+            assert_eq!(starter_for_tier(name).unwrap().tier, tier);
+        }
+        // Matching is case-sensitive; empty and unknown names are rejected.
+        for bad in ["Fast", "", "turbo"] {
+            assert!(starter_for_tier(bad).is_err());
+        }
+    }
+
+    // ── Model library: download claim ────────────────────────────────────────
+
+    #[test]
+    fn download_claim_rejects_second_concurrent() {
+        let state = DownloadState::default();
+        let first = claim_download(&state).unwrap();
+        assert!(!first.is_cancelled());
+        let msg = claim_download(&state).unwrap_err();
+        assert_eq!(msg, "a download is already in progress");
+        // Once released, the slot is free and a fresh claim succeeds.
+        release_download(&state);
+        assert!(claim_download(&state).is_ok());
+    }
+
+    #[test]
+    fn cancel_active_download_cancels_claimed_token_and_tolerates_idle() {
+        let state = DownloadState::default();
+        // With nothing claimed, cancelling must simply do nothing.
+        cancel_active_download(&state);
+        let claimed = claim_download(&state).unwrap();
+        cancel_active_download(&state);
+        assert!(claimed.is_cancelled());
+    }
+
+    #[test]
+    fn poisoned_download_state_surfaces_errors_and_tolerates_best_effort_ops() {
+        let state = DownloadState::default();
+        let mutex = std::panic::AssertUnwindSafe(&state.0);
+        let _ = std::panic::catch_unwind(|| {
+            let _held = mutex.0.lock().unwrap();
+            panic!("poison");
+        });
+        assert!(claim_download(&state).is_err());
+        let (_dir, store) = make_store();
+        assert!(discard_partial_inner(&state, &store, &"a".repeat(64)).is_err());
+        // cancel/release are best-effort: a poisoned lock must not make them panic.
+        cancel_active_download(&state);
+        release_download(&state);
+    }
+
+    #[test]
+    fn should_finalize_requires_ok_and_not_cancelled() {
+        for cancelled in [true, false] {
+            assert_eq!(should_finalize(true, cancelled), !cancelled);
+            assert!(!should_finalize(false, cancelled));
+        }
+    }
+
+    #[test]
+    fn finalize_error_event_produces_failed_other_with_message() {
+        // The message must come back untouched, wrapped in a `Failed` event
+        // whose kind is `Other`.
+        let expected = download::DownloadEvent::Failed {
+            kind: download::DownloadFailKind::Other,
+            message: "disk full".to_string(),
+        };
+        let actual = finalize_error_event("disk full".to_string());
+        assert_eq!(actual, expected);
+    }
+
+    // ── Model library: repo id validation ────────────────────────────────────
+
+    #[test]
+    fn repo_id_validation_accepts_two_clean_segments_only() {
+        assert!(is_valid_repo_id("ggml-org/gemma-3-4b-it-GGUF"));
+        assert!(is_valid_repo_id("bartowski/phi-4-GGUF"));
+        assert!(is_valid_repo_id("a_b.c-d/e.f_g-h"));
+        assert!(!is_valid_repo_id(""));
+        assert!(!is_valid_repo_id("no-slash"));
+        assert!(!is_valid_repo_id("a/b/c"));
+        assert!(!is_valid_repo_id("/name"));
+        assert!(!is_valid_repo_id("org/"));
+        assert!(!is_valid_repo_id("org/na me"));
+        assert!(!is_valid_repo_id("org/$(whoami)"));
+        assert!(!is_valid_repo_id("org/name?x=1"));
+        assert!(!is_valid_repo_id("örg/name"));
+        // A "." or ".." segment is a path-traversal risk, so either position must be rejected.
+        assert!(!is_valid_repo_id("org/.."));
+        assert!(!is_valid_repo_id("../repo"));
+        assert!(!is_valid_repo_id("org/."));
+        assert!(!is_valid_repo_id("./repo"));
+    }
+
+    // ── Model library: quant extraction ──────────────────────────────────────
+
+    #[test]
+    fn quant_from_filename_variants() {
+        assert_eq!(quant_from_filename("phi-4-Q4_K_M.gguf"), "Q4_K_M");
+        assert_eq!(quant_from_filename("gemma-3-4b-it-Q4_K_M.gguf"), "Q4_K_M");
+        assert_eq!(quant_from_filename("model.Q8_0.gguf"), "Q8_0"); // '.' before the tag
+        assert_eq!(quant_from_filename("model-IQ4_XS.gguf"), "IQ4_XS");
+        assert_eq!(quant_from_filename("model-f16.gguf"), ""); // f16 yields no quant tag
+        assert_eq!(quant_from_filename("model-q4_k_m.gguf"), ""); // lowercase is not matched
+        assert_eq!(quant_from_filename("no-extension-Q4_0"), "Q4_0"); // no .gguf suffix
+        assert_eq!(quant_from_filename(""), "");
+    }
+
+    // ── Model library: HF listing parse ──────────────────────────────────────
+
+    /// Shared Hugging Face `/api/models/<repo>?blobs=true` response fixture for
+    /// the resolve/listing tests. `sha` (40 x `c`) is the pinned commit; the
+    /// 64 x `a` and 64 x `b` digests belong to the weights and the mmproj.
+    fn hf_fixture() -> serde_json::Value {
+        serde_json::json!({
+            "sha": "c".repeat(40),
+            "siblings": [
+                {"rfilename": "README.md", "size": 10},
+                {"rfilename": "model-Q4_K_M.gguf",
+                 "lfs": {"sha256": "a".repeat(64), "size": 1000}},
+                {"rfilename": "mmproj-model-f16.gguf",
+                 "lfs": {"sha256": "b".repeat(64), "size": 200}},
+                {"rfilename": "extra.gguf", "size": 7},
+                {"rfilename": "bare.gguf"}
+            ]
+        })
+    }
+
+    #[test]
+    fn parse_gguf_listing_filters_mmproj_and_non_gguf() {
+        let json = hf_fixture().to_string();
+        let ggufs = parse_gguf_listing(json.as_bytes()).unwrap();
+        assert_eq!(
+            ggufs,
+            vec![
+                HfGgufFile {
+                    file: "model-Q4_K_M.gguf".to_string(),
+                    size_bytes: 1000
+                },
+                HfGgufFile {
+                    file: "extra.gguf".to_string(),
+                    size_bytes: 7
+                },
+                HfGgufFile {
+                    file: "bare.gguf".to_string(),
+                    size_bytes: 0
+                },
+            ]
+        );
+    }
+
+    #[test]
+    fn parse_gguf_listing_rejects_invalid_json() {
+        let msg = parse_gguf_listing(b"not json").unwrap_err();
+        assert!(msg.contains("failed to decode"), "got: {msg}");
+    }
+
+    #[test]
+    fn hf_gguf_file_serializes_for_frontend() {
+        let listed = HfGgufFile {
+            file: "x.gguf".to_string(),
+            size_bytes: 5,
+        };
+        let v = serde_json::to_value(listed).unwrap();
+        assert_eq!(v, serde_json::json!({"file": "x.gguf", "size_bytes": 5}));
+    }
+
+    // ── Model library: resolve_listing (pure) ───────────────────────────────
+
+    #[test]
+    fn resolve_listing_extracts_weights_revision_and_mmproj() {
+        let json = hf_fixture().to_string();
+        let resolved = resolve_listing(json.as_bytes(), "model-Q4_K_M.gguf").unwrap();
+        assert_eq!(resolved.revision, "c".repeat(40));
+        assert_eq!(resolved.weights_sha256, "a".repeat(64));
+        assert_eq!(resolved.weights_size_bytes, 1000);
+        let companion = resolved.mmproj.unwrap();
+        assert_eq!(companion.file, "mmproj-model-f16.gguf");
+        assert_eq!(companion.sha256, "b".repeat(64));
+        assert_eq!(companion.size_bytes, 200);
+    }
+
+    #[test]
+    fn resolve_listing_rejects_invalid_json() {
+        let msg = resolve_listing(b"not json", "f.gguf").unwrap_err();
+        assert!(msg.contains("failed to decode"), "got: {msg}");
+    }
+
+    #[test]
+    fn resolve_listing_errors_when_file_missing() {
+        let json = hf_fixture().to_string();
+        let msg = resolve_listing(json.as_bytes(), "nope.gguf").unwrap_err();
+        assert!(msg.contains("not found"), "got: {msg}");
+    }
+
+    #[test]
+    fn resolve_listing_errors_when_file_has_no_lfs_digest() {
+        let json = hf_fixture().to_string();
+        // The fixture lists `extra.gguf` with a plain size and no `lfs` object.
+        let msg = resolve_listing(json.as_bytes(), "extra.gguf").unwrap_err();
+        assert!(msg.contains("LFS"), "got: {msg}");
+    }
+
+    #[test]
+    fn resolve_listing_errors_on_missing_or_malformed_revision() {
+        for rev in [serde_json::Value::Null, serde_json::json!("main")] {
+            let mut listing = hf_fixture();
+            listing["sha"] = rev;
+            let json = listing.to_string();
+            let msg = resolve_listing(json.as_bytes(), "model-Q4_K_M.gguf").unwrap_err();
+            assert!(msg.contains("commit"), "got: {msg}");
+        }
+    }
+
+    #[test]
+    fn resolve_listing_skips_mmproj_without_lfs_and_non_gguf_mmproj() {
+        let json = serde_json::json!({
+            "sha": "c".repeat(40),
+            "siblings": [
+                {"rfilename": "w.gguf", "lfs": {"sha256": "a".repeat(64), "size": 9}},
+                {"rfilename": "mmproj-no-lfs.gguf", "size": 5},
+                {"rfilename": "mmproj-wrong-ext.bin",
+                 "lfs": {"sha256": "b".repeat(64), "size": 5}}
+            ]
+        })
+        .to_string();
+        let resolved = resolve_listing(json.as_bytes(), "w.gguf").unwrap();
+        assert!(resolved.mmproj.is_none());
+    }
+
+    #[test]
+    fn resolve_listing_errors_when_lfs_lacks_sha256() {
+        let json = serde_json::json!({
+            "sha": "c".repeat(40),
+            "siblings": [
+                {"rfilename": "w.gguf", "lfs": {"size": 9}}
+            ]
+        })
+        .to_string();
+        let msg = resolve_listing(json.as_bytes(), "w.gguf").unwrap_err();
+        assert!(msg.contains("LFS"), "got: {msg}");
+    }
+
+    // ── Model library: resolve_repo_spec (HTTP) ──────────────────────────────
+
+    #[tokio::test]
+    async fn resolve_repo_spec_finds_file_and_mmproj() {
+        let mut hf_api = mockito::Server::new_async().await;
+        let stub = hf_api
+            .mock("GET", "/api/models/test-org/test-repo?blobs=true")
+            .with_status(200)
+            .with_header("content-type", "application/json")
+            .with_body(hf_fixture().to_string())
+            .create_async()
+            .await;
+
+        let client = reqwest::Client::new();
+        let resolved = resolve_repo_spec(
+            &client,
+            &hf_api.url(),
+            "test-org/test-repo",
+            "model-Q4_K_M.gguf",
+        )
+        .await
+        .unwrap();
+
+        stub.assert_async().await;
+        assert_eq!(resolved.revision, "c".repeat(40));
+        assert_eq!(resolved.weights_sha256, "a".repeat(64));
+        assert!(resolved.mmproj.is_some());
+    }
+
+    #[tokio::test]
+    async fn resolve_repo_spec_missing_file_errors() {
+        let mut hf_api = mockito::Server::new_async().await;
+        let _stub = hf_api
+            .mock("GET", "/api/models/test-org/test-repo?blobs=true")
+            .with_status(200)
+            .with_body(hf_fixture().to_string())
+            .create_async()
+            .await;
+        let client = reqwest::Client::new();
+        let msg = resolve_repo_spec(&client, &hf_api.url(), "test-org/test-repo", "nope.gguf")
+            .await
+            .unwrap_err();
+        assert!(msg.contains("not found"), "got: {msg}");
+    }
+
+    #[tokio::test]
+    async fn resolve_repo_spec_rejects_bad_repo_id() {
+        // The repo id is checked before any request goes out; had one been
+        // sent, the unreachable base URL would surface a transport error instead.
+        let client = reqwest::Client::new();
+        let msg = resolve_repo_spec(&client, "http://127.0.0.1:9", "no-slash", "w.gguf")
+            .await
+            .unwrap_err();
+        assert!(msg.contains("repo id"), "got: {msg}");
+    }
+
+    // ── Model library: HF fetch failure modes ────────────────────────────────
+
+    #[tokio::test]
+    async fn hf_fetch_maps_http_error_to_err_string() {
+        let mut hf_api = mockito::Server::new_async().await;
+        let _stub = hf_api
+            .mock("GET", "/api/models/o/r?blobs=true")
+            .with_status(500)
+            .create_async()
+            .await;
+        let client = reqwest::Client::new();
+        let msg = fetch_hf_repo_listing(&client, &hf_api.url(), "o/r")
+            .await
+            .unwrap_err();
+        assert!(msg.contains("500"), "got: {msg}");
+    }
+
+    #[tokio::test]
+    async fn hf_fetch_maps_transport_error_to_err_string() {
+        let client = reqwest::Client::new();
+        let msg = fetch_hf_repo_listing(&client, "http://127.0.0.1:1", "o/r")
+            .await
+            .unwrap_err();
+        assert!(msg.contains("failed to reach Hugging Face"), "got: {msg}");
+    }
+
+    #[tokio::test]
+    async fn hf_fetch_rejects_body_exceeding_size_cap_via_content_length() {
+        let mut hf_api = mockito::Server::new_async().await;
+        let _stub = hf_api
+            .mock("GET", "/api/models/o/r?blobs=true")
+            .with_status(200)
+            .with_body("x".repeat(100))
+            .create_async()
+            .await;
+        let client = reqwest::Client::new();
+        let msg = fetch_hf_repo_listing_inner(
+            &client,
+            &hf_api.url(),
+            "o/r",
+            std::time::Duration::from_secs(5),
+            32,
+        )
+        .await
+        .unwrap_err();
+        assert!(msg.contains("exceeded"), "got: {msg}");
+    }
+
+    #[tokio::test]
+    async fn hf_fetch_rejects_body_exceeding_size_cap_when_no_content_length() {
+        // The raw server answers with chunked encoding (no Content-Length):
+        // three 10-byte chunks against a 20-byte cap must trip the running total.
+        let acceptor = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
+        let local = acceptor.local_addr().unwrap();
+        std::thread::spawn(move || {
+            let (mut sock, _) = acceptor.accept().unwrap();
+            use std::io::{Read, Write};
+            let mut request = [0u8; 1024];
+            let _ = sock.read(&mut request);
+            let _ = sock.write_all(
+                b"HTTP/1.1 200 OK\r\nTransfer-Encoding: chunked\r\n\r\n\
+                  0a\r\n0123456789\r\n\
+                  0a\r\n0123456789\r\n\
+                  0a\r\n0123456789\r\n\
+                  0\r\n\r\n",
+            );
+        });
+        let client = reqwest::Client::new();
+        let origin = format!("http://{local}");
+        let msg = fetch_hf_repo_listing_inner(
+            &client,
+            &origin,
+            "o/r",
+            std::time::Duration::from_secs(5),
+            20,
+        )
+        .await
+        .unwrap_err();
+        assert!(msg.contains("exceeded"), "got: {msg}");
+    }
+
+    #[tokio::test]
+    async fn hf_fetch_maps_body_read_error_to_err_string() {
+        // The server announces a 100-byte body, sends none, and drops the socket.
+        let acceptor = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
+        let local = acceptor.local_addr().unwrap();
+        std::thread::spawn(move || {
+            let (mut socket, _) = acceptor.accept().unwrap();
+            use std::io::{Read, Write};
+            let mut request = [0u8; 1024];
+            let _ = socket.read(&mut request);
+            let _ = socket.write_all(
+                b"HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: 100\r\nConnection: close\r\n\r\n",
+            );
+        });
+        let client = reqwest::Client::new();
+        let origin = format!("http://{local}");
+        let msg = fetch_hf_repo_listing(&client, &origin, "o/r")
+            .await
+            .unwrap_err();
+        assert!(
+            msg.contains("failed to read Hugging Face API body"),
+            "got: {msg}"
+        );
+    }
+
+    // ── Model library: repo listing wrapper ──────────────────────────────────
+
+    #[tokio::test]
+    async fn fetch_repo_gguf_listing_validates_then_lists() {
+        let client = reqwest::Client::new();
+        // A malformed repo id must fail validation before any request is made.
+        let msg = fetch_repo_gguf_listing(&client, "http://127.0.0.1:9", "no-slash")
+            .await
+            .unwrap_err();
+        assert!(msg.contains("repo id"), "got: {msg}");
+
+        let mut hf_api = mockito::Server::new_async().await;
+        let _stub = hf_api
+            .mock("GET", "/api/models/o/r?blobs=true")
+            .with_status(200)
+            .with_body(hf_fixture().to_string())
+            .create_async()
+            .await;
+        let ggufs = fetch_repo_gguf_listing(&client, &hf_api.url(), "o/r")
+            .await
+            .unwrap();
+        assert_eq!(ggufs.len(), 3);
+        assert_eq!(ggufs[0].file, "model-Q4_K_M.gguf");
+    }
+
+    // ── Model library: repo spec/model mapping ───────────────────────────────
+
+    fn sample_resolved(with_mmproj: bool) -> RepoResolved {
+        RepoResolved {
+            revision: "c".repeat(40), // same value as the `sha` in hf_fixture()
+            weights_sha256: "a".repeat(64), // same as the weights digest in hf_fixture()
+            weights_size_bytes: 1000,
+            mmproj: with_mmproj.then(|| MmprojCompanion {
+                file: "mmproj-model-f16.gguf".to_string(),
+                sha256: "b".repeat(64), // same as the mmproj digest in hf_fixture()
+                size_bytes: 200,
+            }),
+        }
+    }
+
+    #[test]
+    fn repo_download_specs_builds_urls_and_optional_mmproj() {
+        let r = sample_resolved(true);
+        let specs = repo_download_specs("https://huggingface.co/", "o/r", "w-Q4_K_M.gguf", &r);
+        assert_eq!(specs.len(), 2);
+        assert_eq!(
+            specs[0].url,
+            format!(
+                "https://huggingface.co/o/r/resolve/{}/w-Q4_K_M.gguf",
+                r.revision
+            )
+        );
+        assert_eq!(specs[0].file, "w-Q4_K_M.gguf");
+        assert_eq!(specs[0].sha256, r.weights_sha256);
+        assert_eq!(specs[0].total_bytes, 1000);
+        assert_eq!(
+            specs[1].url,
+            format!(
+                "https://huggingface.co/o/r/resolve/{}/mmproj-model-f16.gguf",
+                r.revision
+            )
+        );
+        assert_eq!(specs[1].sha256, "b".repeat(64));
+        assert_eq!(specs[1].total_bytes, 200);
+
+        let no_mmproj = sample_resolved(false);
+        let specs = repo_download_specs("https://huggingface.co", "o/r", "w.gguf", &no_mmproj);
+        assert_eq!(specs.len(), 1);
+    }
+
+    #[test]
+    fn repo_installed_model_maps_fields() {
+        let r = sample_resolved(true);
+        let m = repo_installed_model("o/r", "w-Q4_K_M.gguf", &r);
+        assert_eq!(m.id, "o/r:w-Q4_K_M.gguf");
+        assert_eq!(m.display_name, "w-Q4_K_M");
+        assert_eq!(m.repo, "o/r");
+        assert_eq!(m.revision, r.revision);
+        assert_eq!(m.file_name, "w-Q4_K_M.gguf");
+        assert_eq!(m.sha256, r.weights_sha256);
+        assert_eq!(m.size_bytes, 1000);
+        assert_eq!(m.quant, "Q4_K_M");
+        assert!(m.vision);
+        assert!(!m.thinking);
+        assert_eq!(m.mmproj_file.as_deref(), Some("mmproj-model-f16.gguf"));
+        assert_eq!(m.mmproj_sha256.as_deref(), Some(&*"b".repeat(64)));
+
+        let no_mmproj = sample_resolved(false);
+        let m = repo_installed_model("o/r", "w.gguf", &no_mmproj);
+        assert!(!m.vision);
+        assert!(m.mmproj_file.is_none());
+        assert!(m.mmproj_sha256.is_none());
+    }
+
+    // ── Model library: delete ────────────────────────────────────────────────
+
+    #[test]
+    fn delete_installed_model_inner_removes_orphans_and_flags_builtin_clear() {
+        let conn = crate::database::open_in_memory().unwrap();
+        let (_dir, store) = make_store();
+
+        let r = sample_resolved(true);
+        let m = repo_installed_model("o/r", "w-Q4_K_M.gguf", &r);
+        manifest::insert(&conn, &m).unwrap();
+        std::fs::write(store.blob_path(&m.sha256), b"w").unwrap();
+        std::fs::write(store.blob_path(m.mmproj_sha256.as_ref().unwrap()), b"m").unwrap();
+
+        // Case 1: the builtin provider's model is the one being deleted, so the
+        // outcome must ask the wrapper to reset the provider's model field.
+        let res = delete_installed_model_inner(&conn, &store, &m.id, &m.id).unwrap();
+        assert!(res.clear_builtin);
+        assert!(!store.blob_path(&m.sha256).exists());
+        assert!(!store.blob_path(m.mmproj_sha256.as_ref().unwrap()).exists());
+        assert!(manifest::get(&conn, &m.id).unwrap().is_none());
+
+        // Case 2: the builtin provider selects a different model, so no clear.
+        let m2 = repo_installed_model("o/r2", "x.gguf", &sample_resolved(false));
+        manifest::insert(&conn, &m2).unwrap();
+        std::fs::write(store.blob_path(&m2.sha256), b"x").unwrap();
+        let res = delete_installed_model_inner(&conn, &store, &m2.id, "other:model.gguf").unwrap();
+        assert!(!res.clear_builtin);
+    }
+
+    #[test]
+    fn delete_installed_model_inner_propagates_sql_and_io_errors() {
+        // SQL error path: the manifest table no longer exists.
+        let conn = crate::database::open_in_memory().unwrap();
+        conn.execute_batch("DROP TABLE installed_models;").unwrap();
+        let (_dir, store) = make_store();
+        assert!(delete_installed_model_inner(&conn, &store, "x:y.gguf", "").is_err());
+
+        // I/O error path: the orphaned blob's path is occupied by a directory.
+        let conn = crate::database::open_in_memory().unwrap();
+        let model = repo_installed_model("o/r", "w.gguf", &sample_resolved(false));
+        manifest::insert(&conn, &model).unwrap();
+        std::fs::create_dir_all(store.blob_path(&model.sha256)).unwrap();
+        assert!(delete_installed_model_inner(&conn, &store, &model.id, "").is_err());
+    }
+
+    // ── Model library: discard partial ───────────────────────────────────────
+
+    #[test]
+    fn discard_partial_validates_hex_and_running_state() {
+        let (_dir, store) = make_store();
+        let state = DownloadState::default();
+        let sum = "a".repeat(64);
+
+        // Too-short and non-hex digests must both be refused.
+        assert!(discard_partial_inner(&state, &store, "short").is_err());
+        assert!(discard_partial_inner(&state, &store, &"Z".repeat(64)).is_err());
+
+        // A claimed download slot blocks the discard.
+        let _held = claim_download(&state).unwrap();
+        let msg = discard_partial_inner(&state, &store, &sum).unwrap_err();
+        assert!(msg.contains("in progress"), "got: {msg}");
+        release_download(&state);
+
+        // An existing partial is deleted; repeating the call on a missing one still succeeds.
+        std::fs::write(store.partial_path(&sum), b"bytes").unwrap();
+        discard_partial_inner(&state, &store, &sum).unwrap();
+        assert!(!store.partial_path(&sum).exists());
+        discard_partial_inner(&state, &store, &sum).unwrap();
+    }
+
+    #[test]
+    fn discard_partial_propagates_unexpected_io_error() {
+        let (_dir, store) = make_store();
+        let state = DownloadState::default();
+        let sum = "b".repeat(64);
+        // With a directory sitting at the partial path, remove_file fails with
+        // an error other than NotFound, and that error must reach the caller.
+        std::fs::create_dir_all(store.partial_path(&sum)).unwrap();
+        assert!(discard_partial_inner(&state, &store, &sum).is_err());
+    }
+
+    // ── Model library: builtin provider model ───────────────────────────────
+
+    #[test]
+    fn builtin_provider_model_reads_builtin_entry() {
+        let mut config = AppConfig::default();
+        assert_eq!(builtin_provider_model(&config), "");
+        for provider in &mut config.inference.providers {
+            if provider.id == crate::config::defaults::PROVIDER_ID_BUILTIN {
+                provider.model = "o/r:w.gguf".to_string();
+            }
+        }
+        assert_eq!(builtin_provider_model(&config), "o/r:w.gguf");
+        // With the provider list emptied there is no builtin entry: empty string.
+        config.inference.providers.clear();
+        assert_eq!(builtin_provider_model(&config), "");
+    }
+
+    // ── Model library: system RAM probe ──────────────────────────────────────
+
+    #[test]
+    fn system_ram_bytes_returns_positive_on_real_hardware() {
+        assert_ne!(system_ram_bytes(), 0);
+    }
+
     #[tokio::test]
     async fn reconcile_keys_capabilities_by_provider() {
         // The same slug under two providers holds two distinct cache entries;
diff --git a/src-tauri/src/models/registry.rs b/src-tauri/src/models/registry.rs
index 293c915d..2c9e79c1 100644
--- a/src-tauri/src/models/registry.rs
+++ b/src-tauri/src/models/registry.rs
@@ -12,6 +12,7 @@
  * matches the pinned commit, not whatever `main` later points to.
  */
 
+use crate::config::defaults::HF_BASE_URL;
 use crate::models::download::DownloadSpec;
 use crate::models::manifest::InstalledModel;
 
@@ -143,15 +144,10 @@ pub fn ram_fit(est_runtime_gb: f64, ram_bytes: u64) -> RamFit {
     }
 }
 
-/// Download URL: `https://huggingface.co/<repo>/resolve/<revision>/<file>`.
+/// Download URL: `{HF_BASE_URL}/<repo>/resolve/<revision>/<file>`.
 /// One spec for the weights, plus one for the mmproj when present.
 pub fn download_specs(s: &Starter) -> Vec<DownloadSpec> {
-    let url = |file: &str| {
-        format!(
-            "https://huggingface.co/{}/resolve/{}/{}",
-            s.repo, s.revision, file
-        )
-    };
+    let url = |file: &str| format!("{}/{}/resolve/{}/{}", HF_BASE_URL, s.repo, s.revision, file);
     let mut specs = vec![DownloadSpec {
         url: url(s.file_name),
         file: s.file_name.to_string(),

From d2926326af9653cdc4bafaac8e224fb611ad79db Mon Sep 17 00:00:00 2001
From: Logan Nguyen <lg.131.dev@gmail.com>
Date: Wed, 10 Jun 2026 22:28:28 -0500
Subject: [PATCH 11/13] feat: bundle llama-server sidecar with pinned fetch
 script and CI cache

Signed-off-by: Logan Nguyen <lg.131.dev@gmail.com>
---
 .github/workflows/nightly-release.yml |   6 +
 .github/workflows/release-please.yml  |   6 +
 docs/release-process.md               |   6 +
 package.json                          |   7 +-
 scripts/ensure-llama-server.ts        | 188 ++++++++++++++++++++++++++
 src-tauri/tauri.conf.json             |  17 +++
 6 files changed, 227 insertions(+), 3 deletions(-)
 create mode 100644 scripts/ensure-llama-server.ts

diff --git a/.github/workflows/nightly-release.yml b/.github/workflows/nightly-release.yml
index 38568202..d8888862 100644
--- a/.github/workflows/nightly-release.yml
+++ b/.github/workflows/nightly-release.yml
@@ -94,6 +94,12 @@ jobs:
       - name: Run all tests with coverage enforcement
         run: bun run test:all:coverage
 
+      - name: Cache llama.cpp sidecar
+        uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830  # v4.3.0
+        with:
+          path: src-tauri/binaries
+          key: llama-cpp-${{ runner.os }}-b9590-b12cb8851ea60433
+
       - name: Build Tauri app
         # VITE_GIT_COMMIT_SHA is set here, not on a separate frontend step, because
         # tauri build runs beforeBuildCommand (bun run build:frontend) internally.
diff --git a/.github/workflows/release-please.yml b/.github/workflows/release-please.yml
index 215b60b3..f5eefe38 100644
--- a/.github/workflows/release-please.yml
+++ b/.github/workflows/release-please.yml
@@ -95,6 +95,12 @@ jobs:
       - name: Build frontend
         run: bun run build:frontend
 
+      - name: Cache llama.cpp sidecar
+        uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830  # v4.3.0
+        with:
+          path: src-tauri/binaries
+          key: llama-cpp-${{ runner.os }}-b9590-b12cb8851ea60433
+
       - name: Build Tauri app
         run: bun run build:backend
 
diff --git a/docs/release-process.md b/docs/release-process.md
index 9aee34c0..39cb67b5 100644
--- a/docs/release-process.md
+++ b/docs/release-process.md
@@ -30,6 +30,12 @@ A backup copy of both keys lives in the private `quiet-node/thuki-confidential`
 
 There is nothing to set up on your laptop. No env vars, no key files, no `.zshrc.local` overrides. New contributors clone the repo and start working.
 
+## Bundled inference engine
+
+Every build embeds llama.cpp's `llama-server` as a Tauri sidecar. The binary and the dylibs it links are fetched and verified by `scripts/ensure-llama-server.ts`, which pins an exact llama.cpp release tag and the sha256 of its macOS arm64 asset; a hash mismatch aborts the build. The script runs automatically in front of `dev`, `build:backend`, and `build:release`, and is an instant no-op once the pinned version is installed under `src-tauri/binaries/` (gitignored, never committed). CI caches that directory with a key derived from the pinned version and hash, so release builds only hit GitHub's release CDN when the pin changes. Because the script adds an `@loader_path/../Frameworks` rpath for bundle-time dylib resolution, it ad-hoc re-signs the binary and each dylib after the edit.
+
+Deferred: Developer ID re-signing, deep-signing of the nested dylibs, and notarization land as a release-please workflow step when the Apple Developer certificate exists.
+
 ## Cutting a release manually (rare)
 
 If for some reason a release must be cut outside of CI (incident response, rolling back a bad release-please commit, etc.), the procedure is:
diff --git a/package.json b/package.json
index 06c64ee2..1f0622a2 100644
--- a/package.json
+++ b/package.json
@@ -10,12 +10,13 @@
   "homepage": "https://www.thuki.app/",
   "type": "module",
   "scripts": {
-    "dev": "tauri dev",
+    "dev": "bun run engine:ensure && tauri dev",
     "frontend:dev": "vite",
     "generate:commands": "bun scripts/generate-commands.ts",
+    "engine:ensure": "bun scripts/ensure-llama-server.ts",
     "build:frontend": "tsc && vite build",
-    "build:backend": "tauri build --bundles app",
-    "build:release": "tauri build --bundles app -c \"{\\\"bundle\\\":{\\\"createUpdaterArtifacts\\\":true}}\"",
+    "build:backend": "bun run engine:ensure && tauri build --bundles app",
+    "build:release": "bun run engine:ensure && tauri build --bundles app -c \"{\\\"bundle\\\":{\\\"createUpdaterArtifacts\\\":true}}\"",
     "build:all": "bun run build:frontend && bun run build:backend",
     "preview": "vite preview",
     "tauri": "tauri",
diff --git a/scripts/ensure-llama-server.ts b/scripts/ensure-llama-server.ts
new file mode 100644
index 00000000..06c30764
--- /dev/null
+++ b/scripts/ensure-llama-server.ts
@@ -0,0 +1,188 @@
+// Fetches the pinned llama.cpp `llama-server` sidecar for the bundled
+// inference engine. Runs before every dev/build (see package.json); the
+// stamp file makes repeat runs an instant no-op. macOS arm64 only: other
+// platforms (Ubuntu CI lint/test jobs) exit 0 without fetching anything.
+
+import { spawnSync } from 'node:child_process';
+import { createHash } from 'node:crypto';
+import {
+  copyFile,
+  mkdir,
+  mkdtemp,
+  readdir,
+  readFile,
+  realpath,
+  rm,
+  stat,
+  writeFile,
+} from 'node:fs/promises';
+import { tmpdir } from 'node:os';
+import { basename, join, resolve } from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+const LLAMA_CPP_TAG = 'b9590';
+const ASSET = `llama-${LLAMA_CPP_TAG}-bin-macos-arm64.tar.gz`;
+const ASSET_SHA256 =
+  'b12cb8851ea60433e62522e43aa1dc9e820b4096b39d8c51e3cf7b1fba82102d';
+const DEST = 'src-tauri/binaries';
+const BIN = `${DEST}/llama-server-aarch64-apple-darwin`;
+const STAMP = `${DEST}/.llama-cpp-version`;
+
+const DOWNLOAD_URL = `https://github.com/ggml-org/llama.cpp/releases/download/${LLAMA_CPP_TAG}/${ASSET}`;
+const STAMP_CONTENT = `${LLAMA_CPP_TAG} ${ASSET_SHA256}`;
+
+const repoRoot = fileURLToPath(new URL('../', import.meta.url));
+const destDir = resolve(repoRoot, DEST);
+const binPath = resolve(repoRoot, BIN);
+const stampPath = resolve(repoRoot, STAMP);
+
+// Aborts via throw (not process.exit) so the temp-dir `finally` still runs.
+function fail(message: string): never {
+  throw new Error(`ensure-llama-server: ${message}`);
+}
+
+// Runs `command` synchronously and returns its stdout; a launch failure or
+// a non-zero exit status is routed to fail().
+function run(command: string, args: string[]): string {
+  const { error, status, stderr, stdout } = spawnSync(command, args, {
+    encoding: 'utf8',
+  });
+  if (error) fail(`${command} failed to start: ${error.message}`);
+  if (status !== 0) fail(`${command} ${args.join(' ')} exited ${status}:\n${stderr}`);
+  return stdout;
+}
+
+// Reports whether `path` can be stat'ed; any stat error counts as absent.
+async function exists(path: string): Promise<boolean> {
+  return stat(path)
+    .then(() => true)
+    .catch(() => false);
+}
+
+// Returns the `lib*.dylib` names that `otool -L` lists under `@rpath/` for
+// the given Mach-O file, in output order (duplicates are not removed).
+function rpathDeps(machoPath: string): string[] {
+  // One capture group: the dylib file name that follows `@rpath/`.
+  const rpathEntry = /^\s+@rpath\/(lib[^ ]+\.dylib)/;
+  const listing = run('otool', ['-L', machoPath]);
+  return listing
+    .split('\n')
+    .map((line) => rpathEntry.exec(line))
+    .filter((hit): hit is RegExpExecArray => hit !== null)
+    .map((hit) => hit[1]);
+}
+
+if (process.platform !== 'darwin' || process.arch !== 'arm64') {
+  console.log(
+    `ensure-llama-server: skipping on ${process.platform}/${process.arch} (sidecar is macOS arm64 only)`,
+  );
+  process.exit(0);
+}
+
+// Fast path: pinned version already installed.
+if (await exists(binPath)) {
+  const stamp = await readFile(stampPath, 'utf8').catch(() => '');
+  if (stamp.trim() === STAMP_CONTENT) {
+    process.exit(0);
+  }
+}
+
+console.log(`ensure-llama-server: fetching llama.cpp ${LLAMA_CPP_TAG}...`);
+const workDir = await mkdtemp(join(tmpdir(), 'thuki-llama-'));
+try {
+  // Download and verify against the pinned hash before touching anything.
+  const response = await fetch(DOWNLOAD_URL);
+  if (!response.ok) {
+    fail(`download failed: HTTP ${response.status} for ${DOWNLOAD_URL}`);
+  }
+  const archive = Buffer.from(await response.arrayBuffer());
+  const actualSha256 = createHash('sha256').update(archive).digest('hex');
+  if (actualSha256 !== ASSET_SHA256) {
+    fail(
+      `sha256 mismatch for ${ASSET}\n  expected: ${ASSET_SHA256}\n  actual:   ${actualSha256}\nRefusing to install. The release asset may have been tampered with or the pin is stale.`,
+    );
+  }
+
+  const archivePath = join(workDir, ASSET);
+  await writeFile(archivePath, archive);
+  run('tar', ['-xzf', archivePath, '-C', workDir]);
+
+  const extractedDir = join(workDir, `llama-${LLAMA_CPP_TAG}`);
+  const serverPath = join(extractedDir, 'llama-server');
+  if (!(await exists(serverPath))) {
+    fail(
+      `archive layout unexpected: ${extractedDir}/llama-server not found after extraction`,
+    );
+  }
+
+  // Index every dylib in the archive by name (recursively, in case the
+  // layout ever moves them into a lib/ subdirectory).
+  const dylibByName = new Map<string, string>();
+  async function indexDylibs(dir: string): Promise<void> {
+    for (const entry of await readdir(dir, { withFileTypes: true })) {
+      const path = join(dir, entry.name);
+      if (entry.isDirectory()) {
+        await indexDylibs(path);
+      } else if (/^lib.+\.dylib$/.test(entry.name)) {
+        dylibByName.set(entry.name, path);
+      }
+    }
+  }
+  await indexDylibs(extractedDir);
+
+  // Walk the @rpath link closure starting from llama-server so we copy
+  // exactly the dylibs it needs and skip other tools' impl dylibs.
+  const needed = new Set<string>();
+  const queue = rpathDeps(serverPath);
+  while (queue.length > 0) {
+    const name = queue.shift() as string;
+    if (needed.has(name)) {
+      continue;
+    }
+    const path = dylibByName.get(name);
+    if (path === undefined) {
+      fail(`llama-server links @rpath/${name} but the archive does not contain it`);
+    }
+    needed.add(name);
+    queue.push(...rpathDeps(path));
+  }
+
+  await mkdir(destDir, { recursive: true });
+  await copyFile(serverPath, binPath);
+  const installedDylibs: string[] = [];
+  for (const name of [...needed].sort()) {
+    const target = join(destDir, name);
+    // Dereference symlinks: versioned dylib names may be links to the
+    // real file, and the bundle needs regular files.
+    await copyFile(await realpath(dylibByName.get(name) as string), target);
+    installedDylibs.push(target);
+  }
+
+  // At bundle time the sidecar lands in Contents/MacOS while the dylibs go
+  // to Contents/Frameworks; in dev they sit next to the binary, which the
+  // archive's existing @loader_path rpath already covers.
+  const rpathResult = spawnSync(
+    'install_name_tool',
+    ['-add_rpath', '@loader_path/../Frameworks', binPath],
+    { encoding: 'utf8' },
+  );
+  // If the tool could not be spawned, `error` is set and stderr may be
+  // null, so build the diagnostic text defensively before inspecting it.
+  const rpathErr = rpathResult.error?.message ?? rpathResult.stderr ?? '';
+  if (rpathResult.status !== 0 && !rpathErr.includes('would duplicate path')) {
+    fail(`install_name_tool failed:\n${rpathErr}`);
+  }
+
+  // The rpath edit invalidates the ad-hoc linker signature; re-sign
+  // everything we installed so macOS will execute it.
+  for (const path of [binPath, ...installedDylibs]) {
+    run('codesign', ['--force', '-s', '-', path]);
+  }
+
+  await writeFile(stampPath, `${STAMP_CONTENT}\n`, 'utf8');
+  console.log(
+    `ensure-llama-server: installed llama-server ${LLAMA_CPP_TAG} and ${installedDylibs.length} dylibs into ${DEST}`,
+  );
+} finally {
+  await rm(workDir, { recursive: true, force: true });
+}
diff --git a/src-tauri/tauri.conf.json b/src-tauri/tauri.conf.json
index 6fe476f3..e6eea3e8 100644
--- a/src-tauri/tauri.conf.json
+++ b/src-tauri/tauri.conf.json
@@ -81,6 +81,23 @@
     "active": true,
     "targets": "all",
     "homepage": "https://www.thuki.app/",
+    "externalBin": [
+      "binaries/llama-server"
+    ],
+    "macOS": {
+      "frameworks": [
+        "binaries/libggml-base.0.dylib",
+        "binaries/libggml-blas.0.dylib",
+        "binaries/libggml-cpu.0.dylib",
+        "binaries/libggml-metal.0.dylib",
+        "binaries/libggml-rpc.0.dylib",
+        "binaries/libggml.0.dylib",
+        "binaries/libllama-common.0.dylib",
+        "binaries/libllama-server-impl.dylib",
+        "binaries/libllama.0.dylib",
+        "binaries/libmtmd.0.dylib"
+      ]
+    },
     "icon": [
       "icons/32x32.png",
       "icons/128x128.png",

From bae191f304b4fa1e2723af6ff8c063a8f8fd4214 Mon Sep 17 00:00:00 2001
From: Logan Nguyen <lg.131.dev@gmail.com>
Date: Sat, 13 Jun 2026 18:45:05 -0400
Subject: [PATCH 12/13] docs: explain the llama.cpp version pin and how to bump
 it

Signed-off-by: Logan Nguyen <lg.131.dev@gmail.com>
---
 docs/release-process.md        | 14 ++++++++++++++
 scripts/ensure-llama-server.ts |  7 +++++++
 2 files changed, 21 insertions(+)

diff --git a/docs/release-process.md b/docs/release-process.md
index 39cb67b5..cfde23bf 100644
--- a/docs/release-process.md
+++ b/docs/release-process.md
@@ -36,6 +36,20 @@ Every build embeds llama.cpp's `llama-server` as a Tauri sidecar. The binary and
 
 Deferred: Developer ID re-signing, deep-signing of the nested dylibs, and notarization land as a release-please workflow step when the Apple Developer certificate exists.
 
+### Bumping the pinned llama.cpp version
+
+The pin in `scripts/ensure-llama-server.ts` is two constants. `LLAMA_CPP_TAG` names a published llama.cpp release (for example `b9590`, listed at https://github.com/ggml-org/llama.cpp/releases), and `ASSET_SHA256` is the sha256 of that release's `llama-<tag>-bin-macos-arm64.tar.gz` asset. This is a release pin, not a git commit: llama.cpp's `main` branch moving forward does not affect a pinned build, and a newer release does not make the current one stop working. The pin is updated only when we deliberately adopt a newer engine.
+
+There is no automatic bump, and that is intentional: a new engine version has to clear the manual checks below on real hardware before it ships. Upgrade when there is a concrete reason: a newer model architecture we want to load, a `llama-server` bug or security fix, or a Metal/performance improvement. Otherwise the existing pin keeps working indefinitely.
+
+To bump:
+
+1. Pick the target release on https://github.com/ggml-org/llama.cpp/releases and set `LLAMA_CPP_TAG` to its tag.
+2. Set `ASSET_SHA256` to the macOS arm64 asset's hash. Read it from the GitHub Releases API (the asset's `digest` field) or compute it locally with `shasum -a 256 llama-<tag>-bin-macos-arm64.tar.gz`.
+3. Run `bun run engine:ensure`. It fetches the new asset, verifies the new hash, and re-derives the dylib link closure from the new `llama-server`, aborting only if a linked dylib is missing from the archive. The script does not compare that closure with `bundle.macOS.frameworks` in `src-tauri/tauri.conf.json`, so if the new release adds, renames, or drops a dylib, update that list by hand to match the dylibs the new `llama-server` links.
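+4. Bump the `llama-cpp-…` cache key (it embeds the tag and a hash prefix) in every workflow under `.github/workflows/` that caches `src-tauri/binaries`, so the new asset is not served stale from the old cache.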
+5. Re-run the binary-dependent checks on a real machine: the sidecar spawns and streams a response, and `codesign -vv` is clean on the `llama-server` binary and every bundled dylib.
+
 ## Cutting a release manually (rare)
 
 If for some reason a release must be cut outside of CI (incident response, rolling back a bad release-please commit, etc.), the procedure is:
diff --git a/scripts/ensure-llama-server.ts b/scripts/ensure-llama-server.ts
index 06c30764..d3fb14f5 100644
--- a/scripts/ensure-llama-server.ts
+++ b/scripts/ensure-llama-server.ts
@@ -20,6 +20,13 @@ import { tmpdir } from 'node:os';
 import { basename, join, resolve } from 'node:path';
 import { fileURLToPath } from 'node:url';
 
+// The pin: LLAMA_CPP_TAG names a published llama.cpp *release* (browse them at
+// https://github.com/ggml-org/llama.cpp/releases), and ASSET_SHA256 is the
+// sha256 of that release's macOS arm64 asset. This is a release pin, not a git
+// commit: llama.cpp's main branch moving on does not change what a build
+// produces, and the pin is bumped only when we deliberately adopt a newer
+// engine. For when and how to bump, see "Bumping the pinned llama.cpp version"
+// in docs/release-process.md.
 const LLAMA_CPP_TAG = 'b9590';
 const ASSET = `llama-${LLAMA_CPP_TAG}-bin-macos-arm64.tar.gz`;
 const ASSET_SHA256 =

From dee074c677b61bcc6e11ee48abe80851e7b15306 Mon Sep 17 00:00:00 2001
From: Logan Nguyen <lg.131.dev@gmail.com>
Date: Sat, 13 Jun 2026 18:45:05 -0400
Subject: [PATCH 13/13] ci: fetch the llama-server sidecar in PR backend and
 build jobs

Signed-off-by: Logan Nguyen <lg.131.dev@gmail.com>
---
 .github/workflows/pr-backend-tests.yml    | 14 ++++++++++++++
 .github/workflows/pr-build-validation.yml |  9 +++++++++
 2 files changed, 23 insertions(+)

diff --git a/.github/workflows/pr-backend-tests.yml b/.github/workflows/pr-backend-tests.yml
index ea66d0a2..4cf2c178 100644
--- a/.github/workflows/pr-backend-tests.yml
+++ b/.github/workflows/pr-backend-tests.yml
@@ -27,6 +27,20 @@ jobs:
         with:
           tool: cargo-llvm-cov
 
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@0c5077e51419868618aeaa5fe8019c62421857d6  # v2.2.0
+        with:
+          bun-version: 1.3.11
+
+      - name: Cache llama.cpp sidecar
+        uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830  # v4.3.0
+        with:
+          path: src-tauri/binaries
+          key: llama-cpp-${{ runner.os }}-b9590-b12cb8851ea60433
+
+      - name: Fetch llama-server sidecar
+        run: bun run engine:ensure
+
       - name: Run backend tests with coverage
         working-directory: src-tauri
         run: |
diff --git a/.github/workflows/pr-build-validation.yml b/.github/workflows/pr-build-validation.yml
index ee7fe58d..4a1d4a8a 100644
--- a/.github/workflows/pr-build-validation.yml
+++ b/.github/workflows/pr-build-validation.yml
@@ -27,6 +27,15 @@ jobs:
       - name: Install dependencies
         run: bun install --frozen-lockfile
 
+      - name: Cache llama.cpp sidecar
+        uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830  # v4.3.0
+        with:
+          path: src-tauri/binaries
+          key: llama-cpp-${{ runner.os }}-b9590-b12cb8851ea60433
+
+      - name: Fetch llama-server sidecar
+        run: bun run engine:ensure
+
       - name: Security vulnerability scan
         run: |
           AUDIT=$(bun audit 2>&1 || true)