From 5ed3635b73860c0658440c2d06f19a25306e0428 Mon Sep 17 00:00:00 2001
From: Logan Nguyen <lg.131.dev@gmail.com>
Date: Thu, 18 Jun 2026 17:41:03 -0500
Subject: [PATCH 1/4] refactor: unify the two model-residency knobs onto
 keep_warm_inactivity_minutes

Signed-off-by: Logan Nguyen <lg.131.dev@gmail.com>
---
 CLAUDE.md                                   |   2 +-
 docs/configurations.md                      |  19 +-
 docs/tuning-context-window.md               |   2 +-
 src-tauri/src/config/defaults.rs            |  38 ++--
 src-tauri/src/config/loader.rs              |  22 +--
 src-tauri/src/config/schema.rs              |  27 ++-
 src-tauri/src/config/tests.rs               |  65 ++-----
 src-tauri/src/engine/runner.rs              |   2 +-
 src-tauri/src/lib.rs                        |  14 +-
 src-tauri/src/settings_commands.rs          |  65 ++++---
 src-tauri/src/settings_commands/tests.rs    |  43 +++--
 src-tauri/src/warmup.rs                     |  56 ++++++
 src/settings/SettingsWindow.test.tsx        |   1 -
 src/settings/components/SaveField.test.tsx  |   1 -
 src/settings/configHelpers.ts               |   4 +-
 src/settings/hooks/useConfigSync.test.ts    |   1 -
 src/settings/hooks/useDebouncedSave.test.ts |   1 -
 src/settings/tabs/ModelTab.tsx              | 203 ++++++++------------
 src/settings/tabs/ProviderCards.test.tsx    |   1 -
 src/settings/tabs/tabs.test.tsx             |  51 ++---
 src/settings/types.ts                       |   1 -
 21 files changed, 294 insertions(+), 325 deletions(-)
diff --git a/CLAUDE.md b/CLAUDE.md
index 8c6d9c0b..22210844 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -69,7 +69,7 @@ User-facing reference for all commands lives in `docs/commands.md`. **Any new sl
 ### Backend (`src-tauri/src/`)
 
 - **`lib.rs`**: app setup: loads `AppConfig` via `config::load`, converts window to NSPanel (fullscreen overlay), registers tray, spawns hotkey listener, spawns the engine runner actor, intercepts close events (hides instead of quits), and on `RunEvent::Exit` kills the engine sidecar and awaits its confirmed exit so no orphan `llama-server` survives quit
-- **`config/`**: typed TOML-backed application configuration. Loaded once at startup from `~/Library/Application Support/com.quietnode.thuki/config.toml` (seeded with defaults on first run), installed as Tauri managed state, exposed to the frontend via the `get_config` command. Every subsystem that needs model, prompt, window, activation, or quote values reads from `State<AppConfig>`. The `[inference]` section holds `active_provider`, `num_ctx`, `keep_warm_inactivity_minutes` (Ollama only), `idle_unload_minutes` (built-in engine only), and the typed providers list (`[[inference.providers]]`, each `{id, kind, label, base_url, model, vision}`; `kind` is `builtin`, `ollama`, or `openai`, anything else is dropped on load). Fresh installs default `active_provider` to `builtin`; the loader pins any pre-providers config (no `[[inference.providers]]` array) to `ollama`, because no working built-in provider existed when that file was written. The loader also migrates a legacy flat `ollama_url` onto a synthesized Ollama provider, and `config/migrate.rs` folds the legacy SQLite `active_model` onto the active provider when it is Ollama-kind. See `docs/configurations.md` for the user-facing schema.
+- **`config/`**: typed TOML-backed application configuration. Loaded once at startup from `~/Library/Application Support/com.quietnode.thuki/config.toml` (seeded with defaults on first run), installed as Tauri managed state, exposed to the frontend via the `get_config` command. Every subsystem that needs model, prompt, window, activation, or quote values reads from `State<AppConfig>`. The `[inference]` section holds `active_provider`, `num_ctx`, `keep_warm_inactivity_minutes` (unified residency knob governing both local providers: the built-in engine's idle-unload timer and Ollama's `keep_alive`; not applicable to OpenAI), and the typed providers list (`[[inference.providers]]`, each `{id, kind, label, base_url, model, vision}`; `kind` is `builtin`, `ollama`, or `openai`, anything else is dropped on load). Fresh installs default `active_provider` to `builtin`; the loader pins any pre-providers config (no `[[inference.providers]]` array) to `ollama`, because no working built-in provider existed when that file was written. The loader also migrates a legacy flat `ollama_url` onto a synthesized Ollama provider, and `config/migrate.rs` folds the legacy SQLite `active_model` onto the active provider when it is Ollama-kind. See `docs/configurations.md` for the user-facing schema.
 - **`commands.rs`**: `ask_model` Tauri command: routes by the active provider's kind. `builtin` resolves the installed model from the manifest, ensures the sidecar is loaded via the engine runner, and streams OpenAI-compatible `/v1/chat/completions` SSE through `openai.rs` (`V1Flavor::Builtin`); `ollama` streams the native `/api/chat` newline-delimited JSON; `openai` streams `/v1` SSE against the provider's `base_url` (`V1Flavor::Remote`). All paths emit the same `StreamChunk` contract via Tauri Channel and read the active provider, the resolved system prompt, and the in-memory `ActiveModelState` from managed state.
 - **`keychain.rs`**: write-only storage for `openai`-provider API keys in the macOS Keychain via the `keyring` crate. The Keychain is the only place keys ever live: they are never written to the TOML config and never returned to the frontend (only existence is queryable via `has_provider_api_key`); the `SecretStore` trait decouples callers from the real Keychain for tests.
 - **`screenshot.rs`** — `capture_full_screen_command` Tauri command: uses CoreGraphics FFI (`CGWindowListCreateImage`) to capture all displays excluding Thuki's own windows, writes a JPEG to a temp dir, and returns the path
diff --git a/docs/configurations.md b/docs/configurations.md
index 756b3bde..8fcca6f5 100644
--- a/docs/configurations.md
+++ b/docs/configurations.md
@@ -38,14 +38,13 @@ active_provider = "builtin"
 # system prompt are reused. Raise to fit longer conversations; lower to reduce
 # GPU memory use. Valid range: 2048-1048576.
 num_ctx = 16384
-# Minutes of inactivity before Thuki tells Ollama to release the model.
-# 0 = let Ollama manage (its own 5-minute default applies).
-# -1 = never release. Applies to the Ollama provider only.
+# Minutes of inactivity before Thuki releases the active model from memory.
+# Applies to both local providers (built-in engine and Ollama); not applicable
+# to a remote OpenAI-compatible server.
+# 0 = use the provider's natural short default (~5 min): Ollama defers to its
+#     own timer, the built-in engine applies its own ~5-minute timer.
+# -1 = keep resident forever. Valid range: -1 or 0-1440.
 keep_warm_inactivity_minutes = 0
-# Minutes of inactivity before Thuki stops the built-in engine to free RAM.
-# 0 keeps the model loaded indefinitely for instant first tokens (default).
-# Applies to the built-in engine only. Valid range: 0-1440.
-idle_unload_minutes = 0
 
 # One block per provider. The built-in entry is always present. A provider's
 # selected model lives on its own `model` field (empty until you pick one in
@@ -149,8 +148,7 @@ Upgrading from an older version is automatic: a pre-providers config with a flat
 | :---------------- | :--------- | :------- | :------------------ | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `active_provider` | `"builtin"` | Yes      | id of a provider    | Which provider receives inference. Must match the `id` of one of the `[[inference.providers]]` entries; an empty or dangling value resets to `builtin`. Exception: a config that predates the providers list is pinned to `ollama` on load, because no working built-in provider existed when that file was written.                                                                                                                                                                                                                                                                                              |
 | `num_ctx`         | `16384`    | Yes      | `[2048, 1048576]`   | Context window size in tokens sent to the active provider with every request. For the built-in engine, the value becomes `--ctx-size` when the `llama-server` process starts, so changing it restarts the engine. For Ollama, warmup and chat share this value so the same runner instance and its cached KV prefix for the system prompt are reused: they must match or Ollama creates a second runner and the warmup saves nothing. Ollama silently clamps this to the model's physical maximum. For OpenAI-compatible providers the value is informational only; the server controls the actual context. Raise to fit longer conversations: each doubling roughly doubles VRAM for the KV cache; lower to reclaim GPU memory. See [Tuning the Context Window](./tuning-context-window.md). |
-| `keep_warm_inactivity_minutes` | `0` | Yes | `-1` or `[0, 1440]` | Minutes of inactivity before Thuki tells Ollama to release the model from VRAM. Applies to the Ollama provider only. `0` means do not manage: Ollama's own 5-minute default applies. `-1` means never release. Raise for longer sessions between uses; lower to reclaim VRAM sooner.                                                                                                                                                                                                                                            |
-| `idle_unload_minutes`          | `0` | Yes | `[0, 1440]`         | Minutes of inactivity before Thuki stops the built-in engine to free RAM. Applies to the built-in engine only; the Ollama provider uses `keep_warm_inactivity_minutes` instead. `0` keeps the model loaded indefinitely so the first token after a pause stays instant. Raise to free RAM on an idle Mac; keep `0` for instant first tokens.                                                                                                                                                                                   |
+| `keep_warm_inactivity_minutes` | `0` | Yes | `-1` or `[0, 1440]` | Minutes of inactivity before Thuki releases the active model from memory. Governs both local providers: the built-in engine stops its sidecar to free RAM, and Ollama is told to release the model from VRAM. Not applicable to a remote OpenAI-compatible server, whose residency Thuki does not manage. `0` uses the provider's natural short default (about 5 minutes): Ollama defers to its own timer, the built-in engine applies its own ~5-minute timer (`DEFAULT_BUILTIN_IDLE_MINUTES`). `-1` keeps the model resident forever. Raise for longer sessions between uses; lower to reclaim memory sooner. |
 
 Each `[[inference.providers]]` block has these fields:
 
@@ -179,7 +177,8 @@ The table below also lists the baked-in safety limits that govern Thuki's commun
 | `VRAM_POLL_INTERVAL_SECS`                   | `5 s`    | No       | Tuning this trades responsiveness against localhost polling load; 5 s is the sweet spot for loopback calls and matches Ollama's internal TTL resolution granularity. | —      | How often Thuki polls Ollama's `/api/ps` to detect VRAM changes made outside Thuki (for example, running `ollama stop` or a TTL expiry). The Settings panel VRAM indicator reflects these changes within one interval. |
 | `ENGINE_HEALTH_DEADLINE_SECS`               | `300 s`  | No       | Engine lifecycle contract: this bounds the worst-case "warming up" wait the UI can show before a start is declared failed, so changing it alters the UX contract rather than tuning a preference. | —      | How long Thuki waits for a freshly spawned built-in engine to pass its `/health` check before giving up and killing the process. Large GGUF models loading from a cold disk can legitimately take minutes, so the deadline is generous. |
 | `ENGINE_HEALTH_POLL_INTERVAL_MS`            | `250 ms` | No       | Pure loopback-load tuning: 250 ms detects readiness promptly without hammering the local server while it is busy loading the model.                                  | —      | How often Thuki probes the built-in engine's `/health` endpoint while it starts up. A `503` answer means the model is still loading and the poll continues; `200` means ready.       |
-| `ENGINE_IDLE_CHECK_INTERVAL_SECS`           | `30 s`   | No       | Internal timer granularity behind the user-facing `idle_unload_minutes` knob; 30 s keeps the unload within a minute-scale setting's precision at negligible cost.    | —      | How often the engine runner checks whether `idle_unload_minutes` of inactivity have elapsed and the built-in engine should be stopped to free RAM.                                   |
+| `ENGINE_IDLE_CHECK_INTERVAL_SECS`           | `30 s`   | No       | Internal timer granularity behind the user-facing `keep_warm_inactivity_minutes` knob; 30 s keeps the unload within a minute-scale setting's precision at negligible cost.    | —      | How often the engine runner checks whether the configured idle window has elapsed and the built-in engine should be stopped to free RAM.                                   |
+| `DEFAULT_BUILTIN_IDLE_MINUTES`              | `5 min`  | No       | The fixed translation of the `keep_warm_inactivity_minutes = 0` sentinel for the built-in engine, not a separate preference. The built-in engine has no external daemon to defer to, so `0` ("use the provider's natural short default") resolves to this value. Users who want a different timeout set `keep_warm_inactivity_minutes` directly (`N` minutes, or `-1` for forever). | —      | The idle window the built-in engine applies when `keep_warm_inactivity_minutes` is `0`. After this many minutes of inactivity the sidecar is stopped to free RAM. |
 | `ENGINE_HEALTH_PROBE_TIMEOUT_SECS`          | `5 s`    | No       | Internal lifecycle contract between the runner and the engine process. A wedged-but-connected server must not park the poll loop forever; loopback probes are normally instant so 5 s is generous. The poll interval and deadline are the user-facing knobs. | —      | How long a single `/health` GET is allowed to take inside the startup poll loop. If the engine has accepted the TCP connection but stopped responding, this timeout causes the probe to return an error (treated as Wait and retried after `ENGINE_HEALTH_POLL_INTERVAL_MS`). |
 | `ENGINE_COMMAND_QUEUE_CAPACITY`             | `64`     | No       | Bounds memory under command bursts; 64 slots is ample for all UI-driven traffic (Ensure, Touch, SetIdleMinutes, Shutdown) under any realistic usage pattern. | —      | Capacity of the bounded `mpsc` channel that carries commands from `EngineHandle` to the runner actor task. Back-pressure from a full queue is not observable in normal use. |
 | `DOWNLOAD_PROGRESS_MIN_INTERVAL_MS`         | `500 ms` | No       | Pure IPC hygiene: a fast local connection can deliver thousands of chunks per second and the UI only needs a few updates per second, so throttling below the UI refresh rate is invisible to the user. | —      | Minimum interval between `Progress` events emitted while a model file downloads. An update is also emitted whenever at least 1% of the file has arrived since the last one, whichever comes first, and a final 100% update always precedes verification. |
diff --git a/docs/tuning-context-window.md b/docs/tuning-context-window.md
index a40ac0ab..b993fda1 100644
--- a/docs/tuning-context-window.md
+++ b/docs/tuning-context-window.md
@@ -13,7 +13,7 @@ The Context Window value (`num_ctx`) is sent to whichever provider is active:
 - **Built-in engine (the default):** the value is passed to the bundled `llama-server` process as `--ctx-size` when it starts. The context size is fixed for the lifetime of the process, so changing it in Settings restarts the engine (a model reload, a few seconds). The three signals below and the Activity Monitor steps apply unchanged; the `ollama ps` steps do not, so watch Memory Pressure and GPU History instead.
 - **Ollama provider:** everything in this guide applies as written, including the `ollama ps` checks.
 
-The Keep Warm knob is Ollama-only. The built-in engine's counterpart is `idle_unload_minutes` (Settings, or `[inference]` in `config.toml`): minutes of inactivity before Thuki stops the engine to free memory, with `0` meaning keep it loaded indefinitely.
+The Keep Warm knob (`keep_warm_inactivity_minutes`, in Settings or `[inference]` in `config.toml`) governs both local providers: minutes of inactivity before Thuki releases the active model from memory. For the built-in engine it stops the `llama-server` sidecar; for Ollama it sets the `keep_alive`. `0` uses the provider's natural short default (about 5 minutes), and `-1` keeps the model resident forever.
 
 ## Quick vocabulary
 
diff --git a/src-tauri/src/config/defaults.rs b/src-tauri/src/config/defaults.rs
index cedc6301..7c5b517b 100644
--- a/src-tauri/src/config/defaults.rs
+++ b/src-tauri/src/config/defaults.rs
@@ -41,9 +41,14 @@ pub const DEFAULT_OPENAI_LABEL: &str = "OpenAI-compatible";
 /// default) are never rewritten; only fresh or dangling pointers land here.
 pub const DEFAULT_ACTIVE_PROVIDER: &str = PROVIDER_ID_BUILTIN;
 
-/// Default inactivity window before Thuki tells Ollama to release the model.
-/// 0 means do not manage: Ollama's own 5-minute default applies.
-/// -1 means keep indefinitely. Positive values are minutes (1..=1440).
+/// Default inactivity window before Thuki releases the active model from local
+/// memory. Unified across both local providers (built-in engine and Ollama);
+/// not applicable to a remote OpenAI-compatible server, whose residency Thuki
+/// does not manage.
+/// 0 means use the provider's natural short default (~5 min): Ollama defers to
+/// its own 5-minute timer, the built-in engine applies its own ~5-minute timer
+/// (see `DEFAULT_BUILTIN_IDLE_MINUTES`).
+/// -1 means keep resident indefinitely. Positive values are minutes (1..=1440).
 pub const DEFAULT_KEEP_WARM_INACTIVITY_MINUTES: i32 = 0;
 
 /// Ollama context window size (tokens) sent with every /api/chat request.
@@ -62,18 +67,20 @@ pub const DEFAULT_NUM_CTX: u32 = 16384;
 pub const BOUNDS_NUM_CTX: (u32, u32) = (2048, 1_048_576);
 
 /// Accepted range for `keep_warm_inactivity_minutes`.
-/// -1 = never release, 0 = disabled (Ollama default), 1..=1440 = explicit timeout.
-/// Values below -1 or above 1440 are clamped to the compiled default.
+/// -1 = keep resident forever, 0 = provider's natural short default (~5 min),
+/// 1..=1440 = explicit timeout. Values below -1 or above 1440 are clamped to
+/// the compiled default.
 pub const BOUNDS_KEEP_WARM_INACTIVITY_MINUTES: (i32, i32) = (-1, 1440);
 
-/// Minutes of inactivity before Thuki stops the built-in engine to free RAM.
-/// 0 disables auto-unload: the model stays loaded and the first token stays
-/// instant (the default). Positive values free RAM after N idle minutes at
-/// the cost of a cold reload on the next message. Applies to the built-in engine only; the
-/// Ollama provider keeps `keep_warm_inactivity_minutes` (note the different
-/// meaning of 0 there: "use Ollama's own default").
-pub const DEFAULT_IDLE_UNLOAD_MINUTES: u32 = 0;
-pub const BOUNDS_IDLE_UNLOAD_MINUTES: (u32, u32) = (0, 1440);
+/// The built-in engine's idle-unload timer (minutes) for the unified
+/// `keep_warm_inactivity_minutes = 0` sentinel. The built-in engine has no
+/// external daemon to defer to, so `0` ("use the provider's natural short
+/// default") resolves to this fixed ~5-minute timer. Baked in, not tunable:
+/// it is the fixed translation of one sentinel value, not a preference; users
+/// who want a different timeout set `keep_warm_inactivity_minutes` directly
+/// (`N` minutes or `-1` for forever). `warmup::builtin_idle_minutes` maps the
+/// sentinel onto the runner's `idle_minutes` convention.
+pub const DEFAULT_BUILTIN_IDLE_MINUTES: u32 = 5;
 
 // Built-in engine lifecycle constants: baked in because they define the
 // engine runner's startup and idle-check contract, not a user preference.
@@ -101,8 +108,8 @@ pub const ENGINE_HEALTH_PROBE_TIMEOUT_SECS: u64 = 5;
 
 /// Interval (seconds) between idle-unload checks in the engine runner. Not
 /// user-tunable: internal timer granularity behind the user-facing
-/// `idle_unload_minutes` knob; 30 s keeps the unload within a minute-scale
-/// setting's precision at negligible cost.
+/// `keep_warm_inactivity_minutes` knob; 30 s keeps the unload within a
+/// minute-scale setting's precision at negligible cost.
 pub const ENGINE_IDLE_CHECK_INTERVAL_SECS: u64 = 30;
 
 /// Capacity of the engine runner command queue. Not user-tunable: bounds
@@ -418,7 +425,6 @@ pub const ALLOWED_FIELDS: &[(&str, &str)] = &[
     // [inference] — active_provider and the providers array are not flat fields;
     // they are written via set_active_model / set_ollama_url, not set_config_field.
     ("inference", "keep_warm_inactivity_minutes"),
-    ("inference", "idle_unload_minutes"),
     ("inference", "num_ctx"),
     // [prompt]
     ("prompt", "system"),
diff --git a/src-tauri/src/config/loader.rs b/src-tauri/src/config/loader.rs
index ba77fcc7..75dd63f6 100644
--- a/src-tauri/src/config/loader.rs
+++ b/src-tauri/src/config/loader.rs
@@ -22,15 +22,15 @@ use std::path::{Path, PathBuf};
 use std::time::{SystemTime, UNIX_EPOCH};
 
 use super::defaults::{
-    ALLOWED_FONT_WEIGHTS, BOUNDS_IDLE_UNLOAD_MINUTES, BOUNDS_KEEP_WARM_INACTIVITY_MINUTES,
-    BOUNDS_MAX_CHAT_HEIGHT, BOUNDS_MAX_IMAGES, BOUNDS_MAX_ITERATIONS, BOUNDS_NUM_CTX,
-    BOUNDS_OVERLAY_WIDTH, BOUNDS_PIPELINE_WALL_CLOCK_BUDGET_S, BOUNDS_QUOTE_MAX_CONTEXT_LENGTH,
+    ALLOWED_FONT_WEIGHTS, BOUNDS_KEEP_WARM_INACTIVITY_MINUTES, BOUNDS_MAX_CHAT_HEIGHT,
+    BOUNDS_MAX_IMAGES, BOUNDS_MAX_ITERATIONS, BOUNDS_NUM_CTX, BOUNDS_OVERLAY_WIDTH,
+    BOUNDS_PIPELINE_WALL_CLOCK_BUDGET_S, BOUNDS_QUOTE_MAX_CONTEXT_LENGTH,
     BOUNDS_QUOTE_MAX_DISPLAY_CHARS, BOUNDS_QUOTE_MAX_DISPLAY_LINES, BOUNDS_SEARXNG_MAX_RESULTS,
     BOUNDS_TEXT_BASE_PX, BOUNDS_TEXT_LETTER_SPACING_PX, BOUNDS_TEXT_LINE_HEIGHT, BOUNDS_TIMEOUT_S,
-    BOUNDS_TOP_K_URLS, BOUNDS_UPDATER_CHECK_INTERVAL_HOURS, DEFAULT_IDLE_UNLOAD_MINUTES,
-    DEFAULT_JUDGE_TIMEOUT_S, DEFAULT_KEEP_WARM_INACTIVITY_MINUTES, DEFAULT_MAX_CHAT_HEIGHT,
-    DEFAULT_MAX_IMAGES, DEFAULT_MAX_ITERATIONS, DEFAULT_NUM_CTX, DEFAULT_OLLAMA_URL,
-    DEFAULT_OVERLAY_WIDTH, DEFAULT_PIPELINE_WALL_CLOCK_BUDGET_S, DEFAULT_QUOTE_MAX_CONTEXT_LENGTH,
+    BOUNDS_TOP_K_URLS, BOUNDS_UPDATER_CHECK_INTERVAL_HOURS, DEFAULT_JUDGE_TIMEOUT_S,
+    DEFAULT_KEEP_WARM_INACTIVITY_MINUTES, DEFAULT_MAX_CHAT_HEIGHT, DEFAULT_MAX_IMAGES,
+    DEFAULT_MAX_ITERATIONS, DEFAULT_NUM_CTX, DEFAULT_OLLAMA_URL, DEFAULT_OVERLAY_WIDTH,
+    DEFAULT_PIPELINE_WALL_CLOCK_BUDGET_S, DEFAULT_QUOTE_MAX_CONTEXT_LENGTH,
     DEFAULT_QUOTE_MAX_DISPLAY_CHARS, DEFAULT_QUOTE_MAX_DISPLAY_LINES,
     DEFAULT_READER_BATCH_TIMEOUT_S, DEFAULT_READER_PER_URL_TIMEOUT_S, DEFAULT_READER_URL,
     DEFAULT_ROUTER_TIMEOUT_S, DEFAULT_SEARCH_TIMEOUT_S, DEFAULT_SEARXNG_MAX_RESULTS,
@@ -323,7 +323,7 @@ fn resolve_inference(inf: &mut crate::config::schema::InferenceSection) {
     // carry providers. Consumed by the active-pointer pin at the end.
     let is_pre_providers_file = inf.providers.is_empty();
 
-    // num_ctx + keep_warm: unchanged clamping (Ollama-path knobs).
+    // num_ctx + keep_warm: numeric clamps (universal local-provider knobs).
     clamp_u32(
         &mut inf.num_ctx,
         BOUNDS_NUM_CTX,
@@ -335,12 +335,6 @@ fn resolve_inference(inf: &mut crate::config::schema::InferenceSection) {
         DEFAULT_KEEP_WARM_INACTIVITY_MINUTES,
         "inference.keep_warm_inactivity_minutes",
     );
-    clamp_u32(
-        &mut inf.idle_unload_minutes,
-        BOUNDS_IDLE_UNLOAD_MINUTES,
-        DEFAULT_IDLE_UNLOAD_MINUTES,
-        "inference.idle_unload_minutes",
-    );
 
     // Migration: a pre-providers file has `ollama_url` and no `providers`.
     // Carry the URL onto a synthesized Ollama provider; the active model is
diff --git a/src-tauri/src/config/schema.rs b/src-tauri/src/config/schema.rs
index 1e9da3ad..24439ac0 100644
--- a/src-tauri/src/config/schema.rs
+++ b/src-tauri/src/config/schema.rs
@@ -15,10 +15,10 @@ use serde::{Deserialize, Serialize};
 
 use super::defaults::{
     DEFAULT_ACTIVE_PROVIDER, DEFAULT_AUTO_CLOSE, DEFAULT_AUTO_REPLACE, DEFAULT_BUILTIN_LABEL,
-    DEFAULT_DEBUG_TRACE_ENABLED, DEFAULT_IDLE_UNLOAD_MINUTES, DEFAULT_JUDGE_TIMEOUT_S,
-    DEFAULT_KEEP_WARM_INACTIVITY_MINUTES, DEFAULT_MAX_CHAT_HEIGHT, DEFAULT_MAX_IMAGES,
-    DEFAULT_MAX_ITERATIONS, DEFAULT_NUM_CTX, DEFAULT_OLLAMA_LABEL, DEFAULT_OLLAMA_URL,
-    DEFAULT_OVERLAY_WIDTH, DEFAULT_PIPELINE_WALL_CLOCK_BUDGET_S, DEFAULT_QUOTE_MAX_CONTEXT_LENGTH,
+    DEFAULT_DEBUG_TRACE_ENABLED, DEFAULT_JUDGE_TIMEOUT_S, DEFAULT_KEEP_WARM_INACTIVITY_MINUTES,
+    DEFAULT_MAX_CHAT_HEIGHT, DEFAULT_MAX_IMAGES, DEFAULT_MAX_ITERATIONS, DEFAULT_NUM_CTX,
+    DEFAULT_OLLAMA_LABEL, DEFAULT_OLLAMA_URL, DEFAULT_OVERLAY_WIDTH,
+    DEFAULT_PIPELINE_WALL_CLOCK_BUDGET_S, DEFAULT_QUOTE_MAX_CONTEXT_LENGTH,
     DEFAULT_QUOTE_MAX_DISPLAY_CHARS, DEFAULT_QUOTE_MAX_DISPLAY_LINES,
     DEFAULT_READER_BATCH_TIMEOUT_S, DEFAULT_READER_PER_URL_TIMEOUT_S, DEFAULT_READER_URL,
     DEFAULT_ROUTER_TIMEOUT_S, DEFAULT_SEARCH_TIMEOUT_S, DEFAULT_SEARXNG_MAX_RESULTS,
@@ -105,7 +105,8 @@ pub fn default_providers() -> Vec<Provider> {
 /// Inference targets one of several `providers`; `active_provider` selects
 /// which. Per-provider model selection lives on each [`Provider`] record
 /// (replacing the former single SQLite `active_model`). `num_ctx` and
-/// `keep_warm_inactivity_minutes` remain universal Ollama-path knobs.
+/// `keep_warm_inactivity_minutes` are universal knobs read by both local
+/// provider paths.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 #[serde(default)]
 pub struct InferenceSection {
@@ -118,15 +119,14 @@ pub struct InferenceSection {
     /// fit longer conversations in a single context; lower to use less VRAM.
     /// Valid range: 2048..=1048576.
     pub num_ctx: u32,
-    /// Minutes of inactivity before Thuki tells Ollama to release the model.
-    /// Applies to the Ollama provider only. 0 means do not manage (Ollama's
-    /// 5-minute default applies). -1 means keep indefinitely. Valid range: -1
-    /// or 0..=1440.
+    /// Minutes of inactivity before Thuki releases the active model from local
+    /// memory. Governs both local providers: the built-in engine stops its
+    /// sidecar, and Ollama is told to release the model. Not applicable to a
+    /// remote OpenAI-compatible server, whose residency Thuki does not manage.
+    /// 0 uses the provider's natural short default (~5 min): Ollama defers to
+    /// its own timer, the built-in engine applies its own ~5-minute timer.
+    /// -1 keeps the model resident indefinitely. Valid range: -1 or 0..=1440.
     pub keep_warm_inactivity_minutes: i32,
-    /// Minutes of inactivity before the built-in engine is stopped to free
-    /// RAM. 0 keeps the model loaded indefinitely (default). Built-in only;
-    /// Ollama keeps `keep_warm_inactivity_minutes`. Valid range: 0..=1440.
-    pub idle_unload_minutes: u32,
     /// The configured providers. Always contains the built-in entry after
     /// resolution. The field-level `#[serde(default)]` defaults a *missing*
     /// `providers` key to an empty Vec (not the seeded pair), so the loader can
@@ -147,7 +147,6 @@ impl Default for InferenceSection {
             active_provider: DEFAULT_ACTIVE_PROVIDER.to_string(),
             num_ctx: DEFAULT_NUM_CTX,
             keep_warm_inactivity_minutes: DEFAULT_KEEP_WARM_INACTIVITY_MINUTES,
-            idle_unload_minutes: DEFAULT_IDLE_UNLOAD_MINUTES,
             providers: default_providers(),
             legacy_ollama_url: None,
         }
diff --git a/src-tauri/src/config/tests.rs b/src-tauri/src/config/tests.rs
index 69b9c4a9..d9a78276 100644
--- a/src-tauri/src/config/tests.rs
+++ b/src-tauri/src/config/tests.rs
@@ -14,17 +14,17 @@ use std::path::PathBuf;
 
 use super::defaults::{
     DEFAULT_ACTIVE_PROVIDER, DEFAULT_AUTO_CLOSE, DEFAULT_AUTO_REPLACE, DEFAULT_DEBUG_TRACE_ENABLED,
-    DEFAULT_IDLE_UNLOAD_MINUTES, DEFAULT_JUDGE_TIMEOUT_S, DEFAULT_KEEP_WARM_INACTIVITY_MINUTES,
-    DEFAULT_MAX_CHAT_HEIGHT, DEFAULT_MAX_IMAGES, DEFAULT_MAX_ITERATIONS, DEFAULT_NUM_CTX,
-    DEFAULT_OLLAMA_URL, DEFAULT_OVERLAY_WIDTH, DEFAULT_QUOTE_MAX_CONTEXT_LENGTH,
-    DEFAULT_QUOTE_MAX_DISPLAY_CHARS, DEFAULT_QUOTE_MAX_DISPLAY_LINES,
-    DEFAULT_READER_BATCH_TIMEOUT_S, DEFAULT_READER_PER_URL_TIMEOUT_S, DEFAULT_READER_URL,
-    DEFAULT_ROUTER_TIMEOUT_S, DEFAULT_SEARCH_TIMEOUT_S, DEFAULT_SEARXNG_MAX_RESULTS,
-    DEFAULT_SEARXNG_URL, DEFAULT_SYSTEM_PROMPT_BASE, DEFAULT_TEXT_BASE_PX,
-    DEFAULT_TEXT_FONT_WEIGHT, DEFAULT_TEXT_LETTER_SPACING_PX, DEFAULT_TEXT_LINE_HEIGHT,
-    DEFAULT_TOP_K_URLS, DEFAULT_UPDATER_CHECK_INTERVAL_HOURS, DEFAULT_UPDATER_MANIFEST_URL,
-    PROVIDER_ID_BUILTIN, PROVIDER_ID_OLLAMA, PROVIDER_KIND_BUILTIN, PROVIDER_KIND_OLLAMA,
-    PROVIDER_KIND_OPENAI, SLASH_COMMAND_PROMPT_APPENDIX,
+    DEFAULT_JUDGE_TIMEOUT_S, DEFAULT_KEEP_WARM_INACTIVITY_MINUTES, DEFAULT_MAX_CHAT_HEIGHT,
+    DEFAULT_MAX_IMAGES, DEFAULT_MAX_ITERATIONS, DEFAULT_NUM_CTX, DEFAULT_OLLAMA_URL,
+    DEFAULT_OVERLAY_WIDTH, DEFAULT_QUOTE_MAX_CONTEXT_LENGTH, DEFAULT_QUOTE_MAX_DISPLAY_CHARS,
+    DEFAULT_QUOTE_MAX_DISPLAY_LINES, DEFAULT_READER_BATCH_TIMEOUT_S,
+    DEFAULT_READER_PER_URL_TIMEOUT_S, DEFAULT_READER_URL, DEFAULT_ROUTER_TIMEOUT_S,
+    DEFAULT_SEARCH_TIMEOUT_S, DEFAULT_SEARXNG_MAX_RESULTS, DEFAULT_SEARXNG_URL,
+    DEFAULT_SYSTEM_PROMPT_BASE, DEFAULT_TEXT_BASE_PX, DEFAULT_TEXT_FONT_WEIGHT,
+    DEFAULT_TEXT_LETTER_SPACING_PX, DEFAULT_TEXT_LINE_HEIGHT, DEFAULT_TOP_K_URLS,
+    DEFAULT_UPDATER_CHECK_INTERVAL_HOURS, DEFAULT_UPDATER_MANIFEST_URL, PROVIDER_ID_BUILTIN,
+    PROVIDER_ID_OLLAMA, PROVIDER_KIND_BUILTIN, PROVIDER_KIND_OLLAMA, PROVIDER_KIND_OPENAI,
+    SLASH_COMMAND_PROMPT_APPENDIX,
 };
 use super::error::ConfigError;
 use super::loader::{compose_system_prompt, load_from_path, resolve};
@@ -550,49 +550,6 @@ fn num_ctx_roundtrips_through_toml() {
     assert_eq!(reloaded.inference.num_ctx, config.inference.num_ctx);
 }
 
-#[test]
-fn idle_unload_default_matches_const() {
-    let c = AppConfig::default();
-    assert_eq!(c.inference.idle_unload_minutes, DEFAULT_IDLE_UNLOAD_MINUTES);
-}
-
-#[test]
-fn idle_unload_out_of_bounds_resets() {
-    let dir = fresh_temp_dir();
-    let path = config_path_in(&dir);
-    std::fs::write(&path, "[inference]\nidle_unload_minutes = 99999\n").unwrap();
-    let config = load_from_path(&path).unwrap();
-    assert_eq!(
-        config.inference.idle_unload_minutes,
-        DEFAULT_IDLE_UNLOAD_MINUTES
-    );
-}
-
-#[test]
-fn idle_unload_in_bounds_preserved() {
-    let dir = fresh_temp_dir();
-    let path = config_path_in(&dir);
-    std::fs::write(&path, "[inference]\nidle_unload_minutes = 30\n").unwrap();
-    let config = load_from_path(&path).unwrap();
-    assert_eq!(config.inference.idle_unload_minutes, 30);
-}
-
-#[test]
-fn idle_unload_roundtrips_through_toml() {
-    let dir = fresh_temp_dir();
-    let path = config_path_in(&dir);
-    std::fs::write(&path, "[inference]\nidle_unload_minutes = 15\n").unwrap();
-    let config = load_from_path(&path).unwrap();
-    assert_eq!(config.inference.idle_unload_minutes, 15);
-
-    atomic_write(&path, &config).unwrap();
-    let reloaded = load_from_path(&path).unwrap();
-    assert_eq!(
-        reloaded.inference.idle_unload_minutes,
-        config.inference.idle_unload_minutes
-    );
-}
-
 #[test]
 fn resolve_empty_ollama_url_falls_back() {
     let dir = fresh_temp_dir();
diff --git a/src-tauri/src/engine/runner.rs b/src-tauri/src/engine/runner.rs
index ecd2827d..e8ea05f4 100644
--- a/src-tauri/src/engine/runner.rs
+++ b/src-tauri/src/engine/runner.rs
@@ -65,7 +65,7 @@ enum Command {
 
 /// RAII marker for an in-flight LLM request against the engine. While at
 /// least one guard is alive the idle sweep treats the engine as active, so
-/// `idle_unload_minutes` can never kill the sidecar mid-generation (cold
+/// the idle timer can never kill the sidecar mid-generation (cold
 /// ensure, prefill, and body streaming included). Explicit `unload` and
 /// `shutdown` are deliberately NOT blocked by guards: a user-driven eviction
 /// or app quit always wins over an in-flight request.
diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs
index df6459bb..57f725fe 100644
--- a/src-tauri/src/lib.rs
+++ b/src-tauri/src/lib.rs
@@ -2162,11 +2162,15 @@ pub fn run() {
             // process, kill-then-start on model switch, idle unload. Spawned
             // inside block_on so the actor task lands on Tauri's tokio
             // runtime (setup itself runs outside a runtime context).
-            let engine_idle_minutes = app
-                .state::<parking_lot::RwLock<crate::config::AppConfig>>()
-                .read()
-                .inference
-                .idle_unload_minutes;
+            // The unified `keep_warm_inactivity_minutes` field governs both
+            // local providers; translate its sentinel into the runner's own
+            // `idle_minutes` convention through the shared boundary helper.
+            let engine_idle_minutes = warmup::builtin_idle_minutes(
+                app.state::<parking_lot::RwLock<crate::config::AppConfig>>()
+                    .read()
+                    .inference
+                    .keep_warm_inactivity_minutes,
+            );
             let engine_client = app.state::<reqwest::Client>().inner().clone();
             let engine = tauri::async_runtime::block_on(async move {
                 engine::runner::EngineHandle::spawn(
diff --git a/src-tauri/src/settings_commands.rs b/src-tauri/src/settings_commands.rs
index ee5aaf85..4591bc04 100644
--- a/src-tauri/src/settings_commands.rs
+++ b/src-tauri/src/settings_commands.rs
@@ -116,24 +116,32 @@ pub(crate) fn trace_enabled_changed(prior_enabled: bool, resolved: &AppConfig) -
     resolved.debug.trace_enabled != prior_enabled
 }
 
-/// Returns the new `[inference] idle_unload_minutes` value when the post-write
-/// `AppConfig` changed it relative to the pre-write snapshot, `None` when it
-/// is unchanged. Pulled out so the predicate is covered by tests instead of
-/// riding inside the coverage-off Tauri command bodies that forward the new
-/// value to the running engine actor.
-pub(crate) fn idle_unload_minutes_changed(prior_minutes: u32, resolved: &AppConfig) -> Option<u32> {
-    let new_minutes = resolved.inference.idle_unload_minutes;
-    (new_minutes != prior_minutes).then_some(new_minutes)
+/// Returns the engine runner's new `idle_minutes` value when the post-write
+/// `AppConfig` changed `[inference] keep_warm_inactivity_minutes` relative to
+/// the pre-write snapshot, `None` when it is unchanged. The unified field is
+/// translated through [`crate::warmup::builtin_idle_minutes`] so the runner's
+/// own `0 = forever` convention stays an implementation detail. Pulled out so
+/// the predicate is covered by tests instead of riding inside the coverage-off
+/// Tauri command bodies that forward the new value to the running engine actor.
+pub(crate) fn keep_warm_idle_minutes_changed(
+    prior_minutes: i32,
+    resolved: &AppConfig,
+) -> Option<u32> {
+    let new_minutes = resolved.inference.keep_warm_inactivity_minutes;
+    (new_minutes != prior_minutes).then(|| crate::warmup::builtin_idle_minutes(new_minutes))
 }
 
-/// Forwards a changed `[inference] idle_unload_minutes` value to the engine
-/// runner actor so the new idle-unload policy applies without a restart.
-/// Spawned because the config commands are synchronous while the actor's
-/// mailbox is async. Thin dispatch; the predicate and the actor's
+/// Forwards a changed `[inference] keep_warm_inactivity_minutes` value to the
+/// engine runner actor (translated to the runner's `idle_minutes`) so the new
+/// residency policy applies without a restart. The runner is managed
+/// regardless of the active provider, so forwarding is harmless when Ollama is
+/// active (the Ollama path enforces residency through `keep_alive`, not the
+/// runner). Spawned because the config commands are synchronous while the
+/// actor's mailbox is async. Thin dispatch; the predicate and the actor's
 /// `SetIdleMinutes` handling are both tested on their own.
 #[cfg_attr(coverage_nightly, coverage(off))]
-fn forward_idle_unload_minutes(app: &AppHandle, prior_minutes: u32, resolved: &AppConfig) {
-    if let Some(minutes) = idle_unload_minutes_changed(prior_minutes, resolved) {
+fn forward_keep_warm_idle_minutes(app: &AppHandle, prior_minutes: i32, resolved: &AppConfig) {
+    if let Some(minutes) = keep_warm_idle_minutes_changed(prior_minutes, resolved) {
         let engine = app
             .state::<crate::engine::runner::EngineHandle>()
             .inner()
@@ -200,11 +208,11 @@ pub fn set_config_field(
     trace_recorder: State<'_, std::sync::Arc<crate::trace::LiveTraceRecorder>>,
 ) -> Result<AppConfig, ConfigError> {
     let path = config_path(&app)?;
-    let (prior_trace_enabled, prior_idle_unload_minutes) = {
+    let (prior_trace_enabled, prior_keep_warm_minutes) = {
         let guard = state.read();
         (
             guard.debug.trace_enabled,
-            guard.inference.idle_unload_minutes,
+            guard.inference.keep_warm_inactivity_minutes,
         )
     };
     let resolved = {
@@ -224,9 +232,10 @@ pub fn set_config_field(
         let new_inner = crate::build_trace_inner(&app, resolved.debug.trace_enabled);
         trace_recorder.replace(new_inner);
     }
-    // Forward an `[inference] idle_unload_minutes` change to the engine
-    // runner so the new idle policy applies without restarting Thuki.
-    forward_idle_unload_minutes(&app, prior_idle_unload_minutes, &resolved);
+    // Forward an `[inference] keep_warm_inactivity_minutes` change to the
+    // engine runner so the new residency policy applies without restarting
+    // Thuki.
+    forward_keep_warm_idle_minutes(&app, prior_keep_warm_minutes, &resolved);
     emit_config_updated(&app);
     Ok(resolved)
 }
@@ -762,11 +771,11 @@ pub fn reset_config(
     trace_recorder: State<'_, std::sync::Arc<crate::trace::LiveTraceRecorder>>,
 ) -> Result<AppConfig, ConfigError> {
     let path = config_path(&app)?;
-    let (prior_trace_enabled, prior_idle_unload_minutes) = {
+    let (prior_trace_enabled, prior_keep_warm_minutes) = {
         let guard = state.read();
         (
             guard.debug.trace_enabled,
-            guard.inference.idle_unload_minutes,
+            guard.inference.keep_warm_inactivity_minutes,
         )
     };
     let resolved = {
@@ -783,9 +792,9 @@ pub fn reset_config(
         let new_inner = crate::build_trace_inner(&app, resolved.debug.trace_enabled);
         trace_recorder.replace(new_inner);
     }
-    // A whole-file or `[inference]` reset restores the default idle-unload
+    // A whole-file or `[inference]` reset restores the default residency
     // policy; forward it so the engine runner picks it up immediately.
-    forward_idle_unload_minutes(&app, prior_idle_unload_minutes, &resolved);
+    forward_keep_warm_idle_minutes(&app, prior_keep_warm_minutes, &resolved);
     emit_config_updated(&app);
     Ok(resolved)
 }
@@ -852,11 +861,11 @@ pub fn reload_config_from_disk(
     trace_recorder: State<'_, std::sync::Arc<crate::trace::LiveTraceRecorder>>,
 ) -> Result<AppConfig, ConfigError> {
     let path = config_path(&app)?;
-    let (prior_trace_enabled, prior_idle_unload_minutes, prior_kind) = {
+    let (prior_trace_enabled, prior_keep_warm_minutes, prior_kind) = {
         let guard = state.read();
         (
             guard.debug.trace_enabled,
-            guard.inference.idle_unload_minutes,
+            guard.inference.keep_warm_inactivity_minutes,
             guard.inference.active_provider_kind().to_string(),
         )
     };
@@ -873,9 +882,9 @@ pub fn reload_config_from_disk(
         let new_inner = crate::build_trace_inner(&app, resolved.debug.trace_enabled);
         trace_recorder.replace(new_inner);
     }
-    // Manual edits to `[inference] idle_unload_minutes` reach the engine
-    // runner through the same refresh path.
-    forward_idle_unload_minutes(&app, prior_idle_unload_minutes, &resolved);
+    // Manual edits to `[inference] keep_warm_inactivity_minutes` reach the
+    // engine runner through the same refresh path.
+    forward_keep_warm_idle_minutes(&app, prior_keep_warm_minutes, &resolved);
     // A hand-edited `active_provider` that moved away from the built-in
     // engine releases the sidecar, mirroring the Settings radio path.
     unload_engine_if_builtin_deactivated(&app, &prior_kind, &resolved);
diff --git a/src-tauri/src/settings_commands/tests.rs b/src-tauri/src/settings_commands/tests.rs
index 44a544f3..eb101bfb 100644
--- a/src-tauri/src/settings_commands/tests.rs
+++ b/src-tauri/src/settings_commands/tests.rs
@@ -13,8 +13,8 @@ use toml_edit::DocumentMut;
 
 use super::{
     add_openai_provider_to_disk, builtin_deactivated, cleanup_provider_secrets,
-    coerce_json_to_toml, idle_unload_minutes_changed, is_allowed_field, is_allowed_section,
-    is_http_url, json_type_name, json_value_to_toml_item, patch_document, read_document,
+    coerce_json_to_toml, is_allowed_field, is_allowed_section, is_http_url, json_type_name,
+    json_value_to_toml_item, keep_warm_idle_minutes_changed, patch_document, read_document,
     remove_openai_provider_from_disk, reset_section_on_disk, trace_enabled_changed,
     validate_provider_value, write_active_provider_to_disk, write_field_to_disk,
     write_provider_field_to_disk,
@@ -117,7 +117,8 @@ vision = false
 fn allowed_fields_count_matches_schema_field_count() {
     // Hand-counted from `AppConfig`: inference(2) + prompt(1) + window(7) + quote(3)
     // + behavior(2) + search(11) + debug(1) + updater(3) = 30 tunable flat fields.
-    // The inference section's `active_provider` and `providers` array are NOT flat
+    // The inference section's two flat tunables are `keep_warm_inactivity_minutes`
+    // and `num_ctx`; `active_provider` and the `providers` array are NOT flat
     // fields: they are written through the dedicated `set_active_model` /
     // `set_ollama_url` commands, not the generic `set_config_field` path, so they
     // are intentionally absent from ALLOWED_FIELDS. The collapsed bar height and
@@ -128,7 +129,7 @@ fn allowed_fields_count_matches_schema_field_count() {
     // and is intentionally absent from ALLOWED_FIELDS. If this assertion fails, the
     // schema has drifted from the allowlist and someone added a field without
     // extending ALLOWED_FIELDS.
-    assert_eq!(ALLOWED_FIELDS.len(), 31);
+    assert_eq!(ALLOWED_FIELDS.len(), 30);
 }
 
 #[test]
@@ -1576,20 +1577,40 @@ fn trace_enabled_changed_returns_false_when_value_unchanged() {
     assert!(!trace_enabled_changed(false, &cfg));
 }
 
-// ─── idle_unload_minutes_changed ─────────────────────────────────────────────
+// ─── keep_warm_idle_minutes_changed ──────────────────────────────────────────
 
 #[test]
-fn idle_unload_minutes_changed_returns_new_value_on_change() {
+fn keep_warm_idle_minutes_changed_returns_translated_value_on_positive_change() {
     let mut cfg = AppConfig::default();
-    cfg.inference.idle_unload_minutes = 45;
-    assert_eq!(idle_unload_minutes_changed(0, &cfg), Some(45));
+    cfg.inference.keep_warm_inactivity_minutes = 45;
+    // Positive N passes straight through the translator.
+    assert_eq!(keep_warm_idle_minutes_changed(0, &cfg), Some(45));
 }
 
 #[test]
-fn idle_unload_minutes_changed_returns_none_when_unchanged() {
+fn keep_warm_idle_minutes_changed_translates_forever_to_disabled() {
     let mut cfg = AppConfig::default();
-    cfg.inference.idle_unload_minutes = 45;
-    assert_eq!(idle_unload_minutes_changed(45, &cfg), None);
+    cfg.inference.keep_warm_inactivity_minutes = -1;
+    // -1 (forever) translates to the runner's "0 = disabled".
+    assert_eq!(keep_warm_idle_minutes_changed(0, &cfg), Some(0));
+}
+
+#[test]
+fn keep_warm_idle_minutes_changed_translates_zero_to_short_default() {
+    let mut cfg = AppConfig::default();
+    cfg.inference.keep_warm_inactivity_minutes = 0;
+    // 0 (natural short default) translates to the baked-in ~5-minute timer.
+    assert_eq!(
+        keep_warm_idle_minutes_changed(-1, &cfg),
+        Some(crate::config::defaults::DEFAULT_BUILTIN_IDLE_MINUTES)
+    );
+}
+
+#[test]
+fn keep_warm_idle_minutes_changed_returns_none_when_unchanged() {
+    let mut cfg = AppConfig::default();
+    cfg.inference.keep_warm_inactivity_minutes = 45;
+    assert_eq!(keep_warm_idle_minutes_changed(45, &cfg), None);
 }
 
 // ─── builtin_deactivated ─────────────────────────────────────────────────────
diff --git a/src-tauri/src/warmup.rs b/src-tauri/src/warmup.rs
index ed564ab0..a90bf33d 100644
--- a/src-tauri/src/warmup.rs
+++ b/src-tauri/src/warmup.rs
@@ -63,6 +63,33 @@ pub fn keep_alive_string(minutes: i32) -> String {
     }
 }
 
+/// Translates the unified `keep_warm_inactivity_minutes` sentinel into the
+/// built-in engine runner's own `idle_minutes` convention. Lives here next to
+/// [`keep_alive_string`], the symmetric Ollama translator, so both consumers of
+/// the unified field (startup seeding in `lib.rs` and the Settings forward in
+/// `settings_commands.rs`) translate through one tested boundary.
+///
+/// Unified field semantics -> runner convention (`0` = idle-unload disabled =
+/// forever, `N>0` = unload after N minutes):
+/// - `-1` (keep resident forever) -> `0` (disable the runner's idle timer).
+/// - `0` (the provider's natural short default) -> `DEFAULT_BUILTIN_IDLE_MINUTES`
+///   (~5 min): the built-in engine has no external daemon to defer to, so it
+///   applies its own short timer.
+/// - `N>0` (explicit minutes) -> `N` as the runner's minute count.
+///
+/// The loader clamps the field to `[-1, 1440]`, so the catch-all only fires on
+/// already-validated values; the `match` is written total regardless.
+pub(crate) fn builtin_idle_minutes(keep_warm_inactivity_minutes: i32) -> u32 {
+    match keep_warm_inactivity_minutes {
+        -1 => 0,
+        0 => crate::config::defaults::DEFAULT_BUILTIN_IDLE_MINUTES,
+        n if n > 0 => n as u32,
+        // Below -1: out-of-contract once the loader clamps; map to the same
+        // "forever" disable as -1 so a stray negative never enables a timer.
+        _ => 0,
+    }
+}
+
 /// True when the VRAM poller should query Ollama's `/api/ps` on this tick.
 /// The poller observes Ollama's VRAM only: the built-in engine publishes its
 /// lifecycle through the engine status watch and an `openai` provider has no
@@ -1181,6 +1208,35 @@ mod tests {
         assert_eq!(keep_alive_string(-1), "-1");
     }
 
+    #[test]
+    fn builtin_idle_minutes_forever_disables_timer() {
+        // -1 (keep resident forever) maps to the runner's "0 = disabled".
+        assert_eq!(builtin_idle_minutes(-1), 0);
+    }
+
+    #[test]
+    fn builtin_idle_minutes_zero_uses_short_default() {
+        // 0 (natural short default) maps to the baked-in ~5-minute timer.
+        assert_eq!(
+            builtin_idle_minutes(0),
+            crate::config::defaults::DEFAULT_BUILTIN_IDLE_MINUTES
+        );
+    }
+
+    #[test]
+    fn builtin_idle_minutes_positive_passes_through() {
+        assert_eq!(builtin_idle_minutes(30), 30);
+        assert_eq!(builtin_idle_minutes(1), 1);
+        assert_eq!(builtin_idle_minutes(1440), 1440);
+    }
+
+    #[test]
+    fn builtin_idle_minutes_below_minus_one_disables_timer() {
+        // Out-of-contract once the loader clamps; the total match still maps
+        // any stray negative to the "forever" disable rather than a timer.
+        assert_eq!(builtin_idle_minutes(-999), 0);
+    }
+
     #[tokio::test]
     async fn evict_model_request_sends_keep_alive_zero_as_string() {
         let mut server = Server::new_async().await;
diff --git a/src/settings/SettingsWindow.test.tsx b/src/settings/SettingsWindow.test.tsx
index d434a744..93d2c44a 100644
--- a/src/settings/SettingsWindow.test.tsx
+++ b/src/settings/SettingsWindow.test.tsx
@@ -19,7 +19,6 @@ const SAMPLE: RawAppConfig = {
   inference: {
     active_provider: 'ollama',
     keep_warm_inactivity_minutes: 0,
-    idle_unload_minutes: 0,
     num_ctx: 16384,
     providers: [
       {
diff --git a/src/settings/components/SaveField.test.tsx b/src/settings/components/SaveField.test.tsx
index 84dc43e2..a4c05062 100644
--- a/src/settings/components/SaveField.test.tsx
+++ b/src/settings/components/SaveField.test.tsx
@@ -12,7 +12,6 @@ const SAMPLE: RawAppConfig = {
   inference: {
     active_provider: 'ollama',
     keep_warm_inactivity_minutes: 0,
-    idle_unload_minutes: 0,
     num_ctx: 16384,
     providers: [
       {
diff --git a/src/settings/configHelpers.ts b/src/settings/configHelpers.ts
index 32e87aca..048b0972 100644
--- a/src/settings/configHelpers.ts
+++ b/src/settings/configHelpers.ts
@@ -17,11 +17,9 @@ const HELPERS = {
     ollama_base_url:
       'The address where Thuki reaches your Ollama server. The default works if you run Ollama on this Mac with its standard port. Point it at another machine to use Ollama running elsewhere (one server at a time).',
     keep_warm:
-      'When on, Thuki tells Ollama to keep the active model loaded in GPU memory between conversations, saving the cold-load wait on every open. Set "Release after" to −1 to keep it warm indefinitely, or pick a timeout in minutes so GPU memory is reclaimed when you stop using Thuki for a while.',
+      'How long Thuki keeps the active model resident in memory between messages, so the next one skips the cold-load wait. Applies to both local providers (the built-in engine and Ollama); it does not apply to a remote OpenAI-compatible server, whose memory Thuki does not manage. Set "Release after" to −1 to keep it resident indefinitely, 0 to use the provider\'s natural short default (about 5 minutes), or a timeout in minutes so memory is reclaimed when you stop using Thuki for a while.',
     builtin_model:
       'The downloaded model Thuki\'s built-in engine runs. Pick from the models you have downloaded, or use "Download a model" below to grab a curated starter or any GGUF file from a Hugging Face repo.',
-    idle_unload_minutes:
-      'How many minutes of inactivity before Thuki stops its built-in engine to free memory. 0 (the default) keeps the model loaded so the first token of your next message stays instant. A positive value frees memory after that many idle minutes, at the cost of a cold reload on the next message.',
     openai_base_url:
       'The address of your OpenAI-compatible server (LM Studio, Jan, llama-server, and similar all expose one). Thuki calls its /v1 endpoints for chat and model listing. Must start with http:// or https://.',
     openai_api_key:
diff --git a/src/settings/hooks/useConfigSync.test.ts b/src/settings/hooks/useConfigSync.test.ts
index d04ac8d5..d27bb256 100644
--- a/src/settings/hooks/useConfigSync.test.ts
+++ b/src/settings/hooks/useConfigSync.test.ts
@@ -16,7 +16,6 @@ const CONFIG_A: RawAppConfig = {
   inference: {
     active_provider: 'ollama',
     keep_warm_inactivity_minutes: 0,
-    idle_unload_minutes: 0,
     num_ctx: 16384,
     providers: [
       {
diff --git a/src/settings/hooks/useDebouncedSave.test.ts b/src/settings/hooks/useDebouncedSave.test.ts
index 04668a8b..6cc19ae4 100644
--- a/src/settings/hooks/useDebouncedSave.test.ts
+++ b/src/settings/hooks/useDebouncedSave.test.ts
@@ -13,7 +13,6 @@ const SAMPLE_CONFIG: RawAppConfig = {
   inference: {
     active_provider: 'ollama',
     keep_warm_inactivity_minutes: 0,
-    idle_unload_minutes: 0,
     num_ctx: 16384,
     providers: [
       {
diff --git a/src/settings/tabs/ModelTab.tsx b/src/settings/tabs/ModelTab.tsx
index 22f225b8..988536d3 100644
--- a/src/settings/tabs/ModelTab.tsx
+++ b/src/settings/tabs/ModelTab.tsx
@@ -2,9 +2,9 @@
  * AI tab.
  *
  * Holds the Providers panel (built-in engine, Ollama, and an optional
- * OpenAI-compatible server, with the active one selectable), the per-kind
- * memory controls (Keep Warm for Ollama, Idle Unload for the built-in
- * engine), the context window slider, and the custom system prompt. The
+ * OpenAI-compatible server, with the active one selectable), the unified
+ * Keep Warm residency control (shown for both local providers, hidden for
+ * OpenAI), the context window slider, and the custom system prompt. The
  * Window/Quote knobs live in the Display tab.
  */
 
@@ -47,10 +47,11 @@ const EJECT_RESET_MS = 2500;
 const TOKENS_PER_TURN_ESTIMATE = 400;
 
 const KEEP_WARM_TOOLTIP =
-  'Keep Warm holds your active model loaded in VRAM after each use. ' +
+  'Keep Warm holds your active model resident in memory after each use, ' +
+  'for both the built-in engine and Ollama. ' +
   'The timer below sets how long before it auto-releases; use -1 to keep it indefinitely. ' +
   'Unload now releases it immediately. ' +
-  'If set to 0, Ollama unloads models after its default 5-minute timeout.';
+  'If set to 0, each provider uses its natural short default (about 5 minutes).';
 
 // Log-scale context window slider: slider pos [0..1000] ↔ token count.
 // Scale: value = CTX_MIN * (CTX_MAX / CTX_MIN)^(pos/1000)
@@ -194,21 +195,6 @@ export function ModelTab({ config, resyncToken, onSaved }: ModelTabProps) {
     { onSaved },
   );
 
-  // Built-in engine idle-unload minutes (replaces keep-warm when the
-  // built-in provider is active). Same raw-string editing pattern as the
-  // keep-warm minutes input above.
-  const [idleMin, setIdleMin] = useState(config.inference.idle_unload_minutes);
-  const [rawIdleMin, setRawIdleMin] = useState(
-    String(config.inference.idle_unload_minutes),
-  );
-  const idleMinFocusedRef = useRef(false);
-  const { resetTo: resetIdleMin } = useDebouncedSave(
-    'inference',
-    'idle_unload_minutes',
-    idleMin,
-    { onSaved },
-  );
-
   const prevTokenRef = useRef(resyncToken);
 
   if (prevTokenRef.current !== resyncToken) {
@@ -218,11 +204,6 @@ export function ModelTab({ config, resyncToken, onSaved }: ModelTabProps) {
       setRawMin(String(config.inference.keep_warm_inactivity_minutes));
       resetMin(config.inference.keep_warm_inactivity_minutes);
     }
-    if (!idleMinFocusedRef.current) {
-      setIdleMin(config.inference.idle_unload_minutes);
-      setRawIdleMin(String(config.inference.idle_unload_minutes));
-      resetIdleMin(config.inference.idle_unload_minutes);
-    }
     const nextCtx = config.inference.num_ctx;
     setNumCtx(nextCtx);
     setCtxPos(ctxToPos(nextCtx));
@@ -412,74 +393,25 @@ export function ModelTab({ config, resyncToken, onSaved }: ModelTabProps) {
         )}
       </Section>
 
-      {activeKind === 'builtin' ? (
-        <Section heading="Idle Unload">
-          <SettingRow
-            label="Unload after idle"
-            helper={configHelp('inference', 'idle_unload_minutes')}
-          >
-            <div className={styles.keepWarmTimerGroup}>
-              <input
-                type="number"
-                className={styles.keepWarmNumberInput}
-                value={rawIdleMin}
-                min={0}
-                max={1440}
-                aria-label="Unload after N idle minutes"
-                onFocus={() => {
-                  idleMinFocusedRef.current = true;
-                }}
-                onChange={(e) => {
-                  const n = parseInt(e.target.value, 10);
-                  if (Number.isNaN(n)) {
-                    setRawIdleMin(e.target.value);
-                  } else {
-                    const clamped = Math.max(0, Math.min(1440, n));
-                    setRawIdleMin(String(clamped));
-                    setIdleMin(clamped);
-                  }
-                }}
-                onBlur={() => {
-                  idleMinFocusedRef.current = false;
-                  if (Number.isNaN(parseInt(rawIdleMin, 10))) {
-                    setRawIdleMin('0');
-                    setIdleMin(0);
-                  }
-                }}
-              />
-              <span className={styles.keepWarmUnit}>min</span>
-            </div>
-          </SettingRow>
-          <div className={styles.keepWarmStatusRow}>
-            <span className={styles.engineStatusLine}>
-              Engine: {engineState}
-            </span>
-            <button
-              type="button"
-              className={styles.keepWarmEjectPill}
-              aria-label="Unload now"
-              disabled={engineState !== 'loaded'}
-              onClick={handleEngineEject}
-            >
-              Unload now
-            </button>
-          </div>
-        </Section>
-      ) : null}
-
-      {activeKind === 'ollama' ? (
+      {/* Unified residency control: one Keep Warm knob bound to
+          keep_warm_inactivity_minutes, shown for both local providers
+          (built-in engine and Ollama) and hidden for OpenAI (Thuki does not
+          manage a remote server's residency). The status row branches by
+          kind: the built-in engine reports its sidecar lifecycle, Ollama
+          reports the model resident in VRAM. */}
+      {activeKind === 'builtin' || activeKind === 'ollama' ? (
         <Section heading="Keep Warm">
           {/* Row 1: label + [?] on left | Release after [N] min on right */}
           <div className={styles.keepWarmRow1}>
             <div className={styles.keepWarmLabelLine}>
               <span className={styles.keepWarmLabel}>
-                Keep active model in VRAM
+                Keep active model in memory
               </span>
               <Tooltip label={KEEP_WARM_TOOLTIP} multiline>
                 <button
                   type="button"
                   className={styles.infoBtn}
-                  aria-label="About Keep active model in VRAM"
+                  aria-label="About Keep active model in memory"
                 >
                   ?
                 </button>
@@ -521,51 +453,70 @@ export function ModelTab({ config, resyncToken, onSaved }: ModelTabProps) {
             </div>
           </div>
 
-          {/* Row 2: slug status on left | Unload now on right */}
-          <div className={styles.keepWarmStatusRow}>
-            <div className={styles.keepWarmStatusLeft}>
-              {loadedModel !== null ? (
-                <div className={styles.keepWarmVramSubtitle}>
-                  <span
-                    className={styles.keepWarmVramDot}
-                    data-testid="vram-status-dot"
-                    aria-hidden="true"
-                  />
-                  <span className={styles.keepWarmVramModelName}>
-                    {loadedModel}
-                  </span>
-                  <span>&nbsp;· in VRAM</span>
-                </div>
-              ) : (
-                <span className={styles.keepWarmNoModel}>No model loaded</span>
-              )}
+          {/* Row 2: residency status on left | Unload now on right. */}
+          {activeKind === 'builtin' ? (
+            <div className={styles.keepWarmStatusRow}>
+              <span className={styles.engineStatusLine}>
+                Engine: {engineState}
+              </span>
+              <button
+                type="button"
+                className={styles.keepWarmEjectPill}
+                aria-label="Unload now"
+                disabled={engineState !== 'loaded'}
+                onClick={handleEngineEject}
+              >
+                Unload now
+              </button>
             </div>
+          ) : (
+            <div className={styles.keepWarmStatusRow}>
+              <div className={styles.keepWarmStatusLeft}>
+                {loadedModel !== null ? (
+                  <div className={styles.keepWarmVramSubtitle}>
+                    <span
+                      className={styles.keepWarmVramDot}
+                      data-testid="vram-status-dot"
+                      aria-hidden="true"
+                    />
+                    <span className={styles.keepWarmVramModelName}>
+                      {loadedModel}
+                    </span>
+                    <span>&nbsp;· in VRAM</span>
+                  </div>
+                ) : (
+                  <span className={styles.keepWarmNoModel}>
+                    No model loaded
+                  </span>
+                )}
+              </div>
 
-            <button
-              type="button"
-              className={styles.keepWarmEjectPill}
-              aria-label="Unload now"
-              disabled={ejecting || loadedModel === null}
-              data-ejecting={ejecting}
-              onClick={handleEject}
-            >
-              {ejecting ? (
-                <DrawCheckIcon />
-              ) : (
-                <svg
-                  viewBox="0 0 16 16"
-                  width="11"
-                  height="11"
-                  fill="currentColor"
-                  aria-hidden="true"
-                >
-                  <polygon points="8,2 14,11 2,11" />
-                  <rect x="2" y="12.5" width="12" height="2" rx="1" />
-                </svg>
-              )}
-              Unload now
-            </button>
-          </div>
+              <button
+                type="button"
+                className={styles.keepWarmEjectPill}
+                aria-label="Unload now"
+                disabled={ejecting || loadedModel === null}
+                data-ejecting={ejecting}
+                onClick={handleEject}
+              >
+                {ejecting ? (
+                  <DrawCheckIcon />
+                ) : (
+                  <svg
+                    viewBox="0 0 16 16"
+                    width="11"
+                    height="11"
+                    fill="currentColor"
+                    aria-hidden="true"
+                  >
+                    <polygon points="8,2 14,11 2,11" />
+                    <rect x="2" y="12.5" width="12" height="2" rx="1" />
+                  </svg>
+                )}
+                Unload now
+              </button>
+            </div>
+          )}
         </Section>
       ) : null}
 
diff --git a/src/settings/tabs/ProviderCards.test.tsx b/src/settings/tabs/ProviderCards.test.tsx
index b8127bf4..d45b58f3 100644
--- a/src/settings/tabs/ProviderCards.test.tsx
+++ b/src/settings/tabs/ProviderCards.test.tsx
@@ -38,7 +38,6 @@ const BASE_CONFIG: RawAppConfig = {
   inference: {
     active_provider: 'builtin',
     keep_warm_inactivity_minutes: 0,
-    idle_unload_minutes: 0,
     num_ctx: 16384,
     providers: [
       {
diff --git a/src/settings/tabs/tabs.test.tsx b/src/settings/tabs/tabs.test.tsx
index 600fe015..3714ebbd 100644
--- a/src/settings/tabs/tabs.test.tsx
+++ b/src/settings/tabs/tabs.test.tsx
@@ -38,7 +38,6 @@ const CONFIG: RawAppConfig = {
   inference: {
     active_provider: 'ollama',
     keep_warm_inactivity_minutes: 0,
-    idle_unload_minutes: 0,
     num_ctx: 16384,
     providers: [
       {
@@ -95,7 +94,7 @@ const CONFIG: RawAppConfig = {
   },
 };
 
-/** CONFIG with the built-in provider active (Idle Unload replaces Keep Warm). */
+/** CONFIG with the built-in provider active (Keep Warm shows the engine-status row). */
 const BUILTIN_ACTIVE_CONFIG: RawAppConfig = {
   ...CONFIG,
   inference: { ...CONFIG.inference, active_provider: 'builtin' },
@@ -526,7 +525,7 @@ describe('ModelTab', () => {
   it('renders the Keep Warm section with Release after input and Unload now button', async () => {
     await renderModelTab();
     expect(screen.getByText('Keep Warm')).toBeInTheDocument();
-    expect(screen.getByText('Keep active model in VRAM')).toBeInTheDocument();
+    expect(screen.getByText('Keep active model in memory')).toBeInTheDocument();
     expect(screen.getByText('Release after')).toBeInTheDocument();
     expect(
       screen.getByRole('button', { name: 'Unload now' }),
@@ -1154,7 +1153,7 @@ describe('ModelTab', () => {
     });
   });
 
-  // ─── Idle Unload (built-in provider active) ─────────────────────────────
+  // ─── Keep Warm with the built-in provider active ────────────────────────
 
   async function renderBuiltinActive(
     onSaved: (next: RawAppConfig) => void = () => {},
@@ -1172,53 +1171,35 @@ describe('ModelTab', () => {
     return view;
   }
 
-  it('renders Idle Unload instead of Keep Warm when the built-in provider is active', async () => {
+  it('renders the unified Keep Warm control with the engine-status row when the built-in provider is active', async () => {
     await renderBuiltinActive();
-    expect(screen.getByText('Idle Unload')).toBeInTheDocument();
-    expect(screen.queryByText('Keep Warm')).not.toBeInTheDocument();
+    // Same single Keep Warm section as Ollama, but the built-in status row
+    // reports the sidecar lifecycle instead of the VRAM slug.
+    expect(screen.getByText('Keep Warm')).toBeInTheDocument();
+    expect(screen.getByText('Keep active model in memory')).toBeInTheDocument();
+    expect(screen.queryByText('Idle Unload')).not.toBeInTheDocument();
+    expect(screen.queryByTestId('vram-status-dot')).not.toBeInTheDocument();
     expect(screen.getByText('Engine: stopped')).toBeInTheDocument();
     expect(screen.getByRole('button', { name: 'Unload now' })).toBeDisabled();
   });
 
-  it('clamps the idle minutes input to the 0..1440 range', async () => {
+  it('clamps the Keep Warm input to the -1..1440 range while built-in is active', async () => {
     await renderBuiltinActive();
     const input = screen.getByRole('spinbutton', {
-      name: 'Unload after N idle minutes',
+      name: 'Release after N minutes',
     }) as HTMLInputElement;
     fireEvent.change(input, { target: { value: '45' } });
     expect(input.value).toBe('45');
     fireEvent.change(input, { target: { value: '-5' } });
-    expect(input.value).toBe('0');
+    expect(input.value).toBe('-1');
     fireEvent.change(input, { target: { value: '99999' } });
     expect(input.value).toBe('1440');
   });
 
-  it('allows empty idle input mid-edit; blur defaults to 0', async () => {
-    await renderBuiltinActive();
-    const input = screen.getByRole('spinbutton', {
-      name: 'Unload after N idle minutes',
-    }) as HTMLInputElement;
-    fireEvent.focus(input);
-    fireEvent.change(input, { target: { value: '' } });
-    expect(input.value).toBe('');
-    fireEvent.blur(input);
-    expect(input.value).toBe('0');
-  });
-
-  it('blur with a valid idle value does not reset the field', async () => {
-    await renderBuiltinActive();
-    const input = screen.getByRole('spinbutton', {
-      name: 'Unload after N idle minutes',
-    }) as HTMLInputElement;
-    fireEvent.change(input, { target: { value: '30' } });
-    fireEvent.blur(input);
-    expect(input.value).toBe('30');
-  });
-
-  it('resync does not overwrite the idle minutes input while focused', async () => {
+  it('resync does not overwrite the Keep Warm input while focused (built-in active)', async () => {
     const { rerender } = await renderBuiltinActive();
     const input = screen.getByRole('spinbutton', {
-      name: 'Unload after N idle minutes',
+      name: 'Release after N minutes',
     }) as HTMLInputElement;
     fireEvent.focus(input);
     fireEvent.change(input, { target: { value: '25' } });
@@ -1226,7 +1207,7 @@ describe('ModelTab', () => {
       ...BUILTIN_ACTIVE_CONFIG,
       inference: {
         ...BUILTIN_ACTIVE_CONFIG.inference,
-        idle_unload_minutes: 90,
+        keep_warm_inactivity_minutes: 90,
       },
     };
     rerender(
diff --git a/src/settings/types.ts b/src/settings/types.ts
index 97e1197a..c945fb6c 100644
--- a/src/settings/types.ts
+++ b/src/settings/types.ts
@@ -27,7 +27,6 @@ export interface RawAppConfig {
   inference: {
     active_provider: string;
     keep_warm_inactivity_minutes: number;
-    idle_unload_minutes: number;
     num_ctx: number;
     providers: RawProvider[];
   };

From ab046832b6ba8e6da8c63ae2cc38617959b21814 Mon Sep 17 00:00:00 2001
From: Logan Nguyen <lg.131.dev@gmail.com>
Date: Thu, 18 Jun 2026 17:46:20 -0500
Subject: [PATCH 2/4] chore: gate the OpenAI provider UI behind a compile-time
 dev flag

Signed-off-by: Logan Nguyen <lg.131.dev@gmail.com>
---
 src/settings/devFlags.test.ts   | 34 ++++++++++++++++++
 src/settings/devFlags.ts        | 23 +++++++++++++
 src/settings/tabs/ModelTab.tsx  | 61 +++++++++++++++++++--------------
 src/settings/tabs/tabs.test.tsx | 42 +++++++++++++++++++++++
 src/vite-env.d.ts               |  6 ++++
 5 files changed, 140 insertions(+), 26 deletions(-)
 create mode 100644 src/settings/devFlags.test.ts
 create mode 100644 src/settings/devFlags.ts

diff --git a/src/settings/devFlags.test.ts b/src/settings/devFlags.test.ts
new file mode 100644
index 00000000..1b096178
--- /dev/null
+++ b/src/settings/devFlags.test.ts
@@ -0,0 +1,34 @@
+import { afterEach, describe, expect, it, vi } from 'vitest';
+
+/**
+ * The flag is a module-level const evaluated at import, so each case stubs the
+ * env, resets the module registry, and re-imports to observe the resolved
+ * value. This covers both the enabled ("true") and disabled (everything else)
+ * evaluations of `import.meta.env.VITE_ENABLE_OPENAI_PROVIDER === 'true'`.
+ */
+async function loadFlag(): Promise<boolean> {
+  vi.resetModules();
+  const mod = await import('./devFlags');
+  return mod.OPENAI_PROVIDER_ENABLED;
+}
+
+describe('OPENAI_PROVIDER_ENABLED', () => {
+  afterEach(() => {
+    vi.unstubAllEnvs();
+  });
+
+  it('is true only when VITE_ENABLE_OPENAI_PROVIDER is exactly "true"', async () => {
+    vi.stubEnv('VITE_ENABLE_OPENAI_PROVIDER', 'true');
+    expect(await loadFlag()).toBe(true);
+  });
+
+  it('is false when the env var is unset', async () => {
+    vi.stubEnv('VITE_ENABLE_OPENAI_PROVIDER', undefined);
+    expect(await loadFlag()).toBe(false);
+  });
+
+  it('is false for any other value', async () => {
+    vi.stubEnv('VITE_ENABLE_OPENAI_PROVIDER', '1');
+    expect(await loadFlag()).toBe(false);
+  });
+});
diff --git a/src/settings/devFlags.ts b/src/settings/devFlags.ts
new file mode 100644
index 00000000..dda3d761
--- /dev/null
+++ b/src/settings/devFlags.ts
@@ -0,0 +1,23 @@
+/**
+ * Compile-time, dev-only feature flags for the Settings UI.
+ *
+ * These are resolved from `import.meta.env` at build time, so a flag that is
+ * `false` in a production build lets Vite tree-shake the gated affordance out
+ * of the shipped bundle entirely.
+ */
+
+/**
+ * Whether the OpenAI-compatible provider KIND is exposed in the Settings UI.
+ *
+ * Thuki ships local-only (the bundled built-in engine plus local or remote
+ * Ollama). The `openai` backend (the shared `/v1` client, Keychain storage,
+ * the routing arm, the config kind) stays live: this flag gates only the
+ * user-facing affordances that let someone create or manage an `openai`
+ * provider, so no end user can reach one in a shipped build.
+ *
+ * Off by default. Developers opt in at build time by setting
+ * `VITE_ENABLE_OPENAI_PROVIDER=true`; any other value (including unset) is
+ * off.
+ */
+export const OPENAI_PROVIDER_ENABLED =
+  import.meta.env.VITE_ENABLE_OPENAI_PROVIDER === 'true';
diff --git a/src/settings/tabs/ModelTab.tsx b/src/settings/tabs/ModelTab.tsx
index 988536d3..f232ff52 100644
--- a/src/settings/tabs/ModelTab.tsx
+++ b/src/settings/tabs/ModelTab.tsx
@@ -20,6 +20,7 @@ import {
   OpenAiProviderCard,
 } from './ProviderCards';
 import { useDebouncedSave } from '../hooks/useDebouncedSave';
+import { OPENAI_PROVIDER_ENABLED } from '../devFlags';
 import { useModelSelection } from '../../hooks/useModelSelection';
 import { isNonLocalUrl } from '../../utils/isNonLocalUrl';
 import { configHelp } from '../configHelpers';
@@ -364,33 +365,41 @@ export function ModelTab({ config, resyncToken, onSaved }: ModelTabProps) {
           ) : null}
         </div>
 
-        {openaiProvider ? (
-          <div
-            className={providerCardClass(activeKind === 'openai')}
-            data-provider-card="openai"
-          >
-            <label className={styles.providerSelectRow}>
-              <input
-                type="radio"
-                className={styles.providerRadio}
-                name="active-provider"
-                aria-label="Use OpenAI-compatible server"
-                checked={activeKind === 'openai'}
-                onChange={() => selectProvider(openaiProvider.id)}
+        {/* The OpenAI-compatible provider KIND is gated behind a
+            compile-time, dev-only flag (off in shipped builds): both the
+            management card and the "add a server" affordance are the only UI
+            paths to create or manage one, so hiding them keeps the kind out of
+            reach of end users. The shared /v1 backend stays live for the
+            built-in engine regardless. */}
+        {OPENAI_PROVIDER_ENABLED ? (
+          openaiProvider ? (
+            <div
+              className={providerCardClass(activeKind === 'openai')}
+              data-provider-card="openai"
+            >
+              <label className={styles.providerSelectRow}>
+                <input
+                  type="radio"
+                  className={styles.providerRadio}
+                  name="active-provider"
+                  aria-label="Use OpenAI-compatible server"
+                  checked={activeKind === 'openai'}
+                  onChange={() => selectProvider(openaiProvider.id)}
+                />
+                <span className={styles.providerName}>
+                  {openaiProvider.label}
+                </span>
+              </label>
+              <OpenAiProviderCard
+                provider={openaiProvider}
+                resyncToken={resyncToken}
+                onSaved={onSaved}
               />
-              <span className={styles.providerName}>
-                {openaiProvider.label}
-              </span>
-            </label>
-            <OpenAiProviderCard
-              provider={openaiProvider}
-              resyncToken={resyncToken}
-              onSaved={onSaved}
-            />
-          </div>
-        ) : (
-          <AddOpenAiProvider onSaved={onSaved} />
-        )}
+            </div>
+          ) : (
+            <AddOpenAiProvider onSaved={onSaved} />
+          )
+        ) : null}
       </Section>
 
       {/* Unified residency control: one Keep Warm knob bound to
diff --git a/src/settings/tabs/tabs.test.tsx b/src/settings/tabs/tabs.test.tsx
index 3714ebbd..5efb9b4b 100644
--- a/src/settings/tabs/tabs.test.tsx
+++ b/src/settings/tabs/tabs.test.tsx
@@ -32,6 +32,17 @@ import { AboutTab } from './AboutTab';
 import { BehaviorTab } from './BehaviorTab';
 import type { RawAppConfig } from '../types';
 
+// The OpenAI-compatible provider UI is gated behind a compile-time dev flag
+// (off in shipped builds). Mock the flag module through a mutable holder so the
+// suite can drive both branches: the existing openai-card tests run with it
+// enabled, and a dedicated test asserts the affordance is absent when disabled.
+const devFlags = { openaiEnabled: true };
+vi.mock('../devFlags', () => ({
+  get OPENAI_PROVIDER_ENABLED() {
+    return devFlags.openaiEnabled;
+  },
+}));
+
 const invokeMock = invoke as unknown as ReturnType<typeof vi.fn>;
 
 const CONFIG: RawAppConfig = {
@@ -127,6 +138,9 @@ function engineStatus(
 }
 
 beforeEach(() => {
+  // Default to the enabled branch so the openai-card tests render the gated
+  // UI; the disabled-state test flips this within its own body.
+  devFlags.openaiEnabled = true;
   invokeMock.mockReset();
   invokeMock.mockImplementation((cmd: string) => {
     if (cmd === 'get_loaded_model') return Promise.resolve(null);
@@ -1153,6 +1167,34 @@ describe('ModelTab', () => {
     });
   });
 
+  it('hides every OpenAI-compatible affordance when the dev flag is disabled', async () => {
+    devFlags.openaiEnabled = false;
+    // No openai provider configured: the "add a server" affordance is gone.
+    const { unmount } = render(
+      <ModelTab config={CONFIG} resyncToken={0} onSaved={() => {}} />,
+    );
+    await act(async () => {
+      await Promise.resolve();
+    });
+    expect(
+      screen.queryByRole('button', { name: 'Add OpenAI-compatible server' }),
+    ).not.toBeInTheDocument();
+    unmount();
+
+    // An openai provider hand-edited into config: its management card and
+    // radio stay hidden too (the backend still honors it).
+    render(
+      <ModelTab config={OPENAI_CONFIG} resyncToken={0} onSaved={() => {}} />,
+    );
+    await act(async () => {
+      await Promise.resolve();
+    });
+    expect(screen.queryByText('LM Studio')).not.toBeInTheDocument();
+    expect(
+      screen.queryByRole('radio', { name: 'Use OpenAI-compatible server' }),
+    ).not.toBeInTheDocument();
+  });
+
   // ─── Keep Warm with the built-in provider active ────────────────────────
 
   async function renderBuiltinActive(
diff --git a/src/vite-env.d.ts b/src/vite-env.d.ts
index 6c26522c..03152e9f 100644
--- a/src/vite-env.d.ts
+++ b/src/vite-env.d.ts
@@ -6,6 +6,12 @@ interface ImportMetaEnv {
   readonly VITE_QUOTE_MAX_CONTEXT_LENGTH: string | undefined;
   /** Full git commit SHA injected by CI at build time. Absent in local dev and stable release builds. */
   readonly VITE_GIT_COMMIT_SHA: string | undefined;
+  /**
+   * Dev-only opt-in for the OpenAI-compatible provider UI. `"true"` exposes
+   * the Settings affordances to create/manage an `openai` provider; any other
+   * value (including unset) hides them. Off in shipped builds.
+   */
+  readonly VITE_ENABLE_OPENAI_PROVIDER: string | undefined;
 }
 
 interface ImportMeta {

From 0443fa896c5f275e3530db9b333145b40dc89c54 Mon Sep 17 00:00:00 2001
From: Logan Nguyen <lg.131.dev@gmail.com>
Date: Thu, 18 Jun 2026 17:50:49 -0500
Subject: [PATCH 3/4] chore: replace internal phase jargon with behavior
 descriptions

Signed-off-by: Logan Nguyen <lg.131.dev@gmail.com>
---
 src-tauri/src/commands.rs                            | 4 ++--
 src-tauri/src/config/defaults.rs                     | 7 ++++---
 src-tauri/src/config/loader.rs                       | 9 +++++----
 src-tauri/src/config/tests.rs                        | 2 +-
 src-tauri/src/models/mod.rs                          | 8 ++++----
 src-tauri/src/screenshot.rs                          | 4 ++--
 src-tauri/src/settings_commands/tests.rs             | 2 +-
 src/App.tsx                                          | 4 ++--
 src/components/ChatBubble.tsx                        | 6 +++---
 src/utils/__tests__/capabilityConflicts.test.ts      | 4 ++--
 src/utils/__tests__/sanitizeAssistantContent.test.ts | 2 +-
 src/utils/capabilityConflicts.ts                     | 6 +++---
 src/utils/sanitizeAssistantContent.ts                | 6 +++---
 13 files changed, 33 insertions(+), 31 deletions(-)

diff --git a/src-tauri/src/commands.rs b/src-tauri/src/commands.rs
index c0c6aab2..0fb7e5e5 100644
--- a/src-tauri/src/commands.rs
+++ b/src-tauri/src/commands.rs
@@ -2849,7 +2849,7 @@ mod tests {
         assert_eq!(messages[1].images.as_ref().unwrap().len(), 1);
     }
 
-    // ─── classify_http_error: Phase B picker hint ────────────────────────────
+    // ─── classify_http_error: capability picker hint ─────────────────────────
 
     #[test]
     fn classify_http_500_appends_picker_hint_when_body_mentions_image() {
@@ -3191,7 +3191,7 @@ mod tests {
     /// Locks the native `/api/chat` wire contract across the routing change:
     /// the exact request body (model, messages, stream, think, options
     /// {temperature, top_p, top_k, num_ctx}, keep_alive) must be identical
-    /// to the pre-routing Phase 1 payload.
+    /// to the payload Thuki sent before provider routing was introduced.
     #[tokio::test]
     async fn ollama_request_body_unchanged() {
         use crate::config::defaults::PROVIDER_KIND_OLLAMA;
diff --git a/src-tauri/src/config/defaults.rs b/src-tauri/src/config/defaults.rs
index 7c5b517b..38b1a975 100644
--- a/src-tauri/src/config/defaults.rs
+++ b/src-tauri/src/config/defaults.rs
@@ -35,10 +35,11 @@ pub const DEFAULT_OPENAI_LABEL: &str = "OpenAI-compatible";
 
 /// Provider Thuki sends inference to on a fresh install.
 ///
-/// Phase 2 bundles the llama.cpp engine, so a new install starts on the
+/// Thuki bundles the llama.cpp engine, so a new install starts on the
 /// built-in provider and onboarding offers a starter model download. Configs
-/// that already persisted an `active_provider` (including Phase 1's Ollama
-/// default) are never rewritten; only fresh or dangling pointers land here.
+/// that already persisted an `active_provider` (including the older
+/// Ollama-only default) are never rewritten; only fresh or dangling pointers
+/// land here.
 pub const DEFAULT_ACTIVE_PROVIDER: &str = PROVIDER_ID_BUILTIN;
 
 /// Default inactivity window before Thuki releases the active model from local
diff --git a/src-tauri/src/config/loader.rs b/src-tauri/src/config/loader.rs
index 75dd63f6..6e397097 100644
--- a/src-tauri/src/config/loader.rs
+++ b/src-tauri/src/config/loader.rs
@@ -406,7 +406,7 @@ fn resolve_inference(inf: &mut crate::config::schema::InferenceSection) {
     {
         inf.providers.insert(0, builtin_provider());
     }
-    // Ensure a functional Phase-1 provider exists: re-seed Ollama if absent.
+    // Ensure the Ollama provider exists: re-seed it if a user file omitted it.
     if !inf.providers.iter().any(|p| p.kind == PROVIDER_KIND_OLLAMA) {
         inf.providers.push(ollama_provider(DEFAULT_OLLAMA_URL));
     }
@@ -424,8 +424,8 @@ fn resolve_inference(inf: &mut crate::config::schema::InferenceSection) {
 
     // A pre-providers file (no [[inference.providers]] array) predates the
     // built-in engine: that user runs Ollama. Pin the pointer explicitly so
-    // the compiled default (which favors the built-in engine from Phase 2 on)
-    // only ever applies to fresh installs and new-shape files. Covers both
+    // the compiled default (which favors the built-in engine) only ever
+    // applies to fresh installs and new-shape files. Covers both
     // legacy shapes: with an ollama_url key and without one. An explicit
     // active_provider equal to the compiled default, or naming the built-in
     // provider, is also overridden here: in a pre-providers file neither
@@ -465,7 +465,8 @@ pub fn compose_system_prompt(base: &str, appendix: &str) -> String {
 }
 
 fn clamp_keep_warm_inactivity(value: &mut i32, default: i32, field: &str) {
-    // Valid: -1 (never release), 0 (disabled), or 1..=1440 (explicit timeout).
+    // Valid: -1 (keep resident forever), 0 (provider's natural short default),
+    // or 1..=1440 (explicit timeout).
     // Invalid: below -1 or above 1440 — reset to compiled default.
     let (lo, hi) = BOUNDS_KEEP_WARM_INACTIVITY_MINUTES;
     if !(lo..=hi).contains(value) {
diff --git a/src-tauri/src/config/tests.rs b/src-tauri/src/config/tests.rs
index d9a78276..79a5f0ea 100644
--- a/src-tauri/src/config/tests.rs
+++ b/src-tauri/src/config/tests.rs
@@ -117,7 +117,7 @@ fn defaults_prompt_base_is_nonempty() {
 
 #[test]
 fn fresh_default_active_provider_is_builtin() {
-    // Phase 2 ships the bundled engine, so a fresh install starts on the
+    // Thuki ships the bundled engine, so a fresh install starts on the
     // built-in provider. Existing configs keep whatever active_provider they
     // persisted (see the legacy pin tests below).
     assert_eq!(DEFAULT_ACTIVE_PROVIDER, PROVIDER_ID_BUILTIN);
diff --git a/src-tauri/src/models/mod.rs b/src-tauri/src/models/mod.rs
index 33894b9f..316251bd 100644
--- a/src-tauri/src/models/mod.rs
+++ b/src-tauri/src/models/mod.rs
@@ -513,11 +513,11 @@ pub async fn set_active_model(
     Ok(())
 }
 
-// ─── Model setup gate (Phase 3 onboarding) ──────────────────────────────────
+// ─── Model setup gate (onboarding) ──────────────────────────────────────────
 
 /// Result of probing the local Ollama daemon for setup readiness.
 ///
-/// Drives the Phase 3 onboarding gate that fires after the user grants
+/// Drives the onboarding model-setup gate that fires after the user grants
 /// macOS permissions but before the chat overlay is allowed to open.
 /// Variants are emitted to the frontend in `snake_case` with an
 /// internally-tagged `state` discriminator so the React side can route
@@ -1323,7 +1323,7 @@ pub struct MmprojCompanion {
 
 /// Pure parse of an HF repo listing into the spec for one target `file`.
 /// Capability rule for pasted repos: vision = an `mmproj*.gguf` sibling with
-/// complete LFS metadata exists; thinking = false (full detection is Phase 3).
+/// complete LFS metadata exists; thinking = false (full detection is not yet implemented).
 pub fn resolve_listing(body: &[u8], file: &str) -> Result<RepoResolved, String> {
     let info: HfRepoInfo = serde_json::from_slice(body)
         .map_err(|e| format!("failed to decode Hugging Face API response: {e}"))?;
@@ -2820,7 +2820,7 @@ mod tests {
         assert_eq!(MODEL_NOT_INSTALLED_ERR_PREFIX, "Model is not installed: ");
     }
 
-    // ── derive_model_setup_state (Phase 3 onboarding gate) ──────────────────
+    // ── derive_model_setup_state (onboarding model-setup gate) ──────────────
 
     #[test]
     fn derive_setup_state_returns_unreachable_on_fetch_error() {
diff --git a/src-tauri/src/screenshot.rs b/src-tauri/src/screenshot.rs
index 099ee9f6..b68e42a6 100644
--- a/src-tauri/src/screenshot.rs
+++ b/src-tauri/src/screenshot.rs
@@ -499,7 +499,7 @@ pub async fn capture_full_screen_command(
     // strictly main-thread-only.
     let main_window = app_handle.get_webview_window("main");
 
-    // Phase 1: Capture raw RGBA pixels on the main thread (CoreGraphics
+    // Step 1: Capture raw RGBA pixels on the main thread (CoreGraphics
     // requirement). Returns (width, height, rgba_bytes).
     //
     // The anchor point steers multi-monitor capture: we look up the
@@ -529,7 +529,7 @@ pub async fn capture_full_screen_command(
         .await
         .map_err(|_| "main thread capture channel closed unexpectedly".to_string())??;
 
-    // Phase 2: Encode to PNG and save via the images pipeline on a blocking
+    // Step 2: Encode to PNG and save via the images pipeline on a blocking
     // thread so the main thread stays responsive.
     let saved_path = tokio::task::spawn_blocking(move || {
         let buf =
diff --git a/src-tauri/src/settings_commands/tests.rs b/src-tauri/src/settings_commands/tests.rs
index eb101bfb..5c31e544 100644
--- a/src-tauri/src/settings_commands/tests.rs
+++ b/src-tauri/src/settings_commands/tests.rs
@@ -1440,7 +1440,7 @@ fn reset_section_on_disk_replaces_named_section_with_defaults() {
     std::fs::write(&path, SAMPLE_CONFIG).unwrap();
 
     let resolved = reset_section_on_disk(&path, Some("inference")).unwrap();
-    // Section reset restores compiled defaults: builtin active since Phase 2.
+    // Section reset restores compiled defaults: the built-in engine is the default active provider.
     assert_eq!(resolved.inference.active_provider, "builtin");
     assert!(resolved
         .inference
diff --git a/src/App.tsx b/src/App.tsx
index 85d61279..0ee62b4c 100644
--- a/src/App.tsx
+++ b/src/App.tsx
@@ -2401,8 +2401,8 @@ function App() {
    */
   /**
    * History-state derived from the current `messages` array. Drives the
-   * Phase B history-based capability strip: a heads-up when the active
-   * model lacks a capability earlier turns relied on (vision, thinking).
+   * history-based capability strip: a heads-up when the active model lacks
+   * a capability earlier turns relied on (vision, thinking).
    * `messages.some(...)` is O(n) per render but bounded by typical chat
    * lengths and memoized against the messages reference.
    */
diff --git a/src/components/ChatBubble.tsx b/src/components/ChatBubble.tsx
index 86ae3b61..deb577c4 100644
--- a/src/components/ChatBubble.tsx
+++ b/src/components/ChatBubble.tsx
@@ -334,9 +334,9 @@ export function ChatBubble({
   // Render-time defense for legacy assistant content that may carry
   // special turn-boundary tokens leaked by older Ollama versions or
   // mis-tuned models. Backend now strips these on write, so the scrub is
-  // a no-op for fresh replies; pre-Phase-B history on disk relies on it.
-  // User input never contains these markers naturally so we skip the
-  // scrub for user bubbles.
+  // a no-op for fresh replies; history stored before that strip existed
+  // relies on it. User input never contains these markers naturally so we
+  // skip the scrub for user bubbles.
   const displayContent = isUser ? content : cleanForRender(content);
 
   const activeWarnings = searchWarnings ?? [];
diff --git a/src/utils/__tests__/capabilityConflicts.test.ts b/src/utils/__tests__/capabilityConflicts.test.ts
index 99b23c19..348c799d 100644
--- a/src/utils/__tests__/capabilityConflicts.test.ts
+++ b/src/utils/__tests__/capabilityConflicts.test.ts
@@ -275,7 +275,7 @@ describe('getCapabilityConflict', () => {
     expect(result).toBeNull();
   });
 
-  // ── history-state gates (Phase B) ─────────────────────────────────────────
+  // ── history-state gates ───────────────────────────────────────────────────
 
   const HISTORY_HAS_IMAGES: HistoryCapabilityState = {
     historyHasImages: true,
@@ -626,7 +626,7 @@ describe('getEnvironmentMessage', () => {
     });
 
     it('returns the pick-a-model copy when reachable, models present, none active (S3)', () => {
-      // S3 is the rare post-Phase-A defensive state. Backend auto-picks the
+      // S3 is the rare defensive state. Backend auto-picks the
       // first installed model on launch, but if a payload drift ever lands
       // here we still surface a clear recovery cue instead of falling
       // through to the capability helper with a null model.
diff --git a/src/utils/__tests__/sanitizeAssistantContent.test.ts b/src/utils/__tests__/sanitizeAssistantContent.test.ts
index 25f791b0..02ae6ed4 100644
--- a/src/utils/__tests__/sanitizeAssistantContent.test.ts
+++ b/src/utils/__tests__/sanitizeAssistantContent.test.ts
@@ -27,7 +27,7 @@ describe('cleanForRender', () => {
   });
 
   it('strips a leaked thinking tag wrapper from legacy assistant content', () => {
-    // Pre-Phase-B reasoning models occasionally emitted `<think>...</think>`
+    // Some reasoning models occasionally emitted `<think>...</think>`
     // into `content` instead of the structured `thinking` field. The
     // render-time scrub keeps that legacy text visually clean.
     const dirty = 'answer<think>internal reasoning</think> shipped.';
diff --git a/src/utils/capabilityConflicts.ts b/src/utils/capabilityConflicts.ts
index 77fb8f6c..5cd89886 100644
--- a/src/utils/capabilityConflicts.ts
+++ b/src/utils/capabilityConflicts.ts
@@ -144,8 +144,8 @@ export const PICK_A_MODEL_MESSAGE =
  *     `installedCount` or `activeModel` because we cannot trust either.
  *   - S2: Ollama reachable, zero models installed. Returns the no-models copy.
  *   - S3: Ollama reachable, models installed, none active. Returns the
- *     pick-a-model copy. This state is rare post-Phase-A because the backend
- *     auto-picks on first launch, but the strip handles it defensively.
+ *     pick-a-model copy. This state is rare because the backend auto-picks
+ *     a model on first launch, but the strip handles it defensively.
  * - `builtin`: the backend always reports reachable=true (the engine starts
  *   on demand per request), so `reachable=false` only means the picker IPC
  *   call itself failed and the generic model-state copy is shown. Zero
@@ -263,7 +263,7 @@ export function getCapabilityConflict(
     return `${name} doesn't show reasoning. Try a thinking model for /think.`;
   }
 
-  // History-state checks (Phase B). The backend already strips images
+  // History-state checks. The backend already strips images
   // and thinking artifacts from the per-request snapshot when the active
   // model lacks the capability; the strip is purely informational so the
   // user knows why earlier content is missing from the model's view of
diff --git a/src/utils/sanitizeAssistantContent.ts b/src/utils/sanitizeAssistantContent.ts
index b978c906..f87524ce 100644
--- a/src/utils/sanitizeAssistantContent.ts
+++ b/src/utils/sanitizeAssistantContent.ts
@@ -1,9 +1,9 @@
 /**
  * Render-time defense against special turn-boundary tokens that may have
  * leaked into stored assistant content. Backend `sanitize_assistant_content`
- * (Rust) strips these before persisting, but pre-Phase-B conversations on
- * disk may already carry the dirty bytes. The render-time scrub keeps those
- * legacy messages visually clean without a SQLite migration.
+ * (Rust) strips these before persisting, but conversations stored before that
+ * strip existed may already carry the dirty bytes on disk. The render-time
+ * scrub keeps those legacy messages visually clean without a SQLite migration.
  *
  * Keep this list in lock-step with `STRIP_PATTERNS` in
  * `src-tauri/src/config/defaults.rs`. Exact-string match, case-sensitive: these

From 9c7632ee4e05fa6c5350dd6b10b874ad5f87a13c Mon Sep 17 00:00:00 2001
From: Logan Nguyen <lg.131.dev@gmail.com>
Date: Thu, 18 Jun 2026 18:12:13 -0500
Subject: [PATCH 4/4] refactor: read the OpenAI dev flag inline instead of a
 devFlags module

Signed-off-by: Logan Nguyen <lg.131.dev@gmail.com>
---
 src/settings/devFlags.test.ts   | 34 ---------------------------------
 src/settings/devFlags.ts        | 23 ----------------------
 src/settings/tabs/ModelTab.tsx  | 13 +++++++++++--
 src/settings/tabs/tabs.test.tsx | 18 ++++-------------
 4 files changed, 15 insertions(+), 73 deletions(-)
 delete mode 100644 src/settings/devFlags.test.ts
 delete mode 100644 src/settings/devFlags.ts

diff --git a/src/settings/devFlags.test.ts b/src/settings/devFlags.test.ts
deleted file mode 100644
index 1b096178..00000000
--- a/src/settings/devFlags.test.ts
+++ /dev/null
@@ -1,34 +0,0 @@
-import { afterEach, describe, expect, it, vi } from 'vitest';
-
-/**
- * The flag is a module-level const evaluated at import, so each case stubs the
- * env, resets the module registry, and re-imports to observe the resolved
- * value. This covers both the enabled ("true") and disabled (everything else)
- * evaluations of `import.meta.env.VITE_ENABLE_OPENAI_PROVIDER === 'true'`.
- */
-async function loadFlag(): Promise<boolean> {
-  vi.resetModules();
-  const mod = await import('./devFlags');
-  return mod.OPENAI_PROVIDER_ENABLED;
-}
-
-describe('OPENAI_PROVIDER_ENABLED', () => {
-  afterEach(() => {
-    vi.unstubAllEnvs();
-  });
-
-  it('is true only when VITE_ENABLE_OPENAI_PROVIDER is exactly "true"', async () => {
-    vi.stubEnv('VITE_ENABLE_OPENAI_PROVIDER', 'true');
-    expect(await loadFlag()).toBe(true);
-  });
-
-  it('is false when the env var is unset', async () => {
-    vi.stubEnv('VITE_ENABLE_OPENAI_PROVIDER', undefined);
-    expect(await loadFlag()).toBe(false);
-  });
-
-  it('is false for any other value', async () => {
-    vi.stubEnv('VITE_ENABLE_OPENAI_PROVIDER', '1');
-    expect(await loadFlag()).toBe(false);
-  });
-});
diff --git a/src/settings/devFlags.ts b/src/settings/devFlags.ts
deleted file mode 100644
index dda3d761..00000000
--- a/src/settings/devFlags.ts
+++ /dev/null
@@ -1,23 +0,0 @@
-/**
- * Compile-time, dev-only feature flags for the Settings UI.
- *
- * These are resolved from `import.meta.env` at build time, so a flag that is
- * `false` in a production build lets Vite tree-shake the gated affordance out
- * of the shipped bundle entirely.
- */
-
-/**
- * Whether the OpenAI-compatible provider KIND is exposed in the Settings UI.
- *
- * Thuki ships local-only (the bundled built-in engine plus local or remote
- * Ollama). The `openai` backend (the shared `/v1` client, Keychain storage,
- * the routing arm, the config kind) stays live: this flag gates only the
- * user-facing affordances that let someone create or manage an `openai`
- * provider, so no end user can reach one in a shipped build.
- *
- * Off by default. Developers opt in at build time by setting
- * `VITE_ENABLE_OPENAI_PROVIDER=true`; any other value (including unset) is
- * off.
- */
-export const OPENAI_PROVIDER_ENABLED =
-  import.meta.env.VITE_ENABLE_OPENAI_PROVIDER === 'true';
diff --git a/src/settings/tabs/ModelTab.tsx b/src/settings/tabs/ModelTab.tsx
index f232ff52..1849072f 100644
--- a/src/settings/tabs/ModelTab.tsx
+++ b/src/settings/tabs/ModelTab.tsx
@@ -20,7 +20,6 @@ import {
   OpenAiProviderCard,
 } from './ProviderCards';
 import { useDebouncedSave } from '../hooks/useDebouncedSave';
-import { OPENAI_PROVIDER_ENABLED } from '../devFlags';
 import { useModelSelection } from '../../hooks/useModelSelection';
 import { isNonLocalUrl } from '../../utils/isNonLocalUrl';
 import { configHelp } from '../configHelpers';
@@ -105,6 +104,16 @@ export function ModelTab({ config, resyncToken, onSaved }: ModelTabProps) {
   const builtinProvider = providers.find((p) => p.kind === 'builtin');
   const openaiProvider = providers.find((p) => p.kind === 'openai');
 
+  // The OpenAI-compatible provider kind is gated behind a compile-time,
+  // dev-only env flag, off by default. Vite statically replaces
+  // `import.meta.env` at build, so a production build folds this to `false`
+  // and tree-shakes the gated affordance out of the bundle entirely. Gates
+  // the UI only: the shared /v1 client the built-in engine depends on stays
+  // live. Read here (not at module load) so tests can toggle it via
+  // `vi.stubEnv`.
+  const openaiProviderEnabled =
+    import.meta.env.VITE_ENABLE_OPENAI_PROVIDER === 'true';
+
   // Latest engine lifecycle snapshot; drives the built-in residency line and
   // the context slider's non-blocking "Applying" hint.
   const [engineState, setEngineState] =
@@ -371,7 +380,7 @@ export function ModelTab({ config, resyncToken, onSaved }: ModelTabProps) {
             paths to create or manage one, so hiding them keeps the kind out of
             reach of end users. The shared /v1 backend stays live for the
             built-in engine regardless. */}
-        {OPENAI_PROVIDER_ENABLED ? (
+        {openaiProviderEnabled ? (
           openaiProvider ? (
             <div
               className={providerCardClass(activeKind === 'openai')}
diff --git a/src/settings/tabs/tabs.test.tsx b/src/settings/tabs/tabs.test.tsx
index 5efb9b4b..c366a71f 100644
--- a/src/settings/tabs/tabs.test.tsx
+++ b/src/settings/tabs/tabs.test.tsx
@@ -32,17 +32,6 @@ import { AboutTab } from './AboutTab';
 import { BehaviorTab } from './BehaviorTab';
 import type { RawAppConfig } from '../types';
 
-// The OpenAI-compatible provider UI is gated behind a compile-time dev flag
-// (off in shipped builds). Mock the flag module through a mutable holder so the
-// suite can drive both branches: the existing openai-card tests run with it
-// enabled, and a dedicated test asserts the affordance is absent when disabled.
-const devFlags = { openaiEnabled: true };
-vi.mock('../devFlags', () => ({
-  get OPENAI_PROVIDER_ENABLED() {
-    return devFlags.openaiEnabled;
-  },
-}));
-
 const invokeMock = invoke as unknown as ReturnType<typeof vi.fn>;
 
 const CONFIG: RawAppConfig = {
@@ -139,8 +128,9 @@ function engineStatus(
 
 beforeEach(() => {
   // Default to the enabled branch so the openai-card tests render the gated
-  // UI; the disabled-state test flips this within its own body.
-  devFlags.openaiEnabled = true;
+  // UI; the disabled-state test flips this within its own body. ModelTab reads
+  // the flag from `import.meta.env` at render, so stubbing it here is enough.
+  vi.stubEnv('VITE_ENABLE_OPENAI_PROVIDER', 'true');
   invokeMock.mockReset();
   invokeMock.mockImplementation((cmd: string) => {
     if (cmd === 'get_loaded_model') return Promise.resolve(null);
@@ -1168,7 +1158,7 @@ describe('ModelTab', () => {
   });
 
   it('hides every OpenAI-compatible affordance when the dev flag is disabled', async () => {
-    devFlags.openaiEnabled = false;
+    vi.stubEnv('VITE_ENABLE_OPENAI_PROVIDER', 'false');
     // No openai provider configured: the "add a server" affordance is gone.
     const { unmount } = render(
       <ModelTab config={CONFIG} resyncToken={0} onSaved={() => {}} />,