diff --git a/.gitignore b/.gitignore
index 7644665c..8811af62 100644
--- a/.gitignore
+++ b/.gitignore
@@ -434,3 +434,4 @@ docker-compose.host-mounts.generated.yml
 /installer/docker/volumes/content-files/*
 !/installer/docker/volumes/content-files/.gitkeep
 *.cli-auth-token
+/src/client/GuideAnts.code-workspace
diff --git a/README.md b/README.md
index 55ba9c14..1bd94e8a 100644
--- a/README.md
+++ b/README.md
@@ -185,11 +185,15 @@ GuideAnts runs locally with Docker Compose. OS-specific quickstart scripts are i
 ```bash
 # Windows
 .\quickstart.ps1
+# or: start_windows.cmd
 
 # Linux / macOS
 ./quickstart.sh
+# or: start_linux.sh / start_macos.sh
 ```
 
+Backends: `cuda13` (NVIDIA), `rocm` (AMD), `vulkan` (NVIDIA/AMD/Intel via Vulkan), `cpu`, and `slim` (sandbox-only, no local models). The root launchers auto-detect GPU where possible; on Windows, NVIDIA drivers below the CUDA 13 minimum (R580) fall back to `vulkan` instead of CPU.
+
 See the [setup guide](https://github.com/Elumenotion/GuideAnts/blob/main/docs/setup-guide.md) for full instructions and the [developer config guide](https://github.com/Elumenotion/GuideAnts/blob/main/docs/developer-config-guide.md) for configuration options.
 
 ### Documentation
@@ -202,6 +206,7 @@ All documentation lives in the repository:
 - [Project and notebook files system](https://github.com/Elumenotion/GuideAnts/blob/main/docs/project-and-notebook-files-system.md) – file and content management
 - [LLaMA model management](https://github.com/Elumenotion/GuideAnts/blob/main/docs/llama-model-download-and-runtime-management.md) – local model lifecycle
 - [Docker build guide](https://github.com/Elumenotion/GuideAnts/blob/main/docker/guideants-ai-build.md) – building the runtime service
+- [Vulkan backend guide](https://github.com/Elumenotion/GuideAnts/blob/main/docker/guideants-ai-vulkan.md) – vendor-neutral GPU (llama + image gen) on Docker Desktop and native Linux
 - [Full docs directory](https://github.com/Elumenotion/GuideAnts/tree/main/docs) – architecture, features, test plans, and more
 
 ## Development Entry Points
diff --git a/docker/build/guideants-ai/Dockerfile.vulkan b/docker/build/guideants-ai/Dockerfile.vulkan
index d39bc96f..3f3dc9a8 100644
--- a/docker/build/guideants-ai/Dockerfile.vulkan
+++ b/docker/build/guideants-ai/Dockerfile.vulkan
@@ -184,6 +184,13 @@ RUN curl -sSL "https://github.com/PowerShell/PowerShell/releases/download/v${PWS
     && chmod +x /usr/bin/pwsh \
     && rm /tmp/pwsh.tar.gz
 
+ARG NODE_MAJOR=22
+RUN curl -fsSL https://deb.nodesource.com/setup_${NODE_MAJOR}.x | bash - \
+    && apt-get install -y --no-install-recommends nodejs \
+    && rm -rf /var/lib/apt/lists/* \
+    && node --version \
+    && npx --version
+
 ENV VIRTUAL_ENV=/opt/venv
 ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 ENV PIP_DISABLE_PIP_VERSION_CHECK=1
diff --git a/docker/build/guideants-ai/entrypoint.sh b/docker/build/guideants-ai/entrypoint.sh
index cbda1361..3956ddba 100644
--- a/docker/build/guideants-ai/entrypoint.sh
+++ b/docker/build/guideants-ai/entrypoint.sh
@@ -142,6 +142,92 @@ PY
 
 sanitize_router_preset "$ROUTER_PRESET"
 
+# Qwen-VL needs image-min-tokens=1024 for grounding accuracy, but that value breaks
+# other vision models (e.g. Gemma mmproj max pixels). Apply it per-alias only.
+normalize_router_image_min_tokens() {
+    local preset_path="$1"
+    if [ -z "$preset_path" ] || [ ! -f "$preset_path" ]; then
+        return
+    fi
+
+    local tmp_path="${preset_path}.image-tokens.$$"
+    if ! python3 - "$preset_path" "$tmp_path" <<'PY'
+import re
+import sys
+
+source_path = sys.argv[1]
+output_path = sys.argv[2]
+
+with open(source_path, "r", encoding="utf-8") as src:
+    lines = src.read().splitlines()
+
+output: list[str] = []
+current_alias: str | None = None
+section_lines: list[str] = []
+
+def flush_section() -> None:
+    global section_lines, current_alias
+    if current_alias is None:
+        return
+
+    cleaned: list[str] = []
+    for raw_line in section_lines:
+        stripped = raw_line.strip()
+        if stripped.startswith("#") or stripped.startswith(";"):
+            cleaned.append(raw_line)
+            continue
+        if "=" in stripped:
+            key = stripped.split("=", 1)[0].strip().lower()
+            if key == "image-min-tokens":
+                continue
+        cleaned.append(raw_line)
+
+    # Any alias matching "qwen" (case-insensitive); a global router --image-min-tokens breaks Gemma loads.
+    if re.search(r"(?i)qwen", current_alias):
+        insert_at = 1 if cleaned and cleaned[0].strip().startswith("[") else 0
+        cleaned.insert(insert_at, "image-min-tokens = 1024")
+
+    output.extend(cleaned)
+    section_lines = []
+    current_alias = None
+
+for raw_line in lines:
+    stripped = raw_line.strip()
+    if stripped.startswith("[") and stripped.endswith("]"):
+        flush_section()
+        current_alias = stripped[1:-1].strip()
+        section_lines = [raw_line]
+        continue
+
+    if current_alias is None:
+        output.append(raw_line)
+        continue
+
+    section_lines.append(raw_line)
+
+flush_section()
+
+with open(output_path, "w", encoding="utf-8", newline="\n") as dst:
+    dst.write("\n".join(output))
+    if output:
+        dst.write("\n")
+PY
+    then
+        echo "WARNING: router preset image-min-tokens normalization failed for '${preset_path}'; continuing unchanged." >&2
+        rm -f "$tmp_path" 2>/dev/null || true
+        return
+    fi
+
+    if cmp -s "$preset_path" "$tmp_path"; then
+        rm -f "$tmp_path" 2>/dev/null || true
+        return
+    fi
+
+    mv -f "$tmp_path" "$preset_path"
+}
+
+normalize_router_image_min_tokens "$ROUTER_PRESET"
+
 SCRIPT_EXECUTION_REQUIRE_TOKEN="${SCRIPT_EXECUTION_REQUIRE_TOKEN:-true}"
 SCRIPT_EXECUTION_ENABLE_IDENTITY_ISOLATION="${SCRIPT_EXECUTION_ENABLE_IDENTITY_ISOLATION:-true}"
 
diff --git a/docker/build/guideants-ai/sd-service/sd_service.py b/docker/build/guideants-ai/sd-service/sd_service.py
index 6b9072de..8a508572 100644
--- a/docker/build/guideants-ai/sd-service/sd_service.py
+++ b/docker/build/guideants-ai/sd-service/sd_service.py
@@ -97,6 +97,14 @@ def parse_positive_float(value: str | None, default: float) -> float:
     return parsed if parsed > 0 else default
 
 
+def optional_env_value(name: str) -> str | None:
+    raw = os.getenv(name)
+    if raw is None:
+        return None
+    value = raw.strip()
+    return value or None
+
+
 def parse_size(size: str) -> tuple[int, int]:
     value = (size or "").strip().lower()
     if "x" not in value:
@@ -176,6 +184,7 @@ class SdRuntimeConfig:
     sampling_method: str
     offload_to_cpu: bool
     diffusion_fa: bool
+    vulkan_visible_devices: str | None
     default_output_format: str
     startup_warmup_fail_open: bool
 
@@ -242,18 +251,12 @@ def _require_non_empty(cls, value: str) -> str:
     @field_validator("bundle_id")
     @classmethod
     def _validate_bundle_id(cls, value: str) -> str:
-        if not BUNDLE_ID_RE.fullmatch(value):
-            raise ValueError("bundle_id must match ^[A-Za-z0-9][A-Za-z0-9._-]{0,127}$")
-        return value
+        return validate_bundle_id(value)
 
     @field_validator("diffusion_file", "vae_file", "text_encoder_file")
     @classmethod
-    def _reject_globs(cls, value: str) -> str:
-        if "*" in value or "?" in value:
-            raise ValueError(
-                "must be a single filename (no '*' or '?' glob metacharacters)"
-            )
-        return value
+    def _validate_bundle_filename(cls, value: str) -> str:
+        return validate_bundle_filename(value)
 
 
 class SdRuntimeState:
@@ -299,6 +302,42 @@ def __init__(self) -> None:
 ENGINE_LOCK = threading.Lock()
 
 
+def validate_bundle_filename(value: str) -> str:
+    filename = (value or "").strip()
+    if not filename:
+        raise ValueError("must be a non-empty string")
+    if "*" in filename or "?" in filename:
+        raise ValueError(
+            "must be a single filename (no '*' or '?' glob metacharacters)"
+        )
+    if (
+        filename in {".", ".."}
+        or os.path.isabs(filename)
+        or os.path.basename(filename) != filename
+        or "/" in filename
+        or "\\" in filename
+    ):
+        raise ValueError("must be a single filename with no path separators")
+    return filename
+
+
+def validate_bundle_id(value: str) -> str:
+    candidate = (value or "").strip()
+    if not BUNDLE_ID_RE.fullmatch(candidate):
+        raise ValueError("bundle_id must match ^[A-Za-z0-9][A-Za-z0-9._-]{0,127}$")
+    return candidate
+
+
+def resolve_bundle_dir(model_dir: str, bundle_id: str) -> str:
+    safe_bundle_id = validate_bundle_id(bundle_id)  # raises ValueError on a malformed id
+    root_real = os.path.realpath(bundle_root_dir(model_dir))
+    bundle_path = os.path.realpath(os.path.join(root_real, safe_bundle_id))  # symlinks resolved
+    root_prefix = root_real if root_real.endswith(os.sep) else root_real + os.sep  # root with trailing separator
+    if not bundle_path.startswith(root_prefix):  # resolved path must stay under the bundle root
+        raise ValueError("resolved bundle path escapes the permitted bundle directory")
+    return bundle_path
+
+
 def resolve_runtime_config() -> SdRuntimeConfig:
     model_dir = os.getenv("GA_SD_MODEL_DIR", "/models-local/sd")
 
@@ -368,6 +407,7 @@ def _only_file(path: str, role: str) -> str:
     sampling_method = (os.getenv("GA_SD_SAMPLING_METHOD") or "euler").strip() or "euler"
     offload_to_cpu = env_flag("GA_SD_OFFLOAD_TO_CPU", False)
     diffusion_fa = env_flag("GA_SD_DIFFUSION_FA", True)
+    vulkan_visible_devices = optional_env_value("GA_SD_VK_VISIBLE_DEVICES")
     default_output_format = normalize_output_format(os.getenv("GA_SD_DEFAULT_OUTPUT_FORMAT"), "png")
     startup_warmup_fail_open = env_flag("GA_SD_WARMUP_FAIL_OPEN_ON_STARTUP", True)
 
@@ -401,6 +441,7 @@ def _only_file(path: str, role: str) -> str:
         sampling_method=sampling_method,
         offload_to_cpu=offload_to_cpu,
         diffusion_fa=diffusion_fa,
+        vulkan_visible_devices=vulkan_visible_devices,
         default_output_format=default_output_format,
         startup_warmup_fail_open=startup_warmup_fail_open,
     )
@@ -424,13 +465,13 @@ def read_active_bundle(model_dir: str) -> str | None:
         with open(marker, "r", encoding="utf-8") as handle:
             payload = json.load(handle)
             bundle_id = payload.get("bundleId")
-            return str(bundle_id) if bundle_id else None
+            return validate_bundle_id(str(bundle_id)) if bundle_id else None
     except Exception:
         return None
 
 
 def expected_bundle_paths(model_dir: str, bundle_id: str) -> dict[str, str]:
-    base = os.path.join(bundle_root_dir(model_dir), bundle_id)
+    base = resolve_bundle_dir(model_dir, bundle_id)
     return {
         "diffusion": os.path.join(base, "diffusion"),
         "vae": os.path.join(base, "vae"),
@@ -439,17 +480,11 @@ def expected_bundle_paths(model_dir: str, bundle_id: str) -> dict[str, str]:
 
 
 def bundle_definition_file(model_dir: str, bundle_id: str) -> str:
-    return os.path.join(bundle_root_dir(model_dir), bundle_id, "bundle-definition.json")
+    return os.path.join(resolve_bundle_dir(model_dir, bundle_id), "bundle-definition.json")
 
 
 def write_bundle_definition_payload(model_dir: str, bundle_id: str, payload: dict[str, Any]) -> None:
-    candidate_bundle_id = (bundle_id or "").strip()
-    if not BUNDLE_ID_RE.fullmatch(candidate_bundle_id):
-        raise ValueError("invalid bundle_id")
-    root_real = os.path.realpath(bundle_root_dir(model_dir))
-    bundle_path = os.path.realpath(os.path.join(root_real, candidate_bundle_id))
-    if not bundle_path.startswith(root_real + os.sep):
-        raise ValueError("resolved bundle path escapes the permitted bundle directory")
+    bundle_path = resolve_bundle_dir(model_dir, bundle_id)
     os.makedirs(bundle_path, exist_ok=True)
     target = os.path.join(bundle_path, "bundle-definition.json")
     temp = f"{target}.{uuid.uuid4().hex}.tmp"
@@ -458,9 +493,10 @@ def write_bundle_definition_payload(model_dir: str, bundle_id: str, payload: dic
     os.replace(temp, target)
 
 
-def bundle_definition_payload(request: DownloadBundleRequest) -> dict[str, Any]:
+def bundle_definition_payload(request: DownloadBundleRequest, bundle_id: str | None = None) -> dict[str, Any]:
+    safe_bundle_id = validate_bundle_id(bundle_id or request.bundle_id)
     return {
-        "bundleId": request.bundle_id,
+        "bundleId": safe_bundle_id,
         "revision": request.revision,
         "updatedAtUtc": utc_now_iso(),
         "roles": {
@@ -474,8 +510,9 @@ def bundle_definition_payload(request: DownloadBundleRequest) -> dict[str, Any]:
     }
 
 
-def write_bundle_definition(model_dir: str, request: DownloadBundleRequest) -> None:
-    write_bundle_definition_payload(model_dir, request.bundle_id, bundle_definition_payload(request))
+def write_bundle_definition(model_dir: str, request: DownloadBundleRequest, bundle_id: str | None = None) -> None:
+    safe_bundle_id = validate_bundle_id(bundle_id or request.bundle_id)
+    write_bundle_definition_payload(model_dir, safe_bundle_id, bundle_definition_payload(request, safe_bundle_id))
 
 
 def _normalize_bundle_definition(payload: Any) -> dict[str, Any] | None:
@@ -687,17 +724,122 @@ def list_bundles(model_dir: str) -> list[dict[str, Any]]:
     return bundles
 
 
+def _normalize_bundle_revision(revision: str | None) -> str | None:
+    text = (revision or "").strip()
+    return text or None
+
+
+def _previous_role_spec(
+    previous_definition: dict[str, Any] | None, role: str
+) -> tuple[str, str] | None:
+    if previous_definition is None:
+        return None
+    roles = previous_definition.get("roles")
+    if not isinstance(roles, dict):
+        return None
+    role_payload = roles.get(role)
+    if not isinstance(role_payload, dict):
+        return None
+    repo = str(role_payload.get("repo") or "").strip()
+    filename = str(role_payload.get("file") or "").strip()
+    if not repo or not filename:
+        return None
+    return repo, filename
+
+
+def _bundle_revision_unchanged(
+    previous_definition: dict[str, Any] | None, request: DownloadBundleRequest
+) -> bool:
+    if previous_definition is None:
+        return False
+    previous = _normalize_bundle_revision(previous_definition.get("revision"))
+    incoming = _normalize_bundle_revision(request.revision)
+    return previous == incoming
+
+
+def bundle_role_download_needed(
+    previous_definition: dict[str, Any] | None,
+    request: DownloadBundleRequest,
+    role: str,
+    repo: str,
+    filename: str,
+) -> bool:
+    if previous_definition is None:  # no prior recipe on record
+        return True
+    if not _bundle_revision_unchanged(previous_definition, request):  # revision differs
+        return True
+    previous = _previous_role_spec(previous_definition, role)
+    if previous is None:  # prior recipe has no usable repo/file pair for this role
+        return True
+    return previous != (repo, filename)  # needed only when the repo or file changed
+
+
+def resolve_role_file_path(target_path: str, filename: str) -> str:
+    safe_filename = validate_bundle_filename(filename)  # raises ValueError on globs or path separators
+    role_dir = os.path.realpath(target_path)
+    candidate = os.path.realpath(os.path.join(role_dir, safe_filename))  # symlinks resolved
+    role_prefix = role_dir if role_dir.endswith(os.sep) else role_dir + os.sep  # role dir with trailing separator
+    if not candidate.startswith(role_prefix):  # resolved file must stay inside the role directory
+        raise ValueError("resolved role file path escapes the permitted role directory")
+    return candidate
+
+
+def role_expected_file_ready(target_path: str, filename: str) -> bool:
+    try:
+        expected_file = resolve_role_file_path(target_path, filename)
+    except ValueError:
+        return False
+    return os.path.isfile(expected_file)
+
+
+def clear_stale_role_files(target_path: str, filename: str) -> None:
+    """Drop stale files (e.g. a renamed gguf) but keep the role directory so
+    huggingface_hub can resume; a non-directory at target_path is removed."""
+    if not os.path.isdir(target_path):
+        if os.path.lexists(target_path):
+            os.remove(target_path)  # a stray file/link here would block directory creation
+        return
+    safe_filename = validate_bundle_filename(filename)
+    try:
+        for name in os.listdir(target_path):
+            file_path = os.path.join(target_path, name)
+            if os.path.isfile(file_path) and name != safe_filename:
+                os.remove(file_path)
+    except OSError:
+        shutil.rmtree(target_path)
+
+
+def resolve_initial_bundle_role_states(
+    previous_definition: dict[str, Any] | None,
+    request: DownloadBundleRequest,
+    paths: dict[str, str],
+) -> dict[str, str]:
+    roles = {
+        "diffusion": (request.diffusion_repo, request.diffusion_file),
+        "vae": (request.vae_repo, request.vae_file),
+        "textEncoder": (request.text_encoder_repo, request.text_encoder_file),
+    }
+    states: dict[str, str] = {}
+    for role, (repo, filename) in roles.items():
+        if bundle_role_download_needed(previous_definition, request, role, repo, filename):
+            states[role] = "queued"
+        elif role_expected_file_ready(paths[role], filename):
+            states[role] = "ready"
+        else:
+            states[role] = "queued"
+    return states
+
+
 def start_bundle_download(request: DownloadBundleRequest, model_dir: str) -> dict[str, Any]:
+    bundle_id = validate_bundle_id(request.bundle_id)
+    previous_definition = read_bundle_definition(model_dir, bundle_id)
+    paths = expected_bundle_paths(model_dir, bundle_id)
     operation_id = uuid.uuid4().hex
     operation = {
         "operationId": operation_id,
-        "bundleId": request.bundle_id,
+        "bundleId": bundle_id,
         "status": "queued",
-        "roles": {
-            "diffusion": "queued",
-            "vae": "queued",
-            "textEncoder": "queued",
-        },
+        "roles": resolve_initial_bundle_role_states(previous_definition, request, paths),
         "error": None,
     }
     with BUNDLE_OPS_LOCK:
@@ -705,11 +847,11 @@ def start_bundle_download(request: DownloadBundleRequest, model_dir: str) -> dic
     try:
         # Persist the declared bundle recipe up front so operators can read and
         # edit the definition even if a download fails mid-way.
-        write_bundle_definition(model_dir, request)
+        write_bundle_definition(model_dir, request, bundle_id)
     except Exception as exc:
         log_event(
             "sd_bundle_definition_write_failed",
-            bundleId=request.bundle_id,
+            bundleId=bundle_id,
             error=truncate_text(str(exc), 2048),
         )
 
@@ -730,19 +872,20 @@ def _run() -> None:
                 "vae": (request.vae_repo, request.vae_file),
                 "textEncoder": (request.text_encoder_repo, request.text_encoder_file),
             }
-            paths = expected_bundle_paths(model_dir, request.bundle_id)
             for role, (repo, filename) in roles.items():
+                target_path = paths[role]
+                if (
+                    not bundle_role_download_needed(previous_definition, request, role, repo, filename)
+                    and role_expected_file_ready(target_path, filename)
+                ):
+                    with BUNDLE_OPS_LOCK:
+                        BUNDLE_OPERATIONS[operation_id]["roles"][role] = "ready"
+                    continue
+
                 with BUNDLE_OPS_LOCK:
                     BUNDLE_OPERATIONS[operation_id]["status"] = "running"
                     BUNDLE_OPERATIONS[operation_id]["roles"][role] = "downloading"
-                target_path = paths[role]
-                # Replacing an existing bundle definition must leave exactly one
-                # file per role. Clear the role directory first so a filename
-                # change cannot leave stale files behind.
-                if os.path.isdir(target_path):
-                    shutil.rmtree(target_path)
-                elif os.path.exists(target_path):
-                    os.remove(target_path)
+                clear_stale_role_files(target_path, filename)
                 os.makedirs(target_path, exist_ok=True)
                 snapshot_download(
                     repo_id=repo,
@@ -756,7 +899,7 @@ def _run() -> None:
                 # snapshot_download with allow_patterns silently produces an
                 # empty directory if the filename does not exist in the repo.
                 # Turn that into a loud failure so the operator sees it.
-                expected_file = os.path.join(target_path, filename)
+                expected_file = resolve_role_file_path(target_path, filename)
                 if not os.path.isfile(expected_file):
                     raise RuntimeError(
                         f"Expected file '{filename}' was not produced by "
@@ -808,6 +951,13 @@ def build_sd_server_command(config: SdRuntimeConfig) -> list[str]:
     return command
 
 
+def build_sd_server_environment(config: SdRuntimeConfig) -> dict[str, str]:
+    env = os.environ.copy()  # start from a copy of the parent process environment
+    if config.vulkan_visible_devices is not None:  # otherwise any inherited value is left as-is
+        env["GGML_VK_VISIBLE_DEVICES"] = config.vulkan_visible_devices
+    return env
+
+
 def is_engine_process_alive() -> bool:
     process = STATE.engine_process
     return process is not None and process.poll() is None
@@ -987,11 +1137,12 @@ def start_engine() -> tuple[bool, str | None]:
         vaePath=config.vae_path,
         llmPath=config.llm_path,
         bundleId=STATE.loaded_bundle_id,
+        vulkanVisibleDevices=config.vulkan_visible_devices,
         command=command,
     )
 
     try:
-        STATE.engine_process = subprocess.Popen(command)
+        STATE.engine_process = subprocess.Popen(command, env=build_sd_server_environment(config))
         STATE.engine_started_at_utc = utc_now_iso()
         wait_for_engine_ready(config)
         STATE.loaded_at_utc = utc_now_iso()
@@ -1671,6 +1822,7 @@ async def health() -> dict[str, Any]:
             "samplingMethod": config.sampling_method,
             "offloadToCpu": config.offload_to_cpu,
             "diffusionFa": config.diffusion_fa,
+            "vulkanVisibleDevices": config.vulkan_visible_devices,
         },
         "engine": {
             "startedAtUtc": STATE.engine_started_at_utc,
@@ -1737,10 +1889,10 @@ def _require_model_dir() -> str:
 
 
 def require_valid_bundle_id(bundle_id: str) -> str:
-    candidate = (bundle_id or "").strip()
-    if not BUNDLE_ID_RE.fullmatch(candidate):
+    try:
+        return validate_bundle_id(bundle_id)
+    except ValueError:
         raise HTTPException(status_code=400, detail="invalid bundle_id")
-    return candidate
 
 
 @APP.get("/admin/bundles")
@@ -1971,15 +2123,13 @@ async def admin_unload() -> JSONResponse:
 @APP.delete("/admin/bundles/{bundle_id}")
 async def admin_delete_bundle(bundle_id: str) -> JSONResponse:
     model_dir = _require_model_dir()
-    bundle_id = (bundle_id or "").strip()
-    if not BUNDLE_ID_RE.fullmatch(bundle_id):
-        raise HTTPException(status_code=400, detail="invalid bundle_id")
+    bundle_id = require_valid_bundle_id(bundle_id)
     if read_active_bundle(model_dir) == bundle_id:
         raise HTTPException(status_code=409, detail="cannot remove active bundle")
 
-    root_real = os.path.realpath(bundle_root_dir(model_dir))
-    target = os.path.realpath(os.path.join(root_real, bundle_id))
-    if not target.startswith(root_real + os.sep):
+    try:
+        target = resolve_bundle_dir(model_dir, bundle_id)
+    except ValueError:
         raise HTTPException(status_code=400, detail="invalid bundle_id")
     if not os.path.exists(target):
         raise HTTPException(status_code=404, detail="bundle not found")
diff --git a/docker/build/guideants-ai/start-llama.sh b/docker/build/guideants-ai/start-llama.sh
index 30940a5d..df6f8652 100644
--- a/docker/build/guideants-ai/start-llama.sh
+++ b/docker/build/guideants-ai/start-llama.sh
@@ -70,6 +70,11 @@ fi
 [ -n "$GA_LLAMA_THREADS" ]        && ARGS="$ARGS --threads $GA_LLAMA_THREADS"
 [ -n "$GA_LLAMA_PARALLEL" ]       && ARGS="$ARGS --parallel $GA_LLAMA_PARALLEL"
 [ -n "$GA_LLAMA_GPU_LAYERS" ]     && ARGS="$ARGS --n-gpu-layers $GA_LLAMA_GPU_LAYERS"
+# Vulkan can fail scheduler reservation when KV-cache tensors are placed on
+# the GPU for some model families. Keep this as an explicit env-controlled
+# base preset because router mode propagates it to child instances.
+[ "$GA_LLAMA_KV_OFFLOAD" = "0" ]  && ARGS="$ARGS --no-kv-offload"
+[ "$GA_LLAMA_KV_OFFLOAD" = "1" ]  && ARGS="$ARGS --kv-offload"
 [ "$GA_LLAMA_KV_UNIFIED" = "1" ]  && ARGS="$ARGS --kv-unified"
 [ "$GA_LLAMA_JINJA" = "1" ]       && ARGS="$ARGS --jinja"
 [ "$GA_LLAMA_CONT_BATCH" = "1" ]  && ARGS="$ARGS --cont-batching"
@@ -78,11 +83,12 @@ fi
 # preset into every spawned child instance (unlike ctx-size/cache-ram, which
 # are left per-alias). --flash-attn takes a literal value (on|off|auto);
 # cache-type-v quantization requires flash attention to be enabled. The
-# image-min-tokens knob only affects vision (mmproj) models.
+# image-min-tokens value is per-alias in router-models.ini (Qwen-VL only).
+# GA_LLAMA_IMAGE_MIN_TOKENS is no longer passed as a global flag: it would
+# reach every child and break models with mmproj image_max_pixels below the 1024-token floor.
 [ -n "$GA_LLAMA_FLASH_ATTN" ]     && ARGS="$ARGS --flash-attn $GA_LLAMA_FLASH_ATTN"
 [ -n "$GA_LLAMA_CACHE_TYPE_K" ]   && ARGS="$ARGS --cache-type-k $GA_LLAMA_CACHE_TYPE_K"
 [ -n "$GA_LLAMA_CACHE_TYPE_V" ]   && ARGS="$ARGS --cache-type-v $GA_LLAMA_CACHE_TYPE_V"
-[ -n "$GA_LLAMA_IMAGE_MIN_TOKENS" ] && ARGS="$ARGS --image-min-tokens $GA_LLAMA_IMAGE_MIN_TOKENS"
 # --tensor-split sets the per-GPU layer proportion (comma list, e.g. "7,1").
 # Indices follow this process's visible-device order: with
 # GA_LLAMA_CUDA_VISIBLE_DEVICES=1,0 the FIRST proportion targets physical GPU 1
diff --git a/docker/docker-compose.ghcr-vulkan.yml b/docker/docker-compose.ghcr-vulkan.yml
new file mode 100644
index 00000000..03f57dc0
--- /dev/null
+++ b/docker/docker-compose.ghcr-vulkan.yml
@@ -0,0 +1,292 @@
+name: guideants
+
+services:
+  mssql-express:
+    image: ${GA_MSSQL_IMAGE:-ghcr.io/elumenotion/mssql2025-express-fts:main}
+    pull_policy: always
+    container_name: mssql-express
+    environment:
+      - ACCEPT_EULA=Y
+      - MSSQL_SA_PASSWORD=${GA_SQL_SA_PASSWORD:-YourStrong!Passw0rd}
+    healthcheck:
+      test: ["CMD-SHELL", "/opt/mssql-tools18/bin/sqlcmd -S localhost -U sa -P '${GA_SQL_SA_PASSWORD:-YourStrong!Passw0rd}' -Q 'SELECT 1' -C -No | grep -q 1"]
+      interval: 5s
+      timeout: 5s
+      retries: 30
+      start_period: 30s
+    cpus: "4.0"
+    volumes:
+      - mssql_runtime_state:/var/opt/mssql
+      - mssql_data:/var/opt/mssql/data
+      - mssql_ftdata:/var/opt/mssql/FTData
+      - mssql_log:/var/opt/mssql/log
+    restart: unless-stopped
+
+  guideants-ai:
+    image: ${GA_AI_GHCR_IMAGE:-ghcr.io/elumenotion/guideants-ai-vulkan:main}
+    pull_policy: always
+    container_name: guideants-ai
+    cap_drop:
+      - SYS_PTRACE
+    # ---- Vulkan GPU wiring (cross-OS) -----------------------------------------
+    # Defaults target Windows / Docker Desktop, where GPU Vulkan for EVERY vendor
+    # goes Vulkan -> Mesa dzn -> D3D12 -> /dev/dxg (works from git bash, no WSL).
+    # Native-Linux hosts override the GA_VULKAN_* vars; the installer sets them
+    # automatically (AMD/Intel -> /dev/dri + RADV/ANV, NVIDIA -> nvidia runtime +
+    # toolkit-injected ICD). See docker/guideants-ai-vulkan.md.
+    runtime: ${GA_VULKAN_RUNTIME:-runc}       # 'nvidia' only on native-Linux NVIDIA
+    group_add:
+      - video                                 # /dev/dri render perms on native Linux (harmless on Windows)
+      - render
+    devices:
+      - ${GA_VULKAN_DEVICE:-/dev/dxg}         # Windows: /dev/dxg ; Linux AMD/Intel: /dev/dri
+    shm_size: "8gb"
+    volumes:
+      - ${GA_VULKAN_DRIVER_LIBS:-/usr/lib/wsl}:/usr/lib/wsl:ro
+      - ai_local_models-new:/models-local
+      - script_agent_admin_state:/var/lib/guideants/script-agent-admin
+      - type: bind
+        source: ${GA_CONTENT_FILES_HOST_PATH:-./volumes/content-files}
+        target: /app/ContentFiles
+    environment:
+      - FILE_STORAGE_ROOT=/app/ContentFiles
+      # Vulkan loader wiring. VK_DRIVER_FILES is pinned to ONE ICD so the CPU
+      # software rasterizer (llvmpipe) is never a candidate — the stack uses the
+      # GPU or fails loudly, never a silent CPU fallback. Default = dzn (Windows).
+      # Installer overrides GA_VULKAN_ICD on Linux: RADV/ANV for AMD/Intel,
+      # nvidia_icd.json (toolkit-injected) for NVIDIA.
+      - LD_LIBRARY_PATH=${GA_VULKAN_LD_LIBRARY_PATH:-/usr/lib/wsl/lib}
+      - VK_DRIVER_FILES=${GA_VULKAN_ICD:-/usr/share/vulkan/icd.d/dzn_icd.json}
+      - VK_ICD_FILENAMES=${GA_VULKAN_ICD:-/usr/share/vulkan/icd.d/dzn_icd.json}
+      - MESA_D3D12_DEFAULT_ADAPTER_NAME=${MESA_D3D12_DEFAULT_ADAPTER_NAME:-NVIDIA}
+      - GGML_VK_VISIBLE_DEVICES=${GGML_VK_VISIBLE_DEVICES:-}
+      # Used only when GA_VULKAN_RUNTIME=nvidia (native-Linux NVIDIA); harmless otherwise.
+      - NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-all}
+      - NVIDIA_DRIVER_CAPABILITIES=${NVIDIA_DRIVER_CAPABILITIES:-graphics,compute,utility}
+      - HF_TOKEN=${HF_TOKEN:-}
+      - SCRIPT_EXECUTION_AGENT_TOKEN=${GA_SCRIPT_AGENT_TOKEN:-dev-script-agent-token}
+      - SCRIPT_EXECUTION_ADMIN_TOKEN=${GA_SCRIPT_AGENT_ADMIN_TOKEN:-dev-script-agent-admin-token}
+      - SCRIPT_EXECUTION_ADMIN_STATE_DIR=/var/lib/guideants/script-agent-admin
+      - SCRIPT_EXECUTION_SCOPE_STATE_ROOT=/var/lib/guideants/script-agent-admin/scopes
+      - SCRIPT_EXECUTION_REQUIRE_TOKEN=true
+      - GA_LLAMA_MODELS_PRESET=/models-local/router-models.ini
+      - GA_LLAMA_MODEL_DIR=/models-local/llama
+      - GA_LLAMA_ADMIN_PORT=8086
+      - GA_LLAMA_MODELS_MAX=1
+      - GA_LLAMA_NO_AUTOLOAD=1
+      - GA_LLAMA_CTX_SIZE=262144
+      - GA_LLAMA_THREADS=16
+      - GA_LLAMA_PARALLEL=5
+      - GA_LLAMA_CACHE_RAM=8192
+      # Vulkan currently trips llama.cpp's scheduler for some model families when
+      # KV cache tensors are placed on Vulkan buffers. Keep KV offload disabled
+      # by default; LLAMA_ARG_* lets current images pick this up without rebuild.
+      - GA_LLAMA_KV_OFFLOAD=${GA_LLAMA_KV_OFFLOAD:-0}
+      - LLAMA_ARG_KV_OFFLOAD=${GA_LLAMA_KV_OFFLOAD:-0}
+      # Keep unified KV opt-in for Vulkan as a separate conservative default.
+      - GA_LLAMA_KV_UNIFIED=${GA_LLAMA_KV_UNIFIED:-0}
+      - GA_LLAMA_JINJA=1
+      - GA_LLAMA_CONT_BATCH=1
+      - GA_LLAMA_NO_MMAP=0
+      - GA_ASR_HOST=127.0.0.1
+      - GA_ASR_PORT=8082
+      - GA_ASR_MODEL_DIR=/models-local/asr
+      - GA_ASR_DEFAULT_MODEL_PATH=Qwen3-ASR-0.6B
+      - GA_ASR_DEFAULT_MODEL_ID=Qwen/Qwen3-ASR-0.6B
+      - GA_ASR_AUTO_LOAD_ON_STARTUP=${GA_ASR_AUTO_LOAD_ON_STARTUP:-1}
+      - GA_ASR_WAIT_FOR_READY_ON_STARTUP=${GA_ASR_WAIT_FOR_READY_ON_STARTUP:-0}
+      - GA_ASR_READY_TIMEOUT_SECONDS=${GA_ASR_READY_TIMEOUT_SECONDS:-1800}
+      - GA_ASR_DEVICE_MAP=auto
+      - GA_ASR_DTYPE=bfloat16
+      - GA_ASR_MAX_INFERENCE_BATCH_SIZE=8
+      - GA_ASR_MAX_NEW_TOKENS=512
+      - GA_ASR_WARMUP_ON_LOAD=${GA_ASR_WARMUP_ON_LOAD:-1}
+      - GA_ASR_WARMUP_AUDIO_PATH=${GA_ASR_WARMUP_AUDIO_PATH:-/app/asr-service/warmup.webm}
+      - GA_ASR_WARMUP_LANGUAGE=${GA_ASR_WARMUP_LANGUAGE:-}
+      - GA_TTS_HOST=127.0.0.1
+      - GA_TTS_PORT=8084
+      - GA_TTS_MODEL_DIR=/models-local/tts
+      - GA_TTS_DEFAULT_MODEL_PATH=VibeVoice-1.5B
+      - GA_TTS_DEFAULT_MODEL_ID=microsoft/VibeVoice-1.5B
+      - GA_TTS_TOKENIZER_PATH=Qwen2.5-1.5B-tokenizer
+      - GA_TTS_TOKENIZER_ID=Qwen/Qwen2.5-1.5B
+      - GA_TTS_AUTO_LOAD_ON_STARTUP=${GA_TTS_AUTO_LOAD_ON_STARTUP:-1}
+      - GA_TTS_WAIT_FOR_READY_ON_STARTUP=${GA_TTS_WAIT_FOR_READY_ON_STARTUP:-0}
+      - GA_TTS_READY_TIMEOUT_SECONDS=${GA_TTS_READY_TIMEOUT_SECONDS:-1800}
+      - GA_TTS_DEVICE_MAP=auto
+      - GA_TTS_DTYPE=bfloat16
+      - GA_TTS_TIMEOUT_SECONDS=300
+      - GA_TTS_MAX_NEW_TOKENS=512
+      - GA_TTS_SAMPLE_RATE=24000
+      - GA_TTS_DEFAULT_VOICE_SECONDS=1.0
+      - GA_EMB_HOST=127.0.0.1
+      - GA_EMB_PORT=8085
+      - GA_EMB_MODEL_DIR=/models-local/emb
+      - GA_EMB_DEFAULT_MODEL_PATH=${GA_EMB_DEFAULT_MODEL_PATH}
+      - GA_EMB_DEVICE=${GA_EMB_DEVICE:-cpu}
+      - GA_EMB_FIX_MISTRAL_REGEX=${GA_EMB_FIX_MISTRAL_REGEX:-1}
+      - GA_EMB_AUTO_LOAD_ON_STARTUP=${GA_EMB_AUTO_LOAD_ON_STARTUP}
+      - GA_EMB_WARMUP_ON_LOAD=${GA_EMB_WARMUP_ON_LOAD}
+      - GA_EMB_WAIT_FOR_READY_ON_STARTUP=${GA_EMB_WAIT_FOR_READY_ON_STARTUP:-0}
+      - GA_EMB_READY_TIMEOUT_SECONDS=${GA_EMB_READY_TIMEOUT_SECONDS:-1800}
+      - GA_MEDIA_HOST=127.0.0.1
+      - GA_MEDIA_PORT=8087
+      - GA_SD_HOST=127.0.0.1
+      - GA_SD_PORT=8083
+      - GA_SD_MODEL_DIR=/models-local/sd
+      - GA_SD_TIMEOUT_SECONDS=900
+      - GA_SD_ENGINE_REQUEST_TIMEOUT_SECONDS=${GA_SD_ENGINE_REQUEST_TIMEOUT_SECONDS:-120}
+      - GA_SD_STEPS=4
+      - GA_SD_CFG_SCALE=1.0
+      - GA_SD_STRENGTH=0.75
+      - GA_SD_OFFLOAD_TO_CPU=${GA_SD_OFFLOAD_TO_CPU:-0}
+      - GA_SD_DIFFUSION_FA=1
+      # Optional SD-only Vulkan device selector. Empty means inherit the
+      # container-wide GGML_VK_VISIBLE_DEVICES value used by llama too.
+      - GA_SD_VK_VISIBLE_DEVICES=${GA_SD_VK_VISIBLE_DEVICES:-}
+      - GA_SD_AUTO_LOAD_ON_STARTUP=${GA_SD_AUTO_LOAD_ON_STARTUP:-1}
+      - GA_SD_WARMUP_PROMPT=${GA_SD_WARMUP_PROMPT:-startup-warmup}
+      - GA_SD_WARMUP_SIZE=${GA_SD_WARMUP_SIZE:-512x512}
+      - GA_SD_WARMUP_STEPS=${GA_SD_WARMUP_STEPS:-1}
+      - GA_SD_WARMUP_OUTPUT_FORMAT=${GA_SD_WARMUP_OUTPUT_FORMAT:-png}
+      - GA_SD_WARMUP_REQUEST_TIMEOUT_SECONDS=${GA_SD_WARMUP_REQUEST_TIMEOUT_SECONDS:-180}
+      - GA_SD_WARMUP_FAIL_OPEN_ON_STARTUP=${GA_SD_WARMUP_FAIL_OPEN_ON_STARTUP:-1}
+      - GA_SD_WAIT_FOR_READY_ON_STARTUP=${GA_SD_WAIT_FOR_READY_ON_STARTUP:-0}
+      - GA_SD_READY_TIMEOUT_SECONDS=${GA_SD_READY_TIMEOUT_SECONDS:-1800}
+    networks:
+      - guideants-network
+    restart: unless-stopped
+
+  docling-serve:
+    image: ${DOCLING_SERVE_CPU_IMAGE:-quay.io/docling-project/docling-serve-cpu:v1.16.1}
+    pull_policy: always
+    container_name: docling-serve
+    environment:
+      - DOCLING_SERVE_MAX_SYNC_WAIT=${DOCLING_SERVE_MAX_SYNC_WAIT:-600}
+    networks:
+      - guideants-network
+    restart: unless-stopped
+
+  documentserver:
+    image: ${GA_DOCUMENTSERVER_IMAGE:-documentserver:latest}
+    container_name: documentserver
+    environment:
+      - JWT_ENABLED=${GA_DOCUMENTSERVER_JWT_ENABLED:-false}
+      - JWT_SECRET=${DOCUMENTSERVER_JWT_SECRET:-}
+      - JWT_HEADER=${GA_DOCUMENTSERVER_JWT_HEADER:-Authorization}
+      - JWT_IN_BODY=${GA_DOCUMENTSERVER_JWT_IN_BODY:-false}
+      - ALLOW_PRIVATE_IP_ADDRESS=true
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/info/info.json"]
+      interval: 30s
+      retries: 5
+      start_period: 60s
+      timeout: 10s
+    networks:
+      - guideants-network
+    restart: unless-stopped
+
+  guideants-webapi-ui:
+    image: ${GA_WEBAPI_UI_SLIM_GHCR_IMAGE:-ghcr.io/elumenotion/guideants-webapi-ui-slim:main}
+    pull_policy: always
+    container_name: guideants-webapi-ui
+    depends_on:
+      mssql-express:
+        condition: service_healthy
+      guideants-ai:
+        condition: service_started
+    ports:
+      - "5107:8080"
+    volumes:
+      - type: bind
+        source: ${GA_CONTENT_FILES_HOST_PATH:-./volumes/content-files}
+        target: /app/ContentFiles
+    environment:
+      - ASPNETCORE_URLS=http://127.0.0.1:8081
+      - ASPNETCORE_ENVIRONMENT=Development
+      # Idle session window (minutes). Default 30 days; the sliding renewal re-issues
+      # the cookie past the halfway point so active sessions never hard-expire.
+      - Jwt__LifetimeMinutes=${GA_JWT_LIFETIME_MINUTES:-43200}
+      - API_RUNTIME_CONTEXT=compose-ghcr-vulkan
+      - ConnectionStrings__DefaultConnection=Server=mssql-express,1433;Initial Catalog=${GA_DB_NAME:-guideants-dev};Persist Security Info=False;User ID=sa;Password=${GA_SQL_SA_PASSWORD:-YourStrong!Passw0rd};MultipleActiveResultSets=True;Encrypt=False;TrustServerCertificate=True;Connection Timeout=30;ConnectRetryCount=3;ConnectRetryInterval=5;
+      - FileStorage__Path=/app/ContentFiles
+      - Ui__RootPath=/app/ui
+      - Ui__DevServerUrl=
+      - ALLOWED_ORIGINS=*
+      - SearXngSearch__BaseUrl=http://searxng:8080
+      - BrowserRendering__BaseUrl=http://searxng:8080
+      - LocalServiceHosts__SpeechTranscriptionBaseUrl=http://guideants-ai:80
+      - LocalServiceHosts__SpeechSynthesisBaseUrl=http://guideants-ai:80
+      - LocalServiceHosts__ImageGenerationBaseUrl=http://guideants-ai:80
+      - LocalServiceHosts__EmbeddingsBaseUrl=http://guideants-ai:80
+      - LocalServiceHosts__MediaBaseUrl=http://guideants-ai:80
+      - LocalServiceHosts__DocumentIntelligenceBaseUrl=http://docling-serve:5001
+      - DocumentServer__Enabled=${GA_DOCUMENTSERVER_ENABLED:-false}
+      - DocumentServer__InternalUrl=http://documentserver
+      - DocumentServer__ApiBaseUrl=http://guideants-webapi-ui:8080
+      - DocumentServer__JwtEnabled=${GA_DOCUMENTSERVER_JWT_ENABLED:-false}
+      - DocumentServer__JwtSecret=${DOCUMENTSERVER_JWT_SECRET:-}
+      - DocumentServer__JwtHeader=${GA_DOCUMENTSERVER_JWT_HEADER:-Authorization}
+      - DocumentServer__JwtInBody=${GA_DOCUMENTSERVER_JWT_IN_BODY:-false}
+      - ScriptExecution__AgentToken=${GA_SCRIPT_AGENT_TOKEN:-dev-script-agent-token}
+      - ScriptExecution__AdminToken=${GA_SCRIPT_AGENT_ADMIN_TOKEN:-dev-script-agent-admin-token}
+      - LlamaCpp__BaseUrl=http://guideants-ai:80/llama-cpp
+      - ServiceRouting__Containers__guideants-ai__BaseUrl=http://guideants-ai:80/sandbox
+      - ServiceRouting__Containers__plantuml__BaseUrl=http://plantuml:80
+      - HF_TOKEN=${HF_TOKEN:-}
+      - SettingsSecrets__ActiveKeyId=local-dev
+      - SettingsSecrets__Keys__local-dev=MDEyMzQ1Njc4OUFCQ0RFRjAxMjM0NTY3ODlBQkNERUY=
+      - Logging__LogLevel__GuideAntsApi.Services.Components.SpeechTranscriptionService=Information
+      - Logging__LogLevel__GuideAntsApi.Services.Components.SpeechSynthesisService=Information
+    networks:
+      - guideants-network
+      - default
+    restart: unless-stopped
+
+  plantuml:
+    image: ${GA_PLANTUML_GHCR_IMAGE:-ghcr.io/elumenotion/guideants-plantuml:main}
+    pull_policy: always
+    container_name: plantuml
+    environment:
+      - FILE_STORAGE_ROOT=/app/ContentFiles
+      - SCRIPT_EXECUTION_AGENT_TOKEN=${GA_SCRIPT_AGENT_TOKEN:-dev-script-agent-token}
+      - SCRIPT_EXECUTION_REQUIRE_TOKEN=true
+      - PLANTUML_LIMIT_SIZE=8192
+    volumes:
+      - type: bind
+        source: ${GA_CONTENT_FILES_HOST_PATH:-./volumes/content-files}
+        target: /app/ContentFiles
+    networks:
+      - guideants-network
+    restart: unless-stopped
+
+  searxng:
+    image: ${GA_SEARXNG_GHCR_IMAGE:-ghcr.io/elumenotion/guideants-searxng:main}
+    pull_policy: always
+    container_name: readweb-searxng
+    restart: unless-stopped
+    volumes:
+      - type: bind
+        source: ${GA_SEARXNG_CONFIG_HOST_PATH:-./volumes/searxng/config}
+        target: /etc/searxng
+      - type: bind
+        source: ${GA_SEARXNG_DATA_HOST_PATH:-./volumes/searxng/data}
+        target: /var/cache/searxng
+    environment:
+      - FORCE_OWNERSHIP=true
+      - BROWSER_RENDER_PORT=3001
+    networks:
+      - guideants-network
+
+volumes:
+  mssql_runtime_state:
+  mssql_data:
+  mssql_ftdata:
+  mssql_log:
+  ai_local_models-new:
+  script_agent_admin_state:
+
+networks:
+  guideants-network:
+    driver: bridge
+
diff --git a/docker/docker-compose.vulkan.yml b/docker/docker-compose.vulkan.yml
index 9f7eb3a7..68e0f9ea 100644
--- a/docker/docker-compose.vulkan.yml
+++ b/docker/docker-compose.vulkan.yml
@@ -74,6 +74,7 @@ services:
       # to the NVIDIA discrete GPU; set "Radeon"/"Intel" or use
       # GGML_VK_VISIBLE_DEVICES to target another GPU. Ignored by non-dzn ICDs.
       - MESA_D3D12_DEFAULT_ADAPTER_NAME=${MESA_D3D12_DEFAULT_ADAPTER_NAME:-NVIDIA}
+      - GGML_VK_VISIBLE_DEVICES=${GGML_VK_VISIBLE_DEVICES:-}
       # Used only when GA_VULKAN_RUNTIME=nvidia (native-Linux NVIDIA); harmless otherwise.
       - NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-all}
       - NVIDIA_DRIVER_CAPABILITIES=${NVIDIA_DRIVER_CAPABILITIES:-graphics,compute,utility}
@@ -95,7 +96,13 @@ services:
       - GA_LLAMA_THREADS=16
       - GA_LLAMA_PARALLEL=5
       - GA_LLAMA_CACHE_RAM=8192
-      - GA_LLAMA_KV_UNIFIED=1
+      # Vulkan currently trips llama.cpp's scheduler for some model families when
+      # KV cache tensors are placed on Vulkan buffers. Keep KV offload disabled
+      # by default; LLAMA_ARG_* lets current images pick this up without rebuild.
+      - GA_LLAMA_KV_OFFLOAD=${GA_LLAMA_KV_OFFLOAD:-0}
+      - LLAMA_ARG_KV_OFFLOAD=${GA_LLAMA_KV_OFFLOAD:-0}
+      # Keep unified KV opt-in for Vulkan as a separate conservative default.
+      - GA_LLAMA_KV_UNIFIED=${GA_LLAMA_KV_UNIFIED:-0}
       - GA_LLAMA_JINJA=1
       - GA_LLAMA_CONT_BATCH=1
       - GA_LLAMA_NO_MMAP=0
@@ -158,6 +165,9 @@ services:
       - GA_SD_STRENGTH=0.75
       - GA_SD_OFFLOAD_TO_CPU=${GA_SD_OFFLOAD_TO_CPU:-0}
       - GA_SD_DIFFUSION_FA=1
+      # Optional SD-only Vulkan device selector. Empty means inherit the
+      # container-wide GGML_VK_VISIBLE_DEVICES value used by llama too.
+      - GA_SD_VK_VISIBLE_DEVICES=${GA_SD_VK_VISIBLE_DEVICES:-}
       - GA_SD_AUTO_LOAD_ON_STARTUP=${GA_SD_AUTO_LOAD_ON_STARTUP:-0}
       - GA_SD_WARMUP_PROMPT=${GA_SD_WARMUP_PROMPT:-startup-warmup}
       - GA_SD_WARMUP_SIZE=${GA_SD_WARMUP_SIZE:-512x512}
diff --git a/docker/guideants-ai-build.md b/docker/guideants-ai-build.md
index 80348626..68626fae 100644
--- a/docker/guideants-ai-build.md
+++ b/docker/guideants-ai-build.md
@@ -108,7 +108,7 @@ Workflow implementation details:
 - `deps-rocm` -> runtime dependency image (no compiler toolchain)
 - `final-rocm` -> runtime image on top of `deps-rocm` (or an externally tagged deps image)
 
-- `runtime-vulkan-base` -> OS/runtime base on `ghcr.io/ggml-org/llama.cpp:server-vulkan` (Ubuntu 26.04), plus the universal GPU driver layer (`mesa-vulkan-drivers` + libglvnd/EGL libs) that makes one image work on NVIDIA, AMD, and Intel
+- `runtime-vulkan-base` -> OS/runtime base on `ghcr.io/ggml-org/llama.cpp:server-vulkan` (Ubuntu 26.04), plus the universal GPU driver layer (`mesa-vulkan-drivers` + libglvnd/EGL libs) that makes one image work on NVIDIA, AMD, and Intel; also installs Node.js 22 (`npx`) for `mcp+sandbox://` MCP servers, matching the other full AI images
 - `pydeps-vulkan-builder` -> Python dependency build stage (includes build toolchain)
 - `deps-vulkan` -> runtime dependency image (no compiler toolchain)
 - `final-vulkan` -> runtime image on top of `deps-vulkan` (or an externally tagged deps image)
@@ -588,6 +588,7 @@ Startup loading behavior is configurable per service through environment variabl
   - `0`: skip SD readiness monitoring on startup
 - `GA_SD_READY_TIMEOUT_SECONDS` (default `1800`)
 - `GA_SD_CUDA_VISIBLE_DEVICES` (optional explicit SD physical GPU pinning; empty value means inherit global ordering)
+- `GA_SD_VK_VISIBLE_DEVICES` (optional SD-only Vulkan device selector; empty value means inherit `GGML_VK_VISIBLE_DEVICES`)
 - `GA_EMB_TARGET_DEVICES` (default `cuda:0,cuda:1`; logical indices interpreted after CUDA remapping)
 
 Default compose behavior starts gateway-backed services in parallel. Optional readiness checks are non-blocking monitors so one service startup does not block others.
diff --git a/docker/guideants-ai-vulkan.md b/docker/guideants-ai-vulkan.md
index 19929dab..e0c0d9de 100644
--- a/docker/guideants-ai-vulkan.md
+++ b/docker/guideants-ai-vulkan.md
@@ -23,6 +23,9 @@ present, falling back to CPU for everything else.
 | **`vulkan`** | **Vulkan** | **NVIDIA + AMD + Intel** | **CPU wheels** |
 | `slim` | — | any (no local model runtime) | CPU wheels |
 
+Like the other full `guideants-ai` images, the Vulkan image bakes in **Node.js 22**
+(`node` / `npx`) so `mcp+sandbox://` package MCP servers can run inside the container.
+
 ## How the GPU is reached
 
 The Vulkan binaries are vendor-neutral; what differs per host is **which device node** the
@@ -135,6 +138,13 @@ environment:
 With no env set (Windows) this resolves to the dzn/`/dev/dxg` path. The `NVIDIA_*` vars matter
 only when `GA_VULKAN_RUNTIME=nvidia` (native-Linux NVIDIA) and are harmless otherwise.
 
+Vulkan leaves llama.cpp KV-cache offload disabled by default
+(`GA_LLAMA_KV_OFFLOAD=0`, propagated as `LLAMA_ARG_KV_OFFLOAD=0`) because current
+Vulkan router child processes can abort during startup for some model families when
+KV tensors are placed on a Vulkan buffer. Unified KV is also kept opt-in on this backend
+(`GA_LLAMA_KV_UNIFIED=0`). Set `GA_LLAMA_KV_OFFLOAD=1` or `GA_LLAMA_KV_UNIFIED=1`
+explicitly to retest either path with a newer upstream llama.cpp build.
+
 > **Note:** the bare-file default targets Windows. On a native-Linux host *without* the
 > `GA_VULKAN_*` env set, `${GA_VULKAN_DEVICE:-/dev/dxg}` resolves to `/dev/dxg`, which doesn't
 > exist there — so use the installer (which sets the env) or export the Linux values yourself.
@@ -221,6 +231,12 @@ docker logs guideants-ai 2>&1 | grep -i vulkan
 On Windows, `MESA_D3D12_DEFAULT_ADAPTER_NAME` (default `NVIDIA`) sets which adapter dzn lists
 first; `GGML_VK_VISIBLE_DEVICES` picks/splits among enumerated devices on any platform.
 
+Stable Diffusion can be pinned independently from llama with `GA_SD_VK_VISIBLE_DEVICES`.
+Leave it empty to inherit the container-wide `GGML_VK_VISIBLE_DEVICES`; set it when SD should
+use a different Vulkan device. For example, `GGML_VK_VISIBLE_DEVICES=1` and
+`GA_SD_VK_VISIBLE_DEVICES=0` keep llama on Vulkan device 1 while the SD `sd-server`
+subprocess uses Vulkan device 0.
+
 ## Publishing
 
 The Vulkan image publishes to GHCR alongside the other backends:
diff --git a/docs/developer-config-guide.md b/docs/developer-config-guide.md
index 81ed5b61..d66541d8 100644
--- a/docs/developer-config-guide.md
+++ b/docs/developer-config-guide.md
@@ -50,9 +50,10 @@ NuGet packages restore automatically on `dotnet build` / `dotnet run`.
 - **NVIDIA driver R580+** plus the **NVIDIA Container Toolkit** — enables the `cuda13` backend. The Windows launcher enforces driver major ≥ 580 via `nvidia-smi`.
   - Driver: <https://www.nvidia.com/Download/index.aspx>
   - Container Toolkit: <https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html>
-- **AMD GPU + ROCm-capable driver** — enables the experimental `rocm` backend.
+- **AMD GPU + ROCm-capable driver** — enables the `rocm` backend.
   - <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/>
-- With neither installed, the launcher auto-selects the `cpu` backend.
+- **Vulkan-capable GPU** (NVIDIA, AMD, or Intel) — enables the `vulkan` backend for GPU-accelerated llama + image generation on Docker Desktop (Windows/macOS) and native Linux. On Windows, when NVIDIA is detected but the driver is below R580 (CUDA 13 minimum), the launcher falls back to `vulkan` instead of `cpu`. See [`docker/guideants-ai-vulkan.md`](../docker/guideants-ai-vulkan.md).
+- With none of the above, the launcher auto-selects the `cpu` backend.
 
 ## Optional secrets
 
@@ -92,8 +93,9 @@ These are needed regardless of which lane you work in.
 
 Optional accelerators:
 
-- **NVIDIA driver R580+** (with NVIDIA Container Toolkit) → enables `cuda13` backend. `start_windows.cmd` enforces driver major ≥ 580 via `nvidia-smi`.
-- **AMD GPU + ROCm-capable driver** → enables experimental `rocm` backend.
+- **NVIDIA driver R580+** (with NVIDIA Container Toolkit) → enables `cuda13` backend. `start_windows.cmd` enforces driver major ≥ 580 via `nvidia-smi`; below R580 on Windows, the launcher selects `vulkan` instead.
+- **AMD GPU + ROCm-capable driver** → enables `rocm` backend.
+- **Vulkan GPU** (any vendor) → use `--backend vulkan` for vendor-neutral llama + SD acceleration; Linux launchers set `GA_VULKAN_*` automatically when needed.
 - Otherwise → `cpu` backend is selected automatically.
 
 Optional secrets:
@@ -261,7 +263,7 @@ Already covered above: Docker, Compose plugin, optional GPU runtime, ~60 GB disk
 | Image | Build tool | Extra pre-requisites |
 |---|---|---|
 | `guideants-webapi-ui` (`docker/build/webapi-ui/Dockerfile`) | `build_webapi_ui.ps1` / `.sh` | Requires the client UI to be built first via `npm run browser:build:docker` (produces `src/client/dist-browser/`). Multi-stage uses `mcr.microsoft.com/dotnet/sdk:8.0`. |
-| `guideants-ai` (`docker/build/guideants-ai/Dockerfile.{cpu,cuda,rocm,slim}`) | `build_guideants_ai.ps1` / `.sh` | Requires `dotnet publish` of `ScriptExecutionAgent` (so a host .NET 8 SDK is needed even though Dockerfiles also have an SDK stage) and BuildKit (`DOCKER_BUILDKIT=1`). CPU/CUDA/ROCm are full local AI variants; `slim` is the sandbox-oriented AI image for Python script execution without starting local model runtime services. The AI image also bakes the script-agent admin assets and `ga-script-exec` privacy wrapper. For cache export (`--cache-to`), use `desktop-linux` context and enable Docker Desktop containerd image store. |
+| `guideants-ai` (`docker/build/guideants-ai/Dockerfile.{cpu,cuda,rocm,vulkan,slim}`) | `build_guideants_ai.ps1` / `.sh` | Requires `dotnet publish` of `ScriptExecutionAgent` (so a host .NET 8 SDK is needed even though Dockerfiles also have an SDK stage) and BuildKit (`DOCKER_BUILDKIT=1`). CPU/CUDA/ROCm/Vulkan are full local AI variants; each bakes Node.js 22 for `mcp+sandbox://` package MCP. `slim` is the sandbox-oriented AI image for Python script execution without starting local model runtime services. The AI image also bakes the script-agent admin assets and `ga-script-exec` privacy wrapper. For cache export (`--cache-to`), use `desktop-linux` context and enable Docker Desktop containerd image store. |
 | `mssql2025-express-fts` (`docker/build/mssql-fts/Dockerfile`) | `-All` switch on the AI build script | Standard Docker build. |
 | `plantuml-1.2025.2` (`docker/build/Sandboxes/PlantUml/dockerfile`) | `-All` switch on the AI build script | Standard Docker build. |
 | `guideants-searxng` | `docker compose build searxng` | Repo-root build context. |
@@ -316,6 +318,7 @@ GA_SEARXNG_DATA_HOST_PATH=./volumes/searxng/data
 GA_AI_CUDA_IMAGE=guideants-ai:cuda13-26132.1047
 GA_AI_CPU_IMAGE=guideants-ai:cpu-26126.1012
 GA_AI_ROCM_IMAGE=guideants-ai:rocm-26131.2226
+GA_AI_VULKAN_IMAGE=guideants-ai:vulkan-latest
 GA_EMB_DEFAULT_MODEL_PATH=harrier-oss-v1-0.6b
 GA_EMB_AUTO_LOAD_ON_STARTUP=1
 GA_EMB_WARMUP_ON_LOAD=1
@@ -353,7 +356,7 @@ Conflicts with local dev API on `5106` and Vite dev server on `5173` — both ar
 2. Run `start_windows.cmd` (or the `.sh` equivalent).
 3. Wait for `http://localhost:5107/` — launcher opens browser.
 
-The launcher auto-picks `cuda13`, `rocm`, or `cpu` and pulls GHCR images. No SDKs required.
+The launcher auto-picks `cuda13`, `rocm`, `vulkan` (Windows NVIDIA below R580), or `cpu` and pulls GHCR images. No SDKs required. Use `--backend vulkan` to force the vendor-neutral GPU stack.
 
 ### 5b) Client-only dev (UI work against dockerized API)
 
@@ -379,7 +382,7 @@ The launcher auto-picks `cuda13`, `rocm`, or `cpu` and pulls GHCR images. No SDK
 
 1. Everything in 5c.
 2. BuildKit enabled (`DOCKER_BUILDKIT=1` — the build scripts set this).
-3. For CUDA AI image builds: NVIDIA container runtime + enough disk for multi-stage CUDA 13 images.
+3. For CUDA AI image builds: NVIDIA container runtime + enough disk for multi-stage CUDA 13 images. For Vulkan builds: see [`docker/guideants-ai-vulkan.md`](../docker/guideants-ai-vulkan.md).
 4. PowerShell scripts at `docker/build/build_guideants_ai.ps1` for AI backends and `docker/build/build_support_images.ps1` for MSSQL FTS, PlantUML, SearXNG, and WebAPI/UI.
 
 ---
@@ -392,9 +395,9 @@ A few items worth confirming explicitly before onboarding a new dev:
 - **No pinned Node engine** in `src/client/package.json` — pin to avoid drift (Vite 6 + Vitest 4 + Electron 41 → Node 20.x or 22.x).
 - **`appsettings.example.json` is not auto-copied** to `appsettings.json` — first-time devs must do this manually and replace the `SettingsSecrets` key.
 - **`docker/.env` ships with stale-style local image tags** (`guideants-ai:cuda13-26132.1047` etc.) — irrelevant if you use the GHCR compose files but will fail `docker compose up` on the `local` compose files unless you build them first.
-- **CUDA 13 needs NVIDIA R580+ drivers** — the Windows launcher enforces this; manual `docker compose` does not.
+- **CUDA 13 needs NVIDIA R580+ drivers** — the Windows launcher enforces this for `cuda13` and falls back to `vulkan` when the driver is older; manual `docker compose` does not auto-select.
 - **HF token must be set in UI** (`Settings → Connections → HuggingFace`); the API does *not* accept per-request token overrides per the setup guide.
-- **Python**: there is no required host Python install. `src/python/pptx` runs inside the `ScriptExecutionAgent`/sandbox containers; Python 3.11 is baked into the `guideants-ai` images, including the sandbox-oriented `slim` AI variant. Script executions use per-`project + guide` venvs in the `script_agent_admin_state` volume, layered over the image's `/opt/venv` packages. Only install Python on the host if you specifically want to iterate on `src/python/pptx` outside Docker.
+- **Python**: there is no required host Python install. `src/python/pptx` runs inside the `ScriptExecutionAgent`/sandbox containers; Python 3.11 and Node.js 22 are baked into the full `guideants-ai` images (cpu/cuda/rocm/vulkan), including `npx` for package MCP. The sandbox-oriented `slim` AI variant also includes Python 3.11 and Node.js. Script executions use per-`project + guide` venvs in the `script_agent_admin_state` volume, layered over the image's `/opt/venv` packages. Only install Python on the host if you specifically want to iterate on `src/python/pptx` outside Docker.
 
 ---
 
@@ -402,10 +405,10 @@ A few items worth confirming explicitly before onboarding a new dev:
 
 | Lane | Mandatory | Optional |
 |---|---|---|
-| **Run only** | Docker + Compose, ~60 GB disk, curl, WSL2 (Win) | NVIDIA R580+ or AMD ROCm GPU, HF token |
+| **Run only** | Docker + Compose, ~60 GB disk, curl, WSL2 (Win) | NVIDIA R580+, AMD ROCm GPU, Vulkan GPU (`--backend vulkan`), HF token |
 | **Client dev** | Node 20+/22+, npm, `.env.*` files, a running API | Electron 41 (desktop), ANALYZE=true tooling |
 | **Server dev** | .NET 8 SDK, PowerShell 7+ (cross-platform), `appsettings*.json`, SQL Server (containerized) | EF CLI tools for migrations, Azure Speech key (if testing Azure speech path), local/admin test accounts for role-gated endpoint checks |
-| **Docker builds** | All of the above + BuildKit, GPU runtime for CUDA/ROCm image builds, dotnet publish for ScriptExecutionAgent | GHCR write access (only for publish workflows) |
+| **Docker builds** | All of the above + BuildKit, GPU runtime for CUDA/ROCm/Vulkan image builds, dotnet publish for ScriptExecutionAgent | GHCR write access (only for publish workflows) |
 
 ---
 
diff --git a/docs/host-mounts-execution/docker-gate.md b/docs/host-mounts-execution/docker-gate.md
index 98be2813..a3dc3f17 100644
--- a/docs/host-mounts-execution/docker-gate.md
+++ b/docs/host-mounts-execution/docker-gate.md
@@ -21,7 +21,8 @@ override generator.
 
 - **Base compose files** live in `docker/` (selected by `start_*`):
   `docker-compose.ghcr-cpu.yml` (default), `…ghcr-cuda13.yml`, `…ghcr-rocm.yml`,
-  `…ghcr-slim.yml`, and the local-build variants (`docker-compose.cpu.yml`, etc.).
+  `…ghcr-vulkan.yml`, `…ghcr-slim.yml`, and the local-build variants
+  (`docker-compose.cpu.yml`, `docker-compose.vulkan.yml`, etc.).
 - **Generated override** (new): `docker/docker-compose.host-mounts.generated.yml`.
   It mounts each configured source into the **affected services**
   (DECISIONS D2; default `guideants-webapi-ui;guideants-ai;plantuml`) at
diff --git a/docs/local-ai-setup-guide.md b/docs/local-ai-setup-guide.md
index 905e973b..0dca33aa 100644
--- a/docs/local-ai-setup-guide.md
+++ b/docs/local-ai-setup-guide.md
@@ -4,6 +4,8 @@ Last validated: 2026-05-05
 
 This guide configures GuideAnts for fully local AI using the Setup Wizard only. If you only need Python sandbox/script execution and plan to use cloud/provider AI for model calls, use the explicit `--backend slim` stack instead of this local model setup.
 
+For GPU acceleration without CUDA 13 or ROCm, start the stack with `--backend vulkan` (see [`docker/guideants-ai-vulkan.md`](../docker/guideants-ai-vulkan.md)). Vulkan GPU-accelerates llama and image generation; ASR, TTS, and embeddings still run on CPU inside the image.
+
 ## Prerequisites
 
 1. GuideAnts is running at `http://localhost:5107`.
diff --git a/docs/setup-guide.md b/docs/setup-guide.md
index a4981db8..40b98530 100644
--- a/docs/setup-guide.md
+++ b/docs/setup-guide.md
@@ -1,6 +1,6 @@
 # GuideAnts Setup Guide
 
-Last updated: 2026-06-05
+Last updated: 2026-06-30
 
 This is the setup-first operator guide for GuideAnts.
 Use it to get a working environment from zero to usable chat/services, then use linked docs for deeper architecture details.
@@ -21,7 +21,7 @@ Use the root launcher script for your OS:
 What these scripts do:
 
 - Validate Docker + Docker Compose.
-- Auto-detect backend (`cuda13` when NVIDIA is available, `rocm` when AMD/ROCm is available, otherwise `cpu`). The `slim` backend is explicit only.
+- Auto-detect backend (`cuda13` when NVIDIA + R580+ drivers are available, `rocm` when AMD/ROCm is available, `vulkan` on Windows when NVIDIA is present but its driver is below R580, otherwise `cpu`). The `slim` backend is never auto-selected (explicit `--backend slim` only); `vulkan` can also be forced with `--backend vulkan`.
 - Choose compose stack (`ghcr` by default, `local` optional).
 - Start the stack and wait for `http://localhost:5107/`.
 
@@ -29,7 +29,7 @@ Useful options:
 
 - `--doctor` (checks only, no startup)
 - `--fix` (limited auto-remediation)
-- `--backend cpu|cuda13|rocm|slim` (force backend; `slim` is the sandbox-oriented stack)
+- `--backend cpu|cuda13|rocm|slim|vulkan` (force backend; `slim` is sandbox-only; `vulkan` is vendor-neutral GPU for llama + image gen)
 - `--compose ghcr|local` (prebuilt GHCR vs local images)
 
 If the launcher gets you to `http://localhost:5107/`, skip to section 5 for first-user auth bootstrap and initial wizard flow.
@@ -38,13 +38,14 @@ If the launcher gets you to `http://localhost:5107/`, skip to section 5 for firs
 
 GuideAnts runs as a Docker Compose stack on a single host. Pick the stack by deciding two things:
 
-1. Whether model runtimes should run locally (`cpu`, `cuda13`, `rocm`) or elsewhere (`slim`).
+1. Whether model runtimes should run locally (`cpu`, `cuda13`, `rocm`, `vulkan`) or elsewhere (`slim`).
 2. Whether images should be pulled from GHCR (`--compose ghcr`) or built locally first (`--compose local`).
 
 | Backend | Best for | Compose files | Web/API/SQL shape | AI runtime shape |
 |---------|----------|---------------|-------------------|------------------|
-| `cuda13` | Local AI on NVIDIA GPUs. | `docker-compose.ghcr-cuda13.yml` or `docker-compose.cuda.yml` | Split stack: API/UI plus separate SQL Server. | Full local AI services. |
-| `rocm` | Experimental local AI on AMD/ROCm. | `docker-compose.ghcr-rocm.yml` or `docker-compose.rocm.yml` | Split stack: API/UI plus separate SQL Server. | Full local AI services. |
+| `cuda13` | Local AI on NVIDIA GPUs (CUDA 13, driver R580+). | `docker-compose.ghcr-cuda13.yml` or `docker-compose.cuda.yml` | Split stack: API/UI plus separate SQL Server. | Full local AI services (CUDA). |
+| `rocm` | Local AI on AMD/ROCm. | `docker-compose.ghcr-rocm.yml` or `docker-compose.rocm.yml` | Split stack: API/UI plus separate SQL Server. | Full local AI services (HIP for llama + image generation (SD); CPU torch for ASR/TTS/embeddings). |
+| `vulkan` | Local AI on NVIDIA, AMD, or Intel via Vulkan (one image). Best on Docker Desktop (Windows/macOS) and native Linux. | `docker-compose.ghcr-vulkan.yml` or `docker-compose.vulkan.yml` | Split stack: API/UI plus separate SQL Server. | Full local AI services (Vulkan GPU for llama + image generation (SD); CPU torch for ASR/TTS/embeddings). Includes Node.js 22 for `mcp+sandbox://` MCP servers. |
 | `cpu` | Local AI without GPU acceleration. | `docker-compose.ghcr-cpu.yml` or `docker-compose.cpu.yml` | Split stack: API/UI plus separate SQL Server. | Full local AI services. |
 | `slim` | Python sandbox users who use cloud/provider AI for model calls. | `docker-compose.ghcr-slim.yml` or `docker-compose.slim.yml` | Combined `guideants-webapi-ui-mssql`; no separate `mssql-express` service. | `guideants-ai slim`: sandbox/media only. |
 
@@ -52,8 +53,8 @@ The services you see depend on that stack:
 
 | Service | Image/source | Role |
 |---------|---------------|------|
-| `mssql-express` | `mssql2025-express-fts` | SQL Server database for split-stack `cpu`, `cuda13`, and `rocm` deployments. Not present in the slim stack because SQL Server is bundled into `guideants-webapi-ui-mssql`. |
-| `guideants-ai` | `ghcr.io/elumenotion/guideants-ai-{cpu,cuda13,rocm}:latest` (or local tag); `guideants-ai-slim` for the slim stack | Full variants are the local AI gateway: llama.cpp, ASR, TTS, image generation, embeddings, media, script execution. The slim AI variant is for Python sandbox/script execution without starting local model runtime services. |
+| `mssql-express` | `mssql2025-express-fts` | SQL Server database for split-stack `cpu`, `cuda13`, `rocm`, and `vulkan` deployments. Not present in the slim stack because SQL Server is bundled into `guideants-webapi-ui-mssql`. |
+| `guideants-ai` | `ghcr.io/elumenotion/guideants-ai-{cpu,cuda13,rocm,vulkan}:latest` (or local tag); `guideants-ai-slim` for the slim stack | Full variants are the local AI gateway: llama.cpp, ASR, TTS, image generation, embeddings, media, script execution (with Node.js 22 for `mcp+sandbox://` MCP servers). The slim AI variant is for Python sandbox/script execution without starting local model runtime services. |
 | `docling-serve` | `quay.io/docling-project/docling-serve-cpu:v1.21.0` by default | Local document intelligence / markdown extraction. The `cpu` in this image tag is Docling's CPU image variant, not the GuideAnts backend selection. Healthcheck: `GET /version`. |
 | `documentserver` | `${GA_DOCUMENTSERVER_IMAGE}` from `docker/.env` | DocumentServer used for in-app Office document display and full editing in project/notebook file flows. |
 | `guideants-webapi-ui` / `guideants-webapi-ui-slim` / `guideants-webapi-ui-mssql` | Stack-specific API/UI image | Main API plus bundled browser UI at `http://localhost:5107`. `guideants-webapi-ui-slim` is API/UI-only for split stacks; it is not the slim AI stack. |
@@ -94,7 +95,8 @@ Settings top-level tab order (current):
 
 - Docker Desktop (Windows/macOS) or Docker Engine 24+ with Compose plugin.
 - Windows PowerShell 7+ for `docker/llama/run/*.ps1` helper scripts.
-- For CUDA local AI: NVIDIA drivers + container runtime support.
+- For CUDA local AI: NVIDIA drivers (R580+) + container runtime support.
+- For Vulkan local AI: a Vulkan-capable GPU, plus either Docker Desktop on Windows/macOS (Mesa dzn over D3D12) or, on native Linux, Mesa RADV/ANV or nvidia-container-toolkit. See [`docker/guideants-ai-vulkan.md`](../docker/guideants-ai-vulkan.md).
 - Disk budget: ~60 GB minimum for common local model sets.
 
 ### Images and compose mode
@@ -102,7 +104,7 @@ Settings top-level tab order (current):
 You can run in either mode:
 
 - `ghcr` mode (default in launcher): pulls prebuilt images via `docker/docker-compose.ghcr-*.yml`.
-- `local` mode: uses `docker/docker-compose.{cpu,cuda,rocm,slim}.yml`; build GuideAnts local images first when needed. Third-party images such as Docling or DocumentServer may still be pulled if the exact tag is not already present locally.
+- `local` mode: uses `docker/docker-compose.{cpu,cuda,rocm,vulkan,slim}.yml`; build GuideAnts local images first when needed. Third-party images such as Docling or DocumentServer may still be pulled if the exact tag is not already present locally.
 
 The slim stack is selected with `--backend slim` and uses `docker/docker-compose.slim.yml` locally or `docker/docker-compose.ghcr-slim.yml` in GHCR mode. It uses the combined Web/API/SQL image (`guideants-webapi-ui-mssql`) plus the sandbox-oriented AI image (`guideants-ai slim`). It does not use `guideants-webapi-ui-slim`; that image is orthogonal and remains the API/UI image for split-stack deployments.
 
@@ -143,6 +145,7 @@ Local images:
 - CUDA: `docker/docker-compose.cuda.yml`
 - CPU: `docker/docker-compose.cpu.yml`
 - ROCm: `docker/docker-compose.rocm.yml`
+- Vulkan: `docker/docker-compose.vulkan.yml`
 - Slim: `docker/docker-compose.slim.yml`
 
 GHCR images:
@@ -150,6 +153,7 @@ GHCR images:
 - CUDA: `docker/docker-compose.ghcr-cuda13.yml`
 - CPU: `docker/docker-compose.ghcr-cpu.yml`
 - ROCm: `docker/docker-compose.ghcr-rocm.yml`
+- Vulkan: `docker/docker-compose.ghcr-vulkan.yml`
 - Slim: `docker/docker-compose.ghcr-slim.yml`
 
 ### Example startup commands
@@ -173,6 +177,12 @@ GHCR images:
 # GHCR ROCm
  docker compose -f docker/docker-compose.ghcr-rocm.yml up -d
 
+# local Vulkan
+ docker compose -f docker/docker-compose.vulkan.yml up -d
+
+# GHCR Vulkan
+ docker compose -f docker/docker-compose.ghcr-vulkan.yml up -d
+
 # local slim
  docker compose -f docker/docker-compose.slim.yml up -d
 
@@ -628,4 +638,5 @@ Read in this order:
 8. [`llama-model-download-and-runtime-management.md`](llama-model-download-and-runtime-management.md)
 9. [`telemetry-configuration.md`](telemetry-configuration.md)
 10. [`../docker/guideants-ai-build.md`](../docker/guideants-ai-build.md)
-11. [`../docker/build-processes.md`](../docker/build-processes.md)
+11. [`../docker/guideants-ai-vulkan.md`](../docker/guideants-ai-vulkan.md)
+12. [`../docker/build-processes.md`](../docker/build-processes.md)
diff --git a/installer/docker/docker-compose.ghcr-vulkan.yml b/installer/docker/docker-compose.ghcr-vulkan.yml
index ede7f937..dfd3334f 100644
--- a/installer/docker/docker-compose.ghcr-vulkan.yml
+++ b/installer/docker/docker-compose.ghcr-vulkan.yml
@@ -26,6 +26,8 @@ services:
     image: ${GA_AI_GHCR_IMAGE:-ghcr.io/elumenotion/guideants-ai-vulkan:main}
     pull_policy: missing
     container_name: guideants-ai
+    cap_drop:
+      - SYS_PTRACE
     # ---- Vulkan GPU wiring (cross-OS) -----------------------------------------
     # Defaults target Windows / Docker Desktop, where GPU Vulkan for EVERY vendor
     # goes Vulkan -> Mesa dzn -> D3D12 -> /dev/dxg (works from git bash, no WSL).
@@ -42,6 +44,7 @@ services:
     volumes:
       - ${GA_VULKAN_DRIVER_LIBS:-/usr/lib/wsl}:/usr/lib/wsl:ro
       - ai_local_models-new:/models-local
+      - script_agent_admin_state:/var/lib/guideants/script-agent-admin
       - type: bind
         source: ${GA_CONTENT_FILES_HOST_PATH:-./volumes/content-files}
         target: /app/ContentFiles
@@ -56,11 +59,15 @@ services:
       - VK_DRIVER_FILES=${GA_VULKAN_ICD:-/usr/share/vulkan/icd.d/dzn_icd.json}
       - VK_ICD_FILENAMES=${GA_VULKAN_ICD:-/usr/share/vulkan/icd.d/dzn_icd.json}
       - MESA_D3D12_DEFAULT_ADAPTER_NAME=${MESA_D3D12_DEFAULT_ADAPTER_NAME:-NVIDIA}
+      - GGML_VK_VISIBLE_DEVICES=${GGML_VK_VISIBLE_DEVICES:-}
       # Used only when GA_VULKAN_RUNTIME=nvidia (native-Linux NVIDIA); harmless otherwise.
       - NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-all}
       - NVIDIA_DRIVER_CAPABILITIES=${NVIDIA_DRIVER_CAPABILITIES:-graphics,compute,utility}
       - HF_TOKEN=${HF_TOKEN:-}
       - SCRIPT_EXECUTION_AGENT_TOKEN=${GA_SCRIPT_AGENT_TOKEN:-dev-script-agent-token}
+      - SCRIPT_EXECUTION_ADMIN_TOKEN=${GA_SCRIPT_AGENT_ADMIN_TOKEN:-dev-script-agent-admin-token}
+      - SCRIPT_EXECUTION_ADMIN_STATE_DIR=/var/lib/guideants/script-agent-admin
+      - SCRIPT_EXECUTION_SCOPE_STATE_ROOT=/var/lib/guideants/script-agent-admin/scopes
       - SCRIPT_EXECUTION_REQUIRE_TOKEN=true
       - GA_LLAMA_MODELS_PRESET=/models-local/router-models.ini
       - GA_LLAMA_MODEL_DIR=/models-local/llama
@@ -71,7 +78,13 @@ services:
       - GA_LLAMA_THREADS=16
       - GA_LLAMA_PARALLEL=5
       - GA_LLAMA_CACHE_RAM=8192
-      - GA_LLAMA_KV_UNIFIED=1
+      # Vulkan currently trips llama.cpp's scheduler for some model families when
+      # KV cache tensors are placed on Vulkan buffers. Keep KV offload disabled
+      # by default; LLAMA_ARG_* lets current images pick this up without rebuild.
+      - GA_LLAMA_KV_OFFLOAD=${GA_LLAMA_KV_OFFLOAD:-0}
+      - LLAMA_ARG_KV_OFFLOAD=${GA_LLAMA_KV_OFFLOAD:-0}
+      # Keep unified KV opt-in for Vulkan as a separate conservative default.
+      - GA_LLAMA_KV_UNIFIED=${GA_LLAMA_KV_UNIFIED:-0}
       - GA_LLAMA_JINJA=1
       - GA_LLAMA_CONT_BATCH=1
       - GA_LLAMA_NO_MMAP=0
@@ -128,6 +141,9 @@ services:
       - GA_SD_STRENGTH=0.75
       - GA_SD_OFFLOAD_TO_CPU=${GA_SD_OFFLOAD_TO_CPU:-0}
       - GA_SD_DIFFUSION_FA=1
+      # Optional SD-only Vulkan device selector. Empty means inherit the
+      # container-wide GGML_VK_VISIBLE_DEVICES value used by llama too.
+      - GA_SD_VK_VISIBLE_DEVICES=${GA_SD_VK_VISIBLE_DEVICES:-}
       - GA_SD_AUTO_LOAD_ON_STARTUP=${GA_SD_AUTO_LOAD_ON_STARTUP:-1}
       - GA_SD_WARMUP_PROMPT=${GA_SD_WARMUP_PROMPT:-startup-warmup}
       - GA_SD_WARMUP_SIZE=${GA_SD_WARMUP_SIZE:-512x512}
@@ -213,6 +229,7 @@ services:
       - DocumentServer__JwtHeader=${GA_DOCUMENTSERVER_JWT_HEADER:-Authorization}
       - DocumentServer__JwtInBody=${GA_DOCUMENTSERVER_JWT_IN_BODY:-false}
       - ScriptExecution__AgentToken=${GA_SCRIPT_AGENT_TOKEN:-dev-script-agent-token}
+      - ScriptExecution__AdminToken=${GA_SCRIPT_AGENT_ADMIN_TOKEN:-dev-script-agent-admin-token}
       - LlamaCpp__BaseUrl=http://guideants-ai:80/llama-cpp
       - ServiceRouting__Containers__guideants-ai__BaseUrl=http://guideants-ai:80/sandbox
       - ServiceRouting__Containers__plantuml__BaseUrl=http://plantuml:80
@@ -266,6 +283,7 @@ volumes:
   mssql_ftdata:
   mssql_log:
   ai_local_models-new:
+  script_agent_admin_state:
 
 networks:
   guideants-network:
diff --git a/installer/docker/docker-compose.vulkan.yml b/installer/docker/docker-compose.vulkan.yml
index 9f7eb3a7..68e0f9ea 100644
--- a/installer/docker/docker-compose.vulkan.yml
+++ b/installer/docker/docker-compose.vulkan.yml
@@ -74,6 +74,7 @@ services:
       # to the NVIDIA discrete GPU; set "Radeon"/"Intel" or use
       # GGML_VK_VISIBLE_DEVICES to target another GPU. Ignored by non-dzn ICDs.
       - MESA_D3D12_DEFAULT_ADAPTER_NAME=${MESA_D3D12_DEFAULT_ADAPTER_NAME:-NVIDIA}
+      - GGML_VK_VISIBLE_DEVICES=${GGML_VK_VISIBLE_DEVICES:-}
       # Used only when GA_VULKAN_RUNTIME=nvidia (native-Linux NVIDIA); harmless otherwise.
       - NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-all}
       - NVIDIA_DRIVER_CAPABILITIES=${NVIDIA_DRIVER_CAPABILITIES:-graphics,compute,utility}
@@ -95,7 +96,13 @@ services:
       - GA_LLAMA_THREADS=16
       - GA_LLAMA_PARALLEL=5
       - GA_LLAMA_CACHE_RAM=8192
-      - GA_LLAMA_KV_UNIFIED=1
+      # Vulkan currently trips llama.cpp's scheduler for some model families when
+      # KV cache tensors are placed on Vulkan buffers. Keep KV offload disabled
+      # by default; LLAMA_ARG_* lets current images pick this up without rebuild.
+      - GA_LLAMA_KV_OFFLOAD=${GA_LLAMA_KV_OFFLOAD:-0}
+      - LLAMA_ARG_KV_OFFLOAD=${GA_LLAMA_KV_OFFLOAD:-0}
+      # Keep unified KV opt-in for Vulkan as a separate conservative default.
+      - GA_LLAMA_KV_UNIFIED=${GA_LLAMA_KV_UNIFIED:-0}
       - GA_LLAMA_JINJA=1
       - GA_LLAMA_CONT_BATCH=1
       - GA_LLAMA_NO_MMAP=0
@@ -158,6 +165,9 @@ services:
       - GA_SD_STRENGTH=0.75
       - GA_SD_OFFLOAD_TO_CPU=${GA_SD_OFFLOAD_TO_CPU:-0}
       - GA_SD_DIFFUSION_FA=1
+      # Optional SD-only Vulkan device selector. Empty means inherit the
+      # container-wide GGML_VK_VISIBLE_DEVICES value used by llama too.
+      - GA_SD_VK_VISIBLE_DEVICES=${GA_SD_VK_VISIBLE_DEVICES:-}
       - GA_SD_AUTO_LOAD_ON_STARTUP=${GA_SD_AUTO_LOAD_ON_STARTUP:-0}
       - GA_SD_WARMUP_PROMPT=${GA_SD_WARMUP_PROMPT:-startup-warmup}
       - GA_SD_WARMUP_SIZE=${GA_SD_WARMUP_SIZE:-512x512}
diff --git a/src/client/GuideAnts.code-workspace b/src/client/GuideAnts.code-workspace
deleted file mode 100644
index 05e932db..00000000
--- a/src/client/GuideAnts.code-workspace
+++ /dev/null
@@ -1,14 +0,0 @@
-{
-	"folders": [
-		{
-			"path": "../.."
-		},
-		{
-			"path": "../../../GuideAntsChat"
-		},
-		{
-			"path": "../../../llama.cpp"
-		}
-	],
-	"settings": {}
-}
\ No newline at end of file
diff --git a/src/client/src/components/LoadingSpinner.tsx b/src/client/src/components/LoadingSpinner.tsx
index 8d61b812..40e9bf85 100644
--- a/src/client/src/components/LoadingSpinner.tsx
+++ b/src/client/src/components/LoadingSpinner.tsx
@@ -4,7 +4,11 @@ interface LoadingSpinnerProps {
   message?: string;
 }
 
-const LoadingSpinner: React.FC<LoadingSpinnerProps> = ({ message = 'Loading your content...' }) => {
+const LoadingSpinner: React.FC<LoadingSpinnerProps> = (props) => {
+  const displayMessage = 'message' in props
+    ? (props.message ?? '')
+    : 'Loading your content...';
+
   return (
     <div className="flex flex-col items-center justify-center w-full min-h-[200px] p-4">
       <img 
@@ -12,7 +16,9 @@ const LoadingSpinner: React.FC<LoadingSpinnerProps> = ({ message = 'Loading your
         alt="Loading..." 
         className="w-16 h-16 animate-bounce"
       />
-      <p className="mt-4 text-gray-600 text-sm font-medium">{message}</p>
+      {displayMessage && (
+        <p className="mt-4 text-gray-600 text-sm font-medium">{displayMessage}</p>
+      )}
     </div>
   );
 };
diff --git a/src/client/src/components/notebook/conversations/LlamaRuntimeModal.tsx b/src/client/src/components/notebook/conversations/LlamaRuntimeModal.tsx
index a295713e..7620d254 100644
--- a/src/client/src/components/notebook/conversations/LlamaRuntimeModal.tsx
+++ b/src/client/src/components/notebook/conversations/LlamaRuntimeModal.tsx
@@ -10,6 +10,21 @@ interface LlamaRuntimeModalProps {
   isPolling: boolean;
 }
 
+function getActiveOperationMessage(state?: string): string {
+  switch (state) {
+    case 'unloading':
+      return 'Unloading current models...';
+    case 'loading':
+      return 'Loading new models into VRAM...';
+    case 'verifying':
+      return 'Verifying model readiness...';
+    case 'queued':
+      return 'Waiting to start...';
+    default:
+      return 'Loading new models into VRAM...';
+  }
+}
+
 export const LlamaRuntimeModal: React.FC<LlamaRuntimeModalProps> = ({
   isOpen,
   onClose,
@@ -37,10 +52,11 @@ export const LlamaRuntimeModal: React.FC<LlamaRuntimeModalProps> = ({
              isInvalid ? 'Incompatible Models' : 
              'Local Models Required'}
           </h2>
-          <p className="text-sm text-gray-600 mt-2">
-            {isPolling && 'Please wait while the required models are loaded into the local runtime.'}
-            {!isPolling && !isFailed && !isInvalid && 'The selected assistant requires local models that are not currently loaded.'}
-          </p>
+          {!isPolling && !isFailed && !isInvalid && (
+            <p className="text-sm text-gray-600 mt-2">
+              The selected assistant requires local models that are not currently loaded.
+            </p>
+          )}
           {isPolling && status.activeOperation?.operationId === '__external_loading__' && (
             <p className="text-sm text-slate-500 mt-1">
               Models are already loading from startup or another session — no action needed.
@@ -50,15 +66,7 @@ export const LlamaRuntimeModal: React.FC<LlamaRuntimeModalProps> = ({
 
         <div className="px-6 pb-6">
           {isPolling ? (
-            <div className="flex flex-col items-center justify-center space-y-4 py-8">
-              <LoadingSpinner />
-              <p className="text-sm text-slate-500">
-                {status.activeOperation?.state === 'unloading' && 'Unloading current models...'}
-                {status.activeOperation?.state === 'loading' && 'Loading new models into VRAM...'}
-                {status.activeOperation?.state === 'verifying' && 'Verifying model readiness...'}
-                {status.activeOperation?.state === 'queued' && 'Waiting to start...'}
-              </p>
-            </div>
+            <LoadingSpinner message={getActiveOperationMessage(status.activeOperation?.state)} />
           ) : isInvalid ? (
             <div className="space-y-4">
               <div className="p-4 bg-red-50 text-red-700 rounded-md text-sm">
diff --git a/src/client/src/components/notebook/conversations/__tests__/LlamaRuntimeModal.test.tsx b/src/client/src/components/notebook/conversations/__tests__/LlamaRuntimeModal.test.tsx
index e5a4f8e1..dc8db939 100644
--- a/src/client/src/components/notebook/conversations/__tests__/LlamaRuntimeModal.test.tsx
+++ b/src/client/src/components/notebook/conversations/__tests__/LlamaRuntimeModal.test.tsx
@@ -71,9 +71,30 @@ describe('LlamaRuntimeModal', () => {
 
     expect(screen.getByText('Loading Local Models...')).toBeInTheDocument();
     expect(screen.getByText('Loading new models into VRAM...')).toBeInTheDocument();
+    expect(screen.queryByText('Loading your content...')).not.toBeInTheDocument();
+    expect(screen.queryByText('Please wait while the required models are loaded into the local runtime.')).not.toBeInTheDocument();
     expect(screen.queryByRole('button', { name: 'Cancel' })).not.toBeInTheDocument();
   });
 
+  it('shows external startup loading hint when polling external operation', () => {
+    render(
+      <LlamaRuntimeModal
+        isOpen
+        onClose={onClose}
+        status={{
+          ...baseStatus,
+          activeOperation: { operationId: '__external_loading__', state: 'loading' },
+        }}
+        onStartLoad={onStartLoad}
+        isPolling
+      />
+    );
+
+    expect(
+      screen.getByText('Models are already loading from startup or another session — no action needed.')
+    ).toBeInTheDocument();
+  });
+
   it('shows failed state with retry', async () => {
     const user = userEvent.setup();
     render(
diff --git a/src/client/src/pages/settings/editors/image-generation/ImageBundleManager.tsx b/src/client/src/pages/settings/editors/image-generation/ImageBundleManager.tsx
index 7dcf17f4..bc22737c 100644
--- a/src/client/src/pages/settings/editors/image-generation/ImageBundleManager.tsx
+++ b/src/client/src/pages/settings/editors/image-generation/ImageBundleManager.tsx
@@ -811,8 +811,8 @@ export function ImageBundleManager({ enabled, onDownloadOperationChange, onRunti
                               : bundleExportBusy
                               ? 'Wait for definition download to finish.'
                               : b.definition
-                              ? 'Edit the saved bundle recipe and re-download this bundle.'
-                              : 'Edit this bundle recipe and re-download. No saved recipe metadata is present yet.'
+                              ? 'Edit the saved bundle recipe; only changed roles are re-downloaded.'
+                              : 'Edit this bundle recipe; changed roles are re-downloaded. No saved recipe metadata is present yet.'
                           }
                         />
                         <IconActionButton
@@ -1281,7 +1281,7 @@ function DownloadBundleDialog({
               onClick={() => void submit()}
               disabled={submitting || loadingBundle}
             >
-              {submitting ? 'Starting…' : mode === 'edit' ? 'Save & re-download' : 'Download snapshot'}
+              {submitting ? 'Starting…' : mode === 'edit' ? 'Save & update' : 'Download snapshot'}
             </TextActionButton>
           </>
         )
@@ -1300,7 +1300,7 @@ function DownloadBundleDialog({
               </>
             )
             : mode === 'edit'
-            ? 'Editing reuses the same bundle id and re-downloads the role files into that bundle. Each role keeps exactly one file.'
+            ? 'Editing reuses the same bundle id. Only roles whose repo, file, or revision changed are re-downloaded; unchanged files on disk are kept. Use Delete bundle to remove everything and start over.'
             : 'Read-only view of the bundle recipe and role readiness on disk.'}
         </p>
         {mode !== 'create' && bundle ? (
diff --git a/src/client/src/pages/settings/editors/image-generation/__tests__/ImageBundleManager.test.tsx b/src/client/src/pages/settings/editors/image-generation/__tests__/ImageBundleManager.test.tsx
index e0e5c740..a1728d59 100644
--- a/src/client/src/pages/settings/editors/image-generation/__tests__/ImageBundleManager.test.tsx
+++ b/src/client/src/pages/settings/editors/image-generation/__tests__/ImageBundleManager.test.tsx
@@ -636,7 +636,7 @@ describe('ImageBundleManager', () => {
       operationId: 'op-edit',
       bundleId: 'bundle-a',
       status: 'queued',
-      roles: { diffusion: 'queued', vae: 'queued', textEncoder: 'queued' },
+      roles: { diffusion: 'queued', vae: 'ready', textEncoder: 'ready' },
     });
     (api.settings.localModels.getOperation as any).mockResolvedValue({
       operationId: 'op-edit',
@@ -662,7 +662,7 @@ describe('ImageBundleManager', () => {
     });
     fireEvent.change(diffusionFileInput, { target: { value: 'new-diff.gguf' } });
 
-    const saveButton = screen.getByRole('button', { name: /Save & re-download/i });
+    const saveButton = screen.getByRole('button', { name: /Save & update/i });
     await waitFor(() => {
       expect((saveButton as HTMLButtonElement).disabled).toBe(false);
     });
diff --git a/src/client/src/types/settings.ts b/src/client/src/types/settings.ts
index 0944fd8d..d4e6a95f 100644
--- a/src/client/src/types/settings.ts
+++ b/src/client/src/types/settings.ts
@@ -424,6 +424,8 @@ export interface LlamaRuntimeInventoryItemDto {
   routerContextSize?: number | null;
   /** Per-alias `cache-ram` (MiB) in router-models.ini when set. */
   routerCacheRamMib?: number | null;
+  runtimeFailed?: boolean;
+  runtimeExitCode?: number | null;
 }
 
 /**
diff --git a/src/server/GuideAntsApi.Tests/Services/LlamaCpp/LlamaServerRuntimeClientTests.cs b/src/server/GuideAntsApi.Tests/Services/LlamaCpp/LlamaServerRuntimeClientTests.cs
index 53cffd69..8595460f 100644
--- a/src/server/GuideAntsApi.Tests/Services/LlamaCpp/LlamaServerRuntimeClientTests.cs
+++ b/src/server/GuideAntsApi.Tests/Services/LlamaCpp/LlamaServerRuntimeClientTests.cs
@@ -75,6 +75,137 @@ public async Task LoadModelAsync_PreservesBasePathPrefix()
         handler.LastRequestUri!.ToString().Should().Be("http://localhost:8110/llama-cpp/models/load");
     }
 
+    [TestMethod]
+    public async Task LoadModelAsync_FailureMessageIncludesResponseBody()
+    {
+        var calls = 0;
+        var handler = new CapturingHandler(_ =>
+        {
+            calls++;
+            return new HttpResponseMessage(HttpStatusCode.InternalServerError)
+            {
+                ReasonPhrase = "Internal Server Error",
+                Content = new StringContent("instance name=gemma exited with status 1", Encoding.UTF8, "text/plain")
+            };
+        });
+
+        using var httpClient = new HttpClient(handler)
+        {
+            BaseAddress = new Uri("http://localhost:8110/llama-cpp/")
+        };
+
+        var client = new LlamaServerRuntimeClient(httpClient, NullLogger<LlamaServerRuntimeClient>.Instance);
+
+        var act = async () => await client.LoadModelAsync("gemma");
+
+        var ex = await act.Should().ThrowAsync<HttpRequestException>();
+        ex.Which.StatusCode.Should().Be(HttpStatusCode.InternalServerError);
+        ex.Which.Message.Should().Contain("models/load");
+        ex.Which.Message.Should().Contain("instance name=gemma exited with status 1");
+        calls.Should().Be(1);
+    }
+
+    [TestMethod]
+    public async Task ListModelsAsync_DeserializesRouterFailureFields()
+    {
+        var handler = new CapturingHandler(_ =>
+            new HttpResponseMessage(HttpStatusCode.OK)
+            {
+                Content = new StringContent(
+                    "{\"data\":[{\"id\":\"gemma\",\"status\":{\"value\":\"unloaded\"},\"failed\":true,\"exit_code\":1}]}",
+                    Encoding.UTF8,
+                    "application/json")
+            });
+
+        using var httpClient = new HttpClient(handler)
+        {
+            BaseAddress = new Uri("http://localhost:8110/llama-cpp/")
+        };
+
+        var client = new LlamaServerRuntimeClient(httpClient, NullLogger<LlamaServerRuntimeClient>.Instance);
+
+        var response = await client.ListModelsAsync();
+
+        response.Data.Should().ContainSingle();
+        response.Data[0].Failed.Should().BeTrue();
+        response.Data[0].ExitCode.Should().Be(1);
+    }
+
+    [TestMethod]
+    public async Task ListModelsAsync_RetriesTransientGatewayFailure()
+    {
+        var calls = 0;
+        var handler = new CapturingHandler(_ =>
+        {
+            calls++;
+            return calls == 1
+                ? new HttpResponseMessage(HttpStatusCode.BadGateway)
+                {
+                    ReasonPhrase = "Bad Gateway",
+                    Content = new StringContent("router is starting", Encoding.UTF8, "text/plain")
+                }
+                : new HttpResponseMessage(HttpStatusCode.OK)
+                {
+                    Content = new StringContent("{\"data\":[{\"id\":\"qwen\",\"status\":{\"value\":\"loaded\"}}]}", Encoding.UTF8, "application/json")
+                };
+        });
+
+        using var httpClient = new HttpClient(handler)
+        {
+            BaseAddress = new Uri("http://localhost:8110/llama-cpp/")
+        };
+
+        var client = new LlamaServerRuntimeClient(httpClient, NullLogger<LlamaServerRuntimeClient>.Instance);
+
+        var response = await client.ListModelsAsync();
+
+        calls.Should().Be(2);
+        response.Data.Should().ContainSingle(m => m.Id == "qwen");
+    }
+
+    [TestMethod]
+    public async Task ListModelsAsync_RetriesTransientConnectionFailure()
+    {
+        var calls = 0;
+        var handler = new CapturingHandler(_ =>
+        {
+            calls++;
+            if (calls == 1)
+            {
+                throw new HttpRequestException("Connection refused (guideants-ai:80)");
+            }
+
+            return new HttpResponseMessage(HttpStatusCode.OK)
+            {
+                Content = new StringContent("{\"data\":[]}", Encoding.UTF8, "application/json")
+            };
+        });
+
+        using var httpClient = new HttpClient(handler)
+        {
+            BaseAddress = new Uri("http://localhost:8110/llama-cpp/")
+        };
+
+        var client = new LlamaServerRuntimeClient(httpClient, NullLogger<LlamaServerRuntimeClient>.Instance);
+
+        await client.ListModelsAsync();
+
+        calls.Should().Be(2);
+    }
+
+    [TestMethod]
+    public void MapRuntimeState_PrefersRouterFailedFlagOverUnloadedStatus()
+    {
+        var state = LlamaRuntimeInventoryService.MapRuntimeState(new LlamaModelData
+        {
+            Failed = true,
+            ExitCode = 1,
+            Status = new LlamaModelStatus { Value = "unloaded" }
+        });
+
+        state.Should().Be("failed");
+    }
+
     private sealed class CapturingHandler(Func<HttpRequestMessage, HttpResponseMessage> responder) : HttpMessageHandler
     {
         private readonly Func<HttpRequestMessage, HttpResponseMessage> _responder = responder;
diff --git a/src/server/GuideAntsApi.Tests/Services/PublishedGuides/PublishedGuideCostLimitServiceTests.cs b/src/server/GuideAntsApi.Tests/Services/PublishedGuides/PublishedGuideCostLimitServiceTests.cs
index 4b707bda..02020d35 100644
--- a/src/server/GuideAntsApi.Tests/Services/PublishedGuides/PublishedGuideCostLimitServiceTests.cs
+++ b/src/server/GuideAntsApi.Tests/Services/PublishedGuides/PublishedGuideCostLimitServiceTests.cs
@@ -75,6 +75,7 @@ public async Task EnsureWithinLimitsAsync_Prioritizes_daily_limit_when_both_dail
         var notebookId = Guid.NewGuid();
         var now = DateTime.UtcNow;
         var dayStart = new DateTime(now.Year, now.Month, now.Day, 0, 0, 0, DateTimeKind.Utc);
+        var monthStart = new DateTime(now.Year, now.Month, 1, 0, 0, 0, DateTimeKind.Utc);
 
         await using (var seed = new ApplicationDbContext(options))
         {
@@ -88,7 +89,7 @@ public async Task EnsureWithinLimitsAsync_Prioritizes_daily_limit_when_both_dail
                 NotebookId = notebookId,
                 Active = true,
                 DailyChargeLimitUsd = 0.50m,
-                BillingPeriodChargeLimitUsd = 1.00m
+                BillingPeriodChargeLimitUsd = 0.50m
             });
             seed.UsageEvents.AddRange(
                 new UsageEvent
@@ -109,8 +110,8 @@ public async Task EnsureWithinLimitsAsync_Prioritizes_daily_limit_when_both_dail
                 {
                     NotebookId = notebookId,
                     ProjectId = projectId,
-                    Created = dayStart.AddDays(-2),
-                    ChargeUsd = 0.70m
+                    Created = monthStart.AddTicks(-1),
+                    ChargeUsd = 9.99m
                 });
             await seed.SaveChangesAsync();
         }
@@ -123,6 +124,6 @@ public async Task EnsureWithinLimitsAsync_Prioritizes_daily_limit_when_both_dail
         result.Allowed.Should().BeFalse();
         result.Reason.Should().Be("daily_limit_exceeded");
         result.DailyChargeUsd.Should().Be(0.70m);
-        result.BillingPeriodChargeUsd.Should().Be(1.40m);
+        result.BillingPeriodChargeUsd.Should().Be(0.70m);
     }
 }
diff --git a/src/server/GuideAntsApi/Configuration/StartupConfiguration.cs b/src/server/GuideAntsApi/Configuration/StartupConfiguration.cs
index 39d87940..f60bf171 100644
--- a/src/server/GuideAntsApi/Configuration/StartupConfiguration.cs
+++ b/src/server/GuideAntsApi/Configuration/StartupConfiguration.cs
@@ -1,6 +1,7 @@
 using Microsoft.EntityFrameworkCore;
 using Microsoft.OpenApi.Models;
 using Microsoft.OpenApi.Any;
+using System.Net.Http;
 using System.Reflection;
 using System.Security.Claims;
 using GuideAntsApi.DataModel;
@@ -149,6 +150,12 @@ private static void RegisterServices(IServiceCollection services, IConfiguration
             var baseUrl = config["BaseUrl"]
                 ?? throw new InvalidOperationException("LlamaCpp:BaseUrl is required.");
             client.BaseAddress = new Uri(baseUrl);
+        })
+        .SetHandlerLifetime(TimeSpan.FromMinutes(1))
+        .ConfigurePrimaryHttpMessageHandler(() => new SocketsHttpHandler
+        {
+            PooledConnectionLifetime = TimeSpan.FromSeconds(30),
+            PooledConnectionIdleTimeout = TimeSpan.FromSeconds(15)
         });
         services.AddHttpClient<ILlamaRuntimeAdminClient, LlamaRuntimeAdminClient>(client =>
         {
diff --git a/src/server/GuideAntsApi/Endpoints/Settings/SettingsLlamaEndpoints.cs b/src/server/GuideAntsApi/Endpoints/Settings/SettingsLlamaEndpoints.cs
index ebc4f788..1818e431 100644
--- a/src/server/GuideAntsApi/Endpoints/Settings/SettingsLlamaEndpoints.cs
+++ b/src/server/GuideAntsApi/Endpoints/Settings/SettingsLlamaEndpoints.cs
@@ -165,7 +165,11 @@ public static void MapSettingsLlamaEndpoints(this WebApplication app)
                     RouterModelId: item.RouterModelId,
                     LastLoadStartedAt: null,
                     LastLoadDurationMs: null,
-                    LastError: null));
+                    LastError: item.RuntimeFailed
+                        ? item.RuntimeExitCode is int exitCode
+                            ? $"llama-server child exited with status {exitCode}."
+                            : "llama-server child exited during model load."
+                        : null));
             }
 
             return Results.Ok((IReadOnlyList<LlamaRuntimeAliasStatusDto>)statuses);
diff --git a/src/server/GuideAntsApi/Models/Settings/SettingsDtos.cs b/src/server/GuideAntsApi/Models/Settings/SettingsDtos.cs
index 26fad133..194a5306 100644
--- a/src/server/GuideAntsApi/Models/Settings/SettingsDtos.cs
+++ b/src/server/GuideAntsApi/Models/Settings/SettingsDtos.cs
@@ -303,7 +303,9 @@ public sealed record LlamaRuntimeInventoryItemDto(
     IReadOnlyList<string> CatalogModelIds,
     int NotebookReferenceCount,
     int? RouterContextSize = null,
-    int? RouterCacheRamMib = null);
+    int? RouterCacheRamMib = null,
+    bool RuntimeFailed = false,
+    int? RuntimeExitCode = null);
 
 public sealed record StartModelDownloadRequest(
     string Repository,
diff --git a/src/server/GuideAntsApi/Services/LlamaCpp/ILlamaServerRuntimeClient.cs b/src/server/GuideAntsApi/Services/LlamaCpp/ILlamaServerRuntimeClient.cs
index 55ccca75..ea75f55c 100644
--- a/src/server/GuideAntsApi/Services/LlamaCpp/ILlamaServerRuntimeClient.cs
+++ b/src/server/GuideAntsApi/Services/LlamaCpp/ILlamaServerRuntimeClient.cs
@@ -1,3 +1,4 @@
+using System.Net;
 using System.Text;
 using System.Text.Json;
 using System.Text.Json.Nodes;
@@ -32,6 +33,14 @@ public class LlamaModelData
     [JsonPropertyName("state")]
     public string State { get; set; } = string.Empty;
 
+    // Router mode marks a child process that exited during load with these
+    // fields while status.value may still be "unloaded".
+    [JsonPropertyName("failed")]
+    public bool Failed { get; set; }
+
+    [JsonPropertyName("exit_code")]
+    public int? ExitCode { get; set; }
+
     [JsonPropertyName("meta")]
     public LlamaModelMeta? Meta { get; set; }
 }
@@ -68,6 +77,16 @@ public class LlamaOpenAiModelData
 
 public class LlamaServerRuntimeClient : ILlamaServerRuntimeClient
 {
+    private static readonly TimeSpan[] TransientRetryDelays = // one entry per retry: 6 retries, 12.75 s of backoff in total
+    [
+        TimeSpan.FromMilliseconds(250),
+        TimeSpan.FromMilliseconds(500),
+        TimeSpan.FromSeconds(1),
+        TimeSpan.FromSeconds(2),
+        TimeSpan.FromSeconds(4),
+        TimeSpan.FromSeconds(5)
+    ];
+
     private readonly HttpClient _httpClient;
     private readonly ILogger<LlamaServerRuntimeClient> _logger;
 
@@ -79,25 +98,13 @@ public LlamaServerRuntimeClient(HttpClient httpClient, ILogger<LlamaServerRuntim
 
     public async Task<LlamaModelsResponse> ListModelsAsync(CancellationToken cancellationToken = default)
     {
-        var requestPath = "models";
-        var requestUri = BuildEndpointUri(_httpClient.BaseAddress, requestPath);
-        
-        var response = await _httpClient.GetAsync(requestUri, cancellationToken);
-        var responseContent = await response.Content.ReadAsStringAsync(cancellationToken);
-        
-        response.EnsureSuccessStatusCode();
+        var responseContent = await GetStringWithTransientRetryAsync("models", cancellationToken);
         return JsonSerializer.Deserialize<LlamaModelsResponse>(responseContent) ?? new LlamaModelsResponse();
     }
 
     public async Task<LlamaOpenAiModelsResponse> ListOpenAiModelsAsync(CancellationToken cancellationToken = default)
     {
-        var requestPath = "v1/models";
-        var requestUri = BuildEndpointUri(_httpClient.BaseAddress, requestPath);
-        
-        var response = await _httpClient.GetAsync(requestUri, cancellationToken);
-        var responseContent = await response.Content.ReadAsStringAsync(cancellationToken);
-        
-        response.EnsureSuccessStatusCode();
+        var responseContent = await GetStringWithTransientRetryAsync("v1/models", cancellationToken);
         return JsonSerializer.Deserialize<LlamaOpenAiModelsResponse>(responseContent) ?? new LlamaOpenAiModelsResponse();
     }
 
@@ -120,54 +127,18 @@ public async Task LoadModelAsync(string modelPathOrPreset, JsonObject? loadParam
         }
 
         var requestPath = "models/load";
-        var requestUri = BuildEndpointUri(_httpClient.BaseAddress, requestPath);
         var requestJson = requestBody.ToJsonString();
         
-        using var request = new HttpRequestMessage(HttpMethod.Post, requestUri)
-        {
-            Content = new StringContent(requestJson, Encoding.UTF8, "application/json")
-        };
-
-        var response = await _httpClient.SendAsync(request, cancellationToken);
-        var responseContent = await response.Content.ReadAsStringAsync(cancellationToken);
-        
-        if (!response.IsSuccessStatusCode)
-        {
-            _logger.LogError(
-                "Llama runtime POST failed. Url: {RequestUri}. Status: {StatusCode}. RequestBody: {RequestBody}. ResponseBody: {ResponseBody}",
-                requestUri.ToString(),
-                (int)response.StatusCode,
-                requestJson,
-                responseContent);
-        }
-        response.EnsureSuccessStatusCode();
+        await PostJsonWithTransientRetryAsync(requestPath, requestJson, cancellationToken);
     }
 
     public async Task UnloadModelAsync(string routerModelId, CancellationToken cancellationToken = default)
     {
         var requestBody = new { model = routerModelId };
         var requestPath = "models/unload";
-        var requestUri = BuildEndpointUri(_httpClient.BaseAddress, requestPath);
         var requestJson = JsonSerializer.Serialize(requestBody);
         
-        using var request = new HttpRequestMessage(HttpMethod.Post, requestUri)
-        {
-            Content = new StringContent(requestJson, Encoding.UTF8, "application/json")
-        };
-
-        var response = await _httpClient.SendAsync(request, cancellationToken);
-        var responseContent = await response.Content.ReadAsStringAsync(cancellationToken);
-        
-        if (!response.IsSuccessStatusCode)
-        {
-            _logger.LogError(
-                "Llama runtime POST failed. Url: {RequestUri}. Status: {StatusCode}. RequestBody: {RequestBody}. ResponseBody: {ResponseBody}",
-                requestUri.ToString(),
-                (int)response.StatusCode,
-                requestJson,
-                responseContent);
-        }
-        response.EnsureSuccessStatusCode();
+        await PostJsonWithTransientRetryAsync(requestPath, requestJson, cancellationToken);
     }
 
     internal static Uri BuildEndpointUri(Uri? baseAddress, string relativePath)
@@ -181,4 +152,159 @@ internal static Uri BuildEndpointUri(Uri? baseAddress, string relativePath)
         var normalizedRelativePath = relativePath.TrimStart('/');
         return new Uri(new Uri(normalizedBaseUrl), normalizedRelativePath);
     }
+
+    // Caps text at 2000 chars (then appends "...") for exception/log messages; null/empty pass through.
+    private static string LimitForException(string value)
+    {
+        const int limit = 2000;
+        if (string.IsNullOrEmpty(value) || value.Length <= limit)
+        {
+            return value;
+        }
+        return value[..limit] + "...";
+    }
+
+    private async Task<string> GetStringWithTransientRetryAsync(
+        string requestPath,
+        CancellationToken cancellationToken)
+    {
+        var requestUri = BuildEndpointUri(_httpClient.BaseAddress, requestPath);
+        // GET with bounded retry: returns the body on 2xx; attempt is zero-based and indexes TransientRetryDelays.
+        for (var attempt = 0; ; attempt++)
+        {
+            try
+            {
+                using var response = await _httpClient.GetAsync(requestUri, cancellationToken);
+                var responseContent = await response.Content.ReadAsStringAsync(cancellationToken);
+
+                if (response.IsSuccessStatusCode)
+                {
+                    return responseContent;
+                }
+                // Transient status with retries left: back off, then loop for another attempt.
+                if (ShouldRetryStatus(response.StatusCode, attempt))
+                {
+                    await DelayBeforeRetryAsync(
+                        "GET",
+                        requestUri,
+                        attempt,
+                        $"HTTP {(int)response.StatusCode} ({response.ReasonPhrase ?? "<none>"})",
+                        cancellationToken).ConfigureAwait(false);
+                    continue;
+                }
+                // Non-retryable status or retries exhausted: log the body, then throw with the status code attached.
+                _logger.LogError(
+                    "Llama runtime GET failed. Url: {RequestUri}. Status: {StatusCode}. ResponseBody: {ResponseBody}",
+                    requestUri.ToString(),
+                    (int)response.StatusCode,
+                    responseContent);
+                throw new HttpRequestException(
+                    $"Llama runtime GET {requestPath} failed with HTTP {(int)response.StatusCode} ({response.ReasonPhrase ?? "<none>"}). ResponseBody={LimitForException(responseContent)}",
+                    null,
+                    response.StatusCode);
+            }
+            catch (Exception ex) when (ShouldRetryException(ex, cancellationToken, attempt)) // the throw above carries a StatusCode, so this filter lets it propagate
+            {
+                await DelayBeforeRetryAsync(
+                    "GET",
+                    requestUri,
+                    attempt,
+                    ex.Message,
+                    cancellationToken).ConfigureAwait(false);
+            }
+        }
+    }
+
+    private async Task PostJsonWithTransientRetryAsync(
+        string requestPath,
+        string requestJson,
+        CancellationToken cancellationToken)
+    {
+        var requestUri = BuildEndpointUri(_httpClient.BaseAddress, requestPath);
+        // POST with bounded retry: attempt is zero-based and indexes TransientRetryDelays; a new request message is built per attempt.
+        for (var attempt = 0; ; attempt++)
+        {
+            try
+            {
+                using var request = new HttpRequestMessage(HttpMethod.Post, requestUri)
+                {
+                    Content = new StringContent(requestJson, Encoding.UTF8, "application/json")
+                };
+                using var response = await _httpClient.SendAsync(request, cancellationToken);
+                var responseContent = await response.Content.ReadAsStringAsync(cancellationToken);
+
+                if (response.IsSuccessStatusCode)
+                {
+                    return;
+                }
+                // Transient status with retries left: back off, then loop for another attempt.
+                if (ShouldRetryStatus(response.StatusCode, attempt))
+                {
+                    await DelayBeforeRetryAsync(
+                        "POST",
+                        requestUri,
+                        attempt,
+                        $"HTTP {(int)response.StatusCode} ({response.ReasonPhrase ?? "<none>"})",
+                        cancellationToken).ConfigureAwait(false);
+                    continue;
+                }
+                // Non-retryable status or retries exhausted: log request and response bodies, then throw with the status code attached.
+                _logger.LogError(
+                    "Llama runtime POST failed. Url: {RequestUri}. Status: {StatusCode}. RequestBody: {RequestBody}. ResponseBody: {ResponseBody}",
+                    requestUri.ToString(),
+                    (int)response.StatusCode,
+                    requestJson,
+                    responseContent);
+                throw new HttpRequestException(
+                    $"Llama runtime POST {requestPath} failed with HTTP {(int)response.StatusCode} ({response.ReasonPhrase ?? "<none>"}). ResponseBody={LimitForException(responseContent)}",
+                    null,
+                    response.StatusCode);
+            }
+            catch (Exception ex) when (ShouldRetryException(ex, cancellationToken, attempt)) // the throw above carries a StatusCode, so this filter lets it propagate
+            {
+                await DelayBeforeRetryAsync(
+                    "POST",
+                    requestUri,
+                    attempt,
+                    ex.Message,
+                    cancellationToken).ConfigureAwait(false);
+            }
+        }
+    }
+
+    // Retry only while attempts remain and the status is transient (408, 502, 503 or 504).
+    private static bool ShouldRetryStatus(HttpStatusCode statusCode, int attempt) =>
+        attempt < TransientRetryDelays.Length
+        && statusCode is
+            HttpStatusCode.RequestTimeout
+            or HttpStatusCode.BadGateway
+            or HttpStatusCode.ServiceUnavailable
+            or HttpStatusCode.GatewayTimeout;
+
+    // True when retries remain, the caller has not cancelled, and the failure is either transport-level
+    // (HttpRequestException without a status code) or a TaskCanceledException (e.g. a client timeout).
+    private static bool ShouldRetryException(Exception ex, CancellationToken cancellationToken, int attempt) =>
+        attempt < TransientRetryDelays.Length
+        && !cancellationToken.IsCancellationRequested
+        && ex is HttpRequestException { StatusCode: null }
+            or TaskCanceledException;
+
+    // Logs a warning for the failed attempt, then waits TransientRetryDelays[attempt] (cancellable).
+    private async Task DelayBeforeRetryAsync(
+        string method,
+        Uri requestUri,
+        int attempt,
+        string reason,
+        CancellationToken cancellationToken)
+    {
+        var backoff = TransientRetryDelays[attempt];
+        var attemptNumber = attempt + 1; // attempt is zero-based; it is logged one-based
+        var maxAttempts = TransientRetryDelays.Length + 1; // the initial try plus one per delay slot
+        _logger.LogWarning(
+            "Transient llama runtime {Method} failure. Url: {RequestUri}. Attempt: {Attempt}/{MaxAttempts}. Retrying in {DelayMs} ms. Reason: {Reason}",
+            method, requestUri.ToString(), attemptNumber, maxAttempts,
+            (int)backoff.TotalMilliseconds,
+            LimitForException(reason));
+        await Task.Delay(backoff, cancellationToken);
+    }
 }
diff --git a/src/server/GuideAntsApi/Services/LlamaCpp/LlamaRuntimeInventoryService.cs b/src/server/GuideAntsApi/Services/LlamaCpp/LlamaRuntimeInventoryService.cs
index e7f98845..17604c37 100644
--- a/src/server/GuideAntsApi/Services/LlamaCpp/LlamaRuntimeInventoryService.cs
+++ b/src/server/GuideAntsApi/Services/LlamaCpp/LlamaRuntimeInventoryService.cs
@@ -145,19 +145,26 @@ public async Task<IReadOnlyList<LlamaRuntimeInventoryItemDto>> GetInventoryAsync
                 CatalogModelIds: catalogIds,
                 NotebookReferenceCount: notebookCount,
                 RouterContextSize: entry?.ContextSize,
-                RouterCacheRamMib: entry?.CacheRamMib));
+                RouterCacheRamMib: entry?.CacheRamMib,
+                RuntimeFailed: runtimeRow?.Failed ?? false,
+                RuntimeExitCode: runtimeRow?.ExitCode));
         }
 
         return results;
     }
 
-    private static string MapRuntimeState(LlamaModelData? data)
+    internal static string MapRuntimeState(LlamaModelData? data)
     {
         if (data is null)
         {
             return "unloaded";
         }
 
+        if (data.Failed)
+        {
+            return "failed";
+        }
+
         if (!string.IsNullOrWhiteSpace(data.Status?.Value))
         {
             return data.Status.Value.ToLowerInvariant();
diff --git a/start_linux.sh b/start_linux.sh
index 933bc45f..d22e2e6f 100644
--- a/start_linux.sh
+++ b/start_linux.sh
@@ -7,7 +7,7 @@ DOCKER_DIR="$ROOT_DIR/docker"
 
 MODE="install"          # install | doctor
 FIX_MODE="0"            # 0 | 1
-BACKEND_OVERRIDE=""     # cpu | cuda13 | rocm | slim
+BACKEND_OVERRIDE=""     # cpu | cuda13 | rocm | slim | vulkan
 COMPOSE_MODE="ghcr"     # ghcr | local
 HEALTH_URL="http://localhost:5107/"
 HOST_MOUNT_OVERRIDE_FILE="docker-compose.host-mounts.generated.yml"
@@ -21,7 +21,7 @@ Usage: ./start_linux.sh [options]
 Options:
   --doctor               Run checks only, do not change anything.
   --fix                  Attempt limited auto-remediation where possible.
-  --backend cpu|cuda13|rocm|slim   Force backend selection. slim is explicit only and is not auto-detected.
+  --backend cpu|cuda13|rocm|slim|vulkan   Force backend selection. slim and vulkan are explicit only and are not auto-detected.
   --compose ghcr|local   Use GHCR compose files (default) or local build files.
   --help                 Show this help.
 EOF
@@ -101,6 +101,7 @@ select_compose_file() {
       slim) COMPOSE_FILE="docker-compose.slim.yml" ;;
       cuda13) COMPOSE_FILE="docker-compose.cuda.yml" ;;
       rocm) COMPOSE_FILE="docker-compose.rocm.yml" ;;
+      vulkan) COMPOSE_FILE="docker-compose.vulkan.yml" ;;
       *) COMPOSE_FILE="docker-compose.cpu.yml" ;;
     esac
   else
@@ -108,11 +109,52 @@ select_compose_file() {
       slim) COMPOSE_FILE="docker-compose.ghcr-slim.yml" ;;
       cuda13) COMPOSE_FILE="docker-compose.ghcr-cuda13.yml" ;;
       rocm) COMPOSE_FILE="docker-compose.ghcr-rocm.yml" ;;
+      vulkan) COMPOSE_FILE="docker-compose.ghcr-vulkan.yml" ;;
       *) COMPOSE_FILE="docker-compose.ghcr-cpu.yml" ;;
     esac
   fi
 }
 
+select_vulkan_runtime() {
+  [[ "$SELECTED_BACKEND" == "vulkan" ]] || return 0
+  # Docker Desktop: nothing is exported; the built-in defaults are used as-is.
+  if docker info --format '{{.OperatingSystem}}' 2>/dev/null | grep -q 'Docker Desktop'; then
+    log "Vulkan: Docker Desktop → Mesa dzn over D3D12 (/dev/dxg). Using built-in defaults (no env)."
+    return 0
+  fi
+  # Native Linux: export GA_VULKAN_* overrides; /dev/null stands in when there is no /dev/dri.
+  local dev="/dev/null"
+  [[ -e /dev/dri ]] && dev="/dev/dri"
+  export GA_VULKAN_DEVICE="$dev"
+  export GA_VULKAN_DRIVER_LIBS="/usr/lib"
+  export GA_VULKAN_LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu"
+  # Prefer the nvidia container runtime when Docker lists one; otherwise pick a Mesa ICD from the DRM vendor id.
+  if docker info --format '{{json .Runtimes}}' 2>/dev/null | grep -q '"nvidia"'; then
+    export GA_VULKAN_RUNTIME="nvidia"
+    export GA_VULKAN_ICD="/usr/share/vulkan/icd.d/nvidia_icd.json"
+    log "Vulkan: native Linux NVIDIA → nvidia runtime injects the Vulkan ICD (device $dev)."
+  elif [[ -e /dev/dri ]]; then
+    local icd="" v  # v is the loop variable; keep it function-local like dev and icd
+    for v in /sys/class/drm/renderD*/device/vendor; do
+      [[ -r "$v" ]] || continue
+      case "$(cat "$v" 2>/dev/null)" in
+        0x1002) icd="/usr/share/vulkan/icd.d/radeon_icd.x86_64.json"; break ;;
+        0x8086) icd="/usr/share/vulkan/icd.d/intel_icd.x86_64.json";  break ;;
+      esac
+    done
+    if [[ -n "$icd" ]]; then
+      export GA_VULKAN_ICD="$icd"
+      log "Vulkan: native Linux Mesa via /dev/dri (ICD $(basename "$icd"))."
+    else
+      export GA_VULKAN_ICD="/usr/share/vulkan/icd.d/radeon_icd.x86_64.json"
+      warn "Vulkan: /dev/dri present but GPU vendor undetermined; assuming AMD RADV. Override GA_VULKAN_ICD if this is an Intel GPU."
+    fi
+  else
+    warn "Vulkan: native Linux with no nvidia runtime and no /dev/dri — no GPU device found."
+    warn "        LLM and image generation will run on CPU. Install Mesa (AMD/Intel) or the nvidia-container-toolkit (NVIDIA)."
+  fi
+}
+
 wait_for_health() {
   log "Waiting for GuideAnts UI to become reachable at $HEALTH_URL"
   for _ in $(seq 1 120); do
@@ -151,11 +193,12 @@ while [[ $# -gt 0 ]]; do
 done
 
 [[ "$COMPOSE_MODE" == "ghcr" || "$COMPOSE_MODE" == "local" ]] || fail "--compose must be ghcr or local"
-[[ -z "$BACKEND_OVERRIDE" || "$BACKEND_OVERRIDE" == "cpu" || "$BACKEND_OVERRIDE" == "cuda13" || "$BACKEND_OVERRIDE" == "rocm" || "$BACKEND_OVERRIDE" == "slim" ]] || fail "--backend must be cpu, cuda13, rocm, or slim"
+[[ -z "$BACKEND_OVERRIDE" || "$BACKEND_OVERRIDE" == "cpu" || "$BACKEND_OVERRIDE" == "cuda13" || "$BACKEND_OVERRIDE" == "rocm" || "$BACKEND_OVERRIDE" == "slim" || "$BACKEND_OVERRIDE" == "vulkan" ]] || fail "--backend must be cpu, cuda13, rocm, slim, or vulkan"
 
 check_prereqs
 detect_backend
 select_compose_file
+select_vulkan_runtime
 
 log "Selected backend: $SELECTED_BACKEND"
 log "Compose file: docker/$COMPOSE_FILE"
diff --git a/start_macos.sh b/start_macos.sh
index e5973ecd..e9cceeee 100644
--- a/start_macos.sh
+++ b/start_macos.sh
@@ -7,7 +7,7 @@ DOCKER_DIR="$ROOT_DIR/docker"
 
 MODE="install"          # install | doctor
 FIX_MODE="0"            # 0 | 1
-BACKEND_OVERRIDE=""     # cpu | cuda13 | rocm | slim
+BACKEND_OVERRIDE=""     # cpu | cuda13 | rocm | slim | vulkan
 COMPOSE_MODE="ghcr"     # ghcr | local
 HEALTH_URL="http://localhost:5107/"
 HOST_MOUNT_OVERRIDE_FILE="docker-compose.host-mounts.generated.yml"
@@ -21,7 +21,7 @@ Usage: ./start_macos.sh [options]
 Options:
   --doctor               Run checks only, do not change anything.
   --fix                  Attempt limited auto-remediation where possible.
-  --backend cpu|cuda13|rocm|slim   Force backend selection. slim is explicit only and is not auto-detected.
+  --backend cpu|cuda13|rocm|slim|vulkan   Force backend selection. slim and vulkan are explicit only and are not auto-detected.
   --compose ghcr|local   Use GHCR compose files (default) or local build files.
   --help                 Show this help.
 EOF
@@ -91,6 +91,7 @@ select_compose_file() {
       slim) COMPOSE_FILE="docker-compose.slim.yml" ;;
       cuda13) COMPOSE_FILE="docker-compose.cuda.yml" ;;
       rocm) COMPOSE_FILE="docker-compose.rocm.yml" ;;
+      vulkan) COMPOSE_FILE="docker-compose.vulkan.yml" ;;
       *) COMPOSE_FILE="docker-compose.cpu.yml" ;;
     esac
   else
@@ -98,11 +99,17 @@ select_compose_file() {
       slim) COMPOSE_FILE="docker-compose.ghcr-slim.yml" ;;
       cuda13) COMPOSE_FILE="docker-compose.ghcr-cuda13.yml" ;;
       rocm) COMPOSE_FILE="docker-compose.ghcr-rocm.yml" ;;
+      vulkan) COMPOSE_FILE="docker-compose.ghcr-vulkan.yml" ;;
       *) COMPOSE_FILE="docker-compose.ghcr-cpu.yml" ;;
     esac
   fi
 }
 
+select_vulkan_runtime() {  # macOS always runs Docker Desktop, so no GA_VULKAN_* overrides are exported
+  [[ "$SELECTED_BACKEND" == "vulkan" ]] || return 0
+  log "Vulkan: Docker Desktop on macOS → built-in defaults (no env). /dev/dxg (D3D12) is Windows/WSL2-only, so GPU acceleration may be unavailable."
+}
+
 wait_for_health() {
   log "Waiting for GuideAnts UI to become reachable at $HEALTH_URL"
   for _ in $(seq 1 120); do
@@ -139,11 +146,12 @@ while [[ $# -gt 0 ]]; do
 done
 
 [[ "$COMPOSE_MODE" == "ghcr" || "$COMPOSE_MODE" == "local" ]] || fail "--compose must be ghcr or local"
-[[ -z "$BACKEND_OVERRIDE" || "$BACKEND_OVERRIDE" == "cpu" || "$BACKEND_OVERRIDE" == "cuda13" || "$BACKEND_OVERRIDE" == "rocm" || "$BACKEND_OVERRIDE" == "slim" ]] || fail "--backend must be cpu, cuda13, rocm, or slim"
+[[ -z "$BACKEND_OVERRIDE" || "$BACKEND_OVERRIDE" == "cpu" || "$BACKEND_OVERRIDE" == "cuda13" || "$BACKEND_OVERRIDE" == "rocm" || "$BACKEND_OVERRIDE" == "slim" || "$BACKEND_OVERRIDE" == "vulkan" ]] || fail "--backend must be cpu, cuda13, rocm, slim, or vulkan"
 
 check_prereqs
 detect_backend
 select_compose_file
+select_vulkan_runtime
 
 log "Selected backend: $SELECTED_BACKEND"
 log "Compose file: docker/$COMPOSE_FILE"
diff --git a/start_windows.cmd b/start_windows.cmd
index 7f5d0559..13f46a06 100644
--- a/start_windows.cmd
+++ b/start_windows.cmd
@@ -48,7 +48,7 @@ call :fail Unknown option: %~1
 :args_done
 if /I not "%COMPOSE_MODE%"=="ghcr" if /I not "%COMPOSE_MODE%"=="local" call :fail --compose must be ghcr or local
 if not "%BACKEND_OVERRIDE%"=="" (
-  if /I not "%BACKEND_OVERRIDE%"=="cpu" if /I not "%BACKEND_OVERRIDE%"=="cuda13" if /I not "%BACKEND_OVERRIDE%"=="rocm" if /I not "%BACKEND_OVERRIDE%"=="slim" call :fail --backend must be cpu, cuda13, rocm, or slim
+  if /I not "%BACKEND_OVERRIDE%"=="cpu" if /I not "%BACKEND_OVERRIDE%"=="cuda13" if /I not "%BACKEND_OVERRIDE%"=="rocm" if /I not "%BACKEND_OVERRIDE%"=="slim" if /I not "%BACKEND_OVERRIDE%"=="vulkan" call :fail --backend must be cpu, cuda13, rocm, slim, or vulkan
 )
 
 call :log Running preflight checks...
@@ -68,6 +68,7 @@ call :check_wsl
 call :detect_backend
 call :validate_backend
 call :select_compose_file
+call :select_vulkan_runtime
 
 call :log Selected backend: %SELECTED_BACKEND%
 call :log Compose file: docker\%COMPOSE_FILE%
@@ -158,8 +159,8 @@ if not defined NVIDIA_DRIVER_VERSION (
   if not "%BACKEND_OVERRIDE%"=="" (
     call :fail Could not read NVIDIA driver version from nvidia-smi. Remove --backend cuda13 or fix NVIDIA driver/runtime.
   )
-  call :warn Could not read NVIDIA driver version from nvidia-smi. Falling back to cpu backend.
-  set "SELECTED_BACKEND=cpu"
+  call :warn Could not read NVIDIA driver version from nvidia-smi. Falling back to vulkan backend.
+  set "SELECTED_BACKEND=vulkan"
   exit /b 0
 )
 
@@ -171,8 +172,8 @@ if errorlevel 1 (
     call :fail Could not parse NVIDIA driver version "%NVIDIA_DRIVER_VERSION%". Remove --backend cuda13 or fix NVIDIA drivers.
     exit /b 1
   )
-  call :warn Could not parse NVIDIA driver version "%NVIDIA_DRIVER_VERSION%". Falling back to cpu backend.
-  set "SELECTED_BACKEND=cpu"
+  call :warn Could not parse NVIDIA driver version "%NVIDIA_DRIVER_VERSION%". Falling back to vulkan backend.
+  set "SELECTED_BACKEND=vulkan"
   exit /b 0
 )
 
@@ -182,8 +183,8 @@ if %NVIDIA_DRIVER_MAJOR_NUM% LSS 580 (
     call :fail NVIDIA driver %NVIDIA_DRIVER_VERSION% is too old for cuda13. Install R580+ driver or use --backend cpu.
     exit /b 1
   )
-  call :warn NVIDIA driver %NVIDIA_DRIVER_VERSION% is below the CUDA 13 minimum ^(R580^). Falling back to cpu backend.
-  set "SELECTED_BACKEND=cpu"
+  call :warn NVIDIA driver %NVIDIA_DRIVER_VERSION% is below the CUDA 13 minimum ^(R580^). Falling back to vulkan backend.
+  set "SELECTED_BACKEND=vulkan"
   exit /b 0
 )
 
@@ -198,6 +199,8 @@ if /I "%COMPOSE_MODE%"=="local" (
     set "COMPOSE_FILE=docker-compose.cuda.yml"
   ) else if /I "%SELECTED_BACKEND%"=="rocm" (
     set "COMPOSE_FILE=docker-compose.rocm.yml"
+  ) else if /I "%SELECTED_BACKEND%"=="vulkan" (
+    set "COMPOSE_FILE=docker-compose.vulkan.yml"
   ) else (
     set "COMPOSE_FILE=docker-compose.cpu.yml"
   )
@@ -208,12 +211,19 @@ if /I "%COMPOSE_MODE%"=="local" (
     set "COMPOSE_FILE=docker-compose.ghcr-cuda13.yml"
   ) else if /I "%SELECTED_BACKEND%"=="rocm" (
     set "COMPOSE_FILE=docker-compose.ghcr-rocm.yml"
+  ) else if /I "%SELECTED_BACKEND%"=="vulkan" (
+    set "COMPOSE_FILE=docker-compose.ghcr-vulkan.yml"
   ) else (
     set "COMPOSE_FILE=docker-compose.ghcr-cpu.yml"
   )
 )
 exit /b 0
 
+:select_vulkan_runtime
+rem Only the vulkan backend gets a note; nothing is set because Docker Desktop uses the built-in defaults.
+if /I "%SELECTED_BACKEND%"=="vulkan" call :log Vulkan: Docker Desktop -^> Mesa dzn over D3D12 (/dev/dxg). Using built-in defaults (no env).
+exit /b 0
+
 :wait_for_health
 set /a _max=120
 set /a _count=0
@@ -254,7 +264,7 @@ echo.
 echo Options:
 echo   --doctor               Run checks only, do not change anything.
 echo   --fix                  Attempt limited auto-remediation where possible.
-echo   --backend cpu^|cuda13^|rocm^|slim   Force backend selection. slim is explicit only and is not auto-detected.
+echo   --backend cpu^|cuda13^|rocm^|slim^|vulkan   Force backend selection. slim is explicit only. vulkan is auto-selected only as the fallback when the NVIDIA driver version is unreadable or below the CUDA 13 minimum.
 echo   --compose ghcr^|local   Use GHCR compose files ^(default^) or local build files.
 echo   --help                 Show this help.
 exit /b 0