diff --git a/.gitignore b/.gitignore index 7644665c..8811af62 100644 --- a/.gitignore +++ b/.gitignore @@ -434,3 +434,4 @@ docker-compose.host-mounts.generated.yml /installer/docker/volumes/content-files/* !/installer/docker/volumes/content-files/.gitkeep *.cli-auth-token +/src/client/GuideAnts.code-workspace diff --git a/README.md b/README.md index 55ba9c14..1bd94e8a 100644 --- a/README.md +++ b/README.md @@ -185,11 +185,15 @@ GuideAnts runs locally with Docker Compose. OS-specific quickstart scripts are i ```bash # Windows .\quickstart.ps1 +# or: start_windows.cmd # Linux / macOS ./quickstart.sh +# or: start_linux.sh / start_macos.sh ``` +Backends: `cuda13` (NVIDIA), `rocm` (AMD), `vulkan` (NVIDIA/AMD/Intel via Vulkan), `cpu`, and `slim` (sandbox-only, no local models). The root launchers auto-detect GPU where possible; on Windows, NVIDIA drivers below the CUDA 13 minimum (R580) fall back to `vulkan` instead of CPU. + See the [setup guide](https://github.com/Elumenotion/GuideAnts/blob/main/docs/setup-guide.md) for full instructions and the [developer config guide](https://github.com/Elumenotion/GuideAnts/blob/main/docs/developer-config-guide.md) for configuration options. 
### Documentation @@ -202,6 +206,7 @@ All documentation lives in the repository: - [Project and notebook files system](https://github.com/Elumenotion/GuideAnts/blob/main/docs/project-and-notebook-files-system.md) – file and content management - [LLaMA model management](https://github.com/Elumenotion/GuideAnts/blob/main/docs/llama-model-download-and-runtime-management.md) – local model lifecycle - [Docker build guide](https://github.com/Elumenotion/GuideAnts/blob/main/docker/guideants-ai-build.md) – building the runtime service +- [Vulkan backend guide](https://github.com/Elumenotion/GuideAnts/blob/main/docker/guideants-ai-vulkan.md) – vendor-neutral GPU (llama + image gen) on Docker Desktop and native Linux - [Full docs directory](https://github.com/Elumenotion/GuideAnts/tree/main/docs) – architecture, features, test plans, and more ## Development Entry Points diff --git a/docker/build/guideants-ai/Dockerfile.vulkan b/docker/build/guideants-ai/Dockerfile.vulkan index d39bc96f..3f3dc9a8 100644 --- a/docker/build/guideants-ai/Dockerfile.vulkan +++ b/docker/build/guideants-ai/Dockerfile.vulkan @@ -184,6 +184,13 @@ RUN curl -sSL "https://github.com/PowerShell/PowerShell/releases/download/v${PWS && chmod +x /usr/bin/pwsh \ && rm /tmp/pwsh.tar.gz +ARG NODE_MAJOR=22 +RUN curl -fsSL https://deb.nodesource.com/setup_${NODE_MAJOR}.x | bash - \ + && apt-get install -y --no-install-recommends nodejs \ + && rm -rf /var/lib/apt/lists/* \ + && node --version \ + && npx --version + ENV VIRTUAL_ENV=/opt/venv ENV PATH="$VIRTUAL_ENV/bin:$PATH" ENV PIP_DISABLE_PIP_VERSION_CHECK=1 diff --git a/docker/build/guideants-ai/entrypoint.sh b/docker/build/guideants-ai/entrypoint.sh index cbda1361..3956ddba 100644 --- a/docker/build/guideants-ai/entrypoint.sh +++ b/docker/build/guideants-ai/entrypoint.sh @@ -142,6 +142,92 @@ PY sanitize_router_preset "$ROUTER_PRESET" +# Qwen-VL needs image-min-tokens=1024 for grounding accuracy, but that value breaks +# other vision models (e.g. 
Gemma mmproj max pixels). Apply it per-alias only. +normalize_router_image_min_tokens() { + local preset_path="$1" + if [ -z "$preset_path" ] || [ ! -f "$preset_path" ]; then + return + fi + + local tmp_path="${preset_path}.image-tokens.$$" + if ! python3 - "$preset_path" "$tmp_path" <<'PY' +import re +import sys + +source_path = sys.argv[1] +output_path = sys.argv[2] + +with open(source_path, "r", encoding="utf-8") as src: + lines = src.read().splitlines() + +output: list[str] = [] +current_alias: str | None = None +section_lines: list[str] = [] + +def flush_section() -> None: + global section_lines, current_alias + if current_alias is None: + return + + cleaned: list[str] = [] + for raw_line in section_lines: + stripped = raw_line.strip() + if stripped.startswith("#") or stripped.startswith(";"): + cleaned.append(raw_line) + continue + if "=" in stripped: + key = stripped.split("=", 1)[0].strip().lower() + if key == "image-min-tokens": + continue + cleaned.append(raw_line) + + # Qwen-VL aliases only; global router --image-min-tokens breaks Gemma loads. + if re.search(r"(?i)qwen", current_alias): + insert_at = 1 if cleaned and cleaned[0].strip().startswith("[") else 0 + cleaned.insert(insert_at, "image-min-tokens = 1024") + + output.extend(cleaned) + section_lines = [] + current_alias = None + +for raw_line in lines: + stripped = raw_line.strip() + if stripped.startswith("[") and stripped.endswith("]"): + flush_section() + current_alias = stripped[1:-1].strip() + section_lines = [raw_line] + continue + + if current_alias is None: + output.append(raw_line) + continue + + section_lines.append(raw_line) + +flush_section() + +with open(output_path, "w", encoding="utf-8", newline="\n") as dst: + dst.write("\n".join(output)) + if output: + dst.write("\n") +PY + then + echo "WARNING: router preset image-min-tokens normalization failed for '${preset_path}'; continuing unchanged." 
>&2 + rm -f "$tmp_path" 2>/dev/null || true + return + fi + + if cmp -s "$preset_path" "$tmp_path"; then + rm -f "$tmp_path" 2>/dev/null || true + return + fi + + mv -f "$tmp_path" "$preset_path" +} + +normalize_router_image_min_tokens "$ROUTER_PRESET" + SCRIPT_EXECUTION_REQUIRE_TOKEN="${SCRIPT_EXECUTION_REQUIRE_TOKEN:-true}" SCRIPT_EXECUTION_ENABLE_IDENTITY_ISOLATION="${SCRIPT_EXECUTION_ENABLE_IDENTITY_ISOLATION:-true}" diff --git a/docker/build/guideants-ai/sd-service/sd_service.py b/docker/build/guideants-ai/sd-service/sd_service.py index 6b9072de..8a508572 100644 --- a/docker/build/guideants-ai/sd-service/sd_service.py +++ b/docker/build/guideants-ai/sd-service/sd_service.py @@ -97,6 +97,14 @@ def parse_positive_float(value: str | None, default: float) -> float: return parsed if parsed > 0 else default +def optional_env_value(name: str) -> str | None: + raw = os.getenv(name) + if raw is None: + return None + value = raw.strip() + return value or None + + def parse_size(size: str) -> tuple[int, int]: value = (size or "").strip().lower() if "x" not in value: @@ -176,6 +184,7 @@ class SdRuntimeConfig: sampling_method: str offload_to_cpu: bool diffusion_fa: bool + vulkan_visible_devices: str | None default_output_format: str startup_warmup_fail_open: bool @@ -242,18 +251,12 @@ def _require_non_empty(cls, value: str) -> str: @field_validator("bundle_id") @classmethod def _validate_bundle_id(cls, value: str) -> str: - if not BUNDLE_ID_RE.fullmatch(value): - raise ValueError("bundle_id must match ^[A-Za-z0-9][A-Za-z0-9._-]{0,127}$") - return value + return validate_bundle_id(value) @field_validator("diffusion_file", "vae_file", "text_encoder_file") @classmethod - def _reject_globs(cls, value: str) -> str: - if "*" in value or "?" in value: - raise ValueError( - "must be a single filename (no '*' or '?' 
glob metacharacters)" - ) - return value + def _validate_bundle_filename(cls, value: str) -> str: + return validate_bundle_filename(value) class SdRuntimeState: @@ -299,6 +302,42 @@ def __init__(self) -> None: ENGINE_LOCK = threading.Lock() +def validate_bundle_filename(value: str) -> str: + filename = (value or "").strip() + if not filename: + raise ValueError("must be a non-empty string") + if "*" in filename or "?" in filename: + raise ValueError( + "must be a single filename (no '*' or '?' glob metacharacters)" + ) + if ( + filename in {".", ".."} + or os.path.isabs(filename) + or os.path.basename(filename) != filename + or "/" in filename + or "\\" in filename + ): + raise ValueError("must be a single filename with no path separators") + return filename + + +def validate_bundle_id(value: str) -> str: + candidate = (value or "").strip() + if not BUNDLE_ID_RE.fullmatch(candidate): + raise ValueError("bundle_id must match ^[A-Za-z0-9][A-Za-z0-9._-]{0,127}$") + return candidate + + +def resolve_bundle_dir(model_dir: str, bundle_id: str) -> str: + safe_bundle_id = validate_bundle_id(bundle_id) + root_real = os.path.realpath(bundle_root_dir(model_dir)) + bundle_path = os.path.realpath(os.path.join(root_real, safe_bundle_id)) + root_prefix = root_real if root_real.endswith(os.sep) else root_real + os.sep + if not bundle_path.startswith(root_prefix): + raise ValueError("resolved bundle path escapes the permitted bundle directory") + return bundle_path + + def resolve_runtime_config() -> SdRuntimeConfig: model_dir = os.getenv("GA_SD_MODEL_DIR", "/models-local/sd") @@ -368,6 +407,7 @@ def _only_file(path: str, role: str) -> str: sampling_method = (os.getenv("GA_SD_SAMPLING_METHOD") or "euler").strip() or "euler" offload_to_cpu = env_flag("GA_SD_OFFLOAD_TO_CPU", False) diffusion_fa = env_flag("GA_SD_DIFFUSION_FA", True) + vulkan_visible_devices = optional_env_value("GA_SD_VK_VISIBLE_DEVICES") default_output_format = 
normalize_output_format(os.getenv("GA_SD_DEFAULT_OUTPUT_FORMAT"), "png") startup_warmup_fail_open = env_flag("GA_SD_WARMUP_FAIL_OPEN_ON_STARTUP", True) @@ -401,6 +441,7 @@ def _only_file(path: str, role: str) -> str: sampling_method=sampling_method, offload_to_cpu=offload_to_cpu, diffusion_fa=diffusion_fa, + vulkan_visible_devices=vulkan_visible_devices, default_output_format=default_output_format, startup_warmup_fail_open=startup_warmup_fail_open, ) @@ -424,13 +465,13 @@ def read_active_bundle(model_dir: str) -> str | None: with open(marker, "r", encoding="utf-8") as handle: payload = json.load(handle) bundle_id = payload.get("bundleId") - return str(bundle_id) if bundle_id else None + return validate_bundle_id(str(bundle_id)) if bundle_id else None except Exception: return None def expected_bundle_paths(model_dir: str, bundle_id: str) -> dict[str, str]: - base = os.path.join(bundle_root_dir(model_dir), bundle_id) + base = resolve_bundle_dir(model_dir, bundle_id) return { "diffusion": os.path.join(base, "diffusion"), "vae": os.path.join(base, "vae"), @@ -439,17 +480,11 @@ def expected_bundle_paths(model_dir: str, bundle_id: str) -> dict[str, str]: def bundle_definition_file(model_dir: str, bundle_id: str) -> str: - return os.path.join(bundle_root_dir(model_dir), bundle_id, "bundle-definition.json") + return os.path.join(resolve_bundle_dir(model_dir, bundle_id), "bundle-definition.json") def write_bundle_definition_payload(model_dir: str, bundle_id: str, payload: dict[str, Any]) -> None: - candidate_bundle_id = (bundle_id or "").strip() - if not BUNDLE_ID_RE.fullmatch(candidate_bundle_id): - raise ValueError("invalid bundle_id") - root_real = os.path.realpath(bundle_root_dir(model_dir)) - bundle_path = os.path.realpath(os.path.join(root_real, candidate_bundle_id)) - if not bundle_path.startswith(root_real + os.sep): - raise ValueError("resolved bundle path escapes the permitted bundle directory") + bundle_path = resolve_bundle_dir(model_dir, bundle_id) 
os.makedirs(bundle_path, exist_ok=True) target = os.path.join(bundle_path, "bundle-definition.json") temp = f"{target}.{uuid.uuid4().hex}.tmp" @@ -458,9 +493,10 @@ def write_bundle_definition_payload(model_dir: str, bundle_id: str, payload: dic os.replace(temp, target) -def bundle_definition_payload(request: DownloadBundleRequest) -> dict[str, Any]: +def bundle_definition_payload(request: DownloadBundleRequest, bundle_id: str | None = None) -> dict[str, Any]: + safe_bundle_id = validate_bundle_id(bundle_id or request.bundle_id) return { - "bundleId": request.bundle_id, + "bundleId": safe_bundle_id, "revision": request.revision, "updatedAtUtc": utc_now_iso(), "roles": { @@ -474,8 +510,9 @@ def bundle_definition_payload(request: DownloadBundleRequest) -> dict[str, Any]: } -def write_bundle_definition(model_dir: str, request: DownloadBundleRequest) -> None: - write_bundle_definition_payload(model_dir, request.bundle_id, bundle_definition_payload(request)) +def write_bundle_definition(model_dir: str, request: DownloadBundleRequest, bundle_id: str | None = None) -> None: + safe_bundle_id = validate_bundle_id(bundle_id or request.bundle_id) + write_bundle_definition_payload(model_dir, safe_bundle_id, bundle_definition_payload(request, safe_bundle_id)) def _normalize_bundle_definition(payload: Any) -> dict[str, Any] | None: @@ -687,17 +724,122 @@ def list_bundles(model_dir: str) -> list[dict[str, Any]]: return bundles +def _normalize_bundle_revision(revision: str | None) -> str | None: + text = (revision or "").strip() + return text or None + + +def _previous_role_spec( + previous_definition: dict[str, Any] | None, role: str +) -> tuple[str, str] | None: + if previous_definition is None: + return None + roles = previous_definition.get("roles") + if not isinstance(roles, dict): + return None + role_payload = roles.get(role) + if not isinstance(role_payload, dict): + return None + repo = str(role_payload.get("repo") or "").strip() + filename = str(role_payload.get("file") 
or "").strip() + if not repo or not filename: + return None + return repo, filename + + +def _bundle_revision_unchanged( + previous_definition: dict[str, Any] | None, request: DownloadBundleRequest +) -> bool: + if previous_definition is None: + return False + previous = _normalize_bundle_revision(previous_definition.get("revision")) + incoming = _normalize_bundle_revision(request.revision) + return previous == incoming + + +def bundle_role_download_needed( + previous_definition: dict[str, Any] | None, + request: DownloadBundleRequest, + role: str, + repo: str, + filename: str, +) -> bool: + if previous_definition is None: + return True + if not _bundle_revision_unchanged(previous_definition, request): + return True + previous = _previous_role_spec(previous_definition, role) + if previous is None: + return True + return previous != (repo, filename) + + +def resolve_role_file_path(target_path: str, filename: str) -> str: + safe_filename = validate_bundle_filename(filename) + role_dir = os.path.realpath(target_path) + candidate = os.path.realpath(os.path.join(role_dir, safe_filename)) + role_prefix = role_dir if role_dir.endswith(os.sep) else role_dir + os.sep + if not candidate.startswith(role_prefix): + raise ValueError("resolved role file path escapes the permitted role directory") + return candidate + + +def role_expected_file_ready(target_path: str, filename: str) -> bool: + try: + expected_file = resolve_role_file_path(target_path, filename) + except ValueError: + return False + return os.path.isfile(expected_file) + + +def clear_stale_role_files(target_path: str, filename: str) -> None: + """ + Remove files left by a prior recipe (e.g. a renamed gguf) without wiping + the role directory so huggingface_hub can resume interrupted downloads. 
+ """ + if not os.path.isdir(target_path): + return + safe_filename = validate_bundle_filename(filename) + try: + for name in os.listdir(target_path): + file_path = os.path.join(target_path, name) + if os.path.isfile(file_path) and name != safe_filename: + os.remove(file_path) + except OSError: + shutil.rmtree(target_path) + + +def resolve_initial_bundle_role_states( + previous_definition: dict[str, Any] | None, + request: DownloadBundleRequest, + paths: dict[str, str], +) -> dict[str, str]: + roles = { + "diffusion": (request.diffusion_repo, request.diffusion_file), + "vae": (request.vae_repo, request.vae_file), + "textEncoder": (request.text_encoder_repo, request.text_encoder_file), + } + states: dict[str, str] = {} + for role, (repo, filename) in roles.items(): + if bundle_role_download_needed(previous_definition, request, role, repo, filename): + states[role] = "queued" + elif role_expected_file_ready(paths[role], filename): + states[role] = "ready" + else: + states[role] = "queued" + return states + + def start_bundle_download(request: DownloadBundleRequest, model_dir: str) -> dict[str, Any]: + bundle_id = validate_bundle_id(request.bundle_id) + previous_definition = read_bundle_definition(model_dir, bundle_id) + paths = expected_bundle_paths(model_dir, bundle_id) operation_id = uuid.uuid4().hex operation = { "operationId": operation_id, - "bundleId": request.bundle_id, + "bundleId": bundle_id, "status": "queued", - "roles": { - "diffusion": "queued", - "vae": "queued", - "textEncoder": "queued", - }, + "roles": resolve_initial_bundle_role_states(previous_definition, request, paths), "error": None, } with BUNDLE_OPS_LOCK: @@ -705,11 +847,11 @@ def start_bundle_download(request: DownloadBundleRequest, model_dir: str) -> dic try: # Persist the declared bundle recipe up front so operators can read and # edit the definition even if a download fails mid-way. 
- write_bundle_definition(model_dir, request) + write_bundle_definition(model_dir, request, bundle_id) except Exception as exc: log_event( "sd_bundle_definition_write_failed", - bundleId=request.bundle_id, + bundleId=bundle_id, error=truncate_text(str(exc), 2048), ) @@ -730,19 +872,20 @@ def _run() -> None: "vae": (request.vae_repo, request.vae_file), "textEncoder": (request.text_encoder_repo, request.text_encoder_file), } - paths = expected_bundle_paths(model_dir, request.bundle_id) for role, (repo, filename) in roles.items(): + target_path = paths[role] + if ( + not bundle_role_download_needed(previous_definition, request, role, repo, filename) + and role_expected_file_ready(target_path, filename) + ): + with BUNDLE_OPS_LOCK: + BUNDLE_OPERATIONS[operation_id]["roles"][role] = "ready" + continue + with BUNDLE_OPS_LOCK: BUNDLE_OPERATIONS[operation_id]["status"] = "running" BUNDLE_OPERATIONS[operation_id]["roles"][role] = "downloading" - target_path = paths[role] - # Replacing an existing bundle definition must leave exactly one - # file per role. Clear the role directory first so a filename - # change cannot leave stale files behind. - if os.path.isdir(target_path): - shutil.rmtree(target_path) - elif os.path.exists(target_path): - os.remove(target_path) + clear_stale_role_files(target_path, filename) os.makedirs(target_path, exist_ok=True) snapshot_download( repo_id=repo, @@ -756,7 +899,7 @@ def _run() -> None: # snapshot_download with allow_patterns silently produces an # empty directory if the filename does not exist in the repo. # Turn that into a loud failure so the operator sees it. 
- expected_file = os.path.join(target_path, filename) + expected_file = resolve_role_file_path(target_path, filename) if not os.path.isfile(expected_file): raise RuntimeError( f"Expected file '{filename}' was not produced by " @@ -808,6 +951,13 @@ def build_sd_server_command(config: SdRuntimeConfig) -> list[str]: return command +def build_sd_server_environment(config: SdRuntimeConfig) -> dict[str, str]: + env = os.environ.copy() + if config.vulkan_visible_devices is not None: + env["GGML_VK_VISIBLE_DEVICES"] = config.vulkan_visible_devices + return env + + def is_engine_process_alive() -> bool: process = STATE.engine_process return process is not None and process.poll() is None @@ -987,11 +1137,12 @@ def start_engine() -> tuple[bool, str | None]: vaePath=config.vae_path, llmPath=config.llm_path, bundleId=STATE.loaded_bundle_id, + vulkanVisibleDevices=config.vulkan_visible_devices, command=command, ) try: - STATE.engine_process = subprocess.Popen(command) + STATE.engine_process = subprocess.Popen(command, env=build_sd_server_environment(config)) STATE.engine_started_at_utc = utc_now_iso() wait_for_engine_ready(config) STATE.loaded_at_utc = utc_now_iso() @@ -1671,6 +1822,7 @@ async def health() -> dict[str, Any]: "samplingMethod": config.sampling_method, "offloadToCpu": config.offload_to_cpu, "diffusionFa": config.diffusion_fa, + "vulkanVisibleDevices": config.vulkan_visible_devices, }, "engine": { "startedAtUtc": STATE.engine_started_at_utc, @@ -1737,10 +1889,10 @@ def _require_model_dir() -> str: def require_valid_bundle_id(bundle_id: str) -> str: - candidate = (bundle_id or "").strip() - if not BUNDLE_ID_RE.fullmatch(candidate): + try: + return validate_bundle_id(bundle_id) + except ValueError: raise HTTPException(status_code=400, detail="invalid bundle_id") - return candidate @APP.get("/admin/bundles") @@ -1971,15 +2123,13 @@ async def admin_unload() -> JSONResponse: @APP.delete("/admin/bundles/{bundle_id}") async def admin_delete_bundle(bundle_id: str) -> 
JSONResponse: model_dir = _require_model_dir() - bundle_id = (bundle_id or "").strip() - if not BUNDLE_ID_RE.fullmatch(bundle_id): - raise HTTPException(status_code=400, detail="invalid bundle_id") + bundle_id = require_valid_bundle_id(bundle_id) if read_active_bundle(model_dir) == bundle_id: raise HTTPException(status_code=409, detail="cannot remove active bundle") - root_real = os.path.realpath(bundle_root_dir(model_dir)) - target = os.path.realpath(os.path.join(root_real, bundle_id)) - if not target.startswith(root_real + os.sep): + try: + target = resolve_bundle_dir(model_dir, bundle_id) + except ValueError: raise HTTPException(status_code=400, detail="invalid bundle_id") if not os.path.exists(target): raise HTTPException(status_code=404, detail="bundle not found") diff --git a/docker/build/guideants-ai/start-llama.sh b/docker/build/guideants-ai/start-llama.sh index 30940a5d..df6f8652 100644 --- a/docker/build/guideants-ai/start-llama.sh +++ b/docker/build/guideants-ai/start-llama.sh @@ -70,6 +70,11 @@ fi [ -n "$GA_LLAMA_THREADS" ] && ARGS="$ARGS --threads $GA_LLAMA_THREADS" [ -n "$GA_LLAMA_PARALLEL" ] && ARGS="$ARGS --parallel $GA_LLAMA_PARALLEL" [ -n "$GA_LLAMA_GPU_LAYERS" ] && ARGS="$ARGS --n-gpu-layers $GA_LLAMA_GPU_LAYERS" +# Vulkan can fail scheduler reservation when KV-cache tensors are placed on +# the GPU for some model families. Keep this as an explicit env-controlled +# base preset because router mode propagates it to child instances. +[ "$GA_LLAMA_KV_OFFLOAD" = "0" ] && ARGS="$ARGS --no-kv-offload" +[ "$GA_LLAMA_KV_OFFLOAD" = "1" ] && ARGS="$ARGS --kv-offload" [ "$GA_LLAMA_KV_UNIFIED" = "1" ] && ARGS="$ARGS --kv-unified" [ "$GA_LLAMA_JINJA" = "1" ] && ARGS="$ARGS --jinja" [ "$GA_LLAMA_CONT_BATCH" = "1" ] && ARGS="$ARGS --cont-batching" @@ -78,11 +83,12 @@ fi # preset into every spawned child instance (unlike ctx-size/cache-ram, which # are left per-alias). 
--flash-attn takes a literal value (on|off|auto); # cache-type-v quantization requires flash attention to be enabled. The -# image-min-tokens knob only affects vision (mmproj) models. +# image-min-tokens is per-alias in router-models.ini (Qwen-VL only). Do not set +# GA_LLAMA_IMAGE_MIN_TOKENS globally — it propagates to every child and breaks +# models whose mmproj image_max_pixels is below the 1024-token floor. [ -n "$GA_LLAMA_FLASH_ATTN" ] && ARGS="$ARGS --flash-attn $GA_LLAMA_FLASH_ATTN" [ -n "$GA_LLAMA_CACHE_TYPE_K" ] && ARGS="$ARGS --cache-type-k $GA_LLAMA_CACHE_TYPE_K" [ -n "$GA_LLAMA_CACHE_TYPE_V" ] && ARGS="$ARGS --cache-type-v $GA_LLAMA_CACHE_TYPE_V" -[ -n "$GA_LLAMA_IMAGE_MIN_TOKENS" ] && ARGS="$ARGS --image-min-tokens $GA_LLAMA_IMAGE_MIN_TOKENS" # --tensor-split sets the per-GPU layer proportion (comma list, e.g. "7,1"). # Indices follow this process's visible-device order: with # GA_LLAMA_CUDA_VISIBLE_DEVICES=1,0 the FIRST proportion targets physical GPU 1 diff --git a/docker/docker-compose.ghcr-vulkan.yml b/docker/docker-compose.ghcr-vulkan.yml new file mode 100644 index 00000000..03f57dc0 --- /dev/null +++ b/docker/docker-compose.ghcr-vulkan.yml @@ -0,0 +1,292 @@ +name: guideants + +services: + mssql-express: + image: ${GA_MSSQL_IMAGE:-ghcr.io/elumenotion/mssql2025-express-fts:main} + pull_policy: always + container_name: mssql-express + environment: + - ACCEPT_EULA=Y + - MSSQL_SA_PASSWORD=${GA_SQL_SA_PASSWORD:-YourStrong!Passw0rd} + healthcheck: + test: ["CMD-SHELL", "/opt/mssql-tools18/bin/sqlcmd -S localhost -U sa -P '${GA_SQL_SA_PASSWORD:-YourStrong!Passw0rd}' -Q 'SELECT 1' -C -No | grep -q 1"] + interval: 5s + timeout: 5s + retries: 30 + start_period: 30s + cpus: "4.0" + volumes: + - mssql_runtime_state:/var/opt/mssql + - mssql_data:/var/opt/mssql/data + - mssql_ftdata:/var/opt/mssql/FTData + - mssql_log:/var/opt/mssql/log + restart: unless-stopped + + guideants-ai: + image: ${GA_AI_GHCR_IMAGE:-ghcr.io/elumenotion/guideants-ai-vulkan:main} + 
pull_policy: always + container_name: guideants-ai + cap_drop: + - SYS_PTRACE + # ---- Vulkan GPU wiring (cross-OS) ----------------------------------------- + # Defaults target Windows / Docker Desktop, where GPU Vulkan for EVERY vendor + # goes Vulkan -> Mesa dzn -> D3D12 -> /dev/dxg (works from git bash, no WSL). + # Native-Linux hosts override the GA_VULKAN_* vars; the installer sets them + # automatically (AMD/Intel -> /dev/dri + RADV/ANV, NVIDIA -> nvidia runtime + + # toolkit-injected ICD). See docker/guideants-ai-vulkan.md. + runtime: ${GA_VULKAN_RUNTIME:-runc} # 'nvidia' only on native-Linux NVIDIA + group_add: + - video # /dev/dri render perms on native Linux (harmless on Windows) + - render + devices: + - ${GA_VULKAN_DEVICE:-/dev/dxg} # Windows: /dev/dxg ; Linux AMD/Intel: /dev/dri + shm_size: "8gb" + volumes: + - ${GA_VULKAN_DRIVER_LIBS:-/usr/lib/wsl}:/usr/lib/wsl:ro + - ai_local_models-new:/models-local + - script_agent_admin_state:/var/lib/guideants/script-agent-admin + - type: bind + source: ${GA_CONTENT_FILES_HOST_PATH:-./volumes/content-files} + target: /app/ContentFiles + environment: + - FILE_STORAGE_ROOT=/app/ContentFiles + # Vulkan loader wiring. VK_DRIVER_FILES is pinned to ONE ICD so the CPU + # software rasterizer (llvmpipe) is never a candidate — the stack uses the + # GPU or fails loudly, never a silent CPU fallback. Default = dzn (Windows). + # Installer overrides GA_VULKAN_ICD on Linux: RADV/ANV for AMD/Intel, + # nvidia_icd.json (toolkit-injected) for NVIDIA. + - LD_LIBRARY_PATH=${GA_VULKAN_LD_LIBRARY_PATH:-/usr/lib/wsl/lib} + - VK_DRIVER_FILES=${GA_VULKAN_ICD:-/usr/share/vulkan/icd.d/dzn_icd.json} + - VK_ICD_FILENAMES=${GA_VULKAN_ICD:-/usr/share/vulkan/icd.d/dzn_icd.json} + - MESA_D3D12_DEFAULT_ADAPTER_NAME=${MESA_D3D12_DEFAULT_ADAPTER_NAME:-NVIDIA} + - GGML_VK_VISIBLE_DEVICES=${GGML_VK_VISIBLE_DEVICES:-} + # Used only when GA_VULKAN_RUNTIME=nvidia (native-Linux NVIDIA); harmless otherwise. 
+ - NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-all} + - NVIDIA_DRIVER_CAPABILITIES=${NVIDIA_DRIVER_CAPABILITIES:-graphics,compute,utility} + - HF_TOKEN=${HF_TOKEN:-} + - SCRIPT_EXECUTION_AGENT_TOKEN=${GA_SCRIPT_AGENT_TOKEN:-dev-script-agent-token} + - SCRIPT_EXECUTION_ADMIN_TOKEN=${GA_SCRIPT_AGENT_ADMIN_TOKEN:-dev-script-agent-admin-token} + - SCRIPT_EXECUTION_ADMIN_STATE_DIR=/var/lib/guideants/script-agent-admin + - SCRIPT_EXECUTION_SCOPE_STATE_ROOT=/var/lib/guideants/script-agent-admin/scopes + - SCRIPT_EXECUTION_REQUIRE_TOKEN=true + - GA_LLAMA_MODELS_PRESET=/models-local/router-models.ini + - GA_LLAMA_MODEL_DIR=/models-local/llama + - GA_LLAMA_ADMIN_PORT=8086 + - GA_LLAMA_MODELS_MAX=1 + - GA_LLAMA_NO_AUTOLOAD=1 + - GA_LLAMA_CTX_SIZE=262144 + - GA_LLAMA_THREADS=16 + - GA_LLAMA_PARALLEL=5 + - GA_LLAMA_CACHE_RAM=8192 + # Vulkan currently trips llama.cpp's scheduler for some model families when + # KV cache tensors are placed on Vulkan buffers. Keep KV offload disabled + # by default; LLAMA_ARG_* lets current images pick this up without rebuild. + - GA_LLAMA_KV_OFFLOAD=${GA_LLAMA_KV_OFFLOAD:-0} + - LLAMA_ARG_KV_OFFLOAD=${GA_LLAMA_KV_OFFLOAD:-0} + # Keep unified KV opt-in for Vulkan as a separate conservative default. 
+ - GA_LLAMA_KV_UNIFIED=${GA_LLAMA_KV_UNIFIED:-0} + - GA_LLAMA_JINJA=1 + - GA_LLAMA_CONT_BATCH=1 + - GA_LLAMA_NO_MMAP=0 + - GA_ASR_HOST=127.0.0.1 + - GA_ASR_PORT=8082 + - GA_ASR_MODEL_DIR=/models-local/asr + - GA_ASR_DEFAULT_MODEL_PATH=Qwen3-ASR-0.6B + - GA_ASR_DEFAULT_MODEL_ID=Qwen/Qwen3-ASR-0.6B + - GA_ASR_AUTO_LOAD_ON_STARTUP=${GA_ASR_AUTO_LOAD_ON_STARTUP:-1} + - GA_ASR_WAIT_FOR_READY_ON_STARTUP=${GA_ASR_WAIT_FOR_READY_ON_STARTUP:-0} + - GA_ASR_READY_TIMEOUT_SECONDS=${GA_ASR_READY_TIMEOUT_SECONDS:-1800} + - GA_ASR_DEVICE_MAP=auto + - GA_ASR_DTYPE=bfloat16 + - GA_ASR_MAX_INFERENCE_BATCH_SIZE=8 + - GA_ASR_MAX_NEW_TOKENS=512 + - GA_ASR_WARMUP_ON_LOAD=${GA_ASR_WARMUP_ON_LOAD:-1} + - GA_ASR_WARMUP_AUDIO_PATH=${GA_ASR_WARMUP_AUDIO_PATH:-/app/asr-service/warmup.webm} + - GA_ASR_WARMUP_LANGUAGE=${GA_ASR_WARMUP_LANGUAGE:-} + - GA_TTS_HOST=127.0.0.1 + - GA_TTS_PORT=8084 + - GA_TTS_MODEL_DIR=/models-local/tts + - GA_TTS_DEFAULT_MODEL_PATH=VibeVoice-1.5B + - GA_TTS_DEFAULT_MODEL_ID=microsoft/VibeVoice-1.5B + - GA_TTS_TOKENIZER_PATH=Qwen2.5-1.5B-tokenizer + - GA_TTS_TOKENIZER_ID=Qwen/Qwen2.5-1.5B + - GA_TTS_AUTO_LOAD_ON_STARTUP=${GA_TTS_AUTO_LOAD_ON_STARTUP:-1} + - GA_TTS_WAIT_FOR_READY_ON_STARTUP=${GA_TTS_WAIT_FOR_READY_ON_STARTUP:-0} + - GA_TTS_READY_TIMEOUT_SECONDS=${GA_TTS_READY_TIMEOUT_SECONDS:-1800} + - GA_TTS_DEVICE_MAP=auto + - GA_TTS_DTYPE=bfloat16 + - GA_TTS_TIMEOUT_SECONDS=300 + - GA_TTS_MAX_NEW_TOKENS=512 + - GA_TTS_SAMPLE_RATE=24000 + - GA_TTS_DEFAULT_VOICE_SECONDS=1.0 + - GA_EMB_HOST=127.0.0.1 + - GA_EMB_PORT=8085 + - GA_EMB_MODEL_DIR=/models-local/emb + - GA_EMB_DEFAULT_MODEL_PATH=${GA_EMB_DEFAULT_MODEL_PATH} + - GA_EMB_DEVICE=${GA_EMB_DEVICE:-cpu} + - GA_EMB_FIX_MISTRAL_REGEX=${GA_EMB_FIX_MISTRAL_REGEX:-1} + - GA_EMB_AUTO_LOAD_ON_STARTUP=${GA_EMB_AUTO_LOAD_ON_STARTUP} + - GA_EMB_WARMUP_ON_LOAD=${GA_EMB_WARMUP_ON_LOAD} + - GA_EMB_WAIT_FOR_READY_ON_STARTUP=${GA_EMB_WAIT_FOR_READY_ON_STARTUP:-0} + - 
GA_EMB_READY_TIMEOUT_SECONDS=${GA_EMB_READY_TIMEOUT_SECONDS:-1800} + - GA_MEDIA_HOST=127.0.0.1 + - GA_MEDIA_PORT=8087 + - GA_SD_HOST=127.0.0.1 + - GA_SD_PORT=8083 + - GA_SD_MODEL_DIR=/models-local/sd + - GA_SD_TIMEOUT_SECONDS=900 + - GA_SD_ENGINE_REQUEST_TIMEOUT_SECONDS=${GA_SD_ENGINE_REQUEST_TIMEOUT_SECONDS:-120} + - GA_SD_STEPS=4 + - GA_SD_CFG_SCALE=1.0 + - GA_SD_STRENGTH=0.75 + - GA_SD_OFFLOAD_TO_CPU=${GA_SD_OFFLOAD_TO_CPU:-0} + - GA_SD_DIFFUSION_FA=1 + # Optional SD-only Vulkan device selector. Empty means inherit the + # container-wide GGML_VK_VISIBLE_DEVICES value used by llama too. + - GA_SD_VK_VISIBLE_DEVICES=${GA_SD_VK_VISIBLE_DEVICES:-} + - GA_SD_AUTO_LOAD_ON_STARTUP=${GA_SD_AUTO_LOAD_ON_STARTUP:-1} + - GA_SD_WARMUP_PROMPT=${GA_SD_WARMUP_PROMPT:-startup-warmup} + - GA_SD_WARMUP_SIZE=${GA_SD_WARMUP_SIZE:-512x512} + - GA_SD_WARMUP_STEPS=${GA_SD_WARMUP_STEPS:-1} + - GA_SD_WARMUP_OUTPUT_FORMAT=${GA_SD_WARMUP_OUTPUT_FORMAT:-png} + - GA_SD_WARMUP_REQUEST_TIMEOUT_SECONDS=${GA_SD_WARMUP_REQUEST_TIMEOUT_SECONDS:-180} + - GA_SD_WARMUP_FAIL_OPEN_ON_STARTUP=${GA_SD_WARMUP_FAIL_OPEN_ON_STARTUP:-1} + - GA_SD_WAIT_FOR_READY_ON_STARTUP=${GA_SD_WAIT_FOR_READY_ON_STARTUP:-0} + - GA_SD_READY_TIMEOUT_SECONDS=${GA_SD_READY_TIMEOUT_SECONDS:-1800} + networks: + - guideants-network + restart: unless-stopped + + docling-serve: + image: ${DOCLING_SERVE_CPU_IMAGE:-quay.io/docling-project/docling-serve-cpu:v1.16.1} + pull_policy: always + container_name: docling-serve + environment: + - DOCLING_SERVE_MAX_SYNC_WAIT=${DOCLING_SERVE_MAX_SYNC_WAIT:-600} + networks: + - guideants-network + restart: unless-stopped + + documentserver: + image: ${GA_DOCUMENTSERVER_IMAGE:-documentserver:latest} + container_name: documentserver + environment: + - JWT_ENABLED=${GA_DOCUMENTSERVER_JWT_ENABLED:-false} + - JWT_SECRET=${DOCUMENTSERVER_JWT_SECRET:-} + - JWT_HEADER=${GA_DOCUMENTSERVER_JWT_HEADER:-Authorization} + - JWT_IN_BODY=${GA_DOCUMENTSERVER_JWT_IN_BODY:-false} + - ALLOW_PRIVATE_IP_ADDRESS=true 
+ healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/info/info.json"] + interval: 30s + retries: 5 + start_period: 60s + timeout: 10s + networks: + - guideants-network + restart: unless-stopped + + guideants-webapi-ui: + image: ${GA_WEBAPI_UI_SLIM_GHCR_IMAGE:-ghcr.io/elumenotion/guideants-webapi-ui-slim:main} + pull_policy: always + container_name: guideants-webapi-ui + depends_on: + mssql-express: + condition: service_healthy + guideants-ai: + condition: service_started + ports: + - "5107:8080" + volumes: + - type: bind + source: ${GA_CONTENT_FILES_HOST_PATH:-./volumes/content-files} + target: /app/ContentFiles + environment: + - ASPNETCORE_URLS=http://127.0.0.1:8081 + - ASPNETCORE_ENVIRONMENT=Development + # Idle session window (minutes). Default 30 days; the sliding renewal re-issues + # the cookie past the halfway point so active sessions never hard-expire. + - Jwt__LifetimeMinutes=${GA_JWT_LIFETIME_MINUTES:-43200} + - API_RUNTIME_CONTEXT=compose-ghcr-vulkan + - ConnectionStrings__DefaultConnection=Server=mssql-express,1433;Initial Catalog=${GA_DB_NAME:-guideants-dev};Persist Security Info=False;User ID=sa;Password=${GA_SQL_SA_PASSWORD:-YourStrong!Passw0rd};MultipleActiveResultSets=True;Encrypt=False;TrustServerCertificate=True;Connection Timeout=30;ConnectRetryCount=3;ConnectRetryInterval=5; + - FileStorage__Path=/app/ContentFiles + - Ui__RootPath=/app/ui + - Ui__DevServerUrl= + - ALLOWED_ORIGINS=* + - SearXngSearch__BaseUrl=http://searxng:8080 + - BrowserRendering__BaseUrl=http://searxng:8080 + - LocalServiceHosts__SpeechTranscriptionBaseUrl=http://guideants-ai:80 + - LocalServiceHosts__SpeechSynthesisBaseUrl=http://guideants-ai:80 + - LocalServiceHosts__ImageGenerationBaseUrl=http://guideants-ai:80 + - LocalServiceHosts__EmbeddingsBaseUrl=http://guideants-ai:80 + - LocalServiceHosts__MediaBaseUrl=http://guideants-ai:80 + - LocalServiceHosts__DocumentIntelligenceBaseUrl=http://docling-serve:5001 + - 
DocumentServer__Enabled=${GA_DOCUMENTSERVER_ENABLED:-false} + - DocumentServer__InternalUrl=http://documentserver + - DocumentServer__ApiBaseUrl=http://guideants-webapi-ui:8080 + - DocumentServer__JwtEnabled=${GA_DOCUMENTSERVER_JWT_ENABLED:-false} + - DocumentServer__JwtSecret=${DOCUMENTSERVER_JWT_SECRET:-} + - DocumentServer__JwtHeader=${GA_DOCUMENTSERVER_JWT_HEADER:-Authorization} + - DocumentServer__JwtInBody=${GA_DOCUMENTSERVER_JWT_IN_BODY:-false} + - ScriptExecution__AgentToken=${GA_SCRIPT_AGENT_TOKEN:-dev-script-agent-token} + - ScriptExecution__AdminToken=${GA_SCRIPT_AGENT_ADMIN_TOKEN:-dev-script-agent-admin-token} + - LlamaCpp__BaseUrl=http://guideants-ai:80/llama-cpp + - ServiceRouting__Containers__guideants-ai__BaseUrl=http://guideants-ai:80/sandbox + - ServiceRouting__Containers__plantuml__BaseUrl=http://plantuml:80 + - HF_TOKEN=${HF_TOKEN:-} + - SettingsSecrets__ActiveKeyId=local-dev + - SettingsSecrets__Keys__local-dev=MDEyMzQ1Njc4OUFCQ0RFRjAxMjM0NTY3ODlBQkNERUY= + - Logging__LogLevel__GuideAntsApi.Services.Components.SpeechTranscriptionService=Information + - Logging__LogLevel__GuideAntsApi.Services.Components.SpeechSynthesisService=Information + networks: + - guideants-network + - default + restart: unless-stopped + + plantuml: + image: ${GA_PLANTUML_GHCR_IMAGE:-ghcr.io/elumenotion/guideants-plantuml:main} + pull_policy: always + container_name: plantuml + environment: + - FILE_STORAGE_ROOT=/app/ContentFiles + - SCRIPT_EXECUTION_AGENT_TOKEN=${GA_SCRIPT_AGENT_TOKEN:-dev-script-agent-token} + - SCRIPT_EXECUTION_REQUIRE_TOKEN=true + - PLANTUML_LIMIT_SIZE=8192 + volumes: + - type: bind + source: ${GA_CONTENT_FILES_HOST_PATH:-./volumes/content-files} + target: /app/ContentFiles + networks: + - guideants-network + restart: unless-stopped + + searxng: + image: ${GA_SEARXNG_GHCR_IMAGE:-ghcr.io/elumenotion/guideants-searxng:main} + pull_policy: always + container_name: readweb-searxng + restart: unless-stopped + volumes: + - type: bind + source: 
${GA_SEARXNG_CONFIG_HOST_PATH:-./volumes/searxng/config} + target: /etc/searxng + - type: bind + source: ${GA_SEARXNG_DATA_HOST_PATH:-./volumes/searxng/data} + target: /var/cache/searxng + environment: + - FORCE_OWNERSHIP=true + - BROWSER_RENDER_PORT=3001 + networks: + - guideants-network + +volumes: + mssql_runtime_state: + mssql_data: + mssql_ftdata: + mssql_log: + ai_local_models-new: + script_agent_admin_state: + +networks: + guideants-network: + driver: bridge + diff --git a/docker/docker-compose.vulkan.yml b/docker/docker-compose.vulkan.yml index 9f7eb3a7..68e0f9ea 100644 --- a/docker/docker-compose.vulkan.yml +++ b/docker/docker-compose.vulkan.yml @@ -74,6 +74,7 @@ services: # to the NVIDIA discrete GPU; set "Radeon"/"Intel" or use # GGML_VK_VISIBLE_DEVICES to target another GPU. Ignored by non-dzn ICDs. - MESA_D3D12_DEFAULT_ADAPTER_NAME=${MESA_D3D12_DEFAULT_ADAPTER_NAME:-NVIDIA} + - GGML_VK_VISIBLE_DEVICES=${GGML_VK_VISIBLE_DEVICES:-} # Used only when GA_VULKAN_RUNTIME=nvidia (native-Linux NVIDIA); harmless otherwise. - NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-all} - NVIDIA_DRIVER_CAPABILITIES=${NVIDIA_DRIVER_CAPABILITIES:-graphics,compute,utility} @@ -95,7 +96,13 @@ services: - GA_LLAMA_THREADS=16 - GA_LLAMA_PARALLEL=5 - GA_LLAMA_CACHE_RAM=8192 - - GA_LLAMA_KV_UNIFIED=1 + # Vulkan currently trips llama.cpp's scheduler for some model families when + # KV cache tensors are placed on Vulkan buffers. Keep KV offload disabled + # by default; LLAMA_ARG_* lets current images pick this up without rebuild. + - GA_LLAMA_KV_OFFLOAD=${GA_LLAMA_KV_OFFLOAD:-0} + - LLAMA_ARG_KV_OFFLOAD=${GA_LLAMA_KV_OFFLOAD:-0} + # Keep unified KV opt-in for Vulkan as a separate conservative default. + - GA_LLAMA_KV_UNIFIED=${GA_LLAMA_KV_UNIFIED:-0} - GA_LLAMA_JINJA=1 - GA_LLAMA_CONT_BATCH=1 - GA_LLAMA_NO_MMAP=0 @@ -158,6 +165,9 @@ services: - GA_SD_STRENGTH=0.75 - GA_SD_OFFLOAD_TO_CPU=${GA_SD_OFFLOAD_TO_CPU:-0} - GA_SD_DIFFUSION_FA=1 + # Optional SD-only Vulkan device selector. 
Empty means inherit the + # container-wide GGML_VK_VISIBLE_DEVICES value used by llama too. + - GA_SD_VK_VISIBLE_DEVICES=${GA_SD_VK_VISIBLE_DEVICES:-} - GA_SD_AUTO_LOAD_ON_STARTUP=${GA_SD_AUTO_LOAD_ON_STARTUP:-0} - GA_SD_WARMUP_PROMPT=${GA_SD_WARMUP_PROMPT:-startup-warmup} - GA_SD_WARMUP_SIZE=${GA_SD_WARMUP_SIZE:-512x512} diff --git a/docker/guideants-ai-build.md b/docker/guideants-ai-build.md index 80348626..68626fae 100644 --- a/docker/guideants-ai-build.md +++ b/docker/guideants-ai-build.md @@ -108,7 +108,7 @@ Workflow implementation details: - `deps-rocm` -> runtime dependency image (no compiler toolchain) - `final-rocm` -> runtime image on top of `deps-rocm` (or an externally tagged deps image) -- `runtime-vulkan-base` -> OS/runtime base on `ghcr.io/ggml-org/llama.cpp:server-vulkan` (Ubuntu 26.04), plus the universal GPU driver layer (`mesa-vulkan-drivers` + libglvnd/EGL libs) that makes one image work on NVIDIA, AMD, and Intel +- `runtime-vulkan-base` -> OS/runtime base on `ghcr.io/ggml-org/llama.cpp:server-vulkan` (Ubuntu 26.04), plus the universal GPU driver layer (`mesa-vulkan-drivers` + libglvnd/EGL libs) that makes one image work on NVIDIA, AMD, and Intel; also installs Node.js 22 (`npx`) for `mcp+sandbox://` MCP servers, matching the other full AI images - `pydeps-vulkan-builder` -> Python dependency build stage (includes build toolchain) - `deps-vulkan` -> runtime dependency image (no compiler toolchain) - `final-vulkan` -> runtime image on top of `deps-vulkan` (or an externally tagged deps image) @@ -588,6 +588,7 @@ Startup loading behavior is configurable per service through environment variabl - `0`: skip SD readiness monitoring on startup - `GA_SD_READY_TIMEOUT_SECONDS` (default `1800`) - `GA_SD_CUDA_VISIBLE_DEVICES` (optional explicit SD physical GPU pinning; empty value means inherit global ordering) +- `GA_SD_VK_VISIBLE_DEVICES` (optional SD-only Vulkan device selector; empty value means inherit `GGML_VK_VISIBLE_DEVICES`) - 
`GA_EMB_TARGET_DEVICES` (default `cuda:0,cuda:1`; logical indices interpreted after CUDA remapping) Default compose behavior starts gateway-backed services in parallel. Optional readiness checks are non-blocking monitors so one service startup does not block others. diff --git a/docker/guideants-ai-vulkan.md b/docker/guideants-ai-vulkan.md index 19929dab..e0c0d9de 100644 --- a/docker/guideants-ai-vulkan.md +++ b/docker/guideants-ai-vulkan.md @@ -23,6 +23,9 @@ present, falling back to CPU for everything else. | **`vulkan`** | **Vulkan** | **NVIDIA + AMD + Intel** | **CPU wheels** | | `slim` | — | any (no local model runtime) | CPU wheels | +Like the other full `guideants-ai` images, the Vulkan image bakes in **Node.js 22** +(`node` / `npx`) so `mcp+sandbox://` package MCP servers can run inside the container. + ## How the GPU is reached The Vulkan binaries are vendor-neutral; what differs per host is **which device node** the @@ -135,6 +138,13 @@ environment: With no env set (Windows) this resolves to the dzn/`/dev/dxg` path. The `NVIDIA_*` vars matter only when `GA_VULKAN_RUNTIME=nvidia` (native-Linux NVIDIA) and are harmless otherwise. +Vulkan leaves llama.cpp KV-cache offload disabled by default +(`GA_LLAMA_KV_OFFLOAD=0`, propagated as `LLAMA_ARG_KV_OFFLOAD=0`) because current +Vulkan router child processes can abort during startup for some model families when +KV tensors are placed on a Vulkan buffer. Unified KV is also kept opt-in on this backend +(`GA_LLAMA_KV_UNIFIED=0`). Set `GA_LLAMA_KV_OFFLOAD=1` or `GA_LLAMA_KV_UNIFIED=1` +explicitly to retest either path with a newer upstream llama.cpp build. + > **Note:** the bare-file default targets Windows. On a native-Linux host *without* the > `GA_VULKAN_*` env set, `${GA_VULKAN_DEVICE:-/dev/dxg}` resolves to `/dev/dxg`, which doesn't > exist there — so use the installer (which sets the env) or export the Linux values yourself. 
@@ -221,6 +231,12 @@ docker logs guideants-ai 2>&1 | grep -i vulkan On Windows, `MESA_D3D12_DEFAULT_ADAPTER_NAME` (default `NVIDIA`) sets which adapter dzn lists first; `GGML_VK_VISIBLE_DEVICES` picks/splits among enumerated devices on any platform. +Stable Diffusion can be pinned independently from llama with `GA_SD_VK_VISIBLE_DEVICES`. +Leave it empty to inherit the container-wide `GGML_VK_VISIBLE_DEVICES`; set it when SD should +use a different Vulkan device. For example, `GGML_VK_VISIBLE_DEVICES=1` and +`GA_SD_VK_VISIBLE_DEVICES=0` keeps llama on Vulkan device 1 while the SD `sd-server` +subprocess uses Vulkan device 0. + ## Publishing The Vulkan image publishes to GHCR alongside the other backends: diff --git a/docs/developer-config-guide.md b/docs/developer-config-guide.md index 81ed5b61..d66541d8 100644 --- a/docs/developer-config-guide.md +++ b/docs/developer-config-guide.md @@ -50,9 +50,10 @@ NuGet packages restore automatically on `dotnet build` / `dotnet run`. - **NVIDIA driver R580+** plus the **NVIDIA Container Toolkit** — enables the `cuda13` backend. The Windows launcher enforces driver major ≥ 580 via `nvidia-smi`. - Driver: - Container Toolkit: -- **AMD GPU + ROCm-capable driver** — enables the experimental `rocm` backend. +- **AMD GPU + ROCm-capable driver** — enables the `rocm` backend. - -- With neither installed, the launcher auto-selects the `cpu` backend. +- **Vulkan-capable GPU** (NVIDIA, AMD, or Intel) — enables the `vulkan` backend for GPU-accelerated llama + image generation on Docker Desktop (Windows/macOS) and native Linux. On Windows, when NVIDIA is detected but the driver is below R580 (CUDA 13 minimum), the launcher falls back to `vulkan` instead of `cpu`. See [`docker/guideants-ai-vulkan.md`](../docker/guideants-ai-vulkan.md). +- With none of the above, the launcher auto-selects the `cpu` backend. ## Optional secrets @@ -92,8 +93,9 @@ These are needed regardless of which lane you work in. 
Optional accelerators: -- **NVIDIA driver R580+** (with NVIDIA Container Toolkit) → enables `cuda13` backend. `start_windows.cmd` enforces driver major ≥ 580 via `nvidia-smi`. -- **AMD GPU + ROCm-capable driver** → enables experimental `rocm` backend. +- **NVIDIA driver R580+** (with NVIDIA Container Toolkit) → enables `cuda13` backend. `start_windows.cmd` enforces driver major ≥ 580 via `nvidia-smi`; below R580 on Windows, the launcher selects `vulkan` instead. +- **AMD GPU + ROCm-capable driver** → enables `rocm` backend. +- **Vulkan GPU** (any vendor) → use `--backend vulkan` for vendor-neutral llama + SD acceleration; Linux launchers set `GA_VULKAN_*` automatically when needed. - Otherwise → `cpu` backend is selected automatically. Optional secrets: @@ -261,7 +263,7 @@ Already covered above: Docker, Compose plugin, optional GPU runtime, ~60 GB disk | Image | Build tool | Extra pre-requisites | |---|---|---| | `guideants-webapi-ui` (`docker/build/webapi-ui/Dockerfile`) | `build_webapi_ui.ps1` / `.sh` | Requires the client UI to be built first via `npm run browser:build:docker` (produces `src/client/dist-browser/`). Multi-stage uses `mcr.microsoft.com/dotnet/sdk:8.0`. | -| `guideants-ai` (`docker/build/guideants-ai/Dockerfile.{cpu,cuda,rocm,slim}`) | `build_guideants_ai.ps1` / `.sh` | Requires `dotnet publish` of `ScriptExecutionAgent` (so a host .NET 8 SDK is needed even though Dockerfiles also have an SDK stage) and BuildKit (`DOCKER_BUILDKIT=1`). CPU/CUDA/ROCm are full local AI variants; `slim` is the sandbox-oriented AI image for Python script execution without starting local model runtime services. The AI image also bakes the script-agent admin assets and `ga-script-exec` privacy wrapper. For cache export (`--cache-to`), use `desktop-linux` context and enable Docker Desktop containerd image store. 
| +| `guideants-ai` (`docker/build/guideants-ai/Dockerfile.{cpu,cuda,rocm,vulkan,slim}`) | `build_guideants_ai.ps1` / `.sh` | Requires `dotnet publish` of `ScriptExecutionAgent` (so a host .NET 8 SDK is needed even though Dockerfiles also have an SDK stage) and BuildKit (`DOCKER_BUILDKIT=1`). CPU/CUDA/ROCm/Vulkan are full local AI variants; each bakes Node.js 22 for `mcp+sandbox://` package MCP. `slim` is the sandbox-oriented AI image for Python script execution without starting local model runtime services. The AI image also bakes the script-agent admin assets and `ga-script-exec` privacy wrapper. For cache export (`--cache-to`), use `desktop-linux` context and enable Docker Desktop containerd image store. | | `mssql2025-express-fts` (`docker/build/mssql-fts/Dockerfile`) | `-All` switch on the AI build script | Standard Docker build. | | `plantuml-1.2025.2` (`docker/build/Sandboxes/PlantUml/dockerfile`) | `-All` switch on the AI build script | Standard Docker build. | | `guideants-searxng` | `docker compose build searxng` | Repo-root build context. | @@ -316,6 +318,7 @@ GA_SEARXNG_DATA_HOST_PATH=./volumes/searxng/data GA_AI_CUDA_IMAGE=guideants-ai:cuda13-26132.1047 GA_AI_CPU_IMAGE=guideants-ai:cpu-26126.1012 GA_AI_ROCM_IMAGE=guideants-ai:rocm-26131.2226 +GA_AI_VULKAN_IMAGE=guideants-ai:vulkan-latest GA_EMB_DEFAULT_MODEL_PATH=harrier-oss-v1-0.6b GA_EMB_AUTO_LOAD_ON_STARTUP=1 GA_EMB_WARMUP_ON_LOAD=1 @@ -353,7 +356,7 @@ Conflicts with local dev API on `5106` and Vite dev server on `5173` — both ar 2. Run `start_windows.cmd` (or the `.sh` equivalent). 3. Wait for `http://localhost:5107/` — launcher opens browser. -The launcher auto-picks `cuda13`, `rocm`, or `cpu` and pulls GHCR images. No SDKs required. +The launcher auto-picks `cuda13`, `rocm`, `vulkan` (Windows NVIDIA below R580), or `cpu` and pulls GHCR images. No SDKs required. Use `--backend vulkan` to force the vendor-neutral GPU stack. 
### 5b) Client-only dev (UI work against dockerized API) @@ -379,7 +382,7 @@ The launcher auto-picks `cuda13`, `rocm`, or `cpu` and pulls GHCR images. No SDK 1. Everything in 5c. 2. BuildKit enabled (`DOCKER_BUILDKIT=1` — the build scripts set this). -3. For CUDA AI image builds: NVIDIA container runtime + enough disk for multi-stage CUDA 13 images. +3. For CUDA AI image builds: NVIDIA container runtime + enough disk for multi-stage CUDA 13 images. For Vulkan builds: see [`docker/guideants-ai-vulkan.md`](../docker/guideants-ai-vulkan.md). 4. PowerShell scripts at `docker/build/build_guideants_ai.ps1` for AI backends and `docker/build/build_support_images.ps1` for MSSQL FTS, PlantUML, SearXNG, and WebAPI/UI. --- @@ -392,9 +395,9 @@ A few items worth confirming explicitly before onboarding a new dev: - **No pinned Node engine** in `src/client/package.json` — pin to avoid drift (Vite 6 + Vitest 4 + Electron 41 → Node 20.x or 22.x). - **`appsettings.example.json` is not auto-copied** to `appsettings.json` — first-time devs must do this manually and replace the `SettingsSecrets` key. - **`docker/.env` ships with stale-style local image tags** (`guideants-ai:cuda13-26132.1047` etc.) — irrelevant if you use the GHCR compose files but will fail `docker compose up` on the `local` compose files unless you build them first. -- **CUDA 13 needs NVIDIA R580+ drivers** — the Windows launcher enforces this; manual `docker compose` does not. +- **CUDA 13 needs NVIDIA R580+ drivers** — the Windows launcher enforces this for `cuda13` and falls back to `vulkan` when the driver is older; manual `docker compose` does not auto-select. - **HF token must be set in UI** (`Settings → Connections → HuggingFace`); the API does *not* accept per-request token overrides per the setup guide. -- **Python**: there is no required host Python install. 
`src/python/pptx` runs inside the `ScriptExecutionAgent`/sandbox containers; Python 3.11 is baked into the `guideants-ai` images, including the sandbox-oriented `slim` AI variant. Script executions use per-`project + guide` venvs in the `script_agent_admin_state` volume, layered over the image's `/opt/venv` packages. Only install Python on the host if you specifically want to iterate on `src/python/pptx` outside Docker. +- **Python**: there is no required host Python install. `src/python/pptx` runs inside the `ScriptExecutionAgent`/sandbox containers; Python 3.11 and Node.js 22 are baked into the full `guideants-ai` images (cpu/cuda/rocm/vulkan), including `npx` for package MCP. The sandbox-oriented `slim` AI variant also includes Node.js. Script executions use per-`project + guide` venvs in the `script_agent_admin_state` volume, layered over the image's `/opt/venv` packages. Only install Python on the host if you specifically want to iterate on `src/python/pptx` outside Docker. --- @@ -402,10 +405,10 @@ A few items worth confirming explicitly before onboarding a new dev: | Lane | Mandatory | Optional | |---|---|---| -| **Run only** | Docker + Compose, ~60 GB disk, curl, WSL2 (Win) | NVIDIA R580+ or AMD ROCm GPU, HF token | +| **Run only** | Docker + Compose, ~60 GB disk, curl, WSL2 (Win) | NVIDIA R580+, AMD ROCm GPU, Vulkan GPU (`--backend vulkan`), HF token | | **Client dev** | Node 20+/22+, npm, `.env.*` files, a running API | Electron 41 (desktop), ANALYZE=true tooling | | **Server dev** | .NET 8 SDK, PowerShell 7+ (cross-platform), `appsettings*.json`, SQL Server (containerized) | EF CLI tools for migrations, Azure Speech key (if testing Azure speech path), local/admin test accounts for role-gated endpoint checks | -| **Docker builds** | All of the above + BuildKit, GPU runtime for CUDA/ROCm image builds, dotnet publish for ScriptExecutionAgent | GHCR write access (only for publish workflows) | +| **Docker builds** | All of the above + BuildKit, GPU runtime 
for CUDA/ROCm/Vulkan image builds, dotnet publish for ScriptExecutionAgent | GHCR write access (only for publish workflows) | --- diff --git a/docs/host-mounts-execution/docker-gate.md b/docs/host-mounts-execution/docker-gate.md index 98be2813..a3dc3f17 100644 --- a/docs/host-mounts-execution/docker-gate.md +++ b/docs/host-mounts-execution/docker-gate.md @@ -21,7 +21,8 @@ override generator. - **Base compose files** live in `docker/` (selected by `start_*`): `docker-compose.ghcr-cpu.yml` (default), `…ghcr-cuda13.yml`, `…ghcr-rocm.yml`, - `…ghcr-slim.yml`, and the local-build variants (`docker-compose.cpu.yml`, etc.). + `…ghcr-vulkan.yml`, `…ghcr-slim.yml`, and the local-build variants + (`docker-compose.cpu.yml`, `docker-compose.vulkan.yml`, etc.). - **Generated override** (new): `docker/docker-compose.host-mounts.generated.yml`. It mounts each configured source into the **affected services** (DECISIONS D2; default `guideants-webapi-ui;guideants-ai;plantuml`) at diff --git a/docs/local-ai-setup-guide.md b/docs/local-ai-setup-guide.md index 905e973b..0dca33aa 100644 --- a/docs/local-ai-setup-guide.md +++ b/docs/local-ai-setup-guide.md @@ -4,6 +4,8 @@ Last validated: 2026-05-05 This guide configures GuideAnts for fully local AI using the Setup Wizard only. If you only need Python sandbox/script execution and plan to use cloud/provider AI for model calls, use the explicit `--backend slim` stack instead of this local model setup. +For GPU acceleration without CUDA 13 or ROCm, start the stack with `--backend vulkan` (see [`docker/guideants-ai-vulkan.md`](../docker/guideants-ai-vulkan.md)). Vulkan GPU-accelerates llama and image generation; ASR, TTS, and embeddings still run on CPU inside the image. + ## Prerequisites 1. GuideAnts is running at `http://localhost:5107`. 
diff --git a/docs/setup-guide.md b/docs/setup-guide.md index a4981db8..40b98530 100644 --- a/docs/setup-guide.md +++ b/docs/setup-guide.md @@ -1,6 +1,6 @@ # GuideAnts Setup Guide -Last updated: 2026-06-05 +Last updated: 2026-06-30 This is the setup-first operator guide for GuideAnts. Use it to get a working environment from zero to usable chat/services, then use linked docs for deeper architecture details. @@ -21,7 +21,7 @@ Use the root launcher script for your OS: What these scripts do: - Validate Docker + Docker Compose. -- Auto-detect backend (`cuda13` when NVIDIA is available, `rocm` when AMD/ROCm is available, otherwise `cpu`). The `slim` backend is explicit only. +- Auto-detect backend (`cuda13` when NVIDIA + R580+ drivers are available, `rocm` when AMD/ROCm is available, `vulkan` on Windows when NVIDIA is present but below R580, otherwise `cpu`). The `slim` and `vulkan` backends are also available via explicit `--backend`. - Choose compose stack (`ghcr` by default, `local` optional). - Start the stack and wait for `http://localhost:5107/`. @@ -29,7 +29,7 @@ Useful options: - `--doctor` (checks only, no startup) - `--fix` (limited auto-remediation) -- `--backend cpu|cuda13|rocm|slim` (force backend; `slim` is the sandbox-oriented stack) +- `--backend cpu|cuda13|rocm|slim|vulkan` (force backend; `slim` is sandbox-only; `vulkan` is vendor-neutral GPU for llama + image gen) - `--compose ghcr|local` (prebuilt GHCR vs local images) If the launcher gets you to `http://localhost:5107/`, skip to section 5 for first-user auth bootstrap and initial wizard flow. @@ -38,13 +38,14 @@ If the launcher gets you to `http://localhost:5107/`, skip to section 5 for firs GuideAnts runs as a Docker Compose stack on a single host. Pick the stack by deciding two things: -1. Whether model runtimes should run locally (`cpu`, `cuda13`, `rocm`) or elsewhere (`slim`). +1. Whether model runtimes should run locally (`cpu`, `cuda13`, `rocm`, `vulkan`) or elsewhere (`slim`). 2. 
Whether images should be pulled from GHCR (`--compose ghcr`) or built locally first (`--compose local`). | Backend | Best for | Compose files | Web/API/SQL shape | AI runtime shape | |---------|----------|---------------|-------------------|------------------| -| `cuda13` | Local AI on NVIDIA GPUs. | `docker-compose.ghcr-cuda13.yml` or `docker-compose.cuda.yml` | Split stack: API/UI plus separate SQL Server. | Full local AI services. | -| `rocm` | Experimental local AI on AMD/ROCm. | `docker-compose.ghcr-rocm.yml` or `docker-compose.rocm.yml` | Split stack: API/UI plus separate SQL Server. | Full local AI services. | +| `cuda13` | Local AI on NVIDIA GPUs (CUDA 13, driver R580+). | `docker-compose.ghcr-cuda13.yml` or `docker-compose.cuda.yml` | Split stack: API/UI plus separate SQL Server. | Full local AI services (CUDA). | +| `rocm` | Local AI on AMD/ROCm. | `docker-compose.ghcr-rocm.yml` or `docker-compose.rocm.yml` | Split stack: API/UI plus separate SQL Server. | Full local AI services (HIP for llama/SD; CPU torch for ASR/TTS/emb). | +| `vulkan` | Local AI on NVIDIA, AMD, or Intel via Vulkan (one image). Best on Docker Desktop (Windows/macOS) and native Linux. | `docker-compose.ghcr-vulkan.yml` or `docker-compose.vulkan.yml` | Split stack: API/UI plus separate SQL Server. | Full local AI services (Vulkan GPU for llama + SD; CPU torch for ASR/TTS/emb). Includes Node.js 22 for `mcp+sandbox://` MCP servers. | | `cpu` | Local AI without GPU acceleration. | `docker-compose.ghcr-cpu.yml` or `docker-compose.cpu.yml` | Split stack: API/UI plus separate SQL Server. | Full local AI services. | | `slim` | Python sandbox users who use cloud/provider AI for model calls. | `docker-compose.ghcr-slim.yml` or `docker-compose.slim.yml` | Combined `guideants-webapi-ui-mssql`; no separate `mssql-express` service. | `guideants-ai slim`: sandbox/media only. 
| @@ -52,8 +53,8 @@ The services you see depend on that stack: | Service | Image/source | Role | |---------|---------------|------| -| `mssql-express` | `mssql2025-express-fts` | SQL Server database for split-stack `cpu`, `cuda13`, and `rocm` deployments. Not present in the slim stack because SQL Server is bundled into `guideants-webapi-ui-mssql`. | -| `guideants-ai` | `ghcr.io/elumenotion/guideants-ai-{cpu,cuda13,rocm}:latest` (or local tag); `guideants-ai-slim` for the slim stack | Full variants are the local AI gateway: llama.cpp, ASR, TTS, image generation, embeddings, media, script execution. The slim AI variant is for Python sandbox/script execution without starting local model runtime services. | +| `mssql-express` | `mssql2025-express-fts` | SQL Server database for split-stack `cpu`, `cuda13`, `rocm`, and `vulkan` deployments. Not present in the slim stack because SQL Server is bundled into `guideants-webapi-ui-mssql`. | +| `guideants-ai` | `ghcr.io/elumenotion/guideants-ai-{cpu,cuda13,rocm,vulkan}:latest` (or local tag); `guideants-ai-slim` for the slim stack | Full variants are the local AI gateway: llama.cpp, ASR, TTS, image generation, embeddings, media, script execution (with Node.js 22 for package MCP). The slim AI variant is for Python sandbox/script execution without starting local model runtime services. | | `docling-serve` | `quay.io/docling-project/docling-serve-cpu:v1.21.0` by default | Local document intelligence / markdown extraction. The `cpu` in this image tag is Docling's CPU image variant, not the GuideAnts backend selection. Healthcheck: `GET /version`. | | `documentserver` | `${GA_DOCUMENTSERVER_IMAGE}` from `docker/.env` | DocumentServer used for in-app Office document display and full editing in project/notebook file flows. | | `guideants-webapi-ui` / `guideants-webapi-ui-slim` / `guideants-webapi-ui-mssql` | Stack-specific API/UI image | Main API plus bundled browser UI at `http://localhost:5107`. 
`guideants-webapi-ui-slim` is API/UI-only for split stacks; it is not the slim AI stack. | @@ -94,7 +95,8 @@ Settings top-level tab order (current): - Docker Desktop (Windows/macOS) or Docker Engine 24+ with Compose plugin. - Windows PowerShell 7+ for `docker/llama/run/*.ps1` helper scripts. -- For CUDA local AI: NVIDIA drivers + container runtime support. +- For CUDA local AI: NVIDIA drivers (R580+) + container runtime support. +- For Vulkan local AI: Vulkan-capable GPU; Docker Desktop on Windows/macOS (Mesa dzn over D3D12), or Mesa RADV/ANV or nvidia-container-toolkit on native Linux. See [`docker/guideants-ai-vulkan.md`](../docker/guideants-ai-vulkan.md). - Disk budget: ~60 GB minimum for common local model sets. ### Images and compose mode @@ -102,7 +104,7 @@ Settings top-level tab order (current): You can run in either mode: - `ghcr` mode (default in launcher): pulls prebuilt images via `docker/docker-compose.ghcr-*.yml`. -- `local` mode: uses `docker/docker-compose.{cpu,cuda,rocm,slim}.yml`; build GuideAnts local images first when needed. Third-party images such as Docling or DocumentServer may still be pulled if the exact tag is not already present locally. +- `local` mode: uses `docker/docker-compose.{cpu,cuda,rocm,vulkan,slim}.yml`; build GuideAnts local images first when needed. Third-party images such as Docling or DocumentServer may still be pulled if the exact tag is not already present locally. The slim stack is selected with `--backend slim` and uses `docker/docker-compose.slim.yml` locally or `docker/docker-compose.ghcr-slim.yml` in GHCR mode. It uses the combined Web/API/SQL image (`guideants-webapi-ui-mssql`) plus the sandbox-oriented AI image (`guideants-ai slim`). It does not use `guideants-webapi-ui-slim`; that image is orthogonal and remains the API/UI image for split-stack deployments. 
@@ -143,6 +145,7 @@ Local images: - CUDA: `docker/docker-compose.cuda.yml` - CPU: `docker/docker-compose.cpu.yml` - ROCm: `docker/docker-compose.rocm.yml` +- Vulkan: `docker/docker-compose.vulkan.yml` - Slim: `docker/docker-compose.slim.yml` GHCR images: @@ -150,6 +153,7 @@ GHCR images: - CUDA: `docker/docker-compose.ghcr-cuda13.yml` - CPU: `docker/docker-compose.ghcr-cpu.yml` - ROCm: `docker/docker-compose.ghcr-rocm.yml` +- Vulkan: `docker/docker-compose.ghcr-vulkan.yml` - Slim: `docker/docker-compose.ghcr-slim.yml` ### Example startup commands @@ -173,6 +177,12 @@ GHCR images: # GHCR ROCm docker compose -f docker/docker-compose.ghcr-rocm.yml up -d +# local Vulkan + docker compose -f docker/docker-compose.vulkan.yml up -d + +# GHCR Vulkan + docker compose -f docker/docker-compose.ghcr-vulkan.yml up -d + # local slim docker compose -f docker/docker-compose.slim.yml up -d @@ -628,4 +638,5 @@ Read in this order: 8. [`llama-model-download-and-runtime-management.md`](llama-model-download-and-runtime-management.md) 9. [`telemetry-configuration.md`](telemetry-configuration.md) 10. [`../docker/guideants-ai-build.md`](../docker/guideants-ai-build.md) -11. [`../docker/build-processes.md`](../docker/build-processes.md) +11. [`../docker/guideants-ai-vulkan.md`](../docker/guideants-ai-vulkan.md) +12. 
[`../docker/build-processes.md`](../docker/build-processes.md) diff --git a/installer/docker/docker-compose.ghcr-vulkan.yml b/installer/docker/docker-compose.ghcr-vulkan.yml index ede7f937..dfd3334f 100644 --- a/installer/docker/docker-compose.ghcr-vulkan.yml +++ b/installer/docker/docker-compose.ghcr-vulkan.yml @@ -26,6 +26,8 @@ services: image: ${GA_AI_GHCR_IMAGE:-ghcr.io/elumenotion/guideants-ai-vulkan:main} pull_policy: missing container_name: guideants-ai + cap_drop: + - SYS_PTRACE # ---- Vulkan GPU wiring (cross-OS) ----------------------------------------- # Defaults target Windows / Docker Desktop, where GPU Vulkan for EVERY vendor # goes Vulkan -> Mesa dzn -> D3D12 -> /dev/dxg (works from git bash, no WSL). @@ -42,6 +44,7 @@ services: volumes: - ${GA_VULKAN_DRIVER_LIBS:-/usr/lib/wsl}:/usr/lib/wsl:ro - ai_local_models-new:/models-local + - script_agent_admin_state:/var/lib/guideants/script-agent-admin - type: bind source: ${GA_CONTENT_FILES_HOST_PATH:-./volumes/content-files} target: /app/ContentFiles @@ -56,11 +59,15 @@ services: - VK_DRIVER_FILES=${GA_VULKAN_ICD:-/usr/share/vulkan/icd.d/dzn_icd.json} - VK_ICD_FILENAMES=${GA_VULKAN_ICD:-/usr/share/vulkan/icd.d/dzn_icd.json} - MESA_D3D12_DEFAULT_ADAPTER_NAME=${MESA_D3D12_DEFAULT_ADAPTER_NAME:-NVIDIA} + - GGML_VK_VISIBLE_DEVICES=${GGML_VK_VISIBLE_DEVICES:-} # Used only when GA_VULKAN_RUNTIME=nvidia (native-Linux NVIDIA); harmless otherwise. 
- NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-all} - NVIDIA_DRIVER_CAPABILITIES=${NVIDIA_DRIVER_CAPABILITIES:-graphics,compute,utility} - HF_TOKEN=${HF_TOKEN:-} - SCRIPT_EXECUTION_AGENT_TOKEN=${GA_SCRIPT_AGENT_TOKEN:-dev-script-agent-token} + - SCRIPT_EXECUTION_ADMIN_TOKEN=${GA_SCRIPT_AGENT_ADMIN_TOKEN:-dev-script-agent-admin-token} + - SCRIPT_EXECUTION_ADMIN_STATE_DIR=/var/lib/guideants/script-agent-admin + - SCRIPT_EXECUTION_SCOPE_STATE_ROOT=/var/lib/guideants/script-agent-admin/scopes - SCRIPT_EXECUTION_REQUIRE_TOKEN=true - GA_LLAMA_MODELS_PRESET=/models-local/router-models.ini - GA_LLAMA_MODEL_DIR=/models-local/llama @@ -71,7 +78,13 @@ services: - GA_LLAMA_THREADS=16 - GA_LLAMA_PARALLEL=5 - GA_LLAMA_CACHE_RAM=8192 - - GA_LLAMA_KV_UNIFIED=1 + # Vulkan currently trips llama.cpp's scheduler for some model families when + # KV cache tensors are placed on Vulkan buffers. Keep KV offload disabled + # by default; LLAMA_ARG_* lets current images pick this up without rebuild. + - GA_LLAMA_KV_OFFLOAD=${GA_LLAMA_KV_OFFLOAD:-0} + - LLAMA_ARG_KV_OFFLOAD=${GA_LLAMA_KV_OFFLOAD:-0} + # Keep unified KV opt-in for Vulkan as a separate conservative default. + - GA_LLAMA_KV_UNIFIED=${GA_LLAMA_KV_UNIFIED:-0} - GA_LLAMA_JINJA=1 - GA_LLAMA_CONT_BATCH=1 - GA_LLAMA_NO_MMAP=0 @@ -128,6 +141,9 @@ services: - GA_SD_STRENGTH=0.75 - GA_SD_OFFLOAD_TO_CPU=${GA_SD_OFFLOAD_TO_CPU:-0} - GA_SD_DIFFUSION_FA=1 + # Optional SD-only Vulkan device selector. Empty means inherit the + # container-wide GGML_VK_VISIBLE_DEVICES value used by llama too. 
+ - GA_SD_VK_VISIBLE_DEVICES=${GA_SD_VK_VISIBLE_DEVICES:-} - GA_SD_AUTO_LOAD_ON_STARTUP=${GA_SD_AUTO_LOAD_ON_STARTUP:-1} - GA_SD_WARMUP_PROMPT=${GA_SD_WARMUP_PROMPT:-startup-warmup} - GA_SD_WARMUP_SIZE=${GA_SD_WARMUP_SIZE:-512x512} @@ -213,6 +229,7 @@ services: - DocumentServer__JwtHeader=${GA_DOCUMENTSERVER_JWT_HEADER:-Authorization} - DocumentServer__JwtInBody=${GA_DOCUMENTSERVER_JWT_IN_BODY:-false} - ScriptExecution__AgentToken=${GA_SCRIPT_AGENT_TOKEN:-dev-script-agent-token} + - ScriptExecution__AdminToken=${GA_SCRIPT_AGENT_ADMIN_TOKEN:-dev-script-agent-admin-token} - LlamaCpp__BaseUrl=http://guideants-ai:80/llama-cpp - ServiceRouting__Containers__guideants-ai__BaseUrl=http://guideants-ai:80/sandbox - ServiceRouting__Containers__plantuml__BaseUrl=http://plantuml:80 @@ -266,6 +283,7 @@ volumes: mssql_ftdata: mssql_log: ai_local_models-new: + script_agent_admin_state: networks: guideants-network: diff --git a/installer/docker/docker-compose.vulkan.yml b/installer/docker/docker-compose.vulkan.yml index 9f7eb3a7..68e0f9ea 100644 --- a/installer/docker/docker-compose.vulkan.yml +++ b/installer/docker/docker-compose.vulkan.yml @@ -74,6 +74,7 @@ services: # to the NVIDIA discrete GPU; set "Radeon"/"Intel" or use # GGML_VK_VISIBLE_DEVICES to target another GPU. Ignored by non-dzn ICDs. - MESA_D3D12_DEFAULT_ADAPTER_NAME=${MESA_D3D12_DEFAULT_ADAPTER_NAME:-NVIDIA} + - GGML_VK_VISIBLE_DEVICES=${GGML_VK_VISIBLE_DEVICES:-} # Used only when GA_VULKAN_RUNTIME=nvidia (native-Linux NVIDIA); harmless otherwise. - NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-all} - NVIDIA_DRIVER_CAPABILITIES=${NVIDIA_DRIVER_CAPABILITIES:-graphics,compute,utility} @@ -95,7 +96,13 @@ services: - GA_LLAMA_THREADS=16 - GA_LLAMA_PARALLEL=5 - GA_LLAMA_CACHE_RAM=8192 - - GA_LLAMA_KV_UNIFIED=1 + # Vulkan currently trips llama.cpp's scheduler for some model families when + # KV cache tensors are placed on Vulkan buffers. 
Keep KV offload disabled + # by default; LLAMA_ARG_* lets current images pick this up without rebuild. + - GA_LLAMA_KV_OFFLOAD=${GA_LLAMA_KV_OFFLOAD:-0} + - LLAMA_ARG_KV_OFFLOAD=${GA_LLAMA_KV_OFFLOAD:-0} + # Keep unified KV opt-in for Vulkan as a separate conservative default. + - GA_LLAMA_KV_UNIFIED=${GA_LLAMA_KV_UNIFIED:-0} - GA_LLAMA_JINJA=1 - GA_LLAMA_CONT_BATCH=1 - GA_LLAMA_NO_MMAP=0 @@ -158,6 +165,9 @@ services: - GA_SD_STRENGTH=0.75 - GA_SD_OFFLOAD_TO_CPU=${GA_SD_OFFLOAD_TO_CPU:-0} - GA_SD_DIFFUSION_FA=1 + # Optional SD-only Vulkan device selector. Empty means inherit the + # container-wide GGML_VK_VISIBLE_DEVICES value used by llama too. + - GA_SD_VK_VISIBLE_DEVICES=${GA_SD_VK_VISIBLE_DEVICES:-} - GA_SD_AUTO_LOAD_ON_STARTUP=${GA_SD_AUTO_LOAD_ON_STARTUP:-0} - GA_SD_WARMUP_PROMPT=${GA_SD_WARMUP_PROMPT:-startup-warmup} - GA_SD_WARMUP_SIZE=${GA_SD_WARMUP_SIZE:-512x512} diff --git a/src/client/GuideAnts.code-workspace b/src/client/GuideAnts.code-workspace deleted file mode 100644 index 05e932db..00000000 --- a/src/client/GuideAnts.code-workspace +++ /dev/null @@ -1,14 +0,0 @@ -{ - "folders": [ - { - "path": "../.." - }, - { - "path": "../../../GuideAntsChat" - }, - { - "path": "../../../llama.cpp" - } - ], - "settings": {} -} \ No newline at end of file diff --git a/src/client/src/components/LoadingSpinner.tsx b/src/client/src/components/LoadingSpinner.tsx index 8d61b812..40e9bf85 100644 --- a/src/client/src/components/LoadingSpinner.tsx +++ b/src/client/src/components/LoadingSpinner.tsx @@ -4,7 +4,11 @@ interface LoadingSpinnerProps { message?: string; } -const LoadingSpinner: React.FC = ({ message = 'Loading your content...' }) => { +const LoadingSpinner: React.FC = (props) => { + const displayMessage = 'message' in props + ? (props.message ?? '') + : 'Loading your content...'; + return (
= ({ message = 'Loading your alt="Loading..." className="w-16 h-16 animate-bounce" /> -

{message}

+ {displayMessage && ( +

{displayMessage}

+ )}
); }; diff --git a/src/client/src/components/notebook/conversations/LlamaRuntimeModal.tsx b/src/client/src/components/notebook/conversations/LlamaRuntimeModal.tsx index a295713e..7620d254 100644 --- a/src/client/src/components/notebook/conversations/LlamaRuntimeModal.tsx +++ b/src/client/src/components/notebook/conversations/LlamaRuntimeModal.tsx @@ -10,6 +10,21 @@ interface LlamaRuntimeModalProps { isPolling: boolean; } +function getActiveOperationMessage(state?: string): string { + switch (state) { + case 'unloading': + return 'Unloading current models...'; + case 'loading': + return 'Loading new models into VRAM...'; + case 'verifying': + return 'Verifying model readiness...'; + case 'queued': + return 'Waiting to start...'; + default: + return 'Loading new models into VRAM...'; + } +} + export const LlamaRuntimeModal: React.FC = ({ isOpen, onClose, @@ -37,10 +52,11 @@ export const LlamaRuntimeModal: React.FC = ({ isInvalid ? 'Incompatible Models' : 'Local Models Required'} -

- {isPolling && 'Please wait while the required models are loaded into the local runtime.'} - {!isPolling && !isFailed && !isInvalid && 'The selected assistant requires local models that are not currently loaded.'} -

+ {!isPolling && !isFailed && !isInvalid && ( +

+ The selected assistant requires local models that are not currently loaded. +

+ )} {isPolling && status.activeOperation?.operationId === '__external_loading__' && (

Models are already loading from startup or another session — no action needed. @@ -50,15 +66,7 @@ export const LlamaRuntimeModal: React.FC = ({

{isPolling ? ( -
- -

- {status.activeOperation?.state === 'unloading' && 'Unloading current models...'} - {status.activeOperation?.state === 'loading' && 'Loading new models into VRAM...'} - {status.activeOperation?.state === 'verifying' && 'Verifying model readiness...'} - {status.activeOperation?.state === 'queued' && 'Waiting to start...'} -

-
+ ) : isInvalid ? (
diff --git a/src/client/src/components/notebook/conversations/__tests__/LlamaRuntimeModal.test.tsx b/src/client/src/components/notebook/conversations/__tests__/LlamaRuntimeModal.test.tsx index e5a4f8e1..dc8db939 100644 --- a/src/client/src/components/notebook/conversations/__tests__/LlamaRuntimeModal.test.tsx +++ b/src/client/src/components/notebook/conversations/__tests__/LlamaRuntimeModal.test.tsx @@ -71,9 +71,30 @@ describe('LlamaRuntimeModal', () => { expect(screen.getByText('Loading Local Models...')).toBeInTheDocument(); expect(screen.getByText('Loading new models into VRAM...')).toBeInTheDocument(); + expect(screen.queryByText('Loading your content...')).not.toBeInTheDocument(); + expect(screen.queryByText('Please wait while the required models are loaded into the local runtime.')).not.toBeInTheDocument(); expect(screen.queryByRole('button', { name: 'Cancel' })).not.toBeInTheDocument(); }); + it('shows external startup loading hint when polling external operation', () => { + render( + + ); + + expect( + screen.getByText('Models are already loading from startup or another session — no action needed.') + ).toBeInTheDocument(); + }); + it('shows failed state with retry', async () => { const user = userEvent.setup(); render( diff --git a/src/client/src/pages/settings/editors/image-generation/ImageBundleManager.tsx b/src/client/src/pages/settings/editors/image-generation/ImageBundleManager.tsx index 7dcf17f4..bc22737c 100644 --- a/src/client/src/pages/settings/editors/image-generation/ImageBundleManager.tsx +++ b/src/client/src/pages/settings/editors/image-generation/ImageBundleManager.tsx @@ -811,8 +811,8 @@ export function ImageBundleManager({ enabled, onDownloadOperationChange, onRunti : bundleExportBusy ? 'Wait for definition download to finish.' : b.definition - ? 'Edit the saved bundle recipe and re-download this bundle.' - : 'Edit this bundle recipe and re-download. No saved recipe metadata is present yet.' + ? 
'Edit the saved bundle recipe; only changed roles are re-downloaded.' + : 'Edit this bundle recipe; changed roles are re-downloaded. No saved recipe metadata is present yet.' } /> void submit()} disabled={submitting || loadingBundle} > - {submitting ? 'Starting…' : mode === 'edit' ? 'Save & re-download' : 'Download snapshot'} + {submitting ? 'Starting…' : mode === 'edit' ? 'Save & update' : 'Download snapshot'} ) @@ -1300,7 +1300,7 @@ function DownloadBundleDialog({ ) : mode === 'edit' - ? 'Editing reuses the same bundle id and re-downloads the role files into that bundle. Each role keeps exactly one file.' + ? 'Editing reuses the same bundle id. Only roles whose repo, file, or revision changed are re-downloaded; unchanged files on disk are kept. Use Delete bundle to remove everything and start over.' : 'Read-only view of the bundle recipe and role readiness on disk.'}

{mode !== 'create' && bundle ? ( diff --git a/src/client/src/pages/settings/editors/image-generation/__tests__/ImageBundleManager.test.tsx b/src/client/src/pages/settings/editors/image-generation/__tests__/ImageBundleManager.test.tsx index e0e5c740..a1728d59 100644 --- a/src/client/src/pages/settings/editors/image-generation/__tests__/ImageBundleManager.test.tsx +++ b/src/client/src/pages/settings/editors/image-generation/__tests__/ImageBundleManager.test.tsx @@ -636,7 +636,7 @@ describe('ImageBundleManager', () => { operationId: 'op-edit', bundleId: 'bundle-a', status: 'queued', - roles: { diffusion: 'queued', vae: 'queued', textEncoder: 'queued' }, + roles: { diffusion: 'queued', vae: 'ready', textEncoder: 'ready' }, }); (api.settings.localModels.getOperation as any).mockResolvedValue({ operationId: 'op-edit', @@ -662,7 +662,7 @@ describe('ImageBundleManager', () => { }); fireEvent.change(diffusionFileInput, { target: { value: 'new-diff.gguf' } }); - const saveButton = screen.getByRole('button', { name: /Save & re-download/i }); + const saveButton = screen.getByRole('button', { name: /Save & update/i }); await waitFor(() => { expect((saveButton as HTMLButtonElement).disabled).toBe(false); }); diff --git a/src/client/src/types/settings.ts b/src/client/src/types/settings.ts index 0944fd8d..d4e6a95f 100644 --- a/src/client/src/types/settings.ts +++ b/src/client/src/types/settings.ts @@ -424,6 +424,8 @@ export interface LlamaRuntimeInventoryItemDto { routerContextSize?: number | null; /** Per-alias `cache-ram` (MiB) in router-models.ini when set. 
*/ routerCacheRamMib?: number | null; + runtimeFailed?: boolean; + runtimeExitCode?: number | null; } /** diff --git a/src/server/GuideAntsApi.Tests/Services/LlamaCpp/LlamaServerRuntimeClientTests.cs b/src/server/GuideAntsApi.Tests/Services/LlamaCpp/LlamaServerRuntimeClientTests.cs index 53cffd69..8595460f 100644 --- a/src/server/GuideAntsApi.Tests/Services/LlamaCpp/LlamaServerRuntimeClientTests.cs +++ b/src/server/GuideAntsApi.Tests/Services/LlamaCpp/LlamaServerRuntimeClientTests.cs @@ -75,6 +75,137 @@ public async Task LoadModelAsync_PreservesBasePathPrefix() handler.LastRequestUri!.ToString().Should().Be("http://localhost:8110/llama-cpp/models/load"); } + [TestMethod] + public async Task LoadModelAsync_FailureMessageIncludesResponseBody() + { + var calls = 0; + var handler = new CapturingHandler(_ => + { + calls++; + return new HttpResponseMessage(HttpStatusCode.InternalServerError) + { + ReasonPhrase = "Internal Server Error", + Content = new StringContent("instance name=gemma exited with status 1", Encoding.UTF8, "text/plain") + }; + }); + + using var httpClient = new HttpClient(handler) + { + BaseAddress = new Uri("http://localhost:8110/llama-cpp/") + }; + + var client = new LlamaServerRuntimeClient(httpClient, NullLogger.Instance); + + var act = async () => await client.LoadModelAsync("gemma"); + + var ex = await act.Should().ThrowAsync(); + ex.Which.StatusCode.Should().Be(HttpStatusCode.InternalServerError); + ex.Which.Message.Should().Contain("models/load"); + ex.Which.Message.Should().Contain("instance name=gemma exited with status 1"); + calls.Should().Be(1); + } + + [TestMethod] + public async Task ListModelsAsync_DeserializesRouterFailureFields() + { + var handler = new CapturingHandler(_ => + new HttpResponseMessage(HttpStatusCode.OK) + { + Content = new StringContent( + "{\"data\":[{\"id\":\"gemma\",\"status\":{\"value\":\"unloaded\"},\"failed\":true,\"exit_code\":1}]}", + Encoding.UTF8, + "application/json") + }); + + using var httpClient = new 
HttpClient(handler) + { + BaseAddress = new Uri("http://localhost:8110/llama-cpp/") + }; + + var client = new LlamaServerRuntimeClient(httpClient, NullLogger.Instance); + + var response = await client.ListModelsAsync(); + + response.Data.Should().ContainSingle(); + response.Data[0].Failed.Should().BeTrue(); + response.Data[0].ExitCode.Should().Be(1); + } + + [TestMethod] + public async Task ListModelsAsync_RetriesTransientGatewayFailure() + { + var calls = 0; + var handler = new CapturingHandler(_ => + { + calls++; + return calls == 1 + ? new HttpResponseMessage(HttpStatusCode.BadGateway) + { + ReasonPhrase = "Bad Gateway", + Content = new StringContent("router is starting", Encoding.UTF8, "text/plain") + } + : new HttpResponseMessage(HttpStatusCode.OK) + { + Content = new StringContent("{\"data\":[{\"id\":\"qwen\",\"status\":{\"value\":\"loaded\"}}]}", Encoding.UTF8, "application/json") + }; + }); + + using var httpClient = new HttpClient(handler) + { + BaseAddress = new Uri("http://localhost:8110/llama-cpp/") + }; + + var client = new LlamaServerRuntimeClient(httpClient, NullLogger.Instance); + + var response = await client.ListModelsAsync(); + + calls.Should().Be(2); + response.Data.Should().ContainSingle(m => m.Id == "qwen"); + } + + [TestMethod] + public async Task ListModelsAsync_RetriesTransientConnectionFailure() + { + var calls = 0; + var handler = new CapturingHandler(_ => + { + calls++; + if (calls == 1) + { + throw new HttpRequestException("Connection refused (guideants-ai:80)"); + } + + return new HttpResponseMessage(HttpStatusCode.OK) + { + Content = new StringContent("{\"data\":[]}", Encoding.UTF8, "application/json") + }; + }); + + using var httpClient = new HttpClient(handler) + { + BaseAddress = new Uri("http://localhost:8110/llama-cpp/") + }; + + var client = new LlamaServerRuntimeClient(httpClient, NullLogger.Instance); + + await client.ListModelsAsync(); + + calls.Should().Be(2); + } + + [TestMethod] + public void 
MapRuntimeState_PrefersRouterFailedFlagOverUnloadedStatus() + { + var state = LlamaRuntimeInventoryService.MapRuntimeState(new LlamaModelData + { + Failed = true, + ExitCode = 1, + Status = new LlamaModelStatus { Value = "unloaded" } + }); + + state.Should().Be("failed"); + } + private sealed class CapturingHandler(Func responder) : HttpMessageHandler { private readonly Func _responder = responder; diff --git a/src/server/GuideAntsApi.Tests/Services/PublishedGuides/PublishedGuideCostLimitServiceTests.cs b/src/server/GuideAntsApi.Tests/Services/PublishedGuides/PublishedGuideCostLimitServiceTests.cs index 4b707bda..02020d35 100644 --- a/src/server/GuideAntsApi.Tests/Services/PublishedGuides/PublishedGuideCostLimitServiceTests.cs +++ b/src/server/GuideAntsApi.Tests/Services/PublishedGuides/PublishedGuideCostLimitServiceTests.cs @@ -75,6 +75,7 @@ public async Task EnsureWithinLimitsAsync_Prioritizes_daily_limit_when_both_dail var notebookId = Guid.NewGuid(); var now = DateTime.UtcNow; var dayStart = new DateTime(now.Year, now.Month, now.Day, 0, 0, 0, DateTimeKind.Utc); + var monthStart = new DateTime(now.Year, now.Month, 1, 0, 0, 0, DateTimeKind.Utc); await using (var seed = new ApplicationDbContext(options)) { @@ -88,7 +89,7 @@ public async Task EnsureWithinLimitsAsync_Prioritizes_daily_limit_when_both_dail NotebookId = notebookId, Active = true, DailyChargeLimitUsd = 0.50m, - BillingPeriodChargeLimitUsd = 1.00m + BillingPeriodChargeLimitUsd = 0.50m }); seed.UsageEvents.AddRange( new UsageEvent @@ -109,8 +110,8 @@ public async Task EnsureWithinLimitsAsync_Prioritizes_daily_limit_when_both_dail { NotebookId = notebookId, ProjectId = projectId, - Created = dayStart.AddDays(-2), - ChargeUsd = 0.70m + Created = monthStart.AddTicks(-1), + ChargeUsd = 9.99m }); await seed.SaveChangesAsync(); } @@ -123,6 +124,6 @@ public async Task EnsureWithinLimitsAsync_Prioritizes_daily_limit_when_both_dail result.Allowed.Should().BeFalse(); 
result.Reason.Should().Be("daily_limit_exceeded"); result.DailyChargeUsd.Should().Be(0.70m); - result.BillingPeriodChargeUsd.Should().Be(1.40m); + result.BillingPeriodChargeUsd.Should().Be(0.70m); } } diff --git a/src/server/GuideAntsApi/Configuration/StartupConfiguration.cs b/src/server/GuideAntsApi/Configuration/StartupConfiguration.cs index 39d87940..f60bf171 100644 --- a/src/server/GuideAntsApi/Configuration/StartupConfiguration.cs +++ b/src/server/GuideAntsApi/Configuration/StartupConfiguration.cs @@ -1,6 +1,7 @@ using Microsoft.EntityFrameworkCore; using Microsoft.OpenApi.Models; using Microsoft.OpenApi.Any; +using System.Net.Http; using System.Reflection; using System.Security.Claims; using GuideAntsApi.DataModel; @@ -149,6 +150,12 @@ private static void RegisterServices(IServiceCollection services, IConfiguration var baseUrl = config["BaseUrl"] ?? throw new InvalidOperationException("LlamaCpp:BaseUrl is required."); client.BaseAddress = new Uri(baseUrl); + }) + .SetHandlerLifetime(TimeSpan.FromMinutes(1)) + .ConfigurePrimaryHttpMessageHandler(() => new SocketsHttpHandler + { + PooledConnectionLifetime = TimeSpan.FromSeconds(30), + PooledConnectionIdleTimeout = TimeSpan.FromSeconds(15) }); services.AddHttpClient(client => { diff --git a/src/server/GuideAntsApi/Endpoints/Settings/SettingsLlamaEndpoints.cs b/src/server/GuideAntsApi/Endpoints/Settings/SettingsLlamaEndpoints.cs index ebc4f788..1818e431 100644 --- a/src/server/GuideAntsApi/Endpoints/Settings/SettingsLlamaEndpoints.cs +++ b/src/server/GuideAntsApi/Endpoints/Settings/SettingsLlamaEndpoints.cs @@ -165,7 +165,11 @@ public static void MapSettingsLlamaEndpoints(this WebApplication app) RouterModelId: item.RouterModelId, LastLoadStartedAt: null, LastLoadDurationMs: null, - LastError: null)); + LastError: item.RuntimeFailed + ? item.RuntimeExitCode is int exitCode + ? $"llama-server child exited with status {exitCode}." + : "llama-server child exited during model load." 
+ : null)); } return Results.Ok((IReadOnlyList)statuses); diff --git a/src/server/GuideAntsApi/Models/Settings/SettingsDtos.cs b/src/server/GuideAntsApi/Models/Settings/SettingsDtos.cs index 26fad133..194a5306 100644 --- a/src/server/GuideAntsApi/Models/Settings/SettingsDtos.cs +++ b/src/server/GuideAntsApi/Models/Settings/SettingsDtos.cs @@ -303,7 +303,9 @@ public sealed record LlamaRuntimeInventoryItemDto( IReadOnlyList CatalogModelIds, int NotebookReferenceCount, int? RouterContextSize = null, - int? RouterCacheRamMib = null); + int? RouterCacheRamMib = null, + bool RuntimeFailed = false, + int? RuntimeExitCode = null); public sealed record StartModelDownloadRequest( string Repository, diff --git a/src/server/GuideAntsApi/Services/LlamaCpp/ILlamaServerRuntimeClient.cs b/src/server/GuideAntsApi/Services/LlamaCpp/ILlamaServerRuntimeClient.cs index 55ccca75..ea75f55c 100644 --- a/src/server/GuideAntsApi/Services/LlamaCpp/ILlamaServerRuntimeClient.cs +++ b/src/server/GuideAntsApi/Services/LlamaCpp/ILlamaServerRuntimeClient.cs @@ -1,3 +1,4 @@ +using System.Net; using System.Text; using System.Text.Json; using System.Text.Json.Nodes; @@ -32,6 +33,14 @@ public class LlamaModelData [JsonPropertyName("state")] public string State { get; set; } = string.Empty; + // Router mode marks a child process that exited during load with these + // fields while status.value may still be "unloaded". + [JsonPropertyName("failed")] + public bool Failed { get; set; } + + [JsonPropertyName("exit_code")] + public int? ExitCode { get; set; } + [JsonPropertyName("meta")] public LlamaModelMeta? 
Meta { get; set; } } @@ -68,6 +77,16 @@ public class LlamaOpenAiModelData public class LlamaServerRuntimeClient : ILlamaServerRuntimeClient { + private static readonly TimeSpan[] TransientRetryDelays = + [ + TimeSpan.FromMilliseconds(250), + TimeSpan.FromMilliseconds(500), + TimeSpan.FromSeconds(1), + TimeSpan.FromSeconds(2), + TimeSpan.FromSeconds(4), + TimeSpan.FromSeconds(5) + ]; + private readonly HttpClient _httpClient; private readonly ILogger _logger; @@ -79,25 +98,13 @@ public LlamaServerRuntimeClient(HttpClient httpClient, ILogger ListModelsAsync(CancellationToken cancellationToken = default) { - var requestPath = "models"; - var requestUri = BuildEndpointUri(_httpClient.BaseAddress, requestPath); - - var response = await _httpClient.GetAsync(requestUri, cancellationToken); - var responseContent = await response.Content.ReadAsStringAsync(cancellationToken); - - response.EnsureSuccessStatusCode(); + var responseContent = await GetStringWithTransientRetryAsync("models", cancellationToken); return JsonSerializer.Deserialize(responseContent) ?? new LlamaModelsResponse(); } public async Task ListOpenAiModelsAsync(CancellationToken cancellationToken = default) { - var requestPath = "v1/models"; - var requestUri = BuildEndpointUri(_httpClient.BaseAddress, requestPath); - - var response = await _httpClient.GetAsync(requestUri, cancellationToken); - var responseContent = await response.Content.ReadAsStringAsync(cancellationToken); - - response.EnsureSuccessStatusCode(); + var responseContent = await GetStringWithTransientRetryAsync("v1/models", cancellationToken); return JsonSerializer.Deserialize(responseContent) ?? new LlamaOpenAiModelsResponse(); } @@ -120,54 +127,18 @@ public async Task LoadModelAsync(string modelPathOrPreset, JsonObject? 
loadParam } var requestPath = "models/load"; - var requestUri = BuildEndpointUri(_httpClient.BaseAddress, requestPath); var requestJson = requestBody.ToJsonString(); - using var request = new HttpRequestMessage(HttpMethod.Post, requestUri) - { - Content = new StringContent(requestJson, Encoding.UTF8, "application/json") - }; - - var response = await _httpClient.SendAsync(request, cancellationToken); - var responseContent = await response.Content.ReadAsStringAsync(cancellationToken); - - if (!response.IsSuccessStatusCode) - { - _logger.LogError( - "Llama runtime POST failed. Url: {RequestUri}. Status: {StatusCode}. RequestBody: {RequestBody}. ResponseBody: {ResponseBody}", - requestUri.ToString(), - (int)response.StatusCode, - requestJson, - responseContent); - } - response.EnsureSuccessStatusCode(); + await PostJsonWithTransientRetryAsync(requestPath, requestJson, cancellationToken); } public async Task UnloadModelAsync(string routerModelId, CancellationToken cancellationToken = default) { var requestBody = new { model = routerModelId }; var requestPath = "models/unload"; - var requestUri = BuildEndpointUri(_httpClient.BaseAddress, requestPath); var requestJson = JsonSerializer.Serialize(requestBody); - using var request = new HttpRequestMessage(HttpMethod.Post, requestUri) - { - Content = new StringContent(requestJson, Encoding.UTF8, "application/json") - }; - - var response = await _httpClient.SendAsync(request, cancellationToken); - var responseContent = await response.Content.ReadAsStringAsync(cancellationToken); - - if (!response.IsSuccessStatusCode) - { - _logger.LogError( - "Llama runtime POST failed. Url: {RequestUri}. Status: {StatusCode}. RequestBody: {RequestBody}. ResponseBody: {ResponseBody}", - requestUri.ToString(), - (int)response.StatusCode, - requestJson, - responseContent); - } - response.EnsureSuccessStatusCode(); + await PostJsonWithTransientRetryAsync(requestPath, requestJson, cancellationToken); } internal static Uri BuildEndpointUri(Uri? 
baseAddress, string relativePath) @@ -181,4 +152,159 @@ internal static Uri BuildEndpointUri(Uri? baseAddress, string relativePath) var normalizedRelativePath = relativePath.TrimStart('/'); return new Uri(new Uri(normalizedBaseUrl), normalizedRelativePath); } + + private static string LimitForException(string value) + { + const int maxChars = 2000; + if (string.IsNullOrEmpty(value)) + { + return value; + } + + return value.Length <= maxChars ? value : value[..maxChars] + "..."; + } + + private async Task GetStringWithTransientRetryAsync( + string requestPath, + CancellationToken cancellationToken) + { + var requestUri = BuildEndpointUri(_httpClient.BaseAddress, requestPath); + + for (var attempt = 0; ; attempt++) + { + try + { + using var response = await _httpClient.GetAsync(requestUri, cancellationToken); + var responseContent = await response.Content.ReadAsStringAsync(cancellationToken); + + if (response.IsSuccessStatusCode) + { + return responseContent; + } + + if (ShouldRetryStatus(response.StatusCode, attempt)) + { + await DelayBeforeRetryAsync( + "GET", + requestUri, + attempt, + $"HTTP {(int)response.StatusCode} ({response.ReasonPhrase ?? ""})", + cancellationToken).ConfigureAwait(false); + continue; + } + + _logger.LogError( + "Llama runtime GET failed. Url: {RequestUri}. Status: {StatusCode}. ResponseBody: {ResponseBody}", + requestUri.ToString(), + (int)response.StatusCode, + responseContent); + throw new HttpRequestException( + $"Llama runtime GET {requestPath} failed with HTTP {(int)response.StatusCode} ({response.ReasonPhrase ?? ""}). 
ResponseBody={LimitForException(responseContent)}", + null, + response.StatusCode); + } + catch (Exception ex) when (ShouldRetryException(ex, cancellationToken, attempt)) + { + await DelayBeforeRetryAsync( + "GET", + requestUri, + attempt, + ex.Message, + cancellationToken).ConfigureAwait(false); + } + } + } + + private async Task PostJsonWithTransientRetryAsync( + string requestPath, + string requestJson, + CancellationToken cancellationToken) + { + var requestUri = BuildEndpointUri(_httpClient.BaseAddress, requestPath); + + for (var attempt = 0; ; attempt++) + { + try + { + using var request = new HttpRequestMessage(HttpMethod.Post, requestUri) + { + Content = new StringContent(requestJson, Encoding.UTF8, "application/json") + }; + using var response = await _httpClient.SendAsync(request, cancellationToken); + var responseContent = await response.Content.ReadAsStringAsync(cancellationToken); + + if (response.IsSuccessStatusCode) + { + return; + } + + if (ShouldRetryStatus(response.StatusCode, attempt)) + { + await DelayBeforeRetryAsync( + "POST", + requestUri, + attempt, + $"HTTP {(int)response.StatusCode} ({response.ReasonPhrase ?? ""})", + cancellationToken).ConfigureAwait(false); + continue; + } + + _logger.LogError( + "Llama runtime POST failed. Url: {RequestUri}. Status: {StatusCode}. RequestBody: {RequestBody}. ResponseBody: {ResponseBody}", + requestUri.ToString(), + (int)response.StatusCode, + requestJson, + responseContent); + throw new HttpRequestException( + $"Llama runtime POST {requestPath} failed with HTTP {(int)response.StatusCode} ({response.ReasonPhrase ?? ""}). 
ResponseBody={LimitForException(responseContent)}", + null, + response.StatusCode); + } + catch (Exception ex) when (ShouldRetryException(ex, cancellationToken, attempt)) + { + await DelayBeforeRetryAsync( + "POST", + requestUri, + attempt, + ex.Message, + cancellationToken).ConfigureAwait(false); + } + } + } + + private bool ShouldRetryStatus(HttpStatusCode statusCode, int attempt) + { + return attempt < TransientRetryDelays.Length + && (statusCode == HttpStatusCode.RequestTimeout + || statusCode == HttpStatusCode.BadGateway + || statusCode == HttpStatusCode.ServiceUnavailable + || statusCode == HttpStatusCode.GatewayTimeout); + } + + private static bool ShouldRetryException(Exception ex, CancellationToken cancellationToken, int attempt) + { + return attempt < TransientRetryDelays.Length + && !cancellationToken.IsCancellationRequested + && (ex is HttpRequestException { StatusCode: null } + || ex is TaskCanceledException); + } + + private async Task DelayBeforeRetryAsync( + string method, + Uri requestUri, + int attempt, + string reason, + CancellationToken cancellationToken) + { + var delay = TransientRetryDelays[attempt]; + _logger.LogWarning( + "Transient llama runtime {Method} failure. Url: {RequestUri}. Attempt: {Attempt}/{MaxAttempts}. Retrying in {DelayMs} ms. 
Reason: {Reason}", + method, + requestUri.ToString(), + attempt + 1, + TransientRetryDelays.Length + 1, + (int)delay.TotalMilliseconds, + LimitForException(reason)); + await Task.Delay(delay, cancellationToken); + } } diff --git a/src/server/GuideAntsApi/Services/LlamaCpp/LlamaRuntimeInventoryService.cs b/src/server/GuideAntsApi/Services/LlamaCpp/LlamaRuntimeInventoryService.cs index e7f98845..17604c37 100644 --- a/src/server/GuideAntsApi/Services/LlamaCpp/LlamaRuntimeInventoryService.cs +++ b/src/server/GuideAntsApi/Services/LlamaCpp/LlamaRuntimeInventoryService.cs @@ -145,19 +145,26 @@ public async Task> GetInventoryAsync CatalogModelIds: catalogIds, NotebookReferenceCount: notebookCount, RouterContextSize: entry?.ContextSize, - RouterCacheRamMib: entry?.CacheRamMib)); + RouterCacheRamMib: entry?.CacheRamMib, + RuntimeFailed: runtimeRow?.Failed ?? false, + RuntimeExitCode: runtimeRow?.ExitCode)); } return results; } - private static string MapRuntimeState(LlamaModelData? data) + internal static string MapRuntimeState(LlamaModelData? data) { if (data is null) { return "unloaded"; } + if (data.Failed) + { + return "failed"; + } + if (!string.IsNullOrWhiteSpace(data.Status?.Value)) { return data.Status.Value.ToLowerInvariant(); diff --git a/start_linux.sh b/start_linux.sh index 933bc45f..d22e2e6f 100644 --- a/start_linux.sh +++ b/start_linux.sh @@ -7,7 +7,7 @@ DOCKER_DIR="$ROOT_DIR/docker" MODE="install" # install | doctor FIX_MODE="0" # 0 | 1 -BACKEND_OVERRIDE="" # cpu | cuda13 | rocm | slim +BACKEND_OVERRIDE="" # cpu | cuda13 | rocm | slim | vulkan COMPOSE_MODE="ghcr" # ghcr | local HEALTH_URL="http://localhost:5107/" HOST_MOUNT_OVERRIDE_FILE="docker-compose.host-mounts.generated.yml" @@ -21,7 +21,7 @@ Usage: ./start_linux.sh [options] Options: --doctor Run checks only, do not change anything. --fix Attempt limited auto-remediation where possible. - --backend cpu|cuda13|rocm|slim Force backend selection. slim is explicit only and is not auto-detected. 
+ --backend cpu|cuda13|rocm|slim|vulkan Force backend selection. slim and vulkan are explicit only and are not auto-detected. --compose ghcr|local Use GHCR compose files (default) or local build files. --help Show this help. EOF @@ -101,6 +101,7 @@ select_compose_file() { slim) COMPOSE_FILE="docker-compose.slim.yml" ;; cuda13) COMPOSE_FILE="docker-compose.cuda.yml" ;; rocm) COMPOSE_FILE="docker-compose.rocm.yml" ;; + vulkan) COMPOSE_FILE="docker-compose.vulkan.yml" ;; *) COMPOSE_FILE="docker-compose.cpu.yml" ;; esac else @@ -108,11 +109,52 @@ select_compose_file() { slim) COMPOSE_FILE="docker-compose.ghcr-slim.yml" ;; cuda13) COMPOSE_FILE="docker-compose.ghcr-cuda13.yml" ;; rocm) COMPOSE_FILE="docker-compose.ghcr-rocm.yml" ;; + vulkan) COMPOSE_FILE="docker-compose.ghcr-vulkan.yml" ;; *) COMPOSE_FILE="docker-compose.ghcr-cpu.yml" ;; esac fi } +select_vulkan_runtime() { + [[ "$SELECTED_BACKEND" == "vulkan" ]] || return 0 + + if docker info --format '{{.OperatingSystem}}' 2>/dev/null | grep -q 'Docker Desktop'; then + log "Vulkan: Docker Desktop → Mesa dzn over D3D12 (/dev/dxg). Using built-in defaults (no env)." + return 0 + fi + + local dev="/dev/null" + [[ -e /dev/dri ]] && dev="/dev/dri" + export GA_VULKAN_DEVICE="$dev" + export GA_VULKAN_DRIVER_LIBS="/usr/lib" + export GA_VULKAN_LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu" + + if docker info --format '{{json .Runtimes}}' 2>/dev/null | grep -q '"nvidia"'; then + export GA_VULKAN_RUNTIME="nvidia" + export GA_VULKAN_ICD="/usr/share/vulkan/icd.d/nvidia_icd.json" + log "Vulkan: native Linux NVIDIA → nvidia runtime injects the Vulkan ICD (device $dev)." 
+ elif [[ -e /dev/dri ]]; then + local icd="" + for v in /sys/class/drm/renderD*/device/vendor; do + [[ -r "$v" ]] || continue + case "$(cat "$v" 2>/dev/null)" in + 0x1002) icd="/usr/share/vulkan/icd.d/radeon_icd.x86_64.json"; break ;; + 0x8086) icd="/usr/share/vulkan/icd.d/intel_icd.x86_64.json"; break ;; + esac + done + if [[ -n "$icd" ]]; then + export GA_VULKAN_ICD="$icd" + log "Vulkan: native Linux Mesa via /dev/dri (ICD $(basename "$icd"))." + else + export GA_VULKAN_ICD="/usr/share/vulkan/icd.d/radeon_icd.x86_64.json" + warn "Vulkan: /dev/dri present but GPU vendor undetermined; assuming AMD RADV. Override GA_VULKAN_ICD if this is an Intel GPU." + fi + else + warn "Vulkan: native Linux with no nvidia runtime and no /dev/dri — no GPU device found." + warn " LLM and image generation will run on CPU. Install Mesa (AMD/Intel) or the nvidia-container-toolkit (NVIDIA)." + fi +} + wait_for_health() { log "Waiting for GuideAnts UI to become reachable at $HEALTH_URL" for _ in $(seq 1 120); do @@ -151,11 +193,12 @@ while [[ $# -gt 0 ]]; do done [[ "$COMPOSE_MODE" == "ghcr" || "$COMPOSE_MODE" == "local" ]] || fail "--compose must be ghcr or local" -[[ -z "$BACKEND_OVERRIDE" || "$BACKEND_OVERRIDE" == "cpu" || "$BACKEND_OVERRIDE" == "cuda13" || "$BACKEND_OVERRIDE" == "rocm" || "$BACKEND_OVERRIDE" == "slim" ]] || fail "--backend must be cpu, cuda13, rocm, or slim" +[[ -z "$BACKEND_OVERRIDE" || "$BACKEND_OVERRIDE" == "cpu" || "$BACKEND_OVERRIDE" == "cuda13" || "$BACKEND_OVERRIDE" == "rocm" || "$BACKEND_OVERRIDE" == "slim" || "$BACKEND_OVERRIDE" == "vulkan" ]] || fail "--backend must be cpu, cuda13, rocm, slim, or vulkan" check_prereqs detect_backend select_compose_file +select_vulkan_runtime log "Selected backend: $SELECTED_BACKEND" log "Compose file: docker/$COMPOSE_FILE" diff --git a/start_macos.sh b/start_macos.sh index e5973ecd..e9cceeee 100644 --- a/start_macos.sh +++ b/start_macos.sh @@ -7,7 +7,7 @@ DOCKER_DIR="$ROOT_DIR/docker" MODE="install" # install | doctor 
FIX_MODE="0" # 0 | 1 -BACKEND_OVERRIDE="" # cpu | cuda13 | rocm | slim +BACKEND_OVERRIDE="" # cpu | cuda13 | rocm | slim | vulkan COMPOSE_MODE="ghcr" # ghcr | local HEALTH_URL="http://localhost:5107/" HOST_MOUNT_OVERRIDE_FILE="docker-compose.host-mounts.generated.yml" @@ -21,7 +21,7 @@ Usage: ./start_macos.sh [options] Options: --doctor Run checks only, do not change anything. --fix Attempt limited auto-remediation where possible. - --backend cpu|cuda13|rocm|slim Force backend selection. slim is explicit only and is not auto-detected. + --backend cpu|cuda13|rocm|slim|vulkan Force backend selection. slim and vulkan are explicit only and are not auto-detected. --compose ghcr|local Use GHCR compose files (default) or local build files. --help Show this help. EOF @@ -91,6 +91,7 @@ select_compose_file() { slim) COMPOSE_FILE="docker-compose.slim.yml" ;; cuda13) COMPOSE_FILE="docker-compose.cuda.yml" ;; rocm) COMPOSE_FILE="docker-compose.rocm.yml" ;; + vulkan) COMPOSE_FILE="docker-compose.vulkan.yml" ;; *) COMPOSE_FILE="docker-compose.cpu.yml" ;; esac else @@ -98,11 +99,17 @@ select_compose_file() { slim) COMPOSE_FILE="docker-compose.ghcr-slim.yml" ;; cuda13) COMPOSE_FILE="docker-compose.ghcr-cuda13.yml" ;; rocm) COMPOSE_FILE="docker-compose.ghcr-rocm.yml" ;; + vulkan) COMPOSE_FILE="docker-compose.ghcr-vulkan.yml" ;; *) COMPOSE_FILE="docker-compose.ghcr-cpu.yml" ;; esac fi } +select_vulkan_runtime() { + [[ "$SELECTED_BACKEND" == "vulkan" ]] || return 0 + log "Vulkan: Docker Desktop on macOS does not expose the D3D12 (/dev/dxg) GPU path used on Windows/WSL2. Using built-in defaults (no env); GPU acceleration is not expected."
+} + wait_for_health() { log "Waiting for GuideAnts UI to become reachable at $HEALTH_URL" for _ in $(seq 1 120); do @@ -139,11 +146,12 @@ while [[ $# -gt 0 ]]; do done [[ "$COMPOSE_MODE" == "ghcr" || "$COMPOSE_MODE" == "local" ]] || fail "--compose must be ghcr or local" -[[ -z "$BACKEND_OVERRIDE" || "$BACKEND_OVERRIDE" == "cpu" || "$BACKEND_OVERRIDE" == "cuda13" || "$BACKEND_OVERRIDE" == "rocm" || "$BACKEND_OVERRIDE" == "slim" ]] || fail "--backend must be cpu, cuda13, rocm, or slim" +[[ -z "$BACKEND_OVERRIDE" || "$BACKEND_OVERRIDE" == "cpu" || "$BACKEND_OVERRIDE" == "cuda13" || "$BACKEND_OVERRIDE" == "rocm" || "$BACKEND_OVERRIDE" == "slim" || "$BACKEND_OVERRIDE" == "vulkan" ]] || fail "--backend must be cpu, cuda13, rocm, slim, or vulkan" check_prereqs detect_backend select_compose_file +select_vulkan_runtime log "Selected backend: $SELECTED_BACKEND" log "Compose file: docker/$COMPOSE_FILE" diff --git a/start_windows.cmd b/start_windows.cmd index 7f5d0559..13f46a06 100644 --- a/start_windows.cmd +++ b/start_windows.cmd @@ -48,7 +48,7 @@ call :fail Unknown option: %~1 :args_done if /I not "%COMPOSE_MODE%"=="ghcr" if /I not "%COMPOSE_MODE%"=="local" call :fail --compose must be ghcr or local if not "%BACKEND_OVERRIDE%"=="" ( - if /I not "%BACKEND_OVERRIDE%"=="cpu" if /I not "%BACKEND_OVERRIDE%"=="cuda13" if /I not "%BACKEND_OVERRIDE%"=="rocm" if /I not "%BACKEND_OVERRIDE%"=="slim" call :fail --backend must be cpu, cuda13, rocm, or slim + if /I not "%BACKEND_OVERRIDE%"=="cpu" if /I not "%BACKEND_OVERRIDE%"=="cuda13" if /I not "%BACKEND_OVERRIDE%"=="rocm" if /I not "%BACKEND_OVERRIDE%"=="slim" if /I not "%BACKEND_OVERRIDE%"=="vulkan" call :fail --backend must be cpu, cuda13, rocm, slim, or vulkan ) call :log Running preflight checks... 
@@ -68,6 +68,7 @@ call :check_wsl call :detect_backend call :validate_backend call :select_compose_file +call :select_vulkan_runtime call :log Selected backend: %SELECTED_BACKEND% call :log Compose file: docker\%COMPOSE_FILE% @@ -158,8 +159,8 @@ if not defined NVIDIA_DRIVER_VERSION ( if not "%BACKEND_OVERRIDE%"=="" ( call :fail Could not read NVIDIA driver version from nvidia-smi. Remove --backend cuda13 or fix NVIDIA driver/runtime. ) - call :warn Could not read NVIDIA driver version from nvidia-smi. Falling back to cpu backend. - set "SELECTED_BACKEND=cpu" + call :warn Could not read NVIDIA driver version from nvidia-smi. Falling back to vulkan backend. + set "SELECTED_BACKEND=vulkan" exit /b 0 ) @@ -171,8 +172,8 @@ if errorlevel 1 ( call :fail Could not parse NVIDIA driver version "%NVIDIA_DRIVER_VERSION%". Remove --backend cuda13 or fix NVIDIA drivers. exit /b 1 ) - call :warn Could not parse NVIDIA driver version "%NVIDIA_DRIVER_VERSION%". Falling back to cpu backend. - set "SELECTED_BACKEND=cpu" + call :warn Could not parse NVIDIA driver version "%NVIDIA_DRIVER_VERSION%". Falling back to vulkan backend. + set "SELECTED_BACKEND=vulkan" exit /b 0 ) @@ -182,8 +183,8 @@ if %NVIDIA_DRIVER_MAJOR_NUM% LSS 580 ( call :fail NVIDIA driver %NVIDIA_DRIVER_VERSION% is too old for cuda13. Install R580+ driver or use --backend cpu. exit /b 1 ) - call :warn NVIDIA driver %NVIDIA_DRIVER_VERSION% is below the CUDA 13 minimum ^(R580^). Falling back to cpu backend. - set "SELECTED_BACKEND=cpu" + call :warn NVIDIA driver %NVIDIA_DRIVER_VERSION% is below the CUDA 13 minimum ^(R580^). Falling back to vulkan backend. 
+ set "SELECTED_BACKEND=vulkan" exit /b 0 ) @@ -198,6 +199,8 @@ if /I "%COMPOSE_MODE%"=="local" ( set "COMPOSE_FILE=docker-compose.cuda.yml" ) else if /I "%SELECTED_BACKEND%"=="rocm" ( set "COMPOSE_FILE=docker-compose.rocm.yml" + ) else if /I "%SELECTED_BACKEND%"=="vulkan" ( + set "COMPOSE_FILE=docker-compose.vulkan.yml" ) else ( set "COMPOSE_FILE=docker-compose.cpu.yml" ) @@ -208,12 +211,19 @@ if /I "%COMPOSE_MODE%"=="local" ( set "COMPOSE_FILE=docker-compose.ghcr-cuda13.yml" ) else if /I "%SELECTED_BACKEND%"=="rocm" ( set "COMPOSE_FILE=docker-compose.ghcr-rocm.yml" + ) else if /I "%SELECTED_BACKEND%"=="vulkan" ( + set "COMPOSE_FILE=docker-compose.ghcr-vulkan.yml" ) else ( set "COMPOSE_FILE=docker-compose.ghcr-cpu.yml" ) ) exit /b 0 +:select_vulkan_runtime +if /I not "%SELECTED_BACKEND%"=="vulkan" exit /b 0 +call :log Vulkan: Docker Desktop -^> Mesa dzn over D3D12 (/dev/dxg). Using built-in defaults (no env). +exit /b 0 + :wait_for_health set /a _max=120 set /a _count=0 @@ -254,7 +264,7 @@ echo. echo Options: echo --doctor Run checks only, do not change anything. echo --fix Attempt limited auto-remediation where possible. -echo --backend cpu^|cuda13^|rocm^|slim Force backend selection. slim is explicit only and is not auto-detected. +echo --backend cpu^|cuda13^|rocm^|slim^|vulkan Force backend selection. slim is explicit only and is not auto-detected. vulkan is selected automatically only as a fallback when the NVIDIA driver version is unreadable or below the CUDA 13 minimum R580. echo --compose ghcr^|local Use GHCR compose files ^(default^) or local build files. echo --help Show this help. exit /b 0