diff --git a/README.md b/README.md index 183ebed..e0b994b 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ becomes a routable model; the router load-balances across instances; and a bundl - **Add a model by pasting `vllm serve …`** — parsed into a form and layered on as a dynamic overlay; the router hot-reloads, no `config.yaml` edits. - **Lifecycle + self-healing** — per-instance state machine (`stopped → starting → ready → failed`), VRAM pre-flight guard, GPU auto-placement, crash auto-restart with backoff. -- **Load-aware routing** — picks the least-loaded replica (running/waiting requests + KV-cache usage). +- **Pluggable routing strategies** — pick the load-balancing policy per model group or globally: `least_load` (default), `round_robin`, `random`, `least_inflight`, `p2c`, plus `session_affinity` / `prefix_affinity` for cache reuse on multi-turn chat & shared prompts. Switch it live from the dashboard; transparent failover + per-backend cooldown apply to every strategy. - **Live observability** — SSE status, animated system-topology & router-balancing graphs, per-model usage / latency / error stats. - **Bundled Grafana monitoring** — Prometheus auto-discovers every running instance; Overview / Capacity / Performance / GPU / Host dashboards embedded in-app, with SLO thresholds & alerts. - **Playground** — OpenAI-compatible chat (streaming) / completions / embeddings / reranking, with reasoning display. @@ -107,6 +107,16 @@ share one network namespace so the spawned vLLM instances are reachable on `loca NVIDIA GPU (CUDA 13.1+ recommended) · 16GB+ RAM · 50GB+ disk. +> **Tip — running multiple instances on limited RAM.** Each vLLM instance runs +> `torch.compile` + CUDA-graph capture on startup, which is heavy on **system RAM** +> (not VRAM). On a small box (e.g. WSL2 with ~8GB RAM), launching a second instance +> of the same model can exhaust RAM and thrash swap, leaving the new instance stuck +> in `starting`. Add **`--enforce-eager`** to the launch command to skip compilation: +> startup drops from minutes to seconds and RAM/CPU pressure falls sharply, at a small +> inference-latency cost. RAM — not VRAM — is usually the bottleneck for multi-instance, +> so give WSL more memory (`.wslconfig` → `memory=12GB`, then `wsl --shutdown`) before +> scaling out. + ## License MIT — see [LICENSE](LICENSE). diff --git a/README_zh-CN.md b/README_zh-CN.md index e317782..8ab9a92 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -29,7 +29,7 @@ - **貼上 `vllm serve …` 即可新增模型** — 解析成表單、以動態 overlay 疊加;router 熱重載。 - **生命週期** — 每實例狀態機(`stopped → starting → ready → failed`)、VRAM 預檢防呆、GPU 自動擺放、崩潰指數退避自動重啟。 -- **負載感知路由** — 自動挑負載最低的副本(運行中/等待中請求 + KV 快取使用率)。 +- **可插拔路由策略** — 每個模型群組或全域各自選負載平衡策略:`least_load`(預設)、`round_robin`、`random`、`least_inflight`、`p2c`,以及 `session_affinity` / `prefix_affinity`(多輪對話與共用 prompt 的快取重用)。可在控制台即時切換;失效轉移與每後端冷卻對所有策略一體適用。 - **即時觀測** — SSE 狀態、動畫系統拓撲圖與 router 負載平衡圖、每模型用量/延遲/錯誤統計。 - **內建 Grafana 監控** — Prometheus 自動發現每個運行中的實例;總覽/容量/效能/GPU/主機 dashboards 嵌入應用內,含 SLO 門檻線與告警。 - **Playground** — OpenAI 相容的 chat(串流)/completions/embeddings/reranking。 @@ -107,6 +107,14 @@ namespace,所以被拉起的 vLLM 實例可在 `localhost` 互相連到。 NVIDIA GPU(建議 CUDA 13.1+)· 16GB+ RAM · 50GB+ 磁碟。 +> **提示 — RAM 有限時跑多個 instance。** 每個 vLLM instance 啟動時都會做 +> `torch.compile` + CUDA-graph capture,這非常吃**系統 RAM**(不是 VRAM)。在小機器上 +> (例如 WSL2 只有 ~8GB RAM),對同一顆模型開第二個 instance 很容易把 RAM 吃光、swap +> 抖動,讓新 instance 一直卡在 `starting`。在啟動指令加上 **`--enforce-eager`** 即可跳過 +> 編譯:啟動時間從數分鐘降到數秒、RAM/CPU 壓力大幅下降,代價只是推理延遲略增。多 instance +> 的瓶頸通常是 **RAM 而非 VRAM**,擴展前先把 WSL 記憶體加大(`.wslconfig` → +> `memory=12GB`,再 `wsl --shutdown`)。 + ## 授權 MIT — 見 [LICENSE](LICENSE)。 diff --git a/apps/backend/app/llmops/launchers.py b/apps/backend/app/llmops/launchers.py index a9ca95f..07e70e8 100644 --- a/apps/backend/app/llmops/launchers.py +++ b/apps/backend/app/llmops/launchers.py @@ -53,6 +53,12 @@ def _write_effective_config(config) -> str: # Keys consumed as env vars / handled specially, not emitted as CLI flags. _LORA_RUNTIME_KEY = "allow_runtime_lora" +# Router-only knobs that ride the shared model_config (EngineModelConfig is +# extra="allow") but belong to the router, not vLLM — never pass them to +# `vllm serve` or it errors on an unknown argument. +_ROUTER_ONLY_KEYS = frozenset({"routing_strategy"}) +# Everything build_vllm_cli_args must skip (model_tag is the positional arg). +_SKIP_CLI_KEYS = frozenset({"model_tag", _LORA_RUNTIME_KEY}) | _ROUTER_ONLY_KEYS # vLLM's --max-loras defaults to 1 (only one distinct adapter per batch, which # serialises mixed-LoRA traffic and leaves no headroom for hot-loading more). @@ -85,7 +91,7 @@ def build_vllm_cli_args(model_cfg: dict) -> list[str]: cli_args = ["serve", model_tag] for key, value in model_cfg.items(): - if key == "model_tag" or key == _LORA_RUNTIME_KEY or value is None: + if key in _SKIP_CLI_KEYS or value is None: continue key_flag = "--" + key.replace("_", "-") if key == "lora_modules": diff --git a/apps/backend/tests/unit/test_launchers.py b/apps/backend/tests/unit/test_launchers.py index d796f0d..3788798 100644 --- a/apps/backend/tests/unit/test_launchers.py +++ b/apps/backend/tests/unit/test_launchers.py @@ -74,6 +74,17 @@ def test_build_vllm_cli_args_requires_model_tag(): build_vllm_cli_args({"dtype": "float16"}) +def test_routing_strategy_not_passed_to_vllm(): + # routing_strategy is a router-only knob riding the shared model_config; it + # must never reach `vllm serve` (vLLM errors on the unknown arg). + args = build_vllm_cli_args( + {"model_tag": "org/m", "dtype": "float16", "routing_strategy": "session_affinity"} + ) + assert "--routing-strategy" not in args + assert "session_affinity" not in args + assert "--dtype" in args # other flags still pass through + + def test_build_vllm_cli_args_lora_modules_multi_value(): args = build_vllm_cli_args( { diff --git a/apps/frontend_llmops/package-lock.json b/apps/frontend_llmops/package-lock.json index 9159a53..e1fc05a 100644 --- a/apps/frontend_llmops/package-lock.json +++ b/apps/frontend_llmops/package-lock.json @@ -743,448 +743,6 @@ "tslib": "^2.4.0" } }, - "node_modules/@esbuild/aix-ppc64": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.28.1.tgz", - "integrity": "sha512-Svl7tq8k/08+p6CXPpRjQ1fKX+1odH/BQbb48fV6fj3CWHhsoIOoY87w1oHXm0qEpkIK3ZfVgp0hed3XBXzXMQ==", - "cpu": [ - "ppc64" - ], - "license": "MIT", - "optional": true, - "os": [ - "aix" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/android-arm": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.28.1.tgz", - "integrity": "sha512-0k2F129Xdio1TdJfzJ8sy1Q47vUD2NnwdhiAf7drUN1EBTfPf4hsFCtmMgu/6m8JSzsBrlmVjudMBQqOfG8usQ==", - "cpu": [ - "arm" - ], - "license": "MIT", - "optional": true, - "os": [ - "android" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/android-arm64": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.28.1.tgz", - "integrity": "sha512-34EGEbCIAgosYz6goLcopX6Mo7NyGv9tfwEM2/7Ce2VcVRk568iSvniGWcUXIy7wEDR1wzolcxcriFVrWYcwBg==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "android" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/android-x64": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.28.1.tgz", - "integrity": "sha512-dbwY7ltSMDWsRatcRpCnES4F+im88OCUgGZjy52shC7GqHRE/cYlxNbB4Z4UpJswpcc4Qxd2oE/ufM0p61IKng==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "android" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/darwin-arm64": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.28.1.tgz", - "integrity": "sha512-TZbWkQY7kvTAXbXUT7uVACR5cMHsDiSz9z7ZKAX/RTq/WJEk3QyRr0wZpNhBDX+/0CtdqUIJlOiodQcta6tY3Q==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/darwin-x64": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.28.1.tgz", - "integrity": "sha512-zfdzgK9ACBNZLI/CyHTOx81SyNbM6YXn7rxSgX97VjyiPl9W1i4Ka4fgKECEoFCKGpvBj5qArWIGgQjOwkgskQ==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/freebsd-arm64": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.28.1.tgz", - "integrity": "sha512-wG2EA8ENdEI0qhkSZMjfqrdY+ziCYCPMmtZjjIwOmXFjmyzEHn+UUxk5of+SYsjtfs3VpnlC7QLzSI5hY/rOAw==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "freebsd" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/freebsd-x64": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.28.1.tgz", - "integrity": "sha512-i7dZ9vQgnvSCzi/rYCXNgtF/U+eKZNJBzu3eTQbRgHnM7tNSizLOkRFAl3qzVc/Op/u5YkHHa4pf/3DOYHthLQ==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "freebsd" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-arm": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.28.1.tgz", - "integrity": "sha512-qVXBOHQS+d5Y722GwJzJUtOLlX7km3CraOaGormF1pDtPd2C/l1SHRPgjLunLGe51Sh5YYWKMFDyV4SxgMQYTQ==", - "cpu": [ - "arm" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-arm64": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.28.1.tgz", - "integrity": "sha512-yHs+0uc8+nvEAfAfxrWQKK5peSNzBc4PegcMO0EJ2hT71uA7vB8Ihg2e77R2P7SG5uYjPbHlLLmve4LLLRCf0g==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-ia32": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.28.1.tgz", - "integrity": "sha512-d1z4ZuP0ajrfz/FhGT4vv278rX8KnPPJx8i5+AtK7TYbx9Le9F1hyzurZpkEyjkGa9dUGhQow4C1NmeGvqxN2w==", - "cpu": [ - "ia32" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-loong64": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.28.1.tgz", - "integrity": "sha512-M5sRjUVZrkm1OAPR3dlOYzNmN+loZKGVi1VUQGrwuqLcbR6qeAz+famMhjASeH3YVKvZz+zT1jlh/keC3Rj/lg==", - "cpu": [ - "loong64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-mips64el": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.28.1.tgz", - "integrity": "sha512-mRObBZeHh2OxcBFPWE/FjylkRgZdYuiTR3vaTozquCGOH14iP9oN4x4Ge81CoIDYQrXmIxpFumJBu5MtZpnQJQ==", - "cpu": [ - "mips64el" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-ppc64": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.28.1.tgz", - "integrity": "sha512-slScBsMAb3GFDcdrCgLwZtPYRoH2H/youv10QiZyRjmsP48fznoveWytSgCI/R0ZcUgpc0ZhIUEx6LHts8yrfQ==", - "cpu": [ - "ppc64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-riscv64": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.28.1.tgz", - "integrity": "sha512-kw0owk1o0GFETUJyW0jc0G4Yzs0BHZn0JDZ8JRT088vjJYX777BAs1fDGxAC+q831qOs2DTC96mNsG2opdfyyQ==", - "cpu": [ - "riscv64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-s390x": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.28.1.tgz", - "integrity": "sha512-/lAIjX8aYFRByhh6L5rYtPEDRqa9de/4V/juOXcta5frjvzXO4/sqEtyytse0g3zZFuWu5cDN0MkLz2qRDD2Ag==", - "cpu": [ - "s390x" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-x64": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.28.1.tgz", - "integrity": "sha512-u/anNYF2mmVOEDwLtnQ1wOr3EZ9sTNGLWrsYGYwHWzGA3Si84IOkHXlbWTD1NB+9/1lcnweYKO54uhxZydNzfA==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/netbsd-arm64": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.28.1.tgz", - "integrity": "sha512-oks0DYbLwWMmaakTsCb+zL4E+aHRVLom9IJZOAthMQEPiQmydXHkziYEsGYRx0uNV/IjEKGAV941JzH02pflqw==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "netbsd" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/netbsd-x64": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.28.1.tgz", - "integrity": "sha512-aeL6lAnN89Hz43Mlh1G8ARasbuoYvSITDEx0tHh5b7jJnHcssqgjy9Yx430GDpmCa6OyrKoS0aNRjKundRizGg==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "netbsd" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/openbsd-arm64": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.28.1.tgz", - "integrity": "sha512-MEFJe5C3R8pwXdZ5Y21oo6m7ePiS0d9pWucn99O/wvyJZChoIQKrQDxKrGeW8F5+T0okTHesAmDeiHDTIq0V/Q==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "openbsd" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/openbsd-x64": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.28.1.tgz", - "integrity": "sha512-i/ZLIOafE0Z8cI/XANJAixoJL/uRAoS2xOA3rb0xN+KK0K177cMAsQYkzHtBrtMXAKuAc7HGgcWiZ/sRC1Nxgw==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "openbsd" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/openharmony-arm64": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/openharmony-arm64/-/openharmony-arm64-0.28.1.tgz", - "integrity": "sha512-ge+Z7EXFNt2BO1oAMsVpiQ8EwndV9i1xXerAeTIK7AtPs3bKFXQM7nlRxDSIUIMeueR1CNXxqztLzdNeReKBJg==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "openharmony" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/sunos-x64": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.28.1.tgz", - "integrity": "sha512-BEjgtECkL3vY+SaSQ6nzVfiALUeFxpawyp8Jmf5PtYhf1Ug40N1h/hxlhts+f1FvSvarEigdxS3BlSMI2PJLcQ==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "sunos" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/win32-arm64": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.28.1.tgz", - "integrity": "sha512-lCv9eK/H6ZJWbE7bh2nw54CZ9M2nupBxJcTsdk/QQnWkdSjKGuxmmH8/GWrlT1eMmZfn4dGcCjRte397WqfQXA==", - "cpu": [ - "arm64" - ], - "license": "MIT", - "optional": true, - "os": [ - "win32" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/win32-ia32": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.28.1.tgz", - "integrity": "sha512-zvb/mB2bSCoJOpoCBgYKKpX6YM6mJBlBUVUtVj41DlZJVEB6/0CKlRYxP5wWl1C1ILiCoAU5wZZ4q1P3qeS6Eg==", - "cpu": [ - "ia32" - ], - "license": "MIT", - "optional": true, - "os": [ - "win32" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/win32-x64": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.28.1.tgz", - "integrity": "sha512-bm4Mowrv+GXMlpWX++EcXw/iLyd1o3+bJkC2DkWXYVvgZCqD/bSj9ctZeAMC3cIxgjRVR2Dufaiu4YPxr5gW1A==", - "cpu": [ - "x64" - ], - "license": "MIT", - "optional": true, - "os": [ - "win32" - ], - "peer": true, - "engines": { - "node": ">=18" - } - }, "node_modules/@eslint-community/eslint-utils": { "version": "4.9.1", "resolved": "https://registry.npmjs.org/@eslint-community/eslint-utils/-/eslint-utils-4.9.1.tgz", @@ -4662,49 +4220,6 @@ "dev": true, "license": "MIT" }, - "node_modules/esbuild": { - "version": "0.28.1", - "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.28.1.tgz", - "integrity": "sha512-HrJrvZv5ayxBzPfwphOoNzkzOIIlifzk0KJrGK2c8R4+LKpMtpYLQeUdjnwjWv/LZlkH2laZk+4w78pi99D4Vw==", - "hasInstallScript": true, - "license": "MIT", - "optional": true, - "peer": true, - "bin": { - "esbuild": "bin/esbuild" - }, - "engines": { - "node": ">=18" - }, - "optionalDependencies": { - "@esbuild/aix-ppc64": "0.28.1", - "@esbuild/android-arm": "0.28.1", - "@esbuild/android-arm64": "0.28.1", - "@esbuild/android-x64": "0.28.1", - "@esbuild/darwin-arm64": "0.28.1", - "@esbuild/darwin-x64": "0.28.1", - "@esbuild/freebsd-arm64": "0.28.1", - "@esbuild/freebsd-x64": "0.28.1", - "@esbuild/linux-arm": "0.28.1", - "@esbuild/linux-arm64": "0.28.1", - "@esbuild/linux-ia32": "0.28.1", - "@esbuild/linux-loong64": "0.28.1", - "@esbuild/linux-mips64el": "0.28.1", - "@esbuild/linux-ppc64": "0.28.1", - "@esbuild/linux-riscv64": "0.28.1", - "@esbuild/linux-s390x": "0.28.1", - "@esbuild/linux-x64": "0.28.1", - "@esbuild/netbsd-arm64": "0.28.1", - "@esbuild/netbsd-x64": "0.28.1", - "@esbuild/openbsd-arm64": "0.28.1", - "@esbuild/openbsd-x64": "0.28.1", - "@esbuild/openharmony-arm64": "0.28.1", - "@esbuild/sunos-x64": "0.28.1", - "@esbuild/win32-arm64": "0.28.1", - "@esbuild/win32-ia32": "0.28.1", - "@esbuild/win32-x64": "0.28.1" - } - }, "node_modules/escalade": { "version": "3.2.0", "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", diff --git a/apps/frontend_llmops/src/components/AddModelDialog.vue b/apps/frontend_llmops/src/components/AddModelDialog.vue index 300cfaf..f1eac8c 100644 --- a/apps/frontend_llmops/src/components/AddModelDialog.vue +++ b/apps/frontend_llmops/src/components/AddModelDialog.vue @@ -12,6 +12,7 @@ import { ApiError } from '@/lib/api' import { useModelsStore } from '@/stores/models' import { useResourcesStore } from '@/stores/resources' import { formatBytes } from '@/lib/utils' +import { ROUTING_STRATEGIES, routingStrategyLabel } from '@/lib/routingStrategies' import type { CachedModel, DownloadJob, LoraAdapter, LoraModule, SettingValue } from '@/types/api' const open = defineModel('open', { default: false }) @@ -37,6 +38,11 @@ const port = ref(8000) const cudaDevice = ref(null) const modelTag = ref('') const params = ref<{ key: string; value: string }[]>([]) +// Router-only load-balancing policy for the group. Lives in model_config but is +// NOT a vLLM flag, so it's edited as its own field and kept out of the raw param +// list (the launcher would otherwise reject it as an unknown `vllm serve` arg). +// '' = inherit the global default. +const routingStrategy = ref('') // LoRA adapters mounted at serve time; edited apart from the flat param list // because each is a {name, path, base_model_name} object, not a scalar flag. const loras = ref([]) @@ -141,6 +147,7 @@ function reset() { cudaDevice.value = null modelTag.value = '' params.value = [] + routingStrategy.value = '' loras.value = [] } @@ -150,12 +157,19 @@ function extractLoras(entries: [string, unknown][]): [string, unknown][] { loras.value = [] const rest: [string, unknown][] = [] for (const [k, v] of entries) { - if (k === 'lora_modules' && Array.isArray(v)) { - loras.value = (v as LoraModule[]).map((m) => ({ - name: m.name ?? '', - path: m.path ?? '', - base_model_name: m.base_model_name ?? '', - })) + if (k === 'lora_modules') { + // `lora_modules` is always owned by the LoRA editor — never let it leak + // into the raw param list. The config endpoint reports `lora_modules: null` + // for models without adapters; if that fell through to `rest` it would be + // rendered as a param and re-submitted as the string "", which fails the + // backend's `Optional[list[LoraModule]]` validation. + loras.value = Array.isArray(v) + ? (v as LoraModule[]).map((m) => ({ + name: m.name ?? '', + path: m.path ?? '', + base_model_name: m.base_model_name ?? '', + })) + : [] } else { rest.push([k, v]) } @@ -175,8 +189,9 @@ function prefillForEdit() { port.value = cfg.port cudaDevice.value = cfg.cuda_device ?? null modelTag.value = String(cfg.settings.model_tag ?? '') + routingStrategy.value = String(cfg.settings.routing_strategy ?? '') params.value = extractLoras( - Object.entries(cfg.settings).filter(([k2]) => k2 !== 'model_tag'), + Object.entries(cfg.settings).filter(([k2]) => k2 !== 'model_tag' && k2 !== 'routing_strategy'), ).map(([k2, v]) => ({ key: k2, value: v === null ? '' : String(v) })) warnings.value = [] parsed.value = true // skip the paste/parse step @@ -209,8 +224,11 @@ async function parse() { port.value = p.instance.port cudaDevice.value = p.instance.cuda_device modelTag.value = String(p.model_config.model_tag ?? '') + routingStrategy.value = String( + (p.model_config as Record).routing_strategy ?? '', + ) params.value = extractLoras( - Object.entries(p.model_config).filter(([k]) => k !== 'model_tag'), + Object.entries(p.model_config).filter(([k]) => k !== 'model_tag' && k !== 'routing_strategy'), ).map(([k, v]) => ({ key: k, value: String(v) })) warnings.value = p.warnings parsed.value = true @@ -365,8 +383,15 @@ async function submit() { model_tag: modelTag.value, } for (const { key: k, value } of params.value) { - if (k.trim()) settings[k.trim()] = coerce(value) + // `lora_modules` and `routing_strategy` have dedicated editors below — never + // let a raw param (e.g. a stray "" from a null, or a leaked router key) stomp + // them via the generic param list. + const kk = k.trim() + if (kk && kk !== 'lora_modules' && kk !== 'routing_strategy') settings[kk] = coerce(value) } + // Router-only load-balancing policy; '' inherits the global default, so only + // send it when explicitly chosen. + if (routingStrategy.value) settings.routing_strategy = routingStrategy.value // Mounted adapters: keep only filled rows; drop the empty base_model_name field. const cleanLoras = loras.value .filter((l) => l.name.trim() && l.path.trim()) @@ -494,6 +519,19 @@ async function submit() { 模型標籤 * + diff --git a/apps/frontend_llmops/src/components/ModelDetailDrawer.vue b/apps/frontend_llmops/src/components/ModelDetailDrawer.vue index 49acb35..3c55915 100644 --- a/apps/frontend_llmops/src/components/ModelDetailDrawer.vue +++ b/apps/frontend_llmops/src/components/ModelDetailDrawer.vue @@ -18,6 +18,7 @@ import { useAuth } from '@/composables/useAuth' import { api, ApiError } from '@/lib/api' import { toast } from '@/lib/toast' import { formatDuration, formatLatency, formatNumber, formatPercent, formatTime } from '@/lib/utils' +import { routingStrategyLabel } from '@/lib/routingStrategies' import type { EmbeddingModelParams, LoraAdapter, ModelStartupMetrics, StateEvent } from '@/types/api' const open = defineModel('open', { default: false }) @@ -63,14 +64,20 @@ const gpu = computed(() => ) const busy = computed(() => (props.modelKey ? models.pending.has(props.modelKey) : false)) -// Every vLLM parameter from model_config, shown generically (model_tag is -// already surfaced in the header, so it's filtered out here). +// Every vLLM parameter from model_config, shown generically. model_tag is in the +// header; lora_modules has its own section; routing_strategy is a router-only knob +// (not a vLLM flag), surfaced separately below. const vllmParams = computed( () => Object.entries(engine.value?.settings ?? {}).filter( - ([k]) => k !== 'model_tag' && k !== 'lora_modules', + ([k]) => k !== 'model_tag' && k !== 'lora_modules' && k !== 'routing_strategy', ) as [string, string | number | boolean | null][], ) +// Router load-balancing policy for the group (shown apart from vLLM flags). +const routingStrategy = computed(() => { + const s = engine.value?.settings?.routing_strategy + return typeof s === 'string' && s ? s : null +}) // LoRA adapters mounted on this group (rendered apart from the scalar params). const loras = computed(() => engine.value?.settings?.lora_modules ?? []) // Runtime (hot) LoRA load/unload is only possible when the model is running and @@ -469,6 +476,17 @@ const eventColor: Record = { + +
+

+ 路由策略(負載平衡) +

+
+ {{ routingStrategyLabel(routingStrategy) }} + ({{ routingStrategy }}) +
+
+

diff --git a/apps/frontend_llmops/src/lib/api.ts b/apps/frontend_llmops/src/lib/api.ts index a1d61e1..b4e9cf7 100644 --- a/apps/frontend_llmops/src/lib/api.ts +++ b/apps/frontend_llmops/src/lib/api.ts @@ -29,6 +29,7 @@ import type { RequestRow, ResourcesView, RouterMetrics, + RoutingInfo, SettingValue, StateEvent, UsageRow, @@ -172,6 +173,15 @@ export const api = { routerModels: () => request(ROUTER_BASE, '/v1/models'), routerMetrics: () => request(ROUTER_BASE, '/metrics'), + /** Current global load-balancing strategy + the selectable catalogue. */ + getRouting: () => request(ROUTER_BASE, '/routing'), + /** Hot-swap the global strategy (effective next request; not persisted). */ + setRouting: (strategy: string) => + request<{ strategy: string }>(ROUTER_BASE, '/routing', { + method: 'POST', + body: JSON.stringify({ strategy }), + }), + /** SSE endpoint URL for the live model snapshot stream. */ modelStreamUrl: () => `${API_BASE}/api/stream/models`, diff --git a/apps/frontend_llmops/src/lib/routingStrategies.ts b/apps/frontend_llmops/src/lib/routingStrategies.ts new file mode 100644 index 0000000..a09cee6 --- /dev/null +++ b/apps/frontend_llmops/src/lib/routingStrategies.ts @@ -0,0 +1,27 @@ +/** Router load-balancing strategies — shared by the Traffic page selector and the + * model edit dialog. Keep in sync with router-server's STRATEGIES registry + * (apps/router-server/src/llm_router/routing_strategies.py). + */ +export const ROUTING_STRATEGIES = [ + 'least_load', + 'round_robin', + 'random', + 'least_inflight', + 'p2c', + 'session_affinity', + 'prefix_affinity', +] as const + +export type RoutingStrategy = (typeof ROUTING_STRATEGIES)[number] + +export const ROUTING_STRATEGY_LABELS: Record = { + least_load: '最低負載(預設)', + round_robin: '輪詢', + random: '隨機', + least_inflight: '最少進行中', + p2c: '二選一取優', + session_affinity: '會話黏性', + prefix_affinity: '前綴黏性', +} + +export const routingStrategyLabel = (s: string) => ROUTING_STRATEGY_LABELS[s] ?? s diff --git a/apps/frontend_llmops/src/types/api.ts b/apps/frontend_llmops/src/types/api.ts index a0ebf4f..591198d 100644 --- a/apps/frontend_llmops/src/types/api.ts +++ b/apps/frontend_llmops/src/types/api.ts @@ -172,6 +172,13 @@ export interface InstanceMetrics { } export type RouterMetrics = Record> +/** Global load-balancing strategy state from the router's GET /routing. */ +export interface RoutingInfo { + strategy: string + available: string[] + default: string +} + export interface OpenAIModelList { object: string // `parent` is present only for LoRA adapters (points at the base group). diff --git a/apps/frontend_llmops/src/views/TrafficView.vue b/apps/frontend_llmops/src/views/TrafficView.vue index fc7df2e..f00313b 100644 --- a/apps/frontend_llmops/src/views/TrafficView.vue +++ b/apps/frontend_llmops/src/views/TrafficView.vue @@ -1,6 +1,6 @@