diff --git a/README.md b/README.md
index 183ebed..e0b994b 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ becomes a routable model; the router load-balances across instances; and a bundl
 
 - **Add a model by pasting `vllm serve …`** — parsed into a form and layered on as a dynamic overlay; the router hot-reloads, no `config.yaml` edits.
 - **Lifecycle + self-healing** — per-instance state machine (`stopped → starting → ready → failed`), VRAM pre-flight guard, GPU auto-placement, crash auto-restart with backoff.
-- **Load-aware routing** — picks the least-loaded replica (running/waiting requests + KV-cache usage).
+- **Pluggable routing strategies** — pick the load-balancing policy per model group or globally: `least_load` (default), `round_robin`, `random`, `least_inflight`, `p2c`, plus `session_affinity` / `prefix_affinity` for cache reuse on multi-turn chat & shared prompts. Switch it live from the dashboard; transparent failover + per-backend cooldown apply to every strategy.
 - **Live observability** — SSE status, animated system-topology & router-balancing graphs, per-model usage / latency / error stats.
 - **Bundled Grafana monitoring** — Prometheus auto-discovers every running instance; Overview / Capacity / Performance / GPU / Host dashboards embedded in-app, with SLO thresholds & alerts.
 - **Playground** — OpenAI-compatible chat (streaming) / completions / embeddings / reranking, with reasoning display.
@@ -107,6 +107,16 @@ share one network namespace so the spawned vLLM instances are reachable on `loca
 
 NVIDIA GPU (CUDA 13.1+ recommended) · 16GB+ RAM · 50GB+ disk.
 
+> **Tip — running multiple instances on limited RAM.** Each vLLM instance runs
+> `torch.compile` + CUDA-graph capture on startup, which is heavy on **system RAM**
+> (not VRAM). On a small box (e.g. WSL2 with ~8GB RAM), launching a second instance
+> of the same model can exhaust RAM and thrash swap, leaving the new instance stuck
+> in `starting`. Add **`--enforce-eager`** to the launch command to skip compilation:
+> startup drops from minutes to seconds and RAM/CPU pressure falls sharply, at a small
+> inference-latency cost. RAM — not VRAM — is usually the bottleneck for multi-instance
+> setups, so give WSL more memory (`memory=12GB` under `[wsl2]` in `.wslconfig`, then
+> `wsl --shutdown`) before scaling out.
+
 ## License
 
 MIT — see [LICENSE](LICENSE).
diff --git a/README_zh-CN.md b/README_zh-CN.md
index e317782..8ab9a92 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -29,7 +29,7 @@
 
 - **貼上 `vllm serve …` 即可新增模型** — 解析成表單、以動態 overlay 疊加；router 熱重載。
 - **生命週期** — 每實例狀態機（`stopped → starting → ready → failed`）、VRAM 預檢防呆、GPU 自動擺放、崩潰指數退避自動重啟。
-- **負載感知路由** — 自動挑負載最低的副本（運行中／等待中請求 + KV 快取使用率）。
+- **可插拔路由策略** — 可針對每個模型群組或全域選擇負載平衡策略：`least_load`（預設）、`round_robin`、`random`、`least_inflight`、`p2c`，以及 `session_affinity` / `prefix_affinity`（多輪對話與共用 prompt 的快取重用）。可在控制台即時切換；失效轉移與每後端冷卻對所有策略一體適用。
 - **即時觀測** — SSE 狀態、動畫系統拓撲圖與 router 負載平衡圖、每模型用量／延遲／錯誤統計。
 - **內建 Grafana 監控** — Prometheus 自動發現每個運行中的實例；總覽／容量／效能／GPU／主機 dashboards 嵌入應用內，含 SLO 門檻線與告警。
 - **Playground** — OpenAI 相容的 chat（串流）／completions／embeddings／reranking。
@@ -107,6 +107,14 @@ namespace，所以被拉起的 vLLM 實例可在 `localhost` 互相連到。
 
 NVIDIA GPU（建議 CUDA 13.1+）· 16GB+ RAM · 50GB+ 磁碟。
 
+> **提示 — RAM 有限時跑多個 instance。** 每個 vLLM instance 啟動時都會做
+> `torch.compile` + CUDA-graph capture，這非常吃**系統 RAM**（不是 VRAM）。在小機器上
+> （例如 WSL2 只有 ~8GB RAM），對同一顆模型開第二個 instance 很容易把 RAM 吃光、swap
+> 抖動，讓新 instance 一直卡在 `starting`。在啟動指令加上 **`--enforce-eager`** 即可跳過
+> 編譯：啟動時間從數分鐘降到數秒、RAM/CPU 壓力大幅下降，代價只是推理延遲略增。多 instance
+> 的瓶頸通常是 **RAM 而非 VRAM**，擴展前先把 WSL 記憶體加大（在 `.wslconfig` 的
+> `[wsl2]` 區段設 `memory=12GB`，再執行 `wsl --shutdown`）。
+
 ## 授權
 
 MIT — 見 [LICENSE](LICENSE)。
diff --git a/apps/backend/app/llmops/launchers.py b/apps/backend/app/llmops/launchers.py
index a9ca95f..07e70e8 100644
--- a/apps/backend/app/llmops/launchers.py
+++ b/apps/backend/app/llmops/launchers.py
@@ -53,6 +53,12 @@ def _write_effective_config(config) -> str:
 
 # Keys consumed as env vars / handled specially, not emitted as CLI flags.
 _LORA_RUNTIME_KEY = "allow_runtime_lora"
+# Router-only knobs that ride the shared model_config (EngineModelConfig is
+# extra="allow") but belong to the router, not vLLM — never pass them to
+# `vllm serve` or it errors on an unknown argument.
+_ROUTER_ONLY_KEYS = frozenset({"routing_strategy"})
+# Everything build_vllm_cli_args must skip (model_tag is the positional arg).
+_SKIP_CLI_KEYS = frozenset({"model_tag", _LORA_RUNTIME_KEY}) | _ROUTER_ONLY_KEYS
 
 # vLLM's --max-loras defaults to 1 (only one distinct adapter per batch, which
 # serialises mixed-LoRA traffic and leaves no headroom for hot-loading more).
@@ -85,7 +91,7 @@ def build_vllm_cli_args(model_cfg: dict) -> list[str]:
 
     cli_args = ["serve", model_tag]
     for key, value in model_cfg.items():
-        if key == "model_tag" or key == _LORA_RUNTIME_KEY or value is None:
+        if key in _SKIP_CLI_KEYS or value is None:
             continue
         key_flag = "--" + key.replace("_", "-")
         if key == "lora_modules":
diff --git a/apps/backend/tests/unit/test_launchers.py b/apps/backend/tests/unit/test_launchers.py
index d796f0d..3788798 100644
--- a/apps/backend/tests/unit/test_launchers.py
+++ b/apps/backend/tests/unit/test_launchers.py
@@ -74,6 +74,17 @@ def test_build_vllm_cli_args_requires_model_tag():
         build_vllm_cli_args({"dtype": "float16"})
 
 
+def test_routing_strategy_not_passed_to_vllm():
+    # routing_strategy is a router-only knob riding the shared model_config; it
+    # must never reach `vllm serve` (vLLM errors on the unknown arg).
+    args = build_vllm_cli_args(
+        {"model_tag": "org/m", "dtype": "float16", "routing_strategy": "session_affinity"}
+    )
+    assert "--routing-strategy" not in args
+    assert "session_affinity" not in args
+    assert "--dtype" in args  # other flags still pass through
+
+
 def test_build_vllm_cli_args_lora_modules_multi_value():
     args = build_vllm_cli_args(
         {
diff --git a/apps/frontend_llmops/package-lock.json b/apps/frontend_llmops/package-lock.json
index 9159a53..e1fc05a 100644
--- a/apps/frontend_llmops/package-lock.json
+++ b/apps/frontend_llmops/package-lock.json
@@ -743,448 +743,6 @@
         "tslib": "^2.4.0"
       }
     },
-    "node_modules/@esbuild/aix-ppc64": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.28.1.tgz",
-      "integrity": "sha512-Svl7tq8k/08+p6CXPpRjQ1fKX+1odH/BQbb48fV6fj3CWHhsoIOoY87w1oHXm0qEpkIK3ZfVgp0hed3XBXzXMQ==",
-      "cpu": [
-        "ppc64"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "aix"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/android-arm": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.28.1.tgz",
-      "integrity": "sha512-0k2F129Xdio1TdJfzJ8sy1Q47vUD2NnwdhiAf7drUN1EBTfPf4hsFCtmMgu/6m8JSzsBrlmVjudMBQqOfG8usQ==",
-      "cpu": [
-        "arm"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "android"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/android-arm64": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.28.1.tgz",
-      "integrity": "sha512-34EGEbCIAgosYz6goLcopX6Mo7NyGv9tfwEM2/7Ce2VcVRk568iSvniGWcUXIy7wEDR1wzolcxcriFVrWYcwBg==",
-      "cpu": [
-        "arm64"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "android"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/android-x64": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.28.1.tgz",
-      "integrity": "sha512-dbwY7ltSMDWsRatcRpCnES4F+im88OCUgGZjy52shC7GqHRE/cYlxNbB4Z4UpJswpcc4Qxd2oE/ufM0p61IKng==",
-      "cpu": [
-        "x64"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "android"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/darwin-arm64": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.28.1.tgz",
-      "integrity": "sha512-TZbWkQY7kvTAXbXUT7uVACR5cMHsDiSz9z7ZKAX/RTq/WJEk3QyRr0wZpNhBDX+/0CtdqUIJlOiodQcta6tY3Q==",
-      "cpu": [
-        "arm64"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "darwin"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/darwin-x64": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.28.1.tgz",
-      "integrity": "sha512-zfdzgK9ACBNZLI/CyHTOx81SyNbM6YXn7rxSgX97VjyiPl9W1i4Ka4fgKECEoFCKGpvBj5qArWIGgQjOwkgskQ==",
-      "cpu": [
-        "x64"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "darwin"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/freebsd-arm64": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.28.1.tgz",
-      "integrity": "sha512-wG2EA8ENdEI0qhkSZMjfqrdY+ziCYCPMmtZjjIwOmXFjmyzEHn+UUxk5of+SYsjtfs3VpnlC7QLzSI5hY/rOAw==",
-      "cpu": [
-        "arm64"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "freebsd"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/freebsd-x64": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.28.1.tgz",
-      "integrity": "sha512-i7dZ9vQgnvSCzi/rYCXNgtF/U+eKZNJBzu3eTQbRgHnM7tNSizLOkRFAl3qzVc/Op/u5YkHHa4pf/3DOYHthLQ==",
-      "cpu": [
-        "x64"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "freebsd"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/linux-arm": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.28.1.tgz",
-      "integrity": "sha512-qVXBOHQS+d5Y722GwJzJUtOLlX7km3CraOaGormF1pDtPd2C/l1SHRPgjLunLGe51Sh5YYWKMFDyV4SxgMQYTQ==",
-      "cpu": [
-        "arm"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "linux"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/linux-arm64": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.28.1.tgz",
-      "integrity": "sha512-yHs+0uc8+nvEAfAfxrWQKK5peSNzBc4PegcMO0EJ2hT71uA7vB8Ihg2e77R2P7SG5uYjPbHlLLmve4LLLRCf0g==",
-      "cpu": [
-        "arm64"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "linux"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/linux-ia32": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.28.1.tgz",
-      "integrity": "sha512-d1z4ZuP0ajrfz/FhGT4vv278rX8KnPPJx8i5+AtK7TYbx9Le9F1hyzurZpkEyjkGa9dUGhQow4C1NmeGvqxN2w==",
-      "cpu": [
-        "ia32"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "linux"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/linux-loong64": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.28.1.tgz",
-      "integrity": "sha512-M5sRjUVZrkm1OAPR3dlOYzNmN+loZKGVi1VUQGrwuqLcbR6qeAz+famMhjASeH3YVKvZz+zT1jlh/keC3Rj/lg==",
-      "cpu": [
-        "loong64"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "linux"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/linux-mips64el": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.28.1.tgz",
-      "integrity": "sha512-mRObBZeHh2OxcBFPWE/FjylkRgZdYuiTR3vaTozquCGOH14iP9oN4x4Ge81CoIDYQrXmIxpFumJBu5MtZpnQJQ==",
-      "cpu": [
-        "mips64el"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "linux"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/linux-ppc64": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.28.1.tgz",
-      "integrity": "sha512-slScBsMAb3GFDcdrCgLwZtPYRoH2H/youv10QiZyRjmsP48fznoveWytSgCI/R0ZcUgpc0ZhIUEx6LHts8yrfQ==",
-      "cpu": [
-        "ppc64"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "linux"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/linux-riscv64": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.28.1.tgz",
-      "integrity": "sha512-kw0owk1o0GFETUJyW0jc0G4Yzs0BHZn0JDZ8JRT088vjJYX777BAs1fDGxAC+q831qOs2DTC96mNsG2opdfyyQ==",
-      "cpu": [
-        "riscv64"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "linux"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/linux-s390x": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.28.1.tgz",
-      "integrity": "sha512-/lAIjX8aYFRByhh6L5rYtPEDRqa9de/4V/juOXcta5frjvzXO4/sqEtyytse0g3zZFuWu5cDN0MkLz2qRDD2Ag==",
-      "cpu": [
-        "s390x"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "linux"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/linux-x64": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.28.1.tgz",
-      "integrity": "sha512-u/anNYF2mmVOEDwLtnQ1wOr3EZ9sTNGLWrsYGYwHWzGA3Si84IOkHXlbWTD1NB+9/1lcnweYKO54uhxZydNzfA==",
-      "cpu": [
-        "x64"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "linux"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/netbsd-arm64": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.28.1.tgz",
-      "integrity": "sha512-oks0DYbLwWMmaakTsCb+zL4E+aHRVLom9IJZOAthMQEPiQmydXHkziYEsGYRx0uNV/IjEKGAV941JzH02pflqw==",
-      "cpu": [
-        "arm64"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "netbsd"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/netbsd-x64": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.28.1.tgz",
-      "integrity": "sha512-aeL6lAnN89Hz43Mlh1G8ARasbuoYvSITDEx0tHh5b7jJnHcssqgjy9Yx430GDpmCa6OyrKoS0aNRjKundRizGg==",
-      "cpu": [
-        "x64"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "netbsd"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/openbsd-arm64": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.28.1.tgz",
-      "integrity": "sha512-MEFJe5C3R8pwXdZ5Y21oo6m7ePiS0d9pWucn99O/wvyJZChoIQKrQDxKrGeW8F5+T0okTHesAmDeiHDTIq0V/Q==",
-      "cpu": [
-        "arm64"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "openbsd"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/openbsd-x64": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.28.1.tgz",
-      "integrity": "sha512-i/ZLIOafE0Z8cI/XANJAixoJL/uRAoS2xOA3rb0xN+KK0K177cMAsQYkzHtBrtMXAKuAc7HGgcWiZ/sRC1Nxgw==",
-      "cpu": [
-        "x64"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "openbsd"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/openharmony-arm64": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/openharmony-arm64/-/openharmony-arm64-0.28.1.tgz",
-      "integrity": "sha512-ge+Z7EXFNt2BO1oAMsVpiQ8EwndV9i1xXerAeTIK7AtPs3bKFXQM7nlRxDSIUIMeueR1CNXxqztLzdNeReKBJg==",
-      "cpu": [
-        "arm64"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "openharmony"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/sunos-x64": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.28.1.tgz",
-      "integrity": "sha512-BEjgtECkL3vY+SaSQ6nzVfiALUeFxpawyp8Jmf5PtYhf1Ug40N1h/hxlhts+f1FvSvarEigdxS3BlSMI2PJLcQ==",
-      "cpu": [
-        "x64"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "sunos"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/win32-arm64": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.28.1.tgz",
-      "integrity": "sha512-lCv9eK/H6ZJWbE7bh2nw54CZ9M2nupBxJcTsdk/QQnWkdSjKGuxmmH8/GWrlT1eMmZfn4dGcCjRte397WqfQXA==",
-      "cpu": [
-        "arm64"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "win32"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/win32-ia32": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.28.1.tgz",
-      "integrity": "sha512-zvb/mB2bSCoJOpoCBgYKKpX6YM6mJBlBUVUtVj41DlZJVEB6/0CKlRYxP5wWl1C1ILiCoAU5wZZ4q1P3qeS6Eg==",
-      "cpu": [
-        "ia32"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "win32"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
-    "node_modules/@esbuild/win32-x64": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.28.1.tgz",
-      "integrity": "sha512-bm4Mowrv+GXMlpWX++EcXw/iLyd1o3+bJkC2DkWXYVvgZCqD/bSj9ctZeAMC3cIxgjRVR2Dufaiu4YPxr5gW1A==",
-      "cpu": [
-        "x64"
-      ],
-      "license": "MIT",
-      "optional": true,
-      "os": [
-        "win32"
-      ],
-      "peer": true,
-      "engines": {
-        "node": ">=18"
-      }
-    },
     "node_modules/@eslint-community/eslint-utils": {
       "version": "4.9.1",
       "resolved": "https://registry.npmjs.org/@eslint-community/eslint-utils/-/eslint-utils-4.9.1.tgz",
@@ -4662,49 +4220,6 @@
       "dev": true,
       "license": "MIT"
     },
-    "node_modules/esbuild": {
-      "version": "0.28.1",
-      "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.28.1.tgz",
-      "integrity": "sha512-HrJrvZv5ayxBzPfwphOoNzkzOIIlifzk0KJrGK2c8R4+LKpMtpYLQeUdjnwjWv/LZlkH2laZk+4w78pi99D4Vw==",
-      "hasInstallScript": true,
-      "license": "MIT",
-      "optional": true,
-      "peer": true,
-      "bin": {
-        "esbuild": "bin/esbuild"
-      },
-      "engines": {
-        "node": ">=18"
-      },
-      "optionalDependencies": {
-        "@esbuild/aix-ppc64": "0.28.1",
-        "@esbuild/android-arm": "0.28.1",
-        "@esbuild/android-arm64": "0.28.1",
-        "@esbuild/android-x64": "0.28.1",
-        "@esbuild/darwin-arm64": "0.28.1",
-        "@esbuild/darwin-x64": "0.28.1",
-        "@esbuild/freebsd-arm64": "0.28.1",
-        "@esbuild/freebsd-x64": "0.28.1",
-        "@esbuild/linux-arm": "0.28.1",
-        "@esbuild/linux-arm64": "0.28.1",
-        "@esbuild/linux-ia32": "0.28.1",
-        "@esbuild/linux-loong64": "0.28.1",
-        "@esbuild/linux-mips64el": "0.28.1",
-        "@esbuild/linux-ppc64": "0.28.1",
-        "@esbuild/linux-riscv64": "0.28.1",
-        "@esbuild/linux-s390x": "0.28.1",
-        "@esbuild/linux-x64": "0.28.1",
-        "@esbuild/netbsd-arm64": "0.28.1",
-        "@esbuild/netbsd-x64": "0.28.1",
-        "@esbuild/openbsd-arm64": "0.28.1",
-        "@esbuild/openbsd-x64": "0.28.1",
-        "@esbuild/openharmony-arm64": "0.28.1",
-        "@esbuild/sunos-x64": "0.28.1",
-        "@esbuild/win32-arm64": "0.28.1",
-        "@esbuild/win32-ia32": "0.28.1",
-        "@esbuild/win32-x64": "0.28.1"
-      }
-    },
     "node_modules/escalade": {
       "version": "3.2.0",
       "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz",
diff --git a/apps/frontend_llmops/src/components/AddModelDialog.vue b/apps/frontend_llmops/src/components/AddModelDialog.vue
index 300cfaf..f1eac8c 100644
--- a/apps/frontend_llmops/src/components/AddModelDialog.vue
+++ b/apps/frontend_llmops/src/components/AddModelDialog.vue
@@ -12,6 +12,7 @@ import { ApiError } from '@/lib/api'
 import { useModelsStore } from '@/stores/models'
 import { useResourcesStore } from '@/stores/resources'
 import { formatBytes } from '@/lib/utils'
+import { ROUTING_STRATEGIES, routingStrategyLabel } from '@/lib/routingStrategies'
 import type { CachedModel, DownloadJob, LoraAdapter, LoraModule, SettingValue } from '@/types/api'
 
 const open = defineModel<boolean>('open', { default: false })
@@ -37,6 +38,11 @@ const port = ref<number>(8000)
 const cudaDevice = ref<number | null>(null)
 const modelTag = ref('')
 const params = ref<{ key: string; value: string }[]>([])
+// Router-only load-balancing policy for the group. Lives in model_config but is
+// NOT a vLLM flag, so it's edited as its own field and kept out of the raw param
+// list (the launcher would otherwise reject it as an unknown `vllm serve` arg).
+// '' = inherit the global default.
+const routingStrategy = ref('')
 // LoRA adapters mounted at serve time; edited apart from the flat param list
 // because each is a {name, path, base_model_name} object, not a scalar flag.
 const loras = ref<LoraModule[]>([])
@@ -141,6 +147,7 @@ function reset() {
   cudaDevice.value = null
   modelTag.value = ''
   params.value = []
+  routingStrategy.value = ''
   loras.value = []
 }
 
@@ -150,12 +157,19 @@ function extractLoras(entries: [string, unknown][]): [string, unknown][] {
   loras.value = []
   const rest: [string, unknown][] = []
   for (const [k, v] of entries) {
-    if (k === 'lora_modules' && Array.isArray(v)) {
-      loras.value = (v as LoraModule[]).map((m) => ({
-        name: m.name ?? '',
-        path: m.path ?? '',
-        base_model_name: m.base_model_name ?? '',
-      }))
+    if (k === 'lora_modules') {
+      // `lora_modules` is always owned by the LoRA editor — never let it leak
+      // into the raw param list. The config endpoint reports `lora_modules: null`
+      // for models without adapters; if that fell through to `rest` it would be
+      // rendered as a param and re-submitted as the string "", which fails the
+      // backend's `Optional[list[LoraModule]]` validation.
+      loras.value = Array.isArray(v)
+        ? (v as LoraModule[]).map((m) => ({
+            name: m.name ?? '',
+            path: m.path ?? '',
+            base_model_name: m.base_model_name ?? '',
+          }))
+        : []
     } else {
       rest.push([k, v])
     }
@@ -175,8 +189,9 @@ function prefillForEdit() {
   port.value = cfg.port
   cudaDevice.value = cfg.cuda_device ?? null
   modelTag.value = String(cfg.settings.model_tag ?? '')
+  routingStrategy.value = String(cfg.settings.routing_strategy ?? '')
   params.value = extractLoras(
-    Object.entries(cfg.settings).filter(([k2]) => k2 !== 'model_tag'),
+    Object.entries(cfg.settings).filter(([k2]) => k2 !== 'model_tag' && k2 !== 'routing_strategy'),
   ).map(([k2, v]) => ({ key: k2, value: v === null ? '' : String(v) }))
   warnings.value = []
   parsed.value = true // skip the paste/parse step
@@ -209,8 +224,11 @@ async function parse() {
     port.value = p.instance.port
     cudaDevice.value = p.instance.cuda_device
     modelTag.value = String(p.model_config.model_tag ?? '')
+    routingStrategy.value = String(
+      (p.model_config as Record<string, unknown>).routing_strategy ?? '',
+    )
     params.value = extractLoras(
-      Object.entries(p.model_config).filter(([k]) => k !== 'model_tag'),
+      Object.entries(p.model_config).filter(([k]) => k !== 'model_tag' && k !== 'routing_strategy'),
     ).map(([k, v]) => ({ key: k, value: String(v) }))
     warnings.value = p.warnings
     parsed.value = true
@@ -365,8 +383,15 @@ async function submit() {
     model_tag: modelTag.value,
   }
   for (const { key: k, value } of params.value) {
-    if (k.trim()) settings[k.trim()] = coerce(value)
+    // `lora_modules` and `routing_strategy` have dedicated editors below — never
+    // let a raw param (e.g. a stray "" from a null, or a leaked router key) stomp
+    // them via the generic param list.
+    const kk = k.trim()
+    if (kk && kk !== 'lora_modules' && kk !== 'routing_strategy') settings[kk] = coerce(value)
   }
+  // Router-only load-balancing policy; '' inherits the global default, so only
+  // send it when explicitly chosen.
+  if (routingStrategy.value) settings.routing_strategy = routingStrategy.value
   // Mounted adapters: keep only filled rows; drop the empty base_model_name field.
   const cleanLoras = loras.value
     .filter((l) => l.name.trim() && l.path.trim())
@@ -494,6 +519,19 @@ async function submit() {
             <span class="text-xs text-muted-foreground">模型標籤 <span class="text-status-failed">*</span></span>
             <Input v-model="modelTag" class="mt-1 font-mono" placeholder="org/model" />
           </label>
+          <label class="col-span-2 block">
+            <span class="text-xs text-muted-foreground">路由策略（負載平衡）</span>
+            <select
+              v-model="routingStrategy"
+              class="mt-1 h-9 w-full rounded-md border border-input bg-background/40 px-2 text-sm"
+            >
+              <option value="">跟隨全域預設</option>
+              <option v-for="s in ROUTING_STRATEGIES" :key="s" :value="s">{{ routingStrategyLabel(s) }}</option>
+            </select>
+            <span class="mt-1 block text-[11px] text-muted-foreground">
+              此群組請求的分流方式;留空則跟隨全域設定（可在「流量」頁切換）。多副本才有效。
+            </span>
+          </label>
         </div>
 
         <!-- Weight cache status for the entered model_tag -->
diff --git a/apps/frontend_llmops/src/components/ModelDetailDrawer.vue b/apps/frontend_llmops/src/components/ModelDetailDrawer.vue
index 49acb35..3c55915 100644
--- a/apps/frontend_llmops/src/components/ModelDetailDrawer.vue
+++ b/apps/frontend_llmops/src/components/ModelDetailDrawer.vue
@@ -18,6 +18,7 @@ import { useAuth } from '@/composables/useAuth'
 import { api, ApiError } from '@/lib/api'
 import { toast } from '@/lib/toast'
 import { formatDuration, formatLatency, formatNumber, formatPercent, formatTime } from '@/lib/utils'
+import { routingStrategyLabel } from '@/lib/routingStrategies'
 import type { EmbeddingModelParams, LoraAdapter, ModelStartupMetrics, StateEvent } from '@/types/api'
 
 const open = defineModel<boolean>('open', { default: false })
@@ -63,14 +64,20 @@ const gpu = computed(() =>
 )
 const busy = computed(() => (props.modelKey ? models.pending.has(props.modelKey) : false))
 
-// Every vLLM parameter from model_config, shown generically (model_tag is
-// already surfaced in the header, so it's filtered out here).
+// Every vLLM parameter from model_config, shown generically. model_tag is in the
+// header; lora_modules has its own section; routing_strategy is a router-only knob
+// (not a vLLM flag), surfaced separately below.
 const vllmParams = computed(
   () =>
     Object.entries(engine.value?.settings ?? {}).filter(
-      ([k]) => k !== 'model_tag' && k !== 'lora_modules',
+      ([k]) => k !== 'model_tag' && k !== 'lora_modules' && k !== 'routing_strategy',
     ) as [string, string | number | boolean | null][],
 )
+// Router load-balancing policy for the group (shown apart from vLLM flags).
+const routingStrategy = computed(() => {
+  const s = engine.value?.settings?.routing_strategy
+  return typeof s === 'string' && s ? s : null
+})
 // LoRA adapters mounted on this group (rendered apart from the scalar params).
 const loras = computed(() => engine.value?.settings?.lora_modules ?? [])
 // Runtime (hot) LoRA load/unload is only possible when the model is running and
@@ -469,6 +476,17 @@ const eventColor: Record<string, string> = {
             </div>
           </div>
 
+          <!-- Routing policy (router-only, not a vLLM flag) -->
+          <div v-if="routingStrategy">
+            <p class="mb-2 text-xs font-medium uppercase tracking-wide text-muted-foreground">
+              路由策略（負載平衡）
+            </p>
+            <div class="rounded-lg border border-border/60 bg-background/40 px-3 py-2 text-sm">
+              {{ routingStrategyLabel(routingStrategy) }}
+              <span class="font-mono text-xs text-muted-foreground">（{{ routingStrategy }}）</span>
+            </div>
+          </div>
+
           <!-- Full vLLM model_config -->
           <div v-if="vllmParams.length">
             <p class="mb-2 text-xs font-medium uppercase tracking-wide text-muted-foreground">
diff --git a/apps/frontend_llmops/src/lib/api.ts b/apps/frontend_llmops/src/lib/api.ts
index a1d61e1..b4e9cf7 100644
--- a/apps/frontend_llmops/src/lib/api.ts
+++ b/apps/frontend_llmops/src/lib/api.ts
@@ -29,6 +29,7 @@ import type {
   RequestRow,
   ResourcesView,
   RouterMetrics,
+  RoutingInfo,
   SettingValue,
   StateEvent,
   UsageRow,
@@ -172,6 +173,15 @@ export const api = {
   routerModels: () => request<OpenAIModelList>(ROUTER_BASE, '/v1/models'),
   routerMetrics: () => request<RouterMetrics>(ROUTER_BASE, '/metrics'),
 
+  /** Current global load-balancing strategy + the selectable catalogue. */
+  getRouting: () => request<RoutingInfo>(ROUTER_BASE, '/routing'),
+  /** Hot-swap the global strategy (effective next request; not persisted). */
+  setRouting: (strategy: string) =>
+    request<{ strategy: string }>(ROUTER_BASE, '/routing', {
+      method: 'POST',
+      body: JSON.stringify({ strategy }),
+    }),
+
   /** SSE endpoint URL for the live model snapshot stream. */
   modelStreamUrl: () => `${API_BASE}/api/stream/models`,
 
diff --git a/apps/frontend_llmops/src/lib/routingStrategies.ts b/apps/frontend_llmops/src/lib/routingStrategies.ts
new file mode 100644
index 0000000..a09cee6
--- /dev/null
+++ b/apps/frontend_llmops/src/lib/routingStrategies.ts
@@ -0,0 +1,27 @@
+/** Router load-balancing strategies — shared by the Traffic page selector and the
+ *  model edit dialog. Keep in sync with router-server's STRATEGIES registry
+ *  (apps/router-server/src/llm_router/routing_strategies.py).
+ */
+export const ROUTING_STRATEGIES = [
+  'least_load',
+  'round_robin',
+  'random',
+  'least_inflight',
+  'p2c',
+  'session_affinity',
+  'prefix_affinity',
+] as const
+
+export type RoutingStrategy = (typeof ROUTING_STRATEGIES)[number]
+
+export const ROUTING_STRATEGY_LABELS: Record<string, string> = {
+  least_load: '最低負載（預設）',
+  round_robin: '輪詢',
+  random: '隨機',
+  least_inflight: '最少進行中',
+  p2c: '二選一取優',
+  session_affinity: '會話黏性',
+  prefix_affinity: '前綴黏性',
+}
+
+export const routingStrategyLabel = (s: string) => ROUTING_STRATEGY_LABELS[s] ?? s
diff --git a/apps/frontend_llmops/src/types/api.ts b/apps/frontend_llmops/src/types/api.ts
index a0ebf4f..591198d 100644
--- a/apps/frontend_llmops/src/types/api.ts
+++ b/apps/frontend_llmops/src/types/api.ts
@@ -172,6 +172,13 @@ export interface InstanceMetrics {
 }
 export type RouterMetrics = Record<string, Record<string, InstanceMetrics>>
 
+/** Global load-balancing strategy state from the router's GET /routing. */
+export interface RoutingInfo {
+  strategy: string
+  available: string[]
+  default: string
+}
+
 export interface OpenAIModelList {
   object: string
   // `parent` is present only for LoRA adapters (points at the base group).
diff --git a/apps/frontend_llmops/src/views/TrafficView.vue b/apps/frontend_llmops/src/views/TrafficView.vue
index fc7df2e..f00313b 100644
--- a/apps/frontend_llmops/src/views/TrafficView.vue
+++ b/apps/frontend_llmops/src/views/TrafficView.vue
@@ -1,6 +1,6 @@
 <script setup lang="ts">
-import { computed } from 'vue'
-import { RefreshCw } from '@lucide/vue'
+import { computed, onMounted, ref } from 'vue'
+import { Info, RefreshCw } from '@lucide/vue'
 import { useTrafficStore } from '@/stores/traffic'
 import { useModelsStore } from '@/stores/models'
 import Card from '@/components/ui/Card.vue'
@@ -10,11 +10,62 @@ import CardContent from '@/components/ui/CardContent.vue'
 import Badge from '@/components/ui/Badge.vue'
 import Button from '@/components/ui/Button.vue'
 import RouterFanDiagram from '@/components/RouterFanDiagram.vue'
+import { api, ApiError } from '@/lib/api'
+import { toast } from '@/lib/toast'
+import { routingStrategyLabel } from '@/lib/routingStrategies'
 import { formatLatency, formatNumber } from '@/lib/utils'
 
 const traffic = useTrafficStore()
 const models = useModelsStore()
 
+// ---- Router load-balancing strategy (global, hot-swappable) ----
+// One-line "best for" per strategy, shown in the help card.
+const STRATEGY_INFO: Record<string, string> = {
+  least_load: '請求長短不一、要平均各副本飽和度。通用安全的預設。',
+  round_robin: '同質 GPU、請求差異不大,或想要可預測的均分 / 當 baseline。',
+  random: '大量短請求、要最低決策成本的無狀態分流。',
+  least_inflight: '請求耗時相近,且不想被 ~1 秒 metrics 抓取延遲影響時。',
+  p2c: '突發流量下,想避免大家一窩蜂衝向同一個「目前最閒」的副本。',
+  session_affinity: '多輪對話 / Playground:同一會話黏同一台,提升 KV cache 重用（需帶 X-Session-Id 或 user,否則退回最低負載）。',
+  prefix_affinity: '固定 system prompt、RAG / few-shot 模板等高前綴重複率的請求。',
+}
+const strategy = ref<string>('')
+const strategies = ref<string[]>([])
+const savingStrategy = ref(false)
+const showStrategyHelp = ref(false)
+const strategyLabel = routingStrategyLabel
+const strategyInfo = (s: string) => STRATEGY_INFO[s] ?? ''
+
+onMounted(async () => {
+  try {
+    const r = await api.getRouting()
+    strategy.value = r.strategy
+    strategies.value = r.available
+  } catch {
+    /* router may be unreachable; the selector just stays empty */
+  }
+})
+
+async function onStrategyChange(e: Event) {
+  const next = (e.target as HTMLSelectElement).value
+  const prev = strategy.value
+  savingStrategy.value = true
+  try {
+    const r = await api.setRouting(next)
+    strategy.value = r.strategy
+    toast.success(`路由策略已切換為「${strategyLabel(r.strategy)}」`, {
+      description: '下一個請求起生效。未持久化，router 重啟後回到預設。',
+    })
+  } catch (e) {
+    strategy.value = prev // revert the <select> to the real value
+    toast.error('切換路由策略失敗', {
+      description: e instanceof ApiError ? e.message : String(e),
+    })
+  } finally {
+    savingStrategy.value = false
+  }
+}
+
 // Max p95 across rows → scale the latency bars.
 const maxP95 = computed(() => Math.max(1, ...traffic.usage.map((u) => u.p95_latency_ms)))
 
@@ -107,13 +158,63 @@ function onFilter(e: Event) {
 
     <!-- Router load: load-balancing fan per group -->
     <Card>
-      <CardHeader>
-        <CardTitle>路由器負載均衡</CardTitle>
-        <p class="text-xs text-muted-foreground">
-          線條粗細 = 實際流量佔比（來自請求記錄）· ★ = 路由器下次選擇的最低分實例
-        </p>
+      <CardHeader class="flex-row items-start justify-between gap-4">
+        <div>
+          <CardTitle>路由器負載均衡</CardTitle>
+          <p class="mt-1 text-xs text-muted-foreground">
+            線條粗細 = 實際流量佔比（來自請求記錄）· ★ = 路由器下次選擇的最低分實例
+          </p>
+        </div>
+        <div class="flex shrink-0 items-center gap-2">
+          <label class="flex items-center gap-2 text-xs text-muted-foreground">
+            策略
+            <select
+              class="h-8 rounded-md border border-input bg-background/40 px-2 text-xs text-foreground disabled:opacity-50"
+              :value="strategy"
+              :disabled="savingStrategy || !strategies.length"
+              @change="onStrategyChange"
+            >
+              <option v-if="!strategy" value="">—</option>
+              <option v-for="s in strategies" :key="s" :value="s">{{ strategyLabel(s) }}</option>
+            </select>
+          </label>
+          <button
+            type="button"
+            class="flex size-7 items-center justify-center rounded-md border border-input text-muted-foreground transition-colors hover:bg-accent hover:text-foreground"
+            :class="showStrategyHelp && 'bg-accent text-foreground'"
+            :aria-pressed="showStrategyHelp"
+            title="各策略適合的場景"
+            @click="showStrategyHelp = !showStrategyHelp"
+          >
+            <Info class="size-4" />
+          </button>
+        </div>
       </CardHeader>
       <CardContent class="space-y-3">
+        <!-- Strategy reference: which one suits which scenario -->
+        <div
+          v-if="showStrategyHelp"
+          class="rounded-lg border border-border/60 bg-muted/30 p-3 text-xs"
+        >
+          <p class="mb-2 font-medium text-foreground">各策略適合的場景</p>
+          <ul class="space-y-1.5">
+            <li
+              v-for="s in strategies"
+              :key="s"
+              class="flex gap-2"
+              :class="s === strategy ? 'text-foreground' : 'text-muted-foreground'"
+            >
+              <span
+                class="mt-0.5 shrink-0 rounded px-1.5 py-0.5 font-mono text-[10px]"
+                :class="s === strategy ? 'bg-[var(--chart-1)]/20 text-[var(--chart-1)]' : 'bg-muted text-muted-foreground'"
+              >{{ strategyLabel(s) }}</span>
+              <span>{{ strategyInfo(s) }}</span>
+            </li>
+          </ul>
+          <p class="mt-2 text-[11px] text-muted-foreground/80">
+            此下拉切換的是全域預設;在 config.yaml 為某群組設定 routing_strategy 會覆寫此處。
+          </p>
+        </div>
         <RouterFanDiagram v-for="g in llmGroups" :key="g" :group="g" />
         <p v-if="!llmGroups.length" class="py-6 text-center text-sm text-muted-foreground">
           尚未設定 LLM 群組。
diff --git a/apps/router-server/src/llm_router/backend_selector.py b/apps/router-server/src/llm_router/backend_selector.py
index e1e786d..65fa8fc 100644
--- a/apps/router-server/src/llm_router/backend_selector.py
+++ b/apps/router-server/src/llm_router/backend_selector.py
@@ -1,14 +1,12 @@
-from typing import Any, Dict, Optional, Set
-
-from fastapi import HTTPException
-import logging
+"""Backward-compatible least-load selector.
 
-from src.llm_router.backend_runtime_state import (FAIL_OPEN_PENALTY,
-                                                  INFLIGHT_WEIGHT,
-                                                  get_inflight,
-                                                  is_backend_in_cooldown)
+The routing policy is now pluggable (see routing_strategies.py). This module keeps
+the original entry point as a thin wrapper over the `least_load` strategy so
+existing imports and tests keep working unchanged.
+"""
+from typing import Any, Dict, Optional, Set
 
-logger = logging.getLogger(__name__)
+from src.llm_router.routing_strategies import select_instance
 
 
 async def select_instance_least_load(
@@ -17,71 +15,10 @@ async def select_instance_least_load(
     model_cfg: Dict[str, Any],
     exclude: Optional[Set[str]] = None,
 ) -> Dict[str, Any]:
-    """Pick the least-loaded instance for a model group.
+    """Pick the least-loaded instance for a model group (the historical default).
 
-    `exclude` is the set of instance ids already tried this request; it lets the
-    proxy fail over to the next-best backend without re-picking a dead one.
+    Equivalent to `select_instance(..., strategy="least_load")`.
     """
-    instances = model_cfg.get("instances", [])
-    if not instances:
-        raise HTTPException(status_code=500, detail=f"Model '{model_key}' has no instances configured.")
-
-    exclude = exclude or set()
-    candidates = [i for i in instances if i["id"] not in exclude]
-    if not candidates:
-        raise HTTPException(
-            status_code=503,
-            detail=f"No remaining instance to try for model '{model_key}'.",
-        )
-
-    if len(candidates) == 1:
-        return candidates[0]
-
-    metrics_map = app.state.metrics_cache.get(model_key, {})
-    best_instance = None
-    best_score = float("inf")
-
-    for instance in candidates:
-        instance_id = instance["id"]
-        metric = metrics_map.get(instance_id)
-
-        # metric is None only before the first scrape lands (fresh start or a
-        # just-reloaded group). Don't skip — that would 500 the whole request
-        # during the cold-start window. Treat unknown load as idle (0) and let
-        # the inflight penalty spread requests until real metrics arrive.
-        if metric is None:
-            logger.warning(
-                "No cached metrics for model=%s backend=%s; assuming idle.",
-                model_key,
-                instance_id,
-            )
-            base_score = 0.0
-        else:
-            base_score = metric.compute_load_score()
-
-        inflight = get_inflight(app, model_key, instance_id)
-        inflight_penalty = inflight * INFLIGHT_WEIGHT
-
-        cooldown_penalty = FAIL_OPEN_PENALTY if is_backend_in_cooldown(app, model_key, instance_id) else 0.0
-
-        final_score = base_score + inflight_penalty + cooldown_penalty
-
-        # debug, not info: this runs per-instance per-request — at INFO it floods
-        # the single event loop with string formatting + stdout writes under load.
-        logger.debug(
-            "Instance %s has load score %.4f, base_score=%.4f, inflight_penalty=%.4f, cooldown_penalty=%.4f",
-            instance_id,
-            final_score,
-            base_score,
-            inflight_penalty,
-            cooldown_penalty,
-        )
-        
-        if final_score < best_score:
-            best_score = final_score
-            best_instance = instance
-
-    if best_instance is None:
-        raise HTTPException(status_code=500, detail=f"No suitable instance found for model '{model_key}'.")
-
-    return best_instance
\ No newline at end of file
+    return await select_instance(
+        app, model_key, model_cfg, strategy="least_load", exclude=exclude
+    )
diff --git a/apps/router-server/src/llm_router/main.py b/apps/router-server/src/llm_router/main.py
index a4382db..6819148 100644
--- a/apps/router-server/src/llm_router/main.py
+++ b/apps/router-server/src/llm_router/main.py
@@ -11,6 +11,7 @@
 from src.llm_router.overlay import load_config_with_overlay
 from src.llm_router.metrics_poller import poll_metrics_forever
 from src.llm_router.router import router
+from src.llm_router.routing_strategies import DEFAULT_STRATEGY
 from src.llm_router.store import LLMOpsStore
 from src.llm_router.vllm_metrics_client import VLLMMetricsClient
 
@@ -48,6 +49,10 @@ async def lifespan(app: FastAPI):
     app.state.metrics_cache = {}
     app.state.backend_inflight = {}
     app.state.backend_health = {}
+    # Routing policy: global default (per-group overrides ride model_config), plus
+    # the round-robin cursor map. See routing_strategies.py.
+    app.state.routing_strategy = os.environ.get("LLMOPS_ROUTING_STRATEGY", DEFAULT_STRATEGY)
+    app.state.rr_counters = {}
 
     # Shared telemetry DB (same file the dashboard backend reads). LLMOPS_DB_PATH
     # must match the backend; default resolves to <repo>/data/llmops.db.
diff --git a/apps/router-server/src/llm_router/router.py b/apps/router-server/src/llm_router/router.py
index d0079bb..292d22b 100644
--- a/apps/router-server/src/llm_router/router.py
+++ b/apps/router-server/src/llm_router/router.py
@@ -11,7 +11,8 @@
                                                   mark_backend_failure,
                                                   mark_backend_success)
 from src.llm_router.auth import authenticate
-from src.llm_router.backend_selector import select_instance_least_load
+from src.llm_router.routing_strategies import (DEFAULT_STRATEGY, STRATEGIES,
+                                               select_instance)
 from src.llm_router.lora import iter_models, resolve_model
 from src.llm_router.overlay import load_config_with_overlay
 
@@ -34,6 +35,35 @@ async def reload_config(request: Request):
     return {"status": "reloaded", "groups": groups}
 
 
+@router.get("/routing")
+async def get_routing(request: Request):
+    """Current global routing strategy + the selectable catalogue.
+
+    Note: a per-group `model_config.routing_strategy` still overrides this global
+    value for that group; this endpoint reads/sets the global default only."""
+    return {
+        "strategy": getattr(request.app.state, "routing_strategy", DEFAULT_STRATEGY),
+        "available": sorted(STRATEGIES),
+        "default": DEFAULT_STRATEGY,
+    }
+
+
+@router.post("/routing")
+async def set_routing(request: Request):
+    """Hot-swap the global routing strategy (takes effect on the next request, no
+    reload). Not persisted — restarts fall back to LLMOPS_ROUTING_STRATEGY."""
+    body = await request.json()
+    name = body.get("strategy") if isinstance(body, dict) else None  # non-object JSON -> 400, not 500
+    if not isinstance(name, str) or name not in STRATEGIES:  # str guard: unhashable values would raise TypeError
+        raise HTTPException(
+            status_code=400,
+            detail=f"unknown strategy {name!r}; valid: {sorted(STRATEGIES)}",
+        )
+    request.app.state.routing_strategy = name
+    logger.info("Routing strategy set to %s via /routing", name)
+    return {"strategy": name}
+
+
 def _usage_from_body(body) -> dict | None:
     """Pull an OpenAI `usage` block out of a buffered JSON response body."""
     if body is None:
@@ -74,6 +104,70 @@ def _scan_sse_for_usage(buffer: bytes, captured: dict) -> bytes:
     return buffer
 
 
+def _resolve_strategy(app, model_cfg: dict) -> str:
+    """Pick the routing strategy: per-group override > global setting > default.
+
+    The per-group override rides the group's `model_config` (EngineModelConfig is
+    `extra="allow"`, so `routing_strategy` passes through with no schema change).
+    The global value lives in app.state.routing_strategy (hot-swappable via POST /routing).
+    """
+    mc = model_cfg.get("model_config") or {}
+    return (
+        mc.get("routing_strategy")
+        or getattr(app.state, "routing_strategy", None)
+        or DEFAULT_STRATEGY
+    )
+
+
+def _session_key(request: Request, body: dict) -> str | None:
+    """Affinity key for session_affinity: X-Session-Id header, else OpenAI `user`."""
+    sid = request.headers.get("x-session-id")
+    if sid:
+        return sid
+    user = body.get("user")
+    return user if isinstance(user, str) and user else None
+
+
+def _content_text(content) -> str:
+    """Flatten an OpenAI message `content` (str or multimodal parts) to text."""
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        return " ".join(
+            p.get("text", "")
+            for p in content
+            if isinstance(p, dict) and p.get("type") == "text"
+        )
+    return ""
+
+
+def _prompt_prefix(body: dict, limit: int = 512) -> str | None:
+    """Affinity key for prefix_affinity: the leading prompt text, bounded.
+
+    Chat: `role:content` of the first messages until `limit` chars. Completions:
+    the `prompt` (or its first element). Bounded so the hot path stays cheap.
+    """
+    msgs = body.get("messages")
+    if isinstance(msgs, list) and msgs:
+        out: list[str] = []
+        total = 0
+        for m in msgs:
+            if not isinstance(m, dict):
+                continue
+            piece = f"{m.get('role', '')}:{_content_text(m.get('content'))}"
+            out.append(piece)
+            total += len(piece)
+            if total >= limit:
+                break
+        return ("\n".join(out)[:limit]) or None
+    prompt = body.get("prompt")
+    if isinstance(prompt, str):
+        return prompt[:limit] or None
+    if isinstance(prompt, list) and prompt and isinstance(prompt[0], str):
+        return prompt[0][:limit] or None
+    return None
+
+
 async def _record_request(app, model_key, instance_id, path, status_code, started,
                           usage=None, error=None, api_key_name=None):
     """Persist one request log row to the shared store. Best-effort, non-blocking
@@ -155,9 +249,17 @@ async def _proxy_to_backend(request: Request, upstream_path: str, api_key_name=N
         tried: set[str] = set()
         response = None
 
+        # Routing policy + the inputs the affinity strategies need (computed once;
+        # key extraction is skipped for strategies that don't use it).
+        strategy_name = _resolve_strategy(request.app, model_cfg)
+        session_key = _session_key(request, request_json) if strategy_name == "session_affinity" else None
+        prompt_prefix = _prompt_prefix(request_json) if strategy_name == "prefix_affinity" else None
+
         for attempt in range(max_attempts):
-            instance = await select_instance_least_load(
-                app=request.app, model_key=model_key, model_cfg=model_cfg, exclude=tried,
+            instance = await select_instance(
+                request.app, model_key, model_cfg,
+                strategy=strategy_name, exclude=tried,
+                session_key=session_key, prompt_prefix=prompt_prefix,
             )
             instance_id = instance["id"]
             tried.add(instance_id)
diff --git a/apps/router-server/src/llm_router/routing_strategies.py b/apps/router-server/src/llm_router/routing_strategies.py
new file mode 100644
index 0000000..1fde3b8
--- /dev/null
+++ b/apps/router-server/src/llm_router/routing_strategies.py
@@ -0,0 +1,245 @@
+"""Pluggable request-routing strategies.
+
+The router proxies each OpenAI request to one backend instance of the resolved
+model group. *Which* instance is chosen is a swappable policy: this module owns
+the catalogue of strategies and the registry to look one up by name.
+
+A strategy only *picks a candidate*. All the resilient-routing machinery — the
+in-flight accounting, the failover loop across instances, the per-backend
+cooldown — stays in the proxy (`router.py`) and is shared by every strategy.
+Strategies therefore stay small and nearly side-effect-free: given the live cluster
+state they return one instance dict (only `round_robin` advances a per-group cursor).
+
+`least_load` reuses the exact score the router has always used (waiting/running/
+kv-cache load + in-flight penalty + cooldown fail-open), so it is byte-for-byte
+the historical behaviour and remains the default.
+
+See docs/routing-strategies.md for the design.
+"""
+from __future__ import annotations
+
+import hashlib
+import logging
+import os
+import random
+from dataclasses import dataclass, field
+from typing import Any, Callable, Optional
+
+from fastapi import HTTPException
+
+from src.llm_router.backend_runtime_state import (FAIL_OPEN_PENALTY,
+                                                  INFLIGHT_WEIGHT, get_inflight,
+                                                  is_backend_in_cooldown)
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_STRATEGY = "least_load"
+
+# How much extra load we tolerate on an affinity "home" instance before giving up
+# the cache-reuse benefit and spreading to the least-loaded replica instead.
+AFFINITY_LOAD_MARGIN = float(os.environ.get("LLMOPS_AFFINITY_LOAD_MARGIN", "50.0"))
+
+
+# --------------------------------------------------------------------------- #
+# Scoring (shared by least_load / least_inflight / p2c / affinity fallbacks)
+# --------------------------------------------------------------------------- #
+def score_instance(app: Any, model_key: str, instance: dict) -> float:
+    """Load-aware score for one instance; lower is less loaded.
+
+    base load (from the ~1s metrics scrape) + in-flight penalty + cooldown
+    fail-open penalty. A missing metric (cold start / just-reloaded group) counts
+    as idle (0) rather than skipping the instance, so the first request after a
+    start doesn't 500 — the in-flight penalty still spreads load until real
+    metrics land. This mirrors the original `select_instance_least_load`.
+    """
+    instance_id = instance["id"]
+    metric = app.state.metrics_cache.get(model_key, {}).get(instance_id)
+    if metric is None:
+        logger.warning(
+            "No cached metrics for model=%s backend=%s; assuming idle.",
+            model_key, instance_id,
+        )
+        base = 0.0
+    else:
+        base = metric.compute_load_score()
+    inflight_penalty = get_inflight(app, model_key, instance_id) * INFLIGHT_WEIGHT
+    cooldown_penalty = (
+        FAIL_OPEN_PENALTY if is_backend_in_cooldown(app, model_key, instance_id) else 0.0
+    )
+    return base + inflight_penalty + cooldown_penalty
+
+
+def inflight_score(app: Any, model_key: str, instance: dict) -> float:
+    """Cheaper score that ignores the metrics scrape: in-flight + cooldown only."""
+    instance_id = instance["id"]
+    inflight_penalty = get_inflight(app, model_key, instance_id) * INFLIGHT_WEIGHT
+    cooldown_penalty = (
+        FAIL_OPEN_PENALTY if is_backend_in_cooldown(app, model_key, instance_id) else 0.0
+    )
+    return inflight_penalty + cooldown_penalty
+
+
+def _least_by(score: Callable[[dict], float], candidates: list[dict]) -> dict:
+    """min() that returns the first candidate on a tie (stable, like the old `<`)."""
+    best = candidates[0]
+    best_score = score(best)
+    for inst in candidates[1:]:
+        s = score(inst)
+        if s < best_score:
+            best, best_score = inst, s
+    return best
+
+
+# --------------------------------------------------------------------------- #
+# Context + strategy type
+# --------------------------------------------------------------------------- #
+@dataclass
+class SelectContext:
+    app: Any
+    model_key: str
+    candidates: list[dict]            # eligible this request (exclude already removed)
+    all_instances: list[dict] = field(default_factory=list)  # full group roster
+    session_key: Optional[str] = None
+    prompt_prefix: Optional[str] = None
+
+
+Strategy = Callable[[SelectContext], dict]
+
+
+# --------------------------------------------------------------------------- #
+# Strategies
+# --------------------------------------------------------------------------- #
+def _round_robin(ctx: SelectContext) -> dict:
+    counters: dict[str, int] = ctx.app.state.rr_counters  # cursor per model key, kept on app state
+    n = counters.get(ctx.model_key, 0)
+    counters[ctx.model_key] = n + 1  # advance on every call so the next pick moves on
+    return ctx.candidates[n % len(ctx.candidates)]  # wrap over this call's eligible candidates
+
+
+def _random(ctx: SelectContext) -> dict:
+    return random.choice(ctx.candidates)
+
+
+def _least_inflight(ctx: SelectContext) -> dict:
+    return _least_by(lambda i: inflight_score(ctx.app, ctx.model_key, i), ctx.candidates)
+
+
+def _least_load(ctx: SelectContext) -> dict:
+    return _least_by(lambda i: score_instance(ctx.app, ctx.model_key, i), ctx.candidates)
+
+
+def _p2c(ctx: SelectContext) -> dict:
+    """Power-of-two-choices: sample two distinct candidates, keep the less loaded.
+
+    Avoids the thundering herd of everyone picking the single global-minimum during
+    the ~1s window where the scrape is stale.
+    """
+    picks = random.sample(ctx.candidates, 2)  # caller guarantees len >= 2
+    return _least_by(lambda i: score_instance(ctx.app, ctx.model_key, i), picks)
+
+
+def _hash_key(key: str) -> int:
+    # sha1, not builtin hash(): the latter is per-process salted, so the same key
+    # would map to different replicas across workers/restarts.
+    return int(hashlib.sha1(key.encode("utf-8")).hexdigest(), 16)
+
+
+def _affinity(ctx: SelectContext, key: Optional[str]) -> dict:
+    """Sticky routing with a load escape valve.
+
+    Map `key` deterministically to a home instance over the *full* roster (a stable
+    set, unlike `candidates` which shrinks on failover). Keep the home replica
+    unless it's missing/excluded/in cooldown or its load exceeds the least-loaded
+    candidate by more than the margin — then spread like `least_load`. With no key,
+    degrade straight to `least_load`, so affinity is never worse than the default.
+    """
+    if not key:
+        return _least_load(ctx)
+
+    roster = ctx.all_instances or ctx.candidates
+    ring = sorted(roster, key=lambda i: i["id"])  # order by id so the mapping ignores config order
+    home = ring[_hash_key(key) % len(ring)]  # modulo placement: depends on the roster size
+
+    cand_ids = {i["id"] for i in ctx.candidates}
+    best = _least_by(lambda i: score_instance(ctx.app, ctx.model_key, i), ctx.candidates)
+    if (
+        home["id"] in cand_ids
+        and not is_backend_in_cooldown(ctx.app, ctx.model_key, home["id"])
+    ):
+        home_score = score_instance(ctx.app, ctx.model_key, home)
+        best_score = score_instance(ctx.app, ctx.model_key, best)  # NOTE(review): re-scores `best`; _least_by already computed it
+        if home_score <= best_score + AFFINITY_LOAD_MARGIN:
+            return home
+    return best  # home excluded, cooling down, or too loaded: use the least-loaded candidate
+
+
+def _session_affinity(ctx: SelectContext) -> dict:
+    return _affinity(ctx, ctx.session_key)
+
+
+def _prefix_affinity(ctx: SelectContext) -> dict:
+    return _affinity(ctx, ctx.prompt_prefix)
+
+
+STRATEGIES: dict[str, Strategy] = {
+    "round_robin": _round_robin,
+    "random": _random,
+    "least_inflight": _least_inflight,
+    "least_load": _least_load,
+    "p2c": _p2c,
+    "session_affinity": _session_affinity,
+    "prefix_affinity": _prefix_affinity,
+}
+
+
+# --------------------------------------------------------------------------- #
+# Public dispatcher
+# --------------------------------------------------------------------------- #
+async def select_instance(
+    app: Any,
+    model_key: str,
+    model_cfg: dict,
+    *,
+    strategy: Optional[str] = None,
+    exclude: Optional[set[str]] = None,
+    session_key: Optional[str] = None,
+    prompt_prefix: Optional[str] = None,
+) -> dict:
+    """Pick one instance for `model_key` using the named strategy.
+
+    `exclude` is the set of instance ids already tried this request, so the proxy
+    can fail over to the next-best backend without re-picking a dead one. Resilient
+    machinery (in-flight, failover, cooldown) lives in the caller; this only picks.
+    """
+    instances = model_cfg.get("instances", [])
+    if not instances:
+        raise HTTPException(
+            status_code=500, detail=f"Model '{model_key}' has no instances configured."
+        )
+
+    exclude = exclude or set()
+    candidates = [i for i in instances if i["id"] not in exclude]
+    if not candidates:
+        raise HTTPException(
+            status_code=503,
+            detail=f"No remaining instance to try for model '{model_key}'.",
+        )
+    if len(candidates) == 1:
+        return candidates[0]
+
+    name = strategy or DEFAULT_STRATEGY
+    fn = STRATEGIES.get(name)
+    if fn is None:
+        logger.warning("Unknown routing strategy %r; using %s.", name, DEFAULT_STRATEGY)
+        fn = STRATEGIES[DEFAULT_STRATEGY]
+
+    ctx = SelectContext(
+        app=app,
+        model_key=model_key,
+        candidates=candidates,
+        all_instances=instances,
+        session_key=session_key,
+        prompt_prefix=prompt_prefix,
+    )
+    chosen = fn(ctx)
+    return chosen or candidates[0]
diff --git a/apps/router-server/tests/conftest.py b/apps/router-server/tests/conftest.py
index f193bbf..886a1a8 100644
--- a/apps/router-server/tests/conftest.py
+++ b/apps/router-server/tests/conftest.py
@@ -19,12 +19,20 @@ def make_app():
     backend_health, so we don't need a real FastAPI instance.
     """
 
-    def _make(metrics_cache=None, backend_inflight=None, backend_health=None):
+    def _make(
+        metrics_cache=None,
+        backend_inflight=None,
+        backend_health=None,
+        rr_counters=None,
+        routing_strategy="least_load",
+    ):
         return SimpleNamespace(
             state=SimpleNamespace(
                 metrics_cache=metrics_cache or {},
                 backend_inflight=backend_inflight or {},
                 backend_health=backend_health or {},
+                rr_counters=rr_counters if rr_counters is not None else {},
+                routing_strategy=routing_strategy,
             )
         )
 
diff --git a/apps/router-server/tests/unit/test_routing_endpoint.py b/apps/router-server/tests/unit/test_routing_endpoint.py
new file mode 100644
index 0000000..c95e8f4
--- /dev/null
+++ b/apps/router-server/tests/unit/test_routing_endpoint.py
@@ -0,0 +1,43 @@
+"""GET/POST /routing — read + hot-swap the global routing strategy."""
+import pytest
+from fastapi import FastAPI
+from fastapi.testclient import TestClient
+
+from src.llm_router.router import router as llm_router
+from src.llm_router.routing_strategies import DEFAULT_STRATEGY, STRATEGIES
+
+pytestmark = pytest.mark.unit
+
+
+@pytest.fixture
+def client():
+    app = FastAPI()
+    app.include_router(llm_router)
+    app.state.routing_strategy = DEFAULT_STRATEGY
+    app.state.rr_counters = {}
+    return TestClient(app)
+
+
+def test_get_routing_reports_current_and_catalogue(client):
+    r = client.get("/routing")
+    assert r.status_code == 200
+    body = r.json()
+    assert body["strategy"] == DEFAULT_STRATEGY
+    assert body["default"] == DEFAULT_STRATEGY
+    assert set(body["available"]) == set(STRATEGIES)
+
+
+def test_post_routing_hot_swaps_and_persists_in_state(client):
+    r = client.post("/routing", json={"strategy": "session_affinity"})
+    assert r.status_code == 200
+    assert r.json()["strategy"] == "session_affinity"
+    # State updated, so a subsequent read reflects it.
+    assert client.app.state.routing_strategy == "session_affinity"
+    assert client.get("/routing").json()["strategy"] == "session_affinity"
+
+
+def test_post_routing_rejects_unknown_strategy(client):
+    r = client.post("/routing", json={"strategy": "bogus"})
+    assert r.status_code == 400
+    # The global strategy is unchanged after a rejected swap.
+    assert client.app.state.routing_strategy == DEFAULT_STRATEGY
diff --git a/apps/router-server/tests/unit/test_routing_strategies.py b/apps/router-server/tests/unit/test_routing_strategies.py
new file mode 100644
index 0000000..a06ba77
--- /dev/null
+++ b/apps/router-server/tests/unit/test_routing_strategies.py
@@ -0,0 +1,164 @@
+import time
+
+import pytest
+from fastapi import HTTPException
+
+from src.llm_router.backend_runtime_state import make_backend_key
+from src.llm_router import routing_strategies as rs
+from src.llm_router.routing_strategies import (SelectContext, score_instance,
+                                               select_instance)
+from src.llm_router.vllm_metrics_client import VLLMInstanceMetrics
+
+pytestmark = pytest.mark.unit
+
+MODEL = "Qwen3-0.6B"
+INSTANCES = [{"id": "a", "port": 8002}, {"id": "b", "port": 8004}, {"id": "c", "port": 8006}]
+
+
+def metric(running=0.0, waiting=0.0, kv=0.0):
+    return VLLMInstanceMetrics(base_url="x", running=running, waiting=waiting, kv_cache_usage_perc=kv)
+
+
+def ctx(app, candidates=None, all_instances=None, **kw):
+    cand = candidates if candidates is not None else INSTANCES
+    return SelectContext(
+        app=app, model_key=MODEL, candidates=cand,
+        all_instances=all_instances if all_instances is not None else INSTANCES, **kw,
+    )
+
+
+# --------------------------------------------------------------------------- #
+# dispatcher
+# --------------------------------------------------------------------------- #
+async def test_no_instances_raises_500(make_app):
+    with pytest.raises(HTTPException) as exc:
+        await select_instance(make_app(), MODEL, {"instances": []})
+    assert exc.value.status_code == 500
+
+
+async def test_all_excluded_raises_503(make_app):
+    with pytest.raises(HTTPException) as exc:
+        await select_instance(make_app(), MODEL, {"instances": INSTANCES}, exclude={"a", "b", "c"})
+    assert exc.value.status_code == 503
+
+
+async def test_single_candidate_shortcut_ignores_strategy(make_app):
+    # Even with an affinity strategy and no key, one candidate is returned directly.
+    res = await select_instance(
+        make_app(), MODEL, {"instances": [{"id": "solo", "port": 8000}]},
+        strategy="session_affinity",
+    )
+    assert res["id"] == "solo"
+
+
+async def test_unknown_strategy_falls_back_to_least_load(make_app):
+    metrics = {MODEL: {"a": metric(running=10), "b": metric(running=1), "c": metric(running=5)}}
+    res = await select_instance(
+        make_app(metrics_cache=metrics), MODEL, {"instances": INSTANCES}, strategy="bogus",
+    )
+    assert res["id"] == "b"
+
+
+async def test_exclude_removes_candidate(make_app):
+    metrics = {MODEL: {"a": metric(running=1), "b": metric(running=0), "c": metric(running=5)}}
+    res = await select_instance(
+        make_app(metrics_cache=metrics), MODEL, {"instances": INSTANCES},
+        strategy="least_load", exclude={"b"},
+    )
+    assert res["id"] == "a"
+
+
+# --------------------------------------------------------------------------- #
+# least_load / least_inflight
+# --------------------------------------------------------------------------- #
+async def test_least_load_picks_lowest(make_app):
+    metrics = {MODEL: {"a": metric(running=10), "b": metric(running=1), "c": metric(kv=0.5)}}
+    res = rs._least_load(ctx(make_app(metrics_cache=metrics)))
+    assert res["id"] == "b"
+
+
+async def test_least_inflight_ignores_metrics(make_app):
+    # 'a' is the most loaded by metrics but has zero in-flight; least_inflight
+    # ignores the scrape, so it wins over backends carrying in-flight requests.
+    metrics = {MODEL: {"a": metric(running=99), "b": metric(running=0), "c": metric(running=0)}}
+    app = make_app(
+        metrics_cache=metrics,
+        backend_inflight={make_backend_key(MODEL, "b"): 1, make_backend_key(MODEL, "c"): 2},
+    )
+    assert rs._least_inflight(ctx(app))["id"] == "a"
+
+
+async def test_cooldown_avoided(make_app):
+    metrics = {MODEL: {"a": metric(running=0), "b": metric(running=100), "c": metric(running=100)}}
+    health = {make_backend_key(MODEL, "a"): {"cooldown_until": time.time() + 100}}
+    res = rs._least_load(ctx(make_app(metrics_cache=metrics, backend_health=health)))
+    assert res["id"] != "a"
+
+
+# --------------------------------------------------------------------------- #
+# round_robin / random / p2c
+# --------------------------------------------------------------------------- #
+async def test_round_robin_cycles(make_app):
+    app = make_app()
+    picks = [rs._round_robin(ctx(app))["id"] for _ in range(6)]
+    assert picks == ["a", "b", "c", "a", "b", "c"]
+
+
+async def test_random_stays_in_candidate_set(make_app):
+    app = make_app()
+    for _ in range(20):
+        assert rs._random(ctx(app, candidates=INSTANCES[:2]))["id"] in {"a", "b"}
+
+
+async def test_p2c_never_worse_than_global_max(make_app):
+    # With strictly increasing scores a<b<c, p2c samples two of the three and
+    # keeps the lower-scored one. 'c' loses every pair it can appear in ({a,c}
+    # or {b,c}), so it must never be returned; only 'a' or 'b' can win.
+    metrics = {MODEL: {"a": metric(running=0), "b": metric(running=1), "c": metric(running=2)}}
+    app = make_app(metrics_cache=metrics)
+    for _ in range(50):
+        assert rs._p2c(ctx(app))["id"] in {"a", "b"}
+
+
+# --------------------------------------------------------------------------- #
+# affinity
+# --------------------------------------------------------------------------- #
+async def test_session_affinity_is_deterministic(make_app):
+    app = make_app(metrics_cache={MODEL: {i["id"]: metric() for i in INSTANCES}})
+    homes = {rs._session_affinity(ctx(app, session_key="sess-42"))["id"] for _ in range(10)}
+    assert len(homes) == 1  # same key -> same replica every time
+
+
+async def test_session_affinity_no_key_degrades_to_least_load(make_app):
+    metrics = {MODEL: {"a": metric(running=9), "b": metric(running=0), "c": metric(running=9)}}
+    app = make_app(metrics_cache=metrics)
+    assert rs._session_affinity(ctx(app, session_key=None))["id"] == "b"
+
+
+async def test_affinity_escapes_when_home_in_cooldown(make_app):
+    app = make_app(metrics_cache={MODEL: {i["id"]: metric() for i in INSTANCES}})
+    home = rs._session_affinity(ctx(app, session_key="sess-42"))["id"]
+    # Put the home replica in cooldown; affinity must escape to another instance.
+    app.state.backend_health = {make_backend_key(MODEL, home): {"cooldown_until": time.time() + 100}}
+    assert rs._session_affinity(ctx(app, session_key="sess-42"))["id"] != home
+
+
+async def test_affinity_escapes_when_home_overloaded(make_app):
+    app = make_app(metrics_cache={MODEL: {i["id"]: metric() for i in INSTANCES}})
+    home = rs._session_affinity(ctx(app, session_key="sess-42"))["id"]
+    # Make the home replica far more loaded than the margin allows -> spread away.
+    app.state.metrics_cache[MODEL][home] = metric(running=1000)
+    assert rs._session_affinity(ctx(app, session_key="sess-42"))["id"] != home
+
+
+async def test_prefix_affinity_deterministic_and_distinct_keys_can_differ(make_app):
+    app = make_app(metrics_cache={MODEL: {i["id"]: metric() for i in INSTANCES}})
+    a1 = rs._prefix_affinity(ctx(app, prompt_prefix="system: you are helpful"))["id"]
+    a2 = rs._prefix_affinity(ctx(app, prompt_prefix="system: you are helpful"))["id"]
+    assert a1 == a2  # stable for a given prefix
+
+
+async def test_score_instance_cold_start_is_idle(make_app):
+    # No cached metric for 'a' -> treated as idle (0), still selectable.
+    app = make_app(metrics_cache={MODEL: {"b": metric(running=5)}})
+    assert score_instance(app, MODEL, {"id": "a"}) == 0.0
diff --git a/deploy/nginx.conf b/deploy/nginx.conf
index 621ce6a..9a06b08 100644
--- a/deploy/nginx.conf
+++ b/deploy/nginx.conf
@@ -58,6 +58,7 @@ server {
     }
     location = /metrics { proxy_pass http://llm_router; }
     location = /reload  { proxy_pass http://llm_router; }
+    location = /routing { proxy_pass http://llm_router; }
 
     # Grafana, served under /grafana (Grafana runs with serve_from_sub_path, so
     # it expects the prefix intact — proxy_pass without a trailing path keeps it).
diff --git a/packages/config-schema/config.yaml b/packages/config-schema/config.yaml
index f33124a..4cd68b7 100644
--- a/packages/config-schema/config.yaml
+++ b/packages/config-schema/config.yaml
@@ -29,6 +29,12 @@ LLM_engines:
       max_model_len: 500
       gpu_memory_utilization: 0.35
       tensor_parallel_size: 1
+      # Load-balancing policy for this group: session_affinity keeps a chat
+      # session on one replica (better multi-turn KV-cache reuse), falling back to
+      # least_load when there is no session key or the home replica is overloaded.
+      # See docs/routing-strategies.md. (A no-op for single-instance groups below,
+      # which always use their one instance; set to record intent and for scaling.)
+      routing_strategy: "session_affinity"
       # Tool calling: Qwen3 dense uses the Hermes-style tool format. It's a
       # reasoning ("thinking") model, so also split <think> via the qwen3
       # reasoning parser (tool calls are parsed from content, not reasoning).
@@ -62,6 +68,7 @@ LLM_engines:
       max_model_len: 2048
       gpu_memory_utilization: 0.30
       tensor_parallel_size: 1
+      routing_strategy: "session_affinity"
       # Qwen2.5 tokenizer_config ships Hermes-style tool use — parser `hermes`.
       enable_auto_tool_choice: true
       tool_call_parser: "hermes"
@@ -83,6 +90,7 @@ LLM_engines:
       max_model_len: 2048
       gpu_memory_utilization: 0.45
       tensor_parallel_size: 1
+      routing_strategy: "session_affinity"
       # Qwen2.5 tokenizer_config ships Hermes-style tool use — parser `hermes`.
       enable_auto_tool_choice: true
       tool_call_parser: "hermes"
@@ -100,6 +108,7 @@ LLM_engines:
       max_model_len: 2048
       gpu_memory_utilization: 0.85
       tensor_parallel_size: 1
+      routing_strategy: "session_affinity"
       # Qwen2.5 tokenizer_config ships Hermes-style tool use — parser `hermes`.
       enable_auto_tool_choice: true
       tool_call_parser: "hermes"
@@ -117,6 +126,7 @@ LLM_engines:
       max_model_len: 2048
       gpu_memory_utilization: 0.50
       tensor_parallel_size: 1
+      routing_strategy: "session_affinity"
       # No tool calling: SmolLM2 has no matching vLLM tool-call parser. Adding a
       # mismatched parser wouldn't produce tool_calls (see docs/vllm_auto_tool_整理.md).
 
@@ -137,6 +147,7 @@ LLM_engines:
       max_model_len: 2048
       gpu_memory_utilization: 0.35
       tensor_parallel_size: 1
+      routing_strategy: "session_affinity"
       # No tool calling: TinyLlama has no matching vLLM tool-call parser.
 
   # --- Single-instance group ---
@@ -152,6 +163,7 @@ LLM_engines:
       max_model_len: 2048
       gpu_memory_utilization: 0.90
       tensor_parallel_size: 1
+      routing_strategy: "session_affinity"
       # No tool calling: vLLM has no Phi-3.5 tool parser (phi4_mini_json is Phi-4 only).
 
 embedding_server: