From b2828efa0ad143a93fd5f6b4699b04633d73c631 Mon Sep 17 00:00:00 2001
From: Bihan  Rana <bihan@Bihans-MacBook-Pro.local>
Date: Fri, 21 Nov 2025 18:38:44 +0545
Subject: [PATCH 1/2] Add sglang_router details in examples, gateway and refs

---
 docs/docs/concepts/gateways.md              | 23 +++++++++++++++++++++
 docs/docs/reference/dstack.yml/gateway.md   | 10 +++++++++
 examples/inference/sglang/README.md         | 23 +++++++++++++++++++++
 src/dstack/_internal/core/models/routers.py | 12 +++++++++--
 4 files changed, 66 insertions(+), 2 deletions(-)
diff --git a/docs/docs/concepts/gateways.md b/docs/docs/concepts/gateways.md
index 1435926810..05feffdbbe 100644
--- a/docs/docs/concepts/gateways.md
+++ b/docs/docs/concepts/gateways.md
@@ -65,6 +65,29 @@ Private gateways are currently supported in `aws` and `gcp` backends.
 !!! info "Reference"
     For all gateway configuration options, refer to the [reference](../reference/dstack.yml/gateway.md).
 
+### Router
+
+You can use [SGLang Router](https://docs.sglang.ai/advanced_features/router.html#) to route requests using polices such as `cache_aware`, `power_of_two`, `round_robin`, and `random`.
+
+To enable `SGLang Router`, configure gateway as below:
+
+<div editor-title="gateway.dstack.yml">
+
+```yaml
+type: gateway
+name: sglang-gateway
+
+backend: aws
+region: eu-west-1
+
+domain: example.com
+router:
+  type: sglang
+  policy: cache_aware
+```
+
+</div>
+
 ## Update DNS records
 
 Once the gateway is assigned a hostname, go to your domain's DNS settings
diff --git a/docs/docs/reference/dstack.yml/gateway.md b/docs/docs/reference/dstack.yml/gateway.md
index 4d81d5d508..96537971e2 100644
--- a/docs/docs/reference/dstack.yml/gateway.md
+++ b/docs/docs/reference/dstack.yml/gateway.md
@@ -27,3 +27,13 @@ The `gateway` configuration type allows creating and updating [gateways](../../c
           show_root_heading: false
           type:
             required: true
+
+### `router`
+
+=== "SGLang"
+
+    #SCHEMA# dstack._internal.core.models.routers.SGLangRouterConfig
+        overrides:
+          show_root_heading: false
+          type:
+            required: true
diff --git a/examples/inference/sglang/README.md b/examples/inference/sglang/README.md
index f880ac30b7..b2b03ac457 100644
--- a/examples/inference/sglang/README.md
+++ b/examples/inference/sglang/README.md
@@ -121,6 +121,29 @@ curl http://127.0.0.1:3000/proxy/models/main/chat/completions \
 When a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured, the OpenAI-compatible endpoint
 is available at `https://gateway.<gateway domain>/`.
 
+## SGLang Router
+You can use [SGLang Router](https://docs.sglang.ai/advanced_features/router.html#) with dstack gateway to route requests using routing polices such as `cache_aware`, `power_of_two`, `round_robin`, and `random`.
+
+Here is an example configuration to enable `SGLang Router`
+
+<div editor-title="gateway.dstack.yml">
+
+```yaml
+type: gateway
+name: sglang-gateway
+
+backend: aws
+region: eu-west-1
+
+domain: example.com
+router:
+  type: sglang
+  policy: cache_aware
+```
+
+</div>
+
+
 ## Source code
 
 The source-code of this example can be found in
diff --git a/src/dstack/_internal/core/models/routers.py b/src/dstack/_internal/core/models/routers.py
index ec779b1242..e07631e12e 100644
--- a/src/dstack/_internal/core/models/routers.py
+++ b/src/dstack/_internal/core/models/routers.py
@@ -1,6 +1,9 @@
 from enum import Enum
 from typing import Literal
 
+from pydantic import Field
+from typing_extensions import Annotated
+
 from dstack._internal.core.models.common import CoreModel
 
 
@@ -9,8 +12,13 @@ class RouterType(str, Enum):
 
 
 class SGLangRouterConfig(CoreModel):
-    type: Literal["sglang"] = "sglang"
-    policy: Literal["random", "round_robin", "cache_aware", "power_of_two"] = "cache_aware"
+    type: Annotated[Literal["sglang"], Field(description="The router type")] = "sglang"
+    policy: Annotated[
+        Literal["random", "round_robin", "cache_aware", "power_of_two"],
+        Field(
+            description="The routing policy. Options: `random`, `round_robin`, `cache_aware`, `power_of_two`"
+        ),
+    ] = "cache_aware"
 
 
 AnyRouterConfig = SGLangRouterConfig

From 85f160b0f66a241d0d2447651bc302e0ed691242 Mon Sep 17 00:00:00 2001
From: peterschmidt85 <andrey.cheptsov@gmail.com>
Date: Fri, 21 Nov 2025 17:47:50 +0100
Subject: [PATCH 2/2] [Docs] Improve the `sglang` router configuration with
 gateways

---
 docs/docs/concepts/gateways.md            | 44 +++++++++----
 docs/docs/concepts/services.md            |  5 +-
 docs/docs/reference/dstack.yml/gateway.md | 18 +++---
 examples/inference/sglang/README.md       | 75 ++++++-----------------
 4 files changed, 63 insertions(+), 79 deletions(-)

diff --git a/docs/docs/concepts/gateways.md b/docs/docs/concepts/gateways.md
index 05feffdbbe..03ddd10e5c 100644
--- a/docs/docs/concepts/gateways.md
+++ b/docs/docs/concepts/gateways.md
@@ -1,10 +1,9 @@
 # Gateways
 
-Gateways manage the ingress traffic of running [services](services.md),
-provide an HTTPS endpoint mapped to your domain, handle auto-scaling and rate limits.
+Gateways manage ingress traffic for running [services](services.md), handle auto-scaling and rate limits, enable HTTPS, and allow you to configure a custom domain. They also support custom routers, such as the [SGLang Model Gateway :material-arrow-top-right-thin:{ .external }](https://docs.sglang.ai/advanced_features/router.html#){:target="_blank"}.
 
-> If you're using [dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"},
-> the gateway is already set up for you.
+<!-- > If you're using [dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"},
+> the gateway is already set up for you. -->
 
 ## Apply a configuration
 
@@ -57,19 +56,15 @@ You can create gateways with the `aws`, `azure`, `gcp`, or `kubernetes` backends
     Gateways in `kubernetes` backend require an external load balancer. Managed Kubernetes solutions usually include a load balancer.
     For self-hosted Kubernetes, you must provide a load balancer by yourself.
 
-### Public IP
+### Router
 
-If you don't need/want a public IP for the gateway, you can set the `public_ip` to `false` (the default value is `true`), making the gateway private.
-Private gateways are currently supported in `aws` and `gcp` backends.
+By default, the gateway uses its own load balancer to route traffic between replicas. However, you can delegate this responsibility to a specific router by setting the `router` property. Currently, the only supported external router is `sglang`.
 
-!!! info "Reference"
-    For all gateway configuration options, refer to the [reference](../reference/dstack.yml/gateway.md).
+#### SGLang
 
-### Router
-
-You can use [SGLang Router](https://docs.sglang.ai/advanced_features/router.html#) to route requests using polices such as `cache_aware`, `power_of_two`, `round_robin`, and `random`.
+The `sglang` router delegates routing logic to the [SGLang Model Gateway :material-arrow-top-right-thin:{ .external }](https://docs.sglang.ai/advanced_features/router.html#){:target="_blank"}.
 
-To enable `SGLang Router`, configure gateway as below:
+To enable it, set the `type` field under `router` to `sglang`:
 
 <div editor-title="gateway.dstack.yml">
 
@@ -81,6 +76,7 @@ backend: aws
 region: eu-west-1
 
 domain: example.com
+
 router:
   type: sglang
   policy: cache_aware
@@ -88,6 +84,28 @@ router:
 
 </div>
 
+!!! info "Policy"
+
+    The `router` property allows you to configure the routing `policy`:
+
+    * `cache_aware` &mdash; Default policy; combines cache locality with load balancing, falling back to shortest queue.
+    * `power_of_two` &mdash; Samples two workers and picks the lighter one.
+    * `random` &mdash; Uniform random selection.
+    * `round_robin` &mdash; Cycles through workers in order.
+
+
+> Currently, services using this type of gateway must run standard SGLang workers. See the [example](../../examples/inference/sglang/index.md).
+>
+> Support for prefill/decode disaggregation and auto-scaling based on inter-token latency is coming soon.
+
+### Public IP
+
+If you don't need/want a public IP for the gateway, you can set the `public_ip` to `false` (the default value is `true`), making the gateway private.
+Private gateways are currently supported in `aws` and `gcp` backends.
+
+!!! info "Reference"
+    For all gateway configuration options, refer to the [reference](../reference/dstack.yml/gateway.md).
+
 ## Update DNS records
 
 Once the gateway is assigned a hostname, go to your domain's DNS settings
diff --git a/docs/docs/concepts/services.md b/docs/docs/concepts/services.md
index 6404c2bd1a..09ff1fba8f 100644
--- a/docs/docs/concepts/services.md
+++ b/docs/docs/concepts/services.md
@@ -100,12 +100,13 @@ If [authorization](#authorization) is not disabled, the service endpoint require
     However, you'll need a gateway in the following cases:
 
     * To use auto-scaling or rate limits
+    * To use a custom router, such as the [SGLang Model Gateway :material-arrow-top-right-thin:{ .external }](https://docs.sglang.ai/advanced_features/router.html#){:target="_blank"}
     * To enable HTTPS for the endpoint and map it to your domain
     * If your service requires WebSockets
     * If your service cannot work with a [path prefix](#path-prefix)
 
-    Note, if you're using [dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"},
-    a gateway is already pre-configured for you.
+    <!-- Note, if you're using [dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"},
+    a gateway is already pre-configured for you. -->
 
     If a [gateway](gateways.md) is configured, the service endpoint will be accessible at
     `https://<run name>.<gateway domain>/`.
diff --git a/docs/docs/reference/dstack.yml/gateway.md b/docs/docs/reference/dstack.yml/gateway.md
index 96537971e2..b8e2742891 100644
--- a/docs/docs/reference/dstack.yml/gateway.md
+++ b/docs/docs/reference/dstack.yml/gateway.md
@@ -10,29 +10,29 @@ The `gateway` configuration type allows creating and updating [gateways](../../c
       type:
         required: true
 
-### `certificate`
+### `router`
 
-=== "Let's encrypt"
+=== "SGLang Model Gateway"
 
-    #SCHEMA# dstack._internal.core.models.gateways.LetsEncryptGatewayCertificate
+    #SCHEMA# dstack._internal.core.models.routers.SGLangRouterConfig
         overrides:
           show_root_heading: false
           type:
             required: true
 
-=== "ACM" 
+### `certificate`
 
-    #SCHEMA# dstack._internal.core.models.gateways.ACMGatewayCertificate
+=== "Let's encrypt"
+
+    #SCHEMA# dstack._internal.core.models.gateways.LetsEncryptGatewayCertificate
         overrides:
           show_root_heading: false
           type:
             required: true
 
-### `router`
-
-=== "SGLang"
+=== "ACM" 
 
-    #SCHEMA# dstack._internal.core.models.routers.SGLangRouterConfig
+    #SCHEMA# dstack._internal.core.models.gateways.ACMGatewayCertificate
         overrides:
           show_root_heading: false
           type:
diff --git a/examples/inference/sglang/README.md b/examples/inference/sglang/README.md
index b2b03ac457..1652b838c8 100644
--- a/examples/inference/sglang/README.md
+++ b/examples/inference/sglang/README.md
@@ -2,32 +2,21 @@
 
 This example shows how to deploy DeepSeek-R1-Distill-Llama 8B and 70B using [SGLang :material-arrow-top-right-thin:{ .external }](https://github.com/sgl-project/sglang){:target="_blank"} and `dstack`.
 
-??? info "Prerequisites"
-    Once `dstack` is [installed](https://dstack.ai/docs/installation), clone the repo with examples.
-
-    <div class="termy">
- 
-    ```shell
-    $ git clone https://github.com/dstackai/dstack
-    $ cd dstack
-    ```
- 
-    </div>
+## Apply a configuration
 
-## Deployment
 Here's an example of a service that deploys DeepSeek-R1-Distill-Llama 8B and 70B using SgLang.
 
-=== "AMD"
+=== "NVIDIA"
 
-    <div editor-title="examples/inference/sglang/amd/.dstack.yml">
+    <div editor-title="examples/inference/sglang/nvidia/.dstack.yml">
 
     ```yaml
     type: service
-    name: deepseek-r1-amd
+    name: deepseek-r1-nvidia
 
-    image: lmsysorg/sglang:v0.4.1.post4-rocm620
+    image: lmsysorg/sglang:latest
     env:
-      - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+      - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-8B
 
     commands:
       - python3 -m sglang.launch_server
@@ -36,25 +25,24 @@ Here's an example of a service that deploys DeepSeek-R1-Distill-Llama 8B and 70B
          --trust-remote-code
 
     port: 8000
-    model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B
+    model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
 
     resources:
-      gpu: MI300x
-      disk: 300GB
+       gpu: 24GB
     ```
     </div>
 
-=== "NVIDIA"
+=== "AMD"
 
-    <div editor-title="examples/inference/sglang/nvidia/.dstack.yml">
+    <div editor-title="examples/inference/sglang/amd/.dstack.yml">
 
     ```yaml
     type: service
-    name: deepseek-r1-nvidia
+    name: deepseek-r1-amd
 
-    image: lmsysorg/sglang:latest
+    image: lmsysorg/sglang:v0.4.1.post4-rocm620
     env:
-      - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+      - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B
 
     commands:
       - python3 -m sglang.launch_server
@@ -63,16 +51,14 @@ Here's an example of a service that deploys DeepSeek-R1-Distill-Llama 8B and 70B
          --trust-remote-code
 
     port: 8000
-    model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+    model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B
 
     resources:
-       gpu: 24GB
+      gpu: MI300x
+      disk: 300GB
     ```
     </div>
 
-
-### Applying the configuration
-
 To run a configuration, use the [`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md) command.
 
 <div class="termy">
@@ -118,31 +104,10 @@ curl http://127.0.0.1:3000/proxy/models/main/chat/completions \
 ```
 </div>
 
-When a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured, the OpenAI-compatible endpoint
-is available at `https://gateway.<gateway domain>/`.
-
-## SGLang Router
-You can use [SGLang Router](https://docs.sglang.ai/advanced_features/router.html#) with dstack gateway to route requests using routing polices such as `cache_aware`, `power_of_two`, `round_robin`, and `random`.
-
-Here is an example configuration to enable `SGLang Router`
-
-<div editor-title="gateway.dstack.yml">
-
-```yaml
-type: gateway
-name: sglang-gateway
-
-backend: aws
-region: eu-west-1
-
-domain: example.com
-router:
-  type: sglang
-  policy: cache_aware
-```
-
-</div>
+!!! info "SGLang Model Gateway"
+    If you'd like to use a custom routing policy, e.g. by leveraging the [SGLang Model Gateway :material-arrow-top-right-thin:{ .external }](https://docs.sglang.ai/advanced_features/router.html#){:target="_blank"}, create a gateway with the `router` type set to `sglang`. Check out [gateways](https://dstack.ai/docs/concepts/gateways#router) for more details.
 
+> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the OpenAI-compatible endpoint is available at `https://gateway.<gateway domain>/`.
 
 ## Source code
 
@@ -151,5 +116,5 @@ The source-code of this example can be found in
 
 ## What's next?
 
-1. Check [services](https://dstack.ai/docs/services)
+1. Read about [services](https://dstack.ai/docs/concepts/services) and [gateways](https://dstack.ai/docs/concepts/gateways)
 2. Browse the [SgLang DeepSeek Usage](https://docs.sglang.ai/references/deepseek.html), [Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html)