From b2828efa0ad143a93fd5f6b4699b04633d73c631 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Fri, 21 Nov 2025 18:38:44 +0545 Subject: [PATCH 1/2] Add sglang_router details in examples, gateway and refs --- docs/docs/concepts/gateways.md | 23 +++++++++++++++++++++ docs/docs/reference/dstack.yml/gateway.md | 10 +++++++++ examples/inference/sglang/README.md | 23 +++++++++++++++++++++ src/dstack/_internal/core/models/routers.py | 12 +++++++++-- 4 files changed, 66 insertions(+), 2 deletions(-) diff --git a/docs/docs/concepts/gateways.md b/docs/docs/concepts/gateways.md index 1435926810..05feffdbbe 100644 --- a/docs/docs/concepts/gateways.md +++ b/docs/docs/concepts/gateways.md @@ -65,6 +65,29 @@ Private gateways are currently supported in `aws` and `gcp` backends. !!! info "Reference" For all gateway configuration options, refer to the [reference](../reference/dstack.yml/gateway.md). +### Router + +You can use [SGLang Router](https://docs.sglang.ai/advanced_features/router.html#) to route requests using polices such as `cache_aware`, `power_of_two`, `round_robin`, and `random`. + +To enable `SGLang Router`, configure gateway as below: + +
+ +```yaml +type: gateway +name: sglang-gateway + +backend: aws +region: eu-west-1 + +domain: example.com +router: + type: sglang + policy: cache_aware +``` + +
+ ## Update DNS records Once the gateway is assigned a hostname, go to your domain's DNS settings diff --git a/docs/docs/reference/dstack.yml/gateway.md b/docs/docs/reference/dstack.yml/gateway.md index 4d81d5d508..96537971e2 100644 --- a/docs/docs/reference/dstack.yml/gateway.md +++ b/docs/docs/reference/dstack.yml/gateway.md @@ -27,3 +27,13 @@ The `gateway` configuration type allows creating and updating [gateways](../../c show_root_heading: false type: required: true + +### `router` + +=== "SGLang" + + #SCHEMA# dstack._internal.core.models.routers.SGLangRouterConfig + overrides: + show_root_heading: false + type: + required: true diff --git a/examples/inference/sglang/README.md b/examples/inference/sglang/README.md index f880ac30b7..b2b03ac457 100644 --- a/examples/inference/sglang/README.md +++ b/examples/inference/sglang/README.md @@ -121,6 +121,29 @@ curl http://127.0.0.1:3000/proxy/models/main/chat/completions \ When a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured, the OpenAI-compatible endpoint is available at `https://gateway./`. +## SGLang Router +You can use [SGLang Router](https://docs.sglang.ai/advanced_features/router.html#) with dstack gateway to route requests using routing polices such as `cache_aware`, `power_of_two`, `round_robin`, and `random`. + +Here is an example configuration to enable `SGLang Router` + +
+ +```yaml +type: gateway +name: sglang-gateway + +backend: aws +region: eu-west-1 + +domain: example.com +router: + type: sglang + policy: cache_aware +``` + +
+ + ## Source code The source-code of this example can be found in diff --git a/src/dstack/_internal/core/models/routers.py b/src/dstack/_internal/core/models/routers.py index ec779b1242..e07631e12e 100644 --- a/src/dstack/_internal/core/models/routers.py +++ b/src/dstack/_internal/core/models/routers.py @@ -1,6 +1,9 @@ from enum import Enum from typing import Literal +from pydantic import Field +from typing_extensions import Annotated + from dstack._internal.core.models.common import CoreModel @@ -9,8 +12,13 @@ class RouterType(str, Enum): class SGLangRouterConfig(CoreModel): - type: Literal["sglang"] = "sglang" - policy: Literal["random", "round_robin", "cache_aware", "power_of_two"] = "cache_aware" + type: Annotated[Literal["sglang"], Field(description="The router type")] = "sglang" + policy: Annotated[ + Literal["random", "round_robin", "cache_aware", "power_of_two"], + Field( + description="The routing policy. Options: `random`, `round_robin`, `cache_aware`, `power_of_two`" + ), + ] = "cache_aware" AnyRouterConfig = SGLangRouterConfig From 85f160b0f66a241d0d2447651bc302e0ed691242 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Fri, 21 Nov 2025 17:47:50 +0100 Subject: [PATCH 2/2] [Docs] Improve the `sglang` router configuration with gateways --- docs/docs/concepts/gateways.md | 44 +++++++++---- docs/docs/concepts/services.md | 5 +- docs/docs/reference/dstack.yml/gateway.md | 18 +++--- examples/inference/sglang/README.md | 75 ++++++----------------- 4 files changed, 63 insertions(+), 79 deletions(-) diff --git a/docs/docs/concepts/gateways.md b/docs/docs/concepts/gateways.md index 05feffdbbe..03ddd10e5c 100644 --- a/docs/docs/concepts/gateways.md +++ b/docs/docs/concepts/gateways.md @@ -1,10 +1,9 @@ # Gateways -Gateways manage the ingress traffic of running [services](services.md), -provide an HTTPS endpoint mapped to your domain, handle auto-scaling and rate limits. 
+Gateways manage ingress traffic for running [services](services.md), handle auto-scaling and rate limits, enable HTTPS, and allow you to configure a custom domain. They also support custom routers, such as the [SGLang Model Gateway :material-arrow-top-right-thin:{ .external }](https://docs.sglang.ai/advanced_features/router.html#){:target="_blank"}. -> If you're using [dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"}, -> the gateway is already set up for you. + ## Apply a configuration @@ -57,19 +56,15 @@ You can create gateways with the `aws`, `azure`, `gcp`, or `kubernetes` backends Gateways in `kubernetes` backend require an external load balancer. Managed Kubernetes solutions usually include a load balancer. For self-hosted Kubernetes, you must provide a load balancer by yourself. -### Public IP +### Router -If you don't need/want a public IP for the gateway, you can set the `public_ip` to `false` (the default value is `true`), making the gateway private. -Private gateways are currently supported in `aws` and `gcp` backends. +By default, the gateway uses its own load balancer to route traffic between replicas. However, you can delegate this responsibility to a specific router by setting the `router` property. Currently, the only supported external router is `sglang`. -!!! info "Reference" - For all gateway configuration options, refer to the [reference](../reference/dstack.yml/gateway.md). +#### SGLang -### Router - -You can use [SGLang Router](https://docs.sglang.ai/advanced_features/router.html#) to route requests using polices such as `cache_aware`, `power_of_two`, `round_robin`, and `random`. +The `sglang` router delegates routing logic to the [SGLang Model Gateway :material-arrow-top-right-thin:{ .external }](https://docs.sglang.ai/advanced_features/router.html#){:target="_blank"}. -To enable `SGLang Router`, configure gateway as below: +To enable it, set the `type` field under `router` to `sglang`: 
@@ -81,6 +76,7 @@ backend: aws region: eu-west-1 domain: example.com + router: type: sglang policy: cache_aware @@ -88,6 +84,28 @@ router:
+!!! info "Policy" + + The `router` property allows you to configure the routing `policy`: + + * `cache_aware` — Default policy; combines cache locality with load balancing, falling back to shortest queue. + * `power_of_two` — Samples two workers and picks the lighter one. + * `random` — Uniform random selection. + * `round_robin` — Cycles through workers in order. + + +> Currently, services using this type of gateway must run standard SGLang workers. See the [example](../../examples/inference/sglang/index.md). +> +> Support for prefill/decode disaggregation and auto-scaling based on inter-token latency is coming soon. + +### Public IP + +If you don't need/want a public IP for the gateway, you can set the `public_ip` to `false` (the default value is `true`), making the gateway private. +Private gateways are currently supported in `aws` and `gcp` backends. + +!!! info "Reference" + For all gateway configuration options, refer to the [reference](../reference/dstack.yml/gateway.md). + ## Update DNS records Once the gateway is assigned a hostname, go to your domain's DNS settings diff --git a/docs/docs/concepts/services.md b/docs/docs/concepts/services.md index 6404c2bd1a..09ff1fba8f 100644 --- a/docs/docs/concepts/services.md +++ b/docs/docs/concepts/services.md @@ -100,12 +100,13 @@ If [authorization](#authorization) is not disabled, the service endpoint require However, you'll need a gateway in the following cases: * To use auto-scaling or rate limits + * To use a custom router,
such as the [SGLang Model Gateway :material-arrow-top-right-thin:{ .external }](https://docs.sglang.ai/advanced_features/router.html#){:target="_blank"} * To enable HTTPS for the endpoint and map it to your domain * If your service requires WebSockets * If your service cannot work with a [path prefix](#path-prefix) - Note, if you're using [dstack Sky :material-arrow-top-right-thin:{ .external }](https://sky.dstack.ai){:target="_blank"}, - a gateway is already pre-configured for you. + If a [gateway](gateways.md) is configured, the service endpoint will be accessible at `https://./`. diff --git a/docs/docs/reference/dstack.yml/gateway.md b/docs/docs/reference/dstack.yml/gateway.md index 96537971e2..b8e2742891 100644 --- a/docs/docs/reference/dstack.yml/gateway.md +++ b/docs/docs/reference/dstack.yml/gateway.md @@ -10,29 +10,29 @@ The `gateway` configuration type allows creating and updating [gateways](../../c type: required: true -### `certificate` +### `router` -=== "Let's encrypt" +=== "SGLang Model Gateway" - #SCHEMA# dstack._internal.core.models.gateways.LetsEncryptGatewayCertificate + #SCHEMA# dstack._internal.core.models.routers.SGLangRouterConfig overrides: show_root_heading: false type: required: true -=== "ACM" +### `certificate` - #SCHEMA# dstack._internal.core.models.gateways.ACMGatewayCertificate +=== "Let's encrypt" + + #SCHEMA# dstack._internal.core.models.gateways.LetsEncryptGatewayCertificate overrides: show_root_heading: false type: required: true -### `router` - -=== "SGLang" +=== "ACM" - #SCHEMA# dstack._internal.core.models.routers.SGLangRouterConfig + #SCHEMA# dstack._internal.core.models.gateways.ACMGatewayCertificate overrides: show_root_heading: false type: diff --git a/examples/inference/sglang/README.md b/examples/inference/sglang/README.md index b2b03ac457..1652b838c8 100644 --- a/examples/inference/sglang/README.md +++ b/examples/inference/sglang/README.md @@ -2,32 +2,21 @@ This example shows how to deploy DeepSeek-R1-Distill-Llama 8B and 
70B using [SGLang :material-arrow-top-right-thin:{ .external }](https://github.com/sgl-project/sglang){:target="_blank"} and `dstack`. -??? info "Prerequisites" - Once `dstack` is [installed](https://dstack.ai/docs/installation), clone the repo with examples. - -
- - ```shell - $ git clone https://github.com/dstackai/dstack - $ cd dstack - ``` - -
+## Apply a configuration -## Deployment Here's an example of a service that deploys DeepSeek-R1-Distill-Llama 8B and 70B using SgLang. -=== "AMD" +=== "NVIDIA" -
+
```yaml type: service - name: deepseek-r1-amd + name: deepseek-r1-nvidia - image: lmsysorg/sglang:v0.4.1.post4-rocm620 + image: lmsysorg/sglang:latest env: - - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-8B commands: - python3 -m sglang.launch_server @@ -36,25 +25,24 @@ Here's an example of a service that deploys DeepSeek-R1-Distill-Llama 8B and 70B --trust-remote-code port: 8000 - model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B + model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B resources: - gpu: MI300x - disk: 300GB + gpu: 24GB ```
-=== "NVIDIA" +=== "AMD" -
+
```yaml type: service - name: deepseek-r1-nvidia + name: deepseek-r1-amd - image: lmsysorg/sglang:latest + image: lmsysorg/sglang:v0.4.1.post4-rocm620 env: - - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-8B + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-70B commands: - python3 -m sglang.launch_server @@ -63,16 +51,14 @@ Here's an example of a service that deploys DeepSeek-R1-Distill-Llama 8B and 70B --trust-remote-code port: 8000 - model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B + model: deepseek-ai/DeepSeek-R1-Distill-Llama-70B resources: - gpu: 24GB + gpu: MI300x + disk: 300GB ```
- -### Applying the configuration - To run a configuration, use the [`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md) command.
@@ -118,31 +104,10 @@ curl http://127.0.0.1:3000/proxy/models/main/chat/completions \ ```
-When a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured, the OpenAI-compatible endpoint -is available at `https://gateway./`. - -## SGLang Router -You can use [SGLang Router](https://docs.sglang.ai/advanced_features/router.html#) with dstack gateway to route requests using routing polices such as `cache_aware`, `power_of_two`, `round_robin`, and `random`. - -Here is an example configuration to enable `SGLang Router` - -
- -```yaml -type: gateway -name: sglang-gateway - -backend: aws -region: eu-west-1 - -domain: example.com -router: - type: sglang - policy: cache_aware -``` - -
+!!! info "SGLang Model Gateway" + If you'd like to use a custom routing policy, e.g. by leveraging the [SGLang Model Gateway :material-arrow-top-right-thin:{ .external }](https://docs.sglang.ai/advanced_features/router.html#){:target="_blank"}, create a gateway with `router` set to `sglang`. Check out [gateways](https://dstack.ai/docs/concepts/gateways#router) for more details. +> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the OpenAI-compatible endpoint is available at `https://gateway.<gateway domain>/`. ## Source code @@ -151,5 +116,5 @@ The source-code of this example can be found in ## What's next? -1. Check [services](https://dstack.ai/docs/services) +1. Read about [services](https://dstack.ai/docs/concepts/services) and [gateways](https://dstack.ai/docs/concepts/gateways) 2. Browse the [SgLang DeepSeek Usage](https://docs.sglang.ai/references/deepseek.html), [Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html)