Add pd.dstack.yml file

Bihan  Rana · Bihan  Rana · commit f5a32c349c2c · 2026-02-19T18:12:03.000+05:45
diff --git a/examples/inference/sglang/README.md b/examples/inference/sglang/README.md
@@ -156,9 +156,7 @@ replicas:
           --disaggregation-transfer-backend mooncake \
           --host 0.0.0.0 \
           --port 8000 \
-          --disaggregation-bootstrap-port 8998 \
-          --log-level debug \
-          > worker-server.log 2>&1
+          --disaggregation-bootstrap-port 8998
     resources:
       gpu: H200
 
@@ -173,9 +171,7 @@ replicas:
           --disaggregation-mode decode \
           --disaggregation-transfer-backend mooncake \
           --host 0.0.0.0 \
-          --port 8000 \
-          --log-level debug \
-          > worker-server.log 2>&1
+          --port 8000
     resources:
       gpu: H200
 
@@ -195,8 +191,8 @@ router:
 
 ## Source code
 
-The source-code of this example can be found in
-[`examples/llms/deepseek/sglang`](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/sglang).
+The source code of these examples can be found in
+[`examples/llms/deepseek/sglang`](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/sglang) and [`examples/inference/sglang`](https://github.com/dstackai/dstack/blob/master/examples/inference/sglang).
 
 ## What's next?
 
diff --git a/examples/inference/sglang/pd.dstack.yml b/examples/inference/sglang/pd.dstack.yml
@@ -0,0 +1,52 @@
+type: service  # SGLang served as two replica groups: prefill workers and a decode worker
+name: prefill-decode-test
+https: false
+image: lmsysorg/sglang:latest  # NOTE(review): floating "latest" tag — consider pinning a version
+
+env:
+  - HF_TOKEN  # listed without a value — presumably taken from the caller's environment; confirm
+  - MODEL_ID=zai-org/GLM-4.5-Air-FP8  # passed to --model-path in both replica groups
+
+replicas:
+  - count: 1..2  # first group (prefill mode, see command below); count given as a 1..2 range
+    scaling:
+      metric: rps  # presumably requests per second — confirm against the dstack reference
+      target: 3
+    commands:
+      - echo "Group Prefill" > /tmp/version.txt  # writes a marker file naming this group
+      - |
+          python -m sglang.launch_server \
+            --model-path $MODEL_ID \
+            --disaggregation-mode prefill \
+            --disaggregation-transfer-backend mooncake \
+            --host 0.0.0.0 \
+            --port 8000 \
+            --disaggregation-bootstrap-port 8998
+    resources:
+      gpu: 1  # NOTE(review): the README example for these flags uses gpu: H200 — confirm
+
+  - count: 1  # second group (decode mode, see command below); fixed count, no scaling block
+    commands:
+      - echo "Group Decode" > /tmp/version.txt  # writes a marker file naming this group
+      - |
+          python -m sglang.launch_server \
+            --model-path $MODEL_ID \
+            --disaggregation-mode decode \
+            --disaggregation-transfer-backend mooncake \
+            --host 0.0.0.0 \
+            --port 8000
+    resources:
+      gpu: 1  # NOTE(review): the README example for these flags uses gpu: H200 — confirm
+
+port: 8000  # same port both launch_server commands bind via --port
+model: zai-org/GLM-4.5-Air-FP8  # same identifier as MODEL_ID in env; keep the two in sync
+
+probes:
+  - type: http
+    url: /health_generate  # probed over HTTP at the interval below
+    interval: 15s
+
+router:
+  type: sglang
+  policy: round_robin
+  pd_disaggregation: true  # corresponds to the prefill/decode split of the replica groups