From 4d56166a8364aa5bf1eef70d2aa1419971110a60 Mon Sep 17 00:00:00 2001
From: Yijia J <yijiaj@google.com>
Date: Thu, 25 Dec 2025 00:26:14 +0000
Subject: [PATCH 01/12] initial commit for a4x dynamo deepseek-fp8 2p2d recipe

---
 .../disaggregated-serving/dynamo/README.md    |  28 ++
 .../disaggregated-serving/dynamo/values.yaml  | 197 ++++++++
 .../deepseekr1-fp8-multi-node-decode.yaml     |  46 ++
 .../deepseekr1-fp8-multi-node-prefill.yaml    |  46 ++
 .../dynamo-deployment/Chart.yaml              |  20 +
 .../templates/dynamo-compute-domain.yaml      |  24 +
 .../templates/dynamo-graph-deployment.yaml    | 470 ++++++++++++++++++
 .../templates/dynamo-launcher-configmap.yaml  |  28 ++
 .../templates/dynamo-worker-configmap.yaml    |  35 ++
 9 files changed, 894 insertions(+)
 create mode 100644 inference/a4x/disaggregated-serving/dynamo/README.md
 create mode 100644 inference/a4x/disaggregated-serving/dynamo/values.yaml
 create mode 100644 src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml
 create mode 100644 src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml
 create mode 100644 src/helm-charts/a4x/inference-templates/dynamo-deployment/Chart.yaml
 create mode 100644 src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-compute-domain.yaml
 create mode 100644 src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml
 create mode 100644 src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-launcher-configmap.yaml
 create mode 100644 src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-worker-configmap.yaml

diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md
new file mode 100644
index 00000000..ec499d10
--- /dev/null
+++ b/inference/a4x/disaggregated-serving/dynamo/README.md
@@ -0,0 +1,28 @@
+# Disaggregated Multi-Node Dynamo Recipe for A4x
+
+This recipe runs a disaggregated (separate prefill and decode workers), multi-node NVIDIA Dynamo deployment on A4x.
+
+## Setup
+
+1.  **Set Environment Variables**
+
+    ```bash
+    export REPO_ROOT=$(git rev-parse --show-toplevel)
+    export RELEASE_VERSION="24.05"
+    export USER=$(whoami)
+    ```
+
+2.  **Run the Recipe**
+
+    ```bash
+    helm install -f values.yaml \
+      --set-file workload_launcher=$REPO_ROOT/src/launchers/dynamo-vllm-launcher.sh \
+      --set-file serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/llama-3.3-70b-multi-node.yaml \
+      --set workload.framework=vllm \
+      --set workload.model.name=meta-llama/Llama-3.3-70B-Instruct \
+      --set workload.image=nvcr.io/nvidia/ai-dynamo/vllm-runtime:${RELEASE_VERSION} \
+      --set workload.gpus=16 \
+      $USER-dynamo-multi-node-serving-a4x \
+      $REPO_ROOT/src/helm-charts/a4x/inference-templates/dynamo-deployment
+    ```
+
diff --git a/inference/a4x/disaggregated-serving/dynamo/values.yaml b/inference/a4x/disaggregated-serving/dynamo/values.yaml
new file mode 100644
index 00000000..b49162bc
--- /dev/null
+++ b/inference/a4x/disaggregated-serving/dynamo/values.yaml
@@ -0,0 +1,197 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+dynamo:
+  namespace: dynamo-cloud  # Kubernetes namespace of every rendered object; also passed as dynamoNamespace
+  releaseVersion: "0.7.0"  # NOTE(review): not read by the templates visible in this patch; images below are tagged 0.6.1 — confirm
+  deploymentName: ""  # metadata.name of the DynamoGraphDeployment; set at install time (--set dynamo.deploymentName=<name>)
+  computeDomain:
+    name: yijiaj-a4x-domain  # ComputeDomain metadata.name
+    numNodes: 4  # ComputeDomain spec.numNodes; presumably decodeWorker.nodeCount + prefillWorker.nodeCount — confirm
+    resourceClaimTemplateName: yijiaj-a4x-channel  # claimed by the Decode and Prefill pods (extraPodSpec.resourceClaims)
+  frontend:
+    image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1  # mainContainer image of the Frontend service
+    replicas: 1  # Frontend service replicas
+    livenessProbe:  # NOTE(review): the Frontend service in dynamo-graph-deployment.yaml reads neither frontend probe — confirm whether they should be wired in
+      initialDelaySeconds: 3000
+      periodSeconds: 60
+      timeoutSeconds: 150
+      failureThreshold: 100
+    readinessProbe:
+      initialDelaySeconds: 3000
+      periodSeconds: 60
+      timeoutSeconds: 300
+      failureThreshold: 100
+  decodeWorker:
+    image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1
+    nodeCount: 2  # multinode.nodeCount of the Decode service
+    replicas: 1  # Decode service replicas
+    envs:  # copied verbatim (toYaml) into the Decode service's envs list; keep every value a string
+      - name: LD_LIBRARY_PATH
+        value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64"
+      - name: GLOO_SOCKET_IFNAME
+        value: eth0
+      - name: TP_SOCKET_IFNAME
+        value: eth0
+      - name: SGLANG_ENABLE_JIT_DEEPGEMM
+        value: "1"
+      - name: DYN_SKIP_SGLANG_LOG_FORMATTING
+        value: "1"
+      - name: MC_TE_METRIC
+        value: "true"
+      - name: SGLANG_ENABLE_FLASHINFER_GEMM
+        value: "1"
+      - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE
+        value: "100000"
+      - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT
+        value: "100000"
+      - name: SGLANG_DISAGGREGATION_WAITING_TIMEOUT
+        value: "100000"
+      - name: SGLANG_DECODE_BOOTSTRAP_TIMEOUT
+        value: "1000"
+      - name: SGLANG_HACK_SEQ_BOOTSTRAP_ROOM
+        value: "1"
+      - name: SGLANG_MOONCAKE_CUSTOM_MEM_POOL
+        value: "True"
+      - name: MC_FORCE_MNNVL
+        value: "1"
+      - name: NCCL_MNNVL_ENABLE
+        value: "1"
+      - name: NCCL_CUMEM_ENABLE
+        value: "1"
+      - name: SGLANG_USE_MESSAGE_QUEUE_BROADCASTER
+        value: "0"
+      - name: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK
+        value: "1"
+      - name: PYTHONUNBUFFERED
+        value: "1"
+      - name: NCCL_DEBUG
+        value: INFO
+      - name: NCCL_DEBUG_SUBSYS
+        value: INIT,BOOTSTRAP,ENV,NET,GRAPH
+      - name: NCCL_SOCKET_FAMILY
+        value: "AF_INET"
+      - name: GLOO_SOCKET_FAMILY
+        value: "AF_INET"
+    livenessProbe:  # timing for the Decode service livenessProbe (GET /live)
+      initialDelaySeconds: 3000
+      periodSeconds: 60
+      timeoutSeconds: 150
+      failureThreshold: 100
+    readinessProbe:  # timing for the Decode service readinessProbe (GET /health)
+      initialDelaySeconds: 3000
+      periodSeconds: 60
+      timeoutSeconds: 300
+      failureThreshold: 100
+    startupProbe:  # timing for the Decode mainContainer startupProbe (GET /live)
+      initialDelaySeconds: 3000
+      periodSeconds: 60
+      timeoutSeconds: 600
+      failureThreshold: 3000
+  prefillWorker:
+    image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1
+    nodeCount: 2  # multinode.nodeCount of the Prefill service
+    replicas: 1  # Prefill service replicas
+    envs:  # copied verbatim (toYaml) into the Prefill service's envs list; keep every value a string
+      - name: LD_LIBRARY_PATH
+        value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64"
+      - name: UCX_TLS
+        value: "^tcp"
+      - name: GLOO_SOCKET_IFNAME
+        value: eth0
+      - name: TP_SOCKET_IFNAME
+        value: eth0
+      - name: SGLANG_ENABLE_JIT_DEEPGEMM
+        value: "1"
+      - name: DYN_SKIP_SGLANG_LOG_FORMATTING
+        value: "1"
+      - name: MC_TE_METRIC
+        value: "true"
+      - name: SGLANG_ENABLE_FLASHINFER_GEMM
+        value: "1"
+      - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE
+        value: "100000"
+      - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT
+        value: "100000"
+      - name: SGLANG_DISAGGREGATION_WAITING_TIMEOUT
+        value: "100000"
+      - name: SGLANG_MOONCAKE_CUSTOM_MEM_POOL
+        value: "True"
+      - name: MC_FORCE_MNNVL
+        value: "1"
+      - name: NCCL_MNNVL_ENABLE
+        value: "1"
+      - name: NCCL_CUMEM_ENABLE
+        value: "1"
+      - name: SGLANG_USE_MESSAGE_QUEUE_BROADCASTER
+        value: "0"
+      - name: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK
+        value: "1"
+      - name: PYTHONUNBUFFERED
+        value: "1"
+    livenessProbe:  # timing for the Prefill service livenessProbe (GET /live)
+      initialDelaySeconds: 3000
+      periodSeconds: 60
+      timeoutSeconds: 150
+      failureThreshold: 100
+    readinessProbe:  # timing for the Prefill service readinessProbe (GET /health)
+      initialDelaySeconds: 3000
+      periodSeconds: 60
+      timeoutSeconds: 300
+      failureThreshold: 100
+    startupProbe:  # timing for the Prefill mainContainer startupProbe (GET /live)
+      initialDelaySeconds: 3000
+      periodSeconds: 60
+      timeoutSeconds: 600
+      failureThreshold: 3000
+
+
+secrets:
+  ngc:
+    secretName: nvcr-secret  # NOTE(review): not read by the templates visible in this patch — confirm usage
+  huggingface:
+    secretName: hf-token-secret  # passed as envFromSecret to the Decode and Prefill services
+    secretData:
+      token: "hf_api_token"  # placeholder; NOTE(review): not read by the templates visible in this patch — confirm
+
+volumes:
+  gcsfuse:
+    bucketName: "yijiaj-test"  # bucket behind the gcs-model-volume CSI volume of all three services
+    fileCacheCapacity: "500G"  # NOTE(review): unused; the Frontend annotation hard-codes "500Gi" — confirm
+    cachePath: "/gcs-cache"  # NOTE(review): unused; the Frontend annotation and mount hard-code this path — confirm
+  ssdMountPath: "/ssd"  # NOTE(review): not read by the templates visible in this patch
+  gcsMounts:  # NOTE(review): not read by the templates visible in this patch
+    - bucketName: "yijiaj-test"
+      mountPath: "/data/model"
+
+service:  # NOTE(review): no template visible in this patch reads these values — confirm usage
+  type: ClusterIP
+  ports:
+    frontend: 8000
+    worker: 9090  # same number as `port` in the decode/prefill argument files
+
+workload:
+  model: deepseek-ai/DeepSeek-R1  # NOTE(review): not read by the visible templates; the model path is hard-coded in the worker commands/configs
+  gpus: 16  # NOTE(review): not read by the visible templates; each worker pod requests gpu "4" directly
+  framework: sglang  # rendered as spec.backendFramework of the DynamoGraphDeployment
+
+network:  # NOTE(review): no template visible in this patch reads these values — confirm usage
+  subnetworks: []
+  gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic-arm64:v1.0.7
+  ncclSettings:
+    - name: NCCL_DEBUG
+      value: "VERSION"  # differs from dynamo.decodeWorker.envs, which sets NCCL_DEBUG to INFO
+
+quantizations:  # NOTE(review): not read by the templates visible in this patch — confirm usage
+  - "fp8"
diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml
new file mode 100644
index 00000000..82029f49
--- /dev/null
+++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml
@@ -0,0 +1,46 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# NOTE(review): presumably mounted as $SERVER_ARGS_FILE and read line by line by the Decode entrypoint ("key: value" -> "--key value", "true" -> bare "--key"); keep one pair per line and no inline comments.
+model-path: /data/model/deepseek-ai/DeepSeek-R1
+served-model-name: deepseek-ai/DeepSeek-R1
+log-level: DEBUG
+tp: "8"
+dp-size: "8"
+decode-log-interval: "1"
+page-size: "1"
+enable-dp-attention: true
+trust-remote-code: true
+disaggregation-mode: decode
+disaggregation-transfer-backend: nixl
+disaggregation-bootstrap-port: "30001"
+host: "0.0.0.0"
+port: "9090"
+max-running-requests: "36864"
+context-length: "2716"
+disable-radix-cache: true
+moe-a2a-backend: deepep
+prefill-round-robin-balance: true
+deepep-mode: normal
+moe-dense-tp-size: "1"
+enable-dp-lm-head: true
+disable-cuda-graph: true
+cuda-graph-max-bs: "256"
+disable-shared-experts-fusion: true
+ep-num-redundant-experts: "32"
+ep-dispatch-algorithm: static
+eplb-algorithm: deepseek
+attention-backend: cutlass_mla
+watchdog-timeout: "1000000"
+chunked-prefill-size: "36864"
+mem-fraction-static: "0.8"
diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml
new file mode 100644
index 00000000..939aa2cc
--- /dev/null
+++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml
@@ -0,0 +1,46 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# NOTE(review): presumably mounted as $SERVER_ARGS_FILE and read line by line by the Prefill entrypoint ("key: value" -> "--key value", "true" -> bare "--key"); keep one pair per line and no inline comments.
+model-path: /data/model/deepseek-ai/DeepSeek-R1
+served-model-name: deepseek-ai/DeepSeek-R1
+log-level: DEBUG
+tp: "8"
+dp-size: "8"
+trust-remote-code: true
+decode-log-interval: "1"
+page-size: "1"
+enable-dp-attention: true
+disaggregation-mode: prefill
+disaggregation-transfer-backend: nixl
+disaggregation-bootstrap-port: "30001"
+host: "0.0.0.0"
+port: "9090"
+max-running-requests: "6144"
+context-length: "2716"
+disable-radix-cache: true
+moe-a2a-backend: deepep
+load-balance-method: round_robin
+deepep-mode: normal
+moe-dense-tp-size: "1"
+enable-dp-lm-head: true
+disable-shared-experts-fusion: true
+ep-num-redundant-experts: "32"
+ep-dispatch-algorithm: static
+eplb-algorithm: deepseek
+attention-backend: cutlass_mla
+watchdog-timeout: "1000000"
+disable-cuda-graph: true
+chunked-prefill-size: "16384"
+max-total-tokens: "32768"
+mem-fraction-static: "0.8"
diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/Chart.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/Chart.yaml
new file mode 100644
index 00000000..25a2209e
--- /dev/null
+++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/Chart.yaml
@@ -0,0 +1,20 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v2  # Helm 3 chart format
+name: a4x-dynamo-deployment
+description: a4x-dynamo-deployment
+type: application
+version: 0.1.0  # chart version
+appVersion: "0.4.0"  # NOTE(review): differs from dynamo.releaseVersion "0.7.0" and the 0.6.1 image tags in the recipe values — confirm
\ No newline at end of file
diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-compute-domain.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-compute-domain.yaml
new file mode 100644
index 00000000..dc2ab53a
--- /dev/null
+++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-compute-domain.yaml
@@ -0,0 +1,24 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: resource.nvidia.com/v1beta1
+kind: ComputeDomain  # one per release; string fields are quoted so empty or number-like values stay strings
+metadata:
+  name: {{ .Values.dynamo.computeDomain.name | quote }}
+  namespace: {{ .Values.dynamo.namespace | quote }}
+spec:
+  numNodes: {{ .Values.dynamo.computeDomain.numNodes }}
+  channel:
+    resourceClaimTemplate:
+      name: {{ .Values.dynamo.computeDomain.resourceClaimTemplateName | quote }}  # same template name the worker pods list under extraPodSpec.resourceClaims
diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml
new file mode 100644
index 00000000..efe0306d
--- /dev/null
+++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml
@@ -0,0 +1,470 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # one graph with three services: Frontend, Decode and Prefill
+metadata:
+  name: {{ .Values.dynamo.deploymentName | default .Release.Name | quote }}  # falls back to the Helm release name when dynamo.deploymentName is unset
+  namespace: {{ .Values.dynamo.namespace | quote }}
+spec:
+  {{- if .Values.workload.framework }}
+  backendFramework: {{ .Values.workload.framework }}
+  {{- end }}
+  services:
+    Frontend:  # frontend component; CPU-only (no GPU limit, no ComputeDomain claim)
+      dynamoNamespace: {{ .Values.dynamo.namespace }}
+      componentType: frontend
+      replicas: {{ .Values.dynamo.frontend.replicas }}
+      resources:
+        requests:
+          cpu: "5"
+          memory: "10Gi"
+        limits:
+          cpu: "5"
+          memory: "10Gi"
+      extraPodMetadata:
+        annotations:
+          gke-gcsfuse/volumes: "true"
+          gke-gcsfuse/cpu-limit: "0"
+          gke-gcsfuse/memory-limit: "0"
+          gke-gcsfuse/ephemeral-storage-limit: "0"
+          gke-gcsfuse/file-cache-capacity: "500Gi"  # NOTE(review): hard-coded; volumes.gcsfuse.fileCacheCapacity ("500G") is not used
+          gke-gcsfuse/cache-path: "/gcs-cache"  # NOTE(review): hard-coded; volumes.gcsfuse.cachePath is not used
+      extraPodSpec:
+        tolerations:  # lets the pod schedule onto arm64- and GPU-tainted nodes
+        - key: "kubernetes.io/arch"
+          operator: "Equal"
+          value: "arm64"
+          effect: "NoSchedule"
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
+        volumes:
+        - name: local-ssd  # plain emptyDir despite the name; mounted at /gcs-cache below
+          emptyDir: {}
+        - name: gcs-model-volume
+          csi:
+            driver: gcsfuse.csi.storage.gke.io
+            volumeAttributes:
+              bucketName: {{ .Values.volumes.gcsfuse.bucketName }}
+              mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:50,file-cache:max-size-mb:-1"
+
+        mainContainer:
+          image: {{ .Values.dynamo.frontend.image }}
+          volumeMounts:
+          - name: local-ssd
+            mountPath: /gcs-cache
+          - name: gcs-model-volume
+            mountPath: /data/model  # bucket contents, read-only
+            readOnly: true
+          resources:
+            requests:
+              ephemeral-storage: "30Gi"
+            limits:
+              ephemeral-storage: "30Gi"
+
+    Decode:  # decode worker; timing values come from dynamo.decodeWorker in values.yaml
+      multinode:
+        nodeCount: {{ .Values.dynamo.decodeWorker.nodeCount }}
+      dynamoNamespace: {{ .Values.dynamo.namespace }}
+      envFromSecret: {{ .Values.secrets.huggingface.secretName }}
+      componentType: worker
+      subComponentType: decode
+      replicas: {{ .Values.dynamo.decodeWorker.replicas }}
+      livenessProbe:
+        httpGet:
+          path: /live
+          port: system
+        initialDelaySeconds: {{ .Values.dynamo.decodeWorker.livenessProbe.initialDelaySeconds }}
+        periodSeconds: {{ .Values.dynamo.decodeWorker.livenessProbe.periodSeconds }}
+        timeoutSeconds: {{ .Values.dynamo.decodeWorker.livenessProbe.timeoutSeconds }}
+        failureThreshold: {{ .Values.dynamo.decodeWorker.livenessProbe.failureThreshold }}
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: system
+        initialDelaySeconds: {{ .Values.dynamo.decodeWorker.readinessProbe.initialDelaySeconds }}
+        timeoutSeconds: {{ .Values.dynamo.decodeWorker.readinessProbe.timeoutSeconds }}
+        periodSeconds: {{ .Values.dynamo.decodeWorker.readinessProbe.periodSeconds }}
+        failureThreshold: {{ .Values.dynamo.decodeWorker.readinessProbe.failureThreshold }}
+      sharedMemory:
+        size: 80Gi
+      resources:
+        # Same shape as the Prefill service: 4 GPUs per pod plus the ComputeDomain channel claim.
+        limits:
+          gpu: "4"
+        claims:
+          - name: compute-domain-channel
+      envs:  # filled from dynamo.decodeWorker.envs; stays empty when that list is unset
+        {{- if .Values.dynamo.decodeWorker.envs }}
+        {{- toYaml .Values.dynamo.decodeWorker.envs | nindent 8 }}
+        {{- end }}
+      extraPodMetadata:
+        annotations:  # GCS FUSE sidecar settings plus the pod's network interface list (eth0 default, eth2-eth5 on rdma-0..rdma-3)
+          gke-gcsfuse/cpu-limit: "0"
+          gke-gcsfuse/ephemeral-storage-limit: "0"
+          gke-gcsfuse/memory-limit: "0"
+          gke-gcsfuse/volumes: "true"
+          networking.gke.io/default-interface: 'eth0'
+          networking.gke.io/interfaces: |
+            [
+              {"interfaceName":"eth0","network":"default"},
+              {"interfaceName":"eth2","network":"rdma-0"},
+              {"interfaceName":"eth3","network":"rdma-1"},
+              {"interfaceName":"eth4","network":"rdma-2"},
+              {"interfaceName":"eth5","network":"rdma-3"}
+            ]
+      extraPodSpec:  # pod-level settings for the Decode workers
+        resourceClaims:
+          - name: compute-domain-channel
+            resourceClaimTemplateName: {{ .Values.dynamo.computeDomain.resourceClaimTemplateName }}
+        affinity:
+          nodeAffinity:
+            requiredDuringSchedulingIgnoredDuringExecution:
+              nodeSelectorTerms:
+              - matchExpressions:
+                - key: kubernetes.io/arch
+                  operator: In
+                  values:
+                  - arm64
+        volumes:  # the only volumes key in this mapping; a second one would replace this list
+        - name: gcs-model-volume
+          csi:
+            driver: gcsfuse.csi.storage.gke.io
+            volumeAttributes:
+              bucketName: {{ .Values.volumes.gcsfuse.bucketName }}
+              mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1
+        - name: library-dir-host
+          hostPath:
+            path: /home/kubernetes/bin/nvidia
+        - name: gib
+          hostPath:
+            path: /home/kubernetes/bin/gib
+        # ConfigMap-backed volumes; only needed when a custom launcher is supplied.
+        {{- if .Values.workload_launcher }}
+        - name: serving-configuration
+          configMap:
+            name: "{{ .Release.Name }}-decode-config"
+            items:
+            - key: serving-configuration
+              path: {{ .Values.workload.configFile | default "serving-args.yaml" }}
+        - name: serving-launcher
+          configMap:
+            name: "{{ .Release.Name }}-launcher"
+            defaultMode: 0700
+        {{- end }}
+
+        mainContainer:
+          securityContext:
+              privileged: true
+          image: {{ .Values.dynamo.decodeWorker.image }}
+          workingDir: /sgl-workspace/dynamo/components/backends/sglang
+          startupProbe:
+            failureThreshold: {{ .Values.dynamo.decodeWorker.startupProbe.failureThreshold }}
+            httpGet:
+              path: /live
+              port: system
+            periodSeconds: {{ .Values.dynamo.decodeWorker.startupProbe.periodSeconds }}
+            timeoutSeconds: {{ .Values.dynamo.decodeWorker.startupProbe.timeoutSeconds }}
+            initialDelaySeconds: {{ .Values.dynamo.decodeWorker.startupProbe.initialDelaySeconds }}
+          command: ["/bin/bash", "-c"]
+          stdin: true
+          tty: true
+          args:
+          - |
+            set -e
+            nvidia-smi
+            . /usr/local/gib/scripts/set_nccl_env.sh
+
+            echo "--- VERIFYING NCCL ENV VARS IN SHELL ---"
+            env | grep NCCL_ || true
+            echo "--- END VERIFICATION ---"
+
+            {{- if .Values.workload_launcher }}
+            # Use custom launcher if provided; unset paths default to the ConfigMap mounts of this pod.
+            if [ ! -f "${LAUNCHER_SCRIPT:=/workload/launcher/launch-workload.sh}" ]; then
+              echo "Error: Launcher script $LAUNCHER_SCRIPT not found!"
+              exit 1
+            fi
+
+            ARGS=()
+            if [ -f "${SERVER_ARGS_FILE:={{ .Values.workload.configPath | default "/workload/configs" }}/{{ .Values.workload.configFile | default "serving-args.yaml" }}}" ]; then
+              echo "Loading server arguments from ConfigMap"
+              while IFS=': ' read -r key value || [ -n "$key" ]; do
+                [[ -z "$key" || "$key" == \#* ]] && continue
+                key=$(echo "$key" | xargs)
+                value=$(echo "$value" | xargs)
+
+                if [ -n "$key" ]; then
+                  if [[ "$value" == "true" ]]; then
+                    ARGS+=("--$key")
+                  elif [[ "$value" == "false" ]]; then
+                    ARGS+=("--$key" "false")
+                  elif [ -n "$value" ]; then
+                    ARGS+=("--$key" "$value")
+                  else
+                    ARGS+=("--$key")
+                  fi
+                fi
+              done < "$SERVER_ARGS_FILE"
+            fi
+
+            echo "Running: $LAUNCHER_SCRIPT ${ARGS[@]}"
+            exec "$LAUNCHER_SCRIPT" "${ARGS[@]}"
+            {{- else }}
+            exec python3 -m dynamo.sglang \
+              --model-path /data/model/deepseek-ai/DeepSeek-R1 \
+              --served-model-name deepseek-ai/DeepSeek-R1 \
+              --log-level DEBUG \
+              --tp 8 \
+              --dp-size 8 \
+              --decode-log-interval 1 \
+              --page-size 1 \
+              --enable-dp-attention \
+              --trust-remote-code \
+              --disaggregation-mode decode \
+              --disaggregation-transfer-backend nixl \
+              --disaggregation-bootstrap-port 30001 \
+              --host 0.0.0.0 \
+              --port 9090 \
+              --decode-log-interval 1 \
+              --max-running-requests 36864 \
+              --context-length 2716 \
+              --disable-radix-cache \
+              --moe-a2a-backend deepep \
+              --prefill-round-robin-balance \
+              --deepep-mode normal \
+              --moe-dense-tp-size 1 \
+              --enable-dp-lm-head \
+              --disable-cuda-graph \
+              --cuda-graph-max-bs 256 \
+              --disable-shared-experts-fusion \
+              --ep-num-redundant-experts 32 \
+              --ep-dispatch-algorithm static \
+              --eplb-algorithm deepseek \
+              --attention-backend cutlass_mla \
+              --watchdog-timeout 1000000 \
+              --chunked-prefill-size 36864 \
+              --mem-fraction-static 0.8
+            {{- end }}
+
+          volumeMounts:
+            - mountPath: /data/model
+              name: gcs-model-volume
+            - name: library-dir-host
+              mountPath: /usr/local/nvidia
+            - name: gib
+              mountPath: /usr/local/gib
+            {{- if .Values.workload_launcher }}
+            - name: serving-configuration
+              mountPath: {{ .Values.workload.configPath | default "/workload/configs" }}
+            - name: serving-launcher
+              mountPath: /workload/launcher
+            {{- end }}
+
+
+    Prefill:  # prefill worker; timing values come from dynamo.prefillWorker in values.yaml
+      multinode:
+        nodeCount: {{ .Values.dynamo.prefillWorker.nodeCount }}
+      dynamoNamespace: {{ .Values.dynamo.namespace }}
+      envFromSecret: {{ .Values.secrets.huggingface.secretName }}
+      componentType: worker
+      subComponentType: prefill
+      replicas: {{ .Values.dynamo.prefillWorker.replicas }}
+      livenessProbe:
+        httpGet:
+          path: /live
+          port: system
+        initialDelaySeconds: {{ .Values.dynamo.prefillWorker.livenessProbe.initialDelaySeconds }}
+        periodSeconds: {{ .Values.dynamo.prefillWorker.livenessProbe.periodSeconds }}
+        timeoutSeconds: {{ .Values.dynamo.prefillWorker.livenessProbe.timeoutSeconds }}
+        failureThreshold: {{ .Values.dynamo.prefillWorker.livenessProbe.failureThreshold }}
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: system
+        initialDelaySeconds: {{ .Values.dynamo.prefillWorker.readinessProbe.initialDelaySeconds }}
+        timeoutSeconds: {{ .Values.dynamo.prefillWorker.readinessProbe.timeoutSeconds }}
+        periodSeconds: {{ .Values.dynamo.prefillWorker.readinessProbe.periodSeconds }}
+        failureThreshold: {{ .Values.dynamo.prefillWorker.readinessProbe.failureThreshold }}
+      sharedMemory:
+        size: 80Gi
+      resources:  # 4 GPUs per pod plus the ComputeDomain channel claim
+        limits:
+          gpu: "4"
+        claims:
+          - name: compute-domain-channel
+      envs:  # filled from dynamo.prefillWorker.envs; stays empty when that list is unset
+        {{- if .Values.dynamo.prefillWorker.envs }}
+        {{- toYaml .Values.dynamo.prefillWorker.envs | nindent 8 }}
+        {{- end }}
+      extraPodMetadata:
+        annotations:  # GCS FUSE sidecar settings plus the pod's network interface list (eth0 default, eth2-eth5 on rdma-0..rdma-3)
+          gke-gcsfuse/cpu-limit: "0"
+          gke-gcsfuse/ephemeral-storage-limit: "0"
+          gke-gcsfuse/memory-limit: "0"
+          gke-gcsfuse/volumes: "true"
+          networking.gke.io/default-interface: 'eth0'
+          networking.gke.io/interfaces: |
+            [
+              {"interfaceName":"eth0","network":"default"},
+              {"interfaceName":"eth2","network":"rdma-0"},
+              {"interfaceName":"eth3","network":"rdma-1"},
+              {"interfaceName":"eth4","network":"rdma-2"},
+              {"interfaceName":"eth5","network":"rdma-3"}
+            ]
+      extraPodSpec:  # pod-level settings for the Prefill workers
+        resourceClaims:
+          - name: compute-domain-channel
+            resourceClaimTemplateName: {{ .Values.dynamo.computeDomain.resourceClaimTemplateName }}
+        affinity:
+          nodeAffinity:
+            requiredDuringSchedulingIgnoredDuringExecution:
+              nodeSelectorTerms:
+              - matchExpressions:
+                - key: kubernetes.io/arch
+                  operator: In
+                  values:
+                  - arm64
+        volumes:  # the only volumes key in this mapping; a second one would replace this list
+        - name: gcs-model-volume
+          csi:
+            driver: gcsfuse.csi.storage.gke.io
+            volumeAttributes:
+              bucketName: {{ .Values.volumes.gcsfuse.bucketName }}
+              mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1
+        - name: library-dir-host
+          hostPath:
+            path: /home/kubernetes/bin/nvidia
+        - name: gib
+          hostPath:
+            path: /home/kubernetes/bin/gib
+        # ConfigMap-backed volumes; only needed when a custom launcher is supplied.
+        {{- if .Values.workload_launcher }}
+        - name: serving-configuration
+          configMap:
+            name: "{{ .Release.Name }}-prefill-config"
+            items:
+            - key: serving-configuration
+              path: {{ .Values.workload.configFile | default "serving-args.yaml" }}
+        - name: serving-launcher
+          configMap:
+            name: "{{ .Release.Name }}-launcher"
+            defaultMode: 0700
+        {{- end }}
+        mainContainer:
+          securityContext:
+              privileged: true
+          stdin: true
+          tty: true
+          image: {{ .Values.dynamo.prefillWorker.image }}
+          workingDir: /sgl-workspace/dynamo/components/backends/sglang
+          startupProbe:
+            failureThreshold: {{ .Values.dynamo.prefillWorker.startupProbe.failureThreshold }}
+            httpGet:
+              path: /live
+              port: system
+            periodSeconds: {{ .Values.dynamo.prefillWorker.startupProbe.periodSeconds }}
+            timeoutSeconds: {{ .Values.dynamo.prefillWorker.startupProbe.timeoutSeconds }}
+            initialDelaySeconds: {{ .Values.dynamo.prefillWorker.startupProbe.initialDelaySeconds }}
+          command: ["/bin/bash", "-c"]
+          args:
+          - |
+            set -e
+            nvidia-smi
+            . /usr/local/gib/scripts/set_nccl_env.sh
+            echo "Pre-compiling DeepGEMM kernels for Prefill Worker..."
+
+            echo "Finished pre-compiling DeepGEMM kernels for Prefill Worker."
+            {{- if .Values.workload_launcher }}
+            # Use custom launcher if provided; unset paths default to the ConfigMap mounts of this pod.
+            if [ ! -f "${LAUNCHER_SCRIPT:=/workload/launcher/launch-workload.sh}" ]; then
+              echo "Error: Launcher script $LAUNCHER_SCRIPT not found!"
+              exit 1
+            fi
+
+            ARGS=("--is-prefill-worker")
+            if [ -f "${SERVER_ARGS_FILE:={{ .Values.workload.configPath | default "/workload/configs" }}/{{ .Values.workload.configFile | default "serving-args.yaml" }}}" ]; then
+              echo "Loading server arguments from ConfigMap"
+              while IFS=': ' read -r key value || [ -n "$key" ]; do
+                [[ -z "$key" || "$key" == \#* ]] && continue
+                key=$(echo "$key" | xargs)
+                value=$(echo "$value" | xargs)
+
+                if [ -n "$key" ]; then
+                  if [[ "$value" == "true" ]]; then
+                    ARGS+=("--$key")
+                  elif [[ "$value" == "false" ]]; then
+                    ARGS+=("--$key" "false")
+                  elif [ -n "$value" ]; then
+                    ARGS+=("--$key" "$value")
+                  else
+                    ARGS+=("--$key")
+                  fi
+                fi
+              done < "$SERVER_ARGS_FILE"
+            fi
+
+            echo "Running: $LAUNCHER_SCRIPT ${ARGS[@]}"
+            exec "$LAUNCHER_SCRIPT" "${ARGS[@]}"
+            {{- else }}
+            exec python3 -m dynamo.sglang \
+              --model-path /data/model/deepseek-ai/DeepSeek-R1 \
+              --served-model-name deepseek-ai/DeepSeek-R1 \
+              --log-level DEBUG \
+              --tp 8 \
+              --dp-size 8 \
+              --trust-remote-code \
+              --decode-log-interval 1 \
+              --page-size 1 \
+              --enable-dp-attention \
+              --disaggregation-mode prefill \
+              --disaggregation-transfer-backend nixl \
+              --disaggregation-bootstrap-port 30001 \
+              --host 0.0.0.0 \
+              --port 9090 \
+              --decode-log-interval 1 \
+              --max-running-requests 6144 \
+              --context-length 2716 \
+              --disable-radix-cache \
+              --moe-a2a-backend deepep \
+              --load-balance-method round_robin \
+              --deepep-mode normal \
+              --moe-dense-tp-size 1 \
+              --enable-dp-lm-head \
+              --disable-shared-experts-fusion \
+              --ep-num-redundant-experts 32 \
+              --ep-dispatch-algorithm static \
+              --eplb-algorithm deepseek \
+              --attention-backend cutlass_mla \
+              --watchdog-timeout 1000000 \
+              --disable-cuda-graph \
+              --chunked-prefill-size 16384 \
+              --max-total-tokens 32768 \
+              --mem-fraction-static 0.8
+            {{- end }}
+
+          volumeMounts:
+            - mountPath: /data/model
+              name: gcs-model-volume
+            - name: library-dir-host
+              mountPath: /usr/local/nvidia
+            - name: gib
+              mountPath: /usr/local/gib
+            {{- if .Values.workload_launcher }}
+            - name: serving-configuration
+              mountPath: {{ .Values.workload.configPath | default "/workload/configs" }}
+            - name: serving-launcher
+              mountPath: /workload/launcher
+            {{- end }}
diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-launcher-configmap.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-launcher-configmap.yaml
new file mode 100644
index 00000000..01e9b51f
--- /dev/null
+++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-launcher-configmap.yaml
@@ -0,0 +1,28 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: ConfigMap  # Holds the launch-workload.sh script for this release.
+metadata:
+  name: "{{ .Release.Name }}-launcher"
+  namespace: {{ .Values.dynamo.namespace | quote }}
+data:  # Script body is .Values.workload_launcher when set; otherwise a stub that exits 1.
+  launch-workload.sh: |-
+{{- if .Values.workload_launcher }}{{/* nindent emits its own leading newline; left-chomp below so the script does not start with a blank line. */}}
+{{- .Values.workload_launcher | nindent 4 }}
+{{- else }}
+    #!/bin/bash
+    echo "No workload launcher specified"
+    exit 1
+{{- end }}
\ No newline at end of file
diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-worker-configmap.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-worker-configmap.yaml
new file mode 100644
index 00000000..f82580ae
--- /dev/null
+++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-worker-configmap.yaml
@@ -0,0 +1,35 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+{{- if .Values.prefill_serving_config }}{{/* Rendered only when prefill_serving_config is set. */}}
+apiVersion: v1
+kind: ConfigMap  # Prefill serving arguments, stored under the "serving-configuration" key.
+metadata:
+  name: "{{ .Release.Name }}-prefill-config"
+  namespace: {{ .Values.dynamo.namespace | quote }}
+data:  # nindent emits its own leading newline; the value below is left-chomped so the file has no blank first line.
+  serving-configuration: |-
+{{- .Values.prefill_serving_config | nindent 4 }}
+{{- end }}
+---
+{{- if .Values.decode_serving_config }}{{/* Rendered only when decode_serving_config is set. */}}
+apiVersion: v1
+kind: ConfigMap  # Decode serving arguments, stored under the "serving-configuration" key.
+metadata:
+  name: "{{ .Release.Name }}-decode-config"
+  namespace: {{ .Values.dynamo.namespace | quote }}
+data:  # nindent emits its own leading newline; the value below is left-chomped so the file has no blank first line.
+  serving-configuration: |-
+{{- .Values.decode_serving_config | nindent 4 }}
+{{- end }}
\ No newline at end of file

From 42b686d07d346cfdd7e27886dfe79dd6e98b5e64 Mon Sep 17 00:00:00 2001
From: Yijia J <yijiaj@google.com>
Date: Sat, 27 Dec 2025 00:01:31 +0000
Subject: [PATCH 02/12] fix values

---
 .../disaggregated-serving/dynamo/README.md    | 24 ++++++++-----------
 .../disaggregated-serving/dynamo/values.yaml  |  2 +-
 2 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md
index ec499d10..9a5f91e6 100644
--- a/inference/a4x/disaggregated-serving/dynamo/README.md
+++ b/inference/a4x/disaggregated-serving/dynamo/README.md
@@ -1,28 +1,24 @@
 # Disaggregated Multi-Node Dynamo Recipe for A4x
 
-This recipe runs a disaggregated multi-node Dynamo deployment on A4x.
+This recipe runs a disaggregated multi-node Dynamo deployment on A4X.
 
 ## Setup
 
 1.  **Set Environment Variables**
 
     ```bash
-    export REPO_ROOT=$(git rev-parse --show-toplevel)
-    export RELEASE_VERSION="24.05"
     export USER=$(whoami)
     ```
 
 2.  **Run the Recipe**
 
-    ```bash
-    helm install -f values.yaml \
-      --set-file workload_launcher=$REPO_ROOT/src/launchers/dynamo-vllm-launcher.sh \
-      --set-file serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/llama-3.3-70b-multi-node.yaml \
-      --set workload.framework=vllm \
-      --set workload.model.name=meta-llama/Llama-3.3-70B-Instruct \
-      --set workload.image=nvcr.io/nvidia/ai-dynamo/vllm-runtime:${RELEASE_VERSION} \
-      --set workload.gpus=16 \
-      $USER-dynamo-multi-node-serving-a4x \
-      $REPO_ROOT/src/helm-charts/a4x/inference-templates/dynamo-deployment
-    ```
+  ```bash
+  cd $RECIPE_ROOT
+  helm install -f values.yaml \
+  --set-file workload_launcher=$REPO_ROOT/src/launchers/dynamo-sglang-launcher.sh \
+  --set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml \
+  --set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml \
+  $USER-dynamo-a4x-multi-node \
+  $REPO_ROOT/src/helm-charts/a4x/inference-templates/dynamo-deployment
+  ```
 
diff --git a/inference/a4x/disaggregated-serving/dynamo/values.yaml b/inference/a4x/disaggregated-serving/dynamo/values.yaml
index b49162bc..71f43c74 100644
--- a/inference/a4x/disaggregated-serving/dynamo/values.yaml
+++ b/inference/a4x/disaggregated-serving/dynamo/values.yaml
@@ -15,7 +15,7 @@
 dynamo:
   namespace: dynamo-cloud
   releaseVersion: "0.7.0"
-  deploymentName:
+  deploymentName: disagg2p2d-yijiaj
   computeDomain:
     name: yijiaj-a4x-domain
     numNodes: 4

From 3dfc415947c9b42b1036115b9457c1672bbffc1c Mon Sep 17 00:00:00 2001
From: Yijia J <yijiaj@google.com>
Date: Sat, 27 Dec 2025 00:28:09 +0000
Subject: [PATCH 03/12] update

---
 .../templates/dynamo-graph-deployment.yaml    | 52 +++++++++----------
 1 file changed, 24 insertions(+), 28 deletions(-)

diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml
index efe0306d..cb9fbbf0 100644
--- a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml
+++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml
@@ -101,7 +101,6 @@ spec:
       sharedMemory:
         size: 80Gi
       resources:
-        resources:
         limits:
           gpu: "4"
         claims:
@@ -138,20 +137,6 @@ spec:
                   operator: In
                   values:
                   - arm64
-        volumes:
-        - name: gcs-model-volume
-          csi:
-            driver: gcsfuse.csi.storage.gke.io
-            volumeAttributes:
-              bucketName: {{ .Values.volumes.gcsfuse.bucketName }}
-              mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1
-        - name: library-dir-host
-          hostPath:
-            path: /home/kubernetes/bin/nvidia
-        - name: gib
-          hostPath:
-            path: /home/kubernetes/bin/gib
-
         mainContainer:
           securityContext:
               privileged: true
@@ -260,6 +245,18 @@ spec:
               mountPath: /workload/launcher
             {{- end }}
         volumes:
+        - name: gcs-model-volume
+          csi:
+            driver: gcsfuse.csi.storage.gke.io
+            volumeAttributes:
+              bucketName: {{ .Values.volumes.gcsfuse.bucketName }}
+              mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1
+        - name: library-dir-host
+          hostPath:
+            path: /home/kubernetes/bin/nvidia
+        - name: gib
+          hostPath:
+            path: /home/kubernetes/bin/gib
         {{- if .Values.workload_launcher }}
         - name: serving-configuration
           configMap:
@@ -337,19 +334,6 @@ spec:
                   operator: In
                   values:
                   - arm64
-        volumes:
-        - name: gcs-model-volume
-          csi:
-            driver: gcsfuse.csi.storage.gke.io
-            volumeAttributes:
-              bucketName: {{ .Values.volumes.gcsfuse.bucketName }}
-              mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1
-        - name: library-dir-host
-          hostPath:
-            path: /home/kubernetes/bin/nvidia
-        - name: gib
-          hostPath:
-            path: /home/kubernetes/bin/gib
         mainContainer:
           securityContext:
               privileged: true
@@ -456,6 +440,18 @@ spec:
               mountPath: /workload/launcher
             {{- end }}
         volumes:
+        - name: gcs-model-volume
+          csi:
+            driver: gcsfuse.csi.storage.gke.io
+            volumeAttributes:
+              bucketName: {{ .Values.volumes.gcsfuse.bucketName }}
+              mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1
+        - name: library-dir-host
+          hostPath:
+            path: /home/kubernetes/bin/nvidia
+        - name: gib
+          hostPath:
+            path: /home/kubernetes/bin/gib
         {{- if .Values.workload_launcher }}
         - name: serving-configuration
           configMap:

From 36ccdb63e6b5444cc722654dc890070290fb00a9 Mon Sep 17 00:00:00 2001
From: Yijia J <yijiaj@google.com>
Date: Thu, 1 Jan 2026 00:25:42 +0000
Subject: [PATCH 04/12] recipe 2p2d, README

---
 .../disaggregated-serving/dynamo/README.md    | 302 +++++++++++++++++-
 .../disaggregated-serving/dynamo/values.yaml  |   8 +-
 .../templates/dynamo-graph-deployment.yaml    | 125 +-------
 3 files changed, 300 insertions(+), 135 deletions(-)

diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md
index 9a5f91e6..b185b4fb 100644
--- a/inference/a4x/disaggregated-serving/dynamo/README.md
+++ b/inference/a4x/disaggregated-serving/dynamo/README.md
@@ -1,24 +1,292 @@
-# Disaggregated Multi-Node Dynamo Recipe for A4x
+# Disaggregated Multi-Node Inference with NVIDIA Dynamo on A4X GKE
 
-This recipe runs a disaggregated multi-node Dynamo deployment on A4X.
+This document outlines the steps to deploy and serve Large Language Models (LLMs) using the [NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo) disaggregated inference platform on [A4X GKE node pools](https://cloud.google.com/kubernetes-engine).
 
-## Setup
+Dynamo provides a disaggregated architecture that separates prefill and decode operations for optimized inference performance, supporting both single-node (8 GPUs) and multi-node (16 GPUs) configurations. Dynamo also supports various inference framework backends like [vLLM](https://docs.nvidia.com/dynamo/latest/components/backends/vllm/README.html) and [SGLang](https://docs.nvidia.com/dynamo/latest/components/backends/sglang/README.html). In this recipe, we will focus on serving using the SGLang backend. 
 
-1.  **Set Environment Variables**
+<a name="table-of-contents"></a>
+## Table of Contents
 
-    ```bash
-    export USER=$(whoami)
-    ```
+* [1. Test Environment](#test-environment)
+* [2. Environment Setup (One-Time)](#environment-setup)
+  * [2.1. Clone the Repository](#clone-repo)
+  * [2.2. Configure Environment Variables](#configure-vars)
+  * [2.3. Connect to your GKE Cluster](#connect-cluster)
+  * [2.4. Create Secrets](#create-secrets)
+  * [2.5. Install Dynamo Platform](#install-platform)
+* [3. Deploy with SGLang Backend](#deploy-sglang)
+  * [3.1. Multi-Node SGLang Deployment (16 GPUs)](#sglang-multi-node)
+* [4. Inference Request](#inference-request)
+* [5. Monitoring and Troubleshooting](#monitoring)
+* [6. Cleanup](#cleanup)
 
-2.  **Run the Recipe**
+<a name="test-environment"></a>
+## 1. Test Environment
 
-  ```bash
-  cd $RECIPE_ROOT
-  helm install -f values.yaml \
-  --set-file workload_launcher=$REPO_ROOT/src/launchers/dynamo-sglang-launcher.sh \
-  --set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml \
-  --set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml \
-  $USER-dynamo-a4x-multi-node \
-  $REPO_ROOT/src/helm-charts/a4x/inference-templates/dynamo-deployment
-  ```
+[Back to Top](#table-of-contents)
+
+This recipe has been tested with the following configuration:
+
+* **GKE Cluster**:
+    * GPU node pools with [a4x-highgpu-4g](https://docs.cloud.google.com/compute/docs/gpus#gb200-gpus) machines:
+      * For multi-node deployment: 4 machines with 4 GPUs each (16 GPUs total)
+    * [Workload Identity Federation for GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/workload-identity) enabled
+    * [Cloud Storage FUSE CSI driver for GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/cloud-storage-fuse-csi-driver) enabled
+
+> [!IMPORTANT]
+> To prepare the required environment, see the [GKE environment setup guide](../../../../docs/configuring-environment-gke-a4x.md).
+    
+<a name="environment-setup"></a>
+## 2. Environment Setup (One-Time)
+
+[Back to Top](#table-of-contents)
+
+<a name="clone-repo"></a>
+### 2.1. Clone the Repository
+
+```bash
+git clone https://github.com/ai-hypercomputer/gpu-recipes.git
+cd gpu-recipes
+export REPO_ROOT=$(pwd)
+export RECIPE_ROOT=$REPO_ROOT/inference/a4x/disaggregated-serving/dynamo
+```
+
+<a name="configure-vars"></a>
+### 2.2. Configure Environment Variables
+
+```bash
+export PROJECT_ID=<PROJECT_ID>
+export CLUSTER_REGION=<REGION_of_your_cluster>
+export CLUSTER_NAME=<YOUR_GKE_CLUSTER_NAME>
+export NAMESPACE=dynamo-cloud
+export NGC_API_KEY=<YOUR_NGC_API_KEY>
+export HF_TOKEN=<YOUR_HF_TOKEN>
+export RELEASE_VERSION=0.7.0
+
+# Set the project for gcloud commands
+gcloud config set project $PROJECT_ID
+```
+
+Replace the following values:
+
+| Variable | Description | Example |
+| -------- | ----------- | ------- |
+| `PROJECT_ID` | Your Google Cloud Project ID | `gcp-project-12345` |
+| `CLUSTER_REGION` | The GCP region where your GKE cluster is located | `us-central1` |
+| `CLUSTER_NAME` | The name of your GKE cluster | `a4x-cluster` |
+| `NGC_API_KEY` | Your NVIDIA NGC API key (get from [NGC](https://ngc.nvidia.com)) | `nvapi-xxx...` |
+| `HF_TOKEN` | Your Hugging Face access token | `hf_xxx...` |
+
+<a name="connect-cluster"></a>
+### 2.3. Connect to your GKE Cluster
+
+```bash
+gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION
+```
+
+<a name="create-secrets"></a>
+### 2.4. Create Secrets
+
+Create the namespace:
+```bash
+kubectl create namespace ${NAMESPACE}
+kubectl config set-context --current --namespace=$NAMESPACE
+```
+
+Create the Docker registry secret for NVIDIA Container Registry:
+```bash
+kubectl create secret docker-registry nvcr-secret \
+  --namespace=${NAMESPACE} \
+  --docker-server=nvcr.io \
+  --docker-username='$oauthtoken' \
+  --docker-password=${NGC_API_KEY}
+```
+
+Create the secret for the Hugging Face token:
+```bash
+kubectl create secret generic hf-token-secret \
+  --from-literal=HF_TOKEN=${HF_TOKEN} \
+  -n ${NAMESPACE}
+```
+
+<a name="install-platform"></a>
+### 2.5. Install Dynamo Platform (One-Time Setup)
+
+Add the NVIDIA Helm repository:
+```bash
+helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \
+  --username='$oauthtoken' --password=${NGC_API_KEY}
+helm repo update
+```
+
+Fetch the Dynamo Helm charts:
+```bash
+helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-crds-${RELEASE_VERSION}.tgz
+helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-platform-${RELEASE_VERSION}.tgz
+```
+
+Install the Dynamo CRDs:
+```bash
+helm install dynamo-crds dynamo-crds-${RELEASE_VERSION}.tgz \
+  --namespace default \
+  --wait \
+  --atomic
+```
+
+Install the Dynamo Platform with Grove & Kai scheduler enabled:
+```bash
+helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz \
+  --namespace ${NAMESPACE} --set grove.enabled=true --set kai-scheduler.enabled=true
+```
+
+Verify the installation:
+```bash
+kubectl get pods -n ${NAMESPACE}
+```
+
+Wait until all pods show a `Running` status before proceeding.
+
+<a name="deploy-sglang"></a>
+## 3. Deploy with SGLang Backend
+
+[Back to Top](#table-of-contents)
+
+Deploy Dynamo with SGLang backend for high-performance inference.
+
+<a name="sglang-multi-node"></a>
+### 3.1. Multi-Node SGLang Deployment (16 GPUs)
+
+Multi-node deployment uses 16 GPUs across 4 A4X machines, providing increased capacity for larger models or higher throughput.
+
+#### DeepSeekR1 671B Model
+
+Deploy DeepSeekR1-671B across multiple nodes for production workloads. Note the use of `--set-file prefill_serving_config` and `--set-file decode_serving_config`, which point to the prefill and decode model config files for a multi-node deployment scenario:
+
+```bash
+cd $RECIPE_ROOT
+helm install -f values.yaml \
+--set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml \
+--set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml \
+$USER-dynamo-a4x-multi-node \
+$REPO_ROOT/src/helm-charts/a4x/inference-templates/dynamo-deployment
+```
+
+<a name="inference-request"></a>
+## 4. Inference Request
+[Back to Top](#table-of-contents)
+
+To make an inference request to test the server, we can first run a health check against the server using `curl`:
+
+```bash
+kubectl exec -it -n ${NAMESPACE} deployment/$USER-dynamo-a4x-multi-node -- curl http://localhost:8000/health | jq
+```
+
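+You should see a server status like the example below, which shows a server that is not ready yet (`"status": "unhealthy"`). Wait until the status is reported as `healthy` before sending requests.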
+
+```json
+{
+  "instances": [
+    {
+      "component": "backend",
+      "endpoint": "load_metrics",
+      "instance_id": 3994861215823793160,
+      "namespace": "dynamo",
+      "transport": {
+        "nats_tcp": "dynamo_backend.load_metrics-3770991c30298c08"
+      }
+    },
+    {
+      "component": "prefill",
+      "endpoint": "clear_kv_blocks",
+      "instance_id": 3994861215823793153,
+      "namespace": "dynamo",
+      "transport": {
+        "nats_tcp": "dynamo_prefill.clear_kv_blocks-3770991c30298c01"
+      }
+    },
+    {
+      "component": "prefill",
+      "endpoint": "generate",
+      "instance_id": 3994861215823793153,
+      "namespace": "dynamo",
+      "transport": {
+        "nats_tcp": "dynamo_prefill.generate-3770991c30298c01"
+      }
+    }
+  ],
+  "message": "No endpoints available",
+  "status": "unhealthy"
+}
+``` 
+
+Then we can send a benchmark request like this:
+
+```bash
+kubectl exec -n ${NAMESPACE} $USER-dynamo-multi-node-serving-frontend -- python3 -u -m sglang.bench_serving    --backend sglang-oai-chat    --base-url http://localhost:8000    --model "deepseek-ai/DeepSeek-R1"    --tokenizer /data/model/deepseek-ai/DeepSeek-R1    --dataset-name random    --num-prompts 2048    --random-input-len 2048    --random-output-len 512    --max-concurrency 512
+```
+
+<a name="monitoring"></a>
+## 5. Monitoring and Troubleshooting
+
+[Back to Top](#table-of-contents)
+
+View logs for different components (replace with your deployment name):
+
+You can find the exact pod names by running:
+```bash
+kubectl get pods -n ${NAMESPACE}
+```
+
+Frontend logs:
+```bash
+kubectl logs -f deployment/$USER-dynamo-multi-node-serving-frontend -n ${NAMESPACE}
+```
+
+Decode worker logs:
+```bash
+kubectl logs -f deployment/$USER-dynamo-multi-node-serving-decode-worker -n ${NAMESPACE}
+```
+
+Prefill worker logs:
+```bash
+kubectl logs -f deployment/$USER-dynamo-multi-node-serving-prefill-worker -n ${NAMESPACE}
+```
+
+Common issues:
+
+* **Pods stuck in Pending**: Check if nodes have sufficient resources (especially for multi-node deployments)
+* **Model download slow**: Large models like DeepSeekR1 671B can take 30 minutes to download
+* **Multi-node issues**: Verify network connectivity between nodes and proper subnet configuration
+
+<a name="cleanup"></a>
+## 6. Cleanup
+
+[Back to Top](#table-of-contents)
+
+List deployed releases:
+```bash
+helm list -n ${NAMESPACE} --filter $USER-dynamo-
+```
+
+Uninstall specific deployments:
+```bash
+helm uninstall $USER-dynamo-a4x-multi-node -n ${NAMESPACE}
+```
+
+Uninstall Dynamo platform (if no longer needed):
+```bash
+helm uninstall dynamo-platform -n ${NAMESPACE}
+helm uninstall dynamo-crds -n default
+```
+
+Delete namespace and secrets:
+```bash
+kubectl delete namespace ${NAMESPACE}
+```
+
+Clean up downloaded charts:
+```bash
+rm -f dynamo-crds-${RELEASE_VERSION}.tgz
+rm -f dynamo-platform-${RELEASE_VERSION}.tgz
+```
 
diff --git a/inference/a4x/disaggregated-serving/dynamo/values.yaml b/inference/a4x/disaggregated-serving/dynamo/values.yaml
index 71f43c74..9c271b35 100644
--- a/inference/a4x/disaggregated-serving/dynamo/values.yaml
+++ b/inference/a4x/disaggregated-serving/dynamo/values.yaml
@@ -15,11 +15,11 @@
 dynamo:
   namespace: dynamo-cloud
   releaseVersion: "0.7.0"
-  deploymentName: disagg2p2d-yijiaj
+  deploymentName: disagg2p2d
   computeDomain:
-    name: yijiaj-a4x-domain
+    name: a4x-domain
     numNodes: 4
-    resourceClaimTemplateName: yijiaj-a4x-channel
+    resourceClaimTemplateName: a4x-channel
   frontend:
     image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1
     replicas: 1
@@ -185,6 +185,8 @@ workload:
   model: deepseek-ai/DeepSeek-R1
   gpus: 16
   framework: sglang
+  configFile: serving-args.yaml
+  configPath: /workload/configs
 
 network:
   subnetworks: []
diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml
index cb9fbbf0..67444375 100644
--- a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml
+++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml
@@ -106,6 +106,8 @@ spec:
         claims:
           - name: compute-domain-channel
       envs:
+        - name: SERVER_ARGS_FILE
+          value: {{ .Values.workload.configPath }}/{{ .Values.workload.configFile }}
         {{- if .Values.dynamo.decodeWorker.envs }}
         {{- toYaml .Values.dynamo.decodeWorker.envs | nindent 8 }}
         {{- end }}
@@ -163,13 +165,6 @@ spec:
             env | grep NCCL_
             echo "--- END VERIFICATION ---"
 
-            {{- if .Values.workload_launcher }}
-            # Use custom launcher if provided
-            if [ ! -f "$LAUNCHER_SCRIPT" ]; then
-              echo "Error: Launcher script $LAUNCHER_SCRIPT not found!"
-              exit 1
-            fi
-
             ARGS=()
             if [ -f "$SERVER_ARGS_FILE" ]; then
               echo "Loading server arguments from ConfigMap"
@@ -191,45 +186,8 @@ spec:
                 fi
               done < "$SERVER_ARGS_FILE"
             fi
-
-            echo "Running: $LAUNCHER_SCRIPT ${ARGS[@]}"
-            exec "$LAUNCHER_SCRIPT" "${ARGS[@]}"
-            {{- else }}
-            exec python3 -m dynamo.sglang \
-              --model-path /data/model/deepseek-ai/DeepSeek-R1 \
-              --served-model-name deepseek-ai/DeepSeek-R1 \
-              --log-level DEBUG \
-              --tp 8 \
-              --dp-size 8 \
-              --decode-log-interval 1 \
-              --page-size 1 \
-              --enable-dp-attention \
-              --trust-remote-code \
-              --disaggregation-mode decode \
-              --disaggregation-transfer-backend nixl \
-              --disaggregation-bootstrap-port 30001 \
-              --host 0.0.0.0 \
-              --port 9090 \
-              --decode-log-interval 1 \
-              --max-running-requests 36864 \
-              --context-length 2716 \
-              --disable-radix-cache \
-              --moe-a2a-backend deepep \
-              --prefill-round-robin-balance \
-              --deepep-mode normal \
-              --moe-dense-tp-size 1 \
-              --enable-dp-lm-head \
-              --disable-cuda-graph \
-              --cuda-graph-max-bs 256 \
-              --disable-shared-experts-fusion \
-              --ep-num-redundant-experts 32 \
-              --ep-dispatch-algorithm static \
-              --eplb-algorithm deepseek \
-              --attention-backend cutlass_mla \
-              --watchdog-timeout 1000000 \
-              --chunked-prefill-size 36864 \
-              --mem-fraction-static 0.8
-            {{- end }}
+            echo "Running: python3 -m dynamo.sglang ${ARGS[@]}"
+            exec python3 -m dynamo.sglang "${ARGS[@]}"
 
           volumeMounts:
             - mountPath: /data/model
@@ -238,12 +196,8 @@ spec:
               mountPath: /usr/local/nvidia
             - name: gib
               mountPath: /usr/local/gib
-            {{- if .Values.workload_launcher }}
             - name: serving-configuration
               mountPath: {{ .Values.workload.configPath | default "/workload/configs" }}
-            - name: serving-launcher
-              mountPath: /workload/launcher
-            {{- end }}
         volumes:
         - name: gcs-model-volume
           csi:
@@ -257,19 +211,12 @@ spec:
         - name: gib
           hostPath:
             path: /home/kubernetes/bin/gib
-        {{- if .Values.workload_launcher }}
         - name: serving-configuration
           configMap:
             name: "{{ .Release.Name }}-decode-config"
             items:
             - key: serving-configuration
               path: {{ .Values.workload.configFile | default "serving-args.yaml" }}
-        - name: serving-launcher
-          configMap:
-            name: "{{ .Release.Name }}-launcher"
-            defaultMode: 0700
-        {{- end }}
-
 
     Prefill:
       multinode:
@@ -303,6 +250,8 @@ spec:
         claims:
           - name: compute-domain-channel
       envs:
+        - name: SERVER_ARGS_FILE
+          value: {{ .Values.workload.configPath }}/{{ .Values.workload.configFile }}
         {{- if .Values.dynamo.prefillWorker.envs }}
         {{- toYaml .Values.dynamo.prefillWorker.envs | nindent 8 }}
         {{- end }}
@@ -356,16 +305,9 @@ spec:
             nvidia-smi
             . /usr/local/gib/scripts/set_nccl_env.sh
             echo "Pre-compiling DeepGEMM kernels for Prefill Worker..."
-
             echo "Finished pre-compiling DeepGEMM kernels for Prefill Worker."
-            {{- if .Values.workload_launcher }}
-            # Use custom launcher if provided
-            if [ ! -f "$LAUNCHER_SCRIPT" ]; then
-              echo "Error: Launcher script $LAUNCHER_SCRIPT not found!"
-              exit 1
-            fi
 
-            ARGS=("--is-prefill-worker")
+            ARGS=()
             if [ -f "$SERVER_ARGS_FILE" ]; then
               echo "Loading server arguments from ConfigMap"
               while IFS=': ' read -r key value || [ -n "$key" ]; do
@@ -386,45 +328,8 @@ spec:
                 fi
               done < "$SERVER_ARGS_FILE"
             fi
-
-            echo "Running: $LAUNCHER_SCRIPT ${ARGS[@]}"
-            exec "$LAUNCHER_SCRIPT" "${ARGS[@]}"
-            {{- else }}
-            exec python3 -m dynamo.sglang \
-              --model-path /data/model/deepseek-ai/DeepSeek-R1 \
-              --served-model-name deepseek-ai/DeepSeek-R1 \
-              --log-level DEBUG \
-              --tp 8 \
-              --dp-size 8 \
-              --trust-remote-code \
-              --decode-log-interval 1 \
-              --page-size 1 \
-              --enable-dp-attention \
-              --disaggregation-mode prefill \
-              --disaggregation-transfer-backend nixl \
-              --disaggregation-bootstrap-port 30001 \
-              --host 0.0.0.0 \
-              --port 9090 \
-              --decode-log-interval 1 \
-              --max-running-requests 6144 \
-              --context-length 2716 \
-              --disable-radix-cache \
-              --moe-a2a-backend deepep \
-              --load-balance-method round_robin \
-              --deepep-mode normal \
-              --moe-dense-tp-size 1 \
-              --enable-dp-lm-head \
-              --disable-shared-experts-fusion \
-              --ep-num-redundant-experts 32 \
-              --ep-dispatch-algorithm static \
-              --eplb-algorithm deepseek \
-              --attention-backend cutlass_mla \
-              --watchdog-timeout 1000000 \
-              --disable-cuda-graph \
-              --chunked-prefill-size 16384 \
-              --max-total-tokens 32768 \
-              --mem-fraction-static 0.8
-            {{- end }}
+            echo "Running: python3 -m dynamo.sglang ${ARGS[@]}"
+            exec python3 -m dynamo.sglang "${ARGS[@]}"
     
           volumeMounts:
             - mountPath: /data/model
@@ -433,12 +338,8 @@ spec:
               mountPath: /usr/local/nvidia
             - name: gib
               mountPath: /usr/local/gib
-            {{- if .Values.workload_launcher }}
             - name: serving-configuration
               mountPath: {{ .Values.workload.configPath | default "/workload/configs" }}
-            - name: serving-launcher
-              mountPath: /workload/launcher
-            {{- end }}
         volumes:
         - name: gcs-model-volume
           csi:
@@ -452,15 +353,9 @@ spec:
         - name: gib
           hostPath:
             path: /home/kubernetes/bin/gib
-        {{- if .Values.workload_launcher }}
         - name: serving-configuration
           configMap:
             name: "{{ .Release.Name }}-prefill-config"
             items:
             - key: serving-configuration
-              path: {{ .Values.workload.configFile | default "serving-args.yaml" }}
-        - name: serving-launcher
-          configMap:
-            name: "{{ .Release.Name }}-launcher"
-            defaultMode: 0700
-        {{- end }}
+              path: {{ .Values.workload.configFile | default "serving-args.yaml" }}
\ No newline at end of file

From e7503e8c5efa13a7f5c3aab439d534b886dcd572 Mon Sep 17 00:00:00 2001
From: Yijia J <yijiaj@google.com>
Date: Sat, 10 Jan 2026 06:13:31 +0000
Subject: [PATCH 05/12] add 10p8d configs, add path without gcsfuse

---
 .../disaggregated-serving/dynamo/values.yaml  | 69 ++++++++++++-------
 .../deepseekr1-fp8-10p8d-decode.yaml          | 50 ++++++++++++++
 .../deepseekr1-fp8-10p8d-prefill.yaml         | 50 ++++++++++++++
 ...e.yaml => deepseekr1-fp8-2p2d-decode.yaml} |  1 -
 ....yaml => deepseekr1-fp8-2p2d-prefill.yaml} |  1 -
 .../templates/dynamo-graph-deployment.yaml    | 59 ++++++++++++++--
 6 files changed, 197 insertions(+), 33 deletions(-)
 create mode 100644 src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml
 create mode 100644 src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml
 rename src/frameworks/a4x/dynamo-configs/{deepseekr1-fp8-multi-node-decode.yaml => deepseekr1-fp8-2p2d-decode.yaml} (96%)
 rename src/frameworks/a4x/dynamo-configs/{deepseekr1-fp8-multi-node-prefill.yaml => deepseekr1-fp8-2p2d-prefill.yaml} (96%)

diff --git a/inference/a4x/disaggregated-serving/dynamo/values.yaml b/inference/a4x/disaggregated-serving/dynamo/values.yaml
index 9c271b35..a047a65f 100644
--- a/inference/a4x/disaggregated-serving/dynamo/values.yaml
+++ b/inference/a4x/disaggregated-serving/dynamo/values.yaml
@@ -13,15 +13,16 @@
 # limitations under the License.
 
 dynamo:
-  namespace: dynamo-cloud
+  namespace: yijiaj-test
   releaseVersion: "0.7.0"
-  deploymentName: disagg2p2d
+  deploymentName: disagg2p2d-yijiaj
   computeDomain:
-    name: a4x-domain
+    name: yijiaj-a4x-domain
     numNodes: 4
-    resourceClaimTemplateName: a4x-channel
+    resourceClaimTemplateName: yijiaj-a4x-channel
+  serviceAccountName: dynamo-platform-dynamo-operator-component
   frontend:
-    image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1
+    image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.0
     replicas: 1
     livenessProbe:
       initialDelaySeconds: 3000
@@ -34,24 +35,34 @@ dynamo:
       timeoutSeconds: 300
       failureThreshold: 100
   decodeWorker:
-    image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1
+    image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout 
+    #image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1
     nodeCount: 2
     replicas: 1
     envs:
+    - name: HF_TOKEN
+      valueFrom:
+        secretKeyRef:
+          name: hf-token-secret
+          key: HF_TOKEN
+    - name: HF_HUB_ENABLE_HF_TRANSFER
+      value: "1"
     - name: LD_LIBRARY_PATH
       value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64"
     - name: GLOO_SOCKET_IFNAME
       value: eth0
     - name: TP_SOCKET_IFNAME
       value: eth0
-    - name: SGLANG_ENABLE_JIT_DEEPGEMM
-      value: "1"
+    # - name: SGLANG_ENABLE_JIT_DEEPGEMM
+    #   value: "1"
     - name: DYN_SKIP_SGLANG_LOG_FORMATTING
       value: "1"
+    - name: SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK
+      value: "256"
     - name: MC_TE_METRIC
       value: "true"
-    - name: SGLANG_ENABLE_FLASHINFER_GEMM
-      value: "1"
+    # - name: SGLANG_ENABLE_FLASHINFER_GEMM
+    #   value: "1"
     - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE
       value: "100000"
     - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT
@@ -76,14 +87,14 @@ dynamo:
       value: "1"
     - name: PYTHONUNBUFFERED
       value: "1"
-    - name: NCCL_DEBUG
-      value: INFO
-    - name: NCCL_DEBUG_SUBSYS
-      value: INIT,BOOTSTRAP,ENV,NET,GRAPH
-    - name: NCCL_SOCKET_FAMILY
-      value: "AF_INET"
-    - name: GLOO_SOCKET_FAMILY
-      value: "AF_INET"
+    # - name: NCCL_DEBUG
+    #   value: INFO
+    # - name: NCCL_DEBUG_SUBSYS
+    #   value: INIT,BOOTSTRAP,ENV,NET,GRAPH
+    # - name: NCCL_SOCKET_FAMILY
+    #   value: "AF_INET"
+    # - name: GLOO_SOCKET_FAMILY
+    #   value: "AF_INET"
     livenessProbe:
       initialDelaySeconds: 3000
       periodSeconds: 60
@@ -100,10 +111,18 @@ dynamo:
       timeoutSeconds: 600
       failureThreshold: 3000
   prefillWorker:
-    image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1
+    image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout 
+    #image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1
     nodeCount: 2
     replicas: 1
     envs:
+      - name: HF_TOKEN
+        valueFrom:
+          secretKeyRef:
+            name: hf-token-secret
+            key: HF_TOKEN
+      - name: HF_HUB_ENABLE_HF_TRANSFER
+        value: "1"
       - name: LD_LIBRARY_PATH
         value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64"
       - name: UCX_TLS
@@ -112,14 +131,14 @@ dynamo:
         value: eth0
       - name: TP_SOCKET_IFNAME
         value: eth0
-      - name: SGLANG_ENABLE_JIT_DEEPGEMM
-        value: "1"
+      # - name: SGLANG_ENABLE_JIT_DEEPGEMM
+      #   value: "1"
       - name: DYN_SKIP_SGLANG_LOG_FORMATTING
         value: "1"
       - name: MC_TE_METRIC
         value: "true"
-      - name: SGLANG_ENABLE_FLASHINFER_GEMM
-        value: "1"
+      # - name: SGLANG_ENABLE_FLASHINFER_GEMM
+      #   value: "1"
       - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE
         value: "100000"
       - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT
@@ -166,14 +185,14 @@ secrets:
       token: "hf_api_token"
 
 volumes:
+  useGcs: true
   gcsfuse:
     bucketName: "yijiaj-test"
     fileCacheCapacity: "500G"
     cachePath: "/gcs-cache"
   ssdMountPath: "/ssd"
   gcsMounts:
-    - bucketName:  "yijiaj-test"
-      mountPath: "/data/model"
+    mountPath: "/data/model"
 
 service:
   type: ClusterIP
diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml
new file mode 100644
index 00000000..bbbdf18f
--- /dev/null
+++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml
@@ -0,0 +1,50 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+served-model-name: deepseek-ai/DeepSeek-R1
+disaggregation-mode: decode
+disaggregation-bootstrap-port: "30001"
+host: "0.0.0.0"
+port: "9090"
+trust-remote-code: true
+skip-tokenizer-init: true
+tp-size: "32"
+dp-size: "32"
+ep-size: "32" 
+quantization: "fp8"
+# page-size: "1"
+enable-dp-attention: true
+attention-backend: "trtllm_mla"
+kv-cache-dtype: "fp8_e4m3"
+disable-radix-cache: true
+stream-interval: "50"
+# disaggregation-transfer-backend: nixl
+decode-log-interval: "1000"
+max-running-requests: "8192"
+context-length: "9300"
+watchdog-timeout: "1000000"
+disable-shared-experts-fusion: true
+eplb-algorithm: deepseek
+mem-fraction-static: "0.82"
+chunked-prefill-size: "36864"
+moe-a2a-backend: "deepep"
+deepep-mode: "low_latency"
+ep-dispatch-algorithm: static
+moe-dense-tp-size: "1"
+enable-dp-lm-head: true
+prefill-round-robin-balance: true
+ep-num-redundant-experts: "32"
+cuda-graph-max-bs: "256"
+# disable-cuda-graph: true
+deepep-config: '{"normal_dispatch": {"num_sms": 128,"num_max_nvl_chunked_send_tokens": 28,"num_max_nvl_chunked_recv_tokens": 256,"num_max_rdma_chunked_send_tokens": 6,"num_max_rdma_chunked_recv_tokens": 256}, "normal_combine": {"num_sms": 128,"num_max_nvl_chunked_send_tokens": 15,"num_max_nvl_chunked_recv_tokens": 256,"num_max_rdma_chunked_send_tokens": 6,"num_max_rdma_chunked_recv_tokens": 128}}'
\ No newline at end of file
diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml
new file mode 100644
index 00000000..f5748607
--- /dev/null
+++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml
@@ -0,0 +1,50 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+served-model-name: deepseek-ai/DeepSeek-R1
+# log-level: DEBUG
+disaggregation-mode: prefill
+disaggregation-bootstrap-port: "30001"
+host: "0.0.0.0"
+port: "9090"
+trust-remote-code: true
+tp-size: "8"
+dp-size: "8"
+ep-size: "8"
+quantization: "fp8"
+enable-dp-attention: true
+attention-backend: "trtllm_mla"
+kv-cache-dtype: "fp8_e4m3"
+disable-radix-cache: true
+stream-interval: "50"
+max-running-requests: "30000"
+context-length: "9300"
+# decode-log-interval: "1"
+# page-size: "1"
+# disaggregation-transfer-backend: nixl
+watchdog-timeout: "1000000"
+disable-shared-experts-fusion: true
+eplb-algorithm: deepseek
+mem-fraction-static: "0.8"
+max-total-tokens: "524288"
+chunked-prefill-size: "131072"
+load-balance-method: round_robin
+disable-cuda-graph: true
+moe-a2a-backend: deepep
+deepep-mode: normal
+ep-dispatch-algorithm: "dynamic"
+moe-dense-tp-size: "1"
+enable-dp-lm-head: true
+ep-num-redundant-experts: "32"
+deepep-config: '{"normal_dispatch": {"num_sms": 128,"num_max_nvl_chunked_send_tokens": 28,"num_max_nvl_chunked_recv_tokens": 256,"num_max_rdma_chunked_send_tokens": 6,"num_max_rdma_chunked_recv_tokens": 256}, "normal_combine": {"num_sms": 128,"num_max_nvl_chunked_send_tokens": 15,"num_max_nvl_chunked_recv_tokens": 256,"num_max_rdma_chunked_send_tokens": 6,"num_max_rdma_chunked_recv_tokens": 128}}'
\ No newline at end of file
diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-decode.yaml
similarity index 96%
rename from src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml
rename to src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-decode.yaml
index 82029f49..a2287217 100644
--- a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml
+++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-decode.yaml
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-model-path: /data/model/deepseek-ai/DeepSeek-R1
 served-model-name: deepseek-ai/DeepSeek-R1
 log-level: DEBUG
 tp: "8"
diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-prefill.yaml
similarity index 96%
rename from src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml
rename to src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-prefill.yaml
index 939aa2cc..f2abbcd4 100644
--- a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml
+++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-prefill.yaml
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-model-path: /data/model/deepseek-ai/DeepSeek-R1
 served-model-name: deepseek-ai/DeepSeek-R1
 log-level: DEBUG
 tp: "8"
diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml
index 67444375..0ac6cdf5 100644
--- a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml
+++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml
@@ -35,12 +35,14 @@ spec:
           memory: "10Gi"
       extraPodMetadata:
         annotations:
+          {{- if eq .Values.volumes.useGcs true }}
           gke-gcsfuse/volumes: "true"
           gke-gcsfuse/cpu-limit: "0"
           gke-gcsfuse/memory-limit: "0"
           gke-gcsfuse/ephemeral-storage-limit: "0"
           gke-gcsfuse/file-cache-capacity: "500Gi"
           gke-gcsfuse/cache-path: "/gcs-cache"
+          {{- end }}
       extraPodSpec:
         tolerations:
         - key: "kubernetes.io/arch"
@@ -53,21 +55,24 @@ spec:
         volumes:
         - name: local-ssd
           emptyDir: {}
+        {{- if eq .Values.volumes.useGcs true }}
         - name: gcs-model-volume
           csi:
             driver: gcsfuse.csi.storage.gke.io
             volumeAttributes:
               bucketName: {{ .Values.volumes.gcsfuse.bucketName }}
               mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:50,file-cache:max-size-mb:-1"
-
+        {{- end }}
         mainContainer:
           image: {{ .Values.dynamo.frontend.image }}
+          {{- if eq .Values.volumes.useGcs true }}
           volumeMounts:
           - name: local-ssd
             mountPath: /gcs-cache
           - name: gcs-model-volume
             mountPath: /data/model
             readOnly: true
+          {{- end }}
           resources:
             requests:
               ephemeral-storage: "30Gi"
@@ -108,15 +113,21 @@ spec:
       envs:
         - name: SERVER_ARGS_FILE
           value: {{ .Values.workload.configPath }}/{{ .Values.workload.configFile }}
+        {{- if eq .Values.volumes.useGcs true }}
+        - name: MODEL_PATH
+          value: {{ .Values.volumes.gcsMounts.mountPath }}/{{ .Values.workload.model }}
+        {{- end }}
         {{- if .Values.dynamo.decodeWorker.envs }}
         {{- toYaml .Values.dynamo.decodeWorker.envs | nindent 8 }}
         {{- end }}
       extraPodMetadata:
         annotations:
+          {{- if eq .Values.volumes.useGcs true }}
           gke-gcsfuse/cpu-limit: "0"
           gke-gcsfuse/ephemeral-storage-limit: "0"
           gke-gcsfuse/memory-limit: "0"
           gke-gcsfuse/volumes: "true"
+          {{- end }}
           networking.gke.io/default-interface: 'eth0'
           networking.gke.io/interfaces: |
             [
@@ -127,6 +138,9 @@ spec:
               {"interfaceName":"eth5","network":"rdma-3"}
             ]
       extraPodSpec:
+        {{- if .Values.dynamo.serviceAccountName }}
+        serviceAccountName: {{ .Values.dynamo.serviceAccountName }}
+        {{- end }}
         resourceClaims:
           - name: compute-domain-channel
             resourceClaimTemplateName: {{ .Values.dynamo.computeDomain.resourceClaimTemplateName }}
@@ -164,8 +178,16 @@ spec:
             echo "--- VERIFYING NCCL ENV VARS IN SHELL ---"
             env | grep NCCL_
             echo "--- END VERIFICATION ---"
+            pip install hf_transfer
 
             ARGS=()
+            if [ -n "$MODEL_PATH" ]; then
+              echo "Adding model path from env var: $MODEL_PATH"
+              ARGS+=("--model-path" "$MODEL_PATH")
+            else
+              echo "No MODEL_PATH env var set from gcsfuse, relying on config file for model"
+              ARGS+=("--model" "{{ .Values.workload.model }}")
+            fi
             if [ -f "$SERVER_ARGS_FILE" ]; then
               echo "Loading server arguments from ConfigMap"
               while IFS=': ' read -r key value || [ -n "$key" ]; do
@@ -190,8 +212,10 @@ spec:
             exec python3 -m dynamo.sglang "${ARGS[@]}"
 
           volumeMounts:
+          {{- if eq .Values.volumes.useGcs true }}
             - mountPath: /data/model
               name: gcs-model-volume
+          {{- end }}
             - name: library-dir-host
               mountPath: /usr/local/nvidia
             - name: gib
@@ -199,12 +223,14 @@ spec:
             - name: serving-configuration
               mountPath: {{ .Values.workload.configPath | default "/workload/configs" }}
         volumes:
+        {{- if eq .Values.volumes.useGcs true }}
         - name: gcs-model-volume
           csi:
             driver: gcsfuse.csi.storage.gke.io
             volumeAttributes:
               bucketName: {{ .Values.volumes.gcsfuse.bucketName }}
               mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1
+        {{- end }}
         - name: library-dir-host
           hostPath:
             path: /home/kubernetes/bin/nvidia
@@ -227,9 +253,11 @@ spec:
       subComponentType: prefill
       replicas: {{ .Values.dynamo.prefillWorker.replicas }}
       livenessProbe:
-        httpGet:
-          path: /live
-          port: system
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
         initialDelaySeconds: {{ .Values.dynamo.prefillWorker.livenessProbe.initialDelaySeconds }} 
         periodSeconds: {{ .Values.dynamo.prefillWorker.livenessProbe.periodSeconds }}        
         timeoutSeconds: {{ .Values.dynamo.prefillWorker.livenessProbe.timeoutSeconds }}        
@@ -252,15 +280,21 @@ spec:
       envs:
         - name: SERVER_ARGS_FILE
           value: {{ .Values.workload.configPath }}/{{ .Values.workload.configFile }}
+        {{- if eq .Values.volumes.useGcs true }}
+        - name: MODEL_PATH
+          value: {{ .Values.volumes.gcsMounts.mountPath }}/{{ .Values.workload.model }}
+        {{- end }}
         {{- if .Values.dynamo.prefillWorker.envs }}
         {{- toYaml .Values.dynamo.prefillWorker.envs | nindent 8 }}
         {{- end }}
       extraPodMetadata:
         annotations:
+          {{- if eq .Values.volumes.useGcs true }}
           gke-gcsfuse/cpu-limit: "0"
           gke-gcsfuse/ephemeral-storage-limit: "0"
           gke-gcsfuse/memory-limit: "0"
           gke-gcsfuse/volumes: "true"
+          {{- end }}
           networking.gke.io/default-interface: 'eth0'
           networking.gke.io/interfaces: |
             [
@@ -271,6 +305,9 @@ spec:
               {"interfaceName":"eth5","network":"rdma-3"}
             ]
       extraPodSpec:
+        {{- if .Values.dynamo.serviceAccountName }}
+        serviceAccountName: {{ .Values.dynamo.serviceAccountName }}
+        {{- end }}
         resourceClaims:
           - name: compute-domain-channel
             resourceClaimTemplateName: {{ .Values.dynamo.computeDomain.resourceClaimTemplateName }}
@@ -304,10 +341,16 @@ spec:
             set -e
             nvidia-smi
             . /usr/local/gib/scripts/set_nccl_env.sh
-            echo "Pre-compiling DeepGEMM kernels for Prefill Worker..."
-            echo "Finished pre-compiling DeepGEMM kernels for Prefill Worker."
+            pip install hf_transfer
 
             ARGS=()
+            if [ -n "$MODEL_PATH" ]; then
+              echo "Adding model path from env var: $MODEL_PATH"
+              ARGS+=("--model-path" "$MODEL_PATH")
+            else
+              echo "No MODEL_PATH env var set from gcsfuse, relying on config file for model"
+              ARGS+=("--model" "{{ .Values.workload.model }}")
+            fi
             if [ -f "$SERVER_ARGS_FILE" ]; then
               echo "Loading server arguments from ConfigMap"
               while IFS=': ' read -r key value || [ -n "$key" ]; do
@@ -332,8 +375,10 @@ spec:
             exec python3 -m dynamo.sglang "${ARGS[@]}"
     
           volumeMounts:
+          {{- if eq .Values.volumes.useGcs true }}
             - mountPath: /data/model
               name: gcs-model-volume
+          {{- end }}
             - name: library-dir-host
               mountPath: /usr/local/nvidia
             - name: gib
@@ -341,12 +386,14 @@ spec:
             - name: serving-configuration
               mountPath: {{ .Values.workload.configPath | default "/workload/configs" }}
         volumes:
+        {{- if eq .Values.volumes.useGcs true }}
         - name: gcs-model-volume
           csi:
             driver: gcsfuse.csi.storage.gke.io
             volumeAttributes:
               bucketName: {{ .Values.volumes.gcsfuse.bucketName }}
               mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1
+        {{- end }}
         - name: library-dir-host
           hostPath:
             path: /home/kubernetes/bin/nvidia

From 3b5167275f53ce4b13913d89a54675d99e80e608 Mon Sep 17 00:00:00 2001
From: Yijia J <yijiaj@google.com>
Date: Tue, 13 Jan 2026 01:32:10 +0000
Subject: [PATCH 06/12] nit, update README and value to 18 nodes

---
 .../disaggregated-serving/dynamo/README.md    | 37 +++++++++++++++++--
 .../disaggregated-serving/dynamo/values.yaml  | 36 +++++-------------
 .../deepseekr1-fp8-10p8d-decode.yaml          |  3 --
 .../deepseekr1-fp8-10p8d-prefill.yaml         |  4 --
 4 files changed, 42 insertions(+), 38 deletions(-)

diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md
index b185b4fb..0a02a267 100644
--- a/inference/a4x/disaggregated-serving/dynamo/README.md
+++ b/inference/a4x/disaggregated-serving/dynamo/README.md
@@ -14,6 +14,7 @@ Dynamo provides a disaggregated architecture that separates prefill and decode o
   * [2.3. Connect to your GKE Cluster](#connect-cluster)
   * [2.4. Create Secrets](#create-secrets)
   * [2.5. Install Dynamo Platform](#install-platform)
+  * [2.6. Setup GCS Bucket for GKE ](#setup-gcsfuse)
 * [3. Deploy with SGLang Backend](#deploy-sglang)
   * [3.1. Multi-Node SGLang Deployment (16 GPUs)](#sglang-multi-node)
 * [4. Inference Request](#inference-request)
@@ -62,6 +63,7 @@ export NAMESPACE=dynamo-cloud
 export NGC_API_KEY=<YOUR_NGC_API_KEY>
 export HF_TOKEN=<YOUR_HF_TOKEN>
 export RELEASE_VERSION=0.7.0
+export GCS_BUCKET=<YOUR_GCS_BUCKET>
 
 # Set the project for gcloud commands
 gcloud config set project $PROJECT_ID
@@ -76,6 +78,7 @@ Replace the following values:
 | `CLUSTER_NAME` | The name of your GKE cluster | `a4x-cluster` |
 | `NGC_API_KEY` | Your NVIDIA NGC API key (get from [NGC](https://ngc.nvidia.com)) | `nvapi-xxx...` |
 | `HF_TOKEN` | Your Hugging Face access token | `hf_xxx...` |
+| `GCS_BUCKET` | Your GCS bucket name | `gs://xxx` |
 
 <a name="connect-cluster"></a>
 ### 2.3. Connect to your GKE Cluster
@@ -146,6 +149,32 @@ kubectl get pods -n ${NAMESPACE}
 
 Wait until all pods show a `Running` status before proceeding.
 
+<a name="setup-gcsfuse"></a>
+### 2.6. Setup GCS Bucket for GKE (One-Time Setup)
+
+We recommend using [gcsfuse](https://docs.cloud.google.com/kubernetes-engine/docs/how-to/cloud-storage-fuse-csi-driver-setup) to serve the model files from a GCS bucket and to avoid [Hugging Face rate limiting](https://huggingface.co/docs/hub/en/rate-limits#hub-rate-limits) issues.
+
+Find the Kubernetes service account that carries the `iam.gke.io/gcp-service-account` annotation (usually `default`):
+```bash
+kubectl get serviceaccounts -n ${NAMESPACE} -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.annotations.iam\.gke\.io/gcp-service-account}{"\n"}{end}'
+```
+
+Authorize the service account:
+```bash
+gcloud iam service-accounts add-iam-policy-binding xxx@project_id.iam.gserviceaccount.com \
+    --role roles/iam.workloadIdentityUser \
+    --member "serviceAccount:project_id.svc.id.goog[${NAMESPACE}/default]"
+```
+
+Grant read access to the bucket:
+```bash
+gcloud storage buckets add-iam-policy-binding ${GCS_BUCKET} \
+    --member "serviceAccount:xxx@project_id.iam.gserviceaccount.com" \
+    --role "roles/storage.objectViewer"
+```
+
+Finally, download the model files into the GCS bucket.
+
 <a name="deploy-sglang"></a>
 ## 3. Deploy with SGLang Backend
 
@@ -154,9 +183,9 @@ Wait until all pods show a `Running` status before proceeding.
 Deploy Dynamo with SGLang backend for high-performance inference.
 
 <a name="sglang-multi-node"></a>
-### 3.1. Multi-Node vLLM Deployment (16 GPUs)
+### 3.1. Multi-Node SGLang Deployment (72 GPUs)
 
-Multi-node deployment uses 16 GPUs across 4 A4X machines, providing increased capacity for larger models or higher throughput.
+Multi-node deployment uses 72 GPUs across 18 A4X machines, providing increased capacity for larger models or higher throughput.
 
 #### DeepSeekR1 671B Model
 
@@ -165,8 +194,8 @@ Deploy DeepSeekR1-671B across multiple nodes for production workloads. Note the
 ```bash
 cd $RECIPE_ROOT
 helm install -f values.yaml \
---set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml \
---set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml \
+--set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml \
+--set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml \
 $USER-dynamo-a4x-multi-node \
 $REPO_ROOT/src/helm-charts/a4x/inference-templates/dynamo-deployment
 ```
diff --git a/inference/a4x/disaggregated-serving/dynamo/values.yaml b/inference/a4x/disaggregated-serving/dynamo/values.yaml
index a047a65f..06954aae 100644
--- a/inference/a4x/disaggregated-serving/dynamo/values.yaml
+++ b/inference/a4x/disaggregated-serving/dynamo/values.yaml
@@ -13,17 +13,17 @@
 # limitations under the License.
 
 dynamo:
-  namespace: yijiaj-test
+  namespace: dynamo-cloud
   releaseVersion: "0.7.0"
-  deploymentName: disagg2p2d-yijiaj
+  deploymentName: disagg10p8d
   computeDomain:
-    name: yijiaj-a4x-domain
-    numNodes: 4
-    resourceClaimTemplateName: yijiaj-a4x-channel
-  serviceAccountName: dynamo-platform-dynamo-operator-component
+    name: a4x-domain
+    numNodes: 18
+    resourceClaimTemplateName: a4x-channel
+  serviceAccountName:
   frontend:
     image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.0
-    replicas: 1
+    replicas: 9
     livenessProbe:
       initialDelaySeconds: 3000
       periodSeconds: 60
@@ -36,8 +36,7 @@ dynamo:
       failureThreshold: 100
   decodeWorker:
     image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout 
-    #image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1
-    nodeCount: 2
+    nodeCount: 8
     replicas: 1
     envs:
     - name: HF_TOKEN
@@ -53,16 +52,12 @@ dynamo:
       value: eth0
     - name: TP_SOCKET_IFNAME
       value: eth0
-    # - name: SGLANG_ENABLE_JIT_DEEPGEMM
-    #   value: "1"
     - name: DYN_SKIP_SGLANG_LOG_FORMATTING
       value: "1"
     - name: SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK
       value: "256"
     - name: MC_TE_METRIC
       value: "true"
-    # - name: SGLANG_ENABLE_FLASHINFER_GEMM
-    #   value: "1"
     - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE
       value: "100000"
     - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT
@@ -87,14 +82,6 @@ dynamo:
       value: "1"
     - name: PYTHONUNBUFFERED
       value: "1"
-    # - name: NCCL_DEBUG
-    #   value: INFO
-    # - name: NCCL_DEBUG_SUBSYS
-    #   value: INIT,BOOTSTRAP,ENV,NET,GRAPH
-    # - name: NCCL_SOCKET_FAMILY
-    #   value: "AF_INET"
-    # - name: GLOO_SOCKET_FAMILY
-    #   value: "AF_INET"
     livenessProbe:
       initialDelaySeconds: 3000
       periodSeconds: 60
@@ -112,9 +99,8 @@ dynamo:
       failureThreshold: 3000
   prefillWorker:
     image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout 
-    #image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1
     nodeCount: 2
-    replicas: 1
+    replicas: 5
     envs:
       - name: HF_TOKEN
         valueFrom:
@@ -131,14 +117,10 @@ dynamo:
         value: eth0
       - name: TP_SOCKET_IFNAME
         value: eth0
-      # - name: SGLANG_ENABLE_JIT_DEEPGEMM
-      #   value: "1"
       - name: DYN_SKIP_SGLANG_LOG_FORMATTING
         value: "1"
       - name: MC_TE_METRIC
         value: "true"
-      # - name: SGLANG_ENABLE_FLASHINFER_GEMM
-      #   value: "1"
       - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE
         value: "100000"
       - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT
diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml
index bbbdf18f..4369e1ce 100644
--- a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml
+++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml
@@ -23,13 +23,11 @@ tp-size: "32"
 dp-size: "32"
 ep-size: "32" 
 quantization: "fp8"
-# page-size: "1"
 enable-dp-attention: true
 attention-backend: "trtllm_mla"
 kv-cache-dtype: "fp8_e4m3"
 disable-radix-cache: true
 stream-interval: "50"
-# disaggregation-transfer-backend: nixl
 decode-log-interval: "1000"
 max-running-requests: "8192"
 context-length: "9300"
@@ -46,5 +44,4 @@ enable-dp-lm-head: true
 prefill-round-robin-balance: true
 ep-num-redundant-experts: "32"
 cuda-graph-max-bs: "256"
-# disable-cuda-graph: true
 deepep-config: '{"normal_dispatch": {"num_sms": 128,"num_max_nvl_chunked_send_tokens": 28,"num_max_nvl_chunked_recv_tokens": 256,"num_max_rdma_chunked_send_tokens": 6,"num_max_rdma_chunked_recv_tokens": 256}, "normal_combine": {"num_sms": 128,"num_max_nvl_chunked_send_tokens": 15,"num_max_nvl_chunked_recv_tokens": 256,"num_max_rdma_chunked_send_tokens": 6,"num_max_rdma_chunked_recv_tokens": 128}}'
\ No newline at end of file
diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml
index f5748607..9c86f420 100644
--- a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml
+++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 served-model-name: deepseek-ai/DeepSeek-R1
-# log-level: DEBUG
 disaggregation-mode: prefill
 disaggregation-bootstrap-port: "30001"
 host: "0.0.0.0"
@@ -30,9 +29,6 @@ disable-radix-cache: true
 stream-interval: "50"
 max-running-requests: "30000"
 context-length: "9300"
-# decode-log-interval: "1"
-# page-size: "1"
-# disaggregation-transfer-backend: nixl
 watchdog-timeout: "1000000"
 disable-shared-experts-fusion: true
 eplb-algorithm: deepseek

From fb7dfa5dc94988875435a27e7a63a25252431661 Mon Sep 17 00:00:00 2001
From: Yijia J <yijiaj@google.com>
Date: Wed, 14 Jan 2026 17:30:37 +0000
Subject: [PATCH 07/12] Add 8GPU recipe, modify README

---
 .../disaggregated-serving/dynamo/README.md    |  33 +-
 .../disaggregated-serving/dynamo/test.yaml    | 389 ++++++++++++++++++
 .../{values.yaml => values_deepep.yaml}       |  33 +-
 .../dynamo/values_wo_deepep.yaml              | 203 +++++++++
 ...e.yaml => deepseekr1-fp8-1p1d-decode.yaml} |  45 +-
 ....yaml => deepseekr1-fp8-1p1d-prefill.yaml} |  43 +-
 .../templates/dynamo-graph-deployment.yaml    |  12 +-
 7 files changed, 682 insertions(+), 76 deletions(-)
 create mode 100644 inference/a4x/disaggregated-serving/dynamo/test.yaml
 rename inference/a4x/disaggregated-serving/dynamo/{values.yaml => values_deepep.yaml} (91%)
 create mode 100644 inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml
 rename src/frameworks/a4x/dynamo-configs/{deepseekr1-fp8-2p2d-decode.yaml => deepseekr1-fp8-1p1d-decode.yaml} (59%)
 rename src/frameworks/a4x/dynamo-configs/{deepseekr1-fp8-2p2d-prefill.yaml => deepseekr1-fp8-1p1d-prefill.yaml} (62%)

diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md
index 0a02a267..df1d99e0 100644
--- a/inference/a4x/disaggregated-serving/dynamo/README.md
+++ b/inference/a4x/disaggregated-serving/dynamo/README.md
@@ -16,7 +16,8 @@ Dynamo provides a disaggregated architecture that separates prefill and decode o
   * [2.5. Install Dynamo Platform](#install-platform)
   * [2.6. Setup GCS Bucket for GKE ](#setup-gcsfuse)
 * [3. Deploy with SGLang Backend](#deploy-sglang)
-  * [3.1. Multi-Node SGLang Deployment (16 GPUs)](#sglang-multi-node)
+  * [3.1. SGLang Deployment without DeepEP (8 GPUs)](#sglang-wo-deepep)
+  * [3.2. SGLang Deployment with DeepEP (72 GPUs)](#sglang-deepep)
 * [4. Inference Request](#inference-request)
 * [5. Monitoring and Troubleshooting](#monitoring)
 * [6. Cleanup](#cleanup)
@@ -180,16 +181,34 @@ Downloading model files into the gcs bucket.
 
 [Back to Top](#table-of-contents)
 
-Deploy Dynamo with SGLang backend for high-performance inference.
+Deploy Dynamo with SGLang backend for high-performance inference. 
 
-<a name="sglang-multi-node"></a>
-### 3.1. Multi-Node SGLang Deployment (72 GPUs)
+<a name="sglang-wo-deepep"></a>
+### 3.1. SGLang Deployment without DeepEP (8 GPUs)
 
-Multi-node deployment uses 72 GPUs across 18 A4X machines, providing increased capacity for larger models or higher throughput.
+This two-node deployment uses 8 GPUs across 2 A4X machines, targeting low latency.
 
 #### DeepSeekR1 671B Model
 
-Deploy DeepSeekR1-671B across multiple nodes for production workloads. Note the use of `--set-file prefill_serving_config` and `--set-file decode_serving_config` pointing to the correct model config file for a multi node deployment scenario: 
+Deploy DeepSeekR1-671B across 2 nodes for testing and validation. Note the use of `--set-file prefill_serving_config` and `--set-file decode_serving_config` pointing to the matching 1p1d prefill and decode model config files:
+
+```bash
+cd $RECIPE_ROOT
+helm install -f values_wo_deepep.yaml \
+--set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-prefill.yaml \
+--set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-decode.yaml \
+$USER-dynamo-a4x-1p1d \
+$REPO_ROOT/src/helm-charts/a4x/inference-templates/dynamo-deployment
+```
+
+<a name="sglang-deepep"></a>
+### 3.2. SGLang Deployment with DeepEP (72 GPUs)
+
+Multi-node deployment uses 72 GPUs across 18 A4X machines, providing increased capacity for larger models or higher throughput. 
+
+#### DeepSeekR1 671B Model
+
+Deploy DeepSeekR1-671B across 18 nodes for production workloads. Note the use of `--set-file prefill_serving_config` and `--set-file decode_serving_config` pointing to the correct model config file for a multi node deployment scenario: 
 
 ```bash
 cd $RECIPE_ROOT
@@ -251,7 +270,7 @@ You should see a server status like this. Wait for it to be in a `healthy` state
 Then we can send a benchmark request with like this:
 
 ```bash
-kubectl exec -n ${NAMESPACE} $USER-dynamo-multi-node-serving-frontend -- python3 -u -m sglang.bench_serving    --backend sglang-oai-chat    --base-url http://localhost:8000    --model "deepseek-ai/DeepSeek-R1"    --tokenizer /data/model/deepseek-ai/DeepSeek-R1    --dataset-name random    --num-prompts 2048    --random-input-len 2048    --random-output-len 512    --max-concurrency 512
+kubectl exec -n ${NAMESPACE} $USER-dynamo-multi-node-serving-frontend -- python3 -u -m sglang.bench_serving    --backend sglang-oai-chat    --base-url http://localhost:8000    --model "deepseek-ai/DeepSeek-R1"    --tokenizer /data/model/deepseek-ai/DeepSeek-R1    --dataset-name random    --num-prompts 10240   --random-input-len 8192  --random-range-ratio 0.8   --random-output-len 1024   --max-concurrency 2048
 ```
 
 <a name="monitoring"></a>
diff --git a/inference/a4x/disaggregated-serving/dynamo/test.yaml b/inference/a4x/disaggregated-serving/dynamo/test.yaml
new file mode 100644
index 00000000..92e43f57
--- /dev/null
+++ b/inference/a4x/disaggregated-serving/dynamo/test.yaml
@@ -0,0 +1,389 @@
+apiVersion: resource.nvidia.com/v1beta1
+kind: ComputeDomain
+metadata:
+  name: a4x-compute-domain-test
+  namespace: yijiaj-test
+spec:
+  numNodes: 2
+  channel:
+    resourceClaimTemplate:
+      name: a4x-compute-domain-channel-test
+---
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: yijiaj-test-1p1d
+spec:
+  services:
+    Frontend:
+      dynamoNamespace: yijiaj-test
+      componentType: frontend
+      replicas: 9
+      resources:
+        requests:
+          cpu: "5"
+          memory: "50Gi"
+        limits:
+          cpu: "5"
+          memory: "50Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.0
+          workingDir: /sgl-workspace/dynamo/components/backends/sglang
+          stdin: true
+          tty: true
+          command:
+            - /bin/sh
+            - -c
+          args:
+            - "python3 -m dynamo.frontend --http-port 8000"
+    Decode:
+      envFromSecret: hf-token-secret
+      livenessProbe:
+        httpGet:
+          path: /live
+          port: system
+        initialDelaySeconds: 600
+        periodSeconds: 30
+        timeoutSeconds: 15
+        failureThreshold: 5
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: system
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+        periodSeconds: 60
+        failureThreshold: 60
+      dynamoNamespace: yijiaj-test
+      componentType: worker
+      replicas: 1
+      resources:
+        limits:
+          gpu: "4"
+        claims:
+          - name: compute-domain-channel
+      sharedMemory:
+        size: 80Gi
+      envs:
+      - name: LD_LIBRARY_PATH
+        value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64"
+      - name: GLOO_SOCKET_IFNAME
+        value: eth0
+      - name: TP_SOCKET_IFNAME
+        value: eth0
+
+      - name: PYTHONUNBUFFERED
+        value: "1"
+      - name: DYN_SKIP_SGLANG_LOG_FORMATTING
+        value: "1"
+      - name: SGLANG_ENABLE_JIT_DEEPGEMM
+        value: "false"
+      - name: SGLANG_ENABLE_FLASHINFER_GEMM
+        value: "1"
+      - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE
+        value: "100000"
+      - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT
+        value: "100000"
+      - name: SGLANG_DISAGGREGATION_WAITING_TIMEOUT
+        value: "100000"
+      - name: SGLANG_DECODE_BOOTSTRAP_TIMEOUT
+        value: "1000"
+      - name: SGLANG_HACK_SEQ_BOOTSTRAP_ROOM
+        value: "1"
+      - name: SGLANG_MOONCAKE_CUSTOM_MEM_POOL
+        value: "True"
+      - name: SGLANG_USE_MESSAGE_QUEUE_BROADCASTER
+        value: "0"
+      - name: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK
+        value: "1"
+      - name: MC_TE_METRIC
+        value: "true"
+      - name: MC_FORCE_MNNVL
+        value: "1"
+      - name: NCCL_MNNVL_ENABLE
+        value: "1"
+      - name: NCCL_CUMEM_ENABLE
+        value: "1"
+
+
+      extraPodMetadata:
+        annotations:
+          gke-gcsfuse/cpu-limit: "0"
+          gke-gcsfuse/ephemeral-storage-limit: "0"
+          gke-gcsfuse/memory-limit: "0"
+          gke-gcsfuse/volumes: "true"
+          networking.gke.io/default-interface: 'eth0'
+          networking.gke.io/interfaces: |
+            [
+              {"interfaceName":"eth0","network":"default"},
+              {"interfaceName":"eth2","network":"rdma-0"},
+              {"interfaceName":"eth3","network":"rdma-1"},
+              {"interfaceName":"eth4","network":"rdma-2"},
+              {"interfaceName":"eth5","network":"rdma-3"}
+            ]
+      extraPodSpec:
+        resourceClaims:
+          - name: compute-domain-channel
+            resourceClaimTemplateName: a4x-compute-domain-channel-test
+        affinity:
+          nodeAffinity:
+            requiredDuringSchedulingIgnoredDuringExecution:
+              nodeSelectorTerms:
+              - matchExpressions:
+                - key: kubernetes.io/arch
+                  operator: In
+                  values:
+                  - arm64
+        volumes:
+        - name: model-src
+          csi:
+            driver: gcsfuse.csi.storage.gke.io
+            volumeAttributes:
+              bucketName: yijiaj-test
+              mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1
+        - name: library-dir-host
+          hostPath:
+            path: /home/kubernetes/bin/nvidia
+        - name: gib
+          hostPath:
+            path: /home/kubernetes/bin/gib
+        mainContainer:
+          securityContext:
+            privileged: true
+          startupProbe:
+            failureThreshold: 1800
+            httpGet:
+              path: /live
+              port: system
+            periodSeconds: 10
+            timeoutSeconds: 5
+          image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout
+          workingDir: /sgl-workspace/dynamo/components/backends/sglang
+          command: ["/bin/bash", "-c"]
+          stdin: true
+          tty: true
+          volumeMounts:
+          - mountPath: /data/model
+            name: model-src
+          - name: library-dir-host
+            mountPath: /usr/local/nvidia
+          - name: gib
+            mountPath: /usr/local/gib
+          args:
+            - |
+              set -e
+
+              nvidia-smi
+              . /usr/local/gib/scripts/set_nccl_env.sh
+
+              echo "--- VERIFYING NCCL ENV VARS IN SHELL ---"
+              env | grep NCCL_
+              echo "--- END VERIFICATION ---"
+
+              exec python3 -m dynamo.sglang \
+                --enable-metrics \
+                --model-path /data/model/deepseek-ai/DeepSeek-R1 \
+                --served-model-name deepseek-ai/DeepSeek-R1 \
+                --disaggregation-bootstrap-port 30001 \
+                --disaggregation-mode decode \
+                --host 0.0.0.0 \
+                --port 8000 \
+                --disable-radix-cache \
+                --tensor-parallel-size 4 \
+                --data-parallel-size 1 \
+                --expert-parallel-size 1 \
+                --trust-remote-code \
+                --kv-cache-dtype fp8_e4m3 \
+                --attention-backend trtllm_mla \
+                --quantization fp8 \
+                --moe-runner-backend flashinfer_trtllm \
+                --disable-radix-cache \
+                --watchdog-timeout 1000000 \
+                --context-length 9600 \
+                --mem-fraction-static 0.95 \
+                --chunked-prefill-size 8192 \
+                --cuda-graph-max-bs 512 \
+                --max-running-requests 512 \
+                --scheduler-recv-interval 10 \
+                --enable-flashinfer-allreduce-fusion \
+                --enable-symm-mem \
+                --moe-dense-tp-size 1 \
+                --prefill-round-robin-balance
+
+    Prefill:
+      envFromSecret: hf-token-secret
+      livenessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: system
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
+        periodSeconds: 60
+        failureThreshold: 60
+      dynamoNamespace: yijiaj-test
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "130"
+          memory: "800Gi"
+        limits:
+          gpu: "4"
+        claims:
+          - name: compute-domain-channel
+      sharedMemory:
+        size: 80Gi
+      envs:
+      - name: LD_LIBRARY_PATH
+        value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64"
+      - name: UCX_TLS
+        value: "^tcp"
+      - name: GLOO_SOCKET_IFNAME
+        value: eth0
+      - name: TP_SOCKET_IFNAME
+        value: eth0
+
+
+      - name: PYTHONUNBUFFERED
+        value: "1"
+      - name: DYN_SKIP_SGLANG_LOG_FORMATTING
+        value: "1"
+      - name: SGLANG_ENABLE_JIT_DEEPGEMM
+        value: "false"
+      - name: SGLANG_ENABLE_FLASHINFER_GEMM
+        value: "1"
+      - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE
+        value: "100000"
+      - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT
+        value: "100000"
+      - name: SGLANG_DISAGGREGATION_WAITING_TIMEOUT
+        value: "100000"
+      - name: SGLANG_MOONCAKE_CUSTOM_MEM_POOL
+        value: "True"
+      - name: SGLANG_USE_MESSAGE_QUEUE_BROADCASTER
+        value: "0"
+      - name: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK
+        value: "1"
+      - name: MC_TE_METRIC
+        value: "true"
+      - name: MC_FORCE_MNNVL
+        value: "1"
+      - name: NCCL_MNNVL_ENABLE
+        value: "1"
+      - name: NCCL_CUMEM_ENABLE
+        value: "1"
+
+      extraPodMetadata:
+        annotations:
+          gke-gcsfuse/cpu-limit: "0"
+          gke-gcsfuse/ephemeral-storage-limit: "0"
+          gke-gcsfuse/memory-limit: "0"
+          gke-gcsfuse/volumes: "true"
+          networking.gke.io/default-interface: 'eth0'
+          networking.gke.io/interfaces: |
+            [
+              {"interfaceName":"eth0","network":"default"},
+              {"interfaceName":"eth2","network":"rdma-0"},
+              {"interfaceName":"eth3","network":"rdma-1"},
+              {"interfaceName":"eth4","network":"rdma-2"},
+              {"interfaceName":"eth5","network":"rdma-3"}
+            ]
+      extraPodSpec:
+        resourceClaims:
+          - name: compute-domain-channel
+            resourceClaimTemplateName: a4x-compute-domain-channel-test
+        affinity:
+          nodeAffinity:
+            requiredDuringSchedulingIgnoredDuringExecution:
+              nodeSelectorTerms:
+              - matchExpressions:
+                - key: kubernetes.io/arch
+                  operator: In
+                  values:
+                  - arm64
+        volumes:
+        - name: model-src
+          csi:
+            driver: gcsfuse.csi.storage.gke.io
+            volumeAttributes:
+              bucketName: yijiaj-test
+              mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1
+        - name: library-dir-host
+          hostPath:
+            path: /home/kubernetes/bin/nvidia
+        - name: gib
+          hostPath:
+            path: /home/kubernetes/bin/gib
+        mainContainer:
+          startupProbe:
+            failureThreshold: 1800
+            httpGet:
+              path: /live
+              port: system
+            periodSeconds: 10
+            timeoutSeconds: 5
+          securityContext:
+            privileged: true
+          stdin: true
+          tty: true
+          volumeMounts:
+          - mountPath: /data/model
+            name: model-src
+          - name: library-dir-host
+            mountPath: /usr/local/nvidia
+          - name: gib
+            mountPath: /usr/local/gib
+          image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout
+          workingDir: /sgl-workspace/dynamo/components/backends/sglang
+          command: ["/bin/bash", "-c"]
+          args:
+            - |
+              set -e
+
+              nvidia-smi
+              . /usr/local/gib/scripts/set_nccl_env.sh
+
+              echo "--- VERIFYING NCCL ENV VARS IN SHELL ---"
+              env | grep NCCL_
+              echo "--- END VERIFICATION ---"
+
+              exec python3 -m dynamo.sglang \
+                --enable-metrics \
+                --model-path /data/model/deepseek-ai/DeepSeek-R1 \
+                --served-model-name deepseek-ai/DeepSeek-R1 \
+                --disaggregation-bootstrap-port 30001 \
+                --disaggregation-mode prefill \
+                --host 0.0.0.0 \
+                --port 8000 \
+                --disable-radix-cache \
+                --tensor-parallel-size 4 \
+                --data-parallel-size 1 \
+                --expert-parallel-size 1 \
+                --trust-remote-code \
+                --kv-cache-dtype fp8_e4m3 \
+                --attention-backend trtllm_mla \
+                --quantization fp8 \
+                --moe-runner-backend flashinfer_trtllm \
+                --disable-radix-cache \
+                --watchdog-timeout 1000000 \
+                --context-length 9600 \
+                --mem-fraction-static 0.95 \
+                --max-total-tokens 32768 \
+                --chunked-prefill-size 24576 \
+                --cuda-graph-max-bs 512 \
+                --max-running-requests 512 \
+                --load-balance-method round_robin \
+                --scheduler-recv-interval 10 \
+                --enable-flashinfer-allreduce-fusion \
+                --moe-dense-tp-size 1
+
diff --git a/inference/a4x/disaggregated-serving/dynamo/values.yaml b/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml
similarity index 91%
rename from inference/a4x/disaggregated-serving/dynamo/values.yaml
rename to inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml
index 06954aae..d5a02dc6 100644
--- a/inference/a4x/disaggregated-serving/dynamo/values.yaml
+++ b/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml
@@ -15,7 +15,7 @@
 dynamo:
   namespace: dynamo-cloud
   releaseVersion: "0.7.0"
-  deploymentName: disagg10p8d
+  deploymentName: dynamo-disagg10p8d
   computeDomain:
     name: a4x-domain
     numNodes: 18
@@ -83,20 +83,20 @@ dynamo:
     - name: PYTHONUNBUFFERED
       value: "1"
     livenessProbe:
-      initialDelaySeconds: 3000
+      initialDelaySeconds: 600
       periodSeconds: 60
-      timeoutSeconds: 150
-      failureThreshold: 100
+      timeoutSeconds: 30
+      failureThreshold: 60
     readinessProbe:
-      initialDelaySeconds: 3000
+      initialDelaySeconds: 60
       periodSeconds: 60
-      timeoutSeconds: 300
-      failureThreshold: 100
+      timeoutSeconds: 30
+      failureThreshold: 60
     startupProbe:
       initialDelaySeconds: 3000
       periodSeconds: 60
-      timeoutSeconds: 600
-      failureThreshold: 3000
+      timeoutSeconds: 30
+      failureThreshold: 1800
   prefillWorker:
     image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout 
     nodeCount: 2
@@ -144,18 +144,18 @@ dynamo:
     livenessProbe:
       initialDelaySeconds: 3000
       periodSeconds: 60
-      timeoutSeconds: 150
-      failureThreshold: 100
+      timeoutSeconds: 30
+      failureThreshold: 60
     readinessProbe:
       initialDelaySeconds: 3000
       periodSeconds: 60
-      timeoutSeconds: 300
-      failureThreshold: 100
+      timeoutSeconds: 30
+      failureThreshold: 60
     startupProbe:
       initialDelaySeconds: 3000
       periodSeconds: 60
-      timeoutSeconds: 600
-      failureThreshold: 3000
+      timeoutSeconds: 30
+      failureThreshold: 1800
  
 
 secrets:
@@ -170,8 +170,6 @@ volumes:
   useGcs: true
   gcsfuse:
     bucketName: "yijiaj-test"
-    fileCacheCapacity: "500G"
-    cachePath: "/gcs-cache"
   ssdMountPath: "/ssd"
   gcsMounts:
     mountPath: "/data/model"
@@ -184,7 +182,6 @@ service:
 
 workload:
   model: deepseek-ai/DeepSeek-R1
-  gpus: 16
   framework: sglang
   configFile: serving-args.yaml
   configPath: /workload/configs
diff --git a/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml b/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml
new file mode 100644
index 00000000..bf992302
--- /dev/null
+++ b/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml
@@ -0,0 +1,203 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+dynamo:
+  namespace: yijiaj-test
+  releaseVersion: "0.7.0"
+  deploymentName: dynamo-disagg1p1d
+  computeDomain:
+    name: a4x-domain
+    numNodes: 2
+    resourceClaimTemplateName: a4x-channel
+  serviceAccountName:
+  frontend:
+    image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.0
+    replicas: 9
+    livenessProbe:
+      initialDelaySeconds: 3000
+      periodSeconds: 60
+      timeoutSeconds: 150
+      failureThreshold: 100
+    readinessProbe:
+      initialDelaySeconds: 3000
+      periodSeconds: 60
+      timeoutSeconds: 300
+      failureThreshold: 100
+  decodeWorker:
+    image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout 
+    nodeCount: 1
+    replicas: 1
+    envs:
+    - name: HF_TOKEN
+      valueFrom:
+        secretKeyRef:
+          name: hf-token-secret
+          key: HF_TOKEN
+    - name: HF_HUB_ENABLE_HF_TRANSFER
+      value: "1"
+    - name: LD_LIBRARY_PATH
+      value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64"
+    - name: GLOO_SOCKET_IFNAME
+      value: eth0
+    - name: TP_SOCKET_IFNAME
+      value: eth0
+    - name: PYTHONUNBUFFERED
+      value: "1"
+    - name: DYN_SKIP_SGLANG_LOG_FORMATTING
+      value: "1"
+    - name: SGLANG_ENABLE_JIT_DEEPGEMM
+      value: "false"
+    - name: SGLANG_ENABLE_FLASHINFER_GEMM
+      value: "1"
+    - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE
+      value: "100000"
+    - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT
+      value: "100000"
+    - name: SGLANG_DISAGGREGATION_WAITING_TIMEOUT
+      value: "100000"
+    - name: SGLANG_DECODE_BOOTSTRAP_TIMEOUT
+      value: "1000"
+    - name: SGLANG_HACK_SEQ_BOOTSTRAP_ROOM
+      value: "1"
+    - name: SGLANG_MOONCAKE_CUSTOM_MEM_POOL
+      value: "True"
+    - name: SGLANG_USE_MESSAGE_QUEUE_BROADCASTER
+      value: "0"
+    - name: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK
+      value: "1"
+    - name: MC_TE_METRIC
+      value: "true"
+    - name: MC_FORCE_MNNVL
+      value: "1"
+    - name: NCCL_MNNVL_ENABLE
+      value: "1"
+    - name: NCCL_CUMEM_ENABLE
+      value: "1"
+    livenessProbe:
+      initialDelaySeconds: 600
+      periodSeconds: 60
+      timeoutSeconds: 30
+      failureThreshold: 60
+    readinessProbe:
+      initialDelaySeconds: 60
+      periodSeconds: 60
+      timeoutSeconds: 30
+      failureThreshold: 60
+    startupProbe:
+      initialDelaySeconds: 3000
+      periodSeconds: 60
+      timeoutSeconds: 30
+      failureThreshold: 1800
+  prefillWorker:
+    image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout 
+    nodeCount: 1
+    replicas: 1
+    envs:
+      - name: HF_TOKEN
+        valueFrom:
+          secretKeyRef:
+            name: hf-token-secret
+            key: HF_TOKEN
+      - name: HF_HUB_ENABLE_HF_TRANSFER
+        value: "1"
+      - name: LD_LIBRARY_PATH
+        value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64"
+      - name: UCX_TLS
+        value: "^tcp"
+      - name: GLOO_SOCKET_IFNAME
+        value: eth0
+      - name: TP_SOCKET_IFNAME
+        value: eth0
+      - name: PYTHONUNBUFFERED
+        value: "1"
+      - name: DYN_SKIP_SGLANG_LOG_FORMATTING
+        value: "1"
+      - name: SGLANG_ENABLE_JIT_DEEPGEMM
+        value: "false"
+      - name: SGLANG_ENABLE_FLASHINFER_GEMM
+        value: "1"
+      - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE
+        value: "100000"
+      - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT
+        value: "100000"
+      - name: SGLANG_DISAGGREGATION_WAITING_TIMEOUT
+        value: "100000"
+      - name: SGLANG_MOONCAKE_CUSTOM_MEM_POOL
+        value: "True"
+      - name: SGLANG_USE_MESSAGE_QUEUE_BROADCASTER
+        value: "0"
+      - name: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK
+        value: "1"
+      - name: MC_TE_METRIC
+        value: "true"
+      - name: MC_FORCE_MNNVL
+        value: "1"
+      - name: NCCL_MNNVL_ENABLE
+        value: "1"
+      - name: NCCL_CUMEM_ENABLE
+        value: "1"
+    livenessProbe:
+      initialDelaySeconds: 3000
+      periodSeconds: 60
+      timeoutSeconds: 30
+      failureThreshold: 60
+    readinessProbe:
+      initialDelaySeconds: 3000
+      periodSeconds: 60
+      timeoutSeconds: 30
+      failureThreshold: 60
+    startupProbe:
+      initialDelaySeconds: 3000
+      periodSeconds: 60
+      timeoutSeconds: 30
+      failureThreshold: 1800
+ 
+
+secrets:
+  ngc:
+    secretName: nvcr-secret
+  huggingface:
+    secretName: hf-token-secret
+    secretData:
+      token: "hf_api_token"
+
+volumes:
+  useGcs: true
+  gcsfuse:
+    bucketName: "yijiaj-test"
+  ssdMountPath: "/ssd"
+  gcsMounts:
+    mountPath: "/data/model"
+
+service:
+  type: ClusterIP
+  ports:
+    frontend: 8000
+    worker: 9090
+
+workload:
+  model: deepseek-ai/DeepSeek-R1
+  framework: sglang
+  configFile: serving-args.yaml
+  configPath: /workload/configs
+
+network:
+  subnetworks: []
+  gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic-arm64:v1.0.7
+  ncclSettings:
+    - name: NCCL_DEBUG
+      value: "VERSION"
+
+quantizations:
+  - "fp8"
diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-decode.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-decode.yaml
similarity index 59%
rename from src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-decode.yaml
rename to src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-decode.yaml
index a2287217..ff0f3c47 100644
--- a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-decode.yaml
+++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-decode.yaml
@@ -12,34 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+enable-metrics: true
 served-model-name: deepseek-ai/DeepSeek-R1
-log-level: DEBUG
-tp: "8"
-dp-size: "8"
-decode-log-interval: "1"
-page-size: "1"
-enable-dp-attention: true
-trust-remote-code: true
-disaggregation-mode: decode
-disaggregation-transfer-backend: nixl
 disaggregation-bootstrap-port: "30001"
+disaggregation-mode: decode
 host: "0.0.0.0"
 port: "9090"
-max-running-requests: "36864"
-context-length: "2716"
 disable-radix-cache: true
-moe-a2a-backend: deepep
-prefill-round-robin-balance: true
-deepep-mode: normal
-moe-dense-tp-size: "1"
-enable-dp-lm-head: true
-disable-cuda-graph: true
-cuda-graph-max-bs: "256"
-disable-shared-experts-fusion: true
-ep-num-redundant-experts: "32"
-ep-dispatch-algorithm: static
-eplb-algorithm: deepseek
-attention-backend: cutlass_mla
+tensor-parallel-size: 4
+data-parallel-size: 1
+expert-parallel-size: 1
+trust-remote-code: true
+kv-cache-dtype: "fp8_e4m3"
+attention-backend: "trtllm_mla"
+quantization: "fp8"
+moe-runner-backend: "flashinfer_trtllm"
 watchdog-timeout: "1000000"
-chunked-prefill-size: "36864"
-mem-fraction-static: "0.8"
+context-length: "9600"
+mem-fraction-static: "0.95"
+chunked-prefill-size: "8192"
+cuda-graph-max-bs: "512"
+max-running-requests: "512"
+scheduler-recv-interval: "10"
+enable-flashinfer-allreduce-fusion: true
+enable-symm-mem: true
+moe-dense-tp-size: "1"
+prefill-round-robin-balance: true
\ No newline at end of file
diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-prefill.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-prefill.yaml
similarity index 62%
rename from src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-prefill.yaml
rename to src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-prefill.yaml
index f2abbcd4..e42cb117 100644
--- a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-prefill.yaml
+++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-prefill.yaml
@@ -12,34 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+enable-metrics: true
 served-model-name: deepseek-ai/DeepSeek-R1
-log-level: DEBUG
-tp: "8"
-dp-size: "8"
-trust-remote-code: true
-decode-log-interval: "1"
-page-size: "1"
-enable-dp-attention: true
-disaggregation-mode: prefill
-disaggregation-transfer-backend: nixl
 disaggregation-bootstrap-port: "30001"
+disaggregation-mode: prefill
 host: "0.0.0.0"
 port: "9090"
-max-running-requests: "6144"
-context-length: "2716"
 disable-radix-cache: true
-moe-a2a-backend: deepep
-load-balance-method: round_robin
-deepep-mode: normal
-moe-dense-tp-size: "1"
-enable-dp-lm-head: true
-disable-shared-experts-fusion: true
-ep-num-redundant-experts: "32"
-ep-dispatch-algorithm: static
-eplb-algorithm: deepseek
-attention-backend: cutlass_mla
+tensor-parallel-size: 4
+data-parallel-size: 1
+expert-parallel-size: 1
+trust-remote-code: true
+kv-cache-dtype: "fp8_e4m3"
+attention-backend: "trtllm_mla"
+quantization: "fp8"
+moe-runner-backend: "flashinfer_trtllm"
 watchdog-timeout: "1000000"
-disable-cuda-graph: true
-chunked-prefill-size: "16384"
+context-length: "9600"
+mem-fraction-static: "0.95"
 max-total-tokens: "32768"
-mem-fraction-static: "0.8"
+chunked-prefill-size: "24576"
+cuda-graph-max-bs: "512"
+max-running-requests: "512"
+load-balance-method: round_robin
+scheduler-recv-interval: "10"
+enable-flashinfer-allreduce-fusion: true
+moe-dense-tp-size: "1"
\ No newline at end of file
diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml
index 0ac6cdf5..355db26a 100644
--- a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml
+++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml
@@ -29,10 +29,10 @@ spec:
       resources:
         requests:
           cpu: "5"
-          memory: "10Gi"
+          memory: "50Gi"
         limits:
           cpu: "5"
-          memory: "10Gi"
+          memory: "50Gi"
       extraPodMetadata:
         annotations:
           {{- if eq .Values.volumes.useGcs true }}
@@ -80,8 +80,10 @@ spec:
               ephemeral-storage: "30Gi"
 
     Decode:
+      {{- if gt (int .Values.dynamo.decodeWorker.nodeCount) 1 }}
       multinode:
         nodeCount:  {{ .Values.dynamo.decodeWorker.nodeCount }}
+      {{- end }}
       dynamoNamespace: {{ .Values.dynamo.namespace }}
       envFromSecret: {{ .Values.secrets.huggingface.secretName }}
       componentType: worker
@@ -245,8 +247,10 @@ spec:
               path: {{ .Values.workload.configFile | default "serving-args.yaml" }}
 
     Prefill:
+      {{- if gt (int .Values.dynamo.prefillWorker.nodeCount) 1 }}
       multinode:
           nodeCount: {{ .Values.dynamo.prefillWorker.nodeCount }}
+      {{- end }}
       dynamoNamespace: {{ .Values.dynamo.namespace }}
       envFromSecret: {{ .Values.secrets.huggingface.secretName }}
       componentType: worker
@@ -341,6 +345,10 @@ spec:
             set -e
             nvidia-smi
             . /usr/local/gib/scripts/set_nccl_env.sh
+
+            echo "--- VERIFYING NCCL ENV VARS IN SHELL ---"
+            env | grep NCCL_
+            echo "--- END VERIFICATION ---"
             pip install hf_transfer
 
             ARGS=()

From b2cb48336e3c732718f23547c621e2c092cefd3f Mon Sep 17 00:00:00 2001
From: Yijia J <yijiaj@google.com>
Date: Wed, 14 Jan 2026 20:18:28 +0000
Subject: [PATCH 08/12] update README, nit

---
 .../disaggregated-serving/dynamo/README.md    |  58 +--
 .../dynamo/bench_clint.yaml                   |  47 +++
 .../disaggregated-serving/dynamo/test.yaml    | 389 ------------------
 .../dynamo/values_deepep.yaml                 |   2 +-
 .../dynamo/values_wo_deepep.yaml              |   4 +-
 5 files changed, 66 insertions(+), 434 deletions(-)
 create mode 100644 inference/a4x/disaggregated-serving/dynamo/bench_clint.yaml
 delete mode 100644 inference/a4x/disaggregated-serving/dynamo/test.yaml

diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md
index df1d99e0..90849044 100644
--- a/inference/a4x/disaggregated-serving/dynamo/README.md
+++ b/inference/a4x/disaggregated-serving/dynamo/README.md
@@ -174,7 +174,7 @@ gcloud storage buckets add-iam-policy-binding ${GCS_BUCKET} \
     --role "roles/storage.objectViewer"
 ```
 
-Downloading model files into the gcs bucket.
+Downloading model files into the gcs bucket and set your gcs bucket name in values.yaml file.
 
 <a name="deploy-sglang"></a>
 ## 3. Deploy with SGLang Backend
@@ -223,51 +223,25 @@ $REPO_ROOT/src/helm-charts/a4x/inference-templates/dynamo-deployment
 ## 4. Inference Request
 [Back to Top](#table-of-contents)
 
-To make an inference request to test the server, we can first run a health check against the server using `curl`
+Check if the pods are in `Running` status before sending inference requests. 
 
 ```bash
-kubectl exec -it -n ${NAMESPACE} deployment/$USER-dynamo-a4x-multi-node -- curl http://localhost:8000/health | jq
+kubectl get pods -n ${NAMESPACE}
+```
+
+We can then deploy the benchmark clint and send benchark request.
+Deploy the benchmark clint like this:
+```bash
+kubectl apply -f bench_clint.yaml -n ${NAMESPACE}
+```
+
+And send the request like this: 
+
+```bash
+kubectl exec -it -n ${NAMESPACE} bench-client -- bash -c "cd /workspace/dynamo/examples/backends/sglang/slurm_jobs/scripts/vllm && python3 -u benchmark_serving.py     --host $USER-dynamo-a4x-1p1d-frontend   --port 8000     --model deepseek-ai/DeepSeek-R1     --tokenizer deepseek-ai/DeepSeek-R1     --backend 'dynamo'     --endpoint /v1/completions     --disable-tqdm     --dataset-name random     --num-prompts 2560     --random-input-len 8192     --random-output-len 1024     --random-range-ratio 0.8     --ignore-eos     --request-rate inf     --percentile-metrics ttft,tpot,itl,e2el     --max-concurrency 512"
 ```
 
-You should see a server status like this. Wait for it to be in a `healthy` state.
-
-```json
-{
-  "instances": [
-    {
-      "component": "backend",
-      "endpoint": "load_metrics",
-      "instance_id": 3994861215823793160,
-      "namespace": "dynamo",
-      "transport": {
-        "nats_tcp": "dynamo_backend.load_metrics-3770991c30298c08"
-      }
-    },
-    {
-      "component": "prefill",
-      "endpoint": "clear_kv_blocks",
-      "instance_id": 3994861215823793153,
-      "namespace": "dynamo",
-      "transport": {
-        "nats_tcp": "dynamo_prefill.clear_kv_blocks-3770991c30298c01"
-      }
-    },
-    {
-      "component": "prefill",
-      "endpoint": "generate",
-      "instance_id": 3994861215823793153,
-      "namespace": "dynamo",
-      "transport": {
-        "nats_tcp": "dynamo_prefill.generate-3770991c30298c01"
-      }
-    }
-  ],
-  "message": "No endpoints available",
-  "status": "unhealthy"
-}
-``` 
-
-Then we can send a benchmark request with like this:
+Or we can send a benchmark request to a frontend pod like this:
 
 ```bash
 kubectl exec -n ${NAMESPACE} $USER-dynamo-multi-node-serving-frontend -- python3 -u -m sglang.bench_serving    --backend sglang-oai-chat    --base-url http://localhost:8000    --model "deepseek-ai/DeepSeek-R1"    --tokenizer /data/model/deepseek-ai/DeepSeek-R1    --dataset-name random    --num-prompts 10240   --random-input-len 8192  --random-range-ratio 0.8   --random-output-len 1024   --max-concurrency 2048
diff --git a/inference/a4x/disaggregated-serving/dynamo/bench_clint.yaml b/inference/a4x/disaggregated-serving/dynamo/bench_clint.yaml
new file mode 100644
index 00000000..16a96971
--- /dev/null
+++ b/inference/a4x/disaggregated-serving/dynamo/bench_clint.yaml
@@ -0,0 +1,47 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Pod
+metadata:
+  name: bench-client
+  labels:
+    app: bench-client
+spec:
+  restartPolicy: Never
+  containers:
+  - name: benchmark
+    image: python:3.10
+    workingDir: /workspace
+    command: ["/bin/bash", "-c"]
+    # Setup script: runs ONCE when the pod starts (installs git + Python deps, clones the benchmark repo).
+    # It aborts on the first failed step (set -e); on success it sleeps forever so you can `kubectl exec` in.
+    args:
+    - |
+      set -e  # fail fast: never report "POD IS READY" after a failed install or clone
+      echo "--- STARTING SETUP ---"
+      # 1. Install Git
+      apt-get update && apt-get install -y git
+
+      # 2. Install Python Dependencies
+      pip install -q transformers aiohttp numpy requests tqdm pandas datasets Pillow
+
+      # 3. Clone the Repo (Specific Branch)
+      echo "Cloning repo..."
+      git clone --single-branch --branch ishan/sa-1.1-sgl-dsr1-fp8 https://github.com/ai-dynamo/dynamo.git /workspace/dynamo
+
+      echo "--- SETUP COMPLETE. POD IS READY. ---"
+
+      # 4. Keep the pod alive indefinitely
+      sleep infinity
diff --git a/inference/a4x/disaggregated-serving/dynamo/test.yaml b/inference/a4x/disaggregated-serving/dynamo/test.yaml
deleted file mode 100644
index 92e43f57..00000000
--- a/inference/a4x/disaggregated-serving/dynamo/test.yaml
+++ /dev/null
@@ -1,389 +0,0 @@
-apiVersion: resource.nvidia.com/v1beta1
-kind: ComputeDomain
-metadata:
-  name: a4x-compute-domain-test
-  namespace: yijiaj-test
-spec:
-  numNodes: 2
-  channel:
-    resourceClaimTemplate:
-      name: a4x-compute-domain-channel-test
----
-apiVersion: nvidia.com/v1alpha1
-kind: DynamoGraphDeployment
-metadata:
-  name: yijiaj-test-1p1d
-spec:
-  services:
-    Frontend:
-      dynamoNamespace: yijiaj-test
-      componentType: frontend
-      replicas: 9
-      resources:
-        requests:
-          cpu: "5"
-          memory: "50Gi"
-        limits:
-          cpu: "5"
-          memory: "50Gi"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.0
-          workingDir: /sgl-workspace/dynamo/components/backends/sglang
-          stdin: true
-          tty: true
-          command:
-            - /bin/sh
-            - -c
-          args:
-            - "python3 -m dynamo.frontend --http-port 8000"
-    Decode:
-      envFromSecret: hf-token-secret
-      livenessProbe:
-        httpGet:
-          path: /live
-          port: system
-        initialDelaySeconds: 600
-        periodSeconds: 30
-        timeoutSeconds: 15
-        failureThreshold: 5
-      readinessProbe:
-        httpGet:
-          path: /health
-          port: system
-        initialDelaySeconds: 60
-        timeoutSeconds: 30
-        periodSeconds: 60
-        failureThreshold: 60
-      dynamoNamespace: yijiaj-test
-      componentType: worker
-      replicas: 1
-      resources:
-        limits:
-          gpu: "4"
-        claims:
-          - name: compute-domain-channel
-      sharedMemory:
-        size: 80Gi
-      envs:
-      - name: LD_LIBRARY_PATH
-        value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64"
-      - name: GLOO_SOCKET_IFNAME
-        value: eth0
-      - name: TP_SOCKET_IFNAME
-        value: eth0
-
-      - name: PYTHONUNBUFFERED
-        value: "1"
-      - name: DYN_SKIP_SGLANG_LOG_FORMATTING
-        value: "1"
-      - name: SGLANG_ENABLE_JIT_DEEPGEMM
-        value: "false"
-      - name: SGLANG_ENABLE_FLASHINFER_GEMM
-        value: "1"
-      - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE
-        value: "100000"
-      - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT
-        value: "100000"
-      - name: SGLANG_DISAGGREGATION_WAITING_TIMEOUT
-        value: "100000"
-      - name: SGLANG_DECODE_BOOTSTRAP_TIMEOUT
-        value: "1000"
-      - name: SGLANG_HACK_SEQ_BOOTSTRAP_ROOM
-        value: "1"
-      - name: SGLANG_MOONCAKE_CUSTOM_MEM_POOL
-        value: "True"
-      - name: SGLANG_USE_MESSAGE_QUEUE_BROADCASTER
-        value: "0"
-      - name: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK
-        value: "1"
-      - name: MC_TE_METRIC
-        value: "true"
-      - name: MC_FORCE_MNNVL
-        value: "1"
-      - name: NCCL_MNNVL_ENABLE
-        value: "1"
-      - name: NCCL_CUMEM_ENABLE
-        value: "1"
-
-
-      extraPodMetadata:
-        annotations:
-          gke-gcsfuse/cpu-limit: "0"
-          gke-gcsfuse/ephemeral-storage-limit: "0"
-          gke-gcsfuse/memory-limit: "0"
-          gke-gcsfuse/volumes: "true"
-          networking.gke.io/default-interface: 'eth0'
-          networking.gke.io/interfaces: |
-            [
-              {"interfaceName":"eth0","network":"default"},
-              {"interfaceName":"eth2","network":"rdma-0"},
-              {"interfaceName":"eth3","network":"rdma-1"},
-              {"interfaceName":"eth4","network":"rdma-2"},
-              {"interfaceName":"eth5","network":"rdma-3"}
-            ]
-      extraPodSpec:
-        resourceClaims:
-          - name: compute-domain-channel
-            resourceClaimTemplateName: a4x-compute-domain-channel-test
-        affinity:
-          nodeAffinity:
-            requiredDuringSchedulingIgnoredDuringExecution:
-              nodeSelectorTerms:
-              - matchExpressions:
-                - key: kubernetes.io/arch
-                  operator: In
-                  values:
-                  - arm64
-        volumes:
-        - name: model-src
-          csi:
-            driver: gcsfuse.csi.storage.gke.io
-            volumeAttributes:
-              bucketName: yijiaj-test
-              mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1
-        - name: library-dir-host
-          hostPath:
-            path: /home/kubernetes/bin/nvidia
-        - name: gib
-          hostPath:
-            path: /home/kubernetes/bin/gib
-        mainContainer:
-          securityContext:
-            privileged: true
-          startupProbe:
-            failureThreshold: 1800
-            httpGet:
-              path: /live
-              port: system
-            periodSeconds: 10
-            timeoutSeconds: 5
-          image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout
-          workingDir: /sgl-workspace/dynamo/components/backends/sglang
-          command: ["/bin/bash", "-c"]
-          stdin: true
-          tty: true
-          volumeMounts:
-          - mountPath: /data/model
-            name: model-src
-          - name: library-dir-host
-            mountPath: /usr/local/nvidia
-          - name: gib
-            mountPath: /usr/local/gib
-          args:
-            - |
-              set -e
-
-              nvidia-smi
-              . /usr/local/gib/scripts/set_nccl_env.sh
-
-              echo "--- VERIFYING NCCL ENV VARS IN SHELL ---"
-              env | grep NCCL_
-              echo "--- END VERIFICATION ---"
-
-              exec python3 -m dynamo.sglang \
-                --enable-metrics \
-                --model-path /data/model/deepseek-ai/DeepSeek-R1 \
-                --served-model-name deepseek-ai/DeepSeek-R1 \
-                --disaggregation-bootstrap-port 30001 \
-                --disaggregation-mode decode \
-                --host 0.0.0.0 \
-                --port 8000 \
-                --disable-radix-cache \
-                --tensor-parallel-size 4 \
-                --data-parallel-size 1 \
-                --expert-parallel-size 1 \
-                --trust-remote-code \
-                --kv-cache-dtype fp8_e4m3 \
-                --attention-backend trtllm_mla \
-                --quantization fp8 \
-                --moe-runner-backend flashinfer_trtllm \
-                --disable-radix-cache \
-                --watchdog-timeout 1000000 \
-                --context-length 9600 \
-                --mem-fraction-static 0.95 \
-                --chunked-prefill-size 8192 \
-                --cuda-graph-max-bs 512 \
-                --max-running-requests 512 \
-                --scheduler-recv-interval 10 \
-                --enable-flashinfer-allreduce-fusion \
-                --enable-symm-mem \
-                --moe-dense-tp-size 1 \
-                --prefill-round-robin-balance
-
-    Prefill:
-      envFromSecret: hf-token-secret
-      livenessProbe:
-        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        periodSeconds: 60
-        timeoutSeconds: 30
-        failureThreshold: 10
-      readinessProbe:
-        httpGet:
-          path: /health
-          port: system
-        initialDelaySeconds: 60
-        timeoutSeconds: 30
-        periodSeconds: 60
-        failureThreshold: 60
-      dynamoNamespace: yijiaj-test
-      componentType: worker
-      replicas: 1
-      resources:
-        requests:
-          cpu: "130"
-          memory: "800Gi"
-        limits:
-          gpu: "4"
-        claims:
-          - name: compute-domain-channel
-      sharedMemory:
-        size: 80Gi
-      envs:
-      - name: LD_LIBRARY_PATH
-        value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64"
-      - name: UCX_TLS
-        value: "^tcp"
-      - name: GLOO_SOCKET_IFNAME
-        value: eth0
-      - name: TP_SOCKET_IFNAME
-        value: eth0
-
-
-      - name: PYTHONUNBUFFERED
-        value: "1"
-      - name: DYN_SKIP_SGLANG_LOG_FORMATTING
-        value: "1"
-      - name: SGLANG_ENABLE_JIT_DEEPGEMM
-        value: "false"
-      - name: SGLANG_ENABLE_FLASHINFER_GEMM
-        value: "1"
-      - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE
-        value: "100000"
-      - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT
-        value: "100000"
-      - name: SGLANG_DISAGGREGATION_WAITING_TIMEOUT
-        value: "100000"
-      - name: SGLANG_MOONCAKE_CUSTOM_MEM_POOL
-        value: "True"
-      - name: SGLANG_USE_MESSAGE_QUEUE_BROADCASTER
-        value: "0"
-      - name: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK
-        value: "1"
-      - name: MC_TE_METRIC
-        value: "true"
-      - name: MC_FORCE_MNNVL
-        value: "1"
-      - name: NCCL_MNNVL_ENABLE
-        value: "1"
-      - name: NCCL_CUMEM_ENABLE
-        value: "1"
-
-      extraPodMetadata:
-        annotations:
-          gke-gcsfuse/cpu-limit: "0"
-          gke-gcsfuse/ephemeral-storage-limit: "0"
-          gke-gcsfuse/memory-limit: "0"
-          gke-gcsfuse/volumes: "true"
-          networking.gke.io/default-interface: 'eth0'
-          networking.gke.io/interfaces: |
-            [
-              {"interfaceName":"eth0","network":"default"},
-              {"interfaceName":"eth2","network":"rdma-0"},
-              {"interfaceName":"eth3","network":"rdma-1"},
-              {"interfaceName":"eth4","network":"rdma-2"},
-              {"interfaceName":"eth5","network":"rdma-3"}
-            ]
-      extraPodSpec:
-        resourceClaims:
-          - name: compute-domain-channel
-            resourceClaimTemplateName: a4x-compute-domain-channel-test
-        affinity:
-          nodeAffinity:
-            requiredDuringSchedulingIgnoredDuringExecution:
-              nodeSelectorTerms:
-              - matchExpressions:
-                - key: kubernetes.io/arch
-                  operator: In
-                  values:
-                  - arm64
-        volumes:
-        - name: model-src
-          csi:
-            driver: gcsfuse.csi.storage.gke.io
-            volumeAttributes:
-              bucketName: yijiaj-test
-              mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1
-        - name: library-dir-host
-          hostPath:
-            path: /home/kubernetes/bin/nvidia
-        - name: gib
-          hostPath:
-            path: /home/kubernetes/bin/gib
-        mainContainer:
-          startupProbe:
-            failureThreshold: 1800
-            httpGet:
-              path: /live
-              port: system
-            periodSeconds: 10
-            timeoutSeconds: 5
-          securityContext:
-            privileged: true
-          stdin: true
-          tty: true
-          volumeMounts:
-          - mountPath: /data/model
-            name: model-src
-          - name: library-dir-host
-            mountPath: /usr/local/nvidia
-          - name: gib
-            mountPath: /usr/local/gib
-          image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout
-          workingDir: /sgl-workspace/dynamo/components/backends/sglang
-          command: ["/bin/bash", "-c"]
-          args:
-            - |
-              set -e
-
-              nvidia-smi
-              . /usr/local/gib/scripts/set_nccl_env.sh
-
-              echo "--- VERIFYING NCCL ENV VARS IN SHELL ---"
-              env | grep NCCL_
-              echo "--- END VERIFICATION ---"
-
-              exec python3 -m dynamo.sglang \
-                --enable-metrics \
-                --model-path /data/model/deepseek-ai/DeepSeek-R1 \
-                --served-model-name deepseek-ai/DeepSeek-R1 \
-                --disaggregation-bootstrap-port 30001 \
-                --disaggregation-mode prefill \
-                --host 0.0.0.0 \
-                --port 8000 \
-                --disable-radix-cache \
-                --tensor-parallel-size 4 \
-                --data-parallel-size 1 \
-                --expert-parallel-size 1 \
-                --trust-remote-code \
-                --kv-cache-dtype fp8_e4m3 \
-                --attention-backend trtllm_mla \
-                --quantization fp8 \
-                --moe-runner-backend flashinfer_trtllm \
-                --disable-radix-cache \
-                --watchdog-timeout 1000000 \
-                --context-length 9600 \
-                --mem-fraction-static 0.95 \
-                --max-total-tokens 32768 \
-                --chunked-prefill-size 24576 \
-                --cuda-graph-max-bs 512 \
-                --max-running-requests 512 \
-                --load-balance-method round_robin \
-                --scheduler-recv-interval 10 \
-                --enable-flashinfer-allreduce-fusion \
-                --moe-dense-tp-size 1
-
diff --git a/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml b/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml
index d5a02dc6..f853ab74 100644
--- a/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml
+++ b/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml
@@ -169,7 +169,7 @@ secrets:
 volumes:
   useGcs: true
   gcsfuse:
-    bucketName: "yijiaj-test"
+    bucketName: your-gcs-bucket-name
   ssdMountPath: "/ssd"
   gcsMounts:
     mountPath: "/data/model"
diff --git a/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml b/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml
index bf992302..9a6eb78e 100644
--- a/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml
+++ b/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 dynamo:
-  namespace: yijiaj-test
+  namespace: dynamo-cloud
   releaseVersion: "0.7.0"
   deploymentName: dynamo-disagg1p1d
   computeDomain:
@@ -175,7 +175,7 @@ secrets:
 volumes:
   useGcs: true
   gcsfuse:
-    bucketName: "yijiaj-test"
+    bucketName: your-gcs-bucket-name
   ssdMountPath: "/ssd"
   gcsMounts:
     mountPath: "/data/model"

From 2b824f82ecc843618a646e6c5c6e0fbe41dcdf57 Mon Sep 17 00:00:00 2001
From: Yijia J <yijiaj@google.com>
Date: Thu, 15 Jan 2026 05:27:14 +0000
Subject: [PATCH 09/12] nit

---
 inference/a4x/disaggregated-serving/dynamo/README.md      | 8 ++++----
 .../dynamo/{bench_clint.yaml => bench_client.yaml}        | 0
 2 files changed, 4 insertions(+), 4 deletions(-)
 rename inference/a4x/disaggregated-serving/dynamo/{bench_clint.yaml => bench_client.yaml} (100%)

diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md
index 90849044..aace9f01 100644
--- a/inference/a4x/disaggregated-serving/dynamo/README.md
+++ b/inference/a4x/disaggregated-serving/dynamo/README.md
@@ -2,7 +2,7 @@
 
 This document outlines the steps to deploy and serve Large Language Models (LLMs) using [NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo) disaggregated inference platform on [A4X GKE Node pools](https://cloud.google.com/kubernetes-engine).
 
-Dynamo provides a disaggregated architecture that separates prefill and decode operations for optimized inference performance, supporting both single-node (8 GPUs) and multi-node (16 GPUs) configurations. Dynamo also supports various inference framework backends like [vLLM](https://docs.nvidia.com/dynamo/latest/components/backends/vllm/README.html) and [SGLang](https://docs.nvidia.com/dynamo/latest/components/backends/sglang/README.html). In this recipe, we will focus on serving using the SGLang backend. 
+Dynamo provides a disaggregated architecture that separates prefill and decode operations for optimized inference performance, supporting both single-node (4 GPUs) and multi-node NVL72 (72 GPUs) configurations. Dynamo also supports various inference framework backends like [vLLM](https://docs.nvidia.com/dynamo/latest/components/backends/vllm/README.html) and [SGLang](https://docs.nvidia.com/dynamo/latest/components/backends/sglang/README.html). In this recipe, we will focus on serving using the SGLang backend. 
 
 <a name="table-of-contents"></a>
 ## Table of Contents
@@ -229,10 +229,10 @@ Check if the pods are in `Running` status before sending inference requests.
 kubectl get pods -n ${NAMESPACE}
 ```
 
-We can then deploy the benchmark clint and send benchark request.
-Deploy the benchmark clint like this:
+We can then deploy the benchmark client and send a benchmark request.
+Deploy the benchmark client like this:
 ```bash
-kubectl apply -f bench_clint.yaml -n ${NAMESPACE}
+kubectl apply -f bench_client.yaml -n ${NAMESPACE}
 ```
 
 And send the request like this: 
diff --git a/inference/a4x/disaggregated-serving/dynamo/bench_clint.yaml b/inference/a4x/disaggregated-serving/dynamo/bench_client.yaml
similarity index 100%
rename from inference/a4x/disaggregated-serving/dynamo/bench_clint.yaml
rename to inference/a4x/disaggregated-serving/dynamo/bench_client.yaml

From ecf8087da9fd95e93200e576313cbd0d637ec939 Mon Sep 17 00:00:00 2001
From: Yijia J <yijiaj@google.com>
Date: Thu, 15 Jan 2026 05:36:09 +0000
Subject: [PATCH 10/12] readme

---
 inference/a4x/disaggregated-serving/dynamo/README.md | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md
index aace9f01..008f1262 100644
--- a/inference/a4x/disaggregated-serving/dynamo/README.md
+++ b/inference/a4x/disaggregated-serving/dynamo/README.md
@@ -160,9 +160,14 @@ Find the service account (usually annotated to default):
 kubectl get serviceaccounts ${NAMESPACE} -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.annotations.iam\.gke\.io/gcp-service-account}{"\n"}{end}'
 ```
 
+Store the service account email in an environment variable:
+```bash
+export SERVICE_ACCOUNT_EMAIL=$(kubectl get serviceaccount/default -n ${NAMESPACE} -o jsonpath='{.metadata.annotations.iam\.gke\.io/gcp-service-account}')
+```
+
 Authorize the service account:
 ```bash
-gcloud iam service-accounts add-iam-policy-binding xxx@project_id.iam.gserviceaccount.com \
+gcloud iam service-accounts add-iam-policy-binding ${SERVICE_ACCOUNT_EMAIL} \
     --role roles/iam.workloadIdentityUser \
     --member "serviceAccount:project_id.svc.id.goog[${NAMESPACE}/default]"
 ```
@@ -170,7 +175,7 @@ gcloud iam service-accounts add-iam-policy-binding xxx@project_id.iam.gserviceac
 Grant read access to the bucket:
 ```bash
 gcloud storage buckets add-iam-policy-binding ${GCS_BUCKET} \
-    --member "serviceAccount:xxx@project_id.iam.gserviceaccount.com" \
+    --member "serviceAccount:${SERVICE_ACCOUNT_EMAIL}" \
     --role "roles/storage.objectViewer"
 ```
 

From 5b003c8ac5f6067e284ddef66d2eecbd4beee575 Mon Sep 17 00:00:00 2001
From: Yijia J <yijiaj@google.com>
Date: Fri, 16 Jan 2026 01:18:08 +0000
Subject: [PATCH 11/12] README, update image path

---
 .../disaggregated-serving/dynamo/README.md    | 21 ++++++++++++++++++-
 .../dynamo/values_deepep.yaml                 |  3 +--
 .../dynamo/values_wo_deepep.yaml              |  7 +++----
 .../templates/dynamo-graph-deployment.yaml    |  4 ++--
 4 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md
index 008f1262..59f937ab 100644
--- a/inference/a4x/disaggregated-serving/dynamo/README.md
+++ b/inference/a4x/disaggregated-serving/dynamo/README.md
@@ -15,6 +15,7 @@ Dynamo provides a disaggregated architecture that separates prefill and decode o
   * [2.4. Create Secrets](#create-secrets)
   * [2.5. Install Dynamo Platform](#install-platform)
   * [2.6. Setup GCS Bucket for GKE ](#setup-gcsfuse)
+  * [2.7. Build Dynamo Image ](#build-dynamo-image)
 * [3. Deploy with SGLang Backend](#deploy-sglang)
   * [3.1. SGLang Deployment without DeepEP(8 GPUs)](#sglang-wo-deepep)
   * [3.2. SGLang Deployment with DeepEP(72 GPUs)](#sglang-deepep)
@@ -181,6 +182,21 @@ gcloud storage buckets add-iam-policy-binding ${GCS_BUCKET} \
 
 Downloading model files into the gcs bucket and set your gcs bucket name in values.yaml file.
 
+<a name="build-dynamo-image"></a>
+### 2.7. Build Dynamo Image
+
+Follow the [Dynamo container guide](https://github.com/ai-dynamo/dynamo/blob/main/container/README.md) to build the image, then push it to your artifact registry. 
+
+Build the image like this:
+```bash
+docker build -f container/Dockerfile.sglang . -t dynamo-wideep --no-cache --build-arg DYNAMO_VERSION=0.7.0 --platform linux/arm64
+```
+
+Set `ARTIFACT_REGISTRY` to the full path of the image you pushed (it is passed to Helm as `workload.image`):
+```bash
+export ARTIFACT_REGISTRY=<YOUR_IMAGE_ARTIFACT_REGISTRY>
+```
+
 <a name="deploy-sglang"></a>
 ## 3. Deploy with SGLang Backend
 
@@ -200,6 +216,7 @@ Deploy DeepSeekR1-671B across 2 nodes for testing and validation.  Note the use
 ```bash
 cd $RECIPE_ROOT
 helm install -f values_wo_deepep.yaml \
+--set workload.image=${ARTIFACT_REGISTRY} \
 --set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-prefill.yaml \
 --set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-decode.yaml \
 $USER-dynamo-a4x-1p1d \
@@ -217,7 +234,8 @@ Deploy DeepSeekR1-671B across 18 nodes for production workloads. Note the use of
 
 ```bash
 cd $RECIPE_ROOT
-helm install -f values.yaml \
+helm install -f values_deepep.yaml \
+--set workload.image=${ARTIFACT_REGISTRY} \
 --set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml \
 --set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml \
 $USER-dynamo-a4x-multi-node \
@@ -284,6 +302,7 @@ Common issues:
 * **Pods stuck in Pending**: Check if nodes have sufficient resources (especially for multi-node deployments)
 * **Model download slow**: Large models like DeepSeekR1 671B can take 30 minutes to download
 * **Multi-node issues**: Verify network connectivity between nodes and proper subnet configuration
+* **DeepEP timeout issue**: Recompile DeepEP with patched `NUM_CPU_TIMEOUT_SECS` and `NUM_TIMEOUT_CYCLES` values in `csrc/kernels/configs.cuh` during the image build.
 
 <a name="cleanup"></a>
 ## 6. Cleanup
diff --git a/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml b/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml
index f853ab74..194b2b2b 100644
--- a/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml
+++ b/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml
@@ -35,7 +35,6 @@ dynamo:
       timeoutSeconds: 300
       failureThreshold: 100
   decodeWorker:
-    image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout 
     nodeCount: 8
     replicas: 1
     envs:
@@ -98,7 +97,6 @@ dynamo:
       timeoutSeconds: 30
       failureThreshold: 1800
   prefillWorker:
-    image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout 
     nodeCount: 2
     replicas: 5
     envs:
@@ -182,6 +180,7 @@ service:
 
 workload:
   model: deepseek-ai/DeepSeek-R1
+  image:
   framework: sglang
   configFile: serving-args.yaml
   configPath: /workload/configs
diff --git a/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml b/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml
index 9a6eb78e..ab506a4e 100644
--- a/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml
+++ b/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 dynamo:
-  namespace: dynamo-cloud
+  namespace: yijiaj-test
   releaseVersion: "0.7.0"
   deploymentName: dynamo-disagg1p1d
   computeDomain:
@@ -35,7 +35,6 @@ dynamo:
       timeoutSeconds: 300
       failureThreshold: 100
   decodeWorker:
-    image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout 
     nodeCount: 1
     replicas: 1
     envs:
@@ -100,7 +99,6 @@ dynamo:
       timeoutSeconds: 30
       failureThreshold: 1800
   prefillWorker:
-    image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout 
     nodeCount: 1
     replicas: 1
     envs:
@@ -175,7 +173,7 @@ secrets:
 volumes:
   useGcs: true
   gcsfuse:
-    bucketName: your-gcs-bucket-name
+    bucketName: yijiaj-test
   ssdMountPath: "/ssd"
   gcsMounts:
     mountPath: "/data/model"
@@ -188,6 +186,7 @@ service:
 
 workload:
   model: deepseek-ai/DeepSeek-R1
+  image:
   framework: sglang
   configFile: serving-args.yaml
   configPath: /workload/configs
diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml
index 355db26a..8002e43a 100644
--- a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml
+++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml
@@ -158,7 +158,7 @@ spec:
         mainContainer:
           securityContext:
               privileged: true
-          image: {{ .Values.dynamo.decodeWorker.image }}
+          image: {{ .Values.workload.image }}
           workingDir: /sgl-workspace/dynamo/components/backends/sglang
           startupProbe:
             failureThreshold: {{ .Values.dynamo.decodeWorker.startupProbe.failureThreshold }}
@@ -329,7 +329,7 @@ spec:
               privileged: true
           stdin: true
           tty: true
-          image: {{ .Values.dynamo.prefillWorker.image }}
+          image: {{ .Values.workload.image }}
           workingDir: /sgl-workspace/dynamo/components/backends/sglang
           startupProbe:
             failureThreshold: {{ .Values.dynamo.prefillWorker.startupProbe.failureThreshold }}

From ea2c3b37cd340337f238ee53ac9e86d5a6208e2d Mon Sep 17 00:00:00 2001
From: Yijia J <yijiaj@google.com>
Date: Fri, 16 Jan 2026 01:39:09 +0000
Subject: [PATCH 12/12] nit

---
 inference/a4x/disaggregated-serving/dynamo/README.md          | 2 ++
 inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml | 2 +-
 .../a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml    | 4 ++--
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md
index 59f937ab..c53cdba0 100644
--- a/inference/a4x/disaggregated-serving/dynamo/README.md
+++ b/inference/a4x/disaggregated-serving/dynamo/README.md
@@ -217,6 +217,7 @@ Deploy DeepSeekR1-671B across 2 nodes for testing and validation.  Note the use
 cd $RECIPE_ROOT
 helm install -f values_wo_deepep.yaml \
 --set workload.image=${ARTIFACT_REGISTRY} \
+--set volumes.gcsfuse.bucketName=${GCS_BUCKET} \
 --set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-prefill.yaml \
 --set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-decode.yaml \
 $USER-dynamo-a4x-1p1d \
@@ -236,6 +237,7 @@ Deploy DeepSeekR1-671B across 18 nodes for production workloads. Note the use of
 cd $RECIPE_ROOT
 helm install -f values_deepep.yaml \
 --set workload.image=${ARTIFACT_REGISTRY} \
+--set volumes.gcsfuse.bucketName=${GCS_BUCKET} \
 --set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml \
 --set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml \
 $USER-dynamo-a4x-multi-node \
diff --git a/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml b/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml
index 194b2b2b..a68f203a 100644
--- a/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml
+++ b/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml
@@ -167,7 +167,7 @@ secrets:
 volumes:
   useGcs: true
   gcsfuse:
-    bucketName: your-gcs-bucket-name
+    bucketName:  # intentionally empty; pass --set volumes.gcsfuse.bucketName=<bucket> at install (see README)
   ssdMountPath: "/ssd"
   gcsMounts:
     mountPath: "/data/model"
diff --git a/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml b/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml
index ab506a4e..5308d69f 100644
--- a/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml
+++ b/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 dynamo:
-  namespace: yijiaj-test
+  namespace: dynamo-cloud
   releaseVersion: "0.7.0"
   deploymentName: dynamo-disagg1p1d
   computeDomain:
@@ -173,7 +173,7 @@ secrets:
 volumes:
   useGcs: true
   gcsfuse:
-    bucketName: yijiaj-test
+    bucketName:  # intentionally empty; pass --set volumes.gcsfuse.bucketName=<bucket> at install (see README)
   ssdMountPath: "/ssd"
   gcsMounts:
     mountPath: "/data/model"