From 4d56166a8364aa5bf1eef70d2aa1419971110a60 Mon Sep 17 00:00:00 2001 From: Yijia J Date: Thu, 25 Dec 2025 00:26:14 +0000 Subject: [PATCH 01/12] initial commit for a4x dynamo deepseek-fp8 2p2d recipe --- .../disaggregated-serving/dynamo/README.md | 28 ++ .../disaggregated-serving/dynamo/values.yaml | 197 ++++++++ .../deepseekr1-fp8-multi-node-decode.yaml | 46 ++ .../deepseekr1-fp8-multi-node-prefill.yaml | 46 ++ .../dynamo-deployment/Chart.yaml | 20 + .../templates/dynamo-compute-domain.yaml | 24 + .../templates/dynamo-graph-deployment.yaml | 470 ++++++++++++++++++ .../templates/dynamo-launcher-configmap.yaml | 28 ++ .../templates/dynamo-worker-configmap.yaml | 35 ++ 9 files changed, 894 insertions(+) create mode 100644 inference/a4x/disaggregated-serving/dynamo/README.md create mode 100644 inference/a4x/disaggregated-serving/dynamo/values.yaml create mode 100644 src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml create mode 100644 src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml create mode 100644 src/helm-charts/a4x/inference-templates/dynamo-deployment/Chart.yaml create mode 100644 src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-compute-domain.yaml create mode 100644 src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml create mode 100644 src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-launcher-configmap.yaml create mode 100644 src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-worker-configmap.yaml diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md new file mode 100644 index 00000000..ec499d10 --- /dev/null +++ b/inference/a4x/disaggregated-serving/dynamo/README.md @@ -0,0 +1,28 @@ +# Disaggregated Multi-Node Dynamo Recipe for A4x + +This recipe runs a disaggregated multi-node Dynamo deployment on A4x. + +## Setup + +1. 
**Set Environment Variables** + + ```bash + export REPO_ROOT=$(git rev-parse --show-toplevel) + export RELEASE_VERSION="24.05" + export USER=$(whoami) + ``` + +2. **Run the Recipe** + + ```bash + helm install -f values.yaml \ + --set-file workload_launcher=$REPO_ROOT/src/launchers/dynamo-vllm-launcher.sh \ + --set-file serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/llama-3.3-70b-multi-node.yaml \ + --set workload.framework=vllm \ + --set workload.model.name=meta-llama/Llama-3.3-70B-Instruct \ + --set workload.image=nvcr.io/nvidia/ai-dynamo/vllm-runtime:${RELEASE_VERSION} \ + --set workload.gpus=16 \ + $USER-dynamo-multi-node-serving-a4x \ + $REPO_ROOT/src/helm-charts/a4x/inference-templates/dynamo-deployment + ``` + diff --git a/inference/a4x/disaggregated-serving/dynamo/values.yaml b/inference/a4x/disaggregated-serving/dynamo/values.yaml new file mode 100644 index 00000000..b49162bc --- /dev/null +++ b/inference/a4x/disaggregated-serving/dynamo/values.yaml @@ -0,0 +1,197 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +dynamo: + namespace: dynamo-cloud + releaseVersion: "0.7.0" + deploymentName: + computeDomain: + name: yijiaj-a4x-domain + numNodes: 4 + resourceClaimTemplateName: yijiaj-a4x-channel + frontend: + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1 + replicas: 1 + livenessProbe: + initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 150 + failureThreshold: 100 + readinessProbe: + initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 300 + failureThreshold: 100 + decodeWorker: + image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1 + nodeCount: 2 + replicas: 1 + envs: + - name: LD_LIBRARY_PATH + value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64" + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: TP_SOCKET_IFNAME + value: eth0 + - name: SGLANG_ENABLE_JIT_DEEPGEMM + value: "1" + - name: DYN_SKIP_SGLANG_LOG_FORMATTING + value: "1" + - name: MC_TE_METRIC + value: "true" + - name: SGLANG_ENABLE_FLASHINFER_GEMM + value: "1" + - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE + value: "100000" + - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT + value: "100000" + - name: SGLANG_DISAGGREGATION_WAITING_TIMEOUT + value: "100000" + - name: SGLANG_DECODE_BOOTSTRAP_TIMEOUT + value: "1000" + - name: SGLANG_HACK_SEQ_BOOTSTRAP_ROOM + value: "1" + - name: SGLANG_MOONCAKE_CUSTOM_MEM_POOL + value: "True" + - name: MC_FORCE_MNNVL + value: "1" + - name: NCCL_MNNVL_ENABLE + value: "1" + - name: NCCL_CUMEM_ENABLE + value: "1" + - name: SGLANG_USE_MESSAGE_QUEUE_BROADCASTER + value: "0" + - name: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK + value: "1" + - name: PYTHONUNBUFFERED + value: "1" + - name: NCCL_DEBUG + value: INFO + - name: NCCL_DEBUG_SUBSYS + value: INIT,BOOTSTRAP,ENV,NET,GRAPH + - name: NCCL_SOCKET_FAMILY + value: "AF_INET" + - name: GLOO_SOCKET_FAMILY + value: "AF_INET" + livenessProbe: + 
initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 150 + failureThreshold: 100 + readinessProbe: + initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 300 + failureThreshold: 100 + startupProbe: + initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 600 + failureThreshold: 3000 + prefillWorker: + image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1 + nodeCount: 2 + replicas: 1 + envs: + - name: LD_LIBRARY_PATH + value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64" + - name: UCX_TLS + value: "^tcp" + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: TP_SOCKET_IFNAME + value: eth0 + - name: SGLANG_ENABLE_JIT_DEEPGEMM + value: "1" + - name: DYN_SKIP_SGLANG_LOG_FORMATTING + value: "1" + - name: MC_TE_METRIC + value: "true" + - name: SGLANG_ENABLE_FLASHINFER_GEMM + value: "1" + - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE + value: "100000" + - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT + value: "100000" + - name: SGLANG_DISAGGREGATION_WAITING_TIMEOUT + value: "100000" + - name: SGLANG_MOONCAKE_CUSTOM_MEM_POOL + value: "True" + - name: MC_FORCE_MNNVL + value: "1" + - name: NCCL_MNNVL_ENABLE + value: "1" + - name: NCCL_CUMEM_ENABLE + value: "1" + - name: SGLANG_USE_MESSAGE_QUEUE_BROADCASTER + value: "0" + - name: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK + value: "1" + - name: PYTHONUNBUFFERED + value: "1" + livenessProbe: + initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 150 + failureThreshold: 100 + readinessProbe: + initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 300 + failureThreshold: 100 + startupProbe: + initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 600 + failureThreshold: 3000 + + +secrets: + ngc: + secretName: nvcr-secret + huggingface: + secretName: hf-token-secret + secretData: + token: "hf_api_token" + 
+volumes: + gcsfuse: + bucketName: "yijiaj-test" + fileCacheCapacity: "500G" + cachePath: "/gcs-cache" + ssdMountPath: "/ssd" + gcsMounts: + - bucketName: "yijiaj-test" + mountPath: "/data/model" + +service: + type: ClusterIP + ports: + frontend: 8000 + worker: 9090 + +workload: + model: deepseek-ai/DeepSeek-R1 + gpus: 16 + framework: sglang + +network: + subnetworks: [] + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic-arm64:v1.0.7 + ncclSettings: + - name: NCCL_DEBUG + value: "VERSION" + +quantizations: + - "fp8" diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml new file mode 100644 index 00000000..82029f49 --- /dev/null +++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml @@ -0,0 +1,46 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +model-path: /data/model/deepseek-ai/DeepSeek-R1 +served-model-name: deepseek-ai/DeepSeek-R1 +log-level: DEBUG +tp: "8" +dp-size: "8" +decode-log-interval: "1" +page-size: "1" +enable-dp-attention: true +trust-remote-code: true +disaggregation-mode: decode +disaggregation-transfer-backend: nixl +disaggregation-bootstrap-port: "30001" +host: "0.0.0.0" +port: "9090" +max-running-requests: "36864" +context-length: "2716" +disable-radix-cache: true +moe-a2a-backend: deepep +prefill-round-robin-balance: true +deepep-mode: normal +moe-dense-tp-size: "1" +enable-dp-lm-head: true +disable-cuda-graph: true +cuda-graph-max-bs: "256" +disable-shared-experts-fusion: true +ep-num-redundant-experts: "32" +ep-dispatch-algorithm: static +eplb-algorithm: deepseek +attention-backend: cutlass_mla +watchdog-timeout: "1000000" +chunked-prefill-size: "36864" +mem-fraction-static: "0.8" diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml new file mode 100644 index 00000000..939aa2cc --- /dev/null +++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml @@ -0,0 +1,46 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +model-path: /data/model/deepseek-ai/DeepSeek-R1 +served-model-name: deepseek-ai/DeepSeek-R1 +log-level: DEBUG +tp: "8" +dp-size: "8" +trust-remote-code: true +decode-log-interval: "1" +page-size: "1" +enable-dp-attention: true +disaggregation-mode: prefill +disaggregation-transfer-backend: nixl +disaggregation-bootstrap-port: "30001" +host: "0.0.0.0" +port: "9090" +max-running-requests: "6144" +context-length: "2716" +disable-radix-cache: true +moe-a2a-backend: deepep +load-balance-method: round_robin +deepep-mode: normal +moe-dense-tp-size: "1" +enable-dp-lm-head: true +disable-shared-experts-fusion: true +ep-num-redundant-experts: "32" +ep-dispatch-algorithm: static +eplb-algorithm: deepseek +attention-backend: cutlass_mla +watchdog-timeout: "1000000" +disable-cuda-graph: true +chunked-prefill-size: "16384" +max-total-tokens: "32768" +mem-fraction-static: "0.8" diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/Chart.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/Chart.yaml new file mode 100644 index 00000000..25a2209e --- /dev/null +++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v2 +name: a4x-dynamo-deployment +description: a4x-dynamo-deployment +type: application +version: 0.1.0 +appVersion: "0.4.0" \ No newline at end of file diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-compute-domain.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-compute-domain.yaml new file mode 100644 index 00000000..dc2ab53a --- /dev/null +++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-compute-domain.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: resource.nvidia.com/v1beta1 +kind: ComputeDomain +metadata: + name: {{ .Values.dynamo.computeDomain.name }} + namespace: {{ .Values.dynamo.namespace }} +spec: + numNodes: {{ .Values.dynamo.computeDomain.numNodes }} + channel: + resourceClaimTemplate: + name: {{ .Values.dynamo.computeDomain.resourceClaimTemplateName }} diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml new file mode 100644 index 00000000..efe0306d --- /dev/null +++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml @@ -0,0 +1,470 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: {{ .Values.dynamo.deploymentName }} + namespace: {{ .Values.dynamo.namespace }} +spec: + {{- if .Values.workload.framework }} + backendFramework: {{ .Values.workload.framework }} + {{- end }} + services: + Frontend: + dynamoNamespace: {{ .Values.dynamo.namespace }} + componentType: frontend + replicas: {{ .Values.dynamo.frontend.replicas }} + resources: + requests: + cpu: "5" + memory: "10Gi" + limits: + cpu: "5" + memory: "10Gi" + extraPodMetadata: + annotations: + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + gke-gcsfuse/file-cache-capacity: "500Gi" + gke-gcsfuse/cache-path: "/gcs-cache" + extraPodSpec: + tolerations: + - key: "kubernetes.io/arch" + operator: "Equal" + value: "arm64" + effect: "NoSchedule" + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: local-ssd + emptyDir: {} + - name: gcs-model-volume + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: {{ .Values.volumes.gcsfuse.bucketName }} + mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:50,file-cache:max-size-mb:-1" + + mainContainer: + image: {{ .Values.dynamo.frontend.image }} + volumeMounts: + - name: local-ssd + mountPath: /gcs-cache + - name: gcs-model-volume + mountPath: /data/model + readOnly: true + resources: + requests: + ephemeral-storage: "30Gi" + limits: + ephemeral-storage: "30Gi" + + Decode: + multinode: + nodeCount: {{ .Values.dynamo.decodeWorker.nodeCount }} + dynamoNamespace: {{ .Values.dynamo.namespace }} + envFromSecret: {{ .Values.secrets.huggingface.secretName }} + componentType: worker + subComponentType: decode + replicas: {{ .Values.dynamo.decodeWorker.replicas }} + livenessProbe: + httpGet: + path: /live + 
port: system + initialDelaySeconds: {{ .Values.dynamo.decodeWorker.livenessProbe.initialDelaySeconds }} + periodSeconds: {{ .Values.dynamo.decodeWorker.livenessProbe.periodSeconds }} + timeoutSeconds: {{ .Values.dynamo.decodeWorker.livenessProbe.timeoutSeconds }} + failureThreshold: {{ .Values.dynamo.decodeWorker.livenessProbe.failureThreshold }} + readinessProbe: + httpGet: + path: /health + port: system + initialDelaySeconds: {{ .Values.dynamo.decodeWorker.readinessProbe.initialDelaySeconds }} + timeoutSeconds: {{ .Values.dynamo.decodeWorker.readinessProbe.timeoutSeconds }} + periodSeconds: {{ .Values.dynamo.decodeWorker.readinessProbe.periodSeconds }} + failureThreshold: {{ .Values.dynamo.decodeWorker.readinessProbe.failureThreshold }} + sharedMemory: + size: 80Gi + resources: + resources: + limits: + gpu: "4" + claims: + - name: compute-domain-channel + envs: + {{- if .Values.dynamo.decodeWorker.envs }} + {{- toYaml .Values.dynamo.decodeWorker.envs | nindent 8 }} + {{- end }} + extraPodMetadata: + annotations: + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/volumes: "true" + networking.gke.io/default-interface: 'eth0' + networking.gke.io/interfaces: | + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth2","network":"rdma-0"}, + {"interfaceName":"eth3","network":"rdma-1"}, + {"interfaceName":"eth4","network":"rdma-2"}, + {"interfaceName":"eth5","network":"rdma-3"} + ] + extraPodSpec: + resourceClaims: + - name: compute-domain-channel + resourceClaimTemplateName: {{ .Values.dynamo.computeDomain.resourceClaimTemplateName }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/arch + operator: In + values: + - arm64 + volumes: + - name: gcs-model-volume + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: {{ .Values.volumes.gcsfuse.bucketName }} + 
mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1 + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + + mainContainer: + securityContext: + privileged: true + image: {{ .Values.dynamo.decodeWorker.image }} + workingDir: /sgl-workspace/dynamo/components/backends/sglang + startupProbe: + failureThreshold: {{ .Values.dynamo.decodeWorker.startupProbe.failureThreshold }} + httpGet: + path: /live + port: system + periodSeconds: {{ .Values.dynamo.decodeWorker.startupProbe.periodSeconds }} + timeoutSeconds: {{ .Values.dynamo.decodeWorker.startupProbe.timeoutSeconds }} + initialDelaySeconds: {{ .Values.dynamo.decodeWorker.startupProbe.initialDelaySeconds }} + command: ["/bin/bash", "-c"] + stdin: true + tty: true + args: + - | + set -e + nvidia-smi + . /usr/local/gib/scripts/set_nccl_env.sh + + echo "--- VERIFYING NCCL ENV VARS IN SHELL ---" + env | grep NCCL_ + echo "--- END VERIFICATION ---" + + {{- if .Values.workload_launcher }} + # Use custom launcher if provided + if [ ! -f "$LAUNCHER_SCRIPT" ]; then + echo "Error: Launcher script $LAUNCHER_SCRIPT not found!" 
+ exit 1 + fi + + ARGS=() + if [ -f "$SERVER_ARGS_FILE" ]; then + echo "Loading server arguments from ConfigMap" + while IFS=': ' read -r key value || [ -n "$key" ]; do + [[ -z "$key" || "$key" == \#* ]] && continue + key=$(echo "$key" | xargs) + value=$(echo "$value" | xargs) + + if [ -n "$key" ]; then + if [[ "$value" == "true" ]]; then + ARGS+=("--$key") + elif [[ "$value" == "false" ]]; then + ARGS+=("--$key" "false") + elif [ -n "$value" ]; then + ARGS+=("--$key" "$value") + else + ARGS+=("--$key") + fi + fi + done < "$SERVER_ARGS_FILE" + fi + + echo "Running: $LAUNCHER_SCRIPT ${ARGS[@]}" + exec "$LAUNCHER_SCRIPT" "${ARGS[@]}" + {{- else }} + exec python3 -m dynamo.sglang \ + --model-path /data/model/deepseek-ai/DeepSeek-R1 \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --log-level DEBUG \ + --tp 8 \ + --dp-size 8 \ + --decode-log-interval 1 \ + --page-size 1 \ + --enable-dp-attention \ + --trust-remote-code \ + --disaggregation-mode decode \ + --disaggregation-transfer-backend nixl \ + --disaggregation-bootstrap-port 30001 \ + --host 0.0.0.0 \ + --port 9090 \ + --decode-log-interval 1 \ + --max-running-requests 36864 \ + --context-length 2716 \ + --disable-radix-cache \ + --moe-a2a-backend deepep \ + --prefill-round-robin-balance \ + --deepep-mode normal \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --disable-cuda-graph \ + --cuda-graph-max-bs 256 \ + --disable-shared-experts-fusion \ + --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm static \ + --eplb-algorithm deepseek \ + --attention-backend cutlass_mla \ + --watchdog-timeout 1000000 \ + --chunked-prefill-size 36864 \ + --mem-fraction-static 0.8 + {{- end }} + + volumeMounts: + - mountPath: /data/model + name: gcs-model-volume + - name: library-dir-host + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + {{- if .Values.workload_launcher }} + - name: serving-configuration + mountPath: {{ .Values.workload.configPath | default "/workload/configs" }} + - name: 
serving-launcher + mountPath: /workload/launcher + {{- end }} + volumes: + {{- if .Values.workload_launcher }} + - name: serving-configuration + configMap: + name: "{{ .Release.Name }}-decode-config" + items: + - key: serving-configuration + path: {{ .Values.workload.configFile | default "serving-args.yaml" }} + - name: serving-launcher + configMap: + name: "{{ .Release.Name }}-launcher" + defaultMode: 0700 + {{- end }} + + + Prefill: + multinode: + nodeCount: {{ .Values.dynamo.prefillWorker.nodeCount }} + dynamoNamespace: {{ .Values.dynamo.namespace }} + envFromSecret: {{ .Values.secrets.huggingface.secretName }} + componentType: worker + subComponentType: prefill + replicas: {{ .Values.dynamo.prefillWorker.replicas }} + livenessProbe: + httpGet: + path: /live + port: system + initialDelaySeconds: {{ .Values.dynamo.prefillWorker.livenessProbe.initialDelaySeconds }} + periodSeconds: {{ .Values.dynamo.prefillWorker.livenessProbe.periodSeconds }} + timeoutSeconds: {{ .Values.dynamo.prefillWorker.livenessProbe.timeoutSeconds }} + failureThreshold: {{ .Values.dynamo.prefillWorker.livenessProbe.failureThreshold }} + readinessProbe: + httpGet: + path: /health + port: system + initialDelaySeconds: {{ .Values.dynamo.prefillWorker.readinessProbe.initialDelaySeconds }} + timeoutSeconds: {{ .Values.dynamo.prefillWorker.readinessProbe.timeoutSeconds }} + periodSeconds: {{ .Values.dynamo.prefillWorker.readinessProbe.periodSeconds }} + failureThreshold: {{ .Values.dynamo.prefillWorker.readinessProbe.failureThreshold }} + sharedMemory: + size: 80Gi + resources: + limits: + gpu: "4" + claims: + - name: compute-domain-channel + envs: + {{- if .Values.dynamo.prefillWorker.envs }} + {{- toYaml .Values.dynamo.prefillWorker.envs | nindent 8 }} + {{- end }} + extraPodMetadata: + annotations: + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/volumes: "true" + networking.gke.io/default-interface: 'eth0' + 
networking.gke.io/interfaces: | + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth2","network":"rdma-0"}, + {"interfaceName":"eth3","network":"rdma-1"}, + {"interfaceName":"eth4","network":"rdma-2"}, + {"interfaceName":"eth5","network":"rdma-3"} + ] + extraPodSpec: + resourceClaims: + - name: compute-domain-channel + resourceClaimTemplateName: {{ .Values.dynamo.computeDomain.resourceClaimTemplateName }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/arch + operator: In + values: + - arm64 + volumes: + - name: gcs-model-volume + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: {{ .Values.volumes.gcsfuse.bucketName }} + mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1 + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + mainContainer: + securityContext: + privileged: true + stdin: true + tty: true + image: {{ .Values.dynamo.prefillWorker.image }} + workingDir: /sgl-workspace/dynamo/components/backends/sglang + startupProbe: + failureThreshold: {{ .Values.dynamo.prefillWorker.startupProbe.failureThreshold }} + httpGet: + path: /live + port: system + periodSeconds: {{ .Values.dynamo.prefillWorker.startupProbe.periodSeconds }} + timeoutSeconds: {{ .Values.dynamo.prefillWorker.startupProbe.timeoutSeconds }} + initialDelaySeconds: {{ .Values.dynamo.prefillWorker.startupProbe.initialDelaySeconds }} + command: ["/bin/bash", "-c"] + args: + - | + set -e + nvidia-smi + . /usr/local/gib/scripts/set_nccl_env.sh + echo "Pre-compiling DeepGEMM kernels for Prefill Worker..." + + echo "Finished pre-compiling DeepGEMM kernels for Prefill Worker." 
+ {{- if .Values.workload_launcher }} + # Use custom launcher if provided + if [ ! -f "$LAUNCHER_SCRIPT" ]; then + echo "Error: Launcher script $LAUNCHER_SCRIPT not found!" + exit 1 + fi + + ARGS=("--is-prefill-worker") + if [ -f "$SERVER_ARGS_FILE" ]; then + echo "Loading server arguments from ConfigMap" + while IFS=': ' read -r key value || [ -n "$key" ]; do + [[ -z "$key" || "$key" == \#* ]] && continue + key=$(echo "$key" | xargs) + value=$(echo "$value" | xargs) + + if [ -n "$key" ]; then + if [[ "$value" == "true" ]]; then + ARGS+=("--$key") + elif [[ "$value" == "false" ]]; then + ARGS+=("--$key" "false") + elif [ -n "$value" ]; then + ARGS+=("--$key" "$value") + else + ARGS+=("--$key") + fi + fi + done < "$SERVER_ARGS_FILE" + fi + + echo "Running: $LAUNCHER_SCRIPT ${ARGS[@]}" + exec "$LAUNCHER_SCRIPT" "${ARGS[@]}" + {{- else }} + exec python3 -m dynamo.sglang \ + --model-path /data/model/deepseek-ai/DeepSeek-R1 \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --log-level DEBUG \ + --tp 8 \ + --dp-size 8 \ + --trust-remote-code \ + --decode-log-interval 1 \ + --page-size 1 \ + --enable-dp-attention \ + --disaggregation-mode prefill \ + --disaggregation-transfer-backend nixl \ + --disaggregation-bootstrap-port 30001 \ + --host 0.0.0.0 \ + --port 9090 \ + --decode-log-interval 1 \ + --max-running-requests 6144 \ + --context-length 2716 \ + --disable-radix-cache \ + --moe-a2a-backend deepep \ + --load-balance-method round_robin \ + --deepep-mode normal \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --disable-shared-experts-fusion \ + --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm static \ + --eplb-algorithm deepseek \ + --attention-backend cutlass_mla \ + --watchdog-timeout 1000000 \ + --disable-cuda-graph \ + --chunked-prefill-size 16384 \ + --max-total-tokens 32768 \ + --mem-fraction-static 0.8 + {{- end }} + + volumeMounts: + - mountPath: /data/model + name: gcs-model-volume + - name: library-dir-host + mountPath: /usr/local/nvidia + - 
name: gib + mountPath: /usr/local/gib + {{- if .Values.workload_launcher }} + - name: serving-configuration + mountPath: {{ .Values.workload.configPath | default "/workload/configs" }} + - name: serving-launcher + mountPath: /workload/launcher + {{- end }} + volumes: + {{- if .Values.workload_launcher }} + - name: serving-configuration + configMap: + name: "{{ .Release.Name }}-prefill-config" + items: + - key: serving-configuration + path: {{ .Values.workload.configFile | default "serving-args.yaml" }} + - name: serving-launcher + configMap: + name: "{{ .Release.Name }}-launcher" + defaultMode: 0700 + {{- end }} diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-launcher-configmap.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-launcher-configmap.yaml new file mode 100644 index 00000000..01e9b51f --- /dev/null +++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-launcher-configmap.yaml @@ -0,0 +1,28 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-launcher" + namespace: {{ .Values.dynamo.namespace }} +data: + launch-workload.sh: |- +{{- if .Values.workload_launcher }} +{{ .Values.workload_launcher | nindent 4 }} +{{- else }} + #!/bin/bash + echo "No workload launcher specified" + exit 1 +{{- end }} \ No newline at end of file diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-worker-configmap.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-worker-configmap.yaml new file mode 100644 index 00000000..f82580ae --- /dev/null +++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-worker-configmap.yaml @@ -0,0 +1,35 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +{{- if .Values.prefill_serving_config }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-prefill-config" + namespace: {{ .Values.dynamo.namespace }} +data: + serving-configuration: |- +{{ .Values.prefill_serving_config | nindent 4 }} +{{- end }} +--- +{{- if .Values.decode_serving_config }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-decode-config" + namespace: {{ .Values.dynamo.namespace }} +data: + serving-configuration: |- +{{ .Values.decode_serving_config | nindent 4 }} +{{- end }} \ No newline at end of file From 42b686d07d346cfdd7e27886dfe79dd6e98b5e64 Mon Sep 17 00:00:00 2001 From: Yijia J Date: Sat, 27 Dec 2025 00:01:31 +0000 Subject: [PATCH 02/12] fix values --- .../disaggregated-serving/dynamo/README.md | 24 ++++++++----------- .../disaggregated-serving/dynamo/values.yaml | 2 +- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md index ec499d10..9a5f91e6 100644 --- a/inference/a4x/disaggregated-serving/dynamo/README.md +++ b/inference/a4x/disaggregated-serving/dynamo/README.md @@ -1,28 +1,24 @@ # Disaggregated Multi-Node Dynamo Recipe for A4x -This recipe runs a disaggregated multi-node Dynamo deployment on A4x. +This recipe runs a disaggregated multi-node Dynamo deployment on A4X. ## Setup 1. **Set Environment Variables** ```bash - export REPO_ROOT=$(git rev-parse --show-toplevel) - export RELEASE_VERSION="24.05" export USER=$(whoami) ``` 2. 
**Run the Recipe** - ```bash - helm install -f values.yaml \ - --set-file workload_launcher=$REPO_ROOT/src/launchers/dynamo-vllm-launcher.sh \ - --set-file serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/llama-3.3-70b-multi-node.yaml \ - --set workload.framework=vllm \ - --set workload.model.name=meta-llama/Llama-3.3-70B-Instruct \ - --set workload.image=nvcr.io/nvidia/ai-dynamo/vllm-runtime:${RELEASE_VERSION} \ - --set workload.gpus=16 \ - $USER-dynamo-multi-node-serving-a4x \ - $REPO_ROOT/src/helm-charts/a4x/inference-templates/dynamo-deployment - ``` + ```bash + cd $RECIPE_ROOT + helm install -f values.yaml \ + --set-file workload_launcher=$REPO_ROOT/src/launchers/dynamo-sglang-launcher.sh \ + --set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml \ + --set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml \ + $USER-dynamo-a4x-multi-node \ + $REPO_ROOT/src/helm-charts/a4x/inference-templates/dynamo-deployment + ``` diff --git a/inference/a4x/disaggregated-serving/dynamo/values.yaml b/inference/a4x/disaggregated-serving/dynamo/values.yaml index b49162bc..71f43c74 100644 --- a/inference/a4x/disaggregated-serving/dynamo/values.yaml +++ b/inference/a4x/disaggregated-serving/dynamo/values.yaml @@ -15,7 +15,7 @@ dynamo: namespace: dynamo-cloud releaseVersion: "0.7.0" - deploymentName: + deploymentName: disagg2p2d-yijiaj computeDomain: name: yijiaj-a4x-domain numNodes: 4 From 3dfc415947c9b42b1036115b9457c1672bbffc1c Mon Sep 17 00:00:00 2001 From: Yijia J Date: Sat, 27 Dec 2025 00:28:09 +0000 Subject: [PATCH 03/12] update --- .../templates/dynamo-graph-deployment.yaml | 52 +++++++++---------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml 
b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml index efe0306d..cb9fbbf0 100644 --- a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml +++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml @@ -101,7 +101,6 @@ spec: sharedMemory: size: 80Gi resources: - resources: limits: gpu: "4" claims: @@ -138,20 +137,6 @@ spec: operator: In values: - arm64 - volumes: - - name: gcs-model-volume - csi: - driver: gcsfuse.csi.storage.gke.io - volumeAttributes: - bucketName: {{ .Values.volumes.gcsfuse.bucketName }} - mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1 - - name: library-dir-host - hostPath: - path: /home/kubernetes/bin/nvidia - - name: gib - hostPath: - path: /home/kubernetes/bin/gib - mainContainer: securityContext: privileged: true @@ -260,6 +245,18 @@ spec: mountPath: /workload/launcher {{- end }} volumes: + - name: gcs-model-volume + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: {{ .Values.volumes.gcsfuse.bucketName }} + mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1 + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: gib + hostPath: + path: /home/kubernetes/bin/gib {{- if .Values.workload_launcher }} - name: serving-configuration configMap: @@ -337,19 +334,6 @@ spec: operator: In values: - arm64 - volumes: - - name: gcs-model-volume - csi: - driver: gcsfuse.csi.storage.gke.io - volumeAttributes: - bucketName: {{ .Values.volumes.gcsfuse.bucketName }} - mountOptions: 
implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1 - - name: library-dir-host - hostPath: - path: /home/kubernetes/bin/nvidia - - name: gib - hostPath: - path: /home/kubernetes/bin/gib mainContainer: securityContext: privileged: true @@ -456,6 +440,18 @@ spec: mountPath: /workload/launcher {{- end }} volumes: + - name: gcs-model-volume + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: {{ .Values.volumes.gcsfuse.bucketName }} + mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1 + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: gib + hostPath: + path: /home/kubernetes/bin/gib {{- if .Values.workload_launcher }} - name: serving-configuration configMap: From 36ccdb63e6b5444cc722654dc890070290fb00a9 Mon Sep 17 00:00:00 2001 From: Yijia J Date: Thu, 1 Jan 2026 00:25:42 +0000 Subject: [PATCH 04/12] recipe 2p2d, README --- .../disaggregated-serving/dynamo/README.md | 302 +++++++++++++++++- .../disaggregated-serving/dynamo/values.yaml | 8 +- .../templates/dynamo-graph-deployment.yaml | 125 +------- 3 files changed, 300 insertions(+), 135 deletions(-) diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md index 9a5f91e6..b185b4fb 100644 --- a/inference/a4x/disaggregated-serving/dynamo/README.md +++ b/inference/a4x/disaggregated-serving/dynamo/README.md @@ -1,24 +1,292 @@ -# Disaggregated Multi-Node Dynamo Recipe for A4x +# Disaggregated Multi-Node Inference with NVIDIA Dynamo on A4X GKE -This recipe runs a disaggregated multi-node Dynamo deployment on A4X. 
+This document outlines the steps to deploy and serve Large Language Models (LLMs) using the [NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo) disaggregated inference platform on [A4X GKE Node pools](https://cloud.google.com/kubernetes-engine). -## Setup +Dynamo provides a disaggregated architecture that separates prefill and decode operations for optimized inference performance, supporting both single-node (8 GPUs) and multi-node (16 GPUs) configurations. Dynamo also supports various inference framework backends such as [vLLM](https://docs.nvidia.com/dynamo/latest/components/backends/vllm/README.html) and [SGLang](https://docs.nvidia.com/dynamo/latest/components/backends/sglang/README.html). In this recipe, we will focus on serving using the SGLang backend. -1. **Set Environment Variables** + +## Table of Contents - ```bash - export USER=$(whoami) - ``` +* [1. Test Environment](#test-environment) +* [2. Environment Setup (One-Time)](#environment-setup) + * [2.1. Clone the Repository](#clone-repo) + * [2.2. Configure Environment Variables](#configure-vars) + * [2.3. Connect to your GKE Cluster](#connect-cluster) + * [2.4. Create Secrets](#create-secrets) + * [2.5. Install Dynamo Platform](#install-platform) +* [3. Deploy with SGLang Backend](#deploy-sglang) + * [3.1. Multi-Node SGLang Deployment (16 GPUs)](#sglang-multi-node) +* [4. Inference Request](#inference-request) +* [5. Monitoring and Troubleshooting](#monitoring) +* [6. Cleanup](#cleanup) -2. **Run the Recipe** + +## 1.
Test Environment - ```bash - cd $RECIPE_ROOT - helm install -f values.yaml \ - --set-file workload_launcher=$REPO_ROOT/src/launchers/dynamo-sglang-launcher.sh \ - --set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml \ - --set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml \ - $USER-dynamo-a4x-multi-node \ - $REPO_ROOT/src/helm-charts/a4x/inference-templates/dynamo-deployment - ``` +[Back to Top](#table-of-contents) + +This recipe has been tested with the following configuration: + +* **GKE Cluster**: + * GPU node pools with [a4x-highgpu-4g](https://docs.cloud.google.com/compute/docs/gpus#gb200-gpus) machines: + * For multi-node deployment: 4 machines with 4 GPUs each (16 GPUs total) + * [Workload Identity Federation for GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/workload-identity) enabled + * [Cloud Storage FUSE CSI driver for GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/cloud-storage-fuse-csi-driver) enabled + +> [!IMPORTANT] +> To prepare the required environment, see the [GKE environment setup guide](../../../../docs/configuring-environment-gke-a4x.md). + + +## 2. Environment Setup (One-Time) + +[Back to Top](#table-of-contents) + + +### 2.1. Clone the Repository + +```bash +git clone https://github.com/ai-hypercomputer/gpu-recipes.git +cd gpu-recipes +export REPO_ROOT=$(pwd) +export RECIPE_ROOT=$REPO_ROOT/inference/a4x/disaggregated-serving/dynamo +``` + + +### 2.2. 
Configure Environment Variables + +```bash +export PROJECT_ID= +export CLUSTER_REGION= +export CLUSTER_NAME= +export NAMESPACE=dynamo-cloud +export NGC_API_KEY= +export HF_TOKEN= +export RELEASE_VERSION=0.7.0 + +# Set the project for gcloud commands +gcloud config set project $PROJECT_ID +``` + +Replace the following values: + +| Variable | Description | Example | +| -------- | ----------- | ------- | +| `PROJECT_ID` | Your Google Cloud Project ID | `gcp-project-12345` | +| `CLUSTER_REGION` | The GCP region where your GKE cluster is located | `us-central1` | +| `CLUSTER_NAME` | The name of your GKE cluster | `a4x-cluster` | +| `NGC_API_KEY` | Your NVIDIA NGC API key (get from [NGC](https://ngc.nvidia.com)) | `nvapi-xxx...` | +| `HF_TOKEN` | Your Hugging Face access token | `hf_xxx...` | + + +### 2.3. Connect to your GKE Cluster + +```bash +gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION +``` + + +### 2.4. Create Secrets + +Create the namespace: +```bash +kubectl create namespace ${NAMESPACE} +kubectl config set-context --current --namespace=$NAMESPACE +``` + +Create the Docker registry secret for NVIDIA Container Registry: +```bash +kubectl create secret docker-registry nvcr-secret \ + --namespace=${NAMESPACE} \ + --docker-server=nvcr.io \ + --docker-username='$oauthtoken' \ + --docker-password=${NGC_API_KEY} +``` + +Create the secret for the Hugging Face token: +```bash +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN=${HF_TOKEN} \ + -n ${NAMESPACE} +``` + + +### 2.5. 
Install Dynamo Platform (One-Time Setup) + +Add the NVIDIA Helm repository: +```bash +helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \ + --username='$oauthtoken' --password=${NGC_API_KEY} +helm repo update +``` + +Fetch the Dynamo Helm charts: +```bash +helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-crds-${RELEASE_VERSION}.tgz +helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-platform-${RELEASE_VERSION}.tgz +``` + +Install the Dynamo CRDs: +```bash +helm install dynamo-crds dynamo-crds-${RELEASE_VERSION}.tgz \ + --namespace default \ + --wait \ + --atomic +``` + +Install the Dynamo Platform with Grove & Kai scheduler enabled: +```bash +helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz \ + --namespace ${NAMESPACE} --set grove.enabled=true --set kai-scheduler.enabled=true +``` + +Verify the installation: +```bash +kubectl get pods -n ${NAMESPACE} +``` + +Wait until all pods show a `Running` status before proceeding. + + +## 3. Deploy with SGLang Backend + +[Back to Top](#table-of-contents) + +Deploy Dynamo with SGLang backend for high-performance inference. + + +### 3.1. Multi-Node SGLang Deployment (16 GPUs) + +Multi-node deployment uses 16 GPUs across 4 A4X machines, providing increased capacity for larger models or higher throughput. + +#### DeepSeekR1 671B Model + +Deploy DeepSeekR1-671B across multiple nodes for production workloads.
Note the use of `--set-file prefill_serving_config` and `--set-file decode_serving_config` pointing to the correct model config file for a multi-node deployment scenario: + +```bash +cd $RECIPE_ROOT +helm install -f values.yaml \ +--set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml \ +--set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml \ +$USER-dynamo-a4x-multi-node \ +$REPO_ROOT/src/helm-charts/a4x/inference-templates/dynamo-deployment +``` + + +## 4. Inference Request +[Back to Top](#table-of-contents) + +To make an inference request to test the server, we can first run a health check against the server using `curl`: + +```bash +kubectl exec -it -n ${NAMESPACE} deployment/$USER-dynamo-a4x-multi-node -- curl http://localhost:8000/health | jq +``` + +You should see a server status like the example below, which shows a server that is still starting up. Wait for the `status` field to report `healthy` before sending requests. + +```json +{ + "instances": [ + { + "component": "backend", + "endpoint": "load_metrics", + "instance_id": 3994861215823793160, + "namespace": "dynamo", + "transport": { + "nats_tcp": "dynamo_backend.load_metrics-3770991c30298c08" + } + }, + { + "component": "prefill", + "endpoint": "clear_kv_blocks", + "instance_id": 3994861215823793153, + "namespace": "dynamo", + "transport": { + "nats_tcp": "dynamo_prefill.clear_kv_blocks-3770991c30298c01" + } + }, + { + "component": "prefill", + "endpoint": "generate", + "instance_id": 3994861215823793153, + "namespace": "dynamo", + "transport": { + "nats_tcp": "dynamo_prefill.generate-3770991c30298c01" + } + } + ], + "message": "No endpoints available", + "status": "unhealthy" +} +``` + +Then we can send a benchmark request like this: + +```bash +kubectl exec -n ${NAMESPACE} $USER-dynamo-multi-node-serving-frontend -- python3 -u -m sglang.bench_serving --backend sglang-oai-chat --base-url http://localhost:8000 --model "deepseek-ai/DeepSeek-R1" --tokenizer
/data/model/deepseek-ai/DeepSeek-R1 --dataset-name random --num-prompts 2048 --random-input-len 2048 --random-output-len 512 --max-concurrency 512 +``` + + +## 5. Monitoring and Troubleshooting + +[Back to Top](#table-of-contents) + +View logs for different components (replace with your deployment name): + +You can find the exact pod name by: +```bash +kubectl get pods -n ${NAMESPACE} +``` + +Frontend logs: +```bash +kubectl logs -f deployment/$USER-dynamo-multi-node-serving-frontend -n ${NAMESPACE} +``` + +Decode worker logs: +```bash +kubectl logs -f deployment/$USER-dynamo-multi-node-serving-decode-worker -n ${NAMESPACE} +``` + +Prefill worker logs: +```bash +kubectl logs -f deployment/$USER-dynamo-multi-node-serving-prefill-worker -n ${NAMESPACE} +``` + +Common issues: + +* **Pods stuck in Pending**: Check if nodes have sufficient resources (especially for multi-node deployments) +* **Model download slow**: Large models like DeepSeekR1 671B can take 30 minutes to download +* **Multi-node issues**: Verify network connectivity between nodes and proper subnet configuration + + +## 6. 
Cleanup + +[Back to Top](#table-of-contents) + +List deployed releases: +```bash +helm list -n ${NAMESPACE} --filter $USER-dynamo- +``` + +Uninstall specific deployments: +```bash +helm uninstall $USER-dynamo-multi-node-serving -n ${NAMESPACE} +``` + +Uninstall Dynamo platform (if no longer needed): +```bash +helm uninstall dynamo-platform -n ${NAMESPACE} +helm uninstall dynamo-crds -n default +``` + +Delete namespace and secrets: +```bash +kubectl delete namespace ${NAMESPACE} +``` + +Clean up downloaded charts: +```bash +rm -f dynamo-crds-${RELEASE_VERSION}.tgz +rm -f dynamo-platform-${RELEASE_VERSION}.tgz +``` diff --git a/inference/a4x/disaggregated-serving/dynamo/values.yaml b/inference/a4x/disaggregated-serving/dynamo/values.yaml index 71f43c74..9c271b35 100644 --- a/inference/a4x/disaggregated-serving/dynamo/values.yaml +++ b/inference/a4x/disaggregated-serving/dynamo/values.yaml @@ -15,11 +15,11 @@ dynamo: namespace: dynamo-cloud releaseVersion: "0.7.0" - deploymentName: disagg2p2d-yijiaj + deploymentName: disagg2p2d computeDomain: - name: yijiaj-a4x-domain + name: a4x-domain numNodes: 4 - resourceClaimTemplateName: yijiaj-a4x-channel + resourceClaimTemplateName: a4x-channel frontend: image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1 replicas: 1 @@ -185,6 +185,8 @@ workload: model: deepseek-ai/DeepSeek-R1 gpus: 16 framework: sglang + configFile: serving-args.yaml + configPath: /workload/configs network: subnetworks: [] diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml index cb9fbbf0..67444375 100644 --- a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml +++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml @@ -106,6 +106,8 @@ spec: claims: - name: compute-domain-channel envs: + - name: SERVER_ARGS_FILE 
+ value: {{ .Values.workload.configPath }}/{{ .Values.workload.configFile }} {{- if .Values.dynamo.decodeWorker.envs }} {{- toYaml .Values.dynamo.decodeWorker.envs | nindent 8 }} {{- end }} @@ -163,13 +165,6 @@ spec: env | grep NCCL_ echo "--- END VERIFICATION ---" - {{- if .Values.workload_launcher }} - # Use custom launcher if provided - if [ ! -f "$LAUNCHER_SCRIPT" ]; then - echo "Error: Launcher script $LAUNCHER_SCRIPT not found!" - exit 1 - fi - ARGS=() if [ -f "$SERVER_ARGS_FILE" ]; then echo "Loading server arguments from ConfigMap" @@ -191,45 +186,8 @@ spec: fi done < "$SERVER_ARGS_FILE" fi - - echo "Running: $LAUNCHER_SCRIPT ${ARGS[@]}" - exec "$LAUNCHER_SCRIPT" "${ARGS[@]}" - {{- else }} - exec python3 -m dynamo.sglang \ - --model-path /data/model/deepseek-ai/DeepSeek-R1 \ - --served-model-name deepseek-ai/DeepSeek-R1 \ - --log-level DEBUG \ - --tp 8 \ - --dp-size 8 \ - --decode-log-interval 1 \ - --page-size 1 \ - --enable-dp-attention \ - --trust-remote-code \ - --disaggregation-mode decode \ - --disaggregation-transfer-backend nixl \ - --disaggregation-bootstrap-port 30001 \ - --host 0.0.0.0 \ - --port 9090 \ - --decode-log-interval 1 \ - --max-running-requests 36864 \ - --context-length 2716 \ - --disable-radix-cache \ - --moe-a2a-backend deepep \ - --prefill-round-robin-balance \ - --deepep-mode normal \ - --moe-dense-tp-size 1 \ - --enable-dp-lm-head \ - --disable-cuda-graph \ - --cuda-graph-max-bs 256 \ - --disable-shared-experts-fusion \ - --ep-num-redundant-experts 32 \ - --ep-dispatch-algorithm static \ - --eplb-algorithm deepseek \ - --attention-backend cutlass_mla \ - --watchdog-timeout 1000000 \ - --chunked-prefill-size 36864 \ - --mem-fraction-static 0.8 - {{- end }} + echo "Running: python3 -m dynamo.sglang ${ARGS[@]}" + exec python3 -m dynamo.sglang "${ARGS[@]}" volumeMounts: - mountPath: /data/model @@ -238,12 +196,8 @@ spec: mountPath: /usr/local/nvidia - name: gib mountPath: /usr/local/gib - {{- if .Values.workload_launcher }} - name: 
serving-configuration mountPath: {{ .Values.workload.configPath | default "/workload/configs" }} - - name: serving-launcher - mountPath: /workload/launcher - {{- end }} volumes: - name: gcs-model-volume csi: @@ -257,19 +211,12 @@ spec: - name: gib hostPath: path: /home/kubernetes/bin/gib - {{- if .Values.workload_launcher }} - name: serving-configuration configMap: name: "{{ .Release.Name }}-decode-config" items: - key: serving-configuration path: {{ .Values.workload.configFile | default "serving-args.yaml" }} - - name: serving-launcher - configMap: - name: "{{ .Release.Name }}-launcher" - defaultMode: 0700 - {{- end }} - Prefill: multinode: @@ -303,6 +250,8 @@ spec: claims: - name: compute-domain-channel envs: + - name: SERVER_ARGS_FILE + value: {{ .Values.workload.configPath }}/{{ .Values.workload.configFile }} {{- if .Values.dynamo.prefillWorker.envs }} {{- toYaml .Values.dynamo.prefillWorker.envs | nindent 8 }} {{- end }} @@ -356,16 +305,9 @@ spec: nvidia-smi . /usr/local/gib/scripts/set_nccl_env.sh echo "Pre-compiling DeepGEMM kernels for Prefill Worker..." - echo "Finished pre-compiling DeepGEMM kernels for Prefill Worker." - {{- if .Values.workload_launcher }} - # Use custom launcher if provided - if [ ! -f "$LAUNCHER_SCRIPT" ]; then - echo "Error: Launcher script $LAUNCHER_SCRIPT not found!" 
- exit 1 - fi - ARGS=("--is-prefill-worker") + ARGS=() if [ -f "$SERVER_ARGS_FILE" ]; then echo "Loading server arguments from ConfigMap" while IFS=': ' read -r key value || [ -n "$key" ]; do @@ -386,45 +328,8 @@ spec: fi done < "$SERVER_ARGS_FILE" fi - - echo "Running: $LAUNCHER_SCRIPT ${ARGS[@]}" - exec "$LAUNCHER_SCRIPT" "${ARGS[@]}" - {{- else }} - exec python3 -m dynamo.sglang \ - --model-path /data/model/deepseek-ai/DeepSeek-R1 \ - --served-model-name deepseek-ai/DeepSeek-R1 \ - --log-level DEBUG \ - --tp 8 \ - --dp-size 8 \ - --trust-remote-code \ - --decode-log-interval 1 \ - --page-size 1 \ - --enable-dp-attention \ - --disaggregation-mode prefill \ - --disaggregation-transfer-backend nixl \ - --disaggregation-bootstrap-port 30001 \ - --host 0.0.0.0 \ - --port 9090 \ - --decode-log-interval 1 \ - --max-running-requests 6144 \ - --context-length 2716 \ - --disable-radix-cache \ - --moe-a2a-backend deepep \ - --load-balance-method round_robin \ - --deepep-mode normal \ - --moe-dense-tp-size 1 \ - --enable-dp-lm-head \ - --disable-shared-experts-fusion \ - --ep-num-redundant-experts 32 \ - --ep-dispatch-algorithm static \ - --eplb-algorithm deepseek \ - --attention-backend cutlass_mla \ - --watchdog-timeout 1000000 \ - --disable-cuda-graph \ - --chunked-prefill-size 16384 \ - --max-total-tokens 32768 \ - --mem-fraction-static 0.8 - {{- end }} + echo "Running: python3 -m dynamo.sglang ${ARGS[@]}" + exec python3 -m dynamo.sglang "${ARGS[@]}" volumeMounts: - mountPath: /data/model @@ -433,12 +338,8 @@ spec: mountPath: /usr/local/nvidia - name: gib mountPath: /usr/local/gib - {{- if .Values.workload_launcher }} - name: serving-configuration mountPath: {{ .Values.workload.configPath | default "/workload/configs" }} - - name: serving-launcher - mountPath: /workload/launcher - {{- end }} volumes: - name: gcs-model-volume csi: @@ -452,15 +353,9 @@ spec: - name: gib hostPath: path: /home/kubernetes/bin/gib - {{- if .Values.workload_launcher }} - name: 
serving-configuration configMap: name: "{{ .Release.Name }}-prefill-config" items: - key: serving-configuration - path: {{ .Values.workload.configFile | default "serving-args.yaml" }} - - name: serving-launcher - configMap: - name: "{{ .Release.Name }}-launcher" - defaultMode: 0700 - {{- end }} + path: {{ .Values.workload.configFile | default "serving-args.yaml" }} \ No newline at end of file From e7503e8c5efa13a7f5c3aab439d534b886dcd572 Mon Sep 17 00:00:00 2001 From: Yijia J Date: Sat, 10 Jan 2026 06:13:31 +0000 Subject: [PATCH 05/12] add 10p8d configs, add path without gcsfuse --- .../disaggregated-serving/dynamo/values.yaml | 69 ++++++++++++------- .../deepseekr1-fp8-10p8d-decode.yaml | 50 ++++++++++++++ .../deepseekr1-fp8-10p8d-prefill.yaml | 50 ++++++++++++++ ...e.yaml => deepseekr1-fp8-2p2d-decode.yaml} | 1 - ....yaml => deepseekr1-fp8-2p2d-prefill.yaml} | 1 - .../templates/dynamo-graph-deployment.yaml | 59 ++++++++++++++-- 6 files changed, 197 insertions(+), 33 deletions(-) create mode 100644 src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml create mode 100644 src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml rename src/frameworks/a4x/dynamo-configs/{deepseekr1-fp8-multi-node-decode.yaml => deepseekr1-fp8-2p2d-decode.yaml} (96%) rename src/frameworks/a4x/dynamo-configs/{deepseekr1-fp8-multi-node-prefill.yaml => deepseekr1-fp8-2p2d-prefill.yaml} (96%) diff --git a/inference/a4x/disaggregated-serving/dynamo/values.yaml b/inference/a4x/disaggregated-serving/dynamo/values.yaml index 9c271b35..a047a65f 100644 --- a/inference/a4x/disaggregated-serving/dynamo/values.yaml +++ b/inference/a4x/disaggregated-serving/dynamo/values.yaml @@ -13,15 +13,16 @@ # limitations under the License. 
dynamo: - namespace: dynamo-cloud + namespace: yijiaj-test releaseVersion: "0.7.0" - deploymentName: disagg2p2d + deploymentName: disagg2p2d-yijiaj computeDomain: - name: a4x-domain + name: yijiaj-a4x-domain numNodes: 4 - resourceClaimTemplateName: a4x-channel + resourceClaimTemplateName: yijiaj-a4x-channel + serviceAccountName: dynamo-platform-dynamo-operator-component frontend: - image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1 + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.0 replicas: 1 livenessProbe: initialDelaySeconds: 3000 @@ -34,24 +35,34 @@ dynamo: timeoutSeconds: 300 failureThreshold: 100 decodeWorker: - image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1 + image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout + #image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1 nodeCount: 2 replicas: 1 envs: + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: HF_TOKEN + - name: HF_HUB_ENABLE_HF_TRANSFER + value: "1" - name: LD_LIBRARY_PATH value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64" - name: GLOO_SOCKET_IFNAME value: eth0 - name: TP_SOCKET_IFNAME value: eth0 - - name: SGLANG_ENABLE_JIT_DEEPGEMM - value: "1" + # - name: SGLANG_ENABLE_JIT_DEEPGEMM + # value: "1" - name: DYN_SKIP_SGLANG_LOG_FORMATTING value: "1" + - name: SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK + value: "256" - name: MC_TE_METRIC value: "true" - - name: SGLANG_ENABLE_FLASHINFER_GEMM - value: "1" + # - name: SGLANG_ENABLE_FLASHINFER_GEMM + # value: "1" - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE value: "100000" - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT @@ -76,14 +87,14 @@ dynamo: value: "1" - name: PYTHONUNBUFFERED value: "1" - - name: NCCL_DEBUG - value: INFO - - 
name: NCCL_DEBUG_SUBSYS - value: INIT,BOOTSTRAP,ENV,NET,GRAPH - - name: NCCL_SOCKET_FAMILY - value: "AF_INET" - - name: GLOO_SOCKET_FAMILY - value: "AF_INET" + # - name: NCCL_DEBUG + # value: INFO + # - name: NCCL_DEBUG_SUBSYS + # value: INIT,BOOTSTRAP,ENV,NET,GRAPH + # - name: NCCL_SOCKET_FAMILY + # value: "AF_INET" + # - name: GLOO_SOCKET_FAMILY + # value: "AF_INET" livenessProbe: initialDelaySeconds: 3000 periodSeconds: 60 @@ -100,10 +111,18 @@ dynamo: timeoutSeconds: 600 failureThreshold: 3000 prefillWorker: - image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1 + image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout + #image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1 nodeCount: 2 replicas: 1 envs: + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: HF_TOKEN + - name: HF_HUB_ENABLE_HF_TRANSFER + value: "1" - name: LD_LIBRARY_PATH value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64" - name: UCX_TLS @@ -112,14 +131,14 @@ dynamo: value: eth0 - name: TP_SOCKET_IFNAME value: eth0 - - name: SGLANG_ENABLE_JIT_DEEPGEMM - value: "1" + # - name: SGLANG_ENABLE_JIT_DEEPGEMM + # value: "1" - name: DYN_SKIP_SGLANG_LOG_FORMATTING value: "1" - name: MC_TE_METRIC value: "true" - - name: SGLANG_ENABLE_FLASHINFER_GEMM - value: "1" + # - name: SGLANG_ENABLE_FLASHINFER_GEMM + # value: "1" - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE value: "100000" - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT @@ -166,14 +185,14 @@ secrets: token: "hf_api_token" volumes: + useGcs: true gcsfuse: bucketName: "yijiaj-test" fileCacheCapacity: "500G" cachePath: "/gcs-cache" ssdMountPath: "/ssd" gcsMounts: - - bucketName: "yijiaj-test" - mountPath: "/data/model" + mountPath: "/data/model" 
service: type: ClusterIP diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml new file mode 100644 index 00000000..bbbdf18f --- /dev/null +++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml @@ -0,0 +1,50 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +served-model-name: deepseek-ai/DeepSeek-R1 +disaggregation-mode: decode +disaggregation-bootstrap-port: "30001" +host: "0.0.0.0" +port: "9090" +trust-remote-code: true +skip-tokenizer-init: true +tp-size: "32" +dp-size: "32" +ep-size: "32" +quantization: "fp8" +# page-size: "1" +enable-dp-attention: true +attention-backend: "trtllm_mla" +kv-cache-dtype: "fp8_e4m3" +disable-radix-cache: true +stream-interval: "50" +# disaggregation-transfer-backend: nixl +decode-log-interval: "1000" +max-running-requests: "8192" +context-length: "9300" +watchdog-timeout: "1000000" +disable-shared-experts-fusion: true +eplb-algorithm: deepseek +mem-fraction-static: "0.82" +chunked-prefill-size: "36864" +moe-a2a-backend: "deepep" +deepep-mode: "low_latency" +ep-dispatch-algorithm: static +moe-dense-tp-size: "1" +enable-dp-lm-head: true +prefill-round-robin-balance: true +ep-num-redundant-experts: "32" +cuda-graph-max-bs: "256" +# disable-cuda-graph: true +deepep-config: '{"normal_dispatch": {"num_sms": 128,"num_max_nvl_chunked_send_tokens": 28,"num_max_nvl_chunked_recv_tokens": 
256,"num_max_rdma_chunked_send_tokens": 6,"num_max_rdma_chunked_recv_tokens": 256}, "normal_combine": {"num_sms": 128,"num_max_nvl_chunked_send_tokens": 15,"num_max_nvl_chunked_recv_tokens": 256,"num_max_rdma_chunked_send_tokens": 6,"num_max_rdma_chunked_recv_tokens": 128}}' \ No newline at end of file diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml new file mode 100644 index 00000000..f5748607 --- /dev/null +++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml @@ -0,0 +1,50 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +served-model-name: deepseek-ai/DeepSeek-R1 +# log-level: DEBUG +disaggregation-mode: prefill +disaggregation-bootstrap-port: "30001" +host: "0.0.0.0" +port: "9090" +trust-remote-code: true +tp-size: "8" +dp-size: "8" +ep-size: "8" +quantization: "fp8" +enable-dp-attention: true +attention-backend: "trtllm_mla" +kv-cache-dtype: "fp8_e4m3" +disable-radix-cache: true +stream-interval: "50" +max-running-requests: "30000" +context-length: "9300" +# decode-log-interval: "1" +# page-size: "1" +# disaggregation-transfer-backend: nixl +watchdog-timeout: "1000000" +disable-shared-experts-fusion: true +eplb-algorithm: deepseek +mem-fraction-static: "0.8" +max-total-tokens: "524288" +chunked-prefill-size: "131072" +load-balance-method: round_robin +disable-cuda-graph: true +moe-a2a-backend: deepep +deepep-mode: normal +ep-dispatch-algorithm: "dynamic" +moe-dense-tp-size: "1" +enable-dp-lm-head: true +ep-num-redundant-experts: "32" +deepep-config: '{"normal_dispatch": {"num_sms": 128,"num_max_nvl_chunked_send_tokens": 28,"num_max_nvl_chunked_recv_tokens": 256,"num_max_rdma_chunked_send_tokens": 6,"num_max_rdma_chunked_recv_tokens": 256}, "normal_combine": {"num_sms": 128,"num_max_nvl_chunked_send_tokens": 15,"num_max_nvl_chunked_recv_tokens": 256,"num_max_rdma_chunked_send_tokens": 6,"num_max_rdma_chunked_recv_tokens": 128}}' \ No newline at end of file diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-decode.yaml similarity index 96% rename from src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml rename to src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-decode.yaml index 82029f49..a2287217 100644 --- a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml +++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-decode.yaml @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the 
License. -model-path: /data/model/deepseek-ai/DeepSeek-R1 served-model-name: deepseek-ai/DeepSeek-R1 log-level: DEBUG tp: "8" diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-prefill.yaml similarity index 96% rename from src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml rename to src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-prefill.yaml index 939aa2cc..f2abbcd4 100644 --- a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml +++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-prefill.yaml @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -model-path: /data/model/deepseek-ai/DeepSeek-R1 served-model-name: deepseek-ai/DeepSeek-R1 log-level: DEBUG tp: "8" diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml index 67444375..0ac6cdf5 100644 --- a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml +++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml @@ -35,12 +35,14 @@ spec: memory: "10Gi" extraPodMetadata: annotations: + {{- if eq .Values.volumes.useGcs true }} gke-gcsfuse/volumes: "true" gke-gcsfuse/cpu-limit: "0" gke-gcsfuse/memory-limit: "0" gke-gcsfuse/ephemeral-storage-limit: "0" gke-gcsfuse/file-cache-capacity: "500Gi" gke-gcsfuse/cache-path: "/gcs-cache" + {{- end }} extraPodSpec: tolerations: - key: "kubernetes.io/arch" @@ -53,21 +55,24 @@ spec: volumes: - name: local-ssd emptyDir: {} + {{- if eq .Values.volumes.useGcs true }} - name: gcs-model-volume csi: driver: gcsfuse.csi.storage.gke.io volumeAttributes: bucketName: {{ .Values.volumes.gcsfuse.bucketName }} mountOptions: 
"implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:50,file-cache:max-size-mb:-1" - + {{- end }} mainContainer: image: {{ .Values.dynamo.frontend.image }} + {{- if eq .Values.volumes.useGcs true }} volumeMounts: - name: local-ssd mountPath: /gcs-cache - name: gcs-model-volume mountPath: /data/model readOnly: true + {{- end }} resources: requests: ephemeral-storage: "30Gi" @@ -108,15 +113,21 @@ spec: envs: - name: SERVER_ARGS_FILE value: {{ .Values.workload.configPath }}/{{ .Values.workload.configFile }} + {{- if eq .Values.volumes.useGcs true }} + - name: MODEL_PATH + value: {{ .Values.volumes.gcsMounts.mountPath }}/{{ .Values.workload.model }} + {{- end }} {{- if .Values.dynamo.decodeWorker.envs }} {{- toYaml .Values.dynamo.decodeWorker.envs | nindent 8 }} {{- end }} extraPodMetadata: annotations: + {{- if eq .Values.volumes.useGcs true }} gke-gcsfuse/cpu-limit: "0" gke-gcsfuse/ephemeral-storage-limit: "0" gke-gcsfuse/memory-limit: "0" gke-gcsfuse/volumes: "true" + {{- end }} networking.gke.io/default-interface: 'eth0' networking.gke.io/interfaces: | [ @@ -127,6 +138,9 @@ spec: {"interfaceName":"eth5","network":"rdma-3"} ] extraPodSpec: + {{- if .Values.dynamo.serviceAccountName }} + serviceAccountName: {{ .Values.dynamo.serviceAccountName }} + {{- end }} resourceClaims: - name: compute-domain-channel resourceClaimTemplateName: {{ .Values.dynamo.computeDomain.resourceClaimTemplateName }} @@ -164,8 +178,16 @@ spec: echo "--- VERIFYING NCCL ENV VARS IN SHELL ---" env | grep NCCL_ echo "--- END VERIFICATION ---" + pip install hf_transfer ARGS=() + if [ -n "$MODEL_PATH" ]; then + echo "Adding model path from env var: $MODEL_PATH" + ARGS+=("--model-path" "$MODEL_PATH") + else + echo "No MODEL_PATH env var set from gcsfuse, relying on config file for model" + ARGS+=("--model" "{{ .Values.workload.model }}") + fi if [ -f "$SERVER_ARGS_FILE" ]; then echo 
"Loading server arguments from ConfigMap" while IFS=': ' read -r key value || [ -n "$key" ]; do @@ -190,8 +212,10 @@ spec: exec python3 -m dynamo.sglang "${ARGS[@]}" volumeMounts: + {{- if eq .Values.volumes.useGcs true }} - mountPath: /data/model name: gcs-model-volume + {{- end }} - name: library-dir-host mountPath: /usr/local/nvidia - name: gib @@ -199,12 +223,14 @@ spec: - name: serving-configuration mountPath: {{ .Values.workload.configPath | default "/workload/configs" }} volumes: + {{- if eq .Values.volumes.useGcs true }} - name: gcs-model-volume csi: driver: gcsfuse.csi.storage.gke.io volumeAttributes: bucketName: {{ .Values.volumes.gcsfuse.bucketName }} mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1 + {{- end }} - name: library-dir-host hostPath: path: /home/kubernetes/bin/nvidia @@ -227,9 +253,11 @@ spec: subComponentType: prefill replicas: {{ .Values.dynamo.prefillWorker.replicas }} livenessProbe: - httpGet: - path: /live - port: system + exec: + command: + - /bin/sh + - -c + - "exit 0" initialDelaySeconds: {{ .Values.dynamo.prefillWorker.livenessProbe.initialDelaySeconds }} periodSeconds: {{ .Values.dynamo.prefillWorker.livenessProbe.periodSeconds }} timeoutSeconds: {{ .Values.dynamo.prefillWorker.livenessProbe.timeoutSeconds }} @@ -252,15 +280,21 @@ spec: envs: - name: SERVER_ARGS_FILE value: {{ .Values.workload.configPath }}/{{ .Values.workload.configFile }} + {{- if eq .Values.volumes.useGcs true }} + - name: MODEL_PATH + value: {{ .Values.volumes.gcsMounts.mountPath }}/{{ .Values.workload.model }} + {{- end }} {{- if .Values.dynamo.prefillWorker.envs }} {{- toYaml .Values.dynamo.prefillWorker.envs | nindent 8 }} {{- end }} extraPodMetadata: annotations: + {{- if eq .Values.volumes.useGcs true }} gke-gcsfuse/cpu-limit: "0" gke-gcsfuse/ephemeral-storage-limit: "0" 
gke-gcsfuse/memory-limit: "0" gke-gcsfuse/volumes: "true" + {{- end }} networking.gke.io/default-interface: 'eth0' networking.gke.io/interfaces: | [ @@ -271,6 +305,9 @@ spec: {"interfaceName":"eth5","network":"rdma-3"} ] extraPodSpec: + {{- if .Values.dynamo.serviceAccountName }} + serviceAccountName: {{ .Values.dynamo.serviceAccountName }} + {{- end }} resourceClaims: - name: compute-domain-channel resourceClaimTemplateName: {{ .Values.dynamo.computeDomain.resourceClaimTemplateName }} @@ -304,10 +341,16 @@ spec: set -e nvidia-smi . /usr/local/gib/scripts/set_nccl_env.sh - echo "Pre-compiling DeepGEMM kernels for Prefill Worker..." - echo "Finished pre-compiling DeepGEMM kernels for Prefill Worker." + pip install hf_transfer ARGS=() + if [ -n "$MODEL_PATH" ]; then + echo "Adding model path from env var: $MODEL_PATH" + ARGS+=("--model-path" "$MODEL_PATH") + else + echo "No MODEL_PATH env var set from gcsfuse, relying on config file for model" + ARGS+=("--model" "{{ .Values.workload.model }}") + fi if [ -f "$SERVER_ARGS_FILE" ]; then echo "Loading server arguments from ConfigMap" while IFS=': ' read -r key value || [ -n "$key" ]; do @@ -332,8 +375,10 @@ spec: exec python3 -m dynamo.sglang "${ARGS[@]}" volumeMounts: + {{- if eq .Values.volumes.useGcs true }} - mountPath: /data/model name: gcs-model-volume + {{- end }} - name: library-dir-host mountPath: /usr/local/nvidia - name: gib @@ -341,12 +386,14 @@ spec: - name: serving-configuration mountPath: {{ .Values.workload.configPath | default "/workload/configs" }} volumes: + {{- if eq .Values.volumes.useGcs true }} - name: gcs-model-volume csi: driver: gcsfuse.csi.storage.gke.io volumeAttributes: bucketName: {{ .Values.volumes.gcsfuse.bucketName }} mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1 + {{- end }} - name: library-dir-host hostPath: path: 
/home/kubernetes/bin/nvidia From 3b5167275f53ce4b13913d89a54675d99e80e608 Mon Sep 17 00:00:00 2001 From: Yijia J Date: Tue, 13 Jan 2026 01:32:10 +0000 Subject: [PATCH 06/12] nit, update README and value to 18 nodes --- .../disaggregated-serving/dynamo/README.md | 37 +++++++++++++++++-- .../disaggregated-serving/dynamo/values.yaml | 36 +++++------------- .../deepseekr1-fp8-10p8d-decode.yaml | 3 -- .../deepseekr1-fp8-10p8d-prefill.yaml | 4 -- 4 files changed, 42 insertions(+), 38 deletions(-) diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md index b185b4fb..0a02a267 100644 --- a/inference/a4x/disaggregated-serving/dynamo/README.md +++ b/inference/a4x/disaggregated-serving/dynamo/README.md @@ -14,6 +14,7 @@ Dynamo provides a disaggregated architecture that separates prefill and decode o * [2.3. Connect to your GKE Cluster](#connect-cluster) * [2.4. Create Secrets](#create-secrets) * [2.5. Install Dynamo Platform](#install-platform) + * [2.6. Setup GCS Bucket for GKE ](#setup-gcsfuse) * [3. Deploy with SGLang Backend](#deploy-sglang) * [3.1. Multi-Node SGLang Deployment (16 GPUs)](#sglang-multi-node) * [4. Inference Request](#inference-request) @@ -62,6 +63,7 @@ export NAMESPACE=dynamo-cloud export NGC_API_KEY= export HF_TOKEN= export RELEASE_VERSION=0.7.0 +export GCS_BUCKET= # Set the project for gcloud commands gcloud config set project $PROJECT_ID @@ -76,6 +78,7 @@ Replace the following values: | `CLUSTER_NAME` | The name of your GKE cluster | `a4x-cluster` | | `NGC_API_KEY` | Your NVIDIA NGC API key (get from [NGC](https://ngc.nvidia.com)) | `nvapi-xxx...` | | `HF_TOKEN` | Your Hugging Face access token | `hf_xxx...` | +| `GCS_BUCKET` | Your GCS bucket name | `gs://xxx` | ### 2.3. Connect to your GKE Cluster @@ -146,6 +149,32 @@ kubectl get pods -n ${NAMESPACE} Wait until all pods show a `Running` status before proceeding. + +### 2.6. 
Setup GCS Bucket for GKE (One-Time Setup) + +It is recommended to utilize [gcsfuse](https://docs.cloud.google.com/kubernetes-engine/docs/how-to/cloud-storage-fuse-csi-driver-setup) to facilitate model access and mitigate [huggingface rate limiting](https://huggingface.co/docs/hub/en/rate-limits#hub-rate-limits) issues. + +Find the service account (usually annotated to default): +```bash +kubectl get serviceaccounts -n ${NAMESPACE} -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.annotations.iam\.gke\.io/gcp-service-account}{"\n"}{end}' +``` + +Authorize the service account: +```bash +gcloud iam service-accounts add-iam-policy-binding xxx@project_id.iam.gserviceaccount.com \ + --role roles/iam.workloadIdentityUser \ + --member "serviceAccount:project_id.svc.id.goog[${NAMESPACE}/default]" +``` + +Grant read access to the bucket: +```bash +gcloud storage buckets add-iam-policy-binding ${GCS_BUCKET} \ + --member "serviceAccount:xxx@project_id.iam.gserviceaccount.com" \ + --role "roles/storage.objectViewer" +``` + +Finally, download the model files into the GCS bucket. + ## 3. Deploy with SGLang Backend @@ -154,9 +183,9 @@ Wait until all pods show a `Running` status before proceeding. Deploy Dynamo with SGLang backend for high-performance inference. -### 3.1. Multi-Node vLLM Deployment (16 GPUs) +### 3.1. Multi-Node SGLang Deployment (72 GPUs) -Multi-node deployment uses 16 GPUs across 4 A4X machines, providing increased capacity for larger models or higher throughput. +Multi-node deployment uses 72 GPUs across 18 A4X machines, providing increased capacity for larger models or higher throughput. #### DeepSeekR1 671B Model @@ -165,8 +194,8 @@ Deploy DeepSeekR1-671B across multiple nodes for production workloads.
Note the ```bash cd $RECIPE_ROOT helm install -f values.yaml \ ---set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml \ ---set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml \ +--set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml \ +--set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml \ $USER-dynamo-a4x-multi-node \ $REPO_ROOT/src/helm-charts/a4x/inference-templates/dynamo-deployment ``` diff --git a/inference/a4x/disaggregated-serving/dynamo/values.yaml b/inference/a4x/disaggregated-serving/dynamo/values.yaml index a047a65f..06954aae 100644 --- a/inference/a4x/disaggregated-serving/dynamo/values.yaml +++ b/inference/a4x/disaggregated-serving/dynamo/values.yaml @@ -13,17 +13,17 @@ # limitations under the License. dynamo: - namespace: yijiaj-test + namespace: dynamo-cloud releaseVersion: "0.7.0" - deploymentName: disagg2p2d-yijiaj + deploymentName: disagg10p8d computeDomain: - name: yijiaj-a4x-domain - numNodes: 4 - resourceClaimTemplateName: yijiaj-a4x-channel - serviceAccountName: dynamo-platform-dynamo-operator-component + name: a4x-domain + numNodes: 18 + resourceClaimTemplateName: a4x-channel + serviceAccountName: frontend: image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.0 - replicas: 1 + replicas: 9 livenessProbe: initialDelaySeconds: 3000 periodSeconds: 60 @@ -36,8 +36,7 @@ dynamo: failureThreshold: 100 decodeWorker: image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout - #image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1 - nodeCount: 2 + nodeCount: 8 replicas: 1 envs: - name: HF_TOKEN @@ -53,16 +52,12 @@ dynamo: value: eth0 - name: TP_SOCKET_IFNAME value: eth0 - # - name: SGLANG_ENABLE_JIT_DEEPGEMM 
- # value: "1" - name: DYN_SKIP_SGLANG_LOG_FORMATTING value: "1" - name: SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK value: "256" - name: MC_TE_METRIC value: "true" - # - name: SGLANG_ENABLE_FLASHINFER_GEMM - # value: "1" - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE value: "100000" - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT @@ -87,14 +82,6 @@ dynamo: value: "1" - name: PYTHONUNBUFFERED value: "1" - # - name: NCCL_DEBUG - # value: INFO - # - name: NCCL_DEBUG_SUBSYS - # value: INIT,BOOTSTRAP,ENV,NET,GRAPH - # - name: NCCL_SOCKET_FAMILY - # value: "AF_INET" - # - name: GLOO_SOCKET_FAMILY - # value: "AF_INET" livenessProbe: initialDelaySeconds: 3000 periodSeconds: 60 @@ -112,9 +99,8 @@ dynamo: failureThreshold: 3000 prefillWorker: image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout - #image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1 nodeCount: 2 - replicas: 1 + replicas: 5 envs: - name: HF_TOKEN valueFrom: @@ -131,14 +117,10 @@ dynamo: value: eth0 - name: TP_SOCKET_IFNAME value: eth0 - # - name: SGLANG_ENABLE_JIT_DEEPGEMM - # value: "1" - name: DYN_SKIP_SGLANG_LOG_FORMATTING value: "1" - name: MC_TE_METRIC value: "true" - # - name: SGLANG_ENABLE_FLASHINFER_GEMM - # value: "1" - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE value: "100000" - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml index bbbdf18f..4369e1ce 100644 --- a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml +++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml @@ -23,13 +23,11 @@ tp-size: "32" dp-size: "32" ep-size: "32" quantization: "fp8" -# page-size: "1" enable-dp-attention: true attention-backend: "trtllm_mla" kv-cache-dtype: "fp8_e4m3" disable-radix-cache: true stream-interval: 
"50" -# disaggregation-transfer-backend: nixl decode-log-interval: "1000" max-running-requests: "8192" context-length: "9300" @@ -46,5 +44,4 @@ enable-dp-lm-head: true prefill-round-robin-balance: true ep-num-redundant-experts: "32" cuda-graph-max-bs: "256" -# disable-cuda-graph: true deepep-config: '{"normal_dispatch": {"num_sms": 128,"num_max_nvl_chunked_send_tokens": 28,"num_max_nvl_chunked_recv_tokens": 256,"num_max_rdma_chunked_send_tokens": 6,"num_max_rdma_chunked_recv_tokens": 256}, "normal_combine": {"num_sms": 128,"num_max_nvl_chunked_send_tokens": 15,"num_max_nvl_chunked_recv_tokens": 256,"num_max_rdma_chunked_send_tokens": 6,"num_max_rdma_chunked_recv_tokens": 128}}' \ No newline at end of file diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml index f5748607..9c86f420 100644 --- a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml +++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml @@ -13,7 +13,6 @@ # limitations under the License. 
served-model-name: deepseek-ai/DeepSeek-R1 -# log-level: DEBUG disaggregation-mode: prefill disaggregation-bootstrap-port: "30001" host: "0.0.0.0" @@ -30,9 +29,6 @@ disable-radix-cache: true stream-interval: "50" max-running-requests: "30000" context-length: "9300" -# decode-log-interval: "1" -# page-size: "1" -# disaggregation-transfer-backend: nixl watchdog-timeout: "1000000" disable-shared-experts-fusion: true eplb-algorithm: deepseek From fb7dfa5dc94988875435a27e7a63a25252431661 Mon Sep 17 00:00:00 2001 From: Yijia J Date: Wed, 14 Jan 2026 17:30:37 +0000 Subject: [PATCH 07/12] Add 8GPU recipe, modify README --- .../disaggregated-serving/dynamo/README.md | 33 +- .../disaggregated-serving/dynamo/test.yaml | 389 ++++++++++++++++++ .../{values.yaml => values_deepep.yaml} | 33 +- .../dynamo/values_wo_deepep.yaml | 203 +++++++++ ...e.yaml => deepseekr1-fp8-1p1d-decode.yaml} | 45 +- ....yaml => deepseekr1-fp8-1p1d-prefill.yaml} | 43 +- .../templates/dynamo-graph-deployment.yaml | 12 +- 7 files changed, 682 insertions(+), 76 deletions(-) create mode 100644 inference/a4x/disaggregated-serving/dynamo/test.yaml rename inference/a4x/disaggregated-serving/dynamo/{values.yaml => values_deepep.yaml} (91%) create mode 100644 inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml rename src/frameworks/a4x/dynamo-configs/{deepseekr1-fp8-2p2d-decode.yaml => deepseekr1-fp8-1p1d-decode.yaml} (59%) rename src/frameworks/a4x/dynamo-configs/{deepseekr1-fp8-2p2d-prefill.yaml => deepseekr1-fp8-1p1d-prefill.yaml} (62%) diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md index 0a02a267..df1d99e0 100644 --- a/inference/a4x/disaggregated-serving/dynamo/README.md +++ b/inference/a4x/disaggregated-serving/dynamo/README.md @@ -16,7 +16,8 @@ Dynamo provides a disaggregated architecture that separates prefill and decode o * [2.5. Install Dynamo Platform](#install-platform) * [2.6. 
Setup GCS Bucket for GKE ](#setup-gcsfuse) * [3. Deploy with SGLang Backend](#deploy-sglang) - * [3.1. Multi-Node SGLang Deployment (16 GPUs)](#sglang-multi-node) + * [3.1. SGLang Deployment without DeepEP (8 GPUs)](#sglang-wo-deepep) + * [3.2. SGLang Deployment with DeepEP (72 GPUs)](#sglang-deepep) * [4. Inference Request](#inference-request) * [5. Monitoring and Troubleshooting](#monitoring) * [6. Cleanup](#cleanup) @@ -180,16 +181,34 @@ Downloading model files into the gcs bucket. [Back to Top](#table-of-contents) -Deploy Dynamo with SGLang backend for high-performance inference. +Deploy Dynamo with SGLang backend for high-performance inference. - -### 3.1. Multi-Node SGLang Deployment (72 GPUs) + +### 3.1. SGLang Deployment without DeepEP (8 GPUs) -Multi-node deployment uses 72 GPUs across 18 A4X machines, providing increased capacity for larger models or higher throughput. +The two-node deployment uses 8 GPUs across 2 A4X machines, targeting low latency. #### DeepSeekR1 671B Model -Deploy DeepSeekR1-671B across multiple nodes for production workloads. Note the use of `--set-file prefill_serving_config` and `--set-file decode_serving_config` pointing to the correct model config file for a multi node deployment scenario: +Deploy DeepSeekR1-671B across 2 nodes for testing and validation. Note the use of `--set-file prefill_serving_config` and `--set-file decode_serving_config` pointing to the correct model config file. + +```bash +cd $RECIPE_ROOT +helm install -f values_wo_deepep.yaml \ +--set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-prefill.yaml \ +--set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-decode.yaml \ +$USER-dynamo-a4x-1p1d \ +$REPO_ROOT/src/helm-charts/a4x/inference-templates/dynamo-deployment +``` + + +### 3.2.
SGLang Deployment with DeepEP (72 GPUs) + +Multi-node deployment uses 72 GPUs across 18 A4X machines, providing increased capacity for larger models or higher throughput. + +#### DeepSeekR1 671B Model + +Deploy DeepSeekR1-671B across 18 nodes for production workloads. Note the use of `--set-file prefill_serving_config` and `--set-file decode_serving_config` pointing to the correct model config file for a multi node deployment scenario: ```bash cd $RECIPE_ROOT @@ -251,7 +270,7 @@ You should see a server status like this. Wait for it to be in a `healthy` state Then we can send a benchmark request with like this: ```bash -kubectl exec -n ${NAMESPACE} $USER-dynamo-multi-node-serving-frontend -- python3 -u -m sglang.bench_serving --backend sglang-oai-chat --base-url http://localhost:8000 --model "deepseek-ai/DeepSeek-R1" --tokenizer /data/model/deepseek-ai/DeepSeek-R1 --dataset-name random --num-prompts 2048 --random-input-len 2048 --random-output-len 512 --max-concurrency 512 +kubectl exec -n ${NAMESPACE} $USER-dynamo-multi-node-serving-frontend -- python3 -u -m sglang.bench_serving --backend sglang-oai-chat --base-url http://localhost:8000 --model "deepseek-ai/DeepSeek-R1" --tokenizer /data/model/deepseek-ai/DeepSeek-R1 --dataset-name random --num-prompts 10240 --random-input-len 8192 --random-range-ratio 0.8 --random-output-len 1024 --max-concurrency 2048 ``` diff --git a/inference/a4x/disaggregated-serving/dynamo/test.yaml b/inference/a4x/disaggregated-serving/dynamo/test.yaml new file mode 100644 index 00000000..92e43f57 --- /dev/null +++ b/inference/a4x/disaggregated-serving/dynamo/test.yaml @@ -0,0 +1,389 @@ +apiVersion: resource.nvidia.com/v1beta1 +kind: ComputeDomain +metadata: + name: a4x-compute-domain-test + namespace: yijiaj-test +spec: + numNodes: 2 + channel: + resourceClaimTemplate: + name: a4x-compute-domain-channel-test +--- +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: yijiaj-test-1p1d +spec: + services: + Frontend: 
+ dynamoNamespace: yijiaj-test + componentType: frontend + replicas: 9 + resources: + requests: + cpu: "5" + memory: "50Gi" + limits: + cpu: "5" + memory: "50Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.0 + workingDir: /sgl-workspace/dynamo/components/backends/sglang + stdin: true + tty: true + command: + - /bin/sh + - -c + args: + - "python3 -m dynamo.frontend --http-port 8000" + Decode: + envFromSecret: hf-token-secret + livenessProbe: + httpGet: + path: /live + port: system + initialDelaySeconds: 600 + periodSeconds: 30 + timeoutSeconds: 15 + failureThreshold: 5 + readinessProbe: + httpGet: + path: /health + port: system + initialDelaySeconds: 60 + timeoutSeconds: 30 + periodSeconds: 60 + failureThreshold: 60 + dynamoNamespace: yijiaj-test + componentType: worker + replicas: 1 + resources: + limits: + gpu: "4" + claims: + - name: compute-domain-channel + sharedMemory: + size: 80Gi + envs: + - name: LD_LIBRARY_PATH + value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64" + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: TP_SOCKET_IFNAME + value: eth0 + + - name: PYTHONUNBUFFERED + value: "1" + - name: DYN_SKIP_SGLANG_LOG_FORMATTING + value: "1" + - name: SGLANG_ENABLE_JIT_DEEPGEMM + value: "false" + - name: SGLANG_ENABLE_FLASHINFER_GEMM + value: "1" + - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE + value: "100000" + - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT + value: "100000" + - name: SGLANG_DISAGGREGATION_WAITING_TIMEOUT + value: "100000" + - name: SGLANG_DECODE_BOOTSTRAP_TIMEOUT + value: "1000" + - name: SGLANG_HACK_SEQ_BOOTSTRAP_ROOM + value: "1" + - name: SGLANG_MOONCAKE_CUSTOM_MEM_POOL + value: "True" + - name: SGLANG_USE_MESSAGE_QUEUE_BROADCASTER + value: "0" + - name: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK + value: "1" + - name: MC_TE_METRIC + value: "true" + - name: MC_FORCE_MNNVL 
+ value: "1" + - name: NCCL_MNNVL_ENABLE + value: "1" + - name: NCCL_CUMEM_ENABLE + value: "1" + + + extraPodMetadata: + annotations: + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/volumes: "true" + networking.gke.io/default-interface: 'eth0' + networking.gke.io/interfaces: | + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth2","network":"rdma-0"}, + {"interfaceName":"eth3","network":"rdma-1"}, + {"interfaceName":"eth4","network":"rdma-2"}, + {"interfaceName":"eth5","network":"rdma-3"} + ] + extraPodSpec: + resourceClaims: + - name: compute-domain-channel + resourceClaimTemplateName: a4x-compute-domain-channel-test + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/arch + operator: In + values: + - arm64 + volumes: + - name: model-src + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: yijiaj-test + mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1 + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + mainContainer: + securityContext: + privileged: true + startupProbe: + failureThreshold: 1800 + httpGet: + path: /live + port: system + periodSeconds: 10 + timeoutSeconds: 5 + image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout + workingDir: /sgl-workspace/dynamo/components/backends/sglang + command: ["/bin/bash", "-c"] + stdin: true + tty: true + volumeMounts: + - mountPath: /data/model + name: model-src + - name: library-dir-host + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + args: + - | + set -e + + nvidia-smi + . 
/usr/local/gib/scripts/set_nccl_env.sh + + echo "--- VERIFYING NCCL ENV VARS IN SHELL ---" + env | grep NCCL_ + echo "--- END VERIFICATION ---" + + exec python3 -m dynamo.sglang \ + --enable-metrics \ + --model-path /data/model/deepseek-ai/DeepSeek-R1 \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --disaggregation-bootstrap-port 30001 \ + --disaggregation-mode decode \ + --host 0.0.0.0 \ + --port 8000 \ + --disable-radix-cache \ + --tensor-parallel-size 4 \ + --data-parallel-size 1 \ + --expert-parallel-size 1 \ + --trust-remote-code \ + --kv-cache-dtype fp8_e4m3 \ + --attention-backend trtllm_mla \ + --quantization fp8 \ + --moe-runner-backend flashinfer_trtllm \ + --disable-radix-cache \ + --watchdog-timeout 1000000 \ + --context-length 9600 \ + --mem-fraction-static 0.95 \ + --chunked-prefill-size 8192 \ + --cuda-graph-max-bs 512 \ + --max-running-requests 512 \ + --scheduler-recv-interval 10 \ + --enable-flashinfer-allreduce-fusion \ + --enable-symm-mem \ + --moe-dense-tp-size 1 \ + --prefill-round-robin-balance + + Prefill: + envFromSecret: hf-token-secret + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "exit 0" + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 + readinessProbe: + httpGet: + path: /health + port: system + initialDelaySeconds: 60 + timeoutSeconds: 30 + periodSeconds: 60 + failureThreshold: 60 + dynamoNamespace: yijiaj-test + componentType: worker + replicas: 1 + resources: + requests: + cpu: "130" + memory: "800Gi" + limits: + gpu: "4" + claims: + - name: compute-domain-channel + sharedMemory: + size: 80Gi + envs: + - name: LD_LIBRARY_PATH + value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64" + - name: UCX_TLS + value: "^tcp" + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: TP_SOCKET_IFNAME + value: eth0 + + + - name: PYTHONUNBUFFERED + value: "1" + - name: DYN_SKIP_SGLANG_LOG_FORMATTING + 
value: "1" + - name: SGLANG_ENABLE_JIT_DEEPGEMM + value: "false" + - name: SGLANG_ENABLE_FLASHINFER_GEMM + value: "1" + - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE + value: "100000" + - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT + value: "100000" + - name: SGLANG_DISAGGREGATION_WAITING_TIMEOUT + value: "100000" + - name: SGLANG_MOONCAKE_CUSTOM_MEM_POOL + value: "True" + - name: SGLANG_USE_MESSAGE_QUEUE_BROADCASTER + value: "0" + - name: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK + value: "1" + - name: MC_TE_METRIC + value: "true" + - name: MC_FORCE_MNNVL + value: "1" + - name: NCCL_MNNVL_ENABLE + value: "1" + - name: NCCL_CUMEM_ENABLE + value: "1" + + extraPodMetadata: + annotations: + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/volumes: "true" + networking.gke.io/default-interface: 'eth0' + networking.gke.io/interfaces: | + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth2","network":"rdma-0"}, + {"interfaceName":"eth3","network":"rdma-1"}, + {"interfaceName":"eth4","network":"rdma-2"}, + {"interfaceName":"eth5","network":"rdma-3"} + ] + extraPodSpec: + resourceClaims: + - name: compute-domain-channel + resourceClaimTemplateName: a4x-compute-domain-channel-test + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/arch + operator: In + values: + - arm64 + volumes: + - name: model-src + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: yijiaj-test + mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1 + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + mainContainer: + startupProbe: + failureThreshold: 1800 
+ httpGet: + path: /live + port: system + periodSeconds: 10 + timeoutSeconds: 5 + securityContext: + privileged: true + stdin: true + tty: true + volumeMounts: + - mountPath: /data/model + name: model-src + - name: library-dir-host + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout + workingDir: /sgl-workspace/dynamo/components/backends/sglang + command: ["/bin/bash", "-c"] + args: + - | + set -e + + nvidia-smi + . /usr/local/gib/scripts/set_nccl_env.sh + + echo "--- VERIFYING NCCL ENV VARS IN SHELL ---" + env | grep NCCL_ + echo "--- END VERIFICATION ---" + + exec python3 -m dynamo.sglang \ + --enable-metrics \ + --model-path /data/model/deepseek-ai/DeepSeek-R1 \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --disaggregation-bootstrap-port 30001 \ + --disaggregation-mode prefill \ + --host 0.0.0.0 \ + --port 8000 \ + --disable-radix-cache \ + --tensor-parallel-size 4 \ + --data-parallel-size 1 \ + --expert-parallel-size 1 \ + --trust-remote-code \ + --kv-cache-dtype fp8_e4m3 \ + --attention-backend trtllm_mla \ + --quantization fp8 \ + --moe-runner-backend flashinfer_trtllm \ + --disable-radix-cache \ + --watchdog-timeout 1000000 \ + --context-length 9600 \ + --mem-fraction-static 0.95 \ + --max-total-tokens 32768 \ + --chunked-prefill-size 24576 \ + --cuda-graph-max-bs 512 \ + --max-running-requests 512 \ + --load-balance-method round_robin \ + --scheduler-recv-interval 10 \ + --enable-flashinfer-allreduce-fusion \ + --moe-dense-tp-size 1 + diff --git a/inference/a4x/disaggregated-serving/dynamo/values.yaml b/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml similarity index 91% rename from inference/a4x/disaggregated-serving/dynamo/values.yaml rename to inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml index 06954aae..d5a02dc6 100644 --- 
a/inference/a4x/disaggregated-serving/dynamo/values.yaml +++ b/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml @@ -15,7 +15,7 @@ dynamo: namespace: dynamo-cloud releaseVersion: "0.7.0" - deploymentName: disagg10p8d + deploymentName: dynamo-disagg10p8d computeDomain: name: a4x-domain numNodes: 18 @@ -83,20 +83,20 @@ dynamo: - name: PYTHONUNBUFFERED value: "1" livenessProbe: - initialDelaySeconds: 3000 + initialDelaySeconds: 600 periodSeconds: 60 - timeoutSeconds: 150 - failureThreshold: 100 + timeoutSeconds: 30 + failureThreshold: 60 readinessProbe: - initialDelaySeconds: 3000 + initialDelaySeconds: 60 periodSeconds: 60 - timeoutSeconds: 300 - failureThreshold: 100 + timeoutSeconds: 30 + failureThreshold: 60 startupProbe: initialDelaySeconds: 3000 periodSeconds: 60 - timeoutSeconds: 600 - failureThreshold: 3000 + timeoutSeconds: 30 + failureThreshold: 1800 prefillWorker: image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout nodeCount: 2 @@ -144,18 +144,18 @@ dynamo: livenessProbe: initialDelaySeconds: 3000 periodSeconds: 60 - timeoutSeconds: 150 - failureThreshold: 100 + timeoutSeconds: 30 + failureThreshold: 60 readinessProbe: initialDelaySeconds: 3000 periodSeconds: 60 - timeoutSeconds: 300 - failureThreshold: 100 + timeoutSeconds: 30 + failureThreshold: 60 startupProbe: initialDelaySeconds: 3000 periodSeconds: 60 - timeoutSeconds: 600 - failureThreshold: 3000 + timeoutSeconds: 30 + failureThreshold: 1800 secrets: @@ -170,8 +170,6 @@ volumes: useGcs: true gcsfuse: bucketName: "yijiaj-test" - fileCacheCapacity: "500G" - cachePath: "/gcs-cache" ssdMountPath: "/ssd" gcsMounts: mountPath: "/data/model" @@ -184,7 +182,6 @@ service: workload: model: deepseek-ai/DeepSeek-R1 - gpus: 16 framework: sglang configFile: serving-args.yaml configPath: /workload/configs diff --git a/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml 
b/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml new file mode 100644 index 00000000..bf992302 --- /dev/null +++ b/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml @@ -0,0 +1,203 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +dynamo: + namespace: yijiaj-test + releaseVersion: "0.7.0" + deploymentName: dynamo-disagg1p1d + computeDomain: + name: a4x-domain + numNodes: 2 + resourceClaimTemplateName: a4x-channel + serviceAccountName: + frontend: + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.0 + replicas: 9 + livenessProbe: + initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 150 + failureThreshold: 100 + readinessProbe: + initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 300 + failureThreshold: 100 + decodeWorker: + image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout + nodeCount: 1 + replicas: 1 + envs: + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: HF_TOKEN + - name: HF_HUB_ENABLE_HF_TRANSFER + value: "1" + - name: LD_LIBRARY_PATH + value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64" + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: TP_SOCKET_IFNAME + value: eth0 + - name: PYTHONUNBUFFERED + value: "1" + - name: DYN_SKIP_SGLANG_LOG_FORMATTING + 
value: "1" + - name: SGLANG_ENABLE_JIT_DEEPGEMM + value: "false" + - name: SGLANG_ENABLE_FLASHINFER_GEMM + value: "1" + - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE + value: "100000" + - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT + value: "100000" + - name: SGLANG_DISAGGREGATION_WAITING_TIMEOUT + value: "100000" + - name: SGLANG_DECODE_BOOTSTRAP_TIMEOUT + value: "1000" + - name: SGLANG_HACK_SEQ_BOOTSTRAP_ROOM + value: "1" + - name: SGLANG_MOONCAKE_CUSTOM_MEM_POOL + value: "True" + - name: SGLANG_USE_MESSAGE_QUEUE_BROADCASTER + value: "0" + - name: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK + value: "1" + - name: MC_TE_METRIC + value: "true" + - name: MC_FORCE_MNNVL + value: "1" + - name: NCCL_MNNVL_ENABLE + value: "1" + - name: NCCL_CUMEM_ENABLE + value: "1" + livenessProbe: + initialDelaySeconds: 600 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 60 + readinessProbe: + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 60 + startupProbe: + initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 1800 + prefillWorker: + image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout + nodeCount: 1 + replicas: 1 + envs: + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: HF_TOKEN + - name: HF_HUB_ENABLE_HF_TRANSFER + value: "1" + - name: LD_LIBRARY_PATH + value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64" + - name: UCX_TLS + value: "^tcp" + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: TP_SOCKET_IFNAME + value: eth0 + - name: PYTHONUNBUFFERED + value: "1" + - name: DYN_SKIP_SGLANG_LOG_FORMATTING + value: "1" + - name: SGLANG_ENABLE_JIT_DEEPGEMM + value: "false" + - name: SGLANG_ENABLE_FLASHINFER_GEMM + value: "1" + - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE + 
value: "100000" + - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT + value: "100000" + - name: SGLANG_DISAGGREGATION_WAITING_TIMEOUT + value: "100000" + - name: SGLANG_MOONCAKE_CUSTOM_MEM_POOL + value: "True" + - name: SGLANG_USE_MESSAGE_QUEUE_BROADCASTER + value: "0" + - name: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK + value: "1" + - name: MC_TE_METRIC + value: "true" + - name: MC_FORCE_MNNVL + value: "1" + - name: NCCL_MNNVL_ENABLE + value: "1" + - name: NCCL_CUMEM_ENABLE + value: "1" + livenessProbe: + initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 60 + readinessProbe: + initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 60 + startupProbe: + initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 1800 + + +secrets: + ngc: + secretName: nvcr-secret + huggingface: + secretName: hf-token-secret + secretData: + token: "hf_api_token" + +volumes: + useGcs: true + gcsfuse: + bucketName: "yijiaj-test" + ssdMountPath: "/ssd" + gcsMounts: + mountPath: "/data/model" + +service: + type: ClusterIP + ports: + frontend: 8000 + worker: 9090 + +workload: + model: deepseek-ai/DeepSeek-R1 + framework: sglang + configFile: serving-args.yaml + configPath: /workload/configs + +network: + subnetworks: [] + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic-arm64:v1.0.7 + ncclSettings: + - name: NCCL_DEBUG + value: "VERSION" + +quantizations: + - "fp8" diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-decode.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-decode.yaml similarity index 59% rename from src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-decode.yaml rename to src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-decode.yaml index a2287217..ff0f3c47 100644 --- a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-decode.yaml +++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-decode.yaml @@ 
-12,34 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. +enable-metrics: true served-model-name: deepseek-ai/DeepSeek-R1 -log-level: DEBUG -tp: "8" -dp-size: "8" -decode-log-interval: "1" -page-size: "1" -enable-dp-attention: true -trust-remote-code: true -disaggregation-mode: decode -disaggregation-transfer-backend: nixl disaggregation-bootstrap-port: "30001" +disaggregation-mode: decode host: "0.0.0.0" port: "9090" -max-running-requests: "36864" -context-length: "2716" disable-radix-cache: true -moe-a2a-backend: deepep -prefill-round-robin-balance: true -deepep-mode: normal -moe-dense-tp-size: "1" -enable-dp-lm-head: true -disable-cuda-graph: true -cuda-graph-max-bs: "256" -disable-shared-experts-fusion: true -ep-num-redundant-experts: "32" -ep-dispatch-algorithm: static -eplb-algorithm: deepseek -attention-backend: cutlass_mla +tensor-parallel-size: 4 +data-parallel-size: 1 +expert-parallel-size: 1 +trust-remote-code: true +kv-cache-dtype: "fp8_e4m3" +attention-backend: "trtllm_mla" +quantization: "fp8" +moe-runner-backend: "flashinfer_trtllm" watchdog-timeout: "1000000" -chunked-prefill-size: "36864" -mem-fraction-static: "0.8" +context-length: "9600" +mem-fraction-static: "0.95" +chunked-prefill-size: "8192" +cuda-graph-max-bs: "512" +max-running-requests: "512" +scheduler-recv-interval: "10" +enable-flashinfer-allreduce-fusion: true +enable-symm-mem: true +moe-dense-tp-size: "1" +prefill-round-robin-balance: true \ No newline at end of file diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-prefill.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-prefill.yaml similarity index 62% rename from src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-prefill.yaml rename to src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-prefill.yaml index f2abbcd4..e42cb117 100644 --- a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-prefill.yaml +++ 
b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-prefill.yaml @@ -12,34 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. +enable-metrics: true served-model-name: deepseek-ai/DeepSeek-R1 -log-level: DEBUG -tp: "8" -dp-size: "8" -trust-remote-code: true -decode-log-interval: "1" -page-size: "1" -enable-dp-attention: true -disaggregation-mode: prefill -disaggregation-transfer-backend: nixl disaggregation-bootstrap-port: "30001" +disaggregation-mode: prefill host: "0.0.0.0" port: "9090" -max-running-requests: "6144" -context-length: "2716" disable-radix-cache: true -moe-a2a-backend: deepep -load-balance-method: round_robin -deepep-mode: normal -moe-dense-tp-size: "1" -enable-dp-lm-head: true -disable-shared-experts-fusion: true -ep-num-redundant-experts: "32" -ep-dispatch-algorithm: static -eplb-algorithm: deepseek -attention-backend: cutlass_mla +tensor-parallel-size: 4 +data-parallel-size: 1 +expert-parallel-size: 1 +trust-remote-code: true +kv-cache-dtype: "fp8_e4m3" +attention-backend: "trtllm_mla" +quantization: "fp8" +moe-runner-backend: "flashinfer_trtllm" watchdog-timeout: "1000000" -disable-cuda-graph: true -chunked-prefill-size: "16384" +context-length: "9600" +mem-fraction-static: "0.95" max-total-tokens: "32768" -mem-fraction-static: "0.8" +chunked-prefill-size: "24576" +cuda-graph-max-bs: "512" +max-running-requests: "512" +load-balance-method: round_robin +scheduler-recv-interval: "10" +enable-flashinfer-allreduce-fusion: true +moe-dense-tp-size: "1" \ No newline at end of file diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml index 0ac6cdf5..355db26a 100644 --- a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml +++ 
b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml @@ -29,10 +29,10 @@ spec: resources: requests: cpu: "5" - memory: "10Gi" + memory: "50Gi" limits: cpu: "5" - memory: "10Gi" + memory: "50Gi" extraPodMetadata: annotations: {{- if eq .Values.volumes.useGcs true }} @@ -80,8 +80,10 @@ spec: ephemeral-storage: "30Gi" Decode: + {{- if gt (int .Values.dynamo.decodeWorker.nodeCount) 1 }} multinode: nodeCount: {{ .Values.dynamo.decodeWorker.nodeCount }} + {{- end }} dynamoNamespace: {{ .Values.dynamo.namespace }} envFromSecret: {{ .Values.secrets.huggingface.secretName }} componentType: worker @@ -245,8 +247,10 @@ spec: path: {{ .Values.workload.configFile | default "serving-args.yaml" }} Prefill: + {{- if gt (int .Values.dynamo.prefillWorker.nodeCount) 1 }} multinode: nodeCount: {{ .Values.dynamo.prefillWorker.nodeCount }} + {{- end }} dynamoNamespace: {{ .Values.dynamo.namespace }} envFromSecret: {{ .Values.secrets.huggingface.secretName }} componentType: worker @@ -341,6 +345,10 @@ spec: set -e nvidia-smi . 
/usr/local/gib/scripts/set_nccl_env.sh + + echo "--- VERIFYING NCCL ENV VARS IN SHELL ---" + env | grep NCCL_ + echo "--- END VERIFICATION ---" pip install hf_transfer ARGS=() From b2cb48336e3c732718f23547c621e2c092cefd3f Mon Sep 17 00:00:00 2001 From: Yijia J Date: Wed, 14 Jan 2026 20:18:28 +0000 Subject: [PATCH 08/12] update README, nit --- .../disaggregated-serving/dynamo/README.md | 58 +-- .../dynamo/bench_clint.yaml | 47 +++ .../disaggregated-serving/dynamo/test.yaml | 389 ------------------ .../dynamo/values_deepep.yaml | 2 +- .../dynamo/values_wo_deepep.yaml | 4 +- 5 files changed, 66 insertions(+), 434 deletions(-) create mode 100644 inference/a4x/disaggregated-serving/dynamo/bench_clint.yaml delete mode 100644 inference/a4x/disaggregated-serving/dynamo/test.yaml diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md index df1d99e0..90849044 100644 --- a/inference/a4x/disaggregated-serving/dynamo/README.md +++ b/inference/a4x/disaggregated-serving/dynamo/README.md @@ -174,7 +174,7 @@ gcloud storage buckets add-iam-policy-binding ${GCS_BUCKET} \ --role "roles/storage.objectViewer" ``` -Downloading model files into the gcs bucket. +Downloading model files into the gcs bucket and set your gcs bucket name in values.yaml file. ## 3. Deploy with SGLang Backend @@ -223,51 +223,25 @@ $REPO_ROOT/src/helm-charts/a4x/inference-templates/dynamo-deployment ## 4. Inference Request [Back to Top](#table-of-contents) -To make an inference request to test the server, we can first run a health check against the server using `curl` +Check if the pods are in `Running` status before sending inference requests. ```bash -kubectl exec -it -n ${NAMESPACE} deployment/$USER-dynamo-a4x-multi-node -- curl http://localhost:8000/health | jq +kubectl get pods -n ${NAMESPACE} +``` + +We can then deploy the benchmark clint and send benchark request. 
+Deploy the benchmark clint like this: +```bash +kubectl apply -f bench_clint.yaml -n ${NAMESPACE} +``` + +And send the request like this: + +```bash +kubectl exec -it bench-client -- bash -c "cd /workspace/dynamo/examples/backends/sglang/slurm_jobs/scripts/vllm && python3 -u benchmark_serving.py --host $USER-dynamo-a4x-1p1d-frontend --port 8000 --model deepseek-ai/DeepSeek-R1 --tokenizer deepseek-ai/DeepSeek-R1 --backend 'dynamo' --endpoint /v1/completions --disable-tqdm --dataset-name random --num-prompts 2560 --random-input-len 8192 --random-output-len 1024 --random-range-ratio 0.8 --ignore-eos --request-rate inf --percentile-metrics ttft,tpot,itl,e2el --max-concurrency 512" ``` -You should see a server status like this. Wait for it to be in a `healthy` state. - -```json -{ - "instances": [ - { - "component": "backend", - "endpoint": "load_metrics", - "instance_id": 3994861215823793160, - "namespace": "dynamo", - "transport": { - "nats_tcp": "dynamo_backend.load_metrics-3770991c30298c08" - } - }, - { - "component": "prefill", - "endpoint": "clear_kv_blocks", - "instance_id": 3994861215823793153, - "namespace": "dynamo", - "transport": { - "nats_tcp": "dynamo_prefill.clear_kv_blocks-3770991c30298c01" - } - }, - { - "component": "prefill", - "endpoint": "generate", - "instance_id": 3994861215823793153, - "namespace": "dynamo", - "transport": { - "nats_tcp": "dynamo_prefill.generate-3770991c30298c01" - } - } - ], - "message": "No endpoints available", - "status": "unhealthy" -} -``` - -Then we can send a benchmark request with like this: +Or we can send a benchmark request to a frontend pod like this: ```bash kubectl exec -n ${NAMESPACE} $USER-dynamo-multi-node-serving-frontend -- python3 -u -m sglang.bench_serving --backend sglang-oai-chat --base-url http://localhost:8000 --model "deepseek-ai/DeepSeek-R1" --tokenizer /data/model/deepseek-ai/DeepSeek-R1 --dataset-name random --num-prompts 10240 --random-input-len 8192 --random-range-ratio 0.8 --random-output-len 
1024 --max-concurrency 2048 diff --git a/inference/a4x/disaggregated-serving/dynamo/bench_clint.yaml b/inference/a4x/disaggregated-serving/dynamo/bench_clint.yaml new file mode 100644 index 00000000..16a96971 --- /dev/null +++ b/inference/a4x/disaggregated-serving/dynamo/bench_clint.yaml @@ -0,0 +1,47 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Pod +metadata: + name: bench-client + labels: + app: bench-client +spec: + restartPolicy: Never + containers: + - name: benchmark + image: python:3.10 + workingDir: /workspace + command: ["/bin/bash", "-c"] + # This script runs ONCE when the pod starts to set everything up. + # Then it sleeps forever so the pod stays open for you. + args: + - | + echo "--- STARTING SETUP ---" + + # 1. Install Git + apt-get update && apt-get install -y git + + # 2. Install Python Dependencies + pip install -q transformers aiohttp numpy requests tqdm pandas datasets Pillow + + # 3. Clone the Repo (Specific Branch) + echo "Cloning repo..." + git clone --single-branch --branch ishan/sa-1.1-sgl-dsr1-fp8 https://github.com/ai-dynamo/dynamo.git /workspace/dynamo + + echo "--- SETUP COMPLETE. POD IS READY. ---" + + # 4. 
Keep the pod alive indefinitely + sleep infinity diff --git a/inference/a4x/disaggregated-serving/dynamo/test.yaml b/inference/a4x/disaggregated-serving/dynamo/test.yaml deleted file mode 100644 index 92e43f57..00000000 --- a/inference/a4x/disaggregated-serving/dynamo/test.yaml +++ /dev/null @@ -1,389 +0,0 @@ -apiVersion: resource.nvidia.com/v1beta1 -kind: ComputeDomain -metadata: - name: a4x-compute-domain-test - namespace: yijiaj-test -spec: - numNodes: 2 - channel: - resourceClaimTemplate: - name: a4x-compute-domain-channel-test ---- -apiVersion: nvidia.com/v1alpha1 -kind: DynamoGraphDeployment -metadata: - name: yijiaj-test-1p1d -spec: - services: - Frontend: - dynamoNamespace: yijiaj-test - componentType: frontend - replicas: 9 - resources: - requests: - cpu: "5" - memory: "50Gi" - limits: - cpu: "5" - memory: "50Gi" - extraPodSpec: - mainContainer: - image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.0 - workingDir: /sgl-workspace/dynamo/components/backends/sglang - stdin: true - tty: true - command: - - /bin/sh - - -c - args: - - "python3 -m dynamo.frontend --http-port 8000" - Decode: - envFromSecret: hf-token-secret - livenessProbe: - httpGet: - path: /live - port: system - initialDelaySeconds: 600 - periodSeconds: 30 - timeoutSeconds: 15 - failureThreshold: 5 - readinessProbe: - httpGet: - path: /health - port: system - initialDelaySeconds: 60 - timeoutSeconds: 30 - periodSeconds: 60 - failureThreshold: 60 - dynamoNamespace: yijiaj-test - componentType: worker - replicas: 1 - resources: - limits: - gpu: "4" - claims: - - name: compute-domain-channel - sharedMemory: - size: 80Gi - envs: - - name: LD_LIBRARY_PATH - value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64" - - name: GLOO_SOCKET_IFNAME - value: eth0 - - name: TP_SOCKET_IFNAME - value: eth0 - - - name: PYTHONUNBUFFERED - value: "1" - - name: DYN_SKIP_SGLANG_LOG_FORMATTING - value: 
"1" - - name: SGLANG_ENABLE_JIT_DEEPGEMM - value: "false" - - name: SGLANG_ENABLE_FLASHINFER_GEMM - value: "1" - - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE - value: "100000" - - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT - value: "100000" - - name: SGLANG_DISAGGREGATION_WAITING_TIMEOUT - value: "100000" - - name: SGLANG_DECODE_BOOTSTRAP_TIMEOUT - value: "1000" - - name: SGLANG_HACK_SEQ_BOOTSTRAP_ROOM - value: "1" - - name: SGLANG_MOONCAKE_CUSTOM_MEM_POOL - value: "True" - - name: SGLANG_USE_MESSAGE_QUEUE_BROADCASTER - value: "0" - - name: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK - value: "1" - - name: MC_TE_METRIC - value: "true" - - name: MC_FORCE_MNNVL - value: "1" - - name: NCCL_MNNVL_ENABLE - value: "1" - - name: NCCL_CUMEM_ENABLE - value: "1" - - - extraPodMetadata: - annotations: - gke-gcsfuse/cpu-limit: "0" - gke-gcsfuse/ephemeral-storage-limit: "0" - gke-gcsfuse/memory-limit: "0" - gke-gcsfuse/volumes: "true" - networking.gke.io/default-interface: 'eth0' - networking.gke.io/interfaces: | - [ - {"interfaceName":"eth0","network":"default"}, - {"interfaceName":"eth2","network":"rdma-0"}, - {"interfaceName":"eth3","network":"rdma-1"}, - {"interfaceName":"eth4","network":"rdma-2"}, - {"interfaceName":"eth5","network":"rdma-3"} - ] - extraPodSpec: - resourceClaims: - - name: compute-domain-channel - resourceClaimTemplateName: a4x-compute-domain-channel-test - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: kubernetes.io/arch - operator: In - values: - - arm64 - volumes: - - name: model-src - csi: - driver: gcsfuse.csi.storage.gke.io - volumeAttributes: - bucketName: yijiaj-test - mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1 - - name: library-dir-host - hostPath: - path: /home/kubernetes/bin/nvidia - - name: 
gib - hostPath: - path: /home/kubernetes/bin/gib - mainContainer: - securityContext: - privileged: true - startupProbe: - failureThreshold: 1800 - httpGet: - path: /live - port: system - periodSeconds: 10 - timeoutSeconds: 5 - image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout - workingDir: /sgl-workspace/dynamo/components/backends/sglang - command: ["/bin/bash", "-c"] - stdin: true - tty: true - volumeMounts: - - mountPath: /data/model - name: model-src - - name: library-dir-host - mountPath: /usr/local/nvidia - - name: gib - mountPath: /usr/local/gib - args: - - | - set -e - - nvidia-smi - . /usr/local/gib/scripts/set_nccl_env.sh - - echo "--- VERIFYING NCCL ENV VARS IN SHELL ---" - env | grep NCCL_ - echo "--- END VERIFICATION ---" - - exec python3 -m dynamo.sglang \ - --enable-metrics \ - --model-path /data/model/deepseek-ai/DeepSeek-R1 \ - --served-model-name deepseek-ai/DeepSeek-R1 \ - --disaggregation-bootstrap-port 30001 \ - --disaggregation-mode decode \ - --host 0.0.0.0 \ - --port 8000 \ - --disable-radix-cache \ - --tensor-parallel-size 4 \ - --data-parallel-size 1 \ - --expert-parallel-size 1 \ - --trust-remote-code \ - --kv-cache-dtype fp8_e4m3 \ - --attention-backend trtllm_mla \ - --quantization fp8 \ - --moe-runner-backend flashinfer_trtllm \ - --disable-radix-cache \ - --watchdog-timeout 1000000 \ - --context-length 9600 \ - --mem-fraction-static 0.95 \ - --chunked-prefill-size 8192 \ - --cuda-graph-max-bs 512 \ - --max-running-requests 512 \ - --scheduler-recv-interval 10 \ - --enable-flashinfer-allreduce-fusion \ - --enable-symm-mem \ - --moe-dense-tp-size 1 \ - --prefill-round-robin-balance - - Prefill: - envFromSecret: hf-token-secret - livenessProbe: - exec: - command: - - /bin/sh - - -c - - "exit 0" - periodSeconds: 60 - timeoutSeconds: 30 - failureThreshold: 10 - readinessProbe: - httpGet: - path: /health - port: system - initialDelaySeconds: 60 - timeoutSeconds: 30 - 
periodSeconds: 60 - failureThreshold: 60 - dynamoNamespace: yijiaj-test - componentType: worker - replicas: 1 - resources: - requests: - cpu: "130" - memory: "800Gi" - limits: - gpu: "4" - claims: - - name: compute-domain-channel - sharedMemory: - size: 80Gi - envs: - - name: LD_LIBRARY_PATH - value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64" - - name: UCX_TLS - value: "^tcp" - - name: GLOO_SOCKET_IFNAME - value: eth0 - - name: TP_SOCKET_IFNAME - value: eth0 - - - - name: PYTHONUNBUFFERED - value: "1" - - name: DYN_SKIP_SGLANG_LOG_FORMATTING - value: "1" - - name: SGLANG_ENABLE_JIT_DEEPGEMM - value: "false" - - name: SGLANG_ENABLE_FLASHINFER_GEMM - value: "1" - - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE - value: "100000" - - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT - value: "100000" - - name: SGLANG_DISAGGREGATION_WAITING_TIMEOUT - value: "100000" - - name: SGLANG_MOONCAKE_CUSTOM_MEM_POOL - value: "True" - - name: SGLANG_USE_MESSAGE_QUEUE_BROADCASTER - value: "0" - - name: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK - value: "1" - - name: MC_TE_METRIC - value: "true" - - name: MC_FORCE_MNNVL - value: "1" - - name: NCCL_MNNVL_ENABLE - value: "1" - - name: NCCL_CUMEM_ENABLE - value: "1" - - extraPodMetadata: - annotations: - gke-gcsfuse/cpu-limit: "0" - gke-gcsfuse/ephemeral-storage-limit: "0" - gke-gcsfuse/memory-limit: "0" - gke-gcsfuse/volumes: "true" - networking.gke.io/default-interface: 'eth0' - networking.gke.io/interfaces: | - [ - {"interfaceName":"eth0","network":"default"}, - {"interfaceName":"eth2","network":"rdma-0"}, - {"interfaceName":"eth3","network":"rdma-1"}, - {"interfaceName":"eth4","network":"rdma-2"}, - {"interfaceName":"eth5","network":"rdma-3"} - ] - extraPodSpec: - resourceClaims: - - name: compute-domain-channel - resourceClaimTemplateName: a4x-compute-domain-channel-test - affinity: - nodeAffinity: - 
requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: kubernetes.io/arch - operator: In - values: - - arm64 - volumes: - - name: model-src - csi: - driver: gcsfuse.csi.storage.gke.io - volumeAttributes: - bucketName: yijiaj-test - mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1 - - name: library-dir-host - hostPath: - path: /home/kubernetes/bin/nvidia - - name: gib - hostPath: - path: /home/kubernetes/bin/gib - mainContainer: - startupProbe: - failureThreshold: 1800 - httpGet: - path: /live - port: system - periodSeconds: 10 - timeoutSeconds: 5 - securityContext: - privileged: true - stdin: true - tty: true - volumeMounts: - - mountPath: /data/model - name: model-src - - name: library-dir-host - mountPath: /usr/local/nvidia - - name: gib - mountPath: /usr/local/gib - image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout - workingDir: /sgl-workspace/dynamo/components/backends/sglang - command: ["/bin/bash", "-c"] - args: - - | - set -e - - nvidia-smi - . 
/usr/local/gib/scripts/set_nccl_env.sh - - echo "--- VERIFYING NCCL ENV VARS IN SHELL ---" - env | grep NCCL_ - echo "--- END VERIFICATION ---" - - exec python3 -m dynamo.sglang \ - --enable-metrics \ - --model-path /data/model/deepseek-ai/DeepSeek-R1 \ - --served-model-name deepseek-ai/DeepSeek-R1 \ - --disaggregation-bootstrap-port 30001 \ - --disaggregation-mode prefill \ - --host 0.0.0.0 \ - --port 8000 \ - --disable-radix-cache \ - --tensor-parallel-size 4 \ - --data-parallel-size 1 \ - --expert-parallel-size 1 \ - --trust-remote-code \ - --kv-cache-dtype fp8_e4m3 \ - --attention-backend trtllm_mla \ - --quantization fp8 \ - --moe-runner-backend flashinfer_trtllm \ - --disable-radix-cache \ - --watchdog-timeout 1000000 \ - --context-length 9600 \ - --mem-fraction-static 0.95 \ - --max-total-tokens 32768 \ - --chunked-prefill-size 24576 \ - --cuda-graph-max-bs 512 \ - --max-running-requests 512 \ - --load-balance-method round_robin \ - --scheduler-recv-interval 10 \ - --enable-flashinfer-allreduce-fusion \ - --moe-dense-tp-size 1 - diff --git a/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml b/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml index d5a02dc6..f853ab74 100644 --- a/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml +++ b/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml @@ -169,7 +169,7 @@ secrets: volumes: useGcs: true gcsfuse: - bucketName: "yijiaj-test" + bucketName: your-gcs-bucket-name ssdMountPath: "/ssd" gcsMounts: mountPath: "/data/model" diff --git a/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml b/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml index bf992302..9a6eb78e 100644 --- a/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml +++ b/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml @@ -13,7 +13,7 @@ # limitations under the License. 
dynamo: - namespace: yijiaj-test + namespace: dynamo-cloud releaseVersion: "0.7.0" deploymentName: dynamo-disagg1p1d computeDomain: @@ -175,7 +175,7 @@ secrets: volumes: useGcs: true gcsfuse: - bucketName: "yijiaj-test" + bucketName: your-gcs-bucket-name ssdMountPath: "/ssd" gcsMounts: mountPath: "/data/model" From 2b824f82ecc843618a646e6c5c6e0fbe41dcdf57 Mon Sep 17 00:00:00 2001 From: Yijia J Date: Thu, 15 Jan 2026 05:27:14 +0000 Subject: [PATCH 09/12] nit --- inference/a4x/disaggregated-serving/dynamo/README.md | 8 ++++---- .../dynamo/{bench_clint.yaml => bench_client.yaml} | 0 2 files changed, 4 insertions(+), 4 deletions(-) rename inference/a4x/disaggregated-serving/dynamo/{bench_clint.yaml => bench_client.yaml} (100%) diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md index 90849044..aace9f01 100644 --- a/inference/a4x/disaggregated-serving/dynamo/README.md +++ b/inference/a4x/disaggregated-serving/dynamo/README.md @@ -2,7 +2,7 @@ This document outlines the steps to deploy and serve Large Language Models (LLMs) using [NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo) disaggregated inference platform on [A4X GKE Node pools](https://cloud.google.com/kubernetes-engine). -Dynamo provides a disaggregated architecture that separates prefill and decode operations for optimized inference performance, supporting both single-node (8 GPUs) and multi-node (16 GPUs) configurations. Dynamo also supports various inference framework backends like [vLLM](https://docs.nvidia.com/dynamo/latest/components/backends/vllm/README.html) and [SGLang](https://docs.nvidia.com/dynamo/latest/components/backends/sglang/README.html). In this recipe, we will focus on serving using the SGLang backend. +Dynamo provides a disaggregated architecture that separates prefill and decode operations for optimized inference performance, supporting both single-node (4 GPUs) and multi-node NVL72 (72 GPUs) configurations. 
Dynamo also supports various inference framework backends like [vLLM](https://docs.nvidia.com/dynamo/latest/components/backends/vllm/README.html) and [SGLang](https://docs.nvidia.com/dynamo/latest/components/backends/sglang/README.html). In this recipe, we will focus on serving using the SGLang backend. ## Table of Contents @@ -229,10 +229,10 @@ Check if the pods are in `Running` status before sending inference requests. kubectl get pods -n ${NAMESPACE} ``` -We can then deploy the benchmark clint and send benchark request. -Deploy the benchmark clint like this: +We can then deploy the benchmark client and send a benchmark request. +Deploy the benchmark client like this: ```bash -kubectl apply -f bench_clint.yaml -n ${NAMESPACE} +kubectl apply -f bench_client.yaml -n ${NAMESPACE} ``` And send the request like this: diff --git a/inference/a4x/disaggregated-serving/dynamo/bench_clint.yaml b/inference/a4x/disaggregated-serving/dynamo/bench_client.yaml similarity index 100% rename from inference/a4x/disaggregated-serving/dynamo/bench_clint.yaml rename to inference/a4x/disaggregated-serving/dynamo/bench_client.yaml From ecf8087da9fd95e93200e576313cbd0d637ec939 Mon Sep 17 00:00:00 2001 From: Yijia J Date: Thu, 15 Jan 2026 05:36:09 +0000 Subject: [PATCH 10/12] readme --- inference/a4x/disaggregated-serving/dynamo/README.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md index aace9f01..008f1262 100644 --- a/inference/a4x/disaggregated-serving/dynamo/README.md +++ b/inference/a4x/disaggregated-serving/dynamo/README.md @@ -160,9 +160,14 @@ Find the service account (usually annotated to default): kubectl get serviceaccounts ${NAMESPACE} -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.annotations.iam\.gke\.io/gcp-service-account}{"\n"}{end}' ``` +Configure the service account email: +```bash +export SERVICE_ACCOUNT_EMAIL=$(kubectl get
serviceaccount/default -n ${NAMESPACE} -o jsonpath='{.metadata.annotations.iam\.gke\.io/gcp-service-account}') +``` + Authorize the service account: ```bash -gcloud iam service-accounts add-iam-policy-binding xxx@project_id.iam.gserviceaccount.com \ +gcloud iam service-accounts add-iam-policy-binding ${SERVICE_ACCOUNT_EMAIL} \ --role roles/iam.workloadIdentityUser \ --member "serviceAccount:project_id.svc.id.goog[${NAMESPACE}/default]" ``` @@ -170,7 +175,7 @@ gcloud iam service-accounts add-iam-policy-binding xxx@project_id.iam.gserviceac Grant read access to the bucket: ```bash gcloud storage buckets add-iam-policy-binding ${GCS_BUCKET} \ - --member "serviceAccount:xxx@project_id.iam.gserviceaccount.com" \ + --member "serviceAccount:${SERVICE_ACCOUNT_EMAIL}" \ --role "roles/storage.objectViewer" ``` From 5b003c8ac5f6067e284ddef66d2eecbd4beee575 Mon Sep 17 00:00:00 2001 From: Yijia J Date: Fri, 16 Jan 2026 01:18:08 +0000 Subject: [PATCH 11/12] README, update image path --- .../disaggregated-serving/dynamo/README.md | 21 ++++++++++++++++++- .../dynamo/values_deepep.yaml | 3 +-- .../dynamo/values_wo_deepep.yaml | 7 +++---- .../templates/dynamo-graph-deployment.yaml | 4 ++-- 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md index 008f1262..59f937ab 100644 --- a/inference/a4x/disaggregated-serving/dynamo/README.md +++ b/inference/a4x/disaggregated-serving/dynamo/README.md @@ -15,6 +15,7 @@ Dynamo provides a disaggregated architecture that separates prefill and decode o * [2.4. Create Secrets](#create-secrets) * [2.5. Install Dynamo Platform](#install-platform) * [2.6. Setup GCS Bucket for GKE ](#setup-gcsfuse) + * [2.7. Build Dynamo Image ](#build-dyanmo-image) * [3. Deploy with SGLang Backend](#deploy-sglang) * [3.1. SGLang Deployment without DeepEP(8 GPUs)](#sglang-wo-deepep) * [3.2. 
SGLang Deployment with DeepEP(72 GPUs)](#sglang-deepep) @@ -181,6 +182,21 @@ gcloud storage buckets add-iam-policy-binding ${GCS_BUCKET} \ Downloading model files into the gcs bucket and set your gcs bucket name in values.yaml file. + +### 2.7. Build Dynamo Image + +Follow the [Dynamo container guide](https://github.com/ai-dynamo/dynamo/blob/main/container/README.md) to build the image, then push it to your artifact registry. + +Build the image like this: +```bash +docker build -f container/Dockerfile.sglang . -t dynamo-wideep --no-cache --build-arg DYNAMO_VERSION=0.7.0 --platform linux/arm64 +``` + +Set `ARTIFACT_REGISTRY` to the full path of the pushed image (it is passed to Helm as `workload.image`): +```bash +export ARTIFACT_REGISTRY= +``` + ## 3. Deploy with SGLang Backend @@ -200,6 +216,7 @@ Deploy DeepSeekR1-671B across 2 nodes for testing and validation. Note the use ```bash cd $RECIPE_ROOT helm install -f values_wo_deepep.yaml \ +--set workload.image=${ARTIFACT_REGISTRY} \ --set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-prefill.yaml \ --set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-decode.yaml \ $USER-dynamo-a4x-1p1d \ @@ -217,7 +234,8 @@ Deploy DeepSeekR1-671B across 18 nodes for production workloads.
Note the use of ```bash cd $RECIPE_ROOT -helm install -f values.yaml \ +helm install -f values_deepep.yaml \ +--set workload.image=${ARTIFACT_REGISTRY} \ --set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml \ --set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml \ $USER-dynamo-a4x-multi-node \ @@ -284,6 +302,7 @@ Common issues: * **Pods stuck in Pending**: Check if nodes have sufficient resources (especially for multi-node deployments) * **Model download slow**: Large models like DeepSeekR1 671B can take 30 minutes to download * **Multi-node issues**: Verify network connectivity between nodes and proper subnet configuration +* **DeepEP timeout issue**: If DeepEP times out, recompile DeepEP during the image build with patched `NUM_CPU_TIMEOUT_SECS` and `NUM_TIMEOUT_CYCLES` values in `csrc/kernels/configs.cuh`. ## 6. Cleanup diff --git a/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml b/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml index f853ab74..194b2b2b 100644 --- a/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml +++ b/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml @@ -35,7 +35,6 @@ dynamo: timeoutSeconds: 300 failureThreshold: 100 decodeWorker: - image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout nodeCount: 8 replicas: 1 envs: @@ -98,7 +97,6 @@ dynamo: timeoutSeconds: 30 failureThreshold: 1800 prefillWorker: - image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout nodeCount: 2 replicas: 5 envs: @@ -182,6 +180,7 @@ service: workload: model: deepseek-ai/DeepSeek-R1 + image: framework: sglang configFile: serving-args.yaml configPath: /workload/configs diff --git a/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml
b/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml index 9a6eb78e..ab506a4e 100644 --- a/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml +++ b/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml @@ -13,7 +13,7 @@ # limitations under the License. dynamo: - namespace: dynamo-cloud + namespace: yijiaj-test releaseVersion: "0.7.0" deploymentName: dynamo-disagg1p1d computeDomain: @@ -35,7 +35,6 @@ dynamo: timeoutSeconds: 300 failureThreshold: 100 decodeWorker: - image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout nodeCount: 1 replicas: 1 envs: @@ -100,7 +99,6 @@ dynamo: timeoutSeconds: 30 failureThreshold: 1800 prefillWorker: - image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout nodeCount: 1 replicas: 1 envs: @@ -175,7 +173,7 @@ secrets: volumes: useGcs: true gcsfuse: - bucketName: your-gcs-bucket-name + bucketName: yijiaj-test ssdMountPath: "/ssd" gcsMounts: mountPath: "/data/model" @@ -188,6 +186,7 @@ service: workload: model: deepseek-ai/DeepSeek-R1 + image: framework: sglang configFile: serving-args.yaml configPath: /workload/configs diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml index 355db26a..8002e43a 100644 --- a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml +++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml @@ -158,7 +158,7 @@ spec: mainContainer: securityContext: privileged: true - image: {{ .Values.dynamo.decodeWorker.image }} + image: {{ .Values.workload.image }} workingDir: /sgl-workspace/dynamo/components/backends/sglang startupProbe: failureThreshold: {{ 
.Values.dynamo.decodeWorker.startupProbe.failureThreshold }} @@ -329,7 +329,7 @@ spec: privileged: true stdin: true tty: true - image: {{ .Values.dynamo.prefillWorker.image }} + image: {{ .Values.workload.image }} workingDir: /sgl-workspace/dynamo/components/backends/sglang startupProbe: failureThreshold: {{ .Values.dynamo.prefillWorker.startupProbe.failureThreshold }} From ea2c3b37cd340337f238ee53ac9e86d5a6208e2d Mon Sep 17 00:00:00 2001 From: Yijia J Date: Fri, 16 Jan 2026 01:39:09 +0000 Subject: [PATCH 12/12] nit --- inference/a4x/disaggregated-serving/dynamo/README.md | 2 ++ inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml | 2 +- .../a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md index 59f937ab..c53cdba0 100644 --- a/inference/a4x/disaggregated-serving/dynamo/README.md +++ b/inference/a4x/disaggregated-serving/dynamo/README.md @@ -217,6 +217,7 @@ Deploy DeepSeekR1-671B across 2 nodes for testing and validation. Note the use cd $RECIPE_ROOT helm install -f values_wo_deepep.yaml \ --set workload.image=${ARTIFACT_REGISTRY} \ +--set volumes.gcsfuse.bucketName=${GCS_BUCKET} \ --set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-prefill.yaml \ --set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-1p1d-decode.yaml \ $USER-dynamo-a4x-1p1d \ @@ -236,6 +237,7 @@ Deploy DeepSeekR1-671B across 18 nodes for production workloads. 
Note the use of cd $RECIPE_ROOT helm install -f values_deepep.yaml \ --set workload.image=${ARTIFACT_REGISTRY} \ +--set volumes.gcsfuse.bucketName=${GCS_BUCKET} \ --set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml \ --set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml \ $USER-dynamo-a4x-multi-node \ diff --git a/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml b/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml index 194b2b2b..a68f203a 100644 --- a/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml +++ b/inference/a4x/disaggregated-serving/dynamo/values_deepep.yaml @@ -167,7 +167,7 @@ secrets: volumes: useGcs: true gcsfuse: - bucketName: your-gcs-bucket-name + bucketName: ssdMountPath: "/ssd" gcsMounts: mountPath: "/data/model" diff --git a/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml b/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml index ab506a4e..5308d69f 100644 --- a/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml +++ b/inference/a4x/disaggregated-serving/dynamo/values_wo_deepep.yaml @@ -13,7 +13,7 @@ # limitations under the License. dynamo: - namespace: yijiaj-test + namespace: dynamo-cloud releaseVersion: "0.7.0" deploymentName: dynamo-disagg1p1d computeDomain: @@ -173,7 +173,7 @@ secrets: volumes: useGcs: true gcsfuse: - bucketName: yijiaj-test + bucketName: ssdMountPath: "/ssd" gcsMounts: mountPath: "/data/model"