SemiAnalysisAI · chunfangamd · Mar 11, 2026 · Mar 11, 2026 · Mar 11, 2026 · Mar 12, 2026
@@ -1031,6 +1031,81 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
   #       - "DECODE_MTP_SIZE=0"
 
 
+dsr1-fp8-mi355x-vllm-disagg:  # DeepSeek-R1 FP8 on MI355X, vLLM with separate prefill and decode workers
+  image: vllm/vllm-openai-rocm:v0.17.1
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp8
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:  # three sweeps (isl/osl = 1024/1024, 8192/1024, 1024/8192) sharing one search-space
+  - isl: 1024  # input sequence length
+    osl: 1024  # output sequence length
+    search-space:
+    # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total
+    - spec-decoding: "none"
+      conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]  # presumably becomes CONC_LIST; bench.sh runs once per value
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1  # presumably PREFILL_EP; the launcher script treats 1 as PREFILL_ENABLE_EP=false
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"  # presumably exported as an env var; the launcher script requires PREFILL_NODES
+        - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 8  # presumably DECODE_EP; any value other than 1 yields DECODE_ENABLE_EP=true in the launcher
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"  # presumably exported as an env var; the launcher script requires DECODE_NODES
+
+  - isl: 8192  # long-input sweep; search-space is identical to the 1024/1024 entry
+    osl: 1024
+    search-space:
+    - spec-decoding: "none"
+      conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+        - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 8
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+
+  - isl: 1024  # long-output sweep (osl 8192); search-space is identical to the 1024/1024 entry
+    osl: 8192
+    search-space:
+    - spec-decoding: "none"
+      conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+        - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 8
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+
+
 dsr1-fp4-mi355x-sglang-disagg:
   image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
   model: amd/DeepSeek-R1-0528-MXFP4

diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh
@@ -0,0 +1,79 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    SPEC_DECODING \
+    MODEL_PATH \
+    PREFILL_NUM_WORKERS \
+    PREFILL_TP \
+    PREFILL_EP \
+    PREFILL_DP_ATTN \
+    DECODE_NUM_WORKERS \
+    DECODE_TP \
+    DECODE_EP \
+    DECODE_DP_ATTN \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+set -x
+
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/vllm_disagg_utils" || exit 1
+
+export TIME_LIMIT="08:00:00"
+export MODEL_PATH=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME
+export CONTAINER_IMAGE=$IMAGE
+
+# Same EP/DP booleans as dsr1_fp8_mi355x_sglang-disagg.sh → amd_utils/submit.sh
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
+    export PREFILL_ENABLE_EP=false
+else
+    export PREFILL_ENABLE_EP=true
+fi
+
+if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
+    export PREFILL_ENABLE_DP=true
+else
+    export PREFILL_ENABLE_DP=false
+fi
+
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
+    export DECODE_ENABLE_EP=false
+else
+    export DECODE_ENABLE_EP=true
+fi
+
+if [[ "$DECODE_DP_ATTN" == "true" ]]; then
+    export DECODE_ENABLE_DP=true
+else
+    export DECODE_ENABLE_DP=false
+fi
+
+# Parameter order matches SGLang disagg submit.sh; arg 16 is optional NODELIST.
+JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
+    $PREFILL_NUM_WORKERS \
+    $DECODE_NODES \
+    $DECODE_NUM_WORKERS \
+    $ISL $OSL "${CONC_LIST// /x}" inf \
+    ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
+    ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
+    ${PREFILL_TP} ${DECODE_TP} \
+    ${RANDOM_RANGE_RATIO} \
+    "${NODELIST:-}")
+
+if [[ $? -ne 0 ]]; then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/benchmarks/multi_node/vllm_disagg_utils/bench.sh b/benchmarks/multi_node/vllm_disagg_utils/bench.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+# vLLM Disaggregated Benchmark Runner
+#
+# Produces JSON result files via benchmark_serving.py (same as SGLang bench.sh)
+# so that the CI pipeline can collect and process results.
+#
+# Usage: bash bench.sh <n_prefill> <n_decode> <prefill_gpus> <decode_gpus> \
+#            <model_dir> <model_name> <log_path> [isl] [osl] \
+#            [concurrency_list] [req_rate] [random_range_ratio] [num_prompts_multiplier]
+#
+# The first seven arguments are required; the remaining six fall back to the
+# defaults assigned below. concurrency_list is an 'x'-separated list of
+# concurrency levels (e.g. 8x16x32); one benchmark is run per level.
+#
+# Environment overrides:
+#   MODEL_PATH  - model passed to the benchmark (default: <model_dir>/<model_name>)
+#   ROUTER_PORT - port the benchmark targets (default: 30000)
+
+# Without log_path (arg 7) the results directory would be created at the
+# filesystem root, and without the GPU counts the arithmetic below fails,
+# so refuse to run with an incomplete argument list.
+if (( $# < 7 )); then
+    echo "Usage: bash bench.sh <n_prefill> <n_decode> <prefill_gpus> <decode_gpus> <model_dir> <model_name> <log_path> [isl] [osl] [concurrency_list] [req_rate] [random_range_ratio] [num_prompts_multiplier]" >&2
+    exit 2
+fi
+
+# n_prefill / n_decode are part of the positional interface but are not used
+# anywhere else in this script.
+n_prefill=$1
+n_decode=$2
+prefill_gpus=$3
+decode_gpus=$4
+model_path=$5
+model_name=$6
+MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}"
+log_path=$7
+
+chosen_isl=${8:-1024}
+chosen_osl=${9:-1024}
+concurrency_list=${10:-"512x1"}
+chosen_req_rate=${11:-inf}
+random_range_ratio=${12:-0.8}
+num_prompts_multiplier=${13:-10}
+
+IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list"
+
+ROUTER_PORT="${ROUTER_PORT:-30000}"
+
+echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}"
+
+profile_folder="${log_path}/vllm_isl_${chosen_isl}_osl_${chosen_osl}"
+mkdir -p "$profile_folder"
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
+
+# Total GPU count is the same for every concurrency level, so compute it once.
+total_gpus=$(( prefill_gpus + decode_gpus ))
+
+for max_concurrency in "${chosen_concurrencies[@]}"; do
+
+    export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_${total_gpus}_ctx_${prefill_gpus}_gen_${decode_gpus}"
+
+    # Scale the prompt count with concurrency, but never send fewer than 16.
+    num_prompts=$(( max_concurrency * num_prompts_multiplier ))
+    if [[ "$num_prompts" -lt 16 ]]; then
+        num_prompts=16
+    fi
+
+    echo "profile_folder: $profile_folder"
+    echo "max_concurrency: $max_concurrency"
+    echo "chosen_req_rate: $chosen_req_rate"
+    echo "MODEL_PATH: $MODEL_PATH"
+    echo "ROUTER_PORT: $ROUTER_PORT"
+    echo "chosen_isl: $chosen_isl"
+    echo "chosen_osl: $chosen_osl"
+    echo "num_prompts: $num_prompts"
+    echo "export_file: $export_file"
+
+    # run_benchmark_serving is not defined in this script; presumably it comes
+    # from the benchmark_lib.sh sourced above.
+    run_benchmark_serving \
+        --bench-serving-dir "$REPO_ROOT" \
+        --model "$MODEL_PATH" \
+        --port "$ROUTER_PORT" \
+        --backend openai \
+        --input-len "$chosen_isl" \
+        --output-len "$chosen_osl" \
+        --random-range-ratio "$random_range_ratio" \
+        --num-prompts "$num_prompts" \
+        --max-concurrency "$max_concurrency" \
+        --result-filename "$export_file" \
+        --result-dir /workspace/
+
+    echo "-----------------------------------------"
+    echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..."
+    sleep 10
+done
diff --git a/benchmarks/multi_node/vllm_disagg_utils/env.sh b/benchmarks/multi_node/vllm_disagg_utils/env.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+# vLLM/Nixl environment setup for multi-node disaggregated serving.
+#
+# ENVIRONMENT VARIABLES (honored when already set, otherwise auto-detected):
+#   IBDEVICES            - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...)
+#                          Set by runner, or detected from hostname / ibv_devinfo.
+#   UCX_NET_DEVICES      - network device handed to UCX.
+#   NCCL_IB_HCA          - defaults to IBDEVICES.
+#   UCX_IB_GID_INDEX     - defaults to 1.
+#   UCX_IB_TRAFFIC_CLASS - detected via nicctl, then hostname; left unset if neither works.
+# NCCL_SOCKET_IFNAME is always overwritten with the default-route interface.
+#
+# UCX and RIXL paths (LD_LIBRARY_PATH, PATH) are set by setup_deps.sh, which is
+# sourced at the top of server.sh before this file.
+
+set -x
+
+# Hostname-based fallback for the RoCE traffic class.
+# Exports UCX_IB_TRAFFIC_CLASS for recognized host name prefixes and logs the
+# choice. NODENAME is deliberately left global, matching the variables the rest
+# of this file sets in the calling shell.
+# Returns: 0 if a traffic class was set, 1 if the hostname is not recognized.
+_vllm_env_qos_from_hostname() {
+    NODENAME=$(hostname -s)
+    if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
+        export UCX_IB_TRAFFIC_CLASS=96
+    elif [[ $NODENAME == mia1* ]]; then
+        export UCX_IB_TRAFFIC_CLASS=104
+    else
+        return 1
+    fi
+    echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME"
+}
+
+# IBDEVICES configuration
+# Prefer IBDEVICES set by runner (runners/launch_mi355x-amds.sh)
+# Fall back to hostname detection if not set (for direct script execution)
+if [[ -z "$IBDEVICES" ]]; then
+    NODENAME=$(hostname -s)
+    if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
+        export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7
+        echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $NODENAME"
+    elif [[ $NODENAME == mia1* ]]; then
+        export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
+        echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $NODENAME"
+    else
+        # Unknown host: collect every hca_id reported by ibv_devinfo, comma-joined.
+        DETECTED=$(ibv_devinfo 2>/dev/null | awk '/hca_id:/ {print $2}' | paste -sd',')
+        if [[ -n "$DETECTED" ]]; then
+            export IBDEVICES="$DETECTED"
+            echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES via ibv_devinfo"
+        else
+            # Only the warning is printed here; an "auto-detected" line with an
+            # empty value would hide the failure in the logs.
+            echo "WARNING: Unable to detect RDMA devices. Set IBDEVICES explicitly." >&2
+        fi
+    fi
+else
+    echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)"
+fi
+
+if [[ -z "$UCX_NET_DEVICES" ]]; then
+    # Use the benic1p1 interface for UCX TCP transport (maps to ionic RDMA NIC).
+    # We use TCP device names (benicXp1) instead of IB device names (ionic_X:1)
+    # because ud_verbs/ionic crashes in ucp_request_memory_dereg (UCX bug with ionic provider).
+    UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/benic1p1/{print $2}' | head -1)
+    if [[ -n "$UCX_NET_DEV" ]]; then
+        export UCX_NET_DEVICES="$UCX_NET_DEV"
+    else
+        # No benic interface: fall back to port 1 of the first IBDEVICES entry.
+        FIRST_IB=${IBDEVICES%%,*}
+        if [[ -n "$FIRST_IB" ]]; then
+            export UCX_NET_DEVICES="${FIRST_IB}:1"
+        fi
+    fi
+    if [[ -n "$UCX_NET_DEVICES" ]]; then
+        echo "[INFO] Auto-set UCX_NET_DEVICES=$UCX_NET_DEVICES"
+    else
+        echo "WARNING: Unable to determine UCX_NET_DEVICES; leaving it unset." >&2
+    fi
+else
+    echo "[INFO] Using UCX_NET_DEVICES=$UCX_NET_DEVICES (set by environment)"
+fi
+
+# Interface of the default route. Assigned separately from the export so the
+# pipeline's exit status is not masked by `export`.
+NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1)
+export NCCL_SOCKET_IFNAME
+export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES}
+
+# RoCEv2: use IPv4-mapped GID (index 1) for inter-node RDMA routing
+export UCX_IB_GID_INDEX=${UCX_IB_GID_INDEX:-1}
+
+# QoS/DSCP configuration for lossless RoCEv2 fabric.
+# Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname
+if [[ -n "$UCX_IB_TRAFFIC_CLASS" ]]; then
+    echo "[INFO] Using UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS (set by environment)"
+elif command -v nicctl &> /dev/null; then
+    # Last field of the first "PFC no-drop priorities" line of `nicctl show qos`.
+    ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}')
+    # Third field of the first "DSCP : ..." line whose last field equals that priority.
+    ND_DSCP=$(nicctl show qos 2>/dev/null \
+        | awk -v p="$ND_PRIO" '$1 == "DSCP" && $2 == ":" && $NF == p { print $3; exit }')
+    # ND_DSCP must be a plain integer: a range or list such as "24-25" or
+    # "24,26" would still evaluate inside $(( )) and yield a wrong class.
+    if [[ "$ND_DSCP" =~ ^[0-9]+$ ]] && [[ -n "$ND_PRIO" ]]; then
+        # The traffic class byte carries the DSCP in its upper six bits.
+        export UCX_IB_TRAFFIC_CLASS=$(( 4 * ND_DSCP ))
+        export UCX_IB_SL=$ND_PRIO
+        echo "[INFO] Detected QoS from nicctl: UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS, UCX_IB_SL=$UCX_IB_SL"
+    else
+        echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection."
+        # An unrecognized hostname is not an error here; QoS simply stays unset.
+        _vllm_env_qos_from_hostname || true
+    fi
+else
+    _vllm_env_qos_from_hostname \
+        || echo "[INFO] No nicctl and unable to detect from hostname. Skipping QoS configuration."
+fi
+
+set +x
+echo "[INFO] IBDEVICES=$IBDEVICES  UCX_NET_DEVICES=$UCX_NET_DEVICES  NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME  UCX_IB_GID_INDEX=$UCX_IB_GID_INDEX  UCX_IB_TRAFFIC_CLASS=${UCX_IB_TRAFFIC_CLASS:-unset}"