Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
894b08e
[AMD] Add vLLM disaggregated prefill-decode benchmark for MI355X
chunfangamd Mar 11, 2026
1c4ad3d
[AMD] Refactor vLLM disagg recipe: models.yaml, UCX cleanup, QoS support
chunfangamd Mar 11, 2026
04ab30d
[AMD] Update vLLM disagg recipe for v0.17.1 NixlConnector API
chunfangamd Mar 11, 2026
99ce774
[AMD] Make vLLM disagg recipe CI-compatible (mia1 cluster)
chunfangamd Mar 12, 2026
d16bd21
[AMD] Co-locate vLLM disagg router with prefill on NODE_RANK=0
chunfangamd Mar 12, 2026
cf4b88c
[AMD] Use public vLLM base image with runtime dependency install
chunfangamd Mar 12, 2026
1b46ce5
[AMD] Enable Expert Parallelism with MoRI all-to-all on vLLM disagg d…
chunfangamd Mar 13, 2026
585ddb4
[AMD] Switch vLLM disagg KV transfer to MoRI-IO with protocol-aware p…
chunfangamd Mar 13, 2026
69fcdbd
[AMD] BUG fix: RANDOM_RANGE_RATIO never reaches bench.sh
ichbinblau Mar 17, 2026
d214e79
Bug fix: 1. With DRY_RUN=1, node 0 skipped starting proxy/prefill but…
ichbinblau Mar 17, 2026
3ffcc74
[AMD] Fix vLLM disagg hang: READ mode support + safety timeouts
chunfangamd Mar 19, 2026
9129ead
Adapt vLLM disagg recipe for 9N mia1 cluster (mlx5 NICs)
chunfangamd Mar 21, 2026
728f91a
[AMD] Fix vLLM disagg sweep hang: KV cache leak + benchmark client ha…
chunfangamd Mar 22, 2026
a163fd6
[AMD] Fix vLLM disagg Slurm job never terminating after benchmark com…
chunfangamd Mar 22, 2026
cb52c29
[AMD] Enable MoRI-IO READ mode by default for vLLM disagg
chunfangamd Mar 22, 2026
25a0310
[AMD] Fix CI checkout failure caused by root-owned __pycache__ files
chunfangamd Mar 22, 2026
5bbc954
[AMD] Fix CI checkout EACCES by redirecting Python bytecache off NFS
chunfangamd Mar 23, 2026
89ae516
[AMD] Fix KV reaper deadlock on high-ISL disagg workloads
chunfangamd Mar 23, 2026
f611f47
[AMD] Enable reading PREFILL_TP,PREFILL_EP,PREFILL_DP_ATTN,DECODE_TP,…
ichbinblau Mar 24, 2026
bec9c09
Merge branch 'main' into chun-oren-theresa/vllm_disagg
chunfangamd Mar 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1031,6 +1031,81 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
# - "DECODE_MTP_SIZE=0"


# vLLM disaggregated prefill/decode sweep for deepseek-ai/DeepSeek-R1-0528 (fp8)
# on the mi355x-disagg runner, using the vllm-disagg framework.
# Three sequence shapes are benchmarked (ISL/OSL 1024/1024, 8192/1024, 1024/8192);
# all three use the same worker layout and the same concurrency list.
dsr1-fp8-mi355x-vllm-disagg:
image: vllm/vllm-openai-rocm:v0.17.1
model: deepseek-ai/DeepSeek-R1-0528
model-prefix: dsr1
runner: mi355x-disagg
precision: fp8
framework: vllm-disagg
multinode: true
disagg: true
seq-len-configs:
# Shape 1: ISL 1024 / OSL 1024
- isl: 1024
osl: 1024
search-space:
# 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total
- spec-decoding: "none"
conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
# KEY=VALUE strings; presumably handed to the launch script as environment
# variables (the launcher requires PREFILL_NODES / DECODE_NODES) -- TODO confirm.
additional-settings:
- "PREFILL_NODES=1"
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false
additional-settings:
- "DECODE_NODES=2"

# Shape 2: ISL 8192 / OSL 1024 (same 1 prefill worker + 2 decode workers layout)
- isl: 8192
osl: 1024
search-space:
- spec-decoding: "none"
conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false
additional-settings:
- "DECODE_NODES=2"

# Shape 3: ISL 1024 / OSL 8192 (same 1 prefill worker + 2 decode workers layout)
- isl: 1024
osl: 8192
search-space:
- spec-decoding: "none"
conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "PREFILL_NODES=1"
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
decode:
num-worker: 2
tp: 8
ep: 8
dp-attn: false
additional-settings:
- "DECODE_NODES=2"


dsr1-fp4-mi355x-sglang-disagg:
image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
model: amd/DeepSeek-R1-0528-MXFP4
Expand Down
79 changes: 79 additions & 0 deletions benchmarks/multi_node/dsr1_fp8_mi355x_vllm-disagg.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/usr/bin/env bash
# Launcher preamble for the vLLM disaggregated benchmark: load the shared
# benchmark helpers, validate the required environment, move into the
# vllm_disagg_utils directory and export the settings used further down.

source "$(dirname "$0")/../benchmark_lib.sh"

# check_env_vars is provided by benchmark_lib.sh; presumably it aborts when one
# of the listed variables is missing -- TODO confirm against the library.
check_env_vars \
  CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \
  PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN \
  DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN \
  PREFILL_NODES DECODE_NODES \
  RANDOM_RANGE_RATIO

# When running inside a Slurm job, record which job and node this is.
[[ -z "$SLURM_JOB_ID" ]] || echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"

# Trace every command from here on.
set -x

# submit.sh is invoked with a relative path later, so a failed cd is fatal.
cd "$GITHUB_WORKSPACE/benchmarks/multi_node/vllm_disagg_utils" || exit 1

# MODEL_NAME is re-exported as-is (it is not part of the validated list above);
# IMAGE is published under the name CONTAINER_IMAGE.
export TIME_LIMIT="08:00:00" \
  MODEL_PATH="$MODEL_PATH" \
  MODEL_NAME="$MODEL_NAME" \
  CONTAINER_IMAGE="$IMAGE"

# Same EP/DP booleans as dsr1_fp8_mi355x_sglang-disagg.sh → amd_utils/submit.sh
#
# Each switch starts at its "else" value and is flipped when the condition
# holds:
#   *_ENABLE_EP is false only when the EP size evaluates to 1 (unset counts as 1)
#   *_ENABLE_DP is true only when the DP_ATTN value is exactly the string "true"
PREFILL_ENABLE_EP=true
[[ "${PREFILL_EP:-1}" -eq 1 ]] && PREFILL_ENABLE_EP=false

PREFILL_ENABLE_DP=false
[[ "$PREFILL_DP_ATTN" == "true" ]] && PREFILL_ENABLE_DP=true

DECODE_ENABLE_EP=true
[[ "${DECODE_EP:-1}" -eq 1 ]] && DECODE_ENABLE_EP=false

DECODE_ENABLE_DP=false
[[ "$DECODE_DP_ATTN" == "true" ]] && DECODE_ENABLE_DP=true

export PREFILL_ENABLE_EP PREFILL_ENABLE_DP DECODE_ENABLE_EP DECODE_ENABLE_DP

# Run submit.sh and capture its stdout as JOB_ID, which is echoed on success.
# Parameter order matches SGLang disagg submit.sh; arg 16 is optional NODELIST.
# CONC_LIST is passed as one token with its spaces replaced by "x"; arg 8 is
# the literal "inf".
#
# Every argument is quoted so that an empty value, or one containing whitespace
# or glob characters, cannot shift or split the positional parameters that
# submit.sh receives. The exit status is tested directly on the assignment
# instead of through a separate `$?` check.
if ! JOB_ID=$(bash ./submit.sh \
  "$PREFILL_NODES" \
  "$PREFILL_NUM_WORKERS" \
  "$DECODE_NODES" \
  "$DECODE_NUM_WORKERS" \
  "$ISL" "$OSL" "${CONC_LIST// /x}" inf \
  "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" \
  "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" \
  "$PREFILL_TP" "$DECODE_TP" \
  "$RANDOM_RANGE_RATIO" \
  "${NODELIST:-}"); then
  echo "Failed to submit job" >&2
  exit 1
fi

echo "$JOB_ID"
75 changes: 75 additions & 0 deletions benchmarks/multi_node/vllm_disagg_utils/bench.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/bin/bash
# vLLM Disaggregated Benchmark Runner
#
# Per the original author: produces JSON result files via benchmark_serving.py
# (same as SGLang bench.sh) so that the CI pipeline can collect and process
# results.
#
# Usage: bash bench.sh <n_prefill> <n_decode> <prefill_gpus> <decode_gpus> \
#                      <model_dir> <model_name> <log_path> <isl> <osl> \
#                      <concurrency_list> <req_rate> <random_range_ratio> \
#                      <num_prompts_multiplier>
#
# Arguments 1-7 have no default; arguments 8-13 fall back to the defaults below.

# --- positional arguments without defaults ---
n_prefill=$1 n_decode=$2
prefill_gpus=$3 decode_gpus=$4
model_path=$5 model_name=$6
log_path=$7

# A non-empty MODEL_PATH from the environment wins; otherwise it is built as
# <model_dir>/<model_name>.
: "${MODEL_PATH:=${model_path}/${model_name}}"

# --- positional arguments with defaults ---
chosen_isl=${8:-1024}
chosen_osl=${9:-1024}
concurrency_list=${10:-"512x1"}
chosen_req_rate=${11:-inf}
random_range_ratio=${12:-0.8}
num_prompts_multiplier=${13:-10}

# The concurrency list is "x"-separated (e.g. 8x16x32); split it into an array.
IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list"

# Port passed to the benchmark client; overridable through the environment.
: "${ROUTER_PORT:=30000}"

printf '%s\n' "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}"

# One folder per ISL/OSL shape underneath the log path.
profile_folder="${log_path}/vllm_isl_${chosen_isl}_osl_${chosen_osl}"
mkdir -p "$profile_folder"

# Shared helpers (run_benchmark_serving is expected to come from here).
source "$(dirname "$0")/../../benchmark_lib.sh"

# Three directory levels above this script.
REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"

# Run the benchmark client once per requested concurrency level.
for max_concurrency in "${chosen_concurrencies[@]}"; do

  # The result name encodes concurrency, request rate, total GPU count and the
  # prefill ("ctx") / decode ("gen") GPU split.
  export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}"

  # Prompt count scales with concurrency but is never lower than 16.
  num_prompts=$(( max_concurrency * num_prompts_multiplier ))
  (( num_prompts >= 16 )) || num_prompts=16

  # Log the effective settings for this pass.
  printf '%s\n' \
    "profile_folder: $profile_folder" \
    "max_concurrency: $max_concurrency" \
    "chosen_req_rate: $chosen_req_rate" \
    "MODEL_PATH: $MODEL_PATH" \
    "ROUTER_PORT: $ROUTER_PORT" \
    "chosen_isl: $chosen_isl" \
    "chosen_osl: $chosen_osl" \
    "num_prompts: $num_prompts" \
    "export_file: $export_file"

  # run_benchmark_serving is not defined in this script (it is expected to be
  # supplied by the sourced benchmark_lib.sh). Its exit status is not checked,
  # so the loop moves on to the next level either way.
  run_benchmark_serving \
    --bench-serving-dir "$REPO_ROOT" \
    --model "$MODEL_PATH" \
    --port "$ROUTER_PORT" \
    --backend openai \
    --input-len "$chosen_isl" \
    --output-len "$chosen_osl" \
    --random-range-ratio "$random_range_ratio" \
    --num-prompts "$num_prompts" \
    --max-concurrency "$max_concurrency" \
    --result-filename "$export_file" \
    --result-dir /workspace/

  # Pause between passes (after the last one as well).
  printf '%s\n' \
    "-----------------------------------------" \
    "[BENCH] Cooldown: waiting 10s for idle KV block reaper..."
  sleep 10
done
98 changes: 98 additions & 0 deletions benchmarks/multi_node/vllm_disagg_utils/env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#!/bin/bash
# vLLM/Nixl environment setup for multi-node disaggregated serving.
#
# REQUIRED ENVIRONMENT VARIABLES:
#   IBDEVICES - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...)
#               Set by runner or auto-detected from hostname.
#
# UCX and RIXL paths (LD_LIBRARY_PATH, PATH) are set by setup_deps.sh, which is
# sourced at the top of server.sh before this file.

set -x

#######################################
# Print the comma-separated RDMA device list for a node.
# Known clusters are matched by short-hostname prefix; any other host is probed
# with ibv_devinfo, joining every "hca_id:" entry with commas.
# Arguments: $1 - short hostname (as printed by `hostname -s`)
# Outputs:   device list on stdout; empty when nothing could be determined
#######################################
_vllm_env_ibdevices_for_host() {
  case "$1" in
    GPU* | smci355-ccs-aus*)
      echo "ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7"
      ;;
    mia1*)
      echo "rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7"
      ;;
    *)
      ibv_devinfo 2>/dev/null | grep "hca_id:" | awk '{print $2}' | paste -sd','
      ;;
  esac
}

# IBDEVICES configuration
# Prefer IBDEVICES set by runner (runners/launch_mi355x-amds.sh)
# Fall back to hostname detection if not set (for direct script execution)
if [[ -z "$IBDEVICES" ]]; then
  NODENAME=$(hostname -s)
  DETECTED=$(_vllm_env_ibdevices_for_host "$NODENAME")
  if [[ -n "$DETECTED" ]]; then
    export IBDEVICES="$DETECTED"
    # Logged only on success, so the INFO line can no longer follow the
    # warning below with an empty IBDEVICES value.
    echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $NODENAME"
  else
    echo "WARNING: Unable to detect RDMA devices. Set IBDEVICES explicitly." >&2
  fi
else
  echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)"
fi

# Pick the device UCX should use, unless the caller has already chosen one.
if [[ -n "$UCX_NET_DEVICES" ]]; then
  echo "[INFO] Using UCX_NET_DEVICES=$UCX_NET_DEVICES (set by environment)"
else
  # Look for a benic1p1 interface for UCX TCP transport (per the original
  # author it maps to the ionic RDMA NIC). TCP device names (benicXp1) are used
  # instead of IB device names (ionic_X:1) because ud_verbs/ionic crashes in
  # ucp_request_memory_dereg (UCX bug with ionic provider).
  UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/benic1p1/ {print $2; exit}')
  if [[ -z "$UCX_NET_DEV" ]]; then
    # No such interface: fall back to port 1 of the first IBDEVICES entry.
    FIRST_IB=${IBDEVICES%%,*}
    [[ -z "$FIRST_IB" ]] || export UCX_NET_DEVICES="${FIRST_IB}:1"
  else
    export UCX_NET_DEVICES="$UCX_NET_DEV"
  fi
  echo "[INFO] Auto-set UCX_NET_DEVICES=$UCX_NET_DEVICES"
fi

# NCCL_SOCKET_IFNAME is always recomputed: field 5 of the first line of
# `ip route` that starts with "default" (the interface name in the usual
# "default via <gw> dev <if>" layout).
NCCL_SOCKET_IFNAME=$(ip route | awk '/^default/ {print $5; exit}')
export NCCL_SOCKET_IFNAME

# NCCL_IB_HCA follows IBDEVICES unless it is already set to a non-empty value.
: "${NCCL_IB_HCA:=$IBDEVICES}"
export NCCL_IB_HCA

# RoCEv2: use IPv4-mapped GID (index 1) for inter-node RDMA routing
: "${UCX_IB_GID_INDEX:=1}"
export UCX_IB_GID_INDEX

# QoS/DSCP configuration for lossless RoCEv2 fabric.
# Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname

#######################################
# Print the UCX_IB_TRAFFIC_CLASS value known for a cluster.
# Arguments: $1 - short hostname (as printed by `hostname -s`)
# Outputs:   traffic class on stdout
# Returns:   0 when the hostname prefix is known, 1 otherwise
#######################################
_vllm_env_traffic_class_for_host() {
  case "$1" in
    GPU* | smci355-ccs-aus*) echo 96 ;;
    mia1*) echo 104 ;;
    *) return 1 ;;
  esac
}

#######################################
# Export UCX_IB_TRAFFIC_CLASS from the hostname table and log the result.
# Shared by both fallback paths so they cannot drift apart.
# Globals:   NODENAME (written), UCX_IB_TRAFFIC_CLASS (written, exported)
# Outputs:   one [INFO] line on stdout when a value was found
# Returns:   0 when a value was exported, 1 when the host is unknown
#######################################
_vllm_env_qos_from_hostname() {
  local tc
  NODENAME=$(hostname -s)
  tc=$(_vllm_env_traffic_class_for_host "$NODENAME") || return 1
  export UCX_IB_TRAFFIC_CLASS="$tc"
  echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME"
}

if [[ -n "$UCX_IB_TRAFFIC_CLASS" ]]; then
  echo "[INFO] Using UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS (set by environment)"
elif command -v nicctl &> /dev/null; then
  # Query nicctl once and parse both values from the same output.
  NICCTL_QOS=$(nicctl show qos 2>/dev/null)
  # Last field of the "PFC no-drop priorities" line.
  ND_PRIO=$(awk '/PFC no-drop priorities/ {print $NF; exit}' <<< "$NICCTL_QOS")
  # Third field of the first "DSCP : ..." line whose last field is that priority.
  ND_DSCP=$(awk -v p="$ND_PRIO" '
    $1 == "DSCP" && $2 == ":" && $NF == p {
      print $3; exit
    }' <<< "$NICCTL_QOS")
  # Only a plain decimal DSCP is accepted: any other token (for example a
  # range) would be mis-evaluated or rejected by the arithmetic below, so it is
  # treated like missing data instead.
  if [[ "$ND_DSCP" =~ ^[0-9]+$ ]] && [[ -n "$ND_PRIO" ]]; then
    # DSCP occupies the upper six bits of the traffic-class byte, hence * 4.
    # 10# forces base 10 so a leading zero is not parsed as octal.
    export UCX_IB_TRAFFIC_CLASS=$(( 4 * 10#$ND_DSCP ))
    export UCX_IB_SL=$ND_PRIO
    echo "[INFO] Detected QoS from nicctl: UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS, UCX_IB_SL=$UCX_IB_SL"
  else
    echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection."
    if ! _vllm_env_qos_from_hostname; then
      # Previously this path ended silently for unknown hosts.
      echo "[INFO] Unable to detect QoS from nicctl or hostname. Skipping QoS configuration."
    fi
  fi
else
  if ! _vllm_env_qos_from_hostname; then
    echo "[INFO] No nicctl and unable to detect from hostname. Skipping QoS configuration."
  fi
fi

# Turn command tracing back off, then report the effective settings on one
# line (UCX_IB_TRAFFIC_CLASS shows "unset" when it is empty or undefined).
set +x
printf '%s\n' "[INFO] IBDEVICES=$IBDEVICES UCX_NET_DEVICES=$UCX_NET_DEVICES NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME UCX_IB_GID_INDEX=$UCX_IB_GID_INDEX UCX_IB_TRAFFIC_CLASS=${UCX_IB_TRAFFIC_CLASS:-unset}"
Loading