From dc27fa1d0ba99c3ffeadc1777b6fe18f9542096e Mon Sep 17 00:00:00 2001 From: Kedar Potdar <115327600+kedarpotdar-nv@users.noreply.github.com> Date: Wed, 25 Mar 2026 18:10:12 -0700 Subject: [PATCH 1/5] Update nvidia-master.yaml --- .github/configs/nvidia-master.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f4570fd2c..b98d8ad37 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3208,18 +3208,18 @@ minimaxm2.5-fp8-b200-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 256 } - isl: 1024 osl: 8192 search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 256 } gptoss-fp4-h100-vllm: image: vllm/vllm-openai:v0.15.1 From 9ee612f268f99be712ecdb23c5dedd0354c23885 Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Wed, 25 Mar 2026 18:18:54 -0700 Subject: [PATCH 2/5] vllm version bump --- .github/configs/nvidia-master.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index b98d8ad37..d990b3ae1 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3197,7 +3197,7 @@ gptoss-fp4-b200-vllm: - { tp: 8, conc-start: 4, conc-end: 4 } minimaxm2.5-fp8-b200-vllm: - image: vllm/vllm-openai:v0.17.0-cu130 + image: vllm/vllm-openai:v0.18.0-cu130 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: b200 @@ 
-3210,16 +3210,19 @@ minimaxm2.5-fp8-b200-vllm: search-space: - { tp: 2, conc-start: 4, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 256 } + - { tp: 4, ep: 4, conc-start: 32, conc-end: 256 } - isl: 1024 osl: 8192 search-space: - { tp: 2, conc-start: 4, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 256 } + - { tp: 4, ep: 4, conc-start: 32, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - { tp: 2, conc-start: 4, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 256 } + - { tp: 4, ep: 4, conc-start: 32, conc-end: 256 } gptoss-fp4-h100-vllm: image: vllm/vllm-openai:v0.15.1 From a27edeecd6821374f0bf80a1a50b15ca2132161c Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Wed, 25 Mar 2026 18:20:09 -0700 Subject: [PATCH 3/5] add perf changelog --- perf-changelog.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 03fb6e082..afce65264 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1068,3 +1068,10 @@ - "dsr1-fp8-h200-sglang: v0.5.9-cu129-amd64 → v0.5.9-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/943 +- config-keys: + - minimaxm2.5-fp8-b200-vllm + description: + - "Update vLLM image from v0.17.0 to v0.18.0 for MiniMax-M2.5 FP8 B200" + - "Raise tp2/tp4 conc-end from 64 to 256 for all seq-len configs; add tp4 ep4 (conc 64) and tp8 (conc 4-8) search-space entries for isl 1024 / osl 1024" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/947 + From a33dc21986e2f1f9fe0940d42df104ad439b60fd Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Fri, 27 Mar 2026 12:19:32 -0700 Subject: [PATCH 4/5] update search space and configs --- .github/configs/nvidia-master.yaml | 5 ++--- benchmarks/single_node/minimaxm2.5_fp8_b200.sh | 4 +++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index d990b3ae1..e9f7f7603 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3210,19 +3210,18 @@ minimaxm2.5-fp8-b200-vllm: search-space: 
- { tp: 2, conc-start: 4, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 256 } - - { tp: 4, ep: 4, conc-start: 32, conc-end: 256 } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 1024 osl: 8192 search-space: - { tp: 2, conc-start: 4, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 256 } - - { tp: 4, ep: 4, conc-start: 32, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - { tp: 2, conc-start: 4, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 256 } - - { tp: 4, ep: 4, conc-start: 32, conc-end: 256 } gptoss-fp4-h100-vllm: image: vllm/vllm-openai:v0.15.1 diff --git a/benchmarks/single_node/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/minimaxm2.5_fp8_b200.sh index 2e5aa4b24..066a870fd 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_b200.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_b200.sh @@ -24,8 +24,8 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} -export VLLM_USE_FLASHINFER_MOE_FP8=0 export VLLM_MOE_USE_DEEP_GEMM=0 +export VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl if [ "$EP_SIZE" -ge 1 ]; then EP=" --enable-expert-parallel" @@ -43,6 +43,8 @@ $EP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ --block-size=32 \ +--kv-cache-dtype fp8 \ +--stream-interval 20 --no-enable-prefix-caching \ --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! 
From 89acdf2ff13f6ca69f5794c8998e7b5c87636584 Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Fri, 27 Mar 2026 13:01:29 -0700 Subject: [PATCH 5/5] fix typo in VLLM_USE_DEEP_GEMM --- benchmarks/single_node/minimaxm2.5_fp8_b200.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/minimaxm2.5_fp8_b200.sh index 066a870fd..0b4151e17 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_b200.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_b200.sh @@ -24,7 +24,7 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} -export VLLM_MOE_USE_DEEP_GEMM=0 +export VLLM_USE_DEEP_GEMM=0 export VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl if [ "$EP_SIZE" -ge 1 ]; then