Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 109 additions & 29 deletions .github/workflows/vllm-benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,120 @@ jobs:
runs-on: ubuntu-latest
outputs:
benchmark_matrix: ${{ steps.set-parameters.outputs.benchmark_matrix }}
vllm_commit: ${{ steps.get-vllm-commit.outputs.vllm_commit }}
steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Checkout vLLM repository
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
path: vllm
ref: ${{ inputs.vllm_branch || 'main' }}
fetch-depth: 100

- uses: actions/setup-python@v5
with:
python-version: '3.12'

- name: Get vLLM commit
id: get-vllm-commit
shell: bash
env:
HEAD_BRANCH: ${{ inputs.vllm_branch || 'main' }}
HEAD_SHA: ${{ inputs.vllm_commit || '' }}
RUNNERS: ${{ inputs.runners || 'h100,rocm,spr,gnr,b200,gaudi3' }}
run: |
set -eux

# Resolve which vLLM commit to benchmark and which platform images it needs.
#
# Reads (environment): HEAD_SHA, HEAD_BRANCH, RUNNERS, GITHUB_OUTPUT.
# Sets for the commit search that follows: DOCKER_IMAGE_PREFIX, CHECK_CUDA,
# CHECK_ROCM, CHECK_HPU, CHECK_CPU.

# A user-provided commit wins: publish it as the step output and stop here,
# skipping the Docker image lookup entirely.
if [[ -n "${HEAD_SHA}" ]]; then
  echo "Using user-provided vLLM commit: ${HEAD_SHA}"
  # Quote the redirection target so the path is never word-split or globbed.
  echo "vllm_commit=${HEAD_SHA}" >> "${GITHUB_OUTPUT}"
  exit 0
fi

# Images for main come from the post-merge repository; every other branch
# uses the test repository.
if [[ "${HEAD_BRANCH}" == "main" ]]; then
  DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo
else
  DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-test-repo
fi

# Platform flags default to "not required" and are switched on below according
# to the requested runners.
CHECK_CUDA=false
CHECK_ROCM=false
CHECK_HPU=false
CHECK_CPU=false

# Lower-case once (bash 4+ expansion, no subprocess) so the substring matches
# below are case-insensitive.
RUNNERS_LOWER="${RUNNERS,,}"

# h100 and b200 runners need the CUDA image.
if [[ "${RUNNERS_LOWER}" == *"h100"* || "${RUNNERS_LOWER}" == *"b200"* ]]; then
  CHECK_CUDA=true
fi
# rocm runners need the ROCm image.
if [[ "${RUNNERS_LOWER}" == *"rocm"* ]]; then
  CHECK_ROCM=true
fi
# Any runner name containing "gaudi" (e.g. gaudi3) needs the HPU image.
if [[ "${RUNNERS_LOWER}" == *"gaudi"* ]]; then
  CHECK_HPU=true
fi
# spr and gnr runners need the CPU image.
if [[ "${RUNNERS_LOWER}" == *"spr"* || "${RUNNERS_LOWER}" == *"gnr"* ]]; then
  CHECK_CPU=true
fi

echo "Checking platforms - CUDA: ${CHECK_CUDA}, ROCm: ${CHECK_ROCM}, HPU: ${CHECK_HPU}, CPU: ${CHECK_CPU}"

# Find the latest commit with Docker images available for all required platforms
pushd vllm
for i in {0..99}
do
HEAD_SHA=$(git rev-parse --verify HEAD~${i})
ALL_IMAGES_EXIST=true

# Check CUDA image
if [[ "${CHECK_CUDA}" == "true" ]]; then
CUDA_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}"
if ! docker manifest inspect "${CUDA_IMAGE}" > /dev/null 2>&1; then
echo "CUDA image not found for ${HEAD_SHA}"
ALL_IMAGES_EXIST=false
fi
fi

# Check ROCm image
if [[ "${CHECK_ROCM}" == "true" ]] && [[ "${ALL_IMAGES_EXIST}" == "true" ]]; then
ROCM_IMAGE="docker.io/rocm/vllm-ci:${HEAD_SHA}"
if ! docker manifest inspect "${ROCM_IMAGE}" > /dev/null 2>&1; then
echo "ROCm image not found for ${HEAD_SHA}"
ALL_IMAGES_EXIST=false
fi
fi

# Check HPU image
if [[ "${CHECK_HPU}" == "true" ]] && [[ "${ALL_IMAGES_EXIST}" == "true" ]]; then
HPU_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}-hpu"
if ! docker manifest inspect "${HPU_IMAGE}" > /dev/null 2>&1; then
echo "HPU image not found for ${HEAD_SHA}"
ALL_IMAGES_EXIST=false
fi
fi

# Check CPU image
if [[ "${CHECK_CPU}" == "true" ]] && [[ "${ALL_IMAGES_EXIST}" == "true" ]]; then
CPU_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}-cpu"
if ! docker manifest inspect "${CPU_IMAGE}" > /dev/null 2>&1; then
echo "CPU image not found for ${HEAD_SHA}"
ALL_IMAGES_EXIST=false
fi
fi

if [[ "${ALL_IMAGES_EXIST}" == "true" ]]; then
Copy link
Copy Markdown
Contributor

@huydhn huydhn Jan 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This ALL_IMAGES_EXIST check might not work in practice because, as far as I remember, the Docker images for different platforms are maintained differently:

  1. If the HPU build (not that important compared to CUDA) fails, it might take a long time to get fixed. And if I read the code here correctly, this would leave us stuck on an older commit (the last commit for which HPU was built successfully).
  2. They might be built on different schedules. The HPU and arm64 CPU builds, for example, used to be in this bucket, where they were built daily or so (IIRC). Again, the loop would need to go back to a much older commit to find a common one.

What do you think about returning a commit for each of these platforms independently? For example, one for CUDA, one for ROCm, etc. IMO, this should be a good middle ground because the perf results are used per platform anyway, i.e. there is no ROCm vs. CUDA comparison. With this, you can fold the vllm_commit into the output of generate_vllm_benchmark_matrix.py, and potentially move the Docker check logic here into the Python script too, which is usually easier to maintain than CI bash.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, we can definitely do that; let me modify this.

echo "Found vLLM commit with Docker images available for all required platforms: ${HEAD_SHA}"
echo "vllm_commit=${HEAD_SHA}" >> $GITHUB_OUTPUT
break
fi
done
popd

- name: Set parameters
id: set-parameters
shell: bash
Expand Down Expand Up @@ -217,39 +323,13 @@ jobs:
set -eux
docker image prune -a -f || true

- name: Check for last benchmark commit
working-directory: vllm-benchmarks
- name: Set vLLM commit
env:
HEAD_BRANCH: ${{ inputs.vllm_branch || 'main' }}
HEAD_SHA: ${{ inputs.vllm_commit || '' }}
MODELS: ${{ matrix.models }}
# Use the vLLM commit determined by set-parameters job (same for all matrix jobs)
HEAD_SHA: ${{ needs.set-parameters.outputs.vllm_commit }}
run: |
set -eux

if [[ -z "${HEAD_SHA}" ]]; then
pushd vllm
# Looking back the latest 100 commits is enough
for i in {0..99}
do
# Check if the image is there, if it doesn't then check an older one
# because the commit is too recent
HEAD_SHA=$(git rev-parse --verify HEAD~${i})
DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}"
# No Docker image available yet because the commit is too recent
if ! docker manifest inspect "${DOCKER_IMAGE}"; then
continue
fi
NOT_EXIST=0
S3_PATH="v3/vllm-project/vllm/${HEAD_BRANCH}/${HEAD_SHA}/${DEVICE_TYPE// /_}/benchmark_results_${MODELS//\//_}.json"
aws s3api head-object --bucket ossci-benchmarks --key ${S3_PATH} || NOT_EXIST=1
if [[ ${NOT_EXIST} == "1" ]]; then
echo "Found a vLLM commit ${HEAD_SHA} that hasn't been benchmarked yet"
break
fi
done
popd
fi

echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV

# Print the benchmark commit for reference
Expand Down
Loading