From 0bd45e575ce57859443ddf075460ecda9c24fe97 Mon Sep 17 00:00:00 2001 From: Adrian Cole Date: Mon, 9 Mar 2026 17:36:12 +0800 Subject: [PATCH] examples: migrate to vllm official CPU image Signed-off-by: Adrian Cole --- inference-platforms/vllm/Dockerfile | 59 --------------------- inference-platforms/vllm/README.md | 13 +++-- inference-platforms/vllm/docker-compose.yml | 14 ++++- 3 files changed, 20 insertions(+), 66 deletions(-) delete mode 100644 inference-platforms/vllm/Dockerfile diff --git a/inference-platforms/vllm/Dockerfile b/inference-platforms/vllm/Dockerfile deleted file mode 100644 index 7050de7..0000000 --- a/inference-platforms/vllm/Dockerfile +++ /dev/null @@ -1,59 +0,0 @@ -# Pytorch doesn't build on Alpine, so we use Ubuntu instead. -# We can't use the official Pytorch image because it is x86 only. -FROM ubuntu:24.04 - -ARG VLLM_VERSION=v0.9.0.1 - -# Package pre-reqs copied from https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.arm -ENV CCACHE_DIR=/root/.cache/ccache -ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache -RUN --mount=type=cache,target=/var/cache/apt \ - apt-get update -y \ - && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \ - && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ - && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 - -# Install CMake 3.26+, required for installation -RUN --mount=type=cache,target=/var/cache/apt \ - wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \ - && . 
/etc/os-release \ - && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null \ - && apt-get install -y cmake - -# Install Python and make a virtual environment -RUN apt-get install -y python3-dev python3-pip python3-setuptools python3-venv -RUN python3 -m venv /opt/venv -ENV PATH="/opt/venv/bin:$PATH" -RUN pip install --upgrade pip - -# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects. -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install py-cpuinfo # Use this to gather CPU info and optimize based on ARM Neoverse cores - -# Set LD_PRELOAD for tcmalloc on ARM -ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4" - -# Install from source -RUN git clone --depth 1 --single-branch --branch ${VLLM_VERSION} https://github.com/vllm-project/vllm.git -WORKDIR vllm - -# Use old dependencies -# See https://github.com/vllm-project/vllm/blob/main/examples/online_serving/opentelemetry/README.md -RUN pip install \ - 'opentelemetry-sdk>=1.26.0,<1.27.0' \ - 'opentelemetry-api>=1.26.0,<1.27.0' \ - 'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \ - 'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0' - -# CPU-only in order to run on MacOS. 
-# See https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html -RUN pip install "cmake>=3.26" wheel packaging ninja "setuptools-scm>=8" numpy -RUN pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu -RUN VLLM_TARGET_DEVICE=cpu python setup.py install - -ENV CHAT_MODEL=Qwen/Qwen3-0.6B -ENV OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://localhost:11434/v1/traces - -EXPOSE 8000 - -CMD vllm serve ${CHAT_MODEL} --max-model-len 4096 --enforce-eager --otlp-traces-endpoint=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT} diff --git a/inference-platforms/vllm/README.md b/inference-platforms/vllm/README.md index dae07f1..153fe57 100644 --- a/inference-platforms/vllm/README.md +++ b/inference-platforms/vllm/README.md @@ -1,7 +1,7 @@ # vLLM -This shows how to use the [vLLM OpenTelemetry POC][otel-poc] to export -OpenTelemetry traces from vLLM requests to its OpenAI compatible endpoint. +This shows how to export OpenTelemetry traces from [vLLM][vllm] requests to +its OpenAI compatible endpoint. ## Prerequisites @@ -28,13 +28,16 @@ Once vLLM is running, use [uv][uv] to make an OpenAI request via uv run --exact -q --env-file env.local ../chat.py ``` +Or, for the OpenAI Responses API +```bash +uv run --exact -q --env-file env.local ../chat.py --use-responses-api +``` + ## Notes * This does not yet support metrics, and there is no GitHub issue on it. * This does not yet support logs, and there is no GitHub issue on it. -* Until [this][openai-responses] resolves, don't use `--use-responses-api`. 
--- -[otel-poc]: https://github.com/vllm-project/vllm/blob/main/examples/online_serving/opentelemetry/README.md +[vllm]: https://docs.vllm.ai/en/latest/features/opentelemetry.html [uv]: https://docs.astral.sh/uv/getting-started/installation/ -[openai-responses]: https://github.com/vllm-project/vllm/issues/14721 diff --git a/inference-platforms/vllm/docker-compose.yml b/inference-platforms/vllm/docker-compose.yml index 8e1cad3..cc56fd4 100644 --- a/inference-platforms/vllm/docker-compose.yml +++ b/inference-platforms/vllm/docker-compose.yml @@ -1,8 +1,18 @@ services: vllm: container_name: vllm - build: - context: . + image: vllm/vllm-openai-cpu:v0.17.0 + entrypoint: [] + # Serve args adapted from the prior Dockerfile CMD (max-model-len raised from 4096 to 8192): + # https://github.com/elastic/observability-examples/blob/139feb0f/inference-platforms/vllm/Dockerfile#L59 + command: + - sh + - -c + - > + vllm serve $$CHAT_MODEL + --max-model-len=8192 + --enforce-eager + --otlp-traces-endpoint=$$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT env_file: - env.local ports: