From 0bd45e575ce57859443ddf075460ecda9c24fe97 Mon Sep 17 00:00:00 2001 From: Adrian Cole Date: Mon, 9 Mar 2026 17:36:12 +0800 Subject: [PATCH] examples: migrate to vllm official CPU image Signed-off-by: Adrian Cole --- inference-platforms/vllm/Dockerfile | 59 --------------------- inference-platforms/vllm/README.md | 13 +++-- inference-platforms/vllm/docker-compose.yml | 14 ++++- 3 files changed, 20 insertions(+), 66 deletions(-) delete mode 100644 inference-platforms/vllm/Dockerfile diff --git a/inference-platforms/vllm/Dockerfile b/inference-platforms/vllm/Dockerfile deleted file mode 100644 index 7050de7..0000000 --- a/inference-platforms/vllm/Dockerfile +++ /dev/null @@ -1,59 +0,0 @@ -# Pytorch doesn't build on Alpine, so we use Ubuntu instead. -# We can't use the official Pytorch image because it is x86 only. -FROM ubuntu:24.04 - -ARG VLLM_VERSION=v0.9.0.1 - -# Package pre-reqs copied from https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile.arm -ENV CCACHE_DIR=/root/.cache/ccache -ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache -RUN --mount=type=cache,target=/var/cache/apt \ - apt-get update -y \ - && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \ - && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ - && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 - -# Install CMake 3.26+, required for installation -RUN --mount=type=cache,target=/var/cache/apt \ - wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \ - && . 
/etc/os-release \ - && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null \ - && apt-get install -y cmake - -# Install Python and make a virtual environment -RUN apt-get install -y python3-dev python3-pip python3-setuptools python3-venv -RUN python3 -m venv /opt/venv -ENV PATH="/opt/venv/bin:$PATH" -RUN pip install --upgrade pip - -# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects. -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install py-cpuinfo # Use this to gather CPU info and optimize based on ARM Neoverse cores - -# Set LD_PRELOAD for tcmalloc on ARM -ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4" - -# Install from source -RUN git clone --depth 1 --single-branch --branch ${VLLM_VERSION} https://github.com/vllm-project/vllm.git -WORKDIR vllm - -# Use old dependencies -# See https://github.com/vllm-project/vllm/blob/main/examples/online_serving/opentelemetry/README.md -RUN pip install \ - 'opentelemetry-sdk>=1.26.0,<1.27.0' \ - 'opentelemetry-api>=1.26.0,<1.27.0' \ - 'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \ - 'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0' - -# CPU-only in order to run on MacOS. 
-# See https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html -RUN pip install "cmake>=3.26" wheel packaging ninja "setuptools-scm>=8" numpy -RUN pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu -RUN VLLM_TARGET_DEVICE=cpu python setup.py install - -ENV CHAT_MODEL=Qwen/Qwen3-0.6B -ENV OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://localhost:11434/v1/traces - -EXPOSE 8000 - -CMD vllm serve ${CHAT_MODEL} --max-model-len 4096 --enforce-eager --otlp-traces-endpoint=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT} diff --git a/inference-platforms/vllm/README.md b/inference-platforms/vllm/README.md index dae07f1..153fe57 100644 --- a/inference-platforms/vllm/README.md +++ b/inference-platforms/vllm/README.md @@ -1,7 +1,7 @@ # vLLM -This shows how to use the [vLLM OpenTelemetry POC][otel-poc] to export -OpenTelemetry traces from vLLM requests to its OpenAI compatible endpoint. +This shows how to export OpenTelemetry traces from [vLLM][vllm] requests to +its OpenAI compatible endpoint. ## Prerequisites @@ -28,13 +28,16 @@ Once vLLM is running, use [uv][uv] to make an OpenAI request via uv run --exact -q --env-file env.local ../chat.py ``` +Or, for the OpenAI Responses API +```bash +uv run --exact -q --env-file env.local ../chat.py --use-responses-api +``` + ## Notes * This does not yet support metrics, and there is no GitHub issue on it. * This does not yet support logs, and there is no GitHub issue on it. -* Until [this][openai-responses] resolves, don't use `--use-responses-api`. 
--- -[otel-poc]: https://github.com/vllm-project/vllm/blob/main/examples/online_serving/opentelemetry/README.md +[vllm]: https://docs.vllm.ai/en/latest/features/opentelemetry.html [uv]: https://docs.astral.sh/uv/getting-started/installation/ -[openai-responses]: https://github.com/vllm-project/vllm/issues/14721 diff --git a/inference-platforms/vllm/docker-compose.yml b/inference-platforms/vllm/docker-compose.yml index 8e1cad3..cc56fd4 100644 --- a/inference-platforms/vllm/docker-compose.yml +++ b/inference-platforms/vllm/docker-compose.yml @@ -1,8 +1,18 @@ services: vllm: container_name: vllm - build: - context: . + image: vllm/vllm-openai-cpu:v0.17.0 + entrypoint: [] + # Serve args adapted from the prior Dockerfile CMD (max-model-len raised from 4096 to 8192): + # https://github.com/elastic/observability-examples/blob/139feb0f/inference-platforms/vllm/Dockerfile#L59 + command: + - sh + - -c + - > + vllm serve $$CHAT_MODEL + --max-model-len=8192 + --enforce-eager + --otlp-traces-endpoint=$$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT env_file: - env.local ports: