docker image: b7/b8
vllm serving script:
export VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT=0
export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export PYTORCH_ALLOC_CONF="expandable_segments:True"
# Launch the OpenAI-compatible vLLM API server (Qwen3-32B, fp8, EAGLE3
# speculative decoding, TP=4 on XPU/Level-Zero).
# NOTE(review): trailing backslashes added — without them each flag line was
# parsed as a separate shell command, so the server started with no arguments.
# NOTE(review): '--gpu-memory-util' relies on argparse prefix matching of
# '--gpu-memory-utilization'; spell it out if the parser disallows abbreviations.
python3 -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_PATH" \
    --served-model-name "$MODEL_NAME" \
    --dtype=float16 \
    --enforce-eager \
    --port 8000 \
    --host 0.0.0.0 \
    --trust-remote-code \
    --gpu-memory-util=0.9 \
    --no-enable-prefix-caching \
    --max-num-batched-tokens 16384 \
    --disable-log-requests \
    --max-model-len 16384 \
    --block-size 64 \
    --quantization fp8 \
    --speculative_config='{"method": "eagle3", "model": "/llm/models/Qwen3-32B-Eagle", "num_speculative_tokens": 5}' \
    -tp=4
benchmark script:
# Random-prompt serving benchmark against the local vLLM server on port 8000.
# NOTE(review): trailing backslashes added — without them only 'vllm bench
# serve' ran and every flag line was executed as a separate command.
# NOTE(review): "$bsize" quoted so an unset/space-containing value cannot
# word-split; the canonical flag name is '--num-prompts' — '--num-prompt'
# depends on argparse prefix matching, confirm against the installed version.
vllm bench serve \
    --model "$MODEL_PATH" \
    --dataset-name random \
    --served-model-name "$MODEL_NAME" \
    --random-input-len=1024 \
    --random-output-len=1024 \
    --ignore-eos \
    --num-prompt "$bsize" \
    --trust-remote-code \
    --request-rate inf \
    --backend vllm \
    --port=8000
With `--num-prompt 1`, the TTFT exceeds 3 s.
With `--num-prompt 4`, the benchmark fails. The log is below:
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] WorkerProc hit an exception.
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] Traceback (most recent call last):
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 817, in worker_busy_loop
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] output = func(*args, **kwargs)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return func(*args, **kwargs)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 578, in sample_tokens
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return self.model_runner.sample_tokens(grammar_output)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return func(*args, **kwargs)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 3485, in sample_tokens
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ) = self._bookkeeping_sync(
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 2804, in _bookkeeping_sync
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] valid_sampled_token_ids = self._to_list(sampled_token_ids)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5766, in _to_list
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] self.transfer_event.synchronize()
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] RuntimeError: level_zero backend failed with error: 20 (UR_RESULT_ERROR_DEVICE_LOST)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] Traceback (most recent call last):
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 817, in worker_busy_loop
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] output = func(*args, **kwargs)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return func(*args, **kwargs)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 578, in sample_tokens
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return self.model_runner.sample_tokens(grammar_output)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return func(*args, **kwargs)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 3485, in sample_tokens
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ) = self._bookkeeping_sync(
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 2804, in _bookkeeping_sync
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] valid_sampled_token_ids = self._to_list(sampled_token_ids)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5766, in _to_list
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] self.transfer_event.synchronize()
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] RuntimeError: level_zero backend failed with error: 20 (UR_RESULT_ERROR_DEVICE_LOST)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822]
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] WorkerProc hit an exception.
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] Traceback (most recent call last):
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 817, in worker_busy_loop
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] output = func(*args, **kwargs)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return func(*args, **kwargs)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 578, in sample_tokens
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return self.model_runner.sample_tokens(grammar_output)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return func(*args, **kwargs)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 3485, in sample_tokens
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ) = self._bookkeeping_sync(
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 2804, in _bookkeeping_sync
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] valid_sampled_token_ids = self._to_list(sampled_token_ids)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5766, in _to_list
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] self.transfer_event.synchronize()
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] RuntimeError: level_zero backend failed with error: 20 (UR_RESULT_ERROR_DEVICE_LOST)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] Traceback (most recent call last):
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 817, in worker_busy_loop
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] output = func(*args, **kwargs)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return func(*args, **kwargs)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 578, in sample_tokens
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return self.model_runner.sample_tokens(grammar_output)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return func(*args, **kwargs)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 3485, in sample_tokens
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ) = self._bookkeeping_sync(
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 2804, in _bookkeeping_sync
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] valid_sampled_token_ids = self._to_list(sampled_token_ids)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5766, in _to_list
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] self.transfer_event.synchronize()
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] RuntimeError: level_zero backend failed with error: 20 (UR_RESULT_ERROR_DEVICE_LOST)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822]
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [dump_input.py:72] Dumping input data for V1 LLM engine (v0.14.1.dev0+gb17039bcc.d20260227) with config: model='/llm/models/Qwen3-32B', speculative_config=SpeculativeConfig(method='eagle3', model='/llm/models/Qwen3-32B-Eagle', num_spec_tokens=5), tokenizer='/llm/models/Qwen3-32B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=True, quantization=fp8, enforce_eager=True, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=xpu, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen3-32B, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': <CompilationMode.NONE: 0>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['all'], 'splitting_ops': [], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [16384], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.NONE: 0>, 'cudagraph_num_of_warmups': 0, 
'cudagraph_capture_sizes': None, 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': None, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': True}, 'local_cache_dir': None},
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [dump_input.py:79] Dumping scheduler output for model execution: SchedulerOutput(scheduled_new_reqs=[NewRequestData(req_id=cmpl-bench-38d5e741-0-0-a59e6d3a,prompt_token_ids_len=1024,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.0, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=True, max_tokens=1024, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, structured_outputs=None, extra_args=None),block_ids=([100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None)], scheduled_cached_reqs=CachedRequestData(req_ids=[],resumed_req_ids=set(),new_token_ids_lens=[],all_token_ids_lens={},new_block_ids=[],num_computed_tokens=[],num_output_tokens=[]), num_scheduled_tokens={cmpl-bench-38d5e741-0-0-a59e6d3a: 1024}, total_num_scheduled_tokens=1024, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, num_common_prefix_blocks=[0], finished_req_ids=[], free_encoder_mm_hashes=[], preempted_req_ids=[], has_structured_output_requests=false, pending_structured_output_tokens=false, num_invalid_spec_tokens=null, kv_connector_metadata=null, ec_connector_metadata=null)
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [dump_input.py:81] Dumping scheduler stats: SchedulerStats(num_running_reqs=1, num_waiting_reqs=0, step_counter=0, current_wave=0, kv_cache_usage=0.006831767719897575, prefix_cache_stats=PrefixCacheStats(reset=False, requests=0, queries=0, hits=0, preempted_requests=0, preempted_queries=0, preempted_hits=0), connector_prefix_cache_stats=None, kv_cache_eviction_events=[], spec_decoding_stats=None, kv_connector_stats=None, waiting_lora_adapters={}, running_lora_adapters={}, cudagraph_stats=None, perf_stats=None)
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] EngineCore encountered a fatal error.
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] Traceback (most recent call last):
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 929, in run_engine_core
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] engine_core.run_busy_loop()
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 956, in run_busy_loop
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] self._process_engine_step()
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 989, in _process_engine_step
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] outputs, model_executed = self.step_fn()
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] ^^^^^^^^^^^^^^
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 390, in step
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] model_output = self.model_executor.sample_tokens(grammar_output)
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 269, in sample_tokens
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] return self.collective_rpc(
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] ^^^^^^^^^^^^^^^^^^^^
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 359, in collective_rpc
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] return aggregate(get_response())
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] ^^^^^^^^^^^^^^
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 342, in get_response
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] raise RuntimeError(
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] RuntimeError: Worker failed with error 'level_zero backend failed with error: 20 (UR_RESULT_ERROR_DEVICE_LOST)', please check the stack trace above for the root cause
docker image: b7/b8
vllm serving script:
export VLLM_OFFLOAD_WEIGHTS_BEFORE_QUANT=0
export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export PYTORCH_ALLOC_CONF="expandable_segments:True"
# Launch the OpenAI-compatible vLLM API server (Qwen3-32B, fp8, EAGLE3
# speculative decoding, TP=4 on XPU/Level-Zero).
# NOTE(review): trailing backslashes added — without them each flag line was
# parsed as a separate shell command, so the server started with no arguments.
# NOTE(review): '--gpu-memory-util' relies on argparse prefix matching of
# '--gpu-memory-utilization'; spell it out if the parser disallows abbreviations.
python3 -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_PATH" \
    --served-model-name "$MODEL_NAME" \
    --dtype=float16 \
    --enforce-eager \
    --port 8000 \
    --host 0.0.0.0 \
    --trust-remote-code \
    --gpu-memory-util=0.9 \
    --no-enable-prefix-caching \
    --max-num-batched-tokens 16384 \
    --disable-log-requests \
    --max-model-len 16384 \
    --block-size 64 \
    --quantization fp8 \
    --speculative_config='{"method": "eagle3", "model": "/llm/models/Qwen3-32B-Eagle", "num_speculative_tokens": 5}' \
    -tp=4
benchmark script:
# Random-prompt serving benchmark against the local vLLM server on port 8000.
# NOTE(review): trailing backslashes added — without them only 'vllm bench
# serve' ran and every flag line was executed as a separate command.
# NOTE(review): "$bsize" quoted so an unset/space-containing value cannot
# word-split; the canonical flag name is '--num-prompts' — '--num-prompt'
# depends on argparse prefix matching, confirm against the installed version.
vllm bench serve \
    --model "$MODEL_PATH" \
    --dataset-name random \
    --served-model-name "$MODEL_NAME" \
    --random-input-len=1024 \
    --random-output-len=1024 \
    --ignore-eos \
    --num-prompt "$bsize" \
    --trust-remote-code \
    --request-rate inf \
    --backend vllm \
    --port=8000
With `--num-prompt 1`, the TTFT exceeds 3 s.
With `--num-prompt 4`, the benchmark fails. The log is below:
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] WorkerProc hit an exception.
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] Traceback (most recent call last):
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 817, in worker_busy_loop
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] output = func(*args, **kwargs)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return func(*args, **kwargs)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 578, in sample_tokens
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return self.model_runner.sample_tokens(grammar_output)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return func(*args, **kwargs)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 3485, in sample_tokens
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ) = self._bookkeeping_sync(
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 2804, in _bookkeeping_sync
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] valid_sampled_token_ids = self._to_list(sampled_token_ids)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5766, in _to_list
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] self.transfer_event.synchronize()
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] RuntimeError: level_zero backend failed with error: 20 (UR_RESULT_ERROR_DEVICE_LOST)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] Traceback (most recent call last):
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 817, in worker_busy_loop
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] output = func(*args, **kwargs)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return func(*args, **kwargs)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 578, in sample_tokens
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return self.model_runner.sample_tokens(grammar_output)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return func(*args, **kwargs)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 3485, in sample_tokens
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ) = self._bookkeeping_sync(
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 2804, in _bookkeeping_sync
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] valid_sampled_token_ids = self._to_list(sampled_token_ids)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5766, in _to_list
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] self.transfer_event.synchronize()
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] RuntimeError: level_zero backend failed with error: 20 (UR_RESULT_ERROR_DEVICE_LOST)
�[0;36m(Worker_TP2 pid=6161)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822]
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] WorkerProc hit an exception.
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] Traceback (most recent call last):
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 817, in worker_busy_loop
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] output = func(*args, **kwargs)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return func(*args, **kwargs)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 578, in sample_tokens
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return self.model_runner.sample_tokens(grammar_output)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return func(*args, **kwargs)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 3485, in sample_tokens
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ) = self._bookkeeping_sync(
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 2804, in _bookkeeping_sync
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] valid_sampled_token_ids = self._to_list(sampled_token_ids)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5766, in _to_list
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] self.transfer_event.synchronize()
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] RuntimeError: level_zero backend failed with error: 20 (UR_RESULT_ERROR_DEVICE_LOST)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] Traceback (most recent call last):
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 817, in worker_busy_loop
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] output = func(*args, **kwargs)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return func(*args, **kwargs)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 578, in sample_tokens
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return self.model_runner.sample_tokens(grammar_output)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] return func(*args, **kwargs)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 3485, in sample_tokens
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ) = self._bookkeeping_sync(
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 2804, in _bookkeeping_sync
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] valid_sampled_token_ids = self._to_list(sampled_token_ids)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5766, in _to_list
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] self.transfer_event.synchronize()
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822] RuntimeError: level_zero backend failed with error: 20 (UR_RESULT_ERROR_DEVICE_LOST)
�[0;36m(Worker_TP0 pid=6159)�[0;0m ERROR 03-03 13:32:49 [multiproc_executor.py:822]
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [dump_input.py:72] Dumping input data for V1 LLM engine (v0.14.1.dev0+gb17039bcc.d20260227) with config: model='/llm/models/Qwen3-32B', speculative_config=SpeculativeConfig(method='eagle3', model='/llm/models/Qwen3-32B-Eagle', num_spec_tokens=5), tokenizer='/llm/models/Qwen3-32B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=True, quantization=fp8, enforce_eager=True, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=xpu, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen3-32B, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': <CompilationMode.NONE: 0>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['all'], 'splitting_ops': [], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [16384], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.NONE: 0>, 'cudagraph_num_of_warmups': 0, 
'cudagraph_capture_sizes': None, 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': None, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': True}, 'local_cache_dir': None},
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [dump_input.py:79] Dumping scheduler output for model execution: SchedulerOutput(scheduled_new_reqs=[NewRequestData(req_id=cmpl-bench-38d5e741-0-0-a59e6d3a,prompt_token_ids_len=1024,prefill_token_ids_len=None,mm_features=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.0, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=True, max_tokens=1024, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, structured_outputs=None, extra_args=None),block_ids=([100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115],),num_computed_tokens=0,lora_request=None,prompt_embeds_shape=None)], scheduled_cached_reqs=CachedRequestData(req_ids=[],resumed_req_ids=set(),new_token_ids_lens=[],all_token_ids_lens={},new_block_ids=[],num_computed_tokens=[],num_output_tokens=[]), num_scheduled_tokens={cmpl-bench-38d5e741-0-0-a59e6d3a: 1024}, total_num_scheduled_tokens=1024, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, num_common_prefix_blocks=[0], finished_req_ids=[], free_encoder_mm_hashes=[], preempted_req_ids=[], has_structured_output_requests=false, pending_structured_output_tokens=false, num_invalid_spec_tokens=null, kv_connector_metadata=null, ec_connector_metadata=null)
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [dump_input.py:81] Dumping scheduler stats: SchedulerStats(num_running_reqs=1, num_waiting_reqs=0, step_counter=0, current_wave=0, kv_cache_usage=0.006831767719897575, prefix_cache_stats=PrefixCacheStats(reset=False, requests=0, queries=0, hits=0, preempted_requests=0, preempted_queries=0, preempted_hits=0), connector_prefix_cache_stats=None, kv_cache_eviction_events=[], spec_decoding_stats=None, kv_connector_stats=None, waiting_lora_adapters={}, running_lora_adapters={}, cudagraph_stats=None, perf_stats=None)
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] EngineCore encountered a fatal error.
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] Traceback (most recent call last):
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 929, in run_engine_core
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] engine_core.run_busy_loop()
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 956, in run_busy_loop
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] self._process_engine_step()
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 989, in _process_engine_step
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] outputs, model_executed = self.step_fn()
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] ^^^^^^^^^^^^^^
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 390, in step
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] model_output = self.model_executor.sample_tokens(grammar_output)
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 269, in sample_tokens
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] return self.collective_rpc(
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] ^^^^^^^^^^^^^^^^^^^^
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 359, in collective_rpc
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] return aggregate(get_response())
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] ^^^^^^^^^^^^^^
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 342, in get_response
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] raise RuntimeError(
�[0;36m(EngineCore_DP0 pid=6007)�[0;0m ERROR 03-03 13:32:49 [core.py:938] RuntimeError: Worker failed with error 'level_zero backend failed with error: 20 (UR_RESULT_ERROR_DEVICE_LOST)', please check the stack trace above for the root cause