From 1bd634e75c7d71e1a2e291b8c2c2b6b541b347ab Mon Sep 17 00:00:00 2001 From: ssaketh-ch Date: Thu, 26 Feb 2026 20:23:28 +0530 Subject: [PATCH 1/3] feat(llama3.1-8b): expose vLLM engine params as CLI arguments Hardcoded vLLM memory and scheduling parameters are now configurable via CLI flags with defaults that preserve existing behavior. No change to model outputs or sampling -- only memory allocation and scheduling. New flags: --gpu-memory-utilization --max-num-batched-tokens --max-num-seqs --enable-prefix-caching / --no-enable-prefix-caching --block-size --enforce-eager / --no-enforce-eager --enable-chunked-prefill / --no-enable-chunked-prefill --max-model-len --- language/llama3.1-8b/SUT_VLLM.py | 62 ++++++++++++++++++++++++++++---- language/llama3.1-8b/main.py | 61 +++++++++++++++++++++++++++++-- 2 files changed, 115 insertions(+), 8 deletions(-) diff --git a/language/llama3.1-8b/SUT_VLLM.py b/language/llama3.1-8b/SUT_VLLM.py index 94ee14abdd..11acaa9c1b 100644 --- a/language/llama3.1-8b/SUT_VLLM.py +++ b/language/llama3.1-8b/SUT_VLLM.py @@ -37,10 +37,18 @@ def __init__( # Set this to True *only for test accuracy runs* in case your prior # session was killed partway through workers=1, - tensor_parallel_size=8 + tensor_parallel_size=8, + gpu_memory_utilization=0.90, + max_num_batched_tokens=None, + max_num_seqs=256, + enable_prefix_caching=False, + block_size=16, + enforce_eager=False, + enable_chunked_prefill=None, + max_model_len=None, ): - self.model_path = model_path or f"meta-llama/Meta-Llama-3.1-8B-Instruct" + self.model_path = model_path or "meta-llama/Meta-Llama-3.1-8B-Instruct" if not batch_size: batch_size = 1 @@ -49,6 +57,16 @@ def __init__( self.dtype = dtype self.tensor_parallel_size = tensor_parallel_size + # Store vLLM engine config + self.gpu_memory_utilization = gpu_memory_utilization + self.max_num_batched_tokens = max_num_batched_tokens + self.max_num_seqs = max_num_seqs + self.enable_prefix_caching = enable_prefix_caching + self.block_size = 
block_size + self.enforce_eager = enforce_eager + self.enable_chunked_prefill = enable_chunked_prefill + self.max_model_len = max_model_len + if not torch.cuda.is_available(): assert False, "torch gpu is not available, exiting..." @@ -73,7 +91,7 @@ def __init__( "top_k": 1, "seed": 42, "max_tokens": 128, - "min_tokens": 1 + "min_tokens": 1, } self.sampling_params = SamplingParams(**gen_kwargs) # self.sampling_params.all_stop_token_ids.add(self.model.get_tokenizer().eos_token_id) @@ -162,6 +180,13 @@ def load_model(self): self.model_path, dtype=self.dtype, tensor_parallel_size=self.tensor_parallel_size, + gpu_memory_utilization=self.gpu_memory_utilization, + max_num_batched_tokens=self.max_num_batched_tokens, + max_num_seqs=self.max_num_seqs, + enable_prefix_caching=self.enable_prefix_caching, + block_size=self.block_size, + enforce_eager=self.enforce_eager, + enable_chunked_prefill=self.enable_chunked_prefill, ) log.info("Loaded model") @@ -203,7 +228,15 @@ def __init__( dataset_path=None, batch_size=None, workers=1, - tensor_parallel_size=8 + tensor_parallel_size=8, + gpu_memory_utilization=0.90, + max_num_batched_tokens=None, + max_num_seqs=256, + enable_prefix_caching=False, + block_size=16, + enforce_eager=False, + enable_chunked_prefill=None, + max_model_len=None, ): super().__init__( @@ -213,6 +246,14 @@ def __init__( dataset_path=dataset_path, workers=workers, tensor_parallel_size=tensor_parallel_size, + gpu_memory_utilization=gpu_memory_utilization, + max_num_batched_tokens=max_num_batched_tokens, + max_num_seqs=max_num_seqs, + enable_prefix_caching=enable_prefix_caching, + block_size=block_size, + enforce_eager=enforce_eager, + enable_chunked_prefill=enable_chunked_prefill, + max_model_len=max_model_len, ) self.request_id = 0 @@ -287,10 +328,19 @@ def stop(self): self.ft_response_thread.join() def load_model(self): - log.info("Loading model") + log.info("Loading model...") self.engine_args = AsyncEngineArgs( self.model_path, dtype=self.dtype, - 
tensor_parallel_size=self.tensor_parallel_size) + tensor_parallel_size=self.tensor_parallel_size, + gpu_memory_utilization=self.gpu_memory_utilization, + max_num_batched_tokens=self.max_num_batched_tokens, + max_num_seqs=self.max_num_seqs, + enable_prefix_caching=self.enable_prefix_caching, + max_model_len=self.max_model_len, + block_size=self.block_size, + enforce_eager=self.enforce_eager, + enable_chunked_prefill=self.enable_chunked_prefill, + ) self.model = AsyncLLMEngine.from_engine_args(self.engine_args) log.info("Loaded model") diff --git a/language/llama3.1-8b/main.py b/language/llama3.1-8b/main.py index a489e7d2bc..85da5effa5 100644 --- a/language/llama3.1-8b/main.py +++ b/language/llama3.1-8b/main.py @@ -104,7 +104,7 @@ def get_args(): "--tensor-parallel-size", type=int, default=8, - help="Number of workers to process queries", + help="Number of tensor parallel GPUs", ) parser.add_argument("--vllm", action="store_true", help="vllm mode") parser.add_argument( @@ -127,6 +127,55 @@ def get_args(): help="Model name(specified in llm server)", ) + parser.add_argument( + "--gpu-memory-utilization", + type=float, + default=0.90, + help="Fraction of GPU memory for vLLM to use (default: 0.90)", + ) + parser.add_argument( + "--max-num-batched-tokens", + type=int, + default=None, + help="Max tokens in a single batch (default: vLLM engine default)", + ) + parser.add_argument( + "--max-num-seqs", + type=int, + default=256, + help="Max concurrent sequences (default: 256)", + ) + parser.add_argument( + "--enable-prefix-caching", + action=argparse.BooleanOptionalAction, + default=False, + help="Enable/disable KV cache prefix reuse (default: disabled)", + ) + parser.add_argument( + "--block-size", + type=int, + default=16, + help="KV cache block size (default: 16)", + ) + parser.add_argument( + "--enforce-eager", + action=argparse.BooleanOptionalAction, + default=False, + help="Use eager mode instead of CUDA graphs (default: disabled)", + ) + parser.add_argument( + 
"--enable-chunked-prefill", + action=argparse.BooleanOptionalAction, + default=None, + help="Enable chunked prefill (default: vLLM engine default)", + ) + parser.add_argument( + "--max-model-len", + type=int, + default=None, + help="Max model context length (default: vLLM engine default)", + ) + args = parser.parse_args() return args @@ -177,7 +226,15 @@ def main(): dataset_path=args.dataset_path, total_sample_count=args.total_sample_count, workers=args.num_workers, - tensor_parallel_size=args.tensor_parallel_size + tensor_parallel_size=args.tensor_parallel_size, + gpu_memory_utilization=args.gpu_memory_utilization, + max_num_batched_tokens=args.max_num_batched_tokens, + max_num_seqs=args.max_num_seqs, + enable_prefix_caching=args.enable_prefix_caching, + block_size=args.block_size, + enforce_eager=args.enforce_eager, + enable_chunked_prefill=args.enable_chunked_prefill, + max_model_len=args.max_model_len ) else: sut = sut_cls( From 0247c29380846ff8f91d9029fe29e441bdd6c733 Mon Sep 17 00:00:00 2001 From: Sai Saketh Cherukuri <68653511+ssaketh-ch@users.noreply.github.com> Date: Wed, 1 Apr 2026 13:29:17 +0530 Subject: [PATCH 2/3] Remove prefix caching argument from main.py As requested, I removed the Prefix caching from main.py since the benchmark requires it to be False always avoiding any potential confusion --- language/llama3.1-8b/main.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/language/llama3.1-8b/main.py b/language/llama3.1-8b/main.py index 85da5effa5..f2ef1dc78c 100644 --- a/language/llama3.1-8b/main.py +++ b/language/llama3.1-8b/main.py @@ -145,12 +145,6 @@ def get_args(): default=256, help="Max concurrent sequences (default: 256)", ) - parser.add_argument( - "--enable-prefix-caching", - action=argparse.BooleanOptionalAction, - default=False, - help="Enable/disable KV cache prefix reuse (default: disabled)", - ) parser.add_argument( "--block-size", type=int, @@ -230,7 +224,6 @@ def main(): gpu_memory_utilization=args.gpu_memory_utilization, 
max_num_batched_tokens=args.max_num_batched_tokens, max_num_seqs=args.max_num_seqs, - enable_prefix_caching=args.enable_prefix_caching, block_size=args.block_size, enforce_eager=args.enforce_eager, enable_chunked_prefill=args.enable_chunked_prefill, From 1c7fa04a4da0ea8e1602e96bd96fdcc7ce073f9e Mon Sep 17 00:00:00 2001 From: Sai Saketh Cherukuri <68653511+ssaketh-ch@users.noreply.github.com> Date: Wed, 1 Apr 2026 13:31:42 +0530 Subject: [PATCH 3/3] Remove enable_prefix_caching parameter from SUT_VLLM Following the previous commit, which removed the flag from main.py, remove prefix caching as a tunable parameter from SUT_VLLM entirely, since the benchmark does not allow it. --- language/llama3.1-8b/SUT_VLLM.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/language/llama3.1-8b/SUT_VLLM.py b/language/llama3.1-8b/SUT_VLLM.py index 11acaa9c1b..fc7d2c26ff 100644 --- a/language/llama3.1-8b/SUT_VLLM.py +++ b/language/llama3.1-8b/SUT_VLLM.py @@ -41,7 +41,6 @@ def __init__( gpu_memory_utilization=0.90, max_num_batched_tokens=None, max_num_seqs=256, - enable_prefix_caching=False, block_size=16, enforce_eager=False, enable_chunked_prefill=None, @@ -61,7 +60,6 @@ def __init__( self.gpu_memory_utilization = gpu_memory_utilization self.max_num_batched_tokens = max_num_batched_tokens self.max_num_seqs = max_num_seqs - self.enable_prefix_caching = enable_prefix_caching self.block_size = block_size self.enforce_eager = enforce_eager self.enable_chunked_prefill = enable_chunked_prefill @@ -183,7 +181,6 @@ def load_model(self): gpu_memory_utilization=self.gpu_memory_utilization, max_num_batched_tokens=self.max_num_batched_tokens, max_num_seqs=self.max_num_seqs, - enable_prefix_caching=self.enable_prefix_caching, block_size=self.block_size, enforce_eager=self.enforce_eager, enable_chunked_prefill=self.enable_chunked_prefill, @@ -232,7 +229,6 @@ def __init__( gpu_memory_utilization=0.90, max_num_batched_tokens=None, max_num_seqs=256, - enable_prefix_caching=False, block_size=16, enforce_eager=False, enable_chunked_prefill=None, @@ 
-249,7 +245,6 @@ def __init__( gpu_memory_utilization=gpu_memory_utilization, max_num_batched_tokens=max_num_batched_tokens, max_num_seqs=max_num_seqs, - enable_prefix_caching=enable_prefix_caching, block_size=block_size, enforce_eager=enforce_eager, enable_chunked_prefill=enable_chunked_prefill, @@ -336,7 +331,6 @@ def load_model(self): gpu_memory_utilization=self.gpu_memory_utilization, max_num_batched_tokens=self.max_num_batched_tokens, max_num_seqs=self.max_num_seqs, - enable_prefix_caching=self.enable_prefix_caching, max_model_len=self.max_model_len, block_size=self.block_size, enforce_eager=self.enforce_eager,