From c318066b91ed5f703f39a181d4301039817fb80c Mon Sep 17 00:00:00 2001
From: geonwoo
Date: Thu, 4 Dec 2025 22:26:23 +0000
Subject: [PATCH] benchmark trim

---
 vllm/benchmarks/serve.py | 188 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 182 insertions(+), 6 deletions(-)

diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index 2933f5d01b27..e8704fe23c75 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -289,6 +289,7 @@ def calculate_metrics(
     tokenizer: PreTrainedTokenizerBase,
     selected_percentiles: list[float],
     goodput_config_dict: dict[str, float],
+    is_trim: bool = False,
 ) -> tuple[BenchmarkMetrics, list[int]]:
     """Calculate the metrics for the benchmark.
 
@@ -323,11 +324,13 @@
                 # bundled together
                 # Note : this may inflate the output token count slightly
                 output_len = len(
-                    tokenizer(
-                        outputs[i].generated_text, add_special_tokens=False
-                    ).input_ids
-                )
+                    tokenizer(outputs[i].generated_text,
+                              add_special_tokens=False).input_ids)
+            itls += outputs[i].itl
             actual_output_lens.append(output_len)
+            if is_trim:
+                completed += 1
+                continue  # NOTE: if trimmed, only itl and output_len are needed
             total_input += input_requests[i].prompt_len
             tpot = 0
             if output_len > 1:
@@ -336,7 +339,6 @@
                 tpots.append(tpot)
             # Note: if output_len <= 1, we regard tpot as 0 for goodput
             all_tpots.append(tpot)
-            itls += outputs[i].itl
             ttfts.append(outputs[i].ttft)
             e2els.append(outputs[i].latency)
             completed += 1
@@ -509,6 +511,8 @@ async def benchmark(
     ramp_up_start_rps: int | None = None,
     ramp_up_end_rps: int | None = None,
     ready_check_timeout_sec: int = 600,
+    warmup_time: float = 0.0,
+    cooldown_time: float = 0.0,
 ):
     try:
         request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
@@ -874,6 +878,165 @@ def process_one_metric(
     print("=" * 50)
 
+    if warmup_time > 0.0 or cooldown_time > 0.0:
+        """
+        Build a trimmed copy of each RequestFuncOutput object.
+        RequestFuncOutput fields:
+         * generated_text -- copied as-is
+         * success        -- recomputed during filtering
+         * latency        -- accumulated during filtering
+         * output_tokens  -- accumulated during filtering
+         * ttft           -- copied during filtering
+         * itl            -- appended to during filtering
+         * tpot           -- unused here
+         * prompt_len     -- copied as-is
+         * error          -- copied as-is
+         * start_time     -- copied as-is
+        """
+        min_start = min(e.start_time for e in outputs)
+        max_end = max(e.start_time + e.latency for e in outputs)
+        num_counted_tokens = 0
+        effective_outputs: list[RequestFuncOutput] = []
+
+        warmup_sentinel = min_start + warmup_time
+        cooldown_sentinel = max_end - cooldown_time
+
+        for output in outputs:
+            # 1. Copy request base info
+            new_output = RequestFuncOutput()
+            new_output.start_time = output.start_time
+            new_output.error = output.error
+            new_output.success = False
+            new_output.generated_text = output.generated_text
+            new_output.prompt_len = output.prompt_len
+
+            # 2. Reset the fields that are re-accumulated below
+            new_output.output_tokens = 0
+            new_output.itl = []
+            new_output.latency = 0.0
+            new_output.ttft = -1.0  # -1.0 marks a first token outside the window; TTFT is then not collected
+
+            # 3. Filter by absolute time
+            current_absolute_time = output.start_time
+
+            # 3a. Advance to the first token's completion time
+            current_absolute_time += output.ttft
+
+            # Check whether the first token falls inside the time window
+            if warmup_sentinel <= current_absolute_time < cooldown_sentinel:
+                new_output.ttft = output.ttft
+                new_output.latency += output.ttft
+                new_output.output_tokens += 1
+
+            # 3b. Check the inter-token latencies
+            for itl in output.itl:
+                current_absolute_time += itl
+
+                # Past the end of the window: stop
+                if current_absolute_time >= cooldown_sentinel:
+                    break
+
+                # Collect if inside the window
+                if current_absolute_time >= warmup_sentinel:
+                    new_output.itl.append(itl)
+                    new_output.latency += itl
+                    new_output.output_tokens += 1
+
+            # 4. If any tokens were counted, keep the request
+            if new_output.output_tokens > 0:
+                new_output.success = True
+                effective_outputs.append(new_output)
+                num_counted_tokens += new_output.output_tokens
+
+        # Effective duration of the trimmed window (t_duration)
+        t_duration = cooldown_sentinel - warmup_sentinel
+
+        t_metrics, t_actual_output_lens = calculate_metrics(
+            input_requests=input_requests,
+            outputs=effective_outputs,
+            dur_s=t_duration,
+            tokenizer=tokenizer,
+            selected_percentiles=selected_percentiles,
+            goodput_config_dict=goodput_config_dict,
+            is_trim=True,
+        )
+        print("{s:{c}^{n}}".format(s="Serving Benchmark Result (warm-up/cool-down trimmed)", n=50, c="="))
+        print("{:<40} {:<10}".format("Warm-up time (s):", warmup_time))
+        print("{:<40} {:<10}".format("Cool-down time (s):", cooldown_time))
+        print("{:<40} {:<10}".format("Total tokens counted after trimming:", num_counted_tokens))
+        print("{:<40} {:<10.2f}".format("Benchmark duration (s):", t_duration))
+        if isinstance(metrics, BenchmarkMetrics):
+            print("{:<40} {:<10}".format("Total generated tokens:", t_metrics.total_output))
+        if isinstance(metrics, BenchmarkMetrics):
+            print(
+                "{:<40} {:<10.2f}".format(
+                    "Output token throughput (tok/s):", num_counted_tokens / t_duration
+                )
+            )
+
+        result_t = {
+            "duration": t_duration,
+            "completed": t_metrics.completed,
+            "total_input_tokens": t_metrics.total_input,
+            "total_output_tokens": t_metrics.total_output,
+            "request_throughput": t_metrics.request_throughput,
+            "request_goodput": t_metrics.request_goodput if goodput_config_dict else None,
+            "output_throughput": t_metrics.output_throughput,
+            "total_token_throughput": t_metrics.total_token_throughput,
+            "input_lens": [output.prompt_len for output in outputs],
+            "output_lens": t_actual_output_lens,
+            "ttfts": [output.ttft for output in effective_outputs],
+            "itls": [output.itl for output in effective_outputs],
+            "generated_texts": [output.generated_text for output in effective_outputs],
+            "errors": [output.error for output in outputs],
+            "max_output_tokens_per_s": t_metrics.max_output_tokens_per_s,
+            "max_concurrent_requests": t_metrics.max_concurrent_requests,
+        }
+
+        def process_one_metric_trim(
+            # E.g., "ttft"
+            metric_attribute_name: str,
+            # E.g., "TTFT"
+            metric_name: str,
+            # E.g., "Time to First Token"
+            metric_header: str,
+        ):
+            # This function prints and adds statistics of the specified
+            # metric.
+            if metric_attribute_name not in selected_percentile_metrics:
+                return
+            print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
+            print(
+                "{:<40} {:<10.2f}".format(
+                    f"Mean {metric_name} (ms):",
+                    getattr(t_metrics, f"mean_{metric_attribute_name}_ms"),
+                )
+            )
+            print(
+                "{:<40} {:<10.2f}".format(
+                    f"Median {metric_name} (ms):",
+                    getattr(t_metrics, f"median_{metric_attribute_name}_ms"),
+                )
+            )
+            result_t[f"mean_{metric_attribute_name}_ms"] = getattr(
+                t_metrics, f"mean_{metric_attribute_name}_ms"
+            )
+            result_t[f"median_{metric_attribute_name}_ms"] = getattr(
+                t_metrics, f"median_{metric_attribute_name}_ms"
+            )
+            result_t[f"std_{metric_attribute_name}_ms"] = getattr(
+                t_metrics, f"std_{metric_attribute_name}_ms"
+            )
+            for p, value in getattr(t_metrics, f"percentiles_{metric_attribute_name}_ms"):
+                p_word = str(int(p)) if int(p) == p else str(p)
+                print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
+                result_t[f"p{p_word}_{metric_attribute_name}_ms"] = value
+
+        if task_type == TaskType.GENERATION:
+            process_one_metric_trim("itl", "ITL", "Inter-token Latency")
+
+        print("=" * 50)
 
     if profile:
         print("Stopping profiler...")
         profile_input = RequestFuncInput(
@@ -1284,7 +1447,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "in seconds (default: 600 seconds / 10 minutes). If set to 0, "
         "the ready check will be skipped.",
     )
-
     parser.add_argument(
         "--extra-body",
         help="A JSON string representing extra body parameters to include "
        "in the request payload, e.g., '{\"repetition_penalty\": 1.05}'.",
         type=json.loads,
         default=None,
     )
+    parser.add_argument(
+        "--warmup-time",
+        type=float,
+        default=0.0,
+        help="Warm-up time in seconds to trim from the start of the run.",
+    )
+    parser.add_argument(
+        "--cooldown-time",
+        type=float,
+        default=0.0,
+        help="Cool-down time in seconds to trim from the end of the run.",
+    )
 
 
 def main(args: argparse.Namespace) -> dict[str, Any]:
@@ -1445,6 +1619,8 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
         ramp_up_start_rps=args.ramp_up_start_rps,
         ramp_up_end_rps=args.ramp_up_end_rps,
         ready_check_timeout_sec=args.ready_check_timeout_sec,
+        warmup_time=args.warmup_time,
+        cooldown_time=args.cooldown_time,
     )
 
     # Save config and results to json
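For reference, a minimal standalone sketch of the time-window trimming the patch implements: a token is counted only if its absolute completion time falls inside [min_start + warmup_time, max_end - cooldown_time). The Sample dataclass and count_tokens_in_window helper below are hypothetical stand-ins for vLLM's RequestFuncOutput and the filtering loop in the patch, so the rule can be exercised outside the benchmark harness.

from dataclasses import dataclass, field


@dataclass
class Sample:
    """Hypothetical stand-in for RequestFuncOutput (only the fields read here)."""
    start_time: float                                # absolute request start time (s)
    ttft: float                                      # time to first token (s)
    itl: list[float] = field(default_factory=list)   # inter-token latencies (s)

    @property
    def latency(self) -> float:
        return self.ttft + sum(self.itl)


def count_tokens_in_window(
    samples: list[Sample], warmup_time: float, cooldown_time: float
) -> tuple[int, float]:
    """Return (counted_tokens, effective_duration) for the trimmed window."""
    min_start = min(s.start_time for s in samples)
    max_end = max(s.start_time + s.latency for s in samples)
    warmup_sentinel = min_start + warmup_time
    cooldown_sentinel = max_end - cooldown_time

    counted = 0
    for s in samples:
        t = s.start_time + s.ttft
        # The first token counts only if it lands inside the window.
        if warmup_sentinel <= t < cooldown_sentinel:
            counted += 1
        for gap in s.itl:
            t += gap
            if t >= cooldown_sentinel:
                break  # remaining tokens fall in the cool-down window
            if t >= warmup_sentinel:
                counted += 1
    return counted, cooldown_sentinel - warmup_sentinel


if __name__ == "__main__":
    samples = [
        Sample(start_time=0.0, ttft=0.5, itl=[0.1] * 20),
        Sample(start_time=1.0, ttft=0.4, itl=[0.1] * 15),
    ]
    tokens, duration = count_tokens_in_window(samples, warmup_time=1.0, cooldown_time=0.5)
    print(f"tokens in trimmed window: {tokens}, trimmed duration: {duration:.2f} s")

Requests whose tokens all fall outside the window are dropped entirely, which mirrors the new_output.output_tokens > 0 check in the patch.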