Address CodeRabbitAI review comments for SA benchmark

chenfeiz0326 · chenfeiz0326 · commit 928d56d9f17f · 2026-04-14T02:52:28.000Z
- Pin the bench_serving repo to commit f3ea022a (fetched via a shallow
  clone) for supply-chain security (addresses S108/S603/S607 ruff
  warnings)
- Emit --trust-remote-code, --use-chat-template, and --backend in
  _to_sa_benchmark_cmd() only when the corresponding attribute is set,
  to match _to_default_benchmark_cmd() style
- Use None instead of 0.0 as user_throughput placeholder to distinguish
  "metric not available" from actual zero throughput
diff --git a/tests/integration/defs/perf/test_perf_sanity.py b/tests/integration/defs/perf/test_perf_sanity.py
@@ -62,6 +62,7 @@
 }
 
 BENCH_SERVING_REPO = "https://github.com/kedarpotdar-nv/bench_serving.git"
+BENCH_SERVING_COMMIT = "f3ea022a5780de5d0babc5fffa53634e2023d28f"
 BENCH_SERVING_DIR = "/tmp/bench_serving"
 
 
@@ -71,7 +72,22 @@ def ensure_bench_serving_repo() -> str:
     if not os.path.exists(bench_script):
         if os.path.exists(BENCH_SERVING_DIR):
             shutil.rmtree(BENCH_SERVING_DIR)
-        subprocess.check_call(["git", "clone", BENCH_SERVING_REPO, BENCH_SERVING_DIR])
+        subprocess.check_call(
+            ["git", "clone", "--depth", "1", BENCH_SERVING_REPO, BENCH_SERVING_DIR]
+        )
+        subprocess.check_call(
+            [
+                "git",
+                "-C",
+                BENCH_SERVING_DIR,
+                "fetch",
+                "--depth",
+                "1",
+                "origin",
+                BENCH_SERVING_COMMIT,
+            ]
+        )
+        subprocess.check_call(["git", "-C", BENCH_SERVING_DIR, "checkout", BENCH_SERVING_COMMIT])
     return bench_script
 
 
@@ -486,7 +502,6 @@ def _to_sa_benchmark_cmd(self) -> List[str]:
             str(self.concurrency * self.iterations),
             "--max-concurrency",
             str(self.concurrency),
-            "--trust-remote-code",
             "--ignore-eos",
             "--random-input-len",
             str(self.isl),
@@ -495,10 +510,16 @@ def _to_sa_benchmark_cmd(self) -> List[str]:
             "--random-range-ratio",
             str(self.random_range_ratio),
             "--save-result",
-            "--use-chat-template",
             "--percentile-metrics",
             "ttft,tpot,itl,e2el",
         ]
+        if self.backend:
+            benchmark_cmd.extend(["--backend", self.backend])
+        if self.trust_remote_code:
+            benchmark_cmd.append("--trust-remote-code")
+        if self.use_chat_template:
+            benchmark_cmd.append("--use-chat-template")
+        # Note: bench_serving has no --non-streaming flag; streaming is backend-determined
         return benchmark_cmd
 
     def _to_default_benchmark_cmd(self) -> List[str]:
@@ -1537,14 +1558,15 @@ def parse_metrics_from_output(output: str) -> Optional[Dict[str, float]]:
             server_outputs = outputs.get(server_idx, [])
             for client_idx, output in enumerate(server_outputs):
                 metrics = parse_metrics_from_output(output)
-                # SA benchmark (bench_serving) doesn't output user_throughput
+                # SA benchmark (bench_serving) doesn't report user_throughput.
+                # Use None as sentinel to distinguish "not available" from actual zero.
                 if (
                     metrics
                     and "user_throughput" not in metrics
                     and client_idx < len(client_configs)
                     and client_configs[client_idx].use_nv_sa_benchmark
                 ):
-                    metrics["user_throughput"] = 0.0
+                    metrics["user_throughput"] = None
                 self._perf_results[server_idx].append(metrics)
 
     def check_test_failure(self):