@@ -18,6 +18,7 @@
 import glob
 import os
 import re
+import shutil
 import socket
 import subprocess
 import time
6061 "H200" : "h200" ,
6162}
6263
64+ BENCH_SERVING_REPO = "https://github.com/kedarpotdar-nv/bench_serving.git"
65+ BENCH_SERVING_COMMIT = "f3ea022a5780de5d0babc5fffa53634e2023d28f"
66+ BENCH_SERVING_DIR = "/tmp/bench_serving"
67+
68+
69+ def ensure_bench_serving_repo () -> str :
70+ """Clone bench_serving repo if not already present. Returns path to benchmark_serving.py."""
71+ bench_script = os .path .join (BENCH_SERVING_DIR , "benchmark_serving.py" )
72+ if not os .path .exists (bench_script ):
73+ if os .path .exists (BENCH_SERVING_DIR ):
74+ shutil .rmtree (BENCH_SERVING_DIR )
75+ subprocess .check_call (
76+ ["git" , "clone" , "--depth" , "1" , BENCH_SERVING_REPO , BENCH_SERVING_DIR ]
77+ )
78+ subprocess .check_call (
79+ [
80+ "git" ,
81+ "-C" ,
82+ BENCH_SERVING_DIR ,
83+ "fetch" ,
84+ "--depth" ,
85+ "1" ,
86+ "origin" ,
87+ BENCH_SERVING_COMMIT ,
88+ ]
89+ )
90+ subprocess .check_call (["git" , "-C" , BENCH_SERVING_DIR , "checkout" , BENCH_SERVING_COMMIT ])
91+ return bench_script
92+
93+
6394DEFAULT_TIMEOUT = 5400
6495AGG_CONFIG_FOLDER = os .environ .get ("AGG_CONFIG_FOLDER" , "tests/scripts/perf-sanity/aggregated" )
6596DISAGG_CONFIG_FOLDER = os .environ .get (
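Because the checkout is pinned, a quick sanity check is to compare git's view of HEAD against the pin. A minimal sketch; checked_out_commit is a hypothetical helper, not part of this change:

import subprocess

def checked_out_commit(repo_dir: str) -> str:
    # Resolve the commit currently checked out in repo_dir.
    return subprocess.check_output(
        ["git", "-C", repo_dir, "rev-parse", "HEAD"], text=True
    ).strip()

# Usage, assuming ensure_bench_serving_repo() has already run:
# assert checked_out_commit(BENCH_SERVING_DIR) == BENCH_SERVING_COMMIT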
@@ -439,6 +470,7 @@ def __init__(
         self.trust_remote_code = client_config_data.get("trust_remote_code", True)
         self.model_path = ""
         self.dataset_file = client_config_data.get("dataset_file", "")
+        self.use_nv_sa_benchmark = client_config_data.get("use_nv_sa_benchmark", False)
         self.env_vars = env_vars
 
         # Generate default name if not provided
@@ -450,6 +482,48 @@ def to_cmd(self) -> List[str]:
         """Generate benchmark command."""
         model_dir = get_model_dir(self.model_name)
         self.model_path = model_dir if os.path.exists(model_dir) else self.model_name
+
+        if self.use_nv_sa_benchmark:
+            return self._to_sa_benchmark_cmd()
+        else:
+            return self._to_default_benchmark_cmd()
+
+    def _to_sa_benchmark_cmd(self) -> List[str]:
+        """Generate SA benchmark command (bench_serving repo)."""
+        bench_script = ensure_bench_serving_repo()
+        benchmark_cmd = [
+            "python",
+            bench_script,
+            "--model",
+            self.model_path,
+            "--dataset-name",
+            "random",
+            "--num-prompts",
+            str(self.concurrency * self.iterations),
+            "--max-concurrency",
+            str(self.concurrency),
+            "--ignore-eos",
+            "--random-input-len",
+            str(self.isl),
+            "--random-output-len",
+            str(self.osl),
+            "--random-range-ratio",
+            str(self.random_range_ratio),
+            "--save-result",
+            "--percentile-metrics",
+            "ttft,tpot,itl,e2el",
+        ]
+        if self.backend:
+            benchmark_cmd.extend(["--backend", self.backend])
+        if self.trust_remote_code:
+            benchmark_cmd.append("--trust-remote-code")
+        if self.use_chat_template:
+            benchmark_cmd.append("--use-chat-template")
+        # Note: bench_serving has no --non-streaming flag; streaming is backend-determined.
+        return benchmark_cmd
+
+    def _to_default_benchmark_cmd(self) -> List[str]:
+        """Generate default benchmark command (tensorrt_llm benchmark_serving)."""
         dataset_path = get_dataset_dir(self.dataset_file)
         benchmark_cmd = [
             "python",
@@ -513,6 +587,7 @@ def to_match_keys(self) -> List[str]:
             "s_backend",
             "b_use_chat_template",
             "b_streaming",
+            "b_use_nv_sa_benchmark",
         ]
 
     def to_db_data(self) -> dict:
@@ -529,6 +604,7 @@ def to_db_data(self) -> dict:
             "b_use_chat_template": self.use_chat_template,
             "b_streaming": self.streaming,
             "b_trust_remote_code": self.trust_remote_code,
+            "b_use_nv_sa_benchmark": self.use_nv_sa_benchmark,
             "s_client_log_link": "",
             "s_client_env_vars": self.env_vars,
         }
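Adding "b_use_nv_sa_benchmark" to both the DB record and the match keys means two otherwise-identical runs that differ only in benchmark harness are never compared against each other. A rough sketch of how the match keys might be applied downstream (hypothetical consumer code, not part of this diff):

db_data = client_config.to_db_data()
match_fields = {key: db_data.get(key) for key in client_config.to_match_keys()}
# match_fields now carries "b_use_nv_sa_benchmark", so SA-benchmark results
# are only matched against baselines produced by the same harness.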
@@ -1292,6 +1368,7 @@ def _parse_disagg_config_file(self, config_file_path: str, config_file: str):
         # For ctx_only: OSL is set to 1 and dataset_file is empty
         osl = 1 if benchmark_mode == "ctx_only" else benchmark.get("output_length", 1024)
         dataset_file = "" if benchmark_mode == "ctx_only" else benchmark.get("dataset_file", "")
+        use_nv_sa_benchmark = benchmark.get("use_nv_sa_benchmark", False)
 
         client_configs = []
         for concurrency in concurrency_values:
@@ -1305,6 +1382,7 @@ def _parse_disagg_config_file(self, config_file_path: str, config_file: str):
                 "use_chat_template": False,
                 "streaming": benchmark.get("streaming", True),
                 "dataset_file": dataset_file,
+                "use_nv_sa_benchmark": use_nv_sa_benchmark,
             }
             client_config = ClientConfig(
                 client_config_data,
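The flag is read straight off each benchmark entry in the disagg config, defaulting to False. A sketch of the parsed mapping with the new flag enabled; keys other than use_nv_sa_benchmark, output_length, dataset_file, and streaming are assumptions and may differ from the real config schema:

# Illustrative parsed benchmark entry (assumed shape, not copied from a real config):
benchmark = {
    "output_length": 1024,
    "dataset_file": "my_dataset.json",  # forced to "" when benchmark_mode == "ctx_only"
    "streaming": True,
    "use_nv_sa_benchmark": True,
}
use_nv_sa_benchmark = benchmark.get("use_nv_sa_benchmark", False)  # -> True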
@@ -1478,8 +1556,17 @@ def parse_metrics_from_output(output: str) -> Optional[Dict[str, float]]:
         for server_idx, client_configs in self.server_client_configs.items():
             self._perf_results[server_idx] = []
             server_outputs = outputs.get(server_idx, [])
-            for output in server_outputs:
+            for client_idx, output in enumerate(server_outputs):
                 metrics = parse_metrics_from_output(output)
+                # SA benchmark (bench_serving) doesn't report user_throughput.
+                # Use None as a sentinel to distinguish "not available" from an actual zero.
+                if (
+                    metrics
+                    and "user_throughput" not in metrics
+                    and client_idx < len(client_configs)
+                    and client_configs[client_idx].use_nv_sa_benchmark
+                ):
+                    metrics["user_throughput"] = None
                 self._perf_results[server_idx].append(metrics)
 
     def check_test_failure(self):
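Anything that later reads user_throughput has to handle the None sentinel explicitly, since formatting or arithmetic on None would raise a TypeError. A minimal sketch of a defensive consumer (hypothetical helper, not part of this diff):

def format_user_throughput(metrics: dict) -> str:
    # None means "not reported by this benchmark", distinct from a measured 0.0.
    value = metrics.get("user_throughput")
    if value is None:
        return "n/a"
    return f"{value:.2f}"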