From 0753567a6ad431b441bcb59a6139ae67c3246275 Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Thu, 28 May 2026 15:18:27 -0700
Subject: [PATCH 1/3] test: add E2E perf tests for benchmark CLI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drive the cyclopts `inference-endpoint` app in-process against the
existing MaxThroughputServer and VariableResponseServer stubs. Two
families:

* Roofline (`@pytest.mark.performance`, CI-skipped) — measures peak
  QPS for max_throughput and a concurrency sweep, and binary-searches
  for the largest 10k-multiple target_qps that Poisson load sustains.
  Reports numbers rather than asserting on them.

* Low-QPS correctness (`@pytest.mark.integration`, CI-included) — 5
  QPS Poisson against the realistic stub for 20s; asserts zero failed
  requests. Guards against keep-alive / idle-pool / slow-response
  regressions that only surface when connections sit idle longer than
  TCP_KEEPIDLE.

A conftest.py captures each parameterized case via a `record_result`
fixture and renders a unified summary table with host + CPU info at
end of session, so cross-machine roofline runs are easy to compare.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/performance/commands/__init__.py      |  16 ++
 tests/performance/commands/conftest.py      | 140 ++++++++++
 tests/performance/commands/test_e2e_perf.py | 276 ++++++++++++++++++++
 tests/performance/commands/utils.py         |  86 ++++++
 4 files changed, 518 insertions(+)
 create mode 100644 tests/performance/commands/__init__.py
 create mode 100644 tests/performance/commands/conftest.py
 create mode 100644 tests/performance/commands/test_e2e_perf.py
 create mode 100644 tests/performance/commands/utils.py

diff --git a/tests/performance/commands/__init__.py b/tests/performance/commands/__init__.py
new file mode 100644
index 000000000..b6cde4e30
--- /dev/null
+++ b/tests/performance/commands/__init__.py
@@ -0,0 +1,16 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""End-to-end performance tests that drive the CLI in-process."""
diff --git a/tests/performance/commands/conftest.py b/tests/performance/commands/conftest.py
new file mode 100644
index 000000000..ab003e0a1
--- /dev/null
+++ b/tests/performance/commands/conftest.py
@@ -0,0 +1,140 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Shared fixtures + summary table for E2E performance tests.
+
+Tests in this directory inject the ``record_result`` fixture and call it
+once per parameterization. After the session finishes,
+:func:`pytest_terminal_summary` prints a formatted table of every recorded
+row — handy when running roofline + low-QPS together.
+"""
+
+from __future__ import annotations
+
+import platform
+from typing import Any
+
+import pytest
+
+
+class _Collected:
+    """Module-level singleton holding rows recorded during the session."""
+
+    rows: list[dict[str, Any]] = []
+
+
+@pytest.fixture
+def record_result():
+    """Record a result row that will appear in the end-of-session summary.
+
+    Recognized fields: ``stream``, ``qps``, ``total``, ``elapsed``, ``failed``;
+    any that are omitted render as ``—``, and other keys are not shown.
+
+    Usage::
+
+        def test_foo(record_result):
+            record_result(
+                "max_throughput", stream=False,
+                qps=44426.0, total=2_000_000, elapsed=45.02, failed=0,
+            )
+    """
+
+    def _record(label: str, **fields: Any) -> None:
+        _Collected.rows.append({"label": label, **fields})
+
+    return _record
+
+
+# -----------------------------------------------------------------------------
+# Host info + table rendering
+# -----------------------------------------------------------------------------
+
+
+def _host_info() -> dict[str, str]:
+    cpu = "unknown"
+    cores = 0
+    try:
+        with open("/proc/cpuinfo") as f:
+            text = f.read()
+        for line in text.splitlines():
+            if line.startswith("model name"):
+                cpu = line.split(":", 1)[1].strip()
+                break
+        cores = text.count("processor\t:")
+    except OSError:
+        pass
+    return {
+        "host": platform.node(),
+        "arch": platform.machine(),
+        "cpu": cpu,
+        "cores": str(cores) if cores else "?",
+    }
+
+
+def _fmt_cell(value: Any, kind: str) -> str:
+    if value is None:
+        return "—"
+    if kind == "stream":
+        return "on " if value else "off"
+    if kind == "qps":
+        try:
+            v = float(value)
+        except (TypeError, ValueError):
+            return str(value)
+        return f"{v:>9,.0f}" if v >= 100 else f"{v:>9.2f}"
+    if kind == "total":
+        return f"{int(value):>10,}"
+    if kind == "elapsed":
+        return f"{float(value):>7.2f}s"
+    if kind == "failed":
+        return f"{int(value):>4}"
+    return str(value)
+
+
+def pytest_terminal_summary(terminalreporter, exitstatus, config) -> None:  # noqa: ARG001
+    rows = _Collected.rows
+    if not rows:
+        return
+
+    tr = terminalreporter
+    tr.write_sep("=", "E2E Performance Summary")
+
+    info = _host_info()
+    tr.write_line(
+        f"Host:  {info['host']}    Arch: {info['arch']}    Cores: {info['cores']}"
+    )
+    tr.write_line(f"CPU:   {info['cpu']}")
+    tr.write_line("")
+
+    headers = ["Test", "Stream", "QPS", "Total", "Elapsed", "Failed"]
+    kinds = ["label", "stream", "qps", "total", "elapsed", "failed"]
+    keys = ["label", "stream", "qps", "total", "elapsed", "failed"]
+
+    body = [
+        [_fmt_cell(r.get(k), kind) for k, kind in zip(keys, kinds, strict=False)]
+        for r in rows
+    ]
+
+    widths = [
+        max(len(h), max((len(row[i]) for row in body), default=0))
+        for i, h in enumerate(headers)
+    ]
+    fmt = "  ".join(f"{{:<{w}}}" for w in widths)
+    sep = "  ".join("-" * w for w in widths)
+
+    tr.write_line(fmt.format(*headers))
+    tr.write_line(sep)
+    for row in body:
+        tr.write_line(fmt.format(*row))
diff --git a/tests/performance/commands/test_e2e_perf.py b/tests/performance/commands/test_e2e_perf.py
new file mode 100644
index 000000000..178b512eb
--- /dev/null
+++ b/tests/performance/commands/test_e2e_perf.py
@@ -0,0 +1,276 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""End-to-end performance + correctness tests for the benchmark CLI.
+
+Two families of tests, both driving the cyclopts ``inference-endpoint``
+app in-process and parameterized on stream/non-stream:
+
+* **Roofline** against :class:`MaxThroughputServer` (instant pre-compiled
+  responses). Measures peak QPS for each load pattern
+  (``max_throughput``, ``concurrency``, ``poisson``). Prints numbers
+  rather than asserting on them. Marker: ``performance`` (CI-skipped).
+
+* **Low-QPS correctness** against :class:`VariableResponseServer`
+  (realistic TTFT + per-token TPOT). Asserts zero ``failed`` requests at
+  5 QPS for 20 s — guards keep-alive / idle-pool / slow-response
+  regressions. Marker: ``integration`` (CI-included).
+
+Results from every parametrized case are written via the
+``record_result`` fixture and rendered as a single summary table by
+``conftest.py`` after the session completes.
+
+Run::
+
+    # roofline only
+    pytest -xvs -m performance --no-cov tests/performance/commands/test_e2e_perf.py
+
+    # low-QPS only
+    pytest -xvs -m integration tests/performance/commands/test_e2e_perf.py
+
+    # both
+    pytest -xvs -m "performance or integration" --no-cov \\
+        tests/performance/commands/test_e2e_perf.py
+"""
+
+from __future__ import annotations
+
+import pytest
+from inference_endpoint.testing.max_throughput_server import MaxThroughputServer
+from inference_endpoint.testing.variable_throughput_server import VariableResponseServer
+
+from .utils import run_cli
+
+# =============================================================================
+# Roofline tests — MaxThroughputServer, every load pattern, stream + non-stream
+# =============================================================================
+
+
+@pytest.fixture(scope="module", params=[False, True], ids=["nonstream", "stream"])
+def max_tput_server(request):
+    """Stub server returning fixed pre-compiled responses (roofline target)."""
+    with MaxThroughputServer(
+        port=0,
+        num_workers=4,
+        stream=request.param,
+        stream_interval=10,
+        quiet=True,
+    ) as srv:
+        yield srv
+
+
+@pytest.mark.performance
+@pytest.mark.xdist_group(name="serial_performance")
+def test_max_throughput_roofline(max_tput_server, tmp_path, record_result):
+    """Offline burst — issue 2,000,000 queries at t=0."""
+    results = run_cli(
+        [
+            "offline",
+            "--load-pattern",
+            "max_throughput",
+            "--num-samples",
+            "2000000",
+        ],
+        tmp_path,
+        max_tput_server,
+    )
+    r = results["results"]
+    assert r["failed"] == 0, f"failed={r['failed']} (expected 0)"
+    record_result(
+        "max_throughput (2M burst)",
+        stream=max_tput_server.stream,
+        qps=r["qps"],
+        total=r["total"],
+        elapsed=r["elapsed_time"],
+        failed=r["failed"],
+    )
+    print(
+        f"\n  max_throughput  stream={max_tput_server.stream}: "
+        f"QPS={r['qps']:>10,.0f}  total={r['total']:>9,}  "
+        f"elapsed={r['elapsed_time']:6.2f}s"
+    )
+
+
+@pytest.mark.performance
+@pytest.mark.xdist_group(name="serial_performance")
+@pytest.mark.parametrize("concurrency", [1000, 4000, 16000])
+def test_concurrency_roofline(max_tput_server, concurrency, tmp_path, record_result):
+    """Online concurrency — N in-flight requests for fixed duration."""
+    results = run_cli(
+        [
+            "online",
+            "--load-pattern",
+            "concurrency",
+            "--concurrency",
+            str(concurrency),
+            "--duration",
+            "10s",
+            "--runtime.max-duration-ms",
+            "12000",
+            # Headroom so wall time, not sample count, is the limit.
+            "--num-samples",
+            "10000000",
+        ],
+        tmp_path,
+        max_tput_server,
+    )
+    r = results["results"]
+    assert r["failed"] == 0, f"failed={r['failed']} (expected 0)"
+    record_result(
+        f"concurrency c={concurrency:,}",
+        stream=max_tput_server.stream,
+        qps=r["qps"],
+        total=r["total"],
+        elapsed=r["elapsed_time"],
+        failed=r["failed"],
+    )
+    print(
+        f"\n  concurrency  c={concurrency:>5}  stream={max_tput_server.stream}: "
+        f"QPS={r['qps']:>10,.0f}  total={r['total']:>9,}  "
+        f"elapsed={r['elapsed_time']:6.2f}s"
+    )
+
+
+@pytest.mark.performance
+@pytest.mark.xdist_group(name="serial_performance")
+def test_poisson_binary_search_max_qps(max_tput_server, tmp_path, record_result):
+    """Binary search for the largest 10k-multiple target_qps the server sustains."""
+    STEP = 10_000
+    LO, HI = 10_000, 250_000  # search space (inclusive)
+    PASS_RATIO = 0.95  # achieved/target threshold for "sustained"
+
+    history: list[tuple[int, float, bool]] = []
+    lo, hi = LO // STEP, HI // STEP  # integer bounds in units of STEP
+    while lo < hi:
+        mid = (lo + hi + 1) // 2
+        target = mid * STEP
+        results = run_cli(
+            [
+                "online",
+                "--load-pattern",
+                "poisson",
+                "--target-qps",
+                str(target),
+                "--duration",
+                "10s",
+                "--runtime.max-duration-ms",
+                "12000",
+                # Headroom so wall time, not sample count, is the limit.
+                "--num-samples",
+                str(max(100_000, target * 15)),
+            ],
+            tmp_path / f"qps_{target}",
+            max_tput_server,
+        )
+        r = results["results"]
+        achieved = r["qps"]
+        sustained = achieved >= target * PASS_RATIO
+        history.append((target, achieved, sustained))
+        if sustained:
+            lo = mid
+        else:
+            hi = mid - 1
+
+    sustained_targets = [t for t, _, s in history if s]
+    max_sustained = max(sustained_targets) if sustained_targets else 0
+    record_result(
+        "poisson max_sustained",
+        stream=max_tput_server.stream,
+        qps=max_sustained,
+        failed=0,
+    )
+    print(
+        f"\n  poisson binary search  stream={max_tput_server.stream}: "
+        f"max_sustained={max_sustained:>7,} QPS  (PASS_RATIO={PASS_RATIO})"
+    )
+    for t, a, s in history:
+        print(f"    target={t:>7,}  achieved={a:>10,.0f}  sustained={s}")
+
+
+# =============================================================================
+# Low-QPS correctness — VariableResponseServer, 5 QPS, no network errors
+# =============================================================================
+
+
+@pytest.fixture(scope="module", params=[False, True], ids=["nonstream", "stream"])
+def variable_server(request):
+    """Realistic LLM stub: ~100-char responses, 50ms TTFT, 10ms/token TPOT."""
+    with VariableResponseServer(
+        host="127.0.0.1",
+        port=0,
+        output_len_mean=100,
+        output_len_spread=0.2,
+        inter_token_latency=10.0,
+        inter_token_spread=0.1,
+        first_chunk_latency=0.05,
+        first_chunk_spread=0.1,
+        stream=request.param,
+        stream_interval=20,
+        num_workers=2,
+        quiet=True,
+    ) as srv:
+        yield srv
+
+
+@pytest.mark.integration
+@pytest.mark.xdist_group(name="serial_performance")
+def test_low_qps_no_network_errors(variable_server, tmp_path, record_result):
+    """Sustain 5 QPS Poisson for 20 s — must complete with zero failed requests.
+
+    Low QPS spaces requests far enough apart that idle connections may
+    sit past ``TCP_KEEPIDLE`` (1 s in :class:`_SocketConfig`). A regression
+    in keep-alive probing, idle pool eviction, or slow-response handling
+    surfaces here as a non-zero ``failed`` count.
+    """
+    TARGET_QPS = 5
+    DURATION_S = 20
+
+    results = run_cli(
+        [
+            "online",
+            "--load-pattern",
+            "poisson",
+            "--target-qps",
+            str(TARGET_QPS),
+            "--duration",
+            f"{DURATION_S}s",
+            "--num-samples",
+            str(TARGET_QPS * DURATION_S),
+            # Low QPS needs neither many workers nor pre-warmed connections;
+            # using auto defaults makes startup slow and flaky against a stub
+            # that has TTFT + per-token delays.
+            "--workers",
+            "4",
+            "--client.warmup-connections",
+            "0",
+        ],
+        tmp_path,
+        variable_server,
+    )
+    r = results["results"]
+    assert r["failed"] == 0, f"failed={r['failed']} of {r['total']}"
+    record_result(
+        f"low_qps target={TARGET_QPS}",
+        stream=variable_server.stream,
+        qps=r["qps"],
+        total=r["total"],
+        elapsed=r["elapsed_time"],
+        failed=r["failed"],
+    )
+    print(
+        f"\n  low_qps  target={TARGET_QPS}  stream={variable_server.stream}: "
+        f"achieved={r['qps']:.2f} QPS  total={r['total']}  "
+        f"failed={r['failed']}  elapsed={r['elapsed_time']:.2f}s"
+    )
diff --git a/tests/performance/commands/utils.py b/tests/performance/commands/utils.py
new file mode 100644
index 000000000..f9713013c
--- /dev/null
+++ b/tests/performance/commands/utils.py
@@ -0,0 +1,86 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Shared helpers for E2E command tests."""
+
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+from typing import Protocol
+
+DATASET = str(
+    Path(__file__).resolve().parents[2] / "assets" / "datasets" / "ds_samples.jsonl"
+)
+
+
+class StubServer(Protocol):
+    """Duck-type for the stub servers (MaxThroughputServer, VariableResponseServer).
+
+    Named ``StubServer`` rather than ``TestServer`` so pytest doesn't try to
+    collect it as a test class (any class whose name starts with ``Test`` is
+    a collection candidate).
+    """
+
+    url: str
+    stream: bool
+
+
+def run_cli(
+    extra_args: list[str],
+    report_dir: Path,
+    server: StubServer,
+    *,
+    dataset: str = DATASET,
+) -> dict:
+    """Invoke ``inference-endpoint`` in-process via cyclopts; return results.json.
+
+    Client ``--streaming`` is coupled to the server's response mode: the stub
+    server always returns the same pre-compiled bytes (JSON or SSE),
+    regardless of what the client sent in the request body. Mismatched modes
+    produce ``DecodeError: JSON is malformed`` on every response.
+
+    Env overrides (useful in containers where cpu_affinity is restricted):
+        ROOFLINE_NUM_WORKERS  — override --workers (default: auto)
+        ROOFLINE_INIT_TIMEOUT — override --client.worker-initialization-timeout
+    """
+    from inference_endpoint.main import app
+
+    report_dir.mkdir(parents=True, exist_ok=True)
+    args = [
+        "benchmark",
+        *extra_args,
+        "--endpoints",
+        server.url,
+        "--streaming",
+        "on" if server.stream else "off",
+        "--model",
+        "max-tp",
+        "--dataset",
+        dataset,
+        "--report-dir",
+        str(report_dir),
+    ]
+    if nw := os.environ.get("ROOFLINE_NUM_WORKERS"):
+        args += ["--workers", nw]
+    if to := os.environ.get("ROOFLINE_INIT_TIMEOUT"):
+        args += ["--client.worker-initialization-timeout", to]
+    try:
+        app(args)
+    except SystemExit as e:
+        if e.code not in (None, 0):
+            raise
+    return json.loads((report_dir / "results.json").read_text())

From 31f9f2d50f757b5af5af5283978355358d8d5e07 Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Thu, 28 May 2026 15:40:56 -0700
Subject: [PATCH 2/3] test: address PR review comments

* poisson binary search: switch to while lo<=hi + best_sustained so
  the LO boundary is actually tested. Old loop could converge to
  lo==hi==LO/STEP without running LO and report max_sustained=0.
* low_qps: 2x num-samples headroom over TARGET_QPS*DURATION so wall
  time, not sample count, caps the run despite Poisson variance.
* conftest._host_info: use os.cpu_count() for cores (cross-platform);
  keep /proc/cpuinfo only for the CPU model string. Document why the
  OSError except is silent.
* conftest._fmt_cell: wrap int/float conversions in try/except so a
  bad recorded value can't crash the end-of-session summary table.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/performance/commands/conftest.py      | 40 +++++++++++----------
 tests/performance/commands/test_e2e_perf.py | 20 +++++++----
 2 files changed, 36 insertions(+), 24 deletions(-)

diff --git a/tests/performance/commands/conftest.py b/tests/performance/commands/conftest.py
index ab003e0a1..141077bce 100644
--- a/tests/performance/commands/conftest.py
+++ b/tests/performance/commands/conftest.py
@@ -23,6 +23,7 @@
 
 from __future__ import annotations
 
+import os
 import platform
 from typing import Any
 
@@ -64,17 +65,17 @@ def _record(label: str, **fields: Any) -> None:
 
 def _host_info() -> dict[str, str]:
     cpu = "unknown"
-    cores = 0
     try:
         with open("/proc/cpuinfo") as f:
-            text = f.read()
-        for line in text.splitlines():
-            if line.startswith("model name"):
-                cpu = line.split(":", 1)[1].strip()
-                break
-        cores = text.count("processor\t:")
+            for line in f:
+                if line.startswith("model name"):
+                    cpu = line.split(":", 1)[1].strip()
+                    break
     except OSError:
+        # CPU model is informational; missing /proc/cpuinfo (non-Linux,
+        # restricted container) just leaves it as "unknown".
         pass
+    cores = os.cpu_count() or 0
     return {
         "host": platform.node(),
         "arch": platform.machine(),
@@ -88,18 +89,21 @@ def _fmt_cell(value: Any, kind: str) -> str:
         return "—"
     if kind == "stream":
         return "on " if value else "off"
-    if kind == "qps":
-        try:
+    # float() first so numeric strings ("100.0") don't crash int().
+    # TypeError/ValueError fall back to str(value) so a bad value can't break
+    # the summary; int(float("inf")) raises OverflowError, still uncaught.
+    try:
+        if kind == "qps":
             v = float(value)
-        except (TypeError, ValueError):
-            return str(value)
-        return f"{v:>9,.0f}" if v >= 100 else f"{v:>9.2f}"
-    if kind == "total":
-        return f"{int(value):>10,}"
-    if kind == "elapsed":
-        return f"{float(value):>7.2f}s"
-    if kind == "failed":
-        return f"{int(value):>4}"
+            return f"{v:>9,.0f}" if v >= 100 else f"{v:>9.2f}"
+        if kind == "total":
+            return f"{int(float(value)):>10,}"
+        if kind == "elapsed":
+            return f"{float(value):>7.2f}s"
+        if kind == "failed":
+            return f"{int(float(value)):>4}"
+    except (TypeError, ValueError):
+        return str(value)
     return str(value)
 
 
diff --git a/tests/performance/commands/test_e2e_perf.py b/tests/performance/commands/test_e2e_perf.py
index 178b512eb..de0dcfb00 100644
--- a/tests/performance/commands/test_e2e_perf.py
+++ b/tests/performance/commands/test_e2e_perf.py
@@ -151,10 +151,15 @@ def test_poisson_binary_search_max_qps(max_tput_server, tmp_path, record_result)
     LO, HI = 10_000, 250_000  # search space (inclusive)
     PASS_RATIO = 0.95  # achieved/target threshold for "sustained"
 
+    # Standard binary search over candidate targets so the LO boundary is
+    # actually exercised: with ``while lo < hi`` we could converge to
+    # ``lo == hi == LO/STEP`` without ever issuing a run at LO, leaving
+    # ``max_sustained`` reported as 0 even if LO is sustainable.
     history: list[tuple[int, float, bool]] = []
+    best_sustained = 0
     lo, hi = LO // STEP, HI // STEP  # integer bounds in units of STEP
-    while lo < hi:
-        mid = (lo + hi + 1) // 2
+    while lo <= hi:
+        mid = (lo + hi) // 2
         target = mid * STEP
         results = run_cli(
             [
@@ -179,12 +184,12 @@ def test_poisson_binary_search_max_qps(max_tput_server, tmp_path, record_result)
         sustained = achieved >= target * PASS_RATIO
         history.append((target, achieved, sustained))
         if sustained:
-            lo = mid
+            best_sustained = target
+            lo = mid + 1
         else:
             hi = mid - 1
 
-    sustained_targets = [t for t, _, s in history if s]
-    max_sustained = max(sustained_targets) if sustained_targets else 0
+    max_sustained = best_sustained
     record_result(
         "poisson max_sustained",
         stream=max_tput_server.stream,
@@ -246,8 +251,11 @@ def test_low_qps_no_network_errors(variable_server, tmp_path, record_result):
             str(TARGET_QPS),
             "--duration",
             f"{DURATION_S}s",
+            # 2x the expected sample count so wall time (--duration) caps the
+            # run; without headroom, Poisson variance in arrivals can exhaust
+            # the samples and end the run before the full --duration window.
             "--num-samples",
-            str(TARGET_QPS * DURATION_S),
+            str(TARGET_QPS * DURATION_S * 2),
             # Low QPS needs neither many workers nor pre-warmed connections;
             # using auto defaults makes startup slow and flaky against a stub
             # that has TTFT + per-token delays.

From b66712f1f1a2940cc45183d5d3e11ec842d9e0ec Mon Sep 17 00:00:00 2001
From: Viraat Chandra <viraatc@nvidia.com>
Date: Thu, 28 May 2026 17:13:22 -0700
Subject: [PATCH 3/3] test: mark all E2E tests as performance (CI-skip)

Low-QPS correctness was marked integration, which would have made it run
in CI on every PR. These are long-running benchmark tests that aren't
meant to gate merges; marking them all performance keeps CI fast and
makes the file's policy uniform.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/performance/commands/test_e2e_perf.py | 26 ++++++++-------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/tests/performance/commands/test_e2e_perf.py b/tests/performance/commands/test_e2e_perf.py
index de0dcfb00..57a2e90d4 100644
--- a/tests/performance/commands/test_e2e_perf.py
+++ b/tests/performance/commands/test_e2e_perf.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""End-to-end performance + correctness tests for the benchmark CLI.
+"""End-to-end performance tests for the benchmark CLI.
 
 Two families of tests, both driving the cyclopts ``inference-endpoint``
 app in-process and parameterized on stream/non-stream:
@@ -21,28 +21,22 @@
 * **Roofline** against :class:`MaxThroughputServer` (instant pre-compiled
   responses). Measures peak QPS for each load pattern
   (``max_throughput``, ``concurrency``, ``poisson``). Prints numbers
-  rather than asserting on them. Marker: ``performance`` (CI-skipped).
+  rather than asserting on them.
 
 * **Low-QPS correctness** against :class:`VariableResponseServer`
   (realistic TTFT + per-token TPOT). Asserts zero ``failed`` requests at
   5 QPS for 20 s — guards keep-alive / idle-pool / slow-response
-  regressions. Marker: ``integration`` (CI-included).
+  regressions.
 
-Results from every parametrized case are written via the
-``record_result`` fixture and rendered as a single summary table by
-``conftest.py`` after the session completes.
+Both families are marked ``performance`` and are therefore CI-skipped;
+run them explicitly when investigating throughput regressions or
+benchmarking a new machine. Results from every parametrized case are
+written via the ``record_result`` fixture and rendered as a single
+summary table by ``conftest.py`` after the session completes.
 
 Run::
 
-    # roofline only
-    pytest -xvs -m performance --no-cov tests/performance/commands/test_e2e_perf.py
-
-    # low-QPS only
-    pytest -xvs -m integration tests/performance/commands/test_e2e_perf.py
-
-    # both
-    pytest -xvs -m "performance or integration" --no-cov \\
-        tests/performance/commands/test_e2e_perf.py
+    pytest -vs -m performance --no-cov tests/performance/commands/test_e2e_perf.py
 """
 
 from __future__ import annotations
@@ -229,7 +223,7 @@ def variable_server(request):
         yield srv
 
 
-@pytest.mark.integration
+@pytest.mark.performance
 @pytest.mark.xdist_group(name="serial_performance")
 def test_low_qps_no_network_errors(variable_server, tmp_path, record_result):
     """Sustain 5 QPS Poisson for 20 s — must complete with zero failed requests.