From c28e6ce7172bfb146f3ea6c43b36fc15043bb56b Mon Sep 17 00:00:00 2001
From: Tin-Yin Lai <tinyinl@nvidia.com>
Date: Mon, 22 Jun 2026 18:18:20 -0700
Subject: [PATCH 1/3] docs(compliance): output-caching audit (MLPerf TEST04)
 design + examples

Design plan (docs/compliance_audit_plan.md, incl. an ASCII program-flow
diagram showing every decision gate and its exit code), the compliance-module
entry in AGENTS.md, and the WAN2.2 Offline/SingleStream submission example
configs (perf + accuracy + output_caching_test audit in one from-config run).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 AGENTS.md                                     |  49 +-
 docs/compliance_audit_plan.md                 | 728 ++++++++++++++++++
 .../offline_wan22_submission.yaml             |  70 ++
 .../single_stream_wan22_submission.yaml       |  71 ++
 4 files changed, 902 insertions(+), 16 deletions(-)
 create mode 100644 docs/compliance_audit_plan.md
 create mode 100644 examples/09_Wan22_VideoGen_Example/offline_wan22_submission.yaml
 create mode 100644 examples/09_Wan22_VideoGen_Example/single_stream_wan22_submission.yaml

diff --git a/AGENTS.md b/AGENTS.md
index 4803c99a7..6dbbd1a9e 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -85,19 +85,20 @@ Dataset Manager --> Load Generator --> Endpoint Client --> External Endpoint
 
 ### Key Components
 
-| Component              | Location                                                          | Purpose                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
-| ---------------------- | ----------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| **Load Generator**     | `src/inference_endpoint/load_generator/`                          | Central orchestrator: `BenchmarkSession` owns the lifecycle, `PhaseIssuer` drives per-phase execution, `TimedIssueStrategy`/`BurstStrategy`/`ConcurrencyStrategy` control timing. Emits `ERROR` before `COMPLETE` for failed queries (metrics aggregator depends on this order).                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
-| **Endpoint Client**    | `src/inference_endpoint/endpoint_client/`                         | Multi-process HTTP workers communicating via ZMQ IPC. `HTTPEndpointClient` is the main entry point                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
-| **Dataset Manager**    | `src/inference_endpoint/dataset_manager/`                         | Loads JSONL, HuggingFace, CSV, JSON, Parquet datasets. `Dataset` base class with `load_sample()`/`num_samples()` interface                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
-| **Metrics Aggregator** | `src/inference_endpoint/async_utils/services/metrics_aggregator/` | Subprocess. Subscribes to events, aggregates per-sample metrics into a `MetricsRegistry` (counters + HDR-histogram series + raw values), publishes `MetricsSnapshot` over IPC PUB at a configurable cadence (`SessionState`: `INITIALIZE` → `LIVE` → `DRAINING` → {`COMPLETE` \| `INTERRUPTED`}). Final snapshot is atomically written to `final_snapshot.json` as the **primary** Report source; the terminal pub/sub frame is a TUI "run finished" signal only.                                                                                                                                                                                                                                                                               |
-| **Report**             | `src/inference_endpoint/metrics/report.py`                        | `Report.from_snapshot(dict)` — pure-function builder consuming the dict form (`snapshot_to_dict`). Reads `final_snapshot.json` directly via `json.loads` (no Struct decode). Plumbs `complete = (state == "complete" and n_pending_tasks == 0)`; renders an explicit warning for `INTERRUPTED` runs.                                                                                                                                                                                                                                                                                                                                                                                                                                            |
-| **Config**             | `src/inference_endpoint/config/`, `endpoint_client/config.py`     | Pydantic-based YAML schema (`schema.py`), `HTTPClientConfig` (single Pydantic model for CLI/YAML/runtime), `RuntimeSettings`                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-| **CLI**                | `src/inference_endpoint/main.py`, `commands/benchmark/cli.py`     | cyclopts-based, auto-generated from `schema.py` and `HTTPClientConfig` Pydantic models. Flat shorthands via `cyclopts.Parameter(alias=...)`                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
-| **Async Utils**        | `src/inference_endpoint/async_utils/`                             | `LoopManager` (uvloop + eager_task_factory), ZMQ transport layer, generic `MessageCodec[T]`-parametrized pub/sub, event publisher                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
-| **OpenAI/SGLang**      | `src/inference_endpoint/openai/`, `sglang/`                       | Protocol adapters and response accumulators for different API formats. `openai_completions` adapter (`completions_adapter.py`) sends pre-tokenized token IDs to `/v1/completions`, bypassing the server chat template — required for gpt-oss-120b on vLLM. `sglang` adapter sends to `/generate` via `input_ids`. Both apply `Harmonize()` client-side.                                                                                                                                                                                                                                                                                                                                                                                         |
-| **TensorRT-LLM**       | `src/inference_endpoint/trtllm/`                                  | Adapter for TensorRT-LLM endpoints. `TRTLLMAdapter` sends requests; `TRTLLMSSEAccumulator` handles SSE streaming responses.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
-| **VideoGen**           | `src/inference_endpoint/videogen/`                                | Adapter for video-generation endpoints (e.g. trtllm-serve `POST /v1/videos/generations`, used by MLPerf WAN2.2-T2V-A14B). Defaults to `response_format=video_path` (server saves video to shared storage and returns path) to avoid large byte payloads. Accuracy mode also runs on `video_path`: the adapter mirrors the path into `response_output` so the event log carries it to `VBenchScorer` (see `evaluation/scoring.py`), which scores videos via VBench from a sibling `uv` subproject at `examples/09_Wan22_VideoGen_Example/accuracy/` (vbench's `transformers==4.33.2` + `numpy<2` pins are incompatible with the parent env, so it runs out-of-process via `uv run --project`). Dataset is ingested via the generic JSONL loader. |
+| Component              | Location                                                          | Purpose                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+| ---------------------- | ----------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| **Load Generator**     | `src/inference_endpoint/load_generator/`                          | Central orchestrator: `BenchmarkSession` owns the lifecycle, `PhaseIssuer` drives per-phase execution, `TimedIssueStrategy`/`BurstStrategy`/`ConcurrencyStrategy` control timing. Emits `ERROR` before `COMPLETE` for failed queries (metrics aggregator depends on this order).                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
+| **Endpoint Client**    | `src/inference_endpoint/endpoint_client/`                         | Multi-process HTTP workers communicating via ZMQ IPC. `HTTPEndpointClient` is the main entry point                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+| **Dataset Manager**    | `src/inference_endpoint/dataset_manager/`                         | Loads JSONL, HuggingFace, CSV, JSON, Parquet datasets. `Dataset` base class with `load_sample()`/`num_samples()` interface                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+| **Metrics Aggregator** | `src/inference_endpoint/async_utils/services/metrics_aggregator/` | Subprocess. Subscribes to events, aggregates per-sample metrics into a `MetricsRegistry` (counters + HDR-histogram series + raw values), publishes `MetricsSnapshot` over IPC PUB at a configurable cadence (`SessionState`: `INITIALIZE` → `LIVE` → `DRAINING` → {`COMPLETE` \| `INTERRUPTED`}). Final snapshot is atomically written to `final_snapshot.json` as the **primary** Report source; the terminal pub/sub frame is a TUI "run finished" signal only.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+| **Report**             | `src/inference_endpoint/metrics/report.py`                        | `Report.from_snapshot(dict)` — pure-function builder consuming the dict form (`snapshot_to_dict`). Reads `final_snapshot.json` directly via `json.loads` (no Struct decode). Plumbs `complete = (state == "complete" and n_pending_tasks == 0)`; renders an explicit warning for `INTERRUPTED` runs.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+| **Config**             | `src/inference_endpoint/config/`, `endpoint_client/config.py`     | Pydantic-based YAML schema (`schema.py`), `HTTPClientConfig` (single Pydantic model for CLI/YAML/runtime), `RuntimeSettings`                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+| **CLI**                | `src/inference_endpoint/main.py`, `commands/benchmark/cli.py`     | cyclopts-based, auto-generated from `schema.py` and `HTTPClientConfig` Pydantic models. Flat shorthands via `cyclopts.Parameter(alias=...)`                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+| **Async Utils**        | `src/inference_endpoint/async_utils/`                             | `LoopManager` (uvloop + eager_task_factory), ZMQ transport layer, generic `MessageCodec[T]`-parametrized pub/sub, event publisher                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+| **OpenAI/SGLang**      | `src/inference_endpoint/openai/`, `sglang/`                       | Protocol adapters and response accumulators for different API formats. `openai_completions` adapter (`completions_adapter.py`) sends pre-tokenized token IDs to `/v1/completions`, bypassing the server chat template — required for gpt-oss-120b on vLLM. `sglang` adapter sends to `/generate` via `input_ids`. Both apply `Harmonize()` client-side.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+| **TensorRT-LLM**       | `src/inference_endpoint/trtllm/`                                  | Adapter for TensorRT-LLM endpoints. `TRTLLMAdapter` sends requests; `TRTLLMSSEAccumulator` handles SSE streaming responses.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+| **VideoGen**           | `src/inference_endpoint/videogen/`                                | Adapter for video-generation endpoints (e.g. trtllm-serve `POST /v1/videos/generations`, used by MLPerf WAN2.2-T2V-A14B). Defaults to `response_format=video_path` (server saves video to shared storage and returns path) to avoid large byte payloads. Accuracy mode also runs on `video_path`: the adapter mirrors the path into `response_output` so the event log carries it to `VBenchScorer` (see `evaluation/scoring.py`), which scores videos via VBench from a sibling `uv` subproject at `examples/09_Wan22_VideoGen_Example/accuracy/` (vbench's `transformers==4.33.2` + `numpy<2` pins are incompatible with the parent env, so it runs out-of-process via `uv run --project`). Dataset is ingested via the generic JSONL loader.                                                                                                                                                                                                     |
+| **Compliance**         | `src/inference_endpoint/compliance/`, `commands/audit.py`         | MLPerf compliance audits. `AuditTest` protocol + `RunSpec`/`RunStats`/`RunArtifacts` + test registry (`compliance/__init__.py`); `OutputCachingAudit` (`compliance/tests/output_caching_test.py`) implements the **output-caching** audit (MLPerf **TEST04** caching detection) — a reference phase over distinct samples vs an audit phase repeating one fixed sample (`SingleSampleOrder`), failing if audit QPS exceeds reference QPS by more than `threshold`. `commands/audit.py:run_audit` orchestrates the phases back-to-back, refuses to certify an incomplete phase, and writes `verify_OUTPUT_CACHING_TEST.txt` + `audit_result.json` atomically via `compliance/result.py`. Enabled by the YAML `audit:` block (`AuditConfig`/`OutputCachingTestConfig`, `AuditTestId.OUTPUT_CACHING_TEST` in `schema.py`); `run_benchmark` dispatches to it after the main run. Performance-only, unpaced loads only (`max_throughput`/`concurrency`). |
 
 ### Hot-Path Architecture
 
@@ -154,6 +155,16 @@ Validation is layered:
 - `poisson`: Fixed QPS with Poisson arrival distribution
 - `concurrency`: Fixed concurrent requests
 
+### Compliance Audits
+
+Orthogonal to the main run: a YAML-only `audit:` block (`show=False`, no CLI flag) on `BenchmarkConfig` selects an `AuditTest`. `run_benchmark` runs the main benchmark, then — if `audit:` is set — calls `commands/audit.py:run_audit`, which:
+
+1. Validates the load pattern (unpaced only: `max_throughput`/`concurrency`) and the configured `sample_index` bounds (reusing the first phase's loaded dataset — no extra load).
+2. Runs each `RunSpec` phase (from `AuditTest.plan_runs`) back-to-back under its own `<report_dir>/<label>/` subdir via `setup_benchmark`/`run_benchmark_async`. A phase whose `Report.complete` is `False` (drain timeout / interrupt) aborts with `ExecutionError` — no verdict on partial data.
+3. Calls `AuditTest.verify(...)` and atomically writes `verify_<TEST>.txt` + `audit_result.json`.
+
+`run_audit` returns an `AuditResult`; `cli.py` maps `passed` to the process exit code (0 PASS / 1 FAIL; setup/IO errors → non-zero via the standard error path). The output-caching audit (MLPerf TEST04) is the only registered audit today (`AuditTestId.OUTPUT_CACHING_TEST`); add new audits by implementing the `AuditTest` protocol and calling `register(...)` in `compliance/tests/`.
+
 ## Code Organization
 
 ```
@@ -164,11 +175,17 @@ src/inference_endpoint/
 │   ├── benchmark/
 │   │   ├── __init__.py
 │   │   ├── cli.py             # benchmark_app: offline, online, from-config subcommands
-│   │   └── execute.py         # Phased execution: setup/run_threaded/finalize + BenchmarkContext
+│   │   └── execute.py         # Phased execution: setup/run_threaded/finalize + BenchmarkContext; run_benchmark dispatches to run_audit when audit: is set
+│   ├── audit.py               # run_audit() — compliance audit orchestrator (phases → verify → result)
 │   ├── probe.py               # ProbeConfig + execute_probe()
 │   ├── info.py                # execute_info()
 │   ├── validate.py            # execute_validate()
 │   └── init.py                # execute_init()
+├── compliance/                # MLPerf compliance audits
+│   ├── __init__.py            # AuditTest protocol + RunSpec/RunStats/RunArtifacts + test registry
+│   ├── result.py              # AuditResult + atomic write_result (verify_<TEST>.txt + audit_result.json)
+│   └── tests/
+│       └── output_caching_test.py  # OutputCachingAudit (MLPerf TEST04): caching detection (reference vs fixed-sample QPS)
 ├── core/
 │   ├── types.py               # APIType, Query, QueryResult, StreamChunk, QueryStatus (msgspec Structs)
 │   └── record.py              # EventRecord — transport record used by event logger and ZMQ transport
@@ -177,7 +194,7 @@ src/inference_endpoint/
 │   ├── strategy.py            # TimedIssueStrategy, BurstStrategy, ConcurrencyStrategy, LoadStrategy
 │   ├── multi_turn_strategy.py # MultiTurnStrategy
 │   ├── conversation_manager.py # ConversationManager, ConversationState
-│   ├── sample_order.py        # SampleOrder, WithoutReplacementSampleOrder, WithReplacementSampleOrder
+│   ├── sample_order.py        # SampleOrder, WithoutReplacement/WithReplacement/SingleSampleOrder, create_sample_order
 │   └── delay.py               # poisson_delay_fn, make_delay_fn
 ├── endpoint_client/
 │   ├── http_client.py         # HTTPEndpointClient - main client interface
@@ -218,7 +235,7 @@ src/inference_endpoint/
 │   └── metric.py              # Metric types (Throughput, etc.)
 ├── config/
 │   ├── schema.py              # Single source of truth: Pydantic models + cyclopts annotations
-│   ├── runtime_settings.py    # RuntimeSettings dataclass
+│   ├── runtime_settings.py    # RuntimeSettings + SampleOrderSpec dataclasses
 │   ├── ruleset_base.py        # BenchmarkSuiteRuleset base
 │   ├── ruleset_registry.py    # Ruleset registry
 │   ├── user_config.py         # UserConfig dataclass for ruleset user overrides
diff --git a/docs/compliance_audit_plan.md b/docs/compliance_audit_plan.md
new file mode 100644
index 000000000..25dc251e4
--- /dev/null
+++ b/docs/compliance_audit_plan.md
@@ -0,0 +1,728 @@
+# Compliance Audit Module — Design Plan
+
+Status: **Implemented** (TEST04) · TEST01/06/07/09 as planned extensions.
+
+This document plans a modular compliance/audit framework for the endpoint benchmarking
+tool that re-implements the _intent_ of the MLPerf Inference compliance ("audit") tests.
+The reference implementation lives in the MLCommons inference repo
+(`compliance/nvidia/TESTxx`).
+
+This is a ground-up redesign. The driving requirements come from two sources: the
+maintainer's workflow constraints (a single command that runs both phases back-to-back
+against the same endpoint) and a first-principles design review (TEST04 must not be bolted onto
+the benchmark via per-phase config surgery — it needs a first-class, extensible
+abstraction). Section 8 maps each requirement to where the design satisfies it.
+
+---
+
+## 1. Background: what MLPerf audit tests do
+
+MLPerf compliance tests verify that a submitter is not gaming the benchmark (caching,
+truncating outputs, running a different/cheaper model in the perf run, EOS exploits).
+They are built on three LoadGen-specific pieces:
+
+1. **`audit.config`** — a file LoadGen reads at `StartTest()` that overrides run settings
+   to enable the test (e.g. issue duplicate samples, log a sample of outputs, fix seeds).
+2. **`mlperf_log_accuracy.json`** — the SUT logs raw **output token IDs** during the run.
+3. **`run_verification.py`** — a post-run script that consumes the logs and emits
+   `verify_*.txt` with a `Performance check pass: True/False` / `TEST PASS` line.
+
+### Test matrix (LLM-relevant subset)
+
+| Test   | Detects                                             | Category      | Required for                         |
+| ------ | --------------------------------------------------- | ------------- | ------------------------------------ |
+| TEST01 | Different model in perf vs accuracy run             | orchestrator  | ResNet50, BERT, SDXL, RetinaNet, …   |
+| TEST04 | Caching of duplicate queries (throughput inflation) | orchestrator  | ResNet50, SDXL (LLMs largely exempt) |
+| TEST06 | LLM output consistency (EOS / first-token / length) | analyzer      | llama2/3.1, mixtral, deepseek        |
+| TEST07 | Accuracy ≥ threshold in perf mode                   | analyzer      | gpt-oss-120b                         |
+| TEST09 | Mean output token length within ±10% of reference   | analyzer      | gpt-oss-120b                         |
+| TEST08 | DLRM-v3 streaming accuracy                          | n/a (not LLM) | DLRM-v3 — **out of scope**           |
+
+**TEST04 (mechanism).** `audit.config` sets `performance_issue_same=1` /
+`performance_issue_same_index=3` so LoadGen issues the **same sample repeatedly** for the
+**same number of queries** as the standard run, then the verification compares throughput.
+Pass if the audit run is **not more than 10% faster** than the reference (20% for
+low-throughput streams). If the SUT caches responses for duplicate queries, throughput
+inflates → FAIL.
+
+---
+
+## 2. Conceptual mapping: MLPerf → this repo
+
+This tool is its own HTTP load generator (no LoadGen). The audit module re-implements the
+_intent_ over this repo's own artifacts.
+
+| MLPerf                                 | This repo                                                   |
+| -------------------------------------- | ----------------------------------------------------------- |
+| `audit.config` (run-setting override)  | a typed **`SampleOrderSpec`** carried on a **`RunSpec`**    |
+| `mlperf_log_accuracy.json` (token IDs) | `events.jsonl` (must carry token IDs for token-level tests) |
+| `run_verification.py` → `verify_*.txt` | an **`AuditTest.verify()`** → `verify_<TEST>.txt` + JSON    |
+| LoadGen runs both phases of a test     | a **generic orchestrator** runs `plan_runs()` back-to-back  |
+| compliance submission dir layout       | mirrored under the run's report dir                         |
+
+This repo names MLPerf **TEST04** the **output-caching test**: id `output_caching_test`
+(`AuditTestId.OUTPUT_CACHING_TEST`), config class `OutputCachingTestConfig`, audit
+`OutputCachingAudit`, and artifacts `verify_OUTPUT_CACHING_TEST.txt` + `audit_result.json`.
+Where this doc writes "TEST04" it means the upstream MLPerf test the output-caching audit
+re-implements.
+
+---
+
+## 3. Two axes (the core principle)
+
+Every audit test decomposes into two independent concerns. Keeping them separate is what
+prevents test-specific knowledge from leaking into general-purpose code.
+
+- **Axis A — run modification** (the `audit.config` analogue): _how_ a test alters the
+  benchmark run(s). For TEST04 it is "issue one fixed sample repeatedly for the audit
+  phase." This is expressed as a generic, typed **`SampleOrderSpec`**, not a per-test
+  boolean. The load generator never learns the string "output_caching_test".
+- **Axis B — verification**: a pure post-run check comparing run artifacts → a result.
+  Per-test, registered.
+
+---
+
+## 4. Architecture
+
+### Component map
+
+```
+benchmark from-config
+   │
+   ├─ run main benchmark: perf  [+ accuracy when accuracy datasets present]   (existing path)
+   │
+   └─ if config.audit is set ▼   (additive post-step, same report_dir)
+   run_audit(config)                         commands/audit.py  ── the generic loop
+            │
+            │ 1. get_audit_test(config.audit.test)
+            ▼
+   AuditTest  ──────────────────────────────  compliance/tests/output_caching_test.py
+     ├─ plan_runs(cfg) -> list[RunSpec]        (declarative: what phases to run)
+     └─ verify(runs,cfg)-> AuditResult         (pure: read artifacts → result)
+            │
+            │ 2. for each RunSpec
+            ▼
+   setup_benchmark(config, run_spec)           commands/benchmark/execute.py  (reused)
+            │   run_spec.sample_order
+            ▼
+   create_sample_order(settings)               load_generator/sample_order.py
+     └─ switch on SampleOrderSpec              (WITHOUT_REPLACEMENT | SINGLE(index))
+            │   no "output_caching_test" knowledge here
+            ▼
+   run_benchmark_async(ctx) ─► RunArtifacts    (final_snapshot.json, events.jsonl)
+            │
+            │ 3. verify(runs, cfg) ; 4. write_result (atomic)
+            ▼
+   verify_OUTPUT_CACHING_TEST.txt  +  audit_result.json
+```
+
+### Program flow (output-caching audit / MLPerf TEST04, two phases)
+
+Every decision gate is shown, with the exit code it produces. Exit codes:
+`0` PASS · `1` FAIL · `3` SetupError · `4` ExecutionError · `130` interrupted
+before the audit.
+
+```
+                         ┌─────────────────────────┐
+                         │  benchmark from-config  │
+                         └────────────┬────────────┘
+                                      ▼
+                         ┌─────────────────────────┐
+                         │  run_benchmark(cfg,mode) │
+                         └────────────┬────────────┘
+                                      ▼
+                  ┌──────────────────────────────────────┐
+                  │ setup_benchmark + run main benchmark  │
+                  │ (perf  [+ accuracy if acc datasets])  │
+                  └────────────────┬─────────────────────┘
+                                   ▼
+                       ╱────────────────────────╲    yes   ┌─────────────────────────┐
+                      ╱ KeyboardInterrupt during   ╲──────►│ salvage partial results │
+                      ╲     the main run?           ╱       │ re-raise → exit 130     │
+                       ╲────────────┬──────────────╱        │ (audit NOT started)     │
+                                    │ no                    └─────────────────────────┘
+                                    ▼
+                          ╱──────────────────╲   no   ┌────────────────────────┐
+                         ╱  config.audit set?  ╲─────►│ return None → exit 0   │
+                         ╲────────┬────────────╱       └────────────────────────┘
+                                  │ yes
+                                  ▼
+                   ┌──────────────────────────────────┐
+                   │   run_audit(cfg, report_dir)      │
+                   │   test = get_audit_test(test_id)  │
+                   └────────────────┬─────────────────┘
+                                    ▼
+            ╱────────────────────────────────────────-╲   no   ┌───────────────────────┐
+           ╱ load_pattern ∈ {max_throughput,            ╲─────►│ raise SetupError      │
+           ╲  concurrency}  AND  ≥1 performance dataset?  ╱      │     → exit 3          │
+            ╲────────────────────┬───────────────────-──╱       └───────────────────────┘
+                                 │ yes                                    ▲
+                                 ▼                                        │
+            ┌──────────────────────────────────────────┐                 │
+            │ specs = test.plan_runs(cfg)               │                 │
+            │   [ "reference"      (without_replacement),│                 │
+            │     "output_caching" (single(index)) ]    │                 │
+            └────────────────┬─────────────────────────┘                 │
+                             ▼                                            │
+        ╔═══════════ for each spec (back-to-back) ═════════════╗         │
+        ║                    ▼                                  ║         │
+        ║   ┌──────────────────────────────────────────┐       ║         │
+        ║   │ setup_benchmark(phase_cfg, audit=None)    │       ║         │
+        ║   └──────────────┬───────────────────────────┘       ║         │
+        ║                  ▼                                    ║         │
+        ║          ╱──────────────╲ yes  ╱─────────────────╲    ║         │
+        ║         ╱  first phase?   ╲───►╱ every spec's      ╲──╫──no─────┘
+        ║         ╲────────┬────────╱    ╲ sample_index in    ╱ ║ (out of range)
+        ║                  │ no           ╲ range [0,N)?     ╱  ║
+        ║                  │               ╲──────┬────────-╱   ║
+        ║                  │◄─────────────────────┘ yes         ║
+        ║                  ▼                                    ║
+        ║   ┌──────────────────────────────────────────┐       ║
+        ║   │ run_benchmark_async(ctx) → finalize       │       ║
+        ║   └──────────────┬───────────────────────────┘       ║
+        ║                  ▼                                    ║
+        ║       ╱───────────────────────╲ no   ┌─────────────────────────┐
+        ║      ╱ report not None AND       ╲───►│ raise ExecutionError    │
+        ║      ╲   report.complete?         ╱   │   → exit 4              │
+        ║       ╲──────────┬──────────────╱     │ (no verdict on partial) │
+        ║                  │ yes                 └─────────────────────────┘
+        ║                  ▼                                    ║
+        ║   ┌──────────────────────────────────────────┐       ║
+        ║   │ append RunArtifacts(label, report,        │       ║
+        ║   │   n_requested = spec.n_samples)           │       ║
+        ║   └──────────────────────────────────────────┘       ║
+        ╚══════════════════╪═══════════════════════════════════╝
+                           ▼ (all phases done)
+            ┌──────────────────────────────────────────┐
+            │ result = test.verify([ref, audit])        │
+            │  • completion guard:                      │
+            │      completed ≥ requested × (1 − thr)    │
+            │  • caching rule:                          │
+            │      audit_qps < ref_qps × (1 + thr)      │
+            └──────────────┬───────────────────────────┘
+                           ▼
+            ┌──────────────────────────────────────────┐
+            │ write_result(result, report_dir) [atomic] │
+            │   verify_OUTPUT_CACHING_TEST.txt           │
+            │   audit_result.json                        │
+            └──────────────┬───────────────────────────┘
+                           ▼
+            ┌──────────────────────────────────────────┐
+            │ return AuditResult → cli.py               │
+            │   exit 0 (PASS)  /  exit 1 (FAIL)         │
+            └──────────────────────────────────────────┘
+```
+
+Analyzer tests (TEST06/07/09) take the same path with a single-element `plan_runs`, so
+phase 2 simply doesn't exist and `verify` reads the one run's artifacts.
+
+In a `type: submission` config (see §5) this whole `run_audit` block runs **after** the
+main perf [+ accuracy] run, under the same `report_dir`.
+
+### The `AuditTest` abstraction
+
+A single protocol covers **both** categories — orchestrators (must execute a
+specially-configured run) and analyzers (pure post-run). An analyzer is just an audit whose
+plan is a single normal run, so the orchestration loop never special-cases a category.
+
+```python
+class AuditTest(Protocol):
+    test_id: ClassVar[AuditTestId]                          # AuditTestId.OUTPUT_CACHING_TEST
+    def plan_runs(self, cfg: AuditConfig) -> list[RunSpec]: ...
+    def verify(self, runs: list[RunArtifacts], cfg: AuditConfig) -> AuditResult: ...
+```
+
+- **Orchestrator (TEST04, TEST01):** `plan_runs` returns ≥2 specs.
+- **Analyzer (TEST06, TEST07, TEST09):** `plan_runs` returns 1 normal-run spec; all logic
+  lives in `verify`.
+
+### `RunSpec` — declarative and typed
+
+Replaces ad-hoc per-phase `model_copy` surgery and stringly-typed override kwargs.
+
+```python
+@dataclass(frozen=True, slots=True)
+class RunSpec:
+    label: str                    # "reference" / "output_caching" → report subdir
+    n_samples: int | None         # this phase's query count (may differ per phase; None = dataset default)
+    sample_order: SampleOrderSpec # WITHOUT_REPLACEMENT | SINGLE(index)
+```
+
+### `SampleOrderSpec` — the one generic load-gen seam
+
+```python
+# load_generator/sample_order.py
+class SampleOrderSpec:   # WITHOUT_REPLACEMENT | SINGLE(index=...)
+    ...
+
+def create_sample_order(settings: RuntimeSettings) -> SampleOrder:
+    spec = settings.sample_order            # generic; default WITHOUT_REPLACEMENT
+    ...                                      # switch on spec, no "output_caching_test" knowledge
+```
+
+### `AuditConfig` — per-test discriminated union on `BenchmarkConfig`
+
+Each test carries **only its own knobs** in a per-test config model, discriminated on
+`test`. This avoids a flat model where one `threshold` field means different things per
+test (caching tolerance vs OSL band vs accuracy floor) and is meaningless for the
+equality-based tests (TEST01/06). No `DatasetType.AUDIT`, no audit fields on the shared
+`Dataset` model.
+
+```python
+class AuditTestId(str, Enum):
+    OUTPUT_CACHING_TEST = "output_caching_test"   # MLPerf TEST04
+
+class OutputCachingTestConfig(BaseModel):
+    model_config = ConfigDict(frozen=True, extra="forbid")
+    test: Literal[AuditTestId.OUTPUT_CACHING_TEST]
+    samples: int                      # reference phase count (required, ge=1)
+    audit_samples: int | None = None  # audit phase count; None = equals `samples`
+    sample_index: int = 0             # MLPerf performance_issue_same_index
+    threshold: float = 0.10           # caching tolerance (MLPerf TEST04-specific)
+
+# One member today; becomes a discriminated union as tests are added:
+#   AuditConfig = Annotated[OutputCachingTestConfig | Test01Config | ..., Field(discriminator="test")]
+AuditConfig = OutputCachingTestConfig
+```
+
+On `BenchmarkConfig`: `audit: AuditConfig | None = None`. With a single member the alias is
+just `OutputCachingTestConfig`; the `test: Literal[...]` discriminator field is already in place, so
+adding the second test only assembles the `Annotated[... , Field(discriminator="test")]`
+union — no change to existing tests.
+
+**`samples` is required (no full-dataset/`None` mode).** An audit needs an explicit
+reference count so the per-phase completion guard has an independent target to validate
+against (a duration-driven, countless phase would make the guard tautological). `samples`
+sizes the reference phase; `audit_samples` the fixed-sample phase, falling back to `samples`
+when omitted (equal counts — the shipped examples). The two counts **may** differ (set
+`audit_samples` lower, e.g. 64 / 32, to shorten the audit phase — upstream TEST04 does this;
+see §5): the result relies on `qps` being rate-normalized plus a per-phase completion guard,
+so it does not require equal counts.
+
+### Generic orchestrator
+
+`run_benchmark` first executes the main benchmark (performance, plus accuracy scoring when
+the config carries accuracy datasets) exactly as today. Then, when `config.audit is not
+None`, it runs `run_audit(config)` (in `commands/audit.py`) as an **additive
+post-step** under the same `report_dir`. If the main run is interrupted (`KeyboardInterrupt`
+/ Ctrl-C), `run_benchmark` salvages partial results and re-raises **without** starting the
+audit — an interrupted run must not silently roll into the (long) compliance phases. The two stages are independent, self-contained
+operations sequenced at the top level — not per-phase config surgery — so one
+`type: submission` YAML can produce the full set: perf [+ accuracy] + the audit's reference
+and fixed-sample (`output_caching`) phases + result (§5). The audit runs its **own** reference phase at
+`samples`; it does not reuse the (typically larger, full-dataset) submission perf run. The
+generic loop never names a specific test:
+
+1. `test = get_audit_test(config.audit.test)`
+2. `specs = test.plan_runs(config.audit)`
+3. **Validate before any run executes.** The load pattern is checked against an
+   **allow-list** — only `max_throughput` (offline) or `concurrency` (single-stream) are
+   accepted; everything else (`poisson`, `multi_turn`, `burst`, `step`) is rejected up front.
+   Paced loads cap throughput and would mask caching; patterns with different sample
+   semantics (e.g. `multi_turn`) make the fixed-index audit phase meaningless. Every spec's
+   `sample_index` must be in range for the **loaded** dataset: the first phase's
+   `setup_benchmark` loads the dataset, and all specs are bounds-checked against that row
+   count before any phase runs (reusing that load — no separate probe load — and never
+   discovering an out-of-range index after a full reference run has already executed).
+4. Execute each spec back-to-back via the existing `setup_benchmark` /
+   `run_benchmark_async` path (no duplicated report-dir or `config.yaml` logic). Each phase
+   config has `audit=None` to prevent re-entry into `run_audit`. If any phase raises
+   (`SetupError` / `ExecutionError`), `run_audit` aborts **without verifying** — a crashed
+   phase must never produce a result. A phase that returns but whose `Report.complete` is
+   `False` (metrics drain timed out, or the run was interrupted → partial stats) is likewise
+   rejected with `ExecutionError` — a result is never certified on partial data. Errors
+   propagate to the standard CLI handler (`main.py`), which maps `SetupError` → exit `3` and
+   `ExecutionError` → exit `4`.
+5. `result = test.verify(runs, cfg)`
+6. Atomically write the result (`tmp → fsync → rename → fsync(parent)`).
+7. Return the typed `AuditResult`. Because `run_benchmark` currently returns `None` and
+   `cli.py` ignores its return, the audit path must **propagate** the result: `run_audit`
+   returns it, `run_benchmark` returns it for an audit config, and `cli.py` maps it to
+   `sys.exit` — `0` (PASS) / `1` (FAIL). Errors are not flattened to a single code:
+   they propagate to `main.py`'s handler, which uses the repo-wide scheme
+   (`InputValidationError` → `2`, `SetupError` → `3`, `ExecutionError` → `4`). The on-disk
+   `audit_result.json` is the durable record; the exit code is the automation signal.
+
+### Verifier — one core + in-process adapter
+
+```python
+@dataclass(frozen=True, slots=True)
+class RunStats:          # .from_report(Report, n_requested)
+    qps: float
+    n_completed: int
+    n_requested: int
+    # from_report raises ValueError when the report has no duration (qps is None)
+    # or zero throughput (qps <= 0) — a degenerate run can't anchor the ratio.
+
+def verify_output_caching(ref: RunStats, audit: RunStats, threshold: float = 0.10) -> AuditResult:
+    # per-phase completion guard: each phase completed >= requested * (1 - threshold)
+    #   (catches a phase that mostly failed — bogus low qps — without assuming ref == audit)
+    # caching rule:               audit.qps < ref.qps * (1 + threshold)
+```
+
+The phases may issue **different** counts (`samples` vs `audit_samples`), so the result does
+**not** require `ref.n_completed == audit.n_completed`. Validity comes from `qps` being a
+rate (caching still shows up as a throughput spike) plus the per-phase completion guard,
+which rejects a run that crashed partway and would otherwise post a misleadingly low qps.
+
+`RunStats.from_report(Report, n_requested)` is the sole adapter — the in-process path the
+orchestrator uses, guarding `qps is None` (no duration) and `qps <= 0` (no completions) with
+a clean `ValueError`. The redesign exposes no standalone verifier CLI and no offline
+re-check-from-disk adapter — the audit runs only via `benchmark from-config`.
+
+---
+
+## 5. Module layout
+
+```
+src/inference_endpoint/compliance/
+├── __init__.py        # AuditTest protocol, RunSpec, RunStats, AuditResult, get_audit_test()
+├── result.py         # AuditResult + atomic write → verify_<TEST>.txt + audit_result.json
+└── tests/
+    ├── __init__.py    # imports submodules so registration fires
+    └── output_caching_test.py      # OutputCachingAudit: plan_runs (reference + audit specs) + verify_output_caching core
+```
+
+CLI surface: an `audit:` block in the benchmark YAML, picked up by `benchmark from-config`.
+
+```yaml
+# bench.yaml
+audit:
+  test: output_caching_test
+  samples: 64
+  threshold: 0.10
+```
+
+```
+inference-endpoint benchmark from-config --config bench.yaml
+# runs the main benchmark, then detects audit: and runs reference (64 distinct) + fixed-sample (64 × fixed index) phases back-to-back,
+# writes verify_OUTPUT_CACHING_TEST.txt + audit_result.json, exits 0 (PASS) / 1 (FAIL)
+# (errors propagate via the standard handler: SetupError → 3, ExecutionError → 4)
+```
+
+### Unified submission (perf + accuracy + audit in one file)
+
+`audit:` is additive, so a single `type: submission` config drives the whole submission:
+`run_benchmark` does the performance run, scores the accuracy datasets, then runs the audit
+— one command, one `report_dir`. Each piece is optional: drop `audit:` for perf+acc, or
+omit accuracy datasets for perf+audit.
+
+The committed example is `examples/09_Wan22_VideoGen_Example/offline_wan22_submission.yaml`:
+
+```yaml
+# Full WAN 2.2 Offline submission: performance + VBench accuracy + TEST04 audit.
+# One command runs all three under a single report_dir:
+#   inference-endpoint benchmark from-config \
+#       examples/09_Wan22_VideoGen_Example/offline_wan22_submission.yaml
+#
+# Execution order (run_benchmark):
+#   1. performance run  — full 248-prompt dataset (the submission perf result)
+#   2. accuracy scoring — VBench over the produced videos
+#   3. audit (TEST04)   — reference + fixed-sample phases (equal counts here), then result
+#
+# NOTE: the `audit:` block is implemented per docs/compliance_audit_plan.md
+# (the `compliance/` module). The performance + accuracy portion mirrors
+# offline_wan22_accuracy.yaml.
+
+name: "submission-wan22-video-generation"
+version: "1.0"
+type: "submission"
+benchmark_mode: "offline" # required for type: submission
+
+model_params:
+  name: "wan22"
+  max_new_tokens: 1 # ignored by VideoGenAdapter; kept >0 for api_type debug swaps
+  streaming: "off" # WAN 2.2 uses non-streaming HTTP POST/response
+
+datasets:
+  # Performance dataset drives request issuance (the submission perf run).
+  - name: wan22_perf
+    path: examples/09_Wan22_VideoGen_Example/wan22_prompts.jsonl
+    type: "performance"
+    samples: 248
+
+  # Accuracy dataset reuses the same prompts; videos are scored VBench-style.
+  - name: wan22_vbench
+    path: examples/09_Wan22_VideoGen_Example/wan22_prompts.jsonl
+    type: "accuracy"
+    samples: 248
+    accuracy_config:
+      eval_method: "vbench"
+      ground_truth: "prompt" # VBench input is (prompt, video), not a GT comparison
+      num_repeats: 1
+
+# TEST04 caching audit — additive post-step. Runs its OWN reference + fixed-sample
+# phases at equal counts (the audit count may be lowered to shorten the phase).
+audit:
+  test: "output_caching_test"
+  samples: 64 # reference phase count (subset of the 248 prompts)
+  audit_samples: 64 # audit (fixed-sample) phase count; lower (e.g. 32) to shorten the audit phase
+  sample_index: 3 # MLCommons audit.config performance_issue_same_index=3
+  threshold: 0.10 # audit qps must stay < reference qps * (1 + threshold)
+
+settings:
+  runtime:
+    # NOTE: runs are count-driven (n_samples_to_issue / audit.samples). min_duration_ms is
+    # NOT enforced as a duration floor by the current stop logic (counts take priority);
+    # MLCommons' 10-min minimum / AND-semantics is future work. Only max_duration_ms caps.
+    max_duration_ms: 14400000 # 4-hour ceiling
+    scheduler_random_seed: 42
+    dataloader_random_seed: 42
+    n_samples_to_issue: 248 # applies to the perf/accuracy run; audit uses audit.samples
+
+  load_pattern:
+    type: "max_throughput"
+
+endpoint_config:
+  endpoints:
+    - "http://localhost:8000"
+  api_type: "videogen"
+  api_key: null
+
+report_dir: logs/wan22_submission
+```
+
+Resulting `report_dir/` (main perf/accuracy artifacts keep their current layout; the audit
+adds its labelled phase subdirs + result):
+
+```
+report_dir/
+├── final_snapshot.json   # submission perf run (existing top-level layout)
+├── events.jsonl
+├── …                     # accuracy scoring outputs (existing)
+├── reference/            # audit reference phase    (samples=64)
+├── output_caching/       # audit fixed-sample phase (audit_samples=64)
+├── verify_OUTPUT_CACHING_TEST.txt
+└── audit_result.json
+```
+
+### WAN2.2-T2V — the first target
+
+The first workload to exercise TEST04 is **WAN2.2-T2V-A14B** (MLPerf text-to-video), served
+through the `videogen` adapter (`api_type: videogen`, model `wan22`, non-streaming HTTP).
+Prompts come from the 248-row `examples/09_Wan22_VideoGen_Example/wan22_prompts.jsonl`.
+Two scenarios must be covered: **Offline** (`max_throughput`) and **SingleStream**
+(`concurrency`, one request in-flight).
+
+**MLCommons knobs and how they map to `AuditConfig`:**
+
+| MLCommons (WAN2.2 `audit.config` / `mlperf.conf`) | `AuditConfig`                 | Notes                                               |
+| ------------------------------------------------- | ----------------------------- | --------------------------------------------------- |
+| `performance_issue_same=1`                        | (implied by TEST04)           | audit phase issues one fixed prompt for every query |
+| `performance_issue_same_index=3`                  | `sample_index: 3`             | which prompt is repeated                            |
+| TEST04 throughput tolerance                       | `threshold: 0.10`             | `0.20` for the low-throughput SingleStream scenario |
+| `min_query_count` (reference / audit)             | `samples` / `audit_samples`   | independent per-phase counts (§4)                   |
+| `min_duration` (compliance ≥ 10 min)              | _not yet enforced_ (see note) | counts take priority in current stop logic          |
+
+> **Design decision — equal counts in the shipped examples; independent counts supported.** `samples` sizes the reference phase and `audit_samples` the fixed-sample phase
+> (`audit_samples=None` falls back to `samples`). The **shipped examples use equal counts** —
+> Offline `samples: 64` / `audit_samples: 64`, SingleStream `samples: 20` — which addresses
+> the maintainer's fairness concern ("comparing QPS of 50 distinct vs 20 repeated … doesn't
+> seem fair", PR #332) by comparing like-for-like.
+>
+> The schema still **supports** independent counts because upstream MLPerf TEST04 itself uses
+> them: the MLCommons `compliance/nvidia/TEST04/audit.config` overrides
+> `stable-diffusion-xl.Offline.min_query_count = 500` against a `mlperf.conf` reference of
+> `5000` — i.e. a **5000 reference / 500 audit** split, compared as samples-per-second. So
+> `audit_samples < samples` is a valid, upstream-faithful way to shorten the (expensive) audit
+> phase. The result does **not** require equal counts — `qps` is rate-normalized and a
+> **per-phase completion guard** (each phase must complete ≥ `requested × (1 − threshold)`)
+> catches a crashed run — but the examples default to equal for the clearest, least-contentious
+> comparison.
+
+> **`min_duration` is not a duration floor (current limitation).** The load-generator stop
+> check (`session.py`) halts a phase on **sample count** or **`max_duration_ms`** only;
+> `min_duration_ms` merely _derives_ a count when no explicit count is set. Because TEST04
+> drives an explicit `samples` count, each phase stops at `samples` and `min_duration_ms` is
+> **not** honored as a "run for at least 10 minutes" floor. MLCommons' 10-minute compliance
+> minimum therefore is **not** enforced today; combining a count floor with a duration floor
+> ("AND-semantics") is future work. Set `samples` large enough that each phase reaches a
+> stable throughput on its own.
+
+**Offline (`max_throughput`):**
+
+```yaml
+# Illustrative: Offline TEST04 audit-only (perf + audit, no accuracy datasets).
+# The committed file is offline_wan22_submission.yaml (perf + accuracy + audit).
+type: offline
+model_params: { name: wan22, streaming: "off" }
+audit:
+  test: output_caching_test
+  samples: 64 # reference phase count (tunable subset of the 248-prompt dataset)
+  audit_samples: 64 # audit (fixed-sample) phase count; lower (e.g. 32) to shorten the audit phase
+  sample_index: 3 # MLCommons performance_issue_same_index
+  threshold: 0.10
+datasets:
+  - {
+      name: wan22_prompts,
+      path: examples/09_Wan22_VideoGen_Example/wan22_prompts.jsonl,
+    }
+settings:
+  runtime: {} # count-driven (samples / audit_samples); min_duration_ms is not a floor (see note)
+  load_pattern: { type: max_throughput }
+endpoint_config: { api_type: videogen, endpoints: ["http://localhost:8000"] }
+```
+
+**SingleStream (`concurrency` = 1):**
+
+```yaml
+# Illustrative: SingleStream TEST04 audit-only (perf + audit, no accuracy datasets).
+# The committed file is single_stream_wan22_submission.yaml (perf + accuracy + audit).
+type: online
+model_params: { name: wan22, streaming: "off" }
+audit:
+  test: output_caching_test
+  samples: 20 # MLCommons SingleStream min_query_count (audit_samples omitted → also 20)
+  sample_index: 3
+  threshold: 0.20 # low-throughput stream tolerance
+datasets:
+  - {
+      name: wan22_prompts,
+      path: examples/09_Wan22_VideoGen_Example/wan22_prompts.jsonl,
+    }
+settings:
+  runtime: {} # count-driven (see note); min_duration_ms is not a floor
+  load_pattern: { type: concurrency, target_concurrency: 1 }
+endpoint_config: { api_type: videogen, endpoints: ["http://localhost:8000"] }
+```
+
+---
+
+## 6. File-by-file changes (against `main`)
+
+| File                                                                     | Change                                                                                                      |
+| ------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------- |
+| `compliance/__init__.py`                                                 | **new** — `AuditTest` protocol, `RunSpec`, `RunStats`, `AuditResult`, `get_audit_test()` registry           |
+| `compliance/result.py`                                                   | **new** — `AuditResult` + atomic `write_result` (reference `verify_OUTPUT_CACHING_TEST.txt` wording + JSON) |
+| `compliance/tests/__init__.py`                                           | **new** — imports submodules so registration fires                                                          |
+| `compliance/tests/output_caching_test.py`                                | **new** — `OutputCachingAudit.plan_runs` (reference + audit specs) + `verify_output_caching` core           |
+| `commands/audit.py`                                                      | **new** — generic `run_audit` loop (plan → validate-all → execute → verify → write)                         |
+| `config/schema.py`                                                       | **+** `AuditTestId`, `AuditConfig`, `audit: AuditConfig \| None` on `BenchmarkConfig`                       |
+| `load_generator/sample_order.py`                                         | **+** `SampleOrderSpec` + `SingleSampleOrder`; `create_sample_order` switches on the spec                   |
+| `config/runtime_settings.py`                                             | **+** `sample_order: SampleOrderSpec` (generic; default `WITHOUT_REPLACEMENT`)                              |
+| `commands/benchmark/execute.py`                                          | **+** typed `run_spec` seam in `setup_benchmark`; `run_benchmark` dispatches to `run_audit`                 |
+| `examples/09_Wan22_VideoGen_Example/offline_wan22_submission.yaml`       | **new** — WAN2.2 Offline submission (perf + accuracy + TEST04)                                              |
+| `examples/09_Wan22_VideoGen_Example/single_stream_wan22_submission.yaml` | **new** — WAN2.2 SingleStream submission (perf + accuracy + TEST04)                                         |
+
+---
+
+## 7. Extending to other audit tests
+
+Adding a test whose run behavior is already expressible touches **four things**: a new file
+under `compliance/tests/`, one `AuditTestId` enum value, a per-test config model added to
+the `AuditConfig` discriminated union (`Annotated[OutputCachingTestConfig | TestNNConfig, Field(discriminator="test")]`),
+and one import line in `compliance/tests/__init__.py`. The orchestrator, load generator,
+result writer, and CLI are untouched.
+
+**Orchestrator example (TEST01 — same-model check):**
+
+```python
+# compliance/tests/test01.py
+class Test01Audit:
+    test_id = AuditTestId.TEST01
+
+    def plan_runs(self, cfg: AuditConfig) -> list[RunSpec]:
+        return [
+            RunSpec("performance", cfg.samples, SampleOrderSpec.without_replacement()),
+            RunSpec("accuracy",    cfg.samples, SampleOrderSpec.without_replacement()),
+        ]
+
+    def verify(self, runs: list[RunArtifacts], cfg: AuditConfig) -> AuditResult:
+        perf, acc = runs
+        return AuditResult("TEST01", perf.model_outputs_match(acc), {...})
+
+register(Test01Audit())
+```
+
+**Analyzer example (TEST09 — output-length check):** `plan_runs` returns a single normal
+run; `verify` reads `events.jsonl` and checks mean OSL within `[ref × 0.9, ref × 1.1]`.
+
+**What costs more than those four touches (honest limits):**
+
+1. A test needing run behavior `SampleOrderSpec` cannot express → add **one variant** to
+   `SampleOrderSpec` + its branch in `create_sample_order`. A typed extension of the single
+   generic seam, not leakage.
+2. TEST06/09 need raw output token IDs (see §2) → one isolated, audit-capture data-path
+   addition shared by all token-level tests. TEST04 and TEST01 need none of it.
+
+---
+
+## 8. Requirements traceability
+
+Covers **every** comment thread on PR #332 — the maintainer workflow threads
+(@nvzhihanj, @viraatc), both "Review Council" passes, and the Gemini robustness comments.
+
+### Maintainer workflow & example-config threads
+
+| Comment                                                            | Resolution                                                                                                                                                                                                                 |
+| ------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Run **one command**, not two/three; phases back-to-back            | `run_audit` generic loop; `audit:` block on `benchmark from-config`                                                                                                                                                        |
+| Perf + accuracy + audit from a single config                       | `type: submission` YAML; `run_benchmark` runs perf [+acc], then `run_audit` additively (§5)                                                                                                                                |
+| Comparing 50 distinct vs 20/25 repeated "doesn't seem fair"        | **resolved** — shipped examples use **equal** counts (Offline 64/64, SingleStream 20). `audit_samples` allows independent counts (upstream TEST04 uses 5000/500 for SDXL) as an opt-in to shorten the audit phase; see §5. |
+| "Forced to run 248 in audit … too long"                            | `samples` (reference) and `audit_samples` (audit) are independent subsets; no full-dataset requirement                                                                                                                     |
+| Audit sample "shuffled or fixed?"                                  | fixed — reference = `WITHOUT_REPLACEMENT`, audit = `SINGLE(sample_index)` (MLPerf `issue_same`)                                                                                                                            |
+| Need an audit config for single-stream too                         | load-pattern validation admits `concurrency` (single-stream) and `max_throughput` (offline)                                                                                                                                |
+| Paced loads should not silently pass                               | `poisson` rejected up front (§4 step 3) — pacing caps throughput and masks caching                                                                                                                                         |
+| Inconsistent / context-free example file names                     | the shipped example YAMLs use context-rich sibling names (`offline_wan22_submission.yaml` / `single_stream_wan22_submission.yaml`); result artifacts use fixed `verify_OUTPUT_CACHING_TEST.txt` + JSON                     |
+| `num_workers` hard-coded in example YAMLs; use default             | omitted from the shipped examples — they carry only what TEST04 requires (endpoint defaults otherwise)                                                                                                                     |
+| README / unrelated dependency churn (`pip`, `aiohttp`) in the diff | this PR contains only the design doc + the two WAN2.2 example configs — no README or dependency changes bundled                                                                                                            |
+
+### Design-review findings (both Review Council passes)
+
+| Finding (severity)                                           | Resolution                                                                                                                                             |
+| ------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `ref_samples` dead write / mismatched counts (high)          | each phase's count is an explicit `RunSpec.n_samples` honored via `n_samples_to_issue` (the bug was the reference count being silently dropped)        |
+| No `AuditTest` abstraction; TEST04 hardcoded (high)          | `AuditTest` protocol + `get_audit_test` registry; generic loop                                                                                         |
+| `DatasetType.AUDIT` abstraction leak (high)                  | dropped; phases derive from a normal PERFORMANCE dataset                                                                                               |
+| `test04` boolean in `RuntimeSettings`/load-gen (high)        | generic `SampleOrderSpec`; load-gen has no test knowledge                                                                                              |
+| `_OVERRIDE_TEST04_SAMPLE_INDEX` stringly-typed kwarg (med)   | typed `run_spec` seam                                                                                                                                  |
+| Two-phase `model_copy` surgery; ref skips validation (med)   | declarative `RunSpec`; validate all specs before any run                                                                                               |
+| Orchestrator untested (med)                                  | unit tests assert per-phase counts + early-return paths                                                                                                |
+| Scattered params / hardcoded threshold (med)                 | per-test config model (`OutputCachingTestConfig`), discriminated on `test` — each test carries only its own knobs                                      |
+| Unfair QPS comparison across counts/contents (med)           | examples use **equal** counts; per-phase completion guard + qps rate-normalization keep unequal counts sound when opted into (upstream-faithful, §5)   |
+| Audit params belong in `AuditConfig`, not `Dataset` (med)    | `AuditConfig` sub-model on `BenchmarkConfig`; `Dataset` untouched                                                                                      |
+| Two parallel verifier entry points (low)                     | one `verify_output_caching(RunStats, RunStats)` core + a single `RunStats.from_report` adapter                                                         |
+| `sample_index` bound-checked late (low)                      | validated vs loaded dataset size before any run                                                                                                        |
+| `audit_config` re-entrancy trap (critical)                   | every phase config sets `audit=None`; cannot re-enter `run_audit`                                                                                      |
+| Orchestrator returns `None`; PASS/FAIL indistinguishable     | `run_audit` returns a typed `AuditResult`; CLI exits `0` (PASS) / `1` (FAIL); errors via the standard handler (`SetupError` → 3, `ExecutionError` → 4) |
+| Non-atomic result write (high)                               | `write_result` uses `tmp → fsync → rename → fsync(parent)`                                                                                             |
+| Duplicates `setup_benchmark` dir / `config.yaml` logic (med) | phases reuse `setup_benchmark`; no recomputed report-dir                                                                                               |
+| `_audit_marker` parsed twice in error path (low)             | n/a — orchestrator owns phase labels, so no directory-swap guard                                                                                       |
+
+### Robustness & API hygiene (Gemini + Review Council)
+
+| Comment                                                              | Resolution                                                                                                                    |
+| -------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| CLI catches only `FileNotFoundError`/`ValueError`; write outside try | n/a in redesign (no standalone verifier CLI); the audit runs via `from-config` and errors propagate to `main.py`'s handler    |
+| `_audit_marker` `AttributeError` on non-`dict` JSON                  | n/a in redesign (no standalone dir-swap marker; the orchestrator owns phase labels)                                           |
+| `Report.from_snapshot` `KeyError`/`TypeError` uncaught               | n/a — snapshots are consumed in-process per phase; a phase that fails to produce a usable report aborts with `ExecutionError` |
+| Public entry points missing from `__all__`                           | `compliance/__init__.py` `__all__` exports the full public surface                                                            |
+
+---
+
+## 9. Success criteria (goal-driven; verify before done)
+
+1. **Integration** — `benchmark from-config` with an `audit:` block runs both phases
+   back-to-back and writes `verify_OUTPUT_CACHING_TEST.txt` + `audit_result.json`; PASS against a
+   no-caching `mock_http_echo_server`, FAIL against a caching mock.
+2. **Completion guard** — a phase that completes far fewer than its _requested_ count fails
+   the result (`completed < requested × (1 − threshold)` → FAIL), independent of the other
+   phase's count.
+3. **Unit** — `SingleSampleOrder` always yields the configured index (bounds-checked);
+   `verify_output_caching` PASS within threshold, FAIL above, boundary at the strict `<` line,
+   slower-passes, custom threshold, and the completion guard trips; `RunStats.from_report`
+   raises on a `None`-duration or non-positive `qps`; `OutputCachingAudit.plan_runs` emits a
+   reference spec at `samples` and an audit spec at `audit_samples` (which may differ).
+4. **Unit (orchestrator)** — assert the reference phase issues `samples` and the audit phase
+   issues `audit_samples` (defaulting to `samples` when omitted), validation fires before any
+   run, the typed result propagates (PASS/FAIL distinguishable), and a phase config never
+   carries `audit` (no re-entry).
+5. **Validation** — a paced (`poisson`) load and an out-of-range `sample_index` are both
+   rejected before any phase runs.
+6. **Robustness** — `RunStats.from_report` raises a clean `ValueError` on a report with no
+   duration (`qps is None`) or non-positive throughput (`qps <= 0`); a phase whose
+   `Report.complete` is `False` (metrics drain timeout / interrupt) aborts the audit with
+   `ExecutionError` rather than certifying a result on partial data.
+7. **No leakage** — `grep -r test04 src/inference_endpoint/{load_generator,config/runtime_settings.py}`
+   returns nothing.
+8. `pre-commit run --all-files` clean (ruff / mypy / license headers).
diff --git a/examples/09_Wan22_VideoGen_Example/offline_wan22_submission.yaml b/examples/09_Wan22_VideoGen_Example/offline_wan22_submission.yaml
new file mode 100644
index 000000000..d908208ef
--- /dev/null
+++ b/examples/09_Wan22_VideoGen_Example/offline_wan22_submission.yaml
@@ -0,0 +1,70 @@
+# Full WAN 2.2 Offline submission: performance + VBench accuracy + output-caching audit (MLPerf TEST04).
+# One command runs all three under a single report_dir:
+#   inference-endpoint benchmark from-config \
+#       examples/09_Wan22_VideoGen_Example/offline_wan22_submission.yaml
+#
+# Execution order (run_benchmark):
+#   1. performance run  — full 248-prompt dataset (the submission perf result)
+#   2. accuracy scoring — VBench over the produced videos
+#   3. audit (output_caching_test, MLPerf TEST04) — reference + fixed-sample phases (equal counts here), then result
+#
+# NOTE: the `audit:` block is implemented per docs/compliance_audit_plan.md
+# (the `compliance/` module). The performance + accuracy portion mirrors
+# offline_wan22_accuracy.yaml.
+
+name: "submission-wan22-video-generation"
+version: "1.0"
+type: "submission"
+benchmark_mode: "offline" # required for type: submission
+
+model_params:
+  name: "wan22"
+  max_new_tokens: 1 # ignored by VideoGenAdapter; kept >0 for api_type debug swaps
+  streaming: "off" # WAN 2.2 uses non-streaming HTTP POST/response
+
+datasets:
+  # Performance dataset drives request issuance (the submission perf run).
+  - name: wan22_perf
+    path: examples/09_Wan22_VideoGen_Example/wan22_prompts.jsonl
+    type: "performance"
+    samples: 248
+
+  # Accuracy dataset reuses the same prompts; videos are scored VBench-style.
+  - name: wan22_vbench
+    path: examples/09_Wan22_VideoGen_Example/wan22_prompts.jsonl
+    type: "accuracy"
+    samples: 248
+    accuracy_config:
+      eval_method: "vbench"
+      ground_truth: "prompt" # VBench input is (prompt, video), not a GT comparison
+      num_repeats: 1
+
+# Output-caching audit (MLPerf TEST04) — additive post-step. Runs its OWN reference + fixed-sample
+# phases at equal counts (the audit count may be lowered to shorten the phase).
+audit:
+  test: "output_caching_test"
+  samples: 64 # reference phase count (subset of the 248 prompts)
+  audit_samples: 64 # audit (fixed-sample) phase count; lower (e.g. 32) to shorten the audit phase
+  sample_index: 3 # MLCommons audit.config performance_issue_same_index=3
+  threshold: 0.10 # audit qps must stay < reference qps * (1 + threshold)
+
+settings:
+  runtime:
+    # NOTE: runs are count-driven (n_samples_to_issue / audit counts). min_duration_ms is
+    # NOT enforced as a duration floor by the current stop logic (counts take priority);
+    # MLCommons' 10-min minimum / AND-semantics is future work. Only max_duration_ms caps.
+    max_duration_ms: 14400000 # 4-hour ceiling
+    scheduler_random_seed: 42
+    dataloader_random_seed: 42
+    n_samples_to_issue: 248 # applies to the perf/accuracy run; audit uses audit.samples / audit.audit_samples
+
+  load_pattern:
+    type: "max_throughput"
+
+endpoint_config:
+  endpoints:
+    - "http://localhost:8000"
+  api_type: "videogen"
+  api_key: null
+
+report_dir: logs/wan22_submission
diff --git a/examples/09_Wan22_VideoGen_Example/single_stream_wan22_submission.yaml b/examples/09_Wan22_VideoGen_Example/single_stream_wan22_submission.yaml
new file mode 100644
index 000000000..2e0998e37
--- /dev/null
+++ b/examples/09_Wan22_VideoGen_Example/single_stream_wan22_submission.yaml
@@ -0,0 +1,71 @@
+# Full WAN 2.2 SingleStream submission: performance + VBench accuracy + output-caching audit (MLPerf TEST04).
+# SingleStream = one request in-flight at a time (concurrency=1).
+# One command runs all three under a single report_dir:
+#   inference-endpoint benchmark from-config \
+#       examples/09_Wan22_VideoGen_Example/single_stream_wan22_submission.yaml
+#
+# Execution order (run_benchmark):
+#   1. performance run  — single-stream latency over the prompt set
+#   2. accuracy scoring — VBench over the produced videos
+#   3. audit (output_caching_test, MLPerf TEST04) — reference + fixed-sample phases, then result
+#
+# NOTE: the `audit:` block is implemented per docs/compliance_audit_plan.md
+# (the `compliance/` module).
+
+name: "submission-wan22-video-generation-singlestream"
+version: "1.0"
+type: "submission"
+benchmark_mode: "online" # required for type: submission
+
+model_params:
+  name: "wan22"
+  max_new_tokens: 1 # ignored by VideoGenAdapter; kept >0 for api_type debug swaps
+  streaming: "off" # WAN 2.2 uses non-streaming HTTP POST/response
+
+datasets:
+  # Performance dataset drives request issuance (the submission perf run).
+  - name: wan22_perf
+    path: examples/09_Wan22_VideoGen_Example/wan22_prompts.jsonl
+    type: "performance"
+    samples: 20 # MLCommons SingleStream min_query_count
+
+  # Accuracy dataset reuses the same prompts; videos are scored VBench-style.
+  - name: wan22_vbench
+    path: examples/09_Wan22_VideoGen_Example/wan22_prompts.jsonl
+    type: "accuracy"
+    samples: 20
+    accuracy_config:
+      eval_method: "vbench"
+      ground_truth: "prompt" # VBench input is (prompt, video), not a GT comparison
+      num_repeats: 1
+
+# Output-caching audit (MLPerf TEST04) — additive post-step. Runs its OWN reference + fixed-sample
+# phases at equal counts (the audit count may be lowered to shorten the phase).
+audit:
+  test: "output_caching_test"
+  samples: 20 # reference phase count (SingleStream min_query_count)
+  audit_samples: 20 # audit (fixed-sample) phase count; omit to equal `samples`
+  sample_index: 3 # MLCommons audit.config performance_issue_same_index=3
+  threshold: 0.20 # low-throughput stream tolerance: audit qps must stay < reference qps * 1.20 (one-sided)
+
+settings:
+  runtime:
+    # NOTE: runs are count-driven (n_samples_to_issue / audit counts). min_duration_ms is
+    # NOT enforced as a duration floor by the current stop logic (counts take priority);
+    # MLCommons' 10-min minimum / AND-semantics is future work. Only max_duration_ms caps.
+    max_duration_ms: 7200000 # 2-hour ceiling
+    scheduler_random_seed: 42
+    dataloader_random_seed: 42
+    n_samples_to_issue: 20 # applies to the perf/accuracy run; audit uses its own counts
+
+  load_pattern:
+    type: "concurrency"
+    target_concurrency: 1 # SingleStream: one request in-flight at a time
+
+endpoint_config:
+  endpoints:
+    - "http://localhost:8000"
+  api_type: "videogen"
+  api_key: null
+
+report_dir: logs/wan22_singlestream_submission

From e909f5f705f257168555f72655e3882d8fd6a7a8 Mon Sep 17 00:00:00 2001
From: Tin-Yin Lai <tinyinl@nvidia.com>
Date: Mon, 22 Jun 2026 18:18:20 -0700
Subject: [PATCH 2/3] feat(compliance): output-caching audit (MLPerf TEST04)
 implementation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Generic AuditTest framework (compliance/): AuditTest protocol +
RunSpec/RunStats/RunArtifacts + registry; OutputCachingAudit
(compliance/tests/output_caching_test.py) implements MLPerf TEST04 caching
detection — reference vs fixed-sample phase, fails if audit QPS exceeds
reference QPS by > threshold. run_audit orchestrator (commands/audit.py)
runs phases back-to-back, validates unpaced load + sample_index, refuses to
certify an incomplete phase, and writes verify_OUTPUT_CACHING_TEST.txt +
audit_result.json atomically (compliance/result.py). Wired via the YAML
audit: block (schema.py AuditTestId/OutputCachingTestConfig) and a generic
SampleOrderSpec + SingleSampleOrder seam in the load generator.

Also folds in the branch's incidental non-compliance changes that touch
these files: the metrics-aggregator --ready-file flag, the service launcher
ready-check timeout widening, and the aiohttp + msgpack==1.2.1 CVE bumps
(uv.lock/pyproject; msgpack clears GHSA-6v7p-g79w-8964).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 pyproject.toml                                |   3 +-
 .../services/metrics_aggregator/__main__.py   |  13 ++
 src/inference_endpoint/commands/audit.py      | 160 ++++++++++++++++++
 .../commands/benchmark/cli.py                 |   6 +-
 .../commands/benchmark/execute.py             |  36 +++-
 src/inference_endpoint/compliance/__init__.py | 121 +++++++++++++
 src/inference_endpoint/compliance/result.py   |  72 ++++++++
 .../compliance/tests/__init__.py              |  18 ++
 .../compliance/tests/output_caching_test.py   | 124 ++++++++++++++
 .../config/runtime_settings.py                |  25 ++-
 src/inference_endpoint/config/schema.py       |  51 ++++++
 .../templates/concurrency_template_full.yaml  |   5 +-
 .../templates/offline_template_full.yaml      |   5 +-
 .../templates/online_template_full.yaml       |   5 +-
 .../load_generator/sample_order.py            |  37 +++-
 uv.lock                                       | 134 +++++++--------
 16 files changed, 732 insertions(+), 83 deletions(-)
 create mode 100644 src/inference_endpoint/commands/audit.py
 create mode 100644 src/inference_endpoint/compliance/__init__.py
 create mode 100644 src/inference_endpoint/compliance/result.py
 create mode 100644 src/inference_endpoint/compliance/tests/__init__.py
 create mode 100644 src/inference_endpoint/compliance/tests/output_caching_test.py

diff --git a/pyproject.toml b/pyproject.toml
index 0b0f67a86..c352ef126 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -73,6 +73,7 @@ dependencies = [
     # Fix pytz-2024 import warning
     "pytz==2026.1.post1",
     "urllib3==2.7.0",
+    "msgpack==1.2.1",
 ]
 
 [project.optional-dependencies]
@@ -112,7 +113,7 @@ test = [
     "Pympler==1.1",
     "scipy==1.17.1",
     # HTTP server and client for mock server fixture
-    "aiohttp==3.14.0",
+    "aiohttp==3.14.1",
     # Plotting for benchmark sweep mode
     "matplotlib==3.10.8",
     # Property-based testing (CLI fuzz)
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
index 2231d6dc8..212977b71 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py
@@ -183,6 +183,16 @@ async def main() -> None:
         default=0,
         help="Identity to send in the readiness signal",
     )
+    parser.add_argument(
+        "--ready-file",
+        type=str,
+        default=None,
+        help=(
+            "If set, touch this file after signal handlers are registered "
+            "so that test harnesses can poll for startup completion instead "
+            "of relying on a fixed sleep."
+        ),
+    )
     args = parser.parse_args()
     setup_logging(level="INFO")
 
@@ -282,6 +292,9 @@ async def main() -> None:
                 ),
             )
 
+            if args.ready_file:
+                Path(args.ready_file).touch()
+
             if args.readiness_path:
                 await send_ready_signal(zmq_ctx, args.readiness_path, args.readiness_id)
 
diff --git a/src/inference_endpoint/commands/audit.py b/src/inference_endpoint/commands/audit.py
new file mode 100644
index 000000000..a1ecbe1a2
--- /dev/null
+++ b/src/inference_endpoint/commands/audit.py
@@ -0,0 +1,160 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Generic compliance audit orchestrator.
+
+run_audit(config) drives all phases of a compliance audit test back-to-back
+against the same endpoint, then verifies the results and writes the result.
+
+Exit semantics (propagated by run_benchmark → cli.py → sys.exit):
+  0  PASS
+  1  FAIL
+  (other)  setup / I/O / phase error — raises SetupError or ExecutionError; the exit code is set by the CLI's standard error handler
+"""
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+
+from ..compliance import RunArtifacts, get_audit_test
+from ..compliance.result import AuditResult, write_result
+from ..config.schema import BenchmarkConfig, LoadPatternType
+from ..exceptions import ExecutionError, SetupError
+
+logger = logging.getLogger(__name__)
+
+
+def run_audit(config: BenchmarkConfig, base_report_dir: Path) -> AuditResult:
+    """Orchestrate the planned audit phases and return the result.
+
+    All phases run back-to-back against the same endpoint, each under its
+    own subdirectory of ``base_report_dir``. If any phase raises, the error
+    is re-raised without verifying (a crashed phase must not produce a result).
+
+    Args:
+        config: Main benchmark config (must have config.audit set).
+        base_report_dir: Report directory for the main run; audit subdirs go here.
+
+    Returns:
+        AuditResult — returned for both PASS and FAIL; caller maps it to an exit code.
+
+    Raises:
+        SetupError: Missing audit block, paced load, no performance dataset, or bad index.
+        ExecutionError: A phase failed, produced no report, or did not complete cleanly.
+    """
+    from ..commands.benchmark.execute import (
+        TestMode,
+        finalize_benchmark,
+        run_benchmark_async,
+        setup_benchmark,
+    )
+
+    audit_cfg = config.audit
+    if audit_cfg is None:
+        raise SetupError("run_audit called with config.audit=None")
+    test = get_audit_test(audit_cfg.test)
+
+    # Validate load pattern. The output-caching audit (MLPerf TEST04) needs a
+    # pattern where cache-induced speedups surface as higher achieved throughput:
+    #   - max_throughput: a faster SUT completes more queries per second.
+    #   - concurrency: at a fixed in-flight count, faster (cached) responses
+    #     raise the completion rate, so caching still shows up in QPS.
+    # Rate-paced patterns (poisson / target-QPS) pin the arrival rate, so a
+    # cached SUT just idles and the speedup is masked; patterns with different
+    # sample semantics (e.g. multi_turn) make the fixed-index phase meaningless.
+    # Allow-list the two valid patterns rather than enumerate the rejects.
+    load_type = config.settings.load_pattern.type
+    if load_type not in (LoadPatternType.MAX_THROUGHPUT, LoadPatternType.CONCURRENCY):
+        raise SetupError(
+            "Compliance audit requires an unpaced load pattern (max_throughput or concurrency). "
+            f"Got: {load_type.value}"
+        )
+
+    specs = test.plan_runs(audit_cfg)
+
+    perf_datasets = [d for d in config.datasets if d.type.value == "performance"]
+    if not perf_datasets:
+        raise SetupError("Audit requires at least one performance dataset")
+
+    # Execute each phase back-to-back. The first phase's setup_benchmark loads
+    # the dataset; reuse that count to bounds-check every fixed-index spec
+    # before any phase actually runs. setup_benchmark only loads data (it spawns
+    # no workers), so a failed bounds check here costs one load and nothing more.
+    artifacts: list[RunArtifacts] = []
+    n_samples: int | None = None
+    for spec in specs:
+        phase_dir = base_report_dir / spec.label
+        phase_dir.mkdir(parents=True, exist_ok=True)
+
+        # Build a per-phase config: phase subdirectory, no nested audit, explicit count.
+        phase_config = config.with_updates(report_dir=phase_dir, audit=None)
+
+        try:
+            ctx = setup_benchmark(phase_config, TestMode.PERF, run_spec=spec)
+            if n_samples is None:
+                n_samples = ctx.dataloader.num_samples()
+                for check_spec in specs:
+                    idx = check_spec.sample_order.fixed_index
+                    if idx is not None and not (0 <= idx < n_samples):
+                        raise SetupError(
+                            f"Audit phase '{check_spec.label}': sample_index={idx} "
+                            f"is out of range [0, {n_samples}) for dataset with "
+                            f"{n_samples} samples"
+                        )
+            bench = run_benchmark_async(ctx)
+            finalize_benchmark(ctx, bench)
+        except (SetupError, ExecutionError):
+            raise
+        except Exception as exc:
+            raise ExecutionError(f"Audit phase '{spec.label}' failed: {exc}") from exc
+
+        report = bench.report
+        if report is None:
+            raise ExecutionError(f"Audit phase '{spec.label}' produced no report")
+        # A metrics drain timeout (async tasks still pending) or an
+        # INTERRUPTED phase yields partial stats; certifying a result from it
+        # would let an incomplete run pass compliance.
+        if not report.complete:
+            raise ExecutionError(
+                f"Audit phase '{spec.label}' did not complete cleanly "
+                "(metrics drain timed out or the run was interrupted); "
+                "refusing to certify a result from partial data"
+            )
+        # When the spec didn't fix a count (None = full dataset), the requested
+        # count is the number actually issued this phase.
+        n_requested = (
+            spec.n_samples if spec.n_samples is not None else report.n_samples_issued
+        )
+        artifacts.append(
+            RunArtifacts(
+                label=spec.label,
+                report_dir=phase_dir,
+                report=report,
+                n_requested=n_requested,
+            )
+        )
+
+    result = test.verify(artifacts, audit_cfg)
+    write_result(result, base_report_dir)
+
+    status = "PASS" if result.passed else "FAIL"
+    logger.info(
+        "Audit %s %s — %s",
+        audit_cfg.test,
+        status,
+        result.details.get("reason", ""),
+    )
+    return result
diff --git a/src/inference_endpoint/commands/benchmark/cli.py b/src/inference_endpoint/commands/benchmark/cli.py
index 685d2d305..6e3e14384 100644
--- a/src/inference_endpoint/commands/benchmark/cli.py
+++ b/src/inference_endpoint/commands/benchmark/cli.py
@@ -17,6 +17,7 @@
 
 from __future__ import annotations
 
+import sys
 from pathlib import Path
 from typing import Annotated
 
@@ -25,6 +26,7 @@
 from pydantic import ValidationError  # noqa: F401 (used in from_config)
 
 from inference_endpoint.commands.benchmark.execute import run_benchmark
+from inference_endpoint.compliance.result import AuditResult
 from inference_endpoint.config.schema import (
     BenchmarkConfig,
     OfflineBenchmarkConfig,
@@ -51,7 +53,9 @@ def _run(config: BenchmarkConfig, dataset: list[str], mode: TestMode) -> None:
             raise DatasetValidationError(f"Invalid --dataset: {msgs}") from e
         except ValueError as e:
             raise DatasetValidationError(f"Invalid --dataset: {e}") from e
-    run_benchmark(config, mode)
+    result = run_benchmark(config, mode)
+    if isinstance(result, AuditResult):
+        sys.exit(0 if result.passed else 1)
 
 
 @benchmark_app.command
diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py
index 7019ef444..98a734847 100644
--- a/src/inference_endpoint/commands/benchmark/execute.py
+++ b/src/inference_endpoint/commands/benchmark/execute.py
@@ -58,6 +58,8 @@
     MetricsSnapshotSubscriber,
 )
 from inference_endpoint.async_utils.transport.zmq.context import ManagedZMQContext
+from inference_endpoint.compliance import RunSpec
+from inference_endpoint.compliance.result import AuditResult
 from inference_endpoint.config.runtime_settings import RuntimeSettings
 from inference_endpoint.config.schema import (
     APIType,
@@ -310,7 +312,11 @@ def _load_datasets(
     return dataloader, accuracy_datasets, eval_configs
 
 
-def setup_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> BenchmarkContext:
+def setup_benchmark(
+    config: BenchmarkConfig,
+    test_mode: TestMode,
+    run_spec: RunSpec | None = None,
+) -> BenchmarkContext:
     """Load tokenizer, dataset, create scheduler, setup report dir."""
     # CPU affinity
     affinity_plan = (
@@ -352,6 +358,12 @@ def setup_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> BenchmarkCo
 
     # Setup runtime settings using factory method
     rt_settings = RuntimeSettings.from_config(config, dataloader.num_samples())
+    if run_spec is not None:
+        rt_settings = dataclass_replace(
+            rt_settings,
+            n_samples_to_issue=run_spec.n_samples,
+            sample_order=run_spec.sample_order,
+        )
 
     # Calculate and display expected sample count
     total_samples = rt_settings.total_samples_to_issue()
@@ -595,7 +607,7 @@ async def _run_benchmark_async(
                     args=event_logger_args,
                 ),
             ],
-            timeout=30.0,
+            timeout=120.0,
         )
 
         # Create endpoint client on the shared loop
@@ -931,8 +943,13 @@ def finalize_benchmark(ctx: BenchmarkContext, bench: BenchmarkResult) -> None:
         logger.error(f"Save failed: {e}")
 
 
-def run_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> None:
-    """Orchestrate setup → execute → finalize."""
+def run_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> AuditResult | None:
+    """Orchestrate setup → execute → finalize [→ audit].
+
+    Returns an AuditResult when config.audit is set; None otherwise.
+    Callers should map the result to an exit code (0=PASS, 1=FAIL).
+    """
+
     logger.debug(
         "BenchmarkConfig (%s):\n%s",
         type(config).__name__,
@@ -944,10 +961,19 @@ def run_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> None:
         bench = run_benchmark_async(ctx)
         finalize_benchmark(ctx, bench)
     except KeyboardInterrupt:
-        logger.warning("Benchmark interrupted by user")
+        # Salvage partial results (finally), then propagate: an interrupted
+        # run must not silently roll into the long compliance audit phases.
+        logger.warning("Benchmark interrupted by user; skipping audit")
+        raise
     finally:
         if bench:
             if bench.tmpfs_dir.exists():
                 _salvage_tmpfs(ctx.report_dir, bench.tmpfs_dir)
                 shutil.rmtree(bench.tmpfs_dir, ignore_errors=True)
             logger.info(f"Partial results saved to {ctx.report_dir}")
+
+    if config.audit is not None:
+        from inference_endpoint.commands.audit import run_audit
+
+        return run_audit(config, ctx.report_dir)
+    return None
diff --git a/src/inference_endpoint/compliance/__init__.py b/src/inference_endpoint/compliance/__init__.py
new file mode 100644
index 000000000..4490f71c7
--- /dev/null
+++ b/src/inference_endpoint/compliance/__init__.py
@@ -0,0 +1,121 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Compliance audit framework.
+
+AuditTest protocol + RunSpec/RunStats/RunArtifacts types + test registry.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, ClassVar, Protocol
+
+from ..metrics.report import Report
+from .result import AuditResult
+
+if TYPE_CHECKING:
+    from ..config.runtime_settings import SampleOrderSpec
+    from ..config.schema import AuditConfig, AuditTestId
+
+
+@dataclass(frozen=True, slots=True)
+class RunSpec:
+    """Declarative description of one audit phase.
+
+    ``label`` names the phase; ``sample_order`` selects how samples are drawn.
+    ``n_samples = None`` ("issue the benchmark's default count": full dataset /
+    duration-driven) reaches ``RuntimeSettings.n_samples_to_issue`` unchanged.
+    """
+
+    label: str
+    n_samples: int | None
+    sample_order: SampleOrderSpec
+
+
+@dataclass(frozen=True, slots=True)
+class RunStats:
+    """Throughput summary of one phase, as consumed by AuditTest.verify()."""
+
+    qps: float
+    n_completed: int
+    n_requested: int
+
+    @classmethod
+    def from_report(cls, report: Report, n_requested: int) -> RunStats:
+        # Reject reports that cannot yield a usable throughput figure.
+        throughput = report.qps()
+        if throughput is None:
+            raise ValueError("Report has no duration — cannot compute QPS")
+        if throughput <= 0:
+            raise ValueError(
+                f"Report has non-positive throughput (qps={throughput}); the run "
+                "completed no samples, so an output-caching comparison is impossible"
+            )
+        done = report.n_samples_completed
+        return cls(qps=throughput, n_completed=done, n_requested=n_requested)
+
+
+@dataclass(frozen=True, slots=True)
+class RunArtifacts:
+    """Outputs gathered from one audit phase; the input to AuditTest.verify()."""
+
+    label: str
+    report_dir: Path
+    report: Report
+    n_requested: int
+
+    def stats(self) -> RunStats:
+        return RunStats.from_report(report=self.report, n_requested=self.n_requested)
+
+
+class AuditTest(Protocol):
+    test_id: ClassVar[AuditTestId]  # .value is the key register() stores under
+
+    def plan_runs(self, cfg: AuditConfig) -> list[RunSpec]: ...  # one per phase
+
+    def verify(self, runs: list[RunArtifacts], cfg: AuditConfig) -> AuditResult: ...
+
+
+_REGISTRY: dict[str, AuditTest] = {}
+
+
+def register(test: AuditTest) -> None:
+    # Key on .value; str() of a (str, Enum) member is "AuditTestId.<NAME>".
+    key = test.test_id.value
+    _REGISTRY[key] = test
+
+
+def get_audit_test(test_id: AuditTestId) -> AuditTest:
+    """Return the test for ``test_id``; a miss imports ``.tests`` to register."""
+    name = test_id.value
+    if name not in _REGISTRY:
+        from . import tests as _  # noqa: F401
+
+    if name not in _REGISTRY:
+        raise KeyError(f"No audit test registered for '{name}'")
+    return _REGISTRY[name]
+
+
+__all__ = [
+    "AuditTest",
+    "AuditResult",
+    "RunArtifacts",
+    "RunSpec",
+    "RunStats",
+    "get_audit_test",
+    "register",
+]
diff --git a/src/inference_endpoint/compliance/result.py b/src/inference_endpoint/compliance/result.py
new file mode 100644
index 000000000..b4ae0092b
--- /dev/null
+++ b/src/inference_endpoint/compliance/result.py
@@ -0,0 +1,87 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Audit result type and atomic disk writer."""
+
+from __future__ import annotations
+
+import json
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+
+@dataclass(frozen=True, slots=True)
+class AuditResult:
+    """Outcome of an AuditTest.verify() call."""
+
+    test_id: str
+    passed: bool
+    details: dict[str, Any]
+
+
+def write_result(result: AuditResult, report_dir: Path) -> None:
+    """Atomically write verify_<TEST>.txt and audit_result.json to report_dir.
+
+    Uses tmp → fsync(file) → rename → fsync(parent_dir) so a mid-write
+    crash never leaves a partial file that an MLCommons validator would
+    silently accept as passing.
+
+    The top-level ``test`` and ``passed`` JSON fields always come from
+    ``result`` itself; same-named keys in ``result.details`` cannot override them.
+    """
+    test_upper = result.test_id.upper()
+    _atomic_write_text(
+        report_dir / f"verify_{test_upper}.txt",
+        f"Performance check pass: {result.passed}\n",
+    )
+    # Merge details, then re-assert the authoritative fields: a colliding
+    # details key must never flip the recorded verdict. Key order is kept.
+    payload: dict[str, Any] = {"test": result.test_id, "passed": result.passed}
+    payload.update(result.details)
+    payload.update(test=result.test_id, passed=result.passed)
+    _atomic_write_text(
+        report_dir / "audit_result.json",
+        json.dumps(payload, indent=2) + "\n",
+    )
+
+
+def _atomic_write_text(path: Path, content: str) -> None:
+    """Durably write ``content`` to ``path`` via a sibling ``.tmp`` file.
+
+    The data is written, flushed and fsync'ed through the same writable
+    handle, the temp file is atomically moved over ``path``, and the parent
+    directory is fsync'ed so the rename itself is persisted. On failure the
+    temp file is removed and the exception re-raised.
+    """
+    tmp = path.with_suffix(path.suffix + ".tmp")
+    try:
+        # fsync the handle we wrote through: reopening read-only and syncing
+        # that descriptor is not portable (fsync may reject a read-only fd).
+        with open(tmp, "w", encoding="utf-8") as f:
+            f.write(content)
+            f.flush()
+            os.fsync(f.fileno())
+        # os.replace overwrites an existing target atomically.
+        os.replace(tmp, path)
+        dir_fd = os.open(path.parent, os.O_RDONLY | os.O_DIRECTORY)
+        try:
+            os.fsync(dir_fd)
+        finally:
+            os.close(dir_fd)
+    except Exception:
+        tmp.unlink(missing_ok=True)
+        raise
diff --git a/src/inference_endpoint/compliance/tests/__init__.py b/src/inference_endpoint/compliance/tests/__init__.py
new file mode 100644
index 000000000..dbd1b2ebe
--- /dev/null
+++ b/src/inference_endpoint/compliance/tests/__init__.py
@@ -0,0 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Import all audit-test modules to fire their register() calls."""
+
+from . import output_caching_test  # noqa: F401
diff --git a/src/inference_endpoint/compliance/tests/output_caching_test.py b/src/inference_endpoint/compliance/tests/output_caching_test.py
new file mode 100644
index 000000000..2241287f4
--- /dev/null
+++ b/src/inference_endpoint/compliance/tests/output_caching_test.py
@@ -0,0 +1,124 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Output-caching audit (MLPerf TEST04).
+
+Detects throughput inflation from duplicate-query caching by issuing the
+same sample repeatedly (the audit phase) and comparing QPS against a
+reference run of distinct samples. This re-implements the intent of the
+MLPerf Inference TEST04 compliance test.
+
+Pass criterion (MLCommons-faithful):
+  Each phase completed ≥ requested * (1 - threshold)
+  AND audit_qps < ref_qps * (1 + threshold)  [caching inflates audit QPS → FAIL]
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, ClassVar
+
+from ...config.runtime_settings import SampleOrderSpec
+from ...config.schema import AuditTestId
+from .. import RunArtifacts, RunSpec, RunStats, register
+from ..result import AuditResult
+
+if TYPE_CHECKING:
+    from ...config.schema import AuditConfig, OutputCachingTestConfig
+
+
+class OutputCachingAudit:
+    """AuditTest implementation of the output-caching check (MLPerf TEST04)."""
+
+    test_id: ClassVar[AuditTestId] = AuditTestId.OUTPUT_CACHING_TEST
+
+    def plan_runs(self, cfg: AuditConfig) -> list[RunSpec]:
+        """Plan the reference phase, then the fixed-sample audit phase."""
+        conf: OutputCachingTestConfig = cfg  # type: ignore[assignment]
+        # The reference count is mandatory; an omitted audit count mirrors it.
+        n_audit = conf.samples if conf.audit_samples is None else conf.audit_samples
+        reference = RunSpec(
+            label="reference",
+            n_samples=conf.samples,
+            sample_order=SampleOrderSpec.without_replacement(),
+        )
+        repeated = RunSpec(
+            label="output_caching",
+            n_samples=n_audit,
+            sample_order=SampleOrderSpec.single(conf.sample_index),
+        )
+        return [reference, repeated]
+
+    def verify(self, runs: list[RunArtifacts], cfg: AuditConfig) -> AuditResult:
+        """Compare the two phases' stats; requires [reference, output_caching]."""
+        n_phases = len(runs)
+        if n_phases != 2:
+            raise ValueError(
+                "Output-caching verify expects exactly 2 phases (reference, "
+                f"output_caching); got {n_phases}"
+            )
+        conf: OutputCachingTestConfig = cfg  # type: ignore[assignment]
+        reference, repeated = runs
+        ref_stats = reference.stats()
+        audit_stats = repeated.stats()
+        return verify_output_caching(ref_stats, audit_stats, threshold=conf.threshold)
+
+
+def verify_output_caching(
+    ref: RunStats,
+    audit: RunStats,
+    threshold: float = 0.10,
+) -> AuditResult:
+    """Decide the output-caching (MLPerf TEST04) verdict — pure function, no I/O.
+
+    The result passes only when both hold:
+      1. Each phase completed ≥ (1 - threshold) of its requested queries.
+      2. audit_qps < ref_qps * (1 + threshold)
+    """
+    floor = 1.0 - threshold
+    complete = all(s.n_completed >= s.n_requested * floor for s in (ref, audit))
+
+    if complete:
+        limit = ref.qps * (1.0 + threshold)
+        passed = audit.qps < limit
+        comparator = "<" if passed else ">="
+        reason = (
+            f"audit_qps={audit.qps:.4f} {comparator} "
+            f"ref_qps * (1 + {threshold:.0%}) = {limit:.4f}"
+        )
+    else:
+        passed = False
+        reason = (
+            f"Phase incomplete: reference {ref.n_completed}/{ref.n_requested}, "
+            f"audit {audit.n_completed}/{audit.n_requested} "
+            f"(threshold {threshold:.0%})"
+        )
+
+    return AuditResult(
+        test_id=AuditTestId.OUTPUT_CACHING_TEST.value,
+        passed=passed,
+        details={
+            "ref_qps": ref.qps,
+            "audit_qps": audit.qps,
+            "threshold": threshold,
+            "ref_completed": ref.n_completed,
+            "ref_requested": ref.n_requested,
+            "audit_completed": audit.n_completed,
+            "audit_requested": audit.n_requested,
+            "reason": reason,
+        },
+    )
+
+
+register(OutputCachingAudit())
diff --git a/src/inference_endpoint/config/runtime_settings.py b/src/inference_endpoint/config/runtime_settings.py
index 7259bda75..e1b0109cb 100644
--- a/src/inference_endpoint/config/runtime_settings.py
+++ b/src/inference_endpoint/config/runtime_settings.py
@@ -28,7 +28,7 @@
 import logging
 import math
 import random
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import TYPE_CHECKING
 
 from .. import metrics
@@ -41,6 +41,25 @@
     from .schema import BenchmarkConfig, LoadPattern
 
 
+@dataclass(frozen=True, slots=True)
+class SampleOrderSpec:
+    """Selects which sample ordering create_sample_order builds.
+
+    fixed_index=None   -> without-replacement order (the default).
+    fixed_index=<int>  -> every draw returns that one dataset index.
+    """
+
+    fixed_index: int | None = None
+
+    @classmethod
+    def without_replacement(cls) -> SampleOrderSpec:
+        return cls()
+
+    @classmethod
+    def single(cls, index: int) -> SampleOrderSpec:
+        return cls(index)
+
+
 @dataclass(frozen=True, slots=True)
 class RuntimeSettings:
     """Immutable runtime settings for benchmark execution.
@@ -85,6 +104,9 @@ class RuntimeSettings:
     load_pattern: LoadPattern | None
     """Load pattern configuration"""
 
+    sample_order: SampleOrderSpec = field(default_factory=SampleOrderSpec, kw_only=True)
+    """Sample-ordering strategy (default: without-replacement)."""
+
     @classmethod
     def from_config(
         cls,
@@ -162,6 +184,7 @@ def _from_config_default(
             "rng_sched": random.Random(runtime_cfg.scheduler_random_seed),
             "rng_sample_index": random.Random(runtime_cfg.dataloader_random_seed),
             "load_pattern": load_pattern_cfg,
+            "sample_order": SampleOrderSpec(),
         }
 
         # Apply overrides
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index 2cfa35d73..172509de1 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -100,6 +100,53 @@ class ScorerMethod(str, Enum):
     VBENCH = "vbench"
 
 
+class AuditTestId(str, Enum):
+    """Compliance audit test identifiers; ``.value`` is the audit registry key."""
+
+    # Output-caching audit — MLPerf TEST04 (duplicate-query caching detection).
+    OUTPUT_CACHING_TEST = "output_caching_test"
+
+
+class OutputCachingTestConfig(BaseModel):
+    """Configuration for the output-caching audit (MLPerf TEST04).
+
+    The output-caching test runs two back-to-back phases — a reference run of
+    distinct samples and an audit run that repeats one fixed sample — then
+    checks that the audit QPS does not exceed the reference QPS by more than
+    ``threshold``. A large speedup indicates the SUT is caching responses.
+
+    samples: reference-phase query count (required — a duration-driven phase
+        has no independent target to validate per-phase completion against)
+    audit_samples: audit-phase query count (None → equals samples)
+    sample_index: which dataset row is repeated (MLCommons performance_issue_same_index)
+    threshold: caching tolerance; audit_qps must stay < ref_qps * (1 + threshold).
+        Also the completion slack: a phase must finish ≥ (1 - threshold) of its count
+    """
+
+    model_config = ConfigDict(frozen=True, extra="forbid")
+
+    test: Literal[AuditTestId.OUTPUT_CACHING_TEST]
+    samples: int = Field(..., ge=1, description="Reference phase query count")
+    audit_samples: int | None = Field(
+        None, ge=1, description="Audit phase query count (default: equals samples)"
+    )
+    sample_index: int = Field(
+        0, ge=0, description="Dataset row index repeated in the audit phase"
+    )
+    threshold: float = Field(
+        0.10,
+        gt=0,
+        lt=1,
+        description="Caching tolerance: audit_qps must stay < ref_qps * (1 + threshold)",
+    )
+
+
+# Single member today; becomes
+# Annotated[OutputCachingTestConfig | ..., Field(discriminator="test")]
+# when additional audit tests are added.
+AuditConfig = OutputCachingTestConfig
+
+
 class TestMode(str, Enum):
     """Test mode determining what to collect.
 
@@ -702,6 +749,10 @@ class BenchmarkConfig(WithUpdatesMixin, BaseModel):
             help="NUMA-aware CPU pinning",
         ),
     ] = True
+    audit: Annotated[AuditConfig | None, cyclopts.Parameter(show=False)] = Field(
+        None,
+        description="Compliance audit config (YAML only). When set, runs the audit after the main benchmark.",
+    )
 
     @field_validator("datasets", mode="before")
     @classmethod
diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
index 4fef4afcb..fc7b7dd7a 100644
--- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml
+++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
@@ -21,7 +21,7 @@ datasets:  # Dataset configs
   type: performance  # Dataset purpose: performance or accuracy | options: performance, accuracy
   path: '<DATASET_PATH eg: tests/assets/datasets/dummy_1k.jsonl>'  # Dataset file path
   format: null  # Dataset format (auto-detected)
-  samples: null  # Number of samples to use
+  samples: null
   eval_method: null
   parser:  # Column remapping: {prompt: <col>, system: <col>}
     prompt: text_input
@@ -31,7 +31,7 @@ datasets:  # Dataset configs
   type: accuracy  # Dataset purpose: performance or accuracy | options: performance, accuracy
   path: '<DATASET_PATH eg: tests/assets/datasets/ds_samples.jsonl>'  # Dataset file path
   format: null  # Dataset format (auto-detected)
-  samples: null  # Number of samples to use
+  samples: null
   eval_method: exact_match  # Accuracy evaluation method | options: exact_match, contains, judge
   parser:  # Column remapping: {prompt: <col>, system: <col>}
     prompt: question
@@ -96,3 +96,4 @@ report_dir: null  # Report output directory
 timeout: null  # Global timeout in seconds
 verbose: false  # Enable verbose logging
 enable_cpu_affinity: true  # NUMA-aware CPU pinning
+audit: null  # Compliance audit config (YAML only). When set, runs the audit after the main benchmark.
diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml
index 1f61837fe..33fed12f3 100644
--- a/src/inference_endpoint/config/templates/offline_template_full.yaml
+++ b/src/inference_endpoint/config/templates/offline_template_full.yaml
@@ -21,7 +21,7 @@ datasets:  # Dataset configs
   type: performance  # Dataset purpose: performance or accuracy | options: performance, accuracy
   path: '<DATASET_PATH eg: tests/assets/datasets/dummy_1k.jsonl>'  # Dataset file path
   format: null  # Dataset format (auto-detected)
-  samples: null  # Number of samples to use
+  samples: null
   eval_method: null
   parser:  # Column remapping: {prompt: <col>, system: <col>}
     prompt: text_input
@@ -31,7 +31,7 @@ datasets:  # Dataset configs
   type: accuracy  # Dataset purpose: performance or accuracy | options: performance, accuracy
   path: '<DATASET_PATH eg: tests/assets/datasets/ds_samples.jsonl>'  # Dataset file path
   format: null  # Dataset format (auto-detected)
-  samples: null  # Number of samples to use
+  samples: null
   eval_method: exact_match  # Accuracy evaluation method | options: exact_match, contains, judge
   parser:  # Column remapping: {prompt: <col>, system: <col>}
     prompt: question
@@ -96,3 +96,4 @@ report_dir: null  # Report output directory
 timeout: null  # Global timeout in seconds
 verbose: false  # Enable verbose logging
 enable_cpu_affinity: true  # NUMA-aware CPU pinning
+audit: null  # Compliance audit config (YAML only). When set, runs the audit after the main benchmark.
diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml
index a212fa95b..907552975 100644
--- a/src/inference_endpoint/config/templates/online_template_full.yaml
+++ b/src/inference_endpoint/config/templates/online_template_full.yaml
@@ -21,7 +21,7 @@ datasets:  # Dataset configs
   type: performance  # Dataset purpose: performance or accuracy | options: performance, accuracy
   path: '<DATASET_PATH eg: tests/assets/datasets/dummy_1k.jsonl>'  # Dataset file path
   format: null  # Dataset format (auto-detected)
-  samples: null  # Number of samples to use
+  samples: null
   eval_method: null
   parser:  # Column remapping: {prompt: <col>, system: <col>}
     prompt: text_input
@@ -31,7 +31,7 @@ datasets:  # Dataset configs
   type: accuracy  # Dataset purpose: performance or accuracy | options: performance, accuracy
   path: '<DATASET_PATH eg: tests/assets/datasets/ds_samples.jsonl>'  # Dataset file path
   format: null  # Dataset format (auto-detected)
-  samples: null  # Number of samples to use
+  samples: null
   eval_method: exact_match  # Accuracy evaluation method | options: exact_match, contains, judge
   parser:  # Column remapping: {prompt: <col>, system: <col>}
     prompt: question
@@ -96,3 +96,4 @@ report_dir: null  # Report output directory
 timeout: null  # Global timeout in seconds
 verbose: false  # Enable verbose logging
 enable_cpu_affinity: true  # NUMA-aware CPU pinning
+audit: null  # Compliance audit config (YAML only). When set, runs the audit after the main benchmark.
diff --git a/src/inference_endpoint/load_generator/sample_order.py b/src/inference_endpoint/load_generator/sample_order.py
index 3feddd92f..d5c52512a 100644
--- a/src/inference_endpoint/load_generator/sample_order.py
+++ b/src/inference_endpoint/load_generator/sample_order.py
@@ -42,14 +42,18 @@ class SampleOrder(ABC):
     def __init__(
         self,
         n_samples_in_dataset: int,
-        rng: random.Random = random,  # type: ignore[assignment]
+        rng: random.Random | None = None,
     ):
         if n_samples_in_dataset <= 0:
             raise ValueError(
                 f"n_samples_in_dataset must be > 0, got {n_samples_in_dataset}"
             )
         self.n_samples_in_dataset = n_samples_in_dataset
-        self.rng = rng
+        # Default to a fresh per-instance Random rather than the process-global
+        # `random` module, so an order constructed without an explicit rng can't
+        # couple its draws to unrelated global state. Reproducible runs pass a
+        # seeded rng (see create_sample_order).
+        self.rng = rng if rng is not None else random.Random()
 
     def __iter__(self) -> Iterator[int]:
         return self
@@ -102,8 +106,35 @@ def next_sample_index(self) -> int:
         return self.rng.randint(0, self.n_samples_in_dataset - 1)
 
 
+class SingleSampleOrder(SampleOrder):
+    """Sample order that repeats a single dataset index on every draw.
+
+    The index is validated against the dataset size when the order is built
+    and never changes afterwards; the inherited rng plays no role.
+    """
+
+    def __init__(self, sample_index: int, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        upper = self.n_samples_in_dataset
+        if not 0 <= sample_index < upper:
+            raise ValueError(
+                f"sample_index must be in [0, {upper}), got {sample_index}"
+            )
+        self.sample_index = sample_index
+
+    def next_sample_index(self) -> int:
+        return self.sample_index
+
+
 def create_sample_order(settings: RuntimeSettings) -> SampleOrder:
-    """Create a SampleOrder from RuntimeSettings."""
+    """Create a SampleOrder from RuntimeSettings, switching on sample_order spec."""
+    spec = settings.sample_order
+    if spec.fixed_index is not None:
+        return SingleSampleOrder(
+            sample_index=spec.fixed_index,
+            n_samples_in_dataset=settings.n_samples_from_dataset,
+            rng=settings.rng_sample_index,
+        )
     return WithoutReplacementSampleOrder(
         n_samples_in_dataset=settings.n_samples_from_dataset,
         rng=settings.rng_sample_index,
diff --git a/uv.lock b/uv.lock
index bfdb3b236..f58a0ab31 100644
--- a/uv.lock
+++ b/uv.lock
@@ -29,7 +29,7 @@ wheels = [
 
 [[package]]
 name = "aiohttp"
-version = "3.14.0"
+version = "3.14.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "aiohappyeyeballs", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
@@ -41,42 +41,42 @@ dependencies = [
     { name = "typing-extensions", marker = "(python_full_version < '3.13' and platform_machine == 'arm64' and sys_platform == 'darwin') or (python_full_version < '3.13' and platform_machine == 'x86_64' and sys_platform == 'darwin') or (python_full_version < '3.13' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux')" },
     { name = "yarl", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/ee/ab/93ce242f899b68c51b0578c027aafa791ab3614cb9345fa5d37b5f5c8e3e/aiohttp-3.14.0.tar.gz", hash = "sha256:2882de819734c715fd1b9c11c97e09fa020d14438203d1d354d8ed1702791c9b", size = 7940674, upload-time = "2026-06-01T19:41:02.763Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/89/97/2b6889bfb6b6847520d50d95eb8c4307a45e28aaca39faf4a9454b3d1b2f/aiohttp-3.14.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:b29518c9c2ec7e373e68259206a137c7f4f5439c58baaec4b5ab3ab799850a4e", size = 750194, upload-time = "2026-06-01T19:37:48.164Z" },
-    { url = "https://files.pythonhosted.org/packages/21/e2/62634b7fff918ed98c3c6b2f0e70d520f7f28846cb412d451b04354c6459/aiohttp-3.14.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:dbec68ce61b64cb73cab4d33df9433427b1713c8bcccb181dce695c1b6f8e87c", size = 506966, upload-time = "2026-06-01T19:37:50.014Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/fb/5ce075150828c797a5106f1c2fb26034e709d4289b9d2bf8b07f1e59fac6/aiohttp-3.14.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3cdf534aa455593e589302990c5097aa5c92c06c4262a20da22934f9186a5fff", size = 507527, upload-time = "2026-06-01T19:37:51.96Z" },
-    { url = "https://files.pythonhosted.org/packages/01/d5/405a0ae4e6b081754a3609c1c97c63a950e000a2def16046f1e736933a0e/aiohttp-3.14.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cb6c657104393b5fbff01a5f59b2023db74058a8077d94475d6c25d03882a108", size = 1762420, upload-time = "2026-06-01T19:37:53.839Z" },
-    { url = "https://files.pythonhosted.org/packages/19/d8/51de5c6b971c27bb1ef620293b8d1ca611ec78736b34b3f6ccf68e4c8785/aiohttp-3.14.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:78d6f9286a629ce52728430afe18f8ed2b6c39a1fddb3802d7244b9983910ad2", size = 1783112, upload-time = "2026-06-01T19:38:02.641Z" },
-    { url = "https://files.pythonhosted.org/packages/bc/05/750a3265ca4dc54a460bd0cb1121a8f2ce9171fce4a135fb47ea7fd594d2/aiohttp-3.14.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4d6a998191f5ebe3b8c28463ff72bc030250008b3193c402464efadd08b5ca02", size = 1723119, upload-time = "2026-06-01T19:38:06.713Z" },
-    { url = "https://files.pythonhosted.org/packages/a8/fb/05d9214c975f23225a8cd5c439325e338c7c377b315480ef3871db51f54e/aiohttp-3.14.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5ba10966d4f03dd96a14365be4b8e37c327c76f11c3ca867116966cdd9f98066", size = 1760193, upload-time = "2026-06-01T19:38:17.624Z" },
-    { url = "https://files.pythonhosted.org/packages/11/41/cc2d2cfbfbdc3126ba258f3cd27d1ac8a33492ae3c35a4583ee21f0ba7f1/aiohttp-3.14.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:3366751d68d237c621264233a32f3078bbc21b7904ab90a77e03d21390c742c6", size = 481670, upload-time = "2026-06-01T19:38:29.836Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/07/381f4023c3b08cb616e520f566d8c58957abad54e56441d41fe67cfb0195/aiohttp-3.14.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:57ea07d28695a7a40304d42251892a8df765e5588c10ee32afeddcd5df33c0a2", size = 487591, upload-time = "2026-06-01T19:38:31.704Z" },
-    { url = "https://files.pythonhosted.org/packages/fb/4d/4506fdb7a022bdf70011a3bbb4ca00c5c570026ef6a3c5bd7bc70c39089c/aiohttp-3.14.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:076cb014191ae2e65d949e1ad01f1dcfe33e32789b5172510f3e79c79fc04d50", size = 496503, upload-time = "2026-06-01T19:38:33.6Z" },
-    { url = "https://files.pythonhosted.org/packages/ef/7d/c814111e04894a45d9e2defc94443879a6f118d9633d5fedfe6e2e8af5f0/aiohttp-3.14.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2f3fc37054564dee64a855b5b092d87ec35dcddfaabf7dacb1c8a2b1f83dc0a9", size = 745870, upload-time = "2026-06-01T19:38:36.013Z" },
-    { url = "https://files.pythonhosted.org/packages/c6/ee/80eee0efddfe187e7cd05027086b7ce1c0e492e82a4eda58f5c5543a44a0/aiohttp-3.14.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8fcaef74d2ab0f607d7ff85a0d15e21bb5a258c4a58df1908396eb50d7f4ed3c", size = 505588, upload-time = "2026-06-01T19:38:38.282Z" },
-    { url = "https://files.pythonhosted.org/packages/d6/f8/0f28f04eef75d52fc9c715dde7ce9c0abb810fd20cfeb0fea7afd2ab1e98/aiohttp-3.14.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e4c01b0bfc6209590960e68eac083cd22d5d87c21f974dd6208cafa5d3542bc8", size = 504492, upload-time = "2026-06-01T19:38:40.611Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/db/44c755232085545065c94378dfce38641b1aee647f4939fcd32f5b32e719/aiohttp-3.14.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f12eb7896e81caf403a2b18c9406426f1207361e7239c057ab29c076d4257e83", size = 1752111, upload-time = "2026-06-01T19:38:42.682Z" },
-    { url = "https://files.pythonhosted.org/packages/c5/a3/3800dbd095cb2bb165a7ea5d94d790914677e27f45638c7d80e3f34c8945/aiohttp-3.14.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:26d9224c6dd7f5c749aba4f61315a894601448b28d94d12f4dea0903e26d2096", size = 1777241, upload-time = "2026-06-01T19:38:52.04Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/3d/dc94df99ed1511fdf28314f722643ed334112643cab00223577085e788c4/aiohttp-3.14.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:23e8314e7aed8576fbe33314d218bd81447a3adbc91dc36f1163bf583cd3084c", size = 1714864, upload-time = "2026-06-01T19:38:56.788Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/10/ab28818262f4d26bdb47ed5f1fc7999b69e2fc6e0370b02d0f49011f45ea/aiohttp-3.14.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:666c7c5036df57b693026398b69b41874a1931ac5b3485fd910e57bfac253869", size = 1754516, upload-time = "2026-06-01T19:39:08.788Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/fe/6edbf5d39bf29322b6816365b17ed8ede4dace164a3aea1abcd30110eb78/aiohttp-3.14.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:70ea956f6cc4a37620966b56c2e205d88ca3e6d85ec063277e414b1035cddad3", size = 483329, upload-time = "2026-06-01T19:39:22.607Z" },
-    { url = "https://files.pythonhosted.org/packages/1b/5a/fae531bdbc6456fb6241f46b7b81e4d8a0dd3fc09118a0055dc7141ac1ec/aiohttp-3.14.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:ea3b9806c89f61da22fddf1f12dd524fb368e5e28f1261fbdafe5c3cd8ce893b", size = 489502, upload-time = "2026-06-01T19:39:24.881Z" },
-    { url = "https://files.pythonhosted.org/packages/36/f4/48a7b0414db7fed77a03d5dde34508c026afd83510ab6bca08c313855776/aiohttp-3.14.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:a071be341c2bd9b0188e62d173509f024e0a35b1c342c53c50f8daaeda8c3bd8", size = 497357, upload-time = "2026-06-01T19:39:27.197Z" },
-    { url = "https://files.pythonhosted.org/packages/75/75/e85a13a370acc007fca5feb1fd1b88ac2d8426e6dadd625479b7cadd55a3/aiohttp-3.14.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:198cfe61bf253b19da1fb3e0fa122249dc4f14c12709493fed8054aa0411cc76", size = 750898, upload-time = "2026-06-01T19:39:29.563Z" },
-    { url = "https://files.pythonhosted.org/packages/9e/e4/3d637f800c724eff0e2bed64df72557444482366fd0a35b0cec0e6968f6c/aiohttp-3.14.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9dc203d6ce6b9106d54e2a93f41dfdfebfbca2d99962ba503bfd3e5921a6549e", size = 506986, upload-time = "2026-06-01T19:39:31.872Z" },
-    { url = "https://files.pythonhosted.org/packages/1d/df/35161f3598bf7501d2b2a805b41ab4f45a2e34150c421bcb4ef8c0d281a7/aiohttp-3.14.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:9e19d17ab02bf16832a2c8c0d55a486792c5b1645665652ee9531aebcc30cb72", size = 508033, upload-time = "2026-06-01T19:39:34.137Z" },
-    { url = "https://files.pythonhosted.org/packages/e5/39/b36e5d3d31e850fb4691dd3e941684ac490a2559249f6fa634b6b0fdf020/aiohttp-3.14.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d925fba0c14d5b498a8028b0107beebdfd16c5d48d702ff54f879cb017aaaca3", size = 1746213, upload-time = "2026-06-01T19:39:36.654Z" },
-    { url = "https://files.pythonhosted.org/packages/3a/05/27df32c844b2156e1675a8d8ec22d963e3c8ba469ed7ceb1863320c7b521/aiohttp-3.14.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ff82be7f1ef73634cb77890a770743239bc3d487b848669be1c599889336dc0a", size = 1751659, upload-time = "2026-06-01T19:39:46.398Z" },
-    { url = "https://files.pythonhosted.org/packages/66/e3/53c67097e8a5ce98625e91e3fa7f43c9c6940de680345d03b3509a72a078/aiohttp-3.14.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:edc01ea4e1ec5a1649a28866262bf24195889ff7b27bdd947029a6086741de9b", size = 1710090, upload-time = "2026-06-01T19:39:51.392Z" },
-    { url = "https://files.pythonhosted.org/packages/b8/69/155c4ef3aec96417d47024800472b33b16c5d8a665371dcd044c2afdf25d/aiohttp-3.14.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:26b6d79aa54cb4ed50cc7d41ed14e99e0f1fc8e7c2d42f2e05b37aea897b2b52", size = 1733716, upload-time = "2026-06-01T19:40:03.631Z" },
-    { url = "https://files.pythonhosted.org/packages/12/34/6180103ce9aabc8ebff3f7bb55a1228ffe60f61042823031d9692cb7b101/aiohttp-3.14.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:6aa1a40f9cbb3da9f80714c5966b8946c21e6a2530d809b9498b33161e3c8733", size = 787878, upload-time = "2026-06-01T19:40:13.401Z" },
-    { url = "https://files.pythonhosted.org/packages/92/e9/08954a40e8b7baa3d8beadd2b074b186e9b1e9c8ddabc288678a6265de50/aiohttp-3.14.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:b62af5a8cc96a194eaa01a9ed7b34a3ffa58d3d8daaa1a0d7a749353ad12d228", size = 524400, upload-time = "2026-06-01T19:40:15.972Z" },
-    { url = "https://files.pythonhosted.org/packages/08/6a/b5965a634ac4d5ba99a463314cf4ab214ca073fcdc38a15e0294273701fc/aiohttp-3.14.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:6eb63b1417efaf7d1002a6ad034a40d44376afcc16508a57f8e74b49ad26a095", size = 527904, upload-time = "2026-06-01T19:40:18.28Z" },
-    { url = "https://files.pythonhosted.org/packages/06/b4/932bcdd850c354d9bcca30f360e475d7852e30413fbbd44b182782ed5432/aiohttp-3.14.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c20b9ad156a79eb97be5cf9e069eec01d2f0dc8472ffbd75299a8b2d4c2cbbde", size = 1912162, upload-time = "2026-06-01T19:40:20.825Z" },
-    { url = "https://files.pythonhosted.org/packages/d0/1c/a57de71a4508c93a830b77c28af3d08cd97f606dedfc6b94275347744508/aiohttp-3.14.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:145262119b07d7f95abc1839add35ba2bfc84551d4b4660ca11542c0b215455b", size = 1868606, upload-time = "2026-06-01T19:40:31.843Z" },
-    { url = "https://files.pythonhosted.org/packages/35/1e/c237923232c7da7f0392ea25d89fc5e60c0e93f685f4ebca8e7bcdd5271c/aiohttp-3.14.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2cc736a9c9fc2bc4dd71fd404815741b6573df27c3f985948ec4076989ac57de", size = 1834090, upload-time = "2026-06-01T19:40:37.733Z" },
-    { url = "https://files.pythonhosted.org/packages/cc/bc/2aaab2f85cadb26ea59c091fa2b8e370d625154b5c14b478f1b489d07551/aiohttp-3.14.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6199707cc40e0e9cd39c36fbc97bec416c704e1d0ddce03412bb3b3e6a90ccd0", size = 1832281, upload-time = "2026-06-01T19:40:52.303Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/82/78/8ea7308cac6934de8c74a14f3d5f65d1c89287426688be79538d0e5c013d/aiohttp-3.14.1.tar.gz", hash = "sha256:307f2cff90a764d329e77040603fa032db89c5c24fdad50c4c15334cba744035", size = 7955794, upload-time = "2026-06-07T21:09:35.529Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1d/21/151624b51cd92553d95424daf4bf19f19ce9be9002d19253e7e7ce67197b/aiohttp-3.14.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:d35143e27778b4bb0fb189562d7f275bff79c62ab8e98459717c0ea617ff2480", size = 757402, upload-time = "2026-06-07T21:06:40.311Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/82/280619e0bd7bf2454987e19282616e84762255dd9c8468f62382e8c191f1/aiohttp-3.14.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bcfb80a2cc36fba2534e5e5b5264dc7ae6fcd9bf15256da3e53d2f499e6fa29d", size = 512310, upload-time = "2026-06-07T21:06:42.207Z" },
+    { url = "https://files.pythonhosted.org/packages/55/b2/2aac325583aaa1353045f96dffa586d8a34e8322e14a7ba49cffeb103ab4/aiohttp-3.14.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:27fd7c91e51729b4f7e1577865fa6d34c9adccbc39aabe9000285b48af9f0ec2", size = 512448, upload-time = "2026-06-07T21:06:43.813Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/72/a60607cb849faa8af8a356c9329ea2eb6f395d49e82cc82ccba1fd8deb8f/aiohttp-3.14.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:64c567bf9eaf664280116a8688f63016e6b32db2505908e2bdaca1b6438142f2", size = 1766854, upload-time = "2026-06-07T21:06:45.391Z" },
+    { url = "https://files.pythonhosted.org/packages/20/9c/d445818389df371f56d141d881153ba23183c4735a03f7356ffb43f7757d/aiohttp-3.14.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3e6fc1a85fa7194a1a7d19f44e8609180f4a8eb5fa4c7ed8b4355f080fad235c", size = 1790278, upload-time = "2026-06-07T21:06:54.049Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/b4/4dac0038960427ba832f6609dfb4ea5437d7fd80c72001b9e48f834f428b/aiohttp-3.14.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c6fa4dc7ad6f8109c70bb1499e589f76b0b792baf39f9b017eb92c8a81d0a199", size = 1728397, upload-time = "2026-06-07T21:06:57.777Z" },
+    { url = "https://files.pythonhosted.org/packages/70/0a/e0075ce9ca0279ee1d4f0c0b85f54fea02ebc83c3007651a72bece658fec/aiohttp-3.14.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6f71173be42d3241d428f760122febb748de0623f44308a6f120d0dd9ec572e3", size = 1767580, upload-time = "2026-06-07T21:07:07.873Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/22/a73ccbf9dbd6e26dda0b24d5fd5db7da92ee3383a79f47677ffb834c5c5b/aiohttp-3.14.1-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:915fbb7b41b115192259f8c9ae58f3ddc444d2b5579917270211858e606a4afd", size = 485841, upload-time = "2026-06-07T21:07:19.555Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/b9/57ed8eaf596321c2ad747bd480fb1700dbd7177c60dfc9e4c187f629662e/aiohttp-3.14.1-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:7fb4bdf95b0561a79f259f9d28fbc109728c5ee7f27aff6391f0ca703a329abe", size = 492088, upload-time = "2026-06-07T21:07:21.581Z" },
+    { url = "https://files.pythonhosted.org/packages/78/c0/5ebe5270a7c140d7c6f79dcb018640225f14d406c149e4eec04a7d82fe71/aiohttp-3.14.1-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:1b9748363260121d2927704f5d4fc498150669ca3ae93625986ee89c8f80dcd4", size = 501564, upload-time = "2026-06-07T21:07:23.388Z" },
+    { url = "https://files.pythonhosted.org/packages/75/7f/8cdaa24fc7983865e0915153b96a9ac5bcdd3548d64c5a27d17cecccad2d/aiohttp-3.14.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:86a6dab78b0e43e2897a3bbe15745aa60dc5423ca437b7b0b164c069bf91b876", size = 751998, upload-time = "2026-06-07T21:07:25.046Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/f4/c4227aacfacc5cb0cc2d119b65301d177912a6842cd64e120c47af76064f/aiohttp-3.14.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4dfd6e47d3c44c2279907607f73a4240b88c69eb8b90da7e2441a8045dfd21da", size = 510918, upload-time = "2026-06-07T21:07:27.28Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/01/a2d5f96cd4e74424864d30bc0a7e44d0a12dacdcfa91b5b2d1bd3dca6bf3/aiohttp-3.14.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:317acd9f8602858dc7d59679812c376c7f0b97bcbbf16e0d6237f54141d8a8a6", size = 508657, upload-time = "2026-06-07T21:07:29.252Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/ed/3c0fb5c500fdd8e7ebc10d1889c04384fffa1a9163eac1356088ca9da1b1/aiohttp-3.14.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bd869c427324e5cb15195793de951295710db28be7d818247f3097b4ab5d4b96", size = 1757907, upload-time = "2026-06-07T21:07:31.03Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/6e/dbf1d0625dc711fb2851f4f3c3055c39ed58bae92082d8c627dbe6013736/aiohttp-3.14.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:faccab372e66bc76d5731525e7f1143c922271725b9d38c9f97edcc66266b451", size = 1783881, upload-time = "2026-06-07T21:07:39.063Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/bd/cf9cee17e140f942a3de73e658a543aa8fbf35a5fc67a9d2538d52d77f0b/aiohttp-3.14.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:97e704dcd26271f5bda3fa07c3ce0fb76d6d3f8659f4baa1a24442cc9ba177ca", size = 1722137, upload-time = "2026-06-07T21:07:43.014Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/45/4de841f005cfe1fd63e2a2fe011262c515e2a62aa6994b15947e7d717ac9/aiohttp-3.14.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:cb21957bb8aca671c1765e32f58164cf0c50e6bf41c0bbbd16da20732ecaf588", size = 1761094, upload-time = "2026-06-07T21:07:54.113Z" },
+    { url = "https://files.pythonhosted.org/packages/85/a5/9594ad6289eebbc97d167c44213d557807f90e59115caad24de21ad2c3b1/aiohttp-3.14.1-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:62a759436b29e677181a9e76bab8b8f689a29cb9c535f45f7c48c9c830d3f8c3", size = 487918, upload-time = "2026-06-07T21:08:06.377Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/61/16a32c36c3c49edec122a3dc811f2057df2f94d3b14aa107c8017d981618/aiohttp-3.14.1-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:2964cbf553df4d7a57348da44d961d871895fc1ee4e8c322b2a95612c7b17fba", size = 494014, upload-time = "2026-06-07T21:08:08.263Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/89/3ebcf96ed99c05bec9c434aaac6963fd3cbab4a786ae739908a144d9ce44/aiohttp-3.14.1-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:237651caadc3a59badd39319c54642b5299e9cc98a3a194310e55d5bb9f5e397", size = 502398, upload-time = "2026-06-07T21:08:10.244Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/3d/b74870a0c2d40c355928cd5b96c7a11fa821b8a40fc41365e64479b151fb/aiohttp-3.14.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:896e12dfdbbab9d8f7e16d2b28c6769a60126fa92095d1ebf9473d02593a2448", size = 758018, upload-time = "2026-06-07T21:08:12.447Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/66/f42f5c984d99e49c6cff5f26f590750f2e2f7ef1fcfb99966ab5be1b632e/aiohttp-3.14.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:d03f281ed22579314ba00821ce20115a7c0ac430660b4cc05704a3f818b3e004", size = 512462, upload-time = "2026-06-07T21:08:14.624Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/a7/248e1aebe0c7810b0271e021a0f2a5eb6e78a051885b3c9df49f42a5802d/aiohttp-3.14.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:07eabb979d236335fed927e137a928c9adfb7df3b9ec7aa31726f133a62be983", size = 512824, upload-time = "2026-06-07T21:08:16.572Z" },
+    { url = "https://files.pythonhosted.org/packages/26/97/2aa0e5ba0727dc3bd5aaebb7ccbc510f7dfb7fb961ec87497cd496635ab1/aiohttp-3.14.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4fe1f1087cbadb280b5e1bb054a4f00d1423c74d6626c5e48400d871d34ecefe", size = 1749898, upload-time = "2026-06-07T21:08:18.635Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/18/938441025db6769a3464596b2410af3afde0b21eb2f204c6f766f68af4bd/aiohttp-3.14.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:634e385930fb6d2d479cf3aa66515955863b77a5e3c2b5894ca259a25b308602", size = 1760329, upload-time = "2026-06-07T21:08:27.363Z" },
+    { url = "https://files.pythonhosted.org/packages/49/a2/2136674d52123b1354bd05dd5753c318db47dc0c927cc70b27bab3755456/aiohttp-3.14.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:335c0cc3e3545ce98dcb9cfcb836f40c3411f43fa03dab757597d80c89af8a35", size = 1714756, upload-time = "2026-06-07T21:08:32.094Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/af/14bb5843eccbe234f4dfb78ab73e549d99727247e62ae5d62cbd22eaf5b0/aiohttp-3.14.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6ffbb2f4ec1ceaff7e07d43922954da26b223d188bf30658e561b98e23089444", size = 1742574, upload-time = "2026-06-07T21:08:43.795Z" },
+    { url = "https://files.pythonhosted.org/packages/34/e3/19dbe1a1f4cc6230eb9e314de7fe68053b0992f9302b27d12141a0b5db53/aiohttp-3.14.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:819c054312f1af92947e6a55883d1b66feefab11531a7fc45e0fb9b63880b5c2", size = 793320, upload-time = "2026-06-07T21:08:52.775Z" },
+    { url = "https://files.pythonhosted.org/packages/7f/20/1b7182219ba1b108430d6e4dc53d25ae02dcfcf5a045b33af4e8c5167527/aiohttp-3.14.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:10ee9c1753a8f706345b22496c79fbddb5be0599e0823f3738b1534058e25340", size = 529077, upload-time = "2026-06-07T21:08:55Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/c8/14ce60ec31a2e5f5274bb17d383a6f7a3aabca31ac04eee05585bbadab16/aiohttp-3.14.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1601cc37baf5750ccacae618ec2daf020769581695550e3b654a911f859c563d", size = 532476, upload-time = "2026-06-07T21:08:57.176Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/02/9ac85e081e53da2e061b02fa7758fe0a12d17b8ce2d1f5e6c7cb76730328/aiohttp-3.14.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4d6e0ac9da31c9c04c84e1c0182ad8d6df35965a85cae29cd71d089621b3ae94", size = 1922347, upload-time = "2026-06-07T21:08:59.563Z" },
+    { url = "https://files.pythonhosted.org/packages/66/4e/560c7472d3d198a23aa5c8b19a5115bf6a9b77b7d3e4bb363da320430ad2/aiohttp-3.14.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fc0cacab7ba4e56f0f81c82a98c09bed2f39c940107b03a34b168bdf7597edd3", size = 1877095, upload-time = "2026-06-07T21:09:09.011Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/c9/48255813cca749a229ef0ab476004ec623728ad79a9c0840616f6c076325/aiohttp-3.14.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:38e1e7daaea81df51c952e18483f323d878499a1e2bfe564790e0f9701d6f203", size = 1842922, upload-time = "2026-06-07T21:09:14.118Z" },
+    { url = "https://files.pythonhosted.org/packages/44/be/0474c5a8b5640e1e4aa1923430a91f4151be82e511373fe764189b89aef5/aiohttp-3.14.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:99abd37084b82f5830c635fddd0b4993b9742a66eb746dacf433c8590e8f9e3c", size = 1841409, upload-time = "2026-06-07T21:09:26.207Z" },
 ]
 
 [[package]]
@@ -801,6 +801,7 @@ dependencies = [
     { name = "hdrhistogram", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
     { name = "httptools", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
     { name = "jinja2", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
+    { name = "msgpack", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
     { name = "msgspec", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
     { name = "numpy", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
     { name = "openai-harmony", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
@@ -858,7 +859,7 @@ test = [
 
 [package.metadata]
 requires-dist = [
-    { name = "aiohttp", marker = "extra == 'test'", specifier = "==3.14.0" },
+    { name = "aiohttp", marker = "extra == 'test'", specifier = "==3.14.1" },
     { name = "colorama", specifier = "==0.4.6" },
     { name = "coverage", marker = "extra == 'test'", specifier = "==7.13.4" },
     { name = "cyclopts", specifier = "==4.10.0" },
@@ -873,6 +874,7 @@ requires-dist = [
     { name = "line-profiler", marker = "extra == 'test'", specifier = "==5.0.2" },
     { name = "matplotlib", marker = "extra == 'test'", specifier = "==3.10.8" },
     { name = "memory-profiler", marker = "extra == 'performance'", specifier = "==0.61.0" },
+    { name = "msgpack", specifier = "==1.2.1" },
     { name = "msgspec", specifier = "==0.20.0" },
     { name = "myst-parser", marker = "extra == 'dev'", specifier = "==5.0.0" },
     { name = "numpy", specifier = "==2.4.4" },
@@ -1151,34 +1153,34 @@ wheels = [
 
 [[package]]
 name = "msgpack"
-version = "1.1.2"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/4d/f2/bfb55a6236ed8725a96b0aa3acbd0ec17588e6a2c3b62a93eb513ed8783f/msgpack-1.1.2.tar.gz", hash = "sha256:3b60763c1373dd60f398488069bcdc703cd08a711477b5d480eecc9f9626f47e", size = 173581, upload-time = "2025-10-08T09:15:56.596Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/ad/bd/8b0d01c756203fbab65d265859749860682ccd2a59594609aeec3a144efa/msgpack-1.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:70a0dff9d1f8da25179ffcf880e10cf1aad55fdb63cd59c9a49a1b82290062aa", size = 81939, upload-time = "2025-10-08T09:15:01.472Z" },
-    { url = "https://files.pythonhosted.org/packages/34/68/ba4f155f793a74c1483d4bdef136e1023f7bcba557f0db4ef3db3c665cf1/msgpack-1.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:446abdd8b94b55c800ac34b102dffd2f6aa0ce643c55dfc017ad89347db3dbdb", size = 85064, upload-time = "2025-10-08T09:15:03.764Z" },
-    { url = "https://files.pythonhosted.org/packages/f2/60/a064b0345fc36c4c3d2c743c82d9100c40388d77f0b48b2f04d6041dbec1/msgpack-1.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c63eea553c69ab05b6747901b97d620bb2a690633c77f23feb0c6a947a8a7b8f", size = 417131, upload-time = "2025-10-08T09:15:05.136Z" },
-    { url = "https://files.pythonhosted.org/packages/65/92/a5100f7185a800a5d29f8d14041f61475b9de465ffcc0f3b9fba606e4505/msgpack-1.1.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:372839311ccf6bdaf39b00b61288e0557916c3729529b301c52c2d88842add42", size = 427556, upload-time = "2025-10-08T09:15:06.837Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/87/ffe21d1bf7d9991354ad93949286f643b2bb6ddbeab66373922b44c3b8cc/msgpack-1.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2929af52106ca73fcb28576218476ffbb531a036c2adbcf54a3664de124303e9", size = 404920, upload-time = "2025-10-08T09:15:08.179Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/41/8543ed2b8604f7c0d89ce066f42007faac1eaa7d79a81555f206a5cdb889/msgpack-1.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:be52a8fc79e45b0364210eef5234a7cf8d330836d0a64dfbb878efa903d84620", size = 415013, upload-time = "2025-10-08T09:15:09.83Z" },
-    { url = "https://files.pythonhosted.org/packages/6b/31/b46518ecc604d7edf3a4f94cb3bf021fc62aa301f0cb849936968164ef23/msgpack-1.1.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4efd7b5979ccb539c221a4c4e16aac1a533efc97f3b759bb5a5ac9f6d10383bf", size = 81212, upload-time = "2025-10-08T09:15:14.552Z" },
-    { url = "https://files.pythonhosted.org/packages/92/dc/c385f38f2c2433333345a82926c6bfa5ecfff3ef787201614317b58dd8be/msgpack-1.1.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:42eefe2c3e2af97ed470eec850facbe1b5ad1d6eacdbadc42ec98e7dcf68b4b7", size = 84315, upload-time = "2025-10-08T09:15:15.543Z" },
-    { url = "https://files.pythonhosted.org/packages/d3/68/93180dce57f684a61a88a45ed13047558ded2be46f03acb8dec6d7c513af/msgpack-1.1.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1fdf7d83102bf09e7ce3357de96c59b627395352a4024f6e2458501f158bf999", size = 412721, upload-time = "2025-10-08T09:15:16.567Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/ba/459f18c16f2b3fc1a1ca871f72f07d70c07bf768ad0a507a698b8052ac58/msgpack-1.1.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fac4be746328f90caa3cd4bc67e6fe36ca2bf61d5c6eb6d895b6527e3f05071e", size = 424657, upload-time = "2025-10-08T09:15:17.825Z" },
-    { url = "https://files.pythonhosted.org/packages/38/f8/4398c46863b093252fe67368b44edc6c13b17f4e6b0e4929dbf0bdb13f23/msgpack-1.1.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:fffee09044073e69f2bad787071aeec727183e7580443dfeb8556cbf1978d162", size = 402668, upload-time = "2025-10-08T09:15:19.003Z" },
-    { url = "https://files.pythonhosted.org/packages/28/ce/698c1eff75626e4124b4d78e21cca0b4cc90043afb80a507626ea354ab52/msgpack-1.1.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5928604de9b032bc17f5099496417f113c45bc6bc21b5c6920caf34b3c428794", size = 419040, upload-time = "2025-10-08T09:15:20.183Z" },
-    { url = "https://files.pythonhosted.org/packages/22/71/201105712d0a2ff07b7873ed3c220292fb2ea5120603c00c4b634bcdafb3/msgpack-1.1.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:e23ce8d5f7aa6ea6d2a2b326b4ba46c985dbb204523759984430db7114f8aa00", size = 81127, upload-time = "2025-10-08T09:15:24.408Z" },
-    { url = "https://files.pythonhosted.org/packages/1b/9f/38ff9e57a2eade7bf9dfee5eae17f39fc0e998658050279cbb14d97d36d9/msgpack-1.1.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:6c15b7d74c939ebe620dd8e559384be806204d73b4f9356320632d783d1f7939", size = 84981, upload-time = "2025-10-08T09:15:25.812Z" },
-    { url = "https://files.pythonhosted.org/packages/8e/a9/3536e385167b88c2cc8f4424c49e28d49a6fc35206d4a8060f136e71f94c/msgpack-1.1.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:99e2cb7b9031568a2a5c73aa077180f93dd2e95b4f8d3b8e14a73ae94a9e667e", size = 411885, upload-time = "2025-10-08T09:15:27.22Z" },
-    { url = "https://files.pythonhosted.org/packages/2f/40/dc34d1a8d5f1e51fc64640b62b191684da52ca469da9cd74e84936ffa4a6/msgpack-1.1.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:180759d89a057eab503cf62eeec0aa61c4ea1200dee709f3a8e9397dbb3b6931", size = 419658, upload-time = "2025-10-08T09:15:28.4Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/ef/2b92e286366500a09a67e03496ee8b8ba00562797a52f3c117aa2b29514b/msgpack-1.1.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:04fb995247a6e83830b62f0b07bf36540c213f6eac8e851166d8d86d83cbd014", size = 403290, upload-time = "2025-10-08T09:15:29.764Z" },
-    { url = "https://files.pythonhosted.org/packages/78/90/e0ea7990abea5764e4655b8177aa7c63cdfa89945b6e7641055800f6c16b/msgpack-1.1.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8e22ab046fa7ede9e36eeb4cfad44d46450f37bb05d5ec482b02868f451c95e2", size = 415234, upload-time = "2025-10-08T09:15:31.022Z" },
-    { url = "https://files.pythonhosted.org/packages/16/67/93f80545eb1792b61a217fa7f06d5e5cb9e0055bed867f43e2b8e012e137/msgpack-1.1.2-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:897c478140877e5307760b0ea66e0932738879e7aa68144d9b78ea4c8302a84a", size = 85264, upload-time = "2025-10-08T09:15:35.61Z" },
-    { url = "https://files.pythonhosted.org/packages/87/1c/33c8a24959cf193966ef11a6f6a2995a65eb066bd681fd085afd519a57ce/msgpack-1.1.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:a668204fa43e6d02f89dbe79a30b0d67238d9ec4c5bd8a940fc3a004a47b721b", size = 89076, upload-time = "2025-10-08T09:15:36.619Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/6b/62e85ff7193663fbea5c0254ef32f0c77134b4059f8da89b958beb7696f3/msgpack-1.1.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5559d03930d3aa0f3aacb4c42c776af1a2ace2611871c84a75afe436695e6245", size = 435242, upload-time = "2025-10-08T09:15:37.647Z" },
-    { url = "https://files.pythonhosted.org/packages/c1/47/5c74ecb4cc277cf09f64e913947871682ffa82b3b93c8dad68083112f412/msgpack-1.1.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:70c5a7a9fea7f036b716191c29047374c10721c389c21e9ffafad04df8c52c90", size = 432509, upload-time = "2025-10-08T09:15:38.794Z" },
-    { url = "https://files.pythonhosted.org/packages/24/a4/e98ccdb56dc4e98c929a3f150de1799831c0a800583cde9fa022fa90602d/msgpack-1.1.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f2cb069d8b981abc72b41aea1c580ce92d57c673ec61af4c500153a626cb9e20", size = 415957, upload-time = "2025-10-08T09:15:40.238Z" },
-    { url = "https://files.pythonhosted.org/packages/da/28/6951f7fb67bc0a4e184a6b38ab71a92d9ba58080b27a77d3e2fb0be5998f/msgpack-1.1.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d62ce1f483f355f61adb5433ebfd8868c5f078d1a52d042b0a998682b4fa8c27", size = 422910, upload-time = "2025-10-08T09:15:41.505Z" },
+version = "1.2.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/31/f9/c0a1c127f9049db9155afc316952ea571720dd01833ff5e4d7e8e6352dbb/msgpack-1.2.1.tar.gz", hash = "sha256:04c721c2c7448767e9e3f2520a475663d8ee0f09c31890f6d2bd70fd636a9647", size = 183960, upload-time = "2026-06-18T16:13:52.594Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/bc/dd/9e8cbd8f5582ca4b590336f2b91ee5662f6a6ca562b565abaf696a0f81ff/msgpack-1.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2ef59c659f289eddf8aa6623823f19fa2f40a4029266889eac7a2505dd210c35", size = 83531, upload-time = "2026-06-18T16:12:58.249Z" },
+    { url = "https://files.pythonhosted.org/packages/50/2e/ebdb85a8da151397a2790363676b7ed7c125924fe618e4c6d8befb0cc62c/msgpack-1.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d3567748a5107cb40cdf66a275430c2f87c07777698f4bfd25c35f44d533258c", size = 82657, upload-time = "2026-06-18T16:12:59.396Z" },
+    { url = "https://files.pythonhosted.org/packages/26/aa/753ad8b007b464e1d8aa0c8e650b9c5f4f725e658fc5ac8a7635c55b7f6e/msgpack-1.2.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:60926b75d00c8e816ef98f3034f484a8bc64242d66839cef4cf7e503142316a0", size = 410634, upload-time = "2026-06-18T16:13:00.383Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/fd/6adabd4f6d5e686f97dd02ce7fce3fe4cf672cbac36b8f67ff4040e8ad8b/msgpack-1.2.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:020e881a764b20d8d7ca1a54fc01b8175519d108e3c3f194fddc200bda95951a", size = 419989, upload-time = "2026-06-18T16:13:01.776Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/bf/35963899493b32030c85fc513b723ae66144ac70c11ebc52e889e16e3d99/msgpack-1.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8b267ce94efb76fbd1b3373511420074ee3187f0f7811bf394531de13294735a", size = 400842, upload-time = "2026-06-18T16:13:05.012Z" },
+    { url = "https://files.pythonhosted.org/packages/17/dd/fa8bd265110dfa51c20cb529f9e6d240a16fafe7e645004c6af2d01353ba/msgpack-1.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f02cf17a6ca1abe29b5f980644f7551f94d71f2011509b26d8625ce038f0df64", size = 414939, upload-time = "2026-06-18T16:13:07.478Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/ac/dcddcab6f6c20ecb387ca5e980371cdb3f87ff69aeca388be97eebc4c074/msgpack-1.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0a70e3cf2804a300d921bb0940426e35f4e489a23adfb77a808892241db0a064", size = 83151, upload-time = "2026-06-18T16:13:12.173Z" },
+    { url = "https://files.pythonhosted.org/packages/64/71/fbcfa83a1d6a9c6091942d1cfd070962244664b87427a9a49a6897b1b219/msgpack-1.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:491cc39455ca765fad51fb451bf2915eb2cf41192ab5801ce8d67c1d614fe056", size = 82351, upload-time = "2026-06-18T16:13:13.194Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/10/ddf7b06db879e8792d13934ddda09ff20bd2a583fd84c9b59aae9b0e650b/msgpack-1.2.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f310233ef7fb9c14e201c93639fe5f5260b005f56f0b29048e999c30935596cc", size = 407518, upload-time = "2026-06-18T16:13:14.233Z" },
+    { url = "https://files.pythonhosted.org/packages/79/d3/36a46a8ed992b781acbc05928bd5bee3c810cb0c3563bf81a7b0c04a1a76/msgpack-1.2.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:787c9bebb5833e8f6fc8abca3c0597683d8d87f56a8842b6b89c75a5f3176e2d", size = 416405, upload-time = "2026-06-18T16:13:15.435Z" },
+    { url = "https://files.pythonhosted.org/packages/40/16/738fe6d875ad7e2a9429c165322a4ec088f4f273cdfae63d96a89c467961/msgpack-1.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:85f57e960d877f2977f6430896191b04a21f8901b3b4baf2e4604329f4db5402", size = 397469, upload-time = "2026-06-18T16:13:18.287Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/39/e2ef7dbf0473bcb8dc7c50bf782a892d67414877b63e47fc88eb189ef5e6/msgpack-1.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e3dc2feb0876209d9c38aa56cb1de169bd6c4348f1aa48271f241226590993e6", size = 411273, upload-time = "2026-06-18T16:13:21.028Z" },
+    { url = "https://files.pythonhosted.org/packages/77/58/cce442852c6b9e1639c7c8ac8fd9143121cb32dab0f308df4d1426a8eb9c/msgpack-1.2.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:05f340e47e7e47d2da8db9b53e1bb1d294369e9ef45a747441309f6650b8351d", size = 83610, upload-time = "2026-06-18T16:13:25.724Z" },
+    { url = "https://files.pythonhosted.org/packages/60/5c/15b4c7a0182f75ffa90751958ba36a9c01cafee367d49a3edc10ed140b01/msgpack-1.2.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:810b916696c86ef0deb3b74588480224df4c1b071136c34183e4a2a4284d7ac7", size = 83138, upload-time = "2026-06-18T16:13:26.781Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/a6/99e58722feaffc5f2fbcc0c8c0d1451ab9f84097f7af87291b46af2390f4/msgpack-1.2.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ca0dacff965c47afdc3749a8469d7302a8f801d6a28758d55120d75e66ce6889", size = 406090, upload-time = "2026-06-18T16:13:28.072Z" },
+    { url = "https://files.pythonhosted.org/packages/19/03/8c63e8cf52958534ef688625965ab04c269a6cadd8caef16758b380a821a/msgpack-1.2.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0e2bf9280bceb5efca998435904b5d3e9fdbcc11d90dc9df30aec7973252b720", size = 412106, upload-time = "2026-06-18T16:13:29.427Z" },
+    { url = "https://files.pythonhosted.org/packages/98/48/deaf2326262a8d5ea3295ce9649912ecd3f551ba7ec8e33c665d2ba583f3/msgpack-1.2.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ec0e675d59150a6269ddc9139087c722292664a37d071a849c05c473350f1f2d", size = 396168, upload-time = "2026-06-18T16:13:31.977Z" },
+    { url = "https://files.pythonhosted.org/packages/59/86/1edc67270099a528fa2093ea60fe191233cd238e4bd30cfacf7db79fc959/msgpack-1.2.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5ad5467fc3f68b5468e06c5f788d712e9f8ffc8b0cd1bcb160c105c1ee92dae7", size = 408457, upload-time = "2026-06-18T16:13:34.567Z" },
+    { url = "https://files.pythonhosted.org/packages/1b/02/ad2afb678b4de94496cd432b581759b756a92c1192d8c767edd6b132efdc/msgpack-1.2.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:8d00f177ca88a77c1cf848d204a38f249751650b601cb6532acc68805d8a8273", size = 86000, upload-time = "2026-06-18T16:13:39.44Z" },
+    { url = "https://files.pythonhosted.org/packages/54/74/0b797484013128837f3b1cbb6cea019277c4de4e377dc512b4d9a0f92940/msgpack-1.2.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5bb9c386f0a329c035ddbab4b72d1028bf9627add8dda41070288563d57ed1b1", size = 86544, upload-time = "2026-06-18T16:13:40.447Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/b4/b774d7eb95561739907fec675582f83203cf41c597a418c2589b4bfb8e9d/msgpack-1.2.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:20466cca18c49c7292a8984bc15d65857b171e7264bdcb5f96baf8be238791fc", size = 427661, upload-time = "2026-06-18T16:13:41.574Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/f9/3243191dc9937e00756c8bc1b0272fed8f23758e43df2a3b46f533e5090f/msgpack-1.2.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:196300e7e5d6e74d50f1607ab9c06c4a1484c383cd22defd727902591f7e8dde", size = 426375, upload-time = "2026-06-18T16:13:42.936Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/2b/92f86956a0c13e8662f7e2ad630c4eb4db07497b967589bd5245e018b2c1/msgpack-1.2.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8c2ed1e48cc0f460bf3c7780e7137ff21a4e18433451916f2442c1b21036cd7d", size = 410897, upload-time = "2026-06-18T16:13:45.629Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/4d/fa006060ffa1011d32bfae826fe766fe73e02982183601633b7121058ab3/msgpack-1.2.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:f9389552ecf4784886345ead0647e4edc96bee37cbab05b75540f542f766c48c", size = 419815, upload-time = "2026-06-18T16:13:48.205Z" },
 ]
 
 [[package]]

From 5abd63b75401a863a6461e29354bf24c2259b4a8 Mon Sep 17 00:00:00 2001
From: Tin-Yin Lai <tinyinl@nvidia.com>
Date: Mon, 22 Jun 2026 18:18:20 -0700
Subject: [PATCH 3/3] test(compliance): unit + integration tests for the
 output-caching audit

Unit tests for verify_output_caching, OutputCachingAudit.plan_runs/verify,
RunStats.from_report, the run_audit guards (load-pattern, incomplete phase,
interrupt-skips-audit), SampleOrderSpec/SingleSampleOrder, and the atomic
result writer; plus the end-to-end `audit:` flow (offline + single-stream).
Includes the metrics aggregator signal-handling ready-file test update.

The rejected-load-pattern guard test derives its parametrization from the
LoadPatternType enum (anything that isn't max_throughput/concurrency) so it
stays correct regardless of which patterns exist on the base branch.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../test_signal_handling.py                   |  54 ++-
 .../commands/test_benchmark_command.py        |  72 ++++
 tests/unit/commands/test_benchmark.py         |  29 ++
 tests/unit/compliance/__init__.py             |  14 +
 tests/unit/compliance/test_output_caching.py  | 388 ++++++++++++++++++
 .../test_runtime_settings_sample_order.py     |  41 ++
 .../unit/load_generator/test_sample_order.py  |  13 +
 .../test_single_sample_order.py               |  67 +++
 8 files changed, 658 insertions(+), 20 deletions(-)
 create mode 100644 tests/unit/compliance/__init__.py
 create mode 100644 tests/unit/compliance/test_output_caching.py
 create mode 100644 tests/unit/config/test_runtime_settings_sample_order.py
 create mode 100644 tests/unit/load_generator/test_single_sample_order.py

diff --git a/tests/integration/async_utils/services/metrics_aggregator/test_signal_handling.py b/tests/integration/async_utils/services/metrics_aggregator/test_signal_handling.py
index 010536c09..ec1a412a6 100644
--- a/tests/integration/async_utils/services/metrics_aggregator/test_signal_handling.py
+++ b/tests/integration/async_utils/services/metrics_aggregator/test_signal_handling.py
@@ -43,6 +43,7 @@ def _spawn_aggregator(
     *,
     socket_name: str,
     metrics_socket: str,
+    ready_file: Path | None = None,
 ) -> subprocess.Popen:
     """Launch the metrics-aggregator subprocess in its own process group.
 
@@ -51,20 +52,23 @@ def _spawn_aggregator(
     user Ctrl-C in the foreground process group of the subprocess and
     not the test runner.
     """
+    cmd = [
+        sys.executable,
+        "-m",
+        "inference_endpoint.async_utils.services.metrics_aggregator",
+        "--socket-dir",
+        str(socket_dir),
+        "--socket-name",
+        socket_name,
+        "--metrics-socket",
+        metrics_socket,
+        "--metrics-output-dir",
+        str(output_dir),
+    ]
+    if ready_file is not None:
+        cmd += ["--ready-file", str(ready_file)]
     return subprocess.Popen(
-        [
-            sys.executable,
-            "-m",
-            "inference_endpoint.async_utils.services.metrics_aggregator",
-            "--socket-dir",
-            str(socket_dir),
-            "--socket-name",
-            socket_name,
-            "--metrics-socket",
-            metrics_socket,
-            "--metrics-output-dir",
-            str(output_dir),
-        ],
+        cmd,
         # New process group so we can signal it without disturbing the
         # test runner.
         preexec_fn=os.setsid,
@@ -103,21 +107,28 @@ def test_sigterm_writes_interrupted_final_snapshot(self, tmp_path: Path):
         # Use a unique socket name per test to avoid collisions if a
         # previous test run left an IPC file behind.
         suffix = uuid.uuid4().hex[:8]
+        ready_file = tmp_path / "ready"
         proc = _spawn_aggregator(
             socket_dir,
             output_dir,
             socket_name=f"events_{suffix}",
             metrics_socket=f"metrics_{suffix}",
+            ready_file=ready_file,
         )
         try:
-            # Give the subprocess time to: parse args, set up ZMQ, bind
-            # sockets, register signal handlers, enter the await loop.
-            # The signal-handler registration is what we're testing, so
-            # we MUST wait for it before sending the signal.
-            time.sleep(2.0)
+            # Poll for the ready sentinel instead of sleeping a fixed amount:
+            # on network-mounted filesystems (e.g. Lustre) Python import can
+            # take several seconds, so a fixed sleep races with signal-handler
+            # registration. The aggregator touches --ready-file only after
+            # loop.add_signal_handler returns, so this is an exact gate.
+            ready = _wait_for_file(ready_file, timeout=30.0)
+            assert ready, (
+                f"aggregator did not become ready within 30 s — "
+                f"stderr: {(proc.stderr.read() if proc.stderr else b'').decode()[-2000:]}"
+            )
             assert (
                 proc.poll() is None
-            ), f"aggregator died early: stderr={(proc.stderr.read() if proc.stderr else b"").decode()}"
+            ), f"aggregator died early: stderr={(proc.stderr.read() if proc.stderr else b'').decode()}"
 
             # SIGTERM the process group → triggers _signal_finalize.
             os.killpg(proc.pid, signal.SIGTERM)
@@ -156,14 +167,17 @@ def test_sigint_does_not_finalize_aggregator(self, tmp_path: Path):
         output_dir = tmp_path / "output"
         output_dir.mkdir()  # parent owns dir setup (see sibling test)
         suffix = uuid.uuid4().hex[:8]
+        ready_file = tmp_path / "ready"
         proc = _spawn_aggregator(
             socket_dir,
             output_dir,
             socket_name=f"events_{suffix}",
             metrics_socket=f"metrics_{suffix}",
+            ready_file=ready_file,
         )
         try:
-            time.sleep(2.0)
+            ready = _wait_for_file(ready_file, timeout=30.0)
+            assert ready, "aggregator did not become ready within 30 s"
             assert proc.poll() is None, "aggregator died before signal-handler test"
 
             os.killpg(proc.pid, signal.SIGINT)
diff --git a/tests/integration/commands/test_benchmark_command.py b/tests/integration/commands/test_benchmark_command.py
index cf396f1f3..d62eb7f94 100644
--- a/tests/integration/commands/test_benchmark_command.py
+++ b/tests/integration/commands/test_benchmark_command.py
@@ -23,6 +23,8 @@
 import yaml
 from inference_endpoint.commands.benchmark.execute import run_benchmark
 from inference_endpoint.config.schema import (
+    AuditConfig,
+    AuditTestId,
     BenchmarkConfig,
     Dataset,
     DatasetType,
@@ -166,6 +168,76 @@ def test_mode_logging(self, mock_http_echo_server, ds_dataset_path, caplog):
         assert "QPS: 20" in caplog.text
         assert "Responses: False" in caplog.text
 
+    @pytest.mark.integration
+    @pytest.mark.parametrize(
+        "test_type,settings",
+        [
+            (
+                TestType.OFFLINE,
+                Settings(
+                    runtime=RuntimeConfig(min_duration_ms=0),
+                    load_pattern=LoadPattern(type=LoadPatternType.MAX_THROUGHPUT),
+                    client=HTTPClientConfig(
+                        num_workers=1, warmup_connections=0, max_connections=10
+                    ),
+                ),
+            ),
+            (
+                TestType.ONLINE,
+                Settings(
+                    runtime=RuntimeConfig(min_duration_ms=0),
+                    load_pattern=LoadPattern(
+                        type=LoadPatternType.CONCURRENCY, target_concurrency=1
+                    ),
+                    client=HTTPClientConfig(
+                        num_workers=1, warmup_connections=0, max_connections=10
+                    ),
+                ),
+            ),
+        ],
+        ids=["offline", "single-stream"],
+    )
+    def test_audit_output_caching_two_phase_flow(
+        self,
+        mock_http_echo_server,
+        ds_dataset_path,
+        tmp_path,
+        caplog,
+        test_type,
+        settings,
+    ):
+        """Output-caching audit (MLPerf TEST04) runs reference + output_caching
+        phases for offline and single-stream.
+
+        Exercises the redesigned audit: config block → run_audit orchestrator →
+        AuditResult. Asserts both phase subdirs are created, the result file is
+        written, and run_benchmark returns a non-None result.
+        """
+        config = BenchmarkConfig(
+            type=test_type,
+            audit=AuditConfig(
+                test=AuditTestId.OUTPUT_CACHING_TEST,
+                samples=4,
+                audit_samples=2,
+                sample_index=0,
+            ),
+            endpoint_config=EndpointConfig(endpoints=[mock_http_echo_server.url]),
+            model_params=ModelParams(name="echo-server", streaming=StreamingMode.OFF),
+            datasets=[Dataset(path=ds_dataset_path, type=DatasetType.PERFORMANCE)],
+            settings=settings,
+            report_dir=str(tmp_path),
+        )
+
+        with caplog.at_level("INFO"):
+            result = run_benchmark(config, TestMode.PERF)
+
+        # Both planned phases ran under their own subdirs.
+        assert (tmp_path / "reference").is_dir()
+        assert (tmp_path / "output_caching").is_dir()
+        # Orchestrator returned a result and wrote it to disk.
+        assert result is not None
+        assert (tmp_path / "audit_result.json").exists()
+
 
 TEMPLATE_DIR = (
     Path(__file__).parent.parent.parent.parent
diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py
index 1c90554fb..8457e5567 100644
--- a/tests/unit/commands/test_benchmark.py
+++ b/tests/unit/commands/test_benchmark.py
@@ -1277,3 +1277,32 @@ def test_no_override_yields_none_when_model_has_no_tokenizer(
             ctx = setup_benchmark(config, TestMode.PERF)
 
         assert ctx.tokenizer_name is None
+
+
+class TestRunBenchmarkInterrupt:
+    @pytest.mark.unit
+    def test_keyboard_interrupt_skips_audit(self, monkeypatch):
+        """A Ctrl-C during the main run must propagate and NOT start the audit:
+        an interrupted run cannot certify a compliance result."""
+        from inference_endpoint.commands import audit as audit_mod
+        from inference_endpoint.commands.benchmark import execute as ex
+        from inference_endpoint.commands.benchmark.execute import (
+            TestMode,
+            run_benchmark,
+        )
+
+        config = MagicMock()
+        config.audit = MagicMock()  # audit IS configured
+
+        def _interrupt(ctx):
+            raise KeyboardInterrupt
+
+        monkeypatch.setattr(ex, "setup_benchmark", lambda *a, **k: MagicMock())
+        monkeypatch.setattr(ex, "run_benchmark_async", _interrupt)
+        monkeypatch.setattr(ex, "finalize_benchmark", lambda *a, **k: None)
+        audit_spy = MagicMock()
+        monkeypatch.setattr(audit_mod, "run_audit", audit_spy)
+
+        with pytest.raises(KeyboardInterrupt):
+            run_benchmark(config, TestMode.PERF)
+        audit_spy.assert_not_called()
diff --git a/tests/unit/compliance/__init__.py b/tests/unit/compliance/__init__.py
new file mode 100644
index 000000000..467079831
--- /dev/null
+++ b/tests/unit/compliance/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tests/unit/compliance/test_output_caching.py b/tests/unit/compliance/test_output_caching.py
new file mode 100644
index 000000000..2c2cb8dbf
--- /dev/null
+++ b/tests/unit/compliance/test_output_caching.py
@@ -0,0 +1,388 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for output-caching (MLPerf TEST04) audit logic."""
+
+from __future__ import annotations
+
+import json
+from unittest.mock import MagicMock
+
+import pytest
+from inference_endpoint.compliance import RunStats
+from inference_endpoint.compliance.result import AuditResult, write_result
+from inference_endpoint.compliance.tests.output_caching_test import (
+    OutputCachingAudit,
+    verify_output_caching,
+)
+from inference_endpoint.config.schema import AuditTestId, LoadPatternType
+
+# Load patterns the audit must reject, derived from the enum so the test stays
+# correct regardless of which patterns exist on the base (only max_throughput
+# and concurrency are accepted).
+_REJECTED_LOAD_PATTERNS = [
+    pattern
+    for pattern in LoadPatternType
+    if pattern not in (LoadPatternType.MAX_THROUGHPUT, LoadPatternType.CONCURRENCY)
+]
+
+# ---------------------------------------------------------------------------
+# verify_output_caching — pure function, no I/O
+# ---------------------------------------------------------------------------
+
+
+class TestVerifyOutputCaching:
+    @pytest.mark.unit
+    def test_pass_when_audit_qps_below_threshold(self):
+        # audit 105 is under the limit of 110 (ref 100 at threshold 0.10) → PASS
+        baseline = RunStats(qps=100.0, n_completed=1000, n_requested=1000)
+        probe = RunStats(qps=105.0, n_completed=1000, n_requested=1000)
+        verdict = verify_output_caching(baseline, probe, threshold=0.10)
+        assert verdict.passed is True
+        assert verdict.test_id == AuditTestId.OUTPUT_CACHING_TEST.value
+
+    @pytest.mark.unit
+    def test_fail_when_audit_qps_clearly_above_threshold(self):
+        baseline = RunStats(qps=100.0, n_completed=1000, n_requested=1000)
+        # audit 120 > limit 110 → FAIL
+        probe = RunStats(qps=120.0, n_completed=1000, n_requested=1000)
+        assert verify_output_caching(baseline, probe, threshold=0.10).passed is False
+
+    @pytest.mark.unit
+    def test_fail_when_reference_phase_incomplete(self):
+        short_ref = RunStats(qps=100.0, n_completed=800, n_requested=1000)
+        probe = RunStats(qps=50.0, n_completed=1000, n_requested=1000)
+        verdict = verify_output_caching(short_ref, probe, threshold=0.10)
+        assert verdict.passed is False
+        assert "Phase incomplete" in verdict.details["reason"]
+
+    @pytest.mark.unit
+    def test_fail_when_audit_phase_incomplete(self):
+        baseline = RunStats(qps=100.0, n_completed=1000, n_requested=1000)
+        short_probe = RunStats(qps=50.0, n_completed=800, n_requested=1000)
+        verdict = verify_output_caching(baseline, short_probe, threshold=0.10)
+        assert verdict.passed is False
+        assert "Phase incomplete" in verdict.details["reason"]
+
+    @pytest.mark.unit
+    def test_details_contain_qps_values(self):
+        baseline = RunStats(qps=100.0, n_completed=1000, n_requested=1000)
+        probe = RunStats(qps=50.0, n_completed=1000, n_requested=1000)
+        # No threshold argument: the reported value is the 0.10 default.
+        details = verify_output_caching(baseline, probe).details
+        assert details["ref_qps"] == 100.0
+        assert details["audit_qps"] == 50.0
+        assert details["threshold"] == 0.10
+
+    @pytest.mark.unit
+    def test_custom_threshold(self):
+        baseline = RunStats(qps=100.0, n_completed=1000, n_requested=1000)
+        probe = RunStats(qps=105.0, n_completed=1000, n_requested=1000)
+        # With threshold=0.02 the limit is 102.0 → audit 105 > limit → FAIL
+        assert verify_output_caching(baseline, probe, threshold=0.02).passed is False
+
+
+# ---------------------------------------------------------------------------
+# OutputCachingAudit.plan_runs
+# ---------------------------------------------------------------------------
+
+
+class TestOutputCachingAuditPlanRuns:
+    def _make_cfg(self, samples=None, audit_samples=None, sample_index=0):
+        """Build an OutputCachingTestConfig for the output-caching audit."""
+        # AuditTestId is already imported at module level; only the config
+        # model needs the function-local import.
+        from inference_endpoint.config.schema import OutputCachingTestConfig
+
+        return OutputCachingTestConfig(
+            test=AuditTestId.OUTPUT_CACHING_TEST,
+            samples=samples,
+            audit_samples=audit_samples,
+            sample_index=sample_index,
+        )
+
+    @pytest.mark.unit
+    def test_samples_is_required(self):
+        """Omitting ``samples`` must fail pydantic validation."""
+        # Audits need an explicit reference count so the per-phase completion
+        # check has an independent target (see OutputCachingTestConfig.samples).
+        import pydantic
+        from inference_endpoint.config.schema import OutputCachingTestConfig
+
+        with pytest.raises(pydantic.ValidationError):
+            OutputCachingTestConfig(
+                test=AuditTestId.OUTPUT_CACHING_TEST, sample_index=0
+            )
+
+    @pytest.mark.unit
+    def test_plan_produces_two_specs(self):
+        # One "reference" phase followed by one "output_caching" phase.
+        cfg = self._make_cfg(samples=500)
+        specs = OutputCachingAudit().plan_runs(cfg)
+        assert len(specs) == 2
+
+    @pytest.mark.unit
+    def test_reference_spec_uses_without_replacement(self):
+        cfg = self._make_cfg(samples=500)
+        specs = OutputCachingAudit().plan_runs(cfg)
+        ref = specs[0]
+        assert ref.label == "reference"
+        assert ref.n_samples == 500
+        assert ref.sample_order.fixed_index is None
+
+    @pytest.mark.unit
+    def test_audit_spec_uses_single_index(self):
+        cfg = self._make_cfg(samples=500, sample_index=7)
+        specs = OutputCachingAudit().plan_runs(cfg)
+        audit = specs[1]
+        assert audit.label == "output_caching"
+        assert audit.sample_order.fixed_index == 7
+
+    @pytest.mark.unit
+    def test_audit_n_defaults_to_ref_n_when_not_set(self):
+        # audit_samples left unset → the audit phase reuses the reference count.
+        cfg = self._make_cfg(samples=300)
+        specs = OutputCachingAudit().plan_runs(cfg)
+        assert specs[1].n_samples == 300
+
+    @pytest.mark.unit
+    def test_audit_n_overrides_when_audit_samples_set(self):
+        cfg = self._make_cfg(samples=300, audit_samples=150)
+        specs = OutputCachingAudit().plan_runs(cfg)
+        assert specs[1].n_samples == 150
+
+
+# ---------------------------------------------------------------------------
+# OutputCachingAudit.verify — threshold plumbing + phase-count guard
+# ---------------------------------------------------------------------------
+
+
+class TestOutputCachingAuditVerify:
+    def _cfg(self, threshold=0.10):
+        """Build a 1000-sample audit config with the given threshold."""
+        from inference_endpoint.config.schema import OutputCachingTestConfig
+
+        return OutputCachingTestConfig(
+            test=AuditTestId.OUTPUT_CACHING_TEST,
+            samples=1000,
+            audit_samples=1000,
+            sample_index=0,
+            threshold=threshold,
+        )
+
+    def _arts(self, qps, label, n_completed=1000, n_requested=1000):
+        """Wrap a MagicMock report (fixed qps / completed count) in RunArtifacts."""
+        from pathlib import Path
+
+        from inference_endpoint.compliance import RunArtifacts
+
+        rep = MagicMock()
+        rep.qps.return_value = qps
+        rep.n_samples_completed = n_completed
+        return RunArtifacts(
+            label=label, report_dir=Path("/tmp"), report=rep, n_requested=n_requested
+        )
+
+    @pytest.mark.unit
+    def test_verify_honors_configured_threshold(self):
+        # Regression: verify() must apply cfg.threshold, not the default 0.10.
+        # audit 115 vs ref 100 → limit is 110 at 0.10 (FAIL) but 120 at 0.20 (PASS).
+        ref = self._arts(100.0, label="reference")
+        audit = self._arts(115.0, label="output_caching")
+
+        v10 = OutputCachingAudit().verify([ref, audit], self._cfg(threshold=0.10))
+        assert v10.passed is False
+
+        v20 = OutputCachingAudit().verify([ref, audit], self._cfg(threshold=0.20))
+        assert v20.passed is True
+        assert v20.details["threshold"] == 0.20
+
+    @pytest.mark.unit
+    def test_verify_rejects_wrong_phase_count(self):
+        # Only the reference phase is supplied, so verify() must refuse it.
+        ref = self._arts(100.0, label="reference")
+        with pytest.raises(ValueError, match="exactly 2 phases"):
+            OutputCachingAudit().verify([ref], self._cfg())
+
+
+# ---------------------------------------------------------------------------
+# RunStats.from_report
+# ---------------------------------------------------------------------------
+
+
+class TestRunStats:
+    @pytest.mark.unit
+    def test_from_report_extracts_qps(self):
+        report = MagicMock(n_samples_completed=100)
+        report.qps.return_value = 42.5
+        # n_requested is handed to from_report alongside the mocked report.
+        stats = RunStats.from_report(report, n_requested=100)
+        assert stats.qps == 42.5
+        assert stats.n_completed == 100
+        assert stats.n_requested == 100
+
+    @pytest.mark.unit
+    def test_from_report_raises_when_qps_is_none(self):
+        # A report whose qps() is None must be rejected as having "no duration".
+        no_duration = MagicMock(**{"qps.return_value": None})
+        with pytest.raises(ValueError, match="no duration"):
+            RunStats.from_report(no_duration, n_requested=100)
+
+    @pytest.mark.unit
+    def test_from_report_raises_when_qps_non_positive(self):
+        # A zero-throughput run (no completions) can't anchor an
+        # output-caching ratio.
+        stalled = MagicMock(**{"qps.return_value": 0.0})
+        with pytest.raises(ValueError, match="non-positive throughput"):
+            RunStats.from_report(stalled, n_requested=100)
+
+
+# ---------------------------------------------------------------------------
+# write_result — atomic disk write
+# ---------------------------------------------------------------------------
+
+
+class TestWriteResult:
+    @pytest.mark.unit
+    def test_writes_txt_and_json(self, tmp_path):
+        passing = AuditResult(
+            test_id="output_caching_test",
+            passed=True,
+            details={"reason": "ok", "ref_qps": 100.0},
+        )
+        write_result(passing, tmp_path)
+        verdict_txt = tmp_path / "verify_OUTPUT_CACHING_TEST.txt"
+        assert "Performance check pass: True" in verdict_txt.read_text()
+        payload = json.loads((tmp_path / "audit_result.json").read_text())
+        assert payload["passed"] is True
+        assert payload["test"] == "output_caching_test"
+
+    @pytest.mark.unit
+    def test_failed_result_writes_false(self, tmp_path):
+        failing = AuditResult(
+            test_id="output_caching_test", passed=False, details={"reason": "fail"}
+        )
+        write_result(failing, tmp_path)
+        verdict_txt = tmp_path / "verify_OUTPUT_CACHING_TEST.txt"
+        assert "Performance check pass: False" in verdict_txt.read_text()
+
+    @pytest.mark.unit
+    def test_json_contains_full_details(self, tmp_path):
+        expected = {
+            "ref_qps": 100.0,
+            "audit_qps": 80.0,
+            "threshold": 0.10,
+            "reason": "ok",
+        }
+        outcome = AuditResult(
+            test_id="output_caching_test", passed=True, details=expected
+        )
+        write_result(outcome, tmp_path)
+        payload = json.loads((tmp_path / "audit_result.json").read_text())
+        assert payload["ref_qps"] == 100.0
+        assert payload["threshold"] == 0.10
+
+
+# ---------------------------------------------------------------------------
+# Registry
+# ---------------------------------------------------------------------------
+
+
+class TestRegistry:
+    @pytest.mark.unit
+    def test_get_audit_test_returns_output_caching(self):
+        from inference_endpoint.compliance import get_audit_test
+
+        audit_id = AuditTestId.OUTPUT_CACHING_TEST
+        assert get_audit_test(audit_id).test_id == audit_id
+
+    @pytest.mark.unit
+    def test_get_audit_test_raises_on_unknown(self, monkeypatch):
+        from inference_endpoint import compliance
+
+        # Swap in an empty registry so the lookup has nothing to find; the
+        # tests subpackage is already import-cached and won't re-register.
+        monkeypatch.setattr(compliance, "_REGISTRY", {})
+        with pytest.raises(KeyError, match="No audit test registered"):
+            compliance.get_audit_test(AuditTestId.OUTPUT_CACHING_TEST)
+
+
+# ---------------------------------------------------------------------------
+# run_audit orchestrator guards
+# ---------------------------------------------------------------------------
+
+
+class TestRunAuditGuards:
+    def _audit_config(self) -> MagicMock:
+        """A MagicMock BenchmarkConfig with a real output-caching audit block."""
+        from inference_endpoint.config.schema import OutputCachingTestConfig
+
+        config = MagicMock()
+        config.audit = OutputCachingTestConfig(
+            test=AuditTestId.OUTPUT_CACHING_TEST, samples=4, sample_index=0
+        )
+        return config
+
+    @pytest.mark.unit
+    @pytest.mark.parametrize("pattern", _REJECTED_LOAD_PATTERNS, ids=lambda p: p.name)
+    def test_rejects_paced_or_incompatible_load_pattern(self, tmp_path, pattern):
+        """Only max_throughput / concurrency are valid; everything else is
+        rejected before any phase runs (guards the fixed-index audit semantics)."""
+        from inference_endpoint.commands.audit import run_audit
+        from inference_endpoint.exceptions import SetupError
+
+        config = self._audit_config()
+        config.settings.load_pattern.type = pattern
+        with pytest.raises(SetupError, match="unpaced load pattern"):
+            run_audit(config, tmp_path)
+
+    @pytest.mark.unit
+    @pytest.mark.parametrize("pattern", ["MAX_THROUGHPUT", "CONCURRENCY"])
+    def test_refuses_result_on_incomplete_phase(self, tmp_path, monkeypatch, pattern):
+        """A phase whose Report.complete is False (drain timeout / interrupted)
+        must abort with ExecutionError, never a certified result."""
+        from inference_endpoint.commands.audit import run_audit
+        from inference_endpoint.exceptions import ExecutionError
+
+        config = self._audit_config()
+        config.settings.load_pattern.type = getattr(LoadPatternType, pattern)
+        perf_ds = MagicMock()
+        perf_ds.type.value = "performance"
+        config.datasets = [perf_ds]
+        config.with_updates.return_value = MagicMock()
+
+        # The bounds-check reads num_samples from the first phase's loaded ctx.
+        ctx = MagicMock()
+        ctx.dataloader.num_samples.return_value = 100
+        incomplete = MagicMock()
+        incomplete.complete = False
+        bench = MagicMock()
+        bench.report = incomplete
+        # Replace setup/run/finalize with stubs; the stubbed run returns `bench`.
+        monkeypatch.setattr(
+            "inference_endpoint.commands.benchmark.execute.setup_benchmark",
+            lambda *a, **k: ctx,
+        )
+        monkeypatch.setattr(
+            "inference_endpoint.commands.benchmark.execute.run_benchmark_async",
+            lambda ctx: bench,
+        )
+        monkeypatch.setattr(
+            "inference_endpoint.commands.benchmark.execute.finalize_benchmark",
+            lambda ctx, b: None,
+        )
+
+        with pytest.raises(ExecutionError, match="did not complete cleanly"):
+            run_audit(config, tmp_path)
diff --git a/tests/unit/config/test_runtime_settings_sample_order.py b/tests/unit/config/test_runtime_settings_sample_order.py
new file mode 100644
index 000000000..7981fb7f8
--- /dev/null
+++ b/tests/unit/config/test_runtime_settings_sample_order.py
@@ -0,0 +1,41 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from inference_endpoint.config.runtime_settings import SampleOrderSpec
+
+
+@pytest.mark.unit
+def test_default_is_without_replacement():
+    assert SampleOrderSpec().fixed_index is None
+
+
+@pytest.mark.unit
+def test_without_replacement_factory():
+    spec = SampleOrderSpec.without_replacement()
+    assert spec.fixed_index is None
+
+
+@pytest.mark.unit
+def test_single_factory_carries_index():
+    assert SampleOrderSpec.single(3).fixed_index == 3
+
+
+@pytest.mark.unit
+def test_spec_is_frozen():
+    frozen = SampleOrderSpec()
+    # Assigning to a field of the frozen spec must raise AttributeError.
+    with pytest.raises(AttributeError):
+        frozen.fixed_index = 5  # type: ignore[misc]
diff --git a/tests/unit/load_generator/test_sample_order.py b/tests/unit/load_generator/test_sample_order.py
index 4ddeff10d..ca5605877 100644
--- a/tests/unit/load_generator/test_sample_order.py
+++ b/tests/unit/load_generator/test_sample_order.py
@@ -28,6 +28,19 @@
 _DATASET_SIZES = [3, 100, 10_000]
 
 
+@pytest.mark.unit
+class TestDefaultRng:
+    def test_default_rng_is_isolated_instance(self):
+        # With no rng argument each order must own a private Random rather than
+        # fall back to the process-global `random` module, which would couple
+        # draws across unrelated instances.
+        first, second = (
+            WithoutReplacementSampleOrder(n_samples_in_dataset=10) for _ in range(2)
+        )
+        assert isinstance(first.rng, random.Random) and first.rng is not random
+        assert first.rng is not second.rng
+
+
 @pytest.mark.unit
 class TestWithoutReplacementSampleOrder:
     @pytest.mark.parametrize("n_samples", _DATASET_SIZES)
diff --git a/tests/unit/load_generator/test_single_sample_order.py b/tests/unit/load_generator/test_single_sample_order.py
new file mode 100644
index 000000000..f2a85e277
--- /dev/null
+++ b/tests/unit/load_generator/test_single_sample_order.py
@@ -0,0 +1,67 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+
+import pytest
+from inference_endpoint.config.runtime_settings import RuntimeSettings, SampleOrderSpec
+from inference_endpoint.load_generator.sample_order import (
+    SingleSampleOrder,
+    WithoutReplacementSampleOrder,
+    create_sample_order,
+)
+from inference_endpoint.metrics.metric import Throughput
+
+
+@pytest.mark.unit
+def test_single_yields_fixed_index_forever():
+    fixed = SingleSampleOrder(sample_index=3, n_samples_in_dataset=10)
+    assert [next(fixed) for _draw in range(5)] == [3] * 5
+
+
+@pytest.mark.unit
+@pytest.mark.parametrize("bad", [-1, 10, 99])
+def test_single_rejects_out_of_range_index(bad: int) -> None:
+    with pytest.raises(ValueError, match="sample_index"):
+        SingleSampleOrder(n_samples_in_dataset=10, sample_index=bad)
+
+
+def _settings(spec: SampleOrderSpec, n: int = 10) -> RuntimeSettings:
+    return RuntimeSettings(
+        sample_order=spec,  # the order spec under test
+        n_samples_from_dataset=n,
+        n_samples_to_issue=None,
+        min_sample_count=1,
+        min_duration_ms=0,
+        max_duration_ms=None,
+        metric_target=Throughput(1.0),
+        reported_metrics=[],
+        load_pattern=None,
+        rng_sched=random.Random(0),
+        rng_sample_index=random.Random(0),
+    )
+
+
+@pytest.mark.unit
+def test_create_dispatches_single():
+    settings = _settings(SampleOrderSpec.single(2))
+    created = create_sample_order(settings)
+    assert isinstance(created, SingleSampleOrder) and next(created) == 2
+
+
+@pytest.mark.unit
+def test_create_defaults_without_replacement():
+    cfg = _settings(SampleOrderSpec())
+    assert isinstance(create_sample_order(cfg), WithoutReplacementSampleOrder)