mlcommons · wu6u3tw · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026
@@ -0,0 +1,70 @@
+# Full WAN 2.2 Offline submission: performance + VBench accuracy + output-caching audit (MLPerf TEST04).
+# One command runs all three under a single report_dir:
+#   inference-endpoint benchmark from-config \
+#       examples/09_Wan22_VideoGen_Example/offline_wan22_submission.yaml
+#
+# Execution order (run_benchmark):
+#   1. performance run  — full 248-prompt dataset (the submission perf result)
+#   2. accuracy scoring — VBench over the produced videos
+#   3. audit (output_caching_test, MLPerf TEST04) — reference + fixed-sample phases (equal counts here), then result
+#
+# NOTE: the `audit:` block is implemented per docs/compliance_audit_plan.md
+# (the `compliance/` module). The performance + accuracy portion mirrors
+# offline_wan22_accuracy.yaml.
+
+name: "submission-wan22-video-generation"
+version: "1.0"
+type: "submission"
+benchmark_mode: "offline" # required for type: submission
+
+model_params:
+  name: "wan22"
+  max_new_tokens: 1 # ignored by VideoGenAdapter; kept >0 for api_type debug swaps
+  streaming: "off" # WAN 2.2 uses non-streaming HTTP POST/response
+
+datasets:
+  # Performance dataset drives request issuance (the submission perf run).
+  - name: wan22_perf
+    path: examples/09_Wan22_VideoGen_Example/wan22_prompts.jsonl
+    type: "performance"
+    samples: 248
+
+  # Accuracy dataset reuses the same prompts; videos are scored VBench-style.
+  - name: wan22_vbench
+    path: examples/09_Wan22_VideoGen_Example/wan22_prompts.jsonl
+    type: "accuracy"
+    samples: 248
+    accuracy_config:
+      eval_method: "vbench"
+      ground_truth: "prompt" # VBench input is (prompt, video), not a GT comparison
+      num_repeats: 1
+
+# Output-caching audit (MLPerf TEST04) — additive post-step. Runs its OWN reference + fixed-sample
+# phases at equal counts (the audit count may be lowered to shorten the phase).
+audit:
+  test: "output_caching_test"
+  samples: 64 # reference phase count (subset of the 248 prompts)
+  audit_samples: 64 # audit (fixed-sample) phase count; lower (e.g. 32) to shorten the audit phase
+  sample_index: 3 # MLCommons audit.config performance_issue_same_index=3
+  threshold: 0.10 # audit qps must stay < reference qps * (1 + threshold)
+
+settings:
+  runtime:
+    # NOTE: runs are count-driven (n_samples_to_issue for the perf/accuracy run;
+    # audit.samples and audit.audit_samples for the audit phases). min_duration_ms is
+    # NOT enforced as a duration floor by the current stop logic (counts take priority);
+    # MLCommons' 10-min minimum / AND-semantics is future work. Only max_duration_ms caps.
+    max_duration_ms: 14400000 # 4-hour ceiling
+    scheduler_random_seed: 42
+    dataloader_random_seed: 42
+    n_samples_to_issue: 248 # applies to the perf/accuracy run; audit uses audit.samples / audit.audit_samples
+
+  load_pattern:
+    type: "max_throughput"
+
+endpoint_config:
+  endpoints:
+    - "http://localhost:8000"
+  api_type: "videogen"
+  api_key: null
+
+report_dir: logs/wan22_submission
@@ -0,0 +1,71 @@
+# Full WAN 2.2 SingleStream submission: performance + VBench accuracy + output-caching audit (MLPerf TEST04).
+# SingleStream = one request in-flight at a time (concurrency=1).
+# One command runs all three under a single report_dir:
+#   inference-endpoint benchmark from-config \
+#       examples/09_Wan22_VideoGen_Example/single_stream_wan22_submission.yaml
+#
+# Execution order (run_benchmark):
+#   1. performance run  — single-stream latency over the prompt set
+#   2. accuracy scoring — VBench over the produced videos
+#   3. audit (output_caching_test, MLPerf TEST04) — reference + fixed-sample phases, then result
+#
+# NOTE: the `audit:` block is implemented per docs/compliance_audit_plan.md
+# (the `compliance/` module).
+
+name: "submission-wan22-video-generation-singlestream"
+version: "1.0"
+type: "submission"
+benchmark_mode: "online" # required for type: submission
+
+model_params:
+  name: "wan22"
+  max_new_tokens: 1 # ignored by VideoGenAdapter; kept >0 for api_type debug swaps
+  streaming: "off" # WAN 2.2 uses non-streaming HTTP POST/response
+
+datasets:
+  # Performance dataset drives request issuance (the submission perf run).
+  - name: wan22_perf
+    path: examples/09_Wan22_VideoGen_Example/wan22_prompts.jsonl
+    type: "performance"
+    samples: 20 # MLCommons SingleStream min_query_count
+
+  # Accuracy dataset reuses the same prompts; videos are scored VBench-style.
+  - name: wan22_vbench
+    path: examples/09_Wan22_VideoGen_Example/wan22_prompts.jsonl
+    type: "accuracy"
+    samples: 20
+    accuracy_config:
+      eval_method: "vbench"
+      ground_truth: "prompt" # VBench input is (prompt, video), not a GT comparison
+      num_repeats: 1
+
+# Output-caching audit (MLPerf TEST04) — additive post-step. Runs its OWN reference + fixed-sample
+# phases at equal counts (the audit count may be lowered to shorten the phase).
+audit:
+  test: "output_caching_test"
+  samples: 20 # reference phase count (SingleStream min_query_count)
+  audit_samples: 20 # audit (fixed-sample) phase count; omit to equal `samples`
+  sample_index: 3 # MLCommons audit.config performance_issue_same_index=3
+  threshold: 0.20 # low-throughput stream tolerance (±20%)
+
+settings:
+  runtime:
+    # NOTE: runs are count-driven (n_samples_to_issue / audit counts). min_duration_ms is
+    # NOT enforced as a duration floor by the current stop logic (counts take priority);
+    # MLCommons' 10-min minimum / AND-semantics is future work. Only max_duration_ms caps.
+    max_duration_ms: 7200000 # 2-hour ceiling
+    scheduler_random_seed: 42
+    dataloader_random_seed: 42
+    n_samples_to_issue: 20 # applies to the perf/accuracy run; audit uses its own counts
+
+  load_pattern:
+    type: "concurrency"
+    target_concurrency: 1 # SingleStream: one request in-flight at a time
+
+endpoint_config:
+  endpoints:
+    - "http://localhost:8000"
+  api_type: "videogen"
+  api_key: null
+
+report_dir: logs/wan22_singlestream_submission
@@ -73,6 +73,7 @@ dependencies = [
     # Fix pytz-2024 import warning
     "pytz==2026.1.post1",
     "urllib3==2.7.0",
+    "msgpack==1.2.1",
 ]
 
 [project.optional-dependencies]
@@ -112,7 +113,7 @@ test = [
     "Pympler==1.1",
     "scipy==1.17.1",
     # HTTP server and client for mock server fixture
-    "aiohttp==3.14.0",
+    "aiohttp==3.14.1",
     # Plotting for benchmark sweep mode
     "matplotlib==3.10.8",
     # Property-based testing (CLI fuzz)

@@ -183,6 +183,16 @@ async def main() -> None:
         default=0,
         help="Identity to send in the readiness signal",
     )
+    parser.add_argument(
+        "--ready-file",
+        type=str,
+        default=None,
+        help=(
+            "If set, touch this file after signal handlers are registered "
+            "so that test harnesses can poll for startup completion instead "
+            "of relying on a fixed sleep."
+        ),
+    )
     args = parser.parse_args()
     setup_logging(level="INFO")
 
@@ -282,6 +292,9 @@ async def main() -> None:
                 ),
             )
 
+            if args.ready_file:
+                Path(args.ready_file).touch()
+
             if args.readiness_path:
                 await send_ready_signal(zmq_ctx, args.readiness_path, args.readiness_id)
 

@@ -0,0 +1,160 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Generic compliance audit orchestrator.
+
+run_audit(config) drives all phases of a compliance audit test back-to-back
+against the same endpoint, then verifies the results and writes the result.
+
+Exit semantics (propagated by run_benchmark → cli.py → sys.exit):
+  0  PASS
+  1  FAIL
+  2  setup / I/O / phase error (raises ExecutionError or SetupError)
+"""
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+
+from ..compliance import RunArtifacts, get_audit_test
+from ..compliance.result import AuditResult, write_result
+from ..config.schema import BenchmarkConfig, LoadPatternType
+from ..exceptions import ExecutionError, SetupError
+
+logger = logging.getLogger(__name__)
+
+
+def run_audit(config: BenchmarkConfig, base_report_dir: Path) -> AuditResult:
+    """Orchestrate the planned audit phases and return the result.
+
+    All phases run back-to-back against the same endpoint, each under its
+    own subdirectory of ``base_report_dir``. If any phase raises, the error
+    is re-raised without verifying (a crashed phase must not produce a result).
+
+    Args:
+        config: Main benchmark config (must have config.audit set).
+        base_report_dir: Report directory for the main run; audit subdirs go here.
+
+    Returns:
+        AuditResult — always returned; caller maps passed/failed to exit code.
+
+    Raises:
+        SetupError: Config invalid for audit (missing audit block, paced load, bad index).
+        ExecutionError: A phase benchmark run failed.
+    """
+    from ..commands.benchmark.execute import (
+        TestMode,
+        finalize_benchmark,
+        run_benchmark_async,
+        setup_benchmark,
+    )
+
+    assert config.audit is not None, "run_audit called with config.audit=None"
+    audit_cfg = config.audit
+    test = get_audit_test(audit_cfg.test)
+
+    # Validate load pattern. The output-caching audit (MLPerf TEST04) needs a
+    # pattern where cache-induced
+    # speedups surface as higher achieved throughput:
+    #   - max_throughput: a faster SUT completes more queries per second.
+    #   - concurrency: at a fixed in-flight count, faster (cached) responses
+    #     raise the completion rate, so caching still shows up in QPS.
+    # Rate-paced patterns (poisson / target-QPS) pin the arrival rate, so a
+    # cached SUT just idles and the speedup is masked; patterns with different
+    # sample semantics (e.g. multi_turn) make the fixed-index phase meaningless.
+    # Allow-list the two valid patterns rather than enumerate the rejects.
+    load_type = config.settings.load_pattern.type
+    if load_type not in (LoadPatternType.MAX_THROUGHPUT, LoadPatternType.CONCURRENCY):
+        raise SetupError(
+            "Compliance audit requires an unpaced load pattern (max_throughput or concurrency). "
+            f"Got: {load_type.value}"
+        )
+
+    specs = test.plan_runs(audit_cfg)
+
+    perf_datasets = [d for d in config.datasets if d.type.value == "performance"]
+    if not perf_datasets:
+        raise SetupError("Audit requires at least one performance dataset")
+
+    # Execute each phase back-to-back. The first phase's setup_benchmark loads
+    # the dataset; reuse that count to bounds-check every fixed-index spec
+    # before any phase actually runs. setup_benchmark only loads data (it spawns
+    # no workers), so a failed bounds check here costs one load and nothing more.
+    artifacts: list[RunArtifacts] = []
+    n_samples: int | None = None
+    for spec in specs:
+        phase_dir = base_report_dir / spec.label
+        phase_dir.mkdir(parents=True, exist_ok=True)
+
+        # Build a per-phase config: phase subdirectory, no nested audit, explicit count.
+        phase_config = config.with_updates(report_dir=phase_dir, audit=None)
+
+        try:
+            ctx = setup_benchmark(phase_config, TestMode.PERF, run_spec=spec)
+            if n_samples is None:
+                n_samples = ctx.dataloader.num_samples()
+                for check_spec in specs:
+                    idx = check_spec.sample_order.fixed_index
+                    if idx is not None and not (0 <= idx < n_samples):
+                        raise SetupError(
+                            f"Audit phase '{check_spec.label}': sample_index={idx} "
+                            f"is out of range [0, {n_samples}) for dataset with "
+                            f"{n_samples} samples"
+                        )
+            bench = run_benchmark_async(ctx)
+            finalize_benchmark(ctx, bench)
+        except (SetupError, ExecutionError):
+            raise
+        except Exception as exc:
+            raise ExecutionError(f"Audit phase '{spec.label}' failed: {exc}") from exc
+
+        report = bench.report
+        if report is None:
+            raise ExecutionError(f"Audit phase '{spec.label}' produced no report")
+        # A drain-timeout (complete with pending async tasks) or an
+        # INTERRUPTED phase yields partial stats; certifying a result from it
+        # would let an incomplete run pass compliance.
+        if not report.complete:
+            raise ExecutionError(
+                f"Audit phase '{spec.label}' did not complete cleanly "
+                "(metrics drain timed out or the run was interrupted); "
+                "refusing to certify a result from partial data"
+            )
+        # When the spec didn't fix a count (None = full dataset), the requested
+        # count is the number actually issued this phase.
+        n_requested = (
+            spec.n_samples if spec.n_samples is not None else report.n_samples_issued
+        )
+        artifacts.append(
+            RunArtifacts(
+                label=spec.label,
+                report_dir=phase_dir,
+                report=report,
+                n_requested=n_requested,
+            )
+        )
+
+    result = test.verify(artifacts, audit_cfg)
+    write_result(result, base_report_dir)
+
+    status = "PASS" if result.passed else "FAIL"
+    logger.info(
+        "Audit %s %s — %s",
+        audit_cfg.test,
+        status,
+        result.details.get("reason", ""),
+    )
+    return result
@@ -17,6 +17,7 @@
 
 from __future__ import annotations
 
+import sys
 from pathlib import Path
 from typing import Annotated
 
@@ -25,6 +26,7 @@
 from pydantic import ValidationError  # noqa: F401 (used in from_config)
 
 from inference_endpoint.commands.benchmark.execute import run_benchmark
+from inference_endpoint.compliance.result import AuditResult
 from inference_endpoint.config.schema import (
     BenchmarkConfig,
     OfflineBenchmarkConfig,
@@ -51,7 +53,9 @@ def _run(config: BenchmarkConfig, dataset: list[str], mode: TestMode) -> None:
             raise DatasetValidationError(f"Invalid --dataset: {msgs}") from e
         except ValueError as e:
             raise DatasetValidationError(f"Invalid --dataset: {e}") from e
-    run_benchmark(config, mode)
+    result = run_benchmark(config, mode)
+    if isinstance(result, AuditResult):
+        sys.exit(0 if result.passed else 1)
 
 
 @benchmark_app.command