Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 33 additions & 16 deletions AGENTS.md

Large diffs are not rendered by default.

728 changes: 728 additions & 0 deletions docs/compliance_audit_plan.md

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Full WAN 2.2 Offline submission: performance + VBench accuracy + output-caching audit (MLPerf TEST04).
# One command runs all three under a single report_dir:
# inference-endpoint benchmark from-config \
# examples/09_Wan22_VideoGen_Example/offline_wan22_submission.yaml
#
# Execution order (run_benchmark):
# 1. performance run — full 248-prompt dataset (the submission perf result)
# 2. accuracy scoring — VBench over the produced videos
# 3. audit (output_caching_test, MLPerf TEST04) — reference + fixed-sample phases (equal counts here), then result
#
# NOTE: the `audit:` block is implemented per docs/compliance_audit_plan.md
# (the `compliance/` module). The performance + accuracy portion mirrors
# offline_wan22_accuracy.yaml.

name: "submission-wan22-video-generation"
version: "1.0"
type: "submission"
benchmark_mode: "offline" # required for type: submission

model_params:
name: "wan22"
max_new_tokens: 1 # ignored by VideoGenAdapter; kept >0 for api_type debug swaps
streaming: "off" # WAN 2.2 uses non-streaming HTTP POST/response

datasets:
# Performance dataset drives request issuance (the submission perf run).
- name: wan22_perf
path: examples/09_Wan22_VideoGen_Example/wan22_prompts.jsonl
type: "performance"
samples: 248

# Accuracy dataset reuses the same prompts; videos are scored VBench-style.
- name: wan22_vbench
path: examples/09_Wan22_VideoGen_Example/wan22_prompts.jsonl
type: "accuracy"
samples: 248
accuracy_config:
eval_method: "vbench"
ground_truth: "prompt" # VBench input is (prompt, video), not a GT comparison
num_repeats: 1

# Output-caching audit (MLPerf TEST04) — additive post-step. Runs its OWN reference and fixed-sample
# phases. The two counts are equal as configured here; `audit_samples` may be lowered to shorten the audit phase.
audit:
test: "output_caching_test"
samples: 64 # reference phase count (subset of the 248 prompts)
audit_samples: 64 # audit (fixed-sample) phase count; lower (e.g. 32) to shorten the audit phase
sample_index: 3 # MLCommons audit.config performance_issue_same_index=3
threshold: 0.10 # audit qps must stay < reference qps * (1 + threshold)

settings:
runtime:
# NOTE: runs are count-driven (n_samples_to_issue / audit.samples). min_duration_ms is
# NOT enforced as a duration floor by the current stop logic (counts take priority);
# MLCommons' 10-min minimum / AND-semantics is future work. Only max_duration_ms caps.
max_duration_ms: 14400000 # 4-hour ceiling
scheduler_random_seed: 42
dataloader_random_seed: 42
n_samples_to_issue: 248 # applies to the perf/accuracy run; audit uses audit.samples

load_pattern:
type: "max_throughput"

endpoint_config:
endpoints:
- "http://localhost:8000"
api_type: "videogen"
api_key: null

report_dir: logs/wan22_submission
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Full WAN 2.2 SingleStream submission: performance + VBench accuracy + output-caching audit (MLPerf TEST04).
# SingleStream = one request in-flight at a time (concurrency=1).
# One command runs all three under a single report_dir:
# inference-endpoint benchmark from-config \
# examples/09_Wan22_VideoGen_Example/single_stream_wan22_submission.yaml
#
# Execution order (run_benchmark):
# 1. performance run — single-stream latency over the prompt set
# 2. accuracy scoring — VBench over the produced videos
# 3. audit (output_caching_test, MLPerf TEST04) — reference + fixed-sample phases, then result
#
# NOTE: the `audit:` block is implemented per docs/compliance_audit_plan.md
# (the `compliance/` module).

name: "submission-wan22-video-generation-singlestream"
version: "1.0"
type: "submission"
benchmark_mode: "online" # required for type: submission

model_params:
name: "wan22"
max_new_tokens: 1 # ignored by VideoGenAdapter; kept >0 for api_type debug swaps
streaming: "off" # WAN 2.2 uses non-streaming HTTP POST/response

datasets:
# Performance dataset drives request issuance (the submission perf run).
- name: wan22_perf
path: examples/09_Wan22_VideoGen_Example/wan22_prompts.jsonl
type: "performance"
samples: 20 # MLCommons SingleStream min_query_count

# Accuracy dataset reuses the same prompts; videos are scored VBench-style.
- name: wan22_vbench
path: examples/09_Wan22_VideoGen_Example/wan22_prompts.jsonl
type: "accuracy"
samples: 20
accuracy_config:
eval_method: "vbench"
ground_truth: "prompt" # VBench input is (prompt, video), not a GT comparison
num_repeats: 1

# Output-caching audit (MLPerf TEST04) — additive post-step. Runs its OWN reference and fixed-sample
# phases. The two counts are equal as configured here; `audit_samples` may be lowered to shorten the audit phase.
audit:
test: "output_caching_test"
samples: 20 # reference phase count (SingleStream min_query_count)
audit_samples: 20 # audit (fixed-sample) phase count; omit to equal `samples`
sample_index: 3 # MLCommons audit.config performance_issue_same_index=3
threshold: 0.20 # looser tolerance for the low-throughput stream: audit qps must stay < reference qps * (1 + threshold)

settings:
runtime:
# NOTE: runs are count-driven (n_samples_to_issue / audit counts). min_duration_ms is
# NOT enforced as a duration floor by the current stop logic (counts take priority);
# MLCommons' 10-min minimum / AND-semantics is future work. Only max_duration_ms caps.
max_duration_ms: 7200000 # 2-hour ceiling
scheduler_random_seed: 42
dataloader_random_seed: 42
n_samples_to_issue: 20 # applies to the perf/accuracy run; audit uses its own counts

load_pattern:
type: "concurrency"
target_concurrency: 1 # SingleStream: one request in-flight at a time

endpoint_config:
endpoints:
- "http://localhost:8000"
api_type: "videogen"
api_key: null

report_dir: logs/wan22_singlestream_submission
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ dependencies = [
# Fix pytz-2024 import warning
"pytz==2026.1.post1",
"urllib3==2.7.0",
"msgpack==1.2.1",
]

[project.optional-dependencies]
Expand Down Expand Up @@ -112,7 +113,7 @@ test = [
"Pympler==1.1",
"scipy==1.17.1",
# HTTP server and client for mock server fixture
"aiohttp==3.14.0",
"aiohttp==3.14.1",
# Plotting for benchmark sweep mode
"matplotlib==3.10.8",
# Property-based testing (CLI fuzz)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,16 @@ async def main() -> None:
default=0,
help="Identity to send in the readiness signal",
)
parser.add_argument(
"--ready-file",
type=str,
default=None,
help=(
"If set, touch this file after signal handlers are registered "
"so that test harnesses can poll for startup completion instead "
"of relying on a fixed sleep."
),
)
args = parser.parse_args()
setup_logging(level="INFO")

Expand Down Expand Up @@ -282,6 +292,9 @@ async def main() -> None:
),
)

if args.ready_file:
Path(args.ready_file).touch()

if args.readiness_path:
await send_ready_signal(zmq_ctx, args.readiness_path, args.readiness_id)

Expand Down
160 changes: 160 additions & 0 deletions src/inference_endpoint/commands/audit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Generic compliance audit orchestrator.

run_audit(config) drives all phases of a compliance audit test back-to-back
against the same endpoint, then verifies the results and writes the result.

Exit semantics (propagated by run_benchmark → cli.py → sys.exit):
0 PASS
1 FAIL
2 setup / I/O / phase error (raises ExecutionError or SetupError)
"""

from __future__ import annotations

import logging
from pathlib import Path

from ..compliance import RunArtifacts, get_audit_test
from ..compliance.result import AuditResult, write_result
from ..config.schema import BenchmarkConfig, LoadPatternType
from ..exceptions import ExecutionError, SetupError

logger = logging.getLogger(__name__)


def run_audit(config: BenchmarkConfig, base_report_dir: Path) -> AuditResult:
    """Run every planned audit phase, verify the outcome, and persist the result.

    The audit test selected by ``config.audit.test`` plans a list of run specs.
    Each spec is executed back-to-back against the same endpoint, under its own
    ``base_report_dir / spec.label`` subdirectory. If any phase raises, the
    error propagates and no verification is attempted (a crashed phase must
    not produce a result).

    Args:
        config: Main benchmark config (must have ``config.audit`` set).
        base_report_dir: Report directory for the main run; audit subdirs go here.

    Returns:
        AuditResult — always returned; caller maps passed/failed to exit code.

    Raises:
        SetupError: Config invalid for audit (missing audit block, paced load
            pattern, no performance dataset, fixed sample index out of range).
        ExecutionError: A phase's report directory could not be created, a
            phase benchmark run failed, produced no report, or did not
            complete cleanly.
    """
    from ..commands.benchmark.execute import (
        TestMode,
        finalize_benchmark,
        run_benchmark_async,
        setup_benchmark,
    )

    # Explicit raise instead of `assert`: asserts are stripped under `python -O`
    # and would surface as AssertionError rather than the documented SetupError.
    audit_cfg = config.audit
    if audit_cfg is None:
        raise SetupError("run_audit called with config.audit=None")
    test = get_audit_test(audit_cfg.test)

    # Validate load pattern. The output-caching audit (MLPerf TEST04) needs a
    # pattern where cache-induced speedups surface as higher achieved throughput:
    #   - max_throughput: a faster SUT completes more queries per second.
    #   - concurrency: at a fixed in-flight count, faster (cached) responses
    #     raise the completion rate, so caching still shows up in QPS.
    # Rate-paced patterns (poisson / target-QPS) pin the arrival rate, so a
    # cached SUT just idles and the speedup is masked; patterns with different
    # sample semantics (e.g. multi_turn) make the fixed-index phase meaningless.
    # Allow-list the two valid patterns rather than enumerate the rejects.
    load_type = config.settings.load_pattern.type
    if load_type not in (LoadPatternType.MAX_THROUGHPUT, LoadPatternType.CONCURRENCY):
        raise SetupError(
            "Compliance audit requires an unpaced load pattern (max_throughput or concurrency). "
            f"Got: {load_type.value}"
        )

    specs = test.plan_runs(audit_cfg)

    perf_datasets = [d for d in config.datasets if d.type.value == "performance"]
    if not perf_datasets:
        raise SetupError("Audit requires at least one performance dataset")

    # Execute each phase back-to-back. The first phase's setup_benchmark loads
    # the dataset; reuse that count to bounds-check every fixed-index spec
    # before any phase actually runs. setup_benchmark only loads data (it spawns
    # no workers), so a failed bounds check here costs one load and nothing more.
    artifacts: list[RunArtifacts] = []
    n_samples: int | None = None
    for spec in specs:
        phase_dir = base_report_dir / spec.label
        # A filesystem failure here is an I/O error of the audit itself; report
        # it through the documented exception type rather than a raw OSError.
        try:
            phase_dir.mkdir(parents=True, exist_ok=True)
        except OSError as exc:
            raise ExecutionError(
                f"Audit phase '{spec.label}': cannot create report directory "
                f"{phase_dir}: {exc}"
            ) from exc

        # Build a per-phase config: phase subdirectory, no nested audit, explicit count.
        phase_config = config.with_updates(report_dir=phase_dir, audit=None)

        try:
            ctx = setup_benchmark(phase_config, TestMode.PERF, run_spec=spec)
            if n_samples is None:
                n_samples = ctx.dataloader.num_samples()
                for check_spec in specs:
                    idx = check_spec.sample_order.fixed_index
                    if idx is not None and not (0 <= idx < n_samples):
                        raise SetupError(
                            f"Audit phase '{check_spec.label}': sample_index={idx} "
                            f"is out of range [0, {n_samples}) for dataset with "
                            f"{n_samples} samples"
                        )
            bench = run_benchmark_async(ctx)
            finalize_benchmark(ctx, bench)
        except (SetupError, ExecutionError):
            # Already one of the documented error types; pass through untouched.
            raise
        except Exception as exc:
            raise ExecutionError(f"Audit phase '{spec.label}' failed: {exc}") from exc

        report = bench.report
        if report is None:
            raise ExecutionError(f"Audit phase '{spec.label}' produced no report")
        # A drain-timeout (complete with pending async tasks) or an
        # INTERRUPTED phase yields partial stats; certifying a result from it
        # would let an incomplete run pass compliance.
        if not report.complete:
            raise ExecutionError(
                f"Audit phase '{spec.label}' did not complete cleanly "
                "(metrics drain timed out or the run was interrupted); "
                "refusing to certify a result from partial data"
            )
        # When the spec didn't fix a count (None = full dataset), the requested
        # count is the number actually issued this phase.
        n_requested = (
            spec.n_samples if spec.n_samples is not None else report.n_samples_issued
        )
        artifacts.append(
            RunArtifacts(
                label=spec.label,
                report_dir=phase_dir,
                report=report,
                n_requested=n_requested,
            )
        )

    result = test.verify(artifacts, audit_cfg)
    write_result(result, base_report_dir)

    status = "PASS" if result.passed else "FAIL"
    logger.info(
        "Audit %s %s — %s",
        audit_cfg.test,
        status,
        result.details.get("reason", ""),
    )
    return result
6 changes: 5 additions & 1 deletion src/inference_endpoint/commands/benchmark/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from __future__ import annotations

import sys
from pathlib import Path
from typing import Annotated

Expand All @@ -25,6 +26,7 @@
from pydantic import ValidationError # noqa: F401 (used in from_config)

from inference_endpoint.commands.benchmark.execute import run_benchmark
from inference_endpoint.compliance.result import AuditResult
from inference_endpoint.config.schema import (
BenchmarkConfig,
OfflineBenchmarkConfig,
Expand All @@ -51,7 +53,9 @@ def _run(config: BenchmarkConfig, dataset: list[str], mode: TestMode) -> None:
raise DatasetValidationError(f"Invalid --dataset: {msgs}") from e
except ValueError as e:
raise DatasetValidationError(f"Invalid --dataset: {e}") from e
run_benchmark(config, mode)
result = run_benchmark(config, mode)
if isinstance(result, AuditResult):
sys.exit(0 if result.passed else 1)


@benchmark_app.command
Expand Down
Loading
Loading