Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/endpoints_submission_cli/submissions/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,8 @@ def _write_pareto_entries(
"requests_completed": warmup_cfg.get("requests_completed"),
"data_source": warmup_cfg.get("data_source"),
"concurrency": warmup_cfg.get("concurrency"),
"initialization_steps": warmup_cfg.get("initialization_steps"),
# Checker types this as a list; default to [] rather than null when absent.
"initialization_steps": warmup_cfg.get("initialization_steps") or [],
}

point_cfg: dict[str, Any] = {
Expand Down
92 changes: 92 additions & 0 deletions src/submission_checker/checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from __future__ import annotations

import json
from typing import TYPE_CHECKING

__all__ = ["SubmissionChecker"]
Expand Down Expand Up @@ -33,12 +34,16 @@
load_accuracy_result,
load_point_config,
load_result_summary,
load_run_metadata,
load_system_description,
)

if TYPE_CHECKING:
from pathlib import Path

# Absolute tolerance for the tps_utilization consistency check: a stored
# ``tps_utilization`` passes when it lies within this distance of the
# recomputed ``system_tps / max(system_tps)``.
# NOTE(review): the recomputed ratio is at most 1.0 for the peak run, so 0.1 is
# a fairly loose bound — confirm this is the intended tolerance.
_TPS_UTILIZATION_ABS_TOL = 0.1


class SubmissionChecker:
"""Validates an MLPerf Endpoints submission directory against §9.1 rules.
Expand Down Expand Up @@ -111,6 +116,9 @@ def run(self) -> Report:
for system_json in system_jsons:
report.results.extend(self._check_system(system_json, pareto_dir))

# Submission-wide: tps_utilization must match system_tps / max(system_tps).
report.results.extend(self._check_tps_utilization(pareto_dir))

# §15: at least one model in the submission must have an accuracy/results.json.
has_full_accuracy = any(True for _ in pareto_dir.rglob("accuracy/results.json"))
if has_full_accuracy:
Expand All @@ -134,6 +142,66 @@ def run(self) -> Report:

return report

# ------------------------------------------------------------------
# Submission-wide checks
# ------------------------------------------------------------------

def _check_tps_utilization(self, pareto_dir: Path) -> list[CheckResult]:
"""Verify each run's ``tps_utilization`` equals ``system_tps / max(system_tps)``.

``tps_utilization`` normalises a run to the peak ``system_tps`` across the
whole submission, so this is a cross-run check. Stored values are compared
to the recomputed expectation within an absolute tolerance of
``_TPS_UTILIZATION_ABS_TOL``. Structurally invalid metadata (missing or
non-numeric fields) is left to the per-file ``run-metadata-valid`` check.
"""
entries: list[tuple[Path, float, float]] = []
for md_path in sorted(pareto_dir.rglob("run_metadata.json")):
try:
data = json.loads(md_path.read_text())
except (OSError, ValueError):
continue
tps = data.get("system_tps")
util = data.get("tps_utilization")
if (
isinstance(tps, (int, float))
and not isinstance(tps, bool)
and isinstance(util, (int, float))
and not isinstance(util, bool)
):
entries.append((md_path, float(tps), float(util)))

if not entries:
return []
max_tps = max(tps for _, tps, _ in entries)
if max_tps <= 0:
return []

results: list[CheckResult] = []
for md_path, tps, util in entries:
expected = tps / max_tps
if abs(util - expected) <= _TPS_UTILIZATION_ABS_TOL:
results.append(
_ok(
"tps-utilization",
f"tps_utilization {util:.4f} matches expected {expected:.4f}",
md_path,
"#8.1",
)
)
else:
results.append(
_err(
"tps-utilization",
f"tps_utilization {util} != expected {expected:.4f}"
f" (system_tps {tps} / submission max {max_tps};"
f" abs tol {_TPS_UTILIZATION_ABS_TOL})",
md_path,
"#8.1",
)
)
return results

# ------------------------------------------------------------------
# Per-system orchestration
# ------------------------------------------------------------------
Expand Down Expand Up @@ -290,6 +358,30 @@ def _check_model(
)
)

run_metadata_path = point_result_dir / "run_metadata.json"
if not run_metadata_path.exists():
results.append(
_err(
"run-metadata-present",
f"Missing run_metadata.json for point_{config.concurrency}:"
f" {run_metadata_path.relative_to(self.submission_path)}",
run_metadata_path,
"#8.1",
)
)
else:
run_metadata, rm_results = load_run_metadata(run_metadata_path)
results.extend(rm_results)
if run_metadata is not None:
results.append(
_ok(
"run-metadata-valid",
f"run_metadata.json valid for point_{config.concurrency}",
run_metadata_path,
"#8.1",
)
)

summary, load_results = load_result_summary(summary_path)
results.extend(load_results)
if summary is None:
Expand Down
4 changes: 4 additions & 0 deletions src/submission_checker/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
from .aggregate import MIN_QUERY_COUNT, ModelContext, PointResult
from .file import (
AccuracyResult,
ConfigSummary,
Division,
NodeType,
PercentileStats,
PointConfig,
PointSummary,
RunMetadata,
RuntimeSettings,
SystemAvailabilityStatus,
SystemDescription,
Expand All @@ -27,9 +29,11 @@
"NodeType",
"PercentileStats",
"PointConfig",
"ConfigSummary",
"PointResult",
"PointSummary",
"RegionBounds",
"RunMetadata",
"Regions",
"Report",
"RuntimeSettings",
Expand Down
3 changes: 3 additions & 0 deletions src/submission_checker/models/file/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,19 @@
from .accuracy import AccuracyResult
from .point_config import PointConfig, RuntimeSettings
from .point_summary import PercentileStats, PointSummary
from .run_metadata import ConfigSummary, RunMetadata
from .system import Division, NodeType, SystemAvailabilityStatus, SystemDescription

__all__ = [
"AccuracyResult",
"ConfigSummary",
"Division",
"NodeType",
"SystemAvailabilityStatus",
"PercentileStats",
"PointConfig",
"PointSummary",
"RunMetadata",
"RuntimeSettings",
"SystemDescription",
]
101 changes: 101 additions & 0 deletions src/submission_checker/models/file/run_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
"""Run metadata model — ``run_metadata.json`` schema and per-file validation.

``run_metadata.json`` is generated by the submission CLI for every measurement
point and lives at ``pareto/<system>/<model>/results/point_<N>/run_metadata.json``.

Validation policy:
* Every measurement field must be present **and non-null**.
* ``config_summary`` is either a string of length >= 4 or a structured
:class:`ConfigSummary` object.
* Only ``config_summary_notes``, ``link_config``, and ``link_logs`` may be
null — but their keys must still be present.
"""

from __future__ import annotations

from typing import Annotated

from pydantic import BaseModel, ConfigDict, StringConstraints

__all__ = ["ConfigSummary", "RunMetadata"]


class ConfigSummary(BaseModel):
    """Structured parallelism/config summary (the object form of ``config_summary``).

    Every declared field is optional and defaults to ``None``. Keys that are
    not declared here are accepted rather than rejected (``extra="allow"``).
    """

    # Accept undeclared keys instead of failing validation on them.
    # NOTE(review): with every field optional and extras allowed, an empty
    # object appears to validate as a ConfigSummary — confirm that is intended.
    model_config = ConfigDict(extra="allow")

    disaggregated: bool | None = None
    expert_parallel: int | None = None
    tensor_parallel: int | None = None
    pipeline_parallel: int | None = None
    data_parallel: int | None = None
    batch: int | None = None


# String form of ``config_summary``: a free-form string of at least 4
# characters. The field also accepts a structured object instead.
_ConfigSummaryStr = Annotated[str, StringConstraints(min_length=4)]


class RunMetadata(BaseModel):
    """Parsed contents of ``run_metadata.json``.

    All fields are required and non-null except ``config_summary_notes``,
    ``link_config``, and ``link_logs``, whose keys must be present but whose
    values may be null. No field declares a default. Keys that are not
    declared here are accepted rather than rejected (``extra="allow"``).
    """

    # Accept undeclared keys instead of failing validation on them.
    model_config = ConfigDict(extra="allow")

    # Identity / configuration
    run_date: str
    node_config: str
    # Either the structured object form or a string of at least 4 characters.
    config_summary: ConfigSummary | _ConfigSummaryStr
    # Key required, value may be null.
    config_summary_notes: str | None
    concurrency: int

    # Headline metrics
    system_tps: float
    tps_per_user: float
    ttft: float
    qps: float
    tps_utilization: float

    # Run accounting
    measured_total_output_tokens: int
    measured_run_duration: float
    measured_total_requests: int

    # Optional reference links (key required, value may be null)
    link_config: str | None
    link_logs: str | None

    # TTFT latencies
    measured_latency_ttft_min: float
    measured_latency_ttft_average: float
    measured_latency_ttft_p50: float
    measured_latency_ttft_p90: float
    measured_latency_ttft_p95: float
    measured_latency_ttft_p99: float
    measured_latency_ttft_p999: float
    measured_latency_ttft_max: float

    # TPOT latencies
    measured_latency_tpot_min: float
    measured_latency_tpot_average: float
    measured_latency_tpot_p50: float
    measured_latency_tpot_p90: float
    measured_latency_tpot_p95: float
    measured_latency_tpot_p99: float
    measured_latency_tpot_p999: float
    measured_latency_tpot_max: float

    # End-to-end request latencies
    measured_latency_request_min: float
    measured_latency_request_average: float
    measured_latency_request_p50: float
    measured_latency_request_p90: float
    measured_latency_request_p95: float
    measured_latency_request_p99: float
    measured_latency_request_p999: float
    measured_latency_request_max: float
Loading
Loading