Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/endpoints_submission_cli/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,11 @@ class SubmissionBuildError(Exception):

class SubmissionCheckError(Exception):
    """Raised when the Submission Checker reports validation errors.

    Carries no state of its own beyond the standard ``Exception`` arguments.
    """


class TruncationError(Exception):
    """Raised when a results payload's ``responses`` cannot be truncated under the cap.

    Guards against silently shipping an un-truncated (potentially multi-GB) payload
    when ``responses`` is an unexpected shape the truncator does not handle.

    ``truncate_responses`` raises it in two situations: ``responses`` is neither a
    list nor a dict, or the collection still exceeds the size limit after truncation.
    """
61 changes: 45 additions & 16 deletions src/endpoints_submission_cli/truncation.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,33 +11,62 @@

import json

from .exceptions import TruncationError

__all__ = ["RESPONSES_LIMIT", "truncate_responses"]

RESPONSES_LIMIT = 10 * 1024  # 10 KB


def truncate_responses(content: bytes) -> bytes:
    """Truncate the ``responses`` collection in a results.json payload to stay under 10 KB.

    ``responses`` may be either a list (``[entry, ...]``) or a dict keyed by sample
    id (``{uuid: output, ...}``); both are produced by different run modes. Entries
    are kept in iteration order until adding the next would exceed the limit, so the
    kept collection can end up empty when even the first entry is too large. Only the
    ``responses`` key is affected; all other fields (accuracy scores, config, ...) are
    preserved.

    Returns *content* unchanged (the very same bytes) when it is not JSON, when the
    top-level JSON value is not an object, or when ``responses`` is missing, null, or
    an empty list/dict. In every other case the payload is re-serialized with
    ``indent=2``, even if no entry had to be dropped.

    The size that is bounded is the compact ``json.dumps`` encoding of ``responses``
    (``", "`` and ``": "`` separators), not its indented form in the returned bytes.

    Raises ``TruncationError`` when ``responses`` is neither a list nor a dict, or
    when the truncated collection still serializes to more than ``RESPONSES_LIMIT``
    bytes.
    """
    try:
        data = json.loads(content)
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        return content
    if not isinstance(data, dict):
        # Valid JSON but not an object (bare list, number, null, ...): there is no
        # ``responses`` key to bound, so pass it through like any payload without one.
        return content
    responses = data.get("responses")
    if responses is None or (isinstance(responses, (list, dict)) and not responses):
        return content  # nothing to truncate
    if isinstance(responses, list):
        # Each item contributes its own bytes plus 2 for the ", " separator after the first.
        total = 2  # "[]"
        idx = 0
        for i, r in enumerate(responses):
            total += len(json.dumps(r).encode()) + (2 if i > 0 else 0)
            if total > RESPONSES_LIMIT:
                break
            idx = i + 1
        data["responses"] = responses[:idx]
    elif isinstance(responses, dict):
        # Each entry contributes ``"key": value`` plus 2 for the ", " separator after
        # the first. Approximate the key/value cost conservatively to stay under the cap.
        total = 2  # "{}"
        kept: dict[str, object] = {}
        for i, (k, v) in enumerate(responses.items()):
            entry = len(json.dumps(k).encode()) + len(json.dumps(v).encode()) + 2  # ': '
            if i > 0:
                entry += 2  # ", "
            if total + entry > RESPONSES_LIMIT:
                break
            total += entry
            kept[k] = v
        data["responses"] = kept
    else:
        # Unknown shape — refuse to ship a payload we cannot bound. A silent pass-through
        # here is what let a multi-GB results.json reach a submission bundle.
        raise TruncationError(
            f"Cannot truncate 'responses' of type {type(responses).__name__}; "
            "expected a list or dict."
        )
    # Defense-in-depth: never return a payload whose responses still exceed the cap.
    if len(json.dumps(data["responses"]).encode()) > RESPONSES_LIMIT:
        raise TruncationError("responses still exceed the size limit after truncation")
    return json.dumps(data, indent=2).encode()
22 changes: 22 additions & 0 deletions src/submission_checker/accuracy_targets.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,28 @@
# (model-keyword fragments, {metric → (lower, upper | None)}, min_queries)
# More-specific entries must come before less-specific ones.
_TARGETS: list[tuple[frozenset[str], dict[str, tuple[float, float | None]], int]] = [
# deepseek-r1 — golden fp32 exact_match 81.3582 (gate ≥ 0.99×).
# Only exact_match is gated: results.json's accuracy_scores expose a single scalar
# `score` (== exact_match). The canonical spec also bounds TOKENS_PER_SAMPLE
# (0.9–1.1 × 3886.2274), but that metric lives only in the deepseek_eval file, not
# in results.json, so it is intentionally NOT gated here.
(
frozenset({"deepseek", "r1"}),
{
"exact_match": (81.3582 * 0.99, None),
},
4388,
),
# gpt-oss-120b — golden fp32 exact_match 83.13; upstream tokens_per_sample upper
# bound is still a placeholder (constants.py:215), so it is intentionally omitted.
# min_queries uses the accuracy-sample-count (4395), not the perf count (6396).
(
frozenset({"gptoss", "120b"}),
{
"exact_match": (83.13 * 0.99, None),
},
4395,
),
# llama3.1-405b — inference uses ROUGEL as primary; also exact_match + tokens
(
frozenset({"llama3", "405b"}),
Expand Down
64 changes: 50 additions & 14 deletions src/submission_checker/checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from __future__ import annotations

import json
from typing import TYPE_CHECKING

__all__ = ["SubmissionChecker"]
Expand All @@ -31,6 +32,7 @@
from .models import warn as _warn
from .models.loader import (
load_accuracy_result,
load_accuracy_scores,
load_point_config,
load_result_summary,
load_system_description,
Expand All @@ -40,6 +42,19 @@
from pathlib import Path


def _results_has_accuracy_scores(path: Path) -> bool:
    """True if a results.json carries a non-empty ``accuracy_scores`` mapping.

    The file is read as raw bytes so ``json.loads`` picks the encoding itself
    (UTF-8 with or without BOM, UTF-16, UTF-32) rather than depending on the
    platform's locale encoding, which ``Path.read_text()`` uses by default.

    Returns False when the file cannot be read, is not valid JSON, is not a JSON
    object, or when ``accuracy_scores`` is absent, not a dict, or empty.
    """
    try:
        data = json.loads(path.read_bytes())
    except (OSError, ValueError):
        # Unreadable file, malformed JSON, or undecodable bytes (UnicodeDecodeError is
        # a ValueError): report "no accuracy here" instead of aborting the scan.
        return False
    if not isinstance(data, dict):
        return False
    scores = data.get("accuracy_scores")
    return isinstance(scores, dict) and bool(scores)


class SubmissionChecker:
"""Validates an MLPerf Endpoints submission directory against §9.1 rules.

Expand Down Expand Up @@ -111,13 +126,16 @@ def run(self) -> Report:
for system_json in system_jsons:
report.results.extend(self._check_system(system_json, pareto_dir))

# §15: at least one model in the submission must have an accuracy/results.json.
has_full_accuracy = any(True for _ in pareto_dir.rglob("accuracy/results.json"))
# §15: at least one model must carry accuracy results — either as accuracy_scores
# embedded in a results.json, or as a standalone accuracy/results.json.
has_full_accuracy = any(
True for _ in pareto_dir.rglob("accuracy/results.json")
) or any(_results_has_accuracy_scores(p) for p in pareto_dir.rglob("results.json"))
if has_full_accuracy:
report.results.append(
_ok(
"accuracy-present",
"At least one model has accuracy/results.json",
"At least one model has accuracy results",
pareto_dir,
"#15",
)
Expand All @@ -126,7 +144,8 @@ def run(self) -> Report:
report.results.append(
_err(
"accuracy-present",
"No model in this submission has accuracy/results.json",
"No model in this submission has accuracy results "
"(accuracy_scores in results.json or accuracy/results.json)",
pareto_dir,
"#15",
)
Expand Down Expand Up @@ -303,13 +322,30 @@ def _check_model(
results.extend(point_result._check_results)
loaded_points.append((config, summary))

# Load accuracy from per-point result dirs. Per-model warnings are only
# emitted when a point has an accuracy/ dir but files within it are absent.
# run() enforces that at least one model in the submission has both files.
# Load accuracy per point, preferring the accuracy_scores embedded in
# results.json, then falling back to a standalone accuracy/results.json.
# The first valid source wins; run() enforces that at least one model has one.
accuracy_dir: Path | None = None
accuracy_result = None
for config, _ in loaded_points:
pd = results_dir / f"point_{config.concurrency}" / "accuracy"
if accuracy_result is not None:
break
point_dir = results_dir / f"point_{config.concurrency}"

# Primary: accuracy_scores embedded in results.json.
results_json = point_dir / "results.json"
if results_json.exists():
loaded, acc_results, present = load_accuracy_scores(results_json)
if present:
results.extend(acc_results)
if loaded is not None and not any(
r.severity == Severity.ERROR for r in acc_results
):
accuracy_result, accuracy_dir = loaded, point_dir
continue

# Fallback: standalone accuracy/results.json (moved from the run archive).
pd = point_dir / "accuracy"
if not pd.is_dir():
continue
json_p = pd / "results.json"
Expand All @@ -322,13 +358,13 @@ def _check_model(
"#15",
)
)
elif accuracy_result is None:
accuracy_result, acc_results = load_accuracy_result(json_p)
else:
loaded, acc_results = load_accuracy_result(json_p)
results.extend(acc_results)
if any(r.severity == Severity.ERROR for r in acc_results):
accuracy_result = None
else:
accuracy_dir = pd
if loaded is not None and not any(
r.severity == Severity.ERROR for r in acc_results
):
accuracy_result, accuracy_dir = loaded, pd

# ModelContext validates point-count, regional-coverage, config-consistency, accuracy-gate
model_ctx = ModelContext(
Expand Down
20 changes: 20 additions & 0 deletions src/submission_checker/models/aggregate/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,26 @@ def _check_accuracy(self) -> ModelContext:
for ds_scores in all_scores.values():
flat_scores.update(ds_scores)

# Endpoints scorers (e.g. DeepSeekR1Scorer) write a single *unnamed* scalar
# `score` per dataset into results.json — the scorer's primary metric, with its
# identity dropped. When the model declares exactly one accuracy metric, gate
# that scalar against it; but WARN, because we cannot verify the scalar's
# identity, and any secondary metrics (e.g. tokens_per_sample) are absent from
# results.json and are therefore NOT checked.
if list(flat_scores) == ["score"] and len(thresholds) == 1:
only_metric = next(iter(thresholds))
self._check_results.append(
warn(
"accuracy-gate",
f"results.json exposes only an unnamed scalar accuracy score; "
f"gating it as '{only_metric}'. Secondary metrics (if any) are not "
f"present in results.json and are not checked.",
json_path,
"#15",
)
)
flat_scores = {only_metric: flat_scores["score"]}

for threshold_key, (lower, upper) in thresholds.items():
# Match score key case-insensitively
score: float | None = None
Expand Down
29 changes: 29 additions & 0 deletions src/submission_checker/models/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

__all__ = [
"load_accuracy_result",
"load_accuracy_scores",
"load_point_config",
"load_result_summary",
"load_system_description",
Expand Down Expand Up @@ -151,3 +152,31 @@ def load_accuracy_result(
return instance, list(instance._check_results)
except ValidationError as exc:
return None, _validation_errors(exc, "accuracy-valid", path)


def load_accuracy_scores(
    path: Path,
) -> tuple[AccuracyResult | None, list[CheckResult], bool]:
    """Build an ``AccuracyResult`` from the ``accuracy_scores`` field of a ``results.json``.

    This is the embedded counterpart of a standalone ``accuracy/results.json``: the
    value stored under ``accuracy_scores`` is validated directly as an
    ``AccuracyResult``, with *path* passed along as the ``json_path`` validation
    context.

    The return value is ``(model, check_results, present)``:

    * ``present`` is False, with no model and no check results, when the file
      cannot be loaded, is not a JSON object, or has no non-empty
      ``accuracy_scores`` mapping. Load problems are deliberately not reported
      from here (the result-summary loaders are expected to surface them), so
      accuracy is simply treated as absent.
    * ``present`` is True whenever a non-empty mapping was found, valid or not.
      On success ``model`` is the validated instance and ``check_results`` is a
      copy of its collected checks; on a validation failure ``model`` is None and
      ``check_results`` holds one entry per error under the ``accuracy-valid`` id.
    """
    payload, load_err = _load_json(path)
    if load_err or not isinstance(payload, dict):
        return None, [], False

    raw_scores = payload.get("accuracy_scores")
    if not (isinstance(raw_scores, dict) and raw_scores):
        return None, [], False

    try:
        model = AccuracyResult.model_validate(raw_scores, context={"json_path": path})
        findings = list(model._check_results)
    except ValidationError as exc:
        return None, _validation_errors(exc, "accuracy-valid", path), True
    return model, findings, True
Loading
Loading