mlcommons · arav-agarwal2 · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026
diff --git a/src/endpoints_submission_cli/exceptions.py b/src/endpoints_submission_cli/exceptions.py
@@ -39,3 +39,11 @@ class SubmissionBuildError(Exception):
 
 class SubmissionCheckError(Exception):
     """Raised when the Submission Checker reports validation errors."""
+
+
+class TruncationError(Exception):
+    """Raised when a results payload's ``responses`` cannot be brought under the size cap.
+
+    Failing loudly guards against silently shipping an un-truncated (potentially
+    multi-GB) payload when ``responses`` has a shape the truncator does not handle.
+    """
diff --git a/src/endpoints_submission_cli/truncation.py b/src/endpoints_submission_cli/truncation.py
@@ -11,33 +11,72 @@
 
 import json
 
+from .exceptions import TruncationError
+
 __all__ = ["RESPONSES_LIMIT", "truncate_responses"]
 
 RESPONSES_LIMIT = 10 * 1024  # 10 KB
 
 
 def truncate_responses(content: bytes) -> bytes:
-    """Truncate the ``responses`` list in a results.json payload to stay under 10 KB.
+    """Truncate the ``responses`` collection in a results.json payload to stay under 10 KB.
 
-    Returns *content* unchanged when it is not JSON, has no ``responses`` list, or
-    already fits within the limit. Only the ``responses`` key is affected; all
-    other fields (accuracy scores, config, results, ...) are preserved.
+    ``responses`` may be either a list (``[entry, ...]``) or a dict keyed by sample
+    id (``{uuid: output, ...}``); both are produced by different run modes. Entries
+    are kept in iteration order until adding the next would exceed the limit. Only the
+    ``responses`` key is affected; all other fields (accuracy scores, config, ...) are
+    preserved.
+
+    Returns *content* unchanged when it is not JSON, when the top-level JSON value is
+    not an object, or when ``responses`` is missing, null, or an empty list/dict.
+    Otherwise the payload is re-serialized with ``indent=2``.
+
+    Raises ``TruncationError`` when ``responses`` is neither a list nor a dict, or when
+    it still exceeds the limit after truncation.
     """
     try:
         data = json.loads(content)
     except (json.JSONDecodeError, ValueError):
         return content
+    if not isinstance(data, dict):
+        # Valid JSON that is not an object (array, string, number, ...) has no
+        # ``responses`` key; calling ``data.get`` on it would raise AttributeError.
+        return content
     responses = data.get("responses")
-    if not isinstance(responses, list) or not responses:
-        return content
-    # Walk items and stop as soon as adding the next one would exceed the limit.
-    # Each item contributes its own bytes plus 2 for the ", " separator after the first.
-    total = 2  # "[]"
-    idx = 0
-    for i, r in enumerate(responses):
-        total += len(json.dumps(r).encode()) + (2 if i > 0 else 0)
-        if total > RESPONSES_LIMIT:
-            break
-        idx = i + 1
-    data["responses"] = responses[:idx]
+    if responses is None or (isinstance(responses, (list, dict)) and not responses):
+        return content  # nothing to truncate
+    if isinstance(responses, list):
+        # Each item contributes its own bytes plus 2 for the ", " separator after the first.
+        total = 2  # "[]"
+        idx = 0
+        for i, r in enumerate(responses):
+            total += len(json.dumps(r).encode()) + (2 if i > 0 else 0)
+            if total > RESPONSES_LIMIT:
+                break
+            idx = i + 1
+        data["responses"] = responses[:idx]
+    elif isinstance(responses, dict):
+        # Each entry contributes ``"key": value`` plus 2 for the ", " separator after
+        # the first, matching the default json.dumps separators the final check measures.
+        total = 2  # "{}"
+        kept: dict[str, object] = {}
+        for i, (k, v) in enumerate(responses.items()):
+            entry = len(json.dumps(k).encode()) + len(json.dumps(v).encode()) + 2  # ': '
+            if i > 0:
+                entry += 2  # ", "
+            if total + entry > RESPONSES_LIMIT:
+                break
+            total += entry
+            kept[k] = v
+        data["responses"] = kept
+    else:
+        # Unknown shape — refuse to ship a payload we cannot bound. A silent pass-through
+        # here is what let a multi-GB results.json reach a submission bundle.
+        raise TruncationError(
+            f"Cannot truncate 'responses' of type {type(responses).__name__}; "
+            "expected a list or dict."
+        )
+    # Defense-in-depth: never return a payload whose responses still exceed the cap.
+    if len(json.dumps(data["responses"]).encode()) > RESPONSES_LIMIT:
+        raise TruncationError("responses still exceed the size limit after truncation")
     return json.dumps(data, indent=2).encode()
diff --git a/src/submission_checker/accuracy_targets.py b/src/submission_checker/accuracy_targets.py
@@ -20,6 +20,28 @@
 # (model-keyword fragments, {metric → (lower, upper | None)}, min_queries)
 # More-specific entries must come before less-specific ones.
 _TARGETS: list[tuple[frozenset[str], dict[str, tuple[float, float | None]], int]] = [
+    # deepseek-r1  — golden fp32 exact_match 81.3582 (gate ≥ 0.99×).
+    # Only exact_match is gated: results.json's accuracy_scores expose a single scalar
+    # `score` (== exact_match). The canonical spec also bounds TOKENS_PER_SAMPLE
+    # (0.9–1.1 × 3886.2274), but that metric lives only in the deepseek_eval file, not
+    # in results.json, so it is intentionally NOT gated here.
+    (
+        frozenset({"deepseek", "r1"}),
+        {
+            "exact_match": (81.3582 * 0.99, None),
+        },
+        4388,
+    ),
+    # gpt-oss-120b  — golden fp32 exact_match 83.13; upstream tokens_per_sample upper
+    # bound is still a placeholder (constants.py:215), so it is intentionally omitted.
+    # min_queries uses the accuracy-sample-count (4395), not the perf count (6396).
+    (
+        frozenset({"gptoss", "120b"}),
+        {
+            "exact_match": (83.13 * 0.99, None),
+        },
+        4395,
+    ),
     # llama3.1-405b  — inference uses ROUGEL as primary; also exact_match + tokens
     (
         frozenset({"llama3", "405b"}),

diff --git a/src/submission_checker/checker.py b/src/submission_checker/checker.py
@@ -6,6 +6,7 @@
 
 from __future__ import annotations
 
+import json
 from typing import TYPE_CHECKING
 
 __all__ = ["SubmissionChecker"]
@@ -31,6 +32,7 @@
 from .models import warn as _warn
 from .models.loader import (
     load_accuracy_result,
+    load_accuracy_scores,
     load_point_config,
     load_result_summary,
     load_system_description,
@@ -40,6 +42,19 @@
     from pathlib import Path
 
 
+def _results_has_accuracy_scores(path: Path) -> bool:
+    """True if *path* parses as a JSON object with a non-empty ``accuracy_scores`` dict."""
+    try:
+        data = json.loads(path.read_bytes())  # bytes: no locale-dependent decoding
+    except (OSError, ValueError):  # unreadable file, bad encoding, or malformed JSON
+        return False
+    return (
+        isinstance(data, dict)
+        and isinstance(data.get("accuracy_scores"), dict)
+        and bool(data["accuracy_scores"])
+    )
+
+
 class SubmissionChecker:
     """Validates an MLPerf Endpoints submission directory against §9.1 rules.
 
@@ -111,13 +126,16 @@ def run(self) -> Report:
         for system_json in system_jsons:
             report.results.extend(self._check_system(system_json, pareto_dir))
 
-        # §15: at least one model in the submission must have an accuracy/results.json.
-        has_full_accuracy = any(True for _ in pareto_dir.rglob("accuracy/results.json"))
+        # §15: at least one model must carry accuracy results — either as accuracy_scores
+        # embedded in a results.json, or as a standalone accuracy/results.json.
+        has_full_accuracy = any(
+            True for _ in pareto_dir.rglob("accuracy/results.json")
+        ) or any(_results_has_accuracy_scores(p) for p in pareto_dir.rglob("results.json"))
         if has_full_accuracy:
             report.results.append(
                 _ok(
                     "accuracy-present",
-                    "At least one model has accuracy/results.json",
+                    "At least one model has accuracy results",
                     pareto_dir,
                     "#15",
                 )
@@ -126,7 +144,8 @@ def run(self) -> Report:
             report.results.append(
                 _err(
                     "accuracy-present",
-                    "No model in this submission has accuracy/results.json",
+                    "No model in this submission has accuracy results "
+                    "(accuracy_scores in results.json or accuracy/results.json)",
                     pareto_dir,
                     "#15",
                 )
@@ -303,13 +322,30 @@ def _check_model(
             results.extend(point_result._check_results)
             loaded_points.append((config, summary))
 
-        # Load accuracy from per-point result dirs. Per-model warnings are only
-        # emitted when a point has an accuracy/ dir but files within it are absent.
-        # run() enforces that at least one model in the submission has both files.
+        # Load accuracy per point, preferring the accuracy_scores embedded in
+        # results.json, then falling back to a standalone accuracy/results.json.
+        # The first valid source wins; run() enforces that at least one model has one.
         accuracy_dir: Path | None = None
         accuracy_result = None
         for config, _ in loaded_points:
-            pd = results_dir / f"point_{config.concurrency}" / "accuracy"
+            if accuracy_result is not None:
+                break
+            point_dir = results_dir / f"point_{config.concurrency}"
+
+            # Primary: accuracy_scores embedded in results.json.
+            results_json = point_dir / "results.json"
+            if results_json.exists():
+                loaded, acc_results, present = load_accuracy_scores(results_json)
+                if present:
+                    results.extend(acc_results)
+                    if loaded is not None and not any(
+                        r.severity == Severity.ERROR for r in acc_results
+                    ):
+                        accuracy_result, accuracy_dir = loaded, point_dir
+                    continue
+
+            # Fallback: standalone accuracy/results.json (moved from the run archive).
+            pd = point_dir / "accuracy"
             if not pd.is_dir():
                 continue
             json_p = pd / "results.json"
@@ -322,13 +358,13 @@ def _check_model(
                         "#15",
                     )
                 )
-            elif accuracy_result is None:
-                accuracy_result, acc_results = load_accuracy_result(json_p)
+            else:
+                loaded, acc_results = load_accuracy_result(json_p)
                 results.extend(acc_results)
-                if any(r.severity == Severity.ERROR for r in acc_results):
-                    accuracy_result = None
-                else:
-                    accuracy_dir = pd
+                if loaded is not None and not any(
+                    r.severity == Severity.ERROR for r in acc_results
+                ):
+                    accuracy_result, accuracy_dir = loaded, pd
 
         # ModelContext validates point-count, regional-coverage, config-consistency, accuracy-gate
         model_ctx = ModelContext(

diff --git a/src/submission_checker/models/aggregate/context.py b/src/submission_checker/models/aggregate/context.py
@@ -248,6 +248,26 @@ def _check_accuracy(self) -> ModelContext:
         for ds_scores in all_scores.values():
             flat_scores.update(ds_scores)
 
+        # Endpoints scorers (e.g. DeepSeekR1Scorer) write a single *unnamed* scalar
+        # `score` per dataset into results.json — the scorer's primary metric, with its
+        # identity dropped. When the model declares exactly one accuracy metric, gate
+        # that scalar against it; but WARN, because we cannot verify the scalar's
+        # identity, and any secondary metrics (e.g. tokens_per_sample) are absent from
+        # results.json and are therefore NOT checked.
+        if list(flat_scores) == ["score"] and len(thresholds) == 1:
+            only_metric = next(iter(thresholds))
+            self._check_results.append(
+                warn(
+                    "accuracy-gate",
+                    f"results.json exposes only an unnamed scalar accuracy score; "
+                    f"gating it as '{only_metric}'. Secondary metrics (if any) are not "
+                    f"present in results.json and are not checked.",
+                    json_path,
+                    "#15",
+                )
+            )
+            flat_scores = {only_metric: flat_scores["score"]}
+
         for threshold_key, (lower, upper) in thresholds.items():
             # Match score key case-insensitively
             score: float | None = None

diff --git a/src/submission_checker/models/loader.py b/src/submission_checker/models/loader.py
@@ -7,6 +7,7 @@
 
 __all__ = [
     "load_accuracy_result",
+    "load_accuracy_scores",
     "load_point_config",
     "load_result_summary",
     "load_system_description",
@@ -151,3 +152,31 @@ def load_accuracy_result(
         return instance, list(instance._check_results)
     except ValidationError as exc:
         return None, _validation_errors(exc, "accuracy-valid", path)
+
+
+def load_accuracy_scores(
+    path: Path,
+) -> tuple[AccuracyResult | None, list[CheckResult], bool]:
+    """Read the ``accuracy_scores`` embedded in a ``results.json`` as an ``AccuracyResult``.
+
+    This is the in-file alternative to a separate ``accuracy/results.json``: the
+    ``accuracy_scores`` mapping is validated directly against ``AccuracyResult``.
+
+    Returns ``(model, check_results, present)``. ``present`` is False, with no model
+    and no check results, when *path* cannot be loaded, is not a JSON object, or has
+    no non-empty ``accuracy_scores`` dict; load problems are left for the
+    result-summary loaders to report. Otherwise ``present`` is True, and a validation
+    failure yields a None model plus the entries built by ``_validation_errors``.
+    """
+    payload, load_problem = _load_json(path)
+    # Anything short of a cleanly loaded JSON object leaves ``embedded`` as None.
+    embedded = None
+    if not load_problem and isinstance(payload, dict):
+        embedded = payload.get("accuracy_scores")
+    if not (isinstance(embedded, dict) and embedded):
+        return None, [], False
+    try:
+        validated = AccuracyResult.model_validate(embedded, context={"json_path": path})
+        return validated, list(validated._check_results), True
+    except ValidationError as exc:
+        return None, _validation_errors(exc, "accuracy-valid", path), True