From 24252afa9eecf61c1957801c6aaca7c505462adb Mon Sep 17 00:00:00 2001
From: Arav Agarwal <aravagar@umich.edu>
Date: Wed, 24 Jun 2026 15:21:17 -0400
Subject: [PATCH 1/4] Bugfix

---
 .../submissions/builder.py                    | 46 ++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/src/endpoints_submission_cli/submissions/builder.py b/src/endpoints_submission_cli/submissions/builder.py
index 299723a..69bb342 100644
--- a/src/endpoints_submission_cli/submissions/builder.py
+++ b/src/endpoints_submission_cli/submissions/builder.py
@@ -199,15 +199,59 @@ def _load_run_data(run_id: str, division: str, availability: str, run_dir: Path)
 
     system_info = _load_system_desc(base, run_id, division, availability)
 
+    extra_files = _load_extra_files(base)
+    # A combined run can be a performance run that *also* declares an accuracy
+    # dataset (e.g. datasets[0] is performance, datasets[1] is accuracy) without
+    # shipping a canonical accuracy/results.json. In that case results.json carries
+    # an `accuracy_scores` block already in the checker's AccuracyResult schema;
+    # surface it as accuracy/results.json so the accuracy run gets assembled
+    # regardless. (Restores the behavior previously provided by the removed
+    # _write_accuracy_fallback, but at the harvest stage so the write-loop places
+    # the file correctly.) Accuracy-typed runs already route results.json into
+    # accuracy/, so this only applies to performance runs to avoid colliding there.
+    if (
+        "accuracy/results.json" not in extra_files
+        and _extract_run_type(config) == "performance"
+        and _config_has_accuracy_dataset(config)
+    ):
+        derived = _derive_accuracy_results(extra_files.get("results.json"))
+        if derived is not None:
+            extra_files["accuracy/results.json"] = derived
+
     return {
         "run_id": run_id,
         "system_info": system_info,
         "config": config,
         "result_summary": result_summary,
-        "_extra_files": _load_extra_files(base),
+        "_extra_files": extra_files,
     }
 
 
+def _config_has_accuracy_dataset(config: dict[str, Any]) -> bool:
+    """Return True if *config* declares any dataset of type 'accuracy'."""
+    datasets = config.get("datasets", []) or []
+    return any(isinstance(d, dict) and d.get("type") == "accuracy" for d in datasets)
+
+
+def _derive_accuracy_results(results_bytes: bytes | None) -> bytes | None:
+    """Derive accuracy/results.json bytes from a run's results.json.
+
+    results.json carries an ``accuracy_scores`` block already in the checker's
+    AccuracyResult schema (``{dataset_name: {score, num_samples, ...}}``).
+    Return it serialized, or None when unavailable/malformed.
+    """
+    if not results_bytes:
+        return None
+    try:
+        parsed = json.loads(results_bytes)
+    except (json.JSONDecodeError, TypeError, ValueError):
+        return None
+    scores = parsed.get("accuracy_scores") if isinstance(parsed, dict) else None
+    if not scores:
+        return None
+    return json.dumps(scores, indent=2).encode("utf-8")
+
+
 def _load_system_desc(base: Path, run_id: str, division: str, availability: str) -> dict[str, Any]:
     """Load system_desc.json from a run folder and return the validated flat dict.
 

From 96bdc99eb41167d8f2fd64fd92348048777d63b5 Mon Sep 17 00:00:00 2001
From: Arav Agarwal <aravagar@umich.edu>
Date: Wed, 24 Jun 2026 16:15:52 -0400
Subject: [PATCH 2/4] Update accuracy tests

---
 src/submission_checker/accuracy_targets.py    | 22 +++++++++
 .../models/aggregate/context.py               | 20 ++++++++
 .../test_checks_aggregate.py                  | 48 +++++++++++++++++++
 3 files changed, 90 insertions(+)

diff --git a/src/submission_checker/accuracy_targets.py b/src/submission_checker/accuracy_targets.py
index 263a68d..df64e48 100644
--- a/src/submission_checker/accuracy_targets.py
+++ b/src/submission_checker/accuracy_targets.py
@@ -20,6 +20,28 @@
 # (model-keyword fragments, {metric → (lower, upper | None)}, min_queries)
 # More-specific entries must come before less-specific ones.
 _TARGETS: list[tuple[frozenset[str], dict[str, tuple[float, float | None]], int]] = [
+    # deepseek-r1  — golden fp32 exact_match 81.3582 (gate ≥ 0.99×).
+    # Only exact_match is gated: results.json's accuracy_scores expose a single scalar
+    # `score` (== exact_match). The canonical spec also bounds TOKENS_PER_SAMPLE
+    # (0.9–1.1 × 3886.2274), but that metric lives only in the deepseek_eval file, not
+    # in results.json, so it is intentionally NOT gated here.
+    (
+        frozenset({"deepseek", "r1"}),
+        {
+            "exact_match": (81.3582 * 0.99, None),
+        },
+        4388,
+    ),
+    # gpt-oss-120b  — golden fp32 exact_match 83.13; upstream tokens_per_sample upper
+    # bound is still a placeholder (constants.py:215), so it is intentionally omitted.
+    # min_queries uses the accuracy-sample-count (4395), not the perf count (6396).
+    (
+        frozenset({"gptoss", "120b"}),
+        {
+            "exact_match": (83.13 * 0.99, None),
+        },
+        4395,
+    ),
     # llama3.1-405b  — inference uses ROUGEL as primary; also exact_match + tokens
     (
         frozenset({"llama3", "405b"}),
diff --git a/src/submission_checker/models/aggregate/context.py b/src/submission_checker/models/aggregate/context.py
index 72b167b..b933cf4 100644
--- a/src/submission_checker/models/aggregate/context.py
+++ b/src/submission_checker/models/aggregate/context.py
@@ -248,6 +248,26 @@ def _check_accuracy(self) -> ModelContext:
         for ds_scores in all_scores.values():
             flat_scores.update(ds_scores)
 
+        # Endpoints scorers (e.g. DeepSeekR1Scorer) write a single *unnamed* scalar
+        # `score` per dataset into results.json — the scorer's primary metric, with its
+        # identity dropped. When the model declares exactly one accuracy metric, gate
+        # that scalar against it; but WARN, because we cannot verify the scalar's
+        # identity, and any secondary metrics (e.g. tokens_per_sample) are absent from
+        # results.json and are therefore NOT checked.
+        if list(flat_scores) == ["score"] and len(thresholds) == 1:
+            only_metric = next(iter(thresholds))
+            self._check_results.append(
+                warn(
+                    "accuracy-gate",
+                    f"results.json exposes only an unnamed scalar accuracy score; "
+                    f"gating it as '{only_metric}'. Secondary metrics (if any) are not "
+                    f"present in results.json and are not checked.",
+                    json_path,
+                    "#15",
+                )
+            )
+            flat_scores = {only_metric: flat_scores["score"]}
+
         for threshold_key, (lower, upper) in thresholds.items():
             # Match score key case-insensitively
             score: float | None = None
diff --git a/tests/submission_checker/test_checks_aggregate.py b/tests/submission_checker/test_checks_aggregate.py
index 7225fef..a40f227 100644
--- a/tests/submission_checker/test_checks_aggregate.py
+++ b/tests/submission_checker/test_checks_aggregate.py
@@ -551,3 +551,51 @@ def test_sample_count_missing_skips(self, tmp_path):
         )
         ctx = _model_ctx(tmp_path, accuracy_result=ar, model_name="Llama-3_1-8B-Instruct")
         assert not any(r.rule == "accuracy-sample-count" for r in ctx._check_results)
+
+    def test_scalar_score_gated_as_single_metric_passes(self, tmp_path):
+        # Endpoints results.json exposes a single *unnamed* scalar `score`
+        # (== exact_match for deepseek). With a single-metric threshold it is gated
+        # as that metric — with a warning — and 81.3355 ≥ 0.99*81.3582 (80.5446) passes.
+        ar = AccuracyResult({"deepseek_r1_accuracy": {"num_samples": 4388, "score": 81.3355}})
+        ctx = _model_ctx(tmp_path, accuracy_result=ar, model_name="deepseek_r1-torch-fp4")
+        assert any(
+            r.rule == "accuracy-gate"
+            and r.severity == Severity.WARNING
+            and "unnamed scalar" in r.message
+            for r in ctx._check_results
+        )
+        assert any(
+            r.rule == "accuracy-gate"
+            and r.severity == Severity.INFO
+            and "exact_match" in r.message
+            for r in ctx._check_results
+        )
+        assert not any(
+            r.rule == "accuracy-gate" and r.severity == Severity.ERROR
+            for r in ctx._check_results
+        )
+
+    def test_scalar_score_gated_as_single_metric_fails(self, tmp_path):
+        # 70.0 < 0.99*81.3582 (80.5446) → the mapped exact_match gate errors.
+        ar = AccuracyResult({"deepseek_r1_accuracy": {"num_samples": 4388, "score": 70.0}})
+        ctx = _model_ctx(tmp_path, accuracy_result=ar, model_name="deepseek_r1-torch-fp4")
+        assert any(
+            r.rule == "accuracy-gate"
+            and r.severity == Severity.ERROR
+            and "exact_match" in r.message
+            for r in ctx._check_results
+        )
+
+    def test_scalar_score_not_mapped_for_multimetric_model(self, tmp_path):
+        # llama3.1-8b declares 5 metrics, so a bare scalar `score` is ambiguous and
+        # must NOT be mapped — no scalar warning, and no metric comparison fires.
+        ar = AccuracyResult({"cnn_dailymail::llama3_8b": {"num_samples": 13368, "score": 39.0}})
+        ctx = _model_ctx(tmp_path, accuracy_result=ar, model_name="Llama-3_1-8B-Instruct")
+        assert not any(
+            r.rule == "accuracy-gate" and "unnamed scalar" in r.message
+            for r in ctx._check_results
+        )
+        assert not any(
+            r.rule == "accuracy-gate" and r.severity in (Severity.INFO, Severity.ERROR)
+            for r in ctx._check_results
+        )

From 3c3faf54b84425870a77a0ddef945f9f19ec18e7 Mon Sep 17 00:00:00 2001
From: Arav Agarwal <aravagar@umich.edu>
Date: Wed, 24 Jun 2026 16:59:15 -0400
Subject: [PATCH 3/4] Fix truncation and extraction behavior

---
 src/endpoints_submission_cli/exceptions.py    |   8 +
 .../submissions/builder.py                    |  32 +++-
 src/endpoints_submission_cli/truncation.py    |  61 +++++--
 .../submissions/test_builder.py               | 156 ++++++++++++++----
 4 files changed, 200 insertions(+), 57 deletions(-)

diff --git a/src/endpoints_submission_cli/exceptions.py b/src/endpoints_submission_cli/exceptions.py
index 2aad49e..cb0a0c5 100644
--- a/src/endpoints_submission_cli/exceptions.py
+++ b/src/endpoints_submission_cli/exceptions.py
@@ -39,3 +39,11 @@ class SubmissionBuildError(Exception):
 
 class SubmissionCheckError(Exception):
     """Raised when the Submission Checker reports validation errors."""
+
+
+class TruncationError(Exception):
+    """Raised when a results payload's ``responses`` cannot be truncated under the cap.
+
+    Guards against silently shipping an un-truncated (potentially multi-GB) payload
+    when ``responses`` is an unexpected shape the truncator does not handle.
+    """
diff --git a/src/endpoints_submission_cli/submissions/builder.py b/src/endpoints_submission_cli/submissions/builder.py
index 69bb342..5693cf0 100644
--- a/src/endpoints_submission_cli/submissions/builder.py
+++ b/src/endpoints_submission_cli/submissions/builder.py
@@ -214,7 +214,7 @@ def _load_run_data(run_id: str, division: str, availability: str, run_dir: Path)
         and _extract_run_type(config) == "performance"
         and _config_has_accuracy_dataset(config)
     ):
-        derived = _derive_accuracy_results(extra_files.get("results.json"))
+        derived = _unwrap_accuracy_scores(extra_files.get("results.json"))
         if derived is not None:
             extra_files["accuracy/results.json"] = derived
 
@@ -233,21 +233,27 @@ def _config_has_accuracy_dataset(config: dict[str, Any]) -> bool:
     return any(isinstance(d, dict) and d.get("type") == "accuracy" for d in datasets)
 
 
-def _derive_accuracy_results(results_bytes: bytes | None) -> bytes | None:
-    """Derive accuracy/results.json bytes from a run's results.json.
+def _unwrap_accuracy_scores(content: bytes | None) -> bytes | None:
+    """Unwrap a normal ``results.json`` blob into the checker's accuracy schema.
 
-    results.json carries an ``accuracy_scores`` block already in the checker's
-    AccuracyResult schema (``{dataset_name: {score, num_samples, ...}}``).
-    Return it serialized, or None when unavailable/malformed.
+    A run's ``results.json`` (and sometimes the accuracy file itself) is shaped as
+    ``{"config": ..., "results": ..., "accuracy_scores": {dataset: {...}}, "responses": [...]}``.
+    The checker's ``AccuracyResult`` expects only the per-dataset mapping
+    (``{dataset_name: {score, num_samples, ...}}``); a results.json-shaped file fails
+    ``accuracy-valid`` (its ``config``/``results``/``responses`` keys are not per-dataset
+    dicts). When *content* carries a top-level ``accuracy_scores`` mapping, return just
+    that mapping serialized. Return None when *content* is absent, unparseable, or
+    already in the expected per-dataset format (no ``accuracy_scores`` key) — in which
+    case the caller leaves it untouched.
     """
-    if not results_bytes:
+    if not content:
         return None
     try:
-        parsed = json.loads(results_bytes)
+        parsed = json.loads(content)
     except (json.JSONDecodeError, TypeError, ValueError):
         return None
     scores = parsed.get("accuracy_scores") if isinstance(parsed, dict) else None
-    if not scores:
+    if not isinstance(scores, dict) or not scores:
         return None
     return json.dumps(scores, indent=2).encode("utf-8")
 
@@ -501,6 +507,14 @@ def _write_pareto_entries(
                 else rel_path
             )
             dest = result_dir / dest_rel
+            # Whatever lands as <point>/accuracy/results.json must be in the checker's
+            # AccuracyResult schema. If it is instead a normal results.json blob (e.g. an
+            # accuracy-typed run whose results.json carries accuracy_scores), unwrap it to
+            # the per-dataset mapping so it doesn't fail accuracy-valid.
+            if dest.name == "results.json" and dest.parent.name == "accuracy":
+                unwrapped = _unwrap_accuracy_scores(content)
+                if unwrapped is not None:
+                    content = unwrapped
             dest.parent.mkdir(parents=True, exist_ok=True)
             dest.write_bytes(content)
 
diff --git a/src/endpoints_submission_cli/truncation.py b/src/endpoints_submission_cli/truncation.py
index 88b3c91..824fc6e 100644
--- a/src/endpoints_submission_cli/truncation.py
+++ b/src/endpoints_submission_cli/truncation.py
@@ -11,33 +11,62 @@
 
 import json
 
+from .exceptions import TruncationError
+
 __all__ = ["RESPONSES_LIMIT", "truncate_responses"]
 
 RESPONSES_LIMIT = 10 * 1024  # 10 KB
 
 
 def truncate_responses(content: bytes) -> bytes:
-    """Truncate the ``responses`` list in a results.json payload to stay under 10 KB.
+    """Truncate the ``responses`` collection in a results.json payload to stay under 10 KB.
 
-    Returns *content* unchanged when it is not JSON, has no ``responses`` list, or
-    already fits within the limit. Only the ``responses`` key is affected; all
-    other fields (accuracy scores, config, results, ...) are preserved.
+    ``responses`` may be either a list (``[entry, ...]``) or a dict keyed by sample
+    id (``{uuid: output, ...}``); both are produced by different run modes. Entries
+    are kept in iteration order until adding the next would exceed the limit. Returns
+    *content* unchanged when it is not JSON or has no non-empty ``responses``. Only the
+    ``responses`` key is affected; all other fields (accuracy scores, config, ...) are
+    preserved.
     """
     try:
         data = json.loads(content)
     except (json.JSONDecodeError, ValueError):
         return content
     responses = data.get("responses")
-    if not isinstance(responses, list) or not responses:
-        return content
-    # Walk items and stop as soon as adding the next one would exceed the limit.
-    # Each item contributes its own bytes plus 2 for the ", " separator after the first.
-    total = 2  # "[]"
-    idx = 0
-    for i, r in enumerate(responses):
-        total += len(json.dumps(r).encode()) + (2 if i > 0 else 0)
-        if total > RESPONSES_LIMIT:
-            break
-        idx = i + 1
-    data["responses"] = responses[:idx]
+    if responses is None or (isinstance(responses, (list, dict)) and not responses):
+        return content  # nothing to truncate
+    if isinstance(responses, list):
+        # Each item contributes its own bytes plus 2 for the ", " separator after the first.
+        total = 2  # "[]"
+        idx = 0
+        for i, r in enumerate(responses):
+            total += len(json.dumps(r).encode()) + (2 if i > 0 else 0)
+            if total > RESPONSES_LIMIT:
+                break
+            idx = i + 1
+        data["responses"] = responses[:idx]
+    elif isinstance(responses, dict):
+        # Each entry contributes ``"key": value`` plus 2 for the ", " separator after
+        # the first. Approximate the key/value cost conservatively to stay under the cap.
+        total = 2  # "{}"
+        kept: dict[str, object] = {}
+        for i, (k, v) in enumerate(responses.items()):
+            entry = len(json.dumps(k).encode()) + len(json.dumps(v).encode()) + 2  # ': '
+            if i > 0:
+                entry += 2  # ", "
+            if total + entry > RESPONSES_LIMIT:
+                break
+            total += entry
+            kept[k] = v
+        data["responses"] = kept
+    else:
+        # Unknown shape — refuse to ship a payload we cannot bound. A silent pass-through
+        # here is what let a multi-GB results.json reach a submission bundle.
+        raise TruncationError(
+            f"Cannot truncate 'responses' of type {type(responses).__name__}; "
+            "expected a list or dict."
+        )
+    # Defense-in-depth: never return a payload whose responses still exceed the cap.
+    if len(json.dumps(data["responses"]).encode()) > RESPONSES_LIMIT:
+        raise TruncationError("responses still exceed the size limit after truncation")
     return json.dumps(data, indent=2).encode()
diff --git a/tests/endpoints_submission_cli/submissions/test_builder.py b/tests/endpoints_submission_cli/submissions/test_builder.py
index 72f5458..e0f6505 100644
--- a/tests/endpoints_submission_cli/submissions/test_builder.py
+++ b/tests/endpoints_submission_cli/submissions/test_builder.py
@@ -11,7 +11,7 @@
 import pytest
 import yaml
 
-from endpoints_submission_cli.exceptions import SubmissionBuildError
+from endpoints_submission_cli.exceptions import SubmissionBuildError, TruncationError
 from endpoints_submission_cli.submissions.builder import (
     _compute_max_tps,
     _slugify,
@@ -20,6 +20,7 @@
     extract_archive,
 )
 from endpoints_submission_cli.truncation import truncate_responses as _truncate_responses
+from submission_checker.models.file import AccuracyResult
 
 
 @pytest.mark.unit
@@ -608,6 +609,40 @@ def test_large_responses_truncated_under_10kb(self) -> None:
         result = json.loads(result_bytes)
         assert len(json.dumps(result["responses"]).encode()) <= 10 * 1024
 
+    def _make_dict_content(self, n_responses: int) -> bytes:
+        # Some run modes emit responses as a {sample_uuid: output_text} mapping.
+        data = {
+            "config": {"mode": "perf"},
+            "accuracy_scores": {"ds": {"num_samples": n_responses, "score": 1.0}},
+            "responses": {f"uuid-{i:08d}": "hello world " * 5 for i in range(n_responses)},
+        }
+        return json.dumps(data).encode()
+
+    def test_small_dict_responses_unchanged(self) -> None:
+        content = self._make_dict_content(2)
+        result = json.loads(_truncate_responses(content))
+        assert result["responses"] == json.loads(content)["responses"]
+        assert result["accuracy_scores"] == {"ds": {"num_samples": 2, "score": 1.0}}
+
+    def test_large_dict_responses_truncated_under_10kb(self) -> None:
+        content = self._make_dict_content(10_000)
+        result = json.loads(_truncate_responses(content))
+        assert isinstance(result["responses"], dict)
+        assert len(json.dumps(result["responses"]).encode()) <= 10 * 1024
+        assert 0 < len(result["responses"]) < 10_000  # kept some, dropped the rest
+        # non-responses fields untouched
+        assert result["accuracy_scores"] == {"ds": {"num_samples": 10_000, "score": 1.0}}
+
+    def test_unknown_responses_type_raises_hard_error(self) -> None:
+        # A responses shape we cannot bound must fail loudly, never pass through silently.
+        content = json.dumps({"config": {}, "responses": "x" * 50_000}).encode()
+        with pytest.raises(TruncationError):
+            _truncate_responses(content)
+
+    def test_missing_responses_is_not_an_error(self) -> None:
+        content = json.dumps({"config": {}, "accuracy_scores": {}}).encode()
+        assert _truncate_responses(content) == content  # nothing to truncate, no raise
+
     def test_other_keys_preserved(self) -> None:
         content = self._make_content(10_000)
         result = json.loads(_truncate_responses(content))
@@ -641,8 +676,14 @@ def test_long_name_truncated(self) -> None:
 
 
 @pytest.mark.unit
-class TestAccuracyResultsTruncation:
-    """Tests for accuracy/results.json truncation across three workflow scenarios."""
+class TestAccuracyResultsNormalization:
+    """`accuracy/results.json` must end up in the checker's AccuracyResult schema.
+
+    A run's accuracy file is frequently a normal results.json blob (top-level
+    config/results/accuracy_scores/responses). The builder unwraps it to the bare
+    per-dataset mapping so it does not fail accuracy-valid; a file already in the
+    expected schema is left untouched.
+    """
 
     def _make_accuracy_archive(
         self,
@@ -669,55 +710,60 @@ def _make_accuracy_archive(
             tar.add(folder, arcname=name)
         return archive
 
-    def _big_results(self, count: int = 5000) -> dict:
+    _EXPECTED_SCORES = {"test_ds": {"dataset_name": "test_ds", "num_samples": 100, "score": 0.45}}
+
+    def _results_blob(self, count: int = 5000) -> dict:
+        """A normal results.json blob carrying accuracy_scores plus a large responses list."""
         return {
             "config": {},
-            "accuracy_scores": {"score": 0.45},
+            "results": {"total": 100},
+            "accuracy_scores": self._EXPECTED_SCORES,
             "responses": [{"text": "a" * 100, "idx": i} for i in range(count)],
         }
 
-    def test_no_accuracy_subdir_truncates_responses(
-        self, run_folder: Path, tmp_path: Path
-    ) -> None:
-        """Workflow 1/3: archive has only root results.json; responses written truncated to accuracy/results.json."""
+    def _assert_unwrapped_and_valid(self, written: dict) -> None:
+        # The bare per-dataset mapping only — no wrapper keys, no responses — and it
+        # must pass the checker's AccuracyResult schema.
+        assert "responses" not in written
+        assert "accuracy_scores" not in written and "config" not in written
+        assert written == self._EXPECTED_SCORES
+        AccuracyResult.model_validate(written)  # raises ValidationError if malformed
+
+    def test_root_results_blob_unwrapped(self, run_folder: Path, tmp_path: Path) -> None:
+        """Archive ships only a normal results.json blob → unwrapped to its accuracy_scores."""
         archive = self._make_accuracy_archive(
-            run_folder, tmp_path, "wf1", results_content=self._big_results()
+            run_folder, tmp_path, "wf1", results_content=self._results_blob()
         )
         sub_dir = build_submission_folder(
             [("run-001", archive)], "standardized", "available", tmp_path / "sub"
         )
         acc_files = list(sub_dir.rglob("accuracy/results.json"))
         assert len(acc_files) == 1
-        written = json.loads(acc_files[0].read_text())
-        assert len(json.dumps(written["responses"]).encode()) <= 10 * 1024
+        self._assert_unwrapped_and_valid(json.loads(acc_files[0].read_text()))
 
-    def test_accuracy_subdir_large_responses_truncated(
-        self, run_folder: Path, tmp_path: Path
-    ) -> None:
-        """Workflow 2: archive has accuracy/results.json with large responses; truncated on write."""
+    def test_accuracy_subdir_blob_unwrapped(self, run_folder: Path, tmp_path: Path) -> None:
+        """Archive ships a results.json-shaped accuracy/results.json → unwrapped."""
         archive = self._make_accuracy_archive(
-            run_folder, tmp_path, "wf2", acc_results_content=self._big_results()
+            run_folder, tmp_path, "wf2", acc_results_content=self._results_blob()
         )
         sub_dir = build_submission_folder(
             [("run-001", archive)], "standardized", "available", tmp_path / "sub"
         )
         acc_files = list(sub_dir.rglob("accuracy/results.json"))
         assert len(acc_files) == 1
-        written = json.loads(acc_files[0].read_text())
-        assert len(json.dumps(written["responses"]).encode()) <= 10 * 1024
-
-    def test_accuracy_subdir_already_small_written_intact(
-        self, run_folder: Path, tmp_path: Path
-    ) -> None:
-        """Workflow 3: archive has accuracy/results.json already within 10 KB; written without corruption."""
-        small_responses = [{"text": "hi", "idx": 0}]
-        acc_results = {
-            "config": {},
-            "accuracy_scores": {"score": 0.45},
-            "responses": small_responses,
+        self._assert_unwrapped_and_valid(json.loads(acc_files[0].read_text()))
+
+    def test_already_expected_format_left_intact(self, run_folder: Path, tmp_path: Path) -> None:
+        """An accuracy file already in the AccuracyResult schema is written through unchanged."""
+        correct = {
+            "cnn_dailymail::m": {
+                "dataset_name": "cnn_dailymail::m",
+                "num_samples": 13368,
+                "score": {"rouge1": 39.0},
+            }
         }
         archive = self._make_accuracy_archive(
-            run_folder, tmp_path, "wf3", acc_results_content=acc_results
+            run_folder, tmp_path, "wf3", acc_results_content=correct
         )
         sub_dir = build_submission_folder(
             [("run-001", archive)], "standardized", "available", tmp_path / "sub"
@@ -725,5 +771,51 @@ def test_accuracy_subdir_already_small_written_intact(
         acc_files = list(sub_dir.rglob("accuracy/results.json"))
         assert len(acc_files) == 1
         written = json.loads(acc_files[0].read_text())
-        assert written["responses"] == small_responses
-        assert written["accuracy_scores"] == acc_results["accuracy_scores"]
+        assert written == correct  # unchanged: no top-level accuracy_scores to unwrap
+        AccuracyResult.model_validate(written)
+
+
+@pytest.mark.unit
+class TestBuilderTruncatesResults:
+    """build_submission_folder truncates the verbose responses in the perf results.json."""
+
+    def _archive(self, run_folder: Path, tmp_path: Path, responses: object) -> Path:
+        import shutil
+
+        folder = tmp_path / "perf_run"
+        shutil.copytree(run_folder, folder)
+        results = json.loads((folder / "results.json").read_text())
+        results["responses"] = responses
+        (folder / "results.json").write_text(json.dumps(results))
+        archive = tmp_path / "perf_run.tar.gz"
+        with tarfile.open(archive, "w:gz") as tar:
+            tar.add(folder, arcname=folder.name)
+        return archive
+
+    def _perf_results(self, sub_dir: Path) -> dict:
+        perf = [p for p in sub_dir.rglob("results.json") if p.parent.name != "accuracy"]
+        assert len(perf) == 1
+        return json.loads(perf[0].read_text())
+
+    def test_dict_responses_truncated_in_bundle(self, run_folder: Path, tmp_path: Path) -> None:
+        # Regression: responses emitted as a {uuid: text} dict were not truncated.
+        big = {f"uuid-{i:08d}": "hello world " * 5 for i in range(10_000)}
+        sub_dir = build_submission_folder(
+            [("run-001", self._archive(run_folder, tmp_path, big))],
+            "standardized", "available", tmp_path / "sub",
+        )
+        written = self._perf_results(sub_dir)
+        assert isinstance(written["responses"], dict)
+        assert len(json.dumps(written["responses"]).encode()) <= 10 * 1024
+        assert 0 < len(written["responses"]) < 10_000
+
+    def test_list_responses_truncated_in_bundle(self, run_folder: Path, tmp_path: Path) -> None:
+        big = [{"text": "hello world", "idx": i} for i in range(10_000)]
+        sub_dir = build_submission_folder(
+            [("run-001", self._archive(run_folder, tmp_path, big))],
+            "standardized", "available", tmp_path / "sub",
+        )
+        written = self._perf_results(sub_dir)
+        assert isinstance(written["responses"], list)
+        assert len(json.dumps(written["responses"]).encode()) <= 10 * 1024
+        assert 0 < len(written["responses"]) < 10_000

From 3c44211d17e773d282f5c03ceaf53b91fec06f18 Mon Sep 17 00:00:00 2001
From: Arav Agarwal <aravagar@umich.edu>
Date: Wed, 24 Jun 2026 17:38:45 -0400
Subject: [PATCH 4/4] Update

---
 .../submissions/builder.py                    |  60 +----------
 src/submission_checker/checker.py             |  64 ++++++++---
 src/submission_checker/models/loader.py       |  29 +++++
 .../submissions/test_builder.py               | 101 ------------------
 tests/submission_checker/test_checker.py      |  28 +++++
 5 files changed, 108 insertions(+), 174 deletions(-)

diff --git a/src/endpoints_submission_cli/submissions/builder.py b/src/endpoints_submission_cli/submissions/builder.py
index 5693cf0..299723a 100644
--- a/src/endpoints_submission_cli/submissions/builder.py
+++ b/src/endpoints_submission_cli/submissions/builder.py
@@ -199,65 +199,15 @@ def _load_run_data(run_id: str, division: str, availability: str, run_dir: Path)
 
     system_info = _load_system_desc(base, run_id, division, availability)
 
-    extra_files = _load_extra_files(base)
-    # A combined run can be a performance run that *also* declares an accuracy
-    # dataset (e.g. datasets[0] is performance, datasets[1] is accuracy) without
-    # shipping a canonical accuracy/results.json. In that case results.json carries
-    # an `accuracy_scores` block already in the checker's AccuracyResult schema;
-    # surface it as accuracy/results.json so the accuracy run gets assembled
-    # regardless. (Restores the behavior previously provided by the removed
-    # _write_accuracy_fallback, but at the harvest stage so the write-loop places
-    # the file correctly.) Accuracy-typed runs already route results.json into
-    # accuracy/, so this only applies to performance runs to avoid colliding there.
-    if (
-        "accuracy/results.json" not in extra_files
-        and _extract_run_type(config) == "performance"
-        and _config_has_accuracy_dataset(config)
-    ):
-        derived = _unwrap_accuracy_scores(extra_files.get("results.json"))
-        if derived is not None:
-            extra_files["accuracy/results.json"] = derived
-
     return {
         "run_id": run_id,
         "system_info": system_info,
         "config": config,
         "result_summary": result_summary,
-        "_extra_files": extra_files,
+        "_extra_files": _load_extra_files(base),
     }
 
 
-def _config_has_accuracy_dataset(config: dict[str, Any]) -> bool:
-    """Return True if *config* declares any dataset of type 'accuracy'."""
-    datasets = config.get("datasets", []) or []
-    return any(isinstance(d, dict) and d.get("type") == "accuracy" for d in datasets)
-
-
-def _unwrap_accuracy_scores(content: bytes | None) -> bytes | None:
-    """Unwrap a normal ``results.json`` blob into the checker's accuracy schema.
-
-    A run's ``results.json`` (and sometimes the accuracy file itself) is shaped as
-    ``{"config": ..., "results": ..., "accuracy_scores": {dataset: {...}}, "responses": [...]}``.
-    The checker's ``AccuracyResult`` expects only the per-dataset mapping
-    (``{dataset_name: {score, num_samples, ...}}``); a results.json-shaped file fails
-    ``accuracy-valid`` (its ``config``/``results``/``responses`` keys are not per-dataset
-    dicts). When *content* carries a top-level ``accuracy_scores`` mapping, return just
-    that mapping serialized. Return None when *content* is absent, unparseable, or
-    already in the expected per-dataset format (no ``accuracy_scores`` key) — in which
-    case the caller leaves it untouched.
-    """
-    if not content:
-        return None
-    try:
-        parsed = json.loads(content)
-    except (json.JSONDecodeError, TypeError, ValueError):
-        return None
-    scores = parsed.get("accuracy_scores") if isinstance(parsed, dict) else None
-    if not isinstance(scores, dict) or not scores:
-        return None
-    return json.dumps(scores, indent=2).encode("utf-8")
-
-
 def _load_system_desc(base: Path, run_id: str, division: str, availability: str) -> dict[str, Any]:
     """Load system_desc.json from a run folder and return the validated flat dict.
 
@@ -507,14 +457,6 @@ def _write_pareto_entries(
                 else rel_path
             )
             dest = result_dir / dest_rel
-            # Whatever lands as <point>/accuracy/results.json must be in the checker's
-            # AccuracyResult schema. If it is instead a normal results.json blob (e.g. an
-            # accuracy-typed run whose results.json carries accuracy_scores), unwrap it to
-            # the per-dataset mapping so it doesn't fail accuracy-valid.
-            if dest.name == "results.json" and dest.parent.name == "accuracy":
-                unwrapped = _unwrap_accuracy_scores(content)
-                if unwrapped is not None:
-                    content = unwrapped
             dest.parent.mkdir(parents=True, exist_ok=True)
             dest.write_bytes(content)
 
diff --git a/src/submission_checker/checker.py b/src/submission_checker/checker.py
index e1d4535..9fffda0 100644
--- a/src/submission_checker/checker.py
+++ b/src/submission_checker/checker.py
@@ -6,6 +6,7 @@
 
 from __future__ import annotations
 
+import json
 from typing import TYPE_CHECKING
 
 __all__ = ["SubmissionChecker"]
@@ -31,6 +32,7 @@
 from .models import warn as _warn
 from .models.loader import (
     load_accuracy_result,
+    load_accuracy_scores,
     load_point_config,
     load_result_summary,
     load_system_description,
@@ -40,6 +42,19 @@
     from pathlib import Path
 
 
+def _results_has_accuracy_scores(path: Path) -> bool:
+    """True if a results.json carries a non-empty ``accuracy_scores`` mapping."""
+    try:
+        data = json.loads(path.read_text())
+    except (OSError, ValueError):
+        return False
+    return (
+        isinstance(data, dict)
+        and isinstance(data.get("accuracy_scores"), dict)
+        and bool(data["accuracy_scores"])
+    )
+
+
 class SubmissionChecker:
     """Validates an MLPerf Endpoints submission directory against §9.1 rules.
 
@@ -111,13 +126,16 @@ def run(self) -> Report:
         for system_json in system_jsons:
             report.results.extend(self._check_system(system_json, pareto_dir))
 
-        # §15: at least one model in the submission must have an accuracy/results.json.
-        has_full_accuracy = any(True for _ in pareto_dir.rglob("accuracy/results.json"))
+        # §15: at least one model must carry accuracy results — either as accuracy_scores
+        # embedded in a results.json, or as a standalone accuracy/results.json.
+        has_full_accuracy = any(
+            True for _ in pareto_dir.rglob("accuracy/results.json")
+        ) or any(_results_has_accuracy_scores(p) for p in pareto_dir.rglob("results.json"))
         if has_full_accuracy:
             report.results.append(
                 _ok(
                     "accuracy-present",
-                    "At least one model has accuracy/results.json",
+                    "At least one model has accuracy results",
                     pareto_dir,
                     "#15",
                 )
@@ -126,7 +144,8 @@ def run(self) -> Report:
             report.results.append(
                 _err(
                     "accuracy-present",
-                    "No model in this submission has accuracy/results.json",
+                    "No model in this submission has accuracy results "
+                    "(accuracy_scores in results.json or accuracy/results.json)",
                     pareto_dir,
                     "#15",
                 )
@@ -303,13 +322,30 @@ def _check_model(
             results.extend(point_result._check_results)
             loaded_points.append((config, summary))
 
-        # Load accuracy from per-point result dirs. Per-model warnings are only
-        # emitted when a point has an accuracy/ dir but files within it are absent.
-        # run() enforces that at least one model in the submission has both files.
+        # Load accuracy per point, preferring the accuracy_scores embedded in
+        # results.json, then falling back to a standalone accuracy/results.json.
+        # The first valid source wins; run() enforces that at least one model has one.
         accuracy_dir: Path | None = None
         accuracy_result = None
         for config, _ in loaded_points:
-            pd = results_dir / f"point_{config.concurrency}" / "accuracy"
+            if accuracy_result is not None:
+                break
+            point_dir = results_dir / f"point_{config.concurrency}"
+
+            # Primary: accuracy_scores embedded in results.json.
+            results_json = point_dir / "results.json"
+            if results_json.exists():
+                loaded, acc_results, present = load_accuracy_scores(results_json)
+                if present:
+                    results.extend(acc_results)
+                    if loaded is not None and not any(
+                        r.severity == Severity.ERROR for r in acc_results
+                    ):
+                        accuracy_result, accuracy_dir = loaded, point_dir
+                    continue
+
+            # Fallback: standalone accuracy/results.json (moved from the run archive).
+            pd = point_dir / "accuracy"
             if not pd.is_dir():
                 continue
             json_p = pd / "results.json"
@@ -322,13 +358,13 @@ def _check_model(
                         "#15",
                     )
                 )
-            elif accuracy_result is None:
-                accuracy_result, acc_results = load_accuracy_result(json_p)
+            else:
+                loaded, acc_results = load_accuracy_result(json_p)
                 results.extend(acc_results)
-                if any(r.severity == Severity.ERROR for r in acc_results):
-                    accuracy_result = None
-                else:
-                    accuracy_dir = pd
+                if loaded is not None and not any(
+                    r.severity == Severity.ERROR for r in acc_results
+                ):
+                    accuracy_result, accuracy_dir = loaded, pd
 
         # ModelContext validates point-count, regional-coverage, config-consistency, accuracy-gate
         model_ctx = ModelContext(
diff --git a/src/submission_checker/models/loader.py b/src/submission_checker/models/loader.py
index 329cc23..a6afd89 100644
--- a/src/submission_checker/models/loader.py
+++ b/src/submission_checker/models/loader.py
@@ -7,6 +7,7 @@
 
 __all__ = [
     "load_accuracy_result",
+    "load_accuracy_scores",
     "load_point_config",
     "load_result_summary",
     "load_system_description",
@@ -151,3 +152,31 @@ def load_accuracy_result(
         return instance, list(instance._check_results)
     except ValidationError as exc:
         return None, _validation_errors(exc, "accuracy-valid", path)
+
+
+def load_accuracy_scores(
+    path: Path,
+) -> tuple[AccuracyResult | None, list[CheckResult], bool]:
+    """Load accuracy from a ``results.json``'s ``accuracy_scores`` field.
+
+    The benchmark writes per-dataset accuracy directly into ``results.json`` under
+    ``accuracy_scores`` (already in the ``AccuracyResult`` schema). This reads that
+    field instead of a separate ``accuracy/results.json`` file.
+
+    Returns ``(model, check_results, present)``. ``present`` is True when the file
+    contains a non-empty ``accuracy_scores`` mapping (regardless of validity); a
+    missing/invalid ``results.json`` is reported by the result-summary loaders, so
+    accuracy is simply treated as absent here. On a validation failure the model is
+    None and ``check_results`` holds one entry per error.
+    """
+    data, load_err = _load_json(path)
+    if load_err or not isinstance(data, dict):
+        return None, [], False
+    scores = data.get("accuracy_scores")
+    if not isinstance(scores, dict) or not scores:
+        return None, [], False
+    try:
+        instance = AccuracyResult.model_validate(scores, context={"json_path": path})
+        return instance, list(instance._check_results), True
+    except ValidationError as exc:
+        return None, _validation_errors(exc, "accuracy-valid", path), True
diff --git a/tests/endpoints_submission_cli/submissions/test_builder.py b/tests/endpoints_submission_cli/submissions/test_builder.py
index e0f6505..955f734 100644
--- a/tests/endpoints_submission_cli/submissions/test_builder.py
+++ b/tests/endpoints_submission_cli/submissions/test_builder.py
@@ -20,7 +20,6 @@
     extract_archive,
 )
 from endpoints_submission_cli.truncation import truncate_responses as _truncate_responses
-from submission_checker.models.file import AccuracyResult
 
 
 @pytest.mark.unit
@@ -675,106 +674,6 @@ def test_long_name_truncated(self) -> None:
         assert len(_slugify(long)) <= 64
 
 
-@pytest.mark.unit
-class TestAccuracyResultsNormalization:
-    """`accuracy/results.json` must end up in the checker's AccuracyResult schema.
-
-    A run's accuracy file is frequently a normal results.json blob (top-level
-    config/results/accuracy_scores/responses). The builder unwraps it to the bare
-    per-dataset mapping so it does not fail accuracy-valid; a file already in the
-    expected schema is left untouched.
-    """
-
-    def _make_accuracy_archive(
-        self,
-        run_folder: Path,
-        tmp_path: Path,
-        name: str,
-        results_content: dict | None = None,
-        acc_results_content: dict | None = None,
-    ) -> Path:
-        import shutil
-
-        folder = tmp_path / name
-        shutil.copytree(run_folder, folder)
-        cfg = yaml.safe_load((folder / "config.yaml").read_text())
-        cfg["datasets"][0]["type"] = "accuracy"
-        (folder / "config.yaml").write_text(yaml.dump(cfg))
-        if results_content is not None:
-            (folder / "results.json").write_text(json.dumps(results_content))
-        if acc_results_content is not None:
-            (folder / "accuracy").mkdir(exist_ok=True)
-            (folder / "accuracy" / "results.json").write_text(json.dumps(acc_results_content))
-        archive = tmp_path / f"{name}.tar.gz"
-        with tarfile.open(archive, "w:gz") as tar:
-            tar.add(folder, arcname=name)
-        return archive
-
-    _EXPECTED_SCORES = {"test_ds": {"dataset_name": "test_ds", "num_samples": 100, "score": 0.45}}
-
-    def _results_blob(self, count: int = 5000) -> dict:
-        """A normal results.json blob carrying accuracy_scores plus a large responses list."""
-        return {
-            "config": {},
-            "results": {"total": 100},
-            "accuracy_scores": self._EXPECTED_SCORES,
-            "responses": [{"text": "a" * 100, "idx": i} for i in range(count)],
-        }
-
-    def _assert_unwrapped_and_valid(self, written: dict) -> None:
-        # The bare per-dataset mapping only — no wrapper keys, no responses — and it
-        # must pass the checker's AccuracyResult schema.
-        assert "responses" not in written
-        assert "accuracy_scores" not in written and "config" not in written
-        assert written == self._EXPECTED_SCORES
-        AccuracyResult.model_validate(written)  # raises ValidationError if malformed
-
-    def test_root_results_blob_unwrapped(self, run_folder: Path, tmp_path: Path) -> None:
-        """Archive ships only a normal results.json blob → unwrapped to its accuracy_scores."""
-        archive = self._make_accuracy_archive(
-            run_folder, tmp_path, "wf1", results_content=self._results_blob()
-        )
-        sub_dir = build_submission_folder(
-            [("run-001", archive)], "standardized", "available", tmp_path / "sub"
-        )
-        acc_files = list(sub_dir.rglob("accuracy/results.json"))
-        assert len(acc_files) == 1
-        self._assert_unwrapped_and_valid(json.loads(acc_files[0].read_text()))
-
-    def test_accuracy_subdir_blob_unwrapped(self, run_folder: Path, tmp_path: Path) -> None:
-        """Archive ships a results.json-shaped accuracy/results.json → unwrapped."""
-        archive = self._make_accuracy_archive(
-            run_folder, tmp_path, "wf2", acc_results_content=self._results_blob()
-        )
-        sub_dir = build_submission_folder(
-            [("run-001", archive)], "standardized", "available", tmp_path / "sub"
-        )
-        acc_files = list(sub_dir.rglob("accuracy/results.json"))
-        assert len(acc_files) == 1
-        self._assert_unwrapped_and_valid(json.loads(acc_files[0].read_text()))
-
-    def test_already_expected_format_left_intact(self, run_folder: Path, tmp_path: Path) -> None:
-        """An accuracy file already in the AccuracyResult schema is written through unchanged."""
-        correct = {
-            "cnn_dailymail::m": {
-                "dataset_name": "cnn_dailymail::m",
-                "num_samples": 13368,
-                "score": {"rouge1": 39.0},
-            }
-        }
-        archive = self._make_accuracy_archive(
-            run_folder, tmp_path, "wf3", acc_results_content=correct
-        )
-        sub_dir = build_submission_folder(
-            [("run-001", archive)], "standardized", "available", tmp_path / "sub"
-        )
-        acc_files = list(sub_dir.rglob("accuracy/results.json"))
-        assert len(acc_files) == 1
-        written = json.loads(acc_files[0].read_text())
-        assert written == correct  # unchanged: no top-level accuracy_scores to unwrap
-        AccuracyResult.model_validate(written)
-
-
 @pytest.mark.unit
 class TestBuilderTruncatesResults:
     """build_submission_folder truncates the verbose responses in the perf results.json."""
diff --git a/tests/submission_checker/test_checker.py b/tests/submission_checker/test_checker.py
index 86e4bec..2d18a3b 100644
--- a/tests/submission_checker/test_checker.py
+++ b/tests/submission_checker/test_checker.py
@@ -489,6 +489,34 @@ def test_invalid_accuracy_json(self, tmp_path):
         report = _check(root)
         assert _errors(report, "accuracy-valid")
 
+    def test_accuracy_scores_in_results_json(self, tmp_path):
+        """Accuracy is read from a point's results.json accuracy_scores (no accuracy/ dir)."""
+        root = _build_submission(tmp_path, write_accuracy_json=False)
+        # Drop accuracy_scores into the first point's results.json. The checker reads it
+        # directly; the empty accuracy/ dir is not consulted (no accuracy-file error).
+        first_point = (
+            root / "pareto" / "test-sys" / "llama3-70b" / "results" / f"point_{_CONCURRENCIES[0]}"
+        )
+        (first_point / "results.json").write_text(
+            json.dumps({"config": {}, "results": {}, "accuracy_scores": _ACCURACY, "responses": []})
+        )
+        report = _check(root)
+        assert not _errors(report, "accuracy-present")
+        assert not _errors(report, "accuracy-valid")
+        assert not _errors(report, "accuracy-file")
+
+    def test_invalid_accuracy_scores_in_results_json(self, tmp_path):
+        """accuracy-valid error when results.json accuracy_scores is malformed."""
+        root = _build_submission(tmp_path, write_accuracy_json=False)
+        first_point = (
+            root / "pareto" / "test-sys" / "llama3-70b" / "results" / f"point_{_CONCURRENCIES[0]}"
+        )
+        (first_point / "results.json").write_text(
+            json.dumps({"accuracy_scores": {"cnn_dailymail": "not-a-dict"}})
+        )
+        report = _check(root)
+        assert _errors(report, "accuracy-valid")
+
     def test_point_filename_concurrency_mismatch(self, tmp_path):
         """point-filename-concurrency warning when filename concurrency ≠ declared concurrency."""
         root = _build_submission(tmp_path)