diff --git a/src/endpoints_submission_cli/exceptions.py b/src/endpoints_submission_cli/exceptions.py
index 2aad49e..cb0a0c5 100644
--- a/src/endpoints_submission_cli/exceptions.py
+++ b/src/endpoints_submission_cli/exceptions.py
@@ -39,3 +39,11 @@ class SubmissionBuildError(Exception):
 
 class SubmissionCheckError(Exception):
     """Raised when the Submission Checker reports validation errors."""
+
+
+class TruncationError(Exception):
+    """Raised when a results payload's ``responses`` cannot be truncated under the cap.
+
+    Guards against silently shipping an un-truncated (potentially multi-GB) payload
+    when ``responses`` is an unexpected shape the truncator does not handle.
+    """
diff --git a/src/endpoints_submission_cli/truncation.py b/src/endpoints_submission_cli/truncation.py
index 88b3c91..824fc6e 100644
--- a/src/endpoints_submission_cli/truncation.py
+++ b/src/endpoints_submission_cli/truncation.py
@@ -11,33 +11,62 @@
 
 import json
 
+from .exceptions import TruncationError
+
 __all__ = ["RESPONSES_LIMIT", "truncate_responses"]
 
 RESPONSES_LIMIT = 10 * 1024  # 10 KB
 
 
 def truncate_responses(content: bytes) -> bytes:
-    """Truncate the ``responses`` list in a results.json payload to stay under 10 KB.
+    """Truncate the ``responses`` collection in a results.json payload to stay under 10 KB.
 
-    Returns *content* unchanged when it is not JSON, has no ``responses`` list, or
-    already fits within the limit. Only the ``responses`` key is affected; all
-    other fields (accuracy scores, config, results, ...) are preserved.
+    ``responses`` may be either a list (``[entry, ...]``) or a dict keyed by sample
+    id (``{uuid: output, ...}``); both are produced by different run modes. Entries
+    are kept in iteration order until adding the next would exceed the limit. Returns
+    *content* unchanged when it is not JSON or has no non-empty ``responses``. Only the
+    ``responses`` key is affected; all other fields (accuracy scores, config, ...) are
+    preserved.
     """
     try:
         data = json.loads(content)
     except (json.JSONDecodeError, ValueError):
         return content
     responses = data.get("responses")
-    if not isinstance(responses, list) or not responses:
-        return content
-    # Walk items and stop as soon as adding the next one would exceed the limit.
-    # Each item contributes its own bytes plus 2 for the ", " separator after the first.
-    total = 2  # "[]"
-    idx = 0
-    for i, r in enumerate(responses):
-        total += len(json.dumps(r).encode()) + (2 if i > 0 else 0)
-        if total > RESPONSES_LIMIT:
-            break
-        idx = i + 1
-    data["responses"] = responses[:idx]
+    if responses is None or (isinstance(responses, (list, dict)) and not responses):
+        return content  # nothing to truncate
+    if isinstance(responses, list):
+        # Each item contributes its own bytes plus 2 for the ", " separator after the first.
+        total = 2  # "[]"
+        idx = 0
+        for i, r in enumerate(responses):
+            total += len(json.dumps(r).encode()) + (2 if i > 0 else 0)
+            if total > RESPONSES_LIMIT:
+                break
+            idx = i + 1
+        data["responses"] = responses[:idx]
+    elif isinstance(responses, dict):
+        # Each entry contributes ``"key": value`` plus 2 for the ", " separator after
+        # the first. Approximate the key/value cost conservatively to stay under the cap.
+        total = 2  # "{}"
+        kept: dict[str, object] = {}
+        for i, (k, v) in enumerate(responses.items()):
+            entry = len(json.dumps(k).encode()) + len(json.dumps(v).encode()) + 2  # ': '
+            if i > 0:
+                entry += 2  # ", "
+            if total + entry > RESPONSES_LIMIT:
+                break
+            total += entry
+            kept[k] = v
+        data["responses"] = kept
+    else:
+        # Unknown shape — refuse to ship a payload we cannot bound. A silent pass-through
+        # here is what let a multi-GB results.json reach a submission bundle.
+        raise TruncationError(
+            f"Cannot truncate 'responses' of type {type(responses).__name__}; "
+            "expected a list or dict."
+        )
+    # Defense-in-depth: never return a payload whose responses still exceed the cap.
+    if len(json.dumps(data["responses"]).encode()) > RESPONSES_LIMIT:
+        raise TruncationError("responses still exceed the size limit after truncation")
     return json.dumps(data, indent=2).encode()
diff --git a/src/submission_checker/accuracy_targets.py b/src/submission_checker/accuracy_targets.py
index 263a68d..df64e48 100644
--- a/src/submission_checker/accuracy_targets.py
+++ b/src/submission_checker/accuracy_targets.py
@@ -20,6 +20,28 @@
 # (model-keyword fragments, {metric → (lower, upper | None)}, min_queries)
 # More-specific entries must come before less-specific ones.
 _TARGETS: list[tuple[frozenset[str], dict[str, tuple[float, float | None]], int]] = [
+    # deepseek-r1  — golden fp32 exact_match 81.3582 (gate ≥ 0.99×).
+    # Only exact_match is gated: results.json's accuracy_scores expose a single scalar
+    # `score` (== exact_match). The canonical spec also bounds TOKENS_PER_SAMPLE
+    # (0.9–1.1 × 3886.2274), but that metric lives only in the deepseek_eval file, not
+    # in results.json, so it is intentionally NOT gated here.
+    (
+        frozenset({"deepseek", "r1"}),
+        {
+            "exact_match": (81.3582 * 0.99, None),
+        },
+        4388,
+    ),
+    # gpt-oss-120b  — golden fp32 exact_match 83.13; upstream tokens_per_sample upper
+    # bound is still a placeholder (constants.py:215), so it is intentionally omitted.
+    # min_queries uses the accuracy-sample-count (4395), not the perf count (6396).
+    (
+        frozenset({"gptoss", "120b"}),
+        {
+            "exact_match": (83.13 * 0.99, None),
+        },
+        4395,
+    ),
     # llama3.1-405b  — inference uses ROUGEL as primary; also exact_match + tokens
     (
         frozenset({"llama3", "405b"}),
diff --git a/src/submission_checker/checker.py b/src/submission_checker/checker.py
index e1d4535..9fffda0 100644
--- a/src/submission_checker/checker.py
+++ b/src/submission_checker/checker.py
@@ -6,6 +6,7 @@
 
 from __future__ import annotations
 
+import json
 from typing import TYPE_CHECKING
 
 __all__ = ["SubmissionChecker"]
@@ -31,6 +32,7 @@
 from .models import warn as _warn
 from .models.loader import (
     load_accuracy_result,
+    load_accuracy_scores,
     load_point_config,
     load_result_summary,
     load_system_description,
@@ -40,6 +42,19 @@
     from pathlib import Path
 
 
+def _results_has_accuracy_scores(path: Path) -> bool:
+    """True if a results.json carries a non-empty ``accuracy_scores`` mapping."""
+    try:
+        data = json.loads(path.read_text())
+    except (OSError, ValueError):
+        return False
+    return (
+        isinstance(data, dict)
+        and isinstance(data.get("accuracy_scores"), dict)
+        and bool(data["accuracy_scores"])
+    )
+
+
 class SubmissionChecker:
     """Validates an MLPerf Endpoints submission directory against §9.1 rules.
 
@@ -111,13 +126,16 @@ def run(self) -> Report:
         for system_json in system_jsons:
             report.results.extend(self._check_system(system_json, pareto_dir))
 
-        # §15: at least one model in the submission must have an accuracy/results.json.
-        has_full_accuracy = any(True for _ in pareto_dir.rglob("accuracy/results.json"))
+        # §15: at least one model must carry accuracy results — either as accuracy_scores
+        # embedded in a results.json, or as a standalone accuracy/results.json.
+        has_full_accuracy = any(
+            True for _ in pareto_dir.rglob("accuracy/results.json")
+        ) or any(_results_has_accuracy_scores(p) for p in pareto_dir.rglob("results.json"))
         if has_full_accuracy:
             report.results.append(
                 _ok(
                     "accuracy-present",
-                    "At least one model has accuracy/results.json",
+                    "At least one model has accuracy results",
                     pareto_dir,
                     "#15",
                 )
@@ -126,7 +144,8 @@ def run(self) -> Report:
             report.results.append(
                 _err(
                     "accuracy-present",
-                    "No model in this submission has accuracy/results.json",
+                    "No model in this submission has accuracy results "
+                    "(accuracy_scores in results.json or accuracy/results.json)",
                     pareto_dir,
                     "#15",
                 )
@@ -303,13 +322,30 @@ def _check_model(
             results.extend(point_result._check_results)
             loaded_points.append((config, summary))
 
-        # Load accuracy from per-point result dirs. Per-model warnings are only
-        # emitted when a point has an accuracy/ dir but files within it are absent.
-        # run() enforces that at least one model in the submission has both files.
+        # Load accuracy per point, preferring the accuracy_scores embedded in
+        # results.json, then falling back to a standalone accuracy/results.json.
+        # The first valid source wins; run() enforces that at least one model has one.
         accuracy_dir: Path | None = None
         accuracy_result = None
         for config, _ in loaded_points:
-            pd = results_dir / f"point_{config.concurrency}" / "accuracy"
+            if accuracy_result is not None:
+                break
+            point_dir = results_dir / f"point_{config.concurrency}"
+
+            # Primary: accuracy_scores embedded in results.json.
+            results_json = point_dir / "results.json"
+            if results_json.exists():
+                loaded, acc_results, present = load_accuracy_scores(results_json)
+                if present:
+                    results.extend(acc_results)
+                    if loaded is not None and not any(
+                        r.severity == Severity.ERROR for r in acc_results
+                    ):
+                        accuracy_result, accuracy_dir = loaded, point_dir
+                    continue
+
+            # Fallback: standalone accuracy/results.json (moved from the run archive).
+            pd = point_dir / "accuracy"
             if not pd.is_dir():
                 continue
             json_p = pd / "results.json"
@@ -322,13 +358,13 @@ def _check_model(
                         "#15",
                     )
                 )
-            elif accuracy_result is None:
-                accuracy_result, acc_results = load_accuracy_result(json_p)
+            else:
+                loaded, acc_results = load_accuracy_result(json_p)
                 results.extend(acc_results)
-                if any(r.severity == Severity.ERROR for r in acc_results):
-                    accuracy_result = None
-                else:
-                    accuracy_dir = pd
+                if loaded is not None and not any(
+                    r.severity == Severity.ERROR for r in acc_results
+                ):
+                    accuracy_result, accuracy_dir = loaded, pd
 
         # ModelContext validates point-count, regional-coverage, config-consistency, accuracy-gate
         model_ctx = ModelContext(
diff --git a/src/submission_checker/models/aggregate/context.py b/src/submission_checker/models/aggregate/context.py
index 72b167b..b933cf4 100644
--- a/src/submission_checker/models/aggregate/context.py
+++ b/src/submission_checker/models/aggregate/context.py
@@ -248,6 +248,26 @@ def _check_accuracy(self) -> ModelContext:
         for ds_scores in all_scores.values():
             flat_scores.update(ds_scores)
 
+        # Endpoints scorers (e.g. DeepSeekR1Scorer) write a single *unnamed* scalar
+        # `score` per dataset into results.json — the scorer's primary metric, with its
+        # identity dropped. When the model declares exactly one accuracy metric, gate
+        # that scalar against it; but WARN, because we cannot verify the scalar's
+        # identity, and any secondary metrics (e.g. tokens_per_sample) are absent from
+        # results.json and are therefore NOT checked.
+        if list(flat_scores) == ["score"] and len(thresholds) == 1:
+            only_metric = next(iter(thresholds))
+            self._check_results.append(
+                warn(
+                    "accuracy-gate",
+                    f"results.json exposes only an unnamed scalar accuracy score; "
+                    f"gating it as '{only_metric}'. Secondary metrics (if any) are not "
+                    f"present in results.json and are not checked.",
+                    json_path,
+                    "#15",
+                )
+            )
+            flat_scores = {only_metric: flat_scores["score"]}
+
         for threshold_key, (lower, upper) in thresholds.items():
             # Match score key case-insensitively
             score: float | None = None
diff --git a/src/submission_checker/models/loader.py b/src/submission_checker/models/loader.py
index 329cc23..a6afd89 100644
--- a/src/submission_checker/models/loader.py
+++ b/src/submission_checker/models/loader.py
@@ -7,6 +7,7 @@
 
 __all__ = [
     "load_accuracy_result",
+    "load_accuracy_scores",
     "load_point_config",
     "load_result_summary",
     "load_system_description",
@@ -151,3 +152,31 @@ def load_accuracy_result(
         return instance, list(instance._check_results)
     except ValidationError as exc:
         return None, _validation_errors(exc, "accuracy-valid", path)
+
+
+def load_accuracy_scores(
+    path: Path,
+) -> tuple[AccuracyResult | None, list[CheckResult], bool]:
+    """Load accuracy from a ``results.json``'s ``accuracy_scores`` field.
+
+    The benchmark writes per-dataset accuracy directly into ``results.json`` under
+    ``accuracy_scores`` (already in the ``AccuracyResult`` schema). This reads that
+    field instead of a separate ``accuracy/results.json`` file.
+
+    Returns ``(model, check_results, present)``. ``present`` is True when the file
+    contains a non-empty ``accuracy_scores`` mapping (regardless of validity); a
+    missing/invalid ``results.json`` is reported by the result-summary loaders, so
+    accuracy is simply treated as absent here. On a validation failure the model is
+    None and ``check_results`` holds one entry per error.
+    """
+    data, load_err = _load_json(path)
+    if load_err or not isinstance(data, dict):
+        return None, [], False
+    scores = data.get("accuracy_scores")
+    if not isinstance(scores, dict) or not scores:
+        return None, [], False
+    try:
+        instance = AccuracyResult.model_validate(scores, context={"json_path": path})
+        return instance, list(instance._check_results), True
+    except ValidationError as exc:
+        return None, _validation_errors(exc, "accuracy-valid", path), True
diff --git a/tests/endpoints_submission_cli/submissions/test_builder.py b/tests/endpoints_submission_cli/submissions/test_builder.py
index 72f5458..955f734 100644
--- a/tests/endpoints_submission_cli/submissions/test_builder.py
+++ b/tests/endpoints_submission_cli/submissions/test_builder.py
@@ -11,7 +11,7 @@
 import pytest
 import yaml
 
-from endpoints_submission_cli.exceptions import SubmissionBuildError
+from endpoints_submission_cli.exceptions import SubmissionBuildError, TruncationError
 from endpoints_submission_cli.submissions.builder import (
     _compute_max_tps,
     _slugify,
@@ -608,6 +608,40 @@ def test_large_responses_truncated_under_10kb(self) -> None:
         result = json.loads(result_bytes)
         assert len(json.dumps(result["responses"]).encode()) <= 10 * 1024
 
+    def _make_dict_content(self, n_responses: int) -> bytes:
+        # Some run modes emit responses as a {sample_uuid: output_text} mapping.
+        data = {
+            "config": {"mode": "perf"},
+            "accuracy_scores": {"ds": {"num_samples": n_responses, "score": 1.0}},
+            "responses": {f"uuid-{i:08d}": "hello world " * 5 for i in range(n_responses)},
+        }
+        return json.dumps(data).encode()
+
+    def test_small_dict_responses_unchanged(self) -> None:
+        content = self._make_dict_content(2)
+        result = json.loads(_truncate_responses(content))
+        assert result["responses"] == json.loads(content)["responses"]
+        assert result["accuracy_scores"] == {"ds": {"num_samples": 2, "score": 1.0}}
+
+    def test_large_dict_responses_truncated_under_10kb(self) -> None:
+        content = self._make_dict_content(10_000)
+        result = json.loads(_truncate_responses(content))
+        assert isinstance(result["responses"], dict)
+        assert len(json.dumps(result["responses"]).encode()) <= 10 * 1024
+        assert 0 < len(result["responses"]) < 10_000  # kept some, dropped the rest
+        # non-responses fields untouched
+        assert result["accuracy_scores"] == {"ds": {"num_samples": 10_000, "score": 1.0}}
+
+    def test_unknown_responses_type_raises_hard_error(self) -> None:
+        # A responses shape we cannot bound must fail loudly, never pass through silently.
+        content = json.dumps({"config": {}, "responses": "x" * 50_000}).encode()
+        with pytest.raises(TruncationError):
+            _truncate_responses(content)
+
+    def test_missing_responses_is_not_an_error(self) -> None:
+        content = json.dumps({"config": {}, "accuracy_scores": {}}).encode()
+        assert _truncate_responses(content) == content  # nothing to truncate, no raise
+
     def test_other_keys_preserved(self) -> None:
         content = self._make_content(10_000)
         result = json.loads(_truncate_responses(content))
@@ -641,89 +675,46 @@ def test_long_name_truncated(self) -> None:
 
 
 @pytest.mark.unit
-class TestAccuracyResultsTruncation:
-    """Tests for accuracy/results.json truncation across three workflow scenarios."""
-
-    def _make_accuracy_archive(
-        self,
-        run_folder: Path,
-        tmp_path: Path,
-        name: str,
-        results_content: dict | None = None,
-        acc_results_content: dict | None = None,
-    ) -> Path:
+class TestBuilderTruncatesResults:
+    """build_submission_folder truncates the verbose responses in the perf results.json."""
+
+    def _archive(self, run_folder: Path, tmp_path: Path, responses: object) -> Path:
         import shutil
 
-        folder = tmp_path / name
+        folder = tmp_path / "perf_run"
         shutil.copytree(run_folder, folder)
-        cfg = yaml.safe_load((folder / "config.yaml").read_text())
-        cfg["datasets"][0]["type"] = "accuracy"
-        (folder / "config.yaml").write_text(yaml.dump(cfg))
-        if results_content is not None:
-            (folder / "results.json").write_text(json.dumps(results_content))
-        if acc_results_content is not None:
-            (folder / "accuracy").mkdir(exist_ok=True)
-            (folder / "accuracy" / "results.json").write_text(json.dumps(acc_results_content))
-        archive = tmp_path / f"{name}.tar.gz"
+        results = json.loads((folder / "results.json").read_text())
+        results["responses"] = responses
+        (folder / "results.json").write_text(json.dumps(results))
+        archive = tmp_path / "perf_run.tar.gz"
         with tarfile.open(archive, "w:gz") as tar:
-            tar.add(folder, arcname=name)
+            tar.add(folder, arcname=folder.name)
         return archive
 
-    def _big_results(self, count: int = 5000) -> dict:
-        return {
-            "config": {},
-            "accuracy_scores": {"score": 0.45},
-            "responses": [{"text": "a" * 100, "idx": i} for i in range(count)],
-        }
+    def _perf_results(self, sub_dir: Path) -> dict:
+        perf = [p for p in sub_dir.rglob("results.json") if p.parent.name != "accuracy"]
+        assert len(perf) == 1
+        return json.loads(perf[0].read_text())
 
-    def test_no_accuracy_subdir_truncates_responses(
-        self, run_folder: Path, tmp_path: Path
-    ) -> None:
-        """Workflow 1/3: archive has only root results.json; responses written truncated to accuracy/results.json."""
-        archive = self._make_accuracy_archive(
-            run_folder, tmp_path, "wf1", results_content=self._big_results()
-        )
+    def test_dict_responses_truncated_in_bundle(self, run_folder: Path, tmp_path: Path) -> None:
+        # Regression: responses emitted as a {uuid: text} dict were not truncated.
+        big = {f"uuid-{i:08d}": "hello world " * 5 for i in range(10_000)}
         sub_dir = build_submission_folder(
-            [("run-001", archive)], "standardized", "available", tmp_path / "sub"
+            [("run-001", self._archive(run_folder, tmp_path, big))],
+            "standardized", "available", tmp_path / "sub",
         )
-        acc_files = list(sub_dir.rglob("accuracy/results.json"))
-        assert len(acc_files) == 1
-        written = json.loads(acc_files[0].read_text())
+        written = self._perf_results(sub_dir)
+        assert isinstance(written["responses"], dict)
         assert len(json.dumps(written["responses"]).encode()) <= 10 * 1024
+        assert 0 < len(written["responses"]) < 10_000
 
-    def test_accuracy_subdir_large_responses_truncated(
-        self, run_folder: Path, tmp_path: Path
-    ) -> None:
-        """Workflow 2: archive has accuracy/results.json with large responses; truncated on write."""
-        archive = self._make_accuracy_archive(
-            run_folder, tmp_path, "wf2", acc_results_content=self._big_results()
-        )
+    def test_list_responses_truncated_in_bundle(self, run_folder: Path, tmp_path: Path) -> None:
+        big = [{"text": "hello world", "idx": i} for i in range(10_000)]
         sub_dir = build_submission_folder(
-            [("run-001", archive)], "standardized", "available", tmp_path / "sub"
+            [("run-001", self._archive(run_folder, tmp_path, big))],
+            "standardized", "available", tmp_path / "sub",
         )
-        acc_files = list(sub_dir.rglob("accuracy/results.json"))
-        assert len(acc_files) == 1
-        written = json.loads(acc_files[0].read_text())
+        written = self._perf_results(sub_dir)
+        assert isinstance(written["responses"], list)
         assert len(json.dumps(written["responses"]).encode()) <= 10 * 1024
-
-    def test_accuracy_subdir_already_small_written_intact(
-        self, run_folder: Path, tmp_path: Path
-    ) -> None:
-        """Workflow 3: archive has accuracy/results.json already within 10 KB; written without corruption."""
-        small_responses = [{"text": "hi", "idx": 0}]
-        acc_results = {
-            "config": {},
-            "accuracy_scores": {"score": 0.45},
-            "responses": small_responses,
-        }
-        archive = self._make_accuracy_archive(
-            run_folder, tmp_path, "wf3", acc_results_content=acc_results
-        )
-        sub_dir = build_submission_folder(
-            [("run-001", archive)], "standardized", "available", tmp_path / "sub"
-        )
-        acc_files = list(sub_dir.rglob("accuracy/results.json"))
-        assert len(acc_files) == 1
-        written = json.loads(acc_files[0].read_text())
-        assert written["responses"] == small_responses
-        assert written["accuracy_scores"] == acc_results["accuracy_scores"]
+        assert 0 < len(written["responses"]) < 10_000
diff --git a/tests/submission_checker/test_checker.py b/tests/submission_checker/test_checker.py
index 86e4bec..2d18a3b 100644
--- a/tests/submission_checker/test_checker.py
+++ b/tests/submission_checker/test_checker.py
@@ -489,6 +489,34 @@ def test_invalid_accuracy_json(self, tmp_path):
         report = _check(root)
         assert _errors(report, "accuracy-valid")
 
+    def test_accuracy_scores_in_results_json(self, tmp_path):
+        """Accuracy is read from a point's results.json accuracy_scores (no accuracy/ dir)."""
+        root = _build_submission(tmp_path, write_accuracy_json=False)
+        # Drop accuracy_scores into the first point's results.json. The checker reads it
+        # directly; the empty accuracy/ dir is not consulted (no accuracy-file error).
+        first_point = (
+            root / "pareto" / "test-sys" / "llama3-70b" / "results" / f"point_{_CONCURRENCIES[0]}"
+        )
+        (first_point / "results.json").write_text(
+            json.dumps({"config": {}, "results": {}, "accuracy_scores": _ACCURACY, "responses": []})
+        )
+        report = _check(root)
+        assert not _errors(report, "accuracy-present")
+        assert not _errors(report, "accuracy-valid")
+        assert not _errors(report, "accuracy-file")
+
+    def test_invalid_accuracy_scores_in_results_json(self, tmp_path):
+        """accuracy-valid error when results.json accuracy_scores is malformed."""
+        root = _build_submission(tmp_path, write_accuracy_json=False)
+        first_point = (
+            root / "pareto" / "test-sys" / "llama3-70b" / "results" / f"point_{_CONCURRENCIES[0]}"
+        )
+        (first_point / "results.json").write_text(
+            json.dumps({"accuracy_scores": {"cnn_dailymail": "not-a-dict"}})
+        )
+        report = _check(root)
+        assert _errors(report, "accuracy-valid")
+
     def test_point_filename_concurrency_mismatch(self, tmp_path):
         """point-filename-concurrency warning when filename concurrency ≠ declared concurrency."""
         root = _build_submission(tmp_path)
diff --git a/tests/submission_checker/test_checks_aggregate.py b/tests/submission_checker/test_checks_aggregate.py
index 7225fef..a40f227 100644
--- a/tests/submission_checker/test_checks_aggregate.py
+++ b/tests/submission_checker/test_checks_aggregate.py
@@ -551,3 +551,51 @@ def test_sample_count_missing_skips(self, tmp_path):
         )
         ctx = _model_ctx(tmp_path, accuracy_result=ar, model_name="Llama-3_1-8B-Instruct")
         assert not any(r.rule == "accuracy-sample-count" for r in ctx._check_results)
+
+    def test_scalar_score_gated_as_single_metric_passes(self, tmp_path):
+        # Endpoints results.json exposes a single *unnamed* scalar `score`
+        # (== exact_match for deepseek). With a single-metric threshold it is gated
+        # as that metric — with a warning — and 81.3355 ≥ 0.99*81.3582 (80.5446) passes.
+        ar = AccuracyResult({"deepseek_r1_accuracy": {"num_samples": 4388, "score": 81.3355}})
+        ctx = _model_ctx(tmp_path, accuracy_result=ar, model_name="deepseek_r1-torch-fp4")
+        assert any(
+            r.rule == "accuracy-gate"
+            and r.severity == Severity.WARNING
+            and "unnamed scalar" in r.message
+            for r in ctx._check_results
+        )
+        assert any(
+            r.rule == "accuracy-gate"
+            and r.severity == Severity.INFO
+            and "exact_match" in r.message
+            for r in ctx._check_results
+        )
+        assert not any(
+            r.rule == "accuracy-gate" and r.severity == Severity.ERROR
+            for r in ctx._check_results
+        )
+
+    def test_scalar_score_gated_as_single_metric_fails(self, tmp_path):
+        # 70.0 < 0.99*81.3582 (80.5446) → the mapped exact_match gate errors.
+        ar = AccuracyResult({"deepseek_r1_accuracy": {"num_samples": 4388, "score": 70.0}})
+        ctx = _model_ctx(tmp_path, accuracy_result=ar, model_name="deepseek_r1-torch-fp4")
+        assert any(
+            r.rule == "accuracy-gate"
+            and r.severity == Severity.ERROR
+            and "exact_match" in r.message
+            for r in ctx._check_results
+        )
+
+    def test_scalar_score_not_mapped_for_multimetric_model(self, tmp_path):
+        # llama3.1-8b declares 5 metrics, so a bare scalar `score` is ambiguous and
+        # must NOT be mapped — no scalar warning, and no metric comparison fires.
+        ar = AccuracyResult({"cnn_dailymail::llama3_8b": {"num_samples": 13368, "score": 39.0}})
+        ctx = _model_ctx(tmp_path, accuracy_result=ar, model_name="Llama-3_1-8B-Instruct")
+        assert not any(
+            r.rule == "accuracy-gate" and "unnamed scalar" in r.message
+            for r in ctx._check_results
+        )
+        assert not any(
+            r.rule == "accuracy-gate" and r.severity in (Severity.INFO, Severity.ERROR)
+            for r in ctx._check_results
+        )