diff --git a/src/endpoints_submission_cli/exceptions.py b/src/endpoints_submission_cli/exceptions.py index 2aad49e..cb0a0c5 100644 --- a/src/endpoints_submission_cli/exceptions.py +++ b/src/endpoints_submission_cli/exceptions.py @@ -39,3 +39,11 @@ class SubmissionBuildError(Exception): class SubmissionCheckError(Exception): """Raised when the Submission Checker reports validation errors.""" + + +class TruncationError(Exception): + """Raised when a results payload's ``responses`` cannot be truncated under the cap. + + Guards against silently shipping an un-truncated (potentially multi-GB) payload + when ``responses`` is an unexpected shape the truncator does not handle. + """ diff --git a/src/endpoints_submission_cli/truncation.py b/src/endpoints_submission_cli/truncation.py index 88b3c91..824fc6e 100644 --- a/src/endpoints_submission_cli/truncation.py +++ b/src/endpoints_submission_cli/truncation.py @@ -11,33 +11,62 @@ import json +from .exceptions import TruncationError + __all__ = ["RESPONSES_LIMIT", "truncate_responses"] RESPONSES_LIMIT = 10 * 1024 # 10 KB def truncate_responses(content: bytes) -> bytes: - """Truncate the ``responses`` list in a results.json payload to stay under 10 KB. + """Truncate the ``responses`` collection in a results.json payload to stay under 10 KB. - Returns *content* unchanged when it is not JSON, has no ``responses`` list, or - already fits within the limit. Only the ``responses`` key is affected; all - other fields (accuracy scores, config, results, ...) are preserved. + ``responses`` may be either a list (``[entry, ...]``) or a dict keyed by sample + id (``{uuid: output, ...}``); both are produced by different run modes. Entries + are kept in iteration order until adding the next would exceed the limit. Returns + *content* unchanged when it is not JSON or has no non-empty ``responses``. Only the + ``responses`` key is affected; all other fields (accuracy scores, config, ...) are + preserved. """ try: data = json.loads(content) except (json.JSONDecodeError, ValueError): return content responses = data.get("responses") - if not isinstance(responses, list) or not responses: - return content - # Walk items and stop as soon as adding the next one would exceed the limit. - # Each item contributes its own bytes plus 2 for the ", " separator after the first. - total = 2 # "[]" - idx = 0 - for i, r in enumerate(responses): - total += len(json.dumps(r).encode()) + (2 if i > 0 else 0) - if total > RESPONSES_LIMIT: - break - idx = i + 1 - data["responses"] = responses[:idx] + if responses is None or (isinstance(responses, (list, dict)) and not responses): + return content # nothing to truncate + if isinstance(responses, list): + # Each item contributes its own bytes plus 2 for the ", " separator after the first. + total = 2 # "[]" + idx = 0 + for i, r in enumerate(responses): + total += len(json.dumps(r).encode()) + (2 if i > 0 else 0) + if total > RESPONSES_LIMIT: + break + idx = i + 1 + data["responses"] = responses[:idx] + elif isinstance(responses, dict): + # Each entry contributes ``"key": value`` plus 2 for the ", " separator after + # the first. Approximate the key/value cost conservatively to stay under the cap. + total = 2 # "{}" + kept: dict[str, object] = {} + for i, (k, v) in enumerate(responses.items()): + entry = len(json.dumps(k).encode()) + len(json.dumps(v).encode()) + 2 # ': ' + if i > 0: + entry += 2 # ", " + if total + entry > RESPONSES_LIMIT: + break + total += entry + kept[k] = v + data["responses"] = kept + else: + # Unknown shape — refuse to ship a payload we cannot bound. A silent pass-through + # here is what let a multi-GB results.json reach a submission bundle. + raise TruncationError( + f"Cannot truncate 'responses' of type {type(responses).__name__}; " + "expected a list or dict." + ) + # Defense-in-depth: never return a payload whose responses still exceed the cap. + if len(json.dumps(data["responses"]).encode()) > RESPONSES_LIMIT: + raise TruncationError("responses still exceed the size limit after truncation") return json.dumps(data, indent=2).encode() diff --git a/src/submission_checker/accuracy_targets.py b/src/submission_checker/accuracy_targets.py index 263a68d..df64e48 100644 --- a/src/submission_checker/accuracy_targets.py +++ b/src/submission_checker/accuracy_targets.py @@ -20,6 +20,28 @@ # (model-keyword fragments, {metric → (lower, upper | None)}, min_queries) # More-specific entries must come before less-specific ones. _TARGETS: list[tuple[frozenset[str], dict[str, tuple[float, float | None]], int]] = [ + # deepseek-r1 — golden fp32 exact_match 81.3582 (gate ≥ 0.99×). + # Only exact_match is gated: results.json's accuracy_scores expose a single scalar + # `score` (== exact_match). The canonical spec also bounds TOKENS_PER_SAMPLE + # (0.9–1.1 × 3886.2274), but that metric lives only in the deepseek_eval file, not + # in results.json, so it is intentionally NOT gated here. + ( + frozenset({"deepseek", "r1"}), + { + "exact_match": (81.3582 * 0.99, None), + }, + 4388, + ), + # gpt-oss-120b — golden fp32 exact_match 83.13; upstream tokens_per_sample upper + # bound is still a placeholder (constants.py:215), so it is intentionally omitted. + # min_queries uses the accuracy-sample-count (4395), not the perf count (6396). + ( + frozenset({"gptoss", "120b"}), + { + "exact_match": (83.13 * 0.99, None), + }, + 4395, + ), # llama3.1-405b — inference uses ROUGEL as primary; also exact_match + tokens ( frozenset({"llama3", "405b"}), diff --git a/src/submission_checker/checker.py b/src/submission_checker/checker.py index e1d4535..9fffda0 100644 --- a/src/submission_checker/checker.py +++ b/src/submission_checker/checker.py @@ -6,6 +6,7 @@ from __future__ import annotations +import json from typing import TYPE_CHECKING __all__ = ["SubmissionChecker"] @@ -31,6 +32,7 @@ from .models import warn as _warn from .models.loader import ( load_accuracy_result, + load_accuracy_scores, load_point_config, load_result_summary, load_system_description, @@ -40,6 +42,19 @@ from pathlib import Path +def _results_has_accuracy_scores(path: Path) -> bool: + """True if a results.json carries a non-empty ``accuracy_scores`` mapping.""" + try: + data = json.loads(path.read_text()) + except (OSError, ValueError): + return False + return ( + isinstance(data, dict) + and isinstance(data.get("accuracy_scores"), dict) + and bool(data["accuracy_scores"]) + ) + + class SubmissionChecker: """Validates an MLPerf Endpoints submission directory against §9.1 rules. @@ -111,13 +126,16 @@ def run(self) -> Report: for system_json in system_jsons: report.results.extend(self._check_system(system_json, pareto_dir)) - # §15: at least one model in the submission must have an accuracy/results.json. - has_full_accuracy = any(True for _ in pareto_dir.rglob("accuracy/results.json")) + # §15: at least one model must carry accuracy results — either as accuracy_scores + # embedded in a results.json, or as a standalone accuracy/results.json. + has_full_accuracy = any( + True for _ in pareto_dir.rglob("accuracy/results.json") + ) or any(_results_has_accuracy_scores(p) for p in pareto_dir.rglob("results.json")) if has_full_accuracy: report.results.append( _ok( "accuracy-present", - "At least one model has accuracy/results.json", + "At least one model has accuracy results", pareto_dir, "#15", ) @@ -126,7 +144,8 @@ def run(self) -> Report: report.results.append( _err( "accuracy-present", - "No model in this submission has accuracy/results.json", + "No model in this submission has accuracy results " + "(accuracy_scores in results.json or accuracy/results.json)", pareto_dir, "#15", ) @@ -303,13 +322,30 @@ def _check_model( results.extend(point_result._check_results) loaded_points.append((config, summary)) - # Load accuracy from per-point result dirs. Per-model warnings are only - # emitted when a point has an accuracy/ dir but files within it are absent. - # run() enforces that at least one model in the submission has both files. + # Load accuracy per point, preferring the accuracy_scores embedded in + # results.json, then falling back to a standalone accuracy/results.json. + # The first valid source wins; run() enforces that at least one model has one. accuracy_dir: Path | None = None accuracy_result = None for config, _ in loaded_points: - pd = results_dir / f"point_{config.concurrency}" / "accuracy" + if accuracy_result is not None: + break + point_dir = results_dir / f"point_{config.concurrency}" + + # Primary: accuracy_scores embedded in results.json. + results_json = point_dir / "results.json" + if results_json.exists(): + loaded, acc_results, present = load_accuracy_scores(results_json) + if present: + results.extend(acc_results) + if loaded is not None and not any( + r.severity == Severity.ERROR for r in acc_results + ): + accuracy_result, accuracy_dir = loaded, point_dir + continue + + # Fallback: standalone accuracy/results.json (moved from the run archive). + pd = point_dir / "accuracy" if not pd.is_dir(): continue json_p = pd / "results.json" @@ -322,13 +358,13 @@ def _check_model( "#15", ) ) - elif accuracy_result is None: - accuracy_result, acc_results = load_accuracy_result(json_p) + else: + loaded, acc_results = load_accuracy_result(json_p) results.extend(acc_results) - if any(r.severity == Severity.ERROR for r in acc_results): - accuracy_result = None - else: - accuracy_dir = pd + if loaded is not None and not any( + r.severity == Severity.ERROR for r in acc_results + ): + accuracy_result, accuracy_dir = loaded, pd # ModelContext validates point-count, regional-coverage, config-consistency, accuracy-gate model_ctx = ModelContext( diff --git a/src/submission_checker/models/aggregate/context.py b/src/submission_checker/models/aggregate/context.py index 72b167b..b933cf4 100644 --- a/src/submission_checker/models/aggregate/context.py +++ b/src/submission_checker/models/aggregate/context.py @@ -248,6 +248,26 @@ def _check_accuracy(self) -> ModelContext: for ds_scores in all_scores.values(): flat_scores.update(ds_scores) + # Endpoints scorers (e.g. DeepSeekR1Scorer) write a single *unnamed* scalar + # `score` per dataset into results.json — the scorer's primary metric, with its + # identity dropped. When the model declares exactly one accuracy metric, gate + # that scalar against it; but WARN, because we cannot verify the scalar's + # identity, and any secondary metrics (e.g. tokens_per_sample) are absent from + # results.json and are therefore NOT checked. + if list(flat_scores) == ["score"] and len(thresholds) == 1: + only_metric = next(iter(thresholds)) + self._check_results.append( + warn( + "accuracy-gate", + f"results.json exposes only an unnamed scalar accuracy score; " + f"gating it as '{only_metric}'. Secondary metrics (if any) are not " + f"present in results.json and are not checked.", + json_path, + "#15", + ) + ) + flat_scores = {only_metric: flat_scores["score"]} + for threshold_key, (lower, upper) in thresholds.items(): # Match score key case-insensitively score: float | None = None diff --git a/src/submission_checker/models/loader.py b/src/submission_checker/models/loader.py index 329cc23..a6afd89 100644 --- a/src/submission_checker/models/loader.py +++ b/src/submission_checker/models/loader.py @@ -7,6 +7,7 @@ __all__ = [ "load_accuracy_result", + "load_accuracy_scores", "load_point_config", "load_result_summary", "load_system_description", @@ -151,3 +152,31 @@ def load_accuracy_result( return instance, list(instance._check_results) except ValidationError as exc: return None, _validation_errors(exc, "accuracy-valid", path) + + +def load_accuracy_scores( + path: Path, +) -> tuple[AccuracyResult | None, list[CheckResult], bool]: + """Load accuracy from a ``results.json``'s ``accuracy_scores`` field. + + The benchmark writes per-dataset accuracy directly into ``results.json`` under + ``accuracy_scores`` (already in the ``AccuracyResult`` schema). This reads that + field instead of a separate ``accuracy/results.json`` file. + + Returns ``(model, check_results, present)``. ``present`` is True when the file + contains a non-empty ``accuracy_scores`` mapping (regardless of validity); a + missing/invalid ``results.json`` is reported by the result-summary loaders, so + accuracy is simply treated as absent here. On a validation failure the model is + None and ``check_results`` holds one entry per error. + """ + data, load_err = _load_json(path) + if load_err or not isinstance(data, dict): + return None, [], False + scores = data.get("accuracy_scores") + if not isinstance(scores, dict) or not scores: + return None, [], False + try: + instance = AccuracyResult.model_validate(scores, context={"json_path": path}) + return instance, list(instance._check_results), True + except ValidationError as exc: + return None, _validation_errors(exc, "accuracy-valid", path), True diff --git a/tests/endpoints_submission_cli/submissions/test_builder.py b/tests/endpoints_submission_cli/submissions/test_builder.py index 72f5458..955f734 100644 --- a/tests/endpoints_submission_cli/submissions/test_builder.py +++ b/tests/endpoints_submission_cli/submissions/test_builder.py @@ -11,7 +11,7 @@ import pytest import yaml -from endpoints_submission_cli.exceptions import SubmissionBuildError +from endpoints_submission_cli.exceptions import SubmissionBuildError, TruncationError from endpoints_submission_cli.submissions.builder import ( _compute_max_tps, _slugify, @@ -608,6 +608,40 @@ def test_large_responses_truncated_under_10kb(self) -> None: result = json.loads(result_bytes) assert len(json.dumps(result["responses"]).encode()) <= 10 * 1024 + def _make_dict_content(self, n_responses: int) -> bytes: + # Some run modes emit responses as a {sample_uuid: output_text} mapping. + data = { + "config": {"mode": "perf"}, + "accuracy_scores": {"ds": {"num_samples": n_responses, "score": 1.0}}, + "responses": {f"uuid-{i:08d}": "hello world " * 5 for i in range(n_responses)}, + } + return json.dumps(data).encode() + + def test_small_dict_responses_unchanged(self) -> None: + content = self._make_dict_content(2) + result = json.loads(_truncate_responses(content)) + assert result["responses"] == json.loads(content)["responses"] + assert result["accuracy_scores"] == {"ds": {"num_samples": 2, "score": 1.0}} + + def test_large_dict_responses_truncated_under_10kb(self) -> None: + content = self._make_dict_content(10_000) + result = json.loads(_truncate_responses(content)) + assert isinstance(result["responses"], dict) + assert len(json.dumps(result["responses"]).encode()) <= 10 * 1024 + assert 0 < len(result["responses"]) < 10_000 # kept some, dropped the rest + # non-responses fields untouched + assert result["accuracy_scores"] == {"ds": {"num_samples": 10_000, "score": 1.0}} + + def test_unknown_responses_type_raises_hard_error(self) -> None: + # A responses shape we cannot bound must fail loudly, never pass through silently. + content = json.dumps({"config": {}, "responses": "x" * 50_000}).encode() + with pytest.raises(TruncationError): + _truncate_responses(content) + + def test_missing_responses_is_not_an_error(self) -> None: + content = json.dumps({"config": {}, "accuracy_scores": {}}).encode() + assert _truncate_responses(content) == content # nothing to truncate, no raise + def test_other_keys_preserved(self) -> None: content = self._make_content(10_000) result = json.loads(_truncate_responses(content)) @@ -641,89 +675,46 @@ def test_long_name_truncated(self) -> None: @pytest.mark.unit -class TestAccuracyResultsTruncation: - """Tests for accuracy/results.json truncation across three workflow scenarios.""" - - def _make_accuracy_archive( - self, - run_folder: Path, - tmp_path: Path, - name: str, - results_content: dict | None = None, - acc_results_content: dict | None = None, - ) -> Path: +class TestBuilderTruncatesResults: + """build_submission_folder truncates the verbose responses in the perf results.json.""" + + def _archive(self, run_folder: Path, tmp_path: Path, responses: object) -> Path: import shutil - folder = tmp_path / name + folder = tmp_path / "perf_run" shutil.copytree(run_folder, folder) - cfg = yaml.safe_load((folder / "config.yaml").read_text()) - cfg["datasets"][0]["type"] = "accuracy" - (folder / "config.yaml").write_text(yaml.dump(cfg)) - if results_content is not None: - (folder / "results.json").write_text(json.dumps(results_content)) - if acc_results_content is not None: - (folder / "accuracy").mkdir(exist_ok=True) - (folder / "accuracy" / "results.json").write_text(json.dumps(acc_results_content)) - archive = tmp_path / f"{name}.tar.gz" + results = json.loads((folder / "results.json").read_text()) + results["responses"] = responses + (folder / "results.json").write_text(json.dumps(results)) + archive = tmp_path / "perf_run.tar.gz" with tarfile.open(archive, "w:gz") as tar: - tar.add(folder, arcname=name) + tar.add(folder, arcname=folder.name) return archive - def _big_results(self, count: int = 5000) -> dict: - return { - "config": {}, - "accuracy_scores": {"score": 0.45}, - "responses": [{"text": "a" * 100, "idx": i} for i in range(count)], - } + def _perf_results(self, sub_dir: Path) -> dict: + perf = [p for p in sub_dir.rglob("results.json") if p.parent.name != "accuracy"] + assert len(perf) == 1 + return json.loads(perf[0].read_text()) - def test_no_accuracy_subdir_truncates_responses( - self, run_folder: Path, tmp_path: Path - ) -> None: - """Workflow 1/3: archive has only root results.json; responses written truncated to accuracy/results.json.""" - archive = self._make_accuracy_archive( - run_folder, tmp_path, "wf1", results_content=self._big_results() - ) + def test_dict_responses_truncated_in_bundle(self, run_folder: Path, tmp_path: Path) -> None: + # Regression: responses emitted as a {uuid: text} dict were not truncated. + big = {f"uuid-{i:08d}": "hello world " * 5 for i in range(10_000)} sub_dir = build_submission_folder( - [("run-001", archive)], "standardized", "available", tmp_path / "sub" + [("run-001", self._archive(run_folder, tmp_path, big))], + "standardized", "available", tmp_path / "sub", ) - acc_files = list(sub_dir.rglob("accuracy/results.json")) - assert len(acc_files) == 1 - written = json.loads(acc_files[0].read_text()) + written = self._perf_results(sub_dir) + assert isinstance(written["responses"], dict) assert len(json.dumps(written["responses"]).encode()) <= 10 * 1024 + assert 0 < len(written["responses"]) < 10_000 - def test_accuracy_subdir_large_responses_truncated( - self, run_folder: Path, tmp_path: Path - ) -> None: - """Workflow 2: archive has accuracy/results.json with large responses; truncated on write.""" - archive = self._make_accuracy_archive( - run_folder, tmp_path, "wf2", acc_results_content=self._big_results() - ) + def test_list_responses_truncated_in_bundle(self, run_folder: Path, tmp_path: Path) -> None: + big = [{"text": "hello world", "idx": i} for i in range(10_000)] sub_dir = build_submission_folder( - [("run-001", archive)], "standardized", "available", tmp_path / "sub" + [("run-001", self._archive(run_folder, tmp_path, big))], + "standardized", "available", tmp_path / "sub", ) - acc_files = list(sub_dir.rglob("accuracy/results.json")) - assert len(acc_files) == 1 - written = json.loads(acc_files[0].read_text()) + written = self._perf_results(sub_dir) + assert isinstance(written["responses"], list) assert len(json.dumps(written["responses"]).encode()) <= 10 * 1024 - - def test_accuracy_subdir_already_small_written_intact( - self, run_folder: Path, tmp_path: Path - ) -> None: - """Workflow 3: archive has accuracy/results.json already within 10 KB; written without corruption.""" - small_responses = [{"text": "hi", "idx": 0}] - acc_results = { - "config": {}, - "accuracy_scores": {"score": 0.45}, - "responses": small_responses, - } - archive = self._make_accuracy_archive( - run_folder, tmp_path, "wf3", acc_results_content=acc_results - ) - sub_dir = build_submission_folder( - [("run-001", archive)], "standardized", "available", tmp_path / "sub" - ) - acc_files = list(sub_dir.rglob("accuracy/results.json")) - assert len(acc_files) == 1 - written = json.loads(acc_files[0].read_text()) - assert written["responses"] == small_responses - assert written["accuracy_scores"] == acc_results["accuracy_scores"] + assert 0 < len(written["responses"]) < 10_000 diff --git a/tests/submission_checker/test_checker.py b/tests/submission_checker/test_checker.py index 86e4bec..2d18a3b 100644 --- a/tests/submission_checker/test_checker.py +++ b/tests/submission_checker/test_checker.py @@ -489,6 +489,34 @@ def test_invalid_accuracy_json(self, tmp_path): report = _check(root) assert _errors(report, "accuracy-valid") + def test_accuracy_scores_in_results_json(self, tmp_path): + """Accuracy is read from a point's results.json accuracy_scores (no accuracy/ dir).""" + root = _build_submission(tmp_path, write_accuracy_json=False) + # Drop accuracy_scores into the first point's results.json. The checker reads it + # directly; the empty accuracy/ dir is not consulted (no accuracy-file error). + first_point = ( + root / "pareto" / "test-sys" / "llama3-70b" / "results" / f"point_{_CONCURRENCIES[0]}" + ) + (first_point / "results.json").write_text( + json.dumps({"config": {}, "results": {}, "accuracy_scores": _ACCURACY, "responses": []}) + ) + report = _check(root) + assert not _errors(report, "accuracy-present") + assert not _errors(report, "accuracy-valid") + assert not _errors(report, "accuracy-file") + + def test_invalid_accuracy_scores_in_results_json(self, tmp_path): + """accuracy-valid error when results.json accuracy_scores is malformed.""" + root = _build_submission(tmp_path, write_accuracy_json=False) + first_point = ( + root / "pareto" / "test-sys" / "llama3-70b" / "results" / f"point_{_CONCURRENCIES[0]}" + ) + (first_point / "results.json").write_text( + json.dumps({"accuracy_scores": {"cnn_dailymail": "not-a-dict"}}) + ) + report = _check(root) + assert _errors(report, "accuracy-valid") + def test_point_filename_concurrency_mismatch(self, tmp_path): """point-filename-concurrency warning when filename concurrency ≠ declared concurrency.""" root = _build_submission(tmp_path) diff --git a/tests/submission_checker/test_checks_aggregate.py b/tests/submission_checker/test_checks_aggregate.py index 7225fef..a40f227 100644 --- a/tests/submission_checker/test_checks_aggregate.py +++ b/tests/submission_checker/test_checks_aggregate.py @@ -551,3 +551,51 @@ def test_sample_count_missing_skips(self, tmp_path): ) ctx = _model_ctx(tmp_path, accuracy_result=ar, model_name="Llama-3_1-8B-Instruct") assert not any(r.rule == "accuracy-sample-count" for r in ctx._check_results) + + def test_scalar_score_gated_as_single_metric_passes(self, tmp_path): + # Endpoints results.json exposes a single *unnamed* scalar `score` + # (== exact_match for deepseek). With a single-metric threshold it is gated + # as that metric — with a warning — and 81.3355 ≥ 0.99*81.3582 (80.5446) passes. + ar = AccuracyResult({"deepseek_r1_accuracy": {"num_samples": 4388, "score": 81.3355}}) + ctx = _model_ctx(tmp_path, accuracy_result=ar, model_name="deepseek_r1-torch-fp4") + assert any( + r.rule == "accuracy-gate" + and r.severity == Severity.WARNING + and "unnamed scalar" in r.message + for r in ctx._check_results + ) + assert any( + r.rule == "accuracy-gate" + and r.severity == Severity.INFO + and "exact_match" in r.message + for r in ctx._check_results + ) + assert not any( + r.rule == "accuracy-gate" and r.severity == Severity.ERROR + for r in ctx._check_results + ) + + def test_scalar_score_gated_as_single_metric_fails(self, tmp_path): + # 70.0 < 0.99*81.3582 (80.5446) → the mapped exact_match gate errors. + ar = AccuracyResult({"deepseek_r1_accuracy": {"num_samples": 4388, "score": 70.0}}) + ctx = _model_ctx(tmp_path, accuracy_result=ar, model_name="deepseek_r1-torch-fp4") + assert any( + r.rule == "accuracy-gate" + and r.severity == Severity.ERROR + and "exact_match" in r.message + for r in ctx._check_results + ) + + def test_scalar_score_not_mapped_for_multimetric_model(self, tmp_path): + # llama3.1-8b declares 5 metrics, so a bare scalar `score` is ambiguous and + # must NOT be mapped — no scalar warning, and no metric comparison fires. + ar = AccuracyResult({"cnn_dailymail::llama3_8b": {"num_samples": 13368, "score": 39.0}}) + ctx = _model_ctx(tmp_path, accuracy_result=ar, model_name="Llama-3_1-8B-Instruct") + assert not any( + r.rule == "accuracy-gate" and "unnamed scalar" in r.message + for r in ctx._check_results + ) + assert not any( + r.rule == "accuracy-gate" and r.severity in (Severity.INFO, Severity.ERROR) + for r in ctx._check_results + )