From 24252afa9eecf61c1957801c6aaca7c505462adb Mon Sep 17 00:00:00 2001 From: Arav Agarwal Date: Wed, 24 Jun 2026 15:21:17 -0400 Subject: [PATCH 1/4] Bugfix --- .../submissions/builder.py | 46 ++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/src/endpoints_submission_cli/submissions/builder.py b/src/endpoints_submission_cli/submissions/builder.py index 299723a..69bb342 100644 --- a/src/endpoints_submission_cli/submissions/builder.py +++ b/src/endpoints_submission_cli/submissions/builder.py @@ -199,15 +199,59 @@ def _load_run_data(run_id: str, division: str, availability: str, run_dir: Path) system_info = _load_system_desc(base, run_id, division, availability) + extra_files = _load_extra_files(base) + # A combined run can be a performance run that *also* declares an accuracy + # dataset (e.g. datasets[0] is performance, datasets[1] is accuracy) without + # shipping a canonical accuracy/results.json. In that case results.json carries + # an `accuracy_scores` block already in the checker's AccuracyResult schema; + # surface it as accuracy/results.json so the accuracy run gets assembled + # regardless. (Restores the behavior previously provided by the removed + # _write_accuracy_fallback, but at the harvest stage so the write-loop places + # the file correctly.) Accuracy-typed runs already route results.json into + # accuracy/, so this only applies to performance runs to avoid colliding there. + if ( + "accuracy/results.json" not in extra_files + and _extract_run_type(config) == "performance" + and _config_has_accuracy_dataset(config) + ): + derived = _derive_accuracy_results(extra_files.get("results.json")) + if derived is not None: + extra_files["accuracy/results.json"] = derived + return { "run_id": run_id, "system_info": system_info, "config": config, "result_summary": result_summary, - "_extra_files": _load_extra_files(base), + "_extra_files": extra_files, } +def _config_has_accuracy_dataset(config: dict[str, Any]) -> bool: + """Return True if *config* declares any dataset of type 'accuracy'.""" + datasets = config.get("datasets", []) or [] + return any(isinstance(d, dict) and d.get("type") == "accuracy" for d in datasets) + + +def _derive_accuracy_results(results_bytes: bytes | None) -> bytes | None: + """Derive accuracy/results.json bytes from a run's results.json. + + results.json carries an ``accuracy_scores`` block already in the checker's + AccuracyResult schema (``{dataset_name: {score, num_samples, ...}}``). + Return it serialized, or None when unavailable/malformed. + """ + if not results_bytes: + return None + try: + parsed = json.loads(results_bytes) + except (json.JSONDecodeError, TypeError, ValueError): + return None + scores = parsed.get("accuracy_scores") if isinstance(parsed, dict) else None + if not scores: + return None + return json.dumps(scores, indent=2).encode("utf-8") + + def _load_system_desc(base: Path, run_id: str, division: str, availability: str) -> dict[str, Any]: """Load system_desc.json from a run folder and return the validated flat dict. From 96bdc99eb41167d8f2fd64fd92348048777d63b5 Mon Sep 17 00:00:00 2001 From: Arav Agarwal Date: Wed, 24 Jun 2026 16:15:52 -0400 Subject: [PATCH 2/4] Update accuracy tests --- src/submission_checker/accuracy_targets.py | 22 +++++++++ .../models/aggregate/context.py | 20 ++++++++ .../test_checks_aggregate.py | 48 +++++++++++++++++++ 3 files changed, 90 insertions(+) diff --git a/src/submission_checker/accuracy_targets.py b/src/submission_checker/accuracy_targets.py index 263a68d..df64e48 100644 --- a/src/submission_checker/accuracy_targets.py +++ b/src/submission_checker/accuracy_targets.py @@ -20,6 +20,28 @@ # (model-keyword fragments, {metric → (lower, upper | None)}, min_queries) # More-specific entries must come before less-specific ones. _TARGETS: list[tuple[frozenset[str], dict[str, tuple[float, float | None]], int]] = [ + # deepseek-r1 — golden fp32 exact_match 81.3582 (gate ≥ 0.99×). + # Only exact_match is gated: results.json's accuracy_scores expose a single scalar + # `score` (== exact_match). The canonical spec also bounds TOKENS_PER_SAMPLE + # (0.9–1.1 × 3886.2274), but that metric lives only in the deepseek_eval file, not + # in results.json, so it is intentionally NOT gated here. + ( + frozenset({"deepseek", "r1"}), + { + "exact_match": (81.3582 * 0.99, None), + }, + 4388, + ), + # gpt-oss-120b — golden fp32 exact_match 83.13; upstream tokens_per_sample upper + # bound is still a placeholder (constants.py:215), so it is intentionally omitted. + # min_queries uses the accuracy-sample-count (4395), not the perf count (6396). + ( + frozenset({"gptoss", "120b"}), + { + "exact_match": (83.13 * 0.99, None), + }, + 4395, + ), # llama3.1-405b — inference uses ROUGEL as primary; also exact_match + tokens ( frozenset({"llama3", "405b"}), diff --git a/src/submission_checker/models/aggregate/context.py b/src/submission_checker/models/aggregate/context.py index 72b167b..b933cf4 100644 --- a/src/submission_checker/models/aggregate/context.py +++ b/src/submission_checker/models/aggregate/context.py @@ -248,6 +248,26 @@ def _check_accuracy(self) -> ModelContext: for ds_scores in all_scores.values(): flat_scores.update(ds_scores) + # Endpoints scorers (e.g. DeepSeekR1Scorer) write a single *unnamed* scalar + # `score` per dataset into results.json — the scorer's primary metric, with its + # identity dropped. When the model declares exactly one accuracy metric, gate + # that scalar against it; but WARN, because we cannot verify the scalar's + # identity, and any secondary metrics (e.g. tokens_per_sample) are absent from + # results.json and are therefore NOT checked. + if list(flat_scores) == ["score"] and len(thresholds) == 1: + only_metric = next(iter(thresholds)) + self._check_results.append( + warn( + "accuracy-gate", + f"results.json exposes only an unnamed scalar accuracy score; " + f"gating it as '{only_metric}'. Secondary metrics (if any) are not " + f"present in results.json and are not checked.", + json_path, + "#15", + ) + ) + flat_scores = {only_metric: flat_scores["score"]} + for threshold_key, (lower, upper) in thresholds.items(): # Match score key case-insensitively score: float | None = None diff --git a/tests/submission_checker/test_checks_aggregate.py b/tests/submission_checker/test_checks_aggregate.py index 7225fef..a40f227 100644 --- a/tests/submission_checker/test_checks_aggregate.py +++ b/tests/submission_checker/test_checks_aggregate.py @@ -551,3 +551,51 @@ def test_sample_count_missing_skips(self, tmp_path): ) ctx = _model_ctx(tmp_path, accuracy_result=ar, model_name="Llama-3_1-8B-Instruct") assert not any(r.rule == "accuracy-sample-count" for r in ctx._check_results) + + def test_scalar_score_gated_as_single_metric_passes(self, tmp_path): + # Endpoints results.json exposes a single *unnamed* scalar `score` + # (== exact_match for deepseek). With a single-metric threshold it is gated + # as that metric — with a warning — and 81.3355 ≥ 0.99*81.3582 (80.5446) passes. + ar = AccuracyResult({"deepseek_r1_accuracy": {"num_samples": 4388, "score": 81.3355}}) + ctx = _model_ctx(tmp_path, accuracy_result=ar, model_name="deepseek_r1-torch-fp4") + assert any( + r.rule == "accuracy-gate" + and r.severity == Severity.WARNING + and "unnamed scalar" in r.message + for r in ctx._check_results + ) + assert any( + r.rule == "accuracy-gate" + and r.severity == Severity.INFO + and "exact_match" in r.message + for r in ctx._check_results + ) + assert not any( + r.rule == "accuracy-gate" and r.severity == Severity.ERROR + for r in ctx._check_results + ) + + def test_scalar_score_gated_as_single_metric_fails(self, tmp_path): + # 70.0 < 0.99*81.3582 (80.5446) → the mapped exact_match gate errors. + ar = AccuracyResult({"deepseek_r1_accuracy": {"num_samples": 4388, "score": 70.0}}) + ctx = _model_ctx(tmp_path, accuracy_result=ar, model_name="deepseek_r1-torch-fp4") + assert any( + r.rule == "accuracy-gate" + and r.severity == Severity.ERROR + and "exact_match" in r.message + for r in ctx._check_results + ) + + def test_scalar_score_not_mapped_for_multimetric_model(self, tmp_path): + # llama3.1-8b declares 5 metrics, so a bare scalar `score` is ambiguous and + # must NOT be mapped — no scalar warning, and no metric comparison fires. + ar = AccuracyResult({"cnn_dailymail::llama3_8b": {"num_samples": 13368, "score": 39.0}}) + ctx = _model_ctx(tmp_path, accuracy_result=ar, model_name="Llama-3_1-8B-Instruct") + assert not any( + r.rule == "accuracy-gate" and "unnamed scalar" in r.message + for r in ctx._check_results + ) + assert not any( + r.rule == "accuracy-gate" and r.severity in (Severity.INFO, Severity.ERROR) + for r in ctx._check_results + ) From 3c3faf54b84425870a77a0ddef945f9f19ec18e7 Mon Sep 17 00:00:00 2001 From: Arav Agarwal Date: Wed, 24 Jun 2026 16:59:15 -0400 Subject: [PATCH 3/4] Fix truncation and extraction behavior --- src/endpoints_submission_cli/exceptions.py | 8 + .../submissions/builder.py | 32 +++- src/endpoints_submission_cli/truncation.py | 61 +++++-- .../submissions/test_builder.py | 156 ++++++++++++++---- 4 files changed, 200 insertions(+), 57 deletions(-) diff --git a/src/endpoints_submission_cli/exceptions.py b/src/endpoints_submission_cli/exceptions.py index 2aad49e..cb0a0c5 100644 --- a/src/endpoints_submission_cli/exceptions.py +++ b/src/endpoints_submission_cli/exceptions.py @@ -39,3 +39,11 @@ class SubmissionBuildError(Exception): class SubmissionCheckError(Exception): """Raised when the Submission Checker reports validation errors.""" + + +class TruncationError(Exception): + """Raised when a results payload's ``responses`` cannot be truncated under the cap. + + Guards against silently shipping an un-truncated (potentially multi-GB) payload + when ``responses`` is an unexpected shape the truncator does not handle. + """ diff --git a/src/endpoints_submission_cli/submissions/builder.py b/src/endpoints_submission_cli/submissions/builder.py index 69bb342..5693cf0 100644 --- a/src/endpoints_submission_cli/submissions/builder.py +++ b/src/endpoints_submission_cli/submissions/builder.py @@ -214,7 +214,7 @@ def _load_run_data(run_id: str, division: str, availability: str, run_dir: Path) and _extract_run_type(config) == "performance" and _config_has_accuracy_dataset(config) ): - derived = _derive_accuracy_results(extra_files.get("results.json")) + derived = _unwrap_accuracy_scores(extra_files.get("results.json")) if derived is not None: extra_files["accuracy/results.json"] = derived @@ -233,21 +233,27 @@ def _config_has_accuracy_dataset(config: dict[str, Any]) -> bool: return any(isinstance(d, dict) and d.get("type") == "accuracy" for d in datasets) -def _derive_accuracy_results(results_bytes: bytes | None) -> bytes | None: - """Derive accuracy/results.json bytes from a run's results.json. +def _unwrap_accuracy_scores(content: bytes | None) -> bytes | None: + """Unwrap a normal ``results.json`` blob into the checker's accuracy schema. - results.json carries an ``accuracy_scores`` block already in the checker's - AccuracyResult schema (``{dataset_name: {score, num_samples, ...}}``). - Return it serialized, or None when unavailable/malformed. + A run's ``results.json`` (and sometimes the accuracy file itself) is shaped as + ``{"config": ..., "results": ..., "accuracy_scores": {dataset: {...}}, "responses": [...]}``. + The checker's ``AccuracyResult`` expects only the per-dataset mapping + (``{dataset_name: {score, num_samples, ...}}``); a results.json-shaped file fails + ``accuracy-valid`` (its ``config``/``results``/``responses`` keys are not per-dataset + dicts). When *content* carries a top-level ``accuracy_scores`` mapping, return just + that mapping serialized. Return None when *content* is absent, unparseable, or + already in the expected per-dataset format (no ``accuracy_scores`` key) — in which + case the caller leaves it untouched. """ - if not results_bytes: + if not content: return None try: - parsed = json.loads(results_bytes) + parsed = json.loads(content) except (json.JSONDecodeError, TypeError, ValueError): return None scores = parsed.get("accuracy_scores") if isinstance(parsed, dict) else None - if not scores: + if not isinstance(scores, dict) or not scores: return None return json.dumps(scores, indent=2).encode("utf-8") @@ -501,6 +507,14 @@ def _write_pareto_entries( else rel_path ) dest = result_dir / dest_rel + # Whatever lands as /accuracy/results.json must be in the checker's + # AccuracyResult schema. If it is instead a normal results.json blob (e.g. an + # accuracy-typed run whose results.json carries accuracy_scores), unwrap it to + # the per-dataset mapping so it doesn't fail accuracy-valid. + if dest.name == "results.json" and dest.parent.name == "accuracy": + unwrapped = _unwrap_accuracy_scores(content) + if unwrapped is not None: + content = unwrapped dest.parent.mkdir(parents=True, exist_ok=True) dest.write_bytes(content) diff --git a/src/endpoints_submission_cli/truncation.py b/src/endpoints_submission_cli/truncation.py index 88b3c91..824fc6e 100644 --- a/src/endpoints_submission_cli/truncation.py +++ b/src/endpoints_submission_cli/truncation.py @@ -11,33 +11,62 @@ import json +from .exceptions import TruncationError + __all__ = ["RESPONSES_LIMIT", "truncate_responses"] RESPONSES_LIMIT = 10 * 1024 # 10 KB def truncate_responses(content: bytes) -> bytes: - """Truncate the ``responses`` list in a results.json payload to stay under 10 KB. + """Truncate the ``responses`` collection in a results.json payload to stay under 10 KB. - Returns *content* unchanged when it is not JSON, has no ``responses`` list, or - already fits within the limit. Only the ``responses`` key is affected; all - other fields (accuracy scores, config, results, ...) are preserved. + ``responses`` may be either a list (``[entry, ...]``) or a dict keyed by sample + id (``{uuid: output, ...}``); both are produced by different run modes. Entries + are kept in iteration order until adding the next would exceed the limit. Returns + *content* unchanged when it is not JSON or has no non-empty ``responses``. Only the + ``responses`` key is affected; all other fields (accuracy scores, config, ...) are + preserved. """ try: data = json.loads(content) except (json.JSONDecodeError, ValueError): return content responses = data.get("responses") - if not isinstance(responses, list) or not responses: - return content - # Walk items and stop as soon as adding the next one would exceed the limit. - # Each item contributes its own bytes plus 2 for the ", " separator after the first. - total = 2 # "[]" - idx = 0 - for i, r in enumerate(responses): - total += len(json.dumps(r).encode()) + (2 if i > 0 else 0) - if total > RESPONSES_LIMIT: - break - idx = i + 1 - data["responses"] = responses[:idx] + if responses is None or (isinstance(responses, (list, dict)) and not responses): + return content # nothing to truncate + if isinstance(responses, list): + # Each item contributes its own bytes plus 2 for the ", " separator after the first. + total = 2 # "[]" + idx = 0 + for i, r in enumerate(responses): + total += len(json.dumps(r).encode()) + (2 if i > 0 else 0) + if total > RESPONSES_LIMIT: + break + idx = i + 1 + data["responses"] = responses[:idx] + elif isinstance(responses, dict): + # Each entry contributes ``"key": value`` plus 2 for the ", " separator after + # the first. Approximate the key/value cost conservatively to stay under the cap. + total = 2 # "{}" + kept: dict[str, object] = {} + for i, (k, v) in enumerate(responses.items()): + entry = len(json.dumps(k).encode()) + len(json.dumps(v).encode()) + 2 # ': ' + if i > 0: + entry += 2 # ", " + if total + entry > RESPONSES_LIMIT: + break + total += entry + kept[k] = v + data["responses"] = kept + else: + # Unknown shape — refuse to ship a payload we cannot bound. A silent pass-through + # here is what let a multi-GB results.json reach a submission bundle. + raise TruncationError( + f"Cannot truncate 'responses' of type {type(responses).__name__}; " + "expected a list or dict." + ) + # Defense-in-depth: never return a payload whose responses still exceed the cap. + if len(json.dumps(data["responses"]).encode()) > RESPONSES_LIMIT: + raise TruncationError("responses still exceed the size limit after truncation") return json.dumps(data, indent=2).encode() diff --git a/tests/endpoints_submission_cli/submissions/test_builder.py b/tests/endpoints_submission_cli/submissions/test_builder.py index 72f5458..e0f6505 100644 --- a/tests/endpoints_submission_cli/submissions/test_builder.py +++ b/tests/endpoints_submission_cli/submissions/test_builder.py @@ -11,7 +11,7 @@ import pytest import yaml -from endpoints_submission_cli.exceptions import SubmissionBuildError +from endpoints_submission_cli.exceptions import SubmissionBuildError, TruncationError from endpoints_submission_cli.submissions.builder import ( _compute_max_tps, _slugify, @@ -20,6 +20,7 @@ extract_archive, ) from endpoints_submission_cli.truncation import truncate_responses as _truncate_responses +from submission_checker.models.file import AccuracyResult @pytest.mark.unit @@ -608,6 +609,40 @@ def test_large_responses_truncated_under_10kb(self) -> None: result = json.loads(result_bytes) assert len(json.dumps(result["responses"]).encode()) <= 10 * 1024 + def _make_dict_content(self, n_responses: int) -> bytes: + # Some run modes emit responses as a {sample_uuid: output_text} mapping. + data = { + "config": {"mode": "perf"}, + "accuracy_scores": {"ds": {"num_samples": n_responses, "score": 1.0}}, + "responses": {f"uuid-{i:08d}": "hello world " * 5 for i in range(n_responses)}, + } + return json.dumps(data).encode() + + def test_small_dict_responses_unchanged(self) -> None: + content = self._make_dict_content(2) + result = json.loads(_truncate_responses(content)) + assert result["responses"] == json.loads(content)["responses"] + assert result["accuracy_scores"] == {"ds": {"num_samples": 2, "score": 1.0}} + + def test_large_dict_responses_truncated_under_10kb(self) -> None: + content = self._make_dict_content(10_000) + result = json.loads(_truncate_responses(content)) + assert isinstance(result["responses"], dict) + assert len(json.dumps(result["responses"]).encode()) <= 10 * 1024 + assert 0 < len(result["responses"]) < 10_000 # kept some, dropped the rest + # non-responses fields untouched + assert result["accuracy_scores"] == {"ds": {"num_samples": 10_000, "score": 1.0}} + + def test_unknown_responses_type_raises_hard_error(self) -> None: + # A responses shape we cannot bound must fail loudly, never pass through silently. + content = json.dumps({"config": {}, "responses": "x" * 50_000}).encode() + with pytest.raises(TruncationError): + _truncate_responses(content) + + def test_missing_responses_is_not_an_error(self) -> None: + content = json.dumps({"config": {}, "accuracy_scores": {}}).encode() + assert _truncate_responses(content) == content # nothing to truncate, no raise + def test_other_keys_preserved(self) -> None: content = self._make_content(10_000) result = json.loads(_truncate_responses(content)) @@ -641,8 +676,14 @@ def test_long_name_truncated(self) -> None: @pytest.mark.unit -class TestAccuracyResultsTruncation: - """Tests for accuracy/results.json truncation across three workflow scenarios.""" +class TestAccuracyResultsNormalization: + """`accuracy/results.json` must end up in the checker's AccuracyResult schema. + + A run's accuracy file is frequently a normal results.json blob (top-level + config/results/accuracy_scores/responses). The builder unwraps it to the bare + per-dataset mapping so it does not fail accuracy-valid; a file already in the + expected schema is left untouched. + """ def _make_accuracy_archive( self, @@ -669,55 +710,60 @@ def _make_accuracy_archive( tar.add(folder, arcname=name) return archive - def _big_results(self, count: int = 5000) -> dict: + _EXPECTED_SCORES = {"test_ds": {"dataset_name": "test_ds", "num_samples": 100, "score": 0.45}} + + def _results_blob(self, count: int = 5000) -> dict: + """A normal results.json blob carrying accuracy_scores plus a large responses list.""" return { "config": {}, - "accuracy_scores": {"score": 0.45}, + "results": {"total": 100}, + "accuracy_scores": self._EXPECTED_SCORES, "responses": [{"text": "a" * 100, "idx": i} for i in range(count)], } - def test_no_accuracy_subdir_truncates_responses( - self, run_folder: Path, tmp_path: Path - ) -> None: - """Workflow 1/3: archive has only root results.json; responses written truncated to accuracy/results.json.""" + def _assert_unwrapped_and_valid(self, written: dict) -> None: + # The bare per-dataset mapping only — no wrapper keys, no responses — and it + # must pass the checker's AccuracyResult schema. + assert "responses" not in written + assert "accuracy_scores" not in written and "config" not in written + assert written == self._EXPECTED_SCORES + AccuracyResult.model_validate(written) # raises ValidationError if malformed + + def test_root_results_blob_unwrapped(self, run_folder: Path, tmp_path: Path) -> None: + """Archive ships only a normal results.json blob → unwrapped to its accuracy_scores.""" archive = self._make_accuracy_archive( - run_folder, tmp_path, "wf1", results_content=self._big_results() + run_folder, tmp_path, "wf1", results_content=self._results_blob() ) sub_dir = build_submission_folder( [("run-001", archive)], "standardized", "available", tmp_path / "sub" ) acc_files = list(sub_dir.rglob("accuracy/results.json")) assert len(acc_files) == 1 - written = json.loads(acc_files[0].read_text()) - assert len(json.dumps(written["responses"]).encode()) <= 10 * 1024 + self._assert_unwrapped_and_valid(json.loads(acc_files[0].read_text())) - def test_accuracy_subdir_large_responses_truncated( - self, run_folder: Path, tmp_path: Path - ) -> None: - """Workflow 2: archive has accuracy/results.json with large responses; truncated on write.""" + def test_accuracy_subdir_blob_unwrapped(self, run_folder: Path, tmp_path: Path) -> None: + """Archive ships a results.json-shaped accuracy/results.json → unwrapped.""" archive = self._make_accuracy_archive( - run_folder, tmp_path, "wf2", acc_results_content=self._big_results() + run_folder, tmp_path, "wf2", acc_results_content=self._results_blob() ) sub_dir = build_submission_folder( [("run-001", archive)], "standardized", "available", tmp_path / "sub" ) acc_files = list(sub_dir.rglob("accuracy/results.json")) assert len(acc_files) == 1 - written = json.loads(acc_files[0].read_text()) - assert len(json.dumps(written["responses"]).encode()) <= 10 * 1024 - - def test_accuracy_subdir_already_small_written_intact( - self, run_folder: Path, tmp_path: Path - ) -> None: - """Workflow 3: archive has accuracy/results.json already within 10 KB; written without corruption.""" - small_responses = [{"text": "hi", "idx": 0}] - acc_results = { - "config": {}, - "accuracy_scores": {"score": 0.45}, - "responses": small_responses, + self._assert_unwrapped_and_valid(json.loads(acc_files[0].read_text())) + + def test_already_expected_format_left_intact(self, run_folder: Path, tmp_path: Path) -> None: + """An accuracy file already in the AccuracyResult schema is written through unchanged.""" + correct = { + "cnn_dailymail::m": { + "dataset_name": "cnn_dailymail::m", + "num_samples": 13368, + "score": {"rouge1": 39.0}, + } } archive = self._make_accuracy_archive( - run_folder, tmp_path, "wf3", acc_results_content=acc_results + run_folder, tmp_path, "wf3", acc_results_content=correct ) sub_dir = build_submission_folder( [("run-001", archive)], "standardized", "available", tmp_path / "sub" @@ -725,5 +771,51 @@ def test_accuracy_subdir_already_small_written_intact( acc_files = list(sub_dir.rglob("accuracy/results.json")) assert len(acc_files) == 1 written = json.loads(acc_files[0].read_text()) - assert written["responses"] == small_responses - assert written["accuracy_scores"] == acc_results["accuracy_scores"] + assert written == correct # unchanged: no top-level accuracy_scores to unwrap + AccuracyResult.model_validate(written) + + +@pytest.mark.unit +class TestBuilderTruncatesResults: + """build_submission_folder truncates the verbose responses in the perf results.json.""" + + def _archive(self, run_folder: Path, tmp_path: Path, responses: object) -> Path: + import shutil + + folder = tmp_path / "perf_run" + shutil.copytree(run_folder, folder) + results = json.loads((folder / "results.json").read_text()) + results["responses"] = responses + (folder / "results.json").write_text(json.dumps(results)) + archive = tmp_path / "perf_run.tar.gz" + with tarfile.open(archive, "w:gz") as tar: + tar.add(folder, arcname=folder.name) + return archive + + def _perf_results(self, sub_dir: Path) -> dict: + perf = [p for p in sub_dir.rglob("results.json") if p.parent.name != "accuracy"] + assert len(perf) == 1 + return json.loads(perf[0].read_text()) + + def test_dict_responses_truncated_in_bundle(self, run_folder: Path, tmp_path: Path) -> None: + # Regression: responses emitted as a {uuid: text} dict were not truncated. + big = {f"uuid-{i:08d}": "hello world " * 5 for i in range(10_000)} + sub_dir = build_submission_folder( + [("run-001", self._archive(run_folder, tmp_path, big))], + "standardized", "available", tmp_path / "sub", + ) + written = self._perf_results(sub_dir) + assert isinstance(written["responses"], dict) + assert len(json.dumps(written["responses"]).encode()) <= 10 * 1024 + assert 0 < len(written["responses"]) < 10_000 + + def test_list_responses_truncated_in_bundle(self, run_folder: Path, tmp_path: Path) -> None: + big = [{"text": "hello world", "idx": i} for i in range(10_000)] + sub_dir = build_submission_folder( + [("run-001", self._archive(run_folder, tmp_path, big))], + "standardized", "available", tmp_path / "sub", + ) + written = self._perf_results(sub_dir) + assert isinstance(written["responses"], list) + assert len(json.dumps(written["responses"]).encode()) <= 10 * 1024 + assert 0 < len(written["responses"]) < 10_000 From 3c44211d17e773d282f5c03ceaf53b91fec06f18 Mon Sep 17 00:00:00 2001 From: Arav Agarwal Date: Wed, 24 Jun 2026 17:38:45 -0400 Subject: [PATCH 4/4] Update --- .../submissions/builder.py | 60 +---------- src/submission_checker/checker.py | 64 ++++++++--- src/submission_checker/models/loader.py | 29 +++++ .../submissions/test_builder.py | 101 ------------------ tests/submission_checker/test_checker.py | 28 +++++ 5 files changed, 108 insertions(+), 174 deletions(-) diff --git a/src/endpoints_submission_cli/submissions/builder.py b/src/endpoints_submission_cli/submissions/builder.py index 5693cf0..299723a 100644 --- a/src/endpoints_submission_cli/submissions/builder.py +++ b/src/endpoints_submission_cli/submissions/builder.py @@ -199,65 +199,15 @@ def _load_run_data(run_id: str, division: str, availability: str, run_dir: Path) system_info = _load_system_desc(base, run_id, division, availability) - extra_files = _load_extra_files(base) - # A combined run can be a performance run that *also* declares an accuracy - # dataset (e.g. datasets[0] is performance, datasets[1] is accuracy) without - # shipping a canonical accuracy/results.json. In that case results.json carries - # an `accuracy_scores` block already in the checker's AccuracyResult schema; - # surface it as accuracy/results.json so the accuracy run gets assembled - # regardless. (Restores the behavior previously provided by the removed - # _write_accuracy_fallback, but at the harvest stage so the write-loop places - # the file correctly.) Accuracy-typed runs already route results.json into - # accuracy/, so this only applies to performance runs to avoid colliding there. - if ( - "accuracy/results.json" not in extra_files - and _extract_run_type(config) == "performance" - and _config_has_accuracy_dataset(config) - ): - derived = _unwrap_accuracy_scores(extra_files.get("results.json")) - if derived is not None: - extra_files["accuracy/results.json"] = derived - return { "run_id": run_id, "system_info": system_info, "config": config, "result_summary": result_summary, - "_extra_files": extra_files, + "_extra_files": _load_extra_files(base), } -def _config_has_accuracy_dataset(config: dict[str, Any]) -> bool: - """Return True if *config* declares any dataset of type 'accuracy'.""" - datasets = config.get("datasets", []) or [] - return any(isinstance(d, dict) and d.get("type") == "accuracy" for d in datasets) - - -def _unwrap_accuracy_scores(content: bytes | None) -> bytes | None: - """Unwrap a normal ``results.json`` blob into the checker's accuracy schema. - - A run's ``results.json`` (and sometimes the accuracy file itself) is shaped as - ``{"config": ..., "results": ..., "accuracy_scores": {dataset: {...}}, "responses": [...]}``. - The checker's ``AccuracyResult`` expects only the per-dataset mapping - (``{dataset_name: {score, num_samples, ...}}``); a results.json-shaped file fails - ``accuracy-valid`` (its ``config``/``results``/``responses`` keys are not per-dataset - dicts). When *content* carries a top-level ``accuracy_scores`` mapping, return just - that mapping serialized. Return None when *content* is absent, unparseable, or - already in the expected per-dataset format (no ``accuracy_scores`` key) — in which - case the caller leaves it untouched. - """ - if not content: - return None - try: - parsed = json.loads(content) - except (json.JSONDecodeError, TypeError, ValueError): - return None - scores = parsed.get("accuracy_scores") if isinstance(parsed, dict) else None - if not isinstance(scores, dict) or not scores: - return None - return json.dumps(scores, indent=2).encode("utf-8") - - def _load_system_desc(base: Path, run_id: str, division: str, availability: str) -> dict[str, Any]: """Load system_desc.json from a run folder and return the validated flat dict. @@ -507,14 +457,6 @@ def _write_pareto_entries( else rel_path ) dest = result_dir / dest_rel - # Whatever lands as /accuracy/results.json must be in the checker's - # AccuracyResult schema. If it is instead a normal results.json blob (e.g. an - # accuracy-typed run whose results.json carries accuracy_scores), unwrap it to - # the per-dataset mapping so it doesn't fail accuracy-valid. - if dest.name == "results.json" and dest.parent.name == "accuracy": - unwrapped = _unwrap_accuracy_scores(content) - if unwrapped is not None: - content = unwrapped dest.parent.mkdir(parents=True, exist_ok=True) dest.write_bytes(content) diff --git a/src/submission_checker/checker.py b/src/submission_checker/checker.py index e1d4535..9fffda0 100644 --- a/src/submission_checker/checker.py +++ b/src/submission_checker/checker.py @@ -6,6 +6,7 @@ from __future__ import annotations +import json from typing import TYPE_CHECKING __all__ = ["SubmissionChecker"] @@ -31,6 +32,7 @@ from .models import warn as _warn from .models.loader import ( load_accuracy_result, + load_accuracy_scores, load_point_config, load_result_summary, load_system_description, @@ -40,6 +42,19 @@ from pathlib import Path +def _results_has_accuracy_scores(path: Path) -> bool: + """True if a results.json carries a non-empty ``accuracy_scores`` mapping.""" + try: + data = json.loads(path.read_text()) + except (OSError, ValueError): + return False + return ( + isinstance(data, dict) + and isinstance(data.get("accuracy_scores"), dict) + and bool(data["accuracy_scores"]) + ) + + class SubmissionChecker: """Validates an MLPerf Endpoints submission directory against §9.1 rules. @@ -111,13 +126,16 @@ def run(self) -> Report: for system_json in system_jsons: report.results.extend(self._check_system(system_json, pareto_dir)) - # §15: at least one model in the submission must have an accuracy/results.json. - has_full_accuracy = any(True for _ in pareto_dir.rglob("accuracy/results.json")) + # §15: at least one model must carry accuracy results — either as accuracy_scores + # embedded in a results.json, or as a standalone accuracy/results.json. + has_full_accuracy = any( + True for _ in pareto_dir.rglob("accuracy/results.json") + ) or any(_results_has_accuracy_scores(p) for p in pareto_dir.rglob("results.json")) if has_full_accuracy: report.results.append( _ok( "accuracy-present", - "At least one model has accuracy/results.json", + "At least one model has accuracy results", pareto_dir, "#15", ) @@ -126,7 +144,8 @@ def run(self) -> Report: report.results.append( _err( "accuracy-present", - "No model in this submission has accuracy/results.json", + "No model in this submission has accuracy results " + "(accuracy_scores in results.json or accuracy/results.json)", pareto_dir, "#15", ) @@ -303,13 +322,30 @@ def _check_model( results.extend(point_result._check_results) loaded_points.append((config, summary)) - # Load accuracy from per-point result dirs. Per-model warnings are only - # emitted when a point has an accuracy/ dir but files within it are absent. - # run() enforces that at least one model in the submission has both files. + # Load accuracy per point, preferring the accuracy_scores embedded in + # results.json, then falling back to a standalone accuracy/results.json. + # The first valid source wins; run() enforces that at least one model has one. accuracy_dir: Path | None = None accuracy_result = None for config, _ in loaded_points: - pd = results_dir / f"point_{config.concurrency}" / "accuracy" + if accuracy_result is not None: + break + point_dir = results_dir / f"point_{config.concurrency}" + + # Primary: accuracy_scores embedded in results.json. + results_json = point_dir / "results.json" + if results_json.exists(): + loaded, acc_results, present = load_accuracy_scores(results_json) + if present: + results.extend(acc_results) + if loaded is not None and not any( + r.severity == Severity.ERROR for r in acc_results + ): + accuracy_result, accuracy_dir = loaded, point_dir + continue + + # Fallback: standalone accuracy/results.json (moved from the run archive). + pd = point_dir / "accuracy" if not pd.is_dir(): continue json_p = pd / "results.json" @@ -322,13 +358,13 @@ def _check_model( "#15", ) ) - elif accuracy_result is None: - accuracy_result, acc_results = load_accuracy_result(json_p) + else: + loaded, acc_results = load_accuracy_result(json_p) results.extend(acc_results) - if any(r.severity == Severity.ERROR for r in acc_results): - accuracy_result = None - else: - accuracy_dir = pd + if loaded is not None and not any( + r.severity == Severity.ERROR for r in acc_results + ): + accuracy_result, accuracy_dir = loaded, pd # ModelContext validates point-count, regional-coverage, config-consistency, accuracy-gate model_ctx = ModelContext( diff --git a/src/submission_checker/models/loader.py b/src/submission_checker/models/loader.py index 329cc23..a6afd89 100644 --- a/src/submission_checker/models/loader.py +++ b/src/submission_checker/models/loader.py @@ -7,6 +7,7 @@ __all__ = [ "load_accuracy_result", + "load_accuracy_scores", "load_point_config", "load_result_summary", "load_system_description", @@ -151,3 +152,31 @@ def load_accuracy_result( return instance, list(instance._check_results) except ValidationError as exc: return None, _validation_errors(exc, "accuracy-valid", path) + + +def load_accuracy_scores( + path: Path, +) -> tuple[AccuracyResult | None, list[CheckResult], bool]: + """Load accuracy from a ``results.json``'s ``accuracy_scores`` field. + + The benchmark writes per-dataset accuracy directly into ``results.json`` under + ``accuracy_scores`` (already in the ``AccuracyResult`` schema). This reads that + field instead of a separate ``accuracy/results.json`` file. + + Returns ``(model, check_results, present)``. ``present`` is True when the file + contains a non-empty ``accuracy_scores`` mapping (regardless of validity); a + missing/invalid ``results.json`` is reported by the result-summary loaders, so + accuracy is simply treated as absent here. On a validation failure the model is + None and ``check_results`` holds one entry per error. + """ + data, load_err = _load_json(path) + if load_err or not isinstance(data, dict): + return None, [], False + scores = data.get("accuracy_scores") + if not isinstance(scores, dict) or not scores: + return None, [], False + try: + instance = AccuracyResult.model_validate(scores, context={"json_path": path}) + return instance, list(instance._check_results), True + except ValidationError as exc: + return None, _validation_errors(exc, "accuracy-valid", path), True diff --git a/tests/endpoints_submission_cli/submissions/test_builder.py b/tests/endpoints_submission_cli/submissions/test_builder.py index e0f6505..955f734 100644 --- a/tests/endpoints_submission_cli/submissions/test_builder.py +++ b/tests/endpoints_submission_cli/submissions/test_builder.py @@ -20,7 +20,6 @@ extract_archive, ) from endpoints_submission_cli.truncation import truncate_responses as _truncate_responses -from submission_checker.models.file import AccuracyResult @pytest.mark.unit @@ -675,106 +674,6 @@ def test_long_name_truncated(self) -> None: assert len(_slugify(long)) <= 64 -@pytest.mark.unit -class TestAccuracyResultsNormalization: - """`accuracy/results.json` must end up in the checker's AccuracyResult schema. - - A run's accuracy file is frequently a normal results.json blob (top-level - config/results/accuracy_scores/responses). The builder unwraps it to the bare - per-dataset mapping so it does not fail accuracy-valid; a file already in the - expected schema is left untouched. - """ - - def _make_accuracy_archive( - self, - run_folder: Path, - tmp_path: Path, - name: str, - results_content: dict | None = None, - acc_results_content: dict | None = None, - ) -> Path: - import shutil - - folder = tmp_path / name - shutil.copytree(run_folder, folder) - cfg = yaml.safe_load((folder / "config.yaml").read_text()) - cfg["datasets"][0]["type"] = "accuracy" - (folder / "config.yaml").write_text(yaml.dump(cfg)) - if results_content is not None: - (folder / "results.json").write_text(json.dumps(results_content)) - if acc_results_content is not None: - (folder / "accuracy").mkdir(exist_ok=True) - (folder / "accuracy" / "results.json").write_text(json.dumps(acc_results_content)) - archive = tmp_path / f"{name}.tar.gz" - with tarfile.open(archive, "w:gz") as tar: - tar.add(folder, arcname=name) - return archive - - _EXPECTED_SCORES = {"test_ds": {"dataset_name": "test_ds", "num_samples": 100, "score": 0.45}} - - def _results_blob(self, count: int = 5000) -> dict: - """A normal results.json blob carrying accuracy_scores plus a large responses list.""" - return { - "config": {}, - "results": {"total": 100}, - "accuracy_scores": self._EXPECTED_SCORES, - "responses": [{"text": "a" * 100, "idx": i} for i in range(count)], - } - - def _assert_unwrapped_and_valid(self, written: dict) -> None: - # The bare per-dataset mapping only — no wrapper keys, no responses — and it - # must pass the checker's AccuracyResult schema. - assert "responses" not in written - assert "accuracy_scores" not in written and "config" not in written - assert written == self._EXPECTED_SCORES - AccuracyResult.model_validate(written) # raises ValidationError if malformed - - def test_root_results_blob_unwrapped(self, run_folder: Path, tmp_path: Path) -> None: - """Archive ships only a normal results.json blob → unwrapped to its accuracy_scores.""" - archive = self._make_accuracy_archive( - run_folder, tmp_path, "wf1", results_content=self._results_blob() - ) - sub_dir = build_submission_folder( - [("run-001", archive)], "standardized", "available", tmp_path / "sub" - ) - acc_files = list(sub_dir.rglob("accuracy/results.json")) - assert len(acc_files) == 1 - self._assert_unwrapped_and_valid(json.loads(acc_files[0].read_text())) - - def test_accuracy_subdir_blob_unwrapped(self, run_folder: Path, tmp_path: Path) -> None: - """Archive ships a results.json-shaped accuracy/results.json → unwrapped.""" - archive = self._make_accuracy_archive( - run_folder, tmp_path, "wf2", acc_results_content=self._results_blob() - ) - sub_dir = build_submission_folder( - [("run-001", archive)], "standardized", "available", tmp_path / "sub" - ) - acc_files = list(sub_dir.rglob("accuracy/results.json")) - assert len(acc_files) == 1 - self._assert_unwrapped_and_valid(json.loads(acc_files[0].read_text())) - - def test_already_expected_format_left_intact(self, run_folder: Path, tmp_path: Path) -> None: - """An accuracy file already in the AccuracyResult schema is written through unchanged.""" - correct = { - "cnn_dailymail::m": { - "dataset_name": "cnn_dailymail::m", - "num_samples": 13368, - "score": {"rouge1": 39.0}, - } - } - archive = self._make_accuracy_archive( - run_folder, tmp_path, "wf3", acc_results_content=correct - ) - sub_dir = build_submission_folder( - [("run-001", archive)], "standardized", "available", tmp_path / "sub" - ) - acc_files = list(sub_dir.rglob("accuracy/results.json")) - assert len(acc_files) == 1 - written = json.loads(acc_files[0].read_text()) - assert written == correct # unchanged: no top-level accuracy_scores to unwrap - AccuracyResult.model_validate(written) - - @pytest.mark.unit class TestBuilderTruncatesResults: """build_submission_folder truncates the verbose responses in the perf results.json.""" diff --git a/tests/submission_checker/test_checker.py b/tests/submission_checker/test_checker.py index 86e4bec..2d18a3b 100644 --- a/tests/submission_checker/test_checker.py +++ b/tests/submission_checker/test_checker.py @@ -489,6 +489,34 @@ def test_invalid_accuracy_json(self, tmp_path): report = _check(root) assert _errors(report, "accuracy-valid") + def test_accuracy_scores_in_results_json(self, tmp_path): + """Accuracy is read from a point's results.json accuracy_scores (no accuracy/ dir).""" + root = _build_submission(tmp_path, write_accuracy_json=False) + # Drop accuracy_scores into the first point's results.json. The checker reads it + # directly; the empty accuracy/ dir is not consulted (no accuracy-file error). + first_point = ( + root / "pareto" / "test-sys" / "llama3-70b" / "results" / f"point_{_CONCURRENCIES[0]}" + ) + (first_point / "results.json").write_text( + json.dumps({"config": {}, "results": {}, "accuracy_scores": _ACCURACY, "responses": []}) + ) + report = _check(root) + assert not _errors(report, "accuracy-present") + assert not _errors(report, "accuracy-valid") + assert not _errors(report, "accuracy-file") + + def test_invalid_accuracy_scores_in_results_json(self, tmp_path): + """accuracy-valid error when results.json accuracy_scores is malformed.""" + root = _build_submission(tmp_path, write_accuracy_json=False) + first_point = ( + root / "pareto" / "test-sys" / "llama3-70b" / "results" / f"point_{_CONCURRENCIES[0]}" + ) + (first_point / "results.json").write_text( + json.dumps({"accuracy_scores": {"cnn_dailymail": "not-a-dict"}}) + ) + report = _check(root) + assert _errors(report, "accuracy-valid") + def test_point_filename_concurrency_mismatch(self, tmp_path): """point-filename-concurrency warning when filename concurrency ≠ declared concurrency.""" root = _build_submission(tmp_path)