diff --git a/src/endpoints_submission_cli/runs/parser.py b/src/endpoints_submission_cli/runs/parser.py index bb00a97..2c138f2 100644 --- a/src/endpoints_submission_cli/runs/parser.py +++ b/src/endpoints_submission_cli/runs/parser.py @@ -28,6 +28,7 @@ import yaml from ..exceptions import RunFolderError +from ..truncation import truncate_responses __all__ = ["parse_run_folder", "build_archive"] @@ -156,6 +157,11 @@ def build_archive(folder: Path, dest: Path | None = None, run_date: str | None = archived copy of that file has its ``run_date`` field set to this value. The source folder on disk is left untouched. + The archived copy of every ``results.json`` (top-level and any nested one, + e.g. ``accuracy/results.json``) has its verbose ``responses`` list truncated + (see :func:`truncate_responses`) to keep uploads small; the source folder on + disk is never mutated. + Returns: Path of the created archive. """ @@ -165,7 +171,11 @@ def build_archive(folder: Path, dest: Path | None = None, run_date: str | None = if dest is None: dest = folder.parent / f"{folder.name}.tar.gz" - patched_metadata: bytes | None = None + # In-memory replacements for specific archive members, keyed by arcname. The + # matching files on disk are excluded from the tar and these copies appended, + # so the source folder is never mutated. + replacements: dict[str, bytes] = {} + meta_path = folder / "run_metadata.json" if run_date is not None and meta_path.is_file(): try: @@ -174,21 +184,30 @@ def build_archive(folder: Path, dest: Path | None = None, run_date: str | None = metadata = None if isinstance(metadata, dict): metadata["run_date"] = run_date - patched_metadata = json.dumps(metadata, indent=2).encode("utf-8") + replacements[f"{folder.name}/run_metadata.json"] = json.dumps( + metadata, indent=2 + ).encode("utf-8") + + for results_path in sorted(folder.rglob("results.json")): + if not results_path.is_file(): + continue + original = results_path.read_bytes() + truncated = truncate_responses(original) + if truncated != original: + rel = results_path.relative_to(folder).as_posix() + replacements[f"{folder.name}/{rel}"] = truncated - meta_arcname = f"{folder.name}/run_metadata.json" with tarfile.open(dest, "w:gz") as tar: - if patched_metadata is None: + if not replacements: tar.add(folder, arcname=folder.name) else: - # Add everything except the original run_metadata.json, then append the - # patched copy from memory so the source folder is never mutated. tar.add( folder, arcname=folder.name, - filter=lambda ti: None if ti.name == meta_arcname else ti, + filter=lambda ti: None if ti.name in replacements else ti, ) - info = tarfile.TarInfo(name=meta_arcname) - info.size = len(patched_metadata) - tar.addfile(info, io.BytesIO(patched_metadata)) + for arcname, data in replacements.items(): + info = tarfile.TarInfo(name=arcname) + info.size = len(data) + tar.addfile(info, io.BytesIO(data)) return dest diff --git a/src/endpoints_submission_cli/submissions/builder.py b/src/endpoints_submission_cli/submissions/builder.py index 3ccd221..299723a 100644 --- a/src/endpoints_submission_cli/submissions/builder.py +++ b/src/endpoints_submission_cli/submissions/builder.py @@ -34,6 +34,7 @@ from submission_checker.models.file import SystemDescription from ..exceptions import SubmissionBuildError +from ..truncation import truncate_responses __all__ = ["build_submission_folder", "create_bundle_archive", "extract_archive"] @@ -449,7 +450,7 @@ def _write_pareto_entries( except (json.JSONDecodeError, TypeError, ValueError): pass if rel_path in ("results.json", "accuracy/results.json"): - content = _truncate_responses(content) + content = truncate_responses(content) dest_rel = ( rel_path[len(_acc_prefix):] if run_type == "accuracy" and rel_path.startswith(_acc_prefix) @@ -460,32 +461,6 @@ def _write_pareto_entries( dest.write_bytes(content) -_RESPONSES_LIMIT = 10 * 1024 # 10 KB - - -def _truncate_responses(content: bytes) -> bytes: - """Truncate the responses list in a results.json payload to stay under 10 KB.""" - try: - data = json.loads(content) - except (json.JSONDecodeError, ValueError): - return content - responses = data.get("responses") - if not isinstance(responses, list) or not responses: - return content - # Walk items and stop as soon as adding the next one would exceed the limit. - # Each item contributes its own bytes plus 2 for the ", " separator after the first. - total = 2 # "[]" - idx = 0 - for i, r in enumerate(responses): - total += len(json.dumps(r).encode()) + (2 if i > 0 else 0) - if total > _RESPONSES_LIMIT: - break - idx = i + 1 - data["responses"] = responses[:idx] - return json.dumps(data, indent=2).encode() - - - def _write_documentation(submission_dir: Path, run_data: list[dict[str, Any]]) -> None: """Merge documentation files from all runs into submission_dir/documentation/.""" doc_dir = submission_dir / "documentation" diff --git a/src/endpoints_submission_cli/truncation.py b/src/endpoints_submission_cli/truncation.py new file mode 100644 index 0000000..88b3c91 --- /dev/null +++ b/src/endpoints_submission_cli/truncation.py @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 MLCommons +# SPDX-License-Identifier: Apache-2.0 +"""Shared helper for trimming the verbose ``responses`` list in results.json. + +The raw ``responses`` array captured during a run can be very large (one entry +per sample). It is only kept for spot-checking, so both the run-upload archiver +and the submission builder cap it to keep payloads small and uploads reliable. +""" + +from __future__ import annotations + +import json + +__all__ = ["RESPONSES_LIMIT", "truncate_responses"] + +RESPONSES_LIMIT = 10 * 1024 # 10 KB + + +def truncate_responses(content: bytes) -> bytes: + """Truncate the ``responses`` list in a results.json payload to stay under 10 KB. + + Returns *content* unchanged when it is not JSON, has no ``responses`` list, or + already fits within the limit. Only the ``responses`` key is affected; all + other fields (accuracy scores, config, results, ...) are preserved. + """ + try: + data = json.loads(content) + except (json.JSONDecodeError, ValueError): + return content + responses = data.get("responses") + if not isinstance(responses, list) or not responses: + return content + # Walk items and stop as soon as adding the next one would exceed the limit. + # Each item contributes its own bytes plus 2 for the ", " separator after the first. + total = 2 # "[]" + idx = 0 + for i, r in enumerate(responses): + total += len(json.dumps(r).encode()) + (2 if i > 0 else 0) + if total > RESPONSES_LIMIT: + break + idx = i + 1 + data["responses"] = responses[:idx] + return json.dumps(data, indent=2).encode() diff --git a/tests/endpoints_submission_cli/runs/test_parser.py b/tests/endpoints_submission_cli/runs/test_parser.py index d3b8bdc..953e45f 100644 --- a/tests/endpoints_submission_cli/runs/test_parser.py +++ b/tests/endpoints_submission_cli/runs/test_parser.py @@ -168,6 +168,58 @@ def test_archive_contains_files(self, run_folder: Path, tmp_path: Path) -> None: assert any("config.yaml" in n for n in names) assert any("result_summary.json" in n for n in names) + def test_truncates_results_responses(self, run_folder: Path, tmp_path: Path) -> None: + results_path = run_folder / "results.json" + big = { + "config": {"mode": "perf"}, + "results": {"total": 5000}, + "responses": [{"idx": i, "text": "lorem ipsum " * 10} for i in range(5000)], + } + results_path.write_text(json.dumps(big)) + original_bytes = results_path.read_bytes() + + dest = tmp_path / "out.tar.gz" + build_archive(run_folder, dest) + + archived = _read_member(dest, "/results.json") + # responses capped well below the original count... + assert 0 < len(archived["responses"]) < 5000 + # ...while the other keys survive intact. + assert archived["config"] == {"mode": "perf"} + assert archived["results"] == {"total": 5000} + # source folder is never mutated. + assert results_path.read_bytes() == original_bytes + + def test_truncates_nested_accuracy_results(self, run_folder: Path, tmp_path: Path) -> None: + # Accuracy run folders carry an accuracy/results.json; it must be truncated too. + acc_dir = run_folder / "accuracy" + acc_dir.mkdir() + big = { + "results": {"total": 5000}, + "responses": [{"idx": i, "text": "lorem ipsum " * 10} for i in range(5000)], + } + (acc_dir / "results.json").write_text(json.dumps(big)) + + dest = tmp_path / "out.tar.gz" + build_archive(run_folder, dest) + + archived = _read_member(dest, "/accuracy/results.json") + assert 0 < len(archived["responses"]) < 5000 + assert archived["results"] == {"total": 5000} + + def test_results_without_responses_archived_unchanged( + self, run_folder: Path, tmp_path: Path + ) -> None: + # The fixture's results.json has no "responses" key -> archived byte-for-byte. + original = (run_folder / "results.json").read_bytes() + dest = tmp_path / "out.tar.gz" + build_archive(run_folder, dest) + with tarfile.open(dest) as tar: + member = next(m for m in tar.getmembers() if m.name.endswith("/results.json")) + fh = tar.extractfile(member) + assert fh is not None + assert fh.read() == original + def _read_member(archive: Path, suffix: str) -> dict: with tarfile.open(archive) as tar: diff --git a/tests/endpoints_submission_cli/submissions/test_builder.py b/tests/endpoints_submission_cli/submissions/test_builder.py index c74c535..72f5458 100644 --- a/tests/endpoints_submission_cli/submissions/test_builder.py +++ b/tests/endpoints_submission_cli/submissions/test_builder.py @@ -15,11 +15,11 @@ from endpoints_submission_cli.submissions.builder import ( _compute_max_tps, _slugify, - _truncate_responses, build_submission_folder, create_bundle_archive, extract_archive, ) +from endpoints_submission_cli.truncation import truncate_responses as _truncate_responses @pytest.mark.unit