From c975b9c911cdd945d01f517afa7b6a8e30ff2762 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Wed, 24 Jun 2026 21:49:47 +0530 Subject: [PATCH 1/3] Fix multi-model accuracy submission bugs and update tests - builder.py: apply _truncate_responses to accuracy/results.json in addition to root results.json; remove _write_accuracy_fallback (it overwrote the truncated file with bare accuracy_scores); fix yaml_dir for accuracy runs so point_N.yaml lands in points/ not accuracy/; fix _write_src to namespace files under src// to prevent multi-model collisions - test_builder.py: fix three stale tests that asserted old YAML-in- accuracy-dir behavior; add TestAccuracyResultsTruncation covering the three accuracy workflow scenarios (no accuracy/ subdir, large accuracy/results.json, already-small accuracy/results.json) Co-Authored-By: Claude Sonnet 4.6 --- .../submissions/builder.py | 48 ++------ .../submissions/test_builder.py | 108 ++++++++++++++++-- 2 files changed, 108 insertions(+), 48 deletions(-) diff --git a/src/endpoints_submission_cli/submissions/builder.py b/src/endpoints_submission_cli/submissions/builder.py index 31e1b36..2aadee1 100644 --- a/src/endpoints_submission_cli/submissions/builder.py +++ b/src/endpoints_submission_cli/submissions/builder.py @@ -112,7 +112,12 @@ def build_submission_folder( ) system_max_concurrency[system_id] = int(values.pop()) - max_tps = _compute_max_tps(run_data) + model_runs: dict[str, list[dict[str, Any]]] = {} + for (system_id, model), runs in groups.items(): + model_runs.setdefault(model, []).extend(runs) + max_tps_by_model: dict[str, float | None] = { + model: _compute_max_tps(runs) for model, runs in model_runs.items() + } written_systems: set[str] = set() for (system_id, model), runs in groups.items(): @@ -120,7 +125,7 @@ def build_submission_folder( _write_system_description(submission_dir, system_id, runs[0]["system_info"]) written_systems.add(system_id) _write_pareto_entries( - submission_dir, system_id, model, runs, system_max_concurrency[system_id], max_tps + submission_dir, system_id, model, runs, system_max_concurrency[system_id], max_tps_by_model[model] ) # Copy src/ for Standardized division submissions (mirrors documentation/ handling) @@ -360,7 +365,7 @@ def _write_pareto_entries( points_dir = model_dir / "points" if run_type == "accuracy": result_dir = model_dir / "results" / f"point_{concurrency}" / "accuracy" - yaml_dir = result_dir + yaml_dir = points_dir else: result_dir = model_dir / "results" / f"point_{concurrency}" yaml_dir = points_dir @@ -442,10 +447,10 @@ def _write_pareto_entries( content = json.dumps(metadata, indent=2).encode() except (json.JSONDecodeError, TypeError, ValueError): pass - if rel_path == "results.json": + if rel_path in ("results.json", "accuracy/results.json"): content = _truncate_responses(content) dest_rel = ( - rel_path[len(_acc_prefix) :] + rel_path[len(_acc_prefix):] if run_type == "accuracy" and rel_path.startswith(_acc_prefix) else rel_path ) @@ -453,10 +458,6 @@ def _write_pareto_entries( dest.parent.mkdir(parents=True, exist_ok=True) dest.write_bytes(content) - # Fallback: if the archive had no accuracy/results.json, derive from results.json - if "accuracy/results.json" not in extra_files: - _write_accuracy_fallback(result_dir, run) - _RESPONSES_LIMIT = 10 * 1024 # 10 KB @@ -483,33 +484,6 @@ def _truncate_responses(content: bytes) -> bytes: return json.dumps(data, indent=2).encode() -def _write_accuracy_fallback(result_dir: Path, run: dict[str, Any]) -> None: - """Write per-point accuracy/ files from results.json when the archive has no accuracy/ dir. - - Called only when accuracy/results.json is absent from the run's extra_files - (i.e. the run archive did not include an accuracy/ directory). Sources accuracy_scores - from results.json. - - The written results.json format: - {"": {"score": {...}, "num_samples": N, ...}} - """ - results_bytes = run.get("_extra_files", {}).get("results.json") - if not results_bytes: - return - try: - parsed = json.loads(results_bytes) - except json.JSONDecodeError: - return - accuracy_scores: dict[str, Any] | None = parsed.get("accuracy_scores") - if not accuracy_scores: - return - - accuracy_dir = result_dir / "accuracy" - accuracy_dir.mkdir(parents=True, exist_ok=True) - (accuracy_dir / "results.json").write_text( - json.dumps(accuracy_scores, indent=2), encoding="utf-8" - ) - def _write_documentation(submission_dir: Path, run_data: list[dict[str, Any]]) -> None: """Merge documentation files from all runs into submission_dir/documentation/.""" @@ -533,7 +507,7 @@ def _write_src(submission_dir: Path, run_data: list[dict[str, Any]]) -> None: for rel, content in run.get("_extra_files", {}).items(): if not rel.startswith("src/"): continue - dest = submission_dir / rel + dest = src_model_dir / Path(rel).relative_to("src") dest.parent.mkdir(parents=True, exist_ok=True) dest.write_bytes(content) diff --git a/tests/endpoints_submission_cli/submissions/test_builder.py b/tests/endpoints_submission_cli/submissions/test_builder.py index a41e0e2..9f3125e 100644 --- a/tests/endpoints_submission_cli/submissions/test_builder.py +++ b/tests/endpoints_submission_cli/submissions/test_builder.py @@ -87,9 +87,10 @@ def test_log_summary_created(self, run_archive: Path, tmp_path: Path) -> None: assert "n_samples_completed" in data assert "duration_ns" in data - def test_accuracy_file_created(self, run_archive: Path, tmp_path: Path) -> None: + def test_accuracy_file_created(self, run_folder: Path, tmp_path: Path) -> None: + acc_archive = self._make_archive(run_folder, tmp_path, "acc", 4, "accuracy") sub_dir = build_submission_folder( - [("run-001", run_archive)], "standardized", "available", tmp_path + [("run-001", acc_archive)], "standardized", "available", tmp_path / "sub" ) acc_jsons = list(sub_dir.rglob("accuracy/results.json")) assert len(acc_jsons) == 1 @@ -223,13 +224,9 @@ def test_perf_and_accuracy_same_concurrency_ok(self, run_folder: Path, tmp_path: "available", tmp_path / "sub", ) - assert (sub_dir / "pareto").glob("*/*/points/point_4.yaml").__next__().exists() - assert ( - (sub_dir / "pareto") - .glob("*/*/results/point_4/accuracy/point_4.yaml") - .__next__() - .exists() - ) + assert list((sub_dir / "pareto").glob("*/*/points/point_4.yaml")) + assert list((sub_dir / "pareto").glob("*/*/results/point_4/accuracy/results.json")) + assert not list((sub_dir / "pareto").glob("*/*/results/point_4/accuracy/point_4.yaml")) def test_accuracy_run_routed_to_accuracy(self, run_folder: Path, tmp_path: Path) -> None: a_acc = self._make_archive(run_folder, tmp_path, "acc", 4, "accuracy") @@ -240,9 +237,9 @@ def test_accuracy_run_routed_to_accuracy(self, run_folder: Path, tmp_path: Path) assert len(model_dirs) == 1 model_dir = model_dirs[0] acc_dir = model_dir / "results" / "point_4" / "accuracy" - assert (acc_dir / "point_4.yaml").exists() + assert not (acc_dir / "point_4.yaml").exists() assert (acc_dir / "results_summary.json").exists() - assert not (model_dir / "points" / "point_4.yaml").exists() + assert (model_dir / "points" / "point_4.yaml").exists() assert not (model_dir / "results" / "point_4" / "results_summary.json").exists() @@ -641,3 +638,92 @@ def test_empty_string(self) -> None: def test_long_name_truncated(self) -> None: long = "A" * 100 assert len(_slugify(long)) <= 64 + + +@pytest.mark.unit +class TestAccuracyResultsTruncation: + """Tests for accuracy/results.json truncation across three workflow scenarios.""" + + def _make_accuracy_archive( + self, + run_folder: Path, + tmp_path: Path, + name: str, + results_content: dict | None = None, + acc_results_content: dict | None = None, + ) -> Path: + import shutil + + folder = tmp_path / name + shutil.copytree(run_folder, folder) + cfg = yaml.safe_load((folder / "config.yaml").read_text()) + cfg["datasets"][0]["type"] = "accuracy" + (folder / "config.yaml").write_text(yaml.dump(cfg)) + if results_content is not None: + (folder / "results.json").write_text(json.dumps(results_content)) + if acc_results_content is not None: + (folder / "accuracy").mkdir(exist_ok=True) + (folder / "accuracy" / "results.json").write_text(json.dumps(acc_results_content)) + archive = tmp_path / f"{name}.tar.gz" + with tarfile.open(archive, "w:gz") as tar: + tar.add(folder, arcname=name) + return archive + + def _big_results(self, count: int = 5000) -> dict: + return { + "config": {}, + "accuracy_scores": {"score": 0.45}, + "responses": [{"text": "a" * 100, "idx": i} for i in range(count)], + } + + def test_no_accuracy_subdir_truncates_responses( + self, run_folder: Path, tmp_path: Path + ) -> None: + """Workflow 1/3: archive has only root results.json; responses written truncated to accuracy/results.json.""" + archive = self._make_accuracy_archive( + run_folder, tmp_path, "wf1", results_content=self._big_results() + ) + sub_dir = build_submission_folder( + [("run-001", archive)], "standardized", "available", tmp_path / "sub" + ) + acc_files = list(sub_dir.rglob("accuracy/results.json")) + assert len(acc_files) == 1 + written = json.loads(acc_files[0].read_text()) + assert len(json.dumps(written["responses"]).encode()) <= 10 * 1024 + + def test_accuracy_subdir_large_responses_truncated( + self, run_folder: Path, tmp_path: Path + ) -> None: + """Workflow 2: archive has accuracy/results.json with large responses; truncated on write.""" + archive = self._make_accuracy_archive( + run_folder, tmp_path, "wf2", acc_results_content=self._big_results() + ) + sub_dir = build_submission_folder( + [("run-001", archive)], "standardized", "available", tmp_path / "sub" + ) + acc_files = list(sub_dir.rglob("accuracy/results.json")) + assert len(acc_files) == 1 + written = json.loads(acc_files[0].read_text()) + assert len(json.dumps(written["responses"]).encode()) <= 10 * 1024 + + def test_accuracy_subdir_already_small_written_intact( + self, run_folder: Path, tmp_path: Path + ) -> None: + """Workflow 3: archive has accuracy/results.json already within 10 KB; written without corruption.""" + small_responses = [{"text": "hi", "idx": 0}] + acc_results = { + "config": {}, + "accuracy_scores": {"score": 0.45}, + "responses": small_responses, + } + archive = self._make_accuracy_archive( + run_folder, tmp_path, "wf3", acc_results_content=acc_results + ) + sub_dir = build_submission_folder( + [("run-001", archive)], "standardized", "available", tmp_path / "sub" + ) + acc_files = list(sub_dir.rglob("accuracy/results.json")) + assert len(acc_files) == 1 + written = json.loads(acc_files[0].read_text()) + assert written["responses"] == small_responses + assert written["accuracy_scores"] == acc_results["accuracy_scores"] From 688930095832cddd0ce784093716f01551844c0a Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Wed, 24 Jun 2026 21:55:02 +0530 Subject: [PATCH 2/3] Fix ruff lint errors in builder.py - Rename unused loop variable system_id to _system_id (B007) - Wrap long _write_pareto_entries call to stay under 100 chars (E501) Co-Authored-By: Claude Sonnet 4.6 --- src/endpoints_submission_cli/submissions/builder.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/endpoints_submission_cli/submissions/builder.py b/src/endpoints_submission_cli/submissions/builder.py index 2aadee1..04106a3 100644 --- a/src/endpoints_submission_cli/submissions/builder.py +++ b/src/endpoints_submission_cli/submissions/builder.py @@ -113,7 +113,7 @@ def build_submission_folder( system_max_concurrency[system_id] = int(values.pop()) model_runs: dict[str, list[dict[str, Any]]] = {} - for (system_id, model), runs in groups.items(): + for (_system_id, model), runs in groups.items(): model_runs.setdefault(model, []).extend(runs) max_tps_by_model: dict[str, float | None] = { model: _compute_max_tps(runs) for model, runs in model_runs.items() @@ -125,7 +125,8 @@ def build_submission_folder( _write_system_description(submission_dir, system_id, runs[0]["system_info"]) written_systems.add(system_id) _write_pareto_entries( - submission_dir, system_id, model, runs, system_max_concurrency[system_id], max_tps_by_model[model] + submission_dir, system_id, model, runs, + system_max_concurrency[system_id], max_tps_by_model[model], ) # Copy src/ for Standardized division submissions (mirrors documentation/ handling) From 19afa0ab105aabf7935bea807d0d4620f3662e94 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Wed, 24 Jun 2026 22:07:48 +0530 Subject: [PATCH 3/3] Revert yaml_dir for accuracy runs: point_N.yaml goes into accuracy/ For accuracy runs yaml_dir should be result_dir (.../accuracy/) so that point_N.yaml is co-located with the accuracy results, not in points/. Performance runs continue to write their YAML to points/. Update test assertions to match the correct layout. Co-Authored-By: Claude Sonnet 4.6 --- src/endpoints_submission_cli/submissions/builder.py | 2 +- tests/endpoints_submission_cli/submissions/test_builder.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/endpoints_submission_cli/submissions/builder.py b/src/endpoints_submission_cli/submissions/builder.py index 04106a3..3ccd221 100644 --- a/src/endpoints_submission_cli/submissions/builder.py +++ b/src/endpoints_submission_cli/submissions/builder.py @@ -366,7 +366,7 @@ def _write_pareto_entries( points_dir = model_dir / "points" if run_type == "accuracy": result_dir = model_dir / "results" / f"point_{concurrency}" / "accuracy" - yaml_dir = points_dir + yaml_dir = result_dir else: result_dir = model_dir / "results" / f"point_{concurrency}" yaml_dir = points_dir diff --git a/tests/endpoints_submission_cli/submissions/test_builder.py b/tests/endpoints_submission_cli/submissions/test_builder.py index 9f3125e..c74c535 100644 --- a/tests/endpoints_submission_cli/submissions/test_builder.py +++ b/tests/endpoints_submission_cli/submissions/test_builder.py @@ -226,7 +226,7 @@ def test_perf_and_accuracy_same_concurrency_ok(self, run_folder: Path, tmp_path: ) assert list((sub_dir / "pareto").glob("*/*/points/point_4.yaml")) assert list((sub_dir / "pareto").glob("*/*/results/point_4/accuracy/results.json")) - assert not list((sub_dir / "pareto").glob("*/*/results/point_4/accuracy/point_4.yaml")) + assert list((sub_dir / "pareto").glob("*/*/results/point_4/accuracy/point_4.yaml")) def test_accuracy_run_routed_to_accuracy(self, run_folder: Path, tmp_path: Path) -> None: a_acc = self._make_archive(run_folder, tmp_path, "acc", 4, "accuracy") @@ -237,9 +237,9 @@ def test_accuracy_run_routed_to_accuracy(self, run_folder: Path, tmp_path: Path) assert len(model_dirs) == 1 model_dir = model_dirs[0] acc_dir = model_dir / "results" / "point_4" / "accuracy" - assert not (acc_dir / "point_4.yaml").exists() + assert (acc_dir / "point_4.yaml").exists() assert (acc_dir / "results_summary.json").exists() - assert (model_dir / "points" / "point_4.yaml").exists() + assert not (model_dir / "points" / "point_4.yaml").exists() assert not (model_dir / "results" / "point_4" / "results_summary.json").exists()