From c975b9c911cdd945d01f517afa7b6a8e30ff2762 Mon Sep 17 00:00:00 2001
From: anandhu-eng <anandhusooraj011@gmail.com>
Date: Wed, 24 Jun 2026 21:49:47 +0530
Subject: [PATCH 1/3] Fix multi-model accuracy submission bugs and update tests

- builder.py: apply _truncate_responses to accuracy/results.json in
  addition to root results.json; remove _write_accuracy_fallback (it
  overwrote the truncated file with bare accuracy_scores); fix yaml_dir
  for accuracy runs so point_N.yaml lands in points/ not accuracy/;
  fix _write_src to namespace files under src/<model>/ to prevent
  multi-model collisions

- test_builder.py: fix three stale tests that asserted old YAML-in-
  accuracy-dir behavior; add TestAccuracyResultsTruncation covering the
  three accuracy workflow scenarios (no accuracy/ subdir, large
  accuracy/results.json, already-small accuracy/results.json)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../submissions/builder.py                    |  48 ++------
 .../submissions/test_builder.py               | 108 ++++++++++++++++--
 2 files changed, 108 insertions(+), 48 deletions(-)
diff --git a/src/endpoints_submission_cli/submissions/builder.py b/src/endpoints_submission_cli/submissions/builder.py
index 31e1b36..2aadee1 100644
--- a/src/endpoints_submission_cli/submissions/builder.py
+++ b/src/endpoints_submission_cli/submissions/builder.py
@@ -112,7 +112,12 @@ def build_submission_folder(
             )
         system_max_concurrency[system_id] = int(values.pop())
 
-    max_tps = _compute_max_tps(run_data)
+    model_runs: dict[str, list[dict[str, Any]]] = {}
+    for (system_id, model), runs in groups.items():
+        model_runs.setdefault(model, []).extend(runs)
+    max_tps_by_model: dict[str, float | None] = {
+        model: _compute_max_tps(runs) for model, runs in model_runs.items()
+    }
 
     written_systems: set[str] = set()
     for (system_id, model), runs in groups.items():
@@ -120,7 +125,7 @@ def build_submission_folder(
             _write_system_description(submission_dir, system_id, runs[0]["system_info"])
             written_systems.add(system_id)
         _write_pareto_entries(
-            submission_dir, system_id, model, runs, system_max_concurrency[system_id], max_tps
+            submission_dir, system_id, model, runs, system_max_concurrency[system_id], max_tps_by_model[model]
         )
 
     # Copy src/ for Standardized division submissions (mirrors documentation/ handling)
@@ -360,7 +365,7 @@ def _write_pareto_entries(
         points_dir = model_dir / "points"
         if run_type == "accuracy":
             result_dir = model_dir / "results" / f"point_{concurrency}" / "accuracy"
-            yaml_dir = result_dir
+            yaml_dir = points_dir
         else:
             result_dir = model_dir / "results" / f"point_{concurrency}"
             yaml_dir = points_dir
@@ -442,10 +447,10 @@ def _write_pareto_entries(
                         content = json.dumps(metadata, indent=2).encode()
                 except (json.JSONDecodeError, TypeError, ValueError):
                     pass
-            if rel_path == "results.json":
+            if rel_path in ("results.json", "accuracy/results.json"):
                 content = _truncate_responses(content)
             dest_rel = (
-                rel_path[len(_acc_prefix) :]
+                rel_path[len(_acc_prefix):]
                 if run_type == "accuracy" and rel_path.startswith(_acc_prefix)
                 else rel_path
             )
@@ -453,10 +458,6 @@ def _write_pareto_entries(
             dest.parent.mkdir(parents=True, exist_ok=True)
             dest.write_bytes(content)
 
-        # Fallback: if the archive had no accuracy/results.json, derive from results.json
-        if "accuracy/results.json" not in extra_files:
-            _write_accuracy_fallback(result_dir, run)
-
 
 _RESPONSES_LIMIT = 10 * 1024  # 10 KB
 
@@ -483,33 +484,6 @@ def _truncate_responses(content: bytes) -> bytes:
     return json.dumps(data, indent=2).encode()
 
 
-def _write_accuracy_fallback(result_dir: Path, run: dict[str, Any]) -> None:
-    """Write per-point accuracy/ files from results.json when the archive has no accuracy/ dir.
-
-    Called only when accuracy/results.json is absent from the run's extra_files
-    (i.e. the run archive did not include an accuracy/ directory). Sources accuracy_scores
-    from results.json.
-
-    The written results.json format:
-        {"<dataset_name>": {"score": {...}, "num_samples": N, ...}}
-    """
-    results_bytes = run.get("_extra_files", {}).get("results.json")
-    if not results_bytes:
-        return
-    try:
-        parsed = json.loads(results_bytes)
-    except json.JSONDecodeError:
-        return
-    accuracy_scores: dict[str, Any] | None = parsed.get("accuracy_scores")
-    if not accuracy_scores:
-        return
-
-    accuracy_dir = result_dir / "accuracy"
-    accuracy_dir.mkdir(parents=True, exist_ok=True)
-    (accuracy_dir / "results.json").write_text(
-        json.dumps(accuracy_scores, indent=2), encoding="utf-8"
-    )
-
 
 def _write_documentation(submission_dir: Path, run_data: list[dict[str, Any]]) -> None:
     """Merge documentation files from all runs into submission_dir/documentation/."""
@@ -533,7 +507,7 @@ def _write_src(submission_dir: Path, run_data: list[dict[str, Any]]) -> None:
         for rel, content in run.get("_extra_files", {}).items():
             if not rel.startswith("src/"):
                 continue
-            dest = submission_dir / rel
+            dest = src_model_dir / Path(rel).relative_to("src")
             dest.parent.mkdir(parents=True, exist_ok=True)
             dest.write_bytes(content)
 
diff --git a/tests/endpoints_submission_cli/submissions/test_builder.py b/tests/endpoints_submission_cli/submissions/test_builder.py
index a41e0e2..9f3125e 100644
--- a/tests/endpoints_submission_cli/submissions/test_builder.py
+++ b/tests/endpoints_submission_cli/submissions/test_builder.py
@@ -87,9 +87,10 @@ def test_log_summary_created(self, run_archive: Path, tmp_path: Path) -> None:
         assert "n_samples_completed" in data
         assert "duration_ns" in data
 
-    def test_accuracy_file_created(self, run_archive: Path, tmp_path: Path) -> None:
+    def test_accuracy_file_created(self, run_folder: Path, tmp_path: Path) -> None:
+        acc_archive = self._make_archive(run_folder, tmp_path, "acc", 4, "accuracy")
         sub_dir = build_submission_folder(
-            [("run-001", run_archive)], "standardized", "available", tmp_path
+            [("run-001", acc_archive)], "standardized", "available", tmp_path / "sub"
         )
         acc_jsons = list(sub_dir.rglob("accuracy/results.json"))
         assert len(acc_jsons) == 1
@@ -223,13 +224,9 @@ def test_perf_and_accuracy_same_concurrency_ok(self, run_folder: Path, tmp_path:
             "available",
             tmp_path / "sub",
         )
-        assert (sub_dir / "pareto").glob("*/*/points/point_4.yaml").__next__().exists()
-        assert (
-            (sub_dir / "pareto")
-            .glob("*/*/results/point_4/accuracy/point_4.yaml")
-            .__next__()
-            .exists()
-        )
+        assert list((sub_dir / "pareto").glob("*/*/points/point_4.yaml"))
+        assert list((sub_dir / "pareto").glob("*/*/results/point_4/accuracy/results.json"))
+        assert not list((sub_dir / "pareto").glob("*/*/results/point_4/accuracy/point_4.yaml"))
 
     def test_accuracy_run_routed_to_accuracy(self, run_folder: Path, tmp_path: Path) -> None:
         a_acc = self._make_archive(run_folder, tmp_path, "acc", 4, "accuracy")
@@ -240,9 +237,9 @@ def test_accuracy_run_routed_to_accuracy(self, run_folder: Path, tmp_path: Path)
         assert len(model_dirs) == 1
         model_dir = model_dirs[0]
         acc_dir = model_dir / "results" / "point_4" / "accuracy"
-        assert (acc_dir / "point_4.yaml").exists()
+        assert not (acc_dir / "point_4.yaml").exists()
         assert (acc_dir / "results_summary.json").exists()
-        assert not (model_dir / "points" / "point_4.yaml").exists()
+        assert (model_dir / "points" / "point_4.yaml").exists()
         assert not (model_dir / "results" / "point_4" / "results_summary.json").exists()
 
 
@@ -641,3 +638,92 @@ def test_empty_string(self) -> None:
     def test_long_name_truncated(self) -> None:
         long = "A" * 100
         assert len(_slugify(long)) <= 64
+
+
+@pytest.mark.unit
+class TestAccuracyResultsTruncation:
+    """Tests for accuracy/results.json truncation across three workflow scenarios."""
+
+    def _make_accuracy_archive(
+        self,
+        run_folder: Path,
+        tmp_path: Path,
+        name: str,
+        results_content: dict | None = None,
+        acc_results_content: dict | None = None,
+    ) -> Path:
+        import shutil
+
+        folder = tmp_path / name
+        shutil.copytree(run_folder, folder)
+        cfg = yaml.safe_load((folder / "config.yaml").read_text())
+        cfg["datasets"][0]["type"] = "accuracy"
+        (folder / "config.yaml").write_text(yaml.dump(cfg))
+        if results_content is not None:
+            (folder / "results.json").write_text(json.dumps(results_content))
+        if acc_results_content is not None:
+            (folder / "accuracy").mkdir(exist_ok=True)
+            (folder / "accuracy" / "results.json").write_text(json.dumps(acc_results_content))
+        archive = tmp_path / f"{name}.tar.gz"
+        with tarfile.open(archive, "w:gz") as tar:
+            tar.add(folder, arcname=name)
+        return archive
+
+    def _big_results(self, count: int = 5000) -> dict:
+        return {
+            "config": {},
+            "accuracy_scores": {"score": 0.45},
+            "responses": [{"text": "a" * 100, "idx": i} for i in range(count)],
+        }
+
+    def test_no_accuracy_subdir_truncates_responses(
+        self, run_folder: Path, tmp_path: Path
+    ) -> None:
+        """Workflow 1/3: archive has only root results.json; responses written truncated to accuracy/results.json."""
+        archive = self._make_accuracy_archive(
+            run_folder, tmp_path, "wf1", results_content=self._big_results()
+        )
+        sub_dir = build_submission_folder(
+            [("run-001", archive)], "standardized", "available", tmp_path / "sub"
+        )
+        acc_files = list(sub_dir.rglob("accuracy/results.json"))
+        assert len(acc_files) == 1
+        written = json.loads(acc_files[0].read_text())
+        assert len(json.dumps(written["responses"]).encode()) <= 10 * 1024
+
+    def test_accuracy_subdir_large_responses_truncated(
+        self, run_folder: Path, tmp_path: Path
+    ) -> None:
+        """Workflow 2: archive has accuracy/results.json with large responses; truncated on write."""
+        archive = self._make_accuracy_archive(
+            run_folder, tmp_path, "wf2", acc_results_content=self._big_results()
+        )
+        sub_dir = build_submission_folder(
+            [("run-001", archive)], "standardized", "available", tmp_path / "sub"
+        )
+        acc_files = list(sub_dir.rglob("accuracy/results.json"))
+        assert len(acc_files) == 1
+        written = json.loads(acc_files[0].read_text())
+        assert len(json.dumps(written["responses"]).encode()) <= 10 * 1024
+
+    def test_accuracy_subdir_already_small_written_intact(
+        self, run_folder: Path, tmp_path: Path
+    ) -> None:
+        """Workflow 3: archive has accuracy/results.json already within 10 KB; written without corruption."""
+        small_responses = [{"text": "hi", "idx": 0}]
+        acc_results = {
+            "config": {},
+            "accuracy_scores": {"score": 0.45},
+            "responses": small_responses,
+        }
+        archive = self._make_accuracy_archive(
+            run_folder, tmp_path, "wf3", acc_results_content=acc_results
+        )
+        sub_dir = build_submission_folder(
+            [("run-001", archive)], "standardized", "available", tmp_path / "sub"
+        )
+        acc_files = list(sub_dir.rglob("accuracy/results.json"))
+        assert len(acc_files) == 1
+        written = json.loads(acc_files[0].read_text())
+        assert written["responses"] == small_responses
+        assert written["accuracy_scores"] == acc_results["accuracy_scores"]

From 688930095832cddd0ce784093716f01551844c0a Mon Sep 17 00:00:00 2001
From: anandhu-eng <anandhusooraj011@gmail.com>
Date: Wed, 24 Jun 2026 21:55:02 +0530
Subject: [PATCH 2/3] Fix ruff lint errors in builder.py

- Rename unused loop variable system_id to _system_id (B007)
- Wrap long _write_pareto_entries call to stay under 100 chars (E501)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/endpoints_submission_cli/submissions/builder.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/endpoints_submission_cli/submissions/builder.py b/src/endpoints_submission_cli/submissions/builder.py
index 2aadee1..04106a3 100644
--- a/src/endpoints_submission_cli/submissions/builder.py
+++ b/src/endpoints_submission_cli/submissions/builder.py
@@ -113,7 +113,7 @@ def build_submission_folder(
         system_max_concurrency[system_id] = int(values.pop())
 
     model_runs: dict[str, list[dict[str, Any]]] = {}
-    for (system_id, model), runs in groups.items():
+    for (_system_id, model), runs in groups.items():
         model_runs.setdefault(model, []).extend(runs)
     max_tps_by_model: dict[str, float | None] = {
         model: _compute_max_tps(runs) for model, runs in model_runs.items()
@@ -125,7 +125,8 @@ def build_submission_folder(
             _write_system_description(submission_dir, system_id, runs[0]["system_info"])
             written_systems.add(system_id)
         _write_pareto_entries(
-            submission_dir, system_id, model, runs, system_max_concurrency[system_id], max_tps_by_model[model]
+            submission_dir, system_id, model, runs,
+            system_max_concurrency[system_id], max_tps_by_model[model],
         )
 
     # Copy src/ for Standardized division submissions (mirrors documentation/ handling)

From 19afa0ab105aabf7935bea807d0d4620f3662e94 Mon Sep 17 00:00:00 2001
From: anandhu-eng <anandhusooraj011@gmail.com>
Date: Wed, 24 Jun 2026 22:07:48 +0530
Subject: [PATCH 3/3] Revert yaml_dir for accuracy runs: point_N.yaml goes into
 accuracy/

For accuracy runs yaml_dir should be result_dir (.../accuracy/) so that
point_N.yaml is co-located with the accuracy results, not in points/.
Performance runs continue to write their YAML to points/.

Update test assertions to match the correct layout.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/endpoints_submission_cli/submissions/builder.py        | 2 +-
 tests/endpoints_submission_cli/submissions/test_builder.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/endpoints_submission_cli/submissions/builder.py b/src/endpoints_submission_cli/submissions/builder.py
index 04106a3..3ccd221 100644
--- a/src/endpoints_submission_cli/submissions/builder.py
+++ b/src/endpoints_submission_cli/submissions/builder.py
@@ -366,7 +366,7 @@ def _write_pareto_entries(
         points_dir = model_dir / "points"
         if run_type == "accuracy":
             result_dir = model_dir / "results" / f"point_{concurrency}" / "accuracy"
-            yaml_dir = points_dir
+            yaml_dir = result_dir
         else:
             result_dir = model_dir / "results" / f"point_{concurrency}"
             yaml_dir = points_dir
diff --git a/tests/endpoints_submission_cli/submissions/test_builder.py b/tests/endpoints_submission_cli/submissions/test_builder.py
index 9f3125e..c74c535 100644
--- a/tests/endpoints_submission_cli/submissions/test_builder.py
+++ b/tests/endpoints_submission_cli/submissions/test_builder.py
@@ -226,7 +226,7 @@ def test_perf_and_accuracy_same_concurrency_ok(self, run_folder: Path, tmp_path:
         )
         assert list((sub_dir / "pareto").glob("*/*/points/point_4.yaml"))
         assert list((sub_dir / "pareto").glob("*/*/results/point_4/accuracy/results.json"))
-        assert not list((sub_dir / "pareto").glob("*/*/results/point_4/accuracy/point_4.yaml"))
+        assert list((sub_dir / "pareto").glob("*/*/results/point_4/accuracy/point_4.yaml"))
 
     def test_accuracy_run_routed_to_accuracy(self, run_folder: Path, tmp_path: Path) -> None:
         a_acc = self._make_archive(run_folder, tmp_path, "acc", 4, "accuracy")
@@ -237,9 +237,9 @@ def test_accuracy_run_routed_to_accuracy(self, run_folder: Path, tmp_path: Path)
         assert len(model_dirs) == 1
         model_dir = model_dirs[0]
         acc_dir = model_dir / "results" / "point_4" / "accuracy"
-        assert not (acc_dir / "point_4.yaml").exists()
+        assert (acc_dir / "point_4.yaml").exists()
         assert (acc_dir / "results_summary.json").exists()
-        assert (model_dir / "points" / "point_4.yaml").exists()
+        assert not (model_dir / "points" / "point_4.yaml").exists()
         assert not (model_dir / "results" / "point_4" / "results_summary.json").exists()