mlcommons · arav-agarwal2 · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026
diff --git a/src/endpoints_submission_cli/submissions/builder.py b/src/endpoints_submission_cli/submissions/builder.py
@@ -112,15 +112,21 @@ def build_submission_folder(
             )
         system_max_concurrency[system_id] = int(values.pop())
 
-    max_tps = _compute_max_tps(run_data)
+    model_runs: dict[str, list[dict[str, Any]]] = {}
+    for (_system_id, model), runs in groups.items():
+        model_runs.setdefault(model, []).extend(runs)
+    max_tps_by_model: dict[str, float | None] = {
+        model: _compute_max_tps(runs) for model, runs in model_runs.items()
+    }
 
     written_systems: set[str] = set()
     for (system_id, model), runs in groups.items():
         if system_id not in written_systems:
             _write_system_description(submission_dir, system_id, runs[0]["system_info"])
             written_systems.add(system_id)
         _write_pareto_entries(
-            submission_dir, system_id, model, runs, system_max_concurrency[system_id], max_tps
+            submission_dir, system_id, model, runs,
+            system_max_concurrency[system_id], max_tps_by_model[model],
         )
 
     # Copy src/ for Standardized division submissions (mirrors documentation/ handling)
@@ -442,21 +448,17 @@ def _write_pareto_entries(
                         content = json.dumps(metadata, indent=2).encode()
                 except (json.JSONDecodeError, TypeError, ValueError):
                     pass
-            if rel_path == "results.json":
+            if rel_path in ("results.json", "accuracy/results.json"):
                 content = _truncate_responses(content)
             dest_rel = (
-                rel_path[len(_acc_prefix) :]
+                rel_path[len(_acc_prefix):]
                 if run_type == "accuracy" and rel_path.startswith(_acc_prefix)
                 else rel_path
             )
             dest = result_dir / dest_rel
             dest.parent.mkdir(parents=True, exist_ok=True)
             dest.write_bytes(content)
 
-        # Fallback: if the archive had no accuracy/results.json, derive from results.json
-        if "accuracy/results.json" not in extra_files:
-            _write_accuracy_fallback(result_dir, run)
-
 
 _RESPONSES_LIMIT = 10 * 1024  # 10 KB
 
@@ -483,33 +485,6 @@ def _truncate_responses(content: bytes) -> bytes:
     return json.dumps(data, indent=2).encode()
 
 
-def _write_accuracy_fallback(result_dir: Path, run: dict[str, Any]) -> None:
-    """Write per-point accuracy/ files from results.json when the archive has no accuracy/ dir.
-
-    Called only when accuracy/results.json is absent from the run's extra_files
-    (i.e. the run archive did not include an accuracy/ directory). Sources accuracy_scores
-    from results.json.
-
-    The written results.json format:
-        {"<dataset_name>": {"score": {...}, "num_samples": N, ...}}
-    """
-    results_bytes = run.get("_extra_files", {}).get("results.json")
-    if not results_bytes:
-        return
-    try:
-        parsed = json.loads(results_bytes)
-    except json.JSONDecodeError:
-        return
-    accuracy_scores: dict[str, Any] | None = parsed.get("accuracy_scores")
-    if not accuracy_scores:
-        return
-
-    accuracy_dir = result_dir / "accuracy"
-    accuracy_dir.mkdir(parents=True, exist_ok=True)
-    (accuracy_dir / "results.json").write_text(
-        json.dumps(accuracy_scores, indent=2), encoding="utf-8"
-    )
-
 
 def _write_documentation(submission_dir: Path, run_data: list[dict[str, Any]]) -> None:
     """Merge documentation files from all runs into submission_dir/documentation/."""
@@ -533,7 +508,7 @@ def _write_src(submission_dir: Path, run_data: list[dict[str, Any]]) -> None:
         for rel, content in run.get("_extra_files", {}).items():
             if not rel.startswith("src/"):
                 continue
-            dest = submission_dir / rel
+            dest = src_model_dir / Path(rel).relative_to("src")
             dest.parent.mkdir(parents=True, exist_ok=True)
             dest.write_bytes(content)
 

diff --git a/tests/endpoints_submission_cli/submissions/test_builder.py b/tests/endpoints_submission_cli/submissions/test_builder.py
@@ -87,9 +87,10 @@ def test_log_summary_created(self, run_archive: Path, tmp_path: Path) -> None:
         assert "n_samples_completed" in data
         assert "duration_ns" in data
 
-    def test_accuracy_file_created(self, run_archive: Path, tmp_path: Path) -> None:
+    def test_accuracy_file_created(self, run_folder: Path, tmp_path: Path) -> None:
+        acc_archive = self._make_archive(run_folder, tmp_path, "acc", 4, "accuracy")
         sub_dir = build_submission_folder(
-            [("run-001", run_archive)], "standardized", "available", tmp_path
+            [("run-001", acc_archive)], "standardized", "available", tmp_path / "sub"
         )
         acc_jsons = list(sub_dir.rglob("accuracy/results.json"))
         assert len(acc_jsons) == 1
@@ -223,13 +224,9 @@ def test_perf_and_accuracy_same_concurrency_ok(self, run_folder: Path, tmp_path:
             "available",
             tmp_path / "sub",
         )
-        assert (sub_dir / "pareto").glob("*/*/points/point_4.yaml").__next__().exists()
-        assert (
-            (sub_dir / "pareto")
-            .glob("*/*/results/point_4/accuracy/point_4.yaml")
-            .__next__()
-            .exists()
-        )
+        assert list((sub_dir / "pareto").glob("*/*/points/point_4.yaml"))
+        assert list((sub_dir / "pareto").glob("*/*/results/point_4/accuracy/results.json"))
+        assert list((sub_dir / "pareto").glob("*/*/results/point_4/accuracy/point_4.yaml"))
 
     def test_accuracy_run_routed_to_accuracy(self, run_folder: Path, tmp_path: Path) -> None:
         a_acc = self._make_archive(run_folder, tmp_path, "acc", 4, "accuracy")
@@ -641,3 +638,92 @@ def test_empty_string(self) -> None:
     def test_long_name_truncated(self) -> None:
         long = "A" * 100
         assert len(_slugify(long)) <= 64
+
+
+@pytest.mark.unit
+class TestAccuracyResultsTruncation:
+    """Tests for truncation of "responses" in the written accuracy/results.json across three archive layouts."""
+
+    def _make_accuracy_archive(
+        self,
+        run_folder: Path,
+        tmp_path: Path,
+        name: str,
+        results_content: dict | None = None,
+        acc_results_content: dict | None = None,
+    ) -> Path:
+        import shutil
+
+        folder = tmp_path / name
+        shutil.copytree(run_folder, folder)  # edit a copy so run_folder itself is left untouched
+        cfg = yaml.safe_load((folder / "config.yaml").read_text())
+        cfg["datasets"][0]["type"] = "accuracy"  # presumably what makes the builder treat this as an accuracy run
+        (folder / "config.yaml").write_text(yaml.dump(cfg))
+        if results_content is not None:
+            (folder / "results.json").write_text(json.dumps(results_content))  # overrides the root results.json
+        if acc_results_content is not None:
+            (folder / "accuracy").mkdir(exist_ok=True)
+            (folder / "accuracy" / "results.json").write_text(json.dumps(acc_results_content))
+        archive = tmp_path / f"{name}.tar.gz"
+        with tarfile.open(archive, "w:gz") as tar:
+            tar.add(folder, arcname=name)  # archive members are rooted at <name>/
+        return archive
+
+    def _big_results(self, count: int = 5000) -> dict:
+        return {
+            "config": {},
+            "accuracy_scores": {"score": 0.45},
+            "responses": [{"text": "a" * 100, "idx": i} for i in range(count)],  # >100 bytes each, far over 10 KB by default
+        }
+
+    def test_no_accuracy_subdir_truncates_responses(
+        self, run_folder: Path, tmp_path: Path
+    ) -> None:
+        """Workflow 1: archive has only a root results.json; its responses are written truncated to accuracy/results.json."""
+        archive = self._make_accuracy_archive(
+            run_folder, tmp_path, "wf1", results_content=self._big_results()
+        )
+        sub_dir = build_submission_folder(
+            [("run-001", archive)], "standardized", "available", tmp_path / "sub"
+        )
+        acc_files = list(sub_dir.rglob("accuracy/results.json"))
+        assert len(acc_files) == 1
+        written = json.loads(acc_files[0].read_text())
+        assert len(json.dumps(written["responses"]).encode()) <= 10 * 1024  # 10 KB, same value as builder's _RESPONSES_LIMIT
+
+    def test_accuracy_subdir_large_responses_truncated(
+        self, run_folder: Path, tmp_path: Path
+    ) -> None:
+        """Workflow 2: archive ships accuracy/results.json with oversized responses; they are truncated on write."""
+        archive = self._make_accuracy_archive(
+            run_folder, tmp_path, "wf2", acc_results_content=self._big_results()
+        )
+        sub_dir = build_submission_folder(
+            [("run-001", archive)], "standardized", "available", tmp_path / "sub"
+        )
+        acc_files = list(sub_dir.rglob("accuracy/results.json"))
+        assert len(acc_files) == 1
+        written = json.loads(acc_files[0].read_text())
+        assert len(json.dumps(written["responses"]).encode()) <= 10 * 1024  # 10 KB, same value as builder's _RESPONSES_LIMIT
+
+    def test_accuracy_subdir_already_small_written_intact(
+        self, run_folder: Path, tmp_path: Path
+    ) -> None:
+        """Workflow 3: archive ships accuracy/results.json already within 10 KB; responses and scores are written unchanged."""
+        small_responses = [{"text": "hi", "idx": 0}]
+        acc_results = {
+            "config": {},
+            "accuracy_scores": {"score": 0.45},
+            "responses": small_responses,
+        }
+        archive = self._make_accuracy_archive(
+            run_folder, tmp_path, "wf3", acc_results_content=acc_results
+        )
+        sub_dir = build_submission_folder(
+            [("run-001", archive)], "standardized", "available", tmp_path / "sub"
+        )
+        acc_files = list(sub_dir.rglob("accuracy/results.json"))
+        assert len(acc_files) == 1
+        written = json.loads(acc_files[0].read_text())
+        assert written["responses"] == small_responses
+        assert written["accuracy_scores"] == acc_results["accuracy_scores"]