Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 11 additions & 36 deletions src/endpoints_submission_cli/submissions/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,15 +112,21 @@ def build_submission_folder(
)
system_max_concurrency[system_id] = int(values.pop())

max_tps = _compute_max_tps(run_data)
model_runs: dict[str, list[dict[str, Any]]] = {}
for (_system_id, model), runs in groups.items():
model_runs.setdefault(model, []).extend(runs)
max_tps_by_model: dict[str, float | None] = {
model: _compute_max_tps(runs) for model, runs in model_runs.items()
}

written_systems: set[str] = set()
for (system_id, model), runs in groups.items():
if system_id not in written_systems:
_write_system_description(submission_dir, system_id, runs[0]["system_info"])
written_systems.add(system_id)
_write_pareto_entries(
submission_dir, system_id, model, runs, system_max_concurrency[system_id], max_tps
submission_dir, system_id, model, runs,
system_max_concurrency[system_id], max_tps_by_model[model],
)

# Copy src/ for Standardized division submissions (mirrors documentation/ handling)
Expand Down Expand Up @@ -442,21 +448,17 @@ def _write_pareto_entries(
content = json.dumps(metadata, indent=2).encode()
except (json.JSONDecodeError, TypeError, ValueError):
pass
if rel_path == "results.json":
if rel_path in ("results.json", "accuracy/results.json"):
content = _truncate_responses(content)
dest_rel = (
rel_path[len(_acc_prefix) :]
rel_path[len(_acc_prefix):]
if run_type == "accuracy" and rel_path.startswith(_acc_prefix)
else rel_path
)
dest = result_dir / dest_rel
dest.parent.mkdir(parents=True, exist_ok=True)
dest.write_bytes(content)

# Fallback: if the archive had no accuracy/results.json, derive from results.json
if "accuracy/results.json" not in extra_files:
_write_accuracy_fallback(result_dir, run)


_RESPONSES_LIMIT = 10 * 1024 # 10 KB

Expand All @@ -483,33 +485,6 @@ def _truncate_responses(content: bytes) -> bytes:
return json.dumps(data, indent=2).encode()


def _write_accuracy_fallback(result_dir: Path, run: dict[str, Any]) -> None:
"""Write per-point accuracy/ files from results.json when the archive has no accuracy/ dir.

Called only when accuracy/results.json is absent from the run's extra_files
(i.e. the run archive did not include an accuracy/ directory). Sources accuracy_scores
from results.json.

The written results.json format:
{"<dataset_name>": {"score": {...}, "num_samples": N, ...}}
"""
results_bytes = run.get("_extra_files", {}).get("results.json")
if not results_bytes:
return
try:
parsed = json.loads(results_bytes)
except json.JSONDecodeError:
return
accuracy_scores: dict[str, Any] | None = parsed.get("accuracy_scores")
if not accuracy_scores:
return

accuracy_dir = result_dir / "accuracy"
accuracy_dir.mkdir(parents=True, exist_ok=True)
(accuracy_dir / "results.json").write_text(
json.dumps(accuracy_scores, indent=2), encoding="utf-8"
)


def _write_documentation(submission_dir: Path, run_data: list[dict[str, Any]]) -> None:
"""Merge documentation files from all runs into submission_dir/documentation/."""
Expand All @@ -533,7 +508,7 @@ def _write_src(submission_dir: Path, run_data: list[dict[str, Any]]) -> None:
for rel, content in run.get("_extra_files", {}).items():
if not rel.startswith("src/"):
continue
dest = submission_dir / rel
dest = src_model_dir / Path(rel).relative_to("src")
dest.parent.mkdir(parents=True, exist_ok=True)
dest.write_bytes(content)

Expand Down
104 changes: 95 additions & 9 deletions tests/endpoints_submission_cli/submissions/test_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,10 @@ def test_log_summary_created(self, run_archive: Path, tmp_path: Path) -> None:
assert "n_samples_completed" in data
assert "duration_ns" in data

def test_accuracy_file_created(self, run_archive: Path, tmp_path: Path) -> None:
def test_accuracy_file_created(self, run_folder: Path, tmp_path: Path) -> None:
acc_archive = self._make_archive(run_folder, tmp_path, "acc", 4, "accuracy")
sub_dir = build_submission_folder(
[("run-001", run_archive)], "standardized", "available", tmp_path
[("run-001", acc_archive)], "standardized", "available", tmp_path / "sub"
)
acc_jsons = list(sub_dir.rglob("accuracy/results.json"))
assert len(acc_jsons) == 1
Expand Down Expand Up @@ -223,13 +224,9 @@ def test_perf_and_accuracy_same_concurrency_ok(self, run_folder: Path, tmp_path:
"available",
tmp_path / "sub",
)
assert (sub_dir / "pareto").glob("*/*/points/point_4.yaml").__next__().exists()
assert (
(sub_dir / "pareto")
.glob("*/*/results/point_4/accuracy/point_4.yaml")
.__next__()
.exists()
)
assert list((sub_dir / "pareto").glob("*/*/points/point_4.yaml"))
assert list((sub_dir / "pareto").glob("*/*/results/point_4/accuracy/results.json"))
assert list((sub_dir / "pareto").glob("*/*/results/point_4/accuracy/point_4.yaml"))

def test_accuracy_run_routed_to_accuracy(self, run_folder: Path, tmp_path: Path) -> None:
a_acc = self._make_archive(run_folder, tmp_path, "acc", 4, "accuracy")
Expand Down Expand Up @@ -641,3 +638,92 @@ def test_empty_string(self) -> None:
def test_long_name_truncated(self) -> None:
long = "A" * 100
assert len(_slugify(long)) <= 64


@pytest.mark.unit
class TestAccuracyResultsTruncation:
"""Tests for accuracy/results.json truncation across three workflow scenarios."""

def _make_accuracy_archive(
self,
run_folder: Path,
tmp_path: Path,
name: str,
results_content: dict | None = None,
acc_results_content: dict | None = None,
) -> Path:
import shutil

folder = tmp_path / name
shutil.copytree(run_folder, folder)
cfg = yaml.safe_load((folder / "config.yaml").read_text())
cfg["datasets"][0]["type"] = "accuracy"
(folder / "config.yaml").write_text(yaml.dump(cfg))
if results_content is not None:
(folder / "results.json").write_text(json.dumps(results_content))
if acc_results_content is not None:
(folder / "accuracy").mkdir(exist_ok=True)
(folder / "accuracy" / "results.json").write_text(json.dumps(acc_results_content))
archive = tmp_path / f"{name}.tar.gz"
with tarfile.open(archive, "w:gz") as tar:
tar.add(folder, arcname=name)
return archive

def _big_results(self, count: int = 5000) -> dict:
return {
"config": {},
"accuracy_scores": {"score": 0.45},
"responses": [{"text": "a" * 100, "idx": i} for i in range(count)],
}

def test_no_accuracy_subdir_truncates_responses(
self, run_folder: Path, tmp_path: Path
) -> None:
"""Workflow 1/3: archive has only root results.json; responses written truncated to accuracy/results.json."""
archive = self._make_accuracy_archive(
run_folder, tmp_path, "wf1", results_content=self._big_results()
)
sub_dir = build_submission_folder(
[("run-001", archive)], "standardized", "available", tmp_path / "sub"
)
acc_files = list(sub_dir.rglob("accuracy/results.json"))
assert len(acc_files) == 1
written = json.loads(acc_files[0].read_text())
assert len(json.dumps(written["responses"]).encode()) <= 10 * 1024

def test_accuracy_subdir_large_responses_truncated(
self, run_folder: Path, tmp_path: Path
) -> None:
"""Workflow 2: archive has accuracy/results.json with large responses; truncated on write."""
archive = self._make_accuracy_archive(
run_folder, tmp_path, "wf2", acc_results_content=self._big_results()
)
sub_dir = build_submission_folder(
[("run-001", archive)], "standardized", "available", tmp_path / "sub"
)
acc_files = list(sub_dir.rglob("accuracy/results.json"))
assert len(acc_files) == 1
written = json.loads(acc_files[0].read_text())
assert len(json.dumps(written["responses"]).encode()) <= 10 * 1024

def test_accuracy_subdir_already_small_written_intact(
self, run_folder: Path, tmp_path: Path
) -> None:
"""Workflow 3: archive has accuracy/results.json already within 10 KB; written without corruption."""
small_responses = [{"text": "hi", "idx": 0}]
acc_results = {
"config": {},
"accuracy_scores": {"score": 0.45},
"responses": small_responses,
}
archive = self._make_accuracy_archive(
run_folder, tmp_path, "wf3", acc_results_content=acc_results
)
sub_dir = build_submission_folder(
[("run-001", archive)], "standardized", "available", tmp_path / "sub"
)
acc_files = list(sub_dir.rglob("accuracy/results.json"))
assert len(acc_files) == 1
written = json.loads(acc_files[0].read_text())
assert written["responses"] == small_responses
assert written["accuracy_scores"] == acc_results["accuracy_scores"]
Loading