From 43ac53b726433a4682ec119c7e2a22012ab13481 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Mon, 18 May 2026 13:30:52 +0000
Subject: [PATCH 1/6] Remove deprecated Carbon vLLM architecture override

---
 evaluation/scripts/run_serving_benchmarks.py | 26 ++------------------
 1 file changed, 2 insertions(+), 24 deletions(-)

diff --git a/evaluation/scripts/run_serving_benchmarks.py b/evaluation/scripts/run_serving_benchmarks.py
index e0c76f8..0b819b2 100644
--- a/evaluation/scripts/run_serving_benchmarks.py
+++ b/evaluation/scripts/run_serving_benchmarks.py
@@ -23,7 +23,6 @@
 DEFAULT_OUTPUT_ROOT = REPO_ROOT / "scratch" / "serving_benchmarks"
 DEFAULT_CARBON_MODEL = "HuggingFaceBio/Carbon-3B"
 DEFAULT_CARBON_DRAFT_MODEL = "HuggingFaceBio/Carbon-500M"
-DEFAULT_CARBON_VLLM_ARCHITECTURE_OVERRIDE = "LlamaForCausalLM"
 DEFAULT_GENERATOR_MODEL = "GenerTeam/GENERator-v2-eukaryote-3b-base"
 DEFAULT_EVO2_MODEL = "evo2_7b"
 SUMMARY_FIELDS = [
@@ -95,8 +94,8 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument("--split", default="test")
     parser.add_argument("--num-prompts", type=int, default=16)
     parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--input-bp", type=int, default=1000)
-    parser.add_argument("--output-bp", type=int, default=1000)
+    parser.add_argument("--input-bp", type=int, default=1080)
+    parser.add_argument("--output-bp", type=int, default=1080)
     parser.add_argument("--bp-per-token", type=int, default=6)
     parser.add_argument("--carbon-model", default=DEFAULT_CARBON_MODEL)
     parser.add_argument(
@@ -125,16 +124,6 @@ def parse_args() -> argparse.Namespace:
         default=None,
         help="Draft Carbon code revision recorded in --speculative-config.",
     )
-    parser.add_argument(
-        "--carbon-vllm-architecture-override",
-        default=DEFAULT_CARBON_VLLM_ARCHITECTURE_OVERRIDE,
-        help=(
-            "HF architecture override for Carbon vLLM servers. The default "
-            "forces vLLM's native Llama implementation and avoids the "
-            "Transformers-backend attention-name collision in speculative "
-            "decoding. Pass an empty value to disable."
-        ),
-    )
     parser.add_argument(
         "--carbon-speculative-tokens",
         type=int,
@@ -540,14 +529,6 @@ def carbon_server_extra_args(args: argparse.Namespace) -> list[str]:
         extra_args.extend(["--code-revision", args.carbon_code_revision])
     if args.carbon_tokenizer_revision:
         extra_args.extend(["--tokenizer-revision", args.carbon_tokenizer_revision])
-
-    architecture = args.carbon_vllm_architecture_override.strip()
-    if not architecture:
-        return extra_args
-    hf_overrides = {"architectures": [architecture]}
-    extra_args.extend(
-        ["--hf-overrides", json.dumps(hf_overrides, separators=(",", ":"))]
-    )
     return extra_args
 
 
@@ -652,9 +633,6 @@ def build_run_specs(
             "carbon_tokenizer_revision": args.carbon_tokenizer_revision or "",
             "carbon_draft_revision": args.carbon_draft_revision or "",
             "carbon_draft_code_revision": args.carbon_draft_code_revision or "",
-            "vllm_architecture_override": (
-                args.carbon_vllm_architecture_override.strip()
-            ),
         }
         specs.append(
             RunSpec(

From 9591313f5165fb8ccce71fc4cae5ad15fe9d31e8 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Mon, 18 May 2026 13:59:48 +0000
Subject: [PATCH 2/6] Support multi-GPU serving benchmark specs

Add per-spec GPU allocation so Evo2 40B can reserve two visible GPUs while one-GPU serving benchmarks continue to run on individual devices. Also label GENERator vLLM runs from the selected model name.

Co-authored-by: Codex <codex@openai.com>
---
 evaluation/scripts/run_serving_benchmarks.py | 103 +++++++++++++++----
 1 file changed, 84 insertions(+), 19 deletions(-)

diff --git a/evaluation/scripts/run_serving_benchmarks.py b/evaluation/scripts/run_serving_benchmarks.py
index 0b819b2..f89a8e2 100644
--- a/evaluation/scripts/run_serving_benchmarks.py
+++ b/evaluation/scripts/run_serving_benchmarks.py
@@ -19,7 +19,9 @@
 
 
 REPO_ROOT = Path(__file__).resolve().parents[2]
-EVO2_BENCHMARK_SCRIPT = REPO_ROOT / "evaluation" / "scripts" / "benchmark_evo2_serving.py"
+EVO2_BENCHMARK_SCRIPT = (
+    REPO_ROOT / "evaluation" / "scripts" / "benchmark_evo2_serving.py"
+)
 DEFAULT_OUTPUT_ROOT = REPO_ROOT / "scratch" / "serving_benchmarks"
 DEFAULT_CARBON_MODEL = "HuggingFaceBio/Carbon-3B"
 DEFAULT_CARBON_DRAFT_MODEL = "HuggingFaceBio/Carbon-500M"
@@ -63,6 +65,7 @@ class RunSpec:
     model: str
     prompt_file: Path
     run_dir: Path
+    gpu_count: int = 1
     port: int | None = None
     served_model_name: str | None = None
     draft_model: str | None = None
@@ -215,6 +218,13 @@ def evo2_run_name(model_name: str) -> str:
     return sanitize_path_component(normalized_model_name)
 
 
+def evo2_gpu_count(model_name: str) -> int:
+    normalized_model_name = normalize_evo2_model_name(model_name)
+    if normalized_model_name in {"evo2_40b", "evo2_40b_base"}:
+        return 2
+    return 1
+
+
 def model_run_name(model_name: str) -> str:
     return sanitize_path_component(model_name.rsplit("/", 1)[-1].lower())
 
@@ -226,8 +236,12 @@ def load_sequence_recovery_rows(args: argparse.Namespace) -> list[dict]:
     try:
         dataset = load_dataset(repo_id, args.data_config, split=args.split)
     except Exception:
-        parquet_path = f"hf://datasets/{repo_id}/{args.data_config}/{args.split}.parquet"
-        dataset = load_dataset("parquet", data_files={args.split: parquet_path}, split=args.split)
+        parquet_path = (
+            f"hf://datasets/{repo_id}/{args.data_config}/{args.split}.parquet"
+        )
+        dataset = load_dataset(
+            "parquet", data_files={args.split: parquet_path}, split=args.split
+        )
     return list(dataset)
 
 
@@ -617,10 +631,12 @@ def build_run_specs(
         and not args.skip_spec_vocab_check
         and args.carbon_speculative_tokens
     ):
-        spec_preflight_ok, spec_preflight_reason = check_speculative_vocab_compatibility(
-            args.carbon_model,
-            args.carbon_draft_model,
-            run_dir / "speculative_vocab_preflight.json",
+        spec_preflight_ok, spec_preflight_reason = (
+            check_speculative_vocab_compatibility(
+                args.carbon_model,
+                args.carbon_draft_model,
+                run_dir / "speculative_vocab_preflight.json",
+            )
         )
 
     if not args.skip_carbon:
@@ -659,9 +675,7 @@ def build_run_specs(
                         served_model_name=f"{carbon_name}-spec-{token_count}",
                         draft_model=args.carbon_draft_model,
                         num_speculative_tokens=token_count,
-                        speculative_config=carbon_speculative_config(
-                            args, token_count
-                        ),
+                        speculative_config=carbon_speculative_config(args, token_count),
                         server_extra_args=carbon_extra_args,
                         skip_reason=skip_reason,
                         metadata=carbon_metadata,
@@ -678,6 +692,7 @@ def build_run_specs(
                 model=normalized_evo2_model,
                 prompt_file=Path(prompt_files["evo2"]),
                 run_dir=run_dir / evo2_name,
+                gpu_count=evo2_gpu_count(normalized_evo2_model),
                 metadata={
                     "prompt_family": "evo2",
                     "requested_model": args.evo2_model,
@@ -687,14 +702,15 @@ def build_run_specs(
         )
 
     if args.generator != "never":
+        generator_name = model_run_name(args.generator_model)
         specs.append(
             RunSpec(
-                name="generator-v2-eukaryote-3b-vllm",
+                name=f"{generator_name}-vllm",
                 backend="vllm",
                 model=args.generator_model,
                 prompt_file=Path(prompt_files["generator"]),
-                run_dir=run_dir / "generator-v2-eukaryote-3b-vllm",
-                served_model_name="generator-v2-eukaryote-3b-vllm",
+                run_dir=run_dir / f"{generator_name}-vllm",
+                served_model_name=f"{generator_name}-vllm",
                 requires_probe=True,
                 skip_on_probe_failure=args.generator == "auto",
                 metadata={"prompt_family": "generator"},
@@ -998,7 +1014,9 @@ def run_spec(args: argparse.Namespace, spec: RunSpec) -> dict:
     raise ValueError(f"Unsupported backend: {spec.backend}")
 
 
-def run_gpu_queue(args: argparse.Namespace, gpu_id: str, specs: list[RunSpec]) -> list[dict]:
+def run_gpu_queue(
+    args: argparse.Namespace, gpu_id: str, specs: list[RunSpec]
+) -> list[dict]:
     rows = []
     for spec in specs:
         spec.gpu_id = gpu_id
@@ -1059,12 +1077,59 @@ def assign_specs_to_gpus(
     gpu_ids: list[str],
     max_parallel: int,
 ) -> dict[str, list[RunSpec]]:
-    active_gpus = gpu_ids[: min(max_parallel, len(gpu_ids), len(specs))]
-    assignments = {gpu_id: [] for gpu_id in active_gpus}
-    for index, spec in enumerate(specs):
-        gpu_id = active_gpus[index % len(active_gpus)]
+    active_gpus = gpu_ids[: min(max_parallel, len(gpu_ids))]
+    if not active_gpus:
+        return {}
+    for spec in specs:
+        if spec.gpu_count <= 0:
+            raise ValueError(f"{spec.name} has invalid gpu_count={spec.gpu_count}")
+        if spec.gpu_count > len(active_gpus):
+            raise RuntimeError(
+                f"{spec.name} requires {spec.gpu_count} GPUs, but only "
+                f"{len(active_gpus)} are available after --max-parallel."
+            )
+
+    single_gpu_specs = [spec for spec in specs if spec.gpu_count == 1]
+    remaining_gpus = list(active_gpus)
+    multi_gpu_groups: dict[int, list[tuple[str, ...]]] = {}
+
+    for gpu_count in sorted(
+        {spec.gpu_count for spec in specs if spec.gpu_count > 1},
+        reverse=True,
+    ):
+        specs_for_count = [spec for spec in specs if spec.gpu_count == gpu_count]
+        max_groups = len(remaining_gpus) // gpu_count
+        if single_gpu_specs:
+            max_groups = min(max_groups, max(1, (len(remaining_gpus) - 1) // gpu_count))
+        group_count = min(len(specs_for_count), max_groups)
+        if group_count <= 0:
+            raise RuntimeError(
+                f"No free GPU group is available for {gpu_count}-GPU benchmark specs."
+            )
+        groups = []
+        for _ in range(group_count):
+            group = tuple(remaining_gpus[:gpu_count])
+            del remaining_gpus[:gpu_count]
+            groups.append(group)
+        multi_gpu_groups[gpu_count] = groups
+
+    single_gpu_groups = [(gpu_id,) for gpu_id in remaining_gpus]
+    if single_gpu_specs and not single_gpu_groups:
+        raise RuntimeError("No free GPU is available for one-GPU benchmark specs.")
+
+    assignments: dict[str, list[RunSpec]] = {}
+    group_offsets: dict[int, int] = {}
+    for spec in specs:
+        if spec.gpu_count == 1:
+            groups = single_gpu_groups
+        else:
+            groups = multi_gpu_groups[spec.gpu_count]
+        offset = group_offsets.get(spec.gpu_count, 0)
+        group = groups[offset % len(groups)]
+        group_offsets[spec.gpu_count] = offset + 1
+        gpu_id = ",".join(group)
         spec.gpu_id = gpu_id
-        assignments[gpu_id].append(spec)
+        assignments.setdefault(gpu_id, []).append(spec)
     return assignments
 
 

From ef58fb050ce1c67cd64f40f8e5929c801d733e65 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Mon, 18 May 2026 14:03:40 +0000
Subject: [PATCH 3/6] Add bp throughput to serving summaries

Record output_bp_per_second in serving benchmark summary rows using the model family's output-token-to-base-pair ratio.

Co-authored-by: Codex <codex@openai.com>
---
 evaluation/scripts/run_serving_benchmarks.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/evaluation/scripts/run_serving_benchmarks.py b/evaluation/scripts/run_serving_benchmarks.py
index f89a8e2..bb9210e 100644
--- a/evaluation/scripts/run_serving_benchmarks.py
+++ b/evaluation/scripts/run_serving_benchmarks.py
@@ -46,6 +46,7 @@
     "total_output_tokens",
     "request_throughput",
     "output_throughput",
+    "output_bp_per_second",
     "total_token_throughput",
     "mean_ttft_ms",
     "mean_tpot_ms",
@@ -66,6 +67,7 @@ class RunSpec:
     prompt_file: Path
     run_dir: Path
     gpu_count: int = 1
+    bp_per_output_token: float = 1.0
     port: int | None = None
     served_model_name: str | None = None
     draft_model: str | None = None
@@ -657,6 +659,7 @@ def build_run_specs(
                 model=args.carbon_model,
                 prompt_file=Path(prompt_files["carbon"]),
                 run_dir=run_dir / f"{carbon_name}-vllm",
+                bp_per_output_token=args.bp_per_token,
                 served_model_name=f"{carbon_name}-vllm",
                 server_extra_args=carbon_extra_args,
                 metadata=carbon_metadata,
@@ -672,6 +675,7 @@ def build_run_specs(
                         model=args.carbon_model,
                         prompt_file=Path(prompt_files["carbon"]),
                         run_dir=run_dir / f"{carbon_name}-spec-{token_count}",
+                        bp_per_output_token=args.bp_per_token,
                         served_model_name=f"{carbon_name}-spec-{token_count}",
                         draft_model=args.carbon_draft_model,
                         num_speculative_tokens=token_count,
@@ -710,6 +714,7 @@ def build_run_specs(
                 model=args.generator_model,
                 prompt_file=Path(prompt_files["generator"]),
                 run_dir=run_dir / f"{generator_name}-vllm",
+                bp_per_output_token=args.bp_per_token,
                 served_model_name=f"{generator_name}-vllm",
                 requires_probe=True,
                 skip_on_probe_failure=args.generator == "auto",
@@ -824,6 +829,9 @@ def metric_row_from_result(spec: RunSpec, result: dict, status: str) -> dict:
         "p99_itl_ms",
     ]:
         row[key] = result.get(key, "")
+    output_throughput = result.get("output_throughput")
+    if output_throughput not in (None, ""):
+        row["output_bp_per_second"] = output_throughput * spec.bp_per_output_token
     row["result_json"] = str(spec.run_dir / "benchmark.json")
     return row
 
@@ -848,6 +856,7 @@ def base_summary_row(spec: RunSpec, status: str) -> dict:
         "total_output_tokens": "",
         "request_throughput": "",
         "output_throughput": "",
+        "output_bp_per_second": "",
         "total_token_throughput": "",
         "mean_ttft_ms": "",
         "mean_tpot_ms": "",

From eea32a427e76b5c4f6cdc5f35f7c663bd7a61db0 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Wed, 20 May 2026 13:23:20 +0000
Subject: [PATCH 4/6] Move serving benchmark scripts to evaluation/serving

---
 evaluation/{scripts => serving}/benchmark_evo2_serving.py         | 0
 .../benchmark_generator_transformers_serving.py                   | 0
 .../{scripts => serving}/benchmark_sequence_recovery_gen_len.py   | 0
 evaluation/{scripts => serving}/plot_sequence_recovery_sweep.py   | 0
 evaluation/{scripts => serving}/run_serving_benchmarks.py         | 0
 5 files changed, 0 insertions(+), 0 deletions(-)
 rename evaluation/{scripts => serving}/benchmark_evo2_serving.py (100%)
 rename evaluation/{scripts => serving}/benchmark_generator_transformers_serving.py (100%)
 rename evaluation/{scripts => serving}/benchmark_sequence_recovery_gen_len.py (100%)
 rename evaluation/{scripts => serving}/plot_sequence_recovery_sweep.py (100%)
 rename evaluation/{scripts => serving}/run_serving_benchmarks.py (100%)

diff --git a/evaluation/scripts/benchmark_evo2_serving.py b/evaluation/serving/benchmark_evo2_serving.py
similarity index 100%
rename from evaluation/scripts/benchmark_evo2_serving.py
rename to evaluation/serving/benchmark_evo2_serving.py
diff --git a/evaluation/scripts/benchmark_generator_transformers_serving.py b/evaluation/serving/benchmark_generator_transformers_serving.py
similarity index 100%
rename from evaluation/scripts/benchmark_generator_transformers_serving.py
rename to evaluation/serving/benchmark_generator_transformers_serving.py
diff --git a/evaluation/scripts/benchmark_sequence_recovery_gen_len.py b/evaluation/serving/benchmark_sequence_recovery_gen_len.py
similarity index 100%
rename from evaluation/scripts/benchmark_sequence_recovery_gen_len.py
rename to evaluation/serving/benchmark_sequence_recovery_gen_len.py
diff --git a/evaluation/scripts/plot_sequence_recovery_sweep.py b/evaluation/serving/plot_sequence_recovery_sweep.py
similarity index 100%
rename from evaluation/scripts/plot_sequence_recovery_sweep.py
rename to evaluation/serving/plot_sequence_recovery_sweep.py
diff --git a/evaluation/scripts/run_serving_benchmarks.py b/evaluation/serving/run_serving_benchmarks.py
similarity index 100%
rename from evaluation/scripts/run_serving_benchmarks.py
rename to evaluation/serving/run_serving_benchmarks.py

From 75eb7f38736318a1d9be03e43a09738ebc7033da Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Wed, 20 May 2026 13:35:53 +0000
Subject: [PATCH 5/6] Document serving inference benchmarks

Add commands for dry-run, Carbon vLLM, speculative decoding, full-node comparisons, and direct Evo2 serving benchmarks. Fix the serving wrapper's Evo2 helper path so the documented wrapper can launch it.

Co-authored-by: Codex <codex@openai.com>
---
 evaluation/README.md                         | 92 ++++++++++++++++++++
 evaluation/serving/run_serving_benchmarks.py |  2 +-
 2 files changed, 93 insertions(+), 1 deletion(-)

diff --git a/evaluation/README.md b/evaluation/README.md
index 5b0ac6b..c92c51e 100644
--- a/evaluation/README.md
+++ b/evaluation/README.md
@@ -12,6 +12,7 @@ one flag, so the same script runs on Carbon, GENERator, or Evo2.
 3. [ClinVar VEP](#3-clinvar-vep) — right-end / next-token scoring (GENERator recipe)
 4. [Sequence-level perturbation tasks](#4-sequence-level-perturbation-tasks) — nucleotide triplet-expansion + synonymous codon substitution, new tasks we built
 5. [Genome-NIAH long-context retrieval](#5-genome-niah-long-context-retrieval) — long-context needle-in-a-haystack for DNA (4 tasks × 6 context lengths up to 786 kbp)
+6. [Serving inference benchmarks](#6-serving-inference-benchmarks) — vLLM and Evo2 latency / throughput on sequence-recovery prompts
 
 ## Scripts
 
@@ -333,6 +334,97 @@ for SHARD in 0 1 2 3 4 5; do
     sbatch evaluation/slurm/evo2-7b/genome_niah.sbatch
 done
 ```
+
+## 6. Serving inference benchmarks
+
+Serving benchmarks live in [`evaluation/serving/`](serving). They sample
+prompts from the sequence-recovery dataset, run fixed-length generation, and
+write vLLM-style latency / throughput metrics under `scratch/serving_benchmarks`.
+The wrapper runs Carbon through `vllm serve`, optional speculative Carbon with a
+draft model, optional GENERator through vLLM, and Evo2 through the local Evo2
+benchmark helper.
+
+Use `--dry-run` first to prepare prompt files and inspect the exact server and
+benchmark commands without starting any model servers:
+
+```bash
+uv run --group evaluation python evaluation/serving/run_serving_benchmarks.py \
+    --run-id serving-dry-run \
+    --num-prompts 4 \
+    --input-bp 1080 --output-bp 1080 \
+    --gpu-ids 0,1 --max-parallel 2 \
+    --generator never \
+    --dry-run
+```
+
+The dry run writes commands to:
+
+```text
+scratch/serving_benchmarks/serving-dry-run/dry_run_commands.sh
+```
+
+Carbon-only vLLM smoke benchmark:
+
+```bash
+uv run --group evaluation python evaluation/serving/run_serving_benchmarks.py \
+    --run-id carbon-3b-vllm-smoke \
+    --num-prompts 16 \
+    --input-bp 1080 --output-bp 1080 \
+    --gpu-ids 0 --max-parallel 1 \
+    --skip-evo2 --skip-speculative --generator never
+```
+
+Carbon target plus speculative decoding with a Carbon draft model:
+
+```bash
+uv run --group evaluation python evaluation/serving/run_serving_benchmarks.py \
+    --run-id carbon-3b-speculative \
+    --num-prompts 64 \
+    --input-bp 1080 --output-bp 1080 \
+    --gpu-ids 0,1,2,3 --max-parallel 4 \
+    --carbon-model HuggingFaceBio/Carbon-3B \
+    --carbon-draft-model HuggingFaceBio/Carbon-500M \
+    --carbon-speculative-tokens 2 4 8 \
+    --skip-evo2 --generator never
+```
+
+Full comparison on one 8-GPU node, including Carbon, speculative Carbon, Evo2,
+and GENERator when its vLLM compatibility probe succeeds:
+
+```bash
+uv run --group evaluation python evaluation/serving/run_serving_benchmarks.py \
+    --run-id serving-full-8gpu \
+    --num-prompts 128 \
+    --input-bp 1080 --output-bp 1080 \
+    --gpu-ids 0,1,2,3,4,5,6,7 --max-parallel 8 \
+    --generator auto
+```
+
+To run only the Evo2 helper, first prepare prompts, then point
+[`benchmark_evo2_serving.py`](serving/benchmark_evo2_serving.py) at the Evo2
+prompt JSONL:
+
+```bash
+uv run --group evaluation python evaluation/serving/run_serving_benchmarks.py \
+    --run-id evo2-serving-prompts \
+    --num-prompts 16 \
+    --input-bp 1080 --output-bp 1080 \
+    --prepare-only
+
+uv run --group evaluation python evaluation/serving/benchmark_evo2_serving.py \
+    --model evo2_7b \
+    --dataset-path scratch/serving_benchmarks/evo2-serving-prompts/prompts/evo2_prompts.jsonl \
+    --num-prompts 16 \
+    --output-dir scratch/serving_benchmarks/evo2-7b-direct \
+    --temperature 1.0 --top-k 1 --top-p 0.0
+```
+
+Each run writes a top-level `summary.csv` and `summary.json`, plus one
+subdirectory per benchmark with `commands.json`, logs, `benchmark.json`, and
+`detailed.json` where available. The `output_throughput` column is generated
+tokens per second; `output_bp_per_second` multiplies it by `--bp-per-token`
+(default 6) to turn Carbon and GENERator 6-mer tokens/s into base pairs/s.
+
 ## Environment
 
 Install the root project environment with uv:
diff --git a/evaluation/serving/run_serving_benchmarks.py b/evaluation/serving/run_serving_benchmarks.py
index bb9210e..aac3986 100644
--- a/evaluation/serving/run_serving_benchmarks.py
+++ b/evaluation/serving/run_serving_benchmarks.py
@@ -20,7 +20,7 @@
 
 REPO_ROOT = Path(__file__).resolve().parents[2]
 EVO2_BENCHMARK_SCRIPT = (
-    REPO_ROOT / "evaluation" / "scripts" / "benchmark_evo2_serving.py"
+    REPO_ROOT / "evaluation" / "serving" / "benchmark_evo2_serving.py"
 )
 DEFAULT_OUTPUT_ROOT = REPO_ROOT / "scratch" / "serving_benchmarks"
 DEFAULT_CARBON_MODEL = "HuggingFaceBio/Carbon-3B"

From eee0d1999e4c185551e7ce94af02a44d3065ba88 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Wed, 20 May 2026 13:40:44 +0000
Subject: [PATCH 6/6] Clean up: delete three serving helper scripts

---
 ...enchmark_generator_transformers_serving.py | 489 ------------------
 .../benchmark_sequence_recovery_gen_len.py    | 429 ---------------
 .../serving/plot_sequence_recovery_sweep.py   | 279 ----------
 3 files changed, 1197 deletions(-)
 delete mode 100644 evaluation/serving/benchmark_generator_transformers_serving.py
 delete mode 100644 evaluation/serving/benchmark_sequence_recovery_gen_len.py
 delete mode 100644 evaluation/serving/plot_sequence_recovery_sweep.py

diff --git a/evaluation/serving/benchmark_generator_transformers_serving.py b/evaluation/serving/benchmark_generator_transformers_serving.py
deleted file mode 100644
index 1cdb243..0000000
--- a/evaluation/serving/benchmark_generator_transformers_serving.py
+++ /dev/null
@@ -1,489 +0,0 @@
-import argparse
-import json
-import math
-import statistics
-import time
-from datetime import datetime
-from pathlib import Path
-
-
-SPECIAL_TOKENS = [
-    "<oov>",
-    "<s>",
-    "</s>",
-    "<pad>",
-    "<mask>",
-    "<bog>",
-    "<eog>",
-    "<bok>",
-    "<eok>",
-    "<+>",
-    "<->",
-    "<cds>",
-    "<pseudo>",
-    "<tRNA>",
-    "<rRNA>",
-    "<ncRNA>",
-    "<miscRNA>",
-    "<mam>",
-    "<vrt>",
-    "<inv>",
-    "<pln>",
-    "<fng>",
-    "<prt>",
-    "<arc>",
-    "<bct>",
-    "<mit>",
-    "<plt>",
-    "<plm>",
-    "<vir>",
-    "<sp0>",
-    "<sp1>",
-    "<sp2>",
-]
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description=(
-            "Benchmark GENERator generation through native Transformers with a "
-            "vLLM-like serving metric block."
-        )
-    )
-    parser.add_argument(
-        "--model",
-        default="GenerTeam/GENERator-v2-eukaryote-1.2b-base",
-        help="Hugging Face model id or local path.",
-    )
-    parser.add_argument(
-        "--dataset-path",
-        type=Path,
-        required=True,
-        help="Prepared JSONL prompt file.",
-    )
-    parser.add_argument("--num-prompts", type=int, default=16)
-    parser.add_argument(
-        "--output-dir",
-        type=Path,
-        required=True,
-        help="Directory for result files.",
-    )
-    parser.add_argument("--result-json", type=Path, default=None)
-    parser.add_argument("--detailed-json", type=Path, default=None)
-    parser.add_argument("--batch-size", type=int, default=1)
-    parser.add_argument(
-        "--dtype",
-        choices=["bfloat16", "float16", "float32"],
-        default="bfloat16",
-    )
-    parser.add_argument("--device", default="cuda:0")
-    parser.add_argument("--bp-per-token", type=int, default=6)
-    parser.add_argument("--kmer-size", type=int, default=6)
-    parser.add_argument("--temperature", type=float, default=0.00001)
-    parser.add_argument("--top-k", type=int, default=1)
-    parser.add_argument("--trust-remote-code", action="store_true")
-    parser.add_argument("--num-warmups", type=int, default=0)
-    parser.add_argument("--label", default="generator-transformers")
-    return parser.parse_args()
-
-
-def read_jsonl(path: Path) -> list[dict]:
-    rows = []
-    with path.open("r", encoding="utf-8") as handle:
-        for line_number, line in enumerate(handle, start=1):
-            line = line.strip()
-            if not line:
-                continue
-            item = json.loads(line)
-            if "prompt" not in item:
-                raise ValueError(f"{path}:{line_number} is missing 'prompt'")
-            if "output_tokens" not in item:
-                raise ValueError(f"{path}:{line_number} is missing 'output_tokens'")
-            item["output_tokens"] = int(item["output_tokens"])
-            rows.append(item)
-    return rows
-
-
-def percentile(values: list[float], percentile_value: float) -> float:
-    if not values:
-        return 0.0
-    ordered = sorted(values)
-    if len(ordered) == 1:
-        return ordered[0]
-    rank = (len(ordered) - 1) * percentile_value / 100.0
-    lower = math.floor(rank)
-    upper = math.ceil(rank)
-    if lower == upper:
-        return ordered[int(rank)]
-    weight = rank - lower
-    return ordered[lower] * (1 - weight) + ordered[upper] * weight
-
-
-def mean_ms(values: list[float]) -> float:
-    return (statistics.mean(values) * 1000.0) if values else 0.0
-
-
-def median_ms(values: list[float]) -> float:
-    return (statistics.median(values) * 1000.0) if values else 0.0
-
-
-def std_ms(values: list[float]) -> float:
-    return (statistics.pstdev(values) * 1000.0) if len(values) > 1 else 0.0
-
-
-def p99_ms(values: list[float]) -> float:
-    return percentile(values, 99.0) * 1000.0
-
-
-def synchronize_cuda() -> None:
-    try:
-        import torch
-
-        if torch.cuda.is_available():
-            torch.cuda.synchronize()
-    except Exception:
-        return
-
-
-class KmerTokenizer:
-    def __init__(self, k: int):
-        import itertools
-        import re
-
-        self.k = k
-        self.special_tokens = SPECIAL_TOKENS
-        self.vocab = {
-            token: index
-            for index, token in enumerate(
-                self.special_tokens
-                + ["".join(kmer) for kmer in itertools.product("ATCG", repeat=k)]
-            )
-        }
-        self.ids_to_tokens = {index: token for token, index in self.vocab.items()}
-        self.special_token_pattern = re.compile(
-            "|".join(re.escape(token) for token in self.special_tokens)
-        )
-        self.dna_pattern = re.compile(f"[A-Z]{{{self.k}}}|[A-Z]+")
-        self.bos_token_id = self.vocab["<s>"]
-        self.eos_token_id = self.vocab["</s>"]
-        self.pad_token_id = self.vocab["<pad>"]
-        self.unk_token_id = self.vocab["<oov>"]
-
-    @property
-    def vocab_size(self) -> int:
-        return len(self.vocab)
-
-    def tokenize(self, text: str) -> list[str]:
-        tokens = []
-        pos = 0
-        while pos < len(text):
-            special_match = self.special_token_pattern.match(text, pos)
-            if special_match:
-                tokens.append(special_match.group())
-                pos = special_match.end()
-                continue
-            dna_match = self.dna_pattern.match(text, pos)
-            if dna_match:
-                tokens.append(dna_match.group())
-                pos = dna_match.end()
-                continue
-            tokens.append(text[pos])
-            pos += 1
-        return tokens
-
-    def encode(self, text: str, add_bos_token: bool = True) -> list[int]:
-        token_ids = [
-            self.vocab.get(token, self.unk_token_id) for token in self.tokenize(text)
-        ]
-        if add_bos_token:
-            return [self.bos_token_id] + token_ids
-        return token_ids
-
-    def decode(self, token_ids: list[int], skip_special_tokens: bool = True) -> str:
-        tokens = []
-        for token_id in token_ids:
-            token = self.ids_to_tokens.get(int(token_id), "<oov>")
-            if skip_special_tokens and token in self.special_tokens:
-                continue
-            tokens.append(token)
-        return "".join(tokens)
-
-
-def load_model(args: argparse.Namespace):
-    import torch
-    from transformers import AutoModelForCausalLM
-
-    dtype_map = {
-        "bfloat16": torch.bfloat16,
-        "float16": torch.float16,
-        "float32": torch.float32,
-    }
-    model = AutoModelForCausalLM.from_pretrained(
-        args.model,
-        trust_remote_code=args.trust_remote_code,
-        dtype=dtype_map[args.dtype],
-        low_cpu_mem_usage=True,
-    )
-    model.to(args.device)
-    model.eval()
-    return model
-
-
-def make_batch(
-    requests: list[dict],
-    tokenizer: KmerTokenizer,
-    device: str,
-) -> tuple:
-    import torch
-
-    encoded = [tokenizer.encode(request["prompt"], add_bos_token=True) for request in requests]
-    max_len = max(len(item) for item in encoded)
-    input_ids = []
-    attention_mask = []
-    for item in encoded:
-        pad_len = max_len - len(item)
-        input_ids.append([tokenizer.pad_token_id] * pad_len + item)
-        attention_mask.append([0] * pad_len + [1] * len(item))
-    return (
-        torch.tensor(input_ids, dtype=torch.long, device=device),
-        torch.tensor(attention_mask, dtype=torch.long, device=device),
-        [len(item) for item in encoded],
-    )
-
-
-def run_batch(
-    model,
-    requests: list[dict],
-    tokenizer: KmerTokenizer,
-    args: argparse.Namespace,
-) -> list[dict]:
-    import torch
-
-    output_tokens = int(requests[0]["output_tokens"])
-    output_lengths = {int(request["output_tokens"]) for request in requests}
-    if len(output_lengths) != 1:
-        raise ValueError("All requests in a batch must have the same output_tokens")
-
-    input_ids, attention_mask, prompt_lens = make_batch(requests, tokenizer, args.device)
-    synchronize_cuda()
-    start_perf = time.perf_counter()
-    start_wall = time.time()
-    with torch.inference_mode():
-        generated = model.generate(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            max_new_tokens=output_tokens,
-            min_new_tokens=output_tokens,
-            do_sample=True,
-            temperature=args.temperature,
-            top_k=args.top_k,
-            use_cache=True,
-            eos_token_id=None,
-            pad_token_id=tokenizer.pad_token_id,
-        )
-    synchronize_cuda()
-    end_perf = time.perf_counter()
-    latency = end_perf - start_perf
-
-    details = []
-    for index, request in enumerate(requests):
-        prompt_len = prompt_lens[index]
-        generated_ids = generated[index, input_ids.shape[1] :].tolist()
-        generated_ids = generated_ids[:output_tokens]
-        generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
-        details.append(
-            {
-                "success": True,
-                "request_id": request.get("request_id"),
-                "prompt_len": prompt_len,
-                "output_len": len(generated_ids),
-                "expected_output_len": output_tokens,
-                "output_bp": len(generated_ids) * args.bp_per_token,
-                "ttft": 0.0,
-                "itl": [],
-                "latency": latency,
-                "start_time": start_wall,
-                "generated_text": generated_text,
-                "error": "",
-                "metadata": request.get("metadata", {}),
-                "batch_size": len(requests),
-            }
-        )
-    return details
-
-
-def summarize_results(
-    requests: list[dict],
-    details: list[dict],
-    duration: float,
-    args: argparse.Namespace,
-) -> dict:
-    successes = [item for item in details if item["success"]]
-    failures = [item for item in details if not item["success"]]
-    ttfts = [item["ttft"] for item in successes if item["ttft"] > 0]
-    itls = [latency for item in successes for latency in item["itl"]]
-    e2els = [item["latency"] for item in successes]
-    total_input = sum(item["prompt_len"] for item in successes)
-    total_output = sum(item["output_len"] for item in successes)
-    completed = len(successes)
-    safe_duration = duration if duration > 0 else 1e-12
-
-    return {
-        "date": datetime.now().strftime("%Y%m%d-%H%M%S"),
-        "backend": "transformers",
-        "endpoint_type": "transformers",
-        "label": args.label,
-        "model_id": args.model,
-        "tokenizer_id": "builtin-kmer",
-        "num_prompts": len(requests),
-        "batch_size": max(1, args.batch_size),
-        "duration": duration,
-        "completed": completed,
-        "failed": len(failures),
-        "total_input_tokens": total_input,
-        "total_output_tokens": total_output,
-        "request_throughput": completed / safe_duration,
-        "request_goodput": None,
-        "output_throughput": total_output / safe_duration,
-        "bp_per_token": args.bp_per_token,
-        "output_bp_throughput": (total_output * args.bp_per_token) / safe_duration,
-        "total_token_throughput": (total_input + total_output) / safe_duration,
-        "input_lens": [item["prompt_len"] for item in details],
-        "output_lens": [item["output_len"] for item in details],
-        "ttfts": [item["ttft"] for item in details],
-        "itls": [item["itl"] for item in details],
-        "start_times": [item["start_time"] for item in details],
-        "generated_texts": [item["generated_text"] for item in details],
-        "errors": [item["error"] for item in details],
-        "mean_ttft_ms": mean_ms(ttfts),
-        "median_ttft_ms": median_ms(ttfts),
-        "std_ttft_ms": std_ms(ttfts),
-        "p99_ttft_ms": p99_ms(ttfts),
-        "mean_tpot_ms": 0.0,
-        "median_tpot_ms": 0.0,
-        "std_tpot_ms": 0.0,
-        "p99_tpot_ms": 0.0,
-        "mean_itl_ms": mean_ms(itls),
-        "median_itl_ms": median_ms(itls),
-        "std_itl_ms": std_ms(itls),
-        "p99_itl_ms": p99_ms(itls),
-        "mean_e2el_ms": mean_ms(e2els),
-        "median_e2el_ms": median_ms(e2els),
-        "std_e2el_ms": std_ms(e2els),
-        "p99_e2el_ms": p99_ms(e2els),
-        "max_output_tokens_per_s": 0.0,
-        "max_concurrent_requests": min(max(1, args.batch_size), completed),
-        "rtfx": 0.0,
-    }
-
-
-def print_metric_block(result: dict) -> None:
-    print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
-    print("{:<40} {:<10}".format("Successful requests:", result["completed"]))
-    print("{:<40} {:<10}".format("Failed requests:", result["failed"]))
-    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", result["duration"]))
-    print("{:<40} {:<10}".format("Total input tokens:", result["total_input_tokens"]))
-    print(
-        "{:<40} {:<10}".format(
-            "Total generated tokens:", result["total_output_tokens"]
-        )
-    )
-    print(
-        "{:<40} {:<10.2f}".format(
-            "Request throughput (req/s):", result["request_throughput"]
-        )
-    )
-    print(
-        "{:<40} {:<10.2f}".format(
-            "Output token throughput (tok/s):", result["output_throughput"]
-        )
-    )
-    print(
-        "{:<40} {:<10.2f}".format(
-            "Output bp throughput (bp/s):", result["output_bp_throughput"]
-        )
-    )
-    print(
-        "{:<40} {:<10.2f}".format(
-            "Total token throughput (tok/s):", result["total_token_throughput"]
-        )
-    )
-    print("=" * 50)
-
-
-def main() -> None:
-    args = parse_args()
-    args.output_dir.mkdir(parents=True, exist_ok=True)
-    result_json = args.result_json or (args.output_dir / "benchmark.json")
-    detailed_json = args.detailed_json or (args.output_dir / "detailed.json")
-
-    requests = read_jsonl(args.dataset_path)
-    if args.num_prompts > 0:
-        requests = requests[: args.num_prompts]
-    if not requests:
-        raise ValueError("No requests to benchmark")
-
-    tokenizer = KmerTokenizer(args.kmer_size)
-    model = load_model(args)
-
-    batch_size = max(1, args.batch_size)
-    warmups = requests[: args.num_warmups]
-    for batch_start in range(0, len(warmups), batch_size):
-        _ = run_batch(
-            model,
-            warmups[batch_start : batch_start + batch_size],
-            tokenizer,
-            args,
-        )
-
-    benchmark_start = time.perf_counter()
-    details = []
-    try:
-        for batch_start in range(0, len(requests), batch_size):
-            details.extend(
-                run_batch(
-                    model,
-                    requests[batch_start : batch_start + batch_size],
-                    tokenizer,
-                    args,
-                )
-            )
-    except Exception as exc:
-        details.append(
-            {
-                "success": False,
-                "request_id": None,
-                "prompt_len": 0,
-                "output_len": 0,
-                "expected_output_len": 0,
-                "output_bp": 0,
-                "ttft": 0.0,
-                "itl": [],
-                "latency": 0.0,
-                "start_time": time.time(),
-                "generated_text": "",
-                "error": repr(exc),
-                "metadata": {},
-                "batch_size": batch_size,
-            }
-        )
-    synchronize_cuda()
-    duration = time.perf_counter() - benchmark_start
-
-    result = summarize_results(requests, details, duration, args)
-    print_metric_block(result)
-
-    with detailed_json.open("w", encoding="utf-8") as handle:
-        json.dump(details, handle, indent=2)
-    with result_json.open("w", encoding="utf-8") as handle:
-        json.dump(result, handle, indent=2)
-
-    print(f"Result JSON: {result_json}")
-    print(f"Detailed JSON: {detailed_json}")
-
-    if result["failed"]:
-        raise SystemExit(1)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/evaluation/serving/benchmark_sequence_recovery_gen_len.py b/evaluation/serving/benchmark_sequence_recovery_gen_len.py
deleted file mode 100644
index 15d249e..0000000
--- a/evaluation/serving/benchmark_sequence_recovery_gen_len.py
+++ /dev/null
@@ -1,429 +0,0 @@
-import argparse
-import json
-import os
-import re
-import subprocess
-import sys
-import time
-from pathlib import Path
-
-import matplotlib.pyplot as plt
-import pandas as pd
-import torch
-
-REPO_ROOT = Path(__file__).resolve().parents[2]
-EVAL_SCRIPT = REPO_ROOT / "evaluation" / "sequence_recovery_eval.py"
-DEFAULT_OUTPUT_DIR = REPO_ROOT / "scratch" / "sequence_recovery_gen_len_benchmark"
-DEFAULT_GEN_LENS = [5, 10, 20, 40, 80, 160, 320, 640]
-GROUP_ORDER = [
-    "fungi",
-    "invertebrate",
-    "plant",
-    "protozoa",
-    "vertebrate_mammalian",
-    "vertebrate_other",
-    "overall",
-]
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description=(
-            "Run sequence recovery serially over a gen_len sweep, using all selected GPUs "
-            "for each eval, then aggregate and plot accuracy-vs-gen_len."
-        )
-    )
-    parser.add_argument("--model", required=True, help="Model name or path")
-    parser.add_argument(
-        "--model_name",
-        default=None,
-        help="Optional output name override passed through to sequence_recovery_eval.py",
-    )
-    parser.add_argument(
-        "--revision",
-        default=None,
-        help="Optional model revision/tag/commit",
-    )
-    parser.add_argument(
-        "--data_type",
-        default="eukaryote",
-        choices=["eukaryote", "bacteria", "others"],
-        help="Dataset split to evaluate",
-    )
-    parser.add_argument(
-        "--data_path",
-        default="hf://datasets/GenerTeam/sequence-recovery",
-        help="HF dataset parquet path",
-    )
-    parser.add_argument(
-        "--output_dir",
-        type=Path,
-        default=DEFAULT_OUTPUT_DIR,
-        help="Directory for run outputs and aggregate artifacts",
-    )
-    parser.add_argument(
-        "--gen_lens",
-        type=int,
-        nargs="+",
-        default=DEFAULT_GEN_LENS,
-        help="gen_len values to benchmark serially",
-    )
-    parser.add_argument(
-        "--num_gpus",
-        type=int,
-        default=8,
-        help="Number of visible GPUs to expose to each eval subprocess",
-    )
-    parser.add_argument(
-        "--max_seq_len",
-        type=int,
-        default=6144,
-        help="Max input length in bp",
-    )
-    parser.add_argument(
-        "--gen_len_bp",
-        type=int,
-        default=None,
-        help=(
-            "Base-pair generation length for Evo2 runs. Defaults to "
-            "gen_len * bp_per_token for each sweep point."
-        ),
-    )
-    parser.add_argument(
-        "--bp_per_token",
-        type=int,
-        default=6,
-        help="Base pairs represented by each HF generation token.",
-    )
-    parser.add_argument(
-        "--batch_size",
-        type=int,
-        default=64,
-        help="Batch size per GPU",
-    )
-    parser.add_argument(
-        "--max_samples",
-        type=int,
-        default=None,
-        help="Optional test-only sample cap passed through to the eval script",
-    )
-    parser.add_argument(
-        "--sample_seed",
-        type=int,
-        default=0,
-        help="Random seed used when --max_samples subsamples the dataset",
-    )
-    parser.add_argument("--bf16", action="store_true", help="Use bfloat16")
-    parser.add_argument(
-        "--use_evo2",
-        action="store_true",
-        help="Use official Evo2 inference path",
-    )
-    parser.add_argument(
-        "--use_dna_tags",
-        action="store_true",
-        help="Wrap DNA sequences with <dna>...</dna> tags",
-    )
-    parser.add_argument(
-        "--no_prefix",
-        action="store_true",
-        help="Do not add a BOS or DNA prefix token",
-    )
-    parser.add_argument(
-        "--use_species_tags",
-        action="store_true",
-        help="Prepend species tags before DNA sequences",
-    )
-    return parser.parse_args()
-
-
-def sanitize_path_component(value: str) -> str:
-    sanitized = re.sub(r"[^A-Za-z0-9._-]+", "-", value).strip("-")
-    return sanitized or "run"
-
-
-def resolve_visible_gpu_ids(requested_count: int) -> list[str]:
-    env_value = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip()
-    if env_value and env_value != "NoDevFiles":
-        visible_ids = [item.strip() for item in env_value.split(",") if item.strip()]
-    else:
-        visible_ids = [str(index) for index in range(torch.cuda.device_count())]
-
-    if len(visible_ids) < requested_count:
-        raise ValueError(
-            f"Requested {requested_count} GPUs but only found {len(visible_ids)} visible GPUs"
-        )
-
-    return visible_ids[:requested_count]
-
-
-def build_eval_command(
-    args: argparse.Namespace,
-    run_dir: Path,
-    gen_len: int,
-) -> list[str]:
-    gen_len_bp = args.gen_len_bp
-    if gen_len_bp is None:
-        gen_len_bp = gen_len * args.bp_per_token
-
-    command = [
-        sys.executable,
-        str(EVAL_SCRIPT),
-        "--model",
-        args.model,
-        "--data_type",
-        args.data_type,
-        "--data_path",
-        args.data_path,
-        "--output_dir",
-        str(run_dir),
-        "--max_seq_len",
-        str(args.max_seq_len),
-        "--gen_len",
-        str(gen_len),
-        "--gen_len_bp",
-        str(gen_len_bp),
-        "--batch_size",
-        str(args.batch_size),
-        "--accuracy_mode",
-        "prediction_length",
-        "--bp_per_token",
-        str(args.bp_per_token),
-    ]
-    if args.model_name:
-        command.extend(["--model_name", args.model_name])
-    if args.revision:
-        command.extend(["--revision", args.revision])
-    if args.max_samples is not None:
-        command.extend(["--max_samples", str(args.max_samples)])
-        command.extend(["--sample_seed", str(args.sample_seed)])
-    if args.bf16:
-        command.append("--bf16")
-    if args.use_evo2:
-        command.append("--use_evo2")
-    if args.use_dna_tags:
-        command.append("--use_dna_tags")
-    if args.no_prefix:
-        command.append("--no_prefix")
-    if args.use_species_tags:
-        command.append("--use_species_tags")
-    return command
-
-
-def load_run_outputs(run_dir: Path) -> tuple[Path, Path, dict]:
-    parquet_paths = sorted(run_dir.glob("*.parquet"))
-    summary_paths = sorted(run_dir.glob("*.json"))
-    if len(parquet_paths) != 1 or len(summary_paths) != 1:
-        raise RuntimeError(
-            f"Expected exactly one parquet and one json in {run_dir}, "
-            f"found {len(parquet_paths)} parquet and {len(summary_paths)} json files"
-        )
-
-    summary_path = summary_paths[0]
-    with summary_path.open("r", encoding="utf-8") as handle:
-        summary = json.load(handle)
-    return parquet_paths[0], summary_path, summary
-
-
-def build_aggregate_rows(
-    gen_len: int, generation_bp: int, run_df: pd.DataFrame
-) -> list[dict]:
-    rows = [
-        {
-            "gen_len": gen_len,
-            "generation_bp": generation_bp,
-            "group": "overall",
-            "accuracy": float(run_df["accuracy"].mean()),
-            "num_sequences": int(len(run_df)),
-            "effective_scored_bp": float(run_df["scored_bp"].mean()),
-        }
-    ]
-
-    if "type" in run_df.columns:
-        grouped = (
-            run_df.groupby("type", dropna=False)
-            .agg(
-                accuracy=("accuracy", "mean"),
-                num_sequences=("accuracy", "size"),
-                effective_scored_bp=("scored_bp", "mean"),
-            )
-            .reset_index()
-        )
-        for row in grouped.to_dict("records"):
-            rows.append(
-                {
-                    "gen_len": gen_len,
-                    "generation_bp": generation_bp,
-                    "group": row["type"],
-                    "accuracy": float(row["accuracy"]),
-                    "num_sequences": int(row["num_sequences"]),
-                    "effective_scored_bp": float(row["effective_scored_bp"]),
-                }
-            )
-
-    return rows
-
-
-def plot_accuracy_by_group(aggregate_df: pd.DataFrame, output_path: Path) -> None:
-    output_path.parent.mkdir(parents=True, exist_ok=True)
-    fig, ax = plt.subplots(figsize=(12, 7))
-
-    for group in GROUP_ORDER:
-        group_df = aggregate_df[aggregate_df["group"] == group].sort_values("gen_len")
-        if group_df.empty:
-            continue
-
-        x_values = group_df["generation_bp"]
-        style = {
-            "marker": "o",
-            "linewidth": 2.75 if group == "overall" else 1.8,
-            "color": "black" if group == "overall" else None,
-        }
-        ax.plot(x_values, group_df["accuracy"], label=group, **style)
-
-    bp_lengths = sorted(aggregate_df["generation_bp"].unique())
-    ax.set_xticks(bp_lengths)
-    ax.set_xticklabels([str(value) for value in bp_lengths])
-    ax.set_xlabel("Base pair generation length")
-    ax.set_ylabel("Accuracy")
-    ax.set_ylim(0.0, 1.0)
-    ax.set_title("Sequence Recovery Accuracy vs Base Pair Generation Length by Type")
-    ax.grid(True, alpha=0.25)
-    ax.legend(loc="best")
-
-    max_scored_bp = aggregate_df["effective_scored_bp"].max()
-    fig.text(
-        0.5,
-        0.01,
-        f"Mean scored bp is capped by the dataset label length. Observed max mean scored bp: {max_scored_bp:.1f}",
-        ha="center",
-    )
-    fig.tight_layout(rect=(0, 0.03, 1, 1))
-    fig.savefig(output_path, dpi=200)
-    plt.close(fig)
-
-
-def plot_overall_accuracy(aggregate_df: pd.DataFrame, output_path: Path) -> None:
-    output_path.parent.mkdir(parents=True, exist_ok=True)
-    overall_df = aggregate_df[aggregate_df["group"] == "overall"].sort_values("gen_len")
-    x_values = overall_df["generation_bp"]
-
-    fig, ax = plt.subplots(figsize=(10, 6))
-    ax.plot(
-        x_values,
-        overall_df["accuracy"],
-        color="black",
-        marker="o",
-        linewidth=2.75,
-    )
-    ax.set_xticks(x_values.tolist())
-    ax.set_xticklabels([str(value) for value in x_values.tolist()])
-    ax.set_xlabel("Base pair generation length")
-    ax.set_ylabel("Overall accuracy")
-    ax.set_ylim(0.0, 1.0)
-    ax.set_title("Overall Sequence Recovery Accuracy vs Base Pair Generation Length")
-    ax.grid(True, alpha=0.25)
-    fig.tight_layout()
-    fig.savefig(output_path, dpi=200)
-    plt.close(fig)
-
-
-def main() -> None:
-    args = parse_args()
-    visible_gpu_ids = resolve_visible_gpu_ids(args.num_gpus)
-
-    model_label = args.model_name or args.model.split("/")[-1]
-    benchmark_root = (
-        args.output_dir
-        / sanitize_path_component(model_label)
-        / sanitize_path_component(args.data_type)
-    )
-    benchmark_root.mkdir(parents=True, exist_ok=True)
-
-    aggregate_rows = []
-    run_manifest = []
-
-    for gen_len in args.gen_lens:
-        run_dir = benchmark_root / f"gen_len_{gen_len}"
-        run_dir.mkdir(parents=True, exist_ok=True)
-
-        command = build_eval_command(args, run_dir, gen_len)
-        env = os.environ.copy()
-        env["CUDA_VISIBLE_DEVICES"] = ",".join(visible_gpu_ids)
-
-        print(
-            f"\nRunning gen_len={gen_len} on GPUs {env['CUDA_VISIBLE_DEVICES']}",
-            flush=True,
-        )
-        print("Command:", " ".join(command), flush=True)
-
-        start_time = time.time()
-        subprocess.run(command, check=True, cwd=REPO_ROOT, env=env)
-        elapsed = time.time() - start_time
-
-        parquet_path, summary_path, summary = load_run_outputs(run_dir)
-        run_df = pd.read_parquet(parquet_path)
-        generation_bp = int(
-            summary.get("requested_rollout_bp") or gen_len * args.bp_per_token
-        )
-        aggregate_rows.extend(build_aggregate_rows(gen_len, generation_bp, run_df))
-
-        run_manifest.append(
-            {
-                "gen_len": gen_len,
-                "run_dir": str(run_dir),
-                "parquet_path": str(parquet_path),
-                "summary_path": str(summary_path),
-                "elapsed_seconds": elapsed,
-                "overall_accuracy": float(summary["overall_accuracy"]),
-                "requested_rollout_bp": generation_bp,
-                "mean_scored_bp": float(summary["mean_scored_bp"]),
-                "visible_gpu_count": int(summary["visible_gpu_count"]),
-            }
-        )
-
-    aggregate_df = pd.DataFrame(aggregate_rows)
-    aggregate_df["group"] = pd.Categorical(
-        aggregate_df["group"],
-        categories=GROUP_ORDER,
-        ordered=True,
-    )
-    aggregate_df = aggregate_df.sort_values(
-        ["group", "generation_bp", "gen_len"]
-    ).reset_index(drop=True)
-
-    aggregate_csv_path = benchmark_root / "accuracy_vs_gen_len.csv"
-    aggregate_json_path = benchmark_root / "benchmark_manifest.json"
-    group_plot_path = benchmark_root / "accuracy_vs_gen_len_by_type.png"
-    overall_plot_path = benchmark_root / "accuracy_vs_gen_len_overall.png"
-
-    aggregate_df.to_csv(aggregate_csv_path, index=False)
-    plot_accuracy_by_group(aggregate_df, group_plot_path)
-    plot_overall_accuracy(aggregate_df, overall_plot_path)
-
-    manifest = {
-        "model": args.model,
-        "model_name": model_label,
-        "revision": args.revision,
-        "data_type": args.data_type,
-        "data_path": args.data_path,
-        "gen_lens": args.gen_lens,
-        "bp_per_token": args.bp_per_token,
-        "accuracy_mode": "prediction_length",
-        "requested_gpu_count": args.num_gpus,
-        "cuda_visible_devices": visible_gpu_ids,
-        "sample_seed": args.sample_seed if args.max_samples is not None else None,
-        "output_root": str(benchmark_root),
-        "runs": run_manifest,
-    }
-    with aggregate_json_path.open("w", encoding="utf-8") as handle:
-        json.dump(manifest, handle, indent=2)
-
-    print(f"\nWrote aggregate CSV to {aggregate_csv_path}")
-    print(f"Wrote benchmark manifest to {aggregate_json_path}")
-    print(f"Wrote grouped plot to {group_plot_path}")
-    print(f"Wrote overall plot to {overall_plot_path}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/evaluation/serving/plot_sequence_recovery_sweep.py b/evaluation/serving/plot_sequence_recovery_sweep.py
deleted file mode 100644
index 2a9e0db..0000000
--- a/evaluation/serving/plot_sequence_recovery_sweep.py
+++ /dev/null
@@ -1,279 +0,0 @@
-"""Plot sequence-recovery accuracy vs generation length across models.
-
-Reads summary JSONs written by `evaluation/sequence_recovery_eval.py` in the
-directory layout produced by `evaluation/submit_sequence_recovery_gen_len_sweep.sh`:
-
-  {base_dir}/{model_name}/{data_type}/gen_len_{gen_len}/*.json
-
-For each model it plots overall_accuracy (and optionally per-type accuracy) as a
-function of gen_len_bp = gen_len * bp_per_token (inferred from summary).
-
-Usage:
-  uv run --project evaluation python evaluation/scripts/plot_sequence_recovery_sweep.py \
-    --base_dir ./eval_results/sequence_recovery_long_rollouts_pow2 \
-    --data_type eukaryote \
-    --model "3B hybrid=Carbon-3B-600B-dna-generv2-fp32-lmhead" \
-    --model "8B hybrid=Carbon-8B-600B-dna-fp32-lmhead" \
-    --model "Evo2 7B=Evo2-7B" \
-    --out scratch/plots/sequence_recovery_sweep_overall.png \
-    --type_panels scratch/plots/sequence_recovery_sweep_types.png
-"""
-
-import argparse
-import glob
-import json
-import os
-from collections import defaultdict
-
-import matplotlib.pyplot as plt
-import numpy as np
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--base_dir",
-        required=True,
-        help="Root dir containing {model_name}/{data_type}/gen_len_*/ subdirs.",
-    )
-    parser.add_argument(
-        "--data_type",
-        default="eukaryote",
-        help="Data-type split name used as a subdirectory.",
-    )
-    parser.add_argument(
-        "--model",
-        action="append",
-        required=True,
-        dest="models",
-        help="Repeatable 'LABEL=MODEL_NAME' or 'LABEL=BASE_DIR::MODEL_NAME' mapping. "
-        "MODEL_NAME must match the directory name under the resolved base dir. "
-        "When BASE_DIR is omitted, --base_dir is used.",
-    )
-    parser.add_argument(
-        "--out",
-        required=True,
-        help="Output PNG path for the overall-accuracy plot.",
-    )
-    parser.add_argument(
-        "--type_panels",
-        default=None,
-        help="Optional output PNG path for a per-type panel grid.",
-    )
-    parser.add_argument(
-        "--random_baseline",
-        type=float,
-        default=0.25,
-        help="Horizontal reference line (default 0.25 = 4-base uniform).",
-    )
-    parser.add_argument(
-        "--title_suffix",
-        default="",
-        help="Optional text appended to plot titles, e.g. '(n=1000 samples)'.",
-    )
-    return parser.parse_args()
-
-
-def load_sweep(base_dir: str, data_type: str, model_name: str):
-    """Return rows = list of dicts, one per gen_len, sorted by gen_len_bp."""
-    pattern = os.path.join(base_dir, model_name, data_type, "gen_len_*", "*.json")
-    paths = glob.glob(pattern)
-    rows = []
-    for p in paths:
-        with open(p) as f:
-            s = json.load(f)
-        gen_len_dir = os.path.basename(os.path.dirname(p))
-        gen_len = int(gen_len_dir.removeprefix("gen_len_"))
-        requested_bp = int(s.get("requested_rollout_bp") or 0)
-        bp_per_token = int(s.get("bp_per_token") or 6)
-        gen_len_bp = requested_bp if requested_bp > 0 else gen_len * bp_per_token
-        rows.append(
-            {
-                "gen_len": gen_len,
-                "gen_len_bp": gen_len_bp,
-                "overall": float(s["overall_accuracy"]),
-                "label_source": s.get("label_source", "dataset"),
-                "type_accuracy": s.get("type_accuracy", {}),
-                "accuracy_mode": s.get("accuracy_mode"),
-            }
-        )
-    rows.sort(key=lambda r: r["gen_len_bp"])
-    return rows
-
-
-def parse_model_specs(specs, default_base_dir):
-    parsed = []
-    for spec in specs:
-        if "=" not in spec:
-            raise SystemExit(
-                f"--model must be 'LABEL=MODEL_NAME' or 'LABEL=BASE_DIR::MODEL_NAME', got: {spec}"
-            )
-        label, rhs = spec.split("=", 1)
-        if "::" in rhs:
-            base, name = rhs.split("::", 1)
-        else:
-            base, name = default_base_dir, rhs
-        parsed.append((label.strip(), base.strip(), name.strip()))
-    return parsed
-
-
-def plot_overall(
-    models_data, out_path: str, random_baseline: float, title_suffix: str = ""
-):
-    fig, ax = plt.subplots(figsize=(11, 6.6), dpi=200)
-    colors = plt.rcParams["axes.prop_cycle"].by_key()["color"]
-    for (label, _), rows, color in zip(
-        models_data.keys(),
-        models_data.values(),
-        colors,
-    ):
-        if not rows:
-            continue
-        xs = [r["gen_len_bp"] for r in rows]
-        ys = [r["overall"] for r in rows]
-        ax.plot(xs, ys, color=color, linewidth=2, label=label, zorder=2)
-        for r in rows:
-            marker = "o" if r["label_source"] == "dataset" else "s"
-            ax.scatter(
-                [r["gen_len_bp"]],
-                [r["overall"]],
-                color=color,
-                marker=marker,
-                s=60,
-                zorder=3,
-                edgecolors="white",
-                linewidths=0.8,
-            )
-
-    ax.axhline(
-        random_baseline,
-        color="#666666",
-        linestyle="--",
-        linewidth=1.2,
-        label="Random baseline",
-    )
-    ax.scatter([], [], color="#444444", marker="o", s=60, label="label_source=dataset")
-    ax.scatter(
-        [], [], color="#444444", marker="s", s=60, label="label_source=sequence_tail"
-    )
-
-    ax.set_xscale("log", base=2)
-    all_x = sorted({r["gen_len_bp"] for rows in models_data.values() for r in rows})
-    if all_x:
-        ax.set_xticks(all_x)
-        ax.set_xticklabels([str(x) for x in all_x])
-    ax.set_xlabel("Generation length (base pairs)")
-    ax.set_ylabel("Accuracy")
-    ax.set_ylim(0.0, 1.0)
-    suffix = f" {title_suffix}" if title_suffix else ""
-    ax.set_title(f"Long-rollout sweep: Overall accuracy{suffix}")
-    ax.grid(True, alpha=0.3)
-    ax.legend(loc="upper right", framealpha=0.95)
-    fig.tight_layout()
-    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
-    fig.savefig(out_path)
-    print(f"Saved overall plot to {out_path}")
-    plt.close(fig)
-
-
-def plot_type_panels(
-    models_data, out_path: str, random_baseline: float, title_suffix: str = ""
-):
-    type_names = sorted(
-        {
-            t
-            for rows in models_data.values()
-            for r in rows
-            for t in r["type_accuracy"].keys()
-        }
-    )
-    if not type_names:
-        print("No per-type accuracy available; skipping type panels")
-        return
-
-    n = len(type_names)
-    cols = 3
-    rows_n = (n + cols - 1) // cols
-    fig, axes = plt.subplots(rows_n, cols, figsize=(cols * 4.5, rows_n * 3.2), dpi=200)
-    axes = np.array(axes).reshape(-1)
-    colors = plt.rcParams["axes.prop_cycle"].by_key()["color"]
-
-    for ax, tname in zip(axes, type_names):
-        for (label, _), rows, color in zip(
-            models_data.keys(),
-            models_data.values(),
-            colors,
-        ):
-            xs = [r["gen_len_bp"] for r in rows if tname in r["type_accuracy"]]
-            ys = [
-                r["type_accuracy"][tname] for r in rows if tname in r["type_accuracy"]
-            ]
-            if not xs:
-                continue
-            ax.plot(xs, ys, color=color, linewidth=1.8, label=label)
-            for r in rows:
-                if tname not in r["type_accuracy"]:
-                    continue
-                marker = "o" if r["label_source"] == "dataset" else "s"
-                ax.scatter(
-                    [r["gen_len_bp"]],
-                    [r["type_accuracy"][tname]],
-                    color=color,
-                    marker=marker,
-                    s=40,
-                    edgecolors="white",
-                    linewidths=0.6,
-                )
-
-        ax.axhline(random_baseline, color="#666666", linestyle="--", linewidth=1.0)
-        ax.set_xscale("log", base=2)
-        ax.set_title(tname)
-        ax.set_ylim(0.0, 1.0)
-        ax.grid(True, alpha=0.3)
-
-    for extra in axes[n:]:
-        extra.set_visible(False)
-
-    handles, labels = axes[0].get_legend_handles_labels()
-    if handles:
-        fig.legend(
-            handles,
-            labels,
-            loc="lower center",
-            ncol=len(labels),
-            bbox_to_anchor=(0.5, -0.01),
-        )
-    suffix = f" {title_suffix}" if title_suffix else ""
-    fig.suptitle(f"Long-rollout sweep: Per-type accuracy{suffix}", y=1.00)
-    fig.tight_layout()
-    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
-    fig.savefig(out_path, bbox_inches="tight")
-    print(f"Saved type-panel plot to {out_path}")
-    plt.close(fig)
-
-
-def main():
-    args = parse_args()
-    model_specs = parse_model_specs(args.models, args.base_dir)
-
-    models_data = {}
-    for label, base, name in model_specs:
-        rows = load_sweep(base, args.data_type, name)
-        models_data[(label, name)] = rows
-        print(f"  [{label}] ({base}/{name}): {len(rows)} gen_len points")
-
-    if not any(models_data.values()):
-        raise SystemExit(
-            f"No summary JSONs found under {args.base_dir}. "
-            f"Check --base_dir / --model names / --data_type."
-        )
-
-    plot_overall(models_data, args.out, args.random_baseline, args.title_suffix)
-    if args.type_panels:
-        plot_type_panels(
-            models_data, args.type_panels, args.random_baseline, args.title_suffix
-        )
-
-
-if __name__ == "__main__":
-    main()