From 43ac53b726433a4682ec119c7e2a22012ab13481 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Mon, 18 May 2026 13:30:52 +0000 Subject: [PATCH 1/6] Remove deprecated stuff --- evaluation/scripts/run_serving_benchmarks.py | 26 ++------------------ 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/evaluation/scripts/run_serving_benchmarks.py b/evaluation/scripts/run_serving_benchmarks.py index e0c76f8..0b819b2 100644 --- a/evaluation/scripts/run_serving_benchmarks.py +++ b/evaluation/scripts/run_serving_benchmarks.py @@ -23,7 +23,6 @@ DEFAULT_OUTPUT_ROOT = REPO_ROOT / "scratch" / "serving_benchmarks" DEFAULT_CARBON_MODEL = "HuggingFaceBio/Carbon-3B" DEFAULT_CARBON_DRAFT_MODEL = "HuggingFaceBio/Carbon-500M" -DEFAULT_CARBON_VLLM_ARCHITECTURE_OVERRIDE = "LlamaForCausalLM" DEFAULT_GENERATOR_MODEL = "GenerTeam/GENERator-v2-eukaryote-3b-base" DEFAULT_EVO2_MODEL = "evo2_7b" SUMMARY_FIELDS = [ @@ -95,8 +94,8 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--split", default="test") parser.add_argument("--num-prompts", type=int, default=16) parser.add_argument("--seed", type=int, default=0) - parser.add_argument("--input-bp", type=int, default=1000) - parser.add_argument("--output-bp", type=int, default=1000) + parser.add_argument("--input-bp", type=int, default=1080) + parser.add_argument("--output-bp", type=int, default=1080) parser.add_argument("--bp-per-token", type=int, default=6) parser.add_argument("--carbon-model", default=DEFAULT_CARBON_MODEL) parser.add_argument( @@ -125,16 +124,6 @@ def parse_args() -> argparse.Namespace: default=None, help="Draft Carbon code revision recorded in --speculative-config.", ) - parser.add_argument( - "--carbon-vllm-architecture-override", - default=DEFAULT_CARBON_VLLM_ARCHITECTURE_OVERRIDE, - help=( - "HF architecture override for Carbon vLLM servers. The default " - "forces vLLM's native Llama implementation and avoids the " - "Transformers-backend attention-name collision in speculative " - "decoding. 
Pass an empty value to disable." - ), - ) parser.add_argument( "--carbon-speculative-tokens", type=int, @@ -540,14 +529,6 @@ def carbon_server_extra_args(args: argparse.Namespace) -> list[str]: extra_args.extend(["--code-revision", args.carbon_code_revision]) if args.carbon_tokenizer_revision: extra_args.extend(["--tokenizer-revision", args.carbon_tokenizer_revision]) - - architecture = args.carbon_vllm_architecture_override.strip() - if not architecture: - return extra_args - hf_overrides = {"architectures": [architecture]} - extra_args.extend( - ["--hf-overrides", json.dumps(hf_overrides, separators=(",", ":"))] - ) return extra_args @@ -652,9 +633,6 @@ def build_run_specs( "carbon_tokenizer_revision": args.carbon_tokenizer_revision or "", "carbon_draft_revision": args.carbon_draft_revision or "", "carbon_draft_code_revision": args.carbon_draft_code_revision or "", - "vllm_architecture_override": ( - args.carbon_vllm_architecture_override.strip() - ), } specs.append( RunSpec( From 9591313f5165fb8ccce71fc4cae5ad15fe9d31e8 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Mon, 18 May 2026 13:59:48 +0000 Subject: [PATCH 2/6] Support multi-GPU serving benchmark specs Add per-spec GPU allocation so Evo2 40B can reserve two visible GPUs while one-GPU serving benchmarks continue to run on individual devices. Also label GENERator vLLM runs from the selected model name. 
Co-authored-by: Codex --- evaluation/scripts/run_serving_benchmarks.py | 103 +++++++++++++++---- 1 file changed, 84 insertions(+), 19 deletions(-) diff --git a/evaluation/scripts/run_serving_benchmarks.py b/evaluation/scripts/run_serving_benchmarks.py index 0b819b2..f89a8e2 100644 --- a/evaluation/scripts/run_serving_benchmarks.py +++ b/evaluation/scripts/run_serving_benchmarks.py @@ -19,7 +19,9 @@ REPO_ROOT = Path(__file__).resolve().parents[2] -EVO2_BENCHMARK_SCRIPT = REPO_ROOT / "evaluation" / "scripts" / "benchmark_evo2_serving.py" +EVO2_BENCHMARK_SCRIPT = ( + REPO_ROOT / "evaluation" / "scripts" / "benchmark_evo2_serving.py" +) DEFAULT_OUTPUT_ROOT = REPO_ROOT / "scratch" / "serving_benchmarks" DEFAULT_CARBON_MODEL = "HuggingFaceBio/Carbon-3B" DEFAULT_CARBON_DRAFT_MODEL = "HuggingFaceBio/Carbon-500M" @@ -63,6 +65,7 @@ class RunSpec: model: str prompt_file: Path run_dir: Path + gpu_count: int = 1 port: int | None = None served_model_name: str | None = None draft_model: str | None = None @@ -215,6 +218,13 @@ def evo2_run_name(model_name: str) -> str: return sanitize_path_component(normalized_model_name) +def evo2_gpu_count(model_name: str) -> int: + normalized_model_name = normalize_evo2_model_name(model_name) + if normalized_model_name in {"evo2_40b", "evo2_40b_base"}: + return 2 + return 1 + + def model_run_name(model_name: str) -> str: return sanitize_path_component(model_name.rsplit("/", 1)[-1].lower()) @@ -226,8 +236,12 @@ def load_sequence_recovery_rows(args: argparse.Namespace) -> list[dict]: try: dataset = load_dataset(repo_id, args.data_config, split=args.split) except Exception: - parquet_path = f"hf://datasets/{repo_id}/{args.data_config}/{args.split}.parquet" - dataset = load_dataset("parquet", data_files={args.split: parquet_path}, split=args.split) + parquet_path = ( + f"hf://datasets/{repo_id}/{args.data_config}/{args.split}.parquet" + ) + dataset = load_dataset( + "parquet", data_files={args.split: parquet_path}, split=args.split + ) return 
list(dataset) @@ -617,10 +631,12 @@ def build_run_specs( and not args.skip_spec_vocab_check and args.carbon_speculative_tokens ): - spec_preflight_ok, spec_preflight_reason = check_speculative_vocab_compatibility( - args.carbon_model, - args.carbon_draft_model, - run_dir / "speculative_vocab_preflight.json", + spec_preflight_ok, spec_preflight_reason = ( + check_speculative_vocab_compatibility( + args.carbon_model, + args.carbon_draft_model, + run_dir / "speculative_vocab_preflight.json", + ) ) if not args.skip_carbon: @@ -659,9 +675,7 @@ def build_run_specs( served_model_name=f"{carbon_name}-spec-{token_count}", draft_model=args.carbon_draft_model, num_speculative_tokens=token_count, - speculative_config=carbon_speculative_config( - args, token_count - ), + speculative_config=carbon_speculative_config(args, token_count), server_extra_args=carbon_extra_args, skip_reason=skip_reason, metadata=carbon_metadata, @@ -678,6 +692,7 @@ def build_run_specs( model=normalized_evo2_model, prompt_file=Path(prompt_files["evo2"]), run_dir=run_dir / evo2_name, + gpu_count=evo2_gpu_count(normalized_evo2_model), metadata={ "prompt_family": "evo2", "requested_model": args.evo2_model, @@ -687,14 +702,15 @@ def build_run_specs( ) if args.generator != "never": + generator_name = model_run_name(args.generator_model) specs.append( RunSpec( - name="generator-v2-eukaryote-3b-vllm", + name=f"{generator_name}-vllm", backend="vllm", model=args.generator_model, prompt_file=Path(prompt_files["generator"]), - run_dir=run_dir / "generator-v2-eukaryote-3b-vllm", - served_model_name="generator-v2-eukaryote-3b-vllm", + run_dir=run_dir / f"{generator_name}-vllm", + served_model_name=f"{generator_name}-vllm", requires_probe=True, skip_on_probe_failure=args.generator == "auto", metadata={"prompt_family": "generator"}, @@ -998,7 +1014,9 @@ def run_spec(args: argparse.Namespace, spec: RunSpec) -> dict: raise ValueError(f"Unsupported backend: {spec.backend}") -def run_gpu_queue(args: argparse.Namespace, 
gpu_id: str, specs: list[RunSpec]) -> list[dict]: +def run_gpu_queue( + args: argparse.Namespace, gpu_id: str, specs: list[RunSpec] +) -> list[dict]: rows = [] for spec in specs: spec.gpu_id = gpu_id @@ -1059,12 +1077,59 @@ def assign_specs_to_gpus( gpu_ids: list[str], max_parallel: int, ) -> dict[str, list[RunSpec]]: - active_gpus = gpu_ids[: min(max_parallel, len(gpu_ids), len(specs))] - assignments = {gpu_id: [] for gpu_id in active_gpus} - for index, spec in enumerate(specs): - gpu_id = active_gpus[index % len(active_gpus)] + active_gpus = gpu_ids[: min(max_parallel, len(gpu_ids))] + if not active_gpus: + return {} + for spec in specs: + if spec.gpu_count <= 0: + raise ValueError(f"{spec.name} has invalid gpu_count={spec.gpu_count}") + if spec.gpu_count > len(active_gpus): + raise RuntimeError( + f"{spec.name} requires {spec.gpu_count} GPUs, but only " + f"{len(active_gpus)} are available after --max-parallel." + ) + + single_gpu_specs = [spec for spec in specs if spec.gpu_count == 1] + remaining_gpus = list(active_gpus) + multi_gpu_groups: dict[int, list[tuple[str, ...]]] = {} + + for gpu_count in sorted( + {spec.gpu_count for spec in specs if spec.gpu_count > 1}, + reverse=True, + ): + specs_for_count = [spec for spec in specs if spec.gpu_count == gpu_count] + max_groups = len(remaining_gpus) // gpu_count + if single_gpu_specs: + max_groups = min(max_groups, max(1, (len(remaining_gpus) - 1) // gpu_count)) + group_count = min(len(specs_for_count), max_groups) + if group_count <= 0: + raise RuntimeError( + f"No free GPU group is available for {gpu_count}-GPU benchmark specs." 
+ ) + groups = [] + for _ in range(group_count): + group = tuple(remaining_gpus[:gpu_count]) + del remaining_gpus[:gpu_count] + groups.append(group) + multi_gpu_groups[gpu_count] = groups + + single_gpu_groups = [(gpu_id,) for gpu_id in remaining_gpus] + if single_gpu_specs and not single_gpu_groups: + raise RuntimeError("No free GPU is available for one-GPU benchmark specs.") + + assignments: dict[str, list[RunSpec]] = {} + group_offsets: dict[int, int] = {} + for spec in specs: + if spec.gpu_count == 1: + groups = single_gpu_groups + else: + groups = multi_gpu_groups[spec.gpu_count] + offset = group_offsets.get(spec.gpu_count, 0) + group = groups[offset % len(groups)] + group_offsets[spec.gpu_count] = offset + 1 + gpu_id = ",".join(group) spec.gpu_id = gpu_id - assignments[gpu_id].append(spec) + assignments.setdefault(gpu_id, []).append(spec) return assignments From ef58fb050ce1c67cd64f40f8e5929c801d733e65 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Mon, 18 May 2026 14:03:40 +0000 Subject: [PATCH 3/6] Add bp throughput to serving summaries Record output_bp_per_second in serving benchmark summary rows using the model family's output-token-to-base-pair ratio. 
Co-authored-by: Codex --- evaluation/scripts/run_serving_benchmarks.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/evaluation/scripts/run_serving_benchmarks.py b/evaluation/scripts/run_serving_benchmarks.py index f89a8e2..bb9210e 100644 --- a/evaluation/scripts/run_serving_benchmarks.py +++ b/evaluation/scripts/run_serving_benchmarks.py @@ -46,6 +46,7 @@ "total_output_tokens", "request_throughput", "output_throughput", + "output_bp_per_second", "total_token_throughput", "mean_ttft_ms", "mean_tpot_ms", @@ -66,6 +67,7 @@ class RunSpec: prompt_file: Path run_dir: Path gpu_count: int = 1 + bp_per_output_token: float = 1.0 port: int | None = None served_model_name: str | None = None draft_model: str | None = None @@ -657,6 +659,7 @@ def build_run_specs( model=args.carbon_model, prompt_file=Path(prompt_files["carbon"]), run_dir=run_dir / f"{carbon_name}-vllm", + bp_per_output_token=args.bp_per_token, served_model_name=f"{carbon_name}-vllm", server_extra_args=carbon_extra_args, metadata=carbon_metadata, @@ -672,6 +675,7 @@ def build_run_specs( model=args.carbon_model, prompt_file=Path(prompt_files["carbon"]), run_dir=run_dir / f"{carbon_name}-spec-{token_count}", + bp_per_output_token=args.bp_per_token, served_model_name=f"{carbon_name}-spec-{token_count}", draft_model=args.carbon_draft_model, num_speculative_tokens=token_count, @@ -710,6 +714,7 @@ def build_run_specs( model=args.generator_model, prompt_file=Path(prompt_files["generator"]), run_dir=run_dir / f"{generator_name}-vllm", + bp_per_output_token=args.bp_per_token, served_model_name=f"{generator_name}-vllm", requires_probe=True, skip_on_probe_failure=args.generator == "auto", @@ -824,6 +829,9 @@ def metric_row_from_result(spec: RunSpec, result: dict, status: str) -> dict: "p99_itl_ms", ]: row[key] = result.get(key, "") + output_throughput = result.get("output_throughput") + if output_throughput not in (None, ""): + row["output_bp_per_second"] = output_throughput * spec.bp_per_output_token 
row["result_json"] = str(spec.run_dir / "benchmark.json") return row @@ -848,6 +856,7 @@ def base_summary_row(spec: RunSpec, status: str) -> dict: "total_output_tokens": "", "request_throughput": "", "output_throughput": "", + "output_bp_per_second": "", "total_token_throughput": "", "mean_ttft_ms": "", "mean_tpot_ms": "", From eea32a427e76b5c4f6cdc5f35f7c663bd7a61db0 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Wed, 20 May 2026 13:23:20 +0000 Subject: [PATCH 4/6] Rename --- evaluation/{scripts => serving}/benchmark_evo2_serving.py | 0 .../benchmark_generator_transformers_serving.py | 0 .../{scripts => serving}/benchmark_sequence_recovery_gen_len.py | 0 evaluation/{scripts => serving}/plot_sequence_recovery_sweep.py | 0 evaluation/{scripts => serving}/run_serving_benchmarks.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename evaluation/{scripts => serving}/benchmark_evo2_serving.py (100%) rename evaluation/{scripts => serving}/benchmark_generator_transformers_serving.py (100%) rename evaluation/{scripts => serving}/benchmark_sequence_recovery_gen_len.py (100%) rename evaluation/{scripts => serving}/plot_sequence_recovery_sweep.py (100%) rename evaluation/{scripts => serving}/run_serving_benchmarks.py (100%) diff --git a/evaluation/scripts/benchmark_evo2_serving.py b/evaluation/serving/benchmark_evo2_serving.py similarity index 100% rename from evaluation/scripts/benchmark_evo2_serving.py rename to evaluation/serving/benchmark_evo2_serving.py diff --git a/evaluation/scripts/benchmark_generator_transformers_serving.py b/evaluation/serving/benchmark_generator_transformers_serving.py similarity index 100% rename from evaluation/scripts/benchmark_generator_transformers_serving.py rename to evaluation/serving/benchmark_generator_transformers_serving.py diff --git a/evaluation/scripts/benchmark_sequence_recovery_gen_len.py b/evaluation/serving/benchmark_sequence_recovery_gen_len.py similarity index 100% rename from 
evaluation/scripts/benchmark_sequence_recovery_gen_len.py rename to evaluation/serving/benchmark_sequence_recovery_gen_len.py diff --git a/evaluation/scripts/plot_sequence_recovery_sweep.py b/evaluation/serving/plot_sequence_recovery_sweep.py similarity index 100% rename from evaluation/scripts/plot_sequence_recovery_sweep.py rename to evaluation/serving/plot_sequence_recovery_sweep.py diff --git a/evaluation/scripts/run_serving_benchmarks.py b/evaluation/serving/run_serving_benchmarks.py similarity index 100% rename from evaluation/scripts/run_serving_benchmarks.py rename to evaluation/serving/run_serving_benchmarks.py From 75eb7f38736318a1d9be03e43a09738ebc7033da Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Wed, 20 May 2026 13:35:53 +0000 Subject: [PATCH 5/6] Document serving inference benchmarks Add commands for dry-run, Carbon vLLM, speculative decoding, full-node comparisons, and direct Evo2 serving benchmarks. Fix the serving wrapper's Evo2 helper path so the documented wrapper can launch it. Co-authored-by: Codex --- evaluation/README.md | 92 ++++++++++++++++++++ evaluation/serving/run_serving_benchmarks.py | 2 +- 2 files changed, 93 insertions(+), 1 deletion(-) diff --git a/evaluation/README.md b/evaluation/README.md index 5b0ac6b..c92c51e 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -12,6 +12,7 @@ one flag, so the same script runs on Carbon, GENERator, or Evo2. 3. [ClinVar VEP](#3-clinvar-vep) — right-end / next-token scoring (GENERator recipe) 4. [Sequence-level perturbation tasks](#4-sequence-level-perturbation-tasks) — nucleotide triplet-expansion + synonymous codon substitution, new tasks we built 5. [Genome-NIAH long-context retrieval](#5-genome-niah-long-context-retrieval) — long-context needle-in-a-haystack for DNA (4 tasks × 6 context lengths up to 786 kbp) +6. 
[Serving inference benchmarks](#6-serving-inference-benchmarks) — vLLM and Evo2 latency / throughput on sequence-recovery prompts ## Scripts @@ -333,6 +334,97 @@ for SHARD in 0 1 2 3 4 5; do sbatch evaluation/slurm/evo2-7b/genome_niah.sbatch done ``` + +## 6. Serving inference benchmarks + +Serving benchmarks live in [`evaluation/serving/`](serving). They sample +prompts from the sequence-recovery dataset, run fixed-length generation, and +write vLLM-style latency / throughput metrics under `scratch/serving_benchmarks`. +The wrapper runs Carbon through `vllm serve`, optional speculative Carbon with a +draft model, optional GENERator through vLLM, and Evo2 through the local Evo2 +benchmark helper. + +Use `--dry-run` first to prepare prompt files and inspect the exact server and +benchmark commands without starting any model servers: + +```bash +uv run --group evaluation python evaluation/serving/run_serving_benchmarks.py \ + --run-id serving-dry-run \ + --num-prompts 4 \ + --input-bp 1080 --output-bp 1080 \ + --gpu-ids 0,1 --max-parallel 2 \ + --generator never \ + --dry-run +``` + +The dry run writes commands to: + +```text +scratch/serving_benchmarks/serving-dry-run/dry_run_commands.sh +``` + +Carbon-only vLLM smoke benchmark: + +```bash +uv run --group evaluation python evaluation/serving/run_serving_benchmarks.py \ + --run-id carbon-3b-vllm-smoke \ + --num-prompts 16 \ + --input-bp 1080 --output-bp 1080 \ + --gpu-ids 0 --max-parallel 1 \ + --skip-evo2 --skip-speculative --generator never +``` + +Carbon target plus speculative decoding with a Carbon draft model: + +```bash +uv run --group evaluation python evaluation/serving/run_serving_benchmarks.py \ + --run-id carbon-3b-speculative \ + --num-prompts 64 \ + --input-bp 1080 --output-bp 1080 \ + --gpu-ids 0,1,2,3 --max-parallel 4 \ + --carbon-model HuggingFaceBio/Carbon-3B \ + --carbon-draft-model HuggingFaceBio/Carbon-500M \ + --carbon-speculative-tokens 2 4 8 \ + --skip-evo2 --generator never +``` + +Full 
comparison on one 8-GPU node, including Carbon, speculative Carbon, Evo2, +and GENERator when its vLLM compatibility probe succeeds: + +```bash +uv run --group evaluation python evaluation/serving/run_serving_benchmarks.py \ + --run-id serving-full-8gpu \ + --num-prompts 128 \ + --input-bp 1080 --output-bp 1080 \ + --gpu-ids 0,1,2,3,4,5,6,7 --max-parallel 8 \ + --generator auto +``` + +To run only the Evo2 helper, first prepare prompts, then point +[`benchmark_evo2_serving.py`](serving/benchmark_evo2_serving.py) at the Evo2 +prompt JSONL: + +```bash +uv run --group evaluation python evaluation/serving/run_serving_benchmarks.py \ + --run-id evo2-serving-prompts \ + --num-prompts 16 \ + --input-bp 1080 --output-bp 1080 \ + --prepare-only + +uv run --group evaluation python evaluation/serving/benchmark_evo2_serving.py \ + --model evo2_7b \ + --dataset-path scratch/serving_benchmarks/evo2-serving-prompts/prompts/evo2_prompts.jsonl \ + --num-prompts 16 \ + --output-dir scratch/serving_benchmarks/evo2-7b-direct \ + --temperature 1.0 --top-k 1 --top-p 0.0 +``` + +Each run writes a top-level `summary.csv` and `summary.json`, plus one +subdirectory per benchmark with `commands.json`, logs, `benchmark.json`, and +`detailed.json` where available. The `output_throughput` column is generated +tokens per second; `output_bp_per_second` converts Carbon and GENERator 6-mer +tokens back to base pairs per second. 
+ ## Environment Install the root project environment with uv: diff --git a/evaluation/serving/run_serving_benchmarks.py b/evaluation/serving/run_serving_benchmarks.py index bb9210e..aac3986 100644 --- a/evaluation/serving/run_serving_benchmarks.py +++ b/evaluation/serving/run_serving_benchmarks.py @@ -20,7 +20,7 @@ REPO_ROOT = Path(__file__).resolve().parents[2] EVO2_BENCHMARK_SCRIPT = ( - REPO_ROOT / "evaluation" / "scripts" / "benchmark_evo2_serving.py" + REPO_ROOT / "evaluation" / "serving" / "benchmark_evo2_serving.py" ) DEFAULT_OUTPUT_ROOT = REPO_ROOT / "scratch" / "serving_benchmarks" DEFAULT_CARBON_MODEL = "HuggingFaceBio/Carbon-3B" From eee0d1999e4c185551e7ce94af02a44d3065ba88 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Wed, 20 May 2026 13:40:44 +0000 Subject: [PATCH 6/6] clean up --- ...enchmark_generator_transformers_serving.py | 489 ------------------ .../benchmark_sequence_recovery_gen_len.py | 429 --------------- .../serving/plot_sequence_recovery_sweep.py | 279 ---------- 3 files changed, 1197 deletions(-) delete mode 100644 evaluation/serving/benchmark_generator_transformers_serving.py delete mode 100644 evaluation/serving/benchmark_sequence_recovery_gen_len.py delete mode 100644 evaluation/serving/plot_sequence_recovery_sweep.py diff --git a/evaluation/serving/benchmark_generator_transformers_serving.py b/evaluation/serving/benchmark_generator_transformers_serving.py deleted file mode 100644 index 1cdb243..0000000 --- a/evaluation/serving/benchmark_generator_transformers_serving.py +++ /dev/null @@ -1,489 +0,0 @@ -import argparse -import json -import math -import statistics -import time -from datetime import datetime -from pathlib import Path - - -SPECIAL_TOKENS = [ - "", - "", - "", - "", - "", - "", - "", - "", - "", - "<+>", - "<->", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", -] - - -def parse_args() -> argparse.Namespace: - parser = 
argparse.ArgumentParser( - description=( - "Benchmark GENERator generation through native Transformers with a " - "vLLM-like serving metric block." - ) - ) - parser.add_argument( - "--model", - default="GenerTeam/GENERator-v2-eukaryote-1.2b-base", - help="Hugging Face model id or local path.", - ) - parser.add_argument( - "--dataset-path", - type=Path, - required=True, - help="Prepared JSONL prompt file.", - ) - parser.add_argument("--num-prompts", type=int, default=16) - parser.add_argument( - "--output-dir", - type=Path, - required=True, - help="Directory for result files.", - ) - parser.add_argument("--result-json", type=Path, default=None) - parser.add_argument("--detailed-json", type=Path, default=None) - parser.add_argument("--batch-size", type=int, default=1) - parser.add_argument( - "--dtype", - choices=["bfloat16", "float16", "float32"], - default="bfloat16", - ) - parser.add_argument("--device", default="cuda:0") - parser.add_argument("--bp-per-token", type=int, default=6) - parser.add_argument("--kmer-size", type=int, default=6) - parser.add_argument("--temperature", type=float, default=0.00001) - parser.add_argument("--top-k", type=int, default=1) - parser.add_argument("--trust-remote-code", action="store_true") - parser.add_argument("--num-warmups", type=int, default=0) - parser.add_argument("--label", default="generator-transformers") - return parser.parse_args() - - -def read_jsonl(path: Path) -> list[dict]: - rows = [] - with path.open("r", encoding="utf-8") as handle: - for line_number, line in enumerate(handle, start=1): - line = line.strip() - if not line: - continue - item = json.loads(line) - if "prompt" not in item: - raise ValueError(f"{path}:{line_number} is missing 'prompt'") - if "output_tokens" not in item: - raise ValueError(f"{path}:{line_number} is missing 'output_tokens'") - item["output_tokens"] = int(item["output_tokens"]) - rows.append(item) - return rows - - -def percentile(values: list[float], percentile_value: float) -> float: - 
if not values: - return 0.0 - ordered = sorted(values) - if len(ordered) == 1: - return ordered[0] - rank = (len(ordered) - 1) * percentile_value / 100.0 - lower = math.floor(rank) - upper = math.ceil(rank) - if lower == upper: - return ordered[int(rank)] - weight = rank - lower - return ordered[lower] * (1 - weight) + ordered[upper] * weight - - -def mean_ms(values: list[float]) -> float: - return (statistics.mean(values) * 1000.0) if values else 0.0 - - -def median_ms(values: list[float]) -> float: - return (statistics.median(values) * 1000.0) if values else 0.0 - - -def std_ms(values: list[float]) -> float: - return (statistics.pstdev(values) * 1000.0) if len(values) > 1 else 0.0 - - -def p99_ms(values: list[float]) -> float: - return percentile(values, 99.0) * 1000.0 - - -def synchronize_cuda() -> None: - try: - import torch - - if torch.cuda.is_available(): - torch.cuda.synchronize() - except Exception: - return - - -class KmerTokenizer: - def __init__(self, k: int): - import itertools - import re - - self.k = k - self.special_tokens = SPECIAL_TOKENS - self.vocab = { - token: index - for index, token in enumerate( - self.special_tokens - + ["".join(kmer) for kmer in itertools.product("ATCG", repeat=k)] - ) - } - self.ids_to_tokens = {index: token for token, index in self.vocab.items()} - self.special_token_pattern = re.compile( - "|".join(re.escape(token) for token in self.special_tokens) - ) - self.dna_pattern = re.compile(f"[A-Z]{{{self.k}}}|[A-Z]+") - self.bos_token_id = self.vocab[""] - self.eos_token_id = self.vocab[""] - self.pad_token_id = self.vocab[""] - self.unk_token_id = self.vocab[""] - - @property - def vocab_size(self) -> int: - return len(self.vocab) - - def tokenize(self, text: str) -> list[str]: - tokens = [] - pos = 0 - while pos < len(text): - special_match = self.special_token_pattern.match(text, pos) - if special_match: - tokens.append(special_match.group()) - pos = special_match.end() - continue - dna_match = self.dna_pattern.match(text, 
pos) - if dna_match: - tokens.append(dna_match.group()) - pos = dna_match.end() - continue - tokens.append(text[pos]) - pos += 1 - return tokens - - def encode(self, text: str, add_bos_token: bool = True) -> list[int]: - token_ids = [ - self.vocab.get(token, self.unk_token_id) for token in self.tokenize(text) - ] - if add_bos_token: - return [self.bos_token_id] + token_ids - return token_ids - - def decode(self, token_ids: list[int], skip_special_tokens: bool = True) -> str: - tokens = [] - for token_id in token_ids: - token = self.ids_to_tokens.get(int(token_id), "") - if skip_special_tokens and token in self.special_tokens: - continue - tokens.append(token) - return "".join(tokens) - - -def load_model(args: argparse.Namespace): - import torch - from transformers import AutoModelForCausalLM - - dtype_map = { - "bfloat16": torch.bfloat16, - "float16": torch.float16, - "float32": torch.float32, - } - model = AutoModelForCausalLM.from_pretrained( - args.model, - trust_remote_code=args.trust_remote_code, - dtype=dtype_map[args.dtype], - low_cpu_mem_usage=True, - ) - model.to(args.device) - model.eval() - return model - - -def make_batch( - requests: list[dict], - tokenizer: KmerTokenizer, - device: str, -) -> tuple: - import torch - - encoded = [tokenizer.encode(request["prompt"], add_bos_token=True) for request in requests] - max_len = max(len(item) for item in encoded) - input_ids = [] - attention_mask = [] - for item in encoded: - pad_len = max_len - len(item) - input_ids.append([tokenizer.pad_token_id] * pad_len + item) - attention_mask.append([0] * pad_len + [1] * len(item)) - return ( - torch.tensor(input_ids, dtype=torch.long, device=device), - torch.tensor(attention_mask, dtype=torch.long, device=device), - [len(item) for item in encoded], - ) - - -def run_batch( - model, - requests: list[dict], - tokenizer: KmerTokenizer, - args: argparse.Namespace, -) -> list[dict]: - import torch - - output_tokens = int(requests[0]["output_tokens"]) - output_lengths = 
{int(request["output_tokens"]) for request in requests} - if len(output_lengths) != 1: - raise ValueError("All requests in a batch must have the same output_tokens") - - input_ids, attention_mask, prompt_lens = make_batch(requests, tokenizer, args.device) - synchronize_cuda() - start_perf = time.perf_counter() - start_wall = time.time() - with torch.inference_mode(): - generated = model.generate( - input_ids=input_ids, - attention_mask=attention_mask, - max_new_tokens=output_tokens, - min_new_tokens=output_tokens, - do_sample=True, - temperature=args.temperature, - top_k=args.top_k, - use_cache=True, - eos_token_id=None, - pad_token_id=tokenizer.pad_token_id, - ) - synchronize_cuda() - end_perf = time.perf_counter() - latency = end_perf - start_perf - - details = [] - for index, request in enumerate(requests): - prompt_len = prompt_lens[index] - generated_ids = generated[index, input_ids.shape[1] :].tolist() - generated_ids = generated_ids[:output_tokens] - generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True) - details.append( - { - "success": True, - "request_id": request.get("request_id"), - "prompt_len": prompt_len, - "output_len": len(generated_ids), - "expected_output_len": output_tokens, - "output_bp": len(generated_ids) * args.bp_per_token, - "ttft": 0.0, - "itl": [], - "latency": latency, - "start_time": start_wall, - "generated_text": generated_text, - "error": "", - "metadata": request.get("metadata", {}), - "batch_size": len(requests), - } - ) - return details - - -def summarize_results( - requests: list[dict], - details: list[dict], - duration: float, - args: argparse.Namespace, -) -> dict: - successes = [item for item in details if item["success"]] - failures = [item for item in details if not item["success"]] - ttfts = [item["ttft"] for item in successes if item["ttft"] > 0] - itls = [latency for item in successes for latency in item["itl"]] - e2els = [item["latency"] for item in successes] - total_input = sum(item["prompt_len"] 
for item in successes) - total_output = sum(item["output_len"] for item in successes) - completed = len(successes) - safe_duration = duration if duration > 0 else 1e-12 - - return { - "date": datetime.now().strftime("%Y%m%d-%H%M%S"), - "backend": "transformers", - "endpoint_type": "transformers", - "label": args.label, - "model_id": args.model, - "tokenizer_id": "builtin-kmer", - "num_prompts": len(requests), - "batch_size": max(1, args.batch_size), - "duration": duration, - "completed": completed, - "failed": len(failures), - "total_input_tokens": total_input, - "total_output_tokens": total_output, - "request_throughput": completed / safe_duration, - "request_goodput": None, - "output_throughput": total_output / safe_duration, - "bp_per_token": args.bp_per_token, - "output_bp_throughput": (total_output * args.bp_per_token) / safe_duration, - "total_token_throughput": (total_input + total_output) / safe_duration, - "input_lens": [item["prompt_len"] for item in details], - "output_lens": [item["output_len"] for item in details], - "ttfts": [item["ttft"] for item in details], - "itls": [item["itl"] for item in details], - "start_times": [item["start_time"] for item in details], - "generated_texts": [item["generated_text"] for item in details], - "errors": [item["error"] for item in details], - "mean_ttft_ms": mean_ms(ttfts), - "median_ttft_ms": median_ms(ttfts), - "std_ttft_ms": std_ms(ttfts), - "p99_ttft_ms": p99_ms(ttfts), - "mean_tpot_ms": 0.0, - "median_tpot_ms": 0.0, - "std_tpot_ms": 0.0, - "p99_tpot_ms": 0.0, - "mean_itl_ms": mean_ms(itls), - "median_itl_ms": median_ms(itls), - "std_itl_ms": std_ms(itls), - "p99_itl_ms": p99_ms(itls), - "mean_e2el_ms": mean_ms(e2els), - "median_e2el_ms": median_ms(e2els), - "std_e2el_ms": std_ms(e2els), - "p99_e2el_ms": p99_ms(e2els), - "max_output_tokens_per_s": 0.0, - "max_concurrent_requests": min(max(1, args.batch_size), completed), - "rtfx": 0.0, - } - - -def print_metric_block(result: dict) -> None: - 
print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) - print("{:<40} {:<10}".format("Successful requests:", result["completed"])) - print("{:<40} {:<10}".format("Failed requests:", result["failed"])) - print("{:<40} {:<10.2f}".format("Benchmark duration (s):", result["duration"])) - print("{:<40} {:<10}".format("Total input tokens:", result["total_input_tokens"])) - print( - "{:<40} {:<10}".format( - "Total generated tokens:", result["total_output_tokens"] - ) - ) - print( - "{:<40} {:<10.2f}".format( - "Request throughput (req/s):", result["request_throughput"] - ) - ) - print( - "{:<40} {:<10.2f}".format( - "Output token throughput (tok/s):", result["output_throughput"] - ) - ) - print( - "{:<40} {:<10.2f}".format( - "Output bp throughput (bp/s):", result["output_bp_throughput"] - ) - ) - print( - "{:<40} {:<10.2f}".format( - "Total token throughput (tok/s):", result["total_token_throughput"] - ) - ) - print("=" * 50) - - -def main() -> None: - args = parse_args() - args.output_dir.mkdir(parents=True, exist_ok=True) - result_json = args.result_json or (args.output_dir / "benchmark.json") - detailed_json = args.detailed_json or (args.output_dir / "detailed.json") - - requests = read_jsonl(args.dataset_path) - if args.num_prompts > 0: - requests = requests[: args.num_prompts] - if not requests: - raise ValueError("No requests to benchmark") - - tokenizer = KmerTokenizer(args.kmer_size) - model = load_model(args) - - batch_size = max(1, args.batch_size) - warmups = requests[: args.num_warmups] - for batch_start in range(0, len(warmups), batch_size): - _ = run_batch( - model, - warmups[batch_start : batch_start + batch_size], - tokenizer, - args, - ) - - benchmark_start = time.perf_counter() - details = [] - try: - for batch_start in range(0, len(requests), batch_size): - details.extend( - run_batch( - model, - requests[batch_start : batch_start + batch_size], - tokenizer, - args, - ) - ) - except Exception as exc: - details.append( - { - 
"success": False, - "request_id": None, - "prompt_len": 0, - "output_len": 0, - "expected_output_len": 0, - "output_bp": 0, - "ttft": 0.0, - "itl": [], - "latency": 0.0, - "start_time": time.time(), - "generated_text": "", - "error": repr(exc), - "metadata": {}, - "batch_size": batch_size, - } - ) - synchronize_cuda() - duration = time.perf_counter() - benchmark_start - - result = summarize_results(requests, details, duration, args) - print_metric_block(result) - - with detailed_json.open("w", encoding="utf-8") as handle: - json.dump(details, handle, indent=2) - with result_json.open("w", encoding="utf-8") as handle: - json.dump(result, handle, indent=2) - - print(f"Result JSON: {result_json}") - print(f"Detailed JSON: {detailed_json}") - - if result["failed"]: - raise SystemExit(1) - - -if __name__ == "__main__": - main() diff --git a/evaluation/serving/benchmark_sequence_recovery_gen_len.py b/evaluation/serving/benchmark_sequence_recovery_gen_len.py deleted file mode 100644 index 15d249e..0000000 --- a/evaluation/serving/benchmark_sequence_recovery_gen_len.py +++ /dev/null @@ -1,429 +0,0 @@ -import argparse -import json -import os -import re -import subprocess -import sys -import time -from pathlib import Path - -import matplotlib.pyplot as plt -import pandas as pd -import torch - -REPO_ROOT = Path(__file__).resolve().parents[2] -EVAL_SCRIPT = REPO_ROOT / "evaluation" / "sequence_recovery_eval.py" -DEFAULT_OUTPUT_DIR = REPO_ROOT / "scratch" / "sequence_recovery_gen_len_benchmark" -DEFAULT_GEN_LENS = [5, 10, 20, 40, 80, 160, 320, 640] -GROUP_ORDER = [ - "fungi", - "invertebrate", - "plant", - "protozoa", - "vertebrate_mammalian", - "vertebrate_other", - "overall", -] - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description=( - "Run sequence recovery serially over a gen_len sweep, using all selected GPUs " - "for each eval, then aggregate and plot accuracy-vs-gen_len." 
- ) - ) - parser.add_argument("--model", required=True, help="Model name or path") - parser.add_argument( - "--model_name", - default=None, - help="Optional output name override passed through to sequence_recovery_eval.py", - ) - parser.add_argument( - "--revision", - default=None, - help="Optional model revision/tag/commit", - ) - parser.add_argument( - "--data_type", - default="eukaryote", - choices=["eukaryote", "bacteria", "others"], - help="Dataset split to evaluate", - ) - parser.add_argument( - "--data_path", - default="hf://datasets/GenerTeam/sequence-recovery", - help="HF dataset parquet path", - ) - parser.add_argument( - "--output_dir", - type=Path, - default=DEFAULT_OUTPUT_DIR, - help="Directory for run outputs and aggregate artifacts", - ) - parser.add_argument( - "--gen_lens", - type=int, - nargs="+", - default=DEFAULT_GEN_LENS, - help="gen_len values to benchmark serially", - ) - parser.add_argument( - "--num_gpus", - type=int, - default=8, - help="Number of visible GPUs to expose to each eval subprocess", - ) - parser.add_argument( - "--max_seq_len", - type=int, - default=6144, - help="Max input length in bp", - ) - parser.add_argument( - "--gen_len_bp", - type=int, - default=None, - help=( - "Base-pair generation length for Evo2 runs. Defaults to " - "gen_len * bp_per_token for each sweep point." 
- ), - ) - parser.add_argument( - "--bp_per_token", - type=int, - default=6, - help="Base pairs represented by each HF generation token.", - ) - parser.add_argument( - "--batch_size", - type=int, - default=64, - help="Batch size per GPU", - ) - parser.add_argument( - "--max_samples", - type=int, - default=None, - help="Optional test-only sample cap passed through to the eval script", - ) - parser.add_argument( - "--sample_seed", - type=int, - default=0, - help="Random seed used when --max_samples subsamples the dataset", - ) - parser.add_argument("--bf16", action="store_true", help="Use bfloat16") - parser.add_argument( - "--use_evo2", - action="store_true", - help="Use official Evo2 inference path", - ) - parser.add_argument( - "--use_dna_tags", - action="store_true", - help="Wrap DNA sequences with ... tags", - ) - parser.add_argument( - "--no_prefix", - action="store_true", - help="Do not add a BOS or DNA prefix token", - ) - parser.add_argument( - "--use_species_tags", - action="store_true", - help="Prepend species tags before DNA sequences", - ) - return parser.parse_args() - - -def sanitize_path_component(value: str) -> str: - sanitized = re.sub(r"[^A-Za-z0-9._-]+", "-", value).strip("-") - return sanitized or "run" - - -def resolve_visible_gpu_ids(requested_count: int) -> list[str]: - env_value = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip() - if env_value and env_value != "NoDevFiles": - visible_ids = [item.strip() for item in env_value.split(",") if item.strip()] - else: - visible_ids = [str(index) for index in range(torch.cuda.device_count())] - - if len(visible_ids) < requested_count: - raise ValueError( - f"Requested {requested_count} GPUs but only found {len(visible_ids)} visible GPUs" - ) - - return visible_ids[:requested_count] - - -def build_eval_command( - args: argparse.Namespace, - run_dir: Path, - gen_len: int, -) -> list[str]: - gen_len_bp = args.gen_len_bp - if gen_len_bp is None: - gen_len_bp = gen_len * args.bp_per_token - - command = [ 
- sys.executable, - str(EVAL_SCRIPT), - "--model", - args.model, - "--data_type", - args.data_type, - "--data_path", - args.data_path, - "--output_dir", - str(run_dir), - "--max_seq_len", - str(args.max_seq_len), - "--gen_len", - str(gen_len), - "--gen_len_bp", - str(gen_len_bp), - "--batch_size", - str(args.batch_size), - "--accuracy_mode", - "prediction_length", - "--bp_per_token", - str(args.bp_per_token), - ] - if args.model_name: - command.extend(["--model_name", args.model_name]) - if args.revision: - command.extend(["--revision", args.revision]) - if args.max_samples is not None: - command.extend(["--max_samples", str(args.max_samples)]) - command.extend(["--sample_seed", str(args.sample_seed)]) - if args.bf16: - command.append("--bf16") - if args.use_evo2: - command.append("--use_evo2") - if args.use_dna_tags: - command.append("--use_dna_tags") - if args.no_prefix: - command.append("--no_prefix") - if args.use_species_tags: - command.append("--use_species_tags") - return command - - -def load_run_outputs(run_dir: Path) -> tuple[Path, Path, dict]: - parquet_paths = sorted(run_dir.glob("*.parquet")) - summary_paths = sorted(run_dir.glob("*.json")) - if len(parquet_paths) != 1 or len(summary_paths) != 1: - raise RuntimeError( - f"Expected exactly one parquet and one json in {run_dir}, " - f"found {len(parquet_paths)} parquet and {len(summary_paths)} json files" - ) - - summary_path = summary_paths[0] - with summary_path.open("r", encoding="utf-8") as handle: - summary = json.load(handle) - return parquet_paths[0], summary_path, summary - - -def build_aggregate_rows( - gen_len: int, generation_bp: int, run_df: pd.DataFrame -) -> list[dict]: - rows = [ - { - "gen_len": gen_len, - "generation_bp": generation_bp, - "group": "overall", - "accuracy": float(run_df["accuracy"].mean()), - "num_sequences": int(len(run_df)), - "effective_scored_bp": float(run_df["scored_bp"].mean()), - } - ] - - if "type" in run_df.columns: - grouped = ( - run_df.groupby("type", 
dropna=False) - .agg( - accuracy=("accuracy", "mean"), - num_sequences=("accuracy", "size"), - effective_scored_bp=("scored_bp", "mean"), - ) - .reset_index() - ) - for row in grouped.to_dict("records"): - rows.append( - { - "gen_len": gen_len, - "generation_bp": generation_bp, - "group": row["type"], - "accuracy": float(row["accuracy"]), - "num_sequences": int(row["num_sequences"]), - "effective_scored_bp": float(row["effective_scored_bp"]), - } - ) - - return rows - - -def plot_accuracy_by_group(aggregate_df: pd.DataFrame, output_path: Path) -> None: - output_path.parent.mkdir(parents=True, exist_ok=True) - fig, ax = plt.subplots(figsize=(12, 7)) - - for group in GROUP_ORDER: - group_df = aggregate_df[aggregate_df["group"] == group].sort_values("gen_len") - if group_df.empty: - continue - - x_values = group_df["generation_bp"] - style = { - "marker": "o", - "linewidth": 2.75 if group == "overall" else 1.8, - "color": "black" if group == "overall" else None, - } - ax.plot(x_values, group_df["accuracy"], label=group, **style) - - bp_lengths = sorted(aggregate_df["generation_bp"].unique()) - ax.set_xticks(bp_lengths) - ax.set_xticklabels([str(value) for value in bp_lengths]) - ax.set_xlabel("Base pair generation length") - ax.set_ylabel("Accuracy") - ax.set_ylim(0.0, 1.0) - ax.set_title("Sequence Recovery Accuracy vs Base Pair Generation Length by Type") - ax.grid(True, alpha=0.25) - ax.legend(loc="best") - - max_scored_bp = aggregate_df["effective_scored_bp"].max() - fig.text( - 0.5, - 0.01, - f"Mean scored bp is capped by the dataset label length. 
Observed max mean scored bp: {max_scored_bp:.1f}", - ha="center", - ) - fig.tight_layout(rect=(0, 0.03, 1, 1)) - fig.savefig(output_path, dpi=200) - plt.close(fig) - - -def plot_overall_accuracy(aggregate_df: pd.DataFrame, output_path: Path) -> None: - output_path.parent.mkdir(parents=True, exist_ok=True) - overall_df = aggregate_df[aggregate_df["group"] == "overall"].sort_values("gen_len") - x_values = overall_df["generation_bp"] - - fig, ax = plt.subplots(figsize=(10, 6)) - ax.plot( - x_values, - overall_df["accuracy"], - color="black", - marker="o", - linewidth=2.75, - ) - ax.set_xticks(x_values.tolist()) - ax.set_xticklabels([str(value) for value in x_values.tolist()]) - ax.set_xlabel("Base pair generation length") - ax.set_ylabel("Overall accuracy") - ax.set_ylim(0.0, 1.0) - ax.set_title("Overall Sequence Recovery Accuracy vs Base Pair Generation Length") - ax.grid(True, alpha=0.25) - fig.tight_layout() - fig.savefig(output_path, dpi=200) - plt.close(fig) - - -def main() -> None: - args = parse_args() - visible_gpu_ids = resolve_visible_gpu_ids(args.num_gpus) - - model_label = args.model_name or args.model.split("/")[-1] - benchmark_root = ( - args.output_dir - / sanitize_path_component(model_label) - / sanitize_path_component(args.data_type) - ) - benchmark_root.mkdir(parents=True, exist_ok=True) - - aggregate_rows = [] - run_manifest = [] - - for gen_len in args.gen_lens: - run_dir = benchmark_root / f"gen_len_{gen_len}" - run_dir.mkdir(parents=True, exist_ok=True) - - command = build_eval_command(args, run_dir, gen_len) - env = os.environ.copy() - env["CUDA_VISIBLE_DEVICES"] = ",".join(visible_gpu_ids) - - print( - f"\nRunning gen_len={gen_len} on GPUs {env['CUDA_VISIBLE_DEVICES']}", - flush=True, - ) - print("Command:", " ".join(command), flush=True) - - start_time = time.time() - subprocess.run(command, check=True, cwd=REPO_ROOT, env=env) - elapsed = time.time() - start_time - - parquet_path, summary_path, summary = load_run_outputs(run_dir) - run_df = 
pd.read_parquet(parquet_path) - generation_bp = int( - summary.get("requested_rollout_bp") or gen_len * args.bp_per_token - ) - aggregate_rows.extend(build_aggregate_rows(gen_len, generation_bp, run_df)) - - run_manifest.append( - { - "gen_len": gen_len, - "run_dir": str(run_dir), - "parquet_path": str(parquet_path), - "summary_path": str(summary_path), - "elapsed_seconds": elapsed, - "overall_accuracy": float(summary["overall_accuracy"]), - "requested_rollout_bp": generation_bp, - "mean_scored_bp": float(summary["mean_scored_bp"]), - "visible_gpu_count": int(summary["visible_gpu_count"]), - } - ) - - aggregate_df = pd.DataFrame(aggregate_rows) - aggregate_df["group"] = pd.Categorical( - aggregate_df["group"], - categories=GROUP_ORDER, - ordered=True, - ) - aggregate_df = aggregate_df.sort_values( - ["group", "generation_bp", "gen_len"] - ).reset_index(drop=True) - - aggregate_csv_path = benchmark_root / "accuracy_vs_gen_len.csv" - aggregate_json_path = benchmark_root / "benchmark_manifest.json" - group_plot_path = benchmark_root / "accuracy_vs_gen_len_by_type.png" - overall_plot_path = benchmark_root / "accuracy_vs_gen_len_overall.png" - - aggregate_df.to_csv(aggregate_csv_path, index=False) - plot_accuracy_by_group(aggregate_df, group_plot_path) - plot_overall_accuracy(aggregate_df, overall_plot_path) - - manifest = { - "model": args.model, - "model_name": model_label, - "revision": args.revision, - "data_type": args.data_type, - "data_path": args.data_path, - "gen_lens": args.gen_lens, - "bp_per_token": args.bp_per_token, - "accuracy_mode": "prediction_length", - "requested_gpu_count": args.num_gpus, - "cuda_visible_devices": visible_gpu_ids, - "sample_seed": args.sample_seed if args.max_samples is not None else None, - "output_root": str(benchmark_root), - "runs": run_manifest, - } - with aggregate_json_path.open("w", encoding="utf-8") as handle: - json.dump(manifest, handle, indent=2) - - print(f"\nWrote aggregate CSV to {aggregate_csv_path}") - print(f"Wrote 
benchmark manifest to {aggregate_json_path}") - print(f"Wrote grouped plot to {group_plot_path}") - print(f"Wrote overall plot to {overall_plot_path}") - - -if __name__ == "__main__": - main() diff --git a/evaluation/serving/plot_sequence_recovery_sweep.py b/evaluation/serving/plot_sequence_recovery_sweep.py deleted file mode 100644 index 2a9e0db..0000000 --- a/evaluation/serving/plot_sequence_recovery_sweep.py +++ /dev/null @@ -1,279 +0,0 @@ -"""Plot sequence-recovery accuracy vs generation length across models. - -Reads summary JSONs written by `evaluation/sequence_recovery_eval.py` in the -directory layout produced by `evaluation/submit_sequence_recovery_gen_len_sweep.sh`: - - {base_dir}/{model_name}/{data_type}/gen_len_{gen_len}/*.json - -For each model it plots overall_accuracy (and optionally per-type accuracy) as a -function of gen_len_bp = gen_len * bp_per_token (inferred from summary). - -Usage: - uv run --project evaluation python evaluation/scripts/plot_sequence_recovery_sweep.py \ - --base_dir ./eval_results/sequence_recovery_long_rollouts_pow2 \ - --data_type eukaryote \ - --model "3B hybrid=Carbon-3B-600B-dna-generv2-fp32-lmhead" \ - --model "8B hybrid=Carbon-8B-600B-dna-fp32-lmhead" \ - --model "Evo2 7B=Evo2-7B" \ - --out scratch/plots/sequence_recovery_sweep_overall.png \ - --type_panels scratch/plots/sequence_recovery_sweep_types.png -""" - -import argparse -import glob -import json -import os -from collections import defaultdict - -import matplotlib.pyplot as plt -import numpy as np - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser() - parser.add_argument( - "--base_dir", - required=True, - help="Root dir containing {model_name}/{data_type}/gen_len_*/ subdirs.", - ) - parser.add_argument( - "--data_type", - default="eukaryote", - help="Data-type split name used as a subdirectory.", - ) - parser.add_argument( - "--model", - action="append", - required=True, - dest="models", - help="Repeatable 'LABEL=MODEL_NAME' or 
'LABEL=BASE_DIR::MODEL_NAME' mapping. " - "MODEL_NAME must match the directory name under the resolved base dir. " - "When BASE_DIR is omitted, --base_dir is used.", - ) - parser.add_argument( - "--out", - required=True, - help="Output PNG path for the overall-accuracy plot.", - ) - parser.add_argument( - "--type_panels", - default=None, - help="Optional output PNG path for a per-type panel grid.", - ) - parser.add_argument( - "--random_baseline", - type=float, - default=0.25, - help="Horizontal reference line (default 0.25 = 4-base uniform).", - ) - parser.add_argument( - "--title_suffix", - default="", - help="Optional text appended to plot titles, e.g. '(n=1000 samples)'.", - ) - return parser.parse_args() - - -def load_sweep(base_dir: str, data_type: str, model_name: str): - """Return rows = list of dicts, one per gen_len, sorted by gen_len_bp.""" - pattern = os.path.join(base_dir, model_name, data_type, "gen_len_*", "*.json") - paths = glob.glob(pattern) - rows = [] - for p in paths: - with open(p) as f: - s = json.load(f) - gen_len_dir = os.path.basename(os.path.dirname(p)) - gen_len = int(gen_len_dir.removeprefix("gen_len_")) - requested_bp = int(s.get("requested_rollout_bp") or 0) - bp_per_token = int(s.get("bp_per_token") or 6) - gen_len_bp = requested_bp if requested_bp > 0 else gen_len * bp_per_token - rows.append( - { - "gen_len": gen_len, - "gen_len_bp": gen_len_bp, - "overall": float(s["overall_accuracy"]), - "label_source": s.get("label_source", "dataset"), - "type_accuracy": s.get("type_accuracy", {}), - "accuracy_mode": s.get("accuracy_mode"), - } - ) - rows.sort(key=lambda r: r["gen_len_bp"]) - return rows - - -def parse_model_specs(specs, default_base_dir): - parsed = [] - for spec in specs: - if "=" not in spec: - raise SystemExit( - f"--model must be 'LABEL=MODEL_NAME' or 'LABEL=BASE_DIR::MODEL_NAME', got: {spec}" - ) - label, rhs = spec.split("=", 1) - if "::" in rhs: - base, name = rhs.split("::", 1) - else: - base, name = default_base_dir, 
rhs - parsed.append((label.strip(), base.strip(), name.strip())) - return parsed - - -def plot_overall( - models_data, out_path: str, random_baseline: float, title_suffix: str = "" -): - fig, ax = plt.subplots(figsize=(11, 6.6), dpi=200) - colors = plt.rcParams["axes.prop_cycle"].by_key()["color"] - for (label, _), rows, color in zip( - models_data.keys(), - models_data.values(), - colors, - ): - if not rows: - continue - xs = [r["gen_len_bp"] for r in rows] - ys = [r["overall"] for r in rows] - ax.plot(xs, ys, color=color, linewidth=2, label=label, zorder=2) - for r in rows: - marker = "o" if r["label_source"] == "dataset" else "s" - ax.scatter( - [r["gen_len_bp"]], - [r["overall"]], - color=color, - marker=marker, - s=60, - zorder=3, - edgecolors="white", - linewidths=0.8, - ) - - ax.axhline( - random_baseline, - color="#666666", - linestyle="--", - linewidth=1.2, - label="Random baseline", - ) - ax.scatter([], [], color="#444444", marker="o", s=60, label="label_source=dataset") - ax.scatter( - [], [], color="#444444", marker="s", s=60, label="label_source=sequence_tail" - ) - - ax.set_xscale("log", base=2) - all_x = sorted({r["gen_len_bp"] for rows in models_data.values() for r in rows}) - if all_x: - ax.set_xticks(all_x) - ax.set_xticklabels([str(x) for x in all_x]) - ax.set_xlabel("Generation length (base pairs)") - ax.set_ylabel("Accuracy") - ax.set_ylim(0.0, 1.0) - suffix = f" {title_suffix}" if title_suffix else "" - ax.set_title(f"Long-rollout sweep: Overall accuracy{suffix}") - ax.grid(True, alpha=0.3) - ax.legend(loc="upper right", framealpha=0.95) - fig.tight_layout() - os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True) - fig.savefig(out_path) - print(f"Saved overall plot to {out_path}") - plt.close(fig) - - -def plot_type_panels( - models_data, out_path: str, random_baseline: float, title_suffix: str = "" -): - type_names = sorted( - { - t - for rows in models_data.values() - for r in rows - for t in r["type_accuracy"].keys() - } - ) - if not 
type_names: - print("No per-type accuracy available; skipping type panels") - return - - n = len(type_names) - cols = 3 - rows_n = (n + cols - 1) // cols - fig, axes = plt.subplots(rows_n, cols, figsize=(cols * 4.5, rows_n * 3.2), dpi=200) - axes = np.array(axes).reshape(-1) - colors = plt.rcParams["axes.prop_cycle"].by_key()["color"] - - for ax, tname in zip(axes, type_names): - for (label, _), rows, color in zip( - models_data.keys(), - models_data.values(), - colors, - ): - xs = [r["gen_len_bp"] for r in rows if tname in r["type_accuracy"]] - ys = [ - r["type_accuracy"][tname] for r in rows if tname in r["type_accuracy"] - ] - if not xs: - continue - ax.plot(xs, ys, color=color, linewidth=1.8, label=label) - for r in rows: - if tname not in r["type_accuracy"]: - continue - marker = "o" if r["label_source"] == "dataset" else "s" - ax.scatter( - [r["gen_len_bp"]], - [r["type_accuracy"][tname]], - color=color, - marker=marker, - s=40, - edgecolors="white", - linewidths=0.6, - ) - - ax.axhline(random_baseline, color="#666666", linestyle="--", linewidth=1.0) - ax.set_xscale("log", base=2) - ax.set_title(tname) - ax.set_ylim(0.0, 1.0) - ax.grid(True, alpha=0.3) - - for extra in axes[n:]: - extra.set_visible(False) - - handles, labels = axes[0].get_legend_handles_labels() - if handles: - fig.legend( - handles, - labels, - loc="lower center", - ncol=len(labels), - bbox_to_anchor=(0.5, -0.01), - ) - suffix = f" {title_suffix}" if title_suffix else "" - fig.suptitle(f"Long-rollout sweep: Per-type accuracy{suffix}", y=1.00) - fig.tight_layout() - os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True) - fig.savefig(out_path, bbox_inches="tight") - print(f"Saved type-panel plot to {out_path}") - plt.close(fig) - - -def main(): - args = parse_args() - model_specs = parse_model_specs(args.models, args.base_dir) - - models_data = {} - for label, base, name in model_specs: - rows = load_sweep(base, args.data_type, name) - models_data[(label, name)] = rows - print(f" 
[{label}] ({base}/{name}): {len(rows)} gen_len points") - - if not any(models_data.values()): - raise SystemExit( - f"No summary JSONs found under {args.base_dir}. " - f"Check --base_dir / --model names / --data_type." - ) - - plot_overall(models_data, args.out, args.random_baseline, args.title_suffix) - if args.type_panels: - plot_type_panels( - models_data, args.type_panels, args.random_baseline, args.title_suffix - ) - - -if __name__ == "__main__": - main()