diff --git a/.gitignore b/.gitignore index 2797057..920b43e 100644 --- a/.gitignore +++ b/.gitignore @@ -131,3 +131,4 @@ job_sub/datasets/**/*_local.* # Local-only analysis / figure-generation scripts (notebooks, sensitivity-analysis # helpers, etc.). Anything dropped under this folder stays out of git. job_sub/utils/analysis/ +utils/ diff --git a/job_sub/conf/config.yaml b/job_sub/conf/config.yaml index 6bf5800..9a7d157 100755 --- a/job_sub/conf/config.yaml +++ b/job_sub/conf/config.yaml @@ -22,7 +22,7 @@ subset_ids_path: ${dataset_field:${datasets_file},${dataset_index},subset_ids_pa results_root_dir: /storage2/wangzitongLab/share/deepdraw_opt/jerry # number of random seeds to run for each active learning experiment -num_seeds_per_job: 30 +num_seeds_per_job: 20 seed_start: 0 parallelize_seeds: true @@ -30,7 +30,7 @@ parallelize_seeds: true al_settings: batch_size: 12 starting_batch_size: 12 - max_rounds: 29 # not including the initial selection + max_rounds: 19 # not including the initial selection feature_transforms: ${feature_transforms} target_transforms: ${target_transforms} output_dir: ${hydra:runtime.output_dir} @@ -47,14 +47,14 @@ hydra: subdir: ${dataset_name}/${override_values:${hydra.job.override_dirname},dataset_index|single_array_across_datasets|al_settings.seed,default} sweeper: # multirun mode sweeps over these parameters params: - initial_selection_strategy: probcover_euclidean, core_set, random - embedding_model: gLM2_166k_kneedle - query_strategy: botorch_qlog_nei, topk, botorch_q_ucb + initial_selection_strategy: kmedoids + embedding_model: 166k_alphagenome_1bp_embeddingkneedle + query_strategy: botorch_mes predictor: botorch_gp launcher: - timeout_min: 720 + timeout_min: 1440 # everything below is used only for submitting jobs to the cluster - partition: intel-sc3,wzt_20250411 # no spacing between names + partition: intel-sc3-32c,amd-ep5 # no spacing between names cpus_per_task: 1 qos: huge mem_per_cpu: 30GB diff --git a/utils/__init__.py b/utils/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/utils/add_n_top.py b/utils/add_n_top.py deleted file mode 100644 index c018e83..0000000 --- a/utils/add_n_top.py +++ /dev/null @@ -1,377 +0,0 @@ -#!/usr/bin/env python3 -""" -Add n_top to results.csv files by recomputing top_p counts. - -Example: - python utils/add_n_top.py job_sub/multirun/2025-12-19 - python utils/add_n_top.py job_sub/multirun/2025-12-19 --top-p 0.01 --column-name n_top_1e2 --overwrite -""" - -from __future__ import annotations - -import argparse -import ast -import csv -import json -from pathlib import Path -from typing import Any - -import numpy as np -import pandas as pd -import yaml -from tqdm import tqdm - -DEFAULT_LABEL_KEY = "Fold Change (Induced/Basal)" -DEFAULT_DATASETS_YAML = ( - Path(__file__).resolve().parents[1] / "job_sub" / "datasets" / "datasets.yaml" -) - - -def _parse_selected_ids(value: Any) -> list[Any]: - if value is None: - return [] - if isinstance(value, list): - return value - text = str(value).strip() - if not text: - return [] - try: - parsed = ast.literal_eval(text) - except (ValueError, SyntaxError): - return [item for item in text.split(",") if item.strip()] - if isinstance(parsed, list): - return parsed - return [parsed] - - -def _normalize_id(value: Any) -> str: - if isinstance(value, int | np.integer): - return str(int(value)) - text = str(value).strip() - if not text: - return "" - try: - return str(int(text)) - except ValueError: - try: - return str(int(float(text))) - except ValueError: - return text - - -def _load_yaml(path: Path) -> dict[str, Any]: - if not path.exists(): - return {} - return yaml.safe_load(path.read_text()) or {} - - -def _resolve_path(raw: Any, base_dir: Path) -> Path | None: - if raw in (None, "", "null"): - return None - path = Path(str(raw)).expanduser() - if path.is_absolute(): - return path - return (base_dir / path).resolve() - - -def _load_subset_ids(path: Path) -> np.ndarray: - subset_ids = [] - for line in path.read_text().splitlines(): - text = line.strip() - if not text: - continue - try: - subset_ids.append(int(text)) - except ValueError as exc: - raise ValueError( - f"Invalid sample id '{text}' in subset file {path}" - ) from exc - if not subset_ids: - raise ValueError(f"Subset ids file {path} did not contain any sample ids.") - return np.asarray(subset_ids, dtype=np.int64) - - -def _load_sample_ids(embeddings_path: Path) -> np.ndarray: - data = np.load(embeddings_path, allow_pickle=True) - if "ids" not in data: - raise ValueError( - f"'ids' array not found in {embeddings_path}. Available keys: {list(data.keys())}" - ) - return data["ids"].astype(np.int64) - - -def _load_labels( - metadata_path: Path, label_key: str, sample_ids: np.ndarray -) -> np.ndarray: - df = pd.read_csv(metadata_path, usecols=[label_key]) - df = df.iloc[sample_ids] - return df[label_key].to_numpy() - - -def _compute_top_id_set( - embeddings_path: Path, - metadata_path: Path, - label_key: str, - subset_ids_path: Path | None, - top_p: float, -) -> set[str]: - sample_ids = _load_sample_ids(embeddings_path) - if subset_ids_path is not None: - subset_ids = _load_subset_ids(subset_ids_path) - mask = np.isin(sample_ids, subset_ids) - if not np.any(mask): - raise ValueError( - "Subset id filtering removed all samples. " - "Ensure the subset ids match those stored in the embeddings file." - ) - sample_ids = sample_ids[mask] - - labels = _load_labels(metadata_path, label_key, sample_ids) - sorted_indices = np.argsort(labels) - num_top = max(1, int(len(labels) * top_p)) - top_indices = sorted_indices[-num_top:] - return {_normalize_id(item) for item in sample_ids[top_indices]} - - -def _load_summary(path: Path) -> dict[str, Any]: - if not path.exists(): - raise FileNotFoundError(f"summary.json not found: {path}") - return json.loads(path.read_text()) - - -def _extract_override_value(overrides: list[Any], key: str) -> str | None: - for entry in overrides: - text = str(entry).strip() - if not text: - continue - # Strip + prefix used by Hydra for adding new keys - if text.startswith("+"): - text = text[1:].strip() - if "=" in text: - candidate_key, value = text.split("=", 1) - if candidate_key.strip() == key: - return value.strip() - if text.startswith(f"{key}:"): - return text.split(":", 1)[1].strip() - return None - - -def _resolve_embedding_model(summary: dict[str, Any]) -> str | None: - model = str(summary.get("embedding_model", "")).strip() - if model and model.lower() != "none": - return model - overrides = summary.get("hydra_overrides") or [] - override_value = _extract_override_value(overrides, "embedding_model") - if override_value: - return override_value - return None - - -def _load_dataset_map(datasets_yaml_path: Path) -> dict[str, dict[str, Path | None]]: - if not datasets_yaml_path.exists(): - raise FileNotFoundError(f"Datasets YAML not found: {datasets_yaml_path}") - payload = _load_yaml(datasets_yaml_path) - datasets = payload.get("datasets") or [] - if not datasets: - raise ValueError(f"No datasets found in {datasets_yaml_path}") - - base_dir = datasets_yaml_path.parent - dataset_map: dict[str, dict[str, Path | None]] = {} - for entry in datasets: - name = str(entry.get("name", "")).strip() - if not name: - raise ValueError(f"Dataset entry missing name in {datasets_yaml_path}") - metadata_raw = str(entry.get("metadata_path", "")).strip() - if not metadata_raw: - raise ValueError( - f"Dataset '{name}' missing metadata_path in {datasets_yaml_path}" - ) - embedding_raw = str(entry.get("embedding_dir", "")).strip() - if not embedding_raw: - raise ValueError( - f"Dataset '{name}' missing embedding_dir in {datasets_yaml_path}" - ) - subset_raw = entry.get("subset_ids_path") - - dataset_map[name] = { - "metadata_path": _resolve_path(metadata_raw, base_dir), - "embedding_dir": _resolve_path(embedding_raw, base_dir), - "subset_ids_path": _resolve_path(subset_raw, base_dir) - if subset_raw - else None, - } - - return dataset_map - - -def _load_rows(path: Path) -> tuple[list[dict[str, Any]], list[str]]: - with path.open(newline="") as handle: - reader = csv.DictReader(handle) - rows = [dict(row) for row in reader] - fieldnames = list(reader.fieldnames or []) - return rows, fieldnames - - -def _write_rows(path: Path, rows: list[dict[str, Any]], fieldnames: list[str]) -> None: - tmp_path = path.with_suffix(".tmp") - with tmp_path.open("w", newline="") as handle: - writer = csv.DictWriter(handle, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(rows) - tmp_path.replace(path) - - -def _update_results_csv( - results_path: Path, - top_ids: set[str], - column_name: str, - overwrite: bool, -) -> bool: - rows, fieldnames = _load_rows(results_path) - if not rows: - return False - if "selected_sample_ids" not in fieldnames: - raise ValueError(f"'selected_sample_ids' column missing in {results_path}") - if column_name in fieldnames and not overwrite: - return False - - for row in rows: - selected_ids = _parse_selected_ids(row.get("selected_sample_ids")) - normalized = {_normalize_id(item) for item in selected_ids} - row[column_name] = str(sum(1 for item in normalized if item in top_ids)) - - if column_name not in fieldnames: - fieldnames.append(column_name) - _write_rows(results_path, rows, fieldnames) - return True - - -def _iter_results(root: Path) -> list[Path]: - return [path for path in root.rglob("results.csv") if path.is_file()] - - -def main() -> int: - parser = argparse.ArgumentParser( - description="Add n_top to results.csv files by recomputing top_p." - ) - parser.add_argument( - "root_dir", - type=Path, - help="Directory containing results.csv files (e.g. job_sub/multirun/2025-12-19).", - ) - parser.add_argument( - "--datasets-yaml", - type=Path, - default=DEFAULT_DATASETS_YAML, - help="Path to datasets.yaml (default: job_sub/datasets/datasets.yaml).", - ) - parser.add_argument( - "--top-p", - type=float, - default=0.01, - help="Top percentage used to recompute n_top (default: 0.01).", - ) - parser.add_argument( - "--column-name", - type=str, - default="n_top_1e2", - help="Column name to write (default: n_top_1e2).", - ) - parser.add_argument( - "--overwrite", - action="store_true", - help="Overwrite the column if it already exists.", - ) - parser.add_argument( - "--label-key", - type=str, - default=DEFAULT_LABEL_KEY, - help="Label column in the metadata CSV.", - ) - args = parser.parse_args() - - if not 0.0 < args.top_p <= 1.0: - raise SystemExit("top_p must be between 0 and 1.") - - root_dir = args.root_dir - if not root_dir.exists(): - raise SystemExit(f"Root dir not found: {root_dir}") - - dataset_map = _load_dataset_map(args.datasets_yaml) - - results_paths = _iter_results(root_dir) - if not results_paths: - raise SystemExit(f"No results.csv found under {root_dir}") - - cache: dict[tuple[Path, Path, Path | None, str, float], set[str]] = {} - updated = 0 - skipped = 0 - for results_path in tqdm(results_paths, desc="Processing runs", unit="run"): - run_dir = results_path.parent - try: - summary = _load_summary(run_dir / "summary.json") - dataset_name = str(summary.get("dataset_name", "")).strip() - if not dataset_name: - raise ValueError("dataset_name missing in summary.json") - dataset_spec = dataset_map.get(dataset_name) - if dataset_spec is None: - raise ValueError( - f"Dataset '{dataset_name}' not found in {args.datasets_yaml}" - ) - embedding_model = _resolve_embedding_model(summary) - if not embedding_model: - raise ValueError( - f"embedding_model missing for dataset '{dataset_name}'" - ) - embedding_dir = dataset_spec.get("embedding_dir") - if embedding_dir is None: - raise ValueError(f"embedding_dir missing for dataset '{dataset_name}'") - embedding_file = ( - embedding_model - if embedding_model.endswith(".npz") - else f"{embedding_model}.npz" - ) - embeddings_path = Path(embedding_dir) / embedding_file - metadata_path = dataset_spec.get("metadata_path") - if metadata_path is None: - raise ValueError(f"metadata_path missing for dataset '{dataset_name}'") - subset_ids_path = dataset_spec.get("subset_ids_path") - - cache_key = ( - embeddings_path, - Path(metadata_path), - Path(subset_ids_path) if subset_ids_path else None, - args.label_key, - args.top_p, - ) - top_ids = cache.get(cache_key) - if top_ids is None: - top_ids = _compute_top_id_set( - embeddings_path=embeddings_path, - metadata_path=metadata_path, - label_key=args.label_key, - subset_ids_path=subset_ids_path, - top_p=args.top_p, - ) - cache[cache_key] = top_ids - - changed = _update_results_csv( - results_path=results_path, - top_ids=top_ids, - column_name=args.column_name, - overwrite=args.overwrite, - ) - if changed: - updated += 1 - else: - skipped += 1 - except Exception as exc: - tqdm.write(f"Skipping {results_path}: {exc}") - skipped += 1 - - tqdm.write(f"Updated {updated} runs, skipped {skipped} runs.") - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/utils/baseline_scores.py b/utils/baseline_scores.py deleted file mode 100644 index 911d71e..0000000 --- a/utils/baseline_scores.py +++ /dev/null @@ -1,340 +0,0 @@ -""" -Compute random baseline summary metrics across datasets, reported per round. - -Example: - python utils/baseline_scores.py \\ - --datasets-yaml job_sub/datasets/datasets.yaml \\ - --output-csv results/baseline_scores.csv \\ - --num-experiments 1000 \\ - --num-rounds 10 \\ - --num-samples-per-round 12 -""" - -from __future__ import annotations - -import argparse -from dataclasses import dataclass -from pathlib import Path - -import numpy as np -import pandas as pd -import yaml -from tqdm import tqdm - -DEFAULT_DATASETS_YAML = ( - Path(__file__).resolve().parents[1] / "job_sub" / "datasets" / "datasets.yaml" -) -DEFAULT_LABEL_KEY = "Fold Change (Induced/Basal)" -DEFAULT_OUTPUT_CSV = ( - Path(__file__).resolve().parents[1] / "results" / "baseline_scores.csv" -) - - -@dataclass(frozen=True) -class DatasetSpec: - name: str - metadata_path: Path - subset_ids_path: Path | None = None - - -def load_dataset_specs(dataset_yaml_path: Path) -> list[DatasetSpec]: - if not dataset_yaml_path.exists(): - raise FileNotFoundError(f"Dataset YAML not found: {dataset_yaml_path}") - - with dataset_yaml_path.open("r") as handle: - payload = yaml.safe_load(handle) or {} - - datasets = payload.get("datasets") or [] - if not datasets: - raise ValueError(f"No datasets found in {dataset_yaml_path}") - - specs: list[DatasetSpec] = [] - for entry in datasets: - name = str(entry.get("name", "")).strip() - if not name: - raise ValueError(f"Dataset entry missing name in {dataset_yaml_path}") - - metadata_raw = str(entry.get("metadata_path", "")).strip() - if not metadata_raw: - raise ValueError( - f"Dataset '{name}' missing metadata_path in {dataset_yaml_path}" - ) - metadata_path = Path(metadata_raw).expanduser() - if not metadata_path.is_absolute(): - metadata_path = (dataset_yaml_path.parent / metadata_path).resolve() - - subset_raw = entry.get("subset_ids_path") - subset_ids_path = None - if subset_raw: - subset_ids_path = Path(str(subset_raw)).expanduser() - if not subset_ids_path.is_absolute(): - subset_ids_path = (dataset_yaml_path.parent / subset_ids_path).resolve() - - specs.append( - DatasetSpec( - name=name, - metadata_path=metadata_path, - subset_ids_path=subset_ids_path, - ) - ) - - return specs - - -def load_subset_ids(subset_ids_path: Path) -> np.ndarray: - subset_ids = [] - for line in subset_ids_path.read_text().splitlines(): - text = line.strip() - if not text: - continue - try: - subset_ids.append(int(text)) - except ValueError as exc: - raise ValueError( - f"Invalid sample id '{text}' in subset file {subset_ids_path}" - ) from exc - if not subset_ids: - raise ValueError(f"Subset ids file {subset_ids_path} did not contain any ids.") - return np.asarray(subset_ids, dtype=np.int64) - - -def load_label_array( - metadata_path: Path, - label_key: str, - label_cache: dict[tuple[Path, str], np.ndarray], -) -> np.ndarray: - cache_key = (metadata_path, label_key) - if cache_key in label_cache: - return label_cache[cache_key] - - try: - df = pd.read_csv(metadata_path, usecols=[label_key]) - except ValueError as exc: - raise ValueError( - f"Label key '{label_key}' not found in {metadata_path}" - ) from exc - - series = pd.to_numeric(df[label_key], errors="coerce") - label_cache[cache_key] = series.to_numpy() - return label_cache[cache_key] - - -def load_labels( - dataset: DatasetSpec, - label_key: str, - label_cache: dict[tuple[Path, str], np.ndarray], - subset_cache: dict[Path, np.ndarray], -) -> tuple[np.ndarray, np.ndarray]: - labels = load_label_array(dataset.metadata_path, label_key, label_cache) - sample_ids = np.arange(len(labels), dtype=np.int64) - - if dataset.subset_ids_path is not None: - subset_ids_path = dataset.subset_ids_path - if subset_ids_path not in subset_cache: - subset_cache[subset_ids_path] = load_subset_ids(subset_ids_path) - subset_ids = subset_cache[subset_ids_path] - if np.any(subset_ids < 0) or subset_ids.max() >= len(labels): - raise ValueError( - f"Subset ids in {subset_ids_path} are out of bounds for " - f"{dataset.metadata_path} (len={len(labels)})" - ) - labels = labels[subset_ids] - sample_ids = subset_ids - - finite_mask = np.isfinite(labels) - labels = labels[finite_mask] - sample_ids = sample_ids[finite_mask] - if labels.size == 0: - raise ValueError( - f"No finite labels found for dataset '{dataset.name}' after filtering." - ) - return labels, sample_ids - - -def build_top_mask(labels: np.ndarray, top_p: float) -> np.ndarray: - num_top = max(1, int(len(labels) * top_p)) - if num_top >= len(labels): - return np.ones(len(labels), dtype=bool) - top_indices = np.argsort(labels)[-num_top:] - top_mask = np.zeros(len(labels), dtype=bool) - top_mask[top_indices] = True - return top_mask - - -def draw_random_rounds( - num_samples: int, - num_rounds: int, - num_samples_per_round: int, - rng: np.random.Generator, -) -> np.ndarray: - if num_rounds <= 0 or num_samples_per_round <= 0: - raise ValueError("num_rounds and num_samples_per_round must be > 0.") - total_samples = num_rounds * num_samples_per_round - if total_samples > num_samples: - raise ValueError( - "Cannot sample without replacement: requested samples exceed dataset size." - ) - selections = rng.choice(num_samples, size=total_samples, replace=False) - return selections.reshape(num_rounds, num_samples_per_round) - - -def compute_random_summary_metrics_history( - labels: np.ndarray, - top_mask: np.ndarray, - max_label: float, - num_rounds: int, - num_samples_per_round: int, - seed: int, -) -> list[dict[str, float]]: - rng = np.random.default_rng(seed) - rounds = draw_random_rounds( - num_samples=len(labels), - num_rounds=num_rounds, - num_samples_per_round=num_samples_per_round, - rng=rng, - ) - round_labels = labels[rounds] - normalized_true = round_labels.max(axis=1) / max_label - n_top = top_mask[rounds].sum(axis=1).astype(np.float64) - cumulative_max = np.maximum.accumulate(normalized_true) - cumulative_max_sum = np.cumsum(cumulative_max) - cumulative_n_top = np.cumsum(n_top) - cumulative_n_top_sum = np.cumsum(cumulative_n_top) - selected_per_round = np.full(num_rounds, num_samples_per_round, dtype=np.float64) - cumulative_selected = np.cumsum(selected_per_round) - cumulative_selected_sum = np.cumsum(cumulative_selected) - top_hits = np.where(n_top >= 1)[0] - if top_hits.size: - first_hit = float(top_hits[0] + 1) - rounds_to_top_history = np.full(num_rounds, first_hit, dtype=np.float64) - rounds_to_top_history[: int(first_hit) - 1] = np.nan - else: - rounds_to_top_history = np.full(num_rounds, np.nan, dtype=np.float64) - - history: list[dict[str, float]] = [] - for idx in range(num_rounds): - prefix_len = idx + 1 - denom = float(cumulative_selected_sum[idx]) - history.append( - { - "round": idx, - "auc_true": float(cumulative_max_sum[idx] / prefix_len), - "avg_top": float(cumulative_n_top_sum[idx] / denom) if denom else 0.0, - "rounds_to_top": float(rounds_to_top_history[idx]), - "overall_true": float(cumulative_max[idx]), - "max_train_spearman": float("nan"), - "max_extreme_value_auc": float("nan"), - } - ) - return history - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description="Compute random baseline metrics for each dataset in a YAML file." - ) - parser.add_argument( - "--datasets-yaml", - default=str(DEFAULT_DATASETS_YAML), - help="Path to the datasets YAML file.", - ) - parser.add_argument( - "--output-csv", - default=str(DEFAULT_OUTPUT_CSV), - help="Path to save aggregated results as CSV.", - ) - parser.add_argument( - "--label-key", - default=DEFAULT_LABEL_KEY, - help="Column name in the metadata CSV containing target labels.", - ) - parser.add_argument("--num-experiments", type=int, default=10000) - parser.add_argument("--num-rounds", type=int, default=10) - parser.add_argument("--num-samples-per-round", type=int, default=12) - parser.add_argument( - "--top-p", - type=float, - default=0.01, - help="Top percentage used for avg_top (matches active learning defaults).", - ) - parser.add_argument( - "--dataset", - action="append", - default=[], - help="Dataset name to include (can be repeated).", - ) - return parser.parse_args() - - -def main() -> None: - args = parse_args() - dataset_yaml_path = Path(args.datasets_yaml).expanduser() - output_csv = Path(args.output_csv).expanduser() - - if args.num_experiments <= 0: - raise ValueError("num_experiments must be > 0.") - if not 0.0 < args.top_p <= 1.0: - raise ValueError("top_p must be between 0 and 1.") - - datasets = load_dataset_specs(dataset_yaml_path) - if args.dataset: - requested = set(args.dataset) - datasets = [dataset for dataset in datasets if dataset.name in requested] - missing = requested - {dataset.name for dataset in datasets} - if missing: - raise ValueError( - f"Requested datasets not found in {dataset_yaml_path}: {sorted(missing)}" - ) - - label_cache: dict[tuple[Path, str], np.ndarray] = {} - subset_cache: dict[Path, np.ndarray] = {} - experiment_seeds = np.arange(args.num_experiments, dtype=np.int64) - - random_states = [] - for dataset in tqdm(datasets): - labels, _ = load_labels(dataset, args.label_key, label_cache, subset_cache) - dataset_max_label = float(np.max(labels)) - top_mask = build_top_mask(labels, args.top_p) - for seed_value in experiment_seeds: - summary_metrics_history = compute_random_summary_metrics_history( - labels=labels, - top_mask=top_mask, - max_label=dataset_max_label, - num_rounds=args.num_rounds, - num_samples_per_round=args.num_samples_per_round, - seed=int(seed_value), - ) - for summary_metrics in summary_metrics_history: - random_states.append( - { - "dataset_name": dataset.name, - "query_strategy": "RANDOM", - "predictor": "NONE", - "initial_selection": "RANDOM", - "embedding_model": "NONE", - "feature_transforms": "NONE", - "target_transforms": "NONE", - "seed": int(seed_value), - "round": summary_metrics["round"], - "overall_true": summary_metrics["overall_true"], - "auc_true": summary_metrics["auc_true"], - "avg_top": summary_metrics["avg_top"], - "rounds_to_top": summary_metrics["rounds_to_top"], - "max_train_spearman": summary_metrics["max_train_spearman"], - "max_extreme_value_auc": summary_metrics[ - "max_extreme_value_auc" - ], - "dataset_max_label": dataset_max_label, - } - ) - - df = pd.DataFrame(random_states) - if df.empty: - raise ValueError("No datasets selected; nothing to write.") - - output_csv.parent.mkdir(parents=True, exist_ok=True) - df.to_csv(output_csv, index=False) - - -if __name__ == "__main__": - main() diff --git a/utils/cancel_long_jobs.sh b/utils/cancel_long_jobs.sh deleted file mode 100644 index 514ee0d..0000000 --- a/utils/cancel_long_jobs.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -interval_seconds=300 -threshold=10 - -if ! command -v squeue >/dev/null 2>&1; then - echo "squeue not found in PATH" >&2 - exit 1 -fi - -if ! command -v scancel >/dev/null 2>&1; then - echo "scancel not found in PATH" >&2 - exit 1 -fi - -while true; do - array_job_ids=() - while IFS= read -r job_id; do - [[ "$job_id" == *"_"* ]] || continue - base_id="${job_id%%_*}" - array_job_ids+=("$base_id") - done < <(squeue -u "${USER}" -h -o "%i") - - if ((${#array_job_ids[@]} > 0)); then - mapfile -t unique_base_ids < <(printf "%s\n" "${array_job_ids[@]}" | sort -u) - for base_id in "${unique_base_ids[@]}"; do - task_count=$(squeue -u "${USER}" -h -o "%i" | awk -v base="${base_id}_" '$1 ~ "^"base {count++} END {print count+0}') - echo "Array job ${base_id}: remaining tasks = ${task_count}" - if ((task_count > 0 && task_count < threshold)); then - echo "Canceling array job ${base_id} (remaining tasks: ${task_count})" - scancel "${base_id}" - fi - done - else - echo "No array jobs found." - fi - - sleep "${interval_seconds}" -done diff --git a/utils/concat_embedding.py b/utils/concat_embedding.py deleted file mode 100644 index d78b816..0000000 --- a/utils/concat_embedding.py +++ /dev/null @@ -1,149 +0,0 @@ -#!/usr/bin/env python3 -""" -Concatenate two embedding NPZ files by matching sample ids. - -By default, embeddings are concatenated as-is. Optional L2 normalization can be -enabled before concatenation, and optional PCA can be applied after -concatenation to reach a target explained variance ratio. - -Usage examples: - python utils/concat_embedding.py a.npz b.npz out.npz - python utils/concat_embedding.py a.npz b.npz out.npz --normalize - python utils/concat_embedding.py a.npz b.npz out.npz --normalize --pca-var 0.95 -""" - -from __future__ import annotations - -import argparse -from pathlib import Path - -import numpy as np - - -def _load_npz(path: Path) -> tuple[np.ndarray, np.ndarray]: - data = np.load(path, allow_pickle=True) - if "embeddings" not in data or "ids" not in data: - raise ValueError( - f"{path} must contain 'embeddings' and 'ids' arrays. " - f"Found keys: {list(data.keys())}" - ) - embeddings = np.asarray(data["embeddings"]) - ids = np.asarray(data["ids"]) - if ids.ndim != 1: - raise ValueError(f"{path} ids must be 1D, got shape {ids.shape}") - if embeddings.shape[0] != ids.shape[0]: - raise ValueError( - f"{path} embeddings/ids length mismatch: " - f"{embeddings.shape[0]} vs {ids.shape[0]}" - ) - return embeddings, ids - - -def _ensure_unique(ids: np.ndarray, label: str) -> None: - unique_count = np.unique(ids).size - if unique_count != ids.size: - raise ValueError(f"{label} ids contain duplicates ({ids.size - unique_count}).") - - -def _l2_normalize(embeddings: np.ndarray, eps: float = 1e-12) -> np.ndarray: - norms = np.linalg.norm(embeddings, axis=1, keepdims=True) - return embeddings / np.maximum(norms, eps) - - -def _apply_pca_variance( - embeddings: np.ndarray, target_variance: float -) -> tuple[np.ndarray, int]: - if not 0.0 < target_variance <= 1.0: - raise ValueError("target_variance must be in (0, 1].") - mean = np.mean(embeddings, axis=0, keepdims=True) - centered = embeddings - mean - _, s, vt = np.linalg.svd(centered, full_matrices=False) - if s.size == 0: - return centered, 0 - var = (s**2) / max(embeddings.shape[0] - 1, 1) - total_var = float(np.sum(var)) - if total_var <= 0: - return centered, 1 - explained_ratio = var / total_var - cumulative = np.cumsum(explained_ratio) - n_components = int(np.searchsorted(cumulative, target_variance) + 1) - components = vt[:n_components] - return centered @ components.T, n_components - - -def concat_embeddings( - path_a: Path, - path_b: Path, - output_path: Path, - normalize: bool = False, - pca_variance: float | None = None, -) -> None: - emb_a, ids_a = _load_npz(path_a) - emb_b, ids_b = _load_npz(path_b) - _ensure_unique(ids_a, f"{path_a}") - _ensure_unique(ids_b, f"{path_b}") - - ids_b_set = set(ids_b.tolist()) - mask_a = np.isin(ids_a, ids_b) - ids_common = ids_a[mask_a] - if ids_common.size == 0: - raise ValueError("No overlapping ids found between the two files.") - - index_b = {int(id_val): idx for idx, id_val in enumerate(ids_b)} - idx_a = np.nonzero(mask_a)[0] - idx_b = np.array([index_b[int(id_val)] for id_val in ids_common], dtype=int) - - emb_a_sel = emb_a[idx_a] - emb_b_sel = emb_b[idx_b] - if normalize: - emb_a_sel = _l2_normalize(emb_a_sel) - emb_b_sel = _l2_normalize(emb_b_sel) - emb_concat = np.concatenate([emb_a_sel, emb_b_sel], axis=1) - if pca_variance is not None: - emb_concat, n_components = _apply_pca_variance(emb_concat, pca_variance) - - output_path.parent.mkdir(parents=True, exist_ok=True) - np.savez(output_path, embeddings=emb_concat, ids=ids_common) - - print(f"Saved concatenated embeddings to {output_path}") - print(f"File A: {path_a} ({ids_a.size} ids, {emb_a.shape[1]} dims)") - print(f"File B: {path_b} ({ids_b.size} ids, {emb_b.shape[1]} dims)") - print(f"Overlap: {ids_common.size} ids, output shape: {emb_concat.shape}") - if pca_variance is not None: - print(f"PCA retained {n_components} components for {pca_variance:.2f} variance") - - -def _parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description="Concatenate two embedding NPZ files by matching ids." - ) - parser.add_argument("embedding_a", type=Path, help="Path to first NPZ file.") - parser.add_argument("embedding_b", type=Path, help="Path to second NPZ file.") - parser.add_argument("output", type=Path, help="Path for output NPZ file.") - parser.add_argument( - "--normalize", - action="store_true", - help="Enable L2 normalization before concatenation.", - ) - parser.add_argument( - "--pca-var", - type=float, - default=None, - help="Target explained variance ratio for PCA (e.g., 0.95).", - ) - return parser.parse_args() - - -def main() -> None: - args = _parse_args() - concat_embeddings( - args.embedding_a, - args.embedding_b, - args.output, - normalize=args.normalize, - pca_variance=args.pca_var, - ) - - -if __name__ == "__main__": - main() diff --git a/utils/generate_partial_summaries.py b/utils/generate_partial_summaries.py deleted file mode 100644 index 8b099a7..0000000 --- a/utils/generate_partial_summaries.py +++ /dev/null @@ -1,226 +0,0 @@ -#!/usr/bin/env python3 -""" -Generate summary_n.json files for partial runs based on results.csv. - -Example: - python utils/generate_partial_summaries.py job_sub/multirun/2026-01-01 - python utils/generate_partial_summaries.py job_sub/multirun/2026-01-01 --n 3,5 --overwrite -""" - -from __future__ import annotations - -import argparse -import ast -import csv -import json -import os -from collections.abc import Iterable -from pathlib import Path -from typing import Any - -import numpy as np -from tqdm import tqdm - -SUMMARY_METRIC_RULES = { - "auc_true": ("max_accumulate", "normalized_true"), - "avg_top": ("top_mean", "n_top"), - "rounds_to_top": ("rounds_to_top", "n_top"), - "overall_true": ("max_overall", "normalized_true"), - "max_train_spearman": ("max_overall", "train_spearman"), - "max_extreme_value_auc": ("max_overall", "extreme_value_auc"), -} - - -def _parse_float(value: Any) -> float: - if value is None: - return float("nan") - if isinstance(value, int | float): - return float(value) - text = str(value).strip() - if not text: - return float("nan") - try: - return float(text) - except ValueError: - return float("nan") - - -def _parse_selected_ids(value: Any) -> list[Any]: - if value is None: - return [] - if isinstance(value, list): - return value - text = str(value).strip() - if not text: - return [] - try: - parsed = ast.literal_eval(text) - except (ValueError, SyntaxError): - return [item for item in text.split(",") if item.strip()] - if isinstance(parsed, list): - return parsed - return [parsed] - - -def _load_rows(path: Path) -> list[dict[str, Any]]: - with path.open(newline="") as handle: - reader = csv.DictReader(handle) - rows = [dict(row) for row in reader] - for row in rows: - try: - row["_round"] = int(row.get("round", 0)) - except (TypeError, ValueError): - row["_round"] = 0 - return sorted(rows, key=lambda r: r["_round"]) - - -def _compute_summary(rows: list[dict[str, Any]]) -> dict[str, float]: - if not rows: - return {name: float("nan") for name in SUMMARY_METRIC_RULES} - - columns = set(rows[0].keys()) - n_top_col = None - if "n_top" in columns: - n_top_col = "n_top" - elif "n_selected_in_top" in columns: - n_top_col = "n_selected_in_top" - - selected_counts = [ - len(_parse_selected_ids(row.get("selected_sample_ids"))) for row in rows - ] - cumulative_selected = int(np.sum(np.cumsum(selected_counts))) - - summary: dict[str, float] = {} - for metric_name, (rule, metric_column) in SUMMARY_METRIC_RULES.items(): - column = metric_column - if metric_column == "n_top" and n_top_col is not None: - column = n_top_col - if column not in columns: - summary[metric_name] = float("nan") - continue - - values = np.array([_parse_float(row.get(column)) for row in rows], dtype=float) - if rule == "top_mean": - if cumulative_selected <= 0: - summary[metric_name] = 0.0 - else: - cumulative_sum = np.cumsum(values) - summary[metric_name] = ( - float(np.sum(cumulative_sum)) / cumulative_selected - ) - elif rule == "mean": - summary[metric_name] = float(np.nanmean(values)) - elif rule == "max_accumulate": - cumulative_max = np.maximum.accumulate(values) - summary[metric_name] = float(np.sum(cumulative_max)) / len(values) - elif rule == "max_overall": - finite = values[np.isfinite(values)] - summary[metric_name] = ( - float(np.max(finite)) if finite.size else float("nan") - ) - elif rule == "rounds_to_top": - hits = np.where(values >= 1)[0] - summary[metric_name] = float(hits[0] + 1) if hits.size else float("nan") - else: - raise ValueError(f"Unknown summary metric rule: {rule}") - - return summary - - -def _load_base_summary(path: Path) -> dict[str, Any]: - if not path.exists(): - return {} - return json.loads(path.read_text()) - - -def _resolve_metrics_to_update(_: dict[str, Any]) -> list[str]: - return list(SUMMARY_METRIC_RULES.keys()) - - -def _iter_results(root: Path) -> Iterable[Path]: - max_depth = 4 # root/