From a3edf635e86604be5a4898b1631c783cc6ca1ad2 Mon Sep 17 00:00:00 2001 From: Matt Davis Date: Tue, 28 Apr 2026 00:48:33 -0400 Subject: [PATCH] Fix bring-up bugs surfaced by transformers >=5 / datasets >=4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While bringing up the DNABERT-Epi training pipeline against current PyPI deps (torch 2.11+, transformers 5.x, datasets 4.x) on Python 3.14 we hit a small set of upstream bugs that block end-to-end execution. Each is independently fixable and none changes algorithmic behaviour. 1. src/models/pair_finetuning_dnabert.py - load_sequence_data: dataset_dict["sgRNA"] → dataset_dict["sgrna"] to match the key data_loader.load_dataset_information actually returns (data_loader.py:168). Was a hard KeyError on first iter. - Same function: sgrna_seqs / dna_seqs are initialised as plain `list` but were then indexed as if they were dicts (`sgrna_seqs[dataset_name].extend(...)`) — TypeError. The intent is clearly to accumulate across all datasets, so `.extend(...)` directly on the lists. - generate_random_sequence_input: `range((n_samples/n_sgrna)//6)` yields a float on Py3, which `range()` rejects. Wrap in `int()`. - Trainer instantiation: `tokenizer=` kwarg was renamed to `processing_class=` in transformers 5. Old kwarg now TypeErrors. 2. src/models/data_loader.py — BalancedSampler.__init__ `isinstance(dataset, Dataset)` checks torch.utils.data.Dataset, but the actual object is a HuggingFace datasets.arrow_dataset.Dataset (unrelated classes). It then falls to the else-branch which expects torch.Tensor from `dataset["labels"]`, but datasets>=4 returns a `Column`, so `.tolist()` never runs and `self.labels` is a sequence of 0-d Tensors. The defaultdict buckets each tensor by id() so the int-key check `0 in label_to_indices` fails even when both classes are present. Normalise once via a more permissive detection cascade (Tensor → .tolist → list comprehension). 3. 
src/models/dnabert_module.py — test_scratch + test_transfer_epi `test_dataset["labels"].numpy()` → `np.asarray(list(...))`. Same datasets>=4 root cause; Column has no .numpy(). 4. src/models/dnabert2_module.py (new file) run_preprocess.py imports `models.dnabert2_module` unconditionally but the file was never committed; ImportError before any work. Stub keeps the import resolvable. The module is never actually called on the DNABERT (no-2) code path. End-to-end verification on Lazzarotto 2020 GUIDE-seq fold 0 (scratch mode, 8 epochs, RTX 4070): pair-finetune converges to train_loss 0.060, off-target classifier converges to train_loss 0.0028, checkpoint saved successfully, downstream evaluation runs. Note: I left `config.yaml` untouched in this PR. Bringing up on a fresh host also requires adding two empty top-level dicts (`dataset_name: {}` and `model_info: {}`) — every entry-point writes into them without first creating the parent. That's a separate config schema change worth surfacing on its own. 
--- src/models/data_loader.py | 34 +++++++++++++++++++++++---- src/models/dnabert2_module.py | 11 +++++++++ src/models/dnabert_module.py | 10 ++++++-- src/models/pair_finetuning_dnabert.py | 29 ++++++++++++++++------- 4 files changed, 69 insertions(+), 15 deletions(-) create mode 100644 src/models/dnabert2_module.py diff --git a/src/models/data_loader.py b/src/models/data_loader.py index 3cdd455..4c43e4b 100644 --- a/src/models/data_loader.py +++ b/src/models/data_loader.py @@ -20,12 +20,36 @@ class BalancedSampler(Sampler): def __init__(self, dataset, majority_rate: float=0.5, seed=None): - if isinstance(dataset, dict): - self.labels = dataset["labels"].tolist() if isinstance(dataset["labels"], torch.Tensor) else dataset["labels"] - elif isinstance(dataset, Dataset): - self.labels = [dataset[i]['labels'].item() for i in range(len(dataset))] + # `isinstance(dataset, Dataset)` checks `torch.utils.data.Dataset`, + # but a HuggingFace `datasets.arrow_dataset.Dataset` does NOT + # inherit from it (they're unrelated classes). The else-branch + # below previously expected `torch.Tensor` from + # `dataset["labels"]`, but `datasets >= 4` returns a + # `datasets.arrow_dataset.Column` instead — `.tolist()` is never + # called and `self.labels` ends up as a sequence of 0-d Tensor + # objects. The defaultdict then buckets each tensor by `id()` + # (every label its own key) and the int-key check + # `0 in label_to_indices` fails even when the dataset has both + # classes. Normalise once, here, and let downstream code see + # plain Python ints. 
+ if isinstance(dataset, list): + raw = dataset + elif isinstance(dataset, dict) or hasattr(dataset, '__getitem__'): + try: + raw = dataset["labels"] + except (KeyError, TypeError): + raw = [dataset[i]['labels'] for i in range(len(dataset))] else: - self.labels = dataset["labels"].tolist() if isinstance(dataset["labels"], torch.Tensor) else dataset["labels"] + raw = dataset + + if isinstance(raw, torch.Tensor): + self.labels = raw.tolist() + elif hasattr(raw, 'tolist'): + # numpy array, pandas Series, datasets.arrow_dataset.Column, … + self.labels = list(raw.tolist()) + else: + # Per-row tensors → Python ints + self.labels = [int(x.item()) if hasattr(x, 'item') else int(x) for x in raw] self.majority_rate = majority_rate self.seed = seed diff --git a/src/models/dnabert2_module.py b/src/models/dnabert2_module.py new file mode 100644 index 0000000..525736d --- /dev/null +++ b/src/models/dnabert2_module.py @@ -0,0 +1,11 @@ +# Stub for the upstream's dnabert2 path. +# +# `run_preprocess.py` imports this unconditionally but never calls into +# it on the DNABERT (no-2) code path. The upstream commit that should +# have shipped the real module never materialised in +# opensensor/CRISPR_DNABERT@bfbeb81e; this stub keeps the import +# resolvable without changing behaviour for the no-2 path we care about. +# +# If a future code path actually invokes anything here, the AttributeError +# will be loud and obvious — we deliberately do NOT shim functions +# silently. diff --git a/src/models/dnabert_module.py b/src/models/dnabert_module.py index c8a921f..d1835c8 100644 --- a/src/models/dnabert_module.py +++ b/src/models/dnabert_module.py @@ -488,7 +488,10 @@ def test_scratch(self) -> None: # Results processing probabilities = inference_results["probability"] predictions = inference_results["prediction"] - true_labels = test_dataset["labels"].numpy() + # datasets >=4 returns a `datasets.arrow_dataset.Column` from + # `dataset["labels"]`, not a torch.Tensor; .numpy() is gone. 
+ # Pull the column to a list and convert via numpy. + true_labels = np.asarray(list(test_dataset["labels"])) # Save the results os.makedirs(os.path.dirname(self.result_path), exist_ok=True) @@ -750,7 +753,10 @@ def test_transfer_epi(self) -> None: # Results processing probabilities = inference_results["probability"] predictions = inference_results["prediction"] - true_labels = test_dataset["labels"].numpy() + # datasets >=4 returns a `datasets.arrow_dataset.Column` from + # `dataset["labels"]`, not a torch.Tensor; .numpy() is gone. + # Pull the column to a list and convert via numpy. + true_labels = np.asarray(list(test_dataset["labels"])) # Save the results os.makedirs(os.path.dirname(self.result_path), exist_ok=True) diff --git a/src/models/pair_finetuning_dnabert.py b/src/models/pair_finetuning_dnabert.py index 582aedc..f20f4b8 100644 --- a/src/models/pair_finetuning_dnabert.py +++ b/src/models/pair_finetuning_dnabert.py @@ -214,7 +214,7 @@ def generate_random_sequence_input(self, rna_seq_list: list, n_samples: int) -> dna_seqs.append("-" + rna_seq) # Generate off-target pairs with random mutations for mismatch_count in range(1, 7): - for _ in range((n_samples/n_sgrna)//6): + for _ in range(int((n_samples / n_sgrna) // 6)): # Generate random DNA sequence with specified number of mismatches rna_list = list(rna_seq) dna_list = rna_list.copy() @@ -240,21 +240,33 @@ def generate_random_sequence_input(self, rna_seq_list: list, n_samples: int) -> return {"rna_seq": rna_seqs, "dna_seq": dna_seqs} def load_sequence_data(self, if_test=None) -> Dataset: + # Two upstream bugs fixed here: + # 1. data_loader.load_dataset_information returns the dataset + # dict with key "sgrna" (lowercase, see data_loader.py:168), + # not "sgRNA". The previous `dataset_dict["sgRNA"]` lookup + # was a hard KeyError on the first dataset. + # 2. `sgrna_seqs[dataset_name].extend(...)` treats the lists + # `sgrna_seqs = []` and `dna_seqs = []` as if they were dicts + # keyed by dataset name. 
That's a TypeError as soon as the + # first iteration runs. The intent is clearly to accumulate + # across all datasets — direct list extend matches that. dataset_names = [ "Lazzarotto_2020_CHANGE_seq", "Lazzarotto_2020_GUIDE_seq", "SchmidBurgk_2020_TTISS", "Chen_2017_GUIDE_seq", "Listgarten_2018_GUIDE_seq", "Tsai_2015_GUIDE_seq_1", "Tsai_2015_GUIDE_seq_2" ] - dna_seqs = [] - sgrna_seqs = [] + dna_seqs: list[str] = [] + sgrna_seqs: list[str] = [] for dataset_name in dataset_names: self.config["dataset_name"]["dataset_current"] = dataset_name DataLoaderClass = data_loader.DataLoaderClass(self.config) dataset_dict = DataLoaderClass.load_dataset() - sgrna_list = dataset_dict["sgRNA"] + sgrna_list = dataset_dict["sgrna"] # Generate random sequence inputs - generated_data = self.generate_random_sequence_input(sgrna_list, n_samples=len(dataset_dict["rna_seq"])//10) - sgrna_seqs[dataset_name].extend(generated_data["rna_seq"]) - dna_seqs[dataset_name].extend(generated_data["dna_seq"]) + generated_data = self.generate_random_sequence_input( + sgrna_list, n_samples=len(dataset_dict["rna_seq"]) // 10 + ) + sgrna_seqs.extend(generated_data["rna_seq"]) + dna_seqs.extend(generated_data["dna_seq"]) dataset = self.process(sgrna_seqs, dna_seqs) if if_test: dataset = dataset.select(range(200000)) @@ -409,7 +421,8 @@ def train(self) -> None: args=training_args, train_dataset=train_datasets, compute_metrics=compute_metrics, - tokenizer=self.tokenizer, + # transformers >=5: kwarg renamed tokenizer → processing_class + processing_class=self.tokenizer, callbacks=[loss_callback], # Add the loss callback here )