From 1d0029cf9a8d45d036ba157d39630844ed27fde8 Mon Sep 17 00:00:00 2001 From: Matthew Wood <62712722+mattwoodx@users.noreply.github.com> Date: Fri, 26 Sep 2025 16:24:24 +0200 Subject: [PATCH 1/3] scGPT Binning Randomness (#269) * Revert "Hot fix remove notebook from release run" This reverts commit 52bba026f1cb6149cf7e6a7f57f1ee261a6c4769. * Use generator instead of seed state for reproducible embeddings independent of ordering of cells --- .github/workflows/release.yml | 1 - ci/tests/test_scgpt/test_binning.py | 4 ++-- helical/models/scgpt/binning.py | 3 ++- helical/models/scgpt/data_collator.py | 5 ++++- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8fed8c75..c31b24c5 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -175,7 +175,6 @@ jobs: sed -i 's/list(np.array(train_dataset.obs\[\\"LVL1\\"].tolist()))/list(np.array(train_dataset.obs\[\\"LVL1\\"].tolist()))[:100]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb sed -i 's/list(np.array(test_dataset.obs\[\\"LVL1\\"].tolist()))/list(np.array(test_dataset.obs\[\\"LVL1\\"].tolist()))[:10]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb rm ./examples/notebooks/Evo-2.ipynb - rm ./examples/notebooks/Geneformer-Series-Comparison.ipynb - name: Run Notebooks run: | diff --git a/ci/tests/test_scgpt/test_binning.py b/ci/tests/test_scgpt/test_binning.py index b93526a7..524b7ad0 100644 --- a/ci/tests/test_scgpt/test_binning.py +++ b/ci/tests/test_scgpt/test_binning.py @@ -38,7 +38,7 @@ def test_digitize_identical_bins(): x = np.array([1, 2, 3, 4, 5]) bins = np.array([2, 2, 4, 4]) result = _digitize(x, bins) - expected = np.array([0, 1, 2, 3, 4]) + expected = np.array([0, 2, 2, 3, 4]) assert np.array_equal(result, expected) @@ -51,7 +51,7 @@ def test_digitize_identical_bins(): np.array([1, 1, 1, 2, 2, 2, 3, 3, 4]), ), # distrubution of the bins depends on the distribution of the data - (np.array([1, 1, 1, 1, 1, 1, 1, 8, 9]), np.array([2, 1, 1, 2, 3, 3, 3, 3, 4])), + (np.array([1, 1, 1, 1, 1, 1, 1, 8, 9]), np.array([1, 2, 1, 1, 3, 1, 1, 3, 4])), (np.array([1, 2, 1, 1, 9, 6, 7, 8, 9]), np.array([1, 2, 1, 1, 4, 2, 2, 3, 4])), ), ) diff --git a/helical/models/scgpt/binning.py b/helical/models/scgpt/binning.py index 9c457fb1..d029da6c 100644 --- a/helical/models/scgpt/binning.py +++ b/helical/models/scgpt/binning.py @@ -32,7 +32,8 @@ def _digitize(x: np.ndarray, bins: np.ndarray, side="both") -> np.ndarray: right_digits = np.digitize(x, bins, right=True) - rands = np.random.rand(len(x)) # uniform random numbers + rng = np.random.default_rng(42) + rands = rng.random(len(x)) digits = rands * (right_digits - left_digits) + left_digits digits = np.ceil(digits).astype(np.int64) diff --git a/helical/models/scgpt/data_collator.py b/helical/models/scgpt/data_collator.py index 5dd3a634..5fdc1882 100644 --- a/helical/models/scgpt/data_collator.py +++ b/helical/models/scgpt/data_collator.py @@ -163,7 +163,10 @@ def _sample( # keep the first n tokens unchanged _n = self.keep_first_n_tokens - indices = torch.randperm(len(genes) - _n, device=device)[: max_length - _n] + g = torch.Generator().manual_seed(0) + indices = torch.randperm(len(genes) - _n, device=device, generator=g)[ + : max_length - _n + ] indices = torch.cat([torch.arange(_n), indices + _n], dim=0) return genes[indices], expressions[indices] From b4442824a0e22facf57d1d9d0e9ada2abd1dab0c Mon Sep 17 00:00:00 2001 From: Benoit Putzeys <157973952+bputzeys@users.noreply.github.com> Date: Fri, 26 Sep 2025 16:31:37 +0200 Subject: [PATCH 2/3] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f6747baa..3c3b9be7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "helical" -version = "1.4.4" +version = "1.4.5" authors = [ { name="Helical Team", email="support@helical-ai.com" }, ] From 805c30cac613b3ad7897e91294783574d65114aa Mon Sep 17 00:00:00 2001 From: Matthew Wood <62712722+mattwoodx@users.noreply.github.com> Date: Mon, 29 Sep 2025 09:51:59 +0200 Subject: [PATCH 3/3] Release workflow fix (#271) * Revert "Hot fix remove notebook from release run" This reverts commit 52bba026f1cb6149cf7e6a7f57f1ee261a6c4769. * Remove large notebook from workflow * fixup! Remove large notebook from workflow --- .github/workflows/release.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index c31b24c5..8fed8c75 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -175,6 +175,7 @@ jobs: sed -i 's/list(np.array(train_dataset.obs\[\\"LVL1\\"].tolist()))/list(np.array(train_dataset.obs\[\\"LVL1\\"].tolist()))[:100]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb sed -i 's/list(np.array(test_dataset.obs\[\\"LVL1\\"].tolist()))/list(np.array(test_dataset.obs\[\\"LVL1\\"].tolist()))[:10]/g' ./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb rm ./examples/notebooks/Evo-2.ipynb + rm ./examples/notebooks/Geneformer-Series-Comparison.ipynb - name: Run Notebooks run: |