From 2a2cb9a30007990afa9159400095759f0b63b4f8 Mon Sep 17 00:00:00 2001 From: martinaoliver Date: Tue, 3 Mar 2026 14:57:00 +0000 Subject: [PATCH 1/5] TF adapted to work with other non-homosapiens species (#333) * changed to tf on ensembl * added info on genes expressed and mapped * logger * github pr rerun checks * Bump version to 1.8.1 --------- Co-authored-by: maxime --- .../transcriptformer/data/dataloader.py | 21 +++++++++++++++++-- pyproject.toml | 2 +- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/helical/models/transcriptformer/data/dataloader.py b/helical/models/transcriptformer/data/dataloader.py index f8272db8..b4d8f201 100644 --- a/helical/models/transcriptformer/data/dataloader.py +++ b/helical/models/transcriptformer/data/dataloader.py @@ -35,8 +35,13 @@ def load_gene_features(adata, gene_col_name, species: str = "hsapiens"): message = f"Gene column '{gene_col_name}' not found in adata.var.columns. Available columns: {adata.var.columns}. Modify config accordingly." logging.error(message) raise ValueError(message) - adata = map_gene_symbols_to_ensembl_ids(adata, gene_names=gene_col_name, species=species) - gene_names = np.array(list(adata.var["ensembl_id"].values)) + if adata.var[gene_col_name].str.contains("ENS", na=False).mean() <= 0.5: + adata = map_gene_symbols_to_ensembl_ids(adata, gene_names=gene_col_name, species=species) + gene_names = np.array(list(adata.var["ensembl_id"].values)) + else: + adata.var["ensembl_id"] = adata.var[gene_col_name] + gene_names = np.array(list(adata.var["ensembl_id"].values)) + return gene_names, True @@ -52,7 +57,10 @@ def apply_filters( ): """Apply filters to the data.""" if filter_to_vocab: + n_total_genes = len(gene_names) filter_idx = [i for i, name in enumerate(gene_names) if name in vocab] + print(len(vocab)) + not_in_vocab = n_total_genes - len(filter_idx) X = X[:, filter_idx] gene_names = gene_names[filter_idx] if X.shape[1] == 0: @@ -60,6 +68,15 @@ def apply_filters( logging.warning(f"Available genes: {len(gene_names)}") logging.warning(f"Number of non-zero genes: {np.sum(X > 0, axis=1).mean()}") return None, None, None + zero_expr = int((X == 0).all(axis=0).sum()) + nonzero_expr = len(filter_idx) - zero_expr + logging.info( + f"Gene mapping: {len(filter_idx)} / {n_total_genes} in vocab | " + f"not in vocab: {not_in_vocab} | " + f"in vocab but zero expression: {zero_expr} | " + f"in vocab and expressed: {nonzero_expr}" + ) + if filter_outliers > 0: expr_counts = X.sum(axis=1) diff --git a/pyproject.toml b/pyproject.toml index e381dd77..300ee304 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "helical" -version = "1.8.0" +version = "1.8.1" authors = [ { name="Helical Team", email="support@helical-ai.com" }, ] From 7835b009d9e57c16e16f5b6134b62d0e61d9b491 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Fri, 6 Mar 2026 12:10:23 +0100 Subject: [PATCH 2/5] More ESM-2 embeddings Included embeddings for the following species: * canis_lupus_familiaris * macaca_fascicularis * rattus_norvegicus * sus_scrofa as per [Ensembl](https://ftp.ensembl.org/pub/current/fasta/) --- helical/models/transcriptformer/transcriptformer_config.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/helical/models/transcriptformer/transcriptformer_config.py b/helical/models/transcriptformer/transcriptformer_config.py index 6ad19cee..d8acfb47 100644 --- a/helical/models/transcriptformer/transcriptformer_config.py +++ b/helical/models/transcriptformer/transcriptformer_config.py @@ -129,6 +129,10 @@ def __init__( "transcriptformer/tf_metazoa/vocabs/oryctolagus_cuniculus_gene.h5", "transcriptformer/tf_metazoa/vocabs/spongilla_lacustris_gene.h5", "transcriptformer/tf_metazoa/vocabs/homo_sapiens_gene.h5", + "transcriptformer/tf_metazoa/vocabs/canis_lupus_familiaris.h5", + "transcriptformer/tf_metazoa/vocabs/rattus_norvegicus.h5", + "transcriptformer/tf_metazoa/vocabs/sus_scrofa.h5", + "transcriptformer/tf_metazoa/vocabs/macaca_fascicularis.h5", ] elif model_name == "tf_exemplar": self.list_of_files_to_download = [ From ead949aa35d98afb7557b70589c3bd9d8a0f4779 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Fri, 6 Mar 2026 18:06:21 +0100 Subject: [PATCH 3/5] Fixing typo in ESM-2 configs on S3 for 4 additional species --- .../models/transcriptformer/transcriptformer_config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/helical/models/transcriptformer/transcriptformer_config.py b/helical/models/transcriptformer/transcriptformer_config.py index d8acfb47..0256cd95 100644 --- a/helical/models/transcriptformer/transcriptformer_config.py +++ b/helical/models/transcriptformer/transcriptformer_config.py @@ -129,10 +129,10 @@ def __init__( "transcriptformer/tf_metazoa/vocabs/oryctolagus_cuniculus_gene.h5", "transcriptformer/tf_metazoa/vocabs/spongilla_lacustris_gene.h5", "transcriptformer/tf_metazoa/vocabs/homo_sapiens_gene.h5", - "transcriptformer/tf_metazoa/vocabs/canis_lupus_familiaris.h5", - "transcriptformer/tf_metazoa/vocabs/rattus_norvegicus.h5", - "transcriptformer/tf_metazoa/vocabs/sus_scrofa.h5", - "transcriptformer/tf_metazoa/vocabs/macaca_fascicularis.h5", + "transcriptformer/tf_metazoa/vocabs/canis_lupus_familiaris_gene.h5", + "transcriptformer/tf_metazoa/vocabs/rattus_norvegicus_gene.h5", + "transcriptformer/tf_metazoa/vocabs/sus_scrofa_gene.h5", + "transcriptformer/tf_metazoa/vocabs/macaca_fascicularis_gene.h5", ] elif model_name == "tf_exemplar": self.list_of_files_to_download = [ From 7c439b74111ad707815b385b185d4ccc94f350c3 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Mon, 9 Mar 2026 17:17:12 +0100 Subject: [PATCH 4/5] Lists of gene vocabs for TF This change allows transcriptformer to accept multiple vocabularies of gene embeddings for out-of-sample species --- .../test_transcriptformer_model.py | 38 +++++++++++++++++++ helical/models/transcriptformer/model.py | 6 +-- .../transcriptformer_config.py | 8 ++-- 3 files changed, 45 insertions(+), 7 deletions(-) diff --git a/ci/tests/test_transcriptformer/test_transcriptformer_model.py b/ci/tests/test_transcriptformer/test_transcriptformer_model.py index 900088c0..141e803a 100644 --- a/ci/tests/test_transcriptformer/test_transcriptformer_model.py +++ b/ci/tests/test_transcriptformer/test_transcriptformer_model.py @@ -1,3 +1,7 @@ +import numpy as np +import h5py +import pytest +import torch from helical.models.transcriptformer.model import TranscriptFormer from helical.models.transcriptformer.transcriptformer_config import ( TranscriptFormerConfig, @@ -5,6 +9,16 @@ from anndata import AnnData +def _write_dummy_embedding_h5(path, gene_names, emb_dim=2560): + """Write a minimal HDF5 embedding file with random embeddings.""" + with h5py.File(path, "w") as f: + f.create_dataset("keys", data=np.array(gene_names, dtype="S")) + arrays_group = f.create_group("arrays") + rng = np.random.default_rng(seed=0) + for gene in gene_names: + arrays_group.create_dataset(gene, data=rng.random(emb_dim).astype(np.float32)) + + class TestTranscriptFormerModel: configurer = TranscriptFormerConfig(emb_mode="gene") transcriptformer = TranscriptFormer(configurer) @@ -40,3 +54,27 @@ def test_get_embeddings__in_gene_mode(self): assert embeddings[0]["ENSG00000121410"].shape == (2048,) assert embeddings[0]["ENSG00000036549"].shape == (2048,) assert embeddings[0]["ENSG00000074755"].shape == (2048,) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") +class TestTranscriptFormerPretainedEmbeddingList: + """Tests that a list of pretrained embedding paths is accepted and applied correctly.""" + + GENES_FILE_1 = ["ENSG00000121410", "ENSG00000036549"] + GENES_FILE_2 = ["ENSG00000074755", "ENSG00000078808"] + + def test_model_loads_with_list_of_pretrained_embeddings(self, tmp_path): + path1 = str(tmp_path / "embeddings_1.h5") + path2 = str(tmp_path / "embeddings_2.h5") + _write_dummy_embedding_h5(path1, self.GENES_FILE_1) + _write_dummy_embedding_h5(path2, self.GENES_FILE_2) + + configurer = TranscriptFormerConfig( + emb_mode="gene", + pretrained_embedding=[path1, path2], + ) + model = TranscriptFormer(configurer) + + # All genes from both embedding files should be present in the updated vocab + for gene in self.GENES_FILE_1 + self.GENES_FILE_2: + assert gene in model.gene_vocab diff --git a/helical/models/transcriptformer/model.py b/helical/models/transcriptformer/model.py index 64447090..3cec5952 100644 --- a/helical/models/transcriptformer/model.py +++ b/helical/models/transcriptformer/model.py @@ -11,7 +11,7 @@ from helical.models.transcriptformer.utils.utils import stack_dict from helical.models.base_models import HelicalRNAModel from helical.utils.downloader import Downloader -from omegaconf import OmegaConf +from omegaconf import OmegaConf, ListConfig import json import os import pandas as pd @@ -149,12 +149,12 @@ def __init__(self, configurer: TranscriptFormerConfig = configurer): if self.model.inference_config.pretrained_embedding is not None: logger.info("Performing embedding surgery") # Check if pretrained_embedding_paths is a list, if not convert it to a list - if not isinstance(self.model.inference_config.pretrained_embedding, list): + if not isinstance(self.model.inference_config.pretrained_embedding, (list, ListConfig)): pretrained_embedding_paths = [ self.model.inference_config.pretrained_embedding ] else: - pretrained_embedding_paths = ( + pretrained_embedding_paths = list( self.model.inference_config.pretrained_embedding ) self.model, self.gene_vocab = change_embedding_layer( diff --git a/helical/models/transcriptformer/transcriptformer_config.py b/helical/models/transcriptformer/transcriptformer_config.py index 0256cd95..bf91d679 100644 --- a/helical/models/transcriptformer/transcriptformer_config.py +++ b/helical/models/transcriptformer/transcriptformer_config.py @@ -1,5 +1,5 @@ from omegaconf import OmegaConf -from typing import Literal, List +from typing import Literal, List, Union class TranscriptFormerConfig: @@ -24,8 +24,8 @@ class TranscriptFormerConfig: Directory where results will be saved load_checkpoint: str = None Path to model weights file (automatically set by inference.py) - pretrained_embedding: str = None - Path to pretrained embeddings for out-of-distribution species + pretrained_embedding: Union[str, List[str]] = None + Path or list of paths to pretrained embeddings for out-of-distribution species gene_col_name: str = "ensembl_id" Column name in AnnData.var containing gene names which will be mapped to ensembl ids. If index is set, .var_names will be used. clip_counts: int = 30 @@ -57,7 +57,7 @@ def __init__( data_files: List[str] = [None], output_path: str = "./inference_results", load_checkpoint: str = None, - pretrained_embedding: str = None, + pretrained_embedding: Union[str, List[str]] = None, gene_col_name: str = "index", clip_counts: int = 30, filter_to_vocabs: bool = True, From 2f515c0f059cd38bf185ebd05f4950109a99e4c4 Mon Sep 17 00:00:00 2001 From: Priyank Vyas <60008075+priyankvyas@users.noreply.github.com> Date: Tue, 10 Mar 2026 16:31:19 +0000 Subject: [PATCH 5/5] Bump version from 1.8.1 to 1.8.2 (#341) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 300ee304..f10a89f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "helical" -version = "1.8.1" +version = "1.8.2" authors = [ { name="Helical Team", email="support@helical-ai.com" }, ]