From fd2e1cb693e5cd3b858e1e84be45e1b97e830dcc Mon Sep 17 00:00:00 2001 From: "Dmitry Ivanov @ helical-ai.com" Date: Mon, 23 Mar 2026 15:26:18 +0100 Subject: [PATCH 1/2] Transcriptformer config download cache dir (#358) Closes #348 Here, we parametrise the cache directory of the Transcriptformer at init. --- .../transcriptformer_config.py | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/helical/models/transcriptformer/transcriptformer_config.py b/helical/models/transcriptformer/transcriptformer_config.py index bf91d679..7a9acc01 100644 --- a/helical/models/transcriptformer/transcriptformer_config.py +++ b/helical/models/transcriptformer/transcriptformer_config.py @@ -1,5 +1,7 @@ from omegaconf import OmegaConf from typing import Literal, List, Union +from pathlib import Path +from helical.constants.paths import CACHE_DIR_HELICAL class TranscriptFormerConfig: @@ -25,7 +27,9 @@ class TranscriptFormerConfig: load_checkpoint: str = None Path to model weights file (automatically set by inference.py) pretrained_embedding: Union[str, List[str]] = None - Path or list of paths to pretrained embeddings for out-of-distribution species + Path or list of paths to pretrained embeddings for out-of-distribution species. Mutually exclusive with `pretrained_embedding_species`. + pretrained_embedding_species: Union[str, List[str]] = None + Underscore-separated species name or list of names to retrieve paths. Example: `pretrained_embedding_species="mus_musculus"` or `pretrained_embedding_species=["mus_musculus", "sus_scrofa"]`. Mutually exclusive with `pretrained_embedding`. gene_col_name: str = "ensembl_id" Column name in AnnData.var containing gene names which will be mapped to ensembl ids. If index is set, .var_names will be used. 
clip_counts: int = 30 @@ -58,6 +62,7 @@ def __init__( output_path: str = "./inference_results", load_checkpoint: str = None, pretrained_embedding: Union[str, List[str]] = None, + pretrained_embedding_species: Union[str, List[str]] = None, gene_col_name: str = "index", clip_counts: int = 30, filter_to_vocabs: bool = True, @@ -68,6 +73,30 @@ def __init__( min_expressed_genes: int = 0, ): + if ( + pretrained_embedding_species is not None + and pretrained_embedding is not None + ): + raise ValueError( + "pretrained_embedding_species and pretrained_embedding are mutually exclusive" + ) + + if pretrained_embedding_species is not None and pretrained_embedding is None: + species_list = ( + [pretrained_embedding_species] + if isinstance(pretrained_embedding_species, str) + else pretrained_embedding_species + ) + vocab_base = ( + Path(CACHE_DIR_HELICAL) + / "models/transcriptformer" + / model_name + / "vocabs" + ) + pretrained_embedding = [ + str(vocab_base / f"{s}_gene.h5") for s in species_list + ] + inference_config: dict = { "batch_size": batch_size, "output_keys": output_keys, From fb80a76458e5bb7f12ca06b2749974513c1c5d57 Mon Sep 17 00:00:00 2001 From: Benoit Putzeys <157973952+bputzeys@users.noreply.github.com> Date: Mon, 23 Mar 2026 14:36:05 +0000 Subject: [PATCH 2/2] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 73e728b1..db8e1b52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "helical" -version = "1.10.1" +version = "1.11.0" authors = [ { name="Helical Team", email="support@helical-ai.com" }, ]