7 changes: 6 additions & 1 deletion README.md
@@ -76,9 +76,14 @@ or in case you're installing from the Helical repo cloned locally:
pip install .[mamba-ssm]
```

Note:
## Notes on the installation:
- Make sure your machine has GPU(s) and CUDA installed; this is currently a requirement for the packages `mamba-ssm` and `causal-conv1d`.
- The package `causal_conv1d` requires `torch` to be installed already. Installing `helical` first (without `[mamba-ssm]`) installs `torch` for you; a second installation (with `[mamba-ssm]`) then installs the packages correctly.
- If you have problems installing `mamba-ssm`, you can install the package via the `.whl` files provided on its release page [here](https://github.com/state-spaces/mamba/releases/tag/v2.2.4). Choose the wheel that matches your CUDA, `torch`, and Python versions:
```
pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.3cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
```
- Then continue with `pip install .[mamba-ssm]` to install the remaining dependency, `causal-conv1d`.

### Singularity (Optional)
If you want to run your code in a Singularity container, you can use the [singularity.def](./singularity.def) file and build an Apptainer image with it:
5 changes: 2 additions & 3 deletions examples/notebooks/Geneformer-vs-TranscriptFormer.ipynb
@@ -53,7 +53,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"tags": []
},
@@ -81,7 +81,6 @@
"import torch\n",
"from helical.utils import get_anndata_from_hf_dataset\n",
"from datasets import load_dataset\n",
"from copy import deepcopy\n",
"\n",
"logging.getLogger().setLevel(logging.ERROR)\n",
"\n",
@@ -837,7 +836,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"metadata": {
"tags": []
},
1 change: 0 additions & 1 deletion examples/run_models/configs/transcriptformer_config.yaml
@@ -16,7 +16,6 @@ data_files:
output_path: ./inference_results # Directory where results will be saved
load_checkpoint: null # Path to model weights file (automatically set by inference.py)
pretrained_embedding: null # Path to pretrained embeddings for out-of-distribution species
precision: 16-mixed # Numerical precision for inference (16-mixed, 32, etc.)

# data settings
gene_col_name: 'index' # Column name in AnnData.var containing gene names, which will be mapped to Ensembl IDs. If 'index' is set, .var_names will be used.
15 changes: 13 additions & 2 deletions examples/run_models/run_geneformer.py
@@ -12,16 +12,27 @@ def run(cfg: DictConfig):
geneformer = Geneformer(configurer=geneformer_config)

# either load via huggingface
# hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
# hf_dataset = load_dataset(
# "helical-ai/yolksac_human",
# split="train[:5%]",
# trust_remote_code=True,
# download_mode="reuse_cache_if_exists",
# )
# ann_data = get_anndata_from_hf_dataset(hf_dataset)

# or load directly
ann_data = ad.read_h5ad("./yolksac_human.h5ad")

dataset = geneformer.process_data(ann_data[:10])
dataset = geneformer.process_data(ann_data[:10, :100])
embeddings = geneformer.get_embeddings(dataset)

print(embeddings)
embeddings, attention_weights = geneformer.get_embeddings(
dataset, output_attentions=True
)

print(embeddings)
print(attention_weights.shape)


if __name__ == "__main__":
12 changes: 10 additions & 2 deletions examples/run_models/run_scgpt.py
@@ -12,16 +12,24 @@ def run(cfg: DictConfig):
scgpt = scGPT(configurer=scgpt_config)

# either load via huggingface
# hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
# hf_dataset = load_dataset(
# "helical-ai/yolksac_human",
# split="train[:5%]",
# trust_remote_code=True,
# download_mode="reuse_cache_if_exists",
# )
# ann_data = get_anndata_from_hf_dataset(hf_dataset)

# or load directly
ann_data = ad.read_h5ad("./yolksac_human.h5ad")

data = scgpt.process_data(ann_data[:10])
embeddings = scgpt.get_embeddings(data)
embeddings = scgpt.get_embeddings(data, output_attentions=False)
print(embeddings)
embeddings, attn_weights = scgpt.get_embeddings(data, output_attentions=True)

print(embeddings)
print(attn_weights.shape)


if __name__ == "__main__":
18 changes: 10 additions & 8 deletions helical/constants/hash_values.py
@@ -66,15 +66,15 @@
"transcriptformer/tf_metazoa/vocabs/assay_vocab.json": "0c405e4ead45a4b8350d8e874f834273efdd3f7ad4669b52d3e3727fb4fe70af",
"transcriptformer/tf_metazoa/vocabs/drosophila_melanogaster_gene.h5": "fa3915cf36ed457a162719b2172367a1066cde47677afb2a3b78bc1c51da0ac2",
"transcriptformer/tf_metazoa/vocabs/lytechinus_variegatus_gene.h5": "142c8bc30a1cc9efd3ff92e65346fead6b92a85d3f74ef5b7a3f57a4bfead676",
"transcriptformer/tf_metazoa/vocabs/plasmodium_falciparum_gene.h5": "40678c595883ce04a77cba372bc068b22e592b994a5ef0890e344cae68d08c1b",
"transcriptformer/tf_metazoa/vocabs/plasmodium_falciparum_gene.h5": "40678c595883ce04a77cba372bc068b22e592b994a5ef0890e344cae68d08c1b",
"transcriptformer/tf_metazoa/vocabs/xenopus_laevis_gene.h5": "df5526e1268f88608e38441623dedfa6dfe984cfb73565cb35e7817bbea9eb8c",
"transcriptformer/tf_metazoa/vocabs/caenorhabditis_elegans_gene.h5": "a18f234ce456614b1c6d1fb57f6d1ba031a71539c2c658b0dcbb87af4e7efb64",
"transcriptformer/tf_metazoa/vocabs/gallus_gallus_gene.h5": "17873277d968735bb8f3bcb0d6f9e257db1416a6918d48e7a8e83091f4651c3a",
"transcriptformer/tf_metazoa/vocabs/mus_musculus_gene.h5": "e63c25b8d2ad87d696d8f7dfe4bcb307eee8a7a69dec62a33bb8469dbd75e3e1",
"transcriptformer/tf_metazoa/vocabs/caenorhabditis_elegans_gene.h5": "a18f234ce456614b1c6d1fb57f6d1ba031a71539c2c658b0dcbb87af4e7efb64",
"transcriptformer/tf_metazoa/vocabs/gallus_gallus_gene.h5": "17873277d968735bb8f3bcb0d6f9e257db1416a6918d48e7a8e83091f4651c3a",
"transcriptformer/tf_metazoa/vocabs/mus_musculus_gene.h5": "e63c25b8d2ad87d696d8f7dfe4bcb307eee8a7a69dec62a33bb8469dbd75e3e1",
"transcriptformer/tf_metazoa/vocabs/saccharomyces_cerevisiae_gene.h5": "4685470b3d8bb5bf82aded19b1b208293d4b9cae63ffb6bf42d7ad6be3f3f299",
"transcriptformer/tf_metazoa/vocabs/danio_rerio_gene.h5": "e2b34013c1a0779361b3aaac05cdba5062cb117ce903779232a43f73a99c78e3",
"transcriptformer/tf_metazoa/vocabs/homo_sapiens_gene.h5": "74366d4f45c4bd60983c3d1d1c406d7d58d30a798455c239eb2691eaa162e2dc",
"transcriptformer/tf_metazoa/vocabs/oryctolagus_cuniculus_gene.h5": "0fbe3ca92a36f58134491723cee0834fe06e4106bf0c3b17e1f65ea884abf8a2",
"transcriptformer/tf_metazoa/vocabs/danio_rerio_gene.h5": "e2b34013c1a0779361b3aaac05cdba5062cb117ce903779232a43f73a99c78e3",
"transcriptformer/tf_metazoa/vocabs/homo_sapiens_gene.h5": "74366d4f45c4bd60983c3d1d1c406d7d58d30a798455c239eb2691eaa162e2dc",
"transcriptformer/tf_metazoa/vocabs/oryctolagus_cuniculus_gene.h5": "0fbe3ca92a36f58134491723cee0834fe06e4106bf0c3b17e1f65ea884abf8a2",
"transcriptformer/tf_metazoa/vocabs/spongilla_lacustris_gene.h5": "e082a6bccbcbf89d5f04b97c5f39402765897259becb6b650a2dfd00ea4d3afc",
"transcriptformer/tf_exemplar/config.json": "695c50832a608078bc3b136a0776f8d795532369bc249a83c07db2b28bb97b24",
"transcriptformer/tf_exemplar/model_weights.pt": "1adbb8671c00ea9d66ac0a1be8261c7e96824eaf8435ae17b048570372d4f8a1",
@@ -83,5 +83,7 @@
"transcriptformer/tf_exemplar/vocabs/danio_rerio_gene.h5": "e2b34013c1a0779361b3aaac05cdba5062cb117ce903779232a43f73a99c78e3",
"transcriptformer/tf_exemplar/vocabs/drosophila_melanogaster_gene.h5": "fa3915cf36ed457a162719b2172367a1066cde47677afb2a3b78bc1c51da0ac2",
"transcriptformer/tf_exemplar/vocabs/homo_sapiens_gene.h5": "74366d4f45c4bd60983c3d1d1c406d7d58d30a798455c239eb2691eaa162e2dc",
"transcriptformer/tf_exemplar/vocabs/mus_musculus_gene.h5": "e63c25b8d2ad87d696d8f7dfe4bcb307eee8a7a69dec62a33bb8469dbd75e3e1",
"transcriptformer/tf_exemplar/vocabs/mus_musculus_gene.h5": "e63c25b8d2ad87d696d8f7dfe4bcb307eee8a7a69dec62a33bb8469dbd75e3e1",
"text_embeddings/gene_prot_embeddings_granite.json": "8fb51d28986fa24e4b7b63bc28f86789db184f902259838cf7f7aa9ae4942ffd",
"mrna_embeddings/sequences_helix_genes_cds_go.json": "364c744b845572b6900820f031c88a24a9ed5b12d2b9c49fa9e5855b1b7abdbc",
}
2 changes: 1 addition & 1 deletion helical/models/evo_2/README.md
@@ -101,7 +101,7 @@ export CPLUS_INCLUDE_PATH=$CONDA_PREFIX/lib/python3.11/site-packages/nvidia/nvtx
pip install torch==2.6.0
pip install "helical[evo-2]@git+https://github.com/helicalAI/helical.git"

git clone git@github.com:Zymrael/vortex.git
git clone https://github.com/Zymrael/vortex.git
cd vortex
git checkout f243e8e
sed -i 's/torch==2.5.1/torch==2.6.0/g' pyproject.toml
10 changes: 10 additions & 0 deletions helical/models/geneformer/geneformer_utils.py
@@ -154,10 +154,12 @@ def get_embs(
eos_present,
device,
silent=False,
output_attentions=False,
):
model_input_size = get_model_input_size(model)
total_batch_length = len(filtered_input_data)
embs_list = []
attn_list = []

_check_for_expected_special_tokens(
filtered_input_data, emb_mode, cls_present, eos_present, gene_token_dict
@@ -182,9 +184,15 @@
outputs = model(
input_ids=input_data_minibatch,
attention_mask=gen_attention_mask(minibatch),
output_attentions=output_attentions,
)

embs_i = outputs.hidden_states[layer_to_quant]
# attention of size (batch_size, num_heads, sequence_length, sequence_length)
if output_attentions:
attn_i = outputs.attentions[layer_to_quant]
# attn_i = torch.mean(attn_i, dim=1).cpu().numpy() # average over heads
attn_list.extend(attn_i.cpu().numpy())

embs_list.extend(
_compute_embeddings_depending_on_mode(
@@ -207,6 +215,8 @@
if emb_mode != "gene":
embs_list = np.array(embs_list)

if output_attentions:
return embs_list, np.array(attn_list)
return embs_list


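A note on consuming the new attention output: the in-loop head averaging above is left commented out, so callers get the full per-head maps back and can reduce them downstream. A minimal sketch, assuming only the `(batch_size, num_heads, seq_len, seq_len)` layout noted in the inline comment (the dimensions below are placeholders):

```python
import numpy as np

# Placeholder for the attention weights returned with output_attentions=True;
# assumed shape: (n_cells, num_heads, seq_len, seq_len).
attention_weights = np.random.rand(4, 8, 16, 16).astype(np.float32)

# Collapse the head dimension to get one (seq_len, seq_len) map per cell.
mean_attention = attention_weights.mean(axis=1)
print(mean_attention.shape)  # (4, 16, 16)
```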
27 changes: 18 additions & 9 deletions helical/models/geneformer/model.py
@@ -95,16 +95,15 @@ def __init__(self, configurer: GeneformerConfig = default_configurer) -> None:
available_layers = list(np.arange(num_available_layers))
available_layers.append(-1)
if self.config["emb_layer"] not in available_layers:
message = (
f"Layer {self.config['emb_layer']} is not available. Available layers are {available_layers}."
)
message = f"Layer {self.config['emb_layer']} is not available. Available layers are {available_layers}."
LOGGER.error(message)
raise ValueError(message)

if self.config["emb_layer"] < (num_available_layers-1) and self.config["emb_layer"] != -1:
message = (
f"Layer {self.config['emb_layer']} is not the last layer. This can lead to different embeddings and fine-tuning results."
)

if (
self.config["emb_layer"] < (num_available_layers - 1)
and self.config["emb_layer"] != -1
):
message = f"Layer {self.config['emb_layer']} is not the last layer. This can lead to different embeddings and fine-tuning results."
LOGGER.warning(message)

self.layer_to_quant = self.config["emb_layer"]
@@ -203,18 +202,27 @@ def process_data(
LOGGER.info(f"Successfully processed the data for Geneformer.")
return tokenized_dataset

def get_embeddings(self, dataset: Dataset) -> np.array:
def get_embeddings(
self, dataset: Dataset, output_attentions: bool = False
) -> np.array:
"""Gets the gene embeddings from the Geneformer model

Parameters
----------
dataset : Dataset
The tokenized dataset containing the processed data
output_attentions : bool, optional, default=False
Whether to output attention weights from the model. This is useful for debugging or analysis.
The attention maps come from the same layer as the embeddings.

Returns
-------
np.array
The gene embeddings in the form of a numpy array
np.array, optional
The attention weights from the model, if `output_attentions` is set to True.
The shape of the attention weights is (batch_size, num_heads, seq_length, seq_length).
If `output_attentions` is False, this will not be returned.
"""
LOGGER.info(f"Started getting embeddings:")
embeddings = get_embs(
Expand All @@ -229,6 +237,7 @@ def get_embeddings(self, dataset: Dataset) -> np.array:
self.cls_present,
self.eos_present,
self.device,
output_attentions=output_attentions,
)

LOGGER.info(f"Finished getting embeddings.")
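For completeness, a minimal usage sketch of the two call forms documented above. It mirrors `examples/run_models/run_geneformer.py` earlier in this diff; the import paths and printed shapes follow the Helical examples and docstring and should be read as assumptions, not verified output:

```python
import anndata as ad
from helical.models.geneformer import Geneformer, GeneformerConfig

geneformer = Geneformer(configurer=GeneformerConfig())
ann_data = ad.read_h5ad("./yolksac_human.h5ad")  # same local file as the run script
dataset = geneformer.process_data(ann_data[:10, :100])

embeddings = geneformer.get_embeddings(dataset)  # embeddings only, unchanged behaviour
embeddings, attentions = geneformer.get_embeddings(dataset, output_attentions=True)
print(attentions.shape)  # (batch_size, num_heads, seq_length, seq_length) per the docstring
```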
2 changes: 1 addition & 1 deletion helical/models/helix_mrna/model.py
@@ -94,8 +94,8 @@ def process_data(self, sequences: Union[list[str], DataFrame]) -> Dataset:
tokenized_sequences = self.tokenizer(
sequences,
return_tensors="pt",
padding="max_length",
truncation=True,
padding="longest",
max_length=self.config["input_size"],
return_special_tokens_mask=True,
)
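For context on the padding change above: `padding="longest"` pads each batch only to its longest sequence instead of always padding to the model's full input size, while `max_length` still caps the sequence length. A minimal sketch of the difference with a generic Hugging Face tokenizer (the checkpoint name is purely illustrative, not the tokenizer Helix-mRNA uses):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative checkpoint
seqs = ["a short sequence", "an even slightly longer input sequence"]

fixed = tok(seqs, return_tensors="pt", padding="max_length", truncation=True, max_length=64)
dynamic = tok(seqs, return_tensors="pt", padding="longest", truncation=True, max_length=64)

print(fixed["input_ids"].shape)    # (2, 64): always padded to max_length
print(dynamic["input_ids"].shape)  # (2, n_longest): padded only to the longest sequence in the batch
```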
50 changes: 38 additions & 12 deletions helical/models/scgpt/model.py
@@ -86,20 +86,28 @@ def __init__(self, configurer: scGPTConfig = configurer) -> None:
f"'scGPT' model is in '{mode}' mode, on device '{next(self.model.parameters()).device.type}' with embedding mode '{self.config['emb_mode']}'."
)

def get_embeddings(self, dataset: Dataset) -> np.array:
def get_embeddings(
self, dataset: Dataset, output_attentions: bool = False
) -> np.array:
"""Gets the gene embeddings

Parameters
----------
dataset : Dataset
The processed dataset to get the embeddings from.
output_attentions : bool, optional, default=False
Whether to output the attention maps from the model. If set to True, the attention maps are returned along with the embeddings.
If set to False, only the embeddings are returned. **Note**: returning attention maps increases memory usage significantly, so enable this only if you need them.

Returns
-------
np.ndarray | List[pd.Series]
The embeddings produced by the model.
The return type depends on the `emb_mode` parameter in the configuration.
If `emb_mode` is set to "gene", the embeddings are returned as a list of pd.Series which contain a mapping of gene_name:embedding for each cell.

np.ndarray
If `output_attentions` is set to True, the attention maps will be returned as a numpy array of shape (n_layers, n_heads, n_cells, n_tokens, n_tokens).
"""
LOGGER.info(f"Started getting embeddings:")

@@ -136,6 +144,7 @@ def get_embeddings(self, dataset: Dataset) -> np.array:
device = next(self.model.parameters()).device

resulting_embeddings = []
resulting_attn_maps = []

with (
torch.no_grad(),
@@ -147,16 +156,30 @@
src_key_padding_mask = input_gene_ids.eq(
self.vocab[self.config["pad_token"]]
)
embeddings = self.model._encode(
input_gene_ids,
data_dict["expr"].to(device),
src_key_padding_mask=src_key_padding_mask,
batch_labels=(
data_dict["batch_labels"].to(device)
if use_batch_labels
else None
),
)
if output_attentions:
embeddings, attn_maps = self.model._encode(
input_gene_ids,
data_dict["expr"].to(device),
src_key_padding_mask=src_key_padding_mask,
batch_labels=(
data_dict["batch_labels"].to(device)
if use_batch_labels
else None
),
output_attentions=output_attentions,
)
resulting_attn_maps.extend(attn_maps)
else:
embeddings = self.model._encode(
input_gene_ids,
data_dict["expr"].to(device),
src_key_padding_mask=src_key_padding_mask,
batch_labels=(
data_dict["batch_labels"].to(device)
if use_batch_labels
else None
),
)

resulting_embeddings.extend(
self._compute_embeddings_depending_on_mode(embeddings, data_dict)
@@ -165,7 +188,10 @@
resulting_embeddings = self._normalize_embeddings(resulting_embeddings)

LOGGER.info(f"Finished getting embeddings.")
return resulting_embeddings
if output_attentions:
return resulting_embeddings, torch.stack(resulting_attn_maps).cpu().numpy()
else:
return resulting_embeddings

def _normalize_embeddings(self, resulting_embeddings: torch.tensor) -> np.ndarray:
"""
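Given the memory warning in the docstring above, callers will typically want to reduce the returned attention maps right away. A minimal sketch, assuming the `(n_layers, n_heads, n_cells, n_tokens, n_tokens)` layout stated in the docstring (all dimensions below are placeholders):

```python
import numpy as np

# Placeholder for the array returned by scGPT.get_embeddings(..., output_attentions=True).
attn_maps = np.random.rand(2, 4, 8, 16, 16).astype(np.float32)

last_layer = attn_maps[-1]          # keep only the last layer: (n_heads, n_cells, n_tokens, n_tokens)
head_avg = last_layer.mean(axis=0)  # average over heads: (n_cells, n_tokens, n_tokens)
print(head_avg.shape)               # (8, 16, 16)
```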
22 changes: 17 additions & 5 deletions helical/models/scgpt/model_dir/model.py
@@ -7,7 +7,10 @@
from torch import nn, Tensor
import torch.distributed as dist
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer

# from torch.nn import TransformerEncoder, TransformerEncoderLayer
### Needed a custom version to output attention weights
from .transformer import TransformerEncoder, TransformerEncoderLayer
from torch.distributions import Bernoulli
from tqdm import trange

@@ -170,6 +173,7 @@ def _encode(
values: Tensor,
src_key_padding_mask: Tensor,
batch_labels: Optional[Tensor] = None, # (batch,)
output_attentions: bool = False,
) -> Tensor:
self._check_batch_labels(batch_labels)

@@ -191,10 +195,18 @@
elif getattr(self, "bn", None) is not None:
total_embs = self.bn(total_embs.permute(0, 2, 1)).permute(0, 2, 1)

output = self.transformer_encoder(
total_embs, src_key_padding_mask=src_key_padding_mask
)
return output # (batch, seq_len, embsize)
if output_attentions:
output, attn_maps = self.transformer_encoder(
total_embs,
src_key_padding_mask=src_key_padding_mask,
output_attentions=output_attentions,
)
return output, attn_maps
else:
output = self.transformer_encoder(
total_embs, src_key_padding_mask=src_key_padding_mask
)
return output # (batch, seq_len, embsize)

def _get_cell_emb_from_layer(
self, layer_output: Tensor, weights: Tensor = None