7 changes: 6 additions & 1 deletion README.md
@@ -76,9 +76,14 @@ or in case you're installing from the Helical repo cloned locally:
pip install .[mamba-ssm]
```

Note:
## Notes on the installation:
- Make sure your machine has GPU(s) and CUDA installed; this is currently a requirement for the packages `mamba-ssm` and `causal-conv1d`.
- The package `causal_conv1d` requires `torch` to be installed already. Installing `helical` first (without `[mamba-ssm]`) installs `torch` for you; a second installation (with `[mamba-ssm]`) then installs the packages correctly.
- If you have problems installing `mamba-ssm`, you can install the package via the `.whl` files provided on its release page [here](https://github.com/state-spaces/mamba/releases/tag/v2.2.4). Choose the wheel that matches your CUDA, `torch`, and Python versions:
```
pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.3cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
```
- Then continue with `pip install .[mamba-ssm]` to install the remaining dependency, `causal-conv1d`.

### Singularity (Optional)
If you want to run your code in a Singularity container, you can use the [singularity.def](./singularity.def) file and build an Apptainer image with it:
5 changes: 2 additions & 3 deletions examples/notebooks/Geneformer-vs-TranscriptFormer.ipynb
@@ -53,7 +53,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"tags": []
},
@@ -81,7 +81,6 @@
"import torch\n",
"from helical.utils import get_anndata_from_hf_dataset\n",
"from datasets import load_dataset\n",
"from copy import deepcopy\n",
"\n",
"logging.getLogger().setLevel(logging.ERROR)\n",
"\n",
@@ -837,7 +836,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"metadata": {
"tags": []
},
1 change: 0 additions & 1 deletion examples/run_models/configs/transcriptformer_config.yaml
@@ -16,7 +16,6 @@ data_files:
output_path: ./inference_results # Directory where results will be saved
load_checkpoint: null # Path to model weights file (automatically set by inference.py)
pretrained_embedding: null # Path to pretrained embeddings for out-of-distribution species
precision: 16-mixed # Numerical precision for inference (16-mixed, 32, etc.)

# data settings
gene_col_name: 'index' # Column name in AnnData.var containing gene names, which will be mapped to Ensembl IDs. If 'index' is set, .var_names will be used.
15 changes: 13 additions & 2 deletions examples/run_models/run_geneformer.py
@@ -12,16 +12,27 @@ def run(cfg: DictConfig):
geneformer = Geneformer(configurer=geneformer_config)

# either load via huggingface
# hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
# hf_dataset = load_dataset(
# "helical-ai/yolksac_human",
# split="train[:5%]",
# trust_remote_code=True,
# download_mode="reuse_cache_if_exists",
# )
# ann_data = get_anndata_from_hf_dataset(hf_dataset)

# or load directly
ann_data = ad.read_h5ad("./yolksac_human.h5ad")

dataset = geneformer.process_data(ann_data[:10])
dataset = geneformer.process_data(ann_data[:10, :100])
embeddings = geneformer.get_embeddings(dataset)

print(embeddings)
embeddings, attention_weights = geneformer.get_embeddings(
dataset, output_attentions=True
)

print(embeddings)
print(attention_weights.shape)


if __name__ == "__main__":
12 changes: 10 additions & 2 deletions examples/run_models/run_scgpt.py
@@ -12,16 +12,24 @@ def run(cfg: DictConfig):
scgpt = scGPT(configurer=scgpt_config)

# either load via huggingface
# hf_dataset = load_dataset("helical-ai/yolksac_human",split="train[:5%]", trust_remote_code=True, download_mode="reuse_cache_if_exists")
# hf_dataset = load_dataset(
# "helical-ai/yolksac_human",
# split="train[:5%]",
# trust_remote_code=True,
# download_mode="reuse_cache_if_exists",
# )
# ann_data = get_anndata_from_hf_dataset(hf_dataset)

# or load directly
ann_data = ad.read_h5ad("./yolksac_human.h5ad")

data = scgpt.process_data(ann_data[:10])
embeddings = scgpt.get_embeddings(data)
embeddings = scgpt.get_embeddings(data, output_attentions=False)
print(embeddings)
embeddings, attn_weights = scgpt.get_embeddings(data, output_attentions=True)

print(embeddings)
print(attn_weights.shape)


if __name__ == "__main__":
18 changes: 10 additions & 8 deletions helical/constants/hash_values.py
@@ -66,15 +66,15 @@
"transcriptformer/tf_metazoa/vocabs/assay_vocab.json": "0c405e4ead45a4b8350d8e874f834273efdd3f7ad4669b52d3e3727fb4fe70af",
"transcriptformer/tf_metazoa/vocabs/drosophila_melanogaster_gene.h5": "fa3915cf36ed457a162719b2172367a1066cde47677afb2a3b78bc1c51da0ac2",
"transcriptformer/tf_metazoa/vocabs/lytechinus_variegatus_gene.h5": "142c8bc30a1cc9efd3ff92e65346fead6b92a85d3f74ef5b7a3f57a4bfead676",
"transcriptformer/tf_metazoa/vocabs/plasmodium_falciparum_gene.h5": "40678c595883ce04a77cba372bc068b22e592b994a5ef0890e344cae68d08c1b",
"transcriptformer/tf_metazoa/vocabs/plasmodium_falciparum_gene.h5": "40678c595883ce04a77cba372bc068b22e592b994a5ef0890e344cae68d08c1b",
"transcriptformer/tf_metazoa/vocabs/xenopus_laevis_gene.h5": "df5526e1268f88608e38441623dedfa6dfe984cfb73565cb35e7817bbea9eb8c",
"transcriptformer/tf_metazoa/vocabs/caenorhabditis_elegans_gene.h5": "a18f234ce456614b1c6d1fb57f6d1ba031a71539c2c658b0dcbb87af4e7efb64",
"transcriptformer/tf_metazoa/vocabs/gallus_gallus_gene.h5": "17873277d968735bb8f3bcb0d6f9e257db1416a6918d48e7a8e83091f4651c3a",
"transcriptformer/tf_metazoa/vocabs/mus_musculus_gene.h5": "e63c25b8d2ad87d696d8f7dfe4bcb307eee8a7a69dec62a33bb8469dbd75e3e1",
"transcriptformer/tf_metazoa/vocabs/caenorhabditis_elegans_gene.h5": "a18f234ce456614b1c6d1fb57f6d1ba031a71539c2c658b0dcbb87af4e7efb64",
"transcriptformer/tf_metazoa/vocabs/gallus_gallus_gene.h5": "17873277d968735bb8f3bcb0d6f9e257db1416a6918d48e7a8e83091f4651c3a",
"transcriptformer/tf_metazoa/vocabs/mus_musculus_gene.h5": "e63c25b8d2ad87d696d8f7dfe4bcb307eee8a7a69dec62a33bb8469dbd75e3e1",
"transcriptformer/tf_metazoa/vocabs/saccharomyces_cerevisiae_gene.h5": "4685470b3d8bb5bf82aded19b1b208293d4b9cae63ffb6bf42d7ad6be3f3f299",
"transcriptformer/tf_metazoa/vocabs/danio_rerio_gene.h5": "e2b34013c1a0779361b3aaac05cdba5062cb117ce903779232a43f73a99c78e3",
"transcriptformer/tf_metazoa/vocabs/homo_sapiens_gene.h5": "74366d4f45c4bd60983c3d1d1c406d7d58d30a798455c239eb2691eaa162e2dc",
"transcriptformer/tf_metazoa/vocabs/oryctolagus_cuniculus_gene.h5": "0fbe3ca92a36f58134491723cee0834fe06e4106bf0c3b17e1f65ea884abf8a2",
"transcriptformer/tf_metazoa/vocabs/danio_rerio_gene.h5": "e2b34013c1a0779361b3aaac05cdba5062cb117ce903779232a43f73a99c78e3",
"transcriptformer/tf_metazoa/vocabs/homo_sapiens_gene.h5": "74366d4f45c4bd60983c3d1d1c406d7d58d30a798455c239eb2691eaa162e2dc",
"transcriptformer/tf_metazoa/vocabs/oryctolagus_cuniculus_gene.h5": "0fbe3ca92a36f58134491723cee0834fe06e4106bf0c3b17e1f65ea884abf8a2",
"transcriptformer/tf_metazoa/vocabs/spongilla_lacustris_gene.h5": "e082a6bccbcbf89d5f04b97c5f39402765897259becb6b650a2dfd00ea4d3afc",
"transcriptformer/tf_exemplar/config.json": "695c50832a608078bc3b136a0776f8d795532369bc249a83c07db2b28bb97b24",
"transcriptformer/tf_exemplar/model_weights.pt": "1adbb8671c00ea9d66ac0a1be8261c7e96824eaf8435ae17b048570372d4f8a1",
@@ -83,5 +83,7 @@
"transcriptformer/tf_exemplar/vocabs/danio_rerio_gene.h5": "e2b34013c1a0779361b3aaac05cdba5062cb117ce903779232a43f73a99c78e3",
"transcriptformer/tf_exemplar/vocabs/drosophila_melanogaster_gene.h5": "fa3915cf36ed457a162719b2172367a1066cde47677afb2a3b78bc1c51da0ac2",
"transcriptformer/tf_exemplar/vocabs/homo_sapiens_gene.h5": "74366d4f45c4bd60983c3d1d1c406d7d58d30a798455c239eb2691eaa162e2dc",
"transcriptformer/tf_exemplar/vocabs/mus_musculus_gene.h5": "e63c25b8d2ad87d696d8f7dfe4bcb307eee8a7a69dec62a33bb8469dbd75e3e1",
"transcriptformer/tf_exemplar/vocabs/mus_musculus_gene.h5": "e63c25b8d2ad87d696d8f7dfe4bcb307eee8a7a69dec62a33bb8469dbd75e3e1",
"text_embeddings/gene_prot_embeddings_granite.json": "8fb51d28986fa24e4b7b63bc28f86789db184f902259838cf7f7aa9ae4942ffd",
"mrna_embeddings/sequences_helix_genes_cds_go.json": "364c744b845572b6900820f031c88a24a9ed5b12d2b9c49fa9e5855b1b7abdbc",
}
2 changes: 1 addition & 1 deletion helical/models/evo_2/README.md
@@ -101,7 +101,7 @@ export CPLUS_INCLUDE_PATH=$CONDA_PREFIX/lib/python3.11/site-packages/nvidia/nvtx
pip install torch==2.6.0
pip install "helical[evo-2]@git+https://github.com/helicalAI/helical.git"

git clone git@github.com:Zymrael/vortex.git
git clone https://github.com/Zymrael/vortex.git
cd vortex
git checkout f243e8e
sed -i 's/torch==2.5.1/torch==2.6.0/g' pyproject.toml
10 changes: 10 additions & 0 deletions helical/models/geneformer/geneformer_utils.py
@@ -154,10 +154,12 @@ def get_embs(
eos_present,
device,
silent=False,
output_attentions=False,
):
model_input_size = get_model_input_size(model)
total_batch_length = len(filtered_input_data)
embs_list = []
attn_list = []

_check_for_expected_special_tokens(
filtered_input_data, emb_mode, cls_present, eos_present, gene_token_dict
@@ -182,9 +184,15 @@
outputs = model(
input_ids=input_data_minibatch,
attention_mask=gen_attention_mask(minibatch),
output_attentions=output_attentions,
)

embs_i = outputs.hidden_states[layer_to_quant]
# attention of size (batch_size, num_heads, sequence_length, sequence_length)
if output_attentions:
attn_i = outputs.attentions[layer_to_quant]
# attn_i = torch.mean(attn_i, dim=1).cpu().numpy() # average over heads
attn_list.extend(attn_i.cpu().numpy())

embs_list.extend(
_compute_embeddings_depending_on_mode(
@@ -207,6 +215,8 @@
if emb_mode != "gene":
embs_list = np.array(embs_list)

if output_attentions:
return embs_list, np.array(attn_list)
return embs_list


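A note on consuming the new attention output: the in-loop head averaging above is left commented out, so callers get the full per-head maps back and can reduce them downstream. A minimal sketch, assuming only the `(batch_size, num_heads, seq_len, seq_len)` layout noted in the inline comment (the dimensions below are placeholders):

```python
import numpy as np

# Placeholder for the attention weights returned with output_attentions=True;
# assumed shape: (n_cells, num_heads, seq_len, seq_len).
attention_weights = np.random.rand(4, 8, 16, 16).astype(np.float32)

# Collapse the head dimension to get one (seq_len, seq_len) map per cell.
mean_attention = attention_weights.mean(axis=1)
print(mean_attention.shape)  # (4, 16, 16)
```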
27 changes: 18 additions & 9 deletions helical/models/geneformer/model.py
@@ -95,16 +95,15 @@ def __init__(self, configurer: GeneformerConfig = default_configurer) -> None:
available_layers = list(np.arange(num_available_layers))
available_layers.append(-1)
if self.config["emb_layer"] not in available_layers:
message = (
f"Layer {self.config['emb_layer']} is not available. Available layers are {available_layers}."
)
message = f"Layer {self.config['emb_layer']} is not available. Available layers are {available_layers}."
LOGGER.error(message)
raise ValueError(message)

if self.config["emb_layer"] < (num_available_layers-1) and self.config["emb_layer"] != -1:
message = (
f"Layer {self.config['emb_layer']} is not the last layer. This can lead to different embeddings and fine-tuning results."
)

if (
self.config["emb_layer"] < (num_available_layers - 1)
and self.config["emb_layer"] != -1
):
message = f"Layer {self.config['emb_layer']} is not the last layer. This can lead to different embeddings and fine-tuning results."
LOGGER.warning(message)

self.layer_to_quant = self.config["emb_layer"]
@@ -203,18 +202,27 @@ def process_data(
LOGGER.info(f"Successfully processed the data for Geneformer.")
return tokenized_dataset

def get_embeddings(self, dataset: Dataset) -> np.array:
def get_embeddings(
self, dataset: Dataset, output_attentions: bool = False
) -> np.array:
"""Gets the gene embeddings from the Geneformer model

Parameters
----------
dataset : Dataset
The tokenized dataset containing the processed data
output_attentions : bool, optional, default=False
Whether to output attention weights from the model. This is useful for debugging or analysis.
The attention maps come from the same layer as the embeddings.

Returns
-------
np.array
The gene embeddings in the form of a numpy array
np.array, optional
The attention weights from the model, if `output_attentions` is set to True.
The shape of the attention weights is (batch_size, num_heads, seq_length, seq_length).
If `output_attentions` is False, this will not be returned.
"""
LOGGER.info(f"Started getting embeddings:")
embeddings = get_embs(
Expand All @@ -229,6 +237,7 @@ def get_embeddings(self, dataset: Dataset) -> np.array:
self.cls_present,
self.eos_present,
self.device,
output_attentions=output_attentions,
)

LOGGER.info(f"Finished getting embeddings.")
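For completeness, a minimal usage sketch of the two call forms documented above. It mirrors `examples/run_models/run_geneformer.py` earlier in this diff; the import paths and printed shapes follow the Helical examples and docstring and should be read as assumptions, not verified output:

```python
import anndata as ad
from helical.models.geneformer import Geneformer, GeneformerConfig

geneformer = Geneformer(configurer=GeneformerConfig())
ann_data = ad.read_h5ad("./yolksac_human.h5ad")  # same local file as the run script
dataset = geneformer.process_data(ann_data[:10, :100])

embeddings = geneformer.get_embeddings(dataset)  # embeddings only, unchanged behaviour
embeddings, attentions = geneformer.get_embeddings(dataset, output_attentions=True)
print(attentions.shape)  # (batch_size, num_heads, seq_length, seq_length) per the docstring
```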
2 changes: 1 addition & 1 deletion helical/models/helix_mrna/model.py
@@ -94,8 +94,8 @@ def process_data(self, sequences: Union[list[str], DataFrame]) -> Dataset:
tokenized_sequences = self.tokenizer(
sequences,
return_tensors="pt",
padding="max_length",
truncation=True,
padding="longest",
max_length=self.config["input_size"],
return_special_tokens_mask=True,
)
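For context on the padding change above: `padding="longest"` pads each batch only to its longest sequence instead of always padding to the model's full input size, while `max_length` still caps the sequence length. A minimal sketch of the difference with a generic Hugging Face tokenizer (the checkpoint name is purely illustrative, not the tokenizer Helix-mRNA uses):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative checkpoint
seqs = ["a short sequence", "an even slightly longer input sequence"]

fixed = tok(seqs, return_tensors="pt", padding="max_length", truncation=True, max_length=64)
dynamic = tok(seqs, return_tensors="pt", padding="longest", truncation=True, max_length=64)

print(fixed["input_ids"].shape)    # (2, 64): always padded to max_length
print(dynamic["input_ids"].shape)  # (2, n_longest): padded only to the longest sequence in the batch
```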
50 changes: 38 additions & 12 deletions helical/models/scgpt/model.py
@@ -86,20 +86,28 @@ def __init__(self, configurer: scGPTConfig = configurer) -> None:
f"'scGPT' model is in '{mode}' mode, on device '{next(self.model.parameters()).device.type}' with embedding mode '{self.config['emb_mode']}'."
)

def get_embeddings(self, dataset: Dataset) -> np.array:
def get_embeddings(
self, dataset: Dataset, output_attentions: bool = False
) -> np.array:
"""Gets the gene embeddings

Parameters
----------
dataset : Dataset
The processed dataset to get the embeddings from.
output_attentions : bool, optional, default=False
Whether to output the attention maps from the model. If set to True, the attention maps are returned along with the embeddings.
If set to False, only the embeddings are returned. **Note**: returning attention maps increases memory usage significantly, so enable this only if you need them.

Returns
-------
np.ndarray | List[pd.Series]
The embeddings produced by the model.
The return type depends on the `emb_mode` parameter in the configuration.
If `emb_mode` is set to "gene", the embeddings are returned as a list of pd.Series which contain a mapping of gene_name:embedding for each cell.

np.ndarray
If `output_attentions` is set to True, the attention maps will be returned as a numpy array of shape (n_layers, n_heads, n_cells, n_tokens, n_tokens).
"""
LOGGER.info(f"Started getting embeddings:")

@@ -136,6 +144,7 @@ def get_embeddings(self, dataset: Dataset) -> np.array:
device = next(self.model.parameters()).device

resulting_embeddings = []
resulting_attn_maps = []

with (
torch.no_grad(),
@@ -147,16 +156,30 @@
src_key_padding_mask = input_gene_ids.eq(
self.vocab[self.config["pad_token"]]
)
embeddings = self.model._encode(
input_gene_ids,
data_dict["expr"].to(device),
src_key_padding_mask=src_key_padding_mask,
batch_labels=(
data_dict["batch_labels"].to(device)
if use_batch_labels
else None
),
)
if output_attentions:
embeddings, attn_maps = self.model._encode(
input_gene_ids,
data_dict["expr"].to(device),
src_key_padding_mask=src_key_padding_mask,
batch_labels=(
data_dict["batch_labels"].to(device)
if use_batch_labels
else None
),
output_attentions=output_attentions,
)
resulting_attn_maps.extend(attn_maps)
else:
embeddings = self.model._encode(
input_gene_ids,
data_dict["expr"].to(device),
src_key_padding_mask=src_key_padding_mask,
batch_labels=(
data_dict["batch_labels"].to(device)
if use_batch_labels
else None
),
)

resulting_embeddings.extend(
self._compute_embeddings_depending_on_mode(embeddings, data_dict)
@@ -165,7 +188,10 @@
resulting_embeddings = self._normalize_embeddings(resulting_embeddings)

LOGGER.info(f"Finished getting embeddings.")
return resulting_embeddings
if output_attentions:
return resulting_embeddings, torch.stack(resulting_attn_maps).cpu().numpy()
else:
return resulting_embeddings

def _normalize_embeddings(self, resulting_embeddings: torch.tensor) -> np.ndarray:
"""
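Given the memory warning in the docstring above, callers will typically want to reduce the returned attention maps right away. A minimal sketch, assuming the `(n_layers, n_heads, n_cells, n_tokens, n_tokens)` layout stated in the docstring (all dimensions below are placeholders):

```python
import numpy as np

# Placeholder for the array returned by scGPT.get_embeddings(..., output_attentions=True).
attn_maps = np.random.rand(2, 4, 8, 16, 16).astype(np.float32)

last_layer = attn_maps[-1]          # keep only the last layer: (n_heads, n_cells, n_tokens, n_tokens)
head_avg = last_layer.mean(axis=0)  # average over heads: (n_cells, n_tokens, n_tokens)
print(head_avg.shape)               # (8, 16, 16)
```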
22 changes: 17 additions & 5 deletions helical/models/scgpt/model_dir/model.py
@@ -7,7 +7,10 @@
from torch import nn, Tensor
import torch.distributed as dist
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer

# from torch.nn import TransformerEncoder, TransformerEncoderLayer
### Needed a custom version to output attention weights
from .transformer import TransformerEncoder, TransformerEncoderLayer
from torch.distributions import Bernoulli
from tqdm import trange

@@ -170,6 +173,7 @@ def _encode(
values: Tensor,
src_key_padding_mask: Tensor,
batch_labels: Optional[Tensor] = None, # (batch,)
output_attentions: bool = False,
) -> Tensor:
self._check_batch_labels(batch_labels)

@@ -191,10 +195,18 @@
elif getattr(self, "bn", None) is not None:
total_embs = self.bn(total_embs.permute(0, 2, 1)).permute(0, 2, 1)

output = self.transformer_encoder(
total_embs, src_key_padding_mask=src_key_padding_mask
)
return output # (batch, seq_len, embsize)
if output_attentions:
output, attn_maps = self.transformer_encoder(
total_embs,
src_key_padding_mask=src_key_padding_mask,
output_attentions=output_attentions,
)
return output, attn_maps
else:
output = self.transformer_encoder(
total_embs, src_key_padding_mask=src_key_padding_mask
)
return output # (batch, seq_len, embsize)

def _get_cell_emb_from_layer(
self, layer_output: Tensor, weights: Tensor = None