helicalAI · bputzeys · Feb 19, 2025 · Feb 5, 2025 · Feb 10, 2025 · Feb 10, 2025
diff --git a/README.md b/README.md
@@ -120,6 +120,7 @@ Within the `examples/notebooks` folder, open the notebook of your choice. We rec
 |[Cell-Type-Annotation.ipynb](./examples/notebooks/Cell-Type-Annotation.ipynb)|An example how to do probing with scGPT by training a neural network to predict cell type annotations.|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/Cell-Type-Annotation.ipynb) |
 |[Cell-Type-Classification-Fine-Tuning.ipynb](./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb)|An example how to fine-tune different models on classification tasks.|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb) |
 |[HyenaDNA-Fine-Tuning.ipynb](./examples/notebooks/HyenaDNA-Fine-Tuning.ipynb)|An example of how to fine-tune the HyenaDNA model on downstream benchmarks.|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/HyenaDNA-Fine-Tuning.ipynb) |
+|[Cell-Gene-Cls-embedding-generation.ipynb](./examples/notebooks/Cell-Gene-Cls-embedding-generation.ipynb)|A notebook explaining the different embedding modes of single cell RNA models.|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/Cell-Gene-Cls-embedding-generation.ipynb) |
 | Coming Soon | New models such as SCimilarity, scVI; benchmarking scripts; new use cases; others |
 
 ## Stuck somewhere ? Other ideas ?
@@ -179,4 +180,3 @@ Please use this BibTeX to cite this repository in your publications:
   url          = {https://doi.org/10.5281/zenodo.13135902}
 }
 ```
-
diff --git a/docs/index.md b/docs/index.md
@@ -98,6 +98,7 @@ Within the `example/notebooks` folder, open the notebook of your choice. We reco
 |[Cell-Type-Annotation.ipynb](./notebooks/Cell-Type-Annotation.ipynb)|An example how to do probing with scGPT by training a neural network to predict cell type annotations.|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/Cell-Type-Annotation.ipynb) |
 |[Cell-Type-Classification-Fine-Tuning.ipynb](./notebooks/Cell-Type-Classification-Fine-Tuning.ipynb)|An example how to fine-tune different models on classification tasks.|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb) |
 |[HyenaDNA-Fine-Tuning.ipynb](./notebooks/HyenaDNA-Fine-Tuning.ipynb)|An example of how to fine-tune the HyenaDNA model on downstream benchmarks.|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/HyenaDNA-Fine-Tuning.ipynb) |
+|[Cell-Gene-Cls-embedding-generation.ipynb](./notebooks/Cell-Gene-Cls-embedding-generation.ipynb)|A notebook explaining the different embedding modes of single cell RNA models.|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/Cell-Gene-Cls-embedding-generation.ipynb) |
 | Coming Soon | New models such as SCimilarity, scVI; benchmarking scripts; new use cases; others |
 
 ## Stuck somewhere ? Other ideas ?

diff --git a/docs/model_cards/geneformer.md b/docs/model_cards/geneformer.md
@@ -192,16 +192,19 @@ import anndata as ad
 
 # Example configuration
 model_config = GeneformerConfig(model_name="gf-12L-95M-i4096", batch_size=10)
-geneformer = Geneformer(model_config=model_config)
+geneformer_v2 = Geneformer(model_config)
 
 # Example usage for base pretrained model
-ann_data = ad.read_h5ad("general_dataset.h5ad")
+ann_data = ad.read_h5ad("anndata_file.h5ad")
 dataset = geneformer_v2.process_data(ann_data)
 embeddings = geneformer_v2.get_embeddings(dataset)
 print("Base model embeddings shape:", embeddings.shape)
 
 # Example usage for cancer-tuned model
-cancer_ann_data = ad.read_h5ad("cancer_dataset.h5ad")
+model_config_cancer = GeneformerConfig(model_name="gf-12L-95M-i4096-CLcancer", batch_size=10)
+geneformer_v2_cancer = Geneformer(model_config_cancer)
+
+cancer_ann_data = ad.read_h5ad("anndata_file.h5ad")
 cancer_dataset = geneformer_v2_cancer.process_data(cancer_ann_data)
 cancer_embeddings = geneformer_v2_cancer.get_embeddings(cancer_dataset)
 print("Cancer-tuned model embeddings shape:", cancer_embeddings.shape)
@@ -211,43 +214,47 @@ print("Cancer-tuned model embeddings shape:", cancer_embeddings.shape)
 
 ```python
 from helical import GeneformerConfig, GeneformerFineTuningModel
+import anndata as ad
 
-# Prepare the data
-ann_data = ad.read_h5ad("dataset.h5ad")
-
-# Get the desired label class
-cell_types = list(ann_data.obs.cell_type)
+# Load the data
+ann_data = ad.read_h5ad("yolksac_human.h5ad")
 
-# Create a dictionary mapping the classes to unique integers for training
+# Get the column for fine-tuning
+cell_types = list(ann_data.obs["cell_types"])
 label_set = set(cell_types)
-class_id_dict = dict(zip(label_set, [i for i in range(len(label_set))]))
 
-for i in range(len(cell_types)):
-    cell_types[i] = class_id_dict[cell_types[i]]
+# Create a GeneformerConfig object
+geneformer_config = GeneformerConfig(model_name="gf-12L-95M-i4096", batch_size=10)
+
+# Create a GeneformerFineTuningModel object
+geneformer_fine_tune = GeneformerFineTuningModel(geneformer_config=geneformer_config, fine_tuning_head="classification", output_size=len(label_set))
 
-# Add this column to the Dataset
+# Process the data
+dataset = geneformer_fine_tune.process_data(ann_data)
+
+# Add column to the dataset
 dataset = dataset.add_column('cell_types', cell_types)
 
-# Create the fine-tuning model
-model_config = GeneformerConfig(model_name="gf-12L-95M-i4096", batch_size=10)
-geneformer_fine_tune = GeneformerFineTuningModel(
-    geneformer_config=model_config, 
-    fine_tuning_head="classification", 
-    label="cell_types", 
-    output_size=len(label_set)
-)
+# Create a dictionary to map cell types to ids
+class_id_dict = dict(zip(label_set, [i for i in range(len(label_set))]))
+
+def classes_to_ids(example):
+    example["cell_types"] = class_id_dict[example["cell_types"]]
+    return example
 
-# Process the data for training
-dataset = geneformer_fine_tune.process_data(ann_data)
+# Convert cell types to ids
+dataset = dataset.map(classes_to_ids, num_proc=1)
 
-# Fine-tune
-geneformer_fine_tune.train(train_dataset=dataset)
+# Fine-tune the model
+geneformer_fine_tune.train(train_dataset=dataset, label="cell_types")
 
-# Get outputs of the fine-tuned model
+# Get logits from the fine-tuned model
 outputs = geneformer_fine_tune.get_outputs(dataset)
+print(outputs[:10])
 
-# Get the embeddings of the fine-tuned model
+# Get embeddings from the fine-tuned model
 embeddings = geneformer_fine_tune.get_embeddings(dataset)
+print(embeddings[:10])
 ```
 
 ## Contact

diff --git a/docs/model_cards/helix_mrna.md b/docs/model_cards/helix_mrna.md
@@ -101,7 +101,7 @@ import torch
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-helix_mrna_config = HelimRNAConfig(batch_size=5, max_length=100, device=device)
+helix_mrna_config = HelixmRNAConfig(batch_size=5, max_length=100, device=device)
 helix_mrna = HelixmRNA(configurer=helix_mrna_config)
 
 rna_sequences = ["EACUEGGG", "EACUEGGG", "EACUEGGG", "EACUEGGG", "EACUEGGG"]

diff --git a/examples/fine_tune_models/fine_tune_geneformer.py b/examples/fine_tune_models/fine_tune_geneformer.py
@@ -32,7 +32,7 @@ def classes_to_ids(example):
 
     dataset = dataset.map(classes_to_ids, num_proc=1)
 
-    geneformer_fine_tune.train(train_dataset=dataset)
+    geneformer_fine_tune.train(train_dataset=dataset, label="cell_types")
 
     outputs = geneformer_fine_tune.get_outputs(dataset)
     print(outputs)