diff --git a/benchmarks/ViT/config.yml b/benchmarks/ViT/config.yml new file mode 100644 index 0000000..dd2a49f --- /dev/null +++ b/benchmarks/ViT/config.yml @@ -0,0 +1,4 @@ +benchmark: + maximize_score: True +hardware: + needs_gpu: True diff --git a/benchmarks/ViT/environment.yml b/benchmarks/ViT/environment.yml new file mode 100644 index 0000000..f4c768e --- /dev/null +++ b/benchmarks/ViT/environment.yml @@ -0,0 +1,12 @@ +name: vit_env +channels: + - conda-forge + - pytorch + - nvidia +dependencies: + - python=3.9 + - pytorch + - pytorch-lightning + - transformers + - datasets + - torchvision diff --git a/benchmarks/ViT/run.sh b/benchmarks/ViT/run.sh new file mode 100755 index 0000000..14dffa6 --- /dev/null +++ b/benchmarks/ViT/run.sh @@ -0,0 +1,3 @@ +python vit.py + +rm -rf ckpts/ data/ diff --git a/benchmarks/ViT/vit.py b/benchmarks/ViT/vit.py new file mode 100644 index 0000000..70a56f7 --- /dev/null +++ b/benchmarks/ViT/vit.py @@ -0,0 +1,261 @@ +# Code taken from https://lightning.ai/docs/pytorch/stable/notebooks/course_UvA-DL/11-vision-transformer.html + +import os +import time +import urllib.request +from urllib.error import HTTPError +import shutil +from argparse import ArgumentParser + +import pytorch_lightning as L +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data as data +import torchvision +from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint +from torchvision import transforms +from torchvision.datasets import CIFAR10 + +test_transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize([0.49139968, 0.48215841, 0.44653091], [0.24703223, 0.24348513, 0.26158784]), + ] +) +# For training, we add some augmentation. Networks are too powerful and would overfit. 
+train_transform = transforms.Compose( + [ + transforms.RandomHorizontalFlip(), + transforms.RandomResizedCrop((32, 32), scale=(0.8, 1.0), ratio=(0.9, 1.1)), + transforms.ToTensor(), + transforms.Normalize([0.49139968, 0.48215841, 0.44653091], [0.24703223, 0.24348513, 0.26158784]), + ] +) + + +def img_to_patch(x, patch_size, flatten_channels=True): + """ + Inputs: + x - Tensor representing the image of shape [B, C, H, W] + patch_size - Number of pixels per dimension of the patches (integer) + flatten_channels - If True, the patches will be returned in a flattened format + as a feature vector instead of a image grid. + """ + B, C, H, W = x.shape + x = x.reshape(B, C, H // patch_size, patch_size, W // patch_size, patch_size) + x = x.permute(0, 2, 4, 1, 3, 5) # [B, H', W', C, p_H, p_W] + x = x.flatten(1, 2) # [B, H'*W', C, p_H, p_W] + if flatten_channels: + x = x.flatten(2, 4) # [B, H'*W', C*p_H*p_W] + return x + +class AttentionBlock(nn.Module): + def __init__(self, embed_dim, hidden_dim, num_heads, dropout=0.0): + """ + Inputs: + embed_dim - Dimensionality of input and attention feature vectors + hidden_dim - Dimensionality of hidden layer in feed-forward network + (usually 2-4x larger than embed_dim) + num_heads - Number of heads to use in the Multi-Head Attention block + dropout - Amount of dropout to apply in the feed-forward network + """ + super().__init__() + + self.layer_norm_1 = nn.LayerNorm(embed_dim) + self.attn = nn.MultiheadAttention(embed_dim, num_heads) + self.layer_norm_2 = nn.LayerNorm(embed_dim) + self.linear = nn.Sequential( + nn.Linear(embed_dim, hidden_dim), + nn.GELU(), + nn.Dropout(dropout), + nn.Linear(hidden_dim, embed_dim), + nn.Dropout(dropout), + ) + + def forward(self, x): + inp_x = self.layer_norm_1(x) + x = x + self.attn(inp_x, inp_x, inp_x)[0] + x = x + self.linear(self.layer_norm_2(x)) + return x + +class VisionTransformer(nn.Module): + def __init__( + self, + embed_dim, + hidden_dim, + num_channels, + num_heads, + num_layers, + 
num_classes, + patch_size, + num_patches, + dropout=0.0, + ): + """ + Inputs: + embed_dim - Dimensionality of the input feature vectors to the Transformer + hidden_dim - Dimensionality of the hidden layer in the feed-forward networks + within the Transformer + num_channels - Number of channels of the input (3 for RGB) + num_heads - Number of heads to use in the Multi-Head Attention block + num_layers - Number of layers to use in the Transformer + num_classes - Number of classes to predict + patch_size - Number of pixels that the patches have per dimension + num_patches - Maximum number of patches an image can have + dropout - Amount of dropout to apply in the feed-forward network and + on the input encoding + """ + super().__init__() + + self.patch_size = patch_size + + # Layers/Networks + self.input_layer = nn.Linear(num_channels * (patch_size**2), embed_dim) + self.transformer = nn.Sequential( + *(AttentionBlock(embed_dim, hidden_dim, num_heads, dropout=dropout) for _ in range(num_layers)) + ) + self.mlp_head = nn.Sequential(nn.LayerNorm(embed_dim), nn.Linear(embed_dim, num_classes)) + self.dropout = nn.Dropout(dropout) + + # Parameters/Embeddings + self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim)) + self.pos_embedding = nn.Parameter(torch.randn(1, 1 + num_patches, embed_dim)) + + def forward(self, x): + # Preprocess input + x = img_to_patch(x, self.patch_size) + B, T, _ = x.shape + x = self.input_layer(x) + + # Add CLS token and positional encoding + cls_token = self.cls_token.repeat(B, 1, 1) + x = torch.cat([cls_token, x], dim=1) + x = x + self.pos_embedding[:, : T + 1] + + # Apply Transforrmer + x = self.dropout(x) + x = x.transpose(0, 1) + x = self.transformer(x) + + # Perform classification prediction + cls = x[0] + out = self.mlp_head(cls) + return out + +class ViT(L.LightningModule): + def __init__(self, model_kwargs, lr): + super().__init__() + self.save_hyperparameters() + self.model = VisionTransformer(**model_kwargs) + 
self.example_input_array = next(iter(train_loader))[0] + + def forward(self, x): + return self.model(x) + + def configure_optimizers(self): + optimizer = optim.AdamW(self.parameters(), lr=self.hparams.lr) + lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 150], gamma=0.1) + return [optimizer], [lr_scheduler] + + def _calculate_loss(self, batch, mode="train"): + imgs, labels = batch + preds = self.model(imgs) + loss = F.cross_entropy(preds, labels) + acc = (preds.argmax(dim=-1) == labels).float().mean() + + self.log("%s_loss" % mode, loss) + self.log("%s_acc" % mode, acc) + return loss + + def training_step(self, batch, batch_idx): + loss = self._calculate_loss(batch, mode="train") + return loss + + def validation_step(self, batch, batch_idx): + self._calculate_loss(batch, mode="val") + + def test_step(self, batch, batch_idx): + self._calculate_loss(batch, mode="test") + +def train_model(num_gpus, **kwargs): + trainer = L.Trainer( + # default_root_dir=os.path.join(CHECKPOINT_PATH, "ViT"), + accelerator="auto", + devices=num_gpus, + max_epochs=10, + callbacks=[ + ModelCheckpoint(dirpath="ckpts/", + save_weights_only=True, mode="max", monitor="val_acc"), + ], + logger=False + ) + + L.seed_everything(42) # To be reproducable + model = ViT(**kwargs) + start_time = time.perf_counter() + trainer.fit(model, train_loader, val_loader) + end_time = time.perf_counter() + training_time = end_time - start_time + + # Load best checkpoint after training + model = ViT.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) + + print(f"score: {1./training_time:f}") + + return model + + +if __name__=="__main__": + if torch.cuda.is_available(): + num_gpus = torch.cuda.device_count() + else: + print("No GPU available") + print(f"score: {0.01:f}") + exit(1) + + # Path to the folder where the datasets are/should be downloaded (e.g. 
CIFAR10) + DATASET_PATH = os.environ.get("PATH_DATASETS", "data/") + # Path to the folder where the pretrained models are saved + CHECKPOINT_PATH = os.environ.get("PATH_CHECKPOINT", "saved_models/VisionTransformers/") + + # Setting the seed + L.seed_everything(42) + torch.backends.cudnn.deterministic = True  # Ensure that all operations are deterministic on GPU (if used) for reproducibility + torch.backends.cudnn.benchmark = False + + # Github URL where saved models are stored for this tutorial + base_url = "https://raw.githubusercontent.com/phlippe/saved_models/main/" + + # Loading the training dataset. We need to split it into a training and validation part + # We need to do a little trick because the validation set should not use the augmentation. + train_dataset = CIFAR10(root=DATASET_PATH, train=True, transform=train_transform, download=True) + val_dataset = CIFAR10(root=DATASET_PATH, train=True, transform=test_transform, download=True) + L.seed_everything(42) + train_set, _ = torch.utils.data.random_split(train_dataset, [45000, 5000]) + L.seed_everything(42) + _, val_set = torch.utils.data.random_split(val_dataset, [45000, 5000]) + + # Loading the test set + test_set = CIFAR10(root=DATASET_PATH, train=False, transform=test_transform, download=True) + + # We define a set of data loaders that we can use for various purposes later. + train_loader = data.DataLoader(train_set, batch_size=128, shuffle=True, drop_last=True, pin_memory=True, num_workers=4) + val_loader = data.DataLoader(val_set, batch_size=128, shuffle=False, drop_last=False, num_workers=4) + + model = train_model( + num_gpus=num_gpus, + model_kwargs={ + "embed_dim": 256, + "hidden_dim": 512, + "num_heads": 8, + "num_layers": 6, + "patch_size": 4, + "num_channels": 3, + "num_patches": 64, + "num_classes": 10, + "dropout": 0.2, + }, + lr=3e-4, + )