diff --git a/benchmarks/ViT/config.yml b/benchmarks/ViT/config.yml new file mode 100644 index 0000000..dd2a49f --- /dev/null +++ b/benchmarks/ViT/config.yml @@ -0,0 +1,4 @@ +benchmark: + maximize_score: True +hardware: + needs_gpu: True diff --git a/benchmarks/ViT/environment.yml b/benchmarks/ViT/environment.yml new file mode 100644 index 0000000..f4c768e --- /dev/null +++ b/benchmarks/ViT/environment.yml @@ -0,0 +1,12 @@ +name: vit_env +channels: + - conda-forge + - pytorch + - nvidia +dependencies: + - python=3.9 + - pytorch + - pytorch-lightning + - transformers + - datasets + - torchvision diff --git a/benchmarks/ViT/run.sh b/benchmarks/ViT/run.sh new file mode 100755 index 0000000..14dffa6 --- /dev/null +++ b/benchmarks/ViT/run.sh @@ -0,0 +1,3 @@ +python vit.py + +rm -rf ckpts/ data/ diff --git a/benchmarks/ViT/vit.py b/benchmarks/ViT/vit.py new file mode 100644 index 0000000..70a56f7 --- /dev/null +++ b/benchmarks/ViT/vit.py @@ -0,0 +1,261 @@ +# Code taken from https://lightning.ai/docs/pytorch/stable/notebooks/course_UvA-DL/11-vision-transformer.html + +import os +import time +import urllib.request +from urllib.error import HTTPError +import shutil +from argparse import ArgumentParser + +import pytorch_lightning as L +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data as data +import torchvision +from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint +from torchvision import transforms +from torchvision.datasets import CIFAR10 + +test_transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize([0.49139968, 0.48215841, 0.44653091], [0.24703223, 0.24348513, 0.26158784]), + ] +) +# For training, we add some augmentation. Networks are too powerful and would overfit. 
+train_transform = transforms.Compose( + [ + transforms.RandomHorizontalFlip(), + transforms.RandomResizedCrop((32, 32), scale=(0.8, 1.0), ratio=(0.9, 1.1)), + transforms.ToTensor(), + transforms.Normalize([0.49139968, 0.48215841, 0.44653091], [0.24703223, 0.24348513, 0.26158784]), + ] +) + + +def img_to_patch(x, patch_size, flatten_channels=True): + """ + Inputs: + x - Tensor representing the image of shape [B, C, H, W] + patch_size - Number of pixels per dimension of the patches (integer) + flatten_channels - If True, the patches will be returned in a flattened format + as a feature vector instead of a image grid. + """ + B, C, H, W = x.shape + x = x.reshape(B, C, H // patch_size, patch_size, W // patch_size, patch_size) + x = x.permute(0, 2, 4, 1, 3, 5) # [B, H', W', C, p_H, p_W] + x = x.flatten(1, 2) # [B, H'*W', C, p_H, p_W] + if flatten_channels: + x = x.flatten(2, 4) # [B, H'*W', C*p_H*p_W] + return x + +class AttentionBlock(nn.Module): + def __init__(self, embed_dim, hidden_dim, num_heads, dropout=0.0): + """ + Inputs: + embed_dim - Dimensionality of input and attention feature vectors + hidden_dim - Dimensionality of hidden layer in feed-forward network + (usually 2-4x larger than embed_dim) + num_heads - Number of heads to use in the Multi-Head Attention block + dropout - Amount of dropout to apply in the feed-forward network + """ + super().__init__() + + self.layer_norm_1 = nn.LayerNorm(embed_dim) + self.attn = nn.MultiheadAttention(embed_dim, num_heads) + self.layer_norm_2 = nn.LayerNorm(embed_dim) + self.linear = nn.Sequential( + nn.Linear(embed_dim, hidden_dim), + nn.GELU(), + nn.Dropout(dropout), + nn.Linear(hidden_dim, embed_dim), + nn.Dropout(dropout), + ) + + def forward(self, x): + inp_x = self.layer_norm_1(x) + x = x + self.attn(inp_x, inp_x, inp_x)[0] + x = x + self.linear(self.layer_norm_2(x)) + return x + +class VisionTransformer(nn.Module): + def __init__( + self, + embed_dim, + hidden_dim, + num_channels, + num_heads, + num_layers, + 
num_classes, + patch_size, + num_patches, + dropout=0.0, + ): + """ + Inputs: + embed_dim - Dimensionality of the input feature vectors to the Transformer + hidden_dim - Dimensionality of the hidden layer in the feed-forward networks + within the Transformer + num_channels - Number of channels of the input (3 for RGB) + num_heads - Number of heads to use in the Multi-Head Attention block + num_layers - Number of layers to use in the Transformer + num_classes - Number of classes to predict + patch_size - Number of pixels that the patches have per dimension + num_patches - Maximum number of patches an image can have + dropout - Amount of dropout to apply in the feed-forward network and + on the input encoding + """ + super().__init__() + + self.patch_size = patch_size + + # Layers/Networks + self.input_layer = nn.Linear(num_channels * (patch_size**2), embed_dim) + self.transformer = nn.Sequential( + *(AttentionBlock(embed_dim, hidden_dim, num_heads, dropout=dropout) for _ in range(num_layers)) + ) + self.mlp_head = nn.Sequential(nn.LayerNorm(embed_dim), nn.Linear(embed_dim, num_classes)) + self.dropout = nn.Dropout(dropout) + + # Parameters/Embeddings + self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim)) + self.pos_embedding = nn.Parameter(torch.randn(1, 1 + num_patches, embed_dim)) + + def forward(self, x): + # Preprocess input + x = img_to_patch(x, self.patch_size) + B, T, _ = x.shape + x = self.input_layer(x) + + # Add CLS token and positional encoding + cls_token = self.cls_token.repeat(B, 1, 1) + x = torch.cat([cls_token, x], dim=1) + x = x + self.pos_embedding[:, : T + 1] + + # Apply Transforrmer + x = self.dropout(x) + x = x.transpose(0, 1) + x = self.transformer(x) + + # Perform classification prediction + cls = x[0] + out = self.mlp_head(cls) + return out + +class ViT(L.LightningModule): + def __init__(self, model_kwargs, lr): + super().__init__() + self.save_hyperparameters() + self.model = VisionTransformer(**model_kwargs) + 
self.example_input_array = next(iter(train_loader))[0] + + def forward(self, x): + return self.model(x) + + def configure_optimizers(self): + optimizer = optim.AdamW(self.parameters(), lr=self.hparams.lr) + lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 150], gamma=0.1) + return [optimizer], [lr_scheduler] + + def _calculate_loss(self, batch, mode="train"): + imgs, labels = batch + preds = self.model(imgs) + loss = F.cross_entropy(preds, labels) + acc = (preds.argmax(dim=-1) == labels).float().mean() + + self.log("%s_loss" % mode, loss) + self.log("%s_acc" % mode, acc) + return loss + + def training_step(self, batch, batch_idx): + loss = self._calculate_loss(batch, mode="train") + return loss + + def validation_step(self, batch, batch_idx): + self._calculate_loss(batch, mode="val") + + def test_step(self, batch, batch_idx): + self._calculate_loss(batch, mode="test") + +def train_model(num_gpus, **kwargs): + trainer = L.Trainer( + # default_root_dir=os.path.join(CHECKPOINT_PATH, "ViT"), + accelerator="auto", + devices=num_gpus, + max_epochs=10, + callbacks=[ + ModelCheckpoint(dirpath="ckpts/", + save_weights_only=True, mode="max", monitor="val_acc"), + ], + logger=False + ) + + L.seed_everything(42) # To be reproducable + model = ViT(**kwargs) + start_time = time.perf_counter() + trainer.fit(model, train_loader, val_loader) + end_time = time.perf_counter() + training_time = end_time - start_time + + # Load best checkpoint after training + model = ViT.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) + + print(f"score: {1./training_time:f}") + + return model + + +if __name__=="__main__": + if torch.cuda.is_available(): + num_gpus = torch.cuda.device_count() + else: + print("No GPU available") + print(f"score: {0.01:f}") + exit(1) + + # Path to the folder where the datasets are/should be downloaded (e.g. 
CIFAR10) + DATASET_PATH = os.environ.get("PATH_DATASETS", "data/") + # Path to the folder where the pretrained models are saved + CHECKPOINT_PATH = os.environ.get("PATH_CHECKPOINT", "saved_models/VisionTransformers/") + + # Setting the seed + L.seed_everything(42) + torch.backends.cudnn.deterministic = True  # Ensure that all operations are deterministic on GPU (if used) for reproducibility + torch.backends.cudnn.benchmark = False + + # Github URL where saved models are stored for this tutorial + base_url = "https://raw.githubusercontent.com/phlippe/saved_models/main/" + + # Loading the training dataset. We need to split it into a training and validation part + # We need to do a little trick because the validation set should not use the augmentation. + train_dataset = CIFAR10(root=DATASET_PATH, train=True, transform=train_transform, download=True) + val_dataset = CIFAR10(root=DATASET_PATH, train=True, transform=test_transform, download=True) + L.seed_everything(42) + train_set, _ = torch.utils.data.random_split(train_dataset, [45000, 5000]) + L.seed_everything(42) + _, val_set = torch.utils.data.random_split(val_dataset, [45000, 5000]) + + # Loading the test set + test_set = CIFAR10(root=DATASET_PATH, train=False, transform=test_transform, download=True) + + # We define a set of data loaders that we can use for various purposes later. + train_loader = data.DataLoader(train_set, batch_size=128, shuffle=True, drop_last=True, pin_memory=True, num_workers=4) + val_loader = data.DataLoader(val_set, batch_size=128, shuffle=False, drop_last=False, num_workers=4) + + model = train_model( + num_gpus=num_gpus, + model_kwargs={ + "embed_dim": 256, + "hidden_dim": 512, + "num_heads": 8, + "num_layers": 6, + "patch_size": 4, + "num_channels": 3, + "num_patches": 64, + "num_classes": 10, + "dropout": 0.2, + }, + lr=3e-4, + )