hbac/train.py at main · ryanirl/hbac · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
import argparse
import logging
import math
import os

from omegaconf import OmegaConf
from omegaconf import DictConfig

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import hbac.utils.trainer as T
import hbac.utils.cli as cli
from hbac.datasets.hms import HmsDataModule
from hbac.utils.logger import setup_logger
from hbac.registry import instantiate

from typing import Optional
from typing import Tuple
from typing import List
from typing import Any

logger = logging.getLogger("train")


class HbacUnimodalTask(T.Task):
    """Defines the core logic for how to train a unimodal model for the HMS-HBAC
    Kaggle competition. Because we are tasked with learning the distribution of
    expert annotator votes the KL-Divergence loss function is used.

    Also the distribution of the number of expert annotators is bimodal and can
    be classified into two groups. The first group (l10) is where there are
    stricly less than 10 expert annotators, these are less confident samples.
    The other group contains greater than or equal to 10 expert annotators,
    these are 'high quality' samples. That is why it's seperated out in the
    validation step.
    """
    def __init__(self) -> None:
        super().__init__()

        self.train_criterion = nn.KLDivLoss(reduction = "batchmean")
        self.valid_criterion = nn.KLDivLoss(reduction = "none")

    def training_step(self, model: nn.Module, batch: Tuple[Any]) -> torch.Tensor:
        """Code for training the model one step."""
        x = batch[0]
        y = batch[1]

        y_hat = model(x)
        loss = self.train_criterion(y_hat, y)

        self.log("train/loss", loss.item())

        return loss

    def validation_step(self, model: nn.Module, batch: Tuple[Any]) -> Any:
        """Code for evaluating the model one step."""
        x = batch[0]
        y = batch[1]
        count = batch[2]

        y_hat = model(x)
        loss = self.valid_criterion(y_hat, y).sum(axis = 1)

        # Split based on based >9 or <9.
        loss_all = loss.mean().item()
        loss_g10 = loss[count >= 9].mean().item()
        loss_l10 = loss[count < 9].mean().item()

        # Possible that there were no count less than or greater than 9.
        loss_g10 = loss_g10 if not math.isnan(loss_g10) else 0.0
        loss_l10 = loss_l10 if not math.isnan(loss_l10) else 0.0

        # Update the metrics
        self.log_dict({
            "val/loss_all": loss_all,
            "val/loss_g10": loss_g10,
            "val/loss_l10": loss_l10
        })


class HbacMultimodalTask(T.Task):
    """Defines the core logic for how to train the final mulitmodal model for
    the HMS-HBAC Kaggle competition. For more information on the training task,
    see the unimodal training task documentation.
    """
    def __init__(self) -> None:
        super().__init__()

        self.train_criterion = nn.KLDivLoss(reduction = "batchmean")
        self.valid_criterion = nn.KLDivLoss(reduction = "none")

    def training_step(self, model: nn.Module, batch: Tuple[Any]) -> torch.Tensor:
        """Code for training the model one step."""
        eeg = batch[0]
        eeg_spectrogram = batch[-2]
        spectrogram = batch[-1]
        y = batch[1]

        y_hat = model(eeg, eeg_spectrogram, spectrogram)
        loss = self.train_criterion(y_hat, y)

        self.log("train/loss", loss.item())

        return loss

    def validation_step(self, model: nn.Module, batch: Tuple[Any]) -> Any:
        """Code for evaluating the model one step."""
        eeg = batch[0]
        eeg_spectrogram = batch[-2]
        spectrogram = batch[-1]
        y = batch[1]
        count = batch[2]

        y_hat = model(eeg, eeg_spectrogram, spectrogram)
        loss = self.valid_criterion(y_hat, y).sum(axis = 1)

        # Split based on based >10 or <10.
        loss_all = loss.mean().item()
        loss_g10 = loss[count >= 9].mean().item()
        loss_l10 = loss[count < 9].mean().item()

        # Possible that there were no count less than or greater than 9.
        loss_g10 = loss_g10 if not math.isnan(loss_g10) else 0.0
        loss_l10 = loss_l10 if not math.isnan(loss_l10) else 0.0

        # Update the metrics
        self.log_dict({
            "val/loss_all": loss_all,
            "val/loss_g10": loss_g10,
            "val/loss_l10": loss_l10
        })


def train(
    task: T.Task,
    model: nn.Module,
    optimizer: torch.optim.Optimizer,
    scheduler: Optional[torch.optim.lr_scheduler.LRScheduler],
    trainer_args: T.TrainerArgs,
    train_dataloader: DataLoader,
    val_dataloader: DataLoader
) -> None:
    """
    """
    trainer = T.Trainer(
        args = trainer_args,
        model = model,
        optimizer = optimizer,
        scheduler = scheduler,
        task = task,
        callbacks = [
            T.ModelCheckpoint(monitor = "val/loss_g10", mode = "min"),
            T.ModelCheckpoint(monitor = "val/loss_l10", mode = "min"),
            T.ModelCheckpoint(monitor = "val/loss_all", mode = "min")
        ]
    )
    trainer.fit(
        train_dataloader = train_dataloader,
        val_dataloader = val_dataloader
    )


def main(argv: Optional[List[str]] = None) -> None:
    args = parse_args(argv)

    # Validate that the output directory does not already exist.
    output_dir = args.trainer.output_dir
    if os.path.exists(output_dir):
        raise FileExistsError(f"The directory '{output_dir}' already exists!")
    else:
        os.makedirs(output_dir, exist_ok = False)

    # Setup the logger, add a file handler to save the output.
    setup_logger(filename = os.path.join(output_dir, "train.log"))

    # Save the config to the output_dir.
    logger.info(f"Config: \n{OmegaConf.to_yaml(args)}")
    logger.info(f"Saving config to '{output_dir}'")
    OmegaConf.save(args, os.path.join(output_dir, "config.yaml"))

    logger.info("Instantiating TrainerArgs")
    trainer_args = T.TrainerArgs(**args.trainer)

    logger.info("Instantiating Model")
    model = instantiate("model", args.model)
    logger.info(model)
    logger.info(f"Parameter count: {sum(p.numel() for p in model.parameters())}")

    logger.info("Instantiating Optimizer")
    optimizer = instantiate("optimizer", args.optimizer, params = model.parameters())

    scheduler: Optional[torch.optim.lr_scheduler.LRScheduler] = None
    if args.scheduler._target_ is not None:
        logger.info("Instantiating Scheduler")
        scheduler = instantiate("scheduler", args.scheduler, optimizer = optimizer)

    logger.info("Instantiating Dataloaders")
    train_dataloader, val_dataloader = HmsDataModule(**args.data).get_loaders()

    # Handle the `from_pretrained` option.
    if args.from_pretrained is not None:
        if args.trainer.resume_from_checkpoint is not None:
            logger.warning(
                "Both 'from_pretrained' and 'from_checkpoint' are set. Ignoring "
                "'from_pretrained' and resuming training from checkpoint."
            )
        else:
            logger.info(f"Resuming from checkpoint '{args.from_pretrained}'.")
            model.load_state_dict(torch.load(args.from_pretrained)["model_state_dict"])

    logger.info(f"Training with the {args.mode} task")
    if args.mode == "unimodal": task: T.Task = HbacUnimodalTask()
    elif args.mode == "multimodal": task = HbacMultimodalTask()
    else: raise ValueError("mode must be one of (unimodal, multimodal).")

    logger.info("Starting training!")
    train(
        task = task,
        model = model,
        optimizer = optimizer,
        scheduler = scheduler,
        trainer_args = trainer_args,
        train_dataloader = train_dataloader,
        val_dataloader = val_dataloader
    )


def parse_args(argv: Optional[List[str]] = None) -> DictConfig:
    """ """
    parser = argparse.ArgumentParser(
        add_help = False, formatter_class = argparse.RawDescriptionHelpFormatter
    )

    # `add_register_group_to_parser` will automatically check the user provided
    # config
    cli.add_register_group_to_parser(
        parser = parser,
        name = "optimizer",
        default = "Adam",
        exclude = ["self", "params"]
    )
    cli.add_register_group_to_parser(
        parser = parser,
        name = "scheduler",
        default = None,
        exclude = ["self", "optimizer"]
    )
    cli.add_register_group_to_parser(
        parser = parser,
        name = "model",
        default = "eeg_cnn_rnn_att_base",
        exclude = ["self"]
    )
    cli.add_dataclass_to_parser(
        parser = parser.add_argument_group("trainer"),
        dataclass = T.TrainerArgs,
        prefix = "trainer"
    )
    cli.add_dataclass_to_parser(
        parser = parser.add_argument_group("data"),
        dataclass = HmsDataModule,
        prefix = "data"
    )

    # Default arguments
    parser.add_argument("-h", "--help", action = argparse._HelpAction)
    parser.add_argument("-p", "--print_config", action = cli.PrintConfigAction)
    parser.add_argument("-c", "--config", type = str, action = cli.ConfigAction)
    parser.add_argument("-m", "--mode", type = str, default = "unimodal", choices = ["unimodal", "multimodal"], metavar = "")
    parser.add_argument("--from_pretrained", type = str, default = None)

    args = parser.parse_args(argv)
    config = cli.parse_args(args) # Dot-to-dict + conversion to OmegaConf

    return config


if __name__ == "__main__":
    main()