# @Author: Khush Patel, drpatelkhush@gmail.com
# Config options: change data/logging/checkpoint paths, experiment name, GPU, and training options here
##################################### Imports ##################################################
# python imports
from tqdm import tqdm
import random
import os
import pickle
import numpy as np
from prettytable import PrettyTable
import sys
import dill
import logging
from sklearn.metrics import accuracy_score
from statistics import mean
import shutil
# pytorch imports
from torch.utils.tensorboard import SummaryWriter
import torch
import torch.nn as nn
# transformers imports
import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
##################################### MOST LIKELY TO BE CHANGED ##################################################
# device
gpu_number = ":" + "0"
device = torch.device('cuda' + gpu_number) if torch.cuda.is_available() else torch.device('cpu')
# device = torch.device('cpu')
# experiment name
experiment_name = "bert01"
# batch size
batch_size = 10
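# A minimal usage sketch for `device` (hedged: `model` and `batch` live in the
# training script, not in this config):
#   model = model.to(device)
#   input_ids = batch["input_ids"].to(device)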
################################### Paths ##########################################################
# Directory for this experiment
path_on_server = "/home/Khush/" + experiment_name
os.makedirs(path_on_server, exist_ok=True)
# Data location
raw_data_path = "../" + ".train"
# Model checkpoint location
save_directory = path_on_server + "/modelcheckpoints_"
os.makedirs(save_directory, exist_ok=True)
save_dir = save_directory + "/" + experiment_name + ".pt"
# Save a checkpoint every `save_every_step` steps
save_every_step = 5000
# Log metrics (e.g., loss, accuracy) every `measure_metrics_steps` steps
measure_metrics_steps = 100
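# A hedged sketch of how the two step counters above are consumed in the training
# loop (hypothetical `step`, `model`, and `loss`; `writer` would be a
# SummaryWriter over `tbpath`, defined below):
#   if step % save_every_step == 0:
#       torch.save({"model": model.state_dict(), "step": step}, save_dir)
#   if step % measure_metrics_steps == 0:
#       writer.add_scalar("train/loss", loss.item(), step)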
# Save model weights when performance improves over the last epoch
save_wts_directory = path_on_server + "/saved_weights_/"
os.makedirs(save_wts_directory, exist_ok=True)
save_wts_loc = save_wts_directory + "pytorch_model" + ".bin"
# TensorBoard run directory
tbpath = 'runs/' + experiment_name + "/"
# Logging
log_path = save_directory + "/logging/"
os.makedirs(log_path, exist_ok=True)
logging_path = log_path + experiment_name + ".log"
print(logging_path)
logging.basicConfig(filename=logging_path, level=logging.DEBUG, format=" %(asctime)s:%(message)s")
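# Typical call from the training loop (hypothetical `step` and `loss`):
#   logging.info(f"step {step}: loss {loss:.4f}")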
############################## Hyperparameters ######################################
#Learning Rate
lr = 5e-5
#Number of epochs
num_of_epochs = 20
#max_position_embeddings – The maximum sequence length that this model might ever be used with.
max_position_embeddings = 128
# Token id for the [MASK] token (103 in the standard BERT uncased vocabulary)
masked_token_encoding = 103
# Fraction of tokens to mask (0.15 = 15%)
percent_tokens_to_mask = 0.15
#Vocabulary size
vocab_size = 84840
# Number of patients (i.e., training examples)
dataset_size = 2000000
########################Calculated parameters#######################################
#Number of training steps
num_train_steps = int(dataset_size / batch_size * num_of_epochs)
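# A hedged sketch of how `lr` and `num_train_steps` feed the optimizer and
# scheduler in the training script (assumes a `model` defined there;
# num_warmup_steps=0 is an assumption, not taken from this config):
#   optimizer = AdamW(model.parameters(), lr=lr)
#   scheduler = get_linear_schedule_with_warmup(
#       optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
#   )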
################################## Seeds ######################################################
# Seeds: change the seed to change the MLM mask. Currently done manually after every 4 epochs.
# Seed for MLM masking
run_no_mask = 0  # Keep fixed at 0: it is used to recover the actual number of completed steps when resuming training.
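# A minimal masking sketch under these settings (hypothetical NumPy `input_ids`
# array; the real masking lives in the data/training code, not here):
#   rng = np.random.default_rng(run_no_mask)
#   mask = rng.random(input_ids.shape) < percent_tokens_to_mask
#   masked_ids = np.where(mask, masked_token_encoding, input_ids)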