# @Author: Khush Patel, drpatelkhush@gmail.com
# Config options: change data/logging/checkpoint paths, experiment name, GPU, and training options here
##################################### Imports ##################################################
# python imports
from tqdm import tqdm
import random
import os
import pickle
import numpy as np
from prettytable import PrettyTable
import sys
import dill
import logging
from sklearn.metrics import accuracy_score
from statistics import mean
import shutil
# pytorch imports
from torch.utils.tensorboard import SummaryWriter
import torch
import torch.nn as nn
# transformers imports
import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
##################################### MOST LIKELY TO BE CHANGED ##################################################
# device
gpu_number = ":" + "0"
device = torch.device('cuda' + gpu_number) if torch.cuda.is_available() else torch.device('cpu')
# device = torch.device('cpu')
# experiment name
experiment_name = "bert01"
# batch size
batch_size = 10
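# A minimal usage sketch for `device` (hedged: `model` and `batch` live in the
# training script, not in this config):
#   model = model.to(device)
#   input_ids = batch["input_ids"].to(device)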
################################### Paths ##########################################################
# Directory for this experiment
path_on_server = "/home/Khush/" + experiment_name
os.makedirs(path_on_server, exist_ok=True)
# Data location
raw_data_path = "../" + ".train"
# Model checkpoint location
save_directory = path_on_server + "/modelcheckpoints_"
os.makedirs(save_directory, exist_ok=True)
save_dir = save_directory + "/" + experiment_name + ".pt"
# Save a checkpoint every `save_every_step` steps
save_every_step = 5000
# Log metrics (e.g., loss, accuracy) every `measure_metrics_steps` steps
measure_metrics_steps = 100
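# A hedged sketch of how the two step counters above are consumed in the training
# loop (hypothetical `step`, `model`, and `loss`; `writer` would be a
# SummaryWriter over `tbpath`, defined below):
#   if step % save_every_step == 0:
#       torch.save({"model": model.state_dict(), "step": step}, save_dir)
#   if step % measure_metrics_steps == 0:
#       writer.add_scalar("train/loss", loss.item(), step)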
# Save model weights when performance improves over the last epoch
save_wts_directory = path_on_server + "/saved_weights_/"
os.makedirs(save_wts_directory, exist_ok=True)
save_wts_loc = save_wts_directory + "pytorch_model" + ".bin"
# TensorBoard run directory
tbpath = 'runs/' + experiment_name + "/"
# Logging
log_path = save_directory + "/logging/"
os.makedirs(log_path, exist_ok=True)
logging_path = log_path + experiment_name + ".log"
print(logging_path)
logging.basicConfig(filename=logging_path, level=logging.DEBUG, format=" %(asctime)s:%(message)s")
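# Typical call from the training loop (hypothetical `step` and `loss`):
#   logging.info(f"step {step}: loss {loss:.4f}")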
############################## Hyperparameters ######################################
#Learning Rate
lr = 5e-5
#Number of epochs
num_of_epochs = 20
#max_position_embeddings – The maximum sequence length that this model might ever be used with.
max_position_embeddings = 128
# Token id for the [MASK] token (103 in the standard BERT uncased vocabulary)
masked_token_encoding = 103
# Fraction of tokens to mask (0.15 = 15%)
percent_tokens_to_mask = 0.15
#Vocabulary size
vocab_size = 84840
# Number of patients (i.e., training examples)
dataset_size = 2000000
########################Calculated parameters#######################################
#Number of training steps
num_train_steps = int(dataset_size / batch_size * num_of_epochs)
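# A hedged sketch of how `lr` and `num_train_steps` feed the optimizer and
# scheduler in the training script (assumes a `model` defined there;
# num_warmup_steps=0 is an assumption, not taken from this config):
#   optimizer = AdamW(model.parameters(), lr=lr)
#   scheduler = get_linear_schedule_with_warmup(
#       optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
#   )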
################################## Seeds ######################################################
# Seeds: change the seed to change the MLM mask. Currently done manually after every 4 epochs.
# Seed for MLM masking
run_no_mask = 0  # Keep fixed at 0: it is used to recover the actual number of completed steps when resuming training.
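# A minimal masking sketch under these settings (hypothetical NumPy `input_ids`
# array; the real masking lives in the data/training code, not here):
#   rng = np.random.default_rng(run_no_mask)
#   mask = rng.random(input_ids.shape) < percent_tokens_to_mask
#   masked_ids = np.where(mask, masked_token_encoding, input_ids)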