finetuning_classification.py
import torch
from transformers import AutoTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_dataset
####################################################
# Load the SST-2 split of GLUE. (You can concatenate other splits here if you
# want to train on the full dataset.)
train_dataset = load_dataset('glue', 'sst2', split='train')

# Hold out 20% as a test set, stratified on the label so both splits keep the
# same class balance.
dataset = train_dataset.train_test_split(test_size=0.2, stratify_by_column="label", seed=1)
train_data = dataset['train']
test_data = dataset['test']

# The Trainer expects the target column to be named 'labels', so copy 'label'
# into 'labels' and drop the original column.
train_dataset = train_data.map(lambda examples: {'labels': examples['label']}, batched=True)
test_dataset = test_data.map(lambda examples: {'labels': examples['label']}, batched=True)
train_dataset = train_dataset.remove_columns(['label'])
test_dataset = test_dataset.remove_columns(['label'])
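# Optional sanity check (added sketch, not part of the original script): confirm
# the split sizes and peek at one example before tokenizing. SST-2 labels are
# 0 = negative, 1 = positive.
print(f"train: {len(train_dataset)} examples, test: {len(test_dataset)} examples")
print("sample:", train_dataset[0]['sentence'][:60], "| label:", train_dataset[0]['labels'])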
# Load the tokenizer and the pretrained checkpoint. from_pretrained adds a
# freshly initialized classification head on top of the pretrained encoder.
tokenizer = AutoTokenizer.from_pretrained("Skratch99/bert-pretrained")
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = BertForSequenceClassification.from_pretrained("Skratch99/bert-pretrained").to(device)
# Tokenize every sentence to a fixed length; adjust MAX_LENGTH as needed
# (longer sequences cost more memory and compute).
MAX_LENGTH = 256
train_dataset = train_dataset.map(lambda e: tokenizer(e['sentence'], truncation=True, padding='max_length', max_length=MAX_LENGTH), batched=True)
test_dataset = test_dataset.map(lambda e: tokenizer(e['sentence'], truncation=True, padding='max_length', max_length=MAX_LENGTH), batched=True)

# Expose the model inputs as PyTorch tensors.
train_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
# Note: datasets have no .to(device); the Trainer moves each batch to the
# model's device automatically.
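# Optional check (added sketch): after set_format each example is a dict of
# tensors, and every sequence field should have shape (MAX_LENGTH,).
sample = train_dataset[0]
print({k: tuple(v.shape) for k, v in sample.items()})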
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
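# Quick self-test of compute_metrics (added sketch): the Trainer passes an
# EvalPrediction object with .predictions (logits) and .label_ids; we fake one
# here with numpy arrays, so perfect predictions should score 1.0 everywhere.
import numpy as np
from types import SimpleNamespace
_fake = SimpleNamespace(predictions=np.array([[0.1, 0.9], [0.8, 0.2]]),
                        label_ids=np.array([1, 0]))
assert compute_metrics(_fake)['accuracy'] == 1.0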
# Training configuration: evaluation is disabled during training and run
# manually on both splits after training finishes.
training_args = TrainingArguments(
    output_dir='sst2_results',          # where checkpoints are written
    learning_rate=1e-4,
    num_train_epochs=10,
    per_device_train_batch_size=32,     # batch size per device during training
    per_device_eval_batch_size=32,      # batch size for evaluation
    logging_dir='sst2_logs',
    logging_steps=100,
    do_train=True,
    do_eval=False,
    no_cuda=False,
    load_best_model_at_end=False,
    save_strategy="epoch",              # checkpoint once per epoch
    evaluation_strategy="no"            # no evaluation during training
)
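# Alternative (sketch, not what the original run used): to evaluate after each
# epoch and keep the best checkpoint, set evaluation_strategy="epoch",
# load_best_model_at_end=True, and pass eval_dataset=test_dataset below.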
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=None,
    compute_metrics=compute_metrics
)
train_out = trainer.train()
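# train() returns a TrainOutput whose .metrics dict records the final training
# loss and runtime (added note).
print(train_out.metrics)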
# Push the fine-tuned model to the Hugging Face Hub under this repo id
# (requires an authenticated session, e.g. via `huggingface-cli login`).
PATH = "bert_sst2_finetuned"
model.push_to_hub(PATH)
# Evaluate on both splits; comparing the two is a quick overfitting check.
results = trainer.evaluate(eval_dataset=test_dataset)
print("Eval results on test_dataset:", results)
train_results = trainer.evaluate(eval_dataset=train_dataset)
print("Eval results on train_dataset:", train_results)