-
Notifications
You must be signed in to change notification settings - Fork 0
/
full-finetuning.py
115 lines (79 loc) · 2.79 KB
/
full-finetuning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import torch
import torch.nn as nn
from torch.functional import F
import transformers as ts
from datasets import Dataset
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
# Release cached CUDA memory before any model or data is loaded.
torch.cuda.empty_cache()

# Destination directory for the final fine-tuned model.
SAVE_PATH = "PATH_TO_CHECKPOINTS/biomedical_model_checkpoints/biobert-base/"
# Hub identifier of the pretrained checkpoint used for tokenizer and model.
MODEL_PATH = "dmis-lab/biobert-v1.1"
# Environment switch intended to turn off Weights & Biases reporting.
os.environ["WANDB_DISABLED"] = "true"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only point these paths at files you created or otherwise trust.
# Context managers guarantee the file handles are closed after loading.
with open("train_dict", "rb") as train_file:  # Path to Training File
    train_dict = pickle.load(train_file)
with open("val_dict", "rb") as val_file:  # Path to Validation File
    val_dict = pickle.load(val_file)

train_dataset = Dataset.from_dict(train_dict)
val_dataset = Dataset.from_dict(val_dict)

tokenizer = ts.AutoTokenizer.from_pretrained(MODEL_PATH)
# Pads each batch and returns PyTorch tensors.
data_collator = ts.DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
def mappingFunction(dataset, max_length=512):
    """Tokenize the ``"text"`` entry of a batch; intended for ``Dataset.map``.

    Args:
        dataset: A batch mapping containing a ``"text"`` entry, as supplied by
            ``Dataset.map(..., batched=True)``.
        max_length: Maximum number of tokens kept per example. The default of
            512 is assumed to match the position-embedding limit of the
            checkpoint named by ``MODEL_PATH`` -- TODO confirm if the
            checkpoint is changed.

    Returns:
        The tokenizer output for the batch's texts.
    """
    # Truncate explicitly: without it, an over-long text produces more tokens
    # than the encoder has positions for and fails in the forward pass.
    return tokenizer(dataset["text"], truncation=True, max_length=max_length)
# Tokenize the training split first, then the validation split, using the
# same batched mapping function for both.
final_train_dataset, final_val_dataset = (
    split.map(mappingFunction, batched=True)
    for split in (train_dataset, val_dataset)
)

# Pretrained encoder with a sequence-classification head that has one output.
model = ts.AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=1,
)
def collator_function(dataset):
    """Collate a list of example dicts into one padded batch.

    Args:
        dataset: Non-empty list of examples. Every example must provide the
            keys of the first one, including ``"label"`` and ``"text"``.

    Returns:
        The output of ``data_collator`` for the remaining columns, with the
        label tensor stored under ``"labels"``.
    """
    column_names = list(dataset[0].keys())
    # Turn the list of rows into a dict of columns.
    columns = {
        name: [example[name] for example in dataset]
        for name in column_names
    }

    # Labels are converted separately and re-attached after padding.
    label_tensor = torch.tensor(columns.pop("label"))
    # The raw text is dropped before the remaining columns are padded.
    columns.pop("text")

    batch = data_collator(columns)
    batch["labels"] = label_tensor
    return batch
def _plot_validation_confusion_matrix():
    """Predict every validation example on CPU and display a confusion matrix."""
    # Local import, as in the original code, so scikit-learn is only needed
    # once evaluation actually runs.
    from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

    all_labels = []
    all_preds = []
    model.cpu()
    model.eval()
    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        for sample in final_val_dataset:
            # Reuse the training collator with a batch of exactly one example.
            input_sample = collator_function([sample])
            output = model(**input_sample)
            # Round the sigmoid of the single logit to a 0/1 prediction.
            # NOTE(review): the model is created with num_labels=1, for which
            # the transformers classification heads document a regression
            # (MSE) loss; confirm that thresholding sigmoid(logit) is the
            # intended decision rule for outputs trained that way.
            predicted_label = round(float(torch.sigmoid(output["logits"]).view(-1)))
            all_labels.append(sample["label"])
            all_preds.append(predicted_label)

    array = confusion_matrix(all_labels, all_preds)
    disp = ConfusionMatrixDisplay(confusion_matrix=array, display_labels=[0, 1])
    disp.plot()
    plt.show()


def train():
    """Fine-tune the model, save it, then plot a validation confusion matrix.

    Trains the module-level ``model`` on ``final_train_dataset`` with the
    Hugging Face ``Trainer``, writes the result to ``SAVE_PATH`` and finally
    evaluates on ``final_val_dataset``.
    """
    training_arguments = ts.TrainingArguments(
        "output/",
        save_steps=3000,
        num_train_epochs=5,
        learning_rate=2e-5,
        lr_scheduler_type="cosine",
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        weight_decay=0.01,
        # Keep the "text" and "label" columns; collator_function reads them.
        remove_unused_columns=False,
        logging_steps=100,
        seed=123,
    )
    trainer = ts.Trainer(
        model=model,
        args=training_arguments,
        train_dataset=final_train_dataset,
        eval_dataset=final_val_dataset,
        data_collator=collator_function,
    )
    trainer.train()
    trainer.save_model(SAVE_PATH)

    _plot_validation_confusion_matrix()
# Script entry: start the run as soon as the file is executed.
train()