# bart_detection.py
# Forked from gpoesia/minbert-default-final-project.
import argparse
import random
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import AutoTokenizer, BartModel
from sklearn.metrics import matthews_corrcoef
from optimizer import AdamW
# Module-level switch, presumably intended to be passed as tqdm's `disable=`
# argument to silence progress bars. It is not referenced anywhere in the
# visible code -- TODO confirm intended use.
TQDM_DISABLE = False
class BartWithClassifier(nn.Module):
    """BART backbone followed by a linear multi-label classification head.

    The head maps the hidden state at sequence position 0 to ``num_labels``
    independent sigmoid outputs, one per label.

    Args:
        num_labels: Number of output labels (size of the classifier output).
        model_name: Name or path of the pretrained BART checkpoint to load.
        local_files_only: Forwarded to ``BartModel.from_pretrained``; when
            true, the checkpoint is loaded without contacting the network.
    """

    def __init__(self, num_labels=7, model_name="facebook/bart-large", local_files_only=True):
        super().__init__()
        self.bart = BartModel.from_pretrained(model_name, local_files_only=local_files_only)
        # The head width follows the backbone's configured hidden size.
        self.classifier = nn.Linear(self.bart.config.hidden_size, num_labels)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask=None):
        """Return per-label probabilities for a batch of token id sequences.

        Args:
            input_ids: Token ids passed to the BART model.
            attention_mask: Optional attention mask passed to the BART model.

        Returns:
            Tensor with one row per input sequence and ``num_labels`` columns,
            holding sigmoid outputs. Because the sigmoid is already applied
            here, a loss that expects probabilities (not raw logits) must be
            used when training on this output.
        """
        # Use the BartModel to obtain the last hidden state.
        outputs = self.bart(input_ids=input_ids, attention_mask=attention_mask)
        # Summarise each sequence by the hidden state at position 0.
        first_token_state = outputs.last_hidden_state[:, 0, :]
        # Project to one logit per label, then squash to probabilities.
        logits = self.classifier(first_token_state)
        return self.sigmoid(logits)
def transform_data(dataset, max_length=512):
    """Turn a raw dataset frame into a DataLoader (not implemented yet).

    dataset: pd.DataFrame

    Intended steps:
    1. Extract the sentences from the dataset. We recommend using the already
       split sentences in the dataset.
    2. Use ``AutoTokenizer.from_pretrained`` to tokenize the sentences and
       obtain the input_ids and attention_mask.
    3. The labels currently have the form [2, 5, 6, 0, 0, 0, 0], meaning the
       sentence pair is of type 2, 5, and 6. Turn this into a binary form,
       where the label becomes [0, 1, 0, 0, 1, 1, 0]. Be careful:
       test-student.csv does not have the paraphrase_types column, so in that
       case return a DataLoader without the labels.
    4. Use the input_ids, attention_mask, and binary labels to create a
       TensorDataset. Return a DataLoader with the TensorDataset. You can
       choose a batch size of your choice.

    Raises:
        NotImplementedError: Always, until the steps above are implemented.
    """
    raise NotImplementedError
def train_model(model, train_data, dev_data, device):
    """Train the model and return it (not implemented yet).

    You can use any training loop you want. We recommend starting with AdamW
    as your optimizer. You can take a look at the SST training loop for
    reference. Think about your loss function and the number of epochs you
    want to train for. You can also use the evaluate_model function to
    evaluate the model on the dev set. Print the training loss, training
    accuracy, and dev accuracy at the end of each epoch.

    Returns:
        The trained model.

    Raises:
        NotImplementedError: Always, until the training loop is implemented.
    """
    ### TODO
    raise NotImplementedError
def test_model(model, test_data, test_ids, device):
"""
Test the model. Predict the paraphrase types for the given sentences and return the results in form of
a Pandas dataframe with the columns 'id' and 'Predicted_Paraphrase_Types'.
The 'Predicted_Paraphrase_Types' column should contain the binary array of your model predictions.
Return this dataframe.
"""
### TODO
raise NotImplementedError
def evaluate_model(model, test_data, device):
    """Measure per-label accuracy and Matthews correlation on a labelled set.

    We measure how many of the paraphrase types the model has predicted
    correctly for each data point. So, if the model's prediction is
    [1,1,0,0,1,1,0] and the true label is [0,0,0,0,1,1,0], this prediction
    has an accuracy of 5/7, i.e. 71.4%.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose outputs are thresholded at 0.5.
        test_data: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        Tuple ``(accuracy, matthews_coefficient)``: the mean over labels of
        the per-label accuracy and of the per-label Matthews correlation
        coefficient.

    Raises:
        ValueError: If ``test_data`` yields no batches.
    """
    batch_predictions = []
    batch_labels = []

    # Remember the current mode so evaluation does not silently flip a model
    # that was already in eval mode back into training mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for input_ids, attention_mask, labels in test_data:
                outputs = model(
                    input_ids=input_ids.to(device),
                    attention_mask=attention_mask.to(device),
                )
                # A label counts as predicted when its output exceeds 0.5.
                batch_predictions.append((outputs > 0.5).int().cpu())
                batch_labels.append(labels.cpu())
    finally:
        # Restore the mode even if a batch fails part-way through.
        model.train(was_training)

    if not batch_predictions:
        raise ValueError("evaluate_model received no batches to evaluate")

    predicted_labels_np = torch.cat(batch_predictions, dim=0).numpy()
    true_labels_np = torch.cat(batch_labels, dim=0).numpy()

    # Fraction of matching entries per label column.
    per_label_accuracy = (true_labels_np == predicted_labels_np).mean(axis=0)

    # Matthews correlation coefficient for each paraphrase type.
    per_label_mcc = [
        matthews_corrcoef(true_labels_np[:, label_idx], predicted_labels_np[:, label_idx])
        for label_idx in range(true_labels_np.shape[1])
    ]

    # Average both metrics over all labels.
    accuracy = np.mean(per_label_accuracy)
    matthews_coefficient = np.mean(per_label_mcc)
    return accuracy, matthews_coefficient
def seed_everything(seed=11711):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every random number generator below.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed the current CUDA device and all CUDA devices.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Disable cuDNN autotuning and request deterministic cuDNN algorithms.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
def get_args(argv=None):
    """Parse the command-line options of this script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default), argparse reads the process command line, which
            is the original behaviour.

    Returns:
        Namespace with ``seed`` (int, default 11711) and ``use_gpu`` (bool,
        true only when ``--use_gpu`` is given).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=11711)
    parser.add_argument("--use_gpu", action="store_true")
    return parser.parse_args(argv)
def finetune_paraphrase_detection(args, dev_fraction=0.1):
    """Fine-tune the BART classifier, report dev metrics, and write predictions.

    Loads the training and test CSV files, holds out part of the training
    data as a dev set, trains the model, prints dev accuracy and Matthews
    correlation, and writes the test predictions to a tab-separated file
    under ``predictions/bart/``.

    Args:
        args: Parsed options providing ``use_gpu`` and ``seed``.
        dev_fraction: Fraction of the training rows held out as the dev set;
            must lie strictly between 0 and 1.

    Raises:
        ValueError: If ``dev_fraction`` is not strictly between 0 and 1.
    """
    from pathlib import Path

    if not 0 < dev_fraction < 1:
        raise ValueError("dev_fraction must be strictly between 0 and 1")

    model = BartWithClassifier()
    device = torch.device("cuda") if args.use_gpu else torch.device("cpu")
    model.to(device)

    full_train_dataset = pd.read_csv("data/etpc-paraphrase-train.csv", sep="\t")
    test_dataset = pd.read_csv("data/etpc-paraphrase-detection-test-student.csv", sep="\t")

    # Hold out a seeded random subset of the training rows as the dev set.
    # Previously dev_data was used below without ever being defined.
    held_out = full_train_dataset.sample(frac=dev_fraction, random_state=args.seed)
    # Reset the indices so both frames look like freshly loaded ones.
    train_dataset = full_train_dataset.drop(held_out.index).reset_index(drop=True)
    dev_dataset = held_out.reset_index(drop=True)

    train_data = transform_data(train_dataset)
    dev_data = transform_data(dev_dataset)
    test_data = transform_data(test_dataset)

    print(f"Loaded {len(train_dataset)} training samples.")

    model = train_model(model, train_data, dev_data, device)

    print("Training finished.")

    accuracy, matthews_corr = evaluate_model(model, dev_data, device)
    print(f"The accuracy of the model is: {accuracy:.3f}")
    print(f"Matthews Correlation Coefficient of the model is: {matthews_corr:.3f}")

    test_ids = test_dataset["id"]
    test_results = test_model(model, test_data, test_ids, device)

    # Create the output directory first so a finished run cannot fail at the
    # very last step just because the folder is missing.
    output_path = Path("predictions/bart/etpc-paraphrase-detection-test-output.csv")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    test_results.to_csv(output_path, index=False, sep="\t")
# Script entry point: parse options, fix the random seeds, then run the
# fine-tuning pipeline.
if __name__ == "__main__":
    args = get_args()
    seed_everything(args.seed)
    finetune_paraphrase_detection(args)