threshold.py
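"""Per-label decision-threshold search for a fine-tuned multi-label classifier.

Runs the trained model over a labelled test CSV, collects the raw logit for
each label, and reports the threshold that maximises F1 on that label's
precision-recall curve.
"""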
import torch
import argparse
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn import metrics
import os
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
)
def _get_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, default="tunib/electra-ko-base")
    parser.add_argument("--dir_path", type=str, default="/home/ckpt")
    parser.add_argument("--file_path", type=str, default="")
    parser.add_argument("--num_labels", type=int, default=7)
    parser.add_argument("--data_path", type=str, default="./data/test.csv")
    return parser
parser = _get_parser()
args = parser.parse_args()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(args.model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    args.model_name, num_labels=args.num_labels
)
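# The checkpoint was saved by a training wrapper, so every state-dict key
# carries a prefix (presumably "model.", hence the 6-character strip below)
# that must be removed before loading into the bare Hugging Face model.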
trained_model_dict = torch.load(
    f"{args.dir_path}/{args.file_path}", map_location=device
)["state_dict"]
model_dict = dict()
for key in trained_model_dict:
    model_dict[key[6:]] = trained_model_dict[key]
model.load_state_dict(model_dict)
model.to(device)
model.eval()
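# The test CSV is expected to provide a "comment" text column plus one binary
# column per label, with the first `num_labels` columns ordered as in `cols`.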
df = pd.read_csv(args.data_path, encoding="utf-8", index_col=0)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
comments = df["comment"]
comments = list(comments)
cols = [
"stereotype",
"anti-stereotype",
"unrelated",
"profession",
"race",
"gender",
"religion",
]
diction = {}
for col in cols:
    diction[col] = []
    diction[f"{col}_label"] = []
for idx, my_input in tqdm(enumerate(comments), total=len(comments)):
    tokenized = tokenizer(
        my_input,
        return_tensors="pt",
        max_length=512,
        padding="max_length",
        truncation=True,
    ).to(device)
    with torch.no_grad():
        output = model(**tokenized)
    logits = output.logits.cpu().numpy().squeeze()
    # enumerate() is positional, so read the label row positionally as well.
    label = np.array(df.iloc[idx][: args.num_labels].tolist())
    for i, col in enumerate(cols):
        diction[col].append(logits[i])
        diction[f"{col}_label"].append(label[i])
for col in cols:
    pre, rec, threshold = metrics.precision_recall_curve(
        np.array(diction[f"{col}_label"]), np.array(diction[col])
    )
    # precision/recall have one more entry than thresholds; compute
    # F1 = 2 * P * R / (P + R) at each threshold, skipping points where
    # both precision and recall are zero.
    temp = []
    for i in range(len(threshold)):
        if pre[i] + rec[i] == 0:
            continue
        temp.append([threshold[i], (2 * rec[i] * pre[i]) / (rec[i] + pre[i])])
    t1, s1 = max(temp, key=lambda x: x[-1])
    print(f"{col}: {t1} (F1={s1:.4f})")