-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathevaluate_clusters.py
162 lines (142 loc) · 4.23 KB
/
evaluate_clusters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
from cluster_abstracts import PubMedClustering
from sklearn import metrics
from sklearn import preprocessing
import argparse
def perform_clustering(
        filename,
        n_components,
        num_clusters,
        f_op,
        num_words,
        f_store):
    """ Runs the PubMed abstract clustering pipeline and returns it.

    A ``PubMedClustering`` object is built from ``filename``,
    ``n_components``, ``num_clusters``, ``num_words`` and ``f_store`` (in
    that order), then ``cluster_abstracts()`` and ``get_top_terms()`` are
    called on it.

    Args:
        filename: name of the input file, forwarded to the clusterer.
        n_components: forwarded to the clusterer (the CLI describes it as
            the dimensionality of the latent semantic analysis output).
        num_clusters: forwarded to the clusterer (number of clusters).
        f_op: when truthy, ``store_output()`` is also called.
        num_words: forwarded to the clusterer (top words per cluster).
        f_store: forwarded to the clusterer (the CLI describes it as
            persisting the model on disk).

    Returns:
        The ``PubMedClustering`` instance after the calls above.
    """
    pipeline = PubMedClustering(
        filename, n_components, num_clusters, num_words, f_store)
    pipeline.cluster_abstracts()
    pipeline.get_top_terms()

    # Writing the output is optional; bail out early when it is not wanted.
    if not f_op:
        return pipeline

    pipeline.store_output()
    return pipeline
def get_true_labels(filename):
    """ Extracts gold cluster labels from the input file.

    The file is read from ``data/<filename>`` (relative to the current
    working directory). Each line is split on whitespace; the first token
    is skipped and the remaining tokens, re-joined with single spaces, are
    taken as that line's gold label. Whether labels are present at all is
    decided from the first line only.

    Args:
        filename: name of the file inside the ``data/`` directory.

    Returns:
        The numeric labels produced by ``LabelEncoder.fit_transform`` (one
        per line of the file), or ``None`` when the file is empty or its
        first line holds at most one token (i.e. no gold labels).
    """
    with open('data/' + filename, 'r') as f:
        lines = f.readlines()

    # An empty file has no first line to inspect; treat it as "no labels"
    # instead of failing with an IndexError on lines[0].
    if not lines:
        return None

    # Checking if gold labels exist in the input file
    if len(lines[0].split()) <= 1:
        return None

    cluster_labels = [' '.join(line.split()[1:]) for line in lines]
    # Encoding the string labels to numeric labels
    le = preprocessing.LabelEncoder()
    return le.fit_transform(cluster_labels)
def evaluate_clusters(model, labels_true, f1, f2, f3, dist_metric):
    """ Prints the requested clustering evaluation metrics.

    Args:
        model: object exposing ``clusters`` (the predicted labels) and,
            when ``f3`` is set, ``X`` (the data handed to the silhouette
            score).
        labels_true: gold labels, or ``None`` when they are unavailable.
        f1: when truthy, report the adjusted Rand index.
        f2: when truthy, report homogeneity, completeness and V-measure.
        f3: when truthy, report the silhouette coefficient.
        dist_metric: passed as ``metric`` to ``silhouette_score``.

    The metrics selected by ``f1``/``f2`` need gold labels; if
    ``labels_true`` is ``None`` a notice is printed instead. The
    silhouette coefficient does not use ``labels_true``.
    """
    predicted = model.clusters
    print('Computing evaluation metric(s)...')

    if labels_true is None:
        print("Gold labels are not provided.")
    else:
        # (format string, scoring function) pairs, in reporting order
        requested = []
        if f1:
            requested.append(
                ("Adjusted Rand index: %0.3f", metrics.adjusted_rand_score))
        if f2:
            requested.extend([
                ("Homogeneity: %0.3f", metrics.homogeneity_score),
                ("Completeness: %0.3f", metrics.completeness_score),
                ("V-measure: %0.3f", metrics.v_measure_score),
            ])
        for template, scorer in requested:
            print(template % scorer(labels_true, predicted))

    if f3:
        silhouette = metrics.silhouette_score(
            model.X, predicted, metric=dist_metric)
        print("Silhouette Coefficient: %0.3f" % silhouette)
def parse_arguments():
    """ Builds the command-line parser and parses the program arguments.

    Returns:
        The ``argparse.Namespace`` with attributes ``filename``,
        ``n_components``, ``num_clusters``, ``ari``, ``hcv``, ``sc``,
        ``dist_metric``, ``store_output``, ``num_words`` and
        ``store_model``.
    """
    # (name-or-flag, add_argument keyword options), in declaration order
    argument_specs = (
        ('filename', dict(
            type=str,
            help='name of the file containing PubMed article IDs')),
        ('--lsa', dict(
            dest='n_components',
            type=int,
            default=100,
            help='dimensionality of latent semantic analysis output')),
        ('--num_clusters', dict(
            dest='num_clusters',
            type=int,
            default=6,
            help='number of clusters to form')),
        ('--ari', dict(
            action='store_true',
            help='adjusted rand index (ARI)')),
        ('--hcv', dict(
            action='store_true',
            help='homogeneity, completeness and v-measure')),
        ('--sc', dict(
            action='store_true',
            help='mean silhouette coefficient')),
        ('--dist', dict(
            dest='dist_metric',
            type=str,
            default='euclidean',
            help='distance metric for silhouette coefficient')),
        ('--store_output', dict(
            action='store_true',
            help='store output clusters in a file')),
        ('--num_words', dict(
            dest='num_words',
            type=int,
            default=5,
            help='number of top words per cluster')),
        ('--store_model', dict(
            action='store_true',
            help='persist the model on disk')),
    )

    cli = argparse.ArgumentParser()
    for name_or_flag, options in argument_specs:
        cli.add_argument(name_or_flag, **options)
    return cli.parse_args()
if __name__ == "__main__":
    # Script entry point: parse options, cluster the abstracts, then
    # score the resulting clusters against the gold labels (if any).
    options = parse_arguments()

    fitted = perform_clustering(
        filename=options.filename,
        n_components=options.n_components,
        num_clusters=options.num_clusters,
        f_op=options.store_output,
        num_words=options.num_words,
        f_store=options.store_model)

    gold = get_true_labels(options.filename)

    evaluate_clusters(
        model=fitted,
        labels_true=gold,
        f1=options.ari,
        f2=options.hcv,
        f3=options.sc,
        dist_metric=options.dist_metric)