-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathscluster.py
79 lines (70 loc) · 2.74 KB
/
scluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# Created by Etzion Harari
# https://github.com/EtzionR
# import libraries
from sklearn.cluster import KMeans, MeanShift
from sklearn.metrics import silhouette_score
from hdbscan import HDBSCAN
# Clustering back-ends. Each helper takes the data plus one tuning value and
# returns the label array produced by fitting the corresponding estimator.
def _kmeans_labels(df, k):
    """Fit KMeans with ``k`` clusters and return its labels."""
    model = KMeans(n_clusters=k)
    model.fit(df)
    return model.labels_


def _hdbscan_labels(df, s):
    """Fit HDBSCAN with minimum cluster size ``s`` and return its labels."""
    model = HDBSCAN(min_cluster_size=s)
    model.fit(df)
    return model.labels_


def _meanshift_labels(df, q):
    """Fit MeanShift with bandwidth 0.025 * min(q, 50) and return its labels."""
    model = MeanShift(bandwidth=0.025 * min(q, 50))
    model.fit(df)
    return model.labels_


# Lookup table: clustering name -> function(data, value) -> labels
CLUSTERING = {
    'kmeans': _kmeans_labels,
    'hdbscan': _hdbscan_labels,
    'meanshift': _meanshift_labels,
}
# the clustering by silhouette object
class SCluster:
    """
    Choose clustering labels by silhouette score.

    For every candidate value in the configured range, the selected
    clustering function is run on the data and the resulting labels are
    scored with an adjusted silhouette score. The labels that reach the
    best score are stored in ``labels_``.
    """

    def __init__(self, typ='kmeans', org=2, lim=20, stp=1, dup=0.95):
        """
        initialize the object
        :param typ: clustering type, a key of CLUSTERING (case-insensitive)
        :param org: first value in the loop
        :param lim: last value in the loop (inclusive)
        :param stp: step between consecutive loop values
        :param dup: factor used to shrink the silhouette sample size
                    each time the score calculation fails
        :raises KeyError: if typ is not a key of CLUSTERING
        """
        # initial parameters
        self.type = typ
        self.org = org
        self.lim = lim + 1  # +1 so that range() includes the last value
        self.stp = stp
        self.dup = dup

        # clustering function
        self.function = CLUSTERING[self.type.lower()]

        # values for calculation
        self.max = -1
        self.scores = {}
        self.labels_ = []

    def adapt_silhouette(self, labels):
        """
        calculate the adjusted silhouette value for the fitted data

        Samples labelled -1 are dropped before scoring, and the score is
        multiplied by the fraction of samples that were kept.

        :param labels: cluster labels, one per row of the fitted data
        :return: the adjusted silhouette score, or -1 when no score can
                 be calculated for these labels
        """
        keep = labels > -1
        data, labels = self.df[keep], labels[keep]
        n_samples = labels.shape[0]
        if n_samples == 0:
            return -1

        # A silhouette score needs at least 2 clusters and fewer clusters
        # than samples; otherwise every attempt fails, so don't retry.
        n_clusters = len(set(labels.tolist()))
        if n_clusters < 2 or n_clusters >= n_samples:
            return -1

        # Retry with a smaller sample when scoring fails, and stop once
        # the sample is too small to score at all.
        while self.size >= 2:
            try:
                score = silhouette_score(data, labels, sample_size=self.size)
            except (MemoryError, ValueError):
                # always shrink by at least one so the loop terminates
                # even if dup >= 1
                self.size = min(int(self.size * self.dup), self.size - 1)
            else:
                return score * (n_samples / self.n)
        return -1

    def fit(self, data):
        """
        fit the optimal cluster labels to the data
        :param data: input dataframe
        :return: the fitted object itself
        :raises KeyError: if the loop range (org..lim) is empty
        """
        self.n = data.shape[0]
        self.size = self.n
        self.df = data

        # reset results so a second fit never returns labels of older data
        self.max = -1
        self.scores = {}
        self.labels_ = []

        for i in range(self.org, self.lim, self.stp):
            label = self.function(self.df, i)
            silho = self.adapt_silhouette(label)
            self.scores[silho] = label
            self.max = silho if self.max < silho else self.max
            print(f'cluster kind: {self.type}, input value = {i}, silhouette = {round(silho,2)}')

        self.labels_ = self.scores[self.max]
        return self
# License
# MIT © Etzion Harari