-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcluster.py
55 lines (46 loc) · 1.79 KB
/
cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.feature_extraction.text import CountVectorizer
# import distance
import sklearn
from collections import OrderedDict
def freqs2clustering(dic_mots):
    """Cluster the keys of a word/frequency mapping by character n-gram similarity.

    Every key longer than one character is vectorised into counts of its
    character bigrams and trigrams. The negated pairwise cosine distances
    between those vectors are used as a precomputed similarity matrix for
    affinity propagation, and each resulting cluster is reported under its
    exemplar word.

    Args:
        dic_mots: Mapping whose keys are words (strings) and whose values
            are the associated frequencies.

    Returns:
        A dict mapping each exemplar word to
        ``{"Freq.centroide": <value of dic_mots for the exemplar>,
        "Termes": <sorted list of the words in the cluster>}``.
        Exemplars whose value in ``dic_mots`` is ``None`` are left out.
        An empty dict is returned when ``dic_mots`` is empty, when no key
        is longer than one character, or when affinity propagation yields
        no exemplar.
    """
    if not dic_mots:
        return {}

    # Words shorter than two characters contain no character bigram, so they
    # cannot be vectorised. Sorting fixes the row order of the similarity
    # matrix instead of relying on set iteration order.
    liste_words = sorted(word for word in dic_mots if len(word) > 1)
    if not liste_words:
        return {}

    # Array form so that boolean-mask and integer indexing work below.
    words = np.asarray(liste_words)

    # Character bigram + trigram counts, fitted once for all words. N-grams
    # that occur in neither word of a pair contribute nothing to their
    # cosine, so a shared vocabulary yields the same pairwise distances as
    # fitting a separate vectoriser per pair.
    vectorizer = CountVectorizer(ngram_range=(2, 3), analyzer='char')
    counts = vectorizer.fit_transform(liste_words)

    # Affinity propagation expects similarities (larger means closer), hence
    # the negated cosine distances.
    matrice_def = -sklearn.metrics.pairwise.cosine_distances(counts)

    affprop = AffinityPropagation(
        affinity="precomputed", damping=0.5, random_state=None
    )
    affprop.fit(matrice_def)

    labels = affprop.labels_
    centers = affprop.cluster_centers_indices_

    dic_output = {}
    for cluster_id in np.unique(labels):
        # A label without a matching exemplar index (e.g. -1 when the
        # algorithm found no exemplar) cannot be reported; skip it.
        if cluster_id < 0 or cluster_id >= len(centers):
            continue
        exemplar = words[centers[cluster_id]]
        cluster = np.unique(words[labels == cluster_id])
        freq = dic_mots.get(exemplar)
        if freq is not None:
            dic_output[exemplar] = {
                "Freq.centroide": freq,
                "Termes": list(cluster),
            }
    return dic_output