-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathkmedianInitialCenters.py
90 lines (79 loc) · 3.84 KB
/
kmedianInitialCenters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import numpy as np
from sklearn.utils.extmath import stable_cumsum
from sklearn.metrics.pairwise import euclidean_distances
def _kmeans_plusplus(X, n_clusters, random_state, n_local_trials=None):
"""Computational component for initialization of n_clusters by
k-means++. Prior validation of data is assumed.
Parameters
----------
X : {ndarray, sparse matrix} of shape (n_samples, n_features)
The data to pick seeds for.
n_clusters : int
The number of seeds to choose.
x_squared_norms : ndarray of shape (n_samples,)
Squared Euclidean norm of each data point.
random_state : RandomState instance
The generator used to initialize the centers.
See :term:`Glossary <random_state>`.
n_local_trials : int, default=None
The number of seeding trials for each center (except the first),
of which the one reducing inertia the most is greedily chosen.
Set to None to make the number of trials depend logarithmically
on the number of seeds (2+log(k)); this is the default.
Returns
-------
centers : ndarray of shape (n_clusters, n_features)
The inital centers for k-means.
indices : ndarray of shape (n_clusters,)
The index location of the chosen centers in the data array X. For a
given index and center, X[index] = center.
"""
n_samples, n_features = X.shape
centers = np.empty((n_clusters, n_features), dtype=X.dtype)
# Set the number of local seeding trials if none is given
if n_local_trials is None:
# This is what Arthur/Vassilvitskii tried, but did not report
# specific results for other than mentioning in the conclusion
# that it helped.
n_local_trials = 2 + int(np.log(n_clusters))
# Pick first center randomly and track index of point
# center_id = random_state.randint(n_samples)
center_id = np.random.randint(n_samples)
indices = np.full(n_clusters, -1, dtype=int)
# if sp.issparse(X):
# centers[0] = X[center_id].toarray()
# else:
centers[0] = X[center_id]
indices[0] = center_id
# Initialize list of closest distances and calculate current potential
closest_dist_sq = euclidean_distances(centers[0, np.newaxis], X, squared=False)
current_pot = closest_dist_sq.sum()
# Pick the remaining n_clusters-1 points
for c in range(1, n_clusters):
# Choose center candidates by sampling with probability proportional
# to the squared distance to the closest existing center
# rand_vals = random_state.random_sample(n_local_trials) * current_pot
rand_vals = np.random.random_sample(n_local_trials) * current_pot
candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq),
rand_vals)
# XXX: numerical imprecision can result in a candidate_id out of range
np.clip(candidate_ids, None, closest_dist_sq.size - 1,
out=candidate_ids)
# Compute distances to center candidates
distance_to_candidates = euclidean_distances(X[candidate_ids], X, squared=False)
# update closest distances squared and potential for each candidate
np.minimum(closest_dist_sq, distance_to_candidates,
out=distance_to_candidates)
candidates_pot = distance_to_candidates.sum(axis=1)
# Decide which candidate is the best
best_candidate = np.argmin(candidates_pot)
current_pot = candidates_pot[best_candidate]
closest_dist_sq = distance_to_candidates[best_candidate]
best_candidate = candidate_ids[best_candidate]
# Permanently add best center candidate found in local tries
# if sp.issparse(X):
# centers[c] = X[best_candidate].toarray()
# else:
centers[c] = X[best_candidate]
indices[c] = best_candidate
return centers, indices