"""
Balanced groups K - Means clustering ( same size | max size | custom sizes )
utlizing the scikit - learn api and related
utilities .
BSD 3 - Clause License
Copyright ( c ) 2017 , Cayetano Benavent , José Manuel Camacho ( Balanced - Sizes - K - Means
Implementation )
Copyright ( c ) 2017 , Nathan Danielsen ( Same - Size - K - Means Implementation )
Copyright ( c ) 2007 – 2017 The scikit - learn developers .
All rights reserved .
Redistribution and use in source and binary forms , with or without
modification , are permitted provided that the following conditions are met :
* Redistributions of source code must retain the above copyright notice , this
list of conditions and the following disclaimer .
* Redistributions in binary form must reproduce the above copyright notice ,
this list of conditions and the following disclaimer in the documentation
and / or other materials provided with the distribution .
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission .
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS " AS IS "
AND ANY EXPRESS OR IMPLIED WARRANTIES , INCLUDING , BUT NOT LIMITED TO , THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED . IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT , INDIRECT , INCIDENTAL , SPECIAL , EXEMPLARY , OR CONSEQUENTIAL
DAMAGES ( INCLUDING , BUT NOT LIMITED TO , PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES ; LOSS OF USE , DATA , OR PROFITS ; OR BUSINESS INTERRUPTION ) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY , WHETHER IN CONTRACT , STRICT LIABILITY ,
OR TORT ( INCLUDING NEGLIGENCE OR OTHERWISE ) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE , EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE .
BSD 3 - clause " New " or " Revised " License
version 0.17 .1
"""
import warnings

import numpy as np
import scipy.sparse as sp

from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
from sklearn.cluster import k_means_
from sklearn.cluster import _k_means
from sklearn.externals.joblib import Parallel
from sklearn.externals.joblib import delayed
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.utils.extmath import row_norms, squared_norm
from sklearn.utils.sparsefuncs import mean_variance_axis
from sklearn.utils import check_array
from sklearn.utils import check_random_state
from sklearn.utils import as_float_array
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.validation import FLOAT_DTYPES
class BalancedGroupsKMeans(BaseEstimator, ClusterMixin, TransformerMixin):
    """Balanced Groups K-Means clustering.

    Most of this is the scikit-learn KMeans implementation; the balanced
    groups logic lives in `_labels_inertia_precompute_dense()`, which follows
    the steps laid out in the ELKI same-size k-means variation tutorial:
    https://elki-project.github.io/tutorial/same-size_k_means

    Please note that this implementation only works with scikit-learn 0.17.x,
    as later versions introduced changes that break this implementation.

    Parameters
    ----------
    n_clusters : int, optional, default: 8
        The number of clusters to form as well as the number of
        centroids to generate.

    max_cluster_size : int, list of int, or None, optional, default: None
        If None, all clusters get the same size,
        ceil(n_samples / n_clusters). If an int, no cluster may exceed that
        many samples. If a list, it gives one size per cluster; the sizes
        must sum to n_samples.

    max_iter : int, default: 300
        Maximum number of iterations of the k-means algorithm for a
        single run.

    n_init : int, default: 10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.

    init : {'k-means++', 'random' or an ndarray}
        Method for initialization, defaults to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape
        (n_clusters, n_features) and gives the initial centers.

    precompute_distances : {'auto', True, False}
        Precompute distances (faster but takes more memory).

        'auto' : do not precompute distances if n_samples * n_clusters > 12
        million. This corresponds to about 100 MB overhead per job using
        double precision.

        True : always precompute distances.

        False : never precompute distances.

    tol : float, default: 1e-4
        Relative tolerance with regards to inertia to declare convergence.

    n_jobs : int
        The number of jobs to use for the computation. This works by computing
        each of the n_init runs in parallel.

        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
        are used.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    verbose : int, default: 0
        Verbosity mode.

    copy_x : boolean, default: True
        When pre-computing distances it is more numerically accurate to center
        the data first. If copy_x is True, then the original data is not
        modified. If False, the original data is modified, and put back before
        the function returns, but small numerical differences may be
        introduced by subtracting and then adding the data mean.

    Attributes
    ----------
    cluster_centers_ : array, [n_clusters, n_features]
        Coordinates of cluster centers.

    labels_ :
        Labels of each point.

    inertia_ : float
        Sum of distances of samples to their closest cluster center.

    Notes
    -----
    The k-means problem is solved using Lloyd's algorithm.

    The average complexity is given by O(k n T), where n is the number of
    samples and T is the number of iterations.

    The worst case complexity is given by O(n^(k + 2/p)) with
    n = n_samples, p = n_features. (D. Arthur and S. Vassilvitskii,
    'How slow is the k-means method?' SoCG 2006)

    In practice, the k-means algorithm is very fast (one of the fastest
    clustering algorithms available), but it falls into local minima. That's
    why it can be useful to restart it several times.

    See also
    --------
    MiniBatchKMeans:
        Alternative online implementation that does incremental updates
        of the center positions using mini-batches.
        For large scale learning (say n_samples > 10k) MiniBatchKMeans is
        probably much faster than the default batch implementation.
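
    Examples
    --------
    A minimal usage sketch. The import path for this module is an assumption
    and depends on where the file lives; here the class is used directly:

    >>> import numpy as np
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [4, 2], [4, 4], [4, 0]])
    >>> clf = BalancedGroupsKMeans(n_clusters=2, random_state=0)
    >>> clf.fit(X)                            # doctest: +ELLIPSIS
    BalancedGroupsKMeans(...)
    >>> sorted(np.bincount(clf.labels_))      # ceil(6 / 2) = 3 per cluster
    [3, 3]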
"""
    def __init__(self, n_clusters=8, max_cluster_size=None,
                 init='k-means++', n_init=10, max_iter=300,
                 tol=1e-4, precompute_distances='auto',
                 verbose=0, random_state=None, copy_x=True, n_jobs=1):

        self.n_clusters = n_clusters
        self.max_cluster_size = max_cluster_size
        self.init = init
        self.max_iter = max_iter
        self.tol = tol
        self.precompute_distances = precompute_distances
        self.n_init = n_init
        self.verbose = verbose
        self.random_state = random_state
        self.copy_x = copy_x
        self.n_jobs = n_jobs
        self.cluster_centers_ = None
        self.labels_ = None
    def _check_fit_data(self, X):
        """Verify that the number of samples given is larger than k."""
        X = check_array(X, accept_sparse='csr', dtype=np.float64)
        if X.shape[0] < self.n_clusters:
            raise ValueError("n_samples=%d should be >= n_clusters=%d" % (
                X.shape[0], self.n_clusters))
        return X

    def _check_test_data(self, X):
        X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES,
                        warn_on_dtype=True)
        n_samples, n_features = X.shape
        expected_n_features = self.cluster_centers_.shape[1]
        if not n_features == expected_n_features:
            raise ValueError("Incorrect number of features. "
                             "Got %d features, expected %d" % (
                                 n_features, expected_n_features))
        return X

    def get_cluster_labels(self):
        return self.labels_

    def get_cluster_centers(self):
        return self.cluster_centers_

    def get_inertia(self):
        return self.inertia_
    def fit(self, X, y=None, values=None):
        """Compute k-means clustering.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)

        values : array-like, shape (n_samples,), optional
            Per-sample weights. When given, cluster size limits are enforced
            on the sum of the weights of the members rather than on the
            member count.
        """
        random_state = check_random_state(self.random_state)
        X = self._check_fit_data(X)

        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
            k_means(
                X, n_clusters=self.n_clusters,
                max_cluster_size=self.max_cluster_size,
                init=self.init, n_init=self.n_init, max_iter=self.max_iter,
                verbose=self.verbose, return_n_iter=True,
                precompute_distances=self.precompute_distances,
                tol=self.tol, random_state=random_state, copy_x=self.copy_x,
                n_jobs=self.n_jobs, values=values)
        return self

    def fit_predict(self, X, y=None, values=None):
        """Compute cluster centers and predict cluster index for each sample.

        Convenience method; equivalent to calling fit(X) followed by
        predict(X).
        """
        return self.fit(X, values=values).labels_
    def fit_transform(self, X, y=None):
        """Compute clustering and transform X to cluster-distance space.

        Equivalent to fit(X).transform(X), but more efficiently implemented.
        """
        # Currently, this just skips a copy of the data if it is not in
        # np.array or CSR format already.
        # XXX This skips _check_test_data, which may change the dtype;
        # we should refactor the input validation.
        X = self._check_fit_data(X)
        return self.fit(X)._transform(X)

    def transform(self, X, y=None):
        """Transform X to a cluster-distance space.

        In the new space, each dimension is the distance to the cluster
        centers. Note that even if X is sparse, the array returned by
        `transform` will typically be dense.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data to transform.

        Returns
        -------
        X_new : array, shape [n_samples, k]
            X transformed in the new space.
        """
        check_is_fitted(self, 'cluster_centers_')

        X = self._check_test_data(X)
        return self._transform(X)

    def _transform(self, X):
        """Guts of transform method; no input validation."""
        return euclidean_distances(X, self.cluster_centers_)
    def predict(self, X, values=None):
        """Predict the closest cluster each sample in X belongs to.

        In the vector quantization literature, `cluster_centers_` is called
        the code book and each value returned by `predict` is the index of
        the closest code in the code book.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data to predict.

        Returns
        -------
        labels : array, shape [n_samples,]
            Index of the cluster each sample belongs to.
        """
        check_is_fitted(self, 'cluster_centers_')

        X = self._check_test_data(X)
        x_squared_norms = row_norms(X, squared=True)
        return _labels_inertia(X, x_squared_norms, self.cluster_centers_,
                               self.max_cluster_size, values=values)[0]

    def score(self, X, y=None, values=None):
        """Opposite of the value of X on the K-means objective.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data.

        Returns
        -------
        score : float
            Opposite of the value of X on the K-means objective.
        """
        check_is_fitted(self, 'cluster_centers_')

        X = self._check_test_data(X)
        x_squared_norms = row_norms(X, squared=True)
        return -_labels_inertia(X, x_squared_norms, self.cluster_centers_,
                                self.max_cluster_size, values=values)[1]
def k_means(X, n_clusters, max_cluster_size=None, init='k-means++',
            precompute_distances='auto', n_init=10, max_iter=300,
            verbose=False, tol=1e-4, random_state=None, copy_x=True,
            n_jobs=1, return_n_iter=False, values=None):
    """K-means clustering algorithm.

    Read more in the :ref:`User Guide <k_means>`.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        The observations to cluster.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    max_cluster_size : int, list of int, or None, optional, default: None
        Cluster size limit: a single maximum size, one size per cluster,
        or None for equal-size clusters.

    max_iter : int, optional, default: 300
        Maximum number of iterations of the k-means algorithm to run.

    n_init : int, optional, default: 10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.

    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, defaults to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': generate k centroids from a Gaussian with mean and
        variance estimated from the data.

        If an ndarray is passed, it should be of shape
        (n_clusters, n_features) and gives the initial centers.

        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.

    precompute_distances : {'auto', True, False}
        Precompute distances (faster but takes more memory).

        'auto' : do not precompute distances if n_samples * n_clusters > 12
        million. This corresponds to about 100 MB overhead per job using
        double precision.

        True : always precompute distances.

        False : never precompute distances.

    tol : float, optional
        The relative increment in the results before declaring convergence.

    verbose : boolean, optional
        Verbosity mode.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    copy_x : boolean, optional
        When pre-computing distances it is more numerically accurate to center
        the data first. If copy_x is True, then the original data is not
        modified. If False, the original data is modified, and put back before
        the function returns, but small numerical differences may be
        introduced by subtracting and then adding the data mean.

    n_jobs : int
        The number of jobs to use for the computation. This works by computing
        each of the n_init runs in parallel.

        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
        are used.

    return_n_iter : bool, optional
        Whether or not to return the number of iterations.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).

    best_n_iter : int
        Number of iterations corresponding to the best results.
        Returned only if `return_n_iter` is set to True.
    """
    if n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % n_init)
    random_state = check_random_state(random_state)

    if max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % max_iter)

    best_inertia = np.infty
    X = as_float_array(X, copy=copy_x)
    tol = _tolerance(X, tol)

    # If the distances are precomputed every job will create a matrix of shape
    # (n_clusters, n_samples). To stop KMeans from eating up memory we only
    # activate this if the created matrix is guaranteed to be under 100MB. 12
    # million entries consume a little under 100MB if they are of type double.
    if precompute_distances == 'auto':
        n_samples = X.shape[0]
        precompute_distances = (n_clusters * n_samples) < 12e6
    elif isinstance(precompute_distances, bool):
        pass
    else:
        raise ValueError("precompute_distances should be 'auto' or True/False"
                         ", but a value of %r was passed" %
                         precompute_distances)

    # subtract mean of X for more accurate distance computations
    if not sp.issparse(X) or hasattr(init, '__array__'):
        X_mean = X.mean(axis=0)
    if not sp.issparse(X):
        # The copy was already done above
        X -= X_mean

    if hasattr(init, '__array__'):
        init = check_array(init, dtype=np.float64, copy=True)
        _validate_center_shape(X, n_clusters, init)

        init -= X_mean
        if n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in k-means instead of n_init=%d'
                % n_init, RuntimeWarning, stacklevel=2)
            n_init = 1

    # precompute squared norms of data points
    x_squared_norms = row_norms(X, squared=True)

    best_labels, best_inertia, best_centers = None, None, None

    if n_jobs == 1:
        # For a single thread, less memory is needed if we just store one set
        # of the best results (as opposed to one set per run per thread).
        for it in range(n_init):
            # run a k-means once
            labels, inertia, centers, n_iter_ = _kmeans_single(
                X, n_clusters, max_cluster_size, max_iter=max_iter, init=init,
                verbose=verbose, precompute_distances=precompute_distances,
                tol=tol, x_squared_norms=x_squared_norms,
                random_state=random_state, values=values)
            # determine if these results are the best so far
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia
                best_n_iter = n_iter_
    else:
        # parallelisation of k-means runs
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(_kmeans_single)(X, n_clusters, max_cluster_size,
                                    max_iter=max_iter, init=init,
                                    verbose=verbose, tol=tol,
                                    precompute_distances=precompute_distances,
                                    x_squared_norms=x_squared_norms,
                                    # Change seed to ensure variety
                                    random_state=seed,
                                    values=values)
            for seed in seeds)
        # Get results with the lowest inertia
        labels, inertia, centers, n_iters = zip(*results)
        best = np.argmin(inertia)
        best_labels = labels[best]
        best_inertia = inertia[best]
        best_centers = centers[best]
        best_n_iter = n_iters[best]

    if not sp.issparse(X):
        if not copy_x:
            X += X_mean
        best_centers += X_mean

    if return_n_iter:
        return best_centers, best_labels, best_inertia, best_n_iter
    else:
        return best_centers, best_labels, best_inertia
def _kmeans_single(X, n_clusters, max_cluster_size, x_squared_norms,
                   max_iter=300, init='k-means++', verbose=False,
                   random_state=None, tol=1e-4, precompute_distances=True,
                   values=None):
    """A single run of k-means; assumes preparation completed prior.

    Parameters
    ----------
    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    max_iter : int, optional, default: 300
        Maximum number of iterations of the k-means algorithm to run.

    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, defaults to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': generate k centroids from a Gaussian with mean and
        variance estimated from the data.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.

    tol : float, optional
        The relative increment in the results before declaring convergence.

    verbose : boolean, optional
        Verbosity mode.

    x_squared_norms : array
        Precomputed x_squared_norms.

    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).

    n_iter : int
        Number of iterations run.
    """
    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = k_means_._init_centroids(X, n_clusters, init,
                                       random_state=random_state,
                                       x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=np.float64)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = \
            _labels_inertia(X, x_squared_norms, centers, max_cluster_size,
                            precompute_distances=precompute_distances,
                            distances=distances, values=values)

        # computation of the means is also called the M-step of EM
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X, labels, n_clusters,
                                               distances)
        else:
            centers = _k_means._centers_dense(X, labels, n_clusters,
                                              distances)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        shift = squared_norm(centers_old - centers)
        if shift <= tol:
            if verbose:
                print("Converged at iteration %d" % i)
            break

    if shift > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers,
                            max_cluster_size,
                            precompute_distances=precompute_distances,
                            distances=distances, values=values)

    return best_labels, best_inertia, best_centers, i + 1
def _validate_center_shape(X, n_centers, centers):
    """Check if centers is compatible with X and n_centers."""
    if len(centers) != n_centers:
        raise ValueError('The shape of the initial centers (%s) '
                         'does not match the number of clusters %i'
                         % (centers.shape, n_centers))
    if centers.shape[1] != X.shape[1]:
        raise ValueError(
            "The number of features of the initial centers %s "
            "does not match the number of features of the data %s."
            % (centers.shape[1], X.shape[1]))


def _tolerance(X, tol):
    """Return a tolerance which is independent of the dataset."""
    if sp.issparse(X):
        variances = mean_variance_axis(X, axis=0)[1]
    else:
        variances = np.var(X, axis=0)
    return np.mean(variances) * tol
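
# For example, with tol=1e-4 and data whose features have a mean variance of
# 2.0, _tolerance returns 2e-4: in _kmeans_single above, convergence is then
# declared once the squared shift of the centers drops to that scale.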
def _labels_inertia(X, x_squared_norms, centers, max_cluster_size,
                    precompute_distances=True, distances=None, values=None):
    """E step of the K-means EM algorithm.

    Compute the labels and the inertia of the given samples and centers.
    This will compute the distances in-place.

    Parameters
    ----------
    X : float64 array-like or CSR sparse matrix, shape (n_samples, n_features)
        The input samples to assign to the labels.

    x_squared_norms : array, shape (n_samples,)
        Precomputed squared euclidean norm of each data point, to speed up
        computations.

    centers : float64 array, shape (k, n_features)
        The cluster centers.

    max_cluster_size : int, list of int, or None
        Cluster size limit, forwarded to the balanced assignment step.

    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).

    distances : float64 array, shape (n_samples,)
        Pre-allocated array to be filled in with each sample's distance
        to the closest center.

    Returns
    -------
    labels : int array of shape (n,)
        The resulting assignment.

    inertia : float
        Sum of distances of samples to their closest cluster center.
    """
    n_samples = X.shape[0]
    # set the default value of centers to -1 to be able to detect any anomaly
    # easily
    labels = -np.ones(n_samples, np.int32)
    if distances is None:
        distances = np.zeros(shape=(0,), dtype=np.float64)
    # distances will be changed in-place
    if sp.issparse(X):
        inertia = k_means_._k_means._assign_labels_csr(
            X, x_squared_norms, centers, labels, distances=distances)
    else:
        if precompute_distances:
            return _labels_inertia_precompute_dense(X, x_squared_norms,
                                                    centers, distances,
                                                    max_cluster_size,
                                                    values=values)
        inertia = k_means_._k_means._assign_labels_array(
            X, x_squared_norms, centers, labels, distances=distances)
    return labels, inertia
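
# Note: the balanced-groups constraint is only enforced on the dense,
# precomputed-distances path below (_labels_inertia_precompute_dense). For
# sparse input, or when precompute_distances=False, _labels_inertia above
# falls back to the plain k-means Cython assignment and silently ignores
# max_cluster_size and values.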
def _labels_inertia_precompute_dense(X, x_squared_norms, centers, distances,
                                     max_cluster_size, values=None):
    """Compute labels and inertia using a full distance matrix.

    This will overwrite the 'distances' array in-place.

    Parameters
    ----------
    X : numpy array, shape (n_samples, n_features)
        Input data.

    x_squared_norms : numpy array, shape (n_samples,)
        Precomputed squared norms of X.

    centers : numpy array, shape (n_clusters, n_features)
        Cluster centers which data is assigned to.

    distances : numpy array, shape (n_samples,)
        Pre-allocated array in which distances are stored.

    max_cluster_size : int, list of int, or None
        Maximum cluster size(s). An int caps every cluster at that size; a
        list gives one cap per cluster and must sum to n_samples; None means
        equal-size clusters of ceil(n_samples / n_clusters).

    values : numpy array, shape (n_samples,), optional
        Per-sample weights; when given, cluster fullness is measured as the
        sum of member weights instead of the member count.

    Returns
    -------
    labels : numpy array, dtype=np.int32, shape (n_samples,)
        Indices of clusters that samples are assigned to.

    inertia : float
        Sum of distances of samples to their closest cluster center.
    """
    n_samples = X.shape[0]
    k = centers.shape[0]
    all_distances = euclidean_distances(centers, X, x_squared_norms,
                                        squared=True)
    labels = np.empty(n_samples, dtype=np.int32)
    labels.fill(-1)
    mindist = np.empty(n_samples)
    mindist.fill(np.infty)

    if isinstance(max_cluster_size, int) and max_cluster_size * k < n_samples:
        raise ValueError("max_cluster_size * k must be >= n_samples")
    elif isinstance(max_cluster_size, list):
        if sum(max_cluster_size) != n_samples:
            raise ValueError("sum of max_cluster_size must equal n_samples")
        elif len(max_cluster_size) != k:
            raise ValueError("len of max_cluster_size must equal k")

    if not max_cluster_size:
        max_cluster_size = get_clusters_size(n_samples, k)

    # initial assignment: greedily place each point in its closest
    # non-full cluster
    labels, mindist = initial_assignment(labels, mindist, n_samples,
                                         all_distances, max_cluster_size,
                                         values=values)

    # refinement of clustering
    transfer_list = []
    best_mindist = mindist.copy()
    best_labels = labels.copy()

    # sort all of the points from largest distance to smallest
    points_by_high_distance = np.argsort(mindist)[::-1]
    for point in points_by_high_distance:
        point_cluster = labels[point]

        # see if there is an opening in the best cluster for this point
        cluster_id, point_dist = get_best_cluster_for_point(point,
                                                            all_distances)
        if (not is_cluster_full(cluster_id, max_cluster_size, labels,
                                values=values)
                and point_cluster != cluster_id):
            labels[point] = cluster_id
            mindist[point] = point_dist
            best_labels = labels.copy()
            best_mindist = mindist.copy()
            continue  # on to the next point

        for swap_candidate in transfer_list:
            cand_cluster = labels[swap_candidate]
            if point_cluster != cand_cluster:
                # get the current dist of swap candidate
                cand_distance = mindist[swap_candidate]
                # get the potential dist of point
                point_distance = all_distances[cand_cluster, point]
                # compare
                if point_distance < cand_distance:
                    labels[point] = cand_cluster
                    mindist[point] = all_distances[cand_cluster, point]

                    labels[swap_candidate] = point_cluster
                    mindist[swap_candidate] = all_distances[point_cluster,
                                                            swap_candidate]

                    if (np.absolute(mindist).sum() <
                            np.absolute(best_mindist).sum()):
                        # update the labels since the transfer was a success
                        best_labels = labels.copy()
                        best_mindist = mindist.copy()
                        break
                    else:
                        # reset since the transfer was not a success
                        labels = best_labels.copy()
                        mindist = best_mindist.copy()

        transfer_list.append(point)

    if n_samples == distances.shape[0]:
        # distances will be changed in-place
        distances[:] = mindist

    inertia = best_mindist.sum()
    return best_labels, inertia
def get_best_cluster_for_point(point, all_distances):
    """Gets the best cluster by distance for a point.

    Parameters
    ----------
    point : int
        the point index

    Returns
    -------
    tuple
        (cluster_id, distance_from_cluster_center)
    """
    sorted_distances = get_best_point_distances(point, all_distances)
    cluster_id, point_dist = sorted_distances[0]
    return cluster_id, point_dist


def get_best_point_distances(point, all_distances):
    """Gets the clusters for a point, sorted by distance (best first).

    Parameters
    ----------
    point : int
        the point index

    Returns
    -------
    list of tuples sorted by point_dist
        example: [(cluster_id, point_dist), (cluster_id, point_dist)]
    """
    points_distances = all_distances[:, point]
    sorted_points = sort_adjust_row(points_distances)
    return sorted_points


def sort_adjust_row(points_distances):
    """Sorts the point's row of (cluster_id, distance) pairs from smallest
    to largest distance."""
    return sorted([(cluster_id, point_dist)
                   for cluster_id, point_dist in enumerate(points_distances)],
                  key=lambda x: x[1])


def is_cluster_full(cluster_id, max_cluster_size, labels, values=None):
    """Determines if a cluster is full.

    With per-cluster sizes (list), the cluster's own limit applies; with a
    single int, the same limit applies to every cluster. When `values` is
    given, fullness is measured as the sum of member weights rather than the
    member count.
    """
    if isinstance(max_cluster_size, list):
        mx_cs = max_cluster_size[cluster_id]
    else:
        mx_cs = max_cluster_size

    if values is not None:
        cluster_count = np.sum(values[np.where(labels == cluster_id)])
    else:
        cluster_count = len(np.where(labels == cluster_id)[0])

    is_full = cluster_count >= mx_cs
    return is_full
def get_clusters_size(n_samples, n_clusters):
    """Gets the number of members per cluster for equal groups k-means."""
    return (n_samples + n_clusters - 1) // n_clusters
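
# For example, get_clusters_size(10, 3) == 4: the ceiling division gives
# ceil(10 / 3) members per cluster, so 3 clusters of at most 4 members can
# hold all 10 samples while keeping the group sizes as equal as possible.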
def initial_assignment(labels, mindist, n_samples, all_distances,
                       max_cluster_size, values=None):
    """Initial assignment of labels and mindist."""
    all_points = np.arange(n_samples)
    for point in all_points:
        for point_dist in get_best_point_distances(point, all_distances):
            cluster_id, point_dist = point_dist
            # initial assignment: place the point in the closest cluster
            # that still has room
            if not is_cluster_full(cluster_id, max_cluster_size, labels,
                                   values):
                labels[point] = cluster_id
                mindist[point] = point_dist
                break
    return labels, mindist
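

# A minimal, self-contained sketch of the three balancing modes named in the
# module docstring. It is illustrative only and assumes scikit-learn 0.17.x
# is installed, as required above; the data and parameters are made up.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    X_demo = rng.rand(12, 2)

    # Same-size groups (max_cluster_size=None): every cluster gets
    # ceil(12 / 3) = 4 members.
    same_size = BalancedGroupsKMeans(n_clusters=3, random_state=0).fit(X_demo)
    print('same size  ', np.bincount(same_size.labels_))

    # Max-size groups: no cluster may exceed 5 members (5 * 3 >= 12).
    max_size = BalancedGroupsKMeans(n_clusters=3, max_cluster_size=5,
                                    random_state=0).fit(X_demo)
    print('max size   ', np.bincount(max_size.labels_))

    # Custom sizes: one cap per cluster, summing to n_samples (6 + 4 + 2 = 12).
    custom = BalancedGroupsKMeans(n_clusters=3, max_cluster_size=[6, 4, 2],
                                  random_state=0).fit(X_demo)
    print('custom size', np.bincount(custom.labels_))

    # Weighted balancing: with `values`, a cluster is full once the sum of
    # its members' weights reaches the cap, so a cap of 5 with weight 0.5 per
    # sample admits up to 10 samples per cluster.
    weights = np.full(12, 0.5)
    weighted = BalancedGroupsKMeans(n_clusters=3, max_cluster_size=5,
                                    random_state=0)
    weighted.fit(X_demo, values=weights)
    print('weighted   ', np.bincount(weighted.labels_))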