initial commit for balanced k-means implementation
parent 0194c036f6
commit 9b7eda798c
@ -10,6 +10,15 @@ RETURNS table (cartodb_id integer, cluster_no integer) as $$
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;


CREATE OR REPLACE FUNCTION CDB_KMeansBalanced(query text, no_clusters integer, value_col TEXT DEFAULT NULL, no_init integer DEFAULT 20, max_per_cluster float DEFAULT NULL)
RETURNS table (cartodb_id integer, cluster_no integer) as $$

from crankshaft.clustering import Kmeans
kmeans = Kmeans()
return kmeans.spatial_balanced(query, no_clusters, no_init, max_per_cluster, value_col)

$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;


CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(state Numeric[], the_geom GEOMETRY(Point, 4326), weight NUMERIC)
RETURNS Numeric[] AS
$$
@ -66,6 +66,24 @@ class AnalysisDataProvider:
        except plpy.SPIError, err:
            plpy.error('Analysis failed: %s' % err)

    def get_spatial_balanced_kmeans(self, params):
        """fetch data for spatially balanced k-means"""

        if params.get('value_column') is None:
            params['value_column'] = 1

        query = ("SELECT "
                 "array_agg({id_col} ORDER BY {id_col}) As ids, "
                 "array_agg(ST_X({geom_col}) ORDER BY {id_col}) As xs, "
                 "array_agg(ST_Y({geom_col}) ORDER BY {id_col}) As ys, "
                 "array_agg({value_column} ORDER BY {id_col}) As values "
                 "FROM ({subquery}) As a "
                 "WHERE {geom_col} IS NOT NULL").format(**params)
        try:
            data = plpy.execute(query)
            return data
        except plpy.SPIError, err:
            plpy.error('Analysis failed: %s' % err)

    def get_gwr(self, params):
        """fetch data for gwr analysis"""
        query = pu.gwr_query(params)
836 src/py/crankshaft/crankshaft/clustering/balanced_kmeans.py Normal file
@ -0,0 +1,836 @@
"""
Balanced groups K-Means clustering (same size | max size | custom sizes)
utilizing the scikit-learn API and related utilities.

BSD 3-Clause License

Copyright (c) 2017, Cayetano Benavent, José Manuel Camacho
(Balanced-Sizes-K-Means Implementation)
Copyright (c) 2017, Nathan Danielsen (Same-Size-K-Means Implementation)
Copyright (c) 2007–2017 The scikit-learn developers.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

BSD 3-clause "New" or "Revised" License

version 0.17.1
"""


import warnings

import numpy as np
import scipy.sparse as sp

from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
from sklearn.cluster import k_means_
from sklearn.cluster import _k_means
from sklearn.externals.joblib import Parallel
from sklearn.externals.joblib import delayed
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.utils.extmath import row_norms, squared_norm
from sklearn.utils.sparsefuncs import mean_variance_axis
from sklearn.utils import check_array
from sklearn.utils import check_random_state
from sklearn.utils import as_float_array
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.validation import FLOAT_DTYPES


class BalancedGroupsKMeans(BaseEstimator, ClusterMixin, TransformerMixin):
    """Balanced Groups K-Means clustering

    90 percent of this is the KMeans implementation, with the equal-groups
    logic located in `_labels_inertia_precompute_dense()`, which follows the
    steps laid out in the Elki Same-size k-Means Variation tutorial.

    https://elki-project.github.io/tutorial/same-size_k_means

    Please note that this implementation only works with scikit-learn 0.17.x,
    as later versions introduce breaking changes that are incompatible with
    this implementation.

    Parameters
    ----------
    n_clusters : int, optional, default: 8
        The number of clusters to form as well as the number of
        centroids to generate.
    max_cluster_size : int, list of ints or None, optional, default: None
        Maximum size of each cluster. An int sets one cap for every cluster,
        a list sets a per-cluster cap, and None falls back to equal group
        sizes of ceil(n_samples / n_clusters).
    max_iter : int, default: 300
        Maximum number of iterations of the k-means algorithm for a
        single run.
    n_init : int, default: 10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final result will be the best output of
        n_init consecutive runs in terms of inertia.
    init : {'k-means++', 'random' or an ndarray}
        Method for initialization, defaults to 'k-means++':
        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.
        'random': choose k observations (rows) at random from data for
        the initial centroids.
        If an ndarray is passed, it should be of shape (n_clusters, n_features)
        and gives the initial centers.
    precompute_distances : {'auto', True, False}
        Precompute distances (faster but takes more memory).
        'auto' : do not precompute distances if n_samples * n_clusters > 12
        million. This corresponds to about 100MB overhead per job using
        double precision.
        True : always precompute distances
        False : never precompute distances
    tol : float, default: 1e-4
        Relative tolerance with regards to inertia to declare convergence.
    n_jobs : int
        The number of jobs to use for the computation. This works by computing
        each of the n_init runs in parallel.
        If -1, all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
        are used.
    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.
    verbose : int, default 0
        Verbosity mode.
    copy_x : boolean, default True
        When pre-computing distances it is more numerically accurate to center
        the data first. If copy_x is True, then the original data is not
        modified. If False, the original data is modified, and put back before
        the function returns, but small numerical differences may be introduced
        by subtracting and then adding the data mean.

    Attributes
    ----------
    cluster_centers_ : array, [n_clusters, n_features]
        Coordinates of cluster centers
    labels_ :
        Labels of each point
    inertia_ : float
        Sum of distances of samples to their closest cluster center.

    Notes
    -----
    The k-means problem is solved using Lloyd's algorithm.
    The average complexity is given by O(k n T), where n is the number of
    samples and T is the number of iterations.
    The worst case complexity is given by O(n^(k+2/p)) with
    n = n_samples, p = n_features. (D. Arthur and S. Vassilvitskii,
    'How slow is the k-means method?' SoCG2006)
    In practice, the k-means algorithm is very fast (one of the fastest
    clustering algorithms available), but it falls into local minima. That's
    why it can be useful to restart it several times.

    See also
    --------
    MiniBatchKMeans:
        Alternative online implementation that does incremental updates
        of the centers positions using mini-batches.
        For large scale learning (say n_samples > 10k), MiniBatchKMeans is
        probably much faster than the default batch implementation.
    """

    def __init__(self, n_clusters=8, max_cluster_size=None,
                 init='k-means++', n_init=10, max_iter=300,
                 tol=1e-4, precompute_distances='auto',
                 verbose=0, random_state=None, copy_x=True, n_jobs=1):

        self.n_clusters = n_clusters
        self.max_cluster_size = max_cluster_size
        self.init = init
        self.max_iter = max_iter
        self.tol = tol
        self.precompute_distances = precompute_distances
        self.n_init = n_init
        self.verbose = verbose
        self.random_state = random_state
        self.copy_x = copy_x
        self.n_jobs = n_jobs
        self.cluster_centers_ = None
        self.labels_ = None

    def _check_fit_data(self, X):
        """Verify that the number of samples given is larger than k"""
        X = check_array(X, accept_sparse='csr', dtype=np.float64)
        if X.shape[0] < self.n_clusters:
            raise ValueError("n_samples=%d should be >= n_clusters=%d" % (
                X.shape[0], self.n_clusters))
        return X

    def _check_test_data(self, X):
        X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES,
                        warn_on_dtype=True)
        n_samples, n_features = X.shape
        expected_n_features = self.cluster_centers_.shape[1]
        if n_features != expected_n_features:
            raise ValueError("Incorrect number of features. "
                             "Got %d features, expected %d" % (
                                 n_features, expected_n_features))

        return X

    def get_cluster_labels(self):
        return self.labels_

    def get_cluster_centers(self):
        return self.cluster_centers_

    def get_inertia(self):
        return self.inertia_

    def fit(self, X, y=None, values=None):
        """Compute k-means clustering.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
        """
        random_state = check_random_state(self.random_state)
        X = self._check_fit_data(X)

        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
            k_means(
                X, n_clusters=self.n_clusters,
                max_cluster_size=self.max_cluster_size,
                init=self.init, n_init=self.n_init, max_iter=self.max_iter,
                verbose=self.verbose, return_n_iter=True,
                precompute_distances=self.precompute_distances,
                tol=self.tol, random_state=random_state, copy_x=self.copy_x,
                n_jobs=self.n_jobs, values=values)
        return self

    def fit_predict(self, X, y=None, values=None):
        """Compute cluster centers and predict cluster index for each sample.

        Convenience method; equivalent to calling fit(X) followed by
        predict(X).
        """
        return self.fit(X, values=values).labels_

    def fit_transform(self, X, y=None):
        """Compute clustering and transform X to cluster-distance space.

        Equivalent to fit(X).transform(X), but more efficiently implemented.
        """
        # Currently, this just skips a copy of the data if it is not in
        # np.array or CSR format already.
        # XXX This skips _check_test_data, which may change the dtype;
        # we should refactor the input validation.
        X = self._check_fit_data(X)
        return self.fit(X)._transform(X)

    def transform(self, X, y=None):
        """Transform X to a cluster-distance space.

        In the new space, each dimension is the distance to the cluster
        centers. Note that even if X is sparse, the array returned by
        `transform` will typically be dense.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data to transform.

        Returns
        -------
        X_new : array, shape [n_samples, k]
            X transformed in the new space.
        """
        check_is_fitted(self, 'cluster_centers_')

        X = self._check_test_data(X)
        return self._transform(X)

    def _transform(self, X):
        """guts of transform method; no input validation"""
        return euclidean_distances(X, self.cluster_centers_)

    def predict(self, X, values=None):
        """Predict the closest cluster each sample in X belongs to.

        In the vector quantization literature, `cluster_centers_` is called
        the code book and each value returned by `predict` is the index of
        the closest code in the code book.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data to predict.

        Returns
        -------
        labels : array, shape [n_samples,]
            Index of the cluster each sample belongs to.
        """
        check_is_fitted(self, 'cluster_centers_')

        X = self._check_test_data(X)
        x_squared_norms = row_norms(X, squared=True)
        return _labels_inertia(X, x_squared_norms, self.cluster_centers_,
                               self.max_cluster_size, values=values)[0]

    def score(self, X, y=None, values=None):
        """Opposite of the value of X on the K-means objective.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data.

        Returns
        -------
        score : float
            Opposite of the value of X on the K-means objective.
        """
        check_is_fitted(self, 'cluster_centers_')

        X = self._check_test_data(X)
        x_squared_norms = row_norms(X, squared=True)
        return -_labels_inertia(X, x_squared_norms, self.cluster_centers_,
                                self.max_cluster_size, values=values)[1]


def k_means(X, n_clusters, max_cluster_size=None, init='k-means++',
            precompute_distances='auto', n_init=10, max_iter=300,
            verbose=False, tol=1e-4, random_state=None, copy_x=True,
            n_jobs=1, return_n_iter=False, values=None):
    """K-means clustering algorithm.

    Read more in the :ref:`User Guide <k_means>`.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        The observations to cluster.
    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.
    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.
    n_init : int, optional, default: 10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final result will be the best output of
        n_init consecutive runs in terms of inertia.
    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, defaults to 'k-means++':
        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.
        'random': generate k centroids from a Gaussian with mean and
        variance estimated from the data.
        If an ndarray is passed, it should be of shape (n_clusters, n_features)
        and gives the initial centers.
        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.
    precompute_distances : {'auto', True, False}
        Precompute distances (faster but takes more memory).
        'auto' : do not precompute distances if n_samples * n_clusters > 12
        million. This corresponds to about 100MB overhead per job using
        double precision.
        True : always precompute distances
        False : never precompute distances
    tol : float, optional
        The relative increment in the results before declaring convergence.
    verbose : boolean, optional
        Verbosity mode.
    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.
    copy_x : boolean, optional
        When pre-computing distances it is more numerically accurate to center
        the data first. If copy_x is True, then the original data is not
        modified. If False, the original data is modified, and put back before
        the function returns, but small numerical differences may be introduced
        by subtracting and then adding the data mean.
    n_jobs : int
        The number of jobs to use for the computation. This works by computing
        each of the n_init runs in parallel.
        If -1, all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
        are used.
    return_n_iter : bool, optional
        Whether or not to return the number of iterations.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.
    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.
    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).
    best_n_iter : int
        Number of iterations corresponding to the best results.
        Returned only if `return_n_iter` is set to True.
    """
    if n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % n_init)
    random_state = check_random_state(random_state)

    if max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % max_iter)

    best_inertia = np.infty
    X = as_float_array(X, copy=copy_x)
    tol = _tolerance(X, tol)

    # If the distances are precomputed every job will create a matrix of shape
    # (n_clusters, n_samples). To stop KMeans from eating up memory we only
    # activate this if the created matrix is guaranteed to be under 100MB. 12
    # million entries consume a little under 100MB if they are of type double.
    if precompute_distances == 'auto':
        n_samples = X.shape[0]
        precompute_distances = (n_clusters * n_samples) < 12e6
    elif isinstance(precompute_distances, bool):
        pass
    else:
        raise ValueError("precompute_distances should be 'auto' or True/False"
                         ", but a value of %r was passed" %
                         precompute_distances)

    # subtract the mean of X for more accurate distance computations
    if not sp.issparse(X) or hasattr(init, '__array__'):
        X_mean = X.mean(axis=0)
    if not sp.issparse(X):
        # The copy was already done above
        X -= X_mean

    if hasattr(init, '__array__'):
        init = check_array(init, dtype=np.float64, copy=True)
        _validate_center_shape(X, n_clusters, init)

        init -= X_mean
        if n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in k-means instead of n_init=%d'
                % n_init, RuntimeWarning, stacklevel=2)
            n_init = 1

    # precompute squared norms of data points
    x_squared_norms = row_norms(X, squared=True)

    best_labels, best_inertia, best_centers = None, None, None
    if n_jobs == 1:
        # For a single thread, less memory is needed if we just store one set
        # of the best results (as opposed to one set per run per thread).
        for it in range(n_init):
            # run a k-means once
            labels, inertia, centers, n_iter_ = _kmeans_single(
                X, n_clusters, max_cluster_size, max_iter=max_iter, init=init,
                verbose=verbose, precompute_distances=precompute_distances,
                tol=tol, x_squared_norms=x_squared_norms,
                random_state=random_state, values=values)
            # determine if these results are the best so far
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia
                best_n_iter = n_iter_
    else:
        # parallelisation of k-means runs
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(_kmeans_single)(X, n_clusters, max_cluster_size,
                                    max_iter=max_iter, init=init,
                                    verbose=verbose, tol=tol,
                                    precompute_distances=precompute_distances,
                                    x_squared_norms=x_squared_norms,
                                    # Change seed to ensure variety
                                    random_state=seed,
                                    values=values)
            for seed in seeds)
        # Get results with the lowest inertia
        labels, inertia, centers, n_iters = zip(*results)
        best = np.argmin(inertia)
        best_labels = labels[best]
        best_inertia = inertia[best]
        best_centers = centers[best]
        best_n_iter = n_iters[best]

    if not sp.issparse(X):
        if not copy_x:
            X += X_mean
        best_centers += X_mean

    if return_n_iter:
        return best_centers, best_labels, best_inertia, best_n_iter
    else:
        return best_centers, best_labels, best_inertia


def _kmeans_single(X, n_clusters, max_cluster_size, x_squared_norms,
                   max_iter=300, init='k-means++', verbose=False,
                   random_state=None, tol=1e-4, precompute_distances=True,
                   values=None):
    """A single run of k-means, assumes preparation completed prior.

    Parameters
    ----------
    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.
    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.
    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.
    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, defaults to 'k-means++':
        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.
        'random': generate k centroids from a Gaussian with mean and
        variance estimated from the data.
        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.
        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.
    tol : float, optional
        The relative increment in the results before declaring convergence.
    verbose : boolean, optional
        Verbosity mode
    x_squared_norms : array
        Precomputed x_squared_norms.
    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).
    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.
    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.
    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).
    n_iter : int
        Number of iterations run.
    """
    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = k_means_._init_centroids(X, n_clusters, init,
                                       random_state=random_state,
                                       x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closest center, for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=np.float64)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = \
            _labels_inertia(X, x_squared_norms, centers, max_cluster_size,
                            precompute_distances=precompute_distances,
                            distances=distances, values=values)

        # computation of the means is also called the M-step of EM
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X, labels, n_clusters,
                                               distances)
        else:
            centers = _k_means._centers_dense(X, labels, n_clusters,
                                              distances)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        shift = squared_norm(centers_old - centers)
        if shift <= tol:
            if verbose:
                print("Converged at iteration %d" % i)

            break

    if shift > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers,
                            max_cluster_size,
                            precompute_distances=precompute_distances,
                            distances=distances, values=values)

    return best_labels, best_inertia, best_centers, i + 1


def _validate_center_shape(X, n_centers, centers):
    """Check if centers is compatible with X and n_centers"""
    if len(centers) != n_centers:
        raise ValueError('The shape of the initial centers (%s) '
                         'does not match the number of clusters %i'
                         % (centers.shape, n_centers))
    if centers.shape[1] != X.shape[1]:
        raise ValueError(
            "The number of features of the initial centers %s "
            "does not match the number of features of the data %s."
            % (centers.shape[1], X.shape[1]))


def _tolerance(X, tol):
    """Return a tolerance which is independent of the dataset"""
    if sp.issparse(X):
        variances = mean_variance_axis(X, axis=0)[1]
    else:
        variances = np.var(X, axis=0)
    return np.mean(variances) * tol
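
# A small worked example (hypothetical numbers): for a dense X whose
# per-feature variances are [4.0, 2.0] and tol=1e-4, _tolerance returns
# mean([4.0, 2.0]) * 1e-4 == 3e-4, the absolute center-shift threshold
# used by _kmeans_single.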


def _labels_inertia(X, x_squared_norms, centers, max_cluster_size,
                    precompute_distances=True, distances=None, values=None):
    """E step of the K-means EM algorithm.

    Compute the labels and the inertia of the given samples and centers.
    This will compute the distances in-place.

    Parameters
    ----------
    X : float64 array-like or CSR sparse matrix, shape (n_samples, n_features)
        The input samples to assign to the labels.
    x_squared_norms : array, shape (n_samples,)
        Precomputed squared euclidean norm of each data point, to speed up
        computations.
    centers : float64 array, shape (k, n_features)
        The cluster centers.
    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).
    distances : float64 array, shape (n_samples,)
        Pre-allocated array to be filled in with each sample's distance
        to the closest center.

    Returns
    -------
    labels : int array of shape (n_samples,)
        The resulting assignment
    inertia : float
        Sum of distances of samples to their closest cluster center.
    """
    n_samples = X.shape[0]
    # set the default value of labels to -1 to be able to detect any anomaly
    # easily
    labels = -np.ones(n_samples, np.int32)
    if distances is None:
        distances = np.zeros(shape=(0,), dtype=np.float64)
    # distances will be changed in-place
    if sp.issparse(X):
        inertia = k_means_._k_means._assign_labels_csr(
            X, x_squared_norms, centers, labels, distances=distances)
    else:
        if precompute_distances:
            return _labels_inertia_precompute_dense(X, x_squared_norms,
                                                    centers, distances,
                                                    max_cluster_size,
                                                    values=values)
        inertia = k_means_._k_means._assign_labels_array(
            X, x_squared_norms, centers, labels, distances=distances)
    return labels, inertia


def _labels_inertia_precompute_dense(X, x_squared_norms, centers, distances,
                                     max_cluster_size, values=None):
    """Compute labels and inertia using a full distance matrix.

    This will overwrite the 'distances' array in-place.

    Parameters
    ----------
    X : numpy array, shape (n_samples, n_features)
        Input data.
    x_squared_norms : numpy array, shape (n_samples,)
        Precomputed squared norms of X.
    centers : numpy array, shape (n_clusters, n_features)
        Cluster centers which data is assigned to.
    distances : numpy array, shape (n_samples,)
        Pre-allocated array in which distances are stored.
    max_cluster_size : int, list of ints or None
        Capacity constraint per cluster: an int caps every cluster, a list
        caps each cluster individually, and None falls back to equal group
        sizes.

    Returns
    -------
    labels : numpy array, dtype=np.int, shape (n_samples,)
        Indices of clusters that samples are assigned to.
    inertia : float
        Sum of distances of samples to their closest cluster center.
    """
    n_samples = X.shape[0]
    k = centers.shape[0]
    all_distances = euclidean_distances(centers, X, x_squared_norms,
                                        squared=True)
    labels = np.empty(n_samples, dtype=np.int32)
    labels.fill(-1)
    mindist = np.empty(n_samples)
    mindist.fill(np.infty)

    if isinstance(max_cluster_size, int) and max_cluster_size * k < n_samples:
        raise ValueError("max_cluster_size * k must be greater than n_samples")
    elif isinstance(max_cluster_size, list):
        if sum(max_cluster_size) != n_samples:
            raise ValueError("the sum of max_cluster_size must equal n_samples")
        elif len(max_cluster_size) != k:
            raise ValueError("the length of max_cluster_size must equal k")

    if not max_cluster_size:
        max_cluster_size = get_clusters_size(n_samples, k)

    labels, mindist = initial_assignment(labels, mindist, n_samples,
                                         all_distances, max_cluster_size,
                                         values=values)

    # refinement of clustering
    transfer_list = []
    best_mindist = mindist.copy()
    best_labels = labels.copy()
    # sort all of the points from largest distance to smallest
    points_by_high_distance = np.argsort(mindist)[::-1]
    for point in points_by_high_distance:
        point_cluster = labels[point]

        # see if there is an opening on the best cluster for this point
        cluster_id, point_dist = get_best_cluster_for_point(point,
                                                            all_distances)
        if (not is_cluster_full(cluster_id, max_cluster_size, labels,
                                values=values)
                and point_cluster != cluster_id):
            labels[point] = cluster_id
            mindist[point] = point_dist
            best_labels = labels.copy()
            best_mindist = mindist.copy()
            continue  # on to the next point

        for swap_candidate in transfer_list:
            cand_cluster = labels[swap_candidate]
            if point_cluster != cand_cluster:

                # get the current dist of swap candidate
                cand_distance = mindist[swap_candidate]

                # get the potential dist of point
                point_distance = all_distances[cand_cluster, point]

                # compare
                if point_distance < cand_distance:

                    labels[point] = cand_cluster
                    mindist[point] = all_distances[cand_cluster, point]

                    labels[swap_candidate] = point_cluster
                    mindist[swap_candidate] = \
                        all_distances[point_cluster, swap_candidate]

                    if (np.absolute(mindist).sum()
                            < np.absolute(best_mindist).sum()):
                        # update the labels since the transfer was a success
                        best_labels = labels.copy()
                        best_mindist = mindist.copy()
                        break

                    else:
                        # reset since the transfer was not a success
                        labels = best_labels.copy()
                        mindist = best_mindist.copy()

        transfer_list.append(point)

    if n_samples == distances.shape[0]:
        # distances will be changed in-place
        distances[:] = mindist
    inertia = best_mindist.sum()

    return best_labels, inertia
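
# Hypothetical 2-point, 2-cluster trace of the routine above (assuming
# max_cluster_size=1 and values=None):
#
#   all_distances = [[1.0, 2.0],      # row per cluster, column per point
#                    [4.0, 8.0]]
#
#   initial assignment: point 0 takes cluster 0 (dist 1.0); cluster 0 is then
#   full, so point 1 falls back to cluster 1 (dist 8.0) -> total 9.0.
#   refinement: point 0 swaps with point 1 since 4.0 < 8.0 and the swapped
#   total (4.0 + 2.0 = 6.0) beats 9.0, yielding labels [1, 0], inertia 6.0.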


def get_best_cluster_for_point(point, all_distances):
    """Gets the best cluster by distance for a point

    Argument
    --------
    point : int
        the point index

    Returns
    -------
    tuple
        (cluster_id, distance_from_cluster_center)
    """
    sorted_distances = get_best_point_distances(point, all_distances)
    cluster_id, point_dist = sorted_distances[0]
    return cluster_id, point_dist


def get_best_point_distances(point, all_distances):
    """Gets the clusters for a point, sorted from best (smallest) distance

    Argument
    --------
    point : int
        the point index

    Returns
    -------
    list of tuples sorted by point_dist
        example: [(cluster_id, point_dist), (cluster_id, point_dist)]
    """
    points_distances = all_distances[:, point]
    sorted_points = sort_adjust_row(points_distances)
    return sorted_points


def sort_adjust_row(points_distances):
    """Sorts a point's row of cluster distances from smallest to largest"""
    return sorted([(cluster_id, point_dist)
                   for cluster_id, point_dist in enumerate(points_distances)],
                  key=lambda x: x[1])


def is_cluster_full(cluster_id, max_cluster_size, labels, values=None):
    """Determines if a cluster is full.

    When `values` is given, a cluster is full once the summed values of its
    members reach the cap; otherwise the member count is used.
    """
    if isinstance(max_cluster_size, list):
        mx_cs = max_cluster_size[cluster_id]
    else:
        mx_cs = max_cluster_size

    if values is not None:
        cluster_count = np.sum(values[np.where(labels == cluster_id)])
    else:
        cluster_count = len(np.where(labels == cluster_id)[0])

    is_full = cluster_count >= mx_cs
    return is_full
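
# Example of the two counting modes (hypothetical inputs):
#   labels = np.array([0, 0, 1, -1]); values = np.array([5.0, 3.0, 2.0, 4.0])
#   is_cluster_full(0, 7, labels, values)  -> True  (5.0 + 3.0 >= 7)
#   is_cluster_full(0, 3, labels)          -> False (2 members < 3)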


def get_clusters_size(n_samples, n_clusters):
    """Gets the number of members per cluster for equal groups kmeans"""
    return (n_samples + n_clusters - 1) // n_clusters
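
# This is a ceiling division; e.g. get_clusters_size(10, 4) == 3, so four
# clusters of at most 3 members can hold all 10 samples.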


def initial_assignment(labels, mindist, n_samples, all_distances,
                       max_cluster_size, values=None):
    """Initial assignment of labels and mindist"""
    all_points = np.arange(n_samples)
    for point in all_points:
        for point_dist in get_best_point_distances(point, all_distances):
            cluster_id, point_dist = point_dist
            # assign each point to the closest cluster that still has room
            if not is_cluster_full(cluster_id, max_cluster_size, labels,
                                   values):
                labels[point] = cluster_id
                mindist[point] = point_dist
                break
    return labels, mindist
@ -1,4 +1,6 @@
from sklearn.cluster import KMeans
from balanced_kmeans import BalancedGroupsKMeans

import numpy as np

from crankshaft.analysis_data_provider import AnalysisDataProvider
@ -30,3 +32,25 @@ class Kmeans:
        km = KMeans(n_clusters=no_clusters, n_init=no_init)
        labels = km.fit_predict(zip(xs, ys))
        return zip(ids, labels)

    def spatial_balanced(self, query, no_clusters, no_init=20,
                         target_per_cluster=None, value_column=None):
        params = {"subquery": query,
                  "geom_col": "the_geom",
                  "id_col": "cartodb_id",
                  "value_column": value_column}

        data = self.data_provider.get_spatial_balanced_kmeans(params)

        xs = data[0]['xs']
        ys = data[0]['ys']
        ids = data[0]['ids']
        values = data[0]['values']

        total_value = np.sum(values)

        if target_per_cluster is None:
            target_per_cluster = total_value / float(no_clusters)

        km = BalancedGroupsKMeans(n_clusters=no_clusters, n_init=no_init,
                                  max_iter=100,
                                  max_cluster_size=target_per_cluster)
        labels = km.fit_predict(zip(xs, ys), values=values)
        return zip(ids, labels)
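
    # A usage sketch (hypothetical table and column names; assumes the class
    # was constructed with a live AnalysisDataProvider):
    #
    #   km = Kmeans()
    #   pairs = km.spatial_balanced('SELECT * FROM my_table', no_clusters=5,
    #                               value_column='population')
    #   # pairs is a sequence of (cartodb_id, cluster_no) tuples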