Compare commits

...

5 Commits

Author SHA1 Message Date
Andy Eschbacher
2cd6cdd4a6 fixes variale name error 2018-02-16 13:31:17 -05:00
Andy Eschbacher
c48ae50ade small syntax fixes 2018-02-09 16:22:10 -05:00
Stuart Lynn
d2574c20ef update to the kmeans - balanced code 2018-02-09 15:55:04 -05:00
Stuart Lynn
7b42beb82f Merge branch 'develop' into balanced_kmeans 2018-01-29 16:41:58 -05:00
Stuart Lynn
9b7eda798c inital commit for balanced k means implementation 2018-01-29 10:51:13 -05:00
4 changed files with 906 additions and 0 deletions

View File

@@ -24,6 +24,24 @@ $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
-- deviation of 1
-- id_colname: name of the id column
CREATE OR REPLACE FUNCTION CDB_KMeansBalanced(
query text,
no_clusters integer,
value_col TEXT default NULL,
no_init integer default 20,
max_per_cluster float default NULL)
RETURNS table(cartodb_id integer, cluster_no integer)
AS $$
from crankshaft.clustering import KmeansBalanced
kmeans = KmeansBalanced()
return kmeans.spatial_balanced(query, no_clusters, no_init, max_per_cluster, value_col)
$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(state Numeric[],the_geom GEOMETRY(Point, 4326), weight NUMERIC)
RETURNS Numeric[] AS
$$
CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial(
query TEXT,
colnames TEXT[],
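For illustration only, here is a hypothetical client-side call of the new CDB_KMeansBalanced function above. The connection string, the "stores" table, and its numeric "revenue" column are all assumptions, not part of this changeset:

import psycopg2

# connect to a database that has crankshaft installed (illustrative DSN)
conn = psycopg2.connect('dbname=carto')
cur = conn.cursor()
# cluster the rows of "stores" into 5 groups balanced by total revenue
cur.execute("SELECT * FROM CDB_KMeansBalanced("
            "'SELECT * FROM stores', 5, 'revenue')")
for cartodb_id, cluster_no in cur.fetchall():
    print(cartodb_id, cluster_no)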

View File

@@ -85,6 +85,24 @@ class AnalysisDataProvider(object):
'''.format(**params)
return plpy.execute(query)
def get_spatial_balanced_kmeans(self, params):
"""fetch data for spatial kmeans"""
params.setdefault('value_column', 1)
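# value_column defaults to the literal 1, so with no value column every
# row gets equal weight; ORDER BY {id_col} in each array_agg keeps the
# ids, xs, ys and values arrays row-aligned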
query = ("SELECT "
"array_agg({id_col} ORDER BY {id_col}) as ids,"
"array_agg(ST_X({geom_col}) ORDER BY {id_col}) As xs,"
"array_agg(ST_Y({geom_col}) ORDER BY {id_col}) As ys "
"array_agg({value_column} ORDER BY {id_col}) As values"
"FROM ({subquery}) As a "
"WHERE {geom_col} IS NOT NULL").format(**params)
try:
data = plpy.execute(query)
return data
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
@verify_data
def get_gwr(self, params):
"""fetch data for gwr analysis"""

View File

@@ -0,0 +1,836 @@
"""
Balanced groups K-Means clustering (same size | max size | custom sizes)
utilizing the scikit-learn API and related
utilities.
BSD 3-Clause License
Copyright (c) 2017, Cayetano Benavent, José Manuel Camacho (Balanced-Sizes-K-Means
Implementation)
Copyright (c) 2017, Nathan Danielsen (Same-Size-K-Means Implementation)
Copyright (c) 2007-2017 The scikit-learn developers.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
BSD 3-clause "New" or "Revised" License
version 0.17.1
"""
import warnings
import numpy as np
import scipy.sparse as sp
from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
from sklearn.cluster import k_means_
from sklearn.cluster import _k_means
from sklearn.externals.joblib import Parallel
from sklearn.externals.joblib import delayed
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.utils.extmath import row_norms, squared_norm
from sklearn.utils.sparsefuncs import mean_variance_axis
from sklearn.utils import check_array
from sklearn.utils import check_random_state
from sklearn.utils import as_float_array
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.validation import FLOAT_DTYPES
class BalancedGroupsKMeans(BaseEstimator, ClusterMixin, TransformerMixin):
"""Balanced Groups K-Means clustering
90 percent of this is the KMeans implementation, with the equal groups logic
located in `_labels_inertia_precompute_dense()` which follows the steps laid
out in the Elki Same-size k-Means Variation tutorial.
https://elki-project.github.io/tutorial/same-size_k_means
Please note that this implementation only works with scikit-learn 0.17.x, as
later versions have breaking changes that affect this implementation.
Parameters
----------
n_clusters : int, optional, default: 8
The number of clusters to form as well as the number of
centroids to generate.
max_cluster_size : int, list of int, or None, optional, default: None
Maximum size allowed per cluster: a scalar caps every cluster, a list
gives a per-cluster cap, and None balances all clusters to equal size.
When `values` are passed to `fit`, the cap applies to the summed values
of a cluster's members rather than to its member count.
max_iter : int, default: 300
Maximum number of iterations of the k-means algorithm for a
single run.
n_init : int, default: 10
Number of times the k-means algorithm will be run with different
centroid seeds. The final results will be the best output of
n_init consecutive runs in terms of inertia.
init : {'k-means++', 'random' or an ndarray}
Method for initialization, defaults to 'k-means++':
'k-means++' : selects initial cluster centers for k-means
clustering in a smart way to speed up convergence. See section
Notes in k_init for more details.
'random': choose k observations (rows) at random from data for
the initial centroids.
If an ndarray is passed, it should be of shape (n_clusters, n_features)
and gives the initial centers.
precompute_distances : {'auto', True, False}
Precompute distances (faster but takes more memory).
'auto' : do not precompute distances if n_samples * n_clusters > 12
million. This corresponds to about 100MB overhead per job using
double precision.
True : always precompute distances
False : never precompute distances
tol : float, default: 1e-4
Relative tolerance with regards to inertia to declare convergence
n_jobs : int
The number of jobs to use for the computation. This works by computing
each of the n_init runs in parallel.
If -1 all CPUs are used. If 1 is given, no parallel computing code is
used at all, which is useful for debugging. For n_jobs below -1,
(n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
are used.
random_state : integer or numpy.RandomState, optional
The generator used to initialize the centers. If an integer is
given, it fixes the seed. Defaults to the global numpy random
number generator.
verbose : int, default 0
Verbosity mode.
copy_x : boolean, default True
When pre-computing distances it is more numerically accurate to center
the data first. If copy_x is True, then the original data is not
modified. If False, the original data is modified, and put back before
the function returns, but small numerical differences may be introduced
by subtracting and then adding the data mean.
Attributes
----------
cluster_centers_ : array, [n_clusters, n_features]
Coordinates of cluster centers
labels_ :
Labels of each point
inertia_ : float
Sum of distances of samples to their closest cluster center.
Notes
------
The k-means problem is solved using Lloyd's algorithm.
The average complexity is given by O(k n T), where n is the number of
samples and T is the number of iterations.
The worst case complexity is given by O(n^(k+2/p)) with
n = n_samples, p = n_features. (D. Arthur and S. Vassilvitskii,
'How slow is the k-means method?' SoCG2006)
In practice, the k-means algorithm is very fast (one of the fastest
clustering algorithms available), but it falls in local minima. That's why
it can be useful to restart it several times.
See also
--------
MiniBatchKMeans:
Alternative online implementation that does incremental updates
of the centers positions using mini-batches.
For large scale learning (say n_samples > 10k) MiniBatchKMeans is
probably much faster than the default batch implementation.
"""
def __init__(self, n_clusters=8, max_cluster_size=None,
init='k-means++', n_init=10, max_iter=300,
tol=1e-4, precompute_distances='auto',
verbose=0, random_state=None, copy_x=True, n_jobs=1):
self.n_clusters = n_clusters
self.max_cluster_size = max_cluster_size
self.init = init
self.max_iter = max_iter
self.tol = tol
self.precompute_distances = precompute_distances
self.n_init = n_init
self.verbose = verbose
self.random_state = random_state
self.copy_x = copy_x
self.n_jobs = n_jobs
self.cluster_centers_ = None
self.labels_ = None
def _check_fit_data(self, X):
"""Verify that the number of samples given is larger than k"""
X = check_array(X, accept_sparse='csr', dtype=np.float64)
if X.shape[0] < self.n_clusters:
raise ValueError("n_samples=%d should be >= n_clusters=%d" % (
X.shape[0], self.n_clusters))
return X
def _check_test_data(self, X):
X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES,
warn_on_dtype=True)
n_samples, n_features = X.shape
expected_n_features = self.cluster_centers_.shape[1]
if not n_features == expected_n_features:
raise ValueError("Incorrect number of features. "
"Got %d features, expected %d" % (
n_features, expected_n_features))
return X
def get_cluster_labels(self):
return self.labels_
def get_cluster_centers(self):
return self.cluster_centers_
def get_inertia(self):
return self.inertia_
def fit(self, X, y=None, values=None):
"""Compute k-means clustering.
Parameters
----------
X : array-like or sparse matrix, shape=(n_samples, n_features)
"""
random_state = check_random_state(self.random_state)
X = self._check_fit_data(X)
self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
k_means(
X, n_clusters=self.n_clusters, max_cluster_size=self.max_cluster_size,
init=self.init, n_init=self.n_init, max_iter=self.max_iter,
verbose=self.verbose, return_n_iter=True,
precompute_distances=self.precompute_distances,
tol=self.tol, random_state=random_state, copy_x=self.copy_x,
n_jobs=self.n_jobs, values=values)
return self
def fit_predict(self, X, y=None, values=None):
"""Compute cluster centers and predict cluster index for each sample.
Convenience method; equivalent to calling fit(X) followed by
predict(X).
"""
return self.fit(X, values=values).labels_
def fit_transform(self, X, y=None):
"""Compute clustering and transform X to cluster-distance space.
Equivalent to fit(X).transform(X), but more efficiently implemented.
"""
# Currently, this just skips a copy of the data if it is not in
# np.array or CSR format already.
# XXX This skips _check_test_data, which may change the dtype;
# we should refactor the input validation.
X = self._check_fit_data(X)
return self.fit(X)._transform(X)
def transform(self, X, y=None):
"""Transform X to a cluster-distance space.
In the new space, each dimension is the distance to the cluster
centers. Note that even if X is sparse, the array returned by
`transform` will typically be dense.
Parameters
----------
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
New data to transform.
Returns
-------
X_new : array, shape [n_samples, k]
X transformed in the new space.
"""
check_is_fitted(self, 'cluster_centers_')
X = self._check_test_data(X)
return self._transform(X)
def _transform(self, X):
"""guts of transform method; no input validation"""
return euclidean_distances(X, self.cluster_centers_)
def predict(self, X, values=None):
"""Predict the closest cluster each sample in X belongs to.
In the vector quantization literature, `cluster_centers_` is called
the code book and each value returned by `predict` is the index of
the closest code in the code book.
Parameters
----------
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
New data to predict.
Returns
-------
labels : array, shape [n_samples,]
Index of the cluster each sample belongs to.
"""
check_is_fitted(self, 'cluster_centers_')
X = self._check_test_data(X)
x_squared_norms = row_norms(X, squared=True)
return _labels_inertia(X, x_squared_norms, self.cluster_centers_,
self.max_cluster_size, values=values)[0]
def score(self, X, y=None, values=None):
"""Opposite of the value of X on the K-means objective.
Parameters
----------
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
New data.
Returns
-------
score : float
Opposite of the value of X on the K-means objective.
"""
check_is_fitted(self, 'cluster_centers_')
X = self._check_test_data(X)
x_squared_norms = row_norms(X, squared=True)
return -_labels_inertia(X, x_squared_norms, self.cluster_centers_,
self.max_cluster_size, values=values)[1]
def k_means(X, n_clusters, max_cluster_size=None, init='k-means++', precompute_distances='auto',
n_init=10, max_iter=300, verbose=False,
tol=1e-4, random_state=None, copy_x=True, n_jobs=1,
return_n_iter=False, values=None):
"""K-means clustering algorithm.
Read more in the :ref:`User Guide <k_means>`.
Parameters
----------
X : array-like or sparse matrix, shape (n_samples, n_features)
The observations to cluster.
n_clusters : int
The number of clusters to form as well as the number of
centroids to generate.
max_iter : int, optional, default 300
Maximum number of iterations of the k-means algorithm to run.
n_init : int, optional, default: 10
Number of times the k-means algorithm will be run with different
centroid seeds. The final results will be the best output of
n_init consecutive runs in terms of inertia.
init : {'k-means++', 'random', or ndarray, or a callable}, optional
Method for initialization, defaults to 'k-means++':
'k-means++' : selects initial cluster centers for k-means
clustering in a smart way to speed up convergence. See section
Notes in k_init for more details.
'random': generate k centroids from a Gaussian with mean and
variance estimated from the data.
If an ndarray is passed, it should be of shape (n_clusters, n_features)
and gives the initial centers.
If a callable is passed, it should take arguments X, k, and
a random state, and return an initialization.
precompute_distances : {'auto', True, False}
Precompute distances (faster but takes more memory).
'auto' : do not precompute distances if n_samples * n_clusters > 12
million. This corresponds to about 100MB overhead per job using
double precision.
True : always precompute distances
False : never precompute distances
tol : float, optional
The relative increment in the results before declaring convergence.
verbose : boolean, optional
Verbosity mode.
random_state : integer or numpy.RandomState, optional
The generator used to initialize the centers. If an integer is
given, it fixes the seed. Defaults to the global numpy random
number generator.
copy_x : boolean, optional
When pre-computing distances it is more numerically accurate to center
the data first. If copy_x is True, then the original data is not
modified. If False, the original data is modified, and put back before
the function returns, but small numerical differences may be introduced
by subtracting and then adding the data mean.
n_jobs : int
The number of jobs to use for the computation. This works by computing
each of the n_init runs in parallel.
If -1 all CPUs are used. If 1 is given, no parallel computing code is
used at all, which is useful for debugging. For n_jobs below -1,
(n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
are used.
return_n_iter : bool, optional
Whether or not to return the number of iterations.
Returns
-------
centroid : float ndarray with shape (k, n_features)
Centroids found at the last iteration of k-means.
label : integer ndarray with shape (n_samples,)
label[i] is the code or index of the centroid the
i'th observation is closest to.
inertia : float
The final value of the inertia criterion (sum of squared distances to
the closest centroid for all observations in the training set).
best_n_iter: int
Number of iterations corresponding to the best results.
Returned only if `return_n_iter` is set to True.
"""
if n_init <= 0:
raise ValueError("Invalid number of initializations."
" n_init=%d must be bigger than zero." % n_init)
random_state = check_random_state(random_state)
if max_iter <= 0:
raise ValueError('Number of iterations should be a positive number,'
' got %d instead' % max_iter)
best_inertia = np.infty
X = as_float_array(X, copy=copy_x)
tol = _tolerance(X, tol)
# If the distances are precomputed every job will create a matrix of shape
# (n_clusters, n_samples). To stop KMeans from eating up memory we only
# activate this if the created matrix is guaranteed to be under 100MB. 12
# million entries consume a little under 100MB if they are of type double.
if precompute_distances == 'auto':
n_samples = X.shape[0]
precompute_distances = (n_clusters * n_samples) < 12e6
elif isinstance(precompute_distances, bool):
pass
else:
raise ValueError("precompute_distances should be 'auto' or True/False"
", but a value of %r was passed" %
precompute_distances)
# subtract mean of X for more accurate distance computations
if not sp.issparse(X) or hasattr(init, '__array__'):
X_mean = X.mean(axis=0)
if not sp.issparse(X):
# The copy was already done above
X -= X_mean
if hasattr(init, '__array__'):
init = check_array(init, dtype=np.float64, copy=True)
_validate_center_shape(X, n_clusters, init)
init -= X_mean
if n_init != 1:
warnings.warn(
'Explicit initial center position passed: '
'performing only one init in k-means instead of n_init=%d'
% n_init, RuntimeWarning, stacklevel=2)
n_init = 1
# precompute squared norms of data points
x_squared_norms = row_norms(X, squared=True)
best_labels, best_inertia, best_centers = None, None, None
if n_jobs == 1:
# For a single thread, less memory is needed if we just store one set
# of the best results (as opposed to one set per run per thread).
for it in range(n_init):
# run a k-means once
labels, inertia, centers, n_iter_ = _kmeans_single(
X, n_clusters, max_cluster_size, max_iter=max_iter, init=init,
verbose=verbose, precompute_distances=precompute_distances, tol=tol,
x_squared_norms=x_squared_norms, random_state=random_state, values=values)
# determine if these results are the best so far
if best_inertia is None or inertia < best_inertia:
best_labels = labels.copy()
best_centers = centers.copy()
best_inertia = inertia
best_n_iter = n_iter_
else:
# parallelisation of k-means runs
seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
results = Parallel(n_jobs=n_jobs, verbose=0)(
delayed(_kmeans_single)(X, n_clusters, max_cluster_size, max_iter=max_iter,
init=init, verbose=verbose, tol=tol,
precompute_distances=precompute_distances,
x_squared_norms=x_squared_norms,
# Change seed to ensure variety
random_state=seed,
values=values)
for seed in seeds)
# Get results with the lowest inertia
labels, inertia, centers, n_iters = zip(*results)
best = np.argmin(inertia)
best_labels = labels[best]
best_inertia = inertia[best]
best_centers = centers[best]
best_n_iter = n_iters[best]
if not sp.issparse(X):
if not copy_x:
X += X_mean
best_centers += X_mean
if return_n_iter:
return best_centers, best_labels, best_inertia, best_n_iter
else:
return best_centers, best_labels, best_inertia
def _kmeans_single(X, n_clusters, max_cluster_size, x_squared_norms, max_iter=300,
init='k-means++', verbose=False, random_state=None,
tol=1e-4, precompute_distances=True, values=None):
"""A single run of k-means, assumes preparation completed prior.
Parameters
----------
X: array-like of floats, shape (n_samples, n_features)
The observations to cluster.
n_clusters: int
The number of clusters to form as well as the number of
centroids to generate.
max_iter: int, optional, default 300
Maximum number of iterations of the k-means algorithm to run.
init: {'k-means++', 'random', or ndarray, or a callable}, optional
Method for initialization, defaults to 'k-means++':
'k-means++' : selects initial cluster centers for k-means
clustering in a smart way to speed up convergence. See section
Notes in k_init for more details.
'random': generate k centroids from a Gaussian with mean and
variance estimated from the data.
If an ndarray is passed, it should be of shape (k, p) and gives
the initial centers.
If a callable is passed, it should take arguments X, k, and
a random state, and return an initialization.
tol: float, optional
The relative increment in the results before declaring convergence.
verbose: boolean, optional
Verbosity mode
x_squared_norms: array
Precomputed x_squared_norms.
precompute_distances : boolean, default: True
Precompute distances (faster but takes more memory).
random_state: integer or numpy.RandomState, optional
The generator used to initialize the centers. If an integer is
given, it fixes the seed. Defaults to the global numpy random
number generator.
Returns
-------
centroid: float ndarray with shape (k, n_features)
Centroids found at the last iteration of k-means.
label: integer ndarray with shape (n_samples,)
label[i] is the code or index of the centroid the
i'th observation is closest to.
inertia: float
The final value of the inertia criterion (sum of squared distances to
the closest centroid for all observations in the training set).
n_iter : int
Number of iterations run.
"""
random_state = check_random_state(random_state)
best_labels, best_inertia, best_centers = None, None, None
# init
centers = k_means_._init_centroids(X, n_clusters, init, random_state=random_state,
x_squared_norms=x_squared_norms)
if verbose:
print("Initialization complete")
# Allocate memory to store the distances for each sample to its
# closer center for reallocation in case of ties
distances = np.zeros(shape=(X.shape[0],), dtype=np.float64)
# iterations
for i in range(max_iter):
centers_old = centers.copy()
# labels assignment is also called the E-step of EM
labels, inertia = \
_labels_inertia(X, x_squared_norms, centers, max_cluster_size,
precompute_distances=precompute_distances,
distances=distances, values=values)
# computation of the means is also called the M-step of EM
if sp.issparse(X):
centers = _k_means._centers_sparse(X, labels, n_clusters,
distances)
else:
centers = _k_means._centers_dense(X, labels, n_clusters, distances)
if verbose:
print("Iteration %2d, inertia %.3f" % (i, inertia))
if best_inertia is None or inertia < best_inertia:
best_labels = labels.copy()
best_centers = centers.copy()
best_inertia = inertia
shift = squared_norm(centers_old - centers)
if shift <= tol:
if verbose:
print("Converged at iteration %d" % i)
break
if shift > 0:
# rerun E-step in case of non-convergence so that predicted labels
# match cluster centers
best_labels, best_inertia = \
_labels_inertia(X, x_squared_norms, best_centers, max_cluster_size,
precompute_distances=precompute_distances,
distances=distances, values=values)
return best_labels, best_inertia, best_centers, i + 1
def _validate_center_shape(X, n_centers, centers):
"""Check if centers is compatible with X and n_centers"""
if len(centers) != n_centers:
raise ValueError('The shape of the initial centers (%s) '
'does not match the number of clusters %i'
% (centers.shape, n_centers))
if centers.shape[1] != X.shape[1]:
raise ValueError(
"The number of features of the initial centers %s "
"does not match the number of features of the data %s."
% (centers.shape[1], X.shape[1]))
def _tolerance(X, tol):
"""Return a tolerance which is independent of the dataset"""
if sp.issparse(X):
variances = mean_variance_axis(X, axis=0)[1]
else:
variances = np.var(X, axis=0)
return np.mean(variances) * tol
def _labels_inertia(X, x_squared_norms, centers, max_cluster_size,
precompute_distances=True, distances=None, values=None):
"""E step of the K-means EM algorithm.
Compute the labels and the inertia of the given samples and centers.
This will compute the distances in-place.
Parameters
----------
X: float64 array-like or CSR sparse matrix, shape (n_samples, n_features)
The input samples to assign to the labels.
x_squared_norms: array, shape (n_samples,)
Precomputed squared euclidean norm of each data point, to speed up
computations.
centers: float64 array, shape (k, n_features)
The cluster centers.
precompute_distances : boolean, default: True
Precompute distances (faster but takes more memory).
distances: float64 array, shape (n_samples,)
Pre-allocated array to be filled in with each sample's distance
to the closest center.
Returns
-------
labels: int array of shape(n)
The resulting assignment
inertia : float
Sum of distances of samples to their closest cluster center.
"""
n_samples = X.shape[0]
# set the default value of centers to -1 to be able to detect any anomaly
# easily
labels = -np.ones(n_samples, np.int32)
if distances is None:
distances = np.zeros(shape=(0,), dtype=np.float64)
# distances will be changed in-place
if sp.issparse(X):
inertia = k_means_._k_means._assign_labels_csr(
X, x_squared_norms, centers, labels, distances=distances)
else:
if precompute_distances:
return _labels_inertia_precompute_dense(X, x_squared_norms,
centers, distances,
max_cluster_size, values=values)
inertia = k_means_._k_means._assign_labels_array(
X, x_squared_norms, centers, labels, distances=distances)
return labels, inertia
def _labels_inertia_precompute_dense(X, x_squared_norms, centers, distances,
max_cluster_size, values=None):
"""Compute labels and inertia using a full distance matrix.
This will overwrite the 'distances' array in-place.
Parameters
----------
X : numpy array, shape (n_samples, n_features)
Input data.
x_squared_norms : numpy array, shape (n_samples,)
Precomputed squared norms of X.
centers : numpy array, shape (n_clusters, n_features)
Cluster centers which data is assigned to.
distances : numpy array, shape (n_samples,)
Pre-allocated array in which distances are stored.
max_cluster_size : int, list of int, or None
Maximum number of members allowed per cluster (or maximum summed value,
when values are supplied); a list gives a per-cluster limit.
Returns
-------
labels : numpy array, dtype=np.int, shape (n_samples,)
Indices of clusters that samples are assigned to.
inertia : float
Sum of distances of samples to their closest cluster center.
"""
n_samples = X.shape[0]
k = centers.shape[0]
all_distances = euclidean_distances(centers, X, x_squared_norms,
squared=True)
labels = np.empty(n_samples, dtype=np.int32)
labels.fill(-1)
mindist = np.empty(n_samples)
mindist.fill(np.infty)
if isinstance(max_cluster_size, int) and max_cluster_size * k < n_samples:
raise ValueError("max_cluster_size * k must be at least n_samples")
elif isinstance(max_cluster_size, list):
if sum(max_cluster_size) != n_samples:
raise ValueError("the sum of max_cluster_size must equal n_samples")
elif len(max_cluster_size) != k:
raise ValueError("the length of max_cluster_size must equal k")
if not max_cluster_size:
max_cluster_size = get_clusters_size(n_samples, k)
labels, mindist = initial_assignment(labels, mindist, n_samples, all_distances, max_cluster_size, values=values)
# refinement of clustering
transfer_list = []
best_mindist = mindist.copy()
best_labels = labels.copy()
# sort all of the points from largest distance to smallest
points_by_high_distance = np.argsort(mindist)[::-1]
for point in points_by_high_distance:
point_cluster = labels[point]
# see if there is an opening on the best cluster for this point
cluster_id, point_dist = get_best_cluster_for_point(point, all_distances)
if not is_cluster_full(cluster_id, max_cluster_size, labels, values=values) and point_cluster != cluster_id:
labels[point] = cluster_id
mindist[point] = point_dist
best_labels = labels.copy()
best_mindist = mindist.copy()
continue # on to the next point
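# otherwise, try swapping with an earlier point that also could not
# move to a free better cluster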
for swap_candidate in transfer_list:
cand_cluster = labels[swap_candidate]
if point_cluster != cand_cluster:
# get the current dist of swap candidate
cand_distance = mindist[swap_candidate]
# get the potential dist of point
point_distance = all_distances[cand_cluster, point]
# compare
if point_distance < cand_distance:
labels[point] = cand_cluster
mindist[point] = all_distances[cand_cluster, point]
labels[swap_candidate] = point_cluster
mindist[swap_candidate] = all_distances[point_cluster, swap_candidate]
if np.absolute(mindist).sum() < np.absolute(best_mindist).sum():
# update the labels since the transfer was a success
best_labels = labels.copy()
best_mindist = mindist.copy()
break
else:
# reset since the transfer was not a success
labels = best_labels.copy()
mindist = best_mindist.copy()
transfer_list.append(point)
if n_samples == distances.shape[0]:
# distances will be changed in-place
distances[:] = mindist
inertia = best_mindist.sum()
return best_labels, inertia
def get_best_cluster_for_point(point, all_distances):
"""Gets the best cluster by distance for a point
Arguments
---------
point : int
the point index
all_distances : numpy array, shape (n_clusters, n_samples)
distances from cluster centers to points
Returns
--------
tuple
(cluster_id, distance_from_cluster_center)
"""
sorted_distances = get_best_point_distances(point, all_distances)
cluster_id, point_dist = sorted_distances[0]
return cluster_id, point_dist
def get_best_point_distances(point, all_distances):
"""Gets a sorted by best distance of clusters
Argument
--------
point : int
the point index
Returns
--------
list of tuples sorted by point_dist
example: [(cluster_id, point_dist), (cluster_id, point_dist)]
"""
points_distances = all_distances[:, point]
sorted_points = sort_adjust_row(points_distances)
return sorted_points
def sort_adjust_row(points_distances):
"Sorts the points row from smallest distance to lowest distance"
return sorted([(cluster_id, point_dist) for cluster_id, point_dist in enumerate(points_distances)], key=lambda x: x[1])
def is_cluster_full(cluster_id, max_cluster_size, labels, values=None):
"""Determines if a cluster is full"""
if isinstance(max_cluster_size, list):
mx_cs = max_cluster_size[cluster_id]
else:
mx_cs = max_cluster_size
if values is not None:
cluster_count = np.sum(values[np.where(labels == cluster_id)])
else:
cluster_count = len(np.where(labels == cluster_id)[0])
is_full = cluster_count >= mx_cs
return is_full
def get_clusters_size(n_samples, n_clusters):
"""Gets the number of members per cluster for equal groups kmeans"""
return (n_samples + n_clusters - 1) // n_clusters
def initial_assignment(labels, mindist, n_samples, all_distances, max_cluster_size, values=None):
"""Initial assignment of labels and mindist"""
all_points = np.arange(n_samples)
for point in all_points:
for point_dist in get_best_point_distances(point, all_distances):
cluster_id, point_dist = point_dist
# initial assignment
if not is_cluster_full(cluster_id, max_cluster_size, labels, values):
labels[point] = cluster_id
mindist[point] = point_dist
break
return labels, mindist
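For illustration only, a minimal usage sketch of the BalancedGroupsKMeans estimator defined above, assuming scikit-learn 0.17.x and NumPy as noted in the module docstring; the data is random and not part of this changeset:

import numpy as np

# 100 random 2-D points split into 4 clusters of at most 25 members each
X = np.random.RandomState(0).rand(100, 2)
clf = BalancedGroupsKMeans(n_clusters=4, max_cluster_size=25)
labels = clf.fit_predict(X)
print(np.bincount(labels))  # no cluster exceeds 25 members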

View File

@@ -1,4 +1,6 @@
from sklearn.cluster import KMeans
from .balanced_kmeans import BalancedGroupsKMeans
import numpy as np
from crankshaft.analysis_data_provider import AnalysisDataProvider
@@ -31,6 +33,38 @@ class Kmeans(object):
labels = km.fit_predict(zip(xs, ys))
return zip(ids, labels)
def spatial_balanced(self, query, no_clusters, no_init=20,
target_per_cluster=None, value_column=None):
params = {
"subquery": query,
"geom_col": "the_geom",
"id_col": "cartodb_id",
"value_column": value_column,
}
data = self.data_provider.get_spatial_balanced_kmeans(params)
lons = data[0]['xs']
lats = data[0]['ys']
ids = data[0]['ids']
values = data[0]['values']
total_value = np.sum(values)
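# default target: give each cluster an equal share of the total value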
if target_per_cluster is None:
target_per_cluster = total_value / float(no_clusters)
bal_kmeans = BalancedGroupsKMeans(
n_clusters=no_clusters,
max_iter=100,
max_cluster_size=target_per_cluster
)
labels = bal_kmeans.fit_predict(
zip(lons, lats),
values=values
)
return zip(ids, labels)
def nonspatial(self, subquery, colnames, no_clusters=5,
standardize=True, id_col='cartodb_id'):
"""