Strips out non-spatial k-means clustering

2016-11-21 16:19:54 +00:00 · 2016-11-21 16:19:54 +00:00 · 2f27622a6d
commit 2f27622a6d
parent c5a2746a53
3 changed files with 0 additions and 130 deletions
--- a/src/pg/sql/11_kmeans.sql
+++ b/src/pg/sql/11_kmeans.sql
@@ -9,24 +9,6 @@ RETURNS table (cartodb_id integer, cluster_no integer) as $$

 $$ LANGUAGE plpythonu;

-- Non-spatial k-means clustering
-- query: sql query to retrieve all the needed data
-
-CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial(
-  query TEXT,
-  colnames TEXT[],
-  num_clusters INTEGER,
-  id_colname TEXT DEFAULT 'cartodb_id',
-  standarize BOOLEAN DEFAULT true
-)
-RETURNS TABLE(cluster_label text, cluster_center json, silhouettes numeric, rowid bigint) AS $$
-
-    from crankshaft.clustering import Kmeans
-    kmeans = Kmeans()
-    return kmeans.nonspatial(query, colnames, num_clusters,
-                             id_colname, standarize)
-$$ LANGUAGE plpythonu;
-

 CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(state Numeric[],the_geom GEOMETRY(Point, 4326), weight NUMERIC)
 RETURNS Numeric[] AS
--- a/src/py/crankshaft/crankshaft/clustering/kmeans.py
+++ b/src/py/crankshaft/crankshaft/clustering/kmeans.py
@@ -33,82 +33,3 @@ class Kmeans:
        km = KMeans(n_clusters=no_clusters, n_init=no_init)
        labels = km.fit_predict(zip(xs, ys))
        return zip(ids, labels)
-
-    def nonspatial(self, query, colnames, num_clusters=5,
-                   id_col='cartodb_id', standarize=True):
-        """
-            query (string): A SQL query to retrieve the data required to do the
-                            k-means clustering analysis, like so:
-                            SELECT * FROM iris_flower_data
-            colnames (list): a list of the column names which contain the data
-                             of interest, like so: ["sepal_width",
-                                                    "petal_width",
-                                                    "sepal_length",
-                                                    "petal_length"]
-            num_clusters (int): number of clusters (greater than zero)
-            id_col (string): name of the input id_column
-        """
-        import json
-        from sklearn import metrics
-
-        out_id_colname = 'rowids'
-        # TODO: need a random seed?
-
-        full_query = '''
-            SELECT {cols}, array_agg({id_col}) As {out_id_colname}
-            FROM ({query}) As a
-        '''.format(query=query,
-                   id_col=id_col,
-                   out_id_colname=out_id_colname,
-                   cols=', '.join(['array_agg({0}) As col{1}'.format(val, idx)
-                                   for idx, val in enumerate(colnames)]))
-
-        db_resp = self.query_runner.get_nonspatial_kmeans(full_query, standarize)
-
-        # fill array with values for k-means clustering
-        if standarize:
-            cluster_columns = _scale_data(
-              _extract_columns(db_resp, colnames))
-        else:
-            cluster_columns = _extract_columns(db_resp, colnames)
-
-        print str(cluster_columns)
-        # TODO: decide on optimal parameters for most cases
-        #       Are there ways of deciding parameters based on inputs?
-        kmeans = KMeans(n_clusters=num_clusters,
-                        random_state=0).fit(cluster_columns)
-
-        centers = [json.dumps(dict(zip(colnames, c)))
-                   for c in kmeans.cluster_centers_[kmeans.labels_]]
-
-        silhouettes = metrics.silhouette_samples(cluster_columns,
-                                                 kmeans.labels_,
-                                                 metric='sqeuclidean')
-
-        return zip(kmeans.labels_,
-                   centers,
-                   silhouettes,
-                   db_resp[0][out_id_colname])
-
-
-# -- Preprocessing steps
-
-def _extract_columns(db_resp, colnames):
-    """
-        Extract the features from the query and pack them into a NumPy array
-        db_resp (plpy data object): result of the kmeans request
-        id_col_name (string): name of column which has the row id (not a
-                              feature of the analysis)
-    """
-    return np.array([db_resp[0][c] for c in colnames],
-                    dtype=float).T
-
-
-def _scale_data(features):
-    """
-        Scale all input columns to center on 0 with a standard devation of 1
-
-        features (numpy matrix): features of dimension (n_features, n_samples)
-    """
-    from sklearn.preprocessing import StandardScaler
-    return StandardScaler().fit_transform(features)
--- a/src/py/crankshaft/test/test_clustering_kmeans.py
+++ b/src/py/crankshaft/test/test_clustering_kmeans.py
@@ -54,36 +54,3 @@ class KMeansTest(unittest.TestCase):
        self.assertEqual(len(np.unique(labels)), 2)
        self.assertEqual(len(c1), 20)
        self.assertEqual(len(c2), 20)
-
-
-class KMeansNonspatialTest(unittest.TestCase):
-    """Testing class for k-means non-spatial"""
-
-    def setUp(self):
-        self.params = {"subquery": "SELECT * FROM TABLE",
-                       "n_clusters": 5}
-
-    def test_kmeans_nonspatial(self):
-        """
-            test for k-means non-spatial
-        """
-        # data from:
-        # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn-cluster-kmeans
-        data_raw = [OrderedDict([("col1", [1, 1, 1, 4, 4, 4]),
-                                 ("col2", [2, 4, 0, 2, 4, 0]),
-                                 ("rowids", [1, 2, 3, 4, 5, 6])])]
-
-        random_seeds.set_random_seeds(1234)
-        kmeans = Kmeans(FakeQueryRunner(data_raw))
-        print 'asfasdfasd'
-        clusters = kmeans.nonspatial('subquery', ['col1', 'col2'], 2)
-        print str([c[0] for c in clusters])
-
-        cl1 = clusters[0][0]
-        cl2 = clusters[3][0]
-
-        for idx, val in enumerate(clusters):
-            if idx < 3:
-                self.assertEqual(val[0], cl1)
-            else:
-                self.assertEqual(val[0], cl2)