From c6f64ad2f401a0b2f14a17453dcc9610e40f01c1 Mon Sep 17 00:00:00 2001
From: Andy Eschbacher <andy.eschbacher@gmail.com>
Date: Tue, 10 Jan 2017 09:49:16 -0500
Subject: [PATCH] bug fixes and adding of internal docs

---
 src/pg/sql/11_kmeans.sql                      | 11 ++++--
 .../crankshaft/analysis_data_provider.py      | 25 ++++++++++++--
 .../crankshaft/clustering/kmeans.py           | 34 +++++++++++--------
 .../crankshaft/test/test_clustering_kmeans.py |  4 +--
 4 files changed, 52 insertions(+), 22 deletions(-)

diff --git a/src/pg/sql/11_kmeans.sql b/src/pg/sql/11_kmeans.sql
index 6100d27..89e16a8 100644
--- a/src/pg/sql/11_kmeans.sql
+++ b/src/pg/sql/11_kmeans.sql
@@ -11,20 +11,25 @@ $$ LANGUAGE plpythonu;
 
 -- Non-spatial k-means clustering
 -- query: sql query to retrieve all the needed data
+-- colnames: text array of column names to use in the clustering analysis
+-- num_clusters: number of clusters to find (greater than zero)
+-- standardize: whether to scale variables to mean 0 and standard deviation 1
+-- id_colname: name of the id column
 
 CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial(
   query TEXT,
   colnames TEXT[],
   num_clusters INTEGER,
-  id_colname TEXT DEFAULT 'cartodb_id',
-  standarize BOOLEAN DEFAULT true
+  standardize BOOLEAN DEFAULT true,
+  id_colname TEXT DEFAULT 'cartodb_id'
 )
 RETURNS TABLE(cluster_label text, cluster_center json, silhouettes numeric, rowid bigint) AS $$
 
     from crankshaft.clustering import Kmeans
     kmeans = Kmeans()
     return kmeans.nonspatial(query, colnames, num_clusters,
-                             id_colname, standarize)
+                             standardize=standardize,
+                             id_col=id_colname)
 $$ LANGUAGE plpythonu;
 
 
diff --git a/src/py/crankshaft/crankshaft/analysis_data_provider.py b/src/py/crankshaft/crankshaft/analysis_data_provider.py
index d942627..ef9baed 100644
--- a/src/py/crankshaft/crankshaft/analysis_data_provider.py
+++ b/src/py/crankshaft/crankshaft/analysis_data_provider.py
@@ -45,18 +45,34 @@ class AnalysisDataProvider:
             return pu.empty_zipped_array(2)
 
     def get_nonspatial_kmeans(self, params):
-        """fetch data for non-spatial kmeans"""
+        """
+            Fetch data for non-spatial k-means.
+
+            Inputs - a dict (params) with the following keys:
+                colnames: a (text) list of column names (e.g.,
+                          `['andy', 'cookie']`)
+                id_col: the name of the id column (e.g., `'cartodb_id'`)
+                subquery: the subquery for exposing the data (e.g.,
+                          SELECT * FROM favorite_things)
+            Output:
+                The result of executing a SQL query that packages the data
+                for consumption within `Kmeans().nonspatial`: a list of
+                length one whose first element is a dict with keys
+                ('arr_col1', 'arr_col2', ..., 'rowid')
+        """
         agg_cols = ', '.join(['array_agg({0}) As arr_col{1}'.format(val, idx+1)
                               for idx, val in enumerate(params['colnames'])])
-        print agg_cols
         query = '''
             SELECT {cols}, array_agg({id_col}) As rowid
             FROM ({subquery}) As a
         '''.format(subquery=params['subquery'],
                    id_col=params['id_col'],
-                   cols=agg_cols)
+                   cols=agg_cols).strip()
         try:
             data = plpy.execute(query)
+            if len(data) == 0:
+                plpy.error('No non-null-valued data to analyze. Check the '
+                           'rows and columns of all of the inputs')
             return data
         except plpy.SPIError, err:
             plpy.error('Analysis failed: %s' % err)
@@ -71,6 +87,9 @@ class AnalysisDataProvider:
                  "WHERE {geom_col} IS NOT NULL").format(**params)
         try:
             data = plpy.execute(query)
+            if len(data) == 0:
+                plpy.error('No non-null-valued data to analyze. Check the '
+                           'rows and columns of all of the inputs')
             return data
         except plpy.SPIError, err:
             plpy.error('Analysis failed: %s' % err)
diff --git a/src/py/crankshaft/crankshaft/clustering/kmeans.py b/src/py/crankshaft/crankshaft/clustering/kmeans.py
index bb3343b..2477d80 100644
--- a/src/py/crankshaft/crankshaft/clustering/kmeans.py
+++ b/src/py/crankshaft/crankshaft/clustering/kmeans.py
@@ -32,40 +32,45 @@ class Kmeans:
         return zip(ids, labels)
 
     def nonspatial(self, subquery, colnames, num_clusters=5,
-                   id_col='cartodb_id', standarize=True):
+                   standardize=True, id_col='cartodb_id'):
         """
+            Inputs:
             query (string): A SQL query to retrieve the data required to do the
                             k-means clustering analysis, like so:
                             SELECT * FROM iris_flower_data
             colnames (list): a list of the column names which contain the data
-                             of interest, like so: ["sepal_width",
-                                                    "petal_width",
-                                                    "sepal_length",
-                                                    "petal_length"]
+                             of interest, like so: ['sepal_width',
+                                                    'petal_width',
+                                                    'sepal_length',
+                                                    'petal_length']
             num_clusters (int): number of clusters (greater than zero)
             id_col (string): name of the input id_column
+            standardize (bool): scale columns to zero mean, unit variance
+
+            Output:
+            A list of tuples with the following columns:
+            cluster labels: a label for the cluster that the row belongs to
+            centers: center of the cluster that this row belongs to
+            silhouettes: silhouette measure for this row
+            rowid: row these values belong to (the value in `id_col`)
         """
         import json
         from sklearn import metrics
 
-        out_id_colname = 'rowids'
         # TODO: need a random seed?
-        params = {"cols": colnames,
+        params = {"colnames": colnames,
                   "subquery": subquery,
                   "id_col": id_col}
 
-        data = self.data_provider.get_nonspatial_kmeans(params, standarize)
+        data = self.data_provider.get_nonspatial_kmeans(params)
 
         # fill array with values for k-means clustering
-        if standarize:
+        if standardize:
             cluster_columns = _scale_data(
               _extract_columns(data, len(colnames)))
         else:
             cluster_columns = _extract_columns(data, len(colnames))
 
-        print str(cluster_columns)
-        # TODO: decide on optimal parameters for most cases
-        #       Are there ways of deciding parameters based on inputs?
         kmeans = KMeans(n_clusters=num_clusters,
                         random_state=0).fit(cluster_columns)
 
@@ -79,7 +84,7 @@ class Kmeans:
         return zip(kmeans.labels_,
                    centers,
                    silhouettes,
-                   data[0][out_id_colname])
+                   data[0]['rowid'])
 
 
 # -- Preprocessing steps
@@ -102,4 +107,5 @@ def _scale_data(features):
         features (numpy matrix): features of dimension (n_features, n_samples)
     """
     from sklearn.preprocessing import StandardScaler
-    return StandardScaler().fit_transform(features)
+    scaler = StandardScaler()
+    return scaler.fit_transform(features)
diff --git a/src/py/crankshaft/test/test_clustering_kmeans.py b/src/py/crankshaft/test/test_clustering_kmeans.py
index 572a514..3756b7e 100644
--- a/src/py/crankshaft/test/test_clustering_kmeans.py
+++ b/src/py/crankshaft/test/test_clustering_kmeans.py
@@ -19,7 +19,7 @@ class FakeDataProvider(AnalysisDataProvider):
     def get_spatial_kmeans(self, query):
         return self.mocked_result
 
-    def get_nonspatial_kmeans(self, query, standarize):
+    def get_nonspatial_kmeans(self, query):
         return self.mocked_result
 
 
@@ -66,7 +66,7 @@ class KMeansNonspatialTest(unittest.TestCase):
         # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn-cluster-kmeans
         data_raw = [OrderedDict([("arr_col1", [1, 1, 1, 4, 4, 4]),
                                  ("arr_col2", [2, 4, 0, 2, 4, 0]),
-                                 ("rowids", [1, 2, 3, 4, 5, 6])])]
+                                 ("rowid", [1, 2, 3, 4, 5, 6])])]
 
         random_seeds.set_random_seeds(1234)
         kmeans = Kmeans(FakeDataProvider(data_raw))