Bug fixes and addition of internal docs

This commit is contained in:
Andy Eschbacher 2017-01-10 09:49:16 -05:00
parent 3dad9c6044
commit c6f64ad2f4
4 changed files with 52 additions and 22 deletions

View File

@ -11,20 +11,25 @@ $$ LANGUAGE plpythonu;
-- Non-spatial k-means clustering
-- query: sql query to retrieve all the needed data
-- colnames: text array of column names for doing the clustering analysis
-- standardize: whether to scale variables to a mean of zero and a standard
-- deviation of 1
-- id_colname: name of the id column
CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial(
query TEXT,
colnames TEXT[],
num_clusters INTEGER,
id_colname TEXT DEFAULT 'cartodb_id',
standarize BOOLEAN DEFAULT true
standardize BOOLEAN DEFAULT true,
id_colname TEXT DEFAULT 'cartodb_id'
)
RETURNS TABLE(cluster_label text, cluster_center json, silhouettes numeric, rowid bigint) AS $$
from crankshaft.clustering import Kmeans
kmeans = Kmeans()
return kmeans.nonspatial(query, colnames, num_clusters,
id_colname, standarize)
standardize=standardize,
id_col=id_colname)
$$ LANGUAGE plpythonu;

View File

@ -45,18 +45,34 @@ class AnalysisDataProvider:
return pu.empty_zipped_array(2)
def get_nonspatial_kmeans(self, params):
"""fetch data for non-spatial kmeans"""
"""
Fetch data for non-spatial k-means.
Inputs - a dict (params) with the following keys:
colnames: a (text) list of column names (e.g.,
`['andy', 'cookie']`)
id_col: the name of the id column (e.g., `'cartodb_id'`)
subquery: the subquery for exposing the data (e.g.,
SELECT * FROM favorite_things)
Output:
A SQL query for packaging the data for consumption within
`KMeans().nonspatial`. Format will be a list of length one,
with the first element a dict with keys ('rowid', 'attr1',
'attr2', ...)
"""
agg_cols = ', '.join(['array_agg({0}) As arr_col{1}'.format(val, idx+1)
for idx, val in enumerate(params['colnames'])])
print agg_cols
query = '''
SELECT {cols}, array_agg({id_col}) As rowid
FROM ({subquery}) As a
'''.format(subquery=params['subquery'],
id_col=params['id_col'],
cols=agg_cols)
cols=agg_cols).strip()
try:
data = plpy.execute(query)
if len(data) == 0:
plpy.error('No non-null-valued data to analyze. Check the '
'rows and columns of all of the inputs')
return data
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
@ -71,6 +87,9 @@ class AnalysisDataProvider:
"WHERE {geom_col} IS NOT NULL").format(**params)
try:
data = plpy.execute(query)
if len(data) == 0:
plpy.error('No non-null-valued data to analyze. Check the '
'rows and columns of all of the inputs')
return data
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)

View File

@ -32,40 +32,45 @@ class Kmeans:
return zip(ids, labels)
def nonspatial(self, subquery, colnames, num_clusters=5,
id_col='cartodb_id', standarize=True):
standardize=True, id_col='cartodb_id'):
"""
Inputs:
query (string): A SQL query to retrieve the data required to do the
k-means clustering analysis, like so:
SELECT * FROM iris_flower_data
colnames (list): a list of the column names which contain the data
of interest, like so: ["sepal_width",
"petal_width",
"sepal_length",
"petal_length"]
of interest, like so: ['sepal_width',
'petal_width',
'sepal_length',
'petal_length']
num_clusters (int): number of clusters (greater than zero)
id_col (string): name of the input id_column
Output:
A list of tuples with the following columns:
cluster labels: a label for the cluster that the row belongs to
centers: center of the cluster that this row belongs to
silhouettes: silhouette measure for this value
rowid: row that these values belong to (corresponds to the value in
`id_col`)
"""
import json
from sklearn import metrics
out_id_colname = 'rowids'
# TODO: need a random seed?
params = {"cols": colnames,
params = {"colnames": colnames,
"subquery": subquery,
"id_col": id_col}
data = self.data_provider.get_nonspatial_kmeans(params, standarize)
data = self.data_provider.get_nonspatial_kmeans(params)
# fill array with values for k-means clustering
if standarize:
if standardize:
cluster_columns = _scale_data(
_extract_columns(data, len(colnames)))
else:
cluster_columns = _extract_columns(data, len(colnames))
print str(cluster_columns)
# TODO: decide on optimal parameters for most cases
# Are there ways of deciding parameters based on inputs?
kmeans = KMeans(n_clusters=num_clusters,
random_state=0).fit(cluster_columns)
@ -79,7 +84,7 @@ class Kmeans:
return zip(kmeans.labels_,
centers,
silhouettes,
data[0][out_id_colname])
data[0]['rowid'])
# -- Preprocessing steps
@ -102,4 +107,5 @@ def _scale_data(features):
features (numpy matrix): features of dimension (n_features, n_samples)
"""
from sklearn.preprocessing import StandardScaler
return StandardScaler().fit_transform(features)
scaler = StandardScaler()
return scaler.fit_transform(features)

View File

@ -19,7 +19,7 @@ class FakeDataProvider(AnalysisDataProvider):
def get_spatial_kmeans(self, query):
return self.mocked_result
def get_nonspatial_kmeans(self, query, standarize):
def get_nonspatial_kmeans(self, query):
return self.mocked_result
@ -66,7 +66,7 @@ class KMeansNonspatialTest(unittest.TestCase):
# http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn-cluster-kmeans
data_raw = [OrderedDict([("arr_col1", [1, 1, 1, 4, 4, 4]),
("arr_col2", [2, 4, 0, 2, 4, 0]),
("rowids", [1, 2, 3, 4, 5, 6])])]
("rowid", [1, 2, 3, 4, 5, 6])])]
random_seeds.set_random_seeds(1234)
kmeans = Kmeans(FakeDataProvider(data_raw))