From c6f64ad2f401a0b2f14a17453dcc9610e40f01c1 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Tue, 10 Jan 2017 09:49:16 -0500 Subject: [PATCH] bug fixes and adding of internal docs --- src/pg/sql/11_kmeans.sql | 11 ++++-- .../crankshaft/analysis_data_provider.py | 25 ++++++++++++-- .../crankshaft/clustering/kmeans.py | 34 +++++++++++-------- .../crankshaft/test/test_clustering_kmeans.py | 4 +-- 4 files changed, 52 insertions(+), 22 deletions(-) diff --git a/src/pg/sql/11_kmeans.sql b/src/pg/sql/11_kmeans.sql index 6100d27..89e16a8 100644 --- a/src/pg/sql/11_kmeans.sql +++ b/src/pg/sql/11_kmeans.sql @@ -11,20 +11,25 @@ $$ LANGUAGE plpythonu; -- Non-spatial k-means clustering -- query: sql query to retrieve all the needed data +-- colnames: text array of column names for doing the clustering analysis +-- standardize: whether to scale variables to a mean of zero and a standard +-- deviation of 1 +-- id_colname: name of the id column CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial( query TEXT, colnames TEXT[], num_clusters INTEGER, - id_colname TEXT DEFAULT 'cartodb_id', - standarize BOOLEAN DEFAULT true + standardize BOOLEAN DEFAULT true, + id_colname TEXT DEFAULT 'cartodb_id' ) RETURNS TABLE(cluster_label text, cluster_center json, silhouettes numeric, rowid bigint) AS $$ from crankshaft.clustering import Kmeans kmeans = Kmeans() return kmeans.nonspatial(query, colnames, num_clusters, - id_colname, standarize) + standardize=standardize, + id_col=id_colname) $$ LANGUAGE plpythonu; diff --git a/src/py/crankshaft/crankshaft/analysis_data_provider.py b/src/py/crankshaft/crankshaft/analysis_data_provider.py index d942627..ef9baed 100644 --- a/src/py/crankshaft/crankshaft/analysis_data_provider.py +++ b/src/py/crankshaft/crankshaft/analysis_data_provider.py @@ -45,18 +45,34 @@ class AnalysisDataProvider: return pu.empty_zipped_array(2) def get_nonspatial_kmeans(self, params): - """fetch data for non-spatial kmeans""" + """ + Fetch data for non-spatial k-means. + + Inputs - a dict (params) with the following keys: + colnames: a (text) list of column names (e.g., + `['andy', 'cookie']`) + id_col: the name of the id column (e.g., `'cartodb_id'`) + subquery: the subquery for exposing the data (e.g., + SELECT * FROM favorite_things) + Output: + A SQL query for packaging the data for consumption within + `KMeans().nonspatial`. Format will be a list of length one, + with the first element a dict with keys ('rowid', 'attr1', + 'attr2', ...) + """ agg_cols = ', '.join(['array_agg({0}) As arr_col{1}'.format(val, idx+1) for idx, val in enumerate(params['colnames'])]) - print agg_cols query = ''' SELECT {cols}, array_agg({id_col}) As rowid FROM ({subquery}) As a '''.format(subquery=params['subquery'], id_col=params['id_col'], - cols=agg_cols) + cols=agg_cols).strip() try: data = plpy.execute(query) + if len(data) == 0: + plpy.error('No non-null-valued data to analyze. Check the ' + 'rows and columns of all of the inputs') return data except plpy.SPIError, err: plpy.error('Analysis failed: %s' % err) @@ -71,6 +87,9 @@ class AnalysisDataProvider: "WHERE {geom_col} IS NOT NULL").format(**params) try: data = plpy.execute(query) + if len(data) == 0: + plpy.error('No non-null-valued data to analyze. Check the ' + 'rows and columns of all of the inputs') return data except plpy.SPIError, err: plpy.error('Analysis failed: %s' % err) diff --git a/src/py/crankshaft/crankshaft/clustering/kmeans.py b/src/py/crankshaft/crankshaft/clustering/kmeans.py index bb3343b..2477d80 100644 --- a/src/py/crankshaft/crankshaft/clustering/kmeans.py +++ b/src/py/crankshaft/crankshaft/clustering/kmeans.py @@ -32,40 +32,45 @@ class Kmeans: return zip(ids, labels) def nonspatial(self, subquery, colnames, num_clusters=5, - id_col='cartodb_id', standarize=True): + standardize=True, id_col='cartodb_id'): """ + Inputs: query (string): A SQL query to retrieve the data required to do the k-means clustering analysis, like so: SELECT * FROM iris_flower_data colnames (list): a list of the column names which contain the data - of interest, like so: ["sepal_width", - "petal_width", - "sepal_length", - "petal_length"] + of interest, like so: ['sepal_width', + 'petal_width', + 'sepal_length', + 'petal_length'] num_clusters (int): number of clusters (greater than zero) id_col (string): name of the input id_column + + Output: + A list of tuples with the following columns: + cluster labels: a label for the cluster that the row belongs to + centers: center of the cluster that this row belongs to + silhouettes: silhouette measure for this value + rowid: row that these values belong to (corresponds to the value in + `id_col`) """ import json from sklearn import metrics - out_id_colname = 'rowids' # TODO: need a random seed? - params = {"cols": colnames, + params = {"colnames": colnames, "subquery": subquery, "id_col": id_col} - data = self.data_provider.get_nonspatial_kmeans(params, standarize) + data = self.data_provider.get_nonspatial_kmeans(params) # fill array with values for k-means clustering - if standarize: + if standardize: cluster_columns = _scale_data( _extract_columns(data, len(colnames))) else: cluster_columns = _extract_columns(data, len(colnames)) - print str(cluster_columns) - # TODO: decide on optimal parameters for most cases - # Are there ways of deciding parameters based on inputs? kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(cluster_columns) @@ -79,7 +84,7 @@ class Kmeans: return zip(kmeans.labels_, centers, silhouettes, - data[0][out_id_colname]) + data[0]['rowid']) # -- Preprocessing steps @@ -102,4 +107,5 @@ def _scale_data(features): features (numpy matrix): features of dimension (n_features, n_samples) """ from sklearn.preprocessing import StandardScaler - return StandardScaler().fit_transform(features) + scaler = StandardScaler() + return scaler.fit_transform(features) diff --git a/src/py/crankshaft/test/test_clustering_kmeans.py b/src/py/crankshaft/test/test_clustering_kmeans.py index 572a514..3756b7e 100644 --- a/src/py/crankshaft/test/test_clustering_kmeans.py +++ b/src/py/crankshaft/test/test_clustering_kmeans.py @@ -19,7 +19,7 @@ class FakeDataProvider(AnalysisDataProvider): def get_spatial_kmeans(self, query): return self.mocked_result - def get_nonspatial_kmeans(self, query, standarize): + def get_nonspatial_kmeans(self, query): return self.mocked_result @@ -66,7 +66,7 @@ class KMeansNonspatialTest(unittest.TestCase): # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn-cluster-kmeans data_raw = [OrderedDict([("arr_col1", [1, 1, 1, 4, 4, 4]), ("arr_col2", [2, 4, 0, 2, 4, 0]), - ("rowids", [1, 2, 3, 4, 5, 6])])] + ("rowid", [1, 2, 3, 4, 5, 6])])] random_seeds.set_random_seeds(1234) kmeans = Kmeans(FakeDataProvider(data_raw))