Bug fixes and addition of internal docs

This commit is contained in:
Andy Eschbacher 2017-01-10 09:49:16 -05:00
parent 3dad9c6044
commit c6f64ad2f4
4 changed files with 52 additions and 22 deletions

View File

@ -11,20 +11,25 @@ $$ LANGUAGE plpythonu;
-- Non-spatial k-means clustering
-- query: sql query to retrieve all the needed data
-- colnames: text array of column names for doing the clustering analysis
-- standardize: whether to scale variables to a mean of zero and a standard
-- deviation of 1
-- id_colname: name of the id column
CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial(
query TEXT,
colnames TEXT[],
num_clusters INTEGER,
id_colname TEXT DEFAULT 'cartodb_id',
standarize BOOLEAN DEFAULT true
standardize BOOLEAN DEFAULT true,
id_colname TEXT DEFAULT 'cartodb_id'
)
RETURNS TABLE(cluster_label text, cluster_center json, silhouettes numeric, rowid bigint) AS $$
from crankshaft.clustering import Kmeans
kmeans = Kmeans()
return kmeans.nonspatial(query, colnames, num_clusters,
id_colname, standarize)
standardize=standardize,
id_col=id_colname)
$$ LANGUAGE plpythonu;

View File

@ -45,18 +45,34 @@ class AnalysisDataProvider:
return pu.empty_zipped_array(2)
def get_nonspatial_kmeans(self, params):
"""fetch data for non-spatial kmeans"""
"""
Fetch data for non-spatial k-means.
Inputs - a dict (params) with the following keys:
colnames: a (text) list of column names (e.g.,
`['andy', 'cookie']`)
id_col: the name of the id column (e.g., `'cartodb_id'`)
subquery: the subquery for exposing the data (e.g.,
SELECT * FROM favorite_things)
Output:
A SQL query for packaging the data for consumption within
`KMeans().nonspatial`. Format will be a list of length one,
with the first element a dict with keys ('rowid', 'attr1',
'attr2', ...)
"""
agg_cols = ', '.join(['array_agg({0}) As arr_col{1}'.format(val, idx+1)
for idx, val in enumerate(params['colnames'])])
print agg_cols
query = '''
SELECT {cols}, array_agg({id_col}) As rowid
FROM ({subquery}) As a
'''.format(subquery=params['subquery'],
id_col=params['id_col'],
cols=agg_cols)
cols=agg_cols).strip()
try:
data = plpy.execute(query)
if len(data) == 0:
plpy.error('No non-null-valued data to analyze. Check the '
'rows and columns of all of the inputs')
return data
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)
@ -71,6 +87,9 @@ class AnalysisDataProvider:
"WHERE {geom_col} IS NOT NULL").format(**params)
try:
data = plpy.execute(query)
if len(data) == 0:
plpy.error('No non-null-valued data to analyze. Check the '
'rows and columns of all of the inputs')
return data
except plpy.SPIError, err:
plpy.error('Analysis failed: %s' % err)

View File

@ -32,40 +32,45 @@ class Kmeans:
return zip(ids, labels)
def nonspatial(self, subquery, colnames, num_clusters=5,
id_col='cartodb_id', standarize=True):
standardize=True, id_col='cartodb_id'):
"""
Inputs:
query (string): A SQL query to retrieve the data required to do the
k-means clustering analysis, like so:
SELECT * FROM iris_flower_data
colnames (list): a list of the column names which contain the data
of interest, like so: ["sepal_width",
"petal_width",
"sepal_length",
"petal_length"]
of interest, like so: ['sepal_width',
'petal_width',
'sepal_length',
'petal_length']
num_clusters (int): number of clusters (greater than zero)
id_col (string): name of the input id_column
Output:
A list of tuples with the following columns:
cluster labels: a label for the cluster that the row belongs to
centers: center of the cluster that this row belongs to
silhouettes: silhouette measure for this value
rowid: row that these values belong to (corresponds to the value in
`id_col`)
"""
import json
from sklearn import metrics
out_id_colname = 'rowids'
# TODO: need a random seed?
params = {"cols": colnames,
params = {"colnames": colnames,
"subquery": subquery,
"id_col": id_col}
data = self.data_provider.get_nonspatial_kmeans(params, standarize)
data = self.data_provider.get_nonspatial_kmeans(params)
# fill array with values for k-means clustering
if standarize:
if standardize:
cluster_columns = _scale_data(
_extract_columns(data, len(colnames)))
else:
cluster_columns = _extract_columns(data, len(colnames))
print str(cluster_columns)
# TODO: decide on optimal parameters for most cases
# Are there ways of deciding parameters based on inputs?
kmeans = KMeans(n_clusters=num_clusters,
random_state=0).fit(cluster_columns)
@ -79,7 +84,7 @@ class Kmeans:
return zip(kmeans.labels_,
centers,
silhouettes,
data[0][out_id_colname])
data[0]['rowid'])
# -- Preprocessing steps
@ -102,4 +107,5 @@ def _scale_data(features):
features (numpy matrix): features of dimension (n_features, n_samples)
"""
from sklearn.preprocessing import StandardScaler
return StandardScaler().fit_transform(features)
scaler = StandardScaler()
return scaler.fit_transform(features)

View File

@ -19,7 +19,7 @@ class FakeDataProvider(AnalysisDataProvider):
def get_spatial_kmeans(self, query):
return self.mocked_result
def get_nonspatial_kmeans(self, query, standarize):
def get_nonspatial_kmeans(self, query):
return self.mocked_result
@ -66,7 +66,7 @@ class KMeansNonspatialTest(unittest.TestCase):
# http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn-cluster-kmeans
data_raw = [OrderedDict([("arr_col1", [1, 1, 1, 4, 4, 4]),
("arr_col2", [2, 4, 0, 2, 4, 0]),
("rowids", [1, 2, 3, 4, 5, 6])])]
("rowid", [1, 2, 3, 4, 5, 6])])]
random_seeds.set_random_seeds(1234)
kmeans = Kmeans(FakeDataProvider(data_raw))