diff --git a/src/pg/sql/11_kmeans.sql b/src/pg/sql/11_kmeans.sql index 2db57e0..6a9d1a9 100644 --- a/src/pg/sql/11_kmeans.sql +++ b/src/pg/sql/11_kmeans.sql @@ -11,12 +11,18 @@ $$ LANGUAGE plpythonu; -- Non-spatial k-means clustering -- query: sql query to retrieve all the needed data -CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial(query TEXT, colnames TEXT[], num_clusters INTEGER, id_col TEXT DEFAULT 'cartodb_id') +CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial( + query TEXT, + colnames TEXT[], + num_clusters INTEGER, + id_col TEXT DEFAULT 'cartodb_id', + standarize BOOLEAN DEFAULT true +) RETURNS TABLE(cluster_label text, cluster_center text, rowid bigint) AS $$ -from crankshaft.clustering import kmeans_nonspatial -return kmeans_nonspatial(query, colnames, num_clusters, id_col) - + from crankshaft.clustering import kmeans_nonspatial + return kmeans_nonspatial(query, colnames, num_clusters, + id_col, standarize) $$ LANGUAGE plpythonu; diff --git a/src/py/crankshaft/crankshaft/clustering/kmeans.py b/src/py/crankshaft/crankshaft/clustering/kmeans.py index df024a1..6e972e5 100644 --- a/src/py/crankshaft/crankshaft/clustering/kmeans.py +++ b/src/py/crankshaft/crankshaft/clustering/kmeans.py @@ -62,7 +62,7 @@ def kmeans_nonspatial(query, colnames, num_clusters=5, cluster_columns = _scale_data( _extract_columns(db_resp, id_col=out_id_colname)) else: - cluster_columns = _extract_columns(db_resp) + cluster_columns = _extract_columns(db_resp, id_col=out_id_colname) # TODO: decide on optimal parameters for most cases # Are there ways of deciding parameters based on inputs?