Strip out non-spatial k-means clustering
This commit is contained in:
parent
c5a2746a53
commit
2f27622a6d
@ -9,24 +9,6 @@ RETURNS table (cartodb_id integer, cluster_no integer) as $$
|
|||||||
|
|
||||||
$$ LANGUAGE plpythonu;
|
$$ LANGUAGE plpythonu;
|
||||||
|
|
||||||
-- Non-spatial k-means clustering
--   query:        SQL query to retrieve all the needed data
--   colnames:     names of the columns handed to the clustering routine
--   num_clusters: number of clusters requested
--   id_colname:   name of the row-identifier column (defaults to 'cartodb_id')
--   standarize:   forwarded unchanged to Kmeans.nonspatial (defaults to true)
-- All five arguments are passed positionally, in this order, to
-- crankshaft.clustering.Kmeans.nonspatial; its result is returned as-is.
CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial(
  query TEXT,
  colnames TEXT[],
  num_clusters INTEGER,
  id_colname TEXT DEFAULT 'cartodb_id',
  standarize BOOLEAN DEFAULT true
)
RETURNS TABLE(cluster_label text, cluster_center json, silhouettes numeric, rowid bigint) AS $$

  from crankshaft.clustering import Kmeans

  clusterer = Kmeans()
  results = clusterer.nonspatial(query, colnames, num_clusters,
                                 id_colname, standarize)
  return results
$$ LANGUAGE plpythonu;
|
|
||||||
|
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(state Numeric[],the_geom GEOMETRY(Point, 4326), weight NUMERIC)
|
CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(state Numeric[],the_geom GEOMETRY(Point, 4326), weight NUMERIC)
|
||||||
RETURNS Numeric[] AS
|
RETURNS Numeric[] AS
|
||||||
|
@ -33,82 +33,3 @@ class Kmeans:
|
|||||||
km = KMeans(n_clusters=no_clusters, n_init=no_init)
|
km = KMeans(n_clusters=no_clusters, n_init=no_init)
|
||||||
labels = km.fit_predict(zip(xs, ys))
|
labels = km.fit_predict(zip(xs, ys))
|
||||||
return zip(ids, labels)
|
return zip(ids, labels)
|
||||||
|
|
||||||
def nonspatial(self, query, colnames, num_clusters=5,
               id_col='cartodb_id', standarize=True):
    """
    Run k-means clustering over arbitrary (non-spatial) columns.

    query (string): A SQL query to retrieve the data required to do the
                    k-means clustering analysis, like so:
                    SELECT * FROM iris_flower_data
    colnames (list): a list of the column names which contain the data
                     of interest, like so: ["sepal_width",
                                            "petal_width",
                                            "sepal_length",
                                            "petal_length"]
    num_clusters (int): number of clusters (greater than zero)
    id_col (string): name of the input id_column
    standarize (bool): when true (the default) the extracted features are
                       passed through `_scale_data` before clustering; the
                       flag is also forwarded to the query runner

    Returns the zip of, per input row: the cluster label, a JSON string
    mapping each entry of `colnames` to the matching coordinate of that
    row's cluster center, the row's silhouette value (computed with the
    'sqeuclidean' metric), and the row id from the query response.
    The centers are expressed in the space the model was fitted in, i.e.
    the scaled space when `standarize` is true.
    """
    import json
    from sklearn import metrics

    out_id_colname = 'rowids'
    # TODO: need a random seed?

    # NOTE: `query`, `colnames` and `id_col` are interpolated into the SQL
    # text without any quoting or escaping, so they must come from a
    # trusted caller.
    # NOTE(review): the aggregates are aliased col0..colN here, while
    # `_extract_columns` below indexes the response by the original
    # `colnames` -- confirm the query runner's response is keyed that way.
    aggregates = ', '.join('array_agg({0}) As col{1}'.format(val, idx)
                           for idx, val in enumerate(colnames))
    full_query = '''
        SELECT {cols}, array_agg({id_col}) As {out_id_colname}
        FROM ({query}) As a
        '''.format(query=query,
                   id_col=id_col,
                   out_id_colname=out_id_colname,
                   cols=aggregates)

    db_resp = self.query_runner.get_nonspatial_kmeans(full_query, standarize)

    # fill array with values for k-means clustering
    cluster_columns = _extract_columns(db_resp, colnames)
    if standarize:
        cluster_columns = _scale_data(cluster_columns)

    # TODO: decide on optimal parameters for most cases
    #       Are there ways of deciding parameters based on inputs?
    kmeans = KMeans(n_clusters=num_clusters,
                    random_state=0).fit(cluster_columns)

    # Indexing the centers by the label array repeats each cluster's center
    # once per row assigned to it, so `centers` lines up with the labels.
    centers = [json.dumps(dict(zip(colnames, c)))
               for c in kmeans.cluster_centers_[kmeans.labels_]]

    silhouettes = metrics.silhouette_samples(cluster_columns,
                                             kmeans.labels_,
                                             metric='sqeuclidean')

    return zip(kmeans.labels_,
               centers,
               silhouettes,
               db_resp[0][out_id_colname])
|
|
||||||
|
|
||||||
|
|
||||||
# -- Preprocessing steps
|
|
||||||
|
|
||||||
def _extract_columns(db_resp, colnames):
|
|
||||||
"""
|
|
||||||
Extract the features from the query and pack them into a NumPy array
|
|
||||||
db_resp (plpy data object): result of the kmeans request
|
|
||||||
id_col_name (string): name of column which has the row id (not a
|
|
||||||
feature of the analysis)
|
|
||||||
"""
|
|
||||||
return np.array([db_resp[0][c] for c in colnames],
|
|
||||||
dtype=float).T
|
|
||||||
|
|
||||||
|
|
||||||
def _scale_data(features):
    """
    Scale all input columns to center on 0 with a standard deviation of 1

    features (numpy matrix): feature values, handed unchanged to
                             scikit-learn's StandardScaler.fit_transform
    NOTE(review): `_extract_columns` transposes its array before it is
    passed here, so this presumably receives (n_samples, n_features) --
    confirm against the callers.
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    return scaler.fit_transform(features)
|
|
||||||
|
@ -54,36 +54,3 @@ class KMeansTest(unittest.TestCase):
|
|||||||
self.assertEqual(len(np.unique(labels)), 2)
|
self.assertEqual(len(np.unique(labels)), 2)
|
||||||
self.assertEqual(len(c1), 20)
|
self.assertEqual(len(c1), 20)
|
||||||
self.assertEqual(len(c2), 20)
|
self.assertEqual(len(c2), 20)
|
||||||
|
|
||||||
|
|
||||||
class KMeansNonspatialTest(unittest.TestCase):
    """Testing class for k-means non-spatial"""

    def setUp(self):
        """Store the sample request parameters on the test case."""
        self.params = {"subquery": "SELECT * FROM TABLE",
                       "n_clusters": 5}

    def test_kmeans_nonspatial(self):
        """
        test for k-means non-spatial

        Clusters six two-feature rows into two groups and checks that the
        first three rows share one label and the last three share another.
        """
        # data from:
        # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn-cluster-kmeans
        data_raw = [OrderedDict([("col1", [1, 1, 1, 4, 4, 4]),
                                 ("col2", [2, 4, 0, 2, 4, 0]),
                                 ("rowids", [1, 2, 3, 4, 5, 6])])]

        random_seeds.set_random_seeds(1234)
        kmeans = Kmeans(FakeQueryRunner(data_raw))
        # Materialize the result so it can be indexed even when `zip`
        # yields an iterator rather than a list.
        clusters = list(kmeans.nonspatial('subquery', ['col1', 'col2'], 2))

        # One output row per input row id.
        self.assertEqual(len(clusters), 6)

        cl1 = clusters[0][0]
        cl2 = clusters[3][0]
        # The two groups must actually land in different clusters.
        self.assertNotEqual(cl1, cl2)

        for idx, val in enumerate(clusters):
            if idx < 3:
                self.assertEqual(val[0], cl1)
            else:
                self.assertEqual(val[0], cl2)
|
|
||||||
|
Loading…
Reference in New Issue
Block a user