From b6dae5e3801e7f33360c16172b41b4c62077614e Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Tue, 15 Nov 2016 00:15:23 +0100 Subject: [PATCH] adding silhouette --- src/py/crankshaft/crankshaft/clustering/kmeans.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/py/crankshaft/crankshaft/clustering/kmeans.py b/src/py/crankshaft/crankshaft/clustering/kmeans.py index 5bd7830..52139d1 100644 --- a/src/py/crankshaft/crankshaft/clustering/kmeans.py +++ b/src/py/crankshaft/crankshaft/clustering/kmeans.py @@ -41,6 +41,8 @@ def kmeans_nonspatial(query, colnames, num_clusters=5, id_col (string): name of the input id_column """ import json + from sklearn import metrics + out_id_colname = 'rowids' # TODO: need a random seed? @@ -70,7 +72,13 @@ def kmeans_nonspatial(query, colnames, num_clusters=5, kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(cluster_columns) - centers = [json.dumps(dict(zip(colnames, c))) for c in kmeans.cluster_centers_[kmeans.labels_]] + centers = [json.dumps(dict(zip(colnames, c))) + for c in kmeans.cluster_centers_[kmeans.labels_]] + + silhouettes = metrics.silhouette_samples(cluster_columns, + labels, + metric='sqeuclidean') + return zip(kmeans.labels_, centers, db_resp[0][out_id_colname])