first add

2016-10-11 16:38:18 -04:00 · 2016-10-11 16:38:18 -04:00 · 947d6ba798
commit 947d6ba798
parent ecb4bd9606
2 changed files with 86 additions and 24 deletions
--- a/src/pg/sql/11_kmeans.sql
+++ b/src/pg/sql/11_kmeans.sql
@ -1,10 +1,23 @@
-CREATE OR REPLACE FUNCTION  CDB_KMeans(query text, no_clusters integer,no_init integer default 20)
+-- Spatial k-means clustering
+-- query: SQL query supplying the rows to cluster; the Python helper reads
+--        its cartodb_id and the_geom columns
+-- no_clusters: number of clusters, forwarded to crankshaft's kmeans()
+-- no_init: forwarded to crankshaft's kmeans() (default 20)
+-- Returns the (cartodb_id, cluster_no) rows produced by kmeans()
+
+CREATE OR REPLACE FUNCTION CDB_KMeans(query text, no_clusters integer, no_init integer default 20)
 RETURNS table (cartodb_id integer, cluster_no integer) as $$

    from crankshaft.clustering import kmeans
-    return kmeans(query,no_clusters,no_init)
+    return kmeans(query, no_clusters, no_init)

-$$ language plpythonu;
+$$ LANGUAGE plpythonu;
+
+-- Non-spatial k-means clustering
+-- query: sql query to retrieve all the needed data
+-- col_names: names of the columns in query whose values are clustered on
+-- no_clusters: number of clusters to compute
+-- id_col: name of the row id column in query (default 'cartodb_id')
+-- Returns one (rowid, cluster_no) row per clustered input row
+
+CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial(query TEXT, col_names TEXT[], no_clusters INTEGER, id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE(rowid BIGINT, cluster_no INTEGER) AS $$
+
+    from crankshaft.clustering import kmeans_nonspatial
+
+    # kmeans_nonspatial returns (label, center, rowid) tuples; emit only the
+    # two declared columns, in the declared (rowid, cluster_no) order.
+    results = kmeans_nonspatial(query, col_names, no_clusters, id_col)
+    return [(rowid, label) for label, _center, rowid in results]
+
+$$ LANGUAGE plpythonu;


 CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(state Numeric[],the_geom GEOMETRY(Point, 4326), weight NUMERIC)
--- a/src/py/crankshaft/crankshaft/clustering/kmeans.py
+++ b/src/py/crankshaft/crankshaft/clustering/kmeans.py
@ -1,18 +1,67 @@
 from sklearn.cluster import KMeans
 import plpy

-def kmeans(query, no_clusters, no_init=20):
-    data = plpy.execute('''select array_agg(cartodb_id order by cartodb_id) as ids,
-        array_agg(ST_X(the_geom) order by cartodb_id) xs,
-        array_agg(ST_Y(the_geom) order by cartodb_id) ys from ({query}) a
-        where the_geom is not null
-    '''.format(query=query))

-    xs  = data[0]['xs']
-    ys  = data[0]['ys']
+def kmeans(query, no_clusters, no_init=20):
+    """
+    Cluster point geometries with k-means on their X/Y coordinates.
+
+    Args:
+        query (string): SQL query exposing `cartodb_id` and `the_geom`
+                        columns; rows with a NULL `the_geom` are ignored
+        no_clusters (int): passed to KMeans as `n_clusters`
+        no_init (int): passed to KMeans as `n_init`
+
+    Returns:
+        list of (cartodb_id, cluster label) pairs ordered by cartodb_id;
+        empty when the query yields no rows with a geometry
+
+    Raises:
+        plpy.error is invoked if the SQL query fails
+    """
+    # Every aggregate needs an explicit alias: the result row is read by
+    # key below, and an unaliased array_agg would not be reachable as 'ys'.
+    full_query = '''
+      SELECT array_agg(cartodb_id ORDER BY cartodb_id) as ids,
+             array_agg(ST_X(the_geom) ORDER BY cartodb_id) xs,
+             array_agg(ST_Y(the_geom) ORDER BY cartodb_id) ys
+        FROM ({query}) As a
+       WHERE the_geom IS NOT NULL
+        '''.format(query=query)
+    try:
+        data = plpy.execute(full_query)
+    except plpy.SPIError as err:
+        plpy.error("KMeans cluster failed: %s" % err)
+
+    xs = data[0]['xs']
+    ys = data[0]['ys']
    ids = data[0]['ids']

-    km = KMeans(n_clusters= no_clusters, n_init=no_init)
-    labels = km.fit_predict(zip(xs,ys))
-    return zip(ids,labels)
+    # array_agg over zero rows yields NULL (None): nothing to cluster
+    if not ids:
+        return []
+
+    km = KMeans(n_clusters=no_clusters, n_init=no_init)
+    labels = km.fit_predict(list(zip(xs, ys)))
+    return zip(ids, labels)

+
+def kmeans_nonspatial(query, colnames, num_clusters=5, id_col='cartodb_id'):
+    """
+    Cluster rows with k-means on an arbitrary set of numeric columns.
+
+    Args:
+        query (string): A SQL query to retrieve the data required to do the
+                        k-means clustering analysis, like so:
+                        SELECT * FROM iris_flower_data
+        colnames (list): a list of the column names which contain the data of
+                         interest, like so: ["sepal_width", "petal_width",
+                                             "sepal_length", "petal_length"]
+        num_clusters (int): number of clusters (greater than zero)
+        id_col (string): name of the input id_column
+
+    Returns:
+        list of (label, center, rowid) tuples, one per input row, where
+        `center` is the string form of the row's cluster center; empty
+        when the query yields no rows
+
+    Raises:
+        plpy.error is invoked if the SQL query fails
+    """
+    # local import: numpy is not imported at module level in this file
+    import numpy as np
+
+    id_colname = 'rowids'
+    # Explicit aliases fix the feature order to that of `colnames`,
+    # independent of the key order of the result row.
+    col_aliases = ['col{0}'.format(idx) for idx in range(len(colnames))]
+
+    full_query = '''
+        SELECT {cols}, array_agg({id_col}) As {id_colname}
+        FROM ({query}) As a
+    '''.format(query=query,
+               id_col=id_col,
+               id_colname=id_colname,
+               cols=', '.join(['array_agg({0}) As {1}'.format(val, alias)
+                               for val, alias in zip(colnames, col_aliases)]))
+
+    try:
+        result = plpy.execute(full_query)
+        plpy.notice('query: %s' % full_query)
+    except plpy.SPIError as err:
+        plpy.error('KMeans cluster failed: %s' % err)
+
+    row = result[0]
+    rowids = row[id_colname]
+    # array_agg over zero rows yields NULL (None): nothing to cluster
+    if not rowids:
+        return []
+
+    # one array per column -> transpose to (n_rows, n_columns) for KMeans
+    data = np.array([row[alias] for alias in col_aliases], dtype=float).T
+
+    km = KMeans(n_clusters=num_clusters, random_state=0).fit(data)
+
+    # cluster_centers_ has one row per cluster; index it by label so every
+    # input row is paired with the center of its own cluster.
+    centers = km.cluster_centers_[km.labels_]
+    return zip(km.labels_, map(str, centers), rowids)