first add

2016-10-11 16:38:18 -04:00 · 2016-10-11 16:38:18 -04:00 · 947d6ba798
commit 947d6ba798
parent ecb4bd9606
2 changed files with 86 additions and 24 deletions
--- a/src/pg/sql/11_kmeans.sql
+++ b/src/pg/sql/11_kmeans.sql
@ -1,10 +1,23 @@
-- Spatial k-means clustering
-- query: sql query supplying the rows to cluster
-- no_clusters: number of clusters to compute
-- no_init: forwarded unchanged to crankshaft's kmeans (defaults to 20)
CREATE OR REPLACE FUNCTION CDB_KMeans(query text, no_clusters integer, no_init integer default 20)
RETURNS table (cartodb_id integer, cluster_no integer) as $$
    from crankshaft.clustering import kmeans
    return kmeans(query, no_clusters, no_init)
$$ LANGUAGE plpythonu;
 -- Non-spatial k-means clustering
 -- query: sql query to retrieve all the needed data
 -- col_names: names of the columns in query that hold the values to cluster on
 -- no_clusters: number of clusters to compute
 -- id_col: name of the id column in query (defaults to cartodb_id)
 -- NOTE(review): kmeans_nonspatial is written to return
 --   (label, center, rowid) triples, which does not line up with the two
 --   columns declared below -- confirm the intended output columns.
 CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial(query TEXT, col_names TEXT[], no_clusters INTEGER, id_col TEXT DEFAULT 'cartodb_id')
 RETURNS TABLE(rowid BIGINT, cluster_no INTEGER) AS $$
     from crankshaft.clustering import kmeans_nonspatial
     # pass the declared SQL parameters; the Python side names them
     # colnames / num_clusters
     return kmeans_nonspatial(query, col_names, no_clusters, id_col)
 $$ LANGUAGE plpythonu;
 CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(state Numeric[],the_geom GEOMETRY(Point, 4326), weight NUMERIC)
--- a/src/py/crankshaft/crankshaft/clustering/kmeans.py
+++ b/src/py/crankshaft/crankshaft/clustering/kmeans.py
@ -1,18 +1,67 @@
 from sklearn.cluster import KMeans
 import plpy
 def kmeans(query, no_clusters, no_init=20):
     """
     Cluster point geometries by their X/Y coordinates using k-means.

     Args:
         query (string): SQL query exposing `cartodb_id` and `the_geom`
             columns; rows whose `the_geom` is NULL are ignored.
         no_clusters (int): number of clusters, passed to KMeans as
             n_clusters.
         no_init (int): passed to KMeans as n_init.

     Returns:
         list of (cartodb_id, cluster label) pairs, ordered by cartodb_id;
         empty when the query yields no rows with a geometry.

     Raises:
         plpy.error is invoked with the underlying message when the query
         fails.
     """
     # All three aggregates share the same ORDER BY so ids, xs and ys stay
     # aligned position by position.
     full_query = '''
       SELECT array_agg(cartodb_id ORDER BY cartodb_id) As ids,
              array_agg(ST_X(the_geom) ORDER BY cartodb_id) As xs,
              array_agg(ST_Y(the_geom) ORDER BY cartodb_id) As ys
         FROM ({query}) As a
        WHERE the_geom IS NOT NULL
         '''.format(query=query)
     try:
         data = plpy.execute(full_query)
     except plpy.SPIError as err:
         # plpy.error raises, so `data` is always bound past this point
         plpy.error("KMeans cluster failed: %s" % err)
     ids = data[0]['ids']
     # array_agg over zero rows is NULL, which arrives here as None
     if not ids:
         return []
     xs = data[0]['xs']
     ys = data[0]['ys']
     km = KMeans(n_clusters=no_clusters, n_init=no_init)
     labels = km.fit_predict(list(zip(xs, ys)))
     return list(zip(ids, labels))
 def kmeans_nonspatial(query, colnames, num_clusters=5, id_col='cartodb_id'):
     """
     Cluster rows with k-means on an arbitrary set of numeric columns.

     Args:
         query (string): A SQL query to retrieve the data required to do the
                         k-means clustering analysis, like so:
                         SELECT * FROM iris_flower_data
         colnames (list): a list of the column names which contain the data
                          of interest, like so: ["sepal_width", "petal_width",
                                                 "sepal_length", "petal_length"]
         num_clusters (int): number of clusters (greater than zero)
         id_col (string): name of the input id_column

     Returns:
         list of (cluster label, cluster center as a string, row id) tuples,
         one per input row; empty when the query yields no rows.

     Raises:
         plpy.error is invoked with the underlying message when the query
         fails.
     """
     # numpy is not imported at module level in this file
     import numpy as np
     id_colname = 'rowids'
     # Each requested column is aggregated into its own array, aliased
     # col0, col1, ... in the order given by colnames.
     full_query = '''
         SELECT {cols}, array_agg({id_col}) As {id_colname}
         FROM ({query}) As a
     '''.format(query=query,
                id_col=id_col,
                id_colname=id_colname,
                cols=', '.join(['array_agg({0}) As col{1}'.format(val, idx)
                                for idx, val in enumerate(colnames)]))
     try:
         result = plpy.execute(full_query)
     except plpy.SPIError as err:
         # plpy.error raises, so `result` is always bound past this point
         plpy.error('KMeans cluster failed: %s' % err)
     plpy.notice('query: %s' % full_query)
     row = result[0]
     rowids = row[id_colname]
     # array_agg over zero rows is NULL, which arrives here as None
     if not rowids:
         return []
     # Read the feature arrays by their aliases rather than by iterating the
     # row's keys, so the column order always follows colnames; transpose to
     # get one row per observation.
     features = np.array([row['col{0}'.format(idx)]
                          for idx in range(len(colnames))],
                         dtype=float).T
     model = KMeans(n_clusters=num_clusters, random_state=0).fit(features)
     # cluster_centers_ holds one entry per cluster; index it by label so
     # every row is paired with the center of its own cluster instead of the
     # zip being cut short at num_clusters entries.
     centers = model.cluster_centers_[model.labels_]
     return list(zip(model.labels_, map(str, centers), rowids))