Merge pull request #59 from CartoDB/kmeans

KMeans clustering and weighted centroid analysis
2016-06-16 12:39:13 +02:00 · 2016-06-16 12:39:13 +02:00 · a222341863
commit a222341863
parent 284d8ede44 fd1862167c
9 changed files with 189 additions and 2 deletions
--- a/doc/11_kmeans.md
+++ b/doc/11_kmeans.md
@ -0,0 +1,62 @@
 ## K-Means Functions
 ### CDB_KMeans(subquery text, no_clusters INTEGER)
 This function attempts to find n clusters within the input data. It will return a table to CartoDB ids and 
 the number of the cluster each point in the input was assigend to.
 #### Arguments
 | Name | Type | Description |
 |------|------|-------------|
 | subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM interesting_table`). This query must have the geometry column name `the_geom` and id column name `cartodb_id` unless otherwise specified in the input arguments |
 | no\_clusters | INTEGER | The number of clusters to try and find |
 #### Returns
 A table with the following columns.
 | Column Name | Type | Description |
 |-------------|------|-------------|
 | cartodb\_id | INTEGER | The CartoDB id of the row in the input table.|
 | cluster\_no | INTEGER | The cluster that this point belongs to. |
 #### Example Usage
 ```sql
 SELECT 
    customers.*, 
    km.cluster_no 
    FROM cdb_crankshaft.CDB_Kmeans('SELECT * from customers' , 6) km, customers_3
    WHERE customers.cartodb_id = km.cartodb_id
 ```
 ### CDB_WeightedMean(subquery text, weight_column text, category_column text)
 Function that computes the weighted centroid of a number of clusters by some weight column.
 ### Arguments 
 | Name | Type | Description |
 |------|------|-------------|
 | subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM interesting_table`). This query must have the geometry column and the columns specified as the weight and category columns|
 | weight\_column | TEXT | The name of the column to use as a weight |
 | category\_column | TEXT | The name of the column to use as a category |
 ### Returns 
 A table with the following columns.
 | Column Name | Type | Description |
 |-------------|------|-------------|
 | the\_geom | GEOMETRY | A point for the weighted cluster center |
 | class | INTEGER | The cluster class | 
 ### Example Usage 
 ```sql 
 SELECT ST_TRANSFORM(the_geom, 3857) as the_geom_webmercator, class 
 FROM cdb_weighted_mean('SELECT *, customer_value FROM customers','customer_value','cluster_no')
 ```
--- a/src/pg/sql/11_kmeans.sql
+++ b/src/pg/sql/11_kmeans.sql
@ -0,0 +1,51 @@
 CREATE OR REPLACE FUNCTION  CDB_KMeans(query text, no_clusters integer,no_init integer default 20)
 RETURNS table (cartodb_id integer, cluster_no integer) as $$
    import plpy 
    plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
    from crankshaft.clustering import kmeans
    return kmeans(query,no_clusters,no_init)
 $$ language plpythonu;
 CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(state Numeric[],the_geom GEOMETRY(Point, 4326), weight NUMERIC)
 RETURNS Numeric[] AS 
 $$
 DECLARE 
    newX NUMERIC;
    newY NUMERIC;
    newW NUMERIC;
 BEGIN
    IF weight IS NULL OR the_geom IS NULL THEN 
        newX = state[1];
        newY = state[2];
        newW = state[3];
    ELSE
        newX = state[1] + ST_X(the_geom)*weight;
        newY = state[2] + ST_Y(the_geom)*weight;
        newW = state[3] + weight;
    END IF;
    RETURN Array[newX,newY,newW];
 END
 $$ LANGUAGE plpgsql;
 CREATE OR REPLACE FUNCTION CDB_WeightedMeanF(state Numeric[])
 RETURNS GEOMETRY AS 
 $$
 BEGIN
    IF state[3] = 0 THEN 
        RETURN ST_SetSRID(ST_MakePoint(state[1],state[2]), 4326);
    ELSE 
        RETURN ST_SETSRID(ST_MakePoint(state[1]/state[3], state[2]/state[3]),4326);
    END IF;
 END
 $$ LANGUAGE plpgsql;
 CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC)(
    SFUNC = CDB_WeightedMeanS,
    FINALFUNC = CDB_WeightedMeanF,
    STYPE = Numeric[],
    INITCOND = "{0.0,0.0,0.0}" 
 );
--- a/src/pg/test/expected/05_kmeans_test.out
+++ b/src/pg/test/expected/05_kmeans_test.out
@ -0,0 +1,10 @@
 \pset format unaligned
 \set ECHO all
 SELECT count(DISTINCT cluster_no) as clusters from cdb_crankshaft.cdb_kmeans('select * from ppoints', 2);
 clusters
 2
 (1 row)
 SELECT count(*) clusters from (select  cdb_crankshaft.CDB_WeightedMean(the_geom, value::NUMERIC), code from ppoints group by code) p;
 clusters
 52
 (1 row)
--- a/src/pg/test/sql/05_kmeans_test.sql
+++ b/src/pg/test/sql/05_kmeans_test.sql
@ -0,0 +1,6 @@
 \pset format unaligned
 \set ECHO all
 SELECT count(DISTINCT cluster_no) as clusters from cdb_crankshaft.cdb_kmeans('select * from ppoints', 2);
 SELECT count(*) clusters from (select  cdb_crankshaft.CDB_WeightedMean(the_geom, value::NUMERIC), code from ppoints group by code) p;
--- a/src/py/crankshaft/crankshaft/clustering/init.py
+++ b/src/py/crankshaft/crankshaft/clustering/init.py
@ -1 +1,2 @@
 from moran import *
 from kmeans import *
--- a/src/py/crankshaft/crankshaft/clustering/kmeans.py
+++ b/src/py/crankshaft/crankshaft/clustering/kmeans.py
@ -0,0 +1,18 @@
 from sklearn.cluster import KMeans
 import plpy
 def kmeans(query, no_clusters, no_init=20):
    data = plpy.execute('''select array_agg(cartodb_id order by cartodb_id) as ids,
        array_agg(ST_X(the_geom) order by cartodb_id) xs,
        array_agg(ST_Y(the_geom) order by cartodb_id) ys from ({query}) a
        where the_geom is not null
    '''.format(query=query))
    xs  = data[0]['xs']
    ys  = data[0]['ys']
    ids = data[0]['ids']
    km = KMeans(n_clusters= no_clusters, n_init=no_init)
    labels = km.fit_predict(zip(xs,ys))
    return zip(ids,labels)
--- a/src/py/crankshaft/setup.py
+++ b/src/py/crankshaft/setup.py
@ -40,9 +40,9 @@ setup(
    # The choice of component versions is dictated by what's
    # provisioned in the production servers.
-    install_requires=['pysal==1.9.1'],
+    install_requires=['pysal==1.9.1', 'scikit-learn==0.17.1'],
-    requires=['pysal', 'numpy' ],
+    requires=['pysal', 'numpy', 'sklearn'],
    test_suite='test'
 )
--- a/src/py/crankshaft/test/fixtures/kmeans.json
+++ b/src/py/crankshaft/test/fixtures/kmeans.json
@ -0,0 +1 @@
 [{"xs": [9.917239463463458, 9.042767302696836, 10.798929825304187, 8.763751051762995, 11.383882954810852, 11.018206993460897, 8.939526075734316, 9.636159342565252, 10.136336896960058, 11.480610059427342, 12.115011910725082, 9.173267848893428, 10.239300931201738, 8.00012512174072, 8.979962292282131, 9.318376124429575, 10.82259513754284, 10.391747171927115, 10.04904588886165, 9.96007160443463, -0.78825626804569, -0.3511819898577426, -1.2796410003764271, -0.3977049391203402, 2.4792311265774667, 1.3670311632092624, 1.2963504112955613, 2.0404844103073025, -1.6439708506073223, 0.39122885445645805, 1.026031821452462, -0.04044477160482201, -0.7442346929085072, -0.34687120826243034, -0.23420359971379054, -0.5919629143336708, -0.202903054395391, -0.1893399644841902, 1.9331834251176807, -0.12321054392851609], "ys": [8.735627063679981, 9.857615954045011, 10.81439096759407, 10.586727233537191, 9.232919976568622, 11.54281262696508, 8.392787912674466, 9.355119689665944, 9.22380703532752, 10.542142541823122, 10.111980619367035, 10.760836265570738, 8.819773453269804, 10.25325722424816, 9.802077905695608, 8.955420161552611, 9.833801181904477, 10.491684241001613, 12.076108669877556, 11.74289693140474, -0.5685725015474191, -0.5715728344759778, -0.20180907868635137, 0.38431336480089595, -0.3402202083684184, -2.4652736827783586, 0.08295159401756182, 0.8503818775816505, 0.6488691600321166, 0.5794762568230527, -0.6770063922144103, -0.6557616416449478, -1.2834289177624947, 0.1096318195532717, -0.38986922166834853, -1.6224497706950238, 0.09429787743230483, 0.4005097316394031, -0.508002811195673, -1.2473463371366507], "ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]}]
--- a/src/py/crankshaft/test/test_cluster_kmeans.py
+++ b/src/py/crankshaft/test/test_cluster_kmeans.py
@ -0,0 +1,38 @@
 import unittest
 import numpy as np
 # from mock_plpy import MockPlPy
 # plpy = MockPlPy()
 #
 # import sys
 # sys.modules['plpy'] = plpy
 from helper import plpy, fixture_file
 import numpy as np
 import crankshaft.clustering as cc
 import crankshaft.pysal_utils as pu
 from crankshaft import random_seeds
 import json
 class KMeansTest(unittest.TestCase):
    """Testing class for Moran's I functions"""
    def setUp(self):
        plpy._reset()
        self.cluster_data = json.loads(open(fixture_file('kmeans.json')).read())
        self.params = {"subquery": "select * from table",
                       "no_clusters": "10"
                       }
    def test_kmeans(self):
        data = self.cluster_data
        plpy._define_result('select' ,data)
        clusters = cc.kmeans('subquery', 2)
        labels  = [a[1] for a in clusters]
        c1 = [a for a in clusters if a[1]==0]
        c2 = [a for a in clusters if a[1]==1]
        self.assertEqual(len(np.unique(labels)),2)
        self.assertEqual(len(c1),20)
        self.assertEqual(len(c2),20)
`@ -1 +1,2 @@`
	`from moran import *`	`from moran import *`
		`from kmeans import *`
		`@ -0,0 +1 @@`
							[{"xs": [9.917239463463458, 9.042767302696836, 10.798929825304187, 8.763751051762995, 11.383882954810852, 11.018206993460897, 8.939526075734316, 9.636159342565252, 10.136336896960058, 11.480610059427342, 12.115011910725082, 9.173267848893428, 10.239300931201738, 8.00012512174072, 8.979962292282131, 9.318376124429575, 10.82259513754284, 10.391747171927115, 10.04904588886165, 9.96007160443463, -0.78825626804569, -0.3511819898577426, -1.2796410003764271, -0.3977049391203402, 2.4792311265774667, 1.3670311632092624, 1.2963504112955613, 2.0404844103073025, -1.6439708506073223, 0.39122885445645805, 1.026031821452462, -0.04044477160482201, -0.7442346929085072, -0.34687120826243034, -0.23420359971379054, -0.5919629143336708, -0.202903054395391, -0.1893399644841902, 1.9331834251176807, -0.12321054392851609], "ys": [8.735627063679981, 9.857615954045011, 10.81439096759407, 10.586727233537191, 9.232919976568622, 11.54281262696508, 8.392787912674466, 9.355119689665944, 9.22380703532752, 10.542142541823122, 10.111980619367035, 10.760836265570738, 8.819773453269804, 10.25325722424816, 9.802077905695608, 8.955420161552611, 9.833801181904477, 10.491684241001613, 12.076108669877556, 11.74289693140474, -0.5685725015474191, -0.5715728344759778, -0.20180907868635137, 0.38431336480089595, -0.3402202083684184, -2.4652736827783586, 0.08295159401756182, 0.8503818775816505, 0.6488691600321166, 0.5794762568230527, -0.6770063922144103, -0.6557616416449478, -1.2834289177624947, 0.1096318195532717, -0.38986922166834853, -1.6224497706950238, 0.09429787743230483, 0.4005097316394031, -0.508002811195673, -1.2473463371366507], "ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]}]