From 63f490f6e6048a47461e3ab06fc664bd0163c6fe Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Tue, 16 Feb 2016 17:50:58 +0100 Subject: [PATCH] Add some support functions for clustering using PySAL --- pg/.gitignore | 1 - pg/dist/.gitignore | 0 pg/dist/crankshaft--0.0.1.sql | 5 + python/crankshaft/crankshaft/__init__.py | 1 + .../crankshaft/clustering/__init__.py | 1 + .../crankshaft/crankshaft/clustering/moran.py | 167 ++++++++++++++++++ python/crankshaft/setup.py | 8 +- python/crankshaft/test/helper.py | 7 + .../crankshaft/test/test_clustering_moran.py | 118 +++++++++++++ python/crankshaft/test/test_poc.py | 16 +- 10 files changed, 313 insertions(+), 11 deletions(-) create mode 100644 pg/dist/.gitignore create mode 100644 pg/dist/crankshaft--0.0.1.sql create mode 100644 python/crankshaft/crankshaft/clustering/__init__.py create mode 100644 python/crankshaft/crankshaft/clustering/moran.py create mode 100644 python/crankshaft/test/helper.py create mode 100644 python/crankshaft/test/test_clustering_moran.py diff --git a/pg/.gitignore b/pg/.gitignore index 356c99c..8392e95 100644 --- a/pg/.gitignore +++ b/pg/.gitignore @@ -1,4 +1,3 @@ -dist/ test/regression.diffs test/regression.out test/results diff --git a/pg/dist/.gitignore b/pg/dist/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/pg/dist/crankshaft--0.0.1.sql b/pg/dist/crankshaft--0.0.1.sql new file mode 100644 index 0000000..e342f68 --- /dev/null +++ b/pg/dist/crankshaft--0.0.1.sql @@ -0,0 +1,5 @@ +CREATE OR REPLACE FUNCTION cdb_poc_xyz() +RETURNS Text AS $$ + from crankshaft.poc import xyz + return xyz() +$$ LANGUAGE plpythonu; diff --git a/python/crankshaft/crankshaft/__init__.py b/python/crankshaft/crankshaft/__init__.py index dddc2b3..9ae4eae 100644 --- a/python/crankshaft/crankshaft/__init__.py +++ b/python/crankshaft/crankshaft/__init__.py @@ -1 +1,2 @@ import poc +import clustering diff --git a/python/crankshaft/crankshaft/clustering/__init__.py b/python/crankshaft/crankshaft/clustering/__init__.py new file mode 100644 index 0000000..0df080f --- /dev/null +++ b/python/crankshaft/crankshaft/clustering/__init__.py @@ -0,0 +1 @@ +from moran import * diff --git a/python/crankshaft/crankshaft/clustering/moran.py b/python/crankshaft/crankshaft/clustering/moran.py new file mode 100644 index 0000000..54045bd --- /dev/null +++ b/python/crankshaft/crankshaft/clustering/moran.py @@ -0,0 +1,167 @@ +""" +Moran's I geostatistics (global clustering & outliers presence) +""" + +# TODO: Fill in local neighbors which have null/NoneType values with the +# average of the their neighborhood + +import numpy as np +import pysal as ps + +def map_quads(coord): + """ + Map a quadrant number to Moran's I designation + HH=1, LH=2, LL=3, HL=4 + Input: + :param coord (int): quadrant of a specific measurement + """ + if coord == 1: + return 'HH' + elif coord == 2: + return 'LH' + elif coord == 3: + return 'LL' + elif coord == 4: + return 'HL' + else: + return None + +def query_attr_select(params): + """ + Create portion of SELECT statement for attributes inolved in query. + :param params: dict of information used in query (column names, + table name, etc.) + """ + + attrs = [k for k in params + if k not in ('id_col', 'geom_col', 'table', 'num_ngbrs')] + + template = "i.\"{%(col)s}\"::numeric As attr%(alias_num)s, " + + attr_string = "" + + for idx, val in enumerate(sorted(attrs)): + attr_string += template % {"col": val, "alias_num": idx + 1} + + return attr_string + +def query_attr_where(params): + """ + Create portion of WHERE clauses for weeding out NULL-valued geometries + """ + attrs = sorted([k for k in params + if k not in ('id_col', 'geom_col', 'table', 'num_ngbrs')]) + + attr_string = [] + + for attr in attrs: + attr_string.append("idx_replace.\"{%s}\" IS NOT NULL" % attr) + + if len(attrs) == 2: + attr_string.append("idx_replace.\"{%s}\" <> 0" % attrs[1]) + + out = " AND ".join(attr_string) + + return out + +def knn(params): + """SQL query for k-nearest neighbors. + :param vars: dict of values to fill template + """ + + attr_select = query_attr_select(params) + attr_where = query_attr_where(params) + + replacements = {"attr_select": attr_select, + "attr_where_i": attr_where.replace("idx_replace", "i"), + "attr_where_j": attr_where.replace("idx_replace", "j")} + + query = "SELECT " \ + "i.\"{id_col}\" As id, " \ + "%(attr_select)s" \ + "(SELECT ARRAY(SELECT j.\"{id_col}\" " \ + "FROM \"{table}\" As j " \ + "WHERE %(attr_where_j)s " \ + "ORDER BY j.\"{geom_col}\" <-> i.\"{geom_col}\" ASC " \ + "LIMIT {num_ngbrs} OFFSET 1 ) " \ + ") As neighbors " \ + "FROM \"{table}\" As i " \ + "WHERE " \ + "%(attr_where_i)s " \ + "ORDER BY i.\"{id_col}\" ASC;" % replacements + + return query.format(**params) + +## SQL query for finding queens neighbors (all contiguous polygons) +def queen(params): + """SQL query for queen neighbors. + :param params: dict of information to fill query + """ + attr_select = query_attr_select(params) + attr_where = query_attr_where(params) + + replacements = {"attr_select": attr_select, + "attr_where_i": attr_where.replace("idx_replace", "i"), + "attr_where_j": attr_where.replace("idx_replace", "j")} + + query = "SELECT " \ + "i.\"{id_col}\" As id, " \ + "%(attr_select)s" \ + "(SELECT ARRAY(SELECT j.\"{id_col}\" " \ + "FROM \"{table}\" As j " \ + "WHERE ST_Touches(i.\"{geom_col}\", j.\"{geom_col}\") AND " \ + "%(attr_where_j)s)" \ + ") As neighbors " \ + "FROM \"{table}\" As i " \ + "WHERE " \ + "%(attr_where_i)s " \ + "ORDER BY i.\"{id_col}\" ASC;" % replacements + + return query.format(**params) + +## to add more weight methods open a ticket or pull request + +def get_query(w_type, query_vals): + """Return requested query. + :param w_type: type of neighbors to calculate (knn or queen) + :param query_vals: values used to construct the query + """ + + if w_type == 'knn': + return knn(query_vals) + else: + return queen(query_vals) + +def get_attributes(query_res, attr_num): + """ + :param query_res: query results with attributes and neighbors + :param attr_num: attribute number (1, 2, ...) + """ + return np.array([x['attr' + str(attr_num)] for x in query_res], dtype=np.float) + +## Build weight object +def get_weight(query_res, w_type='queen', num_ngbrs=5): + """ + Construct PySAL weight from return value of query + :param query_res: query results with attributes and neighbors + """ + if w_type == 'knn': + row_normed_weights = [1.0 / float(num_ngbrs)] * num_ngbrs + weights = {x['id']: row_normed_weights for x in query_res} + elif w_type == 'queen': + weights = {x['id']: [1.0 / len(x['neighbors'])] * len(x['neighbors']) + if len(x['neighbors']) > 0 + else [] for x in query_res} + + neighbors = {x['id']: x['neighbors'] for x in query_res} + + return ps.W(neighbors, weights) + +def quad_position(quads): + """ + Produce Moran's I classification based of n + """ + + lisa_sig = np.array([map_quads(q) for q in quads]) + + return lisa_sig diff --git a/python/crankshaft/setup.py b/python/crankshaft/setup.py index f95edfd..8dcc27a 100644 --- a/python/crankshaft/setup.py +++ b/python/crankshaft/setup.py @@ -7,7 +7,7 @@ https://github.com/CartoDB/crankshaft from setuptools import setup, find_packages -REQUIRES = [] # ['pysal','numpy'] +REQUIRES = ['pysal','numpy'] setup( name='crankshaft', @@ -40,9 +40,9 @@ setup( 'test': ['unittest', 'nose', 'mock'], }, - # install_requires=REQUIRES, + install_requires=REQUIRES, - # requires=REQUIRES, + requires=REQUIRES, - test_suite='test' + test_suite='test' ) diff --git a/python/crankshaft/test/helper.py b/python/crankshaft/test/helper.py new file mode 100644 index 0000000..b87a12a --- /dev/null +++ b/python/crankshaft/test/helper.py @@ -0,0 +1,7 @@ +import unittest + +from mock_plpy import MockPlPy +plpy = MockPlPy() + +import sys +sys.modules['plpy'] = plpy diff --git a/python/crankshaft/test/test_clustering_moran.py b/python/crankshaft/test/test_clustering_moran.py new file mode 100644 index 0000000..f758c37 --- /dev/null +++ b/python/crankshaft/test/test_clustering_moran.py @@ -0,0 +1,118 @@ +import unittest +import numpy as np + +import unittest + + +# from mock_plpy import MockPlPy +# plpy = MockPlPy() +# +# import sys +# sys.modules['plpy'] = plpy +from helper import plpy + +# import crankshaft.clustering as cc +import crankshaft.clustering as cc + + +class MoranTest(unittest.TestCase): + """Testing class for Moran's I functions.""" + + def setUp(self): + self.params = {"id_col": "cartodb_id", + "attr1": "andy", + "attr2": "jay_z", + "table": "a_list", + "geom_col": "the_geom", + "num_ngbrs": 321} + + def test_map_quads(self): + """Test map_quads.""" + self.assertEqual(cc.map_quads(1), 'HH') + self.assertEqual(cc.map_quads(2), 'LH') + self.assertEqual(cc.map_quads(3), 'LL') + self.assertEqual(cc.map_quads(4), 'HL') + self.assertEqual(cc.map_quads(33), None) + self.assertEqual(cc.map_quads('andy'), None) + + def test_query_attr_select(self): + """Test query_attr_select.""" + + ans = "i.\"{attr1}\"::numeric As attr1, " \ + "i.\"{attr2}\"::numeric As attr2, " + + self.assertEqual(cc.query_attr_select(self.params), ans) + + def test_query_attr_where(self): + """Test query_attr_where.""" + + ans = "idx_replace.\"{attr1}\" IS NOT NULL AND "\ + "idx_replace.\"{attr2}\" IS NOT NULL AND "\ + "idx_replace.\"{attr2}\" <> 0" + + self.assertEqual(cc.query_attr_where(self.params), ans) + + def test_knn(self): + """Test knn function.""" + + ans = "SELECT i.\"cartodb_id\" As id, i.\"andy\"::numeric As attr1, " \ + "i.\"jay_z\"::numeric As attr2, (SELECT ARRAY(SELECT j.\"cartodb_id\" " \ + "FROM \"a_list\" As j WHERE j.\"andy\" IS NOT NULL AND " \ + "j.\"jay_z\" IS NOT NULL AND j.\"jay_z\" <> 0 ORDER BY " \ + "j.\"the_geom\" <-> i.\"the_geom\" ASC LIMIT 321 OFFSET 1 ) ) " \ + "As neighbors FROM \"a_list\" As i WHERE i.\"andy\" IS NOT " \ + "NULL AND i.\"jay_z\" IS NOT NULL AND i.\"jay_z\" <> 0 ORDER " \ + "BY i.\"cartodb_id\" ASC;" + + self.assertEqual(cc.knn(self.params), ans) + + def test_queen(self): + """Test queen neighbors function.""" + + ans = "SELECT i.\"cartodb_id\" As id, i.\"andy\"::numeric As attr1, " \ + "i.\"jay_z\"::numeric As attr2, (SELECT ARRAY(SELECT " \ + "j.\"cartodb_id\" FROM \"a_list\" As j WHERE ST_Touches(" \ + "i.\"the_geom\", j.\"the_geom\") AND j.\"andy\" IS NOT NULL " \ + "AND j.\"jay_z\" IS NOT NULL AND j.\"jay_z\" <> 0)) As " \ + "neighbors FROM \"a_list\" As i WHERE i.\"andy\" IS NOT NULL " \ + "AND i.\"jay_z\" IS NOT NULL AND i.\"jay_z\" <> 0 ORDER BY " \ + "i.\"cartodb_id\" ASC;" + + self.assertEqual(cc.queen(self.params), ans) + + def test_get_query(self): + """Test get_query.""" + + ans = "SELECT i.\"cartodb_id\" As id, i.\"andy\"::numeric As attr1, " \ + "i.\"jay_z\"::numeric As attr2, (SELECT ARRAY(SELECT " \ + "j.\"cartodb_id\" FROM \"a_list\" As j WHERE j.\"andy\" IS " \ + "NOT NULL AND j.\"jay_z\" IS NOT NULL AND j.\"jay_z\" <> 0 " \ + "ORDER BY j.\"the_geom\" <-> i.\"the_geom\" ASC LIMIT 321 " \ + "OFFSET 1 ) ) As neighbors FROM \"a_list\" As i WHERE " \ + "i.\"andy\" IS NOT NULL AND i.\"jay_z\" IS NOT NULL AND " \ + "i.\"jay_z\" <> 0 ORDER BY i.\"cartodb_id\" ASC;" + + self.assertEqual(cc.get_query('knn', self.params), ans) + + def test_get_attributes(self): + """Test get_attributes.""" + + ## need to add tests + + self.assertEqual(True, True) + + def test_get_weight(self): + """Test get_weight.""" + + self.assertEqual(True, True) + + + def test_quad_position(self): + """Test lisa_sig_vals.""" + + quads = np.array([1, 2, 3, 4], np.int) + + ans = np.array(['HH', 'LH', 'LL', 'HL']) + test_ans = cc.quad_position(quads) + + self.assertTrue((test_ans == ans).all()) diff --git a/python/crankshaft/test/test_poc.py b/python/crankshaft/test/test_poc.py index 7307285..ca90735 100644 --- a/python/crankshaft/test/test_poc.py +++ b/python/crankshaft/test/test_poc.py @@ -3,17 +3,21 @@ import unittest -from mock_plpy import MockPlPy -plpy = MockPlPy() - -import sys -sys.modules['plpy'] = plpy +# from mock_plpy import MockPlPy +# plpy = MockPlPy() +# +# import sys +# sys.modules['plpy'] = plpy +from helper import plpy import crankshaft class TestPoc(unittest.TestCase): - def test_should_have_xyz(self): + + def setUp(self): plpy._reset() + + def test_should_have_xyz(self): plpy._define_result('select\s+\*\s+from\s+table', [{'x': 111}]) assert crankshaft.poc.xyz() == 111 assert plpy.notices[0] == 'XYZ...'