Add some support functions for clustering using PySAL
This commit is contained in:
parent
545abc87de
commit
63f490f6e6
1
pg/.gitignore
vendored
1
pg/.gitignore
vendored
@ -1,4 +1,3 @@
|
|||||||
dist/
|
|
||||||
test/regression.diffs
|
test/regression.diffs
|
||||||
test/regression.out
|
test/regression.out
|
||||||
test/results
|
test/results
|
||||||
|
0
pg/dist/.gitignore
vendored
Normal file
0
pg/dist/.gitignore
vendored
Normal file
5
pg/dist/crankshaft--0.0.1.sql
vendored
Normal file
5
pg/dist/crankshaft--0.0.1.sql
vendored
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
-- cdb_poc_xyz(): PL/Python (plpythonu) wrapper that imports xyz from
-- crankshaft.poc, calls it with no arguments and returns the result;
-- the function is declared to return Text.
CREATE OR REPLACE FUNCTION cdb_poc_xyz()
|
||||||
|
RETURNS Text AS $$
|
||||||
|
from crankshaft.poc import xyz
|
||||||
|
return xyz()
|
||||||
|
$$ LANGUAGE plpythonu;
|
@ -1 +1,2 @@
|
|||||||
import poc
|
import poc
|
||||||
|
import clustering
|
||||||
|
1
python/crankshaft/crankshaft/clustering/__init__.py
Normal file
1
python/crankshaft/crankshaft/clustering/__init__.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
from moran import *
|
167
python/crankshaft/crankshaft/clustering/moran.py
Normal file
167
python/crankshaft/crankshaft/clustering/moran.py
Normal file
@ -0,0 +1,167 @@
|
|||||||
|
"""
|
||||||
|
Moran's I geostatistics (global clustering & outliers presence)
|
||||||
|
"""
|
||||||
|
|
||||||
|
# TODO: Fill in local neighbors which have null/NoneType values with the
|
||||||
|
# average of their neighborhood
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pysal as ps
|
||||||
|
|
||||||
|
def map_quads(coord):
    """
    Translate a Moran's I quadrant number into its two-letter label.

    The mapping is HH=1, LH=2, LL=3, HL=4.

    :param coord (int): quadrant of a specific measurement
    :return: 'HH', 'LH', 'LL' or 'HL'; None for any other value
    """
    quadrant_labels = ((1, 'HH'), (2, 'LH'), (3, 'LL'), (4, 'HL'))

    # Compare with == instead of hashing into a dict so that values which
    # are not hashable, or not numbers at all, simply end up as None.
    for quadrant, label in quadrant_labels:
        if coord == quadrant:
            return label

    return None
|
||||||
|
|
||||||
|
def query_attr_select(params):
    """
    Build the attribute portion of a SELECT statement.

    Every key of ``params`` other than 'id_col', 'geom_col', 'table' and
    'num_ngbrs' is treated as an attribute. Attribute keys are taken in
    sorted order and the n-th one is emitted as

        i."{<key>}"::numeric As attr<n>,

    followed by a space. The ``{<key>}`` placeholder is deliberately left
    unfilled here; the caller resolves it later with ``str.format``.

    :param params: dict of information used in query (column names,
                   table name, etc.)
    :return: concatenated select fragments ("" when there are no
             attribute keys)
    """
    reserved_keys = ('id_col', 'geom_col', 'table', 'num_ngbrs')
    attr_keys = sorted(key for key in params if key not in reserved_keys)

    template = "i.\"{%(col)s}\"::numeric As attr%(alias_num)s, "

    # Aliases are 1-based: attr1, attr2, ...
    return "".join(template % {"col": key, "alias_num": position}
                   for position, key in enumerate(attr_keys, 1))
|
||||||
|
|
||||||
|
def query_attr_where(params):
    """
    Build the WHERE conditions that drop rows with NULL attribute values.

    Every key of ``params`` other than 'id_col', 'geom_col', 'table' and
    'num_ngbrs' is treated as an attribute and gets an ``IS NOT NULL``
    condition (in sorted key order). When there are exactly two
    attributes, the second one is additionally required to be non-zero
    (presumably because it is used as a denominator -- confirm with the
    callers).

    The table alias is written as the literal ``idx_replace`` and the
    column as a ``{<key>}`` placeholder; callers substitute both.

    :param params: dict of information used in query
    :return: conditions joined with " AND " ("" when there are no
             attribute keys)
    """
    reserved_keys = ('id_col', 'geom_col', 'table', 'num_ngbrs')
    attr_keys = sorted(key for key in params if key not in reserved_keys)

    conditions = ["idx_replace.\"{%s}\" IS NOT NULL" % key
                  for key in attr_keys]

    if len(attr_keys) == 2:
        conditions.append("idx_replace.\"{%s}\" <> 0" % attr_keys[1])

    return " AND ".join(conditions)
|
||||||
|
|
||||||
|
def knn(params):
    """SQL query for k-nearest neighbors.

    For every row ``i`` of the table the query returns its id, its
    attribute columns (attr1, attr2, ...) and an array of ids of the rows
    ``j`` ordered by ``<->`` distance between geometries. ``OFFSET 1``
    skips the first row of that ordering (presumably row ``i`` itself)
    and ``LIMIT`` keeps ``num_ngbrs`` of the rest. Rows failing the
    attribute conditions are excluded both as ``i`` and as ``j``.

    :param params: dict of values to fill template; must provide
                   'id_col', 'table', 'geom_col' and 'num_ngbrs' in
                   addition to the attribute keys
    :return: the SQL statement as a string
    """
    where_template = query_attr_where(params)

    fragments = {
        "attr_select": query_attr_select(params),
        "attr_where_i": where_template.replace("idx_replace", "i"),
        "attr_where_j": where_template.replace("idx_replace", "j"),
    }

    template = (
        "SELECT "
        "i.\"{id_col}\" As id, "
        "%(attr_select)s"
        "(SELECT ARRAY(SELECT j.\"{id_col}\" "
        "FROM \"{table}\" As j "
        "WHERE %(attr_where_j)s "
        "ORDER BY j.\"{geom_col}\" <-> i.\"{geom_col}\" ASC "
        "LIMIT {num_ngbrs} OFFSET 1 ) "
        ") As neighbors "
        "FROM \"{table}\" As i "
        "WHERE "
        "%(attr_where_i)s "
        "ORDER BY i.\"{id_col}\" ASC;"
    )

    # Two passes: %-formatting splices in the generated fragments, then
    # str.format resolves the {placeholders} (including the ones inside
    # those fragments) from params.
    return (template % fragments).format(**params)
|
||||||
|
|
||||||
|
## SQL query for finding queens neighbors (all contiguous polygons)
|
||||||
|
def queen(params):
    """SQL query for queen neighbors.

    For every row ``i`` of the table the query returns its id, its
    attribute columns (attr1, attr2, ...) and an array of ids of the rows
    ``j`` whose geometry satisfies ``ST_Touches`` with the geometry of
    ``i``. Rows failing the attribute conditions are excluded both as
    ``i`` and as ``j``.

    :param params: dict of information to fill query; must provide
                   'id_col', 'table' and 'geom_col' in addition to the
                   attribute keys
    :return: the SQL statement as a string
    """
    where_template = query_attr_where(params)

    fragments = {
        "attr_select": query_attr_select(params),
        "attr_where_i": where_template.replace("idx_replace", "i"),
        "attr_where_j": where_template.replace("idx_replace", "j"),
    }

    template = (
        "SELECT "
        "i.\"{id_col}\" As id, "
        "%(attr_select)s"
        "(SELECT ARRAY(SELECT j.\"{id_col}\" "
        "FROM \"{table}\" As j "
        "WHERE ST_Touches(i.\"{geom_col}\", j.\"{geom_col}\") AND "
        "%(attr_where_j)s)"
        ") As neighbors "
        "FROM \"{table}\" As i "
        "WHERE "
        "%(attr_where_i)s "
        "ORDER BY i.\"{id_col}\" ASC;"
    )

    # Two passes: %-formatting splices in the generated fragments, then
    # str.format resolves the {placeholders} (including the ones inside
    # those fragments) from params.
    return (template % fragments).format(**params)
|
||||||
|
|
||||||
|
## to add more weight methods open a ticket or pull request
|
||||||
|
|
||||||
|
def get_query(w_type, query_vals):
    """Return requested query.

    :param w_type: type of neighbors to calculate; 'knn' selects the
                   k-nearest-neighbors query, any other value selects
                   the queen query
    :param query_vals: values used to construct the query
    :return: the SQL statement built by ``knn`` or ``queen``
    """
    build_query = knn if w_type == 'knn' else queen

    return build_query(query_vals)
|
||||||
|
|
||||||
|
def get_attributes(query_res, attr_num):
    """
    Collect one attribute column of the query result as a float array.

    :param query_res: query results with attributes and neighbors; an
                      iterable of rows indexable by column name
    :param attr_num: attribute number (1, 2, ...); selects the column
                     named 'attr<attr_num>'
    :return: numpy array of floats, one entry per row of query_res
    """
    column = 'attr' + str(attr_num)

    # Use the builtin float: the ``np.float`` alias for it was deprecated
    # in NumPy 1.20 and removed in 1.24, while ``dtype=float`` yields the
    # same dtype on every NumPy version.
    return np.array([row[column] for row in query_res], dtype=float)
|
||||||
|
|
||||||
|
## Build weight object
|
||||||
|
def get_weight(query_res, w_type='queen', num_ngbrs=5):
    """
    Construct PySAL weight from return value of query

    Each row's neighbors receive equal, row-standardized weights
    (1 / number of neighbors); a row without neighbors gets an empty
    weight list.

    The weights are sized from the neighbors each row actually has, so
    the weight list always matches the neighbor list even when a knn
    query returned fewer than ``num_ngbrs`` rows. When a knn row does
    have ``num_ngbrs`` neighbors the result equals the previous
    ``1 / num_ngbrs`` weights.

    :param query_res: query results with attributes and neighbors; each
                      row must provide 'id' and 'neighbors'
    :param w_type: type of neighbors ('knn' or 'queen'). Retained for
                   backward compatibility: the weights no longer depend
                   on it, so any value is accepted (previously an
                   unrecognized value failed on an unbound local).
    :param num_ngbrs: requested number of neighbors. Retained for
                      backward compatibility; no longer used.
    :return: ``ps.W`` built from the neighbors and weights dicts
    """
    neighbors = {}
    weights = {}

    for row in query_res:
        row_neighbors = row['neighbors']
        count = len(row_neighbors)

        neighbors[row['id']] = row_neighbors
        # A fresh list per row: rows must not share one mutable list.
        weights[row['id']] = [1.0 / count] * count if count > 0 else []

    return ps.W(neighbors, weights)
|
||||||
|
|
||||||
|
def quad_position(quads):
    """
    Convert quadrant numbers into their Moran's I labels.

    :param quads: iterable of quadrant numbers (see ``map_quads``)
    :return: numpy array holding the ``map_quads`` result for each entry
             of quads, in the same order
    """
    labels = list(map(map_quads, quads))

    return np.array(labels)
|
@ -7,7 +7,7 @@ https://github.com/CartoDB/crankshaft
|
|||||||
|
|
||||||
from setuptools import setup, find_packages
|
from setuptools import setup, find_packages
|
||||||
|
|
||||||
REQUIRES = [] # ['pysal','numpy']
|
REQUIRES = ['pysal','numpy']
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name='crankshaft',
|
name='crankshaft',
|
||||||
@ -40,9 +40,9 @@ setup(
|
|||||||
'test': ['unittest', 'nose', 'mock'],
|
'test': ['unittest', 'nose', 'mock'],
|
||||||
},
|
},
|
||||||
|
|
||||||
# install_requires=REQUIRES,
|
install_requires=REQUIRES,
|
||||||
|
|
||||||
# requires=REQUIRES,
|
requires=REQUIRES,
|
||||||
|
|
||||||
test_suite='test'
|
test_suite='test'
|
||||||
)
|
)
|
||||||
|
7
python/crankshaft/test/helper.py
Normal file
7
python/crankshaft/test/helper.py
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
import sys
import unittest

from mock_plpy import MockPlPy

# Single mock instance shared by the test modules that import this helper.
plpy = MockPlPy()

# Register the mock under the name 'plpy' so that a later ``import plpy``
# resolves to it.
sys.modules['plpy'] = plpy
|
118
python/crankshaft/test/test_clustering_moran.py
Normal file
118
python/crankshaft/test/test_clustering_moran.py
Normal file
@ -0,0 +1,118 @@
|
|||||||
|
import unittest
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
|
||||||
|
# from mock_plpy import MockPlPy
|
||||||
|
# plpy = MockPlPy()
|
||||||
|
#
|
||||||
|
# import sys
|
||||||
|
# sys.modules['plpy'] = plpy
|
||||||
|
from helper import plpy
|
||||||
|
|
||||||
|
# import crankshaft.clustering as cc
|
||||||
|
import crankshaft.clustering as cc
|
||||||
|
|
||||||
|
|
||||||
|
class MoranTest(unittest.TestCase):
    """Testing class for Moran's I functions."""

    def setUp(self):
        # Two attribute columns (attr1, attr2) plus the reserved keys the
        # query builders expect.
        self.params = {"id_col": "cartodb_id",
                       "attr1": "andy",
                       "attr2": "jay_z",
                       "table": "a_list",
                       "geom_col": "the_geom",
                       "num_ngbrs": 321}

    def test_map_quads(self):
        """Test map_quads."""
        self.assertEqual(cc.map_quads(1), 'HH')
        self.assertEqual(cc.map_quads(2), 'LH')
        self.assertEqual(cc.map_quads(3), 'LL')
        self.assertEqual(cc.map_quads(4), 'HL')
        self.assertEqual(cc.map_quads(33), None)
        self.assertEqual(cc.map_quads('andy'), None)

    def test_query_attr_select(self):
        """Test query_attr_select."""

        ans = "i.\"{attr1}\"::numeric As attr1, " \
              "i.\"{attr2}\"::numeric As attr2, "

        self.assertEqual(cc.query_attr_select(self.params), ans)

    def test_query_attr_where(self):
        """Test query_attr_where."""

        ans = "idx_replace.\"{attr1}\" IS NOT NULL AND "\
              "idx_replace.\"{attr2}\" IS NOT NULL AND "\
              "idx_replace.\"{attr2}\" <> 0"

        self.assertEqual(cc.query_attr_where(self.params), ans)

    def test_knn(self):
        """Test knn function."""

        ans = "SELECT i.\"cartodb_id\" As id, i.\"andy\"::numeric As attr1, " \
              "i.\"jay_z\"::numeric As attr2, (SELECT ARRAY(SELECT j.\"cartodb_id\" " \
              "FROM \"a_list\" As j WHERE j.\"andy\" IS NOT NULL AND " \
              "j.\"jay_z\" IS NOT NULL AND j.\"jay_z\" <> 0 ORDER BY " \
              "j.\"the_geom\" <-> i.\"the_geom\" ASC LIMIT 321 OFFSET 1 ) ) " \
              "As neighbors FROM \"a_list\" As i WHERE i.\"andy\" IS NOT " \
              "NULL AND i.\"jay_z\" IS NOT NULL AND i.\"jay_z\" <> 0 ORDER " \
              "BY i.\"cartodb_id\" ASC;"

        self.assertEqual(cc.knn(self.params), ans)

    def test_queen(self):
        """Test queen neighbors function."""

        ans = "SELECT i.\"cartodb_id\" As id, i.\"andy\"::numeric As attr1, " \
              "i.\"jay_z\"::numeric As attr2, (SELECT ARRAY(SELECT " \
              "j.\"cartodb_id\" FROM \"a_list\" As j WHERE ST_Touches(" \
              "i.\"the_geom\", j.\"the_geom\") AND j.\"andy\" IS NOT NULL " \
              "AND j.\"jay_z\" IS NOT NULL AND j.\"jay_z\" <> 0)) As " \
              "neighbors FROM \"a_list\" As i WHERE i.\"andy\" IS NOT NULL " \
              "AND i.\"jay_z\" IS NOT NULL AND i.\"jay_z\" <> 0 ORDER BY " \
              "i.\"cartodb_id\" ASC;"

        self.assertEqual(cc.queen(self.params), ans)

    def test_get_query(self):
        """Test get_query."""

        ans = "SELECT i.\"cartodb_id\" As id, i.\"andy\"::numeric As attr1, " \
              "i.\"jay_z\"::numeric As attr2, (SELECT ARRAY(SELECT " \
              "j.\"cartodb_id\" FROM \"a_list\" As j WHERE j.\"andy\" IS " \
              "NOT NULL AND j.\"jay_z\" IS NOT NULL AND j.\"jay_z\" <> 0 " \
              "ORDER BY j.\"the_geom\" <-> i.\"the_geom\" ASC LIMIT 321 " \
              "OFFSET 1 ) ) As neighbors FROM \"a_list\" As i WHERE " \
              "i.\"andy\" IS NOT NULL AND i.\"jay_z\" IS NOT NULL AND " \
              "i.\"jay_z\" <> 0 ORDER BY i.\"cartodb_id\" ASC;"

        self.assertEqual(cc.get_query('knn', self.params), ans)

        # Any type other than 'knn' is dispatched to the queen query.
        self.assertEqual(cc.get_query('queen', self.params),
                         cc.queen(self.params))

    def test_get_attributes(self):
        """Test get_attributes."""

        query_res = [{'id': 1, 'attr1': 100.0, 'attr2': 2, 'neighbors': [2]},
                     {'id': 2, 'attr1': 2.5, 'attr2': 4, 'neighbors': [1]}]

        attr1 = cc.get_attributes(query_res, 1)
        attr2 = cc.get_attributes(query_res, 2)

        self.assertEqual(attr1.dtype, np.dtype(float))
        self.assertTrue((attr1 == np.array([100.0, 2.5])).all())
        self.assertTrue((attr2 == np.array([2.0, 4.0])).all())

    def test_get_weight(self):
        """Test get_weight."""

        # TODO: placeholder only -- get_weight is not exercised yet.
        self.assertEqual(True, True)

    def test_quad_position(self):
        """Test quad_position."""

        # Builtin int instead of the np.int alias, which NumPy removed
        # in 1.24.
        quads = np.array([1, 2, 3, 4], int)

        ans = np.array(['HH', 'LH', 'LL', 'HL'])
        test_ans = cc.quad_position(quads)

        self.assertTrue((test_ans == ans).all())
|
@ -3,17 +3,21 @@
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from mock_plpy import MockPlPy
|
# from mock_plpy import MockPlPy
|
||||||
plpy = MockPlPy()
|
# plpy = MockPlPy()
|
||||||
|
#
|
||||||
import sys
|
# import sys
|
||||||
sys.modules['plpy'] = plpy
|
# sys.modules['plpy'] = plpy
|
||||||
|
|
||||||
|
from helper import plpy
|
||||||
import crankshaft
|
import crankshaft
|
||||||
|
|
||||||
class TestPoc(unittest.TestCase):
|
class TestPoc(unittest.TestCase):
|
||||||
def test_should_have_xyz(self):
|
|
||||||
|
def setUp(self):
|
||||||
plpy._reset()
|
plpy._reset()
|
||||||
|
|
||||||
|
def test_should_have_xyz(self):
|
||||||
plpy._define_result('select\s+\*\s+from\s+table', [{'x': 111}])
|
plpy._define_result('select\s+\*\s+from\s+table', [{'x': 111}])
|
||||||
assert crankshaft.poc.xyz() == 111
|
assert crankshaft.poc.xyz() == 111
|
||||||
assert plpy.notices[0] == 'XYZ...'
|
assert plpy.notices[0] == 'XYZ...'
|
||||||
|
Loading…
Reference in New Issue
Block a user