Merge pull request #133 from CartoDB/moran-global-fix

Moran global fix
This commit is contained in:
Carla 2016-09-22 10:49:52 +02:00 committed by GitHub
commit d0e967d22c
6 changed files with 88 additions and 36 deletions

View File

@ -10,7 +10,7 @@ CREATE OR REPLACE FUNCTION
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, significance NUMERIC)
AS $$
from crankshaft.clustering import moran_local
from crankshaft.clustering import moran
# TODO: use named parameters or a dictionary
return moran(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpythonu;

View File

@ -5,6 +5,12 @@ SET client_min_messages TO WARNING;
\set ECHO none
_cdb_random_seeds
(1 row)
moran|significance
0.3399|-0.0196
(1 row)
_cdb_random_seeds
(1 row)
code|quads
01|HH

View File

@ -6,6 +6,14 @@
-- Areas of Interest functions perform some nondeterministic computations
-- (to estimate the significance); we will set the seeds for the RNGs
-- that affect those results to have repeateble results
-- Moran's I Global
SELECT cdb_crankshaft._cdb_random_seeds(1234);
SELECT round(moran, 4) As moran, round(significance, 4) As significance
FROM cdb_crankshaft.CDB_AreasOfInterestGlobal('SELECT * FROM ppoints', 'value') m(moran, significance);
-- Moran's I Local
SELECT cdb_crankshaft._cdb_random_seeds(1234);
SELECT ppoints.code, m.quads

View File

@ -14,6 +14,7 @@ import crankshaft.pysal_utils as pu
# High level interface ---------------------------------------
def moran(subquery, attr_name,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
@ -39,18 +40,19 @@ def moran(subquery, attr_name,
plpy.error('Analysis failed: %s' % e)
return pu.empty_zipped_array(2)
## collect attributes
# collect attributes
attr_vals = pu.get_attributes(result)
## calculate weights
# calculate weights
weight = pu.get_weight(result, w_type, num_ngbrs)
## calculate moran global
# calculate moran global
moran_global = ps.esda.moran.Moran(attr_vals, weight,
permutations=permutations)
return zip([moran_global.I], [moran_global.EI])
def moran_local(subquery, attr,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
@ -90,6 +92,7 @@ def moran_local(subquery, attr,
return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)
def moran_rate(subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
@ -114,18 +117,19 @@ def moran_rate(subquery, numerator, denominator,
plpy.error('Analysis failed: %s' % e)
return pu.empty_zipped_array(2)
## collect attributes
# collect attributes
numer = pu.get_attributes(result, 1)
denom = pu.get_attributes(result, 2)
weight = pu.get_weight(result, w_type, num_ngbrs)
## calculate moran global rate
# calculate moran global rate
lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight,
permutations=permutations)
return zip([lisa_rate.I], [lisa_rate.EI])
def moran_local_rate(subquery, numerator, denominator,
w_type, num_ngbrs, permutations, geom_col, id_col):
"""
@ -153,7 +157,7 @@ def moran_local_rate(subquery, numerator, denominator,
plpy.error('Analysis failed: %s' % e)
return pu.empty_zipped_array(5)
## collect attributes
# collect attributes
numer = pu.get_attributes(result, 1)
denom = pu.get_attributes(result, 2)
@ -168,6 +172,7 @@ def moran_local_rate(subquery, numerator, denominator,
return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)
def moran_local_bv(subquery, attr1, attr2,
permutations, geom_col, id_col, w_type, num_ngbrs):
"""
@ -189,11 +194,11 @@ def moran_local_bv(subquery, attr1, attr2,
if len(result) == 0:
return pu.empty_zipped_array(4)
except plpy.SPIError:
plpy.error("Error: areas of interest query failed, " \
plpy.error("Error: areas of interest query failed, "
"check input parameters")
return pu.empty_zipped_array(4)
## collect attributes
# collect attributes
attr1_vals = pu.get_attributes(result, 1)
attr2_vals = pu.get_attributes(result, 2)
@ -211,6 +216,7 @@ def moran_local_bv(subquery, attr1, attr2,
# Low level functions ----------------------------------------
def map_quads(coord):
"""
Map a quadrant number to Moran's I designation
@ -231,6 +237,7 @@ def map_quads(coord):
else:
return None
def quad_position(quads):
"""
Produce Moran's I classification based of n

View File

@ -6,6 +6,7 @@
import numpy as np
import pysal as ps
def construct_neighbor_query(w_type, query_vals):
"""Return query (a string) used for finding neighbors
@param w_type text: type of neighbors to calculate ('knn' or 'queen')
@ -17,7 +18,8 @@ def construct_neighbor_query(w_type, query_vals):
else:
return queen(query_vals)
## Build weight object
# Build weight object
def get_weight(query_res, w_type='knn', num_ngbrs=5):
"""
Construct PySAL weight from return value of query
@ -39,6 +41,7 @@ def get_weight(query_res, w_type='knn', num_ngbrs=5):
return built_weight
def query_attr_select(params):
"""
Create portion of SELECT statement for attributes inolved in query.
@ -50,21 +53,24 @@ def query_attr_select(params):
template = "i.\"%(col)s\"::numeric As attr%(alias_num)s, "
if 'time_cols' in params:
## if markov analysis
# if markov analysis
attrs = params['time_cols']
for idx, val in enumerate(attrs):
attr_string += template % {"col": val, "alias_num": idx + 1}
else:
## if moran's analysis
# if moran's analysis
attrs = [k for k in params
if k not in ('id_col', 'geom_col', 'subquery', 'num_ngbrs', 'subquery')]
if k not in ('id_col', 'geom_col', 'subquery',
'num_ngbrs', 'subquery')]
for idx, val in enumerate(sorted(attrs)):
attr_string += template % {"col": params[val], "alias_num": idx + 1}
attr_string += template % {"col": params[val],
"alias_num": idx + 1}
return attr_string
def query_attr_where(params):
"""
Construct where conditions when building neighbors query
@ -74,7 +80,8 @@ def query_attr_where(params):
'numerator': 'data1',
'denominator': 'data2',
'': ...}
Output: 'idx_replace."data1" IS NOT NULL AND idx_replace."data2" IS NOT NULL'
Output: 'idx_replace."data1" IS NOT NULL AND idx_replace."data2"
IS NOT NULL'
Input:
{'subquery': ...,
'time_cols': ['time1', 'time2', 'time3'],
@ -86,17 +93,18 @@ def query_attr_where(params):
template = "idx_replace.\"%s\" IS NOT NULL"
if 'time_cols' in params:
## markov where clauses
# markov where clauses
attrs = params['time_cols']
# add values to template
for attr in attrs:
attr_string.append(template % attr)
else:
## moran where clauses
# moran where clauses
# get keys
attrs = sorted([k for k in params
if k not in ('id_col', 'geom_col', 'subquery', 'num_ngbrs', 'subquery')])
if k not in ('id_col', 'geom_col', 'subquery',
'num_ngbrs', 'subquery')])
# add values to template
for attr in attrs:
attr_string.append(template % params[attr])
@ -108,6 +116,7 @@ def query_attr_where(params):
return out
def knn(params):
"""SQL query for k-nearest neighbors.
@param vars: dict of values to fill template
@ -139,7 +148,8 @@ def knn(params):
return query.format(**params)
## SQL query for finding queens neighbors (all contiguous polygons)
# SQL query for finding queens neighbors (all contiguous polygons)
def queen(params):
"""SQL query for queen neighbors.
@param params dict: information to fill query
@ -167,14 +177,17 @@ def queen(params):
return query.format(**params)
## to add more weight methods open a ticket or pull request
# to add more weight methods open a ticket or pull request
def get_attributes(query_res, attr_num=1):
"""
@param query_res: query results with attributes and neighbors
@param attr_num: attribute number (1, 2, ...)
"""
return np.array([x['attr' + str(attr_num)] for x in query_res], dtype=np.float)
return np.array([x['attr' + str(attr_num)] for x in query_res],
dtype=np.float)
def empty_zipped_array(num_nones):
"""

View File

@ -14,6 +14,7 @@ import crankshaft.pysal_utils as pu
from crankshaft import random_seeds
import json
class MoranTest(unittest.TestCase):
"""Testing class for Moran's I functions"""
@ -26,12 +27,15 @@ class MoranTest(unittest.TestCase):
"geom_col": "the_geom",
"num_ngbrs": 321}
self.params_markov = {"id_col": "cartodb_id",
"time_cols": ["_2013_dec", "_2014_jan", "_2014_feb"],
"time_cols": ["_2013_dec", "_2014_jan",
"_2014_feb"],
"subquery": "SELECT * FROM a_list",
"geom_col": "the_geom",
"num_ngbrs": 321}
self.neighbors_data = json.loads(open(fixture_file('neighbors.json')).read())
self.moran_data = json.loads(open(fixture_file('moran.json')).read())
self.neighbors_data = json.loads(
open(fixture_file('neighbors.json')).read())
self.moran_data = json.loads(
open(fixture_file('moran.json')).read())
def test_map_quads(self):
"""Test map_quads"""
@ -54,35 +58,49 @@ class MoranTest(unittest.TestCase):
def test_moran_local(self):
"""Test Moran's I local"""
data = [ { 'id': d['id'], 'attr1': d['value'], 'neighbors': d['neighbors'] } for d in self.neighbors_data]
data = [{'id': d['id'],
'attr1': d['value'],
'neighbors': d['neighbors']} for d in self.neighbors_data]
plpy._define_result('select', data)
random_seeds.set_random_seeds(1234)
result = cc.moran_local('subquery', 'value', 'knn', 5, 99, 'the_geom', 'cartodb_id')
result = cc.moran_local('subquery', 'value',
'knn', 5, 99, 'the_geom', 'cartodb_id')
result = [(row[0], row[1]) for row in result]
expected = self.moran_data
for ([res_val, res_quad], [exp_val, exp_quad]) in zip(result, expected):
zipped_values = zip(result, self.moran_data)
for ([res_val, res_quad], [exp_val, exp_quad]) in zipped_values:
self.assertAlmostEqual(res_val, exp_val)
self.assertEqual(res_quad, exp_quad)
def test_moran_local_rate(self):
"""Test Moran's I rate"""
data = [ { 'id': d['id'], 'attr1': d['value'], 'attr2': 1, 'neighbors': d['neighbors'] } for d in self.neighbors_data]
data = [{'id': d['id'],
'attr1': d['value'],
'attr2': 1,
'neighbors': d['neighbors']} for d in self.neighbors_data]
plpy._define_result('select', data)
random_seeds.set_random_seeds(1234)
result = cc.moran_local_rate('subquery', 'numerator', 'denominator', 'knn', 5, 99, 'the_geom', 'cartodb_id')
print 'result == None? ', result == None
result = cc.moran_local_rate('subquery', 'numerator', 'denominator',
'knn', 5, 99, 'the_geom', 'cartodb_id')
result = [(row[0], row[1]) for row in result]
expected = self.moran_data
for ([res_val, res_quad], [exp_val, exp_quad]) in zip(result, expected):
zipped_values = zip(result, self.moran_data)
for ([res_val, res_quad], [exp_val, exp_quad]) in zipped_values:
self.assertAlmostEqual(res_val, exp_val)
def test_moran(self):
"""Test Moran's I global"""
data = [{ 'id': d['id'], 'attr1': d['value'], 'neighbors': d['neighbors'] } for d in self.neighbors_data]
data = [{'id': d['id'],
'attr1': d['value'],
'neighbors': d['neighbors']} for d in self.neighbors_data]
plpy._define_result('select', data)
random_seeds.set_random_seeds(1235)
result = cc.moran('table', 'value', 'knn', 5, 99, 'the_geom', 'cartodb_id')
print 'result == None?', result == None
result = cc.moran('table', 'value',
'knn', 5, 99, 'the_geom', 'cartodb_id')
result_moran = result[0][0]
expected_moran = np.array([row[0] for row in self.moran_data]).mean()
self.assertAlmostEqual(expected_moran, result_moran, delta=10e-2)