Merge pull request #133 from CartoDB/moran-global-fix

Moran global fix
2016-09-22 10:49:52 +02:00 · 2016-09-22 10:49:52 +02:00 · d0e967d22c
commit d0e967d22c
parent bcb73dee11 c52eb507ea
6 changed files with 88 additions and 36 deletions
--- a/src/pg/sql/10_moran.sql
+++ b/src/pg/sql/10_moran.sql
@ -10,7 +10,7 @@ CREATE OR REPLACE FUNCTION
      id_col TEXT DEFAULT 'cartodb_id')
 RETURNS TABLE (moran NUMERIC, significance NUMERIC)
 AS $$
-  from crankshaft.clustering import moran_local
+  from crankshaft.clustering import moran
  # TODO: use named parameters or a dictionary
  return moran(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
 $$ LANGUAGE plpythonu;
--- a/src/pg/test/expected/02_moran_test.out
+++ b/src/pg/test/expected/02_moran_test.out
@ -5,6 +5,12 @@ SET client_min_messages TO WARNING;
 \set ECHO none
 _cdb_random_seeds

+(1 row)
+moran|significance
+0.3399|-0.0196
+(1 row)
+_cdb_random_seeds
+
 (1 row)
 code|quads
 01|HH
--- a/src/pg/test/sql/02_moran_test.sql
+++ b/src/pg/test/sql/02_moran_test.sql
@ -6,6 +6,14 @@
 -- Areas of Interest functions perform some nondeterministic computations
 -- (to estimate the significance); we will set the seeds for the RNGs
 -- that affect those results to have repeateble results
+
+-- Moran's I Global
+SELECT cdb_crankshaft._cdb_random_seeds(1234);
+
+SELECT round(moran, 4) As moran, round(significance, 4) As significance
+  FROM cdb_crankshaft.CDB_AreasOfInterestGlobal('SELECT * FROM ppoints', 'value') m(moran, significance);
+
+-- Moran's I Local
 SELECT cdb_crankshaft._cdb_random_seeds(1234);

 SELECT ppoints.code, m.quads
--- a/src/py/crankshaft/crankshaft/clustering/moran.py
+++ b/src/py/crankshaft/crankshaft/clustering/moran.py
@ -14,6 +14,7 @@ import crankshaft.pysal_utils as pu

 # High level interface ---------------------------------------

+
 def moran(subquery, attr_name,
          w_type, num_ngbrs, permutations, geom_col, id_col):
    """
@ -39,18 +40,19 @@ def moran(subquery, attr_name,
        plpy.error('Analysis failed: %s' % e)
        return pu.empty_zipped_array(2)

-    ## collect attributes
+    # collect attributes
    attr_vals = pu.get_attributes(result)

-    ## calculate weights
+    # calculate weights
    weight = pu.get_weight(result, w_type, num_ngbrs)

-    ## calculate moran global
+    # calculate moran global
    moran_global = ps.esda.moran.Moran(attr_vals, weight,
                                       permutations=permutations)

    return zip([moran_global.I], [moran_global.EI])

+
 def moran_local(subquery, attr,
                w_type, num_ngbrs, permutations, geom_col, id_col):
    """
@ -90,6 +92,7 @@ def moran_local(subquery, attr,

    return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)

+
 def moran_rate(subquery, numerator, denominator,
               w_type, num_ngbrs, permutations, geom_col, id_col):
    """
@ -114,18 +117,19 @@ def moran_rate(subquery, numerator, denominator,
        plpy.error('Analysis failed: %s' % e)
        return pu.empty_zipped_array(2)

-    ## collect attributes
+    # collect attributes
    numer = pu.get_attributes(result, 1)
    denom = pu.get_attributes(result, 2)

    weight = pu.get_weight(result, w_type, num_ngbrs)

-    ## calculate moran global rate
+    # calculate moran global rate
    lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight,
                                         permutations=permutations)

    return zip([lisa_rate.I], [lisa_rate.EI])

+
 def moran_local_rate(subquery, numerator, denominator,
                     w_type, num_ngbrs, permutations, geom_col, id_col):
    """
@ -153,7 +157,7 @@ def moran_local_rate(subquery, numerator, denominator,
        plpy.error('Analysis failed: %s' % e)
        return pu.empty_zipped_array(5)

-    ## collect attributes
+    # collect attributes
    numer = pu.get_attributes(result, 1)
    denom = pu.get_attributes(result, 2)

@ -168,6 +172,7 @@ def moran_local_rate(subquery, numerator, denominator,

    return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)

+
 def moran_local_bv(subquery, attr1, attr2,
                   permutations, geom_col, id_col, w_type, num_ngbrs):
    """
@ -189,11 +194,11 @@ def moran_local_bv(subquery, attr1, attr2,
        if len(result) == 0:
            return pu.empty_zipped_array(4)
    except plpy.SPIError:
-        plpy.error("Error: areas of interest query failed, " \
+        plpy.error("Error: areas of interest query failed, "
                   "check input parameters")
        return pu.empty_zipped_array(4)

-    ## collect attributes
+    # collect attributes
    attr1_vals = pu.get_attributes(result, 1)
    attr2_vals = pu.get_attributes(result, 2)

@ -211,6 +216,7 @@ def moran_local_bv(subquery, attr1, attr2,

 # Low level functions ----------------------------------------

+
 def map_quads(coord):
    """
        Map a quadrant number to Moran's I designation
@ -231,6 +237,7 @@ def map_quads(coord):
    else:
        return None

+
 def quad_position(quads):
    """
        Produce Moran's I classification based of n
--- a/src/py/crankshaft/crankshaft/pysal_utils/pysal_utils.py
+++ b/src/py/crankshaft/crankshaft/pysal_utils/pysal_utils.py
@ -6,6 +6,7 @@
 import numpy as np
 import pysal as ps

+
 def construct_neighbor_query(w_type, query_vals):
    """Return query (a string) used for finding neighbors
        @param w_type text: type of neighbors to calculate ('knn' or 'queen')
@ -17,7 +18,8 @@ def construct_neighbor_query(w_type, query_vals):
    else:
        return queen(query_vals)

-## Build weight object
+
+# Build weight object
 def get_weight(query_res, w_type='knn', num_ngbrs=5):
    """
        Construct PySAL weight from return value of query
@ -39,6 +41,7 @@ def get_weight(query_res, w_type='knn', num_ngbrs=5):

    return built_weight

+
 def query_attr_select(params):
    """
        Create portion of SELECT statement for attributes inolved in query.
@ -50,21 +53,24 @@ def query_attr_select(params):
    template = "i.\"%(col)s\"::numeric As attr%(alias_num)s, "

    if 'time_cols' in params:
-        ## if markov analysis
+        # if markov analysis
        attrs = params['time_cols']

        for idx, val in enumerate(attrs):
            attr_string += template % {"col": val, "alias_num": idx + 1}
    else:
-        ## if moran's analysis
+        # if moran's analysis
        attrs = [k for k in params
-                 if k not in ('id_col', 'geom_col', 'subquery', 'num_ngbrs', 'subquery')]
+                 if k not in ('id_col', 'geom_col', 'subquery',
+                              'num_ngbrs', 'subquery')]

        for idx, val in enumerate(sorted(attrs)):
-            attr_string += template % {"col": params[val], "alias_num": idx + 1}
+            attr_string += template % {"col": params[val],
+                                       "alias_num": idx + 1}

    return attr_string

+
 def query_attr_where(params):
    """
      Construct where conditions when building neighbors query
@ -74,7 +80,8 @@ def query_attr_where(params):
             'numerator': 'data1',
             'denominator': 'data2',
             '': ...}
-        Output: 'idx_replace."data1" IS NOT NULL AND idx_replace."data2" IS NOT NULL'
+        Output: 'idx_replace."data1" IS NOT NULL AND idx_replace."data2"
+                IS NOT NULL'
        Input:
        {'subquery': ...,
         'time_cols': ['time1', 'time2', 'time3'],
@ -86,17 +93,18 @@ def query_attr_where(params):
    template = "idx_replace.\"%s\" IS NOT NULL"

    if 'time_cols' in params:
-        ## markov where clauses
+        # markov where clauses
        attrs = params['time_cols']
        # add values to template
        for attr in attrs:
            attr_string.append(template % attr)
    else:
-        ## moran where clauses
+        # moran where clauses

        # get keys
        attrs = sorted([k for k in params
-                        if k not in ('id_col', 'geom_col', 'subquery', 'num_ngbrs', 'subquery')])
+                        if k not in ('id_col', 'geom_col', 'subquery',
+                                     'num_ngbrs', 'subquery')])
        # add values to template
        for attr in attrs:
            attr_string.append(template % params[attr])
@ -108,6 +116,7 @@ def query_attr_where(params):

    return out

+
 def knn(params):
    """SQL query for k-nearest neighbors.
        @param vars: dict of values to fill template
@ -139,7 +148,8 @@ def knn(params):

    return query.format(**params)

-## SQL query for finding queens neighbors (all contiguous polygons)
+
+# SQL query for finding queens neighbors (all contiguous polygons)
 def queen(params):
    """SQL query for queen neighbors.
        @param params dict: information to fill query
@ -167,14 +177,17 @@ def queen(params):

    return query.format(**params)

-## to add more weight methods open a ticket or pull request
+# to add more weight methods open a ticket or pull request
+

 def get_attributes(query_res, attr_num=1):
    """
        @param query_res: query results with attributes and neighbors
        @param attr_num: attribute number (1, 2, ...)
    """
-    return np.array([x['attr' + str(attr_num)] for x in query_res], dtype=np.float)
+    return np.array([x['attr' + str(attr_num)] for x in query_res],
+                    dtype=np.float)
+

 def empty_zipped_array(num_nones):
    """
--- a/src/py/crankshaft/test/test_clustering_moran.py
+++ b/src/py/crankshaft/test/test_clustering_moran.py
@ -14,6 +14,7 @@ import crankshaft.pysal_utils as pu
 from crankshaft import random_seeds
 import json

+
 class MoranTest(unittest.TestCase):
    """Testing class for Moran's I functions"""

@ -26,12 +27,15 @@ class MoranTest(unittest.TestCase):
                       "geom_col": "the_geom",
                       "num_ngbrs": 321}
        self.params_markov = {"id_col": "cartodb_id",
-                              "time_cols": ["_2013_dec", "_2014_jan", "_2014_feb"],
+                              "time_cols": ["_2013_dec", "_2014_jan",
+                                            "_2014_feb"],
                              "subquery": "SELECT * FROM a_list",
                              "geom_col": "the_geom",
                              "num_ngbrs": 321}
-        self.neighbors_data = json.loads(open(fixture_file('neighbors.json')).read())
-        self.moran_data = json.loads(open(fixture_file('moran.json')).read())
+        self.neighbors_data = json.loads(
+          open(fixture_file('neighbors.json')).read())
+        self.moran_data = json.loads(
+          open(fixture_file('moran.json')).read())

    def test_map_quads(self):
        """Test map_quads"""
@ -54,35 +58,49 @@ class MoranTest(unittest.TestCase):

    def test_moran_local(self):
        """Test Moran's I local"""
-        data = [ { 'id': d['id'], 'attr1': d['value'], 'neighbors': d['neighbors'] } for d in self.neighbors_data]
+        data = [{'id': d['id'],
+                 'attr1': d['value'],
+                 'neighbors': d['neighbors']} for d in self.neighbors_data]
+
        plpy._define_result('select', data)
        random_seeds.set_random_seeds(1234)
-        result = cc.moran_local('subquery', 'value', 'knn', 5, 99, 'the_geom', 'cartodb_id')
+        result = cc.moran_local('subquery', 'value',
+                                'knn', 5, 99, 'the_geom', 'cartodb_id')
        result = [(row[0], row[1]) for row in result]
-        expected = self.moran_data
-        for ([res_val, res_quad], [exp_val, exp_quad]) in zip(result, expected):
+        zipped_values = zip(result, self.moran_data)
+
+        for ([res_val, res_quad], [exp_val, exp_quad]) in zipped_values:
            self.assertAlmostEqual(res_val, exp_val)
            self.assertEqual(res_quad, exp_quad)

    def test_moran_local_rate(self):
        """Test Moran's I rate"""
-        data = [ { 'id': d['id'], 'attr1': d['value'], 'attr2': 1, 'neighbors': d['neighbors'] } for d in self.neighbors_data]
+        data = [{'id': d['id'],
+                 'attr1': d['value'],
+                 'attr2': 1,
+                 'neighbors': d['neighbors']} for d in self.neighbors_data]
+
        plpy._define_result('select', data)
        random_seeds.set_random_seeds(1234)
-        result = cc.moran_local_rate('subquery', 'numerator', 'denominator', 'knn', 5, 99, 'the_geom', 'cartodb_id')
-        print 'result == None? ', result == None
+        result = cc.moran_local_rate('subquery', 'numerator', 'denominator',
+                                     'knn', 5, 99, 'the_geom', 'cartodb_id')
        result = [(row[0], row[1]) for row in result]
-        expected = self.moran_data
-        for ([res_val, res_quad], [exp_val, exp_quad]) in zip(result, expected):
+
+        zipped_values = zip(result, self.moran_data)
+
+        for ([res_val, res_quad], [exp_val, exp_quad]) in zipped_values:
            self.assertAlmostEqual(res_val, exp_val)

    def test_moran(self):
        """Test Moran's I global"""
-        data = [{ 'id': d['id'], 'attr1': d['value'], 'neighbors': d['neighbors'] } for d in self.neighbors_data]
+        data = [{'id': d['id'],
+                 'attr1': d['value'],
+                 'neighbors': d['neighbors']} for d in self.neighbors_data]
        plpy._define_result('select', data)
        random_seeds.set_random_seeds(1235)
-        result = cc.moran('table', 'value', 'knn', 5, 99, 'the_geom', 'cartodb_id')
-        print 'result == None?', result == None
+        result = cc.moran('table', 'value',
+                          'knn', 5, 99, 'the_geom', 'cartodb_id')
+
        result_moran = result[0][0]
        expected_moran = np.array([row[0] for row in self.moran_data]).mean()
        self.assertAlmostEqual(expected_moran, result_moran, delta=10e-2)