From ba875bc50b985a8712b9339972100ff6dbfe45e7 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Tue, 27 Feb 2018 13:40:28 -0500 Subject: [PATCH 01/12] creates new functions which expose spatial lag --- src/pg/sql/10_moran.sql | 76 +++++++++++++++++-- .../crankshaft/crankshaft/clustering/moran.py | 11 ++- 2 files changed, 80 insertions(+), 7 deletions(-) diff --git a/src/pg/sql/10_moran.sql b/src/pg/sql/10_moran.sql index 840e07b..fd936b8 100644 --- a/src/pg/sql/10_moran.sql +++ b/src/pg/sql/10_moran.sql @@ -27,16 +27,48 @@ CREATE OR REPLACE FUNCTION permutations INT, geom_col TEXT, id_col TEXT) -RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +RETURNS TABLE ( + moran NUMERIC, + quads TEXT, + significance NUMERIC, + rowid INT, + vals NUMERIC, + spatial_lag NUMERIC) AS $$ from crankshaft.clustering import Moran moran = Moran() - # TODO: use named parameters or a dictionary return moran.local_stat(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col) $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; --- Moran's I Local (public-facing function) + +-- Moran's I Local (public-facing function) - deprecated +CREATE OR REPLACE FUNCTION + CDB_MoransILocal( + subquery TEXT, + column_name TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS TABLE ( + moran NUMERIC, + quads TEXT, + significance NUMERIC, + rowid INT, + vals NUMERIC, + spatial_lag NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals, spatial_lag + FROM cdb_crankshaft._CDB_AreasOfInterestLocal( + subquery, column_name, w_type, + num_ngbrs, permutations, geom_col, id_col); + +$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE; + +-- Moran's I Local (public-facing function) - DEPRECATED CREATE OR REPLACE FUNCTION CDB_AreasOfInterestLocal( subquery TEXT, @@ -144,7 +176,13 @@ CREATE OR REPLACE FUNCTION geom_col TEXT, id_col TEXT) 
RETURNS -TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +TABLE( + moran NUMERIC, + quads TEXT, + significance NUMERIC, + rowid INT, + vals NUMERIC, + spatial_lag NUMERIC) AS $$ from crankshaft.clustering import Moran moran = Moran() @@ -152,7 +190,7 @@ AS $$ return moran.local_rate_stat(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; --- Moran's I Local Rate (public-facing function) +-- Moran's I Local Rate (public-facing function) - DEPRECATED CREATE OR REPLACE FUNCTION CDB_AreasOfInterestLocalRate( subquery TEXT, @@ -172,6 +210,34 @@ AS $$ $$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE; +-- Replaces CDB_AreasOfInterestLocalRate +CREATE OR REPLACE FUNCTION + CDB_MoransILocalRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS +TABLE( + moran NUMERIC, + quads TEXT, + significance NUMERIC, + rowid INT, + vals NUMERIC, + spatial_lag NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals, spatial_lag + FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate( + subquery, numerator, denominator, w_type, + num_ngbrs, permutations, geom_col, id_col); + +$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE; + -- Moran's I Local Rate only for HH and HL (public-facing function) CREATE OR REPLACE FUNCTION CDB_GetSpatialHotspotsRate( diff --git a/src/py/crankshaft/crankshaft/clustering/moran.py b/src/py/crankshaft/crankshaft/clustering/moran.py index 0d5753f..cfd40dd 100644 --- a/src/py/crankshaft/crankshaft/clustering/moran.py +++ b/src/py/crankshaft/crankshaft/clustering/moran.py @@ -78,7 +78,11 @@ class Moran(object): # find quadrants for each geometry quads = quad_position(lisa.q) - return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y) + # calculate spatial lag + lag = 
ps.weights.spatial_lag.lag_spatial(weight, lisa.y) + + return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, + lisa.y, lag) def global_rate_stat(self, subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col): @@ -138,7 +142,10 @@ class Moran(object): # find quadrants for each geometry quads = quad_position(lisa.q) - return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y) + # spatial lag + lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y) + + return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y, lag) def local_bivariate_stat(self, subquery, attr1, attr2, permutations, geom_col, id_col, From 46d3375ea3e905ba213d69fcd5313a4e2c456328 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Tue, 27 Feb 2018 14:24:26 -0500 Subject: [PATCH 02/12] adds new private functions --- src/pg/sql/10_moran.sql | 64 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 7 deletions(-) diff --git a/src/pg/sql/10_moran.sql b/src/pg/sql/10_moran.sql index fd936b8..1428bc0 100644 --- a/src/pg/sql/10_moran.sql +++ b/src/pg/sql/10_moran.sql @@ -17,7 +17,7 @@ AS $$ num_ngbrs, permutations, geom_col, id_col) $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; --- Moran's I Local (internal function) +-- Moran's I Local (internal function) - DEPRECATED CREATE OR REPLACE FUNCTION _CDB_AreasOfInterestLocal( subquery TEXT, @@ -27,6 +27,29 @@ CREATE OR REPLACE FUNCTION permutations INT, geom_col TEXT, id_col TEXT) +RETURNS TABLE ( + moran NUMERIC, + quads TEXT, + significance NUMERIC, + rowid INT, + vals NUMERIC) +AS $$ + from crankshaft.clustering import Moran + moran = Moran() + return moran.local_stat(subquery, column_name, w_type, + num_ngbrs, permutations, geom_col, id_col) +$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; + +-- Moran's I Local (internal function) +CREATE OR REPLACE FUNCTION + _CDB_MoransILocal( + subquery TEXT, + column_name TEXT, + w_type TEXT, + num_ngbrs INT, + permutations INT, + geom_col TEXT, + id_col 
TEXT) RETURNS TABLE ( moran NUMERIC, quads TEXT, @@ -42,7 +65,8 @@ AS $$ $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; --- Moran's I Local (public-facing function) - deprecated +-- Moran's I Local (public-facing function) +-- Replaces CDB_AreasOfInterestLocal CREATE OR REPLACE FUNCTION CDB_MoransILocal( subquery TEXT, @@ -62,7 +86,7 @@ RETURNS TABLE ( AS $$ SELECT moran, quads, significance, rowid, vals, spatial_lag - FROM cdb_crankshaft._CDB_AreasOfInterestLocal( + FROM cdb_crankshaft._CDB_MoransILocal( subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col); @@ -164,7 +188,7 @@ AS $$ $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; --- Moran's I Local Rate (internal function) +-- Moran's I Local Rate (internal function) - DEPRECATED CREATE OR REPLACE FUNCTION _CDB_AreasOfInterestLocalRate( subquery TEXT, @@ -181,8 +205,7 @@ TABLE( quads TEXT, significance NUMERIC, rowid INT, - vals NUMERIC, - spatial_lag NUMERIC) + vals NUMERIC) AS $$ from crankshaft.clustering import Moran moran = Moran() @@ -210,6 +233,33 @@ AS $$ $$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE; +-- Internal function +CREATE OR REPLACE FUNCTION + _CDB_MoransILocalRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT, + num_ngbrs INT, + permutations INT, + geom_col TEXT, + id_col TEXT) +RETURNS +TABLE( + moran NUMERIC, + quads TEXT, + significance NUMERIC, + rowid INT, + vals NUMERIC, + spatial_lag NUMERIC) +AS $$ + from crankshaft.clustering import Moran + moran = Moran() + # TODO: use named parameters or a dictionary + return moran.local_rate_stat(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) +$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; + +-- Moran's I Rate -- Replaces CDB_AreasOfInterestLocalRate CREATE OR REPLACE FUNCTION CDB_MoransILocalRate( @@ -232,7 +282,7 @@ TABLE( AS $$ SELECT moran, quads, significance, rowid, vals, spatial_lag - FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate( + FROM 
cdb_crankshaft._CDB_MoransILocalRate( subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col); From dcab63f3b3e191ae47d44379a0fb89ffe8cc549f Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Tue, 27 Feb 2018 15:36:27 -0500 Subject: [PATCH 03/12] update deprecated private funcs to return all but spatial lag --- src/pg/sql/10_moran.sql | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/pg/sql/10_moran.sql b/src/pg/sql/10_moran.sql index 1428bc0..45e1e1e 100644 --- a/src/pg/sql/10_moran.sql +++ b/src/pg/sql/10_moran.sql @@ -36,8 +36,10 @@ RETURNS TABLE ( AS $$ from crankshaft.clustering import Moran moran = Moran() - return moran.local_stat(subquery, column_name, w_type, - num_ngbrs, permutations, geom_col, id_col) + result = moran.local_stat(subquery, column_name, w_type, + num_ngbrs, permutations, geom_col, id_col) + # remove spatial lag + return [r[:-1] for r in result] $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; -- Moran's I Local (internal function) @@ -210,7 +212,9 @@ AS $$ from crankshaft.clustering import Moran moran = Moran() # TODO: use named parameters or a dictionary - return moran.local_rate_stat(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) + result = moran.local_rate_stat(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) + # remove spatial lag + return [r[:-1] for r in result] $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; -- Moran's I Local Rate (public-facing function) - DEPRECATED From dd32d454223ddeaca29d92261c5c633eefdae6b5 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Tue, 27 Feb 2018 16:12:20 -0500 Subject: [PATCH 04/12] small pep8/syntax fixes --- src/py/crankshaft/crankshaft/clustering/moran.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/py/crankshaft/crankshaft/clustering/moran.py b/src/py/crankshaft/crankshaft/clustering/moran.py index cfd40dd..c4cad92 100644 --- 
a/src/py/crankshaft/crankshaft/clustering/moran.py +++ b/src/py/crankshaft/crankshaft/clustering/moran.py @@ -16,6 +16,8 @@ import crankshaft.pysal_utils as pu class Moran(object): + """Class for calculation of Moran's I statistics (global, local, and local + rate""" def __init__(self, data_provider=None): if data_provider is None: self.data_provider = AnalysisDataProvider() @@ -199,8 +201,7 @@ def map_quads(coord): return 'LL' elif coord == 4: return 'HL' - else: - return None + return None def quad_position(quads): From 0d050a22067dd6a5fa4a5bc63a57ec06dd1877d5 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Tue, 27 Feb 2018 16:36:08 -0500 Subject: [PATCH 05/12] adding more thorough docs --- .../crankshaft/crankshaft/clustering/moran.py | 127 ++++++++++++++---- 1 file changed, 103 insertions(+), 24 deletions(-) diff --git a/src/py/crankshaft/crankshaft/clustering/moran.py b/src/py/crankshaft/crankshaft/clustering/moran.py index c4cad92..ff4501b 100644 --- a/src/py/crankshaft/crankshaft/clustering/moran.py +++ b/src/py/crankshaft/crankshaft/clustering/moran.py @@ -1,23 +1,26 @@ """ Moran's I geostatistics (global clustering & outliers presence) +Functionality relies PySAL: http://pysal.readthedocs.io/en/latest/ """ -# TODO: Fill in local neighbors which have null/NoneType values with the -# average of the their neighborhood - -import pysal as ps from collections import OrderedDict -from crankshaft.analysis_data_provider import AnalysisDataProvider +import pysal as ps # crankshaft module import crankshaft.pysal_utils as pu +from crankshaft.analysis_data_provider import AnalysisDataProvider # High level interface --------------------------------------- class Moran(object): """Class for calculation of Moran's I statistics (global, local, and local - rate""" + rate) + + Parameters: + data_provider (:obj:`AnalysisDataProvider`): Class for fetching data. See + the `crankshaft.analysis_data_provider` module for more information. 
+ """ def __init__(self, data_provider=None): if data_provider is None: self.data_provider = AnalysisDataProvider() @@ -30,7 +33,26 @@ class Moran(object): Moran's I (global) Implementation building neighbors with a PostGIS database and Moran's I core clusters with PySAL. - Andy Eschbacher + + Args: + + subquery (str): Query to give access to the data needed. This query + must give access to ``attr_name``, ``geom_col``, and ``id_col``. + attr_name (str): Column name of data to analyze + w_type (str): Type of spatial weight. Must be one of `knn` + or `queen`. See `PySAL documentation + `__ + for more information. + num_ngbrs (int): If using `knn` for ``w_type``, this + specifies the number of neighbors to be used to define the spatial + neighborhoods. + permutations (int): Number of permutations for performing + conditional randomization to find the p-value. Higher numbers + takes a longer time for getting results. + geom_col (str): Name of the geometry column in the dataset for + finding the spatial neighborhoods. + id_col (str): Row index for each value. Usually the database index. + """ params = OrderedDict([("id_col", id_col), ("attr1", attr_name), @@ -55,8 +77,26 @@ class Moran(object): def local_stat(self, subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col): """ - Moran's I implementation for PL/Python - Andy Eschbacher + Moran's I (local) + + Args: + + subquery (str): Query to give access to the data needed. This query + must give access to ``attr_name``, ``geom_col``, and ``id_col``. + attr (str): Column name of data to analyze + w_type (str): Type of spatial weight. Must be one of `knn` + or `queen`. See `PySAL documentation + `__ + for more information. + num_ngbrs (int): If using `knn` for ``w_type``, this + specifies the number of neighbors to be used to define the spatial + neighborhoods. + permutations (int): Number of permutations for performing + conditional randomization to find the p-value. 
Higher numbers + takes a longer time for getting results. + geom_col (str): Name of the geometry column in the dataset for + finding the spatial neighborhoods. + id_col (str): Row index for each value. Usually the database index. """ # geometries with attributes that are null are ignored @@ -90,7 +130,26 @@ class Moran(object): w_type, num_ngbrs, permutations, geom_col, id_col): """ Moran's I Rate (global) - Andy Eschbacher + + Args: + + subquery (str): Query to give access to the data needed. This query + must give access to ``attr_name``, ``geom_col``, and ``id_col``. + numerator (str): Column name of numerator to analyze + denominator (str): Column name of the denominator + w_type (str): Type of spatial weight. Must be one of `knn` + or `queen`. See `PySAL documentation + `__ + for more information. + num_ngbrs (int): If using `knn` for ``w_type``, this + specifies the number of neighbors to be used to define the spatial + neighborhoods. + permutations (int): Number of permutations for performing + conditional randomization to find the p-value. Higher numbers + takes a longer time for getting results. + geom_col (str): Name of the geometry column in the dataset for + finding the spatial neighborhoods. + id_col (str): Row index for each value. Usually the database index. """ params = OrderedDict([("id_col", id_col), ("attr1", numerator), @@ -116,8 +175,27 @@ class Moran(object): def local_rate_stat(self, subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col): """ - Moran's I Local Rate - Andy Eschbacher + Moran's I Local Rate + + Args: + + subquery (str): Query to give access to the data needed. This query + must give access to ``attr_name``, ``geom_col``, and ``id_col``. + numerator (str): Column name of numerator to analyze + denominator (str): Column name of the denominator + w_type (str): Type of spatial weight. Must be one of `knn` + or `queen`. See `PySAL documentation + `__ + for more information. 
+ num_ngbrs (int): If using `knn` for ``w_type``, this + specifies the number of neighbors to be used to define the spatial + neighborhoods. + permutations (int): Number of permutations for performing + conditional randomization to find the p-value. Higher numbers + takes a longer time for getting results. + geom_col (str): Name of the geometry column in the dataset for + finding the spatial neighborhoods. + id_col (str): Row index for each value. Usually the database index. """ # geometries with values that are null are ignored # resulting in a collection of not as near neighbors @@ -186,12 +264,12 @@ class Moran(object): def map_quads(coord): """ - Map a quadrant number to Moran's I designation - HH=1, LH=2, LL=3, HL=4 - Input: - @param coord (int): quadrant of a specific measurement - Output: - classification (one of 'HH', 'LH', 'LL', or 'HL') + Map a quadrant number to Moran's I designation + HH=1, LH=2, LL=3, HL=4 + Args: + coord (int): quadrant of a specific measurement + Returns: + classification (one of 'HH', 'LH', 'LL', or 'HL') """ if coord == 1: return 'HH' @@ -206,11 +284,12 @@ def map_quads(coord): def quad_position(quads): """ - Produce Moran's I classification based of n - Input: - @param quads ndarray: an array of quads classified by - 1-4 (PySAL default) - Output: - @param list: an array of quads classied by 'HH', 'LL', etc. + Map all quads + + Args: + quads (:obj:`numpy.ndarray`): an array of quads classified by + 1-4 (PySAL default) + Returns: + list: an array of quads classied by 'HH', 'LL', etc. 
""" return [map_quads(q) for q in quads] From 7b1ce57abc3e5e9ea8f97e81842a531ae51a6b40 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Thu, 1 Mar 2018 09:25:56 -0500 Subject: [PATCH 06/12] expand outputs for python moran, adjust existing functions to return previous values --- src/pg/sql/10_moran.sql | 84 ++++++++++++------- .../crankshaft/crankshaft/clustering/moran.py | 54 +++++++++++- 2 files changed, 103 insertions(+), 35 deletions(-) diff --git a/src/pg/sql/10_moran.sql b/src/pg/sql/10_moran.sql index 45e1e1e..bc75aff 100644 --- a/src/pg/sql/10_moran.sql +++ b/src/pg/sql/10_moran.sql @@ -39,7 +39,7 @@ AS $$ result = moran.local_stat(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col) # remove spatial lag - return [r[:-1] for r in result] + return [(r[6], r[0], r[1], r[7], r[5]) for r in result] $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; -- Moran's I Local (internal function) @@ -53,17 +53,21 @@ CREATE OR REPLACE FUNCTION geom_col TEXT, id_col TEXT) RETURNS TABLE ( - moran NUMERIC, quads TEXT, significance NUMERIC, - rowid INT, - vals NUMERIC, - spatial_lag NUMERIC) + spatial_lag NUMERIC, + spatial_lag_std NUMERIC, + orig_val NUMERIC, + orig_val_std NUMERIC, + moran_stat NUMERIC, + rowid INT) AS $$ - from crankshaft.clustering import Moran - moran = Moran() - return moran.local_stat(subquery, column_name, w_type, - num_ngbrs, permutations, geom_col, id_col) + +from crankshaft.clustering import Moran +moran = Moran() +return moran.local_stat(subquery, column_name, w_type, + num_ngbrs, permutations, geom_col, id_col) + $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; @@ -79,15 +83,19 @@ CREATE OR REPLACE FUNCTION geom_col TEXT DEFAULT 'the_geom', id_col TEXT DEFAULT 'cartodb_id') RETURNS TABLE ( - moran NUMERIC, quads TEXT, significance NUMERIC, - rowid INT, - vals NUMERIC, - spatial_lag NUMERIC) + spatial_lag NUMERIC, + spatial_lag_std NUMERIC, + orig_val NUMERIC, + orig_val_std NUMERIC, + moran_stat NUMERIC, + rowid INT) AS $$ - 
SELECT moran, quads, significance, rowid, vals, spatial_lag + SELECT + quads, significance, spatial_lag, spatial_lag_std, + orig_val, orig_val_std, moran_stat, rowid FROM cdb_crankshaft._CDB_MoransILocal( subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col); @@ -214,7 +222,7 @@ AS $$ # TODO: use named parameters or a dictionary result = moran.local_rate_stat(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) # remove spatial lag - return [r[:-1] for r in result] + return [(r[6], r[0], r[1], r[7], r[4]) for r in result] $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; -- Moran's I Local Rate (public-facing function) - DEPRECATED @@ -250,17 +258,27 @@ CREATE OR REPLACE FUNCTION id_col TEXT) RETURNS TABLE( - moran NUMERIC, quads TEXT, significance NUMERIC, - rowid INT, - vals NUMERIC, - spatial_lag NUMERIC) + spatial_lag NUMERIC, + spatial_lag_std NUMERIC, + orig_val NUMERIC, + orig_val_std NUMERIC, + moran_stat NUMERIC, + rowid INT) AS $$ - from crankshaft.clustering import Moran - moran = Moran() - # TODO: use named parameters or a dictionary - return moran.local_rate_stat(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) +from crankshaft.clustering import Moran +moran = Moran() +return moran.local_rate_stat( + subquery, + numerator, + denominator, + w_type, + num_ngbrs, + permutations, + geom_col, + id_col +) $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; -- Moran's I Rate @@ -277,18 +295,22 @@ CREATE OR REPLACE FUNCTION id_col TEXT DEFAULT 'cartodb_id') RETURNS TABLE( - moran NUMERIC, quads TEXT, significance NUMERIC, - rowid INT, - vals NUMERIC, - spatial_lag NUMERIC) + spatial_lag NUMERIC, + spatial_lag_std NUMERIC, + orig_val NUMERIC, + orig_val_std NUMERIC, + moran_stat NUMERIC, + rowid INT) AS $$ - SELECT moran, quads, significance, rowid, vals, spatial_lag - FROM cdb_crankshaft._CDB_MoransILocalRate( - subquery, numerator, denominator, w_type, - num_ngbrs, permutations, 
geom_col, id_col); +SELECT + quads, significance, spatial_lag, spatial_lag_std, + orig_val, orig_val_std, moran_stat, rowid +FROM cdb_crankshaft._CDB_MoransILocalRate( + subquery, numerator, denominator, w_type, + num_ngbrs, permutations, geom_col, id_col); $$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE; diff --git a/src/py/crankshaft/crankshaft/clustering/moran.py b/src/py/crankshaft/crankshaft/clustering/moran.py index ff4501b..cce5670 100644 --- a/src/py/crankshaft/crankshaft/clustering/moran.py +++ b/src/py/crankshaft/crankshaft/clustering/moran.py @@ -1,6 +1,9 @@ """ Moran's I geostatistics (global clustering & outliers presence) -Functionality relies PySAL: http://pysal.readthedocs.io/en/latest/ +Functionality relies on a combination of `PySAL +`__ and the data providered provided in +the class instantiation (which defaults to PostgreSQL's plpy module's `database +access functions `__). """ from collections import OrderedDict @@ -97,6 +100,18 @@ class Moran(object): geom_col (str): Name of the geometry column in the dataset for finding the spatial neighborhoods. id_col (str): Row index for each value. Usually the database index. 
+ + Returns: + list of tuples: Where each tuple consists of the following values: + - quadrants classification (one of `HH`, `HL`, `LL`, or `LH`) + - p-value + - spatial lag + - standardized spatial lag (centered on the mean, normalized by the + standard deviation) + - original value + - standardized value + - Moran's I statistic + - original row index """ # geometries with attributes that are null are ignored @@ -122,9 +137,18 @@ class Moran(object): # calculate spatial lag lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y) + lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z) - return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, - lisa.y, lag) + return zip( + quads, + lisa.p_sim, + lag, + lag_std, + lisa.y, + lisa.z, + lisa.Is, + weight.id_order + ) def global_rate_stat(self, subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col): @@ -196,6 +220,18 @@ class Moran(object): geom_col (str): Name of the geometry column in the dataset for finding the spatial neighborhoods. id_col (str): Row index for each value. Usually the database index. 
+ + Returns: + list of tuples: Where each tuple consists of the following values: + - quadrants classification (one of `HH`, `HL`, `LL`, or `LH`) + - p-value + - spatial lag + - standardized spatial lag (centered on the mean, normalized by the + standard deviation) + - original value (roughly numerator divided by denominator) + - standardized value + - Moran's I statistic + - original row index """ # geometries with values that are null are ignored # resulting in a collection of not as near neighbors @@ -224,8 +260,18 @@ class Moran(object): # spatial lag lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y) + lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z) - return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y, lag) + return zip( + quads, + lisa.p_sim, + lag, + lag_std, + lisa.y, + lisa.z, + lisa.Is, + weight.id_order + ) def local_bivariate_stat(self, subquery, attr1, attr2, permutations, geom_col, id_col, From c1ca03e159f0c6c5bc22a31e55be3afd1c6a4439 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Thu, 1 Mar 2018 09:26:53 -0500 Subject: [PATCH 07/12] adjust tests with new outputs --- src/py/crankshaft/test/test_clustering_moran.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/py/crankshaft/test/test_clustering_moran.py b/src/py/crankshaft/test/test_clustering_moran.py index cc1930e..a91c046 100644 --- a/src/py/crankshaft/test/test_clustering_moran.py +++ b/src/py/crankshaft/test/test_clustering_moran.py @@ -71,10 +71,10 @@ class MoranTest(unittest.TestCase): random_seeds.set_random_seeds(1234) result = moran.local_stat('subquery', 'value', 'knn', 5, 99, 'the_geom', 'cartodb_id') - result = [(row[0], row[1]) for row in result] + result = [(row[0], row[6]) for row in result] zipped_values = zip(result, self.moran_data) - for ([res_val, res_quad], [exp_val, exp_quad]) in zipped_values: + for ([res_quad, res_val], [exp_val, exp_quad]) in zipped_values: self.assertAlmostEqual(res_val, exp_val) 
self.assertEqual(res_quad, exp_quad) @@ -89,11 +89,11 @@ class MoranTest(unittest.TestCase): moran = Moran(FakeDataProvider(data)) result = moran.local_rate_stat('subquery', 'numerator', 'denominator', 'knn', 5, 99, 'the_geom', 'cartodb_id') - result = [(row[0], row[1]) for row in result] + result = [(row[0], row[6]) for row in result] zipped_values = zip(result, self.moran_data) - for ([res_val, res_quad], [exp_val, exp_quad]) in zipped_values: + for ([res_quad, res_val], [exp_val, exp_quad]) in zipped_values: self.assertAlmostEqual(res_val, exp_val) def test_moran(self): From f1bd05831b730de62dd6c1d529ca4bed52bf20af Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Thu, 1 Mar 2018 10:22:29 -0500 Subject: [PATCH 08/12] adds more thorough tests --- src/pg/test/expected/02_moran_test.out | 114 +++++++++++++++++++++++++ src/pg/test/sql/02_moran_test.sql | 37 ++++++++ 2 files changed, 151 insertions(+) diff --git a/src/pg/test/expected/02_moran_test.out b/src/pg/test/expected/02_moran_test.out index 95ce189..384f180 100644 --- a/src/pg/test/expected/02_moran_test.out +++ b/src/pg/test/expected/02_moran_test.out @@ -68,6 +68,63 @@ code|quads (52 rows) _cdb_random_seeds +(1 row) +code|quads|diff_orig|expected|moran_stat_not_null|significance_not_null|value_comparison +01|HH|t|t|t|t|t +02|HL|t|t|t|t|t +03|LL|t|t|t|t|t +04|LL|t|t|t|t|t +05|LH|t|t|t|t|t +06|LL|t|t|t|t|t +07|HH|t|t|t|t|t +08|HH|t|t|t|t|t +09|HH|t|t|t|t|t +10|LL|t|t|t|t|t +11|LL|t|t|t|t|t +12|LL|t|t|t|t|t +13|HL|t|t|t|t|t +14|LL|t|t|t|t|t +15|LL|t|t|t|t|t +16|HH|t|t|t|t|t +17|HH|t|t|t|t|t +18|LL|t|t|t|t|t +19|HH|t|t|t|t|t +20|HH|t|t|t|t|t +21|LL|t|t|t|t|t +22|HH|t|t|t|t|t +23|LL|t|t|t|t|t +24|LL|t|t|t|t|t +25|HH|t|t|t|t|t +26|HH|t|t|t|t|t +27|LL|t|t|t|t|t +28|HH|t|t|t|t|t +29|LL|t|t|t|t|t +30|LL|t|t|t|t|t +31|HH|t|t|t|t|t +32|LL|t|t|t|t|t +33|HL|t|t|t|t|t +34|LH|t|t|t|t|t +35|LL|t|t|t|t|t +36|LL|t|t|t|t|t +37|HL|t|t|t|t|t +38|HL|t|t|t|t|t +39|HH|t|t|t|t|t +40|HH|t|t|t|t|t +41|HL|t|t|t|t|t +42|LH|t|t|t|t|t 
+43|LH|t|t|t|t|t +44|LL|t|t|t|t|t +45|LH|t|t|t|t|t +46|LL|t|t|t|t|t +47|LL|t|t|t|t|t +48|HH|t|t|t|t|t +49|LH|t|t|t|t|t +50|HH|t|t|t|t|t +51|LL|t|t|t|t|t +52|LL|t|t|t|t|t +(52 rows) +_cdb_random_seeds + (1 row) code|quads 01|HH @@ -204,6 +261,63 @@ code|quads (52 rows) _cdb_random_seeds +(1 row) +code|quads|diff_orig|expected|moran_stat_not_null|significance_not_null +01|HH|t|t|t|t +02|HL|t|t|t|t +03|LL|t|t|t|t +04|LL|t|t|t|t +05|LH|t|t|t|t +06|LL|t|t|t|t +07|HH|t|t|t|t +08|HH|t|t|t|t +09|HH|t|t|t|t +10|LL|t|t|t|t +11|LL|t|t|t|t +12|LL|t|t|t|t +13|HL|t|t|t|t +14|LL|t|t|t|t +15|LL|t|t|t|t +16|HH|t|t|t|t +17|HH|t|t|t|t +18|LL|t|t|t|t +19|HH|t|t|t|t +20|HH|t|t|t|t +21|LL|t|t|t|t +22|HH|t|t|t|t +23|LL|t|t|t|t +24|LL|t|t|t|t +25|HH|t|t|t|t +26|HH|t|t|t|t +27|LL|t|t|t|t +28|HH|t|t|t|t +29|LL|t|t|t|t +30|LL|t|t|t|t +31|HH|t|t|t|t +32|LL|t|t|t|t +33|HL|t|t|t|t +34|LH|t|t|t|t +35|LL|t|t|t|t +36|LL|t|t|t|t +37|HL|t|t|t|t +38|HL|t|t|t|t +39|HH|t|t|t|t +40|HH|t|t|t|t +41|HL|t|t|t|t +42|LH|t|t|t|t +43|LH|t|t|t|t +44|LL|t|t|t|t +45|LH|t|t|t|t +46|LL|t|t|t|t +47|LL|t|t|t|t +48|HH|t|t|t|t +49|LH|t|t|t|t +50|HH|t|t|t|t +51|LL|t|t|t|t +52|LL|t|t|t|t +(52 rows) +_cdb_random_seeds + (1 row) code|quads 01|HH diff --git a/src/pg/test/sql/02_moran_test.sql b/src/pg/test/sql/02_moran_test.sql index d2cfaef..ac5a231 100644 --- a/src/pg/test/sql/02_moran_test.sql +++ b/src/pg/test/sql/02_moran_test.sql @@ -24,6 +24,25 @@ SELECT ppoints.code, m.quads SELECT cdb_crankshaft._cdb_random_seeds(1234); +-- Moran's I local +SELECT + ppoints.code, m.quads, + abs(avg(m.orig_val_std) OVER ()) < 1e-6 as diff_orig, + CASE WHEN m.quads = 'HL' THEN m.orig_val_std > m.spatial_lag_std + WHEN m.quads = 'HH' THEN m.orig_val_std >= 0 and m.spatial_lag_std >= 0 + WHEN m.quads = 'LH' THEN m.orig_val_std < m.spatial_lag_std + WHEN m.quads = 'LL' THEN m.orig_val_std <= 0 and m.spatial_lag_std <= 0 + ELSE null END as expected, + moran_stat is not null moran_stat_not_null, + significance >= 0.001 
significance_not_null, -- greater than 1/1000 (default) + abs(m.orig_val - ppoints.value) <= 1e-6 as value_comparison + FROM ppoints + JOIN cdb_crankshaft.CDB_MoransILocal('SELECT * FROM ppoints', 'value') m + ON ppoints.cartodb_id = m.rowid + ORDER BY ppoints.code; + +SELECT cdb_crankshaft._cdb_random_seeds(1234); + -- Spatial Hotspots SELECT ppoints.code, m.quads FROM ppoints @@ -61,6 +80,24 @@ SELECT ppoints2.code, m.quads SELECT cdb_crankshaft._cdb_random_seeds(1234); +-- Moran's I local rate +SELECT + ppoints2.code, m.quads, + abs(avg(m.orig_val_std) OVER ()) < 1e-6 as diff_orig, + CASE WHEN m.quads = 'HL' THEN m.orig_val_std > m.spatial_lag_std + WHEN m.quads = 'HH' THEN m.orig_val_std >= 0 and m.spatial_lag_std >= 0 + WHEN m.quads = 'LH' THEN m.orig_val_std < m.spatial_lag_std + WHEN m.quads = 'LL' THEN m.orig_val_std <= 0 and m.spatial_lag_std <= 0 + ELSE null END as expected, + moran_stat is not null moran_stat_not_null, + significance >= 0.001 significance_not_null -- greater than 1/1000 (default) + FROM ppoints2 + JOIN cdb_crankshaft.CDB_MoransILocalRate('SELECT * FROM ppoints2', 'numerator', 'denominator') m + ON ppoints2.cartodb_id = m.rowid + ORDER BY ppoints2.code; + +SELECT cdb_crankshaft._cdb_random_seeds(1234); + -- Spatial Hotspots (rate) SELECT ppoints2.code, m.quads FROM ppoints2 From bba6a0f58e30797271a2a73117d761fce8f79484 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Thu, 1 Mar 2018 10:40:12 -0500 Subject: [PATCH 09/12] updates docs to include new functions and notes deprecation of old ones --- doc/02_moran.md | 114 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 108 insertions(+), 6 deletions(-) diff --git a/doc/02_moran.md b/doc/02_moran.md index a00c33e..e610860 100644 --- a/doc/02_moran.md +++ b/doc/02_moran.md @@ -1,4 +1,6 @@ -## Areas of Interest Functions +## Moran's I - Spatial Autocorrelation + +Note: these were formerly called _Areas of Interest_. 
A family of analyses to uncover groupings of areas with consistently high or low values (clusters) and smaller areas with values unlike those around them (outliers). A cluster is labeled by an 'HH' (high value compared to the entire dataset in an area with other high values), or its opposite 'LL'. An outlier is labeled by an 'LH' (low value surrounded by high values) or an 'HL' (the opposite). Each cluster and outlier classification has an associated p-value, a measure of how significant the pattern of highs and lows is compared to a random distribution. @@ -9,7 +11,107 @@ These functions have two forms: local and global. The local versions classify ev * Rows with null values will be omitted from this analysis. To ensure they are added to the analysis, fill the null-valued cells with an appropriate value such as the mean of a column, the mean of the most recent two time steps, or use a `LEFT JOIN` to get null outputs from the analysis. * Input query can only accept tables (datasets) in the users database account. Common table expressions (CTEs) do not work as an input unless specified within the `subquery` argument. -### CDB_AreasOfInterestLocal(subquery text, column_name text) +### CDB_MoransILocal(subquery text, column_name text) + + +This function classifies your data as being part of a cluster, as an outlier, or not part of a pattern based on the significance of a classification. The classification happens through an autocorrelation statistic called Local Moran's I. + +#### Arguments + +| Name | Type | Description | +|------|------|-------------| +| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM interesting_table`). This query must have the geometry column name `the_geom` and id column name `cartodb_id` unless otherwise specified in the input arguments | +| column_name | TEXT | Name of column (e.g., should be `'interesting_value'` instead of `interesting_value` without single quotes) used for the analysis. 
| +| weight type (optional) | TEXT | Type of weight to use when finding neighbors. Currently available options are 'knn' (default) and 'queen'. Read more about weight types in [PySAL's weights documentation](https://pysal.readthedocs.io/en/v1.11.0/users/tutorials/weights.html). | +| num_ngbrs (optional) | INT | Number of neighbors if using k-nearest neighbors weight type. Defaults to 5. | +| permutations (optional) | INT | Number of permutations to check against a random arrangement of the values in `column_name`. This influences the accuracy of the output field `significance`. Defaults to 99. | +| geom_col (optional) | TEXT | The column name for the geometries. Defaults to `'the_geom'` | +| id_col (optional) | TEXT | The column name for the unique ID of each geometry/value pair. Defaults to `'cartodb_id'`. | + +#### Returns + +A table with the following columns. + +| Column Name | Type | Description | +|-------------|------|-------------| +| quads | TEXT | Classification of geometry. Result is one of 'HH' (a high value with neighbors high on average), 'LL' (opposite of 'HH'), 'HL' (a high value surrounded by lows on average), and 'LH' (opposite of 'HL'). Null values are returned when nulls exist in the original data. | +| significance | NUMERIC | The statistical significance (from 0 to 1) of a cluster or outlier classification. Lower numbers are more significant. | +| spatial\_lag | NUMERIC | The 'average' of the neighbors of the value in this row. The average is calculated from it's neighborhood -- defined by `weight_type`. | +| spatial\_lag\_std | NUMERIC | The standardized version of `spatial\_lag` -- that is, centered on the mean and divided by the standard deviation. | +| orig\_val | NUMERIC | Values from `'column_name'`. | +| orig\_val\_std | NUMERIC | Values from `'column_name'` but centered on the mean and divided by the standard devation. Useful as the x-axis in Moran's I scatter plots. 
| +| moran\_stat | NUMERIC | Value of Moran's I (spatial autocorrelation measure) for the geometry with id of `rowid` | +| rowid | INT | Row id of the values which correspond to the input rows. | + + + +#### Example Usage + +```sql +SELECT + c.the_geom, + aoi.quads, + aoi.significance, + c.num_cyclists_per_total_population +FROM + cdb_crankshaft.CDB_MoransILocal( + 'SELECT * FROM commute_data' + 'num_cyclists_per_total_population') As aoi +JOIN commute_data As c +ON c.cartodb_id = aoi.rowid; +``` + + +### CDB_MoransILocalRate(subquery text, numerator text, denominator text) + +Just like `CDB_MoransILocal`, this function classifies your data as being part of a cluster, as an outlier, or not part of a pattern based on the significance of a classification. This function differs in that it calculates the classifications based on input `numerator` and `denominator` columns for finding the areas where there are clusters and outliers for the resulting rate of those two values. + +#### Arguments + +| Name | Type | Description | +|------|------|-------------| +| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM interesting_table`). This query must have the geometry column name `the_geom` and id column name `cartodb_id` unless otherwise specified in the input arguments | +| numerator | TEXT | Name of the numerator for forming a rate to be used in analysis. | +| denominator | TEXT | Name of the denominator for forming a rate to be used in analysis. | +| weight type (optional) | TEXT | Type of weight to use when finding neighbors. Currently available options are 'knn' (default) and 'queen'. Read more about weight types in [PySAL's weights documentation](https://pysal.readthedocs.io/en/v1.11.0/users/tutorials/weights.html). | +| num_ngbrs (optional) | INT | Number of neighbors if using k-nearest neighbors weight type. Defaults to 5.
| +| permutations (optional) | INT | Number of permutations to check against a random arrangement of the values in `column_name`. This influences the accuracy of the output field `significance`. Defaults to 99. | +| geom_col (optional) | TEXT | The column name for the geometries. Defaults to `'the_geom'` | +| id_col (optional) | TEXT | The column name for the unique ID of each geometry/value pair. Defaults to `'cartodb_id'`. | + +#### Returns + +A table with the following columns. + +| Column Name | Type | Description | +|-------------|------|-------------| +| quads | TEXT | Classification of geometry. Result is one of 'HH' (a high value with neighbors high on average), 'LL' (opposite of 'HH'), 'HL' (a high value surrounded by lows on average), and 'LH' (opposite of 'HL'). Null values are returned when nulls exist in the original data. | +| significance | NUMERIC | The statistical significance (from 0 to 1) of a cluster or outlier classification. Lower numbers are more significant. | +| spatial\_lag | NUMERIC | The 'average' of the neighbors of the value in this row. The average is calculated from it's neighborhood -- defined by `weight_type`. | +| spatial\_lag\_std | NUMERIC | The standardized version of `spatial\_lag` -- that is, centered on the mean and divided by the standard deviation. | +| orig\_val | NUMERIC | Standardized rate (centered on the mean and normalized by the standard deviation) calculated from `numerator` and `denominator`. This is calculated by [Assuncao Rate](http://pysal.readthedocs.io/en/latest/library/esda/smoothing.html?highlight=assuncao#pysal.esda.smoothing.assuncao_rate) in the PySAL library. | +| orig\_val\_std | NUMERIC | Values from `'column_name'` but centered on the mean and divided by the standard devation. Useful as the x-axis in Moran's I scatter plots. 
| +| moran\_stat | NUMERIC | Value of Moran's I (spatial autocorrelation measure) for the geometry with id of `rowid` | +| rowid | INT | Row id of the values which correspond to the input rows. | +A table with the following columns. + +#### Example Usage + +```sql +SELECT + c.the_geom, + aoi.quads, + aoi.significance, + c.cyclists_per_total_population +FROM + cdb_crankshaft.CDB_MoransILocalRate( + 'SELECT * FROM commute_data' + 'num_cyclists', + 'total_population') As aoi +JOIN commute_data As c +ON c.cartodb_id = aoi.rowid; +``` +### CDB_AreasOfInterestLocal(subquery text, column_name text) (deprecated) This function classifies your data as being part of a cluster, as an outlier, or not part of a pattern based the significance of a classification. The classification happens through an autocorrelation statistic called Local Moran's I. @@ -55,7 +157,7 @@ JOIN commute_data As c ON c.cartodb_id = aoi.rowid; ``` -### CDB_AreasOfInterestGlobal(subquery text, column_name text) +### CDB_AreasOfInterestGlobal(subquery text, column_name text) (deprecated) This function identifies the extent to which geometries cluster (the groupings of geometries with similarly high or low values relative to the mean) or form outliers (areas where geometries have values opposite of their neighbors). The output of this function gives values between -1 and 1 as well as a significance of that classification. Values close to 0 mean that there is little to no distribution of values as compared to what one would see in a randomly distributed collection of geometries and values. 
@@ -91,7 +193,7 @@ FROM 'num_cyclists_per_total_population') ``` -### CDB_AreasOfInterestLocalRate(subquery text, numerator_column text, denominator_column text) +### CDB_AreasOfInterestLocalRate(subquery text, numerator_column text, denominator_column text) (deprecated) Just like `CDB_AreasOfInterestLocal`, this function classifies your data as being part of a cluster, as an outlier, or not part of a pattern based the significance of a classification. This function differs in that it calculates the classifications based on input `numerator` and `denominator` columns for finding the areas where there are clusters and outliers for the resulting rate of those two values. @@ -138,7 +240,7 @@ JOIN commute_data As c ON c.cartodb_id = aoi.rowid; ``` -### CDB_AreasOfInterestGlobalRate(subquery text, column_name text) +### CDB_AreasOfInterestGlobalRate(subquery text, column_name text) (deprecated) This function identifies the extent to which geometries cluster (the groupings of geometries with similarly high or low values relative to the mean) or form outliers (areas where geometries have values opposite of their neighbors). The output of this function gives values between -1 and 1 as well as a significance of that classification. Values close to 0 mean that there is little to no distribution of values as compared to what one would see in a randomly distributed collection of geometries and values. @@ -178,7 +280,7 @@ FROM ## Hotspot, Coldspot, and Outlier Functions -These functions are convenience functions for extracting only information that you are interested in exposing based on the outputs of the `CDB_AreasOfInterest` functions. For instance, you can use `CDB_GetSpatialHotspots` to output only the classifications of `HH` and `HL`. +These functions are convenience functions for extracting only information that you are interested in exposing based on the outputs of the `CDB_MoransI*` functions. 
For instance, you can use `CDB_GetSpatialHotspots` to output only the classifications of `HH` and `HL`. ### Non-rate functions From 3cadc2dd95fdd3ec164508925d67fd4002364265 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Thu, 1 Mar 2018 11:16:42 -0500 Subject: [PATCH 10/12] minor docs updates --- doc/02_moran.md | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/doc/02_moran.md b/doc/02_moran.md index e610860..797b660 100644 --- a/doc/02_moran.md +++ b/doc/02_moran.md @@ -37,9 +37,9 @@ A table with the following columns. | quads | TEXT | Classification of geometry. Result is one of 'HH' (a high value with neighbors high on average), 'LL' (opposite of 'HH'), 'HL' (a high value surrounded by lows on average), and 'LH' (opposite of 'HL'). Null values are returned when nulls exist in the original data. | | significance | NUMERIC | The statistical significance (from 0 to 1) of a cluster or outlier classification. Lower numbers are more significant. | | spatial\_lag | NUMERIC | The 'average' of the neighbors of the value in this row. The average is calculated from it's neighborhood -- defined by `weight_type`. | -| spatial\_lag\_std | NUMERIC | The standardized version of `spatial\_lag` -- that is, centered on the mean and divided by the standard deviation. | -| orig\_val | NUMERIC | Values from `'column_name'`. | -| orig\_val\_std | NUMERIC | Values from `'column_name'` but centered on the mean and divided by the standard devation. Useful as the x-axis in Moran's I scatter plots. | +| spatial\_lag\_std | NUMERIC | The standardized version of `spatial_lag` -- that is, centered on the mean and divided by the standard deviation. Useful as the y-axis in a Moran's I scatter plot. | +| orig\_val | NUMERIC | Values from `column_name`. | +| orig\_val\_std | NUMERIC | Values from `column_name` but centered on the mean and divided by the standard deviation. Useful as the x-axis in Moran's I scatter plots.
| | moran\_stat | NUMERIC | Value of Moran's I (spatial autocorrelation measure) for the geometry with id of `rowid` | | rowid | INT | Row id of the values which correspond to the input rows. | @@ -50,15 +50,15 @@ A table with the following columns. ```sql SELECT c.the_geom, - aoi.quads, - aoi.significance, + m.quads, + m.significance, c.num_cyclists_per_total_population FROM cdb_crankshaft.CDB_MoransILocal( 'SELECT * FROM commute_data' - 'num_cyclists_per_total_population') As aoi + 'num_cyclists_per_total_population') As m JOIN commute_data As c -ON c.cartodb_id = aoi.rowid; +ON c.cartodb_id = m.rowid; ``` @@ -76,8 +76,8 @@ Just like `CDB_MoransILocal`, this function classifies your data as being part o | weight type (optional) | TEXT | Type of weight to use when finding neighbors. Currently available options are 'knn' (default) and 'queen'. Read more about weight types in [PySAL's weights documentation](https://pysal.readthedocs.io/en/v1.11.0/users/tutorials/weights.html). | | num_ngbrs (optional) | INT | Number of neighbors if using k-nearest neighbors weight type. Defaults to 5. | | permutations (optional) | INT | Number of permutations to check against a random arrangement of the values in `column_name`. This influences the accuracy of the output field `significance`. Defaults to 99. | -| geom_col (optional) | TEXT | The column name for the geometries. Defaults to `'the_geom'` | -| id_col (optional) | TEXT | The column name for the unique ID of each geometry/value pair. Defaults to `'cartodb_id'`. | +| geom_col (optional) | TEXT | The column name for the geometries. Defaults to `the_geom` | +| id_col (optional) | TEXT | The column name for the unique ID of each geometry/value pair. Defaults to `cartodb_id`. | #### Returns @@ -88,29 +88,30 @@ A table with the following columns. | quads | TEXT | Classification of geometry. 
Result is one of 'HH' (a high value with neighbors high on average), 'LL' (opposite of 'HH'), 'HL' (a high value surrounded by lows on average), and 'LH' (opposite of 'HL'). Null values are returned when nulls exist in the original data. | | significance | NUMERIC | The statistical significance (from 0 to 1) of a cluster or outlier classification. Lower numbers are more significant. | | spatial\_lag | NUMERIC | The 'average' of the neighbors of the value in this row. The average is calculated from it's neighborhood -- defined by `weight_type`. | -| spatial\_lag\_std | NUMERIC | The standardized version of `spatial\_lag` -- that is, centered on the mean and divided by the standard deviation. | +| spatial\_lag\_std | NUMERIC | The standardized version of `spatial_lag` -- that is, centered on the mean and divided by the standard deviation. | | orig\_val | NUMERIC | Standardized rate (centered on the mean and normalized by the standard deviation) calculated from `numerator` and `denominator`. This is calculated by [Assuncao Rate](http://pysal.readthedocs.io/en/latest/library/esda/smoothing.html?highlight=assuncao#pysal.esda.smoothing.assuncao_rate) in the PySAL library. | -| orig\_val\_std | NUMERIC | Values from `'column_name'` but centered on the mean and divided by the standard devation. Useful as the x-axis in Moran's I scatter plots. | +| orig\_val\_std | NUMERIC | Values from `column_name` but centered on the mean and divided by the standard deviation. Useful as the x-axis in Moran's I scatter plots. | | moran\_stat | NUMERIC | Value of Moran's I (spatial autocorrelation measure) for the geometry with id of `rowid` | | rowid | INT | Row id of the values which correspond to the input rows. | -A table with the following columns. +A table with the following columns.
| #### Example Usage ```sql SELECT c.the_geom, - aoi.quads, - aoi.significance, + m.quads, + m.significance, c.cyclists_per_total_population FROM cdb_crankshaft.CDB_MoransILocalRate( 'SELECT * FROM commute_data' 'num_cyclists', - 'total_population') As aoi + 'total_population') As m JOIN commute_data As c -ON c.cartodb_id = aoi.rowid; +ON c.cartodb_id = m.rowid; ``` + ### CDB_AreasOfInterestLocal(subquery text, column_name text) (deprecated) This function classifies your data as being part of a cluster, as an outlier, or not part of a pattern based the significance of a classification. The classification happens through an autocorrelation statistic called Local Moran's I. From 7b99a88d5ebb99ae93e534184ef55ebe51c7d122 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Mon, 12 Mar 2018 10:06:39 -0400 Subject: [PATCH 11/12] adds more information to the notes --- doc/02_moran.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/02_moran.md b/doc/02_moran.md index 797b660..b869492 100644 --- a/doc/02_moran.md +++ b/doc/02_moran.md @@ -1,6 +1,6 @@ ## Moran's I - Spatial Autocorrelation -Note: these were formerly called _Areas of Interest_. +Note: these functions are replacing the functions in the _Areas of Interest_ family (still documented below). `CDB_MoransILocal` and `CDB_MoransILocalRate` perform the same analysis as their `CDB_AreasOfInterest*` counterparts but return spatial lag information, which is needed for creating the Moran's I scatter plot. It is recommended to use the `CDB_MoransILocal*` variants instead as they will be maintained and improved going forward. A family of analyses to uncover groupings of areas with consistently high or low values (clusters) and smaller areas with values unlike those around them (outliers). A cluster is labeled by an 'HH' (high value compared to the entire dataset in an area with other high values), or its opposite 'LL'.
An outlier is labeled by an 'LH' (low value surrounded by high values) or an 'HL' (the opposite). Each cluster and outlier classification has an associated p-value, a measure of how significant the pattern of highs and lows is compared to a random distribution. From b86384bb9433f75d4ab0600ab8bb3ff4dd2364ad Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Mon, 12 Mar 2018 11:16:51 -0400 Subject: [PATCH 12/12] updates news --- NEWS.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/NEWS.md b/NEWS.md index 19aa56c..2260f7e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +0.8.0 (yyyy-mm-dd) +------------------ +* Adds `CDB_MoransILocal*` functions that return spatial lag [#202](https://github.com/CartoDB/crankshaft/pull/202) + 0.7.0 (2018-02-23) ------------------ * Updated Moran and Markov documentation [#179](https://github.com/CartoDB/crankshaft/pull/179) [#155](https://github.com/CartoDB/crankshaft/pull/155)