diff --git a/NEWS.md b/NEWS.md index 201da51..0b8c2da 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ -0.0.1 (2016-03-15) +0.0.2 (2016-03-16) +------------------ +* New versioning approach using per-version Python virtual environments + +0.0.1 (2016-02-22) ------------------ * Preliminar release diff --git a/release/crankshaft--0.0.1--0.0.2.sql b/release/crankshaft--0.0.1--0.0.2.sql new file mode 100644 index 0000000..60c6ecd --- /dev/null +++ b/release/crankshaft--0.0.1--0.0.2.sql @@ -0,0 +1,74 @@ +CREATE OR REPLACE FUNCTION cdb_crankshaft.cdb_crankshaft_version() +RETURNS text AS $$ + SELECT '0.0.2'::text; +$$ language 'sql' STABLE STRICT; + +CREATE OR REPLACE FUNCTION cdb_crankshaft._cdb_crankshaft_internal_version() +RETURNS text AS $$ + SELECT installed_version FROM pg_available_extensions where name='crankshaft' and pg_available_extensions IS NOT NULL; +$$ language 'sql' STABLE STRICT; +CREATE OR REPLACE FUNCTION cdb_crankshaft._cdb_crankshaft_virtualenvs_path() +RETURNS text +AS $$ + BEGIN + RETURN '/home/ubuntu/crankshaft/envs'; + END; +$$ language plpgsql IMMUTABLE STRICT; + +CREATE OR REPLACE FUNCTION cdb_crankshaft._cdb_crankshaft_activate_py() +RETURNS VOID +AS $$ + import os + # plpy.notice('%',str(os.environ)) + # activate virtualenv + crankshaft_version = plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_internal_version()')[0]['_cdb_crankshaft_internal_version'] + base_path = plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_virtualenvs_path()')[0]['_cdb_crankshaft_virtualenvs_path'] + default_venv_path = os.path.join(base_path, crankshaft_version) + venv_path = os.environ.get('CRANKSHAFT_VENV', default_venv_path) + activate_path = venv_path + '/bin/activate_this.py' + exec(open(activate_path).read(), dict(__file__=activate_path)) +$$ LANGUAGE plpythonu; + +CREATE OR REPLACE FUNCTION +cdb_crankshaft._cdb_random_seeds (seed_value INTEGER) RETURNS VOID +AS $$ + plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()') + from crankshaft import random_seeds + random_seeds.set_random_seeds(seed_value) +$$ LANGUAGE plpythonu; +-- Moran's I +CREATE OR REPLACE FUNCTION +cdb_crankshaft.cdb_moran_local ( + t TEXT, + attr TEXT, + significance float DEFAULT 0.05, + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_column TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id', + w_type TEXT DEFAULT 'knn') +RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT) +AS $$ + plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()') + from crankshaft.clustering import moran_local + # TODO: use named parameters or a dictionary + return moran_local(t, attr, significance, num_ngbrs, permutations, geom_column, id_col, w_type) +$$ LANGUAGE plpythonu; + +CREATE OR REPLACE FUNCTION +cdb_crankshaft.cdb_moran_local_rate(t TEXT, + numerator TEXT, + denominator TEXT, + significance FLOAT DEFAULT 0.05, + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_column TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id', + w_type TEXT DEFAULT 'knn') +RETURNS TABLE(moran FLOAT, quads TEXT, significance FLOAT, ids INT, y numeric) +AS $$ + plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()') + from crankshaft.clustering import moran_local_rate + # TODO: use named parameters or a dictionary + return moran_local_rate(t, numerator, denominator, significance, num_ngbrs, permutations, geom_column, id_col, w_type) +$$ LANGUAGE plpythonu; diff --git a/release/crankshaft--0.0.2--0.0.1.sql b/release/crankshaft--0.0.2--0.0.1.sql new file mode 100644 index 0000000..1bfdcba --- /dev/null +++ b/release/crankshaft--0.0.2--0.0.1.sql @@ -0,0 +1,44 @@ +CREATE OR REPLACE FUNCTION +cdb_crankshaft._cdb_random_seeds (seed_value INTEGER) RETURNS VOID +AS $$ + from crankshaft import random_seeds + random_seeds.set_random_seeds(seed_value) +$$ LANGUAGE plpythonu; + +CREATE OR REPLACE FUNCTION +cdb_crankshaft.cdb_moran_local ( + t TEXT, + attr TEXT, + significance float DEFAULT 0.05, + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_column TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id', + w_type TEXT DEFAULT 'knn') +RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT) +AS $$ + from crankshaft.clustering import moran_local + # TODO: use named parameters or a dictionary + return moran_local(t, attr, significance, num_ngbrs, permutations, geom_column, id_col, w_type) +$$ LANGUAGE plpythonu; + +CREATE OR REPLACE FUNCTION +cdb_crankshaft.cdb_moran_local_rate(t TEXT, + numerator TEXT, + denominator TEXT, + significance FLOAT DEFAULT 0.05, + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_column TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id', + w_type TEXT DEFAULT 'knn') +RETURNS TABLE(moran FLOAT, quads TEXT, significance FLOAT, ids INT, y numeric) +AS $$ + from crankshaft.clustering import moran_local_rate + # TODO: use named parameters or a dictionary + return moran_local_rate(t, numerator, denominator, significance, num_ngbrs, permutations, geom_column, id_col, w_type) +$$ LANGUAGE plpythonu; + +DROP FUNCTION IF EXISTS cdb_crankshaft.cdb_crankshaft_version(); +DROP FUNCTION IF EXISTS cdb_crankshaft._cdb_crankshaft_internal_version(); +DROP FUNCTION IF EXISTS cdb_crankshaft._cdb_crankshaft_activate_py(); diff --git a/release/crankshaft--0.0.2.sql b/release/crankshaft--0.0.2.sql new file mode 100644 index 0000000..daf8d27 --- /dev/null +++ b/release/crankshaft--0.0.2.sql @@ -0,0 +1,186 @@ +--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES +-- Complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit +-- Version number of the extension release +CREATE OR REPLACE FUNCTION cdb_crankshaft_version() +RETURNS text AS $$ + SELECT '0.0.2'::text; +$$ language 'sql' STABLE STRICT; + +-- Internal identifier of the installed extension instence +-- e.g. 'dev' for current development version +CREATE OR REPLACE FUNCTION _cdb_crankshaft_internal_version() +RETURNS text AS $$ + SELECT installed_version FROM pg_available_extensions where name='crankshaft' and pg_available_extensions IS NOT NULL; +$$ language 'sql' STABLE STRICT; +CREATE OR REPLACE FUNCTION _cdb_crankshaft_virtualenvs_path() +RETURNS text +AS $$ + BEGIN + -- RETURN '/opt/virtualenvs/crankshaft'; + RETURN '/home/ubuntu/crankshaft/envs'; + END; +$$ language plpgsql IMMUTABLE STRICT; + +-- Use the crankshaft python module +CREATE OR REPLACE FUNCTION _cdb_crankshaft_activate_py() +RETURNS VOID +AS $$ + import os + # plpy.notice('%',str(os.environ)) + # activate virtualenv + crankshaft_version = plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_internal_version()')[0]['_cdb_crankshaft_internal_version'] + base_path = plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_virtualenvs_path()')[0]['_cdb_crankshaft_virtualenvs_path'] + default_venv_path = os.path.join(base_path, crankshaft_version) + venv_path = os.environ.get('CRANKSHAFT_VENV', default_venv_path) + activate_path = venv_path + '/bin/activate_this.py' + exec(open(activate_path).read(), dict(__file__=activate_path)) +$$ LANGUAGE plpythonu; +-- Internal function. +-- Set the seeds of the RNGs (Random Number Generators) +-- used internally. +CREATE OR REPLACE FUNCTION +_cdb_random_seeds (seed_value INTEGER) RETURNS VOID +AS $$ + plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()') + from crankshaft import random_seeds + random_seeds.set_random_seeds(seed_value) +$$ LANGUAGE plpythonu; +-- Moran's I +CREATE OR REPLACE FUNCTION + cdb_moran_local ( + t TEXT, + attr TEXT, + significance float DEFAULT 0.05, + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_column TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id', + w_type TEXT DEFAULT 'knn') +RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT) +AS $$ + plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()') + from crankshaft.clustering import moran_local + # TODO: use named parameters or a dictionary + return moran_local(t, attr, significance, num_ngbrs, permutations, geom_column, id_col, w_type) +$$ LANGUAGE plpythonu; + +-- Moran's I Local Rate +CREATE OR REPLACE FUNCTION + cdb_moran_local_rate(t TEXT, + numerator TEXT, + denominator TEXT, + significance FLOAT DEFAULT 0.05, + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_column TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id', + w_type TEXT DEFAULT 'knn') +RETURNS TABLE(moran FLOAT, quads TEXT, significance FLOAT, ids INT, y numeric) +AS $$ + plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()') + from crankshaft.clustering import moran_local_rate + # TODO: use named parameters or a dictionary + return moran_local_rate(t, numerator, denominator, significance, num_ngbrs, permutations, geom_column, id_col, w_type) +$$ LANGUAGE plpythonu; +-- Function by Stuart Lynn for a simple interpolation of a value +-- from a polygon table over an arbitrary polygon +-- (weighted by the area proportion overlapped) +-- Aereal weighting is a very simple form of aereal interpolation. +-- +-- Parameters: +-- * geom a Polygon geometry which defines the area where a value will be +-- estimated as the area-weighted sum of a given table/column +-- * target_table_name table name of the table that provides the values +-- * target_column column name of the column that provides the values +-- * schema_name optional parameter to defina the schema the target table +-- belongs to, which is necessary if its not in the search_path. +-- Note that target_table_name should never include the schema in it. +-- Return value: +-- Aereal-weighted interpolation of the column values over the geometry +CREATE OR REPLACE +FUNCTION cdb_overlap_sum(geom geometry, target_table_name text, target_column text, schema_name text DEFAULT NULL) + RETURNS numeric AS +$$ +DECLARE + result numeric; + qualified_name text; +BEGIN + IF schema_name IS NULL THEN + qualified_name := Format('%I', target_table_name); + ELSE + qualified_name := Format('%I.%s', schema_name, target_table_name); + END IF; + EXECUTE Format(' + SELECT sum(%I*ST_Area(St_Intersection($1, a.the_geom))/ST_Area(a.the_geom)) + FROM %s AS a + WHERE $1 && a.the_geom + ', target_column, qualified_name) + USING geom + INTO result; + RETURN result; +END; +$$ LANGUAGE plpgsql; +-- +-- Creates N points randomly distributed arround the polygon +-- +-- @param g - the geometry to be turned in to points +-- +-- @param no_points - the number of points to generate +-- +-- @params max_iter_per_point - the function generates points in the polygon's bounding box +-- and discards points which don't lie in the polygon. max_iter_per_point specifies how many +-- misses per point the funciton accepts before giving up. +-- +-- Returns: Multipoint with the requested points +CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry , no_points Integer, max_iter_per_point Integer DEFAULT 1000) +RETURNS GEOMETRY AS $$ +DECLARE + extent GEOMETRY; + test_point Geometry; + width NUMERIC; + height NUMERIC; + x0 NUMERIC; + y0 NUMERIC; + xp NUMERIC; + yp NUMERIC; + no_left INTEGER; + remaining_iterations INTEGER; + points GEOMETRY[]; + bbox_line GEOMETRY; + intersection_line GEOMETRY; +BEGIN + extent := ST_Envelope(geom); + width := ST_XMax(extent) - ST_XMIN(extent); + height := ST_YMax(extent) - ST_YMIN(extent); + x0 := ST_XMin(extent); + y0 := ST_YMin(extent); + no_left := no_points; + + LOOP + if(no_left=0) THEN + EXIT; + END IF; + yp = y0 + height*random(); + bbox_line = ST_MakeLine( + ST_SetSRID(ST_MakePoint(yp, x0),4326), + ST_SetSRID(ST_MakePoint(yp, x0+width),4326) + ); + intersection_line = ST_Intersection(bbox_line,geom); + test_point = ST_LineInterpolatePoint(st_makeline(st_linemerge(intersection_line)),random()); + points := points || test_point; + no_left = no_left - 1 ; + END LOOP; + RETURN ST_Collect(points); +END; +$$ +LANGUAGE plpgsql VOLATILE; +-- Make sure by default there are no permissions for publicuser +-- NOTE: this happens at extension creation time, as part of an implicit transaction. +-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE; + +-- Grant permissions on the schema to publicuser (but just the schema) +GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser; + +-- Revoke execute permissions on all functions in the schema by default +-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser; diff --git a/release/crankshaft.control b/release/crankshaft.control index 74777ac..49c0d22 100644 --- a/release/crankshaft.control +++ b/release/crankshaft.control @@ -1,5 +1,5 @@ comment = 'CartoDB Spatial Analysis extension' -default_version = '0.0.1' +default_version = '0.0.2' requires = 'plpythonu, postgis, cartodb' superuser = true schema = cdb_crankshaft diff --git a/release/python/0.0.2/crankshaft/crankshaft/__init__.py b/release/python/0.0.2/crankshaft/crankshaft/__init__.py new file mode 100644 index 0000000..d07e330 --- /dev/null +++ b/release/python/0.0.2/crankshaft/crankshaft/__init__.py @@ -0,0 +1,2 @@ +import random_seeds +import clustering diff --git a/release/python/0.0.2/crankshaft/crankshaft/clustering/__init__.py b/release/python/0.0.2/crankshaft/crankshaft/clustering/__init__.py new file mode 100644 index 0000000..0df080f --- /dev/null +++ b/release/python/0.0.2/crankshaft/crankshaft/clustering/__init__.py @@ -0,0 +1 @@ +from moran import * diff --git a/release/python/0.0.2/crankshaft/crankshaft/clustering/moran.py b/release/python/0.0.2/crankshaft/crankshaft/clustering/moran.py new file mode 100644 index 0000000..8882235 --- /dev/null +++ b/release/python/0.0.2/crankshaft/crankshaft/clustering/moran.py @@ -0,0 +1,321 @@ +""" +Moran's I geostatistics (global clustering & outliers presence) +""" + +# TODO: Fill in local neighbors which have null/NoneType values with the +# average of the their neighborhood + +import numpy as np +import pysal as ps +import plpy + +# High level interface --------------------------------------- + +def moran_local(t, attr, significance, num_ngbrs, permutations, geom_column, id_col, w_type): + """ + Moran's I implementation for PL/Python + Andy Eschbacher + """ + # TODO: ensure that the significance output can be smaller that 1e-3 (0.001) + # TODO: make a wishlist of output features (zscores, pvalues, raw local lisa, what else?) + + plpy.notice('** Constructing query') + + # geometries with attributes that are null are ignored + # resulting in a collection of not as near neighbors + + qvals = {"id_col": id_col, + "attr1": attr, + "geom_col": geom_column, + "table": t, + "num_ngbrs": num_ngbrs} + + q = get_query(w_type, qvals) + + try: + r = plpy.execute(q) + plpy.notice('** Query returned with %d rows' % len(r)) + except plpy.SPIError: + plpy.notice('** Query failed: "%s"' % q) + plpy.notice('** Exiting function') + return zip([None], [None], [None], [None]) + + y = get_attributes(r, 1) + w = get_weight(r, w_type) + + # calculate LISA values + lisa = ps.Moran_Local(y, w) + + # find units of significance + lisa_sig = lisa_sig_vals(lisa.p_sim, lisa.q, significance) + + plpy.notice('** Finished calculations') + + return zip(lisa.Is, lisa_sig, lisa.p_sim, w.id_order) + + +def moran_local_rate(t, numerator, denominator, significance, num_ngbrs, permutations, geom_column, id_col, w_type): + """ + Moran's I Local Rate + Andy Eschbacher + """ + + plpy.notice('** Constructing query') + + # geometries with attributes that are null are ignored + # resulting in a collection of not as near neighbors + + qvals = {"id_col": id_col, + "numerator": numerator, + "denominator": denominator, + "geom_col": geom_column, + "table": t, + "num_ngbrs": num_ngbrs} + + q = get_query(w_type, qvals) + + try: + r = plpy.execute(q) + plpy.notice('** Query returned with %d rows' % len(r)) + except plpy.SPIError: + plpy.notice('** Query failed: "%s"' % q) + plpy.notice('** Error: %s' % plpy.SPIError) + plpy.notice('** Exiting function') + return zip([None], [None], [None], [None]) + + plpy.notice('r.nrows() = %d' % r.nrows()) + + ## collect attributes + numer = get_attributes(r, 1) + denom = get_attributes(r, 2) + + w = get_weight(r, w_type, num_ngbrs) + + # calculate LISA values + lisa = ps.esda.moran.Moran_Local_Rate(numer, denom, w, permutations=permutations) + + # find units of significance + lisa_sig = lisa_sig_vals(lisa.p_sim, lisa.q, significance) + + plpy.notice('** Finished calculations') + + ## TODO: Decide on which return values here + return zip(lisa.Is, lisa_sig, lisa.p_sim, w.id_order, lisa.y) + +def moran_local_bv(t, attr1, attr2, significance, num_ngbrs, permutations, geom_column, id_col, w_type): + plpy.notice('** Constructing query') + + qvals = {"num_ngbrs": num_ngbrs, + "attr1": attr1, + "attr2": attr2, + "table": t, + "geom_col": geom_column, + "id_col": id_col} + + q = get_query(w_type, qvals) + + try: + r = plpy.execute(q) + plpy.notice('** Query returned with %d rows' % len(r)) + except plpy.SPIError: + plpy.notice('** Query failed: "%s"' % q) + plpy.notice('** Error: %s' % plpy.SPIError) + plpy.notice('** Exiting function') + return zip([None], [None], [None], [None]) + + ## collect attributes + attr1_vals = get_attributes(r, 1) + attr2_vals = get_attributes(r, 2) + + # create weights + w = get_weight(r, w_type, num_ngbrs) + + # calculate LISA values + lisa = ps.esda.moran.Moran_Local_BV(attr1_vals, attr2_vals, w) + + plpy.notice("len of Is: %d" % len(lisa.Is)) + + # find clustering of significance + lisa_sig = lisa_sig_vals(lisa.p_sim, lisa.q, significance) + + plpy.notice('** Finished calculations') + + return zip(lisa.Is, lisa_sig, lisa.p_sim, w.id_order) + + +# Low level functions ---------------------------------------- + +def map_quads(coord): + """ + Map a quadrant number to Moran's I designation + HH=1, LH=2, LL=3, HL=4 + Input: + :param coord (int): quadrant of a specific measurement + """ + if coord == 1: + return 'HH' + elif coord == 2: + return 'LH' + elif coord == 3: + return 'LL' + elif coord == 4: + return 'HL' + else: + return None + +def query_attr_select(params): + """ + Create portion of SELECT statement for attributes inolved in query. + :param params: dict of information used in query (column names, + table name, etc.) + """ + + attrs = [k for k in params + if k not in ('id_col', 'geom_col', 'table', 'num_ngbrs')] + + template = "i.\"{%(col)s}\"::numeric As attr%(alias_num)s, " + + attr_string = "" + + for idx, val in enumerate(sorted(attrs)): + attr_string += template % {"col": val, "alias_num": idx + 1} + + return attr_string + +def query_attr_where(params): + """ + Create portion of WHERE clauses for weeding out NULL-valued geometries + """ + attrs = sorted([k for k in params + if k not in ('id_col', 'geom_col', 'table', 'num_ngbrs')]) + + attr_string = [] + + for attr in attrs: + attr_string.append("idx_replace.\"{%s}\" IS NOT NULL" % attr) + + if len(attrs) == 2: + attr_string.append("idx_replace.\"{%s}\" <> 0" % attrs[1]) + + out = " AND ".join(attr_string) + + return out + +def knn(params): + """SQL query for k-nearest neighbors. + :param vars: dict of values to fill template + """ + + attr_select = query_attr_select(params) + attr_where = query_attr_where(params) + + replacements = {"attr_select": attr_select, + "attr_where_i": attr_where.replace("idx_replace", "i"), + "attr_where_j": attr_where.replace("idx_replace", "j")} + + query = "SELECT " \ + "i.\"{id_col}\" As id, " \ + "%(attr_select)s" \ + "(SELECT ARRAY(SELECT j.\"{id_col}\" " \ + "FROM \"{table}\" As j " \ + "WHERE %(attr_where_j)s " \ + "ORDER BY j.\"{geom_col}\" <-> i.\"{geom_col}\" ASC " \ + "LIMIT {num_ngbrs} OFFSET 1 ) " \ + ") As neighbors " \ + "FROM \"{table}\" As i " \ + "WHERE " \ + "%(attr_where_i)s " \ + "ORDER BY i.\"{id_col}\" ASC;" % replacements + + return query.format(**params) + +## SQL query for finding queens neighbors (all contiguous polygons) +def queen(params): + """SQL query for queen neighbors. + :param params: dict of information to fill query + """ + attr_select = query_attr_select(params) + attr_where = query_attr_where(params) + + replacements = {"attr_select": attr_select, + "attr_where_i": attr_where.replace("idx_replace", "i"), + "attr_where_j": attr_where.replace("idx_replace", "j")} + + query = "SELECT " \ + "i.\"{id_col}\" As id, " \ + "%(attr_select)s" \ + "(SELECT ARRAY(SELECT j.\"{id_col}\" " \ + "FROM \"{table}\" As j " \ + "WHERE ST_Touches(i.\"{geom_col}\", j.\"{geom_col}\") AND " \ + "%(attr_where_j)s)" \ + ") As neighbors " \ + "FROM \"{table}\" As i " \ + "WHERE " \ + "%(attr_where_i)s " \ + "ORDER BY i.\"{id_col}\" ASC;" % replacements + + return query.format(**params) + +## to add more weight methods open a ticket or pull request + +def get_query(w_type, query_vals): + """Return requested query. + :param w_type: type of neighbors to calculate (knn or queen) + :param query_vals: values used to construct the query + """ + + if w_type == 'knn': + return knn(query_vals) + else: + return queen(query_vals) + +def get_attributes(query_res, attr_num): + """ + :param query_res: query results with attributes and neighbors + :param attr_num: attribute number (1, 2, ...) + """ + return np.array([x['attr' + str(attr_num)] for x in query_res], dtype=np.float) + +## Build weight object +def get_weight(query_res, w_type='queen', num_ngbrs=5): + """ + Construct PySAL weight from return value of query + :param query_res: query results with attributes and neighbors + """ + if w_type == 'knn': + row_normed_weights = [1.0 / float(num_ngbrs)] * num_ngbrs + weights = {x['id']: row_normed_weights for x in query_res} + elif w_type == 'queen': + weights = {x['id']: [1.0 / len(x['neighbors'])] * len(x['neighbors']) + if len(x['neighbors']) > 0 + else [] for x in query_res} + + neighbors = {x['id']: x['neighbors'] for x in query_res} + + return ps.W(neighbors, weights) + +def quad_position(quads): + """ + Produce Moran's I classification based of n + """ + + lisa_sig = np.array([map_quads(q) for q in quads]) + + return lisa_sig + +def lisa_sig_vals(pvals, quads, threshold): + """ + Produce Moran's I classification based of n + """ + + sig = (pvals <= threshold) + + lisa_sig = np.empty(len(sig), np.chararray) + + for idx, val in enumerate(sig): + if val: + lisa_sig[idx] = map_quads(quads[idx]) + else: + lisa_sig[idx] = 'Not significant' + + return lisa_sig diff --git a/release/python/0.0.2/crankshaft/crankshaft/random_seeds.py b/release/python/0.0.2/crankshaft/crankshaft/random_seeds.py new file mode 100644 index 0000000..b7c8eed --- /dev/null +++ b/release/python/0.0.2/crankshaft/crankshaft/random_seeds.py @@ -0,0 +1,10 @@ +import random +import numpy + +def set_random_seeds(value): + """ + Set the seeds of the RNGs (Random Number Generators) + used internally. + """ + random.seed(value) + numpy.random.seed(value) diff --git a/release/python/0.0.2/crankshaft/setup.py b/release/python/0.0.2/crankshaft/setup.py new file mode 100644 index 0000000..7a709a1 --- /dev/null +++ b/release/python/0.0.2/crankshaft/setup.py @@ -0,0 +1,48 @@ + +""" +CartoDB Spatial Analysis Python Library +See: +https://github.com/CartoDB/crankshaft +""" + +from setuptools import setup, find_packages + +setup( + name='crankshaft', + + version='0.0.2', + + description='CartoDB Spatial Analysis Python Library', + + url='https://github.com/CartoDB/crankshaft', + + author='Data Services Team - CartoDB', + author_email='dataservices@cartodb.com', + + license='MIT', + + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Mapping comunity', + 'Topic :: Maps :: Mapping Tools', + 'License :: OSI Approved :: MIT License', + 'Programming Language :: Python :: 2.7', + ], + + keywords='maps mapping tools spatial analysis geostatistics', + + packages=find_packages(exclude=['contrib', 'docs', 'tests']), + + extras_require={ + 'dev': ['unittest'], + 'test': ['unittest', 'nose', 'mock'], + }, + + # The choice of component versions is dictated by what's + # provisioned in the production servers. + install_requires=['pysal==1.9.1'], + + requires=['pysal', 'numpy' ], + + test_suite='test' +) diff --git a/release/python/0.0.2/crankshaft/test/fixtures/moran.json b/release/python/0.0.2/crankshaft/test/fixtures/moran.json new file mode 100644 index 0000000..0530c18 --- /dev/null +++ b/release/python/0.0.2/crankshaft/test/fixtures/moran.json @@ -0,0 +1,52 @@ +[[0.9319096128346788, "HH"], +[-1.135787401862846, "HL"], +[0.11732030672508517, "Not significant"], +[0.6152779669180425, "Not significant"], +[-0.14657336660125297, "Not significant"], +[0.6967858120189607, "Not significant"], +[0.07949310115714454, "Not significant"], +[0.4703198759258987, "Not significant"], +[0.4421125200498064, "Not significant"], +[0.5724288737143592, "Not significant"], +[0.8970743435692062, "LL"], +[0.18327334401918674, "Not significant"], +[-0.01466729201304962, "Not significant"], +[0.3481559372544409, "Not significant"], +[0.06547094736902978, "Not significant"], +[0.15482141569329988, "HH"], +[0.4373841193538136, "Not significant"], +[0.15971286468915544, "Not significant"], +[1.0543588860308968, "Not significant"], +[1.7372866900020818, "HH"], +[1.091998586053999, "LL"], +[0.1171572584252222, "Not significant"], +[0.08438455015300014, "Not significant"], +[0.06547094736902978, "Not significant"], +[0.15482141569329985, "HH"], +[1.1627044812890683, "HH"], +[0.06547094736902978, "Not significant"], +[0.795275137550483, "Not significant"], +[0.18562939195219, "LL"], +[0.3010757406693439, "Not significant"], +[2.8205795942839376, "HH"], +[0.11259190602909264, "Not significant"], +[-0.07116352791516614, "Not significant"], +[-0.09945240794119009, "Not significant"], +[0.18562939195219, "LL"], +[0.1832733440191868, "Not significant"], +[-0.39054253768447705, "Not significant"], +[-0.1672071289487642, "HL"], +[0.3337669247916343, "Not significant"], +[0.2584386102554792, "Not significant"], +[-0.19733845476322634, "HL"], +[-0.9379282899805409, "LH"], +[-0.028770969951095866, "Not significant"], +[0.051367269430983485, "Not significant"], +[-0.2172548045913472, "LH"], +[0.05136726943098351, "Not significant"], +[0.04191046803899837, "Not significant"], +[0.7482357030403517, "HH"], +[-0.014585767863118111, "Not significant"], +[0.5410013139159929, "Not significant"], +[1.0223932668429925, "LL"], +[1.4179402898927476, "LL"]] diff --git a/release/python/0.0.2/crankshaft/test/fixtures/neighbors.json b/release/python/0.0.2/crankshaft/test/fixtures/neighbors.json new file mode 100644 index 0000000..055b359 --- /dev/null +++ b/release/python/0.0.2/crankshaft/test/fixtures/neighbors.json @@ -0,0 +1,54 @@ +[ + {"neighbors": [48, 26, 20, 9, 31], "id": 1, "value": 0.5}, + {"neighbors": [30, 16, 46, 3, 4], "id": 2, "value": 0.7}, + {"neighbors": [46, 30, 2, 12, 16], "id": 3, "value": 0.2}, + {"neighbors": [18, 30, 23, 2, 52], "id": 4, "value": 0.1}, + {"neighbors": [47, 40, 45, 37, 28], "id": 5, "value": 0.3}, + {"neighbors": [10, 21, 41, 14, 37], "id": 6, "value": 0.05}, + {"neighbors": [8, 17, 43, 25, 12], "id": 7, "value": 0.4}, + {"neighbors": [17, 25, 43, 22, 7], "id": 8, "value": 0.7}, + {"neighbors": [39, 34, 1, 26, 48], "id": 9, "value": 0.5}, + {"neighbors": [6, 37, 5, 45, 49], "id": 10, "value": 0.04}, + {"neighbors": [51, 41, 29, 21, 14], "id": 11, "value": 0.08}, + {"neighbors": [44, 46, 43, 50, 3], "id": 12, "value": 0.2}, + {"neighbors": [45, 23, 14, 28, 18], "id": 13, "value": 0.4}, + {"neighbors": [41, 29, 13, 23, 6], "id": 14, "value": 0.2}, + {"neighbors": [36, 27, 32, 33, 24], "id": 15, "value": 0.3}, + {"neighbors": [19, 2, 46, 44, 28], "id": 16, "value": 0.4}, + {"neighbors": [8, 25, 43, 7, 22], "id": 17, "value": 0.6}, + {"neighbors": [23, 4, 29, 14, 13], "id": 18, "value": 0.3}, + {"neighbors": [42, 16, 28, 26, 40], "id": 19, "value": 0.7}, + {"neighbors": [1, 48, 31, 26, 42], "id": 20, "value": 0.8}, + {"neighbors": [41, 6, 11, 14, 10], "id": 21, "value": 0.1}, + {"neighbors": [25, 50, 43, 31, 44], "id": 22, "value": 0.4}, + {"neighbors": [18, 13, 14, 4, 2], "id": 23, "value": 0.1}, + {"neighbors": [33, 49, 34, 47, 27], "id": 24, "value": 0.3}, + {"neighbors": [43, 8, 22, 17, 50], "id": 25, "value": 0.4}, + {"neighbors": [1, 42, 20, 31, 48], "id": 26, "value": 0.6}, + {"neighbors": [32, 15, 36, 33, 24], "id": 27, "value": 0.3}, + {"neighbors": [40, 45, 19, 5, 13], "id": 28, "value": 0.8}, + {"neighbors": [11, 51, 41, 14, 18], "id": 29, "value": 0.3}, + {"neighbors": [2, 3, 4, 46, 18], "id": 30, "value": 0.1}, + {"neighbors": [20, 26, 1, 50, 48], "id": 31, "value": 0.9}, + {"neighbors": [27, 36, 15, 49, 24], "id": 32, "value": 0.3}, + {"neighbors": [24, 27, 49, 34, 32], "id": 33, "value": 0.4}, + {"neighbors": [47, 9, 39, 40, 24], "id": 34, "value": 0.3}, + {"neighbors": [38, 51, 11, 21, 41], "id": 35, "value": 0.3}, + {"neighbors": [15, 32, 27, 49, 33], "id": 36, "value": 0.2}, + {"neighbors": [49, 10, 5, 47, 24], "id": 37, "value": 0.5}, + {"neighbors": [35, 21, 51, 11, 41], "id": 38, "value": 0.4}, + {"neighbors": [9, 34, 48, 1, 47], "id": 39, "value": 0.6}, + {"neighbors": [28, 47, 5, 9, 34], "id": 40, "value": 0.5}, + {"neighbors": [11, 14, 29, 21, 6], "id": 41, "value": 0.4}, + {"neighbors": [26, 19, 1, 9, 31], "id": 42, "value": 0.2}, + {"neighbors": [25, 12, 8, 22, 44], "id": 43, "value": 0.3}, + {"neighbors": [12, 50, 46, 16, 43], "id": 44, "value": 0.2}, + {"neighbors": [28, 13, 5, 40, 19], "id": 45, "value": 0.3}, + {"neighbors": [3, 12, 44, 2, 16], "id": 46, "value": 0.2}, + {"neighbors": [34, 40, 5, 49, 24], "id": 47, "value": 0.3}, + {"neighbors": [1, 20, 26, 9, 39], "id": 48, "value": 0.5}, + {"neighbors": [24, 37, 47, 5, 33], "id": 49, "value": 0.2}, + {"neighbors": [44, 22, 31, 42, 26], "id": 50, "value": 0.6}, + {"neighbors": [11, 29, 41, 14, 21], "id": 51, "value": 0.01}, + {"neighbors": [4, 18, 29, 51, 23], "id": 52, "value": 0.01} + ] diff --git a/release/python/0.0.2/crankshaft/test/helper.py b/release/python/0.0.2/crankshaft/test/helper.py new file mode 100644 index 0000000..7d28b94 --- /dev/null +++ b/release/python/0.0.2/crankshaft/test/helper.py @@ -0,0 +1,13 @@ +import unittest + +from mock_plpy import MockPlPy +plpy = MockPlPy() + +import sys +sys.modules['plpy'] = plpy + +import os + +def fixture_file(name): + dir = os.path.dirname(os.path.realpath(__file__)) + return os.path.join(dir, 'fixtures', name) diff --git a/release/python/0.0.2/crankshaft/test/mock_plpy.py b/release/python/0.0.2/crankshaft/test/mock_plpy.py new file mode 100644 index 0000000..63c88f6 --- /dev/null +++ b/release/python/0.0.2/crankshaft/test/mock_plpy.py @@ -0,0 +1,34 @@ +import re + +class MockPlPy: + def __init__(self): + self._reset() + + def _reset(self): + self.infos = [] + self.notices = [] + self.debugs = [] + self.logs = [] + self.warnings = [] + self.errors = [] + self.fatals = [] + self.executes = [] + self.results = [] + self.prepares = [] + self.results = [] + + def _define_result(self, query, result): + pattern = re.compile(query, re.IGNORECASE | re.MULTILINE) + self.results.append([pattern, result]) + + def notice(self, msg): + self.notices.append(msg) + + def info(self, msg): + self.infos.append(msg) + + def execute(self, query): # TODO: additional arguments + for result in self.results: + if result[0].match(query): + return result[1] + return [] diff --git a/release/python/0.0.2/crankshaft/test/test_clustering_moran.py b/release/python/0.0.2/crankshaft/test/test_clustering_moran.py new file mode 100644 index 0000000..2e730a1 --- /dev/null +++ b/release/python/0.0.2/crankshaft/test/test_clustering_moran.py @@ -0,0 +1,144 @@ +import unittest +import numpy as np + +import unittest + + +# from mock_plpy import MockPlPy +# plpy = MockPlPy() +# +# import sys +# sys.modules['plpy'] = plpy +from helper import plpy, fixture_file + +import crankshaft.clustering as cc +from crankshaft import random_seeds +import json + +class MoranTest(unittest.TestCase): + """Testing class for Moran's I functions.""" + + def setUp(self): + plpy._reset() + self.params = {"id_col": "cartodb_id", + "attr1": "andy", + "attr2": "jay_z", + "table": "a_list", + "geom_col": "the_geom", + "num_ngbrs": 321} + self.neighbors_data = json.loads(open(fixture_file('neighbors.json')).read()) + self.moran_data = json.loads(open(fixture_file('moran.json')).read()) + + def test_map_quads(self): + """Test map_quads.""" + self.assertEqual(cc.map_quads(1), 'HH') + self.assertEqual(cc.map_quads(2), 'LH') + self.assertEqual(cc.map_quads(3), 'LL') + self.assertEqual(cc.map_quads(4), 'HL') + self.assertEqual(cc.map_quads(33), None) + self.assertEqual(cc.map_quads('andy'), None) + + def test_query_attr_select(self): + """Test query_attr_select.""" + + ans = "i.\"{attr1}\"::numeric As attr1, " \ + "i.\"{attr2}\"::numeric As attr2, " + + self.assertEqual(cc.query_attr_select(self.params), ans) + + def test_query_attr_where(self): + """Test query_attr_where.""" + + ans = "idx_replace.\"{attr1}\" IS NOT NULL AND "\ + "idx_replace.\"{attr2}\" IS NOT NULL AND "\ + "idx_replace.\"{attr2}\" <> 0" + + self.assertEqual(cc.query_attr_where(self.params), ans) + + def test_knn(self): + """Test knn function.""" + + ans = "SELECT i.\"cartodb_id\" As id, i.\"andy\"::numeric As attr1, " \ + "i.\"jay_z\"::numeric As attr2, (SELECT ARRAY(SELECT j.\"cartodb_id\" " \ + "FROM \"a_list\" As j WHERE j.\"andy\" IS NOT NULL AND " \ + "j.\"jay_z\" IS NOT NULL AND j.\"jay_z\" <> 0 ORDER BY " \ + "j.\"the_geom\" <-> i.\"the_geom\" ASC LIMIT 321 OFFSET 1 ) ) " \ + "As neighbors FROM \"a_list\" As i WHERE i.\"andy\" IS NOT " \ + "NULL AND i.\"jay_z\" IS NOT NULL AND i.\"jay_z\" <> 0 ORDER " \ + "BY i.\"cartodb_id\" ASC;" + + self.assertEqual(cc.knn(self.params), ans) + + def test_queen(self): + """Test queen neighbors function.""" + + ans = "SELECT i.\"cartodb_id\" As id, i.\"andy\"::numeric As attr1, " \ + "i.\"jay_z\"::numeric As attr2, (SELECT ARRAY(SELECT " \ + "j.\"cartodb_id\" FROM \"a_list\" As j WHERE ST_Touches(" \ + "i.\"the_geom\", j.\"the_geom\") AND j.\"andy\" IS NOT NULL " \ + "AND j.\"jay_z\" IS NOT NULL AND j.\"jay_z\" <> 0)) As " \ + "neighbors FROM \"a_list\" As i WHERE i.\"andy\" IS NOT NULL " \ + "AND i.\"jay_z\" IS NOT NULL AND i.\"jay_z\" <> 0 ORDER BY " \ + "i.\"cartodb_id\" ASC;" + + self.assertEqual(cc.queen(self.params), ans) + + def test_get_query(self): + """Test get_query.""" + + ans = "SELECT i.\"cartodb_id\" As id, i.\"andy\"::numeric As attr1, " \ + "i.\"jay_z\"::numeric As attr2, (SELECT ARRAY(SELECT " \ + "j.\"cartodb_id\" FROM \"a_list\" As j WHERE j.\"andy\" IS " \ + "NOT NULL AND j.\"jay_z\" IS NOT NULL AND j.\"jay_z\" <> 0 " \ + "ORDER BY j.\"the_geom\" <-> i.\"the_geom\" ASC LIMIT 321 " \ + "OFFSET 1 ) ) As neighbors FROM \"a_list\" As i WHERE " \ + "i.\"andy\" IS NOT NULL AND i.\"jay_z\" IS NOT NULL AND " \ + "i.\"jay_z\" <> 0 ORDER BY i.\"cartodb_id\" ASC;" + + self.assertEqual(cc.get_query('knn', self.params), ans) + + def test_get_attributes(self): + """Test get_attributes.""" + + ## need to add tests + + self.assertEqual(True, True) + + def test_get_weight(self): + """Test get_weight.""" + + self.assertEqual(True, True) + + + def test_quad_position(self): + """Test lisa_sig_vals.""" + + quads = np.array([1, 2, 3, 4], np.int) + + ans = np.array(['HH', 'LH', 'LL', 'HL']) + test_ans = cc.quad_position(quads) + + self.assertTrue((test_ans == ans).all()) + + def test_moran_local(self): + """Test Moran's I local""" + data = [ { 'id': d['id'], 'attr1': d['value'], 'neighbors': d['neighbors'] } for d in self.neighbors_data] + plpy._define_result('select', data) + random_seeds.set_random_seeds(1234) + result = cc.moran_local('table', 'value', 0.05, 5, 99, 'the_geom', 'cartodb_id', 'knn') + result = [(row[0], row[1]) for row in result] + expected = self.moran_data + for ([res_val, res_quad], [exp_val, exp_quad]) in zip(result, expected): + self.assertAlmostEqual(res_val, exp_val) + self.assertEqual(res_quad, exp_quad) + + def test_moran_local_rate(self): + """Test Moran's I rate""" + data = [ { 'id': d['id'], 'attr1': d['value'], 'attr2': 1, 'neighbors': d['neighbors'] } for d in self.neighbors_data] + plpy._define_result('select', data) + random_seeds.set_random_seeds(1234) + result = cc.moran_local_rate('table', 'numerator', 'denominator', 0.05, 5, 99, 'the_geom', 'cartodb_id', 'knn') + result = [(row[0], row[1]) for row in result] + expected = self.moran_data + for ([res_val, res_quad], [exp_val, exp_quad]) in zip(result, expected): + self.assertAlmostEqual(res_val, exp_val) diff --git a/src/pg/crankshaft.control b/src/pg/crankshaft.control index 74777ac..49c0d22 100644 --- a/src/pg/crankshaft.control +++ b/src/pg/crankshaft.control @@ -1,5 +1,5 @@ comment = 'CartoDB Spatial Analysis extension' -default_version = '0.0.1' +default_version = '0.0.2' requires = 'plpythonu, postgis, cartodb' superuser = true schema = cdb_crankshaft