From 8b5e9102345fc2a7218961ef26c033715a441d6b Mon Sep 17 00:00:00 2001 From: Raul Ochoa Date: Thu, 16 Jun 2016 14:16:32 +0200 Subject: [PATCH] Release 0.0.3 --- NEWS.md | 5 + release/crankshaft--0.0.2--0.0.3.sql | 413 ++++++++++++++++++ release/crankshaft--0.0.3--0.0.2.sql | 209 +++++++++ release/crankshaft--0.0.3.sql | 403 +++++++++++++++++ release/crankshaft.control | 2 +- .../0.0.3/crankshaft/crankshaft/__init__.py | 2 + .../crankshaft/clustering/__init__.py | 2 + .../crankshaft/clustering/kmeans.py | 18 + .../crankshaft/crankshaft/clustering/moran.py | 260 +++++++++++ .../crankshaft/pysal_utils/__init__.py | 1 + .../crankshaft/pysal_utils/pysal_utils.py | 152 +++++++ .../crankshaft/crankshaft/random_seeds.py | 10 + release/python/0.0.3/crankshaft/setup.py | 48 ++ .../crankshaft/test/fixtures/kmeans.json | 1 + .../0.0.3/crankshaft/test/fixtures/moran.json | 52 +++ .../crankshaft/test/fixtures/neighbors.json | 54 +++ .../python/0.0.3/crankshaft/test/helper.py | 13 + .../python/0.0.3/crankshaft/test/mock_plpy.py | 34 ++ .../crankshaft/test/test_cluster_kmeans.py | 38 ++ .../crankshaft/test/test_clustering_moran.py | 83 ++++ .../0.0.3/crankshaft/test/test_pysal_utils.py | 107 +++++ src/pg/crankshaft.control | 2 +- 22 files changed, 1907 insertions(+), 2 deletions(-) create mode 100644 release/crankshaft--0.0.2--0.0.3.sql create mode 100644 release/crankshaft--0.0.3--0.0.2.sql create mode 100644 release/crankshaft--0.0.3.sql create mode 100644 release/python/0.0.3/crankshaft/crankshaft/__init__.py create mode 100644 release/python/0.0.3/crankshaft/crankshaft/clustering/__init__.py create mode 100644 release/python/0.0.3/crankshaft/crankshaft/clustering/kmeans.py create mode 100644 release/python/0.0.3/crankshaft/crankshaft/clustering/moran.py create mode 100644 release/python/0.0.3/crankshaft/crankshaft/pysal_utils/__init__.py create mode 100644 release/python/0.0.3/crankshaft/crankshaft/pysal_utils/pysal_utils.py create mode 100644 release/python/0.0.3/crankshaft/crankshaft/random_seeds.py create mode 100644 release/python/0.0.3/crankshaft/setup.py create mode 100644 release/python/0.0.3/crankshaft/test/fixtures/kmeans.json create mode 100644 release/python/0.0.3/crankshaft/test/fixtures/moran.json create mode 100644 release/python/0.0.3/crankshaft/test/fixtures/neighbors.json create mode 100644 release/python/0.0.3/crankshaft/test/helper.py create mode 100644 release/python/0.0.3/crankshaft/test/mock_plpy.py create mode 100644 release/python/0.0.3/crankshaft/test/test_cluster_kmeans.py create mode 100644 release/python/0.0.3/crankshaft/test/test_clustering_moran.py create mode 100644 release/python/0.0.3/crankshaft/test/test_pysal_utils.py diff --git a/NEWS.md b/NEWS.md index 0b8c2da..ed66fd9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +0.0.3 (2016-06-16) +------------------ +* Adds new functions: kmeans, weighted centroids. +* Replaces moran functions with new areas of interest naming. + 0.0.2 (2016-03-16) ------------------ * New versioning approach using per-version Python virtual environments diff --git a/release/crankshaft--0.0.2--0.0.3.sql b/release/crankshaft--0.0.2--0.0.3.sql new file mode 100644 index 0000000..8a865d5 --- /dev/null +++ b/release/crankshaft--0.0.2--0.0.3.sql @@ -0,0 +1,413 @@ +--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES +-- Complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit + +-- [MANUALLY] DROP FUNCTIONS REMOVED SINCE 0.0.2 version + +DROP FUNCTION IF EXISTS cdb_moran_local(TEXT, TEXT, float, INT, INT, TEXT, TEXT, TEXT); +DROP FUNCTION IF EXISTS cdb_moran_local_rate(TEXT, TEXT, TEXT, FLOAT, INT, INT, TEXT, TEXT, TEXT); +DROP FUNCTION IF EXISTS _cdb_crankshaft_virtualenvs_path(); +DROP FUNCTION IF EXISTS _cdb_crankshaft_activate_py(); + +-- [END MANUALLY] DROP FUNCTIONS REMOVED SINCE 0.0.2 version + +-- Version number of the extension release +CREATE OR REPLACE FUNCTION cdb_crankshaft_version() + RETURNS text AS $$ + SELECT '0.0.3'::text; +$$ language 'sql' STABLE STRICT; + +-- Internal identifier of the installed extension instence +-- e.g. 'dev' for current development version +CREATE OR REPLACE FUNCTION _cdb_crankshaft_internal_version() + RETURNS text AS $$ + SELECT installed_version FROM pg_available_extensions where name='crankshaft' and pg_available_extensions IS NOT NULL; +$$ language 'sql' STABLE STRICT; +-- Internal function. +-- Set the seeds of the RNGs (Random Number Generators) +-- used internally. +CREATE OR REPLACE FUNCTION + _cdb_random_seeds (seed_value INTEGER) RETURNS VOID +AS $$ + from crankshaft import random_seeds + random_seeds.set_random_seeds(seed_value) +$$ LANGUAGE plpythonu; +-- Moran's I Global Measure (public-facing) +CREATE OR REPLACE FUNCTION + CDB_AreasOfInterestGlobal( + subquery TEXT, + column_name TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') + RETURNS TABLE (moran NUMERIC, significance NUMERIC) +AS $$ + from crankshaft.clustering import moran_local + # TODO: use named parameters or a dictionary + return moran(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col) +$$ LANGUAGE plpythonu; + +-- Moran's I Local (internal function) +CREATE OR REPLACE FUNCTION + _CDB_AreasOfInterestLocal( + subquery TEXT, + column_name TEXT, + w_type TEXT, + num_ngbrs INT, + permutations INT, + geom_col TEXT, + id_col TEXT) + RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + from crankshaft.clustering import moran_local + # TODO: use named parameters or a dictionary + return moran_local(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col) +$$ LANGUAGE plpythonu; + +-- Moran's I Local (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_AreasOfInterestLocal( + subquery TEXT, + column_name TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') + RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col); + +$$ LANGUAGE SQL; + +-- Moran's I only for HH and HL (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_GetSpatialHotspots( + subquery TEXT, + column_name TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') + RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col) + WHERE quads IN ('HH', 'HL'); + +$$ LANGUAGE SQL; + +-- Moran's I only for LL and LH (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_GetSpatialColdspots( + subquery TEXT, + attr TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') + RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col) + WHERE quads IN ('LL', 'LH'); + +$$ LANGUAGE SQL; + +-- Moran's I only for LH and HL (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_GetSpatialOutliers( + subquery TEXT, + attr TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') + RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col) + WHERE quads IN ('HL', 'LH'); + +$$ LANGUAGE SQL; + +-- Moran's I Global Rate (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_AreasOfInterestGlobalRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') + RETURNS TABLE (moran FLOAT, significance FLOAT) +AS $$ + from crankshaft.clustering import moran_local + # TODO: use named parameters or a dictionary + return moran_rate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) +$$ LANGUAGE plpythonu; + + +-- Moran's I Local Rate (internal function) +CREATE OR REPLACE FUNCTION + _CDB_AreasOfInterestLocalRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT, + num_ngbrs INT, + permutations INT, + geom_col TEXT, + id_col TEXT) + RETURNS + TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + from crankshaft.clustering import moran_local_rate + # TODO: use named parameters or a dictionary + return moran_local_rate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) +$$ LANGUAGE plpythonu; + +-- Moran's I Local Rate (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_AreasOfInterestLocalRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') + RETURNS + TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col); + +$$ LANGUAGE SQL; + +-- Moran's I Local Rate only for HH and HL (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_GetSpatialHotspotsRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') + RETURNS + TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) + WHERE quads IN ('HH', 'HL'); + +$$ LANGUAGE SQL; + +-- Moran's I Local Rate only for LL and LH (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_GetSpatialColdspotsRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') + RETURNS + TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) + WHERE quads IN ('LL', 'LH'); + +$$ LANGUAGE SQL; + +-- Moran's I Local Rate only for LH and HL (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_GetSpatialOutliersRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') + RETURNS + TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) + WHERE quads IN ('HL', 'LH'); + +$$ LANGUAGE SQL; +CREATE OR REPLACE FUNCTION CDB_KMeans(query text, no_clusters integer,no_init integer default 20) + RETURNS table (cartodb_id integer, cluster_no integer) as $$ + + from crankshaft.clustering import kmeans + return kmeans(query,no_clusters,no_init) + +$$ language plpythonu; + + +CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(state Numeric[],the_geom GEOMETRY(Point, 4326), weight NUMERIC) + RETURNS Numeric[] AS + $$ +DECLARE + newX NUMERIC; + newY NUMERIC; + newW NUMERIC; +BEGIN + IF weight IS NULL OR the_geom IS NULL THEN + newX = state[1]; + newY = state[2]; + newW = state[3]; + ELSE + newX = state[1] + ST_X(the_geom)*weight; + newY = state[2] + ST_Y(the_geom)*weight; + newW = state[3] + weight; + END IF; + RETURN Array[newX,newY,newW]; + +END +$$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION CDB_WeightedMeanF(state Numeric[]) + RETURNS GEOMETRY AS + $$ +BEGIN + IF state[3] = 0 THEN + RETURN ST_SetSRID(ST_MakePoint(state[1],state[2]), 4326); + ELSE + RETURN ST_SETSRID(ST_MakePoint(state[1]/state[3], state[2]/state[3]),4326); + END IF; +END +$$ LANGUAGE plpgsql; + +CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC)( +SFUNC = CDB_WeightedMeanS, +FINALFUNC = CDB_WeightedMeanF, +STYPE = Numeric[], +INITCOND = "{0.0,0.0,0.0}" +); +-- Function by Stuart Lynn for a simple interpolation of a value +-- from a polygon table over an arbitrary polygon +-- (weighted by the area proportion overlapped) +-- Aereal weighting is a very simple form of aereal interpolation. +-- +-- Parameters: +-- * geom a Polygon geometry which defines the area where a value will be +-- estimated as the area-weighted sum of a given table/column +-- * target_table_name table name of the table that provides the values +-- * target_column column name of the column that provides the values +-- * schema_name optional parameter to defina the schema the target table +-- belongs to, which is necessary if its not in the search_path. +-- Note that target_table_name should never include the schema in it. +-- Return value: +-- Aereal-weighted interpolation of the column values over the geometry +CREATE OR REPLACE +FUNCTION cdb_overlap_sum(geom geometry, target_table_name text, target_column text, schema_name text DEFAULT NULL) + RETURNS numeric AS + $$ + DECLARE + result numeric; + qualified_name text; + BEGIN + IF schema_name IS NULL THEN + qualified_name := Format('%I', target_table_name); + ELSE + qualified_name := Format('%I.%s', schema_name, target_table_name); + END IF; + EXECUTE Format(' + SELECT sum(%I*ST_Area(St_Intersection($1, a.the_geom))/ST_Area(a.the_geom)) + FROM %s AS a + WHERE $1 && a.the_geom + ', target_column, qualified_name) + USING geom + INTO result; + RETURN result; + END; + $$ LANGUAGE plpgsql; +-- +-- Creates N points randomly distributed arround the polygon +-- +-- @param g - the geometry to be turned in to points +-- +-- @param no_points - the number of points to generate +-- +-- @params max_iter_per_point - the function generates points in the polygon's bounding box +-- and discards points which don't lie in the polygon. max_iter_per_point specifies how many +-- misses per point the funciton accepts before giving up. +-- +-- Returns: Multipoint with the requested points +CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry , no_points Integer, max_iter_per_point Integer DEFAULT 1000) + RETURNS GEOMETRY AS $$ +DECLARE + extent GEOMETRY; + test_point Geometry; + width NUMERIC; + height NUMERIC; + x0 NUMERIC; + y0 NUMERIC; + xp NUMERIC; + yp NUMERIC; + no_left INTEGER; + remaining_iterations INTEGER; + points GEOMETRY[]; + bbox_line GEOMETRY; + intersection_line GEOMETRY; +BEGIN + extent := ST_Envelope(geom); + width := ST_XMax(extent) - ST_XMIN(extent); + height := ST_YMax(extent) - ST_YMIN(extent); + x0 := ST_XMin(extent); + y0 := ST_YMin(extent); + no_left := no_points; + + LOOP + if(no_left=0) THEN + EXIT; + END IF; + yp = y0 + height*random(); + bbox_line = ST_MakeLine( + ST_SetSRID(ST_MakePoint(yp, x0),4326), + ST_SetSRID(ST_MakePoint(yp, x0+width),4326) + ); + intersection_line = ST_Intersection(bbox_line,geom); + test_point = ST_LineInterpolatePoint(st_makeline(st_linemerge(intersection_line)),random()); + points := points || test_point; + no_left = no_left - 1 ; + END LOOP; + RETURN ST_Collect(points); +END; +$$ +LANGUAGE plpgsql VOLATILE; +-- Make sure by default there are no permissions for publicuser +-- NOTE: this happens at extension creation time, as part of an implicit transaction. +-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE; + +-- Grant permissions on the schema to publicuser (but just the schema) +GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser; + +-- Revoke execute permissions on all functions in the schema by default +-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser; diff --git a/release/crankshaft--0.0.3--0.0.2.sql b/release/crankshaft--0.0.3--0.0.2.sql new file mode 100644 index 0000000..a2ccd2f --- /dev/null +++ b/release/crankshaft--0.0.3--0.0.2.sql @@ -0,0 +1,209 @@ +--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES +-- Complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit + +-- [MANUALLY] DROP FUNCTIONS INTRODUCED IN 0.0.3 version + +DROP FUNCTION IF EXISTS CDB_AreasOfInterestGlobal(TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT); +DROP FUNCTION IF EXISTS _CDB_AreasOfInterestLocal(TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT); +DROP FUNCTION IF EXISTS CDB_AreasOfInterestLocal(TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT); +DROP FUNCTION IF EXISTS CDB_GetSpatialHotspots(TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT); +DROP FUNCTION IF EXISTS CDB_GetSpatialColdspots(TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT); +DROP FUNCTION IF EXISTS CDB_GetSpatialOutliers(TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT); +DROP FUNCTION IF EXISTS CDB_AreasOfInterestGlobalRate(TEXT,TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT); +DROP FUNCTION IF EXISTS CDB_AreasOfInterestLocalRate(TEXT,TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT); +DROP FUNCTION IF EXISTS _CDB_AreasOfInterestLocalRate(TEXT,TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT); +DROP FUNCTION IF EXISTS CDB_GetSpatialHotspotsRate(TEXT,TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT); +DROP FUNCTION IF EXISTS CDB_GetSpatialColdspotsRate(TEXT,TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT); +DROP FUNCTION IF EXISTS CDB_GetSpatialOutliersRate(TEXT,TEXT,TEXT,TEXT,INT,INT,TEXT,TEXT); +DROP FUNCTION IF EXISTS CDB_KMeans(text,integer,integer); +DROP AGGREGATE IF EXISTS CDB_WeightedMean(geometry(Point, 4326), NUMERIC); +DROP FUNCTION IF EXISTS CDB_WeightedMeanS(Numeric[], GEOMETRY(Point, 4326), NUMERIC); +DROP FUNCTION IF EXISTS CDB_WeightedMeanF(Numeric[]); + + +-- [END MANUALLY] DROP FUNCTIONS INTRODUCED IN 0.0.3 version + +-- Version number of the extension release +CREATE OR REPLACE FUNCTION cdb_crankshaft_version() +RETURNS text AS $$ + SELECT '0.0.2'::text; +$$ language 'sql' STABLE STRICT; + +-- Internal identifier of the installed extension instence +-- e.g. 'dev' for current development version +CREATE OR REPLACE FUNCTION _cdb_crankshaft_internal_version() +RETURNS text AS $$ + SELECT installed_version FROM pg_available_extensions where name='crankshaft' and pg_available_extensions IS NOT NULL; +$$ language 'sql' STABLE STRICT; +CREATE OR REPLACE FUNCTION _cdb_crankshaft_virtualenvs_path() +RETURNS text +AS $$ + BEGIN + -- RETURN '/opt/virtualenvs/crankshaft'; + RETURN '/home/ubuntu/crankshaft/envs'; + END; +$$ language plpgsql IMMUTABLE STRICT; + +-- Use the crankshaft python module +CREATE OR REPLACE FUNCTION _cdb_crankshaft_activate_py() +RETURNS VOID +AS $$ + import os + # plpy.notice('%',str(os.environ)) + # activate virtualenv + crankshaft_version = plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_internal_version()')[0]['_cdb_crankshaft_internal_version'] + base_path = plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_virtualenvs_path()')[0]['_cdb_crankshaft_virtualenvs_path'] + default_venv_path = os.path.join(base_path, crankshaft_version) + venv_path = os.environ.get('CRANKSHAFT_VENV', default_venv_path) + activate_path = venv_path + '/bin/activate_this.py' + exec(open(activate_path).read(), dict(__file__=activate_path)) +$$ LANGUAGE plpythonu; +-- Internal function. +-- Set the seeds of the RNGs (Random Number Generators) +-- used internally. +CREATE OR REPLACE FUNCTION +_cdb_random_seeds (seed_value INTEGER) RETURNS VOID +AS $$ + plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()') + from crankshaft import random_seeds + random_seeds.set_random_seeds(seed_value) +$$ LANGUAGE plpythonu; +-- Moran's I +CREATE OR REPLACE FUNCTION + cdb_moran_local ( + t TEXT, + attr TEXT, + significance float DEFAULT 0.05, + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_column TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id', + w_type TEXT DEFAULT 'knn') +RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT) +AS $$ + plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()') + from crankshaft.clustering import moran_local + # TODO: use named parameters or a dictionary + return moran_local(t, attr, significance, num_ngbrs, permutations, geom_column, id_col, w_type) +$$ LANGUAGE plpythonu; + +-- Moran's I Local Rate +CREATE OR REPLACE FUNCTION + cdb_moran_local_rate(t TEXT, + numerator TEXT, + denominator TEXT, + significance FLOAT DEFAULT 0.05, + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_column TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id', + w_type TEXT DEFAULT 'knn') +RETURNS TABLE(moran FLOAT, quads TEXT, significance FLOAT, ids INT, y numeric) +AS $$ + plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()') + from crankshaft.clustering import moran_local_rate + # TODO: use named parameters or a dictionary + return moran_local_rate(t, numerator, denominator, significance, num_ngbrs, permutations, geom_column, id_col, w_type) +$$ LANGUAGE plpythonu; +-- Function by Stuart Lynn for a simple interpolation of a value +-- from a polygon table over an arbitrary polygon +-- (weighted by the area proportion overlapped) +-- Aereal weighting is a very simple form of aereal interpolation. +-- +-- Parameters: +-- * geom a Polygon geometry which defines the area where a value will be +-- estimated as the area-weighted sum of a given table/column +-- * target_table_name table name of the table that provides the values +-- * target_column column name of the column that provides the values +-- * schema_name optional parameter to defina the schema the target table +-- belongs to, which is necessary if its not in the search_path. +-- Note that target_table_name should never include the schema in it. +-- Return value: +-- Aereal-weighted interpolation of the column values over the geometry +CREATE OR REPLACE +FUNCTION cdb_overlap_sum(geom geometry, target_table_name text, target_column text, schema_name text DEFAULT NULL) + RETURNS numeric AS +$$ +DECLARE + result numeric; + qualified_name text; +BEGIN + IF schema_name IS NULL THEN + qualified_name := Format('%I', target_table_name); + ELSE + qualified_name := Format('%I.%s', schema_name, target_table_name); + END IF; + EXECUTE Format(' + SELECT sum(%I*ST_Area(St_Intersection($1, a.the_geom))/ST_Area(a.the_geom)) + FROM %s AS a + WHERE $1 && a.the_geom + ', target_column, qualified_name) + USING geom + INTO result; + RETURN result; +END; +$$ LANGUAGE plpgsql; +-- +-- Creates N points randomly distributed arround the polygon +-- +-- @param g - the geometry to be turned in to points +-- +-- @param no_points - the number of points to generate +-- +-- @params max_iter_per_point - the function generates points in the polygon's bounding box +-- and discards points which don't lie in the polygon. max_iter_per_point specifies how many +-- misses per point the funciton accepts before giving up. +-- +-- Returns: Multipoint with the requested points +CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry , no_points Integer, max_iter_per_point Integer DEFAULT 1000) +RETURNS GEOMETRY AS $$ +DECLARE + extent GEOMETRY; + test_point Geometry; + width NUMERIC; + height NUMERIC; + x0 NUMERIC; + y0 NUMERIC; + xp NUMERIC; + yp NUMERIC; + no_left INTEGER; + remaining_iterations INTEGER; + points GEOMETRY[]; + bbox_line GEOMETRY; + intersection_line GEOMETRY; +BEGIN + extent := ST_Envelope(geom); + width := ST_XMax(extent) - ST_XMIN(extent); + height := ST_YMax(extent) - ST_YMIN(extent); + x0 := ST_XMin(extent); + y0 := ST_YMin(extent); + no_left := no_points; + + LOOP + if(no_left=0) THEN + EXIT; + END IF; + yp = y0 + height*random(); + bbox_line = ST_MakeLine( + ST_SetSRID(ST_MakePoint(yp, x0),4326), + ST_SetSRID(ST_MakePoint(yp, x0+width),4326) + ); + intersection_line = ST_Intersection(bbox_line,geom); + test_point = ST_LineInterpolatePoint(st_makeline(st_linemerge(intersection_line)),random()); + points := points || test_point; + no_left = no_left - 1 ; + END LOOP; + RETURN ST_Collect(points); +END; +$$ +LANGUAGE plpgsql VOLATILE; +-- Make sure by default there are no permissions for publicuser +-- NOTE: this happens at extension creation time, as part of an implicit transaction. +-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE; + +-- Grant permissions on the schema to publicuser (but just the schema) +GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser; + +-- Revoke execute permissions on all functions in the schema by default +-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser; diff --git a/release/crankshaft--0.0.3.sql b/release/crankshaft--0.0.3.sql new file mode 100644 index 0000000..caacd75 --- /dev/null +++ b/release/crankshaft--0.0.3.sql @@ -0,0 +1,403 @@ +--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES +-- Complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit +-- Version number of the extension release +CREATE OR REPLACE FUNCTION cdb_crankshaft_version() +RETURNS text AS $$ + SELECT '0.0.3'::text; +$$ language 'sql' STABLE STRICT; + +-- Internal identifier of the installed extension instence +-- e.g. 'dev' for current development version +CREATE OR REPLACE FUNCTION _cdb_crankshaft_internal_version() +RETURNS text AS $$ + SELECT installed_version FROM pg_available_extensions where name='crankshaft' and pg_available_extensions IS NOT NULL; +$$ language 'sql' STABLE STRICT; +-- Internal function. +-- Set the seeds of the RNGs (Random Number Generators) +-- used internally. +CREATE OR REPLACE FUNCTION +_cdb_random_seeds (seed_value INTEGER) RETURNS VOID +AS $$ + from crankshaft import random_seeds + random_seeds.set_random_seeds(seed_value) +$$ LANGUAGE plpythonu; +-- Moran's I Global Measure (public-facing) +CREATE OR REPLACE FUNCTION + CDB_AreasOfInterestGlobal( + subquery TEXT, + column_name TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS TABLE (moran NUMERIC, significance NUMERIC) +AS $$ + from crankshaft.clustering import moran_local + # TODO: use named parameters or a dictionary + return moran(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col) +$$ LANGUAGE plpythonu; + +-- Moran's I Local (internal function) +CREATE OR REPLACE FUNCTION + _CDB_AreasOfInterestLocal( + subquery TEXT, + column_name TEXT, + w_type TEXT, + num_ngbrs INT, + permutations INT, + geom_col TEXT, + id_col TEXT) +RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + from crankshaft.clustering import moran_local + # TODO: use named parameters or a dictionary + return moran_local(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col) +$$ LANGUAGE plpythonu; + +-- Moran's I Local (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_AreasOfInterestLocal( + subquery TEXT, + column_name TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col); + +$$ LANGUAGE SQL; + +-- Moran's I only for HH and HL (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_GetSpatialHotspots( + subquery TEXT, + column_name TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') + RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col) + WHERE quads IN ('HH', 'HL'); + +$$ LANGUAGE SQL; + +-- Moran's I only for LL and LH (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_GetSpatialColdspots( + subquery TEXT, + attr TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') + RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col) + WHERE quads IN ('LL', 'LH'); + +$$ LANGUAGE SQL; + +-- Moran's I only for LH and HL (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_GetSpatialOutliers( + subquery TEXT, + attr TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') + RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col) + WHERE quads IN ('HL', 'LH'); + +$$ LANGUAGE SQL; + +-- Moran's I Global Rate (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_AreasOfInterestGlobalRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS TABLE (moran FLOAT, significance FLOAT) +AS $$ + from crankshaft.clustering import moran_local + # TODO: use named parameters or a dictionary + return moran_rate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) +$$ LANGUAGE plpythonu; + + +-- Moran's I Local Rate (internal function) +CREATE OR REPLACE FUNCTION + _CDB_AreasOfInterestLocalRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT, + num_ngbrs INT, + permutations INT, + geom_col TEXT, + id_col TEXT) +RETURNS +TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + from crankshaft.clustering import moran_local_rate + # TODO: use named parameters or a dictionary + return moran_local_rate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) +$$ LANGUAGE plpythonu; + +-- Moran's I Local Rate (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_AreasOfInterestLocalRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS +TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col); + +$$ LANGUAGE SQL; + +-- Moran's I Local Rate only for HH and HL (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_GetSpatialHotspotsRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS +TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) + WHERE quads IN ('HH', 'HL'); + +$$ LANGUAGE SQL; + +-- Moran's I Local Rate only for LL and LH (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_GetSpatialColdspotsRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS +TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) + WHERE quads IN ('LL', 'LH'); + +$$ LANGUAGE SQL; + +-- Moran's I Local Rate only for LH and HL (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_GetSpatialOutliersRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS +TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) + WHERE quads IN ('HL', 'LH'); + +$$ LANGUAGE SQL; +CREATE OR REPLACE FUNCTION CDB_KMeans(query text, no_clusters integer,no_init integer default 20) +RETURNS table (cartodb_id integer, cluster_no integer) as $$ + + from crankshaft.clustering import kmeans + return kmeans(query,no_clusters,no_init) + +$$ language plpythonu; + + +CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(state Numeric[],the_geom GEOMETRY(Point, 4326), weight NUMERIC) +RETURNS Numeric[] AS +$$ +DECLARE + newX NUMERIC; + newY NUMERIC; + newW NUMERIC; +BEGIN + IF weight IS NULL OR the_geom IS NULL THEN + newX = state[1]; + newY = state[2]; + newW = state[3]; + ELSE + newX = state[1] + ST_X(the_geom)*weight; + newY = state[2] + ST_Y(the_geom)*weight; + newW = state[3] + weight; + END IF; + RETURN Array[newX,newY,newW]; + +END +$$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION CDB_WeightedMeanF(state Numeric[]) +RETURNS GEOMETRY AS +$$ +BEGIN + IF state[3] = 0 THEN + RETURN ST_SetSRID(ST_MakePoint(state[1],state[2]), 4326); + ELSE + RETURN ST_SETSRID(ST_MakePoint(state[1]/state[3], state[2]/state[3]),4326); + END IF; +END +$$ LANGUAGE plpgsql; + +CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC)( + SFUNC = CDB_WeightedMeanS, + FINALFUNC = CDB_WeightedMeanF, + STYPE = Numeric[], + INITCOND = "{0.0,0.0,0.0}" +); +-- Function by Stuart Lynn for a simple interpolation of a value +-- from a polygon table over an arbitrary polygon +-- (weighted by the area proportion overlapped) +-- Aereal weighting is a very simple form of aereal interpolation. +-- +-- Parameters: +-- * geom a Polygon geometry which defines the area where a value will be +-- estimated as the area-weighted sum of a given table/column +-- * target_table_name table name of the table that provides the values +-- * target_column column name of the column that provides the values +-- * schema_name optional parameter to defina the schema the target table +-- belongs to, which is necessary if its not in the search_path. +-- Note that target_table_name should never include the schema in it. +-- Return value: +-- Aereal-weighted interpolation of the column values over the geometry +CREATE OR REPLACE +FUNCTION cdb_overlap_sum(geom geometry, target_table_name text, target_column text, schema_name text DEFAULT NULL) + RETURNS numeric AS +$$ +DECLARE + result numeric; + qualified_name text; +BEGIN + IF schema_name IS NULL THEN + qualified_name := Format('%I', target_table_name); + ELSE + qualified_name := Format('%I.%s', schema_name, target_table_name); + END IF; + EXECUTE Format(' + SELECT sum(%I*ST_Area(St_Intersection($1, a.the_geom))/ST_Area(a.the_geom)) + FROM %s AS a + WHERE $1 && a.the_geom + ', target_column, qualified_name) + USING geom + INTO result; + RETURN result; +END; +$$ LANGUAGE plpgsql; +-- +-- Creates N points randomly distributed arround the polygon +-- +-- @param g - the geometry to be turned in to points +-- +-- @param no_points - the number of points to generate +-- +-- @params max_iter_per_point - the function generates points in the polygon's bounding box +-- and discards points which don't lie in the polygon. max_iter_per_point specifies how many +-- misses per point the funciton accepts before giving up. +-- +-- Returns: Multipoint with the requested points +CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry , no_points Integer, max_iter_per_point Integer DEFAULT 1000) +RETURNS GEOMETRY AS $$ +DECLARE + extent GEOMETRY; + test_point Geometry; + width NUMERIC; + height NUMERIC; + x0 NUMERIC; + y0 NUMERIC; + xp NUMERIC; + yp NUMERIC; + no_left INTEGER; + remaining_iterations INTEGER; + points GEOMETRY[]; + bbox_line GEOMETRY; + intersection_line GEOMETRY; +BEGIN + extent := ST_Envelope(geom); + width := ST_XMax(extent) - ST_XMIN(extent); + height := ST_YMax(extent) - ST_YMIN(extent); + x0 := ST_XMin(extent); + y0 := ST_YMin(extent); + no_left := no_points; + + LOOP + if(no_left=0) THEN + EXIT; + END IF; + yp = y0 + height*random(); + bbox_line = ST_MakeLine( + ST_SetSRID(ST_MakePoint(yp, x0),4326), + ST_SetSRID(ST_MakePoint(yp, x0+width),4326) + ); + intersection_line = ST_Intersection(bbox_line,geom); + test_point = ST_LineInterpolatePoint(st_makeline(st_linemerge(intersection_line)),random()); + points := points || test_point; + no_left = no_left - 1 ; + END LOOP; + RETURN ST_Collect(points); +END; +$$ +LANGUAGE plpgsql VOLATILE; +-- Make sure by default there are no permissions for publicuser +-- NOTE: this happens at extension creation time, as part of an implicit transaction. +-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE; + +-- Grant permissions on the schema to publicuser (but just the schema) +GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser; + +-- Revoke execute permissions on all functions in the schema by default +-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser; diff --git a/release/crankshaft.control b/release/crankshaft.control index 49c0d22..2029b7e 100644 --- a/release/crankshaft.control +++ b/release/crankshaft.control @@ -1,5 +1,5 @@ comment = 'CartoDB Spatial Analysis extension' -default_version = '0.0.2' +default_version = '0.0.3' requires = 'plpythonu, postgis, cartodb' superuser = true schema = cdb_crankshaft diff --git a/release/python/0.0.3/crankshaft/crankshaft/__init__.py b/release/python/0.0.3/crankshaft/crankshaft/__init__.py new file mode 100644 index 0000000..d07e330 --- /dev/null +++ b/release/python/0.0.3/crankshaft/crankshaft/__init__.py @@ -0,0 +1,2 @@ +import random_seeds +import clustering diff --git a/release/python/0.0.3/crankshaft/crankshaft/clustering/__init__.py b/release/python/0.0.3/crankshaft/crankshaft/clustering/__init__.py new file mode 100644 index 0000000..338e8ea --- /dev/null +++ b/release/python/0.0.3/crankshaft/crankshaft/clustering/__init__.py @@ -0,0 +1,2 @@ +from moran import * +from kmeans import * diff --git a/release/python/0.0.3/crankshaft/crankshaft/clustering/kmeans.py b/release/python/0.0.3/crankshaft/crankshaft/clustering/kmeans.py new file mode 100644 index 0000000..4134062 --- /dev/null +++ b/release/python/0.0.3/crankshaft/crankshaft/clustering/kmeans.py @@ -0,0 +1,18 @@ +from sklearn.cluster import KMeans +import plpy + +def kmeans(query, no_clusters, no_init=20): + data = plpy.execute('''select array_agg(cartodb_id order by cartodb_id) as ids, + array_agg(ST_X(the_geom) order by cartodb_id) xs, + array_agg(ST_Y(the_geom) order by cartodb_id) ys from ({query}) a + where the_geom is not null + '''.format(query=query)) + + xs = data[0]['xs'] + ys = data[0]['ys'] + ids = data[0]['ids'] + + km = KMeans(n_clusters= no_clusters, n_init=no_init) + labels = km.fit_predict(zip(xs,ys)) + return zip(ids,labels) + diff --git a/release/python/0.0.3/crankshaft/crankshaft/clustering/moran.py b/release/python/0.0.3/crankshaft/crankshaft/clustering/moran.py new file mode 100644 index 0000000..39b3ff6 --- /dev/null +++ b/release/python/0.0.3/crankshaft/crankshaft/clustering/moran.py @@ -0,0 +1,260 @@ +""" +Moran's I geostatistics (global clustering & outliers presence) +""" + +# TODO: Fill in local neighbors which have null/NoneType values with the +# average of the their neighborhood + +import pysal as ps +import plpy + +# crankshaft module +import crankshaft.pysal_utils as pu + +# High level interface --------------------------------------- + +def moran(subquery, attr_name, + w_type, num_ngbrs, permutations, geom_col, id_col): + """ + Moran's I (global) + Implementation building neighbors with a PostGIS database and Moran's I + core clusters with PySAL. + Andy Eschbacher + """ + qvals = {"id_col": id_col, + "attr1": attr_name, + "geom_col": geom_col, + "subquery": subquery, + "num_ngbrs": num_ngbrs} + + query = pu.construct_neighbor_query(w_type, qvals) + + plpy.notice('** Query: %s' % query) + + try: + result = plpy.execute(query) + # if there are no neighbors, exit + if len(result) == 0: + return pu.empty_zipped_array(2) + plpy.notice('** Query returned with %d rows' % len(result)) + except plpy.SPIError: + plpy.error('Error: areas of interest query failed, check input parameters') + plpy.notice('** Query failed: "%s"' % query) + plpy.notice('** Error: %s' % plpy.SPIError) + return pu.empty_zipped_array(2) + + ## collect attributes + attr_vals = pu.get_attributes(result) + + ## calculate weights + weight = pu.get_weight(result, w_type, num_ngbrs) + + ## calculate moran global + moran_global = ps.esda.moran.Moran(attr_vals, weight, + permutations=permutations) + + return zip([moran_global.I], [moran_global.EI]) + +def moran_local(subquery, attr, + w_type, num_ngbrs, permutations, geom_col, id_col): + """ + Moran's I implementation for PL/Python + Andy Eschbacher + """ + + # geometries with attributes that are null are ignored + # resulting in a collection of not as near neighbors + + qvals = {"id_col": id_col, + "attr1": attr, + "geom_col": geom_col, + "subquery": subquery, + "num_ngbrs": num_ngbrs} + + query = pu.construct_neighbor_query(w_type, qvals) + + try: + result = plpy.execute(query) + # if there are no neighbors, exit + if len(result) == 0: + return pu.empty_zipped_array(5) + except plpy.SPIError: + plpy.error('Error: areas of interest query failed, check input parameters') + plpy.notice('** Query failed: "%s"' % query) + return pu.empty_zipped_array(5) + + attr_vals = pu.get_attributes(result) + weight = pu.get_weight(result, w_type, num_ngbrs) + + # calculate LISA values + lisa = ps.esda.moran.Moran_Local(attr_vals, weight, + permutations=permutations) + + # find quadrants for each geometry + quads = quad_position(lisa.q) + + return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y) + +def moran_rate(subquery, numerator, denominator, + w_type, num_ngbrs, permutations, geom_col, id_col): + """ + Moran's I Rate (global) + Andy Eschbacher + """ + qvals = {"id_col": id_col, + "attr1": numerator, + "attr2": denominator, + "geom_col": geom_col, + "subquery": subquery, + "num_ngbrs": num_ngbrs} + + query = pu.construct_neighbor_query(w_type, qvals) + + plpy.notice('** Query: %s' % query) + + try: + result = plpy.execute(query) + # if there are no neighbors, exit + if len(result) == 0: + return pu.empty_zipped_array(2) + plpy.notice('** Query returned with %d rows' % len(result)) + except plpy.SPIError: + plpy.error('Error: areas of interest query failed, check input parameters') + plpy.notice('** Query failed: "%s"' % query) + plpy.notice('** Error: %s' % plpy.SPIError) + return pu.empty_zipped_array(2) + + ## collect attributes + numer = pu.get_attributes(result, 1) + denom = pu.get_attributes(result, 2) + + weight = pu.get_weight(result, w_type, num_ngbrs) + + ## calculate moran global rate + lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight, + permutations=permutations) + + return zip([lisa_rate.I], [lisa_rate.EI]) + +def moran_local_rate(subquery, numerator, denominator, + w_type, num_ngbrs, permutations, geom_col, id_col): + """ + Moran's I Local Rate + Andy Eschbacher + """ + # geometries with values that are null are ignored + # resulting in a collection of not as near neighbors + + query = pu.construct_neighbor_query(w_type, + {"id_col": id_col, + "numerator": numerator, + "denominator": denominator, + "geom_col": geom_col, + "subquery": subquery, + "num_ngbrs": num_ngbrs}) + + try: + result = plpy.execute(query) + # if there are no neighbors, exit + if len(result) == 0: + return pu.empty_zipped_array(5) + except plpy.SPIError: + plpy.error('Error: areas of interest query failed, check input parameters') + plpy.notice('** Query failed: "%s"' % query) + plpy.notice('** Error: %s' % plpy.SPIError) + return pu.empty_zipped_array(5) + + ## collect attributes + numer = pu.get_attributes(result, 1) + denom = pu.get_attributes(result, 2) + + weight = pu.get_weight(result, w_type, num_ngbrs) + + # calculate LISA values + lisa = ps.esda.moran.Moran_Local_Rate(numer, denom, weight, + permutations=permutations) + + # find units of significance + quads = quad_position(lisa.q) + + return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y) + +def moran_local_bv(subquery, attr1, attr2, + permutations, geom_col, id_col, w_type, num_ngbrs): + """ + Moran's I (local) Bivariate (untested) + """ + plpy.notice('** Constructing query') + + qvals = {"num_ngbrs": num_ngbrs, + "attr1": attr1, + "attr2": attr2, + "subquery": subquery, + "geom_col": geom_col, + "id_col": id_col} + + query = pu.construct_neighbor_query(w_type, qvals) + + try: + result = plpy.execute(query) + # if there are no neighbors, exit + if len(result) == 0: + return pu.empty_zipped_array(4) + except plpy.SPIError: + plpy.error("Error: areas of interest query failed, " \ + "check input parameters") + plpy.notice('** Query failed: "%s"' % query) + return pu.empty_zipped_array(4) + + ## collect attributes + attr1_vals = pu.get_attributes(result, 1) + attr2_vals = pu.get_attributes(result, 2) + + # create weights + weight = pu.get_weight(result, w_type, num_ngbrs) + + # calculate LISA values + lisa = ps.esda.moran.Moran_Local_BV(attr1_vals, attr2_vals, weight, + permutations=permutations) + + plpy.notice("len of Is: %d" % len(lisa.Is)) + + # find clustering of significance + lisa_sig = quad_position(lisa.q) + + plpy.notice('** Finished calculations') + + return zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order) + +# Low level functions ---------------------------------------- + +def map_quads(coord): + """ + Map a quadrant number to Moran's I designation + HH=1, LH=2, LL=3, HL=4 + Input: + @param coord (int): quadrant of a specific measurement + Output: + classification (one of 'HH', 'LH', 'LL', or 'HL') + """ + if coord == 1: + return 'HH' + elif coord == 2: + return 'LH' + elif coord == 3: + return 'LL' + elif coord == 4: + return 'HL' + else: + return None + +def quad_position(quads): + """ + Produce Moran's I classification based of n + Input: + @param quads ndarray: an array of quads classified by + 1-4 (PySAL default) + Output: + @param list: an array of quads classied by 'HH', 'LL', etc. + """ + return [map_quads(q) for q in quads] diff --git a/release/python/0.0.3/crankshaft/crankshaft/pysal_utils/__init__.py b/release/python/0.0.3/crankshaft/crankshaft/pysal_utils/__init__.py new file mode 100644 index 0000000..835880d --- /dev/null +++ b/release/python/0.0.3/crankshaft/crankshaft/pysal_utils/__init__.py @@ -0,0 +1 @@ +from pysal_utils import * diff --git a/release/python/0.0.3/crankshaft/crankshaft/pysal_utils/pysal_utils.py b/release/python/0.0.3/crankshaft/crankshaft/pysal_utils/pysal_utils.py new file mode 100644 index 0000000..02b5e35 --- /dev/null +++ b/release/python/0.0.3/crankshaft/crankshaft/pysal_utils/pysal_utils.py @@ -0,0 +1,152 @@ +""" + Utilities module for generic PySAL functionality, mainly centered on translating queries into numpy arrays or PySAL weights objects +""" + +import numpy as np +import pysal as ps + +def construct_neighbor_query(w_type, query_vals): + """Return query (a string) used for finding neighbors + @param w_type text: type of neighbors to calculate ('knn' or 'queen') + @param query_vals dict: values used to construct the query + """ + + if w_type.lower() == 'knn': + return knn(query_vals) + else: + return queen(query_vals) + +## Build weight object +def get_weight(query_res, w_type='knn', num_ngbrs=5): + """ + Construct PySAL weight from return value of query + @param query_res: query results with attributes and neighbors + """ + if w_type.lower() == 'knn': + row_normed_weights = [1.0 / float(num_ngbrs)] * num_ngbrs + weights = {x['id']: row_normed_weights for x in query_res} + else: + weights = {x['id']: [1.0 / len(x['neighbors'])] * len(x['neighbors']) + if len(x['neighbors']) > 0 + else [] for x in query_res} + + neighbors = {x['id']: x['neighbors'] for x in query_res} + + return ps.W(neighbors, weights) + +def query_attr_select(params): + """ + Create portion of SELECT statement for attributes inolved in query. + @param params: dict of information used in query (column names, + table name, etc.) + """ + + attrs = [k for k in params + if k not in ('id_col', 'geom_col', 'subquery', 'num_ngbrs')] + + template = "i.\"{%(col)s}\"::numeric As attr%(alias_num)s, " + + attr_string = "" + + for idx, val in enumerate(sorted(attrs)): + attr_string += template % {"col": val, "alias_num": idx + 1} + + return attr_string + +def query_attr_where(params): + """ + Create portion of WHERE clauses for weeding out NULL-valued geometries + """ + attrs = sorted([k for k in params + if k not in ('id_col', 'geom_col', 'subquery', 'num_ngbrs')]) + + attr_string = [] + + for attr in attrs: + attr_string.append("idx_replace.\"{%s}\" IS NOT NULL" % attr) + + if len(attrs) == 2: + attr_string.append("idx_replace.\"{%s}\" <> 0" % attrs[1]) + + out = " AND ".join(attr_string) + + return out + +def knn(params): + """SQL query for k-nearest neighbors. + @param vars: dict of values to fill template + """ + + attr_select = query_attr_select(params) + attr_where = query_attr_where(params) + + replacements = {"attr_select": attr_select, + "attr_where_i": attr_where.replace("idx_replace", "i"), + "attr_where_j": attr_where.replace("idx_replace", "j")} + + query = "SELECT " \ + "i.\"{id_col}\" As id, " \ + "%(attr_select)s" \ + "(SELECT ARRAY(SELECT j.\"{id_col}\" " \ + "FROM ({subquery}) As j " \ + "WHERE " \ + "i.\"{id_col}\" <> j.\"{id_col}\" AND " \ + "%(attr_where_j)s " \ + "ORDER BY " \ + "j.\"{geom_col}\" <-> i.\"{geom_col}\" ASC " \ + "LIMIT {num_ngbrs})" \ + ") As neighbors " \ + "FROM ({subquery}) As i " \ + "WHERE " \ + "%(attr_where_i)s " \ + "ORDER BY i.\"{id_col}\" ASC;" % replacements + + return query.format(**params) + +## SQL query for finding queens neighbors (all contiguous polygons) +def queen(params): + """SQL query for queen neighbors. + @param params dict: information to fill query + """ + attr_select = query_attr_select(params) + attr_where = query_attr_where(params) + + replacements = {"attr_select": attr_select, + "attr_where_i": attr_where.replace("idx_replace", "i"), + "attr_where_j": attr_where.replace("idx_replace", "j")} + + query = "SELECT " \ + "i.\"{id_col}\" As id, " \ + "%(attr_select)s" \ + "(SELECT ARRAY(SELECT j.\"{id_col}\" " \ + "FROM ({subquery}) As j " \ + "WHERE i.\"{id_col}\" <> j.\"{id_col}\" AND " \ + "ST_Touches(i.\"{geom_col}\", j.\"{geom_col}\") AND " \ + "%(attr_where_j)s)" \ + ") As neighbors " \ + "FROM ({subquery}) As i " \ + "WHERE " \ + "%(attr_where_i)s " \ + "ORDER BY i.\"{id_col}\" ASC;" % replacements + + return query.format(**params) + +## to add more weight methods open a ticket or pull request + +def get_attributes(query_res, attr_num=1): + """ + @param query_res: query results with attributes and neighbors + @param attr_num: attribute number (1, 2, ...) + """ + return np.array([x['attr' + str(attr_num)] for x in query_res], dtype=np.float) + +def empty_zipped_array(num_nones): + """ + prepare return values for cases of empty weights objects (no neighbors) + Input: + @param num_nones int: number of columns (e.g., 4) + Output: + [(None, None, None, None)] + """ + + return [tuple([None] * num_nones)] diff --git a/release/python/0.0.3/crankshaft/crankshaft/random_seeds.py b/release/python/0.0.3/crankshaft/crankshaft/random_seeds.py new file mode 100644 index 0000000..b7c8eed --- /dev/null +++ b/release/python/0.0.3/crankshaft/crankshaft/random_seeds.py @@ -0,0 +1,10 @@ +import random +import numpy + +def set_random_seeds(value): + """ + Set the seeds of the RNGs (Random Number Generators) + used internally. + """ + random.seed(value) + numpy.random.seed(value) diff --git a/release/python/0.0.3/crankshaft/setup.py b/release/python/0.0.3/crankshaft/setup.py new file mode 100644 index 0000000..33a3b62 --- /dev/null +++ b/release/python/0.0.3/crankshaft/setup.py @@ -0,0 +1,48 @@ + +""" +CartoDB Spatial Analysis Python Library +See: +https://github.com/CartoDB/crankshaft +""" + +from setuptools import setup, find_packages + +setup( + name='crankshaft', + + version='0.0.3', + + description='CartoDB Spatial Analysis Python Library', + + url='https://github.com/CartoDB/crankshaft', + + author='Data Services Team - CartoDB', + author_email='dataservices@cartodb.com', + + license='MIT', + + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Mapping comunity', + 'Topic :: Maps :: Mapping Tools', + 'License :: OSI Approved :: MIT License', + 'Programming Language :: Python :: 2.7', + ], + + keywords='maps mapping tools spatial analysis geostatistics', + + packages=find_packages(exclude=['contrib', 'docs', 'tests']), + + extras_require={ + 'dev': ['unittest'], + 'test': ['unittest', 'nose', 'mock'], + }, + + # The choice of component versions is dictated by what's + # provisioned in the production servers. + install_requires=['pysal==1.9.1', 'scikit-learn==0.17.1'], + + requires=['pysal', 'numpy', 'sklearn'], + + test_suite='test' +) diff --git a/release/python/0.0.3/crankshaft/test/fixtures/kmeans.json b/release/python/0.0.3/crankshaft/test/fixtures/kmeans.json new file mode 100644 index 0000000..8f31c79 --- /dev/null +++ b/release/python/0.0.3/crankshaft/test/fixtures/kmeans.json @@ -0,0 +1 @@ +[{"xs": [9.917239463463458, 9.042767302696836, 10.798929825304187, 8.763751051762995, 11.383882954810852, 11.018206993460897, 8.939526075734316, 9.636159342565252, 10.136336896960058, 11.480610059427342, 12.115011910725082, 9.173267848893428, 10.239300931201738, 8.00012512174072, 8.979962292282131, 9.318376124429575, 10.82259513754284, 10.391747171927115, 10.04904588886165, 9.96007160443463, -0.78825626804569, -0.3511819898577426, -1.2796410003764271, -0.3977049391203402, 2.4792311265774667, 1.3670311632092624, 1.2963504112955613, 2.0404844103073025, -1.6439708506073223, 0.39122885445645805, 1.026031821452462, -0.04044477160482201, -0.7442346929085072, -0.34687120826243034, -0.23420359971379054, -0.5919629143336708, -0.202903054395391, -0.1893399644841902, 1.9331834251176807, -0.12321054392851609], "ys": [8.735627063679981, 9.857615954045011, 10.81439096759407, 10.586727233537191, 9.232919976568622, 11.54281262696508, 8.392787912674466, 9.355119689665944, 9.22380703532752, 10.542142541823122, 10.111980619367035, 10.760836265570738, 8.819773453269804, 10.25325722424816, 9.802077905695608, 8.955420161552611, 9.833801181904477, 10.491684241001613, 12.076108669877556, 11.74289693140474, -0.5685725015474191, -0.5715728344759778, -0.20180907868635137, 0.38431336480089595, -0.3402202083684184, -2.4652736827783586, 0.08295159401756182, 0.8503818775816505, 0.6488691600321166, 0.5794762568230527, -0.6770063922144103, -0.6557616416449478, -1.2834289177624947, 0.1096318195532717, -0.38986922166834853, -1.6224497706950238, 0.09429787743230483, 0.4005097316394031, -0.508002811195673, -1.2473463371366507], "ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]}] \ No newline at end of file diff --git a/release/python/0.0.3/crankshaft/test/fixtures/moran.json b/release/python/0.0.3/crankshaft/test/fixtures/moran.json new file mode 100644 index 0000000..2f75cf1 --- /dev/null +++ b/release/python/0.0.3/crankshaft/test/fixtures/moran.json @@ -0,0 +1,52 @@ +[[0.9319096128346788, "HH"], +[-1.135787401862846, "HL"], +[0.11732030672508517, "LL"], +[0.6152779669180425, "LL"], +[-0.14657336660125297, "LH"], +[0.6967858120189607, "LL"], +[0.07949310115714454, "HH"], +[0.4703198759258987, "HH"], +[0.4421125200498064, "HH"], +[0.5724288737143592, "LL"], +[0.8970743435692062, "LL"], +[0.18327334401918674, "LL"], +[-0.01466729201304962, "HL"], +[0.3481559372544409, "LL"], +[0.06547094736902978, "LL"], +[0.15482141569329988, "HH"], +[0.4373841193538136, "HH"], +[0.15971286468915544, "LL"], +[1.0543588860308968, "HH"], +[1.7372866900020818, "HH"], +[1.091998586053999, "LL"], +[0.1171572584252222, "HH"], +[0.08438455015300014, "LL"], +[0.06547094736902978, "LL"], +[0.15482141569329985, "HH"], +[1.1627044812890683, "HH"], +[0.06547094736902978, "LL"], +[0.795275137550483, "HH"], +[0.18562939195219, "LL"], +[0.3010757406693439, "LL"], +[2.8205795942839376, "HH"], +[0.11259190602909264, "LL"], +[-0.07116352791516614, "HL"], +[-0.09945240794119009, "LH"], +[0.18562939195219, "LL"], +[0.1832733440191868, "LL"], +[-0.39054253768447705, "HL"], +[-0.1672071289487642, "HL"], +[0.3337669247916343, "HH"], +[0.2584386102554792, "HH"], +[-0.19733845476322634, "HL"], +[-0.9379282899805409, "LH"], +[-0.028770969951095866, "LH"], +[0.051367269430983485, "LL"], +[-0.2172548045913472, "LH"], +[0.05136726943098351, "LL"], +[0.04191046803899837, "LL"], +[0.7482357030403517, "HH"], +[-0.014585767863118111, "LH"], +[0.5410013139159929, "HH"], +[1.0223932668429925, "LL"], +[1.4179402898927476, "LL"]] \ No newline at end of file diff --git a/release/python/0.0.3/crankshaft/test/fixtures/neighbors.json b/release/python/0.0.3/crankshaft/test/fixtures/neighbors.json new file mode 100644 index 0000000..055b359 --- /dev/null +++ b/release/python/0.0.3/crankshaft/test/fixtures/neighbors.json @@ -0,0 +1,54 @@ +[ + {"neighbors": [48, 26, 20, 9, 31], "id": 1, "value": 0.5}, + {"neighbors": [30, 16, 46, 3, 4], "id": 2, "value": 0.7}, + {"neighbors": [46, 30, 2, 12, 16], "id": 3, "value": 0.2}, + {"neighbors": [18, 30, 23, 2, 52], "id": 4, "value": 0.1}, + {"neighbors": [47, 40, 45, 37, 28], "id": 5, "value": 0.3}, + {"neighbors": [10, 21, 41, 14, 37], "id": 6, "value": 0.05}, + {"neighbors": [8, 17, 43, 25, 12], "id": 7, "value": 0.4}, + {"neighbors": [17, 25, 43, 22, 7], "id": 8, "value": 0.7}, + {"neighbors": [39, 34, 1, 26, 48], "id": 9, "value": 0.5}, + {"neighbors": [6, 37, 5, 45, 49], "id": 10, "value": 0.04}, + {"neighbors": [51, 41, 29, 21, 14], "id": 11, "value": 0.08}, + {"neighbors": [44, 46, 43, 50, 3], "id": 12, "value": 0.2}, + {"neighbors": [45, 23, 14, 28, 18], "id": 13, "value": 0.4}, + {"neighbors": [41, 29, 13, 23, 6], "id": 14, "value": 0.2}, + {"neighbors": [36, 27, 32, 33, 24], "id": 15, "value": 0.3}, + {"neighbors": [19, 2, 46, 44, 28], "id": 16, "value": 0.4}, + {"neighbors": [8, 25, 43, 7, 22], "id": 17, "value": 0.6}, + {"neighbors": [23, 4, 29, 14, 13], "id": 18, "value": 0.3}, + {"neighbors": [42, 16, 28, 26, 40], "id": 19, "value": 0.7}, + {"neighbors": [1, 48, 31, 26, 42], "id": 20, "value": 0.8}, + {"neighbors": [41, 6, 11, 14, 10], "id": 21, "value": 0.1}, + {"neighbors": [25, 50, 43, 31, 44], "id": 22, "value": 0.4}, + {"neighbors": [18, 13, 14, 4, 2], "id": 23, "value": 0.1}, + {"neighbors": [33, 49, 34, 47, 27], "id": 24, "value": 0.3}, + {"neighbors": [43, 8, 22, 17, 50], "id": 25, "value": 0.4}, + {"neighbors": [1, 42, 20, 31, 48], "id": 26, "value": 0.6}, + {"neighbors": [32, 15, 36, 33, 24], "id": 27, "value": 0.3}, + {"neighbors": [40, 45, 19, 5, 13], "id": 28, "value": 0.8}, + {"neighbors": [11, 51, 41, 14, 18], "id": 29, "value": 0.3}, + {"neighbors": [2, 3, 4, 46, 18], "id": 30, "value": 0.1}, + {"neighbors": [20, 26, 1, 50, 48], "id": 31, "value": 0.9}, + {"neighbors": [27, 36, 15, 49, 24], "id": 32, "value": 0.3}, + {"neighbors": [24, 27, 49, 34, 32], "id": 33, "value": 0.4}, + {"neighbors": [47, 9, 39, 40, 24], "id": 34, "value": 0.3}, + {"neighbors": [38, 51, 11, 21, 41], "id": 35, "value": 0.3}, + {"neighbors": [15, 32, 27, 49, 33], "id": 36, "value": 0.2}, + {"neighbors": [49, 10, 5, 47, 24], "id": 37, "value": 0.5}, + {"neighbors": [35, 21, 51, 11, 41], "id": 38, "value": 0.4}, + {"neighbors": [9, 34, 48, 1, 47], "id": 39, "value": 0.6}, + {"neighbors": [28, 47, 5, 9, 34], "id": 40, "value": 0.5}, + {"neighbors": [11, 14, 29, 21, 6], "id": 41, "value": 0.4}, + {"neighbors": [26, 19, 1, 9, 31], "id": 42, "value": 0.2}, + {"neighbors": [25, 12, 8, 22, 44], "id": 43, "value": 0.3}, + {"neighbors": [12, 50, 46, 16, 43], "id": 44, "value": 0.2}, + {"neighbors": [28, 13, 5, 40, 19], "id": 45, "value": 0.3}, + {"neighbors": [3, 12, 44, 2, 16], "id": 46, "value": 0.2}, + {"neighbors": [34, 40, 5, 49, 24], "id": 47, "value": 0.3}, + {"neighbors": [1, 20, 26, 9, 39], "id": 48, "value": 0.5}, + {"neighbors": [24, 37, 47, 5, 33], "id": 49, "value": 0.2}, + {"neighbors": [44, 22, 31, 42, 26], "id": 50, "value": 0.6}, + {"neighbors": [11, 29, 41, 14, 21], "id": 51, "value": 0.01}, + {"neighbors": [4, 18, 29, 51, 23], "id": 52, "value": 0.01} + ] diff --git a/release/python/0.0.3/crankshaft/test/helper.py b/release/python/0.0.3/crankshaft/test/helper.py new file mode 100644 index 0000000..7d28b94 --- /dev/null +++ b/release/python/0.0.3/crankshaft/test/helper.py @@ -0,0 +1,13 @@ +import unittest + +from mock_plpy import MockPlPy +plpy = MockPlPy() + +import sys +sys.modules['plpy'] = plpy + +import os + +def fixture_file(name): + dir = os.path.dirname(os.path.realpath(__file__)) + return os.path.join(dir, 'fixtures', name) diff --git a/release/python/0.0.3/crankshaft/test/mock_plpy.py b/release/python/0.0.3/crankshaft/test/mock_plpy.py new file mode 100644 index 0000000..63c88f6 --- /dev/null +++ b/release/python/0.0.3/crankshaft/test/mock_plpy.py @@ -0,0 +1,34 @@ +import re + +class MockPlPy: + def __init__(self): + self._reset() + + def _reset(self): + self.infos = [] + self.notices = [] + self.debugs = [] + self.logs = [] + self.warnings = [] + self.errors = [] + self.fatals = [] + self.executes = [] + self.results = [] + self.prepares = [] + self.results = [] + + def _define_result(self, query, result): + pattern = re.compile(query, re.IGNORECASE | re.MULTILINE) + self.results.append([pattern, result]) + + def notice(self, msg): + self.notices.append(msg) + + def info(self, msg): + self.infos.append(msg) + + def execute(self, query): # TODO: additional arguments + for result in self.results: + if result[0].match(query): + return result[1] + return [] diff --git a/release/python/0.0.3/crankshaft/test/test_cluster_kmeans.py b/release/python/0.0.3/crankshaft/test/test_cluster_kmeans.py new file mode 100644 index 0000000..aba8e07 --- /dev/null +++ b/release/python/0.0.3/crankshaft/test/test_cluster_kmeans.py @@ -0,0 +1,38 @@ +import unittest +import numpy as np + + +# from mock_plpy import MockPlPy +# plpy = MockPlPy() +# +# import sys +# sys.modules['plpy'] = plpy +from helper import plpy, fixture_file +import numpy as np +import crankshaft.clustering as cc +import crankshaft.pysal_utils as pu +from crankshaft import random_seeds +import json + +class KMeansTest(unittest.TestCase): + """Testing class for Moran's I functions""" + + def setUp(self): + plpy._reset() + self.cluster_data = json.loads(open(fixture_file('kmeans.json')).read()) + self.params = {"subquery": "select * from table", + "no_clusters": "10" + } + + def test_kmeans(self): + data = self.cluster_data + plpy._define_result('select' ,data) + clusters = cc.kmeans('subquery', 2) + labels = [a[1] for a in clusters] + c1 = [a for a in clusters if a[1]==0] + c2 = [a for a in clusters if a[1]==1] + + self.assertEqual(len(np.unique(labels)),2) + self.assertEqual(len(c1),20) + self.assertEqual(len(c2),20) + diff --git a/release/python/0.0.3/crankshaft/test/test_clustering_moran.py b/release/python/0.0.3/crankshaft/test/test_clustering_moran.py new file mode 100644 index 0000000..393e93b --- /dev/null +++ b/release/python/0.0.3/crankshaft/test/test_clustering_moran.py @@ -0,0 +1,83 @@ +import unittest +import numpy as np + + +# from mock_plpy import MockPlPy +# plpy = MockPlPy() +# +# import sys +# sys.modules['plpy'] = plpy +from helper import plpy, fixture_file + +import crankshaft.clustering as cc +import crankshaft.pysal_utils as pu +from crankshaft import random_seeds +import json + +class MoranTest(unittest.TestCase): + """Testing class for Moran's I functions""" + + def setUp(self): + plpy._reset() + self.params = {"id_col": "cartodb_id", + "attr1": "andy", + "attr2": "jay_z", + "subquery": "SELECT * FROM a_list", + "geom_col": "the_geom", + "num_ngbrs": 321} + self.neighbors_data = json.loads(open(fixture_file('neighbors.json')).read()) + self.moran_data = json.loads(open(fixture_file('moran.json')).read()) + + def test_map_quads(self): + """Test map_quads""" + self.assertEqual(cc.map_quads(1), 'HH') + self.assertEqual(cc.map_quads(2), 'LH') + self.assertEqual(cc.map_quads(3), 'LL') + self.assertEqual(cc.map_quads(4), 'HL') + self.assertEqual(cc.map_quads(33), None) + self.assertEqual(cc.map_quads('andy'), None) + + def test_quad_position(self): + """Test lisa_sig_vals""" + + quads = np.array([1, 2, 3, 4], np.int) + + ans = np.array(['HH', 'LH', 'LL', 'HL']) + test_ans = cc.quad_position(quads) + + self.assertTrue((test_ans == ans).all()) + + def test_moran_local(self): + """Test Moran's I local""" + data = [ { 'id': d['id'], 'attr1': d['value'], 'neighbors': d['neighbors'] } for d in self.neighbors_data] + plpy._define_result('select', data) + random_seeds.set_random_seeds(1234) + result = cc.moran_local('subquery', 'value', 'knn', 5, 99, 'the_geom', 'cartodb_id') + result = [(row[0], row[1]) for row in result] + expected = self.moran_data + for ([res_val, res_quad], [exp_val, exp_quad]) in zip(result, expected): + self.assertAlmostEqual(res_val, exp_val) + self.assertEqual(res_quad, exp_quad) + + def test_moran_local_rate(self): + """Test Moran's I rate""" + data = [ { 'id': d['id'], 'attr1': d['value'], 'attr2': 1, 'neighbors': d['neighbors'] } for d in self.neighbors_data] + plpy._define_result('select', data) + random_seeds.set_random_seeds(1234) + result = cc.moran_local_rate('subquery', 'numerator', 'denominator', 'knn', 5, 99, 'the_geom', 'cartodb_id') + print 'result == None? ', result == None + result = [(row[0], row[1]) for row in result] + expected = self.moran_data + for ([res_val, res_quad], [exp_val, exp_quad]) in zip(result, expected): + self.assertAlmostEqual(res_val, exp_val) + + def test_moran(self): + """Test Moran's I global""" + data = [{ 'id': d['id'], 'attr1': d['value'], 'neighbors': d['neighbors'] } for d in self.neighbors_data] + plpy._define_result('select', data) + random_seeds.set_random_seeds(1235) + result = cc.moran('table', 'value', 'knn', 5, 99, 'the_geom', 'cartodb_id') + print 'result == None?', result == None + result_moran = result[0][0] + expected_moran = np.array([row[0] for row in self.moran_data]).mean() + self.assertAlmostEqual(expected_moran, result_moran, delta=10e-2) diff --git a/release/python/0.0.3/crankshaft/test/test_pysal_utils.py b/release/python/0.0.3/crankshaft/test/test_pysal_utils.py new file mode 100644 index 0000000..4ea0d9b --- /dev/null +++ b/release/python/0.0.3/crankshaft/test/test_pysal_utils.py @@ -0,0 +1,107 @@ +import unittest + +import crankshaft.pysal_utils as pu +from crankshaft import random_seeds + + +class PysalUtilsTest(unittest.TestCase): + """Testing class for utility functions related to PySAL integrations""" + + def setUp(self): + self.params = {"id_col": "cartodb_id", + "attr1": "andy", + "attr2": "jay_z", + "subquery": "SELECT * FROM a_list", + "geom_col": "the_geom", + "num_ngbrs": 321} + + def test_query_attr_select(self): + """Test query_attr_select""" + + ans = "i.\"{attr1}\"::numeric As attr1, " \ + "i.\"{attr2}\"::numeric As attr2, " + + self.assertEqual(pu.query_attr_select(self.params), ans) + + def test_query_attr_where(self): + """Test pu.query_attr_where""" + + ans = "idx_replace.\"{attr1}\" IS NOT NULL AND " \ + "idx_replace.\"{attr2}\" IS NOT NULL AND " \ + "idx_replace.\"{attr2}\" <> 0" + + self.assertEqual(pu.query_attr_where(self.params), ans) + + def test_knn(self): + """Test knn neighbors constructor""" + + ans = "SELECT i.\"cartodb_id\" As id, " \ + "i.\"andy\"::numeric As attr1, " \ + "i.\"jay_z\"::numeric As attr2, " \ + "(SELECT ARRAY(SELECT j.\"cartodb_id\" " \ + "FROM (SELECT * FROM a_list) As j " \ + "WHERE " \ + "i.\"cartodb_id\" <> j.\"cartodb_id\" AND " \ + "j.\"andy\" IS NOT NULL AND " \ + "j.\"jay_z\" IS NOT NULL AND " \ + "j.\"jay_z\" <> 0 " \ + "ORDER BY " \ + "j.\"the_geom\" <-> i.\"the_geom\" ASC " \ + "LIMIT 321)) As neighbors " \ + "FROM (SELECT * FROM a_list) As i " \ + "WHERE i.\"andy\" IS NOT NULL AND " \ + "i.\"jay_z\" IS NOT NULL AND " \ + "i.\"jay_z\" <> 0 " \ + "ORDER BY i.\"cartodb_id\" ASC;" + + self.assertEqual(pu.knn(self.params), ans) + + def test_queen(self): + """Test queen neighbors constructor""" + + ans = "SELECT i.\"cartodb_id\" As id, " \ + "i.\"andy\"::numeric As attr1, " \ + "i.\"jay_z\"::numeric As attr2, " \ + "(SELECT ARRAY(SELECT j.\"cartodb_id\" " \ + "FROM (SELECT * FROM a_list) As j " \ + "WHERE " \ + "i.\"cartodb_id\" <> j.\"cartodb_id\" AND " \ + "ST_Touches(i.\"the_geom\", " \ + "j.\"the_geom\") AND " \ + "j.\"andy\" IS NOT NULL AND " \ + "j.\"jay_z\" IS NOT NULL AND " \ + "j.\"jay_z\" <> 0)" \ + ") As neighbors " \ + "FROM (SELECT * FROM a_list) As i " \ + "WHERE i.\"andy\" IS NOT NULL AND " \ + "i.\"jay_z\" IS NOT NULL AND " \ + "i.\"jay_z\" <> 0 " \ + "ORDER BY i.\"cartodb_id\" ASC;" + + self.assertEqual(pu.queen(self.params), ans) + + def test_construct_neighbor_query(self): + """Test construct_neighbor_query""" + + # Compare to raw knn query + self.assertEqual(pu.construct_neighbor_query('knn', self.params), + pu.knn(self.params)) + + def test_get_attributes(self): + """Test get_attributes""" + + ## need to add tests + + self.assertEqual(True, True) + + def test_get_weight(self): + """Test get_weight""" + + self.assertEqual(True, True) + + def test_empty_zipped_array(self): + """Test empty_zipped_array""" + ans2 = [(None, None)] + ans4 = [(None, None, None, None)] + self.assertEqual(pu.empty_zipped_array(2), ans2) + self.assertEqual(pu.empty_zipped_array(4), ans4) diff --git a/src/pg/crankshaft.control b/src/pg/crankshaft.control index 49c0d22..2029b7e 100644 --- a/src/pg/crankshaft.control +++ b/src/pg/crankshaft.control @@ -1,5 +1,5 @@ comment = 'CartoDB Spatial Analysis extension' -default_version = '0.0.2' +default_version = '0.0.3' requires = 'plpythonu, postgis, cartodb' superuser = true schema = cdb_crankshaft