From a1198627b52e7c1b032e111ea8e706d6af3d33fc Mon Sep 17 00:00:00 2001 From: Rafa de la Torre Date: Fri, 23 Feb 2018 15:45:12 +0100 Subject: [PATCH] Release 0.7.0 --- NEWS.md | 11 +- release/crankshaft--0.6.1--0.7.0.sql | 2165 +++++++++++++++++ release/crankshaft--0.7.0.sql | 2165 +++++++++++++++++ release/crankshaft.control | 2 +- .../0.7.0/crankshaft/crankshaft/__init__.py | 7 + .../crankshaft/analysis_data_provider.py | 98 + .../crankshaft/clustering/__init__.py | 4 + .../crankshaft/crankshaft/clustering/getis.py | 50 + .../crankshaft/clustering/kmeans.py | 113 + .../crankshaft/crankshaft/clustering/moran.py | 208 ++ .../crankshaft/pysal_utils/__init__.py | 2 + .../crankshaft/pysal_utils/pysal_utils.py | 251 ++ .../crankshaft/crankshaft/random_seeds.py | 12 + .../crankshaft/regression/__init__.py | 3 + .../glm/GLM_validate_estimation.ipynb | 444 ++++ .../crankshaft/regression/glm/__init__.py | 4 + .../crankshaft/regression/glm/base.py | 959 ++++++++ .../crankshaft/regression/glm/family.py | 1845 ++++++++++++++ .../crankshaft/regression/glm/glm.py | 326 +++ .../crankshaft/regression/glm/iwls.py | 84 + .../crankshaft/regression/glm/links.py | 953 ++++++++ .../regression/glm/tests/test_glm.py | 993 ++++++++ .../crankshaft/regression/glm/utils.py | 350 +++ .../crankshaft/regression/glm/varfuncs.py | 284 +++ .../crankshaft/regression/gwr/__init__.py | 1 + .../regression/gwr/base/__init__.py | 4 + .../regression/gwr/base/diagnostics.py | 81 + .../crankshaft/regression/gwr/base/gwr.py | 1086 +++++++++ .../crankshaft/regression/gwr/base/kernels.py | 120 + .../crankshaft/regression/gwr/base/search.py | 208 ++ .../crankshaft/regression/gwr/base/sel_bw.py | 286 +++ .../regression/gwr/base/tests/test_gwr.py | 853 +++++++ .../regression/gwr/base/tests/test_kernels.py | 84 + .../regression/gwr/base/tests/test_sel_bw.py | 139 ++ .../crankshaft/regression/gwr_cs.py | 202 ++ .../crankshaft/segmentation/__init__.py | 1 + .../crankshaft/segmentation/segmentation.py | 176 ++ 
.../space_time_dynamics/__init__.py | 2 + .../crankshaft/space_time_dynamics/markov.py | 194 ++ .../python/0.7.0/crankshaft/requirements.txt | 5 + release/python/0.7.0/crankshaft/setup.py | 49 + .../0.7.0/crankshaft/test/fixtures/getis.json | 1 + .../test/fixtures/gwr_packed_data.json | 1 + .../test/fixtures/gwr_packed_knowns.json | 1 + .../crankshaft/test/fixtures/kmeans.json | 1 + .../crankshaft/test/fixtures/markov.json | 1 + .../0.7.0/crankshaft/test/fixtures/moran.json | 52 + .../crankshaft/test/fixtures/neighbors.json | 54 + .../test/fixtures/neighbors_getis.json | 1 + .../test/fixtures/neighbors_markov.json | 1 + .../python/0.7.0/crankshaft/test/helper.py | 13 + .../python/0.7.0/crankshaft/test/mock_plpy.py | 57 + .../crankshaft/test/test_clustering_getis.py | 78 + .../crankshaft/test/test_clustering_kmeans.py | 87 + .../crankshaft/test/test_clustering_moran.py | 112 + .../0.7.0/crankshaft/test/test_pysal_utils.py | 83 + .../crankshaft/test/test_regression_gwr.py | 130 + .../crankshaft/test/test_segmentation.py | 64 + .../test/test_space_time_dynamics.py | 349 +++ src/pg/crankshaft.control | 2 +- 60 files changed, 15909 insertions(+), 3 deletions(-) create mode 100644 release/crankshaft--0.6.1--0.7.0.sql create mode 100644 release/crankshaft--0.7.0.sql create mode 100644 release/python/0.7.0/crankshaft/crankshaft/__init__.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/analysis_data_provider.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/clustering/__init__.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/clustering/getis.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/clustering/kmeans.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/clustering/moran.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/pysal_utils/__init__.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/pysal_utils/pysal_utils.py create mode 100644 
release/python/0.7.0/crankshaft/crankshaft/random_seeds.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/__init__.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/glm/GLM_validate_estimation.ipynb create mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/glm/__init__.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/glm/base.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/glm/family.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/glm/glm.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/glm/iwls.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/glm/links.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/glm/tests/test_glm.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/glm/utils.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/glm/varfuncs.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/gwr/__init__.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/__init__.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/diagnostics.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/gwr.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/kernels.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/search.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/sel_bw.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/tests/test_gwr.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/tests/test_kernels.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/tests/test_sel_bw.py create 
mode 100644 release/python/0.7.0/crankshaft/crankshaft/regression/gwr_cs.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/segmentation/__init__.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/segmentation/segmentation.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/space_time_dynamics/__init__.py create mode 100644 release/python/0.7.0/crankshaft/crankshaft/space_time_dynamics/markov.py create mode 100644 release/python/0.7.0/crankshaft/requirements.txt create mode 100644 release/python/0.7.0/crankshaft/setup.py create mode 100644 release/python/0.7.0/crankshaft/test/fixtures/getis.json create mode 100644 release/python/0.7.0/crankshaft/test/fixtures/gwr_packed_data.json create mode 100644 release/python/0.7.0/crankshaft/test/fixtures/gwr_packed_knowns.json create mode 100644 release/python/0.7.0/crankshaft/test/fixtures/kmeans.json create mode 100644 release/python/0.7.0/crankshaft/test/fixtures/markov.json create mode 100644 release/python/0.7.0/crankshaft/test/fixtures/moran.json create mode 100644 release/python/0.7.0/crankshaft/test/fixtures/neighbors.json create mode 100644 release/python/0.7.0/crankshaft/test/fixtures/neighbors_getis.json create mode 100644 release/python/0.7.0/crankshaft/test/fixtures/neighbors_markov.json create mode 100644 release/python/0.7.0/crankshaft/test/helper.py create mode 100644 release/python/0.7.0/crankshaft/test/mock_plpy.py create mode 100644 release/python/0.7.0/crankshaft/test/test_clustering_getis.py create mode 100644 release/python/0.7.0/crankshaft/test/test_clustering_kmeans.py create mode 100644 release/python/0.7.0/crankshaft/test/test_clustering_moran.py create mode 100644 release/python/0.7.0/crankshaft/test/test_pysal_utils.py create mode 100644 release/python/0.7.0/crankshaft/test/test_regression_gwr.py create mode 100644 release/python/0.7.0/crankshaft/test/test_segmentation.py create mode 100644 release/python/0.7.0/crankshaft/test/test_space_time_dynamics.py 
diff --git a/NEWS.md b/NEWS.md index 3d07756..efcfd31 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,14 @@ +0.7.0 (2018-02-23) +------------------ +* Updated Moran and Markov documentation [#179](https://github.com/CartoDB/crankshaft/pull/179) [#155](https://github.com/CartoDB/crankshaft/pull/155) +* Updated examples in documentation [#193](https://github.com/CartoDB/crankshaft/pull/193) +* Better error management for empty values [#157](https://github.com/CartoDB/crankshaft/pull/157) +* Added nonspatial kmeans with class framework [#150](https://github.com/CartoDB/crankshaft/pull/150) +* Added multipolygons and geometry collections support to PIA analysis [#165](https://github.com/CartoDB/crankshaft/pull/165) + 0.6.1 (2017-11-23) -* Add VOLATILITY and PARALLEL categories to PostgreSQL functions +------------------ +* Added VOLATILITY and PARALLEL categories to PostgreSQL functions [#183](https://github.com/CartoDB/crankshaft/pull/183) 0.6.0 (2017-11-08) ------------------ diff --git a/release/crankshaft--0.6.1--0.7.0.sql b/release/crankshaft--0.6.1--0.7.0.sql new file mode 100644 index 0000000..cc66ac6 --- /dev/null +++ b/release/crankshaft--0.6.1--0.7.0.sql @@ -0,0 +1,2165 @@ +--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES +-- Complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit +-- Version number of the extension release +CREATE OR REPLACE FUNCTION cdb_crankshaft_version() +RETURNS text AS $$ + SELECT '0.7.0'::text; +$$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE; + +-- Internal identifier of the installed extension instance +-- e.g. 'dev' for current development version +CREATE OR REPLACE FUNCTION _cdb_crankshaft_internal_version() +RETURNS text AS $$ + SELECT installed_version FROM pg_available_extensions where name='crankshaft' and pg_available_extensions IS NOT NULL; +$$ language 'sql' STABLE STRICT PARALLEL SAFE; +-- Internal function. 
+-- Set the seeds of the RNGs (Random Number Generators) +-- used internally. +CREATE OR REPLACE FUNCTION +_cdb_random_seeds (seed_value INTEGER) RETURNS VOID +AS $$ + from crankshaft import random_seeds + random_seeds.set_random_seeds(seed_value) +$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; +CREATE OR REPLACE FUNCTION + CDB_PyAggS(current_state Numeric[], current_row Numeric[]) + returns NUMERIC[] as $$ + BEGIN + if array_upper(current_state,1) is null then + RAISE NOTICE 'setting state %',array_upper(current_row,1); + current_state[1] = array_upper(current_row,1); + end if; + return array_cat(current_state,current_row) ; + END + $$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; + +-- Create aggregate if it did not exist +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT * + FROM pg_catalog.pg_proc p + LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace + WHERE n.nspname = 'cdb_crankshaft' + AND p.proname = 'cdb_pyagg' + AND p.proisagg) + THEN + CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) ( + SFUNC = CDB_PyAggS, + STYPE = Numeric[], + PARALLEL = SAFE, + INITCOND = "{}" + ); + END IF; +END +$$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION + CDB_CreateAndPredictSegment( + target NUMERIC[], + features NUMERIC[], + target_features NUMERIC[], + target_ids NUMERIC[], + n_estimators INTEGER DEFAULT 1200, + max_depth INTEGER DEFAULT 3, + subsample DOUBLE PRECISION DEFAULT 0.5, + learning_rate DOUBLE PRECISION DEFAULT 0.01, + min_samples_leaf INTEGER DEFAULT 1) +RETURNS TABLE(cartodb_id NUMERIC, prediction NUMERIC, accuracy NUMERIC) +AS $$ + import numpy as np + import plpy + + from crankshaft.segmentation import create_and_predict_segment_agg + model_params = {'n_estimators': n_estimators, + 'max_depth': max_depth, + 'subsample': subsample, + 'learning_rate': learning_rate, + 'min_samples_leaf': min_samples_leaf} + + def unpack2D(data): + dimension = data.pop(0) + a = np.array(data, dtype=float) + return a.reshape(len(a)/dimension, dimension) + + return 
create_and_predict_segment_agg(np.array(target, dtype=float), + unpack2D(features), + unpack2D(target_features), + target_ids, + model_params) + +$$ LANGUAGE plpythonu VOLATILE PARALLEL RESTRICTED; + +CREATE OR REPLACE FUNCTION + CDB_CreateAndPredictSegment ( + query TEXT, + variable_name TEXT, + target_table TEXT, + n_estimators INTEGER DEFAULT 1200, + max_depth INTEGER DEFAULT 3, + subsample DOUBLE PRECISION DEFAULT 0.5, + learning_rate DOUBLE PRECISION DEFAULT 0.01, + min_samples_leaf INTEGER DEFAULT 1) +RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC) +AS $$ + from crankshaft.segmentation import create_and_predict_segment + model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf} + return create_and_predict_segment(query,variable_name,target_table, model_params) +$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; +CREATE OR REPLACE FUNCTION CDB_Gravity( + IN target_query text, + IN weight_column text, + IN source_query text, + IN pop_column text, + IN target bigint, + IN radius integer, + IN minval numeric DEFAULT -10e307 + ) +RETURNS TABLE( + the_geom geometry, + source_id bigint, + target_id bigint, + dist numeric, + h numeric, + hpop numeric) AS $$ +DECLARE + t_id bigint[]; + t_geom geometry[]; + t_weight numeric[]; + s_id bigint[]; + s_geom geometry[]; + s_pop numeric[]; +BEGIN -- query-based entry point: aggregate each query's cartodb_id, the_geom and weight/population column into arrays + EXECUTE 'WITH foo as(' || target_query || ') SELECT array_agg(cartodb_id), array_agg(the_geom), array_agg(' || weight_column || ') FROM foo' INTO t_id, t_geom, t_weight; -- text is concatenated with ||, there is no + operator for text + EXECUTE 'WITH foo as(' || source_query || ') SELECT array_agg(cartodb_id), array_agg(the_geom), array_agg(' || pop_column || ') FROM foo' INTO s_id, s_geom, s_pop; + RETURN QUERY -- delegate to the array-based CDB_Gravity overload; the arrays are local variables, so no other relation is joined + SELECT g.* FROM CDB_Gravity(t_id, t_geom, t_weight, s_id, s_geom, s_pop, target, radius, minval) g; +END; +$$ language plpgsql VOLATILE PARALLEL UNSAFE; + +CREATE OR REPLACE FUNCTION CDB_Gravity( + IN t_id bigint[], + 
IN t_geom geometry[], + IN t_weight numeric[], + IN s_id bigint[], + IN s_geom geometry[], + IN s_pop numeric[], + IN target bigint, + IN radius integer, + IN minval numeric DEFAULT -10e307 + ) +RETURNS TABLE( + the_geom geometry, + source_id bigint, + target_id bigint, + dist numeric, + h numeric, + hpop numeric) AS $$ +DECLARE + t_type text; + s_type text; + t_center geometry[]; + s_center geometry[]; +BEGIN + t_type := GeometryType(t_geom[1]); + s_type := GeometryType(s_geom[1]); + IF t_type = 'POINT' THEN + t_center := t_geom; + ELSE + WITH tmp as (SELECT unnest(t_geom) as g) SELECT array_agg(ST_Centroid(g)) INTO t_center FROM tmp; + END IF; + IF s_type = 'POINT' THEN + s_center := s_geom; + ELSE + WITH tmp as (SELECT unnest(s_geom) as g) SELECT array_agg(ST_Centroid(g)) INTO s_center FROM tmp; + END IF; + RETURN QUERY + with target0 as( + SELECT unnest(t_center) as tc, unnest(t_weight) as tw, unnest(t_id) as td + ), + source0 as( + SELECT unnest(s_center) as sc, unnest(s_id) as sd, unnest (s_geom) as sg, unnest(s_pop) as sp + ), + prev0 as( + SELECT + source0.sg, + source0.sd as sourc_id, + coalesce(source0.sp,0) as sp, + target.td as targ_id, + coalesce(target.tw,0) as tw, + GREATEST(1.0,ST_Distance(geography(target.tc), geography(source0.sc)))::numeric as distance + FROM source0 + CROSS JOIN LATERAL + ( + SELECT + * + FROM target0 + WHERE tw > minval + AND ST_DWithin(geography(source0.sc), geography(tc), radius) + ) AS target + ), + deno as( + SELECT + sourc_id, + sum(tw/distance) as h_deno + FROM + prev0 + GROUP BY sourc_id + ) + SELECT + p.sg as the_geom, + p.sourc_id as source_id, + p.targ_id as target_id, + case when p.distance > 1 then p.distance else 0.0 end as dist, + 100*(p.tw/p.distance)/d.h_deno as h, + p.sp*(p.tw/p.distance)/d.h_deno as hpop + FROM + prev0 p, + deno d + WHERE + p.targ_id = target AND + p.sourc_id = d.sourc_id; +END; +$$ language plpgsql IMMUTABLE PARALLEL SAFE; +-- 0: nearest neighbor(s) +-- 1: barymetric +-- 2: IDW +-- 3: krigin 
---> TO DO + + +CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation( + IN query text, + IN point geometry, + IN method integer DEFAULT 1, + IN p1 numeric DEFAULT 0, + IN p2 numeric DEFAULT 0 + ) +RETURNS numeric AS +$$ +DECLARE + gs geometry[]; + vs numeric[]; + output numeric; +BEGIN -- query-based entry point: the query must expose the_geom and attrib columns, which are aggregated into arrays + EXECUTE 'WITH a AS('||query||') SELECT array_agg(the_geom), array_agg(attrib) FROM a' INTO gs, vs; + SELECT CDB_SpatialInterpolation(gs, vs, point, method, p1,p2) INTO output; -- the CTE "a" only exists inside the EXECUTE above, so no FROM clause here + + RETURN output; +END; +$$ +language plpgsql VOLATILE PARALLEL UNSAFE; + +CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation( + IN geomin geometry[], + IN colin numeric[], + IN point geometry, + IN method integer DEFAULT 1, + IN p1 numeric DEFAULT 0, + IN p2 numeric DEFAULT 0 + ) +RETURNS numeric AS +$$ +DECLARE + gs geometry[]; + vs numeric[]; + gs2 geometry[]; + vs2 numeric[]; + g geometry; + vertex geometry[]; + sg numeric; + sa numeric; + sb numeric; + sc numeric; + va numeric; + vb numeric; + vc numeric; + output numeric; +BEGIN + -- output := -999.999; + + -- nearest neighbors + -- p1: limit the number of neighbors, 0-> closest one + IF method = 0 THEN + + IF p1 = 0 THEN + p1 := 1; + END IF; + + WITH a as (SELECT unnest(geomin) as g, unnest(colin) as v), + b as (SELECT a.v as v FROM a ORDER BY point<->a.g LIMIT p1::integer) + SELECT avg(b.v) INTO output FROM b; + RETURN output; + + -- barycentric + ELSIF method = 1 THEN + WITH a as (SELECT unnest(geomin) AS e), + b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a), + c as (SELECT (ST_Dump(t)).geom as v FROM b), + d as (SELECT v FROM c WHERE ST_Within(point, v)) + SELECT v INTO g FROM d; + IF g is null THEN + -- out of the realm of the input data + RETURN -888.888; + END IF; + -- vertex of the selected cell + WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v) + SELECT array_agg(v) INTO vertex FROM a; + + -- retrieve the value of each vertex + WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c) + SELECT c INTO va FROM a 
WHERE ST_Equals(geo, vertex[1]); + WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c) + SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]); + WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c) + SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]); + + SELECT ST_area(g), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[2], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[1], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point,vertex[1],vertex[2], point]))) INTO sg, sa, sb, sc; + + output := (coalesce(sa,0) * coalesce(va,0) + coalesce(sb,0) * coalesce(vb,0) + coalesce(sc,0) * coalesce(vc,0)) / coalesce(sg); + RETURN output; + + -- IDW + -- p1: limit the number of neighbors, 0->no limit + -- p2: order of distance decay, 0-> order 1 + ELSIF method = 2 THEN + + IF p2 = 0 THEN + p2 := 1; + END IF; + + WITH a as (SELECT unnest(geomin) as g, unnest(colin) as v), + b as (SELECT a.g, a.v FROM a ORDER BY point<->a.g) + SELECT array_agg(b.g), array_agg(b.v) INTO gs, vs FROM b; + IF p1::integer>0 THEN + gs2:='{}'; -- start empty so that only the p1 nearest points (gs is ordered by distance) are kept + vs2:='{}'; + FOR i IN 1..p1 + LOOP + gs2 := gs2 || gs[i]; + vs2 := vs2 || vs[i]; + END LOOP; + ELSE + gs2:=gs; + vs2:=vs; + END IF; + + WITH a as (SELECT unnest(gs2) as g, unnest(vs2) as v), + b as ( + SELECT + (1/ST_distance(point, a.g)^p2::integer) as k, + (a.v/ST_distance(point, a.g)^p2::integer) as f + FROM a + ) + SELECT sum(b.f)/sum(b.k) INTO output FROM b; + RETURN output; + + -- kriging + ELSIF method = 3 THEN + + -- TO DO + + END IF; + + RETURN -777.777; + +END; +$$ +language plpgsql IMMUTABLE PARALLEL SAFE; +-- ============================================================================================= +-- +-- CDB_Voronoi +-- +-- ============================================================================================= +CREATE OR REPLACE FUNCTION CDB_voronoi( + IN geomin geometry[], + IN buffer numeric DEFAULT 0.5, + IN tolerance numeric DEFAULT 1e-9 + ) +RETURNS geometry AS $$ 
+DECLARE + geomout geometry; +BEGIN + -- we need to make the geometry calculations in (pseudo)meters!!! + with a as ( + SELECT unnest(geomin) as g1 + ), + b as( + SELECT st_transform(g1, 3857) g2 from a + ) + SELECT array_agg(g2) INTO geomin from b; + + WITH + convexhull_1 as ( + SELECT + ST_ConvexHull(ST_Collect(geomin)) as g, + buffer * |/ (st_area(ST_ConvexHull(ST_Collect(geomin)))/PI()) as r + ), + clipper as( + SELECT + st_buffer(ST_MinimumBoundingCircle(a.g), buffer*a.r) as g + FROM convexhull_1 a + ), + env0 as ( + SELECT + (st_dumppoints(st_expand(a.g, buffer*a.r))).geom as e + FROM convexhull_1 a + ), + env as ( + SELECT + array_agg(env0.e) as e + FROM env0 + ), + sample AS ( + SELECT + ST_Collect(geomin || env.e) as geom + FROM env + ), + convexhull as ( + SELECT + ST_ConvexHull(ST_Collect(geomin)) as cg + ), + tin as ( + SELECT + ST_Dump(ST_DelaunayTriangles(geom, tolerance, 0)) as gd + FROM + sample + ), + tin_polygons as ( + SELECT + (gd).Path as id, + (gd).Geom as pg, + ST_Centroid(ST_MinimumBoundingCircle((gd).Geom, 180)) as ct + FROM tin + ), + tin_lines as ( + SELECT + id, + ST_ExteriorRing(pg) as lg + FROM tin_polygons + ), + tin_nodes as ( + SELECT + id, + ST_PointN(lg,1) p1, + ST_PointN(lg,2) p2, + ST_PointN(lg,3) p3 + FROM tin_lines + ), + tin_edges AS ( + SELECT + p.id, + UNNEST(ARRAY[ + ST_MakeLine(n.p1,n.p2) , + ST_MakeLine(n.p2,n.p3) , + ST_MakeLine(n.p3,n.p1)]) as Edge, + ST_Force2D(cdb_crankshaft._Find_Circle(n.p1,n.p2,n.p3)) as ct, + CASE WHEN st_distance(p.ct, ST_ExteriorRing(p.pg)) < tolerance THEN + TRUE + ELSE FALSE END AS ctx, + p.pg, + ST_within(p.ct, convexhull.cg) as ctin + FROM + tin_polygons p, + tin_nodes n, + convexhull + WHERE p.id = n.id + ), + voro_nodes as ( + SELECT + CASE WHEN x.ctx = TRUE THEN + ST_Centroid(x.edge) + ELSE + x.ct + END as xct, + CASE WHEN y.id is null THEN + CASE WHEN x.ctin = TRUE THEN + ST_SetSRID(ST_MakePoint( + ST_X(x.ct) + ((ST_X(ST_Centroid(x.edge)) - ST_X(x.ct)) * (1+buffer)), + ST_Y(x.ct) + 
((ST_Y(ST_Centroid(x.edge)) - ST_Y(x.ct)) * (1+buffer)) + ), ST_SRID(x.ct)) + END + ELSE + y.ct + END as yct + FROM + tin_edges x + LEFT OUTER JOIN + tin_edges y + ON x.id <> y.id AND ST_Equals(x.edge, y.edge) + ), + voro_edges as( + SELECT + ST_LineMerge(ST_Collect(ST_MakeLine(xct, yct))) as v + FROM + voro_nodes + ), + voro_cells as( + SELECT + ST_Polygonize( + ST_Node( + ST_LineMerge( + ST_Union(v, ST_ExteriorRing( + ST_Convexhull(v) + ) + ) + ) + ) + ) as g + FROM + voro_edges + ), + voro_set as( + SELECT + (st_dump(v.g)).geom as g + FROM voro_cells v + ), + clipped_voro as( + SELECT + ST_intersection(c.g, v.g) as g + FROM + voro_set v, + clipper c + WHERE + ST_GeometryType(v.g) = 'ST_Polygon' + ) + SELECT + st_collect( + ST_Transform( + ST_ConvexHull(g), + 4326 + ) + ) + INTO geomout + FROM + clipped_voro; + RETURN geomout; +END; +$$ language plpgsql IMMUTABLE PARALLEL SAFE; + +/** ---------------------------------------------------------------------------------------- + * @function : FindCircle + * @precis : Function that determines if three points form a circle. If so a table containing + * centre and radius is returned. If not, a null table is returned. + * @version : 1.0.1 + * @param : p_pt1 : First point in curve + * @param : p_pt2 : Second point in curve + * @param : p_pt3 : Third point in curve + * @return : geometry : In which X,Y ordinates are the centre X, Y and the Z being the radius of found circle + * or NULL if three points do not form a circle. + * @history : Simon Greener - Feb 2012 - Original coding. + * Rafa de la Torre - Aug 2016 - Small fix for type checking + * Raul Marin - Sept 2017 - Remove unnecessary NULL checks and set function categories + * @copyright : Simon Greener @ 2012 + * Licensed under a Creative Commons Attribution-Share Alike 2.5 Australia License. 
(http://creativecommons.org/licenses/by-sa/2.5/au/) +**/ +CREATE OR REPLACE FUNCTION _Find_Circle( + IN p_pt1 geometry, + IN p_pt2 geometry, + IN p_pt3 geometry) + RETURNS geometry AS +$BODY$ +DECLARE + v_Centre geometry; + v_radius NUMERIC; + v_CX NUMERIC; + v_CY NUMERIC; + v_dA NUMERIC; + v_dB NUMERIC; + v_dC NUMERIC; + v_dD NUMERIC; + v_dE NUMERIC; + v_dF NUMERIC; + v_dG NUMERIC; +BEGIN + IF ( ST_GeometryType(p_pt1) <> 'ST_Point' OR + ST_GeometryType(p_pt2) <> 'ST_Point' OR + ST_GeometryType(p_pt3) <> 'ST_Point' ) THEN + RAISE EXCEPTION 'All supplied geometries must be points.'; + RETURN NULL; + END IF; + v_dA := ST_X(p_pt2) - ST_X(p_pt1); + v_dB := ST_Y(p_pt2) - ST_Y(p_pt1); + v_dC := ST_X(p_pt3) - ST_X(p_pt1); + v_dD := ST_Y(p_pt3) - ST_Y(p_pt1); + v_dE := v_dA * (ST_X(p_pt1) + ST_X(p_pt2)) + v_dB * (ST_Y(p_pt1) + ST_Y(p_pt2)); + v_dF := v_dC * (ST_X(p_pt1) + ST_X(p_pt3)) + v_dD * (ST_Y(p_pt1) + ST_Y(p_pt3)); + v_dG := 2.0 * (v_dA * (ST_Y(p_pt3) - ST_Y(p_pt2)) - v_dB * (ST_X(p_pt3) - ST_X(p_pt2))); + -- If v_dG is zero then the three points are collinear and no finite-radius + -- circle through them exists. 
+ IF ( v_dG = 0 ) THEN + RETURN NULL; + ELSE + v_CX := (v_dD * v_dE - v_dB * v_dF) / v_dG; + v_CY := (v_dA * v_dF - v_dC * v_dE) / v_dG; + v_Radius := SQRT(POWER(ST_X(p_pt1) - v_CX,2) + POWER(ST_Y(p_pt1) - v_CY,2) ); + END IF; + RETURN ST_SetSRID(ST_MakePoint(v_CX, v_CY, v_radius),ST_Srid(p_pt1)); +END; +$BODY$ + LANGUAGE plpgsql IMMUTABLE STRICT PARALLEL SAFE; + +-- Moran's I Global Measure (public-facing) +CREATE OR REPLACE FUNCTION + CDB_AreasOfInterestGlobal( + subquery TEXT, + column_name TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS TABLE (moran NUMERIC, significance NUMERIC) +AS $$ + from crankshaft.clustering import Moran + # TODO: use named parameters or a dictionary + moran = Moran() + return moran.global_stat(subquery, column_name, w_type, + num_ngbrs, permutations, geom_col, id_col) +$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; + +-- Moran's I Local (internal function) +CREATE OR REPLACE FUNCTION + _CDB_AreasOfInterestLocal( + subquery TEXT, + column_name TEXT, + w_type TEXT, + num_ngbrs INT, + permutations INT, + geom_col TEXT, + id_col TEXT) +RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + from crankshaft.clustering import Moran + moran = Moran() + # TODO: use named parameters or a dictionary + return moran.local_stat(subquery, column_name, w_type, + num_ngbrs, permutations, geom_col, id_col) +$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE; + +-- Moran's I Local (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_AreasOfInterestLocal( + subquery TEXT, + column_name TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, 
vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col); + +$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE; + +-- Moran's I only for HH and HL (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_GetSpatialHotspots( + subquery TEXT, + column_name TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') + RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col) + WHERE quads IN ('HH', 'HL'); + +$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE; + +-- Moran's I only for LL and LH (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_GetSpatialColdspots( + subquery TEXT, + attr TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') + RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col) + WHERE quads IN ('LL', 'LH'); + +$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE; + +-- Moran's I only for LH and HL (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_GetSpatialOutliers( + subquery TEXT, + attr TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') + RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, 
num_ngbrs, permutations, geom_col, id_col)
+  WHERE quads IN ('HL', 'LH');
+
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Global Rate (public-facing function)
+-- Hands its arguments, in declaration order, to Moran.global_rate_stat and
+-- returns that call's result as (moran, significance) rows.
+CREATE OR REPLACE FUNCTION
+  CDB_AreasOfInterestGlobalRate(
+      subquery TEXT,
+      numerator TEXT,
+      denominator TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (moran FLOAT, significance FLOAT)
+AS $$
+  from crankshaft.clustering import Moran
+  # TODO: use named parameters or a dictionary
+  return Moran().global_rate_stat(
+      subquery, numerator, denominator, w_type,
+      num_ngbrs, permutations, geom_col, id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+
+-- Moran's I Local Rate (internal function)
+-- No defaults here: the public wrappers below supply every argument.
+CREATE OR REPLACE FUNCTION
+  _CDB_AreasOfInterestLocalRate(
+      subquery TEXT,
+      numerator TEXT,
+      denominator TEXT,
+      w_type TEXT,
+      num_ngbrs INT,
+      permutations INT,
+      geom_col TEXT,
+      id_col TEXT)
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+  from crankshaft.clustering import Moran
+  # TODO: use named parameters or a dictionary
+  return Moran().local_rate_stat(
+      subquery, numerator, denominator, w_type,
+      num_ngbrs, permutations, geom_col, id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local Rate (public-facing function)
+-- Returns every row produced by the internal function, unfiltered.
+CREATE OR REPLACE FUNCTION
+  CDB_AreasOfInterestLocalRate(
+      subquery TEXT,
+      numerator TEXT,
+      denominator TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+
+  SELECT aoi.moran, aoi.quads, aoi.significance, aoi.rowid, aoi.vals
+    FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(
+           subquery, numerator, denominator, w_type,
+           num_ngbrs, permutations, geom_col, id_col) AS aoi;
+
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local Rate only for HH and HL (public-facing function)
+CREATE OR REPLACE FUNCTION
+  CDB_GetSpatialHotspotsRate(
+      subquery TEXT,
+      numerator TEXT,
+      denominator TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+
+  SELECT aoi.moran, aoi.quads, aoi.significance, aoi.rowid, aoi.vals
+    FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(
+           subquery, numerator, denominator, w_type,
+           num_ngbrs, permutations, geom_col, id_col) AS aoi
+   WHERE aoi.quads IN ('HH', 'HL');
+
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local Rate only for LL and LH (public-facing function)
+CREATE OR REPLACE FUNCTION
+  CDB_GetSpatialColdspotsRate(
+      subquery TEXT,
+      numerator TEXT,
+      denominator TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+
+  SELECT aoi.moran, aoi.quads, aoi.significance, aoi.rowid, aoi.vals
+    FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(
+           subquery, numerator, denominator, w_type,
+           num_ngbrs, permutations, geom_col, id_col) AS aoi
+   WHERE aoi.quads IN ('LL', 'LH');
+
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local Rate only for LH and HL (public-facing function)
+CREATE OR REPLACE FUNCTION
+  CDB_GetSpatialOutliersRate(
+      subquery TEXT,
+      numerator TEXT,
+      denominator TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+
+  SELECT aoi.moran, aoi.quads, aoi.significance, aoi.rowid, aoi.vals
+    FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(
+           subquery, numerator, denominator, w_type,
+           num_ngbrs, permutations, geom_col, id_col) AS aoi
+   WHERE aoi.quads IN ('HL', 'LH');
+
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+-- Spatial k-means clustering
+-- Forwards (query, no_clusters, no_init) to Kmeans.spatial.
+
+CREATE OR REPLACE FUNCTION CDB_KMeans(
+  query TEXT,
+  no_clusters INTEGER,
+  no_init INTEGER DEFAULT 20
+)
+RETURNS TABLE(
+  cartodb_id INTEGER,
+  cluster_no INTEGER
+) AS $$
+
+from crankshaft.clustering import Kmeans
+return Kmeans().spatial(query, no_clusters, no_init)
+
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- Non-spatial k-means clustering
+-- query: sql query to retrieve all the needed data
+-- colnames: text array of column names for doing the clustering analysis
+-- no_clusters: number of requested clusters
+-- standardize: whether to scale variables to a mean of zero and a standard
+--              deviation of 1
+-- id_col: name of the id column
+
+CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial(
+  query TEXT,
+  colnames TEXT[],
+  no_clusters INTEGER,
+  standardize BOOLEAN DEFAULT true,
+  id_col TEXT DEFAULT 'cartodb_id'
+)
+RETURNS TABLE(
+  cluster_label text,
+  cluster_center json,
+  silhouettes numeric,
+  inertia numeric,
+  rowid bigint
+) AS $$
+
+from crankshaft.clustering import Kmeans
+return Kmeans().nonspatial(query, colnames, no_clusters,
+                           standardize=standardize,
+                           id_col=id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+
+-- State transition function of the CDB_WeightedMean aggregate.
+-- state holds {sum(x * weight), sum(y * weight), sum(weight)}; a row whose
+-- weight or geometry is NULL leaves the three accumulators as they were.
+CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(
+  state NUMERIC[],
+  the_geom GEOMETRY(Point, 4326),
+  weight NUMERIC
+)
+RETURNS Numeric[] AS $$
+DECLARE
+  acc_x NUMERIC;
+  acc_y NUMERIC;
+  acc_w NUMERIC;
+BEGIN
+  acc_x := state[1];
+  acc_y := state[2];
+  acc_w := state[3];
+  IF weight IS NOT NULL AND the_geom IS NOT NULL THEN
+    acc_x := acc_x + ST_X(the_geom)*weight;
+    acc_y := acc_y + ST_Y(the_geom)*weight;
+    acc_w := acc_w + weight;
+  END IF;
+  RETURN Array[acc_x,acc_y,acc_w];
+
+END
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
+
+
+-- Final function of the CDB_WeightedMean aggregate: divides the weighted
+-- coordinate sums by the total weight. With a total weight of zero the raw
+-- sums are used as coordinates, which avoids a division by zero.
+CREATE OR REPLACE FUNCTION CDB_WeightedMeanF(state NUMERIC[])
+RETURNS GEOMETRY AS
+$$
+BEGIN
+  IF state[3] = 0 THEN
+    RETURN ST_SetSRID(ST_MakePoint(state[1],state[2]), 4326);
+  END IF;
+  RETURN ST_SETSRID(ST_MakePoint(state[1]/state[3], state[2]/state[3]),4326);
+END
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
+
+-- Create aggregate if it did not exist
+DO $$
+BEGIN
+  IF NOT EXISTS (
+      SELECT 1
+        FROM pg_catalog.pg_proc p
+        JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
+       WHERE n.nspname = 'cdb_crankshaft'
+         AND p.proname = 'cdb_weightedmean'
+         AND p.proisagg)
+  THEN
+    CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
+      SFUNC = CDB_WeightedMeanS,
+      FINALFUNC = CDB_WeightedMeanF,
+      STYPE = Numeric[],
+      PARALLEL = SAFE,
+      INITCOND = "{0.0,0.0,0.0}"
+    );
+  END IF;
+END
+$$ LANGUAGE plpgsql;
+-- Spatial Markov
+
+-- input table format:
+-- id | geom | date_1 | date_2 | date_3
+--  1 | Pt1  | 12.3   | 13.1   | 14.2
+--  2 | Pt2  | 11.0   | 13.2   | 12.5
+-- ...
+-- Sample Function call:
+-- SELECT CDB_SpatialMarkovTrend('SELECT * FROM real_estate',
+--                               Array['date_1', 'date_2', 'date_3'])
+
+CREATE OR REPLACE FUNCTION
+  CDB_SpatialMarkovTrend (
+      subquery TEXT,
+      time_cols TEXT[],
+      num_classes INT DEFAULT 7,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (trend NUMERIC, trend_up NUMERIC, trend_down NUMERIC, volatility NUMERIC, rowid INT)
+AS $$
+
+  from crankshaft.space_time_dynamics import Markov
+
+  ## TODO: use named parameters or a dictionary
+  return Markov().spatial_trend(
+      subquery, time_cols, num_classes, w_type,
+      num_ngbrs, permutations, geom_col, id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- input table format: identical to above but in a predictable format
+-- Sample function call:
+-- SELECT cdb_spatial_markov('SELECT * FROM real_estate',
+--                           'date_1')
+
+
+-- CREATE OR REPLACE
FUNCTION +-- cdb_spatial_markov ( +-- subquery TEXT, +-- time_col_min text, +-- time_col_max text, +-- date_format text, -- '_YYYY_MM_DD' +-- num_time_per_bin INT DEFAULT 1, +-- permutations INT DEFAULT 99, +-- geom_column TEXT DEFAULT 'the_geom', +-- id_col TEXT DEFAULT 'cartodb_id', +-- w_type TEXT DEFAULT 'knn', +-- num_ngbrs int DEFAULT 5) +-- RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT) +-- AS $$ +-- plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()') +-- from crankshaft.clustering import moran_local +-- # TODO: use named parameters or a dictionary +-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs) +-- $$ LANGUAGE plpythonu; +-- +-- -- input table format: +-- -- id | geom | date | measurement +-- -- 1 | Pt1 | 12/3 | 13.2 +-- -- 2 | Pt2 | 11/5 | 11.3 +-- -- 3 | Pt1 | 11/13 | 12.9 +-- -- 4 | Pt3 | 12/19 | 10.1 +-- -- ... +-- +-- CREATE OR REPLACE FUNCTION +-- cdb_spatial_markov ( +-- subquery TEXT, +-- time_col text, +-- num_time_per_bin INT DEFAULT 1, +-- permutations INT DEFAULT 99, +-- geom_column TEXT DEFAULT 'the_geom', +-- id_col TEXT DEFAULT 'cartodb_id', +-- w_type TEXT DEFAULT 'knn', +-- num_ngbrs int DEFAULT 5) +-- RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT) +-- AS $$ +-- plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()') +-- from crankshaft.clustering import moran_local +-- # TODO: use named parameters or a dictionary +-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs) +-- $$ LANGUAGE plpythonu; +-- Based on: +-- https://github.com/mapbox/polylabel/blob/master/index.js +-- https://sites.google.com/site/polesofinaccessibility/ +-- Requires: https://github.com/CartoDB/cartodb-postgresql + +-- Based on: +-- https://github.com/mapbox/polylabel/blob/master/index.js +-- https://sites.google.com/site/polesofinaccessibility/ +-- Requires: 
https://github.com/CartoDB/cartodb-postgresql
+
+-- Pole of inaccessibility: looks for the location whose signed distance to
+-- the polygon outline (cdb_crankshaft._Signed_Dist) is largest, by laying a
+-- square grid over the polygon and repeatedly subdividing promising cells.
+--
+-- @param polygon   geometry to analyse; it is transformed to EPSG:3857
+--                  before any measurement is taken
+--
+-- @param tolerance a cell is subdivided only while the best distance it
+--                  could still contain exceeds the current best by more
+--                  than this amount
+--
+-- Returns: centroid of the best cell found, transformed to EPSG:4326, or
+--          NULL when polygon is NULL or empty
+--
+-- Raises: an exception when tolerance is NULL or not positive, because the
+--         subdivision would then have no stopping criterion
+--
+CREATE OR REPLACE FUNCTION CDB_PIA(
+    IN polygon geometry,
+    IN tolerance numeric DEFAULT 1.0
+    )
+RETURNS geometry  AS $$
+DECLARE
+    cells geometry[];
+    cell geometry;
+    best_c geometry;
+    best_d numeric;
+    test_d numeric;
+    test_mx numeric;
+    test_h numeric;
+    test_cells geometry[];
+    width numeric;
+    height numeric;
+    h numeric;
+    i integer;
+    n integer;
+    sqr numeric;
+BEGIN
+    -- Without these guards the refinement loop below never terminates:
+    -- a NULL/empty polygon yields no grid cells (NULL cell count) and a
+    -- NULL or non-positive tolerance never lets a cell be discarded.
+    IF polygon IS NULL OR ST_IsEmpty(polygon) THEN
+        RETURN NULL;
+    END IF;
+    IF tolerance IS NULL OR tolerance <= 0 THEN
+        RAISE EXCEPTION 'tolerance must be a positive number, got %', tolerance;
+    END IF;
+
+    -- half the diagonal of a unit square: centre-to-corner distance factor
+    sqr := 0.5*(|/2.0);
+    polygon := ST_Transform(polygon, 3857);
+
+    -- grid #0 cell size
+    height := ST_YMax(polygon) - ST_YMin(polygon);
+    width := ST_XMax(polygon) - ST_XMin(polygon);
+    h := 0.5*LEAST(height, width);
+
+    -- grid #0
+    with c1 as(
+        SELECT cdb_crankshaft.CDB_RectangleGrid(polygon, h, h) as c
+    )
+    SELECT array_agg(c) INTO cells FROM c1;
+
+    -- 1st guess: centroid (best_c keeps the polygon itself; its centroid
+    -- is taken when the result is returned)
+    best_c := polygon;
+    best_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(polygon));
+
+    -- walk the work list; it grows while cells are being subdivided
+    n := coalesce(array_length(cells, 1), 0);
+    i := 1;
+    WHILE i <= n LOOP
+
+        cell := cells[i];
+
+        i := i+1;
+
+        -- cell side size, it's square
+        test_h := ST_XMax(cell) - ST_XMin(cell) ;
+
+        -- check distance
+        test_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(cell));
+
+        IF test_d > best_d THEN
+            best_d := test_d;
+            best_c := cell;
+        END IF;
+
+        -- longest distance within the cell
+        test_mx := test_d + (test_h * sqr);
+
+        -- if the cell has no chance to contain the desired point, continue
+        CONTINUE WHEN test_mx - best_d <= tolerance;
+
+        -- resample the cell
+        with c1 as(
+            SELECT cdb_crankshaft.CDB_RectangleGrid(cell, test_h/2, test_h/2) as c
+        )
+        SELECT array_agg(c) INTO test_cells FROM c1;
+
+        -- concat the new cells to the former array
+        cells := cells || test_cells;
+
+        -- prepare next iteration
+        n := coalesce(array_length(cells, 1), 0);
+
+    END LOOP;
+
+    RETURN ST_transform(ST_Centroid(best_c), 4326);
+
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+
+
+
+-- signed distance point to polygon with holes
+-- negative
if the point is outside the polygon
+-- rev 1. adding MULTIPOLYGON and GEOMETRYCOLLECTION support by @abelvm
+--
+-- For every part dumped from `polygon` the distance from `point` to the
+-- exterior ring and to each interior ring is measured; the smallest
+-- distance over all parts is returned, multiplied by +1 when the point is
+-- within the part that produced it and by -1 otherwise.
+CREATE OR REPLACE FUNCTION _Signed_Dist(
+    IN polygon geometry,
+    IN point geometry
+    )
+RETURNS numeric  AS $$
+DECLARE
+    pols geometry[];
+    pol geometry;
+    i integer;
+    j integer;
+    within integer;
+    w integer;
+    holes integer;
+    dist numeric;
+    d numeric;
+BEGIN
+    -- start from a value larger than any real distance
+    dist := 1e999;
+    WITH collection as (SELECT (ST_dump(polygon)).geom as geom) SELECT array_agg(geom) into pols FROM collection;
+    FOR j in 1..array_length(pols, 1)
+    LOOP
+        pol := pols[j];
+        d := dist;
+        SELECT LEAST(dist, ST_distance(point, ST_ExteriorRing(pol))::numeric) INTO d;
+        SELECT CASE WHEN ST_Within(point,pol) THEN 1 ELSE -1 END INTO w;
+        SELECT ST_NumInteriorRings(pol) INTO holes;
+        IF holes > 0 THEN
+            FOR i IN 1..holes
+            LOOP
+                SELECT LEAST(d, ST_distance(point, ST_InteriorRingN(pol, i))::numeric) INTO d;
+            END LOOP;
+        END IF;
+        -- keep the sign of the part that gave the shortest distance so far
+        IF d < dist THEN
+            dist:= d;
+            within := w;
+        END IF;
+    END LOOP;
+    dist := dist * within::numeric;
+    RETURN dist;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+--
+-- Iterative densification of a set of points using Delaunay triangulation
+-- the new points have as assigned value the average value of the 3 vertex (centroid)
+--
+-- @param geomin - array of geometries (points)
+--
+-- @param colin - array of numeric values in that points
+--
+-- @param iterations - integer, number of iterations
+--
+--
+-- Returns: TABLE(geomout geometry, colout numeric)
+--
+--
+CREATE OR REPLACE FUNCTION CDB_Densify(
+    IN geomin geometry[],
+    IN colin numeric[],
+    IN iterations integer
+    )
+RETURNS TABLE(geomout geometry, colout numeric)  AS $$
+DECLARE
+    geotemp geometry[];
+    coltemp numeric[];
+    i integer;
+    gs geometry[];
+    g geometry;
+    vertex geometry[];
+    va numeric;
+    vb numeric;
+    vc numeric;
+    center geometry;
+    centerval numeric;
+BEGIN
+    geotemp := geomin;
+    coltemp := colin;
+    FOR i IN 1..iterations
+    LOOP
+        -- generate TIN
+        WITH    a as (SELECT unnest(geotemp) AS e),
+                b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
+                c as (SELECT (ST_Dump(t)).geom AS v FROM b)
+        SELECT array_agg(v) INTO gs FROM c;
+        -- loop cells
+        FOREACH g IN ARRAY gs
+        LOOP
+            -- centroid of the triangle: the point to be added
+            SELECT ST_Centroid(g) INTO center;
+            -- retrieve the value of each vertex
+            -- NOTE: geotemp and coltemp must have the same length here.
+            -- Parallel unnest() calls in a select list only pair elements
+            -- one-to-one when both arrays are equally long, so the new
+            -- centroid is appended only after the lookups, together with
+            -- its value.
+            WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
+            SELECT array_agg(v) INTO vertex FROM a;
+            WITH a AS(SELECT unnest(geotemp) as geo, unnest(coltemp) as c)
+            SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
+            WITH a AS(SELECT unnest(geotemp) as geo, unnest(coltemp) as c)
+            SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
+            WITH a AS(SELECT unnest(geotemp) as geo, unnest(coltemp) as c)
+            SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
+            -- calc the value at the center
+            centerval := (va + vb + vc) / 3;
+            -- append the centroid and its value as a pair
+            geotemp := array_append(geotemp, center);
+            coltemp := array_append(coltemp, centerval);
+        END LOOP;
+    END LOOP;
+    RETURN QUERY SELECT unnest(geotemp ) as geomout, unnest(coltemp ) as colout;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+-- Densifies the input with CDB_Densify, triangulates the densified points
+-- and returns one row per triangle: the triangle geometry and the average
+-- of the values found at its first three dumped vertices.
+CREATE OR REPLACE FUNCTION CDB_TINmap(
+    IN geomin geometry[],
+    IN colin numeric[],
+    IN iterations integer
+    )
+RETURNS TABLE(geomout geometry, colout numeric)  AS $$
+DECLARE
+    p geometry[];
+    vals numeric[];
+    gs geometry[];
+    g geometry;
+    vertex geometry[];
+    centerval numeric;
+    va numeric;
+    vb numeric;
+    vc numeric;
+    coltemp numeric[];
+BEGIN
+    SELECT array_agg(dens.geomout), array_agg(dens.colout) INTO p, vals FROM cdb_crankshaft.CDB_Densify(geomin, colin, iterations) dens;
+    WITH    a as (SELECT unnest(p) AS e),
+            b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
+            c as (SELECT (ST_Dump(t)).geom AS v FROM b)
+    SELECT array_agg(v) INTO gs FROM c;
+    FOREACH g IN ARRAY gs
+    LOOP
+        -- retrieve the vertex of each triangle
+        WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
+        SELECT array_agg(v) INTO vertex FROM a;
+        -- retrieve the value of each vertex
+        WITH a AS(SELECT unnest(p) as geo, unnest(vals) as c)
+        SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
+        WITH a AS(SELECT unnest(p) as geo, unnest(vals) as c)
+        SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
+        WITH a AS(SELECT unnest(p) as geo, unnest(vals) as c)
+        SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
+        -- calc the value at the center
+        centerval := (va + vb + vc) / 3;
+        -- append the value
+        coltemp := array_append(coltemp, centerval);
+    END LOOP;
+    RETURN QUERY SELECT unnest(gs) as geomout, unnest(coltemp ) as colout;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+-- Getis-Ord's G
+-- Hotspot/Coldspot Analysis tool
+-- Forwards its arguments, in declaration order, to Getis.getis_ord.
+CREATE OR REPLACE FUNCTION
+  CDB_GetisOrdsG(
+      subquery TEXT,
+      column_name TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 999,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (z_score NUMERIC, p_value NUMERIC, p_z_sim NUMERIC, rowid BIGINT)
+AS $$
+  from crankshaft.clustering import Getis
+  getis = Getis()
+  return getis.getis_ord(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- TODO: make a version that accepts the values as arrays
+
+-- Find outliers using a static threshold
+-- Returns true when column_value is strictly greater than threshold.
+--
+CREATE OR REPLACE FUNCTION CDB_StaticOutlier(column_value numeric, threshold numeric)
+RETURNS boolean
+AS $$
+BEGIN
+
+  RETURN column_value > threshold;
+
+END;
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE ;
+
+-- Find outliers by a percentage above the threshold
+-- TODO: add symmetric option?
`is_symmetric boolean DEFAULT false`
+
+-- Flags each value whose ratio to the mean of column_values is greater
+-- than outlier_fraction. Output rows pair the flags with ids by position.
+-- Raises an exception when the mean is zero.
+CREATE OR REPLACE FUNCTION CDB_PercentOutlier(column_values numeric[], outlier_fraction numeric, ids int[])
+RETURNS TABLE(is_outlier boolean, rowid int)
+AS $$
+DECLARE
+  mean_val numeric;
+  flags boolean[];
+BEGIN
+
+  SELECT avg(v) INTO mean_val
+    FROM unnest(column_values) AS t(v);
+
+  -- the ratio below is undefined for a zero mean
+  IF mean_val = 0 THEN
+    RAISE EXCEPTION 'Mean value is zero. Try another outlier method.';
+  END IF;
+
+  SELECT array_agg(v / mean_val > outlier_fraction) INTO flags
+    FROM unnest(column_values) AS t(v);
+
+  RETURN QUERY
+  SELECT unnest(flags) As is_outlier,
+         unnest(ids) As rowid;
+
+END;
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
+
+-- Find outliers above a given number of standard deviations from the mean
+-- With is_symmetric the absolute deviation is tested, otherwise only
+-- values above the mean can be flagged. Raises an exception when the
+-- standard deviation is zero.
+
+CREATE OR REPLACE FUNCTION CDB_StdDevOutlier(column_values numeric[], num_deviations numeric, ids int[], is_symmetric boolean DEFAULT true)
+RETURNS TABLE(is_outlier boolean, rowid int)
+AS $$
+DECLARE
+  sd_val numeric;
+  mean_val numeric;
+  flags boolean[];
+BEGIN
+
+  SELECT stddev(v), avg(v) INTO sd_val, mean_val
+    FROM unnest(column_values) AS t(v);
+
+  IF sd_val = 0 THEN
+    RAISE EXCEPTION 'Standard deviation of input data is zero';
+  END IF;
+
+  SELECT array_agg(
+           CASE WHEN is_symmetric
+                THEN abs(v - mean_val) / sd_val
+                ELSE (v - mean_val) / sd_val
+           END > num_deviations) INTO flags
+    FROM unnest(column_values) AS t(v);
+
+  RETURN QUERY
+  SELECT unnest(flags) As is_outlier,
+         unnest(ids) As rowid;
+END;
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
+-- Builds a rectangular grid over the buffered convex hull of geomin,
+-- interpolates a value at every cell centroid, classifies the values into
+-- bins and returns one row per bin with the union of its cells and the
+-- min / max / avg of the interpolated values.
+--
+-- intmethod   = 1 uses _interp_in_tin on a TIN built once from geomin;
+--               any other value is passed on to CDB_SpatialInterpolation
+-- classmethod = 0 equal interval, 1 heads/tails, 2 Jenks, else quantiles
+-- max_time    is copied into `resolution` and then negated; when
+--               `resolution` is <= 0 the cell size is derived from the
+--               estimated cell count, otherwise `resolution` itself is
+--               used as the cell size
+CREATE OR REPLACE FUNCTION CDB_Contour(
+    IN geomin geometry[],
+    IN colin numeric[],
+    IN buffer numeric,
+    IN intmethod integer,
+    IN classmethod integer,
+    IN steps integer,
+    IN max_time integer DEFAULT 60000
+    )
+RETURNS TABLE(
+    the_geom geometry,
+    bin integer,
+    min_value numeric,
+    max_value numeric,
+    avg_value numeric
+)  AS $$
+DECLARE
+    cell_count integer;
+    tin geometry[];
+    resolution integer;
+BEGIN
+
+    -- nasty trick to override issue #121
+    IF max_time = 0 THEN
+        max_time = -90;
+    END IF;
+    resolution := max_time;
+    max_time := -1 * resolution;
+
+    -- calc the optimal number of cells for the current dataset
+    SELECT
+        CASE intmethod
+            WHEN 0 THEN round(3.7745903782 * max_time - 9.4399210051 * array_length(geomin,1) - 1350.8778213073)
+            WHEN 1 THEN round(2.2855592156 * max_time - 87.285217133 * array_length(geomin,1) + 17255.7085601797)
+            WHEN 2 THEN round(0.9799471999 * max_time - 127.0334085369 * array_length(geomin,1) + 22707.9579721218)
+            ELSE 10000
+        END INTO cell_count;
+
+    -- we don't have iterative barycentric interpolation in CDB_interpolation,
+    -- and it's a costly function, so let's make a custom one here till
+    -- we update the code
+    -- tin := ARRAY[]::geometry[];
+    IF intmethod=1 THEN
+        WITH
+            a as (SELECT unnest(geomin) AS e),
+            b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
+            c as (SELECT (ST_Dump(t)).geom as v FROM b)
+        SELECT array_agg(v) INTO tin FROM c;
+    END IF;
+    -- Delaunay stuff performed just ONCE!!
+
+    -- magic
+    RETURN QUERY
+    WITH
+    convexhull as (
+        SELECT
+            ST_ConvexHull(ST_Collect(geomin)) as g,
+            buffer * |/ st_area(ST_ConvexHull(ST_Collect(geomin)))/PI() as r
+    ),
+    envelope as (
+        SELECT
+            st_expand(a.g, a.r) as e
+        FROM convexhull a
+    ),
+    envelope3857 as(
+        SELECT
+            ST_Transform(e, 3857) as geom
+        FROM envelope
+    ),
+    resolution as(
+        SELECT
+            CASE WHEN resolution <= 0 THEN
+                round(|/ (
+                    ST_area(geom) / abs(cell_count)
+                ))
+            ELSE
+                resolution
+            END AS cell
+        FROM envelope3857
+    ),
+    grid as(
+        SELECT
+            ST_Transform(cdb_crankshaft.CDB_RectangleGrid(e.geom, r.cell, r.cell), 4326) as geom
+        FROM envelope3857 e, resolution r
+    ),
+    interp as(
+        SELECT
+            geom,
+            CASE
+                WHEN intmethod=1 THEN cdb_crankshaft._interp_in_tin(geomin, colin, tin, ST_Centroid(geom))
+                ELSE cdb_crankshaft.CDB_SpatialInterpolation(geomin, colin, ST_Centroid(geom), intmethod)
+            END as val
+        FROM grid
+    ),
+    classes as(
+        SELECT CASE
+            WHEN classmethod = 0 THEN
+                cdb_crankshaft.CDB_EqualIntervalBins(array_agg(val), steps)
+            WHEN classmethod = 1 THEN
+                cdb_crankshaft.CDB_HeadsTailsBins(array_agg(val), steps)
+            WHEN classmethod = 2 THEN
+                cdb_crankshaft.CDB_JenksBins(array_agg(val), steps)
+            ELSE
+                cdb_crankshaft.CDB_QuantileBins(array_agg(val), steps)
+            END as b
+        FROM interp
+        where val is not null
+    ),
+    classified as(
+        SELECT
+            i.*,
+            width_bucket(i.val, c.b) as bucket
+        FROM interp i left join classes c
+        ON 1=1
+    ),
+    classified2 as(
+        SELECT
+            geom,
+            val,
+            CASE
+                WHEN bucket = steps THEN bucket - 1
+                ELSE bucket
+            END as b
+        FROM classified
+    ),
+    final as(
+        SELECT
+            st_union(geom) as the_geom,
+            b as bin,
+            min(val) as min_value,
+            max(val) as max_value,
+            avg(val) as avg_value
+        FROM classified2
+        GROUP BY bin
+    )
+    SELECT
+        *
+    FROM final
+    where final.bin is not null
+    ;
+END;
+$$ language plpgsql VOLATILE PARALLEL RESTRICTED;
+
+
+-- =====================================================================
+-- Interp in grid, so we can use barycentric with a
precalculated tin (NNI)
+-- =====================================================================
+-- Picks the triangle of `tin` that `point` is within, looks up the value
+-- stored in `colin` for each of its first three vertices (matched against
+-- `geomin` with ST_Equals) and returns the area-weighted combination
+--     (sa*va + sb*vb + sc*vc) / sg
+-- where sa, sb, sc are the areas of the sub-triangles opposite to each
+-- vertex and sg is the area of the whole triangle.
+-- Returns NULL when the point is not within any triangle.
+CREATE OR REPLACE FUNCTION _interp_in_tin(
+    IN geomin geometry[],
+    IN colin numeric[],
+    IN tin geometry[],
+    IN point geometry
+    )
+RETURNS numeric AS
+$$
+DECLARE
+    tri geometry;
+    corners geometry[];
+    area_all numeric;
+    area_a numeric;
+    area_b numeric;
+    area_c numeric;
+    val_a numeric;
+    val_b numeric;
+    val_c numeric;
+BEGIN
+    -- get the cell the point is within
+    SELECT cell.v INTO tri
+      FROM (SELECT unnest(tin) AS v) AS cell
+     WHERE ST_Within(point, cell.v);
+
+    -- if we're out of the data realm,
+    -- return null
+    IF tri IS NULL THEN
+        RETURN NULL;
+    END IF;
+
+    -- vertex of the selected cell
+    SELECT array_agg(pts.v) INTO corners
+      FROM (SELECT (ST_DumpPoints(tri)).geom AS v) AS pts;
+
+    -- retrieve the value of each vertex
+    SELECT k.c INTO val_a
+      FROM (SELECT unnest(geomin) AS geo, unnest(colin) AS c) AS k
+     WHERE ST_Equals(k.geo, corners[1]);
+
+    SELECT k.c INTO val_b
+      FROM (SELECT unnest(geomin) AS geo, unnest(colin) AS c) AS k
+     WHERE ST_Equals(k.geo, corners[2]);
+
+    SELECT k.c INTO val_c
+      FROM (SELECT unnest(geomin) AS geo, unnest(colin) AS c) AS k
+     WHERE ST_Equals(k.geo, corners[3]);
+
+    -- calc the areas: the whole cell, then the sub-triangle facing each vertex
+    area_all := ST_area(tri);
+    area_a := ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, corners[2], corners[3], point])));
+    area_b := ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, corners[1], corners[3], point])));
+    area_c := ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, corners[1], corners[2], point])));
+
+    -- missing areas/values count as 0, a missing total area as 1
+    RETURN (coalesce(area_a,0) * coalesce(val_a,0)
+            + coalesce(area_b,0) * coalesce(val_b,0)
+            + coalesce(area_c,0) * coalesce(val_c,0)) / coalesce(area_all,1);
+END;
+$$
+language plpgsql IMMUTABLE PARALLEL SAFE;
+-- Function by Stuart Lynn for a simple interpolation of a value
+-- from a polygon table over an arbitrary polygon
+-- (weighted by the area proportion overlapped)
+-- Areal weighting is a very simple form of areal interpolation.
+--
+-- Parameters:
+--   * geom a Polygon geometry which defines the area where a value will be
+--     estimated as the area-weighted sum of a given table/column
+--   * target_table_name table name of the table that provides the values
+--   * target_column column name of the column that provides the values
+--   * schema_name optional parameter to define the schema the target table
+--     belongs to, which is necessary if it's not in the search_path.
+--     Note that target_table_name should never include the schema in it.
+-- Return value:
+--   Areal-weighted interpolation of the column values over the geometry
+-- Both identifiers (schema and table) are quoted with %I, so names that
+-- need quoting work and cannot inject SQL into the dynamic statement.
+CREATE OR REPLACE
+FUNCTION cdb_overlap_sum(geom geometry, target_table_name text, target_column text, schema_name text DEFAULT NULL)
+  RETURNS numeric AS
+$$
+DECLARE
+  result numeric;
+  qualified_name text;
+BEGIN
+  IF schema_name IS NULL THEN
+    qualified_name := Format('%I', target_table_name);
+  ELSE
+    -- %I (not %s) for the table as well, same as the schema-less branch
+    qualified_name := Format('%I.%I', schema_name, target_table_name);
+  END IF;
+  EXECUTE Format('
+    SELECT sum(%I*ST_Area(St_Intersection($1, a.the_geom))/ST_Area(a.the_geom))
+    FROM %s AS a
+    WHERE $1 && a.the_geom
+  ', target_column, qualified_name)
+  USING geom
+  INTO result;
+  RETURN result;
+END;
+$$ LANGUAGE plpgsql STABLE PARALLEL SAFE;
+-- Geographically weighted regression: forwards its arguments, in
+-- declaration order, to GWR.gwr.
+CREATE OR REPLACE FUNCTION
+CDB_GWR(subquery text, dep_var text, ind_vars text[],
+        bw numeric default null, fixed boolean default False,
+        kernel text default 'bisquare', geom_col text default 'the_geom',
+        id_col text default 'cartodb_id')
+RETURNS table(coeffs JSON, stand_errs JSON, t_vals JSON,
+              filtered_t_vals JSON, predicted numeric,
+              residuals numeric, r_squared numeric, bandwidth numeric,
+              rowid bigint)
+AS $$
+
+from crankshaft.regression import GWR
+
+gwr = GWR()
+
+return gwr.gwr(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col)
+
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+
+-- Prediction counterpart of CDB_GWR: forwards its arguments, in
+-- declaration order, to GWR.gwr_predict.
+CREATE OR REPLACE FUNCTION
+CDB_GWR_Predict(subquery text, dep_var text, ind_vars text[],
+                bw numeric default null, fixed boolean default False,
+                kernel text default 'bisquare',
+                geom_col text default 'the_geom',
+                id_col text default 'cartodb_id')
+RETURNS table(coeffs JSON, stand_errs JSON, t_vals JSON,
+              r_squared numeric, predicted numeric, rowid bigint)
+AS $$
+
+from crankshaft.regression import GWR
+gwr = GWR()
+
+return gwr.gwr_predict(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col)
+
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+--
+-- Creates N points randomly distributed around the polygon
+--
+-- @param geom - the geometry to be turned in to points
+--
+-- @param no_points - the number of points to generate; NULL, zero or a
+--   negative number produce no points
+--
+-- @param max_iter_per_point - each point is taken from a random horizontal
+--   line across the geometry's bounding box, clipped by the geometry. When
+--   no point can be derived from a line that attempt is a miss;
+--   max_iter_per_point is the number of consecutive misses accepted for a
+--   point before giving up.
+--
+-- Returns: Multipoint with the generated points, in the SRID of geom
+CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry , no_points Integer, max_iter_per_point Integer DEFAULT 1000)
+RETURNS GEOMETRY AS $$
+DECLARE
+  extent GEOMETRY;
+  test_point Geometry;
+  width NUMERIC;
+  height NUMERIC;
+  x0 NUMERIC;
+  y0 NUMERIC;
+  yp NUMERIC;
+  srid INTEGER;
+  no_left INTEGER;
+  remaining_iterations INTEGER;
+  points GEOMETRY[];
+  bbox_line GEOMETRY;
+  intersection_line GEOMETRY;
+BEGIN
+  extent := ST_Envelope(geom);
+  width := ST_XMax(extent) - ST_XMIN(extent);
+  height := ST_YMax(extent) - ST_YMIN(extent);
+  x0 := ST_XMin(extent);
+  y0 := ST_YMin(extent);
+  -- build the helper line in the geometry's own SRID so that
+  -- ST_Intersection does not fail on inputs that are not EPSG:4326
+  srid := ST_SRID(geom);
+  no_left := coalesce(no_points, 0);
+  remaining_iterations := max_iter_per_point;
+
+  -- "> 0" rather than "= 0" so a negative request cannot loop forever
+  WHILE no_left > 0 LOOP
+    yp = y0 + height*random();
+    -- horizontal line at height yp spanning the bounding box:
+    -- ST_MakePoint takes (x, y), i.e. (x0, yp) .. (x0 + width, yp)
+    bbox_line = ST_MakeLine(
+      ST_SetSRID(ST_MakePoint(x0, yp), srid),
+      ST_SetSRID(ST_MakePoint(x0+width, yp), srid)
+    );
+    intersection_line = ST_Intersection(bbox_line,geom);
+    test_point = ST_LineInterpolatePoint(st_makeline(st_linemerge(intersection_line)),random());
+    IF test_point IS NULL THEN
+      -- a miss: retry until the per-point budget is used up
+      remaining_iterations := remaining_iterations - 1;
+      EXIT WHEN remaining_iterations IS NULL OR remaining_iterations <= 0;
+      CONTINUE;
+    END IF;
+    points := points || test_point;
+    no_left = no_left - 1 ;
+    -- fresh budget for the next point
+    remaining_iterations := max_iter_per_point;
+  END LOOP;
+  RETURN ST_Collect(points);
+END;
+$$
+LANGUAGE plpgsql VOLATILE PARALLEL RESTRICTED;
+-- Make sure by default there are no permissions for publicuser
+-- NOTE: this happens at extension creation time, as part of an implicit transaction.
+-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
+
+-- Grant permissions on the schema to publicuser (but just the schema)
+GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser;
+
+-- Revoke execute permissions on all functions in the schema by default
+-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser;
+--
+-- Fill given extent with a rectangular coverage
+--
+-- @param ext Extent to fill. Only rectangles with center point falling
+--            inside the extent (or at the lower or leftmost edge) will
+--            be emitted. The returned rectangles will have the same SRID
+--            as this extent.
+--
+-- @param width Width of each rectangle
+--
+-- @param height Height of each rectangle
+--
+-- @param origin Optional origin to allow for exact tiling.
+--               If omitted the origin will be 0,0.
+--               The parameter is checked for having the same SRID
+--               as the extent.
+--
+-- Emits one rectangle per grid node (x, y) with x in [hstart, hend) and
+-- y in [vstart, vend), where the grid is aligned to `origin` (0,0 when
+-- omitted) and each rectangle is centred on its node.
+--
+-- Raises: an exception on an SRID mismatch between ext and origin, and
+--         when width or height is zero or negative (a non-positive step
+--         can never reach the far edge of the extent).
+--
+CREATE OR REPLACE FUNCTION CDB_RectangleGrid(ext GEOMETRY, width FLOAT8, height FLOAT8, origin GEOMETRY DEFAULT NULL)
+RETURNS SETOF GEOMETRY
+AS $$
+DECLARE
+  h GEOMETRY; -- rectangle cell
+  hstep FLOAT8; -- horizontal step
+  vstep FLOAT8; -- vertical step
+  hw FLOAT8; -- half width
+  hh FLOAT8; -- half height
+  vstart FLOAT8;
+  hstart FLOAT8;
+  hend FLOAT8;
+  vend FLOAT8;
+  xoff FLOAT8;
+  yoff FLOAT8;
+  xgrd FLOAT8;
+  ygrd FLOAT8;
+  x FLOAT8;
+  y FLOAT8;
+  srid INTEGER;
+BEGIN
+
+  -- a zero step divides by zero below and a negative one makes the
+  -- WHILE loops run forever: reject both explicitly
+  IF width <= 0 OR height <= 0 THEN
+    RAISE EXCEPTION 'width and height must be positive, got % and %', width, height;
+  END IF;
+
+  srid := ST_SRID(ext);
+
+  xoff := 0;
+  yoff := 0;
+
+  IF origin IS NOT NULL THEN
+    IF ST_SRID(origin) != srid THEN
+      RAISE EXCEPTION 'SRID mismatch between extent (%) and origin (%)', srid, ST_SRID(origin);
+    END IF;
+    xoff := ST_X(origin);
+    yoff := ST_Y(origin);
+  END IF;
+
+  --RAISE DEBUG 'X offset: %', xoff;
+  --RAISE DEBUG 'Y offset: %', yoff;
+
+  hw := width/2.0;
+  hh := height/2.0;
+
+  -- node coordinates are snapped to multiples of half a cell
+  xgrd := hw;
+  ygrd := hh;
+  --RAISE DEBUG 'X grid size: %', xgrd;
+  --RAISE DEBUG 'Y grid size: %', ygrd;
+
+  hstep := width;
+  vstep := height;
+
+  -- Tweak horizontal start on hstep grid from origin
+  hstart := xoff + ceil((ST_XMin(ext)-xoff)/hstep)*hstep;
+  --RAISE DEBUG 'hstart: %', hstart;
+
+  -- Tweak vertical start on vstep grid from origin
+  vstart := yoff + ceil((ST_Ymin(ext)-yoff)/vstep)*vstep;
+  --RAISE DEBUG 'vstart: %', vstart;
+
+  hend := ST_XMax(ext);
+  vend := ST_YMax(ext);
+
+  --RAISE DEBUG 'hend: %', hend;
+  --RAISE DEBUG 'vend: %', vend;
+
+  x := hstart;
+  WHILE x < hend LOOP -- over X
+    y := vstart;
+    h := ST_MakeEnvelope(x-hw, y-hh, x+hw, y+hh, srid);
+    WHILE y < vend LOOP -- over Y
+      RETURN NEXT h;
+      -- next cell of the column is the current one shifted up by one step
+      h := ST_Translate(h, 0, vstep);
+      y := yoff + round(((y + vstep)-yoff)/ygrd)*ygrd; -- round to grid
+    END LOOP;
+    x := xoff + round(((x + hstep)-xoff)/xgrd)*xgrd; -- round to grid
+  END LOOP;
+
+  RETURN;
+END
+$$ LANGUAGE 'plpgsql' IMMUTABLE PARALLEL SAFE;
+
+--
+-- Calculate the equal interval bins for a given column
+--
+-- @param in_array A numeric array of numbers to
determine the best
+--             bin boundaries; NULL elements are ignored.
+--
+-- @param breaks The number of bins you want to find.
+--
+--
+-- Returns: upper edges of bins. The first breaks-1 edges are
+--          min + k * (max - min) / breaks, the last edge is the maximum.
+--
+--
+
+CREATE OR REPLACE FUNCTION CDB_EqualIntervalBins ( in_array NUMERIC[], breaks INT ) RETURNS NUMERIC[] as $$
+DECLARE
+  step numeric;
+  lo numeric;
+  hi numeric;
+  k INT := 1;
+  edges numeric[];
+BEGIN
+  SELECT min(e), max(e) INTO lo, hi FROM ( SELECT unnest(in_array) e ) x WHERE e IS NOT NULL;
+  step := (hi - lo) / breaks::numeric;
+  -- interior edges
+  WHILE k < breaks LOOP
+    edges := array_append(edges, lo + k::numeric * step);
+    k := k + 1;
+  END LOOP;
+  -- the last edge is the maximum itself
+  edges := array_append(edges, hi);
+  RETURN edges;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+
+--
+-- Determine the Heads/Tails classifications from a numeric array
+--
+-- @param in_array A numeric array of numbers to determine the best
+--             bins based on the Heads/Tails method.
+--
+-- @param breaks The number of bins you want to find.
+--
+-- Returns: the mean of the array followed by, for each further break, the
+--          mean of the elements greater than the previous break (fewer
+--          values when no elements remain above a break). NULL for an
+--          empty array; the sorted input when breaks exceeds the number
+--          of elements.
+--
+
+CREATE OR REPLACE FUNCTION CDB_HeadsTailsBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$
+DECLARE
+  element_count INT4;
+  arr_mean numeric;
+  i INT := 2;
+  reply numeric[];
+BEGIN
+  -- get the total size of our row
+  -- (array_length, as CDB_JenksBins does: upper - lower is one short)
+  element_count := array_length(in_array, 1);
+  -- ensure the ordering of in_array
+  SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x;
+  -- stop if no rows
+  IF element_count IS NULL THEN
+    RETURN NULL;
+  END IF;
+  -- stop if our breaks are more than our input array size
+  IF element_count < breaks THEN
+    RETURN in_array;
+  END IF;
+
+  -- get our mean value
+  SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x;
+
+  reply = Array[arr_mean];
+  -- slice our bread
+  WHILE i <= breaks LOOP
+    SELECT avg(e) INTO arr_mean FROM ( SELECT unnest(in_array) e) x WHERE e > reply[i-1];
+    -- nothing left above the last break: no later break can exist either
+    EXIT WHEN arr_mean IS NULL;
+    reply = array_append(reply, arr_mean);
+    i := i+1;
+  END LOOP;
+  RETURN reply;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+
+--
+-- Determine the Jenks classifications from a numeric array
+--
+-- @param in_array A numeric array of numbers to determine the best
+--          bins based on the Jenks method.
+--
+-- @param breaks The number of bins you want to find.
+--
+-- @param iterations The number of different starting positions to test.
+--
+-- @param invert Optional whether to return the top of each bin (default)
+--          or the bottom. BOOLEAN, default=FALSE.
+--
+-- Outline: the quantile classification is turned into (bottom, top) index
+-- pairs and scored with CDB_JenksBinsIteration; then, after a fixed
+-- setseed(), randomly drawn class limits are scored the same way and the
+-- result whose first element is largest wins. That first element is
+-- dropped from the returned array.
+--
+
+
+CREATE OR REPLACE FUNCTION CDB_JenksBins ( in_array NUMERIC[], breaks INT, iterations INT DEFAULT 5, invert BOOLEAN DEFAULT FALSE) RETURNS NUMERIC[] as $$
+DECLARE
+  element_count INT4;
+  arr_mean NUMERIC;
+  bot INT;
+  top INT;
+  tops INT[];
+  classes INT[][];
+  i INT := 1; j INT := 1;
+  curr_result NUMERIC[];
+  best_result NUMERIC[];
+  seedtarget TEXT;
+  quant NUMERIC[];
+  shuffles INT;
+BEGIN
+  -- get the total size of our row
+  element_count := array_length(in_array, 1); --array_upper(in_array, 1) - array_lower(in_array, 1);
+  -- ensure the ordering of in_array
+  SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x;
+  -- stop if no rows
+  IF element_count IS NULL THEN
+    RETURN NULL;
+  END IF;
+  -- stop if our breaks are more than our input array size
+  IF element_count < breaks THEN
+    RETURN in_array;
+  END IF;
+
+  -- search budget handed to each iteration, clamped to 1..750
+  shuffles := LEAST(GREATEST(floor(2500000.0/(element_count::float*iterations::float)), 1), 750)::int;
+  -- get our mean value
+  SELECT avg(v) INTO arr_mean FROM (  SELECT unnest(in_array) as v ) x;
+
+  -- assume best is actually Quantile
+  SELECT cdb_crankshaft.CDB_QuantileBins(in_array, breaks) INTO quant;
+
+  -- if data is very very large, just return quant and be done
+  IF element_count > 5000000 THEN
+    RETURN quant;
+  END IF;
+
+  -- change quant into bottom, top markers
+  LOOP
+    IF i = 1 THEN
+      bot := 1;
+    ELSE
+      -- use last top to find this bot
+      bot := top+1;
+    END IF;
+    IF i = breaks THEN
+      top := element_count;
+    ELSE
+      SELECT count(*) INTO top FROM ( SELECT unnest(in_array) as v) x WHERE v <= quant[i];
+    END IF;
+    IF i = 1 THEN
+      classes := ARRAY[ARRAY[bot,top]];
+    ELSE
+      classes := ARRAY_CAT(classes,ARRAY[bot,top]);
+    END IF;
+    EXIT WHEN i > breaks;
+    i := i+1;
+  END LOOP;
+
+  best_result := cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);
+
+  --set the seed so we can ensure the same results
+  SELECT setseed(0.4567) INTO seedtarget;
+  --loop through random starting positions
+  LOOP
+    EXIT WHEN j > iterations-1;
+    i := 1;
+    tops := ARRAY[element_count];
+    -- draw random, distinct upper limits until there is one per class
+    LOOP
+      EXIT WHEN i = breaks;
+      SELECT array_agg(distinct e) INTO tops FROM (SELECT unnest(array_cat(tops, ARRAY[floor(random()*element_count::float)::int])) as e ORDER BY e) x WHERE e != 1;
+      i := array_length(tops, 1);
+    END LOOP;
+    -- turn the upper limits into (bottom, top) pairs
+    i := 1;
+    LOOP
+      EXIT WHEN i > breaks;
+      IF i = 1 THEN
+        bot := 1;
+      ELSE
+        bot := top+1;
+      END IF;
+      top := tops[i];
+      IF i = 1 THEN
+        classes := ARRAY[ARRAY[bot,top]];
+      ELSE
+        classes := ARRAY_CAT(classes,ARRAY[bot,top]);
+      END IF;
+      i := i+1;
+    END LOOP;
+    curr_result := cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);
+
+    IF curr_result[1] > best_result[1] THEN
+      best_result := curr_result;
+      j := j-1; -- if we found a better result, add one more search
+    END IF;
+    j := j+1;
+  END LOOP;
+
+  -- drop the leading score, keep the bin edges
+  RETURN (best_result)[2:array_upper(best_result, 1)];
+END;
+$$ language plpgsql VOLATILE PARALLEL RESTRICTED;
+
+
+
+--
+-- Perform a single iteration of the Jenks classification
+--
+
+CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_array NUMERIC[], breaks INT, classes INT[][], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$
+DECLARE
+  tmp_val numeric;
+  new_classes int[][];
+  tmp_class int[];
+  i INT := 1;
+  j INT := 1;
+  side INT := 2;
+  sdam numeric;
+  gvf numeric := 0.0;
+  new_gvf numeric;
+  arr_gvf numeric[];
+  class_avg numeric;
+  class_max_i INT;
+  class_min_i INT;
+  class_max numeric;
+  class_min numeric;
+  reply numeric[];
+BEGIN
+
+  -- Calculate the sum of squared deviations from the array mean (SDAM).
+ SELECT sum((arr_mean - e)^2) INTO sdam FROM ( SELECT unnest(in_array) as e ) x; + --Identify the breaks for the lowest GVF + LOOP + i = 1; + LOOP + -- get our mean + SELECT avg(e) INTO class_avg FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e) x; + -- find the deviation + SELECT sum((class_avg-e)^2) INTO tmp_val FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e ) x; + IF i = 1 THEN + arr_gvf = ARRAY[tmp_val]; + -- init our min/max map for later + class_max = arr_gvf[i]; + class_min = arr_gvf[i]; + class_min_i = 1; + class_max_i = 1; + ELSE + arr_gvf = array_append(arr_gvf, tmp_val); + END IF; + i := i+1; + IF i > breaks THEN EXIT; END IF; + END LOOP; + -- calculate our new GVF + SELECT sdam-sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) as e ) x; + -- if no improvement was made, exit + IF new_gvf < gvf THEN EXIT; END IF; + gvf = new_gvf; + IF j > max_search THEN EXIT; END IF; + j = j+1; + i = 1; + LOOP + --establish directionality (uppward through classes or downward) + IF arr_gvf[i] < class_min THEN + class_min = arr_gvf[i]; + class_min_i = i; + END IF; + IF arr_gvf[i] > class_max THEN + class_max = arr_gvf[i]; + class_max_i = i; + END IF; + i := i+1; + IF i > breaks THEN EXIT; END IF; + END LOOP; + IF class_max_i > class_min_i THEN + class_min_i = class_max_i - 1; + ELSE + class_min_i = class_max_i + 1; + END IF; + --Move from higher class to a lower gid order + IF class_max_i > class_min_i THEN + classes[class_max_i][1] = classes[class_max_i][1] + 1; + classes[class_min_i][2] = classes[class_min_i][2] + 1; + ELSE -- Move from lower class UP into a higher class by gid + classes[class_max_i][2] = classes[class_max_i][2] - 1; + classes[class_min_i][1] = classes[class_min_i][1] - 1; + END IF; + END LOOP; + + i = 1; + LOOP + IF invert = TRUE THEN + side = 1; --default returns bottom side of breaks, invert returns top side + END IF; + reply = array_append(reply, in_array[classes[i][side]]); + i = i+1; + IF i > breaks THEN EXIT; END 
IF; + END LOOP; + + RETURN array_prepend(gvf, reply); + +END; +$$ language plpgsql IMMUTABLE PARALLEL SAFE; + + +-- +-- Determine the Quantile classifications from a numeric array +-- +-- @param in_array A numeric array of numbers to determine the best +-- bins based on the Quantile method. +-- +-- @param breaks The number of bins you want to find. +-- +-- +CREATE OR REPLACE FUNCTION CDB_QuantileBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$ +DECLARE + element_count INT4; + break_size numeric; + tmp_val numeric; + i INT := 1; + reply numeric[]; +BEGIN + -- sort our values + SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e ASC) x; + -- get the total size of our data + element_count := array_length(in_array, 1); + break_size := element_count::numeric / breaks; + -- slice our bread + LOOP + IF i < breaks THEN + IF break_size * i % 1 > 0 THEN + SELECT e INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 1 OFFSET ceil(break_size * i) - 1) x; + ELSE + SELECT avg(e) INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 2 OFFSET ceil(break_size * i) - 1 ) x; + END IF; + ELSIF i = breaks THEN + -- select the last value + SELECT max(e) INTO tmp_val FROM ( SELECT unnest(in_array) e ) x; + ELSE + EXIT; + END IF; + + reply = array_append(reply, tmp_val); + i := i+1; + END LOOP; + RETURN reply; +END; +$$ language plpgsql IMMUTABLE STRICT PARALLEL SAFE; diff --git a/release/crankshaft--0.7.0.sql b/release/crankshaft--0.7.0.sql new file mode 100644 index 0000000..cc66ac6 --- /dev/null +++ b/release/crankshaft--0.7.0.sql @@ -0,0 +1,2165 @@ +--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES +-- Complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION crankshaft" to load this file. 
\quit
+-- Version number of the extension release
+CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
+RETURNS text AS $$
+  SELECT '0.7.0'::text;
+$$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE;
+
+-- Internal identifier of the installed extension instance
+-- e.g. 'dev' for current development version
+CREATE OR REPLACE FUNCTION _cdb_crankshaft_internal_version()
+RETURNS text AS $$
+  SELECT installed_version FROM pg_available_extensions where name='crankshaft' and pg_available_extensions IS NOT NULL;
+$$ language 'sql' STABLE STRICT PARALLEL SAFE;
+-- Internal function.
+-- Set the seeds of the RNGs (Random Number Generators)
+-- used internally.
+CREATE OR REPLACE FUNCTION
+_cdb_random_seeds (seed_value INTEGER) RETURNS VOID
+AS $$
+  from crankshaft import random_seeds
+  random_seeds.set_random_seeds(seed_value)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+CREATE OR REPLACE FUNCTION
+  CDB_PyAggS(current_state Numeric[], current_row Numeric[])
+  returns NUMERIC[] as $$
+  BEGIN
+    -- state function of CDB_PyAgg: the first row's length becomes state[1], then every row is appended
+    if array_upper(current_state,1) is null then
+      RAISE NOTICE 'setting state %',array_upper(current_row,1);
+      current_state[1] = array_upper(current_row,1);
+    end if;
+    return array_cat(current_state,current_row) ;
+  END
+  $$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
+
+-- Create aggregate if it did not exist
+DO $$
+BEGIN
+    IF NOT EXISTS (
+        SELECT *
+        FROM pg_catalog.pg_proc p
+            LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
+        WHERE n.nspname = 'cdb_crankshaft'
+            AND p.proname = 'cdb_pyagg'
+            AND p.proisagg)
+    THEN
+        CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
+            SFUNC = CDB_PyAggS,
+            STYPE = Numeric[],
+            PARALLEL = SAFE,
+            INITCOND = "{}"
+        );
+    END IF;
+END
+$$ LANGUAGE plpgsql;
+
+CREATE OR REPLACE FUNCTION
+  CDB_CreateAndPredictSegment(
+      target NUMERIC[],
+      features NUMERIC[],
+      target_features NUMERIC[],
+      target_ids NUMERIC[],
+      n_estimators INTEGER DEFAULT 1200,
+      max_depth INTEGER DEFAULT 3,
+      subsample DOUBLE PRECISION DEFAULT 0.5,
+      learning_rate DOUBLE PRECISION DEFAULT 0.01,
+      min_samples_leaf INTEGER DEFAULT 1)
+RETURNS TABLE(cartodb_id NUMERIC, prediction NUMERIC, accuracy NUMERIC)
+AS $$
+    import numpy as np
+    import plpy
+
+    from crankshaft.segmentation import create_and_predict_segment_agg
+    model_params = {'n_estimators': n_estimators,
+                    'max_depth': max_depth,
+                    'subsample': subsample,
+                    'learning_rate': learning_rate,
+                    'min_samples_leaf': min_samples_leaf}
+
+    def unpack2D(data):
+        dimension = data.pop(0)  # leading element is the row width (see CDB_PyAggS)
+        a = np.array(data, dtype=float)
+        return a.reshape(len(a)/dimension, dimension)
+
+    return create_and_predict_segment_agg(np.array(target, dtype=float),
+                                          unpack2D(features),
+                                          unpack2D(target_features),
+                                          target_ids,
+                                          model_params)
+
+$$ LANGUAGE plpythonu VOLATILE PARALLEL RESTRICTED;
+
+CREATE OR REPLACE FUNCTION
+  CDB_CreateAndPredictSegment (
+      query TEXT,
+      variable_name TEXT,
+      target_table TEXT,
+      n_estimators INTEGER DEFAULT 1200,
+      max_depth INTEGER DEFAULT 3,
+      subsample DOUBLE PRECISION DEFAULT 0.5,
+      learning_rate DOUBLE PRECISION DEFAULT 0.01,
+      min_samples_leaf INTEGER DEFAULT 1)
+RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC)
+AS $$
+  from crankshaft.segmentation import create_and_predict_segment
+  model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf}
+  return create_and_predict_segment(query,variable_name,target_table, model_params)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+CREATE OR REPLACE FUNCTION CDB_Gravity(
+    IN target_query text,
+    IN weight_column text,
+    IN source_query text,
+    IN pop_column text,
+    IN target bigint,
+    IN radius integer,
+    IN minval numeric DEFAULT -10e307
+    )
+RETURNS TABLE(
+    the_geom geometry,
+    source_id bigint,
+    target_id bigint,
+    dist numeric,
+    h numeric,
+    hpop numeric)  AS $$
+DECLARE
+    t_id bigint[];
+    t_geom geometry[];
+    t_weight numeric[];
+    s_id bigint[];
+    s_geom geometry[];
+    s_pop numeric[];
+BEGIN
+    EXECUTE 'WITH foo as('||target_query||') SELECT array_agg(cartodb_id), array_agg(the_geom), array_agg(' || weight_column || ') FROM foo' INTO t_id, t_geom, t_weight;
+    EXECUTE 'WITH foo as('||source_query||') SELECT array_agg(cartodb_id), array_agg(the_geom), array_agg(' || pop_column || ') FROM foo' INTO s_id, s_geom, s_pop;
+    -- Queries and column names are spliced in verbatim (callers must pass trusted text); `||` is the text concatenation operator.
+    RETURN QUERY SELECT g.* FROM CDB_Gravity(t_id, t_geom, t_weight, s_id, s_geom, s_pop, target, radius, minval) g;
+END;
+$$ language plpgsql VOLATILE PARALLEL UNSAFE;
+
+CREATE OR REPLACE FUNCTION CDB_Gravity(
+    IN t_id bigint[],
+    IN t_geom geometry[],
+    IN t_weight numeric[],
+    IN s_id bigint[],
+    IN s_geom geometry[],
+    IN s_pop numeric[],
+    IN target bigint,
+    IN radius integer,
+    IN minval numeric DEFAULT -10e307
+    )
+RETURNS TABLE(
+    the_geom geometry,
+    source_id bigint,
+    target_id bigint,
+    dist numeric,
+    h numeric,
+    hpop numeric)  AS $$
+DECLARE
+    t_type text;
+    s_type text;
+    t_center geometry[];
+    s_center geometry[];
+BEGIN
+    -- the geometry type of the first element decides whether centroids are needed
+    t_type := GeometryType(t_geom[1]);
+    s_type := GeometryType(s_geom[1]);
+    IF t_type = 'POINT' THEN
+        t_center := t_geom;
+    ELSE
+        WITH tmp as (SELECT unnest(t_geom) as g) SELECT array_agg(ST_Centroid(g)) INTO t_center FROM tmp;
+    END IF;
+    IF s_type = 'POINT' THEN
+        s_center := s_geom;
+    ELSE
+        WITH tmp as (SELECT unnest(s_geom) as g) SELECT array_agg(ST_Centroid(g)) INTO s_center FROM tmp;
+    END IF;
+    RETURN QUERY
+        with target0 as(
+            SELECT unnest(t_center) as tc, unnest(t_weight) as tw, unnest(t_id) as td
+        ),
+        source0 as(
+            SELECT unnest(s_center) as sc, unnest(s_id) as sd, unnest (s_geom) as sg, unnest(s_pop) as sp
+        ),
+        prev0 as(
+            -- every (source, target) pair within `radius`; distance is floored at 1.0 so tw/distance is always defined
+            SELECT
+                source0.sg,
+                source0.sd as sourc_id,
+                coalesce(source0.sp,0) as sp,
+                target.td as targ_id,
+                coalesce(target.tw,0) as tw,
+                GREATEST(1.0,ST_Distance(geography(target.tc), geography(source0.sc)))::numeric as distance
+            FROM source0
+            CROSS JOIN LATERAL
+            (
+                SELECT
+                    *
+                FROM target0
+                    WHERE tw > minval
+                    AND ST_DWithin(geography(source0.sc), geography(tc), radius)
+            ) AS target
+        ),
+        deno as(
+            -- per source: total attraction of all reachable targets
+            SELECT
+                sourc_id,
+                sum(tw/distance) as h_deno
+            FROM
+                prev0
+            GROUP BY sourc_id
+        )
+        SELECT
+            p.sg as the_geom,
+            p.sourc_id as source_id,
+            p.targ_id as target_id,
+            case when p.distance > 1 then p.distance else 0.0 end as dist,
+            100*(p.tw/p.distance)/d.h_deno as h,
+            p.sp*(p.tw/p.distance)/d.h_deno as hpop
+        FROM
+            prev0 p,
+            deno d
+        WHERE
+            p.targ_id = target AND
+            p.sourc_id = d.sourc_id;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+-- 0: nearest neighbor(s)
+-- 1: barycentric
+-- 2: IDW
+-- 3: kriging ---> TO DO
+
+
+CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation(
+    IN query text,
+    IN point geometry,
+    IN method integer DEFAULT 1,
+    IN p1 numeric DEFAULT 0,
+    IN p2 numeric DEFAULT 0
+    )
+RETURNS numeric AS
+$$
+DECLARE
+    gs geometry[];
+    vs numeric[];
+    output numeric;
+BEGIN
+    EXECUTE 'WITH a AS('||query||') SELECT array_agg(the_geom), array_agg(attrib) FROM a' INTO gs, vs;
+    SELECT CDB_SpatialInterpolation(gs, vs, point, method, p1,p2) INTO output;
+    -- (the CTE `a` only exists inside the EXECUTE above, so it must not be referenced here)
+    RETURN output;
+END;
+$$
+language plpgsql VOLATILE PARALLEL UNSAFE;
+
+CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation(
+    IN geomin geometry[],
+    IN colin numeric[],
+    IN point geometry,
+    IN method integer DEFAULT 1,
+    IN p1 numeric DEFAULT 0,
+    IN p2 numeric DEFAULT 0
+    )
+RETURNS numeric AS
+$$
+DECLARE
+    gs geometry[];
+    vs numeric[];
+    gs2 geometry[];
+    vs2 numeric[];
+    g geometry;
+    vertex geometry[];
+    sg numeric;
+    sa numeric;
+    sb numeric;
+    sc numeric;
+    va numeric;
+    vb numeric;
+    vc numeric;
+    output numeric;
+BEGIN
+    -- Sentinel results: -888.888 = point outside the triangulated data, -777.777 = method not implemented
+
+    -- nearest neighbors
+    -- p1: limit the number of neighbors, 0-> closest one
+    IF method = 0 THEN
+
+        IF p1 = 0 THEN
+            p1 := 1;
+        END IF;
+
+        WITH    a as (SELECT unnest(geomin) as g, unnest(colin) as v),
+                b as (SELECT a.v as v FROM a ORDER BY point<->a.g LIMIT p1::integer)
+        SELECT avg(b.v) INTO output FROM b;
+        RETURN output;
+
+    -- barycentric
+    ELSIF method = 1 THEN
+        WITH    a as (SELECT unnest(geomin) AS e),
+                b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
+                c as (SELECT (ST_Dump(t)).geom as v FROM b),
+                d as (SELECT v FROM c WHERE ST_Within(point, v))
+            SELECT v INTO g FROM d;
+        IF g is null THEN
+            -- out of the realm of the input data
+            RETURN -888.888;
+        END IF;
+        -- vertex of the selected cell
+        WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
+        SELECT array_agg(v) INTO vertex FROM a;
+
+        -- retrieve the value of each vertex
+        WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
+        SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
+        WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
+        SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
+        WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
+        SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
+
+        SELECT ST_area(g), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[2], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[1], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point,vertex[1],vertex[2], point]))) INTO sg, sa, sb, sc;
+        -- each vertex value is weighted by the area of the sub-triangle opposite that vertex
+        output := (coalesce(sa,0) * coalesce(va,0) + coalesce(sb,0) * coalesce(vb,0) + coalesce(sc,0) * coalesce(vc,0)) / coalesce(sg);
+        RETURN output;
+
+    -- IDW
+    -- p1: limit the number of neighbors, 0->no limit
+    -- p2: order of distance decay, 0-> order 1
+    ELSIF method = 2 THEN
+
+        IF p2 = 0 THEN
+            p2 := 1;
+        END IF;
+
+        WITH    a as (SELECT unnest(geomin) as g, unnest(colin) as v),
+                b as (SELECT a.g, a.v FROM a ORDER BY point<->a.g)
+        SELECT array_agg(b.g), array_agg(b.v) INTO gs, vs FROM b;
+        -- gs/vs are built from the distance-sorted CTE, so the p1 nearest points
+        -- are the leading slice. The previous code appended that slice to the
+        -- complete arrays, keeping every point and counting the nearest twice.
+        IF p1::integer>0 THEN
+            gs2 := gs[1:p1::integer];
+            vs2 := vs[1:p1::integer];
+        ELSE
+            -- p1 = 0: no limit, use every input point
+            gs2:=gs;
+            vs2:=vs;
+        END IF;
+        -- NOTE(review): an input point equal to `point` gives distance 0 and a division by zero below
+        WITH    a as (SELECT unnest(gs2) as g, unnest(vs2) as v),
+                b as (
+                    SELECT
+                        (1/ST_distance(point, a.g)^p2::integer) as k,
+                        (a.v/ST_distance(point, a.g)^p2::integer) as f
+                    FROM a
+                )
+        SELECT sum(b.f)/sum(b.k) INTO output FROM b;
+        RETURN output;
+
+    -- kriging
+    ELSIF method = 3 THEN
+
+    --  TO DO
+
+    END IF;
+
+    RETURN -777.777;
+
+END;
+$$
+language plpgsql IMMUTABLE PARALLEL SAFE;
+-- =============================================================================================
+--
+-- CDB_Voronoi
+--
+-- =============================================================================================
+CREATE OR REPLACE FUNCTION CDB_voronoi(
+    IN geomin geometry[],
+    IN buffer numeric DEFAULT 0.5,
+    IN tolerance numeric DEFAULT 1e-9
+    )
+RETURNS geometry  AS $$
+DECLARE
+    geomout geometry;
+BEGIN
+    -- we need to make the geometry calculations in (pseudo)meters!!!
+    with a as (
+        SELECT unnest(geomin) as g1
+    ),
+    b as(
+        SELECT st_transform(g1, 3857) g2 from a
+    )
+    SELECT array_agg(g2) INTO geomin from b;
+
+    WITH
+    convexhull_1 as (
+        SELECT
+            ST_ConvexHull(ST_Collect(geomin)) as g,
+            buffer * |/ (st_area(ST_ConvexHull(ST_Collect(geomin)))/PI()) as r
+    ),
+    clipper as(
+        SELECT
+            st_buffer(ST_MinimumBoundingCircle(a.g), buffer*a.r)  as g
+        FROM convexhull_1 a
+    ),
+    env0 as (
+        SELECT
+            (st_dumppoints(st_expand(a.g, buffer*a.r))).geom as e
+        FROM  convexhull_1 a
+    ),
+    env as (
+        SELECT
+            array_agg(env0.e) as e
+        FROM  env0
+    ),
+    sample AS (
+        SELECT
+            ST_Collect(geomin || env.e) as geom
+        FROM env
+    ),
+    convexhull as (
+        SELECT
+            ST_ConvexHull(ST_Collect(geomin)) as cg
+    ),
+    tin as (
+        SELECT
+            ST_Dump(ST_DelaunayTriangles(geom, tolerance, 0)) as gd
+        FROM
+            sample
+    ),
+    tin_polygons as (
+        SELECT
+            (gd).Path as id,
+            (gd).Geom as pg,
+            ST_Centroid(ST_MinimumBoundingCircle((gd).Geom, 180)) as ct
+        FROM tin
+    ),
+    tin_lines as (
+        SELECT
+            id,
+            ST_ExteriorRing(pg) as lg
+        FROM tin_polygons
+    ),
+    tin_nodes as (
+        SELECT
+            id,
+            ST_PointN(lg,1) p1,
+            ST_PointN(lg,2) p2,
+            ST_PointN(lg,3) p3
+        FROM tin_lines
+    ),
+    tin_edges AS (
+        SELECT
+            p.id,
+            UNNEST(ARRAY[
+            ST_MakeLine(n.p1,n.p2) ,
+            ST_MakeLine(n.p2,n.p3) ,
+            ST_MakeLine(n.p3,n.p1)]) as Edge,
+            ST_Force2D(cdb_crankshaft._Find_Circle(n.p1,n.p2,n.p3)) as ct,
+            CASE WHEN st_distance(p.ct, ST_ExteriorRing(p.pg)) < tolerance THEN
+                TRUE
+            ELSE  FALSE END AS ctx,
+            p.pg,
+            ST_within(p.ct, convexhull.cg) as ctin
+        FROM
+            tin_polygons p,
+            tin_nodes n,
+            convexhull
+        WHERE p.id = n.id
+    ),
+    voro_nodes as (
+        SELECT
+            CASE WHEN x.ctx = TRUE THEN
+                ST_Centroid(x.edge)
+            ELSE
+                x.ct
+            END as xct,
+            CASE WHEN y.id is null THEN
+                CASE WHEN x.ctin = TRUE THEN
+                    ST_SetSRID(ST_MakePoint(
+                        ST_X(x.ct) + ((ST_X(ST_Centroid(x.edge)) - ST_X(x.ct)) * (1+buffer)),
+                        ST_Y(x.ct) + ((ST_Y(ST_Centroid(x.edge)) - ST_Y(x.ct)) * (1+buffer))
+                    ), ST_SRID(x.ct))
+                END
+            ELSE
+                y.ct
+            END as yct
+        FROM
+            tin_edges x
+        LEFT OUTER JOIN
+            tin_edges y
+        ON x.id <> y.id AND ST_Equals(x.edge, y.edge)
+    ),
+    voro_edges as(
+        SELECT
+            ST_LineMerge(ST_Collect(ST_MakeLine(xct, yct))) as v
+        FROM
+            voro_nodes
+    ),
+    voro_cells as(
+        SELECT
+            ST_Polygonize(
+                ST_Node(
+                    ST_LineMerge(
+                        ST_Union(v, ST_ExteriorRing(
+                            ST_Convexhull(v)
+                            )
+                        )
+                    )
+                )
+            ) as g
+        FROM
+            voro_edges
+    ),
+    voro_set as(
+        SELECT
+            (st_dump(v.g)).geom as g
+        FROM voro_cells v
+    ),
+    clipped_voro as(
+        SELECT
+            ST_intersection(c.g, v.g) as g
+        FROM
+            voro_set v,
+            clipper c
+        WHERE
+            ST_GeometryType(v.g) = 'ST_Polygon'
+    )
+    SELECT
+        st_collect(
+            ST_Transform(
+                ST_ConvexHull(g),
+                4326
+            )
+        )
+    INTO geomout
+    FROM
+        clipped_voro;
+    RETURN geomout;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+
+/** ----------------------------------------------------------------------------------------
+  * @function   : FindCircle
+  * @precis     : Function that determines if three points form a circle. If so a table containing
+  *               centre and radius is returned. If not, a null table is returned.
+  * @version    : 1.0.1
+  * @param      : p_pt1        : First point in curve
+  * @param      : p_pt2        : Second point in curve
+  * @param      : p_pt3        : Third point in curve
+  * @return     : geometry     : In which X,Y ordinates are the centre X, Y and the Z being the radius of found circle
+  *                              or NULL if three points do not form a circle.
+  * @history    : Simon Greener - Feb 2012 - Original coding.
+  *               Rafa de la Torre - Aug 2016 - Small fix for type checking
+  *               Raul Marin - Sept 2017 - Remove unnecessary NULL checks and set function categories
+  * @copyright  : Simon Greener @ 2012
+  *               Licensed under a Creative Commons Attribution-Share Alike 2.5 Australia License. (http://creativecommons.org/licenses/by-sa/2.5/au/)
+**/
+CREATE OR REPLACE FUNCTION _Find_Circle(
+    IN p_pt1 geometry,
+    IN p_pt2 geometry,
+    IN p_pt3 geometry)
+  RETURNS geometry AS
+$BODY$
+DECLARE
+   v_Centre geometry; -- not used in the body
+   v_radius NUMERIC;
+   v_CX     NUMERIC;
+   v_CY     NUMERIC;
+   v_dA     NUMERIC;
+   v_dB     NUMERIC;
+   v_dC     NUMERIC;
+   v_dD     NUMERIC;
+   v_dE     NUMERIC;
+   v_dF     NUMERIC;
+   v_dG     NUMERIC;
+BEGIN
+   IF ( ST_GeometryType(p_pt1) <> 'ST_Point' OR
+        ST_GeometryType(p_pt2) <> 'ST_Point' OR
+        ST_GeometryType(p_pt3) <> 'ST_Point' ) THEN
+      RAISE EXCEPTION 'All supplied geometries must be points.';
+      RETURN NULL; -- never reached: RAISE EXCEPTION aborts the function
+   END IF;
+   v_dA := ST_X(p_pt2) - ST_X(p_pt1);
+   v_dB := ST_Y(p_pt2) - ST_Y(p_pt1);
+   v_dC := ST_X(p_pt3) - ST_X(p_pt1);
+   v_dD := ST_Y(p_pt3) - ST_Y(p_pt1);
+   v_dE := v_dA * (ST_X(p_pt1) + ST_X(p_pt2)) + v_dB * (ST_Y(p_pt1) + ST_Y(p_pt2));
+   v_dF := v_dC * (ST_X(p_pt1) + ST_X(p_pt3)) + v_dD * (ST_Y(p_pt1) + ST_Y(p_pt3));
+   v_dG := 2.0  * (v_dA * (ST_Y(p_pt3) - ST_Y(p_pt2)) - v_dB * (ST_X(p_pt3) - ST_X(p_pt2)));
+   -- If v_dG is zero then the three points are collinear and no finite-radius
+   -- circle through them exists.
+   IF ( v_dG = 0 ) THEN
+      RETURN NULL;
+   ELSE
+      v_CX := (v_dD * v_dE - v_dB * v_dF) / v_dG;
+      v_CY := (v_dA * v_dF - v_dC * v_dE) / v_dG;
+      v_Radius := SQRT(POWER(ST_X(p_pt1) - v_CX,2) + POWER(ST_Y(p_pt1) - v_CY,2) );
+   END IF;
+   RETURN ST_SetSRID(ST_MakePoint(v_CX, v_CY, v_radius),ST_Srid(p_pt1)); -- centre in X/Y, radius in Z
+END;
+$BODY$
+  LANGUAGE plpgsql IMMUTABLE STRICT PARALLEL SAFE;
+
+-- Moran's I Global Measure (public-facing)
+CREATE OR REPLACE FUNCTION
+  CDB_AreasOfInterestGlobal(
+      subquery TEXT,
+      column_name TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (moran NUMERIC, significance NUMERIC)
+AS $$
+  from crankshaft.clustering import Moran
+  # TODO: use named parameters or a dictionary
+  moran = Moran()
+  return moran.global_stat(subquery, column_name, w_type,
+                           num_ngbrs, permutations, geom_col, id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local (internal function)
+CREATE OR REPLACE FUNCTION
+  _CDB_AreasOfInterestLocal(
+      subquery TEXT,
+      column_name TEXT,
+      w_type TEXT,
+      num_ngbrs INT,
+      permutations INT,
+      geom_col TEXT,
+      id_col TEXT)
+RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+  from crankshaft.clustering import Moran
+  moran = Moran()
+  # TODO: use named parameters or a dictionary
+  return moran.local_stat(subquery, column_name, w_type,
+                          num_ngbrs, permutations, geom_col, id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local (public-facing function)
+CREATE OR REPLACE FUNCTION
+  CDB_AreasOfInterestLocal(
+    subquery TEXT,
+    column_name TEXT,
+    w_type TEXT DEFAULT 'knn',
+    num_ngbrs INT DEFAULT 5,
+    permutations INT DEFAULT 99,
+    geom_col TEXT DEFAULT 'the_geom',
+    id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+
+  SELECT moran, quads, significance, rowid, vals
+  FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col);
+
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I only for HH and HL (public-facing function)
+CREATE OR REPLACE FUNCTION
+  CDB_GetSpatialHotspots(
+    subquery TEXT,
+    column_name TEXT,
+    w_type TEXT DEFAULT 'knn',
+    num_ngbrs INT DEFAULT 5,
+    permutations INT DEFAULT 99,
+    geom_col TEXT DEFAULT 'the_geom',
+    id_col TEXT DEFAULT 'cartodb_id')
+ RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+
+  SELECT moran, quads, significance, rowid, vals
+  FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
+  WHERE quads IN ('HH', 'HL');
+
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I only for LL and LH (public-facing function)
+CREATE OR REPLACE FUNCTION
+  CDB_GetSpatialColdspots(
+    subquery TEXT,
+    attr TEXT,
+    w_type TEXT DEFAULT 'knn',
+    num_ngbrs INT DEFAULT 5,
+    permutations INT DEFAULT 99,
+    geom_col TEXT DEFAULT 'the_geom',
+    id_col TEXT DEFAULT 'cartodb_id')
+ RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+
+  SELECT moran, quads, significance, rowid, vals
+  FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col)
+  WHERE quads IN ('LL', 'LH');
+
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I only for LH and HL (public-facing function)
+CREATE OR REPLACE FUNCTION
+  CDB_GetSpatialOutliers(
+    subquery TEXT,
+    attr TEXT,
+    w_type TEXT DEFAULT 'knn',
+    num_ngbrs INT DEFAULT 5,
+    permutations INT DEFAULT 99,
+    geom_col TEXT DEFAULT 'the_geom',
+    id_col TEXT DEFAULT 'cartodb_id')
+ RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+
+  SELECT moran, quads, significance, rowid, vals
+  FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col)
+  WHERE quads IN ('HL', 'LH');
+
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Global Rate (public-facing function)
+CREATE OR REPLACE FUNCTION
+  CDB_AreasOfInterestGlobalRate(
+      subquery TEXT,
+      numerator TEXT,
+      denominator TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (moran FLOAT, significance FLOAT)
+AS $$
+  from crankshaft.clustering import Moran
+  moran = Moran()
+  # TODO: use named parameters or a dictionary
+  return moran.global_rate_stat(subquery, numerator, denominator, w_type,
+                                num_ngbrs, permutations, geom_col, id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+
+-- Moran's I Local Rate (internal function)
+CREATE OR REPLACE FUNCTION
+  _CDB_AreasOfInterestLocalRate(
+      subquery TEXT,
+      numerator TEXT,
+      denominator TEXT,
+      w_type TEXT,
+      num_ngbrs INT,
+      permutations INT,
+      geom_col TEXT,
+      id_col TEXT)
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+  from crankshaft.clustering import Moran
+  moran = Moran()
+  # TODO: use named parameters or a dictionary
+  return moran.local_rate_stat(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local Rate (public-facing function)
+CREATE OR REPLACE FUNCTION
+  CDB_AreasOfInterestLocalRate(
+      subquery TEXT,
+      numerator TEXT,
+      denominator TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+
+  SELECT moran, quads, significance, rowid, vals
+  FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col);
+
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local Rate only for HH and HL (public-facing function)
+CREATE OR REPLACE FUNCTION
+  CDB_GetSpatialHotspotsRate(
+      subquery TEXT,
+      numerator TEXT,
+      denominator TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+
+  SELECT moran, quads, significance, rowid, vals
+  FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
+  WHERE quads IN ('HH', 'HL');
+
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local Rate only for LL and LH (public-facing function)
+CREATE OR REPLACE FUNCTION
+  CDB_GetSpatialColdspotsRate(
+      subquery TEXT,
+      numerator TEXT,
+      denominator TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+
+  SELECT moran, quads, significance, rowid, vals
+  FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
+  WHERE quads IN ('LL', 'LH');
+
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local Rate only for LH and HL (public-facing function)
+CREATE OR REPLACE FUNCTION
+  CDB_GetSpatialOutliersRate(
+      subquery TEXT,
+      numerator TEXT,
+      denominator TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+
+  SELECT moran, quads, significance, rowid, vals
+  FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
+  WHERE quads IN ('HL', 'LH');
+
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+-- Spatial k-means clustering
+
+CREATE OR REPLACE FUNCTION CDB_KMeans(
+  query TEXT,
+  no_clusters INTEGER,
+  no_init INTEGER DEFAULT 20
+)
+RETURNS TABLE(
+  cartodb_id INTEGER,
+  cluster_no INTEGER
+) AS $$
+
+from crankshaft.clustering import Kmeans
+kmeans = Kmeans()
+return kmeans.spatial(query, no_clusters, no_init)
+
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- Non-spatial k-means clustering
+-- query: sql query to retrieve all the needed data
+-- colnames: text array of column names for doing the clustering analysis
+-- no_clusters: number of requested clusters
+-- standardize: whether to scale variables to a mean of zero and a standard
+--              deviation of 1
+-- id_col: name of the id column
+
+CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial(
+  query TEXT,
+  colnames TEXT[],
+  no_clusters INTEGER,
+  standardize BOOLEAN DEFAULT true,
+  id_col TEXT DEFAULT 'cartodb_id'
+)
+RETURNS TABLE(
+  cluster_label text,
+  cluster_center json,
+  silhouettes numeric,
+  inertia numeric,
+  rowid bigint
+) AS $$
+
+from crankshaft.clustering import Kmeans
+kmeans = Kmeans()
+return kmeans.nonspatial(query, colnames, no_clusters,
+                         standardize=standardize,
+                         id_col=id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- State function of CDB_WeightedMean: state is [sum(x*weight), sum(y*weight), sum(weight)]
+CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(
+  state NUMERIC[],
+  the_geom GEOMETRY(Point, 4326),
+  weight NUMERIC
+)
+RETURNS Numeric[] AS $$
+DECLARE
+    newX NUMERIC;
+    newY NUMERIC;
+    newW NUMERIC;
+BEGIN
+    -- rows with a NULL weight or geometry leave the running sums untouched
+    IF weight IS NULL OR the_geom IS NULL THEN
+        newX = state[1];
+        newY = state[2];
+        newW = state[3];
+    ELSE
+        newX = state[1] + ST_X(the_geom)*weight;
+        newY = state[2] + ST_Y(the_geom)*weight;
+        newW = state[3] + weight;
+    END IF;
+    RETURN Array[newX,newY,newW];
+
+END
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
+
+-- Final function of CDB_WeightedMean: turns the running sums into a point (SRID 4326)
+CREATE OR REPLACE FUNCTION CDB_WeightedMeanF(state NUMERIC[])
+RETURNS GEOMETRY AS
+$$
+BEGIN
+    IF state[3] = 0 THEN
+        -- total weight is zero: return the raw sums rather than dividing by zero
+        RETURN ST_SetSRID(ST_MakePoint(state[1],state[2]), 4326);
+    ELSE
+        RETURN ST_SETSRID(ST_MakePoint(state[1]/state[3], state[2]/state[3]),4326);
+    END IF;
+END
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
+
+-- Create aggregate if it did not exist
+DO $$
+BEGIN
+    IF NOT EXISTS (
+        SELECT *
+        FROM pg_catalog.pg_proc p
+            LEFT JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace
+        WHERE n.nspname = 'cdb_crankshaft'
+            AND p.proname = 'cdb_weightedmean'
+            AND p.proisagg)
+    THEN
+        CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
+            SFUNC = CDB_WeightedMeanS,
+            FINALFUNC = CDB_WeightedMeanF,
+            STYPE = Numeric[],
+            PARALLEL = SAFE,
+            INITCOND = "{0.0,0.0,0.0}"
+        );
+    END IF;
+END
+$$ LANGUAGE plpgsql;
+-- Spatial Markov
+
+-- input table format:
+-- id | geom | date_1 | date_2 | date_3
+--  1 | Pt1  | 12.3   | 13.1   | 14.2
+--  2 | Pt2  | 11.0   | 13.2   | 12.5
+-- ...
+-- Sample Function call:
+-- SELECT CDB_SpatialMarkovTrend('SELECT * FROM real_estate',
+--                               Array['date_1', 'date_2', 'date_3'])
+
+CREATE OR REPLACE FUNCTION
+  CDB_SpatialMarkovTrend (
+      subquery TEXT,
+      time_cols TEXT[],
+      num_classes INT DEFAULT 7,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (trend NUMERIC, trend_up NUMERIC, trend_down NUMERIC, volatility NUMERIC, rowid INT)
+AS $$
+
+  from crankshaft.space_time_dynamics import Markov
+  markov = Markov()
+
+  ## TODO: use named parameters or a dictionary
+  return markov.spatial_trend(subquery, time_cols, num_classes, w_type, num_ngbrs, permutations, geom_col, id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- input table format: identical to above but in a predictable format
+-- Sample function call:
+-- SELECT cdb_spatial_markov('SELECT * FROM real_estate',
+--                          'date_1')
+
+
+-- CREATE OR REPLACE 
FUNCTION +-- cdb_spatial_markov ( +-- subquery TEXT, +-- time_col_min text, +-- time_col_max text, +-- date_format text, -- '_YYYY_MM_DD' +-- num_time_per_bin INT DEFAULT 1, +-- permutations INT DEFAULT 99, +-- geom_column TEXT DEFAULT 'the_geom', +-- id_col TEXT DEFAULT 'cartodb_id', +-- w_type TEXT DEFAULT 'knn', +-- num_ngbrs int DEFAULT 5) +-- RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT) +-- AS $$ +-- plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()') +-- from crankshaft.clustering import moran_local +-- # TODO: use named parameters or a dictionary +-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs) +-- $$ LANGUAGE plpythonu; +-- +-- -- input table format: +-- -- id | geom | date | measurement +-- -- 1 | Pt1 | 12/3 | 13.2 +-- -- 2 | Pt2 | 11/5 | 11.3 +-- -- 3 | Pt1 | 11/13 | 12.9 +-- -- 4 | Pt3 | 12/19 | 10.1 +-- -- ... +-- +-- CREATE OR REPLACE FUNCTION +-- cdb_spatial_markov ( +-- subquery TEXT, +-- time_col text, +-- num_time_per_bin INT DEFAULT 1, +-- permutations INT DEFAULT 99, +-- geom_column TEXT DEFAULT 'the_geom', +-- id_col TEXT DEFAULT 'cartodb_id', +-- w_type TEXT DEFAULT 'knn', +-- num_ngbrs int DEFAULT 5) +-- RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT) +-- AS $$ +-- plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()') +-- from crankshaft.clustering import moran_local +-- # TODO: use named parameters or a dictionary +-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs) +-- $$ LANGUAGE plpythonu; +-- Based on: +-- https://github.com/mapbox/polylabel/blob/master/index.js +-- https://sites.google.com/site/polesofinaccessibility/ +-- Requires: https://github.com/CartoDB/cartodb-postgresql + +-- Based on: +-- https://github.com/mapbox/polylabel/blob/master/index.js +-- https://sites.google.com/site/polesofinaccessibility/ +-- Requires: 
https://github.com/CartoDB/cartodb-postgresql
+
+-- Pole of inaccessibility: approximates the point of `polygon` with the
+-- largest signed distance to its rings by refining a rectangular grid.
+-- The search runs in EPSG:3857, so `tolerance` is expressed in that CRS's
+-- units; the result is transformed to EPSG:4326 before being returned.
+CREATE OR REPLACE FUNCTION CDB_PIA(
+  IN polygon geometry,
+  IN tolerance numeric DEFAULT 1.0
+  )
+RETURNS geometry AS $$
+DECLARE
+  env geometry[];
+  cells geometry[];
+  cell geometry;
+  best_c geometry;
+  best_d numeric;
+  test_d numeric;
+  test_mx numeric;
+  test_h numeric;
+  test_cells geometry[];
+  width numeric;
+  height numeric;
+  h numeric;
+  i integer;
+  n integer;
+  sqr numeric;
+  p geometry;
+BEGIN
+  -- sqrt(2)/2: centre-to-corner distance of a square with side 1
+  sqr := 0.5*(|/2.0);
+  polygon := ST_Transform(polygon, 3857);
+
+  -- grid #0 cell size
+  height := ST_YMax(polygon) - ST_YMin(polygon);
+  width := ST_XMax(polygon) - ST_XMin(polygon);
+  h := 0.5*LEAST(height, width);
+
+  -- grid #0
+  with c1 as(
+    SELECT cdb_crankshaft.CDB_RectangleGrid(polygon, h, h) as c
+  )
+  SELECT array_agg(c) INTO cells FROM c1;
+
+  -- 1st guess: centroid
+  -- (best_c keeps the whole polygon; the final ST_Centroid(best_c) turns it
+  -- into the centroid if no cell beats this guess)
+  best_c := polygon;
+  best_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(Polygon));
+
+  -- looping the loop
+  -- `cells` works as a queue: refined cells are appended while i walks forward
+  n := array_length(cells,1);
+  i := 1;
+  LOOP
+
+    EXIT WHEN i > n;
+
+    cell := cells[i];
+
+    i := i+1;
+
+    -- cell side size, it's square
+    test_h := ST_XMax(cell) - ST_XMin(cell) ;
+
+    -- check distance
+    test_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(cell));
+
+    IF test_d > best_d THEN
+      best_d := test_d;
+      best_c := cell;
+    END IF;
+
+    -- longest distance within the cell
+    test_mx := test_d + (test_h * sqr);
+
+    -- if the cell has no chance to contain the desired point, continue
+    CONTINUE WHEN test_mx - best_d <= tolerance;
+
+    -- resample the cell
+    with c1 as(
+      SELECT cdb_crankshaft.CDB_RectangleGrid(cell, test_h/2, test_h/2) as c
+    )
+    SELECT array_agg(c) INTO test_cells FROM c1;
+
+    -- concat the new cells to the former array
+    cells := cells || test_cells;
+
+    -- prepare next iteration
+    n := array_length(cells,1);
+
+  END LOOP;
+
+  RETURN ST_transform(ST_Centroid(best_c), 4326);
+
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+
+
+
+-- signed distance point to polygon with holes
+-- negative if the point is outside the polygon
+-- rev 1. adding MULTIPOLYGON and GEOMETRYCOLLECTION support by @abelvm
+--
+-- For every part returned by ST_Dump the distance to the exterior ring and to
+-- each interior ring is taken; the smallest distance over all parts wins and
+-- carries the sign (+1 when ST_Within(point, part), -1 otherwise) of the part
+-- it came from.
+CREATE OR REPLACE FUNCTION _Signed_Dist(
+  IN polygon geometry,
+  IN point geometry
+  )
+RETURNS numeric AS $$
+DECLARE
+  pols geometry[];
+  pol geometry;
+  i integer;
+  j integer;
+  within integer;
+  w integer;
+  holes integer;
+  dist numeric;
+  d numeric;
+BEGIN
+  -- start from a value larger than any real distance
+  dist := 1e999;
+  WITH collection as (SELECT (ST_dump(polygon)).geom as geom) SELECT array_agg(geom) into pols FROM collection;
+  FOR j in 1..array_length(pols, 1)
+  LOOP
+    pol := pols[j];
+    d := dist;
+    SELECT LEAST(dist, ST_distance(point, ST_ExteriorRing(pol))::numeric) INTO d;
+    SELECT CASE WHEN ST_Within(point,pol) THEN 1 ELSE -1 END INTO w;
+    SELECT ST_NumInteriorRings(pol) INTO holes;
+    IF holes > 0 THEN
+      FOR i IN 1..holes
+      LOOP
+        SELECT LEAST(d, ST_distance(point, ST_InteriorRingN(pol, i))::numeric) INTO d;
+      END LOOP;
+    END IF;
+    -- keep the sign of the part that produced the current minimum
+    IF d < dist THEN
+      dist:= d;
+      within := w;
+    END IF;
+  END LOOP;
+  dist := dist * within::numeric;
+  RETURN dist;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+--
+-- Iterative densification of a set of points using Delaunay triangulation
+-- the new points have as assigned value the average value of the 3 vertex (centroid)
+--
+-- @param geomin - array of geometries (points)
+--
+-- @param colin - array of numeric values in that points
+--
+-- @param iterations - integer, number of iterations
+--
+--
+-- Returns: TABLE(geomout geometry, colout numeric)
+--
+--
+CREATE OR REPLACE FUNCTION CDB_Densify(
+  IN geomin geometry[],
+  IN colin numeric[],
+  IN iterations integer
+  )
+RETURNS TABLE(geomout geometry, colout numeric) AS $$
+DECLARE
+  geotemp geometry[];
+  coltemp numeric[];
+  i integer;
+  gs geometry[];
+  g geometry;
+  vertex geometry[];
+  va numeric;
+  vb numeric;
+  vc numeric;
+  center geometry;
+  centerval numeric;
+  tmp integer;
+BEGIN
+  geotemp := geomin;
+  coltemp := colin;
+  FOR i IN 1..iterations
+  LOOP
+    -- generate TIN
+    WITH a as (SELECT unnest(geotemp) AS e),
+    b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
+    c as (SELECT (ST_Dump(t)).geom AS v FROM b)
+    SELECT array_agg(v) INTO gs FROM c;
+    -- loop cells
+    FOREACH g IN ARRAY gs
+    LOOP
+      -- append centroid
+      -- NOTE(review): geotemp grows here, before the value lookups below, so
+      -- the two unnest() calls run over arrays of different length until
+      -- coltemp is extended at the end of the iteration -- confirm the
+      -- pairing is still the intended one on the targeted server version.
+      SELECT ST_Centroid(g) INTO center;
+      geotemp := array_append(geotemp, center);
+      -- retrieve the value of each vertex
+      WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
+      SELECT array_agg(v) INTO vertex FROM a;
+      WITH a AS(SELECT unnest(geotemp) as geo, unnest(coltemp) as c)
+      SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
+      WITH a AS(SELECT unnest(geotemp) as geo, unnest(coltemp) as c)
+      SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
+      WITH a AS(SELECT unnest(geotemp) as geo, unnest(coltemp) as c)
+      SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
+      -- calc the value at the center
+      centerval := (va + vb + vc) / 3;
+      -- append the value
+      coltemp := array_append(coltemp, centerval);
+    END LOOP;
+  END LOOP;
+  RETURN QUERY SELECT unnest(geotemp ) as geomout, unnest(coltemp ) as colout;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+-- TIN map: densifies the input points with CDB_Densify, triangulates the
+-- result and returns every triangle together with the mean of the values
+-- found at its three vertices.
+CREATE OR REPLACE FUNCTION CDB_TINmap(
+  IN geomin geometry[],
+  IN colin numeric[],
+  IN iterations integer
+  )
+RETURNS TABLE(geomout geometry, colout numeric) AS $$
+DECLARE
+  p geometry[];
+  vals numeric[];
+  gs geometry[];
+  g geometry;
+  vertex geometry[];
+  centerval numeric;
+  va numeric;
+  vb numeric;
+  vc numeric;
+  coltemp numeric[];
+BEGIN
+  SELECT array_agg(dens.geomout), array_agg(dens.colout) INTO p, vals FROM cdb_crankshaft.CDB_Densify(geomin, colin, iterations) dens;
+  WITH a as (SELECT unnest(p) AS e),
+  b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
+  c as (SELECT (ST_Dump(t)).geom AS v FROM b)
+  SELECT array_agg(v) INTO gs FROM c;
+  FOREACH g IN ARRAY gs
+  LOOP
+    -- retrieve the vertex of each triangle
+    WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
+    SELECT array_agg(v) INTO vertex FROM a;
+    -- retrieve the value of each vertex
+    WITH a AS(SELECT unnest(p) as geo, unnest(vals) as c)
+    SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
+    WITH a AS(SELECT unnest(p) as geo, unnest(vals) as c)
+    SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
+    WITH a AS(SELECT unnest(p) as geo, unnest(vals) as c)
+    SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
+    -- calc the value at the center
+    centerval := (va + vb + vc) / 3;
+    -- append the value
+    coltemp := array_append(coltemp, centerval);
+  END LOOP;
+  RETURN QUERY SELECT unnest(gs) as geomout, unnest(coltemp ) as colout;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+-- Getis-Ord's G
+-- Hotspot/Coldspot Analysis tool
+-- Thin PL/Python wrapper: all arguments are forwarded positionally to
+-- crankshaft.clustering.Getis.getis_ord().
+CREATE OR REPLACE FUNCTION
+  CDB_GetisOrdsG(
+    subquery TEXT,
+    column_name TEXT,
+    w_type TEXT DEFAULT 'knn',
+    num_ngbrs INT DEFAULT 5,
+    permutations INT DEFAULT 999,
+    geom_col TEXT DEFAULT 'the_geom',
+    id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (z_score NUMERIC, p_value NUMERIC, p_z_sim NUMERIC, rowid BIGINT)
+AS $$
+  from crankshaft.clustering import Getis
+  getis = Getis()
+  return getis.getis_ord(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- TODO: make a version that accepts the values as arrays
+
+-- Find outliers using a static threshold
+-- Returns true when column_value is strictly greater than threshold
+-- (NULL when either argument is NULL).
+--
+CREATE OR REPLACE FUNCTION CDB_StaticOutlier(column_value numeric, threshold numeric)
+RETURNS boolean
+AS $$
+BEGIN
+
+  RETURN column_value > threshold;
+
+END;
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE ;
+
+-- Find outliers by a percentage above the threshold
+-- TODO: add symmetric option?
`is_symmetric boolean DEFAULT false`
+
+-- A value is flagged when value / mean(column_values) is strictly greater
+-- than outlier_fraction. column_values and ids are unnested side by side, so
+-- they are expected to have the same length. Raises when the mean is zero.
+CREATE OR REPLACE FUNCTION CDB_PercentOutlier(column_values numeric[], outlier_fraction numeric, ids int[])
+RETURNS TABLE(is_outlier boolean, rowid int)
+AS $$
+DECLARE
+  avg_val numeric;
+  out_vals boolean[];
+BEGIN
+
+  SELECT avg(i) INTO avg_val
+  FROM unnest(column_values) As x(i);
+
+  IF avg_val = 0 THEN
+    RAISE EXCEPTION 'Mean value is zero. Try another outlier method.';
+  END IF;
+
+  SELECT array_agg(
+    outlier_fraction < i / avg_val) INTO out_vals
+  FROM unnest(column_values) As x(i);
+
+  RETURN QUERY
+  SELECT unnest(out_vals) As is_outlier,
+         unnest(ids) As rowid;
+
+END;
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
+
+-- Find outliers above a given number of standard deviations from the mean
+-- With is_symmetric (the default) the absolute deviation is tested, so values
+-- far below the mean are flagged too; otherwise only values above the mean.
+-- Raises when the standard deviation of the input is zero.
+
+CREATE OR REPLACE FUNCTION CDB_StdDevOutlier(column_values numeric[], num_deviations numeric, ids int[], is_symmetric boolean DEFAULT true)
+RETURNS TABLE(is_outlier boolean, rowid int)
+AS $$
+DECLARE
+  stddev_val numeric;
+  avg_val numeric;
+  out_vals boolean[];
+BEGIN
+
+  SELECT stddev(i), avg(i) INTO stddev_val, avg_val
+  FROM unnest(column_values) As x(i);
+
+  IF stddev_val = 0 THEN
+    RAISE EXCEPTION 'Standard deviation of input data is zero';
+  END IF;
+
+  IF is_symmetric THEN
+    SELECT array_agg(
+      abs(i - avg_val) / stddev_val > num_deviations) INTO out_vals
+    FROM unnest(column_values) As x(i);
+  ELSE
+    SELECT array_agg(
+      (i - avg_val) / stddev_val > num_deviations) INTO out_vals
+    FROM unnest(column_values) As x(i);
+  END IF;
+
+  RETURN QUERY
+  SELECT unnest(out_vals) As is_outlier,
+         unnest(ids) As rowid;
+END;
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
+-- Contour map: interpolates colin over a rectangular grid covering the
+-- buffered convex hull of geomin, classifies the cell values into `steps`
+-- bins and returns one unioned geometry per bin with min/max/avg value.
+--   intmethod:   1 = barycentric interpolation on a precomputed TIN
+--                (_interp_in_tin); any other value is passed on to
+--                CDB_SpatialInterpolation
+--   classmethod: 0 = equal interval, 1 = heads/tails, 2 = jenks,
+--                anything else = quantiles
+--   max_time:    a positive value is used directly as the grid cell size;
+--                zero or a negative value is treated as a budget from which
+--                the number of cells is estimated (see the block below)
+CREATE OR REPLACE FUNCTION CDB_Contour(
+  IN geomin geometry[],
+  IN colin numeric[],
+  IN buffer numeric,
+  IN intmethod integer,
+  IN classmethod integer,
+  IN steps integer,
+  IN max_time integer DEFAULT 60000
+  )
+RETURNS TABLE(
+  the_geom geometry,
+  bin integer,
+  min_value numeric,
+  max_value numeric,
+  avg_value numeric
+) AS $$
+DECLARE
+  cell_count integer;
+  tin geometry[];
+  resolution integer;
+BEGIN
+
+  -- nasty trick to override issue #121
+  -- after this block `resolution` holds the caller's value and `max_time`
+  -- holds its negation (0 is first replaced by -90)
+  IF max_time = 0 THEN
+    max_time = -90;
+  END IF;
+  resolution := max_time;
+  max_time := -1 * resolution;
+
+  -- calc the optimal number of cells for the current dataset
+  SELECT
+    CASE intmethod
+      WHEN 0 THEN round(3.7745903782 * max_time - 9.4399210051 * array_length(geomin,1) - 1350.8778213073)
+      WHEN 1 THEN round(2.2855592156 * max_time - 87.285217133 * array_length(geomin,1) + 17255.7085601797)
+      WHEN 2 THEN round(0.9799471999 * max_time - 127.0334085369 * array_length(geomin,1) + 22707.9579721218)
+      ELSE 10000
+    END INTO cell_count;
+
+  -- we don't have iterative barycentric interpolation in CDB_interpolation,
+  -- and it's a costly function, so let's make a custom one here till
+  -- we update the code
+  -- tin := ARRAY[]::geometry[];
+  IF intmethod=1 THEN
+    WITH
+      a as (SELECT unnest(geomin) AS e),
+      b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
+      c as (SELECT (ST_Dump(t)).geom as v FROM b)
+    SELECT array_agg(v) INTO tin FROM c;
+  END IF;
+  -- Delaunay stuff performed just ONCE!!
+
+  -- magic
+  RETURN QUERY
+  WITH
+  convexhull as (
+    SELECT
+      ST_ConvexHull(ST_Collect(geomin)) as g,
+      buffer * |/ st_area(ST_ConvexHull(ST_Collect(geomin)))/PI() as r
+  ),
+  envelope as (
+    SELECT
+      st_expand(a.g, a.r) as e
+    FROM convexhull a
+  ),
+  envelope3857 as(
+    SELECT
+      ST_Transform(e, 3857) as geom
+    FROM envelope
+  ),
+  -- cell side: derived from cell_count unless a positive resolution was given
+  resolution as(
+    SELECT
+      CASE WHEN resolution <= 0 THEN
+        round(|/ (
+          ST_area(geom) / abs(cell_count)
+        ))
+      ELSE
+        resolution
+      END AS cell
+    FROM envelope3857
+  ),
+  grid as(
+    SELECT
+      ST_Transform(cdb_crankshaft.CDB_RectangleGrid(e.geom, r.cell, r.cell), 4326) as geom
+    FROM envelope3857 e, resolution r
+  ),
+  interp as(
+    SELECT
+      geom,
+      CASE
+        WHEN intmethod=1 THEN cdb_crankshaft._interp_in_tin(geomin, colin, tin, ST_Centroid(geom))
+        ELSE cdb_crankshaft.CDB_SpatialInterpolation(geomin, colin, ST_Centroid(geom), intmethod)
+      END as val
+    FROM grid
+  ),
+  classes as(
+    SELECT CASE
+      WHEN classmethod = 0 THEN
+        cdb_crankshaft.CDB_EqualIntervalBins(array_agg(val), steps)
+      WHEN classmethod = 1 THEN
+        cdb_crankshaft.CDB_HeadsTailsBins(array_agg(val), steps)
+      WHEN classmethod = 2 THEN
+        cdb_crankshaft.CDB_JenksBins(array_agg(val), steps)
+      ELSE
+        cdb_crankshaft.CDB_QuantileBins(array_agg(val), steps)
+      END as b
+    FROM interp
+    where val is not null
+  ),
+  classified as(
+    SELECT
+      i.*,
+      width_bucket(i.val, c.b) as bucket
+    FROM interp i left join classes c
+    ON 1=1
+  ),
+  -- fold the top bucket (values at or above the last threshold) into the
+  -- previous one so that bins run from 0 to steps - 1
+  classified2 as(
+    SELECT
+      geom,
+      val,
+      CASE
+        WHEN bucket = steps THEN bucket - 1
+        ELSE bucket
+      END as b
+    FROM classified
+  ),
+  final as(
+    SELECT
+      st_union(geom) as the_geom,
+      b as bin,
+      min(val) as min_value,
+      max(val) as max_value,
+      avg(val) as avg_value
+    FROM classified2
+    GROUP BY bin
+  )
+  SELECT
+    *
+  FROM final
+  where final.bin is not null
+  ;
+END;
+$$ language plpgsql VOLATILE PARALLEL RESTRICTED;
+
+
+-- =====================================================================
+-- Interp in grid, so we can use barycentric with a precalculated tin (NNI)
+--
+-- Finds the triangle of `tin` containing `point`, looks up the values of its
+-- three vertices in geomin/colin and returns their area-weighted average
+-- (each vertex is weighted by the area of the sub-triangle opposite to it).
+-- Returns NULL when the point lies in no triangle.
+-- =====================================================================
+CREATE OR REPLACE FUNCTION _interp_in_tin(
+  IN geomin geometry[],
+  IN colin numeric[],
+  IN tin geometry[],
+  IN point geometry
+  )
+RETURNS numeric AS
+$$
+DECLARE
+  g geometry;
+  vertex geometry[];
+  sg numeric;
+  sa numeric;
+  sb numeric;
+  sc numeric;
+  va numeric;
+  vb numeric;
+  vc numeric;
+  output numeric;
+BEGIN
+  -- get the cell the point is within
+  WITH
+    a as (SELECT unnest(tin) as v),
+    b as (SELECT v FROM a WHERE ST_Within(point, v))
+  SELECT v INTO g FROM b;
+
+  -- if we're out of the data realm,
+  -- return null
+  IF g is null THEN
+    RETURN null;
+  END IF;
+
+  -- vertex of the selected cell
+  WITH a AS (
+    SELECT (ST_DumpPoints(g)).geom AS v
+  )
+  SELECT array_agg(v) INTO vertex FROM a;
+
+  -- retrieve the value of each vertex
+  WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
+  SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
+
+  WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
+  SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
+
+  WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
+  SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
+
+  -- calc the areas
+  -- sg: whole triangle; sa/sb/sc: sub-triangles opposite vertex 1/2/3
+  SELECT
+    ST_area(g),
+    ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[2], vertex[3], point]))),
+    ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[1], vertex[3], point]))),
+    ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point,vertex[1],vertex[2], point]))) INTO sg, sa, sb, sc;
+
+  -- missing areas or values contribute 0; a missing total area divides by 1
+  output := (coalesce(sa,0) * coalesce(va,0) + coalesce(sb,0) * coalesce(vb,0) + coalesce(sc,0) * coalesce(vc,0)) / coalesce(sg,1);
+  RETURN output;
+END;
+$$
+language plpgsql IMMUTABLE PARALLEL SAFE;
+-- Function by Stuart Lynn for a simple interpolation of a value
+-- from a polygon table over an arbitrary polygon
+-- (weighted by the area proportion overlapped)
+-- Areal weighting is a very simple form of areal interpolation.
+--
+-- Parameters:
+-- * geom a Polygon geometry which defines the area where a value will be
+--   estimated as the area-weighted sum of a given table/column
+-- * target_table_name table name of the table that provides the values
+-- * target_column column name of the column that provides the values
+-- * schema_name optional parameter to define the schema the target table
+--   belongs to, which is necessary if it's not in the search_path.
+--   Note that target_table_name should never include the schema in it.
+-- Return value:
+--   Areal-weighted interpolation of the column values over the geometry
+--
+-- The target table must expose its geometry in a column named the_geom.
+-- Schema, table and column names are all quoted as identifiers (%I), so
+-- mixed-case names work and the names cannot inject SQL.
+CREATE OR REPLACE
+FUNCTION cdb_overlap_sum(geom geometry, target_table_name text, target_column text, schema_name text DEFAULT NULL)
+  RETURNS numeric AS
+$$
+DECLARE
+  result numeric;
+  qualified_name text;
+BEGIN
+  IF schema_name IS NULL THEN
+    qualified_name := Format('%I', target_table_name);
+  ELSE
+    -- quote the table name too, exactly as the unqualified branch does
+    qualified_name := Format('%I.%I', schema_name, target_table_name);
+  END IF;
+  EXECUTE Format('
+    SELECT sum(%I*ST_Area(St_Intersection($1, a.the_geom))/ST_Area(a.the_geom))
+    FROM %s AS a
+    WHERE $1 && a.the_geom
+    ', target_column, qualified_name)
+  USING geom
+  INTO result;
+  RETURN result;
+END;
+$$ LANGUAGE plpgsql STABLE PARALLEL SAFE;
+-- Geographically weighted regression.
+-- Thin PL/Python wrapper: all arguments are forwarded positionally to
+-- crankshaft.regression.GWR.gwr().
+CREATE OR REPLACE FUNCTION
+CDB_GWR(subquery text, dep_var text, ind_vars text[],
+        bw numeric default null, fixed boolean default False,
+        kernel text default 'bisquare', geom_col text default 'the_geom',
+        id_col text default 'cartodb_id')
+RETURNS table(coeffs JSON, stand_errs JSON, t_vals JSON,
+              filtered_t_vals JSON, predicted numeric,
+              residuals numeric, r_squared numeric, bandwidth numeric,
+              rowid bigint)
+AS $$
+
+from crankshaft.regression import GWR
+
+gwr = GWR()
+
+return gwr.gwr(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col)
+
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+
+-- Prediction variant of CDB_GWR.
+-- Thin PL/Python wrapper: all arguments are forwarded positionally to
+-- crankshaft.regression.GWR.gwr_predict().
+CREATE OR REPLACE FUNCTION
+CDB_GWR_Predict(subquery text, dep_var text, ind_vars text[],
+                bw numeric default null, fixed boolean default False,
+                kernel text default 'bisquare',
+                geom_col text default 'the_geom',
+                id_col text default 'cartodb_id')
+RETURNS table(coeffs JSON, stand_errs JSON, t_vals JSON,
+              r_squared numeric, predicted numeric, rowid bigint)
+AS $$
+
+from crankshaft.regression import GWR
+gwr = GWR()
+
+return gwr.gwr_predict(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col)
+
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+--
+-- Creates N points randomly distributed inside the polygon
+--
+-- @param geom - the geometry to be turned into points
+--
+-- @param no_points - the number of points to generate; NULL, zero or a
+--   negative value generates no points and the function returns NULL
+--
+-- @param max_iter_per_point - currently unused. It is kept so existing
+--   callers keep working: points are no longer drawn in the bounding box
+--   and rejected, so there are no misses to count.
+--
+-- How it works: for every point a horizontal scanline is drawn across the
+-- bounding box at a random y, intersected with the geometry, and a point is
+-- picked at a random position along that intersection.
+--
+-- Returns: Multipoint with the requested points, in the SRID of geom
+CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry , no_points Integer, max_iter_per_point Integer DEFAULT 1000)
+RETURNS GEOMETRY AS $$
+DECLARE
+  extent GEOMETRY;
+  test_point Geometry;
+  width NUMERIC;
+  height NUMERIC;
+  x0 NUMERIC;
+  y0 NUMERIC;
+  yp NUMERIC;
+  srid INTEGER;
+  no_left INTEGER;
+  points GEOMETRY[];
+  bbox_line GEOMETRY;
+  intersection_line GEOMETRY;
+BEGIN
+  extent := ST_Envelope(geom);
+  -- build the scanline in the SRID of the input instead of assuming 4326
+  srid := ST_SRID(geom);
+  width := ST_XMax(extent) - ST_XMIN(extent);
+  height := ST_YMax(extent) - ST_YMIN(extent);
+  x0 := ST_XMin(extent);
+  y0 := ST_YMin(extent);
+  no_left := no_points;
+
+  -- `> 0` also terminates for NULL or negative no_points, which the former
+  -- `= 0` exit test never matched
+  WHILE no_left > 0 LOOP
+    yp = y0 + height*random();
+    -- horizontal line at y = yp spanning the bounding box; ST_MakePoint
+    -- takes (x, y), so x0 goes first
+    bbox_line = ST_MakeLine(
+      ST_SetSRID(ST_MakePoint(x0, yp), srid),
+      ST_SetSRID(ST_MakePoint(x0+width, yp), srid)
+    );
+    intersection_line = ST_Intersection(bbox_line,geom);
+    test_point = ST_LineInterpolatePoint(st_makeline(st_linemerge(intersection_line)),random());
+    points := points || test_point;
+    no_left = no_left - 1 ;
+  END LOOP;
+  RETURN ST_Collect(points);
+END;
+$$
+LANGUAGE plpgsql VOLATILE PARALLEL RESTRICTED;
+-- Make sure by default there are no permissions for publicuser
+-- NOTE: this happens at extension creation time, as part of an implicit transaction.
+-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
+
+-- Grant permissions on the schema to publicuser (but just the schema)
+GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser;
+
+-- Revoke execute permissions on all functions in the schema by default
+-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser;
+--
+-- Fill given extent with a rectangular coverage
+--
+-- @param ext Extent to fill. Only rectangles with center point falling
+--            inside the extent (or at the lower or leftmost edge) will
+--            be emitted. The returned rectangles will have the same SRID
+--            as this extent.
+--
+-- @param width Width of each rectangle
+--
+-- @param height Height of each rectangle
+--
+-- @param origin Optional origin to allow for exact tiling.
+--               If omitted the origin will be 0,0.
+--               The parameter is checked for having the same SRID
+--               as the extent.
+--
+--
+CREATE OR REPLACE FUNCTION CDB_RectangleGrid(ext GEOMETRY, width FLOAT8, height FLOAT8, origin GEOMETRY DEFAULT NULL)
+RETURNS SETOF GEOMETRY
+AS $$
+DECLARE
+  h GEOMETRY; -- rectangle cell
+  hstep FLOAT8; -- horizontal step
+  vstep FLOAT8; -- vertical step
+  hw FLOAT8; -- half width
+  hh FLOAT8; -- half height
+  vstart FLOAT8; -- y of the first row of cell centres
+  hstart FLOAT8; -- x of the first column of cell centres
+  hend FLOAT8; -- right edge of the extent
+  vend FLOAT8; -- top edge of the extent
+  xoff FLOAT8; -- origin x (0 when no origin is given)
+  yoff FLOAT8; -- origin y (0 when no origin is given)
+  xgrd FLOAT8; -- x snapping step (half a cell)
+  ygrd FLOAT8; -- y snapping step (half a cell)
+  x FLOAT8; -- current cell centre x
+  y FLOAT8; -- current cell centre y
+  srid INTEGER;
+BEGIN
+
+  srid := ST_SRID(ext);
+
+  xoff := 0;
+  yoff := 0;
+
+  IF origin IS NOT NULL THEN
+    IF ST_SRID(origin) != srid THEN
+      RAISE EXCEPTION 'SRID mismatch between extent (%) and origin (%)', srid, ST_SRID(origin);
+    END IF;
+    xoff := ST_X(origin);
+    yoff := ST_Y(origin);
+  END IF;
+
+  --RAISE DEBUG 'X offset: %', xoff;
+  --RAISE DEBUG 'Y offset: %', yoff;
+
+  hw := width/2.0;
+  hh := height/2.0;
+
+  xgrd := hw;
+  ygrd := hh;
+  --RAISE DEBUG 'X grid size: %', xgrd;
+  --RAISE DEBUG 'Y grid size: %', ygrd;
+
+  hstep := width;
+  vstep := height;
+
+  -- Tweak horizontal start on hstep grid from origin
+  hstart := xoff + ceil((ST_XMin(ext)-xoff)/hstep)*hstep;
+  --RAISE DEBUG 'hstart: %', hstart;
+
+  -- Tweak vertical start on vstep grid from origin
+  vstart := yoff + ceil((ST_Ymin(ext)-yoff)/vstep)*vstep;
+  --RAISE DEBUG 'vstart: %', vstart;
+
+  hend := ST_XMax(ext);
+  vend := ST_YMax(ext);
+
+  --RAISE DEBUG 'hend: %', hend;
+  --RAISE DEBUG 'vend: %', vend;
+
+  -- column by column: build the first cell of the column, then translate it
+  -- upwards one vstep per emitted row
+  x := hstart;
+  WHILE x < hend LOOP -- over X
+    y := vstart;
+    h := ST_MakeEnvelope(x-hw, y-hh, x+hw, y+hh, srid);
+    WHILE y < vend LOOP -- over Y
+      RETURN NEXT h;
+      h := ST_Translate(h, 0, vstep);
+      y := yoff + round(((y + vstep)-yoff)/ygrd)*ygrd; -- round to grid
+    END LOOP;
+    x := xoff + round(((x + hstep)-xoff)/xgrd)*xgrd; -- round to grid
+  END LOOP;
+
+  RETURN;
+END
+$$ LANGUAGE 'plpgsql' IMMUTABLE PARALLEL SAFE;
+
+--
+-- Calculate the equal interval bins for a given column
+--
+-- @param in_array A numeric array of numbers used to determine
+--                 the bin boundaries (NULL elements are ignored)
+--
+-- @param breaks The number of bins you want to find.
+--
+--
+-- Returns: upper edges of bins
+--          (min + k * (max - min) / breaks for k = 1 .. breaks - 1,
+--          followed by max itself)
+--
+--
+
+CREATE OR REPLACE FUNCTION CDB_EqualIntervalBins ( in_array NUMERIC[], breaks INT ) RETURNS NUMERIC[] as $$
+DECLARE
+  diff numeric;
+  min_val numeric;
+  max_val numeric;
+  tmp_val numeric;
+  i INT := 1;
+  reply numeric[];
+BEGIN
+  SELECT min(e), max(e) INTO min_val, max_val FROM ( SELECT unnest(in_array) e ) x WHERE e IS NOT NULL;
+  -- width of one bin
+  diff = (max_val - min_val) / breaks::numeric;
+  LOOP
+    IF i < breaks THEN
+      tmp_val = min_val + i::numeric * diff;
+      reply = array_append(reply, tmp_val);
+      i := i+1;
+    ELSE
+      -- last edge is max_val itself rather than a computed value
+      reply = array_append(reply, max_val);
+      EXIT;
+    END IF;
+  END LOOP;
+  RETURN reply;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+
+--
+-- Determine the Heads/Tails classifications from a numeric array
+--
+-- @param in_array A numeric array of numbers to determine the best
+--                 bins based on the Heads/Tails method.
+--
+-- @param breaks The number of bins you want to find.
+--
+-- Returns: the mean of the whole array, followed by the mean of the values
+--          above the previous break, repeated until `breaks` values have
+--          been produced or no value lies above the last break.
+--          NULL for an empty array; the sorted input itself when it has
+--          fewer elements than `breaks`.
+--
+
+CREATE OR REPLACE FUNCTION CDB_HeadsTailsBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$
+DECLARE
+  element_count INT4;
+  arr_mean numeric;
+  i INT := 2;
+  reply numeric[];
+BEGIN
+  -- get the total size of our row
+  -- (array_length, as in CDB_JenksBins: array_upper - array_lower is one
+  -- short of the real element count)
+  element_count := array_length(in_array, 1);
+  -- ensure the ordering of in_array
+  SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x;
+  -- stop if no rows
+  IF element_count IS NULL THEN
+    RETURN NULL;
+  END IF;
+  -- stop if our breaks are more than our input array size
+  IF element_count < breaks THEN
+    RETURN in_array;
+  END IF;
+
+  -- get our mean value
+  SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x;
+
+  reply = Array[arr_mean];
+  -- slice our bread
+  LOOP
+    IF i > breaks THEN EXIT; END IF;
+    -- mean of the "head": everything above the previous break
+    SELECT avg(e) INTO arr_mean FROM ( SELECT unnest(in_array) e) x WHERE e > reply[i-1];
+    -- nothing above the last break: no later iteration can add a break
+    EXIT WHEN arr_mean IS NULL;
+    reply = array_append(reply, arr_mean);
+    i := i+1;
+  END LOOP;
+  RETURN reply;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+
+--
+-- Determine the Jenks classifications from a numeric array
+--
+-- @param in_array A numeric array of numbers to determine the best
+--                 bins based on the Jenks method.
+--
+-- @param breaks The number of bins you want to find.
+--
+-- @param iterations The number of different starting positions to test.
+--
+-- @param invert Optional whether to return the top of each bin (default)
+--               or the bottom. BOOLEAN, default=FALSE.
+-- +-- + + +CREATE OR REPLACE FUNCTION CDB_JenksBins ( in_array NUMERIC[], breaks INT, iterations INT DEFAULT 5, invert BOOLEAN DEFAULT FALSE) RETURNS NUMERIC[] as $$ +DECLARE + element_count INT4; + arr_mean NUMERIC; + bot INT; + top INT; + tops INT[]; + classes INT[][]; + i INT := 1; j INT := 1; + curr_result NUMERIC[]; + best_result NUMERIC[]; + seedtarget TEXT; + quant NUMERIC[]; + shuffles INT; +BEGIN + -- get the total size of our row + element_count := array_length(in_array, 1); --array_upper(in_array, 1) - array_lower(in_array, 1); + -- ensure the ordering of in_array + SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x; + -- stop if no rows + IF element_count IS NULL THEN + RETURN NULL; + END IF; + -- stop if our breaks are more than our input array size + IF element_count < breaks THEN + RETURN in_array; + END IF; + + shuffles := LEAST(GREATEST(floor(2500000.0/(element_count::float*iterations::float)), 1), 750)::int; + -- get our mean value + SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x; + + -- assume best is actually Quantile + SELECT cdb_crankshaft.CDB_QuantileBins(in_array, breaks) INTO quant; + + -- if data is very very large, just return quant and be done + IF element_count > 5000000 THEN + RETURN quant; + END IF; + + -- change quant into bottom, top markers + LOOP + IF i = 1 THEN + bot = 1; + ELSE + -- use last top to find this bot + bot = top+1; + END IF; + IF i = breaks THEN + top = element_count; + ELSE + SELECT count(*) INTO top FROM ( SELECT unnest(in_array) as v) x WHERE v <= quant[i]; + END IF; + IF i = 1 THEN + classes = ARRAY[ARRAY[bot,top]]; + ELSE + classes = ARRAY_CAT(classes,ARRAY[bot,top]); + END IF; + IF i > breaks THEN EXIT; END IF; + i = i+1; + END LOOP; + + best_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles); + + --set the seed so we can ensure the same results + SELECT setseed(0.4567) INTO seedtarget; + 
--loop through random starting positions + LOOP + IF j > iterations-1 THEN EXIT; END IF; + i = 1; + tops = ARRAY[element_count]; + LOOP + IF i = breaks THEN EXIT; END IF; + SELECT array_agg(distinct e) INTO tops FROM (SELECT unnest(array_cat(tops, ARRAY[floor(random()*element_count::float)::int])) as e ORDER BY e) x WHERE e != 1; + i = array_length(tops, 1); + END LOOP; + i = 1; + LOOP + IF i > breaks THEN EXIT; END IF; + IF i = 1 THEN + bot = 1; + ELSE + bot = top+1; + END IF; + top = tops[i]; + IF i = 1 THEN + classes = ARRAY[ARRAY[bot,top]]; + ELSE + classes = ARRAY_CAT(classes,ARRAY[bot,top]); + END IF; + i := i+1; + END LOOP; + curr_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles); + + IF curr_result[1] > best_result[1] THEN + best_result = curr_result; + j = j-1; -- if we found a better result, add one more search + END IF; + j = j+1; + END LOOP; + + RETURN (best_result)[2:array_upper(best_result, 1)]; +END; +$$ language plpgsql VOLATILE PARALLEL RESTRICTED; + + + +-- +-- Perform a single iteration of the Jenks classification +-- + +CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_array NUMERIC[], breaks INT, classes INT[][], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$ +DECLARE + tmp_val numeric; + new_classes int[][]; + tmp_class int[]; + i INT := 1; + j INT := 1; + side INT := 2; + sdam numeric; + gvf numeric := 0.0; + new_gvf numeric; + arr_gvf numeric[]; + class_avg numeric; + class_max_i INT; + class_min_i INT; + class_max numeric; + class_min numeric; + reply numeric[]; +BEGIN + + -- Calculate the sum of squared deviations from the array mean (SDAM). 
+ SELECT sum((arr_mean - e)^2) INTO sdam FROM ( SELECT unnest(in_array) as e ) x; + --Identify the breaks for the lowest GVF + LOOP + i = 1; + LOOP + -- get our mean + SELECT avg(e) INTO class_avg FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e) x; + -- find the deviation + SELECT sum((class_avg-e)^2) INTO tmp_val FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e ) x; + IF i = 1 THEN + arr_gvf = ARRAY[tmp_val]; + -- init our min/max map for later + class_max = arr_gvf[i]; + class_min = arr_gvf[i]; + class_min_i = 1; + class_max_i = 1; + ELSE + arr_gvf = array_append(arr_gvf, tmp_val); + END IF; + i := i+1; + IF i > breaks THEN EXIT; END IF; + END LOOP; + -- calculate our new GVF + SELECT sdam-sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) as e ) x; + -- if no improvement was made, exit + IF new_gvf < gvf THEN EXIT; END IF; + gvf = new_gvf; + IF j > max_search THEN EXIT; END IF; + j = j+1; + i = 1; + LOOP + --establish directionality (uppward through classes or downward) + IF arr_gvf[i] < class_min THEN + class_min = arr_gvf[i]; + class_min_i = i; + END IF; + IF arr_gvf[i] > class_max THEN + class_max = arr_gvf[i]; + class_max_i = i; + END IF; + i := i+1; + IF i > breaks THEN EXIT; END IF; + END LOOP; + IF class_max_i > class_min_i THEN + class_min_i = class_max_i - 1; + ELSE + class_min_i = class_max_i + 1; + END IF; + --Move from higher class to a lower gid order + IF class_max_i > class_min_i THEN + classes[class_max_i][1] = classes[class_max_i][1] + 1; + classes[class_min_i][2] = classes[class_min_i][2] + 1; + ELSE -- Move from lower class UP into a higher class by gid + classes[class_max_i][2] = classes[class_max_i][2] - 1; + classes[class_min_i][1] = classes[class_min_i][1] - 1; + END IF; + END LOOP; + + i = 1; + LOOP + IF invert = TRUE THEN + side = 1; --default returns bottom side of breaks, invert returns top side + END IF; + reply = array_append(reply, in_array[classes[i][side]]); + i = i+1; + IF i > breaks THEN EXIT; END 
--
-- Determine the Quantile classifications from a numeric array
--
-- @param in_array A numeric array of numbers to determine the best
--                 bins based on the Quantile method.
--
-- @param breaks The number of bins you want to find.
--
-- Returns an array with one upper bound per bin. The function is declared
-- STRICT, so a NULL argument yields NULL without running the body.
-- NOTE(review): `breaks` is used as a divisor below, so breaks = 0 raises a
-- division-by-zero error; callers are presumably expected to pass breaks >= 1.
--
CREATE OR REPLACE FUNCTION CDB_QuantileBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$
DECLARE
    element_count INT4;
    break_size numeric;
    tmp_val numeric;
    i INT := 1;
    reply numeric[];
BEGIN
    -- sort our values
    SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e ASC) x;
    -- get the total size of our data
    element_count := array_length(in_array, 1);
    -- (possibly fractional) number of elements that belong to each bin
    break_size := element_count::numeric / breaks;
    -- slice our bread
    LOOP
        IF i < breaks THEN
            -- break_size * i is the 1-based position of the i-th boundary
            IF break_size * i % 1 > 0 THEN
                -- fractional position: take the single element at the
                -- rounded-up position (OFFSET is zero-based, hence the - 1)
                SELECT e INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 1 OFFSET ceil(break_size * i) - 1) x;
            ELSE
                -- whole position: average the element at that position with
                -- the one that follows it
                SELECT avg(e) INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 2 OFFSET ceil(break_size * i) - 1 ) x;
            END IF;
        ELSIF i = breaks THEN
            -- select the last value
            SELECT max(e) INTO tmp_val FROM ( SELECT unnest(in_array) e ) x;
        ELSE
            -- i > breaks: every bin has been emitted
            EXIT;
        END IF;

        reply = array_append(reply, tmp_val);
        i := i+1;
    END LOOP;
    RETURN reply;
END;
$$ language plpgsql IMMUTABLE STRICT PARALLEL SAFE;
def verify_data(func):
    """
    Decorator that validates the result of a data-fetching function.

    The wrapped callable is invoked with the original arguments. A truthy
    result is returned unchanged. A falsy result (empty/None) is reported
    through `plpy.error(NULL_VALUE_ERROR)`, and any exception raised while
    fetching or validating is reported through `plpy.error` with an
    'Analysis failed: ...' message.

    @param func: callable returning the fetched rows
    @return: wrapper with the same signature and metadata as `func`
    """
    # imported locally so the decorator does not depend on module imports
    from functools import wraps

    # `wraps` keeps the original __name__/__doc__ on the decorated provider
    # methods instead of exposing every one of them as `wrapper`
    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            data = func(*args, **kwargs)
            if not data:
                plpy.error(NULL_VALUE_ERROR)
            else:
                return data
        except Exception as err:
            plpy.error('Analysis failed: {}'.format(err))

        # only reached when plpy.error does not raise (e.g. a stubbed plpy)
        return []

    return wrapper
+ + Inputs - a dict (params) with the following keys: + colnames: a (text) list of column names (e.g., + `['andy', 'cookie']`) + id_col: the name of the id column (e.g., `'cartodb_id'`) + subquery: the subquery for exposing the data (e.g., + SELECT * FROM favorite_things) + Output: + A SQL query for packaging the data for consumption within + `KMeans().nonspatial`. Format will be a list of length one, + with the first element a dict with keys ('rowid', 'attr1', + 'attr2', ...) + """ + agg_cols = ', '.join([ + 'array_agg({0}) As arr_col{1}'.format(val, idx+1) + for idx, val in enumerate(params['colnames']) + ]) + query = ''' + SELECT {cols}, array_agg({id_col}) As rowid + FROM ({subquery}) As a + '''.format(subquery=params['subquery'], + id_col=params['id_col'], + cols=agg_cols).strip() + return plpy.execute(query) + + @verify_data + def get_spatial_kmeans(self, params): + """fetch data for spatial kmeans""" + query = ''' + SELECT + array_agg("{id_col}" ORDER BY "{id_col}") as ids, + array_agg(ST_X("{geom_col}") ORDER BY "{id_col}") As xs, + array_agg(ST_Y("{geom_col}") ORDER BY "{id_col}") As ys + FROM ({subquery}) As a + WHERE "{geom_col}" IS NOT NULL + '''.format(**params) + return plpy.execute(query) + + @verify_data + def get_gwr(self, params): + """fetch data for gwr analysis""" + query = pu.gwr_query(params) + return plpy.execute(query) + + @verify_data + def get_gwr_predict(self, params): + """fetch data for gwr predict""" + query = pu.gwr_predict_query(params) + return plpy.execute(query) diff --git a/release/python/0.7.0/crankshaft/crankshaft/clustering/__init__.py b/release/python/0.7.0/crankshaft/crankshaft/clustering/__init__.py new file mode 100644 index 0000000..d9682fa --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/clustering/__init__.py @@ -0,0 +1,4 @@ +"""Import all functions from for clustering""" +from moran import * +from kmeans import * +from getis import * diff --git 
class Getis(object):
    """High level interface to the Getis-Ord's G* hotspot analysis."""

    def __init__(self, data_provider=None):
        # fall back to the default provider when none is injected
        self.data_provider = (AnalysisDataProvider()
                              if data_provider is None else data_provider)

    def getis_ord(self, subquery, attr,
                  w_type, num_ngbrs, permutations, geom_col, id_col):
        """
        Getis-Ord's G*
        Implementation building neighbors with a PostGIS database and PySAL's
          Getis-Ord's G* hotspot/coldspot module.
        Andy Eschbacher

        Returns the zipped (z_sim, p_sim, p_z_sim, id) values produced by
        PySAL's `G_Local`, with ids taken from the weight object's order.
        """

        # geometries with attributes that are null are ignored
        # resulting in a collection of not as near neighbors if kNN is chosen

        # key order matters: it is preserved when the query is constructed
        query_params = OrderedDict()
        query_params["id_col"] = id_col
        query_params["attr1"] = attr
        query_params["geom_col"] = geom_col
        query_params["subquery"] = subquery
        query_params["num_ngbrs"] = num_ngbrs

        rows = self.data_provider.get_getis(w_type, query_params)
        values = pu.get_attributes(rows)

        # build PySAL weight object
        weights = pu.get_weight(rows, w_type, num_ngbrs)

        # calculate Getis-Ord's G* z- and p-values
        g_local = ps.esda.getisord.G_Local(values, weights,
                                           star=True,
                                           permutations=permutations)

        return zip(g_local.z_sim, g_local.p_sim, g_local.p_z_sim,
                   weights.id_order)
retrieve the data required to do the + k-means clustering analysis, like so: + SELECT * FROM iris_flower_data + colnames (list): a list of the column names which contain the data + of interest, like so: ['sepal_width', + 'petal_width', + 'sepal_length', + 'petal_length'] + no_clusters (int): number of clusters (greater than zero) + id_col (string): name of the input id_column + + Returns: + A list of tuples with the following columns: + cluster labels: a label for the cluster that the row belongs to + centers: center of the cluster that this row belongs to + silhouettes: silhouette measure for this value + rowid: row that these values belong to (corresponds to the value in + `id_col`) + """ + import json + from sklearn import metrics + + params = { + "colnames": colnames, + "subquery": subquery, + "id_col": id_col + } + + data = self.data_provider.get_nonspatial_kmeans(params) + + # fill array with values for k-means clustering + if standardize: + cluster_columns = _scale_data( + _extract_columns(data)) + else: + cluster_columns = _extract_columns(data) + + kmeans = KMeans(n_clusters=no_clusters, + random_state=0).fit(cluster_columns) + + centers = [json.dumps(dict(zip(colnames, c))) + for c in kmeans.cluster_centers_[kmeans.labels_]] + + silhouettes = metrics.silhouette_samples(cluster_columns, + kmeans.labels_, + metric='sqeuclidean') + + return zip(kmeans.labels_, + centers, + silhouettes, + [kmeans.inertia_] * kmeans.labels_.shape[0], + data[0]['rowid']) + + +# -- Preprocessing steps + +def _extract_columns(data): + """ + Extract the features from the query and pack them into a NumPy array + data (list of dicts): result of the kmeans request + """ + # number of columns minus rowid column + n_cols = len(data[0]) - 1 + return np.array([data[0]['arr_col{0}'.format(i+1)] + for i in xrange(n_cols)], + dtype=float).T + + +def _scale_data(features): + """ + Scale all input columns to center on 0 with a standard devation of 1 + features (numpy matrix): features of 
dimension (n_features, n_samples) + """ + from sklearn.preprocessing import StandardScaler + scaler = StandardScaler() + return scaler.fit_transform(features) diff --git a/release/python/0.7.0/crankshaft/crankshaft/clustering/moran.py b/release/python/0.7.0/crankshaft/crankshaft/clustering/moran.py new file mode 100644 index 0000000..0d5753f --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/clustering/moran.py @@ -0,0 +1,208 @@ +""" +Moran's I geostatistics (global clustering & outliers presence) +""" + +# TODO: Fill in local neighbors which have null/NoneType values with the +# average of the their neighborhood + +import pysal as ps +from collections import OrderedDict +from crankshaft.analysis_data_provider import AnalysisDataProvider + +# crankshaft module +import crankshaft.pysal_utils as pu + +# High level interface --------------------------------------- + + +class Moran(object): + def __init__(self, data_provider=None): + if data_provider is None: + self.data_provider = AnalysisDataProvider() + else: + self.data_provider = data_provider + + def global_stat(self, subquery, attr_name, + w_type, num_ngbrs, permutations, geom_col, id_col): + """ + Moran's I (global) + Implementation building neighbors with a PostGIS database and Moran's I + core clusters with PySAL. 
+ Andy Eschbacher + """ + params = OrderedDict([("id_col", id_col), + ("attr1", attr_name), + ("geom_col", geom_col), + ("subquery", subquery), + ("num_ngbrs", num_ngbrs)]) + + result = self.data_provider.get_moran(w_type, params) + + # collect attributes + attr_vals = pu.get_attributes(result) + + # calculate weights + weight = pu.get_weight(result, w_type, num_ngbrs) + + # calculate moran global + moran_global = ps.esda.moran.Moran(attr_vals, weight, + permutations=permutations) + + return zip([moran_global.I], [moran_global.EI]) + + def local_stat(self, subquery, attr, + w_type, num_ngbrs, permutations, geom_col, id_col): + """ + Moran's I implementation for PL/Python + Andy Eschbacher + """ + + # geometries with attributes that are null are ignored + # resulting in a collection of not as near neighbors + + params = OrderedDict([("id_col", id_col), + ("attr1", attr), + ("geom_col", geom_col), + ("subquery", subquery), + ("num_ngbrs", num_ngbrs)]) + + result = self.data_provider.get_moran(w_type, params) + + attr_vals = pu.get_attributes(result) + weight = pu.get_weight(result, w_type, num_ngbrs) + + # calculate LISA values + lisa = ps.esda.moran.Moran_Local(attr_vals, weight, + permutations=permutations) + + # find quadrants for each geometry + quads = quad_position(lisa.q) + + return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y) + + def global_rate_stat(self, subquery, numerator, denominator, + w_type, num_ngbrs, permutations, geom_col, id_col): + """ + Moran's I Rate (global) + Andy Eschbacher + """ + params = OrderedDict([("id_col", id_col), + ("attr1", numerator), + ("attr2", denominator), + ("geom_col", geom_col), + ("subquery", subquery), + ("num_ngbrs", num_ngbrs)]) + + result = self.data_provider.get_moran(w_type, params) + + # collect attributes + numer = pu.get_attributes(result, 1) + denom = pu.get_attributes(result, 2) + + weight = pu.get_weight(result, w_type, num_ngbrs) + + # calculate moran global rate + lisa_rate = 
def map_quads(coord):
    """
        Map a quadrant number to Moran's I designation
        HH=1, LH=2, LL=3, HL=4
        Input:
        @param coord (int): quadrant of a specific measurement
        Output:
        classification (one of 'HH', 'LH', 'LL', or 'HL'), or None when
        `coord` is not one of the four quadrant numbers
    """
    designations = ((1, 'HH'), (2, 'LH'), (3, 'LL'), (4, 'HL'))
    # compare with == in quadrant order so any value equal to 1..4 matches
    for quadrant, label in designations:
        if coord == quadrant:
            return label
    return None
def query_attr_select(params, table_ref=True):
    """
        Create portion of SELECT statement for attributes involved in query.
        Defaults to order in the params
        @param params: dict of information used in query (column names,
                       table name, etc.)
        @param table_ref: when truthy, columns are prefixed with the `i.`
                          table alias (not applied to the `ind_vars` form)
            Example:
            OrderedDict([('numerator', 'price'),
                         ('denominator', 'sq_meters'),
                         ('subquery', 'SELECT * FROM interesting_data')])
        Output:
          "i.\"price\"::numeric As attr1, " \
          "i.\"sq_meters\"::numeric As attr2, "
    """
    template = '"%(col)s"::numeric As attr%(alias_num)s, '
    if table_ref:
        template = "i." + template

    if 'time_cols' in params or 'ind_vars' in params:
        # markov or gwr analysis: the column names are listed explicitly
        columns = (params['time_cols'] if 'time_cols' in params
                   else params['ind_vars'])
        if 'ind_vars' in params:
            # gwr aggregates every column into an array
            template = 'array_agg("%(col)s"::numeric) As attr%(alias_num)s, '
    else:
        # moran's analysis: every key that is not bookkeeping names a column
        reserved = ('id_col', 'geom_col', 'subquery', 'num_ngbrs')
        columns = [params[key] for key in params if key not in reserved]

    # aliases are numbered from 1 in the order the columns appear
    return "".join(template % {"col": column, "alias_num": position}
                   for position, column in enumerate(columns, 1))
def knn(params):
    """SQL query for k-nearest neighbors.

        For every row of the subquery the query returns its id, its
        attribute columns and an array with the ids of its nearest
        neighbors (ordered with the `<->` distance operator).

        @param params: dict of values to fill template; the template reads
                       the keys `id_col`, `geom_col`, `subquery` and
                       `num_ngbrs`, the remaining keys are interpreted by
                       query_attr_select / query_attr_where
        @return: the query as a string
    """

    attr_select = query_attr_select(params, table_ref=True)
    attr_where = query_attr_where(params, table_ref=True)

    # the WHERE fragment is built once with the `idx_replace` placeholder
    # and then specialised for the outer (i) and inner (j) table aliases
    replacements = {"attr_select": attr_select,
                    "attr_where_i": attr_where.replace("idx_replace", "i"),
                    "attr_where_j": attr_where.replace("idx_replace", "j")}

    # two-stage substitution: %(...)s inserts the generated fragments first,
    # the {...} placeholders are filled from `params` by format() below
    query = '''
        SELECT
          i."{id_col}" As id,
          %(attr_select)s
          (SELECT ARRAY(SELECT j."{id_col}"
                        FROM ({subquery}) As j
                        WHERE i."{id_col}" <> j."{id_col}" AND
                              %(attr_where_j)s AND
                              j."{geom_col}" IS NOT NULL
                        ORDER BY j."{geom_col}" <-> i."{geom_col}" ASC
                        LIMIT {num_ngbrs})) As neighbors
        FROM ({subquery}) As i
        WHERE %(attr_where_i)s AND i."{geom_col}" IS NOT NULL
        ORDER BY i."{id_col}" ASC;
    ''' % replacements

    return query.format(**params)
def get_attributes(query_res, attr_num=1):
    """
        Collect one attribute column of the query result as a float array.

        @param query_res: query results with attributes and neighbors
        @param attr_num: attribute number (1, 2, ...)
        @return: 1-d NumPy float array with the values stored under
                 'attr<attr_num>' for every row, in row order
    """
    column = 'attr' + str(attr_num)
    # the builtin float is the type the `np.float` alias referred to; the
    # alias itself was removed from NumPy, so it must not be used here
    return np.array([row[column] for row in query_res], dtype=float)
"cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#Prepare some test data - columbus example\n", + "db = pysal.open(pysal.examples.get_path('columbus.dbf'),'r')\n", + "y = np.array(db.by_col(\"HOVAL\"))\n", + "y = np.reshape(y, (49,1))\n", + "X = []\n", + "#X.append(np.ones(len(y)))\n", + "X.append(db.by_col(\"INC\"))\n", + "X.append(db.by_col(\"CRIME\"))\n", + "X = np.array(X).T" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 46.42818268]\n", + " [ 0.62898397]\n", + " [ -0.48488854]]\n" + ] + } + ], + "source": [ + "#First fit pysal OLS model\n", + "from pysal.spreg import ols\n", + "OLS = ols.OLS(y, X)\n", + "print OLS.betas" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false, + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "[ 46.42818268 0.62898397 -0.48488854]\n", + "[ 46.42818268 0.62898397 -0.48488854]\n" + ] + } + ], + "source": [ + "#Then fit Gaussian GLM\n", + "\n", + "#create Gaussian GLM model object\n", + "model = GLM(y, X, Gaussian())\n", + "model\n", + "\n", + "#Fit model to estimate coefficients and return GLMResults object\n", + "results = model.fit()\n", + "\n", + "#Check coefficients - R betas [46.4282, 0.6290, -0.4849]\n", + "print results.params\n", + "\n", + "# Gaussian GLM results from statsmodels\n", + "sm_model = smf.GLM(y, sm.add_constant(X), family=families.Gaussian())\n", + "sm_results = sm_model.fit()\n", + "print sm_results.params" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2 2\n", + "\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + 
"True\n", + "True\n", + "\n", + "\n", + "\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n" + ] + } + ], + "source": [ + "print results.df_model, sm_results.df_model\n", + "print np.allclose(results.aic, sm_results.aic)\n", + "print np.allclose(results.bic, sm_results.bic)\n", + "print np.allclose(results.deviance, sm_results.deviance)\n", + "print np.allclose(results.df_model, sm_results.df_model)\n", + "print np.allclose(results.df_resid, sm_results.df_resid)\n", + "print np.allclose(results.llf, sm_results.llf)\n", + "print np.allclose(results.mu, sm_results.mu)\n", + "print np.allclose(results.n, sm_results.nobs)\n", + "print np.allclose(results.null, sm_results.null)\n", + "print np.allclose(results.null_deviance, sm_results.null_deviance)\n", + "print np.allclose(results.params, sm_results.params)\n", + "print np.allclose(results.pearson_chi2, sm_results.pearson_chi2)\n", + "print np.allclose(results.resid_anscombe, sm_results.resid_anscombe)\n", + "print np.allclose(results.resid_deviance, sm_results.resid_deviance)\n", + "print np.allclose(results.resid_pearson, sm_results.resid_pearson)\n", + "print np.allclose(results.resid_response, sm_results.resid_response)\n", + "print np.allclose(results.resid_working, sm_results.resid_working)\n", + "print np.allclose(results.scale, sm_results.scale)\n", + "print np.allclose(results.normalized_cov_params, sm_results.normalized_cov_params)\n", + "print np.allclose(results.cov_params(), sm_results.cov_params())\n", + "print np.allclose(results.bse, sm_results.bse)\n", + "print np.allclose(results.conf_int(), sm_results.conf_int())\n", + "print np.allclose(results.pvalues, sm_results.pvalues)\n", + "print np.allclose(results.tvalues, sm_results.tvalues)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "[ 3.92159085 0.01183491 -0.01371397]\n", + "[ 3.92159085 0.01183491 -0.01371397]\n" + ] + } + ], + "source": [ + "#Now fit a Poisson GLM \n", + "\n", + "poisson_y = np.round(y).astype(int)\n", + "\n", + "#create Poisson GLM model object\n", + "model = GLM(poisson_y, X, Poisson())\n", + "model\n", + "\n", + "#Fit model to estimate coefficients and return GLMResults object\n", + "results = model.fit()\n", + "\n", + "#Check coefficients - R betas [3.91926, 0.01198, -0.01371]\n", + "print results.params.T\n", + "\n", + "# Poisson GLM results from statsmodels\n", + "sm_results = smf.GLM(poisson_y, sm.add_constant(X), family=families.Poisson()).fit()\n", + "print sm_results.params" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "\n", + "\n", + "\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "[ 0.13049161 0.00511599 0.00193769] [ 0.13049161 0.00511599 0.00193769]\n" + ] + } + ], + "source": [ + "print np.allclose(results.aic, sm_results.aic)\n", + "print np.allclose(results.bic, sm_results.bic)\n", + "print np.allclose(results.deviance, sm_results.deviance)\n", + "print np.allclose(results.df_model, sm_results.df_model)\n", + "print np.allclose(results.df_resid, sm_results.df_resid)\n", + "print np.allclose(results.llf, sm_results.llf)\n", + "print np.allclose(results.mu, sm_results.mu)\n", + "print np.allclose(results.n, sm_results.nobs)\n", + "print np.allclose(results.null, sm_results.null)\n", + "print np.allclose(results.null_deviance, sm_results.null_deviance)\n", + "print np.allclose(results.params, 
sm_results.params)\n", + "print np.allclose(results.pearson_chi2, sm_results.pearson_chi2)\n", + "print np.allclose(results.resid_anscombe, sm_results.resid_anscombe)\n", + "print np.allclose(results.resid_deviance, sm_results.resid_deviance)\n", + "print np.allclose(results.resid_pearson, sm_results.resid_pearson)\n", + "print np.allclose(results.resid_response, sm_results.resid_response)\n", + "print np.allclose(results.resid_working, sm_results.resid_working)\n", + "print np.allclose(results.scale, sm_results.scale)\n", + "print np.allclose(results.normalized_cov_params, sm_results.normalized_cov_params)\n", + "print np.allclose(results.cov_params(), sm_results.cov_params())\n", + "print np.allclose(results.bse, sm_results.bse)\n", + "print np.allclose(results.conf_int(), sm_results.conf_int())\n", + "print np.allclose(results.pvalues, sm_results.pvalues)\n", + "print np.allclose(results.tvalues, sm_results.tvalues)\n", + "print results.bse, sm_results.bse" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": { + "collapsed": false, + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-5.33638276 0.0287754 ]\n", + "[-5.33638276 0.0287754 ]\n" + ] + } + ], + "source": [ + "#Now fit a binomial GLM\n", + "londonhp = pd.read_csv('/Users/toshan/projects/londonhp.csv')\n", + "#londonhp = pd.read_csv('/Users/qszhao/Dropbox/pysal/pysal/contrib/gwr/londonhp.csv')\n", + "y = londonhp['BATH2'].values\n", + "y = np.reshape(y, (316,1))\n", + "X = londonhp['FLOORSZ'].values\n", + "X = np.reshape(X, (316,1))\n", + "\n", + "#create logistic GLM model object\n", + "model = GLM(y, X, Binomial())\n", + "model\n", + "\n", + "#Fit model to estimate coefficients and return GLMResults object\n", + "results = model.fit()\n", + "\n", + "#Check coefficients - R betas [-5.33638, 0.02878]\n", + "print results.params.T\n", + "\n", + "# Logistic GLM results from statsmodels\n", + "sm_results = smf.GLM(y, 
sm.add_constant(X), family=families.Binomial()).fit()\n", + "print sm_results.params" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 1\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n", + "True\n" + ] + } + ], + "source": [ + "print results.df_model, sm_results.df_model\n", + "print np.allclose(results.aic, sm_results.aic)\n", + "print np.allclose(results.bic, sm_results.bic)\n", + "print np.allclose(results.deviance, sm_results.deviance)\n", + "print np.allclose(results.df_model, sm_results.df_model)\n", + "print np.allclose(results.df_resid, sm_results.df_resid)\n", + "print np.allclose(results.llf, sm_results.llf)\n", + "print np.allclose(results.mu, sm_results.mu)\n", + "print np.allclose(results.n, sm_results.nobs)\n", + "print np.allclose(results.null, sm_results.null)\n", + "print np.allclose(results.null_deviance, sm_results.null_deviance)\n", + "print np.allclose(results.params, sm_results.params)\n", + "print np.allclose(results.pearson_chi2, sm_results.pearson_chi2)\n", + "print np.allclose(results.resid_anscombe, sm_results.resid_anscombe)\n", + "print np.allclose(results.resid_deviance, sm_results.resid_deviance)\n", + "print np.allclose(results.resid_pearson, sm_results.resid_pearson)\n", + "print np.allclose(results.resid_response, sm_results.resid_response)\n", + "print np.allclose(results.resid_working, sm_results.resid_working)\n", + "print np.allclose(results.scale, sm_results.scale)\n", + "print np.allclose(results.normalized_cov_params, sm_results.normalized_cov_params)\n", + "print np.allclose(results.cov_params(), sm_results.cov_params())\n", + "print np.allclose(results.bse, 
sm_results.bse)\n", + "print np.allclose(results.conf_int(), sm_results.conf_int())\n", + "print np.allclose(results.pvalues, sm_results.pvalues)\n", + "print np.allclose(results.tvalues, sm_results.tvalues)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "#create QUasiPoisson GLM model object\n", + "model = GLM(poisson_y, X, QuasiPoisson())\n", + "model\n", + "\n", + "#Fit model to estimate coefficients and return GLMResults object\n", + "results = model.fit()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/glm/__init__.py b/release/python/0.7.0/crankshaft/crankshaft/regression/glm/__init__.py new file mode 100644 index 0000000..4a468d5 --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/glm/__init__.py @@ -0,0 +1,4 @@ +import glm +import family +import utils +import iwls diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/glm/base.py b/release/python/0.7.0/crankshaft/crankshaft/regression/glm/base.py new file mode 100644 index 0000000..484c1c8 --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/glm/base.py @@ -0,0 +1,959 @@ + +from __future__ import print_function +import numpy as np +from scipy import stats +from utils import cache_readonly + +class Results(object): + """ + Class to contain model results + Parameters + ---------- + model : class instance + the previously specified 
model instance + params : array + parameter estimates from the fit model + """ + def __init__(self, model, params, **kwd): + self.__dict__.update(kwd) + self.initialize(model, params, **kwd) + self._data_attr = [] + + def initialize(self, model, params, **kwd): + self.params = params + self.model = model + if hasattr(model, 'k_constant'): + self.k_constant = model.k_constant + + def predict(self, exog=None, transform=True, *args, **kwargs): + """ + Call self.model.predict with self.params as the first argument. + Parameters + ---------- + exog : array-like, optional + The values for which you want to predict. + transform : bool, optional + If the model was fit via a formula, do you want to pass + exog through the formula. Default is True. E.g., if you fit + a model y ~ log(x1) + log(x2), and transform is True, then + you can pass a data structure that contains x1 and x2 in + their original form. Otherwise, you'd need to log the data + first. + args, kwargs : + Some models can take additional arguments or keywords, see the + predict method of the model for the details. + Returns + ------- + prediction : ndarray or pandas.Series + See self.model.predict + """ + if transform and hasattr(self.model, 'formula') and exog is not None: + from patsy import dmatrix + exog = dmatrix(self.model.data.design_info.builder, + exog) + + if exog is not None: + exog = np.asarray(exog) + if exog.ndim == 1 and (self.model.exog.ndim == 1 or + self.model.exog.shape[1] == 1): + exog = exog[:, None] + exog = np.atleast_2d(exog) # needed in count model shape[1] + + return self.model.predict(self.params, exog, *args, **kwargs) + + +#TODO: public method? +class LikelihoodModelResults(Results): + """ + Class to contain results from likelihood models + Parameters + ----------- + model : LikelihoodModel instance or subclass instance + LikelihoodModelResults holds a reference to the model that is fit. 
+ params : 1d array_like + parameter estimates from estimated model + normalized_cov_params : 2d array + Normalized (before scaling) covariance of params. (dot(X.T,X))**-1 + scale : float + For (some subset of models) scale will typically be the + mean square error from the estimated model (sigma^2) + Returns + ------- + **Attributes** + mle_retvals : dict + Contains the values returned from the chosen optimization method if + full_output is True during the fit. Available only if the model + is fit by maximum likelihood. See notes below for the output from + the different methods. + mle_settings : dict + Contains the arguments passed to the chosen optimization method. + Available if the model is fit by maximum likelihood. See + LikelihoodModel.fit for more information. + model : model instance + LikelihoodResults contains a reference to the model that is fit. + params : ndarray + The parameters estimated for the model. + scale : float + The scaling factor of the model given during instantiation. + tvalues : array + The t-values of the standard errors. + Notes + ----- + The covariance of params is given by scale times normalized_cov_params. + Return values by solver if full_output is True during fit: + 'newton' + fopt : float + The value of the (negative) loglikelihood at its + minimum. + iterations : int + Number of iterations performed. + score : ndarray + The score vector at the optimum. + Hessian : ndarray + The Hessian at the optimum. + warnflag : int + 1 if maxiter is exceeded. 0 if successful convergence. + converged : bool + True: converged. False: did not converge. + allvecs : list + List of solutions at each iteration. + 'nm' + fopt : float + The value of the (negative) loglikelihood at its + minimum. + iterations : int + Number of iterations performed. + warnflag : int + 1: Maximum number of function evaluations made. + 2: Maximum number of iterations reached. + converged : bool + True: converged. False: did not converge. 
+ allvecs : list + List of solutions at each iteration. + 'bfgs' + fopt : float + Value of the (negative) loglikelihood at its minimum. + gopt : float + Value of gradient at minimum, which should be near 0. + Hinv : ndarray + value of the inverse Hessian matrix at minimum. Note + that this is just an approximation and will often be + different from the value of the analytic Hessian. + fcalls : int + Number of calls to loglike. + gcalls : int + Number of calls to gradient/score. + warnflag : int + 1: Maximum number of iterations exceeded. 2: Gradient + and/or function calls are not changing. + converged : bool + True: converged. False: did not converge. + allvecs : list + Results at each iteration. + 'lbfgs' + fopt : float + Value of the (negative) loglikelihood at its minimum. + gopt : float + Value of gradient at minimum, which should be near 0. + fcalls : int + Number of calls to loglike. + warnflag : int + Warning flag: + - 0 if converged + - 1 if too many function evaluations or too many iterations + - 2 if stopped for another reason + converged : bool + True: converged. False: did not converge. + 'powell' + fopt : float + Value of the (negative) loglikelihood at its minimum. + direc : ndarray + Current direction set. + iterations : int + Number of iterations performed. + fcalls : int + Number of calls to loglike. + warnflag : int + 1: Maximum number of function evaluations. 2: Maximum number + of iterations. + converged : bool + True : converged. False: did not converge. + allvecs : list + Results at each iteration. + 'cg' + fopt : float + Value of the (negative) loglikelihood at its minimum. + fcalls : int + Number of calls to loglike. + gcalls : int + Number of calls to gradient/score. + warnflag : int + 1: Maximum number of iterations exceeded. 2: Gradient and/ + or function calls not changing. + converged : bool + True: converged. False: did not converge. + allvecs : list + Results at each iteration. 
+ 'ncg' + fopt : float + Value of the (negative) loglikelihood at its minimum. + fcalls : int + Number of calls to loglike. + gcalls : int + Number of calls to gradient/score. + hcalls : int + Number of calls to hessian. + warnflag : int + 1: Maximum number of iterations exceeded. + converged : bool + True: converged. False: did not converge. + allvecs : list + Results at each iteration. + """ + + # by default we use normal distribution + # can be overwritten by instances or subclasses + use_t = False + + def __init__(self, model, params, normalized_cov_params=None, scale=1., + **kwargs): + super(LikelihoodModelResults, self).__init__(model, params) + self.normalized_cov_params = normalized_cov_params + self.scale = scale + + # robust covariance + # We put cov_type in kwargs so subclasses can decide in fit whether to + # use this generic implementation + if 'use_t' in kwargs: + use_t = kwargs['use_t'] + if use_t is not None: + self.use_t = use_t + if 'cov_type' in kwargs: + cov_type = kwargs.get('cov_type', 'nonrobust') + cov_kwds = kwargs.get('cov_kwds', {}) + + if cov_type == 'nonrobust': + self.cov_type = 'nonrobust' + self.cov_kwds = {'description' : 'Standard Errors assume that the ' + + 'covariance matrix of the errors is correctly ' + + 'specified.'} + else: + from statsmodels.base.covtype import get_robustcov_results + if cov_kwds is None: + cov_kwds = {} + use_t = self.use_t + # TODO: we shouldn't need use_t in get_robustcov_results + get_robustcov_results(self, cov_type=cov_type, use_self=True, + use_t=use_t, **cov_kwds) + + + def normalized_cov_params(self): + raise NotImplementedError + + + def _get_robustcov_results(self, cov_type='nonrobust', use_self=True, + use_t=None, **cov_kwds): + from statsmodels.base.covtype import get_robustcov_results + if cov_kwds is None: + cov_kwds = {} + + if cov_type == 'nonrobust': + self.cov_type = 'nonrobust' + self.cov_kwds = {'description' : 'Standard Errors assume that the ' + + 'covariance matrix of the errors is 
correctly ' + + 'specified.'} + else: + # TODO: we shouldn't need use_t in get_robustcov_results + get_robustcov_results(self, cov_type=cov_type, use_self=True, + use_t=use_t, **cov_kwds) + + @cache_readonly + def llf(self): + return self.model.loglike(self.params) + + @cache_readonly + def bse(self): + return np.sqrt(np.diag(self.cov_params())) + + @cache_readonly + def tvalues(self): + """ + Return the t-statistic for a given parameter estimate. + """ + return self.params / self.bse + + @cache_readonly + def pvalues(self): + if self.use_t: + df_resid = getattr(self, 'df_resid_inference', self.df_resid) + return stats.t.sf(np.abs(self.tvalues), df_resid)*2 + else: + return stats.norm.sf(np.abs(self.tvalues))*2 + + + def cov_params(self, r_matrix=None, column=None, scale=None, cov_p=None, + other=None): + """ + Returns the variance/covariance matrix. + The variance/covariance matrix can be of a linear contrast + of the estimates of params or all params multiplied by scale which + will usually be an estimate of sigma^2. Scale is assumed to be + a scalar. + Parameters + ---------- + r_matrix : array-like + Can be 1d, or 2d. Can be used alone or with other. + column : array-like, optional + Must be used on its own. Can be 0d or 1d see below. + scale : float, optional + Can be specified or not. Default is None, which means that + the scale argument is taken from the model. + other : array-like, optional + Can be used when r_matrix is specified. + Returns + ------- + cov : ndarray + covariance matrix of the parameter estimates or of linear + combination of parameter estimates. See Notes. + Notes + ----- + (The below are assumed to be in matrix notation.) 
+ If no argument is specified returns the covariance matrix of a model + ``(scale)*(X.T X)^(-1)`` + If contrast is specified it pre and post-multiplies as follows + ``(scale) * r_matrix (X.T X)^(-1) r_matrix.T`` + If contrast and other are specified returns + ``(scale) * r_matrix (X.T X)^(-1) other.T`` + If column is specified returns + ``(scale) * (X.T X)^(-1)[column,column]`` if column is 0d + OR + ``(scale) * (X.T X)^(-1)[column][:,column]`` if column is 1d + """ + if (hasattr(self, 'mle_settings') and + self.mle_settings['optimizer'] in ['l1', 'l1_cvxopt_cp']): + dot_fun = nan_dot + else: + dot_fun = np.dot + + if (cov_p is None and self.normalized_cov_params is None and + not hasattr(self, 'cov_params_default')): + raise ValueError('need covariance of parameters for computing ' + '(unnormalized) covariances') + if column is not None and (r_matrix is not None or other is not None): + raise ValueError('Column should be specified without other ' + 'arguments.') + if other is not None and r_matrix is None: + raise ValueError('other can only be specified with r_matrix') + + if cov_p is None: + if hasattr(self, 'cov_params_default'): + cov_p = self.cov_params_default + else: + if scale is None: + scale = self.scale + cov_p = self.normalized_cov_params * scale + + if column is not None: + column = np.asarray(column) + if column.shape == (): + return cov_p[column, column] + else: + #return cov_p[column][:, column] + return cov_p[column[:, None], column] + elif r_matrix is not None: + r_matrix = np.asarray(r_matrix) + if r_matrix.shape == (): + raise ValueError("r_matrix should be 1d or 2d") + if other is None: + other = r_matrix + else: + other = np.asarray(other) + tmp = dot_fun(r_matrix, dot_fun(cov_p, np.transpose(other))) + return tmp + else: # if r_matrix is None and column is None: + return cov_p + + #TODO: make sure this works as needed for GLMs + def t_test(self, r_matrix, cov_p=None, scale=None, + use_t=None): + """ + Compute a t-test for a each linear 
hypothesis of the form Rb = q + Parameters + ---------- + r_matrix : array-like, str, tuple + - array : If an array is given, a p x k 2d array or length k 1d + array specifying the linear restrictions. It is assumed + that the linear combination is equal to zero. + - str : The full hypotheses to test can be given as a string. + See the examples. + - tuple : A tuple of arrays in the form (R, q). If q is given, + can be either a scalar or a length p row vector. + cov_p : array-like, optional + An alternative estimate for the parameter covariance matrix. + If None is given, self.normalized_cov_params is used. + scale : float, optional + An optional `scale` to use. Default is the scale specified + by the model fit. + use_t : bool, optional + If use_t is None, then the default of the model is used. + If use_t is True, then the p-values are based on the t + distribution. + If use_t is False, then the p-values are based on the normal + distribution. + Returns + ------- + res : ContrastResults instance + The results for the test are attributes of this results instance. + The available results have the same elements as the parameter table + in `summary()`. + Examples + -------- + >>> import numpy as np + >>> import statsmodels.api as sm + >>> data = sm.datasets.longley.load() + >>> data.exog = sm.add_constant(data.exog) + >>> results = sm.OLS(data.endog, data.exog).fit() + >>> r = np.zeros_like(results.params) + >>> r[5:] = [1,-1] + >>> print(r) + [ 0. 0. 0. 0. 0. 1. -1.] + r tests that the coefficients on the 5th and 6th independent + variable are the same. 
+ >>> T_test = results.t_test(r) + >>> print(T_test) + + >>> T_test.effect + -1829.2025687192481 + >>> T_test.sd + 455.39079425193762 + >>> T_test.tvalue + -4.0167754636411717 + >>> T_test.pvalue + 0.0015163772380899498 + Alternatively, you can specify the hypothesis tests using a string + >>> from statsmodels.formula.api import ols + >>> dta = sm.datasets.longley.load_pandas().data + >>> formula = 'TOTEMP ~ GNPDEFL + GNP + UNEMP + ARMED + POP + YEAR' + >>> results = ols(formula, dta).fit() + >>> hypotheses = 'GNPDEFL = GNP, UNEMP = 2, YEAR/1829 = 1' + >>> t_test = results.t_test(hypotheses) + >>> print(t_test) + See Also + --------- + tvalues : individual t statistics + f_test : for F tests + patsy.DesignInfo.linear_constraint + """ + from patsy import DesignInfo + names = self.model.data.param_names + LC = DesignInfo(names).linear_constraint(r_matrix) + r_matrix, q_matrix = LC.coefs, LC.constants + num_ttests = r_matrix.shape[0] + num_params = r_matrix.shape[1] + + if (cov_p is None and self.normalized_cov_params is None and + not hasattr(self, 'cov_params_default')): + raise ValueError('Need covariance of parameters for computing ' + 'T statistics') + if num_params != self.params.shape[0]: + raise ValueError('r_matrix and params are not aligned') + if q_matrix is None: + q_matrix = np.zeros(num_ttests) + else: + q_matrix = np.asarray(q_matrix) + q_matrix = q_matrix.squeeze() + if q_matrix.size > 1: + if q_matrix.shape[0] != num_ttests: + raise ValueError("r_matrix and q_matrix must have the same " + "number of rows") + + if use_t is None: + #switch to use_t false if undefined + use_t = (hasattr(self, 'use_t') and self.use_t) + + _t = _sd = None + + _effect = np.dot(r_matrix, self.params) + # nan_dot multiplies with the convention nan * 0 = 0 + + # Perform the test + if num_ttests > 1: + _sd = np.sqrt(np.diag(self.cov_params( + r_matrix=r_matrix, cov_p=cov_p))) + else: + _sd = np.sqrt(self.cov_params(r_matrix=r_matrix, cov_p=cov_p)) + _t = (_effect - q_matrix) * 
recipr(_sd) + + df_resid = getattr(self, 'df_resid_inference', self.df_resid) + + if use_t: + return ContrastResults(effect=_effect, t=_t, sd=_sd, + df_denom=df_resid) + else: + return ContrastResults(effect=_effect, statistic=_t, sd=_sd, + df_denom=df_resid, + distribution='norm') + + def f_test(self, r_matrix, cov_p=None, scale=1.0, invcov=None): + """ + Compute the F-test for a joint linear hypothesis. + This is a special case of `wald_test` that always uses the F + distribution. + Parameters + ---------- + r_matrix : array-like, str, or tuple + - array : An r x k array where r is the number of restrictions to + test and k is the number of regressors. It is assumed + that the linear combination is equal to zero. + - str : The full hypotheses to test can be given as a string. + See the examples. + - tuple : A tuple of arrays in the form (R, q), ``q`` can be + either a scalar or a length k row vector. + cov_p : array-like, optional + An alternative estimate for the parameter covariance matrix. + If None is given, self.normalized_cov_params is used. + scale : float, optional + Default is 1.0 for no scaling. + invcov : array-like, optional + A q x q array to specify an inverse covariance matrix based on a + restrictions matrix. + Returns + ------- + res : ContrastResults instance + The results for the test are attributes of this results instance. + Examples + -------- + >>> import numpy as np + >>> import statsmodels.api as sm + >>> data = sm.datasets.longley.load() + >>> data.exog = sm.add_constant(data.exog) + >>> results = sm.OLS(data.endog, data.exog).fit() + >>> A = np.identity(len(results.params)) + >>> A = A[1:,:] + This tests that each coefficient is jointly statistically + significantly different from zero. 
+ >>> print(results.f_test(A)) + + Compare this to + >>> results.fvalue + 330.2853392346658 + >>> results.f_pvalue + 4.98403096572e-10 + >>> B = np.array(([0,0,1,-1,0,0,0],[0,0,0,0,0,1,-1])) + This tests that the coefficient on the 2nd and 3rd regressors are + equal and jointly that the coefficient on the 5th and 6th regressors + are equal. + >>> print(results.f_test(B)) + + Alternatively, you can specify the hypothesis tests using a string + >>> from statsmodels.datasets import longley + >>> from statsmodels.formula.api import ols + >>> dta = longley.load_pandas().data + >>> formula = 'TOTEMP ~ GNPDEFL + GNP + UNEMP + ARMED + POP + YEAR' + >>> results = ols(formula, dta).fit() + >>> hypotheses = '(GNPDEFL = GNP), (UNEMP = 2), (YEAR/1829 = 1)' + >>> f_test = results.f_test(hypotheses) + >>> print(f_test) + See Also + -------- + statsmodels.stats.contrast.ContrastResults + wald_test + t_test + patsy.DesignInfo.linear_constraint + Notes + ----- + The matrix `r_matrix` is assumed to be non-singular. More precisely, + r_matrix (pX pX.T) r_matrix.T + is assumed invertible. Here, pX is the generalized inverse of the + design matrix of the model. There can be problems in non-OLS models + where the rank of the covariance of the noise is not full. + """ + res = self.wald_test(r_matrix, cov_p=cov_p, scale=scale, + invcov=invcov, use_f=True) + return res + + #TODO: untested for GLMs? + def wald_test(self, r_matrix, cov_p=None, scale=1.0, invcov=None, + use_f=None): + """ + Compute a Wald-test for a joint linear hypothesis. + Parameters + ---------- + r_matrix : array-like, str, or tuple + - array : An r x k array where r is the number of restrictions to + test and k is the number of regressors. It is assumed that the + linear combination is equal to zero. + - str : The full hypotheses to test can be given as a string. + See the examples. + - tuple : A tuple of arrays in the form (R, q), ``q`` can be + either a scalar or a length p row vector. 
+ cov_p : array-like, optional + An alternative estimate for the parameter covariance matrix. + If None is given, self.normalized_cov_params is used. + scale : float, optional + Default is 1.0 for no scaling. + invcov : array-like, optional + A q x q array to specify an inverse covariance matrix based on a + restrictions matrix. + use_f : bool + If True, then the F-distribution is used. If False, then the + asymptotic distribution, chisquare is used. If use_f is None, then + the F distribution is used if the model specifies that use_t is True. + The test statistic is proportionally adjusted for the distribution + by the number of constraints in the hypothesis. + Returns + ------- + res : ContrastResults instance + The results for the test are attributes of this results instance. + See also + -------- + statsmodels.stats.contrast.ContrastResults + f_test + t_test + patsy.DesignInfo.linear_constraint + Notes + ----- + The matrix `r_matrix` is assumed to be non-singular. More precisely, + r_matrix (pX pX.T) r_matrix.T + is assumed invertible. Here, pX is the generalized inverse of the + design matrix of the model. There can be problems in non-OLS models + where the rank of the covariance of the noise is not full. 
+ """ + if use_f is None: + #switch to use_t false if undefined + use_f = (hasattr(self, 'use_t') and self.use_t) + + from patsy import DesignInfo + names = self.model.data.param_names + LC = DesignInfo(names).linear_constraint(r_matrix) + r_matrix, q_matrix = LC.coefs, LC.constants + + if (self.normalized_cov_params is None and cov_p is None and + invcov is None and not hasattr(self, 'cov_params_default')): + raise ValueError('need covariance of parameters for computing ' + 'F statistics') + + cparams = np.dot(r_matrix, self.params[:, None]) + J = float(r_matrix.shape[0]) # number of restrictions + if q_matrix is None: + q_matrix = np.zeros(J) + else: + q_matrix = np.asarray(q_matrix) + if q_matrix.ndim == 1: + q_matrix = q_matrix[:, None] + if q_matrix.shape[0] != J: + raise ValueError("r_matrix and q_matrix must have the same " + "number of rows") + Rbq = cparams - q_matrix + if invcov is None: + cov_p = self.cov_params(r_matrix=r_matrix, cov_p=cov_p) + if np.isnan(cov_p).max(): + raise ValueError("r_matrix performs f_test for using " + "dimensions that are asymptotically " + "non-normal") + invcov = np.linalg.inv(cov_p) + + if (hasattr(self, 'mle_settings') and + self.mle_settings['optimizer'] in ['l1', 'l1_cvxopt_cp']): + F = nan_dot(nan_dot(Rbq.T, invcov), Rbq) + else: + F = np.dot(np.dot(Rbq.T, invcov), Rbq) + + df_resid = getattr(self, 'df_resid_inference', self.df_resid) + if use_f: + F /= J + return ContrastResults(F=F, df_denom=df_resid, + df_num=invcov.shape[0]) + else: + return ContrastResults(chi2=F, df_denom=J, statistic=F, + distribution='chi2', distargs=(J,)) + + + def wald_test_terms(self, skip_single=False, extra_constraints=None, + combine_terms=None): + """ + Compute a sequence of Wald tests for terms over multiple columns + This computes joined Wald tests for the hypothesis that all + coefficients corresponding to a `term` are zero. + `Terms` are defined by the underlying formula or by string matching. 
+ Parameters + ---------- + skip_single : boolean + If true, then terms that consist only of a single column and, + therefore, refers only to a single parameter is skipped. + If false, then all terms are included. + extra_constraints : ndarray + not tested yet + combine_terms : None or list of strings + Each string in this list is matched to the name of the terms or + the name of the exogenous variables. All columns whose name + includes that string are combined in one joint test. + Returns + ------- + test_result : result instance + The result instance contains `table` which is a pandas DataFrame + with the test results: test statistic, degrees of freedom and + pvalues. + Examples + -------- + >>> res_ols = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)", + data).fit() + >>> res_ols.wald_test_terms() + + F P>F df constraint df denom + Intercept 279.754525 2.37985521351e-22 1 51 + C(Duration, Sum) 5.367071 0.0245738436636 1 51 + C(Weight, Sum) 12.432445 3.99943118767e-05 2 51 + C(Duration, Sum):C(Weight, Sum) 0.176002 0.83912310946 2 51 + >>> res_poi = Poisson.from_formula("Days ~ C(Weight) * C(Duration)", + data).fit(cov_type='HC0') + >>> wt = res_poi.wald_test_terms(skip_single=False, + combine_terms=['Duration', 'Weight']) + >>> print(wt) + chi2 P>chi2 df constraint + Intercept 15.695625 7.43960374424e-05 1 + C(Weight) 16.132616 0.000313940174705 2 + C(Duration) 1.009147 0.315107378931 1 + C(Weight):C(Duration) 0.216694 0.897315972824 2 + Duration 11.187849 0.010752286833 3 + Weight 30.263368 4.32586407145e-06 4 + """ + # lazy import + from collections import defaultdict + + result = self + if extra_constraints is None: + extra_constraints = [] + if combine_terms is None: + combine_terms = [] + design_info = getattr(result.model.data.orig_exog, 'design_info', None) + + if design_info is None and extra_constraints is None: + raise ValueError('no constraints, nothing to do') + + + identity = np.eye(len(result.params)) + constraints = [] + combined = 
defaultdict(list) + if design_info is not None: + for term in design_info.terms: + cols = design_info.slice(term) + name = term.name() + constraint_matrix = identity[cols] + + # check if in combined + for cname in combine_terms: + if cname in name: + combined[cname].append(constraint_matrix) + + k_constraint = constraint_matrix.shape[0] + if skip_single: + if k_constraint == 1: + continue + + constraints.append((name, constraint_matrix)) + + combined_constraints = [] + for cname in combine_terms: + combined_constraints.append((cname, np.vstack(combined[cname]))) + else: + # check by exog/params names if there is no formula info + for col, name in enumerate(result.model.exog_names): + constraint_matrix = identity[col] + + # check if in combined + for cname in combine_terms: + if cname in name: + combined[cname].append(constraint_matrix) + + if skip_single: + continue + + constraints.append((name, constraint_matrix)) + + combined_constraints = [] + for cname in combine_terms: + combined_constraints.append((cname, np.vstack(combined[cname]))) + + use_t = result.use_t + distribution = ['chi2', 'F'][use_t] + + res_wald = [] + index = [] + for name, constraint in constraints + combined_constraints + extra_constraints: + wt = result.wald_test(constraint) + row = [wt.statistic.item(), wt.pvalue, constraint.shape[0]] + if use_t: + row.append(wt.df_denom) + res_wald.append(row) + index.append(name) + + # distribution nerutral names + col_names = ['statistic', 'pvalue', 'df_constraint'] + if use_t: + col_names.append('df_denom') + # TODO: maybe move DataFrame creation to results class + from pandas import DataFrame + table = DataFrame(res_wald, index=index, columns=col_names) + res = WaldTestResults(None, distribution, None, table=table) + # TODO: remove temp again, added for testing + res.temp = constraints + combined_constraints + extra_constraints + return res + + + def conf_int(self, alpha=.05, cols=None, method='default'): + """ + Returns the confidence interval of the 
fitted parameters. + Parameters + ---------- + alpha : float, optional + The significance level for the confidence interval. + ie., The default `alpha` = .05 returns a 95% confidence interval. + cols : array-like, optional + `cols` specifies which confidence intervals to return + method : string + Not Implemented Yet + Method to estimate the confidence_interval. + "Default" : uses self.bse which is based on inverse Hessian for MLE + "hjjh" : + "jac" : + "boot-bse" + "boot_quant" + "profile" + Returns + -------- + conf_int : array + Each row contains [lower, upper] limits of the confidence interval + for the corresponding parameter. The first column contains all + lower, the second column contains all upper limits. + Examples + -------- + >>> import statsmodels.api as sm + >>> data = sm.datasets.longley.load() + >>> data.exog = sm.add_constant(data.exog) + >>> results = sm.OLS(data.endog, data.exog).fit() + >>> results.conf_int() + array([[-5496529.48322745, -1467987.78596704], + [ -177.02903529, 207.15277984], + [ -0.1115811 , 0.03994274], + [ -3.12506664, -0.91539297], + [ -1.5179487 , -0.54850503], + [ -0.56251721, 0.460309 ], + [ 798.7875153 , 2859.51541392]]) + >>> results.conf_int(cols=(2,3)) + array([[-0.1115811 , 0.03994274], + [-3.12506664, -0.91539297]]) + Notes + ----- + The confidence interval is based on the standard normal distribution. + Models wish to use a different distribution should overwrite this + method. 
+ """ + bse = self.bse + + if self.use_t: + dist = stats.t + df_resid = getattr(self, 'df_resid_inference', self.df_resid) + q = dist.ppf(1 - alpha / 2, df_resid) + else: + dist = stats.norm + q = dist.ppf(1 - alpha / 2) + + if cols is None: + lower = self.params - q * bse + upper = self.params + q * bse + else: + cols = np.asarray(cols) + lower = self.params[cols] - q * bse[cols] + upper = self.params[cols] + q * bse[cols] + return np.asarray(lzip(lower, upper)) + + def save(self, fname, remove_data=False): + ''' + save a pickle of this instance + Parameters + ---------- + fname : string or filehandle + fname can be a string to a file path or filename, or a filehandle. + remove_data : bool + If False (default), then the instance is pickled without changes. + If True, then all arrays with length nobs are set to None before + pickling. See the remove_data method. + In some cases not all arrays will be set to None. + Notes + ----- + If remove_data is true and the model result does not implement a + remove_data method then this will raise an exception. + ''' + + from statsmodels.iolib.smpickle import save_pickle + + if remove_data: + self.remove_data() + + save_pickle(self, fname) + + @classmethod + def load(cls, fname): + ''' + load a pickle, (class method) + Parameters + ---------- + fname : string or filehandle + fname can be a string to a file path or filename, or a filehandle. + Returns + ------- + unpickled instance + ''' + + from statsmodels.iolib.smpickle import load_pickle + return load_pickle(fname) + + def remove_data(self): + '''remove data arrays, all nobs arrays from result and model + This reduces the size of the instance, so it can be pickled with less + memory. Currently tested for use with predict from an unpickled + results and model instance. + .. warning:: Since data and some intermediate results have been removed + calculating new statistics that require them will raise exceptions. 
+ The exception will occur the first time an attribute is accessed + that has been set to None. + Not fully tested for time series models, tsa, and might delete too much + for prediction or not all that would be possible. + The list of arrays to delete is maintained as an attribute of the + result and model instance, except for cached values. These lists could + be changed before calling remove_data. + ''' + def wipe(obj, att): + #get to last element in attribute path + p = att.split('.') + att_ = p.pop(-1) + try: + obj_ = reduce(getattr, [obj] + p) + + #print(repr(obj), repr(att)) + #print(hasattr(obj_, att_)) + if hasattr(obj_, att_): + #print('removing3', att_) + setattr(obj_, att_, None) + except AttributeError: + pass + + model_attr = ['model.' + i for i in self.model._data_attr] + for att in self._data_attr + model_attr: + #print('removing', att) + wipe(self, att) + + data_in_cache = getattr(self, 'data_in_cache', []) + data_in_cache += ['fittedvalues', 'resid', 'wresid'] + for key in data_in_cache: + try: + self._cache[key] = None + except (AttributeError, KeyError): + pass + +def lzip(*args, **kwargs): + return list(zip(*args, **kwargs)) diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/glm/family.py b/release/python/0.7.0/crankshaft/crankshaft/regression/glm/family.py new file mode 100644 index 0000000..bad22c1 --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/glm/family.py @@ -0,0 +1,1845 @@ +''' +The one parameter exponential family distributions used by GLM. +''' +# TODO: quasi, quasibinomial, quasipoisson +# see http://www.biostat.jhsph.edu/~qli/biostatistics_r_doc/library/stats/html/family.html +# for comparison to R, and McCullagh and Nelder + +import numpy as np +from scipy import special +import links as L +import varfuncs as V +FLOAT_EPS = np.finfo(float).eps + + +class Family(object): + """ + The parent class for one-parameter exponential families. 
+ + Parameters + ---------- + link : a link function instance + Link is the linear transformation function. + See the individual families for available links. + variance : a variance function + Measures the variance as a function of the mean probabilities. + See the individual families for the default variance function. + + See Also + -------- + :ref:`links` + + """ + # TODO: change these class attributes, use valid somewhere... + valid = [-np.inf, np.inf] + + links = [] + + def _setlink(self, link): + """ + Helper method to set the link for a family. + + Raises a ValueError exception if the link is not available. Note that + the error message might not be that informative because it tells you + that the link should be in the base class for the link function. + + See glm.GLM for a list of appropriate links for each family but note + that not all of these are currently available. + """ + # TODO: change the links class attribute in the families to hold + # meaningful information instead of a list of links instances such as + # [, + # , + # ] + # for Poisson... + self._link = link + if not isinstance(link, L.Link): + raise TypeError("The input should be a valid Link object.") + if hasattr(self, "links"): + validlink = link in self.links + validlink = max([isinstance(link, _) for _ in self.links]) + if not validlink: + errmsg = "Invalid link for family, should be in %s. (got %s)" + raise ValueError(errmsg % (repr(self.links), link)) + + def _getlink(self): + """ + Helper method to get the link for a family. + """ + return self._link + + # link property for each family is a pointer to link instance + link = property(_getlink, _setlink, doc="Link function for family") + + def __init__(self, link, variance): + self.link = link() + self.variance = variance + + def starting_mu(self, y): + r""" + Starting value for mu in the IRLS algorithm. + + Parameters + ---------- + y : array + The untransformed response variable. 
+ + Returns + ------- + mu_0 : array + The first guess on the transformed response variable. + + Notes + ----- + .. math:: + + \mu_0 = (Y + \overline{Y})/2 + + Notes + ----- + Only the Binomial family takes a different initial value. + """ + return (y + y.mean())/2. + + def weights(self, mu): + r""" + Weights for IRLS steps + + Parameters + ---------- + mu : array-like + The transformed mean response variable in the exponential family + + Returns + ------- + w : array + The weights for the IRLS steps + + Notes + ----- + .. math:: + + w = 1 / (g'(\mu)^2 * Var(\mu)) + """ + return 1. / (self.link.deriv(mu)**2 * self.variance(mu)) + + def deviance(self, endog, mu, freq_weights=1., scale=1.): + r""" + The deviance function evaluated at (endog,mu,freq_weights,mu). + + Deviance is usually defined as twice the loglikelihood ratio. + + Parameters + ---------- + endog : array-like + The endogenous response variable + mu : array-like + The inverse of the link function at the linear predicted values. + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + An optional scale argument. The default is 1. + + Returns + ------- + Deviance : array + The value of deviance function defined below. + + Notes + ----- + Deviance is defined + + .. math:: + + D = \sum_i (2 * freq\_weights_i * llf(Y_i, Y_i) - 2 * + llf(Y_i, \mu_i)) / scale + + where y is the endogenous variable. The deviance functions are + analytically defined for each family. + """ + raise NotImplementedError + + def resid_dev(self, endog, mu, freq_weights=1., scale=1.): + """ + The deviance residuals + + Parameters + ---------- + endog : array + The endogenous response variable + mu : array + The inverse of the link function at the linear predicted values. + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + An optional argument to divide the residuals by scale. The default + is 1. 
+ + Returns + ------- + Deviance residuals. + + Notes + ----- + The deviance residuals are defined for each family. + """ + raise NotImplementedError + + def fitted(self, lin_pred): + """ + Fitted values based on linear predictors lin_pred. + + Parameters + ----------- + lin_pred : array + Values of the linear predictor of the model. + dot(X,beta) in a classical linear model. + + Returns + -------- + mu : array + The mean response variables given by the inverse of the link + function. + """ + fits = self.link.inverse(lin_pred) + return fits + + def predict(self, mu): + """ + Linear predictors based on given mu values. + + Parameters + ---------- + mu : array + The mean response variables + + Returns + ------- + lin_pred : array + Linear predictors based on the mean response variables. The value + of the link function at the given mu. + """ + return self.link(mu) + + def loglike(self, endog, mu, freq_weights=1., scale=1.): + """ + The log-likelihood function in terms of the fitted mean response. + + Parameters + ---------- + `endog` : array + Usually the endogenous response variable. + `mu` : array + Usually but not always the fitted mean response variable. + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float + The scale parameter. The default is 1. + + Returns + ------- + llf : float + The value of the loglikelihood evaluated at + (endog,mu,freq_weights,scale) as defined below. + Notes + ----- + This is defined for each family. endog and mu are not restricted to + `endog` and `mu` respectively. For instance, the deviance function + calls both loglike(endog,endog) and loglike(endog,mu) to get the + likelihood ratio. + """ + raise NotImplementedError + + def resid_anscombe(self, endog, mu): + """ + The Anscome residuals. + + See also + -------- + statsmodels.families.family.Family docstring and the `resid_anscombe` + for the individual families for more information. 
+ """ + raise NotImplementedError + + +class Poisson(Family): + """ + Poisson exponential family. + + Parameters + ---------- + link : a link instance, optional + The default link for the Poisson family is the log link. Available + links are log, identity, and sqrt. See statsmodels.family.links for + more information. + + Attributes + ---------- + Poisson.link : a link instance + The link function of the Poisson instance. + Poisson.variance : varfuncs instance + `variance` is an instance of + statsmodels.genmod.families.family.varfuncs.mu + + See also + -------- + statsmodels.genmod.families.family.Family + :ref:`links` + + """ + + links = [L.log, L.identity, L.sqrt] + variance = V.mu + valid = [0, np.inf] + safe_links = [L.Log, ] + + def __init__(self, link=L.log): + self.variance = Poisson.variance + self.link = link() + + def _clean(self, x): + """ + Helper function to trim the data so that is in (0,inf) + + Notes + ----- + The need for this function was discovered through usage and its + possible that other families might need a check for validity of the + domain. + """ + return np.clip(x, FLOAT_EPS, np.inf) + + def resid_dev(self, endog, mu, scale=1.): + r"""Poisson deviance residual + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + scale : float, optional + An optional argument to divide the residuals by scale. The default + is 1. + + Returns + ------- + resid_dev : array + Deviance residuals as defined below + + Notes + ----- + .. 
math:: + + resid\_dev_i = sign(Y_i - \mu_i) * \sqrt{2 * + (Y_i * \log(Y_i / \mu_i) - (Y_i - \mu_i))} / scale + """ + endog_mu = self._clean(endog / mu) + return (np.sign(endog - mu) * + np.sqrt(2 * (endog * np.log(endog_mu) - (endog - mu))) / scale) + + def deviance(self, endog, mu, freq_weights=1., scale=1.): + r''' + Poisson deviance function + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + An optional scale argument. The default is 1. + + Returns + ------- + deviance : float + The deviance function at (endog,mu,freq_weights,scale) as defined + below. + + Notes + ----- + If a constant term is included it is defined as + + .. math:: + + D = 2 * \sum_i (freq\_weights_i * Y_i * \log(Y_i / \mu_i))/ scale + ''' + endog_mu = self._clean(endog / mu) + return 2 * np.sum(endog * freq_weights * np.log(endog_mu)) / scale + + def loglike(self, endog, mu, freq_weights=1., scale=1.): + r""" + The log-likelihood function in terms of the fitted mean response. + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + The scale parameter, defaults to 1. + + Returns + ------- + llf : float + The value of the loglikelihood function evaluated at + (endog,mu,freq_weights,scale) as defined below. + + Notes + ----- + .. 
math:: + + llf = scale * \sum_i freq\_weights_i * (Y_i * \log(\mu_i) - \mu_i - + \ln \Gamma(Y_i + 1)) + """ + loglike = np.sum(freq_weights * (endog * np.log(mu) - mu - + special.gammaln(endog + 1))) + return scale * loglike + + def resid_anscombe(self, endog, mu): + r""" + Anscombe residuals for the Poisson exponential family distribution + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + + Returns + ------- + resid_anscombe : array + The Anscome residuals for the Poisson family defined below + + Notes + ----- + .. math:: + + resid\_anscombe_i = (3/2) * (Y_i^{2/3} - \mu_i^{2/3}) / \mu_i^{1/6} + """ + return (3 / 2.) * (endog**(2/3.) - mu**(2 / 3.)) / mu**(1 / 6.) + +class QuasiPoisson(Family): + """ + QuasiPoisson exponential family. + + Parameters + ---------- + link : a link instance, optional + The default link for the Poisson family is the log link. Available + links are log, identity, and sqrt. See statsmodels.family.links for + more information. + + Attributes + ---------- + Poisson.link : a link instance + The link function of the Poisson instance. + Poisson.variance : varfuncs instance + `variance` is an instance of + statsmodels.genmod.families.family.varfuncs.mu + + See also + -------- + statsmodels.genmod.families.family.Family + :ref:`links` + + """ + + links = [L.log, L.identity, L.sqrt] + variance = V.mu + valid = [0, np.inf] + safe_links = [L.Log, ] + + def __init__(self, link=L.log): + self.variance = Poisson.variance + self.link = link() + + def _clean(self, x): + """ + Helper function to trim the data so that is in (0,inf) + + Notes + ----- + The need for this function was discovered through usage and its + possible that other families might need a check for validity of the + domain. 
+ """ + return np.clip(x, FLOAT_EPS, np.inf) + + def resid_dev(self, endog, mu, scale=1.): + r"""Poisson deviance residual + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + scale : float, optional + An optional argument to divide the residuals by scale. The default + is 1. + + Returns + ------- + resid_dev : array + Deviance residuals as defined below + + Notes + ----- + .. math:: + + resid\_dev_i = sign(Y_i - \mu_i) * \sqrt{2 * + (Y_i * \log(Y_i / \mu_i) - (Y_i - \mu_i))} / scale + """ + endog_mu = self._clean(endog / mu) + return (np.sign(endog - mu) * + np.sqrt(2 * (endog * np.log(endog_mu) - (endog - mu))) / scale) + + def deviance(self, endog, mu, freq_weights=1., scale=1.): + r''' + Poisson deviance function + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + An optional scale argument. The default is 1. + + Returns + ------- + deviance : float + The deviance function at (endog,mu,freq_weights,scale) as defined + below. + + Notes + ----- + If a constant term is included it is defined as + + .. math:: + + D = 2 * \sum_i (freq\_weights_i * Y_i * \log(Y_i / \mu_i))/ scale + ''' + endog_mu = self._clean(endog / mu) + return 2 * np.sum(endog * freq_weights * np.log(endog_mu)) / scale + + def loglike(self, endog, mu, freq_weights=1., scale=1.): + r""" + The log-likelihood function in terms of the fitted mean response. 
+ + Returns NaN for QuasiPoisson + + Returns + ------- + None: not applicable for QuasiPoisson + """ + return np.nan + + def resid_anscombe(self, endog, mu): + r""" + Anscombe residuals for the Poisson exponential family distribution + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + + Returns + ------- + resid_anscombe : array + The Anscome residuals for the Poisson family defined below + + Notes + ----- + .. math:: + + resid\_anscombe_i = (3/2) * (Y_i^{2/3} - \mu_i^{2/3}) / \mu_i^{1/6} + """ + return (3 / 2.) * (endog**(2/3.) - mu**(2 / 3.)) / mu**(1 / 6.) + +class Gaussian(Family): + """ + Gaussian exponential family distribution. + + Parameters + ---------- + link : a link instance, optional + The default link for the Gaussian family is the identity link. + Available links are log, identity, and inverse. + See statsmodels.family.links for more information. + + Attributes + ---------- + Gaussian.link : a link instance + The link function of the Gaussian instance + Gaussian.variance : varfunc instance + `variance` is an instance of statsmodels.family.varfuncs.constant + + See also + -------- + statsmodels.genmod.families.family.Family + :ref:`links` + + """ + + links = [L.log, L.identity, L.inverse_power] + variance = V.constant + safe_links = links + + def __init__(self, link=L.identity): + self.variance = Gaussian.variance + self.link = link() + + def resid_dev(self, endog, mu, scale=1.): + r""" + Gaussian deviance residuals + + Parameters + ----------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + scale : float, optional + An optional argument to divide the residuals by scale. The default + is 1. + + Returns + ------- + resid_dev : array + Deviance residuals as defined below + + Notes + -------- + .. 
math:: + + resid\_dev_i = (Y_i - \mu_i) / \sqrt{Var(\mu_i)} / scale + """ + + return (endog - mu) / np.sqrt(self.variance(mu)) / scale + + def deviance(self, endog, mu, freq_weights=1., scale=1.): + r""" + Gaussian deviance function + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + An optional scale argument. The default is 1. + + Returns + ------- + deviance : float + The deviance function at (endog,mu,freq_weights,scale) + as defined below. + + Notes + -------- + .. math:: + + D = \sum_i freq\_weights_i * (Y_i - \mu_i)^2 / scale + """ + return np.sum((freq_weights * (endog - mu)**2)) / scale + + def loglike(self, endog, mu, freq_weights=1., scale=1.): + r""" + The log-likelihood in terms of the fitted mean response. + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + Scales the loglikelihood function. The default is 1. + + Returns + ------- + llf : float + The value of the loglikelihood function evaluated at + (endog,mu,freq_weights,scale) as defined below. + + Notes + ----- + If the link is the identity link function then the + loglikelihood function is the same as the classical OLS model. + + .. math:: + + llf = -nobs / 2 * (\log(SSR) + (1 + \log(2 \pi / nobs))) + + where + + .. math:: + SSR = \sum_i (Y_i - g^{-1}(\mu_i))^2 + + If the links is not the identity link then the loglikelihood + function is defined as + + .. 
math:: + + llf = \sum_i freq\_weights_i * ((Y_i * \mu_i - \mu_i^2 / 2) / scale- + Y^2 / (2 * scale) - (1/2) * \log(2 * \pi * scale)) + """ + if isinstance(self.link, L.Power) and self.link.power == 1: + # This is just the loglikelihood for classical OLS + nobs2 = endog.shape[0] / 2. + SSR = np.sum((endog-self.fitted(mu))**2, axis=0) + llf = -np.log(SSR) * nobs2 + llf -= (1+np.log(np.pi/nobs2))*nobs2 + return llf + else: + return np.sum(freq_weights * ((endog * mu - mu**2/2)/scale - + endog**2/(2 * scale) - .5*np.log(2 * np.pi * scale))) + + def resid_anscombe(self, endog, mu): + r""" + The Anscombe residuals for the Gaussian exponential family distribution + + Parameters + ---------- + endog : array + Endogenous response variable + mu : array + Fitted mean response variable + + Returns + ------- + resid_anscombe : array + The Anscombe residuals for the Gaussian family defined below + + Notes + -------- + .. math:: + + resid\_anscombe_i = Y_i - \mu_i + """ + return endog - mu + + +class Gamma(Family): + """ + Gamma exponential family distribution. + + Parameters + ---------- + link : a link instance, optional + The default link for the Gamma family is the inverse link. + Available links are log, identity, and inverse. + See statsmodels.family.links for more information. 
+ + Attributes + ---------- + Gamma.link : a link instance + The link function of the Gamma instance + Gamma.variance : varfunc instance + `variance` is an instance of statsmodels.family.varfuncs.mu_squared + + See also + -------- + statsmodels.genmod.families.family.Family + :ref:`links` + + """ + + links = [L.log, L.identity, L.inverse_power] + variance = V.mu_squared + safe_links = [L.Log, ] + + def __init__(self, link=L.inverse_power): + self.variance = Gamma.variance + self.link = link() + + def _clean(self, x): + """ + Helper function to trim the data so that is in (0,inf) + + Notes + ----- + The need for this function was discovered through usage and its + possible that other families might need a check for validity of the + domain. + """ + return np.clip(x, FLOAT_EPS, np.inf) + + def deviance(self, endog, mu, freq_weights=1., scale=1.): + r""" + Gamma deviance function + + Parameters + ----------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + An optional scale argument. The default is 1. + + Returns + ------- + deviance : float + Deviance function as defined below + + Notes + ----- + .. math:: + + D = 2 * \sum_i freq\_weights_i * ((Y_i - \mu_i)/\mu_i - \log(Y_i / + \mu_i)) + """ + endog_mu = self._clean(endog/mu) + return 2*np.sum(freq_weights*((endog-mu)/mu-np.log(endog_mu))) + + def resid_dev(self, endog, mu, scale=1.): + r""" + Gamma deviance residuals + + Parameters + ----------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + scale : float, optional + An optional argument to divide the residuals by scale. The default + is 1. + + Returns + ------- + resid_dev : array + Deviance residuals as defined below + + Notes + ----- + .. 
math:: + + resid\_dev_i = sign(Y_i - \mu_i) \sqrt{-2 * + (-(Y_i - \mu_i) / \mu_i + \log(Y_i / \mu_i))} + """ + endog_mu = self._clean(endog / mu) + return np.sign(endog - mu) * np.sqrt(-2 * (-(endog - mu)/mu + + np.log(endog_mu))) + + def loglike(self, endog, mu, freq_weights=1., scale=1.): + r""" + The log-likelihood function in terms of the fitted mean response. + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + The default is 1. + + Returns + ------- + llf : float + The value of the loglikelihood function evaluated at + (endog,mu,freq_weights,scale) as defined below. + + Notes + -------- + .. math:: + + llf = -1 / scale * \sum_i *(Y_i / \mu_i+ \log(\mu_i)+ + (scale -1) * \log(Y) + \log(scale) + scale * + \ln \Gamma(1 / scale)) + """ + return - 1./scale * np.sum((endog/mu + np.log(mu) + (scale - 1) * + np.log(endog) + np.log(scale) + scale * + special.gammaln(1./scale)) * freq_weights) + + # in Stata scale is set to equal 1 for reporting llf + # in R it's the dispersion, though there is a loss of precision vs. + # our results due to an assumed difference in implementation + + def resid_anscombe(self, endog, mu): + r""" + The Anscombe residuals for Gamma exponential family distribution + + Parameters + ---------- + endog : array + Endogenous response variable + mu : array + Fitted mean response variable + + Returns + ------- + resid_anscombe : array + The Anscombe residuals for the Gamma family defined below + + Notes + ----- + .. math:: + + resid\_anscombe_i = 3 * (Y_i^{1/3} - \mu_i^{1/3}) / \mu_i^{1/3} + """ + return 3 * (endog**(1/3.) - mu**(1/3.)) / mu**(1/3.) + + +class Binomial(Family): + """ + Binomial exponential family distribution. + + Parameters + ---------- + link : a link instance, optional + The default link for the Binomial family is the logit link. 
+ Available links are logit, probit, cauchy, log, and cloglog. + See statsmodels.family.links for more information. + + Attributes + ---------- + Binomial.link : a link instance + The link function of the Binomial instance + Binomial.variance : varfunc instance + `variance` is an instance of statsmodels.family.varfuncs.binary + + See also + -------- + statsmodels.genmod.families.family.Family + :ref:`links` + + Notes + ----- + endog for Binomial can be specified in one of three ways. + + """ + + links = [L.logit, L.probit, L.cauchy, L.log, L.cloglog, L.identity] + variance = V.binary # this is not used below in an effort to include n + + # Other safe links, e.g. cloglog and probit are subclasses + safe_links = [L.Logit, L.CDFLink] + + def __init__(self, link=L.logit): # , n=1.): + # TODO: it *should* work for a constant n>1 actually, if freq_weights + # is equal to n + self.n = 1 + # overwritten by initialize if needed but always used to initialize + # variance since endog is assumed/forced to be (0,1) + self.variance = V.Binomial(n=self.n) + self.link = link() + + def starting_mu(self, y): + """ + The starting values for the IRLS algorithm for the Binomial family. + A good choice for the binomial family is :math:`\mu_0 = (Y_i + 0.5)/2` + """ + return (y + .5)/2 + + def initialize(self, endog, freq_weights): + ''' + Initialize the response variable. + + Parameters + ---------- + endog : array + Endogenous response variable + + Returns + -------- + If `endog` is binary, returns `endog` + + If `endog` is a 2d array, then the input is assumed to be in the format + (successes, failures) and + successes/(success + failures) is returned. And n is set to + successes + failures. 
+ ''' + # if not np.all(np.asarray(freq_weights) == 1): + # self.variance = V.Binomial(n=freq_weights) + if (endog.ndim > 1 and endog.shape[1] > 1): + y = endog[:, 0] + # overwrite self.freq_weights for deviance below + self.n = endog.sum(1) + return y*1./self.n, self.n + else: + return endog, np.ones(endog.shape[0]) + + def deviance(self, endog, mu, freq_weights=1, scale=1., axis=None): + r''' + Deviance function for either Bernoulli or Binomial data. + + Parameters + ---------- + endog : array-like + Endogenous response variable (already transformed to a probability + if appropriate). + mu : array + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + An optional scale argument. The default is 1. + + Returns + -------- + deviance : float + The deviance function as defined below + + Notes + ----- + If the endogenous variable is binary: + + .. math:: + + D = -2 * \sum_i freq\_weights * (I_{1,i} * \log(\mu_i) + I_{0,i} * + \log(1 - \mu_i)) + + where :math:`I_{1,i}` is an indicator function that evalueates to 1 if + :math:`Y_i = 1`. and :math:`I_{0,i}` is an indicator function that + evaluates to 1 if :math:`Y_i = 0`. + + If the model is ninomial: + + .. math:: + + D = 2 * \sum_i freq\_weights * (\log(Y_i / \mu_i) + (n_i - Y_i) * + \log((n_i - Y_i) / n_i - \mu_i)) + + where :math:`Y_i` and :math:`n` are as defined in Binomial.initialize. 
+ ''' + if np.shape(self.n) == () and self.n == 1: + one = np.equal(endog, 1) + return -2 * np.sum((one * np.log(mu + 1e-200) + (1-one) * + np.log(1 - mu + 1e-200)) * freq_weights, axis=axis) + + else: + return 2 * np.sum(self.n * freq_weights * + (endog * np.log(endog/mu + 1e-200) + + (1 - endog) * np.log((1 - endog) / + (1 - mu) + 1e-200)), axis=axis) + + def resid_dev(self, endog, mu, scale=1.): + r""" + Binomial deviance residuals + + Parameters + ----------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + scale : float, optional + An optional argument to divide the residuals by scale. The default + is 1. + + Returns + ------- + resid_dev : array + Deviance residuals as defined below + + Notes + ----- + If the endogenous variable is binary: + + .. math:: + + resid\_dev_i = sign(Y_i - \mu_i) * \sqrt{-2 * + \log(I_{1,i} * \mu_i + I_{0,i} * (1 - \mu_i))} + + where :math:`I_{1,i}` is an indicator function that evalueates to 1 if + :math:`Y_i = 1`. and :math:`I_{0,i}` is an indicator function that + evaluates to 1 if :math:`Y_i = 0`. + + If the endogenous variable is binomial: + + .. math:: + + resid\_dev_i = sign(Y_i - \mu_i) \sqrt{2 * n_i * + (Y_i * \log(Y_i / \mu_i) + (1 - Y_i) * + \log(1 - Y_i)/(1 - \mu_i))} + + where :math:`Y_i` and :math:`n` are as defined in Binomial.initialize. + """ + + mu = self.link._clean(mu) + if np.shape(self.n) == () and self.n == 1: + one = np.equal(endog, 1) + return np.sign(endog-mu)*np.sqrt(-2 * + np.log(one * mu + (1 - one) * + (1 - mu)))/scale + else: + return (np.sign(endog - mu) * + np.sqrt(2 * self.n * + (endog * np.log(endog/mu + 1e-200) + + (1 - endog) * np.log((1 - endog)/(1 - mu) + 1e-200)))/scale) + + def loglike(self, endog, mu, freq_weights=1, scale=1.): + r""" + The log-likelihood function in terms of the fitted mean response. 
+ + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + Not used for the Binomial GLM. + + Returns + ------- + llf : float + The value of the loglikelihood function evaluated at + (endog,mu,freq_weights,scale) as defined below. + + Notes + -------- + If the endogenous variable is binary: + + .. math:: + + llf = scale * \sum_i (y_i * \log(\mu_i/(1-\mu_i)) + \log(1-\mu_i)) * + freq\_weights_i + + If the endogenous variable is binomial: + + .. math:: + + llf = scale * \sum_i freq\_weights_i * (\ln \Gamma(n+1) - + \ln \Gamma(y_i + 1) - \ln \Gamma(n_i - y_i +1) + y_i * + \log(\mu_i / (1 - \mu_i)) + n * \log(1 - \mu_i)) + + where :math:`y_i = Y_i * n_i` with :math:`Y_i` and :math:`n_i` as + defined in Binomial initialize. This simply makes :math:`y_i` the + original number of successes. + """ + + if np.shape(self.n) == () and self.n == 1: + return scale * np.sum((endog * np.log(mu/(1 - mu) + 1e-200) + + np.log(1 - mu)) * freq_weights) + else: + y = endog * self.n # convert back to successes + return scale * np.sum((special.gammaln(self.n + 1) - + special.gammaln(y + 1) - + special.gammaln(self.n - y + 1) + y * + np.log(mu/(1 - mu)) + self.n * + np.log(1 - mu)) * freq_weights) + + def resid_anscombe(self, endog, mu): + ''' + The Anscombe residuals + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + + Returns + ------- + resid_anscombe : array + The Anscombe residuals as defined below. + + Notes + ----- + sqrt(n)*(cox_snell(endog)-cox_snell(mu))/(mu**(1/6.)*(1-mu)**(1/6.)) + + where cox_snell is defined as + cox_snell(x) = betainc(2/3., 2/3., x)*betainc(2/3.,2/3.) 
+ where betainc is the incomplete beta function + + The name 'cox_snell' is idiosyncratic and is simply used for + convenience following the approach suggested in Cox and Snell (1968). + Further note that + cox_snell(x) = x**(2/3.)/(2/3.)*hyp2f1(2/3.,1/3.,5/3.,x) + where hyp2f1 is the hypergeometric 2f1 function. The Anscombe + residuals are sometimes defined in the literature using the + hyp2f1 formulation. Both betainc and hyp2f1 can be found in scipy. + + References + ---------- + Anscombe, FJ. (1953) "Contribution to the discussion of H. Hotelling's + paper." Journal of the Royal Statistical Society B. 15, 229-30. + + Cox, DR and Snell, EJ. (1968) "A General Definition of Residuals." + Journal of the Royal Statistical Society B. 30, 248-75. + + ''' + cox_snell = lambda x: (special.betainc(2/3., 2/3., x) + * special.beta(2/3., 2/3.)) + return np.sqrt(self.n) * ((cox_snell(endog) - cox_snell(mu)) / + (mu**(1/6.) * (1 - mu)**(1/6.))) + + +class InverseGaussian(Family): + """ + InverseGaussian exponential family. + + Parameters + ---------- + link : a link instance, optional + The default link for the inverse Gaussian family is the + inverse squared link. + Available links are inverse_squared, inverse, log, and identity. + See statsmodels.family.links for more information. + + Attributes + ---------- + InverseGaussian.link : a link instance + The link function of the inverse Gaussian instance + InverseGaussian.variance : varfunc instance + `variance` is an instance of statsmodels.family.varfuncs.mu_cubed + + See also + -------- + statsmodels.genmod.families.family.Family + :ref:`links` + + Notes + ----- + The inverse Guassian distribution is sometimes referred to in the + literature as the Wald distribution. 
+ + """ + + links = [L.inverse_squared, L.inverse_power, L.identity, L.log] + variance = V.mu_cubed + safe_links = [L.inverse_squared, L.Log, ] + + def __init__(self, link=L.inverse_squared): + self.variance = InverseGaussian.variance + self.link = link() + + def resid_dev(self, endog, mu, scale=1.): + r""" + Returns the deviance residuals for the inverse Gaussian family. + + Parameters + ----------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + An optional argument to divide the residuals by scale. The default + is 1. + + Returns + ------- + resid_dev : array + Deviance residuals as defined below + + Notes + ----- + .. math:: + + resid\_dev_i = sign(Y_i - \mu_i) * + \sqrt {(Y_i - \mu_i)^2 / (Y_i * \mu_i^2)} / scale + """ + return np.sign(endog-mu) * np.sqrt((endog-mu)**2/(endog*mu**2))/scale + + def deviance(self, endog, mu, freq_weights=1., scale=1.): + r""" + Inverse Gaussian deviance function + + Parameters + ----------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + An optional scale argument. The default is 1. + + Returns + ------- + deviance : float + Deviance function as defined below + + Notes + ----- + .. math:: + + D = \sum_i freq\_weights_i * ((Y_i - \mu_i)^2 / (Y_i *\mu_i^2)) / + scale + """ + return np.sum(freq_weights*(endog-mu)**2/(endog*mu**2))/scale + + def loglike(self, endog, mu, freq_weights=1., scale=1.): + r""" + The log-likelihood function in terms of the fitted mean response. + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. 
class NegativeBinomial(Family):
    """
    Negative Binomial exponential family.

    Parameters
    ----------
    link : a link class, optional
        The default link for the negative binomial family is the log link.
        Available links are log, cloglog, identity, nbinom and Power.
        The class is instantiated by the constructor; a subclass of
        ``L.NegativeBinomial`` (such as ``L.nbinom``) is instantiated with
        this family's `alpha`.
    alpha : float, optional
        The ancillary parameter for the negative binomial distribution.
        For now `alpha` is assumed to be nonstochastic.  The default value
        is 1.  Permissible values are usually assumed to be between .01
        and 2.

    Attributes
    ----------
    NegativeBinomial.link : a link instance
        The link function of the negative binomial instance
    NegativeBinomial.variance : varfunc instance
        `variance` is an instance of ``V.NegativeBinomial`` built with
        this family's `alpha`.

    Notes
    -----
    Power link functions are not yet supported.
    """
    links = [L.log, L.cloglog, L.identity, L.nbinom, L.Power]
    # TODO: add the ability to use the power links with an if test
    # similar to below
    variance = V.nbinom
    safe_links = [L.Log, ]

    def __init__(self, link=L.log, alpha=1.):
        self.alpha = 1. * alpha  # make it at least float
        self.variance = V.NegativeBinomial(alpha=self.alpha)
        # `link` is a link *class* (it is called below to build the
        # instance), so an isinstance() test against L.NegativeBinomial
        # can never match.  issubclass() is what lets the nbinom link
        # receive this family's alpha instead of its own default.
        if isinstance(link, type) and issubclass(link, L.NegativeBinomial):
            self.link = link(alpha=self.alpha)
        else:
            self.link = link()

    def _clean(self, x):
        """
        Helper function to trim the data so that it is in (0, inf).

        Notes
        -----
        The need for this function was discovered through usage and it is
        possible that other families might need a check for validity of
        the domain.
        """
        return np.clip(x, FLOAT_EPS, np.inf)

    def _piecewise_deviance(self, endog, mu):
        """Return the per-observation deviance terms shared by
        `deviance` and `resid_dev`."""
        iszero = np.equal(endog, 0)
        notzero = 1 - iszero
        # Clip the ratio so log() never sees 0 when endog == 0.
        endog_mu = self._clean(endog / mu)
        tmp = iszero * 2 * np.log(1 + self.alpha * mu) / self.alpha
        tmp += notzero * (2 * endog * np.log(endog_mu) - 2 / self.alpha *
                          (1 + self.alpha * endog) *
                          np.log((1 + self.alpha * endog) /
                                 (1 + self.alpha * mu)))
        return tmp

    def deviance(self, endog, mu, freq_weights=1., scale=1.):
        r"""
        Returns the value of the deviance function.

        Parameters
        -----------
        endog : array-like
            Endogenous response variable
        mu : array-like
            Fitted mean response variable
        freq_weights : array-like
            1d array of frequency weights. The default is 1.
        scale : float, optional
            An optional scale argument. The default is 1.

        Returns
        -------
        deviance : float
            Deviance function as defined below

        Notes
        -----
        :math:`D = \sum_i freq\_weights_i * piecewise_i / scale` where
        :math:`piecewise_i` is defined as:

        If :math:`Y_{i} = 0`:

        :math:`piecewise_i = 2 * \log(1 + \alpha * \mu_i) / \alpha`

        If :math:`Y_{i} > 0`:

        :math:`piecewise_i = 2 * Y_i * \log(Y_i / \mu_i) - (2 / \alpha) *
        (1 + \alpha * Y_i) * \log((1 + \alpha * Y_i) / (1 + \alpha * \mu_i))`
        """
        tmp = self._piecewise_deviance(endog, mu)
        return np.sum(freq_weights * tmp) / scale

    def resid_dev(self, endog, mu, scale=1.):
        r"""
        Negative Binomial Deviance Residual

        Parameters
        ----------
        endog : array-like
            `endog` is the response variable
        mu : array-like
            `mu` is the fitted value of the model
        scale : float, optional
            An optional argument to divide the residuals by scale. The
            default is 1.

        Returns
        --------
        resid_dev : array
            The array of deviance residuals

        Notes
        -----
        :math:`resid\_dev_i = sign(Y_i-\mu_i) * \sqrt{piecewise_i} / scale`

        where :math:`piecewise_i` is defined as

        If :math:`Y_i = 0`:

        :math:`piecewise_i = 2 * \log(1 + \alpha * \mu_i)/ \alpha`

        If :math:`Y_i > 0`:

        :math:`piecewise_i = 2 * Y_i * \log(Y_i / \mu_i) - (2 / \alpha) *
        (1 + \alpha * Y_i) * \log((1 + \alpha * Y_i) / (1 + \alpha * \mu_i))`
        """
        tmp = self._piecewise_deviance(endog, mu)
        return np.sign(endog - mu) * np.sqrt(tmp) / scale

    def loglike(self, endog, mu, freq_weights=1., scale=1.):
        r"""
        The log-likelihood function in terms of the fitted mean response.

        Parameters
        ----------
        endog : array-like
            Endogenous response variable
        mu : array-like
            The fitted mean response values
        freq_weights : array-like
            1d array of frequency weights. The default is 1.
        scale : float
            Accepted for a uniform family interface; it is not used in
            the computation below.

        Returns
        -------
        llf : float
            The value of the loglikelihood function evaluated at
            (endog,mu,freq_weights) as defined below.

        Notes
        -----
        Defined as:

        .. math::

           llf = \sum_i freq\_weights_i * (Y_i * \log{(\alpha * e^{\eta_i} /
                 (1 + \alpha * e^{\eta_i}))} - \log{(1 + \alpha * e^{\eta_i})}/
                 \alpha + Constant)

        where :math:`Constant` is defined as:

        .. math::

           Constant = \ln \Gamma{(Y_i + 1/ \alpha )} - \ln \Gamma(Y_i + 1) -
                      \ln \Gamma{(1/ \alpha )}
        """
        lin_pred = self._link(mu)
        constant = (special.gammaln(endog + 1 / self.alpha) -
                    special.gammaln(endog + 1) -
                    special.gammaln(1 / self.alpha))
        exp_lin_pred = np.exp(lin_pred)
        return np.sum((endog * np.log(self.alpha * exp_lin_pred /
                                      (1 + self.alpha * exp_lin_pred)) -
                       np.log(1 + self.alpha * exp_lin_pred) /
                       self.alpha + constant) * freq_weights)

    def resid_anscombe(self, endog, mu):
        """
        The Anscombe residuals for the negative binomial family

        Parameters
        ----------
        endog : array-like
            Endogenous response variable
        mu : array-like
            Fitted mean response variable

        Returns
        -------
        resid_anscombe : array
            The Anscombe residuals as defined below.

        Notes
        -----
        `resid_anscombe` = (hyp2f1(-alpha*endog)-hyp2f1(-alpha*mu)+\
                1.5*(endog**(2/3.)-mu**(2/3.)))/(mu+alpha*mu**2)**(1/6.)

        where hyp2f1 is the hypergeometric 2f1 function parameterized as
        hyp2f1(x) = hyp2f1(2/3.,1/3.,5/3.,x)
        """
        hyp2f1 = lambda x: special.hyp2f1(2 / 3., 1 / 3., 5 / 3., x)
        return ((hyp2f1(-self.alpha * endog) - hyp2f1(-self.alpha * mu) +
                 1.5 * (endog**(2 / 3.) - mu**(2 / 3.))) /
                (mu + self.alpha * mu**2)**(1 / 6.))
+ Tweedie.var_power : float + The power of the variance function. + + See also + -------- + statsmodels.genmod.families.family.Family + :ref:`links` + + Notes + ----- + Logliklihood function not implemented because of the complexity of + calculating an infinite series of summations. The variance power can be + estimated using the `estimate_tweedie_power` function that is part of the + `GLM` class. + """ + links = [L.log, L.Power] + variance = V.Power + safe_links = [L.log, L.Power] + + def __init__(self, link=None, var_power=1., link_power=0): + self.var_power = var_power + self.link_power = link_power + self.variance = V.Power(power=var_power * 1.) + if link_power != 0 and not ((link is L.Power) or (link is None)): + msg = 'link_power of {} not supported specified link' + msg = msg.format(link_power) + raise ValueError(msg) + if (link_power == 0) and ((link is None) or (link is L.Log)): + self.link = L.log() + elif link_power != 0: + self.link = L.Power(power=link_power * 1.) + else: + self.link = link() + + def _clean(self, x): + """ + Helper function to trim the data so that is in (0,inf) + + Notes + ----- + The need for this function was discovered through usage and its + possible that other families might need a check for validity of the + domain. + """ + return np.clip(x, 0, np.inf) + + def deviance(self, endog, mu, freq_weights=1., scale=1.): + r""" + Returns the value of the deviance function. + + Parameters + ----------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float, optional + An optional scale argument. The default is 1. + + Returns + ------- + deviance : float + Deviance function as defined below + + Notes + ----- + When :math:`p = 1`, + + .. math:: + + resid\_dev_i = \mu + + when :math:`endog = 0` and + + .. math:: + + resid\_dev_i = endog * \log(endog / \mu) + (\mu - endog) + + otherwise. 
+ + When :math:`p = 2`, + + .. math:: + + resid\_dev_i = (endog - \mu) / \mu - \log(endog / \mu) + + For all other p, + + .. math:: + + resid\_dev_i = endog ^{2 - p} / ((1 - p) * (2 - p)) - + endog * \mu ^{1 - p} / (1 - p) + \mu ^{2 - p} / + (2 - p) + + Once :math:`resid\_dev_i` is calculated, then calculate deviance as + + .. math:: + + D = \sum{2 * freq\_weights * resid\_dev_i} + """ + p = self.var_power + if p == 1: + dev = np.where(endog == 0, + mu, + endog * np.log(endog / mu) + (mu - endog)) + elif p == 2: + endog1 = np.clip(endog, FLOAT_EPS, np.inf) + dev = ((endog - mu) / mu) - np.log(endog1 / mu) + else: + dev = (endog ** (2 - p) / ((1 - p) * (2 - p)) - + endog * mu ** (1-p) / (1 - p) + mu ** (2 - p) / (2 - p)) + return np.sum(2 * freq_weights * dev) + + def resid_dev(self, endog, mu, scale=1.): + r""" + Tweedie Deviance Residual + + Parameters + ---------- + endog : array-like + `endog` is the response variable + mu : array-like + `mu` is the fitted value of the model + scale : float, optional + An optional argument to divide the residuals by scale. The default + is 1. + + Returns + -------- + resid_dev : array + The array of deviance residuals + + Notes + ----- + When :math:`p = 1`, + + .. math:: + + resid\_dev_i = \mu + + when :math:`endog = 0` and + + .. math:: + + resid\_dev_i = endog * \log(endog / \mu) + (\mu - endog) + + otherwise. + + When :math:`p = 2`, + + .. math:: + + resid\_dev_i = (endog - \mu) / \mu - \log(endog / \mu) + + For all other p, + + .. 
math:: + + resid\_dev_i = endog ^{2 - p} / ((1 - p) * (2 - p)) - + endog * \mu ^{1 - p} / (1 - p) + \mu ^{2 - p} / + (2 - p) + """ + p = self.var_power + if p == 1: + dev = np.where(endog == 0, + mu, + endog * np.log(endog / mu) + (mu - endog)) + elif p == 2: + endog1 = np.clip(endog, FLOAT_EPS, np.inf) + dev = ((endog - mu) / mu) - np.log(endog1 / mu) + else: + dev = (endog ** (2 - p) / ((1 - p) * (2 - p)) - + endog * mu ** (1-p) / (1 - p) + mu ** (2 - p) / (2 - p)) + return np.sign(endog - mu) * np.sqrt(2 * dev) + + def loglike(self, endog, mu, freq_weights=1., scale=1.): + r""" + The log-likelihood function in terms of the fitted mean response. + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + The fitted mean response values + freq_weights : array-like + 1d array of frequency weights. The default is 1. + scale : float + The scale parameter. The default is 1. + + Returns + ------- + llf : float + The value of the loglikelihood function evaluated at + (endog,mu,freq_weights,scale) as defined below. + + Notes + ----- + This is not implemented because of the complexity of calculating an + infinite series of sums. + """ + return np.nan + + def resid_anscombe(self, endog, mu): + """ + The Anscombe residuals for the Tweedie family + + Parameters + ---------- + endog : array-like + Endogenous response variable + mu : array-like + Fitted mean response variable + + Returns + ------- + resid_anscombe : array + The Anscombe residuals as defined below. + + Notes + ----- + When :math:`p = 3`, then + + .. math:: + + resid\_anscombe_i = (\log(endog) - \log(\mu)) / \sqrt{mu} + + Otherwise, + + .. math:: + + c = (3 - p) / 3 + + .. math:: + + resid\_anscombe_i = (1 / c) * (endog ^ c - \mu ^ c) / \mu ^{p / 6} + """ + if self.var_power == 3: + return (np.log(endog) - np.log(mu)) / np.sqrt(mu) + else: + c = (3. - self.var_power) / 3. + return ((1. 
class GLM(RegressionPropsY):
    """
    Generalised linear models. Can currently estimate Gaussian, Poisson and
    Logistic regression coefficients. The GLM object prepares the model
    input and the fit method performs the estimation, returning a
    GLMResults object.

    Parameters
    ----------
    y             : array
                    n*1, dependent variable.
    X             : array
                    n*k, independent variable, excluding the constant.
    family        : family instance
                    Distribution of the model, e.g. family.Gaussian()
                    (the default), family.Poisson() or family.Binomial().
    constant      : boolean
                    If True (default) X is passed through
                    USER.check_constant before being stored; if False X
                    is stored as given.

    Attributes
    ----------
    y             : array
                    n*1, dependent variable.
    X             : array
                    n*k, independent variable, including constant.
    family        : family instance
                    Distribution of the model.
    n             : integer
                    Number of observations
    k             : integer
                    Number of columns of the stored X
    df_model      : float
                    k-1, where k is the number of variables (including
                    intercept)
    df_resid      : float
                    observations minus variables (n-k)
    mean_y        : float
                    Mean of y (presumably provided by RegressionPropsY;
                    confirm)
    std_y         : float
                    Standard deviation of y (presumably provided by
                    RegressionPropsY; confirm)
    fit_params    : dict
                    Parameters passed into fit method to define estimation
                    routine.
    """
    def __init__(self, y, X, family=family.Gaussian(), constant=True):
        """
        Validate the inputs and store the design matrix and family.
        """
        self.n = USER.check_arrays(y, X)
        USER.check_y(y, self.n)
        self.y = y
        if constant:
            self.X = USER.check_constant(X)
        else:
            self.X = X
        self.family = family
        self.k = self.X.shape[1]
        self.fit_params = {}

    def fit(self, ini_betas=None, tol=1.0e-6, max_iter=200, solve='iwls'):
        """
        Method that fits a model with a particular estimation routine.

        Parameters
        ----------

        ini_betas     : array
                        k*1, initial coefficient values, including constant.
                        Default is None, which calculates initial values
                        during estimation.
        tol           : float
                        Tolerance for estimation convergence.
        max_iter      : integer
                        Maximum number of iterations if convergence not
                        achieved.
        solve         : string
                        Technique to solve MLE equations.
                        'iwls' = iteratively (re)weighted least squares
                        (default, and the only technique implemented)

        Returns
        -------
        GLMResults
                        Results object built from the estimated
                        coefficients, fitted values and final weights.

        Raises
        ------
        ValueError
                        If `solve` names a technique other than 'iwls'.
        """
        # Reject unknown solvers up front; previously they fell through
        # to the return statement and failed on the unbound `params`.
        if solve.lower() != 'iwls':
            raise ValueError("Unsupported solve technique %r; only 'iwls' "
                             "is implemented" % (solve,))
        self.fit_params['ini_betas'] = ini_betas
        self.fit_params['tol'] = tol
        self.fit_params['max_iter'] = max_iter
        self.fit_params['solve'] = solve
        params, predy, w, n_iter = iwls(self.y, self.X, self.family,
                                        ini_betas=ini_betas, tol=tol,
                                        max_iter=max_iter)
        self.fit_params['n_iter'] = n_iter
        return GLMResults(self, params.flatten(), predy, w)

    @cache_readonly
    def df_model(self):
        """Model degrees of freedom: columns of X minus one."""
        return self.X.shape[1] - 1

    @cache_readonly
    def df_resid(self):
        """Residual degrees of freedom: n - df_model - 1."""
        return self.n - self.df_model - 1
+ x : array + n*k, independent variable, including constant. + family : string + Model type: 'Gaussian', 'Poisson', 'Logistic' + n : integer + Number of observations + k : integer + Number of independent variables + df_model : float + k-1, where k is the number of variables (including + intercept) + df_residual : float + observations minus variables (n-k) + fit_params : dict + parameters passed into fit method to define estimation + routine. + scale : float + sigma squared used for subsequent computations. + params : array + n*k, estimared beta coefficients + w : array + n*1, final weight values of x + mu : array + n*1, predicted value of y (i.e., fittedvalues) + cov_params : array + Variance covariance matrix (kxk) of betas which has been + appropriately scaled by sigma-squared + bse : array + k*1, standard errors of betas + pvalues : array + k*1, two-tailed pvalues of parameters + tvalues : array + k*1, the tvalues of the standard errors + null : array + n*1, predicted values of y for null model + deviance : float + value of the deviance function evalued at params; + see family.py for distribution-specific deviance + null_deviance : float + value of the deviance function for the model fit with + a constant as the only regressor + llf : float + value of the loglikelihood function evalued at params; + see family.py for distribution-specific loglikelihoods + llnull : float + value of log-likelihood function evaluated at null + aic : float + AIC + bic : float + BIC + D2 : float + percent deviance explained + adj_D2 : float + adjusted percent deviance explained + pseudo_R2 : float + McFadden's pseudo R2 (coefficient of determination) + adj_pseudoR2 : float + adjusted McFadden's pseudo R2 + resid_response : array + response residuals; defined as y-mu + resid_pearson : array + Pearson residuals; defined as (y-mu)/sqrt(VAR(mu)) + where VAR is the distribution specific variance + function; see family.py and varfuncs.py for more information. 
+ resid_working : array + Working residuals; the working residuals are defined as + resid_response/link'(mu); see links.py for the + derivatives of the link functions. + + resid_anscombe : array + Anscombe residuals; see family.py for + distribution-specific Anscombe residuals. + + resid_deviance : array + deviance residuals; see family.py for + distribution-specific deviance residuals. + + pearson_chi2 : float + chi-Squared statistic is defined as the sum + of the squares of the Pearson residuals + + normalized_cov_params : array + k*k, approximates [X.T*X]-1 + """ + def __init__(self, model, params, mu, w): + self.model = model + self.n = model.n + self.y = model.y.T.flatten() + self.X = model.X + self.k = model.k + self.family = model.family + self.fit_params = model.fit_params + self.params = params + self.w = w + self.mu = mu.flatten() + self._cache = {} + + @cache_readonly + def df_model(self): + return self.model.df_model + + @cache_readonly + def df_resid(self): + return self.model.df_resid + + @cache_readonly + def normalized_cov_params(self): + return la.inv(spdot(self.w.T, self.w)) + + @cache_readonly + def resid_response(self): + return (self.y-self.mu) + + @cache_readonly + def resid_pearson(self): + return ((self.y-self.mu) / + np.sqrt(self.family.variance(self.mu))) + + @cache_readonly + def resid_working(self): + return (self.resid_response / self.family.link.deriv(self.mu)) + + @cache_readonly + def resid_anscombe(self): + return (self.family.resid_anscombe(self.y, self.mu)) + + @cache_readonly + def resid_deviance(self): + return (self.family.resid_dev(self.y, self.mu)) + + @cache_readonly + def pearson_chi2(self): + chisq = (self.y - self.mu)**2 / self.family.variance(self.mu) + chisqsum = np.sum(chisq) + return chisqsum + + @cache_readonly + def null(self): + y = np.reshape(self.y, (-1,1)) + model = self.model + X = np.ones((len(y), 1)) + null_mod = GLM(y, X, family=self.family, constant=False) + return null_mod.fit().mu + + @cache_readonly + 
def scale(self): + if isinstance(self.family, (family.Binomial, family.Poisson)): + return 1. + else: + return (((np.power(self.resid_response, 2) / + self.family.variance(self.mu))).sum() / + (self.df_resid)) + @cache_readonly + def deviance(self): + return self.family.deviance(self.y, self.mu) + + @cache_readonly + def null_deviance(self): + return self.family.deviance(self.y, self.null) + + @cache_readonly + def llnull(self): + return self.family.loglike(self.y, self.null, scale=self.scale) + + @cache_readonly + def llf(self): + return self.family.loglike(self.y, self.mu, scale=self.scale) + + @cache_readonly + def aic(self): + if isinstance(self.family, family.QuasiPoisson): + return np.nan + else: + return -2 * self.llf + 2*(self.df_model+1) + + @cache_readonly + def bic(self): + return (self.deviance - + (self.model.n - self.df_model - 1) * + np.log(self.model.n)) + + @cache_readonly + def D2(self): + return 1 - (self.deviance / self.null_deviance) + + @cache_readonly + def adj_D2(self): + return 1.0 - (float(self.n) - 1.0)/(float(self.n) - float(self.k)) * (1.0-self.D2) + + @cache_readonly + def pseudoR2(self): + return 1 - (self.llf/self.llnull) + + @cache_readonly + def adj_pseudoR2(self): + return 1 - ((self.llf-self.k)/self.llnull) + diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/glm/iwls.py b/release/python/0.7.0/crankshaft/crankshaft/regression/glm/iwls.py new file mode 100644 index 0000000..3ea6747 --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/glm/iwls.py @@ -0,0 +1,84 @@ +import numpy as np +import numpy.linalg as la +from scipy import sparse as sp +from scipy.sparse import linalg as spla +from pysal.spreg.utils import spdot, spmultiply +from family import Binomial, Poisson + +def _compute_betas(y, x): + """ + compute MLE coefficients using iwls routine + + Methods: p189, Iteratively (Re)weighted Least Squares (IWLS), + Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002). 
def iwls(y, x, family, offset=1.0, ini_betas=None, tol=1.0e-8, max_iter=200, wi=None):
    """
    Iteratively re-weighted least squares estimation routine

    Parameters
    ----------
    y         : array
                dependent variable
    x         : array or sparse matrix
                design matrix; anything that is not exactly a
                numpy.ndarray makes the working weights and response be
                converted to CSR matrices
    family    : family instance
                must provide starting_mu, predict, weights, fitted and
                link.deriv
    offset    : float or array
                for Poisson families only: y is divided by it to obtain
                the starting linear predictor and the fitted mean is
                multiplied by it
    ini_betas : array
                k*1 initial coefficients; zeros when None
    tol       : float
                iteration stops once the convergence measure is <= tol
    max_iter  : integer
                maximum number of iterations
    wi        : array
                when given, coefficients are computed with
                _compute_betas_gwr using these weights (presumably the
                GWR kernel weights of one location; confirm with caller)

    Returns
    -------
    When wi is None: (betas, mu, wx, n_iter).
    Otherwise: (betas, mu, v, w, z, xtx_inv_xt, n_iter).
    """
    n_iter = 0
    diff = 1.0e6
    if ini_betas is None:
        # np.float was only an alias of the builtin float and no longer
        # exists in NumPy >= 1.24.
        betas = np.zeros((x.shape[1], 1), float)
    else:
        betas = ini_betas
    if isinstance(family, Binomial):
        y = family.link._clean(y)
    if isinstance(family, Poisson):
        y_off = y/offset
        y_off = family.starting_mu(y_off)
        v = family.predict(y_off)
        mu = family.starting_mu(y)
    else:
        mu = family.starting_mu(y)
        v = family.predict(mu)

    while diff > tol and n_iter < max_iter:
        n_iter += 1
        w = family.weights(mu)
        # working response of the current linearisation
        z = v + (family.link.deriv(mu)*(y-mu))
        w = np.sqrt(w)
        if type(x) is not np.ndarray:
            w = sp.csr_matrix(w)
            z = sp.csr_matrix(z)
        wx = spmultiply(x, w, array_out=False)
        wz = spmultiply(z, w, array_out=False)
        if wi is None:
            n_betas = _compute_betas(wz, wx)
        else:
            n_betas, xtx_inv_xt = _compute_betas_gwr(wz, wx, wi)
        v = spdot(x, n_betas)
        mu = family.fitted(v)
        if isinstance(family, Poisson):
            mu = mu * offset
        # NOTE(review): convergence is declared as soon as the *smallest*
        # coefficient change is within tol; confirm min (not max) is
        # intended.
        diff = min(abs(n_betas-betas))
        betas = n_betas

    if wi is None:
        return betas, mu, wx, n_iter
    else:
        return betas, mu, v, w, z, xtx_inv_xt, n_iter
class Link(object):
    """
    A generic link function for one-parameter exponential family.

    `Link` does nothing, but lays out the methods expected of any subclass.
    Subclasses must implement `__call__`, `inverse` and `deriv`; `deriv2`
    and `inverse_deriv` have generic implementations built on those.
    """

    def __call__(self, p):
        """
        Return the value of the link function.  This is just a placeholder.

        Parameters
        ----------
        p : array-like
            Probabilities

        Returns
        -------
        g(p) : array-like
            The value of the link function g(p) = z

        Raises
        ------
        NotImplementedError
            Always; subclasses provide the implementation.
        """
        # Raise rather than return the exception class, so that a missing
        # override fails here instead of leaking into later arithmetic.
        raise NotImplementedError

    def inverse(self, z):
        """
        Inverse of the link function.  Just a placeholder.

        Parameters
        ----------
        z : array-like
            `z` is usually the linear predictor of the transformed variable
            in the IRLS algorithm for GLM.

        Returns
        -------
        g^(-1)(z) : array
            The value of the inverse of the link function g^(-1)(z) = p

        Raises
        ------
        NotImplementedError
            Always; subclasses provide the implementation.
        """
        raise NotImplementedError

    def deriv(self, p):
        """
        Derivative of the link function g'(p).  Just a placeholder.

        Parameters
        ----------
        p : array-like

        Returns
        -------
        g'(p) : array
            The value of the derivative of the link function g'(p)

        Raises
        ------
        NotImplementedError
            Always; subclasses provide the implementation.
        """
        raise NotImplementedError

    def deriv2(self, p):
        """Second derivative of the link function g''(p)

        Implemented through complex-step numerical differentiation of
        `deriv`, using numpy only so that no extra package is needed at
        call time.  `deriv` is assumed to act element-wise and to accept
        complex input.

        Parameters
        ----------
        p : array-like

        Returns
        -------
        g''(p) : array
            Numerical approximation of the second derivative at `p`
        """
        p = np.asarray(p, dtype=float)
        # Step proportional to machine epsilon; the complex-step scheme
        # has no subtractive cancellation, so a tiny step is safe.
        step = np.finfo(float).eps * np.maximum(np.abs(p), 0.1)
        return np.imag(self.deriv(p + 1j * step)) / step

    def inverse_deriv(self, z):
        """
        Derivative of the inverse link function g^(-1)(z).

        Notes
        -----
        This reference implementation gives the correct result but is
        inefficient, so it can be overridden in subclasses.

        Parameters
        ----------
        z : array-like
            `z` is usually the linear predictor for a GLM or GEE model.

        Returns
        -------
        g'^(-1)(z) : array
            The value of the derivative of the inverse of the link function

        """
        return 1 / self.deriv(self.inverse(z))
class Power(Link):
    """
    Link that raises the mean to a fixed exponent.

    Parameters
    ----------
    power : float
        Exponent applied by the transform.  The default is 1.

    Notes
    -----
    Fixed-exponent aliases of Power:
    inverse = Power(power=-1)
    sqrt = Power(power=.5)
    inverse_squared = Power(power=-2.)
    identity = Power(power=1.)
    """

    def __init__(self, power=1.):
        self.power = power

    def __call__(self, p):
        """
        Apply the power transform.

        Parameters
        ----------
        p : array-like
            Mean parameters

        Returns
        -------
        z : array-like
            `p` raised to `power`

        Notes
        -----
        g(`p`) = `p`**`power`
        """
        return np.power(p, self.power)

    def inverse(self, z):
        """
        Undo the power transform.

        Parameters
        ----------
        z : array-like
            Value of the transformed mean parameters at `p`

        Returns
        -------
        p : array
            Mean parameters

        Notes
        -----
        g^(-1)(`z`) = `z`**(1/`power`)
        """
        exponent = 1. / self.power
        return np.power(z, exponent)

    def deriv(self, p):
        """
        First derivative of the power transform.

        Parameters
        ----------
        p : array-like
            Mean parameters

        Returns
        --------
        g'(p) : array
            Derivative of the power transform at `p`

        Notes
        -----
        g'(`p`) = `power` * `p`**(`power` - 1)
        """
        k = self.power
        return k * np.power(p, k - 1)

    def deriv2(self, p):
        """
        Second derivative of the power transform.

        Parameters
        ----------
        p : array-like
            Mean parameters

        Returns
        --------
        g''(p) : array
            Second derivative of the power transform at `p`

        Notes
        -----
        g''(`p`) = `power` * (`power` - 1) * `p`**(`power` - 2)
        """
        k = self.power
        return k * (k - 1) * np.power(p, k - 2)

    def inverse_deriv(self, z):
        """
        Derivative of the inverse of the power transform.

        Parameters
        ----------
        z : array-like
            `z` is usually the linear predictor for a GLM or GEE model.

        Returns
        -------
        g^(-1)'(z) : array
            `z`**((1 - `power`)/`power`) / `power`
        """
        k = self.power
        return np.power(z, (1 - k)/k) / k
+ """ + def __init__(self): + super(identity, self).__init__(power=1.) + + +class Log(Link): + """ + The log transform + + Notes + ----- + __call__, deriv and deriv2 call the private method _clean to clip the + data to [FLOAT_EPS, inf). log is an alias of Log. + """ + + def _clean(self, x): + return np.clip(x, FLOAT_EPS, np.inf) + + def __call__(self, p, **extra): + """ + Log transform link function + + Parameters + ---------- + p : array-like + Mean parameters + + Returns + ------- + z : array + log(p) + + Notes + ----- + g(p) = log(p) + """ + x = self._clean(p) + return np.log(x) + + def inverse(self, z): + """ + Inverse of log transform link function + + Parameters + ---------- + z : array + The inverse of the link function at `p` + + Returns + ------- + p : array + The mean parameters given the value of the inverse `z` + + Notes + ----- + g^{-1}(z) = exp(z) + """ + return np.exp(z) + + def deriv(self, p): + """ + Derivative of log transform link function + + Parameters + ---------- + p : array-like + Mean parameters + + Returns + ------- + g'(p) : array + derivative of log transform of p + + Notes + ----- + g'(p) = 1/p + """ + p = self._clean(p) + return 1. / p + + def deriv2(self, p): + """ + Second derivative of the log transform link function + + Parameters + ---------- + p : array-like + Mean parameters + + Returns + ------- + g''(p) : array + Second derivative of log transform of p + + Notes + ----- + g''(p) = -1/p^2 + """ + p = self._clean(p) + return -1. / p**2 + + def inverse_deriv(self, z): + """ + Derivative of the inverse of the log transform link function + + Parameters + ---------- + z : array + The inverse of the link function at `p` + + Returns + ------- + g^(-1)'(z) : array + The value of the derivative of the inverse of the log function, + the exponential function + """ + return np.exp(z) + + +class log(Log): + """ + The log transform + + Notes + ----- + log is an alias of Log.
+ """ + pass + + +# TODO: the CDFLink is untested +class CDFLink(Logit): + """ + Use the CDF of a scipy.stats distribution as a link function + + CDFLink is a subclass of Logit in order to use its _clean method + for the link and its derivative. + + Parameters + ---------- + dbn : scipy.stats distribution + Default is dbn=scipy.stats.norm + + Notes + ----- + The CDF link is untested. + """ + + def __init__(self, dbn=scipy.stats.norm): + self.dbn = dbn + + def __call__(self, p): + """ + CDF link function + + Parameters + ---------- + p : array-like + Mean parameters + + Returns + ------- + z : array + (ppf) inverse of CDF transform of p + + Notes + ----- + g(`p`) = `dbn`.ppf(`p`) + """ + p = self._clean(p) + return self.dbn.ppf(p) + + def inverse(self, z): + """ + The inverse of the CDF link + + Parameters + ---------- + z : array-like + The value of the inverse of the link function at `p` + + Returns + ------- + p : array + Mean probabilities. The value of the inverse of CDF link of `z` + + Notes + ----- + g^(-1)(`z`) = `dbn`.cdf(`z`) + """ + return self.dbn.cdf(z) + + def deriv(self, p): + """ + Derivative of CDF link + + Parameters + ---------- + p : array-like + mean parameters + + Returns + ------- + g'(p) : array + The derivative of CDF transform at `p` + + Notes + ----- + g'(`p`) = 1./ `dbn`.pdf(`dbn`.ppf(`p`)) + """ + p = self._clean(p) + return 1.
/ self.dbn.pdf(self.dbn.ppf(p)) + + def deriv2(self, p): + """ + Second derivative of the link function g''(p) + + implemented through numerical differentiation (requires statsmodels) + """ + from statsmodels.tools.numdiff import approx_fprime + p = np.atleast_1d(p) + # Note: special function for norm.ppf does not support complex + return np.diag(approx_fprime(p, self.deriv, centered=True)) + + def inverse_deriv(self, z): + """ + Derivative of the inverse of the CDF transformation link function + + Parameters + ---------- + z : array + The inverse of the link function at `p` + + Returns + ------- + g^(-1)'(z) : array + The value of the derivative of the inverse of the CDF link function + """ + return 1/self.deriv(self.inverse(z)) + + +class probit(CDFLink): + """ + The probit (standard normal CDF) transform + + Notes + -------- + g(p) = scipy.stats.norm.ppf(p) + + probit is an alias of CDFLink. + """ + pass + + +class cauchy(CDFLink): + """ + The Cauchy (standard Cauchy CDF) transform + + Notes + ----- + g(p) = scipy.stats.cauchy.ppf(p) + + cauchy is an alias of CDFLink with dbn=scipy.stats.cauchy + """ + + def __init__(self): + super(cauchy, self).__init__(dbn=scipy.stats.cauchy) + + def deriv2(self, p): + """ + Second derivative of the Cauchy link function. + + Parameters + ---------- + p: array-like + Probabilities + + Returns + ------- + g''(p) : array + Value of the second derivative of Cauchy link function at `p` + """ + a = np.pi * (p - 0.5) + d2 = 2 * np.pi**2 * np.sin(a) / np.cos(a)**3 + return d2 + +class CLogLog(Logit): + """ + The complementary log-log transform + + CLogLog inherits from Logit in order to have access to its _clean method + for the link and its derivative. + + Notes + ----- + CLogLog is untested.
+ """ + def __call__(self, p): + """ + C-Log-Log transform link function + + Parameters + ---------- + p : array + Mean parameters + + Returns + ------- + z : array + The CLogLog transform of `p` + + Notes + ----- + g(p) = log(-log(1-p)) + """ + p = self._clean(p) + return np.log(-np.log(1 - p)) + + def inverse(self, z): + """ + Inverse of C-Log-Log transform link function + + + Parameters + ---------- + z : array-like + The value of the inverse of the CLogLog link function at `p` + + Returns + ------- + p : array + Mean parameters + + Notes + ----- + g^(-1)(`z`) = 1-exp(-exp(`z`)) + """ + return 1 - np.exp(-np.exp(z)) + + def deriv(self, p): + """ + Derivative of C-Log-Log transform link function + + Parameters + ---------- + p : array-like + Mean parameters + + Returns + ------- + g'(p) : array + The derivative of the CLogLog transform link function + + Notes + ----- + g'(p) = 1 / ((p-1)*log(1-p)) + """ + p = self._clean(p) + return 1. / ((p - 1) * (np.log(1 - p))) + + def deriv2(self, p): + """ + Second derivative of the C-Log-Log link function + + Parameters + ---------- + p : array-like + Mean parameters + + Returns + ------- + g''(p) : array + The second derivative of the CLogLog link function + """ + p = self._clean(p) + fl = np.log(1 - p) + d2 = -1 / ((1 - p)**2 * fl) + d2 *= 1 + 1 / fl + return d2 + + def inverse_deriv(self, z): + """ + Derivative of the inverse of the C-Log-Log transform link function + + Parameters + ---------- + z : array-like + The value of the inverse of the CLogLog link function at `p` + + Returns + ------- + g^(-1)'(z) : array + The derivative of the inverse of the CLogLog link function + """ + return np.exp(z - np.exp(z)) + + +class cloglog(CLogLog): + """ + The CLogLog transform link function.
+ + Notes + ----- + g(`p`) = log(-log(1-`p`)) + + cloglog is an alias for CLogLog + cloglog = CLogLog() + """ + pass + + +class NegativeBinomial(object): + ''' + The negative binomial link function + + Parameters + ---------- + alpha : float, optional + Alpha is the ancillary parameter of the Negative Binomial link + function. It is assumed to be nonstochastic. The default value is 1. + Permissible values are usually assumed to be in (.01, 2). + ''' + + def __init__(self, alpha=1.): + self.alpha = alpha + + def _clean(self, x): + return np.clip(x, FLOAT_EPS, np.inf) + + def __call__(self, p): + ''' + Negative Binomial transform link function + + Parameters + ---------- + p : array-like + Mean parameters + + Returns + ------- + z : array + The negative binomial transform of `p` + + Notes + ----- + g(p) = log(p/(p + 1/alpha)) + ''' + p = self._clean(p) + return np.log(p/(p + 1/self.alpha)) + + def inverse(self, z): + ''' + Inverse of the negative binomial transform + + Parameters + ----------- + z : array-like + The value of the inverse of the negative binomial link at `p`. + + Returns + ------- + p : array + Mean parameters + + Notes + ----- + g^(-1)(z) = exp(z)/(alpha*(1-exp(z))) + ''' + return -1/(self.alpha * (1 - np.exp(-z))) + + def deriv(self, p): + ''' + Derivative of the negative binomial transform + + Parameters + ---------- + p : array-like + Mean parameters + + Returns + ------- + g'(p) : array + The derivative of the negative binomial transform link function + + Notes + ----- + g'(x) = 1/(x+alpha*x^2) + ''' + return 1/(p + self.alpha * p**2) + + def deriv2(self,p): + ''' + Second derivative of the negative binomial link function. 
+ + Parameters + ---------- + p : array-like + Mean parameters + + Returns + ------- + g''(p) : array + The second derivative of the negative binomial transform link + function + + Notes + ----- + g''(x) = -(1+2*alpha*x)/(x+alpha*x^2)^2 + ''' + numer = -(1 + 2 * self.alpha * p) + denom = (p + self.alpha * p**2)**2 + return numer / denom + + def inverse_deriv(self, z): + ''' + Derivative of the inverse of the negative binomial transform + + Parameters + ----------- + z : array-like + Usually the linear predictor for a GLM or GEE model + + Returns + ------- + g^(-1)'(z) : array + The value of the derivative of the inverse of the negative + binomial link + ''' + t = np.exp(z) + return t / (self.alpha * (1-t)**2) + + +class nbinom(NegativeBinomial): + """ + The negative binomial link function. + + Notes + ----- + g(p) = log(p/(p + 1/alpha)) + + nbinom is an alias of NegativeBinomial. + nbinom = NegativeBinomial(alpha=1.) + """ + pass diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/glm/tests/test_glm.py b/release/python/0.7.0/crankshaft/crankshaft/regression/glm/tests/test_glm.py new file mode 100644 index 0000000..b86ad6a --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/glm/tests/test_glm.py @@ -0,0 +1,993 @@ +""" +Tests for generalized linear models. Majority of code either directly borrowed +or closely adapted from statsmodels package. Model results verified using glm +function in R and GLM function in statsmodels.
+""" + +__author__ = 'Taylor Oshan tayoshan@gmail.com' + +from pysal.contrib.glm.glm import GLM +from pysal.contrib.glm.family import Gaussian, Poisson, Binomial, QuasiPoisson +import numpy as np +import pysal +import unittest +import math + + +class TestGaussian(unittest.TestCase): + """ + Tests for Gaussian GLM + """ + + def setUp(self): + db = pysal.open(pysal.examples.get_path('columbus.dbf'),'r') + y = np.array(db.by_col("HOVAL")) + self.y = np.reshape(y, (49,1)) + X = [] + X.append(db.by_col("INC")) + X.append(db.by_col("CRIME")) + self.X = np.array(X).T + + def testIWLS(self): + model = GLM(self.y, self.X, family=Gaussian()) + results = model.fit() + self.assertEqual(results.n, 49) + self.assertEqual(results.df_model, 2) + self.assertEqual(results.df_resid, 46) + self.assertEqual(results.aic, 408.73548964604873) + self.assertEqual(results.bic, 10467.991340493107) + self.assertEqual(results.deviance, 10647.015074206196) + self.assertEqual(results.llf, -201.36774482302437) + self.assertEqual(results.null_deviance, 16367.794631703124) + self.assertEqual(results.scale, 231.45684943926514) + np.testing.assert_allclose(results.params, [ 46.42818268, 0.62898397, + -0.48488854]) + np.testing.assert_allclose(results.bse, [ 13.19175703, 0.53591045, + 0.18267291]) + np.testing.assert_allclose(results.cov_params(), + [[ 1.74022453e+02, -6.52060364e+00, -2.15109867e+00], + [ -6.52060364e+00, 2.87200008e-01, 6.80956787e-02], + [ -2.15109867e+00, 6.80956787e-02, 3.33693910e-02]]) + np.testing.assert_allclose(results.tvalues, [ 3.51948437, 1.17367365, + -2.65440864]) + np.testing.assert_allclose(results.pvalues, [ 0.00043239, 0.24052577, + 0.00794475], atol=1.0e-8) + np.testing.assert_allclose(results.conf_int(), + [[ 20.57281401, 72.28355135], + [ -0.42138121, 1.67934915], + [ -0.84292086, -0.12685622]]) + np.testing.assert_allclose(results.normalized_cov_params, + [[ 7.51857004e-01, -2.81720055e-02, -9.29373521e-03], + [ -2.81720055e-02, 1.24083607e-03, 2.94204638e-04], +
[ -9.29373521e-03, 2.94204638e-04, 1.44171110e-04]]) + np.testing.assert_allclose(results.mu, + [ 51.08752105, 50.66601521, 41.61367567, 33.53969014, + 28.90638232, 43.87074227, 51.64910882, 34.92671563, + 42.69267622, 38.49449134, 20.92815471, 25.25228436, + 29.78223486, 25.02403635, 29.07959539, 24.63352275, + 34.71372149, 33.40443052, 27.29864225, 65.86219802, + 33.69854751, 37.44976435, 50.01304928, 36.81219959, + 22.02674837, 31.64775955, 27.63563294, 23.7697291 , + 22.43119725, 21.76987089, 48.51169321, 49.05891819, + 32.31656426, 44.20550354, 35.49244888, 51.27811308, + 36.55047181, 27.37048914, 48.78812922, 57.31744163, + 51.22914162, 54.70515578, 37.06622277, 44.5075759 , + 41.24328983, 49.93821824, 44.85644299, 40.93838609, 47.32045464]) + self.assertEqual(results.pearson_chi2, 10647.015074206196) + np.testing.assert_allclose(results.resid_response, + [ 29.37948195, -6.09901421, -15.26367567, -0.33968914, + -5.68138232, -15.12074227, 23.35089118, 2.19828437, + 9.90732178, 57.90551066, -1.22815371, -5.35228436, + 11.91776614, 17.87596565, -11.07959539, -5.83352375, + 7.03627851, 26.59556948, 3.30135775, 15.40479998, + -13.72354751, -6.99976335, -2.28004728, 16.38780141, + -4.12674837, -11.34776055, 6.46436506, -0.9197291 , + 10.06880275, 0.73012911, -16.71169421, -8.75891919, + -8.71656426, -15.75550254, -8.49244888, -14.97811408, + 6.74952719, -4.67048814, -9.18813122, 4.63255937, + -9.12914362, -10.37215578, -11.36622177, -11.0075759 , + -13.51028983, 26.16177976, -2.35644299, -14.13838709, -11.52045564]) + np.testing.assert_allclose(results.resid_working, + [ 29.37948195, -6.09901421, -15.26367567, -0.33968914, + -5.68138232, -15.12074227, 23.35089118, 2.19828437, + 9.90732178, 57.90551066, -1.22815371, -5.35228436, + 11.91776614, 17.87596565, -11.07959539, -5.83352375, + 7.03627851, 26.59556948, 3.30135775, 15.40479998, + -13.72354751, -6.99976335, -2.28004728, 16.38780141, + -4.12674837, -11.34776055, 6.46436506, -0.9197291 , + 10.06880275, 
0.73012911, -16.71169421, -8.75891919, + -8.71656426, -15.75550254, -8.49244888, -14.97811408, + 6.74952719, -4.67048814, -9.18813122, 4.63255937, + -9.12914362, -10.37215578, -11.36622177, -11.0075759 , + -13.51028983, 26.16177976, -2.35644299, -14.13838709, -11.52045564]) + np.testing.assert_allclose(results.resid_pearson, + [ 29.37948195, -6.09901421, -15.26367567, -0.33968914, + -5.68138232, -15.12074227, 23.35089118, 2.19828437, + 9.90732178, 57.90551066, -1.22815371, -5.35228436, + 11.91776614, 17.87596565, -11.07959539, -5.83352375, + 7.03627851, 26.59556948, 3.30135775, 15.40479998, + -13.72354751, -6.99976335, -2.28004728, 16.38780141, + -4.12674837, -11.34776055, 6.46436506, -0.9197291 , + 10.06880275, 0.73012911, -16.71169421, -8.75891919, + -8.71656426, -15.75550254, -8.49244888, -14.97811408, + 6.74952719, -4.67048814, -9.18813122, 4.63255937, + -9.12914362, -10.37215578, -11.36622177, -11.0075759 , + -13.51028983, 26.16177976, -2.35644299, -14.13838709, -11.52045564]) + np.testing.assert_allclose(results.resid_anscombe, + [ 29.37948195, -6.09901421, -15.26367567, -0.33968914, + -5.68138232, -15.12074227, 23.35089118, 2.19828437, + 9.90732178, 57.90551066, -1.22815371, -5.35228436, + 11.91776614, 17.87596565, -11.07959539, -5.83352375, + 7.03627851, 26.59556948, 3.30135775, 15.40479998, + -13.72354751, -6.99976335, -2.28004728, 16.38780141, + -4.12674837, -11.34776055, 6.46436506, -0.9197291 , + 10.06880275, 0.73012911, -16.71169421, -8.75891919, + -8.71656426, -15.75550254, -8.49244888, -14.97811408, + 6.74952719, -4.67048814, -9.18813122, 4.63255937, + -9.12914362, -10.37215578, -11.36622177, -11.0075759 , + -13.51028983, 26.16177976, -2.35644299, -14.13838709, -11.52045564]) + np.testing.assert_allclose(results.resid_deviance, + [ 29.37948195, -6.09901421, -15.26367567, -0.33968914, + -5.68138232, -15.12074227, 23.35089118, 2.19828437, + 9.90732178, 57.90551066, -1.22815371, -5.35228436, + 11.91776614, 17.87596565, -11.07959539, -5.83352375, + 
7.03627851, 26.59556948, 3.30135775, 15.40479998, + -13.72354751, -6.99976335, -2.28004728, 16.38780141, + -4.12674837, -11.34776055, 6.46436506, -0.9197291 , + 10.06880275, 0.73012911, -16.71169421, -8.75891919, + -8.71656426, -15.75550254, -8.49244888, -14.97811408, + 6.74952719, -4.67048814, -9.18813122, 4.63255937, + -9.12914362, -10.37215578, -11.36622177, -11.0075759 , + -13.51028983, 26.16177976, -2.35644299, -14.13838709, -11.52045564]) + np.testing.assert_allclose(results.null, + [ 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, + 38.43622447, 38.43622447, 38.43622447, 38.43622447, 38.43622447]) + self.assertAlmostEqual(results.D2, .349514377851) + self.assertAlmostEqual(results.adj_D2, 0.32123239427957673) + +class TestPoisson(unittest.TestCase): + + def setUp(self): + db = pysal.open(pysal.examples.get_path('columbus.dbf'),'r') + y = np.array(db.by_col("HOVAL")) + y = np.reshape(y, (49,1)) + self.y = np.round(y).astype(int) + X = [] + X.append(db.by_col("INC")) + X.append(db.by_col("CRIME")) + self.X = np.array(X).T + + def testIWLS(self): + model = GLM(self.y, self.X, family=Poisson()) + results = model.fit() + self.assertEqual(results.n, 49) + self.assertEqual(results.df_model, 2) + self.assertEqual(results.df_resid, 46) + self.assertAlmostEqual(results.aic, 500.85184179938756) + self.assertAlmostEqual(results.bic, 51.436404535087661) + self.assertAlmostEqual(results.deviance, 230.46013824817649) + 
self.assertAlmostEqual(results.llf, -247.42592089969378) + self.assertAlmostEqual(results.null_deviance, 376.97293610347361) + self.assertEqual(results.scale, 1.0) + np.testing.assert_allclose(results.params, [ 3.92159085, 0.01183491, + -0.01371397], atol=1.0e-8) + np.testing.assert_allclose(results.bse, [ 0.13049161, 0.00511599, + 0.00193769], atol=1.0e-8) + np.testing.assert_allclose(results.cov_params(), + [[ 1.70280610e-02, -6.18628383e-04, -2.21386966e-04], + [ -6.18628383e-04, 2.61733917e-05, 6.77496445e-06], + [ -2.21386966e-04, 6.77496445e-06, 3.75463502e-06]]) + np.testing.assert_allclose(results.tvalues, [ 30.0524361 , 2.31331634, + -7.07748998]) + np.testing.assert_allclose(results.pvalues, [ 2.02901657e-198, + 2.07052532e-002, 1.46788805e-012]) + np.testing.assert_allclose(results.conf_int(), + [[ 3.66583199e+00, 4.17734972e+00], + [ 1.80774841e-03, 2.18620753e-02], + [ -1.75117666e-02, -9.91616901e-03]]) + np.testing.assert_allclose(results.normalized_cov_params, + [[ 1.70280610e-02, -6.18628383e-04, -2.21386966e-04], + [ -6.18628383e-04, 2.61733917e-05, 6.77496445e-06], + [ -2.21386966e-04, 6.77496445e-06, 3.75463502e-06]]) + np.testing.assert_allclose(results.mu, + [ 51.26831574, 50.15022766, 40.06142973, 34.13799739, + 28.76119226, 42.6836241 , 55.64593703, 34.08277997, + 40.90389582, 37.19727958, 23.47459217, 26.12384057, + 29.78303507, 25.96888223, 29.14073823, 26.04369592, + 34.18996367, 32.28924005, 27.42284396, 72.69207879, + 33.05316347, 36.52276972, 49.2551479 , 35.33439632, + 24.07252457, 31.67153709, 27.81699478, 25.38021219, + 24.31759259, 23.13586161, 48.40724678, 48.57969818, + 31.92596006, 43.3679231 , 34.32925819, 51.78908089, + 34.49778584, 27.56236198, 48.34273194, 57.50829097, + 50.66038226, 54.68701352, 35.77103116, 43.21886784, + 40.07615759, 49.98658004, 43.13352883, 40.28520774, 46.28910294]) + self.assertAlmostEqual(results.pearson_chi2, 264.62262932090221) + np.testing.assert_allclose(results.resid_response, + [ 28.73168426, 
-5.15022766, -14.06142973, -1.13799739, + -5.76119226, -13.6836241 , 19.35406297, 2.91722003, + 12.09610418, 58.80272042, -3.47459217, -6.12384057, + 12.21696493, 17.03111777, -11.14073823, -7.04369592, + 7.81003633, 27.71075995, 3.57715604, 8.30792121, + -13.05316347, -6.52276972, -1.2551479 , 17.66560368, + -6.07252457, -11.67153709, 6.18300522, -2.38021219, + 7.68240741, -1.13586161, -16.40724678, -8.57969818, + -7.92596006, -15.3679231 , -7.32925819, -15.78908089, + 8.50221416, -4.56236198, -8.34273194, 4.49170903, + -8.66038226, -10.68701352, -9.77103116, -9.21886784, + -12.07615759, 26.01341996, -1.13352883, -13.28520774, -10.28910294]) + np.testing.assert_allclose(results.resid_working, + [ 1473.02506034, -258.28508941, -563.32097891, -38.84895192, + -165.69875817, -584.06666725, 1076.97496919, 99.42696848, + 494.77778514, 2187.30123163, -81.56463405, -159.97823479, + 363.858295 , 442.27909165, -324.64933645, -183.44387481, + 267.02485844, 894.75938 , 98.09579187, 603.9200634 , + -431.44834594, -238.2296165 , -61.82249568, 624.20344168, + -146.18099686, -369.65551968, 171.99262399, -60.41029031, + 186.81765356, -26.27913713, -794.22964417, -416.79914795, + -253.04388425, -666.47490701, -251.6079969 , -817.70198717, + 293.30756327, -125.74947222, -403.31045369, 258.31051005, + -438.73827602, -584.440853 , -349.51985996, -398.42903071, + -483.96599444, 1300.32189904, -48.89309853, -535.19735391, + -476.27334527]) + np.testing.assert_allclose(results.resid_pearson, + [ 4.01269878, -0.72726045, -2.221602 , -0.19477008, -1.07425881, + -2.09445239, 2.59451042, 0.49969118, 1.89131202, 9.64143836, + -0.71714142, -1.19813392, 2.23861212, 3.34207756, -2.0637814 , + -1.3802231 , 1.33568403, 4.87662684, 0.68309584, 0.97442591, + -2.27043598, -1.07931992, -0.17884182, 2.97186889, -1.23768025, + -2.07392709, 1.1723155 , -0.47246327, 1.55789092, -0.23614708, + -2.35819937, -1.23096188, -1.40274877, -2.33362391, -1.25091503, + -2.19400568, 1.44755952, -0.8690235 , 
-1.19989348, 0.59230634, + -1.21675413, -1.44515442, -1.63370888, -1.40229988, -1.90759306, + 3.67934693, -0.17259375, -2.09312684, -1.51230062]) + np.testing.assert_allclose(results.resid_anscombe, + [ 3.70889134, -0.74031295, -2.37729865, -0.19586855, -1.11374751, + -2.22611959, 2.46352013, 0.49282126, 1.80857757, 8.06444452, + -0.73610811, -1.25061371, 2.10820431, 3.05467547, -2.22437611, + -1.45136173, 1.28939698, 4.35942058, 0.66904552, 0.95674923, + -2.45438937, -1.11429881, -0.17961012, 2.76715848, -1.29658591, + -2.22816691, 1.13269136, -0.48017382, 1.48562248, -0.23812278, + -2.51664399, -1.2703721 , -1.4683091 , -2.49907536, -1.30026484, + -2.32398309, 1.39380683, -0.89495368, -1.23735395, 0.58485202, + -1.25435224, -1.4968484 , -1.71888038, -1.45756652, -2.01906267, + 3.41729922, -0.17335867, -2.22921828, -1.57470549]) + np.testing.assert_allclose(results.resid_deviance, + [ 3.70529668, -0.74027329, -2.37536322, -0.19586751, -1.11349765, + -2.22466106, 2.46246446, 0.4928057 , 1.80799655, 8.02696525, + -0.73602255, -1.25021555, 2.10699958, 3.05084608, -2.22214376, + -1.45072221, 1.28913747, 4.35106213, 0.6689982 , 0.95669662, + -2.45171913, -1.11410444, -0.17960956, 2.76494217, -1.29609865, + -2.22612429, 1.13247453, -0.48015254, 1.48508549, -0.23812 , + -2.51476072, -1.27015583, -1.46777697, -2.49699318, -1.29992892, + -2.32263069, 1.39348459, -0.89482132, -1.23715363, 0.58483655, + -1.25415329, -1.49653039, -1.7181055 , -1.45719072, -2.01791949, + 3.41437156, -0.1733581 , -2.22765605, -1.57426046]) + np.testing.assert_allclose(results.null, + [ 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 
38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, 38.42857143]) + self.assertAlmostEqual(results.D2, .388656011675) + self.assertAlmostEqual(results.adj_D2, 0.36207583826952761)#.375648692774) + + def testQuasi(self): + model = GLM(self.y, self.X, family=QuasiPoisson()) + results = model.fit() + self.assertEqual(results.n, 49) + self.assertEqual(results.df_model, 2) + self.assertEqual(results.df_resid, 46) + self.assertTrue(math.isnan(results.aic)) + self.assertAlmostEqual(results.bic, 51.436404535087661) + self.assertAlmostEqual(results.deviance, 230.46013824817649) + self.assertTrue(math.isnan(results.llf)) + self.assertAlmostEqual(results.null_deviance, 376.97293610347361) + self.assertAlmostEqual(results.scale, 5.7526658548022223) + np.testing.assert_allclose(results.params, [ 3.92159085, 0.01183491, + -0.01371397], atol=1.0e-8) + np.testing.assert_allclose(results.bse, [ 0.31298042, 0.01227057, + 0.00464749], atol=1.0e-8) + np.testing.assert_allclose(results.cov_params(), + [[ 9.79567451e-02, -3.55876238e-03, -1.27356524e-03], + [ -3.55876238e-03, 1.50566777e-04, 3.89741067e-05], + [ -1.27356524e-03, 3.89741067e-05, 2.15991606e-05]]) + np.testing.assert_allclose(results.tvalues, [ 12.52982796, 0.96449604, + -2.95083339]) + np.testing.assert_allclose(results.pvalues, [ 5.12737770e-36, + 3.34797291e-01, 3.16917819e-03]) + np.testing.assert_allclose(results.conf_int(), + [[ 3.3081605 , 4.53502121], + [-0.01221495, 0.03588478], + [-0.02282288, -0.00460506]], atol=1.0e-8) + np.testing.assert_allclose(results.normalized_cov_params, + [[ 1.70280610e-02, -6.18628383e-04, -2.21386966e-04], + [ -6.18628383e-04, 2.61733917e-05, 6.77496445e-06], + [ -2.21386966e-04, 6.77496445e-06, 3.75463502e-06]]) + np.testing.assert_allclose(results.mu, + [ 51.26831574, 50.15022766, 40.06142973, 34.13799739, + 
28.76119226, 42.6836241 , 55.64593703, 34.08277997, + 40.90389582, 37.19727958, 23.47459217, 26.12384057, + 29.78303507, 25.96888223, 29.14073823, 26.04369592, + 34.18996367, 32.28924005, 27.42284396, 72.69207879, + 33.05316347, 36.52276972, 49.2551479 , 35.33439632, + 24.07252457, 31.67153709, 27.81699478, 25.38021219, + 24.31759259, 23.13586161, 48.40724678, 48.57969818, + 31.92596006, 43.3679231 , 34.32925819, 51.78908089, + 34.49778584, 27.56236198, 48.34273194, 57.50829097, + 50.66038226, 54.68701352, 35.77103116, 43.21886784, + 40.07615759, 49.98658004, 43.13352883, 40.28520774, 46.28910294]) + self.assertAlmostEqual(results.pearson_chi2, 264.62262932090221) + np.testing.assert_allclose(results.resid_response, + [ 28.73168426, -5.15022766, -14.06142973, -1.13799739, + -5.76119226, -13.6836241 , 19.35406297, 2.91722003, + 12.09610418, 58.80272042, -3.47459217, -6.12384057, + 12.21696493, 17.03111777, -11.14073823, -7.04369592, + 7.81003633, 27.71075995, 3.57715604, 8.30792121, + -13.05316347, -6.52276972, -1.2551479 , 17.66560368, + -6.07252457, -11.67153709, 6.18300522, -2.38021219, + 7.68240741, -1.13586161, -16.40724678, -8.57969818, + -7.92596006, -15.3679231 , -7.32925819, -15.78908089, + 8.50221416, -4.56236198, -8.34273194, 4.49170903, + -8.66038226, -10.68701352, -9.77103116, -9.21886784, + -12.07615759, 26.01341996, -1.13352883, -13.28520774, -10.28910294]) + np.testing.assert_allclose(results.resid_working, + [ 1473.02506034, -258.28508941, -563.32097891, -38.84895192, + -165.69875817, -584.06666725, 1076.97496919, 99.42696848, + 494.77778514, 2187.30123163, -81.56463405, -159.97823479, + 363.858295 , 442.27909165, -324.64933645, -183.44387481, + 267.02485844, 894.75938 , 98.09579187, 603.9200634 , + -431.44834594, -238.2296165 , -61.82249568, 624.20344168, + -146.18099686, -369.65551968, 171.99262399, -60.41029031, + 186.81765356, -26.27913713, -794.22964417, -416.79914795, + -253.04388425, -666.47490701, -251.6079969 , -817.70198717, + 
293.30756327, -125.74947222, -403.31045369, 258.31051005, + -438.73827602, -584.440853 , -349.51985996, -398.42903071, + -483.96599444, 1300.32189904, -48.89309853, -535.19735391, + -476.27334527]) + np.testing.assert_allclose(results.resid_pearson, + [ 4.01269878, -0.72726045, -2.221602 , -0.19477008, -1.07425881, + -2.09445239, 2.59451042, 0.49969118, 1.89131202, 9.64143836, + -0.71714142, -1.19813392, 2.23861212, 3.34207756, -2.0637814 , + -1.3802231 , 1.33568403, 4.87662684, 0.68309584, 0.97442591, + -2.27043598, -1.07931992, -0.17884182, 2.97186889, -1.23768025, + -2.07392709, 1.1723155 , -0.47246327, 1.55789092, -0.23614708, + -2.35819937, -1.23096188, -1.40274877, -2.33362391, -1.25091503, + -2.19400568, 1.44755952, -0.8690235 , -1.19989348, 0.59230634, + -1.21675413, -1.44515442, -1.63370888, -1.40229988, -1.90759306, + 3.67934693, -0.17259375, -2.09312684, -1.51230062]) + np.testing.assert_allclose(results.resid_anscombe, + [ 3.70889134, -0.74031295, -2.37729865, -0.19586855, -1.11374751, + -2.22611959, 2.46352013, 0.49282126, 1.80857757, 8.06444452, + -0.73610811, -1.25061371, 2.10820431, 3.05467547, -2.22437611, + -1.45136173, 1.28939698, 4.35942058, 0.66904552, 0.95674923, + -2.45438937, -1.11429881, -0.17961012, 2.76715848, -1.29658591, + -2.22816691, 1.13269136, -0.48017382, 1.48562248, -0.23812278, + -2.51664399, -1.2703721 , -1.4683091 , -2.49907536, -1.30026484, + -2.32398309, 1.39380683, -0.89495368, -1.23735395, 0.58485202, + -1.25435224, -1.4968484 , -1.71888038, -1.45756652, -2.01906267, + 3.41729922, -0.17335867, -2.22921828, -1.57470549]) + np.testing.assert_allclose(results.resid_deviance, + [ 3.70529668, -0.74027329, -2.37536322, -0.19586751, -1.11349765, + -2.22466106, 2.46246446, 0.4928057 , 1.80799655, 8.02696525, + -0.73602255, -1.25021555, 2.10699958, 3.05084608, -2.22214376, + -1.45072221, 1.28913747, 4.35106213, 0.6689982 , 0.95669662, + -2.45171913, -1.11410444, -0.17960956, 2.76494217, -1.29609865, + -2.22612429, 1.13247453, 
-0.48015254, 1.48508549, -0.23812 , + -2.51476072, -1.27015583, -1.46777697, -2.49699318, -1.29992892, + -2.32263069, 1.39348459, -0.89482132, -1.23715363, 0.58483655, + -1.25415329, -1.49653039, -1.7181055 , -1.45719072, -2.01791949, + 3.41437156, -0.1733581 , -2.22765605, -1.57426046]) + np.testing.assert_allclose(results.null, + [ 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, + 38.42857143, 38.42857143, 38.42857143, 38.42857143, 38.42857143]) + self.assertAlmostEqual(results.D2, .388656011675) + self.assertAlmostEqual(results.adj_D2, 0.36207583826952761) + +class TestBinomial(unittest.TestCase): + + def setUp(self): + #London house price data + #y: 'BATH2' + y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, + 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + self.y = y.reshape((316,1)) + #X: 'FLOORSZ' + X = np.array([ 77, 75, 64, 95, 107, 100, 81, 151, 98, 260, 171, 161, 91, + 80, 50, 85, 52, 69, 60, 84, 155, 97, 69, 126, 90, 43, + 51, 41, 140, 80, 52, 86, 66, 60, 40, 155, 138, 97, 115, + 148, 206, 60, 53, 96, 88, 160, 31, 43, 154, 60, 131, 60, + 46, 61, 125, 150, 76, 92, 96, 100, 105, 72, 48, 41, 72, + 65, 60, 65, 98, 33, 144, 111, 91, 108, 38, 48, 95, 63, + 98, 129, 108, 51, 131, 66, 48, 127, 76, 68, 52, 64, 57, + 121, 67, 76, 112, 96, 90, 53, 93, 64, 97, 58, 44, 157, + 53, 70, 71, 167, 47, 70, 96, 77, 75, 71, 67, 47, 71, + 90, 69, 64, 65, 95, 60, 60, 65, 54, 121, 105, 50, 85, + 69, 69, 62, 65, 93, 93, 70, 62, 155, 68, 117, 80, 80, + 75, 98, 114, 86, 70, 50, 51, 163, 124, 59, 95, 51, 63, + 85, 53, 46, 102, 114, 83, 47, 40, 63, 123, 100, 63, 110, + 79, 98, 99, 120, 52, 48, 37, 81, 30, 88, 50, 35, 116, + 67, 45, 80, 86, 109, 59, 75, 60, 71, 141, 121, 50, 168, + 90, 51, 133, 75, 133, 127, 37, 68, 105, 61, 123, 151, 110, + 77, 220, 94, 77, 70, 100, 98, 126, 55, 105, 60, 176, 104, + 68, 62, 70, 48, 102, 80, 97, 66, 80, 102, 160, 55, 60, + 71, 125, 85, 85, 190, 137, 48, 41, 42, 51, 57, 60, 114, + 88, 84, 108, 66, 85, 42, 98, 90, 127, 100, 55, 76, 82, + 63, 80, 71, 76, 121, 109, 92, 160, 109, 185, 100, 90, 90, + 86, 88, 95, 116, 135, 61, 74, 60, 235, 76, 66, 100, 49, + 50, 37, 100, 88, 90, 52, 95, 81, 79, 96, 75, 91, 86, + 83, 180, 108, 80, 96, 49, 117, 117, 86, 46, 66, 95, 57, + 120, 137, 68, 240]) + self.X = X.reshape((316,1)) + + def testIWLS(self): + model = GLM(self.y, self.X, family=Binomial()) + results = model.fit() + self.assertEqual(results.n, 316) + self.assertEqual(results.df_model, 1) + self.assertEqual(results.df_resid, 314) + self.assertEqual(results.aic, 155.19347530342466) + self.assertEqual(results.bic, 
-1656.1095797628657) + self.assertEqual(results.deviance, 151.19347530342466) + self.assertEqual(results.llf, -75.596737651712331) + self.assertEqual(results.null_deviance, 189.16038985881212) + self.assertEqual(results.scale, 1.0) + np.testing.assert_allclose(results.params, [-5.33638276, 0.0287754 ]) + np.testing.assert_allclose(results.bse, [ 0.64499904, 0.00518312], + atol=1.0e-8) + np.testing.assert_allclose(results.cov_params(), + [[ 4.16023762e-01, -3.14338457e-03], + [ -3.14338457e-03, 2.68646833e-05]]) + np.testing.assert_allclose(results.tvalues, [-8.27347396, 5.55175826]) + np.testing.assert_allclose(results.pvalues, [ 1.30111233e-16, + 2.82810512e-08]) + np.testing.assert_allclose(results.conf_int(), + [[-6.60055765, -4.07220787], + [ 0.01861668, 0.03893412]], atol=1.0e-8) + np.testing.assert_allclose(results.normalized_cov_params, + [[ 4.16023762e-01, -3.14338457e-03], + [ -3.14338457e-03, 2.68646833e-05]]) + np.testing.assert_allclose(results.mu, + [ 0.04226237, 0.03999333, 0.02946178, 0.0689636 , 0.09471181, + 0.07879431, 0.04717464, 0.27065598, 0.07471691, 0.89522144, + 0.39752487, 0.33102718, 0.06192993, 0.04589793, 0.01988679, + 0.0526265 , 0.02104007, 0.03386636, 0.02634295, 0.05121018, + 0.29396682, 0.07275173, 0.03386636, 0.15307528, 0.06027915, + 0.01631789, 0.02045547, 0.01541937, 0.2128508 , 0.04589793, + 0.02104007, 0.05407977, 0.0311527 , 0.02634295, 0.01498855, + 0.29396682, 0.20336776, 0.07275173, 0.11637537, 0.25395607, + 0.64367488, 0.02634295, 0.02164101, 0.07083428, 0.05710047, + 0.32468619, 0.01160845, 0.01631789, 0.28803008, 0.02634295, + 0.17267234, 0.02634295, 0.01776301, 0.02709115, 0.14938186, + 0.26501331, 0.04111287, 0.06362285, 0.07083428, 0.07879431, + 0.08989109, 0.03680743, 0.0187955 , 0.01541937, 0.03680743, + 0.03029581, 0.02634295, 0.03029581, 0.07471691, 0.01228768, + 0.23277197, 0.10505173, 0.06192993, 0.09720799, 0.01416217, + 0.0187955 , 0.0689636 , 0.02865003, 0.07471691, 0.16460503, + 0.09720799, 0.02045547, 
0.17267234, 0.0311527 , 0.0187955 , + 0.15684317, 0.04111287, 0.03293737, 0.02104007, 0.02946178, + 0.02421701, 0.1353385 , 0.03203302, 0.04111287, 0.10778798, + 0.07083428, 0.06027915, 0.02164101, 0.06535882, 0.02946178, + 0.07275173, 0.02490638, 0.01678627, 0.30605146, 0.02164101, + 0.03482061, 0.03580075, 0.37030921, 0.0182721 , 0.03482061, + 0.07083428, 0.04226237, 0.03999333, 0.03580075, 0.03203302, + 0.0182721 , 0.03580075, 0.06027915, 0.03386636, 0.02946178, + 0.03029581, 0.0689636 , 0.02634295, 0.02634295, 0.03029581, + 0.02225873, 0.1353385 , 0.08989109, 0.01988679, 0.0526265 , + 0.03386636, 0.03386636, 0.02786 , 0.03029581, 0.06535882, + 0.06535882, 0.03482061, 0.02786 , 0.29396682, 0.03293737, + 0.12242534, 0.04589793, 0.04589793, 0.03999333, 0.07471691, + 0.11344884, 0.05407977, 0.03482061, 0.01988679, 0.02045547, + 0.34389327, 0.14576223, 0.02561486, 0.0689636 , 0.02045547, + 0.02865003, 0.0526265 , 0.02164101, 0.01776301, 0.08307425, + 0.11344884, 0.04982997, 0.0182721 , 0.01498855, 0.02865003, + 0.14221564, 0.07879431, 0.02865003, 0.10237696, 0.04465416, + 0.07471691, 0.07673078, 0.13200634, 0.02104007, 0.0187955 , + 0.01376599, 0.04717464, 0.01128289, 0.05710047, 0.01988679, + 0.01300612, 0.11936722, 0.03203302, 0.01726786, 0.04589793, + 0.05407977, 0.09976271, 0.02561486, 0.03999333, 0.02634295, + 0.03580075, 0.21771181, 0.1353385 , 0.01988679, 0.37704374, + 0.06027915, 0.02045547, 0.18104935, 0.03999333, 0.18104935, + 0.15684317, 0.01376599, 0.03293737, 0.08989109, 0.02709115, + 0.14221564, 0.27065598, 0.10237696, 0.04226237, 0.72991785, + 0.06713876, 0.04226237, 0.03482061, 0.07879431, 0.07471691, + 0.15307528, 0.02289366, 0.08989109, 0.02634295, 0.43243779, + 0.08756457, 0.03293737, 0.02786 , 0.03482061, 0.0187955 , + 0.08307425, 0.04589793, 0.07275173, 0.0311527 , 0.04589793, + 0.08307425, 0.32468619, 0.02289366, 0.02634295, 0.03580075, + 0.14938186, 0.0526265 , 0.0526265 , 0.53268924, 0.19874565, + 0.0187955 , 0.01541937, 0.01586237, 
0.02045547, 0.02421701, + 0.02634295, 0.11344884, 0.05710047, 0.05121018, 0.09720799, + 0.0311527 , 0.0526265 , 0.01586237, 0.07471691, 0.06027915, + 0.15684317, 0.07879431, 0.02289366, 0.04111287, 0.04848506, + 0.02865003, 0.04589793, 0.03580075, 0.04111287, 0.1353385 , + 0.09976271, 0.06362285, 0.32468619, 0.09976271, 0.49676673, + 0.07879431, 0.06027915, 0.06027915, 0.05407977, 0.05710047, + 0.0689636 , 0.11936722, 0.18973955, 0.02709115, 0.03890304, + 0.02634295, 0.80625182, 0.04111287, 0.0311527 , 0.07879431, + 0.0193336 , 0.01988679, 0.01376599, 0.07879431, 0.05710047, + 0.06027915, 0.02104007, 0.0689636 , 0.04717464, 0.04465416, + 0.07083428, 0.03999333, 0.06192993, 0.05407977, 0.04982997, + 0.46087756, 0.09720799, 0.04589793, 0.07083428, 0.0193336 , + 0.12242534, 0.12242534, 0.05407977, 0.01776301, 0.0311527 , + 0.0689636 , 0.02421701, 0.13200634, 0.19874565, 0.03293737, + 0.82774282], atol=1.0e-8) + self.assertAlmostEqual(results.pearson_chi2, 271.21110541713801) + np.testing.assert_allclose(results.resid_response, + [-0.04226237, -0.03999333, -0.02946178, -0.0689636 , -0.09471181, + -0.07879431, -0.04717464, -0.27065598, -0.07471691, 0.10477856, + -0.39752487, 0.66897282, -0.06192993, -0.04589793, -0.01988679, + -0.0526265 , -0.02104007, -0.03386636, -0.02634295, -0.05121018, + -0.29396682, 0.92724827, -0.03386636, -0.15307528, -0.06027915, + -0.01631789, -0.02045547, -0.01541937, -0.2128508 , -0.04589793, + -0.02104007, -0.05407977, -0.0311527 , -0.02634295, -0.01498855, + -0.29396682, 0.79663224, -0.07275173, -0.11637537, 0.74604393, + -0.64367488, -0.02634295, -0.02164101, -0.07083428, -0.05710047, + -0.32468619, -0.01160845, -0.01631789, -0.28803008, -0.02634295, + -0.17267234, -0.02634295, -0.01776301, -0.02709115, 0.85061814, + 0.73498669, -0.04111287, -0.06362285, -0.07083428, -0.07879431, + 0.91010891, -0.03680743, -0.0187955 , -0.01541937, -0.03680743, + -0.03029581, -0.02634295, -0.03029581, -0.07471691, -0.01228768, + 0.76722803, -0.10505173, 
-0.06192993, -0.09720799, -0.01416217, + -0.0187955 , -0.0689636 , -0.02865003, -0.07471691, -0.16460503, + -0.09720799, -0.02045547, 0.82732766, -0.0311527 , -0.0187955 , + -0.15684317, -0.04111287, -0.03293737, -0.02104007, -0.02946178, + -0.02421701, -0.1353385 , -0.03203302, -0.04111287, -0.10778798, + -0.07083428, -0.06027915, -0.02164101, -0.06535882, -0.02946178, + -0.07275173, -0.02490638, -0.01678627, -0.30605146, -0.02164101, + -0.03482061, -0.03580075, 0.62969079, -0.0182721 , -0.03482061, + -0.07083428, -0.04226237, -0.03999333, -0.03580075, -0.03203302, + -0.0182721 , -0.03580075, -0.06027915, -0.03386636, -0.02946178, + -0.03029581, -0.0689636 , -0.02634295, -0.02634295, -0.03029581, + -0.02225873, -0.1353385 , -0.08989109, -0.01988679, -0.0526265 , + -0.03386636, -0.03386636, -0.02786 , -0.03029581, -0.06535882, + -0.06535882, -0.03482061, -0.02786 , -0.29396682, -0.03293737, + -0.12242534, -0.04589793, -0.04589793, -0.03999333, -0.07471691, + -0.11344884, -0.05407977, -0.03482061, -0.01988679, -0.02045547, + 0.65610673, 0.85423777, -0.02561486, -0.0689636 , -0.02045547, + -0.02865003, -0.0526265 , -0.02164101, -0.01776301, -0.08307425, + -0.11344884, -0.04982997, -0.0182721 , -0.01498855, -0.02865003, + -0.14221564, -0.07879431, -0.02865003, -0.10237696, -0.04465416, + -0.07471691, -0.07673078, -0.13200634, -0.02104007, -0.0187955 , + -0.01376599, -0.04717464, -0.01128289, 0.94289953, -0.01988679, + -0.01300612, -0.11936722, -0.03203302, -0.01726786, -0.04589793, + -0.05407977, -0.09976271, -0.02561486, -0.03999333, -0.02634295, + -0.03580075, -0.21771181, 0.8646615 , -0.01988679, 0.62295626, + -0.06027915, -0.02045547, -0.18104935, 0.96000667, -0.18104935, + -0.15684317, -0.01376599, -0.03293737, -0.08989109, -0.02709115, + -0.14221564, 0.72934402, -0.10237696, -0.04226237, -0.72991785, + -0.06713876, -0.04226237, -0.03482061, -0.07879431, -0.07471691, + -0.15307528, 0.97710634, 0.91010891, -0.02634295, -0.43243779, + -0.08756457, -0.03293737, 
-0.02786 , -0.03482061, -0.0187955 , + 0.91692575, -0.04589793, -0.07275173, -0.0311527 , -0.04589793, + -0.08307425, 0.67531381, -0.02289366, -0.02634295, -0.03580075, + -0.14938186, -0.0526265 , -0.0526265 , 0.46731076, -0.19874565, + -0.0187955 , -0.01541937, -0.01586237, -0.02045547, -0.02421701, + -0.02634295, -0.11344884, -0.05710047, -0.05121018, -0.09720799, + 0.9688473 , -0.0526265 , -0.01586237, -0.07471691, -0.06027915, + -0.15684317, -0.07879431, -0.02289366, -0.04111287, -0.04848506, + -0.02865003, -0.04589793, -0.03580075, -0.04111287, -0.1353385 , + -0.09976271, -0.06362285, 0.67531381, -0.09976271, -0.49676673, + -0.07879431, -0.06027915, -0.06027915, -0.05407977, -0.05710047, + -0.0689636 , -0.11936722, -0.18973955, -0.02709115, -0.03890304, + -0.02634295, 0.19374818, -0.04111287, -0.0311527 , -0.07879431, + -0.0193336 , -0.01988679, -0.01376599, -0.07879431, 0.94289953, + -0.06027915, -0.02104007, -0.0689636 , -0.04717464, -0.04465416, + 0.92916572, -0.03999333, -0.06192993, -0.05407977, -0.04982997, + -0.46087756, -0.09720799, -0.04589793, -0.07083428, -0.0193336 , + -0.12242534, -0.12242534, -0.05407977, -0.01776301, -0.0311527 , + -0.0689636 , -0.02421701, -0.13200634, -0.19874565, -0.03293737, + -0.82774282], atol=1.0e-8) + np.testing.assert_allclose(results.resid_working, + [ -1.71062283e-03, -1.53549840e-03, -8.42423701e-04, + -4.42798906e-03, -8.12073047e-03, -5.71934606e-03, + -2.12046213e-03, -5.34278480e-02, -5.16550074e-03, + 9.82823035e-03, -9.52067472e-02, 1.48142818e-01, + -3.59779501e-03, -2.00993083e-03, -3.87619325e-04, + -2.62379729e-03, -4.33370579e-04, -1.10808799e-03, + -6.75670103e-04, -2.48818484e-03, -6.10129090e-02, + 6.25511612e-02, -1.10808799e-03, -1.98451739e-02, + -3.41454749e-03, -2.61928659e-04, -4.09867263e-04, + -2.34090923e-04, -3.56621577e-02, -2.00993083e-03, + -4.33370579e-04, -2.76645832e-03, -9.40257152e-04, + -6.75670103e-04, -2.21289369e-04, -6.10129090e-02, + 1.29061842e-01, -4.90775251e-03, 
-1.19671283e-02, + 1.41347263e-01, -1.47631680e-01, -6.75670103e-04, + -4.58198217e-04, -4.66208406e-03, -3.07429001e-03, + -7.11923401e-02, -1.33191898e-04, -2.61928659e-04, + -5.90659690e-02, -6.75670103e-04, -2.46673839e-02, + -6.75670103e-04, -3.09919962e-04, -7.14047519e-04, + 1.08085429e-01, 1.43161630e-01, -1.62077632e-03, + -3.79032977e-03, -4.66208406e-03, -5.71934606e-03, + 7.44566288e-02, -1.30492035e-03, -3.46630910e-04, + -2.34090923e-04, -1.30492035e-03, -8.90029618e-04, + -6.75670103e-04, -8.90029618e-04, -5.16550074e-03, + -1.49131762e-04, 1.37018624e-01, -9.87652847e-03, + -3.59779501e-03, -8.53083698e-03, -1.97726627e-04, + -3.46630910e-04, -4.42798906e-03, -7.97307494e-04, + -5.16550074e-03, -2.26348718e-02, -8.53083698e-03, + -4.09867263e-04, 1.18189219e-01, -9.40257152e-04, + -3.46630910e-04, -2.07414715e-02, -1.62077632e-03, + -1.04913757e-03, -4.33370579e-04, -8.42423701e-04, + -5.72261321e-04, -1.58375811e-02, -9.93244730e-04, + -1.62077632e-03, -1.03659408e-02, -4.66208406e-03, + -3.41454749e-03, -4.58198217e-04, -3.99257703e-03, + -8.42423701e-04, -4.90775251e-03, -6.04877746e-04, + -2.77048947e-04, -6.50004229e-02, -4.58198217e-04, + -1.17025566e-03, -1.23580799e-03, 1.46831486e-01, + -3.27769165e-04, -1.17025566e-03, -4.66208406e-03, + -1.71062283e-03, -1.53549840e-03, -1.23580799e-03, + -9.93244730e-04, -3.27769165e-04, -1.23580799e-03, + -3.41454749e-03, -1.10808799e-03, -8.42423701e-04, + -8.90029618e-04, -4.42798906e-03, -6.75670103e-04, + -6.75670103e-04, -8.90029618e-04, -4.84422741e-04, + -1.58375811e-02, -7.35405096e-03, -3.87619325e-04, + -2.62379729e-03, -1.10808799e-03, -1.10808799e-03, + -7.54555329e-04, -8.90029618e-04, -3.99257703e-03, + -3.99257703e-03, -1.17025566e-03, -7.54555329e-04, + -6.10129090e-02, -1.04913757e-03, -1.31530576e-02, + -2.00993083e-03, -2.00993083e-03, -1.53549840e-03, + -5.16550074e-03, -1.14104800e-02, -2.76645832e-03, + -1.17025566e-03, -3.87619325e-04, -4.09867263e-04, + 1.48037813e-01, 
1.06365931e-01, -6.39314594e-04, + -4.42798906e-03, -4.09867263e-04, -7.97307494e-04, + -2.62379729e-03, -4.58198217e-04, -3.09919962e-04, + -6.32800839e-03, -1.14104800e-02, -2.35929680e-03, + -3.27769165e-04, -2.21289369e-04, -7.97307494e-04, + -1.73489362e-02, -5.71934606e-03, -7.97307494e-04, + -9.40802551e-03, -1.90495384e-03, -5.16550074e-03, + -5.43585191e-03, -1.51253748e-02, -4.33370579e-04, + -3.46630910e-04, -1.86893696e-04, -2.12046213e-03, + -1.25867293e-04, 5.07657192e-02, -3.87619325e-04, + -1.66959104e-04, -1.25477263e-02, -9.93244730e-04, + -2.93030065e-04, -2.00993083e-03, -2.76645832e-03, + -8.95970087e-03, -6.39314594e-04, -1.53549840e-03, + -6.75670103e-04, -1.23580799e-03, -3.70792339e-02, + 1.01184411e-01, -3.87619325e-04, 1.46321062e-01, + -3.41454749e-03, -4.09867263e-04, -2.68442736e-02, + 3.68583645e-02, -2.68442736e-02, -2.07414715e-02, + -1.86893696e-04, -1.04913757e-03, -7.35405096e-03, + -7.14047519e-04, -1.73489362e-02, 1.43973473e-01, + -9.40802551e-03, -1.71062283e-03, -1.43894386e-01, + -4.20497779e-03, -1.71062283e-03, -1.17025566e-03, + -5.71934606e-03, -5.16550074e-03, -1.98451739e-02, + 2.18574168e-02, 7.44566288e-02, -6.75670103e-04, + -1.06135519e-01, -6.99614755e-03, -1.04913757e-03, + -7.54555329e-04, -1.17025566e-03, -3.46630910e-04, + 6.98449121e-02, -2.00993083e-03, -4.90775251e-03, + -9.40257152e-04, -2.00993083e-03, -6.32800839e-03, + 1.48072729e-01, -5.12120512e-04, -6.75670103e-04, + -1.23580799e-03, -1.89814939e-02, -2.62379729e-03, + -2.62379729e-03, 1.16328328e-01, -3.16494123e-02, + -3.46630910e-04, -2.34090923e-04, -2.47623705e-04, + -4.09867263e-04, -5.72261321e-04, -6.75670103e-04, + -1.14104800e-02, -3.07429001e-03, -2.48818484e-03, + -8.53083698e-03, 2.92419496e-02, -2.62379729e-03, + -2.47623705e-04, -5.16550074e-03, -3.41454749e-03, + -2.07414715e-02, -5.71934606e-03, -5.12120512e-04, + -1.62077632e-03, -2.23682205e-03, -7.97307494e-04, + -2.00993083e-03, -1.23580799e-03, -1.62077632e-03, + 
-1.58375811e-02, -8.95970087e-03, -3.79032977e-03, + 1.48072729e-01, -8.95970087e-03, -1.24186489e-01, + -5.71934606e-03, -3.41454749e-03, -3.41454749e-03, + -2.76645832e-03, -3.07429001e-03, -4.42798906e-03, + -1.25477263e-02, -2.91702648e-02, -7.14047519e-04, + -1.45456868e-03, -6.75670103e-04, 3.02653681e-02, + -1.62077632e-03, -9.40257152e-04, -5.71934606e-03, + -3.66561274e-04, -3.87619325e-04, -1.86893696e-04, + -5.71934606e-03, 5.07657192e-02, -3.41454749e-03, + -4.33370579e-04, -4.42798906e-03, -2.12046213e-03, + -1.90495384e-03, 6.11546973e-02, -1.53549840e-03, + -3.59779501e-03, -2.76645832e-03, -2.35929680e-03, + -1.14513988e-01, -8.53083698e-03, -2.00993083e-03, + -4.66208406e-03, -3.66561274e-04, -1.31530576e-02, + -1.31530576e-02, -2.76645832e-03, -3.09919962e-04, + -9.40257152e-04, -4.42798906e-03, -5.72261321e-04, + -1.51253748e-02, -3.16494123e-02, -1.04913757e-03, + -1.18023417e-01]) + np.testing.assert_allclose(results.resid_pearson, + [-0.21006498, -0.20410641, -0.17423009, -0.27216147, -0.3234511 , + -0.29246179, -0.22250903, -0.60917574, -0.28416602, 0.3421141 , + -0.81229277, 1.42158361, -0.25694055, -0.21933056, -0.142444 , + -0.23569027, -0.14660243, -0.18722578, -0.16448609, -0.2323235 , + -0.64526275, 3.57006696, -0.18722578, -0.42513819, -0.25327023, + -0.12879668, -0.14450826, -0.12514332, -0.5200069 , -0.21933056, + -0.14660243, -0.23910582, -0.17931646, -0.16448609, -0.12335569, + -0.64526275, 1.97919183, -0.28010679, -0.36290807, 1.71396874, + -1.3440334 , -0.16448609, -0.14872695, -0.27610555, -0.24608613, + -0.69339243, -0.1083734 , -0.12879668, -0.63604537, -0.16448609, + -0.45684893, -0.16448609, -0.13447767, -0.16686977, 2.3862634 , + 1.66535145, -0.20706426, -0.26066405, -0.27610555, -0.29246179, + 3.18191348, -0.19548397, -0.13840353, -0.12514332, -0.19548397, + -0.17675498, -0.16448609, -0.17675498, -0.28416602, -0.11153719, + 1.81550268, -0.34261205, -0.25694055, -0.32813846, -0.11985666, + -0.13840353, -0.27216147, 
-0.17174127, -0.28416602, -0.44389026, + -0.32813846, -0.14450826, 2.18890738, -0.17931646, -0.13840353, + -0.43129917, -0.20706426, -0.18455132, -0.14660243, -0.17423009, + -0.1575374 , -0.39562855, -0.18191506, -0.20706426, -0.34757708, + -0.27610555, -0.25327023, -0.14872695, -0.26444152, -0.17423009, + -0.28010679, -0.15982038, -0.13066317, -0.66410018, -0.14872695, + -0.189939 , -0.19269154, 1.30401147, -0.13642648, -0.189939 , + -0.27610555, -0.21006498, -0.20410641, -0.19269154, -0.18191506, + -0.13642648, -0.19269154, -0.25327023, -0.18722578, -0.17423009, + -0.17675498, -0.27216147, -0.16448609, -0.16448609, -0.17675498, + -0.15088226, -0.39562855, -0.3142763 , -0.142444 , -0.23569027, + -0.18722578, -0.18722578, -0.169288 , -0.17675498, -0.26444152, + -0.26444152, -0.189939 , -0.169288 , -0.64526275, -0.18455132, + -0.3735026 , -0.21933056, -0.21933056, -0.20410641, -0.28416602, + -0.35772404, -0.23910582, -0.189939 , -0.142444 , -0.14450826, + 1.38125991, 2.42084442, -0.16213645, -0.27216147, -0.14450826, + -0.17174127, -0.23569027, -0.14872695, -0.13447767, -0.30099975, + -0.35772404, -0.22900483, -0.13642648, -0.12335569, -0.17174127, + -0.4071783 , -0.29246179, -0.17174127, -0.33771794, -0.21619749, + -0.28416602, -0.28828407, -0.38997712, -0.14660243, -0.13840353, + -0.11814455, -0.22250903, -0.10682532, 4.06361781, -0.142444 , + -0.11479334, -0.36816723, -0.18191506, -0.1325567 , -0.21933056, + -0.23910582, -0.33289374, -0.16213645, -0.20410641, -0.16448609, + -0.19269154, -0.52754269, 2.52762346, -0.142444 , 1.28538406, + -0.25327023, -0.14450826, -0.47018591, 4.89940505, -0.47018591, + -0.43129917, -0.11814455, -0.18455132, -0.3142763 , -0.16686977, + -0.4071783 , 1.64156241, -0.33771794, -0.21006498, -1.6439517 , + -0.26827373, -0.21006498, -0.189939 , -0.29246179, -0.28416602, + -0.42513819, 6.53301013, 3.18191348, -0.16448609, -0.87288109, + -0.30978696, -0.18455132, -0.169288 , -0.189939 , -0.13840353, + 3.32226189, -0.21933056, -0.28010679, 
-0.17931646, -0.21933056, + -0.30099975, 1.44218477, -0.1530688 , -0.16448609, -0.19269154, + -0.41906522, -0.23569027, -0.23569027, 0.93662539, -0.4980393 , + -0.13840353, -0.12514332, -0.12695686, -0.14450826, -0.1575374 , + -0.16448609, -0.35772404, -0.24608613, -0.2323235 , -0.32813846, + 5.57673284, -0.23569027, -0.12695686, -0.28416602, -0.25327023, + -0.43129917, -0.29246179, -0.1530688 , -0.20706426, -0.22573357, + -0.17174127, -0.21933056, -0.19269154, -0.20706426, -0.39562855, + -0.33289374, -0.26066405, 1.44218477, -0.33289374, -0.99355423, + -0.29246179, -0.25327023, -0.25327023, -0.23910582, -0.24608613, + -0.27216147, -0.36816723, -0.48391225, -0.16686977, -0.20119082, + -0.16448609, 0.49021146, -0.20706426, -0.17931646, -0.29246179, + -0.14040923, -0.142444 , -0.11814455, -0.29246179, 4.06361781, + -0.25327023, -0.14660243, -0.27216147, -0.22250903, -0.21619749, + 3.6218033 , -0.20410641, -0.25694055, -0.23910582, -0.22900483, + -0.92458976, -0.32813846, -0.21933056, -0.27610555, -0.14040923, + -0.3735026 , -0.3735026 , -0.23910582, -0.13447767, -0.17931646, + -0.27216147, -0.1575374 , -0.38997712, -0.4980393 , -0.18455132, + -2.19209332]) + np.testing.assert_allclose(results.resid_anscombe, + [-0.31237627, -0.3036605 , -0.25978208, -0.40240831, -0.47552289, + -0.43149255, -0.33053793, -0.85617194, -0.41962951, 0.50181328, + -1.0954382 , 1.66940149, -0.38048321, -0.3259044 , -0.21280762, + -0.34971301, -0.21896842, -0.27890356, -0.2454118 , -0.34482158, + -0.90063409, 2.80452413, -0.27890356, -0.61652596, -0.37518169, + -0.19255932, -0.2158664 , -0.18713159, -0.74270558, -0.3259044 , + -0.21896842, -0.35467084, -0.2672722 , -0.2454118 , -0.18447466, + -0.90063409, 2.05763941, -0.41381347, -0.53089521, 1.88552083, + -1.60654218, -0.2454118 , -0.22211425, -0.40807333, -0.3647888 , + -0.95861559, -0.16218047, -0.19255932, -0.88935802, -0.2454118 , + -0.65930821, -0.2454118 , -0.20099345, -0.24892975, 2.28774016, + 1.85167195, -0.30798858, -0.38585584, 
-0.40807333, -0.43149255, + 2.65398426, -0.2910267 , -0.20681747, -0.18713159, -0.2910267 , + -0.26350118, -0.2454118 , -0.26350118, -0.41962951, -0.16689207, + 1.95381191, -0.50251231, -0.38048321, -0.48214234, -0.17927213, + -0.20681747, -0.40240831, -0.25611424, -0.41962951, -0.64189694, + -0.48214234, -0.2158664 , 2.18071204, -0.2672722 , -0.20681747, + -0.62488429, -0.30798858, -0.27497271, -0.21896842, -0.25978208, + -0.23514749, -0.57618899, -0.27109582, -0.30798858, -0.50947546, + -0.40807333, -0.37518169, -0.22211425, -0.39130036, -0.25978208, + -0.41381347, -0.2385213 , -0.19533116, -0.92350689, -0.22211425, + -0.28288904, -0.28692985, 1.5730846 , -0.20388497, -0.28288904, + -0.40807333, -0.31237627, -0.3036605 , -0.28692985, -0.27109582, + -0.20388497, -0.28692985, -0.37518169, -0.27890356, -0.25978208, + -0.26350118, -0.40240831, -0.2454118 , -0.2454118 , -0.26350118, + -0.22530448, -0.57618899, -0.46253505, -0.21280762, -0.34971301, + -0.27890356, -0.27890356, -0.25249702, -0.26350118, -0.39130036, + -0.39130036, -0.28288904, -0.25249702, -0.90063409, -0.27497271, + -0.5456246 , -0.3259044 , -0.3259044 , -0.3036605 , -0.41962951, + -0.52366614, -0.35467084, -0.28288904, -0.21280762, -0.2158664 , + 1.63703418, 2.30570989, -0.24194253, -0.40240831, -0.2158664 , + -0.25611424, -0.34971301, -0.22211425, -0.20099345, -0.44366892, + -0.52366614, -0.33999576, -0.20388497, -0.18447466, -0.25611424, + -0.59203547, -0.43149255, -0.25611424, -0.49563627, -0.32133344, + -0.41962951, -0.42552227, -0.56840788, -0.21896842, -0.20681747, + -0.17672552, -0.33053793, -0.15987433, 2.9768074 , -0.21280762, + -0.17173916, -0.53821445, -0.27109582, -0.19814236, -0.3259044 , + -0.35467084, -0.48884654, -0.24194253, -0.3036605 , -0.2454118 , + -0.28692985, -0.75249089, 2.35983933, -0.21280762, 1.55726719, + -0.37518169, -0.2158664 , -0.67712261, 3.23165236, -0.67712261, + -0.62488429, -0.17672552, -0.27497271, -0.46253505, -0.24892975, + -0.59203547, 1.83482464, -0.49563627, 
-0.31237627, -1.83652534, + -0.39681759, -0.31237627, -0.28288904, -0.43149255, -0.41962951, + -0.61652596, 3.63983609, 2.65398426, -0.2454118 , -1.16171662, + -0.45616505, -0.27497271, -0.25249702, -0.28288904, -0.20681747, + 2.71015945, -0.3259044 , -0.41381347, -0.2672722 , -0.3259044 , + -0.44366892, 1.68567947, -0.22853969, -0.2454118 , -0.28692985, + -0.60826548, -0.34971301, -0.34971301, 1.2290223 , -0.71397735, + -0.20681747, -0.18713159, -0.1898263 , -0.2158664 , -0.23514749, + -0.2454118 , -0.52366614, -0.3647888 , -0.34482158, -0.48214234, + 3.41271513, -0.34971301, -0.1898263 , -0.41962951, -0.37518169, + -0.62488429, -0.43149255, -0.22853969, -0.30798858, -0.3352348 , + -0.25611424, -0.3259044 , -0.28692985, -0.30798858, -0.57618899, + -0.48884654, -0.38585584, 1.68567947, -0.48884654, -1.28709718, + -0.43149255, -0.37518169, -0.37518169, -0.35467084, -0.3647888 , + -0.40240831, -0.53821445, -0.69534436, -0.24892975, -0.29939131, + -0.2454118 , 0.70366797, -0.30798858, -0.2672722 , -0.43149255, + -0.2097915 , -0.21280762, -0.17672552, -0.43149255, 2.9768074 , + -0.37518169, -0.21896842, -0.40240831, -0.33053793, -0.32133344, + 2.82351017, -0.3036605 , -0.38048321, -0.35467084, -0.33999576, + -1.21650102, -0.48214234, -0.3259044 , -0.40807333, -0.2097915 , + -0.5456246 , -0.5456246 , -0.35467084, -0.20099345, -0.2672722 , + -0.40240831, -0.23514749, -0.56840788, -0.71397735, -0.27497271, + -2.18250381]) + np.testing.assert_allclose(results.resid_deviance, + [-0.29387552, -0.2857098 , -0.24455876, -0.37803944, -0.44609851, + -0.40514674, -0.31088148, -0.79449324, -0.39409528, 0.47049798, + -1.00668653, 1.48698001, -0.35757692, -0.30654405, -0.20043547, + -0.32882173, -0.20622595, -0.26249995, -0.23106769, -0.32424676, + -0.83437766, 2.28941155, -0.26249995, -0.57644334, -0.35262564, + -0.18139734, -0.20331052, -0.17629229, -0.69186337, -0.30654405, + -0.20622595, -0.33345774, -0.251588 , -0.23106769, -0.17379306, + -0.83437766, 1.78479093, -0.38867448, 
-0.4974393 , 1.65565332, + -1.43660134, -0.23106769, -0.20918228, -0.38332275, -0.34291558, + -0.88609006, -0.15281596, -0.18139734, -0.82428104, -0.23106769, + -0.61571821, -0.23106769, -0.18932865, -0.234371 , 1.94999969, + 1.62970871, -0.2897651 , -0.36259328, -0.38332275, -0.40514674, + 2.19506559, -0.27386827, -0.19480442, -0.17629229, -0.27386827, + -0.24804925, -0.23106769, -0.24804925, -0.39409528, -0.15725009, + 1.7074519 , -0.47114617, -0.35757692, -0.4522457 , -0.16889886, + -0.19480442, -0.37803944, -0.24111595, -0.39409528, -0.59975102, + -0.4522457 , -0.20331052, 1.87422489, -0.251588 , -0.19480442, + -0.5841272 , -0.2897651 , -0.25881274, -0.20622595, -0.24455876, + -0.22142749, -0.53929061, -0.25517563, -0.2897651 , -0.47760126, + -0.38332275, -0.35262564, -0.20918228, -0.36767536, -0.24455876, + -0.38867448, -0.2245965 , -0.18400413, -0.85481866, -0.20918228, + -0.26623785, -0.27002708, 1.40955093, -0.19204738, -0.26623785, + -0.38332275, -0.29387552, -0.2857098 , -0.27002708, -0.25517563, + -0.19204738, -0.27002708, -0.35262564, -0.26249995, -0.24455876, + -0.24804925, -0.37803944, -0.23106769, -0.23106769, -0.24804925, + -0.21218006, -0.53929061, -0.43402996, -0.20043547, -0.32882173, + -0.26249995, -0.26249995, -0.23772023, -0.24804925, -0.36767536, + -0.36767536, -0.26623785, -0.23772023, -0.83437766, -0.25881274, + -0.51106408, -0.30654405, -0.30654405, -0.2857098 , -0.39409528, + -0.49074728, -0.33345774, -0.26623785, -0.20043547, -0.20331052, + 1.46111186, 1.96253843, -0.22780971, -0.37803944, -0.20331052, + -0.24111595, -0.32882173, -0.20918228, -0.18932865, -0.41648237, + -0.49074728, -0.31973217, -0.19204738, -0.17379306, -0.24111595, + -0.55389988, -0.40514674, -0.24111595, -0.46476893, -0.30226435, + -0.39409528, -0.39958581, -0.53211065, -0.20622595, -0.19480442, + -0.16650295, -0.31088148, -0.15064545, 2.39288231, -0.20043547, + -0.16181126, -0.5042114 , -0.25517563, -0.18664773, -0.30654405, + -0.33345774, -0.45846897, -0.22780971, 
-0.2857098 , -0.23106769, + -0.27002708, -0.7007597 , 1.99998811, -0.20043547, 1.39670618, + -0.35262564, -0.20331052, -0.63203077, 2.53733821, -0.63203077, + -0.5841272 , -0.16650295, -0.25881274, -0.43402996, -0.234371 , + -0.55389988, 1.61672923, -0.46476893, -0.29387552, -1.61804148, + -0.37282386, -0.29387552, -0.26623785, -0.40514674, -0.39409528, + -0.57644334, 2.74841605, 2.19506559, -0.23106769, -1.06433539, + -0.42810736, -0.25881274, -0.23772023, -0.26623785, -0.19480442, + 2.23070414, -0.30654405, -0.38867448, -0.251588 , -0.30654405, + -0.41648237, 1.49993075, -0.21521982, -0.23106769, -0.27002708, + -0.5688444 , -0.32882173, -0.32882173, 1.12233423, -0.66569789, + -0.19480442, -0.17629229, -0.17882689, -0.20331052, -0.22142749, + -0.23106769, -0.49074728, -0.34291558, -0.32424676, -0.4522457 , + 2.63395309, -0.32882173, -0.17882689, -0.39409528, -0.35262564, + -0.5841272 , -0.40514674, -0.21521982, -0.2897651 , -0.3152773 , + -0.24111595, -0.30654405, -0.27002708, -0.2897651 , -0.53929061, + -0.45846897, -0.36259328, 1.49993075, -0.45846897, -1.17192274, + -0.40514674, -0.35262564, -0.35262564, -0.33345774, -0.34291558, + -0.37803944, -0.5042114 , -0.64869028, -0.234371 , -0.28170899, + -0.23106769, 0.65629132, -0.2897651 , -0.251588 , -0.40514674, + -0.19760028, -0.20043547, -0.16650295, -0.40514674, 2.39288231, + -0.35262564, -0.20622595, -0.37803944, -0.31088148, -0.30226435, + 2.30104857, -0.2857098 , -0.35757692, -0.33345774, -0.31973217, + -1.11158678, -0.4522457 , -0.30654405, -0.38332275, -0.19760028, + -0.51106408, -0.51106408, -0.33345774, -0.18932865, -0.251588 , + -0.37803944, -0.22142749, -0.53211065, -0.66569789, -0.25881274, + -1.87550882]) + np.testing.assert_allclose(results.null, + [ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 
0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 
0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759, + 0.08860759]) + self.assertAlmostEqual(results.D2, .200712816165) + self.assertAlmostEqual(results.adj_D2, 0.19816731557930456) + + + +if __name__ == '__main__': + unittest.main() diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/glm/utils.py b/release/python/0.7.0/crankshaft/crankshaft/regression/glm/utils.py new file 
try:
    from scipy.lib._version import NumpyVersion
except ImportError:
    import re

    try:
        # Python 2: ``basestring`` covers both ``str`` and ``unicode``.
        string_types = basestring
    except NameError:
        # Python 3 has no ``basestring``; ``str`` is the only text type.
        string_types = str

    class NumpyVersion(object):
        """Parse and compare numpy version strings.

        Numpy has the following versioning scheme (numbers given are
        examples; they can be >9 in principle):

        - Released version: '1.8.0', '1.8.1', etc.
        - Alpha: '1.8.0a1', '1.8.0a2', etc.
        - Beta: '1.8.0b1', '1.8.0b2', etc.
        - Release candidates: '1.8.0rc1', '1.8.0rc2', etc.
        - Development versions: '1.8.0.dev-f1234afa' (git commit hash
          appended)
        - Development versions after a1: '1.8.0a1.dev-f1234afa',
          '1.8.0b2.dev-f1234afa', '1.8.1rc1.dev-f1234afa', etc.
        - Development versions (no git hash available): '1.8.0.dev-Unknown'

        Comparing needs to be done against a valid version string or other
        `NumpyVersion` instance.

        Parameters
        ----------
        vstring : str
            Numpy version string (``np.__version__``).

        Raises
        ------
        ValueError
            If `vstring` does not start with ``major.minor.bugfix``, or if a
            comparison is attempted with something that is neither a string
            nor a `NumpyVersion`.

        Notes
        -----
        All dev versions of the same (pre-)release compare equal.

        Examples
        --------
        >>> from scipy.lib._version import NumpyVersion
        >>> if NumpyVersion(np.__version__) < '1.7.0':
        ...     print('skip')
        skip
        >>> NumpyVersion('1.7')  # raises ValueError, add ".0"
        """

        def __init__(self, vstring):
            self.vstring = vstring
            # Every component may have several digits ('10.2.1'), so the
            # major part is matched with ``+`` like the other two.
            ver_main = re.match(r'\d+[.]\d+[.]\d+', vstring)
            if not ver_main:
                raise ValueError("Not a valid numpy version string")

            self.version = ver_main.group()
            self.major, self.minor, self.bugfix = [
                int(x) for x in self.version.split('.')]
            if len(vstring) == ver_main.end():
                self.pre_release = 'final'
            else:
                # Whatever follows x.y.z decides the pre-release stage.
                suffix = vstring[ver_main.end():]
                alpha = re.match(r'a\d', suffix)
                beta = re.match(r'b\d', suffix)
                rc = re.match(r'rc\d', suffix)
                pre_rel = [m for m in [alpha, beta, rc] if m is not None]
                if pre_rel:
                    self.pre_release = pre_rel[0].group()
                else:
                    self.pre_release = ''

            # The dot is literal: only a real '.dev-' marker counts.
            self.is_devversion = bool(re.search(r'[.]dev-', vstring))

        def _compare_version(self, other):
            """Compare major.minor.bugfix; return -1, 0 or 1."""
            if self.major == other.major:
                if self.minor == other.minor:
                    if self.bugfix == other.bugfix:
                        vercmp = 0
                    elif self.bugfix > other.bugfix:
                        vercmp = 1
                    else:
                        vercmp = -1
                elif self.minor > other.minor:
                    vercmp = 1
                else:
                    vercmp = -1
            elif self.major > other.major:
                vercmp = 1
            else:
                vercmp = -1

            return vercmp

        def _compare_pre_release(self, other):
            """Compare alpha/beta/rc/final; return -1, 0 or 1."""
            if self.pre_release == other.pre_release:
                vercmp = 0
            elif self.pre_release == 'final':
                vercmp = 1
            elif other.pre_release == 'final':
                vercmp = -1
            elif self.pre_release > other.pre_release:
                # 'a1' < 'b1' < 'rc1' holds under plain string ordering.
                vercmp = 1
            else:
                vercmp = -1

            return vercmp

        def _compare(self, other):
            """Three-way comparison against a version string or instance."""
            if not isinstance(other, (string_types, NumpyVersion)):
                raise ValueError(
                    "Invalid object to compare with NumpyVersion.")

            if isinstance(other, string_types):
                other = NumpyVersion(other)

            vercmp = self._compare_version(other)
            if vercmp == 0:
                # Same x.y.z version, check for alpha/beta/rc
                vercmp = self._compare_pre_release(other)
                if vercmp == 0:
                    # Same version and same pre-release, check if dev version
                    if self.is_devversion is other.is_devversion:
                        vercmp = 0
                    elif self.is_devversion:
                        vercmp = -1
                    else:
                        vercmp = 1

            return vercmp

        def __lt__(self, other):
            return self._compare(other) < 0

        def __le__(self, other):
            return self._compare(other) <= 0

        def __eq__(self, other):
            return self._compare(other) == 0

        def __ne__(self, other):
            return self._compare(other) != 0

        def __gt__(self, other):
            return self._compare(other) > 0

        def __ge__(self, other):
            return self._compare(other) >= 0

        def __repr__(self):
            return "NumpyVersion(%s)" % self.vstring
greater than `tol`. + Parameters + ---------- + M : {(M,), (M, N)} array_like + array of <=2 dimensions + tol : {None, float}, optional + threshold below which SVD values are considered zero. If `tol` is + None, and ``S`` is an array with singular values for `M`, and + ``eps`` is the epsilon value for datatype of ``S``, then `tol` is + set to ``S.max() * max(M.shape) * eps``. + Notes + ----- + The default threshold to detect rank deficiency is a test on the magnitude + of the singular values of `M`. By default, we identify singular values less + than ``S.max() * max(M.shape) * eps`` as indicating rank deficiency (with + the symbols defined above). This is the algorithm MATLAB uses [1]. It also + appears in *Numerical recipes* in the discussion of SVD solutions for linear + least squares [2]. + This default threshold is designed to detect rank deficiency accounting for + the numerical errors of the SVD computation. Imagine that there is a column + in `M` that is an exact (in floating point) linear combination of other + columns in `M`. Computing the SVD on `M` will not produce a singular value + exactly equal to 0 in general: any difference of the smallest SVD value from + 0 will be caused by numerical imprecision in the calculation of the SVD. + Our threshold for small SVD values takes this numerical imprecision into + account, and the default threshold will detect such numerical rank + deficiency. The threshold may declare a matrix `M` rank deficient even if + the linear combination of some columns of `M` is not exactly equal to + another column of `M` but only numerically very close to another column of + `M`. + We chose our default threshold because it is in wide use. Other thresholds + are possible. For example, elsewhere in the 2007 edition of *Numerical + recipes* there is an alternative threshold of ``S.max() * + np.finfo(M.dtype).eps / 2. * np.sqrt(m + n + 1.)``. The authors describe + this threshold as being based on "expected roundoff error" (p 71). 
+ The thresholds above deal with floating point roundoff error in the + calculation of the SVD. However, you may have more information about the + sources of error in `M` that would make you consider other tolerance values + to detect *effective* rank deficiency. The most useful measure of the + tolerance depends on the operations you intend to use on your matrix. For + example, if your data come from uncertain measurements with uncertainties + greater than floating point epsilon, choosing a tolerance near that + uncertainty may be preferable. The tolerance may be absolute if the + uncertainties are absolute rather than relative. + References + ---------- + .. [1] MATLAB reference documentation, "Rank" + http://www.mathworks.com/help/techdoc/ref/rank.html + .. [2] W. H. Press, S. A. Teukolsky, W. T. Vetterling and B. P. Flannery, + "Numerical Recipes (3rd edition)", Cambridge University Press, 2007, + page 795. + Examples + -------- + >>> from numpy.linalg import matrix_rank + >>> matrix_rank(np.eye(4)) # Full rank matrix + 4 + >>> I=np.eye(4); I[-1,-1] = 0. 
class CacheWriteWarning(UserWarning):
    """Warning issued when code tries to assign to a cached attribute."""
    pass


class CachedAttribute(object):
    """
    Descriptor that computes an attribute once and keeps it in a cache.

    The first read on an instance calls the wrapped function and stores the
    result in a mapping held on that instance; later reads return the stored
    value. Assigning to the attribute is refused with a warning.

    Parameters
    ----------
    func      : callable
                function of one argument (the instance) producing the value;
                its ``__name__`` is used as the cache key
    cachename : str, optional
                name of the instance attribute holding the cache; defaults
                to '_cache'
    resetlist : sequence, optional
                stored under the attribute's name in the cache's
                ``_resetdict`` when the cache object has one; ignored
                otherwise
    """

    def __init__(self, func, cachename=None, resetlist=None):
        self.fget = func
        self.name = func.__name__
        self.cachename = cachename or '_cache'
        self.resetlist = resetlist or ()

    def __get__(self, obj, type=None):
        # Class-level access (no instance) hands back the raw function.
        if obj is None:
            return self.fget
        # Get the cache or set a default one if needed. A plain dict is
        # enough here: only ``get`` and item assignment are required.
        _cache = getattr(obj, self.cachename, None)
        if _cache is None:
            _cache = {}
            setattr(obj, self.cachename, _cache)
        name = self.name
        # NOTE: a cached value of None is indistinguishable from "missing",
        # so a function returning None is re-evaluated on every read.
        _cachedval = _cache.get(name, None)
        if _cachedval is None:
            _cachedval = self.fget(obj)
            try:
                _cache[name] = _cachedval
            except KeyError:
                setattr(_cache, name, _cachedval)
            # Update the reset list if needed (and possible)
            if self.resetlist:
                try:
                    _cache._resetdict[name] = self.resetlist
                except AttributeError:
                    # Cache object keeps no reset bookkeeping.
                    pass
        return _cachedval

    def __set__(self, obj, value):
        # Cached attributes are read-only: warn and leave the cache alone.
        errmsg = "The attribute '%s' cannot be overwritten" % self.name
        warnings.warn(errmsg, CacheWriteWarning)
class VarianceFunction(object):
    """
    Relates the variance of a random variable to its mean. Defaults to 1.

    Methods
    -------
    call
        Returns an array of ones that is the same shape as `mu`

    Notes
    -----
    After a variance function is initialized, its call method can be used.

    Alias for VarianceFunction:
    constant = VarianceFunction()

    See also
    --------
    statsmodels.family.family
    """

    def __call__(self, mu):
        """
        Default variance function

        Parameters
        -----------
        mu : array-like
            mean parameters

        Returns
        -------
        v : array
            ones(mu.shape)
        """
        mu = np.asarray(mu)
        return np.ones(mu.shape, np.float64)

    def deriv(self, mu):
        """
        Derivative of the variance function v'(mu)

        The variance is the constant 1, so the derivative is exactly zero
        everywhere; it is returned directly instead of being approximated
        numerically.

        Parameters
        ----------
        mu : array-like
            mean parameters

        Returns
        -------
        dv : array
            zeros with the same shape as `mu`
        """
        return np.zeros(np.shape(mu), np.float64)
class Power(object):
    """
    Power variance function

    Parameters
    ----------
    power : float
        exponent used in power variance function

    Methods
    -------
    call
        Returns the power variance

    Formulas
    --------
    V(mu) = numpy.fabs(mu)**power

    Notes
    -----
    Aliases for Power:
    mu = Power()
    mu_squared = Power(power=2)
    mu_cubed = Power(power=3)
    """

    def __init__(self, power=1.):
        self.power = power

    def __call__(self, mu):
        """
        Power variance function

        Parameters
        ----------
        mu : array-like
            mean parameters

        Returns
        -------
        variance : array
            numpy.fabs(mu)**self.power
        """
        return np.power(np.fabs(mu), self.power)

    def deriv(self, mu):
        """
        Derivative of the variance function v'(mu)

        Uses the closed form d/dmu |mu|**p = p * |mu|**(p - 1) * sign(mu)
        rather than a finite-difference approximation.

        Parameters
        ----------
        mu : array-like
            mean parameters

        Returns
        -------
        dv : array
            derivative evaluated element-wise at `mu`
        """
        mu = np.asarray(mu, dtype=np.float64)
        der = self.power * np.fabs(mu) ** (self.power - 1)
        # |mu| is decreasing for negative means, so the slope flips sign.
        return np.where(mu < 0, -der, der)
class NegativeBinomial(object):
    """
    Variance function of the negative binomial family.

    Parameters
    ----------
    alpha : float
        Ancillary parameter of the negative binomial variance function,
        treated as a fixed (nonstochastic) quantity. The default is 1.

    Methods
    -------
    call
        Returns the negative binomial variance

    Formulas
    --------
    V(mu) = mu + alpha*mu**2

    Notes
    -----
    Alias for NegativeBinomial:
    nbinom = NegativeBinomial()

    The private helper ``_clean`` raises every value below machine epsilon
    up to machine epsilon, so the mean used in the formulas is strictly
    positive.
    """

    def __init__(self, alpha=1.):
        self.alpha = alpha

    def _clean(self, p):
        # Lower bound only: the upper limit is infinity.
        return np.clip(p, a_min=FLOAT_EPS, a_max=np.inf)

    def __call__(self, mu):
        """
        Evaluate the negative binomial variance.

        Parameters
        ----------
        mu : array-like
            mean parameters

        Returns
        -------
        variance : array
            mu + alpha*mu**2, computed on the floored means
        """
        floored = self._clean(mu)
        quadratic_term = self.alpha * floored**2
        return floored + quadratic_term

    def deriv(self, mu):
        """
        Evaluate v'(mu) = 1 + 2*alpha*mu on the floored means.
        """
        floored = self._clean(mu)
        slope = 2 * self.alpha * floored
        return 1 + slope
def get_AICc(gwr):
    """
    Get AICc value

    Gaussian: p61, (2.33), Fotheringham, Brunsdon and Charlton (2002)

    GWGLM: AICc=AIC+2k(k+1)/(n-k-1), Nakaya et al. (2005): p2704, (36)

    Parameters
    ----------
    gwr : fitted results object exposing ``n``, ``tr_S``, ``llf`` and
          ``family`` (plus whatever ``get_AIC`` reads for GLM families)

    Raises
    ------
    NotImplementedError
        If the family is not Gaussian, Poisson or Binomial.
    """
    n = gwr.n
    k = gwr.tr_S
    if isinstance(gwr.family, Gaussian):
        return -2.0*gwr.llf + 2.0*n*(k + 1.0)/(n-k-2.0)
    if isinstance(gwr.family, (Poisson, Binomial)):
        return get_AIC(gwr) + 2.0 * k * (k+1.0) / (n - k - 1.0)
    raise NotImplementedError('AICc is only available for Gaussian, '
                              'Poisson and Binomial families')


def get_AIC(gwr):
    """
    Get AIC value

    Gaussian: p96, (4.22), Fotheringham, Brunsdon and Charlton (2002)

    GWGLM: AIC(G)=D(G) + 2K(G), where D and K denote the deviance and the
    effective number of parameters in the model with bandwidth G,
    respectively.

    Parameters
    ----------
    gwr : fitted results object exposing ``tr_S``, ``llf``, ``family`` and,
          for GLM families, ``y`` and ``mu``

    Raises
    ------
    NotImplementedError
        If the family is not Gaussian, Poisson or Binomial.
    """
    k = gwr.tr_S
    if isinstance(gwr.family, Gaussian):
        return -2.0 * gwr.llf + 2.0 * (k+1)
    if isinstance(gwr.family, (Poisson, Binomial)):
        # Deviance is the sum of squared deviance residuals.
        deviance = np.sum(gwr.family.resid_dev(gwr.y, gwr.mu)**2)
        return deviance + 2.0 * k
    raise NotImplementedError('AIC is only available for Gaussian, '
                              'Poisson and Binomial families')


def get_BIC(gwr):
    """
    Get BIC value

    Gaussian: p61 (2.34), Fotheringham, Brunsdon and Charlton (2002)
    BIC = -2log(L)+klog(n)

    GWGLM: BIC = dev + tr_S * log(n)

    Parameters
    ----------
    gwr : fitted results object exposing ``n``, ``tr_S``, ``llf``,
          ``family`` and, for GLM families, ``y`` and ``mu``

    Raises
    ------
    NotImplementedError
        If the family is not Gaussian, Poisson or Binomial.
    """
    n = gwr.n  # (scalar) number of observations
    k = gwr.tr_S
    if isinstance(gwr.family, Gaussian):
        return -2.0 * gwr.llf + (k+1) * np.log(n)
    if isinstance(gwr.family, (Poisson, Binomial)):
        # Deviance is the sum of squared deviance residuals.
        deviance = np.sum(gwr.family.resid_dev(gwr.y, gwr.mu)**2)
        return deviance + k * np.log(n)
    raise NotImplementedError('BIC is only available for Gaussian, '
                              'Poisson and Binomial families')
class GWR(GLM):
    """
    Geographically weighted regression. Can currently estimate Gaussian,
    Poisson, and logistic models (built on a GLM framework). GWR object
    prepares model input. Fit method performs estimation and returns a
    GWRResults object.

    Parameters
    ----------
    coords        : array-like
                    n*2, collection of n sets of (x,y) coordinates of
                    observations; also used as calibration locations until
                    ``predict`` is called with other points

    y             : array
                    n*1, dependent variable

    X             : array
                    n*k, independent variable, excluding the constant

    bw            : scalar
                    bandwidth value consisting of either a distance or N
                    nearest neighbors; user specified or obtained using
                    Sel_BW

    family        : family object
                    underlying probability model; provides
                    distribution-specific calculations

    offset        : array
                    n*1, the offset variable at the ith location. For
                    Poisson model this term is often the size of the
                    population at risk or the expected size of the outcome
                    in spatial epidemiology. Default is None, in which case
                    an n*1 array of ones is used

    sigma2_v1     : boolean
                    specify sigma squared, True to use n as denominator;
                    default is False which uses n-k

    kernel        : string
                    type of kernel function used to weight observations;
                    available options:
                    'gaussian'
                    'bisquare'
                    'exponential'

    fixed         : boolean
                    True for distance based kernel function and False for
                    adaptive (nearest neighbor) kernel function (default)

    constant      : boolean
                    True to include intercept (default) in model and False
                    to exclude intercept.

    Attributes
    ----------
    coords, bw, sigma2_v1, kernel, fixed, constant
                  : stored unchanged from the constructor arguments

    offset        : array
                    float copy of the given offset, or ones((n, 1))

    n             : integer
                    number of observations

    k             : integer
                    number of independent variables

    fit_params    : dict
                    parameters passed into the most recent fit call

    W             : array
                    spatial weights for weighting all observations from
                    each calibration point; rebuilt by ``predict``

    points        : array-like or None
                    prediction locations; None until ``predict`` is called

    P             : array or None
                    predictors at the prediction locations (with a leading
                    column of ones when ``constant`` is True); None until
                    ``predict`` is called

    exog_scale    : scalar or None
    exog_resid    : array-like or None
                    scale and residuals of the model estimated on the
                    sampled locations; None until ``predict`` is called
    """
    def __init__(self, coords, y, X, bw, family=Gaussian(), offset=None,
                 sigma2_v1=False, kernel='bisquare', fixed=False,
                 constant=True):
        """
        Initialize class
        """
        GLM.__init__(self, y, X, family, constant=constant)
        self.constant = constant
        self.sigma2_v1 = sigma2_v1
        self.coords = coords
        self.bw = bw
        self.kernel = kernel
        self.fixed = fixed
        if offset is None:
            self.offset = np.ones((self.n, 1))
        else:
            # Multiplying by 1.0 yields a float copy of the caller's array.
            self.offset = offset * 1.0
        self.fit_params = {}
        self.W = self._build_W(fixed, kernel, coords, bw)
        self.points = None
        self.exog_scale = None
        self.exog_resid = None
        self.P = None

    def _build_W(self, fixed, kernel, coords, bw, points=None):
        """
        Build the spatial weights with the requested kernel.

        Raises
        ------
        TypeError
            If `kernel` is not one of the supported kernel names. Errors
            raised inside the kernel function itself are not relabelled and
            propagate unchanged.
        """
        kernel_table = fk if fixed else ak
        try:
            kernel_func = kernel_table[kernel]
        except (KeyError, TypeError):
            # Unknown name, or an unhashable object passed as the name.
            raise TypeError('Unsupported kernel function ', kernel)
        return kernel_func(coords, bw, points)

    def fit(self, ini_params=None, tol=1.0e-5, max_iter=20, solve='iwls'):
        """
        Method that fits a model with a particular estimation routine.

        Parameters
        ----------
        ini_params    : array
                        k*1, initial coefficient values, including constant.
                        Default is None, which calculates initial values
                        during estimation
        tol           : float
                        Tolerance for estimation convergence
        max_iter      : integer
                        Maximum number of iterations if convergence not
                        achieved
        solve         : string
                        Technique to solve MLE equations.
                        'iwls' = iteratively (re)weighted least squares
                        (default)

        Returns
        -------
        GWRResults

        Raises
        ------
        NotImplementedError
            If `solve` names anything other than 'iwls'.
        """
        self.fit_params['ini_params'] = ini_params
        self.fit_params['tol'] = tol
        self.fit_params['max_iter'] = max_iter
        self.fit_params['solve'] = solve
        if solve.lower() != 'iwls':
            raise NotImplementedError(
                "Unsupported solver %r; only 'iwls' is implemented" % solve)

        m = self.W.shape[0]
        params = np.zeros((m, self.k))
        predy = np.zeros((m, 1))
        w = np.zeros((m, 1))
        z = np.zeros((self.n, self.n))
        S = np.zeros((self.n, self.n))
        CCT = np.zeros((m, self.k))
        # NOTE(review): z and S are sized by n while the loop runs over the
        # m rows of W; this looks like it assumes m == n -- confirm for
        # prediction at un-sampled points.
        for i in range(m):
            # One local regression per calibration point, weighted by the
            # i-th row of W.
            wi = self.W[i].reshape((-1, 1))
            rslt = iwls(self.y, self.X, self.family, self.offset,
                        ini_params, tol, max_iter, wi=wi)
            params[i, :] = rslt[0].T
            predy[i] = rslt[1][i]
            w[i] = rslt[3][i]
            z[i] = rslt[4].flatten()
            ri = np.dot(self.X[i], rslt[5])
            S[i] = ri*np.reshape(rslt[4].flatten(), (1, -1))
            # dont need unless f is explicitly passed for
            # prediction of non-sampled points
            # cf = rslt[5] - np.dot(rslt[5], f)
            # CCT[i] = np.diag(np.dot(cf, cf.T/rslt[3]))
            CCT[i] = np.diag(np.dot(rslt[5], rslt[5].T))
        S = S * (1.0/z)
        return GWRResults(self, params, predy, S, CCT, w)

    def predict(self, points, P, exog_scale=None, exog_resid=None,
                fit_params=None):
        """
        Method that predicts values of the dependent variable at un-sampled
        locations

        Parameters
        ----------
        points        : array-like
                        n*2, collection of n sets of (x,y) coordinates used
                        for calibration prediction locations
        P             : array
                        n*k, independent variables used to make prediction;
                        excluding the constant
        exog_scale    : scalar
                        estimated scale using sampled locations; default is
                        None which estimates a model using points from
                        "coords"
        exog_resid    : array-like
                        estimated residuals using sampled locations; default
                        is None which estimates a model using points from
                        "coords"; if given it must be n*1 where n is the
                        length of coords
        fit_params    : dict
                        key-value pairs of parameters that will be passed
                        into fit method to define estimation routine; see
                        fit method for more details. Default is None, which
                        uses the fit method's own defaults

        Returns
        -------
        GWRResults

        Raises
        ------
        ValueError
            If only one of `exog_scale` and `exog_resid` is given.
        """
        if fit_params is None:
            fit_params = {}
        if exog_scale is None and exog_resid is None:
            # Estimate on the sampled locations first to obtain the scale
            # and residuals.
            train_gwr = self.fit(**fit_params)
            self.exog_scale = train_gwr.scale
            self.exog_resid = train_gwr.resid_response
        elif exog_scale is not None and exog_resid is not None:
            self.exog_scale = exog_scale
            self.exog_resid = exog_resid
        else:
            raise ValueError('exog_scale and exog_resid must both either '
                             'be None or specified')
        self.points = points
        if self.constant:
            # Prepend the intercept column.
            P = np.hstack([np.ones((len(P), 1)), P])
        self.P = P
        # Weights now relate every prediction point to the observations.
        self.W = self._build_W(self.fixed, self.kernel, self.coords,
                               self.bw, points)
        return self.fit(**fit_params)

    @cache_readonly
    def df_model(self):
        raise NotImplementedError('Only computed for fitted model in GWRResults')

    @cache_readonly
    def df_resid(self):
        raise NotImplementedError('Only computed for fitted model in GWRResults')
: integer + model degrees of freedom + + df_resid : integer + residual degrees of freedom + + offset : array + n*1, the offset variable at the ith location. + For Poisson model this term is often the size of + the population at risk or the expected size of + the outcome in spatial epidemiology; Default is + None where Ni becomes 1.0 for all locations + + scale : float + sigma squared used for subsequent computations + + w : array + n*1, final weights from iteratively re-weighted least + squares routine + + resid_response : array + n*1, residuals of the response + + resid_ss : scalar + residual sum of squares + + W : array + n*n; spatial weights for each observation from each + calibration point + + S : array + n*n, hat matrix + + CCT : array + n*k, scaled variance-covariance matrix + + tr_S : float + trace of S (hat) matrix + + tr_STS : float + trace of STS matrix + + tr_SWSTW : float + trace of weighted STS matrix; weights are those output + from iteratively weighted least squares (not spatial + weights) + + y_bar : array + n*1, weighted mean value of y + + TSS : array + n*1, geographically weighted total sum of squares + + RSS : array + n*1, geographically weighted residual sum of squares + + localR2 : array + n*1, local R square + + sigma2_v1 : float + sigma squared, use (n-v1) as denominator + + sigma2_v1v2 : float + sigma squared, use (n-2v1+v2) as denominator + + sigma2_ML : float + sigma squared, estimated using ML + + std_res : array + n*1, standardised residuals + + bse : array + n*k, standard errors of parameters (betas) + + influ : array + n*1, leading diagonal of S matrix + + cooksD : array + n*1, Cook's D + + tvalues : array + n*k, local t-statistics + + adj_alpha : array + 3*1, corrected alpha values to account for multiple + hypothesis testing for the 90%, 95%, and 99% confidence + levels; tvalues with an absolute value larger than the + corrected alpha are considered statistically + significant. 
@cache_readonly
def y_bar(self):
    """
    weighted mean of y

    Returns one value per calibration point: the model's prediction points
    when they are set, otherwise the observations themselves. For point i
    the value is sum(y * W[i]) / sum(W[i] * offset).

    Returns
    -------
    arr_ybar : array
        (number of calibration points)*1
    """
    if self.model.points is not None:
        n = len(self.model.points)
    else:
        n = self.n
    off = self.offset.reshape((-1, 1))
    y_col = self.y.reshape((-1, 1))
    # Sized by the number of calibration points (the loop bound), matching
    # how TSS sizes its result.
    arr_ybar = np.zeros(shape=(n, 1))
    for i in range(n):
        w_i = np.reshape(np.array(self.W[i]), (-1, 1))
        sum_yw = np.sum(y_col * w_i)
        arr_ybar[i] = 1.0 * sum_yw / np.sum(w_i*off)
    return arr_ybar
@cache_readonly
def sigma2_v1v2(self):
    """
    residual variance

    Methods: p55 (2.16)-(2.18)
    Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002).
    Geographically weighted regression: the analysis of spatially varying
    relationships.

    use v1 and v2 #used in GWR4

    Returns
    -------
    scalar
        resid_ss / (n - 2*tr(S) + tr(S'S))
    """
    # The same denominator is used for every family; the former
    # Poisson/Binomial branch computed exactly this expression.
    # could be changed to SWSTW - nothing to test against
    return self.resid_ss / (self.n - 2.0*self.tr_S + self.tr_STS)
@cache_readonly
def resid_deviance(self):
    """
    Locally weighted sum of squared deviance residuals.

    The squared deviance residuals of the fitted model,
    ``self.family.resid_dev(self.y, self.mu)**2``, are summed once per
    calibration location using that location's row of ``self.W`` as
    weights.

    Returns
    -------
    array
        n*1, weighted residual deviance for each calibration point

    Raises
    ------
    NotImplementedError
        if the model family is Gaussian
    """
    if isinstance(self.family, Gaussian):
        raise NotImplementedError('deviance not currently used for Gaussian')
    sq_dev_res = (self.family.resid_dev(self.y, self.mu))**2
    # Broadcast the residuals down the columns of W.T, so entry (j, i) is
    # sq_dev_res[j] * W[i, j]; summing over axis 0 gives one total per
    # calibration location without materialising a repeated n*n copy.
    weighted = sq_dev_res.reshape((-1, 1)) * self.W.T
    return np.sum(weighted, axis=0).reshape((-1, 1))
def filter_tvals(self, alpha):
    """
    Zero out t-values that are not significant at the given alpha level.

    The two-sided critical t statistic is taken from a Student t
    distribution with ``self.n - 1`` degrees of freedom at probability
    ``1 - |alpha|/2``. Every t-value lying strictly between the negative
    and positive critical statistic is replaced by 0 in a copy of
    ``self.tvalues``; the stored t-values are left untouched.

    Parameters
    ----------
    alpha       : scalar
                  significance level used to derive the critical t
                  statistic

    Returns
    -------
    filtered    : array
                  same shape as ``self.tvalues``; insignificant entries
                  set to 0
    """
    tail_prob = np.abs(alpha) / 2.0
    cutoff = t.ppf(1 - tail_prob, self.n - 1)
    filtered = self.tvalues.copy()
    not_significant = (self.tvalues > -1.0 * cutoff) & (self.tvalues < cutoff)
    filtered[not_significant] = 0
    return filtered
class FBGWR(GWR):
    """
    Flexible-bandwidth GWR: every covariate (column of X) is calibrated
    with its own bandwidth by fitting one single-covariate GWR per column
    on the partial residuals produced by the backfitting bandwidth search.

    Parameters
    ----------
    coords        : array-like
                    n*2, collection of n sets of (x,y) coordinates of
                    observations; also used as calibration locations if
                    'points' is set to None

    y             : array
                    n*1, dependent variable

    X             : array
                    n*k, independent variable, excluding the constant

    bws           : array-like
                    collection of bandwidth values consisting of either a
                    distance or N nearest neighbors; user specified or
                    obtained using Sel_BW with fb=True. Order of values
                    should be the same as the order of columns associated
                    with X
    XB            : array
                    n*k, product of temporary X and params obtained as
                    through-put from the backfitting algorithm used to
                    select flexible bandwidths; product of the Sel_BW class
    err           : array
                    n*1, temporary residuals associated with the predicted
                    values from the backfitting algorithm used to select
                    flexible bandwidths; product of the Sel_BW class

    family        : family object
                    underlying probability model; provides
                    distribution-specific calculations

    offset        : array
                    n*1, the offset variable at the ith location. For
                    Poisson model this term is often the size of the
                    population at risk or the expected size of the outcome
                    in spatial epidemiology. Default is None where Ni
                    becomes 1.0 for all locations

    sigma2_v1     : boolean
                    specify sigma squared, True to use n as denominator;
                    default is False which uses n-k

    kernel        : string
                    type of kernel function used to weight observations;
                    available options:
                    'gaussian'
                    'bisquare'
                    'exponential'

    fixed         : boolean
                    True for distance based kernel function and False for
                    adaptive (nearest neighbor) kernel function (default)

    constant      : boolean
                    True to include intercept (default) in model and False
                    to exclude intercept.

    Attributes
    ----------
    coords        : array-like
                    n*2, collection of n sets of (x,y) coordinates of
                    observations; also used as calibration locations if
                    'points' is set to None

    y             : array
                    n*1, dependent variable

    X             : array
                    n*k, independent variable; a constant column is added
                    through USER.check_constant when constant is True

    bws           : array-like
                    collection of bandwidth values, one per column of X

    XB            : array
                    n*k, through-put from the backfitting algorithm used
                    to select flexible bandwidths

    err           : array
                    n*1, temporary residuals from the backfitting
                    algorithm used to select flexible bandwidths

    family        : family object
                    underlying probability model; provides
                    distribution-specific calculations

    offset        : array
                    n*1, the offset variable at the ith location

    sigma2_v1     : boolean
                    specify sigma squared, True to use n as denominator;
                    default is False which uses n-k

    kernel        : string
                    type of kernel function used to weight observations

    fixed         : boolean
                    True for distance based kernel function and False for
                    adaptive (nearest neighbor) kernel function (default)

    constant      : boolean
                    True to include intercept (default) in model and False
                    to exclude intercept.

    Examples
    --------
    TODO

    """
    def __init__(self, coords, y, X, bws, XB, err, family=Gaussian(), offset=None,
                 sigma2_v1=False, kernel='bisquare', fixed=False, constant=True):
        """
        Initialize class
        """
        self.coords = coords
        self.y = y
        self.X = X
        self.XB = XB
        self.err = err
        self.bws = bws
        self.family = family
        self.offset = offset
        self.sigma2_v1 = sigma2_v1
        self.kernel = kernel
        self.fixed = fixed
        self.constant = constant
        if constant:
            self.X = USER.check_constant(self.X)

    def fit(self, ini_params=None, tol=1.0e-5, max_iter=20, solve='iwls'):
        """
        Method that fits a model with a particular estimation routine.

        Each column i of X is fitted with its own bandwidth ``bws[i]``
        against the partial response ``XB[:, i] + err``; the residuals of
        one fit are carried forward as ``err`` for the next column.

        Parameters
        ----------

        ini_params    : array
                        k*1, initial coefficient values, including constant.
                        Default is None, which calculates initial values
                        during estimation
        tol           : float
                        Tolerance for estimation convergence
        max_iter      : integer
                        Maximum number of iterations if convergence not
                        achieved
        solve         : string
                        Technique to solve MLE equations.
                        'iwls' = iteratively (re)weighted least squares
                        (default)

        Returns
        -------
        FBGWRResults
            results object holding the n*k matrix of local coefficients
        """
        # Allocate as float explicitly: zeros_like would inherit an integer
        # dtype from X and silently truncate the estimated coefficients.
        params = np.zeros(self.X.shape, dtype=float)
        err = self.err
        for i, bw in enumerate(self.bws):
            # NOTE(review): the kernel matrix formerly built here with
            # self._build_W was never used; the GWR model below is given
            # the same coords/bw/kernel/fixed settings.
            X = self.X[:, i].reshape((-1, 1))
            y = self.XB[:, i].reshape((-1, 1)) + err
            model = GWR(self.coords, y, X, bw, self.family, self.offset,
                        self.sigma2_v1, self.kernel, self.fixed,
                        constant=False)
            results = model.fit(ini_params, tol, max_iter, solve)
            params[:, i] = results.params.flatten()
            err = results.resid_response.reshape((-1, 1))
        return FBGWRResults(self, params)
class _Kernel(object):
    """
    Kernel weights between calibration points and observations.

    A distance matrix ``dmat`` is computed between ``points`` (rows) and
    ``data`` (columns); when ``points`` is None the observations are
    their own calibration points. Distances are divided by a per-row
    bandwidth and passed through the chosen kernel function; the result
    is stored in ``kernel``.

    Parameters
    ----------
    data        : array-like or scipy.spatial.KDTree
                  coordinates of the observations
    bandwidth   : scalar or array-like, optional
                  a single bandwidth for every row or one bandwidth per
                  row of ``dmat``; when None (or a scalar 0) the
                  bandwidth is derived from the distances, see _set_bw
    fixed       : boolean
                  only used when the bandwidth is derived: True uses one
                  global bandwidth, False one bandwidth per row
    k           : integer, optional
                  number of nearest neighbours considered when the
                  bandwidth is derived; stored as k + 1
    function    : string
                  'triangular', 'uniform', 'quadratic', 'quartic',
                  'gaussian', 'gwr_gaussian', 'bisquare' or 'exponential'
    eps         : float
                  multiplier applied to derived bandwidths
    ids         : unused
    truncate    : boolean
                  True sets weights to 0 wherever the distance is greater
                  than or equal to the bandwidth
    points      : array-like, optional
                  calibration locations; defaults to ``data``

    Raises
    ------
    ValueError
        if ``function`` is not a supported kernel name or an array-like
        ``bandwidth`` cannot be arranged as one value per row
    """
    def __init__(self, data, bandwidth=None, fixed=True, k=None,
                 function='triangular', eps=1.0000001, ids=None, truncate=True,
                 points=None): #Added truncate flag
        if isinstance(data, scipy.spatial.KDTree):
            data = data.data
        self.data = data
        if k is not None:
            self.k = int(k) + 1
        else:
            self.k = k
        if points is None:
            self.dmat = cdist(self.data, self.data)
        else:
            self.points = points
            self.dmat = cdist(self.points, self.data)
        self.function = function.lower()
        self.fixed = fixed
        self.eps = eps
        self.trunc = truncate
        if self._bandwidth_given(bandwidth):
            bandwidth = np.array(bandwidth)
            if bandwidth.ndim == 0:
                # One bandwidth per row of dmat, i.e. per calibration
                # point, which may differ from the number of observations.
                bandwidth = np.ones((self.dmat.shape[0], 1), 'float') * bandwidth
            else:
                bandwidth = bandwidth.reshape((len(bandwidth), 1))
            self.bandwidth = bandwidth
        else:
            self._set_bw()
        self.kernel = self._kernel_funcs(self.dmat/self.bandwidth)

        if self.trunc:
            # (rows, 1) bandwidth broadcasts across the columns of dmat
            self.kernel[self.dmat >= self.bandwidth] = 0

    @staticmethod
    def _bandwidth_given(bandwidth):
        # True when the caller supplied a usable bandwidth. A scalar 0 keeps
        # its historical meaning of "derive the bandwidth from the data".
        if bandwidth is None:
            return False
        if np.ndim(bandwidth) == 0:
            return bool(bandwidth)
        return np.size(bandwidth) > 0

    def _set_bw(self):
        # Derive the bandwidth from the (k nearest) distances in dmat.
        if self.k is not None:
            dmat = np.sort(self.dmat)[:,:self.k]
        else:
            dmat = self.dmat
        if self.fixed:
            # use max knn distance as bandwidth
            bandwidth = dmat.max() * self.eps
            self.bandwidth = np.ones((self.dmat.shape[0], 1), 'float') * bandwidth
        else:
            # use local max knn distance
            self.bandwidth = (dmat.max(axis=1) * self.eps).reshape((-1, 1))

    def _kernel_funcs(self, zs):
        # functions follow Anselin and Rey (2010) table 5.4
        if self.function == 'triangular':
            return 1 - zs
        elif self.function == 'uniform':
            return np.ones(zs.shape) * 0.5
        elif self.function == 'quadratic':
            return (3. / 4) * (1 - zs ** 2)
        elif self.function == 'quartic':
            return (15. / 16) * (1 - zs ** 2) ** 2
        elif self.function == 'gaussian':
            c = np.pi * 2
            c = c ** (-0.5)
            return c * np.exp(-(zs ** 2) / 2.)
        elif self.function == 'gwr_gaussian':
            return np.exp(-0.5*(zs)**2)
        elif self.function == 'bisquare':
            return (1-(zs)**2)**2
        elif self.function =='exponential':
            return np.exp(-zs)
        else:
            # Fail loudly instead of printing and returning None, which
            # previously surfaced later as an unrelated error.
            raise ValueError('Unsupported kernel function: %s' % self.function)
def flexible_bw(init, y, X, n, k, family, tol, max_iter, rss_score,
        gwr_func, bw_func, sel_func):
    """
    Backfitting search for covariate-specific (flexible) bandwidths.

    Starting from initial coefficient estimates, every iteration re-fits
    each column of X on its partial response (its current contribution
    plus the current residuals), selecting a new bandwidth for that
    column each time, until the change score drops to ``tol`` or
    ``max_iter`` iterations have run.

    Parameters
    ----------
    init        : boolean
                  True to initialise from a single-bandwidth fit obtained
                  with ``gwr_func``; False to initialise from a global GLM
    y           : array
                  n*1, dependent variable
    X           : array
                  n*k, design matrix
    n           : integer
                  number of observations
    k           : integer
                  number of columns of X
    family      : family object
                  passed to the global GLM when ``init`` is False
    tol         : float
                  convergence tolerance for the change score
    max_iter    : integer
                  maximum number of backfitting iterations
    rss_score   : boolean
                  True to score iterations by the relative change in the
                  residual sum of squares; False to score by the
                  normalised change in the fitted components
    gwr_func    : function
                  (y, X, bw) -> fitted results exposing ``params`` and
                  ``resid_response``
    bw_func     : function
                  (y, X) -> bandwidth selector exposing an indexable
                  ``bw`` attribute once searched
    sel_func    : function
                  selector -> selected bandwidth

    Returns
    -------
    opt_bws     : list
                  bandwidths of the final iteration, one per column of X
    BWs         : array
                  bandwidths selected in every iteration
    VALs        : array
                  ``selector.bw[1]`` recorded for every column in every
                  iteration
    scores      : array
                  change score of every iteration
    f_XB        : array
                  n*k, component contributions that entered the final
                  iteration
    f_err       : array
                  n*1, residuals that entered the final iteration

    Raises
    ------
    ValueError
        if no backfitting iteration is run (``max_iter`` < 1 or ``tol``
        not below the initial delta of 1e6)
    """
    if init:
        bw = sel_func(bw_func(y, X))
        print(bw)
        optim_model = gwr_func(y, X, bw)
        err = optim_model.resid_response.reshape((-1,1))
        est = optim_model.params
    else:
        # GLM is not imported at the top of this module, so bring it into
        # scope here. NOTE(review): import path assumed from the package
        # layout (crankshaft/regression/glm/glm.py) - confirm.
        from crankshaft.regression.glm.glm import GLM
        model = GLM(y, X, family=family, constant=False).fit()
        err = model.resid_response.reshape((-1,1))
        est = np.repeat(model.params.T, n, axis=0)

    XB = np.multiply(est, X)
    if rss_score:
        rss = np.sum((err)**2)
    iters = 0
    scores = []
    delta = 1e6
    BWs = []
    VALs = []
    f_XB = XB.copy()
    f_err = err.copy()

    while delta > tol and iters < max_iter:
        iters += 1
        # Explicit float arrays: zeros_like would inherit an integer dtype
        # from X and truncate the fitted values.
        new_XB = np.zeros(X.shape, dtype=float)
        ests = np.zeros(X.shape, dtype=float)
        bws = []
        vals = []
        # Remember what entered this iteration; the values belonging to the
        # last iteration are returned alongside its bandwidths.
        f_XB = XB.copy()
        f_err = err.copy()
        for i in range(k):
            temp_y = XB[:,i].reshape((-1,1))
            temp_y = temp_y + err
            temp_X = X[:,i].reshape((-1,1))
            bw_class = bw_func(temp_y, temp_X)
            bw = sel_func(bw_class)
            optim_model = gwr_func(temp_y, temp_X, bw)
            err = optim_model.resid_response.reshape((-1,1))
            est = optim_model.params.reshape((-1,))

            new_XB[:,i] = np.multiply(est, temp_X.reshape((-1,)))
            bws.append(bw)
            ests[:,i] = est
            vals.append(bw_class.bw[1])

        num = np.sum((new_XB - XB)**2)/n
        den = np.sum(np.sum(new_XB, axis=1)**2)
        score = (num/den)**0.5
        XB = new_XB

        if rss_score:
            predy = np.sum(np.multiply(ests, X), axis=1).reshape((-1,1))
            new_rss = np.sum((y - predy)**2)
            score = np.abs((new_rss - rss)/new_rss)
            rss = new_rss
        print(score)
        scores.append(score)
        delta = score
        BWs.append(bws)
        VALs.append(vals)

    if not BWs:
        raise ValueError('flexible bandwidth search ran no iterations; '
                         'check tol and max_iter')
    opt_bws = BWs[-1]
    return opt_bws, np.array(BWs), np.array(VALs), np.array(scores), f_XB, f_err
Gaussian, Poisson, Binomial +import pysal.spreg.user_output as USER +from diagnostics import get_AICc, get_AIC, get_BIC, get_CV +from scipy.spatial.distance import pdist, squareform +from pysal.common import KDTree +import numpy as np + +kernels = {1: fix_gauss, 2: adapt_gauss, 3: fix_bisquare, 4: + adapt_bisquare, 5: fix_exp, 6:adapt_exp} +getDiag = {'AICc': get_AICc,'AIC':get_AIC, 'BIC': get_BIC, 'CV': get_CV} + +class Sel_BW(object): + """ + Select bandwidth for kernel + + Methods: p211 - p213, bandwidth selection + Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002). + Geographically weighted regression: the analysis of spatially varying relationships. + + Parameters + ---------- + y : array + n*1, dependent variable. + x_glob : array + n*k1, fixed independent variable. + x_loc : array + n*k2, local independent variable, including constant. + coords : list of tuples + (x,y) of points used in bandwidth selection + family : string + GWR model type: 'Gaussian', 'logistic, 'Poisson'' + offset : array + n*1, offset variable for Poisson model + kernel : string + kernel function: 'gaussian', 'bisquare', 'exponetial' + fixed : boolean + True for fixed bandwidth and False for adaptive (NN) + fb : True for flexible (mutliple covaraite-specific) bandwidths + False for a traditional (same for all covariates) + bandwdith; defualt is False. + constant : boolean + True to include intercept (default) in model and False to exclude + intercept. + + + Attributes + ---------- + y : array + n*1, dependent variable. + x_glob : array + n*k1, fixed independent variable. + x_loc : array + n*k2, local independent variable, including constant. 
def search(self, search='golden_section', criterion='AICc', bw_min=0.0,
           bw_max=0.0, interval=0.0, tol=1.0e-6, max_iter=200, init_fb=True,
           tol_fb=1.0e-5, rss_score=False, max_iter_fb=200):
    """
    Store the search settings on the instance and run the bandwidth
    search.

    Parameters
    ----------
    criterion   : string
                  bw selection criterion: 'AICc', 'AIC', 'BIC', 'CV'
    search      : string
                  bw search method: 'golden_section', 'interval'
    bw_min      : float
                  min value used in bandwidth search
    bw_max      : float
                  max value used in bandwidth search
    interval    : float
                  interval increment used in interval search
    tol         : float
                  tolerance used to determine convergence
    max_iter    : integer
                  max iterations if no convergence to tol
    init_fb     : True to initialize flexible bandwidth search with
                  estimates from a traditional GWR and False to
                  initialize flexible bandwidth search with global
                  regression estimates
    tol_fb      : convergence tolerance for the flexible bandwidth
                  backfitting algorithm; a larger tolerance may stop the
                  algorithm faster though it may result in a less optimal
                  model
    max_iter_fb : max iterations if no convergence to tol for flexible
                  bandwidth backfitting algorithm
    rss_score   : True to use the residual sum of squares to evaluate
                  each iteration of the flexible bandwidth backfitting
                  routine and False to use a smooth function; default is
                  False

    Returns
    -------
    bw          : scalar or array
                  optimal bandwidth value or values; returns scalar for
                  fb=False and array for fb=True; ordering of bandwidths
                  matches the ordering of the covariates (columns) of the
                  design matrix, X

    Raises
    ------
    TypeError
        if self.kernel is not 'gaussian', 'bisquare' or 'exponential'
    """
    # NOTE(review): storing the method name under ``self.search`` shadows
    # this method on the instance, so a second ``.search()`` call on the
    # same object will fail. Kept because _bw/_fbw read ``self.search``.
    self.search = search
    self.criterion = criterion
    self.bw_min = bw_min
    self.bw_max = bw_max
    self.interval = interval
    self.tol = tol
    self.max_iter = max_iter
    self.init_fb = init_fb
    self.tol_fb = tol_fb
    self.rss_score = rss_score
    self.max_iter_fb = max_iter_fb

    if self.kernel not in ('gaussian', 'bisquare', 'exponential'):
        raise TypeError('Unsupported kernel function ', self.kernel)

    # Adaptive (nearest neighbour) kernels are searched over integer
    # bandwidths, fixed (distance) kernels over floats.
    self.int_score = not self.fixed

    if self.fb:
        self._fbw()
        print(self.bw[1])
        self.XB = self.bw[4]
        self.err = self.bw[5]
    else:
        self._bw()

    return self.bw[0]
np.array(coords).shape[0] + + if self.int_score: + a = 40 + 2 * n_vars + c = n + else: + nn = 40 + 2 * n_vars + sq_dists = squareform(pdist(coords)) + sort_dists = np.sort(sq_dists, axis=1) + min_dists = sort_dists[:,nn-1] + max_dists = sort_dists[:,-1] + a = np.min(min_dists)/2.0 + c = np.max(max_dists)/2.0 + + if a < self.bw_min: + a = self.bw_min + if c > self.bw_max and self.bw_max > 0: + c = self.bw_max + return a, c diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/tests/test_gwr.py b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/tests/test_gwr.py new file mode 100644 index 0000000..7f12b7e --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/tests/test_gwr.py @@ -0,0 +1,853 @@ +""" +GWR is tested against results from GWR4 +""" + +import unittest +import pickle as pk +from crankshaft.regression.gwr.gwr import GWR, FBGWR +from crankshaft.regression.gwr.sel_bw import Sel_BW +from crankshaft.regression.gwr.diagnostics import get_AICc, get_AIC, get_BIC, get_CV +from crankshaft.regression.glm.family import Gaussian, Poisson, Binomial +import numpy as np +import pysal + +class TestGWRGaussian(unittest.TestCase): + def setUp(self): + data = pysal.open(pysal.examples.get_path('GData_utm.csv')) + self.coords = zip(data.by_col('X'), data.by_col('Y')) + self.y = np.array(data.by_col('PctBach')).reshape((-1,1)) + rural = np.array(data.by_col('PctRural')).reshape((-1,1)) + pov = np.array(data.by_col('PctPov')).reshape((-1,1)) + black = np.array(data.by_col('PctBlack')).reshape((-1,1)) + self.X = np.hstack([rural, pov, black]) + self.BS_F = pysal.open(pysal.examples.get_path('georgia_BS_F_listwise.csv')) + self.BS_NN = pysal.open(pysal.examples.get_path('georgia_BS_NN_listwise.csv')) + self.GS_F = pysal.open(pysal.examples.get_path('georgia_GS_F_listwise.csv')) + self.GS_NN = pysal.open(pysal.examples.get_path('georgia_GS_NN_listwise.csv')) + self.FB = pk.load(open(pysal.examples.get_path('FB.p'), 
def test_BS_F(self):
    """Fixed-bandwidth Gaussian GWR (kernel left at GWR's default) vs. GWR4.

    Reference values come from the ``self.BS_F`` listing loaded in
    ``setUp``.  Its column names carry a leading space, which must be kept.
    """
    ref = self.BS_F

    def column(label):
        # Reference column as an (n, 1) array.
        return np.array(ref.by_col(label)).reshape((-1, 1))

    model = GWR(self.coords, self.y, self.X, bw=209267.689, fixed=True)
    rslt = model.fit()

    # Information criteria are compared on their integer part, CV to two
    # decimals.  assertAlmostEqual replaces the deprecated
    # assertAlmostEquals alias.
    self.assertAlmostEqual(np.floor(get_AICc(rslt)), 894.0)
    self.assertAlmostEqual(np.floor(get_AIC(rslt)), 890.0)
    self.assertAlmostEqual(np.floor(get_BIC(rslt)), 944.0)
    self.assertAlmostEqual(np.round(get_CV(rslt), 2), 18.25)

    # (reference column suffix, relative tolerance); the position in this
    # table is the column index in params / bse / tvalues.
    coefficients = (('Intercept', 1e-04), ('PctRural', 1e-04),
                    ('PctPov', 1e-04), ('PctBlack', 1e-02))
    for idx, (name, rtol) in enumerate(coefficients):
        np.testing.assert_allclose(ref.by_col(' est_' + name),
                                   rslt.params[:, idx], rtol=rtol)
        np.testing.assert_allclose(ref.by_col(' se_' + name),
                                   rslt.bse[:, idx], rtol=rtol)
        np.testing.assert_allclose(ref.by_col(' t_' + name),
                                   rslt.tvalues[:, idx], rtol=rtol)

    np.testing.assert_allclose(ref.by_col(' yhat'), rslt.mu, rtol=1e-05)
    np.testing.assert_allclose(np.array(ref.by_col(' residual')),
                               rslt.resid_response, rtol=1e-04)
    np.testing.assert_allclose(column(' std_residual'), rslt.std_res,
                               rtol=1e-04)
    np.testing.assert_allclose(column(' localR2'), rslt.localR2, rtol=1e-05)
    np.testing.assert_allclose(column(' influence'), rslt.influ, rtol=1e-04)
    # rtol=1 allows a 100% relative deviation: only gross errors are caught.
    np.testing.assert_allclose(column(' CooksD'), rslt.cooksD, rtol=1e-00)
def test_GS_F(self):
    """Fixed-bandwidth Gaussian GWR with ``kernel='gaussian'`` vs. GWR4.

    Reference values come from the ``self.GS_F`` listing loaded in
    ``setUp``.  Its column names carry a leading space, which must be kept.
    """
    ref = self.GS_F

    def column(label):
        # Reference column as an (n, 1) array.
        return np.array(ref.by_col(label)).reshape((-1, 1))

    model = GWR(self.coords, self.y, self.X, bw=87308.298,
                kernel='gaussian', fixed=True)
    rslt = model.fit()

    # Information criteria are compared on their integer part, CV to two
    # decimals.  assertAlmostEqual replaces the deprecated
    # assertAlmostEquals alias.
    self.assertAlmostEqual(np.floor(get_AICc(rslt)), 895.0)
    self.assertAlmostEqual(np.floor(get_AIC(rslt)), 890.0)
    self.assertAlmostEqual(np.floor(get_BIC(rslt)), 943.0)
    self.assertAlmostEqual(np.around(get_CV(rslt), 2), 18.21)

    # (reference column suffix, relative tolerance); the position in this
    # table is the column index in params / bse / tvalues.
    coefficients = (('Intercept', 1e-04), ('PctRural', 1e-04),
                    ('PctPov', 1e-04), ('PctBlack', 1e-02))
    for idx, (name, rtol) in enumerate(coefficients):
        np.testing.assert_allclose(ref.by_col(' est_' + name),
                                   rslt.params[:, idx], rtol=rtol)
        np.testing.assert_allclose(ref.by_col(' se_' + name),
                                   rslt.bse[:, idx], rtol=rtol)
        np.testing.assert_allclose(ref.by_col(' t_' + name),
                                   rslt.tvalues[:, idx], rtol=rtol)

    np.testing.assert_allclose(ref.by_col(' yhat'), rslt.mu, rtol=1e-05)
    np.testing.assert_allclose(np.array(ref.by_col(' residual')),
                               rslt.resid_response, rtol=1e-04)
    np.testing.assert_allclose(column(' std_residual'), rslt.std_res,
                               rtol=1e-04)
    np.testing.assert_allclose(column(' localR2'), rslt.localR2, rtol=1e-05)
    np.testing.assert_allclose(column(' influence'), rslt.influ, rtol=1e-04)
    # rtol=1 allows a 100% relative deviation: only gross errors are caught.
    np.testing.assert_allclose(column(' CooksD'), rslt.cooksD, rtol=1e-00)
self.GS_NN.by_col(' se_PctPov') + t_pov = self.GS_NN.by_col(' t_PctPov') + est_black = self.GS_NN.by_col(' est_PctBlack') + se_black = self.GS_NN.by_col(' se_PctBlack') + t_black = self.GS_NN.by_col(' t_PctBlack') + yhat = self.GS_NN.by_col(' yhat') + res = np.array(self.GS_NN.by_col(' residual')) + std_res = np.array(self.GS_NN.by_col(' std_residual')).reshape((-1,1)) + localR2 = np.array(self.GS_NN.by_col(' localR2')).reshape((-1,1)) + inf = np.array(self.GS_NN.by_col(' influence')).reshape((-1,1)) + cooksD = np.array(self.GS_NN.by_col(' CooksD')).reshape((-1,1)) + + model = GWR(self.coords, self.y, self.X, bw=49.000, + kernel='gaussian', fixed=False) + rslt = model.fit() + + AICc = get_AICc(rslt) + AIC = get_AIC(rslt) + BIC = get_BIC(rslt) + CV = get_CV(rslt) + + self.assertAlmostEquals(np.floor(AICc), 896) + self.assertAlmostEquals(np.floor(AIC), 894.0) + self.assertAlmostEquals(np.floor(BIC), 922.0) + self.assertAlmostEquals(np.around(CV, 2), 17.91) + np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04) + np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-04) + np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-04) + np.testing.assert_allclose(est_rural, rslt.params[:,1], rtol=1e-04) + np.testing.assert_allclose(se_rural, rslt.bse[:,1], rtol=1e-04) + np.testing.assert_allclose(t_rural, rslt.tvalues[:,1], rtol=1e-04) + np.testing.assert_allclose(est_pov, rslt.params[:,2], rtol=1e-04) + np.testing.assert_allclose(se_pov, rslt.bse[:,2], rtol=1e-04) + np.testing.assert_allclose(t_pov, rslt.tvalues[:,2], rtol=1e-04) + np.testing.assert_allclose(est_black, rslt.params[:,3], rtol=1e-02) + np.testing.assert_allclose(se_black, rslt.bse[:,3], rtol=1e-02) + np.testing.assert_allclose(t_black, rslt.tvalues[:,3], rtol=1e-02) + np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-05) + np.testing.assert_allclose(res, rslt.resid_response, rtol=1e-04) + np.testing.assert_allclose(std_res, rslt.std_res, rtol=1e-04) + 
def test_FBGWR(self):
    """Fit FBGWR with three preset bandwidths and compare with stored results.

    Expected arrays come from the ``self.FB`` mapping loaded in ``setUp``;
    ``self.XB`` and ``self.err`` are passed straight to the model.
    """
    bandwidths = [157.0, 65.0, 52.0]
    fitted = FBGWR(self.coords, self.y, self.X, bandwidths,
                   XB=self.XB, err=self.err, constant=False).fit()
    expected = self.FB

    # (result attribute, key in the stored mapping, absolute tolerance)
    checks = (('predy', 'predy', 1e-07),
              ('params', 'params', 1e-07),
              ('resid_response', 'u', 1e-05))
    for attr, key, atol in checks:
        np.testing.assert_allclose(getattr(fitted, attr), expected[key],
                                   atol=atol)

    np.testing.assert_almost_equal(fitted.resid_ss, 6339.3497144025841)
def setUp(self):
    """Load the Tokyo mortality data and the GWR4 reference listings.

    Sets ``self.coords`` (list of centroid pairs), ``self.y`` and
    ``self.off`` as ``(n, 1)`` columns, ``self.X`` (four predictor columns
    stacked side by side) and one reference table per listing file.
    """
    def column(table, name):
        # One CSV column as an (n, 1) array.
        return np.array(table.by_col(name)).reshape((-1, 1))

    data = pysal.open(pysal.examples.get_path('Tokyomortality.csv'),
                      mode='Ur')
    # list(...) keeps coords reusable: on Python 3 a bare zip() is a
    # one-shot iterator.  On Python 2 this is just a copy of the list.
    self.coords = list(zip(data.by_col('X_CENTROID'),
                           data.by_col('Y_CENTROID')))
    self.y = column(data, 'db2564')
    self.off = column(data, 'eb2564')
    self.X = np.hstack([column(data, name)
                        for name in ('OCC_TEC', 'OWNH', 'POP65', 'UNEMP')])

    # Reference listings, one per scenario exercised by the tests.
    for attr in ('BS_F', 'BS_NN', 'GS_F', 'GS_NN', 'BS_NN_OFF'):
        path = pysal.examples.get_path('tokyo_' + attr + '_listwise.csv')
        setattr(self, attr, pysal.open(path))
test_BS_F(self): + est_Int = self.BS_F.by_col(' est_Intercept') + se_Int = self.BS_F.by_col(' se_Intercept') + t_Int = self.BS_F.by_col(' t_Intercept') + est_OCC = self.BS_F.by_col(' est_OCC_TEC') + se_OCC = self.BS_F.by_col(' se_OCC_TEC') + t_OCC = self.BS_F.by_col(' t_OCC_TEC') + est_OWN = self.BS_F.by_col(' est_OWNH') + se_OWN = self.BS_F.by_col(' se_OWNH') + t_OWN = self.BS_F.by_col(' t_OWNH') + est_POP = self.BS_F.by_col(' est_POP65') + se_POP = self.BS_F.by_col(' se_POP65') + t_POP = self.BS_F.by_col(' t_POP65') + est_UNEMP = self.BS_F.by_col(' est_UNEMP') + se_UNEMP = self.BS_F.by_col(' se_UNEMP') + t_UNEMP = self.BS_F.by_col(' t_UNEMP') + yhat = self.BS_F.by_col(' yhat') + pdev = np.array(self.BS_F.by_col(' localpdev')).reshape((-1,1)) + + model = GWR(self.coords, self.y, self.X, bw=26029.625, family=Poisson(), + kernel='bisquare', fixed=True) + rslt = model.fit() + + AICc = get_AICc(rslt) + AIC = get_AIC(rslt) + BIC = get_BIC(rslt) + + self.assertAlmostEquals(np.floor(AICc), 13294.0) + self.assertAlmostEquals(np.floor(AIC), 13247.0) + self.assertAlmostEquals(np.floor(BIC), 13485.0) + np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-05) + np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-03) + np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-03) + np.testing.assert_allclose(est_OCC, rslt.params[:,1], rtol=1e-04) + np.testing.assert_allclose(se_OCC, rslt.bse[:,1], rtol=1e-02) + np.testing.assert_allclose(t_OCC, rslt.tvalues[:,1], rtol=1e-02) + np.testing.assert_allclose(est_OWN, rslt.params[:,2], rtol=1e-04) + np.testing.assert_allclose(se_OWN, rslt.bse[:,2], rtol=1e-03) + np.testing.assert_allclose(t_OWN, rslt.tvalues[:,2], rtol=1e-03) + np.testing.assert_allclose(est_POP, rslt.params[:,3], rtol=1e-04) + np.testing.assert_allclose(se_POP, rslt.bse[:,3], rtol=1e-02) + np.testing.assert_allclose(t_POP, rslt.tvalues[:,3], rtol=1e-02) + np.testing.assert_allclose(est_UNEMP, rslt.params[:,4], rtol=1e-04) + 
def test_BS_NN(self):
    """Poisson GWR with ``kernel='bisquare'``, ``fixed=False``, ``bw=50`` vs. GWR4.

    Reference values come from the ``self.BS_NN`` listing loaded in
    ``setUp``.  Its column names carry a leading space, which must be kept.
    """
    ref = self.BS_NN

    model = GWR(self.coords, self.y, self.X, bw=50, family=Poisson(),
                kernel='bisquare', fixed=False)
    rslt = model.fit()

    # Information criteria are compared on their integer part.
    # assertAlmostEqual replaces the deprecated assertAlmostEquals alias.
    self.assertAlmostEqual(np.floor(get_AICc(rslt)), 13285)
    self.assertAlmostEqual(np.floor(get_AIC(rslt)), 13259.0)
    self.assertAlmostEqual(np.floor(get_BIC(rslt)), 13442.0)

    # (reference column suffix, tolerance for the estimate); standard
    # errors and t-values all use rtol=1e-02.  The position in this table
    # is the column index in params / bse / tvalues.
    coefficients = (('Intercept', 1e-04), ('OCC_TEC', 1e-03),
                    ('OWNH', 1e-04), ('POP65', 1e-03), ('UNEMP', 1e-04))
    for idx, (name, est_rtol) in enumerate(coefficients):
        np.testing.assert_allclose(ref.by_col(' est_' + name),
                                   rslt.params[:, idx], rtol=est_rtol)
        np.testing.assert_allclose(ref.by_col(' se_' + name),
                                   rslt.bse[:, idx], rtol=1e-02)
        np.testing.assert_allclose(ref.by_col(' t_' + name),
                                   rslt.tvalues[:, idx], rtol=1e-02)

    np.testing.assert_allclose(ref.by_col(' yhat'), rslt.mu, rtol=1e-04)
    pdev = np.array(ref.by_col(' localpdev')).reshape((-1, 1))
    np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-05)
def test_GS_F(self):
    """Poisson GWR with ``kernel='gaussian'``, ``fixed=True`` vs. GWR4.

    Reference values come from the ``self.GS_F`` listing loaded in
    ``setUp``.  Its column names carry a leading space, which must be kept.
    """
    ref = self.GS_F

    model = GWR(self.coords, self.y, self.X, bw=8764.474, family=Poisson(),
                kernel='gaussian', fixed=True)
    rslt = model.fit()

    # Information criteria are compared on their integer part.
    # assertAlmostEqual replaces the deprecated assertAlmostEquals alias.
    self.assertAlmostEqual(np.floor(get_AICc(rslt)), 11283.0)
    self.assertAlmostEqual(np.floor(get_AIC(rslt)), 11211.0)
    self.assertAlmostEqual(np.floor(get_BIC(rslt)), 11497.0)

    # (reference column suffix, tolerance for the estimate); standard
    # errors and t-values all use rtol=1e-02.  The position in this table
    # is the column index in params / bse / tvalues.
    coefficients = (('Intercept', 1e-03), ('OCC_TEC', 1e-03),
                    ('OWNH', 1e-03), ('POP65', 1e-02), ('UNEMP', 1e-02))
    for idx, (name, est_rtol) in enumerate(coefficients):
        np.testing.assert_allclose(ref.by_col(' est_' + name),
                                   rslt.params[:, idx], rtol=est_rtol)
        np.testing.assert_allclose(ref.by_col(' se_' + name),
                                   rslt.bse[:, idx], rtol=1e-02)
        np.testing.assert_allclose(ref.by_col(' t_' + name),
                                   rslt.tvalues[:, idx], rtol=1e-02)

    np.testing.assert_allclose(ref.by_col(' yhat'), rslt.mu, rtol=1e-04)
    pdev = np.array(ref.by_col(' localpdev')).reshape((-1, 1))
    np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-05)
def setUp(self):
    """Load the landslides data and the GWR4 reference listings.

    Sets ``self.coords`` (list of X/Y pairs), ``self.y`` as an ``(n, 1)``
    column, ``self.X`` (six predictor columns stacked side by side) and one
    reference table per listing file.
    """
    def column(table, name):
        # One CSV column as an (n, 1) array.
        return np.array(table.by_col(name)).reshape((-1, 1))

    data = pysal.open(pysal.examples.get_path('landslides.csv'))
    # list(...) keeps coords reusable: on Python 3 a bare zip() is a
    # one-shot iterator.  On Python 2 this is just a copy of the list.
    self.coords = list(zip(data.by_col('X'), data.by_col('Y')))
    self.y = column(data, 'Landslid')
    predictors = ('Elev', 'Slope', 'SinAspct', 'CosAspct', 'AbsSouth',
                  'DistStrm')
    self.X = np.hstack([column(data, name) for name in predictors])

    # Reference listings, one per scenario exercised by the tests.
    for attr in ('BS_F', 'BS_NN', 'GS_F', 'GS_NN'):
        path = pysal.examples.get_path(
            'clearwater_' + attr + '_listwise.csv')
        setattr(self, attr, pysal.open(path))
def test_BS_NN(self):
    """Binomial GWR with ``kernel='bisquare'``, ``fixed=False``, ``bw=158`` vs. GWR4.

    Reference values come from the ``self.BS_NN`` listing loaded in
    ``setUp``.  Its column names carry a leading space, which must be kept.
    The tolerances are very loose: an rtol of 1 or more allows deviations
    of at least 100%, so these checks only catch gross errors.
    """
    ref = self.BS_NN

    model = GWR(self.coords, self.y, self.X, bw=158, family=Binomial(),
                kernel='bisquare', fixed=False)
    rslt = model.fit()

    # Information criteria are compared on their integer part.
    # assertAlmostEqual replaces the deprecated assertAlmostEquals alias.
    self.assertAlmostEqual(np.floor(get_AICc(rslt)), 277.0)
    self.assertAlmostEqual(np.floor(get_AIC(rslt)), 271.0)
    self.assertAlmostEqual(np.floor(get_BIC(rslt)), 358.0)

    # (reference column suffix, rtol for estimate, std. error, t-value);
    # the position in this table is the column index in
    # params / bse / tvalues.
    coefficients = (('Intercept', 1e-00, 1e-00, 1e-00),
                    ('Elev', 1e-00, 1e-00, 1e-00),
                    ('Slope', 1e-00, 1e-00, 1e-00),
                    ('SinAspct', 1e01, 1e01, 1e01),
                    ('CosAspct', 1e01, 1e01, 1e01),
                    ('AbsSouth', 1e01, 1e01, 1e01),
                    ('DistStrm', 1e03, 1e01, 1e03))
    for idx, (name, est_rtol, se_rtol, t_rtol) in enumerate(coefficients):
        np.testing.assert_allclose(ref.by_col(' est_' + name),
                                   rslt.params[:, idx], rtol=est_rtol)
        np.testing.assert_allclose(ref.by_col(' se_' + name),
                                   rslt.bse[:, idx], rtol=se_rtol)
        np.testing.assert_allclose(ref.by_col(' t_' + name),
                                   rslt.tvalues[:, idx], rtol=t_rtol)

    np.testing.assert_allclose(ref.by_col(' yhat'), rslt.mu, rtol=1e-01)
    # Known gap: comparing the ' localpdev' column against rslt.pDev at
    # rtol=1e-05 fails - likely due to compound rounding errors.
    # Has been tested using statsmodels.family calculations and
    # code from Jing's python version, which both yield the same.
AICc = get_AICc(rslt) + AIC = get_AIC(rslt) + BIC = get_BIC(rslt) + + self.assertAlmostEquals(np.floor(AICc), 276.0) + self.assertAlmostEquals(np.floor(AIC), 272.0) + self.assertAlmostEquals(np.floor(BIC), 341.0) + np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00) + np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00) + np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00) + np.testing.assert_allclose(est_elev, rslt.params[:,1], rtol=1e-00) + np.testing.assert_allclose(se_elev, rslt.bse[:,1], rtol=1e-00) + np.testing.assert_allclose(t_elev, rslt.tvalues[:,1], rtol=1e-00) + np.testing.assert_allclose(est_slope, rslt.params[:,2], rtol=1e-00) + np.testing.assert_allclose(se_slope, rslt.bse[:,2], rtol=1e-00) + np.testing.assert_allclose(t_slope, rslt.tvalues[:,2], rtol=1e-00) + np.testing.assert_allclose(est_sin, rslt.params[:,3], rtol=1e01) + np.testing.assert_allclose(se_sin, rslt.bse[:,3], rtol=1e01) + np.testing.assert_allclose(t_sin, rslt.tvalues[:,3], rtol=1e01) + np.testing.assert_allclose(est_cos, rslt.params[:,4], rtol=1e01) + np.testing.assert_allclose(se_cos, rslt.bse[:,4], rtol=1e01) + np.testing.assert_allclose(t_cos, rslt.tvalues[:,4], rtol=1e01) + np.testing.assert_allclose(est_south, rslt.params[:,5], rtol=1e01) + np.testing.assert_allclose(se_south, rslt.bse[:,5], rtol=1e01) + np.testing.assert_allclose(t_south, rslt.tvalues[:,5], rtol=1e01) + np.testing.assert_allclose(est_strm, rslt.params[:,6], rtol=1e02) + np.testing.assert_allclose(se_strm, rslt.bse[:,6], rtol=1e01) + np.testing.assert_allclose(t_strm, rslt.tvalues[:,6], rtol=1e02) + np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-01) + #This test fails - likely due to compound rounding errors + #Has been tested using statsmodels.family calculations and + #code from Jing's python version, which both yield the same + #np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-05) + + def test_GS_NN(self): + est_Int = self.GS_NN.by_col(' est_Intercept') + se_Int = 
self.GS_NN.by_col(' se_Intercept') + t_Int = self.GS_NN.by_col(' t_Intercept') + est_elev = self.GS_NN.by_col(' est_Elev') + se_elev = self.GS_NN.by_col(' se_Elev') + t_elev = self.GS_NN.by_col(' t_Elev') + est_slope = self.GS_NN.by_col(' est_Slope') + se_slope = self.GS_NN.by_col(' se_Slope') + t_slope = self.GS_NN.by_col(' t_Slope') + est_sin = self.GS_NN.by_col(' est_SinAspct') + se_sin = self.GS_NN.by_col(' se_SinAspct') + t_sin = self.GS_NN.by_col(' t_SinAspct') + est_cos = self.GS_NN.by_col(' est_CosAspct') + se_cos = self.GS_NN.by_col(' se_CosAspct') + t_cos = self.GS_NN.by_col(' t_CosAspct') + est_south = self.GS_NN.by_col(' est_AbsSouth') + se_south = self.GS_NN.by_col(' se_AbsSouth') + t_south = self.GS_NN.by_col(' t_AbsSouth') + est_strm = self.GS_NN.by_col(' est_DistStrm') + se_strm = self.GS_NN.by_col(' se_DistStrm') + t_strm = self.GS_NN.by_col(' t_DistStrm') + yhat = self.GS_NN.by_col(' yhat') + pdev = self.GS_NN.by_col(' localpdev') + + model = GWR(self.coords, self.y, self.X, bw=64, family=Binomial(), + kernel='gaussian', fixed=False) + rslt = model.fit() + + AICc = get_AICc(rslt) + AIC = get_AIC(rslt) + BIC = get_BIC(rslt) + + self.assertAlmostEquals(np.floor(AICc), 276.0) + self.assertAlmostEquals(np.floor(AIC), 273.0) + self.assertAlmostEquals(np.floor(BIC), 331.0) + np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00) + np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00) + np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00) + np.testing.assert_allclose(est_elev, rslt.params[:,1], rtol=1e-00) + np.testing.assert_allclose(se_elev, rslt.bse[:,1], rtol=1e-00) + np.testing.assert_allclose(t_elev, rslt.tvalues[:,1], rtol=1e-00) + np.testing.assert_allclose(est_slope, rslt.params[:,2], rtol=1e-00) + np.testing.assert_allclose(se_slope, rslt.bse[:,2], rtol=1e-00) + np.testing.assert_allclose(t_slope, rslt.tvalues[:,2], rtol=1e-00) + np.testing.assert_allclose(est_sin, rslt.params[:,3], rtol=1e01) + 
np.testing.assert_allclose(se_sin, rslt.bse[:,3], rtol=1e01) + np.testing.assert_allclose(t_sin, rslt.tvalues[:,3], rtol=1e01) + np.testing.assert_allclose(est_cos, rslt.params[:,4], rtol=1e01) + np.testing.assert_allclose(se_cos, rslt.bse[:,4], rtol=1e01) + np.testing.assert_allclose(t_cos, rslt.tvalues[:,4], rtol=1e01) + np.testing.assert_allclose(est_south, rslt.params[:,5], rtol=1e01) + np.testing.assert_allclose(se_south, rslt.bse[:,5], rtol=1e01) + np.testing.assert_allclose(t_south, rslt.tvalues[:,5], rtol=1e01) + np.testing.assert_allclose(est_strm, rslt.params[:,6], rtol=1e02) + np.testing.assert_allclose(se_strm, rslt.bse[:,6], rtol=1e01) + np.testing.assert_allclose(t_strm, rslt.tvalues[:,6], rtol=1e02) + np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-00) + #This test fails - likely due to compound rounding errors + #Has been tested using statsmodels.family calculations and + #code from Jing's python version, which both yield the same + #np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-05) + +if __name__ == '__main__': + unittest.main() diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/tests/test_kernels.py b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/tests/test_kernels.py new file mode 100644 index 0000000..ea044b9 --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/tests/test_kernels.py @@ -0,0 +1,84 @@ +import unittest +import numpy as np +import pysal +from pysal.contrib.gwr.kernels import * + +PEGP = pysal.examples.get_path + +class TestKernels(unittest.TestCase): + def setUp(self): + np.random.seed(1234) + x = np.arange(1,6) + y = np.arange(5,0, -1) + np.random.shuffle(x) + np.random.shuffle(y) + self.coords = np.array(zip(x, y)) + self.fix_gauss_kern = np.array([ + [ 1. , 0.38889556, 0.48567179, 0.48567179, 0.89483932], + [ 0.38889556, 1. , 0.89483932, 0.64118039, 0.48567179], + [ 0.48567179, 0.89483932, 1. 
, 0.89483932, 0.48567179], + [ 0.48567179, 0.64118039, 0.89483932, 1. , 0.38889556], + [ 0.89483932, 0.48567179, 0.48567179, 0.38889556, 1. ]]) + self.adapt_gauss_kern = np.array([ + [ 1. , 0.52004183, 0.60653072, 0.60653072, 0.92596109], + [ 0.34559083, 1. , 0.88249692, 0.60653072, 0.44374738], + [ 0.03877423, 0.60653072, 1. , 0.60653072, 0.03877423], + [ 0.44374738, 0.60653072, 0.88249692, 1. , 0.34559083], + [ 0.92596109, 0.60653072, 0.60653072, 0.52004183, 1. ]]) + self.fix_bisquare_kern = np.array([ + [ 1. , 0. , 0. , 0. , 0.60493827], + [ 0. , 1. , 0.60493827, 0.01234568, 0. ], + [ 0. , 0.60493827, 1. , 0.60493827, 0. ], + [ 0. , 0.01234568, 0.60493827, 1. , 0. ], + [ 0.60493827, 0. , 0. , 0. , 1. ]]) + self.adapt_bisquare_kern = np.array([ + [ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00, + 3.99999881e-14, 7.15976383e-01], + [ 0.00000000e+00, 1.00000000e+00, 5.62500075e-01, + 3.99999881e-14, 0.00000000e+00], + [ 0.00000000e+00, 3.99999881e-14, 1.00000000e+00, + 3.99999881e-14, 0.00000000e+00], + [ 0.00000000e+00, 3.99999881e-14, 5.62500075e-01, + 1.00000000e+00, 0.00000000e+00], + [ 7.15976383e-01, 0.00000000e+00, 3.99999881e-14, + 0.00000000e+00, 1.00000000e+00]]) + self.fix_exp_kern = np.array([ + [ 1. , 0.2529993 , 0.30063739, 0.30063739, 0.62412506], + [ 0.2529993 , 1. , 0.62412506, 0.38953209, 0.30063739], + [ 0.30063739, 0.62412506, 1. , 0.62412506, 0.30063739], + [ 0.30063739, 0.38953209, 0.62412506, 1. , 0.2529993 ], + [ 0.62412506, 0.30063739, 0.30063739, 0.2529993 , 1. ]]) + self.adapt_exp_kern = np.array([ + [ 1. , 0.31868771, 0.36787948, 0.36787948, 0.67554721], + [ 0.23276223, 1. , 0.60653069, 0.36787948, 0.27949951], + [ 0.07811997, 0.36787948, 1. , 0.36787948, 0.07811997], + [ 0.27949951, 0.36787948, 0.60653069, 1. , 0.23276223], + [ 0.67554721, 0.36787948, 0.36787948, 0.31868771, 1. 
]]) + + def test_fix_gauss(self): + kern = fix_gauss(self.coords, 3) + np.testing.assert_allclose(kern, self.fix_gauss_kern) + + def test_adapt_gauss(self): + kern = adapt_gauss(self.coords, 3) + np.testing.assert_allclose(kern, self.adapt_gauss_kern) + + def test_fix_biqsquare(self): + kern = fix_bisquare(self.coords, 3) + np.testing.assert_allclose(kern, self.fix_bisquare_kern, + atol=1e-01) + + def test_adapt_bisqaure(self): + kern = adapt_bisquare(self.coords, 3) + np.testing.assert_allclose(kern, self.adapt_bisquare_kern, atol=1e-012) + + def test_fix_exp(self): + kern = fix_exp(self.coords, 3) + np.testing.assert_allclose(kern, self.fix_exp_kern) + + def test_adapt_exp(self): + kern = adapt_exp(self.coords, 3) + np.testing.assert_allclose(kern, self.adapt_exp_kern) + +if __name__ == '__main__': + unittest.main() diff --git a/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/tests/test_sel_bw.py b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/tests/test_sel_bw.py new file mode 100644 index 0000000..47c6d9d --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr/base/tests/test_sel_bw.py @@ -0,0 +1,139 @@ + +""" +GWR is tested against results from GWR4 +""" + +import unittest +import pickle as pk +from pysal.contrib.glm.family import Gaussian, Poisson, Binomial +from pysal.contrib.gwr.sel_bw import Sel_BW +import numpy as np +import pysal + +class TestSelBW(unittest.TestCase): + def setUp(self): + data = pysal.open(pysal.examples.get_path('GData_utm.csv')) + self.coords = zip(data.by_col('X'), data.by_col('Y')) + self.y = np.array(data.by_col('PctBach')).reshape((-1,1)) + rural = np.array(data.by_col('PctRural')).reshape((-1,1)) + pov = np.array(data.by_col('PctPov')).reshape((-1,1)) + black = np.array(data.by_col('PctBlack')).reshape((-1,1)) + self.X = np.hstack([rural, pov, black]) + self.XB = pk.load(open(pysal.examples.get_path('XB.p'), 'r')) + self.err = pk.load(open(pysal.examples.get_path('err.p'), 
'r')) + + def test_golden_fixed_AICc(self): + bw1 = 211027.34 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='bisquare', + fixed=True).search(criterion='AICc') + self.assertAlmostEqual(bw1, bw2) + + def test_golden_adapt_AICc(self): + bw1 = 93.0 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='bisquare', + fixed=False).search(criterion='AICc') + self.assertAlmostEqual(bw1, bw2) + + def test_golden_fixed_AIC(self): + bw1 = 76169.15 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=True).search(criterion='AIC') + self.assertAlmostEqual(bw1, bw2) + + def test_golden_adapt_AIC(self): + bw1 = 50.0 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=False).search(criterion='AIC') + self.assertAlmostEqual(bw1, bw2) + + def test_golden_fixed_BIC(self): + bw1 = 279451.43 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=True).search(criterion='BIC') + self.assertAlmostEqual(bw1, bw2) + + def test_golden_adapt_BIC(self): + bw1 = 62.0 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=False).search(criterion='BIC') + self.assertAlmostEqual(bw1, bw2) + + def test_golden_fixed_CV(self): + bw1 = 130406.67 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=True).search(criterion='CV') + self.assertAlmostEqual(bw1, bw2) + + def test_golden_adapt_CV(self): + bw1 = 68.0 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=False).search(criterion='CV') + self.assertAlmostEqual(bw1, bw2) + + def test_interval_fixed_AICc(self): + bw1 = 211025.0#211027.00 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='bisquare', + fixed=True).search(criterion='AICc', search='interval', bw_min=211001., + bw_max=211035.0, interval=2) + self.assertAlmostEqual(bw1, bw2) + + def test_interval_adapt_AICc(self): + bw1 = 93.0 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='bisquare', + fixed=False).search(criterion='AICc', search='interval', + bw_min=90.0, bw_max=95.0, 
interval=1) + self.assertAlmostEqual(bw1, bw2) + + def test_interval_fixed_AIC(self): + bw1 = 76175.0#76169.00 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=True).search(criterion='AIC', search='interval', + bw_min=76161.0, bw_max=76175.0, interval=1) + self.assertAlmostEqual(bw1, bw2) + + def test_interval_adapt_AIC(self): + bw1 = 40.0#50.0 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=False).search(criterion='AIC', search='interval', bw_min=40.0, + bw_max=60.0, interval=2) + self.assertAlmostEqual(bw1, bw2) + + def test_interval_fixed_BIC(self): + bw1 = 279461.0#279451.00 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=True).search(criterion='BIC', search='interval', bw_min=279441.0, + bw_max=279461.0, interval=2) + self.assertAlmostEqual(bw1, bw2) + + def test_interval_adapt_BIC(self): + bw1 = 62.0 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=False).search(criterion='BIC', search='interval', + bw_min=52.0, bw_max=72.0, interval=2) + self.assertAlmostEqual(bw1, bw2) + + def test_interval_fixed_CV(self): + bw1 = 130400.0#130406.00 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=True).search(criterion='CV', search='interval', bw_min=130400.0, + bw_max=130410.0, interval=1) + self.assertAlmostEqual(bw1, bw2) + + def test_interval_adapt_CV(self): + bw1 = 62.0#68.0 + bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian', + fixed=False).search(criterion='CV', search='interval', bw_min=60.0, + bw_max=76.0 , interval=2) + self.assertAlmostEqual(bw1, bw2) + + def test_FBGWR_AIC(self): + bw1 = [157.0, 65.0, 52.0] + sel = Sel_BW(self.coords, self.y, self.X, fb=True, kernel='bisquare', + constant=False) + bw2 = sel.search(tol_fb=1e-03) + np.testing.assert_allclose(bw1, bw2) + np.testing.assert_allclose(sel.XB, self.XB, atol=1e-05) + np.testing.assert_allclose(sel.err, self.err, atol=1e-05) + +if __name__ == '__main__': + unittest.main() diff --git 
a/release/python/0.7.0/crankshaft/crankshaft/regression/gwr_cs.py b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr_cs.py new file mode 100644 index 0000000..9ccaefb --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/regression/gwr_cs.py @@ -0,0 +1,202 @@ +""" + Geographically weighted regression +""" +import numpy as np +from gwr.base.gwr import GWR as PySAL_GWR +from gwr.base.sel_bw import Sel_BW +import json +from crankshaft.analysis_data_provider import AnalysisDataProvider +import plpy + + +class GWR: + def __init__(self, data_provider=None): + if data_provider: + self.data_provider = data_provider + else: + self.data_provider = AnalysisDataProvider() + + def gwr(self, subquery, dep_var, ind_vars, + bw=None, fixed=False, kernel='bisquare', + geom_col='the_geom', id_col='cartodb_id'): + """ + subquery: 'select * from demographics' + dep_var: 'pctbachelor' + ind_vars: ['intercept', 'pctpov', 'pctrural', 'pctblack'] + bw: value of bandwidth, if None then select optimal + fixed: False (kNN) or True ('distance') + kernel: 'bisquare' (default), or 'exponential', 'gaussian' + """ + + params = {'geom_col': geom_col, + 'id_col': id_col, + 'subquery': subquery, + 'dep_var': dep_var, + 'ind_vars': ind_vars} + + # get data from data provider + query_result = self.data_provider.get_gwr(params) + + # exit if data to analyze is empty + if len(query_result) == 0: + plpy.error('No data passed to analysis or independent variables ' + 'are all null-valued') + + # unique ids and variable names list + rowid = np.array(query_result[0]['rowid'], dtype=np.int) + + # x, y are centroids of input geometries + x = np.array(query_result[0]['x'], dtype=np.float) + y = np.array(query_result[0]['y'], dtype=np.float) + coords = zip(x, y) + + # extract dependent variable + Y = np.array(query_result[0]['dep_var'], dtype=np.float).reshape((-1, 1)) + + n = Y.shape[0] + k = len(ind_vars) + X = np.zeros((n, k)) + + # extract query result + for attr in range(0, k): + attr_name 
= 'attr' + str(attr + 1) + X[:, attr] = np.array( + query_result[0][attr_name], dtype=np.float).flatten() + + # add intercept variable name + ind_vars.insert(0, 'intercept') + + # calculate bandwidth if none is supplied + if bw is None: + bw = Sel_BW(coords, Y, X, + fixed=fixed, kernel=kernel).search() + model = PySAL_GWR(coords, Y, X, bw, + fixed=fixed, kernel=kernel).fit() + + # containers for outputs + coeffs = [] + stand_errs = [] + t_vals = [] + filtered_t_vals = [] + + # extracted model information + c_alpha = model.adj_alpha + filtered_t = model.filter_tvals(c_alpha[1]) + predicted = model.predy.flatten() + residuals = model.resid_response + r_squared = model.localR2.flatten() + bw = np.repeat(float(bw), n) + + # create lists of json objs for model outputs + for idx in xrange(n): + coeffs.append(json.dumps({var: model.params[idx, k] + for k, var in enumerate(ind_vars)})) + stand_errs.append(json.dumps({var: model.bse[idx, k] + for k, var in enumerate(ind_vars)})) + t_vals.append(json.dumps({var: model.tvalues[idx, k] + for k, var in enumerate(ind_vars)})) + filtered_t_vals.append( + json.dumps({var: filtered_t[idx, k] + for k, var in enumerate(ind_vars)})) + + return zip(coeffs, stand_errs, t_vals, filtered_t_vals, + predicted, residuals, r_squared, bw, rowid) + + def gwr_predict(self, subquery, dep_var, ind_vars, + bw=None, fixed=False, kernel='bisquare', + geom_col='the_geom', id_col='cartodb_id'): + """ + subquery: 'select * from demographics' + dep_var: 'pctbachelor' + ind_vars: ['intercept', 'pctpov', 'pctrural', 'pctblack'] + bw: value of bandwidth, if None then select optimal + fixed: False (kNN) or True ('distance') + kernel: 'bisquare' (default), or 'exponential', 'gaussian' + """ + + params = {'geom_col': geom_col, + 'id_col': id_col, + 'subquery': subquery, + 'dep_var': dep_var, + 'ind_vars': ind_vars} + + # get data from data provider + query_result = self.data_provider.get_gwr_predict(params) + + # exit if data to analyze is empty + if 
len(query_result) == 0: + plpy.error('No data passed to analysis or independent variables ' + 'are all null-valued') + + # unique ids and variable names list + rowid = np.array(query_result[0]['rowid'], dtype=np.int) + + x = np.array(query_result[0]['x'], dtype=np.float) + y = np.array(query_result[0]['y'], dtype=np.float) + coords = np.array(zip(x, y), dtype=np.float) + + # extract dependent variable + Y = np.array(query_result[0]['dep_var']).reshape((-1, 1)) + + n = Y.shape[0] + k = len(ind_vars) + X = np.empty((n, k), dtype=np.float) + + for attr in range(0, k): + attr_name = 'attr' + str(attr + 1) + X[:, attr] = np.array( + query_result[0][attr_name], dtype=np.float).flatten() + + # add intercept variable name + ind_vars.insert(0, 'intercept') + + # split data into "training" and "test" for predictions + # create index to split based on null y values + train = np.where(Y != np.array(None))[0] + test = np.where(Y == np.array(None))[0] + + # report error if there is no data to predict + if len(test) < 1: + plpy.error('No rows flagged for prediction: verify that rows ' + 'denoting prediction locations have a dependent ' + 'variable value of `null`') + + # split dependent variable (only need training which is non-Null's) + Y_train = Y[train].reshape((-1, 1)) + Y_train = Y_train.astype(np.float) + + # split coords + coords_train = coords[train] + coords_test = coords[test] + + # split explanatory variables + X_train = X[train] + X_test = X[test] + + # calculate bandwidth if none is supplied + if bw is None: + bw = Sel_BW(coords_train, Y_train, X_train, + fixed=fixed, kernel=kernel).search() + + # estimate model and predict at new locations + model = PySAL_GWR(coords_train, Y_train, X_train, + bw, fixed=fixed, + kernel=kernel).predict(coords_test, X_test) + + coeffs = [] + stand_errs = [] + t_vals = [] + r_squared = model.localR2.flatten() + predicted = model.predy.flatten() + + m = len(model.predy) + for idx in xrange(m): + coeffs.append(json.dumps({var: 
model.params[idx, k] + for k, var in enumerate(ind_vars)})) + stand_errs.append(json.dumps({var: model.bse[idx, k] + for k, var in enumerate(ind_vars)})) + t_vals.append(json.dumps({var: model.tvalues[idx, k] + for k, var in enumerate(ind_vars)})) + + return zip(coeffs, stand_errs, t_vals, + r_squared, predicted, rowid[test]) diff --git a/release/python/0.7.0/crankshaft/crankshaft/segmentation/__init__.py b/release/python/0.7.0/crankshaft/crankshaft/segmentation/__init__.py new file mode 100644 index 0000000..b825e85 --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/segmentation/__init__.py @@ -0,0 +1 @@ +from segmentation import * diff --git a/release/python/0.7.0/crankshaft/crankshaft/segmentation/segmentation.py b/release/python/0.7.0/crankshaft/crankshaft/segmentation/segmentation.py new file mode 100644 index 0000000..ed61139 --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/segmentation/segmentation.py @@ -0,0 +1,176 @@ +""" +Segmentation creation and prediction +""" + +import sklearn +import numpy as np +import plpy +from sklearn.ensemble import GradientBoostingRegressor +from sklearn import metrics +from sklearn.cross_validation import train_test_split + +# Lower level functions +#---------------------- + +def replace_nan_with_mean(array): + """ + Input: + @param array: an array of floats which may have null-valued entries + Output: + array with nans filled in with the mean of the dataset + """ + # returns an array of rows and column indices + indices = np.where(np.isnan(array)) + + # iterate through entries which have nan values + for row, col in zip(*indices): + array[row, col] = np.mean(array[~np.isnan(array[:, col]), col]) + + return array + +def get_data(variable, feature_columns, query): + """ + Fetch data from the database, clean, and package into + numpy arrays + Input: + @param variable: name of the target variable + @param feature_columns: list of column names + @param query: subquery that data is pulled from for the 
packaging + Output: + prepared data, packaged into NumPy arrays + """ + + columns = ','.join(['array_agg("{col}") As "{col}"'.format(col=col) for col in feature_columns]) + + try: + data = plpy.execute('''SELECT array_agg("{variable}") As target, {columns} FROM ({query}) As a'''.format( + variable=variable, + columns=columns, + query=query)) + except Exception, e: + plpy.error('Failed to access data to build segmentation model: %s' % e) + + # extract target data from plpy object + target = np.array(data[0]['target']) + + # put n feature data arrays into an n x m array of arrays + features = np.column_stack([np.array(data[0][col], dtype=float) for col in feature_columns]) + + return replace_nan_with_mean(target), replace_nan_with_mean(features) + +# High level interface +# -------------------- + +def create_and_predict_segment_agg(target, features, target_features, target_ids, model_parameters): + """ + Version of create_and_predict_segment that works on arrays that come stright form the SQL calling + the function. + + Input: + @param target: The 1D array of lenth NSamples containing the target variable we want the model to predict + @param features: Thw 2D array of size NSamples * NFeatures that form the imput to the model + @param target_ids: A 1D array of target_ids that will be used to associate the results of the prediction with the rows which they come from + @param model_parameters: A dictionary containing parameters for the model. 
+ """ + + clean_target = replace_nan_with_mean(target) + clean_features = replace_nan_with_mean(features) + target_features = replace_nan_with_mean(target_features) + + model, accuracy = train_model(clean_target, clean_features, model_parameters, 0.2) + prediction = model.predict(target_features) + accuracy_array = [accuracy]*prediction.shape[0] + return zip(target_ids, prediction, np.full(prediction.shape, accuracy_array)) + + + +def create_and_predict_segment(query, variable, target_query, model_params): + """ + generate a segment with machine learning + Stuart Lynn + """ + + ## fetch column names + try: + columns = plpy.execute('SELECT * FROM ({query}) As a LIMIT 1 '.format(query=query))[0].keys() + except Exception, e: + plpy.error('Failed to build segmentation model: %s' % e) + + ## extract column names to be used in building the segmentation model + feature_columns = set(columns) - set([variable, 'cartodb_id', 'the_geom', 'the_geom_webmercator']) + ## get data from database + target, features = get_data(variable, feature_columns, query) + + model, accuracy = train_model(target, features, model_params, 0.2) + cartodb_ids, result = predict_segment(model, feature_columns, target_query) + accuracy_array = [accuracy]*result.shape[0] + return zip(cartodb_ids, result, accuracy_array) + + +def train_model(target, features, model_params, test_split): + """ + Train the Gradient Boosting model on the provided data and calculate the accuracy of the model + Input: + @param target: 1D Array of the variable that the model is to be trianed to predict + @param features: 2D Array NSamples * NFeatures to use in trining the model + @param model_params: A dictionary of model parameters, the full specification can be found on the + scikit learn page for [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) + @parma test_split: The fraction of the data to be withheld for testing the model / calculating the 
accuray + """ + features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=test_split) + model = GradientBoostingRegressor(**model_params) + model.fit(features_train, target_train) + accuracy = calculate_model_accuracy(model, features, target) + return model, accuracy + +def calculate_model_accuracy(model, features, target): + """ + Calculate the mean squared error of the model prediction + Input: + @param model: model trained from input features + @param features: features to make a prediction from + @param target: target to compare prediction to + Output: + mean squared error of the model prection compared to the target + """ + prediction = model.predict(features) + return metrics.mean_squared_error(prediction, target) + +def predict_segment(model, features, target_query): + """ + Use the provided model to predict the values for the new feature set + Input: + @param model: The pretrained model + @features: A list of features to use in the model prediction (list of column names) + @target_query: The query to run to obtain the data to predict on and the cartdb_ids associated with it. + """ + + batch_size = 1000 + joined_features = ','.join(['"{0}"::numeric'.format(a) for a in features]) + + try: + cursor = plpy.cursor('SELECT Array[{joined_features}] As features FROM ({target_query}) As a'.format( + joined_features=joined_features, + target_query=target_query)) + except Exception, e: + plpy.error('Failed to build segmentation model: %s' % e) + + results = [] + + while True: + rows = cursor.fetch(batch_size) + if not rows: + break + batch = np.row_stack([np.array(row['features'], dtype=float) for row in rows]) + + #Need to fix this. Should be global mean. 
This will cause weird effects + batch = replace_nan_with_mean(batch) + prediction = model.predict(batch) + results.append(prediction) + + try: + cartodb_ids = plpy.execute('''SELECT array_agg(cartodb_id ORDER BY cartodb_id) As cartodb_ids FROM ({0}) As a'''.format(target_query))[0]['cartodb_ids'] + except Exception, e: + plpy.error('Failed to build segmentation model: %s' % e) + + return cartodb_ids, np.concatenate(results) diff --git a/release/python/0.7.0/crankshaft/crankshaft/space_time_dynamics/__init__.py b/release/python/0.7.0/crankshaft/crankshaft/space_time_dynamics/__init__.py new file mode 100644 index 0000000..a439286 --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/space_time_dynamics/__init__.py @@ -0,0 +1,2 @@ +"""Import all functions from clustering libraries.""" +from markov import * diff --git a/release/python/0.7.0/crankshaft/crankshaft/space_time_dynamics/markov.py b/release/python/0.7.0/crankshaft/crankshaft/space_time_dynamics/markov.py new file mode 100644 index 0000000..20daaf1 --- /dev/null +++ b/release/python/0.7.0/crankshaft/crankshaft/space_time_dynamics/markov.py @@ -0,0 +1,194 @@ +""" +Spatial dynamics measurements using Spatial Markov +""" + +# TODO: remove all plpy dependencies + +import numpy as np +import pysal as ps +import plpy +import crankshaft.pysal_utils as pu +from crankshaft.analysis_data_provider import AnalysisDataProvider + + +class Markov(object): + def __init__(self, data_provider=None): + if data_provider is None: + self.data_provider = AnalysisDataProvider() + else: + self.data_provider = data_provider + + def spatial_trend(self, subquery, time_cols, num_classes=7, + w_type='knn', num_ngbrs=5, permutations=0, + geom_col='the_geom', id_col='cartodb_id'): + """ + Predict the trends of a unit based on: + 1. history of its transitions to different classes (e.g., 1st + quantile -> 2nd quantile) + 2. 
average class of its neighbors + + Inputs: + @param subquery string: e.g., SELECT the_geom, cartodb_id, + interesting_time_column FROM table_name + @param time_cols list of strings: list of strings of column names + @param num_classes (optional): number of classes to break + distribution of values into. Currently uses quantile bins. + @param w_type string (optional): weight type ('knn' or 'queen') + @param num_ngbrs int (optional): number of neighbors (if knn type) + @param permutations int (optional): number of permutations for test + stats + @param geom_col string (optional): name of column which contains + the geometries + @param id_col string (optional): name of column which has the ids + of the table + + Outputs: + @param trend_up float: probablity that a geom will move to a higher + class + @param trend_down float: probablity that a geom will move to a + lower class + @param trend float: (trend_up - trend_down) / trend_static + @param volatility float: a measure of the volatility based on + probability stddev(prob array) + """ + + if len(time_cols) < 2: + plpy.error('More than one time column needs to be passed') + + params = {"id_col": id_col, + "time_cols": time_cols, + "geom_col": geom_col, + "subquery": subquery, + "num_ngbrs": num_ngbrs} + + result = self.data_provider.get_markov(w_type, params) + + # build weight + weights = pu.get_weight(result, w_type) + weights.transform = 'r' + + # prep time data + t_data = get_time_data(result, time_cols) + + sp_markov_result = ps.Spatial_Markov(t_data, + weights, + k=num_classes, + fixed=False, + permutations=permutations) + + # get lag classes + lag_classes = ps.Quantiles( + ps.lag_spatial(weights, t_data[:, -1]), + k=num_classes).yb + + # look up probablity distribution for each unit according to class and + # lag class + prob_dist = get_prob_dist(sp_markov_result.P, + lag_classes, + sp_markov_result.classes[:, -1]) + + # find the ups and down and overall distribution of each cell + trend_up, trend_down, trend, 
volatility = get_prob_stats(prob_dist, sp_markov_result.classes[:, -1]) + + # output the results + return zip(trend, trend_up, trend_down, volatility, weights.id_order) + + + +def get_time_data(markov_data, time_cols): + """ + Extract the time columns and bin appropriately + """ + num_attrs = len(time_cols) + return np.array([[x['attr' + str(i)] for x in markov_data] + for i in range(1, num_attrs+1)], dtype=float).transpose() + + +# not currently used +def rebin_data(time_data, num_time_per_bin): + """ + Convert an n x l matrix into an (n/m) x l matrix where the values are + reduced (averaged) for the intervening states: + 1 2 3 4 1.5 3.5 + 5 6 7 8 -> 5.5 7.5 + 9 8 7 6 8.5 6.5 + 5 4 3 2 4.5 2.5 + + if m = 2, the 4 x 4 matrix is transformed to a 2 x 4 matrix. + + This process effectively resamples the data at a longer time span n + units longer than the input data. + For cases when there is a remainder (remainder(5/3) = 2), the remaining + two columns are binned together as the last time period, while the + first three are binned together for the first period. 
+ + Input: + @param time_data n x l ndarray: measurements of an attribute at + different time intervals + @param num_time_per_bin int: number of columns to average into a new + column + Output: + ceil(n / m) x l ndarray of resampled time series + """ + + if time_data.shape[1] % num_time_per_bin == 0: + # if fit is perfect, then use it + n_max = time_data.shape[1] / num_time_per_bin + else: + # fit remainders into an additional column + n_max = time_data.shape[1] / num_time_per_bin + 1 + + return np.array( + [time_data[:, num_time_per_bin * i:num_time_per_bin * (i+1)].mean(axis=1) + for i in range(n_max)]).T + + +def get_prob_dist(transition_matrix, lag_indices, unit_indices): + """ + Given an array of transition matrices, look up the probability + associated with the arrangements passed + + Input: + @param transition_matrix ndarray[k,k,k]: + @param lag_indices ndarray: + @param unit_indices ndarray: + + Output: + Array of probability distributions + """ + + return np.array([transition_matrix[(lag_indices[i], unit_indices[i])] + for i in range(len(lag_indices))]) + + +def get_prob_stats(prob_dist, unit_indices): + """ + get the statistics of the probability distributions + + Outputs: + @param trend_up ndarray(float): sum of probabilities for upward + movement (relative to the unit index of that prob) + @param trend_down ndarray(float): sum of probabilities for downward + movement (relative to the unit index of that prob) + @param trend ndarray(float): difference of upward and downward + movements + """ + + num_elements = len(unit_indices) + trend_up = np.empty(num_elements, dtype=float) + trend_down = np.empty(num_elements, dtype=float) + trend = np.empty(num_elements, dtype=float) + + for i in range(num_elements): + trend_up[i] = prob_dist[i, (unit_indices[i]+1):].sum() + trend_down[i] = prob_dist[i, :unit_indices[i]].sum() + if prob_dist[i, unit_indices[i]] > 0.0: + trend[i] = (trend_up[i] - trend_down[i]) / ( + prob_dist[i, unit_indices[i]]) + else: + trend[i] = 
None + + # calculate volatility of distribution + volatility = prob_dist.std(axis=1) + + return trend_up, trend_down, trend, volatility diff --git a/release/python/0.7.0/crankshaft/requirements.txt b/release/python/0.7.0/crankshaft/requirements.txt new file mode 100644 index 0000000..88c0a9e --- /dev/null +++ b/release/python/0.7.0/crankshaft/requirements.txt @@ -0,0 +1,5 @@ +joblib==0.8.3 +numpy==1.6.1 +scipy==0.14.0 +pysal==1.14.3 +scikit-learn==0.14.1 diff --git a/release/python/0.7.0/crankshaft/setup.py b/release/python/0.7.0/crankshaft/setup.py new file mode 100644 index 0000000..a1f9ab2 --- /dev/null +++ b/release/python/0.7.0/crankshaft/setup.py @@ -0,0 +1,49 @@ + +""" +CartoDB Spatial Analysis Python Library +See: +https://github.com/CartoDB/crankshaft +""" + +from setuptools import setup, find_packages + +setup( + name='crankshaft', + + version='0.7.0', + + description='CartoDB Spatial Analysis Python Library', + + url='https://github.com/CartoDB/crankshaft', + + author='Data Services Team - CartoDB', + author_email='dataservices@cartodb.com', + + license='MIT', + + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Mapping comunity', + 'Topic :: Maps :: Mapping Tools', + 'License :: OSI Approved :: MIT License', + 'Programming Language :: Python :: 2.7', + ], + + keywords='maps mapping tools spatial analysis geostatistics', + + packages=find_packages(exclude=['contrib', 'docs', 'tests']), + + extras_require={ + 'dev': ['unittest'], + 'test': ['unittest', 'nose', 'mock'], + }, + + # The choice of component versions is dictated by what's + # provisioned in the production servers. + # IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation. 
+ install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.14.3', 'scikit-learn==0.14.1'], + + requires=['pysal', 'numpy', 'sklearn'], + + test_suite='test' +) diff --git a/release/python/0.7.0/crankshaft/test/fixtures/getis.json b/release/python/0.7.0/crankshaft/test/fixtures/getis.json new file mode 100644 index 0000000..02566fc --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/fixtures/getis.json @@ -0,0 +1 @@ +[[0.004793783909323601, 0.17999999999999999, 0.49808756424021061], [-1.0701189472090842, 0.079000000000000001, 0.14228288580832316], [-0.67867750971877305, 0.42099999999999999, 0.24867110969448558], [-0.67407386707620487, 0.246, 0.25013217644612995], [-0.79495689068870035, 0.33200000000000002, 0.21331928959090596], [-0.49279481022182703, 0.058999999999999997, 0.31107878905057329], [-0.38075627530057132, 0.28399999999999997, 0.35169205342069643], [-0.86710921611314895, 0.23699999999999999, 0.19294108571294855], [-0.78618647240956485, 0.050000000000000003, 0.2158791250244505], [-0.76108527223116984, 0.064000000000000001, 0.22330306830813684], [-0.13340753531942209, 0.247, 0.44693554317763651], [-0.57584545722033043, 0.48999999999999999, 0.28235982246156488], [-0.78882694661192831, 0.433, 0.2151065788731219], [-0.38769767950046219, 0.375, 0.34911988661484239], [-0.56057819488052207, 0.41399999999999998, 0.28754255985169652], [-0.41354017495644935, 0.45500000000000002, 0.339605447117173], [-0.23993577722243081, 0.49099999999999999, 0.40519002230969337], [-0.1389080156677496, 0.40400000000000003, 0.44476141839645233], [-0.25485737510500855, 0.376, 0.39941662953554224], [-0.71218610582902353, 0.17399999999999999, 0.23817476979886087], [-0.54533105995872144, 0.13700000000000001, 0.2927629228714812], [-0.39547917847510977, 0.033000000000000002, 0.34624464252424236], [-0.43052658996257548, 0.35399999999999998, 0.33340631435564982], [-0.37296719193774736, 0.40300000000000002, 0.35458643102865428], [-0.66482612169465694, 
0.31900000000000001, 0.25308085650392698], [-0.13772133540823422, 0.34699999999999998, 0.44523032843016275], [-0.6765304487868502, 0.20999999999999999, 0.24935196033890672], [-0.64518763494323472, 0.32200000000000001, 0.25940279912025543], [-0.5078622084312413, 0.41099999999999998, 0.30577498972600159], [-0.12652006733772059, 0.42899999999999999, 0.44966013262301163], [-0.32691133022814595, 0.498, 0.37186747562269029], [0.25533848511500978, 0.42399999999999999, 0.39923083899077472], [2.7045138116476508, 0.0050000000000000001, 0.0034202212972238577], [-0.1551614486076057, 0.44400000000000001, 0.43834701985429037], [1.9524487722567723, 0.012999999999999999, 0.025442473674991528], [-1.2055816465306763, 0.017000000000000001, 0.11398941970467646], [3.478472976017831, 0.002, 0.00025213964072468009], [-1.4621715757903719, 0.002, 0.071847099325659136], [-0.84010307600180256, 0.085000000000000006, 0.20042529779230778], [5.7097646237318243, 0.0030000000000000001, 5.6566262784940591e-09], [1.5082367956567375, 0.065000000000000002, 0.065746966514827365], [-0.58337270103430816, 0.44, 0.27982121546450034], [-0.083271860457022437, 0.45100000000000001, 0.46681768733385554], [-0.46872337815000953, 0.34599999999999997, 0.31963368715684204], [0.18490279849545319, 0.23799999999999999, 0.42665263797981101], [3.470424529947997, 0.012, 0.00025981817437825683], [-0.99942612137154796, 0.032000000000000001, 0.15879415560388499], [-1.3650387953594485, 0.034000000000000002, 0.08612042845912049], [1.8617160516432014, 0.081000000000000003, 0.03132156240215267], [1.1321188945775384, 0.11600000000000001, 0.12879222611766061], [0.064116686050580601, 0.27300000000000002, 0.4744386578180424], [-0.42032194540259099, 0.29999999999999999, 0.33712514016213468], [-0.79581215423980922, 0.123, 0.21307061309098785], [-0.42792753720906046, 0.45600000000000002, 0.33435193892883741], [-1.0629378527428395, 0.051999999999999998, 0.14390506780140866], [-0.54164761752225477, 0.33700000000000002, 
0.29403064095211839], [1.0934778886820793, 0.13700000000000001, 0.13709201601893539], [-0.094068785378413719, 0.38200000000000001, 0.46252725802998929], [0.13482026574801856, 0.36799999999999999, 0.44637699118865737], [-0.13976995315653129, 0.34699999999999998, 0.44442087706276601], [-0.051047663924746682, 0.32000000000000001, 0.47964376985626245], [-0.21468297736730158, 0.41699999999999998, 0.41500724761906527], [-0.20873154637330626, 0.38800000000000001, 0.41732890604390893], [-0.32427876152583485, 0.49199999999999999, 0.37286349875557478], [-0.65254842943280977, 0.374, 0.25702372075306734], [-0.48611858196118796, 0.23300000000000001, 0.31344154643990074], [-0.14482354344529477, 0.32600000000000001, 0.44242509660469886], [-0.51052030974200002, 0.439, 0.30484349480873729], [0.56814382285283538, 0.14999999999999999, 0.28496865660103166], [0.58680919931668207, 0.161, 0.27866592887231878], [0.013390357044409013, 0.25800000000000001, 0.49465818005865647], [-0.19050728887961568, 0.41399999999999998, 0.4244558160399462], [-0.60531777422216049, 0.35199999999999998, 0.2724839368239631], [1.0899331115425805, 0.127, 0.13787130480311838], [0.17015055382651084, 0.36899999999999999, 0.43244586845546418], [-0.21738337124409801, 0.40600000000000003, 0.41395479459421991], [1.0329303331079593, 0.079000000000000001, 0.15081825117169467], [1.0218317101096221, 0.104, 0.15343027913308094]] diff --git a/release/python/0.7.0/crankshaft/test/fixtures/gwr_packed_data.json b/release/python/0.7.0/crankshaft/test/fixtures/gwr_packed_data.json new file mode 100644 index 0000000..cbee3fb --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/fixtures/gwr_packed_data.json @@ -0,0 +1 @@ 
+[{"x":[941396.6,895553,930946.4,745398.6,849431.3,819317.3,803747.1,699011.5,863020.8,859915.8,809736.9,844270.1,979288.9,827822,1023145,994903.4,971593.8,782448.2,724741.2,1008480,964264.9,678778.6,670055.9,962612.3,1059706,704959.2,653026.6,734240.9,832508.6,695793.9,745538.8,908046.1,724646.8,894463.9,808691.8,942527.9,839816.1,705457.9,783416.5,805648.4,635964.3,764386.1,732628.4,759231.9,860451.4,800031.3,764116.9,707288.7,703495.1,896654,1031899,879541.2,943066.2,981727.8,739255.8,731468.7,662257.4,765397.3,845701.3,733728.4,732702.3,908386.8,1023411,695325.1,765058.1,855577.3,772634.6,818917.1,794419.5,873518.8,665933.8,695500.6,870749.9,675280.4,763488.4,814118.9,855461.8,815753.1,807249.1,915741.9,924108.1,970465.7,908636.7,821367.1,766461.7,873804.3,884830.4,770455.5,1014742,919396.5,1004544,864781.1,772600,917730.9,1030500,777055.3,848638.8,732876.8,715359.8,716369.8,766238.6,790338.7,920887.4,825920.1,707834.3,700833.7,793263.9,830735.9,863291.8,695329.2,798061.4,733846.7,953533.8,744180.8,668031.4,833819.6,840169.1,686875.4,824645.5,712437.1,954272.3,777759,752973.1,1004028,704495.6,754916.2,842085.9,703256.8,763457.1,734217.9,884376.9,963427.8,759410.8,882069.4,743031.8,795506.2,831682.3,941734.4,797981.7,919077.6,682616.8,819399.6,832935,777040.1,752165.2,658870.4,800384.3,938349.6,902471.1,894704.3,986832.8,731576.3,898776.3,796905.6,686891.4,838551.5,891228.5,858796.9,801018.1],"y":[3521764,3471916,3502787,3474765,3665553,3807616,3769623,3793408,3520432,3466377,3636468,3595691,3463849,3421638,3554982,3600493,3671394,3684504,3492653,3437933,3598842,3713250,3862318,3432769,3556747,3577608,3813760,3794110,3762905,3495219,3711726,3428340,3757187,3492465,3455994,3722100,3449007,3694344,3623343,3537103,3854592,3812502,3421800,3735253,3569933,3564188,3494367,3731361,3467152,3401148,3596117,3785425,3616602,3571315,3866604,3700612,3789664,3789005,3813323,3733248,3844809,3685752,3471063,3822135,3421817,3722330,3764306,3839931,3803344,3689861,3740622,3624790,
3810303,3685569,3699716,3590553,3506293,3783949,3695092,3530869,3668080,3640263,3624562,3660143,3663959,3439981,3599291,3520161,3537225,3752562,3517834,3419313,3832429,3716368,3500535,3584821,3785405,3584393,3660275,3451034,3453930,3660608,3568473,3717990,3854188,3598228,3719734,3750903,3756777,3758093,3609091,3812828,3482044,3665561,3764766,3567447,3695254,3524124,3864805,3519627,3697862,3729605,3570222,3641918,3422002,3685029,3827075,3552857,3551752,3623162,3717493,3560039,3608179,3534470,3522636,3421725,3487715,3567586,3872640,3595170,3660254,3514927,3623868,3858779,3639192,3842167,3742691,3446675,3699878,3648583,3494323,3544716,3563384,3841086,3855274,3538547,3749769,3637891,3487328],"dep_var":[8.2,6.4,6.6,9.4,13.3,6.4,9.2,9,7.6,7.5,17,10.3,5.8,9.1,11.8,19.9,9.6,7.2,10.1,13.5,9.9,12,8.1,6.4,18.6,20.2,5.9,18.4,37.5,11.2,14.7,6.7,33,11.1,10,23.9,6.5,13.3,5.7,10,8,8.6,11.7,32.7,8,9.5,17,12,9.4,4.7,7.6,8,9.1,8.6,7.8,25.8,13.7,15.6,9.5,31.6,8.6,5.3,19.9,9.2,7.7,8.8,29.6,12,15.4,6.8,7.5,13.6,9.1,5.7,10.7,16,8.3,9,10.8,8.3,6.2,7.7,4.9,12,10,5.4,12,13.7,13.4,8.2,5.2,16.3,11.1,10.4,8.7,10.1,9.7,4.6,6.7,8.2,7.8,12.9,10.1,11,5.5,16.6,9.5,28.4,12.8,7.6,15.2,9,6.3,9.3,6.8,10.7,11.7,7.3,11.6,6,17.3,18.1,8,8.6,7.8,11.1,13.1,8,15.9,7.1,5.6,6.5,7.1,8.6,9.2,13.4,14,11.4,11.4,6.3,13.6,7.2,4.8,10.1,9,8.4,9.4,10.4,4.2,9.8,9.6,5.5,8.6,13.6,12,7.6,10.4,8.8,6.3],"attr1":[75.6,100,61.7,100,42.7,100,64.6,75.2,47,66.2,16.1,57.9,100,65.6,80.6,63.2,72.3,73.4,100,47.1,52.1,68.5,43.6,100,5.1,13.7,77.4,57.8,17.6,100,4.4,58.6,5.8,64.6,59.4,30.6,62,76.1,100,48.4,96.5,100,58,2.5,70.7,72.6,10,26.7,52.8,100,89.1,70,64.2,100,100,53.9,36.1,93.7,87.2,4.2,100,100,20.3,79.7,55.4,75.7,13.6,88.5,81.1,100,67.8,95.8,73.8,100,76,20.9,63.4,78,100,65.1,100,53.8,100,81.9,63.6,100,52.9,78.2,32.9,100,100,47.6,78.6,65.9,100,65.6,100,100,82.3,100,56.2,75.1,98.6,73,89,3.2,76,95.2,100,93.7,61.3,100,74.4,100,66.5,56.5,66.5,100,100,53.5,9.9,59.2,100,79.3,69.4,53.6,64.5,100,45.4,97.9,100,79.3,100,72.6,50.3,55.2,51.1,35.
7,100,53.3,44,44.5,100,100,65.3,44.8,61.2,54.2,100,67.1,59.9,100,100,100,70,100,59.6,100,71.1],"attr2":[19.9,26,24.1,24.8,17.5,15.1,14.7,10.7,22,19.3,19.2,18.3,18.2,25.9,13.2,27.5,30.3,15.6,31.8,11.5,24.1,14.4,12,18.3,17.2,10.4,14.6,6.1,27,35.7,8.6,26.4,5.6,22.5,22.8,6.6,22.4,11.4,14,29,14.6,12.8,23.3,9.9,21.8,32.9,24.4,6.6,31.4,14.6,12.7,19.7,25.7,25.4,17.2,2.6,13.6,6.8,16.5,18.4,16.6,16.8,14.3,11.1,22.3,25.1,4,11.6,10.6,30.1,14.4,13.7,14.2,19.1,6.1,10.6,27.2,14.1,17.4,18.8,31.3,27.8,22.2,10.8,16.3,25.9,20.5,12.6,17.2,17.8,23.7,19.9,15.3,21.6,22.3,29.2,15.7,28.2,22.4,22.1,28.7,13.8,24.5,15,11.3,18.6,14.4,7.9,16.2,8.8,24,12.8,21.3,13.4,16.3,24.3,16.4,33,13.6,35.9,18.2,6.2,19.9,22.9,29.1,15.6,17,31.4,24.8,24.9,31.9,21.9,29.5,27.3,29.1,22.6,22.9,24,14,27.1,16.3,31.3,26,18.3,14.7,12.8,13.2,21.1,32.6,21.6,21.2,22.5,30.3,12.5,11.1,28.6,22.6,15.3,26.2],"attr3":[20.76,26.86,15.42,51.67,42.39,3.49,11.44,9.21,31.33,11.62,41.68,22.36,4.58,41.47,14.85,25.95,52.19,35.48,58.89,20.19,30.94,15.46,0.91,27.05,38.02,30.94,8.61,1.77,26.23,60.76,23.82,27.29,9.84,25.46,24.16,10.93,29.94,22.59,30.66,40.66,0.35,0.29,39.47,42.23,27.64,48.98,50.15,7.63,44.09,11.48,14.03,29.99,32.58,33.88,0.03,5.13,13.56,0,9.89,49.92,0.26,12.69,25.57,3.78,31.5,49.89,5.11,5.42,8.48,79.64,6.47,25.49,20.41,13.38,10.24,21.8,30.5,9.58,34.8,15.36,55.92,41.51,33.89,25.6,34.03,26.58,33.32,19.22,39.15,38.19,21.75,31.88,1.41,36.38,43.34,58.72,8.32,41.32,44.62,27.48,47.91,31.78,28.27,34.74,0.26,37.95,22.35,7.37,24.74,3.94,47.53,1.48,11.69,20.04,14.3,32.46,32.79,49.93,0.35,58.17,41.96,8.03,34.09,44.69,32.74,29.08,11.81,63.46,46.53,62.34,61.36,29.19,43.21,34.45,59.9,37.93,26.68,23.38,0,33.1,30.03,40.66,45.93,0.1,27.78,3.73,18.37,25.88,60.23,51.86,19.45,50.2,30.06,2.59,4.06,31.76,45.94,41.99,30.71],"rowid":[13001,13003,13005,13007,13009,13011,13013,13015,13017,13019,13021,13023,13025,13027,13029,13031,13033,13035,13037,13039,13043,13045,13047,13049,13051,13053,13055,13057,13059,13061,13063,13065,13067,13069,13071,13073,13
075,13077,13079,13081,13083,13085,13087,13089,13091,13093,13095,13097,13099,13101,13103,13105,13107,13109,13111,13113,13115,13117,13119,13121,13123,13125,13127,13129,13131,13133,13135,13137,13139,13141,13143,13145,13147,13149,13151,13153,13155,13157,13159,13161,13163,13165,13167,13169,13171,13173,13175,13177,13179,13181,13183,13185,13187,13189,13191,13193,13195,13197,13199,13201,13205,13207,13209,13211,13213,13215,13217,13219,13221,13223,13225,13227,13229,13231,13233,13235,13237,13239,13241,13243,13245,13247,13249,13251,13253,13255,13257,13259,13261,13263,13265,13267,13269,13271,13273,13275,13277,13279,13281,13283,13285,13287,13289,13291,13293,13295,13297,13299,13301,13303,13305,13307,13309,13311,13313,13315,13317,13319,13321]}] diff --git a/release/python/0.7.0/crankshaft/test/fixtures/gwr_packed_knowns.json b/release/python/0.7.0/crankshaft/test/fixtures/gwr_packed_knowns.json new file mode 100644 index 0000000..4e73b79 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/fixtures/gwr_packed_knowns.json @@ -0,0 +1 @@ +{"y_coord": [3521764, 3471916, 3502787, 3474765, 3665553, 3807616, 3769623, 3793408, 3520432, 3466377, 3636468, 3595691, 3463849, 3421638, 3554982, 3600493, 3671394, 3684504, 3492653, 3437933, 3598842, 3713250, 3862318, 3432769, 3556747, 3577608, 3813760, 3794110, 3762905, 3495219, 3711726, 3428340, 3757187, 3492465, 3455994, 3722100, 3449007, 3694344, 3623343, 3537103, 3854592, 3812502, 3421800, 3735253, 3569933, 3564188, 3494367, 3731361, 3467152, 3401148, 3596117, 3785425, 3616602, 3571315, 3866604, 3700612, 3789664, 3789005, 3813323, 3733248, 3844809, 3685752, 3471063, 3822135, 3421817, 3722330, 3764306, 3839931, 3803344, 3689861, 3740622, 3624790, 3810303, 3685569, 3699716, 3590553, 3506293, 3783949, 3695092, 3530869, 3668080, 3640263, 3624562, 3660143, 3663959, 3439981, 3599291, 3520161, 3537225, 3752562, 3517834, 3419313, 3832429, 3716368, 3500535, 3584821, 3785405, 3584393, 3660275, 3451034, 3453930, 3660608, 3568473, 3717990, 3854188, 
3598228, 3719734, 3750903, 3756777, 3758093, 3609091, 3812828, 3482044, 3665561, 3764766, 3567447, 3695254, 3524124, 3864805, 3519627, 3697862, 3729605, 3570222, 3641918, 3422002, 3685029, 3827075, 3552857, 3551752, 3623162, 3717493, 3560039, 3608179, 3534470, 3522636, 3421725, 3487715, 3567586, 3872640, 3595170, 3660254, 3514927, 3623868, 3858779, 3639192, 3842167, 3742691, 3446675, 3699878, 3648583, 3494323, 3544716, 3563384, 3841086, 3855274, 3538547, 3749769, 3637891, 3487328], "influence": [0.041718, 0.093454, 0.10983, 0.118198, 0.097548, 0.059443, 0.041031, 0.032462, 0.058498, 0.100714, 0.170747, 0.082058, 0.184081, 0.037431, 0.131419, 0.11251, 0.101114, 0.047942, 0.113107, 0.181309, 0.05591, 0.037814, 0.109586, 0.130853, 0.237285, 0.172302, 0.036228, 0.064756, 0.443808, 0.13641, 0.141066, 0.076699, 0.150241, 0.032761, 0.051816, 0.223754, 0.029353, 0.06002, 0.105169, 0.076247, 0.069329, 0.051965, 0.039944, 0.20308, 0.046086, 0.098109, 0.194576, 0.093258, 0.093739, 0.186424, 0.178352, 0.036825, 0.053419, 0.069314, 0.088385, 0.131412, 0.08002, 0.085975, 0.047994, 0.201466, 0.077724, 0.153041, 0.174215, 0.034634, 0.040242, 0.063712, 0.149308, 0.040139, 0.036003, 0.285484, 0.063911, 0.090216, 0.033014, 0.133651, 0.097922, 0.198633, 0.060055, 0.032494, 0.071734, 0.073892, 0.121394, 0.087385, 0.077991, 0.093922, 0.03193, 0.091298, 0.053937, 0.126429, 0.13414, 0.102608, 0.078631, 0.067484, 0.06222, 0.036613, 0.166179, 0.107871, 0.06172, 0.080811, 0.060716, 0.096678, 0.05549, 0.054726, 0.076288, 0.051877, 0.045589, 0.170792, 0.030476, 0.097062, 0.057117, 0.077883, 0.057165, 0.050752, 0.097715, 0.08115, 0.047653, 0.049173, 0.03286, 0.099923, 0.061466, 0.137244, 0.27255, 0.071571, 0.070927, 0.063635, 0.098482, 0.034063, 0.064541, 0.143175, 0.059673, 0.219599, 0.140317, 0.030782, 0.102154, 0.05155, 0.118173, 0.045636, 0.052966, 0.12225, 0.059551, 0.084581, 0.048937, 0.120127, 0.087659, 0.106444, 0.034626, 0.096799, 0.02894, 0.040209, 0.138238, 0.09661, 0.053615, 
0.116263, 0.159493, 0.050856, 0.035205, 0.123618, 0.061337, 0.156479, 0.044714], "x_coord": [941396.6, 895553, 930946.4, 745398.6, 849431.3, 819317.3, 803747.1, 699011.5, 863020.8, 859915.8, 809736.9, 844270.1, 979288.9, 827822, 1023145, 994903.4, 971593.8, 782448.2, 724741.2, 1008480, 964264.9, 678778.6, 670055.9, 962612.3, 1059706, 704959.2, 653026.6, 734240.9, 832508.6, 695793.9, 745538.8, 908046.1, 724646.8, 894463.9, 808691.8, 942527.9, 839816.1, 705457.9, 783416.5, 805648.4, 635964.3, 764386.1, 732628.4, 759231.9, 860451.4, 800031.3, 764116.9, 707288.7, 703495.1, 896654, 1031899, 879541.2, 943066.2, 981727.8, 739255.8, 731468.7, 662257.4, 765397.3, 845701.3, 733728.4, 732702.3, 908386.8, 1023411, 695325.1, 765058.1, 855577.3, 772634.6, 818917.1, 794419.5, 873518.8, 665933.8, 695500.6, 870749.9, 675280.4, 763488.4, 814118.9, 855461.8, 815753.1, 807249.1, 915741.9, 924108.1, 970465.7, 908636.7, 821367.1, 766461.7, 873804.3, 884830.4, 770455.5, 1014742, 919396.5, 1004544, 864781.1, 772600, 917730.9, 1030500, 777055.3, 848638.8, 732876.8, 715359.8, 716369.8, 766238.6, 790338.7, 920887.4, 825920.1, 707834.3, 700833.7, 793263.9, 830735.9, 863291.8, 695329.2, 798061.4, 733846.7, 953533.8, 744180.8, 668031.4, 833819.6, 840169.1, 686875.4, 824645.5, 712437.1, 954272.3, 777759, 752973.1, 1004028, 704495.6, 754916.2, 842085.9, 703256.8, 763457.1, 734217.9, 884376.9, 963427.8, 759410.8, 882069.4, 743031.8, 795506.2, 831682.3, 941734.4, 797981.7, 919077.6, 682616.8, 819399.6, 832935, 777040.1, 752165.2, 658870.4, 800384.3, 938349.6, 902471.1, 894704.3, 986832.8, 731576.3, 898776.3, 796905.6, 686891.4, 838551.5, 891228.5, 858796.9, 801018.1], "se_pctblack": [0.048422, 0.053382, 0.050307, 0.052233, 0.050361, 0.041694, 0.041354, 0.041423, 0.048378, 0.049213, 0.051088, 0.050376, 0.05714, 0.049579, 0.047733, 0.045694, 0.048206, 0.048881, 0.054756, 0.06, 0.044953, 0.046902, 0.043006, 0.057079, 0.048872, 0.056886, 0.043753, 0.039689, 0.044035, 0.057198, 0.046953, 0.054178, 
0.040812, 0.051914, 0.049325, 0.047751, 0.048778, 0.04959, 0.053515, 0.049339, 0.044162, 0.039811, 0.053106, 0.041446, 0.0497, 0.051743, 0.051662, 0.042636, 0.055382, 0.053312, 0.047102, 0.044382, 0.046127, 0.045928, 0.041238, 0.048361, 0.043512, 0.03973, 0.043382, 0.042789, 0.041214, 0.048842, 0.055459, 0.04163, 0.051566, 0.04629, 0.04035, 0.041668, 0.0405, 0.047923, 0.043671, 0.059377, 0.044651, 0.05022, 0.047643, 0.050203, 0.048129, 0.04155, 0.048144, 0.047513, 0.049365, 0.046377, 0.048207, 0.051755, 0.050068, 0.050653, 0.047949, 0.05133, 0.049083, 0.044645, 0.050489, 0.050633, 0.040368, 0.046813, 0.05234, 0.056291, 0.043751, 0.056776, 0.054586, 0.05369, 0.05193, 0.051624, 0.044863, 0.047448, 0.042099, 0.058191, 0.045691, 0.044437, 0.044577, 0.04214, 0.053326, 0.040489, 0.052861, 0.051687, 0.042796, 0.048223, 0.048837, 0.058596, 0.042367, 0.056893, 0.048682, 0.044234, 0.055933, 0.046173, 0.053802, 0.049959, 0.042715, 0.057894, 0.054198, 0.05754, 0.046272, 0.045435, 0.055455, 0.048073, 0.05567, 0.050858, 0.048796, 0.044856, 0.041348, 0.045268, 0.057257, 0.047627, 0.052411, 0.041008, 0.05606, 0.043488, 0.043092, 0.054819, 0.047066, 0.050849, 0.052717, 0.05641, 0.046776, 0.040768, 0.042667, 0.048373, 0.04495, 0.050613, 0.04877], "cooksd": [7.7e-05, 0.000315, 0.002225, 0.000205, 0.001606, 0.001427, 0.00652, 0.001942, 0.003764, 0.000481, 0.000267, 0.000157, 8.8e-05, 0.000123, 0.003019, 0.072736, 0.001193, 0.005313, 0.005343, 1e-06, 0.000363, 3.1e-05, 0.034878, 0.00106, 0.001396, 0.001463, 0.003457, 0.001192, 1.423109, 0.021445, 0.098779, 0.001687, 0.055873, 0.000531, 0.000136, 0.021004, 0.001615, 6.7e-05, 0.006305, 2e-05, 0.000228, 3.7e-05, 6.9e-05, 0.065765, 0.000349, 0.001329, 0.003647, 0.0419, 7e-06, 0.00795, 0.003791, 0.003857, 8.5e-05, 0.002004, 0.000484, 0.042878, 0.003828, 0.011277, 0.000442, 0.094025, 0.001259, 0.014043, 0.018378, 0.000812, 0.001675, 0.00019, 0.016271, 0.000218, 0.001723, 0.026378, 0.003864, 0.007504, 0.002657, 0.000982, 0.005258, 0.000183, 
9e-06, 0.002044, 0.004633, 0.000515, 0.00141, 0.003848, 0.002212, 8.9e-05, 0.001534, 5e-06, 0.000109, 0.006075, 0.001587, 0.000427, 0.000156, 0.006069, 4e-06, 0.000344, 0.001701, 1.1e-05, 0.000192, 0.000832, 0.002053, 0.000569, 0.001872, 0.000233, 0.005009, 0.000331, 0.004382, 0.003999, 0.00118, 0.170092, 0.006522, 0.002474, 0.005137, 0.00027, 0.001113, 0.000403, 0.004921, 5.5e-05, 0.000272, 0.001283, 0.002676, 0.008287, 0.003014, 0.000491, 7e-06, 6.4e-05, 3.7e-05, 0.002458, 0.000757, 0.000476, 0.004148, 0.002506, 0.002465, 0.00071, 0.00184, 1.5e-05, 0.004272, 0.000945, 0.003874, 0.000189, 0.002952, 0.006644, 0.000699, 0.004442, 0.0013, 0.005564, 0.002488, 0.024203, 0.00519, 1.8e-05, 3.8e-05, 0.000263, 2.9e-05, 0.007778, 0.008509, 0.006229, 0.000105, 0.002952, 0.001612, 8e-06, 0.000767], "est_pctrural": [-0.087919, -0.077996, -0.085464, -0.072676, -0.128431, -0.180965, -0.18567, -0.143921, -0.072048, -0.074505, -0.117008, -0.087278, -0.091904, -0.073817, -0.099557, -0.098698, -0.10867, -0.160167, -0.073158, -0.094154, -0.094601, -0.135079, -0.133974, -0.08942, -0.101838, -0.091672, -0.131022, -0.155214, -0.190065, -0.073282, -0.165429, -0.081867, -0.156664, -0.076699, -0.072464, -0.133921, -0.074071, -0.141117, -0.111564, -0.078856, -0.127798, -0.162809, -0.070745, -0.174179, -0.077704, -0.084767, -0.073785, -0.14867, -0.071717, -0.080666, -0.101715, -0.180059, -0.092867, -0.096093, -0.148798, -0.15388, -0.133349, -0.168574, -0.185284, -0.163127, -0.149285, -0.123165, -0.096677, -0.141254, -0.071164, -0.167284, -0.178183, -0.17299, -0.175146, -0.138416, -0.134046, -0.111022, -0.185981, -0.129588, -0.165865, -0.090566, -0.072647, -0.18459, -0.167686, -0.081112, -0.109965, -0.101055, -0.090609, -0.134897, -0.143216, -0.077016, -0.082642, -0.076247, -0.098465, -0.155026, -0.096848, -0.076702, -0.162008, -0.138472, -0.098386, -0.091171, -0.18779, -0.09308, -0.131121, -0.071531, -0.071437, -0.141435, -0.082891, -0.177965, -0.142118, -0.099834, -0.18257, -0.188049, 
-0.178213, -0.144304, -0.099787, -0.153281, -0.089241, -0.140148, -0.135427, -0.08363, -0.158342, -0.0768, -0.169776, -0.075752, -0.120551, -0.184296, -0.086448, -0.103888, -0.070842, -0.154438, -0.180631, -0.082675, -0.081094, -0.112567, -0.151714, -0.09336, -0.103733, -0.072526, -0.075826, -0.071792, -0.072662, -0.088969, -0.161521, -0.085392, -0.123629, -0.075361, -0.098113, -0.15897, -0.123605, -0.131963, -0.188348, -0.086523, -0.134341, -0.101212, -0.094024, -0.079862, -0.074241, -0.166676, -0.137364, -0.076497, -0.164552, -0.101861, -0.07352], "std_residual": [-0.162278, 0.213714, -0.518796, 0.151238, -0.470868, -0.580528, -1.508125, -0.929355, -0.95091, -0.253303, -0.139052, -0.162155, 0.076204, -0.217609, 0.545752, 2.926313, 0.397864, -1.254875, 0.790774, -0.006914, -0.30256, 0.108831, -2.056611, -0.32422, 0.258772, 0.32387, -1.171601, 0.506852, 5.159316, 1.423472, -2.996125, -0.550564, 2.171775, 0.483877, 0.193059, 1.042856, -0.892786, 0.125184, -0.894825, -0.060875, 0.213632, 0.099734, 0.157788, 1.962586, -0.328491, 0.426944, 0.474682, -2.46583, 0.031892, -0.719617, -0.510536, -1.227088, -0.150029, 0.633771, 0.272938, 2.056669, -0.810461, 1.337637, -0.361836, 2.358452, 0.472247, -1.07699, 1.140253, -0.581307, -0.772136, -0.20392, 1.176268, 0.278975, 0.829814, 0.99264, -0.919034, 1.062743, -1.077658, 0.308164, -0.850273, -0.104966, -0.046276, -0.953096, 0.945945, -0.310309, 0.390222, -0.774466, -0.62474, 0.113205, -0.833277, -0.027681, 0.168806, 0.791532, -0.390956, 0.236217, -0.165374, 1.118755, -0.031656, -0.36767, 0.356947, 0.036409, 0.208857, -0.375831, -0.688536, 0.281743, -0.689616, 0.244942, 0.95144, -0.300273, -1.170075, -0.538318, -0.748598, 4.85965, 1.267608, -0.661149, 1.124549, 0.274469, -0.391676, 0.261042, -1.211531, 0.126055, -0.345827, 0.415296, 0.780979, -0.881768, -0.346488, 0.308225, -0.037905, -0.118149, 0.071239, -1.020007, -0.404594, 0.206144, 0.987719, -0.364614, 0.474766, -0.577719, 0.491319, 0.06313, -0.68978, 0.543157, 1.016772, 
-0.142276, 0.834189, -1.035981, -0.450355, -0.696808, -0.449433, 0.834905, -1.01757, -1.835909, -1.612155, -0.080997, 0.059813, -0.191521, -0.086895, -0.939373, 0.818087, 1.31727, -0.206834, 0.558896, -0.606735, 0.025605, -0.494558], "localr2": [0.551117, 0.557455, 0.553851, 0.571077, 0.559486, 0.551175, 0.558752, 0.571809, 0.513439, 0.550571, 0.57839, 0.545373, 0.604611, 0.563673, 0.606627, 0.579241, 0.547193, 0.58401, 0.57804, 0.622744, 0.554506, 0.616314, 0.553322, 0.610492, 0.618849, 0.631907, 0.568832, 0.566261, 0.551402, 0.582397, 0.594597, 0.591202, 0.583349, 0.535237, 0.556551, 0.54599, 0.55833, 0.62123, 0.609202, 0.58125, 0.557535, 0.557277, 0.560922, 0.57934, 0.511668, 0.593458, 0.573619, 0.599358, 0.573775, 0.594004, 0.599749, 0.551664, 0.534209, 0.576381, 0.547637, 0.607718, 0.577489, 0.562801, 0.547044, 0.592063, 0.552589, 0.548079, 0.619311, 0.562128, 0.557048, 0.560955, 0.566841, 0.546595, 0.555068, 0.560605, 0.599587, 0.649273, 0.546138, 0.629902, 0.587461, 0.578893, 0.529977, 0.555579, 0.569569, 0.503741, 0.539692, 0.553734, 0.51914, 0.565167, 0.607287, 0.56818, 0.491248, 0.580526, 0.605081, 0.551278, 0.603162, 0.573075, 0.551941, 0.550795, 0.617381, 0.608093, 0.550716, 0.627532, 0.634592, 0.569713, 0.55971, 0.589995, 0.481273, 0.557708, 0.552413, 0.644127, 0.563542, 0.552622, 0.554574, 0.588412, 0.591719, 0.56091, 0.584391, 0.619238, 0.587011, 0.5735, 0.563046, 0.59661, 0.541754, 0.589691, 0.545224, 0.568126, 0.608463, 0.570086, 0.566278, 0.601112, 0.546778, 0.611534, 0.594733, 0.640115, 0.5595, 0.564123, 0.626729, 0.473791, 0.587694, 0.557015, 0.54805, 0.527237, 0.543414, 0.493954, 0.643286, 0.563722, 0.537914, 0.546488, 0.628564, 0.559385, 0.560098, 0.595942, 0.553755, 0.534857, 0.598634, 0.599113, 0.423182, 0.548624, 0.553793, 0.557453, 0.555125, 0.538754, 0.561673], "est_intercept": [18.375924, 18.039692, 18.173904, 18.612431, 25.027931, 28.868732, 29.126594, 26.73874, 17.332852, 18.009999, 23.331917, 18.575691, 18.853338, 18.212539, 
20.021869, 20.563701, 23.303807, 27.304692, 18.937685, 19.091389, 20.107213, 26.58407, 25.993181, 18.861404, 20.315761, 21.024737, 26.101723, 27.237578, 29.623695, 19.199889, 28.067985, 18.570921, 27.550537, 17.692965, 18.158333, 26.14995, 18.119124, 26.596568, 22.804616, 18.569581, 25.775193, 27.602714, 18.386703, 28.434523, 17.509755, 19.093217, 18.613133, 27.243553, 18.862486, 18.63039, 20.726368, 29.409853, 20.50841, 19.704834, 26.704982, 27.33758, 26.343322, 27.988326, 29.33117, 28.013628, 26.776764, 25.151079, 19.341762, 26.468527, 18.258714, 28.364577, 28.606646, 28.307209, 28.404865, 26.321627, 26.500457, 23.059413, 29.625733, 25.689204, 27.858068, 19.647124, 17.629684, 29.102336, 27.867154, 17.579579, 23.757825, 21.960048, 20.80086, 25.164234, 26.024579, 18.276315, 18.598483, 18.808846, 19.712953, 27.921286, 19.433808, 18.35095, 27.53054, 26.556594, 19.58565, 20.302644, 29.590949, 21.013811, 25.224396, 18.655811, 18.36884, 25.731843, 17.978451, 28.693695, 26.386878, 21.843074, 28.847315, 29.460662, 29.177378, 26.994593, 21.207599, 27.084437, 18.612524, 25.907755, 26.504604, 18.394926, 27.466413, 19.674459, 28.095647, 19.400471, 24.815945, 28.99667, 20.143981, 22.000899, 18.552729, 27.04672, 28.950415, 20.171, 19.436785, 23.078763, 27.520861, 19.256298, 22.034483, 16.895516, 19.123574, 18.208195, 18.006901, 18.743279, 27.490506, 18.866189, 24.730344, 18.225195, 20.965009, 27.320591, 24.200809, 25.991901, 29.279015, 18.656967, 26.15185, 22.653152, 19.058441, 19.682721, 16.844017, 27.838898, 26.164642, 18.0256, 28.516555, 22.169471, 18.263625], "yhat": [8.815245, 5.611921, 8.495724, 8.849965, 15.03242, 8.580509, 14.919817, 12.540446, 11.17349, 8.430319, 17.490415, 10.9017, 5.533408, 9.926865, 9.830106, 9.223105, 8.139072, 11.942142, 7.215765, 13.524229, 11.038573, 11.586549, 15.616062, 7.570655, 17.724733, 19.058832, 10.354608, 16.501607, 22.597924, 6.076758, 25.4543, 8.748904, 25.246368, 9.256918, 9.271919, 20.341504, 9.906595, 12.829942, 8.978318, 
10.226601, 7.201808, 8.223906, 11.101222, 25.914554, 9.242568, 7.929672, 15.350101, 21.093841, 9.282415, 7.213871, 9.392305, 12.664129, 9.665323, 6.232029, 6.790721, 18.376414, 16.710674, 10.647101, 10.867332, 23.437631, 6.843525, 9.138708, 15.886925, 11.412046, 10.629657, 9.564201, 25.398206, 10.941449, 12.244551, 3.550323, 10.943758, 9.674096, 13.204245, 4.58911, 13.827683, 16.363919, 8.473759, 12.630831, 7.270247, 9.456558, 4.783389, 10.565422, 7.223319, 11.582661, 13.175302, 5.502197, 11.364096, 10.834768, 14.808946, 7.333349, 5.814789, 12.115865, 11.218728, 11.797658, 7.437642, 9.966811, 8.916465, 5.995525, 9.284445, 7.162908, 10.395689, 11.977673, 6.558461, 12.132376, 9.927145, 18.498512, 12.354765, 10.515515, 8.032877, 10.058864, 10.970992, 7.964322, 7.740923, 8.330884, 11.379041, 10.22395, 13.017184, 5.774051, 8.669737, 9.172055, 18.444544, 16.949769, 8.141504, 9.042787, 7.538033, 14.982577, 14.615563, 7.260976, 12.190501, 8.347484, 3.895132, 8.702775, 5.296954, 8.361885, 11.708678, 11.344938, 10.167793, 11.916248, 8.266897, 10.138875, 15.30099, 9.731427, 6.462592, 7.04339, 12.872171, 15.157496, 15.552791, 10.707326, 3.984952, 10.505011, 9.927395, 8.920127, 5.695227, 8.629694, 12.786831, 5.573625, 12.676648, 8.708923, 8.172088], "est_pctpov": [-0.218522, -0.291285, -0.235007, -0.325567, -0.188146, -0.137907, -0.119547, -0.379195, -0.259626, -0.310736, -0.244202, -0.206076, -0.248679, -0.334842, -0.215786, -0.21054, -0.199793, -0.192097, -0.322443, -0.253757, -0.206276, -0.534973, -0.384352, -0.274742, -0.216591, -0.341741, -0.443096, -0.30198, -0.068634, -0.327487, -0.315412, -0.310284, -0.345864, -0.269059, -0.328491, -0.16491, -0.323102, -0.475082, -0.288485, -0.275601, -0.429848, -0.242496, -0.341276, -0.227016, -0.2048, -0.252217, -0.316204, -0.425002, -0.332753, -0.326117, -0.210497, -0.10558, -0.209458, -0.206017, -0.292783, -0.384946, -0.455325, -0.227329, -0.109925, -0.340684, -0.302287, -0.187858, -0.23682, -0.36714, -0.34067, -0.110955, -0.186993, 
-0.170485, -0.176104, -0.172523, -0.500233, -0.412376, -0.097125, -0.519982, -0.243051, -0.236613, -0.282387, -0.119102, -0.113713, -0.208408, -0.202582, -0.214505, -0.217432, -0.180505, -0.284279, -0.319038, -0.196772, -0.299851, -0.216323, -0.148542, -0.221665, -0.329237, -0.235629, -0.167632, -0.226774, -0.265047, -0.086238, -0.318679, -0.423909, -0.334852, -0.335002, -0.222813, -0.183316, -0.067768, -0.338818, -0.366487, -0.102857, -0.064893, -0.096511, -0.426293, -0.262799, -0.301715, -0.245085, -0.354426, -0.464528, -0.230167, -0.116111, -0.327569, -0.179779, -0.31768, -0.180967, -0.141807, -0.290041, -0.211583, -0.340959, -0.30384, -0.131568, -0.32466, -0.28365, -0.360965, -0.151989, -0.206422, -0.31246, -0.218596, -0.305719, -0.341804, -0.311581, -0.194575, -0.219397, -0.189617, -0.491018, -0.292708, -0.229657, -0.239476, -0.344539, -0.411469, -0.091658, -0.278733, -0.177206, -0.216008, -0.23146, -0.304909, -0.172899, -0.203986, -0.367377, -0.261962, -0.131495, -0.224866, -0.31454], "area_key": [13001, 13003, 13005, 13007, 13009, 13011, 13013, 13015, 13017, 13019, 13021, 13023, 13025, 13027, 13029, 13031, 13033, 13035, 13037, 13039, 13043, 13045, 13047, 13049, 13051, 13053, 13055, 13057, 13059, 13061, 13063, 13065, 13067, 13069, 13071, 13073, 13075, 13077, 13079, 13081, 13083, 13085, 13087, 13089, 13091, 13093, 13095, 13097, 13099, 13101, 13103, 13105, 13107, 13109, 13111, 13113, 13115, 13117, 13119, 13121, 13123, 13125, 13127, 13129, 13131, 13133, 13135, 13137, 13139, 13141, 13143, 13145, 13147, 13149, 13151, 13153, 13155, 13157, 13159, 13161, 13163, 13165, 13167, 13169, 13171, 13173, 13175, 13177, 13179, 13181, 13183, 13185, 13187, 13189, 13191, 13193, 13195, 13197, 13199, 13201, 13205, 13207, 13209, 13211, 13213, 13215, 13217, 13219, 13221, 13223, 13225, 13227, 13229, 13231, 13233, 13235, 13237, 13239, 13241, 13243, 13245, 13247, 13249, 13251, 13253, 13255, 13257, 13259, 13261, 13263, 13265, 13267, 13269, 13271, 13273, 13275, 13277, 13279, 13281, 13283, 
13285, 13287, 13289, 13291, 13293, 13295, 13297, 13299, 13301, 13303, 13305, 13307, 13309, 13311, 13313, 13315, 13317, 13319, 13321], "residual": [-0.615245, 0.788079, -1.895724, 0.550035, -1.73242, -2.180509, -5.719817, -3.540446, -3.57349, -0.930319, -0.490415, -0.6017, 0.266592, -0.826865, 1.969894, 10.676895, 1.460928, -4.742142, 2.884235, -0.024229, -1.138573, 0.413451, -7.516062, -1.170655, 0.875267, 1.141168, -4.454608, 1.898393, 14.902076, 5.123242, -10.7543, -2.048904, 7.753632, 1.843082, 0.728081, 3.558496, -3.406595, 0.470058, -3.278318, -0.226601, 0.798192, 0.376094, 0.598778, 6.785446, -1.242568, 1.570328, 1.649899, -9.093841, 0.117585, -2.513871, -1.792305, -4.664129, -0.565323, 2.367971, 1.009279, 7.423586, -3.010674, 4.952899, -1.367332, 8.162369, 1.756475, -3.838708, 4.013075, -2.212046, -2.929657, -0.764201, 4.201794, 1.058551, 3.155449, 3.249677, -3.443758, 3.925904, -4.104245, 1.11089, -3.127683, -0.363919, -0.173759, -3.630831, 3.529753, -1.156558, 1.416611, -2.865422, -2.323319, 0.417339, -3.175302, -0.102197, 0.635904, 2.865232, -1.408946, 0.866651, -0.614789, 4.184135, -0.118728, -1.397658, 1.262358, 0.133189, 0.783535, -1.395525, -2.584445, 1.037092, -2.595689, 0.922327, 3.541539, -1.132376, -4.427145, -1.898512, -2.854765, 17.884485, 4.767123, -2.458864, 4.229008, 1.035678, -1.440923, 0.969116, -4.579041, 0.47605, -1.317184, 1.525949, 2.930263, -3.172055, -1.144544, 1.150231, -0.141504, -0.442787, 0.261967, -3.882577, -1.515563, 0.739024, 3.709499, -1.247484, 1.704868, -2.202775, 1.803046, 0.238115, -2.508678, 2.055062, 3.832207, -0.516248, 3.133103, -3.838875, -1.70099, -2.531427, -1.662592, 3.05661, -3.872171, -6.757496, -6.152791, -0.307326, 0.215048, -0.705011, -0.327395, -3.420127, 2.904773, 4.970306, -0.786831, 2.026375, -2.276648, 0.091077, -1.872088], "se_pctpov": [0.115485, 0.126975, 0.118227, 0.10991, 0.106755, 0.130001, 0.128153, 0.129687, 0.122903, 0.120344, 0.105854, 0.108188, 0.131652, 0.119039, 0.120983, 0.118868, 0.116824, 
0.124573, 0.109529, 0.137167, 0.114816, 0.135058, 0.134571, 0.132733, 0.125196, 0.107398, 0.135228, 0.126003, 0.132212, 0.111889, 0.132315, 0.129999, 0.126709, 0.122888, 0.11624, 0.118168, 0.117511, 0.133364, 0.107538, 0.105176, 0.13661, 0.127189, 0.118856, 0.125138, 0.117719, 0.102601, 0.108322, 0.128055, 0.11285, 0.129822, 0.124218, 0.128278, 0.113438, 0.116881, 0.131403, 0.132619, 0.134311, 0.126518, 0.132408, 0.128477, 0.131258, 0.109249, 0.130866, 0.13096, 0.119307, 0.122192, 0.126621, 0.130642, 0.128241, 0.10831, 0.131083, 0.122289, 0.132841, 0.129917, 0.129134, 0.10135, 0.121095, 0.12867, 0.124665, 0.113515, 0.109758, 0.114253, 0.106067, 0.112681, 0.119755, 0.12324, 0.109178, 0.103299, 0.122218, 0.117998, 0.123027, 0.123298, 0.128914, 0.113454, 0.126838, 0.103437, 0.131109, 0.105346, 0.12633, 0.113005, 0.117147, 0.119013, 0.111667, 0.130562, 0.133131, 0.112803, 0.130715, 0.131731, 0.126917, 0.129851, 0.10397, 0.128916, 0.12393, 0.12372, 0.131301, 0.108238, 0.119382, 0.111591, 0.132555, 0.108586, 0.116635, 0.130817, 0.102402, 0.118168, 0.116027, 0.129284, 0.131295, 0.108254, 0.101675, 0.115852, 0.114644, 0.113088, 0.106211, 0.11992, 0.107638, 0.120813, 0.118432, 0.11174, 0.130834, 0.110938, 0.132968, 0.10945, 0.105169, 0.13064, 0.119565, 0.135161, 0.129666, 0.128515, 0.108749, 0.105576, 0.125065, 0.104974, 0.117933, 0.129239, 0.134044, 0.115963, 0.121461, 0.101262, 0.110659], "area_num": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 
138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158], "t_pctblack": [1.427054, 2.054089, 1.62247, 2.067223, -0.571001, -0.751165, -0.962027, 1.787476, 1.938852, 2.360389, 0.343293, 1.021487, 1.514925, 2.543305, 0.960576, 0.579333, -0.498492, -0.351502, 1.81343, 1.474091, 0.597696, 2.701643, 1.951858, 1.735149, 0.890002, 1.615912, 2.291372, 1.102801, -1.582224, 1.696627, 0.739203, 2.141897, 1.347938, 1.946313, 2.439857, -1.191861, 2.476966, 2.131924, 0.834293, 1.727777, 2.256296, 0.527207, 2.275422, 0.093091, 1.231076, 1.297547, 2.003273, 1.920576, 1.906397, 2.306169, 0.608413, -1.548925, 0.334071, 0.879985, 1.142487, 1.345685, 2.328186, 0.326093, -1.149781, 1.113164, 1.234299, -0.871034, 1.335771, 1.758505, 2.407358, -1.451449, -0.180277, -0.347764, -0.26002, -0.97909, 2.60129, 1.916199, -1.445025, 2.664429, 0.118778, 1.021292, 2.132797, -0.990281, -1.104017, 1.473619, -0.593063, 0.002792, 0.189985, -0.440808, 0.524102, 2.369783, 0.73308, 1.790638, 1.07006, -1.431878, 1.201306, 2.458042, 0.476659, -1.18185, 1.211263, 1.023896, -1.487726, 1.396634, 1.784685, 2.074325, 2.273394, -0.035106, 0.981109, -1.549049, 1.58959, 1.716912, -1.112541, -1.619114, -1.59564, 2.029086, 0.863052, 1.167835, 1.599251, 1.146231, 2.372841, 1.371699, -1.258477, 1.568643, -0.223734, 1.580033, -0.922473, -0.724635, 1.266086, 0.060804, 2.170711, 0.65643, -0.887132, 1.511118, 1.375946, 1.471151, -1.270003, 1.033424, 1.190796, 1.62997, 1.588052, 2.50093, 2.311368, 0.971615, 0.272884, 0.642506, 2.334434, 2.076718, 0.531731, 0.537312, 1.160625, 2.122713, -1.249899, 1.847558, -1.042728, -0.262048, 1.373168, 1.442209, 1.076834, 0.077966, 1.821677, 1.750865, -1.482242, 0.078119, 2.254575], "t_pctpov": [-1.892203, -2.294038, -1.987757, -2.962119, -1.762416, -1.060813, -0.932843, -2.923921, -2.112442, -2.582057, -2.306955, -1.904795, -1.888905, -2.812877, -1.783596, -1.771213, -1.710202, -1.542034, -2.943902, -1.84999, -1.796571, -3.96105, 
-2.856124, -2.069892, -1.730021, -3.181992, -3.276647, -2.396616, -0.519117, -2.926894, -2.383793, -2.386815, -2.729594, -2.189468, -2.825986, -1.395554, -2.749553, -3.562294, -2.682624, -2.620376, -3.146541, -1.906591, -2.871335, -1.814127, -1.739742, -2.458238, -2.919104, -3.318914, -2.948627, -2.512025, -1.694582, -0.823055, -1.846445, -1.762619, -2.228132, -2.902644, -3.390087, -1.796814, -0.830198, -2.651717, -2.303005, -1.719545, -1.809644, -2.80344, -2.855407, -0.908037, -1.476799, -1.304983, -1.373228, -1.592868, -3.816142, -3.372144, -0.731138, -4.002403, -1.88216, -2.334626, -2.331943, -0.925637, -0.912148, -1.835951, -1.845709, -1.877457, -2.049956, -1.601903, -2.373838, -2.588749, -1.802296, -2.902752, -1.769972, -1.258854, -1.801754, -2.670262, -1.827804, -1.477533, -1.787908, -2.56241, -0.657756, -3.025058, -3.355567, -2.963173, -2.859663, -1.872183, -1.641636, -0.519046, -2.544991, -3.248908, -0.786878, -0.492614, -0.760425, -3.282933, -2.527649, -2.340399, -1.977606, -2.864736, -3.537897, -2.126485, -0.972603, -2.935453, -1.356259, -2.925619, -1.551575, -1.084007, -2.832382, -1.790533, -2.938625, -2.350179, -1.002078, -2.999061, -2.789759, -3.115739, -1.325746, -1.825317, -2.941886, -1.822841, -2.840247, -2.829192, -2.630888, -1.741319, -1.676917, -1.709216, -3.692752, -2.674345, -2.1837, -1.833093, -2.881611, -3.044283, -0.706881, -2.168882, -1.6295, -2.045982, -1.850712, -2.904624, -1.466082, -1.57836, -2.740722, -2.259014, -1.082611, -2.220636, -2.842413], "t_intercept": [7.609379, 6.697503, 7.195672, 8.255795, 14.156497, 18.407904, 18.455024, 16.077957, 6.054346, 7.204967, 13.296838, 7.329656, 6.866246, 7.675677, 8.4897, 9.071485, 9.999358, 17.841186, 8.604366, 6.69702, 9.080628, 16.810189, 15.17167, 6.835291, 8.384399, 11.955242, 15.299161, 16.868109, 18.579766, 8.871642, 17.194655, 7.052968, 16.742201, 6.53967, 7.638512, 11.604368, 7.593943, 17.183962, 13.302195, 8.155776, 14.929177, 17.265368, 7.668111, 17.658053, 6.083667, 8.634759, 
8.295939, 16.901099, 8.408389, 7.259141, 8.810223, 17.697046, 9.122639, 8.7043, 16.183129, 17.280266, 15.592031, 17.414749, 18.485488, 16.905, 16.099393, 12.162038, 7.239849, 15.818031, 7.578958, 17.668373, 17.682992, 18.050671, 17.981138, 14.869228, 16.588831, 14.879699, 18.090369, 17.196475, 17.748172, 9.091638, 6.535975, 18.607913, 18.083927, 7.076811, 10.768372, 9.726515, 9.322075, 14.772789, 17.375394, 7.339344, 7.649891, 8.714896, 8.184484, 14.374787, 7.88758, 7.504212, 17.187921, 12.960213, 7.696175, 10.289304, 18.564107, 11.821484, 16.94767, 8.180987, 7.707789, 16.470342, 7.740734, 18.284333, 15.557935, 13.164332, 18.149077, 18.57868, 18.007492, 16.32635, 10.980793, 16.440536, 7.153743, 17.304261, 16.205781, 7.203743, 17.151505, 9.602895, 17.765752, 9.255965, 10.787743, 17.733848, 10.47511, 9.635323, 7.958089, 17.585678, 18.452111, 10.484603, 9.497019, 14.547858, 15.52837, 8.620986, 12.9729, 5.923095, 8.817958, 7.528327, 7.241212, 8.412973, 17.323629, 8.263278, 16.566306, 7.755803, 9.926525, 16.978309, 15.470937, 15.129371, 18.3484, 6.97004, 13.472636, 10.58946, 7.433559, 9.7771, 6.279439, 17.679091, 15.298295, 6.775579, 16.180906, 11.101535, 7.890733], "t_pctrural": [-4.164093, -3.455642, -3.984466, -3.502894, -6.433839, -9.00415, -9.330583, -7.273569, -3.216792, -3.500257, -5.878384, -3.709493, -3.960341, -3.505663, -4.520597, -4.513625, -4.899843, -8.51144, -3.51973, -3.918325, -4.387163, -6.760826, -6.474081, -3.862817, -4.494979, -4.783707, -6.350931, -8.039987, -9.289223, -3.501551, -8.415808, -3.635542, -8.063056, -3.433894, -3.469055, -5.926957, -3.549694, -7.252536, -5.86143, -3.892085, -6.085906, -8.344896, -3.287008, -8.992089, -3.202256, -4.180517, -3.586052, -7.6905, -3.396553, -3.617773, -4.518828, -8.679796, -4.198436, -4.462907, -7.393499, -7.965298, -6.544475, -8.638036, -8.9432, -8.304627, -7.421519, -5.740929, -4.173442, -7.05162, -3.322371, -8.713754, -9.045413, -8.630204, -8.865623, -7.075921, -6.790647, -5.812039, -8.758158, -6.627627, 
-8.67066, -4.359978, -3.354427, -9.25172, -8.815745, -3.810158, -4.97107, -4.671502, -4.036906, -6.657152, -7.832129, -3.570442, -3.507163, -3.796826, -4.452569, -7.373823, -4.376163, -3.569278, -8.212728, -6.485034, -4.339861, -4.626838, -9.107394, -4.892205, -6.95731, -3.39262, -3.353496, -7.438071, -3.829113, -9.139108, -6.952847, -5.251575, -9.25942, -9.319456, -8.890033, -7.332245, -4.988105, -7.76106, -4.047026, -7.595919, -6.829612, -3.826044, -8.286358, -3.730161, -8.37047, -3.692142, -5.351946, -9.213078, -4.448672, -4.779433, -3.320568, -8.197081, -8.853278, -4.155026, -4.096627, -6.011687, -7.675126, -4.450236, -5.547404, -3.188337, -3.686576, -3.351909, -3.445811, -4.216177, -8.09466, -3.813858, -6.277743, -3.697309, -4.365304, -7.969116, -6.606297, -6.362486, -9.425633, -3.841971, -6.526246, -4.606206, -4.214534, -3.997518, -3.167862, -8.434538, -6.669445, -3.479159, -8.106824, -4.771271, -3.595321], "se_pctrural": [0.021113, 0.022571, 0.021449, 0.020748, 0.019962, 0.020098, 0.019899, 0.019787, 0.022398, 0.021286, 0.019905, 0.023528, 0.023206, 0.021056, 0.022023, 0.021867, 0.022178, 0.018818, 0.020785, 0.024029, 0.021563, 0.01998, 0.020694, 0.023149, 0.022656, 0.019163, 0.02063, 0.019305, 0.020461, 0.020929, 0.019657, 0.022518, 0.01943, 0.022336, 0.020889, 0.022595, 0.020867, 0.019458, 0.019034, 0.020261, 0.020999, 0.01951, 0.021523, 0.01937, 0.024265, 0.020277, 0.020575, 0.019332, 0.021115, 0.022297, 0.022509, 0.020745, 0.022119, 0.021531, 0.020126, 0.019319, 0.020376, 0.019515, 0.020718, 0.019643, 0.020115, 0.021454, 0.023165, 0.020031, 0.02142, 0.019198, 0.019699, 0.020045, 0.019756, 0.019562, 0.01974, 0.019102, 0.021235, 0.019553, 0.019129, 0.020772, 0.021657, 0.019952, 0.019021, 0.021288, 0.022121, 0.021632, 0.022445, 0.020264, 0.018286, 0.02157, 0.023564, 0.020082, 0.022114, 0.021024, 0.022131, 0.021489, 0.019726, 0.021353, 0.02267, 0.019705, 0.02062, 0.019026, 0.018846, 0.021084, 0.021302, 0.019015, 0.021648, 0.019473, 0.02044, 0.01901, 
0.019717, 0.020178, 0.020046, 0.019681, 0.020005, 0.01975, 0.022051, 0.01845, 0.019829, 0.021858, 0.019109, 0.020589, 0.020283, 0.020517, 0.022525, 0.020004, 0.019432, 0.021736, 0.021334, 0.018841, 0.020403, 0.019898, 0.019795, 0.018725, 0.019767, 0.020979, 0.018699, 0.022747, 0.020568, 0.021418, 0.021087, 0.021102, 0.019954, 0.02239, 0.019693, 0.020383, 0.022476, 0.019948, 0.01871, 0.020741, 0.019983, 0.02252, 0.020585, 0.021973, 0.02231, 0.019978, 0.023436, 0.019761, 0.020596, 0.021987, 0.020298, 0.021349, 0.020449], "est_pctblack": [0.069101, 0.109652, 0.081621, 0.107978, -0.028756, -0.031319, -0.039784, 0.074043, 0.093798, 0.116161, 0.017538, 0.051458, 0.086563, 0.126093, 0.045851, 0.026472, -0.024031, -0.017182, 0.099296, 0.088445, 0.026868, 0.126713, 0.083942, 0.099041, 0.043496, 0.091923, 0.100254, 0.043769, -0.069673, 0.097044, 0.034708, 0.116045, 0.055012, 0.101042, 0.120346, -0.056913, 0.120821, 0.105723, 0.044647, 0.085246, 0.099642, 0.020989, 0.120838, 0.003858, 0.061184, 0.067138, 0.103493, 0.081887, 0.105581, 0.122948, 0.028657, -0.068744, 0.01541, 0.040416, 0.047114, 0.065078, 0.101303, 0.012956, -0.04988, 0.047631, 0.05087, -0.042543, 0.07408, 0.073207, 0.124138, -0.067188, -0.007274, -0.014491, -0.010531, -0.046921, 0.113601, 0.113778, -0.064522, 0.133808, 0.005659, 0.051272, 0.102649, -0.041146, -0.053152, 0.070016, -0.029277, 0.00013, 0.009159, -0.022814, 0.026241, 0.120036, 0.035151, 0.091913, 0.052522, -0.063926, 0.060652, 0.124459, 0.019242, -0.055326, 0.063397, 0.057636, -0.06509, 0.079295, 0.09742, 0.111371, 0.118058, -0.001812, 0.044016, -0.073499, 0.066919, 0.09991, -0.050833, -0.071948, -0.07113, 0.085506, 0.046023, 0.047284, 0.084538, 0.059245, 0.101547, 0.066148, -0.061461, 0.091916, -0.009479, 0.089892, -0.044908, -0.032054, 0.070816, 0.002808, 0.116789, 0.032794, -0.037893, 0.087485, 0.074573, 0.08465, -0.058766, 0.046954, 0.066035, 0.078357, 0.088406, 0.127193, 0.112785, 0.043583, 0.011283, 0.029085, 0.133664, 0.098907, 0.027869, 
0.022034, 0.065065, 0.092313, -0.053861, 0.101281, -0.049077, -0.013325, 0.072389, 0.081356, 0.05037, 0.003179, 0.077726, 0.084694, -0.066626, 0.003954, 0.109955], "se_intercept": [2.414905, 2.693495, 2.525672, 2.254469, 1.767947, 1.568279, 1.578247, 1.663068, 2.862878, 2.499664, 1.754697, 2.53432, 2.7458, 2.37276, 2.358372, 2.266851, 2.33053, 1.53043, 2.200939, 2.850729, 2.214298, 1.581426, 1.713271, 2.759415, 2.423043, 1.758621, 1.706088, 1.614738, 1.594406, 2.164187, 1.632367, 2.633065, 1.645574, 2.705483, 2.377208, 2.253458, 2.385997, 1.547755, 1.71435, 2.276863, 1.726498, 1.598733, 2.397814, 1.610286, 2.878158, 2.211205, 2.243644, 1.61194, 2.243294, 2.566473, 2.352536, 1.661851, 2.248079, 2.263805, 1.650174, 1.582012, 1.689538, 1.607162, 1.586713, 1.657121, 1.663216, 2.067999, 2.67157, 1.673314, 2.409132, 1.605387, 1.617749, 1.568208, 1.579703, 1.770208, 1.597488, 1.549723, 1.637652, 1.493865, 1.56963, 2.16101, 2.69733, 1.563976, 1.54099, 2.48411, 2.20626, 2.257751, 2.231355, 1.703418, 1.497784, 2.490184, 2.431209, 2.158241, 2.408576, 1.942379, 2.463849, 2.44542, 1.601738, 2.049086, 2.544855, 1.97318, 1.593987, 1.777595, 1.48837, 2.280386, 2.383153, 1.562314, 2.322577, 1.569305, 1.69604, 1.659262, 1.589465, 1.585724, 1.620291, 1.653437, 1.931336, 1.647418, 2.601788, 1.497189, 1.635503, 2.553523, 1.6014, 2.048805, 1.58145, 2.095997, 2.300383, 1.635103, 1.923033, 2.283359, 2.331305, 1.537997, 1.568949, 1.923869, 2.046619, 1.586403, 1.772296, 2.233654, 1.698501, 2.852481, 2.168708, 2.418624, 2.486725, 2.227902, 1.586879, 2.283136, 1.49281, 2.349879, 2.112019, 1.609147, 1.564276, 1.717976, 1.595726, 2.676737, 1.941109, 2.139217, 2.563838, 2.013145, 2.682408, 1.574679, 1.710298, 2.660378, 1.762358, 1.996974, 2.314566], "y": [8.2, 6.4, 6.6, 9.4, 13.3, 6.4, 9.2, 9.0, 7.6, 7.5, 17.0, 10.3, 5.8, 9.1, 11.8, 19.9, 9.6, 7.2, 10.1, 13.5, 9.9, 12.0, 8.1, 6.4, 18.6, 20.2, 5.9, 18.4, 37.5, 11.2, 14.7, 6.7, 33.0, 11.1, 10.0, 23.9, 6.5, 13.3, 5.7, 10.0, 8.0, 8.6, 11.7, 32.7, 
8.0, 9.5, 17.0, 12.0, 9.4, 4.7, 7.6, 8.0, 9.1, 8.6, 7.8, 25.8, 13.7, 15.6, 9.5, 31.6, 8.6, 5.3, 19.9, 9.2, 7.7, 8.8, 29.6, 12.0, 15.4, 6.8, 7.5, 13.6, 9.1, 5.7, 10.7, 16.0, 8.3, 9.0, 10.8, 8.3, 6.2, 7.7, 4.9, 12.0, 10.0, 5.4, 12.0, 13.7, 13.4, 8.2, 5.2, 16.3, 11.1, 10.4, 8.7, 10.1, 9.7, 4.6, 6.7, 8.2, 7.8, 12.9, 10.1, 11.0, 5.5, 16.6, 9.5, 28.4, 12.8, 7.6, 15.2, 9.0, 6.3, 9.3, 6.8, 10.7, 11.7, 7.3, 11.6, 6.0, 17.3, 18.1, 8.0, 8.6, 7.8, 11.1, 13.1, 8.0, 15.9, 7.1, 5.6, 6.5, 7.1, 8.6, 9.2, 13.4, 14.0, 11.4, 11.4, 6.3, 13.6, 7.2, 4.8, 10.1, 9.0, 8.4, 9.4, 10.4, 4.2, 9.8, 9.6, 5.5, 8.6, 13.6, 12.0, 7.6, 10.4, 8.8, 6.3]} \ No newline at end of file diff --git a/release/python/0.7.0/crankshaft/test/fixtures/kmeans.json b/release/python/0.7.0/crankshaft/test/fixtures/kmeans.json new file mode 100644 index 0000000..8f31c79 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/fixtures/kmeans.json @@ -0,0 +1 @@ +[{"xs": [9.917239463463458, 9.042767302696836, 10.798929825304187, 8.763751051762995, 11.383882954810852, 11.018206993460897, 8.939526075734316, 9.636159342565252, 10.136336896960058, 11.480610059427342, 12.115011910725082, 9.173267848893428, 10.239300931201738, 8.00012512174072, 8.979962292282131, 9.318376124429575, 10.82259513754284, 10.391747171927115, 10.04904588886165, 9.96007160443463, -0.78825626804569, -0.3511819898577426, -1.2796410003764271, -0.3977049391203402, 2.4792311265774667, 1.3670311632092624, 1.2963504112955613, 2.0404844103073025, -1.6439708506073223, 0.39122885445645805, 1.026031821452462, -0.04044477160482201, -0.7442346929085072, -0.34687120826243034, -0.23420359971379054, -0.5919629143336708, -0.202903054395391, -0.1893399644841902, 1.9331834251176807, -0.12321054392851609], "ys": [8.735627063679981, 9.857615954045011, 10.81439096759407, 10.586727233537191, 9.232919976568622, 11.54281262696508, 8.392787912674466, 9.355119689665944, 9.22380703532752, 10.542142541823122, 10.111980619367035, 10.760836265570738, 8.819773453269804, 
10.25325722424816, 9.802077905695608, 8.955420161552611, 9.833801181904477, 10.491684241001613, 12.076108669877556, 11.74289693140474, -0.5685725015474191, -0.5715728344759778, -0.20180907868635137, 0.38431336480089595, -0.3402202083684184, -2.4652736827783586, 0.08295159401756182, 0.8503818775816505, 0.6488691600321166, 0.5794762568230527, -0.6770063922144103, -0.6557616416449478, -1.2834289177624947, 0.1096318195532717, -0.38986922166834853, -1.6224497706950238, 0.09429787743230483, 0.4005097316394031, -0.508002811195673, -1.2473463371366507], "ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]}] \ No newline at end of file diff --git a/release/python/0.7.0/crankshaft/test/fixtures/markov.json b/release/python/0.7.0/crankshaft/test/fixtures/markov.json new file mode 100644 index 0000000..d60e4e0 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/fixtures/markov.json @@ -0,0 +1 @@ +[[0.11111111111111112, 0.10000000000000001, 0.0, 0.35213633723318016, 0], [0.03125, 0.030303030303030304, 0.0, 0.3850273981640871, 1], [0.03125, 0.030303030303030304, 0.0, 0.3850273981640871, 2], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 3], [0.0, 0.065217391304347824, 0.065217391304347824, 0.33605067580764519, 4], [-0.054054054054054057, 0.0, 0.05128205128205128, 0.37488547451276033, 5], [0.1875, 0.23999999999999999, 0.12, 0.23731835158706122, 6], [0.034482758620689655, 0.0625, 0.03125, 0.35388469167230169, 7], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 8], [0.19047619047619049, 0.16, 0.0, 0.32594478059941379, 9], [-0.23529411764705882, 0.0, 0.19047619047619047, 0.31356338348865387, 10], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 11], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 12], [0.027777777777777783, 
0.11111111111111112, 0.088888888888888892, 0.30339641183779581, 13], [0.03125, 0.030303030303030304, 0.0, 0.3850273981640871, 14], [0.052631578947368425, 0.090909090909090912, 0.045454545454545456, 0.33352611505171165, 15], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 16], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 17], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 18], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 19], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 20], [0.078947368421052641, 0.073170731707317083, 0.0, 0.36451788667842738, 21], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 22], [-0.16666666666666663, 0.18181818181818182, 0.27272727272727271, 0.20246415864836445, 23], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 24], [0.1875, 0.23999999999999999, 0.12, 0.23731835158706122, 25], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 26], [-0.043478260869565216, 0.0, 0.041666666666666664, 0.37950991789118999, 27], [0.22222222222222221, 0.18181818181818182, 0.0, 0.31701083225750354, 28], [-0.054054054054054057, 0.0, 0.05128205128205128, 0.37488547451276033, 29], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 30], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 31], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 32], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 33], [0.034482758620689655, 0.0625, 0.03125, 0.35388469167230169, 34], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 35], [-0.054054054054054057, 0.0, 0.05128205128205128, 0.37488547451276033, 36], [0.11111111111111112, 0.10000000000000001, 0.0, 0.35213633723318016, 37], [-0.22222222222222224, 0.13333333333333333, 
0.26666666666666666, 0.22310934040908681, 38], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 39], [0.034482758620689655, 0.0625, 0.03125, 0.35388469167230169, 40], [0.11111111111111112, 0.10000000000000001, 0.0, 0.35213633723318016, 41], [0.052631578947368425, 0.090909090909090912, 0.045454545454545456, 0.33352611505171165, 42], [0.0, 0.0, 0.0, 0.40000000000000002, 43], [0.0, 0.065217391304347824, 0.065217391304347824, 0.33605067580764519, 44], [0.078947368421052641, 0.073170731707317083, 0.0, 0.36451788667842738, 45], [0.052631578947368425, 0.090909090909090912, 0.045454545454545456, 0.33352611505171165, 46], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 47]] diff --git a/release/python/0.7.0/crankshaft/test/fixtures/moran.json b/release/python/0.7.0/crankshaft/test/fixtures/moran.json new file mode 100644 index 0000000..2f75cf1 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/fixtures/moran.json @@ -0,0 +1,52 @@ +[[0.9319096128346788, "HH"], +[-1.135787401862846, "HL"], +[0.11732030672508517, "LL"], +[0.6152779669180425, "LL"], +[-0.14657336660125297, "LH"], +[0.6967858120189607, "LL"], +[0.07949310115714454, "HH"], +[0.4703198759258987, "HH"], +[0.4421125200498064, "HH"], +[0.5724288737143592, "LL"], +[0.8970743435692062, "LL"], +[0.18327334401918674, "LL"], +[-0.01466729201304962, "HL"], +[0.3481559372544409, "LL"], +[0.06547094736902978, "LL"], +[0.15482141569329988, "HH"], +[0.4373841193538136, "HH"], +[0.15971286468915544, "LL"], +[1.0543588860308968, "HH"], +[1.7372866900020818, "HH"], +[1.091998586053999, "LL"], +[0.1171572584252222, "HH"], +[0.08438455015300014, "LL"], +[0.06547094736902978, "LL"], +[0.15482141569329985, "HH"], +[1.1627044812890683, "HH"], +[0.06547094736902978, "LL"], +[0.795275137550483, "HH"], +[0.18562939195219, "LL"], +[0.3010757406693439, "LL"], +[2.8205795942839376, "HH"], +[0.11259190602909264, "LL"], +[-0.07116352791516614, "HL"], +[-0.09945240794119009, "LH"], 
+[0.18562939195219, "LL"], +[0.1832733440191868, "LL"], +[-0.39054253768447705, "HL"], +[-0.1672071289487642, "HL"], +[0.3337669247916343, "HH"], +[0.2584386102554792, "HH"], +[-0.19733845476322634, "HL"], +[-0.9379282899805409, "LH"], +[-0.028770969951095866, "LH"], +[0.051367269430983485, "LL"], +[-0.2172548045913472, "LH"], +[0.05136726943098351, "LL"], +[0.04191046803899837, "LL"], +[0.7482357030403517, "HH"], +[-0.014585767863118111, "LH"], +[0.5410013139159929, "HH"], +[1.0223932668429925, "LL"], +[1.4179402898927476, "LL"]] \ No newline at end of file diff --git a/release/python/0.7.0/crankshaft/test/fixtures/neighbors.json b/release/python/0.7.0/crankshaft/test/fixtures/neighbors.json new file mode 100644 index 0000000..055b359 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/fixtures/neighbors.json @@ -0,0 +1,54 @@ +[ + {"neighbors": [48, 26, 20, 9, 31], "id": 1, "value": 0.5}, + {"neighbors": [30, 16, 46, 3, 4], "id": 2, "value": 0.7}, + {"neighbors": [46, 30, 2, 12, 16], "id": 3, "value": 0.2}, + {"neighbors": [18, 30, 23, 2, 52], "id": 4, "value": 0.1}, + {"neighbors": [47, 40, 45, 37, 28], "id": 5, "value": 0.3}, + {"neighbors": [10, 21, 41, 14, 37], "id": 6, "value": 0.05}, + {"neighbors": [8, 17, 43, 25, 12], "id": 7, "value": 0.4}, + {"neighbors": [17, 25, 43, 22, 7], "id": 8, "value": 0.7}, + {"neighbors": [39, 34, 1, 26, 48], "id": 9, "value": 0.5}, + {"neighbors": [6, 37, 5, 45, 49], "id": 10, "value": 0.04}, + {"neighbors": [51, 41, 29, 21, 14], "id": 11, "value": 0.08}, + {"neighbors": [44, 46, 43, 50, 3], "id": 12, "value": 0.2}, + {"neighbors": [45, 23, 14, 28, 18], "id": 13, "value": 0.4}, + {"neighbors": [41, 29, 13, 23, 6], "id": 14, "value": 0.2}, + {"neighbors": [36, 27, 32, 33, 24], "id": 15, "value": 0.3}, + {"neighbors": [19, 2, 46, 44, 28], "id": 16, "value": 0.4}, + {"neighbors": [8, 25, 43, 7, 22], "id": 17, "value": 0.6}, + {"neighbors": [23, 4, 29, 14, 13], "id": 18, "value": 0.3}, + {"neighbors": [42, 16, 28, 26, 40], 
"id": 19, "value": 0.7}, + {"neighbors": [1, 48, 31, 26, 42], "id": 20, "value": 0.8}, + {"neighbors": [41, 6, 11, 14, 10], "id": 21, "value": 0.1}, + {"neighbors": [25, 50, 43, 31, 44], "id": 22, "value": 0.4}, + {"neighbors": [18, 13, 14, 4, 2], "id": 23, "value": 0.1}, + {"neighbors": [33, 49, 34, 47, 27], "id": 24, "value": 0.3}, + {"neighbors": [43, 8, 22, 17, 50], "id": 25, "value": 0.4}, + {"neighbors": [1, 42, 20, 31, 48], "id": 26, "value": 0.6}, + {"neighbors": [32, 15, 36, 33, 24], "id": 27, "value": 0.3}, + {"neighbors": [40, 45, 19, 5, 13], "id": 28, "value": 0.8}, + {"neighbors": [11, 51, 41, 14, 18], "id": 29, "value": 0.3}, + {"neighbors": [2, 3, 4, 46, 18], "id": 30, "value": 0.1}, + {"neighbors": [20, 26, 1, 50, 48], "id": 31, "value": 0.9}, + {"neighbors": [27, 36, 15, 49, 24], "id": 32, "value": 0.3}, + {"neighbors": [24, 27, 49, 34, 32], "id": 33, "value": 0.4}, + {"neighbors": [47, 9, 39, 40, 24], "id": 34, "value": 0.3}, + {"neighbors": [38, 51, 11, 21, 41], "id": 35, "value": 0.3}, + {"neighbors": [15, 32, 27, 49, 33], "id": 36, "value": 0.2}, + {"neighbors": [49, 10, 5, 47, 24], "id": 37, "value": 0.5}, + {"neighbors": [35, 21, 51, 11, 41], "id": 38, "value": 0.4}, + {"neighbors": [9, 34, 48, 1, 47], "id": 39, "value": 0.6}, + {"neighbors": [28, 47, 5, 9, 34], "id": 40, "value": 0.5}, + {"neighbors": [11, 14, 29, 21, 6], "id": 41, "value": 0.4}, + {"neighbors": [26, 19, 1, 9, 31], "id": 42, "value": 0.2}, + {"neighbors": [25, 12, 8, 22, 44], "id": 43, "value": 0.3}, + {"neighbors": [12, 50, 46, 16, 43], "id": 44, "value": 0.2}, + {"neighbors": [28, 13, 5, 40, 19], "id": 45, "value": 0.3}, + {"neighbors": [3, 12, 44, 2, 16], "id": 46, "value": 0.2}, + {"neighbors": [34, 40, 5, 49, 24], "id": 47, "value": 0.3}, + {"neighbors": [1, 20, 26, 9, 39], "id": 48, "value": 0.5}, + {"neighbors": [24, 37, 47, 5, 33], "id": 49, "value": 0.2}, + {"neighbors": [44, 22, 31, 42, 26], "id": 50, "value": 0.6}, + {"neighbors": [11, 29, 41, 14, 21], "id": 51, 
"value": 0.01}, + {"neighbors": [4, 18, 29, 51, 23], "id": 52, "value": 0.01} + ] diff --git a/release/python/0.7.0/crankshaft/test/fixtures/neighbors_getis.json b/release/python/0.7.0/crankshaft/test/fixtures/neighbors_getis.json new file mode 100644 index 0000000..be367ff --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/fixtures/neighbors_getis.json @@ -0,0 +1 @@ +[{"neighbors": [3, 6, 7], "id": 1, "value": 1.624458}, {"neighbors": [10, 5, 8], "id": 2, "value": 2.2554919999999998}, {"neighbors": [1, 4, 7], "id": 3, "value": 1.4678899999999999}, {"neighbors": [9, 3, 5, 7], "id": 4, "value": 2.4842559999999998}, {"neighbors": [9, 2, 4, 10], "id": 5, "value": 0.0}, {"neighbors": [1, 11, 12, 7, 16], "id": 6, "value": 9.0486730000000009}, {"neighbors": [1, 3, 4, 6, 9, 11, 18, 19], "id": 7, "value": 6.0294889999999999}, {"neighbors": [2, 15, 10], "id": 8, "value": 1.8003849999999999}, {"neighbors": [4, 5, 7, 10, 13, 19, 20], "id": 9, "value": 4.581251}, {"neighbors": [2, 5, 8, 9, 13, 15, 17, 20, 21], "id": 10, "value": 3.7906070000000001}, {"neighbors": [18, 6, 7, 16], "id": 11, "value": 1.4474359999999999}, {"neighbors": [16, 6, 14], "id": 12, "value": 1.1919660000000001}, {"neighbors": [9, 10, 20], "id": 13, "value": 0.0}, {"neighbors": [12, 22, 16], "id": 14, "value": 1.608017}, {"neighbors": [17, 10, 23, 8], "id": 15, "value": 1.9498120000000001}, {"neighbors": [6, 11, 12, 14, 18, 22, 27, 28], "id": 16, "value": 0.74509000000000003}, {"neighbors": [10, 15, 21, 23, 26, 30], "id": 17, "value": 4.1733180000000001}, {"neighbors": [33, 7, 11, 16, 19, 27, 32], "id": 18, "value": 3.7832520000000001}, {"neighbors": [33, 7, 9, 18, 20, 24], "id": 19, "value": 2.0851359999999999}, {"neighbors": [9, 10, 13, 19, 21, 24], "id": 20, "value": 2.1763020000000002}, {"neighbors": [35, 10, 17, 20, 24, 26], "id": 21, "value": 6.3093469999999998}, {"neighbors": [28, 29, 14, 16], "id": 22, "value": 10.855743}, {"neighbors": [17, 25, 31, 30, 15], "id": 23, "value": 4.211354}, 
{"neighbors": [33, 19, 20, 21, 35], "id": 24, "value": 0.80481000000000003}, {"neighbors": [42, 31, 23], "id": 25, "value": 3.2153309999999999}, {"neighbors": [17, 34, 35, 21, 30], "id": 26, "value": 2.8336640000000002}, {"neighbors": [36, 39, 41, 16, 18, 28, 32], "id": 27, "value": 1.5920399999999999}, {"neighbors": [27, 36, 29, 22, 16], "id": 28, "value": 1.5711580000000001}, {"neighbors": [36, 28, 22, 38], "id": 29, "value": 3.1275900000000001}, {"neighbors": [34, 43, 17, 23, 26, 31], "id": 30, "value": 4.4168960000000004}, {"neighbors": [42, 43, 44, 23, 25, 30], "id": 31, "value": 3.0174859999999999}, {"neighbors": [33, 18, 27, 41], "id": 32, "value": 9.9242450000000009}, {"neighbors": [35, 37, 40, 41, 46, 18, 19, 24, 32], "id": 33, "value": 7.9739570000000004}, {"neighbors": [26, 35, 43, 45, 30], "id": 34, "value": 5.0054639999999999}, {"neighbors": [33, 34, 37, 40, 45, 21, 24, 26], "id": 35, "value": 2.4638909999999998}, {"neighbors": [38, 39, 47, 27, 28, 29], "id": 36, "value": 0.0}, {"neighbors": [33, 35, 40, 45, 46, 49, 51], "id": 37, "value": 7.377974}, {"neighbors": [36, 29, 47, 48], "id": 38, "value": 1.0038750000000001}, {"neighbors": [36, 41, 47, 50, 52, 27], "id": 39, "value": 3.1900469999999999}, {"neighbors": [33, 35, 37, 46], "id": 40, "value": 45.905405999999999}, {"neighbors": [33, 39, 46, 50, 27, 32], "id": 41, "value": 2.447597}, {"neighbors": [25, 44, 53, 31], "id": 42, "value": 1.2949580000000001}, {"neighbors": [34, 44, 45, 54, 59, 61, 30, 31], "id": 43, "value": 5.9330980000000002}, {"neighbors": [42, 43, 53, 54, 31], "id": 44, "value": 4.1339969999999999}, {"neighbors": [34, 35, 37, 43, 51, 59, 60], "id": 45, "value": 4.298311}, {"neighbors": [33, 37, 40, 41, 49, 50, 57], "id": 46, "value": 27.483827000000002}, {"neighbors": [36, 38, 39, 48, 52, 55, 56], "id": 47, "value": 0.96979099999999996}, {"neighbors": [55, 38, 47], "id": 48, "value": 0.0}, {"neighbors": [57, 51, 37, 46, 63], "id": 49, "value": 2.934466}, {"neighbors": [39, 41, 46, 
52, 57, 58], "id": 50, "value": 4.4564269999999997}, {"neighbors": [37, 45, 49, 60, 63, 64], "id": 51, "value": 4.629264}, {"neighbors": [39, 47, 50, 56, 58, 62], "id": 52, "value": 4.9415329999999997}, {"neighbors": [65, 42, 44, 54], "id": 53, "value": 3.9900410000000002}, {"neighbors": [65, 61, 43, 44, 53], "id": 54, "value": 2.064324}, {"neighbors": [56, 47, 48], "id": 55, "value": 3.0402529999999999}, {"neighbors": [52, 55, 47, 62], "id": 56, "value": 3.905411}, {"neighbors": [66, 67, 46, 49, 50, 58, 63], "id": 57, "value": 4.3328389999999999}, {"neighbors": [57, 50, 52, 62, 66], "id": 58, "value": 3.8941110000000001}, {"neighbors": [69, 70, 43, 45, 60, 61], "id": 59, "value": 6.8287940000000003}, {"neighbors": [51, 64, 45, 59, 70], "id": 60, "value": 3.2639469999999999}, {"neighbors": [65, 69, 72, 43, 54, 59], "id": 61, "value": 3.2821630000000002}, {"neighbors": [58, 68, 52, 66, 56], "id": 62, "value": 3.2957619999999999}, {"neighbors": [49, 57, 51, 67, 64], "id": 63, "value": 7.2496790000000004}, {"neighbors": [67, 70, 71, 51, 60, 63], "id": 64, "value": 3.041846}, {"neighbors": [61, 53, 54, 72], "id": 65, "value": 1.618018}, {"neighbors": [67, 68, 73, 76, 57, 58, 62], "id": 66, "value": 4.9108010000000002}, {"neighbors": [66, 71, 73, 75, 76, 57, 63, 64], "id": 67, "value": 1.991457}, {"neighbors": [73, 66, 62], "id": 68, "value": 3.1461920000000001}, {"neighbors": [70, 72, 74, 77, 59, 61], "id": 69, "value": 7.2666500000000003}, {"neighbors": [69, 71, 74, 78, 59, 60, 64], "id": 70, "value": 3.1109040000000001}, {"neighbors": [67, 75, 70, 78, 64], "id": 71, "value": 2.9802710000000001}, {"neighbors": [65, 69, 61, 77], "id": 72, "value": 3.8667669999999998}, {"neighbors": [76, 66, 67, 68], "id": 73, "value": 1.8684080000000001}, {"neighbors": [77, 69, 70, 78], "id": 74, "value": 12.577033999999999}, {"neighbors": [67, 76, 78, 71], "id": 75, "value": 7.8035990000000002}, {"neighbors": [73, 66, 67, 75], "id": 76, "value": 3.4714900000000002}, {"neighbors": [74, 
69, 72], "id": 77, "value": 4.334822}, {"neighbors": [74, 75, 70, 71], "id": 78, "value": 8.4515370000000001}] diff --git a/release/python/0.7.0/crankshaft/test/fixtures/neighbors_markov.json b/release/python/0.7.0/crankshaft/test/fixtures/neighbors_markov.json new file mode 100644 index 0000000..45a20e7 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/fixtures/neighbors_markov.json @@ -0,0 +1 @@ +[{"neighbors": [10, 7, 21, 23, 1], "y1995": 0.87654416055651474, "y1997": 0.85637566664752718, "y1996": 0.8631470006766887, "y1999": 0.84461540228037335, "y1998": 0.84811668329242784, "y2006": 0.86302631339545688, "y2007": 0.86148266513456728, "y2004": 0.86416611731111015, "y2005": 0.87119374831581786, "y2002": 0.85012592862683589, "y2003": 0.8550965633336135, "y2000": 0.83271652434603094, "y2001": 0.83786313566577242, "id": 0, "y2008": 0.86252252380501315, "y2009": 0.86746356478544273}, {"neighbors": [5, 7, 22, 29, 3], "y1995": 0.91889509774542122, "y1997": 0.92333257900976462, "y1996": 0.91757931190043385, "y1999": 0.92552387732371888, "y1998": 0.92517289327379471, "y2006": 0.91706053906277052, "y2007": 0.90139504820726424, "y2004": 0.89815175749309051, "y2005": 0.91832090781161113, "y2002": 0.89431990798552208, "y2003": 0.88924793576523797, "y2000": 0.90746978227271013, "y2001": 0.89830489127332913, "id": 1, "y2008": 0.87897455159080617, "y2009": 0.86216858051752643}, {"neighbors": [11, 8, 13, 18, 17], "y1995": 0.82591007476914713, "y1997": 0.81989792988843901, "y1996": 0.82548595539161707, "y1999": 0.81731522200916285, "y1998": 0.81503235035017918, "y2006": 0.81814804358939286, "y2007": 0.83675961003285626, "y2004": 0.82668195534569056, "y2005": 0.82373723764184559, "y2002": 0.80849979516360859, "y2003": 0.82258550658074148, "y2000": 0.78964559168205917, "y2001": 0.8058444152731008, "id": 2, "y2008": 0.8357419865626442, "y2009": 0.84647177436289112}, {"neighbors": [4, 14, 9, 5, 12], "y1995": 1.0908817638059434, "y1997": 1.0845641754849344, "y1996": 
1.0853768890893893, "y1999": 1.098988414417104, "y1998": 1.0841540389418189, "y2006": 1.1316479722785828, "y2007": 1.1295850763954971, "y2004": 1.1139980568106316, "y2005": 1.1216802898290368, "y2002": 1.1116069731657288, "y2003": 1.1088862051501811, "y2000": 1.1450694824791507, "y2001": 1.1215113292620285, "id": 3, "y2008": 1.1137181812756343, "y2009": 1.0993677488645406}, {"neighbors": [14, 3, 9, 31, 12], "y1995": 1.1073144618319228, "y1997": 1.1328363804627946, "y1996": 1.1137394350312471, "y1999": 1.1591002514611153, "y1998": 1.144725587086376, "y2006": 1.1173646811350333, "y2007": 1.1086324218539598, "y2004": 1.1102496406140896, "y2005": 1.11943471361418, "y2002": 1.1475230282561595, "y2003": 1.1184328424005199, "y2000": 1.1689820101690329, "y2001": 1.1721248787169682, "id": 4, "y2008": 1.0964251552643696, "y2009": 1.0776233718455337}, {"neighbors": [29, 1, 22, 7, 4], "y1995": 1.422697571371182, "y1997": 1.4427350196405593, "y1996": 1.4211843379728528, "y1999": 1.4440068434166562, "y1998": 1.4357757095632602, "y2006": 1.4405276647793266, "y2007": 1.4524121586440921, "y2004": 1.4059372049179741, "y2005": 1.4078864636665769, "y2002": 1.4197822680667809, "y2003": 1.3909220829548647, "y2000": 1.4418473669388905, "y2001": 1.4478283203013527, "id": 5, "y2008": 1.4330609762040207, "y2009": 1.4174430982377491}, {"neighbors": [12, 47, 9, 25, 20], "y1995": 1.1307388498039153, "y1997": 1.1107470843142355, "y1996": 1.1311051255854685, "y1999": 1.130881491772973, "y1998": 1.1336463608751246, "y2006": 1.1088003408832796, "y2007": 1.0840170924825394, "y2004": 1.1244623853593112, "y2005": 1.1167100811401538, "y2002": 1.1306293052597198, "y2003": 1.1194498381213465, "y2000": 1.1088813841947593, "y2001": 1.1185662918783175, "id": 6, "y2008": 1.0695920556329086, "y2009": 1.0787522517402164}, {"neighbors": [21, 1, 22, 10, 0], "y1995": 1.0470612357366649, "y1997": 1.0425337165747406, "y1996": 1.0451683097376836, "y1999": 1.0207254480945218, "y1998": 1.0323998680588111, "y2006": 
1.0405109962442973, "y2007": 1.0174964540280445, "y2004": 1.0140090547678748, "y2005": 1.0317674181861733, "y2002": 0.99669586934394627, "y2003": 0.99327675611171373, "y2000": 0.99854316295509526, "y2001": 0.98802579761429143, "id": 7, "y2008": 0.9936394033949828, "y2009": 0.98279746069218921}, {"neighbors": [11, 13, 17, 18, 15], "y1995": 0.98996985668705595, "y1997": 0.99491000469481983, "y1996": 1.0014356415938011, "y1999": 1.0045584503565237, "y1998": 1.0018840754492748, "y2006": 0.92232873520447411, "y2007": 0.91284090705064902, "y2004": 0.93694786512729977, "y2005": 0.94308212820743131, "y2002": 0.96834820215592055, "y2003": 0.95335147249088092, "y2000": 0.99127006477048718, "y2001": 0.97925917470464008, "id": 8, "y2008": 0.89689832627117483, "y2009": 0.88928857608264111}, {"neighbors": [12, 6, 4, 3, 14], "y1995": 0.87418390853652306, "y1997": 0.84425695187978567, "y1996": 0.86416601430334228, "y1999": 0.83903043942542854, "y1998": 0.8404493987171674, "y2006": 0.87204140839730271, "y2007": 0.86633032299764789, "y2004": 0.86981997840756087, "y2005": 0.86837929279319737, "y2002": 0.86107306112852877, "y2003": 0.85007719735663123, "y2000": 0.85787080050645603, "y2001": 0.86036185149249467, "id": 9, "y2008": 0.84946077011565357, "y2009": 0.83287145944123797}, {"neighbors": [0, 7, 21, 23, 22], "y1995": 1.1419611801631209, "y1997": 1.1489271154554144, "y1996": 1.146602624490825, "y1999": 1.1443662376135306, "y1998": 1.1490959392942743, "y2006": 1.1049125811637337, "y2007": 1.1105984164317646, "y2004": 1.1119989015058092, "y2005": 1.1025779214946556, "y2002": 1.1259666377127024, "y2003": 1.1221399558345004, "y2000": 1.144501826035474, "y2001": 1.1234975172649961, "id": 10, "y2008": 1.1050979494645479, "y2009": 1.1002009697391872}, {"neighbors": [8, 13, 18, 17, 2], "y1995": 0.97282462974938089, "y1997": 0.96252588061647382, "y1996": 0.96700147279313231, "y1999": 0.96057686787383312, "y1998": 0.96538780087103548, "y2006": 0.91010201260822066, "y2007": 
0.89280392121658247, "y2004": 0.94103988614185807, "y2005": 0.9212251863828258, "y2002": 0.94804194711420009, "y2003": 0.9543028555845573, "y2000": 0.95831051250950716, "y2001": 0.94480908623936988, "id": 11, "y2008": 0.89298242828382146, "y2009": 0.89165384824292859}, {"neighbors": [33, 9, 6, 25, 31], "y1995": 0.94325467991401402, "y1997": 0.96455242154753429, "y1996": 0.96436902092427723, "y1999": 0.94117647058823528, "y1998": 0.95243008993884537, "y2006": 0.9346681464882507, "y2007": 0.94281559150403071, "y2004": 0.96918424441756057, "y2005": 0.94781280876672958, "y2002": 0.95388717527096822, "y2003": 0.94597005193649519, "y2000": 0.94809269652332606, "y2001": 0.93539181553564288, "id": 12, "y2008": 0.965203150896216, "y2009": 0.967154410723015}, {"neighbors": [18, 17, 11, 8, 19], "y1995": 0.97478408425654373, "y1997": 0.98712808751954773, "y1996": 0.98169225257738801, "y1999": 0.985598971191053, "y1998": 0.98474769442356791, "y2006": 0.98416665248276058, "y2007": 0.98423613480079708, "y2004": 0.97399471186978948, "y2005": 0.96910087128357136, "y2002": 0.9820996926750224, "y2003": 0.98776529543110569, "y2000": 0.98687072733199255, "y2001": 0.99237486444837619, "id": 13, "y2008": 0.99823861244053191, "y2009": 0.99545704236827348}, {"neighbors": [4, 31, 3, 29, 12], "y1995": 0.85570268988941878, "y1997": 0.85986131704895119, "y1996": 0.85575915188345031, "y1999": 0.85380119644969055, "y1998": 0.85693406055397725, "y2006": 0.82803647591954255, "y2007": 0.81987360180979219, "y2004": 0.83998883284341452, "y2005": 0.83478547261894065, "y2002": 0.85472102128186755, "y2003": 0.84564834502399988, "y2000": 0.86191535266765262, "y2001": 0.84981450830432048, "id": 14, "y2008": 0.82265395167873867, "y2009": 0.83994039782937002}, {"neighbors": [19, 8, 17, 16, 13], "y1995": 0.87022046646521634, "y1997": 0.85961813213722393, "y1996": 0.85996258309339635, "y1999": 0.8394713575455558, "y1998": 0.85689572413110093, "y2006": 0.94202108334913126, "y2007": 0.94222309998743192, 
"y2004": 0.86763340229291142, "y2005": 0.89179316746010362, "y2002": 0.86776297543511893, "y2003": 0.86720209304280604, "y2000": 0.82785596604704892, "y2001": 0.86008789452656809, "id": 15, "y2008": 0.93902708112840494, "y2009": 0.94479183757120588}, {"neighbors": [28, 26, 15, 19, 32], "y1995": 0.90134907329491731, "y1997": 0.90403990934606904, "y1996": 0.904077381347274, "y1999": 0.90399237579083946, "y1998": 0.90201769385650832, "y2006": 0.91108803862404764, "y2007": 0.90543476309316473, "y2004": 0.94338264626469681, "y2005": 0.91981795862151561, "y2002": 0.93695966482853577, "y2003": 0.94242697007039, "y2000": 0.90906631602055099, "y2001": 0.92693339421265908, "id": 16, "y2008": 0.91737137682250491, "y2009": 0.94793657442067902}, {"neighbors": [13, 18, 11, 19, 8], "y1995": 1.1977611005602815, "y1997": 1.1843915817489725, "y1996": 1.1822256425225894, "y1999": 1.1928672308275252, "y1998": 1.1826786457339149, "y2006": 1.2392938410349985, "y2007": 1.2341867605077472, "y2004": 1.2385704217423759, "y2005": 1.2441989281116201, "y2002": 1.2262477774195681, "y2003": 1.2239707531714479, "y2000": 1.2017286912636342, "y2001": 1.2132869128474402, "id": 17, "y2008": 1.2362673914436095, "y2009": 1.2675439750795283}, {"neighbors": [13, 17, 11, 8, 19], "y1995": 1.2491967813733067, "y1997": 1.2699116090397236, "y1996": 1.2575477330927329, "y1999": 1.3062566740535762, "y1998": 1.2802065055312271, "y2006": 1.3210776560048689, "y2007": 1.329362443219563, "y2004": 1.3054484140490119, "y2005": 1.3030330249408666, "y2002": 1.3257518058685978, "y2003": 1.3079549159235695, "y2000": 1.3479002255103918, "y2001": 1.3439986302151703, "id": 18, "y2008": 1.3300124123891741, "y2009": 1.3328846185074705}, {"neighbors": [26, 17, 28, 15, 16], "y1995": 1.0676800411188558, "y1997": 1.0363730321443168, "y1996": 1.0379927554499979, "y1999": 1.0329609259280523, "y1998": 1.027684488045026, "y2006": 0.94241549375546196, "y2007": 0.92754546923532677, "y2004": 0.99614160423102482, "y2005": 
0.97356208269708677, "y2002": 1.0274762326434594, "y2003": 1.0316273366809443, "y2000": 1.0505901631347052, "y2001": 1.0340505678899605, "id": 19, "y2008": 0.92549226593721745, "y2009": 0.92138101880290568}, {"neighbors": [30, 25, 24, 37, 47], "y1995": 1.0947561397632881, "y1997": 1.1165429913770684, "y1996": 1.1152679554712275, "y1999": 1.1314326394231322, "y1998": 1.1310394841195361, "y2006": 1.1090538904302065, "y2007": 1.1057776900012568, "y2004": 1.1402994437897009, "y2005": 1.1197940058085571, "y2002": 1.133670175399079, "y2003": 1.139822558851451, "y2000": 1.1388962186541665, "y2001": 1.1244221220249986, "id": 20, "y2008": 1.1116682481010467, "y2009": 1.0998515545336902}, {"neighbors": [23, 22, 7, 10, 34], "y1995": 0.76530058421804126, "y1997": 0.76542450966153397, "y1996": 0.76612841163904621, "y1999": 0.76014283909933289, "y1998": 0.7672268310234307, "y2006": 0.76842416021983684, "y2007": 0.77487117798086069, "y2004": 0.76533287692895391, "y2005": 0.78205934309410463, "y2002": 0.76156903267949927, "y2003": 0.76651951668098528, "y2000": 0.74480073263159763, "y2001": 0.76098396210261965, "id": 21, "y2008": 0.77768682781054099, "y2009": 0.78801192267396702}, {"neighbors": [21, 34, 5, 7, 29], "y1995": 0.98391336093764348, "y1997": 0.98295341320156315, "y1996": 0.98075815675295552, "y1999": 0.96913802803963667, "y1998": 0.97386015032669815, "y2006": 0.93965462091114671, "y2007": 0.93069644684632924, "y2004": 0.9635616201227476, "y2005": 0.94745351657235244, "y2002": 0.97209860866113018, "y2003": 0.97441312580606143, "y2000": 0.97370819354423843, "y2001": 0.96419154157867693, "id": 22, "y2008": 0.94020973488297466, "y2009": 0.94358232339833159}, {"neighbors": [21, 10, 22, 34, 7], "y1995": 0.83561828119099946, "y1997": 0.81738501913392403, "y1996": 0.82298088022609361, "y1999": 0.80904800725677739, "y1998": 0.81748588141426259, "y2006": 0.87170334233473346, "y2007": 0.8786379876833581, "y2004": 0.85954307066870839, "y2005": 0.86790023653402792, "y2002": 
0.83451612857812574, "y2003": 0.85175031934895873, "y2000": 0.80071489233375537, "y2001": 0.83358255807316928, "id": 23, "y2008": 0.87497981001981484, "y2009": 0.87888675419592222}, {"neighbors": [27, 20, 30, 32, 47], "y1995": 0.98845573274970278, "y1997": 0.99665282989553183, "y1996": 1.0209242772035507, "y1999": 0.99386618594343845, "y1998": 0.99141823200404444, "y2006": 0.97906748937234156, "y2007": 0.9932312332800689, "y2004": 1.0111665058188304, "y2005": 0.9998802359352077, "y2002": 0.99669586934394627, "y2003": 1.0255909749831356, "y2000": 0.98733194819247994, "y2001": 0.99644997431653437, "id": 24, "y2008": 1.0020493856497013, "y2009": 0.99602148231561483}, {"neighbors": [20, 33, 6, 30, 12], "y1995": 1.1493091345649815, "y1997": 1.143009615936718, "y1996": 1.1524194939429724, "y1999": 1.1398468268822266, "y1998": 1.1426554202510555, "y2006": 1.0889107875354573, "y2007": 1.0860369499254896, "y2004": 1.0856975145267398, "y2005": 1.1244348633192611, "y2002": 1.0423089214343333, "y2003": 1.0557727834721793, "y2000": 1.0831239730629278, "y2001": 1.0519262599166714, "id": 25, "y2008": 1.0599731384290745, "y2009": 1.0216094265950888}, {"neighbors": [28, 19, 16, 32, 17], "y1995": 1.1136826889802023, "y1997": 1.1189343096757198, "y1996": 1.1057147027213501, "y1999": 1.1432271991365353, "y1998": 1.1377866945457653, "y2006": 1.1268023587150906, "y2007": 1.1235793669317915, "y2004": 1.1482023546040769, "y2005": 1.1238659840114973, "y2002": 1.1600919581655105, "y2003": 1.1446778932605579, "y2000": 1.1825702862895446, "y2001": 1.1622624279436105, "id": 26, "y2008": 1.115925801617498, "y2009": 1.1257082797404696}, {"neighbors": [32, 24, 36, 16, 28], "y1995": 1.303794309231981, "y1997": 1.3120636604057812, "y1996": 1.3075218596998686, "y1999": 1.3062566740535762, "y1998": 1.3153226688859194, "y2006": 1.2865667454509278, "y2007": 1.2973409698906584, "y2004": 1.2683078569016086, "y2005": 1.2617743046198988, "y2002": 1.2920319347677043, "y2003": 1.2718351646774422, "y2000": 
1.3121023910310281, "y2001": 1.2998915587009874, "id": 27, "y2008": 1.2939020510829768, "y2009": 1.2934544564717687}, {"neighbors": [26, 16, 19, 32, 27], "y1995": 0.83953719020532513, "y1997": 0.82006005316292385, "y1996": 0.82701447583159737, "y1999": 0.80294863992835086, "y1998": 0.8118887636743225, "y2006": 0.8389109342655191, "y2007": 0.84349246817602375, "y2004": 0.83108634437662732, "y2005": 0.84373783646216949, "y2002": 0.82596790474192727, "y2003": 0.82435704751379402, "y2000": 0.78772975118465016, "y2001": 0.82848010958278628, "id": 28, "y2008": 0.85637272428125033, "y2009": 0.86539395164519117}, {"neighbors": [5, 39, 22, 14, 31], "y1995": 1.2345008725695852, "y1997": 1.2353793515744536, "y1996": 1.2426021999018138, "y1999": 1.2452262575926329, "y1998": 1.2358129278404693, "y2006": 1.2365329681906834, "y2007": 1.2796200872578414, "y2004": 1.1967443443492951, "y2005": 1.2153657295128597, "y2002": 1.1937780418204111, "y2003": 1.1835533748469893, "y2000": 1.2256766974812463, "y2001": 1.2112664802237314, "id": 29, "y2008": 1.2796839248335934, "y2009": 1.2590773758694083}, {"neighbors": [37, 20, 24, 25, 27], "y1995": 0.97696620404861145, "y1997": 0.98035944080980575, "y1996": 0.9740071914763756, "y1999": 0.95543282313901556, "y1998": 0.97581530789338955, "y2006": 0.92100464312607799, "y2007": 0.9147530387633086, "y2004": 0.9298883479571457, "y2005": 0.93442917452618346, "y2002": 0.93679072759857129, "y2003": 0.92540049332494034, "y2000": 0.96480308308405971, "y2001": 0.9468637634838194, "id": 30, "y2008": 0.90249622070947177, "y2009": 0.90213630440783921}, {"neighbors": [35, 14, 33, 12, 4], "y1995": 0.84986885942491119, "y1997": 0.84295996568390696, "y1996": 0.89868510090623221, "y1999": 0.85659367787716301, "y1998": 0.87280533962476625, "y2006": 0.92562487931452408, "y2007": 0.96635366357254426, "y2004": 0.92698332540482575, "y2005": 0.94745351657235244, "y2002": 0.90448992922937876, "y2003": 0.95495898185605821, "y2000": 0.88937573313051443, "y2001": 
0.89440100450887505, "id": 31, "y2008": 1.025203118044723, "y2009": 1.0394296020754366}, {"neighbors": [36, 27, 28, 16, 26], "y1995": 1.0192280751235561, "y1997": 1.0097442843101825, "y1996": 1.0025820319237864, "y1999": 0.99765073314119712, "y1998": 1.0030341681355639, "y2006": 0.94779637858468868, "y2007": 0.93759089358493275, "y2004": 0.97583768316642261, "y2005": 0.96101679691008712, "y2002": 0.99747298060178258, "y2003": 0.99550758543481688, "y2000": 1.0075901875261932, "y2001": 0.99192968437874551, "id": 32, "y2008": 0.93353431146829191, "y2009": 0.94121705123804411}, {"neighbors": [44, 25, 12, 35, 31], "y1995": 0.86367410708901315, "y1997": 0.85544345781923936, "y1996": 0.85558931627900803, "y1999": 0.84336613427334628, "y1998": 0.85103025143102673, "y2006": 0.89455097373003656, "y2007": 0.88283929116469462, "y2004": 0.85951183386707053, "y2005": 0.87194227372077004, "y2002": 0.84667960913556228, "y2003": 0.84374557883664714, "y2000": 0.83434853662160158, "y2001": 0.85813595114434105, "id": 33, "y2008": 0.90349490610221961, "y2009": 0.9060067497610369}, {"neighbors": [22, 39, 21, 29, 23], "y1995": 1.0094753356447226, "y1997": 1.0069881886439402, "y1996": 1.0041105523637666, "y1999": 0.99291086334982948, "y1998": 0.99513686502304577, "y2006": 0.96382634438484593, "y2007": 0.95011400973122428, "y2004": 0.975119236728752, "y2005": 0.96134614808826613, "y2002": 0.99291167539274383, "y2003": 0.98983209318633369, "y2000": 1.0058162611397035, "y2001": 0.98850522230466298, "id": 34, "y2008": 0.94346860300667812, "y2009": 0.9463776450423077}, {"neighbors": [31, 38, 44, 33, 14], "y1995": 1.0571257066143651, "y1997": 1.0575301194645879, "y1996": 1.0545941857842291, "y1999": 1.0510385688532684, "y1998": 1.0488078570498685, "y2006": 1.0247627521629479, "y2007": 1.0234752320591773, "y2004": 1.0329697933620496, "y2005": 1.0219168238570018, "y2002": 1.0420048344203974, "y2003": 1.0402553971511816, "y2000": 1.0480002306104303, "y2001": 1.030249414987729, "id": 35, "y2008": 
1.0251768368501768, "y2009": 1.0435957064486703}, {"neighbors": [32, 43, 27, 28, 42], "y1995": 1.070841888164505, "y1997": 1.0793762307014196, "y1996": 1.0666949726007404, "y1999": 1.0794043012481198, "y1998": 1.0738798776109699, "y2006": 1.087727556316465, "y2007": 1.0885954360198933, "y2004": 1.1032213602455734, "y2005": 1.0916793915985508, "y2002": 1.0938347765734742, "y2003": 1.1052447043433509, "y2000": 1.0531800956589803, "y2001": 1.0745277096056161, "id": 36, "y2008": 1.0917733838297285, "y2009": 1.1096083021948762}, {"neighbors": [30, 40, 20, 42, 41], "y1995": 0.8671922185905101, "y1997": 0.86675155621455668, "y1996": 0.86628895935887062, "y1999": 0.86511809486628932, "y1998": 0.86425631732335095, "y2006": 0.84488343470424199, "y2007": 0.83374328958471722, "y2004": 0.84517414191529749, "y2005": 0.84843857600526962, "y2002": 0.85411284725399572, "y2003": 0.84886336375435456, "y2000": 0.86287327291635718, "y2001": 0.8516979624450659, "id": 37, "y2008": 0.82812044014430564, "y2009": 0.82878598934619596}, {"neighbors": [35, 31, 45, 39, 44], "y1995": 0.8838921149583755, "y1997": 0.90282398478743275, "y1996": 0.92288667453925455, "y1999": 0.92023285988219217, "y1998": 0.91229185518735723, "y2006": 0.93869676706720051, "y2007": 0.96947770975097391, "y2004": 0.99223700402629367, "y2005": 0.97984969609868555, "y2002": 0.93682451504456421, "y2003": 0.98655146182882891, "y2000": 0.92652175166361039, "y2001": 0.94278865361566122, "id": 38, "y2008": 1.0036262573224608, "y2009": 0.98102350657197357}, {"neighbors": [29, 34, 38, 22, 35], "y1995": 0.970820642185237, "y1997": 0.94534081352108112, "y1996": 0.95320232993219844, "y1999": 0.93967000034446724, "y1998": 0.94215592860799646, "y2006": 0.91035556215514757, "y2007": 0.90430364292511256, "y2004": 0.92879505989982103, "y2005": 0.9211054223180335, "y2002": 0.93412151936513388, "y2003": 0.93501274320242933, "y2000": 0.93092108910210503, "y2001": 0.92662519262599163, "id": 39, "y2008": 0.89994694483851023, "y2009": 
0.9007386435858511}, {"neighbors": [41, 37, 42, 30, 45], "y1995": 0.95861858457245008, "y1997": 0.98254810501535106, "y1996": 0.95774543235102894, "y1999": 0.98684823919808018, "y1998": 0.98919471947721893, "y2006": 0.97163003599581876, "y2007": 0.97007020126757271, "y2004": 0.9493488753775261, "y2005": 0.97152609359561659, "y2002": 0.95601578436851964, "y2003": 0.94905384541254967, "y2000": 0.98882204635713133, "y2001": 0.97662233890759653, "id": 40, "y2008": 0.97158948117089283, "y2009": 0.95884908006927827}, {"neighbors": [40, 45, 44, 37, 42], "y1995": 0.83980438854721107, "y1997": 0.85746999875029983, "y1996": 0.84726737166133714, "y1999": 0.85567509846023126, "y1998": 0.85467221160427542, "y2006": 0.8333891885768886, "y2007": 0.83511679264592342, "y2004": 0.81743586206088703, "y2005": 0.83550405700769481, "y2002": 0.84502402428191115, "y2003": 0.82645665158259707, "y2000": 0.84818516243622177, "y2001": 0.85265681182580899, "id": 41, "y2008": 0.82136617314598481, "y2009": 0.80921873783836296}, {"neighbors": [43, 40, 46, 37, 36], "y1995": 0.95118156405662746, "y1997": 0.94688098462868708, "y1996": 0.9466212002600608, "y1999": 0.95124410099780687, "y1998": 0.95085829660091703, "y2006": 0.96895367966714574, "y2007": 0.9700163384024274, "y2004": 0.97583768316642261, "y2005": 0.95571723704302525, "y2002": 0.96804411514198463, "y2003": 0.97136213864358201, "y2000": 0.95440787445922959, "y2001": 0.96364362764682376, "id": 42, "y2008": 0.97082732652905901, "y2009": 0.9878236640328002}, {"neighbors": [36, 42, 32, 27, 46], "y1995": 1.0891004415267045, "y1997": 1.0849289528525252, "y1996": 1.0824896838138709, "y1999": 1.0945424900391545, "y1998": 1.0865692335830259, "y2006": 1.1450297539219478, "y2007": 1.1447474729339102, "y2004": 1.1334273474293739, "y2005": 1.1468606844516303, "y2002": 1.1229257675733433, "y2003": 1.1302103089739621, "y2000": 1.1055818811158884, "y2001": 1.1214085953998059, "id": 43, "y2008": 1.1408403740471014, "y2009": 1.1614292649793569}, 
{"neighbors": [33, 41, 45, 35, 40], "y1995": 1.0633603345917013, "y1997": 1.0869149629649646, "y1996": 1.0736582323828732, "y1999": 1.1166986255755473, "y1998": 1.0976484597942771, "y2006": 1.0839806574563229, "y2007": 1.0983176831786272, "y2004": 1.0927882684985315, "y2005": 1.0700320368873319, "y2002": 1.0881584856466706, "y2003": 1.0804431312806149, "y2000": 1.1185670222649935, "y2001": 1.0976428286056732, "id": 44, "y2008": 1.0929823187788443, "y2009": 1.0917612486217978}, {"neighbors": [41, 44, 40, 35, 33], "y1995": 0.79772064970019041, "y1997": 0.7858115114280021, "y1996": 0.78829195801876151, "y1999": 0.77035744221561353, "y1998": 0.77615921755360906, "y2006": 0.79949806580432425, "y2007": 0.80172181625581262, "y2004": 0.79603865293896003, "y2005": 0.78966436120841943, "y2002": 0.81437881076636964, "y2003": 0.80788827809912023, "y2000": 0.77751193519846906, "y2001": 0.79902973574567659, "id": 45, "y2008": 0.82168154748053679, "y2009": 0.85587910681858015}, {"neighbors": [42, 43, 40, 36, 37], "y1995": 1.0052446952315301, "y1997": 1.0047589936197736, "y1996": 1.0000769567582628, "y1999": 1.0063956091903872, "y1998": 1.0061394183885444, "y2006": 0.97292595590233411, "y2007": 0.96519561197191939, "y2004": 0.99030032232474696, "y2005": 0.97682565346267858, "y2002": 1.0081498135355325, "y2003": 1.0057431552702318, "y2000": 1.0016297948675874, "y2001": 0.99860738542320637, "id": 46, "y2008": 0.9617340332161447, "y2009": 0.95890283625473927}, {"neighbors": [20, 6, 24, 25, 30], "y1995": 0.95808418788867844, "y1997": 0.9654440995572009, "y1996": 0.93825679674127938, "y1999": 0.96987289157318213, "y1998": 0.95561201303757848, "y2006": 1.1704973973021624, "y2007": 1.1702515395802287, "y2004": 1.0533361880299275, "y2005": 1.0983262971945267, "y2002": 1.0078119390756035, "y2003": 1.0348423554112989, "y2000": 0.96608031008233231, "y2001": 0.99727184521431422, "id": 47, "y2008": 1.1873055260044207, "y2009": 1.1424264534188653}] diff --git 
a/release/python/0.7.0/crankshaft/test/helper.py b/release/python/0.7.0/crankshaft/test/helper.py new file mode 100644 index 0000000..7d28b94 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/helper.py @@ -0,0 +1,13 @@ +import unittest + +from mock_plpy import MockPlPy +plpy = MockPlPy() + +import sys +sys.modules['plpy'] = plpy + +import os + +def fixture_file(name): + dir = os.path.dirname(os.path.realpath(__file__)) + return os.path.join(dir, 'fixtures', name) diff --git a/release/python/0.7.0/crankshaft/test/mock_plpy.py b/release/python/0.7.0/crankshaft/test/mock_plpy.py new file mode 100644 index 0000000..9c3340c --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/mock_plpy.py @@ -0,0 +1,57 @@ +import re + + +class MockCursor: + def __init__(self, data): + self.cursor_pos = 0 + self.data = data + + def fetch(self, batch_size): + batch = self.data[self.cursor_pos:self.cursor_pos + batch_size] + self.cursor_pos += batch_size + return batch + + +class MockPlPy: + def __init__(self): + self._reset() + + def _reset(self): + self.infos = [] + self.notices = [] + self.debugs = [] + self.logs = [] + self.warnings = [] + self.errors = [] + self.fatals = [] + self.executes = [] + self.results = [] + self.prepares = [] + self.results = [] + + def _define_result(self, query, result): + pattern = re.compile(query, re.IGNORECASE | re.MULTILINE) + self.results.append([pattern, result]) + + def notice(self, msg): + self.notices.append(msg) + + def debug(self, msg): + self.notices.append(msg) + + def info(self, msg): + self.infos.append(msg) + + def error(self, msg): + self.notices.append(msg) + + def cursor(self, query): + data = self.execute(query) + return MockCursor(data) + + # TODO: additional arguments + def execute(self, query): + for result in self.results: + if result[0].match(query): + return result[1] + return [] diff --git a/release/python/0.7.0/crankshaft/test/test_clustering_getis.py b/release/python/0.7.0/crankshaft/test/test_clustering_getis.py new 
file mode 100644 index 0000000..61add11 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/test_clustering_getis.py @@ -0,0 +1,78 @@ +import unittest +import numpy as np + +from helper import fixture_file + +from crankshaft.clustering import Getis +import crankshaft.pysal_utils as pu +from crankshaft import random_seeds +import json +from crankshaft.analysis_data_provider import AnalysisDataProvider + +# Fixture files produced as follows +# +# import pysal as ps +# import numpy as np +# import random +# +# # setup variables +# f = ps.open(ps.examples.get_path("stl_hom.dbf")) +# y = np.array(f.by_col['HR8893']) +# w_queen = ps.queen_from_shapefile(ps.examples.get_path("stl_hom.shp")) +# +# out_queen = [{"id": index + 1, +# "neighbors": [x+1 for x in w_queen.neighbors[index]], +# "value": val} for index, val in enumerate(y)] +# +# with open('neighbors_queen_getis.json', 'w') as f: +# f.write(str(out_queen)) +# +# random.seed(1234) +# np.random.seed(1234) +# lgstar_queen = ps.esda.getisord.G_Local(y, w_queen, star=True, +# permutations=999) +# +# with open('getis_queen.json', 'w') as f: +# f.write(str(zip(lgstar_queen.z_sim, +# lgstar_queen.p_sim, lgstar_queen.p_z_sim))) + + +class FakeDataProvider(AnalysisDataProvider): + def __init__(self, mock_data): + self.mock_result = mock_data + + def get_getis(self, w_type, param): + return self.mock_result + + +class GetisTest(unittest.TestCase): + """Testing class for Getis-Ord's G* funtion + This test replicates the work done in PySAL documentation: + https://pysal.readthedocs.io/en/v1.11.0/users/tutorials/autocorrelation.html#local-g-and-g + """ + + def setUp(self): + # load raw data for analysis + self.neighbors_data = json.loads( + open(fixture_file('neighbors_getis.json')).read()) + + # load pre-computed/known values + self.getis_data = json.loads( + open(fixture_file('getis.json')).read()) + + def test_getis_ord(self): + """Test Getis-Ord's G*""" + data = [{'id': d['id'], + 'attr1': d['value'], + 'neighbors': 
d['neighbors']} for d in self.neighbors_data] + + random_seeds.set_random_seeds(1234) + getis = Getis(FakeDataProvider(data)) + + result = getis.getis_ord('subquery', 'value', + 'queen', None, 999, 'the_geom', + 'cartodb_id') + result = [(row[0], row[1]) for row in result] + expected = np.array(self.getis_data)[:, 0:2] + for ([res_z, res_p], [exp_z, exp_p]) in zip(result, expected): + self.assertAlmostEqual(res_z, exp_z, delta=1e-2) diff --git a/release/python/0.7.0/crankshaft/test/test_clustering_kmeans.py b/release/python/0.7.0/crankshaft/test/test_clustering_kmeans.py new file mode 100644 index 0000000..c118d34 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/test_clustering_kmeans.py @@ -0,0 +1,87 @@ +import unittest +import numpy as np + + +from helper import fixture_file +from crankshaft.clustering import Kmeans +from crankshaft.analysis_data_provider import AnalysisDataProvider +import crankshaft.clustering as cc +from crankshaft import random_seeds + +import json +from collections import OrderedDict + + +class FakeDataProvider(AnalysisDataProvider): + def __init__(self, mocked_result): + self.mocked_result = mocked_result + + def get_spatial_kmeans(self, query): + return self.mocked_result + + def get_nonspatial_kmeans(self, query): + return self.mocked_result + + +class KMeansTest(unittest.TestCase): + """Testing class for k-means spatial""" + + def setUp(self): + self.cluster_data = json.loads( + open(fixture_file('kmeans.json')).read()) + self.params = {"subquery": "select * from table", + "no_clusters": "10"} + + def test_kmeans(self): + """ + """ + data = [{'xs': d['xs'], + 'ys': d['ys'], + 'ids': d['ids']} for d in self.cluster_data] + + random_seeds.set_random_seeds(1234) + kmeans = Kmeans(FakeDataProvider(data)) + clusters = kmeans.spatial('subquery', 2) + labels = [a[1] for a in clusters] + c1 = [a for a in clusters if a[1] == 0] + c2 = [a for a in clusters if a[1] == 1] + + self.assertEqual(len(np.unique(labels)), 2) + 
self.assertEqual(len(c1), 20) + self.assertEqual(len(c2), 20) + + +class KMeansNonspatialTest(unittest.TestCase): + """Testing class for k-means non-spatial""" + + def setUp(self): + self.params = {"subquery": "SELECT * FROM TABLE", + "n_clusters": 5} + + def test_kmeans_nonspatial(self): + """ + test for k-means non-spatial + """ + # data from: + # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn-cluster-kmeans + data_raw = [OrderedDict([("arr_col1", [1, 1, 1, 4, 4, 4]), + ("arr_col2", [2, 4, 0, 2, 4, 0]), + ("rowid", [1, 2, 3, 4, 5, 6])])] + + random_seeds.set_random_seeds(1234) + kmeans = Kmeans(FakeDataProvider(data_raw)) + clusters = kmeans.nonspatial('subquery', ['col1', 'col2'], 2) + + cl1 = clusters[0][0] + cl2 = clusters[3][0] + + for idx, val in enumerate(clusters): + if idx < 3: + self.assertEqual(val[0], cl1) + else: + self.assertEqual(val[0], cl2) + + # raises exception for no data + with self.assertRaises(Exception): + kmeans = Kmeans(FakeDataProvider([])) + kmeans.nonspatial('subquery', ['col1', 'col2'], 2) diff --git a/release/python/0.7.0/crankshaft/test/test_clustering_moran.py b/release/python/0.7.0/crankshaft/test/test_clustering_moran.py new file mode 100644 index 0000000..cc1930e --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/test_clustering_moran.py @@ -0,0 +1,112 @@ +import unittest +import numpy as np + +from helper import fixture_file +from crankshaft.clustering import Moran +from crankshaft.analysis_data_provider import AnalysisDataProvider +import crankshaft.pysal_utils as pu +from crankshaft import random_seeds +import json +from collections import OrderedDict + + +class FakeDataProvider(AnalysisDataProvider): + def __init__(self, mock_data): + self.mock_result = mock_data + + def get_moran(self, w_type, params): + return self.mock_result + + +class MoranTest(unittest.TestCase): + """Testing class for Moran's I functions""" + + def setUp(self): + self.params = {"id_col": "cartodb_id", + 
"attr1": "andy", + "attr2": "jay_z", + "subquery": "SELECT * FROM a_list", + "geom_col": "the_geom", + "num_ngbrs": 321} + self.params_markov = {"id_col": "cartodb_id", + "time_cols": ["_2013_dec", "_2014_jan", + "_2014_feb"], + "subquery": "SELECT * FROM a_list", + "geom_col": "the_geom", + "num_ngbrs": 321} + self.neighbors_data = json.loads( + open(fixture_file('neighbors.json')).read()) + self.moran_data = json.loads( + open(fixture_file('moran.json')).read()) + + def test_map_quads(self): + """Test map_quads""" + from crankshaft.clustering import map_quads + self.assertEqual(map_quads(1), 'HH') + self.assertEqual(map_quads(2), 'LH') + self.assertEqual(map_quads(3), 'LL') + self.assertEqual(map_quads(4), 'HL') + self.assertEqual(map_quads(33), None) + self.assertEqual(map_quads('andy'), None) + + def test_quad_position(self): + """Test lisa_sig_vals""" + from crankshaft.clustering import quad_position + + quads = np.array([1, 2, 3, 4], np.int) + + ans = np.array(['HH', 'LH', 'LL', 'HL']) + test_ans = quad_position(quads) + + self.assertTrue((test_ans == ans).all()) + + def test_local_stat(self): + """Test Moran's I local""" + data = [OrderedDict([('id', d['id']), + ('attr1', d['value']), + ('neighbors', d['neighbors'])]) + for d in self.neighbors_data] + + moran = Moran(FakeDataProvider(data)) + random_seeds.set_random_seeds(1234) + result = moran.local_stat('subquery', 'value', + 'knn', 5, 99, 'the_geom', 'cartodb_id') + result = [(row[0], row[1]) for row in result] + zipped_values = zip(result, self.moran_data) + + for ([res_val, res_quad], [exp_val, exp_quad]) in zipped_values: + self.assertAlmostEqual(res_val, exp_val) + self.assertEqual(res_quad, exp_quad) + + def test_moran_local_rate(self): + """Test Moran's I rate""" + data = [{'id': d['id'], + 'attr1': d['value'], + 'attr2': 1, + 'neighbors': d['neighbors']} for d in self.neighbors_data] + + random_seeds.set_random_seeds(1234) + moran = Moran(FakeDataProvider(data)) + result = 
moran.local_rate_stat('subquery', 'numerator', 'denominator', + 'knn', 5, 99, 'the_geom', 'cartodb_id') + result = [(row[0], row[1]) for row in result] + + zipped_values = zip(result, self.moran_data) + + for ([res_val, res_quad], [exp_val, exp_quad]) in zipped_values: + self.assertAlmostEqual(res_val, exp_val) + + def test_moran(self): + """Test Moran's I global""" + data = [{'id': d['id'], + 'attr1': d['value'], + 'neighbors': d['neighbors']} for d in self.neighbors_data] + random_seeds.set_random_seeds(1235) + moran = Moran(FakeDataProvider(data)) + result = moran.global_stat('table', 'value', + 'knn', 5, 99, 'the_geom', + 'cartodb_id') + + result_moran = result[0][0] + expected_moran = np.array([row[0] for row in self.moran_data]).mean() + self.assertAlmostEqual(expected_moran, result_moran, delta=10e-2) diff --git a/release/python/0.7.0/crankshaft/test/test_pysal_utils.py b/release/python/0.7.0/crankshaft/test/test_pysal_utils.py new file mode 100644 index 0000000..be45164 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/test_pysal_utils.py @@ -0,0 +1,83 @@ +import unittest + +import crankshaft.pysal_utils as pu +from crankshaft import random_seeds +from collections import OrderedDict + + +class PysalUtilsTest(unittest.TestCase): + """Testing class for utility functions related to PySAL integrations""" + + def setUp(self): + self.params1 = OrderedDict([("id_col", "cartodb_id"), + ("attr1", "andy"), + ("attr2", "jay_z"), + ("subquery", "SELECT * FROM a_list"), + ("geom_col", "the_geom"), + ("num_ngbrs", 321)]) + + self.params2 = OrderedDict([("id_col", "cartodb_id"), + ("numerator", "price"), + ("denominator", "sq_meters"), + ("subquery", "SELECT * FROM pecan"), + ("geom_col", "the_geom"), + ("num_ngbrs", 321)]) + + self.params3 = OrderedDict([("id_col", "cartodb_id"), + ("numerator", "sq_meters"), + ("denominator", "price"), + ("subquery", "SELECT * FROM pecan"), + ("geom_col", "the_geom"), + ("num_ngbrs", 321)]) + + self.params_array = {"id_col": 
"cartodb_id", + "time_cols": ["_2013_dec", "_2014_jan", "_2014_feb"], + "subquery": "SELECT * FROM a_list", + "geom_col": "the_geom", + "num_ngbrs": 321} + + def test_query_attr_select(self): + """Test query_attr_select""" + + ans1 = ("i.\"andy\"::numeric As attr1, " + "i.\"jay_z\"::numeric As attr2, ") + + ans2 = ("i.\"price\"::numeric As attr1, " + "i.\"sq_meters\"::numeric As attr2, ") + + ans3 = ("i.\"sq_meters\"::numeric As attr1, " + "i.\"price\"::numeric As attr2, ") + + ans_array = ("i.\"_2013_dec\"::numeric As attr1, " + "i.\"_2014_jan\"::numeric As attr2, " + "i.\"_2014_feb\"::numeric As attr3, ") + + self.assertEqual(pu.query_attr_select(self.params1), ans1) + self.assertEqual(pu.query_attr_select(self.params2), ans2) + self.assertEqual(pu.query_attr_select(self.params3), ans3) + self.assertEqual(pu.query_attr_select(self.params_array), ans_array) + + def test_query_attr_where(self): + """Test pu.query_attr_where""" + + ans1 = ("idx_replace.\"andy\" IS NOT NULL AND " + "idx_replace.\"jay_z\" IS NOT NULL") + + ans_array = ("idx_replace.\"_2013_dec\" IS NOT NULL AND " + "idx_replace.\"_2014_jan\" IS NOT NULL AND " + "idx_replace.\"_2014_feb\" IS NOT NULL") + + self.assertEqual(pu.query_attr_where(self.params1), ans1) + self.assertEqual(pu.query_attr_where(self.params_array), ans_array) + + def test_get_attributes(self): + """Test get_attributes""" + + # need to add tests + + self.assertEqual(True, True) + + def test_get_weight(self): + """Test get_weight""" + + self.assertEqual(True, True) diff --git a/release/python/0.7.0/crankshaft/test/test_regression_gwr.py b/release/python/0.7.0/crankshaft/test/test_regression_gwr.py new file mode 100644 index 0000000..57cd952 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/test_regression_gwr.py @@ -0,0 +1,130 @@ +import unittest +import json +import numpy as np + + +from crankshaft import random_seeds +from helper import fixture_file +from crankshaft.regression import GWR +from 
crankshaft.analysis_data_provider import AnalysisDataProvider + + +class FakeDataProvider(AnalysisDataProvider): + def __init__(self, mocked_result): + self.mocked_result = mocked_result + + def get_gwr(self, params): + return self.mocked_result + + def get_gwr_predict(self, params): + return self.mocked_result + + +class GWRTest(unittest.TestCase): + """Testing class for geographically weighted regression (gwr)""" + + def setUp(self): + """ + fixture packed from canonical GWR georgia dataset using the + following query: + SELECT array_agg(x) As x, + array_agg(y) As y, + array_agg(pctbach) As dep_var, + array_agg(pctrural) As attr1, + array_agg(pctpov) As attr2, + array_agg(pctblack) As attr3, + array_agg(areakey) As rowid + FROM g_utm + WHERE pctbach is not NULL AND + pctrural IS NOT NULL AND + pctpov IS NOT NULL AND + pctblack IS NOT NULL + """ + import copy + # data packed from https://github.com/TaylorOshan/pysal/blob/1d6af33bda46b1d623f70912c56155064463383f/pysal/examples/georgia/GData_utm.csv + self.data = json.loads( + open(fixture_file('gwr_packed_data.json')).read()) + + # data packed from https://github.com/TaylorOshan/pysal/blob/a44c5541e2e0d10a99ff05edc1b7f81b70f5a82f/pysal/examples/georgia/georgia_BS_NN_listwise.csv + self.knowns = json.loads( + open(fixture_file('gwr_packed_knowns.json')).read()) + + # data for GWR prediction + self.data_predict = copy.deepcopy(self.data) + self.ids_of_unknowns = [13083, 13009, 13281, 13115, 13247, 13169] + self.idx_ids_of_unknowns = [self.data_predict[0]['rowid'].index(idx) + for idx in self.ids_of_unknowns] + + for idx in self.idx_ids_of_unknowns: + self.data_predict[0]['dep_var'][idx] = None + + self.predicted_knowns = {13009: 10.879, + 13083: 4.5259, + 13115: 9.4022, + 13169: 6.0793, + 13247: 8.1608, + 13281: 13.886} + + # params, with ind_vars in same ordering as query above + self.params = {'subquery': 'select * from table', + 'dep_var': 'pctbach', + 'ind_vars': ['pctrural', 'pctpov', 'pctblack'], + 'bw': 
90.000, + 'fixed': False, + 'geom_col': 'the_geom', + 'id_col': 'areakey'} + + def test_gwr(self): + """ + """ + gwr = GWR(FakeDataProvider(self.data)) + gwr_resp = gwr.gwr(self.params['subquery'], + self.params['dep_var'], + self.params['ind_vars'], + bw=self.params['bw'], + fixed=self.params['fixed']) + + # unpack response + coeffs, stand_errs, t_vals, t_vals_filtered, predicteds, \ + residuals, r_squareds, bws, rowids = zip(*gwr_resp) + + # prepare for comparision + coeff_known_pctpov = self.knowns['est_pctpov'] + tval_known_pctblack = self.knowns['t_pctrural'] + pctpov_se = self.knowns['se_pctpov'] + ids = self.knowns['area_key'] + resp_idx = None + + # test pctpov coefficient estimates + for idx, val in enumerate(coeff_known_pctpov): + resp_idx = rowids.index(ids[idx]) + self.assertAlmostEquals(val, + json.loads(coeffs[resp_idx])['pctpov'], + places=4) + # test pctrural tvals + for idx, val in enumerate(tval_known_pctblack): + resp_idx = rowids.index(ids[idx]) + self.assertAlmostEquals(val, + json.loads(t_vals[resp_idx])['pctrural'], + places=4) + + def test_gwr_predict(self): + """Testing for GWR_Predict""" + gwr = GWR(FakeDataProvider(self.data_predict)) + gwr_resp = gwr.gwr_predict(self.params['subquery'], + self.params['dep_var'], + self.params['ind_vars'], + bw=self.params['bw'], + fixed=self.params['fixed']) + + # unpack response + coeffs, stand_errs, t_vals, \ + r_squareds, predicteds, rowid = zip(*gwr_resp) + threshold = 0.01 + + for i, idx in enumerate(self.idx_ids_of_unknowns): + + known_val = self.predicted_knowns[rowid[i]] + predicted_val = predicteds[i] + test_val = abs(known_val - predicted_val) / known_val + self.assertTrue(test_val < threshold) diff --git a/release/python/0.7.0/crankshaft/test/test_segmentation.py b/release/python/0.7.0/crankshaft/test/test_segmentation.py new file mode 100644 index 0000000..d02e8b1 --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/test_segmentation.py @@ -0,0 +1,64 @@ +import unittest +import numpy as 
np +from helper import plpy, fixture_file +import crankshaft.segmentation as segmentation +import json + +class SegmentationTest(unittest.TestCase): + """Testing class for Moran's I functions""" + + def setUp(self): + plpy._reset() + + def generate_random_data(self,n_samples,random_state, row_type=False): + x1 = random_state.uniform(size=n_samples) + x2 = random_state.uniform(size=n_samples) + x3 = random_state.randint(0, 4, size=n_samples) + + y = x1+x2*x2+x3 + cartodb_id = range(len(x1)) + + if row_type: + return [ {'features': vals} for vals in zip(x1,x2,x3)], y + else: + return [dict( zip(['x1','x2','x3','target', 'cartodb_id'],[x1,x2,x3,y,cartodb_id]))] + + def test_replace_nan_with_mean(self): + test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan]) + + def test_create_and_predict_segment(self): + n_samples = 1000 + + random_state_train = np.random.RandomState(13) + random_state_test = np.random.RandomState(134) + training_data = self.generate_random_data(n_samples, random_state_train) + test_data, test_y = self.generate_random_data(n_samples, random_state_test, row_type=True) + + + ids = [{'cartodb_ids': range(len(test_data))}] + rows = [{'x1': 0,'x2':0,'x3':0,'y':0,'cartodb_id':0}] + + plpy._define_result('select \* from \(select \* from training\) a limit 1',rows) + plpy._define_result('.*from \(select \* from training\) as a' ,training_data) + plpy._define_result('select array_agg\(cartodb\_id order by cartodb\_id\) as cartodb_ids from \(.*\) a',ids) + plpy._define_result('.*select \* from test.*' ,test_data) + + model_parameters = {'n_estimators': 1200, + 'max_depth': 3, + 'subsample' : 0.5, + 'learning_rate': 0.01, + 'min_samples_leaf': 1} + + result = segmentation.create_and_predict_segment( + 'select * from training', + 'target', + 'select * from test', + model_parameters) + + prediction = [r[1] for r in result] + + accuracy =np.sqrt(np.mean( np.square( np.array(prediction) - np.array(test_y)))) + + self.assertEqual(len(result),len(test_data)) + 
self.assertTrue( result[0][2] < 0.01) + self.assertTrue( accuracy < 0.5*np.mean(test_y) ) diff --git a/release/python/0.7.0/crankshaft/test/test_space_time_dynamics.py b/release/python/0.7.0/crankshaft/test/test_space_time_dynamics.py new file mode 100644 index 0000000..d14563e --- /dev/null +++ b/release/python/0.7.0/crankshaft/test/test_space_time_dynamics.py @@ -0,0 +1,349 @@ +import unittest +import numpy as np + +import unittest + + +from helper import fixture_file + +from crankshaft.space_time_dynamics import Markov +import crankshaft.space_time_dynamics as std +from crankshaft import random_seeds +from crankshaft.analysis_data_provider import AnalysisDataProvider +import json + + +class FakeDataProvider(AnalysisDataProvider): + def __init__(self, data): + self.mock_result = data + + def get_markov(self, w_type, params): + return self.mock_result + + +class SpaceTimeTests(unittest.TestCase): + """Testing class for Markov Functions.""" + + def setUp(self): + self.params = {"id_col": "cartodb_id", + "time_cols": ['dec_2013', 'jan_2014', 'feb_2014'], + "subquery": "SELECT * FROM a_list", + "geom_col": "the_geom", + "num_ngbrs": 321} + self.neighbors_data = json.loads( + open(fixture_file('neighbors_markov.json')).read()) + self.markov_data = json.loads(open(fixture_file('markov.json')).read()) + + self.time_data = np.array([i * np.ones(10, dtype=float) + for i in range(10)]).T + + self.transition_matrix = np.array([ + [[0.96341463, 0.0304878, 0.00609756, 0., 0.], + [0.06040268, 0.83221477, 0.10738255, 0., 0.], + [0., 0.14, 0.74, 0.12, 0.], + [0., 0.03571429, 0.32142857, 0.57142857, 0.07142857], + [0., 0., 0., 0.16666667, 0.83333333]], + [[0.79831933, 0.16806723, 0.03361345, 0., 0.], + [0.0754717, 0.88207547, 0.04245283, 0., 0.], + [0.00537634, 0.06989247, 0.8655914, 0.05913978, 0.], + [0., 0., 0.06372549, 0.90196078, 0.03431373], + [0., 0., 0., 0.19444444, 0.80555556]], + [[0.84693878, 0.15306122, 0., 0., 0.], + [0.08133971, 0.78947368, 0.1291866, 0., 0.], + 
[0.00518135, 0.0984456, 0.79274611, 0.0984456, 0.00518135], + [0., 0., 0.09411765, 0.87058824, 0.03529412], + [0., 0., 0., 0.10204082, 0.89795918]], + [[0.8852459, 0.09836066, 0., 0.01639344, 0.], + [0.03875969, 0.81395349, 0.13953488, 0., 0.00775194], + [0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505], + [0., 0.02339181, 0.12865497, 0.75438596, 0.09356725], + [0., 0., 0., 0.09661836, 0.90338164]], + [[0.33333333, 0.66666667, 0., 0., 0.], + [0.0483871, 0.77419355, 0.16129032, 0.01612903, 0.], + [0.01149425, 0.16091954, 0.74712644, 0.08045977, 0.], + [0., 0.01036269, 0.06217617, 0.89637306, 0.03108808], + [0., 0., 0., 0.02352941, 0.97647059]]] + ) + + def test_spatial_markov(self): + """Test Spatial Markov.""" + data = [{'id': d['id'], + 'attr1': d['y1995'], + 'attr2': d['y1996'], + 'attr3': d['y1997'], + 'attr4': d['y1998'], + 'attr5': d['y1999'], + 'attr6': d['y2000'], + 'attr7': d['y2001'], + 'attr8': d['y2002'], + 'attr9': d['y2003'], + 'attr10': d['y2004'], + 'attr11': d['y2005'], + 'attr12': d['y2006'], + 'attr13': d['y2007'], + 'attr14': d['y2008'], + 'attr15': d['y2009'], + 'neighbors': d['neighbors']} for d in self.neighbors_data] + # print(str(data[0])) + markov = Markov(FakeDataProvider(data)) + random_seeds.set_random_seeds(1234) + + result = markov.spatial_trend('subquery', + ['y1995', 'y1996', 'y1997', 'y1998', + 'y1999', 'y2000', 'y2001', 'y2002', + 'y2003', 'y2004', 'y2005', 'y2006', + 'y2007', 'y2008', 'y2009'], + 5, 'knn', 5, 0, 'the_geom', + 'cartodb_id') + + self.assertTrue(result is not None) + result = [(row[0], row[1], row[2], row[3], row[4]) for row in result] + print result[0] + expected = self.markov_data + for ([res_trend, res_up, res_down, res_vol, res_id], + [exp_trend, exp_up, exp_down, exp_vol, exp_id] + ) in zip(result, expected): + self.assertAlmostEqual(res_trend, exp_trend) + + def test_get_time_data(self): + """Test get_time_data""" + data = [{'attr1': d['y1995'], + 'attr2': d['y1996'], + 'attr3': d['y1997'], + 'attr4': 
d['y1998'], + 'attr5': d['y1999'], + 'attr6': d['y2000'], + 'attr7': d['y2001'], + 'attr8': d['y2002'], + 'attr9': d['y2003'], + 'attr10': d['y2004'], + 'attr11': d['y2005'], + 'attr12': d['y2006'], + 'attr13': d['y2007'], + 'attr14': d['y2008'], + 'attr15': d['y2009']} for d in self.neighbors_data] + + result = std.get_time_data(data, ['y1995', 'y1996', 'y1997', 'y1998', + 'y1999', 'y2000', 'y2001', 'y2002', + 'y2003', 'y2004', 'y2005', 'y2006', + 'y2007', 'y2008', 'y2009']) + + # expected was prepared from PySAL example: + # f = ps.open(ps.examples.get_path("usjoin.csv")) + # pci = np.array([f.by_col[str(y)] + # for y in range(1995, 2010)]).transpose() + # rpci = pci / (pci.mean(axis = 0)) + + expected = np.array( + [[0.87654416, 0.863147, 0.85637567, 0.84811668, 0.8446154, + 0.83271652, 0.83786314, 0.85012593, 0.85509656, 0.86416612, + 0.87119375, 0.86302631, 0.86148267, 0.86252252, 0.86746356], + [0.9188951, 0.91757931, 0.92333258, 0.92517289, 0.92552388, + 0.90746978, 0.89830489, 0.89431991, 0.88924794, 0.89815176, + 0.91832091, 0.91706054, 0.90139505, 0.87897455, 0.86216858], + [0.82591007, 0.82548596, 0.81989793, 0.81503235, 0.81731522, + 0.78964559, 0.80584442, 0.8084998, 0.82258551, 0.82668196, + 0.82373724, 0.81814804, 0.83675961, 0.83574199, 0.84647177], + [1.09088176, 1.08537689, 1.08456418, 1.08415404, 1.09898841, + 1.14506948, 1.12151133, 1.11160697, 1.10888621, 1.11399806, + 1.12168029, 1.13164797, 1.12958508, 1.11371818, 1.09936775], + [1.10731446, 1.11373944, 1.13283638, 1.14472559, 1.15910025, + 1.16898201, 1.17212488, 1.14752303, 1.11843284, 1.11024964, + 1.11943471, 1.11736468, 1.10863242, 1.09642516, 1.07762337], + [1.42269757, 1.42118434, 1.44273502, 1.43577571, 1.44400684, + 1.44184737, 1.44782832, 1.41978227, 1.39092208, 1.4059372, + 1.40788646, 1.44052766, 1.45241216, 1.43306098, 1.4174431], + [1.13073885, 1.13110513, 1.11074708, 1.13364636, 1.13088149, + 1.10888138, 1.11856629, 1.13062931, 1.11944984, 1.12446239, + 1.11671008, 1.10880034, 
1.08401709, 1.06959206, 1.07875225], + [1.04706124, 1.04516831, 1.04253372, 1.03239987, 1.02072545, + 0.99854316, 0.9880258, 0.99669587, 0.99327676, 1.01400905, + 1.03176742, 1.040511, 1.01749645, 0.9936394, 0.98279746], + [0.98996986, 1.00143564, 0.99491, 1.00188408, 1.00455845, + 0.99127006, 0.97925917, 0.9683482, 0.95335147, 0.93694787, + 0.94308213, 0.92232874, 0.91284091, 0.89689833, 0.88928858], + [0.87418391, 0.86416601, 0.84425695, 0.8404494, 0.83903044, + 0.8578708, 0.86036185, 0.86107306, 0.8500772, 0.86981998, + 0.86837929, 0.87204141, 0.86633032, 0.84946077, 0.83287146], + [1.14196118, 1.14660262, 1.14892712, 1.14909594, 1.14436624, + 1.14450183, 1.12349752, 1.12596664, 1.12213996, 1.1119989, + 1.10257792, 1.10491258, 1.11059842, 1.10509795, 1.10020097], + [0.97282463, 0.96700147, 0.96252588, 0.9653878, 0.96057687, + 0.95831051, 0.94480909, 0.94804195, 0.95430286, 0.94103989, + 0.92122519, 0.91010201, 0.89280392, 0.89298243, 0.89165385], + [0.94325468, 0.96436902, 0.96455242, 0.95243009, 0.94117647, + 0.9480927, 0.93539182, 0.95388718, 0.94597005, 0.96918424, + 0.94781281, 0.93466815, 0.94281559, 0.96520315, 0.96715441], + [0.97478408, 0.98169225, 0.98712809, 0.98474769, 0.98559897, + 0.98687073, 0.99237486, 0.98209969, 0.9877653, 0.97399471, + 0.96910087, 0.98416665, 0.98423613, 0.99823861, 0.99545704], + [0.85570269, 0.85575915, 0.85986132, 0.85693406, 0.8538012, + 0.86191535, 0.84981451, 0.85472102, 0.84564835, 0.83998883, + 0.83478547, 0.82803648, 0.8198736, 0.82265395, 0.8399404], + [0.87022047, 0.85996258, 0.85961813, 0.85689572, 0.83947136, + 0.82785597, 0.86008789, 0.86776298, 0.86720209, 0.8676334, + 0.89179317, 0.94202108, 0.9422231, 0.93902708, 0.94479184], + [0.90134907, 0.90407738, 0.90403991, 0.90201769, 0.90399238, + 0.90906632, 0.92693339, 0.93695966, 0.94242697, 0.94338265, + 0.91981796, 0.91108804, 0.90543476, 0.91737138, 0.94793657], + [1.1977611, 1.18222564, 1.18439158, 1.18267865, 1.19286723, + 1.20172869, 1.21328691, 1.22624778, 
1.22397075, 1.23857042, + 1.24419893, 1.23929384, 1.23418676, 1.23626739, 1.26754398], + [1.24919678, 1.25754773, 1.26991161, 1.28020651, 1.30625667, + 1.34790023, 1.34399863, 1.32575181, 1.30795492, 1.30544841, + 1.30303302, 1.32107766, 1.32936244, 1.33001241, 1.33288462], + [1.06768004, 1.03799276, 1.03637303, 1.02768449, 1.03296093, + 1.05059016, 1.03405057, 1.02747623, 1.03162734, 0.9961416, + 0.97356208, 0.94241549, 0.92754547, 0.92549227, 0.92138102], + [1.09475614, 1.11526796, 1.11654299, 1.13103948, 1.13143264, + 1.13889622, 1.12442212, 1.13367018, 1.13982256, 1.14029944, + 1.11979401, 1.10905389, 1.10577769, 1.11166825, 1.09985155], + [0.76530058, 0.76612841, 0.76542451, 0.76722683, 0.76014284, + 0.74480073, 0.76098396, 0.76156903, 0.76651952, 0.76533288, + 0.78205934, 0.76842416, 0.77487118, 0.77768683, 0.78801192], + [0.98391336, 0.98075816, 0.98295341, 0.97386015, 0.96913803, + 0.97370819, 0.96419154, 0.97209861, 0.97441313, 0.96356162, + 0.94745352, 0.93965462, 0.93069645, 0.94020973, 0.94358232], + [0.83561828, 0.82298088, 0.81738502, 0.81748588, 0.80904801, + 0.80071489, 0.83358256, 0.83451613, 0.85175032, 0.85954307, + 0.86790024, 0.87170334, 0.87863799, 0.87497981, 0.87888675], + [0.98845573, 1.02092428, 0.99665283, 0.99141823, 0.99386619, + 0.98733195, 0.99644997, 0.99669587, 1.02559097, 1.01116651, + 0.99988024, 0.97906749, 0.99323123, 1.00204939, 0.99602148], + [1.14930913, 1.15241949, 1.14300962, 1.14265542, 1.13984683, + 1.08312397, 1.05192626, 1.04230892, 1.05577278, 1.08569751, + 1.12443486, 1.08891079, 1.08603695, 1.05997314, 1.02160943], + [1.11368269, 1.1057147, 1.11893431, 1.13778669, 1.1432272, + 1.18257029, 1.16226243, 1.16009196, 1.14467789, 1.14820235, + 1.12386598, 1.12680236, 1.12357937, 1.1159258, 1.12570828], + [1.30379431, 1.30752186, 1.31206366, 1.31532267, 1.30625667, + 1.31210239, 1.29989156, 1.29203193, 1.27183516, 1.26830786, + 1.2617743, 1.28656675, 1.29734097, 1.29390205, 1.29345446], + [0.83953719, 0.82701448, 
0.82006005, 0.81188876, 0.80294864, + 0.78772975, 0.82848011, 0.8259679, 0.82435705, 0.83108634, + 0.84373784, 0.83891093, 0.84349247, 0.85637272, 0.86539395], + [1.23450087, 1.2426022, 1.23537935, 1.23581293, 1.24522626, + 1.2256767, 1.21126648, 1.19377804, 1.18355337, 1.19674434, + 1.21536573, 1.23653297, 1.27962009, 1.27968392, 1.25907738], + [0.9769662, 0.97400719, 0.98035944, 0.97581531, 0.95543282, + 0.96480308, 0.94686376, 0.93679073, 0.92540049, 0.92988835, + 0.93442917, 0.92100464, 0.91475304, 0.90249622, 0.9021363], + [0.84986886, 0.8986851, 0.84295997, 0.87280534, 0.85659368, + 0.88937573, 0.894401, 0.90448993, 0.95495898, 0.92698333, + 0.94745352, 0.92562488, 0.96635366, 1.02520312, 1.0394296], + [1.01922808, 1.00258203, 1.00974428, 1.00303417, 0.99765073, + 1.00759019, 0.99192968, 0.99747298, 0.99550759, 0.97583768, + 0.9610168, 0.94779638, 0.93759089, 0.93353431, 0.94121705], + [0.86367411, 0.85558932, 0.85544346, 0.85103025, 0.84336613, + 0.83434854, 0.85813595, 0.84667961, 0.84374558, 0.85951183, + 0.87194227, 0.89455097, 0.88283929, 0.90349491, 0.90600675], + [1.00947534, 1.00411055, 1.00698819, 0.99513687, 0.99291086, + 1.00581626, 0.98850522, 0.99291168, 0.98983209, 0.97511924, + 0.96134615, 0.96382634, 0.95011401, 0.9434686, 0.94637765], + [1.05712571, 1.05459419, 1.05753012, 1.04880786, 1.05103857, + 1.04800023, 1.03024941, 1.04200483, 1.0402554, 1.03296979, + 1.02191682, 1.02476275, 1.02347523, 1.02517684, 1.04359571], + [1.07084189, 1.06669497, 1.07937623, 1.07387988, 1.0794043, + 1.0531801, 1.07452771, 1.09383478, 1.1052447, 1.10322136, + 1.09167939, 1.08772756, 1.08859544, 1.09177338, 1.1096083], + [0.86719222, 0.86628896, 0.86675156, 0.86425632, 0.86511809, + 0.86287327, 0.85169796, 0.85411285, 0.84886336, 0.84517414, + 0.84843858, 0.84488343, 0.83374329, 0.82812044, 0.82878599], + [0.88389211, 0.92288667, 0.90282398, 0.91229186, 0.92023286, + 0.92652175, 0.94278865, 0.93682452, 0.98655146, 0.992237, + 0.9798497, 0.93869677, 0.96947771, 
1.00362626, 0.98102351], + [0.97082064, 0.95320233, 0.94534081, 0.94215593, 0.93967, + 0.93092109, 0.92662519, 0.93412152, 0.93501274, 0.92879506, + 0.92110542, 0.91035556, 0.90430364, 0.89994694, 0.90073864], + [0.95861858, 0.95774543, 0.98254811, 0.98919472, 0.98684824, + 0.98882205, 0.97662234, 0.95601578, 0.94905385, 0.94934888, + 0.97152609, 0.97163004, 0.9700702, 0.97158948, 0.95884908], + [0.83980439, 0.84726737, 0.85747, 0.85467221, 0.8556751, + 0.84818516, 0.85265681, 0.84502402, 0.82645665, 0.81743586, + 0.83550406, 0.83338919, 0.83511679, 0.82136617, 0.80921874], + [0.95118156, 0.9466212, 0.94688098, 0.9508583, 0.9512441, + 0.95440787, 0.96364363, 0.96804412, 0.97136214, 0.97583768, + 0.95571724, 0.96895368, 0.97001634, 0.97082733, 0.98782366], + [1.08910044, 1.08248968, 1.08492895, 1.08656923, 1.09454249, + 1.10558188, 1.1214086, 1.12292577, 1.13021031, 1.13342735, + 1.14686068, 1.14502975, 1.14474747, 1.14084037, 1.16142926], + [1.06336033, 1.07365823, 1.08691496, 1.09764846, 1.11669863, + 1.11856702, 1.09764283, 1.08815849, 1.08044313, 1.09278827, + 1.07003204, 1.08398066, 1.09831768, 1.09298232, 1.09176125], + [0.79772065, 0.78829196, 0.78581151, 0.77615922, 0.77035744, + 0.77751194, 0.79902974, 0.81437881, 0.80788828, 0.79603865, + 0.78966436, 0.79949807, 0.80172182, 0.82168155, 0.85587911], + [1.0052447, 1.00007696, 1.00475899, 1.00613942, 1.00639561, + 1.00162979, 0.99860739, 1.00814981, 1.00574316, 0.99030032, + 0.97682565, 0.97292596, 0.96519561, 0.96173403, 0.95890284], + [0.95808419, 0.9382568, 0.9654441, 0.95561201, 0.96987289, + 0.96608031, 0.99727185, 1.00781194, 1.03484236, 1.05333619, + 1.0983263, 1.1704974, 1.17025154, 1.18730553, 1.14242645]]) + + self.assertTrue(np.allclose(result, expected)) + self.assertTrue(type(result) == type(expected)) + self.assertTrue(result.shape == expected.shape) + + def test_rebin_data(self): + """Test rebin_data""" + # sample in double the time (even case since 10 % 2 = 0): + # (0+1)/2, (2+3)/2, (4+5)/2, 
(6+7)/2, (8+9)/2 + # = 0.5, 2.5, 4.5, 6.5, 8.5 + ans_even = np.array([(i + 0.5) * np.ones(10, dtype=float) + for i in range(0, 10, 2)]).T + + self.assertTrue( + np.array_equal(std.rebin_data(self.time_data, 2), ans_even)) + + # sample in triple the time (uneven since 10 % 3 = 1): + # (0+1+2)/3, (3+4+5)/3, (6+7+8)/3, (9)/1 + # = 1, 4, 7, 9 + ans_odd = np.array([i * np.ones(10, dtype=float) + for i in (1, 4, 7, 9)]).T + self.assertTrue( + np.array_equal(std.rebin_data(self.time_data, 3), ans_odd)) + + def test_get_prob_dist(self): + """Test get_prob_dist""" + lag_indices = np.array([1, 2, 3, 4]) + unit_indices = np.array([1, 3, 2, 4]) + answer = np.array([ + [0.0754717, 0.88207547, 0.04245283, 0., 0.], + [0., 0., 0.09411765, 0.87058824, 0.03529412], + [0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505], + [0., 0., 0., 0.02352941, 0.97647059] + ]) + result = std.get_prob_dist(self.transition_matrix, + lag_indices, unit_indices) + + self.assertTrue(np.array_equal(result, answer)) + + def test_get_prob_stats(self): + """Test get_prob_stats""" + + probs = np.array([ + [0.0754717, 0.88207547, 0.04245283, 0., 0.], + [0., 0., 0.09411765, 0.87058824, 0.03529412], + [0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505], + [0., 0., 0., 0.02352941, 0.97647059] + ]) + unit_indices = np.array([1, 3, 2, 4]) + answer_up = np.array([0.04245283, 0.03529412, 0.12376238, 0.]) + answer_down = np.array([0.0754717, 0.09411765, 0.0990099, 0.02352941]) + answer_trend = np.array([-0.03301887 / 0.88207547, + -0.05882353 / 0.87058824, + 0.02475248 / 0.77722772, + -0.02352941 / 0.97647059]) + answer_volatility = np.array([0.34221495, 0.33705421, + 0.29226542, 0.38834223]) + + result = std.get_prob_stats(probs, unit_indices) + result_up = result[0] + result_down = result[1] + result_trend = result[2] + result_volatility = result[3] + + self.assertTrue(np.allclose(result_up, answer_up)) + self.assertTrue(np.allclose(result_down, answer_down)) + 
self.assertTrue(np.allclose(result_trend, answer_trend)) + self.assertTrue(np.allclose(result_volatility, answer_volatility)) diff --git a/src/pg/crankshaft.control b/src/pg/crankshaft.control index 216a89f..7d5a93a 100644 --- a/src/pg/crankshaft.control +++ b/src/pg/crankshaft.control @@ -1,5 +1,5 @@ comment = 'CartoDB Spatial Analysis extension' -default_version = '0.6.1' +default_version = '0.7.0' requires = 'plpythonu, postgis' superuser = true schema = cdb_crankshaft