diff --git a/NEWS.md b/NEWS.md
index 2260f7e..c3d70bd 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,4 @@
-0.8.0 (yyyy-mm-dd)
+0.8.0 (2018-03-12)
------------------
* Adds `CDB_MoransILocal*` functions that return spatial lag [#202](https://github.com/CartoDB/crankshaft/pull/202)
diff --git a/release/crankshaft--0.7.0--0.8.0.sql b/release/crankshaft--0.7.0--0.8.0.sql
new file mode 100644
index 0000000..a0b2474
--- /dev/null
+++ b/release/crankshaft--0.7.0--0.8.0.sql
@@ -0,0 +1,2307 @@
+--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES
+-- Complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit
+-- Version number of the extension release.
+-- Takes no arguments and always yields the same text value.
+CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
+RETURNS text
+LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE
+AS $$
+  SELECT CAST('0.8.0' AS text);
+$$;
+
+-- Internal identifier of the installed extension instance,
+-- e.g. 'dev' for the current development version.
+-- Reads installed_version from pg_available_extensions for the extension
+-- named 'crankshaft'. The whole-row IS NOT NULL test only accepts a
+-- catalog row in which every column is non-null.
+CREATE OR REPLACE FUNCTION _cdb_crankshaft_internal_version()
+RETURNS text
+LANGUAGE sql STABLE STRICT PARALLEL SAFE
+AS $$
+  SELECT ext.installed_version
+  FROM pg_available_extensions AS ext
+  WHERE ext.name = 'crankshaft'
+    AND ext IS NOT NULL;
+$$;
+-- Internal function.
+-- Sets the seeds of the RNGs (Random Number Generators) used internally
+-- by handing seed_value to crankshaft.random_seeds.set_random_seeds.
+CREATE OR REPLACE FUNCTION _cdb_random_seeds(seed_value INTEGER)
+RETURNS VOID
+LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE
+AS $$
+  from crankshaft import random_seeds as seeding
+  seeding.set_random_seeds(seed_value)
+$$;
+-- State transition function used by the CDB_PyAgg aggregate.
+-- While the state has no upper bound yet (first row), the upper bound of
+-- the incoming row is stored as element 1 of the state. Every row is then
+-- appended to the state with array_cat, so the result is a flat array
+-- whose first element is the row width.
+CREATE OR REPLACE FUNCTION CDB_PyAggS(
+    current_state Numeric[],
+    current_row Numeric[])
+RETURNS NUMERIC[]
+LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE
+AS $$
+DECLARE
+  row_width INTEGER := array_upper(current_row, 1);
+BEGIN
+  IF array_upper(current_state, 1) IS NULL THEN
+    RAISE NOTICE 'setting state %', row_width;
+    current_state[1] := row_width;
+  END IF;
+  RETURN array_cat(current_state, current_row);
+END
+$$;
+
+-- Create the CDB_PyAgg aggregate unless the cdb_crankshaft schema already
+-- holds an aggregate named cdb_pyagg (CREATE AGGREGATE has no
+-- "IF NOT EXISTS" form, hence the catalog lookup).
+DO $$
+DECLARE
+  already_there BOOLEAN;
+BEGIN
+  SELECT EXISTS (
+    SELECT 1
+    FROM pg_catalog.pg_proc AS p
+    INNER JOIN pg_catalog.pg_namespace AS n ON n.oid = p.pronamespace
+    WHERE n.nspname = 'cdb_crankshaft'
+      AND p.proname = 'cdb_pyagg'
+      AND p.proisagg
+  ) INTO already_there;
+
+  IF already_there THEN
+    RETURN;
+  END IF;
+
+  CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
+    SFUNC = CDB_PyAggS,
+    STYPE = Numeric[],
+    PARALLEL = SAFE,
+    INITCOND = "{}"
+  );
+END
+$$ LANGUAGE plpgsql;
+
+-- Array-based segmentation entry point.
+--   target:          training target values
+--   features:        flattened 2D feature matrix whose first element is the
+--                    number of columns per row (the layout CDB_PyAggS builds)
+--   target_features: flattened matrix of the rows to predict, same layout
+--   target_ids:      ids handed through to the model call
+--   n_estimators, max_depth, subsample, learning_rate, min_samples_leaf:
+--                    forwarded to the model as a parameter dictionary
+-- Returns the rows produced by
+-- crankshaft.segmentation.create_and_predict_segment_agg as
+-- (cartodb_id, prediction, accuracy).
+CREATE OR REPLACE FUNCTION
+  CDB_CreateAndPredictSegment(
+      target NUMERIC[],
+      features NUMERIC[],
+      target_features NUMERIC[],
+      target_ids NUMERIC[],
+      n_estimators INTEGER DEFAULT 1200,
+      max_depth INTEGER DEFAULT 3,
+      subsample DOUBLE PRECISION DEFAULT 0.5,
+      learning_rate DOUBLE PRECISION DEFAULT 0.01,
+      min_samples_leaf INTEGER DEFAULT 1)
+RETURNS TABLE(cartodb_id NUMERIC, prediction NUMERIC, accuracy NUMERIC)
+AS $$
+    import numpy as np
+    import plpy
+
+    from crankshaft.segmentation import create_and_predict_segment_agg
+    model_params = {'n_estimators': n_estimators,
+                    'max_depth': max_depth,
+                    'subsample': subsample,
+                    'learning_rate': learning_rate,
+                    'min_samples_leaf': min_samples_leaf}
+
+    def unpack2D(data):
+        # The first element carries the row width, the rest is the
+        # row-major matrix content. NUMERIC values do not arrive as Python
+        # ints, so the width is converted before being used as a shape, and
+        # floor division keeps the row count integral. Slicing (instead of
+        # pop) leaves the caller's list untouched.
+        dimension = int(data[0])
+        a = np.array(data[1:], dtype=float)
+        return a.reshape(len(a) // dimension, dimension)
+
+    return create_and_predict_segment_agg(np.array(target, dtype=float),
+                                          unpack2D(features),
+                                          unpack2D(target_features),
+                                          target_ids,
+                                          model_params)
+
+$$ LANGUAGE plpythonu VOLATILE PARALLEL RESTRICTED;
+
+-- Query-based segmentation entry point.
+-- Bundles the five model settings into a dictionary and hands query,
+-- variable_name, target_table and that dictionary to
+-- crankshaft.segmentation.create_and_predict_segment, whose rows are
+-- returned as (cartodb_id, prediction, accuracy).
+CREATE OR REPLACE FUNCTION
+  CDB_CreateAndPredictSegment (
+      query TEXT,
+      variable_name TEXT,
+      target_table TEXT,
+      n_estimators INTEGER DEFAULT 1200,
+      max_depth INTEGER DEFAULT 3,
+      subsample DOUBLE PRECISION DEFAULT 0.5,
+      learning_rate DOUBLE PRECISION DEFAULT 0.01,
+      min_samples_leaf INTEGER DEFAULT 1)
+RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC)
+AS $$
+    from crankshaft.segmentation import create_and_predict_segment
+
+    model_params = dict(
+        n_estimators=n_estimators,
+        max_depth=max_depth,
+        subsample=subsample,
+        learning_rate=learning_rate,
+        min_samples_leaf=min_samples_leaf,
+    )
+    return create_and_predict_segment(query, variable_name, target_table,
+                                      model_params)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+-- Query-based gravity model.
+-- Runs target_query and source_query, aggregates their rows into arrays
+-- and delegates to the array-based CDB_Gravity overload.
+--   target_query:  SQL text yielding cartodb_id, the_geom and weight_column
+--   weight_column: name of the weight column in target_query
+--   source_query:  SQL text yielding cartodb_id, the_geom and pop_column
+--   pop_column:    name of the population column in source_query
+--   target, radius, minval: passed through to the array overload
+-- Returns the rows of the array overload unchanged.
+-- NOTE: both query arguments are executed as SQL; callers must only pass
+-- trusted statements. Column names are quoted as identifiers via %I.
+CREATE OR REPLACE FUNCTION CDB_Gravity(
+    IN target_query text,
+    IN weight_column text,
+    IN source_query text,
+    IN pop_column text,
+    IN target bigint,
+    IN radius integer,
+    IN minval numeric DEFAULT -10e307
+    )
+RETURNS TABLE(
+    the_geom geometry,
+    source_id bigint,
+    target_id bigint,
+    dist numeric,
+    h numeric,
+    hpop numeric) AS $$
+DECLARE
+    t_id bigint[];
+    t_geom geometry[];
+    t_weight numeric[];
+    s_id bigint[];
+    s_geom geometry[];
+    s_pop numeric[];
+BEGIN
+    -- Strings are concatenated with || / format(); PostgreSQL has no text
+    -- "+" operator, so the previous 'a' + b form failed at run time.
+    EXECUTE format(
+        'WITH foo AS (%s) SELECT array_agg(cartodb_id), array_agg(the_geom), array_agg(%I) FROM foo',
+        target_query, weight_column)
+    INTO t_id, t_geom, t_weight;
+
+    EXECUTE format(
+        'WITH foo AS (%s) SELECT array_agg(cartodb_id), array_agg(the_geom), array_agg(%I) FROM foo',
+        source_query, pop_column)
+    INTO s_id, s_geom, s_pop;
+
+    -- The aggregated arrays are plain variables, so the array overload is
+    -- the only relation needed in FROM.
+    RETURN QUERY
+    SELECT g.*
+    FROM CDB_Gravity(t_id, t_geom, t_weight, s_id, s_geom, s_pop, target, radius, minval) AS g;
+END;
+$$ language plpgsql VOLATILE PARALLEL UNSAFE;
+
+-- Array-based gravity model.
+--   t_id, t_geom, t_weight: parallel arrays describing the targets
+--   s_id, s_geom, s_pop:    parallel arrays describing the sources
+--   target:  id of the target whose rows are returned
+--   radius:  distance limit handed to ST_DWithin on geography values
+--   minval:  only targets with a weight greater than minval are considered
+-- For every source that has `target` within radius, one row is returned:
+--   the_geom  source geometry
+--   source_id source id
+--   target_id target id (equal to `target`)
+--   dist      distance between the centres, reported as 0.0 when the
+--             clamped distance is not greater than 1
+--   h         100 * (weight/distance) / sum(weight/distance), the sum taken
+--             over all targets within radius of that source
+--   hpop      source population multiplied by the same share
+-- NOTE(review): if the weights of all targets in range sum to zero, h_deno
+-- is 0 and the final division raises an error - confirm whether that is
+-- acceptable.
+CREATE OR REPLACE FUNCTION CDB_Gravity(
+ IN t_id bigint[],
+ IN t_geom geometry[],
+ IN t_weight numeric[],
+ IN s_id bigint[],
+ IN s_geom geometry[],
+ IN s_pop numeric[],
+ IN target bigint,
+ IN radius integer,
+ IN minval numeric DEFAULT -10e307
+ )
+RETURNS TABLE(
+ the_geom geometry,
+ source_id bigint,
+ target_id bigint,
+ dist numeric,
+ h numeric,
+ hpop numeric) AS $$
+DECLARE
+ t_type text;
+ s_type text;
+ t_center geometry[];
+ s_center geometry[];
+BEGIN
+ -- The type of the first element decides for the whole array: points are
+ -- used as they are, anything else is replaced by its centroid.
+ t_type := GeometryType(t_geom[1]);
+ s_type := GeometryType(s_geom[1]);
+ IF t_type = 'POINT' THEN
+ t_center := t_geom;
+ ELSE
+ WITH tmp as (SELECT unnest(t_geom) as g) SELECT array_agg(ST_Centroid(g)) INTO t_center FROM tmp;
+ END IF;
+ IF s_type = 'POINT' THEN
+ s_center := s_geom;
+ ELSE
+ WITH tmp as (SELECT unnest(s_geom) as g) SELECT array_agg(ST_Centroid(g)) INTO s_center FROM tmp;
+ END IF;
+ RETURN QUERY
+ -- target0 / source0: turn the parallel arrays back into rows
+ with target0 as(
+ SELECT unnest(t_center) as tc, unnest(t_weight) as tw, unnest(t_id) as td
+ ),
+ source0 as(
+ SELECT unnest(s_center) as sc, unnest(s_id) as sd, unnest (s_geom) as sg, unnest(s_pop) as sp
+ ),
+ -- prev0: every (source, target) pair within radius; NULL weights and
+ -- populations count as 0 and the distance is clamped to at least 1.0
+ prev0 as(
+ SELECT
+ source0.sg,
+ source0.sd as sourc_id,
+ coalesce(source0.sp,0) as sp,
+ target.td as targ_id,
+ coalesce(target.tw,0) as tw,
+ GREATEST(1.0,ST_Distance(geography(target.tc), geography(source0.sc)))::numeric as distance
+ FROM source0
+ CROSS JOIN LATERAL
+ (
+ SELECT
+ *
+ FROM target0
+ WHERE tw > minval
+ AND ST_DWithin(geography(source0.sc), geography(tc), radius)
+ ) AS target
+ ),
+ -- deno: per-source normalising sum of weight/distance
+ deno as(
+ SELECT
+ sourc_id,
+ sum(tw/distance) as h_deno
+ FROM
+ prev0
+ GROUP BY sourc_id
+ )
+ SELECT
+ p.sg as the_geom,
+ p.sourc_id as source_id,
+ p.targ_id as target_id,
+ case when p.distance > 1 then p.distance else 0.0 end as dist,
+ 100*(p.tw/p.distance)/d.h_deno as h,
+ p.sp*(p.tw/p.distance)/d.h_deno as hpop
+ FROM
+ prev0 p,
+ deno d
+ WHERE
+ p.targ_id = target AND
+ p.sourc_id = d.sourc_id;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+-- Interpolation methods accepted by CDB_SpatialInterpolation:
+-- 0: nearest neighbor(s)
+-- 1: barycentric
+-- 2: IDW (inverse distance weighting)
+-- 3: kriging ---> TO DO
+
+
+-- Query-based spatial interpolation.
+--   query:  SQL text that must yield the columns the_geom and attrib
+--   point:  location at which a value is interpolated
+--   method, p1, p2: passed through to the array-based overload
+-- Aggregates the query result into arrays and returns the value computed
+-- by the array-based CDB_SpatialInterpolation overload.
+-- NOTE: query is executed as SQL; callers must only pass trusted statements.
+CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation(
+    IN query text,
+    IN point geometry,
+    IN method integer DEFAULT 1,
+    IN p1 numeric DEFAULT 0,
+    IN p2 numeric DEFAULT 0
+    )
+RETURNS numeric AS
+$$
+DECLARE
+    gs geometry[];
+    vs numeric[];
+    output numeric;
+BEGIN
+    EXECUTE 'WITH a AS(' || query || ') SELECT array_agg(the_geom), array_agg(attrib) FROM a'
+    INTO gs, vs;
+
+    -- The CTE "a" only exists inside the dynamic statement above, so the
+    -- array overload is called directly instead of selecting "FROM a".
+    output := CDB_SpatialInterpolation(gs, vs, point, method, p1, p2);
+
+    RETURN output;
+END;
+$$
+language plpgsql VOLATILE PARALLEL UNSAFE;
+
+-- Array-based spatial interpolation.
+--   geomin: sample geometries
+--   colin:  sample values, parallel to geomin
+--   point:  location at which a value is interpolated
+--   method: 0 nearest neighbor(s), 1 barycentric, 2 IDW, 3 kriging (TO DO)
+--   p1:     neighbor limit (method 0: 0 means the closest one;
+--           method 2: 0 means no limit)
+--   p2:     order of distance decay for method 2 (0 means order 1)
+-- Returns the interpolated value, -888.888 when no Delaunay triangle of
+-- the samples contains the point (method 1), and -777.777 when the method
+-- is 3 or not one of the values above.
+-- NOTE(review): with method 2 a point at distance 0 from a sample makes
+-- the 1/distance terms divide by zero - confirm how exact hits should be
+-- handled.
+CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation(
+    IN geomin geometry[],
+    IN colin numeric[],
+    IN point geometry,
+    IN method integer DEFAULT 1,
+    IN p1 numeric DEFAULT 0,
+    IN p2 numeric DEFAULT 0
+    )
+RETURNS numeric AS
+$$
+DECLARE
+    gs geometry[];
+    vs numeric[];
+    gs2 geometry[];
+    vs2 numeric[];
+    g geometry;
+    vertex geometry[];
+    sg numeric;
+    sa numeric;
+    sb numeric;
+    sc numeric;
+    va numeric;
+    vb numeric;
+    vc numeric;
+    output numeric;
+    max_ngbrs integer;
+BEGIN
+    -- output := -999.999;
+
+    -- nearest neighbors
+    -- p1: limit the number of neighbors, 0-> closest one
+    IF method = 0 THEN
+
+        IF p1 = 0 THEN
+            p1 := 1;
+        END IF;
+
+        WITH    a as (SELECT unnest(geomin) as g, unnest(colin) as v),
+                b as (SELECT a.v as v FROM a ORDER BY point<->a.g LIMIT p1::integer)
+        SELECT avg(b.v) INTO output FROM b;
+        RETURN output;
+
+    -- barycentric
+    ELSIF method = 1 THEN
+        -- triangle of the Delaunay triangulation that contains the point
+        WITH    a as (SELECT unnest(geomin) AS e),
+                b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
+                c as (SELECT (ST_Dump(t)).geom as v FROM b),
+                d as (SELECT v FROM c WHERE ST_Within(point, v))
+            SELECT v INTO g FROM d;
+        IF g is null THEN
+            -- out of the realm of the input data
+            RETURN -888.888;
+        END IF;
+        -- vertex of the selected cell
+        WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
+        SELECT array_agg(v) INTO vertex FROM a;
+
+        -- retrieve the value of each vertex
+        WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
+        SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
+        WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
+        SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
+        WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
+        SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
+
+        -- sg: area of the whole triangle; sa, sb, sc: areas of the
+        -- sub-triangles opposite to vertex 1, 2 and 3, used as weights
+        SELECT ST_area(g), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[2], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[1], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point,vertex[1],vertex[2], point]))) INTO sg, sa, sb, sc;
+
+        output := (coalesce(sa,0) * coalesce(va,0) + coalesce(sb,0) * coalesce(vb,0) + coalesce(sc,0) * coalesce(vc,0)) / sg;
+        RETURN output;
+
+    -- IDW
+    -- p1: limit the number of neighbors, 0->no limit
+    -- p2: order of distance decay, 0-> order 1
+    ELSIF method = 2 THEN
+
+        IF p2 = 0 THEN
+            p2 := 1;
+        END IF;
+
+        -- samples ordered by increasing distance to the point
+        WITH    a as (SELECT unnest(geomin) as g, unnest(colin) as v),
+                b as (SELECT a.g, a.v FROM a ORDER BY point<->a.g)
+        SELECT array_agg(b.g), array_agg(b.v) INTO gs, vs FROM b;
+
+        max_ngbrs := p1::integer;
+        IF max_ngbrs > 0 THEN
+            -- Keep only the nearest max_ngbrs samples. An array slice is
+            -- reduced to the existing bounds, so a limit larger than the
+            -- sample count simply keeps every sample.
+            gs2 := gs[1:max_ngbrs];
+            vs2 := vs[1:max_ngbrs];
+        ELSE
+            gs2 := gs;
+            vs2 := vs;
+        END IF;
+
+        WITH    a as (SELECT unnest(gs2) as g, unnest(vs2) as v),
+                b as (
+                    SELECT
+                        (1/ST_distance(point, a.g)^p2::integer) as k,
+                        (a.v/ST_distance(point, a.g)^p2::integer) as f
+                    FROM a
+                )
+        SELECT sum(b.f)/sum(b.k) INTO output FROM b;
+        RETURN output;
+
+    -- kriging
+    ELSIF method = 3 THEN
+
+    -- TO DO
+
+    END IF;
+
+    RETURN -777.777;
+
+END;
+$$
+language plpgsql IMMUTABLE PARALLEL SAFE;
+-- =============================================================================================
+--
+-- CDB_Voronoi
+--
+-- =============================================================================================
+-- Builds Voronoi cells for a set of input geometries from their Delaunay
+-- triangulation.
+--   geomin:    input geometries (transformed to SRID 3857 for the computation)
+--   buffer:    factor used to size the clipping circle, the extra envelope
+--              points and the extension of open edges
+--   tolerance: tolerance handed to ST_DelaunayTriangles and used to decide
+--              whether a triangle centre lies on the triangle boundary
+-- Returns the collected cells, clipped and transformed to SRID 4326.
+CREATE OR REPLACE FUNCTION CDB_voronoi(
+ IN geomin geometry[],
+ IN buffer numeric DEFAULT 0.5,
+ IN tolerance numeric DEFAULT 1e-9
+ )
+RETURNS geometry AS $$
+DECLARE
+ geomout geometry;
+BEGIN
+ -- we need to make the geometry calculations in (pseudo)meters!!!
+ with a as (
+ SELECT unnest(geomin) as g1
+ ),
+ b as(
+ SELECT st_transform(g1, 3857) g2 from a
+ )
+ SELECT array_agg(g2) INTO geomin from b;
+
+ WITH
+ -- convex hull of the input and r = buffer * sqrt(hull area / pi)
+ convexhull_1 as (
+ SELECT
+ ST_ConvexHull(ST_Collect(geomin)) as g,
+ buffer * |/ (st_area(ST_ConvexHull(ST_Collect(geomin)))/PI()) as r
+ ),
+ -- clipping area: minimum bounding circle of the hull, buffered by buffer*r
+ clipper as(
+ SELECT
+ st_buffer(ST_MinimumBoundingCircle(a.g), buffer*a.r) as g
+ FROM convexhull_1 a
+ ),
+ -- points of the hull expanded by buffer*r, added as extra sample points
+ env0 as (
+ SELECT
+ (st_dumppoints(st_expand(a.g, buffer*a.r))).geom as e
+ FROM convexhull_1 a
+ ),
+ env as (
+ SELECT
+ array_agg(env0.e) as e
+ FROM env0
+ ),
+ -- input geometries plus the envelope points
+ sample AS (
+ SELECT
+ ST_Collect(geomin || env.e) as geom
+ FROM env
+ ),
+ convexhull as (
+ SELECT
+ ST_ConvexHull(ST_Collect(geomin)) as cg
+ ),
+ -- Delaunay triangulation of the sample
+ tin as (
+ SELECT
+ ST_Dump(ST_DelaunayTriangles(geom, tolerance, 0)) as gd
+ FROM
+ sample
+ ),
+ -- one row per triangle: dump path as id, polygon, and the centroid of its
+ -- minimum bounding circle
+ tin_polygons as (
+ SELECT
+ (gd).Path as id,
+ (gd).Geom as pg,
+ ST_Centroid(ST_MinimumBoundingCircle((gd).Geom, 180)) as ct
+ FROM tin
+ ),
+ tin_lines as (
+ SELECT
+ id,
+ ST_ExteriorRing(pg) as lg
+ FROM tin_polygons
+ ),
+ -- the three corner points of every triangle
+ tin_nodes as (
+ SELECT
+ id,
+ ST_PointN(lg,1) p1,
+ ST_PointN(lg,2) p2,
+ ST_PointN(lg,3) p3
+ FROM tin_lines
+ ),
+ -- one row per triangle edge, carrying the circle centre found by
+ -- _Find_Circle (ct), whether the bounding-circle centre lies within
+ -- tolerance of the triangle ring (ctx) and whether it lies within the
+ -- convex hull of the input (ctin)
+ tin_edges AS (
+ SELECT
+ p.id,
+ UNNEST(ARRAY[
+ ST_MakeLine(n.p1,n.p2) ,
+ ST_MakeLine(n.p2,n.p3) ,
+ ST_MakeLine(n.p3,n.p1)]) as Edge,
+ ST_Force2D(cdb_crankshaft._Find_Circle(n.p1,n.p2,n.p3)) as ct,
+ CASE WHEN st_distance(p.ct, ST_ExteriorRing(p.pg)) < tolerance THEN
+ TRUE
+ ELSE FALSE END AS ctx,
+ p.pg,
+ ST_within(p.ct, convexhull.cg) as ctin
+ FROM
+ tin_polygons p,
+ tin_nodes n,
+ convexhull
+ WHERE p.id = n.id
+ ),
+ -- end points of the Voronoi edges: each triangle edge is paired with the
+ -- equal edge of another triangle; an edge without a partner is extended
+ -- outwards through the edge midpoint, but only when ctin is true
+ voro_nodes as (
+ SELECT
+ CASE WHEN x.ctx = TRUE THEN
+ ST_Centroid(x.edge)
+ ELSE
+ x.ct
+ END as xct,
+ CASE WHEN y.id is null THEN
+ CASE WHEN x.ctin = TRUE THEN
+ ST_SetSRID(ST_MakePoint(
+ ST_X(x.ct) + ((ST_X(ST_Centroid(x.edge)) - ST_X(x.ct)) * (1+buffer)),
+ ST_Y(x.ct) + ((ST_Y(ST_Centroid(x.edge)) - ST_Y(x.ct)) * (1+buffer))
+ ), ST_SRID(x.ct))
+ END
+ ELSE
+ y.ct
+ END as yct
+ FROM
+ tin_edges x
+ LEFT OUTER JOIN
+ tin_edges y
+ ON x.id <> y.id AND ST_Equals(x.edge, y.edge)
+ ),
+ voro_edges as(
+ SELECT
+ ST_LineMerge(ST_Collect(ST_MakeLine(xct, yct))) as v
+ FROM
+ voro_nodes
+ ),
+ -- polygonize the edges together with the ring of their convex hull
+ voro_cells as(
+ SELECT
+ ST_Polygonize(
+ ST_Node(
+ ST_LineMerge(
+ ST_Union(v, ST_ExteriorRing(
+ ST_Convexhull(v)
+ )
+ )
+ )
+ )
+ ) as g
+ FROM
+ voro_edges
+ ),
+ voro_set as(
+ SELECT
+ (st_dump(v.g)).geom as g
+ FROM voro_cells v
+ ),
+ -- keep polygons only and cut them with the clipping circle
+ clipped_voro as(
+ SELECT
+ ST_intersection(c.g, v.g) as g
+ FROM
+ voro_set v,
+ clipper c
+ WHERE
+ ST_GeometryType(v.g) = 'ST_Polygon'
+ )
+ SELECT
+ st_collect(
+ ST_Transform(
+ ST_ConvexHull(g),
+ 4326
+ )
+ )
+ INTO geomout
+ FROM
+ clipped_voro;
+ RETURN geomout;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+
+/** ----------------------------------------------------------------------------------------
+ * @function : _Find_Circle
+ * @precis : Function that determines if three points form a circle. If so, a point geometry
+ * holding the centre and radius is returned. If not, NULL is returned.
+ * @version : 1.0.1
+ * @param : p_pt1 : First point in curve
+ * @param : p_pt2 : Second point in curve
+ * @param : p_pt3 : Third point in curve
+ * @return : geometry : In which X,Y ordinates are the centre X, Y and the Z being the radius of found circle
+ * or NULL if three points do not form a circle.
+ * @history : Simon Greener - Feb 2012 - Original coding.
+ * Rafa de la Torre - Aug 2016 - Small fix for type checking
+ * Raul Marin - Sept 2017 - Remove unnecessary NULL checks and set function categories
+ * @copyright : Simon Greener @ 2012
+ * Licensed under a Creative Commons Attribution-Share Alike 2.5 Australia License. (http://creativecommons.org/licenses/by-sa/2.5/au/)
+**/
+-- Circle through three points.
+-- Raises an exception unless all three arguments are points. Returns NULL
+-- when the points are collinear; otherwise a point whose X/Y are the
+-- circle centre and whose Z is the radius, in the SRID of p_pt1.
+CREATE OR REPLACE FUNCTION _Find_Circle(
+    IN p_pt1 geometry,
+    IN p_pt2 geometry,
+    IN p_pt3 geometry)
+  RETURNS geometry AS
+$BODY$
+DECLARE
+   -- coordinates of the three input points
+   x1 double precision;
+   y1 double precision;
+   x2 double precision;
+   y2 double precision;
+   x3 double precision;
+   y3 double precision;
+   -- coordinate differences relative to the first point
+   dx21 NUMERIC;
+   dy21 NUMERIC;
+   dx31 NUMERIC;
+   dy31 NUMERIC;
+   -- intermediate terms of the centre formula
+   term_e NUMERIC;
+   term_f NUMERIC;
+   denom  NUMERIC;
+   -- result
+   centre_x NUMERIC;
+   centre_y NUMERIC;
+   radius   NUMERIC;
+BEGIN
+   IF NOT ( ST_GeometryType(p_pt1) = 'ST_Point' AND
+            ST_GeometryType(p_pt2) = 'ST_Point' AND
+            ST_GeometryType(p_pt3) = 'ST_Point' ) THEN
+      RAISE EXCEPTION 'All supplied geometries must be points.';
+   END IF;
+
+   x1 := ST_X(p_pt1);
+   y1 := ST_Y(p_pt1);
+   x2 := ST_X(p_pt2);
+   y2 := ST_Y(p_pt2);
+   x3 := ST_X(p_pt3);
+   y3 := ST_Y(p_pt3);
+
+   dx21 := x2 - x1;
+   dy21 := y2 - y1;
+   dx31 := x3 - x1;
+   dy31 := y3 - y1;
+   term_e := dx21 * (x1 + x2) + dy21 * (y1 + y2);
+   term_f := dx31 * (x1 + x3) + dy31 * (y1 + y3);
+   denom  := 2.0 * (dx21 * (y3 - y2) - dy21 * (x3 - x2));
+
+   -- A zero denominator means the three points are collinear and no
+   -- finite-radius circle through them exists.
+   IF denom = 0 THEN
+      RETURN NULL;
+   END IF;
+
+   centre_x := (dy31 * term_e - dy21 * term_f) / denom;
+   centre_y := (dx21 * term_f - dx31 * term_e) / denom;
+   radius   := SQRT(POWER(x1 - centre_x, 2) + POWER(y1 - centre_y, 2));
+
+   RETURN ST_SetSRID(ST_MakePoint(centre_x, centre_y, radius), ST_Srid(p_pt1));
+END;
+$BODY$
+  LANGUAGE plpgsql IMMUTABLE STRICT PARALLEL SAFE;
+
+-- Moran's I Global Measure (public-facing)
+-- Forwards its arguments, in declaration order, to Moran.global_stat from
+-- crankshaft.clustering and returns the result as (moran, significance).
+CREATE OR REPLACE FUNCTION
+  CDB_AreasOfInterestGlobal(
+      subquery TEXT,
+      column_name TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (moran NUMERIC, significance NUMERIC)
+AS $$
+  from crankshaft.clustering import Moran
+  # TODO: use named parameters or a dictionary
+  call_args = (subquery, column_name, w_type,
+               num_ngbrs, permutations, geom_col, id_col)
+  return Moran().global_stat(*call_args)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local (internal function) - DEPRECATED
+-- Calls Moran.local_stat and reorders each result row to
+-- (moran, quads, significance, rowid, vals) by picking positions
+-- 6, 0, 1, 7 and 5.
+-- NOTE(review): in the column order declared by _CDB_MoransILocal position
+-- 5 is orig_val_std, while _CDB_AreasOfInterestLocalRate picks position 4
+-- for vals - confirm which one is intended.
+CREATE OR REPLACE FUNCTION
+  _CDB_AreasOfInterestLocal(
+      subquery TEXT,
+      column_name TEXT,
+      w_type TEXT,
+      num_ngbrs INT,
+      permutations INT,
+      geom_col TEXT,
+      id_col TEXT)
+RETURNS TABLE (
+    moran NUMERIC,
+    quads TEXT,
+    significance NUMERIC,
+    rowid INT,
+    vals NUMERIC)
+AS $$
+  from crankshaft.clustering import Moran
+  stats = Moran().local_stat(subquery, column_name, w_type,
+                             num_ngbrs, permutations, geom_col, id_col)
+  # remove spatial lag
+  picked = (6, 0, 1, 7, 5)
+  return [tuple(row[pos] for pos in picked) for row in stats]
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local (internal function)
+-- Returns the rows produced by Moran.local_stat unchanged, exposed as
+-- (quads, significance, spatial_lag, spatial_lag_std, orig_val,
+-- orig_val_std, moran_stat, rowid).
+CREATE OR REPLACE FUNCTION
+  _CDB_MoransILocal(
+      subquery TEXT,
+      column_name TEXT,
+      w_type TEXT,
+      num_ngbrs INT,
+      permutations INT,
+      geom_col TEXT,
+      id_col TEXT)
+RETURNS TABLE (
+    quads TEXT,
+    significance NUMERIC,
+    spatial_lag NUMERIC,
+    spatial_lag_std NUMERIC,
+    orig_val NUMERIC,
+    orig_val_std NUMERIC,
+    moran_stat NUMERIC,
+    rowid INT)
+AS $$
+from crankshaft.clustering import Moran
+
+local_args = (subquery, column_name, w_type,
+              num_ngbrs, permutations, geom_col, id_col)
+return Moran().local_stat(*local_args)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+
+-- Moran's I Local (public-facing function)
+-- Replaces CDB_AreasOfInterestLocal
+-- Supplies defaults for the optional arguments and returns every column of
+-- cdb_crankshaft._CDB_MoransILocal.
+CREATE OR REPLACE FUNCTION
+  CDB_MoransILocal(
+      subquery TEXT,
+      column_name TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (
+    quads TEXT,
+    significance NUMERIC,
+    spatial_lag NUMERIC,
+    spatial_lag_std NUMERIC,
+    orig_val NUMERIC,
+    orig_val_std NUMERIC,
+    moran_stat NUMERIC,
+    rowid INT)
+LANGUAGE SQL VOLATILE PARALLEL UNSAFE
+AS $$
+  SELECT
+    m.quads,
+    m.significance,
+    m.spatial_lag,
+    m.spatial_lag_std,
+    m.orig_val,
+    m.orig_val_std,
+    m.moran_stat,
+    m.rowid
+  FROM cdb_crankshaft._CDB_MoransILocal(
+         subquery, column_name, w_type,
+         num_ngbrs, permutations, geom_col, id_col) AS m;
+$$;
+
+-- Moran's I Local (public-facing function) - DEPRECATED
+-- Supplies defaults for the optional arguments and returns every row of
+-- cdb_crankshaft._CDB_AreasOfInterestLocal.
+CREATE OR REPLACE FUNCTION
+  CDB_AreasOfInterestLocal(
+      subquery TEXT,
+      column_name TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+LANGUAGE SQL VOLATILE PARALLEL UNSAFE
+AS $$
+  SELECT
+    aoi.moran,
+    aoi.quads,
+    aoi.significance,
+    aoi.rowid,
+    aoi.vals
+  FROM cdb_crankshaft._CDB_AreasOfInterestLocal(
+         subquery, column_name, w_type,
+         num_ngbrs, permutations, geom_col, id_col) AS aoi;
+$$;
+
+-- Moran's I only for HH and HL (public-facing function)
+-- Filters the rows of cdb_crankshaft._CDB_AreasOfInterestLocal down to
+-- those whose quads value is 'HH' or 'HL'.
+CREATE OR REPLACE FUNCTION
+  CDB_GetSpatialHotspots(
+      subquery TEXT,
+      column_name TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+LANGUAGE SQL VOLATILE PARALLEL UNSAFE
+AS $$
+  SELECT
+    aoi.moran,
+    aoi.quads,
+    aoi.significance,
+    aoi.rowid,
+    aoi.vals
+  FROM cdb_crankshaft._CDB_AreasOfInterestLocal(
+         subquery, column_name, w_type,
+         num_ngbrs, permutations, geom_col, id_col) AS aoi
+  WHERE aoi.quads IN ('HH', 'HL');
+$$;
+
+-- Moran's I only for LL and LH (public-facing function)
+-- Filters the rows of cdb_crankshaft._CDB_AreasOfInterestLocal down to
+-- those whose quads value is 'LL' or 'LH'.
+CREATE OR REPLACE FUNCTION
+  CDB_GetSpatialColdspots(
+      subquery TEXT,
+      attr TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+LANGUAGE SQL VOLATILE PARALLEL UNSAFE
+AS $$
+  SELECT
+    aoi.moran,
+    aoi.quads,
+    aoi.significance,
+    aoi.rowid,
+    aoi.vals
+  FROM cdb_crankshaft._CDB_AreasOfInterestLocal(
+         subquery, attr, w_type,
+         num_ngbrs, permutations, geom_col, id_col) AS aoi
+  WHERE aoi.quads IN ('LL', 'LH');
+$$;
+
+-- Moran's I only for LH and HL (public-facing function)
+-- Filters the rows of cdb_crankshaft._CDB_AreasOfInterestLocal down to
+-- those whose quads value is 'HL' or 'LH'.
+CREATE OR REPLACE FUNCTION
+  CDB_GetSpatialOutliers(
+      subquery TEXT,
+      attr TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+LANGUAGE SQL VOLATILE PARALLEL UNSAFE
+AS $$
+  SELECT
+    aoi.moran,
+    aoi.quads,
+    aoi.significance,
+    aoi.rowid,
+    aoi.vals
+  FROM cdb_crankshaft._CDB_AreasOfInterestLocal(
+         subquery, attr, w_type,
+         num_ngbrs, permutations, geom_col, id_col) AS aoi
+  WHERE aoi.quads IN ('HL', 'LH');
+$$;
+
+-- Moran's I Global Rate (public-facing function)
+-- Forwards its arguments, in declaration order, to Moran.global_rate_stat
+-- from crankshaft.clustering and returns the result as
+-- (moran, significance).
+CREATE OR REPLACE FUNCTION
+  CDB_AreasOfInterestGlobalRate(
+      subquery TEXT,
+      numerator TEXT,
+      denominator TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (moran FLOAT, significance FLOAT)
+AS $$
+  from crankshaft.clustering import Moran
+  # TODO: use named parameters or a dictionary
+  call_args = (subquery, numerator, denominator, w_type,
+               num_ngbrs, permutations, geom_col, id_col)
+  return Moran().global_rate_stat(*call_args)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+
+-- Moran's I Local Rate (internal function) - DEPRECATED
+-- Calls Moran.local_rate_stat and reorders each result row to
+-- (moran, quads, significance, rowid, vals) by picking positions
+-- 6, 0, 1, 7 and 4.
+CREATE OR REPLACE FUNCTION
+  _CDB_AreasOfInterestLocalRate(
+      subquery TEXT,
+      numerator TEXT,
+      denominator TEXT,
+      w_type TEXT,
+      num_ngbrs INT,
+      permutations INT,
+      geom_col TEXT,
+      id_col TEXT)
+RETURNS
+TABLE(
+    moran NUMERIC,
+    quads TEXT,
+    significance NUMERIC,
+    rowid INT,
+    vals NUMERIC)
+AS $$
+  from crankshaft.clustering import Moran
+  # TODO: use named parameters or a dictionary
+  stats = Moran().local_rate_stat(subquery, numerator, denominator, w_type,
+                                  num_ngbrs, permutations, geom_col, id_col)
+  # remove spatial lag
+  picked = (6, 0, 1, 7, 4)
+  return [tuple(row[pos] for pos in picked) for row in stats]
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local Rate (public-facing function) - DEPRECATED
+-- Supplies defaults for the optional arguments and returns every row of
+-- cdb_crankshaft._CDB_AreasOfInterestLocalRate.
+CREATE OR REPLACE FUNCTION
+  CDB_AreasOfInterestLocalRate(
+      subquery TEXT,
+      numerator TEXT,
+      denominator TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+LANGUAGE SQL VOLATILE PARALLEL UNSAFE
+AS $$
+  SELECT
+    aoi.moran,
+    aoi.quads,
+    aoi.significance,
+    aoi.rowid,
+    aoi.vals
+  FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(
+         subquery, numerator, denominator, w_type,
+         num_ngbrs, permutations, geom_col, id_col) AS aoi;
+$$;
+
+-- Internal function
+-- Returns the rows produced by Moran.local_rate_stat unchanged, exposed as
+-- (quads, significance, spatial_lag, spatial_lag_std, orig_val,
+-- orig_val_std, moran_stat, rowid).
+CREATE OR REPLACE FUNCTION
+  _CDB_MoransILocalRate(
+      subquery TEXT,
+      numerator TEXT,
+      denominator TEXT,
+      w_type TEXT,
+      num_ngbrs INT,
+      permutations INT,
+      geom_col TEXT,
+      id_col TEXT)
+RETURNS
+TABLE(
+    quads TEXT,
+    significance NUMERIC,
+    spatial_lag NUMERIC,
+    spatial_lag_std NUMERIC,
+    orig_val NUMERIC,
+    orig_val_std NUMERIC,
+    moran_stat NUMERIC,
+    rowid INT)
+AS $$
+from crankshaft.clustering import Moran
+
+rate_args = (subquery, numerator, denominator, w_type,
+             num_ngbrs, permutations, geom_col, id_col)
+return Moran().local_rate_stat(*rate_args)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Rate
+-- Replaces CDB_AreasOfInterestLocalRate
+-- Supplies defaults for the optional arguments and returns every column of
+-- cdb_crankshaft._CDB_MoransILocalRate.
+CREATE OR REPLACE FUNCTION
+  CDB_MoransILocalRate(
+      subquery TEXT,
+      numerator TEXT,
+      denominator TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(
+    quads TEXT,
+    significance NUMERIC,
+    spatial_lag NUMERIC,
+    spatial_lag_std NUMERIC,
+    orig_val NUMERIC,
+    orig_val_std NUMERIC,
+    moran_stat NUMERIC,
+    rowid INT)
+LANGUAGE SQL VOLATILE PARALLEL UNSAFE
+AS $$
+  SELECT
+    m.quads,
+    m.significance,
+    m.spatial_lag,
+    m.spatial_lag_std,
+    m.orig_val,
+    m.orig_val_std,
+    m.moran_stat,
+    m.rowid
+  FROM cdb_crankshaft._CDB_MoransILocalRate(
+         subquery, numerator, denominator, w_type,
+         num_ngbrs, permutations, geom_col, id_col) AS m;
+$$;
+
+-- Moran's I Local Rate only for HH and HL (public-facing function)
+-- Filters the rows of cdb_crankshaft._CDB_AreasOfInterestLocalRate down to
+-- those whose quads value is 'HH' or 'HL'.
+CREATE OR REPLACE FUNCTION
+  CDB_GetSpatialHotspotsRate(
+      subquery TEXT,
+      numerator TEXT,
+      denominator TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+LANGUAGE SQL VOLATILE PARALLEL UNSAFE
+AS $$
+  SELECT
+    aoi.moran,
+    aoi.quads,
+    aoi.significance,
+    aoi.rowid,
+    aoi.vals
+  FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(
+         subquery, numerator, denominator, w_type,
+         num_ngbrs, permutations, geom_col, id_col) AS aoi
+  WHERE aoi.quads IN ('HH', 'HL');
+$$;
+
+-- Moran's I Local Rate only for LL and LH (public-facing function)
+-- Filters the rows of cdb_crankshaft._CDB_AreasOfInterestLocalRate down to
+-- those whose quads value is 'LL' or 'LH'.
+CREATE OR REPLACE FUNCTION
+  CDB_GetSpatialColdspotsRate(
+      subquery TEXT,
+      numerator TEXT,
+      denominator TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+LANGUAGE SQL VOLATILE PARALLEL UNSAFE
+AS $$
+  SELECT
+    aoi.moran,
+    aoi.quads,
+    aoi.significance,
+    aoi.rowid,
+    aoi.vals
+  FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(
+         subquery, numerator, denominator, w_type,
+         num_ngbrs, permutations, geom_col, id_col) AS aoi
+  WHERE aoi.quads IN ('LL', 'LH');
+$$;
+
+-- Moran's I Local Rate only for LH and HL (public-facing function)
+-- Filters the rows of cdb_crankshaft._CDB_AreasOfInterestLocalRate down to
+-- those whose quads value is 'HL' or 'LH'.
+CREATE OR REPLACE FUNCTION
+  CDB_GetSpatialOutliersRate(
+      subquery TEXT,
+      numerator TEXT,
+      denominator TEXT,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+LANGUAGE SQL VOLATILE PARALLEL UNSAFE
+AS $$
+  SELECT
+    aoi.moran,
+    aoi.quads,
+    aoi.significance,
+    aoi.rowid,
+    aoi.vals
+  FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(
+         subquery, numerator, denominator, w_type,
+         num_ngbrs, permutations, geom_col, id_col) AS aoi
+  WHERE aoi.quads IN ('HL', 'LH');
+$$;
+-- Spatial k-means clustering
+-- Hands query, no_clusters and no_init to Kmeans.spatial from
+-- crankshaft.clustering and returns its rows as (cartodb_id, cluster_no).
+CREATE OR REPLACE FUNCTION CDB_KMeans(
+    query TEXT,
+    no_clusters INTEGER,
+    no_init INTEGER DEFAULT 20
+)
+RETURNS TABLE(
+    cartodb_id INTEGER,
+    cluster_no INTEGER
+)
+LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE
+AS $$
+from crankshaft.clustering import Kmeans
+
+return Kmeans().spatial(query, no_clusters, no_init)
+$$;
+
+-- Non-spatial k-means clustering
+--   query:       sql query to retrieve all the needed data
+--   colnames:    text array of column names for doing the clustering analysis
+--   no_clusters: number of requested clusters
+--   standardize: whether to scale variables to a mean of zero and a
+--                standard deviation of 1
+--   id_col:      name of the id column
+-- Returns the rows of Kmeans.nonspatial as (cluster_label, cluster_center,
+-- silhouettes, inertia, rowid).
+CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial(
+    query TEXT,
+    colnames TEXT[],
+    no_clusters INTEGER,
+    standardize BOOLEAN DEFAULT true,
+    id_col TEXT DEFAULT 'cartodb_id'
+)
+RETURNS TABLE(
+    cluster_label text,
+    cluster_center json,
+    silhouettes numeric,
+    inertia numeric,
+    rowid bigint
+)
+LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE
+AS $$
+from crankshaft.clustering import Kmeans
+
+options = {'standardize': standardize, 'id_col': id_col}
+return Kmeans().nonspatial(query, colnames, no_clusters, **options)
+$$;
+
+
+-- State transition function of the CDB_WeightedMean aggregate.
+-- The state holds three running sums: weighted X, weighted Y and total
+-- weight. A row whose weight or geometry is NULL leaves the sums as they
+-- are; otherwise the row's weighted coordinates and weight are added.
+CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(
+    state NUMERIC[],
+    the_geom GEOMETRY(Point, 4326),
+    weight NUMERIC
+)
+RETURNS Numeric[]
+LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE
+AS $$
+DECLARE
+    sum_x NUMERIC := state[1];
+    sum_y NUMERIC := state[2];
+    sum_w NUMERIC := state[3];
+BEGIN
+    IF weight IS NOT NULL AND the_geom IS NOT NULL THEN
+        sum_x := state[1] + ST_X(the_geom)*weight;
+        sum_y := state[2] + ST_Y(the_geom)*weight;
+        sum_w := state[3] + weight;
+    END IF;
+    RETURN ARRAY[sum_x, sum_y, sum_w];
+END
+$$;
+
+
+-- Final function of the CDB_WeightedMean aggregate.
+-- Turns the state (weighted X sum, weighted Y sum, total weight) into a
+-- point with SRID 4326. When the total weight is 0 the sums are used as
+-- coordinates directly, which avoids dividing by zero.
+CREATE OR REPLACE FUNCTION CDB_WeightedMeanF(state NUMERIC[])
+RETURNS GEOMETRY
+LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE
+AS $$
+BEGIN
+    IF state[3] = 0 THEN
+        RETURN ST_SetSRID(ST_MakePoint(state[1], state[2]), 4326);
+    END IF;
+    RETURN ST_SetSRID(ST_MakePoint(state[1]/state[3], state[2]/state[3]), 4326);
+END
+$$;
+
+-- Create the CDB_WeightedMean aggregate unless the cdb_crankshaft schema
+-- already holds an aggregate named cdb_weightedmean (CREATE AGGREGATE has
+-- no "IF NOT EXISTS" form, hence the catalog lookup).
+DO $$
+DECLARE
+  already_there BOOLEAN;
+BEGIN
+  SELECT EXISTS (
+    SELECT 1
+    FROM pg_catalog.pg_proc AS p
+    INNER JOIN pg_catalog.pg_namespace AS n ON n.oid = p.pronamespace
+    WHERE n.nspname = 'cdb_crankshaft'
+      AND p.proname = 'cdb_weightedmean'
+      AND p.proisagg
+  ) INTO already_there;
+
+  IF already_there THEN
+    RETURN;
+  END IF;
+
+  CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
+    SFUNC = CDB_WeightedMeanS,
+    FINALFUNC = CDB_WeightedMeanF,
+    STYPE = Numeric[],
+    PARALLEL = SAFE,
+    INITCOND = "{0.0,0.0,0.0}"
+  );
+END
+$$ LANGUAGE plpgsql;
+-- Spatial Markov
+
+-- input table format:
+-- id | geom | date_1 | date_2 | date_3
+-- 1 | Pt1 | 12.3 | 13.1 | 14.2
+-- 2 | Pt2 | 11.0 | 13.2 | 12.5
+-- ...
+-- Sample function call:
+-- SELECT * FROM CDB_SpatialMarkovTrend('SELECT * FROM real_estate',
+-- Array['date_1', 'date_2', 'date_3'])
+
+-- Forwards its arguments, in declaration order, to Markov.spatial_trend
+-- from crankshaft.space_time_dynamics and returns its rows as
+-- (trend, trend_up, trend_down, volatility, rowid).
+CREATE OR REPLACE FUNCTION
+  CDB_SpatialMarkovTrend (
+      subquery TEXT,
+      time_cols TEXT[],
+      num_classes INT DEFAULT 7,
+      w_type TEXT DEFAULT 'knn',
+      num_ngbrs INT DEFAULT 5,
+      permutations INT DEFAULT 99,
+      geom_col TEXT DEFAULT 'the_geom',
+      id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (trend NUMERIC, trend_up NUMERIC, trend_down NUMERIC, volatility NUMERIC, rowid INT)
+AS $$
+
+  from crankshaft.space_time_dynamics import Markov
+
+  ## TODO: use named parameters or a dictionary
+  trend_args = (subquery, time_cols, num_classes, w_type,
+                num_ngbrs, permutations, geom_col, id_col)
+  return Markov().spatial_trend(*trend_args)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- input table format: identical to above but in a predictable format
+-- Sample function call:
+-- SELECT cdb_spatial_markov('SELECT * FROM real_estate',
+-- 'date_1')
+
+
+-- CREATE OR REPLACE FUNCTION
+-- cdb_spatial_markov (
+-- subquery TEXT,
+-- time_col_min text,
+-- time_col_max text,
+-- date_format text, -- '_YYYY_MM_DD'
+-- num_time_per_bin INT DEFAULT 1,
+-- permutations INT DEFAULT 99,
+-- geom_column TEXT DEFAULT 'the_geom',
+-- id_col TEXT DEFAULT 'cartodb_id',
+-- w_type TEXT DEFAULT 'knn',
+-- num_ngbrs int DEFAULT 5)
+-- RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT)
+-- AS $$
+-- plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
+-- from crankshaft.clustering import moran_local
+-- # TODO: use named parameters or a dictionary
+-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs)
+-- $$ LANGUAGE plpythonu;
+--
+-- -- input table format:
+-- -- id | geom | date | measurement
+-- -- 1 | Pt1 | 12/3 | 13.2
+-- -- 2 | Pt2 | 11/5 | 11.3
+-- -- 3 | Pt1 | 11/13 | 12.9
+-- -- 4 | Pt3 | 12/19 | 10.1
+-- -- ...
+--
+-- CREATE OR REPLACE FUNCTION
+-- cdb_spatial_markov (
+-- subquery TEXT,
+-- time_col text,
+-- num_time_per_bin INT DEFAULT 1,
+-- permutations INT DEFAULT 99,
+-- geom_column TEXT DEFAULT 'the_geom',
+-- id_col TEXT DEFAULT 'cartodb_id',
+-- w_type TEXT DEFAULT 'knn',
+-- num_ngbrs int DEFAULT 5)
+-- RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT)
+-- AS $$
+-- plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
+-- from crankshaft.clustering import moran_local
+-- # TODO: use named parameters or a dictionary
+-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs)
+-- $$ LANGUAGE plpythonu;
+-- Based on:
+-- https://github.com/mapbox/polylabel/blob/master/index.js
+-- https://sites.google.com/site/polesofinaccessibility/
+-- Requires: https://github.com/CartoDB/cartodb-postgresql
+
+-- Based on:
+-- https://github.com/mapbox/polylabel/blob/master/index.js
+-- https://sites.google.com/site/polesofinaccessibility/
+-- Requires: https://github.com/CartoDB/cartodb-postgresql
+
+-- CDB_PIA: iterative grid search for the point of a polygon that lies
+-- farthest from its boundary (the "pole of inaccessibility" of the
+-- references listed before this function).
+--
+-- @param polygon geometry to analyse; it is reprojected to SRID 3857 before
+-- any measurement is taken.
+-- @param tolerance a cell is subdivided only while the best distance it could
+-- still contain exceeds the current best by more than this
+-- amount (same units as the SRID 3857 coordinates).
+--
+-- Returns: the centroid of the best candidate found, reprojected to SRID 4326.
+CREATE OR REPLACE FUNCTION CDB_PIA(
+ IN polygon geometry,
+ IN tolerance numeric DEFAULT 1.0
+ )
+RETURNS geometry AS $$
+DECLARE
+ env geometry[]; -- declared but never referenced in the body
+ cells geometry[]; -- work list of grid cells; grows while the loop runs
+ cell geometry;
+ best_c geometry; -- geometry whose centroid is the best point so far
+ best_d numeric; -- signed distance of that centroid to the boundary
+ test_d numeric;
+ test_mx numeric;
+ test_h numeric;
+ test_cells geometry[];
+ width numeric;
+ height numeric;
+ h numeric;
+ i integer;
+ n integer;
+ sqr numeric;
+ p geometry; -- declared but never referenced in the body
+BEGIN
+ -- |/ is the square-root operator: sqr = sqrt(2)/2, so (cell side * sqr) is
+ -- the distance from the centre of a square cell to one of its corners
+ sqr := 0.5*(|/2.0);
+ polygon := ST_Transform(polygon, 3857);
+
+ -- grid #0 cell size
+ height := ST_YMax(polygon) - ST_YMin(polygon);
+ width := ST_XMax(polygon) - ST_XMin(polygon);
+ h := 0.5*LEAST(height, width);
+
+ -- grid #0
+ with c1 as(
+ SELECT cdb_crankshaft.CDB_RectangleGrid(polygon, h, h) as c
+ )
+ SELECT array_agg(c) INTO cells FROM c1;
+
+ -- 1st guess: centroid
+ -- (best_c holds the polygon itself; the final ST_Centroid(best_c) turns it
+ -- into the centroid if no cell ever beats it)
+ best_c := polygon;
+ best_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(Polygon));
+
+ -- looping the loop
+ -- the array is used as a queue: i is the read position, n the current
+ -- length, and subdivided cells are appended at the end
+ n := array_length(cells,1);
+ i := 1;
+ LOOP
+
+ EXIT WHEN i > n;
+
+ cell := cells[i];
+
+ -- advance before any CONTINUE so that a skipped cell is not revisited
+ i := i+1;
+
+ -- cell side size, it's square
+ test_h := ST_XMax(cell) - ST_XMin(cell) ;
+
+ -- check distance
+ test_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(cell));
+
+ IF test_d > best_d THEN
+ best_d := test_d;
+ best_c := cell;
+ END IF;
+
+ -- longest distance within the cell
+ -- (upper bound: distance at the centre plus the centre-to-corner length)
+ test_mx := test_d + (test_h * sqr);
+
+ -- if the cell has no chance to contains the desired point, continue
+ CONTINUE WHEN test_mx - best_d <= tolerance;
+
+ -- resample the cell
+ with c1 as(
+ SELECT cdb_crankshaft.CDB_RectangleGrid(cell, test_h/2, test_h/2) as c
+ )
+ SELECT array_agg(c) INTO test_cells FROM c1;
+
+ -- concat the new cells to the former array
+ cells := cells || test_cells;
+
+ -- prepare next iteration
+ n := array_length(cells,1);
+
+ END LOOP;
+
+ RETURN ST_transform(ST_Centroid(best_c), 4326);
+
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+
+
+
+-- Signed distance from a point to a polygon with holes:
+-- negative if the point is outside the polygon.
+-- rev 1. adding MULTIPOLYGON and GEOMETRYCOLLECTION support by @abelvm
+CREATE OR REPLACE FUNCTION _Signed_Dist(
+ IN polygon geometry,
+ IN point geometry
+ )
+RETURNS numeric AS $$
+DECLARE
+    parts geometry[];            -- simple geometries dumped from the input
+    part geometry;
+    part_sign integer;           -- +1 when the point is within the part, else -1
+    part_dist numeric;           -- distance from the point to the rings of the part
+    ring_count integer;
+    best_sign integer;           -- sign that goes with best_dist
+    best_dist numeric := 1e999;  -- sentinel, larger than any real distance
+BEGIN
+    -- Split multi-geometries and collections into their components.
+    SELECT array_agg(dumped.geom) INTO parts
+    FROM (SELECT (ST_Dump(polygon)).geom AS geom) AS dumped;
+
+    FOR part_idx IN 1..array_length(parts, 1)
+    LOOP
+        part := parts[part_idx];
+
+        -- Distance to the outer ring, capped by the best distance so far.
+        part_dist := LEAST(best_dist, ST_Distance(point, ST_ExteriorRing(part))::numeric);
+        part_sign := CASE WHEN ST_Within(point, part) THEN 1 ELSE -1 END;
+
+        -- Holes are boundaries too: keep the closest ring of all.
+        ring_count := ST_NumInteriorRings(part);
+        IF ring_count > 0 THEN
+            FOR ring_idx IN 1..ring_count
+            LOOP
+                part_dist := LEAST(part_dist, ST_Distance(point, ST_InteriorRingN(part, ring_idx))::numeric);
+            END LOOP;
+        END IF;
+
+        -- The sign is taken from whichever part is strictly closest.
+        IF part_dist < best_dist THEN
+            best_dist := part_dist;
+            best_sign := part_sign;
+        END IF;
+    END LOOP;
+
+    RETURN best_dist * best_sign::numeric;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+--
+-- Iterative densification of a set of points using Delaunay triangulation
+-- the new points have as assigned value the average value of the 3 vertex (centroid)
+--
+-- @param geomin - array of geometries (points)
+--
+-- @param colin - array of numeric values in that points
+--
+-- @param iterations - integer, number of iterations
+--
+--
+-- Returns: TABLE(geomout geometry, colout numeric)
+--
+--
+CREATE OR REPLACE FUNCTION CDB_Densify(
+ IN geomin geometry[],
+ IN colin numeric[],
+ IN iterations integer
+ )
+RETURNS TABLE(geomout geometry, colout numeric) AS $$
+DECLARE
+    geotemp geometry[];  -- working set of points; grows on every pass
+    coltemp numeric[];   -- value attached to the point at the same index in geotemp
+    gs geometry[];       -- triangles of the current triangulation
+    g geometry;
+    vertex geometry[];
+    va numeric;
+    vb numeric;
+    vc numeric;
+BEGIN
+    geotemp := geomin;
+    coltemp := colin;
+    FOR i IN 1..iterations
+    LOOP
+        -- generate TIN
+        WITH a as (SELECT unnest(geotemp) AS e),
+        b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
+        c as (SELECT (ST_Dump(t)).geom AS v FROM b)
+        SELECT array_agg(v) INTO gs FROM c;
+
+        -- No triangle could be built from the current points: nothing to
+        -- densify, and FOREACH would raise on a NULL array.
+        EXIT WHEN gs IS NULL;
+
+        -- loop cells
+        FOREACH g IN ARRAY gs
+        LOOP
+            -- corners of the triangle (a closed ring: the first three points are used)
+            WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
+            SELECT array_agg(v) INTO vertex FROM a;
+
+            -- Look the corner values up BEFORE the centroid is appended, so
+            -- that both arrays have the same length and multi-argument
+            -- unnest pairs point k with value k. Zipping unequal arrays via
+            -- two unnest() calls in a SELECT list expands to LCM(n, n+1)
+            -- rows before PostgreSQL 10.
+            SELECT src.c INTO va
+            FROM unnest(geotemp, coltemp) AS src(geo, c)
+            WHERE ST_Equals(src.geo, vertex[1])
+            LIMIT 1;
+
+            SELECT src.c INTO vb
+            FROM unnest(geotemp, coltemp) AS src(geo, c)
+            WHERE ST_Equals(src.geo, vertex[2])
+            LIMIT 1;
+
+            SELECT src.c INTO vc
+            FROM unnest(geotemp, coltemp) AS src(geo, c)
+            WHERE ST_Equals(src.geo, vertex[3])
+            LIMIT 1;
+
+            -- append the centroid together with the mean of the three corner values
+            geotemp := array_append(geotemp, ST_Centroid(g));
+            coltemp := array_append(coltemp, (va + vb + vc) / 3);
+        END LOOP;
+    END LOOP;
+    RETURN QUERY SELECT unnest(geotemp ) as geomout, unnest(coltemp ) as colout;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+-- Builds a triangulated map: the input points are densified with
+-- CDB_Densify, triangulated once more, and every triangle is returned with
+-- the mean of the values found at its three corners.
+CREATE OR REPLACE FUNCTION CDB_TINmap(
+ IN geomin geometry[],
+ IN colin numeric[],
+ IN iterations integer
+ )
+RETURNS TABLE(geomout geometry, colout numeric) AS $$
+DECLARE
+    dense_pts geometry[];   -- points after densification
+    dense_vals numeric[];   -- value of the point at the same index
+    triangles geometry[];
+    tri geometry;
+    corners geometry[];
+    val_1 numeric;
+    val_2 numeric;
+    val_3 numeric;
+    tri_vals numeric[];     -- one averaged value per triangle
+BEGIN
+    SELECT array_agg(d.geomout), array_agg(d.colout)
+    INTO dense_pts, dense_vals
+    FROM cdb_crankshaft.CDB_Densify(geomin, colin, iterations) AS d;
+
+    -- triangulate the densified point set
+    WITH pts AS (SELECT unnest(dense_pts) AS e),
+    mesh AS (SELECT ST_DelaunayTriangles(ST_Collect(pts.e),0.001, 0) AS t FROM pts),
+    faces AS (SELECT (ST_Dump(t)).geom AS v FROM mesh)
+    SELECT array_agg(v) INTO triangles FROM faces;
+
+    FOREACH tri IN ARRAY triangles
+    LOOP
+        -- corners of the triangle
+        WITH ring AS (SELECT (ST_DumpPoints(tri)).geom AS v)
+        SELECT array_agg(v) INTO corners FROM ring;
+
+        -- value stored for each of the first three corners
+        WITH pv AS (SELECT unnest(dense_pts) AS geo, unnest(dense_vals) AS c)
+        SELECT c INTO val_1 FROM pv WHERE ST_Equals(geo, corners[1]);
+        WITH pv AS (SELECT unnest(dense_pts) AS geo, unnest(dense_vals) AS c)
+        SELECT c INTO val_2 FROM pv WHERE ST_Equals(geo, corners[2]);
+        WITH pv AS (SELECT unnest(dense_pts) AS geo, unnest(dense_vals) AS c)
+        SELECT c INTO val_3 FROM pv WHERE ST_Equals(geo, corners[3]);
+
+        -- mean of the corner values becomes the value of the triangle
+        tri_vals := array_append(tri_vals, (val_1 + val_2 + val_3) / 3);
+    END LOOP;
+
+    RETURN QUERY SELECT unnest(triangles), unnest(tri_vals);
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+-- Getis-Ord's G
+-- Hotspot/Coldspot Analysis tool
+CREATE OR REPLACE FUNCTION
+ CDB_GetisOrdsG(
+ subquery TEXT,
+ column_name TEXT,
+ w_type TEXT DEFAULT 'knn',
+ num_ngbrs INT DEFAULT 5,
+ permutations INT DEFAULT 999,
+ geom_col TEXT DEFAULT 'the_geom',
+ id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (z_score NUMERIC, p_value NUMERIC, p_z_sim NUMERIC, rowid BIGINT)
+AS $$
+ # SQL entry point only: the statistic is computed by the Getis class of the
+ # crankshaft Python package. Arguments are forwarded positionally, in
+ # declaration order.
+ from crankshaft.clustering import Getis
+
+ return Getis().getis_ord(
+     subquery, column_name, w_type, num_ngbrs,
+     permutations, geom_col, id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- TODO: make a version that accepts the values as arrays
+
+-- Find outliers using a static threshold
+--
+CREATE OR REPLACE FUNCTION CDB_StaticOutlier(column_value numeric, threshold numeric)
+RETURNS boolean
+AS $$
+DECLARE
+    -- strictly-greater comparison; NULL when either argument is NULL
+    exceeds boolean := threshold < column_value;
+BEGIN
+    RETURN exceeds;
+END;
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE ;
+
+-- Find outliers by a percentage above the threshold
+-- TODO: add symmetric option? `is_symmetric boolean DEFAULT false`
+
+-- Flags every value whose ratio to the mean of all values is greater than
+-- outlier_fraction. Rows pair the flag with the id at the same array
+-- position. Raises when the mean is exactly zero, since the ratio is
+-- undefined in that case.
+CREATE OR REPLACE FUNCTION CDB_PercentOutlier(column_values numeric[], outlier_fraction numeric, ids int[])
+RETURNS TABLE(is_outlier boolean, rowid int)
+AS $$
+DECLARE
+    mean_value numeric;
+    flags boolean[];
+BEGIN
+    mean_value := (SELECT avg(v) FROM unnest(column_values) AS u(v));
+
+    IF mean_value = 0 THEN
+        RAISE EXCEPTION 'Mean value is zero. Try another outlier method.';
+    END IF;
+
+    -- one flag per input value, in input order
+    SELECT array_agg(v / mean_value > outlier_fraction) INTO flags
+    FROM unnest(column_values) AS u(v);
+
+    RETURN QUERY
+    SELECT unnest(flags),
+           unnest(ids);
+END;
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
+
+-- Find outliers above a given number of standard deviations from the mean
+
+-- Flags every value lying more than num_deviations standard deviations from
+-- the mean. With is_symmetric the absolute deviation is tested (both tails);
+-- otherwise only values above the mean can be flagged. Rows pair the flag
+-- with the id at the same array position. Raises when the standard deviation
+-- is exactly zero.
+CREATE OR REPLACE FUNCTION CDB_StdDevOutlier(column_values numeric[], num_deviations numeric, ids int[], is_symmetric boolean DEFAULT true)
+RETURNS TABLE(is_outlier boolean, rowid int)
+AS $$
+DECLARE
+    sd_value numeric;
+    mean_value numeric;
+    flags boolean[];
+BEGIN
+    SELECT stddev(v), avg(v) INTO sd_value, mean_value
+    FROM unnest(column_values) AS u(v);
+
+    IF sd_value = 0 THEN
+        RAISE EXCEPTION 'Standard deviation of input data is zero';
+    END IF;
+
+    -- a NULL is_symmetric falls through to the one-sided test
+    SELECT array_agg(
+        (CASE
+            WHEN is_symmetric THEN abs(v - mean_value)
+            ELSE (v - mean_value)
+         END) / sd_value > num_deviations) INTO flags
+    FROM unnest(column_values) AS u(v);
+
+    RETURN QUERY
+    SELECT unnest(flags),
+           unnest(ids);
+END;
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
+-- CDB_Contour: interpolates point values over a rectangular grid covering the
+-- (buffered) convex hull of the points, classifies the interpolated cells
+-- into bins and returns one merged geometry per bin.
+--
+-- @param geomin points carrying the values
+-- @param colin value of each point (paired by position with geomin)
+-- @param buffer factor applied to sqrt(area/PI) of the convex hull to
+-- expand the hull's envelope
+-- @param intmethod interpolation method; 1 uses _interp_in_tin with a
+-- precomputed triangulation, any other value is passed on
+-- to CDB_SpatialInterpolation
+-- @param classmethod 0 equal interval, 1 heads/tails, 2 Jenks, anything else
+-- quantiles
+-- @param steps number of bins requested from the classifier
+-- @param max_time see the sign handling at the top of the body
+--
+-- Returns: TABLE(the_geom, bin, min_value, max_value, avg_value)
+CREATE OR REPLACE FUNCTION CDB_Contour(
+ IN geomin geometry[],
+ IN colin numeric[],
+ IN buffer numeric,
+ IN intmethod integer,
+ IN classmethod integer,
+ IN steps integer,
+ IN max_time integer DEFAULT 60000
+ )
+RETURNS TABLE(
+ the_geom geometry,
+ bin integer,
+ min_value numeric,
+ max_value numeric,
+ avg_value numeric
+) AS $$
+DECLARE
+ cell_count integer;
+ tin geometry[];
+ resolution integer;
+BEGIN
+
+ -- nasty trick to override issue #121
+ -- After these statements `resolution` holds the caller's max_time (or -90
+ -- when 0 was passed) and `max_time` holds its negation. Further down a
+ -- positive `resolution` is used directly as the grid cell size, while a
+ -- non-positive one makes the cell size derive from cell_count.
+ IF max_time = 0 THEN
+ max_time = -90;
+ END IF;
+ resolution := max_time;
+ max_time := -1 * resolution;
+
+ -- calc the optimal number of cells for the current dataset
+ -- (linear model per interpolation method; the coefficients are given
+ -- without derivation in this file)
+ SELECT
+ CASE intmethod
+ WHEN 0 THEN round(3.7745903782 * max_time - 9.4399210051 * array_length(geomin,1) - 1350.8778213073)
+ WHEN 1 THEN round(2.2855592156 * max_time - 87.285217133 * array_length(geomin,1) + 17255.7085601797)
+ WHEN 2 THEN round(0.9799471999 * max_time - 127.0334085369 * array_length(geomin,1) + 22707.9579721218)
+ ELSE 10000
+ END INTO cell_count;
+
+ -- we don't have iterative barycentric interpolation in CDB_interpolation,
+ -- and it's a costy function, so let's make a custom one here till
+ -- we update the code
+ -- tin := ARRAY[]::geometry[];
+ IF intmethod=1 THEN
+ WITH
+ a as (SELECT unnest(geomin) AS e),
+ b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
+ c as (SELECT (ST_Dump(t)).geom as v FROM b)
+ SELECT array_agg(v) INTO tin FROM c;
+ END IF;
+ -- Delaunay stuff performed just ONCE!!
+
+ -- magic
+ RETURN QUERY
+ WITH
+ -- convex hull of the input points plus the expansion radius r
+ convexhull as (
+ SELECT
+ ST_ConvexHull(ST_Collect(geomin)) as g,
+ buffer * |/ st_area(ST_ConvexHull(ST_Collect(geomin)))/PI() as r
+ ),
+ -- bounding box of the hull grown by r on every side
+ envelope as (
+ SELECT
+ st_expand(a.g, a.r) as e
+ FROM convexhull a
+ ),
+ envelope3857 as(
+ SELECT
+ ST_Transform(e, 3857) as geom
+ FROM envelope
+ ),
+ -- grid cell size; NOTE(review): the bare `resolution` inside this CTE
+ -- presumably resolves to the PL/pgSQL variable of the same name, not to
+ -- the CTE itself - confirm
+ resolution as(
+ SELECT
+ CASE WHEN resolution <= 0 THEN
+ round(|/ (
+ ST_area(geom) / abs(cell_count)
+ ))
+ ELSE
+ resolution
+ END AS cell
+ FROM envelope3857
+ ),
+ -- square cells built in SRID 3857 and handed on in SRID 4326
+ grid as(
+ SELECT
+ ST_Transform(cdb_crankshaft.CDB_RectangleGrid(e.geom, r.cell, r.cell), 4326) as geom
+ FROM envelope3857 e, resolution r
+ ),
+ -- interpolated value at the centroid of every cell
+ interp as(
+ SELECT
+ geom,
+ CASE
+ WHEN intmethod=1 THEN cdb_crankshaft._interp_in_tin(geomin, colin, tin, ST_Centroid(geom))
+ ELSE cdb_crankshaft.CDB_SpatialInterpolation(geomin, colin, ST_Centroid(geom), intmethod)
+ END as val
+ FROM grid
+ ),
+ -- bin boundaries computed from the non-null interpolated values
+ classes as(
+ SELECT CASE
+ WHEN classmethod = 0 THEN
+ cdb_crankshaft.CDB_EqualIntervalBins(array_agg(val), steps)
+ WHEN classmethod = 1 THEN
+ cdb_crankshaft.CDB_HeadsTailsBins(array_agg(val), steps)
+ WHEN classmethod = 2 THEN
+ cdb_crankshaft.CDB_JenksBins(array_agg(val), steps)
+ ELSE
+ cdb_crankshaft.CDB_QuantileBins(array_agg(val), steps)
+ END as b
+ FROM interp
+ where val is not null
+ ),
+ -- `classes` has a single row, so ON 1=1 attaches it to every cell
+ classified as(
+ SELECT
+ i.*,
+ width_bucket(i.val, c.b) as bucket
+ FROM interp i left join classes c
+ ON 1=1
+ ),
+ -- fold the bucket numbered `steps` into the one below it
+ classified2 as(
+ SELECT
+ geom,
+ val,
+ CASE
+ WHEN bucket = steps THEN bucket - 1
+ ELSE bucket
+ END as b
+ FROM classified
+ ),
+ -- one merged geometry and summary statistics per bin
+ final as(
+ SELECT
+ st_union(geom) as the_geom,
+ b as bin,
+ min(val) as min_value,
+ max(val) as max_value,
+ avg(val) as avg_value
+ FROM classified2
+ GROUP BY bin
+ )
+ SELECT
+ *
+ FROM final
+ where final.bin is not null
+ ;
+END;
+$$ language plpgsql VOLATILE PARALLEL RESTRICTED;
+
+
+-- =====================================================================
+-- Interp in grid, so we can use barycentric with a precalculated tin (NNI)
+-- =====================================================================
+-- Interpolates a value at `point` from the triangle of `tin` that contains
+-- it: the values found in geomin/colin at the triangle's corners are weighted
+-- by the area of the sub-triangle opposite each corner. Returns NULL when no
+-- triangle contains the point.
+CREATE OR REPLACE FUNCTION _interp_in_tin(
+ IN geomin geometry[],
+ IN colin numeric[],
+ IN tin geometry[],
+ IN point geometry
+ )
+RETURNS numeric AS
+$$
+DECLARE
+    tri geometry;        -- triangle containing the point
+    corners geometry[];
+    area_tri numeric;    -- area of the whole triangle
+    area_1 numeric;      -- area of (point, corner 2, corner 3)
+    area_2 numeric;      -- area of (point, corner 1, corner 3)
+    area_3 numeric;      -- area of (point, corner 1, corner 2)
+    val_1 numeric;
+    val_2 numeric;
+    val_3 numeric;
+BEGIN
+    -- find a triangle the point falls within
+    SELECT cand.v INTO tri
+    FROM (SELECT unnest(tin) AS v) AS cand
+    WHERE ST_Within(point, cand.v);
+
+    -- outside of the triangulated area there is nothing to interpolate
+    IF tri IS NULL THEN
+        RETURN null;
+    END IF;
+
+    -- corners of that triangle
+    SELECT array_agg(ring.v) INTO corners
+    FROM (SELECT (ST_DumpPoints(tri)).geom AS v) AS ring;
+
+    -- value stored for each corner
+    SELECT pv.c INTO val_1
+    FROM (SELECT unnest(geomin) AS geo, unnest(colin) AS c) AS pv
+    WHERE ST_Equals(pv.geo, corners[1]);
+
+    SELECT pv.c INTO val_2
+    FROM (SELECT unnest(geomin) AS geo, unnest(colin) AS c) AS pv
+    WHERE ST_Equals(pv.geo, corners[2]);
+
+    SELECT pv.c INTO val_3
+    FROM (SELECT unnest(geomin) AS geo, unnest(colin) AS c) AS pv
+    WHERE ST_Equals(pv.geo, corners[3]);
+
+    -- areas used as barycentric weights
+    area_tri := ST_Area(tri);
+    area_1 := ST_Area(ST_MakePolygon(ST_MakeLine(ARRAY[point, corners[2], corners[3], point])));
+    area_2 := ST_Area(ST_MakePolygon(ST_MakeLine(ARRAY[point, corners[1], corners[3], point])));
+    area_3 := ST_Area(ST_MakePolygon(ST_MakeLine(ARRAY[point, corners[1], corners[2], point])));
+
+    -- missing areas or values count as 0; a missing total area counts as 1
+    RETURN (coalesce(area_1,0) * coalesce(val_1,0)
+          + coalesce(area_2,0) * coalesce(val_2,0)
+          + coalesce(area_3,0) * coalesce(val_3,0)) / coalesce(area_tri,1);
+END;
+$$
+language plpgsql IMMUTABLE PARALLEL SAFE;
+-- Function by Stuart Lynn for a simple interpolation of a value
+-- from a polygon table over an arbitrary polygon
+-- (weighted by the area proportion overlapped).
+-- Areal weighting is a very simple form of areal interpolation.
+--
+-- Parameters:
+-- * geom a Polygon geometry which defines the area where a value will be
+-- estimated as the area-weighted sum of a given table/column
+-- * target_table_name table name of the table that provides the values
+-- * target_column column name of the column that provides the values
+-- * schema_name optional parameter to define the schema the target table
+-- belongs to, which is necessary if it's not in the search_path.
+-- Note that target_table_name should never include the schema in it.
+-- Return value:
+-- Areal-weighted interpolation of the column values over the geometry
+CREATE OR REPLACE
+FUNCTION cdb_overlap_sum(geom geometry, target_table_name text, target_column text, schema_name text DEFAULT NULL)
+ RETURNS numeric AS
+$$
+DECLARE
+    result numeric;
+    qualified_name text;
+BEGIN
+    -- Every caller-supplied identifier is quoted with %I before it reaches
+    -- the dynamic statement. The table name used to be spliced in with %s in
+    -- the schema-qualified branch, which left it unquoted (open to SQL
+    -- injection) and case-folded differently from the unqualified branch.
+    IF schema_name IS NULL THEN
+        qualified_name := Format('%I', target_table_name);
+    ELSE
+        qualified_name := Format('%I.%I', schema_name, target_table_name);
+    END IF;
+
+    -- qualified_name is already quoted, hence %s for it. The geometry is
+    -- passed as bind parameter $1. Each overlapping row contributes its
+    -- column value scaled by the share of its area that intersects geom;
+    -- the && operator pre-filters rows on bounding boxes.
+    EXECUTE Format('
+        SELECT sum(a.%I*ST_Area(St_Intersection($1, a.the_geom))/ST_Area(a.the_geom))
+        FROM %s AS a
+        WHERE $1 && a.the_geom
+    ', target_column, qualified_name)
+    USING geom
+    INTO result;
+    RETURN result;
+END;
+$$ LANGUAGE plpgsql STABLE PARALLEL SAFE;
+CREATE OR REPLACE FUNCTION
+CDB_GWR(subquery text, dep_var text, ind_vars text[],
+ bw numeric default null, fixed boolean default False,
+ kernel text default 'bisquare', geom_col text default 'the_geom',
+ id_col text default 'cartodb_id')
+RETURNS table(coeffs JSON, stand_errs JSON, t_vals JSON,
+ filtered_t_vals JSON, predicted numeric,
+ residuals numeric, r_squared numeric, bandwidth numeric,
+ rowid bigint)
+AS $$
+# SQL entry point only: the regression is run by the GWR class of the
+# crankshaft Python package. Arguments are forwarded positionally, in
+# declaration order.
+from crankshaft.regression import GWR
+
+return GWR().gwr(subquery, dep_var, ind_vars, bw,
+                 fixed, kernel, geom_col, id_col)
+
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+
+CREATE OR REPLACE FUNCTION
+CDB_GWR_Predict(subquery text, dep_var text, ind_vars text[],
+ bw numeric default null, fixed boolean default False,
+ kernel text default 'bisquare',
+ geom_col text default 'the_geom',
+ id_col text default 'cartodb_id')
+RETURNS table(coeffs JSON, stand_errs JSON, t_vals JSON,
+ r_squared numeric, predicted numeric, rowid bigint)
+AS $$
+# SQL entry point only: the prediction is produced by GWR.gwr_predict of the
+# crankshaft Python package. Arguments are forwarded positionally, in
+# declaration order.
+from crankshaft.regression import GWR
+
+return GWR().gwr_predict(subquery, dep_var, ind_vars, bw,
+                         fixed, kernel, geom_col, id_col)
+
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+--
+-- Creates N points randomly distributed within the polygon
+--
+-- @param geom - the geometry to be turned into points
+--
+-- @param no_points - the number of points to generate
+--
+-- @param max_iter_per_point - the function generates points in the polygon's bounding box
+-- and discards points which don't lie in the polygon. max_iter_per_point specifies how many
+-- misses per point the function accepts before giving up.
+--
+-- Returns: Multipoint with the requested points
+-- Implementation notes:
+--  * each candidate is drawn on a random horizontal line across the bounding
+--    box of geom: the line is clipped by geom and a random position along the
+--    clipped line is taken;
+--  * the line is built as (x, y) pairs in the SRID of geom, so the clip never
+--    mixes SRIDs;
+--  * a draw that yields no point counts as a miss; after max_iter_per_point
+--    consecutive misses for one point the function gives up and returns the
+--    points collected so far;
+--  * a NULL, zero or negative no_points produces no points.
+CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry , no_points Integer, max_iter_per_point Integer DEFAULT 1000)
+RETURNS GEOMETRY AS $$
+DECLARE
+    extent GEOMETRY;
+    srid INTEGER;
+    test_point GEOMETRY;
+    width NUMERIC;
+    height NUMERIC;
+    x0 NUMERIC;
+    y0 NUMERIC;
+    yp NUMERIC;
+    no_left INTEGER;
+    miss_budget INTEGER;           -- misses allowed for each requested point
+    remaining_iterations INTEGER;  -- misses still allowed for the current point
+    points GEOMETRY[];
+    bbox_line GEOMETRY;
+    intersection_line GEOMETRY;
+BEGIN
+    extent := ST_Envelope(geom);
+    srid := ST_SRID(geom);
+    width := ST_XMax(extent) - ST_XMin(extent);
+    height := ST_YMax(extent) - ST_YMin(extent);
+    x0 := ST_XMin(extent);
+    y0 := ST_YMin(extent);
+    no_left := no_points;
+    miss_budget := COALESCE(max_iter_per_point, 1000);
+    remaining_iterations := miss_budget;
+
+    -- WHILE (rather than "exit when = 0") also terminates for NULL or
+    -- negative requests
+    WHILE no_left > 0 LOOP
+        -- horizontal line at a random height, spanning the bounding box
+        yp := y0 + height*random();
+        bbox_line := ST_MakeLine(
+            ST_SetSRID(ST_MakePoint(x0, yp), srid),
+            ST_SetSRID(ST_MakePoint(x0 + width, yp), srid)
+        );
+        intersection_line := ST_Intersection(bbox_line, geom);
+        test_point := ST_LineInterpolatePoint(st_makeline(st_linemerge(intersection_line)), random());
+
+        IF test_point IS NULL THEN
+            -- miss: retry with a new line until the budget is spent
+            remaining_iterations := remaining_iterations - 1;
+            EXIT WHEN remaining_iterations <= 0;
+            CONTINUE;
+        END IF;
+
+        points := points || test_point;
+        no_left := no_left - 1;
+        remaining_iterations := miss_budget;
+    END LOOP;
+    RETURN ST_Collect(points);
+END;
+$$
+LANGUAGE plpgsql VOLATILE PARALLEL RESTRICTED;
+-- Make sure by default there are no permissions for publicuser
+-- NOTE: this happens at extension creation time, as part of an implicit transaction.
+-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
+
+-- Grant permissions on the schema to publicuser (but just the schema)
+-- Only USAGE on the schema is granted here; no privilege on any function
+-- inside cdb_crankshaft is granted by this statement.
+-- NOTE(review): the publicuser role is not created in this script -
+-- presumably it must already exist when the extension is installed; confirm.
+GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser;
+
+-- Revoke execute permissions on all functions in the schema by default
+-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser;
+--
+-- Fill given extent with a rectangular coverage
+--
+-- @param ext Extent to fill. Only rectangles with center point falling
+-- inside the extent (or at the lower or leftmost edge) will
+-- be emitted. The returned rectangles will have the same SRID
+-- as this extent.
+--
+-- @param width Width of each rectangle
+--
+-- @param height Height of each rectangle
+--
+-- @param origin Optional origin to allow for exact tiling.
+-- If omitted the origin will be 0,0.
+-- The parameter is checked for having the same SRID
+-- as the extent.
+--
+--
+CREATE OR REPLACE FUNCTION CDB_RectangleGrid(ext GEOMETRY, width FLOAT8, height FLOAT8, origin GEOMETRY DEFAULT NULL)
+RETURNS SETOF GEOMETRY
+AS $$
+DECLARE
+    cell GEOMETRY;                         -- rectangle about to be emitted
+    half_w FLOAT8 := width/2.0;
+    half_h FLOAT8 := height/2.0;
+    org_x FLOAT8 := 0;                     -- grid origin, (0, 0) unless `origin` is given
+    org_y FLOAT8 := 0;
+    col_x FLOAT8;                          -- centre x of the current column
+    row_y FLOAT8;                          -- centre y of the current cell
+    first_y FLOAT8;                        -- centre y of the first cell of every column
+    max_x FLOAT8;
+    max_y FLOAT8;
+    ext_srid INTEGER := ST_SRID(ext);
+BEGIN
+    IF origin IS NOT NULL THEN
+        IF ST_SRID(origin) != ext_srid THEN
+            RAISE EXCEPTION 'SRID mismatch between extent (%) and origin (%)', ext_srid, ST_SRID(origin);
+        END IF;
+        org_x := ST_X(origin);
+        org_y := ST_Y(origin);
+    END IF;
+
+    -- First cell centres: the lowest multiples of the step, counted from the
+    -- origin, that are not below the minimum corner of the extent.
+    col_x := org_x + ceil((ST_XMin(ext)-org_x)/width)*width;
+    first_y := org_y + ceil((ST_Ymin(ext)-org_y)/height)*height;
+
+    max_x := ST_XMax(ext);
+    max_y := ST_YMax(ext);
+
+    WHILE col_x < max_x LOOP -- over X
+        row_y := first_y;
+        cell := ST_MakeEnvelope(col_x-half_w, row_y-half_h, col_x+half_w, row_y+half_h, ext_srid);
+        WHILE row_y < max_y LOOP -- over Y
+            RETURN NEXT cell;
+            cell := ST_Translate(cell, 0, height);
+            -- snap the next centre to a multiple of the half step
+            row_y := org_y + round(((row_y + height)-org_y)/half_h)*half_h;
+        END LOOP;
+        -- snap the next centre to a multiple of the half step
+        col_x := org_x + round(((col_x + width)-org_x)/half_w)*half_w;
+    END LOOP;
+
+    RETURN;
+END
+$$ LANGUAGE 'plpgsql' IMMUTABLE PARALLEL SAFE;
+
+--
+-- Calculate the equal interval bins for a given column
+--
+-- @param in_array A numeric array of numbers from which
+-- to determine the bin boundaries
+--
+-- @param breaks The number of bins you want to find.
+--
+--
+-- Returns: upper edges of bins
+--
+--
+
+CREATE OR REPLACE FUNCTION CDB_EqualIntervalBins ( in_array NUMERIC[], breaks INT ) RETURNS NUMERIC[] as $$
+DECLARE
+    lo numeric;       -- smallest non-null input value
+    hi numeric;       -- largest non-null input value
+    step numeric;     -- width of one bin
+    edges numeric[];  -- upper edges collected so far
+    k INT := 1;
+BEGIN
+    SELECT min(e), max(e) INTO lo, hi
+    FROM unnest(in_array) AS u(e)
+    WHERE e IS NOT NULL;
+
+    step := (hi - lo) / breaks::numeric;
+
+    -- interior edges first ...
+    WHILE k < breaks LOOP
+        edges := array_append(edges, lo + k::numeric * step);
+        k := k + 1;
+    END LOOP;
+
+    -- ... and the maximum itself as the edge of the last bin, so the top
+    -- edge is never affected by rounding
+    RETURN array_append(edges, hi);
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+
+--
+-- Determine the Heads/Tails classifications from a numeric array
+--
+-- @param in_array A numeric array of numbers to determine the best
+-- bins based on the Heads/Tails method.
+--
+-- @param breaks The number of bins you want to find.
+--
+--
+
+CREATE OR REPLACE FUNCTION CDB_HeadsTailsBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$
+DECLARE
+    index_span INT4;        -- upper minus lower bound of the input, i.e. element count - 1
+    sorted_vals numeric[];  -- input values in ascending order
+    head_mean numeric;
+    k INT := 2;
+    bins numeric[];
+BEGIN
+    index_span := array_upper(in_array, 1) - array_lower(in_array, 1);
+
+    SELECT array_agg(e) INTO sorted_vals
+    FROM (SELECT unnest(in_array) e ORDER BY e) x;
+
+    -- nothing to classify
+    IF index_span IS NULL THEN
+        RETURN NULL;
+    END IF;
+
+    -- too few values for the requested number of breaks: hand back the
+    -- sorted values themselves
+    IF index_span < breaks THEN
+        RETURN sorted_vals;
+    END IF;
+
+    -- first break: mean of everything
+    SELECT avg(v) INTO head_mean FROM unnest(sorted_vals) AS u(v);
+    bins := Array[head_mean];
+
+    -- each further break is the mean of the values above the previous break;
+    -- a break is skipped when no value lies above it
+    LOOP
+        EXIT WHEN k > breaks;
+        SELECT avg(e) INTO head_mean
+        FROM unnest(sorted_vals) AS u(e)
+        WHERE e > bins[k-1];
+        IF head_mean IS NOT NULL THEN
+            bins := array_append(bins, head_mean);
+        END IF;
+        k := k + 1;
+    END LOOP;
+
+    RETURN bins;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+
+--
+-- Determine the Jenks classifications from a numeric array
+--
+-- @param in_array A numeric array of numbers to determine the best
+-- bins based on the Jenks method.
+--
+-- @param breaks The number of bins you want to find.
+--
+-- @param iterations The number of different starting positions to test.
+--
+-- @param invert Optional: whether to return the top of each bin (default)
+-- or the bottom. BOOLEAN, default=FALSE.
+--
+--
+
+
+-- Searches for Jenks class breaks: the quantile classification is refined
+-- first, then (iterations - 1) randomly seeded classifications are refined
+-- with CDB_JenksBinsIteration, and the result with the highest score (first
+-- element of the array returned by CDB_JenksBinsIteration) wins.
+-- Returns NULL for an empty input, and the sorted input itself when it has
+-- fewer elements than `breaks`.
+CREATE OR REPLACE FUNCTION CDB_JenksBins ( in_array NUMERIC[], breaks INT, iterations INT DEFAULT 5, invert BOOLEAN DEFAULT FALSE) RETURNS NUMERIC[] as $$
+DECLARE
+ element_count INT4;
+ arr_mean NUMERIC;
+ bot INT;
+ top INT;
+ tops INT[];
+ classes INT[][];
+ i INT := 1; j INT := 1;
+ curr_result NUMERIC[];
+ best_result NUMERIC[];
+ seedtarget TEXT;
+ quant NUMERIC[];
+ shuffles INT;
+BEGIN
+ -- get the total size of our row
+ element_count := array_length(in_array, 1); --array_upper(in_array, 1) - array_lower(in_array, 1);
+ -- ensure the ordering of in_array
+ SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x;
+ -- stop if no rows
+ IF element_count IS NULL THEN
+ RETURN NULL;
+ END IF;
+ -- stop if our breaks are more than our input array size
+ IF element_count < breaks THEN
+ RETURN in_array;
+ END IF;
+
+ -- refinement budget handed to CDB_JenksBinsIteration as max_search:
+ -- 2500000 / (elements * iterations), clamped to the range 1..750
+ shuffles := LEAST(GREATEST(floor(2500000.0/(element_count::float*iterations::float)), 1), 750)::int;
+ -- get our mean value
+ SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x;
+
+ -- assume best is actually Quantile
+ SELECT cdb_crankshaft.CDB_QuantileBins(in_array, breaks) INTO quant;
+
+ -- if data is very very large, just return quant and be done
+ IF element_count > 5000000 THEN
+ RETURN quant;
+ END IF;
+
+ -- change quant into bottom, top markers
+ -- (each class becomes a [bot, top] pair of 1-based positions in the sorted
+ -- array; the exit test sits after the append, so the loop body also runs
+ -- once for i = breaks + 1)
+ LOOP
+ IF i = 1 THEN
+ bot = 1;
+ ELSE
+ -- use last top to find this bot
+ bot = top+1;
+ END IF;
+ IF i = breaks THEN
+ top = element_count;
+ ELSE
+ SELECT count(*) INTO top FROM ( SELECT unnest(in_array) as v) x WHERE v <= quant[i];
+ END IF;
+ IF i = 1 THEN
+ classes = ARRAY[ARRAY[bot,top]];
+ ELSE
+ classes = ARRAY_CAT(classes,ARRAY[bot,top]);
+ END IF;
+ IF i > breaks THEN EXIT; END IF;
+ i = i+1;
+ END LOOP;
+
+ best_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);
+
+ --set the seed so we can ensure the same results
+ SELECT setseed(0.4567) INTO seedtarget;
+ --loop through random starting positions
+ LOOP
+ IF j > iterations-1 THEN EXIT; END IF;
+ i = 1;
+ -- draw random class tops: start from the last position and keep adding
+ -- random positions (duplicates and position 1 are dropped) until there
+ -- are `breaks` distinct tops
+ tops = ARRAY[element_count];
+ LOOP
+ IF i = breaks THEN EXIT; END IF;
+ SELECT array_agg(distinct e) INTO tops FROM (SELECT unnest(array_cat(tops, ARRAY[floor(random()*element_count::float)::int])) as e ORDER BY e) x WHERE e != 1;
+ i = array_length(tops, 1);
+ END LOOP;
+ i = 1;
+ -- turn the tops into [bot, top] pairs, as done for the quantile start
+ LOOP
+ IF i > breaks THEN EXIT; END IF;
+ IF i = 1 THEN
+ bot = 1;
+ ELSE
+ bot = top+1;
+ END IF;
+ top = tops[i];
+ IF i = 1 THEN
+ classes = ARRAY[ARRAY[bot,top]];
+ ELSE
+ classes = ARRAY_CAT(classes,ARRAY[bot,top]);
+ END IF;
+ i := i+1;
+ END LOOP;
+ curr_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);
+
+ -- element 1 of each result is the score prepended by CDB_JenksBinsIteration
+ IF curr_result[1] > best_result[1] THEN
+ best_result = curr_result;
+ j = j-1; -- if we found a better result, add one more search
+ END IF;
+ j = j+1;
+ END LOOP;
+
+ -- drop the leading score, keep only the break values
+ RETURN (best_result)[2:array_upper(best_result, 1)];
+END;
+$$ language plpgsql VOLATILE PARALLEL RESTRICTED;
+
+
+
+--
+-- Perform a single iteration of the Jenks classification
+--
+
+-- Refines one candidate classification.
+--
+-- @param in_array sorted values
+-- @param breaks number of classes
+-- @param classes one [bottom, top] pair of positions in in_array per class
+-- @param invert when TRUE the value at the bottom position of each class
+-- is reported, otherwise the value at the top position
+-- @param element_count passed by the caller; not read in this body
+-- @param arr_mean mean of in_array, used for the total squared deviation
+-- @param max_search upper limit on the number of refinement passes
+--
+-- Returns: an array whose first element is the score (total squared deviation
+-- minus the sum of the within-class squared deviations) followed by one
+-- break value per class.
+CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_array NUMERIC[], breaks INT, classes INT[][], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$
+DECLARE
+ tmp_val numeric;
+ new_classes int[][];
+ tmp_class int[];
+ i INT := 1;
+ j INT := 1;
+ side INT := 2;
+ sdam numeric;
+ gvf numeric := 0.0;
+ new_gvf numeric;
+ arr_gvf numeric[];
+ class_avg numeric;
+ class_max_i INT;
+ class_min_i INT;
+ class_max numeric;
+ class_min numeric;
+ reply numeric[];
+BEGIN
+
+ -- Calculate the sum of squared deviations from the array mean (SDAM).
+ SELECT sum((arr_mean - e)^2) INTO sdam FROM ( SELECT unnest(in_array) as e ) x;
+ --Identify the breaks for the lowest GVF
+ LOOP
+ i = 1;
+ -- squared deviation from the class mean, one entry per class, in arr_gvf
+ LOOP
+ -- get our mean
+ SELECT avg(e) INTO class_avg FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e) x;
+ -- find the deviation
+ SELECT sum((class_avg-e)^2) INTO tmp_val FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e ) x;
+ IF i = 1 THEN
+ arr_gvf = ARRAY[tmp_val];
+ -- init our min/max map for later
+ class_max = arr_gvf[i];
+ class_min = arr_gvf[i];
+ class_min_i = 1;
+ class_max_i = 1;
+ ELSE
+ arr_gvf = array_append(arr_gvf, tmp_val);
+ END IF;
+ i := i+1;
+ IF i > breaks THEN EXIT; END IF;
+ END LOOP;
+ -- calculate our new GVF
+ SELECT sdam-sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) as e ) x;
+ -- if no improvement was made, exit
+ IF new_gvf < gvf THEN EXIT; END IF;
+ gvf = new_gvf;
+ IF j > max_search THEN EXIT; END IF;
+ j = j+1;
+ i = 1;
+ -- locate the classes with the smallest and the largest deviation
+ LOOP
+ --establish directionality (uppward through classes or downward)
+ IF arr_gvf[i] < class_min THEN
+ class_min = arr_gvf[i];
+ class_min_i = i;
+ END IF;
+ IF arr_gvf[i] > class_max THEN
+ class_max = arr_gvf[i];
+ class_max_i = i;
+ END IF;
+ i := i+1;
+ IF i > breaks THEN EXIT; END IF;
+ END LOOP;
+ -- the boundary is moved between the largest-deviation class and its
+ -- direct neighbour on the side where the smallest-deviation class lies
+ IF class_max_i > class_min_i THEN
+ class_min_i = class_max_i - 1;
+ ELSE
+ class_min_i = class_max_i + 1;
+ END IF;
+ --Move from higher class to a lower gid order
+ IF class_max_i > class_min_i THEN
+ classes[class_max_i][1] = classes[class_max_i][1] + 1;
+ classes[class_min_i][2] = classes[class_min_i][2] + 1;
+ ELSE -- Move from lower class UP into a higher class by gid
+ classes[class_max_i][2] = classes[class_max_i][2] - 1;
+ classes[class_min_i][1] = classes[class_min_i][1] - 1;
+ END IF;
+ END LOOP;
+
+ i = 1;
+ -- collect one value per class: side 2 is the top position of the class,
+ -- side 1 (chosen when invert is TRUE) the bottom position
+ LOOP
+ IF invert = TRUE THEN
+ side = 1; --default returns bottom side of breaks, invert returns top side
+ END IF;
+ reply = array_append(reply, in_array[classes[i][side]]);
+ i = i+1;
+ IF i > breaks THEN EXIT; END IF;
+ END LOOP;
+
+ -- the score travels as the first element; the caller strips it
+ RETURN array_prepend(gvf, reply);
+
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+
+
+--
+-- Determine the Quantile classifications from a numeric array
+--
+-- @param in_array A numeric array of numbers to determine the best
+-- bins based on the Quantile method.
+--
+-- @param breaks The number of bins you want to find.
+--
+--
+CREATE OR REPLACE FUNCTION CDB_QuantileBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$
+DECLARE
+    sorted_vals numeric[];  -- input values in ascending order
+    n_vals INT4;
+    step numeric;           -- number of values per bin (may be fractional)
+    cut numeric;
+    cuts numeric[];
+    k INT := 1;
+BEGIN
+    SELECT array_agg(e) INTO sorted_vals
+    FROM (SELECT unnest(in_array) e ORDER BY e ASC) x;
+
+    n_vals := array_length(sorted_vals, 1);
+    step := n_vals::numeric / breaks;
+
+    WHILE k <= breaks LOOP
+        IF k = breaks THEN
+            -- the last bin always ends at the maximum
+            SELECT max(e) INTO cut FROM unnest(sorted_vals) AS u(e);
+        ELSIF (step * k) % 1 > 0 THEN
+            -- fractional position: take the value at the next whole position
+            SELECT e INTO cut
+            FROM ( SELECT unnest(sorted_vals) e LIMIT 1 OFFSET ceil(step * k) - 1) x;
+        ELSE
+            -- whole position: average the value there with its successor
+            SELECT avg(e) INTO cut
+            FROM ( SELECT unnest(sorted_vals) e LIMIT 2 OFFSET ceil(step * k) - 1 ) x;
+        END IF;
+
+        cuts := array_append(cuts, cut);
+        k := k + 1;
+    END LOOP;
+
+    RETURN cuts;
+END;
+$$ language plpgsql IMMUTABLE STRICT PARALLEL SAFE;
diff --git a/release/crankshaft--0.8.0.sql b/release/crankshaft--0.8.0.sql
new file mode 100644
index 0000000..a0b2474
--- /dev/null
+++ b/release/crankshaft--0.8.0.sql
@@ -0,0 +1,2307 @@
+--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES
+-- Complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit
+-- Release version of the crankshaft extension, returned as text.
+CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
+RETURNS text
+LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE
+AS $$
+    SELECT CAST('0.8.0' AS text);
+$$;
+
+-- Internal identifier of the installed extension instance,
+-- e.g. 'dev' for the current development version.
+-- Reads installed_version from pg_available_extensions for 'crankshaft'.
+CREATE OR REPLACE FUNCTION _cdb_crankshaft_internal_version()
+RETURNS text
+LANGUAGE sql STABLE STRICT PARALLEL SAFE
+AS $$
+    SELECT ext.installed_version
+    FROM pg_available_extensions AS ext
+    WHERE ext.name = 'crankshaft'
+      -- whole-row test: true only when every column of the row is non-NULL
+      AND ext IS NOT NULL;
+$$;
+-- Internal function.
+-- Set the seeds of the RNGs (Random Number Generators)
+-- used internally.
+--
+-- @param seed_value Integer handed unchanged to
+--                   crankshaft.random_seeds.set_random_seeds.
+CREATE OR REPLACE FUNCTION
+_cdb_random_seeds (seed_value INTEGER) RETURNS VOID
+AS $$
+ from crankshaft import random_seeds
+ random_seeds.set_random_seeds(seed_value)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+-- State transition function used by the CDB_PyAgg aggregate.
+--
+-- Appends current_row to current_state. When the state is still empty, the
+-- upper bound of current_row is stored first, so the flattened result starts
+-- with the row width followed by the row values.
+CREATE OR REPLACE FUNCTION
+  CDB_PyAggS(current_state Numeric[], current_row Numeric[])
+  RETURNS NUMERIC[] AS $$
+  DECLARE
+    row_width INT := array_upper(current_row, 1);
+  BEGIN
+    -- an empty (or NULL) state has no upper bound: this is the first row
+    IF array_upper(current_state, 1) IS NULL THEN
+      RAISE NOTICE 'setting state %', row_width;
+      current_state[1] := row_width;
+    END IF;
+
+    RETURN array_cat(current_state, current_row);
+  END
+  $$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
+
+-- Create the CDB_PyAgg aggregate unless schema cdb_crankshaft already
+-- contains an aggregate with that name.
+DO $$
+BEGIN
+    IF EXISTS (
+        SELECT 1
+        FROM pg_catalog.pg_proc AS p
+        INNER JOIN pg_catalog.pg_namespace AS n
+            ON n.oid = p.pronamespace
+        WHERE n.nspname = 'cdb_crankshaft'
+          AND p.proname = 'cdb_pyagg'
+          AND p.proisagg
+    ) THEN
+        -- already defined, nothing to do
+        RETURN;
+    END IF;
+
+    CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
+        SFUNC = CDB_PyAggS,
+        STYPE = Numeric[],
+        PARALLEL = SAFE,
+        INITCOND = "{}"
+    );
+END
+$$ LANGUAGE plpgsql;
+
+-- Array-based segmentation entry point.
+--
+-- Collects the tuning arguments (n_estimators, max_depth, subsample,
+-- learning_rate, min_samples_leaf) into a model_params dictionary and
+-- delegates to crankshaft.segmentation.create_and_predict_segment_agg.
+--
+-- features and target_features are flattened 2-D arrays whose first element
+-- is the row width; this is the layout CDB_PyAggS builds, so they are
+-- presumably produced with the CDB_PyAgg aggregate (confirm against callers).
+-- target is converted to a float array; target_ids is passed through as is.
+CREATE OR REPLACE FUNCTION
+ CDB_CreateAndPredictSegment(
+ target NUMERIC[],
+ features NUMERIC[],
+ target_features NUMERIC[],
+ target_ids NUMERIC[],
+ n_estimators INTEGER DEFAULT 1200,
+ max_depth INTEGER DEFAULT 3,
+ subsample DOUBLE PRECISION DEFAULT 0.5,
+ learning_rate DOUBLE PRECISION DEFAULT 0.01,
+ min_samples_leaf INTEGER DEFAULT 1)
+RETURNS TABLE(cartodb_id NUMERIC, prediction NUMERIC, accuracy NUMERIC)
+AS $$
+ import numpy as np
+ import plpy
+
+ from crankshaft.segmentation import create_and_predict_segment_agg
+ model_params = {'n_estimators': n_estimators,
+ 'max_depth': max_depth,
+ 'subsample': subsample,
+ 'learning_rate': learning_rate,
+ 'min_samples_leaf': min_samples_leaf}
+
+ # Pops the leading row width off the flat list and reshapes the rest into
+ # (rows, width). NOTE(review): the row count comes from len(a)/dimension,
+ # which relies on the division producing a value reshape accepts - confirm.
+ def unpack2D(data):
+ dimension = data.pop(0)
+ a = np.array(data, dtype=float)
+ return a.reshape(len(a)/dimension, dimension)
+
+ return create_and_predict_segment_agg(np.array(target, dtype=float),
+ unpack2D(features),
+ unpack2D(target_features),
+ target_ids,
+ model_params)
+
+$$ LANGUAGE plpythonu VOLATILE PARALLEL RESTRICTED;
+
+-- Query-based segmentation entry point.
+--
+-- Collects the tuning arguments into a model_params dictionary and forwards
+-- query, variable_name, target_table and that dictionary to
+-- crankshaft.segmentation.create_and_predict_segment.
+-- Note that cartodb_id is returned as TEXT here, whereas the array-based
+-- overload returns it as NUMERIC.
+CREATE OR REPLACE FUNCTION
+ CDB_CreateAndPredictSegment (
+ query TEXT,
+ variable_name TEXT,
+ target_table TEXT,
+ n_estimators INTEGER DEFAULT 1200,
+ max_depth INTEGER DEFAULT 3,
+ subsample DOUBLE PRECISION DEFAULT 0.5,
+ learning_rate DOUBLE PRECISION DEFAULT 0.01,
+ min_samples_leaf INTEGER DEFAULT 1)
+RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC)
+AS $$
+ from crankshaft.segmentation import create_and_predict_segment
+ model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf}
+ return create_and_predict_segment(query,variable_name,target_table, model_params)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+-- Gravity model, query-based entry point.
+--
+-- Runs target_query and source_query, aggregates their cartodb_id, the_geom
+-- and weight/population columns into arrays, and delegates to the array-based
+-- CDB_Gravity overload.
+--
+-- @param target_query  SQL text returning cartodb_id, the_geom and weight_column
+-- @param weight_column column (or expression) of target_query used as weight
+-- @param source_query  SQL text returning cartodb_id, the_geom and pop_column
+-- @param pop_column    column (or expression) of source_query used as population
+-- @param target        id of the target whose rows are returned
+-- @param radius        search radius handed to the array-based overload
+-- @param minval        targets whose weight is not above this value are ignored
+--
+-- Both queries and both column names are spliced into dynamic SQL verbatim,
+-- so they must come from a trusted caller.
+CREATE OR REPLACE FUNCTION CDB_Gravity(
+    IN target_query text,
+    IN weight_column text,
+    IN source_query text,
+    IN pop_column text,
+    IN target bigint,
+    IN radius integer,
+    IN minval numeric DEFAULT -10e307
+    )
+RETURNS TABLE(
+    the_geom geometry,
+    source_id bigint,
+    target_id bigint,
+    dist numeric,
+    h numeric,
+    hpop numeric) AS $$
+DECLARE
+    t_id bigint[];
+    t_geom geometry[];
+    t_weight numeric[];
+    s_id bigint[];
+    s_geom geometry[];
+    s_pop numeric[];
+BEGIN
+    -- Text is concatenated with ||; PostgreSQL has no + operator for text.
+    EXECUTE 'WITH foo as(' || target_query || ') SELECT array_agg(cartodb_id), array_agg(the_geom), array_agg(' || weight_column || ') FROM foo' INTO t_id, t_geom, t_weight;
+    EXECUTE 'WITH foo as(' || source_query || ') SELECT array_agg(cartodb_id), array_agg(the_geom), array_agg(' || pop_column || ') FROM foo' INTO s_id, s_geom, s_pop;
+    -- The arrays already carry all the data, so the overload is the only
+    -- relation needed in this FROM clause.
+    RETURN QUERY
+    SELECT g.* FROM CDB_Gravity(t_id, t_geom, t_weight, s_id, s_geom, s_pop, target, radius, minval) g;
+END;
+$$ language plpgsql VOLATILE PARALLEL UNSAFE;
+
+-- Gravity model, array-based implementation.
+--
+-- t_id / t_geom / t_weight describe the targets and s_id / s_geom / s_pop the
+-- sources, as parallel arrays. Only rows for the target whose id equals
+-- `target` are returned, one per source that has that target within `radius`.
+--
+-- Output columns:
+--   the_geom  - geometry of the source
+--   source_id - id of the source
+--   target_id - id of the target
+--   dist      - distance between source and target; reported as 0.0 when the
+--               (floored) distance is not greater than 1
+--   h         - 100 * (tw/distance) / sum of tw/distance over all targets
+--               matched for that source
+--   hpop      - source population times that same share
+--
+-- NOTE(review): when every matched target of a source has weight 0 the
+-- denominator h_deno is 0 and the final division would raise - confirm
+-- whether callers can hit this.
+CREATE OR REPLACE FUNCTION CDB_Gravity(
+ IN t_id bigint[],
+ IN t_geom geometry[],
+ IN t_weight numeric[],
+ IN s_id bigint[],
+ IN s_geom geometry[],
+ IN s_pop numeric[],
+ IN target bigint,
+ IN radius integer,
+ IN minval numeric DEFAULT -10e307
+ )
+RETURNS TABLE(
+ the_geom geometry,
+ source_id bigint,
+ target_id bigint,
+ dist numeric,
+ h numeric,
+ hpop numeric) AS $$
+DECLARE
+ t_type text;
+ s_type text;
+ t_center geometry[];
+ s_center geometry[];
+BEGIN
+ -- The type of the first geometry decides for the whole array whether
+ -- centroids have to be computed.
+ t_type := GeometryType(t_geom[1]);
+ s_type := GeometryType(s_geom[1]);
+ IF t_type = 'POINT' THEN
+ t_center := t_geom;
+ ELSE
+ WITH tmp as (SELECT unnest(t_geom) as g) SELECT array_agg(ST_Centroid(g)) INTO t_center FROM tmp;
+ END IF;
+ IF s_type = 'POINT' THEN
+ s_center := s_geom;
+ ELSE
+ WITH tmp as (SELECT unnest(s_geom) as g) SELECT array_agg(ST_Centroid(g)) INTO s_center FROM tmp;
+ END IF;
+ RETURN QUERY
+ -- target0 / source0: turn the parallel arrays back into rows
+ with target0 as(
+ SELECT unnest(t_center) as tc, unnest(t_weight) as tw, unnest(t_id) as td
+ ),
+ source0 as(
+ SELECT unnest(s_center) as sc, unnest(s_id) as sd, unnest (s_geom) as sg, unnest(s_pop) as sp
+ ),
+ -- prev0: every (source, target) pair with target weight above minval and
+ -- target within radius of the source; the distance is floored at 1.0
+ prev0 as(
+ SELECT
+ source0.sg,
+ source0.sd as sourc_id,
+ coalesce(source0.sp,0) as sp,
+ target.td as targ_id,
+ coalesce(target.tw,0) as tw,
+ GREATEST(1.0,ST_Distance(geography(target.tc), geography(source0.sc)))::numeric as distance
+ FROM source0
+ CROSS JOIN LATERAL
+ (
+ SELECT
+ *
+ FROM target0
+ WHERE tw > minval
+ AND ST_DWithin(geography(source0.sc), geography(tc), radius)
+ ) AS target
+ ),
+ -- deno: per-source sum of weight/distance, the denominator of the shares
+ deno as(
+ SELECT
+ sourc_id,
+ sum(tw/distance) as h_deno
+ FROM
+ prev0
+ GROUP BY sourc_id
+ )
+ SELECT
+ p.sg as the_geom,
+ p.sourc_id as source_id,
+ p.targ_id as target_id,
+ case when p.distance > 1 then p.distance else 0.0 end as dist,
+ 100*(p.tw/p.distance)/d.h_deno as h,
+ p.sp*(p.tw/p.distance)/d.h_deno as hpop
+ FROM
+ prev0 p,
+ deno d
+ WHERE
+ p.targ_id = target AND
+ p.sourc_id = d.sourc_id;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+-- 0: nearest neighbor(s)
+-- 1: barycentric
+-- 2: IDW
+-- 3: kriging ---> TO DO
+
+
+-- Spatial interpolation, query-based entry point.
+--
+-- Runs `query`, which must expose the columns the_geom and attrib, aggregates
+-- both into arrays and delegates to the array-based CDB_SpatialInterpolation.
+--
+-- @param query  SQL text returning the_geom and attrib; it is spliced into
+--               dynamic SQL verbatim, so it must come from a trusted caller
+-- @param point  location at which the value is interpolated
+-- @param method 0 nearest neighbor(s), 1 barycentric, 2 IDW
+-- @param p1     method-specific parameter, passed through
+-- @param p2     method-specific parameter, passed through
+-- @return the interpolated value as computed by the array-based overload
+CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation(
+    IN query text,
+    IN point geometry,
+    IN method integer DEFAULT 1,
+    IN p1 numeric DEFAULT 0,
+    IN p2 numeric DEFAULT 0
+    )
+RETURNS numeric AS
+$$
+DECLARE
+    gs geometry[];
+    vs numeric[];
+    output numeric;
+BEGIN
+    EXECUTE 'WITH a AS('||query||') SELECT array_agg(the_geom), array_agg(attrib) FROM a' INTO gs, vs;
+    -- The CTE "a" exists only inside the dynamic statement above, so the
+    -- overload is called directly instead of selecting FROM a.
+    output := CDB_SpatialInterpolation(gs, vs, point, method, p1, p2);
+
+    RETURN output;
+END;
+$$
+language plpgsql VOLATILE PARALLEL UNSAFE;
+
+-- Spatial interpolation, array-based implementation.
+--
+-- @param geomin sample geometries
+-- @param colin  sample values, parallel to geomin
+-- @param point  location at which the value is interpolated
+-- @param method 0 nearest neighbor(s), 1 barycentric, 2 IDW
+--               (3, kriging, is not implemented)
+-- @param p1     method 0: number of neighbors to average, 0 -> closest one
+--               method 2: number of nearest neighbors to use, 0 -> no limit
+-- @param p2     method 2: order of the distance decay, 0 -> order 1
+-- @return the interpolated value;
+--         -888.888 when method 1 finds no triangle containing point;
+--         -777.777 for method 3 or any unknown method
+CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation(
+    IN geomin geometry[],
+    IN colin numeric[],
+    IN point geometry,
+    IN method integer DEFAULT 1,
+    IN p1 numeric DEFAULT 0,
+    IN p2 numeric DEFAULT 0
+    )
+RETURNS numeric AS
+$$
+DECLARE
+    gs geometry[];
+    vs numeric[];
+    gs2 geometry[];
+    vs2 numeric[];
+    g geometry;
+    vertex geometry[];
+    sg numeric;
+    sa numeric;
+    sb numeric;
+    sc numeric;
+    va numeric;
+    vb numeric;
+    vc numeric;
+    output numeric;
+BEGIN
+    -- nearest neighbors
+    -- p1: limit the number of neighbors, 0-> closest one
+    IF method = 0 THEN
+
+        IF p1 = 0 THEN
+            p1 := 1;
+        END IF;
+
+        WITH a as (SELECT unnest(geomin) as g, unnest(colin) as v),
+        b as (SELECT a.v as v FROM a ORDER BY point<->a.g LIMIT p1::integer)
+        SELECT avg(b.v) INTO output FROM b;
+        RETURN output;
+
+    -- barycentric
+    ELSIF method = 1 THEN
+        -- Delaunay triangle of the samples that contains the point
+        WITH a as (SELECT unnest(geomin) AS e),
+        b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
+        c as (SELECT (ST_Dump(t)).geom as v FROM b),
+        d as (SELECT v FROM c WHERE ST_Within(point, v))
+        SELECT v INTO g FROM d;
+        IF g is null THEN
+            -- out of the realm of the input data
+            RETURN -888.888;
+        END IF;
+        -- vertex of the selected cell
+        WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
+        SELECT array_agg(v) INTO vertex FROM a;
+
+        -- retrieve the value of each vertex
+        WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
+        SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
+        WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
+        SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
+        WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
+        SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
+
+        -- sg: area of the whole triangle; sa, sb, sc: areas of the
+        -- sub-triangles opposite vertex 1, 2 and 3
+        SELECT ST_area(g), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[2], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[1], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point,vertex[1],vertex[2], point]))) INTO sg, sa, sb, sc;
+
+        -- area-weighted mean of the vertex values
+        -- (the one-argument coalesce leaves sg unchanged)
+        output := (coalesce(sa,0) * coalesce(va,0) + coalesce(sb,0) * coalesce(vb,0) + coalesce(sc,0) * coalesce(vc,0)) / coalesce(sg);
+        RETURN output;
+
+    -- IDW
+    -- p1: limit the number of neighbors, 0->no limit
+    -- p2: order of distance decay, 0-> order 1
+    ELSIF method = 2 THEN
+
+        IF p2 = 0 THEN
+            p2 := 1;
+        END IF;
+
+        -- samples ordered from nearest to farthest
+        WITH a as (SELECT unnest(geomin) as g, unnest(colin) as v),
+        b as (SELECT a.g, a.v FROM a ORDER BY point<->a.g)
+        SELECT array_agg(b.g), array_agg(b.v) INTO gs, vs FROM b;
+        IF p1::integer > 0 THEN
+            -- keep only the p1 nearest samples; a slice longer than the
+            -- array simply yields every available element
+            gs2 := gs[1:(p1::integer)];
+            vs2 := vs[1:(p1::integer)];
+        ELSE
+            gs2 := gs;
+            vs2 := vs;
+        END IF;
+
+        -- NOTE(review): a sample at distance 0 from point makes the weights
+        -- below divide by zero.
+        WITH a as (SELECT unnest(gs2) as g, unnest(vs2) as v),
+        b as (
+            SELECT
+                (1/ST_distance(point, a.g)^p2::integer) as k,
+                (a.v/ST_distance(point, a.g)^p2::integer) as f
+            FROM a
+        )
+        SELECT sum(b.f)/sum(b.k) INTO output FROM b;
+        RETURN output;
+
+    -- kriging
+    ELSIF method = 3 THEN
+
+        -- TO DO
+
+    END IF;
+
+    RETURN -777.777;
+
+END;
+$$
+language plpgsql IMMUTABLE PARALLEL SAFE;
+-- =============================================================================================
+--
+-- CDB_Voronoi
+--
+-- =============================================================================================
+-- Builds Voronoi cells for the input geometries.
+--
+-- @param geomin    input geometries; they are transformed to SRID 3857 before
+--                  any computation
+-- @param buffer    factor used to size the clipping area around the inputs
+--                  and to extend the open edges on the outside of the diagram
+-- @param tolerance passed to ST_DelaunayTriangles and used as the threshold
+--                  for the "centre lies on the triangle boundary" test
+-- @return a collection of the clipped cells (convex hull of each),
+--         transformed to SRID 4326
+CREATE OR REPLACE FUNCTION CDB_voronoi(
+ IN geomin geometry[],
+ IN buffer numeric DEFAULT 0.5,
+ IN tolerance numeric DEFAULT 1e-9
+ )
+RETURNS geometry AS $$
+DECLARE
+ geomout geometry;
+BEGIN
+ -- we need to make the geometry calculations in (pseudo)meters!!!
+ with a as (
+ SELECT unnest(geomin) as g1
+ ),
+ b as(
+ SELECT st_transform(g1, 3857) g2 from a
+ )
+ SELECT array_agg(g2) INTO geomin from b;
+
+ WITH
+ -- convex hull of the inputs plus a radius r derived from its area
+ convexhull_1 as (
+ SELECT
+ ST_ConvexHull(ST_Collect(geomin)) as g,
+ buffer * |/ (st_area(ST_ConvexHull(ST_Collect(geomin)))/PI()) as r
+ ),
+ -- area the final cells are clipped to
+ clipper as(
+ SELECT
+ st_buffer(ST_MinimumBoundingCircle(a.g), buffer*a.r) as g
+ FROM convexhull_1 a
+ ),
+ -- points of the expanded envelope, added as extra sample points
+ env0 as (
+ SELECT
+ (st_dumppoints(st_expand(a.g, buffer*a.r))).geom as e
+ FROM convexhull_1 a
+ ),
+ env as (
+ SELECT
+ array_agg(env0.e) as e
+ FROM env0
+ ),
+ sample AS (
+ SELECT
+ ST_Collect(geomin || env.e) as geom
+ FROM env
+ ),
+ convexhull as (
+ SELECT
+ ST_ConvexHull(ST_Collect(geomin)) as cg
+ ),
+ -- Delaunay triangulation of inputs + envelope points
+ tin as (
+ SELECT
+ ST_Dump(ST_DelaunayTriangles(geom, tolerance, 0)) as gd
+ FROM
+ sample
+ ),
+ tin_polygons as (
+ SELECT
+ (gd).Path as id,
+ (gd).Geom as pg,
+ ST_Centroid(ST_MinimumBoundingCircle((gd).Geom, 180)) as ct
+ FROM tin
+ ),
+ tin_lines as (
+ SELECT
+ id,
+ ST_ExteriorRing(pg) as lg
+ FROM tin_polygons
+ ),
+ tin_nodes as (
+ SELECT
+ id,
+ ST_PointN(lg,1) p1,
+ ST_PointN(lg,2) p2,
+ ST_PointN(lg,3) p3
+ FROM tin_lines
+ ),
+ -- one row per triangle edge, carrying the triangle's circle centre (ct),
+ -- whether the centre lies on the triangle boundary (ctx) and whether it
+ -- lies within the convex hull of the inputs (ctin)
+ tin_edges AS (
+ SELECT
+ p.id,
+ UNNEST(ARRAY[
+ ST_MakeLine(n.p1,n.p2) ,
+ ST_MakeLine(n.p2,n.p3) ,
+ ST_MakeLine(n.p3,n.p1)]) as Edge,
+ ST_Force2D(cdb_crankshaft._Find_Circle(n.p1,n.p2,n.p3)) as ct,
+ CASE WHEN st_distance(p.ct, ST_ExteriorRing(p.pg)) < tolerance THEN
+ TRUE
+ ELSE FALSE END AS ctx,
+ p.pg,
+ ST_within(p.ct, convexhull.cg) as ctin
+ FROM
+ tin_polygons p,
+ tin_nodes n,
+ convexhull
+ WHERE p.id = n.id
+ ),
+ -- end points of each Voronoi edge: the centres of the two triangles that
+ -- share a Delaunay edge; an edge with no neighbouring triangle is extended
+ -- outwards through the edge midpoint, scaled by (1+buffer)
+ voro_nodes as (
+ SELECT
+ CASE WHEN x.ctx = TRUE THEN
+ ST_Centroid(x.edge)
+ ELSE
+ x.ct
+ END as xct,
+ CASE WHEN y.id is null THEN
+ CASE WHEN x.ctin = TRUE THEN
+ ST_SetSRID(ST_MakePoint(
+ ST_X(x.ct) + ((ST_X(ST_Centroid(x.edge)) - ST_X(x.ct)) * (1+buffer)),
+ ST_Y(x.ct) + ((ST_Y(ST_Centroid(x.edge)) - ST_Y(x.ct)) * (1+buffer))
+ ), ST_SRID(x.ct))
+ END
+ ELSE
+ y.ct
+ END as yct
+ FROM
+ tin_edges x
+ LEFT OUTER JOIN
+ tin_edges y
+ ON x.id <> y.id AND ST_Equals(x.edge, y.edge)
+ ),
+ voro_edges as(
+ SELECT
+ ST_LineMerge(ST_Collect(ST_MakeLine(xct, yct))) as v
+ FROM
+ voro_nodes
+ ),
+ -- polygonize the edges together with the ring of their convex hull
+ voro_cells as(
+ SELECT
+ ST_Polygonize(
+ ST_Node(
+ ST_LineMerge(
+ ST_Union(v, ST_ExteriorRing(
+ ST_Convexhull(v)
+ )
+ )
+ )
+ )
+ ) as g
+ FROM
+ voro_edges
+ ),
+ voro_set as(
+ SELECT
+ (st_dump(v.g)).geom as g
+ FROM voro_cells v
+ ),
+ -- keep polygons only and cut them to the clipping area
+ clipped_voro as(
+ SELECT
+ ST_intersection(c.g, v.g) as g
+ FROM
+ voro_set v,
+ clipper c
+ WHERE
+ ST_GeometryType(v.g) = 'ST_Polygon'
+ )
+ SELECT
+ st_collect(
+ ST_Transform(
+ ST_ConvexHull(g),
+ 4326
+ )
+ )
+ INTO geomout
+ FROM
+ clipped_voro;
+ RETURN geomout;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+
+/** ----------------------------------------------------------------------------------------
+ * @function : FindCircle
+ * @precis : Function that determines if three points form a circle. If so a table containing
+ * centre and radius is returned. If not, a null table is returned.
+ * @version : 1.0.1
+ * @param : p_pt1 : First point in curve
+ * @param : p_pt2 : Second point in curve
+ * @param : p_pt3 : Third point in curve
+ * @return : geometry : In which X,Y ordinates are the centre X, Y and the Z being the radius of found circle
+ * or NULL if three points do not form a circle.
+ * @history : Simon Greener - Feb 2012 - Original coding.
+ * Rafa de la Torre - Aug 2016 - Small fix for type checking
+ * Raul Marin - Sept 2017 - Remove unnecessary NULL checks and set function categories
+ * @copyright : Simon Greener @ 2012
+ * Licensed under a Creative Commons Attribution-Share Alike 2.5 Australia License. (http://creativecommons.org/licenses/by-sa/2.5/au/)
+**/
+CREATE OR REPLACE FUNCTION _Find_Circle(
+    IN p_pt1 geometry,
+    IN p_pt2 geometry,
+    IN p_pt3 geometry)
+  RETURNS geometry AS
+$BODY$
+DECLARE
+    -- ordinates of the three input points
+    x1 double precision;
+    y1 double precision;
+    x2 double precision;
+    y2 double precision;
+    x3 double precision;
+    y3 double precision;
+    -- intermediate terms of the circumcentre formula
+    v_dA NUMERIC;
+    v_dB NUMERIC;
+    v_dC NUMERIC;
+    v_dD NUMERIC;
+    v_dE NUMERIC;
+    v_dF NUMERIC;
+    v_dG NUMERIC;
+    -- resulting centre and radius
+    v_CX NUMERIC;
+    v_CY NUMERIC;
+    v_radius NUMERIC;
+BEGIN
+    -- every input has to be a point
+    IF NOT ( ST_GeometryType(p_pt1) = 'ST_Point' AND
+             ST_GeometryType(p_pt2) = 'ST_Point' AND
+             ST_GeometryType(p_pt3) = 'ST_Point' ) THEN
+        RAISE EXCEPTION 'All supplied geometries must be points.';
+    END IF;
+
+    -- read the ordinates only after the type check has passed
+    x1 := ST_X(p_pt1);
+    y1 := ST_Y(p_pt1);
+    x2 := ST_X(p_pt2);
+    y2 := ST_Y(p_pt2);
+    x3 := ST_X(p_pt3);
+    y3 := ST_Y(p_pt3);
+
+    v_dA := x2 - x1;
+    v_dB := y2 - y1;
+    v_dC := x3 - x1;
+    v_dD := y3 - y1;
+    v_dE := v_dA * (x1 + x2) + v_dB * (y1 + y2);
+    v_dF := v_dC * (x1 + x3) + v_dD * (y1 + y3);
+    v_dG := 2.0 * (v_dA * (y3 - y2) - v_dB * (x3 - x2));
+
+    -- A zero v_dG means the three points are collinear: no finite-radius
+    -- circle passes through them.
+    IF v_dG = 0 THEN
+        RETURN NULL;
+    END IF;
+
+    v_CX := (v_dD * v_dE - v_dB * v_dF) / v_dG;
+    v_CY := (v_dA * v_dF - v_dC * v_dE) / v_dG;
+    v_radius := SQRT(POWER(x1 - v_CX,2) + POWER(y1 - v_CY,2) );
+
+    -- centre in X,Y and radius in Z, with the SRID of the first point
+    RETURN ST_SetSRID(ST_MakePoint(v_CX, v_CY, v_radius),ST_Srid(p_pt1));
+END;
+$BODY$
+  LANGUAGE plpgsql IMMUTABLE STRICT PARALLEL SAFE;
+
+-- Moran's I Global Measure (public-facing)
+--
+-- Instantiates crankshaft.clustering.Moran and forwards every argument, in
+-- declaration order, to Moran.global_stat. The result is returned as
+-- (moran, significance) rows.
+-- Defaults: w_type 'knn', num_ngbrs 5, permutations 99,
+-- geom_col 'the_geom', id_col 'cartodb_id'.
+CREATE OR REPLACE FUNCTION
+ CDB_AreasOfInterestGlobal(
+ subquery TEXT,
+ column_name TEXT,
+ w_type TEXT DEFAULT 'knn',
+ num_ngbrs INT DEFAULT 5,
+ permutations INT DEFAULT 99,
+ geom_col TEXT DEFAULT 'the_geom',
+ id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (moran NUMERIC, significance NUMERIC)
+AS $$
+ from crankshaft.clustering import Moran
+ # TODO: use named parameters or a dictionary
+ moran = Moran()
+ return moran.global_stat(subquery, column_name, w_type,
+ num_ngbrs, permutations, geom_col, id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local (internal function) - DEPRECATED
+--
+-- Calls Moran.local_stat with all arguments in declaration order and
+-- re-orders each result tuple into the legacy five-column layout
+-- (moran, quads, significance, rowid, vals).
+CREATE OR REPLACE FUNCTION
+ _CDB_AreasOfInterestLocal(
+ subquery TEXT,
+ column_name TEXT,
+ w_type TEXT,
+ num_ngbrs INT,
+ permutations INT,
+ geom_col TEXT,
+ id_col TEXT)
+RETURNS TABLE (
+ moran NUMERIC,
+ quads TEXT,
+ significance NUMERIC,
+ rowid INT,
+ vals NUMERIC)
+AS $$
+ from crankshaft.clustering import Moran
+ moran = Moran()
+ result = moran.local_stat(subquery, column_name, w_type,
+ num_ngbrs, permutations, geom_col, id_col)
+ # remove spatial lag
+ # Going by the column order _CDB_MoransILocal declares for the same
+ # local_stat result: r[6] moran_stat, r[0] quads, r[1] significance,
+ # r[7] rowid.
+ # NOTE(review): by that order r[5] is orig_val_std, while
+ # _CDB_AreasOfInterestLocalRate takes r[4] for vals - confirm which index
+ # is intended here.
+ return [(r[6], r[0], r[1], r[7], r[5]) for r in result]
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local (internal function)
+--
+-- Returns the tuples produced by Moran.local_stat unchanged; the RETURNS
+-- TABLE column list below therefore fixes the order in which local_stat is
+-- expected to deliver its values.
+CREATE OR REPLACE FUNCTION
+ _CDB_MoransILocal(
+ subquery TEXT,
+ column_name TEXT,
+ w_type TEXT,
+ num_ngbrs INT,
+ permutations INT,
+ geom_col TEXT,
+ id_col TEXT)
+RETURNS TABLE (
+ quads TEXT,
+ significance NUMERIC,
+ spatial_lag NUMERIC,
+ spatial_lag_std NUMERIC,
+ orig_val NUMERIC,
+ orig_val_std NUMERIC,
+ moran_stat NUMERIC,
+ rowid INT)
+AS $$
+
+from crankshaft.clustering import Moran
+moran = Moran()
+return moran.local_stat(subquery, column_name, w_type,
+ num_ngbrs, permutations, geom_col, id_col)
+
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+
+-- Moran's I Local (public-facing function)
+-- Replaces CDB_AreasOfInterestLocal
+--
+-- Supplies the defaults and forwards every argument to
+-- cdb_crankshaft._CDB_MoransILocal, returning its columns unchanged.
+CREATE OR REPLACE FUNCTION
+  CDB_MoransILocal(
+    subquery TEXT,
+    column_name TEXT,
+    w_type TEXT DEFAULT 'knn',
+    num_ngbrs INT DEFAULT 5,
+    permutations INT DEFAULT 99,
+    geom_col TEXT DEFAULT 'the_geom',
+    id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (
+    quads TEXT,
+    significance NUMERIC,
+    spatial_lag NUMERIC,
+    spatial_lag_std NUMERIC,
+    orig_val NUMERIC,
+    orig_val_std NUMERIC,
+    moran_stat NUMERIC,
+    rowid INT)
+AS $$
+    SELECT
+        lisa.quads,
+        lisa.significance,
+        lisa.spatial_lag,
+        lisa.spatial_lag_std,
+        lisa.orig_val,
+        lisa.orig_val_std,
+        lisa.moran_stat,
+        lisa.rowid
+    FROM cdb_crankshaft._CDB_MoransILocal(
+        subquery, column_name, w_type,
+        num_ngbrs, permutations, geom_col, id_col) AS lisa;
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local (public-facing function) - DEPRECATED
+--
+-- Supplies the defaults and forwards every argument to
+-- cdb_crankshaft._CDB_AreasOfInterestLocal, returning all of its rows.
+CREATE OR REPLACE FUNCTION
+  CDB_AreasOfInterestLocal(
+    subquery TEXT,
+    column_name TEXT,
+    w_type TEXT DEFAULT 'knn',
+    num_ngbrs INT DEFAULT 5,
+    permutations INT DEFAULT 99,
+    geom_col TEXT DEFAULT 'the_geom',
+    id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+    SELECT
+        aoi.moran,
+        aoi.quads,
+        aoi.significance,
+        aoi.rowid,
+        aoi.vals
+    FROM cdb_crankshaft._CDB_AreasOfInterestLocal(
+        subquery, column_name, w_type,
+        num_ngbrs, permutations, geom_col, id_col) AS aoi;
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I only for HH and HL (public-facing function)
+--
+-- Same output as cdb_crankshaft._CDB_AreasOfInterestLocal, restricted to the
+-- rows whose quads value is 'HH' or 'HL'.
+CREATE OR REPLACE FUNCTION
+  CDB_GetSpatialHotspots(
+    subquery TEXT,
+    column_name TEXT,
+    w_type TEXT DEFAULT 'knn',
+    num_ngbrs INT DEFAULT 5,
+    permutations INT DEFAULT 99,
+    geom_col TEXT DEFAULT 'the_geom',
+    id_col TEXT DEFAULT 'cartodb_id')
+  RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+    SELECT
+        aoi.moran,
+        aoi.quads,
+        aoi.significance,
+        aoi.rowid,
+        aoi.vals
+    FROM cdb_crankshaft._CDB_AreasOfInterestLocal(
+        subquery, column_name, w_type,
+        num_ngbrs, permutations, geom_col, id_col) AS aoi
+    WHERE aoi.quads IN ('HH', 'HL');
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I only for LL and LH (public-facing function)
+--
+-- Same output as cdb_crankshaft._CDB_AreasOfInterestLocal, restricted to the
+-- rows whose quads value is 'LL' or 'LH'.
+CREATE OR REPLACE FUNCTION
+  CDB_GetSpatialColdspots(
+    subquery TEXT,
+    attr TEXT,
+    w_type TEXT DEFAULT 'knn',
+    num_ngbrs INT DEFAULT 5,
+    permutations INT DEFAULT 99,
+    geom_col TEXT DEFAULT 'the_geom',
+    id_col TEXT DEFAULT 'cartodb_id')
+  RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+    SELECT
+        aoi.moran,
+        aoi.quads,
+        aoi.significance,
+        aoi.rowid,
+        aoi.vals
+    FROM cdb_crankshaft._CDB_AreasOfInterestLocal(
+        subquery, attr, w_type,
+        num_ngbrs, permutations, geom_col, id_col) AS aoi
+    WHERE aoi.quads IN ('LL', 'LH');
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I only for LH and HL (public-facing function)
+--
+-- Same output as cdb_crankshaft._CDB_AreasOfInterestLocal, restricted to the
+-- rows whose quads value is 'HL' or 'LH'.
+CREATE OR REPLACE FUNCTION
+  CDB_GetSpatialOutliers(
+    subquery TEXT,
+    attr TEXT,
+    w_type TEXT DEFAULT 'knn',
+    num_ngbrs INT DEFAULT 5,
+    permutations INT DEFAULT 99,
+    geom_col TEXT DEFAULT 'the_geom',
+    id_col TEXT DEFAULT 'cartodb_id')
+  RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+    SELECT
+        aoi.moran,
+        aoi.quads,
+        aoi.significance,
+        aoi.rowid,
+        aoi.vals
+    FROM cdb_crankshaft._CDB_AreasOfInterestLocal(
+        subquery, attr, w_type,
+        num_ngbrs, permutations, geom_col, id_col) AS aoi
+    WHERE aoi.quads IN ('HL', 'LH');
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Global Rate (public-facing function)
+--
+-- Instantiates crankshaft.clustering.Moran and forwards every argument, in
+-- declaration order, to Moran.global_rate_stat.
+-- Note that the result columns are FLOAT here, whereas
+-- CDB_AreasOfInterestGlobal declares them as NUMERIC.
+CREATE OR REPLACE FUNCTION
+ CDB_AreasOfInterestGlobalRate(
+ subquery TEXT,
+ numerator TEXT,
+ denominator TEXT,
+ w_type TEXT DEFAULT 'knn',
+ num_ngbrs INT DEFAULT 5,
+ permutations INT DEFAULT 99,
+ geom_col TEXT DEFAULT 'the_geom',
+ id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (moran FLOAT, significance FLOAT)
+AS $$
+ from crankshaft.clustering import Moran
+ moran = Moran()
+ # TODO: use named parameters or a dictionary
+ return moran.global_rate_stat(subquery, numerator, denominator, w_type,
+ num_ngbrs, permutations, geom_col, id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+
+-- Moran's I Local Rate (internal function) - DEPRECATED
+--
+-- Calls Moran.local_rate_stat with all arguments in declaration order and
+-- re-orders each result tuple into the legacy five-column layout
+-- (moran, quads, significance, rowid, vals).
+CREATE OR REPLACE FUNCTION
+ _CDB_AreasOfInterestLocalRate(
+ subquery TEXT,
+ numerator TEXT,
+ denominator TEXT,
+ w_type TEXT,
+ num_ngbrs INT,
+ permutations INT,
+ geom_col TEXT,
+ id_col TEXT)
+RETURNS
+TABLE(
+ moran NUMERIC,
+ quads TEXT,
+ significance NUMERIC,
+ rowid INT,
+ vals NUMERIC)
+AS $$
+ from crankshaft.clustering import Moran
+ moran = Moran()
+ # TODO: use named parameters or a dictionary
+ result = moran.local_rate_stat(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
+ # remove spatial lag
+ # Going by the column order _CDB_MoransILocalRate declares for the same
+ # local_rate_stat result: r[6] moran_stat, r[0] quads, r[1] significance,
+ # r[7] rowid, r[4] orig_val.
+ return [(r[6], r[0], r[1], r[7], r[4]) for r in result]
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local Rate (public-facing function) - DEPRECATED
+--
+-- Supplies the defaults and forwards every argument to
+-- cdb_crankshaft._CDB_AreasOfInterestLocalRate, returning all of its rows.
+CREATE OR REPLACE FUNCTION
+  CDB_AreasOfInterestLocalRate(
+    subquery TEXT,
+    numerator TEXT,
+    denominator TEXT,
+    w_type TEXT DEFAULT 'knn',
+    num_ngbrs INT DEFAULT 5,
+    permutations INT DEFAULT 99,
+    geom_col TEXT DEFAULT 'the_geom',
+    id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+    SELECT
+        aoi.moran,
+        aoi.quads,
+        aoi.significance,
+        aoi.rowid,
+        aoi.vals
+    FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(
+        subquery, numerator, denominator, w_type,
+        num_ngbrs, permutations, geom_col, id_col) AS aoi;
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Internal function
+--
+-- Moran's I Local Rate: returns the tuples produced by
+-- Moran.local_rate_stat unchanged; the RETURNS TABLE column list below
+-- therefore fixes the order in which local_rate_stat is expected to deliver
+-- its values.
+CREATE OR REPLACE FUNCTION
+ _CDB_MoransILocalRate(
+ subquery TEXT,
+ numerator TEXT,
+ denominator TEXT,
+ w_type TEXT,
+ num_ngbrs INT,
+ permutations INT,
+ geom_col TEXT,
+ id_col TEXT)
+RETURNS
+TABLE(
+ quads TEXT,
+ significance NUMERIC,
+ spatial_lag NUMERIC,
+ spatial_lag_std NUMERIC,
+ orig_val NUMERIC,
+ orig_val_std NUMERIC,
+ moran_stat NUMERIC,
+ rowid INT)
+AS $$
+from crankshaft.clustering import Moran
+moran = Moran()
+return moran.local_rate_stat(
+ subquery,
+ numerator,
+ denominator,
+ w_type,
+ num_ngbrs,
+ permutations,
+ geom_col,
+ id_col
+)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Rate
+-- Replaces CDB_AreasOfInterestLocalRate
+--
+-- Supplies the defaults and forwards every argument to
+-- cdb_crankshaft._CDB_MoransILocalRate, returning its columns unchanged.
+CREATE OR REPLACE FUNCTION
+  CDB_MoransILocalRate(
+    subquery TEXT,
+    numerator TEXT,
+    denominator TEXT,
+    w_type TEXT DEFAULT 'knn',
+    num_ngbrs INT DEFAULT 5,
+    permutations INT DEFAULT 99,
+    geom_col TEXT DEFAULT 'the_geom',
+    id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(
+    quads TEXT,
+    significance NUMERIC,
+    spatial_lag NUMERIC,
+    spatial_lag_std NUMERIC,
+    orig_val NUMERIC,
+    orig_val_std NUMERIC,
+    moran_stat NUMERIC,
+    rowid INT)
+AS $$
+    SELECT
+        lisa.quads,
+        lisa.significance,
+        lisa.spatial_lag,
+        lisa.spatial_lag_std,
+        lisa.orig_val,
+        lisa.orig_val_std,
+        lisa.moran_stat,
+        lisa.rowid
+    FROM cdb_crankshaft._CDB_MoransILocalRate(
+        subquery, numerator, denominator, w_type,
+        num_ngbrs, permutations, geom_col, id_col) AS lisa;
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local Rate only for HH and HL (public-facing function)
+--
+-- Same output as cdb_crankshaft._CDB_AreasOfInterestLocalRate, restricted to
+-- the rows whose quads value is 'HH' or 'HL'.
+CREATE OR REPLACE FUNCTION
+  CDB_GetSpatialHotspotsRate(
+    subquery TEXT,
+    numerator TEXT,
+    denominator TEXT,
+    w_type TEXT DEFAULT 'knn',
+    num_ngbrs INT DEFAULT 5,
+    permutations INT DEFAULT 99,
+    geom_col TEXT DEFAULT 'the_geom',
+    id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+    SELECT
+        aoi.moran,
+        aoi.quads,
+        aoi.significance,
+        aoi.rowid,
+        aoi.vals
+    FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(
+        subquery, numerator, denominator, w_type,
+        num_ngbrs, permutations, geom_col, id_col) AS aoi
+    WHERE aoi.quads IN ('HH', 'HL');
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local Rate only for LL and LH (public-facing function)
+--
+-- Same output as cdb_crankshaft._CDB_AreasOfInterestLocalRate, restricted to
+-- the rows whose quads value is 'LL' or 'LH'.
+CREATE OR REPLACE FUNCTION
+  CDB_GetSpatialColdspotsRate(
+    subquery TEXT,
+    numerator TEXT,
+    denominator TEXT,
+    w_type TEXT DEFAULT 'knn',
+    num_ngbrs INT DEFAULT 5,
+    permutations INT DEFAULT 99,
+    geom_col TEXT DEFAULT 'the_geom',
+    id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+    SELECT
+        aoi.moran,
+        aoi.quads,
+        aoi.significance,
+        aoi.rowid,
+        aoi.vals
+    FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(
+        subquery, numerator, denominator, w_type,
+        num_ngbrs, permutations, geom_col, id_col) AS aoi
+    WHERE aoi.quads IN ('LL', 'LH');
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+
+-- Moran's I Local Rate only for LH and HL (public-facing function)
+--
+-- Same output as cdb_crankshaft._CDB_AreasOfInterestLocalRate, restricted to
+-- the rows whose quads value is 'HL' or 'LH'.
+CREATE OR REPLACE FUNCTION
+  CDB_GetSpatialOutliersRate(
+    subquery TEXT,
+    numerator TEXT,
+    denominator TEXT,
+    w_type TEXT DEFAULT 'knn',
+    num_ngbrs INT DEFAULT 5,
+    permutations INT DEFAULT 99,
+    geom_col TEXT DEFAULT 'the_geom',
+    id_col TEXT DEFAULT 'cartodb_id')
+RETURNS
+TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
+AS $$
+    SELECT
+        aoi.moran,
+        aoi.quads,
+        aoi.significance,
+        aoi.rowid,
+        aoi.vals
+    FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(
+        subquery, numerator, denominator, w_type,
+        num_ngbrs, permutations, geom_col, id_col) AS aoi
+    WHERE aoi.quads IN ('HL', 'LH');
+$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
+-- Spatial k-means clustering
+-- query: sql query passed to the Python implementation
+-- no_clusters: number of requested clusters
+-- no_init: forwarded as the third argument of Kmeans.spatial (default 20);
+-- presumably the number of initialisations - confirm in
+-- crankshaft.clustering
+-- Returns one (cartodb_id, cluster_no) row per result tuple.
+
+CREATE OR REPLACE FUNCTION CDB_KMeans(
+ query TEXT,
+ no_clusters INTEGER,
+ no_init INTEGER DEFAULT 20
+)
+RETURNS TABLE(
+ cartodb_id INTEGER,
+ cluster_no INTEGER
+) AS $$
+
+from crankshaft.clustering import Kmeans
+kmeans = Kmeans()
+return kmeans.spatial(query, no_clusters, no_init)
+
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- Non-spatial k-means clustering
+-- query: sql query to retrieve all the needed data
+-- colnames: text array of column names for doing the clustering analysis
+-- no_clusters: number of requested clusters
+-- standardize: whether to scale variables to a mean of zero and a standard
+-- deviation of 1 (default true)
+-- id_col: name of the id column (default 'cartodb_id')
+-- The first three arguments are passed to Kmeans.nonspatial positionally,
+-- standardize and id_col as keyword arguments.
+
+CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial(
+ query TEXT,
+ colnames TEXT[],
+ no_clusters INTEGER,
+ standardize BOOLEAN DEFAULT true,
+ id_col TEXT DEFAULT 'cartodb_id'
+)
+RETURNS TABLE(
+ cluster_label text,
+ cluster_center json,
+ silhouettes numeric,
+ inertia numeric,
+ rowid bigint
+) AS $$
+
+from crankshaft.clustering import Kmeans
+kmeans = Kmeans()
+return kmeans.nonspatial(query, colnames, no_clusters,
+ standardize=standardize,
+ id_col=id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+
+-- State transition function of the CDB_WeightedMean aggregate.
+--
+-- state holds three running sums: [1] x * weight, [2] y * weight,
+-- [3] weight. A row whose weight or geometry is NULL leaves them unchanged.
+CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(
+    state NUMERIC[],
+    the_geom GEOMETRY(Point, 4326),
+    weight NUMERIC
+)
+RETURNS Numeric[] AS $$
+DECLARE
+    sum_x NUMERIC := state[1];
+    sum_y NUMERIC := state[2];
+    sum_w NUMERIC := state[3];
+BEGIN
+    IF weight IS NOT NULL AND the_geom IS NOT NULL THEN
+        sum_x := sum_x + ST_X(the_geom)*weight;
+        sum_y := sum_y + ST_Y(the_geom)*weight;
+        sum_w := sum_w + weight;
+    END IF;
+
+    RETURN ARRAY[sum_x, sum_y, sum_w];
+END
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
+
+
+-- Final function of the CDB_WeightedMean aggregate.
+--
+-- Divides the accumulated x and y sums (state[1], state[2]) by the
+-- accumulated weight (state[3]) and returns the point with SRID 4326. With a
+-- total weight of 0 the sums are used undivided.
+CREATE OR REPLACE FUNCTION CDB_WeightedMeanF(state NUMERIC[])
+RETURNS GEOMETRY AS
+$$
+DECLARE
+    sum_x NUMERIC := state[1];
+    sum_y NUMERIC := state[2];
+    sum_w NUMERIC := state[3];
+BEGIN
+    -- guard with IF rather than CASE so the division is never evaluated
+    -- for a zero weight
+    IF sum_w = 0 THEN
+        RETURN ST_SetSRID(ST_MakePoint(sum_x, sum_y), 4326);
+    END IF;
+
+    RETURN ST_SetSRID(ST_MakePoint(sum_x/sum_w, sum_y/sum_w), 4326);
+END
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
+
+-- Create the CDB_WeightedMean aggregate unless schema cdb_crankshaft already
+-- contains an aggregate with that name.
+DO $$
+BEGIN
+    IF EXISTS (
+        SELECT 1
+        FROM pg_catalog.pg_proc AS p
+        INNER JOIN pg_catalog.pg_namespace AS n
+            ON n.oid = p.pronamespace
+        WHERE n.nspname = 'cdb_crankshaft'
+          AND p.proname = 'cdb_weightedmean'
+          AND p.proisagg
+    ) THEN
+        -- already defined, nothing to do
+        RETURN;
+    END IF;
+
+    CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
+        SFUNC = CDB_WeightedMeanS,
+        FINALFUNC = CDB_WeightedMeanF,
+        STYPE = Numeric[],
+        PARALLEL = SAFE,
+        INITCOND = "{0.0,0.0,0.0}"
+    );
+END
+$$ LANGUAGE plpgsql;
+-- Spatial Markov
+
+-- input table format:
+-- id | geom | date_1 | date_2 | date_3
+-- 1 | Pt1 | 12.3 | 13.1 | 14.2
+-- 2 | Pt2 | 11.0 | 13.2 | 12.5
+-- ...
+-- Sample Function call:
+-- SELECT CDB_SpatialMarkovTrend('SELECT * FROM real_estate',
+-- Array['date_1', 'date_2', 'date_3'])
+
+-- Instantiates crankshaft.space_time_dynamics.Markov and forwards every
+-- argument, in declaration order, to Markov.spatial_trend.
+-- time_cols is the array of column names holding the successive
+-- observations (see the input table format in the file comments).
+-- Defaults: num_classes 7, w_type 'knn', num_ngbrs 5, permutations 99,
+-- geom_col 'the_geom', id_col 'cartodb_id'.
+CREATE OR REPLACE FUNCTION
+ CDB_SpatialMarkovTrend (
+ subquery TEXT,
+ time_cols TEXT[],
+ num_classes INT DEFAULT 7,
+ w_type TEXT DEFAULT 'knn',
+ num_ngbrs INT DEFAULT 5,
+ permutations INT DEFAULT 99,
+ geom_col TEXT DEFAULT 'the_geom',
+ id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (trend NUMERIC, trend_up NUMERIC, trend_down NUMERIC, volatility NUMERIC, rowid INT)
+AS $$
+
+ from crankshaft.space_time_dynamics import Markov
+ markov = Markov()
+
+ ## TODO: use named parameters or a dictionary
+ return markov.spatial_trend(subquery, time_cols, num_classes, w_type, num_ngbrs, permutations, geom_col, id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- input table format: identical to above but in a predictable format
+-- Sample function call:
+-- SELECT cdb_spatial_markov('SELECT * FROM real_estate',
+-- 'date_1')
+
+
+-- CREATE OR REPLACE FUNCTION
+-- cdb_spatial_markov (
+-- subquery TEXT,
+-- time_col_min text,
+-- time_col_max text,
+-- date_format text, -- '_YYYY_MM_DD'
+-- num_time_per_bin INT DEFAULT 1,
+-- permutations INT DEFAULT 99,
+-- geom_column TEXT DEFAULT 'the_geom',
+-- id_col TEXT DEFAULT 'cartodb_id',
+-- w_type TEXT DEFAULT 'knn',
+-- num_ngbrs int DEFAULT 5)
+-- RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT)
+-- AS $$
+-- plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
+-- from crankshaft.clustering import moran_local
+-- # TODO: use named parameters or a dictionary
+-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs)
+-- $$ LANGUAGE plpythonu;
+--
+-- -- input table format:
+-- -- id | geom | date | measurement
+-- -- 1 | Pt1 | 12/3 | 13.2
+-- -- 2 | Pt2 | 11/5 | 11.3
+-- -- 3 | Pt1 | 11/13 | 12.9
+-- -- 4 | Pt3 | 12/19 | 10.1
+-- -- ...
+--
+-- CREATE OR REPLACE FUNCTION
+-- cdb_spatial_markov (
+-- subquery TEXT,
+-- time_col text,
+-- num_time_per_bin INT DEFAULT 1,
+-- permutations INT DEFAULT 99,
+-- geom_column TEXT DEFAULT 'the_geom',
+-- id_col TEXT DEFAULT 'cartodb_id',
+-- w_type TEXT DEFAULT 'knn',
+-- num_ngbrs int DEFAULT 5)
+-- RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT)
+-- AS $$
+-- plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
+-- from crankshaft.clustering import moran_local
+-- # TODO: use named parameters or a dictionary
+-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs)
+-- $$ LANGUAGE plpythonu;
+-- Based on:
+-- https://github.com/mapbox/polylabel/blob/master/index.js
+-- https://sites.google.com/site/polesofinaccessibility/
+-- Requires: https://github.com/CartoDB/cartodb-postgresql
+
+-- Pole of inaccessibility (PIA) of a polygon.
+--
+-- Based on:
+-- https://github.com/mapbox/polylabel/blob/master/index.js
+-- https://sites.google.com/site/polesofinaccessibility/
+-- Requires: https://github.com/CartoDB/cartodb-postgresql
+--
+-- The polygon is transformed to SRID 3857 and covered with a coarse grid.
+-- Cells are then subdivided iteratively, keeping the cell whose centroid has
+-- the largest signed distance to the polygon boundary.
+--
+-- @param polygon   geometry to analyse (transformed to SRID 3857 internally)
+--
+-- @param tolerance a cell is only subdivided while it could still beat the
+--                  best distance found so far by more than this amount
+--
+-- Returns: the centroid of the best cell, transformed to SRID 4326;
+--          NULL when the input is NULL or an empty geometry
+--
+CREATE OR REPLACE FUNCTION CDB_PIA(
+    IN polygon geometry,
+    IN tolerance numeric DEFAULT 1.0
+    )
+RETURNS geometry AS $$
+DECLARE
+    cells geometry[];
+    cell geometry;
+    best_c geometry;
+    best_d numeric;
+    test_d numeric;
+    test_mx numeric;
+    test_h numeric;
+    test_cells geometry[];
+    width numeric;
+    height numeric;
+    h numeric;
+    i integer;
+    n integer;
+    sqr numeric;
+BEGIN
+    -- A NULL or empty geometry produces no grid cells; without this guard the
+    -- refinement loop below would never reach its exit condition.
+    IF polygon IS NULL OR ST_IsEmpty(polygon) THEN
+        RETURN NULL;
+    END IF;
+
+    -- half of sqrt(2): multiplied by a cell side it gives the half diagonal
+    sqr := 0.5*(|/2.0);
+    polygon := ST_Transform(polygon, 3857);
+
+    -- grid #0 cell size
+    height := ST_YMax(polygon) - ST_YMin(polygon);
+    width := ST_XMax(polygon) - ST_XMin(polygon);
+    h := 0.5*LEAST(height, width);
+
+    -- grid #0
+    with c1 as(
+        SELECT cdb_crankshaft.CDB_RectangleGrid(polygon, h, h) as c
+    )
+    SELECT array_agg(c) INTO cells FROM c1;
+
+    -- 1st guess: centroid (best_c holds the polygon itself, its centroid is
+    -- taken on return)
+    best_c := polygon;
+    best_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(polygon));
+
+    -- walk the cell queue; new cells are appended while iterating
+    n := COALESCE(array_length(cells, 1), 0);
+    i := 1;
+    LOOP
+
+        EXIT WHEN i > n;
+
+        cell := cells[i];
+
+        i := i+1;
+
+        -- cell side size, it's square
+        test_h := ST_XMax(cell) - ST_XMin(cell);
+
+        -- check distance
+        test_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(cell));
+
+        IF test_d > best_d THEN
+            best_d := test_d;
+            best_c := cell;
+        END IF;
+
+        -- longest distance within the cell
+        test_mx := test_d + (test_h * sqr);
+
+        -- if the cell has no chance to contain the desired point, continue
+        CONTINUE WHEN test_mx - best_d <= tolerance;
+
+        -- resample the cell
+        with c1 as(
+            SELECT cdb_crankshaft.CDB_RectangleGrid(cell, test_h/2, test_h/2) as c
+        )
+        SELECT array_agg(c) INTO test_cells FROM c1;
+
+        -- concat the new cells to the former array
+        cells := cells || test_cells;
+
+        -- prepare next iteration
+        n := COALESCE(array_length(cells, 1), 0);
+
+    END LOOP;
+
+    RETURN ST_Transform(ST_Centroid(best_c), 4326);
+
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+
+
+
+-- Signed distance from a point to the rings of a polygon (holes included).
+-- The result is negative when the point is not within the polygon.
+-- rev 1. adding MULTIPOLYGON and GEOMETRYCOLLECTION support by @abelvm
+--
+-- Every part returned by ST_Dump is inspected; the sign is taken from the
+-- part that owns the nearest ring.
+CREATE OR REPLACE FUNCTION _Signed_Dist(
+    IN polygon geometry,
+    IN point geometry
+    )
+RETURNS numeric AS $$
+DECLARE
+    parts geometry[];
+    part geometry;
+    nearest numeric := 1e999;
+    nearest_sign integer;
+    candidate numeric;
+    part_sign integer;
+    ring_count integer;
+BEGIN
+    SELECT array_agg(d.geom) INTO parts FROM ST_Dump(polygon) AS d;
+
+    FOR part_idx IN 1..array_length(parts, 1)
+    LOOP
+        part := parts[part_idx];
+
+        -- distance to the outer ring, then to every hole
+        candidate := LEAST(nearest, ST_Distance(point, ST_ExteriorRing(part))::numeric);
+        part_sign := CASE WHEN ST_Within(point, part) THEN 1 ELSE -1 END;
+        ring_count := ST_NumInteriorRings(part);
+
+        IF ring_count > 0 THEN
+            FOR ring_idx IN 1..ring_count
+            LOOP
+                candidate := LEAST(candidate, ST_Distance(point, ST_InteriorRingN(part, ring_idx))::numeric);
+            END LOOP;
+        END IF;
+
+        -- keep the closest part seen so far together with its sign
+        IF candidate < nearest THEN
+            nearest := candidate;
+            nearest_sign := part_sign;
+        END IF;
+    END LOOP;
+
+    RETURN nearest * nearest_sign::numeric;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+--
+-- Iterative densification of a set of points using Delaunay triangulation.
+-- On every iteration the centroid of each triangle is added as a new point;
+-- its value is the average of the values found for the triangle's first
+-- three vertices.
+--
+-- @param geomin - array of geometries (points)
+--
+-- @param colin - array of numeric values at those points, in the same order
+--
+-- @param iterations - integer, number of iterations
+--
+--
+-- Returns: TABLE(geomout geometry, colout numeric) with the input points
+--          followed by the generated ones. When a triangulation yields no
+--          triangles the densification stops and the points gathered so far
+--          are returned.
+--
+CREATE OR REPLACE FUNCTION CDB_Densify(
+    IN geomin geometry[],
+    IN colin numeric[],
+    IN iterations integer
+    )
+RETURNS TABLE(geomout geometry, colout numeric) AS $$
+DECLARE
+    geotemp geometry[];
+    coltemp numeric[];
+    gs geometry[];
+    g geometry;
+    vertex geometry[];
+    va numeric;
+    vb numeric;
+    vc numeric;
+BEGIN
+    geotemp := geomin;
+    coltemp := colin;
+    FOR i IN 1..iterations
+    LOOP
+        -- generate TIN
+        WITH a as (SELECT unnest(geotemp) AS e),
+        b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
+        c as (SELECT (ST_Dump(t)).geom AS v FROM b)
+        SELECT array_agg(v) INTO gs FROM c;
+
+        -- FOREACH raises an error on a NULL array, so stop when the
+        -- triangulation produced no triangles
+        EXIT WHEN gs IS NULL;
+
+        -- loop cells
+        FOREACH g IN ARRAY gs
+        LOOP
+            -- vertices of the triangle
+            WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
+            SELECT array_agg(v) INTO vertex FROM a;
+
+            -- retrieve the value of each vertex; geotemp and coltemp are
+            -- read side by side, so both arrays must have the same length
+            -- here (the new centroid is appended only afterwards)
+            SELECT t.val INTO va FROM unnest(geotemp, coltemp) AS t(pt, val)
+            WHERE ST_Equals(t.pt, vertex[1]);
+            SELECT t.val INTO vb FROM unnest(geotemp, coltemp) AS t(pt, val)
+            WHERE ST_Equals(t.pt, vertex[2]);
+            SELECT t.val INTO vc FROM unnest(geotemp, coltemp) AS t(pt, val)
+            WHERE ST_Equals(t.pt, vertex[3]);
+
+            -- append the centroid and the value calculated for it together
+            geotemp := array_append(geotemp, ST_Centroid(g));
+            coltemp := array_append(coltemp, (va + vb + vc) / 3);
+        END LOOP;
+    END LOOP;
+    RETURN QUERY
+    SELECT t.pt, t.val FROM unnest(geotemp, coltemp) AS t(pt, val);
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+--
+-- TIN map: densifies the input points with CDB_Densify, triangulates the
+-- densified set and returns every triangle together with the average of the
+-- values found for its first three vertices.
+--
+-- @param geomin - array of geometries (points)
+--
+-- @param colin - array of numeric values at those points
+--
+-- @param iterations - integer, number of densification iterations
+--
+-- Returns: TABLE(geomout geometry, colout numeric), one row per triangle;
+--          no rows when the triangulation yields no triangles
+--
+CREATE OR REPLACE FUNCTION CDB_TINmap(
+    IN geomin geometry[],
+    IN colin numeric[],
+    IN iterations integer
+    )
+RETURNS TABLE(geomout geometry, colout numeric) AS $$
+DECLARE
+    p geometry[];
+    vals numeric[];
+    gs geometry[];
+    g geometry;
+    vertex geometry[];
+    va numeric;
+    vb numeric;
+    vc numeric;
+    coltemp numeric[];
+BEGIN
+    SELECT array_agg(dens.geomout), array_agg(dens.colout) INTO p, vals FROM cdb_crankshaft.CDB_Densify(geomin, colin, iterations) dens;
+
+    WITH a as (SELECT unnest(p) AS e),
+    b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
+    c as (SELECT (ST_Dump(t)).geom AS v FROM b)
+    SELECT array_agg(v) INTO gs FROM c;
+
+    -- FOREACH raises an error on a NULL array: nothing to return when the
+    -- triangulation produced no triangles
+    IF gs IS NULL THEN
+        RETURN;
+    END IF;
+
+    FOREACH g IN ARRAY gs
+    LOOP
+        -- retrieve the vertex of each triangle
+        WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
+        SELECT array_agg(v) INTO vertex FROM a;
+
+        -- retrieve the value of each vertex
+        SELECT t.val INTO va FROM unnest(p, vals) AS t(pt, val)
+        WHERE ST_Equals(t.pt, vertex[1]);
+        SELECT t.val INTO vb FROM unnest(p, vals) AS t(pt, val)
+        WHERE ST_Equals(t.pt, vertex[2]);
+        SELECT t.val INTO vc FROM unnest(p, vals) AS t(pt, val)
+        WHERE ST_Equals(t.pt, vertex[3]);
+
+        -- value of the triangle: average of its vertices
+        coltemp := array_append(coltemp, (va + vb + vc) / 3);
+    END LOOP;
+
+    RETURN QUERY
+    SELECT t.tri, t.val FROM unnest(gs, coltemp) AS t(tri, val);
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+-- Getis-Ord's G
+-- Hotspot/Coldspot Analysis tool
+--
+-- Thin SQL entry point: every argument is forwarded, in declaration order,
+-- to crankshaft.clustering.Getis.getis_ord and its result is returned as is.
+CREATE OR REPLACE FUNCTION
+    CDB_GetisOrdsG(
+        subquery TEXT,
+        column_name TEXT,
+        w_type TEXT DEFAULT 'knn',
+        num_ngbrs INT DEFAULT 5,
+        permutations INT DEFAULT 999,
+        geom_col TEXT DEFAULT 'the_geom',
+        id_col TEXT DEFAULT 'cartodb_id')
+RETURNS TABLE (z_score NUMERIC, p_value NUMERIC, p_z_sim NUMERIC, rowid BIGINT)
+AS $$
+  from crankshaft.clustering import Getis
+  return Getis().getis_ord(subquery, column_name, w_type, num_ngbrs,
+                           permutations, geom_col, id_col)
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+-- TODO: make a version that accepts the values as arrays
+
+-- Static-threshold outlier test: a value is an outlier when it is strictly
+-- greater than the given threshold (NULL when either argument is NULL)
+--
+CREATE OR REPLACE FUNCTION CDB_StaticOutlier(column_value numeric, threshold numeric)
+RETURNS boolean
+AS $$
+DECLARE
+    exceeds_threshold boolean;
+BEGIN
+    exceeds_threshold := column_value > threshold;
+    RETURN exceeds_threshold;
+END;
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE ;
+
+-- Ratio-to-mean outlier test: a value is flagged when its ratio to the mean
+-- of all values is greater than outlier_fraction. Raises an exception when
+-- the mean is zero. Flags and ids are returned side by side.
+-- TODO: add symmetric option? `is_symmetric boolean DEFAULT false`
+
+CREATE OR REPLACE FUNCTION CDB_PercentOutlier(column_values numeric[], outlier_fraction numeric, ids int[])
+RETURNS TABLE(is_outlier boolean, rowid int)
+AS $$
+DECLARE
+    mean_val numeric;
+    flags boolean[];
+BEGIN
+    -- mean of the input values
+    mean_val := (SELECT avg(v) FROM unnest(column_values) AS t(v));
+
+    IF mean_val = 0 THEN
+        RAISE EXCEPTION 'Mean value is zero. Try another outlier method.';
+    END IF;
+
+    -- one flag per input value, in input order
+    flags := (
+        SELECT array_agg(outlier_fraction < v / mean_val)
+        FROM unnest(column_values) AS t(v)
+    );
+
+    RETURN QUERY
+    SELECT unnest(flags) AS is_outlier,
+           unnest(ids) AS rowid;
+END;
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
+
+-- Standard-deviation outlier test: a value is flagged when it lies more than
+-- num_deviations standard deviations away from the mean. With is_symmetric
+-- the absolute deviation is used, otherwise only values above the mean can
+-- be flagged. Raises an exception when the standard deviation is zero.
+
+CREATE OR REPLACE FUNCTION CDB_StdDevOutlier(column_values numeric[], num_deviations numeric, ids int[], is_symmetric boolean DEFAULT true)
+RETURNS TABLE(is_outlier boolean, rowid int)
+AS $$
+DECLARE
+    sd_val numeric;
+    mean_val numeric;
+    flags boolean[];
+BEGIN
+    SELECT stddev(v), avg(v) INTO sd_val, mean_val
+    FROM unnest(column_values) AS t(v);
+
+    IF sd_val = 0 THEN
+        RAISE EXCEPTION 'Standard deviation of input data is zero';
+    END IF;
+
+    -- one flag per input value, in input order
+    flags := (
+        SELECT array_agg(
+            (CASE WHEN is_symmetric THEN abs(v - mean_val)
+                  ELSE (v - mean_val)
+             END) / sd_val > num_deviations)
+        FROM unnest(column_values) AS t(v)
+    );
+
+    RETURN QUERY
+    SELECT unnest(flags) AS is_outlier,
+           unnest(ids) AS rowid;
+END;
+$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
+--
+-- Contour map: interpolates the input values over a rectangular grid,
+-- classifies the interpolated cells into bins and returns one merged
+-- geometry per bin.
+--
+-- @param geomin - array of geometries carrying the values
+--
+-- @param colin - array of numeric values, one per geometry
+--
+-- @param buffer - factor used to expand the convex hull of the input
+--                 before the grid is built
+--
+-- @param intmethod - 1 interpolates with _interp_in_tin over a TIN built
+--                    once here; any other value is forwarded to
+--                    CDB_SpatialInterpolation
+--
+-- @param classmethod - 0 equal interval, 1 heads/tails, 2 Jenks,
+--                      anything else quantile
+--
+-- @param steps - number of bins requested from the classifier
+--
+-- @param max_time - see the sign trick at the top of the body: a positive
+--                   value is used directly as the grid cell size, otherwise
+--                   the cell size is derived from an estimated cell count
+--
+-- Returns: TABLE(the_geom, bin, min_value, max_value, avg_value), one row
+--          per non-null bin
+--
+CREATE OR REPLACE FUNCTION CDB_Contour(
+ IN geomin geometry[],
+ IN colin numeric[],
+ IN buffer numeric,
+ IN intmethod integer,
+ IN classmethod integer,
+ IN steps integer,
+ IN max_time integer DEFAULT 60000
+ )
+RETURNS TABLE(
+ the_geom geometry,
+ bin integer,
+ min_value numeric,
+ max_value numeric,
+ avg_value numeric
+) AS $$
+DECLARE
+ cell_count integer;
+ tin geometry[];
+ resolution integer;
+BEGIN
+
+ -- nasty trick to override issue #121
+ -- resolution keeps the value received in max_time (0 is replaced by -90)
+ -- and max_time is then replaced by its negation
+ IF max_time = 0 THEN
+ max_time = -90;
+ END IF;
+ resolution := max_time;
+ max_time := -1 * resolution;
+
+ -- calc the optimal number of cells for the current dataset
+ -- (linear formula per interpolation method on max_time and the number of
+ -- input geometries; 10000 for any other method)
+ SELECT
+ CASE intmethod
+ WHEN 0 THEN round(3.7745903782 * max_time - 9.4399210051 * array_length(geomin,1) - 1350.8778213073)
+ WHEN 1 THEN round(2.2855592156 * max_time - 87.285217133 * array_length(geomin,1) + 17255.7085601797)
+ WHEN 2 THEN round(0.9799471999 * max_time - 127.0334085369 * array_length(geomin,1) + 22707.9579721218)
+ ELSE 10000
+ END INTO cell_count;
+
+ -- we don't have iterative barycentric interpolation in CDB_interpolation,
+ -- and it's a costy function, so let's make a custom one here till
+ -- we update the code
+ -- tin := ARRAY[]::geometry[];
+ IF intmethod=1 THEN
+ WITH
+ a as (SELECT unnest(geomin) AS e),
+ b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
+ c as (SELECT (ST_Dump(t)).geom as v FROM b)
+ SELECT array_agg(v) INTO tin FROM c;
+ END IF;
+ -- Delaunay stuff performed just ONCE!!
+
+ -- magic
+ -- pipeline: convex hull -> expanded envelope -> envelope in 3857 ->
+ -- cell size -> grid (back in 4326) -> interpolated value per cell ->
+ -- class breaks -> bucket per cell -> one merged geometry per bucket
+ RETURN QUERY
+ WITH
+ convexhull as (
+ SELECT
+ ST_ConvexHull(ST_Collect(geomin)) as g,
+ buffer * |/ st_area(ST_ConvexHull(ST_Collect(geomin)))/PI() as r
+ ),
+ envelope as (
+ SELECT
+ st_expand(a.g, a.r) as e
+ FROM convexhull a
+ ),
+ envelope3857 as(
+ SELECT
+ ST_Transform(e, 3857) as geom
+ FROM envelope
+ ),
+ -- NOTE(review): this CTE shares its name with the local variable
+ -- resolution; the references inside the CASE presumably resolve to the
+ -- variable, as the CTE exposes no column of that name - confirm
+ resolution as(
+ SELECT
+ CASE WHEN resolution <= 0 THEN
+ round(|/ (
+ ST_area(geom) / abs(cell_count)
+ ))
+ ELSE
+ resolution
+ END AS cell
+ FROM envelope3857
+ ),
+ grid as(
+ SELECT
+ ST_Transform(cdb_crankshaft.CDB_RectangleGrid(e.geom, r.cell, r.cell), 4326) as geom
+ FROM envelope3857 e, resolution r
+ ),
+ interp as(
+ SELECT
+ geom,
+ CASE
+ WHEN intmethod=1 THEN cdb_crankshaft._interp_in_tin(geomin, colin, tin, ST_Centroid(geom))
+ ELSE cdb_crankshaft.CDB_SpatialInterpolation(geomin, colin, ST_Centroid(geom), intmethod)
+ END as val
+ FROM grid
+ ),
+ -- class breaks computed over the non-null interpolated values
+ classes as(
+ SELECT CASE
+ WHEN classmethod = 0 THEN
+ cdb_crankshaft.CDB_EqualIntervalBins(array_agg(val), steps)
+ WHEN classmethod = 1 THEN
+ cdb_crankshaft.CDB_HeadsTailsBins(array_agg(val), steps)
+ WHEN classmethod = 2 THEN
+ cdb_crankshaft.CDB_JenksBins(array_agg(val), steps)
+ ELSE
+ cdb_crankshaft.CDB_QuantileBins(array_agg(val), steps)
+ END as b
+ FROM interp
+ where val is not null
+ ),
+ -- classes holds a single row, so ON 1=1 attaches it to every cell
+ classified as(
+ SELECT
+ i.*,
+ width_bucket(i.val, c.b) as bucket
+ FROM interp i left join classes c
+ ON 1=1
+ ),
+ -- a bucket equal to steps is folded into the previous one
+ classified2 as(
+ SELECT
+ geom,
+ val,
+ CASE
+ WHEN bucket = steps THEN bucket - 1
+ ELSE bucket
+ END as b
+ FROM classified
+ ),
+ final as(
+ SELECT
+ st_union(geom) as the_geom,
+ b as bin,
+ min(val) as min_value,
+ max(val) as max_value,
+ avg(val) as avg_value
+ FROM classified2
+ GROUP BY bin
+ )
+ SELECT
+ *
+ FROM final
+ where final.bin is not null
+ ;
+END;
+$$ language plpgsql VOLATILE PARALLEL RESTRICTED;
+
+
+-- =====================================================================
+-- Interp in grid, so we can use barycentric with a precalculated tin (NNI)
+-- =====================================================================
+-- Picks a TIN cell that contains the point, looks up the values stored for
+-- the first three vertices of that cell and combines them weighted by the
+-- areas of the sub-triangles formed with the point.
+-- Returns NULL when no cell contains the point.
+CREATE OR REPLACE FUNCTION _interp_in_tin(
+    IN geomin geometry[],
+    IN colin numeric[],
+    IN tin geometry[],
+    IN point geometry
+    )
+RETURNS numeric AS
+$$
+DECLARE
+    cell geometry;
+    corners geometry[];
+    area_cell numeric;
+    area_a numeric;
+    area_b numeric;
+    area_c numeric;
+    val_a numeric;
+    val_b numeric;
+    val_c numeric;
+BEGIN
+    -- cell the point is within
+    SELECT s.v INTO cell
+    FROM (SELECT unnest(tin) AS v) AS s
+    WHERE ST_Within(point, s.v);
+
+    -- outside of the data realm
+    IF cell IS NULL THEN
+        RETURN NULL;
+    END IF;
+
+    -- vertices of the selected cell
+    SELECT array_agg(d.geom) INTO corners FROM ST_DumpPoints(cell) AS d;
+
+    -- value attached to each vertex
+    SELECT s.c INTO val_a
+    FROM (SELECT unnest(geomin) AS geo, unnest(colin) AS c) AS s
+    WHERE ST_Equals(s.geo, corners[1]);
+
+    SELECT s.c INTO val_b
+    FROM (SELECT unnest(geomin) AS geo, unnest(colin) AS c) AS s
+    WHERE ST_Equals(s.geo, corners[2]);
+
+    SELECT s.c INTO val_c
+    FROM (SELECT unnest(geomin) AS geo, unnest(colin) AS c) AS s
+    WHERE ST_Equals(s.geo, corners[3]);
+
+    -- areas: whole cell, then the sub-triangle opposite to each vertex
+    area_cell := ST_Area(cell);
+    area_a := ST_Area(ST_MakePolygon(ST_MakeLine(ARRAY[point, corners[2], corners[3], point])));
+    area_b := ST_Area(ST_MakePolygon(ST_MakeLine(ARRAY[point, corners[1], corners[3], point])));
+    area_c := ST_Area(ST_MakePolygon(ST_MakeLine(ARRAY[point, corners[1], corners[2], point])));
+
+    -- missing areas or values count as 0, a missing cell area as 1
+    RETURN (coalesce(area_a, 0) * coalesce(val_a, 0)
+          + coalesce(area_b, 0) * coalesce(val_b, 0)
+          + coalesce(area_c, 0) * coalesce(val_c, 0)) / coalesce(area_cell, 1);
+END;
+$$
+language plpgsql IMMUTABLE PARALLEL SAFE;
+-- Function by Stuart Lynn for a simple interpolation of a value
+-- from a polygon table over an arbitrary polygon
+-- (weighted by the area proportion overlapped)
+-- Areal weighting is a very simple form of areal interpolation.
+--
+-- Parameters:
+-- * geom a Polygon geometry which defines the area where a value will be
+-- estimated as the area-weighted sum of a given table/column
+-- * target_table_name table name of the table that provides the values
+-- * target_column column name of the column that provides the values
+-- * schema_name optional parameter to define the schema the target table
+-- belongs to, which is necessary if it's not in the search_path.
+-- Note that target_table_name should never include the schema in it.
+-- Return value:
+-- Areal-weighted interpolation of the column values over the geometry
+--
+-- The target table is expected to expose its geometry in a column named
+-- the_geom. Schema, table and column names are all quoted as identifiers
+-- before being placed in the dynamic query; the geometry is passed as a
+-- bound parameter.
+CREATE OR REPLACE
+FUNCTION cdb_overlap_sum(geom geometry, target_table_name text, target_column text, schema_name text DEFAULT NULL)
+    RETURNS numeric AS
+$$
+DECLARE
+    result numeric;
+    qualified_name text;
+BEGIN
+    IF schema_name IS NULL THEN
+        qualified_name := Format('%I', target_table_name);
+    ELSE
+        -- %I on both parts: the table name must be quoted as an identifier
+        -- here exactly as it is in the unqualified branch
+        qualified_name := Format('%I.%I', schema_name, target_table_name);
+    END IF;
+    EXECUTE Format('
+        SELECT sum(%I*ST_Area(St_Intersection($1, a.the_geom))/ST_Area(a.the_geom))
+        FROM %s AS a
+        WHERE $1 && a.the_geom
+        ', target_column, qualified_name)
+    USING geom
+    INTO result;
+    RETURN result;
+END;
+$$ LANGUAGE plpgsql STABLE PARALLEL SAFE;
+-- Thin SQL entry point: every argument is forwarded, in declaration order,
+-- to crankshaft.regression.GWR.gwr and its result is returned as is.
+CREATE OR REPLACE FUNCTION
+CDB_GWR(subquery text, dep_var text, ind_vars text[],
+        bw numeric default null, fixed boolean default False,
+        kernel text default 'bisquare', geom_col text default 'the_geom',
+        id_col text default 'cartodb_id')
+RETURNS table(coeffs JSON, stand_errs JSON, t_vals JSON,
+              filtered_t_vals JSON, predicted numeric,
+              residuals numeric, r_squared numeric, bandwidth numeric,
+              rowid bigint)
+AS $$
+
+from crankshaft.regression import GWR
+
+return GWR().gwr(subquery, dep_var, ind_vars, bw,
+                 fixed, kernel, geom_col, id_col)
+
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+
+
+-- Thin SQL entry point: every argument is forwarded, in declaration order,
+-- to crankshaft.regression.GWR.gwr_predict and its result is returned as is.
+CREATE OR REPLACE FUNCTION
+CDB_GWR_Predict(subquery text, dep_var text, ind_vars text[],
+                bw numeric default null, fixed boolean default False,
+                kernel text default 'bisquare',
+                geom_col text default 'the_geom',
+                id_col text default 'cartodb_id')
+RETURNS table(coeffs JSON, stand_errs JSON, t_vals JSON,
+              r_squared numeric, predicted numeric, rowid bigint)
+AS $$
+
+from crankshaft.regression import GWR
+
+return GWR().gwr_predict(subquery, dep_var, ind_vars, bw,
+                         fixed, kernel, geom_col, id_col)
+
+$$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
+--
+-- Creates N points randomly distributed inside a geometry
+--
+-- @param geom - the geometry to be turned into points
+--
+-- @param no_points - the number of points to generate
+--
+-- @param max_iter_per_point - each point is generated by intersecting a
+-- horizontal line, placed at a random height of the bounding box, with the
+-- geometry and picking a random position along that intersection.
+-- max_iter_per_point is the number of consecutive attempts yielding no point
+-- that the function accepts before giving up.
+--
+-- Returns: collection of the generated points, in the SRID of the input.
+--          Fewer points than requested are returned when the function gives
+--          up; NULL when no point was generated at all.
+CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry , no_points Integer, max_iter_per_point Integer DEFAULT 1000)
+RETURNS GEOMETRY AS $$
+DECLARE
+    extent GEOMETRY;
+    test_point GEOMETRY;
+    width NUMERIC;
+    height NUMERIC;
+    x0 NUMERIC;
+    y0 NUMERIC;
+    yp NUMERIC;
+    srid INTEGER;
+    no_left INTEGER;
+    misses INTEGER := 0;
+    points GEOMETRY[];
+    bbox_line GEOMETRY;
+    intersection_line GEOMETRY;
+BEGIN
+    extent := ST_Envelope(geom);
+    srid := ST_SRID(geom);
+    width := ST_XMax(extent) - ST_XMin(extent);
+    height := ST_YMax(extent) - ST_YMin(extent);
+    x0 := ST_XMin(extent);
+    y0 := ST_YMin(extent);
+    no_left := no_points;
+
+    -- "> 0" also ends the loop for a NULL or negative number of points
+    WHILE no_left > 0 LOOP
+        -- horizontal line across the bounding box at a random height;
+        -- ST_MakePoint takes (x, y), and the line carries the SRID of the
+        -- input so that it can be intersected with it
+        yp := y0 + height*random();
+        bbox_line := ST_MakeLine(
+            ST_SetSRID(ST_MakePoint(x0, yp), srid),
+            ST_SetSRID(ST_MakePoint(x0 + width, yp), srid)
+        );
+        intersection_line := ST_Intersection(bbox_line, geom);
+        test_point := ST_LineInterpolatePoint(st_makeline(st_linemerge(intersection_line)), random());
+
+        -- no point on this attempt: retry until the per-point budget is spent
+        IF test_point IS NULL THEN
+            misses := misses + 1;
+            EXIT WHEN misses >= COALESCE(max_iter_per_point, 1000);
+            CONTINUE;
+        END IF;
+
+        misses := 0;
+        points := points || test_point;
+        no_left := no_left - 1;
+    END LOOP;
+    RETURN ST_Collect(points);
+END;
+$$
+LANGUAGE plpgsql VOLATILE PARALLEL RESTRICTED;
+-- Make sure by default there are no permissions for publicuser
+-- NOTE: this happens at extension creation time, as part of an implicit transaction.
+-- NOTE(review): the REVOKE below is commented out, so it is not executed by this script.
+-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
+
+-- Grant permissions on the schema to publicuser (but just the schema)
+-- This is the only privilege statement in this section that is executed.
+GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser;
+
+-- Revoke execute permissions on all functions in the schema by default
+-- NOTE(review): commented out as well, so it is not executed by this script.
+-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser;
+--
+-- Fill given extent with a rectangular coverage
+--
+-- @param ext Extent to fill. Only rectangles with center point falling
+-- inside the extent (or at the lower or leftmost edge) will
+-- be emitted. The returned rectangles will have the same SRID
+-- as this extent.
+--
+-- @param width Width of each rectangle, must be positive
+--
+-- @param height Height of each rectangle, must be positive
+--
+-- @param origin Optional origin to allow for exact tiling.
+-- If omitted the origin will be 0,0.
+-- The parameter is checked for having the same SRID
+-- as the extent.
+--
+-- Raises an exception when width or height is zero or negative, or when
+-- the SRIDs of extent and origin differ.
+--
+CREATE OR REPLACE FUNCTION CDB_RectangleGrid(ext GEOMETRY, width FLOAT8, height FLOAT8, origin GEOMETRY DEFAULT NULL)
+RETURNS SETOF GEOMETRY
+AS $$
+DECLARE
+    h GEOMETRY; -- rectangle cell
+    hstep FLOAT8; -- horizontal step
+    vstep FLOAT8; -- vertical step
+    hw FLOAT8; -- half width
+    hh FLOAT8; -- half height
+    vstart FLOAT8;
+    hstart FLOAT8;
+    hend FLOAT8;
+    vend FLOAT8;
+    xoff FLOAT8;
+    yoff FLOAT8;
+    xgrd FLOAT8;
+    ygrd FLOAT8;
+    x FLOAT8;
+    y FLOAT8;
+    srid INTEGER;
+BEGIN
+
+    -- The loops below advance by width/height: a negative step never reaches
+    -- the end of the extent and a zero step divides by zero, so reject both
+    -- up front with a clear message.
+    IF width <= 0 OR height <= 0 THEN
+        RAISE EXCEPTION 'Rectangle width (%) and height (%) must be positive', width, height;
+    END IF;
+
+    srid := ST_SRID(ext);
+
+    xoff := 0;
+    yoff := 0;
+
+    IF origin IS NOT NULL THEN
+        IF ST_SRID(origin) != srid THEN
+            RAISE EXCEPTION 'SRID mismatch between extent (%) and origin (%)', srid, ST_SRID(origin);
+        END IF;
+        xoff := ST_X(origin);
+        yoff := ST_Y(origin);
+    END IF;
+
+    --RAISE DEBUG 'X offset: %', xoff;
+    --RAISE DEBUG 'Y offset: %', yoff;
+
+    hw := width/2.0;
+    hh := height/2.0;
+
+    -- cell centers are snapped to multiples of half the cell size
+    xgrd := hw;
+    ygrd := hh;
+    --RAISE DEBUG 'X grid size: %', xgrd;
+    --RAISE DEBUG 'Y grid size: %', ygrd;
+
+    hstep := width;
+    vstep := height;
+
+    -- Tweak horizontal start on hstep grid from origin
+    hstart := xoff + ceil((ST_XMin(ext)-xoff)/hstep)*hstep;
+    --RAISE DEBUG 'hstart: %', hstart;
+
+    -- Tweak vertical start on vstep grid from origin
+    vstart := yoff + ceil((ST_YMin(ext)-yoff)/vstep)*vstep;
+    --RAISE DEBUG 'vstart: %', vstart;
+
+    hend := ST_XMax(ext);
+    vend := ST_YMax(ext);
+
+    --RAISE DEBUG 'hend: %', hend;
+    --RAISE DEBUG 'vend: %', vend;
+
+    -- x and y are the centers of the emitted rectangles
+    x := hstart;
+    WHILE x < hend LOOP -- over X
+        y := vstart;
+        h := ST_MakeEnvelope(x-hw, y-hh, x+hw, y+hh, srid);
+        WHILE y < vend LOOP -- over Y
+            RETURN NEXT h;
+            h := ST_Translate(h, 0, vstep);
+            y := yoff + round(((y + vstep)-yoff)/ygrd)*ygrd; -- round to grid
+        END LOOP;
+        x := xoff + round(((x + hstep)-xoff)/xgrd)*xgrd; -- round to grid
+    END LOOP;
+
+    RETURN;
+END
+$$ LANGUAGE 'plpgsql' IMMUTABLE PARALLEL SAFE;
+
+--
+-- Equal interval classification of a numeric array
+--
+-- @param in_array A numeric array of numbers whose range (NULL elements
+-- are ignored) is split into intervals of the same size
+--
+-- @param breaks The number of bins you want to find.
+--
+--
+-- Returns: upper edges of bins; the last edge is the maximum of the array
+--
+--
+
+CREATE OR REPLACE FUNCTION CDB_EqualIntervalBins ( in_array NUMERIC[], breaks INT ) RETURNS NUMERIC[] as $$
+DECLARE
+    lowest numeric;
+    highest numeric;
+    bin_size numeric;
+    k INT := 1;
+    edges numeric[];
+BEGIN
+    SELECT min(e), max(e) INTO lowest, highest
+    FROM unnest(in_array) AS t(e)
+    WHERE e IS NOT NULL;
+
+    bin_size := (highest - lowest) / breaks::numeric;
+
+    -- inner edges first, the maximum closes the last bin
+    WHILE k < breaks LOOP
+        edges := array_append(edges, lowest + k::numeric * bin_size);
+        k := k + 1;
+    END LOOP;
+    edges := array_append(edges, highest);
+
+    RETURN edges;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+
+--
+-- Heads/Tails classification of a numeric array
+--
+-- @param in_array A numeric array of numbers to determine the best
+-- bins based on the Heads/Tails method.
+--
+-- @param breaks The number of bins you want to find.
+--
+-- The first break is the mean of all values; every following break is the
+-- mean of the values above the previous break, as long as such values exist.
+--
+
+CREATE OR REPLACE FUNCTION CDB_HeadsTailsBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$
+DECLARE
+    span INT4;
+    head_mean numeric;
+    k INT := 2;
+    edges numeric[];
+BEGIN
+    -- distance between the array bounds, taken before the array is re-sorted
+    span := array_upper(in_array, 1) - array_lower(in_array, 1);
+    -- ensure the ordering of in_array
+    in_array := (SELECT array_agg(e ORDER BY e) FROM unnest(in_array) AS t(e));
+    -- stop if no rows
+    IF span IS NULL THEN
+        RETURN NULL;
+    END IF;
+    -- hand back the sorted input when it is too short for the breaks asked
+    IF span < breaks THEN
+        RETURN in_array;
+    END IF;
+
+    -- first break: mean of everything
+    head_mean := (SELECT avg(e) FROM unnest(in_array) AS t(e));
+    edges := ARRAY[head_mean];
+
+    -- following breaks: mean of the values above the previous break
+    LOOP
+        EXIT WHEN k > breaks;
+        head_mean := (SELECT avg(e) FROM unnest(in_array) AS t(e) WHERE e > edges[k-1]);
+        IF head_mean IS NOT NULL THEN
+            edges := array_append(edges, head_mean);
+        END IF;
+        k := k + 1;
+    END LOOP;
+    RETURN edges;
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+
+--
+-- Determine the Jenks classifications from a numeric array
+--
+-- @param in_array A numeric array of numbers to determine the best
+-- bins based on the Jenks method.
+--
+-- @param breaks The number of bins you want to find.
+--
+-- @param iterations The number of different starting positions to test.
+--
+-- @param invert Optional whether to return the top of each bin (default)
+-- or the bottom. BOOLEAN, default=FALSE.
+--
+-- Returns: the breaks of the best classification found. The sorted input is
+-- returned unchanged when it holds fewer elements than breaks, the quantile
+-- breaks when it holds more than 5000000 elements, and NULL when the input
+-- has no elements.
+--
+-- The search starts from the quantile classification and then tries
+-- random starting positions; each candidate is refined and scored by
+-- CDB_JenksBinsIteration, whose first returned element is the score.
+--
+
+
+CREATE OR REPLACE FUNCTION CDB_JenksBins ( in_array NUMERIC[], breaks INT, iterations INT DEFAULT 5, invert BOOLEAN DEFAULT FALSE) RETURNS NUMERIC[] as $$
+DECLARE
+ element_count INT4;
+ arr_mean NUMERIC;
+ bot INT;
+ top INT;
+ tops INT[];
+ classes INT[][];
+ i INT := 1; j INT := 1;
+ curr_result NUMERIC[];
+ best_result NUMERIC[];
+ seedtarget TEXT;
+ quant NUMERIC[];
+ shuffles INT;
+BEGIN
+ -- get the total size of our row
+ element_count := array_length(in_array, 1); --array_upper(in_array, 1) - array_lower(in_array, 1);
+ -- ensure the ordering of in_array
+ SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x;
+ -- stop if no rows
+ IF element_count IS NULL THEN
+ RETURN NULL;
+ END IF;
+ -- stop if our breaks are more than our input array size
+ IF element_count < breaks THEN
+ RETURN in_array;
+ END IF;
+
+ -- search budget handed to CDB_JenksBinsIteration as max_search:
+ -- 2500000 / (element_count * iterations), clamped to the range 1..750
+ shuffles := LEAST(GREATEST(floor(2500000.0/(element_count::float*iterations::float)), 1), 750)::int;
+ -- get our mean value
+ SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x;
+
+ -- assume best is actually Quantile
+ SELECT cdb_crankshaft.CDB_QuantileBins(in_array, breaks) INTO quant;
+
+ -- if data is very very large, just return quant and be done
+ IF element_count > 5000000 THEN
+ RETURN quant;
+ END IF;
+
+ -- change quant into bottom, top markers
+ -- each class is a [bot, top] pair of positions in the sorted array
+ -- NOTE(review): the exit test sits after the append, so the body runs for
+ -- i = 1..breaks+1 and one more pair than breaks is appended - confirm
+ -- that this is intended
+ LOOP
+ IF i = 1 THEN
+ bot = 1;
+ ELSE
+ -- use last top to find this bot
+ bot = top+1;
+ END IF;
+ IF i = breaks THEN
+ top = element_count;
+ ELSE
+ SELECT count(*) INTO top FROM ( SELECT unnest(in_array) as v) x WHERE v <= quant[i];
+ END IF;
+ IF i = 1 THEN
+ classes = ARRAY[ARRAY[bot,top]];
+ ELSE
+ classes = ARRAY_CAT(classes,ARRAY[bot,top]);
+ END IF;
+ IF i > breaks THEN EXIT; END IF;
+ i = i+1;
+ END LOOP;
+
+ -- score of the quantile start; element 1 of the result is the score
+ best_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);
+
+ --set the seed so we can ensure the same results
+ SELECT setseed(0.4567) INTO seedtarget;
+ --loop through random starting positions
+ LOOP
+ IF j > iterations-1 THEN EXIT; END IF;
+ i = 1;
+ -- the last class always ends at the last element; random tops are added
+ -- (distinct, sorted, never 1) until there are as many tops as breaks
+ tops = ARRAY[element_count];
+ LOOP
+ IF i = breaks THEN EXIT; END IF;
+ SELECT array_agg(distinct e) INTO tops FROM (SELECT unnest(array_cat(tops, ARRAY[floor(random()*element_count::float)::int])) as e ORDER BY e) x WHERE e != 1;
+ i = array_length(tops, 1);
+ END LOOP;
+ i = 1;
+ -- turn the tops into [bot, top] pairs
+ LOOP
+ IF i > breaks THEN EXIT; END IF;
+ IF i = 1 THEN
+ bot = 1;
+ ELSE
+ bot = top+1;
+ END IF;
+ top = tops[i];
+ IF i = 1 THEN
+ classes = ARRAY[ARRAY[bot,top]];
+ ELSE
+ classes = ARRAY_CAT(classes,ARRAY[bot,top]);
+ END IF;
+ i := i+1;
+ END LOOP;
+ curr_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);
+
+ -- compare the scores held in element 1
+ IF curr_result[1] > best_result[1] THEN
+ best_result = curr_result;
+ j = j-1; -- if we found a better result, add one more search
+ END IF;
+ j = j+1;
+ END LOOP;
+
+ -- drop the score, return only the breaks
+ RETURN (best_result)[2:array_upper(best_result, 1)];
+END;
+$$ language plpgsql VOLATILE PARALLEL RESTRICTED;
+
+
+
+--
+-- Perform a single iteration of the Jenks classification
+--
+-- @param in_array sorted numeric array being classified
+--
+-- @param breaks number of classes
+--
+-- @param classes starting classes, one [first position, last position]
+-- pair per class, referring to positions in in_array
+--
+-- @param invert selects which end of each class is reported (see the end
+-- of the body)
+--
+-- @param element_count number of elements of in_array (not read in the body)
+--
+-- @param arr_mean mean of in_array
+--
+-- @param max_search maximum number of boundary moves to try
+--
+-- Returns: an array whose first element is the score reached (sum of squared
+-- deviations from the array mean minus the sum of squared deviations within
+-- the classes) followed by one value of in_array per class
+--
+
+CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_array NUMERIC[], breaks INT, classes INT[][], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$
+DECLARE
+ tmp_val numeric;
+ new_classes int[][];
+ tmp_class int[];
+ i INT := 1;
+ j INT := 1;
+ side INT := 2;
+ sdam numeric;
+ gvf numeric := 0.0;
+ new_gvf numeric;
+ arr_gvf numeric[];
+ class_avg numeric;
+ class_max_i INT;
+ class_min_i INT;
+ class_max numeric;
+ class_min numeric;
+ reply numeric[];
+BEGIN
+
+ -- Calculate the sum of squared deviations from the array mean (SDAM).
+ SELECT sum((arr_mean - e)^2) INTO sdam FROM ( SELECT unnest(in_array) as e ) x;
+ --Identify the breaks for the lowest GVF
+ LOOP
+ i = 1;
+ -- squared deviation from its own mean of every class, kept in arr_gvf
+ LOOP
+ -- get our mean
+ SELECT avg(e) INTO class_avg FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e) x;
+ -- find the deviation
+ SELECT sum((class_avg-e)^2) INTO tmp_val FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e ) x;
+ IF i = 1 THEN
+ arr_gvf = ARRAY[tmp_val];
+ -- init our min/max map for later
+ class_max = arr_gvf[i];
+ class_min = arr_gvf[i];
+ class_min_i = 1;
+ class_max_i = 1;
+ ELSE
+ arr_gvf = array_append(arr_gvf, tmp_val);
+ END IF;
+ i := i+1;
+ IF i > breaks THEN EXIT; END IF;
+ END LOOP;
+ -- calculate our new GVF
+ SELECT sdam-sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) as e ) x;
+ -- if no improvement was made, exit
+ IF new_gvf < gvf THEN EXIT; END IF;
+ gvf = new_gvf;
+ -- stop once the search budget is spent
+ IF j > max_search THEN EXIT; END IF;
+ j = j+1;
+ i = 1;
+ -- find the classes with the smallest and the largest deviation
+ LOOP
+ --establish directionality (uppward through classes or downward)
+ IF arr_gvf[i] < class_min THEN
+ class_min = arr_gvf[i];
+ class_min_i = i;
+ END IF;
+ IF arr_gvf[i] > class_max THEN
+ class_max = arr_gvf[i];
+ class_max_i = i;
+ END IF;
+ i := i+1;
+ IF i > breaks THEN EXIT; END IF;
+ END LOOP;
+ -- class_min_i is replaced by the neighbour of class_max_i lying on the
+ -- side where the smallest deviation was found
+ IF class_max_i > class_min_i THEN
+ class_min_i = class_max_i - 1;
+ ELSE
+ class_min_i = class_max_i + 1;
+ END IF;
+ -- shift the shared boundary by one element, shrinking the class with the
+ -- largest deviation and growing that neighbour
+ --Move from higher class to a lower gid order
+ IF class_max_i > class_min_i THEN
+ classes[class_max_i][1] = classes[class_max_i][1] + 1;
+ classes[class_min_i][2] = classes[class_min_i][2] + 1;
+ ELSE -- Move from lower class UP into a higher class by gid
+ classes[class_max_i][2] = classes[class_max_i][2] - 1;
+ classes[class_min_i][1] = classes[class_min_i][1] - 1;
+ END IF;
+ END LOOP;
+
+ i = 1;
+ LOOP
+ IF invert = TRUE THEN
+ side = 1; -- side 2 (default) reads classes[i][2], the last position of each class; invert reads classes[i][1], the first position
+ END IF;
+ reply = array_append(reply, in_array[classes[i][side]]);
+ i = i+1;
+ IF i > breaks THEN EXIT; END IF;
+ END LOOP;
+
+ -- the score goes first, the caller strips it
+ RETURN array_prepend(gvf, reply);
+
+END;
+$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+
+
+--
+-- Quantile classification of a numeric array
+--
+-- @param in_array A numeric array of numbers to determine the best
+-- bins based on the Quantile method.
+--
+-- @param breaks The number of bins you want to find.
+--
+-- Returns: one upper edge per bin; the last edge is the maximum of the array
+--
+CREATE OR REPLACE FUNCTION CDB_QuantileBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$
+DECLARE
+    total INT4;
+    per_bin numeric;
+    edge numeric;
+    edges numeric[];
+BEGIN
+    -- sort our values
+    SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e ASC) x;
+    -- elements per bin (may be fractional)
+    total := array_length(in_array, 1);
+    per_bin := total::numeric / breaks;
+
+    FOR k IN 1..breaks LOOP
+        IF k = breaks THEN
+            -- the last edge is the largest value
+            SELECT max(e) INTO edge FROM ( SELECT unnest(in_array) e ) x;
+        ELSIF per_bin * k % 1 > 0 THEN
+            -- fractional position: take the element it rounds up to
+            SELECT e INTO edge FROM ( SELECT unnest(in_array) e LIMIT 1 OFFSET ceil(per_bin * k) - 1) x;
+        ELSE
+            -- whole position: average that element and the next one
+            SELECT avg(e) INTO edge FROM ( SELECT unnest(in_array) e LIMIT 2 OFFSET ceil(per_bin * k) - 1 ) x;
+        END IF;
+
+        edges := array_append(edges, edge);
+    END LOOP;
+    RETURN edges;
+END;
+$$ language plpgsql IMMUTABLE STRICT PARALLEL SAFE;
diff --git a/release/crankshaft.control b/release/crankshaft.control
index 7d5a93a..028fb76 100644
--- a/release/crankshaft.control
+++ b/release/crankshaft.control
@@ -1,5 +1,5 @@
comment = 'CartoDB Spatial Analysis extension'
-default_version = '0.7.0'
+default_version = '0.8.0'
requires = 'plpythonu, postgis'
superuser = true
schema = cdb_crankshaft
diff --git a/release/python/0.8.0/crankshaft/crankshaft/.analysis_data_provider.py.swp b/release/python/0.8.0/crankshaft/crankshaft/.analysis_data_provider.py.swp
new file mode 100644
index 0000000..ac1b6c3
Binary files /dev/null and b/release/python/0.8.0/crankshaft/crankshaft/.analysis_data_provider.py.swp differ
diff --git a/release/python/0.8.0/crankshaft/crankshaft/__init__.py b/release/python/0.8.0/crankshaft/crankshaft/__init__.py
new file mode 100644
index 0000000..82b2b87
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/__init__.py
@@ -0,0 +1,7 @@
+"""Import all modules"""
+import crankshaft.random_seeds
+import crankshaft.clustering
+import crankshaft.space_time_dynamics
+import crankshaft.segmentation
+import crankshaft.regression
+import analysis_data_provider
diff --git a/release/python/0.8.0/crankshaft/crankshaft/analysis_data_provider.py b/release/python/0.8.0/crankshaft/crankshaft/analysis_data_provider.py
new file mode 100644
index 0000000..3d5225a
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/analysis_data_provider.py
@@ -0,0 +1,98 @@
+"""class for fetching data"""
+import plpy
+import pysal_utils as pu
+
+NULL_VALUE_ERROR = ('No usable data passed to analysis. Check your input rows '
+ 'for null values and fill in appropriately.')
+
+
+def verify_data(func):
+    """Wrap a data getter so empty results and failures become plpy errors"""
+    def wrapper(*args, **kwargs):
+        """Call ``func``; hand back a non-empty result or report the problem"""
+        try:
+            fetched = func(*args, **kwargs)
+            if fetched:
+                return fetched
+            # empty/falsy result: nothing usable came back from the query
+            plpy.error(NULL_VALUE_ERROR)
+        except Exception as err:
+            plpy.error('Analysis failed: {}'.format(err))
+
+        return []
+
+    return wrapper
+
+
+class AnalysisDataProvider(object):
+    """
+    Run the SQL that feeds each analysis through ``plpy.execute``;
+    every getter is wrapped by ``verify_data``.
+    """
+    @verify_data
+    def get_getis(self, w_type, params):
+        """Run the neighbor query used by Getis-Ord's G"""
+        return plpy.execute(pu.construct_neighbor_query(w_type, params))
+
+    @verify_data
+    def get_markov(self, w_type, params):
+        """Run the neighbor query used by spatial Markov"""
+        return plpy.execute(pu.construct_neighbor_query(w_type, params))
+
+    @verify_data
+    def get_moran(self, w_type, params):
+        """Run the neighbor query used by the Moran's I analyses"""
+        return plpy.execute(pu.construct_neighbor_query(w_type, params))
+
+    @verify_data
+    def get_nonspatial_kmeans(self, params):
+        """
+        Fetch data for non-spatial k-means.
+
+        Inputs - a dict (params) with the following keys:
+            colnames: a (text) list of column names (e.g.,
+                      `['andy', 'cookie']`)
+            id_col: the name of the id column (e.g., `'cartodb_id'`)
+            subquery: the subquery for exposing the data (e.g.,
+                      SELECT * FROM favorite_things)
+        Output:
+            Result of an aggregate query, consumed within
+            `KMeans().nonspatial`. A single row holding one array per
+            requested column ('arr_col1', 'arr_col2', ...) plus the
+            array of ids under 'rowid'.
+        """
+        column_aggs = []
+        for position, colname in enumerate(params['colnames'], 1):
+            column_aggs.append(
+                'array_agg({0}) As arr_col{1}'.format(colname, position))
+        template = '''
+            SELECT {cols}, array_agg({id_col}) As rowid
+            FROM ({subquery}) As a
+        '''
+        query = template.format(cols=', '.join(column_aggs),
+                                id_col=params['id_col'],
+                                subquery=params['subquery']).strip()
+        return plpy.execute(query)
+
+    @verify_data
+    def get_spatial_kmeans(self, params):
+        """Fetch ids and x/y coordinate arrays for spatial k-means"""
+        sql = '''
+            SELECT
+              array_agg("{id_col}" ORDER BY "{id_col}") as ids,
+              array_agg(ST_X("{geom_col}") ORDER BY "{id_col}") As xs,
+              array_agg(ST_Y("{geom_col}") ORDER BY "{id_col}") As ys
+            FROM ({subquery}) As a
+            WHERE "{geom_col}" IS NOT NULL
+        '''
+        return plpy.execute(sql.format(**params))
+
+    @verify_data
+    def get_gwr(self, params):
+        """Run the GWR input query"""
+        return plpy.execute(pu.gwr_query(params))
+
+    @verify_data
+    def get_gwr_predict(self, params):
+        """Run the GWR prediction input query"""
+        return plpy.execute(pu.gwr_predict_query(params))
diff --git a/release/python/0.8.0/crankshaft/crankshaft/centers/median_center.py b/release/python/0.8.0/crankshaft/crankshaft/centers/median_center.py
new file mode 100644
index 0000000..c5b41bd
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/centers/median_center.py
@@ -0,0 +1,76 @@
+"""
+ Based on the Weiszfeld algorithm:
+ https://en.wikipedia.org/wiki/Geometric_median
+"""
+
+
+# import plpy
+import numpy as np
+from numpy.linalg import norm
+
+
+def median_center(tablename, geom_col, num_iters=50, tolerance=0.001):
+    """
+    Weiszfeld estimate of the geometric median of `geom_col` in `tablename`;
+    runs up to `num_iters` passes, stopping once the shift is < `tolerance`.
+    """
+    query = '''
+        SELECT array_agg(ST_X({geom_col})) As x_coords,
+               array_agg(ST_Y({geom_col})) As y_coords
+        FROM {tablename}
+    '''.format(geom_col=geom_col, tablename=tablename)
+    try:
+        import plpy  # the module-level import is commented out
+        # plpy results are indexed by row first, then by column name
+        resp = plpy.execute(query)
+        data = np.vstack((resp[0]['x_coords'], resp[0]['y_coords'])).T
+        plpy.notice('coords: %s' % str(data))
+    except ImportError:
+        # outside PostgreSQL only: keep the synthetic development data
+        print('No plpy')
+        data = np.array([[1.2 * np.random.random() + 10.,
+                          1.1 * (np.random.random() - 1.) + 3.]
+                         for i in range(1, 100)])
+
+    # initialize 'median center' to be the mean
+    coords_center_temp = data.mean(axis=0)
+    print('temp_center: %s' % str(coords_center_temp))
+    for i in range(0, num_iters):
+        old_coords_center = coords_center_temp.copy()
+        denom = denominator(coords_center_temp, data)
+        coords_center_temp = np.sum([data[j] * numerator(coords_center_temp,
+                                                         data[j])
+                                     for j in range(len(data))], axis=0)
+        coords_center_temp = coords_center_temp / denom
+        shift = np.linalg.norm(old_coords_center - coords_center_temp)
+        print("Pass #%d" % i)
+        print("max, min of data: %0.4f, %0.4f" % (data.max(), data.min()))
+        print('temp_center: %s' % str(coords_center_temp))
+        print("Change in center: %0.4f" % shift)
+        print("Center coords: %s" % str(coords_center_temp))
+        print("Objective Function: %0.4f" % obj_func(coords_center_temp, data))
+        if shift < tolerance:
+            break
+
+    return coords_center_temp
+
+
+def obj_func(center_coords, data):
+    """
+    Geometric-median objective: summed distance from center to each data row.
+    """
+    return np.sum(np.linalg.norm(center_coords - data, axis=1))
+
+
+def numerator(center_coords, data_i):
+    """Weight of one point: reciprocal of its distance to the center."""
+    # `norm` is numpy.linalg.norm, imported at module level
+    distance = norm(center_coords - data_i)
+    return np.reciprocal(distance)
+
+
+def denominator(center_coords, data):
+    """
+    Weiszfeld normaliser: sum over rows of 1 / distance(row, center).
+    """
+    return np.sum(np.reciprocal(np.linalg.norm(data - center_coords, axis=1)))
diff --git a/release/python/0.8.0/crankshaft/crankshaft/clustering/__init__.py b/release/python/0.8.0/crankshaft/crankshaft/clustering/__init__.py
new file mode 100644
index 0000000..d9682fa
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/clustering/__init__.py
@@ -0,0 +1,4 @@
+"""Import all functions from for clustering"""
+from moran import *
+from kmeans import *
+from getis import *
diff --git a/release/python/0.8.0/crankshaft/crankshaft/clustering/getis.py b/release/python/0.8.0/crankshaft/crankshaft/clustering/getis.py
new file mode 100644
index 0000000..2bee3a2
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/clustering/getis.py
@@ -0,0 +1,50 @@
+"""
+Getis-Ord's G geostatistics (hotspot/coldspot analysis)
+"""
+
+import pysal as ps
+from collections import OrderedDict
+
+# crankshaft modules
+import crankshaft.pysal_utils as pu
+from crankshaft.analysis_data_provider import AnalysisDataProvider
+
+# High level interface ---------------------------------------
+
+
+class Getis(object):
+    """Getis-Ord's G* hotspot/coldspot analysis."""
+    def __init__(self, data_provider=None):
+        # fall back to the plpy-backed provider when none is injected
+        self.data_provider = (AnalysisDataProvider()
+                              if data_provider is None else data_provider)
+
+    def getis_ord(self, subquery, attr,
+                  w_type, num_ngbrs, permutations, geom_col, id_col):
+        """
+        Getis-Ord's G*
+        Neighbors are built by a PostGIS query and the statistic is computed
+        with PySAL's Getis-Ord's G* hotspot/coldspot module.
+        Returns tuples of (z_sim, p_sim, p_z_sim, row id).
+        Andy Eschbacher
+        """
+
+        # geometries whose attribute is null are ignored, so with kNN the
+        # neighbors collected may not be the truly nearest ones
+
+        query_params = OrderedDict()
+        query_params["id_col"] = id_col
+        query_params["attr1"] = attr
+        query_params["geom_col"] = geom_col
+        query_params["subquery"] = subquery
+        query_params["num_ngbrs"] = num_ngbrs
+
+        rows = self.data_provider.get_getis(w_type, query_params)
+
+        # PySAL inputs: attribute vector and spatial weights
+        values = pu.get_attributes(rows)
+        weights = pu.get_weight(rows, w_type, num_ngbrs)
+
+        stat = ps.esda.getisord.G_Local(values, weights,
+                                        star=True, permutations=permutations)
+        return zip(stat.z_sim, stat.p_sim, stat.p_z_sim, weights.id_order)
diff --git a/release/python/0.8.0/crankshaft/crankshaft/clustering/kmeans.py b/release/python/0.8.0/crankshaft/crankshaft/clustering/kmeans.py
new file mode 100644
index 0000000..6d22d44
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/clustering/kmeans.py
@@ -0,0 +1,113 @@
+from sklearn.cluster import KMeans
+import numpy as np
+
+from crankshaft.analysis_data_provider import AnalysisDataProvider
+
+
+class Kmeans(object):
+    """K-means clustering on coordinates or on arbitrary numeric columns."""
+    def __init__(self, data_provider=None):
+        # fall back to the plpy-backed provider when none is injected
+        self.data_provider = (AnalysisDataProvider()
+                              if data_provider is None else data_provider)
+
+    def spatial(self, query, no_clusters, no_init=20):
+        """
+        find centers based on clusters of latitude/longitude pairs
+        query: SQL query that has a WGS84 geometry (the_geom)
+        no_clusters: number of clusters to form
+        no_init: number of k-means initialisations (passed as `n_init`)
+        Returns pairs of (id, cluster label).
+        """
+        params = {"subquery": query,
+                  "geom_col": "the_geom",
+                  "id_col": "cartodb_id"}
+
+        # only the first row of the response is used: parallel arrays
+        row = self.data_provider.get_spatial_kmeans(params)[0]
+        xs, ys, ids = row['xs'], row['ys'], row['ids']
+
+        model = KMeans(n_clusters=no_clusters, n_init=no_init)
+        labels = model.fit_predict(zip(xs, ys))
+        return zip(ids, labels)
+
+    def nonspatial(self, subquery, colnames, no_clusters=5,
+                   standardize=True, id_col='cartodb_id'):
+        """
+        Arguments:
+            subquery (string): A SQL query to retrieve the data required to
+                               do the k-means clustering analysis, like so:
+                               SELECT * FROM iris_flower_data
+            colnames (list): a list of the column names which contain the data
+                             of interest, like so: ['sepal_width',
+                                                    'petal_width',
+                                                    'sepal_length',
+                                                    'petal_length']
+            no_clusters (int): number of clusters (greater than zero)
+            standardize (bool): scale every column to zero mean and unit
+                                standard deviation before clustering
+            id_col (string): name of the input id_column
+
+        Returns:
+            A list of tuples with the following columns:
+            cluster labels: a label for the cluster that the row belongs to
+            centers: center of the cluster that this row belongs to (JSON)
+            silhouettes: silhouette measure for this value
+            inertia: the model's inertia, repeated for every row
+            rowid: row that these values belong to (corresponds to the value in
+                   `id_col`)
+        """
+        import json
+        from sklearn import metrics
+
+        data = self.data_provider.get_nonspatial_kmeans({
+            "colnames": colnames,
+            "subquery": subquery,
+            "id_col": id_col
+        })
+
+        # fill array with values for k-means clustering
+        cluster_columns = _extract_columns(data)
+        if standardize:
+            cluster_columns = _scale_data(cluster_columns)
+
+        kmeans = KMeans(n_clusters=no_clusters,
+                        random_state=0).fit(cluster_columns)
+        labels = kmeans.labels_
+
+        # JSON object {column name: coordinate} of each row's cluster center
+        centers = [json.dumps(dict(zip(colnames, center)))
+                   for center in kmeans.cluster_centers_[labels]]
+
+        # per-row silhouette score
+        silhouettes = metrics.silhouette_samples(cluster_columns,
+                                                 labels,
+                                                 metric='sqeuclidean')
+        inertias = [kmeans.inertia_] * labels.shape[0]
+
+        return zip(labels, centers, silhouettes, inertias,
+                   data[0]['rowid'])
+
+
+# -- Preprocessing steps
+
+def _extract_columns(data):
+    """
+    Pack the aggregated feature arrays of the k-means query into a NumPy
+    array with one row per sample and one column per feature.
+    data (list of dicts): result of the kmeans request
+    """
+    row = data[0]
+    # every key except 'rowid' is a feature named arr_col1..arr_colN
+    names = ['arr_col{0}'.format(n) for n in range(1, len(row))]
+    return np.array([row[name] for name in names], dtype=float).T
+
+
+def _scale_data(features):
+    """
+    Scale all input columns to center on 0 with a standard deviation of 1
+    features (numpy matrix): one row per sample, one column per feature
+    """
+    from sklearn.preprocessing import StandardScaler
+    # fit and transform in one step on the same data
+    return StandardScaler().fit_transform(features)
diff --git a/release/python/0.8.0/crankshaft/crankshaft/clustering/moran.py b/release/python/0.8.0/crankshaft/crankshaft/clustering/moran.py
new file mode 100644
index 0000000..cce5670
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/clustering/moran.py
@@ -0,0 +1,341 @@
+"""
+Moran's I geostatistics (global clustering & outliers presence)
+Functionality relies on a combination of `PySAL
+<http://pysal.readthedocs.io/en/latest/>`__ and the data provider passed in at
+class instantiation (which defaults to PostgreSQL's plpy module's `database
+access functions <https://www.postgresql.org/docs/current/plpython-database.html>`__).
+"""
+
+from collections import OrderedDict
+import pysal as ps
+
+# crankshaft module
+import crankshaft.pysal_utils as pu
+from crankshaft.analysis_data_provider import AnalysisDataProvider
+
+# High level interface ---------------------------------------
+
+
+class Moran(object):
+    """Class for calculation of Moran's I statistics (global, local, and local
+    rate)
+
+    Parameters:
+      data_provider (:obj:`AnalysisDataProvider`): Class for fetching data. See
+        the `crankshaft.analysis_data_provider` module for more information.
+    """
+    def __init__(self, data_provider=None):
+        if data_provider is None:
+            self.data_provider = AnalysisDataProvider()
+        else:
+            self.data_provider = data_provider
+
+    def global_stat(self, subquery, attr_name,
+                    w_type, num_ngbrs, permutations, geom_col, id_col):
+        """
+        Moran's I (global)
+        Implementation building neighbors with a PostGIS database and Moran's I
+        core clusters with PySAL.
+
+        Args:
+
+          subquery (str): Query to give access to the data needed. This query
+            must give access to ``attr_name``, ``geom_col``, and ``id_col``.
+          attr_name (str): Column name of data to analyze
+          w_type (str): Type of spatial weight. Must be one of `knn`
+            or `queen`. See `PySAL documentation
+            <http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
+            for more information.
+          num_ngbrs (int): If using `knn` for ``w_type``, this
+            specifies the number of neighbors to be used to define the spatial
+            neighborhoods.
+          permutations (int): Number of permutations for performing
+            conditional randomization to find the p-value. Higher numbers
+            take longer to return results.
+          geom_col (str): Name of the geometry column in the dataset for
+            finding the spatial neighborhoods.
+          id_col (str): Row index for each value. Usually the database index.
+        Returns: the pair (Moran's I, expected I), zipped from two 1-item lists
+        """
+        params = OrderedDict([("id_col", id_col),
+                              ("attr1", attr_name),
+                              ("geom_col", geom_col),
+                              ("subquery", subquery),
+                              ("num_ngbrs", num_ngbrs)])
+
+        result = self.data_provider.get_moran(w_type, params)
+
+        # collect attributes
+        attr_vals = pu.get_attributes(result)
+
+        # calculate weights
+        weight = pu.get_weight(result, w_type, num_ngbrs)
+
+        # calculate moran global
+        moran_global = ps.esda.moran.Moran(attr_vals, weight,
+                                           permutations=permutations)
+
+        return zip([moran_global.I], [moran_global.EI])
+
+    def local_stat(self, subquery, attr,
+                   w_type, num_ngbrs, permutations, geom_col, id_col):
+        """
+        Moran's I (local)
+
+        Args:
+
+          subquery (str): Query to give access to the data needed. This query
+            must give access to ``attr``, ``geom_col``, and ``id_col``.
+          attr (str): Column name of data to analyze
+          w_type (str): Type of spatial weight. Must be one of `knn`
+            or `queen`. See `PySAL documentation
+            <http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
+            for more information.
+          num_ngbrs (int): If using `knn` for ``w_type``, this
+            specifies the number of neighbors to be used to define the spatial
+            neighborhoods.
+          permutations (int): Number of permutations for performing
+            conditional randomization to find the p-value. Higher numbers
+            take longer to return results.
+          geom_col (str): Name of the geometry column in the dataset for
+            finding the spatial neighborhoods.
+          id_col (str): Row index for each value. Usually the database index.
+
+        Returns:
+          list of tuples: Where each tuple consists of the following values:
+            - quadrants classification (one of `HH`, `HL`, `LL`, or `LH`)
+            - p-value
+            - spatial lag
+            - standardized spatial lag (centered on the mean, normalized by the
+              standard deviation)
+            - original value
+            - standardized value
+            - Moran's I statistic
+            - original row index
+        """
+
+        # geometries with attributes that are null are ignored
+        # resulting in a collection of not as near neighbors
+
+        params = OrderedDict([("id_col", id_col),
+                              ("attr1", attr),
+                              ("geom_col", geom_col),
+                              ("subquery", subquery),
+                              ("num_ngbrs", num_ngbrs)])
+
+        result = self.data_provider.get_moran(w_type, params)
+
+        attr_vals = pu.get_attributes(result)
+        weight = pu.get_weight(result, w_type, num_ngbrs)
+
+        # calculate LISA values
+        lisa = ps.esda.moran.Moran_Local(attr_vals, weight,
+                                         permutations=permutations)
+
+        # find quadrants for each geometry
+        quads = quad_position(lisa.q)
+
+        # calculate spatial lag of the raw (y) and standardized (z) values
+        lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y)
+        lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z)
+
+        return zip(
+            quads,
+            lisa.p_sim,
+            lag,
+            lag_std,
+            lisa.y,
+            lisa.z,
+            lisa.Is,
+            weight.id_order
+        )
+
+    def global_rate_stat(self, subquery, numerator, denominator,
+                         w_type, num_ngbrs, permutations, geom_col, id_col):
+        """
+        Moran's I Rate (global)
+
+        Args:
+
+          subquery (str): Query to give access to the data needed. It must
+            expose ``numerator``, ``denominator``, ``geom_col``, ``id_col``.
+          numerator (str): Column name of numerator to analyze
+          denominator (str): Column name of the denominator
+          w_type (str): Type of spatial weight. Must be one of `knn`
+            or `queen`. See `PySAL documentation
+            <http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
+            for more information.
+          num_ngbrs (int): If using `knn` for ``w_type``, this
+            specifies the number of neighbors to be used to define the spatial
+            neighborhoods.
+          permutations (int): Number of permutations for performing
+            conditional randomization to find the p-value. Higher numbers
+            take longer to return results.
+          geom_col (str): Name of the geometry column in the dataset for
+            finding the spatial neighborhoods.
+          id_col (str): Row index for each value. Usually the database index.
+        """
+        params = OrderedDict([("id_col", id_col),
+                              ("attr1", numerator),
+                              ("attr2", denominator),
+                              ("geom_col", geom_col),
+                              ("subquery", subquery),
+                              ("num_ngbrs", num_ngbrs)])
+
+        result = self.data_provider.get_moran(w_type, params)
+
+        # collect attributes (attr1 = numerator, attr2 = denominator)
+        numer = pu.get_attributes(result, 1)
+        denom = pu.get_attributes(result, 2)
+
+        weight = pu.get_weight(result, w_type, num_ngbrs)
+
+        # calculate moran global rate
+        lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight,
+                                             permutations=permutations)
+
+        return zip([lisa_rate.I], [lisa_rate.EI])
+
+    def local_rate_stat(self, subquery, numerator, denominator,
+                        w_type, num_ngbrs, permutations, geom_col, id_col):
+        """
+        Moran's I Local Rate
+
+        Args:
+
+          subquery (str): Query to give access to the data needed. It must
+            expose ``numerator``, ``denominator``, ``geom_col``, ``id_col``.
+          numerator (str): Column name of numerator to analyze
+          denominator (str): Column name of the denominator
+          w_type (str): Type of spatial weight. Must be one of `knn`
+            or `queen`. See `PySAL documentation
+            <http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
+            for more information.
+          num_ngbrs (int): If using `knn` for ``w_type``, this
+            specifies the number of neighbors to be used to define the spatial
+            neighborhoods.
+          permutations (int): Number of permutations for performing
+            conditional randomization to find the p-value. Higher numbers
+            take longer to return results.
+          geom_col (str): Name of the geometry column in the dataset for
+            finding the spatial neighborhoods.
+          id_col (str): Row index for each value. Usually the database index.
+
+        Returns:
+          list of tuples: Where each tuple consists of the following values:
+            - quadrants classification (one of `HH`, `HL`, `LL`, or `LH`)
+            - p-value
+            - spatial lag
+            - standardized spatial lag (centered on the mean, normalized by the
+              standard deviation)
+            - original value (roughly numerator divided by denominator)
+            - standardized value
+            - Moran's I statistic
+            - original row index
+        """
+        # geometries with values that are null are ignored
+        # resulting in a collection of not as near neighbors
+
+        params = OrderedDict([("id_col", id_col),
+                              ("numerator", numerator),
+                              ("denominator", denominator),
+                              ("geom_col", geom_col),
+                              ("subquery", subquery),
+                              ("num_ngbrs", num_ngbrs)])
+
+        result = self.data_provider.get_moran(w_type, params)
+
+        # collect attributes (attr1 = numerator, attr2 = denominator)
+        numer = pu.get_attributes(result, 1)
+        denom = pu.get_attributes(result, 2)
+
+        weight = pu.get_weight(result, w_type, num_ngbrs)
+
+        # calculate LISA values
+        lisa = ps.esda.moran.Moran_Local_Rate(numer, denom, weight,
+                                              permutations=permutations)
+
+        # find quadrants for each geometry
+        quads = quad_position(lisa.q)
+
+        # spatial lag of the raw (y) and standardized (z) values
+        lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y)
+        lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z)
+
+        return zip(
+            quads,
+            lisa.p_sim,
+            lag,
+            lag_std,
+            lisa.y,
+            lisa.z,
+            lisa.Is,
+            weight.id_order
+        )
+
+    def local_bivariate_stat(self, subquery, attr1, attr2,
+                             permutations, geom_col, id_col,
+                             w_type, num_ngbrs):
+        """
+        Moran's I (local) Bivariate (untested)
+        """
+
+        params = OrderedDict([("id_col", id_col),
+                              ("attr1", attr1),
+                              ("attr2", attr2),
+                              ("geom_col", geom_col),
+                              ("subquery", subquery),
+                              ("num_ngbrs", num_ngbrs)])
+
+        result = self.data_provider.get_moran(w_type, params)
+
+        # collect attributes
+        attr1_vals = pu.get_attributes(result, 1)
+        attr2_vals = pu.get_attributes(result, 2)
+
+        # create weights
+        weight = pu.get_weight(result, w_type, num_ngbrs)
+
+        # calculate LISA values
+        lisa = ps.esda.moran.Moran_Local_BV(attr1_vals, attr2_vals, weight,
+                                            permutations=permutations)
+
+        # map quadrant numbers to their HH/LH/LL/HL labels
+        lisa_sig = quad_position(lisa.q)
+
+        return zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order)
+
+# Low level functions ----------------------------------------
+
+
+def map_quads(coord):
+    """
+    Translate a PySAL quadrant number into its Moran's I label
+        1 -> 'HH', 2 -> 'LH', 3 -> 'LL', 4 -> 'HL'
+    Args:
+        coord (int): quadrant of a specific measurement
+    Returns:
+        classification (one of 'HH', 'LH', 'LL', or 'HL'), or None when
+        `coord` is not one of the four quadrant numbers
+    """
+    labels = ((1, 'HH'), (2, 'LH'), (3, 'LL'), (4, 'HL'))
+    # compare with == (as opposed to hashing) so any numeric type matches
+    for number, label in labels:
+        if coord == number:
+            return label
+
+    # unknown quadrant
+    return None
+
+
+def quad_position(quads):
+    """
+    Label every quadrant number in a sequence
+
+    Args:
+        quads (:obj:`numpy.ndarray`): an array of quads classified by
+          1-4 (PySAL default)
+    Returns:
+        list: the quads classified by 'HH', 'LL', etc., in input order
+    """
+    return list(map(map_quads, quads))
diff --git a/release/python/0.8.0/crankshaft/crankshaft/pysal_utils/__init__.py b/release/python/0.8.0/crankshaft/crankshaft/pysal_utils/__init__.py
new file mode 100644
index 0000000..fdf073b
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/pysal_utils/__init__.py
@@ -0,0 +1,2 @@
+"""Import all functions for pysal_utils"""
+from crankshaft.pysal_utils.pysal_utils import *
diff --git a/release/python/0.8.0/crankshaft/crankshaft/pysal_utils/pysal_utils.py b/release/python/0.8.0/crankshaft/crankshaft/pysal_utils/pysal_utils.py
new file mode 100644
index 0000000..6b02f6d
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/pysal_utils/pysal_utils.py
@@ -0,0 +1,251 @@
+"""
+ Utilities module for generic PySAL functionality, mainly centered on
+ translating queries into numpy arrays or PySAL weights objects
+"""
+
+import numpy as np
+import pysal as ps
+
+
+def construct_neighbor_query(w_type, query_vals):
+    """Return query (a string) used for finding neighbors
+    @param w_type text: 'knn' (case-insensitive) selects k-nearest
+        neighbors; any other value selects queen contiguity
+    @param query_vals dict: values used to construct the query
+    """
+
+    # dispatch on the weight type; queen is the fallback
+    builder = knn if w_type.lower() == 'knn' else queen
+    return builder(query_vals)
+
+
+# Build weight object
+def get_weight(query_res, w_type='knn', num_ngbrs=5):
+    """
+    Construct a PySAL weight (transform 'r') from the neighbor query result
+    @param query_res dict-like: rows with an 'id' and its 'neighbors' list
+    @param w_type, num_ngbrs: accepted for interface compatibility; unused
+    """
+
+    # id -> neighbor ids (the stdout debug print of the count was removed)
+    neighbors = {x['id']: x['neighbors'] for x in query_res}
+
+    built_weight = ps.W(neighbors)
+    built_weight.transform = 'r'
+    return built_weight
+
+
+def query_attr_select(params, table_ref=True):
+    """
+    Build the attribute portion of a SELECT list, aliasing the columns as
+    attr1, attr2, ... in the order they appear in `params`.
+    @param params: dict of information used in query (column names,
+                   table name, etc.)
+    @param table_ref: prefix plain columns with the `i.` table alias
+    Example:
+        OrderedDict([('numerator', 'price'),
+                     ('denominator', 'sq_meters'),
+                     ('subquery', 'SELECT * FROM interesting_data')])
+    Output:
+        "i.\"price\"::numeric As attr1, " \
+        "i.\"sq_meters\"::numeric As attr2, "
+    """
+
+    # keys of `params` that never name an attribute column
+    non_attr_keys = ('id_col', 'geom_col', 'subquery', 'num_ngbrs')
+    template = "\"%(col)s\"::numeric As attr%(alias_num)s, "
+    if table_ref:
+        template = "i." + template
+
+    if 'time_cols' in params:
+        # explicit list of time columns
+        columns = params['time_cols']
+    elif 'ind_vars' in params:
+        # explicit list of independent variables
+        columns = params['ind_vars']
+    else:
+        # moran's analysis: every remaining key names an attribute column
+        columns = [params[key] for key in params
+                   if key not in non_attr_keys]
+
+    if 'ind_vars' in params:
+        # each variable is aggregated; this template has no table prefix
+        template = "array_agg(\"%(col)s\"::numeric) As attr%(alias_num)s, "
+
+    # aliases are numbered from 1 in column order
+    pieces = [template % {"col": column, "alias_num": position}
+              for position, column in enumerate(columns, 1)]
+    return "".join(pieces)
+
+
+def query_attr_where(params, table_ref=True):
+    """
+    Build the WHERE conditions that weed out NULL-valued attributes when
+    constructing the neighbors query. `idx_replace` is a placeholder alias
+    that callers substitute with the real table alias.
+    Input: dict of params:
+        {'subquery': ...,
+         'numerator': 'data1',
+         'denominator': 'data2',
+         '': ...}
+    Output:
+        'idx_replace."data1" IS NOT NULL AND idx_replace."data2" IS NOT NULL
+        AND idx_replace."data2" <> 0'
+    Input:
+        {'subquery': ...,
+         'time_cols': ['time1', 'time2', 'time3'],
+         'etc': ...}
+    Output: 'idx_replace."time1" IS NOT NULL AND idx_replace."time2" IS NOT
+        NULL AND idx_replace."time3" IS NOT NULL'
+    """
+    not_null = "\"%s\" IS NOT NULL"
+    if table_ref:
+        not_null = "idx_replace." + not_null
+
+    # markov or gwr where clauses
+    if 'time_cols' in params:
+        # explicit list of time columns
+        return " AND ".join([not_null % col
+                             for col in params['time_cols']])
+    if 'ind_vars' in params:
+        # explicit list of independent variables
+        return " AND ".join([not_null % col
+                             for col in params['ind_vars']])
+
+    # moran: every key that is not query plumbing names an attribute column
+    attr_keys = [key for key in params
+                 if key not in ('id_col', 'geom_col', 'subquery',
+                                'num_ngbrs')]
+
+    # one NOT NULL test per attribute column
+    conditions = [not_null % params[key] for key in attr_keys]
+
+    if 'denominator' in attr_keys:
+        # a zero denominator is excluded as well; the alias placeholder
+        # is used here regardless of `table_ref`
+        conditions.append(
+            "idx_replace.\"%s\" <> 0" % params['denominator'])
+
+    return " AND ".join(conditions)
+
+
+def knn(params):
+    """SQL query for k-nearest neighbors.
+    @param params: dict of values to fill template
+    """
+
+    select_clause = query_attr_select(params, table_ref=True)
+    where_clause = query_attr_where(params, table_ref=True)
+
+    # first pass (%-formatting): splice in the generated clauses, with the
+    # alias placeholder resolved for the outer (i) and inner (j) tables
+    query = '''
+        SELECT
+          i."{id_col}" As id,
+          %(attr_select)s
+          (SELECT ARRAY(SELECT j."{id_col}"
+                        FROM ({subquery}) As j
+                        WHERE i."{id_col}" <> j."{id_col}" AND
+                              %(attr_where_j)s AND
+                              j."{geom_col}" IS NOT NULL
+                        ORDER BY j."{geom_col}" <-> i."{geom_col}" ASC
+                        LIMIT {num_ngbrs})) As neighbors
+        FROM ({subquery}) As i
+        WHERE %(attr_where_i)s AND i."{geom_col}" IS NOT NULL
+        ORDER BY i."{id_col}" ASC;
+    ''' % dict(attr_select=select_clause,
+               attr_where_i=where_clause.replace("idx_replace", "i"),
+               attr_where_j=where_clause.replace("idx_replace", "j"))
+    # second pass (str.format): fill the {placeholders} from params
+    return query.format(**params)
+
+
+# SQL query for finding queens neighbors (all contiguous polygons)
+def queen(params):
+    """SQL query for queen neighbors (geometries that touch).
+    @param params dict: information to fill query
+    """
+    select_clause = query_attr_select(params)
+    where_clause = query_attr_where(params)
+
+    # first pass (%-formatting): splice in the generated clauses, with the
+    # alias placeholder resolved for the outer (i) and inner (j) tables
+    query = '''
+        SELECT
+          i."{id_col}" As id,
+          %(attr_select)s
+          (SELECT ARRAY(SELECT j."{id_col}"
+                        FROM ({subquery}) As j
+                        WHERE i."{id_col}" <> j."{id_col}" AND
+                              ST_Touches(i."{geom_col}", j."{geom_col}") AND
+                              %(attr_where_j)s)) As neighbors
+        FROM ({subquery}) As i
+        WHERE
+            %(attr_where_i)s
+        ORDER BY i."{id_col}" ASC;
+    ''' % dict(attr_select=select_clause,
+               attr_where_i=where_clause.replace("idx_replace", "i"),
+               attr_where_j=where_clause.replace("idx_replace", "j"))
+    # second pass (str.format): fill the {placeholders} from params
+    return query.format(**params)
+
+
+def gwr_query(params):
+    """
+    SQL that aggregates centroid coordinates, the dependent variable, the
+    independent variables and row ids into arrays for GWR
+    """
+    select_clause = query_attr_select(params, table_ref=None)
+    where_clause = query_attr_where(params, table_ref=None)
+
+    query = '''
+        SELECT
+          array_agg(ST_X(ST_Centroid("{geom_col}"))) As x,
+          array_agg(ST_Y(ST_Centroid("{geom_col}"))) As y,
+          array_agg("{dep_var}") As dep_var,
+          %(ind_vars_select)s
+          array_agg("{id_col}") As rowid
+        FROM ({subquery}) As q
+        WHERE
+          "{dep_var}" IS NOT NULL AND
+          %(ind_vars_where)s
+    ''' % {"ind_vars_select": select_clause,
+           "ind_vars_where": where_clause}
+
+    # fill the {placeholders} from params, then trim the surrounding blanks
+    return query.format(**params).strip()
+
+
+def gwr_predict_query(params):
+    """
+    GWR prediction query: like `gwr_query`, but identifiers are not quoted
+    and rows with a NULL dependent variable are kept
+    """
+    select_clause = query_attr_select(params, table_ref=None)
+    where_clause = query_attr_where(params, table_ref=None)
+
+    query = '''
+        SELECT
+          array_agg(ST_X(ST_Centroid({geom_col}))) As x,
+          array_agg(ST_Y(ST_Centroid({geom_col}))) As y,
+          array_agg({dep_var}) As dep_var,
+          %(ind_vars_select)s
+          array_agg({id_col}) As rowid
+        FROM ({subquery}) As q
+        WHERE
+          %(ind_vars_where)s
+    ''' % {"ind_vars_select": select_clause,
+           "ind_vars_where": where_clause}
+
+    # fill the {placeholders} from params, then trim the surrounding blanks
+    return query.format(**params).strip()
+# to add more weight methods open a ticket or pull request
+
+
+def get_attributes(query_res, attr_num=1):
+    """Values of `attr<attr_num>` for every row, as a float array
+    @param query_res: query results with attributes and neighbors
+    @param attr_num: attribute number (1, 2, ...)
+    """
+    # builtin float: the np.float alias no longer exists in current NumPy
+    return np.array([x['attr' + str(attr_num)] for x in query_res], float)
diff --git a/release/python/0.8.0/crankshaft/crankshaft/random_seeds.py b/release/python/0.8.0/crankshaft/crankshaft/random_seeds.py
new file mode 100644
index 0000000..c55ba14
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/random_seeds.py
@@ -0,0 +1,12 @@
+"""Random seed generator used for non-deterministic functions in crankshaft"""
+import random
+import numpy
+
+
+def set_random_seeds(value):
+    """
+    Seed both RNGs (Random Number Generators) used internally:
+    the standard library's `random` module and NumPy's global generator.
+    """
+    for seed_rng in (random.seed, numpy.random.seed):
+        seed_rng(value)
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/__init__.py b/release/python/0.8.0/crankshaft/crankshaft/regression/__init__.py
new file mode 100644
index 0000000..f9d6d07
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/__init__.py
@@ -0,0 +1,3 @@
+from crankshaft.regression.gwr import *
+from crankshaft.regression.glm import *
+from crankshaft.regression.gwr_cs import *
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/glm/GLM_validate_estimation.ipynb b/release/python/0.8.0/crankshaft/crankshaft/regression/glm/GLM_validate_estimation.ipynb
new file mode 100644
index 0000000..1b17831
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/glm/GLM_validate_estimation.ipynb
@@ -0,0 +1,444 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "#Import GLM and pysal\n",
+ "import os\n",
+ "import numpy as np\n",
+ "os.chdir('/Users/toshan/dev/pysal/pysal/contrib/glm')\n",
+ "from glm import GLM\n",
+ "import pysal\n",
+ "import pandas as pd\n",
+ "import statsmodels.formula.api as smf\n",
+ "import statsmodels.api as sm\n",
+ "from family import Gaussian, Binomial, Poisson, QuasiPoisson\n",
+ "\n",
+ "from statsmodels.api import families"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "#Prepare some test data - columbus example\n",
+ "db = pysal.open(pysal.examples.get_path('columbus.dbf'),'r')\n",
+ "y = np.array(db.by_col(\"HOVAL\"))\n",
+ "y = np.reshape(y, (49,1))\n",
+ "X = []\n",
+ "#X.append(np.ones(len(y)))\n",
+ "X.append(db.by_col(\"INC\"))\n",
+ "X.append(db.by_col(\"CRIME\"))\n",
+ "X = np.array(X).T"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[[ 46.42818268]\n",
+ " [ 0.62898397]\n",
+ " [ -0.48488854]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "#First fit pysal OLS model\n",
+ "from pysal.spreg import ols\n",
+ "OLS = ols.OLS(y, X)\n",
+ "print OLS.betas"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": false,
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n",
+ "\n",
+ "[ 46.42818268 0.62898397 -0.48488854]\n",
+ "[ 46.42818268 0.62898397 -0.48488854]\n"
+ ]
+ }
+ ],
+ "source": [
+ "#Then fit Gaussian GLM\n",
+ "\n",
+ "#create Gaussian GLM model object\n",
+ "model = GLM(y, X, Gaussian())\n",
+ "model\n",
+ "\n",
+ "#Fit model to estimate coefficients and return GLMResults object\n",
+ "results = model.fit()\n",
+ "\n",
+ "#Check coefficients - R betas [46.4282, 0.6290, -0.4849]\n",
+ "print results.params\n",
+ "\n",
+ "# Gaussian GLM results from statsmodels\n",
+ "sm_model = smf.GLM(y, sm.add_constant(X), family=families.Gaussian())\n",
+ "sm_results = sm_model.fit()\n",
+ "print sm_results.params"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2 2\n",
+ "\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "\n",
+ "\n",
+ "\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n"
+ ]
+ }
+ ],
+ "source": [
+ "print results.df_model, sm_results.df_model\n",
+ "print np.allclose(results.aic, sm_results.aic)\n",
+ "print np.allclose(results.bic, sm_results.bic)\n",
+ "print np.allclose(results.deviance, sm_results.deviance)\n",
+ "print np.allclose(results.df_model, sm_results.df_model)\n",
+ "print np.allclose(results.df_resid, sm_results.df_resid)\n",
+ "print np.allclose(results.llf, sm_results.llf)\n",
+ "print np.allclose(results.mu, sm_results.mu)\n",
+ "print np.allclose(results.n, sm_results.nobs)\n",
+ "print np.allclose(results.null, sm_results.null)\n",
+ "print np.allclose(results.null_deviance, sm_results.null_deviance)\n",
+ "print np.allclose(results.params, sm_results.params)\n",
+ "print np.allclose(results.pearson_chi2, sm_results.pearson_chi2)\n",
+ "print np.allclose(results.resid_anscombe, sm_results.resid_anscombe)\n",
+ "print np.allclose(results.resid_deviance, sm_results.resid_deviance)\n",
+ "print np.allclose(results.resid_pearson, sm_results.resid_pearson)\n",
+ "print np.allclose(results.resid_response, sm_results.resid_response)\n",
+ "print np.allclose(results.resid_working, sm_results.resid_working)\n",
+ "print np.allclose(results.scale, sm_results.scale)\n",
+ "print np.allclose(results.normalized_cov_params, sm_results.normalized_cov_params)\n",
+ "print np.allclose(results.cov_params(), sm_results.cov_params())\n",
+ "print np.allclose(results.bse, sm_results.bse)\n",
+ "print np.allclose(results.conf_int(), sm_results.conf_int())\n",
+ "print np.allclose(results.pvalues, sm_results.pvalues)\n",
+ "print np.allclose(results.tvalues, sm_results.tvalues)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n",
+ "\n",
+ "[ 3.92159085 0.01183491 -0.01371397]\n",
+ "[ 3.92159085 0.01183491 -0.01371397]\n"
+ ]
+ }
+ ],
+ "source": [
+ "#Now fit a Poisson GLM \n",
+ "\n",
+ "poisson_y = np.round(y).astype(int)\n",
+ "\n",
+ "#create Poisson GLM model object\n",
+ "model = GLM(poisson_y, X, Poisson())\n",
+ "model\n",
+ "\n",
+ "#Fit model to estimate coefficients and return GLMResults object\n",
+ "results = model.fit()\n",
+ "\n",
+ "#Check coefficients - R betas [3.91926, 0.01198, -0.01371]\n",
+ "print results.params.T\n",
+ "\n",
+ "# Poisson GLM results from statsmodels\n",
+ "sm_results = smf.GLM(poisson_y, sm.add_constant(X), family=families.Poisson()).fit()\n",
+ "print sm_results.params"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "\n",
+ "\n",
+ "\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "[ 0.13049161 0.00511599 0.00193769] [ 0.13049161 0.00511599 0.00193769]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print np.allclose(results.aic, sm_results.aic)\n",
+ "print np.allclose(results.bic, sm_results.bic)\n",
+ "print np.allclose(results.deviance, sm_results.deviance)\n",
+ "print np.allclose(results.df_model, sm_results.df_model)\n",
+ "print np.allclose(results.df_resid, sm_results.df_resid)\n",
+ "print np.allclose(results.llf, sm_results.llf)\n",
+ "print np.allclose(results.mu, sm_results.mu)\n",
+ "print np.allclose(results.n, sm_results.nobs)\n",
+ "print np.allclose(results.null, sm_results.null)\n",
+ "print np.allclose(results.null_deviance, sm_results.null_deviance)\n",
+ "print np.allclose(results.params, sm_results.params)\n",
+ "print np.allclose(results.pearson_chi2, sm_results.pearson_chi2)\n",
+ "print np.allclose(results.resid_anscombe, sm_results.resid_anscombe)\n",
+ "print np.allclose(results.resid_deviance, sm_results.resid_deviance)\n",
+ "print np.allclose(results.resid_pearson, sm_results.resid_pearson)\n",
+ "print np.allclose(results.resid_response, sm_results.resid_response)\n",
+ "print np.allclose(results.resid_working, sm_results.resid_working)\n",
+ "print np.allclose(results.scale, sm_results.scale)\n",
+ "print np.allclose(results.normalized_cov_params, sm_results.normalized_cov_params)\n",
+ "print np.allclose(results.cov_params(), sm_results.cov_params())\n",
+ "print np.allclose(results.bse, sm_results.bse)\n",
+ "print np.allclose(results.conf_int(), sm_results.conf_int())\n",
+ "print np.allclose(results.pvalues, sm_results.pvalues)\n",
+ "print np.allclose(results.tvalues, sm_results.tvalues)\n",
+ "print results.bse, sm_results.bse"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "metadata": {
+ "collapsed": false,
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[-5.33638276 0.0287754 ]\n",
+ "[-5.33638276 0.0287754 ]\n"
+ ]
+ }
+ ],
+ "source": [
+ "#Now fit a binomial GLM\n",
+ "londonhp = pd.read_csv('/Users/toshan/projects/londonhp.csv')\n",
+ "#londonhp = pd.read_csv('/Users/qszhao/Dropbox/pysal/pysal/contrib/gwr/londonhp.csv')\n",
+ "y = londonhp['BATH2'].values\n",
+ "y = np.reshape(y, (316,1))\n",
+ "X = londonhp['FLOORSZ'].values\n",
+ "X = np.reshape(X, (316,1))\n",
+ "\n",
+ "#create logistic GLM model object\n",
+ "model = GLM(y, X, Binomial())\n",
+ "model\n",
+ "\n",
+ "#Fit model to estimate coefficients and return GLMResults object\n",
+ "results = model.fit()\n",
+ "\n",
+ "#Check coefficients - R betas [-5.33638, 0.02878]\n",
+ "print results.params.T\n",
+ "\n",
+ "# Logistic GLM results from statsmodels\n",
+ "sm_results = smf.GLM(y, sm.add_constant(X), family=families.Binomial()).fit()\n",
+ "print sm_results.params"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1 1\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n",
+ "True\n"
+ ]
+ }
+ ],
+ "source": [
+ "print results.df_model, sm_results.df_model\n",
+ "print np.allclose(results.aic, sm_results.aic)\n",
+ "print np.allclose(results.bic, sm_results.bic)\n",
+ "print np.allclose(results.deviance, sm_results.deviance)\n",
+ "print np.allclose(results.df_model, sm_results.df_model)\n",
+ "print np.allclose(results.df_resid, sm_results.df_resid)\n",
+ "print np.allclose(results.llf, sm_results.llf)\n",
+ "print np.allclose(results.mu, sm_results.mu)\n",
+ "print np.allclose(results.n, sm_results.nobs)\n",
+ "print np.allclose(results.null, sm_results.null)\n",
+ "print np.allclose(results.null_deviance, sm_results.null_deviance)\n",
+ "print np.allclose(results.params, sm_results.params)\n",
+ "print np.allclose(results.pearson_chi2, sm_results.pearson_chi2)\n",
+ "print np.allclose(results.resid_anscombe, sm_results.resid_anscombe)\n",
+ "print np.allclose(results.resid_deviance, sm_results.resid_deviance)\n",
+ "print np.allclose(results.resid_pearson, sm_results.resid_pearson)\n",
+ "print np.allclose(results.resid_response, sm_results.resid_response)\n",
+ "print np.allclose(results.resid_working, sm_results.resid_working)\n",
+ "print np.allclose(results.scale, sm_results.scale)\n",
+ "print np.allclose(results.normalized_cov_params, sm_results.normalized_cov_params)\n",
+ "print np.allclose(results.cov_params(), sm_results.cov_params())\n",
+ "print np.allclose(results.bse, sm_results.bse)\n",
+ "print np.allclose(results.conf_int(), sm_results.conf_int())\n",
+ "print np.allclose(results.pvalues, sm_results.pvalues)\n",
+ "print np.allclose(results.tvalues, sm_results.tvalues)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "#create QuasiPoisson GLM model object\n",
+ "model = GLM(poisson_y, X, QuasiPoisson())\n",
+ "model\n",
+ "\n",
+ "#Fit model to estimate coefficients and return GLMResults object\n",
+ "results = model.fit()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/glm/__init__.py b/release/python/0.8.0/crankshaft/crankshaft/regression/glm/__init__.py
new file mode 100644
index 0000000..4a468d5
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/glm/__init__.py
@@ -0,0 +1,4 @@
+import glm
+import family
+import utils
+import iwls
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/glm/base.py b/release/python/0.8.0/crankshaft/crankshaft/regression/glm/base.py
new file mode 100644
index 0000000..484c1c8
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/glm/base.py
@@ -0,0 +1,959 @@
+
+from __future__ import print_function
+import numpy as np
+from scipy import stats
+from utils import cache_readonly
+
+class Results(object):
+    """
+    Container pairing a fitted model with its parameter estimates.
+    Parameters
+    ----------
+    model : class instance
+        the previously specified model instance
+    params : array
+        parameter estimates from the fit model
+    """
+    def __init__(self, model, params, **kwd):
+        vars(self).update(kwd)
+        self.initialize(model, params, **kwd)
+        self._data_attr = []
+
+    def initialize(self, model, params, **kwd):
+        self.model = model
+        self.params = params
+        if hasattr(model, 'k_constant'):
+            self.k_constant = model.k_constant
+
+    def predict(self, exog=None, transform=True, *args, **kwargs):
+        """
+        Forward to self.model.predict, passing self.params first.
+        Parameters
+        ----------
+        exog : array-like, optional
+            The values for which you want to predict.
+        transform : bool, optional
+            If the model was fit via a formula, do you want to pass
+            exog through the formula. Default is True. E.g., if you fit
+            a model y ~ log(x1) + log(x2), and transform is True, then
+            you can pass a data structure that contains x1 and x2 in
+            their original form. Otherwise, you'd need to log the data
+            first.
+        args, kwargs :
+            Some models can take additional arguments or keywords, see the
+            predict method of the model for the details.
+        Returns
+        -------
+        prediction : ndarray or pandas.Series
+            See self.model.predict
+        """
+        if exog is None:
+            return self.model.predict(self.params, exog, *args, **kwargs)
+        if transform and hasattr(self.model, 'formula'):
+            from patsy import dmatrix
+            exog = dmatrix(self.model.data.design_info.builder, exog)
+
+        exog = np.asarray(exog)
+        if exog.ndim == 1:
+            fitted_exog = self.model.exog
+            if fitted_exog.ndim == 1 or fitted_exog.shape[1] == 1:
+                exog = exog[:, None]
+        exog = np.atleast_2d(exog)  # needed in count model shape[1]
+        return self.model.predict(self.params, exog, *args, **kwargs)
+
+
+#TODO: public method?
+class LikelihoodModelResults(Results):
+ """
+ Class to contain results from likelihood models
+ Parameters
+ -----------
+ model : LikelihoodModel instance or subclass instance
+ LikelihoodModelResults holds a reference to the model that is fit.
+ params : 1d array_like
+ parameter estimates from estimated model
+ normalized_cov_params : 2d array
+ Normalized (before scaling) covariance of params. (dot(X.T,X))**-1
+ scale : float
+ For (some subset of models) scale will typically be the
+ mean square error from the estimated model (sigma^2)
+ Returns
+ -------
+ **Attributes**
+ mle_retvals : dict
+ Contains the values returned from the chosen optimization method if
+ full_output is True during the fit. Available only if the model
+ is fit by maximum likelihood. See notes below for the output from
+ the different methods.
+ mle_settings : dict
+ Contains the arguments passed to the chosen optimization method.
+ Available if the model is fit by maximum likelihood. See
+ LikelihoodModel.fit for more information.
+ model : model instance
+ LikelihoodResults contains a reference to the model that is fit.
+ params : ndarray
+ The parameters estimated for the model.
+ scale : float
+ The scaling factor of the model given during instantiation.
+ tvalues : array
+ The t-values of the standard errors.
+ Notes
+ -----
+ The covariance of params is given by scale times normalized_cov_params.
+ Return values by solver if full_output is True during fit:
+ 'newton'
+ fopt : float
+ The value of the (negative) loglikelihood at its
+ minimum.
+ iterations : int
+ Number of iterations performed.
+ score : ndarray
+ The score vector at the optimum.
+ Hessian : ndarray
+ The Hessian at the optimum.
+ warnflag : int
+ 1 if maxiter is exceeded. 0 if successful convergence.
+ converged : bool
+ True: converged. False: did not converge.
+ allvecs : list
+ List of solutions at each iteration.
+ 'nm'
+ fopt : float
+ The value of the (negative) loglikelihood at its
+ minimum.
+ iterations : int
+ Number of iterations performed.
+ warnflag : int
+ 1: Maximum number of function evaluations made.
+ 2: Maximum number of iterations reached.
+ converged : bool
+ True: converged. False: did not converge.
+ allvecs : list
+ List of solutions at each iteration.
+ 'bfgs'
+ fopt : float
+ Value of the (negative) loglikelihood at its minimum.
+ gopt : float
+ Value of gradient at minimum, which should be near 0.
+ Hinv : ndarray
+ value of the inverse Hessian matrix at minimum. Note
+ that this is just an approximation and will often be
+ different from the value of the analytic Hessian.
+ fcalls : int
+ Number of calls to loglike.
+ gcalls : int
+ Number of calls to gradient/score.
+ warnflag : int
+ 1: Maximum number of iterations exceeded. 2: Gradient
+ and/or function calls are not changing.
+ converged : bool
+ True: converged. False: did not converge.
+ allvecs : list
+ Results at each iteration.
+ 'lbfgs'
+ fopt : float
+ Value of the (negative) loglikelihood at its minimum.
+ gopt : float
+ Value of gradient at minimum, which should be near 0.
+ fcalls : int
+ Number of calls to loglike.
+ warnflag : int
+ Warning flag:
+ - 0 if converged
+ - 1 if too many function evaluations or too many iterations
+ - 2 if stopped for another reason
+ converged : bool
+ True: converged. False: did not converge.
+ 'powell'
+ fopt : float
+ Value of the (negative) loglikelihood at its minimum.
+ direc : ndarray
+ Current direction set.
+ iterations : int
+ Number of iterations performed.
+ fcalls : int
+ Number of calls to loglike.
+ warnflag : int
+ 1: Maximum number of function evaluations. 2: Maximum number
+ of iterations.
+ converged : bool
+ True : converged. False: did not converge.
+ allvecs : list
+ Results at each iteration.
+ 'cg'
+ fopt : float
+ Value of the (negative) loglikelihood at its minimum.
+ fcalls : int
+ Number of calls to loglike.
+ gcalls : int
+ Number of calls to gradient/score.
+ warnflag : int
+ 1: Maximum number of iterations exceeded. 2: Gradient and/
+ or function calls not changing.
+ converged : bool
+ True: converged. False: did not converge.
+ allvecs : list
+ Results at each iteration.
+ 'ncg'
+ fopt : float
+ Value of the (negative) loglikelihood at its minimum.
+ fcalls : int
+ Number of calls to loglike.
+ gcalls : int
+ Number of calls to gradient/score.
+ hcalls : int
+ Number of calls to hessian.
+ warnflag : int
+ 1: Maximum number of iterations exceeded.
+ converged : bool
+ True: converged. False: did not converge.
+ allvecs : list
+ Results at each iteration.
+ """
+
+ # by default we use normal distribution
+ # can be overwritten by instances or subclasses
+ use_t = False
+
+    def __init__(self, model, params, normalized_cov_params=None, scale=1.,
+                 **kwargs):
+        # Record model/params through Results, then the covariance inputs.
+        super(LikelihoodModelResults, self).__init__(model, params)
+        self.scale = scale
+        self.normalized_cov_params = normalized_cov_params
+
+        # Only an explicit, non-None use_t replaces the class-level default.
+        if kwargs.get('use_t') is not None:
+            self.use_t = kwargs['use_t']
+
+        # cov_type travels in kwargs so subclasses can decide in fit whether
+        # to use this generic robust-covariance implementation.
+        if 'cov_type' not in kwargs:
+            return
+        cov_type = kwargs['cov_type']
+        if cov_type == 'nonrobust':
+            self.cov_type = 'nonrobust'
+            self.cov_kwds = {'description' : 'Standard Errors assume that the ' +
+                             'covariance matrix of the errors is correctly ' +
+                             'specified.'}
+            return
+
+        from statsmodels.base.covtype import get_robustcov_results
+        cov_kwds = kwargs.get('cov_kwds', {})
+        if cov_kwds is None:
+            cov_kwds = {}
+        # TODO: we shouldn't need use_t in get_robustcov_results
+        get_robustcov_results(self, cov_type=cov_type, use_self=True,
+                              use_t=self.use_t, **cov_kwds)
+
+
+ def normalized_cov_params(self):  # NOTE: __init__ assigns self.normalized_cov_params, so instances see that value, not this method
+ raise NotImplementedError  # placeholder: calling the class-level method always raises
+
+
+    def _get_robustcov_results(self, cov_type='nonrobust', use_self=True,
+                               use_t=None, **cov_kwds):
+        """Set cov_type/cov_kwds on self; robust types go to statsmodels."""
+        if cov_type == 'nonrobust':
+            self.cov_type = 'nonrobust'
+            self.cov_kwds = {'description' : 'Standard Errors assume that the ' +
+                             'covariance matrix of the errors is correctly ' +
+                             'specified.'}
+            return
+        # Imported lazily so the nonrobust path does not require statsmodels.
+        from statsmodels.base.covtype import get_robustcov_results
+        # NOTE: the use_self argument is not forwarded; True is always passed.
+        # TODO: we shouldn't need use_t in get_robustcov_results
+        get_robustcov_results(self, cov_type=cov_type, use_self=True,
+                              use_t=use_t, **cov_kwds)
+
+ @cache_readonly
+ def llf(self):
+ return self.model.loglike(self.params)  # the model's loglike evaluated at the stored params
+
+ @cache_readonly
+ def bse(self):
+ return np.sqrt(np.diag(self.cov_params()))  # square root of the diagonal of cov_params()
+
+    @cache_readonly
+    def tvalues(self):
+        """t-statistic of each parameter estimate: params / bse."""
+        estimates = self.params
+        std_errors = self.bse
+        return estimates / std_errors
+
+    @cache_readonly
+    def pvalues(self):
+        """Two-sided p-values of tvalues (Student t if use_t, else normal)."""
+        if not self.use_t:
+            return 2 * stats.norm.sf(np.abs(self.tvalues))
+        dof = getattr(self, 'df_resid_inference', self.df_resid)
+        return 2 * stats.t.sf(np.abs(self.tvalues), dof)
+
+
+    def cov_params(self, r_matrix=None, column=None, scale=None, cov_p=None,
+                   other=None):
+        """
+        Returns the variance/covariance matrix.
+        The variance/covariance matrix can be of a linear contrast
+        of the estimates of params or all params multiplied by scale which
+        will usually be an estimate of sigma^2. Scale is assumed to be
+        a scalar.
+        Parameters
+        ----------
+        r_matrix : array-like
+            Can be 1d, or 2d. Can be used alone or with other.
+        column : array-like, optional
+            Must be used on its own. Can be 0d or 1d see below.
+        scale : float, optional
+            Can be specified or not. Default is None, which means that
+            the scale argument is taken from the model.
+        cov_p : array-like, optional
+            Covariance to use as is (scale is not applied to it).
+        other : array-like, optional
+            Can be used when r_matrix is specified.
+        Returns
+        -------
+        cov : ndarray
+            covariance matrix of the parameter estimates or of linear
+            combination of parameter estimates. See Notes.
+        Notes
+        -----
+        (The below are assumed to be in matrix notation.)
+        If no argument is specified returns the covariance matrix of a model
+        ``(scale)*(X.T X)^(-1)``
+        If contrast is specified it pre and post-multiplies as follows
+        ``(scale) * r_matrix (X.T X)^(-1) r_matrix.T``
+        If contrast and other are specified returns
+        ``(scale) * r_matrix (X.T X)^(-1) other.T``
+        If column is specified returns
+        ``(scale) * (X.T X)^(-1)[column,column]`` if column is 0d
+        OR
+        ``(scale) * (X.T X)^(-1)[column][:,column]`` if column is 1d
+        """
+        if (hasattr(self, 'mle_settings') and
+                self.mle_settings['optimizer'] in ['l1', 'l1_cvxopt_cp']):
+            # nan_dot is not defined in this module; import it where needed.
+            from statsmodels.tools.tools import nan_dot as dot_fun
+        else:
+            dot_fun = np.dot
+
+        if (cov_p is None and self.normalized_cov_params is None and
+                not hasattr(self, 'cov_params_default')):
+            raise ValueError('need covariance of parameters for computing '
+                             '(unnormalized) covariances')
+        if column is not None and (r_matrix is not None or other is not None):
+            raise ValueError('Column should be specified without other '
+                             'arguments.')
+        if other is not None and r_matrix is None:
+            raise ValueError('other can only be specified with r_matrix')
+
+        if cov_p is None:
+            if hasattr(self, 'cov_params_default'):
+                cov_p = self.cov_params_default
+            else:
+                if scale is None:
+                    scale = self.scale
+                cov_p = self.normalized_cov_params * scale
+
+        if column is not None:
+            column = np.asarray(column)
+            if column.shape == ():
+                return cov_p[column, column]
+            return cov_p[column[:, None], column]
+        elif r_matrix is not None:
+            r_matrix = np.asarray(r_matrix)
+            if r_matrix.shape == ():
+                raise ValueError("r_matrix should be 1d or 2d")
+            if other is None:
+                other = r_matrix
+            else:
+                other = np.asarray(other)
+            return dot_fun(r_matrix, dot_fun(cov_p, np.transpose(other)))
+        else:  # if r_matrix is None and column is None:
+            return cov_p
+
+ #TODO: make sure this works as needed for GLMs
+    def t_test(self, r_matrix, cov_p=None, scale=None,
+               use_t=None):
+        """
+        Compute a t-test for each linear hypothesis of the form Rb = q
+        Parameters
+        ----------
+        r_matrix : array-like, str, tuple
+            - array : If an array is given, a p x k 2d array or length k 1d
+              array specifying the linear restrictions. It is assumed
+              that the linear combination is equal to zero.
+            - str : The full hypotheses to test can be given as a string.
+              See the examples.
+            - tuple : A tuple of arrays in the form (R, q). If q is given,
+              can be either a scalar or a length p row vector.
+        cov_p : array-like, optional
+            An alternative estimate for the parameter covariance matrix.
+            If None is given, self.normalized_cov_params is used.
+        scale : float, optional
+            An optional `scale` to use. Default is the scale specified
+            by the model fit.
+        use_t : bool, optional
+            If use_t is None, then the default of the model is used.
+            If use_t is True, then the p-values are based on the t
+            distribution.
+            If use_t is False, then the p-values are based on the normal
+            distribution.
+        Returns
+        -------
+        res : ContrastResults instance
+            The results for the test are attributes of this results instance.
+            The available results have the same elements as the parameter table
+            in `summary()`.
+        Examples
+        --------
+        >>> import numpy as np
+        >>> import statsmodels.api as sm
+        >>> data = sm.datasets.longley.load()
+        >>> data.exog = sm.add_constant(data.exog)
+        >>> results = sm.OLS(data.endog, data.exog).fit()
+        >>> r = np.zeros_like(results.params)
+        >>> r[5:] = [1,-1]
+        >>> print(r)
+        [ 0. 0. 0. 0. 0. 1. -1.]
+        r tests that the coefficients on the 5th and 6th independent
+        variable are the same.
+        >>> T_test = results.t_test(r)
+        >>> print(T_test)
+
+        >>> T_test.effect
+        -1829.2025687192481
+        >>> T_test.sd
+        455.39079425193762
+        >>> T_test.tvalue
+        -4.0167754636411717
+        >>> T_test.pvalue
+        0.0015163772380899498
+        Alternatively, you can specify the hypothesis tests using a string
+        >>> from statsmodels.formula.api import ols
+        >>> dta = sm.datasets.longley.load_pandas().data
+        >>> formula = 'TOTEMP ~ GNPDEFL + GNP + UNEMP + ARMED + POP + YEAR'
+        >>> results = ols(formula, dta).fit()
+        >>> hypotheses = 'GNPDEFL = GNP, UNEMP = 2, YEAR/1829 = 1'
+        >>> t_test = results.t_test(hypotheses)
+        >>> print(t_test)
+        See Also
+        --------
+        tvalues : individual t statistics
+        f_test : for F tests
+        patsy.DesignInfo.linear_constraint
+        """
+        from patsy import DesignInfo
+        from statsmodels.stats.contrast import ContrastResults
+        from statsmodels.tools.tools import recipr
+        names = self.model.data.param_names
+        LC = DesignInfo(names).linear_constraint(r_matrix)
+        r_matrix, q_matrix = LC.coefs, LC.constants
+        num_ttests = r_matrix.shape[0]
+        num_params = r_matrix.shape[1]
+
+        if (cov_p is None and self.normalized_cov_params is None and
+                not hasattr(self, 'cov_params_default')):
+            raise ValueError('Need covariance of parameters for computing '
+                             'T statistics')
+        if num_params != self.params.shape[0]:
+            raise ValueError('r_matrix and params are not aligned')
+        if q_matrix is None:
+            q_matrix = np.zeros(num_ttests)
+        else:
+            q_matrix = np.asarray(q_matrix)
+            q_matrix = q_matrix.squeeze()
+        if q_matrix.size > 1:
+            if q_matrix.shape[0] != num_ttests:
+                raise ValueError("r_matrix and q_matrix must have the same "
+                                 "number of rows")
+
+        if use_t is None:
+            #switch to use_t false if undefined
+            use_t = (hasattr(self, 'use_t') and self.use_t)
+
+        _effect = np.dot(r_matrix, self.params)
+        # NOTE(review): `scale` is accepted but never forwarded to cov_params.
+
+        # Perform the test
+        if num_ttests > 1:
+            _sd = np.sqrt(np.diag(self.cov_params(
+                r_matrix=r_matrix, cov_p=cov_p)))
+        else:
+            _sd = np.sqrt(self.cov_params(r_matrix=r_matrix, cov_p=cov_p))
+        _t = (_effect - q_matrix) * recipr(_sd)
+
+        df_resid = getattr(self, 'df_resid_inference', self.df_resid)
+
+        if use_t:
+            return ContrastResults(effect=_effect, t=_t, sd=_sd,
+                                   df_denom=df_resid)
+        else:
+            return ContrastResults(effect=_effect, statistic=_t, sd=_sd,
+                                   df_denom=df_resid,
+                                   distribution='norm')
+
+    def f_test(self, r_matrix, cov_p=None, scale=1.0, invcov=None):
+        """
+        Run the F-test of a joint linear hypothesis.
+        Thin wrapper around `wald_test` that always requests the F
+        distribution (use_f=True).
+        Parameters
+        ----------
+        r_matrix : array-like, str, or tuple
+            - array : An r x k array where r is the number of restrictions to
+              test and k is the number of regressors. It is assumed
+              that the linear combination is equal to zero.
+            - str : The full hypotheses to test can be given as a string.
+              See the examples.
+            - tuple : A tuple of arrays in the form (R, q), ``q`` can be
+              either a scalar or a length k row vector.
+        cov_p : array-like, optional
+            An alternative estimate for the parameter covariance matrix.
+            If None is given, self.normalized_cov_params is used.
+        scale : float, optional
+            Default is 1.0 for no scaling.
+        invcov : array-like, optional
+            A q x q array to specify an inverse covariance matrix based on a
+            restrictions matrix.
+        Returns
+        -------
+        res : ContrastResults instance
+            The results for the test are attributes of this results instance.
+        Examples
+        --------
+        >>> import numpy as np
+        >>> import statsmodels.api as sm
+        >>> data = sm.datasets.longley.load()
+        >>> data.exog = sm.add_constant(data.exog)
+        >>> results = sm.OLS(data.endog, data.exog).fit()
+        >>> A = np.identity(len(results.params))
+        >>> A = A[1:,:]
+        This tests that each coefficient is jointly statistically
+        significantly different from zero.
+        >>> print(results.f_test(A))
+
+        Compare this to
+        >>> results.fvalue
+        330.2853392346658
+        >>> results.f_pvalue
+        4.98403096572e-10
+        >>> B = np.array(([0,0,1,-1,0,0,0],[0,0,0,0,0,1,-1]))
+        This tests that the coefficient on the 2nd and 3rd regressors are
+        equal and jointly that the coefficient on the 5th and 6th regressors
+        are equal.
+        >>> print(results.f_test(B))
+
+        Alternatively, you can specify the hypothesis tests using a string
+        >>> from statsmodels.datasets import longley
+        >>> from statsmodels.formula.api import ols
+        >>> dta = longley.load_pandas().data
+        >>> formula = 'TOTEMP ~ GNPDEFL + GNP + UNEMP + ARMED + POP + YEAR'
+        >>> results = ols(formula, dta).fit()
+        >>> hypotheses = '(GNPDEFL = GNP), (UNEMP = 2), (YEAR/1829 = 1)'
+        >>> f_test = results.f_test(hypotheses)
+        >>> print(f_test)
+        See Also
+        --------
+        statsmodels.stats.contrast.ContrastResults
+        wald_test
+        t_test
+        patsy.DesignInfo.linear_constraint
+        Notes
+        -----
+        The matrix `r_matrix` is assumed to be non-singular. More precisely,
+        r_matrix (pX pX.T) r_matrix.T
+        is assumed invertible. Here, pX is the generalized inverse of the
+        design matrix of the model. There can be problems in non-OLS models
+        where the rank of the covariance of the noise is not full.
+        """
+        # Delegate to wald_test, forcing the F distribution.
+        return self.wald_test(r_matrix, cov_p=cov_p, scale=scale,
+                              invcov=invcov, use_f=True)
+
+ #TODO: untested for GLMs?
+ def wald_test(self, r_matrix, cov_p=None, scale=1.0, invcov=None,
+               use_f=None):
+     """
+     Compute a Wald-test for a joint linear hypothesis.
+
+     Parameters
+     ----------
+     r_matrix : array-like, str, or tuple
+         - array : An r x k array where r is the number of restrictions to
+           test and k is the number of regressors. It is assumed that the
+           linear combination is equal to zero.
+         - str : The full hypotheses to test can be given as a string.
+         - tuple : A tuple of arrays in the form (R, q), ``q`` can be
+           either a scalar or a length p row vector.
+     cov_p : array-like, optional
+         An alternative estimate for the parameter covariance matrix. It is
+         forwarded to ``self.cov_params`` when ``invcov`` is not given.
+     scale : float, optional
+         Default is 1.0 for no scaling. The body of this method does not
+         reference it.
+     invcov : array-like, optional
+         Inverse covariance matrix of the restrictions. When None it is
+         computed by inverting ``self.cov_params(r_matrix=..., cov_p=...)``.
+     use_f : bool
+         If True, the statistic is divided by the number of restrictions
+         and reported as an F statistic. If False, it is reported as a
+         chisquare statistic. If None, F is used only when the instance
+         has a truthy ``use_t`` attribute.
+
+     Returns
+     -------
+     res : ContrastResults instance
+         The results for the test are attributes of this results instance.
+
+     Raises
+     ------
+     ValueError
+         If no parameter covariance is available, if the constants do not
+         have one row per restriction, or if the restricted covariance
+         contains NaN.
+
+     See also
+     --------
+     statsmodels.stats.contrast.ContrastResults
+     f_test
+     t_test
+     patsy.DesignInfo.linear_constraint
+
+     Notes
+     -----
+     The matrix `r_matrix` is assumed to be non-singular. More precisely,
+     r_matrix (pX pX.T) r_matrix.T
+     is assumed invertible. Here, pX is the generalized inverse of the
+     design matrix of the model. There can be problems in non-OLS models
+     where the rank of the covariance of the noise is not full.
+     """
+     if use_f is None:
+         # fall back to chi2 unless the results declare t-based inference
+         use_f = (hasattr(self, 'use_t') and self.use_t)
+
+     from patsy import DesignInfo
+     names = self.model.data.param_names
+     LC = DesignInfo(names).linear_constraint(r_matrix)
+     r_matrix, q_matrix = LC.coefs, LC.constants
+
+     if (self.normalized_cov_params is None and cov_p is None and
+             invcov is None and not hasattr(self, 'cov_params_default')):
+         raise ValueError('need covariance of parameters for computing '
+                          'F statistics')
+
+     cparams = np.dot(r_matrix, self.params[:, None])
+     n_restrictions = r_matrix.shape[0]
+     J = float(n_restrictions)  # number of restrictions
+     if q_matrix is None:
+         # np.zeros needs an integer length; a float count is rejected by
+         # current numpy releases
+         q_matrix = np.zeros(n_restrictions)
+     else:
+         q_matrix = np.asarray(q_matrix)
+     if q_matrix.ndim == 1:
+         # column vector, so it lines up with the (J, 1) shape of cparams
+         q_matrix = q_matrix[:, None]
+     if q_matrix.shape[0] != n_restrictions:
+         raise ValueError("r_matrix and q_matrix must have the same "
+                          "number of rows")
+     Rbq = cparams - q_matrix
+     if invcov is None:
+         cov_p = self.cov_params(r_matrix=r_matrix, cov_p=cov_p)
+         if np.isnan(cov_p).max():
+             raise ValueError("r_matrix performs f_test for using "
+                              "dimensions that are asymptotically "
+                              "non-normal")
+         invcov = np.linalg.inv(cov_p)
+
+     # l1-regularised fits use the NaN-aware product for the quadratic form
+     if (hasattr(self, 'mle_settings') and
+             self.mle_settings['optimizer'] in ['l1', 'l1_cvxopt_cp']):
+         F = nan_dot(nan_dot(Rbq.T, invcov), Rbq)
+     else:
+         F = np.dot(np.dot(Rbq.T, invcov), Rbq)
+
+     df_resid = getattr(self, 'df_resid_inference', self.df_resid)
+     if use_f:
+         F /= J
+         return ContrastResults(F=F, df_denom=df_resid,
+                                df_num=invcov.shape[0])
+     else:
+         return ContrastResults(chi2=F, df_denom=J, statistic=F,
+                                distribution='chi2', distargs=(J,))
+
+
+ def wald_test_terms(self, skip_single=False, extra_constraints=None,
+                     combine_terms=None):
+     """
+     Compute a sequence of Wald tests for terms over multiple columns
+
+     This computes joined Wald tests for the hypothesis that all
+     coefficients corresponding to a `term` are zero.
+     `Terms` are defined by the underlying formula or by string matching.
+
+     Parameters
+     ----------
+     skip_single : boolean
+         If true, then terms that consist only of a single column and,
+         therefore, refer only to a single parameter are skipped.
+         If false, then all terms are included.
+     extra_constraints : list, optional
+         Not tested yet. The entries are appended to the list of
+         constraints and unpacked as ``(name, constraint)`` pairs.
+     combine_terms : None or list of strings
+         Each string in this list is matched to the name of the terms or
+         the name of the exogenous variables. All columns whose name
+         includes that string are combined in one joint test.
+
+     Returns
+     -------
+     test_result : result instance
+         The result instance contains `table` which is a pandas DataFrame
+         with the test results: test statistic, pvalue and degrees of
+         freedom of the constraint (plus the denominator degrees of
+         freedom when ``use_t`` is set).
+
+     Examples
+     --------
+     >>> res_ols = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)",
+                       data).fit()
+     >>> res_ols.wald_test_terms()
+
+                                       F                P>F  df constraint  df denom
+     Intercept                        279.754525  2.37985521351e-22              1        51
+     C(Duration, Sum)                   5.367071    0.0245738436636              1        51
+     C(Weight, Sum)                    12.432445  3.99943118767e-05              2        51
+     C(Duration, Sum):C(Weight, Sum)    0.176002      0.83912310946              2        51
+
+     >>> res_poi = Poisson.from_formula("Days ~ C(Weight) * C(Duration)",
+                                        data).fit(cov_type='HC0')
+     >>> wt = res_poi.wald_test_terms(skip_single=False,
+                                      combine_terms=['Duration', 'Weight'])
+     >>> print(wt)
+                                 chi2             P>chi2  df constraint
+     Intercept              15.695625  7.43960374424e-05              1
+     C(Weight)              16.132616  0.000313940174705              2
+     C(Duration)             1.009147     0.315107378931              1
+     C(Weight):C(Duration)   0.216694     0.897315972824              2
+     Duration               11.187849     0.010752286833              3
+     Weight                 30.263368  4.32586407145e-06              4
+     """
+     # lazy import
+     from collections import defaultdict
+
+     result = self
+     if extra_constraints is None:
+         extra_constraints = []
+     if combine_terms is None:
+         combine_terms = []
+     design_info = getattr(result.model.data.orig_exog, 'design_info', None)
+
+     # Each constraint is a block of rows of the identity matrix, i.e. it
+     # selects the parameters that are jointly tested against zero.
+     identity = np.eye(len(result.params))
+     if design_info is not None:
+         term_constraints = [
+             (term.name(), identity[design_info.slice(term)])
+             for term in design_info.terms]
+     else:
+         # No formula info: fall back to one constraint per exog column.
+         # atleast_2d keeps it a (1, k) matrix so that shape[0] is the
+         # number of restrictions (1) and not the number of parameters.
+         term_constraints = [
+             (name, np.atleast_2d(identity[col]))
+             for col, name in enumerate(result.model.exog_names)]
+
+     constraints = []
+     combined = defaultdict(list)
+     for name, constraint_matrix in term_constraints:
+         # collect the rows for every requested combined test
+         for cname in combine_terms:
+             if cname in name:
+                 combined[cname].append(constraint_matrix)
+
+         if skip_single and constraint_matrix.shape[0] == 1:
+             continue
+
+         constraints.append((name, constraint_matrix))
+
+     combined_constraints = [(cname, np.vstack(combined[cname]))
+                             for cname in combine_terms]
+
+     use_t = result.use_t
+     distribution = ['chi2', 'F'][use_t]
+
+     all_constraints = constraints + combined_constraints + extra_constraints
+     res_wald = []
+     index = []
+     for name, constraint in all_constraints:
+         wt = result.wald_test(constraint)
+         row = [wt.statistic.item(), wt.pvalue, constraint.shape[0]]
+         if use_t:
+             row.append(wt.df_denom)
+         res_wald.append(row)
+         index.append(name)
+
+     # distribution neutral names
+     col_names = ['statistic', 'pvalue', 'df_constraint']
+     if use_t:
+         col_names.append('df_denom')
+     # TODO: maybe move DataFrame creation to results class
+     from pandas import DataFrame
+     table = DataFrame(res_wald, index=index, columns=col_names)
+     res = WaldTestResults(None, distribution, None, table=table)
+     # TODO: remove temp again, added for testing
+     res.temp = all_constraints
+     return res
+
+
+ def conf_int(self, alpha=.05, cols=None, method='default'):
+     """
+     Returns the confidence interval of the fitted parameters.
+
+     Parameters
+     ----------
+     alpha : float, optional
+         The significance level for the confidence interval.
+         ie., The default `alpha` = .05 returns a 95% confidence interval.
+     cols : array-like, optional
+         Index of the parameters for which intervals are returned. All
+         parameters are used when it is None.
+     method : string
+         Not Implemented Yet; the argument is accepted but not referenced
+         by the body of this method.
+
+     Returns
+     --------
+     conf_int : array
+         Each row contains [lower, upper] limits of the confidence interval
+         for the corresponding parameter. The first column contains all
+         lower, the second column contains all upper limits.
+
+     Examples
+     --------
+     >>> import statsmodels.api as sm
+     >>> data = sm.datasets.longley.load()
+     >>> data.exog = sm.add_constant(data.exog)
+     >>> results = sm.OLS(data.endog, data.exog).fit()
+     >>> results.conf_int()
+     array([[-5496529.48322745, -1467987.78596704],
+            [    -177.02903529,      207.15277984],
+            [      -0.1115811 ,        0.03994274],
+            [      -3.12506664,       -0.91539297],
+            [      -1.5179487 ,       -0.54850503],
+            [      -0.56251721,        0.460309  ],
+            [     798.7875153 ,     2859.51541392]])
+     >>> results.conf_int(cols=(2,3))
+     array([[-0.1115811 ,  0.03994274],
+            [-3.12506664, -0.91539297]])
+
+     Notes
+     -----
+     The critical value comes from the Student t distribution (with
+     ``df_resid_inference`` if present, else ``df_resid``, degrees of
+     freedom) when ``self.use_t`` is true, and from the standard normal
+     distribution otherwise. Models that wish to use a different
+     distribution should overwrite this method.
+     """
+     std_err = self.bse
+
+     upper_tail = 1 - alpha / 2
+     if self.use_t:
+         dof = getattr(self, 'df_resid_inference', self.df_resid)
+         crit = stats.t.ppf(upper_tail, dof)
+     else:
+         crit = stats.norm.ppf(upper_tail)
+
+     estimates = self.params
+     if cols is not None:
+         # restrict both estimates and standard errors to the selection
+         cols = np.asarray(cols)
+         estimates = estimates[cols]
+         std_err = std_err[cols]
+
+     margin = crit * std_err
+     lower = estimates - margin
+     upper = estimates + margin
+     return np.asarray(lzip(lower, upper))
+
+ def save(self, fname, remove_data=False):
+     '''
+     Pickle this instance to `fname`.
+
+     Parameters
+     ----------
+     fname : string or filehandle
+         fname can be a string to a file path or filename, or a filehandle.
+     remove_data : bool
+         If False (default), then the instance is pickled without changes.
+         If True, ``self.remove_data()`` is called before pickling, which
+         sets arrays with length nobs to None. In some cases not all
+         arrays will be set to None.
+
+     Notes
+     -----
+     If remove_data is true and the model result does not implement a
+     remove_data method then this will raise an exception.
+     '''
+     import statsmodels.iolib.smpickle as smpickle
+
+     # strip the data first, so that the pickle is written without it
+     if remove_data:
+         self.remove_data()
+
+     smpickle.save_pickle(self, fname)
+
+ @classmethod
+ def load(cls, fname):
+     '''
+     Load a pickled instance (class method).
+
+     Parameters
+     ----------
+     fname : string or filehandle
+         fname can be a string to a file path or filename, or a filehandle.
+
+     Returns
+     -------
+     unpickled instance
+
+     Notes
+     -----
+     NOTE(review): this unpickles whatever `fname` contains; unpickling can
+     execute arbitrary code, so only load files from a trusted source.
+     '''
+     import statsmodels.iolib.smpickle as smpickle
+
+     unpickled = smpickle.load_pickle(fname)
+     return unpickled
+
+ def remove_data(self):
+     '''remove data arrays, all nobs arrays from result and model
+
+     This reduces the size of the instance, so it can be pickled with less
+     memory. Currently tested for use with predict from an unpickled
+     results and model instance.
+
+     .. warning:: Since data and some intermediate results have been removed
+        calculating new statistics that require them will raise exceptions.
+        The exception will occur the first time an attribute is accessed
+        that has been set to None.
+
+     Not fully tested for time series models, tsa, and might delete too much
+     for prediction or not all that would be possible.
+
+     The attributes to wipe are listed in ``self._data_attr`` and
+     ``self.model._data_attr``; cached values are listed in the optional
+     ``data_in_cache`` attribute. These lists could be changed before
+     calling remove_data. The ``data_in_cache`` list itself is left
+     unmodified by this method.
+     '''
+     # `reduce` is a builtin only on Python 2; functools has it on both
+     from functools import reduce
+
+     def wipe(obj, att):
+         # walk the dotted path down to the object owning the last name
+         path = att.split('.')
+         leaf = path.pop(-1)
+         try:
+             owner = reduce(getattr, [obj] + path)
+             if hasattr(owner, leaf):
+                 setattr(owner, leaf, None)
+         except AttributeError:
+             # an intermediate attribute is missing: nothing to remove
+             pass
+
+     model_attr = ['model.' + i for i in self.model._data_attr]
+     for att in self._data_attr + model_attr:
+         wipe(self, att)
+
+     # Copy before extending: `+=` on the list returned by getattr would
+     # grow the shared (possibly class-level) list on every call.
+     data_in_cache = list(getattr(self, 'data_in_cache', []))
+     data_in_cache += ['fittedvalues', 'resid', 'wresid']
+     for key in data_in_cache:
+         try:
+             self._cache[key] = None
+         except (AttributeError, KeyError):
+             pass
+
+def lzip(*args, **kwargs):
+    """Return the result of ``zip(*args, **kwargs)`` as a list."""
+    zipped = zip(*args, **kwargs)
+    return list(zipped)
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/glm/family.py b/release/python/0.8.0/crankshaft/crankshaft/regression/glm/family.py
new file mode 100644
index 0000000..bad22c1
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/glm/family.py
@@ -0,0 +1,1845 @@
+'''
+The one parameter exponential family distributions used by GLM.
+'''
+# TODO: quasi, quasibinomial, quasipoisson
+# see http://www.biostat.jhsph.edu/~qli/biostatistics_r_doc/library/stats/html/family.html
+# for comparison to R, and McCullagh and Nelder
+
+import numpy as np
+from scipy import special
+import links as L
+import varfuncs as V
+FLOAT_EPS = np.finfo(float).eps
+
+
+class Family(object):
+    """
+    The parent class for one-parameter exponential families.
+
+    Parameters
+    ----------
+    link : a link function class
+        Called without arguments to build the link instance.
+        See the individual families for available links.
+    variance : a variance function
+        Measures the variance as a function of the mean probabilities.
+        See the individual families for the default variance function.
+
+    See Also
+    --------
+    :ref:`links`
+
+    """
+    # TODO: change these class attributes, use valid somewhere...
+    valid = [-np.inf, np.inf]
+
+    # link classes accepted by the `link` setter; empty in the base class
+    links = []
+
+    def _setlink(self, link):
+        """
+        Helper method to set the link for a family.
+
+        Raises a TypeError if `link` is not an ``L.Link`` instance and a
+        ValueError if it is not an instance of one of the classes listed
+        in ``self.links``. The link is stored only after both checks
+        pass, so a rejected link never replaces the current one.
+
+        See glm.GLM for a list of appropriate links for each family but note
+        that not all of these are currently available.
+        """
+        # TODO: change the links class attribute in the families to hold
+        # meaningful information instead of a list of links instances
+        if not isinstance(link, L.Link):
+            raise TypeError("The input should be a valid Link object.")
+        if hasattr(self, "links"):
+            # any() copes with an empty `links` list, for which max()
+            # would raise an unrelated "empty sequence" ValueError
+            validlink = any(isinstance(link, cls) for cls in self.links)
+            if not validlink:
+                errmsg = "Invalid link for family, should be in %s. (got %s)"
+                raise ValueError(errmsg % (repr(self.links), link))
+        self._link = link
+
+    def _getlink(self):
+        """
+        Helper method to get the link for a family.
+        """
+        return self._link
+
+    # link property for each family is a pointer to link instance
+    link = property(_getlink, _setlink, doc="Link function for family")
+
+    def __init__(self, link, variance):
+        self.link = link()
+        self.variance = variance
+
+    def starting_mu(self, y):
+        r"""
+        Starting value for mu in the IRLS algorithm.
+
+        Parameters
+        ----------
+        y : array
+            The untransformed response variable.
+
+        Returns
+        -------
+        mu_0 : array
+            The first guess on the transformed response variable.
+
+        Notes
+        -----
+        .. math::
+
+           \mu_0 = (Y + \overline{Y})/2
+
+        Only the Binomial family takes a different initial value.
+        """
+        return (y + y.mean())/2.
+
+    def weights(self, mu):
+        r"""
+        Weights for IRLS steps
+
+        Parameters
+        ----------
+        mu : array-like
+            The transformed mean response variable in the exponential family
+
+        Returns
+        -------
+        w : array
+            The weights for the IRLS steps
+
+        Notes
+        -----
+        .. math::
+
+           w = 1 / (g'(\mu)^2 * Var(\mu))
+        """
+        return 1. / (self.link.deriv(mu)**2 * self.variance(mu))
+
+    def deviance(self, endog, mu, freq_weights=1., scale=1.):
+        r"""
+        The deviance function evaluated at (endog,mu,freq_weights,scale).
+
+        Deviance is usually defined as twice the loglikelihood ratio.
+        The base class does not implement it.
+
+        Parameters
+        ----------
+        endog : array-like
+            The endogenous response variable
+        mu : array-like
+            The inverse of the link function at the linear predicted values.
+        freq_weights : array-like
+            1d array of frequency weights. The default is 1.
+        scale : float, optional
+            An optional scale argument. The default is 1.
+
+        Returns
+        -------
+        Deviance : array
+            The value of deviance function defined below.
+
+        Raises
+        ------
+        NotImplementedError
+            Always, in this base class.
+
+        Notes
+        -----
+        Deviance is defined
+
+        .. math::
+
+           D = \sum_i (2 * freq\_weights_i * llf(Y_i, Y_i) - 2 *
+           llf(Y_i, \mu_i)) / scale
+
+        where y is the endogenous variable. The deviance functions are
+        analytically defined for each family.
+        """
+        raise NotImplementedError
+
+    def resid_dev(self, endog, mu, freq_weights=1., scale=1.):
+        """
+        The deviance residuals
+
+        Parameters
+        ----------
+        endog : array
+            The endogenous response variable
+        mu : array
+            The inverse of the link function at the linear predicted values.
+        freq_weights : array-like
+            1d array of frequency weights. The default is 1.
+        scale : float, optional
+            An optional argument to divide the residuals by scale. The default
+            is 1.
+
+        Returns
+        -------
+        Deviance residuals.
+
+        Raises
+        ------
+        NotImplementedError
+            Always, in this base class.
+
+        Notes
+        -----
+        The deviance residuals are defined for each family.
+        """
+        raise NotImplementedError
+
+    def fitted(self, lin_pred):
+        """
+        Fitted values based on linear predictors lin_pred.
+
+        Parameters
+        -----------
+        lin_pred : array
+            Values of the linear predictor of the model.
+            dot(X,beta) in a classical linear model.
+
+        Returns
+        --------
+        mu : array
+            The mean response variables given by the inverse of the link
+            function.
+        """
+        fits = self.link.inverse(lin_pred)
+        return fits
+
+    def predict(self, mu):
+        """
+        Linear predictors based on given mu values.
+
+        Parameters
+        ----------
+        mu : array
+            The mean response variables
+
+        Returns
+        -------
+        lin_pred : array
+            Linear predictors based on the mean response variables.  The value
+            of the link function at the given mu.
+        """
+        return self.link(mu)
+
+    def loglike(self, endog, mu, freq_weights=1., scale=1.):
+        """
+        The log-likelihood function in terms of the fitted mean response.
+
+        Parameters
+        ----------
+        `endog` : array
+            Usually the endogenous response variable.
+        `mu` : array
+            Usually but not always the fitted mean response variable.
+        freq_weights : array-like
+            1d array of frequency weights. The default is 1.
+        scale : float
+            The scale parameter. The default is 1.
+
+        Returns
+        -------
+        llf : float
+            The value of the loglikelihood evaluated at
+            (endog,mu,freq_weights,scale) as defined below.
+
+        Raises
+        ------
+        NotImplementedError
+            Always, in this base class.
+
+        Notes
+        -----
+        This is defined for each family.  endog and mu are not restricted to
+        `endog` and `mu` respectively.  For instance, the deviance function
+        calls both loglike(endog,endog) and loglike(endog,mu) to get the
+        likelihood ratio.
+        """
+        raise NotImplementedError
+
+    def resid_anscombe(self, endog, mu):
+        """
+        The Anscombe residuals.
+
+        Raises NotImplementedError in this base class.
+
+        See also
+        --------
+        statsmodels.families.family.Family docstring and the `resid_anscombe`
+        for the individual families for more information.
+        """
+        raise NotImplementedError
+
+
+class Poisson(Family):
+    """
+    Poisson exponential family.
+
+    Parameters
+    ----------
+    link : a link class, optional
+        Called without arguments to build the link instance. The default
+        is the log link; the accepted links are log, identity, and sqrt.
+        See statsmodels.family.links for more information.
+
+    Attributes
+    ----------
+    Poisson.link : a link instance
+        The link function of the Poisson instance.
+    Poisson.variance : varfuncs instance
+        `variance` is an instance of
+        statsmodels.genmod.families.family.varfuncs.mu
+
+    See also
+    --------
+    statsmodels.genmod.families.family.Family
+    :ref:`links`
+
+    """
+
+    links = [L.log, L.identity, L.sqrt]
+    variance = V.mu
+    valid = [0, np.inf]
+    safe_links = [L.Log, ]
+
+    def __init__(self, link=L.log):
+        self.variance = Poisson.variance
+        self.link = link()
+
+    def _clean(self, x):
+        """
+        Clip `x` from below at machine epsilon so that it lies in (0,inf).
+
+        Notes
+        -----
+        The need for this function was discovered through usage and it is
+        possible that other families might need a check for validity of the
+        domain.
+        """
+        return np.clip(x, FLOAT_EPS, np.inf)
+
+    def resid_dev(self, endog, mu, scale=1.):
+        r"""Poisson deviance residual
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            Fitted mean response variable
+        scale : float, optional
+            The residuals are divided by scale. The default is 1.
+
+        Returns
+        -------
+        resid_dev : array
+            Deviance residuals as defined below
+
+        Notes
+        -----
+        .. math::
+
+           resid\_dev_i = sign(Y_i - \mu_i) * \sqrt{2 *
+                          (Y_i * \log(Y_i / \mu_i) - (Y_i - \mu_i))} / scale
+        """
+        # the ratio is clipped so the logarithm stays finite when endog is 0
+        ratio = self._clean(endog / mu)
+        sign = np.sign(endog - mu)
+        unit_deviance = 2 * (endog * np.log(ratio) - (endog - mu))
+        return sign * np.sqrt(unit_deviance) / scale
+
+    def deviance(self, endog, mu, freq_weights=1., scale=1.):
+        r'''
+        Poisson deviance function
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            Fitted mean response variable
+        freq_weights : array-like
+            1d array of frequency weights. The default is 1.
+        scale : float, optional
+            An optional scale argument. The default is 1.
+
+        Returns
+        -------
+        deviance : float
+            The deviance function at (endog,mu,freq_weights,scale) as defined
+            below.
+
+        Notes
+        -----
+        If a constant term is included it is defined as
+
+        .. math::
+
+           D = 2 * \sum_i (freq\_weights_i * Y_i * \log(Y_i / \mu_i))/ scale
+        '''
+        ratio = self._clean(endog / mu)
+        weighted_sum = np.sum(endog * freq_weights * np.log(ratio))
+        return 2 * weighted_sum / scale
+
+    def loglike(self, endog, mu, freq_weights=1., scale=1.):
+        r"""
+        The log-likelihood function in terms of the fitted mean response.
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            Fitted mean response variable
+        freq_weights : array-like
+            1d array of frequency weights. The default is 1.
+        scale : float, optional
+            Multiplies the summed log-likelihood. The default is 1.
+
+        Returns
+        -------
+        llf : float
+            The value of the loglikelihood function evaluated at
+            (endog,mu,freq_weights,scale) as defined below.
+
+        Notes
+        -----
+        .. math::
+
+           llf = scale * \sum_i freq\_weights_i * (Y_i * \log(\mu_i) - \mu_i -
+                 \ln \Gamma(Y_i + 1))
+        """
+        per_obs = endog * np.log(mu) - mu - special.gammaln(endog + 1)
+        total = np.sum(freq_weights * per_obs)
+        return scale * total
+
+    def resid_anscombe(self, endog, mu):
+        r"""
+        Anscombe residuals for the Poisson exponential family distribution
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            Fitted mean response variable
+
+        Returns
+        -------
+        resid_anscombe : array
+            The Anscombe residuals for the Poisson family defined below
+
+        Notes
+        -----
+        .. math::
+
+           resid\_anscombe_i = (3/2) * (Y_i^{2/3} - \mu_i^{2/3}) / \mu_i^{1/6}
+        """
+        two_thirds = 2 / 3.
+        numerator = endog**two_thirds - mu**two_thirds
+        return (3 / 2.) * numerator / mu**(1 / 6.)
+
+class QuasiPoisson(Family):
+    """
+    QuasiPoisson exponential family.
+
+    Parameters
+    ----------
+    link : a link class, optional
+        Called without arguments to build the link instance. The default
+        is the log link; the accepted links are log, identity, and sqrt.
+        See statsmodels.family.links for more information.
+
+    Attributes
+    ----------
+    QuasiPoisson.link : a link instance
+        The link function of the QuasiPoisson instance.
+    QuasiPoisson.variance : varfuncs instance
+        `variance` is an instance of
+        statsmodels.genmod.families.family.varfuncs.mu
+
+    See also
+    --------
+    statsmodels.genmod.families.family.Family
+    :ref:`links`
+
+    """
+
+    links = [L.log, L.identity, L.sqrt]
+    variance = V.mu
+    valid = [0, np.inf]
+    safe_links = [L.Log, ]
+
+    def __init__(self, link=L.log):
+        # use this class's own variance function rather than reaching
+        # into the sibling Poisson class (both are V.mu)
+        self.variance = QuasiPoisson.variance
+        self.link = link()
+
+    def _clean(self, x):
+        """
+        Clip `x` from below at machine epsilon so that it lies in (0,inf).
+
+        Notes
+        -----
+        The need for this function was discovered through usage and it is
+        possible that other families might need a check for validity of the
+        domain.
+        """
+        return np.clip(x, FLOAT_EPS, np.inf)
+
+    def resid_dev(self, endog, mu, scale=1.):
+        r"""QuasiPoisson deviance residual (same formula as Poisson)
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            Fitted mean response variable
+        scale : float, optional
+            An optional argument to divide the residuals by scale. The default
+            is 1.
+
+        Returns
+        -------
+        resid_dev : array
+            Deviance residuals as defined below
+
+        Notes
+        -----
+        .. math::
+
+           resid\_dev_i = sign(Y_i - \mu_i) * \sqrt{2 *
+                          (Y_i * \log(Y_i / \mu_i) - (Y_i - \mu_i))} / scale
+        """
+        endog_mu = self._clean(endog / mu)
+        return (np.sign(endog - mu) *
+                np.sqrt(2 * (endog * np.log(endog_mu) - (endog - mu))) / scale)
+
+    def deviance(self, endog, mu, freq_weights=1., scale=1.):
+        r'''
+        QuasiPoisson deviance function (same formula as Poisson)
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            Fitted mean response variable
+        freq_weights : array-like
+            1d array of frequency weights. The default is 1.
+        scale : float, optional
+            An optional scale argument. The default is 1.
+
+        Returns
+        -------
+        deviance : float
+            The deviance function at (endog,mu,freq_weights,scale) as defined
+            below.
+
+        Notes
+        -----
+        If a constant term is included it is defined as
+
+        .. math::
+
+           D = 2 * \sum_i (freq\_weights_i * Y_i * \log(Y_i / \mu_i))/ scale
+        '''
+        endog_mu = self._clean(endog / mu)
+        return 2 * np.sum(endog * freq_weights * np.log(endog_mu)) / scale
+
+    def loglike(self, endog, mu, freq_weights=1., scale=1.):
+        r"""
+        The log-likelihood function in terms of the fitted mean response.
+
+        Not applicable for QuasiPoisson: all arguments are ignored.
+
+        Returns
+        -------
+        llf : float
+            Always ``np.nan``.
+        """
+        return np.nan
+
+    def resid_anscombe(self, endog, mu):
+        r"""
+        Anscombe residuals, using the Poisson family formula
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            Fitted mean response variable
+
+        Returns
+        -------
+        resid_anscombe : array
+            The Anscombe residuals as defined below
+
+        Notes
+        -----
+        .. math::
+
+           resid\_anscombe_i = (3/2) * (Y_i^{2/3} - \mu_i^{2/3}) / \mu_i^{1/6}
+        """
+        return (3 / 2.) * (endog**(2/3.) - mu**(2 / 3.)) / mu**(1 / 6.)
+
+class Gaussian(Family):
+    """
+    Gaussian exponential family distribution.
+
+    Parameters
+    ----------
+    link : a link class, optional
+        Called without arguments to build the link instance. The default
+        is the identity link; the accepted links are log, identity, and
+        inverse. See statsmodels.family.links for more information.
+
+    Attributes
+    ----------
+    Gaussian.link : a link instance
+        The link function of the Gaussian instance
+    Gaussian.variance : varfunc instance
+        `variance` is an instance of statsmodels.family.varfuncs.constant
+
+    See also
+    --------
+    statsmodels.genmod.families.family.Family
+    :ref:`links`
+
+    """
+
+    links = [L.log, L.identity, L.inverse_power]
+    variance = V.constant
+    safe_links = links
+
+    def __init__(self, link=L.identity):
+        self.variance = Gaussian.variance
+        self.link = link()
+
+    def resid_dev(self, endog, mu, scale=1.):
+        r"""
+        Gaussian deviance residuals
+
+        Parameters
+        -----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            Fitted mean response variable
+        scale : float, optional
+            The residuals are divided by scale. The default is 1.
+
+        Returns
+        -------
+        resid_dev : array
+            Deviance residuals as defined below
+
+        Notes
+        --------
+        .. math::
+
+           resid\_dev_i = (Y_i - \mu_i) / \sqrt{Var(\mu_i)} / scale
+        """
+        std_dev = np.sqrt(self.variance(mu))
+        return (endog - mu) / std_dev / scale
+
+    def deviance(self, endog, mu, freq_weights=1., scale=1.):
+        r"""
+        Gaussian deviance function
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            Fitted mean response variable
+        freq_weights : array-like
+            1d array of frequency weights. The default is 1.
+        scale : float, optional
+            An optional scale argument. The default is 1.
+
+        Returns
+        -------
+        deviance : float
+            The deviance function at (endog,mu,freq_weights,scale)
+            as defined below.
+
+        Notes
+        --------
+        .. math::
+
+           D = \sum_i freq\_weights_i * (Y_i - \mu_i)^2 / scale
+        """
+        weighted_sq_err = freq_weights * (endog - mu)**2
+        return np.sum(weighted_sq_err) / scale
+
+    def loglike(self, endog, mu, freq_weights=1., scale=1.):
+        r"""
+        The log-likelihood in terms of the fitted mean response.
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            Fitted mean response variable
+        freq_weights : array-like
+            1d array of frequency weights. The default is 1. Only the
+            non-identity branch below uses it.
+        scale : float, optional
+            Scales the loglikelihood function. The default is 1. Only the
+            non-identity branch below uses it.
+
+        Returns
+        -------
+        llf : float
+            The value of the loglikelihood function evaluated at
+            (endog,mu,freq_weights,scale) as defined below.
+
+        Notes
+        -----
+        If the link is the identity link function (a power link with
+        power 1) then the loglikelihood function is the same as the
+        classical OLS model.
+
+        .. math::
+
+           llf = -nobs / 2 * (\log(SSR) + (1 + \log(2 \pi / nobs)))
+
+        where
+
+        .. math::
+
+           SSR = \sum_i (Y_i - g^{-1}(\mu_i))^2
+
+        If the link is not the identity link then the loglikelihood
+        function is defined as
+
+        .. math::
+
+           llf = \sum_i freq\_weights_i * ((Y_i * \mu_i - \mu_i^2 / 2) / scale-
+                 Y^2 / (2 * scale) - (1/2) * \log(2 * \pi * scale))
+        """
+        is_identity = (isinstance(self.link, L.Power) and
+                       self.link.power == 1)
+        if not is_identity:
+            per_obs = ((endog * mu - mu**2/2)/scale -
+                       endog**2/(2 * scale) - .5*np.log(2 * np.pi * scale))
+            return np.sum(freq_weights * per_obs)
+
+        # This is just the loglikelihood for classical OLS
+        half_nobs = endog.shape[0] / 2.
+        ssr = np.sum((endog-self.fitted(mu))**2, axis=0)
+        return -np.log(ssr) * half_nobs - (1+np.log(np.pi/half_nobs))*half_nobs
+
+    def resid_anscombe(self, endog, mu):
+        r"""
+        The Anscombe residuals for the Gaussian exponential family distribution
+
+        Parameters
+        ----------
+        endog : array
+            Endogenous response variable
+        mu : array
+            Fitted mean response variable
+
+        Returns
+        -------
+        resid_anscombe : array
+            The Anscombe residuals for the Gaussian family defined below
+
+        Notes
+        --------
+        .. math::
+
+           resid\_anscombe_i = Y_i - \mu_i
+        """
+        residual = endog - mu
+        return residual
+
+
+class Gamma(Family):
+ """
+ Gamma exponential family distribution.
+
+ Parameters
+ ----------
+ link : a link instance, optional
+ The default link for the Gamma family is the inverse link.
+ Available links are log, identity, and inverse.
+ See statsmodels.family.links for more information.
+
+ Attributes
+ ----------
+ Gamma.link : a link instance
+ The link function of the Gamma instance
+ Gamma.variance : varfunc instance
+ `variance` is an instance of statsmodels.family.varfuncs.mu_squared
+
+ See also
+ --------
+ statsmodels.genmod.families.family.Family
+ :ref:`links`
+
+ """
+
+ links = [L.log, L.identity, L.inverse_power]
+ variance = V.mu_squared
+ safe_links = [L.Log, ]
+
+ def __init__(self, link=L.inverse_power):
+ self.variance = Gamma.variance
+ self.link = link()
+
+ def _clean(self, x):
+ """
+ Helper function to trim the data so that is in (0,inf)
+
+ Notes
+ -----
+ The need for this function was discovered through usage and its
+ possible that other families might need a check for validity of the
+ domain.
+ """
+ return np.clip(x, FLOAT_EPS, np.inf)
+
+ def deviance(self, endog, mu, freq_weights=1., scale=1.):
+ r"""
+ Gamma deviance function
+
+ Parameters
+ -----------
+ endog : array-like
+ Endogenous response variable
+ mu : array-like
+ Fitted mean response variable
+ freq_weights : array-like
+ 1d array of frequency weights. The default is 1.
+ scale : float, optional
+ An optional scale argument. The default is 1.
+
+ Returns
+ -------
+ deviance : float
+ Deviance function as defined below
+
+ Notes
+ -----
+ .. math::
+
+ D = 2 * \sum_i freq\_weights_i * ((Y_i - \mu_i)/\mu_i - \log(Y_i /
+ \mu_i))
+ """
+ endog_mu = self._clean(endog/mu)
+ return 2*np.sum(freq_weights*((endog-mu)/mu-np.log(endog_mu)))
+
+ def resid_dev(self, endog, mu, scale=1.):
+ r"""
+ Gamma deviance residuals
+
+ Parameters
+ -----------
+ endog : array-like
+ Endogenous response variable
+ mu : array-like
+ Fitted mean response variable
+ scale : float, optional
+ An optional argument to divide the residuals by scale. The default
+ is 1.
+
+ Returns
+ -------
+ resid_dev : array
+ Deviance residuals as defined below
+
+ Notes
+ -----
+ .. math::
+
+ resid\_dev_i = sign(Y_i - \mu_i) \sqrt{-2 *
+ (-(Y_i - \mu_i) / \mu_i + \log(Y_i / \mu_i))}
+ """
+ endog_mu = self._clean(endog / mu)
+ return np.sign(endog - mu) * np.sqrt(-2 * (-(endog - mu)/mu +
+ np.log(endog_mu)))
+
+    def loglike(self, endog, mu, freq_weights=1., scale=1.):
+        r"""
+        Log-likelihood of the Gamma family at the fitted mean response.
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            Fitted mean response variable
+        freq_weights : array-like
+            1d array of frequency weights. The default is 1.
+        scale : float, optional
+            Scale value used in the formula below. The default is 1.
+
+        Returns
+        -------
+        llf : float
+            The value of the loglikelihood function evaluated at
+            (endog,mu,freq_weights,scale) as defined below.
+
+        Notes
+        -----
+        .. math::
+
+            llf = -1 / scale * \sum_i freq\_weights_i * (Y_i / \mu_i +
+            \log(\mu_i) + (scale - 1) * \log(Y_i) + \log(scale) +
+            scale * \ln \Gamma(1 / scale))
+        """
+        per_obs = (endog/mu + np.log(mu) + (scale - 1) * np.log(endog) +
+                   np.log(scale) + scale * special.gammaln(1./scale))
+        return - 1./scale * np.sum(per_obs * freq_weights)
+
+ # in Stata scale is set to equal 1 for reporting llf
+ # in R it's the dispersion, though there is a loss of precision vs.
+ # our results due to an assumed difference in implementation
+
+    def resid_anscombe(self, endog, mu):
+        r"""
+        Anscombe residuals of the Gamma exponential family distribution.
+
+        Parameters
+        ----------
+        endog : array
+            Endogenous response variable
+        mu : array
+            Fitted mean response variable
+
+        Returns
+        -------
+        resid_anscombe : array
+            Cube-root based residuals for the Gamma family, defined below
+
+        Notes
+        -----
+        .. math::
+
+            resid\_anscombe_i = 3 * (Y_i^{1/3} - \mu_i^{1/3}) / \mu_i^{1/3}
+        """
+        return (endog**(1/3.) - mu**(1/3.)) * 3 / mu**(1/3.)
+
+
+class Binomial(Family):
+    """
+    Binomial exponential family distribution.
+
+    Parameters
+    ----------
+    link : a link class, optional
+        Called with no arguments to build the link. The default is logit;
+        `links` also lists probit, cauchy, log, cloglog and identity.
+        See statsmodels.family.links for more information.
+
+    Attributes
+    ----------
+    Binomial.link : a link instance
+        The link function of the Binomial instance
+    Binomial.variance : varfunc instance
+        Built as V.Binomial(n=1) by the constructor
+
+    See also
+    --------
+    statsmodels.genmod.families.family.Family
+    :ref:`links`
+
+    Notes
+    -----
+    endog is either binary or a 2d (successes, failures) array; see
+    `initialize`.
+    """
+
+    links = [L.logit, L.probit, L.cauchy, L.log, L.cloglog, L.identity]
+    variance = V.binary  # not used below; instances build V.Binomial(n)
+
+    # Other safe links, e.g. cloglog and probit are subclasses
+    safe_links = [L.Logit, L.CDFLink]
+
+    def __init__(self, link=L.logit):  # , n=1.):
+        # TODO: a constant n > 1 *should* work too, provided freq_weights
+        # equals n
+        self.n = 1
+        # n may be overwritten by initialize, but the variance function is
+        # always built from this value since endog is assumed/forced to (0,1)
+        self.variance = V.Binomial(n=self.n)
+        self.link = link()
+
+    def starting_mu(self, y):
+        r"""
+        Starting values of mu for the IRLS algorithm for the Binomial family,
+        computed as :math:`\mu_0 = (Y_i + 0.5)/2`.
+        """
+        return (.5 + y)/2
+
+    def initialize(self, endog, freq_weights):
+        '''
+        Initialize the response variable.
+
+        Parameters
+        ----------
+        endog : array
+            Endogenous response variable
+        freq_weights : array
+            Frequency weights; not used by this method.
+
+        Returns
+        -------
+        (endog, n) where, for binary `endog`, the input is returned with a
+        vector of ones. If `endog` is a 2d array it is read as
+        (successes, failures): successes/(successes + failures) is returned
+        together with n = successes + failures, which is also stored on self.
+        '''
+        # freq_weights does not touch the variance function: it keeps the n
+        # it was given in __init__.
+        if endog.ndim <= 1 or endog.shape[1] <= 1:
+            # binary (or single column) input: one trial per row
+            return endog, np.ones(endog.shape[0])
+        successes = endog[:, 0]
+        # remember the per-row number of trials for deviance/loglike below
+        self.n = endog.sum(1)
+        return successes*1./self.n, self.n
+
+    def deviance(self, endog, mu, freq_weights=1, scale=1., axis=None):
+        r'''
+        Deviance function for either Bernoulli or Binomial data.
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response (already a proportion for binomial counts).
+        mu : array
+            Fitted mean response variable
+        freq_weights : array-like
+            1d array of frequency weights. The default is 1.
+        scale : float, optional
+            Accepted for interface compatibility; not used. The default is 1.
+        axis : int, optional
+            Axis handed to np.sum. The default, None, sums every element.
+
+        Returns
+        -------
+        deviance : float
+            The deviance function as defined below
+
+        Notes
+        -----
+        If the endogenous variable is binary:
+
+        .. math::
+
+            D = -2 * \sum_i freq\_weights_i * (I_{1,i} * \log(\mu_i) +
+            I_{0,i} * \log(1 - \mu_i))
+
+        where :math:`I_{1,i}` is 1 if :math:`Y_i = 1` and 0 otherwise, and
+        :math:`I_{0,i} = 1 - I_{1,i}`.
+
+        If the model is binomial:
+
+        .. math::
+
+            D = 2 * \sum_i n_i * freq\_weights_i * (Y_i * \log(Y_i / \mu_i) +
+            (1 - Y_i) * \log((1 - Y_i) / (1 - \mu_i)))
+
+        where :math:`Y_i` and :math:`n` are as defined in Binomial.initialize.
+        '''
+        if np.shape(self.n) == () and self.n == 1:
+            hit = np.equal(endog, 1)
+            terms = (hit * np.log(mu + 1e-200) +
+                     (1 - hit) * np.log(1 - mu + 1e-200)) * freq_weights
+            return -2 * np.sum(terms, axis=axis)
+
+        # counts: endog holds proportions and self.n the number of trials
+        terms = (endog * np.log(endog/mu + 1e-200) +
+                 (1 - endog) * np.log((1 - endog) / (1 - mu) + 1e-200))
+        return 2 * np.sum(self.n * freq_weights * terms, axis=axis)
+
+    def resid_dev(self, endog, mu, scale=1.):
+        r"""
+        Binomial deviance residuals
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            Fitted mean response variable
+        scale : float, optional
+            An optional argument to divide the residuals by scale. The default
+            is 1.
+
+        Returns
+        -------
+        resid_dev : array
+            Deviance residuals as defined below
+
+        Notes
+        -----
+        If the endogenous variable is binary:
+
+        .. math::
+
+            resid\_dev_i = sign(Y_i - \mu_i) * \sqrt{-2 *
+            \log(I_{1,i} * \mu_i + I_{0,i} * (1 - \mu_i))} / scale
+
+        where :math:`I_{1,i}` is an indicator function that evaluates to 1 if
+        :math:`Y_i = 1`. and :math:`I_{0,i}` is an indicator function that
+        evaluates to 1 if :math:`Y_i = 0`.
+
+        If the endogenous variable is binomial:
+
+        .. math::
+
+            resid\_dev_i = sign(Y_i - \mu_i) \sqrt{2 * n_i *
+            (Y_i * \log(Y_i / \mu_i) + (1 - Y_i) *
+            \log((1 - Y_i)/(1 - \mu_i)))} / scale
+
+        where :math:`Y_i` and :math:`n` are as defined in Binomial.initialize.
+        """
+
+        mu = self.link._clean(mu)
+        direction = np.sign(endog - mu)
+        if np.shape(self.n) == () and self.n == 1:
+            hit = np.equal(endog, 1)
+            likelihood = hit * mu + (1 - hit) * (1 - mu)
+            return direction * np.sqrt(-2 * np.log(likelihood)) / scale
+
+        # counts: same per-observation term as in `deviance`, scaled by n
+        unit_dev = (endog * np.log(endog/mu + 1e-200) +
+                    (1 - endog) * np.log((1 - endog)/(1 - mu) + 1e-200))
+        return direction * np.sqrt(2 * self.n * unit_dev) / scale
+
+    def loglike(self, endog, mu, freq_weights=1, scale=1.):
+        r"""
+        The log-likelihood function in terms of the fitted mean response.
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            Fitted mean response variable
+        freq_weights : array-like
+            1d array of frequency weights. The default is 1.
+        scale : float, optional
+            Multiplies the summed log-likelihood. The default is 1.
+
+        Returns
+        -------
+        llf : float
+            The value of the loglikelihood function evaluated at
+            (endog,mu,freq_weights,scale) as defined below.
+
+        Notes
+        -----
+        If the endogenous variable is binary:
+
+        .. math::
+
+            llf = scale * \sum_i (y_i * \log(\mu_i/(1-\mu_i)) + \log(1-\mu_i)) *
+            freq\_weights_i
+
+        If the endogenous variable is binomial:
+
+        .. math::
+
+            llf = scale * \sum_i freq\_weights_i * (\ln \Gamma(n_i + 1) -
+            \ln \Gamma(y_i + 1) - \ln \Gamma(n_i - y_i + 1) + y_i *
+            \log(\mu_i / (1 - \mu_i)) + n_i * \log(1 - \mu_i))
+
+        where :math:`y_i = Y_i * n_i` with :math:`Y_i` and :math:`n_i` as
+        defined in Binomial initialize. This simply makes :math:`y_i` the
+        original number of successes.
+        """
+
+        if np.shape(self.n) == () and self.n == 1:
+            terms = endog * np.log(mu/(1 - mu) + 1e-200) + np.log(1 - mu)
+            return scale * np.sum(terms * freq_weights)
+        successes = endog * self.n  # convert proportions back to counts
+        terms = (special.gammaln(self.n + 1) -
+                 special.gammaln(successes + 1) -
+                 special.gammaln(self.n - successes + 1) +
+                 successes * np.log(mu/(1 - mu)) +
+                 self.n * np.log(1 - mu))
+        return scale * np.sum(terms * freq_weights)
+
+    def resid_anscombe(self, endog, mu):
+        '''
+        The Anscombe residuals
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            Fitted mean response variable
+
+        Returns
+        -------
+        resid_anscombe : array
+            The Anscombe residuals as defined below.
+
+        Notes
+        -----
+        sqrt(n)*(cox_snell(endog)-cox_snell(mu))/(mu**(1/6.)*(1-mu)**(1/6.))
+
+        where cox_snell is defined as
+        cox_snell(x) = betainc(2/3., 2/3., x)*beta(2/3.,2/3.)
+        where betainc is the incomplete beta function and beta the beta function
+
+        The name 'cox_snell' is idiosyncratic and is simply used for
+        convenience following the approach suggested in Cox and Snell (1968).
+        Further note that
+        cox_snell(x) = x**(2/3.)/(2/3.)*hyp2f1(2/3.,1/3.,5/3.,x)
+        where hyp2f1 is the hypergeometric 2f1 function. The Anscombe
+        residuals are sometimes defined in the literature using the
+        hyp2f1 formulation. Both betainc and hyp2f1 can be found in scipy.
+
+        References
+        ----------
+        Anscombe, FJ. (1953) "Contribution to the discussion of H. Hotelling's
+        paper." Journal of the Royal Statistical Society B. 15, 229-30.
+
+        Cox, DR and Snell, EJ. (1968) "A General Definition of Residuals."
+        Journal of the Royal Statistical Society B. 30, 248-75.
+
+        '''
+        def cox_snell(x):
+            return special.betainc(2/3., 2/3., x) * special.beta(2/3., 2/3.)
+        spread = mu**(1/6.) * (1 - mu)**(1/6.)
+        return np.sqrt(self.n) * ((cox_snell(endog) - cox_snell(mu)) / spread)
+
+
+class InverseGaussian(Family):
+    """
+    InverseGaussian exponential family.
+
+    Parameters
+    ----------
+    link : a link class, optional
+        Called with no arguments to build the link. The default link for
+        the inverse Gaussian family is the inverse squared link.
+        Available links are inverse_squared, inverse, log, and identity.
+        See statsmodels.family.links for more information.
+
+    Attributes
+    ----------
+    InverseGaussian.link : a link instance
+        The link function of the inverse Gaussian instance
+    InverseGaussian.variance : varfunc instance
+        `variance` is an instance of statsmodels.family.varfuncs.mu_cubed
+
+    See also
+    --------
+    statsmodels.genmod.families.family.Family
+    :ref:`links`
+
+    Notes
+    -----
+    The inverse Gaussian distribution is sometimes referred to in the
+    literature as the Wald distribution.
+
+    """
+
+    links = [L.inverse_squared, L.inverse_power, L.identity, L.log]
+    variance = V.mu_cubed
+    safe_links = [L.inverse_squared, L.Log, ]
+
+    def __init__(self, link=L.inverse_squared):
+        self.variance = InverseGaussian.variance
+        self.link = link()
+
+    def resid_dev(self, endog, mu, scale=1.):
+        r"""
+        Returns the deviance residuals for the inverse Gaussian family.
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            Fitted mean response variable
+        scale : float, optional
+            An optional argument to divide the residuals by scale. The default
+            is 1.
+
+        Returns
+        -------
+        resid_dev : array
+            Deviance residuals as defined below
+
+        Notes
+        -----
+        .. math::
+
+            resid\_dev_i = sign(Y_i - \mu_i) *
+            \sqrt {(Y_i - \mu_i)^2 / (Y_i * \mu_i^2)} / scale
+        """
+        gap = endog - mu
+        # root of the per-observation deviance, signed by the direction of gap
+        return np.sign(gap) * np.sqrt(gap**2/(endog*mu**2))/scale
+
+    def deviance(self, endog, mu, freq_weights=1., scale=1.):
+        r"""
+        Inverse Gaussian deviance function
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            Fitted mean response variable
+        freq_weights : array-like
+            1d array of frequency weights. The default is 1.
+        scale : float, optional
+            Divisor applied to the summed deviance. The default is 1.
+
+        Returns
+        -------
+        deviance : float
+            Deviance function as defined below
+
+        Notes
+        -----
+        .. math::
+
+            D = \sum_i freq\_weights_i * ((Y_i - \mu_i)^2 / (Y_i *\mu_i^2)) /
+            scale
+        """
+        return np.sum(freq_weights * (endog - mu)**2 / (endog * mu**2)) / scale
+
+    def loglike(self, endog, mu, freq_weights=1., scale=1.):
+        r"""
+        The log-likelihood function in terms of the fitted mean response.
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            Fitted mean response variable
+        freq_weights : array-like
+            1d array of frequency weights. The default is 1.
+        scale : float, optional
+            The default is 1.
+
+        Returns
+        -------
+        llf : float
+            The value of the loglikelihood function evaluated at
+            (endog,mu,freq_weights,scale) as defined below.
+
+        Notes
+        -----
+        .. math::
+
+            llf = -1/2 * \sum_i freq\_weights_i * ((Y_i - \mu_i)^2 / (Y_i *
+            \mu_i^2 * scale) + \log(scale * Y_i^3) + \log(2 * \pi))
+        """
+        terms = ((endog - mu)**2/(endog * mu**2 * scale) +
+                 np.log(scale * endog**3) + np.log(2 * np.pi))
+        return -.5 * np.sum(terms * freq_weights)
+
+    def resid_anscombe(self, endog, mu):
+        r"""
+        The Anscombe residuals for the inverse Gaussian distribution
+
+        Parameters
+        ----------
+        endog : array
+            Endogenous response variable
+        mu : array
+            Fitted mean response variable
+
+        Returns
+        -------
+        resid_anscombe : array
+            Anscombe residuals for the inverse Gaussian family, defined below
+
+        Notes
+        -----
+        .. math::
+
+            resid\_anscombe_i = \log(Y_i / \mu_i) / \sqrt{\mu_i}
+        """
+        log_ratio = np.log(endog / mu)
+        return log_ratio / np.sqrt(mu)
+
+
+class NegativeBinomial(Family):
+    """
+    Negative Binomial exponential family.
+
+    Parameters
+    ----------
+    link : a link class, optional
+        The default link for the negative binomial family is the log link.
+        Available links are log, cloglog, identity, nbinom and power.
+        See statsmodels.family.links for more information.
+    alpha : float, optional
+        The ancillary parameter for the negative binomial distribution.
+        For now `alpha` is assumed to be nonstochastic. The default value
+        is 1. Permissible values are usually assumed to be between .01 and 2.
+
+    Attributes
+    ----------
+    NegativeBinomial.link : a link instance
+        The link function of the negative binomial instance
+    NegativeBinomial.variance : varfunc instance
+        Built as V.NegativeBinomial(alpha=alpha) by the constructor
+
+    See also
+    --------
+    statsmodels.genmod.families.family.Family
+    :ref:`links`
+
+    Notes
+    -----
+    Power link functions are not yet supported.
+
+    """
+    links = [L.log, L.cloglog, L.identity, L.nbinom, L.Power]
+    # TODO: add the ability to use the power links with an if test
+    # similar to below
+    variance = V.nbinom
+    safe_links = [L.Log, ]
+
+    def __init__(self, link=L.log, alpha=1.):
+        self.alpha = 1. * alpha  # make it at least float
+        self.variance = V.NegativeBinomial(alpha=self.alpha)
+        # `link` is a class here, so isinstance() against it never matched
+        # and alpha was silently dropped; test the class relationship instead.
+        if isinstance(link, type) and issubclass(link, L.NegativeBinomial):
+            self.link = link(alpha=self.alpha)
+        else:
+            self.link = link()
+
+    def _clean(self, x):
+        """
+        Helper function to trim the data so that it is in [FLOAT_EPS, inf)
+
+        Notes
+        -----
+        The need for this function was discovered through usage and it's
+        possible that other families might need a check for validity of the
+        domain.
+        """
+        return np.clip(x, FLOAT_EPS, np.inf)
+
+    def deviance(self, endog, mu, freq_weights=1., scale=1.):
+        r"""
+        Returns the value of the deviance function.
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            Fitted mean response variable
+        freq_weights : array-like
+            1d array of frequency weights. The default is 1.
+        scale : float, optional
+            Divisor applied to the summed deviance. The default is 1.
+
+        Returns
+        -------
+        deviance : float
+            Deviance function as defined below
+
+        Notes
+        -----
+        :math:`D = \sum_i freq\_weights_i * piecewise_i / scale` where:
+
+        If :math:`Y_{i} = 0`:
+
+        :math:`piecewise_i = 2* \log(1 + \alpha * \mu_i) / \alpha`
+
+        If :math:`Y_{i} > 0`:
+
+        :math:`piecewise_i = 2 * Y_i * \log(Y_i / \mu_i) - (2 / \alpha) *
+        (1 + \alpha * Y_i) * \ln((1 + \alpha * Y_i) / (1 + \alpha * \mu_i))`
+        """
+        iszero = np.equal(endog, 0)
+        notzero = 1 - iszero
+        endog_mu = self._clean(endog/mu)
+        tmp = iszero * 2 * np.log(1 + self.alpha * mu) / self.alpha
+        tmp += notzero * (2 * endog * np.log(endog_mu) - 2 / self.alpha *
+                          (1 + self.alpha * endog) *
+                          np.log((1 + self.alpha * endog) /
+                                 (1 + self.alpha * mu)))
+        return np.sum(freq_weights * tmp) / scale
+
+    def resid_dev(self, endog, mu, scale=1.):
+        r"""
+        Negative Binomial Deviance Residual
+
+        Parameters
+        ----------
+        endog : array-like
+            `endog` is the response variable
+        mu : array-like
+            `mu` is the fitted value of the model
+        scale : float, optional
+            An optional argument to divide the residuals by scale. The default
+            is 1.
+
+        Returns
+        -------
+        resid_dev : array
+            The array of deviance residuals
+
+        Notes
+        -----
+        :math:`resid\_dev_i = sign(Y_i-\mu_i) * \sqrt{piecewise_i} / scale`
+
+        where :math:`piecewise_i` is defined as
+
+        If :math:`Y_i = 0`:
+
+        :math:`piecewise_i = 2 * \log(1 + \alpha * \mu_i)/ \alpha`
+
+        If :math:`Y_i > 0`:
+
+        :math:`piecewise_i = 2 * Y_i * \log(Y_i / \mu_i) - (2 / \alpha) *
+        (1 + \alpha * Y_i) * \log((1 + \alpha * Y_i) / (1 + \alpha * \mu_i))`
+        """
+        iszero = np.equal(endog, 0)
+        notzero = 1 - iszero
+        endog_mu = self._clean(endog / mu)
+        tmp = iszero * 2 * np.log(1 + self.alpha * mu) / self.alpha
+        tmp += notzero * (2 * endog * np.log(endog_mu) - 2 / self.alpha *
+                          (1 + self.alpha * endog) *
+                          np.log((1 + self.alpha * endog) /
+                                 (1 + self.alpha * mu)))
+        return np.sign(endog - mu) * np.sqrt(tmp) / scale
+
+    def loglike(self, endog, mu, freq_weights=1., scale=1.):
+        r"""
+        The log-likelihood function in terms of the fitted mean response.
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            The fitted mean response values
+        freq_weights : array-like
+            1d array of frequency weights. The default is 1.
+        scale : float
+            Accepted for interface compatibility; not used. The default is 1.
+
+        Returns
+        -------
+        llf : float
+            The value of the loglikelihood function evaluated at
+            (endog,mu,freq_weights,scale) as defined below.
+
+        Notes
+        -----
+        With :math:`\eta_i` the link applied to :math:`\mu_i`:
+
+        .. math::
+
+            llf = \sum_i freq\_weights_i * (Y_i * \log{(\alpha * e^{\eta_i} /
+            (1 + \alpha * e^{\eta_i}))} - \log{(1 + \alpha * e^{\eta_i})}/
+            \alpha + Constant)
+
+        where :math:`Constant` is defined as:
+
+        .. math::
+
+            Constant = \ln \Gamma{(Y_i + 1/ \alpha )} - \ln \Gamma(Y_i + 1) -
+            \ln \Gamma{(1/ \alpha )}
+        """
+        lin_pred = self.link(mu)  # the link set in __init__, not a private alias
+        constant = (special.gammaln(endog + 1 / self.alpha) -
+                    special.gammaln(endog+1)-special.gammaln(1/self.alpha))
+        exp_lin_pred = np.exp(lin_pred)
+        return np.sum((endog * np.log(self.alpha * exp_lin_pred /
+                                      (1 + self.alpha * exp_lin_pred)) -
+                       np.log(1 + self.alpha * exp_lin_pred) /
+                       self.alpha + constant) * freq_weights)
+
+    def resid_anscombe(self, endog, mu):
+        """
+        The Anscombe residuals for the negative binomial family
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            Fitted mean response variable
+
+        Returns
+        -------
+        resid_anscombe : array
+            The Anscombe residuals as defined below.
+
+        Notes
+        -----
+        `resid_anscombe` = (hyp2f1(-alpha*endog)-hyp2f1(-alpha*mu)+
+        1.5*(endog**(2/3.)-mu**(2/3.)))/(mu+alpha*mu**2)**(1/6.)
+
+        where hyp2f1 is the hypergeometric 2f1 function parameterized as
+        hyp2f1(x) = hyp2f1(2/3.,1/3.,5/3.,x)
+        """
+
+        hyp2f1 = lambda x: special.hyp2f1(2 / 3., 1 / 3., 5 / 3., x)
+        return ((hyp2f1(-self.alpha * endog) - hyp2f1(-self.alpha * mu) +
+                 1.5 * (endog**(2 / 3.) - mu**(2 / 3.))) /
+                (mu + self.alpha * mu**2)**(1 / 6.))
+
+
+class Tweedie(Family):
+    """
+    Tweedie family.
+
+    Parameters
+    ----------
+    link : a link class, optional
+        The default link for the Tweedie family is the log link when the
+        link_power is 0. Otherwise, the power link is default.
+        Available links are log and Power.
+    var_power : float, optional
+        The variance power.
+    link_power : float, optional
+        The link power.
+
+    Attributes
+    ----------
+    Tweedie.link : a link instance
+        The link function of the Tweedie instance
+    Tweedie.variance : varfunc instance
+        `variance` is an instance of statsmodels.family.varfuncs.Power
+    Tweedie.link_power : float
+        The power of the link function, or 0 if it's a log link.
+    Tweedie.var_power : float
+        The power of the variance function.
+
+    See also
+    --------
+    statsmodels.genmod.families.family.Family
+    :ref:`links`
+
+    Notes
+    -----
+    Loglikelihood function not implemented because of the complexity of
+    calculating an infinite series of summations. The variance power can be
+    estimated using the `estimate_tweedie_power` function that is part of the
+    `GLM` class.
+    """
+    links = [L.log, L.Power]
+    variance = V.Power
+    safe_links = [L.log, L.Power]
+
+    def __init__(self, link=None, var_power=1., link_power=0):
+        self.var_power = var_power
+        self.link_power = link_power
+        self.variance = V.Power(power=var_power * 1.)
+        if link_power != 0 and not ((link is L.Power) or (link is None)):
+            msg = 'link_power of {} not supported specified link'
+            msg = msg.format(link_power)
+            raise ValueError(msg)
+        if (link_power == 0) and ((link is None) or (link is L.Log)):
+            self.link = L.log()
+        elif link_power != 0:
+            self.link = L.Power(power=link_power * 1.)
+        else:
+            self.link = link()
+
+    def _clean(self, x):
+        """
+        Helper function to trim the data so that it is in [0, inf)
+
+        Notes
+        -----
+        The need for this function was discovered through usage and it's
+        possible that other families might need a check for validity of the
+        domain.
+        """
+        return np.clip(x, 0, np.inf)
+
+    def _unit_deviance(self, endog, mu):
+        """
+        Per-observation deviance term d_i shared by `deviance` and
+        `resid_dev`; the three branches follow the variance power p.
+        """
+        p = self.var_power
+        if p == 1:
+            # Swap exact zeros for 1 before the log so 0 * log(0) is never
+            # evaluated; np.where returns mu for those entries anyway.
+            safe_endog = np.where(endog == 0, 1., endog)
+            return np.where(endog == 0,
+                            mu,
+                            endog * np.log(safe_endog / mu) + (mu - endog))
+        if p == 2:
+            # clip endog from below at FLOAT_EPS before taking the log
+            endog1 = np.clip(endog, FLOAT_EPS, np.inf)
+            return ((endog - mu) / mu) - np.log(endog1 / mu)
+        # any other p: (1 - p) and (2 - p) are both nonzero here
+        return (endog ** (2 - p) / ((1 - p) * (2 - p)) -
+                endog * mu ** (1-p) / (1 - p) + mu ** (2 - p) / (2 - p))
+
+    def deviance(self, endog, mu, freq_weights=1., scale=1.):
+        r"""
+        Returns the value of the deviance function.
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            Fitted mean response variable
+        freq_weights : array-like
+            1d array of frequency weights. The default is 1.
+        scale : float, optional
+            Accepted for interface compatibility; not used. The default is 1.
+
+        Returns
+        -------
+        deviance : float
+            Deviance function as defined below
+
+        Notes
+        -----
+        When :math:`p = 1`,
+
+        .. math::
+
+            d_i = \mu
+
+        when :math:`endog = 0` and
+
+        .. math::
+
+            d_i = endog * \log(endog / \mu) + (\mu - endog)
+
+        otherwise.
+
+        When :math:`p = 2`,
+
+        .. math::
+
+            d_i = (endog - \mu) / \mu - \log(endog / \mu)
+
+        For all other p,
+
+        .. math::
+
+            d_i = endog ^{2 - p} / ((1 - p) * (2 - p)) -
+            endog * \mu ^{1 - p} / (1 - p) + \mu ^{2 - p} /
+            (2 - p)
+
+        Once :math:`d_i` is calculated, then calculate deviance as
+
+        .. math::
+
+            D = \sum{2 * freq\_weights * d_i}
+        """
+        return np.sum(2 * freq_weights * self._unit_deviance(endog, mu))
+
+    def resid_dev(self, endog, mu, scale=1.):
+        r"""
+        Tweedie Deviance Residual
+
+        Parameters
+        ----------
+        endog : array-like
+            `endog` is the response variable
+        mu : array-like
+            `mu` is the fitted value of the model
+        scale : float, optional
+            Divisor applied to the residuals. The default is 1.
+
+        Returns
+        -------
+        resid_dev : array
+            :math:`sign(endog - \mu) * \sqrt{2 * d_i} / scale`, with the unit
+            deviance :math:`d_i` defined below
+
+        Notes
+        -----
+        When :math:`p = 1`,
+
+        .. math::
+
+            d_i = \mu
+
+        when :math:`endog = 0` and
+
+        .. math::
+
+            d_i = endog * \log(endog / \mu) + (\mu - endog)
+
+        otherwise.
+
+        When :math:`p = 2`,
+
+        .. math::
+
+            d_i = (endog - \mu) / \mu - \log(endog / \mu)
+
+        For all other p,
+
+        .. math::
+
+            d_i = endog ^{2 - p} / ((1 - p) * (2 - p)) -
+            endog * \mu ^{1 - p} / (1 - p) + \mu ^{2 - p} /
+            (2 - p)
+        """
+        dev = self._unit_deviance(endog, mu)
+        return np.sign(endog - mu) * np.sqrt(2 * dev) / scale
+
+    def loglike(self, endog, mu, freq_weights=1., scale=1.):
+        r"""
+        The log-likelihood function in terms of the fitted mean response.
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            The fitted mean response values
+        freq_weights : array-like
+            1d array of frequency weights. The default is 1.
+        scale : float
+            The scale parameter. The default is 1.
+
+        Returns
+        -------
+        llf : float
+            Always np.nan; none of the arguments is used.
+
+        Notes
+        -----
+        This is not implemented because of the complexity of calculating an
+        infinite series of sums.
+
+        """
+        return np.nan
+
+    def resid_anscombe(self, endog, mu):
+        r"""
+        The Anscombe residuals for the Tweedie family
+
+        Parameters
+        ----------
+        endog : array-like
+            Endogenous response variable
+        mu : array-like
+            Fitted mean response variable
+
+        Returns
+        -------
+        resid_anscombe : array
+            The Anscombe residuals as defined below.
+
+        Notes
+        -----
+        When :math:`p = 3`, then
+
+        .. math::
+
+            resid\_anscombe_i = (\log(endog) - \log(\mu)) / \sqrt{\mu}
+
+        Otherwise,
+
+        .. math::
+
+            c = (3 - p) / 3
+
+        .. math::
+
+            resid\_anscombe_i = (1 / c) * (endog ^ c - \mu ^ c) / \mu ^{p / 6}
+        """
+        if self.var_power == 3:
+            return (np.log(endog) - np.log(mu)) / np.sqrt(mu)
+        else:
+            c = (3. - self.var_power) / 3.
+            return ((1. / c) * (endog ** c - mu ** c) /
+                    mu ** (self.var_power / 6.))
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/glm/glm.py b/release/python/0.8.0/crankshaft/crankshaft/regression/glm/glm.py
new file mode 100644
index 0000000..f2fc17d
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/glm/glm.py
@@ -0,0 +1,326 @@
+
+import numpy as np
+import numpy.linalg as la
+from pysal.spreg.utils import RegressionPropsY, spdot
+import pysal.spreg.user_output as USER
+from utils import cache_readonly
+from base import LikelihoodModelResults
+import family
+from iwls import iwls
+
+__all__ = ['GLM']
+
+class GLM(RegressionPropsY):
+    """
+    Generalised linear models. Can currently estimate Gaussian, Poisson and
+    Logistic regression coefficients. GLM object prepares model input and fit
+    method performs estimation which then returns a GLMResults object.
+
+    Parameters
+    ----------
+    y : array
+        n*1, dependent variable.
+    X : array
+        n*k, independent variable, excluding the constant.
+    family : family object
+        Model type, an instance from the family module such as
+        family.Gaussian() (default), family.Poisson() or family.Binomial().
+    constant : boolean
+        If True (default), X is passed through USER.check_constant.
+
+    Attributes
+    ----------
+    y : array
+        n*1, dependent variable.
+    X : array
+        n*k, independent variable, including constant.
+    family : family object
+        The family instance given to the constructor.
+    n : integer
+        Number of observations
+    k : integer
+        Number of columns of X
+    df_model : float
+        k-1, where k is the number of variables (including intercept)
+    df_resid : float
+        observations minus variables (n-k)
+    mean_y : float
+        Mean of y
+    std_y : float
+        Standard deviation of y
+    fit_params : dict
+        Parameters passed into fit method to define estimation
+        routine.
+    """
+    def __init__(self, y, X, family=family.Gaussian(), constant=True):
+        """
+        Validate the inputs and store them on the model.
+        """
+        # NOTE(review): the default family instance is created once, when
+        # this def is evaluated, and shared by every GLM that relies on it.
+        self.n = USER.check_arrays(y, X)
+        USER.check_y(y, self.n)
+        self.y = y
+        self.X = USER.check_constant(X) if constant else X
+        self.family = family
+        self.k = self.X.shape[1]
+        # filled in by fit() with the settings of the last estimation run
+        self.fit_params = {}
+
+    def fit(self, ini_betas=None, tol=1.0e-6, max_iter=200, solve='iwls'):
+        """
+        Fit the model and return a GLMResults object.
+
+        Parameters
+        ----------
+        ini_betas : array
+            k*1, initial coefficient values, including constant. Default is
+            None, which calculates initial values during estimation.
+        tol : float
+            Tolerance for estimation convergence.
+        max_iter : integer
+            Maximum number of iterations if convergence not achieved.
+        solve : string
+            'iwls' (iteratively (re)weighted least squares) is the only option.
+
+        Raises
+        ------
+        ValueError if `solve` names anything other than 'iwls'.
+        """
+        # Fail early and clearly instead of hitting unbound locals below.
+        if solve.lower() != 'iwls':
+            raise ValueError("unknown solve method %r; only 'iwls' is "
+                             "supported" % (solve,))
+        self.fit_params.update(ini_betas=ini_betas, tol=tol,
+                               max_iter=max_iter, solve=solve)
+        params, predy, w, n_iter = iwls(self.y, self.X, self.family,
+                ini_betas=ini_betas, tol=tol, max_iter=max_iter)
+        self.fit_params['n_iter'] = n_iter
+        return GLMResults(self, params.flatten(), predy, w)
+
+    @cache_readonly
+    def df_model(self):
+        return self.X.shape[1] - 1
+
+    @cache_readonly
+    def df_resid(self):
+        return self.n - self.df_model - 1
+
+class GLMResults(LikelihoodModelResults):
+    """
+    Results of estimated GLM and diagnostics.
+
+    Parameters
+    ----------
+    model : GLM object
+        Pointer to GLM object with estimation parameters.
+    params : array
+        k*1, estimated coefficients
+    mu : array
+        n*1, predicted y values.
+    w : array
+        n*1, final weight used for iwls
+
+    Attributes
+    ----------
+    model : GLM Object
+        Points to GLM object for which parameters have been
+        estimated.
+    y : array
+        n*1, dependent variable.
+    X : array
+        n*k, independent variable, including constant.
+    family : family object
+        The family instance of the fitted model
+    n : integer
+        Number of observations
+    k : integer
+        Number of independent variables
+    df_model : float
+        k-1, where k is the number of variables (including
+        intercept)
+    df_resid : float
+        observations minus variables (n-k)
+    fit_params : dict
+        parameters passed into fit method to define estimation
+        routine.
+    scale : float
+        sigma squared used for subsequent computations.
+    params : array
+        k*1, estimated beta coefficients
+    w : array
+        n*1, final weight values of x
+    mu : array
+        n*1, predicted value of y (i.e., fittedvalues)
+    cov_params : array
+        Variance covariance matrix (kxk) of betas which has been
+        appropriately scaled by sigma-squared
+    bse : array
+        k*1, standard errors of betas
+    pvalues : array
+        k*1, two-tailed pvalues of parameters
+    tvalues : array
+        k*1, the tvalues of the standard errors
+    null : array
+        n*1, predicted values of y for null model
+    deviance : float
+        value of the deviance function evaluated at params;
+        see family.py for distribution-specific deviance
+    null_deviance : float
+        value of the deviance function for the model fit with
+        a constant as the only regressor
+    llf : float
+        value of the loglikelihood function evaluated at params;
+        see family.py for distribution-specific loglikelihoods
+    llnull : float
+        value of log-likelihood function evaluated at null
+    aic : float
+        AIC; np.nan for the QuasiPoisson family
+    bic : float
+        BIC
+    D2 : float
+        percent deviance explained
+    adj_D2 : float
+        adjusted percent deviance explained
+    pseudoR2 : float
+        McFadden's pseudo R2 (coefficient of determination)
+    adj_pseudoR2 : float
+        adjusted McFadden's pseudo R2
+    resid_response : array
+        response residuals; defined as y-mu
+    resid_pearson : array
+        Pearson residuals; defined as (y-mu)/sqrt(VAR(mu))
+        where VAR is the distribution specific variance
+        function; see family.py and varfuncs.py for more information.
+    resid_working : array
+        Working residuals; the working residuals are defined as
+        resid_response/link'(mu); see links.py for the
+        derivatives of the link functions.
+
+    resid_anscombe : array
+        Anscombe residuals; see family.py for
+        distribution-specific Anscombe residuals.
+
+    resid_deviance : array
+        deviance residuals; see family.py for
+        distribution-specific deviance residuals.
+
+    pearson_chi2 : float
+        chi-Squared statistic is defined as the sum
+        of the squares of the Pearson residuals
+
+    normalized_cov_params : array
+        k*k, approximates [X.T*X]-1
+    """
+    def __init__(self, model, params, mu, w):
+        self.model = model
+        self.n = model.n
+        self.y = model.y.T.flatten()
+        self.X = model.X
+        self.k = model.k
+        self.family = model.family
+        self.fit_params = model.fit_params
+        self.params = params
+        self.w = w
+        self.mu = mu.flatten()
+        self._cache = {}
+
+    @cache_readonly
+    def df_model(self):
+        return self.model.df_model
+
+    @cache_readonly
+    def df_resid(self):
+        return self.model.df_resid
+
+    @cache_readonly
+    def normalized_cov_params(self):
+        return la.inv(spdot(self.w.T, self.w))
+
+    @cache_readonly
+    def resid_response(self):
+        return (self.y-self.mu)
+
+    @cache_readonly
+    def resid_pearson(self):
+        return ((self.y-self.mu) /
+                np.sqrt(self.family.variance(self.mu)))
+
+    @cache_readonly
+    def resid_working(self):
+        return (self.resid_response / self.family.link.deriv(self.mu))
+
+    @cache_readonly
+    def resid_anscombe(self):
+        return (self.family.resid_anscombe(self.y, self.mu))
+
+    @cache_readonly
+    def resid_deviance(self):
+        return (self.family.resid_dev(self.y, self.mu))
+
+    @cache_readonly
+    def pearson_chi2(self):
+        chisq = (self.y - self.mu)**2 / self.family.variance(self.mu)
+        return np.sum(chisq)
+
+    @cache_readonly
+    def null(self):
+        # intercept-only fit: a single column of ones, no constant added
+        y = np.reshape(self.y, (-1, 1))
+        X = np.ones((len(y), 1))
+        null_mod = GLM(y, X, family=self.family, constant=False)
+        return null_mod.fit().mu
+
+    @cache_readonly
+    def scale(self):
+        if isinstance(self.family, (family.Binomial, family.Poisson)):
+            return 1.
+        else:
+            return (((np.power(self.resid_response, 2) /
+                      self.family.variance(self.mu))).sum() /
+                    (self.df_resid))
+
+    @cache_readonly
+    def deviance(self):
+        return self.family.deviance(self.y, self.mu)
+
+    @cache_readonly
+    def null_deviance(self):
+        return self.family.deviance(self.y, self.null)
+
+    @cache_readonly
+    def llnull(self):
+        return self.family.loglike(self.y, self.null, scale=self.scale)
+
+    @cache_readonly
+    def llf(self):
+        return self.family.loglike(self.y, self.mu, scale=self.scale)
+
+    @cache_readonly
+    def aic(self):
+        if isinstance(self.family, family.QuasiPoisson):
+            return np.nan
+        else:
+            return -2 * self.llf + 2*(self.df_model+1)
+
+    @cache_readonly
+    def bic(self):
+        return (self.deviance -
+                (self.model.n - self.df_model - 1) *
+                np.log(self.model.n))
+
+    @cache_readonly
+    def D2(self):
+        return 1 - (self.deviance / self.null_deviance)
+
+    @cache_readonly
+    def adj_D2(self):
+        return 1.0 - (float(self.n) - 1.0)/(float(self.n) - float(self.k)) * (1.0-self.D2)
+
+    @cache_readonly
+    def pseudoR2(self):
+        return 1 - (self.llf/self.llnull)
+
+    @cache_readonly
+    def adj_pseudoR2(self):
+        return 1 - ((self.llf-self.k)/self.llnull)
+
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/glm/iwls.py b/release/python/0.8.0/crankshaft/crankshaft/regression/glm/iwls.py
new file mode 100644
index 0000000..3ea6747
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/glm/iwls.py
@@ -0,0 +1,84 @@
+import numpy as np
+import numpy.linalg as la
+from scipy import sparse as sp
+from scipy.sparse import linalg as spla
+from pysal.spreg.utils import spdot, spmultiply
+from family import Binomial, Poisson
+
+def _compute_betas(y, x):
+    """
+    Compute least-squares coefficients from the normal equations.
+
+    Forms ``x'x`` with ``spdot``, inverts it with ``numpy.linalg.inv``,
+    wraps the inverse in a CSR matrix and multiplies it by ``x'y``.
+    ``iwls`` calls this once per iteration with the weighted working
+    response and the weighted design matrix.
+
+    Methods: p189, Iteratively (Re)weighted Least Squares (IWLS),
+    Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002).
+    Geographically weighted regression: the analysis of spatially varying
+    relationships.
+    """
+    x_t = x.T
+    gram_inv = sp.csr_matrix(la.inv(spdot(x_t, x)))
+    cross = spdot(x_t, y, array_out=False)
+    return spdot(gram_inv, cross)
+
+def _compute_betas_gwr(y, x, wi):
+    """
+    Compute weighted least-squares coefficients for one set of weights.
+
+    ``x`` is multiplied elementwise by ``wi`` before transposing, so the
+    result is ``((x*wi)' x)^-1 (x*wi)' y``.
+
+    Returns
+    -------
+    betas : array
+        The coefficient estimates.
+    xtx_inv_xt : array
+        The matrix ``((x*wi)' x)^-1 (x*wi)'`` that maps ``y`` to ``betas``.
+
+    Methods: p189, Iteratively (Re)weighted Least Squares (IWLS),
+    Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002).
+    Geographically weighted regression: the analysis of spatially varying
+    relationships.
+    """
+    weighted_xt = (x * wi).T
+    gram_inv = la.inv(np.dot(weighted_xt, x))
+    projector = np.dot(gram_inv, weighted_xt)
+    return np.dot(projector, y), projector
+
+def iwls(y, x, family, offset=1.0, ini_betas=None, tol=1.0e-8, max_iter=200, wi=None):
+    """
+    Iteratively re-weighted least squares estimation routine
+
+    Parameters
+    ----------
+    y : array
+        Response values. For a Binomial family they are first passed
+        through ``family.link._clean``.
+    x : numpy.ndarray or other matrix type
+        Design matrix. When ``type(x)`` is not exactly ``numpy.ndarray``
+        the weights and working response are wrapped in CSR matrices.
+    family : family object
+        Supplies ``starting_mu``, ``predict``, ``weights``, ``fitted`` and
+        ``link.deriv``.
+    offset : scalar or array
+        Used only for a Poisson family: ``y`` is divided by it to build
+        the initial linear predictor, and the fitted ``mu`` is multiplied
+        by it on every iteration.
+    ini_betas : array, optional
+        Coefficients used for the first convergence comparison; a column
+        of zeros with one row per column of ``x`` when ``None``.
+    tol : float
+        Iteration stops once the convergence measure is not above ``tol``.
+    max_iter : int
+        Maximum number of iterations.
+    wi : array, optional
+        When given, coefficients are computed by ``_compute_betas_gwr``
+        with these weights; otherwise by ``_compute_betas``.
+
+    Returns
+    -------
+    tuple
+        ``(betas, mu, wx, n_iter)`` when ``wi`` is ``None``, otherwise
+        ``(betas, mu, v, w, z, xtx_inv_xt, n_iter)``. The returned ``w`` is
+        the square root of the family weights, and ``z``/``wx`` are the
+        values from the last iteration.
+
+    Notes
+    -----
+    If the loop body never runs (for example ``max_iter <= 0``), the names
+    ``wx``, ``w``, ``z`` and ``xtx_inv_xt`` are never assigned and the
+    return statement raises.
+    """
+    n_iter = 0
+    diff = 1.0e6
+    if ini_betas is None:
+        # Builtin ``float`` instead of the ``np.float`` alias, which was
+        # deprecated in NumPy 1.20 and removed in 1.24; the dtype is the same.
+        betas = np.zeros((x.shape[1], 1), float)
+    else:
+        betas = ini_betas
+    if isinstance(family, Binomial):
+        y = family.link._clean(y)
+    if isinstance(family, Poisson):
+        y_off = y/offset
+        y_off = family.starting_mu(y_off)
+        v = family.predict(y_off)
+        mu = family.starting_mu(y)
+    else:
+        mu = family.starting_mu(y)
+        v = family.predict(mu)
+
+    while diff > tol and n_iter < max_iter:
+        n_iter += 1
+        w = family.weights(mu)
+        # Working response for this iteration.
+        z = v + (family.link.deriv(mu)*(y-mu))
+        w = np.sqrt(w)
+        if type(x) != np.ndarray:
+            w = sp.csr_matrix(w)
+            z = sp.csr_matrix(z)
+        wx = spmultiply(x, w, array_out=False)
+        wz = spmultiply(z, w, array_out=False)
+        if wi is None:
+            n_betas = _compute_betas(wz, wx)
+        else:
+            n_betas, xtx_inv_xt = _compute_betas_gwr(wz, wx, wi)
+        v = spdot(x, n_betas)
+        mu = family.fitted(v)
+        if isinstance(family, Poisson):
+            mu = mu * offset
+        # NOTE(review): ``min`` means the loop stops as soon as ANY single
+        # coefficient changes by no more than ``tol``; ``max`` would require
+        # all of them to settle. Left unchanged because downstream expected
+        # values were generated with this criterion -- confirm intent.
+        diff = min(abs(n_betas-betas))
+        betas = n_betas
+
+    if wi is None:
+        return betas, mu, wx, n_iter
+    else:
+        return betas, mu, v, w, z, xtx_inv_xt, n_iter
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/glm/links.py b/release/python/0.8.0/crankshaft/crankshaft/regression/glm/links.py
new file mode 100644
index 0000000..f45724d
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/glm/links.py
@@ -0,0 +1,953 @@
+'''
+Defines the link functions to be used with GLM and GEE families.
+'''
+
+import numpy as np
+import scipy.stats
+FLOAT_EPS = np.finfo(float).eps
+
+
+class Link(object):
+    """
+    A generic link function for one-parameter exponential family.
+
+    `Link` does nothing by itself, but lays out the methods expected of any
+    subclass. Subclasses must override `__call__`, `inverse` and `deriv`;
+    the base versions raise `NotImplementedError`. `deriv2` and
+    `inverse_deriv` have generic implementations built on those methods.
+    """
+
+    def __call__(self, p):
+        """
+        Return the value of the link function. This is just a placeholder.
+
+        Parameters
+        ----------
+        p : array-like
+            Probabilities
+
+        Returns
+        -------
+        g(p) : array-like
+            The value of the link function g(p) = z
+
+        Raises
+        ------
+        NotImplementedError
+            Always, unless overridden by a subclass.
+        """
+        # Raise (rather than return) the exception so a missing override
+        # fails at the call site instead of handing the exception class
+        # back as if it were a numeric result.
+        raise NotImplementedError
+
+    def inverse(self, z):
+        """
+        Inverse of the link function. Just a placeholder.
+
+        Parameters
+        ----------
+        z : array-like
+            `z` is usually the linear predictor of the transformed variable
+            in the IRLS algorithm for GLM.
+
+        Returns
+        -------
+        g^(-1)(z) : array
+            The value of the inverse of the link function g^(-1)(z) = p
+
+        Raises
+        ------
+        NotImplementedError
+            Always, unless overridden by a subclass.
+        """
+        raise NotImplementedError
+
+    def deriv(self, p):
+        """
+        Derivative of the link function g'(p). Just a placeholder.
+
+        Parameters
+        ----------
+        p : array-like
+
+        Returns
+        -------
+        g'(p) : array
+            The value of the derivative of the link function g'(p)
+
+        Raises
+        ------
+        NotImplementedError
+            Always, unless overridden by a subclass.
+        """
+        raise NotImplementedError
+
+    def deriv2(self, p):
+        """Second derivative of the link function g''(p)
+
+        Implemented through complex-step numerical differentiation of
+        `deriv`, using numpy only. `deriv` must therefore act elementwise
+        and accept complex input.
+        """
+        p = np.asarray(p, dtype=float)
+        # Step proportional to |p| (floored at 0.1) times machine epsilon;
+        # the complex-step formula has no subtractive cancellation, so a
+        # step this small is safe.
+        step = np.finfo(float).eps * np.maximum(np.abs(p), 0.1)
+        return np.imag(self.deriv(p + 1j * step)) / step
+
+    def inverse_deriv(self, z):
+        """
+        Derivative of the inverse link function g^(-1)(z).
+
+        Notes
+        -----
+        This reference implementation gives the correct result but is
+        inefficient, so it can be overridden in subclasses.
+
+        Parameters
+        ----------
+        z : array-like
+            `z` is usually the linear predictor for a GLM or GEE model.
+
+        Returns
+        -------
+        g'^(-1)(z) : array
+            The value of the derivative of the inverse of the link function
+
+        """
+        return 1 / self.deriv(self.inverse(z))
+
+
+class Logit(Link):
+    """
+    The logit transform
+
+    Notes
+    -----
+    `__call__` and `deriv` use the private method `_clean` to trim p by
+    machine epsilon so that p is in (0,1)
+
+    Alias of Logit:
+    logit = Logit()
+    """
+
+    def _clean(self, p):
+        """
+        Clip logistic values to range (eps, 1-eps)
+
+        Parameters
+        -----------
+        p : array-like
+            Probabilities
+
+        Returns
+        --------
+        pclip : array
+            Clipped probabilities
+        """
+        return np.clip(p, FLOAT_EPS, 1. - FLOAT_EPS)
+
+    def __call__(self, p):
+        """
+        The logit transform
+
+        Parameters
+        ----------
+        p : array-like
+            Probabilities
+
+        Returns
+        -------
+        z : array
+            Logit transform of `p`
+
+        Notes
+        -----
+        g(p) = log(p / (1 - p))
+        """
+        p = self._clean(p)
+        return np.log(p / (1. - p))
+
+    def inverse(self, z):
+        """
+        Inverse of the logit transform
+
+        Parameters
+        ----------
+        z : array-like
+            The value of the logit transform at `p`
+
+        Returns
+        -------
+        p : array
+            Probabilities
+
+        Notes
+        -----
+        g^(-1)(z) = exp(z)/(1+exp(z)), evaluated as 1/(1+exp(-z))
+        """
+        z = np.asarray(z)
+        t = np.exp(-z)
+        return 1. / (1. + t)
+
+    def deriv(self, p):
+        """
+        Derivative of the logit transform
+
+        Parameters
+        ----------
+        p: array-like
+            Probabilities
+
+        Returns
+        -------
+        g'(p) : array
+            Value of the derivative of logit transform at `p`
+
+        Notes
+        -----
+        g'(p) = 1 / (p * (1 - p)), with `p` clipped by `_clean` first
+        """
+        p = self._clean(p)
+        return 1. / (p * (1 - p))
+
+    def inverse_deriv(self, z):
+        """
+        Derivative of the inverse of the logit transform
+
+        Parameters
+        ----------
+        z : array-like
+            `z` is usually the linear predictor for a GLM or GEE model.
+
+        Returns
+        -------
+        g'^(-1)(z) : array
+            The value of the derivative of the inverse of the logit function
+
+        Notes
+        -----
+        exp(z)/(1+exp(z))**2 is an even function of z, so it is evaluated
+        at -|z|. exp() then never overflows; the direct formula gives
+        inf/inf = nan for large positive z, where the true value is ~0.
+        """
+        t = np.exp(-np.abs(z))
+        return t/(1 + t)**2
+
+    def deriv2(self, p):
+        """
+        Second derivative of the logit function.
+
+        Parameters
+        ----------
+        p : array-like
+            probabilities
+
+        Returns
+        -------
+        g''(z) : array
+            The value of the second derivative of the logit function
+
+        Notes
+        -----
+        g''(p) = (2*p - 1) / (p * (1 - p))**2. Unlike `deriv`, `p` is not
+        clipped here.
+        """
+        v = p * (1 - p)
+        return (2*p - 1) / v**2
+
+class logit(Logit):
+    """Lower-case alias of `Logit`; adds no behavior of its own."""
+    pass
+
+
+class Power(Link):
+    """
+    The power transform
+
+    Parameters
+    ----------
+    power : float
+        The exponent of the power transform
+
+    Notes
+    -----
+    Aliases of Power:
+    inverse = Power(power=-1)
+    sqrt = Power(power=.5)
+    inverse_squared = Power(power=-2.)
+    identity = Power(power=1.)
+    """
+
+    def __init__(self, power=1.):
+        self.power = power
+
+    def __call__(self, p):
+        """
+        Power transform link function
+
+        Parameters
+        ----------
+        p : array-like
+            Mean parameters
+
+        Returns
+        -------
+        z : array-like
+            Power transform of `p`
+
+        Notes
+        -----
+        g(p) = p**self.power
+        """
+
+        z = np.power(p, self.power)
+        return z
+
+    def inverse(self, z):
+        """
+        Inverse of the power transform link function
+
+        Parameters
+        ----------
+        `z` : array-like
+            Value of the transformed mean parameters at `p`
+
+        Returns
+        -------
+        `p` : array
+            Mean parameters
+
+        Notes
+        -----
+        g^(-1)(`z`) = `z`**(1/`power`)
+        """
+
+        p = np.power(z, 1. / self.power)
+        return p
+
+    def deriv(self, p):
+        """
+        Derivative of the power transform
+
+        Parameters
+        ----------
+        p : array-like
+            Mean parameters
+
+        Returns
+        --------
+        g'(p) : array
+            Derivative of power transform of `p`
+
+        Notes
+        -----
+        g'(`p`) = `power` * `p`**(`power` - 1)
+        """
+        return self.power * np.power(p, self.power - 1)
+
+    def deriv2(self, p):
+        """
+        Second derivative of the power transform
+
+        Parameters
+        ----------
+        p : array-like
+            Mean parameters
+
+        Returns
+        --------
+        g''(p) : array
+            Second derivative of the power transform of `p`
+
+        Notes
+        -----
+        g''(`p`) = `power` * (`power` - 1) * `p`**(`power` - 2)
+        """
+        return self.power * (self.power - 1) * np.power(p, self.power - 2)
+
+    def inverse_deriv(self, z):
+        """
+        Derivative of the inverse of the power transform
+
+        Parameters
+        ----------
+        z : array-like
+            `z` is usually the linear predictor for a GLM or GEE model.
+
+        Returns
+        -------
+        g^(-1)'(z) : array
+            The value of the derivative of the inverse of the power transform
+            function
+
+        Notes
+        -----
+        g^(-1)'(`z`) = `z`**((1 - `power`)/`power`) / `power`
+        """
+        # ``1.`` forces true division: with an integer ``power`` under
+        # Python 2, ``(1 - power)/power`` would floor (e.g. -1 instead of
+        # -0.5 for power=2) and give the wrong exponent.
+        exponent = (1. - self.power) / self.power
+        return np.power(z, exponent) / self.power
+
+
+class inverse_power(Power):
+    """
+    The inverse transform
+
+    Notes
+    -----
+    g(p) = 1/p
+
+    Equivalent to Power(power=-1.)
+    """
+    def __init__(self):
+        # Fix the exponent at -1; the constructor takes no arguments.
+        super(inverse_power, self).__init__(power=-1.)
+
+
+class sqrt(Power):
+    """
+    The square-root transform
+
+    Notes
+    -----
+    g(`p`) = sqrt(`p`)
+
+    Equivalent to Power(power=.5)
+    """
+    def __init__(self):
+        # Fix the exponent at 0.5; the constructor takes no arguments.
+        super(sqrt, self).__init__(power=.5)
+
+
+class inverse_squared(Power):
+    """
+    The inverse squared transform
+
+    Notes
+    -----
+    g(`p`) = 1/(`p`\ \*\*2)
+
+    Equivalent to Power(power=-2.)
+    """
+    def __init__(self):
+        # Fix the exponent at -2; the constructor takes no arguments.
+        super(inverse_squared, self).__init__(power=-2.)
+
+
+class identity(Power):
+    """
+    The identity transform
+
+    Notes
+    -----
+    g(`p`) = `p`
+
+    Equivalent to Power(power=1.)
+    """
+    def __init__(self):
+        # Fix the exponent at 1; the constructor takes no arguments.
+        super(identity, self).__init__(power=1.)
+
+
+class Log(Link):
+    """
+    The log transform
+
+    Notes
+    -----
+    The link and its first two derivatives pass their argument through the
+    private method `_clean`, which clips it from below at machine epsilon
+    (there is no upper bound). `log` is an alias of `Log`.
+    """
+
+    def _clean(self, x):
+        """Clip `x` to the range [FLOAT_EPS, inf)."""
+        lower, upper = FLOAT_EPS, np.inf
+        return np.clip(x, lower, upper)
+
+    def __call__(self, p, **extra):
+        """
+        Log transform link function
+
+        Parameters
+        ----------
+        p : array-like
+            Mean parameters
+        **extra
+            Accepted and ignored.
+
+        Returns
+        -------
+        z : array
+            log(p), with `p` clipped by `_clean`
+
+        Notes
+        -----
+        g(p) = log(p)
+        """
+        return np.log(self._clean(p))
+
+    def inverse(self, z):
+        """
+        Inverse of log transform link function
+
+        Parameters
+        ----------
+        z : array
+            The value of the link function at `p`
+
+        Returns
+        -------
+        p : array
+            The mean parameters corresponding to `z`
+
+        Notes
+        -----
+        g^{-1}(z) = exp(z)
+        """
+        mean = np.exp(z)
+        return mean
+
+    def deriv(self, p):
+        """
+        Derivative of log transform link function
+
+        Parameters
+        ----------
+        p : array-like
+            Mean parameters
+
+        Returns
+        -------
+        g'(p) : array
+            derivative of log transform of `p`
+
+        Notes
+        -----
+        g'(p) = 1/p, with `p` clipped by `_clean`
+        """
+        return 1. / self._clean(p)
+
+    def deriv2(self, p):
+        """
+        Second derivative of the log transform link function
+
+        Parameters
+        ----------
+        p : array-like
+            Mean parameters
+
+        Returns
+        -------
+        g''(p) : array
+            Second derivative of log transform of `p`
+
+        Notes
+        -----
+        g''(p) = -1/p^2, with `p` clipped by `_clean`
+        """
+        clipped = self._clean(p)
+        return -1. / clipped**2
+
+    def inverse_deriv(self, z):
+        """
+        Derivative of the inverse of the log transform link function
+
+        Parameters
+        ----------
+        z : array
+            The value of the link function at `p`
+
+        Returns
+        -------
+        g^(-1)'(z) : array
+            The value of the derivative of the inverse of the log function,
+            the exponential function
+        """
+        slope = np.exp(z)
+        return slope
+
+
+class log(Log):
+    """
+    The log transform
+
+    Notes
+    -----
+    log is an alias of Log and adds no behavior of its own.
+    """
+    pass
+
+
+# TODO: the CDFLink is untested
+class CDFLink(Logit):
+    """
+    Use the CDF of a scipy.stats distribution as the inverse link
+
+    CDFLink is a subclass of Logit in order to use its _clean method
+    for the link and its derivative.
+
+    Parameters
+    ----------
+    dbn : scipy.stats distribution
+        Default is dbn=scipy.stats.norm
+
+    Notes
+    -----
+    The CDF link is untested.
+    """
+
+    def __init__(self, dbn=scipy.stats.norm):
+        self.dbn = dbn
+
+    def __call__(self, p):
+        """
+        CDF link function
+
+        Parameters
+        ----------
+        p : array-like
+            Mean parameters
+
+        Returns
+        -------
+        z : array
+            (ppf) inverse of CDF transform of p
+
+        Notes
+        -----
+        g(`p`) = `dbn`.ppf(`p`), with `p` clipped by `_clean`
+        """
+        p = self._clean(p)
+        return self.dbn.ppf(p)
+
+    def inverse(self, z):
+        """
+        The inverse of the CDF link
+
+        Parameters
+        ----------
+        z : array-like
+            The value of the link function at `p`
+
+        Returns
+        -------
+        p : array
+            Mean probabilities. The value of the inverse of CDF link of `z`
+
+        Notes
+        -----
+        g^(-1)(`z`) = `dbn`.cdf(`z`)
+        """
+        return self.dbn.cdf(z)
+
+    def deriv(self, p):
+        """
+        Derivative of CDF link
+
+        Parameters
+        ----------
+        p : array-like
+            mean parameters
+
+        Returns
+        -------
+        g'(p) : array
+            The derivative of CDF transform at `p`
+
+        Notes
+        -----
+        g'(`p`) = 1./ `dbn`.pdf(`dbn`.ppf(`p`)), with `p` clipped by `_clean`
+        """
+        p = self._clean(p)
+        return 1. / self.dbn.pdf(self.dbn.ppf(p))
+
+    def deriv2(self, p):
+        """
+        Second derivative of the link function g''(p)
+
+        implemented through numerical differentiation; requires
+        statsmodels to be importable at call time.
+        """
+        from statsmodels.tools.numdiff import approx_fprime
+        p = np.atleast_1d(p)
+        # Note: special function for norm.ppf does not support complex
+        return np.diag(approx_fprime(p, self.deriv, centered=True))
+
+    def inverse_deriv(self, z):
+        """
+        Derivative of the inverse of the CDF transformation link function
+
+        Parameters
+        ----------
+        z : array
+            The value of the link function at `p`
+
+        Returns
+        -------
+        g^(-1)'(z) : array
+            The value of the derivative of the inverse of the CDF link,
+            i.e. `dbn`.pdf(`z`)
+
+        Notes
+        -----
+        The inverse link is `dbn`.cdf, whose derivative is `dbn`.pdf.
+        Evaluating the pdf directly avoids the cdf -> clip -> ppf round
+        trip of 1/deriv(inverse(z)), which saturates once cdf(z) is within
+        machine epsilon of 0 or 1 and then returns the density at the
+        clipping point instead of at `z`.
+        """
+        return self.dbn.pdf(z)
+
+
+class probit(CDFLink):
+    """
+    The probit (standard normal CDF) transform
+
+    Notes
+    --------
+    g(p) = scipy.stats.norm.ppf(p)
+
+    probit is an alias of CDFLink and relies on its default
+    dbn=scipy.stats.norm.
+    """
+    pass
+
+
+class cauchy(CDFLink):
+    """
+    The Cauchy (standard Cauchy CDF) transform
+
+    Notes
+    -----
+    g(p) = scipy.stats.cauchy.ppf(p)
+
+    cauchy is CDFLink with dbn fixed to scipy.stats.cauchy
+    """
+
+    def __init__(self):
+        # The distribution is fixed; the constructor takes no arguments.
+        super(cauchy, self).__init__(dbn=scipy.stats.cauchy)
+
+    def deriv2(self, p):
+        """
+        Second derivative of the Cauchy link function, in closed form.
+
+        Parameters
+        ----------
+        p: array-like
+            Probabilities
+
+        Returns
+        -------
+        g''(p) : array
+            Value of the second derivative of Cauchy link function at `p`
+
+        Notes
+        -----
+        g''(p) = 2 * pi**2 * sin(a) / cos(a)**3 with a = pi * (p - 0.5)
+        """
+        angle = np.pi * (p - 0.5)
+        return 2 * np.pi**2 * np.sin(angle) / np.cos(angle)**3
+
+class CLogLog(Logit):
+    """
+    The complementary log-log transform
+
+    CLogLog inherits from Logit in order to have access to its _clean method
+    for the link and its derivatives.
+
+    Notes
+    -----
+    CLogLog is untested.
+    """
+    def __call__(self, p):
+        """
+        C-Log-Log transform link function
+
+        Parameters
+        ----------
+        p : array
+            Mean parameters
+
+        Returns
+        -------
+        z : array
+            The CLogLog transform of `p`
+
+        Notes
+        -----
+        g(p) = log(-log(1-p)), with `p` clipped by `_clean`
+        """
+        clipped = self._clean(p)
+        neg_log_complement = -np.log(1 - clipped)
+        return np.log(neg_log_complement)
+
+    def inverse(self, z):
+        """
+        Inverse of C-Log-Log transform link function
+
+        Parameters
+        ----------
+        z : array-like
+            The value of the CLogLog link function at `p`
+
+        Returns
+        -------
+        p : array
+            Mean parameters
+
+        Notes
+        -----
+        g^(-1)(`z`) = 1-exp(-exp(`z`))
+        """
+        survival = np.exp(-np.exp(z))
+        return 1 - survival
+
+    def deriv(self, p):
+        """
+        Derivative of C-Log-Log transform link function
+
+        Parameters
+        ----------
+        p : array-like
+            Mean parameters
+
+        Returns
+        -------
+        g'(p) : array
+            The derivative of the CLogLog transform link function
+
+        Notes
+        -----
+        g'(p) = 1 / ((p-1)*log(1-p)), with `p` clipped by `_clean`
+        """
+        clipped = self._clean(p)
+        denom = (clipped - 1) * (np.log(1 - clipped))
+        return 1. / denom
+
+    def deriv2(self, p):
+        """
+        Second derivative of the C-Log-Log link function
+
+        Parameters
+        ----------
+        p : array-like
+            Mean parameters
+
+        Returns
+        -------
+        g''(p) : array
+            The second derivative of the CLogLog link function
+
+        Notes
+        -----
+        With L = log(1-p): g''(p) = -(1 + 1/L) / ((1-p)**2 * L),
+        with `p` clipped by `_clean`
+        """
+        clipped = self._clean(p)
+        log_complement = np.log(1 - clipped)
+        leading = -1 / ((1 - clipped)**2 * log_complement)
+        return leading * (1 + 1 / log_complement)
+
+    def inverse_deriv(self, z):
+        """
+        Derivative of the inverse of the C-Log-Log transform link function
+
+        Parameters
+        ----------
+        z : array-like
+            The value of the CLogLog link function at `p`
+
+        Returns
+        -------
+        g^(-1)'(z) : array
+            The derivative of the inverse of the CLogLog link function
+
+        Notes
+        -----
+        g^(-1)'(`z`) = exp(`z` - exp(`z`))
+        """
+        exponent = z - np.exp(z)
+        return np.exp(exponent)
+
+
+class cloglog(CLogLog):
+    """
+    The CLogLog transform link function.
+
+    Notes
+    -----
+    g(`p`) = log(-log(1-`p`))
+
+    cloglog is an alias for CLogLog and adds no behavior of its own.
+    """
+    pass
+
+
+class NegativeBinomial(object):
+    '''
+    The negative binomial link function
+
+    Parameters
+    ----------
+    alpha : float, optional
+        Alpha is the ancillary parameter of the Negative Binomial link
+        function. It is assumed to be nonstochastic. The default value is 1.
+        Permissible values are usually assumed to be in (.01, 2).
+
+    Notes
+    -----
+    All divisions use float literals so that an integer `alpha` or integer
+    `p` cannot trigger floor division under Python 2.
+    '''
+
+    def __init__(self, alpha=1.):
+        self.alpha = alpha
+
+    def _clean(self, x):
+        '''Clip `x` to the range [FLOAT_EPS, inf).'''
+        return np.clip(x, FLOAT_EPS, np.inf)
+
+    def __call__(self, p):
+        '''
+        Negative Binomial transform link function
+
+        Parameters
+        ----------
+        p : array-like
+            Mean parameters
+
+        Returns
+        -------
+        z : array
+            The negative binomial transform of `p`
+
+        Notes
+        -----
+        g(p) = log(p/(p + 1/alpha)), with `p` clipped by `_clean`
+        '''
+        p = self._clean(p)
+        # ``1.`` keeps 1/alpha from flooring to 0 for an integer alpha > 1.
+        return np.log(p/(p + 1./self.alpha))
+
+    def inverse(self, z):
+        '''
+        Inverse of the negative binomial transform
+
+        Parameters
+        -----------
+        z : array-like
+            The value of the negative binomial link at `p`.
+
+        Returns
+        -------
+        p : array
+            Mean parameters
+
+        Notes
+        -----
+        g^(-1)(z) = exp(z)/(alpha*(1-exp(z))), evaluated in the equivalent
+        form -1/(alpha*(1-exp(-z)))
+        '''
+        return -1./(self.alpha * (1 - np.exp(-z)))
+
+    def deriv(self, p):
+        '''
+        Derivative of the negative binomial transform
+
+        Parameters
+        ----------
+        p : array-like
+            Mean parameters
+
+        Returns
+        -------
+        g'(p) : array
+            The derivative of the negative binomial transform link function
+
+        Notes
+        -----
+        g'(x) = 1/(x+alpha*x^2). `p` is not clipped here.
+        '''
+        return 1./(p + self.alpha * p**2)
+
+    def deriv2(self,p):
+        '''
+        Second derivative of the negative binomial link function.
+
+        Parameters
+        ----------
+        p : array-like
+            Mean parameters
+
+        Returns
+        -------
+        g''(p) : array
+            The second derivative of the negative binomial transform link
+            function
+
+        Notes
+        -----
+        g''(x) = -(1+2*alpha*x)/(x+alpha*x^2)^2. `p` is not clipped here.
+        '''
+        numer = -(1. + 2 * self.alpha * p)
+        denom = (p + self.alpha * p**2)**2
+        return numer / denom
+
+    def inverse_deriv(self, z):
+        '''
+        Derivative of the inverse of the negative binomial transform
+
+        Parameters
+        -----------
+        z : array-like
+            Usually the linear predictor for a GLM or GEE model
+
+        Returns
+        -------
+        g^(-1)'(z) : array
+            The value of the derivative of the inverse of the negative
+            binomial link
+
+        Notes
+        -----
+        g^(-1)'(z) = exp(z) / (alpha * (1 - exp(z))^2)
+        '''
+        t = np.exp(z)
+        return t / (self.alpha * (1-t)**2)
+
+
+class nbinom(NegativeBinomial):
+    """
+    The negative binomial link function.
+
+    Notes
+    -----
+    g(p) = log(p/(p + 1/alpha))
+
+    nbinom is an alias of NegativeBinomial and adds no behavior of its own;
+    nbinom() is equivalent to NegativeBinomial(alpha=1.)
+    """
+    pass
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/glm/tests/test_glm.py b/release/python/0.8.0/crankshaft/crankshaft/regression/glm/tests/test_glm.py
new file mode 100644
index 0000000..b86ad6a
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/glm/tests/test_glm.py
@@ -0,0 +1,993 @@
+"""
+Tests for generalized linear models. The majority of the code is either
+directly borrowed or closely adapted from the statsmodels package. Model
+results were verified using the glm function in R and the GLM function in
+statsmodels.
+"""
+
+__author__ = 'Taylor Oshan tayoshan@gmail.com'
+
+from pysal.contrib.glm.glm import GLM
+from pysal.contrib.glm.family import Gaussian, Poisson, Binomial, QuasiPoisson
+import numpy as np
+import pysal
+import unittest
+import math
+
+
+class TestGaussian(unittest.TestCase):
+    """
+    Tests for Gaussian GLM
+    """
+
+    def setUp(self):
+        # Response is HOVAL; predictors are INC and CRIME from the
+        # columbus example dataset shipped with pysal.
+        db = pysal.open(pysal.examples.get_path('columbus.dbf'),'r')
+        y = np.array(db.by_col("HOVAL"))
+        self.y = np.reshape(y, (49,1))
+        X = []
+        X.append(db.by_col("INC"))
+        X.append(db.by_col("CRIME"))
+        self.X = np.array(X).T
+
+    def testIWLS(self):
+        model = GLM(self.y, self.X, family=Gaussian())
+        results = model.fit()
+        self.assertEqual(results.n, 49)
+        self.assertEqual(results.df_model, 2)
+        self.assertEqual(results.df_resid, 46)
+        # Computed floats are compared with assertAlmostEqual (as the
+        # Poisson tests do): exact equality depends on the numerical
+        # library build and is needlessly brittle.
+        self.assertAlmostEqual(results.aic, 408.73548964604873)
+        self.assertAlmostEqual(results.bic, 10467.991340493107)
+        self.assertAlmostEqual(results.deviance, 10647.015074206196)
+        self.assertAlmostEqual(results.llf, -201.36774482302437)
+        self.assertAlmostEqual(results.null_deviance, 16367.794631703124)
+        self.assertAlmostEqual(results.scale, 231.45684943926514)
+        np.testing.assert_allclose(results.params, [ 46.42818268, 0.62898397,
+            -0.48488854])
+        np.testing.assert_allclose(results.bse, [ 13.19175703, 0.53591045,
+            0.18267291])
+        np.testing.assert_allclose(results.cov_params(),
+            [[ 1.74022453e+02, -6.52060364e+00, -2.15109867e+00],
+             [ -6.52060364e+00, 2.87200008e-01, 6.80956787e-02],
+             [ -2.15109867e+00, 6.80956787e-02, 3.33693910e-02]])
+        np.testing.assert_allclose(results.tvalues, [ 3.51948437, 1.17367365,
+            -2.65440864])
+        np.testing.assert_allclose(results.pvalues, [ 0.00043239, 0.24052577,
+            0.00794475], atol=1.0e-8)
+        np.testing.assert_allclose(results.conf_int(),
+            [[ 20.57281401, 72.28355135],
+             [ -0.42138121, 1.67934915],
+             [ -0.84292086, -0.12685622]])
+        np.testing.assert_allclose(results.normalized_cov_params,
+            [[ 7.51857004e-01, -2.81720055e-02, -9.29373521e-03],
+             [ -2.81720055e-02, 1.24083607e-03, 2.94204638e-04],
+             [ -9.29373521e-03, 2.94204638e-04, 1.44171110e-04]])
+        np.testing.assert_allclose(results.mu,
+            [ 51.08752105, 50.66601521, 41.61367567, 33.53969014,
+              28.90638232, 43.87074227, 51.64910882, 34.92671563,
+              42.69267622, 38.49449134, 20.92815471, 25.25228436,
+              29.78223486, 25.02403635, 29.07959539, 24.63352275,
+              34.71372149, 33.40443052, 27.29864225, 65.86219802,
+              33.69854751, 37.44976435, 50.01304928, 36.81219959,
+              22.02674837, 31.64775955, 27.63563294, 23.7697291 ,
+              22.43119725, 21.76987089, 48.51169321, 49.05891819,
+              32.31656426, 44.20550354, 35.49244888, 51.27811308,
+              36.55047181, 27.37048914, 48.78812922, 57.31744163,
+              51.22914162, 54.70515578, 37.06622277, 44.5075759 ,
+              41.24328983, 49.93821824, 44.85644299, 40.93838609, 47.32045464])
+        self.assertAlmostEqual(results.pearson_chi2, 10647.015074206196)
+        # The expected values are the same for all five residual types in
+        # this Gaussian fit, so the fixture is written once and checked
+        # against each attribute in turn.
+        expected_resid = [
+              29.37948195, -6.09901421, -15.26367567, -0.33968914,
+              -5.68138232, -15.12074227, 23.35089118, 2.19828437,
+              9.90732178, 57.90551066, -1.22815371, -5.35228436,
+              11.91776614, 17.87596565, -11.07959539, -5.83352375,
+              7.03627851, 26.59556948, 3.30135775, 15.40479998,
+              -13.72354751, -6.99976335, -2.28004728, 16.38780141,
+              -4.12674837, -11.34776055, 6.46436506, -0.9197291 ,
+              10.06880275, 0.73012911, -16.71169421, -8.75891919,
+              -8.71656426, -15.75550254, -8.49244888, -14.97811408,
+              6.74952719, -4.67048814, -9.18813122, 4.63255937,
+              -9.12914362, -10.37215578, -11.36622177, -11.0075759 ,
+              -13.51028983, 26.16177976, -2.35644299, -14.13838709, -11.52045564]
+        for resid_name in ('resid_response', 'resid_working', 'resid_pearson',
+                           'resid_anscombe', 'resid_deviance'):
+            np.testing.assert_allclose(getattr(results, resid_name),
+                                       expected_resid)
+        # Null-model fitted mean: one constant repeated for all 49 rows.
+        np.testing.assert_allclose(results.null, [38.43622447] * 49)
+        self.assertAlmostEqual(results.D2, .349514377851)
+        self.assertAlmostEqual(results.adj_D2, 0.32123239427957673)
+
+class TestPoisson(unittest.TestCase):
+
+ def setUp(self):
+ db = pysal.open(pysal.examples.get_path('columbus.dbf'),'r')
+ y = np.array(db.by_col("HOVAL"))
+ y = np.reshape(y, (49,1))
+ self.y = np.round(y).astype(int)
+ X = []
+ X.append(db.by_col("INC"))
+ X.append(db.by_col("CRIME"))
+ self.X = np.array(X).T
+
+ def testIWLS(self):
+ model = GLM(self.y, self.X, family=Poisson())
+ results = model.fit()
+ self.assertEqual(results.n, 49)
+ self.assertEqual(results.df_model, 2)
+ self.assertEqual(results.df_resid, 46)
+ self.assertAlmostEqual(results.aic, 500.85184179938756)
+ self.assertAlmostEqual(results.bic, 51.436404535087661)
+ self.assertAlmostEqual(results.deviance, 230.46013824817649)
+ self.assertAlmostEqual(results.llf, -247.42592089969378)
+ self.assertAlmostEqual(results.null_deviance, 376.97293610347361)
+ self.assertEqual(results.scale, 1.0)
+ np.testing.assert_allclose(results.params, [ 3.92159085, 0.01183491,
+ -0.01371397], atol=1.0e-8)
+ np.testing.assert_allclose(results.bse, [ 0.13049161, 0.00511599,
+ 0.00193769], atol=1.0e-8)
+ np.testing.assert_allclose(results.cov_params(),
+ [[ 1.70280610e-02, -6.18628383e-04, -2.21386966e-04],
+ [ -6.18628383e-04, 2.61733917e-05, 6.77496445e-06],
+ [ -2.21386966e-04, 6.77496445e-06, 3.75463502e-06]])
+ np.testing.assert_allclose(results.tvalues, [ 30.0524361 , 2.31331634,
+ -7.07748998])
+ np.testing.assert_allclose(results.pvalues, [ 2.02901657e-198,
+ 2.07052532e-002, 1.46788805e-012])
+ np.testing.assert_allclose(results.conf_int(),
+ [[ 3.66583199e+00, 4.17734972e+00],
+ [ 1.80774841e-03, 2.18620753e-02],
+ [ -1.75117666e-02, -9.91616901e-03]])
+ np.testing.assert_allclose(results.normalized_cov_params,
+ [[ 1.70280610e-02, -6.18628383e-04, -2.21386966e-04],
+ [ -6.18628383e-04, 2.61733917e-05, 6.77496445e-06],
+ [ -2.21386966e-04, 6.77496445e-06, 3.75463502e-06]])
+ np.testing.assert_allclose(results.mu,
+ [ 51.26831574, 50.15022766, 40.06142973, 34.13799739,
+ 28.76119226, 42.6836241 , 55.64593703, 34.08277997,
+ 40.90389582, 37.19727958, 23.47459217, 26.12384057,
+ 29.78303507, 25.96888223, 29.14073823, 26.04369592,
+ 34.18996367, 32.28924005, 27.42284396, 72.69207879,
+ 33.05316347, 36.52276972, 49.2551479 , 35.33439632,
+ 24.07252457, 31.67153709, 27.81699478, 25.38021219,
+ 24.31759259, 23.13586161, 48.40724678, 48.57969818,
+ 31.92596006, 43.3679231 , 34.32925819, 51.78908089,
+ 34.49778584, 27.56236198, 48.34273194, 57.50829097,
+ 50.66038226, 54.68701352, 35.77103116, 43.21886784,
+ 40.07615759, 49.98658004, 43.13352883, 40.28520774, 46.28910294])
+ self.assertAlmostEqual(results.pearson_chi2, 264.62262932090221)
+ np.testing.assert_allclose(results.resid_response,
+ [ 28.73168426, -5.15022766, -14.06142973, -1.13799739,
+ -5.76119226, -13.6836241 , 19.35406297, 2.91722003,
+ 12.09610418, 58.80272042, -3.47459217, -6.12384057,
+ 12.21696493, 17.03111777, -11.14073823, -7.04369592,
+ 7.81003633, 27.71075995, 3.57715604, 8.30792121,
+ -13.05316347, -6.52276972, -1.2551479 , 17.66560368,
+ -6.07252457, -11.67153709, 6.18300522, -2.38021219,
+ 7.68240741, -1.13586161, -16.40724678, -8.57969818,
+ -7.92596006, -15.3679231 , -7.32925819, -15.78908089,
+ 8.50221416, -4.56236198, -8.34273194, 4.49170903,
+ -8.66038226, -10.68701352, -9.77103116, -9.21886784,
+ -12.07615759, 26.01341996, -1.13352883, -13.28520774, -10.28910294])
+ np.testing.assert_allclose(results.resid_working,
+ [ 1473.02506034, -258.28508941, -563.32097891, -38.84895192,
+ -165.69875817, -584.06666725, 1076.97496919, 99.42696848,
+ 494.77778514, 2187.30123163, -81.56463405, -159.97823479,
+ 363.858295 , 442.27909165, -324.64933645, -183.44387481,
+ 267.02485844, 894.75938 , 98.09579187, 603.9200634 ,
+ -431.44834594, -238.2296165 , -61.82249568, 624.20344168,
+ -146.18099686, -369.65551968, 171.99262399, -60.41029031,
+ 186.81765356, -26.27913713, -794.22964417, -416.79914795,
+ -253.04388425, -666.47490701, -251.6079969 , -817.70198717,
+ 293.30756327, -125.74947222, -403.31045369, 258.31051005,
+ -438.73827602, -584.440853 , -349.51985996, -398.42903071,
+ -483.96599444, 1300.32189904, -48.89309853, -535.19735391,
+ -476.27334527])
+ np.testing.assert_allclose(results.resid_pearson,
+ [ 4.01269878, -0.72726045, -2.221602 , -0.19477008, -1.07425881,
+ -2.09445239, 2.59451042, 0.49969118, 1.89131202, 9.64143836,
+ -0.71714142, -1.19813392, 2.23861212, 3.34207756, -2.0637814 ,
+ -1.3802231 , 1.33568403, 4.87662684, 0.68309584, 0.97442591,
+ -2.27043598, -1.07931992, -0.17884182, 2.97186889, -1.23768025,
+ -2.07392709, 1.1723155 , -0.47246327, 1.55789092, -0.23614708,
+ -2.35819937, -1.23096188, -1.40274877, -2.33362391, -1.25091503,
+ -2.19400568, 1.44755952, -0.8690235 , -1.19989348, 0.59230634,
+ -1.21675413, -1.44515442, -1.63370888, -1.40229988, -1.90759306,
+ 3.67934693, -0.17259375, -2.09312684, -1.51230062])
+ np.testing.assert_allclose(results.resid_anscombe,
+ [ 3.70889134, -0.74031295, -2.37729865, -0.19586855, -1.11374751,
+ -2.22611959, 2.46352013, 0.49282126, 1.80857757, 8.06444452,
+ -0.73610811, -1.25061371, 2.10820431, 3.05467547, -2.22437611,
+ -1.45136173, 1.28939698, 4.35942058, 0.66904552, 0.95674923,
+ -2.45438937, -1.11429881, -0.17961012, 2.76715848, -1.29658591,
+ -2.22816691, 1.13269136, -0.48017382, 1.48562248, -0.23812278,
+ -2.51664399, -1.2703721 , -1.4683091 , -2.49907536, -1.30026484,
+ -2.32398309, 1.39380683, -0.89495368, -1.23735395, 0.58485202,
+ -1.25435224, -1.4968484 , -1.71888038, -1.45756652, -2.01906267,
+ 3.41729922, -0.17335867, -2.22921828, -1.57470549])
+ np.testing.assert_allclose(results.resid_deviance,
+ [ 3.70529668, -0.74027329, -2.37536322, -0.19586751, -1.11349765,
+ -2.22466106, 2.46246446, 0.4928057 , 1.80799655, 8.02696525,
+ -0.73602255, -1.25021555, 2.10699958, 3.05084608, -2.22214376,
+ -1.45072221, 1.28913747, 4.35106213, 0.6689982 , 0.95669662,
+ -2.45171913, -1.11410444, -0.17960956, 2.76494217, -1.29609865,
+ -2.22612429, 1.13247453, -0.48015254, 1.48508549, -0.23812 ,
+ -2.51476072, -1.27015583, -1.46777697, -2.49699318, -1.29992892,
+ -2.32263069, 1.39348459, -0.89482132, -1.23715363, 0.58483655,
+ -1.25415329, -1.49653039, -1.7181055 , -1.45719072, -2.01791949,
+ 3.41437156, -0.1733581 , -2.22765605, -1.57426046])
+ np.testing.assert_allclose(results.null,
+ [ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143, 38.42857143])
+ self.assertAlmostEqual(results.D2, .388656011675)
+ self.assertAlmostEqual(results.adj_D2, 0.36207583826952761)#.375648692774)
+
+ def testQuasi(self):  # Fit a GLM with the QuasiPoisson family to self.y / self.X and compare every reported statistic with hard-coded reference values.
+ model = GLM(self.y, self.X, family=QuasiPoisson())
+ results = model.fit()
+ self.assertEqual(results.n, 49)  # the per-observation arrays asserted below each hold 49 values as well
+ self.assertEqual(results.df_model, 2)
+ self.assertEqual(results.df_resid, 46)
+ self.assertTrue(math.isnan(results.aic))  # aic is expected to be NaN here (as is llf below); NOTE(review): presumably because a quasi family has no true likelihood - confirm
+ self.assertAlmostEqual(results.bic, 51.436404535087661)  # assertAlmostEqual rounds the difference to 7 decimal places by default
+ self.assertAlmostEqual(results.deviance, 230.46013824817649)
+ self.assertTrue(math.isnan(results.llf))
+ self.assertAlmostEqual(results.null_deviance, 376.97293610347361)
+ self.assertAlmostEqual(results.scale, 5.7526658548022223)
+ np.testing.assert_allclose(results.params, [ 3.92159085, 0.01183491,
+ -0.01371397], atol=1.0e-8)  # atol is applied in addition to assert_allclose's default relative tolerance
+ np.testing.assert_allclose(results.bse, [ 0.31298042, 0.01227057,
+ 0.00464749], atol=1.0e-8)
+ np.testing.assert_allclose(results.cov_params(),  # no atol given: only the default relative tolerance applies
+ [[ 9.79567451e-02, -3.55876238e-03, -1.27356524e-03],
+ [ -3.55876238e-03, 1.50566777e-04, 3.89741067e-05],
+ [ -1.27356524e-03, 3.89741067e-05, 2.15991606e-05]])
+ np.testing.assert_allclose(results.tvalues, [ 12.52982796, 0.96449604,
+ -2.95083339])
+ np.testing.assert_allclose(results.pvalues, [ 5.12737770e-36,
+ 3.34797291e-01, 3.16917819e-03])
+ np.testing.assert_allclose(results.conf_int(),  # expected value is a 3x2 array: one pair of bounds per entry of params
+ [[ 3.3081605 , 4.53502121],
+ [-0.01221495, 0.03588478],
+ [-0.02282288, -0.00460506]], atol=1.0e-8)
+ np.testing.assert_allclose(results.normalized_cov_params,
+ [[ 1.70280610e-02, -6.18628383e-04, -2.21386966e-04],
+ [ -6.18628383e-04, 2.61733917e-05, 6.77496445e-06],
+ [ -2.21386966e-04, 6.77496445e-06, 3.75463502e-06]])
+ np.testing.assert_allclose(results.mu,  # 49 expected values, matching results.n above
+ [ 51.26831574, 50.15022766, 40.06142973, 34.13799739,
+ 28.76119226, 42.6836241 , 55.64593703, 34.08277997,
+ 40.90389582, 37.19727958, 23.47459217, 26.12384057,
+ 29.78303507, 25.96888223, 29.14073823, 26.04369592,
+ 34.18996367, 32.28924005, 27.42284396, 72.69207879,
+ 33.05316347, 36.52276972, 49.2551479 , 35.33439632,
+ 24.07252457, 31.67153709, 27.81699478, 25.38021219,
+ 24.31759259, 23.13586161, 48.40724678, 48.57969818,
+ 31.92596006, 43.3679231 , 34.32925819, 51.78908089,
+ 34.49778584, 27.56236198, 48.34273194, 57.50829097,
+ 50.66038226, 54.68701352, 35.77103116, 43.21886784,
+ 40.07615759, 49.98658004, 43.13352883, 40.28520774, 46.28910294])
+ self.assertAlmostEqual(results.pearson_chi2, 264.62262932090221)
+ np.testing.assert_allclose(results.resid_response,  # the five residual arrays below are each checked with default tolerances only
+ [ 28.73168426, -5.15022766, -14.06142973, -1.13799739,
+ -5.76119226, -13.6836241 , 19.35406297, 2.91722003,
+ 12.09610418, 58.80272042, -3.47459217, -6.12384057,
+ 12.21696493, 17.03111777, -11.14073823, -7.04369592,
+ 7.81003633, 27.71075995, 3.57715604, 8.30792121,
+ -13.05316347, -6.52276972, -1.2551479 , 17.66560368,
+ -6.07252457, -11.67153709, 6.18300522, -2.38021219,
+ 7.68240741, -1.13586161, -16.40724678, -8.57969818,
+ -7.92596006, -15.3679231 , -7.32925819, -15.78908089,
+ 8.50221416, -4.56236198, -8.34273194, 4.49170903,
+ -8.66038226, -10.68701352, -9.77103116, -9.21886784,
+ -12.07615759, 26.01341996, -1.13352883, -13.28520774, -10.28910294])
+ np.testing.assert_allclose(results.resid_working,
+ [ 1473.02506034, -258.28508941, -563.32097891, -38.84895192,
+ -165.69875817, -584.06666725, 1076.97496919, 99.42696848,
+ 494.77778514, 2187.30123163, -81.56463405, -159.97823479,
+ 363.858295 , 442.27909165, -324.64933645, -183.44387481,
+ 267.02485844, 894.75938 , 98.09579187, 603.9200634 ,
+ -431.44834594, -238.2296165 , -61.82249568, 624.20344168,
+ -146.18099686, -369.65551968, 171.99262399, -60.41029031,
+ 186.81765356, -26.27913713, -794.22964417, -416.79914795,
+ -253.04388425, -666.47490701, -251.6079969 , -817.70198717,
+ 293.30756327, -125.74947222, -403.31045369, 258.31051005,
+ -438.73827602, -584.440853 , -349.51985996, -398.42903071,
+ -483.96599444, 1300.32189904, -48.89309853, -535.19735391,
+ -476.27334527])
+ np.testing.assert_allclose(results.resid_pearson,
+ [ 4.01269878, -0.72726045, -2.221602 , -0.19477008, -1.07425881,
+ -2.09445239, 2.59451042, 0.49969118, 1.89131202, 9.64143836,
+ -0.71714142, -1.19813392, 2.23861212, 3.34207756, -2.0637814 ,
+ -1.3802231 , 1.33568403, 4.87662684, 0.68309584, 0.97442591,
+ -2.27043598, -1.07931992, -0.17884182, 2.97186889, -1.23768025,
+ -2.07392709, 1.1723155 , -0.47246327, 1.55789092, -0.23614708,
+ -2.35819937, -1.23096188, -1.40274877, -2.33362391, -1.25091503,
+ -2.19400568, 1.44755952, -0.8690235 , -1.19989348, 0.59230634,
+ -1.21675413, -1.44515442, -1.63370888, -1.40229988, -1.90759306,
+ 3.67934693, -0.17259375, -2.09312684, -1.51230062])
+ np.testing.assert_allclose(results.resid_anscombe,
+ [ 3.70889134, -0.74031295, -2.37729865, -0.19586855, -1.11374751,
+ -2.22611959, 2.46352013, 0.49282126, 1.80857757, 8.06444452,
+ -0.73610811, -1.25061371, 2.10820431, 3.05467547, -2.22437611,
+ -1.45136173, 1.28939698, 4.35942058, 0.66904552, 0.95674923,
+ -2.45438937, -1.11429881, -0.17961012, 2.76715848, -1.29658591,
+ -2.22816691, 1.13269136, -0.48017382, 1.48562248, -0.23812278,
+ -2.51664399, -1.2703721 , -1.4683091 , -2.49907536, -1.30026484,
+ -2.32398309, 1.39380683, -0.89495368, -1.23735395, 0.58485202,
+ -1.25435224, -1.4968484 , -1.71888038, -1.45756652, -2.01906267,
+ 3.41729922, -0.17335867, -2.22921828, -1.57470549])
+ np.testing.assert_allclose(results.resid_deviance,
+ [ 3.70529668, -0.74027329, -2.37536322, -0.19586751, -1.11349765,
+ -2.22466106, 2.46246446, 0.4928057 , 1.80799655, 8.02696525,
+ -0.73602255, -1.25021555, 2.10699958, 3.05084608, -2.22214376,
+ -1.45072221, 1.28913747, 4.35106213, 0.6689982 , 0.95669662,
+ -2.45171913, -1.11410444, -0.17960956, 2.76494217, -1.29609865,
+ -2.22612429, 1.13247453, -0.48015254, 1.48508549, -0.23812 ,
+ -2.51476072, -1.27015583, -1.46777697, -2.49699318, -1.29992892,
+ -2.32263069, 1.39348459, -0.89482132, -1.23715363, 0.58483655,
+ -1.25415329, -1.49653039, -1.7181055 , -1.45719072, -2.01791949,
+ 3.41437156, -0.1733581 , -2.22765605, -1.57426046])
+ np.testing.assert_allclose(results.null,  # expected to be the same constant, 38.42857143, repeated 49 times
+ [ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143,
+ 38.42857143, 38.42857143, 38.42857143, 38.42857143, 38.42857143])
+ self.assertAlmostEqual(results.D2, .388656011675)
+ self.assertAlmostEqual(results.adj_D2, 0.36207583826952761)
+
+class TestBinomial(unittest.TestCase):
+
+ def setUp(self):  # Build the fixture used by this class's tests: a response array and a single-predictor array, each stored on self as a 316x1 column.
+ # London house price data; each literal array below holds 316 values
+ # y: 'BATH2' column; only the values 0 and 1 occur (NOTE(review): presumably a "two or more bathrooms" flag - confirm against the dataset)
+ y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
+ 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+ self.y = y.reshape((316,1))  # store as a column: shape (316, 1); the hard-coded 316 must equal the number of literals above
+ # X: 'FLOORSZ' column; integer values (NOTE(review): presumably floor size - units are not stated here)
+ X = np.array([ 77, 75, 64, 95, 107, 100, 81, 151, 98, 260, 171, 161, 91,
+ 80, 50, 85, 52, 69, 60, 84, 155, 97, 69, 126, 90, 43,
+ 51, 41, 140, 80, 52, 86, 66, 60, 40, 155, 138, 97, 115,
+ 148, 206, 60, 53, 96, 88, 160, 31, 43, 154, 60, 131, 60,
+ 46, 61, 125, 150, 76, 92, 96, 100, 105, 72, 48, 41, 72,
+ 65, 60, 65, 98, 33, 144, 111, 91, 108, 38, 48, 95, 63,
+ 98, 129, 108, 51, 131, 66, 48, 127, 76, 68, 52, 64, 57,
+ 121, 67, 76, 112, 96, 90, 53, 93, 64, 97, 58, 44, 157,
+ 53, 70, 71, 167, 47, 70, 96, 77, 75, 71, 67, 47, 71,
+ 90, 69, 64, 65, 95, 60, 60, 65, 54, 121, 105, 50, 85,
+ 69, 69, 62, 65, 93, 93, 70, 62, 155, 68, 117, 80, 80,
+ 75, 98, 114, 86, 70, 50, 51, 163, 124, 59, 95, 51, 63,
+ 85, 53, 46, 102, 114, 83, 47, 40, 63, 123, 100, 63, 110,
+ 79, 98, 99, 120, 52, 48, 37, 81, 30, 88, 50, 35, 116,
+ 67, 45, 80, 86, 109, 59, 75, 60, 71, 141, 121, 50, 168,
+ 90, 51, 133, 75, 133, 127, 37, 68, 105, 61, 123, 151, 110,
+ 77, 220, 94, 77, 70, 100, 98, 126, 55, 105, 60, 176, 104,
+ 68, 62, 70, 48, 102, 80, 97, 66, 80, 102, 160, 55, 60,
+ 71, 125, 85, 85, 190, 137, 48, 41, 42, 51, 57, 60, 114,
+ 88, 84, 108, 66, 85, 42, 98, 90, 127, 100, 55, 76, 82,
+ 63, 80, 71, 76, 121, 109, 92, 160, 109, 185, 100, 90, 90,
+ 86, 88, 95, 116, 135, 61, 74, 60, 235, 76, 66, 100, 49,
+ 50, 37, 100, 88, 90, 52, 95, 81, 79, 96, 75, 91, 86,
+ 83, 180, 108, 80, 96, 49, 117, 117, 86, 46, 66, 95, 57,
+ 120, 137, 68, 240])
+ self.X = X.reshape((316,1))  # same (316, 1) column layout as self.y
+
+ def testIWLS(self):
+ model = GLM(self.y, self.X, family=Binomial())
+ results = model.fit()
+ self.assertEqual(results.n, 316)
+ self.assertEqual(results.df_model, 1)
+ self.assertEqual(results.df_resid, 314)
+ self.assertEqual(results.aic, 155.19347530342466)
+ self.assertEqual(results.bic, -1656.1095797628657)
+ self.assertEqual(results.deviance, 151.19347530342466)
+ self.assertEqual(results.llf, -75.596737651712331)
+ self.assertEqual(results.null_deviance, 189.16038985881212)
+ self.assertEqual(results.scale, 1.0)
+ np.testing.assert_allclose(results.params, [-5.33638276, 0.0287754 ])
+ np.testing.assert_allclose(results.bse, [ 0.64499904, 0.00518312],
+ atol=1.0e-8)
+ np.testing.assert_allclose(results.cov_params(),
+ [[ 4.16023762e-01, -3.14338457e-03],
+ [ -3.14338457e-03, 2.68646833e-05]])
+ np.testing.assert_allclose(results.tvalues, [-8.27347396, 5.55175826])
+ np.testing.assert_allclose(results.pvalues, [ 1.30111233e-16,
+ 2.82810512e-08])
+ np.testing.assert_allclose(results.conf_int(),
+ [[-6.60055765, -4.07220787],
+ [ 0.01861668, 0.03893412]], atol=1.0e-8)
+ np.testing.assert_allclose(results.normalized_cov_params,
+ [[ 4.16023762e-01, -3.14338457e-03],
+ [ -3.14338457e-03, 2.68646833e-05]])
+ np.testing.assert_allclose(results.mu,
+ [ 0.04226237, 0.03999333, 0.02946178, 0.0689636 , 0.09471181,
+ 0.07879431, 0.04717464, 0.27065598, 0.07471691, 0.89522144,
+ 0.39752487, 0.33102718, 0.06192993, 0.04589793, 0.01988679,
+ 0.0526265 , 0.02104007, 0.03386636, 0.02634295, 0.05121018,
+ 0.29396682, 0.07275173, 0.03386636, 0.15307528, 0.06027915,
+ 0.01631789, 0.02045547, 0.01541937, 0.2128508 , 0.04589793,
+ 0.02104007, 0.05407977, 0.0311527 , 0.02634295, 0.01498855,
+ 0.29396682, 0.20336776, 0.07275173, 0.11637537, 0.25395607,
+ 0.64367488, 0.02634295, 0.02164101, 0.07083428, 0.05710047,
+ 0.32468619, 0.01160845, 0.01631789, 0.28803008, 0.02634295,
+ 0.17267234, 0.02634295, 0.01776301, 0.02709115, 0.14938186,
+ 0.26501331, 0.04111287, 0.06362285, 0.07083428, 0.07879431,
+ 0.08989109, 0.03680743, 0.0187955 , 0.01541937, 0.03680743,
+ 0.03029581, 0.02634295, 0.03029581, 0.07471691, 0.01228768,
+ 0.23277197, 0.10505173, 0.06192993, 0.09720799, 0.01416217,
+ 0.0187955 , 0.0689636 , 0.02865003, 0.07471691, 0.16460503,
+ 0.09720799, 0.02045547, 0.17267234, 0.0311527 , 0.0187955 ,
+ 0.15684317, 0.04111287, 0.03293737, 0.02104007, 0.02946178,
+ 0.02421701, 0.1353385 , 0.03203302, 0.04111287, 0.10778798,
+ 0.07083428, 0.06027915, 0.02164101, 0.06535882, 0.02946178,
+ 0.07275173, 0.02490638, 0.01678627, 0.30605146, 0.02164101,
+ 0.03482061, 0.03580075, 0.37030921, 0.0182721 , 0.03482061,
+ 0.07083428, 0.04226237, 0.03999333, 0.03580075, 0.03203302,
+ 0.0182721 , 0.03580075, 0.06027915, 0.03386636, 0.02946178,
+ 0.03029581, 0.0689636 , 0.02634295, 0.02634295, 0.03029581,
+ 0.02225873, 0.1353385 , 0.08989109, 0.01988679, 0.0526265 ,
+ 0.03386636, 0.03386636, 0.02786 , 0.03029581, 0.06535882,
+ 0.06535882, 0.03482061, 0.02786 , 0.29396682, 0.03293737,
+ 0.12242534, 0.04589793, 0.04589793, 0.03999333, 0.07471691,
+ 0.11344884, 0.05407977, 0.03482061, 0.01988679, 0.02045547,
+ 0.34389327, 0.14576223, 0.02561486, 0.0689636 , 0.02045547,
+ 0.02865003, 0.0526265 , 0.02164101, 0.01776301, 0.08307425,
+ 0.11344884, 0.04982997, 0.0182721 , 0.01498855, 0.02865003,
+ 0.14221564, 0.07879431, 0.02865003, 0.10237696, 0.04465416,
+ 0.07471691, 0.07673078, 0.13200634, 0.02104007, 0.0187955 ,
+ 0.01376599, 0.04717464, 0.01128289, 0.05710047, 0.01988679,
+ 0.01300612, 0.11936722, 0.03203302, 0.01726786, 0.04589793,
+ 0.05407977, 0.09976271, 0.02561486, 0.03999333, 0.02634295,
+ 0.03580075, 0.21771181, 0.1353385 , 0.01988679, 0.37704374,
+ 0.06027915, 0.02045547, 0.18104935, 0.03999333, 0.18104935,
+ 0.15684317, 0.01376599, 0.03293737, 0.08989109, 0.02709115,
+ 0.14221564, 0.27065598, 0.10237696, 0.04226237, 0.72991785,
+ 0.06713876, 0.04226237, 0.03482061, 0.07879431, 0.07471691,
+ 0.15307528, 0.02289366, 0.08989109, 0.02634295, 0.43243779,
+ 0.08756457, 0.03293737, 0.02786 , 0.03482061, 0.0187955 ,
+ 0.08307425, 0.04589793, 0.07275173, 0.0311527 , 0.04589793,
+ 0.08307425, 0.32468619, 0.02289366, 0.02634295, 0.03580075,
+ 0.14938186, 0.0526265 , 0.0526265 , 0.53268924, 0.19874565,
+ 0.0187955 , 0.01541937, 0.01586237, 0.02045547, 0.02421701,
+ 0.02634295, 0.11344884, 0.05710047, 0.05121018, 0.09720799,
+ 0.0311527 , 0.0526265 , 0.01586237, 0.07471691, 0.06027915,
+ 0.15684317, 0.07879431, 0.02289366, 0.04111287, 0.04848506,
+ 0.02865003, 0.04589793, 0.03580075, 0.04111287, 0.1353385 ,
+ 0.09976271, 0.06362285, 0.32468619, 0.09976271, 0.49676673,
+ 0.07879431, 0.06027915, 0.06027915, 0.05407977, 0.05710047,
+ 0.0689636 , 0.11936722, 0.18973955, 0.02709115, 0.03890304,
+ 0.02634295, 0.80625182, 0.04111287, 0.0311527 , 0.07879431,
+ 0.0193336 , 0.01988679, 0.01376599, 0.07879431, 0.05710047,
+ 0.06027915, 0.02104007, 0.0689636 , 0.04717464, 0.04465416,
+ 0.07083428, 0.03999333, 0.06192993, 0.05407977, 0.04982997,
+ 0.46087756, 0.09720799, 0.04589793, 0.07083428, 0.0193336 ,
+ 0.12242534, 0.12242534, 0.05407977, 0.01776301, 0.0311527 ,
+ 0.0689636 , 0.02421701, 0.13200634, 0.19874565, 0.03293737,
+ 0.82774282], atol=1.0e-8)
+ self.assertAlmostEqual(results.pearson_chi2, 271.21110541713801)
+ np.testing.assert_allclose(results.resid_response,
+ [-0.04226237, -0.03999333, -0.02946178, -0.0689636 , -0.09471181,
+ -0.07879431, -0.04717464, -0.27065598, -0.07471691, 0.10477856,
+ -0.39752487, 0.66897282, -0.06192993, -0.04589793, -0.01988679,
+ -0.0526265 , -0.02104007, -0.03386636, -0.02634295, -0.05121018,
+ -0.29396682, 0.92724827, -0.03386636, -0.15307528, -0.06027915,
+ -0.01631789, -0.02045547, -0.01541937, -0.2128508 , -0.04589793,
+ -0.02104007, -0.05407977, -0.0311527 , -0.02634295, -0.01498855,
+ -0.29396682, 0.79663224, -0.07275173, -0.11637537, 0.74604393,
+ -0.64367488, -0.02634295, -0.02164101, -0.07083428, -0.05710047,
+ -0.32468619, -0.01160845, -0.01631789, -0.28803008, -0.02634295,
+ -0.17267234, -0.02634295, -0.01776301, -0.02709115, 0.85061814,
+ 0.73498669, -0.04111287, -0.06362285, -0.07083428, -0.07879431,
+ 0.91010891, -0.03680743, -0.0187955 , -0.01541937, -0.03680743,
+ -0.03029581, -0.02634295, -0.03029581, -0.07471691, -0.01228768,
+ 0.76722803, -0.10505173, -0.06192993, -0.09720799, -0.01416217,
+ -0.0187955 , -0.0689636 , -0.02865003, -0.07471691, -0.16460503,
+ -0.09720799, -0.02045547, 0.82732766, -0.0311527 , -0.0187955 ,
+ -0.15684317, -0.04111287, -0.03293737, -0.02104007, -0.02946178,
+ -0.02421701, -0.1353385 , -0.03203302, -0.04111287, -0.10778798,
+ -0.07083428, -0.06027915, -0.02164101, -0.06535882, -0.02946178,
+ -0.07275173, -0.02490638, -0.01678627, -0.30605146, -0.02164101,
+ -0.03482061, -0.03580075, 0.62969079, -0.0182721 , -0.03482061,
+ -0.07083428, -0.04226237, -0.03999333, -0.03580075, -0.03203302,
+ -0.0182721 , -0.03580075, -0.06027915, -0.03386636, -0.02946178,
+ -0.03029581, -0.0689636 , -0.02634295, -0.02634295, -0.03029581,
+ -0.02225873, -0.1353385 , -0.08989109, -0.01988679, -0.0526265 ,
+ -0.03386636, -0.03386636, -0.02786 , -0.03029581, -0.06535882,
+ -0.06535882, -0.03482061, -0.02786 , -0.29396682, -0.03293737,
+ -0.12242534, -0.04589793, -0.04589793, -0.03999333, -0.07471691,
+ -0.11344884, -0.05407977, -0.03482061, -0.01988679, -0.02045547,
+ 0.65610673, 0.85423777, -0.02561486, -0.0689636 , -0.02045547,
+ -0.02865003, -0.0526265 , -0.02164101, -0.01776301, -0.08307425,
+ -0.11344884, -0.04982997, -0.0182721 , -0.01498855, -0.02865003,
+ -0.14221564, -0.07879431, -0.02865003, -0.10237696, -0.04465416,
+ -0.07471691, -0.07673078, -0.13200634, -0.02104007, -0.0187955 ,
+ -0.01376599, -0.04717464, -0.01128289, 0.94289953, -0.01988679,
+ -0.01300612, -0.11936722, -0.03203302, -0.01726786, -0.04589793,
+ -0.05407977, -0.09976271, -0.02561486, -0.03999333, -0.02634295,
+ -0.03580075, -0.21771181, 0.8646615 , -0.01988679, 0.62295626,
+ -0.06027915, -0.02045547, -0.18104935, 0.96000667, -0.18104935,
+ -0.15684317, -0.01376599, -0.03293737, -0.08989109, -0.02709115,
+ -0.14221564, 0.72934402, -0.10237696, -0.04226237, -0.72991785,
+ -0.06713876, -0.04226237, -0.03482061, -0.07879431, -0.07471691,
+ -0.15307528, 0.97710634, 0.91010891, -0.02634295, -0.43243779,
+ -0.08756457, -0.03293737, -0.02786 , -0.03482061, -0.0187955 ,
+ 0.91692575, -0.04589793, -0.07275173, -0.0311527 , -0.04589793,
+ -0.08307425, 0.67531381, -0.02289366, -0.02634295, -0.03580075,
+ -0.14938186, -0.0526265 , -0.0526265 , 0.46731076, -0.19874565,
+ -0.0187955 , -0.01541937, -0.01586237, -0.02045547, -0.02421701,
+ -0.02634295, -0.11344884, -0.05710047, -0.05121018, -0.09720799,
+ 0.9688473 , -0.0526265 , -0.01586237, -0.07471691, -0.06027915,
+ -0.15684317, -0.07879431, -0.02289366, -0.04111287, -0.04848506,
+ -0.02865003, -0.04589793, -0.03580075, -0.04111287, -0.1353385 ,
+ -0.09976271, -0.06362285, 0.67531381, -0.09976271, -0.49676673,
+ -0.07879431, -0.06027915, -0.06027915, -0.05407977, -0.05710047,
+ -0.0689636 , -0.11936722, -0.18973955, -0.02709115, -0.03890304,
+ -0.02634295, 0.19374818, -0.04111287, -0.0311527 , -0.07879431,
+ -0.0193336 , -0.01988679, -0.01376599, -0.07879431, 0.94289953,
+ -0.06027915, -0.02104007, -0.0689636 , -0.04717464, -0.04465416,
+ 0.92916572, -0.03999333, -0.06192993, -0.05407977, -0.04982997,
+ -0.46087756, -0.09720799, -0.04589793, -0.07083428, -0.0193336 ,
+ -0.12242534, -0.12242534, -0.05407977, -0.01776301, -0.0311527 ,
+ -0.0689636 , -0.02421701, -0.13200634, -0.19874565, -0.03293737,
+ -0.82774282], atol=1.0e-8)
+ np.testing.assert_allclose(results.resid_working,
+ [ -1.71062283e-03, -1.53549840e-03, -8.42423701e-04,
+ -4.42798906e-03, -8.12073047e-03, -5.71934606e-03,
+ -2.12046213e-03, -5.34278480e-02, -5.16550074e-03,
+ 9.82823035e-03, -9.52067472e-02, 1.48142818e-01,
+ -3.59779501e-03, -2.00993083e-03, -3.87619325e-04,
+ -2.62379729e-03, -4.33370579e-04, -1.10808799e-03,
+ -6.75670103e-04, -2.48818484e-03, -6.10129090e-02,
+ 6.25511612e-02, -1.10808799e-03, -1.98451739e-02,
+ -3.41454749e-03, -2.61928659e-04, -4.09867263e-04,
+ -2.34090923e-04, -3.56621577e-02, -2.00993083e-03,
+ -4.33370579e-04, -2.76645832e-03, -9.40257152e-04,
+ -6.75670103e-04, -2.21289369e-04, -6.10129090e-02,
+ 1.29061842e-01, -4.90775251e-03, -1.19671283e-02,
+ 1.41347263e-01, -1.47631680e-01, -6.75670103e-04,
+ -4.58198217e-04, -4.66208406e-03, -3.07429001e-03,
+ -7.11923401e-02, -1.33191898e-04, -2.61928659e-04,
+ -5.90659690e-02, -6.75670103e-04, -2.46673839e-02,
+ -6.75670103e-04, -3.09919962e-04, -7.14047519e-04,
+ 1.08085429e-01, 1.43161630e-01, -1.62077632e-03,
+ -3.79032977e-03, -4.66208406e-03, -5.71934606e-03,
+ 7.44566288e-02, -1.30492035e-03, -3.46630910e-04,
+ -2.34090923e-04, -1.30492035e-03, -8.90029618e-04,
+ -6.75670103e-04, -8.90029618e-04, -5.16550074e-03,
+ -1.49131762e-04, 1.37018624e-01, -9.87652847e-03,
+ -3.59779501e-03, -8.53083698e-03, -1.97726627e-04,
+ -3.46630910e-04, -4.42798906e-03, -7.97307494e-04,
+ -5.16550074e-03, -2.26348718e-02, -8.53083698e-03,
+ -4.09867263e-04, 1.18189219e-01, -9.40257152e-04,
+ -3.46630910e-04, -2.07414715e-02, -1.62077632e-03,
+ -1.04913757e-03, -4.33370579e-04, -8.42423701e-04,
+ -5.72261321e-04, -1.58375811e-02, -9.93244730e-04,
+ -1.62077632e-03, -1.03659408e-02, -4.66208406e-03,
+ -3.41454749e-03, -4.58198217e-04, -3.99257703e-03,
+ -8.42423701e-04, -4.90775251e-03, -6.04877746e-04,
+ -2.77048947e-04, -6.50004229e-02, -4.58198217e-04,
+ -1.17025566e-03, -1.23580799e-03, 1.46831486e-01,
+ -3.27769165e-04, -1.17025566e-03, -4.66208406e-03,
+ -1.71062283e-03, -1.53549840e-03, -1.23580799e-03,
+ -9.93244730e-04, -3.27769165e-04, -1.23580799e-03,
+ -3.41454749e-03, -1.10808799e-03, -8.42423701e-04,
+ -8.90029618e-04, -4.42798906e-03, -6.75670103e-04,
+ -6.75670103e-04, -8.90029618e-04, -4.84422741e-04,
+ -1.58375811e-02, -7.35405096e-03, -3.87619325e-04,
+ -2.62379729e-03, -1.10808799e-03, -1.10808799e-03,
+ -7.54555329e-04, -8.90029618e-04, -3.99257703e-03,
+ -3.99257703e-03, -1.17025566e-03, -7.54555329e-04,
+ -6.10129090e-02, -1.04913757e-03, -1.31530576e-02,
+ -2.00993083e-03, -2.00993083e-03, -1.53549840e-03,
+ -5.16550074e-03, -1.14104800e-02, -2.76645832e-03,
+ -1.17025566e-03, -3.87619325e-04, -4.09867263e-04,
+ 1.48037813e-01, 1.06365931e-01, -6.39314594e-04,
+ -4.42798906e-03, -4.09867263e-04, -7.97307494e-04,
+ -2.62379729e-03, -4.58198217e-04, -3.09919962e-04,
+ -6.32800839e-03, -1.14104800e-02, -2.35929680e-03,
+ -3.27769165e-04, -2.21289369e-04, -7.97307494e-04,
+ -1.73489362e-02, -5.71934606e-03, -7.97307494e-04,
+ -9.40802551e-03, -1.90495384e-03, -5.16550074e-03,
+ -5.43585191e-03, -1.51253748e-02, -4.33370579e-04,
+ -3.46630910e-04, -1.86893696e-04, -2.12046213e-03,
+ -1.25867293e-04, 5.07657192e-02, -3.87619325e-04,
+ -1.66959104e-04, -1.25477263e-02, -9.93244730e-04,
+ -2.93030065e-04, -2.00993083e-03, -2.76645832e-03,
+ -8.95970087e-03, -6.39314594e-04, -1.53549840e-03,
+ -6.75670103e-04, -1.23580799e-03, -3.70792339e-02,
+ 1.01184411e-01, -3.87619325e-04, 1.46321062e-01,
+ -3.41454749e-03, -4.09867263e-04, -2.68442736e-02,
+ 3.68583645e-02, -2.68442736e-02, -2.07414715e-02,
+ -1.86893696e-04, -1.04913757e-03, -7.35405096e-03,
+ -7.14047519e-04, -1.73489362e-02, 1.43973473e-01,
+ -9.40802551e-03, -1.71062283e-03, -1.43894386e-01,
+ -4.20497779e-03, -1.71062283e-03, -1.17025566e-03,
+ -5.71934606e-03, -5.16550074e-03, -1.98451739e-02,
+ 2.18574168e-02, 7.44566288e-02, -6.75670103e-04,
+ -1.06135519e-01, -6.99614755e-03, -1.04913757e-03,
+ -7.54555329e-04, -1.17025566e-03, -3.46630910e-04,
+ 6.98449121e-02, -2.00993083e-03, -4.90775251e-03,
+ -9.40257152e-04, -2.00993083e-03, -6.32800839e-03,
+ 1.48072729e-01, -5.12120512e-04, -6.75670103e-04,
+ -1.23580799e-03, -1.89814939e-02, -2.62379729e-03,
+ -2.62379729e-03, 1.16328328e-01, -3.16494123e-02,
+ -3.46630910e-04, -2.34090923e-04, -2.47623705e-04,
+ -4.09867263e-04, -5.72261321e-04, -6.75670103e-04,
+ -1.14104800e-02, -3.07429001e-03, -2.48818484e-03,
+ -8.53083698e-03, 2.92419496e-02, -2.62379729e-03,
+ -2.47623705e-04, -5.16550074e-03, -3.41454749e-03,
+ -2.07414715e-02, -5.71934606e-03, -5.12120512e-04,
+ -1.62077632e-03, -2.23682205e-03, -7.97307494e-04,
+ -2.00993083e-03, -1.23580799e-03, -1.62077632e-03,
+ -1.58375811e-02, -8.95970087e-03, -3.79032977e-03,
+ 1.48072729e-01, -8.95970087e-03, -1.24186489e-01,
+ -5.71934606e-03, -3.41454749e-03, -3.41454749e-03,
+ -2.76645832e-03, -3.07429001e-03, -4.42798906e-03,
+ -1.25477263e-02, -2.91702648e-02, -7.14047519e-04,
+ -1.45456868e-03, -6.75670103e-04, 3.02653681e-02,
+ -1.62077632e-03, -9.40257152e-04, -5.71934606e-03,
+ -3.66561274e-04, -3.87619325e-04, -1.86893696e-04,
+ -5.71934606e-03, 5.07657192e-02, -3.41454749e-03,
+ -4.33370579e-04, -4.42798906e-03, -2.12046213e-03,
+ -1.90495384e-03, 6.11546973e-02, -1.53549840e-03,
+ -3.59779501e-03, -2.76645832e-03, -2.35929680e-03,
+ -1.14513988e-01, -8.53083698e-03, -2.00993083e-03,
+ -4.66208406e-03, -3.66561274e-04, -1.31530576e-02,
+ -1.31530576e-02, -2.76645832e-03, -3.09919962e-04,
+ -9.40257152e-04, -4.42798906e-03, -5.72261321e-04,
+ -1.51253748e-02, -3.16494123e-02, -1.04913757e-03,
+ -1.18023417e-01])
+ np.testing.assert_allclose(results.resid_pearson,
+ [-0.21006498, -0.20410641, -0.17423009, -0.27216147, -0.3234511 ,
+ -0.29246179, -0.22250903, -0.60917574, -0.28416602, 0.3421141 ,
+ -0.81229277, 1.42158361, -0.25694055, -0.21933056, -0.142444 ,
+ -0.23569027, -0.14660243, -0.18722578, -0.16448609, -0.2323235 ,
+ -0.64526275, 3.57006696, -0.18722578, -0.42513819, -0.25327023,
+ -0.12879668, -0.14450826, -0.12514332, -0.5200069 , -0.21933056,
+ -0.14660243, -0.23910582, -0.17931646, -0.16448609, -0.12335569,
+ -0.64526275, 1.97919183, -0.28010679, -0.36290807, 1.71396874,
+ -1.3440334 , -0.16448609, -0.14872695, -0.27610555, -0.24608613,
+ -0.69339243, -0.1083734 , -0.12879668, -0.63604537, -0.16448609,
+ -0.45684893, -0.16448609, -0.13447767, -0.16686977, 2.3862634 ,
+ 1.66535145, -0.20706426, -0.26066405, -0.27610555, -0.29246179,
+ 3.18191348, -0.19548397, -0.13840353, -0.12514332, -0.19548397,
+ -0.17675498, -0.16448609, -0.17675498, -0.28416602, -0.11153719,
+ 1.81550268, -0.34261205, -0.25694055, -0.32813846, -0.11985666,
+ -0.13840353, -0.27216147, -0.17174127, -0.28416602, -0.44389026,
+ -0.32813846, -0.14450826, 2.18890738, -0.17931646, -0.13840353,
+ -0.43129917, -0.20706426, -0.18455132, -0.14660243, -0.17423009,
+ -0.1575374 , -0.39562855, -0.18191506, -0.20706426, -0.34757708,
+ -0.27610555, -0.25327023, -0.14872695, -0.26444152, -0.17423009,
+ -0.28010679, -0.15982038, -0.13066317, -0.66410018, -0.14872695,
+ -0.189939 , -0.19269154, 1.30401147, -0.13642648, -0.189939 ,
+ -0.27610555, -0.21006498, -0.20410641, -0.19269154, -0.18191506,
+ -0.13642648, -0.19269154, -0.25327023, -0.18722578, -0.17423009,
+ -0.17675498, -0.27216147, -0.16448609, -0.16448609, -0.17675498,
+ -0.15088226, -0.39562855, -0.3142763 , -0.142444 , -0.23569027,
+ -0.18722578, -0.18722578, -0.169288 , -0.17675498, -0.26444152,
+ -0.26444152, -0.189939 , -0.169288 , -0.64526275, -0.18455132,
+ -0.3735026 , -0.21933056, -0.21933056, -0.20410641, -0.28416602,
+ -0.35772404, -0.23910582, -0.189939 , -0.142444 , -0.14450826,
+ 1.38125991, 2.42084442, -0.16213645, -0.27216147, -0.14450826,
+ -0.17174127, -0.23569027, -0.14872695, -0.13447767, -0.30099975,
+ -0.35772404, -0.22900483, -0.13642648, -0.12335569, -0.17174127,
+ -0.4071783 , -0.29246179, -0.17174127, -0.33771794, -0.21619749,
+ -0.28416602, -0.28828407, -0.38997712, -0.14660243, -0.13840353,
+ -0.11814455, -0.22250903, -0.10682532, 4.06361781, -0.142444 ,
+ -0.11479334, -0.36816723, -0.18191506, -0.1325567 , -0.21933056,
+ -0.23910582, -0.33289374, -0.16213645, -0.20410641, -0.16448609,
+ -0.19269154, -0.52754269, 2.52762346, -0.142444 , 1.28538406,
+ -0.25327023, -0.14450826, -0.47018591, 4.89940505, -0.47018591,
+ -0.43129917, -0.11814455, -0.18455132, -0.3142763 , -0.16686977,
+ -0.4071783 , 1.64156241, -0.33771794, -0.21006498, -1.6439517 ,
+ -0.26827373, -0.21006498, -0.189939 , -0.29246179, -0.28416602,
+ -0.42513819, 6.53301013, 3.18191348, -0.16448609, -0.87288109,
+ -0.30978696, -0.18455132, -0.169288 , -0.189939 , -0.13840353,
+ 3.32226189, -0.21933056, -0.28010679, -0.17931646, -0.21933056,
+ -0.30099975, 1.44218477, -0.1530688 , -0.16448609, -0.19269154,
+ -0.41906522, -0.23569027, -0.23569027, 0.93662539, -0.4980393 ,
+ -0.13840353, -0.12514332, -0.12695686, -0.14450826, -0.1575374 ,
+ -0.16448609, -0.35772404, -0.24608613, -0.2323235 , -0.32813846,
+ 5.57673284, -0.23569027, -0.12695686, -0.28416602, -0.25327023,
+ -0.43129917, -0.29246179, -0.1530688 , -0.20706426, -0.22573357,
+ -0.17174127, -0.21933056, -0.19269154, -0.20706426, -0.39562855,
+ -0.33289374, -0.26066405, 1.44218477, -0.33289374, -0.99355423,
+ -0.29246179, -0.25327023, -0.25327023, -0.23910582, -0.24608613,
+ -0.27216147, -0.36816723, -0.48391225, -0.16686977, -0.20119082,
+ -0.16448609, 0.49021146, -0.20706426, -0.17931646, -0.29246179,
+ -0.14040923, -0.142444 , -0.11814455, -0.29246179, 4.06361781,
+ -0.25327023, -0.14660243, -0.27216147, -0.22250903, -0.21619749,
+ 3.6218033 , -0.20410641, -0.25694055, -0.23910582, -0.22900483,
+ -0.92458976, -0.32813846, -0.21933056, -0.27610555, -0.14040923,
+ -0.3735026 , -0.3735026 , -0.23910582, -0.13447767, -0.17931646,
+ -0.27216147, -0.1575374 , -0.38997712, -0.4980393 , -0.18455132,
+ -2.19209332])
+ np.testing.assert_allclose(results.resid_anscombe,
+ [-0.31237627, -0.3036605 , -0.25978208, -0.40240831, -0.47552289,
+ -0.43149255, -0.33053793, -0.85617194, -0.41962951, 0.50181328,
+ -1.0954382 , 1.66940149, -0.38048321, -0.3259044 , -0.21280762,
+ -0.34971301, -0.21896842, -0.27890356, -0.2454118 , -0.34482158,
+ -0.90063409, 2.80452413, -0.27890356, -0.61652596, -0.37518169,
+ -0.19255932, -0.2158664 , -0.18713159, -0.74270558, -0.3259044 ,
+ -0.21896842, -0.35467084, -0.2672722 , -0.2454118 , -0.18447466,
+ -0.90063409, 2.05763941, -0.41381347, -0.53089521, 1.88552083,
+ -1.60654218, -0.2454118 , -0.22211425, -0.40807333, -0.3647888 ,
+ -0.95861559, -0.16218047, -0.19255932, -0.88935802, -0.2454118 ,
+ -0.65930821, -0.2454118 , -0.20099345, -0.24892975, 2.28774016,
+ 1.85167195, -0.30798858, -0.38585584, -0.40807333, -0.43149255,
+ 2.65398426, -0.2910267 , -0.20681747, -0.18713159, -0.2910267 ,
+ -0.26350118, -0.2454118 , -0.26350118, -0.41962951, -0.16689207,
+ 1.95381191, -0.50251231, -0.38048321, -0.48214234, -0.17927213,
+ -0.20681747, -0.40240831, -0.25611424, -0.41962951, -0.64189694,
+ -0.48214234, -0.2158664 , 2.18071204, -0.2672722 , -0.20681747,
+ -0.62488429, -0.30798858, -0.27497271, -0.21896842, -0.25978208,
+ -0.23514749, -0.57618899, -0.27109582, -0.30798858, -0.50947546,
+ -0.40807333, -0.37518169, -0.22211425, -0.39130036, -0.25978208,
+ -0.41381347, -0.2385213 , -0.19533116, -0.92350689, -0.22211425,
+ -0.28288904, -0.28692985, 1.5730846 , -0.20388497, -0.28288904,
+ -0.40807333, -0.31237627, -0.3036605 , -0.28692985, -0.27109582,
+ -0.20388497, -0.28692985, -0.37518169, -0.27890356, -0.25978208,
+ -0.26350118, -0.40240831, -0.2454118 , -0.2454118 , -0.26350118,
+ -0.22530448, -0.57618899, -0.46253505, -0.21280762, -0.34971301,
+ -0.27890356, -0.27890356, -0.25249702, -0.26350118, -0.39130036,
+ -0.39130036, -0.28288904, -0.25249702, -0.90063409, -0.27497271,
+ -0.5456246 , -0.3259044 , -0.3259044 , -0.3036605 , -0.41962951,
+ -0.52366614, -0.35467084, -0.28288904, -0.21280762, -0.2158664 ,
+ 1.63703418, 2.30570989, -0.24194253, -0.40240831, -0.2158664 ,
+ -0.25611424, -0.34971301, -0.22211425, -0.20099345, -0.44366892,
+ -0.52366614, -0.33999576, -0.20388497, -0.18447466, -0.25611424,
+ -0.59203547, -0.43149255, -0.25611424, -0.49563627, -0.32133344,
+ -0.41962951, -0.42552227, -0.56840788, -0.21896842, -0.20681747,
+ -0.17672552, -0.33053793, -0.15987433, 2.9768074 , -0.21280762,
+ -0.17173916, -0.53821445, -0.27109582, -0.19814236, -0.3259044 ,
+ -0.35467084, -0.48884654, -0.24194253, -0.3036605 , -0.2454118 ,
+ -0.28692985, -0.75249089, 2.35983933, -0.21280762, 1.55726719,
+ -0.37518169, -0.2158664 , -0.67712261, 3.23165236, -0.67712261,
+ -0.62488429, -0.17672552, -0.27497271, -0.46253505, -0.24892975,
+ -0.59203547, 1.83482464, -0.49563627, -0.31237627, -1.83652534,
+ -0.39681759, -0.31237627, -0.28288904, -0.43149255, -0.41962951,
+ -0.61652596, 3.63983609, 2.65398426, -0.2454118 , -1.16171662,
+ -0.45616505, -0.27497271, -0.25249702, -0.28288904, -0.20681747,
+ 2.71015945, -0.3259044 , -0.41381347, -0.2672722 , -0.3259044 ,
+ -0.44366892, 1.68567947, -0.22853969, -0.2454118 , -0.28692985,
+ -0.60826548, -0.34971301, -0.34971301, 1.2290223 , -0.71397735,
+ -0.20681747, -0.18713159, -0.1898263 , -0.2158664 , -0.23514749,
+ -0.2454118 , -0.52366614, -0.3647888 , -0.34482158, -0.48214234,
+ 3.41271513, -0.34971301, -0.1898263 , -0.41962951, -0.37518169,
+ -0.62488429, -0.43149255, -0.22853969, -0.30798858, -0.3352348 ,
+ -0.25611424, -0.3259044 , -0.28692985, -0.30798858, -0.57618899,
+ -0.48884654, -0.38585584, 1.68567947, -0.48884654, -1.28709718,
+ -0.43149255, -0.37518169, -0.37518169, -0.35467084, -0.3647888 ,
+ -0.40240831, -0.53821445, -0.69534436, -0.24892975, -0.29939131,
+ -0.2454118 , 0.70366797, -0.30798858, -0.2672722 , -0.43149255,
+ -0.2097915 , -0.21280762, -0.17672552, -0.43149255, 2.9768074 ,
+ -0.37518169, -0.21896842, -0.40240831, -0.33053793, -0.32133344,
+ 2.82351017, -0.3036605 , -0.38048321, -0.35467084, -0.33999576,
+ -1.21650102, -0.48214234, -0.3259044 , -0.40807333, -0.2097915 ,
+ -0.5456246 , -0.5456246 , -0.35467084, -0.20099345, -0.2672722 ,
+ -0.40240831, -0.23514749, -0.56840788, -0.71397735, -0.27497271,
+ -2.18250381])
+ np.testing.assert_allclose(results.resid_deviance,
+ [-0.29387552, -0.2857098 , -0.24455876, -0.37803944, -0.44609851,
+ -0.40514674, -0.31088148, -0.79449324, -0.39409528, 0.47049798,
+ -1.00668653, 1.48698001, -0.35757692, -0.30654405, -0.20043547,
+ -0.32882173, -0.20622595, -0.26249995, -0.23106769, -0.32424676,
+ -0.83437766, 2.28941155, -0.26249995, -0.57644334, -0.35262564,
+ -0.18139734, -0.20331052, -0.17629229, -0.69186337, -0.30654405,
+ -0.20622595, -0.33345774, -0.251588 , -0.23106769, -0.17379306,
+ -0.83437766, 1.78479093, -0.38867448, -0.4974393 , 1.65565332,
+ -1.43660134, -0.23106769, -0.20918228, -0.38332275, -0.34291558,
+ -0.88609006, -0.15281596, -0.18139734, -0.82428104, -0.23106769,
+ -0.61571821, -0.23106769, -0.18932865, -0.234371 , 1.94999969,
+ 1.62970871, -0.2897651 , -0.36259328, -0.38332275, -0.40514674,
+ 2.19506559, -0.27386827, -0.19480442, -0.17629229, -0.27386827,
+ -0.24804925, -0.23106769, -0.24804925, -0.39409528, -0.15725009,
+ 1.7074519 , -0.47114617, -0.35757692, -0.4522457 , -0.16889886,
+ -0.19480442, -0.37803944, -0.24111595, -0.39409528, -0.59975102,
+ -0.4522457 , -0.20331052, 1.87422489, -0.251588 , -0.19480442,
+ -0.5841272 , -0.2897651 , -0.25881274, -0.20622595, -0.24455876,
+ -0.22142749, -0.53929061, -0.25517563, -0.2897651 , -0.47760126,
+ -0.38332275, -0.35262564, -0.20918228, -0.36767536, -0.24455876,
+ -0.38867448, -0.2245965 , -0.18400413, -0.85481866, -0.20918228,
+ -0.26623785, -0.27002708, 1.40955093, -0.19204738, -0.26623785,
+ -0.38332275, -0.29387552, -0.2857098 , -0.27002708, -0.25517563,
+ -0.19204738, -0.27002708, -0.35262564, -0.26249995, -0.24455876,
+ -0.24804925, -0.37803944, -0.23106769, -0.23106769, -0.24804925,
+ -0.21218006, -0.53929061, -0.43402996, -0.20043547, -0.32882173,
+ -0.26249995, -0.26249995, -0.23772023, -0.24804925, -0.36767536,
+ -0.36767536, -0.26623785, -0.23772023, -0.83437766, -0.25881274,
+ -0.51106408, -0.30654405, -0.30654405, -0.2857098 , -0.39409528,
+ -0.49074728, -0.33345774, -0.26623785, -0.20043547, -0.20331052,
+ 1.46111186, 1.96253843, -0.22780971, -0.37803944, -0.20331052,
+ -0.24111595, -0.32882173, -0.20918228, -0.18932865, -0.41648237,
+ -0.49074728, -0.31973217, -0.19204738, -0.17379306, -0.24111595,
+ -0.55389988, -0.40514674, -0.24111595, -0.46476893, -0.30226435,
+ -0.39409528, -0.39958581, -0.53211065, -0.20622595, -0.19480442,
+ -0.16650295, -0.31088148, -0.15064545, 2.39288231, -0.20043547,
+ -0.16181126, -0.5042114 , -0.25517563, -0.18664773, -0.30654405,
+ -0.33345774, -0.45846897, -0.22780971, -0.2857098 , -0.23106769,
+ -0.27002708, -0.7007597 , 1.99998811, -0.20043547, 1.39670618,
+ -0.35262564, -0.20331052, -0.63203077, 2.53733821, -0.63203077,
+ -0.5841272 , -0.16650295, -0.25881274, -0.43402996, -0.234371 ,
+ -0.55389988, 1.61672923, -0.46476893, -0.29387552, -1.61804148,
+ -0.37282386, -0.29387552, -0.26623785, -0.40514674, -0.39409528,
+ -0.57644334, 2.74841605, 2.19506559, -0.23106769, -1.06433539,
+ -0.42810736, -0.25881274, -0.23772023, -0.26623785, -0.19480442,
+ 2.23070414, -0.30654405, -0.38867448, -0.251588 , -0.30654405,
+ -0.41648237, 1.49993075, -0.21521982, -0.23106769, -0.27002708,
+ -0.5688444 , -0.32882173, -0.32882173, 1.12233423, -0.66569789,
+ -0.19480442, -0.17629229, -0.17882689, -0.20331052, -0.22142749,
+ -0.23106769, -0.49074728, -0.34291558, -0.32424676, -0.4522457 ,
+ 2.63395309, -0.32882173, -0.17882689, -0.39409528, -0.35262564,
+ -0.5841272 , -0.40514674, -0.21521982, -0.2897651 , -0.3152773 ,
+ -0.24111595, -0.30654405, -0.27002708, -0.2897651 , -0.53929061,
+ -0.45846897, -0.36259328, 1.49993075, -0.45846897, -1.17192274,
+ -0.40514674, -0.35262564, -0.35262564, -0.33345774, -0.34291558,
+ -0.37803944, -0.5042114 , -0.64869028, -0.234371 , -0.28170899,
+ -0.23106769, 0.65629132, -0.2897651 , -0.251588 , -0.40514674,
+ -0.19760028, -0.20043547, -0.16650295, -0.40514674, 2.39288231,
+ -0.35262564, -0.20622595, -0.37803944, -0.31088148, -0.30226435,
+ 2.30104857, -0.2857098 , -0.35757692, -0.33345774, -0.31973217,
+ -1.11158678, -0.4522457 , -0.30654405, -0.38332275, -0.19760028,
+ -0.51106408, -0.51106408, -0.33345774, -0.18932865, -0.251588 ,
+ -0.37803944, -0.22142749, -0.53211065, -0.66569789, -0.25881274,
+ -1.87550882])
+ np.testing.assert_allclose(results.null,
+ [ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759, 0.08860759, 0.08860759, 0.08860759, 0.08860759,
+ 0.08860759])
+ self.assertAlmostEqual(results.D2, .200712816165)
+ self.assertAlmostEqual(results.adj_D2, 0.19816731557930456)
+
+
+
+if __name__ == '__main__':
+ unittest.main()  # run every test case in this module when executed as a script
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/glm/utils.py b/release/python/0.8.0/crankshaft/crankshaft/regression/glm/utils.py
new file mode 100644
index 0000000..0789675
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/glm/utils.py
@@ -0,0 +1,350 @@
+
+from __future__ import absolute_import, print_function
+import numpy as np
+import warnings
+
+
+def _bit_length_26(x):
+    """Backport of ``int.bit_length`` for interpreters that lack it (Python < 2.7)."""
+    # bin() gives '0b101' (or '-0b101'); stripping the characters '-', '0' and 'b'
+    # from the left leaves only the significant digits, so 0 maps to '' -> 0 bits
+    # and a negative value is measured by its magnitude (the previous version
+    # returned one bit too many for negatives because it counted the '-' sign).
+    return len(bin(x).lstrip('-0b'))
+
+
+try:
+    from scipy.lib._version import NumpyVersion
+except ImportError:
+    import re
+    string_types = basestring
+
+    class NumpyVersion():
+        """Parse and compare numpy version strings.
+        Numpy has the following versioning scheme (numbers given are examples; they
+        can be > 9 in principle):
+        - Released version: '1.8.0', '1.8.1', etc.
+        - Alpha: '1.8.0a1', '1.8.0a2', etc.
+        - Beta: '1.8.0b1', '1.8.0b2', etc.
+        - Release candidates: '1.8.0rc1', '1.8.0rc2', etc.
+        - Development versions: '1.8.0.dev-f1234afa' (git commit hash appended)
+        - Development versions after a1: '1.8.0a1.dev-f1234afa',
+                                        '1.8.0b2.dev-f1234afa',
+                                        '1.8.1rc1.dev-f1234afa', etc.
+        - Development versions (no git hash available): '1.8.0.dev-Unknown'
+        Comparing needs to be done against a valid version string or other
+        `NumpyVersion` instance.
+        Parameters
+        ----------
+        vstring : str
+            Numpy version string (``np.__version__``).
+        Notes
+        -----
+        All dev versions of the same (pre-)release compare equal.
+        Examples
+        --------
+        >>> from scipy.lib._version import NumpyVersion
+        >>> if NumpyVersion(np.__version__) < '1.7.0':
+        ...     print('skip')
+        skip
+        >>> NumpyVersion('1.7')  # raises ValueError, add ".0"
+        """
+
+        def __init__(self, vstring):
+            self.vstring = vstring
+            ver_main = re.match(r'\d+[.]\d+[.]\d+', vstring)  # \d+ so majors >= 10 parse too
+            if not ver_main:
+                raise ValueError("Not a valid numpy version string")
+            # keep the matched "major.minor.bugfix" text and its integer parts
+            self.version = ver_main.group()
+            self.major, self.minor, self.bugfix = [int(x) for x in
+                                                   self.version.split('.')]
+            if len(vstring) == ver_main.end():
+                self.pre_release = 'final'
+            else:
+                alpha = re.match(r'a\d', vstring[ver_main.end():])
+                beta = re.match(r'b\d', vstring[ver_main.end():])
+                rc = re.match(r'rc\d', vstring[ver_main.end():])
+                pre_rel = [m for m in [alpha, beta, rc] if m is not None]
+                if pre_rel:
+                    self.pre_release = pre_rel[0].group()
+                else:
+                    self.pre_release = ''
+            # '.dev' (no trailing dash) also flags PEP 440 builds such as '1.9.0.dev0+abc'
+            self.is_devversion = bool(re.search(r'.dev', vstring))
+
+        def _compare_version(self, other):
+            """Compare major.minor.bugfix"""
+            if self.major == other.major:
+                if self.minor == other.minor:
+                    if self.bugfix == other.bugfix:
+                        vercmp = 0
+                    elif self.bugfix > other.bugfix:
+                        vercmp = 1
+                    else:
+                        vercmp = -1
+                elif self.minor > other.minor:
+                    vercmp = 1
+                else:
+                    vercmp = -1
+            elif self.major > other.major:
+                vercmp = 1
+            else:
+                vercmp = -1
+
+            return vercmp
+
+        def _compare_pre_release(self, other):
+            """Compare alpha/beta/rc/final."""
+            if self.pre_release == other.pre_release:
+                vercmp = 0
+            elif self.pre_release == 'final':
+                vercmp = 1
+            elif other.pre_release == 'final':
+                vercmp = -1
+            elif self.pre_release > other.pre_release:
+                vercmp = 1
+            else:
+                vercmp = -1
+
+            return vercmp
+
+        def _compare(self, other):
+            if not isinstance(other, (string_types, NumpyVersion)):
+                raise ValueError("Invalid object to compare with NumpyVersion.")
+
+            if isinstance(other, string_types):
+                other = NumpyVersion(other)
+
+            vercmp = self._compare_version(other)
+            if vercmp == 0:
+                # Same x.y.z version, check for alpha/beta/rc
+                vercmp = self._compare_pre_release(other)
+                if vercmp == 0:
+                    # Same version and same pre-release, check if dev version
+                    if self.is_devversion is other.is_devversion:
+                        vercmp = 0
+                    elif self.is_devversion:
+                        vercmp = -1
+                    else:
+                        vercmp = 1
+
+            return vercmp
+
+        def __lt__(self, other):
+            return self._compare(other) < 0
+
+        def __le__(self, other):
+            return self._compare(other) <= 0
+
+        def __eq__(self, other):
+            return self._compare(other) == 0
+
+        def __ne__(self, other):
+            return self._compare(other) != 0
+
+        def __gt__(self, other):
+            return self._compare(other) > 0
+
+        def __ge__(self, other):
+            return self._compare(other) >= 0
+
+        def __repr__(self):  # was `__repr`, which Python never calls for repr()
+            return "NumpyVersion(%s)" % self.vstring
+
+
+def _next_regular(target):
+    """
+    Return the smallest regular number that is >= ``target``.
+
+    Regular numbers are composites of the prime factors 2, 3, and 5.
+    Also known as 5-smooth numbers or Hamming numbers, these are the optimal
+    size for inputs to FFTPACK.
+
+    ``target`` must be a positive integer.
+    """
+    # Nothing to search for when target is at most 6 or is a power of two
+    # (a power of two shares no set bit with its predecessor).
+    if target <= 6 or (target & (target - 1)) == 0:
+        return target
+
+    best = float('inf')  # any candidate found below is smaller than this
+    pow5 = 1
+    while pow5 < target:
+        pow35 = pow5
+        while pow35 < target:
+            # ceil(target / pow35) using integer arithmetic only
+            cofactor = -(-target // pow35)
+            # smallest power of two that is >= cofactor
+            try:
+                pow2 = 2 ** ((cofactor - 1).bit_length())
+            except AttributeError:
+                # Fallback for Python <2.7
+                pow2 = 2 ** _bit_length_26(cofactor - 1)
+
+            candidate = pow2 * pow35
+            if candidate == target:
+                return candidate
+            best = min(best, candidate)
+            pow35 *= 3
+            if pow35 == target:
+                return pow35
+        # The inner loop ended, so pow35 is the first 3**i * 5**j product
+        # above target; it is itself a candidate.
+        best = min(best, pow35)
+        pow5 *= 5
+        if pow5 == target:
+            return pow5
+    # The outer loop ended, so pow5 is the first power of five above target;
+    # it is the last candidate to consider.
+    best = min(best, pow5)
+    return best
+if NumpyVersion(np.__version__) >= '1.7.1':
+    np_matrix_rank = np.linalg.matrix_rank
+else:
+    def np_matrix_rank(M, tol=None):
+        """
+        Return matrix rank of array using SVD method
+        Rank of the array is the number of SVD singular values of the array that are
+        greater than `tol`.
+        Parameters
+        ----------
+        M : {(M,), (M, N)} array_like
+            array of <=2 dimensions
+        tol : {None, float}, optional
+            threshold below which SVD values are considered zero. If `tol` is
+            None, and ``S`` is an array with singular values for `M`, and
+            ``eps`` is the epsilon value for datatype of ``S``, then `tol` is
+            set to ``S.max() * max(M.shape) * eps``.
+        Notes
+        -----
+        The default threshold to detect rank deficiency is a test on the magnitude
+        of the singular values of `M`. By default, we identify singular values less
+        than ``S.max() * max(M.shape) * eps`` as indicating rank deficiency (with
+        the symbols defined above). This is the algorithm MATLAB uses [1]. It also
+        appears in *Numerical recipes* in the discussion of SVD solutions for linear
+        least squares [2].
+        This default threshold is designed to detect rank deficiency accounting for
+        the numerical errors of the SVD computation. Imagine that there is a column
+        in `M` that is an exact (in floating point) linear combination of other
+        columns in `M`. Computing the SVD on `M` will not produce a singular value
+        exactly equal to 0 in general: any difference of the smallest SVD value from
+        0 will be caused by numerical imprecision in the calculation of the SVD.
+        Our threshold for small SVD values takes this numerical imprecision into
+        account, and the default threshold will detect such numerical rank
+        deficiency. The threshold may declare a matrix `M` rank deficient even if
+        the linear combination of some columns of `M` is not exactly equal to
+        another column of `M` but only numerically very close to another column of
+        `M`.
+        We chose our default threshold because it is in wide use. Other thresholds
+        are possible. For example, elsewhere in the 2007 edition of *Numerical
+        recipes* there is an alternative threshold of ``S.max() *
+        np.finfo(M.dtype).eps / 2. * np.sqrt(m + n + 1.)``. The authors describe
+        this threshold as being based on "expected roundoff error" (p 71).
+        The thresholds above deal with floating point roundoff error in the
+        calculation of the SVD. However, you may have more information about the
+        sources of error in `M` that would make you consider other tolerance values
+        to detect *effective* rank deficiency. The most useful measure of the
+        tolerance depends on the operations you intend to use on your matrix. For
+        example, if your data come from uncertain measurements with uncertainties
+        greater than floating point epsilon, choosing a tolerance near that
+        uncertainty may be preferable. The tolerance may be absolute if the
+        uncertainties are absolute rather than relative.
+        References
+        ----------
+        .. [1] MATLAB reference documentation, "Rank"
+               http://www.mathworks.com/help/techdoc/ref/rank.html
+        .. [2] W. H. Press, S. A. Teukolsky, W. T. Vetterling and B. P. Flannery,
+               "Numerical Recipes (3rd edition)", Cambridge University Press, 2007,
+               page 795.
+        Examples
+        --------
+        >>> from numpy.linalg import matrix_rank
+        >>> matrix_rank(np.eye(4))  # Full rank matrix
+        4
+        >>> I=np.eye(4); I[-1,-1] = 0.  # rank deficient matrix
+        >>> matrix_rank(I)
+        3
+        >>> matrix_rank(np.ones((4,)))  # 1 dimension - rank 1 unless all 0
+        1
+        >>> matrix_rank(np.zeros((4,)))
+        0
+        """
+        M = np.asarray(M)
+        if M.ndim > 2:
+            raise TypeError('array should have 2 or fewer dimensions')
+        if M.ndim < 2:
+            return int(not np.all(M == 0))  # np.all also accepts 0-d input; builtin all() raised TypeError
+        S = np.linalg.svd(M, compute_uv=False)
+        if tol is None:
+            tol = S.max() * max(M.shape) * np.finfo(S.dtype).eps
+        return np.sum(S > tol)
+
+
+
+class CacheWriteWarning(UserWarning):
+ pass  # marker category only; issued by CachedAttribute.__set__ in this module
+
+class CachedAttribute(object):
+    """Descriptor that computes ``func(obj)`` once and serves it from a per-object cache."""
+    def __init__(self, func, cachename=None, resetlist=None):
+        self.fget = func
+        self.name = func.__name__
+        self.cachename = cachename or '_cache'
+        self.resetlist = resetlist or ()
+
+    def __get__(self, obj, type=None):
+        """Return the cached value for ``obj``, computing and storing it on first access."""
+        if obj is None:
+            return self.fget  # accessed on the class: hand back the wrapped function
+        # Get the cache or set a default one if needed
+        _cachename = self.cachename
+        _cache = getattr(obj, _cachename, None)
+        if _cache is None:
+            # `resettable_cache` was never defined or imported in this module, so this
+            # branch used to raise NameError; a plain dict supports everything below.
+            _cache = {}
+            setattr(obj, _cachename, _cache)
+        # Get the name of the attribute to set and cache
+        name = self.name
+        _cachedval = _cache.get(name, None)
+        if _cachedval is None:
+            # a cached value of None is indistinguishable from "not cached yet"
+            _cachedval = self.fget(obj)
+            try:
+                _cache[name] = _cachedval
+            except KeyError:
+                setattr(_cache, name, _cachedval)
+            # Update the reset list if needed (and possible)
+            resetlist = self.resetlist
+            if resetlist != ():  # compare by value; `is not ()` relied on tuple interning
+                try:
+                    _cache._resetdict[name] = self.resetlist
+                except AttributeError:
+                    pass  # caches without a _resetdict (e.g. plain dicts) skip reset tracking
+        return _cachedval
+
+    def __set__(self, obj, value):
+        """Refuse the assignment: only warn (CacheWriteWarning); nothing is stored."""
+        errmsg = "The attribute '%s' cannot be overwritten" % self.name
+        warnings.warn(errmsg, CacheWriteWarning)
+
+
+class _cache_readonly(object):
+    """Decorator factory: calling an instance on a function wraps it in CachedAttribute."""
+
+    def __init__(self, cachename=None, resetlist=None):
+        # falsy reset lists (None, empty list/tuple) are stored as None
+        self.resetlist = resetlist if resetlist else None
+        self.cachename = cachename
+        self.func = None
+
+    def __call__(self, func):
+        options = {'cachename': self.cachename, 'resetlist': self.resetlist}
+        return CachedAttribute(func, **options)
+
+
+cache_readonly = _cache_readonly()
+
+
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/glm/varfuncs.py b/release/python/0.8.0/crankshaft/crankshaft/regression/glm/varfuncs.py
new file mode 100644
index 0000000..af66d8c
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/glm/varfuncs.py
@@ -0,0 +1,284 @@
+"""
+Variance functions for use with the link functions in statsmodels.family.links
+"""
+
+__docformat__ = 'restructuredtext'
+
+import numpy as np
+FLOAT_EPS = np.finfo(float).eps
+
+class VarianceFunction(object):
+    """
+    Relates the variance of a random variable to its mean. Defaults to 1.
+
+    Methods
+    -------
+    call
+        Returns an array of ones that is the same shape as `mu`
+
+    Notes
+    -----
+    After a variance function is initialized, its call method can be used.
+
+    Alias for VarianceFunction:
+    constant = VarianceFunction()
+
+    See also
+    --------
+    statsmodels.family.family
+    """
+
+    def __call__(self, mu):
+        """
+        Default variance function
+
+        Parameters
+        -----------
+        mu : array-like
+            mean parameters
+
+        Returns
+        -------
+        v : array
+            ones(mu.shape)
+        """
+        mu = np.asarray(mu)
+        return np.ones(mu.shape, np.float64)
+
+
+    def deriv(self, mu):
+        """
+        Derivative of the variance function v'(mu): zero everywhere, since v is constant
+        """
+        # Closed form; the previous complex-step numerical derivative imported
+        # statsmodels, a package this module does not otherwise depend on.
+        return np.zeros(np.shape(mu), np.float64)
+
+
+constant = VarianceFunction()
+constant.__doc__ = """
+The call method of constant returns a constant variance, i.e., a vector of ones.
+
+constant is an alias of VarianceFunction()
+"""
+
+class Power(object):
+    """
+    Power variance function
+
+    Parameters
+    ----------
+    power : float
+        exponent used in power variance function
+
+    Methods
+    -------
+    call
+        Returns the power variance
+
+    Formulas
+    --------
+    V(mu) = numpy.fabs(mu)**power
+
+    Notes
+    -----
+    Aliases for Power:
+    mu = Power()
+    mu_squared = Power(power=2)
+    mu_cubed = Power(power=3)
+    """
+
+    def __init__(self, power=1.):
+        self.power = power
+
+    def __call__(self, mu):
+        """
+        Power variance function
+
+        Parameters
+        ----------
+        mu : array-like
+            mean parameters
+
+        Returns
+        -------
+        variance : array
+            numpy.fabs(mu)**self.power
+        """
+        return np.power(np.fabs(mu), self.power)
+
+
+    def deriv(self, mu):
+        """
+        Derivative of the variance function: power * |mu|**(power - 1), negated for mu < 0
+        """
+        # Closed form instead of the former finite-difference call into statsmodels.
+        mu = np.asarray(mu, dtype=np.float64)
+        slope = self.power * np.power(np.fabs(mu), self.power - 1)
+        return np.where(mu < 0, -slope, slope)  # |mu| falls as mu rises when mu < 0
+
+
+mu = Power()
+mu.__doc__ = """
+Returns np.fabs(mu)
+
+Notes
+-----
+This is an alias of Power()
+"""
+mu_squared = Power(power=2)
+mu_squared.__doc__ = """
+Returns np.fabs(mu)**2
+
+Notes
+-----
+This is an alias of Power(power=2)
+"""
+mu_cubed = Power(power=3)
+mu_cubed.__doc__ = """
+Returns np.fabs(mu)**3
+
+Notes
+-----
+This is an alias of Power(power=3)
+"""
+
+class Binomial(object):
+    """
+    Binomial variance function
+
+    Parameters
+    ----------
+    n : int, optional
+        The number of trials for a binomial variable. The default is 1 for
+        p in (0,1)
+
+    Methods
+    -------
+    call
+        Returns the binomial variance
+
+    Formulas
+    --------
+    V(mu) = p * (1 - p) * n
+
+    where p = mu / n
+
+    Notes
+    -----
+    Alias for Binomial:
+    binary = Binomial()
+
+    A private method _clean trims the data by machine epsilon so that p is
+    in (0,1)
+    """
+
+    def __init__(self, n=1):
+        self.n = n
+
+    def _clean(self, p):
+        return np.clip(p, FLOAT_EPS, 1 - FLOAT_EPS)
+
+    def __call__(self, mu):
+        """
+        Binomial variance function
+
+        Parameters
+        -----------
+        mu : array-like
+            mean parameters
+
+        Returns
+        -------
+        variance : array
+            variance = mu/n * (1 - mu/n) * self.n
+        """
+        p = self._clean(np.asarray(mu, dtype=np.float64) / self.n)  # float first: int/int floors on Python 2
+        return p * (1 - p) * self.n
+
+    def deriv(self, mu):
+        """
+        Derivative of the variance function v'(mu) = 1 - 2 * p, with p = mu / n (clipped)
+        """
+        # Closed form instead of the former numerical derivative, which imported
+        # statsmodels (not otherwise used by this module).
+        p = self._clean(np.asarray(mu, dtype=np.float64) / self.n)
+        return 1 - 2 * p
+
+
+binary = Binomial()
+binary.__doc__ = """
+The binomial variance function for n = 1
+
+Notes
+-----
+This is an alias of Binomial(n=1)
+"""
+
+class NegativeBinomial(object):
+    '''
+    Variance function of the negative binomial family
+
+    Parameters
+    ----------
+    alpha : float
+        Ancillary parameter of the variance function, stored as given and
+        treated as a fixed number. The default is 1.
+
+    Methods
+    -------
+    call
+        Evaluates the variance at the supplied means
+
+    Formulas
+    --------
+    V(mu) = mu + alpha*mu**2
+
+    Notes
+    -----
+    Module-level alias:
+    nbinom = NegativeBinomial()
+
+    The private helper _clean clips its input from below at machine epsilon,
+    so the values used in the formulas lie in (0,inf)
+    '''
+
+    def __init__(self, alpha=1.):
+        self.alpha = alpha
+
+    def _clean(self, p):
+        return np.clip(p, a_min=FLOAT_EPS, a_max=np.inf)
+
+    def __call__(self, mu):
+        """
+        Evaluate the negative binomial variance
+
+        Parameters
+        ----------
+        mu : array-like
+            mean parameters
+
+        Returns
+        -------
+        variance : array
+            mu + alpha*mu**2, evaluated on the clipped means
+        """
+        clipped = self._clean(mu)
+        return clipped + self.alpha*clipped**2
+
+    def deriv(self, mu):
+        """
+        Closed-form derivative v'(mu) = 1 + 2*alpha*mu on the clipped means.
+        """
+
+        clipped = self._clean(mu)
+        return 1 + 2 * self.alpha * clipped
+
+nbinom = NegativeBinomial()
+nbinom.__doc__ = """
+Negative Binomial variance function.
+
+Notes
+-----
+This is an alias of NegativeBinomial(alpha=1.)
+"""
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/__init__.py b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/__init__.py
new file mode 100644
index 0000000..f7a77b2
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/__init__.py
@@ -0,0 +1 @@
+from base import *
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/__init__.py b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/__init__.py
new file mode 100644
index 0000000..eeb63b3
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/__init__.py
@@ -0,0 +1,4 @@
+import gwr
+import sel_bw
+import diagnostics
+import kernels
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/diagnostics.py b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/diagnostics.py
new file mode 100644
index 0000000..7fbcdc4
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/diagnostics.py
@@ -0,0 +1,81 @@
+"""
+Diagnostics for estimated gwr models
+"""
+__author__ = "Taylor Oshan tayoshan@gmail.com"
+
+import numpy as np
+from crankshaft.regression.glm.family import Gaussian, Poisson, Binomial
+
+def get_AICc(gwr):
+    """
+    Get AICc value
+
+    Gaussian: p61, (2.33), Fotheringham, Brunsdon and Charlton (2002)
+    GWGLM: AICc=AIC+2k(k+1)/(n-k-1), Nakaya et al. (2005): p2704, (36)
+    Raises TypeError for any other family (previously an UnboundLocalError).
+    """
+    n = gwr.n
+    k = gwr.tr_S  # plays the role of k in the formulas above
+    if isinstance(gwr.family, Gaussian):
+        return -2.0*gwr.llf + 2.0*n*(k + 1.0)/(n-k-2.0)
+    if isinstance(gwr.family, (Poisson, Binomial)):
+        return get_AIC(gwr) + 2.0 * k * (k+1.0) / (n - k - 1.0)
+    raise TypeError("AICc is only defined for Gaussian, Poisson and "
+                    "Binomial families, got %r" % (gwr.family,))
+
+def get_AIC(gwr):
+    """
+    Get AIC value
+
+    Gaussian: p96, (4.22), Fotheringham, Brunsdon and Charlton (2002)
+
+    GWGLM: AIC(G)=D(G) + 2K(G), where D and K denote the deviance and the effective
+    number of parameters in the model with bandwidth G, respectively.
+    Raises TypeError for any other family (previously an UnboundLocalError).
+    """
+    k = gwr.tr_S
+    if isinstance(gwr.family, Gaussian):
+        return -2.0 * gwr.llf + 2.0 * (k+1)
+    if isinstance(gwr.family, (Poisson, Binomial)):
+        # D(G): sum of the squared values returned by the family's resid_dev
+        deviance = np.sum(gwr.family.resid_dev(gwr.y, gwr.mu)**2)
+        return deviance + 2.0 * k
+    raise TypeError("AIC is only defined for Gaussian, Poisson and "
+                    "Binomial families, got %r" % (gwr.family,))
+
+def get_BIC(gwr):
+    """
+    Get BIC value
+
+    Gaussian: p61 (2.34), Fotheringham, Brunsdon and Charlton (2002)
+    BIC = -2log(L)+klog(n)
+
+    GWGLM: BIC = dev + tr_S * log(n)
+    Raises TypeError for any other family (previously an UnboundLocalError).
+    """
+    n = gwr.n  # (scalar) number of observations
+    k = gwr.tr_S
+    if isinstance(gwr.family, Gaussian):
+        return -2.0 * gwr.llf + (k+1) * np.log(n)
+    if isinstance(gwr.family, (Poisson, Binomial)):
+        deviance = np.sum(gwr.family.resid_dev(gwr.y, gwr.mu)**2)  # "dev" in the formula above
+        return deviance + k * np.log(n)
+    raise TypeError("BIC is only defined for Gaussian, Poisson and "
+                    "Binomial families, got %r" % (gwr.family,))
+
+def get_CV(gwr):
+    """
+    Cross-validation (CV) score; Gaussian only.
+
+    Each response residual is divided by (1 - influ) and the squared
+    results are summed and divided by n.
+
+    Methods: p60, (2.31) or p212 (9.4)
+    Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002).
+    Geographically weighted regression: the analysis of spatially varying relationships.
+    Modification: sum of residual squared is divided by n according to GWR4 results
+    """
+    residuals = gwr.resid_response.reshape((-1, 1))
+    scaled = residuals / (1.0 - gwr.influ)
+    return np.sum(scaled**2) / gwr.n
+
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/gwr.py b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/gwr.py
new file mode 100644
index 0000000..6d5257f
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/gwr.py
@@ -0,0 +1,1086 @@
+#Main GWR classes
+
+#Offset does not yet do anything and needs to be implemented
+
+__author__ = "Taylor Oshan Tayoshan@gmail.com"
+
+import numpy as np
+import numpy.linalg as la
+from scipy.stats import t
+from kernels import *
+from diagnostics import get_AIC, get_AICc, get_BIC
+import pysal.spreg.user_output as USER
+from crankshaft.regression.glm.family import Gaussian, Binomial, Poisson
+from crankshaft.regression.glm.glm import GLM, GLMResults
+from crankshaft.regression.glm.iwls import iwls
+from crankshaft.regression.glm.utils import cache_readonly
+
+# Dispatch tables keyed by kernel name: fk holds the fix_* kernel builders and
+# ak the adapt_* builders. GWR._build_W indexes fk when `fixed` is true and ak
+# otherwise.
+fk = {'gaussian': fix_gauss, 'bisquare': fix_bisquare, 'exponential': fix_exp}
+ak = {'gaussian': adapt_gauss, 'bisquare': adapt_bisquare, 'exponential': adapt_exp}
+
+class GWR(GLM):
+    """
+    Geographically weighted regression. Can currently estimate Gaussian,
+    Poisson, and logistic models (built on a GLM framework). GWR object prepares
+    model input. Fit method performs estimation and returns a GWRResults object.
+
+    Parameters
+    ----------
+    coords : array-like
+        n*2, collection of n sets of (x,y) coordinates of observations;
+        also used as calibration locations until predict() is called with
+        a separate set of points
+
+    y : array
+        n*1, dependent variable
+
+    X : array
+        n*k, independent variable, excluding the constant
+
+    bw : scalar
+        bandwidth value consisting of either a distance or N nearest
+        neighbors; user specified or obtained using Sel_BW
+
+    family : family object
+        underlying probability model; provides distribution-specific
+        calculations
+
+    offset : array
+        n*1, the offset variable at the ith location. For Poisson model
+        this term is often the size of the population at risk or the
+        expected size of the outcome in spatial epidemiology.
+        Default is None, in which case a column of ones is used;
+        only for Poisson models
+
+    sigma2_v1 : boolean
+        selects the sigma squared estimate used as the scale of Gaussian
+        models: True uses GWRResults.sigma2_v1 (denominator n - tr(S));
+        False (default) uses GWRResults.sigma2_v1v2
+        (denominator n - 2tr(S) + tr(S'S))
+
+    kernel : string
+        type of kernel function used to weight observations;
+        available options:
+        'gaussian'
+        'bisquare'
+        'exponential'
+
+    fixed : boolean
+        True for distance based kernel function and False for
+        adaptive (nearest neighbor) kernel function (default)
+
+    constant : boolean
+        True to include intercept (default) in model and False to exclude
+        intercept.
+
+    Attributes
+    ----------
+    coords, bw, family, offset, sigma2_v1, kernel, fixed, constant
+        stored as described under Parameters (offset is stored as a
+        column of ones when None was given, otherwise as offset * 1.0)
+
+    y : array
+        n*1, dependent variable
+
+    X : array
+        n*k, independent variable
+
+    n : integer
+        number of observations
+
+    k : integer
+        number of independent variables
+
+    mean_y : float
+        mean of y
+
+    std_y : float
+        standard deviation of y
+
+    fit_params : dict
+        parameters passed into fit method to define estimation routine
+
+    W : array
+        spatial weights matrix for weighting all observations from each
+        calibration point; n*n after construction, rebuilt against the
+        prediction points by predict()
+
+    points : array-like
+        prediction locations; None until predict() is called
+
+    P : array
+        independent variables at the prediction locations (with a leading
+        column of ones when constant is True); None until predict() is called
+
+    exog_scale : scalar
+        scale taken from the model estimated on coords; None until
+        predict() is called
+
+    exog_resid : array
+        response residuals taken from the model estimated on coords; None
+        until predict() is called
+    """
+    def __init__(self, coords, y, X, bw, family=Gaussian(), offset=None,
+                 sigma2_v1=False, kernel='bisquare', fixed=False, constant=True):
+        """
+        Initialize class
+        """
+        GLM.__init__(self, y, X, family, constant=constant)
+        self.constant = constant
+        self.sigma2_v1 = sigma2_v1
+        self.coords = coords
+        self.bw = bw
+        self.kernel = kernel
+        self.fixed = fixed
+        if offset is None:
+            self.offset = np.ones((self.n, 1))
+        else:
+            self.offset = offset * 1.0
+        self.fit_params = {}
+        self.W = self._build_W(fixed, kernel, coords, bw)
+        # prediction state; populated by predict()
+        self.points = None
+        self.exog_scale = None
+        self.exog_resid = None
+        self.P = None
+
+    def _build_W(self, fixed, kernel, coords, bw, points=None):
+        """
+        Build the spatial weights matrix with the kernel builder registered
+        under `kernel` in fk (fixed=True) or ak (fixed=False).
+
+        Raises
+        ------
+        TypeError
+            if `kernel` is not a registered kernel name. Errors raised while
+            the kernel itself is being computed are no longer converted into
+            this TypeError; they propagate unchanged.
+        """
+        kernel_funcs = fk if fixed else ak
+        if kernel not in kernel_funcs:
+            raise TypeError('Unsupported kernel function ', kernel)
+        return kernel_funcs[kernel](coords, bw, points)
+
+    def fit(self, ini_params=None, tol=1.0e-5, max_iter=20, solve='iwls'):
+        """
+        Method that fits a model with a particular estimation routine.
+
+        Parameters
+        ----------
+        ini_params : array
+            k*1, initial coefficient values, including constant.
+            Default is None, which calculates initial values during
+            estimation
+        tol : float
+            Tolerance for estimation convergence
+        max_iter : integer
+            Maximum number of iterations if convergence not achieved
+        solve : string
+            Technique to solve MLE equations.
+            'iwls' = iteratively (re)weighted least squares (default)
+
+        Returns
+        -------
+        GWRResults built from the local estimates of every row of self.W
+
+        Raises
+        ------
+        ValueError
+            if `solve` is anything other than 'iwls' (case-insensitive)
+        """
+        self.fit_params['ini_params'] = ini_params
+        self.fit_params['tol'] = tol
+        self.fit_params['max_iter'] = max_iter
+        self.fit_params['solve'] = solve
+        if solve.lower() != 'iwls':
+            # previously this fell through to an unbound-name error
+            raise ValueError("Unsupported solve method %r; only 'iwls' is "
+                             "implemented" % (solve,))
+        m = self.W.shape[0]
+        params = np.zeros((m, self.k))
+        predy = np.zeros((m, 1))
+        w = np.zeros((m, 1))
+        z = np.zeros((self.n, self.n))
+        S = np.zeros((self.n, self.n))
+        CCT = np.zeros((m, self.k))
+        for i in range(m):
+            # one local regression per calibration point, weighted by row i of W
+            wi = self.W[i].reshape((-1,1))
+            rslt = iwls(self.y, self.X, self.family, self.offset,
+                        ini_params, tol, max_iter, wi=wi)
+            # NOTE(review): meaning of the rslt entries is inferred from how
+            # they are used here (0: coefficients, 1: predicted values,
+            # 3: final IWLS weights, 4: adjusted response, 5: projection
+            # matrix) -- confirm against iwls()
+            params[i,:] = rslt[0].T
+            predy[i] = rslt[1][i]
+            w[i] = rslt[3][i]
+            z[i] = rslt[4].flatten()
+            ri = np.dot(self.X[i], rslt[5])
+            S[i] = ri*np.reshape(rslt[4].flatten(), (1,-1))
+            #dont need unless f is explicitly passed for
+            #prediction of non-sampled points
+            #cf = rslt[5] - np.dot(rslt[5], f)
+            #CCT[i] = np.diag(np.dot(cf, cf.T/rslt[3]))
+            CCT[i] = np.diag(np.dot(rslt[5], rslt[5].T))
+        S = S * (1.0/z)
+        return GWRResults(self, params, predy, S, CCT, w)
+
+    def predict(self, points, P, exog_scale=None, exog_resid=None, fit_params={}):
+        """
+        Method that predicts values of the dependent variable at un-sampled
+        locations
+
+        Note that this replaces self.W with weights computed against
+        `points`, and sets self.points, self.P, self.exog_scale and
+        self.exog_resid on the model.
+
+        Parameters
+        ----------
+        points : array-like
+            n*2, collection of n sets of (x,y) coordinates used for
+            calibration prediction locations
+        P : array
+            n*k, independent variables used to make prediction;
+            excluding the constant
+        exog_scale : scalar
+            estimated scale using sampled locations; default is None
+            which estimates a model using points from "coords"
+        exog_resid : array-like
+            estimated residuals using sampled locations; default is None
+            which estimates a model using points from "coords"; if
+            given it must be n*1 where n is the length of coords
+        fit_params : dict
+            key-value pairs of parameters that will be passed into fit
+            method to define estimation routine; see fit method for more
+            details. The default dict is only read, never modified.
+
+        Returns
+        -------
+        GWRResults fitted with the weights of the prediction points
+
+        Raises
+        ------
+        ValueError
+            if only one of exog_scale and exog_resid is given
+        """
+        if (exog_scale is None) and (exog_resid is None):
+            train_gwr = self.fit(**fit_params)
+            self.exog_scale = train_gwr.scale
+            self.exog_resid = train_gwr.resid_response
+        elif (exog_scale is not None) and (exog_resid is not None):
+            self.exog_scale = exog_scale
+            self.exog_resid = exog_resid
+        else:
+            # the original raised the undefined name InputError here
+            raise ValueError('exog_scale and exog_resid must both either be '
+                             'None or specified')
+        self.points = points
+        if self.constant:
+            P = np.hstack([np.ones((len(P),1)), P])
+        self.P = P
+        self.W = self._build_W(self.fixed, self.kernel, self.coords, self.bw, points)
+        gwr = self.fit(**fit_params)
+
+        return gwr
+
+    @cache_readonly
+    def df_model(self):
+        raise NotImplementedError('Only computed for fitted model in GWRResults')
+
+    @cache_readonly
+    def df_resid(self):
+        raise NotImplementedError('Only computed for fitted model in GWRResults')
+
+
+class GWRResults(GLMResults):
+    """
+    Basic class including common properties for all GWR regression models
+
+    Parameters
+    ----------
+    model : GWR object
+        pointer to GWR object with estimation parameters
+
+    params : array
+        n*k, estimated coefficients
+
+    predy : array
+        n*1, predicted y values
+
+    S : array
+        n*n, hat matrix
+
+    CCT : array
+        n*k, unscaled variance-covariance values; multiplied by the scale
+        in __init__
+
+    w : array
+        n*1, final weight used for iteratively re-weighted least
+        squares; default is None
+
+    Attributes
+    ----------
+    model : GWR Object
+        points to GWR object for which parameters have been estimated
+
+    params : array
+        n*k, parameter estimates
+
+    predy : array
+        n*1, predicted value of y
+
+    y : array
+        n*1, dependent variable
+
+    X : array
+        n*k, independent variable, including constant
+
+    family : family object
+        underlying probability model; provides distribution-specific
+        calculations
+
+    n : integer
+        number of observations
+
+    k : integer
+        number of independent variables
+
+    df_model : float
+        n - tr_S
+
+    df_resid : float
+        n - 2*tr_S + tr_STS
+
+    offset : array
+        n*1, the offset variable at the ith location. For Poisson model
+        this term is often the size of the population at risk or the
+        expected size of the outcome in spatial epidemiology; copied
+        from the model
+
+    scale : float
+        sigma squared used for subsequent computations; 1.0 for
+        non-Gaussian families
+
+    w : array
+        n*1, final weights from iteratively re-weighted least squares
+        routine
+
+    resid_response : array
+        n*1, residuals of the response
+
+    resid_ss : scalar
+        residual sum of squares
+
+    W : array
+        n*n; spatial weights for each observation from each calibration
+        point
+
+    S : array
+        n*n, hat matrix
+
+    CCT : array
+        n*k, scaled variance-covariance matrix
+
+    tr_S : float
+        trace of S*w
+
+    tr_STS : float
+        trace of (S.T*w)(S*w)
+
+    tr_SWSTW : float
+        trace of weighted STS matrix; weights are those output from
+        iteratively weighted least squares (not spatial weights).
+        NOTE(review): no such attribute is defined in this class --
+        confirm whether it is provided elsewhere
+
+    y_bar : array
+        weighted mean value of y, one row per calibration point
+
+    TSS : array
+        geographically weighted total sum of squares, one row per
+        calibration point
+
+    RSS : array
+        geographically weighted residual sum of squares, one row per
+        calibration point
+
+    localR2 : array
+        n*1, local R square (Gaussian only)
+
+    sigma2_v1 : float
+        sigma squared, use (n-v1) as denominator
+
+    sigma2_v1v2 : float
+        sigma squared, use (n-2v1+v2) as denominator
+
+    sigma2_ML : float
+        sigma squared, estimated using ML
+
+    std_res : array
+        n*1, standardised residuals
+
+    bse : array
+        n*k, standard errors of parameters (betas)
+
+    influ : array
+        n*1, leading diagonal of S matrix
+
+    cooksD : array
+        n*1, Cook's D
+
+    tvalues : array
+        n*k, local t-statistics
+
+    adj_alpha : array
+        3 values, corrected alpha values to account for multiple
+        hypothesis testing for the 90%, 95%, and 99% confidence
+        levels; tvalues with an absolute value larger than the
+        corrected alpha are considered statistically significant.
+
+    deviance : array
+        n*1, local model deviance for each calibration point
+
+    resid_deviance : array
+        n*1, local sum of residual deviance for each calibration point
+
+    llf : scalar
+        log-likelihood of the full model; see the glm family module for
+        family-specific log-likelihoods
+
+    pDev : array
+        local percent of deviation accounted for; analogous to
+        r-squared for GLM's
+
+    mu : array
+        n*, flat one dimensional array of predicted mean response value
+        from estimator
+
+    predictions : array
+        predicted values at the locations passed to GWR.predict
+
+    fit_params : dict
+        parameters passed into fit method to define estimation routine
+    """
+    def __init__(self, model, params, predy, S, CCT, w=None):
+        GLMResults.__init__(self, model, params, predy, w)
+        self.W = model.W
+        self.offset = model.offset
+        if w is not None:
+            self.w = w
+        self.predy = predy
+        self.S = S
+        # scaling may evaluate (and cache) self.scale; the cache is then
+        # reset on the next line, exactly as in the original statement order
+        self.CCT = self.cov_params(CCT, model.exog_scale)
+        self._cache = {}
+
+    @cache_readonly
+    def resid_ss(self):
+        """
+        residual sum of squares
+        """
+        u = self.resid_response.flatten()
+        return np.dot(u, u.T)
+
+    @cache_readonly
+    def scale(self, scale=None):
+        """
+        sigma squared used for subsequent computations: sigma2_v1 or
+        sigma2_v1v2 for Gaussian models (chosen by model.sigma2_v1),
+        1.0 for every other family
+        """
+        if isinstance(self.family, Gaussian):
+            if self.model.sigma2_v1:
+                scale = self.sigma2_v1
+            else:
+                scale = self.sigma2_v1v2
+        else:
+            scale = 1.0
+        return scale
+
+    def cov_params(self, cov, exog_scale=None):
+        """
+        Returns scaled covariance parameters
+
+        Parameters
+        ----------
+        cov : array
+            estimated covariance parameters
+        exog_scale : scalar
+            scale to multiply by; default is None, which uses self.scale
+
+        Returns
+        -------
+        Scaled covariance parameters
+
+        """
+        if exog_scale is not None:
+            return cov*exog_scale
+        else:
+            return cov*self.scale
+
+    @cache_readonly
+    def tr_S(self):
+        """
+        trace of S (hat) matrix
+        """
+        return np.trace(self.S*self.w)
+
+    @cache_readonly
+    def tr_STS(self):
+        """
+        trace of STS matrix
+        """
+        return np.trace(np.dot(self.S.T*self.w,self.S*self.w))
+
+    @cache_readonly
+    def y_bar(self):
+        """
+        weighted mean of y
+
+        One row per calibration point: len(model.points) rows when
+        prediction points are set, otherwise n rows.
+        """
+        if self.model.points is not None:
+            n = len(self.model.points)
+        else:
+            n = self.n
+        off = self.offset.reshape((-1,1))
+        # sized by the number of calibration points, like TSS and RSS; the
+        # original allocated self.n rows and overflowed for len(points) > n
+        arr_ybar = np.zeros(shape=(n,1))
+        for i in range(n):
+            w_i = np.reshape(np.array(self.W[i]), (-1, 1))
+            sum_yw = np.sum(self.y.reshape((-1,1)) * w_i)
+            arr_ybar[i] = 1.0 * sum_yw / np.sum(w_i*off)
+        return arr_ybar
+
+    @cache_readonly
+    def TSS(self):
+        """
+        geographically weighted total sum of squares
+
+        Methods: p215, (9.9)
+        Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002).
+        Geographically weighted regression: the analysis of spatially varying
+        relationships.
+
+        """
+        if self.model.points is not None:
+            n = len(self.model.points)
+        else:
+            n = self.n
+        TSS = np.zeros(shape=(n,1))
+        for i in range(n):
+            TSS[i] = np.sum(np.reshape(np.array(self.W[i]), (-1,1)) *
+                            (self.y.reshape((-1,1)) - self.y_bar[i])**2)
+        return TSS
+
+    @cache_readonly
+    def RSS(self):
+        """
+        geographically weighted residual sum of squares
+
+        Methods: p215, (9.10)
+        Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002).
+        Geographically weighted regression: the analysis of spatially varying
+        relationships.
+        """
+        if self.model.points is not None:
+            # prediction mode: use the residuals of the model fitted on coords
+            n = len(self.model.points)
+            resid = self.model.exog_resid.reshape((-1,1))
+        else:
+            n = self.n
+            resid = self.resid_response.reshape((-1,1))
+        RSS = np.zeros(shape=(n,1))
+        for i in range(n):
+            RSS[i] = np.sum(np.reshape(np.array(self.W[i]), (-1,1))
+                            * resid**2)
+        return RSS
+
+    @cache_readonly
+    def localR2(self):
+        """
+        local R square
+
+        Methods: p215, (9.8)
+        Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002).
+        Geographically weighted regression: the analysis of spatially varying
+        relationships.
+        """
+        if isinstance(self.family, Gaussian):
+            return (self.TSS - self.RSS)/self.TSS
+        else:
+            raise NotImplementedError('Only applicable to Gaussian')
+
+    @cache_readonly
+    def sigma2_v1(self):
+        """
+        residual variance
+
+        Methods: p214, (9.6),
+        Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002).
+        Geographically weighted regression: the analysis of spatially varying
+        relationships.
+
+        only use v1
+        """
+        return (self.resid_ss/(self.n-self.tr_S))
+
+    @cache_readonly
+    def sigma2_v1v2(self):
+        """
+        residual variance
+
+        Methods: p55 (2.16)-(2.18)
+        Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002).
+        Geographically weighted regression: the analysis of spatially varying
+        relationships.
+
+        use v1 and v2 #used in GWR4
+        """
+        # The original branched on the family but both branches computed this
+        # same expression.
+        # could be changed to SWSTW - nothing to test against
+        return self.resid_ss/(self.n - 2.0*self.tr_S + self.tr_STS)
+
+    @cache_readonly
+    def sigma2_ML(self):
+        """
+        residual variance
+
+        Methods: maximum likelihood
+        """
+        return self.resid_ss/self.n
+
+    @cache_readonly
+    def std_res(self):
+        """
+        standardized residuals
+
+        Methods: p215, (9.7)
+        Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002).
+        Geographically weighted regression: the analysis of spatially varying
+        relationships.
+        """
+        return self.resid_response.reshape((-1,1))/(np.sqrt(self.scale * (1.0 - self.influ)))
+
+    @cache_readonly
+    def bse(self):
+        """
+        standard errors of Betas
+
+        Methods: p215, (2.15) and (2.21)
+        Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002).
+        Geographically weighted regression: the analysis of spatially varying
+        relationships.
+        """
+        return np.sqrt(self.CCT)
+
+    @cache_readonly
+    def influ(self):
+        """
+        Influence: leading diagonal of S Matrix
+        """
+        return np.reshape(np.diag(self.S),(-1,1))
+
+    @cache_readonly
+    def cooksD(self):
+        """
+        Cook's D
+
+        Methods: p216, (9.11),
+        Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002).
+        Geographically weighted regression: the analysis of spatially varying
+        relationships.
+        Note: in (9.11), p should be tr(S), that is, the effective number of parameters
+        """
+        return self.std_res**2 * self.influ / (self.tr_S * (1.0-self.influ))
+
+    @cache_readonly
+    def deviance(self):
+        """
+        local model deviance for each calibration point, computed against
+        the weighted mean y_bar (Poisson and Binomial only)
+        """
+        off = self.offset.reshape((-1,1)).T
+        y = self.y
+        ybar = self.y_bar
+        if isinstance(self.family, Gaussian):
+            raise NotImplementedError('deviance not currently used for Gaussian')
+        elif isinstance(self.family, Poisson):
+            dev = np.sum(2.0*self.W*(y*np.log(y/(ybar*off))-(y-ybar*off)),axis=1)
+        elif isinstance(self.family, Binomial):
+            dev = self.family.deviance(self.y, self.y_bar, self.W, axis=1)
+        return dev.reshape((-1,1))
+
+    @cache_readonly
+    def resid_deviance(self):
+        """
+        local sum of squared deviance residuals for each calibration point,
+        weighted by W (Poisson and Binomial only)
+        """
+        if isinstance(self.family, Gaussian):
+            raise NotImplementedError('deviance not currently used for Gaussian')
+        else:
+            global_dev_res = ((self.family.resid_dev(self.y, self.mu))**2)
+            dev_res = np.repeat(global_dev_res.flatten(),self.n)
+            dev_res = dev_res.reshape((self.n, self.n))
+            dev_res = np.sum(dev_res * self.W.T, axis=0)
+            return dev_res.reshape((-1,1))
+
+    @cache_readonly
+    def pDev(self):
+        """
+        Local percentage of deviance accounted for. Described in the GWR4
+        manual. Equivalent to 1 - (deviance/null deviance)
+        """
+        if isinstance(self.family, Gaussian):
+            raise NotImplementedError('Not implemented for Gaussian')
+        else:
+            return 1.0 - (self.resid_deviance/self.deviance)
+
+    @cache_readonly
+    def adj_alpha(self):
+        """
+        Corrected alpha (critical) values to account for multiple testing during hypothesis
+        testing. Includes corrected value for 90% (.1), 95% (.05), and 99%
+        (.01) confidence levels. Correction comes from:
+
+        da Silva, A. R., & Fotheringham, A. S. (2015). The Multiple Testing Issue in
+        Geographically Weighted Regression. Geographical Analysis.
+
+        """
+        # third level is .01 (99%) as documented; the original literal was
+        # .001, which corresponds to 99.9%
+        alpha = np.array([.1, .05, .01])
+        pe = (2.0 * self.tr_S) - self.tr_STS
+        p = self.k
+        return (alpha*p)/pe
+
+    def filter_tvals(self, alpha):
+        """
+        Utility function to set tvalues with an absolute value smaller than the
+        absolute value of the alpha (critical) value to 0
+
+        Parameters
+        ----------
+        alpha : scalar
+            critical value to determine which tvalues are
+            associated with statistically significant parameter
+            estimates
+
+        Returns
+        -------
+        filtered : array
+            n*k; new set of n tvalues for each of k variables
+            where absolute tvalues less than the absolute value of
+            alpha have been set to 0.
+        """
+        # two-sided test: half of alpha in each tail, n-1 degrees of freedom
+        alpha = np.abs(alpha)/2.0
+        n = self.n
+        critical = t.ppf(1-alpha, n-1)
+        subset = (self.tvalues < critical) & (self.tvalues > -1.0*critical)
+        tvalues = self.tvalues.copy()
+        tvalues[subset] = 0
+        return tvalues
+
+    @cache_readonly
+    def df_model(self):
+        return self.n - self.tr_S
+
+    @cache_readonly
+    def df_resid(self):
+        return self.n - 2.0*self.tr_S + self.tr_STS
+
+    # The properties below are explicitly disabled for GWR: each one raises
+    # NotImplementedError when accessed.
+    @cache_readonly
+    def normalized_cov_params(self):
+        raise NotImplementedError('Not implemented for GWR')
+
+    @cache_readonly
+    def resid_pearson(self):
+        raise NotImplementedError('Not implemented for GWR')
+
+    @cache_readonly
+    def resid_working(self):
+        raise NotImplementedError('Not implemented for GWR')
+
+    @cache_readonly
+    def resid_anscombe(self):
+        raise NotImplementedError('Not implemented for GWR')
+
+    @cache_readonly
+    def pearson_chi2(self):
+        raise NotImplementedError('Not implemented for GWR')
+
+    @cache_readonly
+    def null(self):
+        raise NotImplementedError('Not implemented for GWR')
+
+    @cache_readonly
+    def llnull(self):
+        raise NotImplementedError('Not implemented for GWR')
+
+    @cache_readonly
+    def null_deviance(self):
+        raise NotImplementedError('Not implemented for GWR')
+
+    @cache_readonly
+    def aic(self):
+        return get_AIC(self)
+
+    @cache_readonly
+    def aicc(self):
+        return get_AICc(self)
+
+    @cache_readonly
+    def bic(self):
+        return get_BIC(self)
+
+    @cache_readonly
+    def D2(self):
+        raise NotImplementedError('Not implemented for GWR')
+
+    @cache_readonly
+    def adj_D2(self):
+        raise NotImplementedError('Not implemented for GWR')
+
+    @cache_readonly
+    def pseudoR2(self):
+        raise NotImplementedError('Not implemented for GWR')
+
+    @cache_readonly
+    def adj_pseudoR2(self):
+        raise NotImplementedError('Not implemented for GWR')
+
+    @cache_readonly
+    def pvalues(self):
+        raise NotImplementedError('Not implemented for GWR')
+
+    @cache_readonly
+    def predictions(self):
+        """
+        predicted values at the prediction locations: row-wise sum of
+        model.P * params; only available after GWR.predict has set model.P
+        """
+        P = self.model.P
+        if P is None:
+            # message fixed: the two literals used to join as "predictmethod"
+            raise NotImplementedError('predictions only avaialble if predict '
+                                      'method called on GWR model')
+        else:
+            predictions = np.sum(P*self.params, axis=1).reshape((-1,1))
+        return predictions
+
+
+class FBGWR(GWR):
+    """
+    GWR with one bandwidth per column of X ("flexible bandwidth"); fit()
+    estimates each column with its own single-variable GWR model.
+
+    Parameters
+    ----------
+    coords : array-like
+        n*2, collection of n sets of (x,y) coordinates of observations;
+        also used as calibration locations
+
+    y : array
+        n*1, dependent variable
+
+    X : array
+        n*k, independent variable, excluding the constant
+
+    bws : array-like
+        collection of bandwidth values consisting of either a distance
+        or N nearest neighbors; user specified or obtained using
+        Sel_BW with fb=True. Order of values should be the same as
+        the order of columns associated with X
+
+    XB : array
+        n*k, product of temporary X and params obtained as through-put
+        from the backfitting algorithm used to select flexible
+        bandwidths; product of the Sel_BW class
+
+    err : array
+        n*1, temporary residuals associated with the predicted values
+        from the backfitting algorithm used to select flexible
+        bandwidths; product of the Sel_BW class
+
+    family : family object
+        underlying probability model; provides distribution-specific
+        calculations
+
+    offset : array
+        n*1, the offset variable at the ith location. For Poisson model
+        this term is often the size of the population at risk or the
+        expected size of the outcome in spatial epidemiology.
+        Default is None; it is handed unchanged to each per-column GWR
+        model
+
+    sigma2_v1 : boolean
+        handed unchanged to each per-column GWR model, where it selects
+        the sigma squared estimate; default is False
+
+    kernel : string
+        type of kernel function used to weight observations;
+        available options:
+        'gaussian'
+        'bisquare'
+        'exponential'
+
+    fixed : boolean
+        True for distance based kernel function and False for
+        adaptive (nearest neighbor) kernel function (default)
+
+    constant : boolean
+        True to include intercept (default) in model and False to exclude
+        intercept.
+
+    Attributes
+    ----------
+    coords, y, XB, err, bws, family, offset, sigma2_v1, kernel, fixed, constant
+        the constructor arguments, stored unchanged under the same names
+
+    X : array
+        the X argument; passed through USER.check_constant when constant
+        is True (presumably this adds the intercept column -- confirm
+        against pysal.spreg.user_output)
+
+    Examples
+    -------
+    TODO
+
+    """
+    def __init__(self, coords, y, X, bws, XB, err, family=Gaussian(), offset=None,
+                 sigma2_v1=False, kernel='bisquare', fixed=False, constant=True):
+        """
+        Initialize class
+        """
+        # GWR.__init__ is deliberately not called here (as in the original):
+        # no weights matrix is built at construction time.
+        self.coords = coords
+        self.y = y
+        self.X = X
+        self.XB = XB
+        self.err = err
+        self.bws = bws
+        self.family = family
+        self.offset = offset
+        self.sigma2_v1 = sigma2_v1
+        self.kernel = kernel
+        self.fixed = fixed
+        self.constant = constant
+        if constant:
+            self.X = USER.check_constant(self.X)
+
+    def fit(self, ini_params=None, tol=1.0e-5, max_iter=20, solve='iwls'):
+        """
+        Method that fits a model with a particular estimation routine.
+
+        For every bandwidth, column i of X is regressed (without a constant)
+        on column i of XB plus the current residuals; the residuals of that
+        fit are carried over to the next column.
+
+        Parameters
+        ----------
+        ini_params : array
+            k*1, initial coefficient values, including constant.
+            Default is None, which calculates initial values during
+            estimation
+        tol : float
+            Tolerance for estimation convergence
+        max_iter : integer
+            Maximum number of iterations if convergence not achieved
+        solve : string
+            Technique to solve MLE equations.
+            'iwls' = iteratively (re)weighted least squares (default)
+
+        Returns
+        -------
+        FBGWRResults holding the n*k matrix of local coefficients
+        """
+        # Explicit float storage: np.zeros_like(self.X) would inherit an
+        # integer dtype from X and truncate the estimated coefficients.
+        params = np.zeros(np.shape(self.X), dtype=float)
+        err = self.err
+        for i, bw in enumerate(self.bws):
+            # GWR.__init__ builds the weights for this bandwidth itself, so no
+            # separate (previously discarded) weights matrix is computed here.
+            X = self.X[:,i].reshape((-1,1))
+            y = self.XB[:,i].reshape((-1,1)) + err
+            model = GWR(self.coords, y, X, bw, self.family, self.offset,
+                        self.sigma2_v1, self.kernel, self.fixed, constant=False)
+            results = model.fit(ini_params, tol, max_iter, solve)
+            params[:,i] = results.params.flatten()
+            err = results.resid_response.reshape((-1,1))
+        return FBGWRResults(self, params)
+
+
+class FBGWRResults(object):
+    """
+    Results container for a fitted FBGWR model.
+
+    Parameters
+    ----------
+    model : GWR object
+        pointer to FBGWR object with estimation parameters
+
+    params : array
+        n*k, estimated coefficients
+
+    Attributes
+    ----------
+    model : GWR Object
+        points to FBGWR object for which parameters have been estimated
+
+    params : array
+        n*k, parameter estimates
+
+    predy : array
+        n*1, predicted value of y
+
+    y : array
+        n*1, dependent variable
+
+    X : array
+        n*k, independent variable, including constant
+
+    resid_response : array
+        n*1, residuals of response
+
+    resid_ss : scalar
+        residual sum of squares
+
+    Examples
+    -------
+    TODO
+
+    """
+    def __init__(self, model, params):
+        """
+        Initialize class
+        """
+        self._cache = {}
+        self.model = model
+        self.params = params
+        self.y = model.y
+        self.X = model.X
+
+    @cache_readonly
+    def predy(self):
+        # row-wise sum of coefficient * regressor, returned as a column
+        weighted_terms = np.multiply(self.params, self.X)
+        row_totals = np.sum(weighted_terms, axis=1)
+        return row_totals.reshape((-1,1))
+
+    @cache_readonly
+    def resid_response(self):
+        difference = self.y - self.predy
+        return difference.reshape((-1,1))
+
+    @cache_readonly
+    def resid_ss(self):
+        flat_resid = self.resid_response.flatten()
+        return np.dot(flat_resid, flat_resid.T)
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/kernels.py b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/kernels.py
new file mode 100644
index 0000000..bdf246d
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/kernels.py
@@ -0,0 +1,120 @@
+# GWR kernel function specifications
+
+__author__ = "Taylor Oshan tayoshan@gmail.com"
+
+#from pysal.weights.Distance import Kernel
+import scipy
+from scipy.spatial.kdtree import KDTree
+import numpy as np
+
+#adaptive specifications should be parameterized with nn-1 to match original gwr
+#implementation. That is, pysal counts self neighbors with knn automatically.
+
+def fix_gauss(coords, bw, points=None):
+    # 'gwr_gaussian' kernel weights for the given bandwidth, without truncation
+    kern = _Kernel(coords, bandwidth=bw, function='gwr_gaussian',
+                   points=points, truncate=False)
+    return kern.kernel
+
+def adapt_gauss(coords, nn, points=None):
+    # 'gwr_gaussian' kernel weights with a nearest-neighbour bandwidth
+    # (k = nn-1, see the note on self neighbors above), without truncation
+    kern = _Kernel(coords, k=nn-1, fixed=False, function='gwr_gaussian',
+                   points=points, truncate=False)
+    return kern.kernel
+
+def fix_bisquare(coords, bw, points=None):
+    # 'bisquare' kernel weights for the given bandwidth (default truncation)
+    kern = _Kernel(coords, bandwidth=bw, function='bisquare', points=points)
+    return kern.kernel
+
+def adapt_bisquare(coords, nn, points=None):
+    # 'bisquare' kernel weights with a nearest-neighbour bandwidth (k = nn-1)
+    kern = _Kernel(coords, k=nn-1, fixed=False, function='bisquare',
+                   points=points)
+    return kern.kernel
+
+def fix_exp(coords, bw, points=None):
+    # 'exponential' kernel weights for the given bandwidth, without truncation
+    kern = _Kernel(coords, bandwidth=bw, function='exponential',
+                   points=points, truncate=False)
+    return kern.kernel
+
+def adapt_exp(coords, nn, points=None):
+    # 'exponential' kernel weights with a nearest-neighbour bandwidth
+    # (k = nn-1), without truncation
+    kern = _Kernel(coords, k=nn-1, fixed=False, function='exponential',
+                   points=points, truncate=False)
+    return kern.kernel
+
+from scipy.spatial.distance import cdist
+
+class _Kernel(object):
+    """
+    Kernel weights between calibration locations and observations.
+
+    Computes the distance matrix `dmat` (rows: `points`, or `data` itself when
+    `points` is None; columns: `data`), divides it by a per-row bandwidth and
+    applies the kernel named by `function`. The result is stored in `kernel`.
+
+    Parameters
+    ----------
+    data : array-like or scipy.spatial.KDTree
+        observation coordinates (the tree's `.data` is used for a KDTree)
+    bandwidth : scalar or array-like
+        bandwidth(s); a scalar is repeated for every row of `dmat`.
+        When falsy (default None) the bandwidth is derived by _set_bw
+    fixed : boolean
+        only used when the bandwidth is derived: True uses one global
+        bandwidth, False one bandwidth per row
+    k : integer
+        number of nearest neighbours used to derive the bandwidth;
+        stored as k + 1
+    function : string
+        kernel name: 'triangular', 'uniform', 'quadratic', 'quartic',
+        'gaussian', 'gwr_gaussian', 'bisquare' or 'exponential'
+    eps : float
+        multiplier applied to a derived bandwidth
+    ids : unused
+    truncate : boolean
+        True (default) sets weights to 0 where the distance is greater
+        than or equal to the bandwidth
+    points : array-like
+        optional calibration locations; default None uses `data`
+
+    Raises
+    ------
+    ValueError
+        if `function` is not one of the supported kernel names
+    """
+    def __init__(self, data, bandwidth=None, fixed=True, k=None,
+                 function='triangular', eps=1.0000001, ids=None, truncate=True,
+                 points=None): #Added truncate flag
+        if issubclass(type(data), scipy.spatial.KDTree):
+            self.data = data.data
+            data = self.data
+        else:
+            self.data = data
+        if k is not None:
+            self.k = int(k) + 1
+        else:
+            self.k = k
+        if points is None:
+            self.dmat = cdist(self.data, self.data)
+        else:
+            self.points = points
+            self.dmat = cdist(self.points, self.data)
+        self.function = function.lower()
+        self.fixed = fixed
+        self.eps = eps
+        self.trunc = truncate
+        if bandwidth:
+            try:
+                bandwidth = np.array(bandwidth)
+                bandwidth.shape = (len(bandwidth), 1)
+            except Exception:
+                # scalar bandwidth: one copy per row of dmat (equal to
+                # len(data) unless separate points were given), so that
+                # dmat/bandwidth broadcasts row-wise
+                bandwidth = np.ones((self.dmat.shape[0], 1), 'float') * bandwidth
+            self.bandwidth = bandwidth
+        else:
+            self._set_bw()
+        self.kernel = self._kernel_funcs(self.dmat/self.bandwidth)
+
+        if self.trunc:
+            # repeat each row's bandwidth across the columns of dmat
+            mask = np.repeat(self.bandwidth, len(self.data), axis=1)
+            self.kernel[(self.dmat >= mask)] = 0
+
+    def _set_bw(self):
+        """
+        Derive self.bandwidth from the distance matrix: from the k smallest
+        distances of each row when k is set, otherwise from all distances.
+        """
+        if self.k is not None:
+            dmat = np.sort(self.dmat)[:,:self.k]
+        else:
+            dmat = self.dmat
+        if self.fixed:
+            # use max knn distance as bandwidth
+            bandwidth = dmat.max() * self.eps
+            # one entry per row of dmat so the division in __init__ broadcasts
+            n = self.dmat.shape[0]
+            self.bandwidth = np.ones((n, 1), 'float') * bandwidth
+        else:
+            # use local max knn distance
+            self.bandwidth = dmat.max(axis=1) * self.eps
+            self.bandwidth.shape = (self.bandwidth.size, 1)
+
+
+    def _kernel_funcs(self, zs):
+        """
+        Apply the kernel named by self.function to the scaled distances zs.
+
+        Raises ValueError for an unknown kernel name (the original printed a
+        message and returned None, leaving self.kernel unusable).
+        """
+        # functions follow Anselin and Rey (2010) table 5.4
+        if self.function == 'triangular':
+            return 1 - zs
+        elif self.function == 'uniform':
+            # was np.ones(zi.shape): `zi` is undefined, the argument is `zs`
+            return np.ones(zs.shape) * 0.5
+        elif self.function == 'quadratic':
+            return (3. / 4) * (1 - zs ** 2)
+        elif self.function == 'quartic':
+            return (15. / 16) * (1 - zs ** 2) ** 2
+        elif self.function == 'gaussian':
+            c = np.pi * 2
+            c = c ** (-0.5)
+            return c * np.exp(-(zs ** 2) / 2.)
+        elif self.function == 'gwr_gaussian':
+            return np.exp(-0.5*(zs)**2)
+        elif self.function == 'bisquare':
+            return (1-(zs)**2)**2
+        elif self.function =='exponential':
+            return np.exp(-zs)
+        else:
+            raise ValueError('Unsupported kernel function %s' % (self.function,))
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/search.py b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/search.py
new file mode 100644
index 0000000..97de4be
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/search.py
@@ -0,0 +1,208 @@
+#Bandwidth optimization methods
+
+__author__ = "Taylor Oshan"
+
+import numpy as np
+
+def golden_section(a, c, delta, function, tol, max_iter, int_score=False):
+    """
+    Golden section search routine
+    Method: p212, 9.6.4
+    Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002).
+    Geographically weighted regression: the analysis of spatially varying relationships.
+
+    Parameters
+    ----------
+    a               : float
+                      initial min search section value
+    c               : float
+                      initial max search section value
+    delta           : float
+                      constant used to determine width of search sections
+    function        : function
+                      objective function to be evaluated at different section
+                      values; lower scores are treated as better
+    tol             : float
+                      tolerance used to determine convergence; the search
+                      stops once the scores of the two interior points differ
+                      by no more than this
+    max_iter        : integer
+                      maximum iterations if no convergence to tolerance
+    int_score       : boolean
+                      False to search float values, True to round the
+                      interior points to whole numbers before scoring
+
+    Returns
+    -------
+    opt_val         : float
+                      optimal value, rounded to 2 decimals
+    opt_score       : float
+                      score of the objective function at the optimal value
+    output          : list of tuples
+                      searching history, one (value, score) pair per iteration
+
+    Notes
+    -----
+    If no iteration runs (for example ``max_iter`` < 1) ``opt_val`` is never
+    assigned and the final ``return`` fails.
+    """
+    # The objective is typically an expensive model fit, and one interior
+    # point is carried over between iterations, so every value is scored once.
+    scored = {}
+
+    def evaluate(value):
+        if value not in scored:
+            scored[value] = function(value)
+        return scored[value]
+
+    b = a + delta * np.abs(c-a)
+    d = c - delta * np.abs(c-a)
+    diff = 1.0e9
+    iters = 0
+    output = []
+    while np.abs(diff) > tol and iters < max_iter:
+        iters += 1
+        if int_score:
+            b = np.round(b)
+            d = np.round(d)
+
+        score_b = evaluate(b)
+        score_d = evaluate(d)
+
+        if score_b <= score_d:
+            # minimum lies in [a, d]: drop the upper section
+            opt_val = b
+            opt_score = score_b
+            c = d
+            d = b
+            b = a + delta * np.abs(c-a)
+        else:
+            # minimum lies in [b, c]: drop the lower section
+            opt_val = d
+            opt_score = score_d
+            a = b
+            b = d
+            d = c - delta * np.abs(c-a)
+
+        output.append((opt_val, opt_score))
+        diff = score_b - score_d
+    return np.round(opt_val, 2), opt_score, output
+
+def equal_interval(l_bound, u_bound, interval, function, int_score=False):
+    """
+    Interval search, using interval as stepsize
+
+    Both bounds are scored first, then every value reached from
+    ``l_bound + interval`` in steps of ``interval`` that stays strictly below
+    ``u_bound``. The value with the lowest score wins.
+
+    Parameters
+    ----------
+    l_bound         : float
+                      initial min search section value
+    u_bound         : float
+                      initial max search section value
+    interval        : float
+                      constant used to determine width of search sections;
+                      must be positive whenever there is a range to step
+                      through
+    function        : function
+                      objective function to be evaluated at different section
+                      values
+    int_score       : boolean
+                      False to use the bounds as given, True to round the
+                      bounds and the first step to whole numbers
+
+    Returns
+    -------
+    opt_val         : float
+                      optimal value
+    opt_score       : float
+                      score of the objective function at the optimal value
+    output          : list of tuples
+                      searching history as (value, score) pairs
+
+    Raises
+    ------
+    ValueError      : if ``interval`` is not positive although the first
+                      step lies below the upper bound (the search would
+                      never terminate)
+    """
+    a = l_bound
+    c = u_bound
+    b = a + interval
+    if int_score:
+        a = np.round(a,0)
+        c = np.round(c,0)
+        b = np.round(b,0)
+
+    # Checked before any (possibly expensive) scoring takes place.
+    if interval <= 0 and b < c:
+        raise ValueError('interval must be positive for an interval search')
+
+    output = []
+
+    score_a = function(a)
+    score_c = function(c)
+
+    output.append((a,score_a))
+    output.append((c,score_c))
+
+    if score_a < score_c:
+        opt_val = a
+        opt_score = score_a
+    else:
+        opt_val = c
+        opt_score = score_c
+
+    while b < c:
+        score_b = function(b)
+
+        output.append((b,score_b))
+
+        if score_b < opt_score:
+            opt_val = b
+            opt_score = score_b
+        b = b + interval
+
+    return opt_val, opt_score, output
+
+
+def flexible_bw(init, y, X, n, k, family, tol, max_iter, rss_score,
+        gwr_func, bw_func, sel_func):
+    """
+    Backfitting search for one bandwidth per covariate.
+
+    Each pass refits every column of ``X`` on its own against the partial
+    residual (that column's current contribution plus the current error),
+    selecting a fresh bandwidth for it. Passes repeat until the change
+    score drops to ``tol`` or ``max_iter`` passes have run.
+
+    Parameters
+    ----------
+    init            : boolean
+                      True to start from a model fitted with a single
+                      selected bandwidth; False to start from a global GLM
+    y               : array
+                      n*1, dependent variable
+    X               : array
+                      n*k, design matrix
+    n               : integer
+                      number of observations
+    k               : integer
+                      number of columns of X
+    family          : family object
+                      passed to the global GLM when init is False
+    tol             : float
+                      convergence tolerance on the change score
+    max_iter        : integer
+                      maximum number of backfitting passes
+    rss_score       : boolean
+                      True to score a pass by the relative change in the
+                      residual sum of squares, False to use the change in
+                      the fitted column contributions
+    gwr_func        : function
+                      (y, X, bw) -> fitted model exposing ``params`` and
+                      ``resid_response``
+    bw_func         : function
+                      (y, X) -> bandwidth selector object
+    sel_func        : function
+                      selector object -> selected bandwidth; after the call
+                      the selector's ``bw[1]`` is recorded as its score
+
+    Returns
+    -------
+    opt_bws         : list
+                      bandwidths of the last pass, one per column of X
+    BWs             : array
+                      bandwidths of every pass
+    VALs            : array
+                      ``bw[1]`` of every selector, per pass
+    scores          : array
+                      change score of every pass
+    f_XB            : array
+                      column contributions as they were at the start of the
+                      last pass
+    f_err           : array
+                      residuals as they were at the start of the last pass
+    """
+    if init:
+        bw = sel_func(bw_func(y, X))
+        optim_model = gwr_func(y, X, bw)
+        err = optim_model.resid_response.reshape((-1,1))
+        est = optim_model.params
+    else:
+        # NOTE(review): GLM was never imported in this module and the
+        # original passed `self.family` although there is no `self` here.
+        # The module path below is assumed from the sibling
+        # crankshaft.regression.glm.family import - confirm.
+        from crankshaft.regression.glm.glm import GLM
+        model = GLM(y, X, family=family, constant=False).fit()
+        err = model.resid_response.reshape((-1,1))
+        est = np.repeat(model.params.T, n, axis=0)
+
+    XB = np.multiply(est, X)
+    if rss_score:
+        rss = np.sum((err)**2)
+    iters = 0
+    scores = []
+    delta = 1e6
+    BWs = []
+    VALs = []
+
+    while delta > tol and iters < max_iter:
+        iters += 1
+        new_XB = np.zeros_like(X)
+        bws = []
+        vals = []
+        ests = np.zeros_like(X)
+        # snapshot of the state this pass starts from; returned to the caller
+        f_XB = XB.copy()
+        f_err = err.copy()
+        for i in range(k):
+            # partial residual for covariate i
+            temp_y = XB[:,i].reshape((-1,1))
+            temp_y = temp_y + err
+            temp_X = X[:,i].reshape((-1,1))
+            bw_class = bw_func(temp_y, temp_X)
+            bw = sel_func(bw_class)
+            optim_model = gwr_func(temp_y, temp_X, bw)
+            err = optim_model.resid_response.reshape((-1,1))
+            est = optim_model.params.reshape((-1,))
+
+            new_XB[:,i] = np.multiply(est, temp_X.reshape((-1,)))
+            bws.append(bw)
+            ests[:,i] = est
+            vals.append(bw_class.bw[1])
+
+        predy = np.sum(np.multiply(ests, X), axis=1).reshape((-1,1))
+        num = np.sum((new_XB - XB)**2)/n
+        den = np.sum(np.sum(new_XB, axis=1)**2)
+        score = (num/den)**0.5
+        XB = new_XB
+
+        if rss_score:
+            new_rss = np.sum((y - predy)**2)
+            score = np.abs((new_rss - rss)/new_rss)
+            rss = new_rss
+        scores.append(score)
+        delta = score
+        BWs.append(bws)
+        VALs.append(vals)
+
+    opt_bws = BWs[-1]
+    return opt_bws, np.array(BWs), np.array(VALs), np.array(scores), f_XB, f_err
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/sel_bw.py b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/sel_bw.py
new file mode 100644
index 0000000..9ab1263
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/sel_bw.py
@@ -0,0 +1,286 @@
+# GWR Bandwidth selection class
+
+#Thinking about removing the search method and just having optimization begin in
+#class __init__
+
+#x_glob and offset parameters don't yet do anything; the former is for
+#semiparametric GWR and the latter is the offset variable for the Poisson model
+
+__author__ = "Taylor Oshan Tayoshan@gmail.com"
+
+from kernels import *
+from search import golden_section, equal_interval, flexible_bw
+from gwr import GWR
+from crankshaft.regression.glm.family import Gaussian, Poisson, Binomial
+import pysal.spreg.user_output as USER
+from diagnostics import get_AICc, get_AIC, get_BIC, get_CV
+from scipy.spatial.distance import pdist, squareform
+from pysal.common import KDTree
+import numpy as np
+
+# Kernel helpers keyed by the integer codes that Sel_BW.search assigns
+# (odd codes for fixed bandwidths, even codes for adaptive ones).
+kernels = {
+    1: fix_gauss,
+    2: adapt_gauss,
+    3: fix_bisquare,
+    4: adapt_bisquare,
+    5: fix_exp,
+    6: adapt_exp,
+}
+# Diagnostic functions keyed by the `criterion` name used in Sel_BW.
+getDiag = {
+    'AICc': get_AICc,
+    'AIC': get_AIC,
+    'BIC': get_BIC,
+    'CV': get_CV,
+}
+
+class Sel_BW(object):
+    """
+    Select bandwidth for kernel
+
+    Methods: p211 - p213, bandwidth selection
+    Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002).
+    Geographically weighted regression: the analysis of spatially varying relationships.
+
+    Parameters
+    ----------
+    coords         : list of tuples
+                     (x,y) of points used in bandwidth selection
+    y              : array
+                     n*1, dependent variable.
+    x_loc          : array
+                     n*k2, local independent variables; the intercept is
+                     controlled separately through `constant`
+    x_glob         : array
+                     n*k1, fixed independent variables; only used to count
+                     variables when the search section is initialised
+    family         : family object
+                     model family instance such as Gaussian() (default),
+                     Poisson() or Binomial(); passed through to GWR
+    offset         : array
+                     n*1, offset variable for Poisson model; defaults to a
+                     column of ones
+    kernel         : string
+                     kernel function: 'gaussian', 'bisquare', 'exponential'
+    fixed          : boolean
+                     True for fixed bandwidth and False for adaptive (NN)
+    fb             : boolean
+                     True for flexible (multiple covariate-specific)
+                     bandwidths, False for a traditional (same for all
+                     covariates) bandwidth; default is False.
+    constant       : boolean
+                     True to include intercept (default) in model and False
+                     to exclude intercept.
+
+    Attributes
+    ----------
+    coords, y, x_loc, x_glob, family, offset, kernel, fixed, fb, constant
+                   : as passed to the constructor (`x_glob` becomes an empty
+                     list and `offset` a column of ones when omitted)
+    criterion      : string
+                     bw selection criterion: 'AICc', 'AIC', 'BIC', 'CV'
+    search         : string
+                     bw search method: 'golden_section', 'interval'; only
+                     set once `search()` has been called (see the note in
+                     that method)
+    bw_min         : float
+                     min value used in bandwidth search
+    bw_max         : float
+                     max value used in bandwidth search
+    interval       : float
+                     interval increment used in interval search
+    tol            : float
+                     tolerance used to determine convergence
+    max_iter       : integer
+                     max iterations if no convergence to tol
+    int_score      : boolean
+                     True when bandwidths are searched as whole numbers
+                     (adaptive kernels)
+    bw             : tuple
+                     raw result of the search routine that was run
+    XB, err        : arrays
+                     fifth and sixth items of the flexible bandwidth search
+                     result; only set when fb is True
+    """
+    # NOTE(review): the default Gaussian() instance is created once and is
+    # shared by every Sel_BW that does not pass its own family.
+    def __init__(self, coords, y, x_loc, x_glob=None, family=Gaussian(),
+            offset=None, kernel='bisquare', fixed=False, fb=False, constant=True):
+        self.coords = coords
+        self.y = y
+        self.x_loc = x_loc
+        self.x_glob = x_glob if x_glob is not None else []
+        self.family = family
+        self.fixed = fixed
+        self.kernel = kernel
+        if offset is None:
+            self.offset = np.ones((len(y), 1))
+        else:
+            # multiplying by 1.0 yields a float copy
+            self.offset = offset * 1.0
+        self.fb = fb
+        self.constant = constant
+
+    def search(self, search='golden_section', criterion='AICc', bw_min=0.0,
+            bw_max=0.0, interval=0.0, tol=1.0e-6, max_iter=200, init_fb=True,
+            tol_fb=1.0e-5, rss_score=False, max_iter_fb=200):
+        """
+        Run the bandwidth search and return the selected bandwidth(s).
+
+        Parameters
+        ----------
+        search         : string
+                         bw search method: 'golden_section', 'interval'
+        criterion      : string
+                         bw selection criterion: 'AICc', 'AIC', 'BIC', 'CV'
+        bw_min         : float
+                         min value used in bandwidth search
+        bw_max         : float
+                         max value used in bandwidth search
+        interval       : float
+                         interval increment used in interval search
+        tol            : float
+                         tolerance used to determine convergence
+        max_iter       : integer
+                         max iterations if no convergence to tol
+        init_fb        : True to initialize flexible bandwidth search with
+                         estimates from a traditional GWR and False to
+                         initialize flexible bandwidth search with global
+                         regression estimates
+        tol_fb         : convergence tolerance for the flexible bandwidth
+                         backfitting algorithm; a larger tolerance may stop
+                         the algorithm faster though it may result in a less
+                         optimal model
+        max_iter_fb    : max iterations if no convergence to tol for the
+                         flexible bandwidth backfitting algorithm
+        rss_score      : True to use the residual sum of squares to evaluate
+                         each iteration of the flexible bandwidth backfitting
+                         routine and False to use a smooth function; default
+                         is False
+
+        Returns
+        -------
+        bw             : scalar or array
+                         optimal bandwidth value or values; returns scalar
+                         for fb=False and array for fb=True; ordering of
+                         bandwidths matches the ordering of the covariates
+                         (columns) of the design matrix, X
+
+        Raises
+        ------
+        TypeError      : for an unsupported kernel name or search method
+        """
+        # NOTE(review): this instance attribute shadows the `search` method,
+        # so a second call on the same instance fails because a string is
+        # not callable. Kept because `search` is a documented attribute.
+        self.search = search
+        self.criterion = criterion
+        self.bw_min = bw_min
+        self.bw_max = bw_max
+        self.interval = interval
+        self.tol = tol
+        self.max_iter = max_iter
+        self.init_fb = init_fb
+        self.tol_fb = tol_fb
+        self.rss_score = rss_score
+        self.max_iter_fb = max_iter_fb
+
+        if self.kernel not in ('gaussian', 'bisquare', 'exponential'):
+            raise TypeError('Unsupported kernel function ', self.kernel)
+
+        # adaptive kernels are searched over whole-number bandwidths
+        self.int_score = not self.fixed
+
+        if self.fb:
+            self._fbw()
+            self.XB = self.bw[4]
+            self.err = self.bw[5]
+        else:
+            self._bw()
+
+        return self.bw[0]
+
+    def _bw(self):
+        """
+        Search a single bandwidth and store the search routine's result in
+        ``self.bw``; each candidate is scored by fitting a GWR and applying
+        the chosen diagnostic.
+        """
+        # NOTE(review): unlike _fbw, self.offset is not passed to GWR here,
+        # so an offset does not influence this search - confirm intent.
+        gwr_func = lambda bw: getDiag[self.criterion](
+                GWR(self.coords, self.y, self.x_loc, bw, family=self.family,
+                    kernel=self.kernel, fixed=self.fixed, constant=self.constant).fit())
+        if self.search == 'golden_section':
+            a,c = self._init_section(self.x_glob, self.x_loc, self.coords,
+                    self.constant)
+            delta = 0.38197 #1 - (np.sqrt(5.0)-1.0)/2.0
+            self.bw = golden_section(a, c, delta, gwr_func, self.tol,
+                    self.max_iter, self.int_score)
+        elif self.search == 'interval':
+            self.bw = equal_interval(self.bw_min, self.bw_max, self.interval,
+                    gwr_func, self.int_score)
+        else:
+            # the original referenced an undefined local name `search` here
+            raise TypeError('Unsupported computational search method ',
+                    self.search)
+
+    def _fbw(self):
+        """
+        Search one bandwidth per covariate via ``flexible_bw`` and store its
+        result tuple in ``self.bw``.
+        """
+        if self.constant:
+            X = USER.check_constant(self.x_loc)
+        else:
+            X = self.x_loc
+        n, k = X.shape
+
+        # Plain locals so the helper callables below do not close over self.
+        coords = self.coords
+        family = self.family
+        offset = self.offset
+        kernel = self.kernel
+        fixed = self.fixed
+        search = self.search
+        criterion = self.criterion
+        bw_min = self.bw_min
+        bw_max = self.bw_max
+        interval = self.interval
+        tol = self.tol
+        max_iter = self.max_iter
+
+        # The design matrix handed to these helpers is used as is
+        # (constant=False).
+        gwr_func = lambda y, X, bw: GWR(coords, y, X, bw, family=family,
+                kernel=kernel, fixed=fixed, offset=offset, constant=False).fit()
+        bw_func = lambda y, X: Sel_BW(coords, y, X, x_glob=[], family=family,
+                kernel=kernel, fixed=fixed, offset=offset, constant=False)
+        sel_func = lambda selector: selector.search(search=search,
+                criterion=criterion, bw_min=bw_min, bw_max=bw_max,
+                interval=interval, tol=tol, max_iter=max_iter)
+        self.bw = flexible_bw(self.init_fb, self.y, X, n, k, family,
+                self.tol_fb, self.max_iter_fb, self.rss_score, gwr_func,
+                bw_func, sel_func)
+
+    def _init_section(self, x_glob, x_loc, coords, constant):
+        """
+        Choose the initial (min, max) section for the golden section search.
+
+        Whole-number (adaptive) searches start at 40 + 2 * n_vars and end at
+        the number of points. Otherwise the section runs from half the
+        smallest distance to the (40 + 2 * n_vars)-th nearest point up to
+        half the largest pairwise distance. ``self.bw_min`` raises the lower
+        end and a positive ``self.bw_max`` caps the upper end.
+        """
+        n_glob = x_glob.shape[1] if len(x_glob) > 0 else 0
+        n_loc = x_loc.shape[1] if len(x_loc) > 0 else 0
+        n_vars = n_glob + n_loc
+        if constant:
+            n_vars += 1
+        n = np.array(coords).shape[0]
+
+        if self.int_score:
+            a = 40 + 2 * n_vars
+            c = n
+        else:
+            # never ask for more neighbours than there are points, which
+            # would index past the last column of the distance matrix
+            nn = min(40 + 2 * n_vars, n)
+            sq_dists = squareform(pdist(coords))
+            sort_dists = np.sort(sq_dists, axis=1)
+            min_dists = sort_dists[:,nn-1]
+            max_dists = sort_dists[:,-1]
+            a = np.min(min_dists)/2.0
+            c = np.max(max_dists)/2.0
+
+        if a < self.bw_min:
+            a = self.bw_min
+        if c > self.bw_max and self.bw_max > 0:
+            c = self.bw_max
+        return a, c
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/tests/test_gwr.py b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/tests/test_gwr.py
new file mode 100644
index 0000000..7f12b7e
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/tests/test_gwr.py
@@ -0,0 +1,853 @@
+"""
+GWR is tested against results from GWR4
+"""
+
+import unittest
+import pickle as pk
+from crankshaft.regression.gwr.gwr import GWR, FBGWR
+from crankshaft.regression.gwr.sel_bw import Sel_BW
+from crankshaft.regression.gwr.diagnostics import get_AICc, get_AIC, get_BIC, get_CV
+from crankshaft.regression.glm.family import Gaussian, Poisson, Binomial
+import numpy as np
+import pysal
+
+class TestGWRGaussian(unittest.TestCase):
+    """
+    Gaussian GWR fitted on the Georgia example data, compared with the
+    reference listwise results shipped with the pysal examples.
+    """
+
+    # (reference variable name, rtol shared by its estimate, standard error
+    # and t-value); the position is the matching column of the fitted results
+    _TERMS = (('Intercept', 1e-04), ('PctRural', 1e-04),
+              ('PctPov', 1e-04), ('PctBlack', 1e-02))
+
+    @staticmethod
+    def _load_pickle(name):
+        """Load a pickled example fixture, closing the file afterwards."""
+        # NOTE: pickle is only used on fixtures bundled with pysal.examples;
+        # never point this at untrusted data.
+        with open(pysal.examples.get_path(name), 'rb') as handle:
+            return pk.load(handle)
+
+    def setUp(self):
+        data = pysal.open(pysal.examples.get_path('GData_utm.csv'))
+        self.coords = list(zip(data.by_col('X'), data.by_col('Y')))
+        self.y = np.array(data.by_col('PctBach')).reshape((-1,1))
+        rural = np.array(data.by_col('PctRural')).reshape((-1,1))
+        pov = np.array(data.by_col('PctPov')).reshape((-1,1))
+        black = np.array(data.by_col('PctBlack')).reshape((-1,1))
+        self.X = np.hstack([rural, pov, black])
+        self.BS_F = pysal.open(pysal.examples.get_path('georgia_BS_F_listwise.csv'))
+        self.BS_NN = pysal.open(pysal.examples.get_path('georgia_BS_NN_listwise.csv'))
+        self.GS_F = pysal.open(pysal.examples.get_path('georgia_GS_F_listwise.csv'))
+        self.GS_NN = pysal.open(pysal.examples.get_path('georgia_GS_NN_listwise.csv'))
+        self.FB = self._load_pickle('FB.p')
+        self.XB = self._load_pickle('XB.p')
+        self.err = self._load_pickle('err.p')
+
+    def _assert_matches_reference(self, ref, rslt, aicc, aic, bic, cv):
+        """
+        Compare fitted results with one reference table.
+
+        ref is the opened listwise file, rslt the fitted model results; aicc,
+        aic and bic are the expected floored diagnostics and cv the expected
+        CV score rounded to two decimals.
+        """
+        AICc = get_AICc(rslt)
+        AIC = get_AIC(rslt)
+        BIC = get_BIC(rslt)
+        CV = get_CV(rslt)
+
+        self.assertAlmostEqual(np.floor(AICc), aicc)
+        self.assertAlmostEqual(np.floor(AIC), aic)
+        self.assertAlmostEqual(np.floor(BIC), bic)
+        self.assertAlmostEqual(np.round(CV, 2), cv)
+
+        # reference column names carry a leading space
+        for col, (term, rtol) in enumerate(self._TERMS):
+            np.testing.assert_allclose(ref.by_col(' est_' + term),
+                                       rslt.params[:,col], rtol=rtol)
+            np.testing.assert_allclose(ref.by_col(' se_' + term),
+                                       rslt.bse[:,col], rtol=rtol)
+            np.testing.assert_allclose(ref.by_col(' t_' + term),
+                                       rslt.tvalues[:,col], rtol=rtol)
+
+        def column(name):
+            return np.array(ref.by_col(name)).reshape((-1,1))
+
+        np.testing.assert_allclose(ref.by_col(' yhat'), rslt.mu, rtol=1e-05)
+        np.testing.assert_allclose(np.array(ref.by_col(' residual')),
+                                   rslt.resid_response, rtol=1e-04)
+        np.testing.assert_allclose(column(' std_residual'), rslt.std_res,
+                                   rtol=1e-04)
+        np.testing.assert_allclose(column(' localR2'), rslt.localR2,
+                                   rtol=1e-05)
+        np.testing.assert_allclose(column(' influence'), rslt.influ,
+                                   rtol=1e-04)
+        np.testing.assert_allclose(column(' CooksD'), rslt.cooksD,
+                                   rtol=1e-00)
+
+    def test_BS_F(self):
+        model = GWR(self.coords, self.y, self.X, bw=209267.689, fixed=True)
+        self._assert_matches_reference(self.BS_F, model.fit(),
+                                       894.0, 890.0, 944.0, 18.25)
+
+    def test_BS_NN(self):
+        model = GWR(self.coords, self.y, self.X, bw=90.000, fixed=False)
+        self._assert_matches_reference(self.BS_NN, model.fit(),
+                                       896.0, 892.0, 941.0, 19.19)
+
+    def test_GS_F(self):
+        model = GWR(self.coords, self.y, self.X, bw=87308.298,
+                kernel='gaussian', fixed=True)
+        self._assert_matches_reference(self.GS_F, model.fit(),
+                                       895.0, 890.0, 943.0, 18.21)
+
+    def test_GS_NN(self):
+        model = GWR(self.coords, self.y, self.X, bw=49.000,
+                kernel='gaussian', fixed=False)
+        self._assert_matches_reference(self.GS_NN, model.fit(),
+                                       896, 894.0, 922.0, 17.91)
+
+    def test_FBGWR(self):
+        model = FBGWR(self.coords, self.y, self.X, [157.0, 65.0, 52.0],
+                XB=self.XB, err=self.err, constant=False)
+        rslt = model.fit()
+
+        np.testing.assert_allclose(rslt.predy, self.FB['predy'], atol=1e-07)
+        np.testing.assert_allclose(rslt.params, self.FB['params'], atol=1e-07)
+        np.testing.assert_allclose(rslt.resid_response, self.FB['u'], atol=1e-05)
+        np.testing.assert_almost_equal(rslt.resid_ss, 6339.3497144025841)
+
+    def test_Prediction(self):
+        # predict at the last ten observations with a model built on all data
+        coords = np.array(self.coords)
+        index = np.arange(len(self.y))
+        test = index[-10:]
+
+        X_test = self.X[test]
+        coords_test = list(coords[test])
+
+        model = GWR(self.coords, self.y, self.X, 93, family=Gaussian(),
+                fixed=False, kernel='bisquare')
+        results = model.predict(coords_test, X_test)
+
+        params = np.array([22.77198, -0.10254, -0.215093, -0.01405,
+            19.10531, -0.094177, -0.232529, 0.071913,
+            19.743421, -0.080447, -0.30893, 0.083206,
+            17.505759, -0.078919, -0.187955, 0.051719,
+            27.747402, -0.165335, -0.208553, 0.004067,
+            26.210627, -0.138398, -0.360514, 0.072199,
+            18.034833, -0.077047, -0.260556, 0.084319,
+            28.452802, -0.163408, -0.14097, -0.063076,
+            22.353095, -0.103046, -0.226654, 0.002992,
+            18.220508, -0.074034, -0.309812, 0.108636]).reshape((10,4))
+        np.testing.assert_allclose(params, results.params, rtol=1e-03)
+
+        bse = np.array([2.080166, 0.021462, 0.102954, 0.049627,
+            2.536355, 0.022111, 0.123857, 0.051917,
+            1.967813, 0.019716, 0.102562, 0.054918,
+            2.463219, 0.021745, 0.110297, 0.044189,
+            1.556056, 0.019513, 0.12764, 0.040315,
+            1.664108, 0.020114, 0.131208, 0.041613,
+            2.5835, 0.021481, 0.113158, 0.047243,
+            1.709483, 0.019752, 0.116944, 0.043636,
+            1.958233, 0.020947, 0.09974, 0.049821,
+            2.276849, 0.020122, 0.107867, 0.047842]).reshape((10,4))
+        np.testing.assert_allclose(bse, results.bse, rtol=1e-03)
+
+        tvalues = np.array([10.947193, -4.777659, -2.089223, -0.283103,
+            7.532584, -4.259179, -1.877395, 1.385161,
+            10.033179, -4.080362, -3.012133, 1.515096,
+            7.106862, -3.629311, -1.704079, 1.17042,
+            17.831878, -8.473156, -1.633924, 0.100891,
+            15.750552, -6.880725, -2.74765, 1.734978,
+            6.980774, -3.586757, -2.302575, 1.784818,
+            16.644095, -8.273001, -1.205451, -1.445501,
+            11.414933, -4.919384, -2.272458, 0.060064,
+            8.00251, -3.679274, -2.872176, 2.270738]).reshape((10,4))
+        np.testing.assert_allclose(tvalues, results.tvalues, rtol=1e-03)
+
+        localR2 = np.array([[ 0.53068693],
+            [ 0.59582647],
+            [ 0.59700925],
+            [ 0.45769954],
+            [ 0.54634509],
+            [ 0.5494828 ],
+            [ 0.55159604],
+            [ 0.55634237],
+            [ 0.53903842],
+            [ 0.55884954]])
+        np.testing.assert_allclose(localR2, results.localR2, rtol=1e-05)
+
+class TestGWRPoisson(unittest.TestCase):
+ def setUp(self):
+     """
+     Load the Tokyo mortality example data (coordinates, response, offset
+     and four covariates) and open the reference listwise result files.
+     """
+     data = pysal.open(pysal.examples.get_path('Tokyomortality.csv'), mode='Ur')
+
+     def column(name):
+         # one CSV column as an n*1 array
+         return np.array(data.by_col(name)).reshape((-1,1))
+
+     def reference(fname):
+         return pysal.open(pysal.examples.get_path(fname))
+
+     self.coords = zip(data.by_col('X_CENTROID'), data.by_col('Y_CENTROID'))
+     self.y = column('db2564')
+     self.off = column('eb2564')
+     covariates = []
+     for name in ('OCC_TEC', 'OWNH', 'POP65', 'UNEMP'):
+         covariates.append(column(name))
+     self.X = np.hstack(covariates)
+     self.BS_F = reference('tokyo_BS_F_listwise.csv')
+     self.BS_NN = reference('tokyo_BS_NN_listwise.csv')
+     self.GS_F = reference('tokyo_GS_F_listwise.csv')
+     self.GS_NN = reference('tokyo_GS_NN_listwise.csv')
+     self.BS_NN_OFF = reference('tokyo_BS_NN_OFF_listwise.csv')
+
+ def test_BS_F(self):
+     """
+     Fixed-bandwidth bisquare Poisson GWR (bw=26029.625) compared with the
+     reference listwise results held in ``self.BS_F``.
+     """
+     ref = self.BS_F
+     # (reference variable name, rtol for estimates, rtol for standard
+     # errors and t-values); the position is the matching results column
+     terms = [('Intercept', 1e-05, 1e-03),
+              ('OCC_TEC', 1e-04, 1e-02),
+              ('OWNH', 1e-04, 1e-03),
+              ('POP65', 1e-04, 1e-02),
+              ('UNEMP', 1e-04, 1e-02)]
+     # reference column names carry a leading space
+     expected = []
+     for name, _, _ in terms:
+         expected.append((ref.by_col(' est_' + name),
+                          ref.by_col(' se_' + name),
+                          ref.by_col(' t_' + name)))
+     yhat = ref.by_col(' yhat')
+     pdev = np.array(ref.by_col(' localpdev')).reshape((-1,1))
+
+     model = GWR(self.coords, self.y, self.X, bw=26029.625, family=Poisson(),
+             kernel='bisquare', fixed=True)
+     rslt = model.fit()
+
+     AICc = get_AICc(rslt)
+     AIC = get_AIC(rslt)
+     BIC = get_BIC(rslt)
+
+     self.assertAlmostEqual(np.floor(AICc), 13294.0)
+     self.assertAlmostEqual(np.floor(AIC), 13247.0)
+     self.assertAlmostEqual(np.floor(BIC), 13485.0)
+     for col, (name, est_rtol, se_rtol) in enumerate(terms):
+         est, se, t = expected[col]
+         np.testing.assert_allclose(est, rslt.params[:,col], rtol=est_rtol)
+         np.testing.assert_allclose(se, rslt.bse[:,col], rtol=se_rtol)
+         np.testing.assert_allclose(t, rslt.tvalues[:,col], rtol=se_rtol)
+     np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-05)
+     np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-05)
+
+
+ def test_BS_NN(self):
+     """
+     Adaptive (50 nearest neighbours) bisquare Poisson GWR compared with
+     the reference listwise results held in ``self.BS_NN``.
+     """
+     ref = self.BS_NN
+     # (reference variable name, rtol for estimates, rtol for standard
+     # errors and t-values); the position is the matching results column
+     terms = [('Intercept', 1e-04, 1e-02),
+              ('OCC_TEC', 1e-03, 1e-02),
+              ('OWNH', 1e-04, 1e-02),
+              ('POP65', 1e-03, 1e-02),
+              ('UNEMP', 1e-04, 1e-02)]
+     # reference column names carry a leading space
+     expected = []
+     for name, _, _ in terms:
+         expected.append((ref.by_col(' est_' + name),
+                          ref.by_col(' se_' + name),
+                          ref.by_col(' t_' + name)))
+     yhat = ref.by_col(' yhat')
+     pdev = np.array(ref.by_col(' localpdev')).reshape((-1,1))
+
+     model = GWR(self.coords, self.y, self.X, bw=50, family=Poisson(),
+             kernel='bisquare', fixed=False)
+     rslt = model.fit()
+
+     AICc = get_AICc(rslt)
+     AIC = get_AIC(rslt)
+     BIC = get_BIC(rslt)
+
+     self.assertAlmostEqual(np.floor(AICc), 13285)
+     self.assertAlmostEqual(np.floor(AIC), 13259.0)
+     self.assertAlmostEqual(np.floor(BIC), 13442.0)
+     for col, (name, est_rtol, se_rtol) in enumerate(terms):
+         est, se, t = expected[col]
+         np.testing.assert_allclose(est, rslt.params[:,col], rtol=est_rtol)
+         np.testing.assert_allclose(se, rslt.bse[:,col], rtol=se_rtol)
+         np.testing.assert_allclose(t, rslt.tvalues[:,col], rtol=se_rtol)
+     np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-04)
+     np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-05)
+
+ def test_BS_NN_Offset(self):
+ est_Int = self.BS_NN_OFF.by_col(' est_Intercept')
+ se_Int = self.BS_NN_OFF.by_col(' se_Intercept')
+ t_Int = self.BS_NN_OFF.by_col(' t_Intercept')
+ est_OCC = self.BS_NN_OFF.by_col(' est_OCC_TEC')
+ se_OCC = self.BS_NN_OFF.by_col(' se_OCC_TEC')
+ t_OCC = self.BS_NN_OFF.by_col(' t_OCC_TEC')
+ est_OWN = self.BS_NN_OFF.by_col(' est_OWNH')
+ se_OWN = self.BS_NN_OFF.by_col(' se_OWNH')
+ t_OWN = self.BS_NN_OFF.by_col(' t_OWNH')
+ est_POP = self.BS_NN_OFF.by_col(' est_POP65')
+ se_POP = self.BS_NN_OFF.by_col(' se_POP65')
+ t_POP = self.BS_NN_OFF.by_col(' t_POP65')
+ est_UNEMP = self.BS_NN_OFF.by_col(' est_UNEMP')
+ se_UNEMP = self.BS_NN_OFF.by_col(' se_UNEMP')
+ t_UNEMP = self.BS_NN_OFF.by_col(' t_UNEMP')
+ yhat = self.BS_NN_OFF.by_col(' yhat')
+ pdev = np.array(self.BS_NN_OFF.by_col(' localpdev')).reshape((-1,1))
+
+ model = GWR(self.coords, self.y, self.X, bw=100, offset=self.off, family=Poisson(),
+ kernel='bisquare', fixed=False)
+ rslt = model.fit()
+
+ AICc = get_AICc(rslt)
+ AIC = get_AIC(rslt)
+ BIC = get_BIC(rslt)
+
+ self.assertAlmostEquals(np.floor(AICc), 367.0)
+ self.assertAlmostEquals(np.floor(AIC), 361.0)
+ self.assertAlmostEquals(np.floor(BIC), 451.0)
+ np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-02,
+ atol=1e-02)
+ np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-02, atol=1e-02)
+ np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-01,
+ atol=1e-02)
+ np.testing.assert_allclose(est_OCC, rslt.params[:,1], rtol=1e-03,
+ atol=1e-02)
+ np.testing.assert_allclose(se_OCC, rslt.bse[:,1], rtol=1e-02, atol=1e-02)
+ np.testing.assert_allclose(t_OCC, rslt.tvalues[:,1], rtol=1e-01,
+ atol=1e-02)
+ np.testing.assert_allclose(est_OWN, rslt.params[:,2], rtol=1e-04,
+ atol=1e-02)
+ np.testing.assert_allclose(se_OWN, rslt.bse[:,2], rtol=1e-02, atol=1e-02)
+ np.testing.assert_allclose(t_OWN, rslt.tvalues[:,2], rtol=1e-01,
+ atol=1e-02)
+ np.testing.assert_allclose(est_POP, rslt.params[:,3], rtol=1e-03,
+ atol=1e-02)
+ np.testing.assert_allclose(se_POP, rslt.bse[:,3], rtol=1e-02, atol=1e-02)
+ np.testing.assert_allclose(t_POP, rslt.tvalues[:,3], rtol=1e-01,
+ atol=1e-02)
+ np.testing.assert_allclose(est_UNEMP, rslt.params[:,4], rtol=1e-04,
+ atol=1e-02)
+ np.testing.assert_allclose(se_UNEMP, rslt.bse[:,4], rtol=1e-02,
+ atol=1e-02)
+ np.testing.assert_allclose(t_UNEMP, rslt.tvalues[:,4], rtol=1e-01,
+ atol=1e-02)
+ np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-03, atol=1e-02)
+ np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-04, atol=1e-02)
+
+ def test_GS_F(self):
+ est_Int = self.GS_F.by_col(' est_Intercept')
+ se_Int = self.GS_F.by_col(' se_Intercept')
+ t_Int = self.GS_F.by_col(' t_Intercept')
+ est_OCC = self.GS_F.by_col(' est_OCC_TEC')
+ se_OCC = self.GS_F.by_col(' se_OCC_TEC')
+ t_OCC = self.GS_F.by_col(' t_OCC_TEC')
+ est_OWN = self.GS_F.by_col(' est_OWNH')
+ se_OWN = self.GS_F.by_col(' se_OWNH')
+ t_OWN = self.GS_F.by_col(' t_OWNH')
+ est_POP = self.GS_F.by_col(' est_POP65')
+ se_POP = self.GS_F.by_col(' se_POP65')
+ t_POP = self.GS_F.by_col(' t_POP65')
+ est_UNEMP = self.GS_F.by_col(' est_UNEMP')
+ se_UNEMP = self.GS_F.by_col(' se_UNEMP')
+ t_UNEMP = self.GS_F.by_col(' t_UNEMP')
+ yhat = self.GS_F.by_col(' yhat')
+ pdev = np.array(self.GS_F.by_col(' localpdev')).reshape((-1,1))
+
+ model = GWR(self.coords, self.y, self.X, bw=8764.474, family=Poisson(),
+ kernel='gaussian', fixed=True)
+ rslt = model.fit()
+
+ AICc = get_AICc(rslt)
+ AIC = get_AIC(rslt)
+ BIC = get_BIC(rslt)
+
+ self.assertAlmostEquals(np.floor(AICc), 11283.0)
+ self.assertAlmostEquals(np.floor(AIC), 11211.0)
+ self.assertAlmostEquals(np.floor(BIC), 11497.0)
+ np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-03)
+ np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-02)
+ np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-02)
+ np.testing.assert_allclose(est_OCC, rslt.params[:,1], rtol=1e-03)
+ np.testing.assert_allclose(se_OCC, rslt.bse[:,1], rtol=1e-02)
+ np.testing.assert_allclose(t_OCC, rslt.tvalues[:,1], rtol=1e-02)
+ np.testing.assert_allclose(est_OWN, rslt.params[:,2], rtol=1e-03)
+ np.testing.assert_allclose(se_OWN, rslt.bse[:,2], rtol=1e-02)
+ np.testing.assert_allclose(t_OWN, rslt.tvalues[:,2], rtol=1e-02)
+ np.testing.assert_allclose(est_POP, rslt.params[:,3], rtol=1e-02)
+ np.testing.assert_allclose(se_POP, rslt.bse[:,3], rtol=1e-02)
+ np.testing.assert_allclose(t_POP, rslt.tvalues[:,3], rtol=1e-02)
+ np.testing.assert_allclose(est_UNEMP, rslt.params[:,4], rtol=1e-02)
+ np.testing.assert_allclose(se_UNEMP, rslt.bse[:,4], rtol=1e-02)
+ np.testing.assert_allclose(t_UNEMP, rslt.tvalues[:,4], rtol=1e-02)
+ np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-04)
+ np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-05)
+
+ def test_GS_NN(self):
+ est_Int = self.GS_NN.by_col(' est_Intercept')
+ se_Int = self.GS_NN.by_col(' se_Intercept')
+ t_Int = self.GS_NN.by_col(' t_Intercept')
+ est_OCC = self.GS_NN.by_col(' est_OCC_TEC')
+ se_OCC = self.GS_NN.by_col(' se_OCC_TEC')
+ t_OCC = self.GS_NN.by_col(' t_OCC_TEC')
+ est_OWN = self.GS_NN.by_col(' est_OWNH')
+ se_OWN = self.GS_NN.by_col(' se_OWNH')
+ t_OWN = self.GS_NN.by_col(' t_OWNH')
+ est_POP = self.GS_NN.by_col(' est_POP65')
+ se_POP = self.GS_NN.by_col(' se_POP65')
+ t_POP = self.GS_NN.by_col(' t_POP65')
+ est_UNEMP = self.GS_NN.by_col(' est_UNEMP')
+ se_UNEMP = self.GS_NN.by_col(' se_UNEMP')
+ t_UNEMP = self.GS_NN.by_col(' t_UNEMP')
+ yhat = self.GS_NN.by_col(' yhat')
+ pdev = np.array(self.GS_NN.by_col(' localpdev')).reshape((-1,1))
+
+ model = GWR(self.coords, self.y, self.X, bw=50, family=Poisson(),
+ kernel='gaussian', fixed=False)
+ rslt = model.fit()
+
+ AICc = get_AICc(rslt)
+ AIC = get_AIC(rslt)
+ BIC = get_BIC(rslt)
+
+ self.assertAlmostEquals(np.floor(AICc), 21070.0)
+ self.assertAlmostEquals(np.floor(AIC), 21069.0)
+ self.assertAlmostEquals(np.floor(BIC), 21111.0)
+ np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-04)
+ np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-02)
+ np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-02)
+ np.testing.assert_allclose(est_OCC, rslt.params[:,1], rtol=1e-03)
+ np.testing.assert_allclose(se_OCC, rslt.bse[:,1], rtol=1e-02)
+ np.testing.assert_allclose(t_OCC, rslt.tvalues[:,1], rtol=1e-02)
+ np.testing.assert_allclose(est_OWN, rslt.params[:,2], rtol=1e-04)
+ np.testing.assert_allclose(se_OWN, rslt.bse[:,2], rtol=1e-02)
+ np.testing.assert_allclose(t_OWN, rslt.tvalues[:,2], rtol=1e-02)
+ np.testing.assert_allclose(est_POP, rslt.params[:,3], rtol=1e-02)
+ np.testing.assert_allclose(se_POP, rslt.bse[:,3], rtol=1e-02)
+ np.testing.assert_allclose(t_POP, rslt.tvalues[:,3], rtol=1e-02)
+ np.testing.assert_allclose(est_UNEMP, rslt.params[:,4], rtol=1e-02)
+ np.testing.assert_allclose(se_UNEMP, rslt.bse[:,4], rtol=1e-02)
+ np.testing.assert_allclose(t_UNEMP, rslt.tvalues[:,4], rtol=1e-02)
+ np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-04)
+ np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-05)
+
+class TestGWRBinomial(unittest.TestCase):
+ def setUp(self):
+ data = pysal.open(pysal.examples.get_path('landslides.csv'))
+ self.coords = zip(data.by_col('X'), data.by_col('Y'))
+ self.y = np.array(data.by_col('Landslid')).reshape((-1,1))
+ ELEV = np.array(data.by_col('Elev')).reshape((-1,1))
+ SLOPE = np.array(data.by_col('Slope')).reshape((-1,1))
+ SIN = np.array(data.by_col('SinAspct')).reshape((-1,1))
+ COS = np.array(data.by_col('CosAspct')).reshape((-1,1))
+ SOUTH = np.array(data.by_col('AbsSouth')).reshape((-1,1))
+ DIST = np.array(data.by_col('DistStrm')).reshape((-1,1))
+ self.X = np.hstack([ELEV, SLOPE, SIN, COS, SOUTH, DIST])
+ self.BS_F = pysal.open(pysal.examples.get_path('clearwater_BS_F_listwise.csv'))
+ self.BS_NN = pysal.open(pysal.examples.get_path('clearwater_BS_NN_listwise.csv'))
+ self.GS_F = pysal.open(pysal.examples.get_path('clearwater_GS_F_listwise.csv'))
+ self.GS_NN = pysal.open(pysal.examples.get_path('clearwater_GS_NN_listwise.csv'))
+
+ def test_BS_F(self):
+ est_Int = self.BS_F.by_col(' est_Intercept')
+ se_Int = self.BS_F.by_col(' se_Intercept')
+ t_Int = self.BS_F.by_col(' t_Intercept')
+ est_elev = self.BS_F.by_col(' est_Elev')
+ se_elev = self.BS_F.by_col(' se_Elev')
+ t_elev = self.BS_F.by_col(' t_Elev')
+ est_slope = self.BS_F.by_col(' est_Slope')
+ se_slope = self.BS_F.by_col(' se_Slope')
+ t_slope = self.BS_F.by_col(' t_Slope')
+ est_sin = self.BS_F.by_col(' est_SinAspct')
+ se_sin = self.BS_F.by_col(' se_SinAspct')
+ t_sin = self.BS_F.by_col(' t_SinAspct')
+ est_cos = self.BS_F.by_col(' est_CosAspct')
+ se_cos = self.BS_F.by_col(' se_CosAspct')
+ t_cos = self.BS_F.by_col(' t_CosAspct')
+ est_south = self.BS_F.by_col(' est_AbsSouth')
+ se_south = self.BS_F.by_col(' se_AbsSouth')
+ t_south = self.BS_F.by_col(' t_AbsSouth')
+ est_strm = self.BS_F.by_col(' est_DistStrm')
+ se_strm = self.BS_F.by_col(' se_DistStrm')
+ t_strm = self.BS_F.by_col(' t_DistStrm')
+ yhat = self.BS_F.by_col(' yhat')
+ pdev = np.array(self.BS_F.by_col(' localpdev')).reshape((-1,1))
+
+ model = GWR(self.coords, self.y, self.X, bw=19642.170, family=Binomial(),
+ kernel='bisquare', fixed=True)
+ rslt = model.fit()
+
+ AICc = get_AICc(rslt)
+ AIC = get_AIC(rslt)
+ BIC = get_BIC(rslt)
+
+ self.assertAlmostEquals(np.floor(AICc), 275.0)
+ self.assertAlmostEquals(np.floor(AIC), 271.0)
+ self.assertAlmostEquals(np.floor(BIC), 349.0)
+ np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00)
+ np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00)
+ np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00)
+ np.testing.assert_allclose(est_elev, rslt.params[:,1], rtol=1e-00)
+ np.testing.assert_allclose(se_elev, rslt.bse[:,1], rtol=1e-00)
+ np.testing.assert_allclose(t_elev, rslt.tvalues[:,1], rtol=1e-00)
+ np.testing.assert_allclose(est_slope, rslt.params[:,2], rtol=1e-00)
+ np.testing.assert_allclose(se_slope, rslt.bse[:,2], rtol=1e-00)
+ np.testing.assert_allclose(t_slope, rslt.tvalues[:,2], rtol=1e-00)
+ np.testing.assert_allclose(est_sin, rslt.params[:,3], rtol=1e01)
+ np.testing.assert_allclose(se_sin, rslt.bse[:,3], rtol=1e01)
+ np.testing.assert_allclose(t_sin, rslt.tvalues[:,3], rtol=1e01)
+ np.testing.assert_allclose(est_cos, rslt.params[:,4], rtol=1e01)
+ np.testing.assert_allclose(se_cos, rslt.bse[:,4], rtol=1e01)
+ np.testing.assert_allclose(t_cos, rslt.tvalues[:,4], rtol=1e01)
+ np.testing.assert_allclose(est_south, rslt.params[:,5], rtol=1e01)
+ np.testing.assert_allclose(se_south, rslt.bse[:,5], rtol=1e01)
+ np.testing.assert_allclose(t_south, rslt.tvalues[:,5], rtol=1e01)
+ np.testing.assert_allclose(est_strm, rslt.params[:,6], rtol=1e02)
+ np.testing.assert_allclose(se_strm, rslt.bse[:,6], rtol=1e01)
+ np.testing.assert_allclose(t_strm, rslt.tvalues[:,6], rtol=1e02)
+ np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-01)
+ #This test fails - likely due to compound rounding errors
+ #Has been tested using statsmodels.family calculations and
+ #code from Jing's python version, which both yield the same
+ #np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-05)
+
+ def test_BS_NN(self):
+ est_Int = self.BS_NN.by_col(' est_Intercept')
+ se_Int = self.BS_NN.by_col(' se_Intercept')
+ t_Int = self.BS_NN.by_col(' t_Intercept')
+ est_elev = self.BS_NN.by_col(' est_Elev')
+ se_elev = self.BS_NN.by_col(' se_Elev')
+ t_elev = self.BS_NN.by_col(' t_Elev')
+ est_slope = self.BS_NN.by_col(' est_Slope')
+ se_slope = self.BS_NN.by_col(' se_Slope')
+ t_slope = self.BS_NN.by_col(' t_Slope')
+ est_sin = self.BS_NN.by_col(' est_SinAspct')
+ se_sin = self.BS_NN.by_col(' se_SinAspct')
+ t_sin = self.BS_NN.by_col(' t_SinAspct')
+ est_cos = self.BS_NN.by_col(' est_CosAspct')
+ se_cos = self.BS_NN.by_col(' se_CosAspct')
+ t_cos = self.BS_NN.by_col(' t_CosAspct')
+ est_south = self.BS_NN.by_col(' est_AbsSouth')
+ se_south = self.BS_NN.by_col(' se_AbsSouth')
+ t_south = self.BS_NN.by_col(' t_AbsSouth')
+ est_strm = self.BS_NN.by_col(' est_DistStrm')
+ se_strm = self.BS_NN.by_col(' se_DistStrm')
+ t_strm = self.BS_NN.by_col(' t_DistStrm')
+ yhat = self.BS_NN.by_col(' yhat')
+ pdev = self.BS_NN.by_col(' localpdev')
+
+ model = GWR(self.coords, self.y, self.X, bw=158, family=Binomial(),
+ kernel='bisquare', fixed=False)
+ rslt = model.fit()
+
+ AICc = get_AICc(rslt)
+ AIC = get_AIC(rslt)
+ BIC = get_BIC(rslt)
+
+ self.assertAlmostEquals(np.floor(AICc), 277.0)
+ self.assertAlmostEquals(np.floor(AIC), 271.0)
+ self.assertAlmostEquals(np.floor(BIC), 358.0)
+ np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00)
+ np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00)
+ np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00)
+ np.testing.assert_allclose(est_elev, rslt.params[:,1], rtol=1e-00)
+ np.testing.assert_allclose(se_elev, rslt.bse[:,1], rtol=1e-00)
+ np.testing.assert_allclose(t_elev, rslt.tvalues[:,1], rtol=1e-00)
+ np.testing.assert_allclose(est_slope, rslt.params[:,2], rtol=1e-00)
+ np.testing.assert_allclose(se_slope, rslt.bse[:,2], rtol=1e-00)
+ np.testing.assert_allclose(t_slope, rslt.tvalues[:,2], rtol=1e-00)
+ np.testing.assert_allclose(est_sin, rslt.params[:,3], rtol=1e01)
+ np.testing.assert_allclose(se_sin, rslt.bse[:,3], rtol=1e01)
+ np.testing.assert_allclose(t_sin, rslt.tvalues[:,3], rtol=1e01)
+ np.testing.assert_allclose(est_cos, rslt.params[:,4], rtol=1e01)
+ np.testing.assert_allclose(se_cos, rslt.bse[:,4], rtol=1e01)
+ np.testing.assert_allclose(t_cos, rslt.tvalues[:,4], rtol=1e01)
+ np.testing.assert_allclose(est_south, rslt.params[:,5], rtol=1e01)
+ np.testing.assert_allclose(se_south, rslt.bse[:,5], rtol=1e01)
+ np.testing.assert_allclose(t_south, rslt.tvalues[:,5], rtol=1e01)
+ np.testing.assert_allclose(est_strm, rslt.params[:,6], rtol=1e03)
+ np.testing.assert_allclose(se_strm, rslt.bse[:,6], rtol=1e01)
+ np.testing.assert_allclose(t_strm, rslt.tvalues[:,6], rtol=1e03)
+ np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-01)
+ #This test fails - likely due to compound rounding errors
+ #Has been tested using statsmodels.family calculations and
+ #code from Jing's python version, which both yield the same
+ #np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-05)
+
+ def test_GS_F(self):
+ est_Int = self.GS_F.by_col(' est_Intercept')
+ se_Int = self.GS_F.by_col(' se_Intercept')
+ t_Int = self.GS_F.by_col(' t_Intercept')
+ est_elev = self.GS_F.by_col(' est_Elev')
+ se_elev = self.GS_F.by_col(' se_Elev')
+ t_elev = self.GS_F.by_col(' t_Elev')
+ est_slope = self.GS_F.by_col(' est_Slope')
+ se_slope = self.GS_F.by_col(' se_Slope')
+ t_slope = self.GS_F.by_col(' t_Slope')
+ est_sin = self.GS_F.by_col(' est_SinAspct')
+ se_sin = self.GS_F.by_col(' se_SinAspct')
+ t_sin = self.GS_F.by_col(' t_SinAspct')
+ est_cos = self.GS_F.by_col(' est_CosAspct')
+ se_cos = self.GS_F.by_col(' se_CosAspct')
+ t_cos = self.GS_F.by_col(' t_CosAspct')
+ est_south = self.GS_F.by_col(' est_AbsSouth')
+ se_south = self.GS_F.by_col(' se_AbsSouth')
+ t_south = self.GS_F.by_col(' t_AbsSouth')
+ est_strm = self.GS_F.by_col(' est_DistStrm')
+ se_strm = self.GS_F.by_col(' se_DistStrm')
+ t_strm = self.GS_F.by_col(' t_DistStrm')
+ yhat = self.GS_F.by_col(' yhat')
+ pdev = self.GS_F.by_col(' localpdev')
+
+ model = GWR(self.coords, self.y, self.X, bw=8929.061, family=Binomial(),
+ kernel='gaussian', fixed=True)
+ rslt = model.fit()
+
+ AICc = get_AICc(rslt)
+ AIC = get_AIC(rslt)
+ BIC = get_BIC(rslt)
+
+ self.assertAlmostEquals(np.floor(AICc), 276.0)
+ self.assertAlmostEquals(np.floor(AIC), 272.0)
+ self.assertAlmostEquals(np.floor(BIC), 341.0)
+ np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00)
+ np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00)
+ np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00)
+ np.testing.assert_allclose(est_elev, rslt.params[:,1], rtol=1e-00)
+ np.testing.assert_allclose(se_elev, rslt.bse[:,1], rtol=1e-00)
+ np.testing.assert_allclose(t_elev, rslt.tvalues[:,1], rtol=1e-00)
+ np.testing.assert_allclose(est_slope, rslt.params[:,2], rtol=1e-00)
+ np.testing.assert_allclose(se_slope, rslt.bse[:,2], rtol=1e-00)
+ np.testing.assert_allclose(t_slope, rslt.tvalues[:,2], rtol=1e-00)
+ np.testing.assert_allclose(est_sin, rslt.params[:,3], rtol=1e01)
+ np.testing.assert_allclose(se_sin, rslt.bse[:,3], rtol=1e01)
+ np.testing.assert_allclose(t_sin, rslt.tvalues[:,3], rtol=1e01)
+ np.testing.assert_allclose(est_cos, rslt.params[:,4], rtol=1e01)
+ np.testing.assert_allclose(se_cos, rslt.bse[:,4], rtol=1e01)
+ np.testing.assert_allclose(t_cos, rslt.tvalues[:,4], rtol=1e01)
+ np.testing.assert_allclose(est_south, rslt.params[:,5], rtol=1e01)
+ np.testing.assert_allclose(se_south, rslt.bse[:,5], rtol=1e01)
+ np.testing.assert_allclose(t_south, rslt.tvalues[:,5], rtol=1e01)
+ np.testing.assert_allclose(est_strm, rslt.params[:,6], rtol=1e02)
+ np.testing.assert_allclose(se_strm, rslt.bse[:,6], rtol=1e01)
+ np.testing.assert_allclose(t_strm, rslt.tvalues[:,6], rtol=1e02)
+ np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-01)
+ #This test fails - likely due to compound rounding errors
+ #Has been tested using statsmodels.family calculations and
+ #code from Jing's python version, which both yield the same
+ #np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-05)
+
+ def test_GS_NN(self):
+ est_Int = self.GS_NN.by_col(' est_Intercept')
+ se_Int = self.GS_NN.by_col(' se_Intercept')
+ t_Int = self.GS_NN.by_col(' t_Intercept')
+ est_elev = self.GS_NN.by_col(' est_Elev')
+ se_elev = self.GS_NN.by_col(' se_Elev')
+ t_elev = self.GS_NN.by_col(' t_Elev')
+ est_slope = self.GS_NN.by_col(' est_Slope')
+ se_slope = self.GS_NN.by_col(' se_Slope')
+ t_slope = self.GS_NN.by_col(' t_Slope')
+ est_sin = self.GS_NN.by_col(' est_SinAspct')
+ se_sin = self.GS_NN.by_col(' se_SinAspct')
+ t_sin = self.GS_NN.by_col(' t_SinAspct')
+ est_cos = self.GS_NN.by_col(' est_CosAspct')
+ se_cos = self.GS_NN.by_col(' se_CosAspct')
+ t_cos = self.GS_NN.by_col(' t_CosAspct')
+ est_south = self.GS_NN.by_col(' est_AbsSouth')
+ se_south = self.GS_NN.by_col(' se_AbsSouth')
+ t_south = self.GS_NN.by_col(' t_AbsSouth')
+ est_strm = self.GS_NN.by_col(' est_DistStrm')
+ se_strm = self.GS_NN.by_col(' se_DistStrm')
+ t_strm = self.GS_NN.by_col(' t_DistStrm')
+ yhat = self.GS_NN.by_col(' yhat')
+ pdev = self.GS_NN.by_col(' localpdev')
+
+ model = GWR(self.coords, self.y, self.X, bw=64, family=Binomial(),
+ kernel='gaussian', fixed=False)
+ rslt = model.fit()
+
+ AICc = get_AICc(rslt)
+ AIC = get_AIC(rslt)
+ BIC = get_BIC(rslt)
+
+ self.assertAlmostEquals(np.floor(AICc), 276.0)
+ self.assertAlmostEquals(np.floor(AIC), 273.0)
+ self.assertAlmostEquals(np.floor(BIC), 331.0)
+ np.testing.assert_allclose(est_Int, rslt.params[:,0], rtol=1e-00)
+ np.testing.assert_allclose(se_Int, rslt.bse[:,0], rtol=1e-00)
+ np.testing.assert_allclose(t_Int, rslt.tvalues[:,0], rtol=1e-00)
+ np.testing.assert_allclose(est_elev, rslt.params[:,1], rtol=1e-00)
+ np.testing.assert_allclose(se_elev, rslt.bse[:,1], rtol=1e-00)
+ np.testing.assert_allclose(t_elev, rslt.tvalues[:,1], rtol=1e-00)
+ np.testing.assert_allclose(est_slope, rslt.params[:,2], rtol=1e-00)
+ np.testing.assert_allclose(se_slope, rslt.bse[:,2], rtol=1e-00)
+ np.testing.assert_allclose(t_slope, rslt.tvalues[:,2], rtol=1e-00)
+ np.testing.assert_allclose(est_sin, rslt.params[:,3], rtol=1e01)
+ np.testing.assert_allclose(se_sin, rslt.bse[:,3], rtol=1e01)
+ np.testing.assert_allclose(t_sin, rslt.tvalues[:,3], rtol=1e01)
+ np.testing.assert_allclose(est_cos, rslt.params[:,4], rtol=1e01)
+ np.testing.assert_allclose(se_cos, rslt.bse[:,4], rtol=1e01)
+ np.testing.assert_allclose(t_cos, rslt.tvalues[:,4], rtol=1e01)
+ np.testing.assert_allclose(est_south, rslt.params[:,5], rtol=1e01)
+ np.testing.assert_allclose(se_south, rslt.bse[:,5], rtol=1e01)
+ np.testing.assert_allclose(t_south, rslt.tvalues[:,5], rtol=1e01)
+ np.testing.assert_allclose(est_strm, rslt.params[:,6], rtol=1e02)
+ np.testing.assert_allclose(se_strm, rslt.bse[:,6], rtol=1e01)
+ np.testing.assert_allclose(t_strm, rslt.tvalues[:,6], rtol=1e02)
+ np.testing.assert_allclose(yhat, rslt.mu, rtol=1e-00)
+ #This test fails - likely due to compound rounding errors
+ #Has been tested using statsmodels.family calculations and
+ #code from Jing's python version, which both yield the same
+ #np.testing.assert_allclose(pdev, rslt.pDev, rtol=1e-05)
+
+if __name__ == '__main__':
+    unittest.main()  # run this module's tests when executed as a script
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/tests/test_kernels.py b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/tests/test_kernels.py
new file mode 100644
index 0000000..ea044b9
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/tests/test_kernels.py
@@ -0,0 +1,84 @@
+import unittest
+import numpy as np
+import pysal
+from pysal.contrib.gwr.kernels import *
+
+PEGP = pysal.examples.get_path  # short alias; not referenced in this module
+
+class TestKernels(unittest.TestCase):
+ def setUp(self):
+ np.random.seed(1234)
+ x = np.arange(1,6)
+ y = np.arange(5,0, -1)
+ np.random.shuffle(x)
+ np.random.shuffle(y)
+ self.coords = np.array(zip(x, y))
+ self.fix_gauss_kern = np.array([
+ [ 1. , 0.38889556, 0.48567179, 0.48567179, 0.89483932],
+ [ 0.38889556, 1. , 0.89483932, 0.64118039, 0.48567179],
+ [ 0.48567179, 0.89483932, 1. , 0.89483932, 0.48567179],
+ [ 0.48567179, 0.64118039, 0.89483932, 1. , 0.38889556],
+ [ 0.89483932, 0.48567179, 0.48567179, 0.38889556, 1. ]])
+ self.adapt_gauss_kern = np.array([
+ [ 1. , 0.52004183, 0.60653072, 0.60653072, 0.92596109],
+ [ 0.34559083, 1. , 0.88249692, 0.60653072, 0.44374738],
+ [ 0.03877423, 0.60653072, 1. , 0.60653072, 0.03877423],
+ [ 0.44374738, 0.60653072, 0.88249692, 1. , 0.34559083],
+ [ 0.92596109, 0.60653072, 0.60653072, 0.52004183, 1. ]])
+ self.fix_bisquare_kern = np.array([
+ [ 1. , 0. , 0. , 0. , 0.60493827],
+ [ 0. , 1. , 0.60493827, 0.01234568, 0. ],
+ [ 0. , 0.60493827, 1. , 0.60493827, 0. ],
+ [ 0. , 0.01234568, 0.60493827, 1. , 0. ],
+ [ 0.60493827, 0. , 0. , 0. , 1. ]])
+ self.adapt_bisquare_kern = np.array([
+ [ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
+ 3.99999881e-14, 7.15976383e-01],
+ [ 0.00000000e+00, 1.00000000e+00, 5.62500075e-01,
+ 3.99999881e-14, 0.00000000e+00],
+ [ 0.00000000e+00, 3.99999881e-14, 1.00000000e+00,
+ 3.99999881e-14, 0.00000000e+00],
+ [ 0.00000000e+00, 3.99999881e-14, 5.62500075e-01,
+ 1.00000000e+00, 0.00000000e+00],
+ [ 7.15976383e-01, 0.00000000e+00, 3.99999881e-14,
+ 0.00000000e+00, 1.00000000e+00]])
+ self.fix_exp_kern = np.array([
+ [ 1. , 0.2529993 , 0.30063739, 0.30063739, 0.62412506],
+ [ 0.2529993 , 1. , 0.62412506, 0.38953209, 0.30063739],
+ [ 0.30063739, 0.62412506, 1. , 0.62412506, 0.30063739],
+ [ 0.30063739, 0.38953209, 0.62412506, 1. , 0.2529993 ],
+ [ 0.62412506, 0.30063739, 0.30063739, 0.2529993 , 1. ]])
+ self.adapt_exp_kern = np.array([
+ [ 1. , 0.31868771, 0.36787948, 0.36787948, 0.67554721],
+ [ 0.23276223, 1. , 0.60653069, 0.36787948, 0.27949951],
+ [ 0.07811997, 0.36787948, 1. , 0.36787948, 0.07811997],
+ [ 0.27949951, 0.36787948, 0.60653069, 1. , 0.23276223],
+ [ 0.67554721, 0.36787948, 0.36787948, 0.31868771, 1. ]])
+
+ def test_fix_gauss(self):
+ kern = fix_gauss(self.coords, 3)
+ np.testing.assert_allclose(kern, self.fix_gauss_kern)
+
+ def test_adapt_gauss(self):
+ kern = adapt_gauss(self.coords, 3)
+ np.testing.assert_allclose(kern, self.adapt_gauss_kern)
+
+ def test_fix_biqsquare(self):
+ kern = fix_bisquare(self.coords, 3)
+ np.testing.assert_allclose(kern, self.fix_bisquare_kern,
+ atol=1e-01)
+
+ def test_adapt_bisqaure(self):
+ kern = adapt_bisquare(self.coords, 3)
+ np.testing.assert_allclose(kern, self.adapt_bisquare_kern, atol=1e-012)
+
+ def test_fix_exp(self):
+ kern = fix_exp(self.coords, 3)
+ np.testing.assert_allclose(kern, self.fix_exp_kern)
+
+ def test_adapt_exp(self):
+ kern = adapt_exp(self.coords, 3)
+ np.testing.assert_allclose(kern, self.adapt_exp_kern)
+
+if __name__ == '__main__':
+    unittest.main()  # run this module's tests when executed as a script
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/tests/test_sel_bw.py b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/tests/test_sel_bw.py
new file mode 100644
index 0000000..47c6d9d
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr/base/tests/test_sel_bw.py
@@ -0,0 +1,139 @@
+
+"""
+Bandwidth selection (Sel_BW) for GWR is tested against results from GWR4
+"""
+
+import unittest
+import pickle as pk
+from pysal.contrib.glm.family import Gaussian, Poisson, Binomial
+from pysal.contrib.gwr.sel_bw import Sel_BW
+import numpy as np
+import pysal
+
+class TestSelBW(unittest.TestCase):
+ def setUp(self):
+ data = pysal.open(pysal.examples.get_path('GData_utm.csv'))
+ self.coords = zip(data.by_col('X'), data.by_col('Y'))
+ self.y = np.array(data.by_col('PctBach')).reshape((-1,1))
+ rural = np.array(data.by_col('PctRural')).reshape((-1,1))
+ pov = np.array(data.by_col('PctPov')).reshape((-1,1))
+ black = np.array(data.by_col('PctBlack')).reshape((-1,1))
+ self.X = np.hstack([rural, pov, black])
+ self.XB = pk.load(open(pysal.examples.get_path('XB.p'), 'r'))
+ self.err = pk.load(open(pysal.examples.get_path('err.p'), 'r'))
+
+ def test_golden_fixed_AICc(self):
+ bw1 = 211027.34
+ bw2 = Sel_BW(self.coords, self.y, self.X, kernel='bisquare',
+ fixed=True).search(criterion='AICc')
+ self.assertAlmostEqual(bw1, bw2)
+
+ def test_golden_adapt_AICc(self):
+ bw1 = 93.0
+ bw2 = Sel_BW(self.coords, self.y, self.X, kernel='bisquare',
+ fixed=False).search(criterion='AICc')
+ self.assertAlmostEqual(bw1, bw2)
+
+ def test_golden_fixed_AIC(self):
+ bw1 = 76169.15
+ bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian',
+ fixed=True).search(criterion='AIC')
+ self.assertAlmostEqual(bw1, bw2)
+
+ def test_golden_adapt_AIC(self):
+ bw1 = 50.0
+ bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian',
+ fixed=False).search(criterion='AIC')
+ self.assertAlmostEqual(bw1, bw2)
+
+ def test_golden_fixed_BIC(self):
+ bw1 = 279451.43
+ bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian',
+ fixed=True).search(criterion='BIC')
+ self.assertAlmostEqual(bw1, bw2)
+
+ def test_golden_adapt_BIC(self):
+ bw1 = 62.0
+ bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian',
+ fixed=False).search(criterion='BIC')
+ self.assertAlmostEqual(bw1, bw2)
+
+ def test_golden_fixed_CV(self):
+ bw1 = 130406.67
+ bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian',
+ fixed=True).search(criterion='CV')
+ self.assertAlmostEqual(bw1, bw2)
+
+ def test_golden_adapt_CV(self):
+ bw1 = 68.0
+ bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian',
+ fixed=False).search(criterion='CV')
+ self.assertAlmostEqual(bw1, bw2)
+
+ def test_interval_fixed_AICc(self):
+ bw1 = 211025.0#211027.00
+ bw2 = Sel_BW(self.coords, self.y, self.X, kernel='bisquare',
+ fixed=True).search(criterion='AICc', search='interval', bw_min=211001.,
+ bw_max=211035.0, interval=2)
+ self.assertAlmostEqual(bw1, bw2)
+
+ def test_interval_adapt_AICc(self):
+ bw1 = 93.0
+ bw2 = Sel_BW(self.coords, self.y, self.X, kernel='bisquare',
+ fixed=False).search(criterion='AICc', search='interval',
+ bw_min=90.0, bw_max=95.0, interval=1)
+ self.assertAlmostEqual(bw1, bw2)
+
+ def test_interval_fixed_AIC(self):
+ bw1 = 76175.0#76169.00
+ bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian',
+ fixed=True).search(criterion='AIC', search='interval',
+ bw_min=76161.0, bw_max=76175.0, interval=1)
+ self.assertAlmostEqual(bw1, bw2)
+
+ def test_interval_adapt_AIC(self):
+ bw1 = 40.0#50.0
+ bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian',
+ fixed=False).search(criterion='AIC', search='interval', bw_min=40.0,
+ bw_max=60.0, interval=2)
+ self.assertAlmostEqual(bw1, bw2)
+
+ def test_interval_fixed_BIC(self):
+ bw1 = 279461.0#279451.00
+ bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian',
+ fixed=True).search(criterion='BIC', search='interval', bw_min=279441.0,
+ bw_max=279461.0, interval=2)
+ self.assertAlmostEqual(bw1, bw2)
+
+ def test_interval_adapt_BIC(self):
+ bw1 = 62.0
+ bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian',
+ fixed=False).search(criterion='BIC', search='interval',
+ bw_min=52.0, bw_max=72.0, interval=2)
+ self.assertAlmostEqual(bw1, bw2)
+
+ def test_interval_fixed_CV(self):
+ bw1 = 130400.0#130406.00
+ bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian',
+ fixed=True).search(criterion='CV', search='interval', bw_min=130400.0,
+ bw_max=130410.0, interval=1)
+ self.assertAlmostEqual(bw1, bw2)
+
+ def test_interval_adapt_CV(self):
+ bw1 = 62.0#68.0
+ bw2 = Sel_BW(self.coords, self.y, self.X, kernel='gaussian',
+ fixed=False).search(criterion='CV', search='interval', bw_min=60.0,
+ bw_max=76.0 , interval=2)
+ self.assertAlmostEqual(bw1, bw2)
+
+ def test_FBGWR_AIC(self):
+ bw1 = [157.0, 65.0, 52.0]
+ sel = Sel_BW(self.coords, self.y, self.X, fb=True, kernel='bisquare',
+ constant=False)
+ bw2 = sel.search(tol_fb=1e-03)
+ np.testing.assert_allclose(bw1, bw2)
+ np.testing.assert_allclose(sel.XB, self.XB, atol=1e-05)
+ np.testing.assert_allclose(sel.err, self.err, atol=1e-05)
+
+if __name__ == '__main__':
+    unittest.main()  # run this module's tests when executed as a script
diff --git a/release/python/0.8.0/crankshaft/crankshaft/regression/gwr_cs.py b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr_cs.py
new file mode 100644
index 0000000..9ccaefb
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/regression/gwr_cs.py
@@ -0,0 +1,202 @@
+"""
+ Geographically weighted regression
+"""
+import numpy as np
+from gwr.base.gwr import GWR as PySAL_GWR
+from gwr.base.sel_bw import Sel_BW
+import json
+from crankshaft.analysis_data_provider import AnalysisDataProvider
+import plpy
+
+
+class GWR:
+ def __init__(self, data_provider=None):
+ if data_provider:
+ self.data_provider = data_provider
+ else:
+ self.data_provider = AnalysisDataProvider()
+
+ def gwr(self, subquery, dep_var, ind_vars,
+ bw=None, fixed=False, kernel='bisquare',
+ geom_col='the_geom', id_col='cartodb_id'):
+ """
+ subquery: 'select * from demographics'
+ dep_var: 'pctbachelor'
+ ind_vars: ['intercept', 'pctpov', 'pctrural', 'pctblack']
+ bw: value of bandwidth, if None then select optimal
+ fixed: False (kNN) or True ('distance')
+ kernel: 'bisquare' (default), or 'exponential', 'gaussian'
+ """
+
+ params = {'geom_col': geom_col,
+ 'id_col': id_col,
+ 'subquery': subquery,
+ 'dep_var': dep_var,
+ 'ind_vars': ind_vars}
+
+ # get data from data provider
+ query_result = self.data_provider.get_gwr(params)
+
+ # exit if data to analyze is empty
+ if len(query_result) == 0:
+ plpy.error('No data passed to analysis or independent variables '
+ 'are all null-valued')
+
+ # unique ids and variable names list
+ rowid = np.array(query_result[0]['rowid'], dtype=np.int)
+
+ # x, y are centroids of input geometries
+ x = np.array(query_result[0]['x'], dtype=np.float)
+ y = np.array(query_result[0]['y'], dtype=np.float)
+ coords = zip(x, y)
+
+ # extract dependent variable
+ Y = np.array(query_result[0]['dep_var'], dtype=np.float).reshape((-1, 1))
+
+ n = Y.shape[0]
+ k = len(ind_vars)
+ X = np.zeros((n, k))
+
+ # extract query result
+ for attr in range(0, k):
+ attr_name = 'attr' + str(attr + 1)
+ X[:, attr] = np.array(
+ query_result[0][attr_name], dtype=np.float).flatten()
+
+ # add intercept variable name
+ ind_vars.insert(0, 'intercept')
+
+ # calculate bandwidth if none is supplied
+ if bw is None:
+ bw = Sel_BW(coords, Y, X,
+ fixed=fixed, kernel=kernel).search()
+ model = PySAL_GWR(coords, Y, X, bw,
+ fixed=fixed, kernel=kernel).fit()
+
+ # containers for outputs
+ coeffs = []
+ stand_errs = []
+ t_vals = []
+ filtered_t_vals = []
+
+ # extracted model information
+ c_alpha = model.adj_alpha
+ filtered_t = model.filter_tvals(c_alpha[1])
+ predicted = model.predy.flatten()
+ residuals = model.resid_response
+ r_squared = model.localR2.flatten()
+ bw = np.repeat(float(bw), n)
+
+ # create lists of json objs for model outputs
+ for idx in xrange(n):
+ coeffs.append(json.dumps({var: model.params[idx, k]
+ for k, var in enumerate(ind_vars)}))
+ stand_errs.append(json.dumps({var: model.bse[idx, k]
+ for k, var in enumerate(ind_vars)}))
+ t_vals.append(json.dumps({var: model.tvalues[idx, k]
+ for k, var in enumerate(ind_vars)}))
+ filtered_t_vals.append(
+ json.dumps({var: filtered_t[idx, k]
+ for k, var in enumerate(ind_vars)}))
+
+ return zip(coeffs, stand_errs, t_vals, filtered_t_vals,
+ predicted, residuals, r_squared, bw, rowid)
+
+ def gwr_predict(self, subquery, dep_var, ind_vars,
+ bw=None, fixed=False, kernel='bisquare',
+ geom_col='the_geom', id_col='cartodb_id'):
+ """
+ subquery: 'select * from demographics'
+ dep_var: 'pctbachelor'
+ ind_vars: ['intercept', 'pctpov', 'pctrural', 'pctblack']
+ bw: value of bandwidth, if None then select optimal
+ fixed: False (kNN) or True ('distance')
+ kernel: 'bisquare' (default), or 'exponential', 'gaussian'
+ """
+
+ params = {'geom_col': geom_col,
+ 'id_col': id_col,
+ 'subquery': subquery,
+ 'dep_var': dep_var,
+ 'ind_vars': ind_vars}
+
+ # get data from data provider
+ query_result = self.data_provider.get_gwr_predict(params)
+
+ # exit if data to analyze is empty
+ if len(query_result) == 0:
+ plpy.error('No data passed to analysis or independent variables '
+ 'are all null-valued')
+
+ # unique ids and variable names list
+ rowid = np.array(query_result[0]['rowid'], dtype=np.int)
+
+ x = np.array(query_result[0]['x'], dtype=np.float)
+ y = np.array(query_result[0]['y'], dtype=np.float)
+ coords = np.array(zip(x, y), dtype=np.float)
+
+ # extract dependent variable
+ Y = np.array(query_result[0]['dep_var']).reshape((-1, 1))
+
+ n = Y.shape[0]
+ k = len(ind_vars)
+ X = np.empty((n, k), dtype=np.float)
+
+ for attr in range(0, k):
+ attr_name = 'attr' + str(attr + 1)
+ X[:, attr] = np.array(
+ query_result[0][attr_name], dtype=np.float).flatten()
+
+ # add intercept variable name
+ ind_vars.insert(0, 'intercept')
+
+ # split data into "training" and "test" for predictions
+ # create index to split based on null y values
+ train = np.where(Y != np.array(None))[0]
+ test = np.where(Y == np.array(None))[0]
+
+ # report error if there is no data to predict
+ if len(test) < 1:
+ plpy.error('No rows flagged for prediction: verify that rows '
+ 'denoting prediction locations have a dependent '
+ 'variable value of `null`')
+
+ # split dependent variable (only need training which is non-Null's)
+ Y_train = Y[train].reshape((-1, 1))
+ Y_train = Y_train.astype(np.float)
+
+ # split coords
+ coords_train = coords[train]
+ coords_test = coords[test]
+
+ # split explanatory variables
+ X_train = X[train]
+ X_test = X[test]
+
+ # calculate bandwidth if none is supplied
+ if bw is None:
+ bw = Sel_BW(coords_train, Y_train, X_train,
+ fixed=fixed, kernel=kernel).search()
+
+ # estimate model and predict at new locations
+ model = PySAL_GWR(coords_train, Y_train, X_train,
+ bw, fixed=fixed,
+ kernel=kernel).predict(coords_test, X_test)
+
+ coeffs = []
+ stand_errs = []
+ t_vals = []
+ r_squared = model.localR2.flatten()
+ predicted = model.predy.flatten()
+
+ m = len(model.predy)
+ for idx in xrange(m):
+ coeffs.append(json.dumps({var: model.params[idx, k]
+ for k, var in enumerate(ind_vars)}))
+ stand_errs.append(json.dumps({var: model.bse[idx, k]
+ for k, var in enumerate(ind_vars)}))
+ t_vals.append(json.dumps({var: model.tvalues[idx, k]
+ for k, var in enumerate(ind_vars)}))
+
+ return zip(coeffs, stand_errs, t_vals,
+ r_squared, predicted, rowid[test])
diff --git a/release/python/0.8.0/crankshaft/crankshaft/segmentation/__init__.py b/release/python/0.8.0/crankshaft/crankshaft/segmentation/__init__.py
new file mode 100644
index 0000000..b825e85
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/segmentation/__init__.py
@@ -0,0 +1 @@
+from segmentation import *
diff --git a/release/python/0.8.0/crankshaft/crankshaft/segmentation/segmentation.py b/release/python/0.8.0/crankshaft/crankshaft/segmentation/segmentation.py
new file mode 100644
index 0000000..ed61139
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/segmentation/segmentation.py
@@ -0,0 +1,176 @@
+"""
+Segmentation creation and prediction
+"""
+
+import sklearn
+import numpy as np
+import plpy
+from sklearn.ensemble import GradientBoostingRegressor
+from sklearn import metrics
+from sklearn.cross_validation import train_test_split
+
+# Lower level functions
+#----------------------
+
+def replace_nan_with_mean(array):
+ """
+ Input:
+ @param array: an array of floats which may have null-valued entries
+ Output:
+ array with nans filled in with the mean of the dataset
+ """
+ # returns an array of rows and column indices
+ indices = np.where(np.isnan(array))
+
+ # iterate through entries which have nan values
+ for row, col in zip(*indices):
+ array[row, col] = np.mean(array[~np.isnan(array[:, col]), col])
+
+ return array
+
+def get_data(variable, feature_columns, query):
+ """
+ Fetch data from the database, clean, and package into
+ numpy arrays
+ Input:
+ @param variable: name of the target variable
+ @param feature_columns: list of column names
+ @param query: subquery that data is pulled from for the packaging
+ Output:
+ prepared data, packaged into NumPy arrays
+ """
+
+ columns = ','.join(['array_agg("{col}") As "{col}"'.format(col=col) for col in feature_columns])
+
+ try:
+ data = plpy.execute('''SELECT array_agg("{variable}") As target, {columns} FROM ({query}) As a'''.format(
+ variable=variable,
+ columns=columns,
+ query=query))
+ except Exception, e:
+ plpy.error('Failed to access data to build segmentation model: %s' % e)
+
+ # extract target data from plpy object
+ target = np.array(data[0]['target'])
+
+ # put n feature data arrays into an n x m array of arrays
+ features = np.column_stack([np.array(data[0][col], dtype=float) for col in feature_columns])
+
+ return replace_nan_with_mean(target), replace_nan_with_mean(features)
+
+# High level interface
+# --------------------
+
+def create_and_predict_segment_agg(target, features, target_features, target_ids, model_parameters):
+ """
+ Version of create_and_predict_segment that works on arrays that come stright form the SQL calling
+ the function.
+
+ Input:
+ @param target: The 1D array of lenth NSamples containing the target variable we want the model to predict
+ @param features: Thw 2D array of size NSamples * NFeatures that form the imput to the model
+ @param target_ids: A 1D array of target_ids that will be used to associate the results of the prediction with the rows which they come from
+ @param model_parameters: A dictionary containing parameters for the model.
+ """
+
+ clean_target = replace_nan_with_mean(target)
+ clean_features = replace_nan_with_mean(features)
+ target_features = replace_nan_with_mean(target_features)
+
+ model, accuracy = train_model(clean_target, clean_features, model_parameters, 0.2)
+ prediction = model.predict(target_features)
+ accuracy_array = [accuracy]*prediction.shape[0]
+ return zip(target_ids, prediction, np.full(prediction.shape, accuracy_array))
+
+
+
+def create_and_predict_segment(query, variable, target_query, model_params):
+ """
+ generate a segment with machine learning
+ Stuart Lynn
+ """
+
+ ## fetch column names
+ try:
+ columns = plpy.execute('SELECT * FROM ({query}) As a LIMIT 1 '.format(query=query))[0].keys()
+ except Exception, e:
+ plpy.error('Failed to build segmentation model: %s' % e)
+
+ ## extract column names to be used in building the segmentation model
+ feature_columns = set(columns) - set([variable, 'cartodb_id', 'the_geom', 'the_geom_webmercator'])
+ ## get data from database
+ target, features = get_data(variable, feature_columns, query)
+
+ model, accuracy = train_model(target, features, model_params, 0.2)
+ cartodb_ids, result = predict_segment(model, feature_columns, target_query)
+ accuracy_array = [accuracy]*result.shape[0]
+ return zip(cartodb_ids, result, accuracy_array)
+
+
+def train_model(target, features, model_params, test_split):
+ """
+ Train the Gradient Boosting model on the provided data and calculate the accuracy of the model
+ Input:
+ @param target: 1D Array of the variable that the model is to be trianed to predict
+ @param features: 2D Array NSamples * NFeatures to use in trining the model
+ @param model_params: A dictionary of model parameters, the full specification can be found on the
+ scikit learn page for [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
+ @parma test_split: The fraction of the data to be withheld for testing the model / calculating the accuray
+ """
+ features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=test_split)
+ model = GradientBoostingRegressor(**model_params)
+ model.fit(features_train, target_train)
+ accuracy = calculate_model_accuracy(model, features, target)
+ return model, accuracy
+
+def calculate_model_accuracy(model, features, target):
+ """
+ Calculate the mean squared error of the model prediction
+ Input:
+ @param model: model trained from input features
+ @param features: features to make a prediction from
+ @param target: target to compare prediction to
+ Output:
+ mean squared error of the model prection compared to the target
+ """
+ prediction = model.predict(features)
+ return metrics.mean_squared_error(prediction, target)
+
+def predict_segment(model, features, target_query):
+ """
+ Use the provided model to predict the values for the new feature set
+ Input:
+ @param model: The pretrained model
+ @features: A list of features to use in the model prediction (list of column names)
+ @target_query: The query to run to obtain the data to predict on and the cartdb_ids associated with it.
+ """
+
+ batch_size = 1000
+ joined_features = ','.join(['"{0}"::numeric'.format(a) for a in features])
+
+ try:
+ cursor = plpy.cursor('SELECT Array[{joined_features}] As features FROM ({target_query}) As a'.format(
+ joined_features=joined_features,
+ target_query=target_query))
+ except Exception, e:
+ plpy.error('Failed to build segmentation model: %s' % e)
+
+ results = []
+
+ while True:
+ rows = cursor.fetch(batch_size)
+ if not rows:
+ break
+ batch = np.row_stack([np.array(row['features'], dtype=float) for row in rows])
+
+ #Need to fix this. Should be global mean. This will cause weird effects
+ batch = replace_nan_with_mean(batch)
+ prediction = model.predict(batch)
+ results.append(prediction)
+
+ try:
+ cartodb_ids = plpy.execute('''SELECT array_agg(cartodb_id ORDER BY cartodb_id) As cartodb_ids FROM ({0}) As a'''.format(target_query))[0]['cartodb_ids']
+ except Exception, e:
+ plpy.error('Failed to build segmentation model: %s' % e)
+
+ return cartodb_ids, np.concatenate(results)
diff --git a/release/python/0.8.0/crankshaft/crankshaft/space_time_dynamics/__init__.py b/release/python/0.8.0/crankshaft/crankshaft/space_time_dynamics/__init__.py
new file mode 100644
index 0000000..a439286
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/space_time_dynamics/__init__.py
@@ -0,0 +1,2 @@
+"""Import all functions from clustering libraries."""
+from markov import *
diff --git a/release/python/0.8.0/crankshaft/crankshaft/space_time_dynamics/markov.py b/release/python/0.8.0/crankshaft/crankshaft/space_time_dynamics/markov.py
new file mode 100644
index 0000000..20daaf1
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/crankshaft/space_time_dynamics/markov.py
@@ -0,0 +1,194 @@
+"""
+Spatial dynamics measurements using Spatial Markov
+"""
+
+# TODO: remove all plpy dependencies
+
+import numpy as np
+import pysal as ps
+import plpy
+import crankshaft.pysal_utils as pu
+from crankshaft.analysis_data_provider import AnalysisDataProvider
+
+
+class Markov(object):
+ def __init__(self, data_provider=None):
+ if data_provider is None:
+ self.data_provider = AnalysisDataProvider()
+ else:
+ self.data_provider = data_provider
+
+ def spatial_trend(self, subquery, time_cols, num_classes=7,
+ w_type='knn', num_ngbrs=5, permutations=0,
+ geom_col='the_geom', id_col='cartodb_id'):
+ """
+ Predict the trends of a unit based on:
+ 1. history of its transitions to different classes (e.g., 1st
+ quantile -> 2nd quantile)
+ 2. average class of its neighbors
+
+ Inputs:
+ @param subquery string: e.g., SELECT the_geom, cartodb_id,
+ interesting_time_column FROM table_name
+ @param time_cols list of strings: list of strings of column names
+ @param num_classes (optional): number of classes to break
+ distribution of values into. Currently uses quantile bins.
+ @param w_type string (optional): weight type ('knn' or 'queen')
+ @param num_ngbrs int (optional): number of neighbors (if knn type)
+ @param permutations int (optional): number of permutations for test
+ stats
+ @param geom_col string (optional): name of column which contains
+ the geometries
+ @param id_col string (optional): name of column which has the ids
+ of the table
+
+ Outputs:
+ @param trend_up float: probablity that a geom will move to a higher
+ class
+ @param trend_down float: probablity that a geom will move to a
+ lower class
+ @param trend float: (trend_up - trend_down) / trend_static
+ @param volatility float: a measure of the volatility based on
+ probability stddev(prob array)
+ """
+
+ if len(time_cols) < 2:
+ plpy.error('More than one time column needs to be passed')
+
+ params = {"id_col": id_col,
+ "time_cols": time_cols,
+ "geom_col": geom_col,
+ "subquery": subquery,
+ "num_ngbrs": num_ngbrs}
+
+ result = self.data_provider.get_markov(w_type, params)
+
+ # build weight
+ weights = pu.get_weight(result, w_type)
+ weights.transform = 'r'
+
+ # prep time data
+ t_data = get_time_data(result, time_cols)
+
+ sp_markov_result = ps.Spatial_Markov(t_data,
+ weights,
+ k=num_classes,
+ fixed=False,
+ permutations=permutations)
+
+ # get lag classes
+ lag_classes = ps.Quantiles(
+ ps.lag_spatial(weights, t_data[:, -1]),
+ k=num_classes).yb
+
+ # look up probablity distribution for each unit according to class and
+ # lag class
+ prob_dist = get_prob_dist(sp_markov_result.P,
+ lag_classes,
+ sp_markov_result.classes[:, -1])
+
+ # find the ups and down and overall distribution of each cell
+ trend_up, trend_down, trend, volatility = get_prob_stats(prob_dist, sp_markov_result.classes[:, -1])
+
+ # output the results
+ return zip(trend, trend_up, trend_down, volatility, weights.id_order)
+
+
+
+def get_time_data(markov_data, time_cols):
+ """
+ Extract the time columns and bin appropriately
+ """
+ num_attrs = len(time_cols)
+ return np.array([[x['attr' + str(i)] for x in markov_data]
+ for i in range(1, num_attrs+1)], dtype=float).transpose()
+
+
+# not currently used
+def rebin_data(time_data, num_time_per_bin):
+ """
+ Convert an n x l matrix into an (n/m) x l matrix where the values are
+ reduced (averaged) for the intervening states:
+ 1 2 3 4 1.5 3.5
+ 5 6 7 8 -> 5.5 7.5
+ 9 8 7 6 8.5 6.5
+ 5 4 3 2 4.5 2.5
+
+ if m = 2, the 4 x 4 matrix is transformed to a 2 x 4 matrix.
+
+ This process effectively resamples the data at a longer time span n
+ units longer than the input data.
+ For cases when there is a remainder (remainder(5/3) = 2), the remaining
+ two columns are binned together as the last time period, while the
+ first three are binned together for the first period.
+
+ Input:
+ @param time_data n x l ndarray: measurements of an attribute at
+ different time intervals
+ @param num_time_per_bin int: number of columns to average into a new
+ column
+ Output:
+ ceil(n / m) x l ndarray of resampled time series
+ """
+
+ if time_data.shape[1] % num_time_per_bin == 0:
+ # if fit is perfect, then use it
+ n_max = time_data.shape[1] / num_time_per_bin
+ else:
+ # fit remainders into an additional column
+ n_max = time_data.shape[1] / num_time_per_bin + 1
+
+ return np.array(
+ [time_data[:, num_time_per_bin * i:num_time_per_bin * (i+1)].mean(axis=1)
+ for i in range(n_max)]).T
+
+
+def get_prob_dist(transition_matrix, lag_indices, unit_indices):
+ """
+ Given an array of transition matrices, look up the probability
+ associated with the arrangements passed
+
+ Input:
+ @param transition_matrix ndarray[k,k,k]:
+ @param lag_indices ndarray:
+ @param unit_indices ndarray:
+
+ Output:
+ Array of probability distributions
+ """
+
+ return np.array([transition_matrix[(lag_indices[i], unit_indices[i])]
+ for i in range(len(lag_indices))])
+
+
+def get_prob_stats(prob_dist, unit_indices):
+ """
+ get the statistics of the probability distributions
+
+ Outputs:
+ @param trend_up ndarray(float): sum of probabilities for upward
+ movement (relative to the unit index of that prob)
+ @param trend_down ndarray(float): sum of probabilities for downward
+ movement (relative to the unit index of that prob)
+ @param trend ndarray(float): difference of upward and downward
+ movements
+ """
+
+ num_elements = len(unit_indices)
+ trend_up = np.empty(num_elements, dtype=float)
+ trend_down = np.empty(num_elements, dtype=float)
+ trend = np.empty(num_elements, dtype=float)
+
+ for i in range(num_elements):
+ trend_up[i] = prob_dist[i, (unit_indices[i]+1):].sum()
+ trend_down[i] = prob_dist[i, :unit_indices[i]].sum()
+ if prob_dist[i, unit_indices[i]] > 0.0:
+ trend[i] = (trend_up[i] - trend_down[i]) / (
+ prob_dist[i, unit_indices[i]])
+ else:
+ trend[i] = None
+
+ # calculate volatility of distribution
+ volatility = prob_dist.std(axis=1)
+
+ return trend_up, trend_down, trend, volatility
diff --git a/release/python/0.8.0/crankshaft/requirements.txt b/release/python/0.8.0/crankshaft/requirements.txt
new file mode 100644
index 0000000..88c0a9e
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/requirements.txt
@@ -0,0 +1,5 @@
+joblib==0.8.3
+numpy==1.6.1
+scipy==0.14.0
+pysal==1.14.3
+scikit-learn==0.14.1
diff --git a/release/python/0.8.0/crankshaft/setup.py b/release/python/0.8.0/crankshaft/setup.py
new file mode 100644
index 0000000..ffbdc19
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/setup.py
@@ -0,0 +1,49 @@
+
+"""
+CartoDB Spatial Analysis Python Library
+See:
+https://github.com/CartoDB/crankshaft
+"""
+
+from setuptools import setup, find_packages
+
+# Register the crankshaft distribution: metadata, package discovery and
+# pinned runtime dependencies.
+setup(
+ name='crankshaft',
+
+ version='0.0.0',
+
+ description='CartoDB Spatial Analysis Python Library',
+
+ url='https://github.com/CartoDB/crankshaft',
+
+ author='Data Services Team - CartoDB',
+ author_email='dataservices@cartodb.com',
+
+ license='MIT',
+
+ classifiers=[
+ 'Development Status :: 3 - Alpha',
+ 'Intended Audience :: Mapping comunity',
+ 'Topic :: Maps :: Mapping Tools',
+ 'License :: OSI Approved :: MIT License',
+ 'Programming Language :: Python :: 2.7',
+ ],
+
+ keywords='maps mapping tools spatial analysis geostatistics',
+
+ # Discover packages automatically, skipping contrib, docs and tests.
+ packages=find_packages(exclude=['contrib', 'docs', 'tests']),
+
+ # Optional dependency groups (extras) named 'dev' and 'test'.
+ extras_require={
+ 'dev': ['unittest'],
+ 'test': ['unittest', 'nose', 'mock'],
+ },
+
+ # The choice of component versions is dictated by what's
+ # provisioned in the production servers.
+ # IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation.
+ install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.14.3', 'scikit-learn==0.14.1'],
+
+ # NOTE(review): `requires` repeats some dependency names without version
+ # pins; the pinned list is `install_requires` above.
+ requires=['pysal', 'numpy', 'sklearn'],
+
+ # Name handed to setuptools as the test suite.
+ test_suite='test'
+)
diff --git a/release/python/0.8.0/crankshaft/setup.py-r b/release/python/0.8.0/crankshaft/setup.py-r
new file mode 100644
index 0000000..ffbdc19
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/setup.py-r
@@ -0,0 +1,49 @@
+
+"""
+CartoDB Spatial Analysis Python Library
+See:
+https://github.com/CartoDB/crankshaft
+"""
+
+from setuptools import setup, find_packages
+
+# Register the crankshaft distribution: metadata, package discovery and
+# pinned runtime dependencies.
+setup(
+ name='crankshaft',
+
+ version='0.0.0',
+
+ description='CartoDB Spatial Analysis Python Library',
+
+ url='https://github.com/CartoDB/crankshaft',
+
+ author='Data Services Team - CartoDB',
+ author_email='dataservices@cartodb.com',
+
+ license='MIT',
+
+ classifiers=[
+ 'Development Status :: 3 - Alpha',
+ 'Intended Audience :: Mapping comunity',
+ 'Topic :: Maps :: Mapping Tools',
+ 'License :: OSI Approved :: MIT License',
+ 'Programming Language :: Python :: 2.7',
+ ],
+
+ keywords='maps mapping tools spatial analysis geostatistics',
+
+ # Discover packages automatically, skipping contrib, docs and tests.
+ packages=find_packages(exclude=['contrib', 'docs', 'tests']),
+
+ # Optional dependency groups (extras) named 'dev' and 'test'.
+ extras_require={
+ 'dev': ['unittest'],
+ 'test': ['unittest', 'nose', 'mock'],
+ },
+
+ # The choice of component versions is dictated by what's
+ # provisioned in the production servers.
+ # IMPORTANT NOTE: please don't change this line. Instead issue a ticket to systems for evaluation.
+ install_requires=['joblib==0.8.3', 'numpy==1.6.1', 'scipy==0.14.0', 'pysal==1.14.3', 'scikit-learn==0.14.1'],
+
+ # NOTE(review): `requires` repeats some dependency names without version
+ # pins; the pinned list is `install_requires` above.
+ requires=['pysal', 'numpy', 'sklearn'],
+
+ # Name handed to setuptools as the test suite.
+ test_suite='test'
+)
diff --git a/release/python/0.8.0/crankshaft/test/fixtures/getis.json b/release/python/0.8.0/crankshaft/test/fixtures/getis.json
new file mode 100644
index 0000000..02566fc
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/test/fixtures/getis.json
@@ -0,0 +1 @@
+[[0.004793783909323601, 0.17999999999999999, 0.49808756424021061], [-1.0701189472090842, 0.079000000000000001, 0.14228288580832316], [-0.67867750971877305, 0.42099999999999999, 0.24867110969448558], [-0.67407386707620487, 0.246, 0.25013217644612995], [-0.79495689068870035, 0.33200000000000002, 0.21331928959090596], [-0.49279481022182703, 0.058999999999999997, 0.31107878905057329], [-0.38075627530057132, 0.28399999999999997, 0.35169205342069643], [-0.86710921611314895, 0.23699999999999999, 0.19294108571294855], [-0.78618647240956485, 0.050000000000000003, 0.2158791250244505], [-0.76108527223116984, 0.064000000000000001, 0.22330306830813684], [-0.13340753531942209, 0.247, 0.44693554317763651], [-0.57584545722033043, 0.48999999999999999, 0.28235982246156488], [-0.78882694661192831, 0.433, 0.2151065788731219], [-0.38769767950046219, 0.375, 0.34911988661484239], [-0.56057819488052207, 0.41399999999999998, 0.28754255985169652], [-0.41354017495644935, 0.45500000000000002, 0.339605447117173], [-0.23993577722243081, 0.49099999999999999, 0.40519002230969337], [-0.1389080156677496, 0.40400000000000003, 0.44476141839645233], [-0.25485737510500855, 0.376, 0.39941662953554224], [-0.71218610582902353, 0.17399999999999999, 0.23817476979886087], [-0.54533105995872144, 0.13700000000000001, 0.2927629228714812], [-0.39547917847510977, 0.033000000000000002, 0.34624464252424236], [-0.43052658996257548, 0.35399999999999998, 0.33340631435564982], [-0.37296719193774736, 0.40300000000000002, 0.35458643102865428], [-0.66482612169465694, 0.31900000000000001, 0.25308085650392698], [-0.13772133540823422, 0.34699999999999998, 0.44523032843016275], [-0.6765304487868502, 0.20999999999999999, 0.24935196033890672], [-0.64518763494323472, 0.32200000000000001, 0.25940279912025543], [-0.5078622084312413, 0.41099999999999998, 0.30577498972600159], [-0.12652006733772059, 0.42899999999999999, 0.44966013262301163], [-0.32691133022814595, 0.498, 0.37186747562269029], [0.25533848511500978, 
0.42399999999999999, 0.39923083899077472], [2.7045138116476508, 0.0050000000000000001, 0.0034202212972238577], [-0.1551614486076057, 0.44400000000000001, 0.43834701985429037], [1.9524487722567723, 0.012999999999999999, 0.025442473674991528], [-1.2055816465306763, 0.017000000000000001, 0.11398941970467646], [3.478472976017831, 0.002, 0.00025213964072468009], [-1.4621715757903719, 0.002, 0.071847099325659136], [-0.84010307600180256, 0.085000000000000006, 0.20042529779230778], [5.7097646237318243, 0.0030000000000000001, 5.6566262784940591e-09], [1.5082367956567375, 0.065000000000000002, 0.065746966514827365], [-0.58337270103430816, 0.44, 0.27982121546450034], [-0.083271860457022437, 0.45100000000000001, 0.46681768733385554], [-0.46872337815000953, 0.34599999999999997, 0.31963368715684204], [0.18490279849545319, 0.23799999999999999, 0.42665263797981101], [3.470424529947997, 0.012, 0.00025981817437825683], [-0.99942612137154796, 0.032000000000000001, 0.15879415560388499], [-1.3650387953594485, 0.034000000000000002, 0.08612042845912049], [1.8617160516432014, 0.081000000000000003, 0.03132156240215267], [1.1321188945775384, 0.11600000000000001, 0.12879222611766061], [0.064116686050580601, 0.27300000000000002, 0.4744386578180424], [-0.42032194540259099, 0.29999999999999999, 0.33712514016213468], [-0.79581215423980922, 0.123, 0.21307061309098785], [-0.42792753720906046, 0.45600000000000002, 0.33435193892883741], [-1.0629378527428395, 0.051999999999999998, 0.14390506780140866], [-0.54164761752225477, 0.33700000000000002, 0.29403064095211839], [1.0934778886820793, 0.13700000000000001, 0.13709201601893539], [-0.094068785378413719, 0.38200000000000001, 0.46252725802998929], [0.13482026574801856, 0.36799999999999999, 0.44637699118865737], [-0.13976995315653129, 0.34699999999999998, 0.44442087706276601], [-0.051047663924746682, 0.32000000000000001, 0.47964376985626245], [-0.21468297736730158, 0.41699999999999998, 0.41500724761906527], [-0.20873154637330626, 0.38800000000000001, 
0.41732890604390893], [-0.32427876152583485, 0.49199999999999999, 0.37286349875557478], [-0.65254842943280977, 0.374, 0.25702372075306734], [-0.48611858196118796, 0.23300000000000001, 0.31344154643990074], [-0.14482354344529477, 0.32600000000000001, 0.44242509660469886], [-0.51052030974200002, 0.439, 0.30484349480873729], [0.56814382285283538, 0.14999999999999999, 0.28496865660103166], [0.58680919931668207, 0.161, 0.27866592887231878], [0.013390357044409013, 0.25800000000000001, 0.49465818005865647], [-0.19050728887961568, 0.41399999999999998, 0.4244558160399462], [-0.60531777422216049, 0.35199999999999998, 0.2724839368239631], [1.0899331115425805, 0.127, 0.13787130480311838], [0.17015055382651084, 0.36899999999999999, 0.43244586845546418], [-0.21738337124409801, 0.40600000000000003, 0.41395479459421991], [1.0329303331079593, 0.079000000000000001, 0.15081825117169467], [1.0218317101096221, 0.104, 0.15343027913308094]]
diff --git a/release/python/0.8.0/crankshaft/test/fixtures/gwr_packed_data.json b/release/python/0.8.0/crankshaft/test/fixtures/gwr_packed_data.json
new file mode 100644
index 0000000..cbee3fb
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/test/fixtures/gwr_packed_data.json
@@ -0,0 +1 @@
+[{"x":[941396.6,895553,930946.4,745398.6,849431.3,819317.3,803747.1,699011.5,863020.8,859915.8,809736.9,844270.1,979288.9,827822,1023145,994903.4,971593.8,782448.2,724741.2,1008480,964264.9,678778.6,670055.9,962612.3,1059706,704959.2,653026.6,734240.9,832508.6,695793.9,745538.8,908046.1,724646.8,894463.9,808691.8,942527.9,839816.1,705457.9,783416.5,805648.4,635964.3,764386.1,732628.4,759231.9,860451.4,800031.3,764116.9,707288.7,703495.1,896654,1031899,879541.2,943066.2,981727.8,739255.8,731468.7,662257.4,765397.3,845701.3,733728.4,732702.3,908386.8,1023411,695325.1,765058.1,855577.3,772634.6,818917.1,794419.5,873518.8,665933.8,695500.6,870749.9,675280.4,763488.4,814118.9,855461.8,815753.1,807249.1,915741.9,924108.1,970465.7,908636.7,821367.1,766461.7,873804.3,884830.4,770455.5,1014742,919396.5,1004544,864781.1,772600,917730.9,1030500,777055.3,848638.8,732876.8,715359.8,716369.8,766238.6,790338.7,920887.4,825920.1,707834.3,700833.7,793263.9,830735.9,863291.8,695329.2,798061.4,733846.7,953533.8,744180.8,668031.4,833819.6,840169.1,686875.4,824645.5,712437.1,954272.3,777759,752973.1,1004028,704495.6,754916.2,842085.9,703256.8,763457.1,734217.9,884376.9,963427.8,759410.8,882069.4,743031.8,795506.2,831682.3,941734.4,797981.7,919077.6,682616.8,819399.6,832935,777040.1,752165.2,658870.4,800384.3,938349.6,902471.1,894704.3,986832.8,731576.3,898776.3,796905.6,686891.4,838551.5,891228.5,858796.9,801018.1],"y":[3521764,3471916,3502787,3474765,3665553,3807616,3769623,3793408,3520432,3466377,3636468,3595691,3463849,3421638,3554982,3600493,3671394,3684504,3492653,3437933,3598842,3713250,3862318,3432769,3556747,3577608,3813760,3794110,3762905,3495219,3711726,3428340,3757187,3492465,3455994,3722100,3449007,3694344,3623343,3537103,3854592,3812502,3421800,3735253,3569933,3564188,3494367,3731361,3467152,3401148,3596117,3785425,3616602,3571315,3866604,3700612,3789664,3789005,3813323,3733248,3844809,3685752,3471063,3822135,3421817,3722330,3764306,3839931,3803344,3689861,3740622,3624790,
3810303,3685569,3699716,3590553,3506293,3783949,3695092,3530869,3668080,3640263,3624562,3660143,3663959,3439981,3599291,3520161,3537225,3752562,3517834,3419313,3832429,3716368,3500535,3584821,3785405,3584393,3660275,3451034,3453930,3660608,3568473,3717990,3854188,3598228,3719734,3750903,3756777,3758093,3609091,3812828,3482044,3665561,3764766,3567447,3695254,3524124,3864805,3519627,3697862,3729605,3570222,3641918,3422002,3685029,3827075,3552857,3551752,3623162,3717493,3560039,3608179,3534470,3522636,3421725,3487715,3567586,3872640,3595170,3660254,3514927,3623868,3858779,3639192,3842167,3742691,3446675,3699878,3648583,3494323,3544716,3563384,3841086,3855274,3538547,3749769,3637891,3487328],"dep_var":[8.2,6.4,6.6,9.4,13.3,6.4,9.2,9,7.6,7.5,17,10.3,5.8,9.1,11.8,19.9,9.6,7.2,10.1,13.5,9.9,12,8.1,6.4,18.6,20.2,5.9,18.4,37.5,11.2,14.7,6.7,33,11.1,10,23.9,6.5,13.3,5.7,10,8,8.6,11.7,32.7,8,9.5,17,12,9.4,4.7,7.6,8,9.1,8.6,7.8,25.8,13.7,15.6,9.5,31.6,8.6,5.3,19.9,9.2,7.7,8.8,29.6,12,15.4,6.8,7.5,13.6,9.1,5.7,10.7,16,8.3,9,10.8,8.3,6.2,7.7,4.9,12,10,5.4,12,13.7,13.4,8.2,5.2,16.3,11.1,10.4,8.7,10.1,9.7,4.6,6.7,8.2,7.8,12.9,10.1,11,5.5,16.6,9.5,28.4,12.8,7.6,15.2,9,6.3,9.3,6.8,10.7,11.7,7.3,11.6,6,17.3,18.1,8,8.6,7.8,11.1,13.1,8,15.9,7.1,5.6,6.5,7.1,8.6,9.2,13.4,14,11.4,11.4,6.3,13.6,7.2,4.8,10.1,9,8.4,9.4,10.4,4.2,9.8,9.6,5.5,8.6,13.6,12,7.6,10.4,8.8,6.3],"attr1":[75.6,100,61.7,100,42.7,100,64.6,75.2,47,66.2,16.1,57.9,100,65.6,80.6,63.2,72.3,73.4,100,47.1,52.1,68.5,43.6,100,5.1,13.7,77.4,57.8,17.6,100,4.4,58.6,5.8,64.6,59.4,30.6,62,76.1,100,48.4,96.5,100,58,2.5,70.7,72.6,10,26.7,52.8,100,89.1,70,64.2,100,100,53.9,36.1,93.7,87.2,4.2,100,100,20.3,79.7,55.4,75.7,13.6,88.5,81.1,100,67.8,95.8,73.8,100,76,20.9,63.4,78,100,65.1,100,53.8,100,81.9,63.6,100,52.9,78.2,32.9,100,100,47.6,78.6,65.9,100,65.6,100,100,82.3,100,56.2,75.1,98.6,73,89,3.2,76,95.2,100,93.7,61.3,100,74.4,100,66.5,56.5,66.5,100,100,53.5,9.9,59.2,100,79.3,69.4,53.6,64.5,100,45.4,97.9,100,79.3,100,72.6,50.3,55.2,51.1,35.
7,100,53.3,44,44.5,100,100,65.3,44.8,61.2,54.2,100,67.1,59.9,100,100,100,70,100,59.6,100,71.1],"attr2":[19.9,26,24.1,24.8,17.5,15.1,14.7,10.7,22,19.3,19.2,18.3,18.2,25.9,13.2,27.5,30.3,15.6,31.8,11.5,24.1,14.4,12,18.3,17.2,10.4,14.6,6.1,27,35.7,8.6,26.4,5.6,22.5,22.8,6.6,22.4,11.4,14,29,14.6,12.8,23.3,9.9,21.8,32.9,24.4,6.6,31.4,14.6,12.7,19.7,25.7,25.4,17.2,2.6,13.6,6.8,16.5,18.4,16.6,16.8,14.3,11.1,22.3,25.1,4,11.6,10.6,30.1,14.4,13.7,14.2,19.1,6.1,10.6,27.2,14.1,17.4,18.8,31.3,27.8,22.2,10.8,16.3,25.9,20.5,12.6,17.2,17.8,23.7,19.9,15.3,21.6,22.3,29.2,15.7,28.2,22.4,22.1,28.7,13.8,24.5,15,11.3,18.6,14.4,7.9,16.2,8.8,24,12.8,21.3,13.4,16.3,24.3,16.4,33,13.6,35.9,18.2,6.2,19.9,22.9,29.1,15.6,17,31.4,24.8,24.9,31.9,21.9,29.5,27.3,29.1,22.6,22.9,24,14,27.1,16.3,31.3,26,18.3,14.7,12.8,13.2,21.1,32.6,21.6,21.2,22.5,30.3,12.5,11.1,28.6,22.6,15.3,26.2],"attr3":[20.76,26.86,15.42,51.67,42.39,3.49,11.44,9.21,31.33,11.62,41.68,22.36,4.58,41.47,14.85,25.95,52.19,35.48,58.89,20.19,30.94,15.46,0.91,27.05,38.02,30.94,8.61,1.77,26.23,60.76,23.82,27.29,9.84,25.46,24.16,10.93,29.94,22.59,30.66,40.66,0.35,0.29,39.47,42.23,27.64,48.98,50.15,7.63,44.09,11.48,14.03,29.99,32.58,33.88,0.03,5.13,13.56,0,9.89,49.92,0.26,12.69,25.57,3.78,31.5,49.89,5.11,5.42,8.48,79.64,6.47,25.49,20.41,13.38,10.24,21.8,30.5,9.58,34.8,15.36,55.92,41.51,33.89,25.6,34.03,26.58,33.32,19.22,39.15,38.19,21.75,31.88,1.41,36.38,43.34,58.72,8.32,41.32,44.62,27.48,47.91,31.78,28.27,34.74,0.26,37.95,22.35,7.37,24.74,3.94,47.53,1.48,11.69,20.04,14.3,32.46,32.79,49.93,0.35,58.17,41.96,8.03,34.09,44.69,32.74,29.08,11.81,63.46,46.53,62.34,61.36,29.19,43.21,34.45,59.9,37.93,26.68,23.38,0,33.1,30.03,40.66,45.93,0.1,27.78,3.73,18.37,25.88,60.23,51.86,19.45,50.2,30.06,2.59,4.06,31.76,45.94,41.99,30.71],"rowid":[13001,13003,13005,13007,13009,13011,13013,13015,13017,13019,13021,13023,13025,13027,13029,13031,13033,13035,13037,13039,13043,13045,13047,13049,13051,13053,13055,13057,13059,13061,13063,13065,13067,13069,13071,13073,13
075,13077,13079,13081,13083,13085,13087,13089,13091,13093,13095,13097,13099,13101,13103,13105,13107,13109,13111,13113,13115,13117,13119,13121,13123,13125,13127,13129,13131,13133,13135,13137,13139,13141,13143,13145,13147,13149,13151,13153,13155,13157,13159,13161,13163,13165,13167,13169,13171,13173,13175,13177,13179,13181,13183,13185,13187,13189,13191,13193,13195,13197,13199,13201,13205,13207,13209,13211,13213,13215,13217,13219,13221,13223,13225,13227,13229,13231,13233,13235,13237,13239,13241,13243,13245,13247,13249,13251,13253,13255,13257,13259,13261,13263,13265,13267,13269,13271,13273,13275,13277,13279,13281,13283,13285,13287,13289,13291,13293,13295,13297,13299,13301,13303,13305,13307,13309,13311,13313,13315,13317,13319,13321]}]
diff --git a/release/python/0.8.0/crankshaft/test/fixtures/gwr_packed_knowns.json b/release/python/0.8.0/crankshaft/test/fixtures/gwr_packed_knowns.json
new file mode 100644
index 0000000..4e73b79
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/test/fixtures/gwr_packed_knowns.json
@@ -0,0 +1 @@
+{"y_coord": [3521764, 3471916, 3502787, 3474765, 3665553, 3807616, 3769623, 3793408, 3520432, 3466377, 3636468, 3595691, 3463849, 3421638, 3554982, 3600493, 3671394, 3684504, 3492653, 3437933, 3598842, 3713250, 3862318, 3432769, 3556747, 3577608, 3813760, 3794110, 3762905, 3495219, 3711726, 3428340, 3757187, 3492465, 3455994, 3722100, 3449007, 3694344, 3623343, 3537103, 3854592, 3812502, 3421800, 3735253, 3569933, 3564188, 3494367, 3731361, 3467152, 3401148, 3596117, 3785425, 3616602, 3571315, 3866604, 3700612, 3789664, 3789005, 3813323, 3733248, 3844809, 3685752, 3471063, 3822135, 3421817, 3722330, 3764306, 3839931, 3803344, 3689861, 3740622, 3624790, 3810303, 3685569, 3699716, 3590553, 3506293, 3783949, 3695092, 3530869, 3668080, 3640263, 3624562, 3660143, 3663959, 3439981, 3599291, 3520161, 3537225, 3752562, 3517834, 3419313, 3832429, 3716368, 3500535, 3584821, 3785405, 3584393, 3660275, 3451034, 3453930, 3660608, 3568473, 3717990, 3854188, 3598228, 3719734, 3750903, 3756777, 3758093, 3609091, 3812828, 3482044, 3665561, 3764766, 3567447, 3695254, 3524124, 3864805, 3519627, 3697862, 3729605, 3570222, 3641918, 3422002, 3685029, 3827075, 3552857, 3551752, 3623162, 3717493, 3560039, 3608179, 3534470, 3522636, 3421725, 3487715, 3567586, 3872640, 3595170, 3660254, 3514927, 3623868, 3858779, 3639192, 3842167, 3742691, 3446675, 3699878, 3648583, 3494323, 3544716, 3563384, 3841086, 3855274, 3538547, 3749769, 3637891, 3487328], "influence": [0.041718, 0.093454, 0.10983, 0.118198, 0.097548, 0.059443, 0.041031, 0.032462, 0.058498, 0.100714, 0.170747, 0.082058, 0.184081, 0.037431, 0.131419, 0.11251, 0.101114, 0.047942, 0.113107, 0.181309, 0.05591, 0.037814, 0.109586, 0.130853, 0.237285, 0.172302, 0.036228, 0.064756, 0.443808, 0.13641, 0.141066, 0.076699, 0.150241, 0.032761, 0.051816, 0.223754, 0.029353, 0.06002, 0.105169, 0.076247, 0.069329, 0.051965, 0.039944, 0.20308, 0.046086, 0.098109, 0.194576, 0.093258, 0.093739, 0.186424, 0.178352, 0.036825, 0.053419, 0.069314, 
0.088385, 0.131412, 0.08002, 0.085975, 0.047994, 0.201466, 0.077724, 0.153041, 0.174215, 0.034634, 0.040242, 0.063712, 0.149308, 0.040139, 0.036003, 0.285484, 0.063911, 0.090216, 0.033014, 0.133651, 0.097922, 0.198633, 0.060055, 0.032494, 0.071734, 0.073892, 0.121394, 0.087385, 0.077991, 0.093922, 0.03193, 0.091298, 0.053937, 0.126429, 0.13414, 0.102608, 0.078631, 0.067484, 0.06222, 0.036613, 0.166179, 0.107871, 0.06172, 0.080811, 0.060716, 0.096678, 0.05549, 0.054726, 0.076288, 0.051877, 0.045589, 0.170792, 0.030476, 0.097062, 0.057117, 0.077883, 0.057165, 0.050752, 0.097715, 0.08115, 0.047653, 0.049173, 0.03286, 0.099923, 0.061466, 0.137244, 0.27255, 0.071571, 0.070927, 0.063635, 0.098482, 0.034063, 0.064541, 0.143175, 0.059673, 0.219599, 0.140317, 0.030782, 0.102154, 0.05155, 0.118173, 0.045636, 0.052966, 0.12225, 0.059551, 0.084581, 0.048937, 0.120127, 0.087659, 0.106444, 0.034626, 0.096799, 0.02894, 0.040209, 0.138238, 0.09661, 0.053615, 0.116263, 0.159493, 0.050856, 0.035205, 0.123618, 0.061337, 0.156479, 0.044714], "x_coord": [941396.6, 895553, 930946.4, 745398.6, 849431.3, 819317.3, 803747.1, 699011.5, 863020.8, 859915.8, 809736.9, 844270.1, 979288.9, 827822, 1023145, 994903.4, 971593.8, 782448.2, 724741.2, 1008480, 964264.9, 678778.6, 670055.9, 962612.3, 1059706, 704959.2, 653026.6, 734240.9, 832508.6, 695793.9, 745538.8, 908046.1, 724646.8, 894463.9, 808691.8, 942527.9, 839816.1, 705457.9, 783416.5, 805648.4, 635964.3, 764386.1, 732628.4, 759231.9, 860451.4, 800031.3, 764116.9, 707288.7, 703495.1, 896654, 1031899, 879541.2, 943066.2, 981727.8, 739255.8, 731468.7, 662257.4, 765397.3, 845701.3, 733728.4, 732702.3, 908386.8, 1023411, 695325.1, 765058.1, 855577.3, 772634.6, 818917.1, 794419.5, 873518.8, 665933.8, 695500.6, 870749.9, 675280.4, 763488.4, 814118.9, 855461.8, 815753.1, 807249.1, 915741.9, 924108.1, 970465.7, 908636.7, 821367.1, 766461.7, 873804.3, 884830.4, 770455.5, 1014742, 919396.5, 1004544, 864781.1, 772600, 917730.9, 1030500, 777055.3, 
848638.8, 732876.8, 715359.8, 716369.8, 766238.6, 790338.7, 920887.4, 825920.1, 707834.3, 700833.7, 793263.9, 830735.9, 863291.8, 695329.2, 798061.4, 733846.7, 953533.8, 744180.8, 668031.4, 833819.6, 840169.1, 686875.4, 824645.5, 712437.1, 954272.3, 777759, 752973.1, 1004028, 704495.6, 754916.2, 842085.9, 703256.8, 763457.1, 734217.9, 884376.9, 963427.8, 759410.8, 882069.4, 743031.8, 795506.2, 831682.3, 941734.4, 797981.7, 919077.6, 682616.8, 819399.6, 832935, 777040.1, 752165.2, 658870.4, 800384.3, 938349.6, 902471.1, 894704.3, 986832.8, 731576.3, 898776.3, 796905.6, 686891.4, 838551.5, 891228.5, 858796.9, 801018.1], "se_pctblack": [0.048422, 0.053382, 0.050307, 0.052233, 0.050361, 0.041694, 0.041354, 0.041423, 0.048378, 0.049213, 0.051088, 0.050376, 0.05714, 0.049579, 0.047733, 0.045694, 0.048206, 0.048881, 0.054756, 0.06, 0.044953, 0.046902, 0.043006, 0.057079, 0.048872, 0.056886, 0.043753, 0.039689, 0.044035, 0.057198, 0.046953, 0.054178, 0.040812, 0.051914, 0.049325, 0.047751, 0.048778, 0.04959, 0.053515, 0.049339, 0.044162, 0.039811, 0.053106, 0.041446, 0.0497, 0.051743, 0.051662, 0.042636, 0.055382, 0.053312, 0.047102, 0.044382, 0.046127, 0.045928, 0.041238, 0.048361, 0.043512, 0.03973, 0.043382, 0.042789, 0.041214, 0.048842, 0.055459, 0.04163, 0.051566, 0.04629, 0.04035, 0.041668, 0.0405, 0.047923, 0.043671, 0.059377, 0.044651, 0.05022, 0.047643, 0.050203, 0.048129, 0.04155, 0.048144, 0.047513, 0.049365, 0.046377, 0.048207, 0.051755, 0.050068, 0.050653, 0.047949, 0.05133, 0.049083, 0.044645, 0.050489, 0.050633, 0.040368, 0.046813, 0.05234, 0.056291, 0.043751, 0.056776, 0.054586, 0.05369, 0.05193, 0.051624, 0.044863, 0.047448, 0.042099, 0.058191, 0.045691, 0.044437, 0.044577, 0.04214, 0.053326, 0.040489, 0.052861, 0.051687, 0.042796, 0.048223, 0.048837, 0.058596, 0.042367, 0.056893, 0.048682, 0.044234, 0.055933, 0.046173, 0.053802, 0.049959, 0.042715, 0.057894, 0.054198, 0.05754, 0.046272, 0.045435, 0.055455, 0.048073, 0.05567, 0.050858, 0.048796, 0.044856, 
0.041348, 0.045268, 0.057257, 0.047627, 0.052411, 0.041008, 0.05606, 0.043488, 0.043092, 0.054819, 0.047066, 0.050849, 0.052717, 0.05641, 0.046776, 0.040768, 0.042667, 0.048373, 0.04495, 0.050613, 0.04877], "cooksd": [7.7e-05, 0.000315, 0.002225, 0.000205, 0.001606, 0.001427, 0.00652, 0.001942, 0.003764, 0.000481, 0.000267, 0.000157, 8.8e-05, 0.000123, 0.003019, 0.072736, 0.001193, 0.005313, 0.005343, 1e-06, 0.000363, 3.1e-05, 0.034878, 0.00106, 0.001396, 0.001463, 0.003457, 0.001192, 1.423109, 0.021445, 0.098779, 0.001687, 0.055873, 0.000531, 0.000136, 0.021004, 0.001615, 6.7e-05, 0.006305, 2e-05, 0.000228, 3.7e-05, 6.9e-05, 0.065765, 0.000349, 0.001329, 0.003647, 0.0419, 7e-06, 0.00795, 0.003791, 0.003857, 8.5e-05, 0.002004, 0.000484, 0.042878, 0.003828, 0.011277, 0.000442, 0.094025, 0.001259, 0.014043, 0.018378, 0.000812, 0.001675, 0.00019, 0.016271, 0.000218, 0.001723, 0.026378, 0.003864, 0.007504, 0.002657, 0.000982, 0.005258, 0.000183, 9e-06, 0.002044, 0.004633, 0.000515, 0.00141, 0.003848, 0.002212, 8.9e-05, 0.001534, 5e-06, 0.000109, 0.006075, 0.001587, 0.000427, 0.000156, 0.006069, 4e-06, 0.000344, 0.001701, 1.1e-05, 0.000192, 0.000832, 0.002053, 0.000569, 0.001872, 0.000233, 0.005009, 0.000331, 0.004382, 0.003999, 0.00118, 0.170092, 0.006522, 0.002474, 0.005137, 0.00027, 0.001113, 0.000403, 0.004921, 5.5e-05, 0.000272, 0.001283, 0.002676, 0.008287, 0.003014, 0.000491, 7e-06, 6.4e-05, 3.7e-05, 0.002458, 0.000757, 0.000476, 0.004148, 0.002506, 0.002465, 0.00071, 0.00184, 1.5e-05, 0.004272, 0.000945, 0.003874, 0.000189, 0.002952, 0.006644, 0.000699, 0.004442, 0.0013, 0.005564, 0.002488, 0.024203, 0.00519, 1.8e-05, 3.8e-05, 0.000263, 2.9e-05, 0.007778, 0.008509, 0.006229, 0.000105, 0.002952, 0.001612, 8e-06, 0.000767], "est_pctrural": [-0.087919, -0.077996, -0.085464, -0.072676, -0.128431, -0.180965, -0.18567, -0.143921, -0.072048, -0.074505, -0.117008, -0.087278, -0.091904, -0.073817, -0.099557, -0.098698, -0.10867, -0.160167, -0.073158, -0.094154, 
-0.094601, -0.135079, -0.133974, -0.08942, -0.101838, -0.091672, -0.131022, -0.155214, -0.190065, -0.073282, -0.165429, -0.081867, -0.156664, -0.076699, -0.072464, -0.133921, -0.074071, -0.141117, -0.111564, -0.078856, -0.127798, -0.162809, -0.070745, -0.174179, -0.077704, -0.084767, -0.073785, -0.14867, -0.071717, -0.080666, -0.101715, -0.180059, -0.092867, -0.096093, -0.148798, -0.15388, -0.133349, -0.168574, -0.185284, -0.163127, -0.149285, -0.123165, -0.096677, -0.141254, -0.071164, -0.167284, -0.178183, -0.17299, -0.175146, -0.138416, -0.134046, -0.111022, -0.185981, -0.129588, -0.165865, -0.090566, -0.072647, -0.18459, -0.167686, -0.081112, -0.109965, -0.101055, -0.090609, -0.134897, -0.143216, -0.077016, -0.082642, -0.076247, -0.098465, -0.155026, -0.096848, -0.076702, -0.162008, -0.138472, -0.098386, -0.091171, -0.18779, -0.09308, -0.131121, -0.071531, -0.071437, -0.141435, -0.082891, -0.177965, -0.142118, -0.099834, -0.18257, -0.188049, -0.178213, -0.144304, -0.099787, -0.153281, -0.089241, -0.140148, -0.135427, -0.08363, -0.158342, -0.0768, -0.169776, -0.075752, -0.120551, -0.184296, -0.086448, -0.103888, -0.070842, -0.154438, -0.180631, -0.082675, -0.081094, -0.112567, -0.151714, -0.09336, -0.103733, -0.072526, -0.075826, -0.071792, -0.072662, -0.088969, -0.161521, -0.085392, -0.123629, -0.075361, -0.098113, -0.15897, -0.123605, -0.131963, -0.188348, -0.086523, -0.134341, -0.101212, -0.094024, -0.079862, -0.074241, -0.166676, -0.137364, -0.076497, -0.164552, -0.101861, -0.07352], "std_residual": [-0.162278, 0.213714, -0.518796, 0.151238, -0.470868, -0.580528, -1.508125, -0.929355, -0.95091, -0.253303, -0.139052, -0.162155, 0.076204, -0.217609, 0.545752, 2.926313, 0.397864, -1.254875, 0.790774, -0.006914, -0.30256, 0.108831, -2.056611, -0.32422, 0.258772, 0.32387, -1.171601, 0.506852, 5.159316, 1.423472, -2.996125, -0.550564, 2.171775, 0.483877, 0.193059, 1.042856, -0.892786, 0.125184, -0.894825, -0.060875, 0.213632, 0.099734, 0.157788, 1.962586, 
-0.328491, 0.426944, 0.474682, -2.46583, 0.031892, -0.719617, -0.510536, -1.227088, -0.150029, 0.633771, 0.272938, 2.056669, -0.810461, 1.337637, -0.361836, 2.358452, 0.472247, -1.07699, 1.140253, -0.581307, -0.772136, -0.20392, 1.176268, 0.278975, 0.829814, 0.99264, -0.919034, 1.062743, -1.077658, 0.308164, -0.850273, -0.104966, -0.046276, -0.953096, 0.945945, -0.310309, 0.390222, -0.774466, -0.62474, 0.113205, -0.833277, -0.027681, 0.168806, 0.791532, -0.390956, 0.236217, -0.165374, 1.118755, -0.031656, -0.36767, 0.356947, 0.036409, 0.208857, -0.375831, -0.688536, 0.281743, -0.689616, 0.244942, 0.95144, -0.300273, -1.170075, -0.538318, -0.748598, 4.85965, 1.267608, -0.661149, 1.124549, 0.274469, -0.391676, 0.261042, -1.211531, 0.126055, -0.345827, 0.415296, 0.780979, -0.881768, -0.346488, 0.308225, -0.037905, -0.118149, 0.071239, -1.020007, -0.404594, 0.206144, 0.987719, -0.364614, 0.474766, -0.577719, 0.491319, 0.06313, -0.68978, 0.543157, 1.016772, -0.142276, 0.834189, -1.035981, -0.450355, -0.696808, -0.449433, 0.834905, -1.01757, -1.835909, -1.612155, -0.080997, 0.059813, -0.191521, -0.086895, -0.939373, 0.818087, 1.31727, -0.206834, 0.558896, -0.606735, 0.025605, -0.494558], "localr2": [0.551117, 0.557455, 0.553851, 0.571077, 0.559486, 0.551175, 0.558752, 0.571809, 0.513439, 0.550571, 0.57839, 0.545373, 0.604611, 0.563673, 0.606627, 0.579241, 0.547193, 0.58401, 0.57804, 0.622744, 0.554506, 0.616314, 0.553322, 0.610492, 0.618849, 0.631907, 0.568832, 0.566261, 0.551402, 0.582397, 0.594597, 0.591202, 0.583349, 0.535237, 0.556551, 0.54599, 0.55833, 0.62123, 0.609202, 0.58125, 0.557535, 0.557277, 0.560922, 0.57934, 0.511668, 0.593458, 0.573619, 0.599358, 0.573775, 0.594004, 0.599749, 0.551664, 0.534209, 0.576381, 0.547637, 0.607718, 0.577489, 0.562801, 0.547044, 0.592063, 0.552589, 0.548079, 0.619311, 0.562128, 0.557048, 0.560955, 0.566841, 0.546595, 0.555068, 0.560605, 0.599587, 0.649273, 0.546138, 0.629902, 0.587461, 0.578893, 0.529977, 0.555579, 0.569569, 
0.503741, 0.539692, 0.553734, 0.51914, 0.565167, 0.607287, 0.56818, 0.491248, 0.580526, 0.605081, 0.551278, 0.603162, 0.573075, 0.551941, 0.550795, 0.617381, 0.608093, 0.550716, 0.627532, 0.634592, 0.569713, 0.55971, 0.589995, 0.481273, 0.557708, 0.552413, 0.644127, 0.563542, 0.552622, 0.554574, 0.588412, 0.591719, 0.56091, 0.584391, 0.619238, 0.587011, 0.5735, 0.563046, 0.59661, 0.541754, 0.589691, 0.545224, 0.568126, 0.608463, 0.570086, 0.566278, 0.601112, 0.546778, 0.611534, 0.594733, 0.640115, 0.5595, 0.564123, 0.626729, 0.473791, 0.587694, 0.557015, 0.54805, 0.527237, 0.543414, 0.493954, 0.643286, 0.563722, 0.537914, 0.546488, 0.628564, 0.559385, 0.560098, 0.595942, 0.553755, 0.534857, 0.598634, 0.599113, 0.423182, 0.548624, 0.553793, 0.557453, 0.555125, 0.538754, 0.561673], "est_intercept": [18.375924, 18.039692, 18.173904, 18.612431, 25.027931, 28.868732, 29.126594, 26.73874, 17.332852, 18.009999, 23.331917, 18.575691, 18.853338, 18.212539, 20.021869, 20.563701, 23.303807, 27.304692, 18.937685, 19.091389, 20.107213, 26.58407, 25.993181, 18.861404, 20.315761, 21.024737, 26.101723, 27.237578, 29.623695, 19.199889, 28.067985, 18.570921, 27.550537, 17.692965, 18.158333, 26.14995, 18.119124, 26.596568, 22.804616, 18.569581, 25.775193, 27.602714, 18.386703, 28.434523, 17.509755, 19.093217, 18.613133, 27.243553, 18.862486, 18.63039, 20.726368, 29.409853, 20.50841, 19.704834, 26.704982, 27.33758, 26.343322, 27.988326, 29.33117, 28.013628, 26.776764, 25.151079, 19.341762, 26.468527, 18.258714, 28.364577, 28.606646, 28.307209, 28.404865, 26.321627, 26.500457, 23.059413, 29.625733, 25.689204, 27.858068, 19.647124, 17.629684, 29.102336, 27.867154, 17.579579, 23.757825, 21.960048, 20.80086, 25.164234, 26.024579, 18.276315, 18.598483, 18.808846, 19.712953, 27.921286, 19.433808, 18.35095, 27.53054, 26.556594, 19.58565, 20.302644, 29.590949, 21.013811, 25.224396, 18.655811, 18.36884, 25.731843, 17.978451, 28.693695, 26.386878, 21.843074, 28.847315, 29.460662, 29.177378, 
26.994593, 21.207599, 27.084437, 18.612524, 25.907755, 26.504604, 18.394926, 27.466413, 19.674459, 28.095647, 19.400471, 24.815945, 28.99667, 20.143981, 22.000899, 18.552729, 27.04672, 28.950415, 20.171, 19.436785, 23.078763, 27.520861, 19.256298, 22.034483, 16.895516, 19.123574, 18.208195, 18.006901, 18.743279, 27.490506, 18.866189, 24.730344, 18.225195, 20.965009, 27.320591, 24.200809, 25.991901, 29.279015, 18.656967, 26.15185, 22.653152, 19.058441, 19.682721, 16.844017, 27.838898, 26.164642, 18.0256, 28.516555, 22.169471, 18.263625], "yhat": [8.815245, 5.611921, 8.495724, 8.849965, 15.03242, 8.580509, 14.919817, 12.540446, 11.17349, 8.430319, 17.490415, 10.9017, 5.533408, 9.926865, 9.830106, 9.223105, 8.139072, 11.942142, 7.215765, 13.524229, 11.038573, 11.586549, 15.616062, 7.570655, 17.724733, 19.058832, 10.354608, 16.501607, 22.597924, 6.076758, 25.4543, 8.748904, 25.246368, 9.256918, 9.271919, 20.341504, 9.906595, 12.829942, 8.978318, 10.226601, 7.201808, 8.223906, 11.101222, 25.914554, 9.242568, 7.929672, 15.350101, 21.093841, 9.282415, 7.213871, 9.392305, 12.664129, 9.665323, 6.232029, 6.790721, 18.376414, 16.710674, 10.647101, 10.867332, 23.437631, 6.843525, 9.138708, 15.886925, 11.412046, 10.629657, 9.564201, 25.398206, 10.941449, 12.244551, 3.550323, 10.943758, 9.674096, 13.204245, 4.58911, 13.827683, 16.363919, 8.473759, 12.630831, 7.270247, 9.456558, 4.783389, 10.565422, 7.223319, 11.582661, 13.175302, 5.502197, 11.364096, 10.834768, 14.808946, 7.333349, 5.814789, 12.115865, 11.218728, 11.797658, 7.437642, 9.966811, 8.916465, 5.995525, 9.284445, 7.162908, 10.395689, 11.977673, 6.558461, 12.132376, 9.927145, 18.498512, 12.354765, 10.515515, 8.032877, 10.058864, 10.970992, 7.964322, 7.740923, 8.330884, 11.379041, 10.22395, 13.017184, 5.774051, 8.669737, 9.172055, 18.444544, 16.949769, 8.141504, 9.042787, 7.538033, 14.982577, 14.615563, 7.260976, 12.190501, 8.347484, 3.895132, 8.702775, 5.296954, 8.361885, 11.708678, 11.344938, 10.167793, 11.916248, 
8.266897, 10.138875, 15.30099, 9.731427, 6.462592, 7.04339, 12.872171, 15.157496, 15.552791, 10.707326, 3.984952, 10.505011, 9.927395, 8.920127, 5.695227, 8.629694, 12.786831, 5.573625, 12.676648, 8.708923, 8.172088], "est_pctpov": [-0.218522, -0.291285, -0.235007, -0.325567, -0.188146, -0.137907, -0.119547, -0.379195, -0.259626, -0.310736, -0.244202, -0.206076, -0.248679, -0.334842, -0.215786, -0.21054, -0.199793, -0.192097, -0.322443, -0.253757, -0.206276, -0.534973, -0.384352, -0.274742, -0.216591, -0.341741, -0.443096, -0.30198, -0.068634, -0.327487, -0.315412, -0.310284, -0.345864, -0.269059, -0.328491, -0.16491, -0.323102, -0.475082, -0.288485, -0.275601, -0.429848, -0.242496, -0.341276, -0.227016, -0.2048, -0.252217, -0.316204, -0.425002, -0.332753, -0.326117, -0.210497, -0.10558, -0.209458, -0.206017, -0.292783, -0.384946, -0.455325, -0.227329, -0.109925, -0.340684, -0.302287, -0.187858, -0.23682, -0.36714, -0.34067, -0.110955, -0.186993, -0.170485, -0.176104, -0.172523, -0.500233, -0.412376, -0.097125, -0.519982, -0.243051, -0.236613, -0.282387, -0.119102, -0.113713, -0.208408, -0.202582, -0.214505, -0.217432, -0.180505, -0.284279, -0.319038, -0.196772, -0.299851, -0.216323, -0.148542, -0.221665, -0.329237, -0.235629, -0.167632, -0.226774, -0.265047, -0.086238, -0.318679, -0.423909, -0.334852, -0.335002, -0.222813, -0.183316, -0.067768, -0.338818, -0.366487, -0.102857, -0.064893, -0.096511, -0.426293, -0.262799, -0.301715, -0.245085, -0.354426, -0.464528, -0.230167, -0.116111, -0.327569, -0.179779, -0.31768, -0.180967, -0.141807, -0.290041, -0.211583, -0.340959, -0.30384, -0.131568, -0.32466, -0.28365, -0.360965, -0.151989, -0.206422, -0.31246, -0.218596, -0.305719, -0.341804, -0.311581, -0.194575, -0.219397, -0.189617, -0.491018, -0.292708, -0.229657, -0.239476, -0.344539, -0.411469, -0.091658, -0.278733, -0.177206, -0.216008, -0.23146, -0.304909, -0.172899, -0.203986, -0.367377, -0.261962, -0.131495, -0.224866, -0.31454], "area_key": [13001, 13003, 
13005, 13007, 13009, 13011, 13013, 13015, 13017, 13019, 13021, 13023, 13025, 13027, 13029, 13031, 13033, 13035, 13037, 13039, 13043, 13045, 13047, 13049, 13051, 13053, 13055, 13057, 13059, 13061, 13063, 13065, 13067, 13069, 13071, 13073, 13075, 13077, 13079, 13081, 13083, 13085, 13087, 13089, 13091, 13093, 13095, 13097, 13099, 13101, 13103, 13105, 13107, 13109, 13111, 13113, 13115, 13117, 13119, 13121, 13123, 13125, 13127, 13129, 13131, 13133, 13135, 13137, 13139, 13141, 13143, 13145, 13147, 13149, 13151, 13153, 13155, 13157, 13159, 13161, 13163, 13165, 13167, 13169, 13171, 13173, 13175, 13177, 13179, 13181, 13183, 13185, 13187, 13189, 13191, 13193, 13195, 13197, 13199, 13201, 13205, 13207, 13209, 13211, 13213, 13215, 13217, 13219, 13221, 13223, 13225, 13227, 13229, 13231, 13233, 13235, 13237, 13239, 13241, 13243, 13245, 13247, 13249, 13251, 13253, 13255, 13257, 13259, 13261, 13263, 13265, 13267, 13269, 13271, 13273, 13275, 13277, 13279, 13281, 13283, 13285, 13287, 13289, 13291, 13293, 13295, 13297, 13299, 13301, 13303, 13305, 13307, 13309, 13311, 13313, 13315, 13317, 13319, 13321], "residual": [-0.615245, 0.788079, -1.895724, 0.550035, -1.73242, -2.180509, -5.719817, -3.540446, -3.57349, -0.930319, -0.490415, -0.6017, 0.266592, -0.826865, 1.969894, 10.676895, 1.460928, -4.742142, 2.884235, -0.024229, -1.138573, 0.413451, -7.516062, -1.170655, 0.875267, 1.141168, -4.454608, 1.898393, 14.902076, 5.123242, -10.7543, -2.048904, 7.753632, 1.843082, 0.728081, 3.558496, -3.406595, 0.470058, -3.278318, -0.226601, 0.798192, 0.376094, 0.598778, 6.785446, -1.242568, 1.570328, 1.649899, -9.093841, 0.117585, -2.513871, -1.792305, -4.664129, -0.565323, 2.367971, 1.009279, 7.423586, -3.010674, 4.952899, -1.367332, 8.162369, 1.756475, -3.838708, 4.013075, -2.212046, -2.929657, -0.764201, 4.201794, 1.058551, 3.155449, 3.249677, -3.443758, 3.925904, -4.104245, 1.11089, -3.127683, -0.363919, -0.173759, -3.630831, 3.529753, -1.156558, 1.416611, -2.865422, -2.323319, 0.417339, 
-3.175302, -0.102197, 0.635904, 2.865232, -1.408946, 0.866651, -0.614789, 4.184135, -0.118728, -1.397658, 1.262358, 0.133189, 0.783535, -1.395525, -2.584445, 1.037092, -2.595689, 0.922327, 3.541539, -1.132376, -4.427145, -1.898512, -2.854765, 17.884485, 4.767123, -2.458864, 4.229008, 1.035678, -1.440923, 0.969116, -4.579041, 0.47605, -1.317184, 1.525949, 2.930263, -3.172055, -1.144544, 1.150231, -0.141504, -0.442787, 0.261967, -3.882577, -1.515563, 0.739024, 3.709499, -1.247484, 1.704868, -2.202775, 1.803046, 0.238115, -2.508678, 2.055062, 3.832207, -0.516248, 3.133103, -3.838875, -1.70099, -2.531427, -1.662592, 3.05661, -3.872171, -6.757496, -6.152791, -0.307326, 0.215048, -0.705011, -0.327395, -3.420127, 2.904773, 4.970306, -0.786831, 2.026375, -2.276648, 0.091077, -1.872088], "se_pctpov": [0.115485, 0.126975, 0.118227, 0.10991, 0.106755, 0.130001, 0.128153, 0.129687, 0.122903, 0.120344, 0.105854, 0.108188, 0.131652, 0.119039, 0.120983, 0.118868, 0.116824, 0.124573, 0.109529, 0.137167, 0.114816, 0.135058, 0.134571, 0.132733, 0.125196, 0.107398, 0.135228, 0.126003, 0.132212, 0.111889, 0.132315, 0.129999, 0.126709, 0.122888, 0.11624, 0.118168, 0.117511, 0.133364, 0.107538, 0.105176, 0.13661, 0.127189, 0.118856, 0.125138, 0.117719, 0.102601, 0.108322, 0.128055, 0.11285, 0.129822, 0.124218, 0.128278, 0.113438, 0.116881, 0.131403, 0.132619, 0.134311, 0.126518, 0.132408, 0.128477, 0.131258, 0.109249, 0.130866, 0.13096, 0.119307, 0.122192, 0.126621, 0.130642, 0.128241, 0.10831, 0.131083, 0.122289, 0.132841, 0.129917, 0.129134, 0.10135, 0.121095, 0.12867, 0.124665, 0.113515, 0.109758, 0.114253, 0.106067, 0.112681, 0.119755, 0.12324, 0.109178, 0.103299, 0.122218, 0.117998, 0.123027, 0.123298, 0.128914, 0.113454, 0.126838, 0.103437, 0.131109, 0.105346, 0.12633, 0.113005, 0.117147, 0.119013, 0.111667, 0.130562, 0.133131, 0.112803, 0.130715, 0.131731, 0.126917, 0.129851, 0.10397, 0.128916, 0.12393, 0.12372, 0.131301, 0.108238, 0.119382, 0.111591, 0.132555, 0.108586, 
0.116635, 0.130817, 0.102402, 0.118168, 0.116027, 0.129284, 0.131295, 0.108254, 0.101675, 0.115852, 0.114644, 0.113088, 0.106211, 0.11992, 0.107638, 0.120813, 0.118432, 0.11174, 0.130834, 0.110938, 0.132968, 0.10945, 0.105169, 0.13064, 0.119565, 0.135161, 0.129666, 0.128515, 0.108749, 0.105576, 0.125065, 0.104974, 0.117933, 0.129239, 0.134044, 0.115963, 0.121461, 0.101262, 0.110659], "area_num": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158], "t_pctblack": [1.427054, 2.054089, 1.62247, 2.067223, -0.571001, -0.751165, -0.962027, 1.787476, 1.938852, 2.360389, 0.343293, 1.021487, 1.514925, 2.543305, 0.960576, 0.579333, -0.498492, -0.351502, 1.81343, 1.474091, 0.597696, 2.701643, 1.951858, 1.735149, 0.890002, 1.615912, 2.291372, 1.102801, -1.582224, 1.696627, 0.739203, 2.141897, 1.347938, 1.946313, 2.439857, -1.191861, 2.476966, 2.131924, 0.834293, 1.727777, 2.256296, 0.527207, 2.275422, 0.093091, 1.231076, 1.297547, 2.003273, 1.920576, 1.906397, 2.306169, 0.608413, -1.548925, 0.334071, 0.879985, 1.142487, 1.345685, 2.328186, 0.326093, -1.149781, 1.113164, 1.234299, -0.871034, 1.335771, 1.758505, 2.407358, -1.451449, -0.180277, -0.347764, -0.26002, -0.97909, 2.60129, 1.916199, -1.445025, 2.664429, 0.118778, 1.021292, 2.132797, -0.990281, -1.104017, 1.473619, -0.593063, 0.002792, 0.189985, -0.440808, 0.524102, 2.369783, 0.73308, 1.790638, 
1.07006, -1.431878, 1.201306, 2.458042, 0.476659, -1.18185, 1.211263, 1.023896, -1.487726, 1.396634, 1.784685, 2.074325, 2.273394, -0.035106, 0.981109, -1.549049, 1.58959, 1.716912, -1.112541, -1.619114, -1.59564, 2.029086, 0.863052, 1.167835, 1.599251, 1.146231, 2.372841, 1.371699, -1.258477, 1.568643, -0.223734, 1.580033, -0.922473, -0.724635, 1.266086, 0.060804, 2.170711, 0.65643, -0.887132, 1.511118, 1.375946, 1.471151, -1.270003, 1.033424, 1.190796, 1.62997, 1.588052, 2.50093, 2.311368, 0.971615, 0.272884, 0.642506, 2.334434, 2.076718, 0.531731, 0.537312, 1.160625, 2.122713, -1.249899, 1.847558, -1.042728, -0.262048, 1.373168, 1.442209, 1.076834, 0.077966, 1.821677, 1.750865, -1.482242, 0.078119, 2.254575], "t_pctpov": [-1.892203, -2.294038, -1.987757, -2.962119, -1.762416, -1.060813, -0.932843, -2.923921, -2.112442, -2.582057, -2.306955, -1.904795, -1.888905, -2.812877, -1.783596, -1.771213, -1.710202, -1.542034, -2.943902, -1.84999, -1.796571, -3.96105, -2.856124, -2.069892, -1.730021, -3.181992, -3.276647, -2.396616, -0.519117, -2.926894, -2.383793, -2.386815, -2.729594, -2.189468, -2.825986, -1.395554, -2.749553, -3.562294, -2.682624, -2.620376, -3.146541, -1.906591, -2.871335, -1.814127, -1.739742, -2.458238, -2.919104, -3.318914, -2.948627, -2.512025, -1.694582, -0.823055, -1.846445, -1.762619, -2.228132, -2.902644, -3.390087, -1.796814, -0.830198, -2.651717, -2.303005, -1.719545, -1.809644, -2.80344, -2.855407, -0.908037, -1.476799, -1.304983, -1.373228, -1.592868, -3.816142, -3.372144, -0.731138, -4.002403, -1.88216, -2.334626, -2.331943, -0.925637, -0.912148, -1.835951, -1.845709, -1.877457, -2.049956, -1.601903, -2.373838, -2.588749, -1.802296, -2.902752, -1.769972, -1.258854, -1.801754, -2.670262, -1.827804, -1.477533, -1.787908, -2.56241, -0.657756, -3.025058, -3.355567, -2.963173, -2.859663, -1.872183, -1.641636, -0.519046, -2.544991, -3.248908, -0.786878, -0.492614, -0.760425, -3.282933, -2.527649, -2.340399, -1.977606, -2.864736, -3.537897, 
-2.126485, -0.972603, -2.935453, -1.356259, -2.925619, -1.551575, -1.084007, -2.832382, -1.790533, -2.938625, -2.350179, -1.002078, -2.999061, -2.789759, -3.115739, -1.325746, -1.825317, -2.941886, -1.822841, -2.840247, -2.829192, -2.630888, -1.741319, -1.676917, -1.709216, -3.692752, -2.674345, -2.1837, -1.833093, -2.881611, -3.044283, -0.706881, -2.168882, -1.6295, -2.045982, -1.850712, -2.904624, -1.466082, -1.57836, -2.740722, -2.259014, -1.082611, -2.220636, -2.842413], "t_intercept": [7.609379, 6.697503, 7.195672, 8.255795, 14.156497, 18.407904, 18.455024, 16.077957, 6.054346, 7.204967, 13.296838, 7.329656, 6.866246, 7.675677, 8.4897, 9.071485, 9.999358, 17.841186, 8.604366, 6.69702, 9.080628, 16.810189, 15.17167, 6.835291, 8.384399, 11.955242, 15.299161, 16.868109, 18.579766, 8.871642, 17.194655, 7.052968, 16.742201, 6.53967, 7.638512, 11.604368, 7.593943, 17.183962, 13.302195, 8.155776, 14.929177, 17.265368, 7.668111, 17.658053, 6.083667, 8.634759, 8.295939, 16.901099, 8.408389, 7.259141, 8.810223, 17.697046, 9.122639, 8.7043, 16.183129, 17.280266, 15.592031, 17.414749, 18.485488, 16.905, 16.099393, 12.162038, 7.239849, 15.818031, 7.578958, 17.668373, 17.682992, 18.050671, 17.981138, 14.869228, 16.588831, 14.879699, 18.090369, 17.196475, 17.748172, 9.091638, 6.535975, 18.607913, 18.083927, 7.076811, 10.768372, 9.726515, 9.322075, 14.772789, 17.375394, 7.339344, 7.649891, 8.714896, 8.184484, 14.374787, 7.88758, 7.504212, 17.187921, 12.960213, 7.696175, 10.289304, 18.564107, 11.821484, 16.94767, 8.180987, 7.707789, 16.470342, 7.740734, 18.284333, 15.557935, 13.164332, 18.149077, 18.57868, 18.007492, 16.32635, 10.980793, 16.440536, 7.153743, 17.304261, 16.205781, 7.203743, 17.151505, 9.602895, 17.765752, 9.255965, 10.787743, 17.733848, 10.47511, 9.635323, 7.958089, 17.585678, 18.452111, 10.484603, 9.497019, 14.547858, 15.52837, 8.620986, 12.9729, 5.923095, 8.817958, 7.528327, 7.241212, 8.412973, 17.323629, 8.263278, 16.566306, 7.755803, 9.926525, 16.978309, 
15.470937, 15.129371, 18.3484, 6.97004, 13.472636, 10.58946, 7.433559, 9.7771, 6.279439, 17.679091, 15.298295, 6.775579, 16.180906, 11.101535, 7.890733], "t_pctrural": [-4.164093, -3.455642, -3.984466, -3.502894, -6.433839, -9.00415, -9.330583, -7.273569, -3.216792, -3.500257, -5.878384, -3.709493, -3.960341, -3.505663, -4.520597, -4.513625, -4.899843, -8.51144, -3.51973, -3.918325, -4.387163, -6.760826, -6.474081, -3.862817, -4.494979, -4.783707, -6.350931, -8.039987, -9.289223, -3.501551, -8.415808, -3.635542, -8.063056, -3.433894, -3.469055, -5.926957, -3.549694, -7.252536, -5.86143, -3.892085, -6.085906, -8.344896, -3.287008, -8.992089, -3.202256, -4.180517, -3.586052, -7.6905, -3.396553, -3.617773, -4.518828, -8.679796, -4.198436, -4.462907, -7.393499, -7.965298, -6.544475, -8.638036, -8.9432, -8.304627, -7.421519, -5.740929, -4.173442, -7.05162, -3.322371, -8.713754, -9.045413, -8.630204, -8.865623, -7.075921, -6.790647, -5.812039, -8.758158, -6.627627, -8.67066, -4.359978, -3.354427, -9.25172, -8.815745, -3.810158, -4.97107, -4.671502, -4.036906, -6.657152, -7.832129, -3.570442, -3.507163, -3.796826, -4.452569, -7.373823, -4.376163, -3.569278, -8.212728, -6.485034, -4.339861, -4.626838, -9.107394, -4.892205, -6.95731, -3.39262, -3.353496, -7.438071, -3.829113, -9.139108, -6.952847, -5.251575, -9.25942, -9.319456, -8.890033, -7.332245, -4.988105, -7.76106, -4.047026, -7.595919, -6.829612, -3.826044, -8.286358, -3.730161, -8.37047, -3.692142, -5.351946, -9.213078, -4.448672, -4.779433, -3.320568, -8.197081, -8.853278, -4.155026, -4.096627, -6.011687, -7.675126, -4.450236, -5.547404, -3.188337, -3.686576, -3.351909, -3.445811, -4.216177, -8.09466, -3.813858, -6.277743, -3.697309, -4.365304, -7.969116, -6.606297, -6.362486, -9.425633, -3.841971, -6.526246, -4.606206, -4.214534, -3.997518, -3.167862, -8.434538, -6.669445, -3.479159, -8.106824, -4.771271, -3.595321], "se_pctrural": [0.021113, 0.022571, 0.021449, 0.020748, 0.019962, 0.020098, 0.019899, 0.019787, 
0.022398, 0.021286, 0.019905, 0.023528, 0.023206, 0.021056, 0.022023, 0.021867, 0.022178, 0.018818, 0.020785, 0.024029, 0.021563, 0.01998, 0.020694, 0.023149, 0.022656, 0.019163, 0.02063, 0.019305, 0.020461, 0.020929, 0.019657, 0.022518, 0.01943, 0.022336, 0.020889, 0.022595, 0.020867, 0.019458, 0.019034, 0.020261, 0.020999, 0.01951, 0.021523, 0.01937, 0.024265, 0.020277, 0.020575, 0.019332, 0.021115, 0.022297, 0.022509, 0.020745, 0.022119, 0.021531, 0.020126, 0.019319, 0.020376, 0.019515, 0.020718, 0.019643, 0.020115, 0.021454, 0.023165, 0.020031, 0.02142, 0.019198, 0.019699, 0.020045, 0.019756, 0.019562, 0.01974, 0.019102, 0.021235, 0.019553, 0.019129, 0.020772, 0.021657, 0.019952, 0.019021, 0.021288, 0.022121, 0.021632, 0.022445, 0.020264, 0.018286, 0.02157, 0.023564, 0.020082, 0.022114, 0.021024, 0.022131, 0.021489, 0.019726, 0.021353, 0.02267, 0.019705, 0.02062, 0.019026, 0.018846, 0.021084, 0.021302, 0.019015, 0.021648, 0.019473, 0.02044, 0.01901, 0.019717, 0.020178, 0.020046, 0.019681, 0.020005, 0.01975, 0.022051, 0.01845, 0.019829, 0.021858, 0.019109, 0.020589, 0.020283, 0.020517, 0.022525, 0.020004, 0.019432, 0.021736, 0.021334, 0.018841, 0.020403, 0.019898, 0.019795, 0.018725, 0.019767, 0.020979, 0.018699, 0.022747, 0.020568, 0.021418, 0.021087, 0.021102, 0.019954, 0.02239, 0.019693, 0.020383, 0.022476, 0.019948, 0.01871, 0.020741, 0.019983, 0.02252, 0.020585, 0.021973, 0.02231, 0.019978, 0.023436, 0.019761, 0.020596, 0.021987, 0.020298, 0.021349, 0.020449], "est_pctblack": [0.069101, 0.109652, 0.081621, 0.107978, -0.028756, -0.031319, -0.039784, 0.074043, 0.093798, 0.116161, 0.017538, 0.051458, 0.086563, 0.126093, 0.045851, 0.026472, -0.024031, -0.017182, 0.099296, 0.088445, 0.026868, 0.126713, 0.083942, 0.099041, 0.043496, 0.091923, 0.100254, 0.043769, -0.069673, 0.097044, 0.034708, 0.116045, 0.055012, 0.101042, 0.120346, -0.056913, 0.120821, 0.105723, 0.044647, 0.085246, 0.099642, 0.020989, 0.120838, 0.003858, 0.061184, 0.067138, 0.103493, 0.081887, 
0.105581, 0.122948, 0.028657, -0.068744, 0.01541, 0.040416, 0.047114, 0.065078, 0.101303, 0.012956, -0.04988, 0.047631, 0.05087, -0.042543, 0.07408, 0.073207, 0.124138, -0.067188, -0.007274, -0.014491, -0.010531, -0.046921, 0.113601, 0.113778, -0.064522, 0.133808, 0.005659, 0.051272, 0.102649, -0.041146, -0.053152, 0.070016, -0.029277, 0.00013, 0.009159, -0.022814, 0.026241, 0.120036, 0.035151, 0.091913, 0.052522, -0.063926, 0.060652, 0.124459, 0.019242, -0.055326, 0.063397, 0.057636, -0.06509, 0.079295, 0.09742, 0.111371, 0.118058, -0.001812, 0.044016, -0.073499, 0.066919, 0.09991, -0.050833, -0.071948, -0.07113, 0.085506, 0.046023, 0.047284, 0.084538, 0.059245, 0.101547, 0.066148, -0.061461, 0.091916, -0.009479, 0.089892, -0.044908, -0.032054, 0.070816, 0.002808, 0.116789, 0.032794, -0.037893, 0.087485, 0.074573, 0.08465, -0.058766, 0.046954, 0.066035, 0.078357, 0.088406, 0.127193, 0.112785, 0.043583, 0.011283, 0.029085, 0.133664, 0.098907, 0.027869, 0.022034, 0.065065, 0.092313, -0.053861, 0.101281, -0.049077, -0.013325, 0.072389, 0.081356, 0.05037, 0.003179, 0.077726, 0.084694, -0.066626, 0.003954, 0.109955], "se_intercept": [2.414905, 2.693495, 2.525672, 2.254469, 1.767947, 1.568279, 1.578247, 1.663068, 2.862878, 2.499664, 1.754697, 2.53432, 2.7458, 2.37276, 2.358372, 2.266851, 2.33053, 1.53043, 2.200939, 2.850729, 2.214298, 1.581426, 1.713271, 2.759415, 2.423043, 1.758621, 1.706088, 1.614738, 1.594406, 2.164187, 1.632367, 2.633065, 1.645574, 2.705483, 2.377208, 2.253458, 2.385997, 1.547755, 1.71435, 2.276863, 1.726498, 1.598733, 2.397814, 1.610286, 2.878158, 2.211205, 2.243644, 1.61194, 2.243294, 2.566473, 2.352536, 1.661851, 2.248079, 2.263805, 1.650174, 1.582012, 1.689538, 1.607162, 1.586713, 1.657121, 1.663216, 2.067999, 2.67157, 1.673314, 2.409132, 1.605387, 1.617749, 1.568208, 1.579703, 1.770208, 1.597488, 1.549723, 1.637652, 1.493865, 1.56963, 2.16101, 2.69733, 1.563976, 1.54099, 2.48411, 2.20626, 2.257751, 2.231355, 1.703418, 1.497784, 2.490184, 
2.431209, 2.158241, 2.408576, 1.942379, 2.463849, 2.44542, 1.601738, 2.049086, 2.544855, 1.97318, 1.593987, 1.777595, 1.48837, 2.280386, 2.383153, 1.562314, 2.322577, 1.569305, 1.69604, 1.659262, 1.589465, 1.585724, 1.620291, 1.653437, 1.931336, 1.647418, 2.601788, 1.497189, 1.635503, 2.553523, 1.6014, 2.048805, 1.58145, 2.095997, 2.300383, 1.635103, 1.923033, 2.283359, 2.331305, 1.537997, 1.568949, 1.923869, 2.046619, 1.586403, 1.772296, 2.233654, 1.698501, 2.852481, 2.168708, 2.418624, 2.486725, 2.227902, 1.586879, 2.283136, 1.49281, 2.349879, 2.112019, 1.609147, 1.564276, 1.717976, 1.595726, 2.676737, 1.941109, 2.139217, 2.563838, 2.013145, 2.682408, 1.574679, 1.710298, 2.660378, 1.762358, 1.996974, 2.314566], "y": [8.2, 6.4, 6.6, 9.4, 13.3, 6.4, 9.2, 9.0, 7.6, 7.5, 17.0, 10.3, 5.8, 9.1, 11.8, 19.9, 9.6, 7.2, 10.1, 13.5, 9.9, 12.0, 8.1, 6.4, 18.6, 20.2, 5.9, 18.4, 37.5, 11.2, 14.7, 6.7, 33.0, 11.1, 10.0, 23.9, 6.5, 13.3, 5.7, 10.0, 8.0, 8.6, 11.7, 32.7, 8.0, 9.5, 17.0, 12.0, 9.4, 4.7, 7.6, 8.0, 9.1, 8.6, 7.8, 25.8, 13.7, 15.6, 9.5, 31.6, 8.6, 5.3, 19.9, 9.2, 7.7, 8.8, 29.6, 12.0, 15.4, 6.8, 7.5, 13.6, 9.1, 5.7, 10.7, 16.0, 8.3, 9.0, 10.8, 8.3, 6.2, 7.7, 4.9, 12.0, 10.0, 5.4, 12.0, 13.7, 13.4, 8.2, 5.2, 16.3, 11.1, 10.4, 8.7, 10.1, 9.7, 4.6, 6.7, 8.2, 7.8, 12.9, 10.1, 11.0, 5.5, 16.6, 9.5, 28.4, 12.8, 7.6, 15.2, 9.0, 6.3, 9.3, 6.8, 10.7, 11.7, 7.3, 11.6, 6.0, 17.3, 18.1, 8.0, 8.6, 7.8, 11.1, 13.1, 8.0, 15.9, 7.1, 5.6, 6.5, 7.1, 8.6, 9.2, 13.4, 14.0, 11.4, 11.4, 6.3, 13.6, 7.2, 4.8, 10.1, 9.0, 8.4, 9.4, 10.4, 4.2, 9.8, 9.6, 5.5, 8.6, 13.6, 12.0, 7.6, 10.4, 8.8, 6.3]}
\ No newline at end of file
diff --git a/release/python/0.8.0/crankshaft/test/fixtures/kmeans.json b/release/python/0.8.0/crankshaft/test/fixtures/kmeans.json
new file mode 100644
index 0000000..8f31c79
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/test/fixtures/kmeans.json
@@ -0,0 +1 @@
+[{"xs": [9.917239463463458, 9.042767302696836, 10.798929825304187, 8.763751051762995, 11.383882954810852, 11.018206993460897, 8.939526075734316, 9.636159342565252, 10.136336896960058, 11.480610059427342, 12.115011910725082, 9.173267848893428, 10.239300931201738, 8.00012512174072, 8.979962292282131, 9.318376124429575, 10.82259513754284, 10.391747171927115, 10.04904588886165, 9.96007160443463, -0.78825626804569, -0.3511819898577426, -1.2796410003764271, -0.3977049391203402, 2.4792311265774667, 1.3670311632092624, 1.2963504112955613, 2.0404844103073025, -1.6439708506073223, 0.39122885445645805, 1.026031821452462, -0.04044477160482201, -0.7442346929085072, -0.34687120826243034, -0.23420359971379054, -0.5919629143336708, -0.202903054395391, -0.1893399644841902, 1.9331834251176807, -0.12321054392851609], "ys": [8.735627063679981, 9.857615954045011, 10.81439096759407, 10.586727233537191, 9.232919976568622, 11.54281262696508, 8.392787912674466, 9.355119689665944, 9.22380703532752, 10.542142541823122, 10.111980619367035, 10.760836265570738, 8.819773453269804, 10.25325722424816, 9.802077905695608, 8.955420161552611, 9.833801181904477, 10.491684241001613, 12.076108669877556, 11.74289693140474, -0.5685725015474191, -0.5715728344759778, -0.20180907868635137, 0.38431336480089595, -0.3402202083684184, -2.4652736827783586, 0.08295159401756182, 0.8503818775816505, 0.6488691600321166, 0.5794762568230527, -0.6770063922144103, -0.6557616416449478, -1.2834289177624947, 0.1096318195532717, -0.38986922166834853, -1.6224497706950238, 0.09429787743230483, 0.4005097316394031, -0.508002811195673, -1.2473463371366507], "ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]}]
\ No newline at end of file
diff --git a/release/python/0.8.0/crankshaft/test/fixtures/markov.json b/release/python/0.8.0/crankshaft/test/fixtures/markov.json
new file mode 100644
index 0000000..d60e4e0
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/test/fixtures/markov.json
@@ -0,0 +1 @@
+[[0.11111111111111112, 0.10000000000000001, 0.0, 0.35213633723318016, 0], [0.03125, 0.030303030303030304, 0.0, 0.3850273981640871, 1], [0.03125, 0.030303030303030304, 0.0, 0.3850273981640871, 2], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 3], [0.0, 0.065217391304347824, 0.065217391304347824, 0.33605067580764519, 4], [-0.054054054054054057, 0.0, 0.05128205128205128, 0.37488547451276033, 5], [0.1875, 0.23999999999999999, 0.12, 0.23731835158706122, 6], [0.034482758620689655, 0.0625, 0.03125, 0.35388469167230169, 7], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 8], [0.19047619047619049, 0.16, 0.0, 0.32594478059941379, 9], [-0.23529411764705882, 0.0, 0.19047619047619047, 0.31356338348865387, 10], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 11], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 12], [0.027777777777777783, 0.11111111111111112, 0.088888888888888892, 0.30339641183779581, 13], [0.03125, 0.030303030303030304, 0.0, 0.3850273981640871, 14], [0.052631578947368425, 0.090909090909090912, 0.045454545454545456, 0.33352611505171165, 15], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 16], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 17], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 18], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 19], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 20], [0.078947368421052641, 0.073170731707317083, 0.0, 0.36451788667842738, 21], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 22], [-0.16666666666666663, 0.18181818181818182, 0.27272727272727271, 0.20246415864836445, 23], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 24], [0.1875, 0.23999999999999999, 0.12, 
0.23731835158706122, 25], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 26], [-0.043478260869565216, 0.0, 0.041666666666666664, 0.37950991789118999, 27], [0.22222222222222221, 0.18181818181818182, 0.0, 0.31701083225750354, 28], [-0.054054054054054057, 0.0, 0.05128205128205128, 0.37488547451276033, 29], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 30], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 31], [0.030303030303030304, 0.078947368421052627, 0.052631578947368418, 0.33560628561957595, 32], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 33], [0.034482758620689655, 0.0625, 0.03125, 0.35388469167230169, 34], [0.0, 0.10000000000000001, 0.10000000000000001, 0.30331501776206204, 35], [-0.054054054054054057, 0.0, 0.05128205128205128, 0.37488547451276033, 36], [0.11111111111111112, 0.10000000000000001, 0.0, 0.35213633723318016, 37], [-0.22222222222222224, 0.13333333333333333, 0.26666666666666666, 0.22310934040908681, 38], [-0.0625, 0.095238095238095233, 0.14285714285714285, 0.28634850244519822, 39], [0.034482758620689655, 0.0625, 0.03125, 0.35388469167230169, 40], [0.11111111111111112, 0.10000000000000001, 0.0, 0.35213633723318016, 41], [0.052631578947368425, 0.090909090909090912, 0.045454545454545456, 0.33352611505171165, 42], [0.0, 0.0, 0.0, 0.40000000000000002, 43], [0.0, 0.065217391304347824, 0.065217391304347824, 0.33605067580764519, 44], [0.078947368421052641, 0.073170731707317083, 0.0, 0.36451788667842738, 45], [0.052631578947368425, 0.090909090909090912, 0.045454545454545456, 0.33352611505171165, 46], [-0.20512820512820512, 0.0, 0.1702127659574468, 0.32172013908826891, 47]]
diff --git a/release/python/0.8.0/crankshaft/test/fixtures/moran.json b/release/python/0.8.0/crankshaft/test/fixtures/moran.json
new file mode 100644
index 0000000..2f75cf1
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/test/fixtures/moran.json
@@ -0,0 +1,52 @@
+[[0.9319096128346788, "HH"],
+[-1.135787401862846, "HL"],
+[0.11732030672508517, "LL"],
+[0.6152779669180425, "LL"],
+[-0.14657336660125297, "LH"],
+[0.6967858120189607, "LL"],
+[0.07949310115714454, "HH"],
+[0.4703198759258987, "HH"],
+[0.4421125200498064, "HH"],
+[0.5724288737143592, "LL"],
+[0.8970743435692062, "LL"],
+[0.18327334401918674, "LL"],
+[-0.01466729201304962, "HL"],
+[0.3481559372544409, "LL"],
+[0.06547094736902978, "LL"],
+[0.15482141569329988, "HH"],
+[0.4373841193538136, "HH"],
+[0.15971286468915544, "LL"],
+[1.0543588860308968, "HH"],
+[1.7372866900020818, "HH"],
+[1.091998586053999, "LL"],
+[0.1171572584252222, "HH"],
+[0.08438455015300014, "LL"],
+[0.06547094736902978, "LL"],
+[0.15482141569329985, "HH"],
+[1.1627044812890683, "HH"],
+[0.06547094736902978, "LL"],
+[0.795275137550483, "HH"],
+[0.18562939195219, "LL"],
+[0.3010757406693439, "LL"],
+[2.8205795942839376, "HH"],
+[0.11259190602909264, "LL"],
+[-0.07116352791516614, "HL"],
+[-0.09945240794119009, "LH"],
+[0.18562939195219, "LL"],
+[0.1832733440191868, "LL"],
+[-0.39054253768447705, "HL"],
+[-0.1672071289487642, "HL"],
+[0.3337669247916343, "HH"],
+[0.2584386102554792, "HH"],
+[-0.19733845476322634, "HL"],
+[-0.9379282899805409, "LH"],
+[-0.028770969951095866, "LH"],
+[0.051367269430983485, "LL"],
+[-0.2172548045913472, "LH"],
+[0.05136726943098351, "LL"],
+[0.04191046803899837, "LL"],
+[0.7482357030403517, "HH"],
+[-0.014585767863118111, "LH"],
+[0.5410013139159929, "HH"],
+[1.0223932668429925, "LL"],
+[1.4179402898927476, "LL"]]
\ No newline at end of file
diff --git a/release/python/0.8.0/crankshaft/test/fixtures/neighbors.json b/release/python/0.8.0/crankshaft/test/fixtures/neighbors.json
new file mode 100644
index 0000000..055b359
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/test/fixtures/neighbors.json
@@ -0,0 +1,54 @@
+[
+ {"neighbors": [48, 26, 20, 9, 31], "id": 1, "value": 0.5},
+ {"neighbors": [30, 16, 46, 3, 4], "id": 2, "value": 0.7},
+ {"neighbors": [46, 30, 2, 12, 16], "id": 3, "value": 0.2},
+ {"neighbors": [18, 30, 23, 2, 52], "id": 4, "value": 0.1},
+ {"neighbors": [47, 40, 45, 37, 28], "id": 5, "value": 0.3},
+ {"neighbors": [10, 21, 41, 14, 37], "id": 6, "value": 0.05},
+ {"neighbors": [8, 17, 43, 25, 12], "id": 7, "value": 0.4},
+ {"neighbors": [17, 25, 43, 22, 7], "id": 8, "value": 0.7},
+ {"neighbors": [39, 34, 1, 26, 48], "id": 9, "value": 0.5},
+ {"neighbors": [6, 37, 5, 45, 49], "id": 10, "value": 0.04},
+ {"neighbors": [51, 41, 29, 21, 14], "id": 11, "value": 0.08},
+ {"neighbors": [44, 46, 43, 50, 3], "id": 12, "value": 0.2},
+ {"neighbors": [45, 23, 14, 28, 18], "id": 13, "value": 0.4},
+ {"neighbors": [41, 29, 13, 23, 6], "id": 14, "value": 0.2},
+ {"neighbors": [36, 27, 32, 33, 24], "id": 15, "value": 0.3},
+ {"neighbors": [19, 2, 46, 44, 28], "id": 16, "value": 0.4},
+ {"neighbors": [8, 25, 43, 7, 22], "id": 17, "value": 0.6},
+ {"neighbors": [23, 4, 29, 14, 13], "id": 18, "value": 0.3},
+ {"neighbors": [42, 16, 28, 26, 40], "id": 19, "value": 0.7},
+ {"neighbors": [1, 48, 31, 26, 42], "id": 20, "value": 0.8},
+ {"neighbors": [41, 6, 11, 14, 10], "id": 21, "value": 0.1},
+ {"neighbors": [25, 50, 43, 31, 44], "id": 22, "value": 0.4},
+ {"neighbors": [18, 13, 14, 4, 2], "id": 23, "value": 0.1},
+ {"neighbors": [33, 49, 34, 47, 27], "id": 24, "value": 0.3},
+ {"neighbors": [43, 8, 22, 17, 50], "id": 25, "value": 0.4},
+ {"neighbors": [1, 42, 20, 31, 48], "id": 26, "value": 0.6},
+ {"neighbors": [32, 15, 36, 33, 24], "id": 27, "value": 0.3},
+ {"neighbors": [40, 45, 19, 5, 13], "id": 28, "value": 0.8},
+ {"neighbors": [11, 51, 41, 14, 18], "id": 29, "value": 0.3},
+ {"neighbors": [2, 3, 4, 46, 18], "id": 30, "value": 0.1},
+ {"neighbors": [20, 26, 1, 50, 48], "id": 31, "value": 0.9},
+ {"neighbors": [27, 36, 15, 49, 24], "id": 32, "value": 0.3},
+ {"neighbors": [24, 27, 49, 34, 32], "id": 33, "value": 0.4},
+ {"neighbors": [47, 9, 39, 40, 24], "id": 34, "value": 0.3},
+ {"neighbors": [38, 51, 11, 21, 41], "id": 35, "value": 0.3},
+ {"neighbors": [15, 32, 27, 49, 33], "id": 36, "value": 0.2},
+ {"neighbors": [49, 10, 5, 47, 24], "id": 37, "value": 0.5},
+ {"neighbors": [35, 21, 51, 11, 41], "id": 38, "value": 0.4},
+ {"neighbors": [9, 34, 48, 1, 47], "id": 39, "value": 0.6},
+ {"neighbors": [28, 47, 5, 9, 34], "id": 40, "value": 0.5},
+ {"neighbors": [11, 14, 29, 21, 6], "id": 41, "value": 0.4},
+ {"neighbors": [26, 19, 1, 9, 31], "id": 42, "value": 0.2},
+ {"neighbors": [25, 12, 8, 22, 44], "id": 43, "value": 0.3},
+ {"neighbors": [12, 50, 46, 16, 43], "id": 44, "value": 0.2},
+ {"neighbors": [28, 13, 5, 40, 19], "id": 45, "value": 0.3},
+ {"neighbors": [3, 12, 44, 2, 16], "id": 46, "value": 0.2},
+ {"neighbors": [34, 40, 5, 49, 24], "id": 47, "value": 0.3},
+ {"neighbors": [1, 20, 26, 9, 39], "id": 48, "value": 0.5},
+ {"neighbors": [24, 37, 47, 5, 33], "id": 49, "value": 0.2},
+ {"neighbors": [44, 22, 31, 42, 26], "id": 50, "value": 0.6},
+ {"neighbors": [11, 29, 41, 14, 21], "id": 51, "value": 0.01},
+ {"neighbors": [4, 18, 29, 51, 23], "id": 52, "value": 0.01}
+ ]
diff --git a/release/python/0.8.0/crankshaft/test/fixtures/neighbors_getis.json b/release/python/0.8.0/crankshaft/test/fixtures/neighbors_getis.json
new file mode 100644
index 0000000..be367ff
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/test/fixtures/neighbors_getis.json
@@ -0,0 +1 @@
+[{"neighbors": [3, 6, 7], "id": 1, "value": 1.624458}, {"neighbors": [10, 5, 8], "id": 2, "value": 2.2554919999999998}, {"neighbors": [1, 4, 7], "id": 3, "value": 1.4678899999999999}, {"neighbors": [9, 3, 5, 7], "id": 4, "value": 2.4842559999999998}, {"neighbors": [9, 2, 4, 10], "id": 5, "value": 0.0}, {"neighbors": [1, 11, 12, 7, 16], "id": 6, "value": 9.0486730000000009}, {"neighbors": [1, 3, 4, 6, 9, 11, 18, 19], "id": 7, "value": 6.0294889999999999}, {"neighbors": [2, 15, 10], "id": 8, "value": 1.8003849999999999}, {"neighbors": [4, 5, 7, 10, 13, 19, 20], "id": 9, "value": 4.581251}, {"neighbors": [2, 5, 8, 9, 13, 15, 17, 20, 21], "id": 10, "value": 3.7906070000000001}, {"neighbors": [18, 6, 7, 16], "id": 11, "value": 1.4474359999999999}, {"neighbors": [16, 6, 14], "id": 12, "value": 1.1919660000000001}, {"neighbors": [9, 10, 20], "id": 13, "value": 0.0}, {"neighbors": [12, 22, 16], "id": 14, "value": 1.608017}, {"neighbors": [17, 10, 23, 8], "id": 15, "value": 1.9498120000000001}, {"neighbors": [6, 11, 12, 14, 18, 22, 27, 28], "id": 16, "value": 0.74509000000000003}, {"neighbors": [10, 15, 21, 23, 26, 30], "id": 17, "value": 4.1733180000000001}, {"neighbors": [33, 7, 11, 16, 19, 27, 32], "id": 18, "value": 3.7832520000000001}, {"neighbors": [33, 7, 9, 18, 20, 24], "id": 19, "value": 2.0851359999999999}, {"neighbors": [9, 10, 13, 19, 21, 24], "id": 20, "value": 2.1763020000000002}, {"neighbors": [35, 10, 17, 20, 24, 26], "id": 21, "value": 6.3093469999999998}, {"neighbors": [28, 29, 14, 16], "id": 22, "value": 10.855743}, {"neighbors": [17, 25, 31, 30, 15], "id": 23, "value": 4.211354}, {"neighbors": [33, 19, 20, 21, 35], "id": 24, "value": 0.80481000000000003}, {"neighbors": [42, 31, 23], "id": 25, "value": 3.2153309999999999}, {"neighbors": [17, 34, 35, 21, 30], "id": 26, "value": 2.8336640000000002}, {"neighbors": [36, 39, 41, 16, 18, 28, 32], "id": 27, "value": 1.5920399999999999}, {"neighbors": [27, 36, 29, 22, 16], "id": 28, "value": 1.5711580000000001}, 
{"neighbors": [36, 28, 22, 38], "id": 29, "value": 3.1275900000000001}, {"neighbors": [34, 43, 17, 23, 26, 31], "id": 30, "value": 4.4168960000000004}, {"neighbors": [42, 43, 44, 23, 25, 30], "id": 31, "value": 3.0174859999999999}, {"neighbors": [33, 18, 27, 41], "id": 32, "value": 9.9242450000000009}, {"neighbors": [35, 37, 40, 41, 46, 18, 19, 24, 32], "id": 33, "value": 7.9739570000000004}, {"neighbors": [26, 35, 43, 45, 30], "id": 34, "value": 5.0054639999999999}, {"neighbors": [33, 34, 37, 40, 45, 21, 24, 26], "id": 35, "value": 2.4638909999999998}, {"neighbors": [38, 39, 47, 27, 28, 29], "id": 36, "value": 0.0}, {"neighbors": [33, 35, 40, 45, 46, 49, 51], "id": 37, "value": 7.377974}, {"neighbors": [36, 29, 47, 48], "id": 38, "value": 1.0038750000000001}, {"neighbors": [36, 41, 47, 50, 52, 27], "id": 39, "value": 3.1900469999999999}, {"neighbors": [33, 35, 37, 46], "id": 40, "value": 45.905405999999999}, {"neighbors": [33, 39, 46, 50, 27, 32], "id": 41, "value": 2.447597}, {"neighbors": [25, 44, 53, 31], "id": 42, "value": 1.2949580000000001}, {"neighbors": [34, 44, 45, 54, 59, 61, 30, 31], "id": 43, "value": 5.9330980000000002}, {"neighbors": [42, 43, 53, 54, 31], "id": 44, "value": 4.1339969999999999}, {"neighbors": [34, 35, 37, 43, 51, 59, 60], "id": 45, "value": 4.298311}, {"neighbors": [33, 37, 40, 41, 49, 50, 57], "id": 46, "value": 27.483827000000002}, {"neighbors": [36, 38, 39, 48, 52, 55, 56], "id": 47, "value": 0.96979099999999996}, {"neighbors": [55, 38, 47], "id": 48, "value": 0.0}, {"neighbors": [57, 51, 37, 46, 63], "id": 49, "value": 2.934466}, {"neighbors": [39, 41, 46, 52, 57, 58], "id": 50, "value": 4.4564269999999997}, {"neighbors": [37, 45, 49, 60, 63, 64], "id": 51, "value": 4.629264}, {"neighbors": [39, 47, 50, 56, 58, 62], "id": 52, "value": 4.9415329999999997}, {"neighbors": [65, 42, 44, 54], "id": 53, "value": 3.9900410000000002}, {"neighbors": [65, 61, 43, 44, 53], "id": 54, "value": 2.064324}, {"neighbors": [56, 47, 48], "id": 55, 
"value": 3.0402529999999999}, {"neighbors": [52, 55, 47, 62], "id": 56, "value": 3.905411}, {"neighbors": [66, 67, 46, 49, 50, 58, 63], "id": 57, "value": 4.3328389999999999}, {"neighbors": [57, 50, 52, 62, 66], "id": 58, "value": 3.8941110000000001}, {"neighbors": [69, 70, 43, 45, 60, 61], "id": 59, "value": 6.8287940000000003}, {"neighbors": [51, 64, 45, 59, 70], "id": 60, "value": 3.2639469999999999}, {"neighbors": [65, 69, 72, 43, 54, 59], "id": 61, "value": 3.2821630000000002}, {"neighbors": [58, 68, 52, 66, 56], "id": 62, "value": 3.2957619999999999}, {"neighbors": [49, 57, 51, 67, 64], "id": 63, "value": 7.2496790000000004}, {"neighbors": [67, 70, 71, 51, 60, 63], "id": 64, "value": 3.041846}, {"neighbors": [61, 53, 54, 72], "id": 65, "value": 1.618018}, {"neighbors": [67, 68, 73, 76, 57, 58, 62], "id": 66, "value": 4.9108010000000002}, {"neighbors": [66, 71, 73, 75, 76, 57, 63, 64], "id": 67, "value": 1.991457}, {"neighbors": [73, 66, 62], "id": 68, "value": 3.1461920000000001}, {"neighbors": [70, 72, 74, 77, 59, 61], "id": 69, "value": 7.2666500000000003}, {"neighbors": [69, 71, 74, 78, 59, 60, 64], "id": 70, "value": 3.1109040000000001}, {"neighbors": [67, 75, 70, 78, 64], "id": 71, "value": 2.9802710000000001}, {"neighbors": [65, 69, 61, 77], "id": 72, "value": 3.8667669999999998}, {"neighbors": [76, 66, 67, 68], "id": 73, "value": 1.8684080000000001}, {"neighbors": [77, 69, 70, 78], "id": 74, "value": 12.577033999999999}, {"neighbors": [67, 76, 78, 71], "id": 75, "value": 7.8035990000000002}, {"neighbors": [73, 66, 67, 75], "id": 76, "value": 3.4714900000000002}, {"neighbors": [74, 69, 72], "id": 77, "value": 4.334822}, {"neighbors": [74, 75, 70, 71], "id": 78, "value": 8.4515370000000001}]
diff --git a/release/python/0.8.0/crankshaft/test/fixtures/neighbors_markov.json b/release/python/0.8.0/crankshaft/test/fixtures/neighbors_markov.json
new file mode 100644
index 0000000..45a20e7
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/test/fixtures/neighbors_markov.json
@@ -0,0 +1 @@
+[{"neighbors": [10, 7, 21, 23, 1], "y1995": 0.87654416055651474, "y1997": 0.85637566664752718, "y1996": 0.8631470006766887, "y1999": 0.84461540228037335, "y1998": 0.84811668329242784, "y2006": 0.86302631339545688, "y2007": 0.86148266513456728, "y2004": 0.86416611731111015, "y2005": 0.87119374831581786, "y2002": 0.85012592862683589, "y2003": 0.8550965633336135, "y2000": 0.83271652434603094, "y2001": 0.83786313566577242, "id": 0, "y2008": 0.86252252380501315, "y2009": 0.86746356478544273}, {"neighbors": [5, 7, 22, 29, 3], "y1995": 0.91889509774542122, "y1997": 0.92333257900976462, "y1996": 0.91757931190043385, "y1999": 0.92552387732371888, "y1998": 0.92517289327379471, "y2006": 0.91706053906277052, "y2007": 0.90139504820726424, "y2004": 0.89815175749309051, "y2005": 0.91832090781161113, "y2002": 0.89431990798552208, "y2003": 0.88924793576523797, "y2000": 0.90746978227271013, "y2001": 0.89830489127332913, "id": 1, "y2008": 0.87897455159080617, "y2009": 0.86216858051752643}, {"neighbors": [11, 8, 13, 18, 17], "y1995": 0.82591007476914713, "y1997": 0.81989792988843901, "y1996": 0.82548595539161707, "y1999": 0.81731522200916285, "y1998": 0.81503235035017918, "y2006": 0.81814804358939286, "y2007": 0.83675961003285626, "y2004": 0.82668195534569056, "y2005": 0.82373723764184559, "y2002": 0.80849979516360859, "y2003": 0.82258550658074148, "y2000": 0.78964559168205917, "y2001": 0.8058444152731008, "id": 2, "y2008": 0.8357419865626442, "y2009": 0.84647177436289112}, {"neighbors": [4, 14, 9, 5, 12], "y1995": 1.0908817638059434, "y1997": 1.0845641754849344, "y1996": 1.0853768890893893, "y1999": 1.098988414417104, "y1998": 1.0841540389418189, "y2006": 1.1316479722785828, "y2007": 1.1295850763954971, "y2004": 1.1139980568106316, "y2005": 1.1216802898290368, "y2002": 1.1116069731657288, "y2003": 1.1088862051501811, "y2000": 1.1450694824791507, "y2001": 1.1215113292620285, "id": 3, "y2008": 1.1137181812756343, "y2009": 1.0993677488645406}, {"neighbors": [14, 3, 9, 31, 12], "y1995": 
1.1073144618319228, "y1997": 1.1328363804627946, "y1996": 1.1137394350312471, "y1999": 1.1591002514611153, "y1998": 1.144725587086376, "y2006": 1.1173646811350333, "y2007": 1.1086324218539598, "y2004": 1.1102496406140896, "y2005": 1.11943471361418, "y2002": 1.1475230282561595, "y2003": 1.1184328424005199, "y2000": 1.1689820101690329, "y2001": 1.1721248787169682, "id": 4, "y2008": 1.0964251552643696, "y2009": 1.0776233718455337}, {"neighbors": [29, 1, 22, 7, 4], "y1995": 1.422697571371182, "y1997": 1.4427350196405593, "y1996": 1.4211843379728528, "y1999": 1.4440068434166562, "y1998": 1.4357757095632602, "y2006": 1.4405276647793266, "y2007": 1.4524121586440921, "y2004": 1.4059372049179741, "y2005": 1.4078864636665769, "y2002": 1.4197822680667809, "y2003": 1.3909220829548647, "y2000": 1.4418473669388905, "y2001": 1.4478283203013527, "id": 5, "y2008": 1.4330609762040207, "y2009": 1.4174430982377491}, {"neighbors": [12, 47, 9, 25, 20], "y1995": 1.1307388498039153, "y1997": 1.1107470843142355, "y1996": 1.1311051255854685, "y1999": 1.130881491772973, "y1998": 1.1336463608751246, "y2006": 1.1088003408832796, "y2007": 1.0840170924825394, "y2004": 1.1244623853593112, "y2005": 1.1167100811401538, "y2002": 1.1306293052597198, "y2003": 1.1194498381213465, "y2000": 1.1088813841947593, "y2001": 1.1185662918783175, "id": 6, "y2008": 1.0695920556329086, "y2009": 1.0787522517402164}, {"neighbors": [21, 1, 22, 10, 0], "y1995": 1.0470612357366649, "y1997": 1.0425337165747406, "y1996": 1.0451683097376836, "y1999": 1.0207254480945218, "y1998": 1.0323998680588111, "y2006": 1.0405109962442973, "y2007": 1.0174964540280445, "y2004": 1.0140090547678748, "y2005": 1.0317674181861733, "y2002": 0.99669586934394627, "y2003": 0.99327675611171373, "y2000": 0.99854316295509526, "y2001": 0.98802579761429143, "id": 7, "y2008": 0.9936394033949828, "y2009": 0.98279746069218921}, {"neighbors": [11, 13, 17, 18, 15], "y1995": 0.98996985668705595, "y1997": 0.99491000469481983, "y1996": 1.0014356415938011, 
"y1999": 1.0045584503565237, "y1998": 1.0018840754492748, "y2006": 0.92232873520447411, "y2007": 0.91284090705064902, "y2004": 0.93694786512729977, "y2005": 0.94308212820743131, "y2002": 0.96834820215592055, "y2003": 0.95335147249088092, "y2000": 0.99127006477048718, "y2001": 0.97925917470464008, "id": 8, "y2008": 0.89689832627117483, "y2009": 0.88928857608264111}, {"neighbors": [12, 6, 4, 3, 14], "y1995": 0.87418390853652306, "y1997": 0.84425695187978567, "y1996": 0.86416601430334228, "y1999": 0.83903043942542854, "y1998": 0.8404493987171674, "y2006": 0.87204140839730271, "y2007": 0.86633032299764789, "y2004": 0.86981997840756087, "y2005": 0.86837929279319737, "y2002": 0.86107306112852877, "y2003": 0.85007719735663123, "y2000": 0.85787080050645603, "y2001": 0.86036185149249467, "id": 9, "y2008": 0.84946077011565357, "y2009": 0.83287145944123797}, {"neighbors": [0, 7, 21, 23, 22], "y1995": 1.1419611801631209, "y1997": 1.1489271154554144, "y1996": 1.146602624490825, "y1999": 1.1443662376135306, "y1998": 1.1490959392942743, "y2006": 1.1049125811637337, "y2007": 1.1105984164317646, "y2004": 1.1119989015058092, "y2005": 1.1025779214946556, "y2002": 1.1259666377127024, "y2003": 1.1221399558345004, "y2000": 1.144501826035474, "y2001": 1.1234975172649961, "id": 10, "y2008": 1.1050979494645479, "y2009": 1.1002009697391872}, {"neighbors": [8, 13, 18, 17, 2], "y1995": 0.97282462974938089, "y1997": 0.96252588061647382, "y1996": 0.96700147279313231, "y1999": 0.96057686787383312, "y1998": 0.96538780087103548, "y2006": 0.91010201260822066, "y2007": 0.89280392121658247, "y2004": 0.94103988614185807, "y2005": 0.9212251863828258, "y2002": 0.94804194711420009, "y2003": 0.9543028555845573, "y2000": 0.95831051250950716, "y2001": 0.94480908623936988, "id": 11, "y2008": 0.89298242828382146, "y2009": 0.89165384824292859}, {"neighbors": [33, 9, 6, 25, 31], "y1995": 0.94325467991401402, "y1997": 0.96455242154753429, "y1996": 0.96436902092427723, "y1999": 0.94117647058823528, "y1998": 
0.95243008993884537, "y2006": 0.9346681464882507, "y2007": 0.94281559150403071, "y2004": 0.96918424441756057, "y2005": 0.94781280876672958, "y2002": 0.95388717527096822, "y2003": 0.94597005193649519, "y2000": 0.94809269652332606, "y2001": 0.93539181553564288, "id": 12, "y2008": 0.965203150896216, "y2009": 0.967154410723015}, {"neighbors": [18, 17, 11, 8, 19], "y1995": 0.97478408425654373, "y1997": 0.98712808751954773, "y1996": 0.98169225257738801, "y1999": 0.985598971191053, "y1998": 0.98474769442356791, "y2006": 0.98416665248276058, "y2007": 0.98423613480079708, "y2004": 0.97399471186978948, "y2005": 0.96910087128357136, "y2002": 0.9820996926750224, "y2003": 0.98776529543110569, "y2000": 0.98687072733199255, "y2001": 0.99237486444837619, "id": 13, "y2008": 0.99823861244053191, "y2009": 0.99545704236827348}, {"neighbors": [4, 31, 3, 29, 12], "y1995": 0.85570268988941878, "y1997": 0.85986131704895119, "y1996": 0.85575915188345031, "y1999": 0.85380119644969055, "y1998": 0.85693406055397725, "y2006": 0.82803647591954255, "y2007": 0.81987360180979219, "y2004": 0.83998883284341452, "y2005": 0.83478547261894065, "y2002": 0.85472102128186755, "y2003": 0.84564834502399988, "y2000": 0.86191535266765262, "y2001": 0.84981450830432048, "id": 14, "y2008": 0.82265395167873867, "y2009": 0.83994039782937002}, {"neighbors": [19, 8, 17, 16, 13], "y1995": 0.87022046646521634, "y1997": 0.85961813213722393, "y1996": 0.85996258309339635, "y1999": 0.8394713575455558, "y1998": 0.85689572413110093, "y2006": 0.94202108334913126, "y2007": 0.94222309998743192, "y2004": 0.86763340229291142, "y2005": 0.89179316746010362, "y2002": 0.86776297543511893, "y2003": 0.86720209304280604, "y2000": 0.82785596604704892, "y2001": 0.86008789452656809, "id": 15, "y2008": 0.93902708112840494, "y2009": 0.94479183757120588}, {"neighbors": [28, 26, 15, 19, 32], "y1995": 0.90134907329491731, "y1997": 0.90403990934606904, "y1996": 0.904077381347274, "y1999": 0.90399237579083946, "y1998": 0.90201769385650832, 
"y2006": 0.91108803862404764, "y2007": 0.90543476309316473, "y2004": 0.94338264626469681, "y2005": 0.91981795862151561, "y2002": 0.93695966482853577, "y2003": 0.94242697007039, "y2000": 0.90906631602055099, "y2001": 0.92693339421265908, "id": 16, "y2008": 0.91737137682250491, "y2009": 0.94793657442067902}, {"neighbors": [13, 18, 11, 19, 8], "y1995": 1.1977611005602815, "y1997": 1.1843915817489725, "y1996": 1.1822256425225894, "y1999": 1.1928672308275252, "y1998": 1.1826786457339149, "y2006": 1.2392938410349985, "y2007": 1.2341867605077472, "y2004": 1.2385704217423759, "y2005": 1.2441989281116201, "y2002": 1.2262477774195681, "y2003": 1.2239707531714479, "y2000": 1.2017286912636342, "y2001": 1.2132869128474402, "id": 17, "y2008": 1.2362673914436095, "y2009": 1.2675439750795283}, {"neighbors": [13, 17, 11, 8, 19], "y1995": 1.2491967813733067, "y1997": 1.2699116090397236, "y1996": 1.2575477330927329, "y1999": 1.3062566740535762, "y1998": 1.2802065055312271, "y2006": 1.3210776560048689, "y2007": 1.329362443219563, "y2004": 1.3054484140490119, "y2005": 1.3030330249408666, "y2002": 1.3257518058685978, "y2003": 1.3079549159235695, "y2000": 1.3479002255103918, "y2001": 1.3439986302151703, "id": 18, "y2008": 1.3300124123891741, "y2009": 1.3328846185074705}, {"neighbors": [26, 17, 28, 15, 16], "y1995": 1.0676800411188558, "y1997": 1.0363730321443168, "y1996": 1.0379927554499979, "y1999": 1.0329609259280523, "y1998": 1.027684488045026, "y2006": 0.94241549375546196, "y2007": 0.92754546923532677, "y2004": 0.99614160423102482, "y2005": 0.97356208269708677, "y2002": 1.0274762326434594, "y2003": 1.0316273366809443, "y2000": 1.0505901631347052, "y2001": 1.0340505678899605, "id": 19, "y2008": 0.92549226593721745, "y2009": 0.92138101880290568}, {"neighbors": [30, 25, 24, 37, 47], "y1995": 1.0947561397632881, "y1997": 1.1165429913770684, "y1996": 1.1152679554712275, "y1999": 1.1314326394231322, "y1998": 1.1310394841195361, "y2006": 1.1090538904302065, "y2007": 1.1057776900012568, 
"y2004": 1.1402994437897009, "y2005": 1.1197940058085571, "y2002": 1.133670175399079, "y2003": 1.139822558851451, "y2000": 1.1388962186541665, "y2001": 1.1244221220249986, "id": 20, "y2008": 1.1116682481010467, "y2009": 1.0998515545336902}, {"neighbors": [23, 22, 7, 10, 34], "y1995": 0.76530058421804126, "y1997": 0.76542450966153397, "y1996": 0.76612841163904621, "y1999": 0.76014283909933289, "y1998": 0.7672268310234307, "y2006": 0.76842416021983684, "y2007": 0.77487117798086069, "y2004": 0.76533287692895391, "y2005": 0.78205934309410463, "y2002": 0.76156903267949927, "y2003": 0.76651951668098528, "y2000": 0.74480073263159763, "y2001": 0.76098396210261965, "id": 21, "y2008": 0.77768682781054099, "y2009": 0.78801192267396702}, {"neighbors": [21, 34, 5, 7, 29], "y1995": 0.98391336093764348, "y1997": 0.98295341320156315, "y1996": 0.98075815675295552, "y1999": 0.96913802803963667, "y1998": 0.97386015032669815, "y2006": 0.93965462091114671, "y2007": 0.93069644684632924, "y2004": 0.9635616201227476, "y2005": 0.94745351657235244, "y2002": 0.97209860866113018, "y2003": 0.97441312580606143, "y2000": 0.97370819354423843, "y2001": 0.96419154157867693, "id": 22, "y2008": 0.94020973488297466, "y2009": 0.94358232339833159}, {"neighbors": [21, 10, 22, 34, 7], "y1995": 0.83561828119099946, "y1997": 0.81738501913392403, "y1996": 0.82298088022609361, "y1999": 0.80904800725677739, "y1998": 0.81748588141426259, "y2006": 0.87170334233473346, "y2007": 0.8786379876833581, "y2004": 0.85954307066870839, "y2005": 0.86790023653402792, "y2002": 0.83451612857812574, "y2003": 0.85175031934895873, "y2000": 0.80071489233375537, "y2001": 0.83358255807316928, "id": 23, "y2008": 0.87497981001981484, "y2009": 0.87888675419592222}, {"neighbors": [27, 20, 30, 32, 47], "y1995": 0.98845573274970278, "y1997": 0.99665282989553183, "y1996": 1.0209242772035507, "y1999": 0.99386618594343845, "y1998": 0.99141823200404444, "y2006": 0.97906748937234156, "y2007": 0.9932312332800689, "y2004": 1.0111665058188304, 
"y2005": 0.9998802359352077, "y2002": 0.99669586934394627, "y2003": 1.0255909749831356, "y2000": 0.98733194819247994, "y2001": 0.99644997431653437, "id": 24, "y2008": 1.0020493856497013, "y2009": 0.99602148231561483}, {"neighbors": [20, 33, 6, 30, 12], "y1995": 1.1493091345649815, "y1997": 1.143009615936718, "y1996": 1.1524194939429724, "y1999": 1.1398468268822266, "y1998": 1.1426554202510555, "y2006": 1.0889107875354573, "y2007": 1.0860369499254896, "y2004": 1.0856975145267398, "y2005": 1.1244348633192611, "y2002": 1.0423089214343333, "y2003": 1.0557727834721793, "y2000": 1.0831239730629278, "y2001": 1.0519262599166714, "id": 25, "y2008": 1.0599731384290745, "y2009": 1.0216094265950888}, {"neighbors": [28, 19, 16, 32, 17], "y1995": 1.1136826889802023, "y1997": 1.1189343096757198, "y1996": 1.1057147027213501, "y1999": 1.1432271991365353, "y1998": 1.1377866945457653, "y2006": 1.1268023587150906, "y2007": 1.1235793669317915, "y2004": 1.1482023546040769, "y2005": 1.1238659840114973, "y2002": 1.1600919581655105, "y2003": 1.1446778932605579, "y2000": 1.1825702862895446, "y2001": 1.1622624279436105, "id": 26, "y2008": 1.115925801617498, "y2009": 1.1257082797404696}, {"neighbors": [32, 24, 36, 16, 28], "y1995": 1.303794309231981, "y1997": 1.3120636604057812, "y1996": 1.3075218596998686, "y1999": 1.3062566740535762, "y1998": 1.3153226688859194, "y2006": 1.2865667454509278, "y2007": 1.2973409698906584, "y2004": 1.2683078569016086, "y2005": 1.2617743046198988, "y2002": 1.2920319347677043, "y2003": 1.2718351646774422, "y2000": 1.3121023910310281, "y2001": 1.2998915587009874, "id": 27, "y2008": 1.2939020510829768, "y2009": 1.2934544564717687}, {"neighbors": [26, 16, 19, 32, 27], "y1995": 0.83953719020532513, "y1997": 0.82006005316292385, "y1996": 0.82701447583159737, "y1999": 0.80294863992835086, "y1998": 0.8118887636743225, "y2006": 0.8389109342655191, "y2007": 0.84349246817602375, "y2004": 0.83108634437662732, "y2005": 0.84373783646216949, "y2002": 0.82596790474192727, 
"y2003": 0.82435704751379402, "y2000": 0.78772975118465016, "y2001": 0.82848010958278628, "id": 28, "y2008": 0.85637272428125033, "y2009": 0.86539395164519117}, {"neighbors": [5, 39, 22, 14, 31], "y1995": 1.2345008725695852, "y1997": 1.2353793515744536, "y1996": 1.2426021999018138, "y1999": 1.2452262575926329, "y1998": 1.2358129278404693, "y2006": 1.2365329681906834, "y2007": 1.2796200872578414, "y2004": 1.1967443443492951, "y2005": 1.2153657295128597, "y2002": 1.1937780418204111, "y2003": 1.1835533748469893, "y2000": 1.2256766974812463, "y2001": 1.2112664802237314, "id": 29, "y2008": 1.2796839248335934, "y2009": 1.2590773758694083}, {"neighbors": [37, 20, 24, 25, 27], "y1995": 0.97696620404861145, "y1997": 0.98035944080980575, "y1996": 0.9740071914763756, "y1999": 0.95543282313901556, "y1998": 0.97581530789338955, "y2006": 0.92100464312607799, "y2007": 0.9147530387633086, "y2004": 0.9298883479571457, "y2005": 0.93442917452618346, "y2002": 0.93679072759857129, "y2003": 0.92540049332494034, "y2000": 0.96480308308405971, "y2001": 0.9468637634838194, "id": 30, "y2008": 0.90249622070947177, "y2009": 0.90213630440783921}, {"neighbors": [35, 14, 33, 12, 4], "y1995": 0.84986885942491119, "y1997": 0.84295996568390696, "y1996": 0.89868510090623221, "y1999": 0.85659367787716301, "y1998": 0.87280533962476625, "y2006": 0.92562487931452408, "y2007": 0.96635366357254426, "y2004": 0.92698332540482575, "y2005": 0.94745351657235244, "y2002": 0.90448992922937876, "y2003": 0.95495898185605821, "y2000": 0.88937573313051443, "y2001": 0.89440100450887505, "id": 31, "y2008": 1.025203118044723, "y2009": 1.0394296020754366}, {"neighbors": [36, 27, 28, 16, 26], "y1995": 1.0192280751235561, "y1997": 1.0097442843101825, "y1996": 1.0025820319237864, "y1999": 0.99765073314119712, "y1998": 1.0030341681355639, "y2006": 0.94779637858468868, "y2007": 0.93759089358493275, "y2004": 0.97583768316642261, "y2005": 0.96101679691008712, "y2002": 0.99747298060178258, "y2003": 0.99550758543481688, "y2000": 
1.0075901875261932, "y2001": 0.99192968437874551, "id": 32, "y2008": 0.93353431146829191, "y2009": 0.94121705123804411}, {"neighbors": [44, 25, 12, 35, 31], "y1995": 0.86367410708901315, "y1997": 0.85544345781923936, "y1996": 0.85558931627900803, "y1999": 0.84336613427334628, "y1998": 0.85103025143102673, "y2006": 0.89455097373003656, "y2007": 0.88283929116469462, "y2004": 0.85951183386707053, "y2005": 0.87194227372077004, "y2002": 0.84667960913556228, "y2003": 0.84374557883664714, "y2000": 0.83434853662160158, "y2001": 0.85813595114434105, "id": 33, "y2008": 0.90349490610221961, "y2009": 0.9060067497610369}, {"neighbors": [22, 39, 21, 29, 23], "y1995": 1.0094753356447226, "y1997": 1.0069881886439402, "y1996": 1.0041105523637666, "y1999": 0.99291086334982948, "y1998": 0.99513686502304577, "y2006": 0.96382634438484593, "y2007": 0.95011400973122428, "y2004": 0.975119236728752, "y2005": 0.96134614808826613, "y2002": 0.99291167539274383, "y2003": 0.98983209318633369, "y2000": 1.0058162611397035, "y2001": 0.98850522230466298, "id": 34, "y2008": 0.94346860300667812, "y2009": 0.9463776450423077}, {"neighbors": [31, 38, 44, 33, 14], "y1995": 1.0571257066143651, "y1997": 1.0575301194645879, "y1996": 1.0545941857842291, "y1999": 1.0510385688532684, "y1998": 1.0488078570498685, "y2006": 1.0247627521629479, "y2007": 1.0234752320591773, "y2004": 1.0329697933620496, "y2005": 1.0219168238570018, "y2002": 1.0420048344203974, "y2003": 1.0402553971511816, "y2000": 1.0480002306104303, "y2001": 1.030249414987729, "id": 35, "y2008": 1.0251768368501768, "y2009": 1.0435957064486703}, {"neighbors": [32, 43, 27, 28, 42], "y1995": 1.070841888164505, "y1997": 1.0793762307014196, "y1996": 1.0666949726007404, "y1999": 1.0794043012481198, "y1998": 1.0738798776109699, "y2006": 1.087727556316465, "y2007": 1.0885954360198933, "y2004": 1.1032213602455734, "y2005": 1.0916793915985508, "y2002": 1.0938347765734742, "y2003": 1.1052447043433509, "y2000": 1.0531800956589803, "y2001": 1.0745277096056161, 
"id": 36, "y2008": 1.0917733838297285, "y2009": 1.1096083021948762}, {"neighbors": [30, 40, 20, 42, 41], "y1995": 0.8671922185905101, "y1997": 0.86675155621455668, "y1996": 0.86628895935887062, "y1999": 0.86511809486628932, "y1998": 0.86425631732335095, "y2006": 0.84488343470424199, "y2007": 0.83374328958471722, "y2004": 0.84517414191529749, "y2005": 0.84843857600526962, "y2002": 0.85411284725399572, "y2003": 0.84886336375435456, "y2000": 0.86287327291635718, "y2001": 0.8516979624450659, "id": 37, "y2008": 0.82812044014430564, "y2009": 0.82878598934619596}, {"neighbors": [35, 31, 45, 39, 44], "y1995": 0.8838921149583755, "y1997": 0.90282398478743275, "y1996": 0.92288667453925455, "y1999": 0.92023285988219217, "y1998": 0.91229185518735723, "y2006": 0.93869676706720051, "y2007": 0.96947770975097391, "y2004": 0.99223700402629367, "y2005": 0.97984969609868555, "y2002": 0.93682451504456421, "y2003": 0.98655146182882891, "y2000": 0.92652175166361039, "y2001": 0.94278865361566122, "id": 38, "y2008": 1.0036262573224608, "y2009": 0.98102350657197357}, {"neighbors": [29, 34, 38, 22, 35], "y1995": 0.970820642185237, "y1997": 0.94534081352108112, "y1996": 0.95320232993219844, "y1999": 0.93967000034446724, "y1998": 0.94215592860799646, "y2006": 0.91035556215514757, "y2007": 0.90430364292511256, "y2004": 0.92879505989982103, "y2005": 0.9211054223180335, "y2002": 0.93412151936513388, "y2003": 0.93501274320242933, "y2000": 0.93092108910210503, "y2001": 0.92662519262599163, "id": 39, "y2008": 0.89994694483851023, "y2009": 0.9007386435858511}, {"neighbors": [41, 37, 42, 30, 45], "y1995": 0.95861858457245008, "y1997": 0.98254810501535106, "y1996": 0.95774543235102894, "y1999": 0.98684823919808018, "y1998": 0.98919471947721893, "y2006": 0.97163003599581876, "y2007": 0.97007020126757271, "y2004": 0.9493488753775261, "y2005": 0.97152609359561659, "y2002": 0.95601578436851964, "y2003": 0.94905384541254967, "y2000": 0.98882204635713133, "y2001": 0.97662233890759653, "id": 40, "y2008": 
0.97158948117089283, "y2009": 0.95884908006927827}, {"neighbors": [40, 45, 44, 37, 42], "y1995": 0.83980438854721107, "y1997": 0.85746999875029983, "y1996": 0.84726737166133714, "y1999": 0.85567509846023126, "y1998": 0.85467221160427542, "y2006": 0.8333891885768886, "y2007": 0.83511679264592342, "y2004": 0.81743586206088703, "y2005": 0.83550405700769481, "y2002": 0.84502402428191115, "y2003": 0.82645665158259707, "y2000": 0.84818516243622177, "y2001": 0.85265681182580899, "id": 41, "y2008": 0.82136617314598481, "y2009": 0.80921873783836296}, {"neighbors": [43, 40, 46, 37, 36], "y1995": 0.95118156405662746, "y1997": 0.94688098462868708, "y1996": 0.9466212002600608, "y1999": 0.95124410099780687, "y1998": 0.95085829660091703, "y2006": 0.96895367966714574, "y2007": 0.9700163384024274, "y2004": 0.97583768316642261, "y2005": 0.95571723704302525, "y2002": 0.96804411514198463, "y2003": 0.97136213864358201, "y2000": 0.95440787445922959, "y2001": 0.96364362764682376, "id": 42, "y2008": 0.97082732652905901, "y2009": 0.9878236640328002}, {"neighbors": [36, 42, 32, 27, 46], "y1995": 1.0891004415267045, "y1997": 1.0849289528525252, "y1996": 1.0824896838138709, "y1999": 1.0945424900391545, "y1998": 1.0865692335830259, "y2006": 1.1450297539219478, "y2007": 1.1447474729339102, "y2004": 1.1334273474293739, "y2005": 1.1468606844516303, "y2002": 1.1229257675733433, "y2003": 1.1302103089739621, "y2000": 1.1055818811158884, "y2001": 1.1214085953998059, "id": 43, "y2008": 1.1408403740471014, "y2009": 1.1614292649793569}, {"neighbors": [33, 41, 45, 35, 40], "y1995": 1.0633603345917013, "y1997": 1.0869149629649646, "y1996": 1.0736582323828732, "y1999": 1.1166986255755473, "y1998": 1.0976484597942771, "y2006": 1.0839806574563229, "y2007": 1.0983176831786272, "y2004": 1.0927882684985315, "y2005": 1.0700320368873319, "y2002": 1.0881584856466706, "y2003": 1.0804431312806149, "y2000": 1.1185670222649935, "y2001": 1.0976428286056732, "id": 44, "y2008": 1.0929823187788443, "y2009": 
1.0917612486217978}, {"neighbors": [41, 44, 40, 35, 33], "y1995": 0.79772064970019041, "y1997": 0.7858115114280021, "y1996": 0.78829195801876151, "y1999": 0.77035744221561353, "y1998": 0.77615921755360906, "y2006": 0.79949806580432425, "y2007": 0.80172181625581262, "y2004": 0.79603865293896003, "y2005": 0.78966436120841943, "y2002": 0.81437881076636964, "y2003": 0.80788827809912023, "y2000": 0.77751193519846906, "y2001": 0.79902973574567659, "id": 45, "y2008": 0.82168154748053679, "y2009": 0.85587910681858015}, {"neighbors": [42, 43, 40, 36, 37], "y1995": 1.0052446952315301, "y1997": 1.0047589936197736, "y1996": 1.0000769567582628, "y1999": 1.0063956091903872, "y1998": 1.0061394183885444, "y2006": 0.97292595590233411, "y2007": 0.96519561197191939, "y2004": 0.99030032232474696, "y2005": 0.97682565346267858, "y2002": 1.0081498135355325, "y2003": 1.0057431552702318, "y2000": 1.0016297948675874, "y2001": 0.99860738542320637, "id": 46, "y2008": 0.9617340332161447, "y2009": 0.95890283625473927}, {"neighbors": [20, 6, 24, 25, 30], "y1995": 0.95808418788867844, "y1997": 0.9654440995572009, "y1996": 0.93825679674127938, "y1999": 0.96987289157318213, "y1998": 0.95561201303757848, "y2006": 1.1704973973021624, "y2007": 1.1702515395802287, "y2004": 1.0533361880299275, "y2005": 1.0983262971945267, "y2002": 1.0078119390756035, "y2003": 1.0348423554112989, "y2000": 0.96608031008233231, "y2001": 0.99727184521431422, "id": 47, "y2008": 1.1873055260044207, "y2009": 1.1424264534188653}]
diff --git a/release/python/0.8.0/crankshaft/test/helper.py b/release/python/0.8.0/crankshaft/test/helper.py
new file mode 100644
index 0000000..7d28b94
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/test/helper.py
@@ -0,0 +1,13 @@
+import unittest
+
+from mock_plpy import MockPlPy
+plpy = MockPlPy()  # one mock instance, registered under the name plpy below
+
+import sys
+sys.modules['plpy'] = plpy  # later `import plpy` statements get this mock
+
+import os
+
+def fixture_file(name):
+    here = os.path.dirname(os.path.realpath(__file__))  # dir of this module
+    return os.path.join(here, 'fixtures', name)
diff --git a/release/python/0.8.0/crankshaft/test/mock_plpy.py b/release/python/0.8.0/crankshaft/test/mock_plpy.py
new file mode 100644
index 0000000..9c3340c
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/test/mock_plpy.py
@@ -0,0 +1,57 @@
+import re
+
+
+class MockCursor:
+    """Cursor handed out by MockPlPy.cursor; serves `data` in batches."""
+    def __init__(self, data):
+        self.data, self.cursor_pos = data, 0
+
+    def fetch(self, batch_size):
+        first, last = self.cursor_pos, self.cursor_pos + batch_size
+        rows, self.cursor_pos = self.data[first:last], last
+        return rows
+
+
+class MockPlPy:
+    """Minimal plpy stand-in: records messages, serves canned query results."""
+    def __init__(self):
+        self._reset()
+
+    def _reset(self):
+        """Empty every message log and forget all canned query results."""
+        self.infos, self.notices, self.debugs, self.logs = [], [], [], []
+        self.warnings, self.errors, self.fatals = [], [], []
+        # Query bookkeeping; `results` holds [compiled pattern, result] pairs.
+        self.executes, self.prepares, self.results = [], [], []
+
+    def _define_result(self, query, result):
+        """Register `result` as the reply to queries matching regex `query`."""
+        pattern = re.compile(query, re.IGNORECASE | re.MULTILINE)
+        self.results.append([pattern, result])
+
+    def notice(self, msg):
+        self.notices.append(msg)
+
+    def debug(self, msg):
+        # Debug messages get their own log (they used to land in `notices`).
+        self.debugs.append(msg)
+
+    def info(self, msg):
+        self.infos.append(msg)
+
+    def error(self, msg):
+        # Recorded in `errors` (used to land in `notices`); nothing is raised.
+        self.errors.append(msg)
+
+    def cursor(self, query):
+        """Wrap the canned result for `query` in a MockCursor."""
+        return MockCursor(self.execute(query))
+
+    # TODO: additional arguments
+    def execute(self, query):
+        """Return the first registered result matching `query`, else []."""
+        for pattern, canned in self.results:
+            # re.match: the pattern must match at the start of `query`.
+            if pattern.match(query):
+                return canned
+        return []
diff --git a/release/python/0.8.0/crankshaft/test/test_clustering_getis.py b/release/python/0.8.0/crankshaft/test/test_clustering_getis.py
new file mode 100644
index 0000000..61add11
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/test/test_clustering_getis.py
@@ -0,0 +1,78 @@
+import unittest
+import numpy as np
+
+from helper import fixture_file
+
+from crankshaft.clustering import Getis
+import crankshaft.pysal_utils as pu
+from crankshaft import random_seeds
+import json
+from crankshaft.analysis_data_provider import AnalysisDataProvider
+
+# Fixture files produced as follows
+#
+# import pysal as ps
+# import numpy as np
+# import random
+#
+# # setup variables
+# f = ps.open(ps.examples.get_path("stl_hom.dbf"))
+# y = np.array(f.by_col['HR8893'])
+# w_queen = ps.queen_from_shapefile(ps.examples.get_path("stl_hom.shp"))
+#
+# out_queen = [{"id": index + 1,
+# "neighbors": [x+1 for x in w_queen.neighbors[index]],
+# "value": val} for index, val in enumerate(y)]
+#
+# with open('neighbors_queen_getis.json', 'w') as f:
+# f.write(str(out_queen))
+#
+# random.seed(1234)
+# np.random.seed(1234)
+# lgstar_queen = ps.esda.getisord.G_Local(y, w_queen, star=True,
+# permutations=999)
+#
+# with open('getis_queen.json', 'w') as f:
+# f.write(str(zip(lgstar_queen.z_sim,
+# lgstar_queen.p_sim, lgstar_queen.p_z_sim)))
+
+
+class FakeDataProvider(AnalysisDataProvider):  # stub returning canned data
+ def __init__(self, mock_data):
+ self.mock_result = mock_data  # canned rows that get_getis hands back
+
+ def get_getis(self, w_type, param):
+ return self.mock_result  # w_type and param are ignored
+
+
+class GetisTest(unittest.TestCase):
+    """Testing class for Getis-Ord's G* function
+    This test replicates the work done in PySAL documentation:
+    https://pysal.readthedocs.io/en/v1.11.0/users/tutorials/autocorrelation.html#local-g-and-g
+    """
+
+    def setUp(self):
+        # load raw data for analysis; `with` closes the fixture file promptly
+        with open(fixture_file('neighbors_getis.json')) as raw:
+            self.neighbors_data = json.load(raw)
+
+        # load pre-computed/known values
+        with open(fixture_file('getis.json')) as known:
+            self.getis_data = json.load(known)
+
+    def test_getis_ord(self):
+        """Test Getis-Ord's G*"""
+        data = [{'id': d['id'],
+                 'attr1': d['value'],
+                 'neighbors': d['neighbors']} for d in self.neighbors_data]
+
+        random_seeds.set_random_seeds(1234)
+        getis = Getis(FakeDataProvider(data))
+
+        result = getis.getis_ord('subquery', 'value', 'queen', None, 999,
+                                 'the_geom', 'cartodb_id')
+        result = [(row[0], row[1]) for row in result]
+        expected = np.array(self.getis_data)[:, 0:2]
+        # only the z-scores are compared; p-values are unpacked but unchecked
+        for ([res_z, res_p], [exp_z, exp_p]) in zip(result, expected):
+            self.assertAlmostEqual(res_z, exp_z, delta=1e-2)
diff --git a/release/python/0.8.0/crankshaft/test/test_clustering_kmeans.py b/release/python/0.8.0/crankshaft/test/test_clustering_kmeans.py
new file mode 100644
index 0000000..c118d34
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/test/test_clustering_kmeans.py
@@ -0,0 +1,87 @@
+import unittest
+import numpy as np
+
+
+from helper import fixture_file
+from crankshaft.clustering import Kmeans
+from crankshaft.analysis_data_provider import AnalysisDataProvider
+import crankshaft.clustering as cc
+from crankshaft import random_seeds
+
+import json
+from collections import OrderedDict
+
+
+class FakeDataProvider(AnalysisDataProvider):  # stub returning canned data
+ def __init__(self, mocked_result):
+ self.mocked_result = mocked_result  # served by both getters below
+
+ def get_spatial_kmeans(self, query):
+ return self.mocked_result  # query is ignored
+
+ def get_nonspatial_kmeans(self, query):
+ return self.mocked_result  # query is ignored
+
+
+class KMeansTest(unittest.TestCase):
+    """Testing class for k-means spatial"""
+
+    def setUp(self):
+        # `with` closes the fixture file instead of leaking the handle
+        with open(fixture_file('kmeans.json')) as fixture:
+            self.cluster_data = json.load(fixture)
+        self.params = {"subquery": "select * from table",
+                       "no_clusters": "10"}
+
+    def test_kmeans(self):
+        """Spatial k-means with k=2: 20 rows get label 0 and 20 get label 1"""
+        data = [{'xs': d['xs'],
+                 'ys': d['ys'],
+                 'ids': d['ids']} for d in self.cluster_data]
+
+        random_seeds.set_random_seeds(1234)
+        kmeans = Kmeans(FakeDataProvider(data))
+        clusters = kmeans.spatial('subquery', 2)
+        labels = [a[1] for a in clusters]
+        c1 = [a for a in clusters if a[1] == 0]
+        c2 = [a for a in clusters if a[1] == 1]
+
+        self.assertEqual(len(np.unique(labels)), 2)
+        self.assertEqual(len(c1), 20)
+        self.assertEqual(len(c2), 20)
+
+
+class KMeansNonspatialTest(unittest.TestCase):
+    """Testing class for k-means non-spatial"""
+
+    def setUp(self):
+        self.params = {"subquery": "SELECT * FROM TABLE",
+                       "n_clusters": 5}
+
+    def test_kmeans_nonspatial(self):
+        """
+        The first three result rows must carry the label of the first and
+        the remaining rows that of the fourth; an empty dataset must raise.
+        """
+        # data from:
+        # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn-cluster-kmeans
+        columns = OrderedDict()
+        columns["arr_col1"] = [1, 1, 1, 4, 4, 4]
+        columns["arr_col2"] = [2, 4, 0, 2, 4, 0]
+        columns["rowid"] = [1, 2, 3, 4, 5, 6]
+
+        random_seeds.set_random_seeds(1234)
+        model = Kmeans(FakeDataProvider([columns]))
+        clusters = model.nonspatial('subquery', ['col1', 'col2'], 2)
+
+        first_label = clusters[0][0]
+        second_label = clusters[3][0]
+
+        for position, row in enumerate(clusters):
+            wanted = first_label if position < 3 else second_label
+            self.assertEqual(row[0], wanted)
+
+        # raises exception for no data
+        with self.assertRaises(Exception):
+            model = Kmeans(FakeDataProvider([]))
+            model.nonspatial('subquery', ['col1', 'col2'], 2)
diff --git a/release/python/0.8.0/crankshaft/test/test_clustering_moran.py b/release/python/0.8.0/crankshaft/test/test_clustering_moran.py
new file mode 100644
index 0000000..a91c046
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/test/test_clustering_moran.py
@@ -0,0 +1,112 @@
+import unittest
+import numpy as np
+
+from helper import fixture_file
+from crankshaft.clustering import Moran
+from crankshaft.analysis_data_provider import AnalysisDataProvider
+import crankshaft.pysal_utils as pu
+from crankshaft import random_seeds
+import json
+from collections import OrderedDict
+
+
+class FakeDataProvider(AnalysisDataProvider):  # stub returning canned data
+    def __init__(self, mock_data):
+        self.mock_result = mock_data  # handed back unchanged by get_moran
+
+    def get_moran(self, w_type, params):  # both arguments are ignored
+        return self.mock_result
+
+
+class MoranTest(unittest.TestCase):
+    """Testing class for Moran's I functions"""
+
+    def setUp(self):
+        self.params = {"id_col": "cartodb_id",
+                       "attr1": "andy",
+                       "attr2": "jay_z",
+                       "subquery": "SELECT * FROM a_list",
+                       "geom_col": "the_geom",
+                       "num_ngbrs": 321}
+        self.params_markov = {"id_col": "cartodb_id",
+                              "time_cols": ["_2013_dec", "_2014_jan",
+                                            "_2014_feb"],
+                              "subquery": "SELECT * FROM a_list",
+                              "geom_col": "the_geom",
+                              "num_ngbrs": 321}
+        with open(fixture_file('neighbors.json')) as neighbors_file:
+            self.neighbors_data = json.load(neighbors_file)
+        with open(fixture_file('moran.json')) as moran_file:
+            self.moran_data = json.load(moran_file)
+
+    def test_map_quads(self):
+        """Test map_quads"""
+        from crankshaft.clustering import map_quads
+        self.assertEqual(map_quads(1), 'HH')
+        self.assertEqual(map_quads(2), 'LH')
+        self.assertEqual(map_quads(3), 'LL')
+        self.assertEqual(map_quads(4), 'HL')
+        self.assertEqual(map_quads(33), None)
+        self.assertEqual(map_quads('andy'), None)
+
+    def test_quad_position(self):
+        """Test quad_position"""
+        from crankshaft.clustering import quad_position
+
+        quads = np.array([1, 2, 3, 4], int)  # np.int was an alias of int
+
+        ans = np.array(['HH', 'LH', 'LL', 'HL'])
+        test_ans = quad_position(quads)
+
+        self.assertTrue((test_ans == ans).all())
+
+    def test_local_stat(self):
+        """Test Moran's I local"""
+        data = [OrderedDict([('id', d['id']),
+                             ('attr1', d['value']),
+                             ('neighbors', d['neighbors'])])
+                for d in self.neighbors_data]
+
+        moran = Moran(FakeDataProvider(data))
+        random_seeds.set_random_seeds(1234)
+        result = moran.local_stat('subquery', 'value',
+                                  'knn', 5, 99, 'the_geom', 'cartodb_id')
+        result = [(row[0], row[6]) for row in result]
+        zipped_values = zip(result, self.moran_data)
+
+        for ([res_quad, res_val], [exp_val, exp_quad]) in zipped_values:
+            self.assertAlmostEqual(res_val, exp_val)
+            self.assertEqual(res_quad, exp_quad)
+
+    def test_moran_local_rate(self):
+        """Test Moran's I rate"""
+        data = [{'id': d['id'],
+                 'attr1': d['value'],
+                 'attr2': 1,
+                 'neighbors': d['neighbors']} for d in self.neighbors_data]
+
+        random_seeds.set_random_seeds(1234)
+        moran = Moran(FakeDataProvider(data))
+        result = moran.local_rate_stat('subquery', 'numerator', 'denominator',
+                                       'knn', 5, 99, 'the_geom', 'cartodb_id')
+        result = [(row[0], row[6]) for row in result]
+
+        zipped_values = zip(result, self.moran_data)
+
+        for ([res_quad, res_val], [exp_val, exp_quad]) in zipped_values:
+            self.assertAlmostEqual(res_val, exp_val)
+
+    def test_moran(self):
+        """Test Moran's I global"""
+        data = [{'id': d['id'],
+                 'attr1': d['value'],
+                 'neighbors': d['neighbors']} for d in self.neighbors_data]
+        random_seeds.set_random_seeds(1235)
+        moran = Moran(FakeDataProvider(data))
+        result = moran.global_stat('table', 'value',
+                                   'knn', 5, 99, 'the_geom',
+                                   'cartodb_id')
+
+        result_moran = result[0][0]
+        expected_moran = np.array([row[0] for row in self.moran_data]).mean()
+        self.assertAlmostEqual(expected_moran, result_moran, delta=10e-2)
diff --git a/release/python/0.8.0/crankshaft/test/test_pysal_utils.py b/release/python/0.8.0/crankshaft/test/test_pysal_utils.py
new file mode 100644
index 0000000..be45164
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/test/test_pysal_utils.py
@@ -0,0 +1,83 @@
+import unittest
+
+import crankshaft.pysal_utils as pu
+from crankshaft import random_seeds
+from collections import OrderedDict
+
+
+class PysalUtilsTest(unittest.TestCase):
+    """Checks the query-fragment helpers in crankshaft.pysal_utils"""
+
+    def setUp(self):
+        # Every ordered parameter set ends with the same two entries.
+        shared_tail = [("geom_col", "the_geom"), ("num_ngbrs", 321)]
+
+        self.params1 = OrderedDict([("id_col", "cartodb_id"),
+                                    ("attr1", "andy"),
+                                    ("attr2", "jay_z"),
+                                    ("subquery", "SELECT * FROM a_list")]
+                                   + shared_tail)
+
+        self.params2 = OrderedDict([("id_col", "cartodb_id"),
+                                    ("numerator", "price"),
+                                    ("denominator", "sq_meters"),
+                                    ("subquery", "SELECT * FROM pecan")]
+                                   + shared_tail)
+
+        self.params3 = OrderedDict([("id_col", "cartodb_id"),
+                                    ("numerator", "sq_meters"),
+                                    ("denominator", "price"),
+                                    ("subquery", "SELECT * FROM pecan")]
+                                   + shared_tail)
+
+        self.params_array = {"id_col": "cartodb_id",
+                             "time_cols": ["_2013_dec", "_2014_jan", "_2014_feb"],
+                             "subquery": "SELECT * FROM a_list",
+                             "geom_col": "the_geom",
+                             "num_ngbrs": 321}
+
+    def test_query_attr_select(self):
+        """query_attr_select yields one aliased numeric column per input"""
+        want_attrs = ("i.\"andy\"::numeric As attr1, "
+                      "i.\"jay_z\"::numeric As attr2, ")
+
+        want_ratio = ("i.\"price\"::numeric As attr1, "
+                      "i.\"sq_meters\"::numeric As attr2, ")
+
+        want_inverse = ("i.\"sq_meters\"::numeric As attr1, "
+                        "i.\"price\"::numeric As attr2, ")
+
+        want_series = ("i.\"_2013_dec\"::numeric As attr1, "
+                       "i.\"_2014_jan\"::numeric As attr2, "
+                       "i.\"_2014_feb\"::numeric As attr3, ")
+
+        for params, wanted in ((self.params1, want_attrs),
+                               (self.params2, want_ratio),
+                               (self.params3, want_inverse),
+                               (self.params_array, want_series)):
+            self.assertEqual(pu.query_attr_select(params), wanted)
+
+    def test_query_attr_where(self):
+        """query_attr_where ANDs an IS NOT NULL check for every column"""
+        want_pair = ("idx_replace.\"andy\" IS NOT NULL AND "
+                     "idx_replace.\"jay_z\" IS NOT NULL")
+
+        want_series = ("idx_replace.\"_2013_dec\" IS NOT NULL AND "
+                       "idx_replace.\"_2014_jan\" IS NOT NULL AND "
+                       "idx_replace.\"_2014_feb\" IS NOT NULL")
+
+        for params, wanted in ((self.params1, want_pair),
+                               (self.params_array, want_series)):
+            self.assertEqual(pu.query_attr_where(params), wanted)
+
+    def test_get_attributes(self):
+        """Placeholder for get_attributes"""
+
+        # TODO: no real checks yet; the assertion below always passes
+
+        self.assertEqual(True, True)
+
+    def test_get_weight(self):
+        """Placeholder for get_weight"""
+
+        self.assertEqual(True, True)
diff --git a/release/python/0.8.0/crankshaft/test/test_regression_gwr.py b/release/python/0.8.0/crankshaft/test/test_regression_gwr.py
new file mode 100644
index 0000000..57cd952
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/test/test_regression_gwr.py
@@ -0,0 +1,130 @@
+import unittest
+import json
+import numpy as np
+
+
+from crankshaft import random_seeds
+from helper import fixture_file
+from crankshaft.regression import GWR
+from crankshaft.analysis_data_provider import AnalysisDataProvider
+
+
+class FakeDataProvider(AnalysisDataProvider):  # stub returning canned data
+    def __init__(self, mocked_result):
+        self.mocked_result = mocked_result  # returned by both getters below
+
+    def get_gwr(self, params):  # params is ignored
+        return self.mocked_result
+
+    def get_gwr_predict(self, params):  # params is ignored
+        return self.mocked_result
+
+
+class GWRTest(unittest.TestCase):
+    """Testing class for geographically weighted regression (gwr)"""
+
+    def setUp(self):
+        """
+        fixture packed from canonical GWR georgia dataset using the
+        following query:
+            SELECT array_agg(x) As x,
+                   array_agg(y) As y,
+                   array_agg(pctbach) As dep_var,
+                   array_agg(pctrural) As attr1,
+                   array_agg(pctpov) As attr2,
+                   array_agg(pctblack) As attr3,
+                   array_agg(areakey) As rowid
+              FROM g_utm
+             WHERE pctbach is not NULL AND
+                   pctrural IS NOT NULL AND
+                   pctpov IS NOT NULL AND
+                   pctblack IS NOT NULL
+        """
+        import copy
+        # data packed from https://github.com/TaylorOshan/pysal/blob/1d6af33bda46b1d623f70912c56155064463383f/pysal/examples/georgia/GData_utm.csv
+        with open(fixture_file('gwr_packed_data.json')) as data_file:
+            self.data = json.load(data_file)
+
+        # data packed from https://github.com/TaylorOshan/pysal/blob/a44c5541e2e0d10a99ff05edc1b7f81b70f5a82f/pysal/examples/georgia/georgia_BS_NN_listwise.csv
+        with open(fixture_file('gwr_packed_knowns.json')) as knowns_file:
+            self.knowns = json.load(knowns_file)
+
+        # data for GWR prediction
+        self.data_predict = copy.deepcopy(self.data)
+        self.ids_of_unknowns = [13083, 13009, 13281, 13115, 13247, 13169]
+        self.idx_ids_of_unknowns = [self.data_predict[0]['rowid'].index(idx)
+                                    for idx in self.ids_of_unknowns]
+
+        for idx in self.idx_ids_of_unknowns:
+            self.data_predict[0]['dep_var'][idx] = None
+
+        self.predicted_knowns = {13009: 10.879,
+                                 13083: 4.5259,
+                                 13115: 9.4022,
+                                 13169: 6.0793,
+                                 13247: 8.1608,
+                                 13281: 13.886}
+
+        # params, with ind_vars in same ordering as query above
+        self.params = {'subquery': 'select * from table',
+                       'dep_var': 'pctbach',
+                       'ind_vars': ['pctrural', 'pctpov', 'pctblack'],
+                       'bw': 90.000,
+                       'fixed': False,
+                       'geom_col': 'the_geom',
+                       'id_col': 'areakey'}
+
+    def test_gwr(self):
+        """GWR coefficients and t-values should match the packed known
+        values to four decimal places."""
+        gwr = GWR(FakeDataProvider(self.data))
+        gwr_resp = gwr.gwr(self.params['subquery'],
+                           self.params['dep_var'],
+                           self.params['ind_vars'],
+                           bw=self.params['bw'],
+                           fixed=self.params['fixed'])
+
+        # unpack response
+        coeffs, stand_errs, t_vals, t_vals_filtered, predicteds, \
+            residuals, r_squareds, bws, rowids = zip(*gwr_resp)
+
+        # prepare for comparison: known values are indexed in step with
+        # self.knowns['area_key'], so each one is matched to the response
+        # row that carries the same id
+        coeff_known_pctpov = self.knowns['est_pctpov']
+        tval_known_pctrural = self.knowns['t_pctrural']
+        ids = self.knowns['area_key']
+
+        # test pctpov coefficient estimates
+        for idx, val in enumerate(coeff_known_pctpov):
+            resp_idx = rowids.index(ids[idx])
+            self.assertAlmostEqual(val,
+                                   json.loads(coeffs[resp_idx])['pctpov'],
+                                   places=4)
+        # test pctrural tvals
+        for idx, val in enumerate(tval_known_pctrural):
+            resp_idx = rowids.index(ids[idx])
+            self.assertAlmostEqual(val,
+                                   json.loads(t_vals[resp_idx])['pctrural'],
+                                   places=4)
+
+    def test_gwr_predict(self):
+        """Testing for GWR_Predict"""
+        gwr = GWR(FakeDataProvider(self.data_predict))
+        gwr_resp = gwr.gwr_predict(self.params['subquery'],
+                                   self.params['dep_var'],
+                                   self.params['ind_vars'],
+                                   bw=self.params['bw'],
+                                   fixed=self.params['fixed'])
+
+        # unpack response
+        coeffs, stand_errs, t_vals, \
+            r_squareds, predicteds, rowid = zip(*gwr_resp)
+        threshold = 0.01
+
+        for i in range(len(self.idx_ids_of_unknowns)):
+            # relative error of each prediction must stay below threshold
+            known_val = self.predicted_knowns[rowid[i]]
+            predicted_val = predicteds[i]
+            relative_error = abs(known_val - predicted_val) / known_val
+            self.assertLess(relative_error, threshold)
diff --git a/release/python/0.8.0/crankshaft/test/test_segmentation.py b/release/python/0.8.0/crankshaft/test/test_segmentation.py
new file mode 100644
index 0000000..d02e8b1
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/test/test_segmentation.py
@@ -0,0 +1,64 @@
+import unittest
+import numpy as np
+from helper import plpy, fixture_file
+import crankshaft.segmentation as segmentation
+import json
+
+class SegmentationTest(unittest.TestCase):
+    """Testing class for segmentation functions"""
+
+    def setUp(self):
+        plpy._reset()
+
+    def generate_random_data(self, n_samples, random_state, row_type=False):
+        x1 = random_state.uniform(size=n_samples)
+        x2 = random_state.uniform(size=n_samples)
+        x3 = random_state.randint(0, 4, size=n_samples)
+
+        y = x1 + x2 * x2 + x3
+        cartodb_id = range(len(x1))
+
+        if row_type:
+            return [{'features': vals} for vals in zip(x1, x2, x3)], y
+        return [dict(zip(['x1', 'x2', 'x3', 'target', 'cartodb_id'],
+                         [x1, x2, x3, y, cartodb_id]))]
+
+    def test_replace_nan_with_mean(self):
+        # NOTE(review): no assertion yet, so this test can never fail - TODO
+        test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan])
+
+    def test_create_and_predict_segment(self):
+        n_samples = 1000
+
+        random_state_train = np.random.RandomState(13)
+        random_state_test = np.random.RandomState(134)
+        training_data = self.generate_random_data(n_samples, random_state_train)
+        test_data, test_y = self.generate_random_data(n_samples, random_state_test, row_type=True)
+
+        ids = [{'cartodb_ids': range(len(test_data))}]
+        rows = [{'x1': 0, 'x2': 0, 'x3': 0, 'y': 0, 'cartodb_id': 0}]
+
+        plpy._define_result(r'select \* from \(select \* from training\) a limit 1', rows)
+        plpy._define_result(r'.*from \(select \* from training\) as a', training_data)
+        plpy._define_result(r'select array_agg\(cartodb\_id order by cartodb\_id\) as cartodb_ids from \(.*\) a', ids)
+        plpy._define_result(r'.*select \* from test.*', test_data)
+
+        model_parameters = {'n_estimators': 1200,
+                            'max_depth': 3,
+                            'subsample': 0.5,
+                            'learning_rate': 0.01,
+                            'min_samples_leaf': 1}
+
+        result = segmentation.create_and_predict_segment(
+            'select * from training',
+            'target',
+            'select * from test',
+            model_parameters)
+
+        prediction = [r[1] for r in result]
+
+        rmse = np.sqrt(np.mean(np.square(np.array(prediction) - np.array(test_y))))
+
+        self.assertEqual(len(result), len(test_data))
+        self.assertLess(result[0][2], 0.01)
+        self.assertLess(rmse, 0.5 * np.mean(test_y))
diff --git a/release/python/0.8.0/crankshaft/test/test_space_time_dynamics.py b/release/python/0.8.0/crankshaft/test/test_space_time_dynamics.py
new file mode 100644
index 0000000..d14563e
--- /dev/null
+++ b/release/python/0.8.0/crankshaft/test/test_space_time_dynamics.py
@@ -0,0 +1,349 @@
+import unittest
+import numpy as np
+
+import unittest
+
+
+from helper import fixture_file
+
+from crankshaft.space_time_dynamics import Markov
+import crankshaft.space_time_dynamics as std
+from crankshaft import random_seeds
+from crankshaft.analysis_data_provider import AnalysisDataProvider
+import json
+
+
+class FakeDataProvider(AnalysisDataProvider):  # stub returning canned data
+    def __init__(self, data):
+        self.mock_result = data  # handed back unchanged by get_markov
+
+    def get_markov(self, w_type, params):  # both arguments are ignored
+        return self.mock_result
+
+
+class SpaceTimeTests(unittest.TestCase):
+    """Testing class for Markov Functions."""
+
+    def setUp(self):
+        self.params = {"id_col": "cartodb_id",
+                       "time_cols": ['dec_2013', 'jan_2014', 'feb_2014'],
+                       "subquery": "SELECT * FROM a_list",
+                       "geom_col": "the_geom",
+                       "num_ngbrs": 321}
+        with open(fixture_file('neighbors_markov.json')) as neighbors_file:
+            self.neighbors_data = json.load(neighbors_file)
+        with open(fixture_file('markov.json')) as markov_file:
+            self.markov_data = json.load(markov_file)
+
+        self.time_data = np.array([i * np.ones(10, dtype=float)
+                                   for i in range(10)]).T
+
+        self.transition_matrix = np.array([
+            [[0.96341463, 0.0304878, 0.00609756, 0., 0.],
+             [0.06040268, 0.83221477, 0.10738255, 0., 0.],
+             [0., 0.14, 0.74, 0.12, 0.],
+             [0., 0.03571429, 0.32142857, 0.57142857, 0.07142857],
+             [0., 0., 0., 0.16666667, 0.83333333]],
+            [[0.79831933, 0.16806723, 0.03361345, 0., 0.],
+             [0.0754717, 0.88207547, 0.04245283, 0., 0.],
+             [0.00537634, 0.06989247, 0.8655914, 0.05913978, 0.],
+             [0., 0., 0.06372549, 0.90196078, 0.03431373],
+             [0., 0., 0., 0.19444444, 0.80555556]],
+            [[0.84693878, 0.15306122, 0., 0., 0.],
+             [0.08133971, 0.78947368, 0.1291866, 0., 0.],
+             [0.00518135, 0.0984456, 0.79274611, 0.0984456, 0.00518135],
+             [0., 0., 0.09411765, 0.87058824, 0.03529412],
+             [0., 0., 0., 0.10204082, 0.89795918]],
+            [[0.8852459, 0.09836066, 0., 0.01639344, 0.],
+             [0.03875969, 0.81395349, 0.13953488, 0., 0.00775194],
+             [0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505],
+             [0., 0.02339181, 0.12865497, 0.75438596, 0.09356725],
+             [0., 0., 0., 0.09661836, 0.90338164]],
+            [[0.33333333, 0.66666667, 0., 0., 0.],
+             [0.0483871, 0.77419355, 0.16129032, 0.01612903, 0.],
+             [0.01149425, 0.16091954, 0.74712644, 0.08045977, 0.],
+             [0., 0.01036269, 0.06217617, 0.89637306, 0.03108808],
+             [0., 0., 0., 0.02352941, 0.97647059]]]
+        )
+
+    def test_spatial_markov(self):
+        """Test Spatial Markov."""
+        data = [{'id': d['id'],
+                 'attr1': d['y1995'],
+                 'attr2': d['y1996'],
+                 'attr3': d['y1997'],
+                 'attr4': d['y1998'],
+                 'attr5': d['y1999'],
+                 'attr6': d['y2000'],
+                 'attr7': d['y2001'],
+                 'attr8': d['y2002'],
+                 'attr9': d['y2003'],
+                 'attr10': d['y2004'],
+                 'attr11': d['y2005'],
+                 'attr12': d['y2006'],
+                 'attr13': d['y2007'],
+                 'attr14': d['y2008'],
+                 'attr15': d['y2009'],
+                 'neighbors': d['neighbors']} for d in self.neighbors_data]
+        markov = Markov(FakeDataProvider(data))
+        random_seeds.set_random_seeds(1234)
+
+        result = markov.spatial_trend('subquery',
+                                      ['y1995', 'y1996', 'y1997', 'y1998',
+                                       'y1999', 'y2000', 'y2001', 'y2002',
+                                       'y2003', 'y2004', 'y2005', 'y2006',
+                                       'y2007', 'y2008', 'y2009'],
+                                      5, 'knn', 5, 0, 'the_geom',
+                                      'cartodb_id')
+
+        self.assertIsNotNone(result)
+        result = [(row[0], row[1], row[2], row[3], row[4]) for row in result]
+        # Only the trend value of each row is compared with the fixture.
+        expected = self.markov_data
+        for ([res_trend, res_up, res_down, res_vol, res_id],
+             [exp_trend, exp_up, exp_down, exp_vol, exp_id]
+             ) in zip(result, expected):
+            self.assertAlmostEqual(res_trend, exp_trend)
+
+    def test_get_time_data(self):
+        """Test get_time_data"""
+        data = [{'attr1': d['y1995'],
+                 'attr2': d['y1996'],
+                 'attr3': d['y1997'],
+                 'attr4': d['y1998'],
+                 'attr5': d['y1999'],
+                 'attr6': d['y2000'],
+                 'attr7': d['y2001'],
+                 'attr8': d['y2002'],
+                 'attr9': d['y2003'],
+                 'attr10': d['y2004'],
+                 'attr11': d['y2005'],
+                 'attr12': d['y2006'],
+                 'attr13': d['y2007'],
+                 'attr14': d['y2008'],
+                 'attr15': d['y2009']} for d in self.neighbors_data]
+
+        result = std.get_time_data(data, ['y1995', 'y1996', 'y1997', 'y1998',
+                                          'y1999', 'y2000', 'y2001', 'y2002',
+                                          'y2003', 'y2004', 'y2005', 'y2006',
+                                          'y2007', 'y2008', 'y2009'])
+
+        # expected was prepared from PySAL example:
+        # f = ps.open(ps.examples.get_path("usjoin.csv"))
+        # pci = np.array([f.by_col[str(y)]
+        #                for y in range(1995, 2010)]).transpose()
+        # rpci = pci / (pci.mean(axis = 0))
+
+        expected = np.array(
+            [[0.87654416, 0.863147, 0.85637567, 0.84811668, 0.8446154,
+              0.83271652, 0.83786314, 0.85012593, 0.85509656, 0.86416612,
+              0.87119375, 0.86302631, 0.86148267, 0.86252252, 0.86746356],
+             [0.9188951, 0.91757931, 0.92333258, 0.92517289, 0.92552388,
+              0.90746978, 0.89830489, 0.89431991, 0.88924794, 0.89815176,
+              0.91832091, 0.91706054, 0.90139505, 0.87897455, 0.86216858],
+             [0.82591007, 0.82548596, 0.81989793, 0.81503235, 0.81731522,
+              0.78964559, 0.80584442, 0.8084998, 0.82258551, 0.82668196,
+              0.82373724, 0.81814804, 0.83675961, 0.83574199, 0.84647177],
+             [1.09088176, 1.08537689, 1.08456418, 1.08415404, 1.09898841,
+              1.14506948, 1.12151133, 1.11160697, 1.10888621, 1.11399806,
+              1.12168029, 1.13164797, 1.12958508, 1.11371818, 1.09936775],
+             [1.10731446, 1.11373944, 1.13283638, 1.14472559, 1.15910025,
+              1.16898201, 1.17212488, 1.14752303, 1.11843284, 1.11024964,
+              1.11943471, 1.11736468, 1.10863242, 1.09642516, 1.07762337],
+             [1.42269757, 1.42118434, 1.44273502, 1.43577571, 1.44400684,
+              1.44184737, 1.44782832, 1.41978227, 1.39092208, 1.4059372,
+              1.40788646, 1.44052766, 1.45241216, 1.43306098, 1.4174431],
+             [1.13073885, 1.13110513, 1.11074708, 1.13364636, 1.13088149,
+              1.10888138, 1.11856629, 1.13062931, 1.11944984, 1.12446239,
+              1.11671008, 1.10880034, 1.08401709, 1.06959206, 1.07875225],
+             [1.04706124, 1.04516831, 1.04253372, 1.03239987, 1.02072545,
+              0.99854316, 0.9880258, 0.99669587, 0.99327676, 1.01400905,
+              1.03176742, 1.040511, 1.01749645, 0.9936394, 0.98279746],
+             [0.98996986, 1.00143564, 0.99491, 1.00188408, 1.00455845,
+              0.99127006, 0.97925917, 0.9683482, 0.95335147, 0.93694787,
+              0.94308213, 0.92232874, 0.91284091, 0.89689833, 0.88928858],
+             [0.87418391, 0.86416601, 0.84425695, 0.8404494, 0.83903044,
+              0.8578708, 0.86036185, 0.86107306, 0.8500772, 0.86981998,
+              0.86837929, 0.87204141, 0.86633032, 0.84946077, 0.83287146],
+             [1.14196118, 1.14660262, 1.14892712, 1.14909594, 1.14436624,
+              1.14450183, 1.12349752, 1.12596664, 1.12213996, 1.1119989,
+              1.10257792, 1.10491258, 1.11059842, 1.10509795, 1.10020097],
+             [0.97282463, 0.96700147, 0.96252588, 0.9653878, 0.96057687,
+              0.95831051, 0.94480909, 0.94804195, 0.95430286, 0.94103989,
+              0.92122519, 0.91010201, 0.89280392, 0.89298243, 0.89165385],
+             [0.94325468, 0.96436902, 0.96455242, 0.95243009, 0.94117647,
+              0.9480927, 0.93539182, 0.95388718, 0.94597005, 0.96918424,
+              0.94781281, 0.93466815, 0.94281559, 0.96520315, 0.96715441],
+             [0.97478408, 0.98169225, 0.98712809, 0.98474769, 0.98559897,
+              0.98687073, 0.99237486, 0.98209969, 0.9877653, 0.97399471,
+              0.96910087, 0.98416665, 0.98423613, 0.99823861, 0.99545704],
+             [0.85570269, 0.85575915, 0.85986132, 0.85693406, 0.8538012,
+              0.86191535, 0.84981451, 0.85472102, 0.84564835, 0.83998883,
+              0.83478547, 0.82803648, 0.8198736, 0.82265395, 0.8399404],
+             [0.87022047, 0.85996258, 0.85961813, 0.85689572, 0.83947136,
+              0.82785597, 0.86008789, 0.86776298, 0.86720209, 0.8676334,
+              0.89179317, 0.94202108, 0.9422231, 0.93902708, 0.94479184],
+             [0.90134907, 0.90407738, 0.90403991, 0.90201769, 0.90399238,
+              0.90906632, 0.92693339, 0.93695966, 0.94242697, 0.94338265,
+              0.91981796, 0.91108804, 0.90543476, 0.91737138, 0.94793657],
+             [1.1977611, 1.18222564, 1.18439158, 1.18267865, 1.19286723,
+              1.20172869, 1.21328691, 1.22624778, 1.22397075, 1.23857042,
+              1.24419893, 1.23929384, 1.23418676, 1.23626739, 1.26754398],
+             [1.24919678, 1.25754773, 1.26991161, 1.28020651, 1.30625667,
+              1.34790023, 1.34399863, 1.32575181, 1.30795492, 1.30544841,
+              1.30303302, 1.32107766, 1.32936244, 1.33001241, 1.33288462],
+             [1.06768004, 1.03799276, 1.03637303, 1.02768449, 1.03296093,
+              1.05059016, 1.03405057, 1.02747623, 1.03162734, 0.9961416,
+              0.97356208, 0.94241549, 0.92754547, 0.92549227, 0.92138102],
+             [1.09475614, 1.11526796, 1.11654299, 1.13103948, 1.13143264,
+              1.13889622, 1.12442212, 1.13367018, 1.13982256, 1.14029944,
+              1.11979401, 1.10905389, 1.10577769, 1.11166825, 1.09985155],
+             [0.76530058, 0.76612841, 0.76542451, 0.76722683, 0.76014284,
+              0.74480073, 0.76098396, 0.76156903, 0.76651952, 0.76533288,
+              0.78205934, 0.76842416, 0.77487118, 0.77768683, 0.78801192],
+             [0.98391336, 0.98075816, 0.98295341, 0.97386015, 0.96913803,
+              0.97370819, 0.96419154, 0.97209861, 0.97441313, 0.96356162,
+              0.94745352, 0.93965462, 0.93069645, 0.94020973, 0.94358232],
+             [0.83561828, 0.82298088, 0.81738502, 0.81748588, 0.80904801,
+              0.80071489, 0.83358256, 0.83451613, 0.85175032, 0.85954307,
+              0.86790024, 0.87170334, 0.87863799, 0.87497981, 0.87888675],
+             [0.98845573, 1.02092428, 0.99665283, 0.99141823, 0.99386619,
+              0.98733195, 0.99644997, 0.99669587, 1.02559097, 1.01116651,
+              0.99988024, 0.97906749, 0.99323123, 1.00204939, 0.99602148],
+             [1.14930913, 1.15241949, 1.14300962, 1.14265542, 1.13984683,
+              1.08312397, 1.05192626, 1.04230892, 1.05577278, 1.08569751,
+              1.12443486, 1.08891079, 1.08603695, 1.05997314, 1.02160943],
+             [1.11368269, 1.1057147, 1.11893431, 1.13778669, 1.1432272,
+              1.18257029, 1.16226243, 1.16009196, 1.14467789, 1.14820235,
+              1.12386598, 1.12680236, 1.12357937, 1.1159258, 1.12570828],
+             [1.30379431, 1.30752186, 1.31206366, 1.31532267, 1.30625667,
+              1.31210239, 1.29989156, 1.29203193, 1.27183516, 1.26830786,
+              1.2617743, 1.28656675, 1.29734097, 1.29390205, 1.29345446],
+             [0.83953719, 0.82701448, 0.82006005, 0.81188876, 0.80294864,
+              0.78772975, 0.82848011, 0.8259679, 0.82435705, 0.83108634,
+              0.84373784, 0.83891093, 0.84349247, 0.85637272, 0.86539395],
+             [1.23450087, 1.2426022, 1.23537935, 1.23581293, 1.24522626,
+              1.2256767, 1.21126648, 1.19377804, 1.18355337, 1.19674434,
+              1.21536573, 1.23653297, 1.27962009, 1.27968392, 1.25907738],
+             [0.9769662, 0.97400719, 0.98035944, 0.97581531, 0.95543282,
+              0.96480308, 0.94686376, 0.93679073, 0.92540049, 0.92988835,
+              0.93442917, 0.92100464, 0.91475304, 0.90249622, 0.9021363],
+             [0.84986886, 0.8986851, 0.84295997, 0.87280534, 0.85659368,
+              0.88937573, 0.894401, 0.90448993, 0.95495898, 0.92698333,
+              0.94745352, 0.92562488, 0.96635366, 1.02520312, 1.0394296],
+             [1.01922808, 1.00258203, 1.00974428, 1.00303417, 0.99765073,
+              1.00759019, 0.99192968, 0.99747298, 0.99550759, 0.97583768,
+              0.9610168, 0.94779638, 0.93759089, 0.93353431, 0.94121705],
+             [0.86367411, 0.85558932, 0.85544346, 0.85103025, 0.84336613,
+              0.83434854, 0.85813595, 0.84667961, 0.84374558, 0.85951183,
+              0.87194227, 0.89455097, 0.88283929, 0.90349491, 0.90600675],
+             [1.00947534, 1.00411055, 1.00698819, 0.99513687, 0.99291086,
+              1.00581626, 0.98850522, 0.99291168, 0.98983209, 0.97511924,
+              0.96134615, 0.96382634, 0.95011401, 0.9434686, 0.94637765],
+             [1.05712571, 1.05459419, 1.05753012, 1.04880786, 1.05103857,
+              1.04800023, 1.03024941, 1.04200483, 1.0402554, 1.03296979,
+              1.02191682, 1.02476275, 1.02347523, 1.02517684, 1.04359571],
+             [1.07084189, 1.06669497, 1.07937623, 1.07387988, 1.0794043,
+              1.0531801, 1.07452771, 1.09383478, 1.1052447, 1.10322136,
+              1.09167939, 1.08772756, 1.08859544, 1.09177338, 1.1096083],
+             [0.86719222, 0.86628896, 0.86675156, 0.86425632, 0.86511809,
+              0.86287327, 0.85169796, 0.85411285, 0.84886336, 0.84517414,
+              0.84843858, 0.84488343, 0.83374329, 0.82812044, 0.82878599],
+             [0.88389211, 0.92288667, 0.90282398, 0.91229186, 0.92023286,
+              0.92652175, 0.94278865, 0.93682452, 0.98655146, 0.992237,
+              0.9798497, 0.93869677, 0.96947771, 1.00362626, 0.98102351],
+             [0.97082064, 0.95320233, 0.94534081, 0.94215593, 0.93967,
+              0.93092109, 0.92662519, 0.93412152, 0.93501274, 0.92879506,
+              0.92110542, 0.91035556, 0.90430364, 0.89994694, 0.90073864],
+             [0.95861858, 0.95774543, 0.98254811, 0.98919472, 0.98684824,
+              0.98882205, 0.97662234, 0.95601578, 0.94905385, 0.94934888,
+              0.97152609, 0.97163004, 0.9700702, 0.97158948, 0.95884908],
+             [0.83980439, 0.84726737, 0.85747, 0.85467221, 0.8556751,
+              0.84818516, 0.85265681, 0.84502402, 0.82645665, 0.81743586,
+              0.83550406, 0.83338919, 0.83511679, 0.82136617, 0.80921874],
+             [0.95118156, 0.9466212, 0.94688098, 0.9508583, 0.9512441,
+              0.95440787, 0.96364363, 0.96804412, 0.97136214, 0.97583768,
+              0.95571724, 0.96895368, 0.97001634, 0.97082733, 0.98782366],
+             [1.08910044, 1.08248968, 1.08492895, 1.08656923, 1.09454249,
+              1.10558188, 1.1214086, 1.12292577, 1.13021031, 1.13342735,
+              1.14686068, 1.14502975, 1.14474747, 1.14084037, 1.16142926],
+             [1.06336033, 1.07365823, 1.08691496, 1.09764846, 1.11669863,
+              1.11856702, 1.09764283, 1.08815849, 1.08044313, 1.09278827,
+              1.07003204, 1.08398066, 1.09831768, 1.09298232, 1.09176125],
+             [0.79772065, 0.78829196, 0.78581151, 0.77615922, 0.77035744,
+              0.77751194, 0.79902974, 0.81437881, 0.80788828, 0.79603865,
+              0.78966436, 0.79949807, 0.80172182, 0.82168155, 0.85587911],
+             [1.0052447, 1.00007696, 1.00475899, 1.00613942, 1.00639561,
+              1.00162979, 0.99860739, 1.00814981, 1.00574316, 0.99030032,
+              0.97682565, 0.97292596, 0.96519561, 0.96173403, 0.95890284],
+             [0.95808419, 0.9382568, 0.9654441, 0.95561201, 0.96987289,
+              0.96608031, 0.99727185, 1.00781194, 1.03484236, 1.05333619,
+              1.0983263, 1.1704974, 1.17025154, 1.18730553, 1.14242645]])
+
+        self.assertTrue(np.allclose(result, expected))
+        self.assertEqual(type(result), type(expected))
+        self.assertEqual(result.shape, expected.shape)
+
+    def test_rebin_data(self):
+        """Test rebin_data"""
+        # sample in double the time (even case since 10 % 2 = 0):
+        # (0+1)/2, (2+3)/2, (4+5)/2, (6+7)/2, (8+9)/2
+        # = 0.5, 2.5, 4.5, 6.5, 8.5
+        ans_even = np.array([(i + 0.5) * np.ones(10, dtype=float)
+                             for i in range(0, 10, 2)]).T
+
+        self.assertTrue(
+            np.array_equal(std.rebin_data(self.time_data, 2), ans_even))
+
+        # sample in triple the time (uneven since 10 % 3 = 1):
+        # (0+1+2)/3, (3+4+5)/3, (6+7+8)/3, (9)/1
+        # = 1, 4, 7, 9
+        ans_odd = np.array([i * np.ones(10, dtype=float)
+                            for i in (1, 4, 7, 9)]).T
+        self.assertTrue(
+            np.array_equal(std.rebin_data(self.time_data, 3), ans_odd))
+
+    def test_get_prob_dist(self):
+        """Test get_prob_dist"""
+        lag_indices = np.array([1, 2, 3, 4])
+        unit_indices = np.array([1, 3, 2, 4])
+        answer = np.array([
+            [0.0754717, 0.88207547, 0.04245283, 0., 0.],
+            [0., 0., 0.09411765, 0.87058824, 0.03529412],
+            [0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505],
+            [0., 0., 0., 0.02352941, 0.97647059]
+        ])
+        result = std.get_prob_dist(self.transition_matrix,
+                                   lag_indices, unit_indices)
+
+        self.assertTrue(np.array_equal(result, answer))
+
+    def test_get_prob_stats(self):
+        """Test get_prob_stats"""
+
+        probs = np.array([
+            [0.0754717, 0.88207547, 0.04245283, 0., 0.],
+            [0., 0., 0.09411765, 0.87058824, 0.03529412],
+            [0.0049505, 0.09405941, 0.77722772, 0.11881188, 0.0049505],
+            [0., 0., 0., 0.02352941, 0.97647059]
+        ])
+        unit_indices = np.array([1, 3, 2, 4])
+        answer_up = np.array([0.04245283, 0.03529412, 0.12376238, 0.])
+        answer_down = np.array([0.0754717, 0.09411765, 0.0990099, 0.02352941])
+        answer_trend = np.array([-0.03301887 / 0.88207547,
+                                 -0.05882353 / 0.87058824,
+                                 0.02475248 / 0.77722772,
+                                 -0.02352941 / 0.97647059])
+        answer_volatility = np.array([0.34221495, 0.33705421,
+                                      0.29226542, 0.38834223])
+
+        result = std.get_prob_stats(probs, unit_indices)
+        result_up = result[0]
+        result_down = result[1]
+        result_trend = result[2]
+        result_volatility = result[3]
+
+        self.assertTrue(np.allclose(result_up, answer_up))
+        self.assertTrue(np.allclose(result_down, answer_down))
+        self.assertTrue(np.allclose(result_trend, answer_trend))
+        self.assertTrue(np.allclose(result_volatility, answer_volatility))
diff --git a/src/pg/crankshaft.control b/src/pg/crankshaft.control
index 7d5a93a..028fb76 100644
--- a/src/pg/crankshaft.control
+++ b/src/pg/crankshaft.control
@@ -1,5 +1,5 @@
comment = 'CartoDB Spatial Analysis extension'
-default_version = '0.7.0'
+default_version = '0.8.0'
requires = 'plpythonu, postgis'
superuser = true
schema = cdb_crankshaft