From 9b88db8475c7870037110f63d09082723287fd31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Thu, 19 Dec 2019 18:50:54 +0100 Subject: [PATCH] Missing upgrade path --- NEWS.md | 1 - release/crankshaft--0.8.2--0.9.0.sql | 2339 ++++++++++++++++++++++++++ 2 files changed, 2339 insertions(+), 1 deletion(-) create mode 100644 release/crankshaft--0.8.2--0.9.0.sql diff --git a/NEWS.md b/NEWS.md index b3c137f..5f18850 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,7 +3,6 @@ * Compatibility with PG12. * Compatibility with python3 (enable with PYTHON3=true env variable, default in PG12+). - 0.8.2 (2019-02-07) ------------------ * Update dependencies to match what it's being used in production. diff --git a/release/crankshaft--0.8.2--0.9.0.sql b/release/crankshaft--0.8.2--0.9.0.sql new file mode 100644 index 0000000..ebe80f7 --- /dev/null +++ b/release/crankshaft--0.8.2--0.9.0.sql @@ -0,0 +1,2339 @@ +--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES +-- Complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit +-- Version number of the extension release +CREATE OR REPLACE FUNCTION cdb_crankshaft_version() +RETURNS text AS $$ + SELECT '0.9.0'::text; +$$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE; + +-- Internal identifier of the installed extension instence +-- e.g. 'dev' for current development version +CREATE OR REPLACE FUNCTION _cdb_crankshaft_internal_version() +RETURNS text AS $$ + SELECT installed_version FROM pg_available_extensions where name='crankshaft' and pg_available_extensions IS NOT NULL; +$$ language 'sql' STABLE STRICT PARALLEL SAFE; +-- Internal function. +-- Set the seeds of the RNGs (Random Number Generators) +-- used internally. +CREATE OR REPLACE FUNCTION +_cdb_random_seeds (seed_value INTEGER) RETURNS VOID +AS $$ + from crankshaft import random_seeds + random_seeds.set_random_seeds(seed_value) +$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; +CREATE OR REPLACE FUNCTION + CDB_PyAggS(current_state Numeric[], current_row Numeric[]) + returns NUMERIC[] as $$ + BEGIN + if array_upper(current_state,1) is null then + RAISE NOTICE 'setting state %',array_upper(current_row,1); + current_state[1] = array_upper(current_row,1); + end if; + return array_cat(current_state,current_row) ; + END + $$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; + +-- Create aggregate if it did not exist +DO $$ BEGIN + CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) ( + SFUNC = CDB_PyAggS, + STYPE = Numeric[], + PARALLEL = SAFE, + INITCOND = "{}" + ); +EXCEPTION + WHEN duplicate_function THEN NULL; +END $$; + +CREATE OR REPLACE FUNCTION + CDB_CreateAndPredictSegment( + target NUMERIC[], + features NUMERIC[], + target_features NUMERIC[], + target_ids NUMERIC[], + n_estimators INTEGER DEFAULT 1200, + max_depth INTEGER DEFAULT 3, + subsample DOUBLE PRECISION DEFAULT 0.5, + learning_rate DOUBLE PRECISION DEFAULT 0.01, + min_samples_leaf INTEGER DEFAULT 1) +RETURNS TABLE(cartodb_id NUMERIC, prediction NUMERIC, accuracy NUMERIC) +AS $$ + import numpy as np + import plpy + + from crankshaft.segmentation import Segmentation + seg = Segmentation() + model_params = {'n_estimators': n_estimators, + 'max_depth': max_depth, + 'subsample': subsample, + 'learning_rate': learning_rate, + 'min_samples_leaf': min_samples_leaf} + + def unpack2D(data): + dimension = data.pop(0) + a = np.array(data, dtype=np.float64) + return a.reshape(int(len(a)/dimension), int(dimension)) + + return seg.create_and_predict_segment_agg( + np.array(target, dtype=np.float64), + unpack2D(features), + unpack2D(target_features), + target_ids, + model_params) + +$$ LANGUAGE plpython3u VOLATILE PARALLEL RESTRICTED; + +CREATE OR REPLACE FUNCTION + CDB_CreateAndPredictSegment( + query TEXT, + variable_name TEXT, + target_table TEXT, + n_estimators INTEGER DEFAULT 1200, + max_depth INTEGER DEFAULT 3, + subsample DOUBLE PRECISION DEFAULT 0.5, + learning_rate DOUBLE PRECISION DEFAULT 0.01, + min_samples_leaf INTEGER DEFAULT 1) +RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC) +AS $$ + from crankshaft.segmentation import Segmentation + seg = Segmentation() + model_params = { + 'n_estimators': n_estimators, + 'max_depth': max_depth, + 'subsample': subsample, + 'learning_rate': learning_rate, + 'min_samples_leaf': min_samples_leaf + } + feature_cols = set(plpy.execute(''' + select * from ({query}) as _w limit 0 + '''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ]) + return seg.create_and_predict_segment( + query, + variable_name, + feature_cols, + target_table, + model_params + ) +$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; + +CREATE OR REPLACE FUNCTION + CDB_CreateAndPredictSegment( + query TEXT, + variable TEXT, + feature_columns TEXT[], + target_query TEXT, + n_estimators INTEGER DEFAULT 1200, + max_depth INTEGER DEFAULT 3, + subsample DOUBLE PRECISION DEFAULT 0.5, + learning_rate DOUBLE PRECISION DEFAULT 0.01, + min_samples_leaf INTEGER DEFAULT 1) +RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC) +AS $$ + from crankshaft.segmentation import Segmentation + seg = Segmentation() + model_params = { + 'n_estimators': n_estimators, + 'max_depth': max_depth, + 'subsample': subsample, + 'learning_rate': learning_rate, + 'min_samples_leaf': min_samples_leaf + } + return seg.create_and_predict_segment( + query, + variable, + feature_columns, + target_query, + model_params + ) +$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; +CREATE OR REPLACE FUNCTION CDB_Gravity( + IN target_query text, + IN weight_column text, + IN source_query text, + IN pop_column text, + IN target bigint, + IN radius integer, + IN minval numeric DEFAULT -10e307 + ) +RETURNS TABLE( + the_geom geometry, + source_id bigint, + target_id bigint, + dist numeric, + h numeric, + hpop numeric) AS $$ +DECLARE + t_id bigint[]; + t_geom geometry[]; + t_weight numeric[]; + s_id bigint[]; + s_geom geometry[]; + s_pop numeric[]; +BEGIN + EXECUTE 'WITH foo as('+target_query+') SELECT array_agg(cartodb_id), array_agg(the_geom), array_agg(' || weight_column || ') FROM foo' INTO t_id, t_geom, t_weight; + EXECUTE 'WITH foo as('+source_query+') SELECT array_agg(cartodb_id), array_agg(the_geom), array_agg(' || pop_column || ') FROM foo' INTO s_id, s_geom, s_pop; + RETURN QUERY + SELECT g.* FROM t, s, CDB_Gravity(t_id, t_geom, t_weight, s_id, s_geom, s_pop, target, radius, minval) g; +END; +$$ language plpgsql VOLATILE PARALLEL UNSAFE; + +CREATE OR REPLACE FUNCTION CDB_Gravity( + IN t_id bigint[], + IN t_geom geometry[], + IN t_weight numeric[], + IN s_id bigint[], + IN s_geom geometry[], + IN s_pop numeric[], + IN target bigint, + IN radius integer, + IN minval numeric DEFAULT -10e307 + ) +RETURNS TABLE( + the_geom geometry, + source_id bigint, + target_id bigint, + dist numeric, + h numeric, + hpop numeric) AS $$ +DECLARE + t_type text; + s_type text; + t_center geometry[]; + s_center geometry[]; +BEGIN + t_type := GeometryType(t_geom[1]); + s_type := GeometryType(s_geom[1]); + IF t_type = 'POINT' THEN + t_center := t_geom; + ELSE + WITH tmp as (SELECT unnest(t_geom) as g) SELECT array_agg(ST_Centroid(g)) INTO t_center FROM tmp; + END IF; + IF s_type = 'POINT' THEN + s_center := s_geom; + ELSE + WITH tmp as (SELECT unnest(s_geom) as g) SELECT array_agg(ST_Centroid(g)) INTO s_center FROM tmp; + END IF; + RETURN QUERY + with target0 as( + SELECT unnest(t_center) as tc, unnest(t_weight) as tw, unnest(t_id) as td + ), + source0 as( + SELECT unnest(s_center) as sc, unnest(s_id) as sd, unnest (s_geom) as sg, unnest(s_pop) as sp + ), + prev0 as( + SELECT + source0.sg, + source0.sd as sourc_id, + coalesce(source0.sp,0) as sp, + target.td as targ_id, + coalesce(target.tw,0) as tw, + GREATEST(1.0,ST_Distance(geography(target.tc), geography(source0.sc)))::numeric as distance + FROM source0 + CROSS JOIN LATERAL + ( + SELECT + * + FROM target0 + WHERE tw > minval + AND ST_DWithin(geography(source0.sc), geography(tc), radius) + ) AS target + ), + deno as( + SELECT + sourc_id, + sum(tw/distance) as h_deno + FROM + prev0 + GROUP BY sourc_id + ) + SELECT + p.sg as the_geom, + p.sourc_id as source_id, + p.targ_id as target_id, + case when p.distance > 1 then p.distance else 0.0 end as dist, + 100*(p.tw/p.distance)/d.h_deno as h, + p.sp*(p.tw/p.distance)/d.h_deno as hpop + FROM + prev0 p, + deno d + WHERE + p.targ_id = target AND + p.sourc_id = d.sourc_id; +END; +$$ language plpgsql IMMUTABLE PARALLEL SAFE; +-- 0: nearest neighbor(s) +-- 1: barymetric +-- 2: IDW +-- 3: krigin ---> TO DO + + +CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation( + IN query text, + IN point geometry, + IN method integer DEFAULT 1, + IN p1 numeric DEFAULT 0, + IN p2 numeric DEFAULT 0 + ) +RETURNS numeric AS +$$ +DECLARE + gs geometry[]; + vs numeric[]; + output numeric; +BEGIN + EXECUTE 'WITH a AS('||query||') SELECT array_agg(the_geom), array_agg(attrib) FROM a' INTO gs, vs; + SELECT CDB_SpatialInterpolation(gs, vs, point, method, p1,p2) INTO output FROM a; + + RETURN output; +END; +$$ +language plpgsql VOLATILE PARALLEL UNSAFE; + +CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation( + IN geomin geometry[], + IN colin numeric[], + IN point geometry, + IN method integer DEFAULT 1, + IN p1 numeric DEFAULT 0, + IN p2 numeric DEFAULT 0 + ) +RETURNS numeric AS +$$ +DECLARE + gs geometry[]; + vs numeric[]; + gs2 geometry[]; + vs2 numeric[]; + g geometry; + vertex geometry[]; + sg numeric; + sa numeric; + sb numeric; + sc numeric; + va numeric; + vb numeric; + vc numeric; + output numeric; +BEGIN + -- output := -999.999; + + -- nearest neighbors + -- p1: limit the number of neighbors, 0-> closest one + IF method = 0 THEN + + IF p1 = 0 THEN + p1 := 1; + END IF; + + WITH a as (SELECT unnest(geomin) as g, unnest(colin) as v), + b as (SELECT a.v as v FROM a ORDER BY point<->a.g LIMIT p1::integer) + SELECT avg(b.v) INTO output FROM b; + RETURN output; + + -- barymetric + ELSIF method = 1 THEN + WITH a as (SELECT unnest(geomin) AS e), + b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a), + c as (SELECT (ST_Dump(t)).geom as v FROM b), + d as (SELECT v FROM c WHERE ST_Within(point, v)) + SELECT v INTO g FROM d; + IF g is null THEN + -- out of the realm of the input data + RETURN -888.888; + END IF; + -- vertex of the selected cell + WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v) + SELECT array_agg(v) INTO vertex FROM a; + + -- retrieve the value of each vertex + WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c) + SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]); + WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c) + SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]); + WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c) + SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]); + + SELECT ST_area(g), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[2], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[1], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point,vertex[1],vertex[2], point]))) INTO sg, sa, sb, sc; + + output := (coalesce(sa,0) * coalesce(va,0) + coalesce(sb,0) * coalesce(vb,0) + coalesce(sc,0) * coalesce(vc,0)) / coalesce(sg); + RETURN output; + + -- IDW + -- p1: limit the number of neighbors, 0->no limit + -- p2: order of distance decay, 0-> order 1 + ELSIF method = 2 THEN + + IF p2 = 0 THEN + p2 := 1; + END IF; + + WITH a as (SELECT unnest(geomin) as g, unnest(colin) as v), + b as (SELECT a.g, a.v FROM a ORDER BY point<->a.g) + SELECT array_agg(b.g), array_agg(b.v) INTO gs, vs FROM b; + IF p1::integer>0 THEN + gs2:=gs; + vs2:=vs; + FOR i IN 1..p1 + LOOP + gs2 := gs2 || gs[i]; + vs2 := vs2 || vs[i]; + END LOOP; + ELSE + gs2:=gs; + vs2:=vs; + END IF; + + WITH a as (SELECT unnest(gs2) as g, unnest(vs2) as v), + b as ( + SELECT + (1/ST_distance(point, a.g)^p2::integer) as k, + (a.v/ST_distance(point, a.g)^p2::integer) as f + FROM a + ) + SELECT sum(b.f)/sum(b.k) INTO output FROM b; + RETURN output; + + -- krigin + ELSIF method = 3 THEN + + -- TO DO + + END IF; + + RETURN -777.777; + +END; +$$ +language plpgsql IMMUTABLE PARALLEL SAFE; +-- ============================================================================================= +-- +-- CDB_Voronoi +-- +-- ============================================================================================= +CREATE OR REPLACE FUNCTION CDB_voronoi( + IN geomin geometry[], + IN buffer numeric DEFAULT 0.5, + IN tolerance numeric DEFAULT 1e-9 + ) +RETURNS geometry AS $$ +DECLARE + geomout geometry; +BEGIN + -- we need to make the geometry calculations in (pseudo)meters!!! + with a as ( + SELECT unnest(geomin) as g1 + ), + b as( + SELECT st_transform(g1, 3857) g2 from a + ) + SELECT array_agg(g2) INTO geomin from b; + + WITH + convexhull_1 as ( + SELECT + ST_ConvexHull(ST_Collect(geomin)) as g, + buffer * |/ (st_area(ST_ConvexHull(ST_Collect(geomin)))/PI()) as r + ), + clipper as( + SELECT + st_buffer(ST_MinimumBoundingCircle(a.g), buffer*a.r) as g + FROM convexhull_1 a + ), + env0 as ( + SELECT + (st_dumppoints(st_expand(a.g, buffer*a.r))).geom as e + FROM convexhull_1 a + ), + env as ( + SELECT + array_agg(env0.e) as e + FROM env0 + ), + sample AS ( + SELECT + ST_Collect(geomin || env.e) as geom + FROM env + ), + convexhull as ( + SELECT + ST_ConvexHull(ST_Collect(geomin)) as cg + ), + tin as ( + SELECT + ST_Dump(ST_DelaunayTriangles(geom, tolerance, 0)) as gd + FROM + sample + ), + tin_polygons as ( + SELECT + (gd).Path as id, + (gd).Geom as pg, + ST_Centroid(ST_MinimumBoundingCircle((gd).Geom, 180)) as ct + FROM tin + ), + tin_lines as ( + SELECT + id, + ST_ExteriorRing(pg) as lg + FROM tin_polygons + ), + tin_nodes as ( + SELECT + id, + ST_PointN(lg,1) p1, + ST_PointN(lg,2) p2, + ST_PointN(lg,3) p3 + FROM tin_lines + ), + tin_edges AS ( + SELECT + p.id, + UNNEST(ARRAY[ + ST_MakeLine(n.p1,n.p2) , + ST_MakeLine(n.p2,n.p3) , + ST_MakeLine(n.p3,n.p1)]) as Edge, + ST_Force2D(cdb_crankshaft._Find_Circle(n.p1,n.p2,n.p3)) as ct, + CASE WHEN st_distance(p.ct, ST_ExteriorRing(p.pg)) < tolerance THEN + TRUE + ELSE FALSE END AS ctx, + p.pg, + ST_within(p.ct, convexhull.cg) as ctin + FROM + tin_polygons p, + tin_nodes n, + convexhull + WHERE p.id = n.id + ), + voro_nodes as ( + SELECT + CASE WHEN x.ctx = TRUE THEN + ST_Centroid(x.edge) + ELSE + x.ct + END as xct, + CASE WHEN y.id is null THEN + CASE WHEN x.ctin = TRUE THEN + ST_SetSRID(ST_MakePoint( + ST_X(x.ct) + ((ST_X(ST_Centroid(x.edge)) - ST_X(x.ct)) * (1+buffer)), + ST_Y(x.ct) + ((ST_Y(ST_Centroid(x.edge)) - ST_Y(x.ct)) * (1+buffer)) + ), ST_SRID(x.ct)) + END + ELSE + y.ct + END as yct + FROM + tin_edges x + LEFT OUTER JOIN + tin_edges y + ON x.id <> y.id AND ST_Equals(x.edge, y.edge) + ), + voro_edges as( + SELECT + ST_LineMerge(ST_Collect(ST_MakeLine(xct, yct))) as v + FROM + voro_nodes + ), + voro_cells as( + SELECT + ST_Polygonize( + ST_Node( + ST_LineMerge( + ST_Union(v, ST_ExteriorRing( + ST_Convexhull(v) + ) + ) + ) + ) + ) as g + FROM + voro_edges + ), + voro_set as( + SELECT + (st_dump(v.g)).geom as g + FROM voro_cells v + ), + clipped_voro as( + SELECT + ST_intersection(c.g, v.g) as g + FROM + voro_set v, + clipper c + WHERE + ST_GeometryType(v.g) = 'ST_Polygon' + ) + SELECT + st_collect( + ST_Transform( + ST_ConvexHull(g), + 4326 + ) + ) + INTO geomout + FROM + clipped_voro; + RETURN geomout; +END; +$$ language plpgsql IMMUTABLE PARALLEL SAFE; + +/** ---------------------------------------------------------------------------------------- + * @function : FindCircle + * @precis : Function that determines if three points form a circle. If so a table containing + * centre and radius is returned. If not, a null table is returned. + * @version : 1.0.1 + * @param : p_pt1 : First point in curve + * @param : p_pt2 : Second point in curve + * @param : p_pt3 : Third point in curve + * @return : geometry : In which X,Y ordinates are the centre X, Y and the Z being the radius of found circle + * or NULL if three points do not form a circle. + * @history : Simon Greener - Feb 2012 - Original coding. + * Rafa de la Torre - Aug 2016 - Small fix for type checking + * Raul Marin - Sept 2017 - Remove unnecessary NULL checks and set function categories + * @copyright : Simon Greener @ 2012 + * Licensed under a Creative Commons Attribution-Share Alike 2.5 Australia License. (http://creativecommons.org/licenses/by-sa/2.5/au/) +**/ +CREATE OR REPLACE FUNCTION _Find_Circle( + IN p_pt1 geometry, + IN p_pt2 geometry, + IN p_pt3 geometry) + RETURNS geometry AS +$BODY$ +DECLARE + v_Centre geometry; + v_radius NUMERIC; + v_CX NUMERIC; + v_CY NUMERIC; + v_dA NUMERIC; + v_dB NUMERIC; + v_dC NUMERIC; + v_dD NUMERIC; + v_dE NUMERIC; + v_dF NUMERIC; + v_dG NUMERIC; +BEGIN + IF ( ST_GeometryType(p_pt1) <> 'ST_Point' OR + ST_GeometryType(p_pt2) <> 'ST_Point' OR + ST_GeometryType(p_pt3) <> 'ST_Point' ) THEN + RAISE EXCEPTION 'All supplied geometries must be points.'; + RETURN NULL; + END IF; + v_dA := ST_X(p_pt2) - ST_X(p_pt1); + v_dB := ST_Y(p_pt2) - ST_Y(p_pt1); + v_dC := ST_X(p_pt3) - ST_X(p_pt1); + v_dD := ST_Y(p_pt3) - ST_Y(p_pt1); + v_dE := v_dA * (ST_X(p_pt1) + ST_X(p_pt2)) + v_dB * (ST_Y(p_pt1) + ST_Y(p_pt2)); + v_dF := v_dC * (ST_X(p_pt1) + ST_X(p_pt3)) + v_dD * (ST_Y(p_pt1) + ST_Y(p_pt3)); + v_dG := 2.0 * (v_dA * (ST_Y(p_pt3) - ST_Y(p_pt2)) - v_dB * (ST_X(p_pt3) - ST_X(p_pt2))); + -- If v_dG is zero then the three points are collinear and no finite-radius + -- circle through them exists. + IF ( v_dG = 0 ) THEN + RETURN NULL; + ELSE + v_CX := (v_dD * v_dE - v_dB * v_dF) / v_dG; + v_CY := (v_dA * v_dF - v_dC * v_dE) / v_dG; + v_Radius := SQRT(POWER(ST_X(p_pt1) - v_CX,2) + POWER(ST_Y(p_pt1) - v_CY,2) ); + END IF; + RETURN ST_SetSRID(ST_MakePoint(v_CX, v_CY, v_radius),ST_Srid(p_pt1)); +END; +$BODY$ + LANGUAGE plpgsql IMMUTABLE STRICT PARALLEL SAFE; + +-- Moran's I Global Measure (public-facing) +CREATE OR REPLACE FUNCTION + CDB_AreasOfInterestGlobal( + subquery TEXT, + column_name TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS TABLE (moran NUMERIC, significance NUMERIC) +AS $$ + from crankshaft.clustering import Moran + # TODO: use named parameters or a dictionary + moran = Moran() + return moran.global_stat(subquery, column_name, w_type, + num_ngbrs, permutations, geom_col, id_col) +$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; + +-- Moran's I Local (internal function) - DEPRECATED +CREATE OR REPLACE FUNCTION + _CDB_AreasOfInterestLocal( + subquery TEXT, + column_name TEXT, + w_type TEXT, + num_ngbrs INT, + permutations INT, + geom_col TEXT, + id_col TEXT) +RETURNS TABLE ( + moran NUMERIC, + quads TEXT, + significance NUMERIC, + rowid INT, + vals NUMERIC) +AS $$ + from crankshaft.clustering import Moran + moran = Moran() + result = moran.local_stat(subquery, column_name, w_type, + num_ngbrs, permutations, geom_col, id_col) + # remove spatial lag + return [(r[6], r[0], r[1], r[7], r[5]) for r in result] +$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; + +-- Moran's I Local (internal function) +CREATE OR REPLACE FUNCTION + _CDB_MoransILocal( + subquery TEXT, + column_name TEXT, + w_type TEXT, + num_ngbrs INT, + permutations INT, + geom_col TEXT, + id_col TEXT) +RETURNS TABLE ( + quads TEXT, + significance NUMERIC, + spatial_lag NUMERIC, + spatial_lag_std NUMERIC, + orig_val NUMERIC, + orig_val_std NUMERIC, + moran_stat NUMERIC, + rowid INT) +AS $$ + +from crankshaft.clustering import Moran +moran = Moran() +return moran.local_stat(subquery, column_name, w_type, + num_ngbrs, permutations, geom_col, id_col) + +$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; + + +-- Moran's I Local (public-facing function) +-- Replaces CDB_AreasOfInterestLocal +CREATE OR REPLACE FUNCTION + CDB_MoransILocal( + subquery TEXT, + column_name TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS TABLE ( + quads TEXT, + significance NUMERIC, + spatial_lag NUMERIC, + spatial_lag_std NUMERIC, + orig_val NUMERIC, + orig_val_std NUMERIC, + moran_stat NUMERIC, + rowid INT) +AS $$ + + SELECT + quads, significance, spatial_lag, spatial_lag_std, + orig_val, orig_val_std, moran_stat, rowid + FROM cdb_crankshaft._CDB_MoransILocal( + subquery, column_name, w_type, + num_ngbrs, permutations, geom_col, id_col); + +$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE; + +-- Moran's I Local (public-facing function) - DEPRECATED +CREATE OR REPLACE FUNCTION + CDB_AreasOfInterestLocal( + subquery TEXT, + column_name TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col); + +$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE; + +-- Moran's I only for HH and HL (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_GetSpatialHotspots( + subquery TEXT, + column_name TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') + RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col) + WHERE quads IN ('HH', 'HL'); + +$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE; + +-- Moran's I only for LL and LH (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_GetSpatialColdspots( + subquery TEXT, + attr TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') + RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col) + WHERE quads IN ('LL', 'LH'); + +$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE; + +-- Moran's I only for LH and HL (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_GetSpatialOutliers( + subquery TEXT, + attr TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') + RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col) + WHERE quads IN ('HL', 'LH'); + +$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE; + +-- Moran's I Global Rate (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_AreasOfInterestGlobalRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS TABLE (moran FLOAT, significance FLOAT) +AS $$ + from crankshaft.clustering import Moran + moran = Moran() + # TODO: use named parameters or a dictionary + return moran.global_rate_stat(subquery, numerator, denominator, w_type, + num_ngbrs, permutations, geom_col, id_col) +$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; + + +-- Moran's I Local Rate (internal function) - DEPRECATED +CREATE OR REPLACE FUNCTION + _CDB_AreasOfInterestLocalRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT, + num_ngbrs INT, + permutations INT, + geom_col TEXT, + id_col TEXT) +RETURNS +TABLE( + moran NUMERIC, + quads TEXT, + significance NUMERIC, + rowid INT, + vals NUMERIC) +AS $$ + from crankshaft.clustering import Moran + moran = Moran() + # TODO: use named parameters or a dictionary + result = moran.local_rate_stat(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) + # remove spatial lag + return [(r[6], r[0], r[1], r[7], r[4]) for r in result] +$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; + +-- Moran's I Local Rate (public-facing function) - DEPRECATED +CREATE OR REPLACE FUNCTION + CDB_AreasOfInterestLocalRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS +TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col); + +$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE; + +-- Internal function +CREATE OR REPLACE FUNCTION + _CDB_MoransILocalRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT, + num_ngbrs INT, + permutations INT, + geom_col TEXT, + id_col TEXT) +RETURNS +TABLE( + quads TEXT, + significance NUMERIC, + spatial_lag NUMERIC, + spatial_lag_std NUMERIC, + orig_val NUMERIC, + orig_val_std NUMERIC, + moran_stat NUMERIC, + rowid INT) +AS $$ +from crankshaft.clustering import Moran +moran = Moran() +return moran.local_rate_stat( + subquery, + numerator, + denominator, + w_type, + num_ngbrs, + permutations, + geom_col, + id_col +) +$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; + +-- Moran's I Rate +-- Replaces CDB_AreasOfInterestLocalRate +CREATE OR REPLACE FUNCTION + CDB_MoransILocalRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS +TABLE( + quads TEXT, + significance NUMERIC, + spatial_lag NUMERIC, + spatial_lag_std NUMERIC, + orig_val NUMERIC, + orig_val_std NUMERIC, + moran_stat NUMERIC, + rowid INT) +AS $$ + +SELECT + quads, significance, spatial_lag, spatial_lag_std, + orig_val, orig_val_std, moran_stat, rowid +FROM cdb_crankshaft._CDB_MoransILocalRate( + subquery, numerator, denominator, w_type, + num_ngbrs, permutations, geom_col, id_col); + +$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE; + +-- Moran's I Local Rate only for HH and HL (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_GetSpatialHotspotsRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS +TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) + WHERE quads IN ('HH', 'HL'); + +$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE; + +-- Moran's I Local Rate only for LL and LH (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_GetSpatialColdspotsRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS +TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) + WHERE quads IN ('LL', 'LH'); + +$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE; + +-- Moran's I Local Rate only for LH and HL (public-facing function) +CREATE OR REPLACE FUNCTION + CDB_GetSpatialOutliersRate( + subquery TEXT, + numerator TEXT, + denominator TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS +TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC) +AS $$ + + SELECT moran, quads, significance, rowid, vals + FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col) + WHERE quads IN ('HL', 'LH'); + +$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE; +-- Spatial k-means clustering + +CREATE OR REPLACE FUNCTION CDB_KMeans( + query TEXT, + no_clusters INTEGER, + no_init INTEGER DEFAULT 20 +) +RETURNS TABLE( + cartodb_id INTEGER, + cluster_no INTEGER +) AS $$ + +from crankshaft.clustering import Kmeans +kmeans = Kmeans() +return kmeans.spatial(query, no_clusters, no_init) + +$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; + +-- Non-spatial k-means clustering +-- query: sql query to retrieve all the needed data +-- colnames: text array of column names for doing the clustering analysis +-- no_clusters: number of requested clusters +-- standardize: whether to scale variables to a mean of zero and a standard +-- deviation of 1 +-- id_colname: name of the id column + +CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial( + query TEXT, + colnames TEXT[], + no_clusters INTEGER, + standardize BOOLEAN DEFAULT true, + id_col TEXT DEFAULT 'cartodb_id' +) +RETURNS TABLE( + cluster_label text, + cluster_center json, + silhouettes numeric, + inertia numeric, + rowid bigint +) AS $$ + +from crankshaft.clustering import Kmeans +kmeans = Kmeans() +return kmeans.nonspatial(query, colnames, no_clusters, + standardize=standardize, + id_col=id_col) +$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; + + +CREATE OR REPLACE FUNCTION CDB_WeightedMeanS( + state NUMERIC[], + the_geom GEOMETRY(Point, 4326), + weight NUMERIC +) +RETURNS Numeric[] AS $$ +DECLARE + newX NUMERIC; + newY NUMERIC; + newW NUMERIC; +BEGIN + IF weight IS NULL OR the_geom IS NULL THEN + newX = state[1]; + newY = state[2]; + newW = state[3]; + ELSE + newX = state[1] + ST_X(the_geom)*weight; + newY = state[2] + ST_Y(the_geom)*weight; + newW = state[3] + weight; + END IF; + RETURN Array[newX,newY,newW]; + +END +$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; + + +CREATE OR REPLACE FUNCTION CDB_WeightedMeanF(state NUMERIC[]) +RETURNS GEOMETRY AS +$$ +BEGIN + IF state[3] = 0 THEN + RETURN ST_SetSRID(ST_MakePoint(state[1],state[2]), 4326); + ELSE + RETURN ST_SETSRID(ST_MakePoint(state[1]/state[3], state[2]/state[3]),4326); + END IF; +END +$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; + + +-- Create aggregate if it did not exist +DO $$ BEGIN + CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) ( + SFUNC = CDB_WeightedMeanS, + FINALFUNC = CDB_WeightedMeanF, + STYPE = Numeric[], + PARALLEL = SAFE, + INITCOND = "{0.0,0.0,0.0}" +); +EXCEPTION + WHEN duplicate_function THEN NULL; +END $$; +-- Spatial Markov + +-- input table format: +-- id | geom | date_1 | date_2 | date_3 +-- 1 | Pt1 | 12.3 | 13.1 | 14.2 +-- 2 | Pt2 | 11.0 | 13.2 | 12.5 +-- ... +-- Sample Function call: +-- SELECT CDB_SpatialMarkov('SELECT * FROM real_estate', +-- Array['date_1', 'date_2', 'date_3']) + +CREATE OR REPLACE FUNCTION + CDB_SpatialMarkovTrend ( + subquery TEXT, + time_cols TEXT[], + num_classes INT DEFAULT 7, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 99, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS TABLE (trend NUMERIC, trend_up NUMERIC, trend_down NUMERIC, volatility NUMERIC, rowid INT) +AS $$ + + from crankshaft.space_time_dynamics import Markov + markov = Markov() + + ## TODO: use named parameters or a dictionary + return markov.spatial_trend(subquery, time_cols, num_classes, w_type, num_ngbrs, permutations, geom_col, id_col) +$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; + +-- input table format: identical to above but in a predictable format +-- Sample function call: +-- SELECT cdb_spatial_markov('SELECT * FROM real_estate', +-- 'date_1') + + +-- CREATE OR REPLACE FUNCTION +-- cdb_spatial_markov ( +-- subquery TEXT, +-- time_col_min text, +-- time_col_max text, +-- date_format text, -- '_YYYY_MM_DD' +-- num_time_per_bin INT DEFAULT 1, +-- permutations INT DEFAULT 99, +-- geom_column TEXT DEFAULT 'the_geom', +-- id_col TEXT DEFAULT 'cartodb_id', +-- w_type TEXT DEFAULT 'knn', +-- num_ngbrs int DEFAULT 5) +-- RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT) +-- AS $$ +-- plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()') +-- from crankshaft.clustering import moran_local +-- # TODO: use named parameters or a dictionary +-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs) +-- $$ LANGUAGE plpython3u; +-- +-- -- input table format: +-- -- id | geom | date | measurement +-- -- 1 | Pt1 | 12/3 | 13.2 +-- -- 2 | Pt2 | 11/5 | 11.3 +-- -- 3 | Pt1 | 11/13 | 12.9 +-- -- 4 | Pt3 | 12/19 | 10.1 +-- -- ... +-- +-- CREATE OR REPLACE FUNCTION +-- cdb_spatial_markov ( +-- subquery TEXT, +-- time_col text, +-- num_time_per_bin INT DEFAULT 1, +-- permutations INT DEFAULT 99, +-- geom_column TEXT DEFAULT 'the_geom', +-- id_col TEXT DEFAULT 'cartodb_id', +-- w_type TEXT DEFAULT 'knn', +-- num_ngbrs int DEFAULT 5) +-- RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT) +-- AS $$ +-- plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()') +-- from crankshaft.clustering import moran_local +-- # TODO: use named parameters or a dictionary +-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs) +-- $$ LANGUAGE plpython3u; +-- Based on: +-- https://github.com/mapbox/polylabel/blob/master/index.js +-- https://sites.google.com/site/polesofinaccessibility/ +-- Requires: https://github.com/CartoDB/cartodb-postgresql + +-- Based on: +-- https://github.com/mapbox/polylabel/blob/master/index.js +-- https://sites.google.com/site/polesofinaccessibility/ +-- Requires: https://github.com/CartoDB/cartodb-postgresql + +CREATE OR REPLACE FUNCTION CDB_PIA( + IN polygon geometry, + IN tolerance numeric DEFAULT 1.0 + ) +RETURNS geometry AS $$ +DECLARE + env geometry[]; + cells geometry[]; + cell geometry; + best_c geometry; + best_d numeric; + test_d numeric; + test_mx numeric; + test_h numeric; + test_cells geometry[]; + width numeric; + height numeric; + h numeric; + i integer; + n integer; + sqr numeric; + p geometry; +BEGIN + sqr := 0.5*(|/2.0); + polygon := ST_Transform(polygon, 3857); + + -- grid #0 cell size + height := ST_YMax(polygon) - ST_YMin(polygon); + width := ST_XMax(polygon) - ST_XMin(polygon); + h := 0.5*LEAST(height, width); + + -- grid #0 + with c1 as( + SELECT cdb_crankshaft.CDB_RectangleGrid(polygon, h, h) as c + ) + SELECT array_agg(c) INTO cells FROM c1; + + -- 1st guess: centroid + best_c := polygon; + best_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(Polygon)); + + -- looping the loop + n := array_length(cells,1); + i := 1; + LOOP + + EXIT WHEN i > n; + + cell := cells[i]; + + i := i+1; + + -- cell side size, it's square + test_h := ST_XMax(cell) - ST_XMin(cell) ; + + -- check distance + test_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(cell)); + + IF test_d > best_d THEN + best_d := test_d; + best_c := cell; + END IF; + + -- longest distance within the cell + test_mx := test_d + (test_h * sqr); + + -- if the cell has no chance to contains the desired point, continue + CONTINUE WHEN test_mx - best_d <= tolerance; + + -- resample the cell + with c1 as( + SELECT cdb_crankshaft.CDB_RectangleGrid(cell, test_h/2, test_h/2) as c + ) + SELECT array_agg(c) INTO test_cells FROM c1; + + -- concat the new cells to the former array + cells := cells || test_cells; + + -- prepare next iteration + n := array_length(cells,1); + + END LOOP; + + RETURN ST_transform(ST_Centroid(best_c), 4326); + +END; +$$ language plpgsql IMMUTABLE PARALLEL SAFE; + + + +-- signed distance point to polygon with holes +-- negative is the point is out the polygon +-- rev 1. adding MULTIPOLYGON and GEOMETRYCOLLECTION support by @abelvm +CREATE OR REPLACE FUNCTION _Signed_Dist( + IN polygon geometry, + IN point geometry + ) +RETURNS numeric AS $$ +DECLARE + pols geometry[]; + pol geometry; + i integer; + j integer; + within integer; + w integer; + holes integer; + dist numeric; + d numeric; +BEGIN + dist := 1e999; + WITH collection as (SELECT (ST_dump(polygon)).geom as geom) SELECT array_agg(geom) into pols FROM collection; + FOR j in 1..array_length(pols, 1) + LOOP + pol := pols[j]; + d := dist; + SELECT LEAST(dist, ST_distance(point, ST_ExteriorRing(pol))::numeric) INTO d; + SELECT CASE WHEN ST_Within(point,pol) THEN 1 ELSE -1 END INTO w; + SELECT ST_NumInteriorRings(pol) INTO holes; + IF holes > 0 THEN + FOR i IN 1..holes + LOOP + SELECT LEAST(d, ST_distance(point, ST_InteriorRingN(pol, i))::numeric) INTO d; + END LOOP; + END IF; + IF d < dist THEN + dist:= d; + within := w; + END IF; + END LOOP; + dist := dist * within::numeric; + RETURN dist; +END; +$$ language plpgsql IMMUTABLE PARALLEL SAFE; +-- +-- Iterative densification of a set of points using Delaunay triangulation +-- the new points have as assigned value the average value of the 3 vertex (centroid) +-- +-- @param geomin - array of geometries (points) +-- +-- @param colin - array of numeric values in that points +-- +-- @param iterations - integer, number of iterations +-- +-- +-- Returns: TABLE(geomout geometry, colout numeric) +-- +-- +CREATE OR REPLACE FUNCTION CDB_Densify( + IN geomin geometry[], + IN colin numeric[], + IN iterations integer + ) +RETURNS TABLE(geomout geometry, colout numeric) AS $$ +DECLARE + geotemp geometry[]; + coltemp numeric[]; + i integer; + gs geometry[]; + g geometry; + vertex geometry[]; + va numeric; + vb numeric; + vc numeric; + center geometry; + centerval numeric; + tmp integer; +BEGIN + geotemp := geomin; + coltemp := colin; + FOR i IN 1..iterations + LOOP + -- generate TIN + WITH a as (SELECT unnest(geotemp) AS e), + b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a), + c as (SELECT (ST_Dump(t)).geom AS v FROM b) + SELECT array_agg(v) INTO gs FROM c; + -- loop cells + FOREACH g IN ARRAY gs + LOOP + -- append centroid + SELECT ST_Centroid(g) INTO center; + geotemp := array_append(geotemp, center); + -- retrieve the value of each vertex + WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v) + SELECT array_agg(v) INTO vertex FROM a; + WITH a AS(SELECT unnest(geotemp) as geo, unnest(coltemp) as c) + SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]); + WITH a AS(SELECT unnest(geotemp) as geo, unnest(coltemp) as c) + SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]); + WITH a AS(SELECT unnest(geotemp) as geo, unnest(coltemp) as c) + SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]); + -- calc the value at the center + centerval := (va + vb + vc) / 3; + -- append the value + coltemp := array_append(coltemp, centerval); + END LOOP; + END LOOP; + RETURN QUERY SELECT unnest(geotemp ) as geomout, unnest(coltemp ) as colout; +END; +$$ language plpgsql IMMUTABLE PARALLEL SAFE; +CREATE OR REPLACE FUNCTION CDB_TINmap( + IN geomin geometry[], + IN colin numeric[], + IN iterations integer + ) +RETURNS TABLE(geomout geometry, colout numeric) AS $$ +DECLARE + p geometry[]; + vals numeric[]; + gs geometry[]; + g geometry; + vertex geometry[]; + centerval numeric; + va numeric; + vb numeric; + vc numeric; + coltemp numeric[]; +BEGIN + SELECT array_agg(dens.geomout), array_agg(dens.colout) INTO p, vals FROM cdb_crankshaft.CDB_Densify(geomin, colin, iterations) dens; + WITH a as (SELECT unnest(p) AS e), + b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a), + c as (SELECT (ST_Dump(t)).geom AS v FROM b) + SELECT array_agg(v) INTO gs FROM c; + FOREACH g IN ARRAY gs + LOOP + -- retrieve the vertex of each triangle + WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v) + SELECT array_agg(v) INTO vertex FROM a; + -- retrieve the value of each vertex + WITH a AS(SELECT unnest(p) as geo, unnest(vals) as c) + SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]); + WITH a AS(SELECT unnest(p) as geo, unnest(vals) as c) + SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]); + WITH a AS(SELECT unnest(p) as geo, unnest(vals) as c) + SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]); + -- calc the value at the center + centerval := (va + vb + vc) / 3; + -- append the value + coltemp := array_append(coltemp, centerval); + END LOOP; + RETURN QUERY SELECT unnest(gs) as geomout, unnest(coltemp ) as colout; +END; +$$ language plpgsql IMMUTABLE PARALLEL SAFE; +-- Getis-Ord's G +-- Hotspot/Coldspot Analysis tool +CREATE OR REPLACE FUNCTION + CDB_GetisOrdsG( + subquery TEXT, + column_name TEXT, + w_type TEXT DEFAULT 'knn', + num_ngbrs INT DEFAULT 5, + permutations INT DEFAULT 999, + geom_col TEXT DEFAULT 'the_geom', + id_col TEXT DEFAULT 'cartodb_id') +RETURNS TABLE (z_score NUMERIC, p_value NUMERIC, p_z_sim NUMERIC, rowid BIGINT) +AS $$ + from crankshaft.clustering import Getis + getis = Getis() + return getis.getis_ord(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col) +$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; + +-- TODO: make a version that accepts the values as arrays + +-- Find outliers using a static threshold +-- +CREATE OR REPLACE FUNCTION CDB_StaticOutlier(column_value numeric, threshold numeric) +RETURNS boolean +AS $$ +BEGIN + + RETURN column_value > threshold; + +END; +$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE ; + +-- Find outliers by a percentage above the threshold +-- TODO: add symmetric option? `is_symmetric boolean DEFAULT false` + +CREATE OR REPLACE FUNCTION CDB_PercentOutlier(column_values numeric[], outlier_fraction numeric, ids int[]) +RETURNS TABLE(is_outlier boolean, rowid int) +AS $$ +DECLARE + avg_val numeric; + out_vals boolean[]; +BEGIN + + SELECT avg(i) INTO avg_val + FROM unnest(column_values) As x(i); + + IF avg_val = 0 THEN + RAISE EXCEPTION 'Mean value is zero. Try another outlier method.'; + END IF; + + SELECT array_agg( + outlier_fraction < i / avg_val) INTO out_vals + FROM unnest(column_values) As x(i); + + RETURN QUERY + SELECT unnest(out_vals) As is_outlier, + unnest(ids) As rowid; + +END; +$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; + +-- Find outliers above a given number of standard deviations from the mean + +CREATE OR REPLACE FUNCTION CDB_StdDevOutlier(column_values numeric[], num_deviations numeric, ids int[], is_symmetric boolean DEFAULT true) +RETURNS TABLE(is_outlier boolean, rowid int) +AS $$ +DECLARE + stddev_val numeric; + avg_val numeric; + out_vals boolean[]; +BEGIN + + SELECT stddev(i), avg(i) INTO stddev_val, avg_val + FROM unnest(column_values) As x(i); + + IF stddev_val = 0 THEN + RAISE EXCEPTION 'Standard deviation of input data is zero'; + END IF; + + IF is_symmetric THEN + SELECT array_agg( + abs(i - avg_val) / stddev_val > num_deviations) INTO out_vals + FROM unnest(column_values) As x(i); + ELSE + SELECT array_agg( + (i - avg_val) / stddev_val > num_deviations) INTO out_vals + FROM unnest(column_values) As x(i); + END IF; + + RETURN QUERY + SELECT unnest(out_vals) As is_outlier, + unnest(ids) As rowid; +END; +$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; +CREATE OR REPLACE FUNCTION CDB_Contour( + IN geomin geometry[], + IN colin numeric[], + IN buffer numeric, + IN intmethod integer, + IN classmethod integer, + IN steps integer, + IN max_time integer DEFAULT 60000 + ) +RETURNS TABLE( + the_geom geometry, + bin integer, + min_value numeric, + max_value numeric, + avg_value numeric +) AS $$ +DECLARE + cell_count integer; + tin geometry[]; + resolution integer; +BEGIN + + -- nasty trick to override issue #121 + IF max_time = 0 THEN + max_time = -90; + END IF; + resolution := max_time; + max_time := -1 * resolution; + + -- calc the optimal number of cells for the current dataset + SELECT + CASE intmethod + WHEN 0 THEN round(3.7745903782 * max_time - 9.4399210051 * array_length(geomin,1) - 1350.8778213073) + WHEN 1 THEN round(2.2855592156 * max_time - 87.285217133 * array_length(geomin,1) + 17255.7085601797) + WHEN 2 THEN round(0.9799471999 * max_time - 127.0334085369 * array_length(geomin,1) + 22707.9579721218) + ELSE 10000 + END INTO cell_count; + + -- we don't have iterative barycentric interpolation in CDB_interpolation, + -- and it's a costy function, so let's make a custom one here till + -- we update the code + -- tin := ARRAY[]::geometry[]; + IF intmethod=1 THEN + WITH + a as (SELECT unnest(geomin) AS e), + b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a), + c as (SELECT (ST_Dump(t)).geom as v FROM b) + SELECT array_agg(v) INTO tin FROM c; + END IF; + -- Delaunay stuff performed just ONCE!! + + -- magic + RETURN QUERY + WITH + convexhull as ( + SELECT + ST_ConvexHull(ST_Collect(geomin)) as g, + buffer * |/ st_area(ST_ConvexHull(ST_Collect(geomin)))/PI() as r + ), + envelope as ( + SELECT + st_expand(a.g, a.r) as e + FROM convexhull a + ), + envelope3857 as( + SELECT + ST_Transform(e, 3857) as geom + FROM envelope + ), + resolution as( + SELECT + CASE WHEN resolution <= 0 THEN + round(|/ ( + ST_area(geom) / abs(cell_count) + )) + ELSE + resolution + END AS cell + FROM envelope3857 + ), + grid as( + SELECT + ST_Transform(cdb_crankshaft.CDB_RectangleGrid(e.geom, r.cell, r.cell), 4326) as geom + FROM envelope3857 e, resolution r + ), + interp as( + SELECT + geom, + CASE + WHEN intmethod=1 THEN cdb_crankshaft._interp_in_tin(geomin, colin, tin, ST_Centroid(geom)) + ELSE cdb_crankshaft.CDB_SpatialInterpolation(geomin, colin, ST_Centroid(geom), intmethod) + END as val + FROM grid + ), + classes as( + SELECT CASE + WHEN classmethod = 0 THEN + cdb_crankshaft.CDB_EqualIntervalBins(array_agg(val), steps) + WHEN classmethod = 1 THEN + cdb_crankshaft.CDB_HeadsTailsBins(array_agg(val), steps) + WHEN classmethod = 2 THEN + cdb_crankshaft.CDB_JenksBins(array_agg(val), steps) + ELSE + cdb_crankshaft.CDB_QuantileBins(array_agg(val), steps) + END as b + FROM interp + where val is not null + ), + classified as( + SELECT + i.*, + width_bucket(i.val, c.b) as bucket + FROM interp i left join classes c + ON 1=1 + ), + classified2 as( + SELECT + geom, + val, + CASE + WHEN bucket = steps THEN bucket - 1 + ELSE bucket + END as b + FROM classified + ), + final as( + SELECT + st_union(geom) as the_geom, + b as bin, + min(val) as min_value, + max(val) as max_value, + avg(val) as avg_value + FROM classified2 + GROUP BY bin + ) + SELECT + * + FROM final + where final.bin is not null + ; +END; +$$ language plpgsql VOLATILE PARALLEL RESTRICTED; + + +-- ===================================================================== +-- Interp in grid, so we can use barycentric with a precalculated tin (NNI) +-- ===================================================================== +CREATE OR REPLACE FUNCTION _interp_in_tin( + IN geomin geometry[], + IN colin numeric[], + IN tin geometry[], + IN point geometry + ) +RETURNS numeric AS +$$ +DECLARE + g geometry; + vertex geometry[]; + sg numeric; + sa numeric; + sb numeric; + sc numeric; + va numeric; + vb numeric; + vc numeric; + output numeric; +BEGIN + -- get the cell the point is within + WITH + a as (SELECT unnest(tin) as v), + b as (SELECT v FROM a WHERE ST_Within(point, v)) + SELECT v INTO g FROM b; + + -- if we're out of the data realm, + -- return null + IF g is null THEN + RETURN null; + END IF; + + -- vertex of the selected cell + WITH a AS ( + SELECT (ST_DumpPoints(g)).geom AS v + ) + SELECT array_agg(v) INTO vertex FROM a; + + -- retrieve the value of each vertex + WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c) + SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]); + + WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c) + SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]); + + WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c) + SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]); + + -- calc the areas + SELECT + ST_area(g), + ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[2], vertex[3], point]))), + ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[1], vertex[3], point]))), + ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point,vertex[1],vertex[2], point]))) INTO sg, sa, sb, sc; + + output := (coalesce(sa,0) * coalesce(va,0) + coalesce(sb,0) * coalesce(vb,0) + coalesce(sc,0) * coalesce(vc,0)) / coalesce(sg,1); + RETURN output; +END; +$$ +language plpgsql IMMUTABLE PARALLEL SAFE; +-- Function by Stuart Lynn for a simple interpolation of a value +-- from a polygon table over an arbitrary polygon +-- (weighted by the area proportion overlapped) +-- Aereal weighting is a very simple form of aereal interpolation. +-- +-- Parameters: +-- * geom a Polygon geometry which defines the area where a value will be +-- estimated as the area-weighted sum of a given table/column +-- * target_table_name table name of the table that provides the values +-- * target_column column name of the column that provides the values +-- * schema_name optional parameter to defina the schema the target table +-- belongs to, which is necessary if its not in the search_path. +-- Note that target_table_name should never include the schema in it. +-- Return value: +-- Aereal-weighted interpolation of the column values over the geometry +CREATE OR REPLACE +FUNCTION cdb_overlap_sum(geom geometry, target_table_name text, target_column text, schema_name text DEFAULT NULL) + RETURNS numeric AS +$$ +DECLARE + result numeric; + qualified_name text; +BEGIN + IF schema_name IS NULL THEN + qualified_name := Format('%I', target_table_name); + ELSE + qualified_name := Format('%I.%s', schema_name, target_table_name); + END IF; + EXECUTE Format(' + SELECT sum(%I*ST_Area(St_Intersection($1, a.the_geom))/ST_Area(a.the_geom)) + FROM %s AS a + WHERE $1 && a.the_geom + ', target_column, qualified_name) + USING geom + INTO result; + RETURN result; +END; +$$ LANGUAGE plpgsql STABLE PARALLEL SAFE; +CREATE OR REPLACE FUNCTION +CDB_GWR(subquery text, dep_var text, ind_vars text[], + bw numeric default null, fixed boolean default False, + kernel text default 'bisquare', geom_col text default 'the_geom', + id_col text default 'cartodb_id') +RETURNS table(coeffs JSON, stand_errs JSON, t_vals JSON, + filtered_t_vals JSON, predicted numeric, + residuals numeric, r_squared numeric, bandwidth numeric, + rowid bigint) +AS $$ + +from crankshaft.regression import GWR + +gwr = GWR() + +return gwr.gwr(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col) + +$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; + + +CREATE OR REPLACE FUNCTION +CDB_GWR_Predict(subquery text, dep_var text, ind_vars text[], + bw numeric default null, fixed boolean default False, + kernel text default 'bisquare', + geom_col text default 'the_geom', + id_col text default 'cartodb_id') +RETURNS table(coeffs JSON, stand_errs JSON, t_vals JSON, + r_squared numeric, predicted numeric, rowid bigint) +AS $$ + +from crankshaft.regression import GWR +gwr = GWR() + +return gwr.gwr_predict(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col) + +$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE; +-- +-- Creates N points randomly distributed arround the polygon +-- +-- @param g - the geometry to be turned in to points +-- +-- @param no_points - the number of points to generate +-- +-- @params max_iter_per_point - the function generates points in the polygon's bounding box +-- and discards points which don't lie in the polygon. max_iter_per_point specifies how many +-- misses per point the funciton accepts before giving up. +-- +-- Returns: Multipoint with the requested points +CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry , no_points Integer, max_iter_per_point Integer DEFAULT 1000) +RETURNS GEOMETRY AS $$ +DECLARE + extent GEOMETRY; + test_point Geometry; + width NUMERIC; + height NUMERIC; + x0 NUMERIC; + y0 NUMERIC; + xp NUMERIC; + yp NUMERIC; + no_left INTEGER; + remaining_iterations INTEGER; + points GEOMETRY[]; + bbox_line GEOMETRY; + intersection_line GEOMETRY; +BEGIN + extent := ST_Envelope(geom); + width := ST_XMax(extent) - ST_XMIN(extent); + height := ST_YMax(extent) - ST_YMIN(extent); + x0 := ST_XMin(extent); + y0 := ST_YMin(extent); + no_left := no_points; + + LOOP + if(no_left=0) THEN + EXIT; + END IF; + yp = y0 + height*random(); + bbox_line = ST_MakeLine( + ST_SetSRID(ST_MakePoint(yp, x0),4326), + ST_SetSRID(ST_MakePoint(yp, x0+width),4326) + ); + intersection_line = ST_Intersection(bbox_line,geom); + test_point = ST_LineInterpolatePoint(st_makeline(st_linemerge(intersection_line)),random()); + points := points || test_point; + no_left = no_left - 1 ; + END LOOP; + RETURN ST_Collect(points); +END; +$$ +LANGUAGE plpgsql VOLATILE PARALLEL RESTRICTED; +-- Make sure by default there are no permissions for publicuser +-- NOTE: this happens at extension creation time, as part of an implicit transaction. +-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE; + +-- Grant permissions on the schema to publicuser (but just the schema) +GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser; + +-- Revoke execute permissions on all functions in the schema by default +-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser; +-- +-- Fill given extent with a rectangular coverage +-- +-- @param ext Extent to fill. Only rectangles with center point falling +-- inside the extent (or at the lower or leftmost edge) will +-- be emitted. The returned hexagons will have the same SRID +-- as this extent. +-- +-- @param width With of each rectangle +-- +-- @param height Height of each rectangle +-- +-- @param origin Optional origin to allow for exact tiling. +-- If omitted the origin will be 0,0. +-- The parameter is checked for having the same SRID +-- as the extent. +-- +-- +CREATE OR REPLACE FUNCTION CDB_RectangleGrid(ext GEOMETRY, width FLOAT8, height FLOAT8, origin GEOMETRY DEFAULT NULL) +RETURNS SETOF GEOMETRY +AS $$ +DECLARE + h GEOMETRY; -- rectangle cell + hstep FLOAT8; -- horizontal step + vstep FLOAT8; -- vertical step + hw FLOAT8; -- half width + hh FLOAT8; -- half height + vstart FLOAT8; + hstart FLOAT8; + hend FLOAT8; + vend FLOAT8; + xoff FLOAT8; + yoff FLOAT8; + xgrd FLOAT8; + ygrd FLOAT8; + x FLOAT8; + y FLOAT8; + srid INTEGER; +BEGIN + + srid := ST_SRID(ext); + + xoff := 0; + yoff := 0; + + IF origin IS NOT NULL THEN + IF ST_SRID(origin) != srid THEN + RAISE EXCEPTION 'SRID mismatch between extent (%) and origin (%)', srid, ST_SRID(origin); + END IF; + xoff := ST_X(origin); + yoff := ST_Y(origin); + END IF; + + --RAISE DEBUG 'X offset: %', xoff; + --RAISE DEBUG 'Y offset: %', yoff; + + hw := width/2.0; + hh := height/2.0; + + xgrd := hw; + ygrd := hh; + --RAISE DEBUG 'X grid size: %', xgrd; + --RAISE DEBUG 'Y grid size: %', ygrd; + + hstep := width; + vstep := height; + + -- Tweak horizontal start on hstep grid from origin + hstart := xoff + ceil((ST_XMin(ext)-xoff)/hstep)*hstep; + --RAISE DEBUG 'hstart: %', hstart; + + -- Tweak vertical start on vstep grid from origin + vstart := yoff + ceil((ST_Ymin(ext)-yoff)/vstep)*vstep; + --RAISE DEBUG 'vstart: %', vstart; + + hend := ST_XMax(ext); + vend := ST_YMax(ext); + + --RAISE DEBUG 'hend: %', hend; + --RAISE DEBUG 'vend: %', vend; + + x := hstart; + WHILE x < hend LOOP -- over X + y := vstart; + h := ST_MakeEnvelope(x-hw, y-hh, x+hw, y+hh, srid); + WHILE y < vend LOOP -- over Y + RETURN NEXT h; + h := ST_Translate(h, 0, vstep); + y := yoff + round(((y + vstep)-yoff)/ygrd)*ygrd; -- round to grid + END LOOP; + x := xoff + round(((x + hstep)-xoff)/xgrd)*xgrd; -- round to grid + END LOOP; + + RETURN; +END +$$ LANGUAGE 'plpgsql' IMMUTABLE PARALLEL SAFE; + +-- +-- Calculate the equal interval bins for a given column +-- +-- @param in_array A numeric array of numbers to determine the best +-- to determine the bin boundary +-- +-- @param breaks The number of bins you want to find. +-- +-- +-- Returns: upper edges of bins +-- +-- + +CREATE OR REPLACE FUNCTION CDB_EqualIntervalBins ( in_array NUMERIC[], breaks INT ) RETURNS NUMERIC[] as $$ +DECLARE + diff numeric; + min_val numeric; + max_val numeric; + tmp_val numeric; + i INT := 1; + reply numeric[]; +BEGIN + SELECT min(e), max(e) INTO min_val, max_val FROM ( SELECT unnest(in_array) e ) x WHERE e IS NOT NULL; + diff = (max_val - min_val) / breaks::numeric; + LOOP + IF i < breaks THEN + tmp_val = min_val + i::numeric * diff; + reply = array_append(reply, tmp_val); + i := i+1; + ELSE + reply = array_append(reply, max_val); + EXIT; + END IF; + END LOOP; + RETURN reply; +END; +$$ language plpgsql IMMUTABLE PARALLEL SAFE; + +-- +-- Determine the Heads/Tails classifications from a numeric array +-- +-- @param in_array A numeric array of numbers to determine the best +-- bins based on the Heads/Tails method. +-- +-- @param breaks The number of bins you want to find. +-- +-- + +CREATE OR REPLACE FUNCTION CDB_HeadsTailsBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$ +DECLARE + element_count INT4; + arr_mean numeric; + i INT := 2; + reply numeric[]; +BEGIN + -- get the total size of our row + element_count := array_upper(in_array, 1) - array_lower(in_array, 1); + -- ensure the ordering of in_array + SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x; + -- stop if no rows + IF element_count IS NULL THEN + RETURN NULL; + END IF; + -- stop if our breaks are more than our input array size + IF element_count < breaks THEN + RETURN in_array; + END IF; + + -- get our mean value + SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x; + + reply = Array[arr_mean]; + -- slice our bread + LOOP + IF i > breaks THEN EXIT; END IF; + SELECT avg(e) INTO arr_mean FROM ( SELECT unnest(in_array) e) x WHERE e > reply[i-1]; + IF arr_mean IS NOT NULL THEN + reply = array_append(reply, arr_mean); + END IF; + i := i+1; + END LOOP; + RETURN reply; +END; +$$ language plpgsql IMMUTABLE PARALLEL SAFE; + +-- +-- Determine the Jenks classifications from a numeric array +-- +-- @param in_array A numeric array of numbers to determine the best +-- bins based on the Jenks method. +-- +-- @param breaks The number of bins you want to find. +-- +-- @param iterations The number of different starting positions to test. +-- +-- @param invert Optional wheter to return the top of each bin (default) +-- or the bottom. BOOLEAN, default=FALSE. +-- +-- + + +CREATE OR REPLACE FUNCTION CDB_JenksBins ( in_array NUMERIC[], breaks INT, iterations INT DEFAULT 5, invert BOOLEAN DEFAULT FALSE) RETURNS NUMERIC[] as $$ +DECLARE + element_count INT4; + arr_mean NUMERIC; + bot INT; + top INT; + tops INT[]; + classes INT[][]; + i INT := 1; j INT := 1; + curr_result NUMERIC[]; + best_result NUMERIC[]; + seedtarget TEXT; + quant NUMERIC[]; + shuffles INT; +BEGIN + -- get the total size of our row + element_count := array_length(in_array, 1); --array_upper(in_array, 1) - array_lower(in_array, 1); + -- ensure the ordering of in_array + SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x; + -- stop if no rows + IF element_count IS NULL THEN + RETURN NULL; + END IF; + -- stop if our breaks are more than our input array size + IF element_count < breaks THEN + RETURN in_array; + END IF; + + shuffles := LEAST(GREATEST(floor(2500000.0/(element_count::float*iterations::float)), 1), 750)::int; + -- get our mean value + SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x; + + -- assume best is actually Quantile + SELECT cdb_crankshaft.CDB_QuantileBins(in_array, breaks) INTO quant; + + -- if data is very very large, just return quant and be done + IF element_count > 5000000 THEN + RETURN quant; + END IF; + + -- change quant into bottom, top markers + LOOP + IF i = 1 THEN + bot = 1; + ELSE + -- use last top to find this bot + bot = top+1; + END IF; + IF i = breaks THEN + top = element_count; + ELSE + SELECT count(*) INTO top FROM ( SELECT unnest(in_array) as v) x WHERE v <= quant[i]; + END IF; + IF i = 1 THEN + classes = ARRAY[ARRAY[bot,top]]; + ELSE + classes = ARRAY_CAT(classes,ARRAY[bot,top]); + END IF; + IF i > breaks THEN EXIT; END IF; + i = i+1; + END LOOP; + + best_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles); + + --set the seed so we can ensure the same results + SELECT setseed(0.4567) INTO seedtarget; + --loop through random starting positions + LOOP + IF j > iterations-1 THEN EXIT; END IF; + i = 1; + tops = ARRAY[element_count]; + LOOP + IF i = breaks THEN EXIT; END IF; + SELECT array_agg(distinct e) INTO tops FROM (SELECT unnest(array_cat(tops, ARRAY[floor(random()*element_count::float)::int])) as e ORDER BY e) x WHERE e != 1; + i = array_length(tops, 1); + END LOOP; + i = 1; + LOOP + IF i > breaks THEN EXIT; END IF; + IF i = 1 THEN + bot = 1; + ELSE + bot = top+1; + END IF; + top = tops[i]; + IF i = 1 THEN + classes = ARRAY[ARRAY[bot,top]]; + ELSE + classes = ARRAY_CAT(classes,ARRAY[bot,top]); + END IF; + i := i+1; + END LOOP; + curr_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles); + + IF curr_result[1] > best_result[1] THEN + best_result = curr_result; + j = j-1; -- if we found a better result, add one more search + END IF; + j = j+1; + END LOOP; + + RETURN (best_result)[2:array_upper(best_result, 1)]; +END; +$$ language plpgsql VOLATILE PARALLEL RESTRICTED; + + + +-- +-- Perform a single iteration of the Jenks classification +-- + +CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_array NUMERIC[], breaks INT, classes INT[][], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$ +DECLARE + tmp_val numeric; + new_classes int[][]; + tmp_class int[]; + i INT := 1; + j INT := 1; + side INT := 2; + sdam numeric; + gvf numeric := 0.0; + new_gvf numeric; + arr_gvf numeric[]; + class_avg numeric; + class_max_i INT; + class_min_i INT; + class_max numeric; + class_min numeric; + reply numeric[]; +BEGIN + + -- Calculate the sum of squared deviations from the array mean (SDAM). + SELECT sum((arr_mean - e)^2) INTO sdam FROM ( SELECT unnest(in_array) as e ) x; + --Identify the breaks for the lowest GVF + LOOP + i = 1; + LOOP + -- get our mean + SELECT avg(e) INTO class_avg FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e) x; + -- find the deviation + SELECT sum((class_avg-e)^2) INTO tmp_val FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e ) x; + IF i = 1 THEN + arr_gvf = ARRAY[tmp_val]; + -- init our min/max map for later + class_max = arr_gvf[i]; + class_min = arr_gvf[i]; + class_min_i = 1; + class_max_i = 1; + ELSE + arr_gvf = array_append(arr_gvf, tmp_val); + END IF; + i := i+1; + IF i > breaks THEN EXIT; END IF; + END LOOP; + -- calculate our new GVF + SELECT sdam-sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) as e ) x; + -- if no improvement was made, exit + IF new_gvf < gvf THEN EXIT; END IF; + gvf = new_gvf; + IF j > max_search THEN EXIT; END IF; + j = j+1; + i = 1; + LOOP + --establish directionality (uppward through classes or downward) + IF arr_gvf[i] < class_min THEN + class_min = arr_gvf[i]; + class_min_i = i; + END IF; + IF arr_gvf[i] > class_max THEN + class_max = arr_gvf[i]; + class_max_i = i; + END IF; + i := i+1; + IF i > breaks THEN EXIT; END IF; + END LOOP; + IF class_max_i > class_min_i THEN + class_min_i = class_max_i - 1; + ELSE + class_min_i = class_max_i + 1; + END IF; + --Move from higher class to a lower gid order + IF class_max_i > class_min_i THEN + classes[class_max_i][1] = classes[class_max_i][1] + 1; + classes[class_min_i][2] = classes[class_min_i][2] + 1; + ELSE -- Move from lower class UP into a higher class by gid + classes[class_max_i][2] = classes[class_max_i][2] - 1; + classes[class_min_i][1] = classes[class_min_i][1] - 1; + END IF; + END LOOP; + + i = 1; + LOOP + IF invert = TRUE THEN + side = 1; --default returns bottom side of breaks, invert returns top side + END IF; + reply = array_append(reply, in_array[classes[i][side]]); + i = i+1; + IF i > breaks THEN EXIT; END IF; + END LOOP; + + RETURN array_prepend(gvf, reply); + +END; +$$ language plpgsql IMMUTABLE PARALLEL SAFE; + + +-- +-- Determine the Quantile classifications from a numeric array +-- +-- @param in_array A numeric array of numbers to determine the best +-- bins based on the Quantile method. +-- +-- @param breaks The number of bins you want to find. +-- +-- +CREATE OR REPLACE FUNCTION CDB_QuantileBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$ +DECLARE + element_count INT4; + break_size numeric; + tmp_val numeric; + i INT := 1; + reply numeric[]; +BEGIN + -- sort our values + SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e ASC) x; + -- get the total size of our data + element_count := array_length(in_array, 1); + break_size := element_count::numeric / breaks; + -- slice our bread + LOOP + IF i < breaks THEN + IF break_size * i % 1 > 0 THEN + SELECT e INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 1 OFFSET ceil(break_size * i) - 1) x; + ELSE + SELECT avg(e) INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 2 OFFSET ceil(break_size * i) - 1 ) x; + END IF; + ELSIF i = breaks THEN + -- select the last value + SELECT max(e) INTO tmp_val FROM ( SELECT unnest(in_array) e ) x; + ELSE + EXIT; + END IF; + + reply = array_append(reply, tmp_val); + i := i+1; + END LOOP; + RETURN reply; +END; +$$ language plpgsql IMMUTABLE STRICT PARALLEL SAFE;