You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
crankshaft/release/crankshaft--0.8.2--0.9.0.sql

2340 lines
70 KiB

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES
-- Complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
SELECT '0.9.0'::text;
$$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE;
-- Internal identifier of the installed extension instence
-- e.g. 'dev' for current development version
CREATE OR REPLACE FUNCTION _cdb_crankshaft_internal_version()
RETURNS text AS $$
SELECT installed_version FROM pg_available_extensions where name='crankshaft' and pg_available_extensions IS NOT NULL;
$$ language 'sql' STABLE STRICT PARALLEL SAFE;
-- Internal function.
-- Set the seeds of the RNGs (Random Number Generators)
-- used internally.
CREATE OR REPLACE FUNCTION
_cdb_random_seeds (seed_value INTEGER) RETURNS VOID
AS $$
from crankshaft import random_seeds
random_seeds.set_random_seeds(seed_value)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_PyAggS(current_state Numeric[], current_row Numeric[])
returns NUMERIC[] as $$
BEGIN
if array_upper(current_state,1) is null then
RAISE NOTICE 'setting state %',array_upper(current_row,1);
current_state[1] = array_upper(current_row,1);
end if;
return array_cat(current_state,current_row) ;
END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist
DO $$ BEGIN
CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
SFUNC = CDB_PyAggS,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{}"
);
EXCEPTION
WHEN duplicate_function THEN NULL;
END $$;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
target NUMERIC[],
features NUMERIC[],
target_features NUMERIC[],
target_ids NUMERIC[],
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
learning_rate DOUBLE PRECISION DEFAULT 0.01,
min_samples_leaf INTEGER DEFAULT 1)
RETURNS TABLE(cartodb_id NUMERIC, prediction NUMERIC, accuracy NUMERIC)
AS $$
import numpy as np
import plpy
from crankshaft.segmentation import Segmentation
seg = Segmentation()
model_params = {'n_estimators': n_estimators,
'max_depth': max_depth,
'subsample': subsample,
'learning_rate': learning_rate,
'min_samples_leaf': min_samples_leaf}
def unpack2D(data):
dimension = data.pop(0)
a = np.array(data, dtype=np.float64)
return a.reshape(int(len(a)/dimension), int(dimension))
return seg.create_and_predict_segment_agg(
np.array(target, dtype=np.float64),
unpack2D(features),
unpack2D(target_features),
target_ids,
model_params)
$$ LANGUAGE plpython3u VOLATILE PARALLEL RESTRICTED;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
query TEXT,
variable_name TEXT,
target_table TEXT,
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
learning_rate DOUBLE PRECISION DEFAULT 0.01,
min_samples_leaf INTEGER DEFAULT 1)
RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC)
AS $$
from crankshaft.segmentation import Segmentation
seg = Segmentation()
model_params = {
'n_estimators': n_estimators,
'max_depth': max_depth,
'subsample': subsample,
'learning_rate': learning_rate,
'min_samples_leaf': min_samples_leaf
}
feature_cols = set(plpy.execute('''
select * from ({query}) as _w limit 0
'''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ])
return seg.create_and_predict_segment(
query,
variable_name,
feature_cols,
target_table,
model_params
)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment(
query TEXT,
variable TEXT,
feature_columns TEXT[],
target_query TEXT,
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
learning_rate DOUBLE PRECISION DEFAULT 0.01,
min_samples_leaf INTEGER DEFAULT 1)
RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC)
AS $$
from crankshaft.segmentation import Segmentation
seg = Segmentation()
model_params = {
'n_estimators': n_estimators,
'max_depth': max_depth,
'subsample': subsample,
'learning_rate': learning_rate,
'min_samples_leaf': min_samples_leaf
}
return seg.create_and_predict_segment(
query,
variable,
feature_columns,
target_query,
model_params
)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_Gravity(
IN target_query text,
IN weight_column text,
IN source_query text,
IN pop_column text,
IN target bigint,
IN radius integer,
IN minval numeric DEFAULT -10e307
)
RETURNS TABLE(
the_geom geometry,
source_id bigint,
target_id bigint,
dist numeric,
h numeric,
hpop numeric) AS $$
DECLARE
t_id bigint[];
t_geom geometry[];
t_weight numeric[];
s_id bigint[];
s_geom geometry[];
s_pop numeric[];
BEGIN
EXECUTE 'WITH foo as('+target_query+') SELECT array_agg(cartodb_id), array_agg(the_geom), array_agg(' || weight_column || ') FROM foo' INTO t_id, t_geom, t_weight;
EXECUTE 'WITH foo as('+source_query+') SELECT array_agg(cartodb_id), array_agg(the_geom), array_agg(' || pop_column || ') FROM foo' INTO s_id, s_geom, s_pop;
RETURN QUERY
SELECT g.* FROM t, s, CDB_Gravity(t_id, t_geom, t_weight, s_id, s_geom, s_pop, target, radius, minval) g;
END;
$$ language plpgsql VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_Gravity(
IN t_id bigint[],
IN t_geom geometry[],
IN t_weight numeric[],
IN s_id bigint[],
IN s_geom geometry[],
IN s_pop numeric[],
IN target bigint,
IN radius integer,
IN minval numeric DEFAULT -10e307
)
RETURNS TABLE(
the_geom geometry,
source_id bigint,
target_id bigint,
dist numeric,
h numeric,
hpop numeric) AS $$
DECLARE
t_type text;
s_type text;
t_center geometry[];
s_center geometry[];
BEGIN
t_type := GeometryType(t_geom[1]);
s_type := GeometryType(s_geom[1]);
IF t_type = 'POINT' THEN
t_center := t_geom;
ELSE
WITH tmp as (SELECT unnest(t_geom) as g) SELECT array_agg(ST_Centroid(g)) INTO t_center FROM tmp;
END IF;
IF s_type = 'POINT' THEN
s_center := s_geom;
ELSE
WITH tmp as (SELECT unnest(s_geom) as g) SELECT array_agg(ST_Centroid(g)) INTO s_center FROM tmp;
END IF;
RETURN QUERY
with target0 as(
SELECT unnest(t_center) as tc, unnest(t_weight) as tw, unnest(t_id) as td
),
source0 as(
SELECT unnest(s_center) as sc, unnest(s_id) as sd, unnest (s_geom) as sg, unnest(s_pop) as sp
),
prev0 as(
SELECT
source0.sg,
source0.sd as sourc_id,
coalesce(source0.sp,0) as sp,
target.td as targ_id,
coalesce(target.tw,0) as tw,
GREATEST(1.0,ST_Distance(geography(target.tc), geography(source0.sc)))::numeric as distance
FROM source0
CROSS JOIN LATERAL
(
SELECT
*
FROM target0
WHERE tw > minval
AND ST_DWithin(geography(source0.sc), geography(tc), radius)
) AS target
),
deno as(
SELECT
sourc_id,
sum(tw/distance) as h_deno
FROM
prev0
GROUP BY sourc_id
)
SELECT
p.sg as the_geom,
p.sourc_id as source_id,
p.targ_id as target_id,
case when p.distance > 1 then p.distance else 0.0 end as dist,
100*(p.tw/p.distance)/d.h_deno as h,
p.sp*(p.tw/p.distance)/d.h_deno as hpop
FROM
prev0 p,
deno d
WHERE
p.targ_id = target AND
p.sourc_id = d.sourc_id;
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;
-- 0: nearest neighbor(s)
-- 1: barymetric
-- 2: IDW
-- 3: krigin ---> TO DO
CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation(
IN query text,
IN point geometry,
IN method integer DEFAULT 1,
IN p1 numeric DEFAULT 0,
IN p2 numeric DEFAULT 0
)
RETURNS numeric AS
$$
DECLARE
gs geometry[];
vs numeric[];
output numeric;
BEGIN
EXECUTE 'WITH a AS('||query||') SELECT array_agg(the_geom), array_agg(attrib) FROM a' INTO gs, vs;
SELECT CDB_SpatialInterpolation(gs, vs, point, method, p1,p2) INTO output FROM a;
RETURN output;
END;
$$
language plpgsql VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation(
IN geomin geometry[],
IN colin numeric[],
IN point geometry,
IN method integer DEFAULT 1,
IN p1 numeric DEFAULT 0,
IN p2 numeric DEFAULT 0
)
RETURNS numeric AS
$$
DECLARE
gs geometry[];
vs numeric[];
gs2 geometry[];
vs2 numeric[];
g geometry;
vertex geometry[];
sg numeric;
sa numeric;
sb numeric;
sc numeric;
va numeric;
vb numeric;
vc numeric;
output numeric;
BEGIN
-- output := -999.999;
-- nearest neighbors
-- p1: limit the number of neighbors, 0-> closest one
IF method = 0 THEN
IF p1 = 0 THEN
p1 := 1;
END IF;
WITH a as (SELECT unnest(geomin) as g, unnest(colin) as v),
b as (SELECT a.v as v FROM a ORDER BY point<->a.g LIMIT p1::integer)
SELECT avg(b.v) INTO output FROM b;
RETURN output;
-- barymetric
ELSIF method = 1 THEN
WITH a as (SELECT unnest(geomin) AS e),
b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
c as (SELECT (ST_Dump(t)).geom as v FROM b),
d as (SELECT v FROM c WHERE ST_Within(point, v))
SELECT v INTO g FROM d;
IF g is null THEN
-- out of the realm of the input data
RETURN -888.888;
END IF;
-- vertex of the selected cell
WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
SELECT array_agg(v) INTO vertex FROM a;
-- retrieve the value of each vertex
WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
SELECT ST_area(g), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[2], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[1], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point,vertex[1],vertex[2], point]))) INTO sg, sa, sb, sc;
output := (coalesce(sa,0) * coalesce(va,0) + coalesce(sb,0) * coalesce(vb,0) + coalesce(sc,0) * coalesce(vc,0)) / coalesce(sg);
RETURN output;
-- IDW
-- p1: limit the number of neighbors, 0->no limit
-- p2: order of distance decay, 0-> order 1
ELSIF method = 2 THEN
IF p2 = 0 THEN
p2 := 1;
END IF;
WITH a as (SELECT unnest(geomin) as g, unnest(colin) as v),
b as (SELECT a.g, a.v FROM a ORDER BY point<->a.g)
SELECT array_agg(b.g), array_agg(b.v) INTO gs, vs FROM b;
IF p1::integer>0 THEN
gs2:=gs;
vs2:=vs;
FOR i IN 1..p1
LOOP
gs2 := gs2 || gs[i];
vs2 := vs2 || vs[i];
END LOOP;
ELSE
gs2:=gs;
vs2:=vs;
END IF;
WITH a as (SELECT unnest(gs2) as g, unnest(vs2) as v),
b as (
SELECT
(1/ST_distance(point, a.g)^p2::integer) as k,
(a.v/ST_distance(point, a.g)^p2::integer) as f
FROM a
)
SELECT sum(b.f)/sum(b.k) INTO output FROM b;
RETURN output;
-- krigin
ELSIF method = 3 THEN
-- TO DO
END IF;
RETURN -777.777;
END;
$$
language plpgsql IMMUTABLE PARALLEL SAFE;
-- =============================================================================================
--
-- CDB_Voronoi
--
-- =============================================================================================
CREATE OR REPLACE FUNCTION CDB_voronoi(
IN geomin geometry[],
IN buffer numeric DEFAULT 0.5,
IN tolerance numeric DEFAULT 1e-9
)
RETURNS geometry AS $$
DECLARE
geomout geometry;
BEGIN
-- we need to make the geometry calculations in (pseudo)meters!!!
with a as (
SELECT unnest(geomin) as g1
),
b as(
SELECT st_transform(g1, 3857) g2 from a
)
SELECT array_agg(g2) INTO geomin from b;
WITH
convexhull_1 as (
SELECT
ST_ConvexHull(ST_Collect(geomin)) as g,
buffer * |/ (st_area(ST_ConvexHull(ST_Collect(geomin)))/PI()) as r
),
clipper as(
SELECT
st_buffer(ST_MinimumBoundingCircle(a.g), buffer*a.r) as g
FROM convexhull_1 a
),
env0 as (
SELECT
(st_dumppoints(st_expand(a.g, buffer*a.r))).geom as e
FROM convexhull_1 a
),
env as (
SELECT
array_agg(env0.e) as e
FROM env0
),
sample AS (
SELECT
ST_Collect(geomin || env.e) as geom
FROM env
),
convexhull as (
SELECT
ST_ConvexHull(ST_Collect(geomin)) as cg
),
tin as (
SELECT
ST_Dump(ST_DelaunayTriangles(geom, tolerance, 0)) as gd
FROM
sample
),
tin_polygons as (
SELECT
(gd).Path as id,
(gd).Geom as pg,
ST_Centroid(ST_MinimumBoundingCircle((gd).Geom, 180)) as ct
FROM tin
),
tin_lines as (
SELECT
id,
ST_ExteriorRing(pg) as lg
FROM tin_polygons
),
tin_nodes as (
SELECT
id,
ST_PointN(lg,1) p1,
ST_PointN(lg,2) p2,
ST_PointN(lg,3) p3
FROM tin_lines
),
tin_edges AS (
SELECT
p.id,
UNNEST(ARRAY[
ST_MakeLine(n.p1,n.p2) ,
ST_MakeLine(n.p2,n.p3) ,
ST_MakeLine(n.p3,n.p1)]) as Edge,
ST_Force2D(cdb_crankshaft._Find_Circle(n.p1,n.p2,n.p3)) as ct,
CASE WHEN st_distance(p.ct, ST_ExteriorRing(p.pg)) < tolerance THEN
TRUE
ELSE FALSE END AS ctx,
p.pg,
ST_within(p.ct, convexhull.cg) as ctin
FROM
tin_polygons p,
tin_nodes n,
convexhull
WHERE p.id = n.id
),
voro_nodes as (
SELECT
CASE WHEN x.ctx = TRUE THEN
ST_Centroid(x.edge)
ELSE
x.ct
END as xct,
CASE WHEN y.id is null THEN
CASE WHEN x.ctin = TRUE THEN
ST_SetSRID(ST_MakePoint(
ST_X(x.ct) + ((ST_X(ST_Centroid(x.edge)) - ST_X(x.ct)) * (1+buffer)),
ST_Y(x.ct) + ((ST_Y(ST_Centroid(x.edge)) - ST_Y(x.ct)) * (1+buffer))
), ST_SRID(x.ct))
END
ELSE
y.ct
END as yct
FROM
tin_edges x
LEFT OUTER JOIN
tin_edges y
ON x.id <> y.id AND ST_Equals(x.edge, y.edge)
),
voro_edges as(
SELECT
ST_LineMerge(ST_Collect(ST_MakeLine(xct, yct))) as v
FROM
voro_nodes
),
voro_cells as(
SELECT
ST_Polygonize(
ST_Node(
ST_LineMerge(
ST_Union(v, ST_ExteriorRing(
ST_Convexhull(v)
)
)
)
)
) as g
FROM
voro_edges
),
voro_set as(
SELECT
(st_dump(v.g)).geom as g
FROM voro_cells v
),
clipped_voro as(
SELECT
ST_intersection(c.g, v.g) as g
FROM
voro_set v,
clipper c
WHERE
ST_GeometryType(v.g) = 'ST_Polygon'
)
SELECT
st_collect(
ST_Transform(
ST_ConvexHull(g),
4326
)
)
INTO geomout
FROM
clipped_voro;
RETURN geomout;
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;
/** ----------------------------------------------------------------------------------------
* @function : FindCircle
* @precis : Function that determines if three points form a circle. If so a table containing
* centre and radius is returned. If not, a null table is returned.
* @version : 1.0.1
* @param : p_pt1 : First point in curve
* @param : p_pt2 : Second point in curve
* @param : p_pt3 : Third point in curve
* @return : geometry : In which X,Y ordinates are the centre X, Y and the Z being the radius of found circle
* or NULL if three points do not form a circle.
* @history : Simon Greener - Feb 2012 - Original coding.
* Rafa de la Torre - Aug 2016 - Small fix for type checking
* Raul Marin - Sept 2017 - Remove unnecessary NULL checks and set function categories
* @copyright : Simon Greener @ 2012
* Licensed under a Creative Commons Attribution-Share Alike 2.5 Australia License. (http://creativecommons.org/licenses/by-sa/2.5/au/)
**/
CREATE OR REPLACE FUNCTION _Find_Circle(
IN p_pt1 geometry,
IN p_pt2 geometry,
IN p_pt3 geometry)
RETURNS geometry AS
$BODY$
DECLARE
v_Centre geometry;
v_radius NUMERIC;
v_CX NUMERIC;
v_CY NUMERIC;
v_dA NUMERIC;
v_dB NUMERIC;
v_dC NUMERIC;
v_dD NUMERIC;
v_dE NUMERIC;
v_dF NUMERIC;
v_dG NUMERIC;
BEGIN
IF ( ST_GeometryType(p_pt1) <> 'ST_Point' OR
ST_GeometryType(p_pt2) <> 'ST_Point' OR
ST_GeometryType(p_pt3) <> 'ST_Point' ) THEN
RAISE EXCEPTION 'All supplied geometries must be points.';
RETURN NULL;
END IF;
v_dA := ST_X(p_pt2) - ST_X(p_pt1);
v_dB := ST_Y(p_pt2) - ST_Y(p_pt1);
v_dC := ST_X(p_pt3) - ST_X(p_pt1);
v_dD := ST_Y(p_pt3) - ST_Y(p_pt1);
v_dE := v_dA * (ST_X(p_pt1) + ST_X(p_pt2)) + v_dB * (ST_Y(p_pt1) + ST_Y(p_pt2));
v_dF := v_dC * (ST_X(p_pt1) + ST_X(p_pt3)) + v_dD * (ST_Y(p_pt1) + ST_Y(p_pt3));
v_dG := 2.0 * (v_dA * (ST_Y(p_pt3) - ST_Y(p_pt2)) - v_dB * (ST_X(p_pt3) - ST_X(p_pt2)));
-- If v_dG is zero then the three points are collinear and no finite-radius
-- circle through them exists.
IF ( v_dG = 0 ) THEN
RETURN NULL;
ELSE
v_CX := (v_dD * v_dE - v_dB * v_dF) / v_dG;
v_CY := (v_dA * v_dF - v_dC * v_dE) / v_dG;
v_Radius := SQRT(POWER(ST_X(p_pt1) - v_CX,2) + POWER(ST_Y(p_pt1) - v_CY,2) );
END IF;
RETURN ST_SetSRID(ST_MakePoint(v_CX, v_CY, v_radius),ST_Srid(p_pt1));
END;
$BODY$
LANGUAGE plpgsql IMMUTABLE STRICT PARALLEL SAFE;
-- Moran's I Global Measure (public-facing)
CREATE OR REPLACE FUNCTION
CDB_AreasOfInterestGlobal(
subquery TEXT,
column_name TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, significance NUMERIC)
AS $$
from crankshaft.clustering import Moran
# TODO: use named parameters or a dictionary
moran = Moran()
return moran.global_stat(subquery, column_name, w_type,
num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
-- Moran's I Local (internal function) - DEPRECATED
CREATE OR REPLACE FUNCTION
_CDB_AreasOfInterestLocal(
subquery TEXT,
column_name TEXT,
w_type TEXT,
num_ngbrs INT,
permutations INT,
geom_col TEXT,
id_col TEXT)
RETURNS TABLE (
moran NUMERIC,
quads TEXT,
significance NUMERIC,
rowid INT,
vals NUMERIC)
AS $$
from crankshaft.clustering import Moran
moran = Moran()
result = moran.local_stat(subquery, column_name, w_type,
num_ngbrs, permutations, geom_col, id_col)
# remove spatial lag
return [(r[6], r[0], r[1], r[7], r[5]) for r in result]
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
-- Moran's I Local (internal function)
CREATE OR REPLACE FUNCTION
_CDB_MoransILocal(
subquery TEXT,
column_name TEXT,
w_type TEXT,
num_ngbrs INT,
permutations INT,
geom_col TEXT,
id_col TEXT)
RETURNS TABLE (
quads TEXT,
significance NUMERIC,
spatial_lag NUMERIC,
spatial_lag_std NUMERIC,
orig_val NUMERIC,
orig_val_std NUMERIC,
moran_stat NUMERIC,
rowid INT)
AS $$
from crankshaft.clustering import Moran
moran = Moran()
return moran.local_stat(subquery, column_name, w_type,
num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
-- Moran's I Local (public-facing function)
-- Replaces CDB_AreasOfInterestLocal
CREATE OR REPLACE FUNCTION
CDB_MoransILocal(
subquery TEXT,
column_name TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (
quads TEXT,
significance NUMERIC,
spatial_lag NUMERIC,
spatial_lag_std NUMERIC,
orig_val NUMERIC,
orig_val_std NUMERIC,
moran_stat NUMERIC,
rowid INT)
AS $$
SELECT
quads, significance, spatial_lag, spatial_lag_std,
orig_val, orig_val_std, moran_stat, rowid
FROM cdb_crankshaft._CDB_MoransILocal(
subquery, column_name, w_type,
num_ngbrs, permutations, geom_col, id_col);
$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
-- Moran's I Local (public-facing function) - DEPRECATED
CREATE OR REPLACE FUNCTION
CDB_AreasOfInterestLocal(
subquery TEXT,
column_name TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col);
$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
-- Moran's I only for HH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialHotspots(
subquery TEXT,
column_name TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('HH', 'HL');
$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
-- Moran's I only for LL and LH (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialColdspots(
subquery TEXT,
attr TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('LL', 'LH');
$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
-- Moran's I only for LH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialOutliers(
subquery TEXT,
attr TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('HL', 'LH');
$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
-- Moran's I Global Rate (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_AreasOfInterestGlobalRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran FLOAT, significance FLOAT)
AS $$
from crankshaft.clustering import Moran
moran = Moran()
# TODO: use named parameters or a dictionary
return moran.global_rate_stat(subquery, numerator, denominator, w_type,
num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
-- Moran's I Local Rate (internal function) - DEPRECATED
CREATE OR REPLACE FUNCTION
_CDB_AreasOfInterestLocalRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT,
num_ngbrs INT,
permutations INT,
geom_col TEXT,
id_col TEXT)
RETURNS
TABLE(
moran NUMERIC,
quads TEXT,
significance NUMERIC,
rowid INT,
vals NUMERIC)
AS $$
from crankshaft.clustering import Moran
moran = Moran()
# TODO: use named parameters or a dictionary
result = moran.local_rate_stat(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
# remove spatial lag
return [(r[6], r[0], r[1], r[7], r[4]) for r in result]
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
-- Moran's I Local Rate (public-facing function) - DEPRECATED
CREATE OR REPLACE FUNCTION
CDB_AreasOfInterestLocalRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col);
$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
-- Internal function
CREATE OR REPLACE FUNCTION
_CDB_MoransILocalRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT,
num_ngbrs INT,
permutations INT,
geom_col TEXT,
id_col TEXT)
RETURNS
TABLE(
quads TEXT,
significance NUMERIC,
spatial_lag NUMERIC,
spatial_lag_std NUMERIC,
orig_val NUMERIC,
orig_val_std NUMERIC,
moran_stat NUMERIC,
rowid INT)
AS $$
from crankshaft.clustering import Moran
moran = Moran()
return moran.local_rate_stat(
subquery,
numerator,
denominator,
w_type,
num_ngbrs,
permutations,
geom_col,
id_col
)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
-- Moran's I Rate
-- Replaces CDB_AreasOfInterestLocalRate
CREATE OR REPLACE FUNCTION
CDB_MoransILocalRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(
quads TEXT,
significance NUMERIC,
spatial_lag NUMERIC,
spatial_lag_std NUMERIC,
orig_val NUMERIC,
orig_val_std NUMERIC,
moran_stat NUMERIC,
rowid INT)
AS $$
SELECT
quads, significance, spatial_lag, spatial_lag_std,
orig_val, orig_val_std, moran_stat, rowid
FROM cdb_crankshaft._CDB_MoransILocalRate(
subquery, numerator, denominator, w_type,
num_ngbrs, permutations, geom_col, id_col);
$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
-- Moran's I Local Rate only for HH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialHotspotsRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('HH', 'HL');
$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
-- Moran's I Local Rate only for LL and LH (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialColdspotsRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('LL', 'LH');
$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
-- Moran's I Local Rate only for LH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
CDB_GetSpatialOutliersRate(
subquery TEXT,
numerator TEXT,
denominator TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$
SELECT moran, quads, significance, rowid, vals
FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
WHERE quads IN ('HL', 'LH');
$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
-- Spatial k-means clustering
CREATE OR REPLACE FUNCTION CDB_KMeans(
query TEXT,
no_clusters INTEGER,
no_init INTEGER DEFAULT 20
)
RETURNS TABLE(
cartodb_id INTEGER,
cluster_no INTEGER
) AS $$
from crankshaft.clustering import Kmeans
kmeans = Kmeans()
return kmeans.spatial(query, no_clusters, no_init)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
-- Non-spatial k-means clustering
-- query: sql query to retrieve all the needed data
-- colnames: text array of column names for doing the clustering analysis
-- no_clusters: number of requested clusters
-- standardize: whether to scale variables to a mean of zero and a standard
-- deviation of 1
-- id_colname: name of the id column
CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial(
query TEXT,
colnames TEXT[],
no_clusters INTEGER,
standardize BOOLEAN DEFAULT true,
id_col TEXT DEFAULT 'cartodb_id'
)
RETURNS TABLE(
cluster_label text,
cluster_center json,
silhouettes numeric,
inertia numeric,
rowid bigint
) AS $$
from crankshaft.clustering import Kmeans
kmeans = Kmeans()
return kmeans.nonspatial(query, colnames, no_clusters,
standardize=standardize,
id_col=id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(
state NUMERIC[],
the_geom GEOMETRY(Point, 4326),
weight NUMERIC
)
RETURNS Numeric[] AS $$
DECLARE
newX NUMERIC;
newY NUMERIC;
newW NUMERIC;
BEGIN
IF weight IS NULL OR the_geom IS NULL THEN
newX = state[1];
newY = state[2];
newW = state[3];
ELSE
newX = state[1] + ST_X(the_geom)*weight;
newY = state[2] + ST_Y(the_geom)*weight;
newW = state[3] + weight;
END IF;
RETURN Array[newX,newY,newW];
END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
CREATE OR REPLACE FUNCTION CDB_WeightedMeanF(state NUMERIC[])
RETURNS GEOMETRY AS
$$
BEGIN
IF state[3] = 0 THEN
RETURN ST_SetSRID(ST_MakePoint(state[1],state[2]), 4326);
ELSE
RETURN ST_SETSRID(ST_MakePoint(state[1]/state[3], state[2]/state[3]),4326);
END IF;
END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Create aggregate if it did not exist
DO $$ BEGIN
CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
SFUNC = CDB_WeightedMeanS,
FINALFUNC = CDB_WeightedMeanF,
STYPE = Numeric[],
PARALLEL = SAFE,
INITCOND = "{0.0,0.0,0.0}"
);
EXCEPTION
WHEN duplicate_function THEN NULL;
END $$;
-- Spatial Markov
-- input table format:
-- id | geom | date_1 | date_2 | date_3
-- 1 | Pt1 | 12.3 | 13.1 | 14.2
-- 2 | Pt2 | 11.0 | 13.2 | 12.5
-- ...
-- Sample Function call:
-- SELECT CDB_SpatialMarkov('SELECT * FROM real_estate',
-- Array['date_1', 'date_2', 'date_3'])
CREATE OR REPLACE FUNCTION
CDB_SpatialMarkovTrend (
subquery TEXT,
time_cols TEXT[],
num_classes INT DEFAULT 7,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 99,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (trend NUMERIC, trend_up NUMERIC, trend_down NUMERIC, volatility NUMERIC, rowid INT)
AS $$
from crankshaft.space_time_dynamics import Markov
markov = Markov()
## TODO: use named parameters or a dictionary
return markov.spatial_trend(subquery, time_cols, num_classes, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
-- input table format: identical to above but in a predictable format
-- Sample function call:
-- SELECT cdb_spatial_markov('SELECT * FROM real_estate',
-- 'date_1')
-- CREATE OR REPLACE FUNCTION
-- cdb_spatial_markov (
-- subquery TEXT,
-- time_col_min text,
-- time_col_max text,
-- date_format text, -- '_YYYY_MM_DD'
-- num_time_per_bin INT DEFAULT 1,
-- permutations INT DEFAULT 99,
-- geom_column TEXT DEFAULT 'the_geom',
-- id_col TEXT DEFAULT 'cartodb_id',
-- w_type TEXT DEFAULT 'knn',
-- num_ngbrs int DEFAULT 5)
-- RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT)
-- AS $$
-- plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
-- from crankshaft.clustering import moran_local
-- # TODO: use named parameters or a dictionary
-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs)
-- $$ LANGUAGE plpython3u;
--
-- -- input table format:
-- -- id | geom | date | measurement
-- -- 1 | Pt1 | 12/3 | 13.2
-- -- 2 | Pt2 | 11/5 | 11.3
-- -- 3 | Pt1 | 11/13 | 12.9
-- -- 4 | Pt3 | 12/19 | 10.1
-- -- ...
--
-- CREATE OR REPLACE FUNCTION
-- cdb_spatial_markov (
-- subquery TEXT,
-- time_col text,
-- num_time_per_bin INT DEFAULT 1,
-- permutations INT DEFAULT 99,
-- geom_column TEXT DEFAULT 'the_geom',
-- id_col TEXT DEFAULT 'cartodb_id',
-- w_type TEXT DEFAULT 'knn',
-- num_ngbrs int DEFAULT 5)
-- RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT)
-- AS $$
-- plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
-- from crankshaft.clustering import moran_local
-- # TODO: use named parameters or a dictionary
-- return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs)
-- $$ LANGUAGE plpython3u;
-- Based on:
-- https://github.com/mapbox/polylabel/blob/master/index.js
-- https://sites.google.com/site/polesofinaccessibility/
-- Requires: https://github.com/CartoDB/cartodb-postgresql
-- Based on:
-- https://github.com/mapbox/polylabel/blob/master/index.js
-- https://sites.google.com/site/polesofinaccessibility/
-- Requires: https://github.com/CartoDB/cartodb-postgresql
CREATE OR REPLACE FUNCTION CDB_PIA(
IN polygon geometry,
IN tolerance numeric DEFAULT 1.0
)
RETURNS geometry AS $$
DECLARE
env geometry[];
cells geometry[];
cell geometry;
best_c geometry;
best_d numeric;
test_d numeric;
test_mx numeric;
test_h numeric;
test_cells geometry[];
width numeric;
height numeric;
h numeric;
i integer;
n integer;
sqr numeric;
p geometry;
BEGIN
sqr := 0.5*(|/2.0);
polygon := ST_Transform(polygon, 3857);
-- grid #0 cell size
height := ST_YMax(polygon) - ST_YMin(polygon);
width := ST_XMax(polygon) - ST_XMin(polygon);
h := 0.5*LEAST(height, width);
-- grid #0
with c1 as(
SELECT cdb_crankshaft.CDB_RectangleGrid(polygon, h, h) as c
)
SELECT array_agg(c) INTO cells FROM c1;
-- 1st guess: centroid
best_c := polygon;
best_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(Polygon));
-- looping the loop
n := array_length(cells,1);
i := 1;
LOOP
EXIT WHEN i > n;
cell := cells[i];
i := i+1;
-- cell side size, it's square
test_h := ST_XMax(cell) - ST_XMin(cell) ;
-- check distance
test_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(cell));
IF test_d > best_d THEN
best_d := test_d;
best_c := cell;
END IF;
-- longest distance within the cell
test_mx := test_d + (test_h * sqr);
-- if the cell has no chance to contains the desired point, continue
CONTINUE WHEN test_mx - best_d <= tolerance;
-- resample the cell
with c1 as(
SELECT cdb_crankshaft.CDB_RectangleGrid(cell, test_h/2, test_h/2) as c
)
SELECT array_agg(c) INTO test_cells FROM c1;
-- concat the new cells to the former array
cells := cells || test_cells;
-- prepare next iteration
n := array_length(cells,1);
END LOOP;
RETURN ST_transform(ST_Centroid(best_c), 4326);
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;
-- signed distance point to polygon with holes
-- negative is the point is out the polygon
-- rev 1. adding MULTIPOLYGON and GEOMETRYCOLLECTION support by @abelvm
CREATE OR REPLACE FUNCTION _Signed_Dist(
IN polygon geometry,
IN point geometry
)
RETURNS numeric AS $$
DECLARE
pols geometry[];
pol geometry;
i integer;
j integer;
within integer;
w integer;
holes integer;
dist numeric;
d numeric;
BEGIN
dist := 1e999;
WITH collection as (SELECT (ST_dump(polygon)).geom as geom) SELECT array_agg(geom) into pols FROM collection;
FOR j in 1..array_length(pols, 1)
LOOP
pol := pols[j];
d := dist;
SELECT LEAST(dist, ST_distance(point, ST_ExteriorRing(pol))::numeric) INTO d;
SELECT CASE WHEN ST_Within(point,pol) THEN 1 ELSE -1 END INTO w;
SELECT ST_NumInteriorRings(pol) INTO holes;
IF holes > 0 THEN
FOR i IN 1..holes
LOOP
SELECT LEAST(d, ST_distance(point, ST_InteriorRingN(pol, i))::numeric) INTO d;
END LOOP;
END IF;
IF d < dist THEN
dist:= d;
within := w;
END IF;
END LOOP;
dist := dist * within::numeric;
RETURN dist;
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;
--
-- Iterative densification of a set of points using Delaunay triangulation
-- the new points have as assigned value the average value of the 3 vertex (centroid)
--
-- @param geomin - array of geometries (points)
--
-- @param colin - array of numeric values in that points
--
-- @param iterations - integer, number of iterations
--
--
-- Returns: TABLE(geomout geometry, colout numeric)
--
--
CREATE OR REPLACE FUNCTION CDB_Densify(
IN geomin geometry[],
IN colin numeric[],
IN iterations integer
)
RETURNS TABLE(geomout geometry, colout numeric) AS $$
DECLARE
geotemp geometry[];
coltemp numeric[];
i integer;
gs geometry[];
g geometry;
vertex geometry[];
va numeric;
vb numeric;
vc numeric;
center geometry;
centerval numeric;
tmp integer;
BEGIN
geotemp := geomin;
coltemp := colin;
FOR i IN 1..iterations
LOOP
-- generate TIN
WITH a as (SELECT unnest(geotemp) AS e),
b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
c as (SELECT (ST_Dump(t)).geom AS v FROM b)
SELECT array_agg(v) INTO gs FROM c;
-- loop cells
FOREACH g IN ARRAY gs
LOOP
-- append centroid
SELECT ST_Centroid(g) INTO center;
geotemp := array_append(geotemp, center);
-- retrieve the value of each vertex
WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
SELECT array_agg(v) INTO vertex FROM a;
WITH a AS(SELECT unnest(geotemp) as geo, unnest(coltemp) as c)
SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
WITH a AS(SELECT unnest(geotemp) as geo, unnest(coltemp) as c)
SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
WITH a AS(SELECT unnest(geotemp) as geo, unnest(coltemp) as c)
SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
-- calc the value at the center
centerval := (va + vb + vc) / 3;
-- append the value
coltemp := array_append(coltemp, centerval);
END LOOP;
END LOOP;
RETURN QUERY SELECT unnest(geotemp ) as geomout, unnest(coltemp ) as colout;
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;
CREATE OR REPLACE FUNCTION CDB_TINmap(
IN geomin geometry[],
IN colin numeric[],
IN iterations integer
)
RETURNS TABLE(geomout geometry, colout numeric) AS $$
DECLARE
p geometry[];
vals numeric[];
gs geometry[];
g geometry;
vertex geometry[];
centerval numeric;
va numeric;
vb numeric;
vc numeric;
coltemp numeric[];
BEGIN
SELECT array_agg(dens.geomout), array_agg(dens.colout) INTO p, vals FROM cdb_crankshaft.CDB_Densify(geomin, colin, iterations) dens;
WITH a as (SELECT unnest(p) AS e),
b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
c as (SELECT (ST_Dump(t)).geom AS v FROM b)
SELECT array_agg(v) INTO gs FROM c;
FOREACH g IN ARRAY gs
LOOP
-- retrieve the vertex of each triangle
WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
SELECT array_agg(v) INTO vertex FROM a;
-- retrieve the value of each vertex
WITH a AS(SELECT unnest(p) as geo, unnest(vals) as c)
SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
WITH a AS(SELECT unnest(p) as geo, unnest(vals) as c)
SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
WITH a AS(SELECT unnest(p) as geo, unnest(vals) as c)
SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
-- calc the value at the center
centerval := (va + vb + vc) / 3;
-- append the value
coltemp := array_append(coltemp, centerval);
END LOOP;
RETURN QUERY SELECT unnest(gs) as geomout, unnest(coltemp ) as colout;
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;
-- Getis-Ord's G
-- Hotspot/Coldspot Analysis tool
CREATE OR REPLACE FUNCTION
CDB_GetisOrdsG(
subquery TEXT,
column_name TEXT,
w_type TEXT DEFAULT 'knn',
num_ngbrs INT DEFAULT 5,
permutations INT DEFAULT 999,
geom_col TEXT DEFAULT 'the_geom',
id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (z_score NUMERIC, p_value NUMERIC, p_z_sim NUMERIC, rowid BIGINT)
AS $$
from crankshaft.clustering import Getis
getis = Getis()
return getis.getis_ord(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
-- TODO: make a version that accepts the values as arrays
-- Find outliers using a static threshold
--
CREATE OR REPLACE FUNCTION CDB_StaticOutlier(column_value numeric, threshold numeric)
RETURNS boolean
AS $$
BEGIN
RETURN column_value > threshold;
END;
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE ;
-- Find outliers by a percentage above the threshold
-- TODO: add symmetric option? `is_symmetric boolean DEFAULT false`
CREATE OR REPLACE FUNCTION CDB_PercentOutlier(column_values numeric[], outlier_fraction numeric, ids int[])
RETURNS TABLE(is_outlier boolean, rowid int)
AS $$
DECLARE
avg_val numeric;
out_vals boolean[];
BEGIN
SELECT avg(i) INTO avg_val
FROM unnest(column_values) As x(i);
IF avg_val = 0 THEN
RAISE EXCEPTION 'Mean value is zero. Try another outlier method.';
END IF;
SELECT array_agg(
outlier_fraction < i / avg_val) INTO out_vals
FROM unnest(column_values) As x(i);
RETURN QUERY
SELECT unnest(out_vals) As is_outlier,
unnest(ids) As rowid;
END;
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
-- Find outliers above a given number of standard deviations from the mean
CREATE OR REPLACE FUNCTION CDB_StdDevOutlier(column_values numeric[], num_deviations numeric, ids int[], is_symmetric boolean DEFAULT true)
RETURNS TABLE(is_outlier boolean, rowid int)
AS $$
DECLARE
stddev_val numeric;
avg_val numeric;
out_vals boolean[];
BEGIN
SELECT stddev(i), avg(i) INTO stddev_val, avg_val
FROM unnest(column_values) As x(i);
IF stddev_val = 0 THEN
RAISE EXCEPTION 'Standard deviation of input data is zero';
END IF;
IF is_symmetric THEN
SELECT array_agg(
abs(i - avg_val) / stddev_val > num_deviations) INTO out_vals
FROM unnest(column_values) As x(i);
ELSE
SELECT array_agg(
(i - avg_val) / stddev_val > num_deviations) INTO out_vals
FROM unnest(column_values) As x(i);
END IF;
RETURN QUERY
SELECT unnest(out_vals) As is_outlier,
unnest(ids) As rowid;
END;
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
CREATE OR REPLACE FUNCTION CDB_Contour(
IN geomin geometry[],
IN colin numeric[],
IN buffer numeric,
IN intmethod integer,
IN classmethod integer,
IN steps integer,
IN max_time integer DEFAULT 60000
)
RETURNS TABLE(
the_geom geometry,
bin integer,
min_value numeric,
max_value numeric,
avg_value numeric
) AS $$
DECLARE
cell_count integer;
tin geometry[];
resolution integer;
BEGIN
-- nasty trick to override issue #121
IF max_time = 0 THEN
max_time = -90;
END IF;
resolution := max_time;
max_time := -1 * resolution;
-- calc the optimal number of cells for the current dataset
SELECT
CASE intmethod
WHEN 0 THEN round(3.7745903782 * max_time - 9.4399210051 * array_length(geomin,1) - 1350.8778213073)
WHEN 1 THEN round(2.2855592156 * max_time - 87.285217133 * array_length(geomin,1) + 17255.7085601797)
WHEN 2 THEN round(0.9799471999 * max_time - 127.0334085369 * array_length(geomin,1) + 22707.9579721218)
ELSE 10000
END INTO cell_count;
-- we don't have iterative barycentric interpolation in CDB_interpolation,
-- and it's a costy function, so let's make a custom one here till
-- we update the code
-- tin := ARRAY[]::geometry[];
IF intmethod=1 THEN
WITH
a as (SELECT unnest(geomin) AS e),
b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
c as (SELECT (ST_Dump(t)).geom as v FROM b)
SELECT array_agg(v) INTO tin FROM c;
END IF;
-- Delaunay stuff performed just ONCE!!
-- magic
RETURN QUERY
WITH
convexhull as (
SELECT
ST_ConvexHull(ST_Collect(geomin)) as g,
buffer * |/ st_area(ST_ConvexHull(ST_Collect(geomin)))/PI() as r
),
envelope as (
SELECT
st_expand(a.g, a.r) as e
FROM convexhull a
),
envelope3857 as(
SELECT
ST_Transform(e, 3857) as geom
FROM envelope
),
resolution as(
SELECT
CASE WHEN resolution <= 0 THEN
round(|/ (
ST_area(geom) / abs(cell_count)
))
ELSE
resolution
END AS cell
FROM envelope3857
),
grid as(
SELECT
ST_Transform(cdb_crankshaft.CDB_RectangleGrid(e.geom, r.cell, r.cell), 4326) as geom
FROM envelope3857 e, resolution r
),
interp as(
SELECT
geom,
CASE
WHEN intmethod=1 THEN cdb_crankshaft._interp_in_tin(geomin, colin, tin, ST_Centroid(geom))
ELSE cdb_crankshaft.CDB_SpatialInterpolation(geomin, colin, ST_Centroid(geom), intmethod)
END as val
FROM grid
),
classes as(
SELECT CASE
WHEN classmethod = 0 THEN
cdb_crankshaft.CDB_EqualIntervalBins(array_agg(val), steps)
WHEN classmethod = 1 THEN
cdb_crankshaft.CDB_HeadsTailsBins(array_agg(val), steps)
WHEN classmethod = 2 THEN
cdb_crankshaft.CDB_JenksBins(array_agg(val), steps)
ELSE
cdb_crankshaft.CDB_QuantileBins(array_agg(val), steps)
END as b
FROM interp
where val is not null
),
classified as(
SELECT
i.*,
width_bucket(i.val, c.b) as bucket
FROM interp i left join classes c
ON 1=1
),
classified2 as(
SELECT
geom,
val,
CASE
WHEN bucket = steps THEN bucket - 1
ELSE bucket
END as b
FROM classified
),
final as(
SELECT
st_union(geom) as the_geom,
b as bin,
min(val) as min_value,
max(val) as max_value,
avg(val) as avg_value
FROM classified2
GROUP BY bin
)
SELECT
*
FROM final
where final.bin is not null
;
END;
$$ language plpgsql VOLATILE PARALLEL RESTRICTED;
-- =====================================================================
-- Interp in grid, so we can use barycentric with a precalculated tin (NNI)
-- =====================================================================
CREATE OR REPLACE FUNCTION _interp_in_tin(
IN geomin geometry[],
IN colin numeric[],
IN tin geometry[],
IN point geometry
)
RETURNS numeric AS
$$
DECLARE
g geometry;
vertex geometry[];
sg numeric;
sa numeric;
sb numeric;
sc numeric;
va numeric;
vb numeric;
vc numeric;
output numeric;
BEGIN
-- get the cell the point is within
WITH
a as (SELECT unnest(tin) as v),
b as (SELECT v FROM a WHERE ST_Within(point, v))
SELECT v INTO g FROM b;
-- if we're out of the data realm,
-- return null
IF g is null THEN
RETURN null;
END IF;
-- vertex of the selected cell
WITH a AS (
SELECT (ST_DumpPoints(g)).geom AS v
)
SELECT array_agg(v) INTO vertex FROM a;
-- retrieve the value of each vertex
WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
-- calc the areas
SELECT
ST_area(g),
ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[2], vertex[3], point]))),
ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[1], vertex[3], point]))),
ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point,vertex[1],vertex[2], point]))) INTO sg, sa, sb, sc;
output := (coalesce(sa,0) * coalesce(va,0) + coalesce(sb,0) * coalesce(vb,0) + coalesce(sc,0) * coalesce(vc,0)) / coalesce(sg,1);
RETURN output;
END;
$$
language plpgsql IMMUTABLE PARALLEL SAFE;
-- Function by Stuart Lynn for a simple interpolation of a value
-- from a polygon table over an arbitrary polygon
-- (weighted by the area proportion overlapped)
-- Aereal weighting is a very simple form of aereal interpolation.
--
-- Parameters:
-- * geom a Polygon geometry which defines the area where a value will be
-- estimated as the area-weighted sum of a given table/column
-- * target_table_name table name of the table that provides the values
-- * target_column column name of the column that provides the values
-- * schema_name optional parameter to defina the schema the target table
-- belongs to, which is necessary if its not in the search_path.
-- Note that target_table_name should never include the schema in it.
-- Return value:
-- Aereal-weighted interpolation of the column values over the geometry
CREATE OR REPLACE
FUNCTION cdb_overlap_sum(geom geometry, target_table_name text, target_column text, schema_name text DEFAULT NULL)
RETURNS numeric AS
$$
DECLARE
result numeric;
qualified_name text;
BEGIN
IF schema_name IS NULL THEN
qualified_name := Format('%I', target_table_name);
ELSE
qualified_name := Format('%I.%s', schema_name, target_table_name);
END IF;
EXECUTE Format('
SELECT sum(%I*ST_Area(St_Intersection($1, a.the_geom))/ST_Area(a.the_geom))
FROM %s AS a
WHERE $1 && a.the_geom
', target_column, qualified_name)
USING geom
INTO result;
RETURN result;
END;
$$ LANGUAGE plpgsql STABLE PARALLEL SAFE;
CREATE OR REPLACE FUNCTION
CDB_GWR(subquery text, dep_var text, ind_vars text[],
bw numeric default null, fixed boolean default False,
kernel text default 'bisquare', geom_col text default 'the_geom',
id_col text default 'cartodb_id')
RETURNS table(coeffs JSON, stand_errs JSON, t_vals JSON,
filtered_t_vals JSON, predicted numeric,
residuals numeric, r_squared numeric, bandwidth numeric,
rowid bigint)
AS $$
from crankshaft.regression import GWR
gwr = GWR()
return gwr.gwr(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
CDB_GWR_Predict(subquery text, dep_var text, ind_vars text[],
bw numeric default null, fixed boolean default False,
kernel text default 'bisquare',
geom_col text default 'the_geom',
id_col text default 'cartodb_id')
RETURNS table(coeffs JSON, stand_errs JSON, t_vals JSON,
r_squared numeric, predicted numeric, rowid bigint)
AS $$
from crankshaft.regression import GWR
gwr = GWR()
return gwr.gwr_predict(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
--
-- Creates N points randomly distributed arround the polygon
--
-- @param g - the geometry to be turned in to points
--
-- @param no_points - the number of points to generate
--
-- @params max_iter_per_point - the function generates points in the polygon's bounding box
-- and discards points which don't lie in the polygon. max_iter_per_point specifies how many
-- misses per point the funciton accepts before giving up.
--
-- Returns: Multipoint with the requested points
CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry , no_points Integer, max_iter_per_point Integer DEFAULT 1000)
RETURNS GEOMETRY AS $$
DECLARE
extent GEOMETRY;
test_point Geometry;
width NUMERIC;
height NUMERIC;
x0 NUMERIC;
y0 NUMERIC;
xp NUMERIC;
yp NUMERIC;
no_left INTEGER;
remaining_iterations INTEGER;
points GEOMETRY[];
bbox_line GEOMETRY;
intersection_line GEOMETRY;
BEGIN
extent := ST_Envelope(geom);
width := ST_XMax(extent) - ST_XMIN(extent);
height := ST_YMax(extent) - ST_YMIN(extent);
x0 := ST_XMin(extent);
y0 := ST_YMin(extent);
no_left := no_points;
LOOP
if(no_left=0) THEN
EXIT;
END IF;
yp = y0 + height*random();
bbox_line = ST_MakeLine(
ST_SetSRID(ST_MakePoint(yp, x0),4326),
ST_SetSRID(ST_MakePoint(yp, x0+width),4326)
);
intersection_line = ST_Intersection(bbox_line,geom);
test_point = ST_LineInterpolatePoint(st_makeline(st_linemerge(intersection_line)),random());
points := points || test_point;
no_left = no_left - 1 ;
END LOOP;
RETURN ST_Collect(points);
END;
$$
LANGUAGE plpgsql VOLATILE PARALLEL RESTRICTED;
-- Make sure by default there are no permissions for publicuser
-- NOTE: this happens at extension creation time, as part of an implicit transaction.
-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
-- Grant permissions on the schema to publicuser (but just the schema)
GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser;
-- Revoke execute permissions on all functions in the schema by default
-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser;
--
-- Fill given extent with a rectangular coverage
--
-- @param ext Extent to fill. Only rectangles with center point falling
-- inside the extent (or at the lower or leftmost edge) will
-- be emitted. The returned hexagons will have the same SRID
-- as this extent.
--
-- @param width With of each rectangle
--
-- @param height Height of each rectangle
--
-- @param origin Optional origin to allow for exact tiling.
-- If omitted the origin will be 0,0.
-- The parameter is checked for having the same SRID
-- as the extent.
--
--
CREATE OR REPLACE FUNCTION CDB_RectangleGrid(ext GEOMETRY, width FLOAT8, height FLOAT8, origin GEOMETRY DEFAULT NULL)
RETURNS SETOF GEOMETRY
AS $$
DECLARE
h GEOMETRY; -- rectangle cell
hstep FLOAT8; -- horizontal step
vstep FLOAT8; -- vertical step
hw FLOAT8; -- half width
hh FLOAT8; -- half height
vstart FLOAT8;
hstart FLOAT8;
hend FLOAT8;
vend FLOAT8;
xoff FLOAT8;
yoff FLOAT8;
xgrd FLOAT8;
ygrd FLOAT8;
x FLOAT8;
y FLOAT8;
srid INTEGER;
BEGIN
srid := ST_SRID(ext);
xoff := 0;
yoff := 0;
IF origin IS NOT NULL THEN
IF ST_SRID(origin) != srid THEN
RAISE EXCEPTION 'SRID mismatch between extent (%) and origin (%)', srid, ST_SRID(origin);
END IF;
xoff := ST_X(origin);
yoff := ST_Y(origin);
END IF;
--RAISE DEBUG 'X offset: %', xoff;
--RAISE DEBUG 'Y offset: %', yoff;
hw := width/2.0;
hh := height/2.0;
xgrd := hw;
ygrd := hh;
--RAISE DEBUG 'X grid size: %', xgrd;
--RAISE DEBUG 'Y grid size: %', ygrd;
hstep := width;
vstep := height;
-- Tweak horizontal start on hstep grid from origin
hstart := xoff + ceil((ST_XMin(ext)-xoff)/hstep)*hstep;
--RAISE DEBUG 'hstart: %', hstart;
-- Tweak vertical start on vstep grid from origin
vstart := yoff + ceil((ST_Ymin(ext)-yoff)/vstep)*vstep;
--RAISE DEBUG 'vstart: %', vstart;
hend := ST_XMax(ext);
vend := ST_YMax(ext);
--RAISE DEBUG 'hend: %', hend;
--RAISE DEBUG 'vend: %', vend;
x := hstart;
WHILE x < hend LOOP -- over X
y := vstart;
h := ST_MakeEnvelope(x-hw, y-hh, x+hw, y+hh, srid);
WHILE y < vend LOOP -- over Y
RETURN NEXT h;
h := ST_Translate(h, 0, vstep);
y := yoff + round(((y + vstep)-yoff)/ygrd)*ygrd; -- round to grid
END LOOP;
x := xoff + round(((x + hstep)-xoff)/xgrd)*xgrd; -- round to grid
END LOOP;
RETURN;
END
$$ LANGUAGE 'plpgsql' IMMUTABLE PARALLEL SAFE;
--
-- Calculate the equal interval bins for a given column
--
-- @param in_array A numeric array of numbers to determine the best
-- to determine the bin boundary
--
-- @param breaks The number of bins you want to find.
--
--
-- Returns: upper edges of bins
--
--
CREATE OR REPLACE FUNCTION CDB_EqualIntervalBins ( in_array NUMERIC[], breaks INT ) RETURNS NUMERIC[] as $$
DECLARE
diff numeric;
min_val numeric;
max_val numeric;
tmp_val numeric;
i INT := 1;
reply numeric[];
BEGIN
SELECT min(e), max(e) INTO min_val, max_val FROM ( SELECT unnest(in_array) e ) x WHERE e IS NOT NULL;
diff = (max_val - min_val) / breaks::numeric;
LOOP
IF i < breaks THEN
tmp_val = min_val + i::numeric * diff;
reply = array_append(reply, tmp_val);
i := i+1;
ELSE
reply = array_append(reply, max_val);
EXIT;
END IF;
END LOOP;
RETURN reply;
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;
--
-- Determine the Heads/Tails classifications from a numeric array
--
-- @param in_array A numeric array of numbers to determine the best
-- bins based on the Heads/Tails method.
--
-- @param breaks The number of bins you want to find.
--
--
CREATE OR REPLACE FUNCTION CDB_HeadsTailsBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$
DECLARE
element_count INT4;
arr_mean numeric;
i INT := 2;
reply numeric[];
BEGIN
-- get the total size of our row
element_count := array_upper(in_array, 1) - array_lower(in_array, 1);
-- ensure the ordering of in_array
SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x;
-- stop if no rows
IF element_count IS NULL THEN
RETURN NULL;
END IF;
-- stop if our breaks are more than our input array size
IF element_count < breaks THEN
RETURN in_array;
END IF;
-- get our mean value
SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x;
reply = Array[arr_mean];
-- slice our bread
LOOP
IF i > breaks THEN EXIT; END IF;
SELECT avg(e) INTO arr_mean FROM ( SELECT unnest(in_array) e) x WHERE e > reply[i-1];
IF arr_mean IS NOT NULL THEN
reply = array_append(reply, arr_mean);
END IF;
i := i+1;
END LOOP;
RETURN reply;
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;
--
-- Determine the Jenks classifications from a numeric array
--
-- @param in_array A numeric array of numbers to determine the best
-- bins based on the Jenks method.
--
-- @param breaks The number of bins you want to find.
--
-- @param iterations The number of different starting positions to test.
--
-- @param invert Optional wheter to return the top of each bin (default)
-- or the bottom. BOOLEAN, default=FALSE.
--
--
CREATE OR REPLACE FUNCTION CDB_JenksBins ( in_array NUMERIC[], breaks INT, iterations INT DEFAULT 5, invert BOOLEAN DEFAULT FALSE) RETURNS NUMERIC[] as $$
DECLARE
element_count INT4;
arr_mean NUMERIC;
bot INT;
top INT;
tops INT[];
classes INT[][];
i INT := 1; j INT := 1;
curr_result NUMERIC[];
best_result NUMERIC[];
seedtarget TEXT;
quant NUMERIC[];
shuffles INT;
BEGIN
-- get the total size of our row
element_count := array_length(in_array, 1); --array_upper(in_array, 1) - array_lower(in_array, 1);
-- ensure the ordering of in_array
SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x;
-- stop if no rows
IF element_count IS NULL THEN
RETURN NULL;
END IF;
-- stop if our breaks are more than our input array size
IF element_count < breaks THEN
RETURN in_array;
END IF;
shuffles := LEAST(GREATEST(floor(2500000.0/(element_count::float*iterations::float)), 1), 750)::int;
-- get our mean value
SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x;
-- assume best is actually Quantile
SELECT cdb_crankshaft.CDB_QuantileBins(in_array, breaks) INTO quant;
-- if data is very very large, just return quant and be done
IF element_count > 5000000 THEN
RETURN quant;
END IF;
-- change quant into bottom, top markers
LOOP
IF i = 1 THEN
bot = 1;
ELSE
-- use last top to find this bot
bot = top+1;
END IF;
IF i = breaks THEN
top = element_count;
ELSE
SELECT count(*) INTO top FROM ( SELECT unnest(in_array) as v) x WHERE v <= quant[i];
END IF;
IF i = 1 THEN
classes = ARRAY[ARRAY[bot,top]];
ELSE
classes = ARRAY_CAT(classes,ARRAY[bot,top]);
END IF;
IF i > breaks THEN EXIT; END IF;
i = i+1;
END LOOP;
best_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);
--set the seed so we can ensure the same results
SELECT setseed(0.4567) INTO seedtarget;
--loop through random starting positions
LOOP
IF j > iterations-1 THEN EXIT; END IF;
i = 1;
tops = ARRAY[element_count];
LOOP
IF i = breaks THEN EXIT; END IF;
SELECT array_agg(distinct e) INTO tops FROM (SELECT unnest(array_cat(tops, ARRAY[floor(random()*element_count::float)::int])) as e ORDER BY e) x WHERE e != 1;
i = array_length(tops, 1);
END LOOP;
i = 1;
LOOP
IF i > breaks THEN EXIT; END IF;
IF i = 1 THEN
bot = 1;
ELSE
bot = top+1;
END IF;
top = tops[i];
IF i = 1 THEN
classes = ARRAY[ARRAY[bot,top]];
ELSE
classes = ARRAY_CAT(classes,ARRAY[bot,top]);
END IF;
i := i+1;
END LOOP;
curr_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);
IF curr_result[1] > best_result[1] THEN
best_result = curr_result;
j = j-1; -- if we found a better result, add one more search
END IF;
j = j+1;
END LOOP;
RETURN (best_result)[2:array_upper(best_result, 1)];
END;
$$ language plpgsql VOLATILE PARALLEL RESTRICTED;
--
-- Perform a single iteration of the Jenks classification
--
CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_array NUMERIC[], breaks INT, classes INT[][], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$
DECLARE
tmp_val numeric;
new_classes int[][];
tmp_class int[];
i INT := 1;
j INT := 1;
side INT := 2;
sdam numeric;
gvf numeric := 0.0;
new_gvf numeric;
arr_gvf numeric[];
class_avg numeric;
class_max_i INT;
class_min_i INT;
class_max numeric;
class_min numeric;
reply numeric[];
BEGIN
-- Calculate the sum of squared deviations from the array mean (SDAM).
SELECT sum((arr_mean - e)^2) INTO sdam FROM ( SELECT unnest(in_array) as e ) x;
--Identify the breaks for the lowest GVF
LOOP
i = 1;
LOOP
-- get our mean
SELECT avg(e) INTO class_avg FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e) x;
-- find the deviation
SELECT sum((class_avg-e)^2) INTO tmp_val FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e ) x;
IF i = 1 THEN
arr_gvf = ARRAY[tmp_val];
-- init our min/max map for later
class_max = arr_gvf[i];
class_min = arr_gvf[i];
class_min_i = 1;
class_max_i = 1;
ELSE
arr_gvf = array_append(arr_gvf, tmp_val);
END IF;
i := i+1;
IF i > breaks THEN EXIT; END IF;
END LOOP;
-- calculate our new GVF
SELECT sdam-sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) as e ) x;
-- if no improvement was made, exit
IF new_gvf < gvf THEN EXIT; END IF;
gvf = new_gvf;
IF j > max_search THEN EXIT; END IF;
j = j+1;
i = 1;
LOOP
--establish directionality (uppward through classes or downward)
IF arr_gvf[i] < class_min THEN
class_min = arr_gvf[i];
class_min_i = i;
END IF;
IF arr_gvf[i] > class_max THEN
class_max = arr_gvf[i];
class_max_i = i;
END IF;
i := i+1;
IF i > breaks THEN EXIT; END IF;
END LOOP;
IF class_max_i > class_min_i THEN
class_min_i = class_max_i - 1;
ELSE
class_min_i = class_max_i + 1;
END IF;
--Move from higher class to a lower gid order
IF class_max_i > class_min_i THEN
classes[class_max_i][1] = classes[class_max_i][1] + 1;
classes[class_min_i][2] = classes[class_min_i][2] + 1;
ELSE -- Move from lower class UP into a higher class by gid
classes[class_max_i][2] = classes[class_max_i][2] - 1;
classes[class_min_i][1] = classes[class_min_i][1] - 1;
END IF;
END LOOP;
i = 1;
LOOP
IF invert = TRUE THEN
side = 1; --default returns bottom side of breaks, invert returns top side
END IF;
reply = array_append(reply, in_array[classes[i][side]]);
i = i+1;
IF i > breaks THEN EXIT; END IF;
END LOOP;
RETURN array_prepend(gvf, reply);
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;
--
-- Determine the Quantile classifications from a numeric array
--
-- @param in_array A numeric array of numbers to determine the best
-- bins based on the Quantile method.
--
-- @param breaks The number of bins you want to find.
--
--
CREATE OR REPLACE FUNCTION CDB_QuantileBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$
DECLARE
element_count INT4;
break_size numeric;
tmp_val numeric;
i INT := 1;
reply numeric[];
BEGIN
-- sort our values
SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e ASC) x;
-- get the total size of our data
element_count := array_length(in_array, 1);
break_size := element_count::numeric / breaks;
-- slice our bread
LOOP
IF i < breaks THEN
IF break_size * i % 1 > 0 THEN
SELECT e INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 1 OFFSET ceil(break_size * i) - 1) x;
ELSE
SELECT avg(e) INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 2 OFFSET ceil(break_size * i) - 1 ) x;
END IF;
ELSIF i = breaks THEN
-- select the last value
SELECT max(e) INTO tmp_val FROM ( SELECT unnest(in_array) e ) x;
ELSE
EXIT;
END IF;
reply = array_append(reply, tmp_val);
i := i+1;
END LOOP;
RETURN reply;
END;
$$ language plpgsql IMMUTABLE STRICT PARALLEL SAFE;