crankshaft/release/crankshaft--0.8.2--0.9.0.sql

--DO NOT MODIFY THIS FILE, IT IS GENERATED AUTOMATICALLY FROM SOURCES
-- Complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION crankshaft" to load this file. \quit
-- Version number of the extension release
CREATE OR REPLACE FUNCTION cdb_crankshaft_version()
RETURNS text AS $$
  SELECT '0.9.0'::text;
$$ language 'sql' IMMUTABLE STRICT PARALLEL SAFE;

-- Internal identifier of the installed extension instence
-- e.g. 'dev' for current development version
CREATE OR REPLACE FUNCTION _cdb_crankshaft_internal_version()
RETURNS text AS $$
  SELECT installed_version FROM pg_available_extensions where name='crankshaft' and pg_available_extensions IS NOT NULL;
$$ language 'sql' STABLE STRICT PARALLEL SAFE;
-- Internal function.
-- Set the seeds of the RNGs (Random Number Generators)
-- used internally.
CREATE OR REPLACE FUNCTION
_cdb_random_seeds (seed_value INTEGER) RETURNS VOID
AS $$
  from crankshaft import random_seeds
  random_seeds.set_random_seeds(seed_value)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION
    CDB_PyAggS(current_state Numeric[], current_row Numeric[]) 
    returns NUMERIC[] as $$
    BEGIN
        if array_upper(current_state,1) is null  then
            RAISE NOTICE 'setting state %',array_upper(current_row,1);
            current_state[1] = array_upper(current_row,1);
        end if;
        return array_cat(current_state,current_row) ;
    END
    $$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;

-- Create aggregate if it did not exist
DO $$ BEGIN
    CREATE AGGREGATE CDB_PyAgg(NUMERIC[]) (
        SFUNC = CDB_PyAggS,
        STYPE = Numeric[],
        PARALLEL = SAFE,
        INITCOND = "{}"
    );
EXCEPTION
    WHEN duplicate_function THEN NULL;
END $$;

CREATE OR REPLACE FUNCTION
  CDB_CreateAndPredictSegment(
    target NUMERIC[],
    features NUMERIC[],
    target_features NUMERIC[],
    target_ids NUMERIC[],
    n_estimators INTEGER DEFAULT 1200,
    max_depth INTEGER DEFAULT 3,
    subsample DOUBLE PRECISION DEFAULT 0.5,
    learning_rate DOUBLE PRECISION DEFAULT 0.01,
    min_samples_leaf INTEGER DEFAULT 1)
RETURNS TABLE(cartodb_id NUMERIC, prediction NUMERIC, accuracy NUMERIC)
AS $$
    import numpy as np
    import plpy

    from crankshaft.segmentation import Segmentation
    seg = Segmentation()
    model_params = {'n_estimators': n_estimators,
                    'max_depth': max_depth,
                    'subsample': subsample,
                    'learning_rate': learning_rate,
                    'min_samples_leaf': min_samples_leaf}

    def unpack2D(data):
        dimension = data.pop(0)
        a = np.array(data, dtype=np.float64)
        return a.reshape(int(len(a)/dimension), int(dimension))

    return seg.create_and_predict_segment_agg(
        np.array(target, dtype=np.float64),
        unpack2D(features),
        unpack2D(target_features),
        target_ids,
        model_params)

$$ LANGUAGE plpython3u VOLATILE PARALLEL RESTRICTED;

CREATE OR REPLACE FUNCTION
  CDB_CreateAndPredictSegment(
      query TEXT,
      variable_name TEXT,
      target_table TEXT,
      n_estimators INTEGER DEFAULT 1200,
      max_depth INTEGER DEFAULT 3,
      subsample DOUBLE PRECISION DEFAULT 0.5,
      learning_rate DOUBLE PRECISION DEFAULT 0.01,
      min_samples_leaf INTEGER DEFAULT 1)
RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC)
AS $$
    from crankshaft.segmentation import Segmentation
    seg = Segmentation()
    model_params = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'subsample': subsample,
        'learning_rate': learning_rate,
        'min_samples_leaf': min_samples_leaf
    }
    feature_cols = set(plpy.execute('''
        select * from ({query}) as _w limit 0
    '''.format(query=query)).colnames()) -  set([variable_name, 'cartodb_id', ])
    return seg.create_and_predict_segment(
        query,
        variable_name,
        feature_cols,
        target_table,
        model_params
    )
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;

CREATE OR REPLACE FUNCTION
  CDB_CreateAndPredictSegment(
      query TEXT,
      variable TEXT,
      feature_columns TEXT[],
      target_query TEXT,
      n_estimators INTEGER DEFAULT 1200,
      max_depth INTEGER DEFAULT 3,
      subsample DOUBLE PRECISION DEFAULT 0.5,
      learning_rate DOUBLE PRECISION DEFAULT 0.01,
      min_samples_leaf INTEGER DEFAULT 1)
RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC)
AS $$
    from crankshaft.segmentation import Segmentation
    seg = Segmentation()
    model_params = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'subsample': subsample,
        'learning_rate': learning_rate,
        'min_samples_leaf': min_samples_leaf
    }
    return seg.create_and_predict_segment(
        query,
        variable,
        feature_columns,
        target_query,
        model_params
    )
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
CREATE OR REPLACE FUNCTION CDB_Gravity(
    IN target_query text,
    IN weight_column text,
    IN source_query text,
    IN pop_column text,
    IN target bigint,
    IN radius integer,
    IN minval numeric DEFAULT -10e307
    )
RETURNS TABLE(
    the_geom geometry,
    source_id bigint,
    target_id bigint,
    dist numeric,
    h numeric,
    hpop numeric)  AS $$
DECLARE
    t_id bigint[];
    t_geom geometry[];
    t_weight numeric[];
    s_id bigint[];
    s_geom geometry[];
    s_pop numeric[];
BEGIN
    EXECUTE 'WITH foo as('+target_query+') SELECT array_agg(cartodb_id), array_agg(the_geom), array_agg(' || weight_column || ') FROM foo' INTO t_id, t_geom, t_weight;
    EXECUTE 'WITH foo as('+source_query+') SELECT array_agg(cartodb_id), array_agg(the_geom), array_agg(' || pop_column || ') FROM foo' INTO s_id, s_geom, s_pop;
    RETURN QUERY
    SELECT g.* FROM t, s, CDB_Gravity(t_id, t_geom, t_weight, s_id, s_geom, s_pop, target, radius, minval) g;
END;
$$ language plpgsql VOLATILE PARALLEL UNSAFE;

CREATE OR REPLACE FUNCTION CDB_Gravity(
    IN t_id bigint[],
    IN t_geom geometry[],
    IN t_weight numeric[],
    IN s_id bigint[],
    IN s_geom geometry[],
    IN s_pop numeric[],
    IN target bigint,
    IN radius integer,
    IN minval numeric DEFAULT -10e307
    )
RETURNS TABLE(
    the_geom geometry,
    source_id bigint,
    target_id bigint,
    dist numeric,
    h numeric,
    hpop numeric)  AS $$
DECLARE
    t_type text;
    s_type text;
    t_center geometry[];
    s_center geometry[];
BEGIN
    t_type := GeometryType(t_geom[1]);
    s_type := GeometryType(s_geom[1]);
    IF t_type = 'POINT' THEN
        t_center := t_geom;
    ELSE
        WITH tmp as (SELECT unnest(t_geom) as g) SELECT array_agg(ST_Centroid(g)) INTO t_center FROM tmp;
    END IF;
    IF s_type = 'POINT' THEN
        s_center := s_geom;
    ELSE
        WITH tmp as (SELECT unnest(s_geom) as g) SELECT array_agg(ST_Centroid(g)) INTO s_center FROM tmp;
    END IF;
    RETURN QUERY
        with target0 as(
            SELECT unnest(t_center) as tc, unnest(t_weight) as tw, unnest(t_id) as td
        ),
        source0 as(
            SELECT unnest(s_center) as sc, unnest(s_id) as sd, unnest (s_geom) as sg, unnest(s_pop) as sp
        ),
        prev0 as(
            SELECT
                source0.sg,
                source0.sd as sourc_id,
                coalesce(source0.sp,0) as sp,
                target.td as targ_id,
                coalesce(target.tw,0) as tw,
                GREATEST(1.0,ST_Distance(geography(target.tc), geography(source0.sc)))::numeric as distance
            FROM source0
            CROSS JOIN LATERAL
                (
                SELECT
                    *
                FROM target0
                    WHERE tw > minval
                    AND ST_DWithin(geography(source0.sc), geography(tc), radius)
                ) AS target
        ),
        deno as(
            SELECT
                sourc_id,
                sum(tw/distance) as h_deno
            FROM
                prev0
            GROUP BY sourc_id
        )
        SELECT
            p.sg as the_geom,
            p.sourc_id as source_id,
            p.targ_id as target_id,
            case when p.distance > 1 then p.distance else 0.0 end as dist,
            100*(p.tw/p.distance)/d.h_deno as h,
            p.sp*(p.tw/p.distance)/d.h_deno as hpop
        FROM
            prev0 p,
            deno d
        WHERE
            p.targ_id = target AND
            p.sourc_id = d.sourc_id;
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;
-- 0: nearest neighbor(s)
-- 1: barymetric
-- 2: IDW
-- 3: krigin ---> TO DO


CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation(
    IN query text,
    IN point geometry,
    IN method integer DEFAULT 1,
    IN p1 numeric DEFAULT 0,
    IN p2 numeric DEFAULT 0
    )
RETURNS numeric AS
$$
DECLARE
    gs geometry[];
    vs numeric[];
    output numeric;
BEGIN
    EXECUTE 'WITH a AS('||query||') SELECT array_agg(the_geom), array_agg(attrib) FROM a' INTO gs, vs;
    SELECT CDB_SpatialInterpolation(gs, vs, point, method, p1,p2) INTO output FROM a;

    RETURN output;
END;
$$
language plpgsql VOLATILE PARALLEL UNSAFE;

CREATE OR REPLACE FUNCTION CDB_SpatialInterpolation(
    IN geomin geometry[],
    IN colin numeric[],
    IN point geometry,
    IN method integer DEFAULT 1,
    IN p1 numeric DEFAULT 0,
    IN p2 numeric DEFAULT 0
    )
RETURNS numeric AS
$$
DECLARE
    gs geometry[];
    vs numeric[];
    gs2 geometry[];
    vs2 numeric[];
    g geometry;
    vertex geometry[];
    sg numeric;
    sa numeric;
    sb numeric;
    sc numeric;
    va numeric;
    vb numeric;
    vc numeric;
    output numeric;
BEGIN
    -- output :=  -999.999;

    -- nearest neighbors
    -- p1: limit the number of neighbors, 0-> closest one
    IF method = 0 THEN

        IF p1 = 0 THEN
            p1 := 1;
        END IF;

        WITH    a as (SELECT unnest(geomin) as g, unnest(colin) as v),
                b as (SELECT a.v as v FROM a ORDER BY point<->a.g LIMIT p1::integer)
        SELECT avg(b.v) INTO output FROM b;
        RETURN output;

    -- barymetric
    ELSIF method = 1 THEN
        WITH    a as (SELECT unnest(geomin) AS e),
                b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
                c as (SELECT (ST_Dump(t)).geom as v FROM b),
                d as (SELECT v FROM c WHERE ST_Within(point, v))
            SELECT v INTO g FROM d;
        IF g is null THEN
            -- out of the realm of the input data
            RETURN -888.888;
        END IF;
        -- vertex of the selected cell
        WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
                SELECT array_agg(v) INTO vertex FROM a;

            -- retrieve the value of each vertex
        WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
            SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
        WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
            SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
        WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
                SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);

        SELECT ST_area(g), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[2], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[1], vertex[3], point]))), ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point,vertex[1],vertex[2], point]))) INTO sg, sa, sb, sc;

        output := (coalesce(sa,0) * coalesce(va,0) + coalesce(sb,0) * coalesce(vb,0) + coalesce(sc,0) * coalesce(vc,0)) / coalesce(sg);
        RETURN output;

    -- IDW
    -- p1: limit the number of neighbors, 0->no limit
    -- p2: order of distance decay, 0-> order 1
    ELSIF method = 2 THEN

        IF p2 = 0 THEN
            p2 := 1;
        END IF;

        WITH    a as (SELECT unnest(geomin) as g, unnest(colin) as v),
                b as (SELECT a.g, a.v FROM a ORDER BY point<->a.g)
        SELECT array_agg(b.g), array_agg(b.v) INTO gs, vs FROM b;
        IF p1::integer>0 THEN
            gs2:=gs;
            vs2:=vs;
            FOR i IN 1..p1
            LOOP
                gs2 := gs2 || gs[i];
                vs2 := vs2 || vs[i];
            END LOOP;
        ELSE
            gs2:=gs;
            vs2:=vs;
        END IF;

        WITH    a as (SELECT unnest(gs2) as g, unnest(vs2) as v),
                b as (
                    SELECT
                    (1/ST_distance(point, a.g)^p2::integer) as k,
                    (a.v/ST_distance(point, a.g)^p2::integer) as f
                    FROM a
                )
        SELECT sum(b.f)/sum(b.k) INTO output FROM b;
        RETURN output;

    -- krigin
    ELSIF method = 3 THEN

    --  TO DO

    END IF;

    RETURN -777.777;

END;
$$
language plpgsql IMMUTABLE PARALLEL SAFE;
-- =============================================================================================
--
-- CDB_Voronoi
--
-- =============================================================================================
CREATE OR REPLACE FUNCTION CDB_voronoi(
    IN geomin geometry[],
    IN buffer numeric DEFAULT 0.5,
    IN tolerance numeric DEFAULT 1e-9
    )
RETURNS geometry  AS $$
DECLARE
    geomout geometry;
BEGIN
    -- we need to make the geometry calculations in (pseudo)meters!!!
    with a as (
        SELECT unnest(geomin) as g1
    ),
    b as(
        SELECT st_transform(g1, 3857) g2 from a
    )
    SELECT array_agg(g2) INTO geomin from b;

    WITH
    convexhull_1 as (
        SELECT
            ST_ConvexHull(ST_Collect(geomin)) as g,
            buffer * |/ (st_area(ST_ConvexHull(ST_Collect(geomin)))/PI()) as r
    ),
    clipper as(
        SELECT
            st_buffer(ST_MinimumBoundingCircle(a.g), buffer*a.r)  as g
        FROM convexhull_1 a
    ),
    env0 as (
        SELECT
            (st_dumppoints(st_expand(a.g, buffer*a.r))).geom as e
        FROM  convexhull_1 a
    ),
    env as (
        SELECT
            array_agg(env0.e) as e
        FROM  env0
    ),
    sample AS (
        SELECT
            ST_Collect(geomin || env.e) as geom
        FROM env
    ),
    convexhull as (
        SELECT
            ST_ConvexHull(ST_Collect(geomin)) as cg
    ),
    tin as (
        SELECT
            ST_Dump(ST_DelaunayTriangles(geom, tolerance, 0)) as gd
        FROM
            sample
    ),
    tin_polygons as (
        SELECT
            (gd).Path as id,
            (gd).Geom as pg,
            ST_Centroid(ST_MinimumBoundingCircle((gd).Geom, 180)) as ct
        FROM tin
    ),
    tin_lines as (
        SELECT
            id,
            ST_ExteriorRing(pg) as lg
        FROM tin_polygons
    ),
    tin_nodes as (
        SELECT
            id,
            ST_PointN(lg,1) p1,
            ST_PointN(lg,2) p2,
            ST_PointN(lg,3) p3
        FROM tin_lines
    ),
    tin_edges AS (
        SELECT
            p.id,
            UNNEST(ARRAY[
            ST_MakeLine(n.p1,n.p2) ,
            ST_MakeLine(n.p2,n.p3) ,
            ST_MakeLine(n.p3,n.p1)]) as Edge,
            ST_Force2D(cdb_crankshaft._Find_Circle(n.p1,n.p2,n.p3)) as ct,
            CASE WHEN st_distance(p.ct, ST_ExteriorRing(p.pg)) < tolerance THEN
                TRUE
            ELSE  FALSE END AS ctx,
            p.pg,
            ST_within(p.ct, convexhull.cg) as ctin
        FROM
            tin_polygons p,
            tin_nodes n,
            convexhull
        WHERE p.id = n.id
    ),
    voro_nodes as (
        SELECT
            CASE WHEN x.ctx = TRUE THEN
                ST_Centroid(x.edge)
            ELSE
                x.ct
            END as xct,
            CASE WHEN y.id is null THEN
                CASE WHEN x.ctin = TRUE THEN
                    ST_SetSRID(ST_MakePoint(
                        ST_X(x.ct) + ((ST_X(ST_Centroid(x.edge)) - ST_X(x.ct)) * (1+buffer)),
                        ST_Y(x.ct) + ((ST_Y(ST_Centroid(x.edge)) - ST_Y(x.ct)) * (1+buffer))
                    ), ST_SRID(x.ct))
                END
            ELSE
                y.ct
            END as yct
        FROM
            tin_edges x
        LEFT OUTER JOIN
            tin_edges y
        ON x.id <> y.id AND ST_Equals(x.edge, y.edge)
    ),
    voro_edges as(
        SELECT
            ST_LineMerge(ST_Collect(ST_MakeLine(xct, yct))) as v
        FROM
            voro_nodes
    ),
    voro_cells as(
        SELECT
            ST_Polygonize(
                ST_Node(
                    ST_LineMerge(
                        ST_Union(v, ST_ExteriorRing(
                            ST_Convexhull(v)
                            )
                        )
                    )
                )
            ) as g
        FROM
            voro_edges
    ),
    voro_set as(
        SELECT
        (st_dump(v.g)).geom as g
        FROM voro_cells v
    ),
    clipped_voro as(
        SELECT
            ST_intersection(c.g, v.g) as g
        FROM
            voro_set v,
            clipper c
        WHERE
            ST_GeometryType(v.g) = 'ST_Polygon'
    )
    SELECT
        st_collect(
            ST_Transform(
                ST_ConvexHull(g),
                4326
            )
        )
    INTO geomout
    FROM
        clipped_voro;
    RETURN geomout;
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;

/** ----------------------------------------------------------------------------------------
  * @function   : FindCircle
  * @precis     : Function that determines if three points form a circle. If so a table containing
  *               centre and radius is returned. If not, a null table is returned.
  * @version    : 1.0.1
  * @param      : p_pt1        : First point in curve
  * @param      : p_pt2        : Second point in curve
  * @param      : p_pt3        : Third point in curve
  * @return     : geometry     : In which X,Y ordinates are the centre X, Y and the Z being the radius of found circle
  *                              or NULL if three points do not form a circle.
  * @history    : Simon Greener - Feb 2012 - Original coding.
  *               Rafa de la Torre - Aug 2016 - Small fix for type checking
  *               Raul Marin - Sept 2017 - Remove unnecessary NULL checks and set function categories
  * @copyright  : Simon Greener @ 2012
  *               Licensed under a Creative Commons Attribution-Share Alike 2.5 Australia License. (http://creativecommons.org/licenses/by-sa/2.5/au/)
**/
CREATE OR REPLACE FUNCTION _Find_Circle(
    IN p_pt1 geometry,
    IN p_pt2 geometry,
    IN p_pt3 geometry)
  RETURNS geometry AS
$BODY$
DECLARE
   v_Centre geometry;
   v_radius NUMERIC;
   v_CX     NUMERIC;
   v_CY     NUMERIC;
   v_dA     NUMERIC;
   v_dB     NUMERIC;
   v_dC     NUMERIC;
   v_dD     NUMERIC;
   v_dE     NUMERIC;
   v_dF     NUMERIC;
   v_dG     NUMERIC;
BEGIN
   IF ( ST_GeometryType(p_pt1) <> 'ST_Point' OR
        ST_GeometryType(p_pt2) <> 'ST_Point' OR
        ST_GeometryType(p_pt3) <> 'ST_Point' ) THEN
      RAISE EXCEPTION 'All supplied geometries must be points.';
      RETURN NULL;
   END IF;
   v_dA := ST_X(p_pt2) - ST_X(p_pt1);
   v_dB := ST_Y(p_pt2) - ST_Y(p_pt1);
   v_dC := ST_X(p_pt3) - ST_X(p_pt1);
   v_dD := ST_Y(p_pt3) - ST_Y(p_pt1);
   v_dE := v_dA * (ST_X(p_pt1) + ST_X(p_pt2)) + v_dB * (ST_Y(p_pt1) + ST_Y(p_pt2));
   v_dF := v_dC * (ST_X(p_pt1) + ST_X(p_pt3)) + v_dD * (ST_Y(p_pt1) + ST_Y(p_pt3));
   v_dG := 2.0  * (v_dA * (ST_Y(p_pt3) - ST_Y(p_pt2)) - v_dB * (ST_X(p_pt3) - ST_X(p_pt2)));
   -- If v_dG is zero then the three points are collinear and no finite-radius
   -- circle through them exists.
   IF ( v_dG = 0 ) THEN
      RETURN NULL;
   ELSE
      v_CX := (v_dD * v_dE - v_dB * v_dF) / v_dG;
      v_CY := (v_dA * v_dF - v_dC * v_dE) / v_dG;
      v_Radius := SQRT(POWER(ST_X(p_pt1) - v_CX,2) + POWER(ST_Y(p_pt1) - v_CY,2) );
   END IF;
   RETURN ST_SetSRID(ST_MakePoint(v_CX, v_CY, v_radius),ST_Srid(p_pt1));
END;
$BODY$
  LANGUAGE plpgsql IMMUTABLE STRICT PARALLEL SAFE;

-- Moran's I Global Measure (public-facing)
CREATE OR REPLACE FUNCTION
  CDB_AreasOfInterestGlobal(
      subquery TEXT,
      column_name TEXT,
      w_type TEXT DEFAULT 'knn',
      num_ngbrs INT DEFAULT 5,
      permutations INT DEFAULT 99,
      geom_col TEXT DEFAULT 'the_geom',
      id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, significance NUMERIC)
AS $$
  from crankshaft.clustering import Moran
  # TODO: use named parameters or a dictionary
  moran = Moran()
  return moran.global_stat(subquery, column_name, w_type,
                           num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;

-- Moran's I Local (internal function) - DEPRECATED
CREATE OR REPLACE FUNCTION
  _CDB_AreasOfInterestLocal(
      subquery TEXT,
      column_name TEXT,
      w_type TEXT,
      num_ngbrs INT,
      permutations INT,
      geom_col TEXT,
      id_col TEXT)
RETURNS TABLE (
    moran NUMERIC,
    quads TEXT,
    significance NUMERIC,
    rowid INT,
    vals NUMERIC)
AS $$
  from crankshaft.clustering import Moran
  moran = Moran()
  result = moran.local_stat(subquery, column_name, w_type,
                            num_ngbrs, permutations, geom_col, id_col)
  # remove spatial lag
  return [(r[6], r[0], r[1], r[7], r[5]) for r in result]
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;

-- Moran's I Local (internal function)
CREATE OR REPLACE FUNCTION
  _CDB_MoransILocal(
      subquery TEXT,
      column_name TEXT,
      w_type TEXT,
      num_ngbrs INT,
      permutations INT,
      geom_col TEXT,
      id_col TEXT)
RETURNS TABLE (
    quads TEXT,
    significance NUMERIC,
    spatial_lag NUMERIC,
    spatial_lag_std NUMERIC,
    orig_val NUMERIC,
    orig_val_std NUMERIC,
    moran_stat NUMERIC,
    rowid INT)
AS $$

from crankshaft.clustering import Moran
moran = Moran()
return moran.local_stat(subquery, column_name, w_type,
                        num_ngbrs, permutations, geom_col, id_col)

$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;


-- Moran's I Local (public-facing function)
--  Replaces CDB_AreasOfInterestLocal
CREATE OR REPLACE FUNCTION
  CDB_MoransILocal(
    subquery TEXT,
    column_name TEXT,
    w_type TEXT DEFAULT 'knn',
    num_ngbrs INT DEFAULT 5,
    permutations INT DEFAULT 99,
    geom_col TEXT DEFAULT 'the_geom',
    id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (
    quads TEXT,
    significance NUMERIC,
    spatial_lag NUMERIC,
    spatial_lag_std NUMERIC,
    orig_val NUMERIC,
    orig_val_std NUMERIC,
    moran_stat NUMERIC,
    rowid INT)
AS $$

  SELECT
    quads, significance, spatial_lag, spatial_lag_std,
    orig_val, orig_val_std, moran_stat, rowid
  FROM cdb_crankshaft._CDB_MoransILocal(
    subquery, column_name, w_type,
    num_ngbrs, permutations, geom_col, id_col);

$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;

-- Moran's I Local (public-facing function) - DEPRECATED
CREATE OR REPLACE FUNCTION
  CDB_AreasOfInterestLocal(
    subquery TEXT,
    column_name TEXT,
    w_type TEXT DEFAULT 'knn',
    num_ngbrs INT DEFAULT 5,
    permutations INT DEFAULT 99,
    geom_col TEXT DEFAULT 'the_geom',
    id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$

  SELECT moran, quads, significance, rowid, vals
  FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col);

$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;

-- Moran's I only for HH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
  CDB_GetSpatialHotspots(
    subquery TEXT,
    column_name TEXT,
    w_type TEXT DEFAULT 'knn',
    num_ngbrs INT DEFAULT 5,
    permutations INT DEFAULT 99,
    geom_col TEXT DEFAULT 'the_geom',
    id_col TEXT DEFAULT 'cartodb_id')
    RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$

  SELECT moran, quads, significance, rowid, vals
  FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
  WHERE quads IN ('HH', 'HL');

$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;

-- Moran's I only for LL and LH (public-facing function)
CREATE OR REPLACE FUNCTION
  CDB_GetSpatialColdspots(
    subquery TEXT,
    attr TEXT,
    w_type TEXT DEFAULT 'knn',
    num_ngbrs INT DEFAULT 5,
    permutations INT DEFAULT 99,
    geom_col TEXT DEFAULT 'the_geom',
    id_col TEXT DEFAULT 'cartodb_id')
    RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$

  SELECT moran, quads, significance, rowid, vals
  FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col)
  WHERE quads IN ('LL', 'LH');

$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;

-- Moran's I only for LH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
  CDB_GetSpatialOutliers(
    subquery TEXT,
    attr TEXT,
    w_type TEXT DEFAULT 'knn',
    num_ngbrs INT DEFAULT 5,
    permutations INT DEFAULT 99,
    geom_col TEXT DEFAULT 'the_geom',
    id_col TEXT DEFAULT 'cartodb_id')
    RETURNS TABLE (moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$

  SELECT moran, quads, significance, rowid, vals
  FROM cdb_crankshaft._CDB_AreasOfInterestLocal(subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col)
  WHERE quads IN ('HL', 'LH');

$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;

-- Moran's I Global Rate (public-facing function)
CREATE OR REPLACE FUNCTION
  CDB_AreasOfInterestGlobalRate(
      subquery TEXT,
      numerator TEXT,
      denominator TEXT,
      w_type TEXT DEFAULT 'knn',
      num_ngbrs INT DEFAULT 5,
      permutations INT DEFAULT 99,
      geom_col TEXT DEFAULT 'the_geom',
      id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (moran FLOAT, significance FLOAT)
AS $$
  from crankshaft.clustering import Moran
  moran = Moran()
  # TODO: use named parameters or a dictionary
  return moran.global_rate_stat(subquery, numerator, denominator, w_type,
                                num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;


-- Moran's I Local Rate (internal function) - DEPRECATED
CREATE OR REPLACE FUNCTION
  _CDB_AreasOfInterestLocalRate(
      subquery TEXT,
      numerator TEXT,
      denominator TEXT,
      w_type TEXT,
      num_ngbrs INT,
      permutations INT,
      geom_col TEXT,
      id_col TEXT)
RETURNS
TABLE(
    moran NUMERIC,
    quads TEXT,
    significance NUMERIC,
    rowid INT,
    vals NUMERIC)
AS $$
  from crankshaft.clustering import Moran
  moran = Moran()
  # TODO: use named parameters or a dictionary
  result = moran.local_rate_stat(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
  # remove spatial lag
  return [(r[6], r[0], r[1], r[7], r[4]) for r in result]
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;

-- Moran's I Local Rate (public-facing function) - DEPRECATED
CREATE OR REPLACE FUNCTION
  CDB_AreasOfInterestLocalRate(
      subquery TEXT,
      numerator TEXT,
      denominator TEXT,
      w_type TEXT DEFAULT 'knn',
      num_ngbrs INT DEFAULT 5,
      permutations INT DEFAULT 99,
      geom_col TEXT DEFAULT 'the_geom',
      id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$

  SELECT moran, quads, significance, rowid, vals
  FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col);

$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;

-- Internal function
CREATE OR REPLACE FUNCTION
  _CDB_MoransILocalRate(
      subquery TEXT,
      numerator TEXT,
      denominator TEXT,
      w_type TEXT,
      num_ngbrs INT,
      permutations INT,
      geom_col TEXT,
      id_col TEXT)
RETURNS
TABLE(
    quads TEXT,
    significance NUMERIC,
    spatial_lag NUMERIC,
    spatial_lag_std NUMERIC,
    orig_val NUMERIC,
    orig_val_std NUMERIC,
    moran_stat NUMERIC,
    rowid INT)
AS $$
from crankshaft.clustering import Moran
moran = Moran()
return moran.local_rate_stat(
    subquery,
    numerator,
    denominator,
    w_type,
    num_ngbrs,
    permutations,
    geom_col,
    id_col
)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;

-- Moran's I Rate
-- Replaces CDB_AreasOfInterestLocalRate
CREATE OR REPLACE FUNCTION
  CDB_MoransILocalRate(
      subquery TEXT,
      numerator TEXT,
      denominator TEXT,
      w_type TEXT DEFAULT 'knn',
      num_ngbrs INT DEFAULT 5,
      permutations INT DEFAULT 99,
      geom_col TEXT DEFAULT 'the_geom',
      id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(
    quads TEXT,
    significance NUMERIC,
    spatial_lag NUMERIC,
    spatial_lag_std NUMERIC,
    orig_val NUMERIC,
    orig_val_std NUMERIC,
    moran_stat NUMERIC,
    rowid INT)
AS $$

SELECT
  quads, significance, spatial_lag, spatial_lag_std,
  orig_val, orig_val_std, moran_stat, rowid
FROM cdb_crankshaft._CDB_MoransILocalRate(
  subquery, numerator, denominator, w_type,
  num_ngbrs, permutations, geom_col, id_col);

$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;

-- Moran's I Local Rate only for HH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
  CDB_GetSpatialHotspotsRate(
      subquery TEXT,
      numerator TEXT,
      denominator TEXT,
      w_type TEXT DEFAULT 'knn',
      num_ngbrs INT DEFAULT 5,
      permutations INT DEFAULT 99,
      geom_col TEXT DEFAULT 'the_geom',
      id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$

  SELECT moran, quads, significance, rowid, vals
  FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
  WHERE quads IN ('HH', 'HL');

$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;

-- Moran's I Local Rate only for LL and LH (public-facing function)
CREATE OR REPLACE FUNCTION
  CDB_GetSpatialColdspotsRate(
      subquery TEXT,
      numerator TEXT,
      denominator TEXT,
      w_type TEXT DEFAULT 'knn',
      num_ngbrs INT DEFAULT 5,
      permutations INT DEFAULT 99,
      geom_col TEXT DEFAULT 'the_geom',
      id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$

  SELECT moran, quads, significance, rowid, vals
  FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
  WHERE quads IN ('LL', 'LH');

$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;

-- Moran's I Local Rate only for LH and HL (public-facing function)
CREATE OR REPLACE FUNCTION
  CDB_GetSpatialOutliersRate(
      subquery TEXT,
      numerator TEXT,
      denominator TEXT,
      w_type TEXT DEFAULT 'knn',
      num_ngbrs INT DEFAULT 5,
      permutations INT DEFAULT 99,
      geom_col TEXT DEFAULT 'the_geom',
      id_col TEXT DEFAULT 'cartodb_id')
RETURNS
TABLE(moran NUMERIC, quads TEXT, significance NUMERIC, rowid INT, vals NUMERIC)
AS $$

  SELECT moran, quads, significance, rowid, vals
  FROM cdb_crankshaft._CDB_AreasOfInterestLocalRate(subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col)
  WHERE quads IN ('HL', 'LH');

$$ LANGUAGE SQL VOLATILE PARALLEL UNSAFE;
-- Spatial k-means clustering

CREATE OR REPLACE FUNCTION CDB_KMeans(
  query TEXT,
  no_clusters INTEGER,
  no_init INTEGER DEFAULT 20
)
RETURNS TABLE(
  cartodb_id INTEGER,
  cluster_no INTEGER
) AS $$

from crankshaft.clustering import Kmeans
kmeans = Kmeans()
return kmeans.spatial(query, no_clusters, no_init)

$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;

-- Non-spatial k-means clustering
-- query: sql query to retrieve all the needed data
-- colnames: text array of column names for doing the clustering analysis
-- no_clusters: number of requested clusters
-- standardize: whether to scale variables to a mean of zero and a standard
--              deviation of 1
-- id_colname: name of the id column

CREATE OR REPLACE FUNCTION CDB_KMeansNonspatial(
  query TEXT,
  colnames TEXT[],
  no_clusters INTEGER,
  standardize BOOLEAN DEFAULT true,
  id_col TEXT DEFAULT 'cartodb_id'
)
RETURNS TABLE(
  cluster_label text,
  cluster_center json,
  silhouettes numeric,
  inertia numeric,
  rowid bigint
) AS $$

from crankshaft.clustering import Kmeans
kmeans = Kmeans()
return kmeans.nonspatial(query, colnames, no_clusters,
                         standardize=standardize,
                         id_col=id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;


CREATE OR REPLACE FUNCTION CDB_WeightedMeanS(
  state NUMERIC[],
  the_geom GEOMETRY(Point, 4326),
  weight NUMERIC
)
RETURNS Numeric[] AS $$
DECLARE
    newX NUMERIC;
    newY NUMERIC;
    newW NUMERIC;
BEGIN
    IF weight IS NULL OR the_geom IS NULL THEN
        newX = state[1];
        newY = state[2];
        newW = state[3];
    ELSE
        newX = state[1] + ST_X(the_geom)*weight;
        newY = state[2] + ST_Y(the_geom)*weight;
        newW = state[3] + weight;
    END IF;
    RETURN Array[newX,newY,newW];

END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;


CREATE OR REPLACE FUNCTION CDB_WeightedMeanF(state NUMERIC[])
RETURNS GEOMETRY AS
$$
BEGIN
    IF state[3] = 0 THEN
        RETURN ST_SetSRID(ST_MakePoint(state[1],state[2]), 4326);
    ELSE
        RETURN ST_SETSRID(ST_MakePoint(state[1]/state[3], state[2]/state[3]),4326);
    END IF;
END
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;


-- Create aggregate if it did not exist
DO $$ BEGIN
    CREATE AGGREGATE CDB_WeightedMean(geometry(Point, 4326), NUMERIC) (
        SFUNC = CDB_WeightedMeanS,
        FINALFUNC = CDB_WeightedMeanF,
        STYPE = Numeric[],
        PARALLEL = SAFE,
        INITCOND = "{0.0,0.0,0.0}"
);
EXCEPTION
    WHEN duplicate_function THEN NULL;
END $$;
-- Spatial Markov

-- input table format:
-- id | geom | date_1 | date_2 | date_3
--  1 | Pt1  | 12.3   | 13.1   | 14.2
--  2 | Pt2  | 11.0   | 13.2   | 12.5
--  ...
-- Sample Function call:
-- SELECT CDB_SpatialMarkov('SELECT * FROM real_estate',
--                          Array['date_1', 'date_2', 'date_3'])

CREATE OR REPLACE FUNCTION
  CDB_SpatialMarkovTrend (
      subquery TEXT,
      time_cols TEXT[],
      num_classes INT DEFAULT 7,
      w_type TEXT DEFAULT 'knn',
      num_ngbrs INT DEFAULT 5,
  	  permutations INT DEFAULT 99,
  	  geom_col TEXT DEFAULT 'the_geom',
  	  id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (trend NUMERIC, trend_up NUMERIC, trend_down NUMERIC, volatility NUMERIC, rowid INT)
AS $$

  from crankshaft.space_time_dynamics import Markov
  markov = Markov()

  ## TODO: use named parameters or a dictionary
  return markov.spatial_trend(subquery, time_cols, num_classes, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;

-- input table format: identical to above but in a predictable format
-- Sample function call:
-- SELECT cdb_spatial_markov('SELECT * FROM real_estate',
--                           'date_1')


-- CREATE OR REPLACE FUNCTION
--   cdb_spatial_markov (
--       subquery TEXT,
--       time_col_min text,
--       time_col_max text,
--       date_format text, -- '_YYYY_MM_DD'
--       num_time_per_bin INT DEFAULT 1,
--   	  permutations INT DEFAULT 99,
--   	  geom_column TEXT DEFAULT 'the_geom',
--   	  id_col TEXT DEFAULT 'cartodb_id',
--       w_type TEXT DEFAULT 'knn',
--       num_ngbrs int DEFAULT 5)
-- RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT)
-- AS $$
--   plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
--   from crankshaft.clustering import moran_local
--   # TODO: use named parameters or a dictionary
--   return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs)
-- $$ LANGUAGE plpython3u;
--
-- -- input table format:
-- -- id | geom | date  | measurement
-- --  1 | Pt1  | 12/3  | 13.2
-- --  2 | Pt2  | 11/5  | 11.3
-- --  3 | Pt1  | 11/13 | 12.9
-- --  4 | Pt3  | 12/19 | 10.1
-- -- ...
--
-- CREATE OR REPLACE FUNCTION
--   cdb_spatial_markov (
--       subquery TEXT,
--       time_col text,
--       num_time_per_bin INT DEFAULT 1,
--   	  permutations INT DEFAULT 99,
--   	  geom_column TEXT DEFAULT 'the_geom',
--   	  id_col TEXT DEFAULT 'cartodb_id',
--       w_type TEXT DEFAULT 'knn',
--       num_ngbrs int DEFAULT 5)
-- RETURNS TABLE (moran FLOAT, quads TEXT, significance FLOAT, ids INT)
-- AS $$
--   plpy.execute('SELECT cdb_crankshaft._cdb_crankshaft_activate_py()')
--   from crankshaft.clustering import moran_local
--   # TODO: use named parameters or a dictionary
--   return spatial_markov(subquery, time_cols, permutations, geom_column, id_col, w_type, num_ngbrs)
-- $$ LANGUAGE plpython3u;
-- Based on:
-- https://github.com/mapbox/polylabel/blob/master/index.js
-- https://sites.google.com/site/polesofinaccessibility/
-- Requires: https://github.com/CartoDB/cartodb-postgresql

-- Based on:
-- https://github.com/mapbox/polylabel/blob/master/index.js
-- https://sites.google.com/site/polesofinaccessibility/
-- Requires: https://github.com/CartoDB/cartodb-postgresql

CREATE OR REPLACE FUNCTION CDB_PIA(
    IN polygon geometry,
    IN tolerance numeric DEFAULT 1.0
    )
RETURNS geometry  AS $$
DECLARE
    env geometry[];
    cells geometry[];
    cell geometry;
    best_c geometry;
    best_d numeric;
    test_d numeric;
    test_mx numeric;
    test_h numeric;
    test_cells geometry[];
    width numeric;
    height numeric;
    h numeric;
    i integer;
    n integer;
    sqr numeric;
    p geometry;
BEGIN
    sqr := 0.5*(|/2.0);
    polygon := ST_Transform(polygon, 3857);

    -- grid #0 cell size
    height := ST_YMax(polygon) - ST_YMin(polygon);
    width := ST_XMax(polygon) - ST_XMin(polygon);
    h := 0.5*LEAST(height, width);

    -- grid #0
    with c1 as(
        SELECT cdb_crankshaft.CDB_RectangleGrid(polygon, h, h) as c
    )
    SELECT array_agg(c) INTO cells FROM c1;

    -- 1st guess: centroid
    best_c := polygon;
    best_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(Polygon));

    -- looping the loop
    n := array_length(cells,1);
    i := 1;
    LOOP

        EXIT WHEN i > n;

        cell := cells[i];

        i := i+1;

        -- cell side size, it's square
        test_h := ST_XMax(cell) - ST_XMin(cell) ;

        -- check distance
        test_d := cdb_crankshaft._Signed_Dist(polygon, ST_Centroid(cell));

        IF test_d > best_d THEN
            best_d := test_d;
            best_c := cell;
        END IF;

        -- longest distance within the cell
        test_mx := test_d + (test_h * sqr);

        -- if the cell has no chance to contains the desired point, continue
        CONTINUE WHEN test_mx - best_d <= tolerance;

        -- resample the cell
        with c1 as(
            SELECT cdb_crankshaft.CDB_RectangleGrid(cell, test_h/2, test_h/2) as c
        )
        SELECT array_agg(c) INTO test_cells FROM c1;

        -- concat the new cells to the former array
        cells := cells || test_cells;

        -- prepare next iteration
        n := array_length(cells,1);

    END LOOP;

    RETURN ST_transform(ST_Centroid(best_c), 4326);

END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;


-- signed distance point to polygon with holes
-- negative is the point is out the polygon
-- rev 1. adding MULTIPOLYGON and GEOMETRYCOLLECTION support by @abelvm
CREATE OR REPLACE FUNCTION _Signed_Dist(
    IN polygon geometry,
    IN point geometry
    )
RETURNS numeric  AS $$
DECLARE
    pols geometry[];
    pol geometry;
    i integer;
    j integer;
    within integer;
    w integer;
    holes integer;
    dist numeric;
    d numeric;
BEGIN
    dist := 1e999;
    WITH collection as (SELECT (ST_dump(polygon)).geom as geom) SELECT array_agg(geom) into pols FROM collection;
    FOR j in 1..array_length(pols, 1)
    LOOP
        pol := pols[j];
        d := dist;
        SELECT LEAST(dist, ST_distance(point, ST_ExteriorRing(pol))::numeric) INTO d;
        SELECT CASE WHEN ST_Within(point,pol) THEN 1 ELSE -1 END INTO w;
        SELECT ST_NumInteriorRings(pol) INTO holes;
        IF holes > 0 THEN
            FOR i IN 1..holes
            LOOP
                SELECT LEAST(d, ST_distance(point, ST_InteriorRingN(pol, i))::numeric) INTO d;
            END LOOP;
        END IF;
        IF d < dist THEN
            dist:= d;
            within := w;
        END IF;
    END LOOP;
    dist := dist * within::numeric;
    RETURN dist;
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;
--
-- Iterative densification of a set of points using Delaunay triangulation
-- the new points have as assigned value the average value of the 3 vertex (centroid)
--
-- @param geomin - array of geometries (points)
--
-- @param colin - array of numeric values in that points
--
-- @param iterations - integer, number of iterations
--
--
-- Returns: TABLE(geomout geometry, colout numeric)
--
--
CREATE OR REPLACE FUNCTION CDB_Densify(
    IN geomin geometry[],
    IN colin numeric[],
    IN iterations integer
    )
RETURNS TABLE(geomout geometry, colout numeric)  AS $$
DECLARE
    geotemp geometry[];
    coltemp numeric[];
    i integer;
    gs geometry[];
    g geometry;
    vertex geometry[];
    va numeric;
    vb numeric;
    vc numeric;
    center geometry;
    centerval numeric;
    tmp integer;
BEGIN
    geotemp := geomin;
    coltemp := colin;
    FOR i IN 1..iterations
    LOOP
        -- generate TIN
        WITH    a as (SELECT unnest(geotemp) AS e),
                b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
                c as (SELECT (ST_Dump(t)).geom AS v FROM b)
        SELECT array_agg(v) INTO gs FROM c;
        -- loop cells
        FOREACH g IN ARRAY gs
        LOOP
            -- append centroid
            SELECT ST_Centroid(g) INTO center;
            geotemp := array_append(geotemp, center);
            -- retrieve the value of each vertex
            WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
            SELECT array_agg(v) INTO vertex FROM a;
            WITH a AS(SELECT unnest(geotemp) as geo, unnest(coltemp) as c)
            SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
            WITH a AS(SELECT unnest(geotemp) as geo, unnest(coltemp) as c)
            SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
            WITH a AS(SELECT unnest(geotemp) as geo, unnest(coltemp) as c)
            SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
            -- calc the value at the center
            centerval := (va + vb + vc) / 3;
            -- append the value
            coltemp := array_append(coltemp, centerval);
        END LOOP;
    END LOOP;
    RETURN QUERY SELECT unnest(geotemp ) as geomout, unnest(coltemp ) as colout;
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;
CREATE OR REPLACE FUNCTION CDB_TINmap(
    IN geomin geometry[],
    IN colin numeric[],
    IN iterations integer
    )
RETURNS TABLE(geomout geometry, colout numeric)  AS $$
DECLARE
    p geometry[];
    vals numeric[];
    gs geometry[];
    g geometry;
    vertex geometry[];
    centerval numeric;
    va numeric;
    vb numeric;
    vc numeric;
    coltemp numeric[];
BEGIN
    SELECT array_agg(dens.geomout), array_agg(dens.colout) INTO p, vals FROM cdb_crankshaft.CDB_Densify(geomin, colin, iterations) dens;
    WITH    a as (SELECT unnest(p) AS e),
            b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
            c as (SELECT (ST_Dump(t)).geom AS v FROM b)
        SELECT array_agg(v) INTO gs FROM c;
    FOREACH g IN ARRAY gs
    LOOP
        -- retrieve the vertex of each triangle
        WITH a AS (SELECT (ST_DumpPoints(g)).geom AS v)
            SELECT array_agg(v) INTO vertex FROM a;
        -- retrieve the value of each vertex
        WITH a AS(SELECT unnest(p) as geo, unnest(vals) as c)
            SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);
        WITH a AS(SELECT unnest(p) as geo, unnest(vals) as c)
            SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);
        WITH a AS(SELECT unnest(p) as geo, unnest(vals) as c)
            SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);
        -- calc the value at the center
        centerval := (va + vb + vc) / 3;
        -- append the value
        coltemp := array_append(coltemp, centerval);
    END LOOP;
    RETURN QUERY SELECT unnest(gs) as geomout, unnest(coltemp ) as colout;
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;
-- Getis-Ord's G
-- Hotspot/Coldspot Analysis tool
CREATE OR REPLACE FUNCTION
  CDB_GetisOrdsG(
      subquery TEXT,
      column_name TEXT,
      w_type TEXT DEFAULT 'knn',
      num_ngbrs INT DEFAULT 5,
      permutations INT DEFAULT 999,
      geom_col TEXT DEFAULT 'the_geom',
      id_col TEXT DEFAULT 'cartodb_id')
RETURNS TABLE (z_score NUMERIC, p_value NUMERIC, p_z_sim NUMERIC, rowid BIGINT)
AS $$
  from crankshaft.clustering import Getis
  getis = Getis()
  return getis.getis_ord(subquery, column_name, w_type, num_ngbrs, permutations, geom_col, id_col)
$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;

-- TODO: make a version that accepts the values as arrays

-- Find outliers using a static threshold
--
CREATE OR REPLACE FUNCTION CDB_StaticOutlier(column_value numeric, threshold numeric)
RETURNS boolean
AS $$
BEGIN

  RETURN column_value > threshold;

END;
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE ;

-- Find outliers by a percentage above the threshold
-- TODO: add symmetric option? `is_symmetric boolean DEFAULT false`

CREATE OR REPLACE FUNCTION CDB_PercentOutlier(column_values numeric[], outlier_fraction numeric, ids int[])
RETURNS TABLE(is_outlier boolean, rowid int)
AS $$
DECLARE
  avg_val numeric;
  out_vals boolean[];
BEGIN

  SELECT avg(i) INTO avg_val
    FROM unnest(column_values) As x(i);

  IF avg_val = 0 THEN
    RAISE EXCEPTION 'Mean value is zero. Try another outlier method.';
  END IF;

  SELECT array_agg(
           outlier_fraction < i / avg_val) INTO out_vals
    FROM unnest(column_values) As x(i);

  RETURN QUERY
  SELECT unnest(out_vals) As is_outlier,
         unnest(ids) As rowid;

END;
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;

-- Find outliers above a given number of standard deviations from the mean

CREATE OR REPLACE FUNCTION CDB_StdDevOutlier(column_values numeric[], num_deviations numeric, ids int[], is_symmetric boolean DEFAULT true)
RETURNS TABLE(is_outlier boolean, rowid int)
AS $$
DECLARE
  stddev_val numeric;
  avg_val numeric;
  out_vals boolean[];
BEGIN

  SELECT stddev(i), avg(i) INTO stddev_val, avg_val
    FROM unnest(column_values) As x(i);

  IF stddev_val = 0 THEN
    RAISE EXCEPTION 'Standard deviation of input data is zero';
  END IF;

  IF is_symmetric THEN
    SELECT array_agg(
             abs(i - avg_val) / stddev_val > num_deviations) INTO out_vals
      FROM unnest(column_values) As x(i);
  ELSE
    SELECT array_agg(
             (i - avg_val) / stddev_val > num_deviations) INTO out_vals
      FROM unnest(column_values) As x(i);
  END IF;

  RETURN QUERY
  SELECT unnest(out_vals) As is_outlier,
         unnest(ids) As rowid;
END;
$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
CREATE OR REPLACE FUNCTION CDB_Contour(
    IN geomin geometry[],
    IN colin numeric[],
    IN buffer numeric,
    IN intmethod integer,
    IN classmethod integer,
    IN steps integer,
    IN max_time integer DEFAULT 60000
    )
RETURNS TABLE(
    the_geom geometry,
    bin integer,
    min_value numeric,
    max_value numeric,
    avg_value numeric
)  AS $$
DECLARE
    cell_count integer;
    tin geometry[];
    resolution integer;
BEGIN

    -- nasty trick to override issue #121
    IF max_time = 0 THEN
        max_time = -90;
    END IF;
    resolution := max_time;
    max_time := -1 * resolution;

    -- calc the optimal number of cells for the current dataset
    SELECT
    CASE intmethod
        WHEN 0 THEN round(3.7745903782 * max_time - 9.4399210051 * array_length(geomin,1) - 1350.8778213073)
        WHEN 1 THEN round(2.2855592156 * max_time - 87.285217133 * array_length(geomin,1) + 17255.7085601797)
        WHEN 2 THEN  round(0.9799471999 * max_time - 127.0334085369 * array_length(geomin,1) + 22707.9579721218)
        ELSE 10000
    END INTO cell_count;

    -- we don't have iterative barycentric interpolation in CDB_interpolation,
    --    and it's a costy function, so let's make a custom one here till
    --    we update the code
    -- tin := ARRAY[]::geometry[];
    IF intmethod=1 THEN
        WITH
            a as (SELECT unnest(geomin) AS e),
            b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a),
            c as (SELECT (ST_Dump(t)).geom as v FROM b)
        SELECT array_agg(v) INTO tin FROM c;
    END IF;
    -- Delaunay stuff performed just ONCE!!

    -- magic
    RETURN QUERY
    WITH
    convexhull as (
        SELECT
            ST_ConvexHull(ST_Collect(geomin)) as g,
            buffer * |/ st_area(ST_ConvexHull(ST_Collect(geomin)))/PI() as r
    ),
    envelope as (
        SELECT
            st_expand(a.g, a.r) as e
        FROM  convexhull a
    ),
    envelope3857 as(
        SELECT
            ST_Transform(e, 3857) as geom
        FROM envelope
    ),
    resolution as(
        SELECT
            CASE WHEN resolution <= 0  THEN
                round(|/ (
                 ST_area(geom) / abs(cell_count)
             ))
            ELSE
                resolution
            END AS cell
        FROM envelope3857
    ),
    grid as(
        SELECT
            ST_Transform(cdb_crankshaft.CDB_RectangleGrid(e.geom, r.cell, r.cell), 4326) as geom
        FROM envelope3857 e, resolution r
    ),
    interp as(
        SELECT
            geom,
            CASE
                WHEN intmethod=1 THEN cdb_crankshaft._interp_in_tin(geomin, colin, tin, ST_Centroid(geom))
                ELSE cdb_crankshaft.CDB_SpatialInterpolation(geomin, colin, ST_Centroid(geom), intmethod)
            END as val
        FROM grid
    ),
    classes as(
        SELECT CASE
            WHEN classmethod = 0 THEN
                cdb_crankshaft.CDB_EqualIntervalBins(array_agg(val), steps)
            WHEN classmethod = 1 THEN
                cdb_crankshaft.CDB_HeadsTailsBins(array_agg(val), steps)
            WHEN classmethod = 2 THEN
                cdb_crankshaft.CDB_JenksBins(array_agg(val), steps)
            ELSE
                cdb_crankshaft.CDB_QuantileBins(array_agg(val), steps)
            END as b
        FROM interp
        where val is not null
    ),
    classified as(
        SELECT
        i.*,
        width_bucket(i.val, c.b) as bucket
        FROM interp i left join classes c
        ON 1=1
    ),
    classified2 as(
        SELECT
            geom,
            val,
            CASE
                WHEN bucket = steps THEN bucket - 1
                ELSE bucket
            END as b
        FROM classified
    ),
    final as(
        SELECT
            st_union(geom) as the_geom,
            b as bin,
            min(val) as min_value,
            max(val) as max_value,
            avg(val) as avg_value
        FROM classified2
        GROUP BY bin
    )
    SELECT
        *
    FROM final
    where final.bin is not null
    ;
END;
$$ language plpgsql VOLATILE PARALLEL RESTRICTED;


-- =====================================================================
-- Interp in grid, so we can use barycentric with a precalculated tin (NNI)
-- =====================================================================
CREATE OR REPLACE FUNCTION _interp_in_tin(
    IN geomin geometry[],
    IN colin numeric[],
    IN tin geometry[],
    IN point geometry
    )
RETURNS numeric AS
$$
DECLARE
    g geometry;
    vertex geometry[];
    sg numeric;
    sa numeric;
    sb numeric;
    sc numeric;
    va numeric;
    vb numeric;
    vc numeric;
    output numeric;
BEGIN
    -- get the cell the point is within
    WITH
        a as (SELECT unnest(tin) as v),
        b as (SELECT v FROM a WHERE ST_Within(point, v))
    SELECT v INTO g FROM b;

    -- if we're out of the data realm,
    -- return null
    IF g is null THEN
        RETURN null;
    END IF;

    -- vertex of the selected cell
    WITH a AS (
        SELECT (ST_DumpPoints(g)).geom AS v
    )
    SELECT array_agg(v) INTO vertex FROM a;

    -- retrieve the value of each vertex
    WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
        SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]);

    WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
        SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]);

    WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c)
            SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]);

    -- calc the areas
    SELECT
        ST_area(g),
        ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[2], vertex[3], point]))),
        ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[1], vertex[3], point]))),
        ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point,vertex[1],vertex[2], point]))) INTO sg, sa, sb, sc;

    output := (coalesce(sa,0) * coalesce(va,0) + coalesce(sb,0) * coalesce(vb,0) + coalesce(sc,0) * coalesce(vc,0)) / coalesce(sg,1);
    RETURN output;
END;
$$
language plpgsql IMMUTABLE PARALLEL SAFE;
-- Function by Stuart Lynn for a simple interpolation of a value
-- from a polygon table over an arbitrary polygon
-- (weighted by the area proportion overlapped)
-- Aereal weighting is a very simple form of aereal interpolation.
--
-- Parameters:
--   * geom a Polygon geometry which defines the area where a value will be
--     estimated as the area-weighted sum of a given table/column
--   * target_table_name table name of the table that provides the values
--   * target_column column name of the column that provides the values
--   * schema_name optional parameter to defina the schema the target table
--     belongs to, which is necessary if its not in the search_path.
--     Note that target_table_name should never include the schema in it.
-- Return value:
--   Aereal-weighted interpolation of the column values over the geometry
CREATE OR REPLACE
FUNCTION cdb_overlap_sum(geom geometry, target_table_name text, target_column text, schema_name text DEFAULT NULL)
  RETURNS numeric AS
$$
DECLARE
	result numeric;
  qualified_name text;
BEGIN
  IF schema_name IS NULL THEN
    qualified_name := Format('%I', target_table_name);
  ELSE
    qualified_name := Format('%I.%s', schema_name, target_table_name);
  END IF;
  EXECUTE Format('
    SELECT sum(%I*ST_Area(St_Intersection($1, a.the_geom))/ST_Area(a.the_geom))
    FROM %s AS a
    WHERE $1 && a.the_geom
  ', target_column, qualified_name)
  USING geom
  INTO result;
  RETURN result;
END;
$$ LANGUAGE plpgsql STABLE PARALLEL SAFE;
CREATE OR REPLACE FUNCTION
CDB_GWR(subquery text, dep_var text, ind_vars text[],
        bw numeric default null, fixed boolean default False,
        kernel text default 'bisquare', geom_col text default 'the_geom',
        id_col text default 'cartodb_id')
RETURNS table(coeffs JSON, stand_errs JSON, t_vals JSON,
              filtered_t_vals JSON, predicted numeric,
              residuals numeric, r_squared numeric, bandwidth numeric,
              rowid bigint)
AS $$

from crankshaft.regression import GWR

gwr = GWR()

return gwr.gwr(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col)

$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;


CREATE OR REPLACE FUNCTION
CDB_GWR_Predict(subquery text, dep_var text, ind_vars text[],
                bw numeric default null, fixed boolean default False,
                kernel text default 'bisquare',
                geom_col text default 'the_geom',
                id_col text default 'cartodb_id')
RETURNS table(coeffs JSON, stand_errs JSON, t_vals JSON,
              r_squared numeric, predicted numeric, rowid bigint)
AS $$

from crankshaft.regression import GWR
gwr = GWR()

return gwr.gwr_predict(subquery, dep_var, ind_vars, bw, fixed, kernel, geom_col, id_col)

$$ LANGUAGE plpython3u VOLATILE PARALLEL UNSAFE;
--
-- Creates N points randomly distributed arround the polygon
--
-- @param g - the geometry to be turned in to points
--
-- @param no_points - the number of points to generate
--
-- @params max_iter_per_point - the function generates points in the polygon's bounding box
-- and discards points which don't lie in the polygon. max_iter_per_point specifies how many
-- misses per point the funciton accepts before giving up.
--
-- Returns: Multipoint with the requested points
CREATE OR REPLACE FUNCTION cdb_dot_density(geom geometry , no_points Integer, max_iter_per_point Integer DEFAULT 1000)
RETURNS GEOMETRY AS $$
DECLARE
  extent GEOMETRY;
  test_point Geometry;
  width                NUMERIC;
  height               NUMERIC;
  x0                   NUMERIC;
  y0                   NUMERIC;
  xp                   NUMERIC;
  yp                   NUMERIC;
  no_left              INTEGER;
  remaining_iterations INTEGER;
  points               GEOMETRY[];
  bbox_line            GEOMETRY;
  intersection_line    GEOMETRY;
BEGIN
  extent  := ST_Envelope(geom);
  width   := ST_XMax(extent) - ST_XMIN(extent);
  height  := ST_YMax(extent) - ST_YMIN(extent);
  x0 	  := ST_XMin(extent);
  y0 	  := ST_YMin(extent);
  no_left := no_points;

  LOOP
    if(no_left=0) THEN
      EXIT;
    END IF;
    yp = y0 + height*random();
    bbox_line  = ST_MakeLine(
      ST_SetSRID(ST_MakePoint(yp, x0),4326),
      ST_SetSRID(ST_MakePoint(yp, x0+width),4326)
    );
    intersection_line = ST_Intersection(bbox_line,geom);
  	test_point = ST_LineInterpolatePoint(st_makeline(st_linemerge(intersection_line)),random());
	  points := points || test_point;
	  no_left = no_left - 1 ;
  END LOOP;
  RETURN ST_Collect(points);
END;
$$
LANGUAGE plpgsql VOLATILE PARALLEL RESTRICTED;
-- Make sure by default there are no permissions for publicuser
-- NOTE: this happens at extension creation time, as part of an implicit transaction.
-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;

-- Grant permissions on the schema to publicuser (but just the schema)
GRANT USAGE ON SCHEMA cdb_crankshaft TO publicuser;

-- Revoke execute permissions on all functions in the schema by default
-- REVOKE EXECUTE ON ALL FUNCTIONS IN SCHEMA cdb_crankshaft FROM PUBLIC, publicuser;
--
-- Fill given extent with a rectangular coverage
--
-- @param ext Extent to fill. Only rectangles with center point falling
--            inside the extent (or at the lower or leftmost edge) will
--            be emitted. The returned hexagons will have the same SRID
--            as this extent.
--
-- @param width With of each rectangle
--
-- @param height Height of each rectangle
--
-- @param origin Optional origin to allow for exact tiling.
--               If omitted the origin will be 0,0.
--               The parameter is checked for having the same SRID
--               as the extent.
--
--
CREATE OR REPLACE FUNCTION CDB_RectangleGrid(ext GEOMETRY, width FLOAT8, height FLOAT8, origin GEOMETRY DEFAULT NULL)
RETURNS SETOF GEOMETRY
AS $$
DECLARE
  h GEOMETRY; -- rectangle cell
  hstep FLOAT8; -- horizontal step
  vstep FLOAT8; -- vertical step
  hw FLOAT8; -- half width
  hh FLOAT8; -- half height
  vstart FLOAT8;
  hstart FLOAT8;
  hend FLOAT8;
  vend FLOAT8;
  xoff FLOAT8;
  yoff FLOAT8;
  xgrd FLOAT8;
  ygrd FLOAT8;
  x FLOAT8;
  y FLOAT8;
  srid INTEGER;
BEGIN

  srid := ST_SRID(ext);

  xoff := 0;
  yoff := 0;

  IF origin IS NOT NULL THEN
    IF ST_SRID(origin) != srid THEN
      RAISE EXCEPTION 'SRID mismatch between extent (%) and origin (%)', srid, ST_SRID(origin);
    END IF;
    xoff := ST_X(origin);
    yoff := ST_Y(origin);
  END IF;

  --RAISE DEBUG 'X offset: %', xoff;
  --RAISE DEBUG 'Y offset: %', yoff;

  hw := width/2.0;
  hh := height/2.0;

  xgrd := hw;
  ygrd := hh;
  --RAISE DEBUG 'X grid size: %', xgrd;
  --RAISE DEBUG 'Y grid size: %', ygrd;

  hstep := width;
  vstep := height;

  -- Tweak horizontal start on hstep grid from origin
  hstart := xoff + ceil((ST_XMin(ext)-xoff)/hstep)*hstep;
  --RAISE DEBUG 'hstart: %', hstart;

  -- Tweak vertical start on vstep grid from origin
  vstart := yoff + ceil((ST_Ymin(ext)-yoff)/vstep)*vstep;
  --RAISE DEBUG 'vstart: %', vstart;

  hend := ST_XMax(ext);
  vend := ST_YMax(ext);

  --RAISE DEBUG 'hend: %', hend;
  --RAISE DEBUG 'vend: %', vend;

  x := hstart;
  WHILE x < hend LOOP -- over X
    y := vstart;
    h := ST_MakeEnvelope(x-hw, y-hh, x+hw, y+hh, srid);
    WHILE y < vend LOOP -- over Y
      RETURN NEXT h;
      h := ST_Translate(h, 0, vstep);
      y := yoff + round(((y + vstep)-yoff)/ygrd)*ygrd; -- round to grid
    END LOOP;
    x := xoff + round(((x + hstep)-xoff)/xgrd)*xgrd; -- round to grid
  END LOOP;

  RETURN;
END
$$ LANGUAGE 'plpgsql' IMMUTABLE PARALLEL SAFE;

--
-- Calculate the equal interval bins for a given column
--
-- @param in_array A numeric array of numbers to determine the best
--                   to determine the bin boundary
--
-- @param breaks The number of bins you want to find.
--
--
-- Returns: upper edges of bins
--
--

CREATE OR REPLACE FUNCTION CDB_EqualIntervalBins ( in_array NUMERIC[], breaks INT ) RETURNS NUMERIC[] as $$
DECLARE
    diff numeric;
    min_val numeric;
    max_val numeric;
    tmp_val numeric;
    i INT := 1;
    reply numeric[];
BEGIN
    SELECT min(e), max(e) INTO min_val, max_val FROM ( SELECT unnest(in_array) e ) x WHERE e IS NOT NULL;
    diff = (max_val - min_val) / breaks::numeric;
    LOOP
        IF i < breaks THEN
            tmp_val = min_val + i::numeric * diff;
            reply = array_append(reply, tmp_val);
            i := i+1;
        ELSE
            reply = array_append(reply, max_val);
            EXIT;
        END IF;
    END LOOP;
    RETURN reply;
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;

--
-- Determine the Heads/Tails classifications from a numeric array
--
-- @param in_array A numeric array of numbers to determine the best
--            bins based on the Heads/Tails method.
--
-- @param breaks The number of bins you want to find.
--
--

CREATE OR REPLACE FUNCTION CDB_HeadsTailsBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$
DECLARE
    element_count INT4;
    arr_mean numeric;
    i INT := 2;
    reply numeric[];
BEGIN
    -- get the total size of our row
    element_count := array_upper(in_array, 1) - array_lower(in_array, 1);
    -- ensure the ordering of in_array
    SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x;
    -- stop if no rows
    IF element_count IS NULL THEN
        RETURN NULL;
    END IF;
    -- stop if our breaks are more than our input array size
    IF element_count < breaks THEN
        RETURN in_array;
    END IF;

    -- get our mean value
    SELECT avg(v) INTO arr_mean FROM (  SELECT unnest(in_array) as v ) x;

    reply = Array[arr_mean];
    -- slice our bread
    LOOP
        IF i > breaks THEN  EXIT;  END IF;
        SELECT avg(e) INTO arr_mean FROM ( SELECT unnest(in_array) e) x WHERE e > reply[i-1];
        IF arr_mean IS NOT NULL THEN
            reply = array_append(reply, arr_mean);
        END IF;
        i := i+1;
    END LOOP;
    RETURN reply;
END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;

--
-- Determine the Jenks classifications from a numeric array
--
-- @param in_array A numeric array of numbers to determine the best
--            bins based on the Jenks method.
--
-- @param breaks The number of bins you want to find.
--
-- @param iterations The number of different starting positions to test.
--
-- @param invert Optional wheter to return the top of each bin (default)
--               or the bottom. BOOLEAN, default=FALSE.
--
--


CREATE OR REPLACE FUNCTION CDB_JenksBins ( in_array NUMERIC[], breaks INT, iterations INT DEFAULT 5, invert BOOLEAN DEFAULT FALSE) RETURNS NUMERIC[] as $$
DECLARE
    element_count INT4;
    arr_mean NUMERIC;
    bot INT;
    top INT;
    tops INT[];
    classes INT[][];
    i INT := 1; j INT := 1;
    curr_result NUMERIC[];
    best_result NUMERIC[];
    seedtarget TEXT;
    quant NUMERIC[];
    shuffles INT;
BEGIN
    -- get the total size of our row
    element_count := array_length(in_array, 1); --array_upper(in_array, 1) - array_lower(in_array, 1);
    -- ensure the ordering of in_array
    SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x;
    -- stop if no rows
    IF element_count IS NULL THEN
        RETURN NULL;
    END IF;
    -- stop if our breaks are more than our input array size
    IF element_count < breaks THEN
        RETURN in_array;
    END IF;

    shuffles := LEAST(GREATEST(floor(2500000.0/(element_count::float*iterations::float)), 1), 750)::int;
    -- get our mean value
    SELECT avg(v) INTO arr_mean FROM (  SELECT unnest(in_array) as v ) x;

    -- assume best is actually Quantile
    SELECT cdb_crankshaft.CDB_QuantileBins(in_array, breaks) INTO quant;

    -- if data is very very large, just return quant and be done
    IF element_count > 5000000 THEN
        RETURN quant;
    END IF;

    -- change quant into bottom, top markers
    LOOP
        IF i = 1 THEN
            bot = 1;
        ELSE
            -- use last top to find this bot
            bot = top+1;
        END IF;
        IF i = breaks THEN
            top = element_count;
        ELSE
            SELECT count(*) INTO top FROM ( SELECT unnest(in_array) as v) x WHERE v <= quant[i];
        END IF;
        IF i = 1 THEN
            classes = ARRAY[ARRAY[bot,top]];
        ELSE
            classes = ARRAY_CAT(classes,ARRAY[bot,top]);
        END IF;
        IF i > breaks THEN EXIT; END IF;
        i = i+1;
    END LOOP;

    best_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);

    --set the seed so we can ensure the same results
    SELECT setseed(0.4567) INTO seedtarget;
    --loop through random starting positions
    LOOP
        IF j > iterations-1 THEN  EXIT;  END IF;
        i = 1;
        tops = ARRAY[element_count];
        LOOP
            IF i = breaks THEN  EXIT;  END IF;
            SELECT array_agg(distinct e) INTO tops FROM (SELECT unnest(array_cat(tops, ARRAY[floor(random()*element_count::float)::int])) as e ORDER BY e) x WHERE e != 1;
            i = array_length(tops, 1);
        END LOOP;
        i = 1;
        LOOP
            IF i > breaks THEN  EXIT;  END IF;
            IF i = 1 THEN
                bot = 1;
            ELSE
                bot = top+1;
            END IF;
            top = tops[i];
            IF i = 1 THEN
                classes = ARRAY[ARRAY[bot,top]];
            ELSE
                classes = ARRAY_CAT(classes,ARRAY[bot,top]);
            END IF;
            i := i+1;
        END LOOP;
        curr_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);

        IF curr_result[1] > best_result[1] THEN
            best_result = curr_result;
            j = j-1; -- if we found a better result, add one more search
        END IF;
        j = j+1;
    END LOOP;

    RETURN (best_result)[2:array_upper(best_result, 1)];
END;
$$ language plpgsql VOLATILE PARALLEL RESTRICTED;


--
-- Perform a single iteration of the Jenks classification
--

CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_array NUMERIC[], breaks INT, classes INT[][], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$
DECLARE
    tmp_val numeric;
    new_classes int[][];
    tmp_class int[];
    i INT := 1;
    j INT := 1;
    side INT := 2;
    sdam numeric;
    gvf numeric := 0.0;
    new_gvf numeric;
    arr_gvf numeric[];
    class_avg numeric;
    class_max_i INT;
    class_min_i INT;
    class_max numeric;
    class_min numeric;
    reply numeric[];
BEGIN

    -- Calculate the sum of squared deviations from the array mean (SDAM).
    SELECT sum((arr_mean - e)^2) INTO sdam FROM (  SELECT unnest(in_array) as e ) x;
    --Identify the breaks for the lowest GVF
    LOOP
        i = 1;
        LOOP
            -- get our mean
            SELECT avg(e) INTO class_avg FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e) x;
            -- find the deviation
            SELECT sum((class_avg-e)^2) INTO tmp_val FROM (   SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e  ) x;
            IF i = 1 THEN
                arr_gvf = ARRAY[tmp_val];
                -- init our min/max map for later
                class_max = arr_gvf[i];
                class_min = arr_gvf[i];
                class_min_i = 1;
                class_max_i = 1;
            ELSE
                arr_gvf = array_append(arr_gvf, tmp_val);
            END IF;
            i := i+1;
            IF i > breaks THEN EXIT; END IF;
        END LOOP;
        -- calculate our new GVF
        SELECT sdam-sum(e) INTO new_gvf FROM (  SELECT unnest(arr_gvf) as e  ) x;
        -- if no improvement was made, exit
        IF new_gvf < gvf THEN EXIT; END IF;
        gvf = new_gvf;
        IF j > max_search THEN EXIT; END IF;
        j = j+1;
        i = 1;
        LOOP
            --establish directionality (uppward through classes or downward)
            IF arr_gvf[i] < class_min THEN
                class_min = arr_gvf[i];
                class_min_i = i;
            END IF;
            IF arr_gvf[i] > class_max THEN
                class_max = arr_gvf[i];
                class_max_i = i;
            END IF;
            i := i+1;
            IF i > breaks THEN EXIT; END IF;
        END LOOP;
        IF class_max_i > class_min_i THEN
            class_min_i = class_max_i - 1;
        ELSE
            class_min_i = class_max_i + 1;
        END IF;
            --Move from higher class to a lower gid order
            IF class_max_i > class_min_i THEN
                classes[class_max_i][1] = classes[class_max_i][1] + 1;
                classes[class_min_i][2] = classes[class_min_i][2] + 1;
            ELSE -- Move from lower class UP into a higher class by gid
                classes[class_max_i][2] = classes[class_max_i][2] - 1;
                classes[class_min_i][1] = classes[class_min_i][1] - 1;
            END IF;
    END LOOP;

    i = 1;
    LOOP
        IF invert = TRUE THEN
            side = 1; --default returns bottom side of breaks, invert returns top side
        END IF;
        reply = array_append(reply, in_array[classes[i][side]]);
        i = i+1;
        IF i > breaks THEN  EXIT; END IF;
    END LOOP;

    RETURN array_prepend(gvf, reply);

END;
$$ language plpgsql IMMUTABLE PARALLEL SAFE;


--
-- Determine the Quantile classifications from a numeric array
--
-- @param in_array A numeric array of numbers to determine the best
--            bins based on the Quantile method.
--
-- @param breaks The number of bins you want to find.
--
--
CREATE OR REPLACE FUNCTION CDB_QuantileBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$
DECLARE
    element_count INT4;
    break_size numeric;
    tmp_val numeric;
    i INT := 1;
    reply numeric[];
BEGIN
    -- sort our values
    SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e ASC) x;
    -- get the total size of our data
    element_count := array_length(in_array, 1);
    break_size :=  element_count::numeric / breaks;
    -- slice our bread
    LOOP
        IF i < breaks THEN
            IF break_size * i % 1 > 0 THEN
                SELECT e INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 1 OFFSET ceil(break_size * i) - 1) x;
            ELSE
                SELECT avg(e) INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 2 OFFSET ceil(break_size * i) - 1 ) x;
            END IF;
        ELSIF i = breaks THEN
            -- select the last value
            SELECT max(e) INTO tmp_val FROM ( SELECT unnest(in_array) e ) x;
        ELSE
            EXIT;
        END IF;

        reply = array_append(reply, tmp_val);
        i := i+1;
    END LOOP;
    RETURN reply;
END;
$$ language plpgsql IMMUTABLE STRICT PARALLEL SAFE;