From 5b6d2c568b9799b7ebc899c79b7a46999219d47a Mon Sep 17 00:00:00 2001 From: abelvm Date: Mon, 15 Aug 2016 17:15:36 -0400 Subject: [PATCH 01/11] first commit --- doc/19_contour.md | 47 ++++++ src/pg/sql/19_contour.sql | 191 +++++++++++++++++++++++ src/pg/test/expected/19_contour_test.out | 2 + src/pg/test/sql/19_contour_test.sql | 14 ++ 4 files changed, 254 insertions(+) create mode 100644 doc/19_contour.md create mode 100644 src/pg/sql/19_contour.sql create mode 100644 src/pg/test/expected/19_contour_test.out create mode 100644 src/pg/test/sql/19_contour_test.sql diff --git a/doc/19_contour.md b/doc/19_contour.md new file mode 100644 index 0000000..f34f7c0 --- /dev/null +++ b/doc/19_contour.md @@ -0,0 +1,47 @@ +## Contour maps + +Function to generate a contour map from an scatter dataset of points, using one of three methos: + +* [Nearest neighbor](https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation) +* [Barycentric](https://en.wikipedia.org/wiki/Barycentric_coordinate_system) +* [IDW](https://en.wikipedia.org/wiki/Inverse_distance_weighting) + +### CDB_Contour (geom geometry[], values numeric[], resolution integer, buffer numeric, method, classmethod integer, steps integer) + +#### Arguments + +| Name | Type | Description | +|------|------|-------------| +| geom | geometry[] | Array of points's geometries | +| values | numeric[] | Array of points' values for the param under study| +| resolution | integer | Size in meters of the cells in the grid| +| buffer | numeric | Value between 0 and 1 for spatial buffer of the set of points +| method | integer | 0:nearest neighbor, 1: barycentric, 2: IDW| +| classmethod | integer | 0:equals, 1: heads&tails, 2:jenks, 3:quantiles | +| steps | integer | Number of steps in the classification| + +### Returns +Returns a table object + +| Name | Type | Description | +|------|------|-------------| +| the_geom | geometry | Geometries of the classified contour map| +| avg_value | numeric | Avg value of the area| +| min_value | numeric | Min value of the area| +| max_value | numeric | Max value of the areal| +| bin | integer | Index of the class of the area| + +#### Example Usage + +```sql +WITH a AS ( + SELECT + ARRAY[800, 700, 600, 500, 400, 300, 200, 100]::numeric[] AS vals, + ARRAY[ST_GeomFromText('POINT(2.1744 41.403)'),ST_GeomFromText('POINT(2.1228 41.380)'),ST_GeomFromText('POINT(2.1511 41.374)'),ST_GeomFromText('POINT(2.1528 41.413)'),ST_GeomFromText('POINT(2.165 41.391)'),ST_GeomFromText('POINT(2.1498 41.371)'),ST_GeomFromText('POINT(2.1533 41.368)'),ST_GeomFromText('POINT(2.131386 41.41399)')] AS g +) +SELECT + foo.* +FROM + a, + CDB_contour(a.g, a.vals, 500, 0.0, 1,3,5) foo; +``` diff --git a/src/pg/sql/19_contour.sql b/src/pg/sql/19_contour.sql new file mode 100644 index 0000000..5118972 --- /dev/null +++ b/src/pg/sql/19_contour.sql @@ -0,0 +1,191 @@ +CREATE OR REPLACE FUNCTION CDB_Contour( + IN geomin geometry[], + IN colin numeric[], + IN resolution integer, + IN buffer numeric, + IN intmethod integer, + IN classmethod integer, + IN steps integer + ) +RETURNS TABLE( + the_geom geometry, + bin integer, + min_value numeric, + max_value numeric, + avg_value numeric +) AS $$ +DECLARE + cell numeric; + tin geometry[]; +BEGIN + -- calc the cell size in web mercator units + WITH center as ( + SELECT ST_centroid(ST_Collect(geomin)) as c + ) + SELECT + round(resolution / cos(ST_y(c) * pi()/180)) + INTO cell + FROM center; + -- raise notice 'Resol: %', cell; + + -- we don't have iterative barycentric interpolation in CDB_interpolation, + -- and it's a costy function, so let's make a custom one here till + -- we update the code + -- tin := ARRAY[]::geometry[]; + IF intmethod=1 THEN + WITH + a as (SELECT unnest(geomin) AS e), + b as (SELECT ST_DelaunayTriangles(ST_Collect(a.e),0.001, 0) AS t FROM a), + c as (SELECT (ST_Dump(t)).geom as v FROM b) + SELECT array_agg(v) INTO tin FROM c; + END IF; + -- Delaunay stuff performed just ONCE!! + + -- magic + RETURN QUERY + WITH + convexhull as ( + SELECT + ST_ConvexHull(ST_Collect(geomin)) as g, + buffer * |/ st_area(ST_ConvexHull(ST_Collect(geomin)))/PI() as r + ), + envelope as ( + SELECT + st_expand(a.g, a.r) as e + FROM convexhull a + ), + envelope3857 as( + SELECT + ST_Transform(e, 3857) as geom + FROM envelope + ), + grid as( + SELECT + ST_Transform(CDB_RectangleGrid(geom, cell, cell), 4326) as geom + FROM envelope3857 + ), + interp as( + SELECT + geom, + CASE + WHEN intmethod=1 THEN cdb_crankshaft._interp_in_tin(geomin, colin, tin, ST_Centroid(geom)) + ELSE db_crankshaft.CDB_SpatialInterpolation(geomin, colin, ST_Centroid(geom), intmethod) + END as val + FROM grid + ), + classes as( + SELECT CASE + WHEN classmethod = 0 THEN + CDB_EqualIntervalBins(array_agg(val), steps) + WHEN classmethod = 1 THEN + CDB_HeadsTailsBins(array_agg(val), steps) + WHEN classmethod = 2 THEN + CDB_JenksBins(array_agg(val), steps) + ELSE + CDB_QuantileBins(array_agg(val), steps) + END as b + FROM interp + where val is not null + ), + classified as( + SELECT + i.*, + width_bucket(i.val, c.b) as bucket + FROM interp i left join classes c + ON 1=1 + ), + classified2 as( + SELECT + geom, + val, + CASE + WHEN bucket = steps THEN bucket - 1 + ELSE bucket + END as b + FROM classified + ), + final as( + SELECT + st_union(geom) as the_geom, + b as bin, + min(val) as min_value, + max(val) as max_value, + avg(val) as avg_value + FROM classified2 + GROUP BY bin + ) + SELECT + * + FROM final + where final.bin is not null + ; +END; +$$ language plpgsql; + + + +-- ===================================================================== +-- Interp in grid, so we can use barycentric with a precalculated tin (NNI) +-- ===================================================================== +CREATE OR REPLACE FUNCTION cdb_crankshaft._interp_in_tin( + IN geomin geometry[], + IN colin numeric[], + IN tin geometry[], + IN point geometry + ) +RETURNS numeric AS +$$ +DECLARE + g geometry; + vertex geometry[]; + sg numeric; + sa numeric; + sb numeric; + sc numeric; + va numeric; + vb numeric; + vc numeric; + output numeric; +BEGIN + -- get the cell the point is within + WITH + a as (SELECT unnest(tin) as v), + b as (SELECT v FROM a WHERE ST_Within(point, v)) + SELECT v INTO g FROM b; + + -- if we're out of the data realm, + -- return null + IF g is null THEN + RETURN null; + END IF; + + -- vertex of the selected cell + WITH a AS ( + SELECT (ST_DumpPoints(g)).geom AS v + ) + SELECT array_agg(v) INTO vertex FROM a; + + -- retrieve the value of each vertex + WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c) + SELECT c INTO va FROM a WHERE ST_Equals(geo, vertex[1]); + + WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c) + SELECT c INTO vb FROM a WHERE ST_Equals(geo, vertex[2]); + + WITH a AS(SELECT unnest(geomin) as geo, unnest(colin) as c) + SELECT c INTO vc FROM a WHERE ST_Equals(geo, vertex[3]); + + -- calc the areas + SELECT + ST_area(g), + ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[2], vertex[3], point]))), + ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point, vertex[1], vertex[3], point]))), + ST_area(ST_MakePolygon(ST_MakeLine(ARRAY[point,vertex[1],vertex[2], point]))) INTO sg, sa, sb, sc; + + output := (coalesce(sa,0) * coalesce(va,0) + coalesce(sb,0) * coalesce(vb,0) + coalesce(sc,0) * coalesce(vc,0)) / coalesce(sg,1); + RETURN output; +END; +$$ +language plpgsql; + + diff --git a/src/pg/test/expected/19_contour_test.out b/src/pg/test/expected/19_contour_test.out new file mode 100644 index 0000000..da0e94b --- /dev/null +++ b/src/pg/test/expected/19_contour_test.out @@ -0,0 +1,2 @@ +SET client_min_messages TO WARNING; +\set ECHO none diff --git a/src/pg/test/sql/19_contour_test.sql b/src/pg/test/sql/19_contour_test.sql new file mode 100644 index 0000000..fa32609 --- /dev/null +++ b/src/pg/test/sql/19_contour_test.sql @@ -0,0 +1,14 @@ +SET client_min_messages TO WARNING; +\set ECHO none +\pset format unaligned + +WITH a AS ( + SELECT + ARRAY[800, 700, 600, 500, 400, 300, 200, 100]::numeric[] AS vals, + ARRAY[ST_GeomFromText('POINT(2.1744 41.403)'),ST_GeomFromText('POINT(2.1228 41.380)'),ST_GeomFromText('POINT(2.1511 41.374)'),ST_GeomFromText('POINT(2.1528 41.413)'),ST_GeomFromText('POINT(2.165 41.391)'),ST_GeomFromText('POINT(2.1498 41.371)'),ST_GeomFromText('POINT(2.1533 41.368)'),ST_GeomFromText('POINT(2.131386 41.41399)')] AS g +) +SELECT + foo.* +FROM + a, + CDB_contour(a.g, a.vals, 500, 0.0, 1,3,5) foo; From cb39d1830da659c47b3f48f2ab081f065f7ef247 Mon Sep 17 00:00:00 2001 From: abelvm Date: Mon, 15 Aug 2016 17:23:52 -0400 Subject: [PATCH 02/11] 2nd commit --- src/pg/sql/19_contour.sql | 2 +- src/pg/test/sql/19_contour_test.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pg/sql/19_contour.sql b/src/pg/sql/19_contour.sql index 5118972..8c614b6 100644 --- a/src/pg/sql/19_contour.sql +++ b/src/pg/sql/19_contour.sql @@ -69,7 +69,7 @@ BEGIN geom, CASE WHEN intmethod=1 THEN cdb_crankshaft._interp_in_tin(geomin, colin, tin, ST_Centroid(geom)) - ELSE db_crankshaft.CDB_SpatialInterpolation(geomin, colin, ST_Centroid(geom), intmethod) + ELSE cdb_crankshaft.CDB_SpatialInterpolation(geomin, colin, ST_Centroid(geom), intmethod) END as val FROM grid ), diff --git a/src/pg/test/sql/19_contour_test.sql b/src/pg/test/sql/19_contour_test.sql index fa32609..92438cc 100644 --- a/src/pg/test/sql/19_contour_test.sql +++ b/src/pg/test/sql/19_contour_test.sql @@ -11,4 +11,4 @@ SELECT foo.* FROM a, - CDB_contour(a.g, a.vals, 500, 0.0, 1,3,5) foo; + cdb_crankshaft.CDB_contour(a.g, a.vals, 500, 0.0, 1,3,5) foo; From edeac1838913c06dfe1616c20dc1e9ce8c1b6e07 Mon Sep 17 00:00:00 2001 From: abelvm Date: Tue, 16 Aug 2016 13:02:28 -0400 Subject: [PATCH 03/11] changed test --- src/pg/test/sql/19_contour_test.sql | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/pg/test/sql/19_contour_test.sql b/src/pg/test/sql/19_contour_test.sql index 92438cc..da82327 100644 --- a/src/pg/test/sql/19_contour_test.sql +++ b/src/pg/test/sql/19_contour_test.sql @@ -6,9 +6,12 @@ WITH a AS ( SELECT ARRAY[800, 700, 600, 500, 400, 300, 200, 100]::numeric[] AS vals, ARRAY[ST_GeomFromText('POINT(2.1744 41.403)'),ST_GeomFromText('POINT(2.1228 41.380)'),ST_GeomFromText('POINT(2.1511 41.374)'),ST_GeomFromText('POINT(2.1528 41.413)'),ST_GeomFromText('POINT(2.165 41.391)'),ST_GeomFromText('POINT(2.1498 41.371)'),ST_GeomFromText('POINT(2.1533 41.368)'),ST_GeomFromText('POINT(2.131386 41.41399)')] AS g -) +), +b as( SELECT foo.* FROM a, - cdb_crankshaft.CDB_contour(a.g, a.vals, 500, 0.0, 1,3,5) foo; + CDB_contour(a.g, a.vals, 500, 0.0, 1, 3, 5) foo +) +SELECT bin, avg_value from b order by bin; From bae3362b59a9fa365e98a77e1199818f5f7ba6a5 Mon Sep 17 00:00:00 2001 From: abelvm Date: Tue, 16 Aug 2016 13:15:13 -0400 Subject: [PATCH 04/11] added CDB stuff --- src/pg/sql/19_contour.sql | 459 +++++++++++++++++++++++++++++++++++++- 1 file changed, 453 insertions(+), 6 deletions(-) diff --git a/src/pg/sql/19_contour.sql b/src/pg/sql/19_contour.sql index 8c614b6..528ad9a 100644 --- a/src/pg/sql/19_contour.sql +++ b/src/pg/sql/19_contour.sql @@ -61,7 +61,7 @@ BEGIN ), grid as( SELECT - ST_Transform(CDB_RectangleGrid(geom, cell, cell), 4326) as geom + ST_Transform(cdb_crankshaft.CDB_RectangleGrid(geom, cell, cell), 4326) as geom FROM envelope3857 ), interp as( @@ -76,13 +76,13 @@ BEGIN classes as( SELECT CASE WHEN classmethod = 0 THEN - CDB_EqualIntervalBins(array_agg(val), steps) + cdb_crankshaft.CDB_EqualIntervalBins(array_agg(val), steps) WHEN classmethod = 1 THEN - CDB_HeadsTailsBins(array_agg(val), steps) + cdb_crankshaft.CDB_HeadsTailsBins(array_agg(val), steps) WHEN classmethod = 2 THEN - CDB_JenksBins(array_agg(val), steps) + cdb_crankshaft.CDB_JenksBins(array_agg(val), steps) ELSE - CDB_QuantileBins(array_agg(val), steps) + cdb_crankshaft.CDB_QuantileBins(array_agg(val), steps) END as b FROM interp where val is not null @@ -127,7 +127,7 @@ $$ language plpgsql; -- ===================================================================== -- Interp in grid, so we can use barycentric with a precalculated tin (NNI) -- ===================================================================== -CREATE OR REPLACE FUNCTION cdb_crankshaft._interp_in_tin( +CREATE OR REPLACE FUNCTION _interp_in_tin( IN geomin geometry[], IN colin numeric[], IN tin geometry[], @@ -189,3 +189,450 @@ $$ language plpgsql; +-- +-- Fill given extent with a rectangular coverage +-- +-- @param ext Extent to fill. Only rectangles with center point falling +-- inside the extent (or at the lower or leftmost edge) will +-- be emitted. The returned hexagons will have the same SRID +-- as this extent. +-- +-- @param width With of each rectangle +-- +-- @param height Height of each rectangle +-- +-- @param origin Optional origin to allow for exact tiling. +-- If omitted the origin will be 0,0. +-- The parameter is checked for having the same SRID +-- as the extent. +-- +-- +CREATE OR REPLACE FUNCTION CDB_RectangleGrid(ext GEOMETRY, width FLOAT8, height FLOAT8, origin GEOMETRY DEFAULT NULL) +RETURNS SETOF GEOMETRY +AS $$ +DECLARE + h GEOMETRY; -- rectangle cell + hstep FLOAT8; -- horizontal step + vstep FLOAT8; -- vertical step + hw FLOAT8; -- half width + hh FLOAT8; -- half height + vstart FLOAT8; + hstart FLOAT8; + hend FLOAT8; + vend FLOAT8; + xoff FLOAT8; + yoff FLOAT8; + xgrd FLOAT8; + ygrd FLOAT8; + x FLOAT8; + y FLOAT8; + srid INTEGER; +BEGIN + + srid := ST_SRID(ext); + + xoff := 0; + yoff := 0; + + IF origin IS NOT NULL THEN + IF ST_SRID(origin) != srid THEN + RAISE EXCEPTION 'SRID mismatch between extent (%) and origin (%)', srid, ST_SRID(origin); + END IF; + xoff := ST_X(origin); + yoff := ST_Y(origin); + END IF; + + --RAISE DEBUG 'X offset: %', xoff; + --RAISE DEBUG 'Y offset: %', yoff; + + hw := width/2.0; + hh := height/2.0; + + xgrd := hw; + ygrd := hh; + --RAISE DEBUG 'X grid size: %', xgrd; + --RAISE DEBUG 'Y grid size: %', ygrd; + + hstep := width; + vstep := height; + + -- Tweak horizontal start on hstep grid from origin + hstart := xoff + ceil((ST_XMin(ext)-xoff)/hstep)*hstep; + --RAISE DEBUG 'hstart: %', hstart; + + -- Tweak vertical start on vstep grid from origin + vstart := yoff + ceil((ST_Ymin(ext)-yoff)/vstep)*vstep; + --RAISE DEBUG 'vstart: %', vstart; + + hend := ST_XMax(ext); + vend := ST_YMax(ext); + + --RAISE DEBUG 'hend: %', hend; + --RAISE DEBUG 'vend: %', vend; + + x := hstart; + WHILE x < hend LOOP -- over X + y := vstart; + h := ST_MakeEnvelope(x-hw, y-hh, x+hw, y+hh, srid); + WHILE y < vend LOOP -- over Y + RETURN NEXT h; + h := ST_Translate(h, 0, vstep); + y := yoff + round(((y + vstep)-yoff)/ygrd)*ygrd; -- round to grid + END LOOP; + x := xoff + round(((x + hstep)-xoff)/xgrd)*xgrd; -- round to grid + END LOOP; + + RETURN; +END +$$ LANGUAGE 'plpgsql' IMMUTABLE; + +-- +-- Calculate the equal interval bins for a given column +-- +-- @param in_array A numeric array of numbers to determine the best +-- to determine the bin boundary +-- +-- @param breaks The number of bins you want to find. +-- +-- +-- Returns: upper edges of bins +-- +-- + +CREATE OR REPLACE FUNCTION CDB_EqualIntervalBins ( in_array NUMERIC[], breaks INT ) RETURNS NUMERIC[] as $$ +DECLARE + diff numeric; + min_val numeric; + max_val numeric; + tmp_val numeric; + i INT := 1; + reply numeric[]; +BEGIN + SELECT min(e), max(e) INTO min_val, max_val FROM ( SELECT unnest(in_array) e ) x WHERE e IS NOT NULL; + diff = (max_val - min_val) / breaks::numeric; + LOOP + IF i < breaks THEN + tmp_val = min_val + i::numeric * diff; + reply = array_append(reply, tmp_val); + i := i+1; + ELSE + reply = array_append(reply, max_val); + EXIT; + END IF; + END LOOP; + RETURN reply; +END; +$$ language plpgsql IMMUTABLE; + +-- +-- Determine the Heads/Tails classifications from a numeric array +-- +-- @param in_array A numeric array of numbers to determine the best +-- bins based on the Heads/Tails method. +-- +-- @param breaks The number of bins you want to find. +-- +-- + +CREATE OR REPLACE FUNCTION CDB_HeadsTailsBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$ +DECLARE + element_count INT4; + arr_mean numeric; + i INT := 2; + reply numeric[]; +BEGIN + -- get the total size of our row + element_count := array_upper(in_array, 1) - array_lower(in_array, 1); + -- ensure the ordering of in_array + SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x; + -- stop if no rows + IF element_count IS NULL THEN + RETURN NULL; + END IF; + -- stop if our breaks are more than our input array size + IF element_count < breaks THEN + RETURN in_array; + END IF; + + -- get our mean value + SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x; + + reply = Array[arr_mean]; + -- slice our bread + LOOP + IF i > breaks THEN EXIT; END IF; + SELECT avg(e) INTO arr_mean FROM ( SELECT unnest(in_array) e) x WHERE e > reply[i-1]; + IF arr_mean IS NOT NULL THEN + reply = array_append(reply, arr_mean); + END IF; + i := i+1; + END LOOP; + RETURN reply; +END; +$$ language plpgsql IMMUTABLE; + +-- +-- Determine the Jenks classifications from a numeric array +-- +-- @param in_array A numeric array of numbers to determine the best +-- bins based on the Jenks method. +-- +-- @param breaks The number of bins you want to find. +-- +-- @param iterations The number of different starting positions to test. +-- +-- @param invert Optional wheter to return the top of each bin (default) +-- or the bottom. BOOLEAN, default=FALSE. +-- +-- + + +CREATE OR REPLACE FUNCTION CDB_JenksBins ( in_array NUMERIC[], breaks INT, iterations INT DEFAULT 5, invert BOOLEAN DEFAULT FALSE) RETURNS NUMERIC[] as $$ +DECLARE + element_count INT4; + arr_mean NUMERIC; + bot INT; + top INT; + tops INT[]; + classes INT[][]; + i INT := 1; j INT := 1; + curr_result NUMERIC[]; + best_result NUMERIC[]; + seedtarget TEXT; + quant NUMERIC[]; + shuffles INT; +BEGIN + -- get the total size of our row + element_count := array_length(in_array, 1); --array_upper(in_array, 1) - array_lower(in_array, 1); + -- ensure the ordering of in_array + SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x; + -- stop if no rows + IF element_count IS NULL THEN + RETURN NULL; + END IF; + -- stop if our breaks are more than our input array size + IF element_count < breaks THEN + RETURN in_array; + END IF; + + shuffles := LEAST(GREATEST(floor(2500000.0/(element_count::float*iterations::float)), 1), 750)::int; + -- get our mean value + SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x; + + -- assume best is actually Quantile + SELECT cdb_crankshaft.CDB_QuantileBins(in_array, breaks) INTO quant; + + -- if data is very very large, just return quant and be done + IF element_count > 5000000 THEN + RETURN quant; + END IF; + + -- change quant into bottom, top markers + LOOP + IF i = 1 THEN + bot = 1; + ELSE + -- use last top to find this bot + bot = top+1; + END IF; + IF i = breaks THEN + top = element_count; + ELSE + SELECT count(*) INTO top FROM ( SELECT unnest(in_array) as v) x WHERE v <= quant[i]; + END IF; + IF i = 1 THEN + classes = ARRAY[ARRAY[bot,top]]; + ELSE + classes = ARRAY_CAT(classes,ARRAY[bot,top]); + END IF; + IF i > breaks THEN EXIT; END IF; + i = i+1; + END LOOP; + + best_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles); + + --set the seed so we can ensure the same results + SELECT setseed(0.4567) INTO seedtarget; + --loop through random starting positions + LOOP + IF j > iterations-1 THEN EXIT; END IF; + i = 1; + tops = ARRAY[element_count]; + LOOP + IF i = breaks THEN EXIT; END IF; + SELECT array_agg(distinct e) INTO tops FROM (SELECT unnest(array_cat(tops, ARRAY[floor(random()*element_count::float)::int])) as e ORDER BY e) x WHERE e != 1; + i = array_length(tops, 1); + END LOOP; + i = 1; + LOOP + IF i > breaks THEN EXIT; END IF; + IF i = 1 THEN + bot = 1; + ELSE + bot = top+1; + END IF; + top = tops[i]; + IF i = 1 THEN + classes = ARRAY[ARRAY[bot,top]]; + ELSE + classes = ARRAY_CAT(classes,ARRAY[bot,top]); + END IF; + i := i+1; + END LOOP; + curr_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles); + + IF curr_result[1] > best_result[1] THEN + best_result = curr_result; + j = j-1; -- if we found a better result, add one more search + END IF; + j = j+1; + END LOOP; + + RETURN (best_result)[2:array_upper(best_result, 1)]; +END; +$$ language plpgsql IMMUTABLE; + + + +-- +-- Perform a single iteration of the Jenks classification +-- + +CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_array NUMERIC[], breaks INT, classes INT[][], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$ +DECLARE + tmp_val numeric; + new_classes int[][]; + tmp_class int[]; + i INT := 1; + j INT := 1; + side INT := 2; + sdam numeric; + gvf numeric := 0.0; + new_gvf numeric; + arr_gvf numeric[]; + class_avg numeric; + class_max_i INT; + class_min_i INT; + class_max numeric; + class_min numeric; + reply numeric[]; +BEGIN + + -- Calculate the sum of squared deviations from the array mean (SDAM). + SELECT sum((arr_mean - e)^2) INTO sdam FROM ( SELECT unnest(in_array) as e ) x; + --Identify the breaks for the lowest GVF + LOOP + i = 1; + LOOP + -- get our mean + SELECT avg(e) INTO class_avg FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e) x; + -- find the deviation + SELECT sum((class_avg-e)^2) INTO tmp_val FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e ) x; + IF i = 1 THEN + arr_gvf = ARRAY[tmp_val]; + -- init our min/max map for later + class_max = arr_gvf[i]; + class_min = arr_gvf[i]; + class_min_i = 1; + class_max_i = 1; + ELSE + arr_gvf = array_append(arr_gvf, tmp_val); + END IF; + i := i+1; + IF i > breaks THEN EXIT; END IF; + END LOOP; + -- calculate our new GVF + SELECT sdam-sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) as e ) x; + -- if no improvement was made, exit + IF new_gvf < gvf THEN EXIT; END IF; + gvf = new_gvf; + IF j > max_search THEN EXIT; END IF; + j = j+1; + i = 1; + LOOP + --establish directionality (uppward through classes or downward) + IF arr_gvf[i] < class_min THEN + class_min = arr_gvf[i]; + class_min_i = i; + END IF; + IF arr_gvf[i] > class_max THEN + class_max = arr_gvf[i]; + class_max_i = i; + END IF; + i := i+1; + IF i > breaks THEN EXIT; END IF; + END LOOP; + IF class_max_i > class_min_i THEN + class_min_i = class_max_i - 1; + ELSE + class_min_i = class_max_i + 1; + END IF; + --Move from higher class to a lower gid order + IF class_max_i > class_min_i THEN + classes[class_max_i][1] = classes[class_max_i][1] + 1; + classes[class_min_i][2] = classes[class_min_i][2] + 1; + ELSE -- Move from lower class UP into a higher class by gid + classes[class_max_i][2] = classes[class_max_i][2] - 1; + classes[class_min_i][1] = classes[class_min_i][1] - 1; + END IF; + END LOOP; + + i = 1; + LOOP + IF invert = TRUE THEN + side = 1; --default returns bottom side of breaks, invert returns top side + END IF; + reply = array_append(reply, in_array[classes[i][side]]); + i = i+1; + IF i > breaks THEN EXIT; END IF; + END LOOP; + + RETURN array_prepend(gvf, reply); + +END; +$$ language plpgsql IMMUTABLE; + + +-- +-- Determine the Quantile classifications from a numeric array +-- +-- @param in_array A numeric array of numbers to determine the best +-- bins based on the Quantile method. +-- +-- @param breaks The number of bins you want to find. +-- +-- +CREATE OR REPLACE FUNCTION CDB_QuantileBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$ +DECLARE + element_count INT4; + break_size numeric; + tmp_val numeric; + i INT := 1; + reply numeric[]; +BEGIN + -- sort our values + SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e ASC) x; + -- get the total size of our data + element_count := array_length(in_array, 1); + break_size := element_count::numeric / breaks; + -- slice our bread + LOOP + IF i < breaks THEN + IF break_size * i % 1 > 0 THEN + SELECT e INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 1 OFFSET ceil(break_size * i) - 1) x; + ELSE + SELECT avg(e) INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 2 OFFSET ceil(break_size * i) - 1 ) x; + END IF; + ELSIF i = breaks THEN + -- select the last value + SELECT max(e) INTO tmp_val FROM ( SELECT unnest(in_array) e ) x; + ELSE + EXIT; + END IF; + + reply = array_append(reply, tmp_val); + i := i+1; + END LOOP; + RETURN reply; +END; +$$ language plpgsql IMMUTABLE; From ed6ecd0fef4346abe0facfe62a5ca51367da854f Mon Sep 17 00:00:00 2001 From: abelvm Date: Tue, 16 Aug 2016 13:29:55 -0400 Subject: [PATCH 05/11] added schema --- src/pg/test/sql/19_contour_test.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pg/test/sql/19_contour_test.sql b/src/pg/test/sql/19_contour_test.sql index da82327..889f48a 100644 --- a/src/pg/test/sql/19_contour_test.sql +++ b/src/pg/test/sql/19_contour_test.sql @@ -12,6 +12,6 @@ SELECT foo.* FROM a, - CDB_contour(a.g, a.vals, 500, 0.0, 1, 3, 5) foo + cdb_crankshaft.CDB_contour(a.g, a.vals, 500, 0.0, 1, 3, 5) foo ) SELECT bin, avg_value from b order by bin; From 0ea060a69816a2318a77d8c7e23579ed90601a92 Mon Sep 17 00:00:00 2001 From: abelvm Date: Tue, 16 Aug 2016 13:56:01 -0400 Subject: [PATCH 06/11] added srid to tests --- src/pg/test/sql/19_contour_test.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pg/test/sql/19_contour_test.sql b/src/pg/test/sql/19_contour_test.sql index 889f48a..eb1da6c 100644 --- a/src/pg/test/sql/19_contour_test.sql +++ b/src/pg/test/sql/19_contour_test.sql @@ -5,7 +5,7 @@ SET client_min_messages TO WARNING; WITH a AS ( SELECT ARRAY[800, 700, 600, 500, 400, 300, 200, 100]::numeric[] AS vals, - ARRAY[ST_GeomFromText('POINT(2.1744 41.403)'),ST_GeomFromText('POINT(2.1228 41.380)'),ST_GeomFromText('POINT(2.1511 41.374)'),ST_GeomFromText('POINT(2.1528 41.413)'),ST_GeomFromText('POINT(2.165 41.391)'),ST_GeomFromText('POINT(2.1498 41.371)'),ST_GeomFromText('POINT(2.1533 41.368)'),ST_GeomFromText('POINT(2.131386 41.41399)')] AS g + ARRAY[ST_GeomFromText('POINT(2.1744 41.403)',4326),ST_GeomFromText('POINT(2.1228 41.380)',4326),ST_GeomFromText('POINT(2.1511 41.374)',4326),ST_GeomFromText('POINT(2.1528 41.413)',4326),ST_GeomFromText('POINT(2.165 41.391)',4326),ST_GeomFromText('POINT(2.1498 41.371)',4326),ST_GeomFromText('POINT(2.1533 41.368)',4326),ST_GeomFromText('POINT(2.131386 41.41399)',4326)] AS g ), b as( SELECT From f6cedd7c866af3223d158924141a2bbc1232ec4b Mon Sep 17 00:00:00 2001 From: abelvm Date: Tue, 16 Aug 2016 14:05:53 -0400 Subject: [PATCH 07/11] fixed spaces --- src/pg/test/expected/19_contour_test.out | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/pg/test/expected/19_contour_test.out b/src/pg/test/expected/19_contour_test.out index da0e94b..ef53315 100644 --- a/src/pg/test/expected/19_contour_test.out +++ b/src/pg/test/expected/19_contour_test.out @@ -1,2 +1,10 @@ SET client_min_messages TO WARNING; \set ECHO none + bin|avg_value + 0|288.072587252400038769154 + 1|419.135632263450792294773 + 2|476.5442571943126671224 + 3|524.0779810136330772726 + 4|611.26099562647644421297 + (5 rows) + From b7a412fa4dc49c4720fa4ce781c3262237c1b667 Mon Sep 17 00:00:00 2001 From: abelvm Date: Tue, 16 Aug 2016 14:17:14 -0400 Subject: [PATCH 08/11] fixed spaces --- src/pg/test/expected/19_contour_test.out | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/pg/test/expected/19_contour_test.out b/src/pg/test/expected/19_contour_test.out index ef53315..ff25bdf 100644 --- a/src/pg/test/expected/19_contour_test.out +++ b/src/pg/test/expected/19_contour_test.out @@ -1,10 +1,9 @@ SET client_min_messages TO WARNING; \set ECHO none - bin|avg_value - 0|288.072587252400038769154 - 1|419.135632263450792294773 - 2|476.5442571943126671224 - 3|524.0779810136330772726 - 4|611.26099562647644421297 - (5 rows) - +bin|avg_value +0|288.072587252400038769154 +1|419.135632263450792294773 +2|476.5442571943126671224 +3|524.0779810136330772726 +4|611.26099562647644421297 +(5 rows) From 0f47659cf9ee25a6b6bb548707a60d1d8f2c983f Mon Sep 17 00:00:00 2001 From: abelvm Date: Thu, 18 Aug 2016 16:13:56 -0400 Subject: [PATCH 09/11] smart resolution guessing --- doc/19_contour.md | 11 +++++--- src/pg/sql/19_contour.sql | 40 ++++++++++++++++++++--------- src/pg/test/sql/19_contour_test.sql | 2 +- 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/doc/19_contour.md b/doc/19_contour.md index f34f7c0..d6629e4 100644 --- a/doc/19_contour.md +++ b/doc/19_contour.md @@ -14,11 +14,11 @@ Function to generate a contour map from an scatter dataset of points, using one |------|------|-------------| | geom | geometry[] | Array of points's geometries | | values | numeric[] | Array of points' values for the param under study| -| resolution | integer | Size in meters of the cells in the grid| | buffer | numeric | Value between 0 and 1 for spatial buffer of the set of points | method | integer | 0:nearest neighbor, 1: barycentric, 2: IDW| | classmethod | integer | 0:equals, 1: heads&tails, 2:jenks, 3:quantiles | | steps | integer | Number of steps in the classification| +| max_time | integer | Max time in millisecons for processing time ### Returns Returns a table object @@ -37,11 +37,14 @@ Returns a table object WITH a AS ( SELECT ARRAY[800, 700, 600, 500, 400, 300, 200, 100]::numeric[] AS vals, - ARRAY[ST_GeomFromText('POINT(2.1744 41.403)'),ST_GeomFromText('POINT(2.1228 41.380)'),ST_GeomFromText('POINT(2.1511 41.374)'),ST_GeomFromText('POINT(2.1528 41.413)'),ST_GeomFromText('POINT(2.165 41.391)'),ST_GeomFromText('POINT(2.1498 41.371)'),ST_GeomFromText('POINT(2.1533 41.368)'),ST_GeomFromText('POINT(2.131386 41.41399)')] AS g -) + ARRAY[ST_GeomFromText('POINT(2.1744 41.403)',4326),ST_GeomFromText('POINT(2.1228 41.380)',4326),ST_GeomFromText('POINT(2.1511 41.374)',4326),ST_GeomFromText('POINT(2.1528 41.413)',4326),ST_GeomFromText('POINT(2.165 41.391)',4326),ST_GeomFromText('POINT(2.1498 41.371)',4326),ST_GeomFromText('POINT(2.1533 41.368)',4326),ST_GeomFromText('POINT(2.131386 41.41399)',4326)] AS g +), +b as( SELECT foo.* FROM a, - CDB_contour(a.g, a.vals, 500, 0.0, 1,3,5) foo; + cdb_crankshaft.CDB_contour(a.g, a.vals, 0.0, 1, 3, 5, 60) foo +) +SELECT bin, avg_value from b order by bin; ``` diff --git a/src/pg/sql/19_contour.sql b/src/pg/sql/19_contour.sql index 528ad9a..c0a64c2 100644 --- a/src/pg/sql/19_contour.sql +++ b/src/pg/sql/19_contour.sql @@ -1,11 +1,11 @@ CREATE OR REPLACE FUNCTION CDB_Contour( IN geomin geometry[], IN colin numeric[], - IN resolution integer, IN buffer numeric, IN intmethod integer, IN classmethod integer, - IN steps integer + IN steps integer, + IN max_time integer DEFAULT 60000 ) RETURNS TABLE( the_geom geometry, @@ -15,19 +15,28 @@ RETURNS TABLE( avg_value numeric ) AS $$ DECLARE - cell numeric; + cell_count integer; tin geometry[]; BEGIN -- calc the cell size in web mercator units - WITH center as ( - SELECT ST_centroid(ST_Collect(geomin)) as c - ) - SELECT - round(resolution / cos(ST_y(c) * pi()/180)) - INTO cell - FROM center; + -- WITH center as ( + -- SELECT ST_centroid(ST_Collect(geomin)) as c + -- ) + -- SELECT + -- round(resolution / cos(ST_y(c) * pi()/180)) + -- INTO cell + -- FROM center; -- raise notice 'Resol: %', cell; + -- calc the optimal number of cells for the current dataset + SELECT + CASE intmethod + WHEN 0 THEN round(3.7745903782 * max_time - 9.4399210051 * array_length(geomin,1) - 1350.8778213073) + WHEN 1 THEN round(2.2855592156 * max_time - 87.285217133 * array_length(geomin,1) + 17255.7085601797) + WHEN 2 THEN round(0.9799471999 * max_time - 127.0334085369 * array_length(geomin,1) + 22707.9579721218) + ELSE 10000 + END INTO cell_count; + -- we don't have iterative barycentric interpolation in CDB_interpolation, -- and it's a costy function, so let's make a custom one here till -- we update the code @@ -59,10 +68,17 @@ BEGIN ST_Transform(e, 3857) as geom FROM envelope ), + resolution as( + SELECT + round(|/ ( + ST_area(geom) / cell_count + )) as cell + FROM envelope3857 + ), grid as( SELECT - ST_Transform(cdb_crankshaft.CDB_RectangleGrid(geom, cell, cell), 4326) as geom - FROM envelope3857 + ST_Transform(cdb_crankshaft.CDB_RectangleGrid(e.geom, r.cell, r.cell), 4326) as geom + FROM envelope3857 e, resolution r ), interp as( SELECT diff --git a/src/pg/test/sql/19_contour_test.sql b/src/pg/test/sql/19_contour_test.sql index eb1da6c..70e5cf9 100644 --- a/src/pg/test/sql/19_contour_test.sql +++ b/src/pg/test/sql/19_contour_test.sql @@ -12,6 +12,6 @@ SELECT foo.* FROM a, - cdb_crankshaft.CDB_contour(a.g, a.vals, 500, 0.0, 1, 3, 5) foo + cdb_crankshaft.CDB_contour(a.g, a.vals, 0.0, 1, 3, 5, 60) foo ) SELECT bin, avg_value from b order by bin; From fa14b8d9bbdbd878c5eb0bfdc0d9ef2827ba02fc Mon Sep 17 00:00:00 2001 From: abelvm Date: Thu, 18 Aug 2016 16:33:38 -0400 Subject: [PATCH 10/11] SRG test --- src/pg/test/expected/19_contour_test.out | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/pg/test/expected/19_contour_test.out b/src/pg/test/expected/19_contour_test.out index ff25bdf..1404c96 100644 --- a/src/pg/test/expected/19_contour_test.out +++ b/src/pg/test/expected/19_contour_test.out @@ -1,9 +1,9 @@ SET client_min_messages TO WARNING; \set ECHO none bin|avg_value -0|288.072587252400038769154 -1|419.135632263450792294773 -2|476.5442571943126671224 -3|524.0779810136330772726 -4|611.26099562647644421297 +0|280.23070673030491816424178 +1|413.81702914846213479025305 +2|479.6334491374486884098328 +3|529.1545236882183479447113 +4|614.1132081424930103122037 (5 rows) From ef436546b4f113eb425c967e30fd577d07076f69 Mon Sep 17 00:00:00 2001 From: abelvm Date: Mon, 29 Aug 2016 11:26:02 -0400 Subject: [PATCH 11/11] add cdb_utils --- src/pg/sql/19_contour.sql | 449 -------------------------------------- src/pg/sql/cdb_utils.sql | 447 +++++++++++++++++++++++++++++++++++++ 2 files changed, 447 insertions(+), 449 deletions(-) create mode 100644 src/pg/sql/cdb_utils.sql diff --git a/src/pg/sql/19_contour.sql b/src/pg/sql/19_contour.sql index c0a64c2..759b151 100644 --- a/src/pg/sql/19_contour.sql +++ b/src/pg/sql/19_contour.sql @@ -203,452 +203,3 @@ BEGIN END; $$ language plpgsql; - - --- --- Fill given extent with a rectangular coverage --- --- @param ext Extent to fill. Only rectangles with center point falling --- inside the extent (or at the lower or leftmost edge) will --- be emitted. The returned hexagons will have the same SRID --- as this extent. --- --- @param width With of each rectangle --- --- @param height Height of each rectangle --- --- @param origin Optional origin to allow for exact tiling. --- If omitted the origin will be 0,0. --- The parameter is checked for having the same SRID --- as the extent. --- --- -CREATE OR REPLACE FUNCTION CDB_RectangleGrid(ext GEOMETRY, width FLOAT8, height FLOAT8, origin GEOMETRY DEFAULT NULL) -RETURNS SETOF GEOMETRY -AS $$ -DECLARE - h GEOMETRY; -- rectangle cell - hstep FLOAT8; -- horizontal step - vstep FLOAT8; -- vertical step - hw FLOAT8; -- half width - hh FLOAT8; -- half height - vstart FLOAT8; - hstart FLOAT8; - hend FLOAT8; - vend FLOAT8; - xoff FLOAT8; - yoff FLOAT8; - xgrd FLOAT8; - ygrd FLOAT8; - x FLOAT8; - y FLOAT8; - srid INTEGER; -BEGIN - - srid := ST_SRID(ext); - - xoff := 0; - yoff := 0; - - IF origin IS NOT NULL THEN - IF ST_SRID(origin) != srid THEN - RAISE EXCEPTION 'SRID mismatch between extent (%) and origin (%)', srid, ST_SRID(origin); - END IF; - xoff := ST_X(origin); - yoff := ST_Y(origin); - END IF; - - --RAISE DEBUG 'X offset: %', xoff; - --RAISE DEBUG 'Y offset: %', yoff; - - hw := width/2.0; - hh := height/2.0; - - xgrd := hw; - ygrd := hh; - --RAISE DEBUG 'X grid size: %', xgrd; - --RAISE DEBUG 'Y grid size: %', ygrd; - - hstep := width; - vstep := height; - - -- Tweak horizontal start on hstep grid from origin - hstart := xoff + ceil((ST_XMin(ext)-xoff)/hstep)*hstep; - --RAISE DEBUG 'hstart: %', hstart; - - -- Tweak vertical start on vstep grid from origin - vstart := yoff + ceil((ST_Ymin(ext)-yoff)/vstep)*vstep; - --RAISE DEBUG 'vstart: %', vstart; - - hend := ST_XMax(ext); - vend := ST_YMax(ext); - - --RAISE DEBUG 'hend: %', hend; - --RAISE DEBUG 'vend: %', vend; - - x := hstart; - WHILE x < hend LOOP -- over X - y := vstart; - h := ST_MakeEnvelope(x-hw, y-hh, x+hw, y+hh, srid); - WHILE y < vend LOOP -- over Y - RETURN NEXT h; - h := ST_Translate(h, 0, vstep); - y := yoff + round(((y + vstep)-yoff)/ygrd)*ygrd; -- round to grid - END LOOP; - x := xoff + round(((x + hstep)-xoff)/xgrd)*xgrd; -- round to grid - END LOOP; - - RETURN; -END -$$ LANGUAGE 'plpgsql' IMMUTABLE; - --- --- Calculate the equal interval bins for a given column --- --- @param in_array A numeric array of numbers to determine the best --- to determine the bin boundary --- --- @param breaks The number of bins you want to find. --- --- --- Returns: upper edges of bins --- --- - -CREATE OR REPLACE FUNCTION CDB_EqualIntervalBins ( in_array NUMERIC[], breaks INT ) RETURNS NUMERIC[] as $$ -DECLARE - diff numeric; - min_val numeric; - max_val numeric; - tmp_val numeric; - i INT := 1; - reply numeric[]; -BEGIN - SELECT min(e), max(e) INTO min_val, max_val FROM ( SELECT unnest(in_array) e ) x WHERE e IS NOT NULL; - diff = (max_val - min_val) / breaks::numeric; - LOOP - IF i < breaks THEN - tmp_val = min_val + i::numeric * diff; - reply = array_append(reply, tmp_val); - i := i+1; - ELSE - reply = array_append(reply, max_val); - EXIT; - END IF; - END LOOP; - RETURN reply; -END; -$$ language plpgsql IMMUTABLE; - --- --- Determine the Heads/Tails classifications from a numeric array --- --- @param in_array A numeric array of numbers to determine the best --- bins based on the Heads/Tails method. --- --- @param breaks The number of bins you want to find. --- --- - -CREATE OR REPLACE FUNCTION CDB_HeadsTailsBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$ -DECLARE - element_count INT4; - arr_mean numeric; - i INT := 2; - reply numeric[]; -BEGIN - -- get the total size of our row - element_count := array_upper(in_array, 1) - array_lower(in_array, 1); - -- ensure the ordering of in_array - SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x; - -- stop if no rows - IF element_count IS NULL THEN - RETURN NULL; - END IF; - -- stop if our breaks are more than our input array size - IF element_count < breaks THEN - RETURN in_array; - END IF; - - -- get our mean value - SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x; - - reply = Array[arr_mean]; - -- slice our bread - LOOP - IF i > breaks THEN EXIT; END IF; - SELECT avg(e) INTO arr_mean FROM ( SELECT unnest(in_array) e) x WHERE e > reply[i-1]; - IF arr_mean IS NOT NULL THEN - reply = array_append(reply, arr_mean); - END IF; - i := i+1; - END LOOP; - RETURN reply; -END; -$$ language plpgsql IMMUTABLE; - --- --- Determine the Jenks classifications from a numeric array --- --- @param in_array A numeric array of numbers to determine the best --- bins based on the Jenks method. --- --- @param breaks The number of bins you want to find. --- --- @param iterations The number of different starting positions to test. --- --- @param invert Optional wheter to return the top of each bin (default) --- or the bottom. BOOLEAN, default=FALSE. --- --- - - -CREATE OR REPLACE FUNCTION CDB_JenksBins ( in_array NUMERIC[], breaks INT, iterations INT DEFAULT 5, invert BOOLEAN DEFAULT FALSE) RETURNS NUMERIC[] as $$ -DECLARE - element_count INT4; - arr_mean NUMERIC; - bot INT; - top INT; - tops INT[]; - classes INT[][]; - i INT := 1; j INT := 1; - curr_result NUMERIC[]; - best_result NUMERIC[]; - seedtarget TEXT; - quant NUMERIC[]; - shuffles INT; -BEGIN - -- get the total size of our row - element_count := array_length(in_array, 1); --array_upper(in_array, 1) - array_lower(in_array, 1); - -- ensure the ordering of in_array - SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x; - -- stop if no rows - IF element_count IS NULL THEN - RETURN NULL; - END IF; - -- stop if our breaks are more than our input array size - IF element_count < breaks THEN - RETURN in_array; - END IF; - - shuffles := LEAST(GREATEST(floor(2500000.0/(element_count::float*iterations::float)), 1), 750)::int; - -- get our mean value - SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x; - - -- assume best is actually Quantile - SELECT cdb_crankshaft.CDB_QuantileBins(in_array, breaks) INTO quant; - - -- if data is very very large, just return quant and be done - IF element_count > 5000000 THEN - RETURN quant; - END IF; - - -- change quant into bottom, top markers - LOOP - IF i = 1 THEN - bot = 1; - ELSE - -- use last top to find this bot - bot = top+1; - END IF; - IF i = breaks THEN - top = element_count; - ELSE - SELECT count(*) INTO top FROM ( SELECT unnest(in_array) as v) x WHERE v <= quant[i]; - END IF; - IF i = 1 THEN - classes = ARRAY[ARRAY[bot,top]]; - ELSE - classes = ARRAY_CAT(classes,ARRAY[bot,top]); - END IF; - IF i > breaks THEN EXIT; END IF; - i = i+1; - END LOOP; - - best_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles); - - --set the seed so we can ensure the same results - SELECT setseed(0.4567) INTO seedtarget; - --loop through random starting positions - LOOP - IF j > iterations-1 THEN EXIT; END IF; - i = 1; - tops = ARRAY[element_count]; - LOOP - IF i = breaks THEN EXIT; END IF; - SELECT array_agg(distinct e) INTO tops FROM (SELECT unnest(array_cat(tops, ARRAY[floor(random()*element_count::float)::int])) as e ORDER BY e) x WHERE e != 1; - i = array_length(tops, 1); - END LOOP; - i = 1; - LOOP - IF i > breaks THEN EXIT; END IF; - IF i = 1 THEN - bot = 1; - ELSE - bot = top+1; - END IF; - top = tops[i]; - IF i = 1 THEN - classes = ARRAY[ARRAY[bot,top]]; - ELSE - classes = ARRAY_CAT(classes,ARRAY[bot,top]); - END IF; - i := i+1; - END LOOP; - curr_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles); - - IF curr_result[1] > best_result[1] THEN - best_result = curr_result; - j = j-1; -- if we found a better result, add one more search - END IF; - j = j+1; - END LOOP; - - RETURN (best_result)[2:array_upper(best_result, 1)]; -END; -$$ language plpgsql IMMUTABLE; - - - --- --- Perform a single iteration of the Jenks classification --- - -CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_array NUMERIC[], breaks INT, classes INT[][], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$ -DECLARE - tmp_val numeric; - new_classes int[][]; - tmp_class int[]; - i INT := 1; - j INT := 1; - side INT := 2; - sdam numeric; - gvf numeric := 0.0; - new_gvf numeric; - arr_gvf numeric[]; - class_avg numeric; - class_max_i INT; - class_min_i INT; - class_max numeric; - class_min numeric; - reply numeric[]; -BEGIN - - -- Calculate the sum of squared deviations from the array mean (SDAM). - SELECT sum((arr_mean - e)^2) INTO sdam FROM ( SELECT unnest(in_array) as e ) x; - --Identify the breaks for the lowest GVF - LOOP - i = 1; - LOOP - -- get our mean - SELECT avg(e) INTO class_avg FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e) x; - -- find the deviation - SELECT sum((class_avg-e)^2) INTO tmp_val FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e ) x; - IF i = 1 THEN - arr_gvf = ARRAY[tmp_val]; - -- init our min/max map for later - class_max = arr_gvf[i]; - class_min = arr_gvf[i]; - class_min_i = 1; - class_max_i = 1; - ELSE - arr_gvf = array_append(arr_gvf, tmp_val); - END IF; - i := i+1; - IF i > breaks THEN EXIT; END IF; - END LOOP; - -- calculate our new GVF - SELECT sdam-sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) as e ) x; - -- if no improvement was made, exit - IF new_gvf < gvf THEN EXIT; END IF; - gvf = new_gvf; - IF j > max_search THEN EXIT; END IF; - j = j+1; - i = 1; - LOOP - --establish directionality (uppward through classes or downward) - IF arr_gvf[i] < class_min THEN - class_min = arr_gvf[i]; - class_min_i = i; - END IF; - IF arr_gvf[i] > class_max THEN - class_max = arr_gvf[i]; - class_max_i = i; - END IF; - i := i+1; - IF i > breaks THEN EXIT; END IF; - END LOOP; - IF class_max_i > class_min_i THEN - class_min_i = class_max_i - 1; - ELSE - class_min_i = class_max_i + 1; - END IF; - --Move from higher class to a lower gid order - IF class_max_i > class_min_i THEN - classes[class_max_i][1] = classes[class_max_i][1] + 1; - classes[class_min_i][2] = classes[class_min_i][2] + 1; - ELSE -- Move from lower class UP into a higher class by gid - classes[class_max_i][2] = classes[class_max_i][2] - 1; - classes[class_min_i][1] = classes[class_min_i][1] - 1; - END IF; - END LOOP; - - i = 1; - LOOP - IF invert = TRUE THEN - side = 1; --default returns bottom side of breaks, invert returns top side - END IF; - reply = array_append(reply, in_array[classes[i][side]]); - i = i+1; - IF i > breaks THEN EXIT; END IF; - END LOOP; - - RETURN array_prepend(gvf, reply); - -END; -$$ language plpgsql IMMUTABLE; - - --- --- Determine the Quantile classifications from a numeric array --- --- @param in_array A numeric array of numbers to determine the best --- bins based on the Quantile method. --- --- @param breaks The number of bins you want to find. --- --- -CREATE OR REPLACE FUNCTION CDB_QuantileBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$ -DECLARE - element_count INT4; - break_size numeric; - tmp_val numeric; - i INT := 1; - reply numeric[]; -BEGIN - -- sort our values - SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e ASC) x; - -- get the total size of our data - element_count := array_length(in_array, 1); - break_size := element_count::numeric / breaks; - -- slice our bread - LOOP - IF i < breaks THEN - IF break_size * i % 1 > 0 THEN - SELECT e INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 1 OFFSET ceil(break_size * i) - 1) x; - ELSE - SELECT avg(e) INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 2 OFFSET ceil(break_size * i) - 1 ) x; - END IF; - ELSIF i = breaks THEN - -- select the last value - SELECT max(e) INTO tmp_val FROM ( SELECT unnest(in_array) e ) x; - ELSE - EXIT; - END IF; - - reply = array_append(reply, tmp_val); - i := i+1; - END LOOP; - RETURN reply; -END; -$$ language plpgsql IMMUTABLE; diff --git a/src/pg/sql/cdb_utils.sql b/src/pg/sql/cdb_utils.sql new file mode 100644 index 0000000..8f87084 --- /dev/null +++ b/src/pg/sql/cdb_utils.sql @@ -0,0 +1,447 @@ +-- +-- Fill given extent with a rectangular coverage +-- +-- @param ext Extent to fill. Only rectangles with center point falling +-- inside the extent (or at the lower or leftmost edge) will +-- be emitted. The returned hexagons will have the same SRID +-- as this extent. +-- +-- @param width With of each rectangle +-- +-- @param height Height of each rectangle +-- +-- @param origin Optional origin to allow for exact tiling. +-- If omitted the origin will be 0,0. +-- The parameter is checked for having the same SRID +-- as the extent. +-- +-- +CREATE OR REPLACE FUNCTION CDB_RectangleGrid(ext GEOMETRY, width FLOAT8, height FLOAT8, origin GEOMETRY DEFAULT NULL) +RETURNS SETOF GEOMETRY +AS $$ +DECLARE + h GEOMETRY; -- rectangle cell + hstep FLOAT8; -- horizontal step + vstep FLOAT8; -- vertical step + hw FLOAT8; -- half width + hh FLOAT8; -- half height + vstart FLOAT8; + hstart FLOAT8; + hend FLOAT8; + vend FLOAT8; + xoff FLOAT8; + yoff FLOAT8; + xgrd FLOAT8; + ygrd FLOAT8; + x FLOAT8; + y FLOAT8; + srid INTEGER; +BEGIN + + srid := ST_SRID(ext); + + xoff := 0; + yoff := 0; + + IF origin IS NOT NULL THEN + IF ST_SRID(origin) != srid THEN + RAISE EXCEPTION 'SRID mismatch between extent (%) and origin (%)', srid, ST_SRID(origin); + END IF; + xoff := ST_X(origin); + yoff := ST_Y(origin); + END IF; + + --RAISE DEBUG 'X offset: %', xoff; + --RAISE DEBUG 'Y offset: %', yoff; + + hw := width/2.0; + hh := height/2.0; + + xgrd := hw; + ygrd := hh; + --RAISE DEBUG 'X grid size: %', xgrd; + --RAISE DEBUG 'Y grid size: %', ygrd; + + hstep := width; + vstep := height; + + -- Tweak horizontal start on hstep grid from origin + hstart := xoff + ceil((ST_XMin(ext)-xoff)/hstep)*hstep; + --RAISE DEBUG 'hstart: %', hstart; + + -- Tweak vertical start on vstep grid from origin + vstart := yoff + ceil((ST_Ymin(ext)-yoff)/vstep)*vstep; + --RAISE DEBUG 'vstart: %', vstart; + + hend := ST_XMax(ext); + vend := ST_YMax(ext); + + --RAISE DEBUG 'hend: %', hend; + --RAISE DEBUG 'vend: %', vend; + + x := hstart; + WHILE x < hend LOOP -- over X + y := vstart; + h := ST_MakeEnvelope(x-hw, y-hh, x+hw, y+hh, srid); + WHILE y < vend LOOP -- over Y + RETURN NEXT h; + h := ST_Translate(h, 0, vstep); + y := yoff + round(((y + vstep)-yoff)/ygrd)*ygrd; -- round to grid + END LOOP; + x := xoff + round(((x + hstep)-xoff)/xgrd)*xgrd; -- round to grid + END LOOP; + + RETURN; +END +$$ LANGUAGE 'plpgsql' IMMUTABLE; + +-- +-- Calculate the equal interval bins for a given column +-- +-- @param in_array A numeric array of numbers to determine the best +-- to determine the bin boundary +-- +-- @param breaks The number of bins you want to find. +-- +-- +-- Returns: upper edges of bins +-- +-- + +CREATE OR REPLACE FUNCTION CDB_EqualIntervalBins ( in_array NUMERIC[], breaks INT ) RETURNS NUMERIC[] as $$ +DECLARE + diff numeric; + min_val numeric; + max_val numeric; + tmp_val numeric; + i INT := 1; + reply numeric[]; +BEGIN + SELECT min(e), max(e) INTO min_val, max_val FROM ( SELECT unnest(in_array) e ) x WHERE e IS NOT NULL; + diff = (max_val - min_val) / breaks::numeric; + LOOP + IF i < breaks THEN + tmp_val = min_val + i::numeric * diff; + reply = array_append(reply, tmp_val); + i := i+1; + ELSE + reply = array_append(reply, max_val); + EXIT; + END IF; + END LOOP; + RETURN reply; +END; +$$ language plpgsql IMMUTABLE; + +-- +-- Determine the Heads/Tails classifications from a numeric array +-- +-- @param in_array A numeric array of numbers to determine the best +-- bins based on the Heads/Tails method. +-- +-- @param breaks The number of bins you want to find. +-- +-- + +CREATE OR REPLACE FUNCTION CDB_HeadsTailsBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$ +DECLARE + element_count INT4; + arr_mean numeric; + i INT := 2; + reply numeric[]; +BEGIN + -- get the total size of our row + element_count := array_upper(in_array, 1) - array_lower(in_array, 1); + -- ensure the ordering of in_array + SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x; + -- stop if no rows + IF element_count IS NULL THEN + RETURN NULL; + END IF; + -- stop if our breaks are more than our input array size + IF element_count < breaks THEN + RETURN in_array; + END IF; + + -- get our mean value + SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x; + + reply = Array[arr_mean]; + -- slice our bread + LOOP + IF i > breaks THEN EXIT; END IF; + SELECT avg(e) INTO arr_mean FROM ( SELECT unnest(in_array) e) x WHERE e > reply[i-1]; + IF arr_mean IS NOT NULL THEN + reply = array_append(reply, arr_mean); + END IF; + i := i+1; + END LOOP; + RETURN reply; +END; +$$ language plpgsql IMMUTABLE; + +-- +-- Determine the Jenks classifications from a numeric array +-- +-- @param in_array A numeric array of numbers to determine the best +-- bins based on the Jenks method. +-- +-- @param breaks The number of bins you want to find. +-- +-- @param iterations The number of different starting positions to test. +-- +-- @param invert Optional wheter to return the top of each bin (default) +-- or the bottom. BOOLEAN, default=FALSE. +-- +-- + + +CREATE OR REPLACE FUNCTION CDB_JenksBins ( in_array NUMERIC[], breaks INT, iterations INT DEFAULT 5, invert BOOLEAN DEFAULT FALSE) RETURNS NUMERIC[] as $$ +DECLARE + element_count INT4; + arr_mean NUMERIC; + bot INT; + top INT; + tops INT[]; + classes INT[][]; + i INT := 1; j INT := 1; + curr_result NUMERIC[]; + best_result NUMERIC[]; + seedtarget TEXT; + quant NUMERIC[]; + shuffles INT; +BEGIN + -- get the total size of our row + element_count := array_length(in_array, 1); --array_upper(in_array, 1) - array_lower(in_array, 1); + -- ensure the ordering of in_array + SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x; + -- stop if no rows + IF element_count IS NULL THEN + RETURN NULL; + END IF; + -- stop if our breaks are more than our input array size + IF element_count < breaks THEN + RETURN in_array; + END IF; + + shuffles := LEAST(GREATEST(floor(2500000.0/(element_count::float*iterations::float)), 1), 750)::int; + -- get our mean value + SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x; + + -- assume best is actually Quantile + SELECT cdb_crankshaft.CDB_QuantileBins(in_array, breaks) INTO quant; + + -- if data is very very large, just return quant and be done + IF element_count > 5000000 THEN + RETURN quant; + END IF; + + -- change quant into bottom, top markers + LOOP + IF i = 1 THEN + bot = 1; + ELSE + -- use last top to find this bot + bot = top+1; + END IF; + IF i = breaks THEN + top = element_count; + ELSE + SELECT count(*) INTO top FROM ( SELECT unnest(in_array) as v) x WHERE v <= quant[i]; + END IF; + IF i = 1 THEN + classes = ARRAY[ARRAY[bot,top]]; + ELSE + classes = ARRAY_CAT(classes,ARRAY[bot,top]); + END IF; + IF i > breaks THEN EXIT; END IF; + i = i+1; + END LOOP; + + best_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles); + + --set the seed so we can ensure the same results + SELECT setseed(0.4567) INTO seedtarget; + --loop through random starting positions + LOOP + IF j > iterations-1 THEN EXIT; END IF; + i = 1; + tops = ARRAY[element_count]; + LOOP + IF i = breaks THEN EXIT; END IF; + SELECT array_agg(distinct e) INTO tops FROM (SELECT unnest(array_cat(tops, ARRAY[floor(random()*element_count::float)::int])) as e ORDER BY e) x WHERE e != 1; + i = array_length(tops, 1); + END LOOP; + i = 1; + LOOP + IF i > breaks THEN EXIT; END IF; + IF i = 1 THEN + bot = 1; + ELSE + bot = top+1; + END IF; + top = tops[i]; + IF i = 1 THEN + classes = ARRAY[ARRAY[bot,top]]; + ELSE + classes = ARRAY_CAT(classes,ARRAY[bot,top]); + END IF; + i := i+1; + END LOOP; + curr_result = cdb_crankshaft.CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles); + + IF curr_result[1] > best_result[1] THEN + best_result = curr_result; + j = j-1; -- if we found a better result, add one more search + END IF; + j = j+1; + END LOOP; + + RETURN (best_result)[2:array_upper(best_result, 1)]; +END; +$$ language plpgsql IMMUTABLE; + + + +-- +-- Perform a single iteration of the Jenks classification +-- + +CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_array NUMERIC[], breaks INT, classes INT[][], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$ +DECLARE + tmp_val numeric; + new_classes int[][]; + tmp_class int[]; + i INT := 1; + j INT := 1; + side INT := 2; + sdam numeric; + gvf numeric := 0.0; + new_gvf numeric; + arr_gvf numeric[]; + class_avg numeric; + class_max_i INT; + class_min_i INT; + class_max numeric; + class_min numeric; + reply numeric[]; +BEGIN + + -- Calculate the sum of squared deviations from the array mean (SDAM). + SELECT sum((arr_mean - e)^2) INTO sdam FROM ( SELECT unnest(in_array) as e ) x; + --Identify the breaks for the lowest GVF + LOOP + i = 1; + LOOP + -- get our mean + SELECT avg(e) INTO class_avg FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e) x; + -- find the deviation + SELECT sum((class_avg-e)^2) INTO tmp_val FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e ) x; + IF i = 1 THEN + arr_gvf = ARRAY[tmp_val]; + -- init our min/max map for later + class_max = arr_gvf[i]; + class_min = arr_gvf[i]; + class_min_i = 1; + class_max_i = 1; + ELSE + arr_gvf = array_append(arr_gvf, tmp_val); + END IF; + i := i+1; + IF i > breaks THEN EXIT; END IF; + END LOOP; + -- calculate our new GVF + SELECT sdam-sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) as e ) x; + -- if no improvement was made, exit + IF new_gvf < gvf THEN EXIT; END IF; + gvf = new_gvf; + IF j > max_search THEN EXIT; END IF; + j = j+1; + i = 1; + LOOP + --establish directionality (uppward through classes or downward) + IF arr_gvf[i] < class_min THEN + class_min = arr_gvf[i]; + class_min_i = i; + END IF; + IF arr_gvf[i] > class_max THEN + class_max = arr_gvf[i]; + class_max_i = i; + END IF; + i := i+1; + IF i > breaks THEN EXIT; END IF; + END LOOP; + IF class_max_i > class_min_i THEN + class_min_i = class_max_i - 1; + ELSE + class_min_i = class_max_i + 1; + END IF; + --Move from higher class to a lower gid order + IF class_max_i > class_min_i THEN + classes[class_max_i][1] = classes[class_max_i][1] + 1; + classes[class_min_i][2] = classes[class_min_i][2] + 1; + ELSE -- Move from lower class UP into a higher class by gid + classes[class_max_i][2] = classes[class_max_i][2] - 1; + classes[class_min_i][1] = classes[class_min_i][1] - 1; + END IF; + END LOOP; + + i = 1; + LOOP + IF invert = TRUE THEN + side = 1; --default returns bottom side of breaks, invert returns top side + END IF; + reply = array_append(reply, in_array[classes[i][side]]); + i = i+1; + IF i > breaks THEN EXIT; END IF; + END LOOP; + + RETURN array_prepend(gvf, reply); + +END; +$$ language plpgsql IMMUTABLE; + + +-- +-- Determine the Quantile classifications from a numeric array +-- +-- @param in_array A numeric array of numbers to determine the best +-- bins based on the Quantile method. +-- +-- @param breaks The number of bins you want to find. +-- +-- +CREATE OR REPLACE FUNCTION CDB_QuantileBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$ +DECLARE + element_count INT4; + break_size numeric; + tmp_val numeric; + i INT := 1; + reply numeric[]; +BEGIN + -- sort our values + SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e ASC) x; + -- get the total size of our data + element_count := array_length(in_array, 1); + break_size := element_count::numeric / breaks; + -- slice our bread + LOOP + IF i < breaks THEN + IF break_size * i % 1 > 0 THEN + SELECT e INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 1 OFFSET ceil(break_size * i) - 1) x; + ELSE + SELECT avg(e) INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 2 OFFSET ceil(break_size * i) - 1 ) x; + END IF; + ELSIF i = breaks THEN + -- select the last value + SELECT max(e) INTO tmp_val FROM ( SELECT unnest(in_array) e ) x; + ELSE + EXIT; + END IF; + + reply = array_append(reply, tmp_val); + i := i+1; + END LOOP; + RETURN reply; +END; +$$ language plpgsql IMMUTABLE;