Merge pull request #340 from Algunenano/trav_jenkins

Improvements in `CDB_JenksBins`
This commit is contained in:
Raúl Marín 2018-09-13 10:24:41 +02:00 committed by GitHub
commit 5033a0ba35
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 321 additions and 168 deletions

View File

@ -1,7 +1,7 @@
# cartodb/Makefile # cartodb/Makefile
EXTENSION = cartodb EXTENSION = cartodb
EXTVERSION = 0.23.2 EXTVERSION = 0.24.0
SED = sed SED = sed
AWK = awk AWK = awk

View File

@ -1,6 +1,13 @@
0.24.0 (2018-XX-XX) 0.24.0 (2018-XX-XX)
* Travis: Test with PostgreSQL 9.5 and 10. * Travis: Test with PostgreSQL 9.5 and 10.
* _cdb_estimated_extent: Fix bug with ST_EstimatedExtent interaction. * _cdb_estimated_extent: Fix bug with ST_EstimatedExtent interaction.
* Improvements in `CDB_JenksBins`.
* Now it ignores NULLs.
* No longer puts the same value in multiple categories.
* Removes all limits related to size.
* If not set, the number of iterations done is based now on the size of the array.
* Fixed multiple bugs.
* The internal function `CDB_JenksBinsIteration` has changed its signature.
0.23.2 (2018-07-19) 0.23.2 (2018-07-19)
* Fix `CDB_QueryTablesText` with parenthesized queries (#335) * Fix `CDB_QueryTablesText` with parenthesized queries (#335)

View File

@ -10,212 +10,337 @@
-- --
-- @param invert Optional wheter to return the top of each bin (default) -- @param invert Optional wheter to return the top of each bin (default)
-- or the bottom. BOOLEAN, default=FALSE. -- or the bottom. BOOLEAN, default=FALSE.
-- --
-- --
CREATE OR REPLACE FUNCTION CDB_JenksBins(in_array NUMERIC[], breaks INT, iterations INT DEFAULT 0, invert BOOLEAN DEFAULT FALSE)
RETURNS NUMERIC[] as
$$
DECLARE
in_matrix NUMERIC[][];
in_unique_count BIGINT;
CREATE OR REPLACE FUNCTION CDB_JenksBins ( in_array NUMERIC[], breaks INT, iterations INT DEFAULT 5, invert BOOLEAN DEFAULT FALSE) RETURNS NUMERIC[] as $$ shuffles INT;
DECLARE
element_count INT4;
arr_mean NUMERIC; arr_mean NUMERIC;
sdam NUMERIC;
i INT;
bot INT; bot INT;
top INT; top INT;
tops INT[]; tops INT[];
classes INT[][]; classes INT[][];
i INT := 1; j INT := 1; j INT := 1;
curr_result NUMERIC[]; curr_result NUMERIC[];
best_result NUMERIC[]; best_result NUMERIC[];
seedtarget TEXT; seedtarget TEXT;
quant NUMERIC[];
shuffles INT;
BEGIN BEGIN
-- get the total size of our row -- We clean the input array (remove NULLs) and create 2 arrays
element_count := array_length(in_array, 1); --array_upper(in_array, 1) - array_lower(in_array, 1); -- [1] contains the unique values in in_array
-- ensure the ordering of in_array -- [2] contains the number of appearances of those unique values
SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x; SELECT ARRAY[array_agg(value), array_agg(count)] FROM
-- stop if no rows (
IF element_count IS NULL THEN SELECT value, count(1)::numeric as count
RETURN NULL; FROM unnest(in_array) AS value
END IF; WHERE value is NOT NULL
-- stop if our breaks are more than our input array size GROUP BY value
IF element_count < breaks THEN ORDER BY value
RETURN in_array; ) __clean_array_q INTO in_matrix;
END IF;
shuffles := LEAST(GREATEST(floor(2500000.0/(element_count::float*iterations::float)), 1), 750)::int; -- Get the number of unique values
-- get our mean value in_unique_count := array_length(in_matrix[1:1], 2);
SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x;
-- assume best is actually Quantile IF in_unique_count IS NULL THEN
SELECT CDB_QuantileBins(in_array, breaks) INTO quant; RETURN NULL;
-- if data is very very large, just return quant and be done
IF element_count > 5000000 THEN
RETURN quant;
END IF; END IF;
-- change quant into bottom, top markers IF in_unique_count <= breaks THEN
LOOP -- There isn't enough distinct values for the requested breaks
IF i = 1 THEN RETURN ARRAY(Select unnest(in_matrix[1:1])) _a;
bot = 1; END IF;
ELSE
-- use last top to find this bot -- If not declated explicitly we iterate based on the length of the array
bot = top+1; IF iterations < 1 THEN
END IF; -- This is based on a 'looks fine' heuristic
IF i = breaks THEN iterations := log(in_unique_count)::integer + 1;
top = element_count; END IF;
-- We set the number of shuffles per iteration as the number of unique values but
-- this is just another 'looks fine' heuristic
shuffles := in_unique_count;
-- Get the mean value of the whole vector (already ignores NULLs)
SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x;
-- Calculate the sum of squared deviations from the array mean (SDAM).
SELECT sum(((arr_mean - v)^2) * w) INTO sdam FROM (
SELECT unnest(in_matrix[1:1]) as v, unnest(in_matrix[2:2]) as w
) x;
-- To start, we create ranges with approximately the same amount of different values
top := 0;
i := 1;
LOOP
bot := top + 1;
top := ROUND(i * in_unique_count::numeric / breaks::NUMERIC);
IF i = 1 THEN
classes = ARRAY[ARRAY[bot,top]];
ELSE ELSE
SELECT count(*) INTO top FROM ( SELECT unnest(in_array) as v) x WHERE v <= quant[i]; classes = ARRAY_CAT(classes, ARRAY[bot,top]);
END IF;
IF i = 1 THEN
classes = ARRAY[ARRAY[bot,top]];
ELSE
classes = ARRAY_CAT(classes,ARRAY[bot,top]);
END IF; END IF;
i := i + 1;
IF i > breaks THEN EXIT; END IF; IF i > breaks THEN EXIT; END IF;
i = i+1;
END LOOP; END LOOP;
best_result = CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles); best_result = CDB_JenksBinsIteration(in_matrix, breaks, classes, invert, sdam, shuffles);
--set the seed so we can ensure the same results --set the seed so we can ensure the same results
SELECT setseed(0.4567) INTO seedtarget; SELECT setseed(0.4567) INTO seedtarget;
--loop through random starting positions --loop through random starting positions
LOOP LOOP
IF j > iterations-1 THEN EXIT; END IF; IF j > iterations-1 THEN EXIT; END IF;
i = 1; i = 1;
tops = ARRAY[element_count]; tops = ARRAY[in_unique_count];
LOOP LOOP
IF i = breaks THEN EXIT; END IF; IF i = breaks THEN EXIT; END IF;
SELECT array_agg(distinct e) INTO tops FROM (SELECT unnest(array_cat(tops, ARRAY[floor(random()*element_count::float)::int])) as e ORDER BY e) x WHERE e != 1; SELECT array_agg(distinct e) INTO tops FROM (
i = array_length(tops, 1); SELECT unnest(array_cat(tops, ARRAY[trunc(random() * in_unique_count::float8)::int + 1])) as e ORDER BY e
END LOOP; ) x;
i = array_length(tops, 1);
END LOOP;
top := 0;
i = 1; i = 1;
LOOP LOOP
IF i > breaks THEN EXIT; END IF; bot := top + 1;
IF i = 1 THEN
bot = 1;
ELSE
bot = top+1;
END IF;
top = tops[i]; top = tops[i];
IF i = 1 THEN IF i = 1 THEN
classes = ARRAY[ARRAY[bot,top]]; classes = ARRAY[ARRAY[bot,top]];
ELSE ELSE
classes = ARRAY_CAT(classes,ARRAY[bot,top]); classes = ARRAY_CAT(classes, ARRAY[bot,top]);
END IF; END IF;
i := i+1;
END LOOP; i := i+1;
curr_result = CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles); IF i > breaks THEN EXIT; END IF;
END LOOP;
curr_result = CDB_JenksBinsIteration(in_matrix, breaks, classes, invert, sdam, shuffles);
IF curr_result[1] > best_result[1] THEN IF curr_result[1] > best_result[1] THEN
best_result = curr_result; best_result = curr_result;
j = j-1; -- if we found a better result, add one more search
END IF; END IF;
j = j+1; j = j+1;
END LOOP; END LOOP;
RETURN (best_result)[2:array_upper(best_result, 1)]; RETURN (best_result)[2:array_upper(best_result, 1)];
END; END;
$$ language plpgsql VOLATILE PARALLEL RESTRICTED; $$ LANGUAGE PLPGSQL IMMUTABLE PARALLEL RESTRICTED;
-- --
-- Perform a single iteration of the Jenks classification -- Perform a single iteration of the Jenks classification
-- --
-- Returns an array with:
-- - First element: gvf
-- - Second to 2+n: Category limits
DROP FUNCTION IF EXISTS CDB_JenksBinsIteration ( in_matrix NUMERIC[], breaks INT, classes INT[], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT); -- Old signature
CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_matrix NUMERIC[], breaks INT, classes INT[], invert BOOLEAN, sdam NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$
DECLARE
i INT;
iterations INT = 0;
CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_array NUMERIC[], breaks INT, classes INT[][], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$
DECLARE
tmp_val numeric;
new_classes int[][];
tmp_class int[];
i INT := 1;
j INT := 1;
side INT := 2; side INT := 2;
sdam numeric;
gvf numeric := 0.0;
new_gvf numeric;
arr_gvf numeric[];
class_avg numeric;
class_max_i INT;
class_min_i INT;
class_max numeric;
class_min numeric;
reply numeric[];
BEGIN
-- Calculate the sum of squared deviations from the array mean (SDAM). gvf numeric := 0.0;
SELECT sum((arr_mean - e)^2) INTO sdam FROM ( SELECT unnest(in_array) as e ) x; new_gvf numeric;
--Identify the breaks for the lowest GVF arr_gvf numeric[];
LOOP arr_avg numeric[];
i = 1; class_avg numeric;
LOOP class_dev numeric;
-- get our mean
SELECT avg(e) INTO class_avg FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e) x; class_max_i INT = 0;
-- find the deviation class_min_i INT = 0;
SELECT sum((class_avg-e)^2) INTO tmp_val FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e ) x; dev_max numeric;
IF i = 1 THEN dev_min numeric;
arr_gvf = ARRAY[tmp_val];
-- init our min/max map for later best_classes INT[] = classes;
class_max = arr_gvf[i]; best_gvf numeric[];
class_min = arr_gvf[i]; best_avg numeric[];
class_min_i = 1; move_elements INT = 1;
class_max_i = 1;
ELSE reply numeric[];
arr_gvf = array_append(arr_gvf, tmp_val);
END IF; BEGIN
i := i+1;
IF i > breaks THEN EXIT; END IF; -- We fill the arrays with the initial values
END LOOP; i = 0;
-- calculate our new GVF LOOP
SELECT sdam-sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) as e ) x; IF i = breaks THEN EXIT; END IF;
-- if no improvement was made, exit i = i + 1;
IF new_gvf < gvf THEN EXIT; END IF;
gvf = new_gvf; -- Get class mean
IF j > max_search THEN EXIT; END IF; SELECT (sum(v * w) / sum(w)) INTO class_avg FROM (
j = j+1; SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v,
i = 1; unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w
LOOP ) x;
--establish directionality (uppward through classes or downward)
IF arr_gvf[i] < class_min THEN -- Get class deviation
class_min = arr_gvf[i]; SELECT sum((class_avg - v)^2 * w) INTO class_dev FROM (
class_min_i = i; SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v,
END IF; unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w
IF arr_gvf[i] > class_max THEN ) x;
class_max = arr_gvf[i];
class_max_i = i;
END IF; IF i = 1 THEN
i := i+1; arr_avg = ARRAY[class_avg];
IF i > breaks THEN EXIT; END IF; arr_gvf = ARRAY[class_dev];
END LOOP;
IF class_max_i > class_min_i THEN
class_min_i = class_max_i - 1;
ELSE ELSE
class_min_i = class_max_i + 1; arr_avg = array_append(arr_avg, class_avg);
arr_gvf = array_append(arr_gvf, class_dev);
END IF; END IF;
--Move from higher class to a lower gid order END LOOP;
IF class_max_i > class_min_i THEN
classes[class_max_i][1] = classes[class_max_i][1] + 1; -- We copy the values to avoid recalculation when a failure happens
classes[class_min_i][2] = classes[class_min_i][2] + 1; best_avg = arr_avg;
ELSE -- Move from lower class UP into a higher class by gid best_gvf = arr_gvf;
classes[class_max_i][2] = classes[class_max_i][2] - 1;
classes[class_min_i][1] = classes[class_min_i][1] - 1; iterations = 0;
LOOP
IF iterations = max_search THEN EXIT; END IF;
iterations = iterations + 1;
-- calculate our new GVF
SELECT sdam - sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) as e ) x;
-- Check if any improvement was made
IF new_gvf <= gvf THEN
-- If we were moving too many elements, go back and move less
IF move_elements <= 2 OR class_max_i = class_min_i THEN
EXIT;
END IF; END IF;
END LOOP;
move_elements = GREATEST(move_elements / 8, 1);
-- Rollback from saved statuses
classes = best_classes;
new_gvf = gvf;
i = class_min_i;
LOOP
arr_avg[i] = best_avg[i];
arr_gvf[i] = best_gvf[i];
IF i = class_max_i THEN EXIT; END IF;
i = i + 1;
END LOOP;
END IF;
-- We search for the classes with the min and max deviation
i = 1;
class_min_i = 1;
class_max_i = 1;
dev_max = arr_gvf[1];
dev_min = arr_gvf[1];
LOOP
IF i = breaks THEN EXIT; END IF;
i = i + 1;
IF arr_gvf[i] < dev_min THEN
dev_min = arr_gvf[i];
class_min_i = i;
ELSE
IF arr_gvf[i] > dev_max THEN
dev_max = arr_gvf[i];
class_max_i = i;
END IF;
END IF;
END LOOP;
-- Save best values for comparison and output
gvf = new_gvf;
best_classes = classes;
-- Limit the moved elements as to not remove everything from class_max_i
move_elements = LEAST(move_elements, classes[class_max_i][2] - classes[class_max_i][1]);
-- Move `move_elements` from class_max_i to class_min_i
IF class_min_i < class_max_i THEN
i := class_min_i;
LOOP
IF i = class_max_i THEN EXIT; END IF;
classes[i][2] = classes[i][2] + move_elements;
i := i + 1;
END LOOP;
i := class_max_i;
LOOP
IF i = class_min_i THEN EXIT; END IF;
classes[i][1] = classes[i][1] + move_elements;
i := i - 1;
END LOOP;
ELSE
i := class_min_i;
LOOP
IF i = class_max_i THEN EXIT; END IF;
classes[i][1] = classes[i][1] - move_elements;
i := i - 1;
END LOOP;
i := class_max_i;
LOOP
IF i = class_min_i THEN EXIT; END IF;
classes[i][2] = classes[i][2] - move_elements;
i := i + 1;
END LOOP;
END IF;
-- Recalculate avg and deviation ONLY for the affected classes
i = LEAST(class_min_i, class_max_i);
class_max_i = GREATEST(class_min_i, class_max_i);
class_min_i = i;
LOOP
SELECT (sum(v * w) / sum(w)) INTO class_avg FROM (
SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v,
unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w
) x;
SELECT sum((class_avg - v)^2 * w) INTO class_dev FROM (
SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v,
unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w
) x;
-- Save status (in case it's needed for rollback) and store the new one
best_avg[i] = arr_avg[i];
arr_avg[i] = class_avg;
best_gvf[i] = arr_gvf[i];
arr_gvf[i] = class_dev;
IF i = class_max_i THEN EXIT; END IF;
i = i + 1;
END LOOP;
move_elements = move_elements * 2;
END LOOP;
i = 1; i = 1;
LOOP LOOP
IF invert = TRUE THEN IF invert = TRUE THEN
side = 1; --default returns bottom side of breaks, invert returns top side side = 1; --default returns bottom side of breaks, invert returns top side
END IF; END IF;
reply = array_append(reply, in_array[classes[i][side]]); reply = array_append(reply, unnest(in_matrix[1:1][best_classes[i][side]:best_classes[i][side]]));
i = i+1; i = i+1;
IF i > breaks THEN EXIT; END IF; IF i > breaks THEN EXIT; END IF;
END LOOP; END LOOP;
RETURN array_prepend(gvf, reply);
END; reply = array_prepend(gvf, reply);
$$ language plpgsql IMMUTABLE PARALLEL SAFE; RETURN reply;
END;
$$ LANGUAGE PLPGSQL IMMUTABLE PARALLEL SAFE;

View File

@ -5,9 +5,8 @@ WITH data AS (
15.01, 14.99, 15.01, 14.99,
20.1, 19.9]::numeric[] AS s 20.1, 19.9]::numeric[] AS s
) )
-- expectation is: 1, 5, 10, 15, 20 SELECT unnest(CDB_JenksBins(s, 5)) FROM data;
-- TODO: fix cdb_jenksbins to match ^^
SELECT round(unnest(CDB_JenksBins(s, 5))) FROM data;
WITH data_nulls AS ( WITH data_nulls AS (
SELECT Array[0.99, 1.0, 1.01, SELECT Array[0.99, 1.0, 1.01,
@ -18,6 +17,20 @@ WITH data_nulls AS (
null, null, null, null,
20.1, 19.9]::numeric[] AS s 20.1, 19.9]::numeric[] AS s
) )
-- expectation is: 1, 5, 10, 15, 20 SELECT unnest(CDB_JenksBins(s, 5)) FROM data_nulls;
-- TODO: fix cdb_jenksbins to match ^^
SELECT round(unnest(CDB_JenksBins(s, 5))) FROM data_nulls;
WITH data_inverse AS (
SELECT Array[0.99, 1.0, 1.01,
4.99, 5.01,
10.01, 10.01,
15.01, 14.99,
20.1, 19.9]::numeric[] AS s
)
SELECT unnest(CDB_JenksBins(s, 5, 0, true)) FROM data_inverse;
WITH data_small AS (
SELECT Array[0.99, 1.0, 10.01, 10.01, 10.01, 10.01]::numeric[] AS s
)
SELECT unnest(CDB_JenksBins(s, 4)) FROM data_small;

View File

@ -1,10 +1,18 @@
1 1.01
5 5.01
10 10.01
20 15.01
20 20.1
1 1.01
5 5.01
10 10.01
20 15.01
20.1
0.99
4.99
10.01
14.99
19.9
0.99
1.0
10.01