Jenks: Fix multiple bugs
This commit is contained in:
parent
fcc06e82bf
commit
0e7d797400
@ -10,212 +10,319 @@
|
||||
--
|
||||
-- @param invert Optional wheter to return the top of each bin (default)
|
||||
-- or the bottom. BOOLEAN, default=FALSE.
|
||||
--
|
||||
--
|
||||
--
|
||||
|
||||
CREATE OR REPLACE FUNCTION CDB_JenksBins(in_array NUMERIC[], breaks INT, iterations INT DEFAULT 0, invert BOOLEAN DEFAULT FALSE)
|
||||
RETURNS NUMERIC[] as
|
||||
$$
|
||||
DECLARE
|
||||
in_matrix NUMERIC[][];
|
||||
in_unique_count BIGINT;
|
||||
|
||||
CREATE OR REPLACE FUNCTION CDB_JenksBins ( in_array NUMERIC[], breaks INT, iterations INT DEFAULT 5, invert BOOLEAN DEFAULT FALSE) RETURNS NUMERIC[] as $$
|
||||
DECLARE
|
||||
element_count INT4;
|
||||
shuffles INT;
|
||||
arr_mean NUMERIC;
|
||||
sdam NUMERIC;
|
||||
|
||||
i INT;
|
||||
bot INT;
|
||||
top INT;
|
||||
|
||||
tops INT[];
|
||||
classes INT[][];
|
||||
i INT := 1; j INT := 1;
|
||||
j INT := 1;
|
||||
curr_result NUMERIC[];
|
||||
best_result NUMERIC[];
|
||||
seedtarget TEXT;
|
||||
quant NUMERIC[];
|
||||
shuffles INT;
|
||||
|
||||
BEGIN
|
||||
-- get the total size of our row
|
||||
element_count := array_length(in_array, 1); --array_upper(in_array, 1) - array_lower(in_array, 1);
|
||||
-- ensure the ordering of in_array
|
||||
SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x;
|
||||
-- stop if no rows
|
||||
IF element_count IS NULL THEN
|
||||
RETURN NULL;
|
||||
END IF;
|
||||
-- stop if our breaks are more than our input array size
|
||||
IF element_count < breaks THEN
|
||||
RETURN in_array;
|
||||
END IF;
|
||||
-- We clean the input array (remove NULLs) and create 2 arrays
|
||||
-- [1] contains the unique values in in_array
|
||||
-- [2] contains the number of appearances of those unique values
|
||||
SELECT ARRAY[array_agg(value), array_agg(count)] FROM
|
||||
(
|
||||
SELECT value, count(1)::numeric as count
|
||||
FROM unnest(in_array) AS value
|
||||
WHERE value is NOT NULL
|
||||
GROUP BY value
|
||||
ORDER BY value
|
||||
) __clean_array_q INTO in_matrix;
|
||||
|
||||
shuffles := LEAST(GREATEST(floor(2500000.0/(element_count::float*iterations::float)), 1), 750)::int;
|
||||
-- get our mean value
|
||||
SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x;
|
||||
-- Get the number of unique values
|
||||
in_unique_count := array_length(in_matrix[1:1], 2);
|
||||
RAISE INFO 'Unique %', in_unique_count;
|
||||
|
||||
-- assume best is actually Quantile
|
||||
SELECT CDB_QuantileBins(in_array, breaks) INTO quant;
|
||||
|
||||
-- if data is very very large, just return quant and be done
|
||||
IF element_count > 5000000 THEN
|
||||
RETURN quant;
|
||||
IF in_unique_count IS NULL THEN
|
||||
RETURN NULL;
|
||||
END IF;
|
||||
|
||||
-- change quant into bottom, top markers
|
||||
LOOP
|
||||
IF i = 1 THEN
|
||||
bot = 1;
|
||||
ELSE
|
||||
-- use last top to find this bot
|
||||
bot = top+1;
|
||||
END IF;
|
||||
IF i = breaks THEN
|
||||
top = element_count;
|
||||
ELSE
|
||||
SELECT count(*) INTO top FROM ( SELECT unnest(in_array) as v) x WHERE v <= quant[i];
|
||||
END IF;
|
||||
IF i = 1 THEN
|
||||
classes = ARRAY[ARRAY[bot,top]];
|
||||
ELSE
|
||||
classes = ARRAY_CAT(classes,ARRAY[bot,top]);
|
||||
END IF;
|
||||
IF i > breaks THEN EXIT; END IF;
|
||||
i = i+1;
|
||||
END LOOP;
|
||||
IF in_unique_count <= breaks THEN
|
||||
-- There isn't enough distinct values for the requested breaks
|
||||
RETURN ARRAY(Select unnest(in_matrix[1:1])) _a;
|
||||
END IF;
|
||||
|
||||
best_result = CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);
|
||||
-- If not declated explicitly we iterate based on the length of the array
|
||||
IF iterations < 1 THEN
|
||||
-- This is based on a 'looks fine' heuristic
|
||||
iterations := log(in_unique_count)::integer + 1;
|
||||
END IF;
|
||||
RAISE INFO 'Iterations: %', iterations;
|
||||
|
||||
-- We set the number of shuffles per iteration as the number of unique values but
|
||||
-- this is just another 'looks fine' heuristic
|
||||
shuffles := in_unique_count;
|
||||
RAISE INFO 'Suffles %', shuffles;
|
||||
|
||||
-- Get the mean value of the whole vector (already ignores NULLs)
|
||||
SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x;
|
||||
RAISE INFO 'Mean %', arr_mean;
|
||||
|
||||
-- Calculate the sum of squared deviations from the array mean (SDAM).
|
||||
SELECT sum(((arr_mean - v)^2) * w) INTO sdam FROM (
|
||||
SELECT unnest(in_matrix[1:1]) as v, unnest(in_matrix[2:2]) as w
|
||||
) x;
|
||||
RAISE INFO 'Deviation %', sdam;
|
||||
|
||||
-- To start, we create ranges with approximately the same amount of different values
|
||||
top := 0;
|
||||
i := 1;
|
||||
LOOP
|
||||
bot := top + 1;
|
||||
top := ROUND(i * in_unique_count::numeric / breaks::NUMERIC);
|
||||
|
||||
IF i = 1 THEN
|
||||
classes = ARRAY[ARRAY[bot,top]];
|
||||
ELSE
|
||||
classes = ARRAY_CAT(classes, ARRAY[bot,top]);
|
||||
END IF;
|
||||
|
||||
i := i + 1;
|
||||
IF i > breaks THEN EXIT; END IF;
|
||||
END LOOP;
|
||||
RAISE INFO 'Initial classes %', classes;
|
||||
|
||||
|
||||
best_result = CDB_JenksBinsIteration(in_matrix, breaks, classes, invert, sdam, shuffles);
|
||||
|
||||
--set the seed so we can ensure the same results
|
||||
SELECT setseed(0.4567) INTO seedtarget;
|
||||
--loop through random starting positions
|
||||
LOOP
|
||||
IF j > iterations-1 THEN EXIT; END IF;
|
||||
IF j > iterations-1 THEN EXIT; END IF;
|
||||
i = 1;
|
||||
tops = ARRAY[element_count];
|
||||
tops = ARRAY[in_unique_count];
|
||||
LOOP
|
||||
IF i = breaks THEN EXIT; END IF;
|
||||
SELECT array_agg(distinct e) INTO tops FROM (SELECT unnest(array_cat(tops, ARRAY[floor(random()*element_count::float)::int])) as e ORDER BY e) x WHERE e != 1;
|
||||
i = array_length(tops, 1);
|
||||
END LOOP;
|
||||
IF i = breaks THEN EXIT; END IF;
|
||||
SELECT array_agg(distinct e) INTO tops FROM (
|
||||
SELECT unnest(array_cat(tops, ARRAY[trunc(random() * in_unique_count::float8)::int + 1])) as e ORDER BY e
|
||||
) x;
|
||||
i = array_length(tops, 1);
|
||||
END LOOP;
|
||||
RAISE INFO 'Tops %', tops;
|
||||
top := 0;
|
||||
i = 1;
|
||||
LOOP
|
||||
IF i > breaks THEN EXIT; END IF;
|
||||
IF i = 1 THEN
|
||||
bot = 1;
|
||||
ELSE
|
||||
bot = top+1;
|
||||
END IF;
|
||||
LOOP
|
||||
bot := top + 1;
|
||||
top = tops[i];
|
||||
IF i = 1 THEN
|
||||
classes = ARRAY[ARRAY[bot,top]];
|
||||
ELSE
|
||||
classes = ARRAY_CAT(classes,ARRAY[bot,top]);
|
||||
IF i = 1 THEN
|
||||
classes = ARRAY[ARRAY[bot,top]];
|
||||
ELSE
|
||||
classes = ARRAY_CAT(classes, ARRAY[bot,top]);
|
||||
END IF;
|
||||
i := i+1;
|
||||
END LOOP;
|
||||
curr_result = CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);
|
||||
|
||||
i := i+1;
|
||||
IF i > breaks THEN EXIT; END IF;
|
||||
END LOOP;
|
||||
|
||||
RAISE INFO 'Classes %', classes;
|
||||
curr_result = CDB_JenksBinsIteration(in_matrix, breaks, classes, invert, sdam, shuffles);
|
||||
|
||||
IF curr_result[1] > best_result[1] THEN
|
||||
best_result = curr_result;
|
||||
j = j-1; -- if we found a better result, add one more search
|
||||
END IF;
|
||||
|
||||
j = j+1;
|
||||
END LOOP;
|
||||
|
||||
RETURN (best_result)[2:array_upper(best_result, 1)];
|
||||
END;
|
||||
$$ language plpgsql VOLATILE PARALLEL RESTRICTED;
|
||||
|
||||
$$ LANGUAGE PLPGSQL IMMUTABLE PARALLEL RESTRICTED;
|
||||
|
||||
|
||||
--
|
||||
-- Perform a single iteration of the Jenks classification
|
||||
--
|
||||
-- Returns an array with:
|
||||
-- - First element: gvf
|
||||
-- - Second to 2+n: Category limits
|
||||
DROP FUNCTION IF EXISTS CDB_JenksBinsIteration ( in_matrix NUMERIC[], breaks INT, classes INT[], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT); -- Old signature
|
||||
|
||||
CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_matrix NUMERIC[], breaks INT, classes INT[], invert BOOLEAN, sdam NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$
|
||||
DECLARE
|
||||
i INT;
|
||||
iterations INT = 0;
|
||||
|
||||
CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_array NUMERIC[], breaks INT, classes INT[][], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$
|
||||
DECLARE
|
||||
tmp_val numeric;
|
||||
new_classes int[][];
|
||||
tmp_class int[];
|
||||
i INT := 1;
|
||||
j INT := 1;
|
||||
side INT := 2;
|
||||
sdam numeric;
|
||||
gvf numeric := 0.0;
|
||||
new_gvf numeric;
|
||||
arr_gvf numeric[];
|
||||
class_avg numeric;
|
||||
class_max_i INT;
|
||||
class_min_i INT;
|
||||
class_max numeric;
|
||||
class_min numeric;
|
||||
reply numeric[];
|
||||
BEGIN
|
||||
|
||||
-- Calculate the sum of squared deviations from the array mean (SDAM).
|
||||
SELECT sum((arr_mean - e)^2) INTO sdam FROM ( SELECT unnest(in_array) as e ) x;
|
||||
--Identify the breaks for the lowest GVF
|
||||
LOOP
|
||||
i = 1;
|
||||
LOOP
|
||||
-- get our mean
|
||||
SELECT avg(e) INTO class_avg FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e) x;
|
||||
-- find the deviation
|
||||
SELECT sum((class_avg-e)^2) INTO tmp_val FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e ) x;
|
||||
IF i = 1 THEN
|
||||
arr_gvf = ARRAY[tmp_val];
|
||||
-- init our min/max map for later
|
||||
class_max = arr_gvf[i];
|
||||
class_min = arr_gvf[i];
|
||||
class_min_i = 1;
|
||||
class_max_i = 1;
|
||||
ELSE
|
||||
arr_gvf = array_append(arr_gvf, tmp_val);
|
||||
END IF;
|
||||
i := i+1;
|
||||
IF i > breaks THEN EXIT; END IF;
|
||||
END LOOP;
|
||||
-- calculate our new GVF
|
||||
SELECT sdam-sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) as e ) x;
|
||||
-- if no improvement was made, exit
|
||||
IF new_gvf < gvf THEN EXIT; END IF;
|
||||
gvf = new_gvf;
|
||||
IF j > max_search THEN EXIT; END IF;
|
||||
j = j+1;
|
||||
i = 1;
|
||||
LOOP
|
||||
--establish directionality (uppward through classes or downward)
|
||||
IF arr_gvf[i] < class_min THEN
|
||||
class_min = arr_gvf[i];
|
||||
class_min_i = i;
|
||||
END IF;
|
||||
IF arr_gvf[i] > class_max THEN
|
||||
class_max = arr_gvf[i];
|
||||
class_max_i = i;
|
||||
END IF;
|
||||
i := i+1;
|
||||
IF i > breaks THEN EXIT; END IF;
|
||||
END LOOP;
|
||||
IF class_max_i > class_min_i THEN
|
||||
class_min_i = class_max_i - 1;
|
||||
gvf numeric := 0.0;
|
||||
new_gvf numeric;
|
||||
arr_gvf numeric[];
|
||||
arr_avg numeric[];
|
||||
class_avg numeric;
|
||||
class_dev numeric;
|
||||
|
||||
class_max_i INT;
|
||||
class_min_i INT;
|
||||
dev_max numeric;
|
||||
dev_min numeric;
|
||||
best_classes INT[] = classes;
|
||||
|
||||
reply numeric[];
|
||||
|
||||
BEGIN
|
||||
|
||||
-- We fill the arrays with the initial values
|
||||
i = 0;
|
||||
LOOP
|
||||
IF i = breaks THEN EXIT; END IF;
|
||||
i = i + 1;
|
||||
RAISE INFO 'Loop %', i;
|
||||
|
||||
-- Get class mean
|
||||
SELECT (sum(v * w) / sum(w)) INTO class_avg FROM (
|
||||
SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v,
|
||||
unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w
|
||||
) x;
|
||||
|
||||
-- Get class deviation
|
||||
SELECT sum((class_avg - v)^2 * w) INTO class_dev FROM (
|
||||
SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v,
|
||||
unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w
|
||||
) x;
|
||||
|
||||
|
||||
IF i = 1 THEN
|
||||
arr_avg = ARRAY[class_avg];
|
||||
arr_gvf = ARRAY[class_dev];
|
||||
ELSE
|
||||
class_min_i = class_max_i + 1;
|
||||
arr_avg = array_append(arr_avg, class_avg);
|
||||
arr_gvf = array_append(arr_gvf, class_dev);
|
||||
END IF;
|
||||
--Move from higher class to a lower gid order
|
||||
IF class_max_i > class_min_i THEN
|
||||
classes[class_max_i][1] = classes[class_max_i][1] + 1;
|
||||
classes[class_min_i][2] = classes[class_min_i][2] + 1;
|
||||
ELSE -- Move from lower class UP into a higher class by gid
|
||||
classes[class_max_i][2] = classes[class_max_i][2] - 1;
|
||||
classes[class_min_i][1] = classes[class_min_i][1] - 1;
|
||||
END LOOP;
|
||||
|
||||
|
||||
|
||||
iterations = 0;
|
||||
LOOP
|
||||
IF iterations = max_search THEN EXIT; END IF;
|
||||
iterations = iterations + 1;
|
||||
|
||||
-- calculate our new GVF
|
||||
SELECT sdam - sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) as e ) x;
|
||||
|
||||
-- if no improvement was made, exit
|
||||
IF new_gvf <= gvf OR class_max_i = class_min_i THEN EXIT; END IF;
|
||||
|
||||
-- We search for classes with the min and max deviation
|
||||
i = 1;
|
||||
class_min_i = 1;
|
||||
class_max_i = 1;
|
||||
dev_max = arr_gvf[1];
|
||||
dev_min = arr_gvf[1];
|
||||
LOOP
|
||||
IF i = breaks THEN EXIT; END IF;
|
||||
i = i + 1;
|
||||
|
||||
IF arr_gvf[i] < dev_min THEN
|
||||
dev_min = arr_gvf[i];
|
||||
class_min_i = i;
|
||||
ELSE
|
||||
IF arr_gvf[i] > dev_max THEN
|
||||
dev_max = arr_gvf[i];
|
||||
class_max_i = i;
|
||||
END IF;
|
||||
END IF;
|
||||
END LOOP;
|
||||
END LOOP;
|
||||
|
||||
|
||||
-- Save best values for comparison and output
|
||||
gvf = new_gvf;
|
||||
best_classes = classes;
|
||||
RAISE INFO 'Deviations %', arr_gvf;
|
||||
RAISE INFO 'Min %. Max %', class_min_i, class_max_i;
|
||||
|
||||
-- Iterate by moving an element from class_max_i to class_min_i
|
||||
IF class_min_i < class_max_i THEN
|
||||
i := class_min_i;
|
||||
LOOP
|
||||
IF i = class_max_i THEN EXIT; END IF;
|
||||
classes[i][2] = classes[i][2] + 1;
|
||||
i := i + 1;
|
||||
END LOOP;
|
||||
|
||||
i := class_max_i;
|
||||
LOOP
|
||||
IF i = class_min_i THEN EXIT; END IF;
|
||||
classes[i][1] = classes[i][1] + 1;
|
||||
i := i - 1;
|
||||
END LOOP;
|
||||
ELSE
|
||||
i := class_min_i;
|
||||
LOOP
|
||||
IF i = class_max_i THEN EXIT; END IF;
|
||||
classes[i][1] = classes[i][1] - 1;
|
||||
i := i - 1;
|
||||
END LOOP;
|
||||
|
||||
i := class_max_i;
|
||||
LOOP
|
||||
IF i = class_min_i THEN EXIT; END IF;
|
||||
classes[i][2] = classes[i][2] - 1;
|
||||
i := i + 1;
|
||||
END LOOP;
|
||||
END IF;
|
||||
RAISE INFO 'Classes %', classes;
|
||||
|
||||
-- Recalculate avg and deviation for the affected classes
|
||||
i = LEAST(class_min_i, class_max_i);
|
||||
class_max_i = GREATEST(class_min_i, class_max_i);
|
||||
LOOP
|
||||
-- Get class mean
|
||||
SELECT (sum(v * w) / sum(w)) INTO class_avg FROM (
|
||||
SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v,
|
||||
unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w
|
||||
) x;
|
||||
|
||||
-- Get class deviation
|
||||
SELECT sum((class_avg - v)^2 * w) INTO class_dev FROM (
|
||||
SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v,
|
||||
unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w
|
||||
) x;
|
||||
|
||||
arr_avg[i] = class_avg;
|
||||
arr_gvf[i] = class_dev;
|
||||
|
||||
IF i = class_max_i THEN EXIT; END IF;
|
||||
i = i + 1;
|
||||
END LOOP;
|
||||
|
||||
END LOOP;
|
||||
|
||||
i = 1;
|
||||
LOOP
|
||||
LOOP
|
||||
IF invert = TRUE THEN
|
||||
side = 1; --default returns bottom side of breaks, invert returns top side
|
||||
END IF;
|
||||
reply = array_append(reply, in_array[classes[i][side]]);
|
||||
i = i+1;
|
||||
IF i > breaks THEN EXIT; END IF;
|
||||
END LOOP;
|
||||
|
||||
RETURN array_prepend(gvf, reply);
|
||||
reply = array_append(reply, unnest(in_matrix[1:1][best_classes[i][side]:best_classes[i][side]]));
|
||||
i = i+1;
|
||||
IF i > breaks THEN EXIT; END IF;
|
||||
END LOOP;
|
||||
|
||||
END;
|
||||
$$ language plpgsql IMMUTABLE PARALLEL SAFE;
|
||||
reply = array_prepend(gvf, reply);
|
||||
RAISE INFO 'Reply: %', reply;
|
||||
RETURN reply;
|
||||
|
||||
END;
|
||||
$$ LANGUAGE PLPGSQL IMMUTABLE PARALLEL SAFE;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user