cartodb-postgresql/scripts-available/CDB_JenksBins.sql

347 lines
10 KiB
MySQL
Raw Normal View History

--
-- Determine the Jenks classifications from a numeric array
--
-- @param in_array A numeric array of numbers to determine the best
-- bins based on the Jenks method.
--
-- @param breaks The number of bins you want to find.
--
-- @param iterations The number of different starting positions to test.
--
-- @param invert Optional wheter to return the top of each bin (default)
-- or the bottom. BOOLEAN, default=FALSE.
2018-09-10 19:16:54 +08:00
--
--
2018-09-10 19:16:54 +08:00
CREATE OR REPLACE FUNCTION CDB_JenksBins(in_array NUMERIC[], breaks INT, iterations INT DEFAULT 0, invert BOOLEAN DEFAULT FALSE)
RETURNS NUMERIC[] as
$$
DECLARE
in_matrix NUMERIC[][];
in_unique_count BIGINT;
2018-09-10 19:16:54 +08:00
shuffles INT;
arr_mean NUMERIC;
2018-09-10 19:16:54 +08:00
sdam NUMERIC;
i INT;
bot INT;
top INT;
2018-09-10 19:16:54 +08:00
tops INT[];
classes INT[][];
2018-09-10 19:16:54 +08:00
j INT := 1;
curr_result NUMERIC[];
best_result NUMERIC[];
seedtarget TEXT;
2018-09-10 19:16:54 +08:00
BEGIN
2018-09-10 19:16:54 +08:00
-- We clean the input array (remove NULLs) and create 2 arrays
-- [1] contains the unique values in in_array
-- [2] contains the number of appearances of those unique values
SELECT ARRAY[array_agg(value), array_agg(count)] FROM
(
SELECT value, count(1)::numeric as count
FROM unnest(in_array) AS value
WHERE value is NOT NULL
GROUP BY value
ORDER BY value
) __clean_array_q INTO in_matrix;
-- Get the number of unique values
in_unique_count := array_length(in_matrix[1:1], 2);
IF in_unique_count IS NULL THEN
RETURN NULL;
END IF;
2018-09-10 19:16:54 +08:00
IF in_unique_count <= breaks THEN
-- There isn't enough distinct values for the requested breaks
RETURN ARRAY(Select unnest(in_matrix[1:1])) _a;
END IF;
-- If not declated explicitly we iterate based on the length of the array
IF iterations < 1 THEN
-- This is based on a 'looks fine' heuristic
iterations := log(in_unique_count)::integer + 1;
END IF;
-- We set the number of shuffles per iteration as the number of unique values but
-- this is just another 'looks fine' heuristic
shuffles := in_unique_count;
-- Get the mean value of the whole vector (already ignores NULLs)
SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x;
-- Calculate the sum of squared deviations from the array mean (SDAM).
SELECT sum(((arr_mean - v)^2) * w) INTO sdam FROM (
SELECT unnest(in_matrix[1:1]) as v, unnest(in_matrix[2:2]) as w
) x;
-- To start, we create ranges with approximately the same amount of different values
top := 0;
i := 1;
LOOP
bot := top + 1;
top := ROUND(i * in_unique_count::numeric / breaks::NUMERIC);
IF i = 1 THEN
classes = ARRAY[ARRAY[bot,top]];
ELSE
2018-09-10 19:16:54 +08:00
classes = ARRAY_CAT(classes, ARRAY[bot,top]);
END IF;
2018-09-10 19:16:54 +08:00
i := i + 1;
IF i > breaks THEN EXIT; END IF;
END LOOP;
2018-09-10 19:16:54 +08:00
best_result = CDB_JenksBinsIteration(in_matrix, breaks, classes, invert, sdam, shuffles);
--set the seed so we can ensure the same results
SELECT setseed(0.4567) INTO seedtarget;
--loop through random starting positions
LOOP
2018-09-10 19:16:54 +08:00
IF j > iterations-1 THEN EXIT; END IF;
i = 1;
2018-09-10 19:16:54 +08:00
tops = ARRAY[in_unique_count];
LOOP
2018-09-10 19:16:54 +08:00
IF i = breaks THEN EXIT; END IF;
SELECT array_agg(distinct e) INTO tops FROM (
SELECT unnest(array_cat(tops, ARRAY[trunc(random() * in_unique_count::float8)::int + 1])) as e ORDER BY e
) x;
i = array_length(tops, 1);
END LOOP;
top := 0;
i = 1;
2018-09-10 19:16:54 +08:00
LOOP
bot := top + 1;
top = tops[i];
IF i = 1 THEN
2018-09-10 19:16:54 +08:00
classes = ARRAY[ARRAY[bot,top]];
ELSE
2018-09-10 19:16:54 +08:00
classes = ARRAY_CAT(classes, ARRAY[bot,top]);
END IF;
2018-09-10 19:16:54 +08:00
i := i+1;
IF i > breaks THEN EXIT; END IF;
END LOOP;
curr_result = CDB_JenksBinsIteration(in_matrix, breaks, classes, invert, sdam, shuffles);
IF curr_result[1] > best_result[1] THEN
best_result = curr_result;
END IF;
2018-09-10 19:16:54 +08:00
j = j+1;
END LOOP;
RETURN (best_result)[2:array_upper(best_result, 1)];
END;
2018-09-10 19:16:54 +08:00
$$ LANGUAGE PLPGSQL IMMUTABLE PARALLEL RESTRICTED;
--
-- Perform a single iteration of the Jenks classification
--
2018-09-10 19:16:54 +08:00
-- Returns an array with:
-- - First element: gvf
-- - Second to 2+n: Category limits
DROP FUNCTION IF EXISTS CDB_JenksBinsIteration ( in_matrix NUMERIC[], breaks INT, classes INT[], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT); -- Old signature
CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_matrix NUMERIC[], breaks INT, classes INT[], invert BOOLEAN, sdam NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$
DECLARE
i INT;
iterations INT = 0;
side INT := 2;
2018-09-10 19:16:54 +08:00
gvf numeric := 0.0;
new_gvf numeric;
arr_gvf numeric[];
arr_avg numeric[];
class_avg numeric;
class_dev numeric;
class_max_i INT = 0;
class_min_i INT = 0;
2018-09-10 19:16:54 +08:00
dev_max numeric;
dev_min numeric;
2018-09-10 19:16:54 +08:00
best_classes INT[] = classes;
best_gvf numeric[];
best_avg numeric[];
move_elements INT = 1;
2018-09-10 19:16:54 +08:00
reply numeric[];
BEGIN
-- We fill the arrays with the initial values
i = 0;
LOOP
IF i = breaks THEN EXIT; END IF;
i = i + 1;
-- Get class mean
SELECT (sum(v * w) / sum(w)) INTO class_avg FROM (
SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v,
unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w
) x;
-- Get class deviation
SELECT sum((class_avg - v)^2 * w) INTO class_dev FROM (
SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v,
unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w
) x;
IF i = 1 THEN
arr_avg = ARRAY[class_avg];
arr_gvf = ARRAY[class_dev];
ELSE
arr_avg = array_append(arr_avg, class_avg);
arr_gvf = array_append(arr_gvf, class_dev);
END IF;
END LOOP;
-- We copy the values to avoid recalculation when a failure happens
best_avg = arr_avg;
best_gvf = arr_gvf;
2018-09-10 19:16:54 +08:00
iterations = 0;
LOOP
IF iterations = max_search THEN EXIT; END IF;
iterations = iterations + 1;
-- calculate our new GVF
2018-09-10 19:16:54 +08:00
SELECT sdam - sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) as e ) x;
-- Check if any improvement was made
IF new_gvf <= gvf THEN
-- If we were moving too many elements, go back and move less
IF move_elements <= 2 OR class_max_i = class_min_i THEN
EXIT;
END IF;
move_elements = GREATEST(move_elements / 8, 1);
2018-09-10 19:16:54 +08:00
-- Rollback from saved statuses
classes = best_classes;
new_gvf = gvf;
i = class_min_i;
LOOP
arr_avg[i] = best_avg[i];
arr_gvf[i] = best_gvf[i];
IF i = class_max_i THEN EXIT; END IF;
i = i + 1;
END LOOP;
END IF;
-- We search for the classes with the min and max deviation
2018-09-10 19:16:54 +08:00
i = 1;
class_min_i = 1;
class_max_i = 1;
dev_max = arr_gvf[1];
dev_min = arr_gvf[1];
LOOP
IF i = breaks THEN EXIT; END IF;
i = i + 1;
IF arr_gvf[i] < dev_min THEN
dev_min = arr_gvf[i];
class_min_i = i;
ELSE
IF arr_gvf[i] > dev_max THEN
dev_max = arr_gvf[i];
class_max_i = i;
END IF;
END IF;
END LOOP;
-- Save best values for comparison and output
gvf = new_gvf;
best_classes = classes;
-- Limit the moved elements as to not remove everything from class_max_i
move_elements = LEAST(move_elements, classes[class_max_i][2] - classes[class_max_i][1]);
-- Move `move_elements` from class_max_i to class_min_i
2018-09-10 19:16:54 +08:00
IF class_min_i < class_max_i THEN
i := class_min_i;
LOOP
IF i = class_max_i THEN EXIT; END IF;
classes[i][2] = classes[i][2] + move_elements;
2018-09-10 19:16:54 +08:00
i := i + 1;
END LOOP;
i := class_max_i;
LOOP
IF i = class_min_i THEN EXIT; END IF;
classes[i][1] = classes[i][1] + move_elements;
2018-09-10 19:16:54 +08:00
i := i - 1;
END LOOP;
ELSE
2018-09-10 19:16:54 +08:00
i := class_min_i;
LOOP
IF i = class_max_i THEN EXIT; END IF;
classes[i][1] = classes[i][1] - move_elements;
2018-09-10 19:16:54 +08:00
i := i - 1;
END LOOP;
i := class_max_i;
LOOP
IF i = class_min_i THEN EXIT; END IF;
classes[i][2] = classes[i][2] - move_elements;
2018-09-10 19:16:54 +08:00
i := i + 1;
END LOOP;
END IF;
2018-09-10 19:16:54 +08:00
-- Recalculate avg and deviation ONLY for the affected classes
2018-09-10 19:16:54 +08:00
i = LEAST(class_min_i, class_max_i);
class_max_i = GREATEST(class_min_i, class_max_i);
class_min_i = i;
2018-09-10 19:16:54 +08:00
LOOP
SELECT (sum(v * w) / sum(w)) INTO class_avg FROM (
SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v,
unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w
) x;
SELECT sum((class_avg - v)^2 * w) INTO class_dev FROM (
SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v,
unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w
) x;
-- Save status (in case it's needed for rollback) and store the new one
best_avg[i] = arr_avg[i];
2018-09-10 19:16:54 +08:00
arr_avg[i] = class_avg;
best_gvf[i] = arr_gvf[i];
2018-09-10 19:16:54 +08:00
arr_gvf[i] = class_dev;
IF i = class_max_i THEN EXIT; END IF;
i = i + 1;
END LOOP;
move_elements = move_elements * 2;
2018-09-10 19:16:54 +08:00
END LOOP;
i = 1;
2018-09-10 19:16:54 +08:00
LOOP
IF invert = TRUE THEN
side = 1; --default returns bottom side of breaks, invert returns top side
END IF;
2018-09-10 19:16:54 +08:00
reply = array_append(reply, unnest(in_matrix[1:1][best_classes[i][side]:best_classes[i][side]]));
i = i+1;
IF i > breaks THEN EXIT; END IF;
END LOOP;
reply = array_prepend(gvf, reply);
RETURN reply;
END;
$$ LANGUAGE PLPGSQL IMMUTABLE PARALLEL SAFE;