222 lines
7.3 KiB
MySQL
222 lines
7.3 KiB
MySQL
|
--
|
||
|
-- Determine the Jenks classifications from a numeric array
|
||
|
--
|
||
|
-- @param in_array A numeric array of numbers to determine the best
|
||
|
-- bins based on the Jenks method.
|
||
|
--
|
||
|
-- @param breaks The number of bins you want to find.
|
||
|
--
|
||
|
-- @param iterations The number of different starting positions to test.
|
||
|
--
|
||
|
-- @param invert Optional wheter to return the top of each bin (default)
|
||
|
-- or the bottom. BOOLEAN, default=FALSE.
|
||
|
--
|
||
|
--
|
||
|
|
||
|
|
||
|
CREATE OR REPLACE FUNCTION CDB_JenksBins ( in_array NUMERIC[], breaks INT, iterations INT DEFAULT 5, invert BOOLEAN DEFAULT FALSE) RETURNS NUMERIC[] as $$
|
||
|
DECLARE
|
||
|
element_count INT4;
|
||
|
arr_mean NUMERIC;
|
||
|
bot INT;
|
||
|
top INT;
|
||
|
tops INT[];
|
||
|
classes INT[][];
|
||
|
i INT := 1; j INT := 1;
|
||
|
curr_result NUMERIC[];
|
||
|
best_result NUMERIC[];
|
||
|
seedtarget TEXT;
|
||
|
quant NUMERIC[];
|
||
|
shuffles INT;
|
||
|
BEGIN
|
||
|
-- get the total size of our row
|
||
|
element_count := array_length(in_array, 1); --array_upper(in_array, 1) - array_lower(in_array, 1);
|
||
|
-- ensure the ordering of in_array
|
||
|
SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x;
|
||
|
-- stop if no rows
|
||
|
IF element_count IS NULL THEN
|
||
|
RETURN NULL;
|
||
|
END IF;
|
||
|
-- stop if our breaks are more than our input array size
|
||
|
IF element_count < breaks THEN
|
||
|
RETURN in_array;
|
||
|
END IF;
|
||
|
|
||
|
shuffles := LEAST(GREATEST(floor(2500000.0/(element_count::float*iterations::float)), 1), 750)::int;
|
||
|
-- get our mean value
|
||
|
SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x;
|
||
|
|
||
|
-- assume best is actually Quantile
|
||
|
SELECT CDB_QuantileBins(in_array, breaks) INTO quant;
|
||
|
|
||
|
-- if data is very very large, just return quant and be done
|
||
|
IF element_count > 5000000 THEN
|
||
|
RETURN quant;
|
||
|
END IF;
|
||
|
|
||
|
-- change quant into bottom, top markers
|
||
|
LOOP
|
||
|
IF i = 1 THEN
|
||
|
bot = 1;
|
||
|
ELSE
|
||
|
-- use last top to find this bot
|
||
|
bot = top+1;
|
||
|
END IF;
|
||
|
IF i = breaks THEN
|
||
|
top = element_count;
|
||
|
ELSE
|
||
|
SELECT count(*) INTO top FROM ( SELECT unnest(in_array) as v) x WHERE v <= quant[i];
|
||
|
END IF;
|
||
|
IF i = 1 THEN
|
||
|
classes = ARRAY[ARRAY[bot,top]];
|
||
|
ELSE
|
||
|
classes = ARRAY_CAT(classes,ARRAY[bot,top]);
|
||
|
END IF;
|
||
|
IF i > breaks THEN EXIT; END IF;
|
||
|
i = i+1;
|
||
|
END LOOP;
|
||
|
|
||
|
best_result = CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);
|
||
|
|
||
|
--set the seed so we can ensure the same results
|
||
|
SELECT setseed(0.4567) INTO seedtarget;
|
||
|
--loop through random starting positions
|
||
|
LOOP
|
||
|
IF j > iterations-1 THEN EXIT; END IF;
|
||
|
i = 1;
|
||
|
tops = ARRAY[element_count];
|
||
|
LOOP
|
||
|
IF i = breaks THEN EXIT; END IF;
|
||
|
SELECT array_agg(distinct e) INTO tops FROM (SELECT unnest(array_cat(tops, ARRAY[floor(random()*element_count::float)::int])) as e ORDER BY e) x WHERE e != 1;
|
||
|
i = array_length(tops, 1);
|
||
|
END LOOP;
|
||
|
i = 1;
|
||
|
LOOP
|
||
|
IF i > breaks THEN EXIT; END IF;
|
||
|
IF i = 1 THEN
|
||
|
bot = 1;
|
||
|
ELSE
|
||
|
bot = top+1;
|
||
|
END IF;
|
||
|
top = tops[i];
|
||
|
IF i = 1 THEN
|
||
|
classes = ARRAY[ARRAY[bot,top]];
|
||
|
ELSE
|
||
|
classes = ARRAY_CAT(classes,ARRAY[bot,top]);
|
||
|
END IF;
|
||
|
i := i+1;
|
||
|
END LOOP;
|
||
|
curr_result = CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);
|
||
|
|
||
|
IF curr_result[1] > best_result[1] THEN
|
||
|
best_result = curr_result;
|
||
|
j = j-1; -- if we found a better result, add one more search
|
||
|
END IF;
|
||
|
j = j+1;
|
||
|
END LOOP;
|
||
|
|
||
|
RETURN (best_result)[2:array_upper(best_result, 1)];
|
||
|
END;
|
||
|
$$ language plpgsql IMMUTABLE;
|
||
|
|
||
|
|
||
|
|
||
|
--
|
||
|
-- Perform a single iteration of the Jenks classification
|
||
|
--
|
||
|
|
||
|
CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_array NUMERIC[], breaks INT, classes INT[][], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$
|
||
|
DECLARE
|
||
|
tmp_val numeric;
|
||
|
new_classes int[][];
|
||
|
tmp_class int[];
|
||
|
i INT := 1;
|
||
|
j INT := 1;
|
||
|
side INT := 2;
|
||
|
sdam numeric;
|
||
|
gvf numeric := 0.0;
|
||
|
new_gvf numeric;
|
||
|
arr_gvf numeric[];
|
||
|
class_avg numeric;
|
||
|
class_max_i INT;
|
||
|
class_min_i INT;
|
||
|
class_max numeric;
|
||
|
class_min numeric;
|
||
|
reply numeric[];
|
||
|
BEGIN
|
||
|
|
||
|
-- Calculate the sum of squared deviations from the array mean (SDAM).
|
||
|
SELECT sum((arr_mean - e)^2) INTO sdam FROM ( SELECT unnest(in_array) as e ) x;
|
||
|
--Identify the breaks for the lowest GVF
|
||
|
LOOP
|
||
|
i = 1;
|
||
|
LOOP
|
||
|
-- get our mean
|
||
|
SELECT avg(e) INTO class_avg FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e) x;
|
||
|
-- find the deviation
|
||
|
SELECT sum((class_avg-e)^2) INTO tmp_val FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e ) x;
|
||
|
IF i = 1 THEN
|
||
|
arr_gvf = ARRAY[tmp_val];
|
||
|
-- init our min/max map for later
|
||
|
class_max = arr_gvf[i];
|
||
|
class_min = arr_gvf[i];
|
||
|
class_min_i = 1;
|
||
|
class_max_i = 1;
|
||
|
ELSE
|
||
|
arr_gvf = array_append(arr_gvf, tmp_val);
|
||
|
END IF;
|
||
|
i := i+1;
|
||
|
IF i > breaks THEN EXIT; END IF;
|
||
|
END LOOP;
|
||
|
-- calculate our new GVF
|
||
|
SELECT sdam-sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) as e ) x;
|
||
|
-- if no improvement was made, exit
|
||
|
IF new_gvf < gvf THEN EXIT; END IF;
|
||
|
gvf = new_gvf;
|
||
|
IF j > max_search THEN EXIT; END IF;
|
||
|
j = j+1;
|
||
|
i = 1;
|
||
|
LOOP
|
||
|
--establish directionality (uppward through classes or downward)
|
||
|
IF arr_gvf[i] < class_min THEN
|
||
|
class_min = arr_gvf[i];
|
||
|
class_min_i = i;
|
||
|
END IF;
|
||
|
IF arr_gvf[i] > class_max THEN
|
||
|
class_max = arr_gvf[i];
|
||
|
class_max_i = i;
|
||
|
END IF;
|
||
|
i := i+1;
|
||
|
IF i > breaks THEN EXIT; END IF;
|
||
|
END LOOP;
|
||
|
IF class_max_i > class_min_i THEN
|
||
|
class_min_i = class_max_i - 1;
|
||
|
ELSE
|
||
|
class_min_i = class_max_i + 1;
|
||
|
END IF;
|
||
|
--Move from higher class to a lower gid order
|
||
|
IF class_max_i > class_min_i THEN
|
||
|
classes[class_max_i][1] = classes[class_max_i][1] + 1;
|
||
|
classes[class_min_i][2] = classes[class_min_i][2] + 1;
|
||
|
ELSE -- Move from lower class UP into a higher class by gid
|
||
|
classes[class_max_i][2] = classes[class_max_i][2] - 1;
|
||
|
classes[class_min_i][1] = classes[class_min_i][1] - 1;
|
||
|
END IF;
|
||
|
END LOOP;
|
||
|
|
||
|
i = 1;
|
||
|
LOOP
|
||
|
IF invert = TRUE THEN
|
||
|
side = 1; --default returns bottom side of breaks, invert returns top side
|
||
|
END IF;
|
||
|
reply = array_append(reply, in_array[classes[i][side]]);
|
||
|
i = i+1;
|
||
|
IF i > breaks THEN EXIT; END IF;
|
||
|
END LOOP;
|
||
|
|
||
|
RETURN array_prepend(gvf, reply);
|
||
|
|
||
|
END;
|
||
|
$$ language plpgsql IMMUTABLE;
|
||
|
|