From 0e7d7974006c54702d71d5fbab24a4673734c705 Mon Sep 17 00:00:00 2001 From: Raul Marin Date: Mon, 10 Sep 2018 13:16:54 +0200 Subject: [PATCH] Jenks: Fix multiple bugs --- scripts-available/CDB_JenksBins.sql | 415 +++++++++++++++++----------- 1 file changed, 261 insertions(+), 154 deletions(-) diff --git a/scripts-available/CDB_JenksBins.sql b/scripts-available/CDB_JenksBins.sql index 9db1ad6..f1361e5 100644 --- a/scripts-available/CDB_JenksBins.sql +++ b/scripts-available/CDB_JenksBins.sql @@ -10,212 +10,319 @@ -- -- @param invert Optional wheter to return the top of each bin (default) -- or the bottom. BOOLEAN, default=FALSE. --- +-- -- +CREATE OR REPLACE FUNCTION CDB_JenksBins(in_array NUMERIC[], breaks INT, iterations INT DEFAULT 0, invert BOOLEAN DEFAULT FALSE) +RETURNS NUMERIC[] as +$$ +DECLARE + in_matrix NUMERIC[][]; + in_unique_count BIGINT; -CREATE OR REPLACE FUNCTION CDB_JenksBins ( in_array NUMERIC[], breaks INT, iterations INT DEFAULT 5, invert BOOLEAN DEFAULT FALSE) RETURNS NUMERIC[] as $$ -DECLARE - element_count INT4; + shuffles INT; arr_mean NUMERIC; + sdam NUMERIC; + + i INT; bot INT; top INT; + tops INT[]; classes INT[][]; - i INT := 1; j INT := 1; + j INT := 1; curr_result NUMERIC[]; best_result NUMERIC[]; seedtarget TEXT; - quant NUMERIC[]; - shuffles INT; + BEGIN - -- get the total size of our row - element_count := array_length(in_array, 1); --array_upper(in_array, 1) - array_lower(in_array, 1); - -- ensure the ordering of in_array - SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x; - -- stop if no rows - IF element_count IS NULL THEN - RETURN NULL; - END IF; - -- stop if our breaks are more than our input array size - IF element_count < breaks THEN - RETURN in_array; - END IF; + -- We clean the input array (remove NULLs) and create 2 arrays + -- [1] contains the unique values in in_array + -- [2] contains the number of appearances of those unique values + SELECT ARRAY[array_agg(value), array_agg(count)] FROM + ( + SELECT value, count(1)::numeric as count + FROM unnest(in_array) AS value + WHERE value is NOT NULL + GROUP BY value + ORDER BY value + ) __clean_array_q INTO in_matrix; - shuffles := LEAST(GREATEST(floor(2500000.0/(element_count::float*iterations::float)), 1), 750)::int; - -- get our mean value - SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x; + -- Get the number of unique values + in_unique_count := array_length(in_matrix[1:1], 2); + RAISE INFO 'Unique %', in_unique_count; - -- assume best is actually Quantile - SELECT CDB_QuantileBins(in_array, breaks) INTO quant; - - -- if data is very very large, just return quant and be done - IF element_count > 5000000 THEN - RETURN quant; + IF in_unique_count IS NULL THEN + RETURN NULL; END IF; - -- change quant into bottom, top markers - LOOP - IF i = 1 THEN - bot = 1; - ELSE - -- use last top to find this bot - bot = top+1; - END IF; - IF i = breaks THEN - top = element_count; - ELSE - SELECT count(*) INTO top FROM ( SELECT unnest(in_array) as v) x WHERE v <= quant[i]; - END IF; - IF i = 1 THEN - classes = ARRAY[ARRAY[bot,top]]; - ELSE - classes = ARRAY_CAT(classes,ARRAY[bot,top]); - END IF; - IF i > breaks THEN EXIT; END IF; - i = i+1; - END LOOP; + IF in_unique_count <= breaks THEN + -- There isn't enough distinct values for the requested breaks + RETURN ARRAY(Select unnest(in_matrix[1:1])) _a; + END IF; - best_result = CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles); + -- If not declated explicitly we iterate based on the length of the array + IF iterations < 1 THEN + -- This is based on a 'looks fine' heuristic + iterations := log(in_unique_count)::integer + 1; + END IF; + RAISE INFO 'Iterations: %', iterations; + + -- We set the number of shuffles per iteration as the number of unique values but + -- this is just another 'looks fine' heuristic + shuffles := in_unique_count; + RAISE INFO 'Suffles %', shuffles; + + -- Get the mean value of the whole vector (already ignores NULLs) + SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x; + RAISE INFO 'Mean %', arr_mean; + + -- Calculate the sum of squared deviations from the array mean (SDAM). + SELECT sum(((arr_mean - v)^2) * w) INTO sdam FROM ( + SELECT unnest(in_matrix[1:1]) as v, unnest(in_matrix[2:2]) as w + ) x; + RAISE INFO 'Deviation %', sdam; + + -- To start, we create ranges with approximately the same amount of different values + top := 0; + i := 1; + LOOP + bot := top + 1; + top := ROUND(i * in_unique_count::numeric / breaks::NUMERIC); + + IF i = 1 THEN + classes = ARRAY[ARRAY[bot,top]]; + ELSE + classes = ARRAY_CAT(classes, ARRAY[bot,top]); + END IF; + + i := i + 1; + IF i > breaks THEN EXIT; END IF; + END LOOP; + RAISE INFO 'Initial classes %', classes; + + + best_result = CDB_JenksBinsIteration(in_matrix, breaks, classes, invert, sdam, shuffles); --set the seed so we can ensure the same results SELECT setseed(0.4567) INTO seedtarget; --loop through random starting positions LOOP - IF j > iterations-1 THEN EXIT; END IF; + IF j > iterations-1 THEN EXIT; END IF; i = 1; - tops = ARRAY[element_count]; + tops = ARRAY[in_unique_count]; LOOP - IF i = breaks THEN EXIT; END IF; - SELECT array_agg(distinct e) INTO tops FROM (SELECT unnest(array_cat(tops, ARRAY[floor(random()*element_count::float)::int])) as e ORDER BY e) x WHERE e != 1; - i = array_length(tops, 1); - END LOOP; + IF i = breaks THEN EXIT; END IF; + SELECT array_agg(distinct e) INTO tops FROM ( + SELECT unnest(array_cat(tops, ARRAY[trunc(random() * in_unique_count::float8)::int + 1])) as e ORDER BY e + ) x; + i = array_length(tops, 1); + END LOOP; + RAISE INFO 'Tops %', tops; + top := 0; i = 1; - LOOP - IF i > breaks THEN EXIT; END IF; - IF i = 1 THEN - bot = 1; - ELSE - bot = top+1; - END IF; + LOOP + bot := top + 1; top = tops[i]; - IF i = 1 THEN - classes = ARRAY[ARRAY[bot,top]]; - ELSE - classes = ARRAY_CAT(classes,ARRAY[bot,top]); + IF i = 1 THEN + classes = ARRAY[ARRAY[bot,top]]; + ELSE + classes = ARRAY_CAT(classes, ARRAY[bot,top]); END IF; - i := i+1; - END LOOP; - curr_result = CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles); + + i := i+1; + IF i > breaks THEN EXIT; END IF; + END LOOP; + + RAISE INFO 'Classes %', classes; + curr_result = CDB_JenksBinsIteration(in_matrix, breaks, classes, invert, sdam, shuffles); IF curr_result[1] > best_result[1] THEN best_result = curr_result; j = j-1; -- if we found a better result, add one more search END IF; + j = j+1; END LOOP; RETURN (best_result)[2:array_upper(best_result, 1)]; END; -$$ language plpgsql VOLATILE PARALLEL RESTRICTED; - +$$ LANGUAGE PLPGSQL IMMUTABLE PARALLEL RESTRICTED; -- -- Perform a single iteration of the Jenks classification -- +-- Returns an array with: +-- - First element: gvf +-- - Second to 2+n: Category limits +DROP FUNCTION IF EXISTS CDB_JenksBinsIteration ( in_matrix NUMERIC[], breaks INT, classes INT[], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT); -- Old signature + +CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_matrix NUMERIC[], breaks INT, classes INT[], invert BOOLEAN, sdam NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$ +DECLARE + i INT; + iterations INT = 0; -CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_array NUMERIC[], breaks INT, classes INT[][], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$ -DECLARE - tmp_val numeric; - new_classes int[][]; - tmp_class int[]; - i INT := 1; - j INT := 1; side INT := 2; - sdam numeric; - gvf numeric := 0.0; - new_gvf numeric; - arr_gvf numeric[]; - class_avg numeric; - class_max_i INT; - class_min_i INT; - class_max numeric; - class_min numeric; - reply numeric[]; -BEGIN - -- Calculate the sum of squared deviations from the array mean (SDAM). - SELECT sum((arr_mean - e)^2) INTO sdam FROM ( SELECT unnest(in_array) as e ) x; - --Identify the breaks for the lowest GVF - LOOP - i = 1; - LOOP - -- get our mean - SELECT avg(e) INTO class_avg FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e) x; - -- find the deviation - SELECT sum((class_avg-e)^2) INTO tmp_val FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e ) x; - IF i = 1 THEN - arr_gvf = ARRAY[tmp_val]; - -- init our min/max map for later - class_max = arr_gvf[i]; - class_min = arr_gvf[i]; - class_min_i = 1; - class_max_i = 1; - ELSE - arr_gvf = array_append(arr_gvf, tmp_val); - END IF; - i := i+1; - IF i > breaks THEN EXIT; END IF; - END LOOP; - -- calculate our new GVF - SELECT sdam-sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) as e ) x; - -- if no improvement was made, exit - IF new_gvf < gvf THEN EXIT; END IF; - gvf = new_gvf; - IF j > max_search THEN EXIT; END IF; - j = j+1; - i = 1; - LOOP - --establish directionality (uppward through classes or downward) - IF arr_gvf[i] < class_min THEN - class_min = arr_gvf[i]; - class_min_i = i; - END IF; - IF arr_gvf[i] > class_max THEN - class_max = arr_gvf[i]; - class_max_i = i; - END IF; - i := i+1; - IF i > breaks THEN EXIT; END IF; - END LOOP; - IF class_max_i > class_min_i THEN - class_min_i = class_max_i - 1; + gvf numeric := 0.0; + new_gvf numeric; + arr_gvf numeric[]; + arr_avg numeric[]; + class_avg numeric; + class_dev numeric; + + class_max_i INT; + class_min_i INT; + dev_max numeric; + dev_min numeric; + best_classes INT[] = classes; + + reply numeric[]; + +BEGIN + + -- We fill the arrays with the initial values + i = 0; + LOOP + IF i = breaks THEN EXIT; END IF; + i = i + 1; + RAISE INFO 'Loop %', i; + + -- Get class mean + SELECT (sum(v * w) / sum(w)) INTO class_avg FROM ( + SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v, + unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w + ) x; + + -- Get class deviation + SELECT sum((class_avg - v)^2 * w) INTO class_dev FROM ( + SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v, + unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w + ) x; + + + IF i = 1 THEN + arr_avg = ARRAY[class_avg]; + arr_gvf = ARRAY[class_dev]; ELSE - class_min_i = class_max_i + 1; + arr_avg = array_append(arr_avg, class_avg); + arr_gvf = array_append(arr_gvf, class_dev); END IF; - --Move from higher class to a lower gid order - IF class_max_i > class_min_i THEN - classes[class_max_i][1] = classes[class_max_i][1] + 1; - classes[class_min_i][2] = classes[class_min_i][2] + 1; - ELSE -- Move from lower class UP into a higher class by gid - classes[class_max_i][2] = classes[class_max_i][2] - 1; - classes[class_min_i][1] = classes[class_min_i][1] - 1; + END LOOP; + + + + iterations = 0; + LOOP + IF iterations = max_search THEN EXIT; END IF; + iterations = iterations + 1; + + -- calculate our new GVF + SELECT sdam - sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) as e ) x; + + -- if no improvement was made, exit + IF new_gvf <= gvf OR class_max_i = class_min_i THEN EXIT; END IF; + + -- We search for classes with the min and max deviation + i = 1; + class_min_i = 1; + class_max_i = 1; + dev_max = arr_gvf[1]; + dev_min = arr_gvf[1]; + LOOP + IF i = breaks THEN EXIT; END IF; + i = i + 1; + + IF arr_gvf[i] < dev_min THEN + dev_min = arr_gvf[i]; + class_min_i = i; + ELSE + IF arr_gvf[i] > dev_max THEN + dev_max = arr_gvf[i]; + class_max_i = i; + END IF; END IF; - END LOOP; + END LOOP; + + + -- Save best values for comparison and output + gvf = new_gvf; + best_classes = classes; + RAISE INFO 'Deviations %', arr_gvf; + RAISE INFO 'Min %. Max %', class_min_i, class_max_i; + + -- Iterate by moving an element from class_max_i to class_min_i + IF class_min_i < class_max_i THEN + i := class_min_i; + LOOP + IF i = class_max_i THEN EXIT; END IF; + classes[i][2] = classes[i][2] + 1; + i := i + 1; + END LOOP; + + i := class_max_i; + LOOP + IF i = class_min_i THEN EXIT; END IF; + classes[i][1] = classes[i][1] + 1; + i := i - 1; + END LOOP; + ELSE + i := class_min_i; + LOOP + IF i = class_max_i THEN EXIT; END IF; + classes[i][1] = classes[i][1] - 1; + i := i - 1; + END LOOP; + + i := class_max_i; + LOOP + IF i = class_min_i THEN EXIT; END IF; + classes[i][2] = classes[i][2] - 1; + i := i + 1; + END LOOP; + END IF; + RAISE INFO 'Classes %', classes; + + -- Recalculate avg and deviation for the affected classes + i = LEAST(class_min_i, class_max_i); + class_max_i = GREATEST(class_min_i, class_max_i); + LOOP + -- Get class mean + SELECT (sum(v * w) / sum(w)) INTO class_avg FROM ( + SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v, + unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w + ) x; + + -- Get class deviation + SELECT sum((class_avg - v)^2 * w) INTO class_dev FROM ( + SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v, + unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w + ) x; + + arr_avg[i] = class_avg; + arr_gvf[i] = class_dev; + + IF i = class_max_i THEN EXIT; END IF; + i = i + 1; + END LOOP; + + END LOOP; i = 1; - LOOP + LOOP IF invert = TRUE THEN side = 1; --default returns bottom side of breaks, invert returns top side END IF; - reply = array_append(reply, in_array[classes[i][side]]); - i = i+1; - IF i > breaks THEN EXIT; END IF; - END LOOP; - - RETURN array_prepend(gvf, reply); + reply = array_append(reply, unnest(in_matrix[1:1][best_classes[i][side]:best_classes[i][side]])); + i = i+1; + IF i > breaks THEN EXIT; END IF; + END LOOP; -END; -$$ language plpgsql IMMUTABLE PARALLEL SAFE; + reply = array_prepend(gvf, reply); + RAISE INFO 'Reply: %', reply; + RETURN reply; + +END; +$$ LANGUAGE PLPGSQL IMMUTABLE PARALLEL SAFE;