From 0e7d7974006c54702d71d5fbab24a4673734c705 Mon Sep 17 00:00:00 2001
From: Raul Marin <git@rmr.ninja>
Date: Mon, 10 Sep 2018 13:16:54 +0200
Subject: [PATCH] Jenks: Fix multiple bugs

---
 scripts-available/CDB_JenksBins.sql | 415 +++++++++++++++++-----------
 1 file changed, 261 insertions(+), 154 deletions(-)

diff --git a/scripts-available/CDB_JenksBins.sql b/scripts-available/CDB_JenksBins.sql
index 9db1ad6..f1361e5 100644
--- a/scripts-available/CDB_JenksBins.sql
+++ b/scripts-available/CDB_JenksBins.sql
@@ -10,212 +10,319 @@
 --
 -- @param invert Optional wheter to return the top of each bin (default)
 --               or the bottom. BOOLEAN, default=FALSE.
---  
+--
 --
 
+CREATE OR REPLACE FUNCTION CDB_JenksBins(in_array NUMERIC[], breaks INT, iterations INT DEFAULT 0, invert BOOLEAN DEFAULT FALSE)
+RETURNS NUMERIC[] as
+$$
+DECLARE
+    in_matrix NUMERIC[][];
+    in_unique_count BIGINT;
 
-CREATE OR REPLACE FUNCTION CDB_JenksBins ( in_array NUMERIC[], breaks INT, iterations INT DEFAULT 5, invert BOOLEAN DEFAULT FALSE) RETURNS NUMERIC[] as $$ 
-DECLARE 
-    element_count INT4; 
+    shuffles INT;
     arr_mean NUMERIC;
+    sdam NUMERIC;
+
+    i INT;
     bot INT;
     top INT;
+
     tops INT[];
     classes INT[][];
-    i INT := 1; j INT := 1; 
+    j INT := 1;
     curr_result NUMERIC[];
     best_result NUMERIC[];
     seedtarget TEXT;
-    quant NUMERIC[];
-    shuffles INT;
+
 BEGIN
-    -- get the total size of our row
-    element_count := array_length(in_array, 1); --array_upper(in_array, 1) - array_lower(in_array, 1); 
-    -- ensure the ordering of in_array
-    SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x;
-    -- stop if no rows 
-    IF element_count IS NULL THEN  
-        RETURN NULL; 
-    END IF; 
-    -- stop if our breaks are more than our input array size
-    IF element_count < breaks THEN  
-        RETURN in_array; 
-    END IF; 
+    -- We clean the input array (remove NULLs) and create 2 arrays
+    -- [1] contains the unique values in in_array
+    -- [2] contains the number of appearances of those unique values
+    SELECT ARRAY[array_agg(value), array_agg(count)] FROM
+    (
+        SELECT value, count(1)::numeric as count
+        FROM  unnest(in_array) AS value
+        WHERE value is NOT NULL
+        GROUP BY value
+        ORDER BY value
+    ) __clean_array_q INTO in_matrix;
 
-    shuffles := LEAST(GREATEST(floor(2500000.0/(element_count::float*iterations::float)), 1), 750)::int;
-    -- get our mean value
-    SELECT avg(v) INTO arr_mean FROM (  SELECT unnest(in_array) as v ) x; 
+    -- Get the number of unique values
+    in_unique_count := array_length(in_matrix[1:1], 2);
+    RAISE INFO 'Unique %', in_unique_count;
 
-    -- assume best is actually Quantile
-    SELECT CDB_QuantileBins(in_array, breaks) INTO quant;
-
-    -- if data is very very large, just return quant and be done
-    IF element_count > 5000000 THEN
-        RETURN quant;
+    IF in_unique_count IS NULL THEN
+        RETURN NULL;
     END IF;
 
-    -- change quant into bottom, top markers
-    LOOP 
-        IF i = 1 THEN 
-            bot = 1;
-        ELSE 
-            -- use last top to find this bot
-            bot = top+1;
-        END IF;
-        IF i = breaks THEN
-            top = element_count;
-        ELSE
-            SELECT count(*) INTO top FROM ( SELECT unnest(in_array) as v) x WHERE v <= quant[i];
-        END IF;
-        IF i = 1 THEN 
-            classes = ARRAY[ARRAY[bot,top]]; 
-        ELSE 
-            classes = ARRAY_CAT(classes,ARRAY[bot,top]); 
-        END IF;
-        IF i > breaks THEN EXIT; END IF;
-        i = i+1;
-    END LOOP;
+    IF in_unique_count <= breaks THEN
+        -- There isn't enough distinct values for the requested breaks
+        RETURN ARRAY(Select unnest(in_matrix[1:1])) _a;
+    END IF;
 
-    best_result = CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);
+    -- If not declated explicitly we iterate based on the length of the array
+    IF iterations < 1 THEN
+        -- This is based on a 'looks fine' heuristic
+        iterations := log(in_unique_count)::integer + 1;
+    END IF;
+    RAISE INFO 'Iterations: %', iterations;
+
+    -- We set the number of shuffles per iteration as the number of unique values but
+    -- this is just another 'looks fine' heuristic
+    shuffles := in_unique_count;
+    RAISE INFO 'Suffles %', shuffles;
+
+    -- Get the mean value of the whole vector (already ignores NULLs)
+    SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x;
+    RAISE INFO 'Mean %', arr_mean;
+
+    -- Calculate the sum of squared deviations from the array mean (SDAM).
+    SELECT sum(((arr_mean - v)^2) * w) INTO sdam FROM (
+        SELECT unnest(in_matrix[1:1]) as v, unnest(in_matrix[2:2]) as w
+        ) x;
+    RAISE INFO 'Deviation %', sdam;
+
+    -- To start, we create ranges with approximately the same amount of different values
+    top := 0;
+    i := 1;
+    LOOP
+        bot := top + 1;
+        top := ROUND(i * in_unique_count::numeric / breaks::NUMERIC);
+
+        IF i = 1 THEN
+            classes = ARRAY[ARRAY[bot,top]];
+        ELSE
+            classes = ARRAY_CAT(classes, ARRAY[bot,top]);
+        END IF;
+
+        i := i + 1;
+        IF i > breaks THEN EXIT; END IF;
+    END LOOP;
+    RAISE INFO 'Initial classes %', classes;
+
+
+    best_result = CDB_JenksBinsIteration(in_matrix, breaks, classes, invert, sdam, shuffles);
 
     --set the seed so we can ensure the same results
     SELECT setseed(0.4567) INTO seedtarget;
     --loop through random starting positions
     LOOP
-        IF j > iterations-1 THEN  EXIT;  END IF;  
+        IF j > iterations-1 THEN  EXIT;  END IF;
         i = 1;
-        tops = ARRAY[element_count];
+        tops = ARRAY[in_unique_count];
         LOOP
-            IF i = breaks THEN  EXIT;  END IF;  
-            SELECT array_agg(distinct e) INTO tops FROM (SELECT unnest(array_cat(tops, ARRAY[floor(random()*element_count::float)::int])) as e ORDER BY e) x WHERE e != 1;
-            i = array_length(tops, 1); 
-        END LOOP; 
+            IF i = breaks THEN  EXIT;  END IF;
+            SELECT array_agg(distinct e) INTO tops FROM (
+                SELECT unnest(array_cat(tops, ARRAY[trunc(random() * in_unique_count::float8)::int + 1])) as e ORDER BY e
+                ) x;
+            i = array_length(tops, 1);
+        END LOOP;
+        RAISE INFO 'Tops %', tops;
+        top := 0;
         i = 1;
-        LOOP  
-            IF i > breaks THEN  EXIT;  END IF;  
-            IF i = 1 THEN
-                bot = 1;
-            ELSE
-                bot = top+1;
-            END IF;
+        LOOP
+            bot := top + 1;
             top = tops[i];
-            IF i = 1 THEN 
-                classes = ARRAY[ARRAY[bot,top]]; 
-            ELSE 
-                classes = ARRAY_CAT(classes,ARRAY[bot,top]); 
+            IF i = 1 THEN
+                classes = ARRAY[ARRAY[bot,top]];
+            ELSE
+                classes = ARRAY_CAT(classes, ARRAY[bot,top]);
             END IF;
-            i := i+1; 
-        END LOOP; 
-        curr_result = CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);
+
+            i := i+1;
+            IF i > breaks THEN EXIT; END IF;
+        END LOOP;
+
+        RAISE INFO 'Classes %', classes;
+        curr_result = CDB_JenksBinsIteration(in_matrix, breaks, classes, invert, sdam, shuffles);
 
         IF curr_result[1] > best_result[1] THEN
             best_result = curr_result;
             j = j-1; -- if we found a better result, add one more search
         END IF;
+
         j = j+1;
     END LOOP;
 
     RETURN (best_result)[2:array_upper(best_result, 1)];
 END;
-$$ language plpgsql VOLATILE PARALLEL RESTRICTED;
-
+$$ LANGUAGE PLPGSQL IMMUTABLE PARALLEL RESTRICTED;
 
 
 --
 -- Perform a single iteration of the Jenks classification
 --
+-- Returns an array with:
+-- - First element: gvf
+-- - Second to 2+n: Category limits
+DROP FUNCTION IF EXISTS CDB_JenksBinsIteration ( in_matrix NUMERIC[], breaks INT, classes INT[], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT); -- Old signature
+
+CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_matrix NUMERIC[], breaks INT, classes INT[], invert BOOLEAN, sdam NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$
+DECLARE
+    i INT;
+    iterations INT = 0;
 
-CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_array NUMERIC[], breaks INT, classes INT[][], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$ 
-DECLARE 
-    tmp_val numeric; 
-    new_classes int[][];
-    tmp_class int[];
-    i INT := 1; 
-    j INT := 1; 
     side INT := 2;
-    sdam numeric; 
-    gvf numeric := 0.0; 
-    new_gvf numeric; 
-    arr_gvf numeric[]; 
-    class_avg numeric; 
-    class_max_i INT; 
-    class_min_i INT; 
-    class_max numeric; 
-    class_min numeric; 
-    reply numeric[]; 
-BEGIN 
 
-    -- Calculate the sum of squared deviations from the array mean (SDAM).
-    SELECT sum((arr_mean - e)^2) INTO sdam FROM (  SELECT unnest(in_array) as e ) x; 
-    --Identify the breaks for the lowest GVF
-    LOOP   
-        i = 1; 
-        LOOP 
-            -- get our mean
-            SELECT avg(e) INTO class_avg FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e) x;  
-            -- find the deviation
-            SELECT sum((class_avg-e)^2) INTO tmp_val FROM (   SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e  ) x;  
-            IF i = 1 THEN 
-                arr_gvf = ARRAY[tmp_val]; 
-                -- init our min/max map for later
-                class_max = arr_gvf[i];  
-                class_min = arr_gvf[i];  
-                class_min_i = 1;   
-                class_max_i = 1;  
-            ELSE 
-                arr_gvf = array_append(arr_gvf, tmp_val); 
-            END IF;
-            i := i+1;  
-            IF i > breaks THEN EXIT; END IF;  
-        END LOOP;  
-        -- calculate our new GVF
-        SELECT sdam-sum(e) INTO new_gvf FROM (  SELECT unnest(arr_gvf) as e  ) x;  
-        -- if no improvement was made, exit
-        IF new_gvf < gvf THEN EXIT; END IF; 
-        gvf = new_gvf;  
-        IF j > max_search THEN EXIT; END IF; 
-        j = j+1;
-        i = 1;  
-        LOOP  
-            --establish directionality (uppward through classes or downward)
-            IF arr_gvf[i] < class_min THEN   
-                class_min = arr_gvf[i];   
-                class_min_i = i;  
-            END IF;  
-            IF arr_gvf[i] > class_max THEN   
-                class_max = arr_gvf[i];   
-                class_max_i = i;  
-            END IF;  
-            i := i+1;  
-            IF i > breaks THEN EXIT; END IF;  
-        END LOOP;  
-        IF class_max_i > class_min_i THEN
-            class_min_i = class_max_i - 1;
+    gvf numeric := 0.0;
+    new_gvf numeric;
+    arr_gvf numeric[];
+    arr_avg numeric[];
+    class_avg numeric;
+    class_dev numeric;
+
+    class_max_i INT;
+    class_min_i INT;
+    dev_max numeric;
+    dev_min numeric;
+    best_classes INT[] = classes;
+
+    reply numeric[];
+
+BEGIN
+
+    -- We fill the arrays with the initial values
+    i = 0;
+    LOOP
+        IF i = breaks THEN EXIT; END IF;
+        i = i + 1;
+        RAISE INFO 'Loop %', i;
+
+        -- Get class mean
+        SELECT (sum(v * w) / sum(w)) INTO class_avg FROM (
+            SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v,
+                    unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w
+            ) x;
+
+        -- Get class deviation
+        SELECT sum((class_avg - v)^2 * w) INTO class_dev FROM (
+            SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v,
+                    unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w
+            ) x;
+
+
+        IF i = 1 THEN
+            arr_avg = ARRAY[class_avg];
+            arr_gvf = ARRAY[class_dev];
         ELSE
-            class_min_i = class_max_i + 1;
+            arr_avg = array_append(arr_avg, class_avg);
+            arr_gvf = array_append(arr_gvf, class_dev);
         END IF;
-            --Move from higher class to a lower gid order
-            IF class_max_i > class_min_i THEN
-                classes[class_max_i][1] = classes[class_max_i][1] + 1;
-                classes[class_min_i][2] = classes[class_min_i][2] + 1;
-            ELSE -- Move from lower class UP into a higher class by gid
-                classes[class_max_i][2] = classes[class_max_i][2] - 1;
-                classes[class_min_i][1] = classes[class_min_i][1] - 1;
+    END LOOP;
+
+
+
+    iterations = 0;
+    LOOP
+        IF iterations = max_search THEN EXIT; END IF;
+        iterations = iterations + 1;
+
+        -- calculate our new GVF
+        SELECT sdam - sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) as e ) x;
+
+        -- if no improvement was made, exit
+        IF new_gvf <= gvf OR class_max_i = class_min_i THEN EXIT; END IF;
+
+        -- We search for classes with the min and max deviation
+        i = 1;
+        class_min_i = 1;
+        class_max_i = 1;
+        dev_max = arr_gvf[1];
+        dev_min = arr_gvf[1];
+        LOOP
+            IF i = breaks THEN EXIT; END IF;
+            i = i + 1;
+
+            IF arr_gvf[i] < dev_min THEN
+                dev_min = arr_gvf[i];
+                class_min_i = i;
+            ELSE
+                IF arr_gvf[i] > dev_max THEN
+                    dev_max = arr_gvf[i];
+                    class_max_i = i;
+                END IF;
             END IF;
-    END LOOP; 
+        END LOOP;
+
+
+        -- Save best values for comparison and output
+        gvf = new_gvf;
+        best_classes = classes;
+        RAISE INFO 'Deviations %', arr_gvf;
+        RAISE INFO 'Min %. Max %', class_min_i, class_max_i;
+
+        -- Iterate by moving an element from class_max_i to class_min_i
+        IF class_min_i < class_max_i THEN
+            i := class_min_i;
+            LOOP
+                IF i = class_max_i THEN EXIT; END IF;
+                classes[i][2] = classes[i][2] + 1;
+                i := i + 1;
+            END LOOP;
+
+            i := class_max_i;
+            LOOP
+                IF i = class_min_i THEN EXIT; END IF;
+                classes[i][1] = classes[i][1] + 1;
+                i := i - 1;
+            END LOOP;
+        ELSE
+            i := class_min_i;
+            LOOP
+                IF i = class_max_i THEN EXIT; END IF;
+                classes[i][1] = classes[i][1] - 1;
+                i := i - 1;
+            END LOOP;
+
+            i := class_max_i;
+            LOOP
+                IF i = class_min_i THEN EXIT; END IF;
+                classes[i][2] = classes[i][2] - 1;
+                i := i + 1;
+            END LOOP;
+        END IF;
+        RAISE INFO 'Classes %', classes;
+
+        -- Recalculate avg and deviation for the affected classes
+        i = LEAST(class_min_i, class_max_i);
+        class_max_i = GREATEST(class_min_i, class_max_i);
+        LOOP
+            -- Get class mean
+            SELECT (sum(v * w) / sum(w)) INTO class_avg FROM (
+                SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v,
+                        unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w
+                ) x;
+
+            -- Get class deviation
+            SELECT sum((class_avg - v)^2 * w) INTO class_dev FROM (
+                SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v,
+                        unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w
+                ) x;
+
+            arr_avg[i] = class_avg;
+            arr_gvf[i] = class_dev;
+
+            IF i = class_max_i THEN EXIT; END IF;
+            i = i + 1;
+        END LOOP;
+
+    END LOOP;
 
     i = 1;
-    LOOP  
+    LOOP
         IF invert = TRUE THEN
             side = 1; --default returns bottom side of breaks, invert returns top side
         END IF;
-        reply = array_append(reply, in_array[classes[i][side]]);  
-        i = i+1;  
-        IF i > breaks THEN  EXIT; END IF; 
-    END LOOP; 
-    
-    RETURN array_prepend(gvf, reply); 
+        reply = array_append(reply, unnest(in_matrix[1:1][best_classes[i][side]:best_classes[i][side]]));
+        i = i+1;
+        IF i > breaks THEN  EXIT; END IF;
+    END LOOP;
 
-END; 
-$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+    reply = array_prepend(gvf, reply);
+    RAISE INFO 'Reply: %', reply;
+    RETURN reply;
+
+END;
+$$ LANGUAGE PLPGSQL IMMUTABLE PARALLEL SAFE;