Merge pull request #340 from Algunenano/trav_jenkins

Improvements in `CDB_JenksBins`
2018-09-13 10:24:41 +02:00 · 2018-09-13 10:24:41 +02:00 · 5033a0ba35
commit 5033a0ba35
parent 60a21d34bf d2a00852a8
5 changed files with 321 additions and 168 deletions
--- a/2
+++ b/2
@ -1,7 +1,7 @@
 # cartodb/Makefile
 EXTENSION = cartodb
-EXTVERSION = 0.23.2
+EXTVERSION = 0.24.0
 SED = sed
 AWK = awk
--- a/NEWS.md
+++ b/NEWS.md
@ -1,6 +1,13 @@
 0.24.0 (2018-XX-XX)
 * Travis: Test with PostgreSQL 9.5 and 10.
 * _cdb_estimated_extent: Fix bug with ST_EstimatedExtent interaction.
 * Improvements in `CDB_JenksBins`.
  * Now it ignores NULLs.
  * No longer puts the same value in multiple categories.
  * Removes all limits related to size.
  * If not set, the number of iterations done is based now on the size of the array.
  * Fixed multiple bugs.
  * The internal function `CDB_JenksBinsIteration` has changed its signature.
 0.23.2 (2018-07-19)
 * Fix `CDB_QueryTablesText` with parenthesized queries (#335)
--- a/scripts-available/CDB_JenksBins.sql
+++ b/scripts-available/CDB_JenksBins.sql
@ -13,70 +13,89 @@
 --
 --
-
+CREATE OR REPLACE FUNCTION CDB_JenksBins(in_array NUMERIC[], breaks INT, iterations INT DEFAULT 0, invert BOOLEAN DEFAULT FALSE)
-CREATE OR REPLACE FUNCTION CDB_JenksBins ( in_array NUMERIC[], breaks INT, iterations INT DEFAULT 5, invert BOOLEAN DEFAULT FALSE) RETURNS NUMERIC[] as $$ 
+RETURNS NUMERIC[] as
 $$
 DECLARE
-    element_count INT4; 
+    in_matrix NUMERIC[][];
    in_unique_count BIGINT;
    shuffles INT;
    arr_mean NUMERIC;
    sdam NUMERIC;
    i INT;
    bot INT;
    top INT;
    tops INT[];
    classes INT[][];
-    i INT := 1; j INT := 1; 
+    j INT := 1;
    curr_result NUMERIC[];
    best_result NUMERIC[];
    seedtarget TEXT;
-    quant NUMERIC[];
+
    shuffles INT;
 BEGIN
-    -- get the total size of our row
+    -- We clean the input array (remove NULLs) and create 2 arrays
-    element_count := array_length(in_array, 1); --array_upper(in_array, 1) - array_lower(in_array, 1); 
+    -- [1] contains the unique values in in_array
-    -- ensure the ordering of in_array
+    -- [2] contains the number of appearances of those unique values
-    SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x;
+    SELECT ARRAY[array_agg(value), array_agg(count)] FROM
-    -- stop if no rows 
+    (
-    IF element_count IS NULL THEN  
+        SELECT value, count(1)::numeric as count
        FROM  unnest(in_array) AS value
        WHERE value is NOT NULL
        GROUP BY value
        ORDER BY value
    ) __clean_array_q INTO in_matrix;
    -- Get the number of unique values
    in_unique_count := array_length(in_matrix[1:1], 2);
    IF in_unique_count IS NULL THEN
        RETURN NULL;
    END IF;
-    -- stop if our breaks are more than our input array size
+
-    IF element_count < breaks THEN  
+    IF in_unique_count <= breaks THEN
-        RETURN in_array; 
+        -- There isn't enough distinct values for the requested breaks
        RETURN ARRAY(Select unnest(in_matrix[1:1])) _a;
    END IF;
-    shuffles := LEAST(GREATEST(floor(2500000.0/(element_count::float*iterations::float)), 1), 750)::int;
+    -- If not declated explicitly we iterate based on the length of the array
-    -- get our mean value
+    IF iterations < 1 THEN
-    SELECT avg(v) INTO arr_mean FROM (  SELECT unnest(in_array) as v ) x; 
+        -- This is based on a 'looks fine' heuristic
-
+        iterations := log(in_unique_count)::integer + 1;
    -- assume best is actually Quantile
    SELECT CDB_QuantileBins(in_array, breaks) INTO quant;
    -- if data is very very large, just return quant and be done
    IF element_count > 5000000 THEN
        RETURN quant;
    END IF;
-    -- change quant into bottom, top markers
+    -- We set the number of shuffles per iteration as the number of unique values but
    -- this is just another 'looks fine' heuristic
    shuffles := in_unique_count;
    -- Get the mean value of the whole vector (already ignores NULLs)
    SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x;
    -- Calculate the sum of squared deviations from the array mean (SDAM).
    SELECT sum(((arr_mean - v)^2) * w) INTO sdam FROM (
        SELECT unnest(in_matrix[1:1]) as v, unnest(in_matrix[2:2]) as w
        ) x;
    -- To start, we create ranges with approximately the same amount of different values
    top := 0;
    i := 1;
    LOOP
-        IF i = 1 THEN 
+        bot := top + 1;
-            bot = 1;
+        top := ROUND(i * in_unique_count::numeric / breaks::NUMERIC);
-        ELSE 
+
            -- use last top to find this bot
            bot = top+1;
        END IF;
        IF i = breaks THEN
            top = element_count;
        ELSE
            SELECT count(*) INTO top FROM ( SELECT unnest(in_array) as v) x WHERE v <= quant[i];
        END IF;
        IF i = 1 THEN
            classes = ARRAY[ARRAY[bot,top]];
        ELSE
-            classes = ARRAY_CAT(classes,ARRAY[bot,top]); 
+            classes = ARRAY_CAT(classes, ARRAY[bot,top]);
        END IF;
        i := i + 1;
        IF i > breaks THEN EXIT; END IF;
        i = i+1;
    END LOOP;
-    best_result = CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);
+    best_result = CDB_JenksBinsIteration(in_matrix, breaks, classes, invert, sdam, shuffles);
    --set the seed so we can ensure the same results
    SELECT setseed(0.4567) INTO seedtarget;
@ -84,124 +103,230 @@ BEGIN
    LOOP
        IF j > iterations-1 THEN  EXIT;  END IF;
        i = 1;
-        tops = ARRAY[element_count];
+        tops = ARRAY[in_unique_count];
        LOOP
            IF i = breaks THEN  EXIT;  END IF;
-            SELECT array_agg(distinct e) INTO tops FROM (SELECT unnest(array_cat(tops, ARRAY[floor(random()*element_count::float)::int])) as e ORDER BY e) x WHERE e != 1;
+            SELECT array_agg(distinct e) INTO tops FROM (
                SELECT unnest(array_cat(tops, ARRAY[trunc(random() * in_unique_count::float8)::int + 1])) as e ORDER BY e
                ) x;
            i = array_length(tops, 1);
        END LOOP;
        top := 0;
        i = 1;
        LOOP
-            IF i > breaks THEN  EXIT;  END IF;  
+            bot := top + 1;
            IF i = 1 THEN
                bot = 1;
            ELSE
                bot = top+1;
            END IF;
            top = tops[i];
            IF i = 1 THEN
                classes = ARRAY[ARRAY[bot,top]];
            ELSE
-                classes = ARRAY_CAT(classes,ARRAY[bot,top]); 
+                classes = ARRAY_CAT(classes, ARRAY[bot,top]);
            END IF;
            i := i+1;
            IF i > breaks THEN EXIT; END IF;
        END LOOP;
-        curr_result = CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);
+
        curr_result = CDB_JenksBinsIteration(in_matrix, breaks, classes, invert, sdam, shuffles);
        IF curr_result[1] > best_result[1] THEN
            best_result = curr_result;
            j = j-1; -- if we found a better result, add one more search
        END IF;
        j = j+1;
    END LOOP;
    RETURN (best_result)[2:array_upper(best_result, 1)];
 END;
-$$ language plpgsql VOLATILE PARALLEL RESTRICTED;
+$$ LANGUAGE PLPGSQL IMMUTABLE PARALLEL RESTRICTED;
 --
 -- Perform a single iteration of the Jenks classification
 --
 -- Returns an array with:
 -- - First element: gvf
 -- - Second to 2+n: Category limits
 DROP FUNCTION IF EXISTS CDB_JenksBinsIteration ( in_matrix NUMERIC[], breaks INT, classes INT[], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT); -- Old signature
-CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_array NUMERIC[], breaks INT, classes INT[][], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$ 
+CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_matrix NUMERIC[], breaks INT, classes INT[], invert BOOLEAN, sdam NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$
 DECLARE
-    tmp_val numeric; 
+    i INT;
-    new_classes int[][];
+    iterations INT = 0;
-    tmp_class int[];
+
    i INT := 1; 
    j INT := 1; 
    side INT := 2;
-    sdam numeric; 
+
    gvf numeric := 0.0;
    new_gvf numeric;
    arr_gvf numeric[];
    arr_avg numeric[];
    class_avg numeric;
-    class_max_i INT; 
+    class_dev numeric;
-    class_min_i INT; 
+
-    class_max numeric; 
+    class_max_i INT = 0;
-    class_min numeric; 
+    class_min_i INT = 0;
    dev_max numeric;
    dev_min numeric;
    best_classes INT[] = classes;
    best_gvf numeric[];
    best_avg numeric[];
    move_elements INT = 1;
    reply numeric[];
 BEGIN
-    -- Calculate the sum of squared deviations from the array mean (SDAM).
+    -- We fill the arrays with the initial values
-    SELECT sum((arr_mean - e)^2) INTO sdam FROM (  SELECT unnest(in_array) as e ) x; 
+    i = 0;
    --Identify the breaks for the lowest GVF
    LOOP
-        i = 1; 
+        IF i = breaks THEN EXIT; END IF;
-        LOOP 
+        i = i + 1;
-            -- get our mean
+
-            SELECT avg(e) INTO class_avg FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e) x;  
+        -- Get class mean
-            -- find the deviation
+        SELECT (sum(v * w) / sum(w)) INTO class_avg FROM (
-            SELECT sum((class_avg-e)^2) INTO tmp_val FROM (   SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e  ) x;  
+            SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v,
-            IF i = 1 THEN 
+                    unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w
-                arr_gvf = ARRAY[tmp_val]; 
+            ) x;
-                -- init our min/max map for later
+
-                class_max = arr_gvf[i];  
+        -- Get class deviation
-                class_min = arr_gvf[i];  
+        SELECT sum((class_avg - v)^2 * w) INTO class_dev FROM (
-                class_min_i = 1;   
+            SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v,
-                class_max_i = 1;  
+                    unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w
-            ELSE 
+            ) x;
-                arr_gvf = array_append(arr_gvf, tmp_val); 
+
-            END IF;
+
-            i := i+1;  
+        IF i = 1 THEN
-            IF i > breaks THEN EXIT; END IF;  
+            arr_avg = ARRAY[class_avg];
-        END LOOP;  
+            arr_gvf = ARRAY[class_dev];
        -- calculate our new GVF
        SELECT sdam-sum(e) INTO new_gvf FROM (  SELECT unnest(arr_gvf) as e  ) x;  
        -- if no improvement was made, exit
        IF new_gvf < gvf THEN EXIT; END IF; 
        gvf = new_gvf;  
        IF j > max_search THEN EXIT; END IF; 
        j = j+1;
        i = 1;  
        LOOP  
            --establish directionality (uppward through classes or downward)
            IF arr_gvf[i] < class_min THEN   
                class_min = arr_gvf[i];   
                class_min_i = i;  
            END IF;  
            IF arr_gvf[i] > class_max THEN   
                class_max = arr_gvf[i];   
                class_max_i = i;  
            END IF;  
            i := i+1;  
            IF i > breaks THEN EXIT; END IF;  
        END LOOP;  
        IF class_max_i > class_min_i THEN
            class_min_i = class_max_i - 1;
        ELSE
-            class_min_i = class_max_i + 1;
+            arr_avg = array_append(arr_avg, class_avg);
            arr_gvf = array_append(arr_gvf, class_dev);
        END IF;
-            --Move from higher class to a lower gid order
+    END LOOP;
-            IF class_max_i > class_min_i THEN
+
-                classes[class_max_i][1] = classes[class_max_i][1] + 1;
+    -- We copy the values to avoid recalculation when a failure happens
-                classes[class_min_i][2] = classes[class_min_i][2] + 1;
+    best_avg = arr_avg;
-            ELSE -- Move from lower class UP into a higher class by gid
+    best_gvf = arr_gvf;
-                classes[class_max_i][2] = classes[class_max_i][2] - 1;
+
-                classes[class_min_i][1] = classes[class_min_i][1] - 1;
+    iterations = 0;
    LOOP
        IF iterations = max_search THEN EXIT; END IF;
        iterations = iterations + 1;
        -- calculate our new GVF
        SELECT sdam - sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) as e ) x;
        -- Check if any improvement was made
        IF new_gvf <= gvf THEN
            -- If we were moving too many elements, go back and move less
            IF move_elements <= 2 OR class_max_i = class_min_i THEN
                EXIT;
            END IF;
            move_elements = GREATEST(move_elements / 8, 1);
            -- Rollback from saved statuses
            classes = best_classes;
            new_gvf = gvf;
            i = class_min_i;
            LOOP
                arr_avg[i] = best_avg[i];
                arr_gvf[i] = best_gvf[i];
                IF i = class_max_i THEN EXIT; END IF;
                i = i + 1;
            END LOOP;
        END IF;
        -- We search for the classes with the min and max deviation
        i = 1;
        class_min_i = 1;
        class_max_i = 1;
        dev_max = arr_gvf[1];
        dev_min = arr_gvf[1];
        LOOP
            IF i = breaks THEN EXIT; END IF;
            i = i + 1;
            IF arr_gvf[i] < dev_min THEN
                dev_min = arr_gvf[i];
                class_min_i = i;
            ELSE
                IF arr_gvf[i] > dev_max THEN
                    dev_max = arr_gvf[i];
                    class_max_i = i;
                END IF;
            END IF;
        END LOOP;
        -- Save best values for comparison and output
        gvf = new_gvf;
        best_classes = classes;
        -- Limit the moved elements as to not remove everything from class_max_i
        move_elements = LEAST(move_elements, classes[class_max_i][2] - classes[class_max_i][1]);
        -- Move `move_elements` from class_max_i to class_min_i
        IF class_min_i < class_max_i THEN
            i := class_min_i;
            LOOP
                IF i = class_max_i THEN EXIT; END IF;
                classes[i][2] = classes[i][2] + move_elements;
                i := i + 1;
            END LOOP;
            i := class_max_i;
            LOOP
                IF i = class_min_i THEN EXIT; END IF;
                classes[i][1] = classes[i][1] + move_elements;
                i := i - 1;
            END LOOP;
        ELSE
            i := class_min_i;
            LOOP
                IF i = class_max_i THEN EXIT; END IF;
                classes[i][1] = classes[i][1] - move_elements;
                i := i - 1;
            END LOOP;
            i := class_max_i;
            LOOP
                IF i = class_min_i THEN EXIT; END IF;
                classes[i][2] = classes[i][2] - move_elements;
                i := i + 1;
            END LOOP;
        END IF;
        -- Recalculate avg and deviation ONLY for the affected classes
        i = LEAST(class_min_i, class_max_i);
        class_max_i = GREATEST(class_min_i, class_max_i);
        class_min_i = i;
        LOOP
            SELECT (sum(v * w) / sum(w)) INTO class_avg FROM (
                SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v,
                        unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w
                ) x;
            SELECT sum((class_avg - v)^2 * w) INTO class_dev FROM (
                SELECT unnest(in_matrix[1:1][classes[i][1]:classes[i][2]]) as v,
                        unnest(in_matrix[2:2][classes[i][1]:classes[i][2]]) as w
                ) x;
            -- Save status (in case it's needed for rollback) and store the new one
            best_avg[i] = arr_avg[i];
            arr_avg[i] = class_avg;
            best_gvf[i] = arr_gvf[i];
            arr_gvf[i] = class_dev;
            IF i = class_max_i THEN EXIT; END IF;
            i = i + 1;
        END LOOP;
        move_elements = move_elements * 2;
    END LOOP;
    i = 1;
@ -209,13 +334,13 @@ BEGIN
        IF invert = TRUE THEN
            side = 1; --default returns bottom side of breaks, invert returns top side
        END IF;
-        reply = array_append(reply, in_array[classes[i][side]]);  
+        reply = array_append(reply, unnest(in_matrix[1:1][best_classes[i][side]:best_classes[i][side]]));
        i = i+1;
        IF i > breaks THEN  EXIT; END IF;
    END LOOP;
-    RETURN array_prepend(gvf, reply); 
+    reply = array_prepend(gvf, reply);
    RETURN reply;
 END;
-$$ language plpgsql IMMUTABLE PARALLEL SAFE;
+$$ LANGUAGE PLPGSQL IMMUTABLE PARALLEL SAFE;
--- a/test/CDB_JenksBinsTest.sql
+++ b/test/CDB_JenksBinsTest.sql
@ -5,9 +5,8 @@ WITH data AS (
                 15.01, 14.99,
                 20.1, 19.9]::numeric[] AS s
 )
-- expectation is: 1, 5, 10, 15, 20
+SELECT unnest(CDB_JenksBins(s, 5)) FROM data;
-- TODO: fix cdb_jenksbins to match ^^
+
 SELECT round(unnest(CDB_JenksBins(s, 5))) FROM data;
 WITH data_nulls AS (
    SELECT Array[0.99, 1.0, 1.01,
@ -18,6 +17,20 @@ WITH data_nulls AS (
                 null, null,
                 20.1, 19.9]::numeric[] AS s
 )
-- expectation is: 1, 5, 10, 15, 20
+SELECT unnest(CDB_JenksBins(s, 5)) FROM data_nulls;
-- TODO: fix cdb_jenksbins to match ^^
+
-SELECT round(unnest(CDB_JenksBins(s, 5))) FROM data_nulls;
+
 WITH data_inverse AS (
    SELECT Array[0.99, 1.0, 1.01,
                 4.99, 5.01,
                 10.01, 10.01,
                 15.01, 14.99,
                 20.1, 19.9]::numeric[] AS s
 )
 SELECT unnest(CDB_JenksBins(s, 5, 0, true)) FROM data_inverse;
 WITH data_small AS (
    SELECT Array[0.99, 1.0, 10.01, 10.01, 10.01, 10.01]::numeric[] AS s
 )
 SELECT unnest(CDB_JenksBins(s, 4)) FROM data_small;
--- a/test/CDB_JenksBinsTest_expect
+++ b/test/CDB_JenksBinsTest_expect
@ -1,10 +1,18 @@
-1
+1.01
-5
+5.01
-10
+10.01
-20
+15.01
-20
+20.1
-1
+1.01
-5
+5.01
-10
+10.01
-20
+15.01
-
+20.1
 0.99
 4.99
 10.01
 14.99
 19.9
 0.99
 1.0
 10.01
+.01
+.01
+.01
+.01
+.1
+.01
+.01
+.01
+.01
+.1
+.99
+.99
+.01
+.99
+.9
+.99
+.0
+.01