cartodb-postgresql/scripts-available/CDB_JenksBins.sql

--
-- Determine the Jenks classifications from a numeric array
--
-- The input is sorted, an initial classification is derived from
-- CDB_QuantileBins (defined elsewhere), and CDB_JenksBinsIteration is then
-- run on that seed and on a series of random starting classifications.
-- The result with the highest score wins.
--
-- @param in_array A numeric array of numbers to determine the best
--            bins based on the Jenks method.
--
-- @param breaks The number of bins you want to find. NULL or values
--            below 1 yield NULL. If it exceeds the number of elements,
--            the sorted input is returned unchanged.
--
-- @param iterations The number of different starting positions to test.
--            NULL is treated as the default (5), values below 1 as 1.
--
-- @param invert Optional whether to return the top of each bin (default)
--               or the bottom. BOOLEAN, default=FALSE.
--
-- @return One value per bin, or NULL for an empty/NULL input array.
--         Inputs with more than 5,000,000 elements get the
--         CDB_QuantileBins result as is.
--
-- NOTE: the function calls setseed(0.4567) so the random starting
-- positions are reproducible; this resets the session's random seed.
--


CREATE OR REPLACE FUNCTION CDB_JenksBins ( in_array NUMERIC[], breaks INT, iterations INT DEFAULT 5, invert BOOLEAN DEFAULT FALSE) RETURNS NUMERIC[] as $$
DECLARE
    element_count INT4;
    arr_mean NUMERIC;
    bot INT;
    top INT;
    tops INT[];
    classes INT[][];
    j INT := 1;
    curr_result NUMERIC[];
    best_result NUMERIC[];
    quant NUMERIC[];
    shuffles INT;
BEGIN
    -- get the total size of our row; stop if no rows
    element_count := array_length(in_array, 1);
    IF element_count IS NULL THEN
        RETURN NULL;
    END IF;

    -- a NULL or non-positive bin count would keep the loops below from
    -- ever reaching their exit condition
    IF breaks IS NULL OR breaks < 1 THEN
        RETURN NULL;
    END IF;

    -- ensure the ordering of in_array
    SELECT array_agg(e ORDER BY e) INTO in_array FROM (SELECT unnest(in_array) AS e) x;

    -- stop if our breaks are more than our input array size
    IF element_count < breaks THEN
        RETURN in_array;
    END IF;

    -- a single bin spans the whole sorted array: nothing to optimise
    IF breaks = 1 THEN
        RETURN ARRAY[in_array[CASE WHEN invert THEN 1 ELSE element_count END]];
    END IF;

    -- iterations = 0 would divide by zero below, NULL would never
    -- terminate the search loop
    iterations := GREATEST(COALESCE(iterations, 5), 1);

    -- budget of boundary moves per run, between 1 and 750, shrinking as
    -- element_count * iterations grows
    shuffles := LEAST(GREATEST(floor(2500000.0/(element_count::float*iterations::float)), 1), 750)::int;

    -- assume best is actually Quantile
    SELECT CDB_QuantileBins(in_array, breaks) INTO quant;

    -- if data is very very large, just return quant and be done
    IF element_count > 5000000 THEN
        RETURN quant;
    END IF;

    -- get our mean value
    SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) AS v ) x;

    -- change quant into bottom, top markers (exactly one row per bin)
    FOR k IN 1..breaks LOOP
        -- each bin starts right after the previous bin's top
        bot := CASE WHEN k = 1 THEN 1 ELSE top + 1 END;
        IF k = breaks THEN
            top := element_count;
        ELSE
            SELECT count(*) INTO top FROM ( SELECT unnest(in_array) AS v ) x WHERE v <= quant[k];
        END IF;
        IF k = 1 THEN
            classes := ARRAY[ARRAY[bot, top]];
        ELSE
            classes := array_cat(classes, ARRAY[bot, top]);
        END IF;
    END LOOP;

    best_result := CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);

    --set the seed so we can ensure the same results
    PERFORM setseed(0.4567);

    --loop through random starting positions
    WHILE j < iterations LOOP
        -- the last bin always ends at the last element; draw the other
        -- tops at random until there are `breaks` distinct ones
        tops := ARRAY[element_count];
        WHILE array_length(tops, 1) < breaks LOOP
            -- random()*element_count floors to 0..element_count-1; a top
            -- of 0 would describe an empty first bin, so it is discarded
            SELECT array_agg(DISTINCT e ORDER BY e) INTO tops
              FROM ( SELECT unnest(array_cat(tops, ARRAY[floor(random()*element_count::float)::int])) AS e ) x
             WHERE e > 0;
        END LOOP;

        -- turn the sorted tops into bottom, top markers
        FOR k IN 1..breaks LOOP
            bot := CASE WHEN k = 1 THEN 1 ELSE top + 1 END;
            top := tops[k];
            IF k = 1 THEN
                classes := ARRAY[ARRAY[bot, top]];
            ELSE
                classes := array_cat(classes, ARRAY[bot, top]);
            END IF;
        END LOOP;

        curr_result := CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);

        IF curr_result[1] > best_result[1] THEN
            -- if we found a better result, add one more search
            -- (j is deliberately not advanced)
            best_result := curr_result;
        ELSE
            j := j + 1;
        END IF;
    END LOOP;

    -- element 1 is the score; the bins follow
    RETURN (best_result)[2:array_upper(best_result, 1)];
END;
$$ language plpgsql IMMUTABLE;


--
-- Perform a single iteration of the Jenks classification
--
-- Starting from the supplied class boundaries, the function scores the
-- classification as SDAM (sum of squared deviations from arr_mean) minus
-- the sum of the within-class squared deviations. It then moves one
-- element out of the class with the largest deviation into its
-- neighbour on the side of the class with the smallest deviation, and
-- repeats while the score does not decrease.
--
-- @param in_array The values to classify (CDB_JenksBins passes them
--            sorted ascending).
--
-- @param breaks The number of classes; rows 1..breaks of classes are used.
--
-- @param classes One [bottom, top] row of 1-based indexes into in_array
--            per class.
--
-- @param invert When TRUE the value at each class's bottom index is
--            returned instead of the value at its top index.
--
-- @param element_count Not used by this function; kept so existing
--            callers keep working.
--
-- @param arr_mean Mean of in_array, used to compute SDAM.
--
-- @param max_search Upper bound on the number of boundary moves tried.
--
-- @return An array whose first element is the best score reached,
--         followed by one value per class taken from the class layout
--         that produced that score.
--

CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_array NUMERIC[], breaks INT, classes INT[][], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$
DECLARE
    tmp_val numeric;
    best_classes int[][];
    j INT := 1;
    side INT := 2;
    sdam numeric;
    gvf numeric := 0.0;
    new_gvf numeric;
    arr_gvf numeric[];
    class_avg numeric;
    class_max_i INT;
    class_min_i INT;
    class_max numeric;
    class_min numeric;
    reply numeric[];
BEGIN

    -- default returns the top side of each class, invert the bottom side
    IF invert = TRUE THEN
        side := 1;
    END IF;

    -- Calculate the sum of squared deviations from the array mean (SDAM).
    SELECT sum((arr_mean - e)^2) INTO sdam FROM ( SELECT unnest(in_array) AS e ) x;

    -- layout belonging to the score currently held in gvf
    best_classes := classes;

    --Identify the breaks for the best score
    LOOP
        -- squared deviation from the class mean, one entry per class
        arr_gvf := ARRAY[]::numeric[];
        FOR i IN 1..breaks LOOP
            -- get our mean
            SELECT avg(e) INTO class_avg FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) AS e ) x;
            -- find the deviation
            SELECT sum((class_avg - e)^2) INTO tmp_val FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) AS e ) x;
            arr_gvf := array_append(arr_gvf, tmp_val);
        END LOOP;

        -- calculate our new score
        SELECT sdam - sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) AS e ) x;

        -- if the last move made things worse, undo it and stop, so the
        -- returned values match the returned score
        IF new_gvf < gvf THEN
            classes := best_classes;
            EXIT;
        END IF;
        gvf := new_gvf;
        best_classes := classes;

        -- a single class has no neighbour to trade elements with
        IF breaks < 2 OR j > max_search THEN
            EXIT;
        END IF;
        j := j + 1;

        --establish directionality (upward through classes or downward):
        --find the classes with the largest and the smallest deviation;
        --ties keep the lowest class index
        class_max := arr_gvf[1];
        class_min := arr_gvf[1];
        class_max_i := 1;
        class_min_i := 1;
        FOR i IN 2..breaks LOOP
            IF arr_gvf[i] < class_min THEN
                class_min := arr_gvf[i];
                class_min_i := i;
            END IF;
            IF arr_gvf[i] > class_max THEN
                class_max := arr_gvf[i];
                class_max_i := i;
            END IF;
        END LOOP;

        IF class_max_i > class_min_i THEN
            -- the smallest class lies below: hand the bottom element of
            -- the largest class to the class directly below it
            class_min_i := class_max_i - 1;
            classes[class_max_i][1] := classes[class_max_i][1] + 1;
            classes[class_min_i][2] := classes[class_min_i][2] + 1;
        ELSE
            -- otherwise hand the top element of the largest class to the
            -- class directly above it
            class_min_i := class_max_i + 1;
            classes[class_max_i][2] := classes[class_max_i][2] - 1;
            classes[class_min_i][1] := classes[class_min_i][1] - 1;
        END IF;
    END LOOP;

    -- collect the value at the chosen side of every class
    FOR i IN 1..breaks LOOP
        reply := array_append(reply, in_array[classes[i][side]]);
    END LOOP;

    RETURN array_prepend(gvf, reply);

END;
$$ language plpgsql IMMUTABLE;
Copy cartodb lib/sql scripts from CDB_CartodbfyTable branch 2014-05-05 23:13:06 +08:00			`--`
			`-- Determine the Jenks classifications from a numeric array`
			`--`
			`-- @param in_array A numeric array of numbers to determine the best`
			`-- bins based on the Jenks method.`
			`--`
			`-- @param breaks The number of bins you want to find.`
			`--`
			`-- @param iterations The number of different starting positions to test.`
			`--`
			`-- @param invert Optional whether to return the top of each bin (default)`
			`-- or the bottom. BOOLEAN, default=FALSE.`
			`--`
			`--`


			`CREATE OR REPLACE FUNCTION CDB_JenksBins ( in_array NUMERIC[], breaks INT, iterations INT DEFAULT 5, invert BOOLEAN DEFAULT FALSE) RETURNS NUMERIC[] as $$`
			`DECLARE`
			`element_count INT4;`
			`arr_mean NUMERIC;`
			`bot INT;`
			`top INT;`
			`tops INT[];`
			`classes INT[][];`
			`i INT := 1; j INT := 1;`
			`curr_result NUMERIC[];`
			`best_result NUMERIC[];`
			`seedtarget TEXT;`
			`quant NUMERIC[];`
			`shuffles INT;`
			`BEGIN`
			`-- get the total size of our row`
			`element_count := array_length(in_array, 1); --array_upper(in_array, 1) - array_lower(in_array, 1);`
			`-- ensure the ordering of in_array`
			`SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e) x;`
			`-- stop if no rows`
			`IF element_count IS NULL THEN`
			`RETURN NULL;`
			`END IF;`
			`-- stop if our breaks are more than our input array size`
			`IF element_count < breaks THEN`
			`RETURN in_array;`
			`END IF;`

			`shuffles := LEAST(GREATEST(floor(2500000.0/(element_count::float*iterations::float)), 1), 750)::int;`
			`-- get our mean value`
			`SELECT avg(v) INTO arr_mean FROM ( SELECT unnest(in_array) as v ) x;`

			`-- assume best is actually Quantile`
			`SELECT CDB_QuantileBins(in_array, breaks) INTO quant;`

			`-- if data is very very large, just return quant and be done`
			`IF element_count > 5000000 THEN`
			`RETURN quant;`
			`END IF;`

			`-- change quant into bottom, top markers`
			`LOOP`
			`IF i = 1 THEN`
			`bot = 1;`
			`ELSE`
			`-- use last top to find this bot`
			`bot = top+1;`
			`END IF;`
			`IF i = breaks THEN`
			`top = element_count;`
			`ELSE`
			`SELECT count(*) INTO top FROM ( SELECT unnest(in_array) as v) x WHERE v <= quant[i];`
			`END IF;`
			`IF i = 1 THEN`
			`classes = ARRAY[ARRAY[bot,top]];`
			`ELSE`
			`classes = ARRAY_CAT(classes,ARRAY[bot,top]);`
			`END IF;`
			`IF i > breaks THEN EXIT; END IF;`
			`i = i+1;`
			`END LOOP;`

			`best_result = CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);`

			`--set the seed so we can ensure the same results`
			`SELECT setseed(0.4567) INTO seedtarget;`
			`--loop through random starting positions`
			`LOOP`
			`IF j > iterations-1 THEN EXIT; END IF;`
			`i = 1;`
			`tops = ARRAY[element_count];`
			`LOOP`
			`IF i = breaks THEN EXIT; END IF;`
			`SELECT array_agg(distinct e) INTO tops FROM (SELECT unnest(array_cat(tops, ARRAY[floor(random()*element_count::float)::int])) as e ORDER BY e) x WHERE e != 1;`
			`i = array_length(tops, 1);`
			`END LOOP;`
			`i = 1;`
			`LOOP`
			`IF i > breaks THEN EXIT; END IF;`
			`IF i = 1 THEN`
			`bot = 1;`
			`ELSE`
			`bot = top+1;`
			`END IF;`
			`top = tops[i];`
			`IF i = 1 THEN`
			`classes = ARRAY[ARRAY[bot,top]];`
			`ELSE`
			`classes = ARRAY_CAT(classes,ARRAY[bot,top]);`
			`END IF;`
			`i := i+1;`
			`END LOOP;`
			`curr_result = CDB_JenksBinsIteration( in_array, breaks, classes, invert, element_count, arr_mean, shuffles);`

			`IF curr_result[1] > best_result[1] THEN`
			`best_result = curr_result;`
			`j = j-1; -- if we found a better result, add one more search`
			`END IF;`
			`j = j+1;`
			`END LOOP;`

			`RETURN (best_result)[2:array_upper(best_result, 1)];`
			`END;`
			`$$ language plpgsql IMMUTABLE;`



			`--`
			`-- Perform a single iteration of the Jenks classification`
			`--`

			`CREATE OR REPLACE FUNCTION CDB_JenksBinsIteration ( in_array NUMERIC[], breaks INT, classes INT[][], invert BOOLEAN, element_count INT4, arr_mean NUMERIC, max_search INT DEFAULT 50) RETURNS NUMERIC[] as $$`
			`DECLARE`
			`tmp_val numeric;`
			`new_classes int[][];`
			`tmp_class int[];`
			`i INT := 1;`
			`j INT := 1;`
			`side INT := 2;`
			`sdam numeric;`
			`gvf numeric := 0.0;`
			`new_gvf numeric;`
			`arr_gvf numeric[];`
			`class_avg numeric;`
			`class_max_i INT;`
			`class_min_i INT;`
			`class_max numeric;`
			`class_min numeric;`
			`reply numeric[];`
			`BEGIN`

			`-- Calculate the sum of squared deviations from the array mean (SDAM).`
			`SELECT sum((arr_mean - e)^2) INTO sdam FROM ( SELECT unnest(in_array) as e ) x;`
			`--Identify the breaks for the lowest GVF`
			`LOOP`
			`i = 1;`
			`LOOP`
			`-- get our mean`
			`SELECT avg(e) INTO class_avg FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e) x;`
			`-- find the deviation`
			`SELECT sum((class_avg-e)^2) INTO tmp_val FROM ( SELECT unnest(in_array[classes[i][1]:classes[i][2]]) as e ) x;`
			`IF i = 1 THEN`
			`arr_gvf = ARRAY[tmp_val];`
			`-- init our min/max map for later`
			`class_max = arr_gvf[i];`
			`class_min = arr_gvf[i];`
			`class_min_i = 1;`
			`class_max_i = 1;`
			`ELSE`
			`arr_gvf = array_append(arr_gvf, tmp_val);`
			`END IF;`
			`i := i+1;`
			`IF i > breaks THEN EXIT; END IF;`
			`END LOOP;`
			`-- calculate our new GVF`
			`SELECT sdam-sum(e) INTO new_gvf FROM ( SELECT unnest(arr_gvf) as e ) x;`
			`-- if no improvement was made, exit`
			`IF new_gvf < gvf THEN EXIT; END IF;`
			`gvf = new_gvf;`
			`IF j > max_search THEN EXIT; END IF;`
			`j = j+1;`
			`i = 1;`
			`LOOP`
			`--establish directionality (uppward through classes or downward)`
			`IF arr_gvf[i] < class_min THEN`
			`class_min = arr_gvf[i];`
			`class_min_i = i;`
			`END IF;`
			`IF arr_gvf[i] > class_max THEN`
			`class_max = arr_gvf[i];`
			`class_max_i = i;`
			`END IF;`
			`i := i+1;`
			`IF i > breaks THEN EXIT; END IF;`
			`END LOOP;`
			`IF class_max_i > class_min_i THEN`
			`class_min_i = class_max_i - 1;`
			`ELSE`
			`class_min_i = class_max_i + 1;`
			`END IF;`
			`--Move from higher class to a lower gid order`
			`IF class_max_i > class_min_i THEN`
			`classes[class_max_i][1] = classes[class_max_i][1] + 1;`
			`classes[class_min_i][2] = classes[class_min_i][2] + 1;`
			`ELSE -- Move from lower class UP into a higher class by gid`
			`classes[class_max_i][2] = classes[class_max_i][2] - 1;`
			`classes[class_min_i][1] = classes[class_min_i][1] - 1;`
			`END IF;`
			`END LOOP;`

			`i = 1;`
			`LOOP`
			`IF invert = TRUE THEN`
			`side = 1; --default returns bottom side of breaks, invert returns top side`
			`END IF;`
			`reply = array_append(reply, in_array[classes[i][side]]);`
			`i = i+1;`
			`IF i > breaks THEN EXIT; END IF;`
			`END LOOP;`

			`RETURN array_prepend(gvf, reply);`

			`END;`
			`$$ language plpgsql IMMUTABLE;`