2014-05-05 23:13:06 +08:00
--
-- Determine the Jenks classifications from a numeric array
--
-- @param in_array A numeric array of numbers to determine the best
-- bins based on the Jenks method.
--
-- @param breaks The number of bins you want to find.
--
-- @param iterations The number of different starting positions to test.
--
-- @param invert Optional wheter to return the top of each bin (default)
-- or the bottom. BOOLEAN, default=FALSE.
2018-09-10 19:16:54 +08:00
--
2014-05-05 23:13:06 +08:00
--
2019-05-31 21:29:28 +08:00
CREATE OR REPLACE FUNCTION @ extschema @ . CDB_JenksBins ( in_array NUMERIC [ ] , breaks INT , iterations INT DEFAULT 0 , invert BOOLEAN DEFAULT FALSE )
2018-09-10 19:16:54 +08:00
RETURNS NUMERIC [ ] as
$ $
DECLARE
in_matrix NUMERIC [ ] [ ] ;
in_unique_count BIGINT ;
2014-05-05 23:13:06 +08:00
2018-09-10 19:16:54 +08:00
shuffles INT ;
2014-05-05 23:13:06 +08:00
arr_mean NUMERIC ;
2018-09-10 19:16:54 +08:00
sdam NUMERIC ;
i INT ;
2014-05-05 23:13:06 +08:00
bot INT ;
top INT ;
2018-09-10 19:16:54 +08:00
2014-05-05 23:13:06 +08:00
tops INT [ ] ;
classes INT [ ] [ ] ;
2018-09-10 19:16:54 +08:00
j INT : = 1 ;
2014-05-05 23:13:06 +08:00
curr_result NUMERIC [ ] ;
best_result NUMERIC [ ] ;
seedtarget TEXT ;
2018-09-10 19:16:54 +08:00
2014-05-05 23:13:06 +08:00
BEGIN
2018-09-10 19:16:54 +08:00
-- We clean the input array (remove NULLs) and create 2 arrays
-- [1] contains the unique values in in_array
-- [2] contains the number of appearances of those unique values
SELECT ARRAY [ array_agg ( value ) , array_agg ( count ) ] FROM
(
SELECT value , count ( 1 ) : : numeric as count
FROM unnest ( in_array ) AS value
WHERE value is NOT NULL
GROUP BY value
ORDER BY value
) __clean_array_q INTO in_matrix ;
-- Get the number of unique values
in_unique_count : = array_length ( in_matrix [ 1 : 1 ] , 2 ) ;
IF in_unique_count IS NULL THEN
RETURN NULL ;
2014-05-05 23:13:06 +08:00
END IF ;
2018-09-10 19:16:54 +08:00
IF in_unique_count < = breaks THEN
-- There isn't enough distinct values for the requested breaks
RETURN ARRAY ( Select unnest ( in_matrix [ 1 : 1 ] ) ) _a ;
END IF ;
-- If not declated explicitly we iterate based on the length of the array
IF iterations < 1 THEN
-- This is based on a 'looks fine' heuristic
iterations : = log ( in_unique_count ) : : integer + 1 ;
END IF ;
-- We set the number of shuffles per iteration as the number of unique values but
-- this is just another 'looks fine' heuristic
shuffles : = in_unique_count ;
-- Get the mean value of the whole vector (already ignores NULLs)
SELECT avg ( v ) INTO arr_mean FROM ( SELECT unnest ( in_array ) as v ) x ;
-- Calculate the sum of squared deviations from the array mean (SDAM).
SELECT sum ( ( ( arr_mean - v ) ^ 2 ) * w ) INTO sdam FROM (
SELECT unnest ( in_matrix [ 1 : 1 ] ) as v , unnest ( in_matrix [ 2 : 2 ] ) as w
) x ;
-- To start, we create ranges with approximately the same amount of different values
top : = 0 ;
i : = 1 ;
LOOP
bot : = top + 1 ;
top : = ROUND ( i * in_unique_count : : numeric / breaks : : NUMERIC ) ;
IF i = 1 THEN
classes = ARRAY [ ARRAY [ bot , top ] ] ;
2014-05-05 23:13:06 +08:00
ELSE
2018-09-10 19:16:54 +08:00
classes = ARRAY_CAT ( classes , ARRAY [ bot , top ] ) ;
2014-05-05 23:13:06 +08:00
END IF ;
2018-09-10 19:16:54 +08:00
i : = i + 1 ;
2014-05-05 23:13:06 +08:00
IF i > breaks THEN EXIT ; END IF ;
END LOOP ;
2019-05-31 21:29:28 +08:00
best_result = @ extschema @ . CDB_JenksBinsIteration ( in_matrix , breaks , classes , invert , sdam , shuffles ) ;
2014-05-05 23:13:06 +08:00
- - set the seed so we can ensure the same results
SELECT setseed ( 0 . 4567 ) INTO seedtarget ;
- - loop through random starting positions
LOOP
2018-09-10 19:16:54 +08:00
IF j > iterations - 1 THEN EXIT ; END IF ;
2014-05-05 23:13:06 +08:00
i = 1 ;
2018-09-10 19:16:54 +08:00
tops = ARRAY [ in_unique_count ] ;
2014-05-05 23:13:06 +08:00
LOOP
2018-09-10 19:16:54 +08:00
IF i = breaks THEN EXIT ; END IF ;
SELECT array_agg ( distinct e ) INTO tops FROM (
SELECT unnest ( array_cat ( tops , ARRAY [ trunc ( random ( ) * in_unique_count : : float8 ) : : int + 1 ] ) ) as e ORDER BY e
) x ;
i = array_length ( tops , 1 ) ;
END LOOP ;
top : = 0 ;
2014-05-05 23:13:06 +08:00
i = 1 ;
2018-09-10 19:16:54 +08:00
LOOP
bot : = top + 1 ;
top = tops [ i ] ;
2014-05-05 23:13:06 +08:00
IF i = 1 THEN
2018-09-10 19:16:54 +08:00
classes = ARRAY [ ARRAY [ bot , top ] ] ;
2014-05-05 23:13:06 +08:00
ELSE
2018-09-10 19:16:54 +08:00
classes = ARRAY_CAT ( classes , ARRAY [ bot , top ] ) ;
2014-05-05 23:13:06 +08:00
END IF ;
2018-09-10 19:16:54 +08:00
i : = i + 1 ;
IF i > breaks THEN EXIT ; END IF ;
END LOOP ;
2019-05-31 21:29:28 +08:00
curr_result = @ extschema @ . CDB_JenksBinsIteration ( in_matrix , breaks , classes , invert , sdam , shuffles ) ;
2014-05-05 23:13:06 +08:00
IF curr_result [ 1 ] > best_result [ 1 ] THEN
best_result = curr_result ;
END IF ;
2018-09-10 19:16:54 +08:00
2014-05-05 23:13:06 +08:00
j = j + 1 ;
END LOOP ;
RETURN ( best_result ) [ 2 : array_upper ( best_result , 1 ) ] ;
END ;
2018-09-10 19:16:54 +08:00
$ $ LANGUAGE PLPGSQL IMMUTABLE PARALLEL RESTRICTED ;
2014-05-05 23:13:06 +08:00
--
-- Perform a single iteration of the Jenks classification
--
2018-09-10 19:16:54 +08:00
-- Returns an array with:
-- - First element: gvf
-- - Second to 2+n: Category limits
2019-05-31 21:29:28 +08:00
DROP FUNCTION IF EXISTS @ extschema @ . CDB_JenksBinsIteration ( in_matrix NUMERIC [ ] , breaks INT , classes INT [ ] , invert BOOLEAN , element_count INT4 , arr_mean NUMERIC , max_search INT ) ; -- Old signature
2018-09-10 19:16:54 +08:00
2019-05-31 21:29:28 +08:00
CREATE OR REPLACE FUNCTION @ extschema @ . CDB_JenksBinsIteration ( in_matrix NUMERIC [ ] , breaks INT , classes INT [ ] , invert BOOLEAN , sdam NUMERIC , max_search INT DEFAULT 50 ) RETURNS NUMERIC [ ] as $ $
2018-09-10 19:16:54 +08:00
DECLARE
i INT ;
iterations INT = 0 ;
2014-05-05 23:13:06 +08:00
side INT : = 2 ;
2018-09-10 19:16:54 +08:00
gvf numeric : = 0 . 0 ;
new_gvf numeric ;
arr_gvf numeric [ ] ;
arr_avg numeric [ ] ;
class_avg numeric ;
class_dev numeric ;
2018-09-12 01:22:48 +08:00
class_max_i INT = 0 ;
class_min_i INT = 0 ;
2018-09-10 19:16:54 +08:00
dev_max numeric ;
dev_min numeric ;
2018-09-12 01:22:48 +08:00
2018-09-10 19:16:54 +08:00
best_classes INT [ ] = classes ;
2018-09-12 01:22:48 +08:00
best_gvf numeric [ ] ;
best_avg numeric [ ] ;
move_elements INT = 1 ;
2018-09-10 19:16:54 +08:00
reply numeric [ ] ;
BEGIN
-- We fill the arrays with the initial values
i = 0 ;
LOOP
IF i = breaks THEN EXIT ; END IF ;
i = i + 1 ;
-- Get class mean
SELECT ( sum ( v * w ) / sum ( w ) ) INTO class_avg FROM (
SELECT unnest ( in_matrix [ 1 : 1 ] [ classes [ i ] [ 1 ] : classes [ i ] [ 2 ] ] ) as v ,
unnest ( in_matrix [ 2 : 2 ] [ classes [ i ] [ 1 ] : classes [ i ] [ 2 ] ] ) as w
) x ;
-- Get class deviation
SELECT sum ( ( class_avg - v ) ^ 2 * w ) INTO class_dev FROM (
SELECT unnest ( in_matrix [ 1 : 1 ] [ classes [ i ] [ 1 ] : classes [ i ] [ 2 ] ] ) as v ,
unnest ( in_matrix [ 2 : 2 ] [ classes [ i ] [ 1 ] : classes [ i ] [ 2 ] ] ) as w
) x ;
IF i = 1 THEN
arr_avg = ARRAY [ class_avg ] ;
arr_gvf = ARRAY [ class_dev ] ;
ELSE
arr_avg = array_append ( arr_avg , class_avg ) ;
arr_gvf = array_append ( arr_gvf , class_dev ) ;
END IF ;
END LOOP ;
2018-09-12 01:22:48 +08:00
-- We copy the values to avoid recalculation when a failure happens
best_avg = arr_avg ;
best_gvf = arr_gvf ;
2018-09-10 19:16:54 +08:00
iterations = 0 ;
LOOP
IF iterations = max_search THEN EXIT ; END IF ;
iterations = iterations + 1 ;
2014-05-05 23:13:06 +08:00
-- calculate our new GVF
2018-09-10 19:16:54 +08:00
SELECT sdam - sum ( e ) INTO new_gvf FROM ( SELECT unnest ( arr_gvf ) as e ) x ;
2018-09-12 01:22:48 +08:00
-- Check if any improvement was made
IF new_gvf < = gvf THEN
-- If we were moving too many elements, go back and move less
IF move_elements < = 2 OR class_max_i = class_min_i THEN
EXIT ;
END IF ;
move_elements = GREATEST ( move_elements / 8 , 1 ) ;
2018-09-10 19:16:54 +08:00
2018-09-12 01:22:48 +08:00
-- Rollback from saved statuses
classes = best_classes ;
new_gvf = gvf ;
i = class_min_i ;
LOOP
arr_avg [ i ] = best_avg [ i ] ;
arr_gvf [ i ] = best_gvf [ i ] ;
IF i = class_max_i THEN EXIT ; END IF ;
i = i + 1 ;
END LOOP ;
END IF ;
-- We search for the classes with the min and max deviation
2018-09-10 19:16:54 +08:00
i = 1 ;
class_min_i = 1 ;
class_max_i = 1 ;
dev_max = arr_gvf [ 1 ] ;
dev_min = arr_gvf [ 1 ] ;
LOOP
IF i = breaks THEN EXIT ; END IF ;
i = i + 1 ;
IF arr_gvf [ i ] < dev_min THEN
dev_min = arr_gvf [ i ] ;
class_min_i = i ;
ELSE
IF arr_gvf [ i ] > dev_max THEN
dev_max = arr_gvf [ i ] ;
class_max_i = i ;
END IF ;
END IF ;
END LOOP ;
-- Save best values for comparison and output
gvf = new_gvf ;
best_classes = classes ;
2018-09-12 01:22:48 +08:00
-- Limit the moved elements as to not remove everything from class_max_i
move_elements = LEAST ( move_elements , classes [ class_max_i ] [ 2 ] - classes [ class_max_i ] [ 1 ] ) ;
-- Move `move_elements` from class_max_i to class_min_i
2018-09-10 19:16:54 +08:00
IF class_min_i < class_max_i THEN
i : = class_min_i ;
LOOP
IF i = class_max_i THEN EXIT ; END IF ;
2018-09-12 01:22:48 +08:00
classes [ i ] [ 2 ] = classes [ i ] [ 2 ] + move_elements ;
2018-09-10 19:16:54 +08:00
i : = i + 1 ;
END LOOP ;
i : = class_max_i ;
LOOP
IF i = class_min_i THEN EXIT ; END IF ;
2018-09-12 01:22:48 +08:00
classes [ i ] [ 1 ] = classes [ i ] [ 1 ] + move_elements ;
2018-09-10 19:16:54 +08:00
i : = i - 1 ;
END LOOP ;
2014-05-05 23:13:06 +08:00
ELSE
2018-09-10 19:16:54 +08:00
i : = class_min_i ;
LOOP
IF i = class_max_i THEN EXIT ; END IF ;
2018-09-12 01:22:48 +08:00
classes [ i ] [ 1 ] = classes [ i ] [ 1 ] - move_elements ;
2018-09-10 19:16:54 +08:00
i : = i - 1 ;
END LOOP ;
i : = class_max_i ;
LOOP
IF i = class_min_i THEN EXIT ; END IF ;
2018-09-12 01:22:48 +08:00
classes [ i ] [ 2 ] = classes [ i ] [ 2 ] - move_elements ;
2018-09-10 19:16:54 +08:00
i : = i + 1 ;
END LOOP ;
2014-05-05 23:13:06 +08:00
END IF ;
2018-09-10 19:16:54 +08:00
2018-09-12 01:22:48 +08:00
-- Recalculate avg and deviation ONLY for the affected classes
2018-09-10 19:16:54 +08:00
i = LEAST ( class_min_i , class_max_i ) ;
class_max_i = GREATEST ( class_min_i , class_max_i ) ;
2018-09-12 01:22:48 +08:00
class_min_i = i ;
2018-09-10 19:16:54 +08:00
LOOP
SELECT ( sum ( v * w ) / sum ( w ) ) INTO class_avg FROM (
SELECT unnest ( in_matrix [ 1 : 1 ] [ classes [ i ] [ 1 ] : classes [ i ] [ 2 ] ] ) as v ,
unnest ( in_matrix [ 2 : 2 ] [ classes [ i ] [ 1 ] : classes [ i ] [ 2 ] ] ) as w
) x ;
SELECT sum ( ( class_avg - v ) ^ 2 * w ) INTO class_dev FROM (
SELECT unnest ( in_matrix [ 1 : 1 ] [ classes [ i ] [ 1 ] : classes [ i ] [ 2 ] ] ) as v ,
unnest ( in_matrix [ 2 : 2 ] [ classes [ i ] [ 1 ] : classes [ i ] [ 2 ] ] ) as w
) x ;
2018-09-12 01:22:48 +08:00
-- Save status (in case it's needed for rollback) and store the new one
best_avg [ i ] = arr_avg [ i ] ;
2018-09-10 19:16:54 +08:00
arr_avg [ i ] = class_avg ;
2018-09-12 01:22:48 +08:00
best_gvf [ i ] = arr_gvf [ i ] ;
2018-09-10 19:16:54 +08:00
arr_gvf [ i ] = class_dev ;
IF i = class_max_i THEN EXIT ; END IF ;
i = i + 1 ;
END LOOP ;
2018-09-12 01:22:48 +08:00
move_elements = move_elements * 2 ;
2018-09-10 19:16:54 +08:00
END LOOP ;
2014-05-05 23:13:06 +08:00
i = 1 ;
2018-09-10 19:16:54 +08:00
LOOP
2014-05-05 23:13:06 +08:00
IF invert = TRUE THEN
side = 1 ; - - default returns bottom side of breaks , invert returns top side
END IF ;
2018-09-10 19:16:54 +08:00
reply = array_append ( reply , unnest ( in_matrix [ 1 : 1 ] [ best_classes [ i ] [ side ] : best_classes [ i ] [ side ] ] ) ) ;
i = i + 1 ;
IF i > breaks THEN EXIT ; END IF ;
END LOOP ;
reply = array_prepend ( gvf , reply ) ;
RETURN reply ;
END ;
$ $ LANGUAGE PLPGSQL IMMUTABLE PARALLEL SAFE ;