diff --git a/scripts-available/CDB_QuantileBins.sql b/scripts-available/CDB_QuantileBins.sql index 20fa440..c4a7564 100644 --- a/scripts-available/CDB_QuantileBins.sql +++ b/scripts-available/CDB_QuantileBins.sql @@ -5,39 +5,15 @@ -- bins based on the Quantile method. -- -- @param breaks The number of bins you want to find. --- -- -CREATE OR REPLACE FUNCTION CDB_QuantileBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$ -DECLARE - element_count INT4; - break_size numeric; - tmp_val numeric; - i INT := 1; - reply numeric[]; -BEGIN - -- sort our values - SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e ASC) x; - -- get the total size of our data - element_count := array_length(in_array, 1); - break_size := element_count::numeric / breaks; - -- slice our bread - LOOP - IF i < breaks THEN - IF break_size * i % 1 > 0 THEN - SELECT e INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 1 OFFSET ceil(break_size * i) - 1) x; - ELSE - SELECT avg(e) INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 2 OFFSET ceil(break_size * i) - 1 ) x; - END IF; - ELSIF i = breaks THEN - -- select the last value - SELECT max(e) INTO tmp_val FROM ( SELECT unnest(in_array) e ) x; - ELSE - EXIT; - END IF; - - reply = array_append(reply, tmp_val); - i := i+1; - END LOOP; - RETURN reply; -END; -$$ language plpgsql IMMUTABLE STRICT PARALLEL SAFE; +-- +CREATE OR REPLACE FUNCTION CDB_QuantileBins(in_array numeric[], breaks int) +RETURNS numeric[] +AS $$ + SELECT array_agg(p) FROM ( + SELECT percentile_disc(idx::numeric / breaks::numeric) + WITHIN GROUP (ORDER BY x ASC) AS p + FROM generate_series(1, breaks) AS idx, unnest(in_array) AS x + GROUP BY idx + ) AS quantiles; +$$ language sql; diff --git a/test/CDB_QuantileBinsTest.sql b/test/CDB_QuantileBinsTest.sql index 4a02a0c..90b9f62 100644 --- a/test/CDB_QuantileBinsTest.sql +++ b/test/CDB_QuantileBinsTest.sql @@ -1,11 +1,17 @@ WITH data AS ( - SELECT array_agg(x::numeric) s FROM generate_series(1,100) x - WHERE x % 5 != 0 AND x % 7 != 0 + SELECT array_agg(x::numeric) AS s + FROM generate_series(0, 99) AS x ) -SELECT unnest(CDB_QuantileBins(s, 7)) FROM data; +SELECT unnest(CDB_QuantileBins(s, 10)) + FROM data; WITH data_nulls AS ( - SELECT array_agg(CASE WHEN x % 2 != 0 THEN x ELSE NULL END::numeric) s FROM generate_series(1,100) x - WHERE x % 5 != 0 AND x % 7 != 0 + SELECT array_agg(x::numeric) + FROM ( + SELECT x FROM generate_series(0, 99) AS x + UNION ALL + SELECT null AS s FROM generate_series(1, 10) AS x + ) _wrap ) -SELECT unnest(CDB_QuantileBins(s, 7)) FROM data_nulls; +SELECT unnest(CDB_QuantileBins(s, 10)) + FROM data_nulls; diff --git a/test/CDB_QuantileBinsTest_expect b/test/CDB_QuantileBinsTest_expect index 9261ae3..fbb4b0e 100644 --- a/test/CDB_QuantileBinsTest_expect +++ b/test/CDB_QuantileBinsTest_expect @@ -1,14 +1,20 @@ -13 +9 +19 29 -43 -57 -71 -86 +39 +49 +59 +69 +79 +89 99 +9 +19 29 -57 -87 - - - +39 +49 +59 +69 +79 +89 99