moves quantile bins to use postgres precentile functions

This commit is contained in:
Andy Eschbacher 2017-12-05 16:16:39 -05:00
parent 5243192296
commit 691b9a8312
3 changed files with 39 additions and 51 deletions

View File

@ -5,39 +5,15 @@
-- bins based on the Quantile method.
--
-- @param breaks The number of bins you want to find.
--
--
CREATE OR REPLACE FUNCTION CDB_QuantileBins ( in_array NUMERIC[], breaks INT) RETURNS NUMERIC[] as $$
DECLARE
element_count INT4;
break_size numeric;
tmp_val numeric;
i INT := 1;
reply numeric[];
BEGIN
-- sort our values
SELECT array_agg(e) INTO in_array FROM (SELECT unnest(in_array) e ORDER BY e ASC) x;
-- get the total size of our data
element_count := array_length(in_array, 1);
break_size := element_count::numeric / breaks;
-- slice our bread
LOOP
IF i < breaks THEN
IF break_size * i % 1 > 0 THEN
SELECT e INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 1 OFFSET ceil(break_size * i) - 1) x;
ELSE
SELECT avg(e) INTO tmp_val FROM ( SELECT unnest(in_array) e LIMIT 2 OFFSET ceil(break_size * i) - 1 ) x;
END IF;
ELSIF i = breaks THEN
-- select the last value
SELECT max(e) INTO tmp_val FROM ( SELECT unnest(in_array) e ) x;
ELSE
EXIT;
END IF;
reply = array_append(reply, tmp_val);
i := i+1;
END LOOP;
RETURN reply;
END;
$$ language plpgsql IMMUTABLE STRICT PARALLEL SAFE;
--
CREATE OR REPLACE FUNCTION CDB_QuantileBins(in_array numeric[], breaks int)
RETURNS numeric[]
AS $$
SELECT array_agg(p) FROM (
SELECT percentile_disc(idx::numeric / breaks::numeric)
WITHIN GROUP (ORDER BY x ASC) AS p
FROM generate_series(1, breaks) AS idx, unnest(in_array) AS x
GROUP BY idx
) AS quantiles;
$$ language sql;

View File

@ -1,11 +1,17 @@
WITH data AS (
SELECT array_agg(x::numeric) s FROM generate_series(1,100) x
WHERE x % 5 != 0 AND x % 7 != 0
SELECT array_agg(x::numeric) AS s
FROM generate_series(0, 99) AS x
)
SELECT unnest(CDB_QuantileBins(s, 7)) FROM data;
SELECT unnest(CDB_QuantileBins(s, 10))
FROM data;
WITH data_nulls AS (
SELECT array_agg(CASE WHEN x % 2 != 0 THEN x ELSE NULL END::numeric) s FROM generate_series(1,100) x
WHERE x % 5 != 0 AND x % 7 != 0
SELECT array_agg(x::numeric)
FROM (
SELECT x FROM generate_series(0, 99) AS x
UNION ALL
SELECT null AS s FROM generate_series(1, 10) AS x
) _wrap
)
SELECT unnest(CDB_QuantileBins(s, 7)) FROM data_nulls;
SELECT unnest(CDB_QuantileBins(s, 10))
FROM data_nulls;

View File

@ -1,14 +1,20 @@
13
9
19
29
43
57
71
86
39
49
59
69
79
89
99
9
19
29
57
87
39
49
59
69
79
89
99