2015-06-24 04:27:27 +08:00
|
|
|
--
|
|
|
|
-- CDB_DistinctMeasure
|
|
|
|
-- calculates the fraction of rows in the 10 most common distinct categories
|
|
|
|
-- returns true if the number of rows in these 10 categories is >= 0.9 * total number of rows
|
|
|
|
--
|
|
|
|
--
|
|
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION CDB_DistinctMeasure ( in_array text[], threshold numeric DEFAULT 0.90 ) RETURNS boolean as $$
|
|
|
|
DECLARE
|
|
|
|
element_count INT4;
|
|
|
|
maxval numeric;
|
|
|
|
passes boolean;
|
|
|
|
BEGIN
|
|
|
|
SELECT count(e) INTO element_count FROM ( SELECT unnest(in_array) e ) x;
|
|
|
|
|
|
|
|
-- count number of occurrences per bin
|
|
|
|
-- calculate the normalized cumulative sum
|
|
|
|
-- return the max value: which corresponds nth entry
|
|
|
|
-- for n <= 10 depending on # of distinct values
|
|
|
|
EXECUTE 'WITH a As (
|
|
|
|
SELECT
|
|
|
|
count(*) cnt,
|
|
|
|
e
|
|
|
|
FROM
|
|
|
|
(SELECT * FROM unnest($2) e ) x
|
|
|
|
GROUP BY e
|
|
|
|
ORDER BY cnt DESC
|
|
|
|
),
|
|
|
|
b As (
|
|
|
|
SELECT
|
|
|
|
sum(cnt) OVER (ORDER BY cnt DESC) / $1 As cumsum
|
|
|
|
FROM a
|
|
|
|
LIMIT 10
|
|
|
|
)
|
|
|
|
SELECT max(cumsum) maxval FROM b'
|
|
|
|
INTO maxval
|
|
|
|
USING element_count, in_array;
|
|
|
|
|
2015-06-24 06:49:59 +08:00
|
|
|
passes = (maxval >= threshold);
|
|
|
|
|
2015-06-24 04:27:27 +08:00
|
|
|
RETURN passes;
|
|
|
|
END;
|
|
|
|
$$ language plpgsql IMMUTABLE;
|