initial commit
This commit is contained in:
parent
189309e1a5
commit
cda6953ea6
46
scripts-available/CDB_DistinctMeasure.sql
Normal file
46
scripts-available/CDB_DistinctMeasure.sql
Normal file
@ -0,0 +1,46 @@
|
||||
--
|
||||
-- CDB_DistinctMeasure
|
||||
-- calculates the fraction of rows in the 10 most common distinct categories
|
||||
-- returns true if the number of rows in these 10 categories is >= 0.9 * total number of rows
|
||||
--
|
||||
--
|
||||
|
||||
CREATE OR REPLACE FUNCTION CDB_DistinctMeasure ( in_array text[], threshold numeric DEFAULT 0.90 ) RETURNS boolean as $$
|
||||
DECLARE
|
||||
element_count INT4;
|
||||
maxval numeric;
|
||||
passes boolean;
|
||||
BEGIN
|
||||
SELECT count(e) INTO element_count FROM ( SELECT unnest(in_array) e ) x;
|
||||
|
||||
-- count number of occurrences per bin
|
||||
-- calculate the normalized cumulative sum
|
||||
-- return the max value: which corresponds nth entry
|
||||
-- for n <= 10 depending on # of distinct values
|
||||
EXECUTE 'WITH a As (
|
||||
SELECT
|
||||
count(*) cnt,
|
||||
e
|
||||
FROM
|
||||
(SELECT * FROM unnest($2) e ) x
|
||||
GROUP BY e
|
||||
ORDER BY cnt DESC
|
||||
),
|
||||
b As (
|
||||
SELECT
|
||||
sum(cnt) OVER (ORDER BY cnt DESC) / $1 As cumsum
|
||||
FROM a
|
||||
LIMIT 10
|
||||
)
|
||||
SELECT max(cumsum) maxval FROM b'
|
||||
INTO maxval
|
||||
USING element_count, in_array;
|
||||
|
||||
passes = CASE WHEN maxval >= threshold
|
||||
THEN TRUE
|
||||
ELSE FALSE
|
||||
END;
|
||||
RAISE NOTICE 'maxval: %', maxval;
|
||||
RETURN passes;
|
||||
END;
|
||||
$$ language plpgsql IMMUTABLE;
|
Loading…
Reference in New Issue
Block a user