-- -- CDB_DistinctMeasure -- calculates the fraction of rows in the 10 most common distinct categories -- returns true if the number of rows in these 10 categories is >= 0.9 * total number of rows -- -- CREATE OR REPLACE FUNCTION CDB_DistinctMeasure ( in_array text[], threshold numeric DEFAULT 0.90 ) RETURNS boolean as $$ DECLARE element_count INT4; maxval numeric; passes boolean; BEGIN SELECT count(e) INTO element_count FROM ( SELECT unnest(in_array) e ) x; -- count number of occurrences per bin -- calculate the normalized cumulative sum -- return the max value: which corresponds nth entry -- for n <= 10 depending on # of distinct values EXECUTE 'WITH a As ( SELECT count(*) cnt, e FROM (SELECT * FROM unnest($2) e ) x GROUP BY e ORDER BY cnt DESC ), b As ( SELECT sum(cnt) OVER (ORDER BY cnt DESC) / $1 As cumsum FROM a LIMIT 10 ) SELECT max(cumsum) maxval FROM b' INTO maxval USING element_count, in_array; passes = CASE WHEN maxval >= threshold THEN TRUE ELSE FALSE END; RAISE NOTICE 'maxval: %', maxval; RETURN passes; END; $$ language plpgsql IMMUTABLE;