cartodb-postgresql/scripts-available/CDB_DistinctMeasure.sql

47 lines
1.4 KiB
MySQL
Raw Normal View History

2015-06-24 04:27:27 +08:00
--
-- CDB_DistinctMeasure
-- calculates the fraction of rows in the 10 most common distinct categories
-- returns true if the number of rows in these 10 categories is >= 0.9 * total number of rows
--
--
2015-06-24 23:03:16 +08:00
CREATE OR REPLACE FUNCTION CDB_DistinctMeasure ( in_array text[], threshold numeric DEFAULT null ) RETURNS numeric as $$
2015-06-24 04:27:27 +08:00
DECLARE
element_count INT4;
maxval numeric;
2015-06-24 23:03:16 +08:00
passes numeric;
2015-06-24 04:27:27 +08:00
BEGIN
SELECT count(e) INTO element_count FROM ( SELECT unnest(in_array) e ) x;
-- count number of occurrences per bin
-- calculate the normalized cumulative sum
-- return the max value: which corresponds nth entry
-- for n <= 10 depending on # of distinct values
EXECUTE 'WITH a As (
SELECT
2015-06-25 20:17:41 +08:00
count(*) cnt
2015-06-24 04:27:27 +08:00
FROM
(SELECT * FROM unnest($2) e ) x
2015-06-25 20:17:41 +08:00
WHERE e is not null
2015-06-24 04:27:27 +08:00
GROUP BY e
ORDER BY cnt DESC
),
b As (
SELECT
sum(cnt) OVER (ORDER BY cnt DESC) / $1 As cumsum
FROM a
LIMIT 10
2015-06-25 20:17:41 +08:00
)
2015-06-24 04:27:27 +08:00
SELECT max(cumsum) maxval FROM b'
INTO maxval
USING element_count, in_array;
2015-06-24 23:03:16 +08:00
IF threshold is null THEN
passes = maxval;
ELSE
passes = CASE WHEN (maxval >= threshold) THEN 1 ELSE 0 END;
END IF;
2015-06-24 06:49:59 +08:00
2015-06-24 04:27:27 +08:00
RETURN passes;
END;
$$ language plpgsql IMMUTABLE;