Merge pull request #87 from CartoDB/categ-distrib

Function deciding criteria for using a category column in a map
This commit is contained in:
Andy Eschbacher 2015-06-25 11:50:56 -04:00
commit e5897f3dad
4 changed files with 68 additions and 0 deletions

View File

@ -0,0 +1,46 @@
--
-- CDB_DistinctMeasure
-- calculates the fraction of rows in the 10 most common distinct categories
-- returns true if the number of rows in these 10 categories is >= 0.9 * total number of rows
--
--
CREATE OR REPLACE FUNCTION CDB_DistinctMeasure ( in_array text[], threshold numeric DEFAULT null ) RETURNS numeric as $$
DECLARE
element_count INT4;
maxval numeric;
passes numeric;
BEGIN
SELECT count(e) INTO element_count FROM ( SELECT unnest(in_array) e ) x;
-- count number of occurrences per bin
-- calculate the normalized cumulative sum
-- return the max value: which corresponds nth entry
-- for n <= 10 depending on # of distinct values
EXECUTE 'WITH a As (
SELECT
count(*) cnt
FROM
(SELECT * FROM unnest($2) e ) x
WHERE e is not null
GROUP BY e
ORDER BY cnt DESC
),
b As (
SELECT
sum(cnt) OVER (ORDER BY cnt DESC) / $1 As cumsum
FROM a
LIMIT 10
)
SELECT max(cumsum) maxval FROM b'
INTO maxval
USING element_count, in_array;
IF threshold is null THEN
passes = maxval;
ELSE
passes = CASE WHEN (maxval >= threshold) THEN 1 ELSE 0 END;
END IF;
RETURN passes;
END;
$$ language plpgsql IMMUTABLE;

View File

@ -0,0 +1 @@
../scripts-available/CDB_DistinctMeasure.sql

View File

@ -0,0 +1,20 @@
-- a - j add up to 89%, k-m add up to 11%
WITH a As (
SELECT (
repeat('a',12) ||
repeat('b',11) ||
repeat('c',11) ||
repeat('d',10) ||
repeat('e',10) ||
repeat('f',9) ||
repeat('g',8) ||
repeat('h',7) ||
repeat('i',6) ||
repeat('j',5) ||
repeat('k',4) ||
repeat('l',4) ||
repeat('m',3)
)::text AS x
)
SELECT CDB_DistinctMeasure(string_to_array(x,null),0.90) from a

View File

@ -0,0 +1 @@
0