Merge pull request #87 from CartoDB/categ-distrib
Function deciding criteria for using a category column in a map
This commit is contained in:
commit
e5897f3dad
46
scripts-available/CDB_DistinctMeasure.sql
Normal file
46
scripts-available/CDB_DistinctMeasure.sql
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
--
|
||||||
|
-- CDB_DistinctMeasure
|
||||||
|
-- calculates the fraction of rows in the 10 most common distinct categories
|
||||||
|
-- returns true if the number of rows in these 10 categories is >= 0.9 * total number of rows
|
||||||
|
--
|
||||||
|
--
|
||||||
|
|
||||||
|
CREATE OR REPLACE FUNCTION CDB_DistinctMeasure ( in_array text[], threshold numeric DEFAULT null ) RETURNS numeric as $$
|
||||||
|
DECLARE
|
||||||
|
element_count INT4;
|
||||||
|
maxval numeric;
|
||||||
|
passes numeric;
|
||||||
|
BEGIN
|
||||||
|
SELECT count(e) INTO element_count FROM ( SELECT unnest(in_array) e ) x;
|
||||||
|
|
||||||
|
-- count number of occurrences per bin
|
||||||
|
-- calculate the normalized cumulative sum
|
||||||
|
-- return the max value: which corresponds nth entry
|
||||||
|
-- for n <= 10 depending on # of distinct values
|
||||||
|
EXECUTE 'WITH a As (
|
||||||
|
SELECT
|
||||||
|
count(*) cnt
|
||||||
|
FROM
|
||||||
|
(SELECT * FROM unnest($2) e ) x
|
||||||
|
WHERE e is not null
|
||||||
|
GROUP BY e
|
||||||
|
ORDER BY cnt DESC
|
||||||
|
),
|
||||||
|
b As (
|
||||||
|
SELECT
|
||||||
|
sum(cnt) OVER (ORDER BY cnt DESC) / $1 As cumsum
|
||||||
|
FROM a
|
||||||
|
LIMIT 10
|
||||||
|
)
|
||||||
|
SELECT max(cumsum) maxval FROM b'
|
||||||
|
INTO maxval
|
||||||
|
USING element_count, in_array;
|
||||||
|
IF threshold is null THEN
|
||||||
|
passes = maxval;
|
||||||
|
ELSE
|
||||||
|
passes = CASE WHEN (maxval >= threshold) THEN 1 ELSE 0 END;
|
||||||
|
END IF;
|
||||||
|
|
||||||
|
RETURN passes;
|
||||||
|
END;
|
||||||
|
$$ language plpgsql IMMUTABLE;
|
1
scripts-enabled/CDB_DistinctMeasure.sql
Symbolic link
1
scripts-enabled/CDB_DistinctMeasure.sql
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../scripts-available/CDB_DistinctMeasure.sql
|
20
test/CDB_DistinctMeasureTest.sql
Normal file
20
test/CDB_DistinctMeasureTest.sql
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
-- a - j add up to 89%, k-m add up to 11%
|
||||||
|
WITH a As (
|
||||||
|
SELECT (
|
||||||
|
repeat('a',12) ||
|
||||||
|
repeat('b',11) ||
|
||||||
|
repeat('c',11) ||
|
||||||
|
repeat('d',10) ||
|
||||||
|
repeat('e',10) ||
|
||||||
|
repeat('f',9) ||
|
||||||
|
repeat('g',8) ||
|
||||||
|
repeat('h',7) ||
|
||||||
|
repeat('i',6) ||
|
||||||
|
repeat('j',5) ||
|
||||||
|
repeat('k',4) ||
|
||||||
|
repeat('l',4) ||
|
||||||
|
repeat('m',3)
|
||||||
|
)::text AS x
|
||||||
|
)
|
||||||
|
|
||||||
|
SELECT CDB_DistinctMeasure(string_to_array(x,null),0.90) from a
|
1
test/CDB_DistinctMeasureTest_expect
Normal file
1
test/CDB_DistinctMeasureTest_expect
Normal file
@ -0,0 +1 @@
|
|||||||
|
0
|
Loading…
Reference in New Issue
Block a user