Merge pull request #87 from CartoDB/categ-distrib

Function deciding criteria for using a category column in a map
2015-06-25 11:50:56 -04:00 · 2015-06-25 11:50:56 -04:00 · e5897f3dad
commit e5897f3dad
parent b9fe204007 13946b4d47
4 changed files with 68 additions and 0 deletions
--- a/scripts-available/CDB_DistinctMeasure.sql
+++ b/scripts-available/CDB_DistinctMeasure.sql
@ -0,0 +1,46 @@
+--
+-- CDB_DistinctMeasure 
+--     calculates the fraction of rows in the 10 most common distinct categories
+--     returns true if the number of rows in these 10 categories is >= 0.9 * total number of rows
+-- 
+-- 
+
+CREATE OR REPLACE FUNCTION CDB_DistinctMeasure ( in_array text[], threshold numeric DEFAULT null ) RETURNS numeric as $$
+DECLARE
+    element_count INT4;
+    maxval numeric;
+    passes numeric;
+BEGIN
+    SELECT count(e) INTO element_count FROM ( SELECT unnest(in_array) e ) x;
+
+    -- count number of occurrences per bin
+    -- calculate the normalized cumulative sum
+    -- return the max value: which corresponds nth entry 
+    -- for n <= 10 depending on # of distinct values
+    EXECUTE 'WITH a As (
+              SELECT
+                count(*) cnt
+              FROM
+                (SELECT * FROM unnest($2) e ) x
+              WHERE e is not null
+              GROUP BY e
+              ORDER BY cnt DESC
+            ),
+            b As (
+              SELECT
+                sum(cnt) OVER (ORDER BY cnt DESC) / $1 As cumsum
+              FROM a
+              LIMIT 10
+            )
+            SELECT max(cumsum) maxval FROM b'
+            INTO maxval
+            USING element_count, in_array;
+    IF threshold is null THEN
+        passes = maxval;
+    ELSE
+        passes = CASE WHEN (maxval >= threshold) THEN 1 ELSE 0 END;
+    END IF;
+
+    RETURN passes;
+END;
+$$ language plpgsql IMMUTABLE;
--- a/scripts-enabled/CDB_DistinctMeasure.sql
+++ b/scripts-enabled/CDB_DistinctMeasure.sql
@ -0,0 +1 @@
+../scripts-available/CDB_DistinctMeasure.sql
--- a/test/CDB_DistinctMeasureTest.sql
+++ b/test/CDB_DistinctMeasureTest.sql
@ -0,0 +1,20 @@
+-- a - j add up to 89%, k-m add up to 11%
+WITH a As (
+	SELECT (
+		repeat('a',12) || 
+		repeat('b',11) || 
+		repeat('c',11) || 
+		repeat('d',10) || 
+		repeat('e',10) || 
+		repeat('f',9) || 
+		repeat('g',8) || 
+		repeat('h',7) || 
+		repeat('i',6) || 
+		repeat('j',5) || 
+		repeat('k',4) || 
+		repeat('l',4) || 
+		repeat('m',3)
+		)::text AS x 
+	) 
+
+SELECT CDB_DistinctMeasure(string_to_array(x,null),0.90) from a
--- a/test/CDB_DistinctMeasureTest_expect
+++ b/test/CDB_DistinctMeasureTest_expect
@ -0,0 +1 @@
+0
				`@ -0,0 +1 @@`
				`../scripts-available/CDB_DistinctMeasure.sql`