From cda6953ea669397a4c6f8bb3de347976c8a4a4c8 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Tue, 23 Jun 2015 16:27:27 -0400 Subject: [PATCH 1/8] initial commit --- scripts-available/CDB_DistinctMeasure.sql | 46 +++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 scripts-available/CDB_DistinctMeasure.sql diff --git a/scripts-available/CDB_DistinctMeasure.sql b/scripts-available/CDB_DistinctMeasure.sql new file mode 100644 index 0000000..803ab69 --- /dev/null +++ b/scripts-available/CDB_DistinctMeasure.sql @@ -0,0 +1,46 @@ +-- +-- CDB_DistinctMeasure +-- calculates the fraction of rows in the 10 most common distinct categories +-- returns true if the number of rows in these 10 categories is >= 0.9 * total number of rows +-- +-- + +CREATE OR REPLACE FUNCTION CDB_DistinctMeasure ( in_array text[], threshold numeric DEFAULT 0.90 ) RETURNS boolean as $$ +DECLARE + element_count INT4; + maxval numeric; + passes boolean; +BEGIN + SELECT count(e) INTO element_count FROM ( SELECT unnest(in_array) e ) x; + + -- count number of occurrences per bin + -- calculate the normalized cumulative sum + -- return the max value: which corresponds nth entry + -- for n <= 10 depending on # of distinct values + EXECUTE 'WITH a As ( + SELECT + count(*) cnt, + e + FROM + (SELECT * FROM unnest($2) e ) x + GROUP BY e + ORDER BY cnt DESC + ), + b As ( + SELECT + sum(cnt) OVER (ORDER BY cnt DESC) / $1 As cumsum + FROM a + LIMIT 10 + ) + SELECT max(cumsum) maxval FROM b' + INTO maxval + USING element_count, in_array; + + passes = CASE WHEN maxval >= threshold + THEN TRUE + ELSE FALSE + END; + RAISE NOTICE 'maxval: %', maxval; + RETURN passes; +END; +$$ language plpgsql IMMUTABLE; From f8542af57af675c98368b7158071deaea49ce3a3 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Tue, 23 Jun 2015 18:07:48 -0400 Subject: [PATCH 2/8] add tests --- test/CDB_DistinctMeasureTest.sql | 20 ++++++++++++++++++++ test/CDB_DistinctMeasureTest_expect | 1 + 2 files changed, 21 insertions(+) create mode 100644 test/CDB_DistinctMeasureTest.sql create mode 100644 test/CDB_DistinctMeasureTest_expect diff --git a/test/CDB_DistinctMeasureTest.sql b/test/CDB_DistinctMeasureTest.sql new file mode 100644 index 0000000..ac0f7a7 --- /dev/null +++ b/test/CDB_DistinctMeasureTest.sql @@ -0,0 +1,20 @@ +-- a - j add up to 89%, k-m add up to 11% +WITH a As ( + SELECT ( + repeat('a',12) || + repeat('b',11) || + repeat('c',11) || + repeat('d',10) || + repeat('e',10) || + repeat('f',9) || + repeat('g',8) || + repeat('h',7) || + repeat('i',6) || + repeat('j',5) || + repeat('k',4) || + repeat('l',4) || + repeat('m',3) + )::text AS x + ) + +SELECT CDB_DistinctMeasure(string_to_array(x,null),0.90) from a ; \ No newline at end of file diff --git a/test/CDB_DistinctMeasureTest_expect b/test/CDB_DistinctMeasureTest_expect new file mode 100644 index 0000000..4d1ae35 --- /dev/null +++ b/test/CDB_DistinctMeasureTest_expect @@ -0,0 +1 @@ +f \ No newline at end of file From c7bb57b405f84e5cf41393ccfffbce30593ff2ce Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Tue, 23 Jun 2015 18:08:32 -0400 Subject: [PATCH 3/8] add symlink --- scripts-enabled/CDB_DistinctMeasure.sql | 1 + 1 file changed, 1 insertion(+) create mode 120000 scripts-enabled/CDB_DistinctMeasure.sql diff --git a/scripts-enabled/CDB_DistinctMeasure.sql b/scripts-enabled/CDB_DistinctMeasure.sql new file mode 120000 index 0000000..711434b --- /dev/null +++ b/scripts-enabled/CDB_DistinctMeasure.sql @@ -0,0 +1 @@ +scripts-available/CDB_DistinctMeasure.sql \ No newline at end of file From 3ad3038c5e5edd796623e3b826e45c74d1125b45 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Tue, 23 Jun 2015 18:32:16 -0400 Subject: [PATCH 4/8] fixed symlink path, other minor items --- scripts-enabled/CDB_DistinctMeasure.sql | 2 +- test/CDB_DistinctMeasureTest.sql | 2 +- test/CDB_DistinctMeasureTest_expect | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts-enabled/CDB_DistinctMeasure.sql b/scripts-enabled/CDB_DistinctMeasure.sql index 711434b..e78b435 120000 --- a/scripts-enabled/CDB_DistinctMeasure.sql +++ b/scripts-enabled/CDB_DistinctMeasure.sql @@ -1 +1 @@ -scripts-available/CDB_DistinctMeasure.sql \ No newline at end of file +../scripts-available/CDB_DistinctMeasure.sql \ No newline at end of file diff --git a/test/CDB_DistinctMeasureTest.sql b/test/CDB_DistinctMeasureTest.sql index ac0f7a7..1eb3a74 100644 --- a/test/CDB_DistinctMeasureTest.sql +++ b/test/CDB_DistinctMeasureTest.sql @@ -17,4 +17,4 @@ WITH a As ( )::text AS x ) -SELECT CDB_DistinctMeasure(string_to_array(x,null),0.90) from a ; \ No newline at end of file +SELECT CDB_DistinctMeasure(string_to_array(x,null),0.90) from a diff --git a/test/CDB_DistinctMeasureTest_expect b/test/CDB_DistinctMeasureTest_expect index 4d1ae35..6a69f92 100644 --- a/test/CDB_DistinctMeasureTest_expect +++ b/test/CDB_DistinctMeasureTest_expect @@ -1 +1 @@ -f \ No newline at end of file +f From 5941b473cad69f18f3bfad55b062c80d3ea960e3 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Tue, 23 Jun 2015 18:39:15 -0400 Subject: [PATCH 5/8] removed notice --- scripts-available/CDB_DistinctMeasure.sql | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts-available/CDB_DistinctMeasure.sql b/scripts-available/CDB_DistinctMeasure.sql index 803ab69..b5f2cce 100644 --- a/scripts-available/CDB_DistinctMeasure.sql +++ b/scripts-available/CDB_DistinctMeasure.sql @@ -40,7 +40,6 @@ BEGIN THEN TRUE ELSE FALSE END; - RAISE NOTICE 'maxval: %', maxval; RETURN passes; END; $$ language plpgsql IMMUTABLE; From d124776c4e8dc9d07d93a56d7b8834edaabef03d Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Tue, 23 Jun 2015 18:49:59 -0400 Subject: [PATCH 6/8] simplified assignment --- scripts-available/CDB_DistinctMeasure.sql | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts-available/CDB_DistinctMeasure.sql b/scripts-available/CDB_DistinctMeasure.sql index b5f2cce..228a0c7 100644 --- a/scripts-available/CDB_DistinctMeasure.sql +++ b/scripts-available/CDB_DistinctMeasure.sql @@ -36,10 +36,8 @@ BEGIN INTO maxval USING element_count, in_array; - passes = CASE WHEN maxval >= threshold - THEN TRUE - ELSE FALSE - END; + passes = (maxval >= threshold); + RETURN passes; END; $$ language plpgsql IMMUTABLE; From 97140b17c978f8a19c937b0aba5c91f7c45e1145 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Wed, 24 Jun 2015 11:03:16 -0400 Subject: [PATCH 7/8] added more flexible output values --- scripts-available/CDB_DistinctMeasure.sql | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/scripts-available/CDB_DistinctMeasure.sql b/scripts-available/CDB_DistinctMeasure.sql index 228a0c7..a5bf1d7 100644 --- a/scripts-available/CDB_DistinctMeasure.sql +++ b/scripts-available/CDB_DistinctMeasure.sql @@ -5,11 +5,11 @@ -- -- -CREATE OR REPLACE FUNCTION CDB_DistinctMeasure ( in_array text[], threshold numeric DEFAULT 0.90 ) RETURNS boolean as $$ +CREATE OR REPLACE FUNCTION CDB_DistinctMeasure ( in_array text[], threshold numeric DEFAULT null ) RETURNS numeric as $$ DECLARE element_count INT4; maxval numeric; - passes boolean; + passes numeric; BEGIN SELECT count(e) INTO element_count FROM ( SELECT unnest(in_array) e ) x; @@ -35,8 +35,11 @@ BEGIN SELECT max(cumsum) maxval FROM b' INTO maxval USING element_count, in_array; - - passes = (maxval >= threshold); + IF threshold is null THEN + passes = maxval; + ELSE + passes = CASE WHEN (maxval >= threshold) THEN 1 ELSE 0 END; + END IF; RETURN passes; END; From 13946b4d475aaa5792ea7526e997f6730321930e Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Thu, 25 Jun 2015 08:17:41 -0400 Subject: [PATCH 8/8] update test output --- scripts-available/CDB_DistinctMeasure.sql | 6 +++--- test/CDB_DistinctMeasureTest_expect | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts-available/CDB_DistinctMeasure.sql b/scripts-available/CDB_DistinctMeasure.sql index a5bf1d7..63ec647 100644 --- a/scripts-available/CDB_DistinctMeasure.sql +++ b/scripts-available/CDB_DistinctMeasure.sql @@ -19,10 +19,10 @@ BEGIN -- for n <= 10 depending on # of distinct values EXECUTE 'WITH a As ( SELECT - count(*) cnt, - e + count(*) cnt FROM (SELECT * FROM unnest($2) e ) x + WHERE e is not null GROUP BY e ORDER BY cnt DESC ), @@ -31,7 +31,7 @@ BEGIN sum(cnt) OVER (ORDER BY cnt DESC) / $1 As cumsum FROM a LIMIT 10 - ) + ) SELECT max(cumsum) maxval FROM b' INTO maxval USING element_count, in_array; diff --git a/test/CDB_DistinctMeasureTest_expect b/test/CDB_DistinctMeasureTest_expect index 6a69f92..573541a 100644 --- a/test/CDB_DistinctMeasureTest_expect +++ b/test/CDB_DistinctMeasureTest_expect @@ -1 +1 @@ -f +0