From 45f063d469a01bac5896ca342a8165c494cd95d8 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Fri, 15 Apr 2016 12:37:16 +0200 Subject: [PATCH 1/4] Aggregate small number of text items in overviews Instead of nulling text fields for non-singleton aggregated records concatenate distinct text values when they're few (5 or less). Fixes #231 --- scripts-available/CDB_Overviews.sql | 37 +++++++++++++++++++++-------- test/CDB_OverviewsTest_expect | 4 ++-- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/scripts-available/CDB_Overviews.sql b/scripts-available/CDB_Overviews.sql index 7671a89..9f95bdc 100644 --- a/scripts-available/CDB_Overviews.sql +++ b/scripts-available/CDB_Overviews.sql @@ -515,6 +515,23 @@ BEGIN END $$ LANGUAGE PLPGSQL STABLE; +-- Check if a column of a table is of an unlimited-length text type +CREATE OR REPLACE FUNCTION _cdb_unlimited_text_column(reloid REGCLASS, col_name TEXT) +RETURNS BOOLEAN +AS $$ + SELECT EXISTS ( + SELECT * + FROM information_schema.columns c, pg_class _tn, pg_namespace _sn + WHERE table_name = _tn.relname + AND table_schema = _sn.nspname + AND c.column_name = col_name + AND _tn.oid = reloid + AND _sn.oid = _tn.relnamespace + AND character_maximum_length IS NULL + AND c.data_type IN ('text', 'character varying', 'character') + ); +$$ LANGUAGE SQL STABLE; + -- SQL Aggregation expression for a datase attribute -- Scope: private. -- Parameters @@ -532,6 +549,7 @@ DECLARE has_counter_column BOOLEAN; feature_count TEXT; total_feature_count TEXT; + unlimited_text BOOLEAN; BEGIN IF table_alias <> '' THEN qualified_column := Format('%I.%I', table_alias, column_name); @@ -559,16 +577,15 @@ BEGIN ELSE RETURN Format('SUM(%s*%s)/%s::' || column_type, qualified_column, feature_count, total_feature_count); END IF; - WHEN 'text' THEN - -- TODO: we could define a new aggregate function that returns distinct - -- separated values with a limit, adding ellipsis if more values existed - -- e.g. with '/' as separator and a limit of three: - -- 'A', 'B', 'A', 'C', 'D' => 'A/B/C/...' - -- Other ideas: if value is unique then use it, otherwise use something - -- like '*' or '(varies)' or '(multiple values)', or NULL - -- Using 'string_agg(' || qualified_column || ',''/'')' - -- here causes - RETURN 'CASE count(*) WHEN 1 THEN MIN(' || qualified_column || ') ELSE NULL END::' || column_type; + WHEN 'text', 'character varying', 'character' THEN + SELECT _cdb_unlimited_text_column(reloid, column_name) INTO unlimited_text; + IF unlimited_text THEN + -- TODO: this should not be applied to columns containing largish text; + -- it is intended only to short names/identifiers + RETURN 'CASE WHEN count(*) = 1 THEN MIN(' || qualified_column || ') WHEN ' || total_feature_count || ' < 5 THEN string_agg(distinct ' || qualified_column || ','' / '') ELSE ''*'' END::' || column_type; + ELSE + RETURN 'CASE count(*) WHEN 1 THEN MIN(' || qualified_column || ') ELSE NULL END::' || column_type; + END IF; WHEN 'boolean' THEN RETURN 'CASE count(*) WHEN 1 THEN BOOL_AND(' || qualified_column || ') ELSE NULL END::' || column_type; ELSE diff --git a/test/CDB_OverviewsTest_expect b/test/CDB_OverviewsTest_expect index 188b1ff..66bf25c 100644 --- a/test/CDB_OverviewsTest_expect +++ b/test/CDB_OverviewsTest_expect @@ -12,8 +12,8 @@ SELECT 1114 {_vovw_2_base_bare_t,_vovw_1_base_bare_t,_vovw_0_base_bare_t} 126 number,int_number,name,start -SUM(number*1)/count(*)::double precision AS number,SUM(int_number*1)/count(*)::integer AS int_number,CASE count(*) WHEN 1 THEN MIN(name) ELSE NULL END::text AS name,CASE count(*) WHEN 1 THEN MIN(start) ELSE NULL END::date AS start -SUM(tab.number*1)/count(*)::double precision AS number,SUM(tab.int_number*1)/count(*)::integer AS int_number,CASE count(*) WHEN 1 THEN MIN(tab.name) ELSE NULL END::text AS name,CASE count(*) WHEN 1 THEN MIN(tab.start) ELSE NULL END::date AS start +SUM(number*1)/count(*)::double precision AS number,SUM(int_number*1)/count(*)::integer AS int_number,CASE WHEN count(*) = 1 THEN MIN(name) WHEN count(*) < 5 THEN string_agg(distinct name,' / ') ELSE '*' END::text AS name,CASE count(*) WHEN 1 THEN MIN(start) ELSE NULL END::date AS start +SUM(tab.number*1)/count(*)::double precision AS number,SUM(tab.int_number*1)/count(*)::integer AS int_number,CASE WHEN count(*) = 1 THEN MIN(tab.name) WHEN count(*) < 5 THEN string_agg(distinct tab.name,' / ') ELSE '*' END::text AS name,CASE count(*) WHEN 1 THEN MIN(tab.start) ELSE NULL END::date AS start {_vovw_2_base_t,_vovw_1_base_t,_vovw_0_base_t} 126 From 1b0f77aa964154f50c0e024ed1e41b1f9f96e4a6 Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Fri, 15 Apr 2016 17:49:00 +0200 Subject: [PATCH 2/4] Always retain single-valued aggregated texts This makes columns which have the same value in a group to be aggregated maintain that value (rather than replacing it by the multiple-value indicator *) whatever the group value is. (Previously this happend only for small groups) --- scripts-available/CDB_Overviews.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts-available/CDB_Overviews.sql b/scripts-available/CDB_Overviews.sql index 9f95bdc..a9b0c7f 100644 --- a/scripts-available/CDB_Overviews.sql +++ b/scripts-available/CDB_Overviews.sql @@ -582,7 +582,7 @@ BEGIN IF unlimited_text THEN -- TODO: this should not be applied to columns containing largish text; -- it is intended only to short names/identifiers - RETURN 'CASE WHEN count(*) = 1 THEN MIN(' || qualified_column || ') WHEN ' || total_feature_count || ' < 5 THEN string_agg(distinct ' || qualified_column || ','' / '') ELSE ''*'' END::' || column_type; + RETURN 'CASE WHEN count(distinct ' || qualified_column || ') = 1 THEN MIN(' || qualified_column || ') WHEN ' || total_feature_count || ' < 5 THEN string_agg(distinct ' || qualified_column || ','' / '') ELSE ''*'' END::' || column_type; ELSE RETURN 'CASE count(*) WHEN 1 THEN MIN(' || qualified_column || ') ELSE NULL END::' || column_type; END IF; From 09ad550de3ef3db4ae068af8ad606b0bd8f590ed Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Fri, 15 Apr 2016 17:50:47 +0200 Subject: [PATCH 3/4] Fix tests --- test/CDB_OverviewsTest_expect | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/CDB_OverviewsTest_expect b/test/CDB_OverviewsTest_expect index 66bf25c..a8a1495 100644 --- a/test/CDB_OverviewsTest_expect +++ b/test/CDB_OverviewsTest_expect @@ -12,8 +12,8 @@ SELECT 1114 {_vovw_2_base_bare_t,_vovw_1_base_bare_t,_vovw_0_base_bare_t} 126 number,int_number,name,start -SUM(number*1)/count(*)::double precision AS number,SUM(int_number*1)/count(*)::integer AS int_number,CASE WHEN count(*) = 1 THEN MIN(name) WHEN count(*) < 5 THEN string_agg(distinct name,' / ') ELSE '*' END::text AS name,CASE count(*) WHEN 1 THEN MIN(start) ELSE NULL END::date AS start -SUM(tab.number*1)/count(*)::double precision AS number,SUM(tab.int_number*1)/count(*)::integer AS int_number,CASE WHEN count(*) = 1 THEN MIN(tab.name) WHEN count(*) < 5 THEN string_agg(distinct tab.name,' / ') ELSE '*' END::text AS name,CASE count(*) WHEN 1 THEN MIN(tab.start) ELSE NULL END::date AS start +SUM(number*1)/count(*)::double precision AS number,SUM(int_number*1)/count(*)::integer AS int_number,CASE WHEN count(distinct name) = 1 THEN MIN(name) WHEN count(*) < 5 THEN string_agg(distinct name,' / ') ELSE '*' END::text AS name,CASE count(*) WHEN 1 THEN MIN(start) ELSE NULL END::date AS start +SUM(tab.number*1)/count(*)::double precision AS number,SUM(tab.int_number*1)/count(*)::integer AS int_number,CASE WHEN count(distinct tab.name) = 1 THEN MIN(tab.name) WHEN count(*) < 5 THEN string_agg(distinct tab.name,' / ') ELSE '*' END::text AS name,CASE count(*) WHEN 1 THEN MIN(tab.start) ELSE NULL END::date AS start {_vovw_2_base_t,_vovw_1_base_t,_vovw_0_base_t} 126 From ef43623f779f6c18b72a17caae94226d887587ea Mon Sep 17 00:00:00 2001 From: Javier Goizueta Date: Fri, 15 Apr 2016 17:58:03 +0200 Subject: [PATCH 4/4] Remove unneeded variable --- scripts-available/CDB_Overviews.sql | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts-available/CDB_Overviews.sql b/scripts-available/CDB_Overviews.sql index a9b0c7f..699ca5c 100644 --- a/scripts-available/CDB_Overviews.sql +++ b/scripts-available/CDB_Overviews.sql @@ -549,7 +549,6 @@ DECLARE has_counter_column BOOLEAN; feature_count TEXT; total_feature_count TEXT; - unlimited_text BOOLEAN; BEGIN IF table_alias <> '' THEN qualified_column := Format('%I.%I', table_alias, column_name); @@ -578,8 +577,7 @@ BEGIN RETURN Format('SUM(%s*%s)/%s::' || column_type, qualified_column, feature_count, total_feature_count); END IF; WHEN 'text', 'character varying', 'character' THEN - SELECT _cdb_unlimited_text_column(reloid, column_name) INTO unlimited_text; - IF unlimited_text THEN + IF _cdb_unlimited_text_column(reloid, column_name) THEN -- TODO: this should not be applied to columns containing largish text; -- it is intended only to short names/identifiers RETURN 'CASE WHEN count(distinct ' || qualified_column || ') = 1 THEN MIN(' || qualified_column || ') WHEN ' || total_feature_count || ' < 5 THEN string_agg(distinct ' || qualified_column || ','' / '') ELSE ''*'' END::' || column_type;