initial commit

2015-06-08 13:37:27 -04:00 · 2015-06-08 13:37:27 -04:00 · 424564e324
commit 424564e324
parent b1e1723e75
1 changed files with 142 additions and 0 deletions
--- a/scripts-available/CDB_DistType.sql
+++ b/scripts-available/CDB_DistType.sql
@ -0,0 +1,142 @@
+--
+-- CDB_DistType classifies the histograms of a column into
+-- one of the basic types listed by Galtong: http://druedin.com/2012/12/08/galtungs-ajus-system/
+-- 
+-- Future improvements:
+--    variable number of bins (7 is baked in right now)
+--    catch the number of items to ensure that the sample is large enough
+--
+-- Refs:
+--    1. width_bucket/histograms: http://tapoueh.org/blog/2014/02/21-PostgreSQL-histogram
+--    2. R implementation: https://github.com/cran/agrmt
+
+
+CREATE OR REPLACE FUNCTION CDB_ClassTest ( in_array NUMERIC[] ) RETURNS text as $$ 
+DECLARE 
+    element_count INT4;
+    minv numeric;
+    maxv numeric;
+    bins numeric[];
+    freqs numeric[];
+    diff numeric[];
+    diffdiff numeric[];
+    ajus INT[];
+    freq INT4;
+    signature text;
+    i INT := 1;
+BEGIN
+    SELECT min(e), max(e), count(e) INTO minv, maxv, element_count FROM ( SELECT unnest(in_array) e ) x;
+
+    IF abs(maxv - minv) < 1e-7 THEN -- if max and min are nearly equal, call if 'F'
+        signature = 'F';
+    ELSE
+        -- Calculate bins and count in bins
+        EXECUTE 'WITH stats as (
+            SELECT min(e) as minv,
+                   max(e) as maxv,
+                   count(e) as total
+            FROM (SELECT unnest($1) e) x
+            WHERE e is not null
+        ),
+        hist as (
+            SELECT width_bucket(e, s.minv, s.maxv, 6) bucket,
+                   count(*) freq
+            FROM (SELECT unnest($1) e) x, stats s
+            WHERE e is not null
+            GROUP BY 1
+            ORDER BY 1
+        )
+        SELECT array_agg(round(100.0 * hist.freq::numeric / stats.total::numeric,1)) freqs,
+               array_agg(hist.bucket) buckets
+        FROM hist, stats'
+        INTO freqs, bins
+        USING in_array;
+        -- don't need bins variable (freq are already sorted), discard it later
+
+        LOOP
+            IF i < 7 THEN
+                ajus[i] = CDB_CompareValues(freqs[i],freqs[i+1],5.0); -- 5% tolerance
+            ELSE
+                EXIT;
+            END IF;
+            i := i + 1;
+        END LOOP;
+
+        signature = CDB_DistributionType(ajus);
+    END IF;
+
+    RETURN signature;
+END;
+$$ language plpgsql IMMUTABLE;
+
+-- Classify data into AJUSFL
+
+CREATE OR REPLACE FUNCTION CDB_DistributionType ( in_array INT[] ) RETURNS text as $$
+DECLARE
+    element_count INT4;
+    maxv numeric;
+    minv numeric;
+    uniques INT[];
+    type text;
+BEGIN
+    SELECT max(e), min(e) INTO maxv, minv FROM ( SELECT unnest(in_array) e ) x;
+
+    IF (maxv = 0 AND minv = 0) THEN
+        type = 'F';
+    ELSIF maxv < 1 THEN
+        type = 'L';
+    ELSIF minv > -1 THEN
+        type = 'J';
+    ELSE
+        -- Get distinct elements ordered by original position
+        EXECUTE 'WITH b AS (
+            SELECT a
+            FROM (SELECT unnest($1) a) x
+        ),
+        c AS (
+            SELECT a, row_number() OVER () r
+            FROM b
+        ),
+        d AS (
+            SELECT DISTINCT a
+            FROM c
+        ),
+        e AS (
+            SELECT a FROM d ORDER BY (
+                SELECT r FROM c WHERE d.a = c.a ORDER BY r ASC LIMIT 1
+            ) ASC)
+        SELECT array_agg(a) FROM e'
+        INTO uniques
+        USING in_array;
+
+        -- Decide if it's an A, U, or other
+        IF (uniques = ARRAY[1,-1] OR uniques = ARRAY[1,0,-1] OR uniques = ARRAY[1,-1,0] OR uniques = ARRAY[0,1,-1]) THEN
+            type = 'A';
+        ELSIF (uniques = ARRAY[-1,1] OR uniques = ARRAY[-1,0,1] OR uniques = ARRAY[-1,1,0] OR uniques = ARRAY[0,-1,1]) THEN
+            type = 'U';
+        ELSE
+            type = 'S';
+        END IF;
+    END IF;
+
+    RETURN type;
+END;
+$$ language plpgsql IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION CDB_CompareValues ( a numeric, b numeric, tolerance numeric ) RETURNS INT as $$
+DECLARE
+    d INT4;
+BEGIN
+    IF a > b THEN
+        SELECT -1 INTO d;
+    ELSE
+        SELECT 1 INTO d;
+    END IF;
+
+    IF abs(a-b) <= tolerance THEN
+        SELECT 0 INTO d;
+    END IF;
+
+    RETURN d;
+END; 
+$$ language plpgsql IMMUTABLE;