142 lines
3.9 KiB
MySQL
142 lines
3.9 KiB
MySQL
|
--
|
||
|
-- CDB_DistType classifies the histograms of a column into
|
||
|
-- one of the basic types listed by Galtong: http://druedin.com/2012/12/08/galtungs-ajus-system/
|
||
|
--
|
||
|
-- Future improvements:
|
||
|
-- variable number of bins (7 is baked in right now)
|
||
|
-- catch the number of items to ensure that the sample is large enough
|
||
|
--
|
||
|
-- Refs:
|
||
|
-- 1. width_bucket/histograms: http://tapoueh.org/blog/2014/02/21-PostgreSQL-histogram
|
||
|
-- 2. R implementation: https://github.com/cran/agrmt
|
||
|
|
||
|
|
||
|
CREATE OR REPLACE FUNCTION CDB_ClassTest ( in_array NUMERIC[] ) RETURNS text as $$
|
||
|
DECLARE
|
||
|
element_count INT4;
|
||
|
minv numeric;
|
||
|
maxv numeric;
|
||
|
bins numeric[];
|
||
|
freqs numeric[];
|
||
|
diff numeric[];
|
||
|
diffdiff numeric[];
|
||
|
ajus INT[];
|
||
|
freq INT4;
|
||
|
signature text;
|
||
|
i INT := 1;
|
||
|
BEGIN
|
||
|
SELECT min(e), max(e), count(e) INTO minv, maxv, element_count FROM ( SELECT unnest(in_array) e ) x;
|
||
|
|
||
|
IF abs(maxv - minv) < 1e-7 THEN -- if max and min are nearly equal, call if 'F'
|
||
|
signature = 'F';
|
||
|
ELSE
|
||
|
-- Calculate bins and count in bins
|
||
|
EXECUTE 'WITH stats as (
|
||
|
SELECT min(e) as minv,
|
||
|
max(e) as maxv,
|
||
|
count(e) as total
|
||
|
FROM (SELECT unnest($1) e) x
|
||
|
WHERE e is not null
|
||
|
),
|
||
|
hist as (
|
||
|
SELECT width_bucket(e, s.minv, s.maxv, 6) bucket,
|
||
|
count(*) freq
|
||
|
FROM (SELECT unnest($1) e) x, stats s
|
||
|
WHERE e is not null
|
||
|
GROUP BY 1
|
||
|
ORDER BY 1
|
||
|
)
|
||
|
SELECT array_agg(round(100.0 * hist.freq::numeric / stats.total::numeric,1)) freqs,
|
||
|
array_agg(hist.bucket) buckets
|
||
|
FROM hist, stats'
|
||
|
INTO freqs, bins
|
||
|
USING in_array;
|
||
|
-- don't need bins variable (freq are already sorted), discard it later
|
||
|
|
||
|
LOOP
|
||
|
IF i < 7 THEN
|
||
|
ajus[i] = CDB_CompareValues(freqs[i],freqs[i+1],5.0); -- 5% tolerance
|
||
|
ELSE
|
||
|
EXIT;
|
||
|
END IF;
|
||
|
i := i + 1;
|
||
|
END LOOP;
|
||
|
|
||
|
signature = CDB_DistributionType(ajus);
|
||
|
END IF;
|
||
|
|
||
|
RETURN signature;
|
||
|
END;
|
||
|
$$ language plpgsql IMMUTABLE;
|
||
|
|
||
|
-- Classify data into AJUSFL
|
||
|
|
||
|
CREATE OR REPLACE FUNCTION CDB_DistributionType ( in_array INT[] ) RETURNS text as $$
|
||
|
DECLARE
|
||
|
element_count INT4;
|
||
|
maxv numeric;
|
||
|
minv numeric;
|
||
|
uniques INT[];
|
||
|
type text;
|
||
|
BEGIN
|
||
|
SELECT max(e), min(e) INTO maxv, minv FROM ( SELECT unnest(in_array) e ) x;
|
||
|
|
||
|
IF (maxv = 0 AND minv = 0) THEN
|
||
|
type = 'F';
|
||
|
ELSIF maxv < 1 THEN
|
||
|
type = 'L';
|
||
|
ELSIF minv > -1 THEN
|
||
|
type = 'J';
|
||
|
ELSE
|
||
|
-- Get distinct elements ordered by original position
|
||
|
EXECUTE 'WITH b AS (
|
||
|
SELECT a
|
||
|
FROM (SELECT unnest($1) a) x
|
||
|
),
|
||
|
c AS (
|
||
|
SELECT a, row_number() OVER () r
|
||
|
FROM b
|
||
|
),
|
||
|
d AS (
|
||
|
SELECT DISTINCT a
|
||
|
FROM c
|
||
|
),
|
||
|
e AS (
|
||
|
SELECT a FROM d ORDER BY (
|
||
|
SELECT r FROM c WHERE d.a = c.a ORDER BY r ASC LIMIT 1
|
||
|
) ASC)
|
||
|
SELECT array_agg(a) FROM e'
|
||
|
INTO uniques
|
||
|
USING in_array;
|
||
|
|
||
|
-- Decide if it's an A, U, or other
|
||
|
IF (uniques = ARRAY[1,-1] OR uniques = ARRAY[1,0,-1] OR uniques = ARRAY[1,-1,0] OR uniques = ARRAY[0,1,-1]) THEN
|
||
|
type = 'A';
|
||
|
ELSIF (uniques = ARRAY[-1,1] OR uniques = ARRAY[-1,0,1] OR uniques = ARRAY[-1,1,0] OR uniques = ARRAY[0,-1,1]) THEN
|
||
|
type = 'U';
|
||
|
ELSE
|
||
|
type = 'S';
|
||
|
END IF;
|
||
|
END IF;
|
||
|
|
||
|
RETURN type;
|
||
|
END;
|
||
|
$$ language plpgsql IMMUTABLE;
|
||
|
|
||
|
CREATE OR REPLACE FUNCTION CDB_CompareValues ( a numeric, b numeric, tolerance numeric ) RETURNS INT as $$
|
||
|
DECLARE
|
||
|
d INT4;
|
||
|
BEGIN
|
||
|
IF a > b THEN
|
||
|
SELECT -1 INTO d;
|
||
|
ELSE
|
||
|
SELECT 1 INTO d;
|
||
|
END IF;
|
||
|
|
||
|
IF abs(a-b) <= tolerance THEN
|
||
|
SELECT 0 INTO d;
|
||
|
END IF;
|
||
|
|
||
|
RETURN d;
|
||
|
END;
|
||
|
$$ language plpgsql IMMUTABLE;
|