cartodb-postgresql/scripts-available/CDB_DistType.sql

123 lines
3.6 KiB
PL/PgSQL

--
-- CDB_DistType classifies the histograms of a column into
-- one of the basic types listed by Galtung: http://druedin.com/2012/12/08/galtungs-ajus-system/
--
-- Future improvements:
-- variable number of bins (7 is baked in right now)
-- catch the number of items to ensure that the sample is large enough
--
-- Refs:
-- 1. width_bucket/histograms: http://tapoueh.org/blog/2014/02/21-PostgreSQL-histogram
-- 2. R implementation: https://github.com/cran/agrmt
CREATE OR REPLACE FUNCTION CDB_DistType ( in_array NUMERIC[] ) RETURNS text as $$
DECLARE
element_count INT4;
minv numeric;
maxv numeric;
bins numeric[];
freqs numeric[];
ajus INT[];
freq INT4;
signature text;
i INT := 1;
BEGIN
SELECT min(e), max(e), count(e) INTO minv, maxv, element_count FROM ( SELECT unnest(in_array) e ) x;
IF abs(maxv - minv) < 1e-7 THEN -- if max and min are nearly equal, call if 'F' (make relative to maxv?)
signature = 'F';
ELSE
-- Calculate bins and count in bins
EXECUTE 'WITH stats as (
SELECT min(e) as minv,
max(e) as maxv,
count(e) as total
FROM (SELECT unnest($1) e) x
WHERE e is not null
),
hist as (
SELECT width_bucket(e, s.minv, s.maxv, 7) bucket,
count(*) freq
FROM (SELECT unnest($1) e) x, stats s
WHERE e is not null
GROUP BY 1
ORDER BY 1
)
SELECT array_agg(round(100.0 * hist.freq::numeric / stats.total::numeric,1)) freqs,
array_agg(hist.bucket) buckets
FROM hist, stats'
INTO freqs, bins
USING in_array;
LOOP
IF i < 7 THEN
ajus[i] = CASE WHEN freqs[i] > freqs[i+1] THEN -1
WHEN abs(freqs[i] - freqs[i+1]) <= 0.05 THEN 0
ELSE 1 END;
ELSE
EXIT;
END IF;
i := i + 1;
END LOOP;
signature = _CDB_DistTypeClassify(ajus);
END IF;
RETURN signature;
END;
$$ language plpgsql IMMUTABLE;
-- Classify data into AJUSFL
CREATE OR REPLACE FUNCTION _CDB_DistTypeClassify ( in_array INT[] ) RETURNS text as $$
DECLARE
element_count INT4;
maxv numeric;
minv numeric;
uniques INT[];
type text;
BEGIN
SELECT max(e), min(e) INTO maxv, minv FROM ( SELECT unnest(in_array) e ) x;
IF (maxv = 0 AND minv = 0) THEN
type = 'F';
ELSIF maxv < 1 THEN
type = 'L';
ELSIF minv > -1 THEN
type = 'J';
ELSE
-- Get distinct elements ordered by original position
EXECUTE 'WITH b AS (
SELECT a
FROM (SELECT unnest($1) a) x
),
c AS (
SELECT a, row_number() OVER () r
FROM b
),
d AS (
SELECT DISTINCT a
FROM c
),
e AS (
SELECT a FROM d ORDER BY (
SELECT r FROM c WHERE d.a = c.a ORDER BY r ASC LIMIT 1
) ASC)
SELECT array_agg(a) FROM e'
INTO uniques
USING in_array;
-- Decide if it's an A, U, or other
IF (uniques = ARRAY[1,-1] OR uniques = ARRAY[1,0,-1] OR uniques = ARRAY[1,-1,0] OR uniques = ARRAY[0,1,-1]) THEN
type = 'A';
ELSIF (uniques = ARRAY[-1,1] OR uniques = ARRAY[-1,0,1] OR uniques = ARRAY[-1,1,0] OR uniques = ARRAY[0,-1,1]) THEN
type = 'U';
ELSE
type = 'S';
END IF;
END IF;
RETURN type;
END;
$$ language plpgsql IMMUTABLE;