diff --git a/scripts-available/CDB_Stats.sql b/scripts-available/CDB_Stats.sql
new file mode 100644
index 0000000..e16748c
--- /dev/null
+++ b/scripts-available/CDB_Stats.sql
@@ -0,0 +1,58 @@
+--
+-- Calculate basic statistics of a given dataset
+--
+-- @param in_array A numeric array of numbers
+--
+-- Returns: statistical quantity chosen, or NULL when it is undefined
+--          (empty array, only NULL elements, or zero variance)
+--
+-- Both quantities follow the NIST definitions: the mean and the standard
+-- deviation are taken over the whole array, with N (not N - 1) in the
+-- denominator of the standard deviation.
+--
+-- References: http://www.itl.nist.gov/div898/handbook/eda/section3/eda35b.htm
+--
+
+-- Calculate excess kurtosis: sum((e - mean)^4) / (N * s^4) - 3
+CREATE OR REPLACE FUNCTION CDB_Kurtosis ( in_array NUMERIC[] ) RETURNS NUMERIC as $$
+DECLARE
+    a numeric; -- mean of the non-NULL elements
+    c numeric; -- number of non-NULL elements
+    s numeric; -- population standard deviation
+    k numeric; -- resulting excess kurtosis
+BEGIN
+    SELECT avg(e), count(e)::numeric, stddev_pop(e)
+    INTO a, c, s
+    FROM unnest(in_array) AS x(e);
+
+    -- NULLIF turns a zero denominator (no spread in the data) into NULL
+    -- instead of raising a division-by-zero error.
+    SELECT sum(power(e - a, 4)) / NULLIF(c * power(s, 4), 0) - 3
+    INTO k
+    FROM unnest(in_array) AS x(e);
+
+    RETURN k;
+END;
+$$ language plpgsql IMMUTABLE;
+
+-- Calculate skewness: sum((e - mean)^3) / (N * s^3)
+CREATE OR REPLACE FUNCTION CDB_Skewness ( in_array NUMERIC[] ) RETURNS NUMERIC as $$
+DECLARE
+    a numeric;  -- mean of the non-NULL elements
+    c numeric;  -- number of non-NULL elements
+    s numeric;  -- population standard deviation
+    sk numeric; -- resulting skewness
+BEGIN
+    SELECT avg(e), count(e)::numeric, stddev_pop(e)
+    INTO a, c, s
+    FROM unnest(in_array) AS x(e);
+
+    -- Deviations are taken as (e - mean) so that a long right tail gives a
+    -- positive result; NULLIF guards against a zero denominator.
+    SELECT sum(power(e - a, 3)) / NULLIF(c * power(s, 3), 0)
+    INTO sk
+    FROM unnest(in_array) AS x(e);
+
+    RETURN sk;
+END;
+$$ language plpgsql IMMUTABLE;
diff --git a/scripts-enabled/CDB_Stats.sql b/scripts-enabled/CDB_Stats.sql
new file mode 120000
index 0000000..37abd7b
--- /dev/null
+++ b/scripts-enabled/CDB_Stats.sql
@@ -0,0 +1 @@
+../scripts-available/CDB_Stats.sql
\ No newline at end of file
diff --git a/test/CDB_StatsTest.sql b/test/CDB_StatsTest.sql
new file mode 100644
index 0000000..571d323
--- /dev/null
+++ b/test/CDB_StatsTest.sql
@@ -0,0 +1,24 @@
+-- continuous uniform distribution has kurtosis = -6/5, skewness = 0.0
+-- http://mathworld.wolfram.com/UniformDistribution.html
+set client_min_messages to ERROR;
+
+-- Evenly spaced midpoints of (0, 1) stand in for uniform samples, keeping the
+-- result deterministic; random() draws would fail the 1e-2 tolerance on some runs.
+With dist As (
+    SELECT ((t - 0.5) / 50000)::numeric As val
+    FROM generate_series(1,50000) t
+)
+
+SELECT
+    -- do the uniform values match the known values to within 1e-2
+    abs(CDB_Kurtosis(array_agg(val)) + 1.20) < 1e-2 As kurtosis,
+    abs(CDB_Skewness(array_agg(val)) - 0) < 1e-2 As skewness
+FROM dist;
+
+-- a long right tail must give positive skewness; constant data has no
+-- spread, so both statistics are undefined (NULL)
+SELECT
+    CDB_Skewness(ARRAY[1, 1, 1, 10]::numeric[]) > 0 As right_skew_positive,
+    CDB_Kurtosis(ARRAY[5, 5, 5]::numeric[]) IS NULL As constant_is_null;
+
+set client_min_messages to NOTICE;
diff --git a/test/CDB_StatsTest_expect b/test/CDB_StatsTest_expect
new file mode 100644
index 0000000..fdc125d
--- /dev/null
+++ b/test/CDB_StatsTest_expect
@@ -0,0 +1,4 @@
+SET
+t|t
+t|t
+SET