From 49c4cea4e7736bde316e4c921689d6dbe2735d73 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Tue, 25 Aug 2015 22:45:55 -0400 Subject: [PATCH 1/9] adding kurtosis --- scripts-available/CDB_Stats.sql | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 scripts-available/CDB_Stats.sql diff --git a/scripts-available/CDB_Stats.sql b/scripts-available/CDB_Stats.sql new file mode 100644 index 0000000..f0a47ba --- /dev/null +++ b/scripts-available/CDB_Stats.sql @@ -0,0 +1,33 @@ +-- +-- Calculate the Pearson kurtosis of the input data +-- +-- @param in_array A numeric array of numbers to determine the best +-- to determine the bin boundary +-- +-- @param breaks The number of bins you want to find. +-- +-- +-- Returns: upper edges of bins +-- +-- + +CREATE OR REPLACE FUNCTION CDB_Kurtosis ( in_array NUMERIC[] ) RETURNS NUMERIC as $$ +DECLARE + a numeric; + c numeric; + s numeric; + k numeric; +BEGIN + SELECT AVG(e), COUNT(e)::numeric, stddev(e) INTO a, c, s FROM ( SELECT unnest(in_array) e ) x; + + RAISE NOTICE 'avg: %, cnt: %, std: %', a, c, s; + + EXECUTE ' + SELECT sum(power($1 - e,4)) / ( $2 * power($3, 4)) + FROM (SELECT unnest($4) e ) x' + INTO k + USING a, c, s, in_array; + + RETURN k; +END; +$$ language plpgsql IMMUTABLE; From db323f3e13574cf700c62e5a2001d669f1152960 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Tue, 25 Aug 2015 22:58:31 -0400 Subject: [PATCH 2/9] adding skewness --- scripts-available/CDB_Stats.sql | 39 +++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/scripts-available/CDB_Stats.sql b/scripts-available/CDB_Stats.sql index f0a47ba..b9b765b 100644 --- a/scripts-available/CDB_Stats.sql +++ b/scripts-available/CDB_Stats.sql @@ -1,18 +1,15 @@ -- --- Calculate the Pearson kurtosis of the input data +-- Calculate basic statistics of a given dataset -- --- @param in_array A numeric array of numbers to determine the best --- to determine the bin boundary +-- @param in_array A numeric array of numbers -- --- @param breaks The number of bins you want to find. --- --- --- Returns: upper edges of bins +-- Returns: statistical quantity chosen -- -- +-- Calculate Pearson's moment coefficient of kurtosis CREATE OR REPLACE FUNCTION CDB_Kurtosis ( in_array NUMERIC[] ) RETURNS NUMERIC as $$ -DECLARE +DECLARE a numeric; c numeric; s numeric; @@ -22,12 +19,32 @@ BEGIN RAISE NOTICE 'avg: %, cnt: %, std: %', a, c, s; - EXECUTE ' - SELECT sum(power($1 - e,4)) / ( $2 * power($3, 4)) - FROM (SELECT unnest($4) e ) x' + EXECUTE 'SELECT sum(power($1 - e, 4)) / ( $2 * power($3, 4)) + FROM (SELECT unnest($4) e ) x' INTO k USING a, c, s, in_array; RETURN k; END; $$ language plpgsql IMMUTABLE; + +-- Calculate Pearson's moment coefficient of skewness +CREATE OR REPLACE FUNCTION CDB_Skewness ( in_array NUMERIC[] ) RETURNS NUMERIC as $$ +DECLARE + a numeric; + c numeric; + s numeric; + sk numeric; +BEGIN + SELECT AVG(e), COUNT(e)::numeric, stddev(e) INTO a, c, s FROM ( SELECT unnest(in_array) e ) x; + + RAISE NOTICE 'avg: %, cnt: %, std: %', a, c, s; + + EXECUTE 'SELECT sum(power($1 - e, 3)) / ( $2 * power($3, 3)) + FROM (SELECT unnest($4) e ) x' + INTO sk + USING a, c, s, in_array; + + RETURN sk; +END; +$$ language plpgsql IMMUTABLE; From d723487f67811f1aa3ee13ad8d49a826b6867d41 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Tue, 25 Aug 2015 23:10:26 -0400 Subject: [PATCH 3/9] updated definition --- scripts-available/CDB_Stats.sql | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts-available/CDB_Stats.sql b/scripts-available/CDB_Stats.sql index b9b765b..543b3ee 100644 --- a/scripts-available/CDB_Stats.sql +++ b/scripts-available/CDB_Stats.sql @@ -5,9 +5,10 @@ -- -- Returns: statistical quantity chosen -- +-- References: http://www.itl.nist.gov/div898/handbook/eda/section3/eda35b.htm -- --- Calculate Pearson's moment coefficient of kurtosis +-- Calculate kurtosis CREATE OR REPLACE FUNCTION CDB_Kurtosis ( in_array NUMERIC[] ) RETURNS NUMERIC as $$ DECLARE a numeric; @@ -19,7 +20,7 @@ BEGIN RAISE NOTICE 'avg: %, cnt: %, std: %', a, c, s; - EXECUTE 'SELECT sum(power($1 - e, 4)) / ( $2 * power($3, 4)) + EXECUTE 'SELECT sum(power($1 - e, 4)) / ( $2 * power($3, 4)) - 3 FROM (SELECT unnest($4) e ) x' INTO k USING a, c, s, in_array; @@ -28,7 +29,7 @@ BEGIN END; $$ language plpgsql IMMUTABLE; --- Calculate Pearson's moment coefficient of skewness +-- Calculate skewness CREATE OR REPLACE FUNCTION CDB_Skewness ( in_array NUMERIC[] ) RETURNS NUMERIC as $$ DECLARE a numeric; From 14e2a65523f989e3879e0cef64e1b9479df4f199 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Tue, 25 Aug 2015 23:11:09 -0400 Subject: [PATCH 4/9] adding symlink --- scripts-enabled/CDB_Stats.sql | 1 + 1 file changed, 1 insertion(+) create mode 120000 scripts-enabled/CDB_Stats.sql diff --git a/scripts-enabled/CDB_Stats.sql b/scripts-enabled/CDB_Stats.sql new file mode 120000 index 0000000..0d7fa50 --- /dev/null +++ b/scripts-enabled/CDB_Stats.sql @@ -0,0 +1 @@ +scripts-available/CDB_Stats.sql \ No newline at end of file From 07280321ab4f99a1314550806c1c3f9879721cda Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Wed, 2 Sep 2015 22:19:07 -0400 Subject: [PATCH 5/9] adding tests --- test/CDB_StatsTest.sql | 17 +++++++++++++++++ test/CDB_StatsTest_expect | 1 + 2 files changed, 18 insertions(+) create mode 100644 test/CDB_StatsTest.sql create mode 100644 test/CDB_StatsTest_expect diff --git a/test/CDB_StatsTest.sql b/test/CDB_StatsTest.sql new file mode 100644 index 0000000..2bcbf69 --- /dev/null +++ b/test/CDB_StatsTest.sql @@ -0,0 +1,17 @@ +-- continuous uniform distribution has kurtosis = -6/5, skewness = 0.0 +-- http://mathworld.wolfram.com/UniformDistribution.html + +With dist As ( + SELECT random() As val + FROM generate_series(1,5000000) t +), +m As ( + SELECT avg(val) mn, count(*) cnt, stddev(val) s + FROM dist + ) + +SELECT + abs(sum(power(mn - val,4)) / ( cnt * power(s,4)) - 3 + 1.20) < 1e-3 As kurtosis, + abs(sum(power(mn - val,3)) / ( cnt * power(s,3))) < 1e-3 As skewness +FROM dist, m +GROUP BY m.cnt, m.mn, m.s \ No newline at end of file diff --git a/test/CDB_StatsTest_expect b/test/CDB_StatsTest_expect new file mode 100644 index 0000000..c1a8116 --- /dev/null +++ b/test/CDB_StatsTest_expect @@ -0,0 +1 @@ +true|true From d00e71309d62f4f8353e4bcba73e2c2f7e8d33e5 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Wed, 2 Sep 2015 22:35:03 -0400 Subject: [PATCH 6/9] really add tests --- test/CDB_StatsTest.sql | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/test/CDB_StatsTest.sql b/test/CDB_StatsTest.sql index 2bcbf69..5bfb5dc 100644 --- a/test/CDB_StatsTest.sql +++ b/test/CDB_StatsTest.sql @@ -2,16 +2,12 @@ -- http://mathworld.wolfram.com/UniformDistribution.html With dist As ( - SELECT random() As val + SELECT random()::numeric As val FROM generate_series(1,5000000) t -), -m As ( - SELECT avg(val) mn, count(*) cnt, stddev(val) s - FROM dist - ) +) SELECT - abs(sum(power(mn - val,4)) / ( cnt * power(s,4)) - 3 + 1.20) < 1e-3 As kurtosis, - abs(sum(power(mn - val,3)) / ( cnt * power(s,3))) < 1e-3 As skewness -FROM dist, m -GROUP BY m.cnt, m.mn, m.s \ No newline at end of file + -- does random dist values match within 1% of known values + abs(CDB_Kurtosis(array_agg(val)) + 1.20) < 1e-2 As kurtosis, + abs(CDB_Skewness(array_agg(val)) - 0) < 1e-2 As skewness +FROM dist \ No newline at end of file From 29efdf2ee726bdbc7c8a82242a80a5b452ed7a11 Mon Sep 17 00:00:00 2001 From: Raul Ochoa Date: Fri, 4 Sep 2015 00:00:30 +0200 Subject: [PATCH 7/9] Fix symlink for CDB_Stats.sql --- scripts-enabled/CDB_Stats.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts-enabled/CDB_Stats.sql b/scripts-enabled/CDB_Stats.sql index 0d7fa50..37abd7b 120000 --- a/scripts-enabled/CDB_Stats.sql +++ b/scripts-enabled/CDB_Stats.sql @@ -1 +1 @@ -scripts-available/CDB_Stats.sql \ No newline at end of file +../scripts-available/CDB_Stats.sql \ No newline at end of file From 25cf48d4a491bdd8b6a6fe14676c4822d71a56ae Mon Sep 17 00:00:00 2001 From: Raul Ochoa Date: Fri, 4 Sep 2015 00:02:30 +0200 Subject: [PATCH 8/9] Raise min message so we don't have to validate notices --- test/CDB_StatsTest.sql | 5 ++++- test/CDB_StatsTest_expect | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/test/CDB_StatsTest.sql b/test/CDB_StatsTest.sql index 5bfb5dc..9cb6bc2 100644 --- a/test/CDB_StatsTest.sql +++ b/test/CDB_StatsTest.sql @@ -1,5 +1,6 @@ -- continuous uniform distribution has kurtosis = -6/5, skewness = 0.0 -- http://mathworld.wolfram.com/UniformDistribution.html +set client_min_messages to ERROR; With dist As ( SELECT random()::numeric As val @@ -10,4 +11,6 @@ SELECT -- does random dist values match within 1% of known values abs(CDB_Kurtosis(array_agg(val)) + 1.20) < 1e-2 As kurtosis, abs(CDB_Skewness(array_agg(val)) - 0) < 1e-2 As skewness -FROM dist \ No newline at end of file +FROM dist; + +set client_min_messages to NOTICE; diff --git a/test/CDB_StatsTest_expect b/test/CDB_StatsTest_expect index c1a8116..fdc125d 100644 --- a/test/CDB_StatsTest_expect +++ b/test/CDB_StatsTest_expect @@ -1 +1,3 @@ -true|true +SET +t|t +SET From 83b7f47617b900d62cf21281c3f12cc7a422ea56 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Thu, 3 Sep 2015 22:43:25 -0400 Subject: [PATCH 9/9] removing raise notices and lower test bounds --- scripts-available/CDB_Stats.sql | 4 ---- test/CDB_StatsTest.sql | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/scripts-available/CDB_Stats.sql b/scripts-available/CDB_Stats.sql index 543b3ee..e16748c 100644 --- a/scripts-available/CDB_Stats.sql +++ b/scripts-available/CDB_Stats.sql @@ -18,8 +18,6 @@ DECLARE BEGIN SELECT AVG(e), COUNT(e)::numeric, stddev(e) INTO a, c, s FROM ( SELECT unnest(in_array) e ) x; - RAISE NOTICE 'avg: %, cnt: %, std: %', a, c, s; - EXECUTE 'SELECT sum(power($1 - e, 4)) / ( $2 * power($3, 4)) - 3 FROM (SELECT unnest($4) e ) x' INTO k @@ -39,8 +37,6 @@ DECLARE BEGIN SELECT AVG(e), COUNT(e)::numeric, stddev(e) INTO a, c, s FROM ( SELECT unnest(in_array) e ) x; - RAISE NOTICE 'avg: %, cnt: %, std: %', a, c, s; - EXECUTE 'SELECT sum(power($1 - e, 3)) / ( $2 * power($3, 3)) FROM (SELECT unnest($4) e ) x' INTO sk diff --git a/test/CDB_StatsTest.sql b/test/CDB_StatsTest.sql index 5bfb5dc..06ef647 100644 --- a/test/CDB_StatsTest.sql +++ b/test/CDB_StatsTest.sql @@ -3,7 +3,7 @@ With dist As ( SELECT random()::numeric As val - FROM generate_series(1,5000000) t + FROM generate_series(1,50000) t ) SELECT