adds tests

This commit is contained in:
Andy Eschbacher 2016-09-28 15:55:56 -04:00
parent f2bb0b496b
commit b8accb48fc
3 changed files with 97 additions and 5 deletions

View File

@ -12,6 +12,7 @@ END;
$$ LANGUAGE plpgsql; $$ LANGUAGE plpgsql;
-- Find outliers by a percentage above the threshold -- Find outliers by a percentage above the threshold
-- TODO: add symmetric option? `symmetric boolean DEFAULT false`
CREATE OR REPLACE FUNCTION CDB_PercentOutlier(attr numeric[], outlier_fraction numeric, ids int[]) CREATE OR REPLACE FUNCTION CDB_PercentOutlier(attr numeric[], outlier_fraction numeric, ids int[])
RETURNS TABLE(outlier boolean, rowid int) RETURNS TABLE(outlier boolean, rowid int)
@ -21,10 +22,15 @@ DECLARE
out_vals boolean[]; out_vals boolean[];
BEGIN BEGIN
SELECT avg(i) INTO avg_val FROM unnest(attr) As x(i); SELECT avg(i) INTO avg_val
FROM unnest(attr) As x(i);
SELECT array_agg( i > avg_val * outlier_fraction) INTO out_vals SELECT array_agg(
FROM unnest(attr) As x(i); CASE WHEN avg_val = 0 THEN null
ELSE outlier_fraction > i / avg_val
END
) INTO out_vals
FROM unnest(attr) As x(i);
RETURN QUERY RETURN QUERY
SELECT unnest(out_vals) As outlier, SELECT unnest(out_vals) As outlier,
@ -45,10 +51,10 @@ DECLARE
BEGIN BEGIN
SELECT stddev(i), avg(i) INTO stddev_val, avg_val SELECT stddev(i), avg(i) INTO stddev_val, avg_val
FROM unnest(attrs) As x(i); FROM unnest(attrs) As x(i);
SELECT array_agg(abs(i - avg_val) / stddev_val > num_deviations) INTO out_vals SELECT array_agg(abs(i - avg_val) / stddev_val > num_deviations) INTO out_vals
FROM unnest(attrs) As x(i); FROM unnest(attrs) As x(i);
RETURN QUERY RETURN QUERY

View File

@ -0,0 +1,16 @@
outlier|rowid
t|11
t|16
t|17
outlier|rowid
t|16
t|17
outlier|rowid
t|8
t|11
t|16
outlier|rowid
t|8
t|9
t|11
t|15

View File

@ -0,0 +1,70 @@
SET client_min_messages TO WARNING;
\set ECHO none
\pset format unaligned
--
-- postgres=# select round(avg(i), 3) as avg,
-- round(stddev(i), 3) as stddev,
-- round(avg(i) + stddev(i), 3) as one_stddev,
-- round(avg(i) + 2 * stddev(i), 3) As two_stddev
-- from unnest(ARRAY[1,3,2,3,5,1,2,32,12,3,57,2,1,4,2,100]) As x(i);
-- avg | stddev | one_stddev | two_stddev
-- --------+--------+------------+------------
-- 14.375 | 27.322 | 41.697 | 69.020
-- With an threshold of 1.0 standard deviation, ids 11, 16, and 17 are outliers
WITH a AS (
SELECT
ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals, ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids
), b As (
SELECT
(cdb_StdDevOutlier(vals, 1.0, ids)).*
FROM a
ORDER BY ids)
SELECT *
FROM b
WHERE outlier IS TRUE;
-- With a threshold of 2.0 standard deviations, id 16 is the only outlier
WITH a AS (
SELECT
ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals,
ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids
), b As (
SELECT
(CDB_StdDevOutlier(vals, 2.0, ids)).*
FROM a
ORDER BY ids)
SELECT *
FROM b
WHERE outlier IS TRUE;
-- With a ratio threshold of 2.0 threshold (100% above or below the mean)
-- which is greater than ~21, which are values
WITH a AS (
SELECT
ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals,
ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids
), b As (
SELECT
(CDB_PercentOutlier(vals, 2.0, ids)).*
FROM a
ORDER BY ids)
SELECT *
FROM b
WHERE outlier IS TRUE;
-- With a static threshold of 11, what are the outliers
WITH a AS (
SELECT
ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals,
ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids
), b As (
SELECT unnest(vals) As v, unnest(ids) as i
FROM a
)
SELECT CDB_StaticOutlier(v, 11.0), i
FROM b
WHERE CDB_StaticOutlier(v, 11.0) is True
ORDER BY i;