diff --git a/src/pg/sql/18_outliers.sql b/src/pg/sql/18_outliers.sql index 7d4b3b2..3ae7a5e 100644 --- a/src/pg/sql/18_outliers.sql +++ b/src/pg/sql/18_outliers.sql @@ -12,6 +12,7 @@ END; $$ LANGUAGE plpgsql; -- Find outliers by a percentage above the threshold +-- TODO: add symmetric option? `symmetric boolean DEFAULT false` CREATE OR REPLACE FUNCTION CDB_PercentOutlier(attr numeric[], outlier_fraction numeric, ids int[]) RETURNS TABLE(outlier boolean, rowid int) @@ -21,10 +22,15 @@ DECLARE out_vals boolean[]; BEGIN - SELECT avg(i) INTO avg_val FROM unnest(attr) As x(i); + SELECT avg(i) INTO avg_val + FROM unnest(attr) As x(i); - SELECT array_agg( i > avg_val * outlier_fraction) INTO out_vals - FROM unnest(attr) As x(i); + SELECT array_agg( + CASE WHEN avg_val = 0 THEN null + ELSE outlier_fraction > i / avg_val + END + ) INTO out_vals + FROM unnest(attr) As x(i); RETURN QUERY SELECT unnest(out_vals) As outlier, @@ -45,10 +51,10 @@ DECLARE BEGIN SELECT stddev(i), avg(i) INTO stddev_val, avg_val - FROM unnest(attrs) As x(i); + FROM unnest(attrs) As x(i); SELECT array_agg(abs(i - avg_val) / stddev_val > num_deviations) INTO out_vals - FROM unnest(attrs) As x(i); + FROM unnest(attrs) As x(i); RETURN QUERY diff --git a/src/pg/test/expected/18_outliers.out b/src/pg/test/expected/18_outliers.out new file mode 100644 index 0000000..1c0f186 --- /dev/null +++ b/src/pg/test/expected/18_outliers.out @@ -0,0 +1,16 @@ +outlier|rowid +t|11 +t|16 +t|17 +outlier|rowid +t|16 +t|17 +outlier|rowid +t|8 +t|11 +t|16 +outlier|rowid +t|8 +t|9 +t|11 +t|15 diff --git a/src/pg/test/sql/18_outliers.sql b/src/pg/test/sql/18_outliers.sql new file mode 100644 index 0000000..432d4c1 --- /dev/null +++ b/src/pg/test/sql/18_outliers.sql @@ -0,0 +1,70 @@ +SET client_min_messages TO WARNING; +\set ECHO none +\pset format unaligned + +-- +-- postgres=# select round(avg(i), 3) as avg, +-- round(stddev(i), 3) as stddev, +-- round(avg(i) + stddev(i), 3) as one_stddev, +-- round(avg(i) + 2 * stddev(i), 3) As two_stddev +-- from unnest(ARRAY[1,3,2,3,5,1,2,32,12,3,57,2,1,4,2,100]) As x(i); +-- avg | stddev | one_stddev | two_stddev +-- --------+--------+------------+------------ +-- 14.375 | 27.322 | 41.697 | 69.020 + + +-- With an threshold of 1.0 standard deviation, ids 11, 16, and 17 are outliers +WITH a AS ( + SELECT + ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals, ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids +), b As ( + SELECT + (cdb_StdDevOutlier(vals, 1.0, ids)).* + FROM a + ORDER BY ids) +SELECT * +FROM b +WHERE outlier IS TRUE; + +-- With a threshold of 2.0 standard deviations, id 16 is the only outlier +WITH a AS ( + SELECT + ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals, + ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids +), b As ( + SELECT + (CDB_StdDevOutlier(vals, 2.0, ids)).* + FROM a + ORDER BY ids) +SELECT * +FROM b +WHERE outlier IS TRUE; + +-- With a ratio threshold of 2.0 threshold (100% above or below the mean) +-- which is greater than ~21, which are values +WITH a AS ( + SELECT + ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals, + ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids +), b As ( + SELECT + (CDB_PercentOutlier(vals, 2.0, ids)).* + FROM a + ORDER BY ids) +SELECT * + FROM b + WHERE outlier IS TRUE; + +-- With a static threshold of 11, what are the outliers +WITH a AS ( + SELECT + ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals, + ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids + ), b As ( + SELECT unnest(vals) As v, unnest(ids) as i + FROM a + ) +SELECT CDB_StaticOutlier(v, 11.0), i + FROM b +WHERE CDB_StaticOutlier(v, 11.0) is True +ORDER BY i;