adds tests
This commit is contained in:
parent
f2bb0b496b
commit
b8accb48fc
@ -12,6 +12,7 @@ END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- Find outliers by a percentage above the threshold
|
||||
-- TODO: add symmetric option? `symmetric boolean DEFAULT false`
|
||||
|
||||
CREATE OR REPLACE FUNCTION CDB_PercentOutlier(attr numeric[], outlier_fraction numeric, ids int[])
|
||||
RETURNS TABLE(outlier boolean, rowid int)
|
||||
@ -21,9 +22,14 @@ DECLARE
|
||||
out_vals boolean[];
|
||||
BEGIN
|
||||
|
||||
SELECT avg(i) INTO avg_val FROM unnest(attr) As x(i);
|
||||
SELECT avg(i) INTO avg_val
|
||||
FROM unnest(attr) As x(i);
|
||||
|
||||
SELECT array_agg( i > avg_val * outlier_fraction) INTO out_vals
|
||||
SELECT array_agg(
|
||||
CASE WHEN avg_val = 0 THEN null
|
||||
ELSE outlier_fraction > i / avg_val
|
||||
END
|
||||
) INTO out_vals
|
||||
FROM unnest(attr) As x(i);
|
||||
|
||||
RETURN QUERY
|
||||
|
16
src/pg/test/expected/18_outliers.out
Normal file
16
src/pg/test/expected/18_outliers.out
Normal file
@ -0,0 +1,16 @@
|
||||
outlier|rowid
|
||||
t|11
|
||||
t|16
|
||||
t|17
|
||||
outlier|rowid
|
||||
t|16
|
||||
t|17
|
||||
outlier|rowid
|
||||
t|8
|
||||
t|11
|
||||
t|16
|
||||
outlier|rowid
|
||||
t|8
|
||||
t|9
|
||||
t|11
|
||||
t|15
|
70
src/pg/test/sql/18_outliers.sql
Normal file
70
src/pg/test/sql/18_outliers.sql
Normal file
@ -0,0 +1,70 @@
|
||||
SET client_min_messages TO WARNING;
|
||||
\set ECHO none
|
||||
\pset format unaligned
|
||||
|
||||
--
|
||||
-- postgres=# select round(avg(i), 3) as avg,
|
||||
-- round(stddev(i), 3) as stddev,
|
||||
-- round(avg(i) + stddev(i), 3) as one_stddev,
|
||||
-- round(avg(i) + 2 * stddev(i), 3) As two_stddev
|
||||
-- from unnest(ARRAY[1,3,2,3,5,1,2,32,12,3,57,2,1,4,2,100]) As x(i);
|
||||
-- avg | stddev | one_stddev | two_stddev
|
||||
-- --------+--------+------------+------------
|
||||
-- 14.375 | 27.322 | 41.697 | 69.020
|
||||
|
||||
|
||||
-- With an threshold of 1.0 standard deviation, ids 11, 16, and 17 are outliers
|
||||
WITH a AS (
|
||||
SELECT
|
||||
ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals, ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids
|
||||
), b As (
|
||||
SELECT
|
||||
(cdb_StdDevOutlier(vals, 1.0, ids)).*
|
||||
FROM a
|
||||
ORDER BY ids)
|
||||
SELECT *
|
||||
FROM b
|
||||
WHERE outlier IS TRUE;
|
||||
|
||||
-- With a threshold of 2.0 standard deviations, id 16 is the only outlier
|
||||
WITH a AS (
|
||||
SELECT
|
||||
ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals,
|
||||
ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids
|
||||
), b As (
|
||||
SELECT
|
||||
(CDB_StdDevOutlier(vals, 2.0, ids)).*
|
||||
FROM a
|
||||
ORDER BY ids)
|
||||
SELECT *
|
||||
FROM b
|
||||
WHERE outlier IS TRUE;
|
||||
|
||||
-- With a ratio threshold of 2.0 threshold (100% above or below the mean)
|
||||
-- which is greater than ~21, which are values
|
||||
WITH a AS (
|
||||
SELECT
|
||||
ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals,
|
||||
ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids
|
||||
), b As (
|
||||
SELECT
|
||||
(CDB_PercentOutlier(vals, 2.0, ids)).*
|
||||
FROM a
|
||||
ORDER BY ids)
|
||||
SELECT *
|
||||
FROM b
|
||||
WHERE outlier IS TRUE;
|
||||
|
||||
-- With a static threshold of 11, what are the outliers
|
||||
WITH a AS (
|
||||
SELECT
|
||||
ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals,
|
||||
ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids
|
||||
), b As (
|
||||
SELECT unnest(vals) As v, unnest(ids) as i
|
||||
FROM a
|
||||
)
|
||||
SELECT CDB_StaticOutlier(v, 11.0), i
|
||||
FROM b
|
||||
WHERE CDB_StaticOutlier(v, 11.0) is True
|
||||
ORDER BY i;
|
Loading…
Reference in New Issue
Block a user