adds tests
This commit is contained in:
parent
f2bb0b496b
commit
b8accb48fc
@ -12,6 +12,7 @@ END;
|
|||||||
$$ LANGUAGE plpgsql;
|
$$ LANGUAGE plpgsql;
|
||||||
|
|
||||||
-- Find outliers by a percentage above the threshold
|
-- Find outliers by a percentage above the threshold
|
||||||
|
-- TODO: add symmetric option? `symmetric boolean DEFAULT false`
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION CDB_PercentOutlier(attr numeric[], outlier_fraction numeric, ids int[])
|
CREATE OR REPLACE FUNCTION CDB_PercentOutlier(attr numeric[], outlier_fraction numeric, ids int[])
|
||||||
RETURNS TABLE(outlier boolean, rowid int)
|
RETURNS TABLE(outlier boolean, rowid int)
|
||||||
@ -21,9 +22,14 @@ DECLARE
|
|||||||
out_vals boolean[];
|
out_vals boolean[];
|
||||||
BEGIN
|
BEGIN
|
||||||
|
|
||||||
SELECT avg(i) INTO avg_val FROM unnest(attr) As x(i);
|
SELECT avg(i) INTO avg_val
|
||||||
|
FROM unnest(attr) As x(i);
|
||||||
|
|
||||||
SELECT array_agg( i > avg_val * outlier_fraction) INTO out_vals
|
SELECT array_agg(
|
||||||
|
CASE WHEN avg_val = 0 THEN null
|
||||||
|
ELSE outlier_fraction > i / avg_val
|
||||||
|
END
|
||||||
|
) INTO out_vals
|
||||||
FROM unnest(attr) As x(i);
|
FROM unnest(attr) As x(i);
|
||||||
|
|
||||||
RETURN QUERY
|
RETURN QUERY
|
||||||
|
16
src/pg/test/expected/18_outliers.out
Normal file
16
src/pg/test/expected/18_outliers.out
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
outlier|rowid
|
||||||
|
t|11
|
||||||
|
t|16
|
||||||
|
t|17
|
||||||
|
outlier|rowid
|
||||||
|
t|16
|
||||||
|
t|17
|
||||||
|
outlier|rowid
|
||||||
|
t|8
|
||||||
|
t|11
|
||||||
|
t|16
|
||||||
|
outlier|rowid
|
||||||
|
t|8
|
||||||
|
t|9
|
||||||
|
t|11
|
||||||
|
t|15
|
70
src/pg/test/sql/18_outliers.sql
Normal file
70
src/pg/test/sql/18_outliers.sql
Normal file
@ -0,0 +1,70 @@
|
|||||||
|
SET client_min_messages TO WARNING;
|
||||||
|
\set ECHO none
|
||||||
|
\pset format unaligned
|
||||||
|
|
||||||
|
--
|
||||||
|
-- postgres=# select round(avg(i), 3) as avg,
|
||||||
|
-- round(stddev(i), 3) as stddev,
|
||||||
|
-- round(avg(i) + stddev(i), 3) as one_stddev,
|
||||||
|
-- round(avg(i) + 2 * stddev(i), 3) As two_stddev
|
||||||
|
-- from unnest(ARRAY[1,3,2,3,5,1,2,32,12,3,57,2,1,4,2,100]) As x(i);
|
||||||
|
-- avg | stddev | one_stddev | two_stddev
|
||||||
|
-- --------+--------+------------+------------
|
||||||
|
-- 14.375 | 27.322 | 41.697 | 69.020
|
||||||
|
|
||||||
|
|
||||||
|
-- With an threshold of 1.0 standard deviation, ids 11, 16, and 17 are outliers
|
||||||
|
WITH a AS (
|
||||||
|
SELECT
|
||||||
|
ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals, ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids
|
||||||
|
), b As (
|
||||||
|
SELECT
|
||||||
|
(cdb_StdDevOutlier(vals, 1.0, ids)).*
|
||||||
|
FROM a
|
||||||
|
ORDER BY ids)
|
||||||
|
SELECT *
|
||||||
|
FROM b
|
||||||
|
WHERE outlier IS TRUE;
|
||||||
|
|
||||||
|
-- With a threshold of 2.0 standard deviations, id 16 is the only outlier
|
||||||
|
WITH a AS (
|
||||||
|
SELECT
|
||||||
|
ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals,
|
||||||
|
ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids
|
||||||
|
), b As (
|
||||||
|
SELECT
|
||||||
|
(CDB_StdDevOutlier(vals, 2.0, ids)).*
|
||||||
|
FROM a
|
||||||
|
ORDER BY ids)
|
||||||
|
SELECT *
|
||||||
|
FROM b
|
||||||
|
WHERE outlier IS TRUE;
|
||||||
|
|
||||||
|
-- With a ratio threshold of 2.0 threshold (100% above or below the mean)
|
||||||
|
-- which is greater than ~21, which are values
|
||||||
|
WITH a AS (
|
||||||
|
SELECT
|
||||||
|
ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals,
|
||||||
|
ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids
|
||||||
|
), b As (
|
||||||
|
SELECT
|
||||||
|
(CDB_PercentOutlier(vals, 2.0, ids)).*
|
||||||
|
FROM a
|
||||||
|
ORDER BY ids)
|
||||||
|
SELECT *
|
||||||
|
FROM b
|
||||||
|
WHERE outlier IS TRUE;
|
||||||
|
|
||||||
|
-- With a static threshold of 11, what are the outliers
|
||||||
|
WITH a AS (
|
||||||
|
SELECT
|
||||||
|
ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals,
|
||||||
|
ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids
|
||||||
|
), b As (
|
||||||
|
SELECT unnest(vals) As v, unnest(ids) as i
|
||||||
|
FROM a
|
||||||
|
)
|
||||||
|
SELECT CDB_StaticOutlier(v, 11.0), i
|
||||||
|
FROM b
|
||||||
|
WHERE CDB_StaticOutlier(v, 11.0) is True
|
||||||
|
ORDER BY i;
|
Loading…
Reference in New Issue
Block a user