Merge pull request #137 from CartoDB/adds-outlier-functions
Adds (nonspatial) outlier functions
This commit is contained in:
commit
b0dcd7f572
163
doc/18_outliers.md
Normal file
163
doc/18_outliers.md
Normal file
@ -0,0 +1,163 @@
|
||||
## Outlier Detection
|
||||
|
||||
This set of functions detects the presence of outliers. There are three functions for finding outliers from non-spatial data:
|
||||
|
||||
1. Static Outliers
|
||||
1. Percentage Outliers
|
||||
1. Standard Deviation Outliers
|
||||
|
||||
### CDB_StaticOutlier(column_value numeric, threshold numeric)
|
||||
|
||||
#### Arguments
|
||||
|
||||
| Name | Type | Description |
|
||||
|------|------|-------------|
|
||||
| column_value | numeric | The column of values on which to apply the threshold |
|
||||
| threshold | numeric | The static threshold which is used to indicate whether a `column_value` is an outlier or not |
|
||||
|
||||
### Returns
|
||||
|
||||
Returns a boolean (true/false) depending on whether a value is above or below (or equal to) the threshold
|
||||
|
||||
| Name | Type | Description |
|
||||
|------|------|-------------|
|
||||
| outlier | boolean | classification of whether a row is an outlier or not |
|
||||
|
||||
#### Example Usage
|
||||
|
||||
With a table `website_visits` and a column of the number of website visits in units of 10,000 visits:
|
||||
|
||||
```
|
||||
| id | visits_10k |
|
||||
|----|------------|
|
||||
| 1 | 1 |
|
||||
| 2 | 3 |
|
||||
| 3 | 5 |
|
||||
| 4 | 1 |
|
||||
| 5 | 32 |
|
||||
| 6 | 3 |
|
||||
| 7 | 57 |
|
||||
| 8 | 2 |
|
||||
```
|
||||
|
||||
```sql
|
||||
SELECT
|
||||
id,
|
||||
CDB_StaticOutlier(visits_10k, 11.0) As outlier,
|
||||
visits_10k
|
||||
FROM website_visits
|
||||
```
|
||||
|
||||
```
|
||||
| id | outlier | visits_10k |
|
||||
|----|---------|------------|
|
||||
| 1 | f | 1 |
|
||||
| 2 | f | 3 |
|
||||
| 3 | f | 5 |
|
||||
| 4 | f | 1 |
|
||||
| 5 | t | 32 |
|
||||
| 6 | f | 3 |
|
||||
| 7 | t | 57 |
|
||||
| 8 | f | 2 |
|
||||
```
|
||||
|
||||
### CDB_PercentOutlier(column_values numeric[], outlier_fraction numeric, ids int[])
|
||||
|
||||
`CDB_PercentOutlier` calculates whether or not a value falls above a given threshold based on a percentage above the mean value of the input values.
|
||||
|
||||
#### Arguments
|
||||
|
||||
| Name | Type | Description |
|
||||
|------|------|-------------|
|
||||
| column_values | numeric[] | An array of the values to calculate the outlier classification on |
|
||||
| outlier_fraction | numeric | The threshold above which a column value divided by the mean of all values is considered an outlier |
|
||||
| ids | int[] | An array of the unique row ids of the input data (usually `cartodb_id`) |
|
||||
|
||||
### Returns
|
||||
|
||||
Returns a table of the outlier classification with the following columns
|
||||
|
||||
| Name | Type | Description |
|
||||
|------|------|-------------|
|
||||
| is_outlier | boolean | classification of whether a row is an outlier or not |
|
||||
| rowid | int | original row id (e.g., input `cartodb_id`) of the row which has the outlier classification |
|
||||
|
||||
#### Example Usage
|
||||
|
||||
This example find outliers which are more than 100% larger than the average (that is, more than 2.0 times larger).
|
||||
|
||||
```sql
|
||||
WITH cte As (
|
||||
SELECT
|
||||
unnest(Array[1,2,3,4,5,6,7,8]) As id,
|
||||
unnest(Array[1,3,5,1,32,3,57,2]) As visits_10k
|
||||
)
|
||||
SELECT
|
||||
(CDB_PercentOutlier(array_agg(visits_10k), 2.0, array_agg(id))).*
|
||||
FROM cte;
|
||||
```
|
||||
|
||||
Output
|
||||
```
|
||||
| outlier | rowid |
|
||||
|---------+-------|
|
||||
| f | 1 |
|
||||
| f | 2 |
|
||||
| f | 3 |
|
||||
| f | 4 |
|
||||
| t | 5 |
|
||||
| f | 6 |
|
||||
| t | 7 |
|
||||
| f | 8 |
|
||||
```
|
||||
|
||||
### CDB_StdDevOutlier(column_values numeric[], num_deviations numeric, ids int[], is_symmetric boolean DEFAULT true)
|
||||
|
||||
`CDB_StdDevOutlier` calculates whether or not a value falls above or below a given threshold based on the number of standard deviations from the mean.
|
||||
|
||||
#### Arguments
|
||||
|
||||
| Name | Type | Description |
|
||||
|------|------|-------------|
|
||||
| column_values | numeric[] | An array of the values to calculate the outlier classification on |
|
||||
| num_deviations | numeric | The threshold in units of standard deviation |
|
||||
| ids | int[] | An array of the unique row ids of the input data (usually `cartodb_id`) |
|
||||
| is_symmetric (optional) | boolean | Consider outliers that are symmetric about the mean (default: true) |
|
||||
|
||||
### Returns
|
||||
|
||||
Returns a table of the outlier classification with the following columns
|
||||
|
||||
| Name | Type | Description |
|
||||
|------|------|-------------|
|
||||
| is_outlier | boolean | classification of whether a row is an outlier or not |
|
||||
| rowid | int | original row id (e.g., input `cartodb_id`) of the row which has the outlier classification |
|
||||
|
||||
#### Example Usage
|
||||
|
||||
This example find outliers which are more than 100% larger than the average (that is, more than 2.0 times larger).
|
||||
|
||||
```sql
|
||||
WITH cte As (
|
||||
SELECT
|
||||
unnest(Array[1,2,3,4,5,6,7,8]) As id,
|
||||
unnest(Array[1,3,5,1,32,3,57,2]) As visits_10k
|
||||
)
|
||||
SELECT
|
||||
(CDB_StdDevOutlier(array_agg(visits_10k), 2.0, array_agg(id))).*
|
||||
FROM cte;
|
||||
```
|
||||
|
||||
Output
|
||||
```
|
||||
| outlier | rowid |
|
||||
|---------+-------|
|
||||
| f | 1 |
|
||||
| f | 2 |
|
||||
| f | 3 |
|
||||
| f | 4 |
|
||||
| f | 5 |
|
||||
| f | 6 |
|
||||
| t | 7 |
|
||||
| f | 8 |
|
||||
```
|
75
src/pg/sql/18_outliers.sql
Normal file
75
src/pg/sql/18_outliers.sql
Normal file
@ -0,0 +1,75 @@
|
||||
|
||||
-- Find outliers using a static threshold
|
||||
--
|
||||
CREATE OR REPLACE FUNCTION CDB_StaticOutlier(column_value numeric, threshold numeric)
|
||||
RETURNS boolean
|
||||
AS $$
|
||||
BEGIN
|
||||
|
||||
RETURN column_value > threshold;
|
||||
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- Find outliers by a percentage above the threshold
|
||||
-- TODO: add symmetric option? `is_symmetric boolean DEFAULT false`
|
||||
|
||||
CREATE OR REPLACE FUNCTION CDB_PercentOutlier(column_values numeric[], outlier_fraction numeric, ids int[])
|
||||
RETURNS TABLE(is_outlier boolean, rowid int)
|
||||
AS $$
|
||||
DECLARE
|
||||
avg_val numeric;
|
||||
out_vals boolean[];
|
||||
BEGIN
|
||||
|
||||
SELECT avg(i) INTO avg_val
|
||||
FROM unnest(column_values) As x(i);
|
||||
|
||||
IF avg_val = 0 THEN
|
||||
RAISE EXCEPTION 'Mean value is zero. Try another outlier method.';
|
||||
END IF;
|
||||
|
||||
SELECT array_agg(
|
||||
outlier_fraction < i / avg_val) INTO out_vals
|
||||
FROM unnest(column_values) As x(i);
|
||||
|
||||
RETURN QUERY
|
||||
SELECT unnest(out_vals) As is_outlier,
|
||||
unnest(ids) As rowid;
|
||||
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- Find outliers above a given number of standard deviations from the mean
|
||||
|
||||
CREATE OR REPLACE FUNCTION CDB_StdDevOutlier(column_values numeric[], num_deviations numeric, ids int[], is_symmetric boolean DEFAULT true)
|
||||
RETURNS TABLE(is_outlier boolean, rowid int)
|
||||
AS $$
|
||||
DECLARE
|
||||
stddev_val numeric;
|
||||
avg_val numeric;
|
||||
out_vals boolean[];
|
||||
BEGIN
|
||||
|
||||
SELECT stddev(i), avg(i) INTO stddev_val, avg_val
|
||||
FROM unnest(column_values) As x(i);
|
||||
|
||||
IF stddev_val = 0 THEN
|
||||
RAISE EXCEPTION 'Standard deviation of input data is zero';
|
||||
END IF;
|
||||
|
||||
IF is_symmetric THEN
|
||||
SELECT array_agg(
|
||||
abs(i - avg_val) / stddev_val > num_deviations) INTO out_vals
|
||||
FROM unnest(column_values) As x(i);
|
||||
ELSE
|
||||
SELECT array_agg(
|
||||
(i - avg_val) / stddev_val > num_deviations) INTO out_vals
|
||||
FROM unnest(column_values) As x(i);
|
||||
END IF;
|
||||
|
||||
RETURN QUERY
|
||||
SELECT unnest(out_vals) As is_outlier,
|
||||
unnest(ids) As rowid;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
23
src/pg/test/expected/18_outliers_test.out
Normal file
23
src/pg/test/expected/18_outliers_test.out
Normal file
@ -0,0 +1,23 @@
|
||||
SET client_min_messages TO WARNING;
|
||||
\set ECHO none
|
||||
is_outlier|rowid
|
||||
t|11
|
||||
t|16
|
||||
t|17
|
||||
(3 rows)
|
||||
is_outlier|rowid
|
||||
t|16
|
||||
t|17
|
||||
(2 rows)
|
||||
ERROR: Standard deviation of input data is zero
|
||||
is_outlier|rowid
|
||||
t|8
|
||||
t|11
|
||||
t|16
|
||||
(3 rows)
|
||||
is_outlier|rowid
|
||||
t|8
|
||||
t|9
|
||||
t|11
|
||||
t|16
|
||||
(4 rows)
|
85
src/pg/test/sql/18_outliers_test.sql
Normal file
85
src/pg/test/sql/18_outliers_test.sql
Normal file
@ -0,0 +1,85 @@
|
||||
SET client_min_messages TO WARNING;
|
||||
\set ECHO none
|
||||
\pset format unaligned
|
||||
|
||||
--
|
||||
-- postgres=# select round(avg(i), 3) as avg,
|
||||
-- round(stddev(i), 3) as stddev,
|
||||
-- round(avg(i) + stddev(i), 3) as one_stddev,
|
||||
-- round(avg(i) + 2 * stddev(i), 3) As two_stddev
|
||||
-- from unnest(ARRAY[1,3,2,3,5,1,2,32,12,3,57,2,1,4,2,100]) As x(i);
|
||||
-- avg | stddev | one_stddev | two_stddev
|
||||
-- --------+--------+------------+------------
|
||||
-- 14.375 | 27.322 | 41.697 | 69.020
|
||||
|
||||
|
||||
-- With an threshold of 1.0 standard deviation, ids 11, 16, and 17 are outliers
|
||||
WITH a AS (
|
||||
SELECT
|
||||
ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals, ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids
|
||||
), b As (
|
||||
SELECT
|
||||
(cdb_crankshaft.cdb_StdDevOutlier(vals, 1.0, ids)).*
|
||||
FROM a
|
||||
ORDER BY ids)
|
||||
SELECT *
|
||||
FROM b
|
||||
WHERE is_outlier IS TRUE;
|
||||
|
||||
-- With a threshold of 2.0 standard deviations, id 16 is the only outlier
|
||||
WITH a AS (
|
||||
SELECT
|
||||
ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals,
|
||||
ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids
|
||||
), b As (
|
||||
SELECT
|
||||
(cdb_crankshaft.CDB_StdDevOutlier(vals, 2.0, ids)).*
|
||||
FROM a
|
||||
ORDER BY ids)
|
||||
SELECT *
|
||||
FROM b
|
||||
WHERE is_outlier IS TRUE;
|
||||
|
||||
-- With a Stddev of zero, should throw back error
|
||||
-- With a threshold of 2.0 standard deviations, id 16 is the only outlier
|
||||
WITH a AS (
|
||||
SELECT
|
||||
ARRAY[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]::numeric[] As vals,
|
||||
ARRAY[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]::int[] As ids
|
||||
), b As (
|
||||
SELECT
|
||||
(cdb_crankshaft.CDB_StdDevOutlier(vals, 1.0, ids)).*
|
||||
FROM a
|
||||
ORDER BY ids)
|
||||
SELECT *
|
||||
FROM b
|
||||
WHERE is_outlier IS TRUE;
|
||||
|
||||
-- With a ratio threshold of 2.0 threshold (100% above or below the mean)
|
||||
-- which is greater than ~21, which are values
|
||||
WITH a AS (
|
||||
SELECT
|
||||
ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals,
|
||||
ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids
|
||||
), b As (
|
||||
SELECT
|
||||
(cdb_crankshaft.CDB_PercentOutlier(vals, 2.0, ids)).*
|
||||
FROM a
|
||||
ORDER BY ids)
|
||||
SELECT *
|
||||
FROM b
|
||||
WHERE is_outlier IS TRUE;
|
||||
|
||||
-- With a static threshold of 11, what are the outliers
|
||||
WITH a AS (
|
||||
SELECT
|
||||
ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals,
|
||||
ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids
|
||||
), b As (
|
||||
SELECT unnest(vals) As v, unnest(ids) as i
|
||||
FROM a
|
||||
)
|
||||
SELECT cdb_crankshaft.CDB_StaticOutlier(v, 11.0) As is_outlier, i As rowid
|
||||
FROM b
|
||||
WHERE cdb_crankshaft.CDB_StaticOutlier(v, 11.0) is True
|
||||
ORDER BY i;
|
Loading…
Reference in New Issue
Block a user