Merge pull request #137 from CartoDB/adds-outlier-functions

Adds (nonspatial) outlier functions
This commit is contained in:
Mario de Frutos 2016-12-01 10:43:51 +01:00 committed by GitHub
commit b0dcd7f572
4 changed files with 346 additions and 0 deletions

163
doc/18_outliers.md Normal file
View File

@ -0,0 +1,163 @@
## Outlier Detection
This set of functions detects the presence of outliers. There are three functions for finding outliers from non-spatial data:
1. Static Outliers
1. Percentage Outliers
1. Standard Deviation Outliers
### CDB_StaticOutlier(column_value numeric, threshold numeric)
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| column_value | numeric | The column of values on which to apply the threshold |
| threshold | numeric | The static threshold which is used to indicate whether a `column_value` is an outlier or not |
### Returns
Returns a boolean (true/false) depending on whether a value is above or below (or equal to) the threshold
| Name | Type | Description |
|------|------|-------------|
| outlier | boolean | classification of whether a row is an outlier or not |
#### Example Usage
With a table `website_visits` and a column of the number of website visits in units of 10,000 visits:
```
| id | visits_10k |
|----|------------|
| 1 | 1 |
| 2 | 3 |
| 3 | 5 |
| 4 | 1 |
| 5 | 32 |
| 6 | 3 |
| 7 | 57 |
| 8 | 2 |
```
```sql
SELECT
id,
CDB_StaticOutlier(visits_10k, 11.0) As outlier,
visits_10k
FROM website_visits
```
```
| id | outlier | visits_10k |
|----|---------|------------|
| 1 | f | 1 |
| 2 | f | 3 |
| 3 | f | 5 |
| 4 | f | 1 |
| 5 | t | 32 |
| 6 | f | 3 |
| 7 | t | 57 |
| 8 | f | 2 |
```
### CDB_PercentOutlier(column_values numeric[], outlier_fraction numeric, ids int[])
`CDB_PercentOutlier` calculates whether or not a value falls above a given threshold based on a percentage above the mean value of the input values.
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| column_values | numeric[] | An array of the values to calculate the outlier classification on |
| outlier_fraction | numeric | The threshold above which a column value divided by the mean of all values is considered an outlier |
| ids | int[] | An array of the unique row ids of the input data (usually `cartodb_id`) |
### Returns
Returns a table of the outlier classification with the following columns
| Name | Type | Description |
|------|------|-------------|
| is_outlier | boolean | classification of whether a row is an outlier or not |
| rowid | int | original row id (e.g., input `cartodb_id`) of the row which has the outlier classification |
#### Example Usage
This example find outliers which are more than 100% larger than the average (that is, more than 2.0 times larger).
```sql
WITH cte As (
SELECT
unnest(Array[1,2,3,4,5,6,7,8]) As id,
unnest(Array[1,3,5,1,32,3,57,2]) As visits_10k
)
SELECT
(CDB_PercentOutlier(array_agg(visits_10k), 2.0, array_agg(id))).*
FROM cte;
```
Output
```
| outlier | rowid |
|---------+-------|
| f | 1 |
| f | 2 |
| f | 3 |
| f | 4 |
| t | 5 |
| f | 6 |
| t | 7 |
| f | 8 |
```
### CDB_StdDevOutlier(column_values numeric[], num_deviations numeric, ids int[], is_symmetric boolean DEFAULT true)
`CDB_StdDevOutlier` calculates whether or not a value falls above or below a given threshold based on the number of standard deviations from the mean.
#### Arguments
| Name | Type | Description |
|------|------|-------------|
| column_values | numeric[] | An array of the values to calculate the outlier classification on |
| num_deviations | numeric | The threshold in units of standard deviation |
| ids | int[] | An array of the unique row ids of the input data (usually `cartodb_id`) |
| is_symmetric (optional) | boolean | Consider outliers that are symmetric about the mean (default: true) |
### Returns
Returns a table of the outlier classification with the following columns
| Name | Type | Description |
|------|------|-------------|
| is_outlier | boolean | classification of whether a row is an outlier or not |
| rowid | int | original row id (e.g., input `cartodb_id`) of the row which has the outlier classification |
#### Example Usage
This example find outliers which are more than 100% larger than the average (that is, more than 2.0 times larger).
```sql
WITH cte As (
SELECT
unnest(Array[1,2,3,4,5,6,7,8]) As id,
unnest(Array[1,3,5,1,32,3,57,2]) As visits_10k
)
SELECT
(CDB_StdDevOutlier(array_agg(visits_10k), 2.0, array_agg(id))).*
FROM cte;
```
Output
```
| outlier | rowid |
|---------+-------|
| f | 1 |
| f | 2 |
| f | 3 |
| f | 4 |
| f | 5 |
| f | 6 |
| t | 7 |
| f | 8 |
```

View File

@ -0,0 +1,75 @@
-- Find outliers using a static threshold
--
CREATE OR REPLACE FUNCTION CDB_StaticOutlier(column_value numeric, threshold numeric)
RETURNS boolean
AS $$
BEGIN
RETURN column_value > threshold;
END;
$$ LANGUAGE plpgsql;
-- Find outliers by a percentage above the threshold
-- TODO: add symmetric option? `is_symmetric boolean DEFAULT false`
CREATE OR REPLACE FUNCTION CDB_PercentOutlier(column_values numeric[], outlier_fraction numeric, ids int[])
RETURNS TABLE(is_outlier boolean, rowid int)
AS $$
DECLARE
avg_val numeric;
out_vals boolean[];
BEGIN
SELECT avg(i) INTO avg_val
FROM unnest(column_values) As x(i);
IF avg_val = 0 THEN
RAISE EXCEPTION 'Mean value is zero. Try another outlier method.';
END IF;
SELECT array_agg(
outlier_fraction < i / avg_val) INTO out_vals
FROM unnest(column_values) As x(i);
RETURN QUERY
SELECT unnest(out_vals) As is_outlier,
unnest(ids) As rowid;
END;
$$ LANGUAGE plpgsql;
-- Find outliers above a given number of standard deviations from the mean
CREATE OR REPLACE FUNCTION CDB_StdDevOutlier(column_values numeric[], num_deviations numeric, ids int[], is_symmetric boolean DEFAULT true)
RETURNS TABLE(is_outlier boolean, rowid int)
AS $$
DECLARE
stddev_val numeric;
avg_val numeric;
out_vals boolean[];
BEGIN
SELECT stddev(i), avg(i) INTO stddev_val, avg_val
FROM unnest(column_values) As x(i);
IF stddev_val = 0 THEN
RAISE EXCEPTION 'Standard deviation of input data is zero';
END IF;
IF is_symmetric THEN
SELECT array_agg(
abs(i - avg_val) / stddev_val > num_deviations) INTO out_vals
FROM unnest(column_values) As x(i);
ELSE
SELECT array_agg(
(i - avg_val) / stddev_val > num_deviations) INTO out_vals
FROM unnest(column_values) As x(i);
END IF;
RETURN QUERY
SELECT unnest(out_vals) As is_outlier,
unnest(ids) As rowid;
END;
$$ LANGUAGE plpgsql;

View File

@ -0,0 +1,23 @@
SET client_min_messages TO WARNING;
\set ECHO none
is_outlier|rowid
t|11
t|16
t|17
(3 rows)
is_outlier|rowid
t|16
t|17
(2 rows)
ERROR: Standard deviation of input data is zero
is_outlier|rowid
t|8
t|11
t|16
(3 rows)
is_outlier|rowid
t|8
t|9
t|11
t|16
(4 rows)

View File

@ -0,0 +1,85 @@
SET client_min_messages TO WARNING;
\set ECHO none
\pset format unaligned
--
-- postgres=# select round(avg(i), 3) as avg,
-- round(stddev(i), 3) as stddev,
-- round(avg(i) + stddev(i), 3) as one_stddev,
-- round(avg(i) + 2 * stddev(i), 3) As two_stddev
-- from unnest(ARRAY[1,3,2,3,5,1,2,32,12,3,57,2,1,4,2,100]) As x(i);
-- avg | stddev | one_stddev | two_stddev
-- --------+--------+------------+------------
-- 14.375 | 27.322 | 41.697 | 69.020
-- With an threshold of 1.0 standard deviation, ids 11, 16, and 17 are outliers
WITH a AS (
SELECT
ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals, ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids
), b As (
SELECT
(cdb_crankshaft.cdb_StdDevOutlier(vals, 1.0, ids)).*
FROM a
ORDER BY ids)
SELECT *
FROM b
WHERE is_outlier IS TRUE;
-- With a threshold of 2.0 standard deviations, id 16 is the only outlier
WITH a AS (
SELECT
ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals,
ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids
), b As (
SELECT
(cdb_crankshaft.CDB_StdDevOutlier(vals, 2.0, ids)).*
FROM a
ORDER BY ids)
SELECT *
FROM b
WHERE is_outlier IS TRUE;
-- With a Stddev of zero, should throw back error
-- With a threshold of 2.0 standard deviations, id 16 is the only outlier
WITH a AS (
SELECT
ARRAY[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]::numeric[] As vals,
ARRAY[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]::int[] As ids
), b As (
SELECT
(cdb_crankshaft.CDB_StdDevOutlier(vals, 1.0, ids)).*
FROM a
ORDER BY ids)
SELECT *
FROM b
WHERE is_outlier IS TRUE;
-- With a ratio threshold of 2.0 threshold (100% above or below the mean)
-- which is greater than ~21, which are values
WITH a AS (
SELECT
ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals,
ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids
), b As (
SELECT
(cdb_crankshaft.CDB_PercentOutlier(vals, 2.0, ids)).*
FROM a
ORDER BY ids)
SELECT *
FROM b
WHERE is_outlier IS TRUE;
-- With a static threshold of 11, what are the outliers
WITH a AS (
SELECT
ARRAY[1,3,2,3,5,1,2,32,12, 3,57, 2, 1, 4, 2,100,-100]::numeric[] As vals,
ARRAY[1,2,3,4,5,6,7, 8, 9,10,11,12,13,14,15, 16, 17]::int[] As ids
), b As (
SELECT unnest(vals) As v, unnest(ids) as i
FROM a
)
SELECT cdb_crankshaft.CDB_StaticOutlier(v, 11.0) As is_outlier, i As rowid
FROM b
WHERE cdb_crankshaft.CDB_StaticOutlier(v, 11.0) is True
ORDER BY i;