adds inertia as an output column
This commit is contained in:
parent
5e0fbf0f6f
commit
001062f660
@ -1,5 +1,7 @@
|
||||
## K-Means Functions
|
||||
|
||||
k-means clustering is a popular technique for finding clusters in data by minimizing the intra-cluster 'distance' and maximizing the inter-cluster 'distance'. The distance is defined in the parameter space of the variables entered.
|
||||
|
||||
### CDB_KMeans(subquery text, no_clusters INTEGER)
|
||||
|
||||
This function attempts to find `no_clusters` clusters within the input data based on the geographic distribution. It returns a table with the id and the cluster classification of each input point whose `the_geom` is not null. Points with a null-valued `the_geom` are not considered in the analysis.
|
||||
@ -9,7 +11,7 @@ This function attempts to find `no_clusters` clusters within the input data base
|
||||
| Name | Type | Description |
|
||||
|------|------|-------------|
|
||||
| subquery | TEXT | SQL query that exposes the data to be analyzed (e.g., `SELECT * FROM interesting_table`). This query must have the geometry column name `the_geom` and id column name `cartodb_id` unless otherwise specified in the input arguments |
|
||||
| no\_clusters | INTEGER | The number of clusters to try and find |
|
||||
| no\_clusters | INTEGER | The number of clusters to find |
|
||||
|
||||
#### Returns
|
||||
|
||||
@ -27,10 +29,11 @@ A table with the following columns.
|
||||
SELECT
|
||||
customers.*,
|
||||
km.cluster_no
|
||||
FROM
|
||||
cdb_crankshaft.CDB_Kmeans('SELECT * from customers' , 6) As km,
|
||||
customers
|
||||
WHERE customers.cartodb_id = km.cartodb_id
|
||||
FROM
|
||||
cdb_crankshaft.CDB_KMeans('SELECT * from customers' , 6) As km,
|
||||
customers
|
||||
WHERE
|
||||
customers.cartodb_id = km.cartodb_id
|
||||
```
|
||||
|
||||
### CDB_WeightedMean(subquery text, weight_column text, category_column text)
|
||||
@ -58,13 +61,13 @@ A table with the following columns.
|
||||
|
||||
```sql
|
||||
SELECT
|
||||
ST_Transform(the_geom, 3857) As the_geom_webmercator,
|
||||
class
|
||||
ST_Transform(km.the_geom, 3857) As the_geom_webmercator,
|
||||
km.class
|
||||
FROM
|
||||
cdb_crankshaft.CDB_WeightedMean(
|
||||
'SELECT *, customer_value FROM customers',
|
||||
'customer_value',
|
||||
'cluster_no')
|
||||
cdb_crankshaft.CDB_WeightedMean(
|
||||
'SELECT *, customer_value FROM customers',
|
||||
'customer_value',
|
||||
'cluster_no') As km
|
||||
```
|
||||
|
||||
## CDB_KMeansNonspatial(subquery text, colnames text[], no_clusters int)
|
||||
@ -80,7 +83,7 @@ As a standard machine learning method, k-means clustering is an unsupervised lea
|
||||
| query | TEXT | SQL query to expose the data to be used in the analysis (e.g., `SELECT * FROM iris_data`). It should contain at least the columns specified in `colnames` and the `id_colname`. |
|
||||
| colnames | TEXT[] | Array of columns to be used in the analysis (e.g., `Array['petal_width', 'sepal_length', 'petal_length']`). |
|
||||
| no\_clusters | INTEGER | Number of clusters for the classification of the data |
|
||||
| id_col (optional) | TEXT | The id column (default: 'cartodb_id') for identifying rows |
|
||||
| id\_col (optional) | TEXT | The id column (default: 'cartodb_id') for identifying rows |
|
||||
| standardize (optional) | BOOLEAN | Setting this to true (default) standardizes the data to have a mean of zero and a standard deviation of 1 |
|
||||
|
||||
### Returns
|
||||
@ -92,8 +95,26 @@ A table with the following columns.
|
||||
| cluster_label | TEXT | Label of the cluster that a row belongs to, numbered from 0 to `no_clusters - 1`. |
|
||||
| cluster_center | JSON | Center of the cluster that a row belongs to. The keys of the JSON object are the `colnames`, with values that are the center of the respective cluster |
|
||||
| silhouettes | NUMERIC | [Silhouette score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html#sklearn.metrics.silhouette_score) of the cluster label |
|
||||
| inertia | NUMERIC | Sum of squared distances of samples to their closest cluster center |
|
||||
| rowid | BIGINT | id of the original row for associating back with the original data |
|
||||
|
||||
### Example Usage
|
||||
|
||||
```sql
|
||||
SELECT
|
||||
customers.*,
|
||||
km.cluster_label,
|
||||
km.cluster_center,
|
||||
km.silhouettes
|
||||
FROM
|
||||
cdb_crankshaft.CDB_KMeansNonspatial(
|
||||
'SELECT * FROM customers',
|
||||
Array['customer_value', 'avg_amt_spent', 'home_median_income'],
|
||||
7) As km,
|
||||
customers
|
||||
WHERE
|
||||
customers.cartodb_id = km.rowid
|
||||
```
|
||||
|
||||
### Resources
|
||||
|
||||
|
@ -34,7 +34,7 @@ class Kmeans:
|
||||
def nonspatial(self, subquery, colnames, no_clusters=5,
|
||||
standardize=True, id_col='cartodb_id'):
|
||||
"""
|
||||
Inputs:
|
||||
Arguments:
|
||||
query (string): A SQL query to retrieve the data required to do the
|
||||
k-means clustering analysis, like so:
|
||||
SELECT * FROM iris_flower_data
|
||||
@ -46,7 +46,7 @@ class Kmeans:
|
||||
no_clusters (int): number of clusters (greater than zero)
|
||||
id_col (string): name of the input id_column
|
||||
|
||||
Output:
|
||||
Returns:
|
||||
A list of tuples with the following columns:
|
||||
cluster labels: a label for the cluster that the row belongs to
|
||||
centers: center of the cluster that this row belongs to
|
||||
@ -57,19 +57,20 @@ class Kmeans:
|
||||
import json
|
||||
from sklearn import metrics
|
||||
|
||||
# TODO: need a random seed?
|
||||
params = {"colnames": colnames,
|
||||
"subquery": subquery,
|
||||
"id_col": id_col}
|
||||
params = {
|
||||
"colnames": colnames,
|
||||
"subquery": subquery,
|
||||
"id_col": id_col
|
||||
}
|
||||
|
||||
data = self.data_provider.get_nonspatial_kmeans(params)
|
||||
|
||||
# fill array with values for k-means clustering
|
||||
if standardize:
|
||||
cluster_columns = _scale_data(
|
||||
_extract_columns(data, len(colnames)))
|
||||
_extract_columns(data))
|
||||
else:
|
||||
cluster_columns = _extract_columns(data, len(colnames))
|
||||
cluster_columns = _extract_columns(data)
|
||||
|
||||
kmeans = KMeans(n_clusters=no_clusters,
|
||||
random_state=0).fit(cluster_columns)
|
||||
@ -84,18 +85,19 @@ class Kmeans:
|
||||
return zip(kmeans.labels_,
|
||||
centers,
|
||||
silhouettes,
|
||||
[kmeans.inertia_] * kmeans.labels_.shape[0],
|
||||
data[0]['rowid'])
|
||||
|
||||
|
||||
# -- Preprocessing steps
|
||||
|
||||
def _extract_columns(data, n_cols):
|
||||
def _extract_columns(data):
    """
    Extract the features from the query and pack them into a NumPy array.

    Arguments:
        data (list of dicts): result of the kmeans request. Only the first
            record is read; it holds one array per feature under the keys
            'arr_col1', 'arr_col2', ..., plus one extra (non-feature)
            entry carrying the row ids.

    Returns:
        A float NumPy array with one row per sample and one column per
        feature (the stacked per-feature arrays are transposed).
    """
    # number of columns minus rowid column; the columns are the keys of the
    # first record, so count those rather than the number of records
    n_cols = len(data[0]) - 1
    return np.array([data[0]['arr_col{0}'.format(i + 1)]
                     for i in range(n_cols)],
                    dtype=float).T
|
||||
|
Loading…
Reference in New Issue
Block a user