Update to support passing model parameters and returning accuracy from the function along with the prediction
This commit is contained in:
parent
1d13b98d68
commit
4df8257377
@ -2,11 +2,18 @@ CREATE OR REPLACE FUNCTION
|
|||||||
CDB_CreateAndPredictSegment (
|
CDB_CreateAndPredictSegment (
|
||||||
query TEXT,
|
query TEXT,
|
||||||
variable_name TEXT,
|
variable_name TEXT,
|
||||||
target_table TEXT
|
target_table TEXT,
|
||||||
|
n_estimators INTEGER DEFAULT 1200,
|
||||||
|
max_depth INTEGER DEFAULT 3,
|
||||||
|
subsample DOUBLE PRECISION DEFAULT 0.5,
|
||||||
|
learning_rate DOUBLE PRECISION DEFAULT 0.01,
|
||||||
|
min_samples_leaf INTEGER DEFAULT 1
|
||||||
|
|
||||||
)
|
)
|
||||||
RETURNS TABLE (cartodb_id text, prediction Numeric )
|
RETURNS TABLE (cartodb_id text, prediction Numeric,accuracy Numeric )
|
||||||
AS $$
|
AS $$
|
||||||
from crankshaft.segmentation import create_and_predict_segment
|
from crankshaft.segmentation import create_and_predict_segment
|
||||||
# TODO: use named parameters or a dictionary
|
model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf}
|
||||||
return create_and_predict_segment(query,variable_name,target_table)
|
return create_and_predict_segment(query,variable_name,target_table, model_params)
|
||||||
$$ LANGUAGE plpythonu;
|
$$ LANGUAGE plpythonu;
|
||||||
|
|
||||||
|
@ -5,7 +5,7 @@ Segmentation creation and prediction
|
|||||||
import sklearn
|
import sklearn
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import plpy
|
import plpy
|
||||||
from sklearn.ensemble import GradientBoostingClassifier
|
from sklearn.ensemble import GradientBoostingRegressor
|
||||||
from sklearn import metrics
|
from sklearn import metrics
|
||||||
from sklearn.cross_validation import train_test_split
|
from sklearn.cross_validation import train_test_split
|
||||||
|
|
||||||
@ -26,9 +26,10 @@ def get_data(variable, feature_columns, query):
|
|||||||
))
|
))
|
||||||
target = np.array(data[0]['target'])
|
target = np.array(data[0]['target'])
|
||||||
features = np.column_stack([np.array(data[0][col], dtype=float) for col in feature_columns])
|
features = np.column_stack([np.array(data[0][col], dtype=float) for col in feature_columns])
|
||||||
|
|
||||||
return replace_nan_with_mean(target), replace_nan_with_mean(features)
|
return replace_nan_with_mean(target), replace_nan_with_mean(features)
|
||||||
|
|
||||||
def create_and_predict_segment(query,variable,target_query):
|
def create_and_predict_segment(query,variable,target_query,model_params):
|
||||||
"""
|
"""
|
||||||
generate a segment with machine learning
|
generate a segment with machine learning
|
||||||
Stuart Lynn
|
Stuart Lynn
|
||||||
@ -38,14 +39,14 @@ def create_and_predict_segment(query,variable,target_query):
|
|||||||
feature_columns = set(columns) - set([variable, 'the_geom', 'the_geom_webmercator'])
|
feature_columns = set(columns) - set([variable, 'the_geom', 'the_geom_webmercator'])
|
||||||
target,features = get_data(variable, feature_columns, query)
|
target,features = get_data(variable, feature_columns, query)
|
||||||
|
|
||||||
model, accuracy = train_model(target,features, test_split=0.2)
|
model, accuracy = train_model(target,features, model_params, 0.2)
|
||||||
cartodb_ids, result = predict_segment(model,feature_columns,target_query)
|
cartodb_ids, result = predict_segment(model,feature_columns,target_query)
|
||||||
return zip(cartodb_ids, result)
|
return zip(cartodb_ids, result, np.full(result.shape, accuracy ))
|
||||||
|
|
||||||
|
|
||||||
def train_model(target,features,test_split):
|
def train_model(target,features,model_params,test_split):
|
||||||
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=test_split)
|
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=test_split)
|
||||||
model = GradientBoostingClassifier(n_estimators = 200, max_features=features.shape[1])
|
model = GradientBoostingRegressor(**model_params)
|
||||||
plpy.notice('training the model: fitting to data')
|
plpy.notice('training the model: fitting to data')
|
||||||
model.fit(features_train, target_train)
|
model.fit(features_train, target_train)
|
||||||
plpy.notice('model trained')
|
plpy.notice('model trained')
|
||||||
@ -54,7 +55,7 @@ def train_model(target,features,test_split):
|
|||||||
|
|
||||||
def calculate_model_accuracy(model,features,target):
|
def calculate_model_accuracy(model,features,target):
|
||||||
prediction = model.predict(features)
|
prediction = model.predict(features)
|
||||||
return metrics.mean_squared_error(prediction,target)/np.std(target)
|
return metrics.mean_squared_error(prediction,target)
|
||||||
|
|
||||||
def predict_segment(model,features,target_query):
|
def predict_segment(model,features,target_query):
|
||||||
"""
|
"""
|
||||||
|
Loading…
Reference in New Issue
Block a user