updating to support passing model parameters and returning accuracy from the function along with the prediction

This commit is contained in:
Stuart Lynn 2016-06-22 15:56:47 +00:00
parent 1d13b98d68
commit 4df8257377
2 changed files with 19 additions and 11 deletions

View File

@ -2,11 +2,18 @@ CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment ( CDB_CreateAndPredictSegment (
query TEXT, query TEXT,
variable_name TEXT, variable_name TEXT,
target_table TEXT target_table TEXT,
n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3,
subsample DOUBLE PRECISION DEFAULT 0.5,
learning_rate DOUBLE PRECISION DEFAULT 0.01,
min_samples_leaf INTEGER DEFAULT 1
) )
RETURNS TABLE (cartodb_id text, prediction Numeric ) RETURNS TABLE (cartodb_id text, prediction Numeric,accuracy Numeric )
AS $$ AS $$
from crankshaft.segmentation import create_and_predict_segment from crankshaft.segmentation import create_and_predict_segment
# TODO: use named parameters or a dictionary model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf}
return create_and_predict_segment(query,variable_name,target_table) return create_and_predict_segment(query,variable_name,target_table, model_params)
$$ LANGUAGE plpythonu; $$ LANGUAGE plpythonu;

View File

@ -5,7 +5,7 @@ Segmentation creation and prediction
import sklearn import sklearn
import numpy as np import numpy as np
import plpy import plpy
from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics from sklearn import metrics
from sklearn.cross_validation import train_test_split from sklearn.cross_validation import train_test_split
@ -26,9 +26,10 @@ def get_data(variable, feature_columns, query):
)) ))
target = np.array(data[0]['target']) target = np.array(data[0]['target'])
features = np.column_stack([np.array(data[0][col], dtype=float) for col in feature_columns]) features = np.column_stack([np.array(data[0][col], dtype=float) for col in feature_columns])
return replace_nan_with_mean(target), replace_nan_with_mean(features) return replace_nan_with_mean(target), replace_nan_with_mean(features)
def create_and_predict_segment(query,variable,target_query): def create_and_predict_segment(query,variable,target_query,model_params):
""" """
generate a segment with machine learning generate a segment with machine learning
Stuart Lynn Stuart Lynn
@ -38,14 +39,14 @@ def create_and_predict_segment(query,variable,target_query):
feature_columns = set(columns) - set([variable, 'the_geom', 'the_geom_webmercator']) feature_columns = set(columns) - set([variable, 'the_geom', 'the_geom_webmercator'])
target,features = get_data(variable, feature_columns, query) target,features = get_data(variable, feature_columns, query)
model, accuracy = train_model(target,features, test_split=0.2) model, accuracy = train_model(target,features, model_params, 0.2)
cartodb_ids, result = predict_segment(model,feature_columns,target_query) cartodb_ids, result = predict_segment(model,feature_columns,target_query)
return zip(cartodb_ids, result) return zip(cartodb_ids, result, np.full(result.shape, accuracy ))
def train_model(target,features,test_split): def train_model(target,features,model_params,test_split):
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=test_split) features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=test_split)
model = GradientBoostingClassifier(n_estimators = 200, max_features=features.shape[1]) model = GradientBoostingRegressor(**model_params)
plpy.notice('training the model: fitting to data') plpy.notice('training the model: fitting to data')
model.fit(features_train, target_train) model.fit(features_train, target_train)
plpy.notice('model trained') plpy.notice('model trained')
@ -54,7 +55,7 @@ def train_model(target,features,test_split):
def calculate_model_accuracy(model,features,target): def calculate_model_accuracy(model,features,target):
prediction = model.predict(features) prediction = model.predict(features)
return metrics.mean_squared_error(prediction,target)/np.std(target) return metrics.mean_squared_error(prediction,target)
def predict_segment(model,features,target_query): def predict_segment(model,features,target_query):
""" """