# Patch summary: make the segmentation model's hyperparameters configurable
# from SQL and report a model-quality value alongside each prediction.
#
# src/pg/sql/05_segmentation.sql
#   * CDB_CreateAndPredictSegment gains five optional parameters, each with a
#     DEFAULT, so existing three-argument calls still resolve:
#       n_estimators (1200), max_depth (3), subsample (0.5),
#       learning_rate (0.01), min_samples_leaf (1).
#   * The returned table gains a third column, `accuracy Numeric`.
#   * The PL/Python body packs the five values into a `model_params` dict and
#     passes it as the fourth argument to create_and_predict_segment.
#
# src/py/crankshaft/crankshaft/segmentation/segmentation.py
#   * The imported estimator changes from GradientBoostingClassifier to
#     GradientBoostingRegressor.
#   * create_and_predict_segment takes a new required `model_params` argument,
#     forwards it to train_model (test split passed positionally as 0.2), and
#     zips a third sequence into its result: np.full(result.shape, accuracy),
#     i.e. the same accuracy value repeated for every predicted row.
#   * train_model takes `model_params` and builds the estimator with
#     GradientBoostingRegressor(**model_params), replacing the hard-coded
#     n_estimators=200 / max_features=features.shape[1].
#   * calculate_model_accuracy no longer divides by np.std(target); it returns
#     the plain metrics.mean_squared_error value.
#   * get_data only gains a blank line before its return statement.
#
# NOTE(review): `model_params` has no default in create_and_predict_segment or
#   train_model, so any caller not shown in this patch must be updated — verify.
# NOTE(review): train_model's return statement is outside the hunks; presumably
#   `accuracy` comes from calculate_model_accuracy. If so, the new `accuracy`
#   column holds a mean squared error (lower is better) — confirm the naming.
# NOTE(review): in this copy the line breaks inside the hunks appear to have
#   been collapsed to spaces, so the patch will not apply as-is.
diff --git a/src/pg/sql/05_segmentation.sql b/src/pg/sql/05_segmentation.sql index 7def90d..3979ca4 100644 --- a/src/pg/sql/05_segmentation.sql +++ b/src/pg/sql/05_segmentation.sql @@ -2,11 +2,18 @@ CREATE OR REPLACE FUNCTION CDB_CreateAndPredictSegment ( query TEXT, variable_name TEXT, - target_table TEXT + target_table TEXT, + n_estimators INTEGER DEFAULT 1200, + max_depth INTEGER DEFAULT 3, + subsample DOUBLE PRECISION DEFAULT 0.5, + learning_rate DOUBLE PRECISION DEFAULT 0.01, + min_samples_leaf INTEGER DEFAULT 1 + ) -RETURNS TABLE (cartodb_id text, prediction Numeric ) +RETURNS TABLE (cartodb_id text, prediction Numeric,accuracy Numeric ) AS $$ from crankshaft.segmentation import create_and_predict_segment - # TODO: use named parameters or a dictionary - return create_and_predict_segment(query,variable_name,target_table) + model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf} + return create_and_predict_segment(query,variable_name,target_table, model_params) $$ LANGUAGE plpythonu; + diff --git a/src/py/crankshaft/crankshaft/segmentation/segmentation.py b/src/py/crankshaft/crankshaft/segmentation/segmentation.py index 3881733..d3b327b 100644 --- a/src/py/crankshaft/crankshaft/segmentation/segmentation.py +++ b/src/py/crankshaft/crankshaft/segmentation/segmentation.py @@ -5,7 +5,7 @@ Segmentation creation and prediction import sklearn import numpy as np import plpy -from sklearn.ensemble import GradientBoostingClassifier +from sklearn.ensemble import GradientBoostingRegressor from sklearn import metrics from sklearn.cross_validation import train_test_split @@ -26,9 +26,10 @@ def get_data(variable, feature_columns, query): )) target = np.array(data[0]['target']) features = np.column_stack([np.array(data[0][col], dtype=float) for col in feature_columns]) + return replace_nan_with_mean(target), replace_nan_with_mean(features) -def 
create_and_predict_segment(query,variable,target_query): +def create_and_predict_segment(query,variable,target_query,model_params): """ generate a segment with machine learning Stuart Lynn @@ -38,14 +39,14 @@ def create_and_predict_segment(query,variable,target_query): feature_columns = set(columns) - set([variable, 'the_geom', 'the_geom_webmercator']) target,features = get_data(variable, feature_columns, query) - model, accuracy = train_model(target,features, test_split=0.2) + model, accuracy = train_model(target,features, model_params, 0.2) cartodb_ids, result = predict_segment(model,feature_columns,target_query) - return zip(cartodb_ids, result) + return zip(cartodb_ids, result, np.full(result.shape, accuracy )) -def train_model(target,features,test_split): +def train_model(target,features,model_params,test_split): features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=test_split) - model = GradientBoostingClassifier(n_estimators = 200, max_features=features.shape[1]) + model = GradientBoostingRegressor(**model_params) plpy.notice('training the model: fitting to data') model.fit(features_train, target_train) plpy.notice('model trained') @@ -54,7 +55,7 @@ def train_model(target,features,test_split): def calculate_model_accuracy(model,features,target): prediction = model.predict(features) - return metrics.mean_squared_error(prediction,target)/np.std(target) + return metrics.mean_squared_error(prediction,target) def predict_segment(model,features,target_query): """