diff --git a/src/pg/sql/05_segmentation.sql b/src/pg/sql/05_segmentation.sql index d2de727..dcef532 100644 --- a/src/pg/sql/05_segmentation.sql +++ b/src/pg/sql/05_segmentation.sql @@ -15,7 +15,8 @@ AS $$ import numpy as np import plpy - from crankshaft.segmentation import create_and_predict_segment_agg + from crankshaft.segmentation import Segmentation + seg = Segmentation() model_params = {'n_estimators': n_estimators, 'max_depth': max_depth, 'subsample': subsample, @@ -27,7 +28,7 @@ AS $$ a = np.array(data, dtype=float) return a.reshape(len(a)/dimension, dimension) - return create_and_predict_segment_agg(np.array(target, dtype=float), + return seg.create_and_predict_segment_agg(np.array(target, dtype=float), unpack2D(features), unpack2D(target_features), target_ids, @@ -65,7 +66,8 @@ CREATE OR REPLACE FUNCTION min_samples_leaf INTEGER DEFAULT 1) RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC) AS $$ - from crankshaft.segmentation import create_and_predict_segment + from crankshaft.segmentation import Segmentation + seg = Segmentation() model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf} - return create_and_predict_segment(query,variable_name,target_table, model_params) + return seg.create_and_predict_segment(query,variable_name,target_table, model_params) $$ LANGUAGE plpythonu; diff --git a/src/py/crankshaft/crankshaft/segmentation/__init__.py b/src/py/crankshaft/crankshaft/segmentation/__init__.py index b825e85..628c887 100644 --- a/src/py/crankshaft/crankshaft/segmentation/__init__.py +++ b/src/py/crankshaft/crankshaft/segmentation/__init__.py @@ -1 +1,2 @@ -from segmentation import * +"""Import all functions for segmentation""" +from segmentation import * diff --git a/src/py/crankshaft/crankshaft/segmentation/segmentation.py b/src/py/crankshaft/crankshaft/segmentation/segmentation.py index 9964b8d..105c2f0 100644 --- 
a/src/py/crankshaft/crankshaft/segmentation/segmentation.py +++ b/src/py/crankshaft/crankshaft/segmentation/segmentation.py @@ -68,10 +68,11 @@ class Segmentation(object): """ params = {"subquery": target_query, - "id_col": id_col} + "id_col": id_col, + "feature_columns": features} target, features, target_mean, \ - feature_means = self.clean_data(variable, feature_columns, query) + feature_means = self.clean_data(variable, feature_columns, query) model, accuracy = train_model(target, features, model_params, 0.2) result = self.predict_segment(model, feature_columns, target_query, @@ -82,7 +83,8 @@ class Segmentation(object): return zip(rowid, result, accuracy_array) - def predict_segment(self, model, feature_columns, target_query, feature_means): + def predict_segment(self, model, feature_columns, target_query, + feature_means): """ Use the provided model to predict the values for the new feature set Input: @@ -115,7 +117,6 @@ class Segmentation(object): # NOTE: we removed the cartodb_ids calculation in here return np.concatenate(results) - def clean_data(self, query, variable, feature_columns): """ Add docstring @@ -179,8 +180,8 @@ def train_model(target, features, model_params, test_split): testing the model / calculating the accuray """ features_train, features_test, \ - target_train, target_test = train_test_split(features, target, - test_size=test_split) + target_train, target_test = train_test_split(features, target, + test_size=test_split) model = GradientBoostingRegressor(**model_params) model.fit(features_train, target_train) accuracy = calculate_model_accuracy(model, features_test, target_test) diff --git a/src/py/crankshaft/test/test_segmentation.py b/src/py/crankshaft/test/test_segmentation.py index d02e8b1..b6fbb00 100644 --- a/src/py/crankshaft/test/test_segmentation.py +++ b/src/py/crankshaft/test/test_segmentation.py @@ -1,7 +1,7 @@ import unittest import numpy as np from helper import plpy, fixture_file -import crankshaft.segmentation as 
segmentation +from crankshaft.segmentation import Segmentation import json class SegmentationTest(unittest.TestCase): @@ -48,16 +48,23 @@ class SegmentationTest(unittest.TestCase): 'subsample' : 0.5, 'learning_rate': 0.01, 'min_samples_leaf': 1} + seg = Segmentation() + ''' + self, query, variable, feature_columns, + target_query, model_params, + id_col='cartodb_id' + ''' - result = segmentation.create_and_predict_segment( + result = seg.create_and_predict_segment( 'select * from training', 'target', + 'feature_columns', 'select * from test', - model_parameters) + model_parameters) prediction = [r[1] for r in result] - accuracy =np.sqrt(np.mean( np.square( np.array(prediction) - np.array(test_y)))) + accuracy = np.sqrt(np.mean( np.square( np.array(prediction) - np.array(test_y)))) self.assertEqual(len(result),len(test_data)) self.assertTrue( result[0][2] < 0.01)