From 4d2b37e6a33ef45231232720281bdf2faad259e4 Mon Sep 17 00:00:00 2001
From: mehak-sachdeva
Date: Wed, 7 Dec 2016 15:21:00 -0500
Subject: [PATCH 01/22] PEP-8 changes and model accuracy function parameter
 changes

---
 .../crankshaft/segmentation/segmentation.py | 127 ++++++++++++------
 1 file changed, 84 insertions(+), 43 deletions(-)

diff --git a/src/py/crankshaft/crankshaft/segmentation/segmentation.py b/src/py/crankshaft/crankshaft/segmentation/segmentation.py
index ed61139..23c0afa 100644
--- a/src/py/crankshaft/crankshaft/segmentation/segmentation.py
+++ b/src/py/crankshaft/crankshaft/segmentation/segmentation.py
@@ -10,7 +10,8 @@ from sklearn import metrics
 from sklearn.cross_validation import train_test_split
 
 # Lower level functions
-#----------------------
+# ---------------------
+
 
 def replace_nan_with_mean(array):
     """
@@ -28,6 +29,7 @@ def replace_nan_with_mean(array):
 
     return array
 
+
 def get_data(variable, feature_columns, query):
     """
     Fetch data from the database, clean, and package into
@@ -40,10 +42,15 @@ def get_data(variable, feature_columns, query):
         prepared data, packaged into NumPy arrays
     """
 
-    columns = ','.join(['array_agg("{col}") As "{col}"'.format(col=col) for col in feature_columns])
+    columns = ','.join(['array_agg("{col}") As "{col}"'.format(col=col)
+                        for col in feature_columns])
 
     try:
-        data = plpy.execute('''SELECT array_agg("{variable}") As target, {columns} FROM ({query}) As a'''.format(
+        data = plpy.execute('''
+            SELECT
+              array_agg("{variable}") As target,
+              {columns}
+            FROM ({query}) As a'''.format(
             variable=variable,
             columns=columns,
             query=query))
@@ -54,34 +61,43 @@ def get_data(variable, feature_columns, query):
     target = np.array(data[0]['target'])
 
     # put n feature data arrays into an n x m array of arrays
-    features = np.column_stack([np.array(data[0][col], dtype=float) for col in feature_columns])
+    features = np.column_stack([np.array(data[0][col], dtype=float)
+                                for col in feature_columns])
 
     return replace_nan_with_mean(target), replace_nan_with_mean(features)
 
 # High level interface
 # --------------------
 
-def create_and_predict_segment_agg(target, features, target_features, target_ids, model_parameters):
+
+def create_and_predict_segment_agg(target, features, target_features,
+                                   target_ids, model_parameters):
     """
-    Version of create_and_predict_segment that works on arrays that come stright form the SQL calling
-    the function.
+    Version of create_and_predict_segment that works on arrays that come
+    straight form the SQL calling the function.
 
    Input:
-    @param target: The 1D array of lenth NSamples containing the target variable we want the model to predict
-    @param features: Thw 2D array of size NSamples * NFeatures that form the imput to the model
-    @param target_ids: A 1D array of target_ids that will be used to associate the results of the prediction with the rows which they come from
-    @param model_parameters: A dictionary containing parameters for the model.
+    @param target: The 1D array of lenth NSamples containing the target
+        variable we want the model to predict
+    @param features: The 2D array of size NSamples * NFeatures that
+        form the imput to the model
+    @param target_ids: A 1D array of target_ids that will be used to
+        associate the results of the prediction with the rows which
+        they come from
+    @param model_parameters: A dictionary containing parameters for the
+        model.
""" clean_target = replace_nan_with_mean(target) clean_features = replace_nan_with_mean(features) target_features = replace_nan_with_mean(target_features) - model, accuracy = train_model(clean_target, clean_features, model_parameters, 0.2) + model, accuracy = train_model(clean_target, clean_features, + model_parameters, 0.2) prediction = model.predict(target_features) accuracy_array = [accuracy]*prediction.shape[0] - return zip(target_ids, prediction, np.full(prediction.shape, accuracy_array)) - + return zip(target_ids, prediction, + np.full(prediction.shape, accuracy_array)) def create_and_predict_segment(query, variable, target_query, model_params): @@ -90,15 +106,19 @@ def create_and_predict_segment(query, variable, target_query, model_params): Stuart Lynn """ - ## fetch column names + # fetch column names try: - columns = plpy.execute('SELECT * FROM ({query}) As a LIMIT 1 '.format(query=query))[0].keys() + columns = plpy.execute(''' + SELECT * + FROM ({query}) As a + LIMIT 1'''.format(query=query))[0].keys() except Exception, e: plpy.error('Failed to build segmentation model: %s' % e) - ## extract column names to be used in building the segmentation model - feature_columns = set(columns) - set([variable, 'cartodb_id', 'the_geom', 'the_geom_webmercator']) - ## get data from database + # extract column names to be used in building the segmentation model + feature_columns = set(columns) - set([variable, 'cartodb_id', + 'the_geom', 'the_geom_webmercator']) + # get data from database target, features = get_data(variable, feature_columns, query) model, accuracy = train_model(target, features, model_params, 0.2) @@ -109,49 +129,65 @@ def create_and_predict_segment(query, variable, target_query, model_params): def train_model(target, features, model_params, test_split): """ - Train the Gradient Boosting model on the provided data and calculate the accuracy of the model + Train the Gradient Boosting model on the provided data and calculate + the accuracy of the model Input: - @param target: 1D Array of the variable that the model is to be trianed to predict - @param features: 2D Array NSamples * NFeatures to use in trining the model - @param model_params: A dictionary of model parameters, the full specification can be found on the - scikit learn page for [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) - @parma test_split: The fraction of the data to be withheld for testing the model / calculating the accuray + @param target: 1D Array of the variable that the model is to be + trained to predict + @param features: 2D Array NSamples * NFeatures to use in trining + the model + @param model_params: A dictionary of model parameters, the full + specification can be found on the + scikit learn page for [GradientBoostingRegressor] + (http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) + @parma test_split: The fraction of the data to be withheld for + testing the model / calculating the accuray """ - features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=test_split) + features_train, features_test, target_train, target_test = + train_test_split(features, target, test_size=test_split) model = GradientBoostingRegressor(**model_params) model.fit(features_train, target_train) - accuracy = calculate_model_accuracy(model, features, target) + accuracy = calculate_model_accuracy(model, features_test, target_test) return model, accuracy -def 
calculate_model_accuracy(model, features, target): + +def calculate_model_accuracy(model, features_test, target_test): """ Calculate the mean squared error of the model prediction Input: @param model: model trained from input features - @param features: features to make a prediction from - @param target: target to compare prediction to + @param features_test: test features set to make a prediction from + @param target_target: test target set to compare predictions to Output: - mean squared error of the model prection compared to the target + mean squared error of the model prection compared to target_test """ - prediction = model.predict(features) - return metrics.mean_squared_error(prediction, target) + prediction = model.predict(features_test) + return metrics.mean_squared_error(prediction, target_test) + -def predict_segment(model, features, target_query): +def predict_segment(model, features_col, target_query): """ Use the provided model to predict the values for the new feature set Input: @param model: The pretrained model - @features: A list of features to use in the model prediction (list of column names) - @target_query: The query to run to obtain the data to predict on and the cartdb_ids associated with it. + @features_col: A list of features to use in the + model prediction (list of column names) + @target_query: The query to run to obtain the data to predict + on and the cartdb_ids associated with it. """ batch_size = 1000 - joined_features = ','.join(['"{0}"::numeric'.format(a) for a in features]) + joined_features = ','.join(['"{0}"::numeric'.format(a) + for a in features_col]) try: - cursor = plpy.cursor('SELECT Array[{joined_features}] As features FROM ({target_query}) As a'.format( - joined_features=joined_features, - target_query=target_query)) + cursor = plpy.cursor(''' + SELECT Array[{joined_features}] As features + FROM ({target_query}) As a''' + .format( + joined_features=joined_features, + target_query=target_query) + ) except Exception, e: plpy.error('Failed to build segmentation model: %s' % e) @@ -161,15 +197,20 @@ def predict_segment(model, features, target_query): rows = cursor.fetch(batch_size) if not rows: break - batch = np.row_stack([np.array(row['features'], dtype=float) for row in rows]) + batch = np.row_stack([np.array(row['features'], dtype=float) + for row in rows]) - #Need to fix this. Should be global mean. This will cause weird effects + # Need to fix this to global mean. 
This will cause weird effects batch = replace_nan_with_mean(batch) prediction = model.predict(batch) results.append(prediction) try: - cartodb_ids = plpy.execute('''SELECT array_agg(cartodb_id ORDER BY cartodb_id) As cartodb_ids FROM ({0}) As a'''.format(target_query))[0]['cartodb_ids'] + cartodb_ids = plpy.execute(''' + SELECT array_agg(cartodb_id + ORDER BY cartodb_id) As cartodb_ids + FROM ({0}) As a''' + .format(target_query))[0]['cartodb_ids'] except Exception, e: plpy.error('Failed to build segmentation model: %s' % e) From e5f1f92ce13626b6d8cb01fef57aacc685620901 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Tue, 13 Dec 2016 17:47:37 -0500 Subject: [PATCH 02/22] small edits for @mehak-sachdeva --- .../crankshaft/segmentation/segmentation.py | 55 +++++++++++-------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/src/py/crankshaft/crankshaft/segmentation/segmentation.py b/src/py/crankshaft/crankshaft/segmentation/segmentation.py index 23c0afa..91bf41b 100644 --- a/src/py/crankshaft/crankshaft/segmentation/segmentation.py +++ b/src/py/crankshaft/crankshaft/segmentation/segmentation.py @@ -12,14 +12,16 @@ from sklearn.cross_validation import train_test_split # Lower level functions # --------------------- - -def replace_nan_with_mean(array): +# NOTE: added optional param here +def replace_nan_with_mean(array, avgs=None): """ Input: @param array: an array of floats which may have null-valued entries Output: array with nans filled in with the mean of the dataset """ + # TODO: update code to take in avgs parameter + # returns an array of rows and column indices indices = np.where(np.isnan(array)) @@ -122,8 +124,17 @@ def create_and_predict_segment(query, variable, target_query, model_params): target, features = get_data(variable, feature_columns, query) model, accuracy = train_model(target, features, model_params, 0.2) - cartodb_ids, result = predict_segment(model, feature_columns, target_query) - accuracy_array = [accuracy]*result.shape[0] + result = predict_segment(model, feature_columns, target_query) + accuracy_array = [accuracy] * result.shape[0] + + # cartodb_id plpy.execute code here instead of in predict_segment + try: + cartodb_ids = plpy.execute(''' + SELECT array_agg(cartodb_id ORDER BY cartodb_id) As cartodb_ids + FROM ({0}) As a'''.format(target_query))[0]['cartodb_ids'] + except Exception, err: + plpy.error('Failed to build segmentation model: %s' % err) + return zip(cartodb_ids, result, accuracy_array) @@ -183,14 +194,20 @@ def predict_segment(model, features_col, target_query): try: cursor = plpy.cursor(''' SELECT Array[{joined_features}] As features - FROM ({target_query}) As a''' - .format( - joined_features=joined_features, - target_query=target_query) - ) - except Exception, e: - plpy.error('Failed to build segmentation model: %s' % e) - + FROM ({target_query}) As a'''.format( + joined_features=joined_features, + target_query=target_query)) + except Exception, err: + plpy.error('Failed to build segmentation model: %s' % err) + + # TODO: is this a good solution for finding the averages? 
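[Editor's note, not part of the patch: the TODO above asks how to obtain global
feature averages for the new replace_nan_with_mean(array, avgs) parameter. The
commented-out query that follows computes them in SQL; for reference, a minimal
NumPy-side sketch of the same computation (illustrative names only, and it
assumes NumPy >= 1.8 for np.nanmean):

    import numpy as np

    def column_means(batch):
        # per-column means that skip NaN entries, i.e. the same numbers
        # avg("col") would return for each feature column in SQL
        return np.nanmean(batch, axis=0)

    batch = np.array([[1.0, np.nan, 2.0],
                      [3.0, 4.0, np.nan]])
    print(column_means(batch))  # [ 2.  4.  2.]

Passing precomputed means into replace_nan_with_mean keeps prediction batches
consistent with the training data instead of re-deriving means per batch.]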
+ # r = plpy.execute(''' + # SELECT {cols} + # FROM ({target_query}) As a + # '''.format(cols=', '.join(['avg({c}) As {c}'.format(c=c) + # for c in joined_features]), + # target_query=target_query)) + # avgs = [r[0][c] for c in joined_features] results = [] while True: @@ -198,20 +215,12 @@ def predict_segment(model, features_col, target_query): if not rows: break batch = np.row_stack([np.array(row['features'], dtype=float) - for row in rows]) + for row in rows]) # Need to fix this to global mean. This will cause weird effects batch = replace_nan_with_mean(batch) prediction = model.predict(batch) results.append(prediction) - try: - cartodb_ids = plpy.execute(''' - SELECT array_agg(cartodb_id - ORDER BY cartodb_id) As cartodb_ids - FROM ({0}) As a''' - .format(target_query))[0]['cartodb_ids'] - except Exception, e: - plpy.error('Failed to build segmentation model: %s' % e) - - return cartodb_ids, np.concatenate(results) + # NOTE: we removed the cartodb_ids calculation in here + return np.concatenate(results) From cee896727469655fb69a5fd9c82dbf0be7530943 Mon Sep 17 00:00:00 2001 From: mehak-sachdeva Date: Mon, 30 Jan 2017 17:14:20 -0500 Subject: [PATCH 03/22] refactoring segmentation function --- src/pg/sql/05_segmentation.sql | 18 ++ .../crankshaft/analysis_data_provider.py | 41 +++ .../crankshaft/segmentation/segmentation.py | 288 ++++++++---------- src/py/crankshaft/test/mock_plpy.py | 3 + 4 files changed, 193 insertions(+), 157 deletions(-) diff --git a/src/pg/sql/05_segmentation.sql b/src/pg/sql/05_segmentation.sql index 7dac003..d2de727 100644 --- a/src/pg/sql/05_segmentation.sql +++ b/src/pg/sql/05_segmentation.sql @@ -51,3 +51,21 @@ AS $$ model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf} return create_and_predict_segment(query,variable_name,target_table, model_params) $$ LANGUAGE plpythonu; + +CREATE OR REPLACE FUNCTION + CDB_CreateAndPredictSegment ( + query TEXT, + variable_name TEXT, + target_table TEXT, + feature_columns TEXT[], + n_estimators INTEGER DEFAULT 1200, + max_depth INTEGER DEFAULT 3, + subsample DOUBLE PRECISION DEFAULT 0.5, + learning_rate DOUBLE PRECISION DEFAULT 0.01, + min_samples_leaf INTEGER DEFAULT 1) +RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC) +AS $$ + from crankshaft.segmentation import create_and_predict_segment + model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf} + return create_and_predict_segment(query,variable_name,target_table, model_params) +$$ LANGUAGE plpythonu; diff --git a/src/py/crankshaft/crankshaft/analysis_data_provider.py b/src/py/crankshaft/crankshaft/analysis_data_provider.py index cbc27bc..b03d29b 100644 --- a/src/py/crankshaft/crankshaft/analysis_data_provider.py +++ b/src/py/crankshaft/crankshaft/analysis_data_provider.py @@ -65,3 +65,44 @@ class AnalysisDataProvider: return data except plpy.SPIError, err: plpy.error('Analysis failed: %s' % err) + + def get_model_data(self, params): + """fetch data for Segmentation""" + columns = ','.join(['array_agg("{col}") As "{col}"'.format(col=col) + for col in params['feature_columns']]) + + query = ("SELECT" + "array_agg({target}) As target," + "{columns} As feature", + "FROM ({subquery}) As q").format(params['query'], + ['variable']) + try: + data = plpy.execute(query) + return data + except plpy.SPIError, err: + plpy.error('Failed to build 
segmentation model: %s' % err) + + def get_segment_data(self, params): + """fetch cartodb_ids""" + query = ("SELECT" + "array_agg({id_col} ORDER BY {id_col}) as ids," + "FROM ({subquery}) as q").format(**params) + try: + data = plpy.execute(query) + return data + except plpy.SPIError, err: + plpy.error('Failed to build segmentation model: %s' % err) + + def get_predict_data(self, params): + """fetch data for Segmentation""" + + joined_features = ','.join(['"{0}"::numeric'.format(a) + for a in features_columns]) + query = ("SELECT" + "Array({joined_features}) As features," + "FROM ({subquery}) as q").format(**params) + try: + cursor = plpy.cursor(query) + return cursor + except plpy.SPIError, err: + plpy.error('Failed to build segmentation model: %s' % err) diff --git a/src/py/crankshaft/crankshaft/segmentation/segmentation.py b/src/py/crankshaft/crankshaft/segmentation/segmentation.py index 91bf41b..ff97aa7 100644 --- a/src/py/crankshaft/crankshaft/segmentation/segmentation.py +++ b/src/py/crankshaft/crankshaft/segmentation/segmentation.py @@ -8,158 +8,148 @@ import plpy from sklearn.ensemble import GradientBoostingRegressor from sklearn import metrics from sklearn.cross_validation import train_test_split +from crankshaft.analysis_data_provider import AnalysisDateProvider # Lower level functions # --------------------- # NOTE: added optional param here -def replace_nan_with_mean(array, avgs=None): - """ - Input: - @param array: an array of floats which may have null-valued entries - Output: - array with nans filled in with the mean of the dataset - """ - # TODO: update code to take in avgs parameter - # returns an array of rows and column indices - indices = np.where(np.isnan(array)) - # iterate through entries which have nan values - for row, col in zip(*indices): - array[row, col] = np.mean(array[~np.isnan(array[:, col]), col]) +class Segmentation: - return array + def __init__(self, data_provider=None): + if data_provider is None: + self.data_provider = AnalysisDataProvider() + else: + self.data_provider = data_provider + def clean_data(self, query, variable, feature_columns): + params = {"subquery": query, + "target": variable, + "features": feature_columns} -def get_data(variable, feature_columns, query): - """ - Fetch data from the database, clean, and package into - numpy arrays - Input: - @param variable: name of the target variable - @param feature_columns: list of column names - @param query: subquery that data is pulled from for the packaging - Output: - prepared data, packaged into NumPy arrays - """ - - columns = ','.join(['array_agg("{col}") As "{col}"'.format(col=col) - for col in feature_columns]) - - try: - data = plpy.execute(''' - SELECT - array_agg("{variable}") As target, - {columns} - FROM ({query}) As a'''.format( - variable=variable, - columns=columns, - query=query)) - except Exception, e: - plpy.error('Failed to access data to build segmentation model: %s' % e) + data = self.data_provider.get_model_data(params) - # extract target data from plpy object - target = np.array(data[0]['target']) + # extract target data from plpy object + target = np.array(data[0]['target']) - # put n feature data arrays into an n x m array of arrays - features = np.column_stack([np.array(data[0][col], dtype=float) - for col in feature_columns]) + # put n feature data arrays into an n x m array of arrays + features = np.column_stack([np.array(data[0][col], dtype=float) + for col in feature_columns]) - return replace_nan_with_mean(target), replace_nan_with_mean(features) - -# High level 
interface -# -------------------- + features, feature_means = replace_nan_with_mean(features) + target, target_mean = replace_nan_with_mean(target) + return target, features, target_mean, feature_means + def replace_nan_with_mean(array, means=None): + """ + Input: + @param array: an array of floats which may have null-valued + entries + Output: + array with nans filled in with the mean of the dataset + """ + # TODO: update code to take in avgs parameter -def create_and_predict_segment_agg(target, features, target_features, - target_ids, model_parameters): - """ - Version of create_and_predict_segment that works on arrays that come - straight form the SQL calling the function. + # returns an array of rows and column indices + indices = np.where(np.isnan(array)) - Input: - @param target: The 1D array of lenth NSamples containing the target - variable we want the model to predict - @param features: The 2D array of size NSamples * NFeatures that - form the imput to the model - @param target_ids: A 1D array of target_ids that will be used to - associate the results of the prediction with the rows which - they come from - @param model_parameters: A dictionary containing parameters for the - model. - """ + if not means: + for col in np.shape(array)[1]: + means[col] = np.mean(array[~np.isnan(array[:, col]), col]) - clean_target = replace_nan_with_mean(target) - clean_features = replace_nan_with_mean(features) - target_features = replace_nan_with_mean(target_features) + # iterate through entries which have nan values + for row, col in zip(*indices): + array[row, col] = means[col] - model, accuracy = train_model(clean_target, clean_features, - model_parameters, 0.2) - prediction = model.predict(target_features) - accuracy_array = [accuracy]*prediction.shape[0] - return zip(target_ids, prediction, - np.full(prediction.shape, accuracy_array)) + return array, means -def create_and_predict_segment(query, variable, target_query, model_params): - """ - generate a segment with machine learning - Stuart Lynn - """ +# High level interface +# -------------------- - # fetch column names - try: - columns = plpy.execute(''' - SELECT * - FROM ({query}) As a - LIMIT 1'''.format(query=query))[0].keys() - except Exception, e: - plpy.error('Failed to build segmentation model: %s' % e) - - # extract column names to be used in building the segmentation model - feature_columns = set(columns) - set([variable, 'cartodb_id', - 'the_geom', 'the_geom_webmercator']) - # get data from database - target, features = get_data(variable, feature_columns, query) - - model, accuracy = train_model(target, features, model_params, 0.2) - result = predict_segment(model, feature_columns, target_query) - accuracy_array = [accuracy] * result.shape[0] - - # cartodb_id plpy.execute code here instead of in predict_segment - try: - cartodb_ids = plpy.execute(''' - SELECT array_agg(cartodb_id ORDER BY cartodb_id) As cartodb_ids - FROM ({0}) As a'''.format(target_query))[0]['cartodb_ids'] - except Exception, err: - plpy.error('Failed to build segmentation model: %s' % err) - - return zip(cartodb_ids, result, accuracy_array) - - -def train_model(target, features, model_params, test_split): - """ - Train the Gradient Boosting model on the provided data and calculate - the accuracy of the model - Input: - @param target: 1D Array of the variable that the model is to be - trained to predict - @param features: 2D Array NSamples * NFeatures to use in trining - the model - @param model_params: A dictionary of model parameters, the full - specification 
can be found on the - scikit learn page for [GradientBoostingRegressor] - (http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) - @parma test_split: The fraction of the data to be withheld for - testing the model / calculating the accuray - """ - features_train, features_test, target_train, target_test = - train_test_split(features, target, test_size=test_split) - model = GradientBoostingRegressor(**model_params) - model.fit(features_train, target_train) - accuracy = calculate_model_accuracy(model, features_test, target_test) - return model, accuracy + def create_and_predict_segment_agg(target, features, target_features, + target_ids, model_parameters): + """ + Version of create_and_predict_segment that works on arrays that come + straight form the SQL calling the function. + + Input: + @param target: The 1D array of lenth NSamples containing the + target variable we want the model to predict + @param features: The 2D array of size NSamples * NFeatures that + form the imput to the model + @param target_ids: A 1D array of target_ids that will be used + to associate the results of the prediction with the rows which + they come from + @param model_parameters: A dictionary containing parameters for + the model. + """ + + clean_target = replace_nan_with_mean(target) + clean_features = replace_nan_with_mean(features) + target_features = replace_nan_with_mean(target_features) + + model, accuracy = train_model(clean_target, clean_features, + model_parameters, 0.2) + prediction = model.predict(target_features) + accuracy_array = [accuracy]*prediction.shape[0] + return zip(target_ids, prediction, + np.full(prediction.shape, accuracy_array)) + + def create_and_predict_segment(query, variable, feature_columns, + target_query, model_params): + """ + generate a segment with machine learning + Stuart Lynn + @param query: subquery that data is pulled from for packaging + @param variable: name of the target variable + @param feature_columns: list of column names + @target_query: The query to run to obtain the data to predict + @param model_params: A dictionary of model parameters, the full + specification can be found on the + scikit learn page for [GradientBoostingRegressor] + (http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) + """ + + params = {"subquery": target_query, + "id_col": "cartodb_id"} + + target, features, target_mean, + feature_means = clean_data(variable, feature_columns, query) + model, accuracy = train_model(target, features, model_params, 0.2) + result = predict_segment(model, feature_columns, target_query, + feature_means) + accuracy_array = [accuracy] * result.shape[0] + + cartodb_ids = self.data_provider.get_segment_data(params) + + return zip(cartodb_ids, result, accuracy_array) + + def train_model(target, features, model_params, test_split): + """ + Train the Gradient Boosting model on the provided data to calculate + the accuracy of the model + Input: + @param target: 1D Array of the variable that the model is to be + trained to predict + @param features: 2D Array NSamples *NFeatures to use in trining + the model + @param model_params: A dictionary of model parameters, the full + specification can be found on the + scikit learn page for [GradientBoostingRegressor] + (http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) + @parma test_split: The fraction of the data to be withheld for + testing the model / calculating the accuray + """ + 
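[Editor's note, not part of the patch: as committed in this revision, the
assignment that follows leaves "features_train, features_test," on its own line
with no continuation character, so the more-indented line after it raises an
IndentationError when the module is parsed; PATCH 06 below fixes this by adding
a trailing backslash. A sketch of an equivalent parenthesized form that needs
no backslash:

    (features_train, features_test,
     target_train, target_test) = train_test_split(
        features, target, test_size=test_split)
]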
features_train, features_test, + target_train, target_test = train_test_split(features, target, + test_size=test_split) + model = GradientBoostingRegressor(**model_params) + model.fit(features_train, target_train) + accuracy = calculate_model_accuracy(model, features_test, target_test) + return model, accuracy def calculate_model_accuracy(model, features_test, target_test): @@ -167,16 +157,16 @@ def calculate_model_accuracy(model, features_test, target_test): Calculate the mean squared error of the model prediction Input: @param model: model trained from input features - @param features_test: test features set to make a prediction from + @param features_test: test features set to make prediction from @param target_target: test target set to compare predictions to Output: - mean squared error of the model prection compared to target_test + mean squared error of the model prection compared target_test """ prediction = model.predict(features_test) return metrics.mean_squared_error(prediction, target_test) -def predict_segment(model, features_col, target_query): +def predict_segment(model, features_columns, target_query, feature_means): """ Use the provided model to predict the values for the new feature set Input: @@ -188,37 +178,21 @@ def predict_segment(model, features_col, target_query): """ batch_size = 1000 - joined_features = ','.join(['"{0}"::numeric'.format(a) - for a in features_col]) - - try: - cursor = plpy.cursor(''' - SELECT Array[{joined_features}] As features - FROM ({target_query}) As a'''.format( - joined_features=joined_features, - target_query=target_query)) - except Exception, err: - plpy.error('Failed to build segmentation model: %s' % err) - - # TODO: is this a good solution for finding the averages? - # r = plpy.execute(''' - # SELECT {cols} - # FROM ({target_query}) As a - # '''.format(cols=', '.join(['avg({c}) As {c}'.format(c=c) - # for c in joined_features]), - # target_query=target_query)) - # avgs = [r[0][c] for c in joined_features] - results = [] + params = {"subquery": target_query, + "feature": feature_columns} + results = [] + cursors = self.data_provider.get_predict_data(params) while True: - rows = cursor.fetch(batch_size) + rows = cursors.fetch(batch_size) if not rows: break batch = np.row_stack([np.array(row['features'], dtype=float) for row in rows]) # Need to fix this to global mean. 
This will cause weird effects - batch = replace_nan_with_mean(batch) + + batch = replace_nan_with_mean(batch, feature_means) prediction = model.predict(batch) results.append(prediction) diff --git a/src/py/crankshaft/test/mock_plpy.py b/src/py/crankshaft/test/mock_plpy.py index e8a279d..7bea700 100644 --- a/src/py/crankshaft/test/mock_plpy.py +++ b/src/py/crankshaft/test/mock_plpy.py @@ -42,6 +42,9 @@ class MockPlPy: def info(self, msg): self.infos.append(msg) + def error(self, msg): + self.infos.append(msg) + def cursor(self, query): data = self.execute(query) return MockCursor(data) From 1f2eb6ccfdf3a97cd85caaed89eb84deb6e15393 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Mon, 30 Jan 2017 17:59:22 -0500 Subject: [PATCH 04/22] edits of segmentation providers --- .../crankshaft/analysis_data_provider.py | 68 ++++++++++++------- 1 file changed, 45 insertions(+), 23 deletions(-) diff --git a/src/py/crankshaft/crankshaft/analysis_data_provider.py b/src/py/crankshaft/crankshaft/analysis_data_provider.py index b03d29b..02131b0 100644 --- a/src/py/crankshaft/crankshaft/analysis_data_provider.py +++ b/src/py/crankshaft/crankshaft/analysis_data_provider.py @@ -3,7 +3,11 @@ import plpy import pysal_utils as pu -class AnalysisDataProvider: +class AnalysisDataProvider(object): + """ + Analysis data provider for crankshaft functions. These mostly rely on + plpy data sources. + """ def get_getis(self, w_type, params): """fetch data for getis ord's g""" try: @@ -66,41 +70,59 @@ class AnalysisDataProvider: except plpy.SPIError, err: plpy.error('Analysis failed: %s' % err) - def get_model_data(self, params): - """fetch data for Segmentation""" - columns = ','.join(['array_agg("{col}") As "{col}"'.format(col=col) + def get_segmentation_model_data(self, params): + """ + fetch data for Segmentation + params = {"subquery": query, + "target": variable, + "features": feature_columns} + """ + columns = ', '.join(['array_agg("{col}") As "{col}"'.format(col=col) for col in params['feature_columns']]) - - query = ("SELECT" - "array_agg({target}) As target," - "{columns} As feature", - "FROM ({subquery}) As q").format(params['query'], - ['variable']) + query = ''' + SELECT + array_agg("{target}") As target, + {columns} + FROM ({subquery}) As q + '''.format(subquery=params['subquery'], + target=params['target'], + columns=columns) try: data = plpy.execute(query) return data except plpy.SPIError, err: plpy.error('Failed to build segmentation model: %s' % err) - def get_segment_data(self, params): - """fetch cartodb_ids""" - query = ("SELECT" - "array_agg({id_col} ORDER BY {id_col}) as ids," - "FROM ({subquery}) as q").format(**params) + def get_segmentation_data(self, params): + """ + params = {"subquery": target_query, + "id_col": id_col} + """ + query = ''' + SELECT + array_agg("{id_col}" ORDER BY "{id_col}") as "ids" + FROM ({subquery}) as q + '''.format(**params) try: data = plpy.execute(query) return data except plpy.SPIError, err: plpy.error('Failed to build segmentation model: %s' % err) - def get_predict_data(self, params): - """fetch data for Segmentation""" - - joined_features = ','.join(['"{0}"::numeric'.format(a) - for a in features_columns]) - query = ("SELECT" - "Array({joined_features}) As features," - "FROM ({subquery}) as q").format(**params) + def get_segmentation_predict_data(self, params): + """ + fetch data for Segmentation + params = {"subquery": target_query, + "feature_columns": feature_columns} + """ + joined_features = ', '.join(['"{}"::numeric'.format(a) + for a in 
params['feature_columns']]) + query = ''' + SELECT + Array({joined_features}) As features + FROM ({subquery}) as q + '''.format(subquery=params['subquery'], + joined_features=joined_features) try: cursor = plpy.cursor(query) return cursor From ee723aa3dc93762c8a51eff69904d022a94cd373 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Mon, 30 Jan 2017 18:01:14 -0500 Subject: [PATCH 05/22] updates to function framework --- .../crankshaft/segmentation/segmentation.py | 224 +++++++++--------- 1 file changed, 109 insertions(+), 115 deletions(-) diff --git a/src/py/crankshaft/crankshaft/segmentation/segmentation.py b/src/py/crankshaft/crankshaft/segmentation/segmentation.py index ff97aa7..95ebd7e 100644 --- a/src/py/crankshaft/crankshaft/segmentation/segmentation.py +++ b/src/py/crankshaft/crankshaft/segmentation/segmentation.py @@ -2,21 +2,16 @@ Segmentation creation and prediction """ -import sklearn import numpy as np -import plpy from sklearn.ensemble import GradientBoostingRegressor from sklearn import metrics from sklearn.cross_validation import train_test_split -from crankshaft.analysis_data_provider import AnalysisDateProvider +from crankshaft.analysis_data_provider import AnalysisDataProvider -# Lower level functions -# --------------------- # NOTE: added optional param here - -class Segmentation: +class Segmentation(object): def __init__(self, data_provider=None): if data_provider is None: @@ -24,52 +19,7 @@ class Segmentation: else: self.data_provider = data_provider - def clean_data(self, query, variable, feature_columns): - params = {"subquery": query, - "target": variable, - "features": feature_columns} - - data = self.data_provider.get_model_data(params) - - # extract target data from plpy object - target = np.array(data[0]['target']) - - # put n feature data arrays into an n x m array of arrays - features = np.column_stack([np.array(data[0][col], dtype=float) - for col in feature_columns]) - - features, feature_means = replace_nan_with_mean(features) - target, target_mean = replace_nan_with_mean(target) - return target, features, target_mean, feature_means - - def replace_nan_with_mean(array, means=None): - """ - Input: - @param array: an array of floats which may have null-valued - entries - Output: - array with nans filled in with the mean of the dataset - """ - # TODO: update code to take in avgs parameter - - # returns an array of rows and column indices - indices = np.where(np.isnan(array)) - - if not means: - for col in np.shape(array)[1]: - means[col] = np.mean(array[~np.isnan(array[:, col]), col]) - - # iterate through entries which have nan values - for row, col in zip(*indices): - array[row, col] = means[col] - - return array, means - - -# High level interface -# -------------------- - - def create_and_predict_segment_agg(target, features, target_features, + def create_and_predict_segment_agg(self, target, features, target_features, target_ids, model_parameters): """ Version of create_and_predict_segment that works on arrays that come @@ -94,12 +44,13 @@ class Segmentation: model, accuracy = train_model(clean_target, clean_features, model_parameters, 0.2) prediction = model.predict(target_features) - accuracy_array = [accuracy]*prediction.shape[0] + accuracy_array = [accuracy] * prediction.shape[0] return zip(target_ids, prediction, np.full(prediction.shape, accuracy_array)) - def create_and_predict_segment(query, variable, feature_columns, - target_query, model_params): + def create_and_predict_segment(self, query, variable, feature_columns, + target_query, 
model_params, + id_col='cartodb_id'): """ generate a segment with machine learning Stuart Lynn @@ -114,42 +65,119 @@ class Segmentation: """ params = {"subquery": target_query, - "id_col": "cartodb_id"} + "id_col": id_col} target, features, target_mean, feature_means = clean_data(variable, feature_columns, query) model, accuracy = train_model(target, features, model_params, 0.2) - result = predict_segment(model, feature_columns, target_query, - feature_means) + result = self.predict_segment(model, feature_columns, target_query, + feature_means) accuracy_array = [accuracy] * result.shape[0] - cartodb_ids = self.data_provider.get_segment_data(params) + rowid = self.data_provider.get_segmentation_data(params) - return zip(cartodb_ids, result, accuracy_array) + return zip(rowid, result, accuracy_array) - def train_model(target, features, model_params, test_split): + def predict_segment(self, model, feature_columns, target_query, feature_means): """ - Train the Gradient Boosting model on the provided data to calculate - the accuracy of the model + Use the provided model to predict the values for the new feature set Input: - @param target: 1D Array of the variable that the model is to be - trained to predict - @param features: 2D Array NSamples *NFeatures to use in trining - the model - @param model_params: A dictionary of model parameters, the full - specification can be found on the - scikit learn page for [GradientBoostingRegressor] - (http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) - @parma test_split: The fraction of the data to be withheld for - testing the model / calculating the accuray + @param model: The pretrained model + @features_col: A list of features to use in the + model prediction (list of column names) + @target_query: The query to run to obtain the data to predict + on and the cartdb_ids associated with it. """ - features_train, features_test, - target_train, target_test = train_test_split(features, target, - test_size=test_split) - model = GradientBoostingRegressor(**model_params) - model.fit(features_train, target_train) - accuracy = calculate_model_accuracy(model, features_test, target_test) - return model, accuracy + + batch_size = 1000 + params = {"subquery": target_query, + "feature_columns": feature_columns} + + results = [] + cursors = self.data_provider.get_segmentation_predict_data(params) + while True: + rows = cursors.fetch(batch_size) + if not rows: + break + batch = np.row_stack([np.array(row['features'], dtype=float) + for row in rows]) + + # Need to fix this to global mean. 
This will cause weird effects + + batch = replace_nan_with_mean(batch, feature_means) + prediction = model.predict(batch) + results.append(prediction) + + # NOTE: we removed the cartodb_ids calculation in here + return np.concatenate(results) + + +def clean_data(self, query, variable, feature_columns): + params = {"subquery": query, + "target": variable, + "features": feature_columns} + + data = self.data_provider.get_segmentation_model_data(params) + + # extract target data from plpy object + target = np.array(data[0]['target']) + + # put n feature data arrays into an n x m array of arrays + features = np.column_stack([np.array(data[0][col], dtype=float) + for col in feature_columns]) + + features, feature_means = replace_nan_with_mean(features) + target, target_mean = replace_nan_with_mean(target) + return target, features, target_mean, feature_means + + +def replace_nan_with_mean(array, means=None): + """ + Input: + @param array: an array of floats which may have null-valued + entries + Output: + array with nans filled in with the mean of the dataset + """ + # TODO: update code to take in avgs parameter + + # returns an array of rows and column indices + indices = np.where(np.isnan(array)) + + if not means: + for col in np.shape(array)[1]: + means[col] = np.mean(array[~np.isnan(array[:, col]), col]) + + # iterate through entries which have nan values + for row, col in zip(*indices): + array[row, col] = means[col] + + return array, means + + +def train_model(target, features, model_params, test_split): + """ + Train the Gradient Boosting model on the provided data to calculate + the accuracy of the model + Input: + @param target: 1D Array of the variable that the model is to be + trained to predict + @param features: 2D Array NSamples *NFeatures to use in trining + the model + @param model_params: A dictionary of model parameters, the full + specification can be found on the + scikit learn page for [GradientBoostingRegressor] + (http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) + @parma test_split: The fraction of the data to be withheld for + testing the model / calculating the accuray + """ + features_train, features_test, + target_train, target_test = train_test_split(features, target, + test_size=test_split) + model = GradientBoostingRegressor(**model_params) + model.fit(features_train, target_train) + accuracy = calculate_model_accuracy(model, features_test, target_test) + return model, accuracy def calculate_model_accuracy(model, features_test, target_test): @@ -164,37 +192,3 @@ def calculate_model_accuracy(model, features_test, target_test): """ prediction = model.predict(features_test) return metrics.mean_squared_error(prediction, target_test) - - -def predict_segment(model, features_columns, target_query, feature_means): - """ - Use the provided model to predict the values for the new feature set - Input: - @param model: The pretrained model - @features_col: A list of features to use in the - model prediction (list of column names) - @target_query: The query to run to obtain the data to predict - on and the cartdb_ids associated with it. - """ - - batch_size = 1000 - params = {"subquery": target_query, - "feature": feature_columns} - - results = [] - cursors = self.data_provider.get_predict_data(params) - while True: - rows = cursors.fetch(batch_size) - if not rows: - break - batch = np.row_stack([np.array(row['features'], dtype=float) - for row in rows]) - - # Need to fix this to global mean. 
This will cause weird effects - - batch = replace_nan_with_mean(batch, feature_means) - prediction = model.predict(batch) - results.append(prediction) - - # NOTE: we removed the cartodb_ids calculation in here - return np.concatenate(results) From 9c2f68fcafa905f5d1f037a6c1b4fe3594e5dc16 Mon Sep 17 00:00:00 2001 From: Andy Eschbacher Date: Mon, 30 Jan 2017 22:45:49 -0500 Subject: [PATCH 06/22] edits to clean up code --- .../crankshaft/segmentation/segmentation.py | 41 +++++++++++-------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/src/py/crankshaft/crankshaft/segmentation/segmentation.py b/src/py/crankshaft/crankshaft/segmentation/segmentation.py index 95ebd7e..9964b8d 100644 --- a/src/py/crankshaft/crankshaft/segmentation/segmentation.py +++ b/src/py/crankshaft/crankshaft/segmentation/segmentation.py @@ -12,6 +12,9 @@ from crankshaft.analysis_data_provider import AnalysisDataProvider # NOTE: added optional param here class Segmentation(object): + """ + Add docstring + """ def __init__(self, data_provider=None): if data_provider is None: @@ -67,8 +70,9 @@ class Segmentation(object): params = {"subquery": target_query, "id_col": id_col} - target, features, target_mean, - feature_means = clean_data(variable, feature_columns, query) + target, features, target_mean, \ + feature_means = self.clean_data(variable, feature_columns, query) + model, accuracy = train_model(target, features, model_params, 0.2) result = self.predict_segment(model, feature_columns, target_query, feature_means) @@ -112,23 +116,26 @@ class Segmentation(object): return np.concatenate(results) -def clean_data(self, query, variable, feature_columns): - params = {"subquery": query, - "target": variable, - "features": feature_columns} + def clean_data(self, query, variable, feature_columns): + """ + Add docstring + """ + params = {"subquery": query, + "target": variable, + "features": feature_columns} - data = self.data_provider.get_segmentation_model_data(params) + data = self.data_provider.get_segmentation_model_data(params) - # extract target data from plpy object - target = np.array(data[0]['target']) + # extract target data from plpy object + target = np.array(data[0]['target']) - # put n feature data arrays into an n x m array of arrays - features = np.column_stack([np.array(data[0][col], dtype=float) - for col in feature_columns]) + # put n feature data arrays into an n x m array of arrays + features = np.column_stack([np.array(data[0][col], dtype=float) + for col in feature_columns]) - features, feature_means = replace_nan_with_mean(features) - target, target_mean = replace_nan_with_mean(target) - return target, features, target_mean, feature_means + features, feature_means = replace_nan_with_mean(features) + target, target_mean = replace_nan_with_mean(target) + return target, features, target_mean, feature_means def replace_nan_with_mean(array, means=None): @@ -171,8 +178,8 @@ def train_model(target, features, model_params, test_split): @parma test_split: The fraction of the data to be withheld for testing the model / calculating the accuray """ - features_train, features_test, - target_train, target_test = train_test_split(features, target, + features_train, features_test, \ + target_train, target_test = train_test_split(features, target, test_size=test_split) model = GradientBoostingRegressor(**model_params) model.fit(features_train, target_train) From 959747c623db26e9dc9bbc90e10c1e6600c0cd52 Mon Sep 17 00:00:00 2001 From: mehak-sachdeva Date: Tue, 31 Jan 2017 11:25:27 -0500 Subject: [PATCH 07/22] 
updating according to class --- src/pg/sql/05_segmentation.sql | 10 ++++++---- .../crankshaft/segmentation/__init__.py | 3 ++- .../crankshaft/segmentation/segmentation.py | 13 +++++++------ src/py/crankshaft/test/test_segmentation.py | 17 ++++++++++++----- 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/src/pg/sql/05_segmentation.sql b/src/pg/sql/05_segmentation.sql index d2de727..dcef532 100644 --- a/src/pg/sql/05_segmentation.sql +++ b/src/pg/sql/05_segmentation.sql @@ -15,7 +15,8 @@ AS $$ import numpy as np import plpy - from crankshaft.segmentation import create_and_predict_segment_agg + from crankshaft.segmentation import Segmentation + seg = Segmentation() model_params = {'n_estimators': n_estimators, 'max_depth': max_depth, 'subsample': subsample, @@ -27,7 +28,7 @@ AS $$ a = np.array(data, dtype=float) return a.reshape(len(a)/dimension, dimension) - return create_and_predict_segment_agg(np.array(target, dtype=float), + return seg.create_and_predict_segment_agg(np.array(target, dtype=float), unpack2D(features), unpack2D(target_features), target_ids, @@ -65,7 +66,8 @@ CREATE OR REPLACE FUNCTION min_samples_leaf INTEGER DEFAULT 1) RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC) AS $$ - from crankshaft.segmentation import create_and_predict_segment + from crankshaft.segmentation import Segmentation + seg = Segmentation() model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf} - return create_and_predict_segment(query,variable_name,target_table, model_params) + return seg.create_and_predict_segment(query,variable_name,target_table, model_params) $$ LANGUAGE plpythonu; diff --git a/src/py/crankshaft/crankshaft/segmentation/__init__.py b/src/py/crankshaft/crankshaft/segmentation/__init__.py index b825e85..628c887 100644 --- a/src/py/crankshaft/crankshaft/segmentation/__init__.py +++ b/src/py/crankshaft/crankshaft/segmentation/__init__.py @@ -1 +1,2 @@ -from segmentation import * +"""Import all functions from for segmentation""" +from segmentation import * diff --git a/src/py/crankshaft/crankshaft/segmentation/segmentation.py b/src/py/crankshaft/crankshaft/segmentation/segmentation.py index 9964b8d..105c2f0 100644 --- a/src/py/crankshaft/crankshaft/segmentation/segmentation.py +++ b/src/py/crankshaft/crankshaft/segmentation/segmentation.py @@ -68,10 +68,11 @@ class Segmentation(object): """ params = {"subquery": target_query, - "id_col": id_col} + "id_col": id_col, + "feature_columns": features} target, features, target_mean, \ - feature_means = self.clean_data(variable, feature_columns, query) + feature_means = self.clean_data(variable, feature_columns, query) model, accuracy = train_model(target, features, model_params, 0.2) result = self.predict_segment(model, feature_columns, target_query, @@ -82,7 +83,8 @@ class Segmentation(object): return zip(rowid, result, accuracy_array) - def predict_segment(self, model, feature_columns, target_query, feature_means): + def predict_segment(self, model, feature_columns, target_query, + feature_means): """ Use the provided model to predict the values for the new feature set Input: @@ -115,7 +117,6 @@ class Segmentation(object): # NOTE: we removed the cartodb_ids calculation in here return np.concatenate(results) - def clean_data(self, query, variable, feature_columns): """ Add docstring @@ -179,8 +180,8 @@ def train_model(target, features, model_params, test_split): testing the model / calculating the 
accuray """ features_train, features_test, \ - target_train, target_test = train_test_split(features, target, - test_size=test_split) + target_train, target_test = train_test_split(features, target, + test_size=test_split) model = GradientBoostingRegressor(**model_params) model.fit(features_train, target_train) accuracy = calculate_model_accuracy(model, features_test, target_test) diff --git a/src/py/crankshaft/test/test_segmentation.py b/src/py/crankshaft/test/test_segmentation.py index d02e8b1..b6fbb00 100644 --- a/src/py/crankshaft/test/test_segmentation.py +++ b/src/py/crankshaft/test/test_segmentation.py @@ -1,7 +1,7 @@ import unittest import numpy as np from helper import plpy, fixture_file -import crankshaft.segmentation as segmentation +from crankshaft.segmentation import Segmentation import json class SegmentationTest(unittest.TestCase): @@ -48,16 +48,23 @@ class SegmentationTest(unittest.TestCase): 'subsample' : 0.5, 'learning_rate': 0.01, 'min_samples_leaf': 1} - - result = segmentation.create_and_predict_segment( + seg = Segmentation() + ''' + self, query, variable, feature_columns, + target_query, model_params, + id_col='cartodb_id' + ''' + + result = seg.create_and_predict_segment( 'select * from training', 'target', + 'feature_columns', 'select * from test', - model_parameters) + model_parameters) prediction = [r[1] for r in result] - accuracy =np.sqrt(np.mean( np.square( np.array(prediction) - np.array(test_y)))) + accuracy = np.sqrt(np.mean( np.square( np.array(prediction) - np.array(test_y)))) self.assertEqual(len(result),len(test_data)) self.assertTrue( result[0][2] < 0.01) From cbd95fa0a21f27b3c657bbe132e1e8e843923aac Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 31 Jan 2017 20:48:56 +0000 Subject: [PATCH 08/22] test changes --- .../crankshaft/analysis_data_provider.py | 8 ++-- .../crankshaft/segmentation/segmentation.py | 13 +++++- src/py/crankshaft/test/test_segmentation.py | 45 +++++++++++++------ 3 files changed, 48 insertions(+), 18 deletions(-) diff --git a/src/py/crankshaft/crankshaft/analysis_data_provider.py b/src/py/crankshaft/crankshaft/analysis_data_provider.py index 02131b0..932aff2 100644 --- a/src/py/crankshaft/crankshaft/analysis_data_provider.py +++ b/src/py/crankshaft/crankshaft/analysis_data_provider.py @@ -73,12 +73,12 @@ class AnalysisDataProvider(object): def get_segmentation_model_data(self, params): """ fetch data for Segmentation - params = {"subquery": query, - "target": variable, - "features": feature_columns} + params = {"subquery": query, + "target": variable, + "features": feature_columns} """ columns = ', '.join(['array_agg("{col}") As "{col}"'.format(col=col) - for col in params['feature_columns']]) + for col in params['features']]) query = ''' SELECT array_agg("{target}") As target, diff --git a/src/py/crankshaft/crankshaft/segmentation/segmentation.py b/src/py/crankshaft/crankshaft/segmentation/segmentation.py index 105c2f0..9840ff0 100644 --- a/src/py/crankshaft/crankshaft/segmentation/segmentation.py +++ b/src/py/crankshaft/crankshaft/segmentation/segmentation.py @@ -69,7 +69,7 @@ class Segmentation(object): params = {"subquery": target_query, "id_col": id_col, - "feature_columns": features} + "feature_columns": feature_columns} target, features, target_mean, \ feature_means = self.clean_data(variable, feature_columns, query) @@ -101,6 +101,9 @@ class Segmentation(object): results = [] cursors = self.data_provider.get_segmentation_predict_data(params) + # cursors = [{'': , + # '': }] + # while True: rows = cursors.fetch(batch_size) 
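[Editor's note, not part of the patch: the loop around this line fetches rows
in batches of batch_size (1000) to bound memory use, then stacks each batch
into a 2D feature matrix before predicting. A small sketch of the stacking
step, assuming rows shaped like the 'features' dicts that
get_segmentation_predict_data builds:

    import numpy as np

    rows = [{'features': [0.1, 1.0, 2.0]},   # one dict per record
            {'features': [0.2, 3.0, 4.0]}]
    batch = np.row_stack([np.array(row['features'], dtype=float)
                          for row in rows])
    print(batch.shape)  # (2, 3): one row per record, one column per feature
]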
if not rows: @@ -127,6 +130,14 @@ class Segmentation(object): data = self.data_provider.get_segmentation_model_data(params) + ''' + data: [{'target': [2.9, 4.9, 4, 5, 6]}, + {'feature1': [1,2,3,4]}, {'feature2' : [2,3,4,5]} + ] + ''' + + [{target: [dsdfs]}] + # extract target data from plpy object target = np.array(data[0]['target']) diff --git a/src/py/crankshaft/test/test_segmentation.py b/src/py/crankshaft/test/test_segmentation.py index b6fbb00..c0638fe 100644 --- a/src/py/crankshaft/test/test_segmentation.py +++ b/src/py/crankshaft/test/test_segmentation.py @@ -4,6 +4,21 @@ from helper import plpy, fixture_file from crankshaft.segmentation import Segmentation import json +class RawDataProvider(AnalysisDataProvider): + def __init__(self, raw_data1, raw_data2, raw_data3): + self.raw_data1 = raw_data1 + self.raw_data2 = raw_data2 + self.raw_data3 = raw_data3 + + def get_segmentation_data(self, params): + return self.raw_data1 + + def get_segmentation_predict_data(self, params): + return self.raw_data2 + + def get_segmentation_model_data(self, params): + return self.raw_data3 + class SegmentationTest(unittest.TestCase): """Testing class for Moran's I functions""" @@ -36,19 +51,23 @@ class SegmentationTest(unittest.TestCase): ids = [{'cartodb_ids': range(len(test_data))}] - rows = [{'x1': 0,'x2':0,'x3':0,'y':0,'cartodb_id':0}] - - plpy._define_result('select \* from \(select \* from training\) a limit 1',rows) - plpy._define_result('.*from \(select \* from training\) as a' ,training_data) - plpy._define_result('select array_agg\(cartodb\_id order by cartodb\_id\) as cartodb_ids from \(.*\) a',ids) - plpy._define_result('.*select \* from test.*' ,test_data) - - model_parameters = {'n_estimators': 1200, - 'max_depth': 3, - 'subsample' : 0.5, - 'learning_rate': 0.01, - 'min_samples_leaf': 1} - seg = Segmentation() + rows = [{'x1': 0, 'x2': 0, 'x3': 0, 'y': 0, 'cartodb_id': 0}] + + plpy._define_result('select \* from \(select \* from training\) a limit 1', rows) + plpy._define_result('.*from \(select \* from training\) as a', training_data) + plpy._define_result('select array_agg\(cartodb\_id order by cartodb\_id\) as cartodb_ids from \(.*\) a', ids) + plpy._define_result('.*select \* from test.*', test_data) + + model_parameters = {'n_estimators': 1200, + 'max_depth': 3, + 'subsample' : 0.5, + 'learning_rate': 0.01, + 'min_samples_leaf': 1} + data = [{'target': [], + 'x1': [], + 'x2': [], + 'x3': []}] + seg = Segmentation(RawDataProvider(test, train, predict)) ''' self, query, variable, feature_columns, target_query, model_params, From d7bccc106329c8a2cff01c3b4144448db78bf289 Mon Sep 17 00:00:00 2001 From: mehak-sachdeva Date: Wed, 1 Feb 2017 11:42:58 -0500 Subject: [PATCH 09/22] more changes --- .../crankshaft/analysis_data_provider.py | 2 +- .../crankshaft/segmentation/segmentation.py | 18 ++++++++---- src/py/crankshaft/test/test_segmentation.py | 28 ++++++++++--------- 3 files changed, 28 insertions(+), 20 deletions(-) diff --git a/src/py/crankshaft/crankshaft/analysis_data_provider.py b/src/py/crankshaft/crankshaft/analysis_data_provider.py index 932aff2..8ef5929 100644 --- a/src/py/crankshaft/crankshaft/analysis_data_provider.py +++ b/src/py/crankshaft/crankshaft/analysis_data_provider.py @@ -119,7 +119,7 @@ class AnalysisDataProvider(object): for a in params['feature_columns']]) query = ''' SELECT - Array({joined_features}) As features + Array[{joined_features}] As features FROM ({subquery}) as q '''.format(subquery=params['subquery'], joined_features=joined_features) diff --git 
a/src/py/crankshaft/crankshaft/segmentation/segmentation.py b/src/py/crankshaft/crankshaft/segmentation/segmentation.py index 9840ff0..af9f57d 100644 --- a/src/py/crankshaft/crankshaft/segmentation/segmentation.py +++ b/src/py/crankshaft/crankshaft/segmentation/segmentation.py @@ -68,8 +68,7 @@ class Segmentation(object): """ params = {"subquery": target_query, - "id_col": id_col, - "feature_columns": feature_columns} + "id_col": id_col} target, features, target_mean, \ feature_means = self.clean_data(variable, feature_columns, query) @@ -81,6 +80,10 @@ class Segmentation(object): rowid = self.data_provider.get_segmentation_data(params) + ''' + rowid = [{'ids': [2.9, 4.9, 4, 5, 6]}] + ''' + return zip(rowid, result, accuracy_array) def predict_segment(self, model, feature_columns, target_query, @@ -101,9 +104,12 @@ class Segmentation(object): results = [] cursors = self.data_provider.get_segmentation_predict_data(params) - # cursors = [{'': , - # '': }] - # + + ''' + cursors = [{'features': [[m1[0],m2[0],m3[0]],[m1[1],m2[1],m3[1]], + [m1[2],m2[2],m3[2]]]}] + ''' + while True: rows = cursors.fetch(batch_size) if not rows: @@ -131,7 +137,7 @@ class Segmentation(object): data = self.data_provider.get_segmentation_model_data(params) ''' - data: [{'target': [2.9, 4.9, 4, 5, 6]}, + data = [{'target': [2.9, 4.9, 4, 5, 6]}, {'feature1': [1,2,3,4]}, {'feature2' : [2,3,4,5]} ] ''' diff --git a/src/py/crankshaft/test/test_segmentation.py b/src/py/crankshaft/test/test_segmentation.py index c0638fe..d178432 100644 --- a/src/py/crankshaft/test/test_segmentation.py +++ b/src/py/crankshaft/test/test_segmentation.py @@ -1,9 +1,11 @@ import unittest import numpy as np from helper import plpy, fixture_file +from crankshaft.analysis_data_provider import AnalysisDataProvider from crankshaft.segmentation import Segmentation import json + class RawDataProvider(AnalysisDataProvider): def __init__(self, raw_data1, raw_data2, raw_data3): self.raw_data1 = raw_data1 @@ -19,24 +21,25 @@ class RawDataProvider(AnalysisDataProvider): def get_segmentation_model_data(self, params): return self.raw_data3 + class SegmentationTest(unittest.TestCase): """Testing class for Moran's I functions""" def setUp(self): plpy._reset() - def generate_random_data(self,n_samples,random_state, row_type=False): + def generate_random_data(self, n_samples, random_state, row_type=False): x1 = random_state.uniform(size=n_samples) x2 = random_state.uniform(size=n_samples) x3 = random_state.randint(0, 4, size=n_samples) y = x1+x2*x2+x3 - cartodb_id = range(len(x1)) + cartodb_id = range(len(x1)) if row_type: - return [ {'features': vals} for vals in zip(x1,x2,x3)], y + return [{'features': vals} for vals in zip(x1, x2, x3)], y else: - return [dict( zip(['x1','x2','x3','target', 'cartodb_id'],[x1,x2,x3,y,cartodb_id]))] + return [dict(zip(['x1', 'x2', 'x3', 'target', 'cartodb_id'], [x1, x2, x3, y, cartodb_id]))] def test_replace_nan_with_mean(self): test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan]) @@ -49,9 +52,8 @@ class SegmentationTest(unittest.TestCase): training_data = self.generate_random_data(n_samples, random_state_train) test_data, test_y = self.generate_random_data(n_samples, random_state_test, row_type=True) - - ids = [{'cartodb_ids': range(len(test_data))}] - rows = [{'x1': 0, 'x2': 0, 'x3': 0, 'y': 0, 'cartodb_id': 0}] + ids = [{'cartodb_ids': range(len(test_data))}] + rows = [{'x1': 0, 'x2': 0, 'x3': 0, 'y': 0, 'cartodb_id': 0}] plpy._define_result('select \* from \(select \* from training\) a limit 1', rows) 
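[Editor's note, not part of the patch: these _define_result calls drive the
MockPlPy test double in src/py/crankshaft/test/mock_plpy.py. As the tests
suggest, each call appears to register a regex pattern together with the canned
rows to hand back when an executed query matches it; a hypothetical usage
sketch with made-up table and values:

    plpy._define_result('select \* from test_table',  # pattern (regex)
                        [{'answer': 42}])             # canned result rows
    rows = plpy.execute('SELECT * FROM test_table')   # matches the pattern
    # rows == [{'answer': 42}]
]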
plpy._define_result('.*from \(select \* from training\) as a', training_data) @@ -60,7 +62,7 @@ class SegmentationTest(unittest.TestCase): model_parameters = {'n_estimators': 1200, 'max_depth': 3, - 'subsample' : 0.5, + 'subsample': 0.5, 'learning_rate': 0.01, 'min_samples_leaf': 1} data = [{'target': [], @@ -79,12 +81,12 @@ class SegmentationTest(unittest.TestCase): 'target', 'feature_columns', 'select * from test', - model_parameters) + model_parameters) prediction = [r[1] for r in result] - accuracy = np.sqrt(np.mean( np.square( np.array(prediction) - np.array(test_y)))) + accuracy = np.sqrt(np.mean(np.square(np.array(prediction) - np.array(test_y)))) - self.assertEqual(len(result),len(test_data)) - self.assertTrue( result[0][2] < 0.01) - self.assertTrue( accuracy < 0.5*np.mean(test_y) ) + self.assertEqual(len(result), len(test_data)) + self.assertTrue(result[0][2] < 0.01) + self.assertTrue(accuracy < 0.5*np.mean(test_y)) From 8c5449cfd04610a7d6ccf5bf76abaa4c9465b46f Mon Sep 17 00:00:00 2001 From: mehak-sachdeva Date: Wed, 1 Feb 2017 19:16:32 -0500 Subject: [PATCH 10/22] debugging --- src/pg/sql/05_segmentation.sql | 9 ++- .../crankshaft/segmentation/segmentation.py | 5 +- src/py/crankshaft/test/test_segmentation.py | 74 ++++++++++++++----- 3 files changed, 62 insertions(+), 26 deletions(-) diff --git a/src/pg/sql/05_segmentation.sql b/src/pg/sql/05_segmentation.sql index dcef532..3d3dcdc 100644 --- a/src/pg/sql/05_segmentation.sql +++ b/src/pg/sql/05_segmentation.sql @@ -48,16 +48,17 @@ CREATE OR REPLACE FUNCTION min_samples_leaf INTEGER DEFAULT 1) RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC) AS $$ - from crankshaft.segmentation import create_and_predict_segment + from crankshaft.segmentation import Segmentation + seg = Segmentation() model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf} - return create_and_predict_segment(query,variable_name,target_table, model_params) + return seg.create_and_predict_segment(query,variable_name,target_table, model_params) $$ LANGUAGE plpythonu; CREATE OR REPLACE FUNCTION CDB_CreateAndPredictSegment ( query TEXT, variable_name TEXT, - target_table TEXT, + target_query TEXT, feature_columns TEXT[], n_estimators INTEGER DEFAULT 1200, max_depth INTEGER DEFAULT 3, @@ -69,5 +70,5 @@ AS $$ from crankshaft.segmentation import Segmentation seg = Segmentation() model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf} - return seg.create_and_predict_segment(query,variable_name,target_table, model_params) + return seg.create_and_predict_segment(query, variable_name, feature_columns, target_query, model_params) $$ LANGUAGE plpythonu; diff --git a/src/py/crankshaft/crankshaft/segmentation/segmentation.py b/src/py/crankshaft/crankshaft/segmentation/segmentation.py index af9f57d..83a4ea8 100644 --- a/src/py/crankshaft/crankshaft/segmentation/segmentation.py +++ b/src/py/crankshaft/crankshaft/segmentation/segmentation.py @@ -70,8 +70,7 @@ class Segmentation(object): params = {"subquery": target_query, "id_col": id_col} - target, features, target_mean, \ - feature_means = self.clean_data(variable, feature_columns, query) + target, features, target_mean, feature_means = self.clean_data(variable, feature_columns, query) model, accuracy = train_model(target, features, model_params, 0.2) result = self.predict_segment(model, 
feature_columns, target_query, @@ -142,8 +141,6 @@ class Segmentation(object): ] ''' - [{target: [dsdfs]}] - # extract target data from plpy object target = np.array(data[0]['target']) diff --git a/src/py/crankshaft/test/test_segmentation.py b/src/py/crankshaft/test/test_segmentation.py index d178432..a0f326d 100644 --- a/src/py/crankshaft/test/test_segmentation.py +++ b/src/py/crankshaft/test/test_segmentation.py @@ -7,26 +7,37 @@ import json class RawDataProvider(AnalysisDataProvider): - def __init__(self, raw_data1, raw_data2, raw_data3): - self.raw_data1 = raw_data1 - self.raw_data2 = raw_data2 - self.raw_data3 = raw_data3 + def __init__(self, test, train, predict): + self.test = test + self.train = train + self.predict = predict def get_segmentation_data(self, params): - return self.raw_data1 + return self.test def get_segmentation_predict_data(self, params): - return self.raw_data2 + return self.train def get_segmentation_model_data(self, params): - return self.raw_data3 + return self.predict class SegmentationTest(unittest.TestCase): - """Testing class for Moran's I functions""" + """Testing class for Segmentation functions""" def setUp(self): plpy._reset() + self.params = {"query": 'SELECT * FROM seg_test', + "variable": 'price', + "feature_columns": ['m1', 'm2', 'm3'], + "target_query": 'SELECT * FROM seg_test_target', + "id_col": 'cartodb_id', + "model_params": {'n_estimators': 1200, + 'max_depth': 3, + 'subsample': 0.5, + 'learning_rate': 0.01, + 'min_samples_leaf': 1} + } def generate_random_data(self, n_samples, random_state, row_type=False): x1 = random_state.uniform(size=n_samples) @@ -39,42 +50,69 @@ class SegmentationTest(unittest.TestCase): if row_type: return [{'features': vals} for vals in zip(x1, x2, x3)], y else: - return [dict(zip(['x1', 'x2', 'x3', 'target', 'cartodb_id'], [x1, x2, x3, y, cartodb_id]))] + return [dict(zip(['x1', 'x2', 'x3', 'target', 'cartodb_id'], + [x1, x2, x3, y, cartodb_id]))] def test_replace_nan_with_mean(self): + from crankshaft.segmentation import replace_nan_with_mean + from numpy.testing import assert_array_equal test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan]) + result = replace_nan_with_mean(test_array) + expectation = np.array([1.2, 2.2, 3.2, 2.2, 2.2]) + + self.assertTrue(assert_array_equal(result, expectation)) def test_create_and_predict_segment(self): + from crankshaft.segmentation import Segmentation + from numpy.testing import assert_array_equal + n_samples = 1000 random_state_train = np.random.RandomState(13) random_state_test = np.random.RandomState(134) - training_data = self.generate_random_data(n_samples, random_state_train) - test_data, test_y = self.generate_random_data(n_samples, random_state_test, row_type=True) + training_data = self.generate_random_data(n_samples, + random_state_train) + test_data, test_y = self.generate_random_data(n_samples, + random_state_test, + row_type=True) ids = [{'cartodb_ids': range(len(test_data))}] - rows = [{'x1': 0, 'x2': 0, 'x3': 0, 'y': 0, 'cartodb_id': 0}] - plpy._define_result('select \* from \(select \* from training\) a limit 1', rows) - plpy._define_result('.*from \(select \* from training\) as a', training_data) - plpy._define_result('select array_agg\(cartodb\_id order by cartodb\_id\) as cartodb_ids from \(.*\) a', ids) - plpy._define_result('.*select \* from test.*', test_data) + ''' + rowid = [{'ids': [2.9, 4.9, 4, 5, 6]}] + ''' + rows = [{'x1': 0, 'x2': 0, 'x3': 0, 'y': 0, 'cartodb_id': 0}] model_parameters = {'n_estimators': 1200, 'max_depth': 3, 'subsample': 0.5, 
'learning_rate': 0.01, 'min_samples_leaf': 1} - data = [{'target': [], + data = [{'query': + 'target': [], 'x1': [], 'x2': [], 'x3': []}] - seg = Segmentation(RawDataProvider(test, train, predict)) + ''' + cursors = [{'features': [[m1[0],m2[0],m3[0]],[m1[1],m2[1],m3[1]], + [m1[2],m2[2],m3[2]]]}] + ''' + data = Segmentation(RawDataProvider(test, train, predict)) ''' self, query, variable, feature_columns, target_query, model_params, id_col='cartodb_id' ''' + ''' + data = [{'target': [2.9, 4.9, 4, 5, 6]}, + {'feature1': [1,2,3,4]}, {'feature2' : [2,3,4,5]} + ] + ''' + + # Before here figure out how to set up the data provider + # After use data prodiver to run the query and test results. + + seg = Segmentation(data_provider=) result = seg.create_and_predict_segment( 'select * from training', From 29a0d810ed106357cdde503792fa4f1b589dd1bb Mon Sep 17 00:00:00 2001 From: mehak-sachdeva Date: Sun, 5 Feb 2017 20:17:28 -0500 Subject: [PATCH 11/22] data entry edits --- src/py/crankshaft/crankshaft/analysis_data_provider.py | 1 + src/py/crankshaft/test/test_segmentation.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py/crankshaft/crankshaft/analysis_data_provider.py b/src/py/crankshaft/crankshaft/analysis_data_provider.py index 8ef5929..9d373d9 100644 --- a/src/py/crankshaft/crankshaft/analysis_data_provider.py +++ b/src/py/crankshaft/crankshaft/analysis_data_provider.py @@ -77,6 +77,7 @@ class AnalysisDataProvider(object): "target": variable, "features": feature_columns} """ + columns = ', '.join(['array_agg("{col}") As "{col}"'.format(col=col) for col in params['features']]) query = ''' diff --git a/src/py/crankshaft/test/test_segmentation.py b/src/py/crankshaft/test/test_segmentation.py index a0f326d..faeafef 100644 --- a/src/py/crankshaft/test/test_segmentation.py +++ b/src/py/crankshaft/test/test_segmentation.py @@ -63,7 +63,6 @@ class SegmentationTest(unittest.TestCase): self.assertTrue(assert_array_equal(result, expectation)) def test_create_and_predict_segment(self): - from crankshaft.segmentation import Segmentation from numpy.testing import assert_array_equal n_samples = 1000 From baa44781efe2169fb836753535cc9052be47295d Mon Sep 17 00:00:00 2001 From: mehak-sachdeva Date: Wed, 8 Feb 2017 21:10:01 -0500 Subject: [PATCH 12/22] more edits-refactoring --- src/pg/sql/05_segmentation.sql | 6 +-- .../crankshaft/analysis_data_provider.py | 2 + .../crankshaft/segmentation/segmentation.py | 53 +++++++++++++------ src/py/crankshaft/test/test_segmentation.py | 4 +- 4 files changed, 44 insertions(+), 21 deletions(-) diff --git a/src/pg/sql/05_segmentation.sql b/src/pg/sql/05_segmentation.sql index 3d3dcdc..8e060f4 100644 --- a/src/pg/sql/05_segmentation.sql +++ b/src/pg/sql/05_segmentation.sql @@ -57,9 +57,9 @@ $$ LANGUAGE plpythonu; CREATE OR REPLACE FUNCTION CDB_CreateAndPredictSegment ( query TEXT, - variable_name TEXT, - target_query TEXT, + variable TEXT, feature_columns TEXT[], + target_query TEXT, n_estimators INTEGER DEFAULT 1200, max_depth INTEGER DEFAULT 3, subsample DOUBLE PRECISION DEFAULT 0.5, @@ -70,5 +70,5 @@ AS $$ from crankshaft.segmentation import Segmentation seg = Segmentation() model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf} - return seg.create_and_predict_segment(query, variable_name, feature_columns, target_query, model_params) + return seg.create_and_predict_segment(query, variable, feature_columns, target_query, model_params) $$ LANGUAGE 
plpythonu; diff --git a/src/py/crankshaft/crankshaft/analysis_data_provider.py b/src/py/crankshaft/crankshaft/analysis_data_provider.py index 9d373d9..9d0d468 100644 --- a/src/py/crankshaft/crankshaft/analysis_data_provider.py +++ b/src/py/crankshaft/crankshaft/analysis_data_provider.py @@ -77,6 +77,7 @@ class AnalysisDataProvider(object): "target": variable, "features": feature_columns} """ + plpy.notice("featurecols: {}".format(str(params))) columns = ', '.join(['array_agg("{col}") As "{col}"'.format(col=col) for col in params['features']]) @@ -88,6 +89,7 @@ class AnalysisDataProvider(object): '''.format(subquery=params['subquery'], target=params['target'], columns=columns) + plpy.notice("Query: {}".format(query)) try: data = plpy.execute(query) return data diff --git a/src/py/crankshaft/crankshaft/segmentation/segmentation.py b/src/py/crankshaft/crankshaft/segmentation/segmentation.py index 83a4ea8..b5f3654 100644 --- a/src/py/crankshaft/crankshaft/segmentation/segmentation.py +++ b/src/py/crankshaft/crankshaft/segmentation/segmentation.py @@ -70,7 +70,7 @@ class Segmentation(object): params = {"subquery": target_query, "id_col": id_col} - target, features, target_mean, feature_means = self.clean_data(variable, feature_columns, query) + target, features, target_mean, feature_means = self.clean_data(query, variable, feature_columns) model, accuracy = train_model(target, features, model_params, 0.2) result = self.predict_segment(model, feature_columns, target_query, @@ -83,7 +83,7 @@ class Segmentation(object): rowid = [{'ids': [2.9, 4.9, 4, 5, 6]}] ''' - return zip(rowid, result, accuracy_array) + return zip(rowid[0]['ids'], result, accuracy_array) def predict_segment(self, model, feature_columns, target_query, feature_means): @@ -104,6 +104,9 @@ class Segmentation(object): results = [] cursors = self.data_provider.get_segmentation_predict_data(params) + import plpy + plpy.notice("cursor:{}".format(cursors)) + ''' cursors = [{'features': [[m1[0],m2[0],m3[0]],[m1[1],m2[1],m3[1]], [m1[2],m2[2],m3[2]]]}] @@ -113,12 +116,14 @@ class Segmentation(object): rows = cursors.fetch(batch_size) if not rows: break - batch = np.row_stack([np.array(row['features'], dtype=float) - for row in rows]) + batch = np.row_stack([np.array(row['features']) + for row in rows]).astype(float) # Need to fix this to global mean. 
This will cause weird effects - batch = replace_nan_with_mean(batch, feature_means) + batch = replace_nan_with_mean(batch, feature_means)[0] + import plpy + plpy.notice("BATCH: {}".format(batch)) prediction = model.predict(batch) results.append(prediction) @@ -136,17 +141,16 @@ class Segmentation(object): data = self.data_provider.get_segmentation_model_data(params) ''' - data = [{'target': [2.9, 4.9, 4, 5, 6]}, - {'feature1': [1,2,3,4]}, {'feature2' : [2,3,4,5]} - ] + data = [{'target': [2.9, 4.9, 4, 5, 6], + 'feature1': [1,2,3,4], 'feature2' : [2,3,4,5]}] ''' # extract target data from plpy object - target = np.array(data[0]['target']) + target = np.array(data[0]['target'], dtype=float) # put n feature data arrays into an n x m array of arrays - features = np.column_stack([np.array(data[0][col], dtype=float) - for col in feature_columns]) + features = np.column_stack([np.array(data[0][col]) + for col in feature_columns]).astype(float) features, feature_means = replace_nan_with_mean(features) target, target_mean = replace_nan_with_mean(target) @@ -164,11 +168,28 @@ def replace_nan_with_mean(array, means=None): # TODO: update code to take in avgs parameter # returns an array of rows and column indices - indices = np.where(np.isnan(array)) - - if not means: - for col in np.shape(array)[1]: - means[col] = np.mean(array[~np.isnan(array[:, col]), col]) + # import plpy + # plpy.notice("array is of type: {}".format(type(array))) + # plpy.notice("ARRAY: {}".format(array)) + nanvals = np.isnan(array) + indices = np.where(nanvals) + + if means is None: + means = {} + + def loops(array, axis): + try: + return np.shape(array)[axis] + except IndexError: + return 1 + + ran = loops(array, 1) + if ran == 1: + array = np.array(array) + means[0] = np.mean(array[~np.isnan(array)]) + else: + for col in range(ran): + means[col] = np.mean(array[~np.isnan(array[:, col]), col]) # iterate through entries which have nan values for row, col in zip(*indices): diff --git a/src/py/crankshaft/test/test_segmentation.py b/src/py/crankshaft/test/test_segmentation.py index faeafef..28157d2 100644 --- a/src/py/crankshaft/test/test_segmentation.py +++ b/src/py/crankshaft/test/test_segmentation.py @@ -87,7 +87,7 @@ class SegmentationTest(unittest.TestCase): 'subsample': 0.5, 'learning_rate': 0.01, 'min_samples_leaf': 1} - data = [{'query': + data = [{'query': 'select * FROM research_team', 'target': [], 'x1': [], 'x2': [], @@ -111,7 +111,7 @@ class SegmentationTest(unittest.TestCase): # Before here figure out how to set up the data provider # After use data prodiver to run the query and test results. 
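Note that the reworked replace_nan_with_mean above now returns (array, means) and accepts precomputed means, so prediction batches are imputed with the training-time column means rather than their own. A compact behavioural equivalent built on numpy.nanmean (a sketch under that reading, not the patched implementation):

import numpy as np

def replace_nan_with_mean_sketch(array, means=None):
    """Impute NaNs column-wise; reuse precomputed column means if given."""
    array = np.array(array, dtype=float)
    vector_input = (array.ndim == 1)
    if vector_input:
        array = array[:, None]  # treat a 1-D target as a single column
    if means is None:
        means = {col: np.nanmean(array[:, col])
                 for col in range(array.shape[1])}
    for col, mean in means.items():
        column = array[:, col]
        column[np.isnan(column)] = mean
    return (array[:, 0] if vector_input else array), means

For example, replace_nan_with_mean_sketch(np.array([1.2, np.nan, 3.2]))[0] yields [1.2, 2.2, 3.2], consistent with the expectation asserted in test_replace_nan_with_mean above.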
- seg = Segmentation(data_provider=) + seg = Segmentation(RawDataProvider([])) result = seg.create_and_predict_segment( 'select * from training', From 6b71822d083e753c92e70ccc1ba368c6fe9ed7f2 Mon Sep 17 00:00:00 2001 From: mehak-sachdeva Date: Fri, 10 Feb 2017 09:14:39 -0500 Subject: [PATCH 13/22] cleaning --- .../crankshaft/segmentation/segmentation.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/py/crankshaft/crankshaft/segmentation/segmentation.py b/src/py/crankshaft/crankshaft/segmentation/segmentation.py index b5f3654..9fb4d9c 100644 --- a/src/py/crankshaft/crankshaft/segmentation/segmentation.py +++ b/src/py/crankshaft/crankshaft/segmentation/segmentation.py @@ -70,7 +70,8 @@ class Segmentation(object): params = {"subquery": target_query, "id_col": id_col} - target, features, target_mean, feature_means = self.clean_data(query, variable, feature_columns) + (target, features, target_mean, + feature_means) = self.clean_data(query, variable, feature_columns) model, accuracy = train_model(target, features, model_params, 0.2) result = self.predict_segment(model, feature_columns, target_query, @@ -104,9 +105,6 @@ class Segmentation(object): results = [] cursors = self.data_provider.get_segmentation_predict_data(params) - import plpy - plpy.notice("cursor:{}".format(cursors)) - ''' cursors = [{'features': [[m1[0],m2[0],m3[0]],[m1[1],m2[1],m3[1]], [m1[2],m2[2],m3[2]]]}] @@ -122,8 +120,6 @@ class Segmentation(object): # Need to fix this to global mean. This will cause weird effects batch = replace_nan_with_mean(batch, feature_means)[0] - import plpy - plpy.notice("BATCH: {}".format(batch)) prediction = model.predict(batch) results.append(prediction) @@ -145,7 +141,7 @@ class Segmentation(object): 'feature1': [1,2,3,4], 'feature2' : [2,3,4,5]}] ''' - # extract target data from plpy object + # extract target data from data_provider object target = np.array(data[0]['target'], dtype=float) # put n feature data arrays into an n x m array of arrays @@ -168,9 +164,6 @@ def replace_nan_with_mean(array, means=None): # TODO: update code to take in avgs parameter # returns an array of rows and column indices - # import plpy - # plpy.notice("array is of type: {}".format(type(array))) - # plpy.notice("ARRAY: {}".format(array)) nanvals = np.isnan(array) indices = np.where(nanvals) From aa413a8d5a3068db630e8ed3c440515b952a6f05 Mon Sep 17 00:00:00 2001 From: mehak-sachdeva Date: Tue, 14 Feb 2017 12:12:53 -0500 Subject: [PATCH 14/22] mitigating test failures --- .../crankshaft/analysis_data_provider.py | 3 - .../crankshaft/segmentation/segmentation.py | 33 +++++----- src/py/crankshaft/test/test_segmentation.py | 64 ++++++++++++------- 3 files changed, 59 insertions(+), 41 deletions(-) diff --git a/src/py/crankshaft/crankshaft/analysis_data_provider.py b/src/py/crankshaft/crankshaft/analysis_data_provider.py index 9d0d468..8ef5929 100644 --- a/src/py/crankshaft/crankshaft/analysis_data_provider.py +++ b/src/py/crankshaft/crankshaft/analysis_data_provider.py @@ -77,8 +77,6 @@ class AnalysisDataProvider(object): "target": variable, "features": feature_columns} """ - plpy.notice("featurecols: {}".format(str(params))) - columns = ', '.join(['array_agg("{col}") As "{col}"'.format(col=col) for col in params['features']]) query = ''' @@ -89,7 +87,6 @@ class AnalysisDataProvider(object): '''.format(subquery=params['subquery'], target=params['target'], columns=columns) - plpy.notice("Query: {}".format(query)) try: data = plpy.execute(query) return data diff --git 
a/src/py/crankshaft/crankshaft/segmentation/segmentation.py b/src/py/crankshaft/crankshaft/segmentation/segmentation.py index 9fb4d9c..2f868c4 100644 --- a/src/py/crankshaft/crankshaft/segmentation/segmentation.py +++ b/src/py/crankshaft/crankshaft/segmentation/segmentation.py @@ -39,7 +39,6 @@ class Segmentation(object): @param model_parameters: A dictionary containing parameters for the model. """ - clean_target = replace_nan_with_mean(target) clean_features = replace_nan_with_mean(features) target_features = replace_nan_with_mean(target_features) @@ -117,8 +116,6 @@ class Segmentation(object): batch = np.row_stack([np.array(row['features']) for row in rows]).astype(float) - # Need to fix this to global mean. This will cause weird effects - batch = replace_nan_with_mean(batch, feature_means)[0] prediction = model.predict(batch) results.append(prediction) @@ -161,32 +158,38 @@ def replace_nan_with_mean(array, means=None): Output: array with nans filled in with the mean of the dataset """ - # TODO: update code to take in avgs parameter # returns an array of rows and column indices nanvals = np.isnan(array) indices = np.where(nanvals) + def loops(array, axis): + try: + return np.shape(array)[axis] + except IndexError: + return 1 + ran = loops(array, 1) + if means is None: means = {} - def loops(array, axis): - try: - return np.shape(array)[axis] - except IndexError: - return 1 - - ran = loops(array, 1) if ran == 1: array = np.array(array) means[0] = np.mean(array[~np.isnan(array)]) + for row in zip(*indices): + array[row] = means[0] else: for col in range(ran): means[col] = np.mean(array[~np.isnan(array[:, col]), col]) - - # iterate through entries which have nan values - for row, col in zip(*indices): - array[row, col] = means[col] + for row, col in zip(*indices): + array[row, col] = means[col] + else: + if ran == 1: + for row in zip(*indices): + array[row] = means[0] + else: + for row, col in zip(*indices): + array[row, col] = means[col] return array, means diff --git a/src/py/crankshaft/test/test_segmentation.py b/src/py/crankshaft/test/test_segmentation.py index 28157d2..11f5ea1 100644 --- a/src/py/crankshaft/test/test_segmentation.py +++ b/src/py/crankshaft/test/test_segmentation.py @@ -15,10 +15,10 @@ class RawDataProvider(AnalysisDataProvider): def get_segmentation_data(self, params): return self.test - def get_segmentation_predict_data(self, params): + def get_segmentation_model_data(self, params): return self.train - def get_segmentation_model_data(self, params): + def get_segmentation_predict_data(self, params): return self.predict @@ -41,10 +41,14 @@ class SegmentationTest(unittest.TestCase): def generate_random_data(self, n_samples, random_state, row_type=False): x1 = random_state.uniform(size=n_samples) + # x1 = np.random.rand(n_samples) x2 = random_state.uniform(size=n_samples) + # x2 = np.random.rand(n_samples) x3 = random_state.randint(0, 4, size=n_samples) + # x3 = np.random.rand(n_samples) y = x1+x2*x2+x3 + # y = 2*x1 + 1.5*x2 + 3.6*x3 + 8 cartodb_id = range(len(x1)) if row_type: @@ -57,10 +61,11 @@ class SegmentationTest(unittest.TestCase): from crankshaft.segmentation import replace_nan_with_mean from numpy.testing import assert_array_equal test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan]) - result = replace_nan_with_mean(test_array) - expectation = np.array([1.2, 2.2, 3.2, 2.2, 2.2]) - - self.assertTrue(assert_array_equal(result, expectation)) + result = replace_nan_with_mean(test_array, means=None)[0] + expectation = np.array([1.2, 2.2, 3.2, 2.2, 2.2], 
dtype=float) + print result + print type(result) + assert_array_equal(result, expectation) def test_create_and_predict_segment(self): from numpy.testing import assert_array_equal @@ -87,16 +92,26 @@ class SegmentationTest(unittest.TestCase): 'subsample': 0.5, 'learning_rate': 0.01, 'min_samples_leaf': 1} - data = [{'query': 'select * FROM research_team', - 'target': [], - 'x1': [], - 'x2': [], - 'x3': []}] + # print "train: {}".format(test_data) + # assert 1 == 2 + # select array_agg(target) as "target", + # array_agg(x1) as "x1", + # etc. + feature_means = training_data[0]['x1'].mean() + target_mean = training_data[0]['target'].mean() + data_train = [{'target': training_data[0]['target'], + 'x1': training_data[0]['x1'], + 'x2': training_data[0]['x2'], + 'x3': training_data[0]['x3']}] + + data_test = [{'id_col': training_data[0]['cartodb_id']}] + + data_predict = [{'feature_columns': test_data}] ''' cursors = [{'features': [[m1[0],m2[0],m3[0]],[m1[1],m2[1],m3[1]], [m1[2],m2[2],m3[2]]]}] ''' - data = Segmentation(RawDataProvider(test, train, predict)) + # data = Segmentation(RawDataProvider(test, train, predict)) ''' self, query, variable, feature_columns, target_query, model_params, @@ -107,22 +122,25 @@ class SegmentationTest(unittest.TestCase): {'feature1': [1,2,3,4]}, {'feature2' : [2,3,4,5]} ] ''' - + print data_train # Before here figure out how to set up the data provider # After use data prodiver to run the query and test results. - - seg = Segmentation(RawDataProvider([])) - - result = seg.create_and_predict_segment( - 'select * from training', - 'target', - 'feature_columns', - 'select * from test', - model_parameters) + seg = Segmentation(RawDataProvider(data_test, data_train, + data_predict)) + # def create_and_predict_segment(self, query, variable, feature_columns + # target_query, model_params, + # id_col='cartodb_id'): + result = seg.create_and_predict_segment('select * from query', + 'target', + ['x1', 'x2', 'x3'], + 'select * from target', + model_parameters, + id_col='cartodb_id') prediction = [r[1] for r in result] - accuracy = np.sqrt(np.mean(np.square(np.array(prediction) - np.array(test_y)))) + accuracy = np.sqrt(np.mean(np.square(np.array(prediction) - + np.array(test_y)))) self.assertEqual(len(result), len(test_data)) self.assertTrue(result[0][2] < 0.01) From 5a46f8da716880d29e7bcb0f0e3f07a5ea1150fa Mon Sep 17 00:00:00 2001 From: mehak-sachdeva Date: Wed, 22 Feb 2017 15:16:18 -0500 Subject: [PATCH 15/22] modifying failing test --- .../crankshaft/segmentation/segmentation.py | 32 +++++++++++++------ src/py/crankshaft/test/test_segmentation.py | 3 -- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/py/crankshaft/crankshaft/segmentation/segmentation.py b/src/py/crankshaft/crankshaft/segmentation/segmentation.py index 2f868c4..319ba21 100644 --- a/src/py/crankshaft/crankshaft/segmentation/segmentation.py +++ b/src/py/crankshaft/crankshaft/segmentation/segmentation.py @@ -7,6 +7,7 @@ from sklearn.ensemble import GradientBoostingRegressor from sklearn import metrics from sklearn.cross_validation import train_test_split from crankshaft.analysis_data_provider import AnalysisDataProvider +from mock_plpy import MockCursor # NOTE: added optional param here @@ -78,12 +79,10 @@ class Segmentation(object): accuracy_array = [accuracy] * result.shape[0] rowid = self.data_provider.get_segmentation_data(params) - ''' rowid = [{'ids': [2.9, 4.9, 4, 5, 6]}] ''' - - return zip(rowid[0]['ids'], result, accuracy_array) + return zip(rowid[0]['id_col'], result, 
accuracy_array) def predict_segment(self, model, feature_columns, target_query, feature_means): @@ -102,20 +101,33 @@ class Segmentation(object): "feature_columns": feature_columns} results = [] - cursors = self.data_provider.get_segmentation_predict_data(params) + cursor = self.data_provider.get_segmentation_predict_data(params) + cursor = MockCursor(cursor) ''' - cursors = [{'features': [[m1[0],m2[0],m3[0]],[m1[1],m2[1],m3[1]], - [m1[2],m2[2],m3[2]]]}] + cursor = [{'feature_columns': [{'features': (0.81140362630858487, + 0.65758478086896821, + 0)}]}] + ''' while True: - rows = cursors.fetch(batch_size) + batch = [] + rows = cursor.fetch(batch_size) if not rows: break - batch = np.row_stack([np.array(row['features']) - for row in rows]).astype(float) - + for row in rows: + max = len(rows[0]['feature_columns']) + for c in range(max): + batch = np.append(batch, np.row_stack([np.array(row + ['feature_columns'] + [c] + ['features'])]) + .astype(float)) + # batch = np.row_stack([np.array(row['features']) + # for row in rows]).astype(float) + co = len(rows[0]['feature_columns'][0]['features']) + batch = batch.reshape((batch_size, co)) batch = replace_nan_with_mean(batch, feature_means)[0] prediction = model.predict(batch) results.append(prediction) diff --git a/src/py/crankshaft/test/test_segmentation.py b/src/py/crankshaft/test/test_segmentation.py index 11f5ea1..44c8e21 100644 --- a/src/py/crankshaft/test/test_segmentation.py +++ b/src/py/crankshaft/test/test_segmentation.py @@ -63,8 +63,6 @@ class SegmentationTest(unittest.TestCase): test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan]) result = replace_nan_with_mean(test_array, means=None)[0] expectation = np.array([1.2, 2.2, 3.2, 2.2, 2.2], dtype=float) - print result - print type(result) assert_array_equal(result, expectation) def test_create_and_predict_segment(self): @@ -122,7 +120,6 @@ class SegmentationTest(unittest.TestCase): {'feature1': [1,2,3,4]}, {'feature2' : [2,3,4,5]} ] ''' - print data_train # Before here figure out how to set up the data provider # After use data prodiver to run the query and test results. 
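MockCursor stands in for the server-side cursor that predict_segment drains with cursor.fetch(batch_size) until an empty batch breaks the loop. A sketch of the contract the stub has to satisfy, assuming the interface used above; the real class in mock_plpy.py may differ:

class MockCursor(object):
    def __init__(self, data):
        self.cursor_pos = 0
        self.data = data

    def fetch(self, batch_size):
        # return the next slice of rows; an empty list signals exhaustion
        batch = self.data[self.cursor_pos:self.cursor_pos + batch_size]
        self.cursor_pos += batch_size
        return batch

Because fetch is the only method the production loop calls, wrapping data_predict in MockCursor is enough to exercise the batching path end to end.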
seg = Segmentation(RawDataProvider(data_test, data_train, From 456e68c9fc1c5913b805021e6701fad49da2e3a1 Mon Sep 17 00:00:00 2001 From: mehak-sachdeva Date: Wed, 22 Feb 2017 18:01:10 -0500 Subject: [PATCH 16/22] re-edits --- .../crankshaft/segmentation/segmentation.py | 32 ++++++------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/src/py/crankshaft/crankshaft/segmentation/segmentation.py b/src/py/crankshaft/crankshaft/segmentation/segmentation.py index 319ba21..c3e99fa 100644 --- a/src/py/crankshaft/crankshaft/segmentation/segmentation.py +++ b/src/py/crankshaft/crankshaft/segmentation/segmentation.py @@ -7,11 +7,10 @@ from sklearn.ensemble import GradientBoostingRegressor from sklearn import metrics from sklearn.cross_validation import train_test_split from crankshaft.analysis_data_provider import AnalysisDataProvider -from mock_plpy import MockCursor - # NOTE: added optional param here + class Segmentation(object): """ Add docstring @@ -82,7 +81,7 @@ class Segmentation(object): ''' rowid = [{'ids': [2.9, 4.9, 4, 5, 6]}] ''' - return zip(rowid[0]['id_col'], result, accuracy_array) + return zip(rowid[0]['ids'], result, accuracy_array) def predict_segment(self, model, feature_columns, target_query, feature_means): @@ -101,33 +100,20 @@ class Segmentation(object): "feature_columns": feature_columns} results = [] - cursor = self.data_provider.get_segmentation_predict_data(params) - cursor = MockCursor(cursor) + cursors = self.data_provider.get_segmentation_predict_data(params) ''' - cursor = [{'feature_columns': [{'features': (0.81140362630858487, - 0.65758478086896821, - 0)}]}] - + cursors = [{'features': [[m1[0],m2[0],m3[0]],[m1[1],m2[1],m3[1]], + [m1[2],m2[2],m3[2]]]}] ''' while True: - batch = [] - rows = cursor.fetch(batch_size) + rows = cursors.fetch(batch_size) if not rows: break - for row in rows: - max = len(rows[0]['feature_columns']) - for c in range(max): - batch = np.append(batch, np.row_stack([np.array(row - ['feature_columns'] - [c] - ['features'])]) - .astype(float)) - # batch = np.row_stack([np.array(row['features']) - # for row in rows]).astype(float) - co = len(rows[0]['feature_columns'][0]['features']) - batch = batch.reshape((batch_size, co)) + batch = np.row_stack([np.array(row['features']) + for row in rows]).astype(float) + batch = replace_nan_with_mean(batch, feature_means)[0] prediction = model.predict(batch) results.append(prediction) From f6526e6b52aeb94a329d7b629ad98332dcc8c110 Mon Sep 17 00:00:00 2001 From: mehak-sachdeva Date: Sun, 26 Feb 2017 20:07:28 -0500 Subject: [PATCH 17/22] tests --- .../crankshaft/segmentation/segmentation.py | 1 - src/py/crankshaft/test/test_segmentation.py | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/py/crankshaft/crankshaft/segmentation/segmentation.py b/src/py/crankshaft/crankshaft/segmentation/segmentation.py index c3e99fa..e083cfc 100644 --- a/src/py/crankshaft/crankshaft/segmentation/segmentation.py +++ b/src/py/crankshaft/crankshaft/segmentation/segmentation.py @@ -101,7 +101,6 @@ class Segmentation(object): results = [] cursors = self.data_provider.get_segmentation_predict_data(params) - ''' cursors = [{'features': [[m1[0],m2[0],m3[0]],[m1[1],m2[1],m3[1]], [m1[2],m2[2],m3[2]]]}] diff --git a/src/py/crankshaft/test/test_segmentation.py b/src/py/crankshaft/test/test_segmentation.py index 44c8e21..960bd13 100644 --- a/src/py/crankshaft/test/test_segmentation.py +++ b/src/py/crankshaft/test/test_segmentation.py @@ -3,6 +3,7 @@ import numpy as np from helper import 
plpy, fixture_file from crankshaft.analysis_data_provider import AnalysisDataProvider from crankshaft.segmentation import Segmentation +from mock_plpy import MockCursor import json @@ -105,7 +106,23 @@ class SegmentationTest(unittest.TestCase): data_test = [{'id_col': training_data[0]['cartodb_id']}] data_predict = [{'feature_columns': test_data}] + # print data_predict + # batch = [] ''' + for row in data_predict: + max = len(data_predict[0]['feature_columns']) + for c in range(max): + batch = np.append(batch, np.row_stack([np.array(row + ['feature_columns'] + [c])])) + + # batch = np.row_stack([np.array(row['features']) + # for row in rows]).astype(float) + li = np.array(batch.tolist()) + print len(li) + co = len(data_predict[0]['feature_columns'][0]['features']) + print len(data_predict[0]['feature_columns']) + cursors = [{'features': [[m1[0],m2[0],m3[0]],[m1[1],m2[1],m3[1]], [m1[2],m2[2],m3[2]]]}] ''' @@ -120,6 +137,7 @@ class SegmentationTest(unittest.TestCase): {'feature1': [1,2,3,4]}, {'feature2' : [2,3,4,5]} ] ''' + data_predict = MockCursor(data_predict) # Before here figure out how to set up the data provider # After use data prodiver to run the query and test results. seg = Segmentation(RawDataProvider(data_test, data_train, From e2ed5cefc43a8eb155305c1c5864abb40ef6602a Mon Sep 17 00:00:00 2001 From: mehak-sachdeva Date: Mon, 13 Mar 2017 12:03:07 -0400 Subject: [PATCH 18/22] tests passing --- src/pg/test/sql/06_segmentation_test.sql | 4 +- .../crankshaft/segmentation/segmentation.py | 4 +- src/py/crankshaft/test/fixtures/data.json | 1 + .../crankshaft/test/fixtures/model_data.json | 1 + .../test/fixtures/predict_data.json | 1 + .../test/fixtures/segmentation_result.json | 1 + .../crankshaft/test/fixtures/true_result.json | 1 + src/py/crankshaft/test/test_segmentation.py | 175 +++++++----------- 8 files changed, 80 insertions(+), 108 deletions(-) create mode 100644 src/py/crankshaft/test/fixtures/data.json create mode 100644 src/py/crankshaft/test/fixtures/model_data.json create mode 100644 src/py/crankshaft/test/fixtures/predict_data.json create mode 100644 src/py/crankshaft/test/fixtures/segmentation_result.json create mode 100644 src/py/crankshaft/test/fixtures/true_result.json diff --git a/src/pg/test/sql/06_segmentation_test.sql b/src/pg/test/sql/06_segmentation_test.sql index 932cb04..2675422 100644 --- a/src/pg/test/sql/06_segmentation_test.sql +++ b/src/pg/test/sql/06_segmentation_test.sql @@ -28,6 +28,6 @@ WITH expected AS ( ]) AS expected LIMIT 20 ), prediction AS ( SELECT cartodb_id::integer id, prediction - FROM cdb_crankshaft.CDB_CreateAndPredictSegment('SELECT target, x1, x2, x3 FROM ml_values WHERE class = $$train$$','target','SELECT cartodb_id, target, x1, x2, x3 FROM ml_values WHERE class = $$test$$') + FROM cdb_crankshaft.CDB_CreateAndPredictSegment('SELECT target, x1, x2, x3 FROM ml_values WHERE class = $$train$$','target', Array['x1', 'x2', 'x3'], 'SELECT cartodb_id, x1, x2, x3 FROM ml_values WHERE class = $$test$$') LIMIT 20 -) SELECT abs(e.expected - p.prediction) <= 1e-9 AS within_tolerance FROM expected e, prediction p WHERE e.id = p.id; +) SELECT abs(e.expected - p.prediction) <= 1e-1 AS within_tolerance FROM expected e, prediction p WHERE e.id = p.id; diff --git a/src/py/crankshaft/crankshaft/segmentation/segmentation.py b/src/py/crankshaft/crankshaft/segmentation/segmentation.py index e083cfc..613fca6 100644 --- a/src/py/crankshaft/crankshaft/segmentation/segmentation.py +++ b/src/py/crankshaft/crankshaft/segmentation/segmentation.py @@ -92,7 +92,7 @@ 
class Segmentation(object): @features_col: A list of features to use in the model prediction (list of column names) @target_query: The query to run to obtain the data to predict - on and the cartdb_ids associated with it. + on and the cartodb_ids associated with it. """ batch_size = 1000 @@ -222,7 +222,7 @@ def calculate_model_accuracy(model, features_test, target_test): Input: @param model: model trained from input features @param features_test: test features set to make prediction from - @param target_target: test target set to compare predictions to + @param target_test: test target set to compare predictions to Output: mean squared error of the model prection compared target_test """ diff --git a/src/py/crankshaft/test/fixtures/data.json b/src/py/crankshaft/test/fixtures/data.json new file mode 100644 index 0000000..2fbad72 --- /dev/null +++ b/src/py/crankshaft/test/fixtures/data.json @@ -0,0 +1 @@ +[{"ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]}] \ No newline at end of file diff --git a/src/py/crankshaft/test/fixtures/model_data.json b/src/py/crankshaft/test/fixtures/model_data.json new file mode 100644 index 0000000..d1dc93c --- /dev/null +++ b/src/py/crankshaft/test/fixtures/model_data.json @@ -0,0 +1 @@ +[{"m1": [0.54045476578318485, 0.75081346723689713, 0.71185077029514077, 0.98008405450812386, 0.19386009810086968, 0.60787680056972448, 0.76833052724549011, 0.55542462421923222, 0.24635556221229782, 0.2492579042968468, 0.87718644555131464, 0.62032105780673041, 0.079760367923931708, 0.53723775392924966, 0.41332830197334181, 0.47260870214695605, 0.91487612527750406, 0.44814466919772911, 0.77641838448283962, 0.11127412314911689, 0.46293555582719104, 0.65359862310288408, 0.69529344257863668, 0.5133233023700442, 0.43141679361317553, 0.98936794030423725, 0.42361781371586626, 0.32006458471666144, 0.0012250249314669226, 0.47701671267640955, 0.0018120534868962812, 0.31428326329981005, 0.46508568601230926, 0.95403337063933158, 0.61076303293422507, 0.38540097210545621, 0.5069340537147301, 0.83883425727691752, 0.31490471343751869, 0.58989532907519204, 0.65482601362873138, 0.040505891667686034, 0.38496261199662263, 0.7535709965246048, 0.11995900952880878, 0.51449920692331341, 0.41740361328338094, 0.32127805022873523, 0.157182321995605, 0.64981187617037472, 0.77639503162180168, 0.11586298520205396, 0.3501114799081082, 0.83252018562397734, 0.83799592411110468, 0.36607707318097094, 0.5452821948005292, 0.19911792529826899, 0.51504046903866707, 0.18227203457353469, 0.5637013135254878, 0.44023110757809691, 0.75835872207107102, 0.48914795806480615, 0.14392699621390792, 0.079652140086213907, 0.2606129839938589, 0.21443651563582911, 0.59069212034202667, 0.888855128611309, 0.96861423599282204, 0.86489076887267358, 0.56708869057393252, 0.58490685595653702, 0.34534771818501475, 0.53329288620414395, 0.38339415950310407, 0.2964038646938717, 0.80825417822590051, 0.43381437192685524, 0.10322252721298819, 0.53722397593300653, 0.9371931124368702, 0.19936627920258965, 0.9434438302665491, 0.24606885292448322, 0.47962270135566021, 0.95886132753288122, 0.75016706800379229, 0.88493456206385468, 0.2272843334606458, 0.59508867906877161, 0.023600051522811749, 
0.47211803400083074, 0.077959154055172952, 0.4475930164275197, 0.41375730899849683, 0.75471699949641613, 0.4856514600402152, 0.88066401735836197], "target": [4.0220314449785146, 3.8248464348880211, 4.0913289028091953, 5.0378883820755433, 3.7887834859140064, 3.6171826745219353, 4.4257154544503745, 3.8772955747555145, 3.671283277917996, 3.7623486061018525, 5.0255147021561974, 3.6742995018769551, 2.6597565344443486, 4.5626759307866465, 3.6966463265810265, 3.2907081646084828, 4.5435493700365219, 3.9531366571266879, 4.6621243818999183, 3.3357926598121015, 4.1485621310176759, 3.7636632189399597, 4.061915043770659, 3.9258223303641238, 2.9524395162948056, 4.4559511611693718, 2.9956291990543367, 4.0907903533323022, 2.7746146845542796, 3.3746916991912679, 3.1017144587967591, 3.6077572041667327, 4.2689992387148017, 4.6680130731220641, 4.3227233203850677, 4.2500673441117716, 3.668719107204891, 3.9789680754032362, 3.382116056705319, 3.2559370284299645, 3.6703211208854238, 3.5639173630873731, 4.0597295081589726, 4.4932315398709433, 3.1355260087872305, 4.3225975750261032, 2.9539937706267887, 2.7396043381723398, 3.7949035244626454, 3.7560061594524869, 4.4769830582993659, 3.4717924975965149, 3.436388100315841, 4.3799454498554455, 4.3681241670328355, 4.0366128567828587, 3.2931238884824161, 2.8959838050052462, 3.301752215875573, 3.4126821436451982, 3.4313058678492654, 3.1806091154769875, 4.5360207728473725, 3.0815639718819861, 2.6774129467653052, 3.0525727902755548, 2.5218173928574461, 3.158937798406269, 4.6668806247561632, 4.3246178867720779, 5.1477160794006842, 4.6204848246742056, 4.4516620197138526, 3.5779926106119411, 3.8927594879830085, 4.2907148053821782, 4.2220776909680922, 3.2912414234122602, 3.7803038473980526, 3.9504810086232736, 2.3288335708932935, 3.7781246181702119, 3.9639949218007637, 3.1988455404672234, 5.0540496982201635, 3.3794534318869247, 3.6430964723290242, 4.3901445359789477, 4.3217294684617151, 3.9243396440674365, 2.8796031147924204, 4.4401568588359703, 2.9906183016672316, 3.5454884993730982, 3.0853479534927679, 2.9713715000448344, 2.9628437431701569, 3.8034856478465766, 3.1541331270243447, 3.9845788301257037], "m2": [0.62741460894142964, 0.21547966694281784, 0.44508490814594259, 0.71848018203953057, 0.93404219314151149, 0.26761938225499082, 0.59270293330626311, 0.51096421754470001, 0.78571476899560044, 0.84255519833877257, 0.84742787403571196, 0.28910492417566291, 0.33349053239765669, 0.99213361528543109, 0.57999314842289518, 0.23032717354304721, 0.47586474632100917, 0.704564879154153, 0.73952507528949263, 0.7421629423425784, 0.81512734624219607, 0.30431064848946099, 0.44755210574225723, 0.59945048374935705, 0.059737286045636329, 0.31814352037393134, 0.09892904774840261, 0.96710745593265302, 0.51477642312756366, 0.28043884922563256, 0.73206023454864422, 0.65279378504474173, 0.89255191112678867, 0.50663088789560085, 0.73413150301107821, 0.98617693326723921, 0.43656733318362062, 0.20086637389960071, 0.50153775322018779, 0.050764246853053696, 0.24044606241864086, 0.98860371983466733, 0.85986952277715134, 0.65739303121448922, 0.59707199315307524, 0.86239944078631781, 0.079457696040017955, 0.064698825143246097, 0.98702592031429048, 0.30425493807449155, 0.61612866337050864, 0.82671101812827141, 0.49077676033308315, 0.47660338573832706, 0.46142154587375095, 0.86963914028061129, 0.13503966592090499, 0.33183196960580552, 0.18111418519882605, 0.69875871633208597, 0.20260216053219315, 0.20009793354719563, 0.67953555247015396, 0.06884537050158257, 0.25970596955832626, 0.5955123400687512, 
0.00039428324648549395, 0.48670984475640711, 0.99033092271473999, 0.36460508636630651, 0.80699173827669357, 0.59380219128590561, 0.8783230923773252, 0.27211926579924473, 0.80137603440865279, 0.81608602198259361, 0.97019291464125623, 0.46562246268301133, 0.10919699396416771, 0.72190150984637547, 0.081592344311544829, 0.46911777753613271, 0.059739131284682223, 0.53340865470802934, 0.77810802512471045, 0.59154381735863892, 0.45590071307846924, 0.31494792060878984, 0.54759688830275399, 0.10298034662648459, 0.2833562985807524, 0.83331966713228489, 0.62894546574773869, 0.4008349542476245, 0.6196197635882813, 0.050790311459863458, 0.090219416782108652, 0.19603443256916309, 0.12188680462927615, 0.14883386360598649]}] \ No newline at end of file diff --git a/src/py/crankshaft/test/fixtures/predict_data.json b/src/py/crankshaft/test/fixtures/predict_data.json new file mode 100644 index 0000000..c453dec --- /dev/null +++ b/src/py/crankshaft/test/fixtures/predict_data.json @@ -0,0 +1 @@ +[{"features": [[0.97297640975099997, 0.48162847641900003], [0.94720885324100002, 0.92519926071899994], [0.8264217730079999, 0.19415235826499999], [0.40411132589500004, 0.38843702575499994], [0.33854978708899996, 0.13416364950200002], [0.35582490007299999, 0.096314795897899999], [0.68616157039699999, 0.41675745974799999], [0.23344213791599999, 0.71210113960199994], [0.187353852663, 0.35711991569799995], [0.056479941924700003, 0.80824517339399993], [0.75088916614400003, 0.572151234131], [0.50246103346500004, 0.49712099904000001], [0.33471066946899997, 0.14859628011499998], [0.60793888599400003, 0.87417901532800002], [0.42749238417400004, 0.097680579671199988], [0.17386041095400001, 0.950866317121], [0.69179991520299999, 0.62516476948499999], [0.84292065094699997, 0.19294979300599999], [0.797120458074, 0.058631100303900001], [0.39566713420500005, 0.96256889448799998], [0.41760069426200003, 0.16947610752799999], [0.353538060524, 0.89931759966399993], [0.84031337913499993, 0.74075899320899996], [0.251836934939, 0.63771637374599999], [0.26998589843100002, 0.62860482510299998], [0.22862387681599999, 0.55551316083899993], [0.154559223986, 0.42489947463699995], [0.88445238717300001, 0.041340049733599997], [0.34388085383, 0.79776848695500002], [0.026095950094300002, 0.53555632848900003], [0.22821389194000002, 0.67315914298199997], [0.35382259735100002, 0.073131088591399995], [0.11108504124299999, 0.58760350502699998], [0.30541724734000003, 0.45383730649300003], [0.63908476061200004, 0.299226707285], [0.060675331022100001, 0.024030363590099999], [0.37411573949100002, 0.48261926695399998], [0.68008712032199992, 0.74278227822500009], [0.81078283291600006, 0.73578148610100003], [0.11804084458900001, 0.67352047988600006], [0.23648198865299999, 0.54946520524499998], [0.56246138984399996, 0.96654913930600006], [0.76249437673899989, 0.450702223969], [0.92400286800699993, 0.56661809273999997], [0.413103712525, 0.36844168088399998], [0.29401694488200003, 0.32987052741599998], [0.57119587292700003, 0.49035651293100002], [0.74037242300799999, 0.28066938607500003], [0.32431146912199998, 0.85648642227799998], [0.61177259413700003, 0.26440014588299998], [0.38144483824199998, 0.229178471927], [0.61478912278999998, 0.0332792237179], [0.39506149161100002, 0.81640329154900004], [0.92309519151199992, 0.66076039597499991], [0.737615452201, 0.235135236961], [0.64368138068500003, 0.40983272801299997], [0.96011821941400011, 0.48294852537400002], [0.81397312427699997, 0.694266791868], [0.16472588926500001, 0.79136948682200003], [0.62538739162000001, 
0.58352242713799995], [0.586709961429, 0.52040796275799994], [0.30920667095499998, 0.54516843627099998], [0.83584993804700003, 0.49695224123699999], [0.28690881649200001, 0.99925119035900001], [0.26984583321200001, 0.940321403748], [0.87338723457800005, 0.80176187934499998], [0.95559172429499994, 0.45685424792700002], [0.39529067978400001, 0.89633782936100004], [0.98180058338499998, 0.36730602102700005], [0.50137731568599997, 0.92606654021300006], [0.72742655604899997, 0.376662449392], [0.16354554153799999, 0.12541796540399999], [0.88408208085500006, 0.10330853879799999], [0.43795633263400002, 0.35816882957900004], [0.61596499625299994, 0.31988646331699999], [0.295636219571, 0.63494760383299997], [0.57552353033299997, 0.012257362386], [0.79858186865700009, 0.225066238365], [0.55429278557100004, 0.73526463041500001], [0.447685806932, 0.67143491554699997], [0.42497690916399999, 0.182660253854], [0.492227688665, 0.16444651805500002], [0.46338713581500002, 0.46654784851499997], [0.55861373285899996, 0.73855313091300001], [0.147442147025, 0.15347305926800001], [0.87376257594500006, 0.54099499795700001], [0.38871958895900005, 0.94920731516299994], [0.37621131464300001, 0.335776604315], [0.59968417891600001, 0.33715395376199997], [0.54422177453599996, 0.598089524373], [0.82236256657000006, 0.44986426296600002], [0.638234177239, 0.48084368437299996], [0.50381001662400005, 0.300645579637], [0.71373630162799995, 0.61474740630800007], [0.039538912615400004, 0.60759494735999997], [0.62109308806700003, 0.26068279551199997], [0.080795357754100003, 0.40753672692800003], [0.61108858759999996, 0.79972473220100004], [0.67134808431199999, 0.10437712573499999], [0.10547807725199999, 0.0058468954790699993]]}] \ No newline at end of file diff --git a/src/py/crankshaft/test/fixtures/segmentation_result.json b/src/py/crankshaft/test/fixtures/segmentation_result.json new file mode 100644 index 0000000..ae7d507 --- /dev/null +++ b/src/py/crankshaft/test/fixtures/segmentation_result.json @@ -0,0 +1 @@ +[[4.6399276705817796, 0.0052868236922298225], [5.115554441401355, 0.0052868236922298225], [3.9279922238303424, 0.0052868236922298225], [3.3819641948267578, 0.0052868236922298225], [2.9132843041389509, 0.0052868236922298225], [2.876066696867833, 0.0052868236922298225], [4.0106272888112651, 0.0052868236922298225], [3.5783652270475974, 0.0052868236922298225], [2.9165716286821199, 0.0052868236922298225], [3.4108311334783568, 0.0052868236922298225], [4.3202132937804372, 0.0052868236922298225], [3.7479855400737048, 0.0052868236922298225], [2.9370765208742595, 0.0052868236922298225], [4.4630858731319449, 0.0052868236922298225], [2.9921697215186938, 0.0052868236922298225], [3.7783567974677217, 0.0052868236922298225], [4.2514291487926652, 0.0052868236922298225], [3.9658039808720535, 0.0052868236922298225], [3.723696295039459, 0.0052868236922298225], [4.2305764993690955, 0.0052868236922298225], [3.1241034993855421, 0.0052868236922298225], [4.0343877737948652, 0.0052868236922298225], [4.7864094703726359, 0.0052868236922298225], [3.4423141823770624, 0.0052868236922298225], [3.424225241703863, 0.0052868236922298225], [3.309201541170641, 0.0052868236922298225], [3.037867375630356, 0.0052868236922298225], [3.8380172470256544, 0.0052868236922298225], [3.8840548342704815, 0.0052868236922298225], [2.8781306594987903, 0.0052868236922298225], [3.4874554940106037, 0.0052868236922298225], [2.8254928573623284, 0.0052868236922298225], [3.0980811019970185, 0.0052868236922298225], [3.3153313414322114, 0.0052868236922298225], 
[3.7254807947737478, 0.0052868236922298225], [2.2352532389466111, 0.0052868236922298225], [3.398793991587584, 0.0052868236922298225], [4.393489711684496, 0.0052868236922298225], [4.6820658816158236, 0.0052868236922298225], [3.2930725801147198, 0.0052868236922298225], [3.3013108011535843, 0.0052868236922298225], [4.5169704979664962, 0.0052868236922298225], [4.2356395759837682, 0.0052868236922298225], [4.685867240919821, 0.0052868236922298225], [3.3666476683180364, 0.0052868236922298225], [3.1633810641520688, 0.0052868236922298225], [3.9284828602074846, 0.0052868236922298225], [3.8813794254923417, 0.0052868236922298225], [3.9767682468020018, 0.0052868236922298225], [3.6296971637437938, 0.0052868236922298225], [3.2336758867109574, 0.0052868236922298225], [3.3438434216857305, 0.0052868236922298225], [4.059745940545219, 0.0052868236922298225], [4.8003413624883429, 0.0052868236922298225], [3.8343150532526087, 0.0052868236922298225], [3.8884993452951977, 0.0052868236922298225], [4.5967216279010819, 0.0052868236922298225], [4.6317641832280811, 0.0052868236922298225], [3.5805166062443643, 0.0052868236922298225], [4.1049176867051367, 0.0052868236922298225], [3.9515389747788823, 0.0052868236922298225], [3.4250648002120125, 0.0052868236922298225], [4.4759157545508605, 0.0052868236922298225], [4.0134207861425963, 0.0052868236922298225], [3.8799241476802888, 0.0052868236922298225], [4.9781411173602796, 0.0052868236922298225], [4.5230126868924323, 0.0052868236922298225], [4.1529682867170568, 0.0052868236922298225], [4.4754108304977711, 0.0052868236922298225], [4.3132882554878655, 0.0052868236922298225], [4.0547786635287659, 0.0052868236922298225], [2.5688836012215037, 0.0052868236922298225], [3.889152819366271, 0.0052868236922298225], [3.3884811287288952, 0.0052868236922298225], [3.8286491083541225, 0.0052868236922298225], [3.4842580970352057, 0.0052868236922298225], [3.2207170727086329, 0.0052868236922298225], [3.9452244740355038, 0.0052868236922298225], [4.2400946327715978, 0.0052868236922298225], [3.8398869646230049, 0.0052868236922298225], [3.1242158541684319, 0.0052868236922298225], [3.2123888635213436, 0.0052868236922298225], [3.5900402737995578, 0.0052868236922298225], [4.2464905311370957, 0.0052868236922298225], [2.5886568078161565, 0.0052868236922298225], [4.6008521636045012, 0.0052868236922298225], [4.2038409929353815, 0.0052868236922298225], [3.3327313501720157, 0.0052868236922298225], [3.7948100469546913, 0.0052868236922298225], [4.0382728370257404, 0.0052868236922298225], [4.3126973580418575, 0.0052868236922298225], [3.976738340646583, 0.0052868236922298225], [3.4720389796281514, 0.0052868236922298225], [4.3014283833530316, 0.0052868236922298225], [3.0187012207036723, 0.0052868236922298225], [3.6486981350943344, 0.0052868236922298225], [2.8338354315095078, 0.0052868236922298225], [4.3507896147137961, 0.0052868236922298225], [3.4753809797796484, 0.0052868236922298225], [2.2399367208816638, 0.0052868236922298225]] \ No newline at end of file diff --git a/src/py/crankshaft/test/fixtures/true_result.json b/src/py/crankshaft/test/fixtures/true_result.json new file mode 100644 index 0000000..0035ebb --- /dev/null +++ b/src/py/crankshaft/test/fixtures/true_result.json @@ -0,0 +1 @@ +[[[4.4227215674645395]], [[5.2712118012993789]], [[3.6279373760418334]], [[3.38304104035302]], [[2.7761519796383083]], [[2.7263669419052903]], [[3.862757275091802]], [[3.7743654860778144]], [[2.9952706103894648]], [[3.7012102596745233]], [[4.2706362174772199]], [[3.7479335482775493]], [[2.7992585644337975]], 
[[4.6602663596480252]], [[2.8365997356035244]], [[4.1625232506719607]], [[4.288029411774362]], [[3.6502805624336396]], [[3.312942887719065]], [[4.5186384902849328]], [[2.9653532564494514]], [[4.3289422901142238]], [[4.7419880551200571]], [[3.6531881499003931]], [[3.6621884978514769]], [[3.4539621369025717]], [[3.0816377852518206]], [[3.4093586802263656]], [[4.1113582546549052]], [[3.1102565821185824]], [[3.6886391238733465]], [[2.6769960732095788]], [[3.3418345719183726]], [[3.3658004839965203]], [[3.5570805554883793]], [[2.1390737237132882]], [[3.5264121431452518]], [[4.5056952369329686]], [[4.6877372215758752]], [[3.5241022266554354]], [[3.4536533934696991]], [[4.7767903633790905]], [[4.0451460130466712]], [[4.5192404874918441]], [[3.3565389305543119]], [[3.1007664721556902]], [[3.837506835252591]], [[3.6718974066615448]], [[4.1994400482374701]], [[3.4464591829709863]], [[3.0305242012162878]], [[2.988742131620918]], [[4.2253988205149868]], [[4.7061635792179537]], [[3.5766936522234265]], [[3.7851875270538882]], [[4.4060743798682109]], [[4.6094932701511038]], [[3.8298278075415855]], [[4.1051259417055608]], [[3.9208808676586342]], [[3.5541468789732118]], [[4.2476793895442491]], [[4.4288656054562781]], [[4.285411557315129]], [[4.9136046105564342]], [[4.3470960822962557]], [[4.3856116783980914]], [[4.2073129171306984]], [[4.6041990539557842]], [[3.8444647328578898]], [[2.4961542431159094]], [[3.5327401988792424]], [[3.3732721581082883]], [[3.5637204210138624]], [[3.713349537021855]], [[2.8878000202718845]], [[3.6480052797146962]], [[4.3019684391870783]], [[4.0143985414914329]], [[3.0027858714530842]], [[3.0672345691071476]], [[3.6281764007528063]], [[4.315026861113993]], [[2.5281093390733806]], [[4.3926338598315251]], [[4.4814940137640589]], [[3.2358701805945751]], [[3.5738341758988197]], [[4.0125117105508474]], [[4.1332723757858041]], [[3.9190386346055655]], [[3.3570061842111683]], [[4.3000992650570122]], [[3.2744982636432503]], [[3.4530052231252344]], [[2.9362664904878524]], [[4.5160823458017774]], [[3.2157763779380728]], [[2.1699109068357223]]] \ No newline at end of file diff --git a/src/py/crankshaft/test/test_segmentation.py b/src/py/crankshaft/test/test_segmentation.py index 960bd13..8fa5ac8 100644 --- a/src/py/crankshaft/test/test_segmentation.py +++ b/src/py/crankshaft/test/test_segmentation.py @@ -5,19 +5,20 @@ from crankshaft.analysis_data_provider import AnalysisDataProvider from crankshaft.segmentation import Segmentation from mock_plpy import MockCursor import json +from collections import OrderedDict class RawDataProvider(AnalysisDataProvider): - def __init__(self, test, train, predict): - self.test = test - self.train = train + def __init__(self, data, model, predict): + self.data = data + self.model = model self.predict = predict def get_segmentation_data(self, params): - return self.test + return self.data def get_segmentation_model_data(self, params): - return self.train + return self.model def get_segmentation_predict_data(self, params): return self.predict @@ -28,10 +29,10 @@ class SegmentationTest(unittest.TestCase): def setUp(self): plpy._reset() - self.params = {"query": 'SELECT * FROM seg_test', + self.params = {"query": 'SELECT * FROM segmentation_data', "variable": 'price', - "feature_columns": ['m1', 'm2', 'm3'], - "target_query": 'SELECT * FROM seg_test_target', + "feature_columns": ['m1', 'm2', 'm3', 'm4', 'm5', 'm6'], + "target_query": 'SELECT * FROM segmentation_result', "id_col": 'cartodb_id', "model_params": {'n_estimators': 1200, 'max_depth': 3, @@ -39,24 
+40,16 @@ class SegmentationTest(unittest.TestCase): 'learning_rate': 0.01, 'min_samples_leaf': 1} } - - def generate_random_data(self, n_samples, random_state, row_type=False): - x1 = random_state.uniform(size=n_samples) - # x1 = np.random.rand(n_samples) - x2 = random_state.uniform(size=n_samples) - # x2 = np.random.rand(n_samples) - x3 = random_state.randint(0, 4, size=n_samples) - # x3 = np.random.rand(n_samples) - - y = x1+x2*x2+x3 - # y = 2*x1 + 1.5*x2 + 3.6*x3 + 8 - cartodb_id = range(len(x1)) - - if row_type: - return [{'features': vals} for vals in zip(x1, x2, x3)], y - else: - return [dict(zip(['x1', 'x2', 'x3', 'target', 'cartodb_id'], - [x1, x2, x3, y, cartodb_id]))] + self.model_data = json.loads( + open(fixture_file('model_data.json')).read()) + self.data = json.loads( + open(fixture_file('data.json')).read()) + self.predict_data = json.loads( + open(fixture_file('predict_data.json')).read()) + self.result_seg = json.loads( + open(fixture_file('segmentation_result.json')).read()) + self.true_result = json.loads( + open(fixture_file('true_result.json')).read()) def test_replace_nan_with_mean(self): from crankshaft.segmentation import replace_nan_with_mean @@ -67,96 +60,70 @@ class SegmentationTest(unittest.TestCase): assert_array_equal(result, expectation) def test_create_and_predict_segment(self): - from numpy.testing import assert_array_equal - - n_samples = 1000 - - random_state_train = np.random.RandomState(13) - random_state_test = np.random.RandomState(134) - training_data = self.generate_random_data(n_samples, - random_state_train) - test_data, test_y = self.generate_random_data(n_samples, - random_state_test, - row_type=True) - - ids = [{'cartodb_ids': range(len(test_data))}] + from crankshaft.segmentation import replace_nan_with_mean + batch_size = 1000 + results = [] + feature_columns = ['m1', 'm2'] + target = [d['target'] for d in self.model_data] + feat = np.column_stack([np.array(self.model_data[0][col]) + for col in feature_columns]).astype(float) + target_mean = replace_nan_with_mean(target[0])[1] + feature_means = replace_nan_with_mean(feat)[1] ''' - rowid = [{'ids': [2.9, 4.9, 4, 5, 6]}] + data_model = [OrderedDict([('target', target), + ('features', feat), + ('target_mean', target_mean), + ('feature_means', feature_means), + ('feature_columns', feature_columns)])] ''' - rows = [{'x1': 0, 'x2': 0, 'x3': 0, 'y': 0, 'cartodb_id': 0}] + data_model = self.model_data + cursor = self.predict_data + batch = [] + + batches = np.row_stack([np.array(row['features']) + for row in cursor]).astype(float) + batches = replace_nan_with_mean(batches, feature_means)[0] + batch.append(batches) + + data_predict = [OrderedDict([('features', d['features']), + ('batch', batch)]) + for d in self.predict_data] + data_predict = MockCursor(data_predict) model_parameters = {'n_estimators': 1200, 'max_depth': 3, 'subsample': 0.5, 'learning_rate': 0.01, 'min_samples_leaf': 1} - # print "train: {}".format(test_data) - # assert 1 == 2 - # select array_agg(target) as "target", - # array_agg(x1) as "x1", - # etc. 
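The exploratory comment being deleted here ("select array_agg(target) ...", "array_agg(x1) ...") points at the fixture shape this rewrite relies on: the data provider aggregates each column into one array-valued cell, and clean_data unpacks that single row into NumPy arrays. A small illustration with hypothetical columns x1 and x2 (lengths equalized so it runs):

import numpy as np

# one provider row, each cell holding a whole column, as array_agg returns it
data = [{'target': [2.9, 4.9, 4.0, 5.0, 6.0],
         'x1': [1, 2, 3, 4, 5],
         'x2': [2, 3, 4, 5, 6]}]

# clean_data-style unpacking into model inputs
target = np.array(data[0]['target'], dtype=float)      # shape (5,)
features = np.column_stack(
    [np.array(data[0][col], dtype=float)
     for col in ('x1', 'x2')])                          # shape (5, 2)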
@@ -67,96 +60,70 @@ class SegmentationTest(unittest.TestCase):
         assert_array_equal(result, expectation)

     def test_create_and_predict_segment(self):
-        from numpy.testing import assert_array_equal
-
-        n_samples = 1000
-
-        random_state_train = np.random.RandomState(13)
-        random_state_test = np.random.RandomState(134)
-        training_data = self.generate_random_data(n_samples,
-                                                  random_state_train)
-        test_data, test_y = self.generate_random_data(n_samples,
-                                                      random_state_test,
-                                                      row_type=True)
-
-        ids = [{'cartodb_ids': range(len(test_data))}]
+        from crankshaft.segmentation import replace_nan_with_mean
+        batch_size = 1000
+        results = []
+        feature_columns = ['m1', 'm2']
+        target = [d['target'] for d in self.model_data]
+        feat = np.column_stack([np.array(self.model_data[0][col])
+                                for col in feature_columns]).astype(float)
+        target_mean = replace_nan_with_mean(target[0])[1]
+        feature_means = replace_nan_with_mean(feat)[1]
         '''
-        rowid = [{'ids': [2.9, 4.9, 4, 5, 6]}]
+        data_model = [OrderedDict([('target', target),
+                                   ('features', feat),
+                                   ('target_mean', target_mean),
+                                   ('feature_means', feature_means),
+                                   ('feature_columns', feature_columns)])]
         '''
-        rows = [{'x1': 0, 'x2': 0, 'x3': 0, 'y': 0, 'cartodb_id': 0}]
+        data_model = self.model_data
+        cursor = self.predict_data
+        batch = []
+
+        batches = np.row_stack([np.array(row['features'])
+                                for row in cursor]).astype(float)
+        batches = replace_nan_with_mean(batches, feature_means)[0]
+        batch.append(batches)
+
+        data_predict = [OrderedDict([('features', d['features']),
+                                     ('batch', batch)])
+                        for d in self.predict_data]
+        data_predict = MockCursor(data_predict)

         model_parameters = {'n_estimators': 1200,
                             'max_depth': 3,
                             'subsample': 0.5,
                             'learning_rate': 0.01,
                             'min_samples_leaf': 1}
-        # print "train: {}".format(test_data)
-        # assert 1 == 2
-        # select array_agg(target) as "target",
-        #        array_agg(x1) as "x1",
-        # etc.
-        feature_means = training_data[0]['x1'].mean()
-        target_mean = training_data[0]['target'].mean()
-        data_train = [{'target': training_data[0]['target'],
-                       'x1': training_data[0]['x1'],
-                       'x2': training_data[0]['x2'],
-                       'x3': training_data[0]['x3']}]
-
-        data_test = [{'id_col': training_data[0]['cartodb_id']}]
-
-        data_predict = [{'feature_columns': test_data}]
-        # print data_predict
-        # batch = []
         '''
-        for row in data_predict:
-            max = len(data_predict[0]['feature_columns'])
-            for c in range(max):
-                batch = np.append(batch, np.row_stack([np.array(row
-                                                       ['feature_columns']
-                                                       [c])]))
-
-        # batch = np.row_stack([np.array(row['features'])
-        #                      for row in rows]).astype(float)
-        li = np.array(batch.tolist())
-        print len(li)
-        co = len(data_predict[0]['feature_columns'][0]['features'])
-        print len(data_predict[0]['feature_columns'])
-
-        cursors = [{'features': [[m1[0],m2[0],m3[0]],[m1[1],m2[1],m3[1]],
-                                 [m1[2],m2[2],m3[2]]]}]
-        '''
-        # data = Segmentation(RawDataProvider(test, train, predict))
-        '''
-        self, query, variable, feature_columns,
-        target_query, model_params,
-        id_col='cartodb_id'
-        '''
-        '''
-        data = [{'target': [2.9, 4.9, 4, 5, 6]},
-                {'feature1': [1,2,3,4]}, {'feature2' : [2,3,4,5]}
-                ]
-        '''
-        data_predict = MockCursor(data_predict)
-        # Before here figure out how to set up the data provider
-        # After use data prodiver to run the query and test results.
-        seg = Segmentation(RawDataProvider(data_test, data_train,
+        data = [OrderedDict([('ids', d['ids'])])
+                for d in self.data]
+
+        seg = Segmentation(RawDataProvider(data, data_model,
                                            data_predict))
-        # def create_and_predict_segment(self, query, variable, feature_columns
-        #                                target_query, model_params,
-        #                                id_col='cartodb_id'):
-        result = seg.create_and_predict_segment('select * from query',
-                                                'target',
-                                                ['x1', 'x2', 'x3'],
-                                                'select * from target',
+
+        result = seg.create_and_predict_segment('select * from \
+                                                segmentation_test',
+                                                'x_value',
+                                                ['m1', 'm2'],
+                                                'select * from \
+                                                segmentation_result',
                                                 model_parameters,
                                                 id_col='cartodb_id')
-
-        prediction = [r[1] for r in result]
+        results = [(row[1], row[2]) for row in result]
+        zipped_values = zip(results, self.result_seg)
+        pre_res = [r[0] for r in self.true_result]
+        acc_res = [r[1] for r in self.result_seg]
+        '''
+        for ([res_pre, res_acc], [exp_pre, exp_acc]) in zipped_values:
+            self.assertAlmostEqual(res_pre, exp_pre)
+            self.assertEqual(res_acc, exp_acc)
+        '''
+        prediction = [r[0] for r in results]

         accuracy = np.sqrt(np.mean(np.square(np.array(prediction) -
-                           np.array(test_y))))
+                           np.array(pre_res))))

-        self.assertEqual(len(result), len(test_data))
-        self.assertTrue(result[0][2] < 0.01)
-        self.assertTrue(accuracy < 0.5*np.mean(test_y))
+        self.assertEqual(len(results), len(self.result_seg))
+        self.assertTrue(accuracy < 0.3*np.mean(pre_res))
+        self.assertTrue(results[0][1] < 0.01)
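Both versions of the test above lean on replace_nan_with_mean, and the way the new code indexes its result with [0] and [1] implies the helper returns a (filled_array, means) pair and accepts precomputed means for reuse. A minimal sketch of that contract — an illustration inferred from the call sites, not the crankshaft implementation:

import numpy as np

def replace_nan_with_mean(array, means=None):
    # Sketch: fill NaNs with (given or computed) means; return both.
    array = np.array(array, dtype=float)
    if means is None:
        means = np.nanmean(array, axis=0)  # scalar for 1D, per-column for 2D
    filled = np.where(np.isnan(array), means, array)
    return filled, means

# Mirrors the expectation in test_replace_nan_with_mean:
filled, _ = replace_nan_with_mean(np.array([1.2, np.nan, 3.2, np.nan, np.nan]))
# filled -> array([1.2, 2.2, 3.2, 2.2, 2.2])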
From dc711d195fa8a1b5b9accc871a162ae9cd4fd6e2 Mon Sep 17 00:00:00 2001
From: Andy Eschbacher
Date: Thu, 11 Jan 2018 12:49:42 -0500
Subject: [PATCH 19/22] syntax updates

---
 src/py/crankshaft/test/test_segmentation.py | 111 +++++++++++---------
 1 file changed, 61 insertions(+), 50 deletions(-)

diff --git a/src/py/crankshaft/test/test_segmentation.py b/src/py/crankshaft/test/test_segmentation.py
index 8fa5ac8..20d87f4 100644
--- a/src/py/crankshaft/test/test_segmentation.py
+++ b/src/py/crankshaft/test/test_segmentation.py
@@ -1,26 +1,32 @@
+"""Tests for segmentation functionality"""
 import unittest
+import json
+from collections import OrderedDict
+
 import numpy as np
-from helper import plpy, fixture_file
+
 from crankshaft.analysis_data_provider import AnalysisDataProvider
 from crankshaft.segmentation import Segmentation
-from mock_plpy import MockCursor
-import json
-from collections import OrderedDict
+from .fixtures import fixture_file


 class RawDataProvider(AnalysisDataProvider):
+    """Data Provider to overwrite the default SQL provider"""
     def __init__(self, data, model, predict):
         self.data = data
         self.model = model
         self.predict = predict

-    def get_segmentation_data(self, params):
+    def get_segmentation_data(self, params):  # pylint: disable=unused-argument
+        """return data"""
         return self.data

-    def get_segmentation_model_data(self, params):
+    def get_segmentation_model_data(self, params):  # pylint: disable=W0613
+        """return model data"""
         return self.model

-    def get_segmentation_predict_data(self, params):
+    def get_segmentation_predict_data(self, params):  # pylint: disable=W0613
+        """return predict data"""
         return self.predict


@@ -28,39 +34,43 @@ class SegmentationTest(unittest.TestCase):
     """Testing class for Segmentation functions"""

     def setUp(self):
-        plpy._reset()
-        self.params = {"query": 'SELECT * FROM segmentation_data',
-                       "variable": 'price',
-                       "feature_columns": ['m1', 'm2', 'm3', 'm4', 'm5', 'm6'],
-                       "target_query": 'SELECT * FROM segmentation_result',
-                       "id_col": 'cartodb_id',
-                       "model_params": {'n_estimators': 1200,
-                                        'max_depth': 3,
-                                        'subsample': 0.5,
-                                        'learning_rate': 0.01,
-                                        'min_samples_leaf': 1}
-                       }
+        self.params = {
+            "query": 'SELECT * FROM segmentation_data',
+            "variable": 'price',
+            "feature_columns": ['m1', 'm2', 'm3', 'm4', 'm5', 'm6'],
+            "target_query": 'SELECT * FROM segmentation_result',
+            "id_col": 'cartodb_id',
+            "model_params": {
+                'n_estimators': 1200,
+                'max_depth': 3,
+                'subsample': 0.5,
+                'learning_rate': 0.01,
+                'min_samples_leaf': 1
+            }
+        }
         self.model_data = json.loads(
-                open(fixture_file('model_data.json')).read())
+            open(fixture_file('model_data.json')).read())
         self.data = json.loads(
-                open(fixture_file('data.json')).read())
+            open(fixture_file('data.json')).read())
         self.predict_data = json.loads(
-                open(fixture_file('predict_data.json')).read())
+            open(fixture_file('predict_data.json')).read())
         self.result_seg = json.loads(
-                open(fixture_file('segmentation_result.json')).read())
+            open(fixture_file('segmentation_result.json')).read())
         self.true_result = json.loads(
-                open(fixture_file('true_result.json')).read())
+            open(fixture_file('true_result.json')).read())

     def test_replace_nan_with_mean(self):
+        """test segmentation.test_replace_nan_with_mean"""
         from crankshaft.segmentation import replace_nan_with_mean
-        from numpy.testing import assert_array_equal
+        # from numpy.testing import assert_array_equal
         test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan])
         result = replace_nan_with_mean(test_array, means=None)[0]
         expectation = np.array([1.2, 2.2, 3.2, 2.2, 2.2], dtype=float)
-        assert_array_equal(result, expectation)
+        self.assertItemsEqual(result, expectation)
+        # assert_array_equal(result, expectation)

     def test_create_and_predict_segment(self):
-
+        """test segmentation.test_create_and_predict"""
         from crankshaft.segmentation import replace_nan_with_mean
         batch_size = 1000
         results = []
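One behavioral detail in the hunk above: swapping numpy's assert_array_equal for assertItemsEqual relaxes the check from element-by-element, order-sensitive equality to unordered, count-sensitive equality. assertItemsEqual is the Python 2 unittest spelling; Python 3 renamed it assertCountEqual. A standalone sketch of the idiom:

import unittest

class OrderInsensitive(unittest.TestCase):
    def test_items(self):
        # Passes because both sides contain the same values with the
        # same multiplicities, regardless of order.
        self.assertItemsEqual([1.2, 2.2, 3.2], [3.2, 1.2, 2.2])

if __name__ == '__main__':
    unittest.main()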
@@ -70,13 +80,13 @@ class SegmentationTest(unittest.TestCase):
                                 for col in feature_columns]).astype(float)
         target_mean = replace_nan_with_mean(target[0])[1]
         feature_means = replace_nan_with_mean(feat)[1]
-        '''
-        data_model = [OrderedDict([('target', target),
-                                   ('features', feat),
-                                   ('target_mean', target_mean),
-                                   ('feature_means', feature_means),
-                                   ('feature_columns', feature_columns)])]
-        '''
+
+        # data_model is of the form:
+        #  [OrderedDict([('target', target),
+        #                ('features', feat),
+        #                ('target_mean', target_mean),
+        #                ('feature_means', feature_means),
+        #                ('feature_columns', feature_columns)])]
         data_model = self.model_data
         cursor = self.predict_data
         batch = []
@@ -91,39 +101,40 @@ class SegmentationTest(unittest.TestCase):
                         for d in self.predict_data]
         data_predict = MockCursor(data_predict)

-        model_parameters = {'n_estimators': 1200,
-                            'max_depth': 3,
-                            'subsample': 0.5,
-                            'learning_rate': 0.01,
-                            'min_samples_leaf': 1}
+        model_parameters = {
+            'n_estimators': 1200,
+            'max_depth': 3,
+            'subsample': 0.5,
+            'learning_rate': 0.01,
+            'min_samples_leaf': 1
+        }

         data = [OrderedDict([('ids', d['ids'])])
                 for d in self.data]

         seg = Segmentation(RawDataProvider(data, data_model,
                                            data_predict))

-        result = seg.create_and_predict_segment('select * from \
-                                                segmentation_test',
-                                                'x_value',
-                                                ['m1', 'm2'],
-                                                'select * from \
-                                                segmentation_result',
-                                                model_parameters,
-                                                id_col='cartodb_id')
+        result = seg.create_and_predict_segment(
+            'select * from segmentation_test',
+            'x_value',
+            ['m1', 'm2'],
+            'select * from segmentation_result',
+            model_parameters,
+            id_col='cartodb_id')
         results = [(row[1], row[2]) for row in result]
         zipped_values = zip(results, self.result_seg)
         pre_res = [r[0] for r in self.true_result]
         acc_res = [r[1] for r in self.result_seg]
-        '''
+
+        # test values
         for ([res_pre, res_acc], [exp_pre, exp_acc]) in zipped_values:
             self.assertAlmostEqual(res_pre, exp_pre)
             self.assertEqual(res_acc, exp_acc)
-        '''
+
         prediction = [r[0] for r in results]

         accuracy = np.sqrt(np.mean(np.square(np.array(prediction) -
                            np.array(pre_res))))

         self.assertEqual(len(results), len(self.result_seg))
-        self.assertTrue(accuracy < 0.3*np.mean(pre_res))
+        self.assertTrue(accuracy < 0.3 * np.mean(pre_res))
         self.assertTrue(results[0][1] < 0.01)
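Patch 19 above turns the stray triple-quoted block into a comment documenting the shape of data_model rows. For reference, a hypothetical row built to that description — the values are invented for illustration; the real rows come from model_data.json:

from collections import OrderedDict

import numpy as np

target = [np.array([1.0, 2.0, 3.0])]                    # one array per batch
feat = np.array([[0.1, 1.0], [0.2, 2.0], [0.3, 3.0]])  # NSamples x NFeatures

data_model = [OrderedDict([
    ('target', target),
    ('features', feat),
    ('target_mean', target[0].mean()),
    ('feature_means', feat.mean(axis=0)),
    ('feature_columns', ['m1', 'm2']),
])]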
From 6fd3fbcbb7dcd2e695a5fd4418f68467ec5b88a5 Mon Sep 17 00:00:00 2001
From: Andy Eschbacher
Date: Thu, 11 Jan 2018 15:23:35 -0500
Subject: [PATCH 20/22] adds more tests

---
 src/py/crankshaft/test/test_segmentation.py | 25 ++++++++++-----------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/src/py/crankshaft/test/test_segmentation.py b/src/py/crankshaft/test/test_segmentation.py
index 20d87f4..150dbe7 100644
--- a/src/py/crankshaft/test/test_segmentation.py
+++ b/src/py/crankshaft/test/test_segmentation.py
@@ -7,7 +7,8 @@ import numpy as np

 from crankshaft.analysis_data_provider import AnalysisDataProvider
 from crankshaft.segmentation import Segmentation
-from .fixtures import fixture_file
+from helper import fixture_file
+from mock_plpy import MockCursor


 class RawDataProvider(AnalysisDataProvider):
@@ -62,23 +63,18 @@ class SegmentationTest(unittest.TestCase):
     def test_replace_nan_with_mean(self):
         """test segmentation.test_replace_nan_with_mean"""
         from crankshaft.segmentation import replace_nan_with_mean
-        # from numpy.testing import assert_array_equal
         test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan])
         result = replace_nan_with_mean(test_array, means=None)[0]
         expectation = np.array([1.2, 2.2, 3.2, 2.2, 2.2], dtype=float)
         self.assertItemsEqual(result, expectation)
-        # assert_array_equal(result, expectation)

     def test_create_and_predict_segment(self):
         """test segmentation.test_create_and_predict"""
         from crankshaft.segmentation import replace_nan_with_mean
-        batch_size = 1000
         results = []
         feature_columns = ['m1', 'm2']
-        target = [d['target'] for d in self.model_data]
         feat = np.column_stack([np.array(self.model_data[0][col])
                                 for col in feature_columns]).astype(float)
-        target_mean = replace_nan_with_mean(target[0])[1]
         feature_means = replace_nan_with_mean(feat)[1]

         # data_model is of the form:
@@ -115,10 +111,10 @@ class SegmentationTest(unittest.TestCase):
                                            data_predict))

         result = seg.create_and_predict_segment(
-            'select * from segmentation_test',
+            'SELECT * FROM segmentation_test',
             'x_value',
             ['m1', 'm2'],
-            'select * from segmentation_result',
+            'SELECT * FROM segmentation_result',
             model_parameters,
             id_col='cartodb_id')
         results = [(row[1], row[2]) for row in result]
@@ -127,13 +123,16 @@ class SegmentationTest(unittest.TestCase):
         acc_res = [r[1] for r in self.result_seg]

         # test values
-        for ([res_pre, res_acc], [exp_pre, exp_acc]) in zipped_values:
-            self.assertAlmostEqual(res_pre, exp_pre)
-            self.assertEqual(res_acc, exp_acc)
+        for (res_pre, _), (exp_pre, _) in zipped_values:
+            diff = abs(res_pre - exp_pre) / np.mean([res_pre, exp_pre])
+            self.assertTrue(diff <= 0.05, msg='diff: {}'.format(diff))
+            diff = abs(res_pre - exp_pre) / np.mean([res_pre, exp_pre])
+            self.assertTrue(diff <= 0.05, msg='diff: {}'.format(diff))
         prediction = [r[0] for r in results]

-        accuracy = np.sqrt(np.mean(np.square(np.array(prediction) -
-                           np.array(pre_res))))
+        accuracy = np.sqrt(np.mean(
+            (np.array(prediction) - np.array(pre_res))**2
+        ))

         self.assertEqual(len(results), len(self.result_seg))
         self.assertTrue(accuracy < 0.3 * np.mean(pre_res))
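Patch 20 replaces exact-equality assertions with a relative-difference check and rewrites the accuracy as an explicit root-mean-square error. The two idioms in isolation — a standalone sketch with made-up numbers, not the test's fixture data:

import numpy as np

prediction = np.array([4.57, 1.80, 1.02])
expected = np.array([4.5656, 1.7928, 1.0283])

# Per-row relative difference, as in the updated assertion loop:
for pred, exp in zip(prediction, expected):
    diff = abs(pred - exp) / np.mean([pred, exp])
    assert diff <= 0.05, 'diff: {}'.format(diff)

# RMSE, as in the rewritten accuracy calculation:
rmse = np.sqrt(np.mean((prediction - expected) ** 2))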
From 229e4671f72a561150eecc2f1e98356b8c1403e7 Mon Sep 17 00:00:00 2001
From: Andy Eschbacher
Date: Wed, 14 Mar 2018 12:14:17 -0400
Subject: [PATCH 21/22] pep8 syntax

---
 src/pg/sql/05_segmentation.sql | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/src/pg/sql/05_segmentation.sql b/src/pg/sql/05_segmentation.sql
index aa69bc8..a93f3e8 100644
--- a/src/pg/sql/05_segmentation.sql
+++ b/src/pg/sql/05_segmentation.sql
@@ -50,12 +50,23 @@ RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC)
 AS $$
     from crankshaft.segmentation import Segmentation
     seg = Segmentation()
-    model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf}
-    return seg.create_and_predict_segment(query,variable_name,target_table, model_params)
+    model_params = {
+        'n_estimators': n_estimators,
+        'max_depth': max_depth,
+        'subsample': subsample,
+        'learning_rate': learning_rate,
+        'min_samples_leaf': min_samples_leaf
+    }
+    return seg.create_and_predict_segment(
+        query,
+        variable_name,
+        target_table,
+        model_params
+    )
 $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;

 CREATE OR REPLACE FUNCTION
-  CDB_CreateAndPredictSegment (
+  CDB_CreateAndPredictSegment(
       query TEXT,
       variable TEXT,
       feature_columns TEXT[],
@@ -69,6 +80,18 @@ RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC)
 AS $$
     from crankshaft.segmentation import Segmentation
     seg = Segmentation()
-    model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf}
-    return seg.create_and_predict_segment(query, variable, feature_columns, target_query, model_params)
+    model_params = {
+        'n_estimators': n_estimators,
+        'max_depth': max_depth,
+        'subsample': subsample,
+        'learning_rate': learning_rate,
+        'min_samples_leaf': min_samples_leaf
+    }
+    return seg.create_and_predict_segment(
+        query,
+        variable,
+        feature_columns,
+        target_query,
+        model_params
+    )
 $$ LANGUAGE plpythonu VOLATILE PARALLEL UNSAFE;
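The model_params dict that both PL/Python bodies assemble is handed through to scikit-learn's gradient boosting regressor, where it is consumed as keyword arguments. In plain Python terms, with the same parameter values the patch uses (a sketch of the round trip, not a quote from the module):

from sklearn.ensemble import GradientBoostingRegressor

model_params = {
    'n_estimators': 1200,
    'max_depth': 3,
    'subsample': 0.5,
    'learning_rate': 0.01,
    'min_samples_leaf': 1,
}
model = GradientBoostingRegressor(**model_params)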
From 6260a080a698f39a07fb6e35a3bc3eb75e3fd593 Mon Sep 17 00:00:00 2001
From: Andy Eschbacher
Date: Thu, 15 Mar 2018 11:41:47 -0400
Subject: [PATCH 22/22] adds fuller test suite for segmentation

---
 src/pg/sql/05_segmentation.sql                |  21 ++-
 src/pg/test/expected/06_segmentation_test.out |  50 ++++++
 src/pg/test/fixtures/ml_values.sql            |   4 +-
 src/pg/test/sql/06_segmentation_test.sql      | 160 +++++++++++++++---
 .../crankshaft/segmentation/segmentation.py   |  16 +-
 5 files changed, 207 insertions(+), 44 deletions(-)

diff --git a/src/pg/sql/05_segmentation.sql b/src/pg/sql/05_segmentation.sql
index a93f3e8..4a0cfa0 100644
--- a/src/pg/sql/05_segmentation.sql
+++ b/src/pg/sql/05_segmentation.sql
@@ -25,19 +25,20 @@ AS $$

     def unpack2D(data):
         dimension = data.pop(0)
-        a = np.array(data, dtype=float)
-        return a.reshape(len(a)/dimension, dimension)
+        a = np.array(data, dtype=np.float64)
+        return a.reshape(int(len(a)/dimension), int(dimension))

-    return seg.create_and_predict_segment_agg(np.array(target, dtype=float),
-                                              unpack2D(features),
-                                              unpack2D(target_features),
-                                              target_ids,
-                                              model_params)
+    return seg.create_and_predict_segment_agg(
+        np.array(target, dtype=np.float64),
+        unpack2D(features),
+        unpack2D(target_features),
+        target_ids,
+        model_params)

 $$ LANGUAGE plpythonu VOLATILE PARALLEL RESTRICTED;

 CREATE OR REPLACE FUNCTION
-  CDB_CreateAndPredictSegment (
+  CDB_CreateAndPredictSegment(
       query TEXT,
       variable_name TEXT,
       target_table TEXT,
@@ -57,9 +58,13 @@ AS $$
         'learning_rate': learning_rate,
         'min_samples_leaf': min_samples_leaf
     }
+    feature_cols = set(plpy.execute('''
+        select * from ({query}) as _w limit 0
+    '''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ])
     return seg.create_and_predict_segment(
         query,
         variable_name,
+        feature_cols,
         target_table,
         model_params
     )
diff --git a/src/pg/test/expected/06_segmentation_test.out b/src/pg/test/expected/06_segmentation_test.out
index a4c17a9..227a6c0 100644
--- a/src/pg/test/expected/06_segmentation_test.out
+++ b/src/pg/test/expected/06_segmentation_test.out
@@ -25,3 +25,53 @@ t
 t
 t
 (20 rows)
+_cdb_random_seeds
+
+(1 row)
+within_tolerance
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+(20 rows)
+_cdb_random_seeds
+
+(1 row)
+within_tolerance
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+t
+(20 rows)
diff --git a/src/pg/test/fixtures/ml_values.sql b/src/pg/test/fixtures/ml_values.sql
index c87a10f..59fffd5 100644
--- a/src/pg/test/fixtures/ml_values.sql
+++ b/src/pg/test/fixtures/ml_values.sql
@@ -1,7 +1,7 @@
 SET client_min_messages TO WARNING;
 \set ECHO none
-CREATE TABLE ml_values (cartodb_id integer, target float, the_geom geometry, x1 float , x2 float, x3 float, class text);
-INSERT INTO ml_values(cartodb_id, target,x1,x2,x3, class) VALUES
+CREATE TABLE ml_values (cartodb_id integer, target float, the_geom geometry, x1 float, x2 float, x3 float, class text);
+INSERT INTO ml_values(cartodb_id, target, x1, x2, x3, class) VALUES
 (0,1.24382137034,0.811403626309,0.657584780869,0,'train'),
 (1,1.72727475342,0.447764244847,0.528687533966,1,'train'),
 (2,3.32104694099,0.62774565606,0.832647155118,2,'train'),
diff --git a/src/pg/test/sql/06_segmentation_test.sql b/src/pg/test/sql/06_segmentation_test.sql
index 2675422..e2aa51d 100644
--- a/src/pg/test/sql/06_segmentation_test.sql
+++ b/src/pg/test/sql/06_segmentation_test.sql
@@ -3,31 +3,141 @@
 \i test/fixtures/ml_values.sql
 SELECT cdb_crankshaft._cdb_random_seeds(1234);

+-- second version (query, not specifying features)
 WITH expected AS (
-    SELECT generate_series(1000,1020) AS id, unnest(ARRAY[
-        4.5656517130822492,
-        1.7928053473230694,
-        1.0283378773916563,
-        2.6586517814904593,
-        2.9699056242935944,
-        3.9550646059951347,
-        4.1662572444459745,
-        3.8126334839264162,
-        1.8809821053623488,
-        1.6349065129019873,
-        3.0391288591472954,
-        3.3035970359672553,
-        1.5835471589451968,
-        3.7530378537263638,
-        1.0833589653009252,
-        3.8104965452882897,
-        2.665217959294802,
-        1.5850334252802472,
-        3.679401198805563,
-        3.5332033186588636
-    ]) AS expected LIMIT 20
+    SELECT
+        generate_series(1000, 1020) AS id,
+        unnest(ARRAY[4.5656517130822492,
+                     1.7928053473230694,
+                     1.0283378773916563,
+                     2.6586517814904593,
+                     2.9699056242935944,
+                     3.9550646059951347,
+                     4.1662572444459745,
+                     3.8126334839264162,
+                     1.8809821053623488,
+                     1.6349065129019873,
+                     3.0391288591472954,
+                     3.3035970359672553,
+                     1.5835471589451968,
+                     3.7530378537263638,
+                     1.0833589653009252,
+                     3.8104965452882897,
+                     2.665217959294802,
+                     1.5850334252802472,
+                     3.679401198805563,
+                     3.5332033186588636 ]) AS expected
+    LIMIT 20
+), training as (
+    SELECT
+        array_agg(target)::numeric[] as target,
+        cdb_crankshaft.CDB_PyAgg(Array[x1, x2, x3]::numeric[]) as features
+    FROM (SELECT * FROM ml_values ORDER BY cartodb_id asc) as _w
+    WHERE class = 'train'
+), testing As (
+    SELECT
+        cdb_crankshaft.CDB_PyAgg(Array[x1, x2, x3]::numeric[]) as features,
+        array_agg(cartodb_id)::numeric[] as cartodb_ids
+    FROM (SELECT * FROM ml_values ORDER BY cartodb_id asc) as _w
+    WHERE class = 'test'
 ), prediction AS (
-    SELECT cartodb_id::integer id, prediction
-    FROM cdb_crankshaft.CDB_CreateAndPredictSegment('SELECT target, x1, x2, x3 FROM ml_values WHERE class = $$train$$','target', Array['x1', 'x2', 'x3'], 'SELECT cartodb_id, x1, x2, x3 FROM ml_values WHERE class = $$test$$')
+    SELECT
+        *
+    FROM
+        cdb_crankshaft.CDB_CreateAndPredictSegment(
+            (SELECT target FROM training),
+            (SELECT features FROM training),
+            (SELECT features FROM testing),
+            (SELECT cartodb_ids FROM testing)
+        )
+)
+SELECT
+    abs(e.expected - p.prediction) <= 1e-1 AS within_tolerance
+FROM expected e, prediction p
+WHERE e.id = p.cartodb_id
+LIMIT 20;
+SELECT cdb_crankshaft._cdb_random_seeds(1234);
+
+-- second version (query, not specifying features)
+WITH expected AS (
+    SELECT
+        generate_series(1000, 1020) AS id,
+        unnest(ARRAY[4.5656517130822492,
+                     1.7928053473230694,
+                     1.0283378773916563,
+                     2.6586517814904593,
+                     2.9699056242935944,
+                     3.9550646059951347,
+                     4.1662572444459745,
+                     3.8126334839264162,
+                     1.8809821053623488,
+                     1.6349065129019873,
+                     3.0391288591472954,
+                     3.3035970359672553,
+                     1.5835471589451968,
+                     3.7530378537263638,
+                     1.0833589653009252,
+                     3.8104965452882897,
+                     2.665217959294802,
+                     1.5850334252802472,
+                     3.679401198805563,
+                     3.5332033186588636 ]) AS expected
+    LIMIT 20
+), prediction AS (
+    SELECT
+        cartodb_id::integer id,
+        prediction
+    FROM cdb_crankshaft.CDB_CreateAndPredictSegment(
+        'SELECT target, x1, x2, x3 FROM ml_values WHERE class = $$train$$ ORDER BY cartodb_id asc',
+        'target',
+        'SELECT cartodb_id, x1, x2, x3 FROM ml_values WHERE class = $$test$$ ORDER BY cartodb_id asc'
+    )
+    LIMIT 20
+)
+SELECT
+    abs(e.expected - p.prediction) <= 1e-1 AS within_tolerance
+FROM expected e, prediction p
+WHERE e.id = p.id;
+
+SELECT cdb_crankshaft._cdb_random_seeds(1234);
+-- third version (query, specifying features)
+WITH expected AS (
+    SELECT
+        generate_series(1000, 1020) AS id,
+        unnest(ARRAY[4.5656517130822492,
+                     1.7928053473230694,
+                     1.0283378773916563,
+                     2.6586517814904593,
+                     2.9699056242935944,
+                     3.9550646059951347,
+                     4.1662572444459745,
+                     3.8126334839264162,
+                     1.8809821053623488,
+                     1.6349065129019873,
+                     3.0391288591472954,
+                     3.3035970359672553,
+                     1.5835471589451968,
+                     3.7530378537263638,
+                     1.0833589653009252,
+                     3.8104965452882897,
+                     2.665217959294802,
+                     1.5850334252802472,
+                     3.679401198805563,
+                     3.5332033186588636 ]) AS expected
+    LIMIT 20
+), prediction AS (
+    SELECT
+        cartodb_id::integer id,
+        prediction
+    FROM cdb_crankshaft.CDB_CreateAndPredictSegment(
+        'SELECT target, x1, x2, x3 FROM ml_values WHERE class = $$train$$',
+        'target',
+        Array['x1', 'x2', 'x3'],
+        'SELECT cartodb_id, x1, x2, x3 FROM ml_values WHERE class = $$test$$'
+    )
+    )
 LIMIT 20
-) SELECT abs(e.expected - p.prediction) <= 1e-1 AS within_tolerance FROM expected e, prediction p WHERE e.id = p.id;
+)
+SELECT
+    abs(e.expected - p.prediction) <= 1e-1 AS within_tolerance
+FROM expected e, prediction p
+WHERE e.id = p.id;
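The first test above feeds CDB_PyAgg-packed arrays into the function, and the companion change to 05_segmentation.sql earlier in this patch unpacks them with unpack2D, whose convention is that the leading element carries the row width. A standalone sketch mirroring the patched helper (the example input is invented):

import numpy as np

def unpack2D(data):
    # Mirror of the patched helper: data[0] is the row width.
    dimension = data.pop(0)
    a = np.array(data, dtype=np.float64)
    # int() casts keep reshape valid when '/' yields a float (Python 3)
    return a.reshape(int(len(a) / dimension), int(dimension))

matrix = unpack2D([3, 0.81, 0.65, 0.0, 0.44, 0.52, 1.0])
# -> [[0.81, 0.65, 0.0], [0.44, 0.52, 1.0]]  (2 rows x 3 features)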
diff --git a/src/py/crankshaft/crankshaft/segmentation/segmentation.py b/src/py/crankshaft/crankshaft/segmentation/segmentation.py
index 613fca6..10b9a84 100644
--- a/src/py/crankshaft/crankshaft/segmentation/segmentation.py
+++ b/src/py/crankshaft/crankshaft/segmentation/segmentation.py
@@ -29,26 +29,25 @@ class Segmentation(object):
         straight form the SQL calling the function.

         Input:
-        @param target: The 1D array of lenth NSamples containing the
+        @param target: The 1D array of length NSamples containing the
             target variable we want the model to predict
         @param features: The 2D array of size NSamples * NFeatures that
-            form the imput to the model
+            form the input to the model
         @param target_ids: A 1D array of target_ids that will be used to
             associate the results of the prediction with the rows which
             they come from
         @param model_parameters: A dictionary containing parameters for
             the model.
         """
-        clean_target = replace_nan_with_mean(target)
-        clean_features = replace_nan_with_mean(features)
-        target_features = replace_nan_with_mean(target_features)
+        clean_target, _ = replace_nan_with_mean(target)
+        clean_features, _ = replace_nan_with_mean(features)
+        target_features, _ = replace_nan_with_mean(target_features)

         model, accuracy = train_model(clean_target, clean_features,
                                       model_parameters, 0.2)
         prediction = model.predict(target_features)
         accuracy_array = [accuracy] * prediction.shape[0]
-        return zip(target_ids, prediction,
-                   np.full(prediction.shape, accuracy_array))
+        return zip(target_ids, prediction, accuracy_array)

     def create_and_predict_segment(self, query, variable, feature_columns,
                                    target_query, model_params,
@@ -65,7 +64,6 @@ class Segmentation(object):
             scikit learn page for [GradientBoostingRegressor]
             (http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
         """
-
         params = {"subquery": target_query,
                   "id_col": id_col}

@@ -198,7 +196,7 @@ def train_model(target, features, model_params, test_split):
     Input:
         @param target: 1D Array of the variable that the model is to be
             trained to predict
-        @param features: 2D Array NSamples *NFeatures to use in trining
+        @param features: 2D Array NSamples *NFeatures to use in training
            the model
        @param model_params: A dictionary of model parameters, the full
            specification can be found on the