debugging

This commit is contained in:
mehak-sachdeva 2017-02-01 19:16:32 -05:00
parent d7bccc1063
commit 8c5449cfd0
3 changed files with 62 additions and 26 deletions

View File

@ -48,16 +48,17 @@ CREATE OR REPLACE FUNCTION
min_samples_leaf INTEGER DEFAULT 1) min_samples_leaf INTEGER DEFAULT 1)
RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC) RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC)
AS $$ AS $$
from crankshaft.segmentation import create_and_predict_segment from crankshaft.segmentation import Segmentation
seg = Segmentation()
model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf} model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf}
return create_and_predict_segment(query,variable_name,target_table, model_params) return seg.create_and_predict_segment(query,variable_name,target_table, model_params)
$$ LANGUAGE plpythonu; $$ LANGUAGE plpythonu;
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment ( CDB_CreateAndPredictSegment (
query TEXT, query TEXT,
variable_name TEXT, variable_name TEXT,
target_table TEXT, target_query TEXT,
feature_columns TEXT[], feature_columns TEXT[],
n_estimators INTEGER DEFAULT 1200, n_estimators INTEGER DEFAULT 1200,
max_depth INTEGER DEFAULT 3, max_depth INTEGER DEFAULT 3,
@ -69,5 +70,5 @@ AS $$
from crankshaft.segmentation import Segmentation from crankshaft.segmentation import Segmentation
seg = Segmentation() seg = Segmentation()
model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf} model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf}
return seg.create_and_predict_segment(query,variable_name,target_table, model_params) return seg.create_and_predict_segment(query, variable_name, feature_columns, target_query, model_params)
$$ LANGUAGE plpythonu; $$ LANGUAGE plpythonu;

View File

@ -70,8 +70,7 @@ class Segmentation(object):
params = {"subquery": target_query, params = {"subquery": target_query,
"id_col": id_col} "id_col": id_col}
target, features, target_mean, \ target, features, target_mean, feature_means = self.clean_data(variable, feature_columns, query)
feature_means = self.clean_data(variable, feature_columns, query)
model, accuracy = train_model(target, features, model_params, 0.2) model, accuracy = train_model(target, features, model_params, 0.2)
result = self.predict_segment(model, feature_columns, target_query, result = self.predict_segment(model, feature_columns, target_query,
@ -142,8 +141,6 @@ class Segmentation(object):
] ]
''' '''
[{target: [dsdfs]}]
# extract target data from plpy object # extract target data from plpy object
target = np.array(data[0]['target']) target = np.array(data[0]['target'])

View File

@ -7,26 +7,37 @@ import json
class RawDataProvider(AnalysisDataProvider): class RawDataProvider(AnalysisDataProvider):
def __init__(self, raw_data1, raw_data2, raw_data3): def __init__(self, test, train, predict):
self.raw_data1 = raw_data1 self.test = test
self.raw_data2 = raw_data2 self.train = train
self.raw_data3 = raw_data3 self.predict = predict
def get_segmentation_data(self, params): def get_segmentation_data(self, params):
return self.raw_data1 return self.test
def get_segmentation_predict_data(self, params): def get_segmentation_predict_data(self, params):
return self.raw_data2 return self.train
def get_segmentation_model_data(self, params): def get_segmentation_model_data(self, params):
return self.raw_data3 return self.predict
class SegmentationTest(unittest.TestCase): class SegmentationTest(unittest.TestCase):
"""Testing class for Moran's I functions""" """Testing class for Segmentation functions"""
def setUp(self): def setUp(self):
plpy._reset() plpy._reset()
self.params = {"query": 'SELECT * FROM seg_test',
"variable": 'price',
"feature_columns": ['m1', 'm2', 'm3'],
"target_query": 'SELECT * FROM seg_test_target',
"id_col": 'cartodb_id',
"model_params": {'n_estimators': 1200,
'max_depth': 3,
'subsample': 0.5,
'learning_rate': 0.01,
'min_samples_leaf': 1}
}
def generate_random_data(self, n_samples, random_state, row_type=False): def generate_random_data(self, n_samples, random_state, row_type=False):
x1 = random_state.uniform(size=n_samples) x1 = random_state.uniform(size=n_samples)
@ -39,42 +50,69 @@ class SegmentationTest(unittest.TestCase):
if row_type: if row_type:
return [{'features': vals} for vals in zip(x1, x2, x3)], y return [{'features': vals} for vals in zip(x1, x2, x3)], y
else: else:
return [dict(zip(['x1', 'x2', 'x3', 'target', 'cartodb_id'], [x1, x2, x3, y, cartodb_id]))] return [dict(zip(['x1', 'x2', 'x3', 'target', 'cartodb_id'],
[x1, x2, x3, y, cartodb_id]))]
def test_replace_nan_with_mean(self): def test_replace_nan_with_mean(self):
from crankshaft.segmentation import replace_nan_with_mean
from numpy.testing import assert_array_equal
test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan]) test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan])
result = replace_nan_with_mean(test_array)
expectation = np.array([1.2, 2.2, 3.2, 2.2, 2.2])
self.assertTrue(assert_array_equal(result, expectation))
def test_create_and_predict_segment(self): def test_create_and_predict_segment(self):
from crankshaft.segmentation import Segmentation
from numpy.testing import assert_array_equal
n_samples = 1000 n_samples = 1000
random_state_train = np.random.RandomState(13) random_state_train = np.random.RandomState(13)
random_state_test = np.random.RandomState(134) random_state_test = np.random.RandomState(134)
training_data = self.generate_random_data(n_samples, random_state_train) training_data = self.generate_random_data(n_samples,
test_data, test_y = self.generate_random_data(n_samples, random_state_test, row_type=True) random_state_train)
test_data, test_y = self.generate_random_data(n_samples,
random_state_test,
row_type=True)
ids = [{'cartodb_ids': range(len(test_data))}] ids = [{'cartodb_ids': range(len(test_data))}]
rows = [{'x1': 0, 'x2': 0, 'x3': 0, 'y': 0, 'cartodb_id': 0}]
plpy._define_result('select \* from \(select \* from training\) a limit 1', rows) '''
plpy._define_result('.*from \(select \* from training\) as a', training_data) rowid = [{'ids': [2.9, 4.9, 4, 5, 6]}]
plpy._define_result('select array_agg\(cartodb\_id order by cartodb\_id\) as cartodb_ids from \(.*\) a', ids) '''
plpy._define_result('.*select \* from test.*', test_data) rows = [{'x1': 0, 'x2': 0, 'x3': 0, 'y': 0, 'cartodb_id': 0}]
model_parameters = {'n_estimators': 1200, model_parameters = {'n_estimators': 1200,
'max_depth': 3, 'max_depth': 3,
'subsample': 0.5, 'subsample': 0.5,
'learning_rate': 0.01, 'learning_rate': 0.01,
'min_samples_leaf': 1} 'min_samples_leaf': 1}
data = [{'target': [], data = [{'query':
'target': [],
'x1': [], 'x1': [],
'x2': [], 'x2': [],
'x3': []}] 'x3': []}]
seg = Segmentation(RawDataProvider(test, train, predict)) '''
cursors = [{'features': [[m1[0],m2[0],m3[0]],[m1[1],m2[1],m3[1]],
[m1[2],m2[2],m3[2]]]}]
'''
data = Segmentation(RawDataProvider(test, train, predict))
''' '''
self, query, variable, feature_columns, self, query, variable, feature_columns,
target_query, model_params, target_query, model_params,
id_col='cartodb_id' id_col='cartodb_id'
''' '''
'''
data = [{'target': [2.9, 4.9, 4, 5, 6]},
{'feature1': [1,2,3,4]}, {'feature2' : [2,3,4,5]}
]
'''
# Before here figure out how to set up the data provider
# After use data provider to run the query and test results.
seg = Segmentation(data_provider=)
result = seg.create_and_predict_segment( result = seg.create_and_predict_segment(
'select * from training', 'select * from training',