debugging
This commit is contained in:
parent
d7bccc1063
commit
8c5449cfd0
@ -48,16 +48,17 @@ CREATE OR REPLACE FUNCTION
|
|||||||
min_samples_leaf INTEGER DEFAULT 1)
|
min_samples_leaf INTEGER DEFAULT 1)
|
||||||
RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC)
|
RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC)
|
||||||
AS $$
|
AS $$
|
||||||
from crankshaft.segmentation import create_and_predict_segment
|
from crankshaft.segmentation import Segmentation
|
||||||
|
seg = Segmentation()
|
||||||
model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf}
|
model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf}
|
||||||
return create_and_predict_segment(query,variable_name,target_table, model_params)
|
return seg.create_and_predict_segment(query,variable_name,target_table, model_params)
|
||||||
$$ LANGUAGE plpythonu;
|
$$ LANGUAGE plpythonu;
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION
|
CREATE OR REPLACE FUNCTION
|
||||||
CDB_CreateAndPredictSegment (
|
CDB_CreateAndPredictSegment (
|
||||||
query TEXT,
|
query TEXT,
|
||||||
variable_name TEXT,
|
variable_name TEXT,
|
||||||
target_table TEXT,
|
target_query TEXT,
|
||||||
feature_columns TEXT[],
|
feature_columns TEXT[],
|
||||||
n_estimators INTEGER DEFAULT 1200,
|
n_estimators INTEGER DEFAULT 1200,
|
||||||
max_depth INTEGER DEFAULT 3,
|
max_depth INTEGER DEFAULT 3,
|
||||||
@ -69,5 +70,5 @@ AS $$
|
|||||||
from crankshaft.segmentation import Segmentation
|
from crankshaft.segmentation import Segmentation
|
||||||
seg = Segmentation()
|
seg = Segmentation()
|
||||||
model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf}
|
model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf}
|
||||||
return seg.create_and_predict_segment(query,variable_name,target_table, model_params)
|
return seg.create_and_predict_segment(query, variable_name, feature_columns, target_query, model_params)
|
||||||
$$ LANGUAGE plpythonu;
|
$$ LANGUAGE plpythonu;
|
||||||
|
@ -70,8 +70,7 @@ class Segmentation(object):
|
|||||||
params = {"subquery": target_query,
|
params = {"subquery": target_query,
|
||||||
"id_col": id_col}
|
"id_col": id_col}
|
||||||
|
|
||||||
target, features, target_mean, \
|
target, features, target_mean, feature_means = self.clean_data(variable, feature_columns, query)
|
||||||
feature_means = self.clean_data(variable, feature_columns, query)
|
|
||||||
|
|
||||||
model, accuracy = train_model(target, features, model_params, 0.2)
|
model, accuracy = train_model(target, features, model_params, 0.2)
|
||||||
result = self.predict_segment(model, feature_columns, target_query,
|
result = self.predict_segment(model, feature_columns, target_query,
|
||||||
@ -142,8 +141,6 @@ class Segmentation(object):
|
|||||||
]
|
]
|
||||||
'''
|
'''
|
||||||
|
|
||||||
[{target: [dsdfs]}]
|
|
||||||
|
|
||||||
# extract target data from plpy object
|
# extract target data from plpy object
|
||||||
target = np.array(data[0]['target'])
|
target = np.array(data[0]['target'])
|
||||||
|
|
||||||
|
@ -7,26 +7,37 @@ import json
|
|||||||
|
|
||||||
|
|
||||||
class RawDataProvider(AnalysisDataProvider):
|
class RawDataProvider(AnalysisDataProvider):
|
||||||
def __init__(self, raw_data1, raw_data2, raw_data3):
|
def __init__(self, test, train, predict):
|
||||||
self.raw_data1 = raw_data1
|
self.test = test
|
||||||
self.raw_data2 = raw_data2
|
self.train = train
|
||||||
self.raw_data3 = raw_data3
|
self.predict = predict
|
||||||
|
|
||||||
def get_segmentation_data(self, params):
|
def get_segmentation_data(self, params):
|
||||||
return self.raw_data1
|
return self.test
|
||||||
|
|
||||||
def get_segmentation_predict_data(self, params):
|
def get_segmentation_predict_data(self, params):
|
||||||
return self.raw_data2
|
return self.train
|
||||||
|
|
||||||
def get_segmentation_model_data(self, params):
|
def get_segmentation_model_data(self, params):
|
||||||
return self.raw_data3
|
return self.predict
|
||||||
|
|
||||||
|
|
||||||
class SegmentationTest(unittest.TestCase):
|
class SegmentationTest(unittest.TestCase):
|
||||||
"""Testing class for Moran's I functions"""
|
"""Testing class for Segmentation functions"""
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
plpy._reset()
|
plpy._reset()
|
||||||
|
self.params = {"query": 'SELECT * FROM seg_test',
|
||||||
|
"variable": 'price',
|
||||||
|
"feature_columns": ['m1', 'm2', 'm3'],
|
||||||
|
"target_query": 'SELECT * FROM seg_test_target',
|
||||||
|
"id_col": 'cartodb_id',
|
||||||
|
"model_params": {'n_estimators': 1200,
|
||||||
|
'max_depth': 3,
|
||||||
|
'subsample': 0.5,
|
||||||
|
'learning_rate': 0.01,
|
||||||
|
'min_samples_leaf': 1}
|
||||||
|
}
|
||||||
|
|
||||||
def generate_random_data(self, n_samples, random_state, row_type=False):
|
def generate_random_data(self, n_samples, random_state, row_type=False):
|
||||||
x1 = random_state.uniform(size=n_samples)
|
x1 = random_state.uniform(size=n_samples)
|
||||||
@ -39,42 +50,69 @@ class SegmentationTest(unittest.TestCase):
|
|||||||
if row_type:
|
if row_type:
|
||||||
return [{'features': vals} for vals in zip(x1, x2, x3)], y
|
return [{'features': vals} for vals in zip(x1, x2, x3)], y
|
||||||
else:
|
else:
|
||||||
return [dict(zip(['x1', 'x2', 'x3', 'target', 'cartodb_id'], [x1, x2, x3, y, cartodb_id]))]
|
return [dict(zip(['x1', 'x2', 'x3', 'target', 'cartodb_id'],
|
||||||
|
[x1, x2, x3, y, cartodb_id]))]
|
||||||
|
|
||||||
def test_replace_nan_with_mean(self):
|
def test_replace_nan_with_mean(self):
|
||||||
|
from crankshaft.segmentation import replace_nan_with_mean
|
||||||
|
from numpy.testing import assert_array_equal
|
||||||
test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan])
|
test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan])
|
||||||
|
result = replace_nan_with_mean(test_array)
|
||||||
|
expectation = np.array([1.2, 2.2, 3.2, 2.2, 2.2])
|
||||||
|
|
||||||
|
self.assertTrue(assert_array_equal(result, expectation))
|
||||||
|
|
||||||
def test_create_and_predict_segment(self):
|
def test_create_and_predict_segment(self):
|
||||||
|
from crankshaft.segmentation import Segmentation
|
||||||
|
from numpy.testing import assert_array_equal
|
||||||
|
|
||||||
n_samples = 1000
|
n_samples = 1000
|
||||||
|
|
||||||
random_state_train = np.random.RandomState(13)
|
random_state_train = np.random.RandomState(13)
|
||||||
random_state_test = np.random.RandomState(134)
|
random_state_test = np.random.RandomState(134)
|
||||||
training_data = self.generate_random_data(n_samples, random_state_train)
|
training_data = self.generate_random_data(n_samples,
|
||||||
test_data, test_y = self.generate_random_data(n_samples, random_state_test, row_type=True)
|
random_state_train)
|
||||||
|
test_data, test_y = self.generate_random_data(n_samples,
|
||||||
|
random_state_test,
|
||||||
|
row_type=True)
|
||||||
|
|
||||||
ids = [{'cartodb_ids': range(len(test_data))}]
|
ids = [{'cartodb_ids': range(len(test_data))}]
|
||||||
rows = [{'x1': 0, 'x2': 0, 'x3': 0, 'y': 0, 'cartodb_id': 0}]
|
|
||||||
|
|
||||||
plpy._define_result('select \* from \(select \* from training\) a limit 1', rows)
|
'''
|
||||||
plpy._define_result('.*from \(select \* from training\) as a', training_data)
|
rowid = [{'ids': [2.9, 4.9, 4, 5, 6]}]
|
||||||
plpy._define_result('select array_agg\(cartodb\_id order by cartodb\_id\) as cartodb_ids from \(.*\) a', ids)
|
'''
|
||||||
plpy._define_result('.*select \* from test.*', test_data)
|
rows = [{'x1': 0, 'x2': 0, 'x3': 0, 'y': 0, 'cartodb_id': 0}]
|
||||||
|
|
||||||
model_parameters = {'n_estimators': 1200,
|
model_parameters = {'n_estimators': 1200,
|
||||||
'max_depth': 3,
|
'max_depth': 3,
|
||||||
'subsample': 0.5,
|
'subsample': 0.5,
|
||||||
'learning_rate': 0.01,
|
'learning_rate': 0.01,
|
||||||
'min_samples_leaf': 1}
|
'min_samples_leaf': 1}
|
||||||
data = [{'target': [],
|
data = [{'query':
|
||||||
|
'target': [],
|
||||||
'x1': [],
|
'x1': [],
|
||||||
'x2': [],
|
'x2': [],
|
||||||
'x3': []}]
|
'x3': []}]
|
||||||
seg = Segmentation(RawDataProvider(test, train, predict))
|
'''
|
||||||
|
cursors = [{'features': [[m1[0],m2[0],m3[0]],[m1[1],m2[1],m3[1]],
|
||||||
|
[m1[2],m2[2],m3[2]]]}]
|
||||||
|
'''
|
||||||
|
data = Segmentation(RawDataProvider(test, train, predict))
|
||||||
'''
|
'''
|
||||||
self, query, variable, feature_columns,
|
self, query, variable, feature_columns,
|
||||||
target_query, model_params,
|
target_query, model_params,
|
||||||
id_col='cartodb_id'
|
id_col='cartodb_id'
|
||||||
'''
|
'''
|
||||||
|
'''
|
||||||
|
data = [{'target': [2.9, 4.9, 4, 5, 6]},
|
||||||
|
{'feature1': [1,2,3,4]}, {'feature2' : [2,3,4,5]}
|
||||||
|
]
|
||||||
|
'''
|
||||||
|
|
||||||
|
# Before this point, figure out how to set up the data provider
|
||||||
|
# Afterwards, use the data provider to run the query and test the results.
|
||||||
|
|
||||||
|
seg = Segmentation(data_provider=)
|
||||||
|
|
||||||
result = seg.create_and_predict_segment(
|
result = seg.create_and_predict_segment(
|
||||||
'select * from training',
|
'select * from training',
|
||||||
|
Loading…
Reference in New Issue
Block a user