mirror of
https://github.com/CartoDB/crankshaft.git
synced 2024-11-01 10:20:48 +08:00
mitigating test failures
This commit is contained in:
parent
6b71822d08
commit
aa413a8d5a
@ -77,8 +77,6 @@ class AnalysisDataProvider(object):
|
|||||||
"target": variable,
|
"target": variable,
|
||||||
"features": feature_columns}
|
"features": feature_columns}
|
||||||
"""
|
"""
|
||||||
plpy.notice("featurecols: {}".format(str(params)))
|
|
||||||
|
|
||||||
columns = ', '.join(['array_agg("{col}") As "{col}"'.format(col=col)
|
columns = ', '.join(['array_agg("{col}") As "{col}"'.format(col=col)
|
||||||
for col in params['features']])
|
for col in params['features']])
|
||||||
query = '''
|
query = '''
|
||||||
@ -89,7 +87,6 @@ class AnalysisDataProvider(object):
|
|||||||
'''.format(subquery=params['subquery'],
|
'''.format(subquery=params['subquery'],
|
||||||
target=params['target'],
|
target=params['target'],
|
||||||
columns=columns)
|
columns=columns)
|
||||||
plpy.notice("Query: {}".format(query))
|
|
||||||
try:
|
try:
|
||||||
data = plpy.execute(query)
|
data = plpy.execute(query)
|
||||||
return data
|
return data
|
||||||
|
@ -39,7 +39,6 @@ class Segmentation(object):
|
|||||||
@param model_parameters: A dictionary containing parameters for
|
@param model_parameters: A dictionary containing parameters for
|
||||||
the model.
|
the model.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
clean_target = replace_nan_with_mean(target)
|
clean_target = replace_nan_with_mean(target)
|
||||||
clean_features = replace_nan_with_mean(features)
|
clean_features = replace_nan_with_mean(features)
|
||||||
target_features = replace_nan_with_mean(target_features)
|
target_features = replace_nan_with_mean(target_features)
|
||||||
@ -117,8 +116,6 @@ class Segmentation(object):
|
|||||||
batch = np.row_stack([np.array(row['features'])
|
batch = np.row_stack([np.array(row['features'])
|
||||||
for row in rows]).astype(float)
|
for row in rows]).astype(float)
|
||||||
|
|
||||||
# Need to fix this to global mean. This will cause weird effects
|
|
||||||
|
|
||||||
batch = replace_nan_with_mean(batch, feature_means)[0]
|
batch = replace_nan_with_mean(batch, feature_means)[0]
|
||||||
prediction = model.predict(batch)
|
prediction = model.predict(batch)
|
||||||
results.append(prediction)
|
results.append(prediction)
|
||||||
@ -161,32 +158,38 @@ def replace_nan_with_mean(array, means=None):
|
|||||||
Output:
|
Output:
|
||||||
array with nans filled in with the mean of the dataset
|
array with nans filled in with the mean of the dataset
|
||||||
"""
|
"""
|
||||||
# TODO: update code to take in avgs parameter
|
|
||||||
|
|
||||||
# returns an array of rows and column indices
|
# returns an array of rows and column indices
|
||||||
nanvals = np.isnan(array)
|
nanvals = np.isnan(array)
|
||||||
indices = np.where(nanvals)
|
indices = np.where(nanvals)
|
||||||
|
|
||||||
|
def loops(array, axis):
|
||||||
|
try:
|
||||||
|
return np.shape(array)[axis]
|
||||||
|
except IndexError:
|
||||||
|
return 1
|
||||||
|
ran = loops(array, 1)
|
||||||
|
|
||||||
if means is None:
|
if means is None:
|
||||||
means = {}
|
means = {}
|
||||||
|
|
||||||
def loops(array, axis):
|
|
||||||
try:
|
|
||||||
return np.shape(array)[axis]
|
|
||||||
except IndexError:
|
|
||||||
return 1
|
|
||||||
|
|
||||||
ran = loops(array, 1)
|
|
||||||
if ran == 1:
|
if ran == 1:
|
||||||
array = np.array(array)
|
array = np.array(array)
|
||||||
means[0] = np.mean(array[~np.isnan(array)])
|
means[0] = np.mean(array[~np.isnan(array)])
|
||||||
|
for row in zip(*indices):
|
||||||
|
array[row] = means[0]
|
||||||
else:
|
else:
|
||||||
for col in range(ran):
|
for col in range(ran):
|
||||||
means[col] = np.mean(array[~np.isnan(array[:, col]), col])
|
means[col] = np.mean(array[~np.isnan(array[:, col]), col])
|
||||||
|
for row, col in zip(*indices):
|
||||||
# iterate through entries which have nan values
|
array[row, col] = means[col]
|
||||||
for row, col in zip(*indices):
|
else:
|
||||||
array[row, col] = means[col]
|
if ran == 1:
|
||||||
|
for row in zip(*indices):
|
||||||
|
array[row] = means[0]
|
||||||
|
else:
|
||||||
|
for row, col in zip(*indices):
|
||||||
|
array[row, col] = means[col]
|
||||||
|
|
||||||
return array, means
|
return array, means
|
||||||
|
|
||||||
|
@ -15,10 +15,10 @@ class RawDataProvider(AnalysisDataProvider):
|
|||||||
def get_segmentation_data(self, params):
|
def get_segmentation_data(self, params):
|
||||||
return self.test
|
return self.test
|
||||||
|
|
||||||
def get_segmentation_predict_data(self, params):
|
def get_segmentation_model_data(self, params):
|
||||||
return self.train
|
return self.train
|
||||||
|
|
||||||
def get_segmentation_model_data(self, params):
|
def get_segmentation_predict_data(self, params):
|
||||||
return self.predict
|
return self.predict
|
||||||
|
|
||||||
|
|
||||||
@ -41,10 +41,14 @@ class SegmentationTest(unittest.TestCase):
|
|||||||
|
|
||||||
def generate_random_data(self, n_samples, random_state, row_type=False):
|
def generate_random_data(self, n_samples, random_state, row_type=False):
|
||||||
x1 = random_state.uniform(size=n_samples)
|
x1 = random_state.uniform(size=n_samples)
|
||||||
|
# x1 = np.random.rand(n_samples)
|
||||||
x2 = random_state.uniform(size=n_samples)
|
x2 = random_state.uniform(size=n_samples)
|
||||||
|
# x2 = np.random.rand(n_samples)
|
||||||
x3 = random_state.randint(0, 4, size=n_samples)
|
x3 = random_state.randint(0, 4, size=n_samples)
|
||||||
|
# x3 = np.random.rand(n_samples)
|
||||||
|
|
||||||
y = x1+x2*x2+x3
|
y = x1+x2*x2+x3
|
||||||
|
# y = 2*x1 + 1.5*x2 + 3.6*x3 + 8
|
||||||
cartodb_id = range(len(x1))
|
cartodb_id = range(len(x1))
|
||||||
|
|
||||||
if row_type:
|
if row_type:
|
||||||
@ -57,10 +61,11 @@ class SegmentationTest(unittest.TestCase):
|
|||||||
from crankshaft.segmentation import replace_nan_with_mean
|
from crankshaft.segmentation import replace_nan_with_mean
|
||||||
from numpy.testing import assert_array_equal
|
from numpy.testing import assert_array_equal
|
||||||
test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan])
|
test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan])
|
||||||
result = replace_nan_with_mean(test_array)
|
result = replace_nan_with_mean(test_array, means=None)[0]
|
||||||
expectation = np.array([1.2, 2.2, 3.2, 2.2, 2.2])
|
expectation = np.array([1.2, 2.2, 3.2, 2.2, 2.2], dtype=float)
|
||||||
|
print result
|
||||||
self.assertTrue(assert_array_equal(result, expectation))
|
print type(result)
|
||||||
|
assert_array_equal(result, expectation)
|
||||||
|
|
||||||
def test_create_and_predict_segment(self):
|
def test_create_and_predict_segment(self):
|
||||||
from numpy.testing import assert_array_equal
|
from numpy.testing import assert_array_equal
|
||||||
@ -87,16 +92,26 @@ class SegmentationTest(unittest.TestCase):
|
|||||||
'subsample': 0.5,
|
'subsample': 0.5,
|
||||||
'learning_rate': 0.01,
|
'learning_rate': 0.01,
|
||||||
'min_samples_leaf': 1}
|
'min_samples_leaf': 1}
|
||||||
data = [{'query': 'select * FROM research_team',
|
# print "train: {}".format(test_data)
|
||||||
'target': [],
|
# assert 1 == 2
|
||||||
'x1': [],
|
# select array_agg(target) as "target",
|
||||||
'x2': [],
|
# array_agg(x1) as "x1",
|
||||||
'x3': []}]
|
# etc.
|
||||||
|
feature_means = training_data[0]['x1'].mean()
|
||||||
|
target_mean = training_data[0]['target'].mean()
|
||||||
|
data_train = [{'target': training_data[0]['target'],
|
||||||
|
'x1': training_data[0]['x1'],
|
||||||
|
'x2': training_data[0]['x2'],
|
||||||
|
'x3': training_data[0]['x3']}]
|
||||||
|
|
||||||
|
data_test = [{'id_col': training_data[0]['cartodb_id']}]
|
||||||
|
|
||||||
|
data_predict = [{'feature_columns': test_data}]
|
||||||
'''
|
'''
|
||||||
cursors = [{'features': [[m1[0],m2[0],m3[0]],[m1[1],m2[1],m3[1]],
|
cursors = [{'features': [[m1[0],m2[0],m3[0]],[m1[1],m2[1],m3[1]],
|
||||||
[m1[2],m2[2],m3[2]]]}]
|
[m1[2],m2[2],m3[2]]]}]
|
||||||
'''
|
'''
|
||||||
data = Segmentation(RawDataProvider(test, train, predict))
|
# data = Segmentation(RawDataProvider(test, train, predict))
|
||||||
'''
|
'''
|
||||||
self, query, variable, feature_columns,
|
self, query, variable, feature_columns,
|
||||||
target_query, model_params,
|
target_query, model_params,
|
||||||
@ -107,22 +122,25 @@ class SegmentationTest(unittest.TestCase):
|
|||||||
{'feature1': [1,2,3,4]}, {'feature2' : [2,3,4,5]}
|
{'feature1': [1,2,3,4]}, {'feature2' : [2,3,4,5]}
|
||||||
]
|
]
|
||||||
'''
|
'''
|
||||||
|
print data_train
|
||||||
# Before here figure out how to set up the data provider
|
# Before here figure out how to set up the data provider
|
||||||
# After use data provider to run the query and test results.
|
# After use data provider to run the query and test results.
|
||||||
|
seg = Segmentation(RawDataProvider(data_test, data_train,
|
||||||
seg = Segmentation(RawDataProvider([]))
|
data_predict))
|
||||||
|
# def create_and_predict_segment(self, query, variable, feature_columns
|
||||||
result = seg.create_and_predict_segment(
|
# target_query, model_params,
|
||||||
'select * from training',
|
# id_col='cartodb_id'):
|
||||||
'target',
|
result = seg.create_and_predict_segment('select * from query',
|
||||||
'feature_columns',
|
'target',
|
||||||
'select * from test',
|
['x1', 'x2', 'x3'],
|
||||||
model_parameters)
|
'select * from target',
|
||||||
|
model_parameters,
|
||||||
|
id_col='cartodb_id')
|
||||||
|
|
||||||
prediction = [r[1] for r in result]
|
prediction = [r[1] for r in result]
|
||||||
|
|
||||||
accuracy = np.sqrt(np.mean(np.square(np.array(prediction) - np.array(test_y))))
|
accuracy = np.sqrt(np.mean(np.square(np.array(prediction) -
|
||||||
|
np.array(test_y))))
|
||||||
|
|
||||||
self.assertEqual(len(result), len(test_data))
|
self.assertEqual(len(result), len(test_data))
|
||||||
self.assertTrue(result[0][2] < 0.01)
|
self.assertTrue(result[0][2] < 0.01)
|
||||||
|
Loading…
Reference in New Issue
Block a user