mitigating test failures

This commit is contained in:
mehak-sachdeva 2017-02-14 12:12:53 -05:00
parent 6b71822d08
commit aa413a8d5a
3 changed files with 59 additions and 41 deletions

View File

@ -77,8 +77,6 @@ class AnalysisDataProvider(object):
"target": variable, "target": variable,
"features": feature_columns} "features": feature_columns}
""" """
plpy.notice("featurecols: {}".format(str(params)))
columns = ', '.join(['array_agg("{col}") As "{col}"'.format(col=col) columns = ', '.join(['array_agg("{col}") As "{col}"'.format(col=col)
for col in params['features']]) for col in params['features']])
query = ''' query = '''
@ -89,7 +87,6 @@ class AnalysisDataProvider(object):
'''.format(subquery=params['subquery'], '''.format(subquery=params['subquery'],
target=params['target'], target=params['target'],
columns=columns) columns=columns)
plpy.notice("Query: {}".format(query))
try: try:
data = plpy.execute(query) data = plpy.execute(query)
return data return data

View File

@ -39,7 +39,6 @@ class Segmentation(object):
@param model_parameters: A dictionary containing parameters for @param model_parameters: A dictionary containing parameters for
the model. the model.
""" """
clean_target = replace_nan_with_mean(target) clean_target = replace_nan_with_mean(target)
clean_features = replace_nan_with_mean(features) clean_features = replace_nan_with_mean(features)
target_features = replace_nan_with_mean(target_features) target_features = replace_nan_with_mean(target_features)
@ -117,8 +116,6 @@ class Segmentation(object):
batch = np.row_stack([np.array(row['features']) batch = np.row_stack([np.array(row['features'])
for row in rows]).astype(float) for row in rows]).astype(float)
# Need to fix this to global mean. This will cause weird effects
batch = replace_nan_with_mean(batch, feature_means)[0] batch = replace_nan_with_mean(batch, feature_means)[0]
prediction = model.predict(batch) prediction = model.predict(batch)
results.append(prediction) results.append(prediction)
@ -161,32 +158,38 @@ def replace_nan_with_mean(array, means=None):
Output: Output:
array with nans filled in with the mean of the dataset array with nans filled in with the mean of the dataset
""" """
# TODO: update code to take in avgs parameter
# returns an array of rows and column indices # returns an array of rows and column indices
nanvals = np.isnan(array) nanvals = np.isnan(array)
indices = np.where(nanvals) indices = np.where(nanvals)
def loops(array, axis):
try:
return np.shape(array)[axis]
except IndexError:
return 1
ran = loops(array, 1)
if means is None: if means is None:
means = {} means = {}
def loops(array, axis):
try:
return np.shape(array)[axis]
except IndexError:
return 1
ran = loops(array, 1)
if ran == 1: if ran == 1:
array = np.array(array) array = np.array(array)
means[0] = np.mean(array[~np.isnan(array)]) means[0] = np.mean(array[~np.isnan(array)])
for row in zip(*indices):
array[row] = means[0]
else: else:
for col in range(ran): for col in range(ran):
means[col] = np.mean(array[~np.isnan(array[:, col]), col]) means[col] = np.mean(array[~np.isnan(array[:, col]), col])
for row, col in zip(*indices):
# iterate through entries which have nan values array[row, col] = means[col]
for row, col in zip(*indices): else:
array[row, col] = means[col] if ran == 1:
for row in zip(*indices):
array[row] = means[0]
else:
for row, col in zip(*indices):
array[row, col] = means[col]
return array, means return array, means

View File

@ -15,10 +15,10 @@ class RawDataProvider(AnalysisDataProvider):
def get_segmentation_data(self, params): def get_segmentation_data(self, params):
return self.test return self.test
def get_segmentation_predict_data(self, params): def get_segmentation_model_data(self, params):
return self.train return self.train
def get_segmentation_model_data(self, params): def get_segmentation_predict_data(self, params):
return self.predict return self.predict
@ -41,10 +41,14 @@ class SegmentationTest(unittest.TestCase):
def generate_random_data(self, n_samples, random_state, row_type=False): def generate_random_data(self, n_samples, random_state, row_type=False):
x1 = random_state.uniform(size=n_samples) x1 = random_state.uniform(size=n_samples)
# x1 = np.random.rand(n_samples)
x2 = random_state.uniform(size=n_samples) x2 = random_state.uniform(size=n_samples)
# x2 = np.random.rand(n_samples)
x3 = random_state.randint(0, 4, size=n_samples) x3 = random_state.randint(0, 4, size=n_samples)
# x3 = np.random.rand(n_samples)
y = x1+x2*x2+x3 y = x1+x2*x2+x3
# y = 2*x1 + 1.5*x2 + 3.6*x3 + 8
cartodb_id = range(len(x1)) cartodb_id = range(len(x1))
if row_type: if row_type:
@ -57,10 +61,11 @@ class SegmentationTest(unittest.TestCase):
from crankshaft.segmentation import replace_nan_with_mean from crankshaft.segmentation import replace_nan_with_mean
from numpy.testing import assert_array_equal from numpy.testing import assert_array_equal
test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan]) test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan])
result = replace_nan_with_mean(test_array) result = replace_nan_with_mean(test_array, means=None)[0]
expectation = np.array([1.2, 2.2, 3.2, 2.2, 2.2]) expectation = np.array([1.2, 2.2, 3.2, 2.2, 2.2], dtype=float)
print result
self.assertTrue(assert_array_equal(result, expectation)) print type(result)
assert_array_equal(result, expectation)
def test_create_and_predict_segment(self): def test_create_and_predict_segment(self):
from numpy.testing import assert_array_equal from numpy.testing import assert_array_equal
@ -87,16 +92,26 @@ class SegmentationTest(unittest.TestCase):
'subsample': 0.5, 'subsample': 0.5,
'learning_rate': 0.01, 'learning_rate': 0.01,
'min_samples_leaf': 1} 'min_samples_leaf': 1}
data = [{'query': 'select * FROM research_team', # print "train: {}".format(test_data)
'target': [], # assert 1 == 2
'x1': [], # select array_agg(target) as "target",
'x2': [], # array_agg(x1) as "x1",
'x3': []}] # etc.
feature_means = training_data[0]['x1'].mean()
target_mean = training_data[0]['target'].mean()
data_train = [{'target': training_data[0]['target'],
'x1': training_data[0]['x1'],
'x2': training_data[0]['x2'],
'x3': training_data[0]['x3']}]
data_test = [{'id_col': training_data[0]['cartodb_id']}]
data_predict = [{'feature_columns': test_data}]
''' '''
cursors = [{'features': [[m1[0],m2[0],m3[0]],[m1[1],m2[1],m3[1]], cursors = [{'features': [[m1[0],m2[0],m3[0]],[m1[1],m2[1],m3[1]],
[m1[2],m2[2],m3[2]]]}] [m1[2],m2[2],m3[2]]]}]
''' '''
data = Segmentation(RawDataProvider(test, train, predict)) # data = Segmentation(RawDataProvider(test, train, predict))
''' '''
self, query, variable, feature_columns, self, query, variable, feature_columns,
target_query, model_params, target_query, model_params,
@ -107,22 +122,25 @@ class SegmentationTest(unittest.TestCase):
{'feature1': [1,2,3,4]}, {'feature2' : [2,3,4,5]} {'feature1': [1,2,3,4]}, {'feature2' : [2,3,4,5]}
] ]
''' '''
print data_train
# Before here figure out how to set up the data provider # Before here figure out how to set up the data provider
# After, use the data provider to run the query and test results. # After, use the data provider to run the query and test results.
seg = Segmentation(RawDataProvider(data_test, data_train,
seg = Segmentation(RawDataProvider([])) data_predict))
# def create_and_predict_segment(self, query, variable, feature_columns
result = seg.create_and_predict_segment( # target_query, model_params,
'select * from training', # id_col='cartodb_id'):
'target', result = seg.create_and_predict_segment('select * from query',
'feature_columns', 'target',
'select * from test', ['x1', 'x2', 'x3'],
model_parameters) 'select * from target',
model_parameters,
id_col='cartodb_id')
prediction = [r[1] for r in result] prediction = [r[1] for r in result]
accuracy = np.sqrt(np.mean(np.square(np.array(prediction) - np.array(test_y)))) accuracy = np.sqrt(np.mean(np.square(np.array(prediction) -
np.array(test_y))))
self.assertEqual(len(result), len(test_data)) self.assertEqual(len(result), len(test_data))
self.assertTrue(result[0][2] < 0.01) self.assertTrue(result[0][2] < 0.01)