mitigating test failures

2024-11-01 10:20:48 +08:00 · 2017-02-14 12:12:53 -05:00 · 2017-02-14 12:12:53 -05:00 · aa413a8d5a
commit aa413a8d5a
parent 6b71822d08
3 changed files with 59 additions and 41 deletions
--- a/src/py/crankshaft/crankshaft/analysis_data_provider.py
+++ b/src/py/crankshaft/crankshaft/analysis_data_provider.py
@ -77,8 +77,6 @@ class AnalysisDataProvider(object):
                  "target": variable,
                  "features": feature_columns}
        """
-        plpy.notice("featurecols: {}".format(str(params)))
-
        columns = ', '.join(['array_agg("{col}") As "{col}"'.format(col=col)
                             for col in params['features']])
        query = '''
@ -89,7 +87,6 @@ class AnalysisDataProvider(object):
                '''.format(subquery=params['subquery'],
                           target=params['target'],
                           columns=columns)
-        plpy.notice("Query: {}".format(query))
        try:
            data = plpy.execute(query)
            return data
--- a/src/py/crankshaft/crankshaft/segmentation/segmentation.py
+++ b/src/py/crankshaft/crankshaft/segmentation/segmentation.py
@ -39,7 +39,6 @@ class Segmentation(object):
                @param model_parameters: A dictionary containing parameters for
                the model.
        """
-
        clean_target = replace_nan_with_mean(target)
        clean_features = replace_nan_with_mean(features)
        target_features = replace_nan_with_mean(target_features)
@ -117,8 +116,6 @@ class Segmentation(object):
            batch = np.row_stack([np.array(row['features'])
                                  for row in rows]).astype(float)

-            # Need to fix this to global mean. This will cause weird effects
-
            batch = replace_nan_with_mean(batch, feature_means)[0]
            prediction = model.predict(batch)
            results.append(prediction)
@ -161,32 +158,38 @@ def replace_nan_with_mean(array, means=None):
        Output:
            array with nans filled in with the mean of the dataset
    """
-    # TODO: update code to take in avgs parameter

    # returns an array of rows and column indices
    nanvals = np.isnan(array)
    indices = np.where(nanvals)

+    def loops(array, axis):
+        try:
+            return np.shape(array)[axis]
+        except IndexError:
+            return 1
+    ran = loops(array, 1)
+
    if means is None:
        means = {}

-        def loops(array, axis):
-            try:
-                return np.shape(array)[axis]
-            except IndexError:
-                return 1
-
-        ran = loops(array, 1)
        if ran == 1:
            array = np.array(array)
            means[0] = np.mean(array[~np.isnan(array)])
+            for row in zip(*indices):
+                array[row] = means[0]
        else:
            for col in range(ran):
                means[col] = np.mean(array[~np.isnan(array[:, col]), col])
-
-    # iterate through entries which have nan values
-    for row, col in zip(*indices):
-        array[row, col] = means[col]
+            for row, col in zip(*indices):
+                array[row, col] = means[col]
+    else:
+        if ran == 1:
+            for row in zip(*indices):
+                array[row] = means[0]
+        else:
+            for row, col in zip(*indices):
+                array[row, col] = means[col]

    return array, means

--- a/src/py/crankshaft/test/test_segmentation.py
+++ b/src/py/crankshaft/test/test_segmentation.py
@ -15,10 +15,10 @@ class RawDataProvider(AnalysisDataProvider):
    def get_segmentation_data(self, params):
        return self.test

-    def get_segmentation_predict_data(self, params):
+    def get_segmentation_model_data(self, params):
        return self.train

-    def get_segmentation_model_data(self, params):
+    def get_segmentation_predict_data(self, params):
        return self.predict


@ -41,10 +41,14 @@ class SegmentationTest(unittest.TestCase):

    def generate_random_data(self, n_samples, random_state, row_type=False):
        x1 = random_state.uniform(size=n_samples)
+        # x1 = np.random.rand(n_samples)
        x2 = random_state.uniform(size=n_samples)
+        # x2 = np.random.rand(n_samples)
        x3 = random_state.randint(0, 4, size=n_samples)
+        # x3 = np.random.rand(n_samples)

        y = x1+x2*x2+x3
+        # y = 2*x1 + 1.5*x2 + 3.6*x3 + 8
        cartodb_id = range(len(x1))

        if row_type:
@ -57,10 +61,11 @@ class SegmentationTest(unittest.TestCase):
        from crankshaft.segmentation import replace_nan_with_mean
        from numpy.testing import assert_array_equal
        test_array = np.array([1.2, np.nan, 3.2, np.nan, np.nan])
-        result = replace_nan_with_mean(test_array)
-        expectation = np.array([1.2, 2.2, 3.2, 2.2, 2.2])
-
-        self.assertTrue(assert_array_equal(result, expectation))
+        result = replace_nan_with_mean(test_array, means=None)[0]
+        expectation = np.array([1.2, 2.2, 3.2, 2.2, 2.2], dtype=float)
+        print result
+        print type(result)
+        assert_array_equal(result, expectation)

    def test_create_and_predict_segment(self):
        from numpy.testing import assert_array_equal
@ -87,16 +92,26 @@ class SegmentationTest(unittest.TestCase):
                            'subsample': 0.5,
                            'learning_rate': 0.01,
                            'min_samples_leaf': 1}
-        data = [{'query': 'select * FROM research_team',
-                 'target': [],
-                 'x1': [],
-                 'x2': [],
-                 'x3': []}]
+        # print "train: {}".format(test_data)
+        # assert 1 == 2
+        # select array_agg(target) as "target",
+        #        array_agg(x1) as "x1",
+        #        etc.
+        feature_means = training_data[0]['x1'].mean()
+        target_mean = training_data[0]['target'].mean()
+        data_train = [{'target': training_data[0]['target'],
+                       'x1': training_data[0]['x1'],
+                       'x2': training_data[0]['x2'],
+                       'x3': training_data[0]['x3']}]
+
+        data_test = [{'id_col': training_data[0]['cartodb_id']}]
+
+        data_predict = [{'feature_columns': test_data}]
        '''
         cursors = [{'features': [[m1[0],m2[0],m3[0]],[m1[1],m2[1],m3[1]],
                                  [m1[2],m2[2],m3[2]]]}]
        '''
-        data = Segmentation(RawDataProvider(test, train, predict))
+        # data = Segmentation(RawDataProvider(test, train, predict))
        '''
        self, query, variable, feature_columns,
                                       target_query, model_params,
@ -107,22 +122,25 @@ class SegmentationTest(unittest.TestCase):
        {'feature1': [1,2,3,4]}, {'feature2' : [2,3,4,5]}
        ]
        '''
-
+        print data_train
        # Before here figure out how to set up the data provider
        # After use data prodiver to run the query and test results.
-
-        seg = Segmentation(RawDataProvider([]))
-
-        result = seg.create_and_predict_segment(
-                'select * from training',
-                'target',
-                'feature_columns',
-                'select * from test',
-                model_parameters)
+        seg = Segmentation(RawDataProvider(data_test, data_train,
+                                           data_predict))
+        # def create_and_predict_segment(self, query, variable, feature_columns
+        #                                target_query, model_params,
+        #                                id_col='cartodb_id'):
+        result = seg.create_and_predict_segment('select * from query',
+                                                'target',
+                                                ['x1', 'x2', 'x3'],
+                                                'select * from target',
+                                                model_parameters,
+                                                id_col='cartodb_id')

        prediction = [r[1] for r in result]

-        accuracy = np.sqrt(np.mean(np.square(np.array(prediction) - np.array(test_y))))
+        accuracy = np.sqrt(np.mean(np.square(np.array(prediction) -
+                                             np.array(test_y))))

        self.assertEqual(len(result), len(test_data))
        self.assertTrue(result[0][2] < 0.01)