adds fuller test suite for segmentation

2018-03-15 11:41:47 -04:00 · 2018-03-15 11:41:47 -04:00 · 6260a080a6
commit 6260a080a6
parent bc9cae1a63
5 changed files with 207 additions and 44 deletions
--- a/src/pg/sql/05_segmentation.sql
+++ b/src/pg/sql/05_segmentation.sql
@ -25,19 +25,20 @@ AS $$
    def unpack2D(data):
        dimension = data.pop(0)
-        a = np.array(data, dtype=float)
+        a = np.array(data, dtype=np.float64)
-        return a.reshape(len(a)/dimension, dimension)
+        return a.reshape(int(len(a)/dimension), int(dimension))
-    return seg.create_and_predict_segment_agg(np.array(target, dtype=float),
+    return seg.create_and_predict_segment_agg(
-            unpack2D(features),
+        np.array(target, dtype=np.float64),
-            unpack2D(target_features),
+        unpack2D(features),
-            target_ids,
+        unpack2D(target_features),
-            model_params)
+        target_ids,
        model_params)
 $$ LANGUAGE plpythonu VOLATILE PARALLEL RESTRICTED;
 CREATE OR REPLACE FUNCTION
-  CDB_CreateAndPredictSegment (
+  CDB_CreateAndPredictSegment(
      query TEXT,
      variable_name TEXT,
      target_table TEXT,
@ -57,9 +58,13 @@ AS $$
        'learning_rate': learning_rate,
        'min_samples_leaf': min_samples_leaf
    }
    feature_cols = set(plpy.execute('''
        select * from ({query}) as _w limit 0
    '''.format(query=query)).colnames()) -  set([variable_name, 'cartodb_id', ])
    return seg.create_and_predict_segment(
        query,
        variable_name,
        feature_cols,
        target_table,
        model_params
    )
--- a/src/pg/test/expected/06_segmentation_test.out
+++ b/src/pg/test/expected/06_segmentation_test.out
@ -25,3 +25,53 @@ t
 t
 t
 (20 rows)
 _cdb_random_seeds
 (1 row)
 within_tolerance
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 (20 rows)
 _cdb_random_seeds
 (1 row)
 within_tolerance
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 t
 (20 rows)
--- a/src/pg/test/fixtures/ml_values.sql
+++ b/src/pg/test/fixtures/ml_values.sql
@ -1,7 +1,7 @@
 SET client_min_messages TO WARNING;
 \set ECHO none
-CREATE TABLE ml_values (cartodb_id integer, target float, the_geom geometry, x1 float , x2 float, x3 float, class text);
+CREATE TABLE ml_values (cartodb_id integer, target float, the_geom geometry, x1 float, x2 float, x3 float, class text);
-INSERT INTO ml_values(cartodb_id, target,x1,x2,x3, class) VALUES 
+INSERT INTO ml_values(cartodb_id, target, x1, x2, x3, class) VALUES
 (0,1.24382137034,0.811403626309,0.657584780869,0,'train'),
 (1,1.72727475342,0.447764244847,0.528687533966,1,'train'),
 (2,3.32104694099,0.62774565606,0.832647155118,2,'train'),
--- a/src/pg/test/sql/06_segmentation_test.sql
+++ b/src/pg/test/sql/06_segmentation_test.sql
@ -3,31 +3,141 @@
 -- Load the ml_values fixture table that the queries below read from.
 \i test/fixtures/ml_values.sql
 -- Presumably fixes the random seeds so the predictions are reproducible
 -- (TODO confirm against the definition of _cdb_random_seeds).
 SELECT cdb_crankshaft._cdb_random_seeds(1234);
 -- first version (pre-aggregated arrays: target, features, target features,
 -- target ids)
 -- expected: reference predictions paired with the ids 1000.. they belong to.
 -- generate_series(1000, 1020) yields 21 ids while the array holds 20 values;
 -- the LIMIT 20 trims the CTE to 20 rows.
 -- NOTE(review): an upper bound of 1019 would make both set-returning
 -- functions the same length -- confirm before changing.
 WITH expected AS (
-  SELECT generate_series(1000,1020) AS id, unnest(ARRAY[
+  SELECT
-    4.5656517130822492,
+    generate_series(1000, 1020) AS id,
-    1.7928053473230694,
+    unnest(ARRAY[4.5656517130822492,
-    1.0283378773916563,
+                 1.7928053473230694,
-    2.6586517814904593,
+                 1.0283378773916563,
-    2.9699056242935944,
+                 2.6586517814904593,
-    3.9550646059951347,
+                 2.9699056242935944,
-    4.1662572444459745,
+                 3.9550646059951347,
-    3.8126334839264162,
+                 4.1662572444459745,
-    1.8809821053623488,
+                 3.8126334839264162,
-    1.6349065129019873,
+                 1.8809821053623488,
-    3.0391288591472954,
+                 1.6349065129019873,
-    3.3035970359672553,
+                 3.0391288591472954,
-    1.5835471589451968,
+                 3.3035970359672553,
-    3.7530378537263638,
+                 1.5835471589451968,
-    1.0833589653009252,
+                 3.7530378537263638,
-    3.8104965452882897,
+                 1.0833589653009252,
-    2.665217959294802,
+                 3.8104965452882897,
-    1.5850334252802472,
+                 2.665217959294802,
-    3.679401198805563,
+                 1.5850334252802472,
-    3.5332033186588636
+                 3.679401198805563,
-    ]) AS expected LIMIT 20
+                 3.5332033186588636 ]) AS expected
    LIMIT 20
 ), training as (
  -- training: one row holding the target array and the aggregated
  -- [x1, x2, x3] features of every class = 'train' row. The subquery's
  -- ORDER BY is intended to feed rows to the aggregates in cartodb_id order.
  SELECT
    array_agg(target)::numeric[] as target,
    cdb_crankshaft.CDB_PyAgg(Array[x1, x2, x3]::numeric[]) as features
  FROM (SELECT * FROM ml_values ORDER BY cartodb_id asc) as _w
  WHERE class = 'train'
 ), testing As (
  -- testing: same aggregation for the class = 'test' rows, plus the array of
  -- their cartodb_ids.
  SELECT
    cdb_crankshaft.CDB_PyAgg(Array[x1, x2, x3]::numeric[]) as features,
    array_agg(cartodb_id)::numeric[] as cartodb_ids
  FROM (SELECT * FROM ml_values ORDER BY cartodb_id asc) as _w
  WHERE class = 'test'
 ), prediction AS (
-  SELECT cartodb_id::integer id, prediction
+  SELECT
-  FROM cdb_crankshaft.CDB_CreateAndPredictSegment('SELECT target, x1, x2, x3 FROM ml_values WHERE class = $$train$$','target', Array['x1', 'x2', 'x3'], 'SELECT cartodb_id, x1, x2, x3 FROM ml_values WHERE class = $$test$$')
+    *
  FROM
    cdb_crankshaft.CDB_CreateAndPredictSegment(
        (SELECT target FROM training),
        (SELECT features FROM training),
        (SELECT features FROM testing),
        (SELECT cartodb_ids FROM testing)
      )
 )
 -- One boolean per matched id: true when the prediction is within 0.1 of the
 -- expected value. The join uses p.cartodb_id because the prediction CTE
 -- selects * and does not alias the id column.
 SELECT
  abs(e.expected - p.prediction) <= 1e-1 AS within_tolerance
 FROM expected e, prediction p
 WHERE e.id = p.cartodb_id
 LIMIT 20;
 -- Presumably re-fixes the random seeds so this test starts from the same
 -- state as the others (TODO confirm against _cdb_random_seeds).
 SELECT cdb_crankshaft._cdb_random_seeds(1234);
 -- second version (query, not specifying features)
 -- The function receives the training query, the name of the target column and
 -- the query for the rows to predict; no feature list is passed.
 -- expected: reference predictions paired with the ids 1000.. they belong to.
 -- generate_series(1000, 1020) yields 21 ids while the array holds 20 values;
 -- the LIMIT 20 trims the CTE to 20 rows.
 -- NOTE(review): an upper bound of 1019 would make both set-returning
 -- functions the same length -- confirm before changing.
 WITH expected AS (
  SELECT
    generate_series(1000, 1020) AS id,
    unnest(ARRAY[4.5656517130822492,
                 1.7928053473230694,
                 1.0283378773916563,
                 2.6586517814904593,
                 2.9699056242935944,
                 3.9550646059951347,
                 4.1662572444459745,
                 3.8126334839264162,
                 1.8809821053623488,
                 1.6349065129019873,
                 3.0391288591472954,
                 3.3035970359672553,
                 1.5835471589451968,
                 3.7530378537263638,
                 1.0833589653009252,
                 3.8104965452882897,
                 2.665217959294802,
                 1.5850334252802472,
                 3.679401198805563,
                 3.5332033186588636 ]) AS expected
    LIMIT 20
 ), prediction AS (
  -- Both queries are ordered by cartodb_id so the function sees the rows in a
  -- fixed order; the result is capped at 20 rows.
  SELECT
    cartodb_id::integer id,
    prediction
  FROM cdb_crankshaft.CDB_CreateAndPredictSegment(
    'SELECT target, x1, x2, x3 FROM ml_values WHERE class = $$train$$ ORDER BY cartodb_id asc',
    'target',
    'SELECT cartodb_id, x1, x2, x3 FROM ml_values WHERE class = $$test$$ ORDER BY cartodb_id asc'
  )
  LIMIT 20
-) SELECT abs(e.expected - p.prediction) <= 1e-1 AS within_tolerance FROM expected e, prediction p WHERE e.id = p.id;
+)
 -- One boolean per matched id: true when the prediction is within 0.1 of the
 -- expected value.
 SELECT
  abs(e.expected - p.prediction) <= 1e-1 AS within_tolerance
 FROM expected e, prediction p
 WHERE e.id = p.id;
 -- Presumably re-fixes the random seeds so this test starts from the same
 -- state as the others (TODO confirm against _cdb_random_seeds).
 SELECT cdb_crankshaft._cdb_random_seeds(1234);
 -- third version (query, specifying features)
 -- The function receives the training query, the name of the target column,
 -- the explicit list of feature columns and the query for the rows to predict.
 -- expected: reference predictions paired with the ids 1000.. they belong to.
 -- generate_series(1000, 1020) yields 21 ids while the array holds 20 values;
 -- the LIMIT 20 trims the CTE to 20 rows.
 -- NOTE(review): an upper bound of 1019 would make both set-returning
 -- functions the same length -- confirm before changing.
 WITH expected AS (
  SELECT
    generate_series(1000, 1020) AS id,
    unnest(ARRAY[4.5656517130822492,
                 1.7928053473230694,
                 1.0283378773916563,
                 2.6586517814904593,
                 2.9699056242935944,
                 3.9550646059951347,
                 4.1662572444459745,
                 3.8126334839264162,
                 1.8809821053623488,
                 1.6349065129019873,
                 3.0391288591472954,
                 3.3035970359672553,
                 1.5835471589451968,
                 3.7530378537263638,
                 1.0833589653009252,
                 3.8104965452882897,
                 2.665217959294802,
                 1.5850334252802472,
                 3.679401198805563,
                 3.5332033186588636 ]) AS expected
    LIMIT 20
 ), prediction AS (
  -- NOTE(review): unlike the query-only call earlier in this file, neither
  -- query here has an ORDER BY, and the LIMIT 20 below is also unordered;
  -- confirm that row order cannot change the fitted model or which rows are
  -- kept.
  SELECT
    cartodb_id::integer id,
    prediction
  FROM cdb_crankshaft.CDB_CreateAndPredictSegment(
    'SELECT target, x1, x2, x3 FROM ml_values WHERE class = $$train$$',
    'target',
    Array['x1', 'x2', 'x3'],
    'SELECT cartodb_id, x1, x2, x3 FROM ml_values WHERE class = $$test$$'
  )
  LIMIT 20
 )
 -- One boolean per matched id: true when the prediction is within 0.1 of the
 -- expected value.
 SELECT
  abs(e.expected - p.prediction) <= 1e-1 AS within_tolerance
 FROM expected e, prediction p
 WHERE e.id = p.id;
--- a/src/py/crankshaft/crankshaft/segmentation/segmentation.py
+++ b/src/py/crankshaft/crankshaft/segmentation/segmentation.py
@ -29,26 +29,25 @@ class Segmentation(object):
            straight from the SQL calling the function.
            Input:
-                @param target: The 1D array of lenth NSamples containing the
+                @param target: The 1D array of length NSamples containing the
                target variable we want the model to predict
                @param features: The 2D array of size NSamples * NFeatures that
-                    form the imput to the model
+                    form the input to the model
                @param target_ids: A 1D array of target_ids that will be used
                to associate the results of the prediction with the rows which
                    they come from
                @param model_parameters: A dictionary containing parameters for
                the model.
        """
-        clean_target = replace_nan_with_mean(target)
+        clean_target, _ = replace_nan_with_mean(target)
-        clean_features = replace_nan_with_mean(features)
+        clean_features, _ = replace_nan_with_mean(features)
-        target_features = replace_nan_with_mean(target_features)
+        target_features,  _ = replace_nan_with_mean(target_features)
        model, accuracy = train_model(clean_target, clean_features,
                                      model_parameters, 0.2)
        prediction = model.predict(target_features)
        accuracy_array = [accuracy] * prediction.shape[0]
-        return zip(target_ids, prediction,
+        return zip(target_ids, prediction, accuracy_array)
-                   np.full(prediction.shape, accuracy_array))
    def create_and_predict_segment(self, query, variable, feature_columns,
                                   target_query, model_params,
@ -65,7 +64,6 @@ class Segmentation(object):
                        scikit learn page for [GradientBoostingRegressor]
                        (http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
        """
        params = {"subquery": target_query,
                  "id_col": id_col}
@ -198,7 +196,7 @@ def train_model(target, features, model_params, test_split):
        Input:
            @param target: 1D Array of the variable that the model is to be
                trained to predict
-            @param features: 2D Array NSamples *NFeatures to use in trining
+            @param features: 2D Array NSamples *NFeatures to use in training
                the model
            @param model_params: A dictionary of model parameters, the full
                specification can be found on the