adds fuller test suite for segmentation

This commit is contained in:
Andy Eschbacher 2018-03-15 11:41:47 -04:00
parent bc9cae1a63
commit 6260a080a6
5 changed files with 207 additions and 44 deletions

View File

@ -25,19 +25,20 @@ AS $$
def unpack2D(data): def unpack2D(data):
dimension = data.pop(0) dimension = data.pop(0)
a = np.array(data, dtype=float) a = np.array(data, dtype=np.float64)
return a.reshape(len(a)/dimension, dimension) return a.reshape(int(len(a)/dimension), int(dimension))
return seg.create_and_predict_segment_agg(np.array(target, dtype=float), return seg.create_and_predict_segment_agg(
unpack2D(features), np.array(target, dtype=np.float64),
unpack2D(target_features), unpack2D(features),
target_ids, unpack2D(target_features),
model_params) target_ids,
model_params)
$$ LANGUAGE plpythonu VOLATILE PARALLEL RESTRICTED; $$ LANGUAGE plpythonu VOLATILE PARALLEL RESTRICTED;
CREATE OR REPLACE FUNCTION CREATE OR REPLACE FUNCTION
CDB_CreateAndPredictSegment ( CDB_CreateAndPredictSegment(
query TEXT, query TEXT,
variable_name TEXT, variable_name TEXT,
target_table TEXT, target_table TEXT,
@ -57,9 +58,13 @@ AS $$
'learning_rate': learning_rate, 'learning_rate': learning_rate,
'min_samples_leaf': min_samples_leaf 'min_samples_leaf': min_samples_leaf
} }
feature_cols = set(plpy.execute('''
select * from ({query}) as _w limit 0
'''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ])
return seg.create_and_predict_segment( return seg.create_and_predict_segment(
query, query,
variable_name, variable_name,
feature_cols,
target_table, target_table,
model_params model_params
) )

View File

@ -25,3 +25,53 @@ t
t t
t t
(20 rows) (20 rows)
_cdb_random_seeds
(1 row)
within_tolerance
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
(20 rows)
_cdb_random_seeds
(1 row)
within_tolerance
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
(20 rows)

View File

@ -1,7 +1,7 @@
SET client_min_messages TO WARNING; SET client_min_messages TO WARNING;
\set ECHO none \set ECHO none
CREATE TABLE ml_values (cartodb_id integer, target float, the_geom geometry, x1 float , x2 float, x3 float, class text); CREATE TABLE ml_values (cartodb_id integer, target float, the_geom geometry, x1 float, x2 float, x3 float, class text);
INSERT INTO ml_values(cartodb_id, target,x1,x2,x3, class) VALUES INSERT INTO ml_values(cartodb_id, target, x1, x2, x3, class) VALUES
(0,1.24382137034,0.811403626309,0.657584780869,0,'train'), (0,1.24382137034,0.811403626309,0.657584780869,0,'train'),
(1,1.72727475342,0.447764244847,0.528687533966,1,'train'), (1,1.72727475342,0.447764244847,0.528687533966,1,'train'),
(2,3.32104694099,0.62774565606,0.832647155118,2,'train'), (2,3.32104694099,0.62774565606,0.832647155118,2,'train'),

View File

@ -3,31 +3,141 @@
\i test/fixtures/ml_values.sql \i test/fixtures/ml_values.sql
SELECT cdb_crankshaft._cdb_random_seeds(1234); SELECT cdb_crankshaft._cdb_random_seeds(1234);
-- first version (pre-aggregated target and feature arrays)
WITH expected AS ( WITH expected AS (
SELECT generate_series(1000,1020) AS id, unnest(ARRAY[ SELECT
4.5656517130822492, generate_series(1000, 1020) AS id,
1.7928053473230694, unnest(ARRAY[4.5656517130822492,
1.0283378773916563, 1.7928053473230694,
2.6586517814904593, 1.0283378773916563,
2.9699056242935944, 2.6586517814904593,
3.9550646059951347, 2.9699056242935944,
4.1662572444459745, 3.9550646059951347,
3.8126334839264162, 4.1662572444459745,
1.8809821053623488, 3.8126334839264162,
1.6349065129019873, 1.8809821053623488,
3.0391288591472954, 1.6349065129019873,
3.3035970359672553, 3.0391288591472954,
1.5835471589451968, 3.3035970359672553,
3.7530378537263638, 1.5835471589451968,
1.0833589653009252, 3.7530378537263638,
3.8104965452882897, 1.0833589653009252,
2.665217959294802, 3.8104965452882897,
1.5850334252802472, 2.665217959294802,
3.679401198805563, 1.5850334252802472,
3.5332033186588636 3.679401198805563,
]) AS expected LIMIT 20 3.5332033186588636 ]) AS expected
LIMIT 20
), training as (
SELECT
array_agg(target)::numeric[] as target,
cdb_crankshaft.CDB_PyAgg(Array[x1, x2, x3]::numeric[]) as features
FROM (SELECT * FROM ml_values ORDER BY cartodb_id asc) as _w
WHERE class = 'train'
), testing As (
SELECT
cdb_crankshaft.CDB_PyAgg(Array[x1, x2, x3]::numeric[]) as features,
array_agg(cartodb_id)::numeric[] as cartodb_ids
FROM (SELECT * FROM ml_values ORDER BY cartodb_id asc) as _w
WHERE class = 'test'
), prediction AS ( ), prediction AS (
SELECT cartodb_id::integer id, prediction SELECT
FROM cdb_crankshaft.CDB_CreateAndPredictSegment('SELECT target, x1, x2, x3 FROM ml_values WHERE class = $$train$$','target', Array['x1', 'x2', 'x3'], 'SELECT cartodb_id, x1, x2, x3 FROM ml_values WHERE class = $$test$$') *
FROM
cdb_crankshaft.CDB_CreateAndPredictSegment(
(SELECT target FROM training),
(SELECT features FROM training),
(SELECT features FROM testing),
(SELECT cartodb_ids FROM testing)
)
)
SELECT
abs(e.expected - p.prediction) <= 1e-1 AS within_tolerance
FROM expected e, prediction p
WHERE e.id = p.cartodb_id
LIMIT 20;
SELECT cdb_crankshaft._cdb_random_seeds(1234);
-- second version (query, not specifying features)
WITH expected AS (
SELECT
generate_series(1000, 1020) AS id,
unnest(ARRAY[4.5656517130822492,
1.7928053473230694,
1.0283378773916563,
2.6586517814904593,
2.9699056242935944,
3.9550646059951347,
4.1662572444459745,
3.8126334839264162,
1.8809821053623488,
1.6349065129019873,
3.0391288591472954,
3.3035970359672553,
1.5835471589451968,
3.7530378537263638,
1.0833589653009252,
3.8104965452882897,
2.665217959294802,
1.5850334252802472,
3.679401198805563,
3.5332033186588636 ]) AS expected
LIMIT 20
), prediction AS (
SELECT
cartodb_id::integer id,
prediction
FROM cdb_crankshaft.CDB_CreateAndPredictSegment(
'SELECT target, x1, x2, x3 FROM ml_values WHERE class = $$train$$ ORDER BY cartodb_id asc',
'target',
'SELECT cartodb_id, x1, x2, x3 FROM ml_values WHERE class = $$test$$ ORDER BY cartodb_id asc'
)
LIMIT 20 LIMIT 20
) SELECT abs(e.expected - p.prediction) <= 1e-1 AS within_tolerance FROM expected e, prediction p WHERE e.id = p.id; )
SELECT
abs(e.expected - p.prediction) <= 1e-1 AS within_tolerance
FROM expected e, prediction p
WHERE e.id = p.id;
SELECT cdb_crankshaft._cdb_random_seeds(1234);
-- third version (query, specifying features)
WITH expected AS (
SELECT
generate_series(1000, 1020) AS id,
unnest(ARRAY[4.5656517130822492,
1.7928053473230694,
1.0283378773916563,
2.6586517814904593,
2.9699056242935944,
3.9550646059951347,
4.1662572444459745,
3.8126334839264162,
1.8809821053623488,
1.6349065129019873,
3.0391288591472954,
3.3035970359672553,
1.5835471589451968,
3.7530378537263638,
1.0833589653009252,
3.8104965452882897,
2.665217959294802,
1.5850334252802472,
3.679401198805563,
3.5332033186588636 ]) AS expected
LIMIT 20
), prediction AS (
SELECT
cartodb_id::integer id,
prediction
FROM cdb_crankshaft.CDB_CreateAndPredictSegment(
'SELECT target, x1, x2, x3 FROM ml_values WHERE class = $$train$$',
'target',
Array['x1', 'x2', 'x3'],
'SELECT cartodb_id, x1, x2, x3 FROM ml_values WHERE class = $$test$$'
)
LIMIT 20
)
SELECT
abs(e.expected - p.prediction) <= 1e-1 AS within_tolerance
FROM expected e, prediction p
WHERE e.id = p.id;

View File

@ -29,26 +29,25 @@ class Segmentation(object):
straight from the SQL calling the function. straight from the SQL calling the function.
Input: Input:
@param target: The 1D array of lenth NSamples containing the @param target: The 1D array of length NSamples containing the
target variable we want the model to predict target variable we want the model to predict
@param features: The 2D array of size NSamples * NFeatures that @param features: The 2D array of size NSamples * NFeatures that
form the imput to the model form the input to the model
@param target_ids: A 1D array of target_ids that will be used @param target_ids: A 1D array of target_ids that will be used
to associate the results of the prediction with the rows which to associate the results of the prediction with the rows which
they come from they come from
@param model_parameters: A dictionary containing parameters for @param model_parameters: A dictionary containing parameters for
the model. the model.
""" """
clean_target = replace_nan_with_mean(target) clean_target, _ = replace_nan_with_mean(target)
clean_features = replace_nan_with_mean(features) clean_features, _ = replace_nan_with_mean(features)
target_features = replace_nan_with_mean(target_features) target_features, _ = replace_nan_with_mean(target_features)
model, accuracy = train_model(clean_target, clean_features, model, accuracy = train_model(clean_target, clean_features,
model_parameters, 0.2) model_parameters, 0.2)
prediction = model.predict(target_features) prediction = model.predict(target_features)
accuracy_array = [accuracy] * prediction.shape[0] accuracy_array = [accuracy] * prediction.shape[0]
return zip(target_ids, prediction, return zip(target_ids, prediction, accuracy_array)
np.full(prediction.shape, accuracy_array))
def create_and_predict_segment(self, query, variable, feature_columns, def create_and_predict_segment(self, query, variable, feature_columns,
target_query, model_params, target_query, model_params,
@ -65,7 +64,6 @@ class Segmentation(object):
scikit learn page for [GradientBoostingRegressor] scikit learn page for [GradientBoostingRegressor]
(http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) (http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
""" """
params = {"subquery": target_query, params = {"subquery": target_query,
"id_col": id_col} "id_col": id_col}
@ -198,7 +196,7 @@ def train_model(target, features, model_params, test_split):
Input: Input:
@param target: 1D Array of the variable that the model is to be @param target: 1D Array of the variable that the model is to be
trained to predict trained to predict
@param features: 2D Array NSamples *NFeatures to use in trining @param features: 2D Array NSamples *NFeatures to use in training
the model the model
@param model_params: A dictionary of model parameters, the full @param model_params: A dictionary of model parameters, the full
specification can be found on the specification can be found on the