adds fuller test suite for segmentation

This commit is contained in:
Andy Eschbacher 2018-03-15 11:41:47 -04:00
parent bc9cae1a63
commit 6260a080a6
5 changed files with 207 additions and 44 deletions

View File

@@ -25,10 +25,11 @@ AS $$
def unpack2D(data):
dimension = data.pop(0)
a = np.array(data, dtype=float)
return a.reshape(len(a)/dimension, dimension)
a = np.array(data, dtype=np.float64)
return a.reshape(int(len(a)/dimension), int(dimension))
return seg.create_and_predict_segment_agg(np.array(target, dtype=float),
return seg.create_and_predict_segment_agg(
np.array(target, dtype=np.float64),
unpack2D(features),
unpack2D(target_features),
target_ids,
@@ -57,9 +58,13 @@ AS $$
'learning_rate': learning_rate,
'min_samples_leaf': min_samples_leaf
}
feature_cols = set(plpy.execute('''
select * from ({query}) as _w limit 0
'''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ])
return seg.create_and_predict_segment(
query,
variable_name,
feature_cols,
target_table,
model_params
)

View File

@@ -25,3 +25,53 @@ t
t
t
(20 rows)
_cdb_random_seeds
(1 row)
within_tolerance
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
(20 rows)
_cdb_random_seeds
(1 row)
within_tolerance
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
t
(20 rows)

View File

@@ -3,9 +3,11 @@
\i test/fixtures/ml_values.sql
SELECT cdb_crankshaft._cdb_random_seeds(1234);
-- second version (query, not specifying features)
WITH expected AS (
SELECT generate_series(1000,1020) AS id, unnest(ARRAY[
4.5656517130822492,
SELECT
generate_series(1000, 1020) AS id,
unnest(ARRAY[4.5656517130822492,
1.7928053473230694,
1.0283378773916563,
2.6586517814904593,
@@ -24,10 +26,118 @@ WITH expected AS (
2.665217959294802,
1.5850334252802472,
3.679401198805563,
3.5332033186588636
]) AS expected LIMIT 20
), prediction AS (
SELECT cartodb_id::integer id, prediction
FROM cdb_crankshaft.CDB_CreateAndPredictSegment('SELECT target, x1, x2, x3 FROM ml_values WHERE class = $$train$$','target', Array['x1', 'x2', 'x3'], 'SELECT cartodb_id, x1, x2, x3 FROM ml_values WHERE class = $$test$$')
3.5332033186588636 ]) AS expected
LIMIT 20
) SELECT abs(e.expected - p.prediction) <= 1e-1 AS within_tolerance FROM expected e, prediction p WHERE e.id = p.id;
), training as (
SELECT
array_agg(target)::numeric[] as target,
cdb_crankshaft.CDB_PyAgg(Array[x1, x2, x3]::numeric[]) as features
FROM (SELECT * FROM ml_values ORDER BY cartodb_id asc) as _w
WHERE class = 'train'
), testing As (
SELECT
cdb_crankshaft.CDB_PyAgg(Array[x1, x2, x3]::numeric[]) as features,
array_agg(cartodb_id)::numeric[] as cartodb_ids
FROM (SELECT * FROM ml_values ORDER BY cartodb_id asc) as _w
WHERE class = 'test'
), prediction AS (
SELECT
*
FROM
cdb_crankshaft.CDB_CreateAndPredictSegment(
(SELECT target FROM training),
(SELECT features FROM training),
(SELECT features FROM testing),
(SELECT cartodb_ids FROM testing)
)
)
SELECT
abs(e.expected - p.prediction) <= 1e-1 AS within_tolerance
FROM expected e, prediction p
WHERE e.id = p.cartodb_id
LIMIT 20;
SELECT cdb_crankshaft._cdb_random_seeds(1234);
-- second version (query, not specifying features)
WITH expected AS (
SELECT
generate_series(1000, 1020) AS id,
unnest(ARRAY[4.5656517130822492,
1.7928053473230694,
1.0283378773916563,
2.6586517814904593,
2.9699056242935944,
3.9550646059951347,
4.1662572444459745,
3.8126334839264162,
1.8809821053623488,
1.6349065129019873,
3.0391288591472954,
3.3035970359672553,
1.5835471589451968,
3.7530378537263638,
1.0833589653009252,
3.8104965452882897,
2.665217959294802,
1.5850334252802472,
3.679401198805563,
3.5332033186588636 ]) AS expected
LIMIT 20
), prediction AS (
SELECT
cartodb_id::integer id,
prediction
FROM cdb_crankshaft.CDB_CreateAndPredictSegment(
'SELECT target, x1, x2, x3 FROM ml_values WHERE class = $$train$$ ORDER BY cartodb_id asc',
'target',
'SELECT cartodb_id, x1, x2, x3 FROM ml_values WHERE class = $$test$$ ORDER BY cartodb_id asc'
)
LIMIT 20
)
SELECT
abs(e.expected - p.prediction) <= 1e-1 AS within_tolerance
FROM expected e, prediction p
WHERE e.id = p.id;
SELECT cdb_crankshaft._cdb_random_seeds(1234);
-- third version (query, specifying features)
WITH expected AS (
SELECT
generate_series(1000, 1020) AS id,
unnest(ARRAY[4.5656517130822492,
1.7928053473230694,
1.0283378773916563,
2.6586517814904593,
2.9699056242935944,
3.9550646059951347,
4.1662572444459745,
3.8126334839264162,
1.8809821053623488,
1.6349065129019873,
3.0391288591472954,
3.3035970359672553,
1.5835471589451968,
3.7530378537263638,
1.0833589653009252,
3.8104965452882897,
2.665217959294802,
1.5850334252802472,
3.679401198805563,
3.5332033186588636 ]) AS expected
LIMIT 20
), prediction AS (
SELECT
cartodb_id::integer id,
prediction
FROM cdb_crankshaft.CDB_CreateAndPredictSegment(
'SELECT target, x1, x2, x3 FROM ml_values WHERE class = $$train$$',
'target',
Array['x1', 'x2', 'x3'],
'SELECT cartodb_id, x1, x2, x3 FROM ml_values WHERE class = $$test$$'
)
LIMIT 20
)
SELECT
abs(e.expected - p.prediction) <= 1e-1 AS within_tolerance
FROM expected e, prediction p
WHERE e.id = p.id;

View File

@@ -29,26 +29,25 @@ class Segmentation(object):
straight form the SQL calling the function.
Input:
@param target: The 1D array of lenth NSamples containing the
@param target: The 1D array of length NSamples containing the
target variable we want the model to predict
@param features: The 2D array of size NSamples * NFeatures that
form the imput to the model
form the input to the model
@param target_ids: A 1D array of target_ids that will be used
to associate the results of the prediction with the rows which
they come from
@param model_parameters: A dictionary containing parameters for
the model.
"""
clean_target = replace_nan_with_mean(target)
clean_features = replace_nan_with_mean(features)
target_features = replace_nan_with_mean(target_features)
clean_target, _ = replace_nan_with_mean(target)
clean_features, _ = replace_nan_with_mean(features)
target_features, _ = replace_nan_with_mean(target_features)
model, accuracy = train_model(clean_target, clean_features,
model_parameters, 0.2)
prediction = model.predict(target_features)
accuracy_array = [accuracy] * prediction.shape[0]
return zip(target_ids, prediction,
np.full(prediction.shape, accuracy_array))
return zip(target_ids, prediction, accuracy_array)
def create_and_predict_segment(self, query, variable, feature_columns,
target_query, model_params,
@@ -65,7 +64,6 @@ class Segmentation(object):
scikit learn page for [GradientBoostingRegressor]
(http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
"""
params = {"subquery": target_query,
"id_col": id_col}
@@ -198,7 +196,7 @@ def train_model(target, features, model_params, test_split):
Input:
@param target: 1D Array of the variable that the model is to be
trained to predict
@param features: 2D Array NSamples *NFeatures to use in trining
@param features: 2D Array NSamples *NFeatures to use in training
the model
@param model_params: A dictionary of model parameters, the full
specification can be found on the