adds fuller test suite for segmentation
This commit is contained in:
parent
bc9cae1a63
commit
6260a080a6
@ -25,19 +25,20 @@ AS $$
|
|||||||
|
|
||||||
def unpack2D(data):
|
def unpack2D(data):
|
||||||
dimension = data.pop(0)
|
dimension = data.pop(0)
|
||||||
a = np.array(data, dtype=float)
|
a = np.array(data, dtype=np.float64)
|
||||||
return a.reshape(len(a)/dimension, dimension)
|
return a.reshape(int(len(a)/dimension), int(dimension))
|
||||||
|
|
||||||
return seg.create_and_predict_segment_agg(np.array(target, dtype=float),
|
return seg.create_and_predict_segment_agg(
|
||||||
unpack2D(features),
|
np.array(target, dtype=np.float64),
|
||||||
unpack2D(target_features),
|
unpack2D(features),
|
||||||
target_ids,
|
unpack2D(target_features),
|
||||||
model_params)
|
target_ids,
|
||||||
|
model_params)
|
||||||
|
|
||||||
$$ LANGUAGE plpythonu VOLATILE PARALLEL RESTRICTED;
|
$$ LANGUAGE plpythonu VOLATILE PARALLEL RESTRICTED;
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION
|
CREATE OR REPLACE FUNCTION
|
||||||
CDB_CreateAndPredictSegment (
|
CDB_CreateAndPredictSegment(
|
||||||
query TEXT,
|
query TEXT,
|
||||||
variable_name TEXT,
|
variable_name TEXT,
|
||||||
target_table TEXT,
|
target_table TEXT,
|
||||||
@ -57,9 +58,13 @@ AS $$
|
|||||||
'learning_rate': learning_rate,
|
'learning_rate': learning_rate,
|
||||||
'min_samples_leaf': min_samples_leaf
|
'min_samples_leaf': min_samples_leaf
|
||||||
}
|
}
|
||||||
|
feature_cols = set(plpy.execute('''
|
||||||
|
select * from ({query}) as _w limit 0
|
||||||
|
'''.format(query=query)).colnames()) - set([variable_name, 'cartodb_id', ])
|
||||||
return seg.create_and_predict_segment(
|
return seg.create_and_predict_segment(
|
||||||
query,
|
query,
|
||||||
variable_name,
|
variable_name,
|
||||||
|
feature_cols,
|
||||||
target_table,
|
target_table,
|
||||||
model_params
|
model_params
|
||||||
)
|
)
|
||||||
|
@ -25,3 +25,53 @@ t
|
|||||||
t
|
t
|
||||||
t
|
t
|
||||||
(20 rows)
|
(20 rows)
|
||||||
|
_cdb_random_seeds
|
||||||
|
|
||||||
|
(1 row)
|
||||||
|
within_tolerance
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
(20 rows)
|
||||||
|
_cdb_random_seeds
|
||||||
|
|
||||||
|
(1 row)
|
||||||
|
within_tolerance
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
t
|
||||||
|
(20 rows)
|
||||||
|
4
src/pg/test/fixtures/ml_values.sql
vendored
4
src/pg/test/fixtures/ml_values.sql
vendored
@ -1,7 +1,7 @@
|
|||||||
SET client_min_messages TO WARNING;
|
SET client_min_messages TO WARNING;
|
||||||
\set ECHO none
|
\set ECHO none
|
||||||
CREATE TABLE ml_values (cartodb_id integer, target float, the_geom geometry, x1 float , x2 float, x3 float, class text);
|
CREATE TABLE ml_values (cartodb_id integer, target float, the_geom geometry, x1 float, x2 float, x3 float, class text);
|
||||||
INSERT INTO ml_values(cartodb_id, target,x1,x2,x3, class) VALUES
|
INSERT INTO ml_values(cartodb_id, target, x1, x2, x3, class) VALUES
|
||||||
(0,1.24382137034,0.811403626309,0.657584780869,0,'train'),
|
(0,1.24382137034,0.811403626309,0.657584780869,0,'train'),
|
||||||
(1,1.72727475342,0.447764244847,0.528687533966,1,'train'),
|
(1,1.72727475342,0.447764244847,0.528687533966,1,'train'),
|
||||||
(2,3.32104694099,0.62774565606,0.832647155118,2,'train'),
|
(2,3.32104694099,0.62774565606,0.832647155118,2,'train'),
|
||||||
|
@ -3,31 +3,141 @@
|
|||||||
\i test/fixtures/ml_values.sql
|
\i test/fixtures/ml_values.sql
|
||||||
SELECT cdb_crankshaft._cdb_random_seeds(1234);
|
SELECT cdb_crankshaft._cdb_random_seeds(1234);
|
||||||
|
|
||||||
|
-- first version (pre-aggregated arrays: target, features, target features, target ids)
|
||||||
WITH expected AS (
|
WITH expected AS (
|
||||||
SELECT generate_series(1000,1020) AS id, unnest(ARRAY[
|
SELECT
|
||||||
4.5656517130822492,
|
generate_series(1000, 1020) AS id,
|
||||||
1.7928053473230694,
|
unnest(ARRAY[4.5656517130822492,
|
||||||
1.0283378773916563,
|
1.7928053473230694,
|
||||||
2.6586517814904593,
|
1.0283378773916563,
|
||||||
2.9699056242935944,
|
2.6586517814904593,
|
||||||
3.9550646059951347,
|
2.9699056242935944,
|
||||||
4.1662572444459745,
|
3.9550646059951347,
|
||||||
3.8126334839264162,
|
4.1662572444459745,
|
||||||
1.8809821053623488,
|
3.8126334839264162,
|
||||||
1.6349065129019873,
|
1.8809821053623488,
|
||||||
3.0391288591472954,
|
1.6349065129019873,
|
||||||
3.3035970359672553,
|
3.0391288591472954,
|
||||||
1.5835471589451968,
|
3.3035970359672553,
|
||||||
3.7530378537263638,
|
1.5835471589451968,
|
||||||
1.0833589653009252,
|
3.7530378537263638,
|
||||||
3.8104965452882897,
|
1.0833589653009252,
|
||||||
2.665217959294802,
|
3.8104965452882897,
|
||||||
1.5850334252802472,
|
2.665217959294802,
|
||||||
3.679401198805563,
|
1.5850334252802472,
|
||||||
3.5332033186588636
|
3.679401198805563,
|
||||||
]) AS expected LIMIT 20
|
3.5332033186588636 ]) AS expected
|
||||||
|
LIMIT 20
|
||||||
|
), training as (
|
||||||
|
SELECT
|
||||||
|
array_agg(target)::numeric[] as target,
|
||||||
|
cdb_crankshaft.CDB_PyAgg(Array[x1, x2, x3]::numeric[]) as features
|
||||||
|
FROM (SELECT * FROM ml_values ORDER BY cartodb_id asc) as _w
|
||||||
|
WHERE class = 'train'
|
||||||
|
), testing As (
|
||||||
|
SELECT
|
||||||
|
cdb_crankshaft.CDB_PyAgg(Array[x1, x2, x3]::numeric[]) as features,
|
||||||
|
array_agg(cartodb_id)::numeric[] as cartodb_ids
|
||||||
|
FROM (SELECT * FROM ml_values ORDER BY cartodb_id asc) as _w
|
||||||
|
WHERE class = 'test'
|
||||||
), prediction AS (
|
), prediction AS (
|
||||||
SELECT cartodb_id::integer id, prediction
|
SELECT
|
||||||
FROM cdb_crankshaft.CDB_CreateAndPredictSegment('SELECT target, x1, x2, x3 FROM ml_values WHERE class = $$train$$','target', Array['x1', 'x2', 'x3'], 'SELECT cartodb_id, x1, x2, x3 FROM ml_values WHERE class = $$test$$')
|
*
|
||||||
|
FROM
|
||||||
|
cdb_crankshaft.CDB_CreateAndPredictSegment(
|
||||||
|
(SELECT target FROM training),
|
||||||
|
(SELECT features FROM training),
|
||||||
|
(SELECT features FROM testing),
|
||||||
|
(SELECT cartodb_ids FROM testing)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
SELECT
|
||||||
|
abs(e.expected - p.prediction) <= 1e-1 AS within_tolerance
|
||||||
|
FROM expected e, prediction p
|
||||||
|
WHERE e.id = p.cartodb_id
|
||||||
|
LIMIT 20;
|
||||||
|
SELECT cdb_crankshaft._cdb_random_seeds(1234);
|
||||||
|
|
||||||
|
-- second version (query, not specifying features)
|
||||||
|
WITH expected AS (
|
||||||
|
SELECT
|
||||||
|
generate_series(1000, 1020) AS id,
|
||||||
|
unnest(ARRAY[4.5656517130822492,
|
||||||
|
1.7928053473230694,
|
||||||
|
1.0283378773916563,
|
||||||
|
2.6586517814904593,
|
||||||
|
2.9699056242935944,
|
||||||
|
3.9550646059951347,
|
||||||
|
4.1662572444459745,
|
||||||
|
3.8126334839264162,
|
||||||
|
1.8809821053623488,
|
||||||
|
1.6349065129019873,
|
||||||
|
3.0391288591472954,
|
||||||
|
3.3035970359672553,
|
||||||
|
1.5835471589451968,
|
||||||
|
3.7530378537263638,
|
||||||
|
1.0833589653009252,
|
||||||
|
3.8104965452882897,
|
||||||
|
2.665217959294802,
|
||||||
|
1.5850334252802472,
|
||||||
|
3.679401198805563,
|
||||||
|
3.5332033186588636 ]) AS expected
|
||||||
|
LIMIT 20
|
||||||
|
), prediction AS (
|
||||||
|
SELECT
|
||||||
|
cartodb_id::integer id,
|
||||||
|
prediction
|
||||||
|
FROM cdb_crankshaft.CDB_CreateAndPredictSegment(
|
||||||
|
'SELECT target, x1, x2, x3 FROM ml_values WHERE class = $$train$$ ORDER BY cartodb_id asc',
|
||||||
|
'target',
|
||||||
|
'SELECT cartodb_id, x1, x2, x3 FROM ml_values WHERE class = $$test$$ ORDER BY cartodb_id asc'
|
||||||
|
)
|
||||||
LIMIT 20
|
LIMIT 20
|
||||||
) SELECT abs(e.expected - p.prediction) <= 1e-1 AS within_tolerance FROM expected e, prediction p WHERE e.id = p.id;
|
)
|
||||||
|
SELECT
|
||||||
|
abs(e.expected - p.prediction) <= 1e-1 AS within_tolerance
|
||||||
|
FROM expected e, prediction p
|
||||||
|
WHERE e.id = p.id;
|
||||||
|
|
||||||
|
SELECT cdb_crankshaft._cdb_random_seeds(1234);
|
||||||
|
-- third version (query, specifying features)
|
||||||
|
WITH expected AS (
|
||||||
|
SELECT
|
||||||
|
generate_series(1000, 1020) AS id,
|
||||||
|
unnest(ARRAY[4.5656517130822492,
|
||||||
|
1.7928053473230694,
|
||||||
|
1.0283378773916563,
|
||||||
|
2.6586517814904593,
|
||||||
|
2.9699056242935944,
|
||||||
|
3.9550646059951347,
|
||||||
|
4.1662572444459745,
|
||||||
|
3.8126334839264162,
|
||||||
|
1.8809821053623488,
|
||||||
|
1.6349065129019873,
|
||||||
|
3.0391288591472954,
|
||||||
|
3.3035970359672553,
|
||||||
|
1.5835471589451968,
|
||||||
|
3.7530378537263638,
|
||||||
|
1.0833589653009252,
|
||||||
|
3.8104965452882897,
|
||||||
|
2.665217959294802,
|
||||||
|
1.5850334252802472,
|
||||||
|
3.679401198805563,
|
||||||
|
3.5332033186588636 ]) AS expected
|
||||||
|
LIMIT 20
|
||||||
|
), prediction AS (
|
||||||
|
SELECT
|
||||||
|
cartodb_id::integer id,
|
||||||
|
prediction
|
||||||
|
FROM cdb_crankshaft.CDB_CreateAndPredictSegment(
|
||||||
|
'SELECT target, x1, x2, x3 FROM ml_values WHERE class = $$train$$',
|
||||||
|
'target',
|
||||||
|
Array['x1', 'x2', 'x3'],
|
||||||
|
'SELECT cartodb_id, x1, x2, x3 FROM ml_values WHERE class = $$test$$'
|
||||||
|
)
|
||||||
|
LIMIT 20
|
||||||
|
)
|
||||||
|
SELECT
|
||||||
|
abs(e.expected - p.prediction) <= 1e-1 AS within_tolerance
|
||||||
|
FROM expected e, prediction p
|
||||||
|
WHERE e.id = p.id;
|
||||||
|
@ -29,26 +29,25 @@ class Segmentation(object):
|
|||||||
straight from the SQL calling the function.
|
straight from the SQL calling the function.
|
||||||
|
|
||||||
Input:
|
Input:
|
||||||
@param target: The 1D array of lenth NSamples containing the
|
@param target: The 1D array of length NSamples containing the
|
||||||
target variable we want the model to predict
|
target variable we want the model to predict
|
||||||
@param features: The 2D array of size NSamples * NFeatures that
|
@param features: The 2D array of size NSamples * NFeatures that
|
||||||
form the imput to the model
|
form the input to the model
|
||||||
@param target_ids: A 1D array of target_ids that will be used
|
@param target_ids: A 1D array of target_ids that will be used
|
||||||
to associate the results of the prediction with the rows which
|
to associate the results of the prediction with the rows which
|
||||||
they come from
|
they come from
|
||||||
@param model_parameters: A dictionary containing parameters for
|
@param model_parameters: A dictionary containing parameters for
|
||||||
the model.
|
the model.
|
||||||
"""
|
"""
|
||||||
clean_target = replace_nan_with_mean(target)
|
clean_target, _ = replace_nan_with_mean(target)
|
||||||
clean_features = replace_nan_with_mean(features)
|
clean_features, _ = replace_nan_with_mean(features)
|
||||||
target_features = replace_nan_with_mean(target_features)
|
target_features, _ = replace_nan_with_mean(target_features)
|
||||||
|
|
||||||
model, accuracy = train_model(clean_target, clean_features,
|
model, accuracy = train_model(clean_target, clean_features,
|
||||||
model_parameters, 0.2)
|
model_parameters, 0.2)
|
||||||
prediction = model.predict(target_features)
|
prediction = model.predict(target_features)
|
||||||
accuracy_array = [accuracy] * prediction.shape[0]
|
accuracy_array = [accuracy] * prediction.shape[0]
|
||||||
return zip(target_ids, prediction,
|
return zip(target_ids, prediction, accuracy_array)
|
||||||
np.full(prediction.shape, accuracy_array))
|
|
||||||
|
|
||||||
def create_and_predict_segment(self, query, variable, feature_columns,
|
def create_and_predict_segment(self, query, variable, feature_columns,
|
||||||
target_query, model_params,
|
target_query, model_params,
|
||||||
@ -65,7 +64,6 @@ class Segmentation(object):
|
|||||||
scikit learn page for [GradientBoostingRegressor]
|
scikit learn page for [GradientBoostingRegressor]
|
||||||
(http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
|
(http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
params = {"subquery": target_query,
|
params = {"subquery": target_query,
|
||||||
"id_col": id_col}
|
"id_col": id_col}
|
||||||
|
|
||||||
@ -198,7 +196,7 @@ def train_model(target, features, model_params, test_split):
|
|||||||
Input:
|
Input:
|
||||||
@param target: 1D Array of the variable that the model is to be
|
@param target: 1D Array of the variable that the model is to be
|
||||||
trained to predict
|
trained to predict
|
||||||
@param features: 2D Array NSamples *NFeatures to use in trining
|
@param features: 2D Array NSamples *NFeatures to use in training
|
||||||
the model
|
the model
|
||||||
@param model_params: A dictionary of model parameters, the full
|
@param model_params: A dictionary of model parameters, the full
|
||||||
specification can be found on the
|
specification can be found on the
|
||||||
|
Loading…
Reference in New Issue
Block a user