adding python tests
parent 4df8257377
commit 1d7f62fa85
@@ -19,7 +19,7 @@ def replace_nan_with_mean(array):
 
 def get_data(variable, feature_columns, query):
     columns = ','.join(['array_agg("{col}") as "{col}"'.format(col=col) for col in feature_columns])
-    data = plpy.execute(''' select array_agg("{variable}") as target, {columns} from ({query}) as a'''.format(
+    data = plpy.execute('''select array_agg("{variable}") as target, {columns} from ({query}) as a'''.format(
         variable = variable,
         columns = columns,
         query = query
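For reference, the query string that get_data builds can be reproduced outside the database. The sketch below re-runs only the formatting logic from the hunk above with hypothetical inputs (variable 'y', features x1/x2, and the 'select * from training' subquery used in the new test), so the resulting SQL is illustrative rather than taken verbatim from the module.

    # Standalone sketch of the string formatting used in get_data above.
    # The variable/feature names are hypothetical; feature_columns is a set
    # in the real code, so column order may differ.
    feature_columns = ['x1', 'x2']
    variable = 'y'
    query = 'select * from training'

    columns = ','.join(['array_agg("{col}") as "{col}"'.format(col=col) for col in feature_columns])
    sql = '''select array_agg("{variable}") as target, {columns} from ({query}) as a'''.format(
        variable=variable,
        columns=columns,
        query=query)

    print(sql)
    # -> select array_agg("y") as target, array_agg("x1") as "x1",array_agg("x2") as "x2" from (select * from training) as a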
@@ -34,9 +34,10 @@ def create_and_predict_segment(query,variable,target_query,model_params):
     generate a segment with machine learning
     Stuart Lynn
     """
 
     columns = plpy.execute('select * from ({query}) a limit 1 '.format(query=query))[0].keys()
 
-    feature_columns = set(columns) - set([variable, 'the_geom', 'the_geom_webmercator'])
+    feature_columns = set(columns) - set([variable, 'cartodb_id', 'the_geom', 'the_geom_webmercator'])
     target,features = get_data(variable, feature_columns, query)
 
     model, accuracy = train_model(target,features, model_params, 0.2)
@@ -75,7 +76,6 @@ def predict_segment(model,features,target_query):
 
     while True:
         rows = cursor.fetch(batch_size)
 
         if not rows:
             break
         batch = np.row_stack([np.array(row['features'], dtype=float) for row in rows])
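The hunk above fetches cursor rows in batches and stacks their 'features' arrays before scoring them. A minimal standalone sketch of that stacking step, using two hand-written rows in place of a plpy cursor (the dict shape mirrors the row_type=True fixtures in the new test):

    import numpy as np

    # Two hypothetical rows shaped like the cursor results predict_segment consumes.
    rows = [{'features': [0.1, 0.9, 2]},
            {'features': [0.4, 0.2, 0]}]

    # Stack each row's feature list into one (n_rows, n_features) float matrix,
    # which is what the fitted model expects for batch prediction.
    batch = np.row_stack([np.array(row['features'], dtype=float) for row in rows])
    print(batch.shape)  # (2, 3)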
src/py/crankshaft/test/test_segmentation.py (new file, 69 lines)
@@ -0,0 +1,69 @@
+import unittest
+import numpy as np
+
+
+# from mock_plpy import MockPlPy
+# plpy = MockPlPy()
+#
+# import sys
+# sys.modules['plpy'] = plpy
+from helper import plpy, fixture_file
+
+import crankshaft.segmentation as segmentation
+import json
+
+class SegmentationTest(unittest.TestCase):
+    """Testing class for segmentation functions"""
+
+    def setUp(self):
+        plpy._reset()
+
+    def generate_random_data(self, n_samples, random_state, row_type=False):
+        x1 = random_state.uniform(size=n_samples)
+        x2 = random_state.uniform(size=n_samples)
+        x3 = random_state.randint(0, 4, size=n_samples)
+
+        y = x1 + x2*x2 + x3
+        cartodb_id = range(len(x1))
+
+        if row_type:
+            return [{'features': vals} for vals in zip(x1, x2, x3)], y
+        else:
+            return [dict(zip(['x1', 'x2', 'x3', 'target', 'cartodb_id'], [x1, x2, x3, y, cartodb_id]))]
+
+    def test_create_and_predict_segment(self):
+        n_samples = 1000
+
+        random_state_train = np.random.RandomState(13)
+        random_state_test = np.random.RandomState(134)
+        training_data = self.generate_random_data(n_samples, random_state_train)
+        test_data, test_y = self.generate_random_data(n_samples, random_state_test, row_type=True)
+
+        ids = [{'cartodb_ids': range(len(test_data))}]
+        rows = [{'x1': 0, 'x2': 0, 'x3': 0, 'y': 0, 'cartodb_id': 0}]
+
+        plpy._define_result('select \* from \(select \* from training\) a limit 1', rows)
+        plpy._define_result('.*from \(select \* from training\) as a', training_data)
+        plpy._define_result('select array_agg\(cartodb\_id order by cartodb\_id\) as cartodb_ids from \(.*\) a', ids)
+        plpy._define_result('.*select \* from test.*', test_data)
+
+
+        model_parameters = {'n_estimators': 1200,
+                            'max_depth': 3,
+                            'subsample': 0.5,
+                            'learning_rate': 0.01,
+                            'min_samples_leaf': 1}
+
+        result = segmentation.create_and_predict_segment(
+            'select * from training',
+            'y',
+            'select * from test',
+            model_parameters)
+
+        prediction = [r[1] for r in result]
+
+        accuracy = np.sqrt(np.mean(np.square(np.array(prediction) - np.array(test_y))))
+
+        self.assertEqual(len(result), len(test_data))
+        self.assertTrue(result[0][2] < 0.01)
+        self.assertTrue(accuracy < 0.5 * np.mean(test_y))
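The test drives everything through the plpy object imported from helper, calling _reset() and _define_result(pattern, rows) to register canned query results keyed by regex. That helper module is not part of this diff, so the sketch below is only an assumed, minimal stand-in showing how such a mock could satisfy those calls; the project's real MockPlPy may differ.

    import re

    class MockPlPy(object):
        """Assumed minimal mock of PL/Python's plpy module (not the real test helper)."""

        def __init__(self):
            self._results = []

        def _reset(self):
            # Forget previously registered results (used in setUp).
            self._results = []

        def _define_result(self, pattern, rows):
            # Map a query regex to the rows that execute() should return for it.
            self._results.append((re.compile(pattern, re.IGNORECASE), rows))

        def execute(self, query, *args):
            # Return the canned rows for the first registered pattern that matches.
            for pattern, rows in self._results:
                if pattern.search(query):
                    return rows
            return []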