mirror of
https://github.com/CartoDB/crankshaft.git
synced 2024-11-01 10:20:48 +08:00
updating according to class
This commit is contained in:
parent
9c2f68fcaf
commit
959747c623
@ -15,7 +15,8 @@ AS $$
|
||||
import numpy as np
|
||||
import plpy
|
||||
|
||||
from crankshaft.segmentation import create_and_predict_segment_agg
|
||||
from crankshaft.segmentation import Segmentation
|
||||
seg = Segmentation()
|
||||
model_params = {'n_estimators': n_estimators,
|
||||
'max_depth': max_depth,
|
||||
'subsample': subsample,
|
||||
@ -27,7 +28,7 @@ AS $$
|
||||
a = np.array(data, dtype=float)
|
||||
return a.reshape(len(a)/dimension, dimension)
|
||||
|
||||
return create_and_predict_segment_agg(np.array(target, dtype=float),
|
||||
return seg.create_and_predict_segment_agg(np.array(target, dtype=float),
|
||||
unpack2D(features),
|
||||
unpack2D(target_features),
|
||||
target_ids,
|
||||
@ -65,7 +66,8 @@ CREATE OR REPLACE FUNCTION
|
||||
min_samples_leaf INTEGER DEFAULT 1)
|
||||
RETURNS TABLE (cartodb_id TEXT, prediction NUMERIC, accuracy NUMERIC)
|
||||
AS $$
|
||||
from crankshaft.segmentation import create_and_predict_segment
|
||||
from crankshaft.segmentation import Segmentation
|
||||
seg = Segmentation()
|
||||
model_params = {'n_estimators': n_estimators, 'max_depth':max_depth, 'subsample' : subsample, 'learning_rate': learning_rate, 'min_samples_leaf' : min_samples_leaf}
|
||||
return create_and_predict_segment(query,variable_name,target_table, model_params)
|
||||
return seg.create_and_predict_segment(query,variable_name,target_table, model_params)
|
||||
$$ LANGUAGE plpythonu;
|
||||
|
@ -1 +1,2 @@
|
||||
"""Import all functions from for segmentation"""
|
||||
from segmentation import *
|
||||
|
@ -68,10 +68,11 @@ class Segmentation(object):
|
||||
"""
|
||||
|
||||
params = {"subquery": target_query,
|
||||
"id_col": id_col}
|
||||
"id_col": id_col,
|
||||
"feature_columns": features}
|
||||
|
||||
target, features, target_mean, \
|
||||
feature_means = self.clean_data(variable, feature_columns, query)
|
||||
feature_means = self.clean_data(variable, feature_columns, query)
|
||||
|
||||
model, accuracy = train_model(target, features, model_params, 0.2)
|
||||
result = self.predict_segment(model, feature_columns, target_query,
|
||||
@ -82,7 +83,8 @@ class Segmentation(object):
|
||||
|
||||
return zip(rowid, result, accuracy_array)
|
||||
|
||||
def predict_segment(self, model, feature_columns, target_query, feature_means):
|
||||
def predict_segment(self, model, feature_columns, target_query,
|
||||
feature_means):
|
||||
"""
|
||||
Use the provided model to predict the values for the new feature set
|
||||
Input:
|
||||
@ -115,7 +117,6 @@ class Segmentation(object):
|
||||
# NOTE: we removed the cartodb_ids calculation in here
|
||||
return np.concatenate(results)
|
||||
|
||||
|
||||
def clean_data(self, query, variable, feature_columns):
|
||||
"""
|
||||
Add docstring
|
||||
@ -179,8 +180,8 @@ def train_model(target, features, model_params, test_split):
|
||||
testing the model / calculating the accuracy
|
||||
"""
|
||||
features_train, features_test, \
|
||||
target_train, target_test = train_test_split(features, target,
|
||||
test_size=test_split)
|
||||
target_train, target_test = train_test_split(features, target,
|
||||
test_size=test_split)
|
||||
model = GradientBoostingRegressor(**model_params)
|
||||
model.fit(features_train, target_train)
|
||||
accuracy = calculate_model_accuracy(model, features_test, target_test)
|
||||
|
@ -1,7 +1,7 @@
|
||||
import unittest
|
||||
import numpy as np
|
||||
from helper import plpy, fixture_file
|
||||
import crankshaft.segmentation as segmentation
|
||||
from crankshaft.segmentation import Segmentation
|
||||
import json
|
||||
|
||||
class SegmentationTest(unittest.TestCase):
|
||||
@ -48,16 +48,23 @@ class SegmentationTest(unittest.TestCase):
|
||||
'subsample' : 0.5,
|
||||
'learning_rate': 0.01,
|
||||
'min_samples_leaf': 1}
|
||||
seg = Segmentation()
|
||||
'''
|
||||
self, query, variable, feature_columns,
|
||||
target_query, model_params,
|
||||
id_col='cartodb_id'
|
||||
'''
|
||||
|
||||
result = segmentation.create_and_predict_segment(
|
||||
result = seg.create_and_predict_segment(
|
||||
'select * from training',
|
||||
'target',
|
||||
'feature_columns',
|
||||
'select * from test',
|
||||
model_parameters)
|
||||
model_parameters)
|
||||
|
||||
prediction = [r[1] for r in result]
|
||||
|
||||
accuracy =np.sqrt(np.mean( np.square( np.array(prediction) - np.array(test_y))))
|
||||
accuracy = np.sqrt(np.mean( np.square( np.array(prediction) - np.array(test_y))))
|
||||
|
||||
self.assertEqual(len(result),len(test_data))
|
||||
self.assertTrue( result[0][2] < 0.01)
|
||||
|
Loading…
Reference in New Issue
Block a user