updating formatting
This commit is contained in:
parent
dae406927f
commit
c80975fe46
@ -9,28 +9,65 @@ from sklearn.ensemble import GradientBoostingRegressor
|
|||||||
from sklearn import metrics
|
from sklearn import metrics
|
||||||
from sklearn.cross_validation import train_test_split
|
from sklearn.cross_validation import train_test_split
|
||||||
|
|
||||||
# High level interface ---------------------------------------
|
# Lower level functions
|
||||||
|
#----------------------
|
||||||
|
|
||||||
def replace_nan_with_mean(array):
|
def replace_nan_with_mean(array):
|
||||||
indices = np.where(np.isnan(array)) #returns an array of rows and column indices
|
"""
|
||||||
|
Input:
|
||||||
|
@param array: an array of floats which may have null-valued entries
|
||||||
|
Output:
|
||||||
|
array with nans filled in with the mean of the dataset
|
||||||
|
"""
|
||||||
|
# returns an array of rows and column indices
|
||||||
|
indices = np.where(np.isnan(array))
|
||||||
|
|
||||||
|
# iterate through entries which have nan values
|
||||||
for row, col in zip(*indices):
|
for row, col in zip(*indices):
|
||||||
array[row,col] = np.mean(array[~np.isnan(array[:,col]), col])
|
array[row, col] = np.mean(array[~np.isnan(array[:, col]), col])
|
||||||
|
|
||||||
return array
|
return array
|
||||||
|
|
||||||
def get_data(variable, feature_columns, query):
|
def get_data(variable, feature_columns, query):
|
||||||
columns = ','.join(['array_agg("{col}") as "{col}"'.format(col=col) for col in feature_columns])
|
"""
|
||||||
data = plpy.execute('''select array_agg("{variable}") as target, {columns} from ({query}) as a'''.format(
|
Fetch data from the database, clean, and package into
|
||||||
|
numpy arrays
|
||||||
|
Input:
|
||||||
|
@param variable: name of the target variable
|
||||||
|
@param feature_columns: list of column names
|
||||||
|
@param query: subquery that data is pulled from for the packaging
|
||||||
|
Output:
|
||||||
|
prepared data, packaged into NumPy arrays
|
||||||
|
"""
|
||||||
|
|
||||||
|
columns = ','.join(['array_agg("{col}") As "{col}"'.format(col=col) for col in feature_columns])
|
||||||
|
|
||||||
|
data = plpy.execute('''
|
||||||
|
SELECT array_agg("{variable}") As target,
|
||||||
|
{columns}
|
||||||
|
FROM ({query}) As a
|
||||||
|
'''.format(
|
||||||
variable = variable,
|
variable = variable,
|
||||||
columns = columns,
|
columns = columns,
|
||||||
query = query
|
query = query
|
||||||
))
|
))
|
||||||
|
|
||||||
target = np.array(data[0]['target'])
|
target = np.array(data[0]['target'])
|
||||||
|
|
||||||
|
# put arrays into an n x m array of arrays
|
||||||
features = np.column_stack([np.array(data[0][col], dtype=float) for col in feature_columns])
|
features = np.column_stack([np.array(data[0][col], dtype=float) for col in feature_columns])
|
||||||
|
|
||||||
return replace_nan_with_mean(target), replace_nan_with_mean(features)
|
return replace_nan_with_mean(target),
|
||||||
|
replace_nan_with_mean(features)
|
||||||
|
|
||||||
|
# High level interface
|
||||||
|
# --------------------
|
||||||
|
|
||||||
|
def create_and_predict_segment_agg(target, features, target_features, target_ids, model_parameters):
|
||||||
|
"""
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
def create_and_predict_segment_agg(target, features, target_features, target_ids,model_parameters):
|
|
||||||
clean_target = replace_nan_with_mean(target)
|
clean_target = replace_nan_with_mean(target)
|
||||||
clean_features = replace_nan_with_mean(features)
|
clean_features = replace_nan_with_mean(features)
|
||||||
target_features = replace_nan_with_mean(target_features)
|
target_features = replace_nan_with_mean(target_features)
|
||||||
@ -41,13 +78,13 @@ def create_and_predict_segment_agg(target, features, target_features, target_ids
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
def create_and_predict_segment(query,variable,target_query,model_params):
|
def create_and_predict_segment(query, variable, target_query, model_params):
|
||||||
"""
|
"""
|
||||||
generate a segment with machine learning
|
generate a segment with machine learning
|
||||||
Stuart Lynn
|
Stuart Lynn
|
||||||
"""
|
"""
|
||||||
|
|
||||||
columns = plpy.execute('select * from ({query}) a limit 1 '.format(query=query))[0].keys()
|
columns = plpy.execute('SELECT * FROM ({query}) As a LIMIT 1 '.format(query=query))[0].keys()
|
||||||
|
|
||||||
feature_columns = set(columns) - set([variable, 'cartodb_id', 'the_geom', 'the_geom_webmercator'])
|
feature_columns = set(columns) - set([variable, 'cartodb_id', 'the_geom', 'the_geom_webmercator'])
|
||||||
target,features = get_data(variable, feature_columns, query)
|
target,features = get_data(variable, feature_columns, query)
|
||||||
@ -57,46 +94,64 @@ def create_and_predict_segment(query,variable,target_query,model_params):
|
|||||||
return zip(cartodb_ids, result, np.full(result.shape, accuracy ))
|
return zip(cartodb_ids, result, np.full(result.shape, accuracy ))
|
||||||
|
|
||||||
|
|
||||||
def train_model(target,features,model_params,test_split):
|
def train_model(target, features, model_params, test_split):
|
||||||
|
"""
|
||||||
|
|
||||||
|
"""
|
||||||
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=test_split)
|
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=test_split)
|
||||||
model = GradientBoostingRegressor(**model_params)
|
model = GradientBoostingRegressor(**model_params)
|
||||||
model.fit(features_train, target_train)
|
model.fit(features_train, target_train)
|
||||||
accuracy = calculate_model_accuracy(model,features,target)
|
accuracy = calculate_model_accuracy(model,features,target)
|
||||||
return model, accuracy
|
return model, accuracy
|
||||||
|
|
||||||
def calculate_model_accuracy(model,features,target):
|
def calculate_model_accuracy(model, features, target):
|
||||||
|
"""
|
||||||
|
Calculate the mean squared error of the model prediction
|
||||||
|
Input:
|
||||||
|
@param model: model trained from input features
|
||||||
|
@param features: features to make a prediction from
|
||||||
|
@param target: target to compare prediction to
|
||||||
|
Output:
|
||||||
|
mean squared error of the model prection compared to the target
|
||||||
|
"""
|
||||||
prediction = model.predict(features)
|
prediction = model.predict(features)
|
||||||
return metrics.mean_squared_error(prediction,target)
|
return metrics.mean_squared_error(prediction, target)
|
||||||
|
|
||||||
def predict_segment(model,features,target_query):
|
def predict_segment(model, features, target_query):
|
||||||
"""
|
"""
|
||||||
predict a segment with machine learning
|
predict a segment with machine learning
|
||||||
Stuart Lynn
|
Stuart Lynn
|
||||||
|
|
||||||
|
description of params?
|
||||||
"""
|
"""
|
||||||
|
|
||||||
batch_size = 1000
|
batch_size = 1000
|
||||||
joined_features = ','.join(['"{0}"::numeric'.format(a) for a in features])
|
joined_features = ','.join(['"{0}"::numeric'.format(a) for a in features])
|
||||||
|
|
||||||
cursor = plpy.cursor('select Array[{joined_features}] features from ({target_query}) a'.format(
|
cursor = plpy.cursor('''
|
||||||
|
SELECT Array[{joined_features}] As features
|
||||||
|
FROM ({target_query}) As a
|
||||||
|
'''.format(
|
||||||
joined_features=joined_features,
|
joined_features=joined_features,
|
||||||
target_query= target_query
|
target_query= target_query))
|
||||||
))
|
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
rows = cursor.fetch(batch_size)
|
rows = cursor.fetch(batch_size)
|
||||||
if not rows:
|
if not rows:
|
||||||
break
|
break
|
||||||
batch = np.row_stack([np.array(row['features'], dtype=float) for row in rows])
|
batch = np.row_stack([np.array(row['features'], dtype=float) for row in rows])
|
||||||
|
|
||||||
#Need to fix this. Should be global mean. This will cause weird effects
|
#Need to fix this. Should be global mean. This will cause weird effects
|
||||||
batch = replace_nan_with_mean(batch)
|
batch = replace_nan_with_mean(batch)
|
||||||
prediction = model.predict(batch)
|
prediction = model.predict(batch)
|
||||||
results.append(prediction)
|
results.append(prediction)
|
||||||
|
|
||||||
|
|
||||||
cartodb_ids = plpy.execute('select array_agg(cartodb_id order by cartodb_id) as cartodb_ids from ({0}) a '.format(target_query))[0]['cartodb_ids']
|
cartodb_ids = plpy.execute('''
|
||||||
return cartodb_ids, np.concatenate(results)
|
SELECT array_agg(cartodb_id ORDER BY cartodb_id) As cartodb_ids
|
||||||
|
FROM ({0}) As a
|
||||||
|
'''.format(target_query))[0]['cartodb_ids']
|
||||||
|
|
||||||
|
return cartodb_ids, np.concatenate(results)
|
||||||
|
Loading…
Reference in New Issue
Block a user