fleshing out segmentation
This commit is contained in:
parent
746dcc9723
commit
d96d6b2c48
@ -0,0 +1 @@
|
||||
from segmentation import *
|
@ -1,118 +1,118 @@
|
||||
"""
|
||||
Segmentation creation and prediction
|
||||
"""
|
||||
|
||||
import sklearn
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pickle
|
||||
import plpy
|
||||
from sklearn.ensemble import ExtraTreesRegressor
|
||||
from sklearn import metrics
|
||||
from sklearn.cross_validation import train_test_split
|
||||
|
||||
# High level interface ---------------------------------------
|
||||
|
||||
def cdb_create_segment(segment_name,table_name,column_name,geoid_column,census_table,method):
|
||||
"""
|
||||
generate a segment with machine learning
|
||||
Stuart Lynn
|
||||
"""
|
||||
data = pd.DataFrame(join_with_census(table_name, column_name,geoid_column, census_table))
|
||||
features = data[data.columns.difference([column_name, 'geoid'])]
|
||||
target, mean, std = normalize(data[column_name])
|
||||
model, accuracy = train_model(target,features, test_split=0.2)
|
||||
save_model(segment_name, model, accuracy, table_name, column_name, census_table, geoid_column, method)
|
||||
return accuracy
|
||||
|
||||
def normalize(target):
|
||||
mean = np.mean(target)
|
||||
std = no.std(target)
|
||||
return (target - mean)/std, mean, std
|
||||
|
||||
def denormalize(target, mean ,std):
|
||||
return target*std + mean
|
||||
|
||||
def train_model(target,features,test_split):
|
||||
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=test_split)
|
||||
model = ExtraTreesRegressor(n_estimators = 40, max_features=len(features.columns))
|
||||
model.fit(features_train, target_train)
|
||||
accuracy = calculate_model_accuracy(model,features,target)
|
||||
return model, accuracy
|
||||
|
||||
def calculate_model_accuracy(model,features,target):
|
||||
prediction = self.model.predict(features)
|
||||
return metrics.mean_squared_error(prediction,target)/np.std(target)
|
||||
|
||||
def join_with_census(table_name, column_name, geoid_column, census_table):
|
||||
coulmns = plpy.execute('select {census_table}.* limit 1 ')
|
||||
feature_names = ",".join(columns.keys.difference(['the_geom','cartodb_id']))
|
||||
join_data = plpy.execute('''
|
||||
WITH region_extent AS (
|
||||
SELECT ST_Extent(the_geom) as table_extent FROM {table_name};
|
||||
)
|
||||
SELECT {features_names}, {table_name}.{column_name}
|
||||
FROM {table_name} ,region_extent
|
||||
JOIN {census_table}
|
||||
ON {table_name}.{geoid_column} = {census_table}.geoid
|
||||
WHERE {census_table}.the_geom && region_extent.table_extent
|
||||
'''.format(**locals()))
|
||||
|
||||
if len(join_data) == 0:
|
||||
plpy.notice('Failed to join with census data')
|
||||
|
||||
return join_data
|
||||
|
||||
def cdb_predict_segment(segment_name,geoid_column,census_table):
|
||||
"""
|
||||
predict a segment with machine learning
|
||||
Stuart Lynn
|
||||
"""
|
||||
data = fetch_model(segment_name)
|
||||
model = data['model']
|
||||
features = ",".join(data['features'])
|
||||
targets = plpy.execute('select {features} from {census_table}')
|
||||
geo_ids = plpy.execute('select geoid from {census_table}')
|
||||
result = model.predict(targets)
|
||||
return zip(geo_ids,prediction)
|
||||
|
||||
|
||||
def fetch_model(model_name):
|
||||
"""
|
||||
fetch a model from storage
|
||||
"""
|
||||
data = plpy.execute('select * from models where name={model_name}')
|
||||
if len(data)==0:
|
||||
plpy.notice('model not found')
|
||||
data = data[0]
|
||||
data['model'] = pickle.load(data['model'])
|
||||
return data
|
||||
|
||||
|
||||
def create_model_table(model_name):
|
||||
"""
|
||||
create the model table if requred
|
||||
"""
|
||||
plpy.execute('''
|
||||
CREATE table IF NOT EXISTS _cdb_models(
|
||||
name TEXT,
|
||||
model BLOB,
|
||||
features TEXT[],
|
||||
accuracy NUMERIC,
|
||||
table_name TEXT,
|
||||
)''')
|
||||
|
||||
def save_model(model_name,model,accuracy,table_name, column_name,census_table,geoid_column,method):
|
||||
"""
|
||||
save a model to the model table for later use
|
||||
"""
|
||||
|
||||
plpy.execute('''
|
||||
DELETE FROM _cdb_models WHERE model_name = {model_name}
|
||||
'''.format(**locals()))
|
||||
|
||||
plpy.execute("""
|
||||
INSERT INTO _cdb_models ({model_name},{model_pickle},{accuracy})
|
||||
""")
|
||||
|
||||
def
|
||||
# """
|
||||
# Segmentation creation and prediction
|
||||
# """
|
||||
#
|
||||
# import sklearn
|
||||
# import numpy as np
|
||||
# import pandas as pd
|
||||
# import pickle
|
||||
# import plpy
|
||||
# from sklearn.ensemble import ExtraTreesRegressor
|
||||
# from sklearn import metrics
|
||||
# from sklearn.cross_validation import train_test_split
|
||||
#
|
||||
# # High level interface ---------------------------------------
|
||||
#
|
||||
# def cdb_create_segment(segment_name,table_name,column_name,geoid_column,census_table,method):
|
||||
# """
|
||||
# generate a segment with machine learning
|
||||
# Stuart Lynn
|
||||
# """
|
||||
# data = pd.DataFrame(join_with_census(table_name, column_name,geoid_column, census_table))
|
||||
# features = data[data.columns.difference([column_name, 'geoid'])]
|
||||
# target, mean, std = normalize(data[column_name])
|
||||
# model, accuracy = train_model(target,features, test_split=0.2)
|
||||
# save_model(segment_name, model, accuracy, table_name, column_name, census_table, geoid_column, method)
|
||||
# return accuracy
|
||||
#
|
||||
# def normalize(target):
|
||||
# mean = np.mean(target)
|
||||
# std = no.std(target)
|
||||
# return (target - mean)/std, mean, std
|
||||
#
|
||||
# def denormalize(target, mean ,std):
|
||||
# return target*std + mean
|
||||
#
|
||||
# def train_model(target,features,test_split):
|
||||
# features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=test_split)
|
||||
# model = ExtraTreesRegressor(n_estimators = 40, max_features=len(features.columns))
|
||||
# model.fit(features_train, target_train)
|
||||
# accuracy = calculate_model_accuracy(model,features,target)
|
||||
# return model, accuracy
|
||||
#
|
||||
# def calculate_model_accuracy(model,features,target):
|
||||
# prediction = self.model.predict(features)
|
||||
# return metrics.mean_squared_error(prediction,target)/np.std(target)
|
||||
#
|
||||
# def join_with_census(table_name, column_name, geoid_column, census_table):
|
||||
# coulmns = plpy.execute('select {census_table}.* limit 1 ')
|
||||
# feature_names = ",".join(columns.keys.difference(['the_geom','cartodb_id']))
|
||||
# join_data = plpy.execute('''
|
||||
# WITH region_extent AS (
|
||||
# SELECT ST_Extent(the_geom) as table_extent FROM {table_name};
|
||||
# )
|
||||
# SELECT {features_names}, {table_name}.{column_name}
|
||||
# FROM {table_name} ,region_extent
|
||||
# JOIN {census_table}
|
||||
# ON {table_name}.{geoid_column} = {census_table}.geoid
|
||||
# WHERE {census_table}.the_geom && region_extent.table_extent
|
||||
# '''.format(**locals()))
|
||||
#
|
||||
# if len(join_data) == 0:
|
||||
# plpy.notice('Failed to join with census data')
|
||||
#
|
||||
# return join_data
|
||||
#
|
||||
# def cdb_predict_segment(segment_name,geoid_column,census_table):
|
||||
# """
|
||||
# predict a segment with machine learning
|
||||
# Stuart Lynn
|
||||
# """
|
||||
# data = fetch_model(segment_name)
|
||||
# model = data['model']
|
||||
# features = ",".join(data['features'])
|
||||
# targets = plpy.execute('select {features} from {census_table}')
|
||||
# geo_ids = plpy.execute('select geoid from {census_table}')
|
||||
# result = model.predict(targets)
|
||||
# return zip(geo_ids,prediction)
|
||||
#
|
||||
#
|
||||
# def fetch_model(model_name):
|
||||
# """
|
||||
# fetch a model from storage
|
||||
# """
|
||||
# data = plpy.execute('select * from models where name={model_name}')
|
||||
# if len(data)==0:
|
||||
# plpy.notice('model not found')
|
||||
# data = data[0]
|
||||
# data['model'] = pickle.load(data['model'])
|
||||
# return data
|
||||
#
|
||||
#
|
||||
# def create_model_table(model_name):
|
||||
# """
|
||||
# create the model table if requred
|
||||
# """
|
||||
# plpy.execute('''
|
||||
# CREATE table IF NOT EXISTS _cdb_models(
|
||||
# name TEXT,
|
||||
# model BLOB,
|
||||
# features TEXT[],
|
||||
# accuracy NUMERIC,
|
||||
# table_name TEXT,
|
||||
# )''')
|
||||
#
|
||||
# def save_model(model_name,model,accuracy,table_name, column_name,census_table,geoid_column,method):
|
||||
# """
|
||||
# save a model to the model table for later use
|
||||
# """
|
||||
#
|
||||
# plpy.execute('''
|
||||
# DELETE FROM _cdb_models WHERE model_name = {model_name}
|
||||
# '''.format(**locals()))
|
||||
#
|
||||
# plpy.execute("""
|
||||
# INSERT INTO _cdb_models ({model_name},{model_pickle},{accuracy})
|
||||
# """)
|
||||
#
|
||||
# def
|
||||
|
Loading…
Reference in New Issue
Block a user