mirror of
https://github.com/CartoDB/crankshaft.git
synced 2024-11-01 10:20:48 +08:00
Training section now works
This commit is contained in:
parent
f885cc9f7b
commit
fcf57289fc
@ -137,6 +137,33 @@ BEGIN
|
||||
END;
|
||||
$$
|
||||
LANGUAGE plpgsql VOLATILE;
|
||||
CREATE OR REPLACE FUNCTION
|
||||
cdb_create_segment (
|
||||
segment_name TEXT,
|
||||
table_name TEXT,
|
||||
column_name TEXT,
|
||||
geoid_column TEXT DEFAULT 'geoid',
|
||||
census_table TEXT DEFAULT 'block_groups'
|
||||
)
|
||||
RETURNS NUMERIC
|
||||
AS $$
|
||||
from crankshaft import segmentation
|
||||
# TODO: use named parameters or a dictionary
|
||||
return segmentation.create_segment(segment_name,table_name,column_name,geoid_column,census_table,'random_forest')
|
||||
$$ LANGUAGE plpythonu;
|
||||
|
||||
CREATE OR REPLACE FUNCTION
|
||||
cdb_predict_segment (
|
||||
segment_name TEXT,
|
||||
geoid_column TEXT DEFAULT 'geoid',
|
||||
census_table TEXT DEFAULT 'block_groups'
|
||||
)
|
||||
RETURNS TABLE(geoid TEXT, prediction NUMERIC)
|
||||
AS $$
|
||||
from crankshaft.segmentation import create_segemnt
|
||||
# TODO: use named parameters or a dictionary
|
||||
return create_segment('table')
|
||||
$$ LANGUAGE plpythonu;
|
||||
-- Make sure by default there are no permissions for publicuser
|
||||
-- NOTE: this happens at extension creation time, as part of an implicit transaction.
|
||||
-- REVOKE ALL PRIVILEGES ON SCHEMA cdb_crankshaft FROM PUBLIC, publicuser CASCADE;
|
||||
|
@ -8,9 +8,9 @@ CREATE OR REPLACE FUNCTION
|
||||
)
|
||||
RETURNS NUMERIC
|
||||
AS $$
|
||||
from crankshaft.segmentation import create_segemnt
|
||||
from crankshaft import segmentation
|
||||
# TODO: use named parameters or a dictionary
|
||||
return create_segment('table')
|
||||
return segmentation.create_segment(segment_name,table_name,column_name,geoid_column,census_table,'random_forest')
|
||||
$$ LANGUAGE plpythonu;
|
||||
|
||||
CREATE OR REPLACE FUNCTION
|
||||
|
@ -13,57 +13,71 @@ from sklearn.cross_validation import train_test_split
|
||||
|
||||
# High level interface ---------------------------------------
|
||||
|
||||
def cdb_create_segment(segment_name,table_name,column_name,geoid_column,census_table,method):
|
||||
def create_segment(segment_name,table_name,column_name,geoid_column,census_table,method):
|
||||
"""
|
||||
generate a segment with machine learning
|
||||
Stuart Lynn
|
||||
"""
|
||||
data = pd.DataFrame(join_with_census(table_name, column_name,geoid_column, census_table,))
|
||||
features = data[data.columns.difference([column_name, 'geoid'])]
|
||||
data = pd.DataFrame(join_with_census(table_name, column_name,geoid_column, census_table))
|
||||
features = data[data.columns.difference([column_name, 'geoid','the_geom'])]
|
||||
target, mean, std = normalize(data[column_name])
|
||||
model, accuracy = train_model(target,features, test_split=0.2)
|
||||
save_model(segment_name, model, accuracy, table_name, column_name, census_table, geoid_column, method)
|
||||
# save_model(segment_name, model, accuracy, table_name, column_name, census_table, geoid_column, method)
|
||||
# predict_segment
|
||||
return accuracy
|
||||
|
||||
def normalize(target):
|
||||
mean = np.mean(target)
|
||||
std = no.std(target)
|
||||
std = np.std(target)
|
||||
plpy.notice('mean '+str(mean)+" std : "+str(std))
|
||||
return (target - mean)/std, mean, std
|
||||
|
||||
def denormalize(target, mean ,std):
|
||||
return target*std + mean
|
||||
|
||||
def train_model(target,features,test_split):
|
||||
plpy.notice('training the model')
|
||||
plpy.notice('dataframe shape '+ str(np.shape(features)))
|
||||
plpy.notice('dataframe columns '+ str(features.dtypes))
|
||||
features = features.dropna(axis =1, how='all').fillna(0)
|
||||
target = target.fillna(0)
|
||||
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=test_split)
|
||||
plpy.notice('training the model test train split')
|
||||
model = ExtraTreesRegressor(n_estimators = 40, max_features=len(features.columns))
|
||||
plpy.notice('training the model created tree')
|
||||
plpy.notice('features '+str(np.shape(features_train))+" "+str(np.shape(features_test)) )
|
||||
|
||||
model.fit(features_train, target_train)
|
||||
plpy.notice('training the model fitting model')
|
||||
accuracy = calculate_model_accuracy(model,features,target)
|
||||
return model, accuracy
|
||||
|
||||
def calculate_model_accuracy(model,features,target):
|
||||
prediction = self.model.predict(features)
|
||||
prediction = model.predict(features)
|
||||
return metrics.mean_squared_error(prediction,target)/np.std(target)
|
||||
|
||||
def join_with_census(table_name, column_name, geoid_column, census_table):
|
||||
coulmns = plpy.execute('select {census_table}.* limit 1 ')
|
||||
feature_names = ",".join(columns.keys.difference(['the_geom','cartodb_id']))
|
||||
columns = plpy.execute('select * from {census_table} limit 1 '.format(**locals()))
|
||||
combined_columns = [ a for a in columns[0].keys() if a not in ['the_geom','cartodb_id','geoid']]
|
||||
feature_names = ",".join([ " {census_table}.\"{a}\" as \"{a}\" ".format(**locals()) for a in combined_columns])
|
||||
plpy.notice('joining with census data')
|
||||
join_data = plpy.execute('''
|
||||
WITH region_extent AS (
|
||||
SELECT ST_Extent(the_geom) as table_extent FROM {table_name};
|
||||
)
|
||||
SELECT {features_names}, {table_name}.{column_name}
|
||||
FROM {table_name} ,region_extent
|
||||
|
||||
SELECT {feature_names}, {table_name}.{column_name}
|
||||
FROM {table_name}
|
||||
JOIN {census_table}
|
||||
ON {table_name}.{geoid_column} = {census_table}.geoid
|
||||
WHERE {census_table}.the_geom && region_extent.table_extent
|
||||
ON {table_name}.{geoid_column}::numeric = {census_table}.geoid::numeric
|
||||
'''.format(**locals()))
|
||||
|
||||
if len(join_data) == 0:
|
||||
plpy.notice('Failed to join with census data')
|
||||
|
||||
return join_data
|
||||
return query_to_dictionary(join_data)
|
||||
|
||||
def cdb_predict_segment(segment_name,geoid_column,census_table):
|
||||
def query_to_dictionary(result):
|
||||
return [ dict(zip(r.keys(), r.values())) for r in result ]
|
||||
|
||||
def predict_segment(model,features,geoid_column,census_table):
|
||||
"""
|
||||
predict a segment with machine learning
|
||||
Stuart Lynn
|
||||
@ -89,30 +103,31 @@ def fetch_model(model_name):
|
||||
return data
|
||||
|
||||
|
||||
def create_model_table(model_name):
|
||||
def create_model_table():
|
||||
"""
|
||||
create the model table if requred
|
||||
"""
|
||||
plpy.execute('''
|
||||
CREATE table IF NOT EXISTS _cdb_models(
|
||||
name TEXT,
|
||||
model BLOB,
|
||||
model TEXT,
|
||||
features TEXT[],
|
||||
accuracy NUMERIC,
|
||||
table_name TEXT,
|
||||
census_table_name TEXT,
|
||||
method TEXT
|
||||
)''')
|
||||
|
||||
def save_model(model_name,model,accuracy,table_name, column_name,census_table,geoid_column,method):
|
||||
"""
|
||||
save a model to the model table for later use
|
||||
"""
|
||||
create_model_table()
|
||||
|
||||
plpy.execute('''
|
||||
DELETE FROM _cdb_models WHERE model_name = {model_name}
|
||||
DELETE FROM _cdb_models WHERE name = '{model_name}'
|
||||
'''.format(**locals()))
|
||||
|
||||
model_pickle = pickle.dumps(model)
|
||||
plpy.execute("""
|
||||
INSERT INTO _cdb_models ({model_name},{model_pickle},{accuracy})
|
||||
""")
|
||||
|
||||
def
|
||||
INSERT INTO _cdb_models ('{model_name}','{model_pickle}',{accuracy}, '{table_name}', '{census_table}', '{method}')
|
||||
""".format(**locals()))
|
||||
|
@ -40,9 +40,8 @@ setup(
|
||||
|
||||
# The choice of component versions is dictated by what's
|
||||
# provisioned in the production servers.
|
||||
install_requires=['pysal==1.11.0','numpy==1.6.1','scipy==0.17.0'],
|
||||
install_requires=['pysal==1.11.0','numpy==1.10.1','scipy==0.17.0','pandas','sklearn'],
|
||||
|
||||
requires=['pysal', 'numpy'],
|
||||
|
||||
test_suite='test'
|
||||
)
|
||||
|
Loading…
Reference in New Issue
Block a user