From d96d6b2c482fe71472a1606cb0a45379003903da Mon Sep 17 00:00:00 2001 From: Stuart Lynn Date: Mon, 7 Mar 2016 11:41:37 -0500 Subject: [PATCH] fleshing out segmentation --- .../crankshaft/segmentation/__init__.py | 1 + .../crankshaft/segmentation/segmentation.py | 236 +++++++++--------- 2 files changed, 119 insertions(+), 118 deletions(-) diff --git a/python/crankshaft/crankshaft/segmentation/__init__.py b/python/crankshaft/crankshaft/segmentation/__init__.py index e69de29..b825e85 100644 --- a/python/crankshaft/crankshaft/segmentation/__init__.py +++ b/python/crankshaft/crankshaft/segmentation/__init__.py @@ -0,0 +1 @@ +from segmentation import * diff --git a/python/crankshaft/crankshaft/segmentation/segmentation.py b/python/crankshaft/crankshaft/segmentation/segmentation.py index 3894122..4f99573 100644 --- a/python/crankshaft/crankshaft/segmentation/segmentation.py +++ b/python/crankshaft/crankshaft/segmentation/segmentation.py @@ -1,118 +1,118 @@ -""" -Segmentation creation and prediction -""" - -import sklearn -import numpy as np -import pandas as pd -import pickle -import plpy -from sklearn.ensemble import ExtraTreesRegressor -from sklearn import metrics -from sklearn.cross_validation import train_test_split - -# High level interface --------------------------------------- - -def cdb_create_segment(segment_name,table_name,column_name,geoid_column,census_table,method): - """ - generate a segment with machine learning - Stuart Lynn - """ - data = pd.DataFrame(join_with_census(table_name, column_name,geoid_column, census_table)) - features = data[data.columns.difference([column_name, 'geoid'])] - target, mean, std = normalize(data[column_name]) - model, accuracy = train_model(target,features, test_split=0.2) - save_model(segment_name, model, accuracy, table_name, column_name, census_table, geoid_column, method) - return accuracy - -def normalize(target): - mean = np.mean(target) - std = no.std(target) - return (target - mean)/std, mean, std - -def denormalize(target, mean ,std): - return target*std + mean - -def train_model(target,features,test_split): - features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=test_split) - model = ExtraTreesRegressor(n_estimators = 40, max_features=len(features.columns)) - model.fit(features_train, target_train) - accuracy = calculate_model_accuracy(model,features,target) - return model, accuracy - -def calculate_model_accuracy(model,features,target): - prediction = self.model.predict(features) - return metrics.mean_squared_error(prediction,target)/np.std(target) - -def join_with_census(table_name, column_name, geoid_column, census_table): - coulmns = plpy.execute('select {census_table}.* limit 1 ') - feature_names = ",".join(columns.keys.difference(['the_geom','cartodb_id'])) - join_data = plpy.execute(''' - WITH region_extent AS ( - SELECT ST_Extent(the_geom) as table_extent FROM {table_name}; - ) - SELECT {features_names}, {table_name}.{column_name} - FROM {table_name} ,region_extent - JOIN {census_table} - ON {table_name}.{geoid_column} = {census_table}.geoid - WHERE {census_table}.the_geom && region_extent.table_extent - '''.format(**locals())) - - if len(join_data) == 0: - plpy.notice('Failed to join with census data') - - return join_data - -def cdb_predict_segment(segment_name,geoid_column,census_table): - """ - predict a segment with machine learning - Stuart Lynn - """ - data = fetch_model(segment_name) - model = data['model'] - features = ",".join(data['features']) - targets = plpy.execute('select {features} from {census_table}') - geo_ids = plpy.execute('select geoid from {census_table}') - result = model.predict(targets) - return zip(geo_ids,prediction) - - -def fetch_model(model_name): - """ - fetch a model from storage - """ - data = plpy.execute('select * from models where name={model_name}') - if len(data)==0: - plpy.notice('model not found') - data = data[0] - data['model'] = pickle.load(data['model']) - return data - - -def create_model_table(model_name): - """ - create the model table if requred - """ - plpy.execute(''' - CREATE table IF NOT EXISTS _cdb_models( - name TEXT, - model BLOB, - features TEXT[], - accuracy NUMERIC, - table_name TEXT, - )''') - -def save_model(model_name,model,accuracy,table_name, column_name,census_table,geoid_column,method): - """ - save a model to the model table for later use - """ - - plpy.execute(''' - DELETE FROM _cdb_models WHERE model_name = {model_name} - '''.format(**locals())) - - plpy.execute(""" - INSERT INTO _cdb_models ({model_name},{model_pickle},{accuracy}) - """) - -def +# """ +# Segmentation creation and prediction +# """ +# +# import sklearn +# import numpy as np +# import pandas as pd +# import pickle +# import plpy +# from sklearn.ensemble import ExtraTreesRegressor +# from sklearn import metrics +# from sklearn.cross_validation import train_test_split +# +# # High level interface --------------------------------------- +# +# def cdb_create_segment(segment_name,table_name,column_name,geoid_column,census_table,method): +# """ +# generate a segment with machine learning +# Stuart Lynn +# """ +# data = pd.DataFrame(join_with_census(table_name, column_name,geoid_column, census_table)) +# features = data[data.columns.difference([column_name, 'geoid'])] +# target, mean, std = normalize(data[column_name]) +# model, accuracy = train_model(target,features, test_split=0.2) +# save_model(segment_name, model, accuracy, table_name, column_name, census_table, geoid_column, method) +# return accuracy +# +# def normalize(target): +# mean = np.mean(target) +# std = no.std(target) +# return (target - mean)/std, mean, std +# +# def denormalize(target, mean ,std): +# return target*std + mean +# +# def train_model(target,features,test_split): +# features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=test_split) +# model = ExtraTreesRegressor(n_estimators = 40, max_features=len(features.columns)) +# model.fit(features_train, target_train) +# accuracy = calculate_model_accuracy(model,features,target) +# return model, accuracy +# +# def calculate_model_accuracy(model,features,target): +# prediction = self.model.predict(features) +# return metrics.mean_squared_error(prediction,target)/np.std(target) +# +# def join_with_census(table_name, column_name, geoid_column, census_table): +# coulmns = plpy.execute('select {census_table}.* limit 1 ') +# feature_names = ",".join(columns.keys.difference(['the_geom','cartodb_id'])) +# join_data = plpy.execute(''' +# WITH region_extent AS ( +# SELECT ST_Extent(the_geom) as table_extent FROM {table_name}; +# ) +# SELECT {features_names}, {table_name}.{column_name} +# FROM {table_name} ,region_extent +# JOIN {census_table} +# ON {table_name}.{geoid_column} = {census_table}.geoid +# WHERE {census_table}.the_geom && region_extent.table_extent +# '''.format(**locals())) +# +# if len(join_data) == 0: +# plpy.notice('Failed to join with census data') +# +# return join_data +# +# def cdb_predict_segment(segment_name,geoid_column,census_table): +# """ +# predict a segment with machine learning +# Stuart Lynn +# """ +# data = fetch_model(segment_name) +# model = data['model'] +# features = ",".join(data['features']) +# targets = plpy.execute('select {features} from {census_table}') +# geo_ids = plpy.execute('select geoid from {census_table}') +# result = model.predict(targets) +# return zip(geo_ids,prediction) +# +# +# def fetch_model(model_name): +# """ +# fetch a model from storage +# """ +# data = plpy.execute('select * from models where name={model_name}') +# if len(data)==0: +# plpy.notice('model not found') +# data = data[0] +# data['model'] = pickle.load(data['model']) +# return data +# +# +# def create_model_table(model_name): +# """ +# create the model table if requred +# """ +# plpy.execute(''' +# CREATE table IF NOT EXISTS _cdb_models( +# name TEXT, +# model BLOB, +# features TEXT[], +# accuracy NUMERIC, +# table_name TEXT, +# )''') +# +# def save_model(model_name,model,accuracy,table_name, column_name,census_table,geoid_column,method): +# """ +# save a model to the model table for later use +# """ +# +# plpy.execute(''' +# DELETE FROM _cdb_models WHERE model_name = {model_name} +# '''.format(**locals())) +# +# plpy.execute(""" +# INSERT INTO _cdb_models ({model_name},{model_pickle},{accuracy}) +# """) +# +# def