Add a function to predict the importance of the different features of a dataset.
This commit is contained in:
parent
d140b4249e
commit
e73862a6e1
@ -13,6 +13,18 @@ AS $$
|
|||||||
return segmentation.create_segment(segment_name,table_name,column_name,geoid_column,census_table,'random_forest')
|
return segmentation.create_segment(segment_name,table_name,column_name,geoid_column,census_table,'random_forest')
|
||||||
$$ LANGUAGE plpythonu;
|
$$ LANGUAGE plpythonu;
|
||||||
|
|
||||||
|
CREATE OR REPLACE FUNCTION
|
||||||
|
cdb_correlated_variables(
|
||||||
|
query text,
|
||||||
|
geoid_column text DEFAULT 'geoid',
|
||||||
|
census_table text DEFAULT 'ml_learning_block_groups_clipped'
|
||||||
|
)
|
||||||
|
RETURNS TABLE(feature text, importance NUMERIC, std NUMERIC)
|
||||||
|
AS $$
|
||||||
|
from crankshaft.segmentation import correlated_variables
|
||||||
|
return correlated_variables(query,geoid_column,census_table)
|
||||||
|
$$ LANGUAGE plpythonu;
|
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION
|
CREATE OR REPLACE FUNCTION
|
||||||
cdb_predict_segment (
|
cdb_predict_segment (
|
||||||
segment_name TEXT,
|
segment_name TEXT,
|
||||||
|
@ -30,6 +30,20 @@ def create_segment(segment_name,table_name,column_name,geoid_column,census_table
|
|||||||
# predict_segment
|
# predict_segment
|
||||||
return accuracy
|
return accuracy
|
||||||
|
|
||||||
|
def correlated_variables(query,geoid_column,census_table):
|
||||||
|
"""
|
||||||
|
returns the columns which are important for the random forest model
|
||||||
|
"""
|
||||||
|
data = pd.DataFrame(join_with_census(query,geoid_column, census_table))
|
||||||
|
features = data[data.columns.difference(['target', 'the_geom_webmercator', 'geoid','the_geom'])]
|
||||||
|
target, mean, std = normalize(data['target'])
|
||||||
|
model, accuracy, used_features = train_model(target,features, test_split=0.2)
|
||||||
|
std = np.std([tree.feature_importances_ for tree in model.estimators_],
|
||||||
|
axis=0)
|
||||||
|
importances = model.feature_importances_
|
||||||
|
return zip(features,importances,std)
|
||||||
|
|
||||||
|
|
||||||
def create_and_predict_segment(segment_name,query,geoid_column,census_table,target_table,method):
|
def create_and_predict_segment(segment_name,query,geoid_column,census_table,target_table,method):
|
||||||
"""
|
"""
|
||||||
generate a segment with machine learning
|
generate a segment with machine learning
|
||||||
@ -98,15 +112,6 @@ def join_with_census(query, geoid_column, census_table):
|
|||||||
def query_to_dictionary(result):
|
def query_to_dictionary(result):
|
||||||
return [ dict(zip(r.keys(), r.values())) for r in result ]
|
return [ dict(zip(r.keys(), r.values())) for r in result ]
|
||||||
|
|
||||||
def query_in_batches(query,batch_size):
|
|
||||||
cursor = plpy.cursor(query)
|
|
||||||
while True:
|
|
||||||
rows = cursor.fetch(batch_size)
|
|
||||||
if not rows:
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
yield query_to_dictionary(rows)
|
|
||||||
|
|
||||||
def predict_segment(model,features,geoid_column,census_table):
|
def predict_segment(model,features,geoid_column,census_table):
|
||||||
"""
|
"""
|
||||||
predict a segment with machine learning
|
predict a segment with machine learning
|
||||||
@ -117,21 +122,22 @@ def predict_segment(model,features,geoid_column,census_table):
|
|||||||
# features = ",".join(features)
|
# features = ",".join(features)
|
||||||
|
|
||||||
joined_features = ','.join(['\"'+a+'\"::numeric' for a in features])
|
joined_features = ','.join(['\"'+a+'\"::numeric' for a in features])
|
||||||
targets = pd.DataFrame(query_to_dictionary(plpy.execute('select {joined_features} from {census_table}'.format(**locals()))))
|
= plpy.execute()
|
||||||
|
cursor = plpy.cursor('select {joined_features} from {census_table}'.format(**locals()))
|
||||||
|
results = []
|
||||||
|
while True:
|
||||||
|
rows = cursor.fetch(batch_size)
|
||||||
|
|
||||||
predition = []
|
if not rows:
|
||||||
for batch in query_in_batches('select {joined_features} from {census_table}'.format(**locals()),2000):
|
break
|
||||||
targets = pd.DataFrame(batch)
|
|
||||||
plpy.notice('predicting:' + str(len(features)) + ' '+str(np.shape(targets)))
|
|
||||||
plpy.notice(joined_features)
|
|
||||||
targets = targets.dropna(axis =1, how='all').fillna(0)
|
|
||||||
plpy.notice('predicting:' + str(len(features)) + ' '+str(np.shape(targets)))
|
|
||||||
batch_prediction = model.predict(targets)
|
|
||||||
prediciton.append(batch_prediction.to_maxtrix)
|
|
||||||
|
|
||||||
geo_ids = plpy.execute('select geoid from {census_table}'.format(**locals()))
|
batch = pd.DataFrame(query_to_dictionary(rows))
|
||||||
|
batch_features = batch.dropna(axis =1, how='all').fillna(0)
|
||||||
|
prediction = model.predict(batch_features)
|
||||||
|
results.append(prediction)
|
||||||
|
plpy.notice('predicting: predicted')
|
||||||
|
|
||||||
return [[a['geoid'] for a in geo_ids],prediction]
|
return [a['the_geom'] for a in geoms], [a['geoid'] for a in geo_ids],prediction
|
||||||
|
|
||||||
|
|
||||||
def fetch_model(model_name):
|
def fetch_model(model_name):
|
||||||
|
Loading…
Reference in New Issue
Block a user