From e73862a6e1bbc20957d971dba914fab228d62b12 Mon Sep 17 00:00:00 2001
From: Stuart Lynn
Date: Thu, 31 Mar 2016 11:25:30 -0400
Subject: [PATCH] adding a function to predict the importance of different
 features in a dataset.

---
 pg/sql/0.0.1/05_segmentation.sql            | 12 +++++
 .../crankshaft/segmentation/segmentation.py | 48 +++++++++++--------
 2 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/pg/sql/0.0.1/05_segmentation.sql b/pg/sql/0.0.1/05_segmentation.sql
index 8cbf1c9..b9bca6a 100644
--- a/pg/sql/0.0.1/05_segmentation.sql
+++ b/pg/sql/0.0.1/05_segmentation.sql
@@ -13,6 +13,18 @@ AS $$
   return segmentation.create_segment(segment_name,table_name,column_name,geoid_column,census_table,'random_forest')
 $$ LANGUAGE plpythonu;
 
+CREATE OR REPLACE FUNCTION
+  cdb_correlated_variables(
+      query text,
+      geoid_column text DEFAULT 'geoid',
+      census_table text DEFAULT 'ml_learning_block_groups_clipped'
+  )
+RETURNS TABLE(feature text, importance NUMERIC, std NUMERIC)
+AS $$
+  from crankshaft.segmentation import correlated_variables
+  return correlated_variables(query,geoid_column,census_table)
+$$ LANGUAGE plpythonu;
+
 CREATE OR REPLACE FUNCTION
   cdb_predict_segment (
       segment_name TEXT,
diff --git a/python/crankshaft/crankshaft/segmentation/segmentation.py b/python/crankshaft/crankshaft/segmentation/segmentation.py
index 130c8e4..0a1a8da 100644
--- a/python/crankshaft/crankshaft/segmentation/segmentation.py
+++ b/python/crankshaft/crankshaft/segmentation/segmentation.py
@@ -30,6 +30,20 @@ def create_segment(segment_name,table_name,column_name,geoid_column,census_table
     # predict_segment
     return accuracy
 
+def correlated_variables(query,geoid_column,census_table):
+    """
+    returns the columns which are important for the random forest model
+    """
+    data = pd.DataFrame(join_with_census(query,geoid_column, census_table))
+    features = data[data.columns.difference(['target', 'the_geom_webmercator', 'geoid','the_geom'])]
+    target, mean, std = normalize(data['target'])
+    model, accuracy, used_features = train_model(target,features, test_split=0.2)
+    # spread of each feature's importance across the individual trees of the forest
+    importance_std = np.std([tree.feature_importances_ for tree in model.estimators_],
+                            axis=0)
+    importances = model.feature_importances_
+    # iterating over the features DataFrame yields its column names
+    return zip(features,importances,importance_std)
+
+
 def create_and_predict_segment(segment_name,query,geoid_column,census_table,target_table,method):
     """
     generate a segment with machine learning
@@ -98,15 +112,6 @@ def join_with_census(query, geoid_column, census_table):
 def query_to_dictionary(result):
     return [ dict(zip(r.keys(), r.values())) for r in result ]
 
-def query_in_batches(query,batch_size):
-    cursor = plpy.cursor(query)
-    while True:
-        rows = cursor.fetch(batch_size)
-        if not rows:
-            break
-        else:
-            yield query_to_dictionary(rows)
-
 def predict_segment(model,features,geoid_column,census_table):
     """
     predict a segment with machine learning
     Stuart Lynn
     """
     # features = ",".join(features)
     joined_features = ','.join(['\"'+a+'\"::numeric' for a in features])
-    targets = pd.DataFrame(query_to_dictionary(plpy.execute('select {joined_features} from {census_table}'.format(**locals()))))
+    # process the census rows in batches of 2000, as the removed query_in_batches helper did
+    batch_size = 2000
+    cursor = plpy.cursor('select {joined_features} from {census_table}'.format(**locals()))
+    results = []
+    while True:
+        rows = cursor.fetch(batch_size)
 
-    predition = []
-    for batch in query_in_batches('select {joined_features} from {census_table}'.format(**locals()),2000):
-        targets = pd.DataFrame(batch)
-        plpy.notice('predicting:' + str(len(features)) + ' '+str(np.shape(targets)))
-        plpy.notice(joined_features)
-        targets = targets.dropna(axis =1, how='all').fillna(0)
-        plpy.notice('predicting:' + str(len(features)) + ' '+str(np.shape(targets)))
-        batch_prediction = model.predict(targets)
-        prediciton.append(batch_prediction.to_maxtrix)
+        if not rows:
+            break
 
-    geo_ids = plpy.execute('select geoid from {census_table}'.format(**locals()))
+        batch = pd.DataFrame(query_to_dictionary(rows))
+        batch_features = batch.dropna(axis=1, how='all').fillna(0)
+        prediction = model.predict(batch_features)
+        results.append(prediction)
+    plpy.notice('predicting: predicted')
+
+    # stitch the per-batch predictions back together and fetch the ids and geometries they belong to
+    prediction = np.concatenate(results)
+    geo_ids = plpy.execute('select geoid from {census_table}'.format(**locals()))
+    geoms = plpy.execute('select the_geom from {census_table}'.format(**locals()))
 
-    return [[a['geoid'] for a in geo_ids],prediction]
+    return [a['the_geom'] for a in geoms], [a['geoid'] for a in geo_ids], prediction
 
 def fetch_model(model_name):
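
For context, a call to the new SQL function could look like the sketch below. The source table (home_sales) and its price column are illustrative names, not part of this patch, and the sketch assumes, as the Python helper appears to expect, that the supplied query exposes the geoid column and a target column to train against:

    -- hypothetical usage; home_sales and price are placeholder names
    SELECT feature, importance, std
      FROM cdb_correlated_variables(
             'SELECT geoid, price AS target FROM home_sales',
             'geoid',
             'ml_learning_block_groups_clipped')
     ORDER BY importance DESC;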