From e80fdca7fcfd605cef19f209d6f4816578246f47 Mon Sep 17 00:00:00 2001
From: Andy Eschbacher
Date: Wed, 25 May 2016 12:02:47 -0400
Subject: [PATCH] removing time-binning options, reorganizes signature

---
 .../crankshaft/space_time_dynamics/markov.py | 65 ++++++++++---------
 1 file changed, 35 insertions(+), 30 deletions(-)

diff --git a/src/py/crankshaft/crankshaft/space_time_dynamics/markov.py b/src/py/crankshaft/crankshaft/space_time_dynamics/markov.py
index 1600583..3e39d49 100644
--- a/src/py/crankshaft/crankshaft/space_time_dynamics/markov.py
+++ b/src/py/crankshaft/crankshaft/space_time_dynamics/markov.py
@@ -8,23 +8,22 @@ import pysal as ps
 import plpy
 import crankshaft.pysal_utils as pu
 
-def spatial_markov_trend(subquery, time_cols, num_time_per_bin,
-                         permutations, geom_col, id_col, w_type, num_ngbrs):
+def spatial_markov_trend(subquery, time_cols, num_classes = 7,
+                         w_type = 'knn', num_ngbrs = 5, permutations = 999,
+                         geom_col = 'the_geom', id_col = 'cartodb_id'):
     """
     Predict the trends of a unit based on:
     1. history of its transitions to different classes (e.g., 1st quantile -> 2nd quantile)
     2. average class of its neighbors
 
     Inputs:
-
-    @param subquery string: e.g., SELECT * FROM table_name
-    @param time_cols list (string): list of strings of column names
-    @param num_time_per_bin int: number of bins to divide # of time columns into
-    @param permutations int: number of permutations for test stats
-    @param geom_col string: name of column which contains the geometries
-    @param id_col string: name of column which has the ids of the table
-    @param w_type string: weight type ('knn' or 'queen')
-    @param num_ngbrs int: number of neighbors (if knn type)
+    @param subquery string: e.g., SELECT the_geom, cartodb_id, interesting_time_column FROM table_name
+    @param time_cols list of strings: list of strings of column names
+    @param w_type string (optional): weight type ('knn' or 'queen')
+    @param num_ngbrs int (optional): number of neighbors (if knn type)
+    @param permutations int (optional): number of permutations for test stats
+    @param geom_col string (optional): name of column which contains the geometries
+    @param id_col string (optional): name of column which has the ids of the table
 
     Outputs:
     @param trend_up float: probablity that a geom will move to a higher class
@@ -34,8 +33,8 @@ def spatial_markov_trend(subquery, time_cols, num_time_per_bin,
     @param 
     """
 
-    if num_time_per_bin < 1:
-        plpy.error('Error: number of time bins must be >= 1')
+    if len(time_cols) < 2:
+        plpy.error('More than one time column needs to be passed')
 
     qvals = {"id_col": id_col,
              "time_cols": time_cols,
@@ -43,13 +42,15 @@ def spatial_markov_trend(subquery, time_cols, num_time_per_bin,
              "subquery": subquery,
              "num_ngbrs": num_ngbrs}
 
-    query = pu.construct_neighbor_query(w_type, qvals)
-
     try:
-        query_result = plpy.execute(query)
+        query_result = plpy.execute(
+            pu.construct_neighbor_query(w_type, qvals)
+        )
+        if len(query_result) == 0:
+            return zip([None], [None], [None], [None], [None])
     except plpy.SPIError, err:
-        plpy.notice('** Query failed with exception %s: %s' % (err, query))
-        plpy.error('Spatial Markov failed: check the input parameters')
+        plpy.debug('Query failed with exception %s: %s' % (err, query))
+        plpy.error('Query failed, check the input parameters')
         return zip([None], [None], [None], [None], [None])
 
     ## build weight
@@ -57,34 +58,33 @@ def spatial_markov_trend(subquery, time_cols, num_time_per_bin,
     ## prep time data
     t_data = get_time_data(query_result, time_cols)
 
-    ## rebin time data
-    if num_time_per_bin > 1:
-        ## rebin
-        t_data = rebin_data(t_data, int(num_time_per_bin))
-    print 'shape of t_data %d, %d' % t_data.shape
-    print 'number of weight objects: %d, %d' % (weights.sparse).shape
-    print 'first num elements: %f' % t_data[0, 0]
+    plpy.debug('shape of t_data %d, %d' % t_data.shape)
+    plpy.debug('number of weight objects: %d, %d' % (weights.sparse).shape)
+    plpy.debug('first num elements: %f' % t_data[0, 0])
 
     # ls = ps.lag_spatial(weights, t_data)
 
     sp_markov_result = ps.Spatial_Markov(t_data,
                                          weights,
-                                         k=7,
+                                         k=num_classes,
                                          fixed=False,
                                          permutations=permutations)
 
     ## get lag classes
-    lag_classes = ps.Quantiles(ps.lag_spatial(weights, t_data[:, -1]), k=7).yb
+    lag_classes = ps.Quantiles(
+        ps.lag_spatial(weights, t_data[:, -1]),
+        k=num_classes).yb
 
     ## look up probablity distribution for each unit according to class and lag class
-    prob_dist = get_prob_dist(sp_markov_result.P, lag_classes, sp_markov_result.classes[:, -1])
+    prob_dist = get_prob_dist(sp_markov_result.P,
+                              lag_classes,
+                              sp_markov_result.classes[:, -1])
 
     ## find the ups and down and overall distribution of each cell
     trend_up, trend_down, trend, volatility = get_prob_stats(prob_dist,
                                                              sp_markov_result.classes[:, -1])
 
     ## output the results
-
     return zip(trend, trend_up, trend_down, volatility, weights.id_order)
 
 def get_time_data(markov_data, time_cols):
@@ -95,6 +95,7 @@ def get_time_data(markov_data, time_cols):
     return np.array([[x['attr' + str(i)] for x in markov_data]
                      for i in range(1, num_attrs+1)], dtype=float).transpose()
 
+## not currently used
 def rebin_data(time_data, num_time_per_bin):
     """
     Convert an n x l matrix into an (n/m) x l matrix where the values are
@@ -130,6 +131,7 @@ def rebin_data(time_data, num_time_per_bin):
     return np.array([time_data[:, num_time_per_bin * i:num_time_per_bin * (i+1)].mean(axis=1)
                      for i in range(n_max)]).T
 
+
 def get_prob_dist(transition_matrix, lag_indices, unit_indices):
     """
     Given an array of transition matrices, look up the probability
@@ -168,7 +170,10 @@ def get_prob_stats(prob_dist, unit_indices):
     for i in range(num_elements):
        trend_up[i] = prob_dist[i, (unit_indices[i]+1):].sum()
        trend_down[i] = prob_dist[i, :unit_indices[i]].sum()
-        trend[i] = (trend_up[i] - trend_down[i]) / prob_dist[i, unit_indices[i]]
+        if prob_dist[i, unit_indices[i]] > 0.0:
+            trend[i] = (trend_up[i] - trend_down[i]) / prob_dist[i, unit_indices[i]]
+        else:
+            trend[i] = None
 
     ## calculate volatility of distribution
     volatility = prob_dist.std(axis=1)
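
A note on the error path in the @@ -43,13 +42,15 @@ hunk: because pu.construct_neighbor_query(w_type, qvals) is now inlined into the plpy.execute() call, the name query is no longer bound when the except branch formats its debug message, so a failed query would raise a NameError instead of logging the SQL. A minimal sketch of one way to keep that logging intact, written in the same Python 2 / plpy style as the patch (this is not part of the commit):

    query = pu.construct_neighbor_query(w_type, qvals)
    try:
        query_result = plpy.execute(query)
        if len(query_result) == 0:
            return zip([None], [None], [None], [None], [None])
    except plpy.SPIError, err:
        # query is still in scope here, so the failing SQL can be reported
        plpy.debug('Query failed with exception %s: %s' % (err, query))
        plpy.error('Query failed, check the input parameters')
        return zip([None], [None], [None], [None], [None])

With the reorganized signature, a caller only has to supply the subquery and the list of time columns; for example (placeholder column names), spatial_markov_trend('SELECT the_geom, cartodb_id, y2010, y2011, y2012 FROM table_name', ['y2010', 'y2011', 'y2012']) picks up the defaults num_classes=7, w_type='knn', num_ngbrs=5, permutations=999.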